def get_config(self):
    """Build and return the model configuration for ``self.model_type``.

    Looks up the (config class, attention class) pair in the module-level
    ``MODELS`` table, instantiates the config from the ``self.config``
    keyword arguments, and — as a deliberate side effect — monkey-patches
    the RoBERTa self-attention class inside ``transformers`` so that models
    built afterwards use the associated attention implementation.
    """
    config_cls, attention_cls = MODELS[self.model_type]
    built_config = config_cls(**self.config)
    # Global patch: every RoBERTa model constructed after this call picks up
    # the attention implementation paired with this model type.
    transformers.models.roberta.modeling_roberta.RobertaSelfAttention = attention_cls
    return built_config
def __init__(self, model_path='tmp/model', w2id_path='data/w2id.json'):
    """Load the word-to-id vocabulary and restore the trained LSTM model.

    Args:
        model_path: checkpoint path passed to ``tf.train.Saver.restore``.
        w2id_path: JSON file mapping words to integer ids.
    """
    conf = config()
    self.num_steps = conf.num_steps
    # Vocabulary: word -> integer id.
    with open(w2id_path) as fh:
        self.w2id = json.load(fh)
    self.cutModel = lstm_model(conf)
    # TF1-style session; the restored weights live for the object's lifetime.
    self.sess = tf.Session()
    ckpt_saver = tf.train.Saver()
    ckpt_saver.restore(self.sess, model_path)
#Instanciation object Redis objRedis = initRedis() #Va contenir le texte du SMS dataTextSms = "" #Défini si l'exécution doit continuer pour l'ensemble des scripts objRedis.set('flagExecute_Treatment', 0) objRedis.set('nameExecute_Treatment', 'INIT') """ On instancie l'objet gérant les configurations Et on récupère les informations pour les placer dans Redis Clé : config_* """ objConfig = config() objConfig.getThis() dataTextSms = dataTextSms + "Config OK \r\n" # On definit le path du log logging.basicConfig(filename=objRedis.get('config_path_log').decode("utf-8") + time.strftime('%Y%m%d') + '_init_adsb.log', level=logging.INFO) """ On instancie l'objet gérant les squawk Et on récupère les informations pour les placer dans Redis Clé : squawk """ objSquawk = squawk() returnSquawk = objSquawk.setDataInRedis() dataTextSms = dataTextSms + " " + returnSquawk.__str__() + " Squawk \r\n"
# NOTE(review): fragment — the enclosing function's `def` line is outside this
# chunk; the indentation below is reconstructed from the visible control flow.
            doc_file_path = model.generate_path([main_path, "docs", doc_name])
            print("文档: \"{name}\" \"{link}\"".format(name=doc_name, link=doc_link))
            doc_list.append((doc_link, doc_file_path))
        # Download step
        if config.Download:
            if config.Download_Method == "Aria2":
                # Plain files are handed to aria2
                model.aira2_download(info_list + video_list + doc_list)
                # Subtitles need the logged-in session (or expire), so keep the built-in queue
                model.download_queue(session, srt_list, queue_length=config.Download_Queue_Length )
            else:
                # Default: built-in download queue for everything
                model.download_queue(session, info_list + video_list + srt_list + doc_list, queue_length=config.Download_Queue_Length)
    else:
        print("No course Id,Please check!")
        return


if __name__ == '__main__':
    course_url = ""
    # Loading config
    config = model.config("settings.conf", "xuetangx")
    main(course_url, config)
print("字幕Eng: \"{name}\" \"{link}\"".format(name=srt_file_name, link=srt_eng_link)) srt_list.append((srt_eng_link, srt_file_path)) video_in_chapter_list[-1] += 1 if lesson_content_type == 3 and config.Download_Docs: # Documentation doc_link = str(re.search(r'textOrigUrl:"(.+?)"', rdata).group(1)) doc_name = "{0}.pdf".format(lesson_name) doc_path = model.generate_path([main_path, "Docs", lesson_loc_pattern]) doc_file_path = model.generate_path([doc_path, doc_name]) doc_list.append((doc_link, doc_file_path)) print("文档: \"{name}\" \"{link}\"".format(name=doc_name, link=doc_link)) if config.Download: if config.Download_Method == "Aria2": # 这里是调用aria2的下载 model.aira2_download(info_list + video_list) # 需要session或者有时间期限的,仍然使用自建下载 model.download_queue(session, srt_list + doc_list, queue_length=config.Download_Queue_Length) else: # 默认调用自建下载 model.download_queue(session, info_list + video_list + srt_list + doc_list, queue_length=config.Download_Queue_Length) else: err_message = re.search(r'message:(.+)\}\)', rdata).group(1) print("Error:{0},Please make sure you login by 163-email " "and your \"Session-Cookies\" pair is right.".format(err_message)) if __name__ == '__main__': course_url = "" config = model.config("settings.conf", "icourse163") main(course_url, config=config)
def main(course_url):
    """Scrape an icourse163.org course and download its videos/subtitles/docs.

    Parses ``course_url`` for the course id (and optional term id), fetches
    the course info page, then walks every chapter/lesson via the site's DWR
    endpoints, collecting (url, local_path) pairs into download lists that are
    finally handed to aria2 or the built-in download queue.

    Args:
        course_url: an icourse163.org ``learn/...`` or ``course/...`` URL,
            optionally carrying a ``?tid=`` term id.
    """
    config = model.config("settings.conf", "icourse163")
    session = model.login(site="icourse163", conf=config)
    http_session_id = session.cookies["NTESSTUDYSI"]
    c_tid = re.search(r"(?:(learn)|(course))/(?P<id>(?P<c_id>[\w:+-]+)(\?tid=(?P<t_id>\d+))?)#?/?", course_url)
    # Download cache list: (link, local file path) tuples
    main_list = []
    srt_list = []
    doc_list = []
    # handle the course_url links to get the right courseId and termId
    if c_tid:
        if c_tid.group("t_id"):
            # Use the caller-supplied term id when one is present in the URL
            term_id = c_tid.group("t_id")
            info_url = "http://www.icourse163.org/course/{id}#/info".format(id=c_tid.group('id'))
        else:
            # Otherwise re-fetch the latest term id from the info page
            term_id = None
            print("No termId which you want to download.Will Choose the Lastest term.")
            info_url = "http://www.icourse163.org/course/{id}#/info".format(id=c_tid.group('c_id'))  # use the course's default address
        page_about = session.get(url=info_url)
        if page_about.url == page_about.request.url:
            # Course exists: a missing course is 302-redirected to
            # http://www.icourse163.org/, so response URL == request URL
            # means the page really resolved.
            page_about_bs = BeautifulSoup(page_about.text, "lxml")
            course_info_raw = page_about_bs.find("script", text=re.compile(r"termDto")).string.replace("\n", "")
            if term_id is None:
                # No tid supplied: auto-pick the latest term from the embedded script
                term_id = re.search(r"termId : \"(\d+)\"", course_info_raw).group(1)
            # Extract course metadata (title_school_teacher from the page title)
            course_page_title = re.search(r'(.+?)_(.+?)_(.+?)', page_about_bs.title.string)
            course_title = model.clean_filename(course_page_title.group(1))
            school = course_page_title.group(2)
            teacher = model.sort_teacher(page_about_bs.find_all('h3', class_="f-fc3"))
            folder = model.clean_filename('-'.join([course_title, school, teacher]))
            print("The Download INFO:\n"  # Output download course info
                  "link:{url}\nCourse:{folder}\nid:{id}\n".format(url=info_url, folder=folder, id=term_id))
            main_path = model.generate_path([config.Download_Path, folder])
            # Course cover image
            info_img_link = page_about_bs.find("div", id="j-courseImg").img["src"]
            img_file_name = r"课程封面图-{title}.png".format(title=course_title)
            img_file_path = model.generate_path([main_path, img_file_name])
            print("课程封面图: {link}".format(link=info_img_link))
            main_list.append((info_img_link, img_file_path))
            # intro_video: course trailer, fetched via a DWR preview call
            video_search = re.search(r"videoId : \"(\d+)\"", course_info_raw)
            if video_search:
                payload = {
                    'callCount': 1,
                    'scriptSessionId': '${scriptSessionId}' + str(random.randint(0, 200)),
                    'httpSessionId': http_session_id,
                    'c0-scriptName': 'CourseBean',
                    'c0-methodName': 'getLessonUnitPreviewVo',
                    'c0-id': 0,
                    'c0-param0': video_search.group(1),
                    'c0-param1': 1,
                    'batchId': random.randint(1000000000000, 20000000000000)
                }
                ask_video_url = "http://www.icourse163.org/dwr/call/plaincall/CourseBean.getLessonUnitPreviewVo.dwr"
                resp = session.post(url=ask_video_url, data=payload).text
                # Try qualities best-first; stop at the first one present.
                for k in ['mp4ShdUrl', 'mp4HdUrl', 'mp4SdUrl']:  # , 'flvShdUrl', 'flvHdUrl', 'flvSdUrl'
                    video_search_group = re.search(r's\d+.(?P<VideoType>' + str(k) + ')="(?P<dllink>.+?)";', resp)
                    if video_search_group:
                        info_video_link = video_search_group.group("dllink")
                        video_file_name = r"课程简介-{title}.mp4".format(title=course_title)
                        video_file_path = model.generate_path([main_path, video_file_name])
                        print("课程简介视频: {link}".format(link=info_video_link))
                        main_list.append((info_video_link, video_file_path))
                        break
        else:
            print("Not found this course in \"icourse163.org\",Check Please")
            return
        # Get course's chapter list via DWR
        payload = {
            'callCount': 1,
            'scriptSessionId': '${scriptSessionId}' + str(random.randint(0, 200)),
            'httpSessionId': http_session_id,
            'c0-scriptName': 'CourseBean',
            'c0-methodName': 'getLastLearnedMocTermDto',
            'c0-id': 0,
            'c0-param0': term_id,
            'batchId': random.randint(1000000000000, 20000000000000)
        }
        cs_url = 'http://www.icourse163.org/dwr/call/plaincall/CourseBean.getLastLearnedMocTermDto.dwr'
        rdata = session.post(cs_url, data=payload, timeout=None).text
        if re.search(r"var s\d+={}", rdata):
            print("Generate Download information.")
            # Data-cleaning regexes for the DWR javascript payload
            week_reg = re.compile(r"s\d+.contentId=null;"
                                  r".+s\d+.lessons=(?P<lessons>s\d+)"
                                  r".+s\d+.name=\"(?P<week_name>.+?)\"")
            chapter_reg = re.compile(r"s\d+.chapterId=\d+;"
                                     r".+s\d+.name=\"(?P<chapter_name>.+?)\"")
            lesson_reg = re.compile(r"s\d+.anchorQuestions=(null|s\d+);"
                                    r".+s\d+.contentId=(?P<contentId>\d+)"
                                    r".+s\d+.contentType=(?P<contentType>\d+)"
                                    r".+s\d+.id=(?P<id>\d+)"
                                    r".+s\d+.name=\"(?P<lesson_name>.+?)\"")
            # count_list: running week/chapter names and per-chapter video counters
            week_list = []
            chapter_list = []
            video_in_chapter_list = []
            # NOTE(review): `rdata` is rebound inside this loop by the per-lesson
            # POST below; the splitlines() iterator was built from the chapter
            # payload so iteration is unaffected, but the error branch at the
            # bottom sees the *last lesson's* response — confirm intended.
            for line in rdata.splitlines():
                if re.match(week_reg, line):
                    # Week row
                    week_re = re.search(week_reg, line)
                    week_name = model.clean_filename(model.raw_unicode_escape(week_re.group("week_name")))
                    week_list.append(week_name)
                if re.match(chapter_reg, line):
                    # Chapter row
                    chapter_re = re.search(chapter_reg, line)
                    chapter_name = model.clean_filename(model.raw_unicode_escape(chapter_re.group("chapter_name")))
                    chapter_list.append(chapter_name)
                    print("\n", week_list[-1], chapter_list[-1])
                    video_in_chapter_list.append(0)
                if re.match(lesson_reg, line):
                    lesson_re = re.search(lesson_reg, line)
                    lesson_loc_pattern = model.generate_path([week_list[-1], chapter_list[-1]])
                    lesson_name = model.clean_filename(model.raw_unicode_escape(lesson_re.group("lesson_name")))
                    lesson_content_type = int(lesson_re.group("contentType"))
                    # prepare data and post the per-lesson DWR request
                    payload = {
                        'callCount': 1,
                        'scriptSessionId': '${scriptSessionId}' + str(random.randint(0, 200)),
                        'httpSessionId': http_session_id,
                        'c0-scriptName': 'CourseBean',
                        'c0-methodName': 'getLessonUnitLearnVo',
                        'c0-id': 1,
                        'c0-param0': lesson_re.group("contentId"),
                        'c0-param1': lesson_content_type,
                        'c0-param2': 0,
                        'c0-param3': lesson_re.group("id"),
                        'batchId': random.randint(1000000000000, 20000000000000)
                    }
                    cs_url = 'http://www.icourse163.org/dwr/call/plaincall/CourseBean.getLessonUnitLearnVo.dwr'
                    rdata = session.post(cs_url, data=payload, timeout=None).text
                    # Dispatch on contentType:
                    # 1 -> Video ,2 -> Test ,3 -> Docs ,4 -> Rich text ,5 -> Examination ,6 -> Discussion
                    if lesson_content_type == 1:
                        # Video: number it within the chapter for stable ordering
                        count = video_in_chapter_list[-1]
                        count_lesson_name = model.clean_filename("{0} {lesson}".format(count, lesson=lesson_name))
                        for k in ['mp4ShdUrl', 'mp4HdUrl', 'mp4SdUrl']:  # , 'flvShdUrl', 'flvHdUrl', 'flvSdUrl'
                            if re.search(r's\d+.{0}=".+?";'.format(k), rdata):
                                k_type = re.search("mp4(.+)Url", k).group(1)
                                video_file_name = "{0}.mp4".format(count_lesson_name)
                                if k_type != "Shd":
                                    # Lower qualities get the quality suffix in the name
                                    video_file_name = "{0}_{type}.mp4".format(count_lesson_name, type=k_type)
                                video_link = re.search(r's\d+.' + str(k) + r'="(.+?\.mp4).+?";', rdata).group(1)
                                video_file_path = model.generate_path([main_path, lesson_loc_pattern, video_file_name])
                                main_list.append((video_link, video_file_path))
                                print("视频: \"{name}\" \"{link}\"".format(name=video_file_name, link=video_link))
                                break
                        # Subtitle tracks (escaped names: \u4E2D\u6587 = Chinese, \u82F1\u6587 = English)
                        if config.Download_Srt:
                            srt_path = model.generate_path([main_path, "Srt", lesson_loc_pattern])
                            if re.search(r's\d+.name="\\u4E2D\\u6587";s\d+.url="(.+?)"', rdata):
                                # Chinese subtitles
                                srt_chs_re = re.search(r's\d+.name="\\u4E2D\\u6587";s\d+.url="(?P<url>.+?)"', rdata)
                                srt_file_name = "{0}.chs.srt".format(count_lesson_name)
                                srt_file_path = model.generate_path([srt_path, srt_file_name])
                                srt_chs_link = srt_chs_re.group("url")
                                print("字幕Chs: \"{name}\" \"{link}\"".format(name=srt_file_name, link=srt_chs_link))
                                srt_list.append((srt_chs_link, srt_file_path))
                            if re.search(r's\d+.name="\\u82F1\\u6587";s\d+.url="(.+?)"', rdata):
                                # English subtitles
                                # NOTE(review): uses `lesson_name` while the Chinese
                                # branch uses `count_lesson_name` — confirm intended.
                                srt_eng_re = re.search(r's\d+.name="\\u82F1\\u6587";s\d+.url="(?P<url>.+?)"', rdata)
                                srt_file_name = "{0}.eng.srt".format(lesson_name)
                                srt_file_path = model.generate_path([srt_path, srt_file_name])
                                srt_eng_link = srt_eng_re.group("url")
                                print("字幕Eng: \"{name}\" \"{link}\"".format(name=srt_file_name, link=srt_eng_link))
                                srt_list.append((srt_eng_link, srt_file_path))
                        video_in_chapter_list[-1] += 1
                    if lesson_content_type == 3 and config.Download_Docs:
                        # Documentation (PDF attached to the lesson)
                        doc_link = str(re.search(r'textOrigUrl:"(.+?)"', rdata).group(1))
                        doc_name = "{0}.pdf".format(lesson_name)
                        doc_path = model.generate_path([main_path, "Docs", lesson_loc_pattern])
                        doc_file_path = model.generate_path([doc_path, doc_name])
                        doc_list.append((doc_link, doc_file_path))
                        print("文档: \"{name}\" \"{link}\"".format(name=doc_name, link=doc_link))
            if config.Download:
                if config.Download_Method == "Aria2":
                    # Plain videos/images are handed to aria2
                    model.aira2_download(main_list)
                    # Session-bound / time-limited resources still use the built-in queue
                    model.download_queue(session, srt_list + doc_list, queue_length=config.Download_Queue_Length)
                else:
                    # Default: built-in download queue for everything
                    model.download_queue(session, main_list + srt_list + doc_list, queue_length=config.Download_Queue_Length)
        else:
            err_message = re.search(r'message:(.+)\}\)', rdata).group(1)
            print("Error:{0},Please make sure you login by 163-email "
                  "and your \"Session-Cookies\" pair is right.".format(err_message))
    else:
        print("No course Id,Please check!")
        return
def main(course_url):
    """Scrape a xuetangx.com course and download its videos/subtitles/handouts.

    Parses ``course_url`` for the course id, verifies the course exists and
    that the logged-in user has joined it, then walks the courseware tree
    collecting (url, local_path) pairs and finally hands them to aria2 or the
    built-in download queue.

    Args:
        course_url: a xuetangx.com ``courses/<id>/...`` URL.
    """
    config = model.config("settings.conf", "xuetangx")  # Loading config
    session = model.login(site="xuetangx", conf=config)
    course_id_search = re.search(r"courses/(?P<id>.+)/(courseware|info|discussion|wiki|progress|about)", course_url)
    # Download cache list: (link, local file path) tuples
    main_list = []
    srt_list = []
    doc_list = []
    if course_id_search:
        course_id = course_id_search.group("id")
        main_page = "http://www.xuetangx.com/courses/{course_id}".format(course_id=course_id)
        page_about_url = "{course_host}/about".format(course_host=main_page)
        page_about = session.get(url=page_about_url)
        if page_about.text.find("页面无法找到") == -1:
            # Course exists (the "page not found" marker is absent)
            page_about_bs = BeautifulSoup(page_about.text, "lxml")
            # load course info (title / school / teacher) from the about page
            course_detail_bs = page_about_bs.find("section", class_="courseabout_detail")
            course_name_tag = course_detail_bs.find("h3", class_="courseabout_title")
            course_title = model.clean_filename(course_name_tag.get_text())
            school = course_name_tag.find_next("a").get_text()
            teacher = model.sort_teacher(
                page_about_bs.find("ul", class_="teacher_info").find_all("span", class_="name"))
            folder = model.clean_filename('-'.join([course_title, school, teacher]))
            print("The Download INFO:\n"  # Output download course info
                  "link:{url}\nCourse:{folder}\nid:{id}\n".format(url=page_about_url, folder=folder,
                                                                  id=course_id))
            main_path = model.generate_path([config.Download_Path, folder])
            video_box = course_detail_bs.find('div', class_='video_box')
            try:
                # Intro video present: poster image + ccid-resolved video link
                info_img_link = model.link_check("http://www.xuetangx.com", video_box['data-poster'])
                info_video_link = get_video(session, video_box["data-ccid"])
                if info_video_link:
                    video_file_name = r"课程简介-{title}.mp4".format(title=course_title)
                    video_file_path = model.generate_path([main_path, video_file_name])
                    print("课程简介视频: {link}".format(link=info_video_link))
                    main_list.append((info_video_link, video_file_path))
            except KeyError:
                # No intro video attributes: fall back to the plain cover image
                info_img_link = model.link_check("http://www.xuetangx.com", video_box.img["src"])
            if info_img_link:
                img_file_name = r"课程封面图-{title}.jpg".format(title=course_title)
                img_file_path = model.generate_path([main_path, img_file_name])
                print("课程封面图: {link}".format(link=info_img_link))
                main_list.append((info_img_link, img_file_path))
        else:
            print("Not found this course in \"xuetangx.com\",Check Please")
            return
        # Fetch enrollment state and check the user actually joined the course
        page_courseware = session.get(url="{0}/courseware".format(main_page))
        if page_courseware.url.find("about") == -1 and page_courseware.url.find("login") == -1:
            # Courseware reached. Judged from the redirect target:
            # 1. logged in but not enrolled -> redirected to ../about
            # 2. not logged in (or bad password) -> redirected to
            #    http://www.xuetangx.com/accounts/login?next=..
            print("Generate Download information.")
            # Process the courseware page
            courseware_bs = BeautifulSoup(page_courseware.text, "lxml")
            chapter = courseware_bs.find_all("div", class_="chapter")
            for week in chapter:
                week_name = model.clean_filename(week.h3.a.string.strip())
                for lesson in week.ul.find_all("a"):
                    # Collect lesson info
                    lesson_name = model.clean_filename(lesson.p.string)  # main title
                    lesson_page = session.get(url="http://www.xuetangx.com{href}".format(href=lesson['href']),
                                              timeout=None)
                    lesson_bs = BeautifulSoup(lesson_page.text, "lxml")
                    # Map tab id -> tab title, used to name per-sequence subfolders
                    tab_list = {}
                    for tab in lesson_bs.find_all("a", role="tab"):
                        tab_list[tab.get('id')] = re.search("(.+)", tab.get('title')).group(1)
                    seq_contents = lesson_bs.find_all('div', class_="seq_contents")
                    print("\n", week_name, lesson_name)
                    # Count video sequences first to decide on subfolder layout
                    seq_video_content_len = 0
                    for seq in seq_contents:
                        if re.search(r"data-type=[\'\"]Video[\'\"]", seq.text):
                            seq_video_content_len += 1
                    for i, seq in enumerate(seq_contents):
                        seq_name = lesson_name
                        seq_path = model.generate_path([main_path, week_name])
                        srt_path = model.generate_path([main_path, "srt", week_name])
                        doc_path = model.generate_path([main_path, "docs", week_name])
                        if seq_video_content_len > 1:
                            # Only create a per-lesson subfolder when there is
                            # more than one video sequence
                            seq_name_raw = model.clean_filename(tab_list[seq.get("aria-labelledby")])
                            seq_name = r"{0} {1}".format(i, seq_name_raw)
                            seq_path = model.generate_path([seq_path, lesson_name])
                            srt_path = model.generate_path([srt_path, lesson_name])
                            doc_path = model.generate_path([doc_path, lesson_name])
                        if re.search(r"data-type=[\'\"]Video[\'\"]", seq.text):
                            # Video sequence
                            lesson_ccsource = re.search(r"data-ccsource=[\'\"](.+)[\'\"]", seq.text).group(1)
                            video_link = get_video(session, lesson_ccsource)
                            video_file_name = "{0}.mp4".format(seq_name)
                            # NOTE(review): `video_link.find` is a bound method,
                            # never equal to -1, so this branch is dead code.
                            # Likely intended `video_link.find(<marker>) == -1`
                            # to tag SD-only links — confirm and fix.
                            if video_link.find == -1:
                                video_file_name = "{0}_sd.mp4".format(seq_name)
                            video_file_path = model.generate_path([seq_path, video_file_name])
                            print("视频: \"{name}\" \"{link}\"".format(name=video_file_name, link=video_link))
                            main_list.append((video_link, video_file_path))
                        seq_bs = BeautifulSoup(seq.text, "lxml")
                        if config.Download_Srt and seq_bs.find("a", text="下载字幕"):
                            # Subtitles ("download subtitles" link present)
                            raw_link = seq_bs.find("a", text="下载字幕")["href"]
                            srt_link = model.link_check("http://www.xuetangx.com", raw_link)
                            srt_file_name = "{0}.srt".format(seq_name)
                            srt_file_path = model.generate_path([srt_path, srt_file_name])
                            print("字幕: \"{name}\" \"{link}\"".format(name=srt_file_name, link=srt_link))
                            srt_list.append((srt_link, srt_file_path))
                        if config.Download_Docs and seq_bs.find("a", text="下载讲义"):
                            # Handouts ("download handout" link present)
                            raw_link = seq_bs.find("a", text="下载讲义")["href"]
                            doc_link = model.link_check("http://www.xuetangx.com", raw_link)
                            doc_file_name = model.clean_filename(doc_link.split("/")[-1])
                            doc_file_path = model.generate_path([doc_path, doc_file_name])
                            print("文档: \"{name}\" \"{link}\"".format(name=doc_file_name, link=doc_link))
                            doc_list.append((doc_link, doc_file_path))
        else:
            # Not logged in successfully, or not enrolled in this course
            print("Something Error,You may not Join this course or Enter the wrong password.")
            return
        # Process the course handouts listed on the info page
        page_info = session.get(url="{0}/info".format(main_page))
        info_bs = BeautifulSoup(page_info.text, "lxml")
        doc_menu = info_bs.find("section", attrs={"aria-label": re.compile("讲义导航")})
        for each in doc_menu.find_all("a"):
            doc_name = each["href"].split("/")[-1]
            doc_link = model.link_check("http://www.xuetangx.com", each["href"])
            doc_file_path = model.generate_path([main_path, "docs", doc_name])
            print("文档: \"{name}\" \"{link}\"".format(name=doc_name, link=doc_link))
            doc_list.append((doc_link, doc_file_path))
        # Download step
        if config.Download:
            if config.Download_Method == "Aria2":
                # Plain files are handed to aria2
                model.aira2_download(main_list + doc_list)
                # Subtitles need the session (or expire), so keep the built-in queue
                model.download_queue(session, srt_list, queue_length=config.Download_Queue_Length)
            else:
                # Default: built-in download queue for everything
                model.download_queue(session, main_list + srt_list + doc_list,queue_length=config.Download_Queue_Length)
    else:
        print("No course Id,Please check!")
        return
def main():
    """Training entry point: build a config with batch_size=50 and
    num_steps=50, then run training for 1000 iterations."""
    run_conf = config()
    # Override the defaults used for this training run.
    for attr, value in (("batch_size", 50), ("num_steps", 50)):
        setattr(run_conf, attr, value)
    train(run_conf, 1000)
# NOTE(review): fragment — the first two prints belong to a function whose
# `def` line is outside this chunk; indentation is reconstructed.
    # Show the decoded ids and letters, dropping <PAD> tokens.
    print(' Word 编号: {}'.format([i for i in answer_logits if i != pad]))
    print(' Response Words: {}'.format(
        " ".join([target_int_to_letter[i] for i in answer_logits if i != pad])))


if __name__ == '__main__':
    with open(r'data/letters_source.txt', 'r', encoding='utf-8') as f:
        source_data = f.read()
    with open(r'data/letters_target.txt', 'r', encoding='utf-8') as f:
        target_data = f.read()
    print(source_data.split('\n')[:10])
    print(target_data.split('\n')[:10])
    # Build the character <-> id mapping tables
    source_int_to_letter, source_letter_to_int = extract_character_vocab(source_data)
    target_int_to_letter, target_letter_to_int = extract_character_vocab(target_data)
    # Convert each line of letters to id sequences; unknown chars map to <UNK>,
    # and every target sequence is terminated with <EOS>.
    source_int = [[source_letter_to_int.get(letter, source_letter_to_int['<UNK>'])
                   for letter in line] for line in source_data.split('\n')]
    target_int = [[target_letter_to_int.get(letter, target_letter_to_int['<UNK>'])
                   for letter in line] + [target_letter_to_int['<EOS>']] for line in target_data.split('\n')]
    # NOTE(review): this rebinds the name `config` from the class/factory to
    # an instance (and `model` below shadows the imported module name used
    # elsewhere in this paste) — works once, but fragile; consider renaming.
    config = config()
    model = Seq2Seq(config, target_letter_to_int, source_letter_to_int)
    train(config, model, source_int, target_int)