def matching_confidence(video, subtitle):
    """Compute the confidence that the subtitle matches the video.

    Returns a float between 0 and 1. 1 being the perfect match.
    """
    guess = guessit.guess_file_info(subtitle.release, "autodetect")
    video_keywords = utils.get_keywords(video.guess)
    subtitle_keywords = utils.get_keywords(guess) | subtitle.keywords
    # The score is a binary number: metadata matches occupy the high bits and
    # the shared-keyword count the low 3 bits, so a series/title match always
    # outweighs keyword overlap.
    fields = {"keywords": len(video_keywords & subtitle_keywords)}
    if isinstance(video, videos.Episode):
        fields.update({"series": 0, "season": 0, "episode": 0})
        pattern = "{series:b}{season:b}{episode:b}{keywords:03b}"
        perfect = pattern.format(series=1, season=1, episode=1,
                                 keywords=len(video_keywords))
        if guess["type"] in ("episode", "episodesubtitle"):
            if "series" in guess and guess["series"].lower() == video.series.lower():
                fields["series"] = 1
            if "season" in guess and guess["season"] == video.season:
                fields["season"] = 1
            if "episodeNumber" in guess and guess["episodeNumber"] == video.episode:
                fields["episode"] = 1
    elif isinstance(video, videos.Movie):
        fields.update({"title": 0, "year": 0})
        pattern = "{title:b}{year:b}{keywords:03b}"
        perfect = pattern.format(title=1, year=1,
                                 keywords=len(video_keywords))
        if guess["type"] in ("movie", "moviesubtitle"):
            if "title" in guess and guess["title"].lower() == video.title.lower():
                fields["title"] = 1
            if "year" in guess and guess["year"] == video.year:
                fields["year"] = 1
    else:
        # Unsupported video type: no confidence at all.
        return 0
    achieved = int(pattern.format(**fields), 2)
    return float(achieved) / float(int(perfect, 2))
def matching_confidence(video, subtitle):
    '''Compute the confidence that the subtitle matches the video.

    Returns a float between 0 and 1. 1 being the perfect match.
    '''
    guess = guessit.guess_file_info(subtitle.release, 'autodetect')
    video_keywords = utils.get_keywords(video.guess)
    subtitle_keywords = utils.get_keywords(guess) | subtitle.keywords
    shared = len(video_keywords & subtitle_keywords)
    if isinstance(video, videos.Episode):
        fmt = '{series:b}{season:b}{episode:b}{keywords:03b}'
        best = fmt.format(series=1, season=1, episode=1,
                          keywords=len(video_keywords))
        slots = dict(series=0, season=0, episode=0, keywords=shared)
        if guess['type'] in ['episode', 'episodesubtitle']:
            # (score slot, guess key, equality predicate) for each field.
            checks = (('series', 'series',
                       lambda g: g.lower() == video.series.lower()),
                      ('season', 'season', lambda g: g == video.season),
                      ('episode', 'episodeNumber', lambda g: g == video.episode))
            for slot, key, matches in checks:
                if key in guess and matches(guess[key]):
                    slots[slot] = 1
    elif isinstance(video, videos.Movie):
        fmt = '{title:b}{year:b}{keywords:03b}'
        best = fmt.format(title=1, year=1, keywords=len(video_keywords))
        slots = dict(title=0, year=0, keywords=shared)
        if guess['type'] in ['movie', 'moviesubtitle']:
            checks = (('title', 'title',
                       lambda g: g.lower() == video.title.lower()),
                      ('year', 'year', lambda g: g == video.year))
            for slot, key, matches in checks:
                if key in guess and matches(guess[key]):
                    slots[slot] = 1
    else:
        # Neither an episode nor a movie: cannot score it.
        return 0
    # The formatted binary string encodes the score; divide by the best
    # achievable score for a 0..1 confidence.
    return float(int(fmt.format(**slots), 2)) / float(int(best, 2))
def list(self, video, languages):
    """Return subtitle search results for *video* in the requested languages.

    Returns an empty list when no requested language is supported, the video
    is invalid for this provider, or the video type cannot be queried.
    """
    requested = languages & self.availableLanguages()
    if not requested:
        self.logger.debug(u'No language available')
        return []
    if not self.isValidVideo(video):
        self.logger.debug(u'Not a valid video')
        return []
    release = video.path or video.release
    keywords = get_keywords(video.guess)
    if isinstance(video, Episode):
        return self.query(release, requested, keywords,
                          series=video.series, season=video.season,
                          episode=video.episode)
    # Movies need a year for this provider's query to be meaningful.
    if isinstance(video, Movie) and video.year:
        return self.query(release, requested, keywords,
                          movie=video.title, year=video.year)
    return []
def filterchain(request, app, model, field, foreign_key_app_name,
                foreign_key_model_name, foreign_key_field_name, value,
                manager=None):
    """Return the filtered choices for a chained field as a JSON response."""
    target_model = get_model(app, model)
    filter_kwargs = get_keywords(field, value, m2m=is_m2m(target_model, field))
    # Honour the foreign key's limit_choices_to when building the queryset.
    choice_limit = get_limit_choices_to(foreign_key_app_name,
                                        foreign_key_model_name,
                                        foreign_key_field_name)
    matches = get_queryset(target_model, manager, choice_limit).filter(**filter_kwargs)
    # Sort here only when the model does not define a default ordering.
    if not getattr(target_model._meta, 'ordering', False):
        matches = list(matches)
        sort_results(matches)
    payload = json.dumps(serialize_results(matches))
    return HttpResponse(payload, content_type='application/json')
def filterchain_all(request, app, model, field, foreign_key_app_name,
                    foreign_key_model_name, foreign_key_field_name, value):
    """Returns filtered results followed by excluded results below."""
    target_model = get_model(app, model)
    filter_kwargs = get_keywords(field, value)
    # Honour the foreign key's limit_choices_to when building the queryset.
    choice_limit = get_limit_choices_to(foreign_key_app_name,
                                        foreign_key_model_name,
                                        foreign_key_field_name)
    queryset = get_queryset(target_model, limit_choices_to=choice_limit)
    matching = list(queryset.filter(**filter_kwargs))
    sort_results(matching)
    rest = list(queryset.exclude(**filter_kwargs))
    sort_results(rest)
    # Empty choice to separate filtered and excluded results.
    divider = {'value': "", 'display': "---------"}
    payload = serialize_results(matching) + [divider] + serialize_results(rest)
    return HttpResponse(json.dumps(payload), content_type='application/json')
def list(self, video, languages):
    """Return subtitle search results for *video* in the requested languages.

    Returns an empty list when no requested language is supported or the
    video is not valid for this provider.
    """
    wanted = languages & self.availableLanguages()
    if not wanted:
        self.logger.debug(u'No language available')
        return []
    if not self.isValidVideo(video):
        self.logger.debug(u'Not a valid video')
        return []
    return self.query(video.path or video.release, wanted,
                      get_keywords(video.guess),
                      video.series, video.season, video.episode)
def apply_(self, p1, p2, p3, p4, key):
    """Grab the question and answer regions from the screen, OCR them, look the
    question up in the local database, and click the chosen answer.

    :param p1, p2: screen coordinates bounding the question region
    :param p3, p4: screen coordinates bounding the answer region
    :param key: OCR key/credential forwarded to pic_handle
    """
    grab_screen(p1, p2, "data/pic_ques.png")
    grab_screen(p3, p4, "data/pic_ans.png")
    merge_pic("data/pic_ques.png", "data/pic_ans.png")
    # OCR the merged screenshot; r is the recognized text (falsy on failure).
    r = pic_handle("data/target_img.png", key)
    if r:
        res = r
        # print("words result: ", res)
        # Split the OCR text into the question and the candidate answers.
        ques_content, ans_content = handle_words(res)
        keywords = get_keywords(ques_content)
        data_res = search_related_records(keywords)
        print(data_res, "data_res", len(data_res))
        if len(data_res) >= 1:
            # The question already exists in the database.
            # Pick the stored record whose question text is most similar.
            real_ans, final_index = get_the_most_similar(
                ques_content, [r[1] for r in data_res])
            real_ans = data_res[final_index][2]
            pno = data_res[final_index][0]
            if real_ans:
                # A correct answer is on record: click the matching option.
                given_ans, index = get_the_most_similar(real_ans, ans_content)
                position: str = self.find_position_by_index(index+1)
                print(real_ans)
                if self.auto_apply:
                    mouse_click_(position)
            else:
                # No correct answer on record, but known wrong answers exist.
                wrong_ans = data_res[final_index][3]
                print("题目尚无正确答案,进行随即作答")
                index = auto_choose_answer(wrong_ans, ans_content)
                print("index: ", index)
                position = self.find_position_by_index(index)
                mouse_click_(position)
                wrong_ans = json.loads(wrong_ans)
                # Ask the operator whether the guess was right and update the DB.
                correct = sg.popup_yes_no("选择对了吗?", keep_on_top=True)
                if correct == "Yes":
                    if len(wrong_ans) < 4:
                        engine.update_or_insert(
                            pno=pno, ques=ques_content,
                            ans=ans_content[len(wrong_ans)])
                else:
                    if len(wrong_ans) < 4:
                        engine.update_or_insert(
                            pno=pno, ques=ques_content,
                            wrong_ans=[ans_content[len(wrong_ans)]])
        else:
            # Question not in the database: answer sequentially and record
            # whether the first option was right or wrong.
            print("题目可能未收录, 进行随机作答")
            real_ans = ""
            # index = self.auto_choose_answer("", ans_content)
            position = self.find_position_by_index(1)
            mouse_click_(position)
            correct = sg.popup_yes_no("选择对了吗?", keep_on_top=True)
            # Baidu OCR has a daily character quota, so confirmation is manual here.
            if correct == "Yes":
                engine.update_or_insert(ques=ques_content,
                                        ans=ans_content[0])  # first time apply 0
            else:
                engine.update_or_insert(ques=ques_content,
                                        wrong_ans=ans_content[:1])  # wrong_ans must be a list
    # NOTE(review): real_ans is only bound when r is truthy — this print would
    # raise NameError if OCR returned nothing; confirm intended placement.
    print(real_ans)
    self.start_btn.set_focus(force=True)
def test_spider():
    """Exercise the url crawler spider against every configured search API."""
    config = JsonConf.load('./../conf.json')
    api_names = config['url_api'].keys()
    fetcher = createInstance('url_crawler', 'UrlFetcher', max_repeat=2, sleep_time=1)
    parser = createInstance('url_crawler', 'UrlParser', max_deep=1)
    saver = createInstance('url_crawler', 'UrlSaver', config)
    proxieser = createInstance('url_crawler', 'UrlProxieser', sleep_time=1)
    web_spider = WebSpider(fetcher, parser, saver, proxieser, monitor_sleep_time=1)
    # Seed one start url per (keyword, api) pair.
    for word in get_keywords():
        for name in api_names:
            url = config['url_api'][name].format(word)
            web_spider.set_start_url(url, keys={
                'key': name,
                'replace': config['json_replace'][name],
                'depth': config['url_depth'][name],
                'need_proxy': config['need_proxy'][name]
            })
    web_spider.start_working(fetcher_num=10)
    web_spider.wait_for_finished()
    return
def create_lateral_pseudo_steps():
    """Build the lateral pseudo steps from the lateral spreadsheets and write them out."""
    src = files_folder.joinpath('lateral')
    element_data = pd.read_excel(src.joinpath('element_data.xlsx'), header=None)
    initial_gap = pd.read_excel(src.joinpath('initial_gap.xlsx'), header=None)
    behaviours = pd.read_excel(src.joinpath('surface_behav.xlsx'), header=None)
    # One 2-D array of behaviour rows per element group (grouped by column 0).
    behaviours = list(behaviours.ffill().groupby(0).apply(
        lambda df: df.iloc[:, 1:].values))
    assert len(behaviours) == initial_gap.shape[0] == 61
    count = len(behaviours)
    steps = []
    for idx in range(count):
        step = Step(get_keywords(src.joinpath('lateral_template.txt')))
        # Update *Element: give the ELSET a per-step suffix and the i'th data row.
        element_kw = step['Element']
        elset_name = element_kw.params['ELSET'][:-1] + str(idx + 1)
        element_kw.params['ELSET'] = elset_name
        element_kw.data = [element_data.iloc[idx].tolist()]
        # Update *GAP: same ELSET name, negated initial gap in the first datum.
        gap_kw = step['GAP']
        gap_kw.params['ELSET'] = elset_name
        gap_kw.data[0][0] = -initial_gap.iloc[idx, 0]
        # Interior elements get their last GAP datum doubled; the first and
        # last elements keep the template value.
        if 0 < idx < count - 1:
            gap_kw.data[0][-1] = float(gap_kw.data[0][-1]) * 2
        # Update *SURFACE BEHAVIOR with this element's behaviour table.
        step['SURFACE BEHAVIOR'].data = list(behaviours[idx])
        steps.append(step)
    write_steps(steps, results_folder.joinpath('lateral_pseudo_steps.txt'))
def generate_data(files):
    """Given a list of files, extract keyword-rich sentences and dump them to data.json.

    For every file, each sentence (as split by the ``nlp`` pipeline) that does
    not contain an interior newline and yields more than 3 keywords is stored
    as ``{"sentence", "keywords", "source"}``. The collected records are
    written to ``data.json`` in the working directory.

    :param files: iterable of file paths to process
    """
    data = []
    current_ids = 0
    for path in files:  # renamed from `file` to avoid shadowing the builtin
        # Use a context manager so the handle is closed promptly
        # (the original relied on the garbage collector).
        with open(path, 'r') as fh:
            text = fh.read()
        doc = nlp(text)
        for i, phrase in enumerate(doc.sents, current_ids):
            phrase = str(phrase)
            # Skip sentences with an interior newline (table/heading artifacts).
            if '\n' in phrase[:-2]:
                continue
            keywords = get_keywords(phrase)
            if len(keywords) > 3:
                data.append({"sentence": phrase,
                             "keywords": keywords,
                             "source": os.path.basename(path)})
                current_ids += 1
    with open('data.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
def create_end_pseudo_steps():
    """Build the 21 end pseudo steps from the end spreadsheets and write them out."""
    src = files_folder.joinpath('end')
    end_data = pd.read_excel(src.joinpath('element_data_end.xlsx'), header=None)
    area_data = pd.read_excel(src.joinpath('area_data.xlsx'), header=None)
    qz_curve = pd.read_excel(src.joinpath('qz_curve.xlsx'), header=None)
    steps = []
    for idx in range(21):
        step = Step(get_keywords(src.joinpath('end_template.txt')))
        # Update *Element: per-step ELSET suffix plus the i'th data row.
        element_kw = step['Element']
        elset_name = element_kw.params['ELSET'][:-1] + str(idx + 1)
        element_kw.params['ELSET'] = elset_name
        element_kw.data = [end_data.iloc[idx].tolist()]
        # Update *GAP: only the last datum is replaced with the area value.
        gap_kw = step['GAP']
        gap_kw.params['ELSET'] = elset_name
        gap_kw.data[0][-1] = area_data.iloc[idx, 0]
        # Update *SURFACE BEHAVIOR: every step shares the same q-z curve.
        step['SURFACE BEHAVIOR'].data = list(qz_curve.values)
        steps.append(step)
    write_steps(steps, results_folder.joinpath('end_pseudo_steps.txt'))
continue if (looking_concept in words): total = total + 1 concept = words[0] + " " + words[1] + " " + words[2] concept = masking(concept, looking_concept) mapped_concept = relation_mapping(words, concept) sentiment_dict = sid_obj.polarity_scores(mapped_concept) if sentiment_dict['compound'] >= 0.05: pos_count = pos_count + 1 elif sentiment_dict['compound'] <= -0.05: neg_count = neg_count + 1 else: neut_count = neut_count + 1 outfile.write(mapped_concept + " " + str(sentiment_dict['compound']) + "\n") write_results(outfile, looking_concept, total, neg_count, pos_count, neut_count) if __name__ == "__main__": all_concepts_long = [] infile = open("./../data/ConceptNet_data", "r") for line in infile.readlines(): all_concepts_long.append(line) key_words = get_keywords() sentiment_analyzer(key_words, all_concepts_long)
def onMessage(bot, update, user_data):
    """Telegram message handler for the Health Mate bot.

    Greets first-time users, lets them pick a disease to talk about, answers
    small talk, and otherwise looks the question up in the loaded disease data.
    Relies on module-level state: first, specified_disease, data,
    information_hospitals, users, diseases.
    """
    global first
    global specified_disease
    global data
    global information_hospitals
    global lat
    global lon
    # Track known chat ids; a returning chat means this is not a first contact.
    if update.message.chat.id not in users:
        users.append(update.message.chat.id)
    else:
        first = False
    if first:
        # First time, it displays a presentation message
        username = update.message.chat.first_name
        if username is None:
            username = "******"
        else:
            username = "******" + username
        first = False
        msg = tr2english(update.message.text.lower(), user_data)
        message1 = "Hi" + username + "! I'm your Health Mate. I'm here to inform you about rare diseases. Please don't hesitate to ask me any question you have about it, I'll try my best to answer your doubts using trustful sources. Remember that I'm just here to inform you; for anything else contact a professional."
        message2 = "First of all, I would like you to tell me which disease you want to find out about. Please, tell me the name of the disease."
        bot.send_message(chat_id=update.message.chat_id,
                         text=tr2other(message1, user_data['language']))
        bot.send_message(chat_id=update.message.chat_id,
                         text=tr2other(message2, user_data['language']))
        specified_disease = False
        return
    else:
        # Translate the incoming text to English and extract its keywords.
        msg = tr2english(update.message.text.lower(), user_data)
        keywords = get_keywords(msg)
        if specified_disease == False:
            # The user still has to name a disease; try to match one.
            for i in diseases:
                if i in msg:
                    # Load the knowledge base and hospital list for the disease.
                    data = read_book("main_data_" + i + ".json")
                    information_hospitals = read_book(
                        'hospitals_information_' + i + '.json')
                    message = "Perfect!. I understand you asked about " + i + ". From now on we will be talking about this disease in particular."
                    message2a = "If I did not understand it wel, or you wish to change the subject, you can type"
                    message2b = "and I will ask again."
                    bot.send_message(chat_id=update.message.chat_id,
                                     text=tr2other(message, user_data['language']))
                    bot.send_message(
                        chat_id=update.message.chat_id,
                        text=tr2other(message2a, user_data['language']) +
                        ' /disease ' +
                        tr2other(message2b, user_data['language']))
                    bot.send_message(
                        chat_id=update.message.chat_id,
                        text=tr2other(
                            'You can ask me any question related to this disease.',
                            user_data['language']))
                    bot.send_message(
                        chat_id=update.message.chat_id,
                        text=tr2other(
                            'You can also send me your location and I will find the closest hospital where this disease can be diagnosed and treated.',
                            user_data['language']))
                    bot.send_message(
                        chat_id=update.message.chat_id,
                        text=tr2other('You can type in', user_data['language']) +
                        ' /help ' +
                        tr2other('To know about all the text commands.',
                                 user_data['language']))
                    specified_disease = True
                    return
                else:
                    # NOTE(review): this else belongs to the first loop
                    # iteration, so only diseases[0] is effectively checked
                    # before the apology — confirm against the original file.
                    bot.send_message(
                        chat_id=update.message.chat_id,
                        text=tr2other(
                            "Sorry, I did not understand you. Please try again. The diseases I have information about are: "
                            + diseases[0] + '.',
                            user_data['language']))
                    return
        elif 'hello' in keywords or 'hi' in keywords or 'greetings' in keywords:
            bot.send_message(chat_id=update.message.chat_id,
                             text=tr2other("Hello!", user_data['language']))
        elif 'bye' in keywords or 'goodbye' in keywords:
            bot.send_message(chat_id=update.message.chat_id,
                             text=tr2other("Goodbye! Thanks for trusting me.",
                                           user_data['language']))
        elif 'thank' in keywords or 'appreciate' in keywords or 'thanks' in keywords:
            bot.send_message(
                chat_id=update.message.chat_id,
                text=tr2other(
                    "You are welcome! I am always trying to give my best",
                    user_data['language']))
        else:
            # Free-form question: search the loaded disease data for an answer.
            message = tr2other('Okay, give me some time to think about it...',
                               user_data['language'])
            bot.send_message(chat_id=update.message.chat_id, text=message,
                             parse_mode=telegram.ParseMode.MARKDOWN)
            info = process_message(msg, data)
            if info:
                source_message = f"In case you want more information about it, the information I found comes from this source:\n `{info['URL']}` \n From the section *{info['title']}*"
                answer = f"Alright! I found the following information: \n \n {info['text']}"
                bot.send_message(chat_id=update.message.chat_id,
                                 text=tr2other(answer, user_data['language']),
                                 parse_mode=telegram.ParseMode.MARKDOWN)
                bot.send_message(chat_id=update.message.chat_id,
                                 text=tr2other(source_message,
                                               user_data['language']),
                                 parse_mode=telegram.ParseMode.MARKDOWN)
            else:
                error_msg = 'I am sorry, but I cannot answer this properly. You can try asking it a different way or contact a specialist for further information'
                bot.send_message(chat_id=update.message.chat_id,
                                 text=tr2other(error_msg, user_data['language']),
                                 parse_mode=telegram.ParseMode.MARKDOWN)
def analysis(fpath: str, extname, imgdir=None, do_drawings=False):
    """Extract text (and images) from a document and run keyword analysis.

    :param fpath: path to the document
    :param extname: file extension including the dot (e.g. '.docx', '.pdf')
    :param imgdir: directory to extract embedded images into
    :param do_drawings: when True, also handle CAD formats ('.dxf', '.dwg')
    :return: (keywords, 'kw:freq' pairs, phrases, new words, summary,
              images, drawings) — all the string fields comma-joined
    """
    content = None
    images = []
    # drawings = []
    kw_arr = []
    freq_arr = []
    ph_arr = []
    nw_arr = []
    sum_arr = []
    # if not do_drawings:
    if True:
        # Dispatch text/image extraction on the file extension.
        if extname == '.txt':
            content = readtxt.read(fpath)
        if extname == '.docx':
            content = readword.readtxt(fpath)
            images = readword.readimg(fpath, imgdir, str(uuid.uuid4()))
        if extname == '.doc':
            # .doc is expected to have been converted to .docx alongside it.
            content = readword.readtxt(fpath + 'x')
            images = readword.readimg(fpath + 'x', imgdir, str(uuid.uuid4()))
        if extname == '.pptx':
            content = readppt.readtxt(fpath)
            images = readppt.readimg(fpath, imgdir, str(uuid.uuid4()))
        if extname == '.ppt':
            content = readppt.readtxt(fpath + 'x')
            images = readppt.readimg(fpath + 'x', imgdir, str(uuid.uuid4()))
        if extname == '.pdf':
            content = readpdf.readtext(fpath)
    drawings = None
    do_split_drawing = False
    if do_drawings:
        if extname == '.dxf':
            content = readdxf.readtxt(fpath)
            if do_split_drawing:
                drawings = readdxf.split_drawing_byblock(fpath)
        if extname == '.dwg':
            # Poll for an externally produced .dxf conversion of the .dwg.
            maxtry = 30
            transpath = fpath.replace('.dwg', '.dxf')
            for ii in range(maxtry):
                print(ii)
                time.sleep(3)
                if os.path.isfile(transpath):
                    content = readdxf.readtxt(transpath)
                    if do_split_drawing:
                        drawings = readdxf.split_drawing_byblock(fpath)
                    break
    # NOTE(review): archive handling reconstructed at function level — confirm
    # indentation against the original file.
    if extname == '.rar':
        content = readrar.readrar(fpath, rm_prefix=True, maxnames=10)
    if extname == '.zip':
        content = readrar.readzip(fpath, rm_prefix=True, maxnames=10)
    # do analysis
    if content is not None:
        # too long!!!
        # Cap the amount of text analysed at roughly max_words words by
        # truncating the paragraph list proportionally.
        total_words_count = len(' '.join(content))
        total_paragraph_count = len(content)
        max_words = 50000
        if total_words_count > max_words:
            paragraph_limit = math.ceil(
                max_words / total_words_count * total_paragraph_count)
            content = content[:paragraph_limit]
            print('limit paragraphs ' + str(paragraph_limit))
            print('limit words ' + str(len(' '.join(content))))
        # key words
        kw_arr = utils.get_keywords(content, config.kw_topk)
        # word frequency array (frequency per keyword; 0 when missing)
        freq = utils.get_freq(content)
        freq_arr = list(map(lambda x: str(freq[x]) if x in freq else 0, kw_arr))
        # key phrases
        ph_arr = utils.get_phrase(content, n=10)
        # new words
        if not extname == '.dwg':
            nw_arr = utils.get_newwords(content, n=20)
        # auto summary (archives: the name listing itself is the summary)
        if extname == '.rar' or extname == '.zip':
            sum_arr = content
        else:
            sum_arr = utils.get_summary(content, n=10)
    # give keywords to images
    # ['fname', 'keywords', 'relatedtxt']
    makeparam = {}
    if images:
        for cimg in images:
            # cimg['keywords'] = ','.join(utils.get_keywords([cimg['relatedtxt']], config.kw_topk_image))
            makeparam[cimg['fname']] = cimg['relatedtxt']
        # Batch keyword extraction for all images at once.
        kwdic = utils.get_keywordsmany(makeparam, config.kw_topk_image)
        for cimg in images:
            cimg['keywords'] = ','.join(kwdic[cimg['fname']][0])
            cimg['newwords'] = ','.join(kwdic[cimg['fname']][1])
            cimg['docname'] = fpath
    return (
        ','.join(kw_arr),
        # ','.join(freq_arr),
        ','.join([x + ':' + y for x, y in zip(kw_arr, freq_arr)]),
        ','.join(ph_arr),
        ','.join(nw_arr),
        sum_arr,
        images,
        drawings)
def run(self):
    """Crawl the configured search API for every keyword using a WebSpider.

    Builds fetcher/parser/saver (and optionally a proxy provider) for this
    site (`self.name`), seeds one start url per keyword, then runs the spider
    to completion.
    """
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s\t%(levelname)s\t%(message)s")
    need_proxy = self.config.getStr('need_proxy', self.name)
    fetcher = createInstance('url_crawlers', self.name + 'UrlFetcher',
                             max_repeat=2, sleep_time=1)
    parser = createInstance('url_crawlers', self.name + 'UrlParser', max_deep=1)
    saver = createInstance('url_crawlers', self.name + 'UrlSaver')
    # Only build a proxy provider when the config flags this site as needing one.
    if need_proxy == '1':
        proxieser = createInstance('url_crawlers', self.name + 'UrlProxieser',
                                   sleep_time=1)
    else:
        proxieser = None
    # initial web_spider
    web_spider = WebSpider(self.name, fetcher, parser, saver, proxieser,
                           monitor_sleep_time=5)
    keywords = get_keywords()
    # urls = []
    api = self.config.getStr('url_api', self.name)
    for i in keywords:
        url = api.format(i)
        # Seed the spider with the request metadata the fetcher needs
        # (mtop.taobao.wsearch.h5search query parameters).
        web_spider.set_start_url(
            url,
            keys={
                'website': self.name,
                'keyword': i,
                'params': {
                    'jsv': '2.3.16',
                    'appKey': '12574478',
                    't': None,
                    'sign': None,
                    'api': 'mtop.taobao.wsearch.h5search',
                    'v': '1.0',
                    'H5Request': 'true',
                    'ecode': '1',
                    'type': 'jsonp',
                    'dataType': 'jsonp',
                    'callback': 'mtopjsonp1',
                    'data': '{{"q":"{0}","search":"提交","tab":"{2}","sst":"1","n":20,"buying":"buyitnow","m":"api4h5","token4h5":"","abtest":"29","wlsort":"29","page":{1}}}'
                    .format(i, 1, 'all' if self.name == 'TaoBao' else 'mall')
                }
            })
    web_spider.start_working(fetcher_num=2)
    # wait for finished
    web_spider.wait_for_finished()
# Package Imports import authorization import utils import settings # Help Text helpTxt = settings.get_actions()['/newevent']['helpTxt'] # Candidate Worksheets sheetNames = ['Socials', 'Professional Events'] socialSheet, profSheet = authorization.get_sheet_objects(sheetNames) # Keywords - Type attribute keywords = ['social', 'prof'] socials, profs = utils.get_keywords(keywords) # Row values on spreadsheet for gspread to retrieve correct values NAME_ROW = 2 PWD_ROW = 1 """ Add event into the Candidate Tracker spreadsheet @params: event, name, pwd - attributes needed to insert into spreadsheet @return: err - if err is None then successful add """ def add_event(event, name, pwd): # Figure out exact event worksheet worksheet = None if event == 'social':
def get_first_summaries(text, stopwords, model):
    """
    :param text: the document
    :param stopwords: stop words
    :param model: word-embedding model
    :return: summary candidates as [(sentence, weight), ...] sorted by weight,
             descending, truncated to GlobalParameters.first_num entries
    """
    # (position, sentence) pairs
    sentences = utils.get_sentences(text)
    # plain sentence list
    sen_lis = [x[1] for x in sentences]
    # document vector
    docvec = generate_vector.doc_vector(text, stopwords, model)
    # sentence vectors, boosting the first and last sentences by position
    sen_vecs = []
    for i in range(len(sen_lis)):
        # first sentence
        if i == 0:
            sen_vecs.append(
                generate_vector.sentence_vector(sen_lis[i], stopwords, model) *
                GlobalParameters.locFirst_weight)
        # last sentence
        elif i == len(sen_lis) - 1:
            sen_vecs.append(
                generate_vector.sentence_vector(sen_lis[i], stopwords, model) *
                GlobalParameters.locLast_weight)
        # interior sentences
        else:
            sen_vecs.append(
                generate_vector.sentence_vector(sen_lis[i], stopwords, model))
    # cosine similarity of each sentence to the document
    cos_lis = [utils.cos_dist(docvec, x) for x in sen_vecs]
    # keyword weight per sentence
    keywords = utils.get_keywords(text)
    keyweights = [utils.keyword_weight(x, keywords) for x in sen_lis]
    # length weight per sentence
    len_weigths = [utils.len_weight(x) for x in sen_lis]
    # BUG FIX: the original triple-nested comprehension produced the cartesian
    # product of the three weight lists (n**3 values), so each sentence was
    # zipped with an unrelated weight. Combine the weights element-wise instead.
    final_weights = [
        cos * keyword * length
        for cos, keyword, length in zip(cos_lis, keyweights, len_weigths)
    ]
    # pair each sentence with its final weight
    final_lis = []
    for sen, weight in zip(sen_lis, final_weights):
        final_lis.append((sen, weight))
    # sort by weight, highest first
    final_lis = sorted(final_lis, key=lambda x: x[1], reverse=True)
    # keep only the number of sentences requested for the first-pass summary
    final_lis = final_lis[:GlobalParameters.first_num]
    return final_lis
# NOTE(review): these three accessors take `self` and read instance state, so
# they appear to belong to a class (presumably Ocorrencia) whose header is not
# visible in this chunk — confirm against the original file.
def get_keyword(self):
    # keyword that was matched in the record
    return self.keyword

def get_num_linha(self):
    # line number where the keyword occurred
    return self.num_linha

def get_sentenca(self):
    # sentence extracted around the keyword
    return self.sentenca

# Walk every medical record. For each record, check for occurrences of the
# keywords and keep the sentences in which the keywords were found.
prontuarios = utils.get_prontuarios()
keywords = utils.get_keywords()
lista_ocorrencias = []
for prontuario in prontuarios:
    for keyword in keywords:
        linhas_do_prontuario = utils.get_linhas_arq_texto(prontuario)
        for num_da_linha in range(len(linhas_do_prontuario)):
            linha = linhas_do_prontuario[num_da_linha]
            sentenca = utils.extrair_sentenca(linha, keyword)
            if sentenca:
                ocorrencia = Ocorrencia(prontuario, num_da_linha, keyword,
                                        sentenca)
                lista_ocorrencias.append(ocorrencia)
# Generate the report
relatorio.gerar(prontuarios, keywords, lista_ocorrencias)
# Build the keyword & hint tables per news category and insert them into the DB.
# NOTE(review): this chunk appears truncated — the tuple `a` built at the end
# is never used here; confirm against the original file.
print(datetime.now().strftime("%m/%d/%Y, %H:%M:%S | ") + 'Making Keyword & Hint table ~')
# Top keywords per category.
keywords = news.groupby('category').apply(get_keywords)
t = tqdm(total=25)
for category, keywordss in tqdm(keywords.iteritems()):
    # insert keywords
    db.insert_keywords(keywordss, category, date, time)
    news_cat = news[news['category']==category]
    for keyword in keywordss:
        # news for keyword
        news_keyword = news_cat[news_cat['text'].str.contains(keyword)]
        n = len(news_keyword)
        # augment with the same number of externally searched articles
        news_keyword = pd.concat([news_keyword,
                                  get_search_news(keyword, news=n).iloc[:n]],
                                 axis=0)
        # hints: top nouns from the combined articles, minus the keyword itself
        hints = get_keywords(news_keyword, only_noun=True, n=10)
        if keyword in hints:
            hints.remove(keyword)
        hints = hints[:9]
        # insert hints
        keywordID = db.keywordID(keyword, db.categoryID(category))
        db.insert_hints(keywordID, hints)
        # post
        categoryID = db.categoryID(category)
        for i in range(len(news_keyword)):
            postTitle = news_keyword.iloc[i]['title']
            postDetail = news_keyword.iloc[i]['text']
            userID = 'test1'
            a = (postTitle, postDetail, userID, keywordID, categoryID, date)
# Build the keyword & hint tables per news category and insert them into the DB.
print(datetime.now().strftime("%m/%d/%Y, %H:%M:%S | ") + 'Making Keyword & Hint table ~')
# Top keywords per category.
keywords = news.groupby('category').apply(get_keywords)
t = tqdm(total=25)
for category, keywordss in tqdm(keywords.iteritems()):
    db.insert_keywords(keywordss, category, date, time)
    news_cat = news[news['category'] == category]
    for keyword in keywordss:
        # Articles in this category mentioning the keyword, augmented with the
        # same number of externally searched articles.
        news_keyword = news_cat[news_cat['text'].str.contains(keyword)]
        n = len(news_keyword)
        news_search = get_search_news(keyword, news=n).iloc[:n]
        # Hints: top nouns from the combined articles, minus the keyword itself,
        # capped at 9.
        hints = get_keywords(pd.concat([news_keyword, news_search], axis=0),
                             only_noun=True, n=10)
        if keyword in hints:
            hints.remove(keyword)
        hints = hints[:9]
        keywordID = db.keywordID(keyword, db.categoryID(category))
        db.insert_hints(keywordID, hints)
    t.update(1)
t.close()
print('\033[92m' + 'COMPLETE!' + '\033[0m')
db.close()
print(datetime.now().strftime("%m/%d/%Y, %H:%M:%S | ") + '\033[94m' + 'Bye~' + '\033[0m')