def get_fact():
    overall_start = time.time()
    qid = request.args.get('subject_id')
    pid = request.args.get('property_id')
    query = [construct_query(qid, pid)]

    if not os.path.exists('/tmp/data'):
        os.makedirs('/tmp/data')
    if not os.path.exists('/tmp/features'):
        os.makedirs('/tmp/features')
    if not os.path.exists('/tmp/predictions'):
        os.makedirs('/tmp/predictions')

    with open('/tmp/data/{}_test.json'.format(pid), 'w') as f:
        json.dump(query, f)

    # Prepare article and labels.
    fname = get_filename_for_article_id(
        query[0]['wikipedia_link'].split('wiki/')[-1])
    if not os.path.exists(os.path.join('data', 'wiki', fname)):
        print('Article does not exist, start downloading')
        get_article()
    if not os.path.exists(
            os.path.join('data', 'labels', '{}_labels.json'.format(pid))):
        print('Labels do not exist, start downloading')
        get_labels_data(relations=[pid])

    # Make prediction.
    start_time = time.time()
    print('[INFO] Prediction')
    make_predictions(model_path='/home/guo/model',
                     data_path='/tmp/data',
                     feature_path='/tmp/features')
    time_1 = time.time() - start_time

    # Pass to rankerNet.
    start_time = time.time()
    print('[INFO] rankerNet')
    call_ranker_net()
    time_2 = time.time() - start_time

    # Pass to enhanced_linker.
    start_time = time.time()
    print('[INFO] Linker')
    result = call_enhanced_linker(pid)
    time_3 = time.time() - start_time

    # Return the fact together with per-stage timings.
    result["runtime"] = {
        "time_predict": time_1,
        "time_rankerNet": time_2,
        "time_linker": time_3,
        "overall_": time.time() - overall_start
    }
    result["time"] = str(datetime.now())
    return jsonify(result)
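# A minimal standalone sketch (not part of the handler above): the three
# existence checks in get_fact() could equivalently be collapsed with
# os.makedirs(..., exist_ok=True), which is a no-op when the directory
# already exists. The paths are the ones used by get_fact().
import os

for path in ('/tmp/data', '/tmp/features', '/tmp/predictions'):
    os.makedirs(path, exist_ok=True)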
def do_parse(ruthless):
    try:
        html = deepcopy(self.html)
        for i in utils.tags(html, 'script', 'style'):
            i.drop_tree()
        for i in utils.tags(html, 'body'):
            i.set('id', 'readabilityBody')
        if ruthless:
            html = utils.remove_unlikely_candidates(html)
        html = utils.transform_misused_divs_into_paragraphs(html)
        candidates = utils.score_paragraphs(html)

        # First try to get an <article> element.
        article_node = utils.get_article_element(html)
        if article_node:
            best_candidate = article_node
        else:
            best_candidate = select_best_candidate(candidates)

        if best_candidate:
            # TODO: there was some logic here about retrying if the article
            # wasn't long enough.
            return utils.sanitize(
                utils.get_article(candidates, best_candidate), candidates)
        else:
            return None
    except Exception as e:
        log.exception('error getting summary: ')
        raise Unparseable(str(e)).with_traceback(sys.exc_info()[2])
def get_article_list(self, fout=None, order_by='added_at', max_get=1000000):
    '''Fetch this collection's article list (including read, like, comment and
    reward counts). order_by = {'added_at', 'likes_count'}
    '''
    articles_list = []
    page = 1
    num_id = self.get_collection_num_id()
    coll_name = self.get_collection_name()
    logger.info(u'Collection: %s' % coll_name)
    while True:
        url = (BASE_URL + '/collections/' + str(num_id) +
               '/notes?order_by=' + order_by + '&page=' + str(page))
        page += 1
        content = get_content(url)
        page_arts = get_article(content)
        for page_art in page_arts:
            art = Article(page_art['id'])
            title, text = art.get_article_text(delete_wrap=True)
            if fout is not None:
                fout.write(text + '\n')
        if len(page_arts) == 0 or len(articles_list) > max_get:
            logger.info(u'Collection %s: fetched %d articles in total'
                        % (coll_name, len(articles_list)))
            return articles_list
        articles_list.extend(page_arts)
        logger.info(u'Fetched %d articles so far' % len(articles_list))
def get_change_article(driver, meet_seq, rcp_no, cursor):
    try:
        # print('------------------------- Amendment of articles of incorporation -------------------------')
        # Articles-of-incorporation amendment tables
        # (j: cumulative-voting table, e: all other tables).
        try:
            jipjung_tb, jipjung_tb_tag, etc_tb, etc_tb_tag = get_article(
                driver)
        except:
            jipjung_tb = []
            jipjung_tb_tag = []
            etc_tb = []
            etc_tb_tag = []
        code_j_tb = get_article_code(jipjung_tb)
        code_e_tb = get_article_code(etc_tb)
        if check_empty_table(jipjung_tb) != 0:
            jipjung_aoi_db(meet_seq, rcp_no, jipjung_tb, jipjung_tb_tag,
                           cursor)
        if check_empty_table(etc_tb) != 0:
            etc_aoi_db(meet_seq, rcp_no, etc_tb, etc_tb_tag, cursor)
        info_logger.info('[7] change article of incorporation success.')
    except Exception as e:
        error_logger.error(
            '[7] change article of incorporation fail. [{0}] : {1}'.format(
                rcp_no, e))
def get_predictions(df: pd.DataFrame) -> List[dict]:
    model.to("cuda:0")
    model.eval()
    pred_rows = []
    for i, v in tqdm(df.iterrows(), total=len(df)):
        line = v["text"]
        if line == "[SKIP]":
            continue
        toks = tokenizer.tokenize(v["text"])
        ids = torch.tensor([tokenizer.convert_tokens_to_ids(toks)])
        article = get_article(v["article_path"]).lower()
        try:
            with torch.no_grad():
                ids = ids.to("cuda:0")
                preds, _ = model(ids, attention_mask=None)
                preds = preds.tolist()[0]
                spans = extract_pred_spans(preds)
        except:
            continue
        for span in spans:
            if span[0] == span[1]:
                pred_text = tokenizer.decode([ids[0][span[0]]])
            else:
                pred_text = tokenizer.decode(ids[0][span[0]:span[1]])
            pred_text = normalize_text(pred_text)
            if len(pred_text) < 4:
                continue
            # Gradually increase the fuzzy-match distance to prevent bad
            # matches for short spans.
            max_l_dists = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
            for dist in max_l_dists:
                match = find_near_matches(pred_text, article, max_l_dist=dist)
                if len(match) > 0:
                    pred_rows.append({
                        "article_id": v["article_id"],
                        "pred_text": pred_text,
                        "span_start": match[0].start,
                        "span_end": match[0].end
                    })
                    break
    return pred_rows
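# A standalone sketch of the fuzzy-matching fallback used in get_predictions,
# shown in isolation. It assumes the fuzzysearch package (find_near_matches)
# is installed; locate_span and the sample strings below are illustrative only.
from fuzzysearch import find_near_matches


def locate_span(pred_text, article, max_dist=9):
    # Try an exact match first, then widen the Levenshtein budget one step at
    # a time so short predicted spans are not mapped onto spurious matches.
    for dist in range(max_dist + 1):
        matches = find_near_matches(pred_text, article, max_l_dist=dist)
        if matches:
            return matches[0].start, matches[0].end
    return None


print(locate_span("crooked media", "the crokked media reported it first"))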
def get_bookmarks_articles(self, max_get=10000):
    '''Fetch the user's bookmarked articles. Requires login (COOKIE must be set).
    '''
    mark_url = BASE_URL + '/bookmarks?'
    page = 1
    bookmarks_list = []
    while True:
        url = mark_url + 'page=' + str(page)
        page += 1
        content = get_content(url, COOKIE)
        page_arts = get_article(content)
        if len(page_arts) == 0 or len(bookmarks_list) > max_get:
            logger.info('Fetched %d articles in total' % len(bookmarks_list))
            return bookmarks_list
        bookmarks_list.extend(page_arts)
def get_favourites_articles(self, max_get=100000):
    '''Fetch the articles the user has liked. Requires login (COOKIE must be set).
    '''
    liked_url = BASE_URL + '/favourites?type=notes'
    page = 1
    favourites_list = []
    while True:
        url = liked_url + '&page=' + str(page)
        page += 1
        content = get_content(url, COOKIE)
        page_arts = get_article(content)
        if len(page_arts) == 0 or len(favourites_list) > max_get:
            logger.info('Fetched %d articles in total' % len(favourites_list))
            return favourites_list
        favourites_list.extend(page_arts)
def get_article_list(self, order_by='latest', max_get=1000000):
    '''Fetch the full article list of this notebook. order_by: {top, latest}
    '''
    page = 1
    articles_list = []
    while True:
        url = (BASE_URL + '/notebooks/' + self.notebook_id + '/' + order_by +
               '?page=' + str(page))
        page += 1
        content = get_content(url)
        page_arts = get_article(content)
        if len(page_arts) == 0 or len(articles_list) > max_get:
            logger.info('Fetched %d articles in total' % len(articles_list))
            return articles_list
        articles_list.extend(page_arts)
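# A minimal, illustrative sketch (not part of the class above) of the shared
# "fetch pages until an empty page or max_get is exceeded" pattern used by the
# list methods here. get_content and get_article are the module's own helpers;
# build_url is a hypothetical callable mapping a page number to a URL.
def iter_paged_articles(build_url, max_get=1000, cookie=None):
    collected = []
    page = 1
    while True:
        content = (get_content(build_url(page)) if cookie is None
                   else get_content(build_url(page), cookie))
        page_arts = get_article(content)
        if len(page_arts) == 0 or len(collected) > max_get:
            return collected
        collected.extend(page_arts)
        page += 1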
def get_subscription_notes(self, max_get=100):
    '''Fetch the latest max_get articles from the followed collections.
    COOKIE must be set.
    '''
    article_list = []
    load_more_url = '/subscription_notes'
    while True:
        url = BASE_URL + load_more_url
        content = get_content(url, COOKIE)
        if content == 'FAIL':
            logger.warning(u'Request failed')
            return
        # logger.info(content)
        page_arts = get_article(content)
        soup = BeautifulSoup(content, 'lxml')
        load_more_url = soup.find('button',
                                  attrs={'class': 'ladda-button'})['data-url']
        logger.info(load_more_url)
        if len(page_arts) == 0 or len(article_list) >= max_get or (
                not load_more_url):
            logger.info(u'Fetched %d articles in total' % len(article_list))
            return article_list
        article_list.extend(page_arts)
binary_model = binary_model.to(device1)
span_model = span_model.to(device2)
binary_model.eval()
span_model.eval()
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

pred_rows = []
for i, v in dev.iterrows():
    line = v["text"]
    toks = tokenizer.tokenize(line)
    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(toks)])
    input_ids2 = torch.tensor([tokenizer.convert_tokens_to_ids(toks)])
    article = get_article(v["article_path"]).lower()
    with torch.no_grad():
        # Stage 1: the binary classifier decides whether the sentence contains
        # propaganda at all; only positive sentences reach the span model.
        input_ids = input_ids.to(device1)
        binary_pred = binary_model(input_ids)[0]
        if torch.argmax(binary_pred) == 1:
            input_ids = input_ids.to(device2)
            preds, log = span_model(input_ids)
        else:
            continue
    preds = preds[0].cpu().tolist()
    spans = extract_pred_spans(preds)
    if len(spans) == 0:
        continue
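# A tiny illustrative sketch of the two-stage gating above, using plain tensors
# instead of real models: a sentence-level prediction decides whether the
# token-level span output is consulted at all. Shapes and values are made up.
import torch

binary_logits = torch.tensor([[0.2, 1.3]])   # stand-in for binary_model output
token_logits = torch.randn(1, 12, 2)         # stand-in for span_model output

if torch.argmax(binary_logits, dim=-1).item() == 1:
    token_preds = torch.argmax(token_logits, dim=-1)[0].tolist()
else:
    token_preds = []
print(token_preds)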
from transformers import BertTokenizer
import pandas as pd
from utils import get_article, get_span, get_span_text, get_article_file, \
    get_task1_file, get_gold_spans, get_dev_article_file

# Generate dev dataset for prediction
dev_ids = []
with open("data/dev-task-TC-template.out") as f:
    for line in f.readlines():
        l = line.split("\t")
        dev_ids.append(l[0].strip())
dev_ids = list(set(dev_ids))

dev_articles_map = {}
dev_dat = []
for i in dev_ids:
    file = get_dev_article_file(i)
    article_text = get_article(file)
    for line in article_text.split("\n"):
        if len(line) < 4:
            continue
        dev_dat.append({"article_id": i, "text": line, "article_path": file})
    dev_articles_map[i] = article_text

dev_df = pd.DataFrame(dev_dat)
dev_df.to_csv("data/task1_dev.csv")

# # 1. Generate train labels
with open("data/train-task1-SI.labels", "r") as f:
    train_lines = f.readlines()
#%%
from utils import get_article
from os import listdir
from os.path import isfile, join
import re
import pandas as pd

mypath = "data/test-articles/"
test_articles = [mypath + f for f in listdir(mypath) if isfile(join(mypath, f))]
id_extractor = re.compile(r"\d+")

row = []
for path in test_articles:
    article_id = id_extractor.findall(path)[0]
    text = get_article(path)
    lines = text.split("\n")
    start_idx = 0
    for line in lines:
        # Track character offsets so predicted spans can later be mapped back
        # onto the original article text.
        end_idx = len(line) + start_idx + 2
        if len(line) > 1:
            row.append({"article_id": article_id,
                        "text": "[SKIP]" if len(line) < 1 else line.strip(),
                        "article_path": path,
                        "start_idx": start_idx,
                        "end_idx": end_idx})
        start_idx = end_idx + 1

df = pd.DataFrame(row)
t = pd.DataFrame({"file": train_files})
t["src"] = "train"
d = pd.DataFrame({"file": dev_files})
d["src"] = "dev"
tt = pd.DataFrame({"file": test_files})
tt["src"] = "test"
df = pd.concat([t, d])

# %%
from sklearn.model_selection import train_test_split

random_state = 1024
train, val = train_test_split(df, test_size=.15, random_state=random_state,
                              stratify=df["src"])

# %%
from utils import get_article

with open("data/lm_train_v2.txt", "w") as f:
    for x in train["file"]:
        f.write(get_article(x))

with open("data/lm_val.txt_v2", "w") as f:
    for x in val["file"]:
        f.write(get_article(x))

# %%