Example 1
def get_fact():
    overall_start = time.time()
    qid = request.args.get('subject_id')
    pid = request.args.get('property_id')
    query = [construct_query(qid, pid)]
    for d in ('/tmp/data', '/tmp/features', '/tmp/predictions'):
        os.makedirs(d, exist_ok=True)

    with open('/tmp/data/{}_test.json'.format(pid), 'w') as f:
        json.dump(query, f)

    # prepare article and label.
    fname = get_filename_for_article_id(
        query[0]['wikipedia_link'].split('wiki/')[-1])
    if not os.path.exists(os.path.join('data', 'wiki', fname)):
        print('Article does not exist, start downloading')
        get_article()
    if not os.path.exists(
            os.path.join('data', 'labels', '{}_labels.json'.format(pid))):
        print('Label does not exist, start downloading')
        get_labels_data(relations=[pid])

    # make prediction
    start_time = time.time()
    print('[INFO] Prediction')
    make_predictions(model_path='/home/guo/model',
                     data_path='/tmp/data',
                     feature_path='/tmp/features')
    time_1 = (time.time() - start_time)

    # pass to rankerNet
    start_time = time.time()
    print('[INFO] rankerNet')
    call_ranker_net()
    time_2 = (time.time() - start_time)

    # pass to enhanced_linker
    start_time = time.time()
    print('[INFO] Linker')
    result = call_enhanced_linker(pid)
    time_3 = time.time() - start_time

    # return fact
    result["runtime"] = {
        "time_predict": time_1,
        "time_rankerNet": time_2,
        "time_linker": time_3,
        "overall_": time.time() - overall_start
    }
    result["time"] = str(datetime.now())
    return jsonify(result)
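The endpoint reads subject_id and property_id from the query string and returns the linked fact plus per-stage timings as JSON. A minimal client sketch, assuming a hypothetical /get_fact route on a local Flask server (the route path, host and port are not shown in the snippet):

import requests

# Hypothetical route and port; the snippet above does not show the
# @app.route decorator, so both are assumptions.
resp = requests.get(
    'http://localhost:5000/get_fact',
    params={'subject_id': 'Q42', 'property_id': 'P69'},  # example Wikidata-style IDs
)
result = resp.json()
print(result['time'], result['runtime']['overall_'])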
Example 2
        def do_parse(ruthless):
            try:
                html = deepcopy(self.html)
                for i in utils.tags(html, 'script', 'style'):
                    i.drop_tree()
                for i in utils.tags(html, 'body'):
                    i.set('id', 'readabilityBody')
                if ruthless:
                    html = utils.remove_unlikely_candidates(html)
                html = utils.transform_misused_divs_into_paragraphs(html)

                candidates = utils.score_paragraphs(html)

                # first try to get an article
                article_node = utils.get_article_element(html)
                if article_node:
                    best_candidate = article_node
                else:
                    best_candidate = select_best_candidate(candidates)

                if best_candidate:
                    # TODO: there was some logic here about retrying if the article wasn't long enough
                    return utils.sanitize(utils.get_article(candidates, best_candidate), candidates)
                else:
                    return None
            except StandardError, e:
                log.exception('error getting summary: ')
                raise Unparseable(str(e)), None, sys.exc_info()[2]
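This snippet uses Python 2 syntax (except StandardError, e and the three-expression raise). A sketch of the equivalent error handling ported to Python 3, assuming the same Unparseable exception and log object from the original module:

import sys

try:
    ...  # parsing logic as in do_parse() above
except Exception as e:
    log.exception('error getting summary: ')
    # Python 3 replacement for `raise Unparseable(str(e)), None, sys.exc_info()[2]`:
    # re-raise as Unparseable while preserving the original traceback.
    raise Unparseable(str(e)).with_traceback(sys.exc_info()[2])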
Example 3
    def get_article_list(self,
                         fout=None,
                         order_by='added_at',
                         max_get=1000000):
        '''Fetch this collection's article list (including read, like, comment and reward counts).
        order_by = {'added_at', 'likes_count'}
        '''
        articles_list = []
        page = 1
        num_id = self.get_collection_num_id()
        coll_name = self.get_collection_name()
        logger.info(u'Collection: %s' % coll_name)
        while True:
            url = BASE_URL + '/collections/' + str(
                num_id) + '/notes?order_by=' + order_by + '&page=' + str(page)
            page += 1
            content = get_content(url)
            page_arts = get_article(content)

            for page_art in page_arts:
                art = Article(page_art['id'])
                title, text = art.get_article_text(delete_wrap=True)
                if fout is not None:
                    fout.write(text + '\n')

            if len(page_arts) == 0 or len(articles_list) > max_get:
                logger.info(u'Collection %s: fetched %d articles in total' %
                            (coll_name, len(articles_list)))
                return articles_list
            articles_list.extend(page_arts)
            logger.info(u'Fetched %d articles so far' % len(articles_list))
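A minimal usage sketch for this method; the Collection class name and its constructor argument are assumptions, since the snippet only shows the method itself:

# Hypothetical caller; Collection('slug') is an assumed constructor.
coll = Collection('programmer')
with open('collection_articles.txt', 'w', encoding='utf-8') as fout:
    articles = coll.get_article_list(fout=fout, order_by='likes_count', max_get=500)
print('%d article entries collected' % len(articles))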
Example 4
def get_change_article(driver, meet_seq, rcp_no, cursor):
    try:
        #print('------------------------- Amendment of articles of incorporation -------------------------')
        # Articles-of-incorporation changes (j: cumulative-voting table, e: other tables)
        try:
            jipjung_tb, jipjung_tb_tag, etc_tb, etc_tb_tag = get_article(
                driver)
        except Exception:
            jipjung_tb = []
            jipjung_tb_tag = []
            etc_tb = []
            etc_tb_tag = []

        code_j_tb = get_article_code(jipjung_tb)
        code_e_tb = get_article_code(etc_tb)

        if check_empty_table(jipjung_tb) != 0:
            jipjung_aoi_db(meet_seq, rcp_no, jipjung_tb, jipjung_tb_tag,
                           cursor)
        if check_empty_table(etc_tb) != 0:
            etc_aoi_db(meet_seq, rcp_no, etc_tb, etc_tb_tag, cursor)

        info_logger.info('[7] change of articles of incorporation succeeded.')
    except Exception as e:
        error_logger.error(
            '[7] change of articles of incorporation failed. [{0}] : {1}'.format(
                rcp_no, e))
Example 5
def get_predictions(df: pd.DataFrame) -> List[dict]:
    model.to("cuda:0")
    model.eval()

    pred_rows = []
    for i, v in tqdm(df.iterrows(), total=len(df)):

        line = v["text"]

        if line == "[SKIP]":
            continue

        toks = tokenizer.tokenize(v["text"])
        ids = torch.tensor([tokenizer.convert_tokens_to_ids(toks)])
        article = get_article(v["article_path"]).lower()

        try:
            with torch.no_grad():
                ids = ids.to("cuda:0")
                preds, _ = model(ids, attention_mask=None)

            preds = preds.tolist()[0]
            spans = extract_pred_spans(preds)
        except Exception:
            continue

        for span in spans:
            if span[0] == span[1]:
                pred_text = tokenizer.decode([ids[0][span[0]]])
            else:
                pred_text = tokenizer.decode(ids[0][span[0]:span[1]])
            pred_text = normalize_text(pred_text)

            if len(pred_text) < 4:
                continue

            # Start with an exact match and gradually loosen the fuzzy-match
            # distance, so that short spans are not matched too loosely.
            for dist in range(10):
                match = find_near_matches(pred_text, article, max_l_dist=dist)
                if len(match) > 0:
                    pred_rows.append({
                        "article_id": v["article_id"],
                        "pred_text": pred_text,
                        "span_start": match[0].start,
                        "span_end": match[0].end
                    })
                    break
    return pred_rows
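The escalating-distance loop above relies on find_near_matches from the fuzzysearch package, which returns match objects carrying start and end offsets. A standalone sketch of that call on made-up strings:

from fuzzysearch import find_near_matches

article = 'the quick brown fox jumps over the lazy dog'
pred_text = 'quick brwn fox'  # one character off from the article text

# Try an exact match first, then progressively looser Levenshtein distances.
for dist in range(4):
    matches = find_near_matches(pred_text, article, max_l_dist=dist)
    if matches:
        m = matches[0]
        print(dist, m.start, m.end, article[m.start:m.end])
        break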
Example 6
 def get_bookmarks_articles(self, max_get=10000):
     '''Fetch the user's bookmarked articles; requires login (set the cookie).
     '''
     mark_url = BASE_URL + '/bookmarks?'
     page = 1
     bookmarks_list = []
     while True:
         url = mark_url + 'page=' + str(page)
         page += 1
         content = get_content(url, COOKIE)
         page_arts = get_article(content)
         if len(page_arts) == 0 or len(bookmarks_list) > max_get:
             logger.info('Fetched %d articles in total' % len(bookmarks_list))
             return bookmarks_list
         bookmarks_list.extend(page_arts)
Example 7
 def get_favourites_articles(self, max_get=100000):
     '''Fetch the user's liked articles; requires login (set the cookie).
     '''
     liked_url = BASE_URL + '/favourites?type=notes'
     page = 1
     favourites_list = []
     while True:
         url = liked_url + '&page=' + str(page)
         page += 1
         content = get_content(url, COOKIE)
         page_arts = get_article(content)
         if len(page_arts) == 0 or len(favourites_list) > max_get:
             logger.info('Fetched %d articles in total' % len(favourites_list))
             return favourites_list
         favourites_list.extend(page_arts)
Example 8
 def get_article_list(self, order_by='latest', max_get=1000000):
     '''Fetch the list of all articles in this notebook. order_by: {top, latest}
     '''
     page = 1
     articles_list = []
     while True:
         url = BASE_URL + '/notebooks/' + self.notebook_id + '/' + order_by + '?page=' + str(
             page)
         page += 1
         content = get_content(url)
         page_arts = get_article(content)
         if len(page_arts) == 0 or len(articles_list) > max_get:
             logger.info('Fetched %d articles in total' % len(articles_list))
             return articles_list
         articles_list.extend(page_arts)
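Examples 6-8 differ only in the URL they build; a generic helper capturing their shared pagination loop might look like the sketch below (the paginate helper is hypothetical; get_content and get_article are assumed to behave as in the snippets above):

def paginate(build_url, max_get, cookie=None):
    # Collect paginated article dicts until an empty page is returned or
    # max_get is exceeded. build_url(page) yields the URL for a page number;
    # get_content/get_article are the helpers used throughout these examples.
    collected = []
    page = 1
    while True:
        url = build_url(page)
        content = get_content(url, cookie) if cookie else get_content(url)
        page_arts = get_article(content)
        if len(page_arts) == 0 or len(collected) > max_get:
            return collected
        collected.extend(page_arts)
        page += 1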
Example 9
    def get_subscription_notes(self, max_get=100):
        '''Fetch the latest max_get articles from followed collections; requires the cookie to be set.
        '''
        article_list = []
        load_more_url = '/subscription_notes'
        while True:
            url = BASE_URL + load_more_url
            content = get_content(url, COOKIE)
            if content == 'FAIL':
                logger.warning(u'Failed')
                return
            # logger.info(content)
            page_arts = get_article(content)

            soup = BeautifulSoup(content, 'lxml')
            load_more_url = soup.find('button',
                                      attrs={'class':
                                             'ladda-button'})['data-url']
            logger.info(load_more_url)
            if len(page_arts) == 0 or len(article_list) >= max_get or (
                    not load_more_url):
                logger.info(u'Fetched %d articles in total' % len(article_list))
                return article_list
            article_list.extend(page_arts)
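Here the next-page URL comes from the data-url attribute of the "load more" button rather than a page counter; a standalone sketch of that extraction on a made-up HTML fragment:

from bs4 import BeautifulSoup

html = '<button class="ladda-button" data-url="/subscription_notes?page=2">more</button>'
soup = BeautifulSoup(html, 'lxml')
load_more_url = soup.find('button', attrs={'class': 'ladda-button'})['data-url']
print(load_more_url)  # -> /subscription_notes?page=2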
Example 10
binary_model = binary_model.to(device1)
span_model = span_model.to(device2)

binary_model.eval()
span_model.eval()

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

pred_rows = []
for i, v in dev.iterrows():
    line = v["text"]
    toks = tokenizer.tokenize(line)
    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(toks)])
    input_ids2 = torch.tensor([tokenizer.convert_tokens_to_ids(toks)])

    article = get_article(v["article_path"]).lower()

    with torch.no_grad():
        input_ids = input_ids.to(device1)
        binary_pred = binary_model(input_ids)[0]

        if torch.argmax(binary_pred) == 1:
            input_ids = input_ids.to(device2)
            preds, log = span_model(input_ids)
        else:
            continue
    preds = preds[0].cpu().tolist()
    spans = extract_pred_spans(preds)

    if len(spans) == 0:
        continue
from transformers import BertTokenizer

# Generate dev dataset for prediction
dev_ids = []
with open("data/dev-task-TC-template.out") as f:
    for line in f.readlines():
        l = line.split("\t")
        dev_ids.append(l[0].strip())
        
dev_ids = list(set(dev_ids))
dev_articles_map = {}

dev_dat = []
for i in dev_ids:
    file = get_dev_article_file(i)
    article_text = get_article(file)
    for line in article_text.split("\n"):
        if len(line) < 4:
            continue
        else:
            dev_dat.append({"article_id": i, "text": line, "article_path": file})
    dev_articles_map[i] = article_text

dev_df = pd.DataFrame(dev_dat)
dev_df.to_csv("data/task1_dev.csv")

# # 1. Generate train labels
with open("data/train-task1-SI.labels", "r") as f:
    train_lines = f.readlines()

from utils import get_article, get_span, get_span_text, get_article_file, \
    get_task1_file, get_gold_spans, get_dev_article_file
Example 12
#%%
from utils import get_article
from os import listdir
from os.path import isfile, join
import re 
import pandas as pd 

mypath = "data/test-articles/"
test_articles = [mypath+f for f in listdir(mypath) if isfile(join(mypath, f))]

id_extractor = re.compile(r"\d+")

row = []
for path in test_articles:
    article_id = id_extractor.findall(path)[0]
    text = get_article(path)
    
    lines = text.split("\n")
    
    start_idx = 0
    for line in lines:
        end_idx = len(line) + start_idx + 2
        if len(line) > 1:
            row.append({"article_id": article_id,
                        "text": "[SKIP]" if len(line) < 1 else line.strip(),
                        "article_path": path,
                        "start_idx": start_idx,
                        "end_idx": end_idx})
            start_idx = end_idx + 1

df = pd.DataFrame(row)
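
# Sanity-check sketch for the offsets computed above: compare a recorded line
# with the slice of the raw article it is supposed to cover; the offsets are
# approximate because of the "+ 2" newline padding used in the loop.
sample = df.iloc[0]
raw_article = get_article(sample["article_path"])
print(repr(sample["text"]))
print(repr(raw_article[sample["start_idx"]:sample["end_idx"]]))
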
t = pd.DataFrame({"file":train_files})
t["src"] = "train"

d = pd.DataFrame({"file":dev_files})
d["src"] = "dev"
tt = pd.DataFrame({"file":test_files})
tt["src"] = "test"


df = pd.concat([t,d])

# %%
from sklearn.model_selection import train_test_split

random_state = 1024
train, val = train_test_split(df, test_size=.15, random_state=random_state,
                              stratify = df["src"])

# %%
from utils import get_article

with open("data/lm_train_v2.txt", "w") as f:
    for x in train["file"]:
        f.write(get_article(x))


with open("data/lm_val.txt_v2", "w") as f:
    for x in val["file"]:
        f.write(get_article(x))

# %%