Beispiel #1
0
def dump_author_features_to_file():
    """
    Generate author features from raw publication data and dump them to a file.

    Author features are defined by his/her paper attributes excluding the
    author's name.

    Reads ``pubs_raw.json`` from ``settings.GLOBAL_DATA_DIR`` and writes
    ``author_features.txt`` into the same directory. One line is written per
    (paper, author position) pair: the id ``<pid>-<j>``, a tab, then the items
    returned by ``feature_utils.extract_author_features(paper, j)`` joined by
    single spaces.

    Papers without a "title" or "authors" key, and papers with more than 100
    authors, are skipped. Papers with more than 30 authors are reported on
    stdout.

    Progress timing relies on ``start_time``, which is not defined in this
    function and is expected to exist at module level.
    """
    pubs_dict = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'pubs_raw.json')
    print('n_papers', len(pubs_dict))
    out_path = join(settings.GLOBAL_DATA_DIR, 'author_features.txt')
    # The context manager closes (and flushes) the file even when feature
    # extraction raises part-way through the dump.
    with codecs.open(out_path, 'w', encoding='utf-8') as wf:
        for i, pid in enumerate(pubs_dict):
            if i % 1000 == 0:
                print(i, datetime.now() - start_time)
            paper = pubs_dict[pid]
            if "title" not in paper or "authors" not in paper:
                continue
            n_authors = len(paper["authors"])
            if n_authors > 30:
                # Unusually long author list: log it for inspection.
                print(i, pid, n_authors)
            if n_authors > 100:
                continue
            for j in range(n_authors):
                author_feature = feature_utils.extract_author_features(paper, j)
                # Author id is the paper id plus the author's position.
                aid = '{}-{}'.format(pid, j)
                wf.write(aid + '\t' + ' '.join(author_feature) + '\n')
def dump_author_features_to_file():  # extract author features into a file
    """
    Generate author features from raw publication data and dump them to a file.

    Author features are defined by his/her paper attributes excluding the
    author's name.

    Reads ``pubs_raw.json`` from ``settings.GLOBAL_DATA_DIR`` and writes
    ``author_features.txt`` into the same directory. One line is written per
    (paper, author position) pair: the id ``<pid>-<j>``, a tab, then the items
    returned by ``feature_utils.extract_author_features(paper, j)`` joined by
    single spaces.

    Papers without a "title" or "authors" key, and papers with more than 100
    authors, are skipped. Papers with more than 30 authors are reported on
    stdout. Author entries that have no ``'id'`` key are skipped as well.

    Progress timing relies on ``start_time``, which is not defined in this
    function and is expected to exist at module level.
    """
    # Raw data: pubs_raw.json
    pubs_dict = data_utils.load_json(settings.GLOBAL_DATA_DIR,
                                     'pubs_raw.json')
    print('n_papers', len(pubs_dict))  # number of papers
    # Features are written to author_features.txt
    out_path = join(settings.GLOBAL_DATA_DIR, 'author_features.txt')
    # The context manager closes (and flushes) the file even when feature
    # extraction raises part-way through the dump.
    with codecs.open(out_path, 'w', encoding='utf-8') as wf:
        for i, pid in enumerate(pubs_dict):  # i = running index, pid = key
            if i % 1000 == 0:
                print(i, datetime.now() - start_time)
            paper = pubs_dict[pid]  # the record stored for this paper
            if "title" not in paper or "authors" not in paper:
                continue
            authors = paper["authors"]
            n_authors = len(authors)  # number of co-authors on this paper
            if n_authors > 30:
                print(i, pid, n_authors)
            if n_authors > 100:
                continue
            for j in range(n_authors):  # visit every author position
                if 'id' not in authors[j]:
                    continue
                # Extract the features of author j of this paper.
                # NOTE(review): the original comment gives the token format
                # as __$f_name$_$word$ -- confirm against feature_utils.
                author_feature = feature_utils.extract_author_features(
                    paper, j)
                aid = '{}-{}'.format(pid, j)  # aid: pid-j
                # Line layout: aid, tab, space-joined features, newline.
                wf.write(aid + '\t' + ' '.join(author_feature) + '\n')
Beispiel #3
0
def pubs2txt(rfpath, wfpath):
    """
    Convert the publications loaded from ``rfpath`` into a text file.

    ``load_json(rfpath)`` must return a mapping whose values are iterables of
    paper dicts; every paper needs an ``'id'`` key. For each paper one line is
    written to ``wfpath`` (UTF-8): the paper id, a tab, and the value returned
    by ``extract_author_features(paper)``.

    A progress line with the elapsed time is printed after every 1000 papers.
    """
    start_time = datetime.now()
    pubs = load_json(rfpath)
    with open(wfpath, 'w', encoding='utf-8') as wf:
        every_paper = chain.from_iterable(pubs.values())
        # Count from 1 so the progress message reports papers handled so far.
        for n_seen, paper in enumerate(every_paper, start=1):
            if n_seen % 1000 == 0:
                elapsed = datetime.now() - start_time
                print('json2txt %d  ' % n_seen, elapsed)
            # NOTE: no cap on the number of authors is applied here; every
            # paper is written out.
            pid = paper['id']
            line = extract_author_features(paper)
            record = pid + '\t' + line + '\n'
            wf.write(record)
def get_pub_feature(i):
    """
    Build the per-author feature records for the i-th paper of ``_pubs_dict``.

    ``_pubs_dict`` is a module-level mapping of paper id to paper record; the
    paper is selected by its position ``i`` in the mapping's iteration order.

    Returns ``None`` when the paper lacks a "title" or "authors" key or has
    more than 300 authors. Otherwise returns a list with one
    ``(aid, author_features, word_features)`` tuple per author position, where
    ``aid`` is ``'<pid>-<j>'`` and the two feature values come from
    ``feature_utils.extract_author_features(paper, j)``.
    """
    if not i % 1000:
        print("The %dth paper"%i)
    # The key list is rebuilt from the dict on every call.
    paper_id = list(_pubs_dict)[i]
    record = _pubs_dict[paper_id]
    if not ("title" in record and "authors" in record):
        return None
    author_count = len(record["authors"])
    if author_count > 300:
        return None
    if author_count > 30:
        # Unusually long author list: log it for inspection.
        print(i, paper_id, author_count)
    collected = []
    for pos in range(author_count):
        author_part, word_part = feature_utils.extract_author_features(record, pos)
        collected.append(('{}-{}'.format(paper_id, pos), author_part, word_part))
    return collected
Beispiel #5
0
def dump_paper_feature_to_file():
    """
    Dump one feature line per paper to ``paper_features.txt``.

    Reads ``pubs_raw.json`` from ``settings.GLOBAL_DATA_DIR`` and writes
    ``paper_features.txt`` into ``settings.ISLAND_LOSS_DIR``. Each line holds
    the paper id, a tab, then the items returned by
    ``feature_utils.extract_author_features(paper, 0)`` (i.e. computed for
    author position 0) joined by single spaces.

    Papers without a "title" or "authors" key, and papers with more than 100
    authors, are skipped. Papers with more than 30 authors are reported on
    stdout.

    Progress timing relies on ``start_time``, which is not defined in this
    function and is expected to exist at module level.
    """
    pubs_dict = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'pubs_raw.json')
    out_path = join(settings.ISLAND_LOSS_DIR, 'paper_features.txt')
    # The context manager closes (and flushes) the file even when feature
    # extraction raises part-way through the dump.
    with codecs.open(out_path, 'w', encoding='utf-8') as wf:
        for i, pid in enumerate(pubs_dict):
            if i % 1000 == 0:
                print(i, datetime.now() - start_time)
            paper = pubs_dict[pid]
            if "title" not in paper or "authors" not in paper:
                continue
            n_authors = len(paper["authors"])
            if n_authors > 30:
                # Unusually long author list: log it for inspection.
                print(i, pid, n_authors)
            if n_authors > 100:
                continue
            # Only author position 0 is used to derive the paper's features.
            author_feature = feature_utils.extract_author_features(paper, 0)
            # Render the key as text so it can be concatenated into the line.
            pid_text = '{}'.format(pid)
            wf.write(pid_text + '\t' + ' '.join(author_feature) + '\n')