def dump_author_features_to_file():
    """Generate author features from raw publication data and dump to a file.

    Author features are defined by his/her paper attributes, excluding the
    author's own name. Each output line is ``<pid>-<author_idx>\\t<feature> <feature> ...``.

    Reads ``pubs_raw.json`` from ``settings.GLOBAL_DATA_DIR`` and writes
    ``author_features.txt`` to the same directory.
    """
    # Fix: start_time was referenced but never defined in this function
    # (NameError unless a module global happened to exist); define it locally,
    # matching the pattern used by pubs2txt elsewhere in this file.
    start_time = datetime.now()
    pubs_dict = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'pubs_raw.json')
    print('n_papers', len(pubs_dict))
    # Context manager guarantees the output file is closed even if
    # feature extraction raises mid-loop (original leaked the handle).
    with codecs.open(join(settings.GLOBAL_DATA_DIR, 'author_features.txt'), 'w',
                     encoding='utf-8') as wf:
        for i, pid in enumerate(pubs_dict):
            if i % 1000 == 0:
                print(i, datetime.now() - start_time)  # progress heartbeat
            paper = pubs_dict[pid]
            # Skip records missing the fields feature extraction needs.
            if "title" not in paper or "authors" not in paper:
                continue
            if len(paper["authors"]) > 30:
                # Log unusually large author lists for inspection.
                print(i, pid, len(paper["authors"]))
            if len(paper["authors"]) > 100:
                # Drop hyper-authored papers entirely.
                continue
            n_authors = len(paper.get('authors', []))
            for j in range(n_authors):
                author_feature = feature_utils.extract_author_features(paper, j)
                aid = '{}-{}'.format(pid, j)
                wf.write(aid + '\t' + ' '.join(author_feature) + '\n')
def dump_author_features_to_file():
    """Generate author features from raw publication data and dump to a file.

    Author features are defined by his/her paper attributes, excluding the
    author's own name. Only authors that carry an ``'id'`` field are
    extracted. Each output line is ``<aid>\\t<features>`` where
    ``aid = <pid>-<author_idx>``.
    """
    # Fix: start_time was referenced but never defined in this function
    # (NameError unless a module global happened to exist); define it locally,
    # matching the pattern used by pubs2txt elsewhere in this file.
    start_time = datetime.now()
    # Raw input: pubs_raw.json
    pubs_dict = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'pubs_raw.json')
    print('n_papers', len(pubs_dict))  # number of papers
    # Context manager guarantees author_features.txt is closed even if
    # feature extraction raises mid-loop (original leaked the handle).
    with codecs.open(join(settings.GLOBAL_DATA_DIR, 'author_features.txt'), 'w',
                     encoding='utf-8') as wf:
        for i, pid in enumerate(pubs_dict):  # i = index, pid = paper id
            if i % 1000 == 0:
                print(i, datetime.now() - start_time)  # progress heartbeat
            paper = pubs_dict[pid]
            # Skip records missing the fields feature extraction needs.
            if "title" not in paper or "authors" not in paper:
                continue
            if len(paper["authors"]) > 30:
                # Log unusually large author lists for inspection.
                print(i, pid, len(paper["authors"]))
            if len(paper["authors"]) > 100:
                # Drop hyper-authored papers entirely.
                continue
            # dict.get(key, default) returns default when the key is absent.
            n_authors = len(paper.get('authors', []))
            for j in range(n_authors):
                # Only keep authors that have an explicit id.
                if 'id' not in paper['authors'][j]:
                    continue
                # Extract features of author j in this paper
                # (tokens like __$f_name$_$word$).
                author_feature = feature_utils.extract_author_features(paper, j)
                aid = '{}-{}'.format(pid, j)  # aid: pid-j
                wf.write(aid + '\t' + ' '.join(author_feature) + '\n')
def pubs2txt(rfpath, wfpath):
    """Flatten the publications JSON at *rfpath* into a text file at *wfpath*.

    Every paper across all values of the loaded dict is written as one line:
    ``<paper id>\\t<extracted author features>``. Progress is printed every
    1000 papers together with the elapsed time.
    """
    start_time = datetime.now()
    pubs = load_json(rfpath)
    all_papers = chain.from_iterable(pubs.values())
    with open(wfpath, 'w', encoding='utf-8') as wf:
        for num, paper in enumerate(all_papers, start=1):
            if num % 1000 == 0:
                print('json2txt %d ' % num, datetime.now() - start_time)
            features = extract_author_features(paper)
            wf.write(paper['id'] + '\t' + features + '\n')
def get_pub_feature(i):
    """Extract features for every author of the i-th paper in ``_pubs_dict``.

    Returns a list of ``(aid, author_features, word_features)`` tuples where
    ``aid = <pid>-<author_idx>``, or ``None`` when the paper lacks the
    required fields or has more than 300 authors.
    """
    if i % 1000 == 0:
        print("The %dth paper" % i)  # progress heartbeat
    # NOTE(review): list(_pubs_dict) rebuilds the full key list on every call
    # (O(n) per paper) — consider hoisting to the caller if this is hot.
    pid = list(_pubs_dict)[i]
    paper = _pubs_dict[pid]
    # Guard clauses: skip unusable records.
    if "title" not in paper or "authors" not in paper:
        return None
    author_count = len(paper["authors"])
    if author_count > 300:
        return None
    if author_count > 30:
        # Log unusually large author lists for inspection.
        print(i, pid, author_count)
    n_authors = len(paper.get('authors', []))
    authors = []
    for idx in range(n_authors):
        a_feats, w_feats = feature_utils.extract_author_features(paper, idx)
        authors.append(('{}-{}'.format(pid, idx), a_feats, w_feats))
    return authors
def dump_paper_feature_to_file():
    """Dump per-paper features (features of author 0 only) to a file.

    Reads ``pubs_raw.json`` from ``settings.GLOBAL_DATA_DIR`` and writes
    ``paper_features.txt`` to ``settings.ISLAND_LOSS_DIR``; each line is
    ``<pid>\\t<feature> <feature> ...``.
    """
    # Fix: start_time was referenced but never defined in this function
    # (NameError unless a module global happened to exist); define it locally,
    # matching the pattern used by pubs2txt elsewhere in this file.
    start_time = datetime.now()
    pubs_dict = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'pubs_raw.json')
    # Context manager guarantees the output file is closed even if
    # feature extraction raises mid-loop (original leaked the handle).
    with codecs.open(join(settings.ISLAND_LOSS_DIR, 'paper_features.txt'), 'w',
                     encoding='utf-8') as wf:
        for i, pid in enumerate(pubs_dict):
            if i % 1000 == 0:
                print(i, datetime.now() - start_time)  # progress heartbeat
            paper = pubs_dict[pid]
            # Skip records missing the fields feature extraction needs.
            if "title" not in paper or "authors" not in paper:
                continue
            if len(paper["authors"]) > 30:
                # Log unusually large author lists for inspection.
                print(i, pid, len(paper["authors"]))
            if len(paper["authors"]) > 100:
                # Drop hyper-authored papers entirely.
                continue
            # Only the first author's (index 0) features represent the paper.
            author_feature = feature_utils.extract_author_features(paper, 0)
            pid = '{}'.format(pid)
            wf.write(pid + '\t' + ' '.join(author_feature) + '\n')