def Journal_Conference(PaperAuthor_PATH, Paper_PATH, Journal_PATH, Conference_PATH, to_file):
    """Map each AuthorId to the (JournalId, ConferenceId) pairs of his papers.

    BUG FIX (review): the original re-created the accumulator dict inside the
    row loop (losing all previous authors) and ended in untranslated
    pseudo-code ("use paperId to find the Journal/Conference id") instead of
    actual lookup code.  This version implements that stated intent.

    :param PaperAuthor_PATH: csv with PaperId/AuthorId rows
    :param Paper_PATH: csv with Id/JournalId/ConferenceId rows
    :param Journal_PATH: unused, kept for signature compatibility
    :param Conference_PATH: unused, kept for signature compatibility
    :param to_file: output JSON path
    """
    # paperId -> (JournalId, ConferenceId); built once, O(1) lookups below
    dict_paper_to_JC = {}
    for item in util.read_dict_from_csv(Paper_PATH):
        dict_paper_to_JC[item["Id"]] = (item["JournalId"], item["ConferenceId"])

    dict_authors_Journal_Conference = {}
    for item in data_rows(PaperAuthor_PATH) if False else util.read_dict_from_csv(PaperAuthor_PATH):
        PaperId = item["PaperId"]
        AuthorId = item["AuthorId"]
        if AuthorId not in dict_authors_Journal_Conference:
            dict_authors_Journal_Conference[AuthorId] = []
        # skip papers absent from Paper_PATH rather than raising KeyError
        if PaperId in dict_paper_to_JC:
            dict_authors_Journal_Conference[AuthorId].append(dict_paper_to_JC[PaperId])

    with open(to_file, "w") as f:
        json.dump(dict_authors_Journal_Conference, f, encoding="utf-8")
def load_paperIdAuthorId_to_name_and_affiliation(PaperAuthor_PATH, to_file):
    """Collect, per (PaperId, AuthorId) pair, every non-empty name and
    affiliation variant seen in the PaperAuthor csv, then dump a JSON map
    "PaperId|AuthorId" -> {"name": "...", "affiliation": "..."} where the
    variants are joined with "##".

    FIX (review): the original passed a bare open() handle to json.dump,
    leaking the file descriptor; use a with-block instead.
    """
    d = {}
    for item in util.read_dict_from_csv(PaperAuthor_PATH):
        key = "%s|%s" % (item["PaperId"], item["AuthorId"])
        if key not in d:
            d[key] = {"Name": [], "Affiliation": []}
        # keep only non-empty variants so the "##" join stays clean
        if item["Name"] != "":
            d[key]["Name"].append(item["Name"])
        if item["Affiliation"] != "":
            d[key]["Affiliation"].append(item["Affiliation"])

    t = {}
    for key in d:
        t[key] = {
            "name": "##".join(d[key]["Name"]),
            "affiliation": "##".join(d[key]["Affiliation"]),
        }

    with open(to_file, "w") as f:
        json.dump(t, f, encoding="utf-8")
def get_dict_auther_conference_journal(paper_path, conference_path, journal_path, paper_author_path, to_file):
    """Count, per author, how often each ConferenceId / JournalId occurs
    among his papers and dump {authorId: {"conferenceId": Counter,
    "journalId": Counter}} as JSON.

    FIXES (review): both json.load and json.dump received bare open()
    handles (fd leak) -> with-blocks; misspelled local `confereceId`
    renamed; `print "dump..."` unified to the parenthesized form already
    used by `print("start...")` (identical output on Python 2).

    :param conference_path: unused (kept for signature compatibility)
    :param journal_path: unused (kept for signature compatibility)
    """
    data_paper = util.read_dict_from_csv(paper_path)
    with open(paper_author_path) as f:
        dict_paper_author = json.load(f, encoding="utf-8")
    print("start...")
    bar = pyprind.ProgPercent(len(data_paper))
    dict_auther_conference_journal = {}
    for item in data_paper:
        paperId = int(item["Id"])
        journalId = item["JournalId"]
        conferenceId = item["ConferenceId"]
        authorIds = list(map(int, (dict_paper_author[str(paperId)])))
        for authorId in authorIds:
            if authorId not in dict_auther_conference_journal:
                dict_auther_conference_journal[authorId] = {
                    "conferenceId": Counter(),
                    "journalId": Counter()
                }
            dict_auther_conference_journal[authorId]["conferenceId"][conferenceId] += 1
            dict_auther_conference_journal[authorId]["journalId"][journalId] += 1
        bar.update()
    print("dump...")
    with open(to_file, "w") as f:
        json.dump(dict_auther_conference_journal, f, encoding="utf-8")
def load_test_data(test_path):
    """Build the list of AuthorIdPaperId samples to predict from the test csv.

    Each row expands into one sample per space-separated paper id; the label
    is set to -1 as a "not yet predicted" placeholder.
    """
    samples = []
    for row in util.read_dict_from_csv(test_path):
        author_id = row["AuthorId"]
        for paper_id in row["PaperIds"].split(" "):
            sample = AuthorIdPaperId(author_id, paper_id)
            sample.label = -1  # placeholder until the model fills it in
            samples.append(sample)
    return samples
def prepare_author_keywords(aid_pid_path, paper_path): aid_pid = util.read_dict_from_csv(aid_pid_path) paper = util.read_dict_from_csv(paper_path) print 'finish loading csv file' max_aid, max_pid = 0, 0 for item in aid_pid: max_aid = max(max_aid, int(item["AuthorId"])) max_pid = max(max_pid, int(item['PaperId'])) for item in paper: max_pid = max(max_pid, int(item["Id"])) print 'max_aid', max_aid, 'max_pid', max_pid paper_indexd_list = range(max_pid + 1) for item in paper: paper_indexd_list[int(item["Id"])] = item del paper keywords = [[]] * (max_aid + 1) bar = pyprind.ProgPercent(len(aid_pid)) for item in aid_pid: bar.update() aid = int(item['AuthorId']) pid = int(item['PaperId']) if pid != paper_indexd_list[pid]: kw = paper_indexd_list[pid]["Keyword"] if kw: keywords[aid].append(kw) author_keywords = [] for index, item in enumerate(keywords): dic = {} if item: dic["AuthorId"] = str(index) dic["Keywords"] = " ".join(item) author_keywords.append(dic) del keywords write_dict_to_csv(['AuthorId', 'Keywords'], author_keywords, config.AUTHOR_KEYWORDS_FILE) print 'finish writing author_keywords csv in', config.AUTHOR_KEYWORES_FILE
def Evalution(gold_file_path, pred_file_path):
    """Compare predicted labels against gold labels and return the
    resulting ConfusionMatrix.

    Both files share the same layout: per AuthorId, space-separated
    ConfirmedPaperIds (label "1") and DeletedPaperIds (label "0").
    """
    def _read_labels(path):
        # (AuthorId, paperId) -> "1" confirmed / "0" deleted
        labels = {}
        for row in util.read_dict_from_csv(path):
            author_id = row["AuthorId"]
            for paper_id in row["ConfirmedPaperIds"].split(" "):
                labels[(author_id, paper_id)] = "1"
            for paper_id in row["DeletedPaperIds"].split(" "):
                labels[(author_id, paper_id)] = "0"
        return labels

    gold_labels = _read_labels(gold_file_path)
    pred_labels = _read_labels(pred_file_path)

    alphabet = Alphabet()
    alphabet.add("0")
    alphabet.add("1")
    cm = ConfusionMatrix(alphabet)
    for pair in gold_labels:
        cm.add(pred_labels[pair], gold_labels[pair])
    return cm
def get_top_k_coauthors(paper_author_path, k, to_file):
    """Dump a JSON map paperId -> [authorIds] built from the PaperAuthor csv.

    NOTE(review): despite its name this version never uses `k` and only
    dumps the paper->authors map; it is shadowed by the later definition of
    the same name in this file.  `k` is kept for signature compatibility.

    FIXES (review): fd leak on dump (bare open() handle) -> with-block;
    py2 `print "dump..."` -> parenthesized form (same output on Python 2).
    """
    data = util.read_dict_from_csv(paper_author_path)
    dict_paperId_to_authors = {}
    bar = pyprind.ProgPercent(len(data))
    for item in data:
        paperId = int(item["PaperId"])
        authorId = int(item["AuthorId"])
        if paperId not in dict_paperId_to_authors:
            dict_paperId_to_authors[paperId] = []
        dict_paperId_to_authors[paperId].append(authorId)
        bar.update()
    print("dump...")
    with open(to_file, "w") as f:
        json.dump(dict_paperId_to_authors, f, encoding="utf-8")
def load_train_data(train_path):
    """Build labelled AuthorIdPaperId training samples from the train csv.

    ConfirmedPaperIds become positive samples (label 1), DeletedPaperIds
    become negative samples (label 0); ids are space-separated per row.
    """
    samples = []
    for row in util.read_dict_from_csv(train_path):
        author_id = row["AuthorId"]
        for field, label in (("ConfirmedPaperIds", 1), ("DeletedPaperIds", 0)):
            for paper_id in row[field].split(" "):
                sample = AuthorIdPaperId(author_id, paper_id)
                sample.label = label
                samples.append(sample)
    return samples
def get_dict_auther_keywords(paper_path, paper_author_path, k, to_file):
    """Collect, per author, the tokens of the titles+keywords of all his
    papers and dump {authorId: [token, ...]} as JSON.

    NOTE(review): `k` is never used; kept for signature compatibility.
    FIXES (review): fd leaks on json.load/json.dump (bare open() handles)
    -> with-blocks; py2 `print "dump..."` unified with the parenthesized
    form already used for "start...".
    """
    data_paper = util.read_dict_from_csv(paper_path)
    with open(paper_author_path) as f:
        dict_paper_author = json.load(f, encoding="utf-8")
    dict_auther_keywords = {}
    print("start...")
    bar = pyprind.ProgPercent(len(data_paper))
    for item in data_paper:
        paperId = int(item["Id"])
        title = item["Title"]
        keywords = item["Keyword"]
        # tokenize title + keyword text with the shared project splitter
        key = util.get_string_splited(title + " " + keywords)
        for authorId in dict_paper_author[str(paperId)]:
            if authorId not in dict_auther_keywords:
                dict_auther_keywords[authorId] = []
            dict_auther_keywords[authorId].extend(key)
        bar.update()
    print("dump...")
    with open(to_file, "w") as f:
        json.dump(dict_auther_keywords, f, encoding="utf-8")
def get_top_k_coauthors(paper_author_path, k, to_file):
    """For every author, count how often each co-author shares a paper with
    him and dump the top-k co-authors per author as JSON:
    {authorId: {coauthorId: freq, ...}}.

    FIX (review): json.dump received a bare open() handle (fd leak); use a
    with-block.
    """
    data = util.read_dict_from_csv(paper_author_path)
    # paperId -> list of authorIds on that paper
    dict_paperId_to_authors = {}
    for item in data:
        paperId = int(item["PaperId"])
        authorId = int(item["AuthorId"])
        if paperId not in dict_paperId_to_authors:
            dict_paperId_to_authors[paperId] = []
        dict_paperId_to_authors[paperId].append(authorId)

    # symmetric co-occurrence counts over every author pair per paper
    dict_author_to_coauthor = {}
    for paperId in dict_paperId_to_authors:
        authors = dict_paperId_to_authors[paperId]
        n = len(authors)
        for i in range(n):
            for j in range(i + 1, n):
                if authors[i] not in dict_author_to_coauthor:
                    dict_author_to_coauthor[authors[i]] = Counter()
                if authors[j] not in dict_author_to_coauthor:
                    dict_author_to_coauthor[authors[j]] = Counter()
                dict_author_to_coauthor[authors[i]][authors[j]] += 1
                dict_author_to_coauthor[authors[j]][authors[i]] += 1

    print("取 top k...")
    # keep only the k most frequent co-authors per author
    res = {}
    for authorId in dict_author_to_coauthor:
        res[authorId] = {}
        for coauthorId, freq in dict_author_to_coauthor[authorId].most_common(k):
            res[authorId][coauthorId] = freq
    print("dump...")
    with open(to_file, "w") as f:
        json.dump(res, f, encoding="utf-8")
def get_top_k_key(Authorkeywords_PATH, k, to_file):
    """Count, for each keyword, how often it co-occurs with other keywords
    of the same author (columns keyword1..keyword10) and dump the top-k
    co-occurring keywords per keyword as JSON: {key: {otherKey: freq}}.

    BUG FIXES (review):
      * the original appended every keyword into the undefined name
        `dict_authors_to_JC` while having defined `dict_authors_to_key`
        -> NameError on the very first row; the defined dict is now used.
      * keyword1/keyword2 lacked the `is not None` guard that keyword3..10
        had; all ten columns are now handled uniformly in one loop instead
        of ten copy-pasted blocks.
      * stray `print res` debug dump of the whole result removed.
      * fd leak on dump (bare open() handle) -> with-block; py2 prints ->
        parenthesized single-argument form (same output on Python 2).
    """
    data_authors = util.read_dict_from_csv(Authorkeywords_PATH)
    dict_authors_to_key = {}
    for item in data_authors:
        AuthorId = item["AuthorId"]
        if AuthorId not in dict_authors_to_key:
            dict_authors_to_key[AuthorId] = []
        # columns keyword1..keyword10; missing/None values are skipped
        for i in range(1, 11):
            key = item.get("keyword%d" % i)
            if key is not None:
                dict_authors_to_key[AuthorId].append(key)

    # symmetric co-occurrence counts between keywords sharing an author
    dict_key_cooccur = {}
    for AuthorId in dict_authors_to_key:
        keys = dict_authors_to_key[AuthorId]
        n = len(keys)
        for i in range(n):
            for j in range(i + 1, n):
                if keys[i] not in dict_key_cooccur:
                    dict_key_cooccur[keys[i]] = Counter()
                if keys[j] not in dict_key_cooccur:
                    dict_key_cooccur[keys[j]] = Counter()
                dict_key_cooccur[keys[i]][keys[j]] += 1
                dict_key_cooccur[keys[j]][keys[i]] += 1

    print("get top k...")
    res = {}
    for keyId in dict_key_cooccur:
        res[keyId] = {}
        for otherKey, freq in dict_key_cooccur[keyId].most_common(k):
            res[keyId][otherKey] = freq
    print("dump...")
    with open(to_file, "w") as f:
        json.dump(res, f, encoding="utf-8")