Ejemplo n.º 1
0
def Journal_Conference(PaperAuthor_PATH, Paper_PATH, Journal_PATH, Conference_PATH, to_file):
    """Prepare a per-author container for journal/conference ids (unfinished).

    Only the scaffolding exists so far: every ``AuthorId`` found in the rows
    returned by ``util.read_dict_from_csv(PaperAuthor_PATH)`` is registered
    with an empty list.  The step that resolves a paper id to its journal or
    conference id was never written (the original held a prose note in its
    place), so nothing is returned and nothing is written to ``to_file``.

    Args:
        PaperAuthor_PATH: path handed to ``util.read_dict_from_csv``; each row
            must provide ``PaperId`` and ``AuthorId``.
        Paper_PATH: path handed to ``util.read_dict_from_csv``; loaded but not
            consumed yet.
        Journal_PATH: currently unused.
        Conference_PATH: currently unused.
        to_file: currently unused.
    """
    # Created once, before the loop; re-creating it for every row would drop
    # all authors registered by earlier rows.
    dict_authors_Journal_Conference = {}

    data = util.read_dict_from_csv(PaperAuthor_PATH)
    # Loaded once instead of once per PaperAuthor row.
    data1 = util.read_dict_from_csv(Paper_PATH)

    for item in data:
        PaperId = item["PaperId"]
        AuthorId = item["AuthorId"]
        if AuthorId not in dict_authors_Journal_Conference:
            dict_authors_Journal_Conference[AuthorId] = []

        # TODO: use PaperId to find the matching journal/conference id in
        # data1 and append it to dict_authors_Journal_Conference[AuthorId].
Ejemplo n.º 2
0
def load_paperIdAuthorId_to_name_and_affiliation(PaperAuthor_PATH, to_file):
    """Merge the names and affiliations recorded for each (paper, author) pair.

    Rows returned by ``util.read_dict_from_csv(PaperAuthor_PATH)`` are grouped
    under the key ``"<PaperId>|<AuthorId>"``.  The JSON object written to
    ``to_file`` maps each key to ``{"name": ..., "affiliation": ...}``, where
    the values collected for that key are joined with ``"##"`` in row order.
    Empty strings are not collected.

    Args:
        PaperAuthor_PATH: path handed to ``util.read_dict_from_csv``; each row
            must provide ``PaperId``, ``AuthorId``, ``Name`` and
            ``Affiliation``.
        to_file: path of the JSON file to write (overwritten).
    """
    grouped = {}
    for item in util.read_dict_from_csv(PaperAuthor_PATH):
        PaperId = item["PaperId"]
        AuthorId = item["AuthorId"]
        Name = item["Name"]
        Affiliation = item["Affiliation"]

        key = "%s|%s" % (PaperId, AuthorId)
        entry = grouped.setdefault(key, {"Name": [], "Affiliation": []})
        if Name != "":
            entry["Name"].append(Name)
        if Affiliation != "":
            entry["Affiliation"].append(Affiliation)

    merged = {}
    for key, entry in grouped.items():
        merged[key] = {
            "name": "##".join(entry["Name"]),
            "affiliation": "##".join(entry["Affiliation"]),
        }

    # The ``encoding`` keyword is dropped: "utf-8" is already the default on
    # Python 2 and Python 3 rejects the argument.  ``with`` closes the file
    # even if serialisation fails.
    with open(to_file, "w") as out_file:
        json.dump(merged, out_file)
Ejemplo n.º 3
0
def get_dict_auther_conference_journal(paper_path, conference_path,
                                       journal_path, paper_author_path,
                                       to_file):
    """Count, per author, the conference and journal ids of their papers.

    For every row returned by ``util.read_dict_from_csv(paper_path)`` the
    authors of that paper are looked up in the JSON mapping stored at
    ``paper_author_path`` (keyed by the paper id as a string).  Each author
    gets two ``Counter`` objects, ``"conferenceId"`` and ``"journalId"``, and
    the row's ``ConferenceId`` / ``JournalId`` values are counted once per
    paper.  The resulting mapping is written to ``to_file`` as JSON.

    Args:
        paper_path: path handed to ``util.read_dict_from_csv``; each row must
            provide ``Id``, ``JournalId`` and ``ConferenceId``.
        conference_path: currently unused.
        journal_path: currently unused.
        paper_author_path: JSON file mapping paper id -> list of author ids.
        to_file: path of the JSON file to write (overwritten).

    Raises:
        KeyError: if a paper id from ``paper_path`` is missing from the
            paper -> authors mapping.
    """
    data_paper = util.read_dict_from_csv(paper_path)
    # ``encoding`` is not passed to json.load/json.dump: it is a Python 2-only
    # keyword whose default is already "utf-8".
    with open(paper_author_path) as paper_author_file:
        dict_paper_author = json.load(paper_author_file)

    print("start...")
    bar = pyprind.ProgPercent(len(data_paper))
    dict_auther_conference_journal = {}
    for item in data_paper:
        paperId = int(item["Id"])
        journalId = item["JournalId"]
        conferenceId = item["ConferenceId"]
        # Convert every id before counting so a malformed id fails the whole
        # paper rather than leaving it half-counted.
        authorIds = [int(raw_id) for raw_id in dict_paper_author[str(paperId)]]
        for authorId in authorIds:
            counters = dict_auther_conference_journal.setdefault(
                authorId, {"conferenceId": Counter(), "journalId": Counter()})
            counters["conferenceId"][conferenceId] += 1
            counters["journalId"][journalId] += 1

        bar.update()

    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("dump...")
    with open(to_file, "w") as out_file:
        json.dump(dict_auther_conference_journal, out_file)
Ejemplo n.º 4
0
def load_test_data(test_path):
    """Build one unlabeled ``AuthorIdPaperId`` sample per (author, paper) pair.

    Args:
        test_path: path handed to ``util.read_dict_from_csv``; each row must
            provide ``AuthorId`` and a whitespace-separated ``PaperIds`` field.

    Returns:
        A list of ``AuthorIdPaperId`` objects, in file order, each with
        ``label`` set to ``-1``.
    """
    data = util.read_dict_from_csv(test_path)
    authorIdPaperIds = []
    for item in data:
        authorId = item["AuthorId"]
        # Build the test samples.  split() without an argument yields no
        # tokens for an empty field, whereas split(" ") would yield [""] and
        # create a sample with an empty paper id.
        for paperId in item["PaperIds"].split():
            authorIdPaperId = AuthorIdPaperId(authorId, paperId)
            authorIdPaperId.label = -1  # to be predicted; placeholder of -1
            authorIdPaperIds.append(authorIdPaperId)

    return authorIdPaperIds
Ejemplo n.º 5
0
def prepare_author_keywords(aid_pid_path, paper_path):
    """Collect, per author, the ``Keyword`` field of each of their papers.

    Every (AuthorId, PaperId) row whose paper exists in the paper table and
    has a non-empty ``Keyword`` contributes that keyword string to the
    author.  Authors with at least one keyword are written, in ascending
    author-id order, via ``write_dict_to_csv`` to
    ``config.AUTHOR_KEYWORDS_FILE`` with the columns ``AuthorId`` and
    ``Keywords`` (keyword strings joined by a single space).

    Args:
        aid_pid_path: path handed to ``util.read_dict_from_csv``; each row
            must provide ``AuthorId`` and ``PaperId``.
        paper_path: path handed to ``util.read_dict_from_csv``; each row must
            provide ``Id`` and ``Keyword``.
    """
    aid_pid = util.read_dict_from_csv(aid_pid_path)
    paper = util.read_dict_from_csv(paper_path)
    print('finish loading csv file')

    # The maxima are only reported as a diagnostic; they no longer size any
    # container.
    max_aid, max_pid = 0, 0
    for item in aid_pid:
        max_aid = max(max_aid, int(item["AuthorId"]))
        max_pid = max(max_pid, int(item['PaperId']))
    for item in paper:
        max_pid = max(max_pid, int(item["Id"]))
    print('max_aid %d max_pid %d' % (max_aid, max_pid))

    # Index papers by id.  A dict replaces the former ``range(max_pid + 1)``
    # list, which is immutable on Python 3.
    paper_by_id = {}
    for item in paper:
        paper_by_id[int(item["Id"])] = item
    del paper

    # One independent list per author.  The former ``[[]] * n`` made every
    # slot alias the same list, so each author received all keywords.
    keywords_by_aid = {}
    bar = pyprind.ProgPercent(len(aid_pid))
    for item in aid_pid:
        bar.update()
        aid = int(item['AuthorId'])
        pid = int(item['PaperId'])
        paper_row = paper_by_id.get(pid)
        if paper_row is None:
            # Paper id not present in the paper table.
            continue
        kw = paper_row["Keyword"]
        if kw:
            keywords_by_aid.setdefault(aid, []).append(kw)

    author_keywords = [
        {"AuthorId": str(aid), "Keywords": " ".join(keywords_by_aid[aid])}
        for aid in sorted(keywords_by_aid)
    ]

    del keywords_by_aid
    write_dict_to_csv(['AuthorId', 'Keywords'], author_keywords,
                      config.AUTHOR_KEYWORDS_FILE)
    # Uses the same config attribute as the write above; the original message
    # referenced the misspelled ``AUTHOR_KEYWORES_FILE``.
    print('finish writing author_keywords csv in %s'
          % (config.AUTHOR_KEYWORDS_FILE,))
Ejemplo n.º 6
0
def Evalution(gold_file_path, pred_file_path):
    """Compare predicted labels against gold labels in a confusion matrix.

    Both files are read with ``util.read_dict_from_csv`` and must provide
    ``AuthorId``, ``ConfirmedPaperIds`` and ``DeletedPaperIds`` per row.
    Papers listed under ``ConfirmedPaperIds`` are labelled ``"1"`` and those
    under ``DeletedPaperIds`` are labelled ``"0"``.

    Args:
        gold_file_path: path of the file with the reference labels.
        pred_file_path: path of the file with the predicted labels.

    Returns:
        A ``ConfusionMatrix`` over the alphabet {"0", "1"} with one
        ``add(pred, gold)`` call per (AuthorId, paperId) pair in the gold file.

    Raises:
        KeyError: if a gold (AuthorId, paperId) pair is missing from the
            prediction file.
    """
    gold_authorIdPaperId_to_label = _load_authorIdPaperId_to_label(
        gold_file_path)
    pred_authorIdPaperId_to_label = _load_authorIdPaperId_to_label(
        pred_file_path)

    # evaluation
    alphabet = Alphabet()
    alphabet.add("0")
    alphabet.add("1")

    cm = ConfusionMatrix(alphabet)
    for pair, gold in gold_authorIdPaperId_to_label.items():
        pred = pred_authorIdPaperId_to_label[pair]
        cm.add(pred, gold)

    return cm


def _load_authorIdPaperId_to_label(file_path):
    """Map each (AuthorId, paperId) pair listed in ``file_path`` to "1" or "0"."""
    labels = {}
    for item in util.read_dict_from_csv(file_path):
        AuthorId = item["AuthorId"]
        # split() without an argument yields no tokens for an empty field;
        # split(" ") would yield [""] and register a bogus (AuthorId, "") pair.
        # Positive samples.
        for paperId in item["ConfirmedPaperIds"].split():
            labels[(AuthorId, paperId)] = "1"
        # Negative samples.
        for paperId in item["DeletedPaperIds"].split():
            labels[(AuthorId, paperId)] = "0"
    return labels
Ejemplo n.º 7
0
def get_top_k_coauthors(paper_author_path, k, to_file):
    """Dump the paper id -> author ids mapping as JSON.

    Despite its name, this version only groups the rows returned by
    ``util.read_dict_from_csv(paper_author_path)`` by paper and writes the
    mapping to ``to_file``; no co-author ranking is computed.

    Args:
        paper_author_path: path handed to ``util.read_dict_from_csv``; each
            row must provide integer-like ``PaperId`` and ``AuthorId`` values.
        k: currently unused.
        to_file: path of the JSON file to write (overwritten).
    """
    data = util.read_dict_from_csv(paper_author_path)

    dict_paperId_to_authors = {}
    bar = pyprind.ProgPercent(len(data))
    for item in data:
        paperId = int(item["PaperId"])
        authorId = int(item["AuthorId"])
        dict_paperId_to_authors.setdefault(paperId, []).append(authorId)
        bar.update()

    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("dump...")
    # ``encoding`` is not passed: it is a Python 2-only keyword whose default
    # is already "utf-8".  ``with`` guarantees the file is closed.
    with open(to_file, "w") as out_file:
        json.dump(dict_paperId_to_authors, out_file)
Ejemplo n.º 8
0
def load_train_data(train_path):
    """Build labeled ``AuthorIdPaperId`` samples from the training file.

    Args:
        train_path: path handed to ``util.read_dict_from_csv``; each row must
            provide ``AuthorId`` plus whitespace-separated
            ``ConfirmedPaperIds`` and ``DeletedPaperIds`` fields.

    Returns:
        A list of ``AuthorIdPaperId`` objects in file order.  For each row
        the confirmed papers come first (``label`` 1), followed by the
        deleted papers (``label`` 0).
    """
    data = util.read_dict_from_csv(train_path)
    authorIdPaperIds = []
    for item in data:
        authorId = item["AuthorId"]

        # (field name, class label): positive samples first, then negative.
        # split() without an argument yields no tokens for an empty field,
        # whereas split(" ") would yield [""] and create a sample with an
        # empty paper id.
        for field, label in (("ConfirmedPaperIds", 1), ("DeletedPaperIds", 0)):
            for paperId in item[field].split():
                authorIdPaperId = AuthorIdPaperId(authorId, paperId)
                authorIdPaperId.label = label
                authorIdPaperIds.append(authorIdPaperId)

    return authorIdPaperIds
Ejemplo n.º 9
0
def get_dict_auther_keywords(paper_path, paper_author_path, k, to_file):
    """Collect, per author, the tokens of their papers' titles and keywords.

    For every row returned by ``util.read_dict_from_csv(paper_path)`` the
    string ``Title + " " + Keyword`` is passed to
    ``util.get_string_splited`` and the result is appended to the list of
    each author of that paper.  Authors are looked up in the JSON mapping
    stored at ``paper_author_path`` (keyed by the paper id as a string).
    The author -> tokens mapping is written to ``to_file`` as JSON.

    Args:
        paper_path: path handed to ``util.read_dict_from_csv``; each row must
            provide ``Id``, ``Title`` and ``Keyword``.
        paper_author_path: JSON file mapping paper id -> list of author ids.
        k: currently unused.
        to_file: path of the JSON file to write (overwritten).

    Raises:
        KeyError: if a paper id from ``paper_path`` is missing from the
            paper -> authors mapping.
    """
    data_paper = util.read_dict_from_csv(paper_path)
    # ``encoding`` is not passed to json.load/json.dump: it is a Python 2-only
    # keyword whose default is already "utf-8".
    with open(paper_author_path) as paper_author_file:
        dict_paper_author = json.load(paper_author_file)

    dict_auther_keywords = {}
    print("start...")
    bar = pyprind.ProgPercent(len(data_paper))
    for item in data_paper:
        paperId = int(item["Id"])
        title = item["Title"]
        keywords = item["Keyword"]
        key = util.get_string_splited(title + " " + keywords)

        for authorId in dict_paper_author[str(paperId)]:
            dict_auther_keywords.setdefault(authorId, []).extend(key)
        bar.update()

    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("dump...")
    with open(to_file, "w") as out_file:
        json.dump(dict_auther_keywords, out_file)
Ejemplo n.º 10
0
def get_top_k_coauthors(paper_author_path, k, to_file):
    """Write each author's ``k`` most frequent co-authors to a JSON file.

    Two authors are counted as co-authors once for every pair of rows that
    list them on the same paper.  The output maps
    ``authorId -> {coauthorId: shared_count}`` and keeps only the ``k``
    entries returned by ``Counter.most_common(k)`` for each author.

    Args:
        paper_author_path: path handed to ``util.read_dict_from_csv``; each
            row must provide integer-like ``PaperId`` and ``AuthorId`` values.
        k: number of co-authors to keep per author.
        to_file: path of the JSON file to write (overwritten).
    """
    data = util.read_dict_from_csv(paper_author_path)

    # paperId -> author ids in row order.
    dict_paperId_to_authors = {}
    for item in data:
        paperId = int(item["PaperId"])
        authorId = int(item["AuthorId"])
        dict_paperId_to_authors.setdefault(paperId, []).append(authorId)

    # authorId -> Counter of co-author ids.
    dict_author_to_coauthor = {}
    for authors in dict_paperId_to_authors.values():
        # Visit every unordered pair of positions once and count it in both
        # directions.
        for position, first in enumerate(authors):
            for second in authors[position + 1:]:
                dict_author_to_coauthor.setdefault(first, Counter())[second] += 1
                dict_author_to_coauthor.setdefault(second, Counter())[first] += 1

    print("取 top k...")
    # Take the top k.
    # authorid --> { author1: 100, author2: 45}
    res = {}
    for authorId, coauthors in dict_author_to_coauthor.items():
        res[authorId] = dict(coauthors.most_common(k))

    print("dump...")
    # ``encoding`` is not passed: it is a Python 2-only keyword whose default
    # is already "utf-8".  ``with`` guarantees the file is closed.
    with open(to_file, "w") as out_file:
        json.dump(res, out_file)
Ejemplo n.º 11
0
def get_top_k_key(Authorkeywords_PATH, k, to_file):
    """Write each keyword's ``k`` most frequent co-occurring keywords as JSON.

    The columns ``keyword1`` .. ``keyword10`` of every row returned by
    ``util.read_dict_from_csv(Authorkeywords_PATH)`` are gathered per
    ``AuthorId``.  Two keywords co-occur once for every pair of positions
    they occupy in the same author's list.  The output maps
    ``keyword -> {other_keyword: count}`` and keeps only the ``k`` entries
    returned by ``Counter.most_common(k)``.  The result is also printed.

    Values that are ``None`` are skipped for all ten columns.  The original
    copy-pasted stanzas applied that guard only to ``keyword3`` ..
    ``keyword10``.

    Args:
        Authorkeywords_PATH: path handed to ``util.read_dict_from_csv``; each
            row must provide ``AuthorId`` and ``keyword1`` .. ``keyword10``.
        k: number of co-occurring keywords to keep per keyword.
        to_file: path of the JSON file to write (overwritten).
    """
    data_authors = util.read_dict_from_csv(Authorkeywords_PATH)

    # AuthorId -> keywords in column order.  The original stanzas wrote to an
    # undefined ``dict_authors_to_JC`` instead of this dict.
    dict_authors_to_key = {}
    for item in data_authors:
        AuthorId = item["AuthorId"]
        for column_index in range(1, 11):
            key = item["keyword%d" % column_index]
            if key is not None:
                dict_authors_to_key.setdefault(AuthorId, []).append(key)

    # keyword -> Counter of the keywords sharing an author with it.
    dict_authors_to_key11 = {}
    for key11 in dict_authors_to_key.values():
        for position, first in enumerate(key11):
            for second in key11[position + 1:]:
                dict_authors_to_key11.setdefault(first, Counter())[second] += 1
                dict_authors_to_key11.setdefault(second, Counter())[first] += 1

    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("get top k...")

    res = {}
    for keyword, cooccurring in dict_authors_to_key11.items():
        res[keyword] = dict(cooccurring.most_common(k))
    print(res)

    print("dump...")

    # ``encoding`` is not passed: it is a Python 2-only keyword whose default
    # is already "utf-8".  ``with`` guarantees the file is closed.
    with open(to_file, "w") as out_file:
        json.dump(res, out_file)