Exemple #1
0
def stringDistance_1(AuthorIdPaperId, dict_coauthor,
                     dict_paperIdAuthorId_to_name_aff, PaperAuthor, Author):
    """Build string-distance features between an author's profile and a paper record.

    The name/affiliation recorded for the (paper, author) pair is looked up in
    ``dict_paperIdAuthorId_to_name_aff`` under the key ``"<paperId>|<authorId>"``.
    The author's profile name and affiliation are read positionally (columns 1
    and 2) from the first ``Author`` row whose ``Id`` matches.

    Returns whatever ``util.get_feature_by_list`` produces for six values:
    LCS-subsequence length, LCS-substring length and Levenshtein distance for
    the names, followed by the same three values for the affiliations.

    Raises IndexError when no ``Author`` row matches the author id.
    """
    author_id = AuthorIdPaperId.authorId
    paper_id = AuthorIdPaperId.paperId

    lookup_key = "%s|%s" % (paper_id, author_id)
    paper_name = str(dict_paperIdAuthorId_to_name_aff[lookup_key]["name"])
    paper_aff = str(dict_paperIdAuthorId_to_name_aff[lookup_key]["affiliation"])

    author_row = Author[Author["Id"] == int(author_id)].values[0]
    # Missing cells stringify to "nan"; treat them as empty text.
    profile_name, profile_aff = (
        "" if text == "nan" else text
        for text in (str(author_row[1]), str(author_row[2]))
    )

    features = []
    # Names first, affiliations second; three metrics per pair.
    for profile_text, paper_text in ((profile_name, paper_name),
                                     (profile_aff, paper_aff)):
        features.append(
            len(longest_common_subsequence(profile_text, paper_text)))
        features.append(
            len(longest_common_substring(profile_text, paper_text)))
        features.append(Levenshtein_distance(profile_text, paper_text))

    return util.get_feature_by_list(features)
def keywords_1(AuthorIdPaperId, dict_coauthor, dict_paperIdAuthorId_to_name_aff,
               dict_author_keywords, PaperAuthor, Author, Paper,
               dict_author_conference_journal, conference, journal,
               dict_author_paperid):
    """Build keyword-overlap features between a paper and its candidate author.

    ``dict_author_keywords[authorId]`` is used as a mapping from keyword to a
    numeric weight. The current paper's words are obtained by passing
    ``"<Title> <Keyword>"`` to ``util.get_string_splited``.

    Returns ``util.get_feature_by_list([nums, score])`` where ``nums`` is the
    number of distinct words shared with the author's keyword mapping and
    ``score`` is the sum of the mapped weights over every occurrence of a
    shared word in the paper.

    A paper id with no row in ``Paper`` yields blank title and keywords rather
    than an IndexError.
    """
    authorId = AuthorIdPaperId.authorId
    paperId = AuthorIdPaperId.paperId

    # Keyword -> weight mapping collected for this author.
    former_keywords = dict_author_keywords[authorId]

    # Filter once and read both text columns from the same rows.
    paper_rows = Paper[Paper["Id"] == int(paperId)]
    titles = paper_rows["Title"].values
    keyword_values = paper_rows["Keyword"].values

    title = str(titles[0]) if len(titles) else ' '
    # Guard the keyword column the same way as the title: an unknown paper id
    # gives an empty selection, and indexing it would raise IndexError.
    keywords = str(keyword_values[0]) if len(keyword_values) else ' '
    if keywords == "nan":
        keywords = ' '
    # NOTE(review): a missing Title stringifies to "nan" and is kept as a
    # word, unlike Keyword -- confirm whether that is intended.
    curr_keywords = util.get_string_splited(title + " " + keywords)

    # Number of distinct words in common.
    nums = len(set(curr_keywords) & set(former_keywords))

    # Weighted score: every occurrence of a shared word contributes its weight.
    score = sum(former_keywords[word] for word in curr_keywords
                if word in former_keywords)

    return util.get_feature_by_list([nums, score])
Exemple #3
0
def journal_conference_year(AuthorIdPaperId, dict_coauthor,
                            dict_paperIdAuthorId_to_name_aff, PaperAuthor,
                            Author, Paper, Conference, Journal):
    """Return the paper's conference id, journal id and year as features.

    Each value falls back to 0 when it is unusable: conference/journal ids
    that are missing or not positive, and years outside 1800..2013.
    A paper id with no row in ``Paper`` yields ``[0, 0, 0]`` instead of
    raising IndexError on the year lookup.

    Returns ``util.get_feature_by_list([conference_id, journal_id, year])``.
    """
    paperId = int(AuthorIdPaperId.paperId)
    # Select the paper's rows once; all three columns are read from them.
    paper_rows = Paper[Paper['Id'] == paperId]

    def first_as_int(column):
        # First value of the column as int, or 0 when the paper is unknown.
        values = paper_rows[column].values
        return int(values[0]) if len(values) else 0

    conference_id = first_as_int('ConferenceId')
    journal_id = first_as_int('JournalId')
    paper_year = first_as_int('Year')

    feat_list = [
        conference_id if conference_id > 0 else 0,
        journal_id if journal_id > 0 else 0,
        paper_year if 1800 <= paper_year <= 2013 else 0,
    ]

    return util.get_feature_by_list(feat_list)
Exemple #4
0
def stringDistance_2(AuthorIdPaperId, dict_coauthor,
                     dict_paperIdAuthorId_to_name_aff, PaperAuthor, Author,
                     Paper, Conference, Journal):
    """Build averaged string-similarity features for a (paper, author) pair.

    The name and affiliation stored for the pair may hold several variants
    joined by ``"##"``. Each variant is compared with the author's profile
    value (columns 1 and 2 of the first matching ``Author`` row) and the
    per-variant results are averaged with ``np.mean``.

    Returns ``util.get_feature_by_list`` of six values: mean LCS-subsequence
    length, mean LCS-substring length and mean Jaro-Winkler score for the
    names, then the same three for the affiliations.
    """

    def mean_similarities(reference, joined_variants):
        # One entry per "##"-separated variant, averaged at the end.
        subsequence_lengths = []
        substring_lengths = []
        jaro_scores = []
        for variant in joined_variants.split("##"):
            subsequence_lengths.append(
                len(longest_common_subsequence(reference, variant)))
            substring_lengths.append(
                len(longest_common_substring(reference, variant)))
            # Jaro-Winkler is the metric in use; Levenshtein_distance and
            # textdistance.Jaccard() were tried as alternatives.
            jaro_scores.append(textdistance.JaroWinkler()(reference, variant))
        return [
            np.mean(subsequence_lengths),
            np.mean(substring_lengths),
            np.mean(jaro_scores),
        ]

    author_id = AuthorIdPaperId.authorId
    paper_id = AuthorIdPaperId.paperId

    lookup_key = "%s|%s" % (paper_id, author_id)
    paper_names = str(dict_paperIdAuthorId_to_name_aff[lookup_key]["name"])
    paper_affs = str(
        dict_paperIdAuthorId_to_name_aff[lookup_key]["affiliation"])

    author_row = Author[Author["Id"] == int(author_id)].values[0]
    profile_name = str(author_row[1])
    profile_aff = str(author_row[2])
    # Missing cells stringify to "nan"; compare against empty text instead.
    if profile_name == "nan":
        profile_name = ""
    if profile_aff == "nan":
        profile_aff = ""

    features = (mean_similarities(profile_name, paper_names) +
                mean_similarities(profile_aff, paper_affs))

    return util.get_feature_by_list(features)
Exemple #5
0
def publication_year(AuthorIdPaperId, dict_coauthor,
                     dict_paperIdAuthorId_to_name_aff, PaperAuthor, Author,
                     Paper, Conference, Journal):
    """Position the paper's year within the author's publication-year range.

    Years of all papers listed for the author in ``PaperAuthor`` are looked
    up in ``Paper``; only years within 1800..2013 are kept.

    Returns ``util.get_feature_by_list`` of ``[0, 0, 0]`` when no usable year
    exists, otherwise ``[1, paper_year - earliest, latest - paper_year]``.
    """
    author_id = AuthorIdPaperId.authorId
    paper_id = AuthorIdPaperId.paperId

    # Year of the paper under consideration.
    paper_year = Paper[Paper['Id'] == int(paper_id)]['Year'].values[0]

    # Ids of every paper attributed to this author.
    authored_ids = PaperAuthor[
        PaperAuthor['AuthorId'] == int(author_id)]['PaperId'].values

    usable_years = []
    for authored_id in authored_ids:
        matches = Paper[Paper['Id'] == int(authored_id)]['Year'].values
        # Skip papers absent from Paper and years outside the accepted range.
        if matches.shape[0] != 0 and 1800 <= matches[0] <= 2013:
            usable_years.append(matches[0])

    if usable_years:
        feature = [
            1,
            paper_year - min(usable_years),
            max(usable_years) - paper_year,
        ]
    else:
        feature = [0, 0, 0]

    return util.get_feature_by_list(feature)
def coauthor_1(AuthorIdPaperId, dict_coauthor, dict_paperIdAuthorId_to_name_aff,
               dict_author_keywords, PaperAuthor, Author, Paper,
               dict_author_conference_journal, conference, journal,
               dict_author_paperid):
    """Count how many of the author's known coauthors appear on the paper.

    The paper's author ids are read from ``PaperAuthor`` and converted to
    strings; they are intersected with the keys of
    ``dict_coauthor[authorId]``.

    Returns ``util.get_feature_by_list([count])``.
    """
    authorId = AuthorIdPaperId.authorId
    paperId = AuthorIdPaperId.paperId

    # Authors listed on this paper, as strings to match the dictionary keys.
    paper_rows = PaperAuthor[PaperAuthor["PaperId"] == int(paperId)]
    authors_on_paper = {str(author) for author in paper_rows["AuthorId"].values}

    # Coauthors recorded for this author (presumably the top ones; the
    # dictionary is built elsewhere).
    known_coauthors = dict_coauthor[authorId].keys()

    shared = authors_on_paper.intersection(known_coauthors)

    return util.get_feature_by_list([len(shared)])
Exemple #7
0
def journal_count(AuthorIdPaperId, dict_coauthor,
                  dict_paperIdAuthorId_to_name_aff, PaperAuthor, Author, Paper,
                  Conference, Journal):
    """Count the distinct journal ids across all papers listed for the author.

    Papers attributed to the author in ``PaperAuthor`` but absent from
    ``Paper`` are ignored.

    Returns ``util.get_feature_by_list([number_of_distinct_journal_ids])``.
    """
    author_id = int(AuthorIdPaperId.authorId)
    authored_ids = PaperAuthor[
        PaperAuthor['AuthorId'] == author_id]['PaperId'].values

    # Lazily look up the JournalId column for each authored paper.
    journal_columns = (
        Paper[Paper['Id'] == int(authored_id)]['JournalId'].values
        for authored_id in authored_ids
    )
    # NOTE(review): every id value is counted, including 0 -- confirm whether
    # 0 means "no journal" and should be left out.
    distinct_journals = {
        int(column[0]) for column in journal_columns if len(column) > 0
    }

    return util.get_feature_by_list([len(distinct_journals)])
Exemple #8
0
def affiliation_count(AuthorIdPaperId, dict_coauthor,
                      dict_paperIdAuthorId_to_name_aff, PaperAuthor, Author,
                      Paper, Conference, Journal):
    """Count the paper's author rows that share the candidate's affiliation.

    The affiliation is taken from the first ``PaperAuthor`` row of the paper
    whose ``AuthorId`` matches the candidate author. The count includes the
    candidate's own row.

    Returns ``util.get_feature_by_list([count])``; the count is 0 when the
    candidate has no row for this paper or when the affiliation is missing
    (stringifies to ``"nan"``).
    """
    # Compare ids as strings so the match works for both str and int ids.
    author_key = str(AuthorIdPaperId.authorId)
    paperId = AuthorIdPaperId.paperId

    # Filter once; both columns come from the same rows, in the same order.
    paper_rows = PaperAuthor[PaperAuthor["PaperId"] == int(paperId)]
    curr_coauthors = [str(author) for author in paper_rows["AuthorId"].values]
    curr_affiliations = [
        str(aff) for aff in paper_rows["Affiliation"].values
    ]

    try:
        index = curr_coauthors.index(author_key)
    except ValueError:
        # The candidate is not listed on this paper: nothing to compare with.
        return util.get_feature_by_list([0])

    affiliation = curr_affiliations[index]
    if affiliation == 'nan':
        return util.get_feature_by_list([0])
    return util.get_feature_by_list([curr_affiliations.count(affiliation)])
def coauthor_2(AuthorIdPaperId, dict_coauthor, dict_paperIdAuthorId_to_name_aff,
               dict_author_keywords, PaperAuthor, Author, Paper,
               dict_author_conference_journal, conference, journal,
               dict_author_paperid):
    """Sum the coauthor weights of every author listed on the paper.

    ``dict_coauthor[authorId]`` is used as a mapping from coauthor id (string)
    to a numeric weight, e.g. ``{"authorId": 100}``. Each ``PaperAuthor`` row
    of the paper whose author id is in that mapping adds its weight.

    Returns ``util.get_feature_by_list([score])``.
    """
    authorId = AuthorIdPaperId.authorId
    paperId = AuthorIdPaperId.paperId

    # Authors listed on this paper, as strings to match the mapping's keys.
    paper_rows = PaperAuthor[PaperAuthor["PaperId"] == int(paperId)]
    authors_on_paper = [str(author) for author in paper_rows["AuthorId"].values]

    coauthor_weights = dict_coauthor[authorId]

    score = sum(coauthor_weights[author] for author in authors_on_paper
                if author in coauthor_weights)

    return util.get_feature_by_list([score])
def conference_journal_2(AuthorIdPaperId, dict_coauthor,
                         dict_paperIdAuthorId_to_name_aff, dict_author_keywords,
                         PaperAuthor, Author, Paper,
                         dict_author_conference_journal, conference, journal,
                         dict_author_paperid):
    """Score the paper's venue against the author's per-venue counts.

    The paper's conference and journal ids are stringified and looked up in
    ``dict_author_conference_journal[authorId]["conferenceId"]`` and
    ``[...]["journalId"]``. When both ids are ``"0"`` the larger of the two
    looked-up values is used, otherwise their sum.

    Returns ``util.get_feature_by_list([score])``.
    """
    authorId = AuthorIdPaperId.authorId  # candidate author
    paperId = AuthorIdPaperId.paperId  # paper under consideration

    # Venue ids of the paper, as strings to match the dictionary keys.
    conference_values = Paper[Paper["Id"] == int(paperId)]["ConferenceId"].values
    journal_values = Paper[Paper["Id"] == int(paperId)]["JournalId"].values
    conference_key = str(conference_values[0])
    journal_key = str(journal_values[0])

    conference_count = (
        dict_author_conference_journal[authorId]["conferenceId"][conference_key])
    journal_count = (
        dict_author_conference_journal[authorId]["journalId"][journal_key])

    venue_unknown = conference_key == "0" and journal_key == "0"
    if venue_unknown:
        combined = max(conference_count, journal_count)
    else:
        combined = conference_count + journal_count

    return util.get_feature_by_list([combined])
Exemple #11
0
def keyword(AuthorIdPaperId, dict_coauthor, dict_paperIdAuthorId_to_name_aff,
            PaperAuthor, Author, Paper, Conference, Journal):
    """Count words the paper shares with the author's other papers.

    Words come from each paper's ``Title`` and ``Keyword`` fields, split on
    whitespace and on ``|``, ``;`` and ``,``. English stopwords (nltk corpus)
    and purely numeric tokens are dropped.

    Returns ``util.get_feature_by_list([n])`` where ``n`` is the number of
    distinct words of the current paper that also occur in at least one other
    paper listed for the author in ``PaperAuthor``.

    Raises IndexError when the current paper has no row in ``Paper``.
    """
    # Load the stopword list once; fetching it per word is very slow and a
    # set gives constant-time membership tests.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    def get_words(paper):
        # NOTE(review): a missing Title stringifies to "nan" and is kept as a
        # word -- confirm whether that is intended.
        parts = [str(paper.Title)]
        if not pd.isna(paper.Keyword):
            parts.append(str(paper.Keyword))
        # Join with a space so the last title word and the first keyword
        # do not fuse into a single token.
        text = " ".join(parts)
        return [
            w for w in re.split(r'[|\s;,]', text)
            if w and w not in stop_words and not w.isdigit()
        ]

    author_id = int(AuthorIdPaperId.authorId)
    # Cast to int so the comparison below works when the id arrives as text.
    paper_id = int(AuthorIdPaperId.paperId)

    papers_of_author = PaperAuthor[PaperAuthor['AuthorId'] == author_id]
    current_words = set(get_words(Paper[Paper['Id'] == paper_id].iloc[0]))

    other_words = set()
    for authored_id in papers_of_author['PaperId'].values:
        # The paper must not be compared with itself.
        if int(authored_id) == paper_id:
            continue
        rows = Paper[Paper['Id'] == authored_id]
        if rows.shape[0] == 0:
            continue
        other_words.update(get_words(rows.iloc[0]))

    return util.get_feature_by_list([len(other_words & current_words)])
def yeardistance(AuthorIdPaperId, dict_coauthor, dict_paperIdAuthorId_to_name_aff,
                 dict_author_keywords, PaperAuthor, Author, Paper,
                 dict_author_conference_journal, conference, journal,
                 dict_author_paperid):
    """Flag whether the author has another paper close in time to this one.

    Rationale: if the author published nothing around the paper's year, the
    paper is less likely to be theirs.

    The smallest absolute difference between the paper's year and the years
    of the author's other papers (ids from ``dict_author_paperid[authorId]``)
    is compared with a gap of 8 years.

    Returns ``util.get_feature_by_list([1])`` when that smallest difference
    is below 8, otherwise ``util.get_feature_by_list([-1])`` (also when the
    author has no other paper present in ``Paper``).
    """
    max_year_gap = 8

    authorId = AuthorIdPaperId.authorId  # candidate author
    paper_id = int(AuthorIdPaperId.paperId)  # paper under consideration

    # Year of the current paper.
    curyear = Paper[Paper["Id"] == paper_id]["Year"].values[0]

    # Cast ids to int so isin() matches the integer Id column, and leave out
    # the paper itself: its distance of 0 would make the feature constant.
    other_ids = {int(pid) for pid in dict_author_paperid[authorId]}
    other_ids.discard(paper_id)
    other_years = Paper[Paper["Id"].isin(list(other_ids))]["Year"].values

    if len(other_years) == 0:
        return util.get_feature_by_list([-1])

    # Distance in years to the closest other paper. The raw minimum year is
    # never below 8 for real data, so comparing it directly is meaningless.
    min_distance = np.min(np.abs(other_years - curyear))
    feat_list = [1] if min_distance < max_year_gap else [-1]
    return util.get_feature_by_list(feat_list)
def _venue_homepage(venues, venue_id, fallback):
    """Return the HomePage of the first *venues* row with Id *venue_id*, as str.

    *fallback* is returned when no row matches.
    """
    pages = venues[venues["Id"] == int(venue_id)]["HomePage"].values
    # Test the length explicitly: comparing a numpy array with None is
    # elementwise and its truth value is ambiguous or an error.
    if len(pages) > 0:
        return str(pages[0])
    return fallback


def _same_website_count(venues, current_id, venue_counts):
    """Sum the counts in *venue_counts* of venues sharing the current venue's site."""
    # Distinct fallbacks ("0" vs "1") are kept for the two sides so that two
    # unknown homepages are compared as different strings.
    current_page = _venue_homepage(venues, current_id, "0")
    total = 0
    for venue_id, count in venue_counts.items():
        other_page = _venue_homepage(venues, venue_id, "1")
        if in_thesame_major_website(other_page, current_page):
            total += count
    return total


def conference_journal_1(AuthorIdPaperId, dict_coauthor,
                         dict_paperIdAuthorId_to_name_aff, dict_author_keywords,
                         PaperAuthor, Author, Paper,
                         dict_author_conference_journal, conferences, journals,
                         dict_author_paperid):
    """Score how often the author published in venues on the paper's website.

    For the paper's conference (and/or journal), every venue recorded for the
    author in ``dict_author_conference_journal[authorId]`` is compared by
    homepage through ``in_thesame_major_website`` (presumably a main-domain
    comparison; it is defined elsewhere). The author's counts for the
    matching venues are summed.

    Returns ``util.get_feature_by_list([score])``. The score is 0 when both
    the conference id and the journal id of the paper are ``"0"``.

    Raises IndexError when the paper has no row in ``Paper``.
    """
    authorId = AuthorIdPaperId.authorId  # candidate author
    paperId = AuthorIdPaperId.paperId  # paper under consideration

    # Venue ids of the paper, as strings.
    paper_rows = Paper[Paper["Id"] == int(paperId)]
    conferenceId = str(paper_rows["ConferenceId"].values[0])
    journalId = str(paper_rows["JournalId"].values[0])

    score = 0

    if conferenceId != "0":
        score = _same_website_count(
            conferences, conferenceId,
            dict_author_conference_journal[authorId]["conferenceId"])

    # NOTE(review): when both ids are non-zero the journal score replaces the
    # conference score, as in the original code -- confirm that is intended.
    if journalId != "0":
        score = _same_website_count(
            journals, journalId,
            dict_author_conference_journal[authorId]["journalId"])

    return util.get_feature_by_list([score])
# The number of times a coauthor has collaborated with the current author is simply used as that coauthor's score.


def key(AuthorIdPaperId, dict_coauthor, dict_key,
        dict_paperIdAuthorId_to_name_aff, PaperAuthor, Author):
    """Count the overlap between the paper's entries and the author's keys.

    Values of the ``AuthorId`` column are read from the rows of
    ``AuthorKeywords`` whose ``PaperId`` matches the paper, converted to
    strings, and intersected with the keys of ``dict_key[authorId]``.

    NOTE(review): ``AuthorKeywords`` is not a parameter; it is assumed to be a
    module-level table defined elsewhere -- confirm.

    Returns ``util.get_feature_by_list([count])``.
    """
    authorId = AuthorIdPaperId.authorId
    paperId = AuthorIdPaperId.paperId

    paper_rows = AuthorKeywords[AuthorKeywords["PaperId"] == int(paperId)]
    curr_key = [str(value) for value in paper_rows["AuthorId"].values]

    # Look up by the local author id; the capitalised name was undefined.
    top_key = dict_key[authorId]

    nums = len(set(curr_key) & set(top_key))

    return util.get_feature_by_list([nums])


# 1. Simply count how many of the top 10 coauthors appear on the paper.
def coauthor_1(AuthorIdPaperId, dict_coauthor,dict_key, dict_paperIdAuthorId_to_name_aff, PaperAuthor, Author):
    authorId = AuthorIdPaperId.authorId
    paperId = AuthorIdPaperId.paperId

    # 从PaperAuthor中,根据paperId找coauthor。
    curr_coauthors = list(map(str, list(PaperAuthor[PaperAuthor["PaperId"] == int(paperId)]["AuthorId"].values)))
    #
    top_coauthors = dict_coauthor[authorId].keys()

    # 简单计算top 10 coauthor出现的个数
    nums = len(set(curr_coauthors) & set(top_coauthors))