def prepareNULL():
    """Bucket null-DOI papers by (year, title bucket) and serialize each
    non-empty bucket to its own pickle file."""
    null_ids = readSeriz(nullDOI_pickle)
    # up to 60 distinct years x 27 title buckets (one per assignNum slot)
    yearTitle = [[[] for _ in range(27)] for _ in range(60)]
    yearList = []
    num = 0
    for nid in null_ids:
        row = getResult('select time,title from paper where id=' + str(nid),
                        cur)[0]
        # skip rows lacking a title or a positive year
        if len(row['title']) == 0 or row['time'] <= 0:
            continue
        # register the year on first sight; its list position is the row index
        if row['time'] not in yearList:
            yearList.append(row['time'])
        yidx = yearList.index(row['time'])
        nidx = assignNum(row['title'])
        yearTitle[yidx][nidx].append((nid, row['time'], row['title']))
        print('now is ' + str(num) + ' id is ' + str(nid))
        num += 1
    print('begin saving')

    constructSeriz(yearList_pickle_null, yearList)
    for yi in range(len(yearList)):
        for bi in range(27):
            bucket = yearTitle[yi][bi]
            if bucket:
                path = idyeartitle_path_null + str(yi) + '_' + str(bi) + '.pickle'
                constructSeriz(path, bucket)
def buildCoauthorByYear(year):
	"""Build the coauthor network keeping only collaborations after *year*.

	Reads the per-pair coauthorship-year lists (paperNetYear), scores each
	pair, writes the score to the `coauthornet` table and records it in an
	in-memory matrix that is serialized at the end.
	"""
	#coauthornet xid,yid,score
	paperNetYear = readSeriz(paperNetYear_pickle)
	idList = readSeriz(idList_pickle)
	coauthorNet = readSeriz(expertNet_pickle)  # serves as an initialized template
	for i in range(len(idList)):
		xid = str(idList[i])
		for j in range((i + 1), len(idList)):
			yid = str(idList[j])
			pairYears = paperNetYear[i][j]
			# template cells left at their 0/empty default mean no coauthorship
			if not pairYears:
				continue
			# BUG FIX: the original tested an undefined name `yearList`
			# (NameError). Apply the year filter that the commented-out SQL
			# ('... year>' + str(year)) intended.
			coauthorList = [y for y in pairYears if y > year]
			if len(coauthorList) == 0:
				continue
			score = calculateCoauthor(coauthorList)
			# xid/yid are already strings
			insertSQL = 'insert into coauthornet (xid,yid,score) values(' + xid + ',' + yid + ',' + str(score) + ')'
			cur.execute(insertSQL)
			conn.commit()
			coauthorNet[i][j] = score  # keep an in-memory record too

	# TODO: this path must be rewritten to incorporate the year variable.
	coauthorNet_pickle = ''
	constructSeriz(coauthorNet_pickle, coauthorNet)
Example #3
0
def prepareTopic():
    """Collect subject phrases from four yearly pickles, normalize every
    word (stopword filter + WordNet stemming) and serialize the word list.
    """
    subL1 = 'E:/Code/experience/pickle/subPickle 2012.pickle'
    subL2 = 'E:/Code/experience/pickle/subPickle 2013.pickle'
    subL3 = 'E:/Code/experience/pickle/subPickle 2014.pickle'
    subL4 = 'E:/Code/experience/pickle/subPickle 2015.pickle'
    # NOTE(review): `stopword` must exist at module level (readStopWord()
    # was commented out here) — confirm before running.
    sub = readSeriz(subL1) + readSeriz(subL2) + readSeriz(subL3) + readSeriz(subL4)
    sub = list(set(sub))
    topicWord = []

    for s in sub:
        # strip BOM markers and treat '/' as a word separator
        s = s.replace('\ufeff', '').replace('/', ' ')
        for w in s.split():
            if w not in stopword and len(w) > 2:
                nw = wn.morphy(w)  # stem; returns None when no base form exists
                # keep the stemmed form when available, else the original word
                topicWord.append(nw if nw is not None else w)
    # BUG FIX: the original deduplicated `newTopic` (a single word) instead
    # of the accumulated list, so duplicates were serialized.
    topicWord = list(set(topicWord))
    #print(topicWord)
    constructSeriz(word_pickle, topicWord)  # serialize
Example #4
0
def measureCOI():
    """Compute the paper x reviewer COI (conflict-of-interest) matrix.

    COI[i][j] = 1 - min over authors of paper i of the shortest-path
    distance between that author and reviewer j; serialized at the end.
    """
    reviewerDict = readSeriz(reviewerDict_pickle)
    personDict = readSeriz(personDict_pickle)
    reviewer_ID = readSeriz(reviewer_ID_pickle)
    totalEID = readSeriz(totalEID_pickle)
    paperDict = readSeriz(paperDict_pickle)
    shortDistanceMetrx = readSeriz(shortDistanceMetrx_pickle)

    paperList = list(paperDict.keys())
    reviewerList = list(reviewerDict.keys())
    COI = [[0 for _ in range(len(reviewerList))]
           for _ in range(len(paperList))]
    # PERF: the original recomputed list.index / totalEID.index for every
    # (paper, reviewer) pair; hoist the reviewer -> matrix-column lookups.
    reviewerIdy = [totalEID.index(reviewer_ID[rName]) for rName in reviewerList]
    for i, title in enumerate(paperList):
        # author eid -> matrix-row indices, computed once per paper
        authorIdx = [totalEID.index(personDict[name]) for name in paperDict[title]]
        for j in range(len(reviewerList)):
            idy = reviewerIdy[j]
            coiList = [shortDistanceMetrx[idx][idy] for idx in authorIdx]
            # min() assumes each paper has at least one author entry —
            # TODO confirm paperDict never maps a title to an empty list
            COI[i][j] = 1 - min(coiList)  # the reviewer's COI for this paper
    constructSeriz(COI_pickle, COI)
Example #5
0
def extractPaper2013():
    """Parse the 2013 accepted-papers HTML dump into
    {title: [[name, institution], ...]} and serialize it.

    Titles sit between '</strong>' and '<br />'; the following
    'Authors: ...' line carries 'Name, Institution;' pairs. Duplicate
    titles get the running line counter appended to stay unique.
    """
    #
    rfile = open(accepted_path, encoding='utf-8')
    rfile = list(rfile)
    paperDict = {}
    i = 0
    title = ''
    for j in range(len(rfile)):
        i += 1
        if '<strong>' in rfile[j] and '<br />' in rfile[j]:
            #title
            # title is the text between the closing </strong> tag and <br />
            start = rfile[j].index('</strong>')
            end = rfile[j].index('<br />')
            title = rfile[j][start + len('</strong>'):end].strip()
            continue
        elif 'Authors: ' in rfile[j]:
            # author line: strip markup, then split 'Name, Institution' pairs on ';'
            authorInstit = rfile[j].replace('Authors: ', '').replace(
                '<br />', '').replace('</h5>', '')
            authors = []
            aipairs = authorInstit.split(';')
            for k in range(len(aipairs)):
                if len(aipairs[k]) < 5:
                    continue
                if ',' in aipairs[k]:
                    idx = aipairs[k].index(',')
                    name = aipairs[k][:idx].strip()
                    institu = aipairs[k][idx + 1:]
                    if len(institu) > 1:
                        institu = institu.strip()
                    else:
                        institu = 'Null'
                else:
                    # no comma: fragment is institution-only; recover the name
                    # from the tail of the previous fragment and trim it off
                    # the previously stored institution.
                    # NOTE(review): `idx` from aipairs[k-1] is reused to slice
                    # both aipairs[k] and the stored institution — verify this
                    # alignment is actually intended.
                    try:
                        institu = aipairs[k].strip()
                        idx = aipairs[k - 1].rindex(',')
                        name = aipairs[k][idx + 1:].strip()
                        authors[len(authors) - 1][1] = authors[len(authors) -
                                                               1][1][:idx]
                    #print('error: '+authorInstit)
                    except Exception:
                        print(authorInstit)
                        break
                authors.append([name, institu])
            if title in paperDict.keys():
                # duplicate title: make the key unique with the line counter
                title += (' ' + str(i))
                paperDict[title] = authors
            else:
                paperDict[title] = authors
            print('completed: ' + title)
            print(authors)
            print()
        else:
            #none
            pass

        #break
    constructSeriz(paperDict_pickle, paperDict)
    print('OK')
Example #6
0
def flody():
    """Run an iterative Floyd-style all-pairs shortest-distance relaxation.

    DSet[k] is the distance matrix after k passes; ASet[k] records, per
    cell, the intermediate node(s) that produced an improvement. Iterates
    until the matrix stops changing (or MAX_TIME passes) and serializes
    the final matrix.
    """
    distanceMetrx = readSeriz(distanceMetrx_pickle)
    lenD = len(distanceMetrx)
    DSet = [distanceMetrx]
    ASet = [[[[] for i in range(lenD)] for j in range(lenD)]]

    # BUG FIX: if the loop body never runs (MAX_TIME <= 1) the original
    # serialized an undefined name; fall back to the input matrix.
    shortDistanceMetrx = distanceMetrx

    for k in range(1, MAX_TIME):  # iteration count, starting from 1
        # BUG FIX: the original appended the very same D0/A0 objects every
        # pass, so all DSet[k] (and all ASet[k]) aliased one shared matrix.
        # Allocate fresh matrices per iteration instead.
        DSet.append([[0 for i in range(lenD)] for j in range(lenD)])
        ASet.append([[[] for i in range(lenD)] for j in range(lenD)])
        for i in range(lenD):
            for j in range(lenD):
                if i == j:
                    DSet[k][j][j] = 0
                    ASet[k][j][j] = 0
                    continue

                MIN_old = DSet[k - 1][i][j]
                for x in range(lenD):
                    if (x == i) or (x == j):
                        continue
                    # NOTE(review): the >= guard treats larger values as "not
                    # an improvement path"; confirm the matrix semantics match.
                    if (DSet[k - 1][i][x] >= MIN_old) or (DSet[k - 1][x][j] >=
                                                          MIN_old):
                        DSet[k][i][j] = DSet[k - 1][i][j]
                    else:
                        MIN_new = min(MIN_old,
                                      (DSet[k - 1][i][x] + DSet[k - 1][x][j]))
                        DSet[k][i][j] = MIN_new
                        if MIN_new < MIN_old:
                            # remember x as the improving waypoint
                            ASet[k][i][j] = [
                                ASet[k - 1][i][x], x, ASet[k - 1][x][j]
                            ]
                        else:
                            ASet[k][i][j] = ASet[k - 1][i][j]
                        MIN_old = MIN_new

        print('now is ' + str(k))
        shortDistanceMetrx = DSet[k][:]
        # converged: no cell changed during this pass
        if DSet[k] == DSet[k - 1]:
            break

    constructSeriz(shortDistanceMetrx_pickle, shortDistanceMetrx)
Example #7
0
def extractPaper2014():
    """Parse the 2014 accepted-papers HTML dump into
    {title: [[name, institution], ...]} and serialize it.

    Each relevant line holds '<strong>Title</strong> <br/>Authors...';
    the author part carries 'Name, Institution;' pairs. Duplicate titles
    get the running line counter appended to stay unique.
    """
    #
    rfile = open(accepted_path, encoding='utf-8')
    rfile = list(rfile)
    paperDict = {}
    i = 0
    for j in range(len(rfile)):
        i += 1
        if not '<strong>' in rfile[j]:
            continue
        else:
            # title before '</strong> <br/>', authors right after it
            parts = rfile[j].split('</strong> <br/>')
            title = parts[0].replace('<strong>', '').replace('<s>', '').strip()
            authorInstit = parts[1].split('<br/>')[0].strip()
            authors = []
            aipairs = authorInstit.split(';')
            for k in range(len(aipairs)):
                if len(aipairs[k]) < 5:
                    continue
                if ',' in aipairs[k]:
                    idx = aipairs[k].index(',')
                    name = aipairs[k][:idx].strip()
                    institu = aipairs[k][idx + 1:].strip()
                else:
                    # no comma: fragment is institution-only; recover the name
                    # from the previous fragment's tail.
                    # NOTE(review): `idx` comes from aipairs[k-1] but slices
                    # aipairs[k] and the stored institution — confirm intended.
                    try:
                        institu = aipairs[k].strip()
                        idx = aipairs[k - 1].rindex(',')
                        name = aipairs[k][idx + 1:].strip()
                        authors[len(authors) - 1][1] = authors[len(authors) -
                                                               1][1][:idx]
                    #print('error: '+authorInstit)
                    except Exception:
                        print(authorInstit)
                        break
                authors.append([name, institu])
            if title in paperDict.keys():
                # duplicate title: make the key unique with the line counter
                title += (' ' + str(i))
                paperDict[title] = authors
            else:
                paperDict[title] = authors
        print('completed: ' + title)
        #break
    constructSeriz(paperDict_pickle, paperDict)
    print('OK')
def compareNull():
    """Match null-DOI papers against the main (year, title-bucket) index.

    For every null-bucket file: papers whose title also appears in the
    corresponding main bucket are paired into sameList; the rest go to
    single. Both lists are serialized per bucket.
    """
    yearListNull = readSeriz(yearList_pickle_null)
    yearList = readSeriz(yearList_pickle)
    for fp in readFiles(nullDict):
        sameList = []
        single = []
        print('now begin: ' + str(fp))
        nullYearTitle = readSeriz(fp)
        yidx_null, nidx_null = extractYearTitle(fp)
        # BUG FIX: bound check must be >= — the original accepted
        # yidx_null == len(yearListNull) and then indexed out of range.
        if yidx_null >= len(yearListNull):
            print('error!!!!!!!!!!????')
            continue
        year = yearListNull[yidx_null]
        if year in yearList:
            yidx = yearList.index(year)
        else:
            print('error!!!!!!!!!!')
            continue

        path = idyeartitle_path + str(yidx) + '_' + str(nidx_null) + '.pickle'
        yearTitle = readSeriz(path)

        # BUG FIX: the original pre-filled `single` whenever yearTitle was
        # empty and then appended every id a second time in the loop below;
        # the loop alone already handles the empty-bucket case.
        for i in range(len(nullYearTitle)):
            flag = False
            for j in range(len(yearTitle)):
                if nullYearTitle[i][2] == yearTitle[j][2]:
                    # record every id pair sharing this exact title
                    sameList.append([nullYearTitle[i][0], yearTitle[j][0]])
                    flag = True
            if not flag:
                single.append(nullYearTitle[i][0])

        sameList_path = sameList_pickle + str(yidx) + '_' + str(
            nidx_null) + '.pickle'
        single_path = single_pickle + str(yidx) + '_' + str(
            nidx_null) + '.pickle'
        constructSeriz(sameList_path, sameList)
        constructSeriz(single_path, single)
def buildFinalGraph():
	"""Sum the coauthor and institution networks into one final graph.

	finalNet[i][j] = coauthorNet[i][j] + institutionNet[i][j]; every
	non-zero cell becomes a weighted edge of finalGraph, which is serialized.
	"""
	coauthorNet = readSeriz(coauthorNet_pickle)
	institutionNet = readSeriz(institutionNet_pickle)
	# BUG FIX: len(coauthorNet) was taken before coauthorNet was loaded
	# (NameError). Compute it after loading.
	finalLen = len(coauthorNet)
	finalNet = readSeriz(expertNet_pickle)  # initialize from the template
	finalGraph = nx.Graph()

	for i in range(finalLen):
		# j starts at i+1, so the old `if i==j: continue` guard was dead code
		for j in range((i + 1), finalLen):
			finalNet[i][j] = coauthorNet[i][j] + institutionNet[i][j]
			finalNet[j][i] = finalNet[i][j]
			if finalNet[i][j] != 0:
				finalGraph.add_edge(i, j, score=finalNet[i][j])

	constructSeriz(finalGraph_pickle, finalGraph)
Example #10
0
def extractSub2012():
    """Extract topic phrases from the 2012 subjects file and serialize them.

    Each non-trivial line looks like 'label: topic text'; commas and '//'
    are treated as 'and' separators. The deduplicated topic list is printed
    and serialized.
    """
    topicSet = []
    # BUG FIX: use a context manager so the file handle is always closed.
    with open(file_path) as file:
        for fl in file:
            # read line by line
            if len(fl) > 3:
                fl = fl.lower()
                text = fl.split(': ')[1]
                text = text.replace(',', ' and')
                text = text.replace('//', ' and ')
                if 'and' in text:
                    for word in text.split('and'):
                        topicSet.append(word.strip())
                else:
                    topicSet.append(text.strip())

    topicSet = list(set(topicSet))
    for tp in topicSet:
        print(tp)
    constructSeriz(subPickle_path, topicSet)
def buildTopic():
	"""Build the topic-similarity network between experts.

	For each expert pair, intersect their topic-id sets; a non-empty
	intersection is scored, written to the `topicnet` table, and recorded
	in the in-memory matrices/graph, all serialized at the end.
	"""
	# topicnet xid,yid,score
	topicNet = readSeriz(expertNet_pickle)
	topicNetMore = readSeriz(expertNet_pickle)
	topicGraph = nx.Graph()
	# pre-read: one (topic -> num dict, topic-id set) pair per expert
	totalSet = []
	for i in range(len(idList)):
		# 'other' acts as the topic id (tid).
		# BUG FIX: x['num'] is read below, so the query must select it too —
		# the original selected only `other` (KeyError).
		xResult = getResult('select other,num from topic where eid=' + str(idList[i]), cur)
		xDict = {}
		for x in xResult:
			xDict[x['other']] = x['num']
		totalSet.append((xDict, xDict.keys()))

	for i in range(len(idList)):
		# BUG FIX: xid/yid were undefined in the SQL below; derive them from
		# idList the way the other *net builders do.
		xid = str(idList[i])
		for j in range((i + 1), len(idList)):
			yid = str(idList[j])
			topicList = list(set(totalSet[i][1]).intersection(totalSet[j][1]))
			if len(topicList) > 0:
				score = calculateTopic(topicList)
				# xid/yid already converted to str
				insertSQL = 'insert into topicnet (xid,yid,score) values(' + xid + ',' + yid + ',' + str(score) + ')'
				cur.execute(insertSQL)
				conn.commit()
				topicNet[i][j] = score  # keep an in-memory record too
				# BUG FIX: `topicGraph(...)` called the Graph object itself;
				# add_edge is what registers the edge.
				topicGraph.add_edge(i, j, similar=score)
				topicNetMore[i][j] = topicList

	constructSeriz(topicNet_pickle, topicNet)
	constructSeriz(topicGraph_pickle, topicGraph)
	constructSeriz(topicNetMore_pickle, topicNetMore)
def buidPaperNet():
	"""Build the coauthorship (paper) network between experts.

	For each expert pair, intersect their paper-id sets; shared papers are
	inserted into `papernet`, counted in paperNet, and their years kept in
	paperNetYear and as edge data of paperGraph. All three are serialized.
	"""
	# papernet  id xid yid perid year
	paperNet = readSeriz(expertNet_pickle)      # serves as an initialized template
	paperNetYear = readSeriz(expertNet_pickle)  # serves as an initialized template
	paperGraph = nx.Graph()  # initialize a graph
	# pre-read: one (paper-id -> year dict, paper-id set) pair per expert
	totalSet = []
	for i in range(len(idList)):
		# BUG FIX: the original built the query with str() and no argument,
		# producing 'where eid=' and a broken statement.
		xResult = getResult('select id,time from paper where eid=' + str(idList[i]), cur)
		xDict = {}
		for x in xResult:
			xDict[x['id']] = x['time']
		totalSet.append((xDict, xDict.keys()))
	# pre-store the table once so the year condition can filter it later
	for i in range(len(idList)):
		for j in range((i + 1), len(idList)):
			coauthorList = list(set(totalSet[i][1]).intersection(totalSet[j][1]))
			if len(coauthorList) > 0:
				# insert every shared paper into the database
				for cid in coauthorList:
					year = totalSet[i][0][cid]
					insertSQL = 'insert into papernet (xid,yid,perid,year) values(' + str(idList[i]) + ',' + str(idList[j]) + ',' + str(cid) + ',' + str(year) + ')'
					# BUG FIX: the statement was built but never executed.
					cur.execute(insertSQL)
				conn.commit()
				paperNet[i][j] = len(coauthorList)
				paperNetYear[i][j] = coauthorList
				paperGraph.add_edge(i, j, coauthoryear=coauthorList)  # add one edge

	constructSeriz(paperNet_pickle, paperNet)
	constructSeriz(paperNetYear_pickle, paperNetYear)
	constructSeriz(paperGraph_pickle, paperGraph)
Example #13
0
def extractReviewer():
    """Collect reviewer -> institution pairs from two groups of files.

    pathList files use 'Name (Institution)' lines; pathListICDM files are
    'Name<space><tab>Institution' lines. The first occurrence of a name
    wins. The resulting dict is printed and serialized.
    """
    reviewerDict = {}
    for p in pathList:
        # BUG FIX: open files with a context manager so handles get closed.
        with open(p, encoding='utf-8') as rf:
            for line in rf:
                if '(' in line:
                    parts = line.split(' (')
                    name = parts[0].strip()
                    if name not in reviewerDict:
                        reviewerDict[name] = parts[1].replace(')', '').strip()
    for p in pathListICDM:
        with open(p, encoding='utf-8') as rf:
            for line in rf:
                if len(line) > 4:
                    parts = line.split(' 	')
                    name = parts[0].strip()
                    if name not in reviewerDict:
                        reviewerDict[name] = parts[1].strip()
    print(reviewerDict)
    constructSeriz(reviewerDict_pickle, reviewerDict)
def cleanDOI():
    """Assign a canonical paperid per distinct DOI.

    Rows sharing a DOI reuse the paperid assigned to the first row seen.
    Serializes the doi -> (first id, paperid) map and the [id, paperid]
    pair list.
    """
    #id,doi
    selectResult = getResult('select id,doi from paper', cur)
    print('read completed')
    doiDict = {}
    idPaperid = []
    nextPaperid = 0
    for rownum, row in enumerate(selectResult):
        if rownum % 5000 == 0:
            print('now is ' + str(rownum))
        doi = row['doi']
        try:
            # DOI already seen: reuse its canonical paperid
            idPaperid.append([row['id'], doiDict[doi][1]])
        except KeyError:
            # first time this DOI appears: assign the next paperid
            nextPaperid += 1
            doiDict[doi] = (row['id'], nextPaperid)
            idPaperid.append([row['id'], nextPaperid])

    #print(doiDict)
    #print(idPaperid)
    constructSeriz(doiDict_pickle, doiDict)
    constructSeriz(idPaperid_pickle, idPaperid)
Example #15
0
def prepareDistanceMetrx():
    """Fill one PART-th slice of the pairwise coauthor-relatedness matrix.

    With limit = lenEID/10, this pass covers rows
    [limit*(PART-1), limit*PART), resuming from the matrix serialized by a
    previous pass and periodically checkpointing its own result.
    """
    # Compute the relatedness between collaborators.
    #import os;os.chdir('e:/Code/Python');import geng_coi;geng_coi.prepareDistanceMetrx()
    PART = 5
    totalEID = readSeriz(totalEID_pickle)
    totalEID = list(set(totalEID))
    lenEID = len(totalEID)
    #distanceMetrx = [[0 for i in range(lenEID)] for j in range(lenEID)]
    limit = int(lenEID / 10)
    print('len is ' + str(lenEID))
    # resume from the checkpoint of the initial matrix
    distanceMetrx_pickle = dictPath + 'distanceMetrx0.pickle'
    distanceMetrx = readSeriz(distanceMetrx_pickle)
    #distanceMetrx_pickle_new = dictPath + 'distanceMetrx'+str(1)+'.pickle'
    distanceMetrx_pickle_new = dictPath + 'distanceMetrx' + str(
        PART) + '.pickle'
    #print('here0')
    #for i in range(lenEID-1):
    for i in range((limit * (PART - 1)), lenEID - 1):
        #print('i is '+str(i))
        py1 = getResultList(
            'select paperid from tmp_paper where eid = ' + str(totalEID[i]),
            'paperid', cur)
        set1 = set(py1)
        #print(set1)
        # past the end of this PART's slice: stop
        if i > (limit * PART):
            #print('???')
            break
        distanceMetrx[i][i] = 0
        #print('here1')
        for j in range(i + 1, lenEID):
            py2 = getResultList(
                'select paperid from tmp_paper where eid = ' +
                str(totalEID[j]), 'paperid', cur)
            set2 = set(py2)
            sameset = list(set1 & set2)
            value = 0
            if len(sameset) == 0:
                #print('here2')
                distanceMetrx[i][j] = 0  # assigned M later, after normalization
                distanceMetrx[j][i] = 0
            else:
                #print('here3')
                value = measureCoauthor(py1, py2, sameset)
                distanceMetrx[i][j] = value
                distanceMetrx[j][i] = value
            # periodic checkpoint so a crash loses little work
            if j % 100 == 0:
                constructSeriz(distanceMetrx_pickle_new, distanceMetrx)
            print('completed: ' + str(i) + ', ' + str(j) + ' value: ' +
                  str(value))
            #break
        #break
        constructSeriz(distanceMetrx_pickle_new, distanceMetrx)
    constructSeriz(distanceMetrx_pickle_new, distanceMetrx)
def analysisNull():
    # Bucket every paper that already has a paperid into (year, title-hash)
    # cells and serialize each cell (empty cells get an empty list).
    '''
    nullDOI = readSeriz(nullDOI_pickle)
    print('num is '+str(len(nullDOI)))
    
    nullDOI = readSeriz(nullDOI_pickle)
    for nid in nullDOI:
        updateSQL = 'update paper set paperid=null where id='+str(nid)
        cur.execute(updateSQL)
        conn.commit()
        print('completed: '+str(nid))
        #break
    '''
    # up to 100 distinct years x 27 title buckets (one per assignNum slot)
    yearTitle = [[[] for j in range(27)] for i in range(100)]
    yearList = []
    #
    selectResult = getResult(
        'select id,time,title from paper where paperid is not null', cur)
    print('read completed, the total is ' + str(len(selectResult)))
    num = 0
    for sr in selectResult:
        # skip rows without a usable title or a positive year
        if not (len(sr['title']) > 0 and sr['time'] > 0):
            continue
        # map the year to a stable index, registering it on first sight
        if sr['time'] in yearList:
            yidx = yearList.index(sr['time'])
        else:
            yearList.append(sr['time'])
            yidx = yearList.index(sr['time'])
        nidx = assignNum(sr['title'])

        yearTitle[yidx][nidx].append((sr['id'], sr['time'], sr['title']))
        print('now is ' + str(num) + ' id is ' + str(sr['id']))
        num += 1
    print('begin saving')
    constructSeriz(yearList_pickle, yearList)
    for i in range(len(yearList)):
        for j in range(27):
            path = idyeartitle_path + str(i) + '_' + str(j) + '.pickle'
            if len(yearTitle[i][j]) > 0:
                constructSeriz(path, yearTitle[i][j])
            else:
                # unlike prepareNULL, empty buckets are serialized as well
                constructSeriz(path, [])
def preparePid():
    """Split pid groups into multi-id groups (pidList) and singletons
    (nidList), serializing the raw query result plus both lists.
    """
    #
    #
    groups = getResult(
        'select tem.pid,group_concat(tem.id) ids from (select pid, id from paper) tem group by tem.pid',
        cur)
    constructSeriz(selectResult_pickle, groups)
    pidList = []
    nidList = []
    for row in groups:
        ids = row['ids']
        if ',' in ids:
            # more than one paper id shares this pid
            pidList.append(ids.split(','))
        else:
            # lone id: keep the raw string
            nidList.append(ids)
    constructSeriz(pidList_pickle, pidList)
    constructSeriz(nidList_pickle, nidList)
    print('completed')
Example #18
0
def measure_topicSIM(topicsNum=30):
    """Build an LDA topic model over paper and reviewer titles and compute
    paper-vs-reviewer topic similarities.

    NOTE(review): this function is unfinished — it references `paperList`,
    `reviewerList` and `paperDictT` that are never defined here, and it
    `return`s before the final serialization calls (dead code).
    """
    #
    paperDictTopic = readSeriz(paperDictTopic_pickle)
    reviewerDictTopic = readSeriz(reviewerDictTopic_pickle)
    wordOfBagSet = []

    i = 0
    paperID = {}
    # paperID maps the bag-of-words row index to its origin:
    # (True, title) for a submitted paper, (False, title, reviewer) for a
    # reviewer's own paper.
    for title in paperDictTopic:
        wordOfBagSet.append(paperDictTopic[title])
        paperID[i] = (True, title)
        i += 1

    for reviewer in reviewerDictTopic:
        for title in reviewerDictTopic[reviewer]:
            wordOfBagSet.append(title)
            paperID[i] = (False, title, reviewer)
            i += 1

    texts = wordOfBagSet[:]

    dictionary = corpora.Dictionary(texts)
    #dictionary.save('F:/newsAnalysis/data/newswordsall.dict')
    corpus = [dictionary.doc2bow(text) for text in texts]
    #corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)

    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    ldaModel = models.LdaModel(corpus_tfidf,
                               id2word=dictionary,
                               num_topics=topicsNum)

    ldaModel.show_topics()
    corpus_lda = ldaModel[corpus]  # per-document topic distribution
    index = similarities.MatrixSimilarity(ldaModel[corpus_lda])
    #corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)
    #corpus_lda = [[(d[0],round(float(d[1]),5)) for d in doc] for doc in corpus_lda0]
    #for doc in corpus_lda:
    #	print(doc)
    '''
	#参考部分
	doc = "Human computer interaction"
	vec_bow = dictionary.doc2bow(doc.lower().split())
	vec_lsi = ldaModel[vec_bow] # convert the query to LSI space
	index = similarities.MatrixSimilarity(ldaModel[corpus_lda])
	sims = index[vec_lsi]
	print(sims)
	'''
    '''
	paperList = list(paperDictT.keys())
	reviewerList = list(reviewerDictT.keys())

	TOPIC = [[0 for j in range(len(reviewerList)) ] for i in range(len(paperList))]
	paperDictT = {}
	#reviewerDictT = {}

	for i in range(len(corpus_lda)):
		if paperID[i][0] == True:
			#投稿文章    {title:[1,2,3,4,5],}
			#paperDictT[paperID[i][1]] = corpus_lda[i]
			paperDictT[paperID[i][1]] = 
		else:
			#pass
			if paperID[i][2] in reviewerDictT.keys():
				reviewerDictT[paperID[i][2]].append(corpus_lda[i])
			else:
				reviewerDictT[paperID[i][2]] = []
				reviewerDictT[paperID[i][2]].append(corpus_lda[i])
	'''
    corpusLen = 0
    # NOTE(review): reviewerList / paperList / paperDictT are undefined at
    # this point — the code below raises NameError as written.
    TOPIC = [[0 for j in range(len(reviewerList))]
             for i in range(len(paperList))]
    #metrx = [[0 for i in range(corpusLen)] for ]
    for i in range(len(paperList)):
        paperList[i]
        corpus_lda_vector = paperDictT[paperList[i]]
        sims = index[corpus_lda_vector]

        for j in range(len(reviewerList)):
            TOPIC[i][j] = sims[j]
    return

    # NOTE(review): dead code — unreachable after the bare return above.
    #print(paperDictT_pickle)
    constructSeriz(paperDictT_pickle, paperDictT)
    constructSeriz(reviewerDictT_pickle, reviewerDictT)
    constructSeriz(corpus_lda_pickle, corpus_lda)
def basePrepare():
	"""Initialize the expert network: insert every (xid, yid) expert pair
	into the expertNet table and serialize a zero matrix as the template
	used by the other *net builders.
	"""
	#expertNet xid,yid,status,attr
	# build the scholar id network
	idList = readSeriz(idList_pickle)
	idLength = len(idList)
	# zero template matrix, used later for the final summation
	expertNet = [[0 for j in range(idLength)] for i in range(idLength)]
	for i in range(idLength):
		# BUG FIX: the inner loop was the incomplete statement
		# `for j in range()` (SyntaxError) and the insert never ran;
		# iterate the upper triangle like the other builders do.
		for j in range((i + 1), idLength):
			insertSQL = 'insert into expertNet (xid,yid) values (' + str(idList[i]) + ',' + str(idList[j]) + ')'
			cur.execute(insertSQL)
	conn.commit()
	# serialize the template, then release the (potentially large) matrix
	constructSeriz(expertNet_pickle, expertNet)
	del expertNet
	gc.collect()
	print('all complete')

def buildInstitution():
	# colleaguenet xid,yid,score
	# NOTE(review): the entire body below is one triple-quoted string, so
	# this function is deliberately disabled and does nothing when called.
	'''
	institutionNet = readSeriz(expertNet_pickle)
	institutionGraph = nx.Graph()
	#预读
	totalSet = []
	for i in range(len(idList)):
		#iid还没有设置,应该是status之类的项
		xResult = getResult('select iid from experience1 where eid='+str(idList[i]),cur)
		#需要有个pid list 建立一个字典 key是id,time是内容
		xDict = {}
		for x in xResult:
			xDict[x['id']] = x['time']
		xID = xDict.keys()
		totalSet.append((xDict,xID))
	#这里与paper/coauthor不一样,不用进行一次控制,可以直接放入,类似coauthornet那种
	for i in range(len(idList)):
		for j in range((i+1),len(idList)):
			institutionList = list(set(totalSet[i][1]).intersection(totalSet[j][1]))
			if len(institutionList)>0:
				#全部插入数据库中
				score = calculateColleague(institutionList)
				#xid yid 已经转成str
				insertSQL = 'insert into colleaguenet (xid,yid,score) values('+xid+','+yid+','+str(score)+')'
				cur.execute(insertSQL)
				conn.commit()
				institutionNet[i][j] = score #这里也做一个记录
				institutionGraph(*(i,j),coinstitution=institutionList, weight=score)
	constructSeriz(institutionNet_pickle,institutionNet)
	constructSeriz(institutionGraph_pickle,institutionGraph)
	'''
def calculateColleague(institutionList):
	"""Score a colleague relation from the list of shared institutions.

	Placeholder — not implemented yet, so callers currently receive None.
	"""
	# the concrete computation method goes here
	pass

def buildFinalGraph():
	"""Sum the coauthor and institution networks into one final graph.

	finalNet[i][j] = coauthorNet[i][j] + institutionNet[i][j]; every
	non-zero cell becomes a weighted edge of finalGraph, which is serialized.
	"""
	coauthorNet = readSeriz(coauthorNet_pickle)
	institutionNet = readSeriz(institutionNet_pickle)
	# BUG FIX: len(coauthorNet) was taken before coauthorNet was loaded
	# (NameError). Compute it after loading.
	finalLen = len(coauthorNet)
	finalNet = readSeriz(expertNet_pickle)  # initialize from the template
	finalGraph = nx.Graph()

	for i in range(finalLen):
		# j starts at i+1, so the old `if i==j: continue` guard was dead code
		for j in range((i + 1), finalLen):
			finalNet[i][j] = coauthorNet[i][j] + institutionNet[i][j]
			finalNet[j][i] = finalNet[i][j]
			if finalNet[i][j] != 0:
				finalGraph.add_edge(i, j, score=finalNet[i][j])

	constructSeriz(finalGraph_pickle, finalGraph)

def mainFunction():
	"""Pipeline driver: seed the expert net, then build the paper,
	coauthor (post-2012) and institution networks."""
	#
	basePrepare()

	buidPaperNet()
	buildCoauthorByYear(2012) # set the minimum-year cutoff for collaborations

	buildInstitution()

def analysisGraph(finalGraph):
	"""Report whether finalGraph is connected; if not, print the number of
	connected components."""
	if nx.is_connected(finalGraph):
		print('Yes')
	else:
		# BUG FIX: number_connected_components lacked the nx. prefix
		# (NameError); also surface the count instead of discarding it.
		print(nx.number_connected_components(finalGraph))
		# NOTE: nx.connected_component_subgraphs (called and discarded here
		# originally) returns an iterator of Graph subgraphs and was removed
		# in networkx >= 2.4; use
		# (finalGraph.subgraph(c) for c in nx.connected_components(finalGraph)).

def buildTopic():
	"""Build the topic-similarity network between experts.

	For each expert pair, intersect their topic-id sets; a non-empty
	intersection is scored, written to the `topicnet` table, and recorded
	in the in-memory matrices/graph, all serialized at the end.
	"""
	# topicnet xid,yid,score
	topicNet = readSeriz(expertNet_pickle)
	topicNetMore = readSeriz(expertNet_pickle)
	topicGraph = nx.Graph()
	# pre-read: one (topic -> num dict, topic-id set) pair per expert
	totalSet = []
	for i in range(len(idList)):
		# 'other' acts as the topic id (tid).
		# BUG FIX: x['num'] is read below, so the query must select it too —
		# the original selected only `other` (KeyError).
		xResult = getResult('select other,num from topic where eid=' + str(idList[i]), cur)
		xDict = {}
		for x in xResult:
			xDict[x['other']] = x['num']
		totalSet.append((xDict, xDict.keys()))

	for i in range(len(idList)):
		# BUG FIX: xid/yid were undefined in the SQL below; derive them from
		# idList the way the other *net builders do.
		xid = str(idList[i])
		for j in range((i + 1), len(idList)):
			yid = str(idList[j])
			topicList = list(set(totalSet[i][1]).intersection(totalSet[j][1]))
			if len(topicList) > 0:
				score = calculateTopic(topicList)
				# xid/yid already converted to str
				insertSQL = 'insert into topicnet (xid,yid,score) values(' + xid + ',' + yid + ',' + str(score) + ')'
				cur.execute(insertSQL)
				conn.commit()
				topicNet[i][j] = score  # keep an in-memory record too
				# BUG FIX: `topicGraph(...)` called the Graph object itself;
				# add_edge is what registers the edge.
				topicGraph.add_edge(i, j, similar=score)
				topicNetMore[i][j] = topicList

	constructSeriz(topicNet_pickle, topicNet)
	constructSeriz(topicGraph_pickle, topicGraph)
	constructSeriz(topicNetMore_pickle, topicNetMore)
	
def calculateTopic(topicList=None):
	"""Score topic similarity from the shared topic-id list.

	Placeholder: returns None until a real formula is chosen.

	BUG FIX: buildTopic calls calculateTopic(topicList), but the original
	signature took no parameters, so every call raised TypeError. The new
	parameter has a default, so zero-argument callers keep working.
	"""
	pass
if __name__ == '__main__':
	# script entry point: seed the expertNet table and template matrix
	basePrepare()
Example #20
0
def selectEID():
    # authors X reviewers: expand the eid pool with each eid's collaborators
    totalEID = readSeriz(totalEID_new_pickle)
    '''
	Dict = readSeriz(reviewer_ID_pickle)
	#Dict = readSeriz(paperAuthorDict_pickle) #这里包有[]
	totalEID = list(set(totalEID))
	print('begin totleEID len is '+str(len(totalEID)))
	for name in Dict.keys():
		eid = Dict[name]
		if eid ==-1:
			continue
		selectEIDList1 = []
		selectEIDList2 = [eid] #下一次要查询的列表
		selectEIDCompleted = [] #已经完成查询的列表
		for i in range(1):
			if len(selectEIDList2)==0:
				break
			selectEIDList1 = selectEIDList2[:]
			#print(selectEIDList1)
			selectEIDList2 = []
			for j in selectEIDList1:
				print('--query: '+str(j))
				selectSQL = 'select distinct eid from tmp_paper where paperid in (select distinct paperid from tmp_paper where eid='+str(j)+' ) limit 100'
				newEidList = getResultList(selectSQL,'eid',cur)
				newList = 0
				for kid in newEidList:
					if (not kid in selectEIDCompleted) and (not kid in totalEID) and (not kid in selectEIDList2):
						selectEIDList2.append(kid)
						newList += 1
						if newList>300:
							break
				selectEIDCompleted.append(j)
				print('++add: '+str(newList))
				totalEID += selectEIDList2
				totalEID = list(set(totalEID))
			totalEID = list(set(totalEID))	
			#print('selectEIDList2 len is '+str(len(selectEIDList2)))
			#print('now len is '+str(len(selectEIDCompleted)))

		print('===========completed: '+name)
		print('now totleEID len is '+str(len(totalEID)))
		totalEID = list(set(totalEID))
		constructSeriz(totalEID_new_pickle,totalEID)
	print('total totleEID len is '+str(len(totalEID)))
	totalEID = list(set(totalEID))
	constructSeriz(totalEID_new_pickle,totalEID)
	'''
    DictAuthor = readSeriz(paperAuthorDict_pickle)  # values here are wrapped in []
    DictID = readSeriz(personDict_pickle)  # values here are wrapped in []
    totalEID = list(set(totalEID))
    print('begin totleEID len is ' + str(len(totalEID)))
    # one breadth-first expansion round per paper's author set
    for title in DictAuthor.keys():
        names = DictAuthor[title]
        #print(eid)
        if len(names) == 0:
            continue
        selectEIDList1 = []
        selectEIDList2 = [DictID[na] for na in names]  # eids to query next round
        selectEIDCompleted = []  # eids already queried
        # range(1) limits the expansion to a single hop
        for i in range(1):
            if len(selectEIDList2) == 0:
                break
            selectEIDList1 = selectEIDList2[:]
            #print(selectEIDList1)
            selectEIDList2 = []
            for j in selectEIDList1:
                print('--query: ' + str(j))
                selectSQL = 'select distinct eid from tmp_paper where paperid in (select distinct paperid from tmp_paper where eid=' + str(
                    j) + ' ) limit 100'
                newEidList = getResultList(selectSQL, 'eid', cur)
                newList = 0
                # keep only eids not seen in any of the three tracking lists,
                # capped at 300 additions per queried eid
                for kid in newEidList:
                    if (not kid in selectEIDCompleted) and (
                            not kid in totalEID) and (not kid
                                                      in selectEIDList2):
                        selectEIDList2.append(kid)
                        newList += 1
                        if newList > 300:
                            break
                selectEIDCompleted.append(j)
                print('++add: ' + str(newList))
                totalEID += selectEIDList2
                totalEID = list(set(totalEID))
            totalEID = list(set(totalEID))
            #print('selectEIDList2 len is '+str(len(selectEIDList2)))
            #print('now len is '+str(len(selectEIDCompleted)))

        print('===========completed: ' + title)
        print('now totleEID len is ' + str(len(totalEID)))
        totalEID = list(set(totalEID))
        # checkpoint after every paper so progress survives interruption
        constructSeriz(totalEID_new_pickle, totalEID)
        #break
    print('total totleEID len is ' + str(len(totalEID)))
    totalEID = list(set(totalEID))
    constructSeriz(totalEID_new_pickle, totalEID)
Example #21
0
def indexEID():
    # Map each author/reviewer, via their institution, to an id in dlurl1.
    # Strategy per person: (1) match name AND institution against
    # experience1/dlurl1; (2) fall back to name only; unresolved reviewers
    # get -1. Serializes the eid pool, the name->eid maps and the
    # title -> resolved-author-names map.
    paperDict = readSeriz(paperDict_pickle)
    reviewerDict = readSeriz(reviewerDict_pickle)
    reviewer_ID = {}
    totalEID = []
    personDict = {}  # person name -> eid
    paperAuthorDict = {}
    # paper part ------------
    #if False:
    for title in paperDict.keys():
        #title = ''
        authorsInstitu = paperDict[title]
        #print(authorsInstitu)
        paperAuthorDict[title] = []
        for ai in authorsInstitu:
            name = ai[0]

            if name in personDict.keys():
                continue
            Institu = ai[1]
            # NOTE(review): SQL is built by string concatenation from names /
            # institutions — quote characters in the data will break it;
            # parameterized queries would be safer.
            eidList = getResultList(
                'select distinct eid from experience1 where eid in (select id from dlurl1 where name like "'
                + cleanName(name) + '") and institution like "' + Institu +
                '" limit 1', 'eid', cur)
            if len(eidList) > 0:
                personDict[name] = eidList[0]
                totalEID += eidList
                paperAuthorDict[title].append(name)
                print('comppleted: ' + name)
            else:
                # fall back: match on name only
                eidList = getResultList(
                    'select id from dlurl1 where name like "' +
                    cleanName(name) + '" limit 1', 'id', cur)
                if len(eidList) > 0:
                    personDict[name] = eidList[0]
                    totalEID += eidList
                    print('comppleted: ' + name)
                    paperAuthorDict[title].append(name)
                else:
                    print('error!!!!!!1  ' + name)
        print('-----------completed:' + title)
        #break
    totalEID = list(set(totalEID))
    constructSeriz(totalEID_pickle, totalEID)
    constructSeriz(personDict_pickle, personDict)
    constructSeriz(paperAuthorDict_pickle, paperAuthorDict)
    print('completed Paper')

    # reviewer part
    for name in reviewerDict.keys():
        #print(reviewerDict[name])
        # NOTE(review): [0] takes the first *character* if the stored value is
        # a plain string — verify reviewerDict values are sequences here.
        Institu = reviewerDict[name][0]
        eidList = getResultList(
            'select distinct eid from experience1 where eid in (select id from dlurl1 where name like "'
            + cleanName(name) + '") and institution like "' + Institu +
            '" limit 1', 'eid', cur)
        if len(eidList) > 0:
            reviewer_ID[name] = eidList[0]
            totalEID += eidList
            print('comppleted: ' + name)
        else:
            # fall back: match on name only
            eidList = getResultList(
                'select id from dlurl1 where name like "' + cleanName(name) +
                '" limit 1', 'id', cur)
            if len(eidList) > 0:
                reviewer_ID[name] = eidList[0]
                totalEID += eidList
                print('comppleted: ' + name)
            else:
                print('error!!!!!!1  ' + name)
                reviewer_ID[name] = -1  # sentinel: reviewer could not be resolved
        #break

    constructSeriz(totalEID_pickle, totalEID)
    constructSeriz(reviewer_ID_pickle, reviewer_ID)