def updateAuthorIDfromWholeData(authorID): """ 更新原库作者信息 :param authorID: :return: 作者ID,作者机构ID """ server = dbIO(originDBStr) sql = "select id,affillist from authorlist where aid = '%s'" % authorID wholerows = server.load(sql) if len(wholerows) > 0: singleID = wholerows[0][0] affilid = wholerows[0][1].split("|")[0] else: server2 = dbIO(newDBStr) sql = "select * from authortest where aid = '%s'" % authorID print(sql) alterdata = server2.load(sql)[0] affilid = alterdata[9] # print(affilid,alterdata) if len(affilid) > 9: affilid = selectAffilName(alterdata[9].split("|")) result = updateAffilIDfromWholeData(affilid) if result == -1: affilid = '' sql = "insert into authorlist (aid,url,fullname,simname,firstname,lastname,simlastname," \ "articlelist,affillist)values ('%s','%s','%s','%s','%s','%s','%s','%s','%s')"\ % (alterdata[1], alterdata[2], alterdata[3].replace("'", "''"), alterdata[4].replace("'", "''"), alterdata[5].replace("'", "''"), alterdata[6].replace("'", "''"), alterdata[7], '', affilid) # print(sql) server.save(sql) return authorID, affilid
def getAlterArticleData(): """ 为原库更新论文信息,插入信息到articlelist :return: """ server = dbIO(newDBStr) sql = "select * from articletest" articleData = server.load(sql) for item in articleData: authorlist = item[3].split("|") affillist = item[4].split("|") if len(authorlist) > 200 or checkTitleInWholeData( item[5]): # pass the reviewers paper 作者过多或出现过的论文无用 continue # print(item,len(authorlist),len(affillist)) newAuthorlist = [] newAffillist = [] for author in authorlist: if author == '': continue authorID, affilID = updateAuthorIDfromWholeData(author) newAuthorlist.append(authorID) newAffillist.append(affilID) server2 = dbIO(originDBStr) sql = """insert into articlelist (sid,doi,authorlist,affillist, title,abstract,keywords,date,journalName,articleType,abstractLang,citation) values ('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')""" \ % (item[1], item[2], "|".join(newAuthorlist), "|".join(newAffillist), item[5].replace("'", "''"), item[6].replace("'", "''"), item[7].replace("'", "''"), item[8], item[9].replace("'", "''"), item[10], item[11], item[12]) # print(sql) server2.save(sql)
def updateSql(author, afil, pid): """ 更新作者表与机构表内容 :param author: 作者信息字典 :param afil: 机构信息字典 :param pid: 文章ID :return: """ server = dbIO() sql = "select * from authorlist where aid='%s'" % (author["id"]) if server.count(sql) <= 0: if not afil: affiliation = "" else: affiliation = author["affiliation"] sql = "insert into authorlist (aid,url,fullname,simname,firstname,lastname,simlastname,articlelist,affillist)" \ " values ('%s','%s','%s','%s','%s','%s','%s','%s','%s')" \ % (author["id"], author["url"], author["fullname"], author["simname"], author["firstname"], author["lastname"], author["simlastname"], pid, affiliation) # print sql server.save(sql) if afil: sql = "select * from affillist where afid='%s'" % (afil["id"]) if server.count(sql) <= 0: sql = "insert into affillist (afid,name,city,country,url) values ('%s','%s','%s','%s','%s')" \ % (afil["id"], afil["name"], afil["city"], afil["country"], afil["url"]) server.save(sql)
def getTittle(): """ :return: 标题与flag情况,对于scopus ID为空的数据(Scopus信息未更新或是找不到) """ server = dbIO() sql = "select title,flag from searchlist2 where sid=''" return server.load(sql)
def getArticleInfo(): if True: # try: server = dbIO() sql = "select sid from searchlist2 where flag = 0 and" \ " id > (SELECT max(id) FROM paper_data.searchlist2 where flag = 1)" totalNum = server.count(sql) if totalNum > 0: finishNum = 0 passNum = 0 print('totalNum:' + str(totalNum)) keIndex = checkScopusKeyIndex() for row in server.load(sql): sid = row[0] passNum += 1 try: infodict, authorList, affilList = getInfofromAbstractAPI( int(sid), keIndex) if infodict == []: continue flag = saveWholeArticletoDB(infodict, authorList, affilList) if flag > -1: sql = "update searchlist2 set flag = 1 where sid = " + sid finishNum += 1 print("No." + str(finishNum) + " Finish: " + sid + " PassNum:" + str(passNum - finishNum)) server.save(sql) except Exception as e: # ConnectionResetError print("Wrong: " + sid + "*" * 20) print(e) continue
def singlePaperRun(paperID, keyIndex): """ 每篇论文的处理,对应于一个singleThread对象 :param paperID: 论文ID :param keyIndex: :return: 异常或没有信息返回,则返回0,更新论文信息并更新列表(searchlist)的flag =1,返回1 """ try: global onceSuccess start = time.clock() server = dbIO() infodict, authorList, affilList = getInfofromAbstractAPI(paperID, keyIndex) if infodict is []: return 0 flag = saveWholeArticletoDB(infodict, authorList, affilList) if flag == 1: sql = "select id from searchlist where sid = " + paperID sql = "update searchlist set flag = 1 where id = " + str(server.load(sql)[0][0]) server.save(sql) end = time.clock() onceSuccess = onceSuccess + 1 print("Thread%s:Finished_CostTime:%d" % (paperID, (end - start))) return 1 except ConnectionResetError as e: print("wrongThread%s:%s" % (paperID, str(e))) return 0
def udpateSearchlist(): """ 从新库中获取新的文章信息添加到原库中,更新检索列表 :return: """ server = dbIO(newDBStr) sql = """select title,url,year,publication_id,flag,sid from searchlist""" datarows = server.load(sql) server = dbIO(originDBStr) print(len(datarows)) for data in datarows: title, url, year, publication_id, flag, sid = data if not sid: sid = "" sql = """insert into searchlist2 (title,url,year,publication_id,flag,sid) values ('%s','%s','%s','%d','%d','%s')""" \ % (title.replace("'", "''"), url.replace("'", "''"), year.replace("'", "''"), publication_id, flag, sid.replace("'", "''")) server.save(sql)
def loadTaskList(key): """ :param key: 关键词 :return: 查询未被处理的任务对应列表 """ dbIOserver = dbIO() sql = "select id, totalNum, query from tasklist where keyword = '" + key + "' and flag != 1" data = dbIOserver.load(sql) taskList = [[int(item[0]), item[2]] for item in data] return taskList
def getMaxFrequentAffilName(affillist): """ for mutil-affil get the most frequent one (模糊获取高频机构) :param affillist: 机构列表 :return: 打印机构与对应文章数 """ server = dbIO(originDBStr) for affil in affillist: sql = "select count(*) from articlelist where affillist like '%" + affil + "%'" num = server.load(sql)[0][0] print(affil, num)
def selectAffilName(affillist): """ 选取机构名称,优先选取第一机构 :param affillist: :return: 机构的ID """ server = dbIO(newDBStr) for affilId in affillist: sql = "select * from affiltest where afid = '%s'" % affilId if server.count(sql) > 0: return affilId
def getAuthorlist(pid): """ :param pid: 文章ID :return: 获取同一文章的全部作者列表 """ server = dbIO() authorlist = [] sql = "select aid from authorlist where articlelist='%s'" % pid datarows = server.load(sql) for row in datarows: authorlist.append(row[0]) return authorlist
def checkTitleInWholeData(title): """ 检查论文是否在原库出现过 :param title: 文章标题 :return: True表示文章出现过,否则未出现 """ server = dbIO(originDBStr) sql = "select * from articlelist where title = '%s'" % (title.replace( "'", "''")) if server.count(sql) > 0: return True else: return False
def authoraffil(authorlist): """ :param authorlist: 作者ID列表 :return: 作者对应机构ID列表 """ server = dbIO() affillist = [] for aid in authorlist: sql = "select affillist from authorlist where aid='%s'" % aid datarows = server.load(sql) for row in datarows: affillist.append(row[0]) return affillist
def getArticleInfo(): """ 对于searchlist中的flag,如果flag为1,那么有scopus id,并且完成操作 如果flag为2可能,在插入有scops id数据时出现错误 如果flag为3,则进入补充数据Supply data阶段,表示数据补充完成,但其sid为semantic id :return: """ if True: # 获取没有scopus id的数据部分 server = dbIO() sql = "select title,flag,sid from searchlist2 where flag=0 and sid is NULL" totalNum = server.count(sql) if totalNum > 0: finishNum = 0 passNum = 0 print('totalNum:'+str(totalNum)) keIndex = checkScopusKeyIndex() for row in server.load(sql): title, flag, sid = row passNum += 1 # try: 依据标题获取scopus id sid = dealInfo(titleGetRes(title), title) print("sid:", sid) # 有sid存在时 if sid: print(title, flag) # 获取文章信息,作者信息,机构信息 infodict, authorList, affilList = getInfofromAbstractAPI(int(sid), keIndex) if infodict is []: continue flag = saveWholeArticletoDB(infodict, authorList, affilList) if flag > -1: sql = "update searchlist2 set flag = 1 where sid = '%s'" % sid finishNum += 1 print("No."+str(finishNum)+" Finish: "+sid+" PassNum:"+str(passNum-finishNum)) server.save(sql) # 不存在sid,从semantic进行数据补充处理 else: print(title, flag) m = getJsonDict(title) # semantic也不存在与该标题一致的数据,依据作者信息搜索相似文章 if not m: continue else: for author in m["authors"]: if len(author["ids"]) == 0: authorGetRes(author["name"], [], m["id"]) else: authorGetRes(author["name"], author["ids"][0], m["id"]) findArticle(m)
def updateAffilIDfromWholeData(affilID): """ 原库中没有该机构而新库中存在,查询到机构的信息并插入到原库的机构表中 :param affilID: 机构的ID :return: 原库存在该机构,返回其ID,新库不存在该机构返回-1,否则返回0 """ server = dbIO(originDBStr) sql = "select id from affillist where afid = '%s'" % affilID wholerows = server.load(sql) if len(wholerows) > 0: singleID = wholerows[0][0] return singleID else: server2 = dbIO(newDBStr) sql = "select * from affiltest where afid = '%s'" % affilID data = server2.load(sql) if len(data) == 0: return -1 alterdata = data[0] sql = "insert into affillist (afid,name,city,country,url) values ('%s','%s','%s','%s','%s')" \ % (alterdata[1], alterdata[2].replace("'", "''"), alterdata[3].replace("'", "''"), alterdata[4].replace("'", "''"), alterdata[5].replace("'", "''")) # print(sql) server.save(sql) return 0
def saveTaskList(taskList, key): """ :param taskList: [任务数量,Scopus高级子命令],getTaskQueryList计算获得 :param key: 关键词(例如人工智能) :return: 保存到数据库的任务列表的数目 """ dbIOserver = dbIO() insertNum = 0 for task in taskList: sql = "select * from tasklist where query = '" + task[1] + "'" if dbIOserver.count(sql) == 0: sql = "INSERT INTO tasklist (keyword,query,totalNum,taskType) VALUES " \ "('" + key + "','" + task[1] + "','" + str(task[0]) + "','test')" if dbIOserver.save(sql) > 0: insertNum += 1 return insertNum
def getTasks(key): """ 入口函数 :param key:关键词 :return: 没有关键词对应的任务,则获取依关键词查找的全部子命令,并存成任务列表到数据库,有对应任务则查看,返回任务列表 """ dbIOserver = dbIO() sql = "SELECT * FROM tasklist where keyword = '" + key + "'" # print(sql) taskList = [] if dbIOserver.count(sql) <= 0: queryList = buildOriginTask(key) taskList = getTaskQueryList(queryList) saveTaskList(taskList, key) else: taskList = loadTaskList(key) return taskList
def main(mThread): """ 构建Scopus ID对应的线程池,其中线程的个数由searchNum决定,不断停止与启动其中线程资源,在mainThread对象中run调用开启多线程 :param mThread: 当前线程 :return: 已有scopus ID的未更新数据的条数 """ global onceSum, onceSuccess, control_flag onceSum = 0 onceSuccess = 0 # 计数并查询searchNum条更新的Scupus库中数据信息的论文信息 sql = r"""select id,sid from searchlist where flag = 0 and sid != '' order by rand() limit 0,%d;""" % searchNum threadList = [] server = dbIO() onceSum = server.count(sql) keyIndex = checkScopusKeyIndex() print("Available Key Index:", keyIndex, " Rent Keys:", len(Settings.apikeyList) - keyIndex) ISOTIMEFORMAT = '%X' print("Get %s Papers Into Cawler %s" % (onceSum, time.strftime(ISOTIMEFORMAT, time.localtime()))) if onceSum > 0: # control thread working control_flag = 1 maxID = '0' minID = '0' for row in server.load(sql): threadList.append(singleThread(row[1], keyIndex)) if minID == '0': minID = row[0] maxID = row[0] _thread.start_new_thread(checkThreadActive, (threadList, mThread)) for single in threadList: single.start() time.sleep(0.25) print("Threads %s-%s Running" % (minID, maxID)) return onceSum
def searchArticlesByQuery(query, taskID): """ :param query: Scopus高级检索命令语句 :param taskID: 任务序号 :return: 插入文章信息到searchlist数据库中 """ dbIOserver = dbIO() totalNum = getTotalNumFromSearchAPI(query) print("searchResults:" + str(totalNum)) sql = "SELECT * FROM searchlist where taskID = " + str(taskID) dataLength = dbIOserver.count(sql) print(dataLength) repeatNum = 0 lastNum = 0 maxRepeatNum = 5 while (totalNum > 0 and dataLength < totalNum and dataLength < 5000) and repeatNum <= maxRepeatNum: if lastNum != totalNum + dataLength: repeatNum = 0 lastNum = totalNum + dataLength else: repeatNum += 1 articlesList = getArticlefromAPI(query, totalNum, dataLength) insertNum = 0 for article in articlesList: sql = "select * from searchlist where sid = " + article['id'] if dbIOserver.count(sql) == 0: sql = "INSERT INTO searchlist (sid,doi,flag,taskID) VALUES ('" + str(article['id'])\ + "','" + article['doi'] + "',0,'"+str(taskID)+"')" if dbIOserver.save(sql) > 0: insertNum += 1 dataLength += len(articlesList) print("dataLength:" + str(dataLength)) if repeatNum > 3: sql = "update tasklist set flag = 0 where id = " + str(taskID) dbIOserver.save(sql) return sql = "update tasklist set flag = 1 where id = " + str(taskID) dbIOserver.save(sql)
def addArticle(dict): """ :param dict: 文章信息字典(包含作者ID列表与机构ID列表) :return: 文章数据库更新,从semantic补充,故不含scopus ID """ authors = "" authorlist = dict["authorlist"] for author in authorlist: authors += author + "|" # print (authors[:-1]) keywords = "" keywordlist = dict["keywords"] for keyword in keywords: keywords += keyword + "|" affils = "" affillist = dict["affillist"] for affil in affillist: affils += affil + "|" # print (affils[:-1]) # add in datebase server = dbIO() sql = "select * from articlelist where sid='%s'" % (dict["sid"]) if server.count(sql) <= 0: sql = "insert into articlelist(sid,doi,authorlist,affillist,title,abstract,keywords,date," \ "journalName,articletype,abstractLang,citation)" \ "values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%d')" \ % (dict["sid"], dict["doi"], authors[:-1], affils[:-1], dict["title"].replace("'", "''"), dict["abstract"].replace("'", "''").replace("\n\n\n", " |").replace("\n", "| "), keywords[:-1].replace("'", "''"), dict["date"], dict["journalName"], dict["articleType"], dict["abstractLang"], dict["citation"]) server.save(sql) sql = "select flag from searchlist2 where title='%s'" % ( dict["title"].replace("'", "''")) # print (dict["title"],server.load(sql)) server.load(sql) flag = server.load(sql)[0][0] if flag == 0: sql = "update searchlist2 set flag=3 where title='%s'" % ( dict["title"].replace("'", "''")) server.save(sql)
def dealInfo(dict, title): """ 更新检索列表searchlist2,flag=2为模糊匹配的标题,flag=1 为完全匹配的标题 :param dict: sid与rel字典 :param title: 标题 :return: 有sid则返回sid,没有则返回None """ if "sid" in dict: server = dbIO() # sql = "select * from searchlist2 where sid='%s'" % (dict["sid"]) if True: # server.count(sql) <= 0: if dict["rel"] == 0: sql = "update searchlist2 set flag=1,sid='%s'where title='%s'" % ( dict["sid"], title.replace("'", "''")) server.save(sql) elif dict["rel"] <= 0.1: sql = "update searchlist2 set flag=2,sid='%s'where title='%s'" % ( dict["sid"], title.replace("'", "''")) server.save(sql) print("contains sid dict:", dict) return dict["sid"] else: return None
def saveWholeArticletoDB(infoDict, authorList, affilList): """ 保存爬取到的内容到数据库中 :param infoDict: 文章信息字典 :param authorList: 对应的作者信息列表(可能多作者) :param affilList: 对应的机构信息列表(可能多机构) :return: """ server = dbIO() sql = "select * from articlelist where sid = '%s'" % (infoDict['id']) if server.count(sql) != 0: return 0 # save the affilication data affils = "" for affil in affilList: affils += affil['id'] + "|" sql = "select * from affillist where afid = " + affil['id'] if server.count(sql) == 0: sql = "insert into affillist (afid,name,country,city,url) VALUES ('" + affil[ 'id'] + "','" + affil['name'] + "','" + affil[ 'country'] + "','" + affil['city'] + "','" + affil[ 'url'] + "')" sta = server.save(sql) """ else: sql = "udpate affillist set name = '" + affil['name'] + "',city = '" + affil['city'] + "',country = '" + affil['country'] + "', url = '" + affil['url'] + "' where afid = " + affil['id'] sta = cur.execute(sql)""" affils = affils[:-1] # save the author data authors = "" for author in authorList: authors += author['id'] + "|" sql = "select * from authorlist where aid = " + author['id'] if server.count(sql) == 0: affiliation = author['affiliation'] affilids = "" for affil in affiliation: affilids += affil + "|" articles = infoDict['id'] sql = "insert into authorlist (aid,url,fullname,simname,firstname,lastname,simlastname,affillist,articlelist) VALUES ('" +\ author['id'] + "','" + author['url'] + "','" + author['fullname'] + "','" + author['simname'] + "','" +\ author['firstname'] + "','" + author['lastname'] + "','" + author['simlastname'] +\ "','" + affilids[:-1] + "','" + articles + "')" sta = server.save(sql) else: articles = server.load(sql)[0][8] articleStrs = articles.split("|") if infoDict['id'] not in articleStrs: articles += "|" + infoDict['id'] sql = "select id from authorlist where aid = " + author['id'] if server.count(sql) == 0: sql = "update authorlist set articlelist = '" + articles + "' where id = " + server.load( sql)[0][0] sta = server.save(sql) authors = authors[:-1] # save the article data keywords = "" for key in infoDict['authorKeywords']: keywords += key.replace("'", "''") + "|" keywords = keywords[:-1] #print(infoDict) sql = "insert into articlelist (sid,doi,title,abstract,journalName,articleType,date,citation,abstractLang,keywords,authorlist,affillist) VALUES ('" + \ infoDict['id'] + "','" + infoDict['doi'] + "','" + infoDict['title'] + "','" + infoDict['abstract'] + "','" + \ infoDict['journal'] + "','" + infoDict['articleType'] + "','" + infoDict['date'] + \ "','" + infoDict['citation'] + "','" + infoDict['abstractLang'] + "','" + keywords + \ "','" + authors + "','" + affils + "')" sta = server.save(sql) return 1