def mainFunction():
    # Read the list of downloaded files
    filePathList = readFiles(files_path)
    print('read is ready')
    for fileP in filePathList:
        html = readTXT(fileP)
        eid = getID(html)
        if eid > 0:
            instit = extractInstitut(html)
            if len(instit) > 0:
                insertInstitution(eid, instit, fileP)
            else:
                try:
                    print('empty: ' + fileP)
                    updateSQL = 'update dlurl1 set tem=5 where id=' + str(eid)
                    cur.execute(updateSQL)  # mark this record as already extracted
                    conn.commit()
                except Exception:
                    print('error: ' + updateSQL)
        else:
            print('eid error: ' + fileP)
        # break  # uncomment to run a single iteration
    cur.close()
    conn.close()
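
# --- Hedged sketch (not from the original source): the mainFunction variants in
# this section assume a module-level MySQL connection `conn` and cursor `cur`, a
# source directory `files_path`, and helpers readFiles/readTXT. A minimal setup
# could look like this; host, credentials, database name, and paths are placeholders.
import os
import pymysql

conn = pymysql.connect(host='localhost', user='root', password='', db='acm', charset='utf8')
cur = conn.cursor()
files_path = 'e:/html/'  # hypothetical directory of downloaded pages


def readFiles(dir_path):
    # Collect the full path of every saved page under dir_path.
    return [os.path.join(dir_path, name) for name in os.listdir(dir_path)]


def readTXT(file_path):
    # Read one saved page back as text; the encoding is an assumption.
    with open(file_path, encoding='utf-8') as f:
        return f.read()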
def mainFunction():
    # Read the list of downloaded files
    filePathList = readFiles(files_path)
    print('read is ready')
    for fileP in filePathList:
        html = readTXT(fileP)
        eid = getID(html)
        if eid > 0:
            topicID = extractTopic(eid, html)
            if len(topicID) > 0:
                insertTopic(eid, topicID, fileP)
            else:
                try:
                    print('empty: ' + fileP)
                    updateSQL = 'update dlurl1 set status=2 where id=' + str(eid)
                    cur.execute(updateSQL)  # mark this record as already extracted
                    conn.commit()
                except Exception:
                    print('error: ' + updateSQL)
        else:
            print('eid error: ' + fileP)
        # break  # uncomment to run a single iteration
    cur.close()
    conn.close()
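
# --- Hedged sketch (assumed, not the original): insertTopic presumably writes one
# row per extracted topic; the table and column names below are guesses. Using
# parameterized queries avoids the string concatenation seen in the updates above.
def insertTopic(eid, topicID, fileP):
    for t in topicID:
        cur.execute('insert into topic (eid, topic, src) values (%s, %s, %s)',
                    (eid, t, fileP))
    conn.commit()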
def mainFunction():
    filePathList = readFiles()  # read the files from the list
    for i in range(len(filePathList)):  # index-based loop makes it easy to cap the count
        html = readTXT(filePathList[i])
        soup = BeautifulSoup(''.join(html), "lxml")
        dlInfo(html, soup)
    cur.close()
    conn.close()
def run(self):
    while True:
        # Take the next saved page off the queue
        filePath = self.fileQueue.get()
        html = str(readTXT(filePath))  # load the saved page
        print('get: ' + filePath)
        # '<全部同伴的名字>' ("names of all collaborators") marks the start of
        # appended data; keep only the page content before it
        if html.find('<全部同伴的名字>') > 0:
            ind = html.find('<全部同伴的名字>')
            html = html[:ind]
        nameLink = analysisPage(html)
        if len(nameLink) > 0:
            for nl in nameLink:
                addInfo(nl)
        self.fileQueue.task_done()
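
# --- Hedged sketch (assumption, not from the source): run() above reads like the
# body of a threading.Thread subclass fed from a queue of file paths. Wiring it up
# might look like this; the class name, worker count, and queue setup are hypothetical.
import threading
import queue


class CrawlWorker(threading.Thread):
    def __init__(self, fileQueue):
        super().__init__(daemon=True)
        self.fileQueue = fileQueue

    # run(self) as defined above would be this class's run() method


fileQueue = queue.Queue()
for path in readFiles(files_path):  # assumed helpers from the sketch earlier
    fileQueue.put(path)
for _ in range(4):  # the worker count is an arbitrary choice
    CrawlWorker(fileQueue).start()
fileQueue.join()  # returns once every queued file has been marked task_done()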
def mainFunction():
    # Read the list of downloaded files
    filePathList = readFiles(files_path)
    print('read is ready')
    for fileP in filePathList:
        html = readTXT(fileP)
        eid = getID(html)
        if eid > 0:
            instit = extractInstitut(html)
            if len(instit) > 0:
                insertInstitution(eid, instit, fileP)
        # break  # uncomment to run a single iteration
    cur.close()
    conn.close()
def mainFunction():
    # Read the list of downloaded files
    filePathList = readFiles(files_path)
    print('read is ready')
    for fileP in filePathList:
        html = readTXT(fileP)
        eid = getID(html)
        if eid > 0:
            topics = extractTopic(html)
            institution = extractInstitut(html)
            if len(topics) > 0:
                insertTopic(eid, topics, fileP)
            if len(institution) > 0:
                insertInstitution(eid, institution, fileP)
        # break  # uncomment to run a single iteration
    cur.close()
    conn.close()
def mainFunction():
    # Read the list of downloaded files
    filePathList = readFiles(files_path)
    print('read is ready len is ' + str(len(filePathList)))
    for fileP in filePathList:
        html = readTXT(fileP)
        eid = getID(html)  # this variant returns a list of ids
        if len(eid) > 0:
            for e in eid:
                topics = extractTopic(html)
                if len(topics) > 0:
                    insertTopic(e, topics, fileP)
                else:
                    updateSQL = 'update dlurl1 set status=2 where id=' + str(e)
                    cur.execute(updateSQL)
                    conn.commit()
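
# --- Hedged sketch (a guess, not the original): in this variant getID evidently
# returns a *list* of ids (it is len()-checked and iterated above), unlike the
# scalar version used elsewhere. The regex below is an assumed id pattern.
import re


def getID(html):
    # Collect every numeric id embedded in the saved page.
    return [int(m) for m in re.findall(r'id=(\d+)', html)]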
def mainFunction():
    filePathList = readFiles()  # read the files from the list
    for i in range(len(filePathList)):  # index-based loop makes it easy to cap the count
        html = readTXT(filePathList[i])
        dl = dlInfo(html)  # also checks for duplicates
        try:
            if len(dl) > 1:
                infoSet = analysisPage(html, dl['id'])
                addInfo(infoSet, dl)
            # if i > 5:
            #     break  # uncomment to cap the run for testing
        except Exception:
            print('analysis error:' + str(dl['id']))
    cur.close()
    conn.close()
# -*- coding: utf-8 -*-
from tool import readTXT
from bs4 import BeautifulSoup

if __name__ == '__main__':
    path = 'e:/test.xml'
    xml = readTXT(path)
    # '<全部同伴的名字>' ("names of all collaborators") marks the start of
    # appended data; keep only the content before it
    if xml.find('<全部同伴的名字>') > -1:
        ind = xml.find('<全部同伴的名字>')
        xml = xml[:ind]
    else:
        print('error')
    soup = BeautifulSoup(''.join(xml), "lxml")
    a = soup.find('a', {'name': "collab"})
    try:
        divAb = a.parent.parent
        tr = divAb.table.tr
        for td in tr.findAll('td'):
            for div in td.findAll('div'):
                if div.a.string is not None:
                    print(div.a.string)
                    url = 'http://dl.acm.org/' + div.a['href']
                    print(url)
    except Exception:
        print('error')
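
# --- Hedged illustration (assumed page shape, not from the source): the traversal
# above expects markup roughly like this around the "collab" anchor; the author id
# in the href is a placeholder.
SAMPLE = '''
<div><div><a name="collab"></a>
<table><tr>
<td><div><a href="author_page.cfm?id=81100000000">Jane Doe</a></div></td>
</tr></table>
</div></div>
'''
soup = BeautifulSoup(SAMPLE, 'lxml')
a = soup.find('a', {'name': 'collab'})
tr = a.parent.parent.table.tr
for td in tr.findAll('td'):
    for div in td.findAll('div'):
        print(div.a.string, 'http://dl.acm.org/' + div.a['href'])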
# (fragment: the enclosing function definition is not shown)
    authorId = urlid.replace('http://dl.acm.org/author_page.cfm?id=', '')
    userid = authorId[4:]  # only the digits from index 4 onward form the user id
    return urlid, userid


def mainFunction():
    filePathList = readFiles()  # read the files from the list
    for i in range(len(filePathList)):  # index-based loop makes it easy to cap the count
        html = readTXT(filePathList[i])
        soup = BeautifulSoup(''.join(html), "lxml")
        dlInfo(html, soup)
    cur.close()
    conn.close()


if __name__ == '__main__':
    mainFunction()  # the entry point previously duplicated this function's body verbatim
def getHttpUa(http_path, ua_path):
    http = readTXT(http_path)  # read the proxy list (plain text)
    ua = readList(ua_path)     # read the user-agent list (csv)
    return http, ua
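
# --- Hedged usage sketch (assumed, not from the source): the proxy text and
# user-agent list returned by getHttpUa() would typically feed a request like this.
import random
import requests

http, ua = getHttpUa('e:/http.txt', 'e:/ua.csv')  # hypothetical paths
resp = requests.get(
    'http://dl.acm.org/',
    headers={'User-Agent': random.choice(ua)},  # assumes readList yields a flat list of UA strings
    proxies={'http': random.choice(http.splitlines())},  # readTXT returns raw text; one proxy per line assumed
    timeout=10,
)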