def run(self):
    while True:
        # Get the work from the queue and expand the tuple
        httpProxies = commHttpProxies.copy()
        headers = commHeaders.copy()
        cookies = commCookies.copy()
        dl = self.dlQueue.get()
        http = self.ipQueue.get()
        ua = self.uaQueue.get()
        httpProxies['https'] = http
        # Disguise the request parameters
        if ChangeOrNot():  # triggered at random
            headers = editeHeader(ua, headers, dl['name'])  # change the User-Agent
            cookies = editeCookies(cookies)
        time.sleep(random.randint(5, 20))  # sleep for a random interval
        # Fetch the HTML and pull out the CSV export link
        html = str(getPage(dl['url'], httpProxies, headers, cookies))
        papercsv = getCsvUrl(html)
        dl['papercsv'] = papercsv
        # Return the proxy and user agent to their queues
        self.ipQueue.put(http)
        self.uaQueue.put(ua)
        if html == ' ':  # fetch failed, requeue the task
            self.dlQueue.put(dl)
        else:
            print('get: ' + dl['id'])
            self.htmlQueue.put(dl)
        self.dlQueue.task_done()
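# getCsvUrl is not shown in this section; a minimal sketch of what it could
# look like, assuming the author page exposes its publication export as an
# href ending in .csv (the pattern is an assumption, not the real markup):
import re

def getCsvUrl(html):
    # Return the first link that points at a .csv resource, or '' if none
    match = re.search(r'href="([^"]+\.csv[^"]*)"', html)
    return match.group(1) if match else ''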
def run(self):
    while True:
        # Get the work from the queue and expand the tuple
        httpProxies = commHttpProxies.copy()
        headers = commHeaders.copy()
        cookies = commCookies.copy()
        dl = self.dlQueue.get()
        http = self.ipQueue.get()
        ua = self.uaQueue.get()
        httpProxies['https'] = http
        # Disguise the request parameters
        if ChangeOrNot():  # triggered at random
            headers = editeHeader(ua, headers, dl['name'])  # change the User-Agent
            cookies = editeCookies(cookies)
        time.sleep(random.randint(5, 20))  # sleep for a random interval
        # Fetch the HTML
        html = str(getPage(dl['url'], httpProxies, headers, cookies))
        # Return the proxy and user agent to their queues
        self.ipQueue.put(http)
        self.uaQueue.put(ua)
        if html == ' ':  # fetch failed, requeue the task
            self.dlQueue.put(dl)
        else:
            # Hand the page off to the parsing stage
            self.htmlQueue.put((html, dl))
        self.dlQueue.task_done()
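# getPage is used by every worker but not defined in this section. A minimal
# sketch, assuming it wraps requests.get with the disguised proxies, headers,
# and cookies, and returns the sentinel ' ' on any failure (which is exactly
# what the html == ' ' checks above test for):
import requests

def getPage(url, httpProxies, headers, cookies):
    try:
        resp = requests.get(url, proxies=httpProxies, headers=headers,
                            cookies=cookies, timeout=30)
        resp.raise_for_status()
        return resp.text
    except requests.RequestException:
        return ' '  # sentinel for "fetch failed"; the caller requeues the task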
def run(self):
    while True:
        # Get the work from the queue and expand the tuple
        httpProxies = commHttpProxies.copy()
        headers = commHeaders.copy()
        cookies = commCookies.copy()
        ua = self.uaQueue.get()
        dl = self.htmlQueue.get()
        http = self.ipQueue.get()
        httpProxies['https'] = http
        # Disguise the request parameters
        if ChangeOrNot():  # triggered at random
            headers = editeHeader(ua, headers, dl['name'])  # change the User-Agent
            cookies = editeCookies(cookies)
        time.sleep(random.randint(6, 15))  # sleep for a random interval
        # ----- this is where this worker differs: download the CSV itself -----
        flag = findCSV(dl, httpProxies, headers, cookies)
        # Return the proxy and user agent to their queues
        self.ipQueue.put(http)
        self.uaQueue.put(ua)
        if not flag:  # fetch failed, requeue the task
            self.htmlQueue.put(dl)
        self.htmlQueue.task_done()
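# findCSV is also external to this section. A sketch under the assumption
# that it downloads the link stored in dl['papercsv'] by the first worker
# and reports success as a bool; the save path below is hypothetical.
import requests

def findCSV(dl, httpProxies, headers, cookies):
    try:
        resp = requests.get(dl['papercsv'], proxies=httpProxies,
                            headers=headers, cookies=cookies, timeout=30)
        resp.raise_for_status()
    except requests.RequestException:
        return False  # the caller requeues the task
    with open(str(dl['id']) + '.csv', 'wb') as f:  # hypothetical save location
        f.write(resp.content)
    return True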
def run(self):
    while True:
        # Get the work from the queue and expand the tuple
        httpProxies = commHttpProxies.copy()
        headers = commHeaders.copy()
        cookies = commCookies.copy()
        ua = self.uaQueue.get()
        dl = self.dlQueue.get()
        http = self.ipQueue.get()
        httpProxies['https'] = http
        # Disguise the request parameters
        if ChangeOrNot():  # triggered at random
            headers = editeHeader(ua, headers, dl['name'])  # change the User-Agent
            cookies = editeCookies(cookies)
        time.sleep(random.randint(7, 15))  # sleep for a random interval
        # Fetch the CSV
        flag = findCSV(dl, httpProxies, headers, cookies)
        # Return the proxy and user agent to their queues
        self.ipQueue.put(http)
        self.uaQueue.put(ua)
        if not flag:  # fetch failed, requeue the task
            self.dlQueue.put(dl)
        self.dlQueue.task_done()
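# Every block gates its disguise rotation on ChangeOrNot(), which is not
# defined here. A minimal sketch: a biased coin flip (the 50% trigger
# probability is an assumption).
import random

def ChangeOrNot():
    return random.random() < 0.5  # assumed chance of rotating the disguise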
def findPage():
    httpProxies = commHttpProxies.copy()
    headers = commHeaders.copy()
    cookies = commCookies.copy()
    # Fetch the list of pending URLs from the database, then visit each in turn
    http, ua = getHttpUa()                # candidate proxy and user-agent lists for disguise
    conn, cur = getCursor()               # database connection and cursor
    dlList = getResult(sltDLNotCom, cur)  # 2-D array of url records
    for dl in dlList:
        if ChangeOrNot():  # triggered at random
            httpProxies = editeProxies(http, httpProxies)   # change the https proxy
            headers = editeHeader(ua, headers, dl['name'])  # change the User-Agent
            cookies = editeCookies(cookies)
        time.sleep(random.randint(2, 12))  # sleep for a random interval
        html = str(getPage(dl['url'], httpProxies, headers, cookies))  # fetch the url
        if html != ' ':
            infoSet = analysisPage(html, int(dl['id']))  # parse the page
            addInfo(conn, cur, infoSet, dl)              # store in the database
            cur.execute('update dlurl1 set status=1 where id=' + str(dl['id']))  # mark as extracted
            conn.commit()
            print('Now is ' + str(dl['id']))
    cur.close()
    conn.close()
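# The three "disguise" helpers are defined elsewhere. Sketches under the
# assumption that each draws a fresh value and patches it into the request
# parameters; the Referer format and CFID range are assumptions, chosen only
# to match the fields that findPage prints.
import random

def editeProxies(http, httpProxies):
    httpProxies['https'] = random.choice(http)  # rotate the https proxy
    return httpProxies

def editeHeader(ua, headers, name):
    # ua is a single agent in the workers and a candidate list in findPage
    headers['User-Agent'] = random.choice(ua) if isinstance(ua, list) else ua
    headers['Referer'] = 'https://dl.acm.org/results.cfm?query=' + name  # hypothetical format
    return headers

def editeCookies(cookies):
    cookies['CFID'] = str(random.randint(10**8, 10**9 - 1))  # fake session id
    return cookies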
def run(self):
    while True:
        # Get the work from the queue and expand the tuple
        httpProxies = commHttpProxies.copy()
        headers = commHeaders.copy()
        cookies = commCookies.copy()
        dl = self.dlQueue.get()
        http = self.ipQueue.get()
        ua = self.uaQueue.get()
        httpProxies['https'] = http
        # Disguise the request parameters
        if ChangeOrNot():  # triggered at random
            headers = editeHeader(ua, headers, dl['name'])  # change the User-Agent
            cookies = editeCookies(cookies)
        time.sleep(random.randint(3, 14))  # sleep for a random interval
        # Strip the stale CFID/CFTOKEN session parameters from the colleague URL
        dl['colleage'] = cleanURL(dl['colleage'])
        # Fetch the HTML
        html = str(getPage(dl['colleage'], httpProxies, headers, cookies))
        # Return the proxy and user agent to their queues
        self.ipQueue.put(http)
        self.uaQueue.put(ua)
        if html == ' ':  # fetch failed, requeue the task
            self.dlQueue.put(dl)
        else:
            print('get HTML:' + str(dl['id']))
            self.htmlQueue.put((html, dl))
        self.dlQueue.task_done()
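# cleanURL is external; the commented-out replacement it superseded shows the
# intent, so a faithful sketch is a regex that strips any CFID/CFTOKEN session
# parameters from the stored colleague URL:
import re

def cleanURL(url):
    return re.sub(r'&CFID=\d+&CFTOKEN=\d+', '', url)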
import csv
import random
import time

if __name__ == '__main__':
    http, uag = getHttpUa()
    dlList = []
    with open(expertList_path) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            dlList.append(row)
    for dl in dlList:
        dlQueue.put(dl)
    httpProxies = commHttpProxies.copy()
    headers = commHeaders.copy()
    cookies = commCookies.copy()
    while not dlQueue.empty():
        dl = dlQueue.get()
        proxy = random.choice(http)  # pick one proxy without clobbering the candidate list
        ua = random.choice(uag)
        httpProxies['https'] = proxy
        # Disguise the request parameters
        if ChangeOrNot():  # triggered at random
            headers = editeHeader(ua, headers, dl['name'])  # change the User-Agent
            cookies = editeCookies(cookies)
        time.sleep(random.randint(5, 20))  # sleep for a random interval
        # Fetch the HTML
        html = str(getPage(dl['url'], httpProxies, headers, cookies))
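# The run() methods above imply Thread subclasses sharing four queues. A
# sketch of how they would be wired up; the class name DownloadWorker and
# the pool size are hypothetical, not taken from this section.
import queue
import threading

dlQueue, htmlQueue = queue.Queue(), queue.Queue()
ipQueue, uaQueue = queue.Queue(), queue.Queue()

for proxy in http:          # seed the rotating proxy pool
    ipQueue.put(proxy)
for agent in uag:           # seed the rotating user-agent pool
    uaQueue.put(agent)

for _ in range(4):          # assumed pool size
    worker = DownloadWorker(dlQueue, htmlQueue, ipQueue, uaQueue)
    worker.daemon = True    # let the process exit once the queues drain
    worker.start()

dlQueue.join()              # block until every task is marked done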