def run(self):
        """Worker loop: fetch each queued page and forward its CSV link.

        Each iteration takes one task dict from ``self.dlQueue``, borrows a
        proxy from ``self.ipQueue`` and a user agent from ``self.uaQueue``,
        downloads ``dl['url']`` with ``getPage`` and then:

        * on failure (the page text is the single-space sentinel ``' '``)
          puts the task back on ``self.dlQueue`` for another attempt;
        * on success stores ``getCsvUrl(html)`` under ``dl['papercsv']`` and
          puts the task on ``self.htmlQueue``.

        The loop never returns. An exception raised while the proxy and user
        agent are borrowed still propagates, but both are returned to their
        pools first.
        """
        while True:
            # Work on copies of the module-level defaults for this task.
            httpProxies = commHttpProxies.copy()
            headers = commHeaders.copy()
            cookies = commCookies.copy()

            dl = self.dlQueue.get()
            http = self.ipQueue.get()
            ua = self.uaQueue.get()
            try:
                httpProxies['https'] = http
                # Randomly triggered request disguise. The explicit comparison
                # is kept because ChangeOrNot's return type is not visible here.
                if ChangeOrNot() == True:
                    headers = editeHeader(ua, headers, dl['name'])  # change user agent
                    cookies = editeCookies(cookies)
                time.sleep(random.randint(5, 20))  # random pause between requests

                html = str(getPage(dl['url'], httpProxies, headers, cookies))
            finally:
                # Always hand the borrowed proxy and user agent back, even when
                # the fetch raised; otherwise the pools drain over time.
                self.ipQueue.put(http)
                self.uaQueue.put(ua)

            if html == ' ':
                # Fetch failed: retry later. The task is NOT forwarded, so the
                # same dict never sits in both queues at once.
                self.dlQueue.put(dl)
            else:
                dl['papercsv'] = getCsvUrl(html)
                print('get: ' + str(dl['id']))
                self.htmlQueue.put(dl)
            self.dlQueue.task_done()
    def run(self):
        """Worker loop: download each queued page and forward its HTML.

        Each iteration takes one task dict from ``self.dlQueue``, borrows a
        proxy from ``self.ipQueue`` and a user agent from ``self.uaQueue``,
        and downloads ``dl['url']`` with ``getPage``. The ``(html, dl)`` pair
        is always put on ``self.htmlQueue``; when the page text is the
        single-space sentinel ``' '`` (fetch failed) the task is additionally
        put back on ``self.dlQueue`` for another attempt.

        The loop never returns. An exception raised while the proxy and user
        agent are borrowed still propagates, but both are returned to their
        pools first.
        """
        while True:
            # Work on copies of the module-level defaults for this task.
            httpProxies = commHttpProxies.copy()
            headers = commHeaders.copy()
            cookies = commCookies.copy()

            dl = self.dlQueue.get()
            http = self.ipQueue.get()
            ua = self.uaQueue.get()
            try:
                httpProxies['https'] = http
                # Randomly triggered request disguise. The explicit comparison
                # is kept because ChangeOrNot's return type is not visible here.
                if ChangeOrNot() == True:
                    headers = editeHeader(ua, headers, dl['name'])  # change user agent
                    cookies = editeCookies(cookies)
                time.sleep(random.randint(5, 20))  # random pause between requests

                html = str(getPage(dl['url'], httpProxies, headers, cookies))
            finally:
                # Always hand the borrowed proxy and user agent back, even when
                # the fetch raised; otherwise the pools drain over time.
                self.ipQueue.put(http)
                self.uaQueue.put(ua)

            if html == ' ':
                # Fetch failed: queue the task for another attempt.
                self.dlQueue.put(dl)
            # The pair is forwarded in both cases, so the consumer also
            # receives the ' ' sentinel for failed fetches.
            self.htmlQueue.put((html, dl))
            self.dlQueue.task_done()
    def run(self):
        """Worker loop: run ``findCSV`` for each task on ``self.htmlQueue``.

        Each iteration borrows a user agent from ``self.uaQueue``, takes one
        task dict from ``self.htmlQueue`` and borrows a proxy from
        ``self.ipQueue``, then calls ``findCSV`` with the task and the request
        settings. A falsy result puts the task back on ``self.htmlQueue`` for
        another attempt.

        The loop never returns. An exception raised while the proxy and user
        agent are borrowed still propagates, but both are returned to their
        pools first.
        """
        while True:
            # Work on copies of the module-level defaults for this task.
            httpProxies = commHttpProxies.copy()
            headers = commHeaders.copy()
            cookies = commCookies.copy()

            ua = self.uaQueue.get()
            try:
                dl = self.htmlQueue.get()
                http = self.ipQueue.get()
                try:
                    httpProxies['https'] = http
                    # Randomly triggered request disguise. The explicit
                    # comparison is kept because ChangeOrNot's return type is
                    # not visible here.
                    if ChangeOrNot() == True:
                        headers = editeHeader(ua, headers, dl['name'])  # change user agent
                        cookies = editeCookies(cookies)
                    time.sleep(random.randint(6, 15))  # random pause between requests

                    # This call is what differs from the page-download workers.
                    flag = findCSV(dl, httpProxies, headers, cookies)
                finally:
                    # Return the proxy even when the work above raised.
                    self.ipQueue.put(http)
            finally:
                # Return the user agent even when the work above raised.
                self.uaQueue.put(ua)

            if not flag:
                # Not retrieved successfully: queue the task again.
                self.htmlQueue.put(dl)
            self.htmlQueue.task_done()
Beispiel #4
0
    def run(self):
        """Worker loop: run ``findCSV`` for each task on ``self.dlQueue``.

        Each iteration borrows a user agent from ``self.uaQueue``, takes one
        task dict from ``self.dlQueue`` and borrows a proxy from
        ``self.ipQueue``, then calls ``findCSV`` with the task and the request
        settings. A falsy result puts the task back on ``self.dlQueue`` for
        another attempt.

        The loop never returns. An exception raised while the proxy and user
        agent are borrowed still propagates, but both are returned to their
        pools first.
        """
        while True:
            # Work on copies of the module-level defaults for this task.
            httpProxies = commHttpProxies.copy()
            headers = commHeaders.copy()
            cookies = commCookies.copy()

            ua = self.uaQueue.get()
            try:
                dl = self.dlQueue.get()
                http = self.ipQueue.get()
                try:
                    httpProxies['https'] = http
                    # Randomly triggered request disguise. The explicit
                    # comparison is kept because ChangeOrNot's return type is
                    # not visible here.
                    if ChangeOrNot() == True:
                        headers = editeHeader(ua, headers, dl['name'])  # change user agent
                        cookies = editeCookies(cookies)
                    time.sleep(random.randint(7, 15))  # random pause between requests

                    flag = findCSV(dl, httpProxies, headers, cookies)
                finally:
                    # Return the proxy even when the work above raised.
                    self.ipQueue.put(http)
            finally:
                # Return the user agent even when the work above raised.
                self.uaQueue.put(ua)

            if not flag:
                # Not retrieved successfully: queue the task again.
                self.dlQueue.put(dl)
            self.dlQueue.task_done()
Beispiel #5
0
def findPage():
    """Walk the pending URL rows and print the request disguise for each.

    Loads the rows selected by ``sltDLNotCom`` through ``getResult`` and, for
    every row, randomly (when ``ChangeOrNot()`` returns True) swaps the proxy,
    headers and cookies, sleeps 2-12 seconds, and prints the proxy, user
    agent, referer and ``CFID`` cookie currently in effect. The actual page
    download and database write are disabled (kept below as comments).

    The cursor and connection obtained from ``getCursor`` are closed even
    when the loop raises.
    """
    httpProxies = commHttpProxies.copy()
    headers = commHeaders.copy()
    cookies = commCookies.copy()
    # Candidate proxies and user agents used to disguise the requests
    # (per the original comment; getHttpUa is defined elsewhere).
    http, ua = getHttpUa()
    conn, cur = getCursor()  # database connection and cursor
    try:
        # Rows to visit; each row is indexed by 'name' (and 'id'/'url' in the
        # disabled code below).
        dlList = getResult(sltDLNotCom, cur)
        for dl in dlList:
            # The explicit comparison is kept because ChangeOrNot's return
            # type is not visible here.
            if ChangeOrNot() == True:  # randomly triggered
                httpProxies = editeProxies(http, httpProxies)  # change proxy
                headers = editeHeader(ua, headers, dl['name'])  # change user agent
                cookies = editeCookies(cookies)
            time.sleep(random.randint(2, 12))  # random pause

            print(str(httpProxies['https']))
            print(str(headers['User-Agent']))
            print(str(headers['Referer']))
            print(str(cookies['CFID']))
            print()

            # Disabled download/store step (was a bare string literal here).
            # NOTE(review): the UPDATE below is built by string concatenation;
            # use a parameterized query if this is ever re-enabled.
            #
            # html = str(getPage(dl['url'], httpProxies, headers, cookies))
            # if html != ' ':
            #     infoSet = analysisPage(html, int(dl['id']))  # parse the page
            #     addInfo(conn, cur, infoSet, dl)  # store in the database
            #     cur.execute('update dlurl1 set status=1 where id=' + str(dl['id']))  # mark as extracted
            #     conn.commit()
            #     print('Now is ' + str(dl['id']))
    finally:
        # Release database resources on every exit path.
        try:
            cur.close()
        finally:
            conn.close()
    def run(self):
        """Worker loop: download each task's ``'colleage'`` page.

        Each iteration takes one task dict from ``self.dlQueue``, borrows a
        proxy from ``self.ipQueue`` and a user agent from ``self.uaQueue``,
        replaces ``dl['colleage']`` with ``cleanURL(dl['colleage'])`` and
        downloads that URL with ``getPage``. The ``(html, dl)`` pair is always
        put on ``self.htmlQueue``; when the page text is the single-space
        sentinel ``' '`` (fetch failed) the task is additionally put back on
        ``self.dlQueue`` for another attempt.

        The loop never returns. An exception raised while the proxy and user
        agent are borrowed still propagates, but both are returned to their
        pools first.
        """
        while True:
            # Work on copies of the module-level defaults for this task.
            httpProxies = commHttpProxies.copy()
            headers = commHeaders.copy()
            cookies = commCookies.copy()

            dl = self.dlQueue.get()
            http = self.ipQueue.get()
            ua = self.uaQueue.get()
            try:
                httpProxies['https'] = http
                # Randomly triggered request disguise. The explicit comparison
                # is kept because ChangeOrNot's return type is not visible here.
                if ChangeOrNot() == True:
                    headers = editeHeader(ua, headers, dl['name'])  # change user agent
                    cookies = editeCookies(cookies)
                time.sleep(random.randint(3, 14))  # random pause between requests

                # Optional clean-up; the original comment says this removes
                # cookie parameters (CFID/CFTOKEN) from the URL.
                dl['colleage'] = cleanURL(dl['colleage'])

                html = str(getPage(dl['colleage'], httpProxies, headers, cookies))
            finally:
                # Always hand the borrowed proxy and user agent back, even when
                # the fetch raised; otherwise the pools drain over time.
                self.ipQueue.put(http)
                self.uaQueue.put(ua)

            if html == ' ':
                # Fetch failed: queue the task for another attempt.
                self.dlQueue.put(dl)
            print('get HTML:' + str(dl['id']))
            # The pair is forwarded in both cases, so the consumer also
            # receives the ' ' sentinel for failed fetches.
            self.htmlQueue.put((html, dl))
            self.dlQueue.task_done()
# No-op at module scope; kept from the original.
global cookies

if __name__ == '__main__':
    # Script entry: load the expert list from CSV into dlQueue, then process
    # the queue one task at a time with a randomly chosen proxy/user agent.

    # Keep the proxy pool under its own name. The original rebound `http` to
    # the chosen proxy string, so every later random.choice(http) picked a
    # single character of that string instead of a proxy.
    httpPool, uag = getHttpUa()
    dlList = []

    with open(expertList_path) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            dlList.append(row)
    for dl in dlList:
        dlQueue.put(dl)

    # Copied once: header/cookie edits carry over from one task to the next.
    httpProxies = commHttpProxies.copy()
    headers = commHeaders.copy()
    cookies = commCookies.copy()

    while not dlQueue.empty():

        dl = dlQueue.get()
        http = random.choice(httpPool)
        ua = random.choice(uag)
        httpProxies['https'] = http
        # Randomly triggered request disguise. The explicit comparison is
        # kept because ChangeOrNot's return type is not visible here.
        if ChangeOrNot() == True:
            headers = editeHeader(ua, headers, dl['name'])  # change user agent
            cookies = editeCookies(cookies)
        time.sleep(random.randint(5, 20))  # random pause between requests