Code Example #1
def findPage():
    http, ua = getHttpUa()
    conn, cur = getCursor()
    dlList = getResult(sltCollNotNull, cur)
    for dl in dlList:
        # Test only: read the page from a local txt file instead of the network
        #html = readTXT('E:/Code/Test Data/Paul Robert Barford - ACM author profile page - colleagues.txt')
        #html = readTXT('E:/Code/Test Data/Yu Zheng - ACM author profile page.txt')
        #html = readTXT('E:/Code/Test Data/A. Smolic - ACM author profile page.txt')
        if ChangeOrNot():  # triggered at random
            editeProxies(http)
            editeHeader(ua)
        time.sleep(random.randint(1, 12))  # random sleep

        html = str(getPage(dl['colleage']))  # fetch the colleagues page
        if html != ' ':  # getPage returns ' ' on failure
            nameLink = analysisPage(html)
            for nl in nameLink:
                addInfo(conn, cur, nl)
                #print(nl)
            print('Now is ' + str(dl['id']))

        #break  # only run one time

    cur.close()
    conn.close()
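Every snippet in this collection leans on a small set of disguise helpers (ChangeOrNot, editeHeader, editeCookies) whose bodies never appear in the excerpts. A minimal sketch of what they might look like; only the names and call sites come from the code above, while the bodies and the trigger probability are assumptions:

import random

def ChangeOrNot(p=0.3):
    """Assumed: fire at random so the disguise is not rotated on every request."""
    return random.random() < p

def editeHeader(ua, headers=None, name=None):
    """Assumed: swap in a fresh User-Agent drawn from the ua pool."""
    headers = headers if headers is not None else {}
    headers['User-Agent'] = random.choice(ua)
    return headers

def editeCookies(cookies):
    """Assumed: perturb the session cookie values (later examples print cookies['CFID'])."""
    cookies['CFID'] = str(random.randint(10**8, 10**9 - 1))
    return cookies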
Code Example #2
    def run(self):
        while True:
            # Get the work from the queue and expand the tuple
            httpProxies = commHttpProxies.copy()
            headers = commHeaders.copy()
            cookies = commCookies.copy()

            dl = self.dlQueue.get()
            http = self.ipQueue.get()
            ua = self.uaQueue.get()
            httpProxies['https'] = http
            # Dress up the request parameters
            if ChangeOrNot():  # triggered at random
                headers = editeHeader(ua, headers, dl['name'])  # change the user agent
                cookies = editeCookies(cookies)
            time.sleep(random.randint(5, 20))  # random sleep

            # Fetch the html and pull the csv url out of it
            html = str(getPage(dl['url'], httpProxies, headers, cookies))
            papercsv = getCsvUrl(html)
            dl['papercsv'] = papercsv
            # Return the proxy and user agent to their queues
            self.ipQueue.put(http)
            self.uaQueue.put(ua)
            if html == ' ':  # fetch failed, put the task back
                self.dlQueue.put(dl)
            else:  # success: push the task downstream
                print('get: ' + str(dl['id']))
                self.htmlQueue.put(dl)
            self.dlQueue.task_done()
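This run method (and the variants in the following examples) is meant to live in a threading.Thread subclass fed by shared queues: tasks in dlQueue, a proxy pool in ipQueue, a user-agent pool in uaQueue, results in htmlQueue. A minimal sketch of that wiring; the class name PageWorker and the startWorkers helper are assumptions, not names from the original project:

import threading
from queue import Queue

class PageWorker(threading.Thread):
    """Hypothetical host class for the run() method above."""
    def __init__(self, dlQueue, ipQueue, uaQueue, htmlQueue):
        super().__init__(daemon=True)  # daemon so a stuck worker cannot block exit
        self.dlQueue = dlQueue
        self.ipQueue = ipQueue
        self.uaQueue = uaQueue
        self.htmlQueue = htmlQueue
    # run(self) from Code Example #2 goes here

def startWorkers(dlList, proxies, userAgents, workers=4):
    """Assumed wiring: tasks, a proxy pool, and a user-agent pool feed the threads."""
    dlQueue, ipQueue, uaQueue, htmlQueue = Queue(), Queue(), Queue(), Queue()
    for dl in dlList:
        dlQueue.put(dl)  # one task per url record
    for p in proxies:
        ipQueue.put(p)  # shared proxy pool
    for agent in userAgents:
        uaQueue.put(agent)  # shared user-agent pool
    for _ in range(workers):
        PageWorker(dlQueue, ipQueue, uaQueue, htmlQueue).start()
    dlQueue.join()  # blocks until every task has been task_done()
    return htmlQueue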
Code Example #3
    def run(self):
        while True:
            # Get the work from the queue and expand the tuple
            httpProxies = commHttpProxies.copy()
            headers = commHeaders.copy()
            cookies = commCookies.copy()

            dl = self.dlQueue.get()
            http = self.ipQueue.get()
            ua = self.uaQueue.get()
            httpProxies['https'] = http
            #ts1 = datetime.datetime.now()
            # Dress up the request parameters
            if ChangeOrNot():  # triggered at random
                #httpProxies = editeProxies(http, httpProxies)  # change the proxy
                headers = editeHeader(ua, headers, dl['name'])  # change the user agent
                cookies = editeCookies(cookies)
            time.sleep(random.randint(5, 20))  # random sleep

            # Fetch the html
            html = str(getPage(dl['url'], httpProxies, headers, cookies))
            # Return the proxy and user agent to their queues
            self.ipQueue.put(http)
            self.uaQueue.put(ua)
            if html == ' ':  # fetch failed, put the task back
                self.dlQueue.put(dl)
            # Push the (html, task) pair downstream; a ' ' html marks a failed fetch
            self.htmlQueue.put((html, dl))
            #print('get: '+str(dl['id']))
            #ts2 = datetime.datetime.now()
            #print('page id:'+str(dl['id'])+' time:'+str(ts2-ts1))
            self.dlQueue.task_done()
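All of these workers rely on getPage returning the sentinel ' ' when a fetch fails, which is what the html == ' ' checks test for. A minimal sketch consistent with that contract; only the signature and the sentinel are visible in the excerpts, the body is an assumption:

import requests

def getPage(url, httpProxies=None, headers=None, cookies=None):
    """Assumed contract: return the page html, or the sentinel ' ' on any failure."""
    try:
        r = requests.get(url, proxies=httpProxies, headers=headers,
                         cookies=cookies, timeout=30)
        if r.status_code == 200:
            return r.text
    except requests.RequestException:
        pass
    return ' '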
Code Example #4
    def run(self):
        while True:
            # Get the work from the queue and expand the tuple
            httpProxies = commHttpProxies.copy()
            headers = commHeaders.copy()
            cookies = commCookies.copy()

            ua = self.uaQueue.get()
            dl = self.htmlQueue.get()
            http = self.ipQueue.get()
            httpProxies['https'] = http
            # Dress up the request parameters
            if ChangeOrNot():  # triggered at random
                headers = editeHeader(ua, headers, dl['name'])  # change the user agent
                cookies = editeCookies(cookies)
            time.sleep(random.randint(6, 15))  # random sleep

            #----------------------------- this is the part that differs

            # Fetch the csv
            flag = findCSV(dl, httpProxies, headers, cookies)
            self.ipQueue.put(http)  # return the proxy to its queue
            self.uaQueue.put(ua)  # return the user agent to its queue
            if not flag:  # fetch failed, put the task back
                self.htmlQueue.put(dl)
            self.htmlQueue.task_done()
Code Example #5
    def run(self):
        while True:
            # Get the work from the queue and expand the tuple
            httpProxies = commHttpProxies.copy()
            headers = commHeaders.copy()
            cookies = commCookies.copy()

            ua = self.uaQueue.get()
            dl = self.dlQueue.get()
            http = self.ipQueue.get()
            httpProxies['https'] = http
            #ts1 = datetime.datetime.now()
            # Dress up the request parameters
            if ChangeOrNot():  # triggered at random
                #httpProxies = editeProxies(http, httpProxies)  # change the proxy
                headers = editeHeader(ua, headers, dl['name'])  # change the user agent
                cookies = editeCookies(cookies)
            time.sleep(random.randint(7, 15))  # random sleep

            # Fetch the csv
            flag = findCSV(dl, httpProxies, headers, cookies)
            #html = readTXT('E:/Code/Test Data/Yu Zheng - ACM author profile page.txt')
            self.ipQueue.put(http)  # return the proxy to its queue
            self.uaQueue.put(ua)  # return the user agent to its queue
            if not flag:  # fetch failed, put the task back
                self.dlQueue.put(dl)
            #print('get: '+str(dl['id']))
            #ts2 = datetime.datetime.now()
            #print('page id:'+str(dl['id'])+' time:'+str(ts2-ts1))
            self.dlQueue.task_done()
Code Example #6
def findCSV():
    httpProxies = commHttpProxies
    headers = commHeaders

    http, ua = getHttpUa()  # get the candidate proxy and user-agent pools
    conn, cur = getCursor()  # get the database connection and cursor
    dlList = getResult(sltDLNotCom, cur)  # returns a 2-D array of url records

    for dl in dlList:
        if ChangeOrNot():  # triggered at random
            httpProxies = editeProxies(http, httpProxies)  # change the proxy
            headers = editeHeader(ua, headers)  # change the user agent
        time.sleep(random.randint(1, 12))  # random sleep

        url = dl['papercsv']

        if url is not None and len(url) > 15:
            try:
                r = requests.get(url, proxies=httpProxies, headers=headers, timeout=30)
                if r.status_code == 200:
                    csv_path = file_path + str(dl['id']) + '.csv'
                    with open(csv_path, 'wb') as f:
                        f.write(r.content)
                        print('Now is ' + str(dl['id']))
            except requests.RequestException as e:
                print(e)
                httpProxies = editeProxies(http, httpProxies)  # change the proxy
                headers = editeHeader(ua, headers)  # change the user agent
                time.sleep(random.randint(1, 12))  # random sleep
                try:
                    r = requests.get(url, proxies=httpProxies, headers=headers, timeout=30)
                    if r.status_code == 200:
                        csv_path = file_path + str(dl['id']) + '.csv'
                        with open(csv_path, 'wb') as f:
                            f.write(r.content)
                            print('Now is ' + str(dl['id']))
                except Exception:
                    print('another try failed! id: ' + str(dl['id']))
        #break  # only run one time

    cur.close()
    conn.close()
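The download-and-retry logic inside findCSV is written out twice, once for the first attempt and once verbatim for the retry. A small helper could hold a single copy; this is a sketch assuming the same editeProxies/editeHeader helpers and file_path global, and the fetchWithRetry name is mine, not the original project's:

import random
import time
import requests

def fetchWithRetry(dl, url, http, ua, httpProxies, headers, retries=2):
    """Hypothetical helper: one copy of findCSV's duplicated download logic."""
    for _ in range(retries):
        try:
            r = requests.get(url, proxies=httpProxies, headers=headers, timeout=30)
            if r.status_code == 200:
                with open(file_path + str(dl['id']) + '.csv', 'wb') as f:
                    f.write(r.content)
                print('Now is ' + str(dl['id']))
                return True
        except requests.RequestException as e:
            print(e)
            httpProxies = editeProxies(http, httpProxies)  # rotate the proxy
            headers = editeHeader(ua, headers)  # rotate the user agent
            time.sleep(random.randint(1, 12))  # back off before the retry
    print('all tries failed! id: ' + str(dl['id']))
    return False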
Code Example #7
def findPage():
    httpProxies = commHttpProxies.copy()
    headers = commHeaders.copy()
    cookies = commCookies.copy()
    # Pull the list of urls to visit from the database, then loop over them
    http, ua = getHttpUa()  # get the candidate proxy and user-agent pools
    conn, cur = getCursor()  # get the database connection and cursor
    dlList = getResult(sltDLNotCom, cur)  # returns a 2-D array of url records
    #for i in range(15):
    #i = 0
    for dl in dlList:
        # Test only: read the page from a local txt file instead of the network
        #html = readTXT('E:/Code/Test Data/Hsinchun Chen.txt')
        #html = readTXT('E:/Code/Test Data/Yu Zheng - ACM author profile page.txt')
        #html = readTXT('E:/Code/Test Data/A. Smolic - ACM author profile page.txt')

        if ChangeOrNot():  # triggered at random
            httpProxies = editeProxies(http, httpProxies)  # change the proxy
            headers = editeHeader(ua, headers, dl['name'])  # change the user agent
            cookies = editeCookies(cookies)
        time.sleep(random.randint(2, 12))  # random sleep

        # Debug: show the current disguise
        print(str(httpProxies['https']))
        print(str(headers['User-Agent']))
        print(str(headers['Referer']))
        print(str(cookies['CFID']))
        print()
        '''
        html = str(getPage(dl['url'],httpProxies,headers,cookies))  # fetch the page

        if html != ' ':
            infoSet = analysisPage(html,int(dl['id']))  # parse the page
            addInfo(conn,cur,infoSet,dl)  # store into the database
            cur.execute('update dlurl1 set status=1 where id='+str(dl['id']))  # mark as extracted
            conn.commit()
            print('Now is '+str(dl['id']))
        #break  # only run one time
        '''
    cur.close()
    conn.close()
Code Example #8

    def run(self):
        while True:
            httpProxies = commHttpProxies.copy()
            headers = commHeaders.copy()
            cookies = commCookies.copy()

            dl = self.dlQueue.get()
            http = self.ipQueue.get()
            ua = self.uaQueue.get()
            httpProxies['https'] = http
            #ts1 = datetime.datetime.now()
            # Dress up the request parameters
            if ChangeOrNot():  # triggered at random
                headers = editeHeader(ua, headers, dl['name'])  # change the user agent
                cookies = editeCookies(cookies)
            time.sleep(random.randint(3, 14))  # random sleep

            # Optional: strip the stale session ids from the url
            dl['colleage'] = cleanURL(dl['colleage'])
            #if '&CFID=716005087&CFTOKEN=29677367' in dl['colleage']:
            #    dl['colleage'] = dl['colleage'].replace('&CFID=716005087&CFTOKEN=29677367','')

            # Fetch the html
            html = str(getPage(dl['colleage'], httpProxies, headers, cookies))
            # Return the proxy and user agent to their queues
            self.ipQueue.put(http)
            self.uaQueue.put(ua)
            if html == ' ':  # fetch failed, put the task back
                self.dlQueue.put(dl)
            # Push the (html, task) pair downstream
            print('get HTML: ' + str(dl['id']))
            self.htmlQueue.put((html, dl))
            #print('get: '+str(dl['id']))
            #ts2 = datetime.datetime.now()
            #print('page id:'+str(dl['id'])+' time:'+str(ts2-ts1))
            self.dlQueue.task_done()
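cleanURL itself is not shown in any excerpt, but the commented-out lines above make its job clear: drop the CFID/CFTOKEN session parameters from the url. A minimal sketch under that assumption:

import re

def cleanURL(url):
    # Assumed from the commented-out lines above: drop the ACM session parameters
    return re.sub(r'&(?:CFID|CFTOKEN)=\d+', '', url)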
Code Example #9
        dlQueue.put(dl)

    httpProxies = commHttpProxies.copy()
    headers = commHeaders.copy()
    cookies = commCookies.copy()

    while not dlQueue.empty():

        dl = dlQueue.get()
        proxy = random.choice(http)  # pick from the pool without clobbering the http list
        ua = random.choice(uag)
        httpProxies['https'] = proxy
        #ts1 = datetime.datetime.now()
        # Dress up the request parameters
        if ChangeOrNot():  # triggered at random
            headers = editeHeader(ua, headers, dl['name'])  # change the user agent
            cookies = editeCookies(cookies)
        time.sleep(random.randint(5, 20))  # random sleep

        # Fetch the html
        html = str(getPage(dl['url'], httpProxies, headers, cookies))

        if html == ' ':  # fetch failed, put the task back
            dlQueue.put(dl)
        # Write the page into a file
        try:
            path = dirPath + str(dl['id']) + '.txt'
            writeTXT(path, html)
            print('complete: ' + str(dl['id']))
        except Exception:
            print('write failed: ' + str(dl['id']))
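writeTXT is another tool helper the excerpts assume; paired with the readTXT calls seen elsewhere, it is presumably a thin file-write wrapper. A sketch under that assumption:

def writeTXT(path, content):
    """Assumed counterpart to readTXT: dump the fetched page into a text file."""
    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)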
Code Example #10
from tool import signal, getHttpUa, httpProxies, headers, readTXT, getCursor, getResult, ChangeOrNot, editeProxies, editeHeader, getPage
import time
import random
from bs4 import BeautifulSoup

# Main loop
def findSubject():
    http, ua = getHttpUa()
    conn, cur = getCursor()
    dlList = getResult(sltDLNotCom, cur)  # returns a 2-D array of url records
    for dl in dlList:
        # Test only: read the page from a local txt file instead of the network
        #html = readTXT('E:/Code/Test Data/Paul Robert Barford - ACM author profile page.txt')
        if ChangeOrNot():  # triggered at random
            editeProxies(http)
            editeHeader(ua)
        time.sleep(random.randint(1, 20))  # random sleep

        html = str(getPage(dl['subject']))  # fetch the subject page
        if html != ' ':  # getPage returns ' ' on failure
            subject = extractSubject(html)
            status = addInfo(conn, cur, subject, dl['id'])
            if status == 1:
                print('Now is ' + str(dl['id']))

    cur.close()
    conn.close()

def extractSubject(doc):
    combineStr = ''
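The excerpt ends before extractSubject's body. Given the BeautifulSoup import and the combineStr accumulator, a plausible shape concatenates the page's subject terms into one delimited string; the selector below is a placeholder guess, not the site's real markup:

from bs4 import BeautifulSoup

def extractSubject(doc):
    """Placeholder reconstruction; the selector is a guess, not the site's markup."""
    soup = BeautifulSoup(doc, 'html.parser')
    combineStr = ''
    for tag in soup.select('a.subject'):
        combineStr += tag.get_text(strip=True) + ';'
    return combineStr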