Code example #1
import random
import time

# getHttpUa, getCursor, getResult, ChangeOrNot, editeProxies, editeHeader,
# getPage, analysisPage, addInfo and the SQL constant sltCollNotNull are
# project helpers defined elsewhere.
def findPage():
    http, ua = getHttpUa()
    conn, cur = getCursor()
    dlList = getResult(sltCollNotNull, cur)
    for dl in dlList:
        # For testing: read the HTML from a local file instead of fetching.
        #html = readTXT('E:/Code/Test Data/Paul Robert Barford - ACM author profile page - colleagues.txt')
        #html = readTXT('E:/Code/Test Data/Yu Zheng - ACM author profile page.txt')
        #html = readTXT('E:/Code/Test Data/A. Smolic - ACM author profile page.txt')
        if ChangeOrNot():  # randomly rotate the proxy and user agent
            editeProxies(http)
            editeHeader(ua)
        time.sleep(random.randint(1, 12))  # random sleep

        html = str(getPage(dl['colleage']))  # fetch the URL
        if html != ' ':
            nameLink = analysisPage(html)
            for nl in nameLink:
                addInfo(conn, cur, nl)
                #print(nl)
            print('Now is ' + str(dl['id']))

        #break  # only run one iteration (for testing)

    cur.close()
    conn.close()
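The examples rely on several helpers that are not shown. A minimal, hypothetical sketch consistent with how they are used here: ChangeOrNot randomly triggers fingerprint rotation, and getPage returns the page text or the sentinel ' ' on failure, which is what the `if html == ' '` checks depend on. The default arguments let it cover both the one-argument call above and the four-argument calls in the threaded examples.

import random
import requests

def ChangeOrNot(probability=0.3):
    # Assumption: randomly decide whether to rotate proxy/headers/cookies;
    # the real trigger probability is not shown in the source.
    return random.random() < probability

def getPage(url, proxies=None, headers=None, cookies=None):
    # Assumption: fetch a page and return its text, or the sentinel ' '
    # on any failure, matching the `if html == ' '` checks in the examples.
    try:
        resp = requests.get(url, proxies=proxies, headers=headers,
                            cookies=cookies, timeout=10)
        resp.raise_for_status()
        return resp.text
    except requests.RequestException:
        return ' '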
Code example #2
    def run(self):
        while True:
            # Get the work from the queue and expand the tuple
            httpProxies = commHttpProxies.copy()
            headers = commHeaders.copy()
            cookies = commCookies.copy()

            dl = self.dlQueue.get()
            http = self.ipQueue.get()
            ua = self.uaQueue.get()
            httpProxies['https'] = http
            #ts1 = datetime.datetime.now()
            # Decorate the request parameters
            if ChangeOrNot():  # randomly triggered
                #httpProxies = editeProxies(http, httpProxies)  # change the proxy
                headers = editeHeader(ua, headers, dl['name'])  # change the user agent
                cookies = editeCookies(cookies)
            time.sleep(random.randint(5, 20))  # random sleep

            # Fetch the HTML
            html = str(getPage(dl['url'], httpProxies, headers, cookies))
            # Return the proxy and user agent to their pools
            self.ipQueue.put(http)
            self.uaQueue.put(ua)
            if html == ' ':  # fetch failed: re-queue the task
                self.dlQueue.put(dl)
            else:  # fetch succeeded: hand the result to the consumer
                self.htmlQueue.put((html, dl))
            #print('get: ' + str(dl['id']))
            #ts2 = datetime.datetime.now()
            #print('page id:' + str(dl['id']) + ' time:' + str(ts2 - ts1))
            self.dlQueue.task_done()
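These run() methods belong to a threading.Thread subclass whose definition is not shown. A hypothetical skeleton of the class and its queue wiring; the class name PageWorker, the worker count, and the proxyList/uaList inputs are all assumptions, and the placeholder run() only mirrors the pattern of the real bodies above.

import threading
from queue import Queue

class PageWorker(threading.Thread):
    # Hypothetical wrapper around one of the run() methods shown above.
    def __init__(self, dlQueue, ipQueue, uaQueue, htmlQueue):
        super().__init__(daemon=True)
        self.dlQueue = dlQueue      # tasks: dict rows to fetch
        self.ipQueue = ipQueue      # shared pool of proxy addresses
        self.uaQueue = uaQueue      # shared pool of user-agent strings
        self.htmlQueue = htmlQueue  # output: fetched (html, dl) pairs

    def run(self):
        # Minimal placeholder; the real bodies are the run() methods above.
        while True:
            dl = self.dlQueue.get()
            html = str(getPage(dl['url']))
            if html == ' ':          # fetch failed: re-queue the task
                self.dlQueue.put(dl)
            else:
                self.htmlQueue.put((html, dl))
            self.dlQueue.task_done()

dlQueue, ipQueue, uaQueue, htmlQueue = Queue(), Queue(), Queue(), Queue()
for proxy in proxyList:             # proxyList and uaList are assumed inputs
    ipQueue.put(proxy)
for ua in uaList:
    uaQueue.put(ua)
for _ in range(8):                  # worker count is an arbitrary choice
    PageWorker(dlQueue, ipQueue, uaQueue, htmlQueue).start()
dlQueue.join()                      # block until every task is marked done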
Code example #3
    def run(self):
        while True:
            # Get a task from the queue and expand the tuple
            httpProxies = commHttpProxies.copy()
            headers = commHeaders.copy()
            cookies = commCookies.copy()

            dl = self.dlQueue.get()
            http = self.ipQueue.get()
            ua = self.uaQueue.get()
            httpProxies['https'] = http
            # Decorate the request parameters
            if ChangeOrNot():  # randomly triggered
                headers = editeHeader(ua, headers, dl['name'])  # change the user agent
                cookies = editeCookies(cookies)
            time.sleep(random.randint(5, 20))  # random sleep

            # Fetch the HTML and extract the CSV export link
            html = str(getPage(dl['url'], httpProxies, headers, cookies))
            dl['papercsv'] = getCsvUrl(html)
            # Return the proxy and user agent to their pools
            self.ipQueue.put(http)
            self.uaQueue.put(ua)
            if html == ' ':  # fetch failed: re-queue the task
                self.dlQueue.put(dl)
            else:
                print('get: ' + str(dl['id']))  # str() guards a numeric id
                self.htmlQueue.put(dl)
            self.dlQueue.task_done()
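getCsvUrl is not shown; it evidently extracts the publication-export CSV link from the fetched profile page. A hypothetical sketch, assuming the link can be recognized by a .csv extension in its href:

from bs4 import BeautifulSoup

def getCsvUrl(html):
    # Hypothetical: return the first anchor whose href points at a CSV
    # export, or '' if the page has none.
    soup = BeautifulSoup(html, 'lxml')
    link = soup.find('a', href=lambda h: h and '.csv' in h.lower())
    return link['href'] if link else ''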
Code example #4

    def run(self):
        while True:
            httpProxies = commHttpProxies.copy()
            headers = commHeaders.copy()
            cookies = commCookies.copy()

            dl = self.dlQueue.get()
            http = self.ipQueue.get()
            ua = self.uaQueue.get()
            httpProxies['https'] = http
            #ts1 = datetime.datetime.now()
            # Decorate the request parameters
            if ChangeOrNot():  # randomly triggered
                headers = editeHeader(ua, headers, dl['name'])  # change the user agent
                cookies = editeCookies(cookies)
            time.sleep(random.randint(3, 14))  # random sleep

            # Optional: strip session tokens from the stored URL
            dl['colleage'] = cleanURL(dl['colleage'])
            #if '&CFID=716005087&CFTOKEN=29677367' in dl['colleage']:
            #    dl['colleage'] = dl['colleage'].replace('&CFID=716005087&CFTOKEN=29677367', '')

            # Fetch the HTML
            html = str(getPage(dl['colleage'], httpProxies, headers, cookies))
            # Return the proxy and user agent to their pools
            self.ipQueue.put(http)
            self.uaQueue.put(ua)
            if html == ' ':  # fetch failed: re-queue the task
                self.dlQueue.put(dl)
            else:
                print('get HTML:' + str(dl['id']))
                self.htmlQueue.put((html, dl))
            #ts2 = datetime.datetime.now()
            #print('page id:' + str(dl['id']) + ' time:' + str(ts2 - ts1))
            self.dlQueue.task_done()
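cleanURL is not shown either, but the commented-out lines above indicate it strips session parameters (CFID/CFTOKEN) from the URL. A sketch that generalizes that string replacement:

from urllib.parse import urlsplit, urlunsplit, parse_qsl, urlencode

def cleanURL(url):
    # Drop session-tracking query parameters such as CFID and CFTOKEN,
    # keeping the rest of the query string intact.
    parts = urlsplit(url)
    query = [(k, v) for k, v in parse_qsl(parts.query)
             if k.upper() not in ('CFID', 'CFTOKEN')]
    return urlunsplit((parts.scheme, parts.netloc, parts.path,
                       urlencode(query), parts.fragment))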
Code example #5
    httpProxies = commHttpProxies.copy()  # shared proxy template, as in the threaded examples
    headers = commHeaders.copy()
    cookies = commCookies.copy()

    while not dlQueue.empty():

        dl = dlQueue.get()
        proxy = random.choice(http)  # pick a proxy without rebinding the pool list
        ua = random.choice(uag)      # pick a user agent
        httpProxies['https'] = proxy
        #ts1 = datetime.datetime.now()
        # Decorate the request parameters
        if ChangeOrNot():  # randomly triggered
            headers = editeHeader(ua, headers, dl['name'])  # change the user agent
            cookies = editeCookies(cookies)
        time.sleep(random.randint(5, 20))  # random sleep

        # Fetch the HTML
        html = str(getPage(dl['url'], httpProxies, headers, cookies))

        if html == ' ':  # fetch failed: re-queue the task and skip the write
            dlQueue.put(dl)
            continue
        # Write the HTML to a file
        try:
            path = dirPath + str(dl['id']) + '.txt'
            writeTXT(path, html)
            print('complete: ' + str(dl['id']))
        except Exception:
            print('error:' + str(dl['id']))
            dlQueue.put(dl)
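writeTXT, and the readTXT used in the commented-out test lines, are simple file helpers; a minimal sketch of what they presumably do:

def writeTXT(path, text):
    # Assumption: write text to a file, creating or overwriting it.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)

def readTXT(path):
    # Assumption: read a whole file back as one string.
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()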
Code example #6
import random
import time

from bs4 import BeautifulSoup

# Main loop
def findSubject():
    http, ua = getHttpUa()
    conn, cur = getCursor()
    subList = getResult(sltDLNotCom, cur)  # returns a 2-D array (list of rows)
    for dl in subList:
        # For testing: read the HTML from a local file instead of fetching.
        #html = readTXT('E:/Code/Test Data/Paul Robert Barford - ACM author profile page.txt')
        if ChangeOrNot():  # randomly rotate the proxy and user agent
            editeProxies(http)
            editeHeader(ua)
        time.sleep(random.randint(1, 20))  # random sleep

        html = str(getPage(dl['subject']))  # fetch the URL
        if html != ' ':
            subject = extractSubject(html)
            status = addInfo(conn, cur, subject, dl['id'])
            if status == 1:
                print('Now is ' + str(dl['id']))

    cur.close()
    conn.close()

def extractSubject(doc):
    # Locate the "Subject Areas" heading in the profile page
    combineStr = ''
    subject = []
    soup = BeautifulSoup(''.join(doc), "lxml")
    h5 = soup.find(text='Subject Areas')
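The database helpers getCursor and getResult are also not shown. Since rows are accessed by column name (dl['id'], dl['subject'], dl['colleage']), a dictionary cursor is implied. A hypothetical sketch using pymysql; the driver choice and all connection parameters are assumptions:

import pymysql

def getCursor():
    # Assumption: MySQL via pymysql; host/user/password/db are placeholders.
    conn = pymysql.connect(host='localhost', user='user', password='password',
                           db='acm', charset='utf8mb4',
                           cursorclass=pymysql.cursors.DictCursor)
    return conn, conn.cursor()

def getResult(sql, cur):
    # Run a SELECT such as sltDLNotCom and return all rows as dicts.
    cur.execute(sql)
    return cur.fetchall()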