def findPage():
    http,ua = getHttpUa()
    conn,cur = getCursor()
    dlList = getResult(sltCollNotNull,cur)
    for dl in dlList:
        # For testing: read the HTML from a local file instead of fetching it
        #html = readTXT('E:/Code/Test Data/Paul Robert Barford - ACM author profile page - colleagues.txt')
        #html = readTXT('E:/Code/Test Data/Yu Zheng - ACM author profile page.txt')
        #html = readTXT('E:/Code/Test Data/A. Smolic - ACM author profile page.txt')
        if ChangeOrNot():
            editeProxies(http)
            editeHeader(ua)
        time.sleep(random.randint(1, 12))
        
        html = str(getPage(dl['colleage']))  # fetch the page for this URL
        if html != ' ':  # getPage appears to return ' ' on failure
            nameLink = analysisPage(html)
            for nl in nameLink:
                addInfo(conn,cur,nl)
                #print(nl)
            print('Now is '+str(dl['id']))
            
        #break  # only run one iteration when testing
    
    cur.close()
    conn.close()
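
# Every example in this file leans on helpers defined elsewhere in its
# project. Below is a minimal sketch of plausible implementations for the
# shared ones, inferred purely from the call sites; the proxy/UA values,
# connection parameters, and the ChangeOrNot trigger rate are placeholder
# assumptions, not the projects' real settings.
import random

import pymysql


def getHttpUa():
    # Return candidate proxy addresses and User-Agent strings for disguise;
    # the real projects presumably load these from a file or a table.
    http = ['http://127.0.0.1:8080']
    ua = ['Mozilla/5.0 (Windows NT 10.0; Win64; x64)']
    return http, ua


def getCursor():
    # Open a MySQL connection; a DictCursor matches the dl['id'] access style.
    conn = pymysql.connect(host='localhost', user='root', password='',
                           db='test', charset='utf8mb4',
                           cursorclass=pymysql.cursors.DictCursor)
    return conn, conn.cursor()


def getResult(sql, cur):
    # Run a SELECT and return all rows (each row is a dict under DictCursor).
    cur.execute(sql)
    return cur.fetchall()


def ChangeOrNot():
    # Randomly decide whether to rotate the disguise; the rate is a guess.
    return random.random() < 0.3
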
def mainFunction():
    # 0. Prepare the input data
    http,uag = getHttpUa()

    for ip in http:
        ipQueue.put(ip)
    for ua in uag:
        uaQueue.put(ua)

    for k in range(1):
        aWorker = analysisWorker(ipQueue,uaQueue,htmlQueue)
        aWorker.daemon = True
        aWorker.start()

    for i in range(1):
        pWorker = pageWorker(ipQueue,uaQueue,dlQueue,htmlQueue)
        pWorker.daemon = True
        pWorker.start()  

    # 1. Read the list of records from the error CSV
    dlList=[]
    with open(expertList_path) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            dlList.append(row)
            
    
    # each dl is a dict
    for dl in dlList:
        dlQueue.put(dl)
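
# The mainFunction variants all assume module-level queues and Thread-based
# workers defined elsewhere. Below is a minimal sketch of that scaffolding
# using the standard library; the pageWorker body is a guess from how the
# queues are wired together, not the projects' actual worker.
import queue
import threading

ipQueue = queue.Queue()    # proxy addresses
uaQueue = queue.Queue()    # User-Agent strings
dlQueue = queue.Queue()    # URL records waiting to be fetched
htmlQueue = queue.Queue()  # fetched HTML waiting to be parsed


class pageWorker(threading.Thread):
    # Fetch the page for each queued record and hand the HTML onward.
    def __init__(self, ipQueue, uaQueue, dlQueue, htmlQueue):
        super().__init__()
        self.ipQueue, self.uaQueue = ipQueue, uaQueue
        self.dlQueue, self.htmlQueue = dlQueue, htmlQueue

    def run(self):
        while True:
            dl = self.dlQueue.get()
            # ... fetch dl's URL through a rotated proxy/User-Agent here ...
            self.htmlQueue.put((dl, '<html></html>'))  # placeholder hand-off
            self.dlQueue.task_done()
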
def mainFunction():
    http, uag = getHttpUa()

    for ip in http:
        ipQueue.put(ip)
    for ua in uag:
        uaQueue.put(ua)
    for k in range(1):
        aWorker = analysisWorker(htmlQueue, infoQueue)
        aWorker.daemon = True
        aWorker.start()
    print('ok1')
    for i in range(4):
        pWorker = pageWorker(ipQueue, uaQueue, dlQueue, htmlQueue)
        pWorker.daemon = True
        pWorker.start()
    print('ok2')
    conn, cur = getCursor()
    dlList = getResult(sltDLNotCom, cur)  # returns the URL records as a list of rows
    for dl in dlList:
        dlQueue.put(dl)
    cur.close()
    conn.close()
    print('ok3')
    for j in range(1):
        mWorker = mysqlWorker(infoQueue)
        mWorker.daemon = True
        mWorker.start()
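
# Sketch of the mysqlWorker started above: it presumably drains infoQueue and
# persists each parsed record. The insert statement is only indicated in a
# comment because the real SQL does not appear in the excerpt.
import threading


class mysqlWorker(threading.Thread):
    def __init__(self, infoQueue):
        super().__init__()
        self.infoQueue = infoQueue

    def run(self):
        conn, cur = getCursor()  # project helper, assumed in scope
        while True:
            info = self.infoQueue.get()
            # ... cur.execute(insertSQL, info) with the project's real SQL ...
            conn.commit()
            self.infoQueue.task_done()
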
def mainFunction():
    http,uag = getHttpUa()

    for ip in http:
        ipQueue.put(ip)
    for ua in uag:
        uaQueue.put(ua)
    for k in range(1):
        aWorker = analysisWorker(htmlQueue)
        aWorker.daemon = True
        aWorker.start()
    print('ok1')
    for i in range(7):
        pWorker = pageWorker(ipQueue,uaQueue,dlQueue,htmlQueue)
        pWorker.daemon = True
        pWorker.start()
    print('ok2')
    
    #dlList = getResult(sltDLNotCom,cur)  # returns the URL records as a list of rows
    dlList=[]
    with open(expertList_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            if len(row[2])>5:
                dlList.append({'id':row[0],'name':row[1],'url':row[2]})
                #break
        print('total:'+str(len(dlList)))

    for dl in dlList:
        dlQueue.put(dl)
def mainFunction():
    http, uag = getHttpUa()

    for ua in uag:
        uaQueue.put(ua)

    for ip in http:
        ipQueue.put(ip)

    for i in range(5):
        pWorker = pageWorker(uaQueue, ipQueue, dlQueue)
        pWorker.daemon = True
        pWorker.start()

    conn, cur = getCursor()

    dlList = getResult(selectSQL, cur)
    '''
    with open(expertList_path) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            dlList.append(row)
    '''

    for dl in dlList:
        dlQueue.put(dl)
def mainFunction():
    http,uag = getHttpUa()

    for ip in http:
        ipQueue.put(ip)
    for ua in uag:
        uaQueue.put(ua)
    for k in range(2):
        aWorker = analysisWorker(htmlQueue,infoQueue)
        aWorker.daemon = True
        aWorker.start()
    print('ok1')
    for i in range(8):
        pWorker = pageWorker(ipQueue,uaQueue,dlQueue,htmlQueue)
        pWorker.daemon = True
        pWorker.start()
    print('ok2')
    '''
    conn,cur = getCursor()
    dlList = getResult(sltDLNotCom,cur)  # returns the URL records as a list of rows
    '''
    dlList=[]
    with open(expertList_path) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            if int(row['id']) > 0:
                dlList.append(row)
            #break
        print('total:'+str(len(dlList)))

    for dl in dlList:
        dlQueue.put(dl)
def findCSV():
    httpProxies = commHttpProxies
    headers = commHeaders
    
    http,ua = getHttpUa()  # get candidate proxy and User-Agent lists for disguise
    conn,cur = getCursor()  # get the database connection and cursor
    dlList = getResult(sltDLNotCom,cur)  # returns the URL records as a list of rows

    for dl in dlList:
        if ChangeOrNot():  # randomly triggered
            httpProxies=editeProxies(http,httpProxies)  # rotate the proxy
            headers=editeHeader(ua,headers)  # rotate the User-Agent
        time.sleep(random.randint(1, 12))  # random sleep
        
        url = dl['papercsv']

        if url is not None and len(url) > 15:
            try:
                r = requests.get(url, proxies = httpProxies, headers = headers, timeout=30)
                if r.status_code == 200:
                    csv_path = file_path+str(dl['id'])+'.csv'
                    with open(csv_path,'wb') as f:  # 'f', not 'csv': don't shadow the csv module
                        f.write(r.content)
                        print('Now is '+str(dl['id']))
            except requests.RequestException as e:
                print(e)
                httpProxies=editeProxies(http,httpProxies)  # rotate the proxy
                headers=editeHeader(ua,headers)  # rotate the User-Agent
                time.sleep(random.randint(1, 12))  # random sleep before the retry
                try:
                    r = requests.get(url, proxies = httpProxies, headers = headers, timeout=30)
                    if r.status_code == 200:
                        csv_path = file_path+str(dl['id'])+'.csv'
                        with open(csv_path,'wb') as f:  # avoid shadowing the csv module
                            f.write(r.content)
                            print('Now is '+str(dl['id']))
                except Exception:
                    print('second try failed! id:'+str(dl['id']))
        #break  # only run one iteration when testing
    
    cur.close()
    conn.close()        
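
# findCSV repeats the whole download block verbatim for its single retry. One
# way to factor that out (a sketch, not project code: the helper name is
# hypothetical, and editeProxies/editeHeader are the project's own functions,
# assumed in scope):
import random
import time

import requests


def fetch_with_retry(url, http, ua, httpProxies, headers, tries=2):
    # Try the download up to `tries` times, rotating the disguise and
    # sleeping between attempts; return the body on success, None on failure.
    for attempt in range(tries):
        try:
            r = requests.get(url, proxies=httpProxies, headers=headers,
                             timeout=30)
            if r.status_code == 200:
                return r.content
        except requests.RequestException as e:
            print(e)
        httpProxies = editeProxies(http, httpProxies)  # project helper
        headers = editeHeader(ua, headers)             # project helper
        time.sleep(random.randint(1, 12))
    return None
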
def findPage():
    httpProxies = commHttpProxies.copy()
    headers = commHeaders.copy()
    cookies = commCookies.copy()
    #Fetch the to-be-visited URL list from the database, then loop over the URLs
    http, ua = getHttpUa()  # get candidate proxy and User-Agent lists for disguise
    conn, cur = getCursor()  # get the database connection and cursor
    dlList = getResult(sltDLNotCom, cur)  # returns the URL records as a list of rows
    #for i in range(15):
    #i = 0
    for dl in dlList:
        # For testing: read the HTML from a local file instead of fetching it
        #html = readTXT('E:/Code/Test Data/Hsinchun Chen.txt')
        #html = readTXT('E:/Code/Test Data/Yu Zheng - ACM author profile page.txt')
        #html = readTXT('E:/Code/Test Data/A. Smolic - ACM author profile page.txt')

        if ChangeOrNot():  # randomly triggered
            httpProxies = editeProxies(http, httpProxies)  # rotate the proxy
            headers = editeHeader(ua, headers, dl['name'])  # rotate the User-Agent
            cookies = editeCookies(cookies)  # rotate the cookies
        time.sleep(random.randint(2, 12))  # random sleep

        print(str(httpProxies['https']))
        print(str(headers['User-Agent']))
        print(str(headers['Referer']))
        print(str(cookies['CFID']))
        print()
        '''
        html = str(getPage(dl['url'],httpProxies,headers,cookies))  # fetch the page

        if html != ' ':
            infoSet = analysisPage(html,int(dl['id']))  # parse the page
            addInfo(conn,cur,infoSet,dl)  # store in the database
            cur.execute('update dlurl1 set status=1 where id='+str(dl['id']))  # mark as extracted
            conn.commit()
            print('Now is '+str(dl['id']))
        #break  # only run one iteration when testing
        '''
    cur.close()
    conn.close()
    conn, cur = getCursor()
    dlList = getResult(sltDLNotCom, cur)  # returns the URL records as a list of rows
    for dl in dlList:
        dlQueue.put(dl)
    cur.close()
    conn.close()
    print('ok3')
    for j in range(1):
        mWorker = mysqlWorker(infoQueue)
        mWorker.daemon = True
        mWorker.start()


if __name__ == '__main__':

    http, uag = getHttpUa()

    for ip in http:
        ipQueue.put(ip)
    for ua in uag:
        uaQueue.put(ua)
    for k in range(1):
        aWorker = analysisWorker(htmlQueue, infoQueue)
        aWorker.daemon = True
        aWorker.start()
    print('ok1')
    for i in range(4):
        pWorker = pageWorker(ipQueue, uaQueue, dlQueue, htmlQueue)
        pWorker.daemon = True
        pWorker.start()
    print('ok2')
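
# Every worker above is a daemon thread, so the process exits as soon as the
# main thread returns, killing workers mid-task. The excerpt does not show
# how the projects wait; a common pattern is to block on the queues, which
# requires each worker to call task_done() once per finished item:
#
#     dlQueue.join()    # wait until every queued record has been fetched
#     htmlQueue.join()  # ...and every fetched page has been parsed
#     infoQueue.join()  # ...and every parsed record has been stored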
def findSubject():
    http,ua = getHttpUa()