def mainFunction():
    """Extract institution info from every downloaded HTML file and store it.

    For each file under ``files_path``: parse an entry id, insert any
    extracted institutions, otherwise flag the DB row (tem=5) so the file
    is not re-processed. Closes the module-level cursor/connection at the end.
    """
    filePathList = readFiles(files_path)
    print('read is ready')
    for fileP in filePathList:
        html = readTXT(fileP)
        eid = getID(html)
        if eid > 0:
            instit = extractInstitut(html)
            if instit:
                insertInstitution(eid, instit, fileP)
            else:
                # Build the SQL *before* the try: the original assigned it
                # after print(), so a failing print() made the except handler
                # raise NameError on updateSQL.
                updateSQL = 'update dlurl1 set tem=5 where id=' + str(eid)
                try:
                    print('empty: ' + fileP)
                    cur.execute(updateSQL)  # mark row as already extracted
                    conn.commit()
                except Exception:
                    # Best-effort: log the failing statement and continue.
                    print('error: ' + updateSQL)
        else:
            print('eid error: ' + fileP)

    cur.close()
    conn.close()
コード例 #2
0
def mainFunction():
    """Extract topic ids from every downloaded HTML file and store them.

    For each file under ``files_path``: parse an entry id, insert any
    extracted topics, otherwise flag the DB row (status=2) so the file is
    not re-processed. Closes the module-level cursor/connection at the end.
    """
    filePathList = readFiles(files_path)
    print('read is ready')
    for fileP in filePathList:
        html = readTXT(fileP)
        eid = getID(html)
        if eid > 0:
            topicID = extractTopic(eid, html)
            if topicID:
                insertTopic(eid, topicID, fileP)
            else:
                # Build the SQL *before* the try: the original assigned it
                # after print(), so a failing print() made the except handler
                # raise NameError on updateSQL.
                updateSQL = 'update dlurl1 set status=2 where id=' + str(eid)
                try:
                    print('empty: ' + fileP)
                    cur.execute(updateSQL)  # mark row as already extracted
                    conn.commit()
                except Exception:
                    # Best-effort: log the failing statement and continue.
                    print('error: ' + updateSQL)
        else:
            print('eid error: ' + fileP)
    cur.close()
    conn.close()
コード例 #3
0
def mainFunction():
    """Read every downloaded file, parse it, and hand it to dlInfo().

    Closes the module-level cursor/connection when all files are processed.
    """
    filePathList = readFiles()
    # Iterate the paths directly; the original index was only used for lookup.
    for file_path in filePathList:
        html = readTXT(file_path)
        soup = BeautifulSoup(''.join(html), "lxml")
        dlInfo(html, soup)

    cur.close()
    conn.close()
コード例 #4
0
 def run(self):
     """Worker loop: consume file paths from the queue and store name links.

     Runs forever; each file is read, truncated at the collaborators
     marker, analysed, and every resulting entry is inserted.
     """
     while True:
         filePath = self.fileQueue.get()
         html = str(readTXT(filePath))  # raw page text
         print('get: ' + filePath)
         # Compute find() once instead of twice as in the original.
         ind = html.find('<全部同伴的名字>')
         if ind > 0:
             # Drop everything from the marker onwards.
             html = html[:ind]
         nameLink = analysisPage(html)
         # Iterating an empty result is a no-op, so no length guard needed.
         for nl in nameLink:
             addInfo(nl)
         self.fileQueue.task_done()
コード例 #5
0
def mainFunction():
    """Extract institution info from every downloaded HTML file and store it.

    Files whose entry id cannot be parsed, or that yield no institutions,
    are silently skipped. Closes the module-level cursor/connection at the
    end.
    """
    filePathList = readFiles(files_path)
    print('read is ready')
    for fileP in filePathList:
        html = readTXT(fileP)
        eid = getID(html)
        if eid > 0:
            instit = extractInstitut(html)
            if instit:
                insertInstitution(eid, instit, fileP)
    cur.close()
    conn.close()
コード例 #6
0
def mainFunction():
    """Walk every downloaded file; store extracted topics and institutions.

    Only records with a positive entry id are processed, and each insert
    happens only when the corresponding extraction found something.
    Closes the module-level cursor/connection at the end.
    """
    paths = readFiles(files_path)
    print('read is ready')
    for path in paths:
        page = readTXT(path)
        entry_id = getID(page)
        if entry_id > 0:
            found_topics = extractTopic(page)
            found_institutions = extractInstitut(page)
            if len(found_topics) > 0:
                insertTopic(entry_id, found_topics, path)
            if len(found_institutions) > 0:
                insertInstitution(entry_id, found_institutions, path)
    cur.close()
    conn.close()
コード例 #7
0
def mainFunction():
    """Extract topics for every entry id found in each downloaded file.

    Inserts extracted topics per id; when extraction yields nothing, the
    DB row is flagged (status=2) so it is not re-processed. Unlike the
    original, also closes the cursor/connection, matching the sibling
    functions in this file.
    """
    filePathList = readFiles(files_path)
    print('read is ready len is ' + str(len(filePathList)))
    for fileP in filePathList:
        html = readTXT(fileP)
        eid = getID(html)
        if len(eid) > 0:
            # extractTopic depends only on the page, so run it once per
            # file instead of once per id (loop-invariant hoist).
            topics = extractTopic(html)
            for e in eid:
                if len(topics) > 0:
                    insertTopic(e, topics, fileP)
                else:
                    updateSQL = 'update dlurl1 set status=2 where id=' + str(e)
                    cur.execute(updateSQL)  # mark row as already extracted
                    conn.commit()
    cur.close()
    conn.close()
コード例 #8
0
def mainFunction():
    """Read every downloaded page, analyse it, and store the results.

    dlInfo() acts as a duplicate check; analysis/insert only run when it
    returns a record with more than one field. Analysis failures are
    logged per record and processing continues. Closes the module-level
    cursor/connection at the end.
    """
    file_paths = readFiles()
    for file_path in file_paths:
        html = readTXT(file_path)
        record = dlInfo(html)  # duplicate check
        try:
            if len(record) > 1:
                info_set = analysisPage(html, record['id'])
                addInfo(info_set, record)
        except Exception:
            # Log and keep going with the next file.
            print('analysis error:' + str(record['id']))
    cur.close()
    conn.close()
コード例 #9
0
# -*- coding: utf-8 -*-
from tool import readTXT
from bs4 import BeautifulSoup

if __name__ == '__main__':
    # Smoke-test script: read one saved ACM page, truncate it at the
    # collaborators marker, and print every collaborator name + URL.
    path = 'e:/test.xml'
    xml = readTXT(path)
    # Compute find() once instead of twice as in the original.
    ind = xml.find('<全部同伴的名字>')
    if ind > -1:
        # Drop everything from the marker onwards.
        xml = xml[:ind]
    else:
        print('error')

    soup = BeautifulSoup(''.join(xml), "lxml")
    a = soup.find('a', {'name': "collab"})
    try:
        divAb = a.parent.parent
        tr = divAb.table.tr
        for td in tr.findAll('td'):
            for div in td.findAll('div'):
                if div.a.string is not None:  # PEP 8: identity test for None
                    print(div.a.string)
                    url = 'http://dl.acm.org/' + div.a['href']
                    print(url)
    except Exception:
        # Page structure not as expected (e.g. soup.find returned None).
        print('error')
コード例 #10
0
    id = urlid.replace('http://dl.acm.org/author_page.cfm?id=', '')
    userid = id[4:]  #only numbers begin at 4 are considered
    return urlid, userid


def mainFunction():
    """Read every downloaded file, parse it, and hand it to dlInfo().

    Closes the module-level cursor/connection when all files are processed.
    """
    filePathList = readFiles()
    # Iterate the paths directly; the original index was only used for lookup.
    for file_path in filePathList:
        html = readTXT(file_path)
        soup = BeautifulSoup(''.join(html), "lxml")
        dlInfo(html, soup)

    cur.close()
    conn.close()


if __name__ == '__main__':
    # Script entry: read every downloaded file, parse it, hand it to
    # dlInfo(), then close the module-level cursor/connection.
    filePathList = readFiles()
    # Iterate the paths directly; the original index was only used for lookup.
    for file_path in filePathList:
        html = readTXT(file_path)
        soup = BeautifulSoup(''.join(html), "lxml")
        dlInfo(html, soup)

    cur.close()
    conn.close()
コード例 #11
0
def getHttpUa(http_path, ua_path):
    """Load the two input files and return their contents as a pair.

    ``http_path`` is read as plain text via readTXT(); ``ua_path`` is read
    as a csv list via readList(). Returns the tuple (http, ua).
    """
    http_data = readTXT(http_path)  # plain-text contents
    ua_list = readList(ua_path)     # csv rows
    return http_data, ua_list