Beispiel #1
0
def parseWebsite(minId=18879, pagesize=5000):
    """Parse stored pages from TempLz and import their company/website data.

    Rows with ``TempLz.id > minId`` are read in id order, ``pagesize`` rows
    per query.  For every row whose ``lzpage`` is not None the page is run
    through ``parserCompanyAndWeb``; when both a company and a website can
    be built from the result, the website is linked to the company with the
    same ``coname`` (importing the company first if no such row exists) and
    then imported.  The id of every visited row is appended to
    ``../logs/parseweb.log`` and printed.

    Args:
        minId: only rows with an id strictly greater than this are handled.
        pagesize: number of rows fetched per query; must be greater than 0.
    """
    # Count the same filtered set that is paginated below.  Counting the
    # whole table would schedule extra queries for pages that are empty.
    count = TempLz.select().where(TempLz.id > minId).count()
    pagecount = int(math.ceil(float(count) / float(pagesize)))

    # The context manager closes (and flushes) the log even when parsing or
    # importing a row raises.
    with open('../logs/parseweb.log', 'a') as log:
        for i in range(pagecount):
            datas = (TempLz.select()
                     .where(TempLz.id > minId)
                     .order_by(TempLz.id)
                     .paginate(i + 1, pagesize))
            for d in datas:
                data = d.lzpage
                if data is not None:
                    parseData = parserCompanyAndWeb(data)
                    com = buildCompany(parseData['company'])
                    web = buildWebsite(parseData['web'])
                    if com is not None and web is not None:
                        owner = Company.getOne(Company.coname == com.coname)
                        if owner is None:
                            # Unknown company name: import it, then load the
                            # stored row by regID to link the website to it.
                            impCompanyInfo(com)
                            owner = Company.getOne(Company.regID == com.regID)
                        web.regID = owner
                        impWebsite(web)
                log.write(str(d.id) + "\n")
                print(d.id)
Beispiel #2
0
def judgeLzResult(com, web, shortUrl, subTask):
    # 更新网站信息和公司信息
    impCompanyInfo(com)
    # impWebsite(web)
    # nWeb = Website.getOne(Website.domain == web.domain)

    # judgeWeb = Website.getOne(Website.domain ** str.format("%{}%", shortUrl))

    # 如果查询网址与抓取亮照后的网址不匹配
    if shortUrl != '':
        shortUrl = shortUrl.replace('http://', '').replace(' ', '')
    print  'shortUrl:', shortUrl
    print "web.domain:", web.domain
    com = Company.getOne(Company.coname == com.coname)
    if shortUrl != web.domain:
        # 当已经存在跳转关系记录时,不再操作
        existsJumpWeb = Website.getOne((Website.domain == web.domain))
        if existsJumpWeb is None:
            dt = format(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
            web.updateDate = dt
            web.regID = com
            web.save(force_insert=True)
            # 更新网站跳转地址
            q = Website.update(jump=web).where(Website.webId == subTask.webId.webId)
            q.execute()
        else:
            impWebsite(web)
            # 更新网站跳转地址
            q = Website.update(jump=existsJumpWeb).where(Website.webId == subTask.webId.webId)
            q.execute()
    else:
        #更新网站信息
        impWebsite(web)

    count = Website.select().where(
            (Website.licID != '') & (Website.regID == com) & (Website.domain == shortUrl)
    ).count()
    if count == 0:
        onlyCname = Website.select().where(
                (Website.licID != '') & (Website.regID == com)).count()
        onlyDomain = Website.select().where(
                (Website.licID != '') & (Website.domain == shortUrl)).count()
        if onlyCname > 0:
            q = TaskInfo.update(state='9').where(TaskInfo.id == subTask.id)
            q.execute()
        elif onlyDomain > 0:
            q = TaskInfo.update(state='8').where(TaskInfo.id == subTask.id)
            q.execute()
        else:
            q = TaskInfo.update(state='4').where(TaskInfo.id == subTask.id)
            q.execute()
    else:
        q = TaskInfo.update(state='2').where(TaskInfo.id == subTask.id)
        q.execute()
Beispiel #3
0
def impData():
    """Import every batch returned by ``parsePage`` and dump one sample record.

    ``parsePage`` is called repeatedly; each item of a returned batch is run
    through ``parser`` and ``buildCompany`` and the result is handed to
    ``impCompanyInfo``.  The loop ends when ``parsePage`` returns ``[]`` or
    ``None``.

    Afterwards the ``TempLz`` rows whose ``lzID`` is ``'20120323163520958'``
    are parsed and every element of the parse result is printed.
    """
    # Fetch each batch exactly once.  Calling parsePage() in the loop
    # condition and again in the body would test one result and then
    # process a different one.
    companyData = parsePage()
    # None is treated as "nothing to import" instead of failing in the for.
    while companyData is not None and companyData != []:
        for data in companyData:
            # Parse the current content and import the company built from it.
            result = parser(data)
            com = buildCompany(result)
            impCompanyInfo(com)
        companyData = parsePage()

    rows = TempLz.select().where(TempLz.lzID == '20120323163520958')
    for d in rows:
        result = parser(d.lzpage)
        for r in result:
            print(r)
Beispiel #4
0
def parsePage():
    """Import company data from all not yet imported ``TempLz`` pages.

    ``../logs/impdbdata.log`` holds one ``lzID`` per line for every page that
    has already been imported.  Each pass reads that file, selects the
    ``TempLz`` rows with a non-empty ``lzpage`` whose ``lzID`` is not listed,
    runs ``parser`` and ``buildCompany`` on each page, passes the result to
    ``impCompanyInfo`` and appends the row's ``lzID`` to the log.  Passes are
    repeated until one of them finds no row to import.

    Returns:
        ``[]`` once a pass finds nothing left to import, so that a caller
        looping until the result equals ``[]`` stops.  ``None`` if an
        exception occurred; the exception is printed, not re-raised.
    """
    logPath = '../logs/impdbdata.log'
    try:
        # Keep going until every page has been handled.
        while True:
            # Collect the lzIDs recorded so far.  The trailing newline must
            # be stripped, otherwise no stored lzID ever equals a log entry
            # and the not_in() filter below excludes nothing.
            try:
                with open(logPath, 'r') as f:
                    doneIds = [line.strip() for line in f if line.strip()]
            except IOError:
                # The log cannot be read (e.g. it does not exist yet):
                # treat it as empty; it is created by the append below.
                doneIds = []

            if doneIds:
                lzs = TempLz.select().where(
                    (TempLz.lzpage != '')
                    & (TempLz.lzID.not_in(doneIds))
                )
                print('result count: %s' % lzs.count())
            else:
                lzs = TempLz.select().where(
                    (TempLz.lzpage != '')
                )

            imported = 0
            # One append handle per pass, closed even if an import fails.
            with open(logPath, 'a') as log:
                for lz in lzs:
                    result = parser(lz.lzpage)
                    # Build the company information and import it.
                    com = buildCompany(result)
                    impCompanyInfo(com)
                    # Record progress immediately so that an interrupted
                    # run does not import this page again.
                    log.write(lz.lzID + '\n')
                    log.flush()
                    imported += 1

            if imported == 0:
                # Nothing left to import: stop instead of polling forever.
                return []
    except Exception as e:
        print(e)