def handle_single_item(item, **kwargs):
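    """Sync one qichacha_weixin company row into Elasticsearch.

    EsHelper.check returns 2 when the document already exists (skip), 0 when it
    is missing (index a new document) and 1 when it exists with lower priority
    (update it in place). The raw business data is loaded from common_data_all
    when the item does not carry it, and es_status is updated on success.
    """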
    company_id, company_name, parsed_content = item['id'], item['company_name'].strip(), item['data_gsxx']

    company_id_es = ETLTools.generate_company_id(company_name)
    logging.info("starts to process company: %s, id: %s, the id of es document is: %s "
                 % (company_name, company_id, company_id_es))
    status = EsHelper.check(index=kwargs['index'], doc_type=kwargs['doc_type'],
                            doc_id=company_id_es, site=kwargs['site'], config_site=kwargs['config_site'])

    if status == 2:  # data already exists in ES
        logging.info("company exists in es")
        return

    if parsed_content is None:
        select_company_parsed_content = 'SELECT value from common_data_all  where company_id = %s and site = %s and key_desc = %s '
        parsed_content = DbHelper.fetchone(select_company_parsed_content, data=(company_id, kwargs['site'], 'data_gsxx'))['value']
    converted = Qichacha_WeiXin_EsConvert.convert(parsed_content)
    converted['CmpName'] = company_name.strip()
    converted['Meta'] = {'Source': kwargs['site'], 'Time': datetime.datetime.now().date().__str__()}

    if status == 0:  # not present in ES

        EsHelper.es.index(index=kwargs['index'],
                          doc_type=kwargs['doc_type'],
                          id=company_id_es,
                          body=json.dumps(converted, ensure_ascii=False, encoding="utf-8"))
        logging.info("inserted into es")
    else:  # status == 1: exists in ES but with lower priority, so update it
        EsHelper.es.update(index='drdata_qyk', doc_type='BusinessInfo', id=company_id_es,
                           body=json.dumps(converted, ensure_ascii=False, encoding="utf-8"))
        logging.info("updated into es")

    DbHelper.execute("UPDATE qichacha_weixin_company set es_status = %s WHERE id = %s ", (kwargs['es_status_success'], company_id))
def handle_single_item(item, **kwargs):
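    """Convert one gonggao_data row into per-company ES documents.

    Each converted record gets its own document id derived from the row id plus
    the company name. Missing documents are indexed; if a document already
    exists, the row is marked with es_status_exists and processing stops.
    """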
    id, old_data, web_data = item['id'], item['old_data'], item['web_data']
    logging.info("starts to process gonggao_dta id: %s" % (id))
    convert_list = GonggaoWebEsConvert.convert(old_data, web_data)
    for convert in convert_list:
        company_id_es = generate_company_id(str(id) + convert['CmpName'])
        status = check_existence(company_id_es,
                                 index=kwargs['index'],
                                 doc_type=kwargs['doc_type'])
        convert['Meta'] = {
            'Source': kwargs['site'],
            'Time': datetime.datetime.now().date().__str__()
        }
        if status == 0:  # not present in ES
            logging.info("starts to index id : %s,company : %s,doc_id : %s" %
                         (id, convert['CmpName'], company_id_es))
            EsHelper.es.index(index=kwargs['index'],
                              doc_type=kwargs['doc_type'],
                              id=company_id_es,
                              body=json.dumps(convert,
                                              ensure_ascii=False,
                                              encoding="utf-8"))
            logging.info("inserted into es")
        elif status == 1:  # already exists in ES
            DbHelper.execute(
                "UPDATE gonggao_data set es_status = %s WHERE id = %s ",
                (kwargs['es_status_exists'], id))
            logging.info("company exists in es")
            return
        DbHelper.execute(
            "UPDATE gonggao_data set es_status = %s WHERE id = %s ",
            (kwargs['es_status_success'], id))
def main():
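    """Poll xysj_weixin_company for crawled but unparsed rows, copy their raw
    src_list into common_data_all and mark each row as parsed."""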
    site = 'xysj_weixin'
    batch = 100
    data_table = 'common_data_all'

    while True:
        stime = time.time()
        items = DbHelper.fetchmany(
            "select id, crawl_time, src_list from xysj_weixin_company where crawl_status = 1 and "
            " parse_status = 0  limit %s ", (batch, ))
        if items is None or len(items) == 0:
            time.sleep(10)
            continue

        for item in items:
            company_id, crawl_time, src_list = item[0], item[1], item[2]
            try:
                StoreHelper.store_data(data_table=data_table,
                                       company_id=company_id,
                                       site=site,
                                       crawl_time=crawl_time,
                                       data={u'info': src_list})
                logging.getLogger().info(" data stored ")
                DbHelper.execute(
                    "update xysj_weixin_company set parse_status = %s, data_table_name =%s  where id = %s",
                    (1, data_table, company_id))
                logging.getLogger().info(" parse status updated ")

            except Exception, err:
                logging.getLogger().exception(err)
                continue

        logging.getLogger().info(
            " the round of batch-parsing ends, and totally cost %s. " %
            (time.time() - stime))
def main():
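    """Poll gs_guangdong_company for rows ready to parse, store the raw
    src_gongshang/src_qiye/src_other/src_judicial payloads into common_data_all
    and update parse_status (set to 2 when a row fails)."""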
    site = 'guangdong_weixin'
    batch = 100
    data_table = 'common_data_all'

    while True:

        stime = time.time()
        sql = '''
            select b.id, b.update_time, b.src_gongshang, b.src_qiye, b.src_other, b.src_judicial from (
                select id from gs_guangdong_company where status = %s and parse_status = %s limit %s
            )as a left join (select id, update_time, src_gongshang, src_qiye, src_other, src_judicial from gs_guangdong_company) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, [15, 0, batch])
        if items is None or len(items) == 0:
            time.sleep(10)
            continue

        for item in items:
            company_id, update_time = item['id'], item['update_time']
            gsgs, qygs, bmgs, sfgs = item['src_gongshang'], item[
                'src_qiye'], item['src_other'], item['src_judicial']
            try:
                logging.getLogger().info(" begin to parse company-id : %s " %
                                         (item[0], ))
                '''
                1 存储公司信息
                '''
                StoreHelper.store_data(data_table=data_table,
                                       company_id=company_id,
                                       site=site,
                                       crawl_time=update_time,
                                       data={
                                           u'gsgs': gsgs,
                                           u'qygs': qygs,
                                           u'bmgs': bmgs,
                                           u'sfgs': sfgs
                                       })
                logging.getLogger().info(" data stored ")
                '''
                2. update parse_status and data_table_name
                '''
                DbHelper.execute(
                    "update gs_guangdong_company set parse_status = %s, data_table_name =%s  where id = %s",
                    (1, data_table, company_id))
                logging.getLogger().info(
                    " parse status updated, and data_table_name inserted ")
            except Exception, err:
                logging.getLogger().exception(err)
                logging.getLogger().info(
                    "exception/err occurs, company id: %s" % (item[0]))
                DbHelper.execute(
                    "update gs_guangdong_company set parse_status = %s where id = %s",
                    [2, company_id])
                continue

        logging.getLogger().info(
            " the round of batch-parsing ends, and totally cost %s. " %
            (time.time() - stime))
def main():
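    """Poll qichacha_weixin_company for rows ready to parse, store the raw data
    payload into common_data_all and update parse_status (set to 2 when a row
    fails)."""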
    site = 'qichacha_weixin'
    batch = 100
    data_table = 'common_data_all'
    while True:

        stime = time.time()

        sql = '''
            select b.id, b.update_time, b.data from (
                select id from qichacha_weixin_company where status = %s and parse_status = %s  limit %s
            ) as a left join (select id, update_time, data from qichacha_weixin_company) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (1, 0, batch))

        if items is None or len(items) == 0:
            time.sleep(10)
            continue

        for item in items:
            company_id, update_time, data = item['id'], item[
                'update_time'], item['data']
            try:
                logging.getLogger().info(" begin to parse company-id : %s " %
                                         (item[0], ))
                '''
                1 存储公司信息
                '''
                StoreHelper.store_data(data_table=data_table,
                                       company_id=company_id,
                                       site=site,
                                       crawl_time=update_time,
                                       data={u'detail': data})
                logging.getLogger().info(" data stored ")
                '''
                2. mark the row as parsed and record the data_table_name
                '''
                DbHelper.execute(
                    "update qichacha_weixin_company set parse_status = %s, data_table_name =%s  where id = %s",
                    [1, data_table, company_id])
                logging.getLogger().info(
                    " parse status updated, and data_table_name inserted ")

            except Exception, err:
                logging.getLogger().exception(err)
                logging.getLogger().info(
                    "exception/err occurs, company id: %s" % (company_id, ))
                DbHelper.execute(
                    "update qichacha_weixin_company set parse_status = %s where id = %s",
                    (2, company_id))
                continue

        logging.getLogger().info(
            " the round of batch-parsing ends, and totally cost %s. " %
            (time.time() - stime))
def main():
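    """Walk unparsed wenshu_web rows, fetch the full judgement text of each
    company-related case from wenshu.court.gov.cn through a proxy, store it in
    wenshu_web_detail and mark the source row as parsed."""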
    batch = 1
    parse_status_success = 1
    crawl_status_success = 1

    while True:
        items = DbHelper.fetchmany(
            "SELECT id, web_data FROM wenshu_web WHERE parse_status != %s limit %s",
            (parse_status_success, batch))

        for item in items:

            case_list = json.loads(item['web_data'])

            for case in case_list:

                if 'Count' in case:
                    continue
                case_name = case[u'案件名称']

                if u'公司' not in case_name:
                    continue

                logging.info('starts to handle id: %s, case id: %s ' %
                             (item['id'], case[u'文书ID']))

                url = 'http://wenshu.court.gov.cn/content/content?DocID=%s' % (
                    case[u'文书ID'], )
                headers = None
                proxies = proxy.get_proxy("WenshuDetail")
                response = requests.get(url=url,
                                        headers=headers,
                                        proxies=proxies,
                                        timeout=15)
                if response.status_code != 200:
                    logging.info('case-fetch fails')
                    time.sleep(10)
                    continue
                content = response.text

                DbHelper.execute(
                    "INSERT INTO wenshu_web_detail(doc_id, summary, detail, crawl_status) VALUES (%s, %s, %s, %s)",
                    (case[u'文书ID'], json.dumps(case), content,
                     crawl_status_success))
                logging.info('case inserted')
                time.sleep(3)

            DbHelper.execute(
                'UPDATE wenshu_web SET parse_status = %s WHERE id = %s ',
                (parse_status_success, item['id']))
def main():
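    """Poll gs_shanghai_company for rows ready to parse, run the four raw
    disclosure pages through IcWebParser, store the parsed sections into
    common_data_all and update parse_status (set to 2 when a row fails)."""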
    site = 'shanghai_web'
    batch = 100
    data_table = 'common_data_all'

    while True:

        stime = time.time()
        sql = '''
            select b.id, b.update_time, b.src_gsgs, b.src_qygs, b.src_bmgs, b.src_sfgs from (
                select id from gs_shanghai_company where status = %s and parse_status = %s  limit %s
            ) as a left join (select id, update_time, src_gsgs, src_qygs, src_bmgs, src_sfgs from gs_shanghai_company) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (15, 0, batch))

        if items is None or len(items) == 0:
            time.sleep(10)
            continue

        for item in items:
            company_id, update_time = item['id'], item['update_time']
            gsgs, qygs, bmgs, sfgs = item['src_gsgs'], item['src_qygs'], item['src_bmgs'], item['src_sfgs']
            try:
                logging.getLogger().info(" begin to parse company-id : %s " % (item[0],))

                result = IcWebParser().parsing([etree.HTML(text=gsgs),
                                                etree.HTML(text=qygs),
                                                etree.HTML(text=bmgs),
                                                etree.HTML(text=sfgs)])
                StoreHelper.store_data(data_table=data_table,
                                       company_id=company_id,
                                       site=site,
                                       crawl_time=update_time,
                                       data={u'gsgs': json.dumps(result[u'工商公示信息']),
                                             u'qygs': json.dumps(result[u'企业公示信息']),
                                             u'bmgs': json.dumps(result[u'其他部门公示信息']),
                                             u'sfgs': json.dumps(result[u'司法协助公示信息'])})
                logging.getLogger().info(" data inserted ")

                DbHelper.execute("update gs_shanghai_company set parse_status = %s, data_table_name= %s where id = %s",
                                 (1, data_table, company_id))
                logging.getLogger().info(" parse status updated, and data_table_name inserted ")
            except Exception, err:
                logging.getLogger().exception(err)
                logging.getLogger().info("exception/err occurs, company id: %s" % (company_id,))
                DbHelper.execute("update gs_shanghai_company set parse_status = %s where id = %s", (2, company_id))
                continue

        logging.getLogger().info(" the round of batch-parsing ends, and totally cost %s. " % (time.time() - stime))
def main():
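    """Poll tianyancha_web_company for crawled but unparsed rows, store the raw
    src_detail page into common_data_all and update parse_status (set to 2 when
    a row fails)."""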
    site = "tianyancha_web"
    batch = 200
    while True:

        logging.getLogger().info(" Batch begins ")
        stime = time.time()
        sql = '''
            select b.id, b.crawl_time, b.src_detail from (
                select id from tianyancha_web_company where crawl_status = %s and parse_status = %s  limit %s
            ) as a left join (select id, crawl_time, src_detail from tianyancha_web_company) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (1, 0, batch))
        if items is None or len(items) == 0:
            time.sleep(10)
            continue

        for item in items:
            logging.getLogger().info(" begin to parse company-id : %s " %
                                     (item[0], ))
            data_table = "common_data_all"

            try:
                # parse html page
                company_id, crawl_time, src_detail = item[0], item[1], item[2]
                StoreHelper.store_data(data_table=data_table,
                                       company_id=company_id,
                                       site=site,
                                       crawl_time=crawl_time,
                                       data={u'detail': src_detail})
                logging.getLogger().info(" data stored ")
                # update parse status
                DbHelper.execute(
                    "update tianyancha_web_company set parse_status = %s, data_table_name =%s  where id = %s",
                    (1, data_table, item[0]))
                logging.getLogger().info(
                    " parse status updated, and data_table_name inserted ")
            except Exception, err:
                logging.getLogger().exception(err)
                logging.getLogger().info(
                    "exception/err occurs, company id: %s" % (company_id, ))
                DbHelper.execute(
                    "update tianyancha_web_company set parse_status = %s  where id = %s ",
                    (2, company_id))
                continue

        logging.getLogger().info(
            " the round of batch-parsing ends, and totally cost %s. " %
            (time.time() - stime))
def handle_single_item(item, **kwargs):
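    """Sync one qycxb_web_company row into Elasticsearch, following the same
    check / index / update pattern as the qichacha handler above, then record
    es_status on the source row."""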
    company_id, company_name, parsed_content = item['id'], item[
        'company_name'].strip(), item['parsed_content']
    company_id_es = ETLTools.generate_company_id(company_name)
    logging.info(
        "starts to process company: %s, id: %s, the id of es document is: %s "
        % (company_name, company_id, company_id_es))
    status = EsHelper.check(index=kwargs['index'],
                            doc_type=kwargs['doc_type'],
                            doc_id=company_id_es,
                            site=kwargs['site'],
                            config_site=kwargs['config_site'])

    if status == 2:  # data already exists in ES
        logging.info("company exists in es")
        return

    converted = QycxbWebEsConvert.convert(parsed_content)
    converted['CmpName'] = company_name.strip()
    converted['Meta'] = {
        'Source': kwargs['site'],
        'Time': datetime.datetime.now().date().__str__()
    }

    if status == 0:  # not present in ES

        EsHelper.es.index(index=kwargs['index'],
                          doc_type=kwargs['doc_type'],
                          id=company_id_es,
                          body=json.dumps(converted,
                                          ensure_ascii=False,
                                          encoding="utf-8"))
        logging.info("inserted into es")
    else:  # status == 1: exists in ES but with lower priority, so update it
        EsHelper.es.update(index='drdata_qyk',
                           doc_type='BusinessInfo',
                           id=company_id_es,
                           body=json.dumps(converted,
                                           ensure_ascii=False,
                                           encoding="utf-8"))
        logging.info("updated into es")

    DbHelper.execute(
        "UPDATE qycxb_web_company set es_status = %s WHERE id = %s ",
        (kwargs['es_status_success'], company_id))
def main():
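    """Poll qixin_weixin_company for crawled but unparsed rows, store the raw
    homepage, basic and change info into common_data_all and update
    parse_status (set to 2 when a row fails)."""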
    batch = 100
    site = 'qixin_weixin'
    data_table = 'common_data_all'
    while True:

        stime = time.time()
        sql = '''
            select b.id, b.crawl_time, b.src_homepageinfo, b.src_basicinfo, b.src_changeinfo from (
                select id from qixin_weixin_company where crawl_status = %s and parse_status = %s  limit %s
            ) as a left join (select id, crawl_time, src_homepageinfo,src_basicinfo,src_changeinfo from qixin_weixin_company) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (7, 0, batch))
        if items is None or len(items) == 0:
            time.sleep(10)
            continue

        for item in items:
            company_id, crawl_time = item['id'], item['crawl_time']
            pageinfo, basicinfo, changeinfo = item['src_homepageinfo'], item['src_basicinfo'], item['src_changeinfo']
            try:
                logging.getLogger().info(" begin to parse company-id : %s " % (company_id,))

                '''
                1. store the company data
                '''
                StoreHelper.store_data(data_table=data_table, company_id=company_id,
                                       site=site,
                                       crawl_time=crawl_time,
                                       data={u'pageinfo': pageinfo, u'basicinfo': basicinfo, u'changeinfo': changeinfo})
                logging.getLogger().info(" data stored ")
                '''
                2. mark the row as parsed and record the data_table_name
                '''
                DbHelper.execute("update qixin_weixin_company set parse_status = %s, data_table_name =%s  where id = %s",
                                 (1, data_table, company_id))
                logging.getLogger().info(" parse status updated, and data_table_name inserted ")
            except Exception, err:
                logging.getLogger().exception(err)
                logging.getLogger().info("exception/err occurs, company id: %s" % (company_id,))
                DbHelper.execute("update qixin_weixin_company set parse_status = %s where id = %s", [2, company_id])
                continue

        logging.getLogger().info(" the round of batch-parsing ends, and totally cost %s. " % (time.time() - stime))
def main():
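    """Poll xizhi_web_company for rows ready to parse, run the raw data through
    parse(), store the JSON result into common_data_all and update parse_status
    (3 on success, 2 on failure)."""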
    site = "xizhi_web"
    batch = 100
    while True:

        begin = time.time()
        sql = '''
            select b.id, b.update_time, b.data from (
                select id from xizhi_web_company where status = %s and  parse_status = %s limit %s
            ) as a left join (select id, update_time, data from xizhi_web_company ) as b on a.id = b.id
        '''
        items = DbHelper.fetchmany(sql, (1, 0, batch))

        if items is None or len(items) == 0:
            time.sleep(10)
            continue

        for item in items:
            data_table = "common_data_all"
            company_id, crawl_time, data = item['id'], item['update_time'], item['data']
            logging.getLogger().info(" begin to parse company-id : %s " % (company_id,))

            try:
                # parse html page
                detail = parse(data)
                # persist parsed company data into database
                StoreHelper.store_data(data_table=data_table,
                                       company_id=company_id,
                                       site=site,
                                       crawl_time=crawl_time,
                                       data={u'data': json.dumps(detail)})
                logging.getLogger().info(" data stored ")
                # update parse status
                DbHelper.execute("update xizhi_web_company set parse_status = %s, data_table_name =%s  where id = %s",
                                 (3, data_table, company_id))
                logging.getLogger().info(" parse status updated, and data_table_name inserted ")
            except Exception, err:
                logging.getLogger().exception(err)
                logging.getLogger().info("exception/err occurs, company id: %s" % (company_id,))
                DbHelper.execute("update xizhi_web_company set parse_status = %s  where id = %s ", (2, company_id))
                continue

        logging.getLogger().info(" the round of batch-parsing ends, and totally cost %s. " % (time.time() - begin))
def process():
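    """Crawl software-copyright records for companies flagged as pending.

    Companies with SoftwareCopyrightStatus == 0 are fetched from the keyword
    service, each name is looked up on ccopyright.com.cn through a proxy, any
    parsed records are stored in software_copyright, and the service is then
    notified that the company has been handled.
    """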
    # get un-crawled company-keyword list
    keywords_response = requests.get(
        'http://10.51.1.201:3352/getKeywords',
        params={
            "data":
            json.dumps({
                "field": "SoftwareCopyrightStatus",
                "status": 0
            },
                       ensure_ascii=False)
        })
    if keywords_response.status_code != 200:
        time.sleep(10)
        return
    else:
        companies = json.loads(keywords_response.text)
    logging.getLogger("Crawler").info(
        "Get Companies From ES, And Size is : %s" % (len(companies), ))

    for company in companies:

        cmpid, cmpname = company["Id"], company['CmpName']
        logging.getLogger("Crawler").info(
            "Begins to Crawl Info, the CmpName:%s, CmpId:%s " %
            (cmpname, cmpid))
        url_template = "http://www.ccopyright.com.cn/cpcc/RRegisterAction.do?method=list&no=fck&sql_name=&sql_regnum=&sql_author=%s&curPage=1"
        url = url_template % (quote(cmpname.replace(u"・", u"").encode("gbk")))
        headers = None
        proxies = proxy.get_proxy("SoftWareCopyright")

        crawling_response = requests.get(url=url,
                                         headers=headers,
                                         proxies=proxies,
                                         timeout=15)
        if crawling_response.status_code != 200:
            time.sleep(10)
            continue
        content = crawling_response.text
        try:
            parsed_content = parse(content)
        except Exception as e:
            logging.getLogger("Crawler").exception(
                "Exceptions occurs when parsing crawled page")
            logging.getLogger("Crawler").exception(e)
            continue
        if len(parsed_content) == 0:
            logging.getLogger("Crawler").info("No software copyright found")
        else:
            DbHelper.execute(
                "INSERT INTO software_copyright(cmpid, src) VALUES(%s, %s)",
                (cmpid, content))
            parse_status = 1
            DbHelper.execute(
                "UPDATE software_copyright SET parse_status = %s, parsed_content = %s where cmpid = %s ",
                (parse_status, json.dumps(parsed_content), cmpid))
            logging.getLogger("Crawler").info("Page Parsed Successfully")

        notify_response = requests.get(
            url=url,
            params={
                "data":
                json.dumps([
                    {
                        "Id": cmpid,
                        "field": "SoftwareCopyrightStatus",
                        "status": 1
                    },
                ],
                           ensure_ascii=False)
            })

        if notify_response.status_code != 200:
            logging.getLogger("Crawler").info(
                "Action, which notify es, fails.")
        logging.getLogger("Crawler").info("Info Crawled Successfully")
        time.sleep(2)