Esempio n. 1
0
def cookieurl_3():
    """Index unseen article links from the page configured as cookieurl/3.

    The page is fetched with ``cookie_Parse`` and parsed with BeautifulSoup.
    Every ``<a>`` with an ``href`` inside the ``<td class="2016_erji_content">``
    cell is resolved against the page URL, keyed by ``esutil.format_md5`` of
    the resolved URL, and written to ``spidernews_index`` /
    ``spidernews_type`` unless ``esutil.query_data`` returns a truthy result
    for that key.

    Returns:
        None. Problems are logged through ``logger`` rather than raised.
    """
    url = configutil.getconfig('cookieurl', '3')
    page = cookie_Parse(url)
    soup = BeautifulSoup(page, 'html.parser')

    # ``soup.title`` and ``soup.find`` yield None when the element is absent;
    # stop with a readable log line instead of an AttributeError.
    if soup.title is None:
        logger.info('cookieurl_3: no <title> found in %s' % url)
        return
    name = soup.title.text

    container = soup.find('td', attrs={'class': '2016_erji_content'})
    if container is None:
        logger.info('cookieurl_3: link container not found in %s' % url)
        return

    try:
        for link in container.find_all('a'):
            href = link.get('href')
            if not href:
                # urljoin() returns the base URL for an empty href, which
                # would index the listing page itself as an article.
                continue
            real_path = parse.urljoin(url, href)
            urlMd5 = esutil.format_md5(real_path)
            if esutil.query_data('spidernews_index', 'spidernews_type',
                                 urlMd5):
                # A truthy lookup result is treated as "already stored".
                continue
            data = {
                'link': real_path,
                'name': name,
                'createTime': int(round(time.time() * 1000)),  # epoch ms
                'title': link.get_text(),
                'urlMd5': urlMd5,
            }
            esutil.insert_single_data('spidernews_index',
                                      'spidernews_type', data, urlMd5)
    except Exception as e:
        logger.info(e)
Esempio n. 2
0
def cookieurl_1():
    """Index unseen article links from the page configured as cookieurl/1.

    The page is fetched with ``cookie_Parse`` and parsed with BeautifulSoup.
    Every ``<a>`` with an ``href`` inside ``<tbody id="contentBody">`` is keyed
    by ``esutil.format_md5`` of its raw ``href`` and written to
    ``spidernews_index`` / ``spidernews_type`` unless ``esutil.query_data``
    returns a truthy result for that key.

    Returns:
        None. Problems are logged through ``logger`` rather than raised.
    """
    url = configutil.getconfig('cookieurl', '1')
    page = cookie_Parse(url)
    soup = BeautifulSoup(page, 'html.parser')

    # ``soup.title`` and ``soup.find`` yield None when the element is absent;
    # stop with a readable log line instead of an AttributeError.
    if soup.title is None:
        logger.info('cookieurl_1: no <title> found in %s' % url)
        return
    name = soup.title.text

    container = soup.find('tbody', attrs={'id': 'contentBody'})
    if container is None:
        logger.info('cookieurl_1: link container not found in %s' % url)
        return

    try:
        for link in container.find_all('a'):
            href = link.get('href')
            if not href:
                # An anchor without an href carries no URL to hash or store.
                continue
            urlMd5 = esutil.format_md5(href)
            if esutil.query_data('spidernews_index', 'spidernews_type',
                                 urlMd5):
                # A truthy lookup result is treated as "already stored".
                continue
            data = {
                # NOTE(review): href is stored as-is, not resolved against
                # ``url`` -- presumably this page emits absolute links.
                'link': href,
                'name': name,
                'createTime': int(round(time.time() * 1000)),  # epoch ms
                'title': link.get_text(),
                'urlMd5': urlMd5,
            }
            esutil.insert_single_data('spidernews_index',
                                      'spidernews_type', data, urlMd5)
    except Exception as e:
        logger.info(e)
Esempio n. 3
0
def confirm(string):
    """Query the service configured under companycheck/address for ``string``.

    Issues a GET to the configured address with ``string`` appended verbatim
    and decodes the response body as JSON.

    Args:
        string: Text appended to the configured address to form the URL.

    Returns:
        True only when the decoded JSON has ``code`` equal to the string
        ``'2'``. False in every other case, including any exception (which
        is logged through ``logger``).
    """
    try:
        url = getconfig('companycheck', 'address') + string
        # Bound the wait so a stalled service cannot block the caller
        # forever; a timeout is handled like any other failure below.
        resp = requests.get(url, timeout=30)
        result = json.loads(resp.text)
        return result.get('code') == '2'
    except Exception as e:
        logger.info(e)
        return False
Esempio n. 4
0
def formatUrl():
    """Return the urls/nfrb template filled in with today's local date.

    The date is rendered as ``YYYY-MM/DD`` and passed to ``str.format`` as
    the single positional argument.
    """
    template = getconfig('urls', 'nfrb')
    # strftime() without a time tuple formats the current local time.
    today = time.strftime('%Y-%m/%d')
    return template.format(today)
Esempio n. 5
0
def formatUrl():
    """Return the urls/hbrb template with today's local date filled in.

    The date is rendered as ``YYYYMMDD`` and passed as positional argument 0;
    positional argument 1 is the literal text ``{}``.
    """
    template = getconfig('urls', 'hbrb')
    # strftime() without a time tuple formats the current local time.
    today = time.strftime('%Y%m%d')
    # Passing '{}' puts an empty placeholder back into the second slot --
    # presumably so a caller can format the result once more.
    return template.format(today, '{}')
Esempio n. 6
0
def formatUrl():
    """Return the urls/jxrb template filled in with today's local date.

    The template receives two positional arguments: the date as
    ``YYYY-MM/DD`` (argument 0) and as ``YYYY-MM-DD`` (argument 1).
    """
    url = getconfig('urls', 'jxrb')
    # Read the clock once so both fields describe the same day even when
    # the call happens right around midnight.
    now = time.localtime()
    date1 = time.strftime('%Y-%m/%d', now)
    date2 = time.strftime('%Y-%m-%d', now)
    return url.format(date1, date2)
Esempio n. 7
0
                        // "-";
                    }
                    return guid;
                }"""
    ctx = execjs.compile(js)
    pageid = ctx.call("happy")
    return pageid


if __name__ == '__main__':
    servers = ['47.111.24.165:5000', '47.94.209.31:5000', '47.105.61.16:5000']
    ser = random.choice(servers)
    logger.info('本次工作ip : %s' % ser)
    logger.info('=========================开始抓取政府网站案件=========================')
    logger.info('~~~~~~~~~~~~~~~~~~~证券部分~~~~~~~~~~~~~~~')
    zq = getconfig('pjws', 'address1').format(get_pageid())
    result = grab(zq, ser)
    for i in result:
        logger.info('%s、%s' % (result.index(i) + 1, i))
    es_operate(result)
    time.sleep(random.randint(100, 300))
    logger.info(
        '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
    )
    logger.info(
        '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
    )
    logger.info(
        '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
    )
    logger.info(
Esempio n. 8
0
def formatUrl():
    """Return the urls/dzszb template filled in with today's local date.

    The date is rendered as ``YYYYMMDD`` and passed to ``str.format`` as the
    single positional argument.
    """
    template = getconfig('urls', 'dzszb')
    # strftime() without a time tuple formats the current local time.
    today = time.strftime('%Y%m%d')
    return template.format(today)
Esempio n. 9
0
# -*- coding: utf-8 -*-

import hashlib
import time

from elasticsearch import Elasticsearch

from util import configutil
from util.LoggerClass import Logger

# Module-level logger used by the helpers below.
logger = Logger(logname='newspaper', logger='esutil').getlog()
# Build the module-level Elasticsearch client from the ``eshost`` config
# section (keys ``host`` and ``port``).
# NOTE: if anything inside the ``try`` raises, the error is only logged and
# ``es`` is never bound; later references to ``es`` then raise NameError
# (insert_single_data catches and logs that as well).
try:
    host = configutil.getconfig('eshost', 'host')
    port = configutil.getconfig('eshost', 'port')
    es = Elasticsearch([{'host': host, 'port': port}])
except Exception as ex:
    logger.info(ex)


def insert_single_data(index_name, doc_type, data, esid):
    """Index one document through the module-level ``es`` client.

    Args:
        index_name: Passed to ``es.index`` as ``index``.
        doc_type: Passed to ``es.index`` as ``doc_type``.
        data: Passed to ``es.index`` as ``body``.
        esid: Passed to ``es.index`` as ``id``.

    Returns:
        Whatever ``es.index`` returns, or None when the call raised (the
        exception is logged through ``logger`` and not propagated).
    """
    try:
        response = es.index(
            index=index_name,
            doc_type=doc_type,
            body=data,
            id=esid,
        )
    except Exception as err:
        logger.info(err)
        return None
    else:
        return response


def insert_datas(index_name, doc_type, datas):
    try:
        res = es.bulk(index=index_name, doc_type=doc_type, body=datas)
        return res