Example #1
0
def search_task():
    """Look up every company of ``data_list`` in ``enterprise_data_gov``.

    A company with no matching document is logged at error level. A company
    whose document has no ``shareholder_information`` key is logged at
    warning level. Nothing is returned.
    """
    logger = Gsxtlogger('hunan.log').get_logger()
    conf = dict(
        host='172.16.215.16',
        port=40042,
        db='app_data',
        username='******',
        password='******',
    )

    # Database that holds the search-list table.
    db = MongDb(conf['host'], conf['port'], conf['db'],
                conf['username'], conf['password'], log=logger)

    for name in data_list:
        record = db.find_one('enterprise_data_gov', {'company': name})
        if record is None:
            # No document at all for this company.
            logger.error(name)
        elif 'shareholder_information' not in record:
            # Document exists but carries no shareholder section.
            logger.warn(name)
sys.path.append('../')
from common import util
from common.mongo import MongDb

from logger import Gsxtlogger

db_conf = {
    'host': '172.16.215.16',
    'port': 40042,
    'db': 'app_data',
    'username': '******',
    'password': '******',
}

log = Gsxtlogger('clean_enterprise_data_gov_src.log').get_logger()

source_db = MongDb(db_conf['host'],
                   db_conf['port'],
                   db_conf['db'],
                   db_conf['username'],
                   db_conf['password'],
                   log=log)


def main():
    result_list = []
    source_table = 'enterprise_data_gov'
    count = 0
    deal_total = 0
    for item in source_db.traverse_batch(source_table):
"""
@author: youfeng
@email: [email protected]
@license: Apache Licence 
@file: kafka_producer.py
@time: 2016/12/19 16:32
"""
import sys

sys.path.append('../')

from common import util
from common.mongo import MongDb
from logger import Gsxtlogger

log = Gsxtlogger(
    util.get_pid_log_name('copy_data_to_online_all_list')).get_logger()

count = 0

mongo_db_crawl_data = {
    'host': '172.16.215.2',
    'port': 40042,
    'db': 'crawl_data',
    'username': '******',
    'password': '******'
}

mongo_db_gov = {
    'host': '172.16.215.16',
    'port': 40042,
    'db': 'app_data_test',
Example #4
0
"""
from common.mongo import MongDb
from logger import Gsxtlogger

mongo_db_conf = {
    'host': '172.16.215.16',
    'port': 40042,
    'db': 'app_data',
    'username': '******',
    'password': '******'
}
table_name = 'enterprise_data_gov'

conf_name = 'company_list.csv'

global_logger = Gsxtlogger('judge_exist.log')
log = global_logger.get_logger()


def main():
    source_db = MongDb(mongo_db_conf['host'],
                       mongo_db_conf['port'],
                       mongo_db_conf['db'],
                       mongo_db_conf['username'],
                       mongo_db_conf['password'],
                       log=log)

    count = 0
    total = 0
    already = 0
    with open(conf_name) as p_file:
Example #5
0
@time: 2016/12/19 16:32
"""
import json
import sys

import pymongo
import time

from tools.pybeanstalk import PyBeanstalk

sys.path.append('../')
from common import util
from common.mongo import MongDb
from logger import Gsxtlogger

log = Gsxtlogger('xinghe.log').get_logger()

province_zh_to_py = {
    '上海': 'shanghai',
    '云南': 'yunnan',
    '内蒙古': 'neimenggu',
    '北京': 'beijing',
    '吉林': 'jilin',
    '四川': 'sichuan',
    '天津': 'tianjin',
    '宁夏': 'ningxia',
    '安徽': 'anhui',
    '山东': 'shandong',
    '山西': 'shanxicu',
    '广东': 'guangdong',
    '广西': 'guangxi',
@email: [email protected]
@license: Apache Licence
@file: kafka_producer.py
@time: 2016/12/19 16:32
"""
import sys

import pymongo
import time

sys.path.append('../')
from common import util
from common.mongo import MongDb
from logger import Gsxtlogger

log = Gsxtlogger('send_all_company_offline_list.log').get_logger()

province_zh_to_py = {
    '上海': 'shanghai',
    '云南': 'yunnan',
    '内蒙古': 'neimenggu',
    '北京': 'beijing',
    '吉林': 'jilin',
    '四川': 'sichuan',
    '天津': 'tianjin',
    '宁夏': 'ningxia',
    '安徽': 'anhui',
    '山东': 'shandong',
    '山西': 'shanxicu',
    '广东': 'guangdong',
    '广西': 'guangxi',
Example #7
0
@email: [email protected]
@license: Apache Licence
@file: kafka_producer.py
@time: 2016/12/19 16:32
"""
import sys

import pymongo
import time

sys.path.append('../')
from common import util
from common.mongo import MongDb
from logger import Gsxtlogger

log = Gsxtlogger('company_name_repair.log').get_logger()

province_zh_to_py = {
    '上海': 'shanghai',
    '云南': 'yunnan',
    '内蒙古': 'neimenggu',
    '北京': 'beijing',
    '吉林': 'jilin',
    '四川': 'sichuan',
    '天津': 'tianjin',
    '宁夏': 'ningxia',
    '安徽': 'anhui',
    '山东': 'shandong',
    '山西': 'shanxicu',
    '广东': 'guangdong',
    '广西': 'guangxi',
Example #8
0
@author: youfeng
@email: [email protected]
@license: Apache Licence
@file: find_in_gsxt.py
@time: 2017/8/11 14:32
"""
import sys

import pandas

sys.path.append('../')
from common.mongo import MongDb

from logger import Gsxtlogger

log = Gsxtlogger('find_in_gsxt.log').get_logger()

db_conf = {
    'host': '172.16.215.16',
    'port': 40042,
    'db': 'app_data',
    'username': '******',
    'password': '******',
}

source_db = MongDb(db_conf['host'],
                   db_conf['port'],
                   db_conf['db'],
                   db_conf['username'],
                   db_conf['password'],
                   log=log)
Example #9
0
#!/usr/bin/env python
# encoding: utf-8
"""
@author: youfeng
@email: [email protected]
@license: Apache Licence 
@file: clean_online_all_search.py
@time: 2017/2/26 15:40
"""
import sys

sys.path.append('../')
from logger import Gsxtlogger

log = Gsxtlogger('clean_online_all_search.log').get_logger()
from common.global_resource import source_db
'''
data = {
            # 以搜索列表名与省份信息作为唯一主键
            '_id': util.generator_id({'priority': priority}, search_name, self.province),
            'company_name': company,
            'search_name': search_name,
            'province': self.province,
            'in_time': util.get_now_time(),
            'param': param,
            'rank': rank,
            self.crawl_flag: 0,
            'priority': priority,
        }
'''
#!/usr/bin/env python
# encoding: utf-8
"""
@author: youfeng
@email: [email protected]
@license: Apache Licence 
@file: clean_online_all_search.py
@time: 2017/2/26 15:40
"""
import sys

sys.path.append('../')
from common import util
from logger import Gsxtlogger

log = Gsxtlogger('copy_cs2_online_all_search.log').get_logger()
from common.global_resource import source_db
'''
data = {
            # 以搜索列表名与省份信息作为唯一主键
            '_id': util.generator_id({'priority': priority}, search_name, self.province),
            'company_name': company,
            'search_name': search_name,
            'province': self.province,
            'in_time': util.get_now_time(),
            'param': param,
            'rank': rank,
            self.crawl_flag: 0,
            'priority': priority,
        }
'''
from common import util
from common.mongo import MongDb
from common.pybeanstalk import PyBeanstalk
from logger import Gsxtlogger
from tools.crawl_conf import company_info

mongo_db_source = {
    'host': '172.16.215.2',
    'port': 40042,
    'db': 'company_data',
    'username': '******',
    'password': '******'
}

global_logger = Gsxtlogger('insert_company.log')
global_log = global_logger.get_logger()

# Search-list storage table
source_db = MongDb(mongo_db_source['host'],
                   mongo_db_source['port'],
                   mongo_db_source['db'],
                   mongo_db_source['username'],
                   mongo_db_source['password'],
                   log=global_log)

beanstalk_consumer_conf = {'host': 'cs0.sz-internal.haizhi.com', 'port': 11400}
beanstalk = PyBeanstalk(beanstalk_consumer_conf['host'],
                        beanstalk_consumer_conf['port'])

Example #12
0
#!/usr/bin/env python
# encoding: utf-8
"""
@author: youfeng
@email: [email protected]
@license: Apache Licence
@file: proxy_check.py
@time: 2017/10/12 00:24
"""
import json

import requests

from logger import Gsxtlogger

log = Gsxtlogger('proxy_check.log').get_logger()


def local_proxy():
    """Return the fixed placeholder proxy value ``'test'``."""
    proxy = 'test'
    return proxy


def get_proxy():
    proxy_url = 'http://101.132.128.78:18585/proxy'

    user_config = {
        'username': '******',
        'password': '******',
    }
    try:
        r = requests.post(proxy_url, json=user_config)
@email: [email protected]
@license: Apache Licence 
@file: test_mongodb_result_total.py
@time: 2016/12/9 22:22
"""
import sys
import time

sys.path.append('../')
from common.config_parser import ConfigParser
from config.conf import mongo_db_target
from common.mongo import MongDb
from logger import Gsxtlogger

# Set up logging
log = Gsxtlogger('mongodb_count_total.py.log', for_mat='').get_logger()

target_db = MongDb(mongo_db_target['host'], mongo_db_target['port'], mongo_db_target['db'],
                   mongo_db_target['username'], mongo_db_target['password'], log=log)


def total_source(config_list):
    """Iterate the configuration entries and read each one's source table.

    NOTE(review): only the opening part of this function is visible here;
    this description covers the visible statements only.

    Args:
        config_list: mapping whose values support ``get``; each value is
            queried for a ``'source_table'`` entry. ``iteritems()`` is
            called on it, so a Python 2 dict is presumably expected --
            TODO confirm.
    """
    # Timestamp taken before the work starts.
    start_time = time.time()

    log.info('开始统计种子: ')
    province_total_list = []
    count_total = 0
    for key, value in config_list.iteritems():
        source_table = value.get('source_table', None)
        if source_table is None:
            # The entry has no 'source_table' value: report it.
            log.error('读取表信息错误')
# encoding: utf-8
"""
@author: youfeng
@email: [email protected]
@license: Apache Licence 
@file: kafka_producer.py
@time: 2016/12/19 16:32
"""
import sys

sys.path.append('../')
from common import util
from common.mongo import MongDb
from logger import Gsxtlogger

log = Gsxtlogger('copy_data_to_offline_all_list.log').get_logger()

mongo_db_company_data = {
    'host': '172.16.215.2',
    'port': 40042,
    'db': 'company_data',
    'username': '******',
    'password': '******'
}

mongo_db_schedule_data = {
    'host': '172.16.215.2',
    'port': 40042,
    'db': 'schedule_data',
    'username': '******',
    'password': '******'
Example #15
0
# 'config/online_gsxt_crawl.conf' 'shanghai'


# Build the name of the log file
def get_log_name():
    """Return the log file name derived from the command-line arguments.

    When at least two arguments are given and the first one contains
    ``config/<name>.conf``, the result is ``<name>_<second argument>.log``
    (e.g. ``config/online_gsxt_crawl.conf shanghai`` gives
    ``online_gsxt_crawl_shanghai.log``). Otherwise the default
    ``start_online_crawl.log`` is returned.

    Returns:
        str: the log file name.
    """
    log_name = 'start_online_crawl.log'
    if len(sys.argv) > 2:
        # Raw string so that ``\.`` reaches the regex engine as an escaped
        # dot instead of being an invalid string escape sequence.
        matches = re.findall(r'config/(.*?)\.conf', sys.argv[1])
        if matches:
            log_name = matches[0] + '_' + sys.argv[2] + '.log'

    return log_name


global_logger = Gsxtlogger(get_log_name())
global_log = global_logger.get_logger()

# Old web page database
target_db = MongDb(mongo_db_target['host'],
                   mongo_db_target['port'],
                   mongo_db_target['db'],
                   mongo_db_target['username'],
                   mongo_db_target['password'],
                   log=global_log)

# New web page database
# target_db_new = MongDb(mongo_db_target_new['host'], mongo_db_target_new['port'], mongo_db_target_new['db'],
#                        mongo_db_target_new['username'],
#                        mongo_db_target_new['password'], log=global_log)
Example #16
0
    "host": "172.16.215.2",
    "port": 40042,
    "db": "crawl_data_new",
    "username": "******",
    "password": "******",
}

mongo_db_webpage_old = {
    "host": "172.16.215.2",
    "port": 40042,
    "db": "crawl_data",
    "username": "******",
    "password": "******",
}

log = Gsxtlogger('find_equity_field.log').get_logger()

target_db_new = MongDb(mongo_db_webpage_new['host'], mongo_db_webpage_new['port'], mongo_db_webpage_new['db'],
                       mongo_db_webpage_new['username'], mongo_db_webpage_new['password'], log=log)

target_db_old = MongDb(mongo_db_webpage_old['host'], mongo_db_webpage_old['port'], mongo_db_webpage_old['db'],
                       mongo_db_webpage_old['username'], mongo_db_webpage_old['password'], log=log)

mail_from_addr = '*****@*****.**'
mail_password = '******'
mail_to_addrs = ['*****@*****.**']


def send_email(from_addr, password, to_addrs, subject, msg, smtp_host="smtp.weibangong.com", smtp_port=465):
    """Create an SMTP client for ``smtp_host:smtp_port`` and log in.

    NOTE(review): only the connect and login steps are visible here;
    ``to_addrs``, ``subject`` and ``msg`` are not used in the visible part.

    Args:
        from_addr: account name passed to ``login``.
        password: password passed to ``login``.
        to_addrs: recipients (unused in the visible part).
        subject: message subject (unused in the visible part).
        msg: message body (unused in the visible part).
        smtp_host: server host handed to ``SMTP``.
        smtp_port: server port handed to ``SMTP``.
    """
    # ``SMTP`` is imported outside this view -- presumably from smtplib.
    # NOTE(review): 465 is conventionally the implicit-TLS port; confirm the
    # imported class actually speaks TLS on connect.
    email_client = SMTP(smtp_host, smtp_port)
    email_client.login(from_addr, password)
# encoding: utf-8
"""
@author: youfeng
@email: [email protected]
@license: Apache Licence 
@file: kafka_producer.py
@time: 2016/12/19 16:32
"""
import sys

sys.path.append('../')
from common import util
from common.mongo import MongDb
from logger import Gsxtlogger

log = Gsxtlogger('inseart_data_to_offline_all_list.py.log').get_logger()

mongo_db_company_data = {
    'host': '172.16.215.2',
    'port': 40042,
    'db': 'company_data',
    'username': '******',
    'password': '******'
}

source_db = MongDb(mongo_db_company_data['host'], mongo_db_company_data['port'], mongo_db_company_data['db'],
                   mongo_db_company_data['username'], mongo_db_company_data['password'], log=log)

app_data_config = {
    'host': '172.16.215.16',
    'port': 40042,
Example #18
0
@email: [email protected]
@license: Apache Licence
@file: kafka_producer.py
@time: 2016/12/19 16:32
"""
import sys

import pymongo
import time

sys.path.append('../')
from common import util
from common.mongo import MongDb
from logger import Gsxtlogger

log = Gsxtlogger('update_new_company_to_offline_all_list.log').get_logger()

province_zh_to_py = {
    '上海': 'shanghai',
    '云南': 'yunnan',
    '内蒙古': 'neimenggu',
    '北京': 'beijing',
    '吉林': 'jilin',
    '四川': 'sichuan',
    '天津': 'tianjin',
    '宁夏': 'ningxia',
    '安徽': 'anhui',
    '山东': 'shandong',
    '山西': 'shanxicu',
    '广东': 'guangdong',
    '广西': 'guangxi',
# encoding: utf-8
"""
@author: youfeng
@email: [email protected]
@license: Apache Licence 
@file: kafka_producer.py
@time: 2016/12/19 16:32
"""
import sys

sys.path.append('../')
from common import util
from common.mongo import MongDb
from logger import Gsxtlogger

log = Gsxtlogger('update_data_to_offline_all_list.log').get_logger()

mongo_db_company_data = {
    'host': '172.16.215.2',
    'port': 40042,
    'db': 'company_data',
    'username': '******',
    'password': '******'
}

source_db = MongDb(mongo_db_company_data['host'], mongo_db_company_data['port'], mongo_db_company_data['db'],
                   mongo_db_company_data['username'], mongo_db_company_data['password'], log=log)


#
def main():
Example #20
0
    'host': '172.16.215.2',
    'port': 40042,
    'db': 'company_data',
    'username': '******',
    'password': '******'
}

app_data_conf = {
    'host': '172.16.215.16',
    'port': 40042,
    'db': 'app_data',
    'username': '******',
    'password': '******'
}

log = Gsxtlogger('count_gansu.log').get_logger()

company_data_db = MongDb(company_data_conf['host'],
                         company_data_conf['port'],
                         company_data_conf['db'],
                         company_data_conf['username'],
                         company_data_conf['password'],
                         log=log)

app_data_db = MongDb(app_data_conf['host'],
                     app_data_conf['port'],
                     app_data_conf['db'],
                     app_data_conf['username'],
                     app_data_conf['password'],
                     log=log)
    'host': '172.16.215.2',
    'port': 40042,
    'db': 'company_data',
    'username': '******',
    'password': '******'
}

mongo_db_target = {
    'host': "103.36.136.211",
    'port': 40042,
    'db': 'company_data',
    "username": '******',
    "password": '******',
}

log = Gsxtlogger('copy_data_to_beihai.log').get_logger()
source_db = MongDb(mongo_db_source['host'], mongo_db_source['port'], mongo_db_source['db'],
                   mongo_db_source['username'], mongo_db_source['password'], log=log)

target_db = MongDb(mongo_db_target['host'], mongo_db_target['port'], mongo_db_target['db'],
                   mongo_db_target['username'], mongo_db_target['password'], log=log)


def main():
    """Traverse ``offline_all_list`` in ``source_db`` and reset ``crawl_online``.

    NOTE(review): only the start of the loop is visible here; what happens
    to each item after the flag is set, and how ``result_list`` and
    ``count`` are used, cannot be told from this fragment.
    """
    collection_table = 'offline_all_list'

    log.info("开始导入数据..")
    result_list = []
    count = 0
    for item in source_db.traverse(collection_table):
        # Set the crawl_online field of the traversed document to 0.
        item['crawl_online'] = 0
Example #22
0
#!/usr/bin/env python
# encoding: utf-8
"""
@author: youfeng
@email: [email protected]
@license: Apache Licence
@file: guizhou.py
@time: 2017/7/24 22:35
"""

import requests

from logger import Gsxtlogger

log = Gsxtlogger('guizhou.log').get_logger()


def get_http():
    """Download the proxy list and return its entries.

    Returns:
        list: the non-blank lines of the response body, stripped of
        surrounding whitespace. An empty list is returned when the server
        does not answer with HTTP 200.
    """
    url = 'http://112.74.163.187:23128/__static__/proxies.txt'
    response = requests.get(url)
    if response.status_code != 200:
        return []

    # splitlines() copes with both "\n" and "\r\n" endings; blank lines
    # (for example the one produced by a trailing newline) are dropped so
    # callers never receive an empty proxy entry.
    stripped = (line.strip() for line in response.text.splitlines())
    return [entry for entry in stripped if entry]


def main():
    ip_list = get_http()
    for ip in ip_list:
Example #23
0
@email: [email protected]
@license: Apache Licence
@file: kafka_producer.py
@time: 2016/12/19 16:32
"""
import sys

import pymongo
import time

sys.path.append('../')
from common import util
from common.mongo import MongDb
from logger import Gsxtlogger

log = Gsxtlogger(
    'update_guangdong_new_company_to_offline_all_list.log').get_logger()

province_zh_to_py = {
    '上海': 'shanghai',
    '云南': 'yunnan',
    '内蒙古': 'neimenggu',
    '北京': 'beijing',
    '吉林': 'jilin',
    '四川': 'sichuan',
    '天津': 'tianjin',
    '宁夏': 'ningxia',
    '安徽': 'anhui',
    '山东': 'shandong',
    '山西': 'shanxicu',
    '广东': 'guangdong',
    '广西': 'guangxi',
@author: youfeng
@email: [email protected]
@license: Apache Licence
@file: kafka_producer.py
@time: 2016/12/19 16:32
"""
import sys

sys.path.append('../')
from common import util

from common.mongo import MongDb
from config.conf import mongo_db_source
from logger import Gsxtlogger

log = Gsxtlogger('search_list_data_clean.log').get_logger()

count = 0

source_db = MongDb(mongo_db_source['host'],
                   mongo_db_source['port'],
                   mongo_db_source['db'],
                   mongo_db_source['username'],
                   mongo_db_source['password'],
                   log=log)


def main():
    """Start the clean-up run for the ``cs2_online_all_search`` table.

    NOTE(review): only the setup statements are visible here; the actual
    processing is outside this view.
    """
    # Local counter; it shadows the module-level ``count`` inside this function.
    count = 0
    log.info('开始清洗数据')
    source_table = 'cs2_online_all_search'