def search_task(): log = Gsxtlogger('hunan.log').get_logger() mongo_db_conf = { 'host': '172.16.215.16', 'port': 40042, 'db': 'app_data', 'username': '******', 'password': '******' } # 搜索列表存储表 source_db = MongDb(mongo_db_conf['host'], mongo_db_conf['port'], mongo_db_conf['db'], mongo_db_conf['username'], mongo_db_conf['password'], log=log) for company in data_list: item = source_db.find_one('enterprise_data_gov', {'company': company}) if item is None: log.error(company) continue if 'shareholder_information' not in item: log.warn(company) continue
sys.path.append('../') from common import util from common.mongo import MongDb from logger import Gsxtlogger db_conf = { 'host': '172.16.215.16', 'port': 40042, 'db': 'app_data', 'username': '******', 'password': '******', } log = Gsxtlogger('clean_enterprise_data_gov_src.log').get_logger() source_db = MongDb(db_conf['host'], db_conf['port'], db_conf['db'], db_conf['username'], db_conf['password'], log=log) def main(): result_list = [] source_table = 'enterprise_data_gov' count = 0 deal_total = 0 for item in source_db.traverse_batch(source_table):
""" @author: youfeng @email: [email protected] @license: Apache Licence @file: kafka_producer.py @time: 2016/12/19 16:32 """ import sys sys.path.append('../') from common import util from common.mongo import MongDb from logger import Gsxtlogger log = Gsxtlogger( util.get_pid_log_name('copy_data_to_online_all_list')).get_logger() count = 0 mongo_db_crawl_data = { 'host': '172.16.215.2', 'port': 40042, 'db': 'crawl_data', 'username': '******', 'password': '******' } mongo_db_gov = { 'host': '172.16.215.16', 'port': 40042, 'db': 'app_data_test',
""" from common.mongo import MongDb from logger import Gsxtlogger mongo_db_conf = { 'host': '172.16.215.16', 'port': 40042, 'db': 'app_data', 'username': '******', 'password': '******' } table_name = 'enterprise_data_gov' conf_name = 'company_list.csv' global_logger = Gsxtlogger('judge_exist.log') log = global_logger.get_logger() def main(): source_db = MongDb(mongo_db_conf['host'], mongo_db_conf['port'], mongo_db_conf['db'], mongo_db_conf['username'], mongo_db_conf['password'], log=log) count = 0 total = 0 already = 0 with open(conf_name) as p_file:
@time: 2016/12/19 16:32 """ import json import sys import pymongo import time from tools.pybeanstalk import PyBeanstalk sys.path.append('../') from common import util from common.mongo import MongDb from logger import Gsxtlogger log = Gsxtlogger('xinghe.log').get_logger() province_zh_to_py = { '上海': 'shanghai', '云南': 'yunnan', '内蒙古': 'neimenggu', '北京': 'beijing', '吉林': 'jilin', '四川': 'sichuan', '天津': 'tianjin', '宁夏': 'ningxia', '安徽': 'anhui', '山东': 'shandong', '山西': 'shanxicu', '广东': 'guangdong', '广西': 'guangxi',
@email: [email protected] @license: Apache Licence @file: kafka_producer.py @time: 2016/12/19 16:32 """ import sys import pymongo import time sys.path.append('../') from common import util from common.mongo import MongDb from logger import Gsxtlogger log = Gsxtlogger('send_all_company_offline_list.log').get_logger() province_zh_to_py = { '上海': 'shanghai', '云南': 'yunnan', '内蒙古': 'neimenggu', '北京': 'beijing', '吉林': 'jilin', '四川': 'sichuan', '天津': 'tianjin', '宁夏': 'ningxia', '安徽': 'anhui', '山东': 'shandong', '山西': 'shanxicu', '广东': 'guangdong', '广西': 'guangxi',
@email: [email protected] @license: Apache Licence @file: kafka_producer.py @time: 2016/12/19 16:32 """ import sys import pymongo import time sys.path.append('../') from common import util from common.mongo import MongDb from logger import Gsxtlogger log = Gsxtlogger('company_name_repair.log').get_logger() province_zh_to_py = { '上海': 'shanghai', '云南': 'yunnan', '内蒙古': 'neimenggu', '北京': 'beijing', '吉林': 'jilin', '四川': 'sichuan', '天津': 'tianjin', '宁夏': 'ningxia', '安徽': 'anhui', '山东': 'shandong', '山西': 'shanxicu', '广东': 'guangdong', '广西': 'guangxi',
@author: youfeng @email: [email protected] @license: Apache Licence @file: find_in_gsxt.py @time: 2017/8/11 14:32 """ import sys import pandas sys.path.append('../') from common.mongo import MongDb from logger import Gsxtlogger log = Gsxtlogger('find_in_gsxt.log').get_logger() db_conf = { 'host': '172.16.215.16', 'port': 40042, 'db': 'app_data', 'username': '******', 'password': '******', } source_db = MongDb(db_conf['host'], db_conf['port'], db_conf['db'], db_conf['username'], db_conf['password'], log=log)
#!/usr/bin/env python
# encoding: utf-8
"""
@author: youfeng
@email: [email protected]
@license: Apache Licence
@file: clean_online_all_search.py
@time: 2017/2/26 15:40
"""

import sys

# Extend the module search path with '../' (resolved against the working
# directory) before the project imports below.
sys.path.append('../')

from logger import Gsxtlogger

# Logger obtained from the project's Gsxtlogger wrapper.
log = Gsxtlogger('clean_online_all_search.log').get_logger()

# Imported only after the logger exists, preserving the original order.
from common.global_resource import source_db

# Reference sketch of a search-list record (documentation only, never
# executed):
#
# data = {
#     # The search-list name together with the province forms the unique
#     # primary key.
#     '_id': util.generator_id({'priority': priority}, search_name, self.province),
#     'company_name': company,
#     'search_name': search_name,
#     'province': self.province,
#     'in_time': util.get_now_time(),
#     'param': param,
#     'rank': rank,
#     self.crawl_flag: 0,
#     'priority': priority,
# }
#!/usr/bin/env python
# encoding: utf-8
"""
@author: youfeng
@email: [email protected]
@license: Apache Licence
@file: clean_online_all_search.py
@time: 2017/2/26 15:40
"""
# NOTE(review): the @file entry above does not match the logger name used
# below ('copy_cs2_online_all_search.log'); presumably the header was copied
# from another script - confirm and correct.

import sys

# Extend the module search path with '../' (resolved against the working
# directory) before the project imports below.
sys.path.append('../')

from common import util
from logger import Gsxtlogger

# Logger obtained from the project's Gsxtlogger wrapper.
log = Gsxtlogger('copy_cs2_online_all_search.log').get_logger()

# Imported only after the logger exists, preserving the original order.
from common.global_resource import source_db

# Reference sketch of a search-list record (documentation only, never
# executed):
#
# data = {
#     # The search-list name together with the province forms the unique
#     # primary key.
#     '_id': util.generator_id({'priority': priority}, search_name, self.province),
#     'company_name': company,
#     'search_name': search_name,
#     'province': self.province,
#     'in_time': util.get_now_time(),
#     'param': param,
#     'rank': rank,
#     self.crawl_flag: 0,
#     'priority': priority,
# }
from common import util
from common.mongo import MongDb
from common.pybeanstalk import PyBeanstalk
from logger import Gsxtlogger
from tools.crawl_conf import company_info

# Connection settings for the 'company_data' MongoDB database.
mongo_db_source = {
    'host': '172.16.215.2',
    'port': 40042,
    'db': 'company_data',
    'username': '******',
    'password': '******'
}

# The wrapper object and the logger it hands out are both kept at module
# level.
global_logger = Gsxtlogger('insert_company.log')
global_log = global_logger.get_logger()

# Database handle for the table that stores the search list.
source_db = MongDb(
    mongo_db_source['host'],
    mongo_db_source['port'],
    mongo_db_source['db'],
    mongo_db_source['username'],
    mongo_db_source['password'],
    log=global_log
)

# Beanstalk endpoint and the client built from it.
beanstalk_consumer_conf = {'host': 'cs0.sz-internal.haizhi.com', 'port': 11400}
beanstalk = PyBeanstalk(
    beanstalk_consumer_conf['host'],
    beanstalk_consumer_conf['port']
)
#!/usr/bin/env python # encoding: utf-8 """ @author: youfeng @email: [email protected] @license: Apache Licence @file: proxy_check.py @time: 2017/10/12 00:24 """ import json import requests from logger import Gsxtlogger log = Gsxtlogger('proxy_check.log').get_logger() def local_proxy(): return 'test' def get_proxy(): proxy_url = 'http://101.132.128.78:18585/proxy' user_config = { 'username': '******', 'password': '******', } try: r = requests.post(proxy_url, json=user_config)
@email: [email protected] @license: Apache Licence @file: test_mongodb_result_total.py @time: 2016/12/9 22:22 """ import sys import time sys.path.append('../') from common.config_parser import ConfigParser from config.conf import mongo_db_target from common.mongo import MongDb from logger import Gsxtlogger # 开启日志 log = Gsxtlogger('mongodb_count_total.py.log', for_mat='').get_logger() target_db = MongDb(mongo_db_target['host'], mongo_db_target['port'], mongo_db_target['db'], mongo_db_target['username'], mongo_db_target['password'], log=log) def total_source(config_list): start_time = time.time() log.info('开始统计种子: ') province_total_list = [] count_total = 0 for key, value in config_list.iteritems(): source_table = value.get('source_table', None) if source_table is None: log.error('读取表信息错误')
# encoding: utf-8 """ @author: youfeng @email: [email protected] @license: Apache Licence @file: kafka_producer.py @time: 2016/12/19 16:32 """ import sys sys.path.append('../') from common import util from common.mongo import MongDb from logger import Gsxtlogger log = Gsxtlogger('copy_data_to_offline_all_list.log').get_logger() mongo_db_company_data = { 'host': '172.16.215.2', 'port': 40042, 'db': 'company_data', 'username': '******', 'password': '******' } mongo_db_schedule_data = { 'host': '172.16.215.2', 'port': 40042, 'db': 'schedule_data', 'username': '******', 'password': '******'
# 'config/online_gsxt_crawl.conf' 'shanghai'
# (presumably an example of the two command-line arguments - confirm)
def get_log_name():
    """Return the name of the log file for this run.

    When at least two command-line arguments are given and the first one
    contains ``config/<name>.conf``, the result is
    ``<name>_<second argument>.log``. In every other case the default
    ``start_online_crawl.log`` is returned.
    """
    default_name = 'start_online_crawl.log'
    if len(sys.argv) <= 2:
        return default_name

    # Raw string: '\.' inside a plain literal is an invalid escape sequence
    # that newer Python versions warn about.
    match = re.search(r'config/(.*?)\.conf', sys.argv[1])
    if match is None:
        return default_name
    return match.group(1) + '_' + sys.argv[2] + '.log'


global_logger = Gsxtlogger(get_log_name())
global_log = global_logger.get_logger()

# Old web-page store.
target_db = MongDb(mongo_db_target['host'], mongo_db_target['port'], mongo_db_target['db'],
                   mongo_db_target['username'],
                   mongo_db_target['password'], log=global_log)

# New web-page store (currently disabled).
# target_db_new = MongDb(mongo_db_target_new['host'], mongo_db_target_new['port'], mongo_db_target_new['db'],
#                        mongo_db_target_new['username'],
#                        mongo_db_target_new['password'], log=global_log)
"host": "172.16.215.2", "port": 40042, "db": "crawl_data_new", "username": "******", "password": "******", } mongo_db_webpage_old = { "host": "172.16.215.2", "port": 40042, "db": "crawl_data", "username": "******", "password": "******", } log = Gsxtlogger('find_equity_field.log').get_logger() target_db_new = MongDb(mongo_db_webpage_new['host'], mongo_db_webpage_new['port'], mongo_db_webpage_new['db'], mongo_db_webpage_new['username'], mongo_db_webpage_new['password'], log=log) target_db_old = MongDb(mongo_db_webpage_old['host'], mongo_db_webpage_old['port'], mongo_db_webpage_old['db'], mongo_db_webpage_old['username'], mongo_db_webpage_old['password'], log=log) mail_from_addr = '*****@*****.**' mail_password = '******' mail_to_addrs = ['*****@*****.**'] def send_email(from_addr, password, to_addrs, subject, msg, smtp_host="smtp.weibangong.com", smtp_port=465): email_client = SMTP(smtp_host, smtp_port) email_client.login(from_addr, password)
# encoding: utf-8 """ @author: youfeng @email: [email protected] @license: Apache Licence @file: kafka_producer.py @time: 2016/12/19 16:32 """ import sys sys.path.append('../') from common import util from common.mongo import MongDb from logger import Gsxtlogger log = Gsxtlogger('inseart_data_to_offline_all_list.py.log').get_logger() mongo_db_company_data = { 'host': '172.16.215.2', 'port': 40042, 'db': 'company_data', 'username': '******', 'password': '******' } source_db = MongDb(mongo_db_company_data['host'], mongo_db_company_data['port'], mongo_db_company_data['db'], mongo_db_company_data['username'], mongo_db_company_data['password'], log=log) app_data_config = { 'host': '172.16.215.16', 'port': 40042,
@email: [email protected] @license: Apache Licence @file: kafka_producer.py @time: 2016/12/19 16:32 """ import sys import pymongo import time sys.path.append('../') from common import util from common.mongo import MongDb from logger import Gsxtlogger log = Gsxtlogger('update_new_company_to_offline_all_list.log').get_logger() province_zh_to_py = { '上海': 'shanghai', '云南': 'yunnan', '内蒙古': 'neimenggu', '北京': 'beijing', '吉林': 'jilin', '四川': 'sichuan', '天津': 'tianjin', '宁夏': 'ningxia', '安徽': 'anhui', '山东': 'shandong', '山西': 'shanxicu', '广东': 'guangdong', '广西': 'guangxi',
# encoding: utf-8
"""
@author: youfeng
@email: [email protected]
@license: Apache Licence
@file: kafka_producer.py
@time: 2016/12/19 16:32
"""

import sys

# Extend the module search path with '../' (resolved against the working
# directory) before the project imports below.
sys.path.append('../')

from common import util
from common.mongo import MongDb
from logger import Gsxtlogger

# Logger obtained from the project's Gsxtlogger wrapper.
log = Gsxtlogger('update_data_to_offline_all_list.log').get_logger()

# Connection settings for the 'company_data' MongoDB database.
mongo_db_company_data = {
    'host': '172.16.215.2',
    'port': 40042,
    'db': 'company_data',
    'username': '******',
    'password': '******'
}

# Database handle built from the settings above; it shares the script logger.
source_db = MongDb(
    mongo_db_company_data['host'],
    mongo_db_company_data['port'],
    mongo_db_company_data['db'],
    mongo_db_company_data['username'],
    mongo_db_company_data['password'],
    log=log
)

# def main():
'host': '172.16.215.2', 'port': 40042, 'db': 'company_data', 'username': '******', 'password': '******' } app_data_conf = { 'host': '172.16.215.16', 'port': 40042, 'db': 'app_data', 'username': '******', 'password': '******' } log = Gsxtlogger('count_gansu.log').get_logger() company_data_db = MongDb(company_data_conf['host'], company_data_conf['port'], company_data_conf['db'], company_data_conf['username'], company_data_conf['password'], log=log) app_data_db = MongDb(app_data_conf['host'], app_data_conf['port'], app_data_conf['db'], app_data_conf['username'], app_data_conf['password'], log=log)
'host': '172.16.215.2', 'port': 40042, 'db': 'company_data', 'username': '******', 'password': '******' } mongo_db_target = { 'host': "103.36.136.211", 'port': 40042, 'db': 'company_data', "username": '******', "password": '******', } log = Gsxtlogger('copy_data_to_beihai.log').get_logger() source_db = MongDb(mongo_db_source['host'], mongo_db_source['port'], mongo_db_source['db'], mongo_db_source['username'], mongo_db_source['password'], log=log) target_db = MongDb(mongo_db_target['host'], mongo_db_target['port'], mongo_db_target['db'], mongo_db_target['username'], mongo_db_target['password'], log=log) def main(): collection_table = 'offline_all_list' log.info("开始导入数据..") result_list = [] count = 0 for item in source_db.traverse(collection_table): item['crawl_online'] = 0
#!/usr/bin/env python # encoding: utf-8 """ @author: youfeng @email: [email protected] @license: Apache Licence @file: guizhou.py @time: 2017/7/24 22:35 """ import requests from logger import Gsxtlogger log = Gsxtlogger('guizhou.log').get_logger() def get_http(): url = 'http://112.74.163.187:23128/__static__/proxies.txt' r = requests.get(url) if r.status_code != 200: return [] ip_list = r.text.split("\n") return ip_list def main(): ip_list = get_http() for ip in ip_list:
@email: [email protected] @license: Apache Licence @file: kafka_producer.py @time: 2016/12/19 16:32 """ import sys import pymongo import time sys.path.append('../') from common import util from common.mongo import MongDb from logger import Gsxtlogger log = Gsxtlogger( 'update_guangdong_new_company_to_offline_all_list.log').get_logger() province_zh_to_py = { '上海': 'shanghai', '云南': 'yunnan', '内蒙古': 'neimenggu', '北京': 'beijing', '吉林': 'jilin', '四川': 'sichuan', '天津': 'tianjin', '宁夏': 'ningxia', '安徽': 'anhui', '山东': 'shandong', '山西': 'shanxicu', '广东': 'guangdong', '广西': 'guangxi',
@author: youfeng @email: [email protected] @license: Apache Licence @file: kafka_producer.py @time: 2016/12/19 16:32 """ import sys sys.path.append('../') from common import util from common.mongo import MongDb from config.conf import mongo_db_source from logger import Gsxtlogger log = Gsxtlogger('search_list_data_clean.log').get_logger() count = 0 source_db = MongDb(mongo_db_source['host'], mongo_db_source['port'], mongo_db_source['db'], mongo_db_source['username'], mongo_db_source['password'], log=log) def main(): count = 0 log.info('开始清洗数据') source_table = 'cs2_online_all_search'