def main():
    """Report which companies listed in ``conf_name`` are present in ``table_name``.

    The file named by the module-level ``conf_name`` is read one company per
    line. Each company is looked up with ``find_one`` on the ``company``
    field; a ``None`` result is logged as an error, anything else as info.
    Three totals are logged at the end: lines processed, companies found and
    companies not found.

    Blank (whitespace-only) lines are ignored so that they are neither
    queried as an empty company name nor counted in the totals.
    """
    source_db = MongDb(mongo_db_conf['host'], mongo_db_conf['port'], mongo_db_conf['db'],
                       mongo_db_conf['username'], mongo_db_conf['password'], log=log)

    total = 0    # non-blank company lines read
    already = 0  # companies for which a document was returned
    count = 0    # companies for which find_one returned None

    with open(conf_name) as p_file:
        for line in p_file:
            # strip() with no argument removes all surrounding whitespace
            # (spaces, tabs, "\r", "\n") regardless of the order they occur in.
            company = line.strip()
            if not company:
                # Skip blank lines instead of reporting them as missing.
                continue

            total += 1
            item = source_db.find_one(table_name, {'company': company})
            if item is None:
                log.error("当前企业没有抓到: {company}".format(company=company))
                count += 1
            else:
                log.info("已抓到企业: {}".format(company))
                already += 1

    log.info("总共企业数目为: {}".format(total))
    log.info("当前已抓到的个数: {}".format(already))
    log.info("当前总共没有抓到企业数目为: {}".format(count))
def __init__(self, page, log):
    """Store the start page and logger, then build the collaborators.

    :param page: page value kept on the instance and reported in the
        start-up log message.
    :param log: logger kept as ``self.log`` and handed to ``MongDb``.
    """
    self.__page = page
    self.log = log
    self.table = "douban"

    self.log.info("获得 {} 页之后的数据...".format(page))

    # Positional connection settings taken from LocalMongoConfig.
    mongo_settings = (
        LocalMongoConfig.HOST,
        LocalMongoConfig.PORT,
        LocalMongoConfig.DB,
        LocalMongoConfig.USER,
        LocalMongoConfig.PASSWD,
    )
    self.mongo = MongDb(*mongo_settings, log=self.log)

    # The request object is created before the handler, as in the original.
    self.request = self.__init_reqeust()
    self.douban_handler = DouBanInfoHandler()
def __init__(self, secrite_key, token, user_id, log):
    """Store the credentials and logger, then build the request and Mongo handle.

    :param secrite_key: value kept as ``self.secrite_key``.
    :param token: value kept as ``self.token``.
    :param user_id: value kept as ``self.user_id``.
    :param log: logger kept as ``self.log`` and handed to ``MongDb``.
    """
    # Plain attribute assignments first, so they are all in place before
    # the request object is created.
    self.log = log
    self.secrite_key = secrite_key
    self.user_id = user_id
    self.token = token
    self.cp_table = "yizhou_cp"

    self.request = self.__init_reqeust()

    # Positional connection settings taken from LocalMongoConfig.
    mongo_settings = (
        LocalMongoConfig.HOST,
        LocalMongoConfig.PORT,
        LocalMongoConfig.DB,
        LocalMongoConfig.USER,
        LocalMongoConfig.PASSWD,
    )
    self.cp_mongo = MongDb(*mongo_settings, log=self.log)
def search_task():
    """Log companies from ``data_list`` that are missing or lack shareholder data.

    For every entry of ``data_list`` (a name not defined in this function),
    the ``enterprise_data_gov`` table is queried by ``company``:

    * if ``find_one`` returns ``None`` the company is logged at error level;
    * if the returned item has no ``'shareholder_information'`` key the
      company is logged at warning level.

    NOTE(review): the visible loop body ends on a bare ``continue``; the
    function may continue beyond this excerpt -- confirm against the full file.
    """
    log = Gsxtlogger('hunan.log').get_logger()

    # Connection settings passed positionally to MongDb below.
    mongo_db_conf = {
        'host': '172.16.215.16',
        'port': 40042,
        'db': 'app_data',
        'username': '******',
        'password': '******'
    }

    # Search-list storage table
    source_db = MongDb(mongo_db_conf['host'], mongo_db_conf['port'], mongo_db_conf['db'], mongo_db_conf['username'], mongo_db_conf['password'], log=log)

    for company in data_list:
        item = source_db.find_one('enterprise_data_gov', {'company': company})

        # No item returned for this company.
        if item is None:
            log.error(company)
            continue

        # Item returned, but it carries no shareholder information key.
        # NOTE(review): if `log` is a stdlib logging.Logger, `warn` is a
        # deprecated alias of `warning` -- confirm before upgrading Python.
        if 'shareholder_information' not in item:
            log.warn(company)
            continue
'重庆市': 'chongqing', '陕西省': 'shanxi', '总局': 'gsxt' } mongo_db_company_data = { 'host': '172.16.215.2', 'port': 40042, 'db': 'company_data', 'username': '******', 'password': '******' } source_db = MongDb(mongo_db_company_data['host'], mongo_db_company_data['port'], mongo_db_company_data['db'], mongo_db_company_data['username'], mongo_db_company_data['password'], log=log) db_query = pymongo.MongoClient('172.16.215.2', 40042)['schedule_data'] db_query.authenticate('work', 'haizhi') db_query_app_data = pymongo.MongoClient('172.16.215.16', 40042)['app_data'] db_query_app_data.authenticate('work', 'haizhi') # def main(): try: count = 0 log.info('开始读取数据...')
'port': 40042, 'db': 'company_data', 'username': '******', 'password': '******' } mongo_db_target = { 'host': "103.36.136.211", 'port': 40042, 'db': 'company_data', "username": '******', "password": '******', } log = Gsxtlogger('copy_data_to_beihai.log').get_logger() source_db = MongDb(mongo_db_source['host'], mongo_db_source['port'], mongo_db_source['db'], mongo_db_source['username'], mongo_db_source['password'], log=log) target_db = MongDb(mongo_db_target['host'], mongo_db_target['port'], mongo_db_target['db'], mongo_db_target['username'], mongo_db_target['password'], log=log) def main(): collection_table = 'offline_all_list' log.info("开始导入数据..") result_list = [] count = 0 for item in source_db.traverse(collection_table): item['crawl_online'] = 0 result_list.append(item)
from logger import Gsxtlogger log = Gsxtlogger('find_in_gsxt.log').get_logger() db_conf = { 'host': '172.16.215.16', 'port': 40042, 'db': 'app_data', 'username': '******', 'password': '******', } source_db = MongDb(db_conf['host'], db_conf['port'], db_conf['db'], db_conf['username'], db_conf['password'], log=log) def classify(): with open("company_invalid.txt", "w") as invalid_file: with open("company_valid.txt", "w") as valid_file: with open("company_list.txt") as p_file: for line in p_file: company = line.strip("\r").strip("\n").strip() # if source_db.find_one("enterprise_data_gov", {"company": company}) is None: # log.warn("当前企业不存在: {}".format(company)) # else: # log.info("找到企业信息: {}".format(company)) if len(company) <= 15 or len(company) > 90:
'db': 'crawl_data', 'username': '******', 'password': '******' } mongo_db_gov = { 'host': '172.16.215.16', 'port': 40042, 'db': 'app_data_test', 'username': '******', 'password': '******' } crawl_data_db = MongDb(mongo_db_crawl_data['host'], mongo_db_crawl_data['port'], mongo_db_crawl_data['db'], mongo_db_crawl_data['username'], mongo_db_crawl_data['password'], log=log) gov_db = MongDb(mongo_db_gov['host'], mongo_db_gov['port'], mongo_db_gov['db'], mongo_db_gov['username'], mongo_db_gov['password'], log=log) # while True: # data = {'province': 'yunnan', 'company_name': '国网'} # producer.produce(json.dumps(data)) # log.info(count) # # time.sleep()
'password': '******' } app_data_conf = { 'host': '172.16.215.16', 'port': 40042, 'db': 'app_data', 'username': '******', 'password': '******' } log = Gsxtlogger('count_gansu.log').get_logger() company_data_db = MongDb(company_data_conf['host'], company_data_conf['port'], company_data_conf['db'], company_data_conf['username'], company_data_conf['password'], log=log) app_data_db = MongDb(app_data_conf['host'], app_data_conf['port'], app_data_conf['db'], app_data_conf['username'], app_data_conf['password'], log=log) def get_now_time(): from datetime import datetime return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
"db": "crawl_data_new", "username": "******", "password": "******", } mongo_db_webpage_old = { "host": "172.16.215.2", "port": 40042, "db": "crawl_data", "username": "******", "password": "******", } log = Gsxtlogger('find_equity_field.log').get_logger() target_db_new = MongDb(mongo_db_webpage_new['host'], mongo_db_webpage_new['port'], mongo_db_webpage_new['db'], mongo_db_webpage_new['username'], mongo_db_webpage_new['password'], log=log) target_db_old = MongDb(mongo_db_webpage_old['host'], mongo_db_webpage_old['port'], mongo_db_webpage_old['db'], mongo_db_webpage_old['username'], mongo_db_webpage_old['password'], log=log) mail_from_addr = '*****@*****.**' mail_password = '******' mail_to_addrs = ['*****@*****.**'] def send_email(from_addr, password, to_addrs, subject, msg, smtp_host="smtp.weibangong.com", smtp_port=465): email_client = SMTP(smtp_host, smtp_port) email_client.login(from_addr, password) msg['Subject'] = Header(subject, 'utf-8') msg['From'] = from_addr msg['To'] = str(to_addrs)
mongo_db_source = { 'host': '172.16.215.2', 'port': 40042, 'db': 'company_data', 'username': '******', 'password': '******' } global_logger = Gsxtlogger('insert_company.log') global_log = global_logger.get_logger() # 搜索列表存储表 source_db = MongDb(mongo_db_source['host'], mongo_db_source['port'], mongo_db_source['db'], mongo_db_source['username'], mongo_db_source['password'], log=global_log) beanstalk_consumer_conf = {'host': 'cs0.sz-internal.haizhi.com', 'port': 11400} beanstalk = PyBeanstalk(beanstalk_consumer_conf['host'], beanstalk_consumer_conf['port']) def main(search_name, province, unified_social_credit_code, param): item = { "_id": "9c9d8f8b848514f240f54a40b0a0c6f02622b3d87d54d353e525ca58d9dbe312", "province": province, "crawl_online": 0, "error_times": 0,
length = len(sys.argv) if length > 2: search_list = re.findall('config/(.*?)\.conf', sys.argv[1]) if len(search_list) > 0: log_name = search_list[0] + '_' + sys.argv[2] + '.log' return log_name global_logger = Gsxtlogger(get_log_name()) global_log = global_logger.get_logger() # 旧网页库 target_db = MongDb(mongo_db_target['host'], mongo_db_target['port'], mongo_db_target['db'], mongo_db_target['username'], mongo_db_target['password'], log=global_log) # 新网页库 target_db_new = MongDb(mongo_db_target_new['host'], mongo_db_target_new['port'], mongo_db_target_new['db'], mongo_db_target_new['username'], mongo_db_target_new['password'], log=global_log) # 搜索列表存储表 source_db = MongDb(mongo_db_source['host'], mongo_db_source['port'], mongo_db_source['db'],