def save_task(task=None): if task: content = "" try: max_line = ConsumerConfig.log_max_line max_rows = ConsumerConfig.log_max_rows with open(task["log_path"], "r", encoding="utf-8") as f: while max_line: line = f.readline(max_rows) if line: content += line max_line -= 1 else: break except Exception as e: content += "log read error!\n\n" content += e.__repr__() client = get_mongodb_client() coll = client[ConsumerConfig.mongodb_dbs_task][ ConsumerConfig.mongodb_col_task] task.setdefault("log_content", content) task.setdefault("end_time", timekiller.get_now().strftime("%Y-%m-%d %H:%M:%S")) coll.insert_one(task) client.close()
def get_vector_wordsTemplate(): cza = get_mongodb_client()['news']['train'] titles = set() for new in cza.find({}, {'_id': 0, 'label': 1, '标题': 1}): titles |= set(cutTitle(new['标题'])) with open('wordsTemplate.json', 'w') as f: f.write(json.dumps(list(titles), ensure_ascii=False))
def trainDataSet(): # mongoClient = get_mongodb_client()['news']['train'] with open('keyCounter.json', 'r') as f: keyCounter = list(json.loads(f.read()).keys()) with open('wordsTemplate.json', 'r') as f: json_data = json.loads(f.read()) # with open('train.txt', 'w+') as f: # for key in keyCounter: # index_dict = {} # for new in mongoClient.find({'label': key}, {'_id': 0, 'label': 1, '标题': 1}): # for word in cutTitle(new['标题']): # if word in wordsTemplate: # index = wordsTemplate.index(word) # index_dict[index] = index_dict.get(index, 0) + 1 # sumValue = sum(index_dict.values()) # index_dict = {key: value / sumValue for key, value in index_dict.items()} # f.write(json.dumps(index_dict, ensure_ascii=False) + '|' + key + '\n') # del index_dict # print(key, 'done') cza = get_mongodb_client()['news']['train'] with open('train.txt', 'w+') as f: for key in keyCounter: index_dict = {} jss = 0 # 训练数据是不是需要稍微限制一下。还是需要限制一下,应该本身自己分得就不是很对 for new in cza.find({'label': key}, { '_id': 0, 'label': 1, '标题': 1 }): if jss == 5001: # 限制5k把 break jss += 1 for word in cutTitle(new['标题']): if word in json_data: index = json_data.index(word) index_dict[index] = index_dict.get(index, 0) + 1 dict_values = index_dict.values() dict_values_max = max(dict_values) dict_values_min = min(dict_values) if dict_values_max == dict_values_min: if dict_values_min == 1: continue else: dict_values_min = dict_values_min / 2 denominator = dict_values_max - dict_values_min new_index_dict = {} for index_dict_key, index_dict_value in index_dict.items(): v = (index_dict_value - dict_values_min) / denominator if v < 0.8: v = min(max(v, 0.03), 0.06) elif v < 0.5: pass elif v < 0.7: v = v / 1.3 else: v = v / 5 new_index_dict[index_dict_key] = float(v) f.write(json.dumps(new_index_dict) + '|' + key + '\n') del index_dict, new_index_dict print(key, 'done')
def test6(): NO_D = re.compile('\d+').search cza = get_mongodb_client()['news']['train'] titles = set() all_label = set() for new in cza.find({}, {'_id': 0, 'label': 1}): all_label.add(new['label']) for label in all_label: jump = False for fil in ['伊春', '文昌', '长治', '咸阳', '嘉兴', '贵州', '外埠', '阳泉', '盐城', '昌平', '孙大军', '安顺', '宝坻', '山西', '巴中', '怀柔', '襄阳', '驻芜', '驻柳', '首页其他', '馆务', '馆藏', '陵水', '南宁', '安阳', '乌海', '黎苗', '巴州', '盘锦', '凉都', '视频', '临夏', '柳州', '下载', '相关新闻', '公开文件', '白沙', '顺义', '芜湖', '龙湾', '兴安', '兴谷', '衡水', '专题集锦', '河南', '走进', '图片', '图表', '六个如何', '创模', '精彩', '他山之石', '好人', '焦作', '智库', '留言选登', '黔府', '两会', '区人防办', '各地创建', '南开', '两会', '国有企业', '三变', '杨勤荣说', '项目介绍', '注销撤销', '网站首页', '豫风楚韵', '澄迈', '桂林', '宜居生活', '百姓问政', '价格信息', '文件', '煤化', 'Home', '投资平台', '延庆', '佳作欣赏', '代表建议', '信阳', '特色节目', '赛事动态', '主动发声', '创卫', '宿迁', '创建要闻', '创卫要闻']: if fil in label: jump = True break if jump: continue if cza.count({'label': label}) < 50: continue print(f'获取到: {label}') for new in cza.find({'label': label}, {'_id': 0, '标题': 1}): titles |= set([i for i in cutTitle(new['标题']) if not NO_D(i)]) print(len(titles)) # 501878 => 106506 => 88991 with open('allUniqueLabel.json', 'w') as f: f.write(json.dumps(list(titles), ensure_ascii=False))
def get_all_key_counter(): cza = get_mongodb_client()['news']['train'] sdict = {} for new in cza.find({}, {'_id': 0, 'label': 1, '标题': 1}): sdict[new['label']] = sdict.get(new['label'], 0) + 1 with open('keyCounter.json', 'w') as f: f.write(json.dumps(sdict, ensure_ascii=False))
def test11(): with open('allUniqueLabel.json', 'r') as f: json_data = json.loads(f.read()) client = get_mongodb_client()['news']['test'] for doc in client.find({}): wordTitle = doc['标题'] index_dict = {} for word in cutTitle(wordTitle): if word in json_data: index = json_data.index(word) index_dict[index] = index_dict.get(index, 0) + 1 wordLabel = None wordValue = 0 with open('train.txt', 'r') as f: for content in f: train_json_data, label = content.strip().split('|') trainSet = json.loads(train_json_data) # 得到训练数据 wordValueTemp = 0 for key, value in index_dict.items(): trainKey = trainSet.get(str(key), None) if trainKey: wordValueTemp += value * trainKey if wordValueTemp > wordValue: wordValue = wordValueTemp label = '公示公告' if '公示' in label else label label = '其他信息' if '住房资讯' in label else label label = '其他信息' if '回应关切' in label else label wordLabel = '其他信息' if '畜牧兽医' in label else label print(f"v1:{doc['v1-label']}-v2:{doc['v2-label']}-v3:{wordLabel}") client.update_one({'_id': doc['_id']}, {'$set': {'v3-label': wordLabel}}) client.close()
def __init__(self, crawler, mongodb_db, mongodb_coll, **kwargs): self.not_log_detail = crawler.settings.getbool( "mongodb_not_log_detail", True) self.mongodb_client = get_mongodb_client(**kwargs) self.mongodb = self.mongodb_client[mongodb_db][mongodb_coll] crawler.spider.mongodb_client = self.mongodb_client crawler.spider.mongodb = self.mongodb
def api_log_list(log_id=None): col = get_mongodb_client()[ConsumerConfig.mongodb_dbs_task][ ConsumerConfig.mongodb_col_task] if log_id: doc = col.find_one({"_id": log_id}, {"log_content": 1})["log_content"] return f"<pre>{doc}</pre>" documents = col.find({}, {"pid": 0, "log_content": 0}). \ sort([('_id', -1)]).skip(0).limit(5) return jsonify(list(documents))
def api_ziru(day=15): mongodb_ziru_statistics = get_mongodb_client( )["housePrice"]["ziru_zufang_statistics"] documents = mongodb_ziru_statistics.find( {"timestamp": { "$gte": timekiller.get_past_day(day).timestamp() }}, {"_id": 0}) documents = list(documents) return jsonify({"status": 0, "data": {"ziru": documents}})
def api_lagou(): query = request.args.get("query", "python") mongodb_lagou_statistics = get_mongodb_client( )["job_lagou"][f"city_statistics_{query}"] document = mongodb_lagou_statistics.find_one( {"timestamp": timekiller.get_today().timestamp()}, {"_id": 0}) if document: return jsonify({ "status": 0, "data": { "lagou": json.loads(document["statistics"]) }, "msg": "success", }) else: return jsonify({ "status": 2, "data": "", "msg": "Not query data", }), 404
def test2(): cza = get_mongodb_client()['news']['train'] sett = set() count = 0 NO_D = re.compile('\d+').search SSUB = re.compile('\s').sub for new in cza.find({}, {'金融分类': 1}): ccc = new['金融分类'].split('-') label = [i.strip() for i in ccc[-2:]] if len(label[0]) != 4 and len(label[1]) != 4 and \ len(SSUB('', label[0])) != 4 and len(SSUB('', label[1])) != 4: # cza.delete_one({'_id': new['_id']}) # 删除是不是有点久啊 continue label = label[0] if len(label[0]) == 4 else label[1] if NO_D(label): # cza.delete_one({'_id': new['_id']}) # 删除是不是有点久啊 continue # cza.update_one({'_id': new['_id']}, {'$set': {'label': label}}) count += 1 sett.add(label)
def test10(limit=-1): with open('allUniqueLabel.json', 'r') as f: json_data = json.loads(f.read()) right_num = 0 all_num = 0 for doc in get_mongodb_client()['news']['train'].find({}, {'_id': 0}): if all_num == limit: break all_num += 1 wordTitle = doc['标题'] index_dict = {} for word in cutTitle(wordTitle): if word in json_data: index = json_data.index(word) index_dict[index] = index_dict.get(index, 0) + 1 wordLabel = None wordValue = 0 with open('train.txt', 'r') as f: for content in f: train_json_data, label = content.strip().split('|') trainSet = json.loads(train_json_data) # 得到训练数据 wordValueTemp = 0 for key, value in index_dict.items(): trainKey = trainSet.get(str(key), None) if trainKey: wordValueTemp += value * trainKey if wordValueTemp > wordValue: wordValue = wordValueTemp label = '公示公告' if '公示' in label else label label = '其他信息' if '住房资讯' in label else label wordLabel = '其他信息' if '畜牧兽医' in label else label print(f"原标题:{wordTitle}") print(f"金融分类:{doc['金融分类']}") print(f"原label:{doc['label']}") print(f"预测分类为:{wordLabel}") if doc['label'] == wordLabel: print("预测正确\n") right_num += 1 else: print("预测错误\n") print(f"正确率为:{100 * right_num / all_num}%")
# -*- coding: utf-8 -*- import json import logging from collections import defaultdict from minitools import timekiller from minitools.db.mongodb import get_mongodb_client from minitools.scrapy import miniSpider from scrapy import FormRequest, Request from scrapy.downloadermiddlewares.cookies import CookiesMiddleware mongodb_client = get_mongodb_client() class MySpider(miniSpider): start_urls = ["https://www.lagou.com/jobs/allCity.html"] custom_settings = { "USER_AGENT": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/79.0.3945.88 Safari/537.36", "CONCURRENT_REQUESTS": 1, "DOWNLOAD_DELAY": 3, } collect_url = "https://a.lagou.com/collect" cookies_middleware = None mongodb_coll = "city_statistics" mongodb_db = "job_lagou" cities_statistics = defaultdict(dict) def check_response(self, response):