def extract_job(skip, size):
    """Extract job postings from MongoDB into MySQL. Preprocessing is painfully slow."""
    valid_cond = {}
    models = list()
    batch_size = 5000
    unique_ids = set()  # set membership keeps the dedup check O(1)
    bozz_job = mongo_db['bozz_job']
    job_list = bozz_job.find(valid_cond).sort([
        ('crawl_time', DESCENDING)
    ]).skip(skip).limit(size)
    i = -1  # so an empty cursor returns 0 instead of raising NameError
    for i, doc in enumerate(job_list):
        sys.stdout.write("\r{}".format(i + 1))
        sys.stdout.flush()
        jid = doc['encryptJobId']
        if jid in unique_ids:
            continue
        unique_ids.add(jid)
        company_id = match_company(doc.get('encryptBrandId'),
                                   doc.get('brandName'),
                                   doc.get('brandLogo'))
        if not company_id:
            continue
        # jobs are already deduplicated in MongoDB
        job = BozzJobModel.get_by(BozzJobModel.source_id == jid)
        parsed_job = parse_each_job(doc)
        parsed_job['company_id'] = company_id
        # parsed_job['recruiter_id'] = recruiter_model.id
        job_model = BozzJobModel.dict2model(parsed_job, job)
        models.append(job_model)
        if len(models) == batch_size:
            save_batch(models, chunk_size=batch_size)
            models = list()
    if models:
        save_batch(models, chunk_size=batch_size)
    return i + 1
def extract_company(skip, size):
    valid_cond = {}
    batch_size = 5000
    models = list()
    unique_ids = set()  # set membership keeps the dedup check O(1)
    bozz_company = mongo_db['bozz_company']
    company_list = bozz_company.find(valid_cond).sort([
        ('crawl_time', DESCENDING)
    ]).skip(skip).limit(size)
    i = -1  # so an empty cursor returns 0 instead of raising NameError
    for i, doc in enumerate(company_list):
        sys.stdout.write("\r{}".format(i + 1))
        sys.stdout.flush()
        cid = doc['encryptBrandId']
        if cid in unique_ids:
            continue
        unique_ids.add(cid)
        model = BozzCompanyModel.get_by(BozzCompanyModel.source_id == cid)
        parse_doc = process_each_company(doc)
        model = BozzCompanyModel.dict2model(parse_doc, model)
        models.append(model)
        if len(models) == batch_size:
            save_batch(models, chunk_size=batch_size)
            models = list()
    if models:
        save_batch(models, chunk_size=batch_size)
    return i + 1
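# Both extractors above hand their accumulated models to save_batch in chunks of
# 5000. The real helper is defined elsewhere in the project; the sketch below only
# illustrates the chunking pattern. `save_batch_sketch` and the `persist` callback
# are hypothetical names, not the project's actual API.
def save_batch_sketch(models, chunk_size=1000, persist=None):
    """Persist `models` in fixed-size chunks instead of one huge transaction."""
    persist = persist or (lambda chunk: None)  # stand-in for the real bulk insert/upsert
    saved = 0
    for start in range(0, len(models), chunk_size):
        chunk = models[start:start + chunk_size]
        persist(chunk)  # e.g. a session.bulk_save_objects-style call in the real helper
        saved += len(chunk)
    return saved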
def main():
    models = list()
    done_ids = set()
    for doc in collection.find().sort('crawl_time', DESCENDING):
        unique_id = str(doc['episode_id'])
        bangumi_source_id = str(doc['bangumi_id'])
        new_bangumi = AnimeBangumiModel.get_by_bangumi_id(bangumi_source_id)
        assert new_bangumi is not None
        model = AnimeEpisodeModel.get_by_episode_id(unique_id)
        if unique_id in done_ids:
            continue
        if not model:
            model = AnimeEpisodeModel()
            model.episode_id = unique_id
            model.created = datetime.utcnow()
        model.bangumi_id = new_bangumi._id
        model.title = doc['title']
        model.number = int(doc['number'])
        model.source_url = doc['first_page']
        model.updated = datetime.utcnow()
        models.append(model)
        done_ids.add(unique_id)
    print(len(models))
    save_batch(models)
    return models
def extract_zhihu_answers():
    """
    todo: mimic Flask's Form: define a validate method per field, plus cross-field validation
    Extract answer documents from MongoDB.
    :return:
    """
    unique_models = dict()
    for doc in collection.find({'id': '81972368'}):  # todo: condition for valid data
        # ignore anonymous users
        model = ZhihuAnswerModel()
        # todo: filter out invalid data
        model.answer_id = doc['id']
        model.question_id = int(doc['question_id'])
        # todo: map the mongo schema to the mysql schema
        model.user_id = doc['user_id']
        if doc.get('is_labeled') is not None:
            # todo: handle missing values
            model.is_labeled = doc['is_labeled']
        model.answer_url = doc['answer_url']
        if doc.get('content'):
            model.content = parse_html(doc['content'])
        if doc.get('thumbnail'):
            model.thumbnail = doc['thumbnail']
        model.comment_count = doc['comment_count']
        model.voteup_count = doc['voteup_count']
        model.created_time = datetime.fromtimestamp(int(doc['created_time']))
        # convert the Beijing (UTC+8) timestamp to UTC
        model.updated_time = datetime.fromtimestamp(int(doc['updated_time']))
        model.created = datetime.utcnow()
        model.updated = datetime.utcnow()
        unique_models[model.answer_id] = model
    models = list()
    models.extend(unique_models.values())
    return save_batch(models)
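# A minimal sketch of the Flask-Form-style validation mentioned in the todo above:
# one validate_<field> method per field plus a joint (cross-field) check. The class
# and field names are hypothetical; it is not wired into extract_zhihu_answers.
# Usage: AnswerDocValidator(doc).validate() -> True when the doc is usable.
class AnswerDocValidator:
    def __init__(self, doc):
        self.doc = doc
        self.errors = []

    def validate_question_id(self):
        if not str(self.doc.get('question_id', '')).isdigit():
            self.errors.append('question_id must be an integer')

    def validate_user_id(self):
        # anonymous users show up with id 0 / '0'
        if self.doc.get('user_id') in (None, 0, '0'):
            self.errors.append('anonymous or missing user_id')

    def validate(self):
        # run every per-field validator
        for name in dir(self):
            if name.startswith('validate_'):
                getattr(self, name)()
        # joint check across fields
        if int(self.doc.get('updated_time', 0)) < int(self.doc.get('created_time', 0)):
            self.errors.append('updated_time earlier than created_time')
        return not self.errors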
def extract_zhihu_topics():
    """
    Extract zhihu_topics from MongoDB into MySQL.
    :return:
    """
    zhihu_topic = mongo_client['crawler']['zhihu_topic']
    docs = list(zhihu_topic.find(
        {'crawl_time': {'$gt': datetime.utcnow() - timedelta(days=1)}}))
    models = list()
    for doc in docs:
        model = ZhihuTopicModel.get(int(doc['id']))
        if model is None:
            # the topic row is expected to exist already (created by the
            # topic-node import); skip it instead of crashing if it does not
            continue
        if doc.get('questions_count') is not None:
            model.questions_count = doc['questions_count']
        if doc.get('unanswered_count') is not None:
            model.unanswered_count = doc['unanswered_count']
        if doc.get('followers_count') is not None:
            model.followers_count = doc['followers_count']
        if doc.get('father_count') is not None:
            model.father_count = doc['father_count']
        if doc.get('best_answers_count') is not None:
            model.best_answers_count = doc['best_answers_count']
        if doc.get('best_answerers_count'):
            model.best_answerers_count = doc['best_answerers_count']
        if doc.get('category'):
            model.category = doc['category']
        if doc.get('avatar_url'):
            model.avatar_url = doc['avatar_url']
        if doc.get('introduction'):
            model.description = doc['introduction']
        model.topic_url = doc['topic_url']
        models.append(model)
    return save_batch(models)
def main():
    zhihu_coll = mongo_db['zhihu_topic_node']
    # deduplicate (topic, parent) edges via $group
    pipeline = [
        {'$group': {'_id': {'topic_id': '$id',
                            'topic_name': '$name',
                            'parent_id': '$parent_id',
                            'parent_name': '$parent_name'},
                    'count': {'$sum': 1}}},
    ]
    edges = list(zhihu_coll.aggregate(pipeline))
    print("{} edges in total".format(len(edges)))
    models = list()
    unique_topic_ids = set()
    for edge in edges:
        info = edge['_id']
        now = datetime.utcnow()
        if not ZhihuTopicModel.get(info['topic_id']):
            topic_model = ZhihuTopicModel()
            topic_model.topic_id = int(info['topic_id'])
            topic_model.topic_name = info['topic_name']
            topic_model.created = now
            topic_model.updated = now
            if topic_model.topic_id not in unique_topic_ids:
                models.append(topic_model)
                unique_topic_ids.add(topic_model.topic_id)
        try:
            if not ZhihuTopicMapModel.query(
                    ZhihuTopicMapModel.topic_id == int(info['topic_id']),
                    ZhihuTopicMapModel.parent_id == int(info['parent_id'])):
                map_model = ZhihuTopicMapModel()
                map_model.topic_id = int(info['topic_id'])
                map_model.parent_id = int(info['parent_id'])
                map_model.created = now
                models.append(map_model)
        except KeyError:
            pprint(info)
    save_batch(models)
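# For reference, each element of `edges` produced by the $group stage looks roughly
# like this (field values are illustrative only):
#   {'_id': {'topic_id': '19550517', 'topic_name': 'Python',
#            'parent_id': '19554298', 'parent_name': 'Programming'},
#    'count': 3}
# i.e. one document per distinct (topic, parent) pair, with `count` recording how
# many raw crawl records collapsed into that edge.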
def main():
    models = list()
    for doc in collection.find().sort('crawl_time', DESCENDING):
        model = AnimeImageModel()
        episode_id = str(doc['episode_id'])
        episode_model = AnimeEpisodeModel.get_by_episode_id(episode_id)
        if not episode_model:
            print("NO Episode: ", doc)
            continue
        if doc.get('number'):
            model.number = int(doc['number'])
        if doc.get('save_file'):
            # store the relative path
            model.image = 'images/' + doc['save_file']
        if doc.get('image'):
            model.source_image = doc['image']
        model.episode_id = episode_model._id
        model.source_url = doc['url']
        model.created = datetime.utcnow()
        model.updated = datetime.utcnow()
        models.append(model)
    return save_batch(models)
def extract_zhihu_users():
    """
    Extract user documents from MongoDB.
    :return:
    """
    answerer_topic_maps = dict()
    unique_models = dict()
    for doc in mongo_client['crawler']['zhihu_user'].find(
            {'id': {'$nin': [0, '0']}}):
        # skip anonymous users (id 0 / '0')
        model = ZhihuUserModel()
        model.user_id = doc['id']
        model.name = doc['name']
        # and doc['user_url'] != 'http://www.zhihu.com/api/v4/people/0'
        if doc.get('user_url'):
            model.user_url = doc['user_url']
        if doc.get('user_type') is not None:
            model.user_type = doc['user_type']
        if doc.get('url_token'):
            model.url_token = doc['url_token']
        if doc.get('avatar_url') and doc['avatar_url'] != 'http://www.zhihu.com/api/v4/people/0':
            model.avatar_url = doc['avatar_url']
        edu_member_tag = doc.get('edu_member_tag')
        if edu_member_tag:
            if edu_member_tag['type'] == 'subject':
                model.subject = (edu_member_tag.get('member_tag')
                                 or edu_member_tag.get('memberTag'))
            elif edu_member_tag['type'] == 'master':
                model.master_area = (edu_member_tag.get('member_tag')
                                     or edu_member_tag.get('memberTag'))
        if doc.get('gender') in [1, 0] and doc.get('user_url') != 'http://www.zhihu.com/api/v4/people/0':
            model.gender = GenderValue.male if doc['gender'] == 1 else GenderValue.female
        if doc.get('headline'):
            model.headline = doc['headline']
        if doc.get('is_org') is not None:
            model.is_org = doc['is_org']
        if doc.get('is_advertiser') is not None:
            model.is_advertiser = doc['is_advertiser']
        for badge in doc.get('badge', []):
            if badge['type'] == 'identity':
                model.identity = badge['description']
            elif badge['type'] == 'best_answerer':
                for topic in badge['topics']:
                    map_model = ZhihuTopicBestAnswererMapModel()
                    map_model.topic_id = int(topic['id'])
                    map_model.user_id = doc['id']
                    map_model.created = datetime.utcnow()
                    answerer_topic_maps[(map_model.user_id,
                                         map_model.topic_id)] = map_model
        model.created = datetime.utcnow()
        model.updated = datetime.utcnow()
        unique_models[model.user_id] = model
    models = []
    models.extend(answerer_topic_maps.values())
    models.extend(unique_models.values())
    return save_batch(models)