Example #1
0
def extract_job(skip, size):
    """
    Copy one page of crawled ``bozz_job`` documents from MongoDB into
    ``BozzJobModel`` objects and persist them with ``save_batch``.

    The cursor is sorted by ``crawl_time`` descending, so when the same
    ``encryptJobId`` occurs more than once in the page only the first
    document seen is used.  Documents for which ``match_company`` returns a
    falsy value are skipped.  Models are flushed every ``batch_size`` items.

    NOTE: the original author flagged this preprocessing as extremely slow.

    :param skip: number of documents to skip at the start of the sorted cursor
    :param size: maximum number of documents to read
    :return: number of documents read from the cursor (skipped ones included);
        0 when the cursor yields nothing
    """
    valid_cond = {}
    batch_size = 5000
    models = []
    # A set keeps the duplicate check O(1) per document.
    seen_ids = set()
    # Stays 0 when the cursor is empty, so the final return never hits an
    # unbound loop variable.
    processed = 0
    bozz_job = mongo_db['bozz_job']
    job_list = bozz_job.find(valid_cond).sort(
        [('crawl_time', DESCENDING)]).skip(skip).limit(size)
    for processed, doc in enumerate(job_list, start=1):
        # Single-line progress counter.
        sys.stdout.write("\r{}".format(processed))
        sys.stdout.flush()
        jid = doc['encryptJobId']
        if jid in seen_ids:
            continue
        seen_ids.add(jid)
        company_id = match_company(doc.get('encryptBrandId'),
                                   doc.get('brandName'), doc.get('brandLogo'))
        if not company_id:
            continue
        # job: already de-duplicated in MongoDB (original author's note)
        job = BozzJobModel.get_by(BozzJobModel.source_id == jid)
        parsed_job = parse_each_job(doc)
        parsed_job['company_id'] = company_id
        # Disabled by the original author:
        # parsed_job['recruiter_id'] = recruiter_model.id
        job_model = BozzJobModel.dict2model(parsed_job, job)
        models.append(job_model)
        if len(models) >= batch_size:
            save_batch(models, chunk_size=batch_size)
            models = []
    # Flush the final, partially filled batch.
    if models:
        save_batch(models, chunk_size=batch_size)
    return processed
Example #2
0
def extract_company(skip, size):
    """
    Copy one page of crawled ``bozz_company`` documents from MongoDB into
    ``BozzCompanyModel`` objects and persist them with ``save_batch``.

    The cursor is sorted by ``crawl_time`` descending, so when the same
    ``encryptBrandId`` occurs more than once in the page only the first
    document seen is used.  Models are flushed every ``batch_size`` items.

    :param skip: number of documents to skip at the start of the sorted cursor
    :param size: maximum number of documents to read
    :return: number of documents read from the cursor (skipped ones included);
        0 when the cursor yields nothing
    """
    valid_cond = {}
    batch_size = 5000
    models = []
    # A set keeps the duplicate check O(1) per document.
    seen_ids = set()
    # Stays 0 when the cursor is empty, so the final return never hits an
    # unbound loop variable.
    processed = 0
    bozz_company = mongo_db['bozz_company']
    company_list = bozz_company.find(valid_cond).sort(
        [('crawl_time', DESCENDING)]).skip(skip).limit(size)
    for processed, doc in enumerate(company_list, start=1):
        # Single-line progress counter.
        sys.stdout.write("\r{}".format(processed))
        sys.stdout.flush()
        cid = doc['encryptBrandId']
        if cid in seen_ids:
            continue
        seen_ids.add(cid)
        # Existing row (if any) is handed to dict2model together with the
        # freshly parsed document.
        existing = BozzCompanyModel.get_by(BozzCompanyModel.source_id == cid)
        parse_doc = process_each_company(doc)
        models.append(BozzCompanyModel.dict2model(parse_doc, existing))
        if len(models) >= batch_size:
            save_batch(models, chunk_size=batch_size)
            models = []
    # Flush the final, partially filled batch.
    if models:
        save_batch(models, chunk_size=batch_size)
    return processed
Example #3
0
def main():
    """
    Sync crawled episode documents from MongoDB into ``AnimeEpisodeModel``.

    Documents are read newest-first (``crawl_time`` descending).  For each
    distinct ``episode_id`` the first document seen wins; an existing model is
    updated, otherwise a new one is created.  All models are saved in one
    ``save_batch`` call.

    :return: the list of models that was passed to ``save_batch``
    :raises AssertionError: when a document's ``bangumi_id`` is not found by
        ``AnimeBangumiModel.get_by_bangumi_id``
    """
    models = []
    done_ids = set()
    for doc in collection.find().sort('crawl_time', DESCENDING):
        unique_id = str(doc['episode_id'])
        # Check for duplicates first so a repeated episode does not cost two
        # model lookups that would be thrown away.
        if unique_id in done_ids:
            continue
        bangumi_source_id = str(doc['bangumi_id'])
        new_bangumi = AnimeBangumiModel.get_by_bangumi_id(bangumi_source_id)
        assert new_bangumi is not None, (
            "bangumi {} not found for episode {}".format(
                bangumi_source_id, unique_id))
        model = AnimeEpisodeModel.get_by_episode_id(unique_id)
        if not model:
            # First time this episode is seen: create it.
            model = AnimeEpisodeModel()
            model.episode_id = unique_id
            model.created = datetime.utcnow()
        model.bangumi_id = new_bangumi._id
        model.title = doc['title']
        model.number = int(doc['number'])
        model.source_url = doc['first_page']
        model.updated = datetime.utcnow()
        models.append(model)
        done_ids.add(unique_id)
    print(len(models))
    save_batch(models)
    return models
Example #4
0
def extract_zhihu_answers():
    """
    Load answer documents from MongoDB and persist them as
    ``ZhihuAnswerModel`` objects.

    Models are keyed by answer id, so when several documents share an id the
    last one iterated is the one that gets saved.

    todo: mimic Flask forms - one validate method per field, plus
    cross-field validation.

    :return: whatever ``save_batch`` returns
    """
    answers_by_id = {}
    # todo: condition selecting valid data (currently a single hard-coded id)
    query = {'id': '81972368'}
    for doc in collection.find(query):
        # NOTE(review): the original comment here said anonymous users are
        # ignored, but no such filter is visible in this function.
        answer = ZhihuAnswerModel()  # todo: filter out invalid data
        answer.answer_id = doc['id']
        # todo: map the MongoDB schema onto the MySQL schema
        answer.question_id = int(doc['question_id'])
        answer.user_id = doc['user_id']

        labeled = doc.get('is_labeled')  # todo: handle missing values
        if labeled is not None:
            answer.is_labeled = labeled

        answer.answer_url = doc['answer_url']

        raw_content = doc.get('content')
        if raw_content:
            answer.content = parse_html(raw_content)

        thumbnail = doc.get('thumbnail')
        if thumbnail:
            answer.thumbnail = thumbnail

        answer.comment_count = doc['comment_count']
        answer.voteup_count = doc['voteup_count']
        # NOTE(review): the original comment mentions converting a Beijing
        # timestamp to UTC; datetime.fromtimestamp returns local time, so
        # confirm the intended timezone.
        answer.created_time = datetime.fromtimestamp(int(doc['created_time']))
        answer.updated_time = datetime.fromtimestamp(int(doc['updated_time']))
        answer.created = datetime.utcnow()
        answer.updated = datetime.utcnow()
        answers_by_id[answer.answer_id] = answer
    return save_batch(list(answers_by_id.values()))
Example #5
0
def extract_zhihu_topics():
    """
    Refresh existing ``ZhihuTopicModel`` objects from ``zhihu_topic``
    documents in MongoDB whose ``crawl_time`` falls within the last day.

    Counter fields are copied whenever the document carries a non-``None``
    value (so a count of 0 is written); text fields are copied only when
    non-empty.  Documents whose topic is not found by ``ZhihuTopicModel.get``
    are reported on stdout and skipped.

    :return: whatever ``save_batch`` returns
    """
    zhihu_topic = mongo_client['crawler']['zhihu_topic']
    since = datetime.utcnow() - timedelta(days=1)
    docs = list(zhihu_topic.find({'crawl_time': {'$gt': since}}))

    # Document key and model attribute share the same name for counters.
    count_fields = (
        'questions_count',
        'unanswered_count',
        'followers_count',
        'father_count',
        'best_answers_count',
        'best_answerers_count',
    )
    # (document key, model attribute)
    text_fields = (
        ('category', 'category'),
        ('avatar_url', 'avatar_url'),
        ('introduction', 'description'),
    )

    models = []
    for doc in docs:
        model = ZhihuTopicModel.get(int(doc['id']))
        if not model:
            # No stored topic to update; skip rather than fail on the first
            # attribute assignment.
            print("NO Topic: ", doc)
            continue
        for field in count_fields:
            value = doc.get(field)
            if value is not None:
                setattr(model, field, value)
        for doc_key, attr in text_fields:
            value = doc.get(doc_key)
            if value:
                setattr(model, attr, value)
        model.topic_url = doc['topic_url']
        models.append(model)
    return save_batch(models)
def main():
    """
    Create missing topics and topic -> parent mappings from the
    ``zhihu_topic_node`` collection.

    The collection is grouped into distinct
    (topic id, topic name, parent id, parent name) edges.  For every edge:

    * a ``ZhihuTopicModel`` is queued when ``ZhihuTopicModel.get`` finds no
      topic with that id and it has not been queued already in this run;
    * a ``ZhihuTopicMapModel`` is queued when ``ZhihuTopicMapModel.query``
      returns a falsy result for the (topic, parent) pair.

    Edges without a ``parent_id`` are pretty-printed and get no mapping.
    Everything queued is saved with a single ``save_batch`` call.
    """
    zhihu_coll = mongo_db['zhihu_topic_node']
    # One output row per distinct edge.
    pipeline = [
        {'$group': {'_id':
                        {'topic_id': '$id', 'topic_name': '$name',
                         'parent_id': '$parent_id', 'parent_name': '$parent_name'},
                    'count': {'$sum': 1}}},
    ]
    edges = list(zhihu_coll.aggregate(pipeline))
    print("一共有{}条边".format(len(edges)))
    models = []
    unique_topic_ids = set()
    for edge in edges:
        info = edge['_id']
        now = datetime.utcnow()
        # Normalise once so the lookup, the new model and the mapping all use
        # the same integer id.
        topic_id = int(info['topic_id'])
        # Test the in-memory set first to avoid a lookup for topics that were
        # already queued from an earlier edge.
        if (topic_id not in unique_topic_ids
                and not ZhihuTopicModel.get(topic_id)):
            topic_model = ZhihuTopicModel()
            topic_model.topic_id = topic_id
            topic_model.topic_name = info['topic_name']
            topic_model.created = now
            topic_model.updated = now
            models.append(topic_model)
            unique_topic_ids.add(topic_id)
        try:
            parent_id = int(info['parent_id'])
        except KeyError:
            # Edge has no parent id: show it and skip the mapping.
            pprint(info)
            continue
        if not ZhihuTopicMapModel.query(
                ZhihuTopicMapModel.topic_id == topic_id,
                ZhihuTopicMapModel.parent_id == parent_id):
            map_model = ZhihuTopicMapModel()
            map_model.topic_id = topic_id
            map_model.parent_id = parent_id
            map_model.created = now
            models.append(map_model)
    save_batch(models)
Example #7
0
def main():
    """
    Build ``AnimeImageModel`` objects from crawled image documents in MongoDB
    and persist them with ``save_batch``.

    Documents are read newest-first (``crawl_time`` descending).  A document
    whose ``episode_id`` is not found by
    ``AnimeEpisodeModel.get_by_episode_id`` is printed and skipped.  The
    image's ``episode_id`` is set to the matched episode model's ``_id``.

    :return: whatever ``save_batch`` returns
    """
    models = []
    for doc in collection.find().sort('crawl_time', DESCENDING):
        episode_id = str(doc['episode_id'])
        episode_model = AnimeEpisodeModel.get_by_episode_id(episode_id)
        if not episode_model:
            print("NO Episode: ", doc)
            continue
        # Instantiate only once the episode is known to exist.
        model = AnimeImageModel()
        # Link to the episode model's own _id; the raw source id from the
        # document is used for the lookup only.
        model.episode_id = episode_model._id
        if doc.get('number'):
            model.number = int(doc['number'])
        if doc.get('save_file'):
            # Store a relative path.
            model.image = 'images/' + doc['save_file']
        if doc.get('image'):
            model.source_image = doc['image']
        model.source_url = doc['url']
        model.created = datetime.utcnow()
        model.updated = datetime.utcnow()
        models.append(model)
    return save_batch(models)
Example #8
0
def extract_zhihu_users():
    """
    Extract user documents from MongoDB (``crawler.zhihu_user``) into
    ``ZhihuUserModel`` objects, plus one ``ZhihuTopicBestAnswererMapModel``
    per (user, topic) found in ``best_answerer`` badges.

    Users are keyed by id and mappings by (user id, topic id), so later
    documents overwrite earlier ones with the same key.  Mapping models are
    placed before user models in the list handed to ``save_batch``.

    :return: whatever ``save_batch`` returns
    """
    anonymous_url = 'http://www.zhihu.com/api/v4/people/0'
    answerer_topic_maps = {}
    unique_models = {}
    # Ids 0 / '0' are excluded by the query; per the original comment this
    # ignores anonymous users.
    cursor = mongo_client['crawler']['zhihu_user'].find(
        {'id': {'$nin': [0, '0']}})
    for doc in cursor:
        model = ZhihuUserModel()
        model.user_id = doc['id']
        model.name = doc['name']
        if doc.get('user_url'):
            model.user_url = doc['user_url']
        if doc.get('user_type') is not None:
            model.user_type = doc['user_type']
        if doc.get('url_token'):
            model.url_token = doc['url_token']
        # NOTE(review): the avatar is compared against the anonymous *user*
        # URL, as in the original - confirm this is the intended sentinel.
        if doc.get('avatar_url') and doc['avatar_url'] != anonymous_url:
            model.avatar_url = doc['avatar_url']

        edu_member_tag = doc.get('edu_member_tag')
        if edu_member_tag:
            # The tag value is stored under either spelling of the key.
            member_tag = (edu_member_tag.get('member_tag')
                          or edu_member_tag.get('memberTag'))
            if edu_member_tag['type'] == 'subject':
                model.subject = member_tag
            elif edu_member_tag['type'] == 'master':
                model.master_area = member_tag

        # user_url is optional (see the guard above), so read it with .get()
        # instead of indexing, which would raise KeyError when it is absent.
        if (doc.get('gender') in (1, 0)
                and doc.get('user_url') != anonymous_url):
            if doc['gender'] == 1:
                model.gender = GenderValue.male
            else:
                model.gender = GenderValue.female
        if doc.get('headline'):
            model.headline = doc['headline']
        if doc.get('is_org') is not None:
            model.is_org = doc['is_org']
        if doc.get('is_advertiser') is not None:
            model.is_advertiser = doc['is_advertiser']

        # `or []` also covers a stored null, which .get('badge', []) would
        # return as None.
        for badge in doc.get('badge') or []:
            if badge['type'] == 'identity':
                model.identity = badge['description']
            elif badge['type'] == 'best_answerer':
                for topic in badge['topics']:
                    map_model = ZhihuTopicBestAnswererMapModel()
                    map_model.topic_id = int(topic['id'])
                    map_model.user_id = doc['id']
                    map_model.created = datetime.utcnow()
                    key = (map_model.user_id, map_model.topic_id)
                    answerer_topic_maps[key] = map_model

        model.created = datetime.utcnow()
        model.updated = datetime.utcnow()
        unique_models[model.user_id] = model

    models = []
    models.extend(answerer_topic_maps.values())
    models.extend(unique_models.values())
    return save_batch(models)