Exemple #1
0
def process(warcfile, collection):
    """Load the records of one WARC file into a MongoDB collection.

    Each record is converted with ``MongoDoc(record).gen_mongo_doc()``.
    Documents whose ``warc_type`` is ``'warcinfo'`` are skipped, and so are
    documents for which ``db[collection]`` already holds an entry whose
    ``_id`` equals the document's ``warc_trec_id``. Every inserted document
    is reported on ``_LOGGER_NORMAL``.

    Args:
        warcfile: Path of the WARC file to read; also used as the prefix of
            the log message.
        collection: Name of the collection, used as a key into ``db``.

    NOTE(review): the duplicate lookup assumes ``gen_mongo_doc()`` stores
    ``warc_trec_id`` as the document's ``_id`` -- confirm against MongoDoc.
    """
    f = warc.WARCFile(warcfile, 'rb')
    try:
        for record in f:
            oneDoc = MongoDoc(record).gen_mongo_doc()
            if oneDoc['warc_type'] == 'warcinfo':
                continue
            trec_id = oneDoc['warc_trec_id']
            # Only insert documents that are not in the collection yet.
            if db[collection].find_one({'_id': trec_id}) is not None:
                continue
            db[collection].insert_one(oneDoc)
            _LOGGER_NORMAL.info(warcfile + ' ' + trec_id + ' is done.')
    finally:
        # Release the file handle even when a record fails to convert or
        # an insert raises; the original never closed it.
        f.close()
Exemple #2
0
def insert_one_file(filename, collection):
    """Insert every non-warcinfo record of a WARC file and log a count check.

    Each record is converted with ``MongoDoc(record).gen_mongo_doc()`` and
    inserted into ``db[collection]``; documents whose ``warc_type`` is
    ``'warcinfo'`` are skipped. Every insert is reported on
    ``_LOGGER_NORMAL``. Afterwards the number of inserted documents is
    compared with ``get_doc_num(filename, CHECK_DIRNAME)`` and a
    tab-separated line (filename, expected count, inserted count, whether
    they match) is written to ``_LOGGER_CHECK``.

    Args:
        filename: Path of the WARC file to read.
        collection: Name of the collection, used as a key into ``db``.
    """
    doc_cnt = 0
    f = warc.WARCFile(filename, 'rb')
    try:
        for record in f:
            oneDoc = MongoDoc(record).gen_mongo_doc()
            if oneDoc['warc_type'] == 'warcinfo':
                continue
            db[collection].insert_one(oneDoc)
            doc_cnt += 1
            _LOGGER_NORMAL.info(
                filename + ' ' + oneDoc['warc_trec_id'] + ' is done.')
    finally:
        # Release the file handle even if conversion or an insert raises;
        # the original never closed it.
        f.close()

    # Check that the inserted count matches the expected count.
    true_cnt = get_doc_num(filename, CHECK_DIRNAME)
    fields = [filename, str(true_cnt), str(doc_cnt), str(true_cnt == doc_cnt)]
    _LOGGER_CHECK.info('\t'.join(fields))