def process(warcfile, collection):
    """Insert the records of a WARC file that are not yet in ``collection``.

    Each record is converted with ``MongoDoc(record).gen_mongo_doc()``.
    Records whose ``warc_type`` is ``'warcinfo'`` are skipped. A record is
    inserted only when no document whose ``_id`` equals the record's
    ``warc_trec_id`` is found in ``db[collection]``; each insertion is
    logged through ``_LOGGER_NORMAL``.

    Args:
        warcfile: Path of the WARC file to read (opened in ``'rb'`` mode).
        collection: Name of the collection in the module-level ``db``.
    """
    f = warc.WARCFile(warcfile, 'rb')
    try:
        for record in f:
            one_doc = MongoDoc(record).gen_mongo_doc()
            if one_doc['warc_type'] == 'warcinfo':
                continue

            # Only insert documents that are not stored yet, so the same
            # file can be processed again without duplicating records.
            existing = db[collection].find_one({'_id': one_doc['warc_trec_id']})
            if existing is not None:
                continue

            db[collection].insert_one(one_doc)
            tmp_str = warcfile + ' ' + one_doc['warc_trec_id'] + ' is done.'
            _LOGGER_NORMAL.info(tmp_str)
    finally:
        # Release the underlying file handle even if a record fails.
        f.close()
def insert_one_file(filename, collection):
    """Insert every non-``warcinfo`` record of a WARC file into ``collection``.

    Each record is converted with ``MongoDoc(record).gen_mongo_doc()`` and
    inserted into ``db[collection]`` without checking for an existing
    document. After the whole file has been read, the number of inserted
    documents is compared with ``get_doc_num(filename, CHECK_DIRNAME)`` and
    the outcome is written to ``_LOGGER_CHECK`` as a tab-separated line:
    filename, expected count, inserted count, whether they match.

    Args:
        filename: Path of the WARC file to read (opened in ``'rb'`` mode).
        collection: Name of the collection in the module-level ``db``.
    """
    doc_cnt = 0
    f = warc.WARCFile(filename, 'rb')
    try:
        for record in f:
            one_doc = MongoDoc(record).gen_mongo_doc()
            if one_doc['warc_type'] == 'warcinfo':
                continue

            db[collection].insert_one(one_doc)
            doc_cnt += 1
            tmp_str = filename + ' ' + one_doc['warc_trec_id'] + ' is done.'
            _LOGGER_NORMAL.info(tmp_str)
    finally:
        # Release the underlying file handle even if a record fails.
        f.close()

    # Verify that the number of inserted documents matches the count
    # reported by get_doc_num for this file.
    true_cnt = get_doc_num(filename, CHECK_DIRNAME)
    tmp_str = (
        filename + '\t' + str(true_cnt) + '\t' + str(doc_cnt)
        + '\t' + str(true_cnt == doc_cnt)
    )
    _LOGGER_CHECK.info(tmp_str)