Esempio n. 1
0
def lsh_zipfile(LineFormat, zip_reader, source, filename, file_key=''):
    """Register a dataset for a zip archive and defer one ``lsh_text`` call per entry.

    Args:
        LineFormat: Forwarded unchanged to ``lsh_text``.
        zip_reader: Archive object exposing ``infolist()``; forwarded to
            ``lsh_text`` as well.
        source: Passed to ``Matrix.create``.
        filename: Passed to ``Matrix.create`` and forwarded to ``lsh_text``.
        file_key: Passed to ``Matrix.create``; defaults to ``''``.

    Returns:
        None.
    """
    entries = zip_reader.infolist()

    # Called only for its side effect: force the creation of the table.
    MatrixRow.create()
    # Force the creation of the table and fill it with a row, then re-read
    # that row by key.
    dataset = Matrix.create(source, filename, file_key)
    dataset = Matrix.find(dataset.ds_key)
    matrix_key = dataset.ds_key

    # NOTE(review): zip_reader is handed to deferred.defer as an argument; if
    # that call serializes its arguments, the reader must be serializable --
    # confirm against the deferred library in use.
    for info in entries:
        # The entry itself is not opened here: lsh_text reopens it by name,
        # so opening it in this loop would only waste I/O.
        logging.debug('Reading file %s', info.filename)
        deferred.defer(
            lsh_text,
            LineFormat,
            zip_reader,
            filename,
            matrix_key=matrix_key,
            text_filename=info.filename,
        )
Esempio n. 2
0
def lsh_text(LineFormat, zip_reader, filename, matrix_key, text_filename):
    """Read one entry of a zip archive and create a document for each line.

    Args:
        LineFormat: Object whose ``parse(line)`` returns a
            ``(doc_id, text)`` pair.
        zip_reader: Archive object exposing ``infolist()`` and ``open()``.
        filename: Name of the archive; used only in log messages.
        matrix_key: Key passed to ``Matrix.find`` to load the dataset.
        text_filename: Name of the archive entry to process.

    Raises:
        KeyError: If no entry of the archive is named ``text_filename``.
    """
    logging.info('<TextWorker filename={filename} text_filename={text_filename}>'
                 .format(filename=filename, text_filename=text_filename))

    infolist = zip_reader.infolist()
    Matrix._initialize()
    MatrixRow._initialize()
    dataset = Matrix.find(matrix_key)

    # Take the first entry with a matching name. Without an explicit
    # "not found" check the search would fall through to the last entry of
    # the archive and the wrong file would be ingested silently.
    info = next(
        (entry for entry in infolist if entry.filename == text_filename),
        None,
    )
    if info is None:
        raise KeyError(
            'There is no item named %r in the archive' % (text_filename,)
        )

    with zip_reader.open(info) as text_reader:
        logging.debug('Reading file %s', info.filename)
        for line in text_reader:
            doc_id, text = LineFormat.parse(line)
            # A fresh stats dict is handed to every create_doc call.
            dataset.create_doc(doc_id, text, {})

    logging.info('</TextWorker filename={filename} text_filename={text_filename}>'
                 .format(filename=filename, text_filename=text_filename))