def lsh_zipfile(LineFormat, zip_reader, source, filename, file_key=''):
    """Register a dataset for a zip archive and enqueue one task per member.

    A ``Matrix`` record is created for the archive and re-read through
    ``Matrix.find``; then every entry returned by ``zip_reader.infolist()``
    is handed to ``lsh_text`` via ``deferred.defer``.

    Args:
        LineFormat: forwarded unchanged to ``lsh_text``.
        zip_reader: archive object exposing ``infolist()``; it is also
            forwarded to every deferred ``lsh_text`` call.
        source: passed to ``Matrix.create``.
        filename: passed to ``Matrix.create`` and forwarded to ``lsh_text``.
        file_key: passed to ``Matrix.create``; defaults to ``''``.

    Returns:
        None.
    """
    infolist = zip_reader.infolist()

    # Called only for its side effect: force the creation of the table.
    MatrixRow.create()
    # Force the creation of the table and fill it with a row, then reload
    # the record by its key.
    dataset = Matrix.create(source, filename, file_key)
    dataset = Matrix.find(dataset.ds_key)

    for info in infolist:
        # The member is not opened here: nothing is read from it at this
        # stage, the deferred worker opens it itself.
        logging.debug('Reading file %s', info.filename)
        deferred.defer(lsh_text, LineFormat, zip_reader, filename,
                       matrix_key=dataset.ds_key,
                       text_filename=info.filename)
def lsh_text(LineFormat, zip_reader, filename, matrix_key, text_filename):
    """Parse one member of a zip archive and store each line as a document.

    The dataset is looked up with ``Matrix.find(matrix_key)``; each line of
    the archive member named ``text_filename`` is split by
    ``LineFormat.parse`` into ``(doc_id, text)`` and passed to
    ``dataset.create_doc``.

    Args:
        LineFormat: object providing ``parse(line) -> (doc_id, text)``.
        zip_reader: archive object exposing ``infolist()`` and ``open(info)``.
        filename: archive name, used only in the log messages.
        matrix_key: key passed to ``Matrix.find``.
        text_filename: name of the archive member to process.

    Raises:
        KeyError: if no archive entry is named ``text_filename``.
    """
    logging.info(
        '<TextWorker filename={filename} text_filename={text_filename}>'
        .format(filename=filename, text_filename=text_filename))

    infolist = zip_reader.infolist()
    Matrix._initialize()
    MatrixRow._initialize()
    dataset = Matrix.find(matrix_key)

    # First entry with the requested name. Failing loudly here avoids
    # silently processing an unrelated member when the name is absent.
    info = next((entry for entry in infolist
                 if entry.filename == text_filename), None)
    if info is None:
        raise KeyError(
            'There is no item named %r in the archive' % text_filename)

    with zip_reader.open(info) as text_reader:
        logging.debug('Reading file %s', info.filename)
        for line in text_reader:
            doc_id, text = LineFormat.parse(line)
            # A fresh stats dict is supplied for every document; nothing
            # in this function reads it back.
            dataset.create_doc(doc_id, text, {})

    logging.info(
        '</TextWorker filename={filename} text_filename={text_filename}>'
        .format(filename=filename, text_filename=text_filename))