# Build a fresh on-disk index for a Logic-Form (LF) sentence corpus.
# Relies on `input_path`, `output_path`, and parsed CLI `arguments`
# (with a `.language` attribute) being in scope — defined earlier in
# this file, outside the visible chunk.
#
# NOTE(review): os.makedirs raises if output_path already exists —
# presumably intentional (refuse to clobber an existing index); confirm.
os.makedirs(output_path)

logging.info("Initializing lexicon.")
lexicon = DictLexicon(output_path)

logging.info("Initializing storage.")
storage = LdbStorage(output_path)
storage.init_db()
storage.open_db()

logging.info("Initializing index.")
# One int32 payload field per posting: the id of the sentence a term
# occurrence belongs to.
index = InvertedIndex(output_path, field_properties=[
    ("sentence_id", numpy.int32),
])
index.init_index()
index.open()

logging.info("Initializing sentence stream and its parser.")
sentence_stream = LFSentenceStream(input_path, language=arguments.language)
sentence_parser = LFSentenceParser()
sentence_indexer = LFSentenceIndexer(lexicon)

logging.info("Initializing indexing pipeline.")
indexing_pipeline = IndexingPipeline(lexicon, index, storage)

# Lazy %-style logging args: the message is only formatted if the
# record is actually emitted (was eager `"..." % x` before).
logging.info("Start indexing file: %s", input_path)
input_mb_size = float(os.path.getsize(input_path)) / (1024 ** 2)
logging.info("Input size: %.2fMB", input_mb_size)
indexing_pipeline.index_stream(sentence_stream, sentence_parser, sentence_indexer)
# Build an on-disk index over a Gigaword (eng/spa) or Ruwac (rus) corpus.
# Unlike the LF setup above, the lexicon is loaded from disk rather than
# created empty, and postings carry a `document_id` payload field.
# Relies on `input_path`, `output_path`, and `arguments` (with a
# `.language` attribute) defined earlier in this file, outside this chunk.
logging.info("Initializing lexicon.")
lexicon = DictLexicon(output_path)
lexicon.load()

logging.info("Initializing storage.")
storage = LdbStorage(output_path)
storage.init_db()
storage.open_db()

logging.info("Initializing index.")
index = InvertedIndex(output_path, field_properties=[
    ("document_id", numpy.int32),
])
index.init_index()
index.open()

logging.info("Initializing ruwac stream and its parser.")
# NOTE(review): the file handles opened here are never explicitly
# closed — presumably the stream objects own them for the lifetime of
# the indexing run; verify they close on exhaustion.
if arguments.language in ("spa", "eng"):
    sentence_stream = GigawordStream(open(input_path, "rb"))
    sentence_parser = GigawordParser(language=arguments.language)
elif arguments.language == "rus":
    sentence_stream = RuwacStream(open(input_path, "rb"))
    sentence_parser = RuwacParser()
else:
    raise Exception("Unsupported language: %s" % arguments.language)
sentence_indexer = RuwacIndexer(lexicon)