# --- Query-tool setup: read CLI arguments and open existing lexicon/index/storage. ---
# NOTE(review): the original used input_path/output_path before assigning them
# (unlike the otherwise-identical block below, which assigns both first).
# Restored here; re-assigning from `arguments` is harmless if already set.
input_path = arguments.input
output_path = arguments.output
query_paths = arguments.query
context_input = arguments.context_input

# Lazy %-style logging args: formatting is skipped when the level is disabled.
logging.info("Input: %s", input_path)
logging.info("Context: %s", context_input)
logging.info("Query: %s", query_paths)
logging.info("Output: %s", output_path)

logging.info("Initializing lexicon.")
lexicon = DictLexicon(input_path)
lexicon.load()  # existing lexicon is read from disk, not created

logging.info("Opening index.")
index = InvertedIndex(input_path)
index.open()

logging.info("Initializing searcher.")
searcher = Searcher(index, "sentence_id")

logging.info("Initializing storage.")
storage = LdbStorage(input_path)
storage.open_db()

# An optional second lexicon scopes context lookups to a separate corpus.
if context_input is not None:
    logging.info("Initializing context lexicon.")
    c_lexicon = DictLexicon(context_input)
    c_lexicon.load()
# --- Corpus-indexing setup: create output lexicon/storage/index, then pick a
# --- sentence stream + parser matching the requested corpus language. ---
logging.info("Initializing lexicon.")
lexicon = DictLexicon(output_path)
lexicon.load()

logging.info("Initializing storage.")
storage = LdbStorage(output_path)
storage.init_db()   # fresh database is created before opening
storage.open_db()

logging.info("Initializing index.")
index = InvertedIndex(output_path, field_properties=[
    ("document_id", numpy.int32),
])
index.init_index()  # fresh index structures before opening
index.open()

logging.info("Initializing ruwac stream and its parser.")
# Spanish/English corpora use the Gigaword format; Russian uses Ruwac.
# NOTE(review): the input file handle is handed to the stream object, which is
# presumably responsible for closing it — confirm against the stream classes.
if arguments.language in ("spa", "eng"):
    sentence_stream = GigawordStream(open(input_path, "rb"))
    sentence_parser = GigawordParser(language=arguments.language)
elif arguments.language == "rus":
    sentence_stream = RuwacStream(open(input_path, "rb"))
    sentence_parser = RuwacParser()
else:
    # ValueError is more specific than bare Exception and remains
    # backward-compatible for callers catching Exception.
    raise ValueError("Unsupported language: %s" % arguments.language)
# --- LF-sentence indexing setup: wipe and recreate the output directory, build
# --- fresh lexicon/storage/index, and wire up the indexing pipeline. ---
if os.path.exists(output_path):
    shutil.rmtree(output_path)  # start from a clean output directory
os.makedirs(output_path)

logging.info("Initializing lexicon.")
lexicon = DictLexicon(output_path)  # no load(): lexicon is built during indexing

logging.info("Initializing storage.")
storage = LdbStorage(output_path)
storage.init_db()
storage.open_db()

logging.info("Initializing index.")
index = InvertedIndex(output_path, field_properties=[
    ("sentence_id", numpy.int32),
])
index.init_index()
index.open()

logging.info("Initializing sentence stream and its parser.")
sentence_stream = LFSentenceStream(input_path, language=arguments.language)
sentence_parser = LFSentenceParser()
sentence_indexer = LFSentenceIndexer(lexicon)

logging.info("Initializing indexing pipeline.")
indexing_pipeline = IndexingPipeline(lexicon, index, storage)

# Lazy %-style logging args: formatting only happens if the record is emitted.
logging.info("Start indexing file: %s", input_path)
input_mb_size = float(os.path.getsize(input_path)) / (1024 ** 2)
logging.info("Input size: %.2fMB", input_mb_size)
# --- Query-tool setup: read CLI arguments and open existing lexicon/index/storage. ---
input_path = arguments.input
output_path = arguments.output
query_paths = arguments.query
context_input = arguments.context_input

# Lazy %-style logging args: formatting is skipped when the level is disabled.
logging.info("Input: %s", input_path)
logging.info("Context: %s", context_input)
logging.info("Query: %s", query_paths)
logging.info("Output: %s", output_path)

logging.info("Initializing lexicon.")
lexicon = DictLexicon(input_path)
lexicon.load()  # existing lexicon is read from disk, not created

logging.info("Opening index.")
index = InvertedIndex(input_path)
index.open()

logging.info("Initializing searcher.")
searcher = Searcher(index, "sentence_id")

logging.info("Initializing storage.")
storage = LdbStorage(input_path)
storage.open_db()

# An optional second lexicon scopes context lookups to a separate corpus.
if context_input is not None:
    logging.info("Initializing context lexicon.")
    c_lexicon = DictLexicon(context_input)
    c_lexicon.load()