def __init__(self, input_path):  # Temporarily removing from arguments: input_file
    self.txt_files = glob.glob(input_path + "en*.txt")
    self.TITLE = "title"
    self.TEXT = "text"
    self.directory = input_path
    # Store the index on disk:
    self.in_directory = SimpleFSDirectory.open(
        Paths.get(os.path.join(self.directory, INDEX_DIR)))
    self.in_directory_English = SimpleFSDirectory.open(
        Paths.get(os.path.join(self.directory, INDEX_DIR_ENG)))
    self.in_directory_English_lemma = SimpleFSDirectory.open(
        Paths.get(os.path.join(self.directory, INDEX_DIR_EL)))
    self.queries = []
    self.sp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
    self.sp_bodies_lemma = []
    self.sp_bodies_pos = []
    self.query_lemma = ""
    self.query_pos = ""
    self.prec_at_1 = 0
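
# A minimal sketch of the directory-opening pattern used by the constructor
# above, assuming a PyLucene 7+ API (SimpleFSDirectory.open(Paths.get(...))).
# The enclosing class and the INDEX_DIR* constants are not shown in this
# snippet, so the paths and the INDEX_DIR name below are placeholders.
import os
import lucene
from java.nio.file import Paths
from org.apache.lucene.store import SimpleFSDirectory

lucene.initVM()
INDEX_DIR = "index"  # placeholder constant, not from the source
directory = SimpleFSDirectory.open(
    Paths.get(os.path.join("/tmp/data", INDEX_DIR)))
print(directory.getDirectory())  # the resolved on-disk index path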
def main(args):
    global verbose
    verbose = args.verbose
    if verbose:
        logger.info(f'Read {args.dir_index}')
    directory = SimpleFSDirectory.open(Paths.get(args.dir_index))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    reader = searcher.getIndexReader()
    if verbose:
        logger.info(f'Write to {args.path_output}')
    with open(args.path_output, 'w') as f:
        for idx in trange(reader.maxDoc()):
            doc = reader.document(idx)
            babelnet_id = doc.get('ID')
            synset_id = doc.get('SYNSET_ID')
            pos = doc.get('POS')
            synset_type = doc.get('TYPE')
            main_sense = doc.get('MAIN_SENSE')
            categories = list(doc.getValues('CATEGORY'))
            translation_mappings = list(doc.getValues('TRANSLATION_MAPPING'))
            images = list(doc.getValues('IMAGE'))
            lemmas = doc.getValues('LEMMA')
            # Fetch the parallel per-lemma fields once, instead of calling
            # doc.getValues() on every loop iteration.
            sources = doc.getValues('LEMMA_SOURCE')
            langs = doc.getValues('LEMMA_LANGUAGE')
            weights = doc.getValues('LEMMA_WEIGHT')
            sense_keys = doc.getValues('LEMMA_SENSEKEY')
            forms = []
            for i in range(len(lemmas)):
                forms.append({
                    'lemma': lemmas[i],
                    'source': sources[i],
                    'lang': langs[i],
                    'weight': weights[i],
                    'sense_key': sense_keys[i],
                })
            entry = {
                'id': babelnet_id,
                'synset': synset_id,
                'pos': pos,
                'type': synset_type,
                'main_sense': main_sense,
                'categories': categories,
                'translation_mappings': translation_mappings,
                'images': images,
                'forms': forms,
            }
            f.write(json.dumps(entry, ensure_ascii=False) + '\n')
    return 0
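
# A hedged companion sketch: reading back the JSON-lines file that main()
# above writes, one synset entry per line. "synsets.jsonl" is a placeholder
# path, not a name from the source; the keys mirror the entry dict in main().
import json

with open("synsets.jsonl", encoding="utf-8") as f:
    for line in f:
        entry = json.loads(line)
        print(entry["id"], entry["main_sense"], len(entry["forms"]))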
# Imports reconstructed for this fragment (the original file header was cut
# off); INDEX_DIR and INPUT_DIR are assumed to be string constants defined
# elsewhere in the file.
import lucene
from os import listdir
from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, StringField, TextField
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory


def create_document(line):
    # The start of this helper was missing from the source; the Document
    # creation and the whitespace split below are reconstructed assumptions.
    doc = Document()
    line = line.split()
    keyterm = line[0]
    doc.add(StringField("keyterm", keyterm, Field.Store.YES))
    index = line[1]
    doc.add(StringField("Sno", index, Field.Store.YES))
    del line[0:2]
    line = ' '.join(line)
    qterm = keyterm.replace("_", " ")
    if qterm not in line:
        line = qterm + ' ' + line
    doc.add(TextField("text", line, Field.Store.YES))
    return doc


lucene.initVM()
index_path = File(INDEX_DIR).toPath()
directory = SimpleFSDirectory.open(index_path)
analyzer = StandardAnalyzer()
config = IndexWriterConfig(analyzer)
writer = IndexWriter(directory, config)
print("Number of documents:", writer.numDocs())

for input_file in listdir(INPUT_DIR):
    print("Current file:", input_file)
    if input_file.endswith(".txt"):
        path = INPUT_DIR + input_file
        with open(path) as file:
            line = file.readline()  # note: the first line of each file is skipped
            while line:
                line = file.readline()
                if len(line.strip()) != 0:
                    doc = create_document(line)
                    writer.addDocument(doc)  # assumed continuation; the source snippet ends here
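
# A minimal search sketch against the keyterm index built above, reusing the
# 'directory', 'analyzer', and 'writer' objects from that script and assuming
# a recent PyLucene where QueryParser takes no Version argument. Field names
# ("text", "keyterm", "Sno") come from create_document(); the query string is
# a placeholder.
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher

writer.commit()  # the script above leaves the writer open; commit so a reader sees the documents
reader = DirectoryReader.open(directory)
searcher = IndexSearcher(reader)
query = QueryParser("text", analyzer).parse("example keyterm")
for hit in searcher.search(query, 5).scoreDocs:
    doc = searcher.doc(hit.doc)
    print(doc.get("keyterm"), doc.get("Sno"))
reader.close()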
def create_index_from_folder(folder, index_file):
    """Lets Lucene create an index of all database files within a specified folder

    :param folder: absolute or relative path to database files
    :param index_file: absolute or relative output location for index

    Notes:
    - Does not go through the database folder recursively, i.e. all files
      have to be at the root of the folder
    - Only CSV files are supported
    - Column headers are hardcoded and should follow:
      ID, text, Reddit ID, subreddit, meta, time, author, ups, downs,
      authorlinkkarma, authorkarma, authorisgold
    """
    # Set up Lucene
    print()
    print("Starting Lucene ...")
    lucene.initVM()

    index_store = SimpleFSDirectory.open(File(index_file).toPath())
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(index_store, config)

    print()

    # Go through files, add rows of each as Documents to writer
    for file in os.listdir(folder):
        if file.endswith(".csv"):
            print("Indexing {} ...".format(file), end=" ", flush=True)
            with open(os.path.join(folder, file), newline='') as db:
                reader = csv.reader(db)

                # The Reddit database seems to carry a lot of duplicate posts,
                # so we try to skip those
                post_ids = set()
                duplicate_counter = 0

                # To store term vectors (used for query expansion) we have to
                # use a custom field type
                customfield = FieldType()
                customfield.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
                customfield.setStored(True)
                customfield.setTokenized(True)
                customfield.setStoreTermVectors(True)

                # CSV files have a useless first row...
                skipfirst = True
                # ... and a useless first column. Skip both.
                for _, text, rid, subreddit, meta, time, author, ups, downs, \
                        authorlinkkarma, authorkarma, authorisgold in reader:
                    if skipfirst:
                        skipfirst = False
                        continue

                    if rid in post_ids:
                        duplicate_counter += 1
                        continue  # skip duplicate posts
                    post_ids.add(rid)

                    doc = Document()

                    # Tokenize, index and store
                    doc.add(Field("text", text, customfield))

                    # Index and store
                    doc.add(StringField("id", rid, Field.Store.YES))
                    doc.add(StringField("subreddit", subreddit, Field.Store.YES))
                    doc.add(StringField("meta", meta, Field.Store.YES))
                    doc.add(StringField("time", time, Field.Store.YES))
                    doc.add(StringField("author", author, Field.Store.YES))

                    # Store only
                    doc.add(StoredField("ups", ups))
                    doc.add(StoredField("downs", downs))
                    doc.add(StoredField("authorlinkkarma", authorlinkkarma))
                    doc.add(StoredField("authorkarma", authorkarma))
                    doc.add(StoredField("authorisgold", authorisgold))

                    writer.addDocument(doc)

            print("DONE!\t(Duplicate posts skipped: {})".format(duplicate_counter))

    writer.commit()
    writer.close()

    print()
    print("Finished indexing!")
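
# Usage sketch for create_index_from_folder(); both paths are placeholders,
# not names from the source. lucene.initVM() is called inside the function,
# so no extra setup is needed before the call.
if __name__ == "__main__":
    create_index_from_folder("data/reddit_csv/", "data/reddit_index")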
def run(self, writer=None, analyzer=None):
    if writer is None:
        writer = self.writer
    if analyzer is None:
        analyzer = self.analyzer
    searcher = IndexSearcher(DirectoryReader.open(
        SimpleFSDirectory.open(File(self.store_dir))))
    while True:
        print()
        print("Hit enter with no input to quit.")
        command = input("Query:")
        if command == '':
            return
        print("Searching for:", command)
        query = QueryParser(Version.LUCENE_43, "contents",
                            analyzer).parse(command)

        # We'll just show the top 10 matching documents for now
        scoreDocs = searcher.search(query, 10).scoreDocs
        print("%s total matching documents." % len(scoreDocs))

        # Highlight the matching text in red
        highlighter = Highlighter(
            SimpleHTMLFormatter('<b><font color="red">', '</font></b>'),
            QueryScorer(query))
        # Using NullFragmenter since we still want to see the whole document
        highlighter.setTextFragmenter(NullFragmenter())
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            tokenStream = analyzer.tokenStream(
                "contents", StringReader(doc.get("contents")))
            # arg 3: the maximum number of fragments
            # arg 4: the separator used to intersperse the document
            #        fragments (typically "...")
            # args 3 and 4 don't really matter with NullFragmenter
            result = highlighter.getBestFragments(
                tokenStream, doc.get("contents"), 2, "...")
            if len(result) > 10:
                # use a context manager so the file handle is always closed
                with open(self.hits_dir + '/' + doc.get("name"), 'w+') as file_handler:
                    file_handler.write(result)

        # Create hit fragments, if we want to show them
        # arg 1: fragment size
        highlighter.setTextFragmenter(SimpleFragmenter(200))
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            tokenStream = analyzer.tokenStream(
                "contents", StringReader(doc.get("contents")))
            result = highlighter.getBestFragments(
                tokenStream, doc.get("contents"), 2, "...")
            if len(result) > 10:
                with open(self.frags_dir + '/' + doc.get("name"), 'w+') as file_handler:
                    file_handler.write(result)
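
# A standalone sketch of the Highlighter/QueryScorer/SimpleFragmenter pieces
# that run() above combines, applied to a plain string instead of indexed
# documents. It assumes the JVM is already initialized (lucene.initVM()) and
# a recent PyLucene where QueryParser takes no Version argument; the sample
# text and query are placeholders.
from java.io import StringReader
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search.highlight import (
    Highlighter, QueryScorer, SimpleFragmenter, SimpleHTMLFormatter)

analyzer = StandardAnalyzer()
query = QueryParser("contents", analyzer).parse("lucene")
highlighter = Highlighter(SimpleHTMLFormatter('<b>', '</b>'),
                          QueryScorer(query))
highlighter.setTextFragmenter(SimpleFragmenter(50))
text = "Apache Lucene is a full-text search engine library written in Java."
stream = analyzer.tokenStream("contents", StringReader(text))
# prints the best-scoring fragments with the match wrapped in <b> tags
print(highlighter.getBestFragments(stream, text, 2, "..."))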