def setUp(self):
    super(Test_Bug1763, self).setUp()
    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    self.d1 = RAMDirectory()
    self.d2 = RAMDirectory()

    w1, w2 = [self.getWriter(directory=d, analyzer=self.analyzer)
              for d in [self.d1, self.d2]]

    doc1 = Document()
    doc2 = Document()
    doc1.add(Field("all", "blah blah double blah Gesundheit",
                   TextField.TYPE_NOT_STORED))
    doc1.add(Field('id', '1', StoredField.TYPE))
    doc2.add(Field("all", "a quick brown test ran over the lazy data",
                   TextField.TYPE_NOT_STORED))
    doc2.add(Field('id', '2', StoredField.TYPE))
    w1.addDocument(doc1)
    w2.addDocument(doc2)

    for w in [w1, w2]:
        w.close()

class PyLuceneTestCase(TestCase):

    def __init__(self, *args):
        super(PyLuceneTestCase, self).__init__(*args)

    def setUp(self):
        self.directory = RAMDirectory()

    def tearDown(self):
        self.directory.close()

    def getConfig(self, analyzer=None):
        return IndexWriterConfig(analyzer)

    def getWriter(self, directory=None, analyzer=None, open_mode=None,
                  similarity=None, maxBufferedDocs=None, mergePolicy=None):
        if analyzer is None:
            analyzer = LimitTokenCountAnalyzer(WhitespaceAnalyzer(), 10000)

        config = self.getConfig(analyzer)
        if open_mode is None:
            open_mode = IndexWriterConfig.OpenMode.CREATE
        config.setOpenMode(open_mode)
        if similarity is not None:
            config.setSimilarity(similarity)
        if maxBufferedDocs is not None:
            config.setMaxBufferedDocs(maxBufferedDocs)
        if mergePolicy is not None:
            config.setMergePolicy(mergePolicy)

        if directory is None:
            directory = self.directory

        return IndexWriter(directory, config)

    def getSearcher(self, directory=None, reader=None):
        if reader is not None:
            return IndexSearcher(reader)
        return IndexSearcher(self.getReader(directory=directory))

    def getReader(self, directory=None):
        if directory is None:
            directory = self.directory
        return DirectoryReader.open(directory)

    def getOnlyLeafReader(self, reader):
        subReaders = reader.leaves()
        if subReaders.size() != 1:
            # format explicitly: concatenating the reader and an int to a
            # string would raise a TypeError
            raise ValueError("%s has %d segments instead of exactly one" %
                             (reader, subReaders.size()))
        return subReaders.get(0).reader()

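# A minimal sketch of how this base class is typically subclassed; the test
# name, field, and query below are hypothetical, not from the original source.
class RAMIndexTest(PyLuceneTestCase):

    def testFindsIndexedDocument(self):
        writer = self.getWriter()      # defaults to self.directory
        doc = Document()
        doc.add(Field("body", "hello world", TextField.TYPE_STORED))
        writer.addDocument(doc)
        writer.close()

        searcher = self.getSearcher()  # opens a reader on self.directory
        hits = searcher.search(TermQuery(Term("body", "hello")), 10)
        self.assertEqual(1, hits.totalHits.value)
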
class PyLuceneTestCase(TestCase):

    def __init__(self, *args):
        super(PyLuceneTestCase, self).__init__(*args)
        self.TEST_VERSION = Version.LUCENE_CURRENT

    def setUp(self):
        self.directory = RAMDirectory()

    def tearDown(self):
        self.directory.close()

    def getConfig(self, analyzer=None):
        return IndexWriterConfig(self.TEST_VERSION, analyzer)

    def getWriter(self, directory=None, analyzer=None, open_mode=None,
                  similarity=None, maxBufferedDocs=None, mergePolicy=None):
        if analyzer is None:
            analyzer = LimitTokenCountAnalyzer(
                WhitespaceAnalyzer(self.TEST_VERSION), 10000)

        config = self.getConfig(analyzer)
        if open_mode is None:
            open_mode = IndexWriterConfig.OpenMode.CREATE
        config.setOpenMode(open_mode)
        if similarity is not None:
            config.setSimilarity(similarity)
        if maxBufferedDocs is not None:
            config.setMaxBufferedDocs(maxBufferedDocs)
        if mergePolicy is not None:
            config.setMergePolicy(mergePolicy)

        if directory is None:
            directory = self.directory

        return IndexWriter(directory, config)

    def getSearcher(self, directory=None, reader=None):
        if reader is not None:
            return IndexSearcher(reader)
        return IndexSearcher(self.getReader(directory=directory))

    def getReader(self, directory=None):
        if directory is None:
            directory = self.directory
        return DirectoryReader.open(directory)

def __init__(self, folder=None, fields=[], similarity="tfidf"):
    self.jcc = lucene.initVM()

    if folder:
        self.directory = SimpleFSDirectory(File(folder))
    else:
        self.directory = RAMDirectory()

    self.fields = {}
    for field in fields:
        ft = FieldType()
        for pname, pvalue in field.props.items():
            # map each property name onto the matching FieldType setter
            setter = getattr(ft, "set" + pname.capitalize())
            setter(pvalue)
        ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
        # ft.setOmitNorms(True)
        self.fields[field.name] = ft

    self.similarity = similarity.lower()
    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    self.writer = None
    self.searcher = None

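# A hedged sketch of the field-spec objects this constructor expects: each one
# needs a .name and a .props dict whose keys map onto FieldType setters via
# "set" + key.capitalize(). The FieldSpec class is invented for illustration.
class FieldSpec:
    def __init__(self, name, props):
        self.name = name
        self.props = props

fields = [FieldSpec("body", {"stored": True, "tokenized": True})]
# For this spec, the loop above ends up calling
# getattr(ft, "setStored")(True) and getattr(ft, "setTokenized")(True).
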
def __init__(self):
    # self.analyzer = StandardAnalyzer()
    # self.analyzer = PersianAnalyzer(StopFilter.makeStopSet(sw))
    # self.analyzer = PersianAnalyzer()
    self.analyzer = StopAnalyzer(Paths.get(Config.stop_words_address))
    self.config = IndexWriterConfig(self.analyzer)
    self.index = RAMDirectory()
    self.w = IndexWriter(self.index, self.config)

def __init__(self, root, analyzer):
    self.store = RAMDirectory()
    self.analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(self.analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    self.writer = IndexWriter(self.store, config)
    self.numDocs = self.indexDocs(root, self.writer)
    self.writer.commit()
    self.writer.close()

def __init__(self):
    indexDir = RAMDirectory()
    analyzer = SmartChineseAnalyzer()
    writerConfig = IndexWriterConfig(analyzer)

    # create new directory, remove previously indexed documents
    writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writerConfig.setSimilarity(mySimilarity())
    logger.debug('search similarity: {}'.format(writerConfig.getSimilarity()))

    self.indexDir = indexDir
    self.writer = IndexWriter(indexDir, writerConfig)

def retrival_answer(MAX):
    lucene.initVM()
    directory = RAMDirectory()
    # on-disk index location (unused below; the RAMDirectory is indexed instead)
    indexDir = SimpleFSDirectory(Paths.get('index'))
    writerConfig = IndexWriterConfig(StandardAnalyzer())
    writer = IndexWriter(directory, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading lines from Document..."

    process_doc = open("Huawei_result/document.txt", "r")
    doc_line = process_doc.readlines()
    for l in doc_line:
        doc = Document()
        doc.add(TextField("text", l, Field.Store.YES))
        writer.addDocument(doc)
    print "Indexed from %d docs in index" % writer.numDocs()
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()

    accuracy = []
    process_query = open("Huawei_result/query.txt", "r")
    query_line = process_query.readlines()
    for n, one_query in enumerate(query_line):
        analyzer = StandardAnalyzer()
        # reader = IndexReader.open(SimpleFSDirectory(Paths.get('index')))
        searcher = IndexSearcher(DirectoryReader.open(directory))
        # searcher = IndexSearcher(reader)
        query = QueryParser("text", analyzer).parse(one_query)
        hits = searcher.search(query, MAX)
        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        # print "The groundtruth document is:", doc_line[n]
        candidate_doc = []
        for hit in hits.scoreDocs:
            # print hit.score, hit.doc, hit.toString()
            doc = searcher.doc(hit.doc)
            # print doc.get("text").encode("utf-8")
            candidate_doc.append(doc.get("text"))

        # fuzzy-match the ground-truth line against the retrieved candidates
        choices = process.extract(unicode(doc_line[n]), candidate_doc)
        flag = 0
        for i in range(len(choices)):
            if choices[i][1] >= 89:
                flag = 1
        if flag == 1:
            accuracy.append(1)
        else:
            accuracy.append(0)

    final_accuracy = float(sum(accuracy)) / float(len(accuracy))
    print "the final accuracy is:", final_accuracy

def create_miniindex(docs):
    index_store = RAMDirectory()
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

    writer = IndexWriter(index_store, config)
    for doc in docs:
        writer.addDocument(doc)

    writer.commit()
    writer.close()
    return index_store

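# A minimal usage sketch for create_miniindex; the document content is made
# up, and lucene.initVM() is assumed to have been called already.
doc = Document()
doc.add(Field("body", "a tiny in-memory test corpus", TextField.TYPE_STORED))

index = create_miniindex([doc])
searcher = IndexSearcher(DirectoryReader.open(index))
hits = searcher.search(TermQuery(Term("body", "corpus")), 10)
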
def __init__(self, indexDir="", debug=False, verbose=False): """ :Parameters: - `indexDir`: Path where the Index will be saved. (Str) - `debug`: Create the Index in RAM Memory (indexDir will be ignored). (Boolean) - `verbose`: Provide additional information about the initialization process. (Boolean) """ self.__verbose = verbose if indexDir != "": INDEX_DIR = indexDir else: INDEX_DIR = os.path.dirname( os.path.realpath(__file__)) + "/luceneIndex" if not os.path.exists(INDEX_DIR): os.makedirs(INDEX_DIR) self.__boAppend = False else: self.__boAppend = True # Initialize lucene and JVM lucene.initVM() # Get index storage if debug: # Store the index in memory self.__indexDir = RAMDirectory() self.__boAppend = False INDEX_DIR = "RAM Memory" else: # Store an index on disk self.__indexDir = SimpleFSDirectory(Paths.get(INDEX_DIR)) # Create Content FieldType self.__contentType = FieldType() self.__contentType.setIndexOptions(IndexOptions.DOCS_AND_FREQS) self.__contentType.setTokenized(True) self.__contentType.setStored(True) self.__contentType.setStoreTermVectors(True) self.__contentType.setStoreTermVectorPositions(True) self.__contentType.freeze() # Get the Analyzer self.__analyzer = StandardAnalyzer( StandardAnalyzer.ENGLISH_STOP_WORDS_SET) # Print Indexer Information print("Lucene version is: ", lucene.VERSION) print("Index Directory: ", INDEX_DIR)
def testTieBreaker(self):
    # MultiTermQuery provides (via attribute) information about which values
    # must be competitive to enter the priority queue.
    #
    # FuzzyQuery optimizes itself around this information; if the attribute
    # is not implemented correctly, there will be problems!

    directory = RAMDirectory()
    writer = self.getWriter(directory=directory)
    self._addDoc("a123456", writer)
    self._addDoc("c123456", writer)
    self._addDoc("d123456", writer)
    self._addDoc("e123456", writer)

    directory2 = RAMDirectory()
    writer2 = self.getWriter(directory=directory2)
    self._addDoc("a123456", writer2)
    self._addDoc("b123456", writer2)
    self._addDoc("b123456", writer2)
    self._addDoc("b123456", writer2)
    self._addDoc("c123456", writer2)
    self._addDoc("f123456", writer2)

    ir1 = writer.getReader()
    ir2 = writer2.getReader()

    mr = MultiReader([ir1, ir2])
    searcher = self.getSearcher(reader=mr)
    fq = FuzzyQuery(Term("field", "z123456"), 1, 0, 2, False)
    docs = searcher.search(fq, 2)
    self.assertEqual(5, docs.totalHits.value)  # 5 docs, from the a and b's

    mr.close()
    ir1.close()
    ir2.close()
    writer.close()
    writer2.close()
    directory.close()
    directory2.close()

def index_and_search_sentence(list_paragraph, question):
    ramDir = RAMDirectory()
    analyzer = SmartChineseAnalyzer()
    myIndexer = SSQA_S_Indexer(ramDir, analyzer)
    mySearcher = None
    try:
        sent_num = 0
        logger.info("Start indexing sentences...")
        for paragraph in tqdm(list_paragraph):
            sentences = re.split('#', paragraph)
            for sent in sentences:
                myIndexer.add(sent)
                sent_num += 1
        logger.info("Indexed {} sentences.".format(sent_num))

        # close (and thereby commit) the index before searching it
        myIndexer.close()
        myIndexer = None

        mySearcher = SSQA_S_Searcher(ramDir, analyzer)
        return mySearcher.search(question, 1)
    finally:
        # guard the close calls: the searcher may never have been created,
        # and the indexer may already be closed
        if myIndexer is not None:
            myIndexer.close()
        if mySearcher is not None:
            mySearcher.close()

def __init__(self, index_dir, use_ram=False, jvm_ram=None):
    global lucene_vm_init

    if not lucene_vm_init:
        if jvm_ram:
            # e.g. jvm_ram = "8g"
            print "Increased JVM ram"
            lucene.initVM(vmargs=['-Djava.awt.headless=true'], maxheap=jvm_ram)
        else:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True

    self.dir = SimpleFSDirectory(Paths.get(index_dir))
    self.use_ram = use_ram
    if use_ram:
        print "Using ram directory..."
        # load the on-disk index into memory
        self.ram_dir = RAMDirectory(self.dir, IOContext.DEFAULT)

    self.analyzer = None
    self.reader = None
    self.searcher = None
    self.writer = None
    self.ldf = None
    print "Connected to index " + index_dir

def __init__(self, dest=None):
    """
    create an Apache Lucene indexer

    input:
        dest    destination to store index information. If not set, use RAM.
    """
    # where to store information: file or ram
    if dest:
        _dir = FSDirectory.open(java.io.File(dest))
    else:
        _dir = RAMDirectory()
    self.directory = _dir

    # analyser
    self.analyser = StandardAnalyzer(Version.LUCENE_CURRENT)

    # index writer
    cfg = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyser)
    cfg.setDefaultWriteLockTimeout(6000)
    self.idx_writer = IndexWriter(self.directory, cfg)

def indexer(documents_file):
    analyzer = StandardAnalyzer()
    # creating a directory in RAM
    directory = RAMDirectory()
    config = IndexWriterConfig(analyzer)
    writer = IndexWriter(directory, config)

    # indexing the documents
    doc = None
    lines = documents_file.readlines()
    for line_number in range(len(lines)):
        # indexing document ID
        if lines[line_number].startswith(".U"):
            doc_id = lines[line_number + 1].strip()
            # flush the previous document before starting a new one
            # (the original added an empty first document here)
            if doc is not None:
                writer.addDocument(doc)
            doc = Document()
            doc.add(Field("DocID", doc_id, TextField.TYPE_STORED))
        # indexing document description
        elif lines[line_number].startswith(".W"):
            paragraph = lines[line_number + 1].strip()
            paragraph = search.stop_words(paragraph)
            doc.add(Field("DocParagraph", paragraph, TextField.TYPE_STORED))
        # indexing document title
        elif lines[line_number].startswith(".T"):
            paragraph = lines[line_number + 1].strip()
            paragraph = search.stop_words(paragraph)
            doc.add(Field("DocParagraph", paragraph, TextField.TYPE_STORED))
        # indexing document keywords
        elif lines[line_number].startswith(".M"):
            paragraph = lines[line_number + 1].strip()
            paragraph = search.stop_words(paragraph)
            doc.add(Field("DocParagraph", paragraph, TextField.TYPE_STORED))

    # add the last document
    if doc is not None:
        writer.addDocument(doc)
    writer.close()
    return directory, analyzer

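# A hedged sketch of consuming the (directory, analyzer) pair returned by
# indexer(); the file name and query string here are purely illustrative.
with open("medline_docs.txt") as documents_file:
    directory, analyzer = indexer(documents_file)

searcher = IndexSearcher(DirectoryReader.open(directory))
query = QueryParser("DocParagraph", analyzer).parse("therapy")
for score_doc in searcher.search(query, 5).scoreDocs:
    print(searcher.doc(score_doc.doc).get("DocID"))
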
nltk.download('gutenberg')
# nltk.corpus.gutenberg.fileids()
gutenberg_list = [
    'austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt',
    'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt',
    'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt',
    'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt',
    'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt',
    'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt'
]

lucene.initVM(vmargs=['-Djava.awt.headless=true'])
analyzer = StandardAnalyzer()

# store the index in memory
directory = RAMDirectory()
# # store the index in the file system
# directory = FSDirectory.open(Paths.get("index"))

config = IndexWriterConfig(analyzer)
iwriter = IndexWriter(directory, config)

doc = Document()
text = "This is the text to be indexed."
doc.add(Field("fieldname", text, TextField.TYPE_STORED))
iwriter.addDocument(doc)
iwriter.close()

# now search the index
ireader = DirectoryReader.open(directory)
isearcher = IndexSearcher(ireader)

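# Continuing the example above: a brief, hedged sketch of running a query
# against the freshly built in-memory index (assumes QueryParser from
# org.apache.lucene.queryparser.classic is imported).
parser = QueryParser("fieldname", analyzer)
query = parser.parse("text")
hits = isearcher.search(query, 10).scoreDocs
for hit in hits:
    hit_doc = isearcher.doc(hit.doc)
    print(hit_doc.get("fieldname"))
ireader.close()
directory.close()
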
def create_document(file_name):
    path = INPUT_DIR + file_name   # assemble the file path
    file = open(path)              # open in read mode
    doc = Document()               # create a new document
    # add the title field (the original passed the global input_file here;
    # the parameter file_name is the same value and is used instead)
    doc.add(StringField("title", file_name, Field.Store.YES))
    # add the whole book
    doc.add(TextField("text", file.read(), Field.Store.YES))
    file.close()                   # close the file pointer
    return doc


# Initialize lucene and the JVM
lucene.initVM()

# Create a new directory. As a SimpleFSDirectory is rather slow ...
directory = RAMDirectory()  # ... we'll use a RAMDirectory!

# Get and configure an IndexWriter
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
analyzer = LimitTokenCountAnalyzer(analyzer, NoT)
config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
writer = IndexWriter(directory, config)

print "Number of indexed documents: %d\n" % writer.numDocs()
for input_file in listdir(INPUT_DIR):      # iterate over all input files
    print "Current file:", input_file
    if input_file.endswith(".txt"):        # consider only .txt files
        doc = create_document(input_file)  # call the create_document function
        writer.addDocument(doc)            # add the document to the IndexWriter
print "\nNumber of indexed documents: %d" % writer.numDocs()

        tweets.append(tweet)
    except:
        continue

# collect the fields across all tweets (the original wrapped these
# comprehensions in a redundant "for tweet in tweets" loop)
ids = [tweet['id_str'] for tweet in tweets if 'id_str' in tweet]
text = [tweet['text'] for tweet in tweets if 'text' in tweet]
lang = [tweet['lang'] for tweet in tweets if 'lang' in tweet]
geo = [tweet['geo'] for tweet in tweets if 'geo' in tweet]
place = [tweet['place'] for tweet in tweets if 'place' in tweet]
print(ids, text, lang, geo, place)

tweet_dict = {
    "ids": ids,
    "text": text,
    "lang": lang,
    "geo": geo,
    "place": place
}

lucene.initVM(vmargs=['-Djava.awt.headless=true'])

index = Document()
for i in list(tweet_dict):
    # Field expects a string value, so serialize each list
    index.add(Field(i, str(tweet_dict[i]), StringField.TYPE_STORED))

index_config = IndexWriterConfig(StandardAnalyzer())
# currently the index is kept in RAM; saving on disk is possible as well
index_direc = RAMDirectory()
index_writer = IndexWriter(index_direc, index_config)
index_writer.addDocument(index)
index_writer.commit()
index_writer.close()

def __init__(self):
    self.num_doc = 0
    self.directory = RAMDirectory()
    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
    self.writer = IndexWriter(self.directory, self.config)

## Java imports:
from org.apache.lucene.search.spell import PlainTextDictionary, SpellChecker

# boilerplate for setting up spellchecking:
from java.io import StringReader
from org.apache.lucene.store import RAMDirectory
from org.apache.lucene.index import IndexWriterConfig
from org.apache.lucene.analysis.core import KeywordAnalyzer

# Start JVM for Lucene.
lucene.initVM()

# Set up Lucene spellchecking.
dict_reader = StringReader(dict_str)
dictionary = PlainTextDictionary(dict_reader)
ramdir = RAMDirectory()
spellchecker = SpellChecker(ramdir)
spellchecker.indexDictionary(dictionary, IndexWriterConfig(KeywordAnalyzer()), True)

# Run the word correction test.
def correct_word(word):
    candidates = spellchecker.suggestSimilar(word, 10)
    if len(candidates) > 0:
        return candidates[0]
    else:
        return ''

good, bad = [], []

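# A hedged usage sketch for correct_word; the misspelled inputs below are
# invented, and the suggestions depend on what dict_str contained.
for word in ['speling', 'korrect']:
    suggestion = correct_word(word)
    if suggestion:
        good.append((word, suggestion))
    else:
        bad.append(word)
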
def __init__(_self, minR, maxR, allowNegativeRandomInts):
    _self.minR = minR
    _self.maxR = maxR
    _self.allowNegativeRandomInts = allowNegativeRandomInts
    _self.index = RAMDirectory()

def setUp(self):
    self.directory = RAMDirectory()