def create_index(storage, paths):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)
    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"
    import os
    for path in paths:
        for filen in os.listdir(path):
            text = sent_tokenize(get_data_from_file(path + filen))
            total_sent = len(text)
            # index overlapping windows of sentences, stepping by 3
            for i in range(0, total_sent, 3):
                doc = Document()
                a = i - 5 if i - 5 > 0 else 0
                sentence = ' '.join(text[a:i + 5])
                doc.add(Field("text", sentence, Field.Store.YES, Field.Index.ANALYZED))
                writer.addDocument(doc)
            print("Done %s" % (path + filen))
    print "Indexed (%d docs in index)" % writer.numDocs()
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
def create_index():
    lucene.initVM()
    if os.path.exists(prm.index_folder):
        shutil.rmtree(prm.index_folder)
    indexDir = SimpleFSDirectory(File(prm.index_folder))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    wk = wiki.Wiki(prm.pages_path)
    print "%d docs in index" % writer.numDocs()
    print "Reading files from wikipedia..."
    n = 0
    for l in wk.get_text_iter():
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        n += 1
        if n % 100000 == 0:
            print 'indexing article', n
    print "Indexed %d docs from wikipedia (%d docs in index)" % (n, writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
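# Note (editor's addition): the "id" field above is stored but also indexed
# ANALYZED, so StandardAnalyzer tokenizes what is really an opaque key. For
# exact-match lookups an unanalyzed field is the usual choice, e.g.
# Field.Index.NOT_ANALYZED in the 4.x API, or the StringField used by the
# imdb snippet near the end of this file.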
def wikipedia_indexer(storage, wikipedia_file):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)
    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"
    f = open(wikipedia_file)
    for i, line in enumerate(f):
        text = line.strip().decode('utf-8').split('\t')
        title = text[0]
        if 'disambigu' in text[0] or len(text) < 2:
            continue
        text = text[1]
        doc = Document()
        doc.add(Field("num", str(i), Field.Store.YES, Field.Index.NO))
        doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("text", text, Field.Store.NO, Field.Index.ANALYZED))
        writer.addDocument(doc)
        if writer.numDocs() % 1000 == 0:
            print "Indexed (%d docs in index) Last %d" % (writer.numDocs(), i)
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
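# Companion sketch (editor's addition, not part of the original source):
# querying the index that wikipedia_indexer() builds. Field names match the
# indexer above; "text" is Field.Store.NO there, so only "num" and "title"
# can be read back from a hit. It assumes the usual search imports
# (DirectoryReader, IndexSearcher, QueryParser) are available; for exact
# parity the searcher should reuse the same stopword set the indexer loaded,
# the default StandardAnalyzer is used here only to keep the sketch short.
def wikipedia_search(storage, q, max_hits=10):
    reader = DirectoryReader.open(SimpleFSDirectory(File(storage)))
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    query = QueryParser("text", analyzer).parse(QueryParser.escape(q))
    for hit in searcher.search(query, max_hits).scoreDocs:
        doc = searcher.doc(hit.doc)
        print hit.score, doc.get("num"), doc.get("title")
    reader.close()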
def index_files():
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    BASE_DIR = path.dirname(path.abspath(sys.argv[0]))
    INPUT_DIR = BASE_DIR + "/input/"
    INDEX_DIR = BASE_DIR + "/lucene_index/"
    NoT = 100000  # Number of Tokens
    print "------------------------------------------------------"
    print "PyLucene Demo started (lucene_demo.py)"
    print "Python version: %d.%d.%d" % (sys.version_info.major, sys.version_info.minor, sys.version_info.micro)
    print 'Lucene version:', lucene.VERSION
    print "------------------------------------------------------\n"
    # lucene.initVM()
    # directory = RAMDirectory()
    index_path = Paths.get(INDEX_DIR)
    directory = SimpleFSDirectory(index_path)
    analyzer = StandardAnalyzer()
    analyzer = LimitTokenCountAnalyzer(analyzer, NoT)
    config = IndexWriterConfig(analyzer)
    writer = IndexWriter(directory, config)
    print "Number of indexed documents: %d\n" % writer.numDocs()
    for input_file in listdir(INPUT_DIR):  # iterate over all input files
        print "Current file:", input_file
        if input_file.endswith(".json"):
            with open(INPUT_DIR + input_file) as f:
                for line in f:
                    # doc = create_document(line, input_file)  # call the create_document function
                    o = json.loads(line)
                    doc = Document()  # create a new document
                    doc.add(TextField("filename", input_file, Field.Store.YES))
                    doc.add(TextField("username", o['user']['screen_name'], Field.Store.YES))
                    doc.add(TextField("text", o['text'], Field.Store.YES))
                    if o['user']['location']:
                        doc.add(TextField("location", o['user']['location'], Field.Store.YES))
                    doc.add(TextField("time", o['created_at'], Field.Store.YES))
                    writer.addDocument(doc)  # add the document to the IndexWriter
    print "\nNumber of indexed documents: %d" % writer.numDocs()
    writer.close()
    print "Finished\n"
    print "-----------------------------------------------------"
def retrival_answer(MAX):
    lucene.initVM()
    directory = RAMDirectory()
    indexDir = SimpleFSDirectory(Paths.get('index'))
    writerConfig = IndexWriterConfig(StandardAnalyzer())
    writer = IndexWriter(directory, writerConfig)
    print "%d docs in index" % writer.numDocs()
    print "Reading lines from Document..."
    process_doc = open("Huawei_result/document.txt", "r")
    doc_line = process_doc.readlines()
    for l in doc_line:
        doc = Document()
        doc.add(TextField("text", l, Field.Store.YES))
        writer.addDocument(doc)
    print "Indexed from %d docs in index" % writer.numDocs()
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()

    accuracy = []
    process_query = open("Huawei_result/query.txt", "r")
    query_line = process_query.readlines()
    for n, one_query in enumerate(query_line):
        analyzer = StandardAnalyzer()
        # reader = IndexReader.open(SimpleFSDirectory(Paths.get('index')))
        searcher = IndexSearcher(DirectoryReader.open(directory))
        # searcher = IndexSearcher(reader)
        query = QueryParser("text", analyzer).parse(one_query)
        hits = searcher.search(query, MAX)
        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        # print "The groundtruth document is:", doc_line[n]
        candidate_doc = []
        for hit in hits.scoreDocs:
            # print hit.score, hit.doc, hit.toString()
            doc = searcher.doc(hit.doc)
            # print doc.get("text").encode("utf-8")
            candidate_doc.append(doc.get("text"))
        choices = process.extract(unicode(doc_line[n]), candidate_doc)
        flag = 0
        for i in range(len(choices)):
            if choices[i][1] >= 89:
                flag = 1
        if flag == 1:
            accuracy.append(1)
        else:
            accuracy.append(0)

    final_accuracy = float(sum(accuracy)) / float(len(accuracy))
    print "the final accuracy is:", final_accuracy
class LuceneIndexer:

    def __init__(self, path_to_save):
        self.path_to_save = path_to_save
        self.num_docs = 0
        lucene.initVM()
        self.indexDir = SimpleFSDirectory(File(self.path_to_save))
        self.analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
        self.analyzer2 = WhitespaceAnalyzer(Version.LUCENE_4_10_1)
        self.writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, self.analyzer2)
        self.writer = IndexWriter(self.indexDir, self.writerConfig)

    def add_document(self, fields, header, id_):
        doc = Document()
        if len(fields) > len(header):
            sys.stderr.write('SKIPPED_DOC\tunexpected_num_lines\t%s\n' % str(id_))
            for field in fields:
                sys.stderr.write('%s\n' % field)
            return
        for idx, field in enumerate(fields):
            fname, fieldtype = header[idx]
            if fieldtype is IntField:
                field = int(field)
            doc.add(fieldtype(fname, field, Field.Store.YES))
        self.writer.addDocument(doc)
        self.num_docs += 1

    def close(self):
        print 'Indexed %d lines from stdin (%d docs in index)' % (self.num_docs, self.writer.numDocs())
        self.writer.close()
def index():
    # Initialize lucene and the JVM
    # lucene.initVM()
    GLOBALDIRECTORY = getDirectory()

    # IndexWriter config
    analyzer = StandardAnalyzer()
    analyzer = LimitTokenCountAnalyzer(analyzer, tokenCount)
    config = IndexWriterConfig(analyzer)
    writer = IndexWriter(GLOBALDIRECTORY, config)

    fileNames = getTxtFile(textfileDirectory)  # creates a document for each tweet
    for file in fileNames:
        data = getData(file)
        for tweets in data:
            if 'text' in tweets:
                doc = createDocument_tweet(tweets)
                writer.addDocument(doc)  # add the document to IndexWriter
        print file

    print "\nNumber of indexed documents: %d" % writer.numDocs()  # number of documents indexed, for testing
    writer.close()
    print "Indexing done!\n"
    print "------------------------------------------------------"
    return GLOBALDIRECTORY
def lucene_indexing():
    lucene.initVM()
    index_dir = os.getcwd()
    dir = SimpleFSDirectory(File(index_dir))
    analyzer = StandardAnalyzer(Version.LUCENE_48)
    index_writer_config = IndexWriterConfig(Version.LUCENE_48, analyzer)
    index_writer = IndexWriter(dir, index_writer_config)
    for tfile in glob.glob(os.path.join(index_dir, '*.txt')):
        print "Indexing: ", tfile
        document = Document()
        with open(tfile, 'r') as f:
            content = f.read()
        document.add(Field("text", content, Field.Store.YES, Field.Index.ANALYZED))
        document.add(Field("title", tfile, Field.Store.YES, Field.Index.ANALYZED))
        index_writer.addDocument(document)
    print index_writer.numDocs()
    index_writer.close()
def indexer(docNumber, docText):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File("index/"))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, PorterStemmerAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    doc = Document()
    doc.add(Field("docNumber", docNumber, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("docText", docText, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
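# Editor's sketch (assumptions flagged): indexer() above re-initializes the
# JVM and opens a fresh IndexWriter for every single document, which is
# wasteful (and initVM() may only be called once per process). A batched
# variant under the same assumptions; PorterStemmerAnalyzer is the custom
# analyzer from the original module.
def index_batch(docs):
    """docs: iterable of (docNumber, docText) pairs."""
    lucene.initVM()
    indexDir = SimpleFSDirectory(File("index/"))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, PorterStemmerAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    for docNumber, docText in docs:
        doc = Document()
        doc.add(Field("docNumber", docNumber, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("docText", docText, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()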
def create_index(index):
    indexDir = SimpleFSDirectory(File(index))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)
    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"
    f = open('f:/nlp/data/questions/combine.txt')
    for line in f:
        line = get_data_from_text(line.decode('utf-8'))
        doc = Document()
        field = Field("text", line, Field.Store.YES, Field.Index.ANALYZED)
        field.setBoost(2.0)
        doc.add(field)
        writer.addDocument(doc)
    print "Indexed (%d docs in index)" % writer.numDocs()
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
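# Note (editor's addition): setBoost(2.0) above is an index-time boost, which
# Lucene 4.x folds into the field norm at write time. The query-time
# equivalent is a caret boost on the query string itself, e.g. parsing
# "(some question)^2.0" with the same analyzer; that form is easier to tune
# because changing it does not require reindexing.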
def build_index(file_dir):
    indexDir = SimpleFSDirectory(Paths.get(file_dir + "/lucene_index/"))
    config = IndexWriterConfig(WhitespaceAnalyzer())
    # append so the numDocs() guard below can detect an existing index;
    # OpenMode.CREATE would silently wipe it first
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = IndexWriter(indexDir, config)

    # t1 = FieldType()
    # t1.setStored(True)
    # t1.setTokenized(False)
    # t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    #
    # t2 = FieldType()
    # t2.setStored(True)
    # t2.setTokenized(True)
    # t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    print("%d docs in index" % writer.numDocs())
    if writer.numDocs():
        print("Index already built.")
        return
    with open(file_dir + "/train/train.ast.src") as fc:
        codes = [re.sub("[\W\s]+|AND|NOT|OR", ' ', line.strip()) for line in fc.readlines()]
    for k, code in enumerate(codes):
        doc = Document()
        doc.add(StoredField("id", str(k)))
        doc.add(TextField("code", code, Field.Store.YES))
        writer.addDocument(doc)
    print("Closing index of %d docs..." % writer.numDocs())
    writer.close()
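# Companion sketch (editor's addition): retrieving the nearest snippets from
# the index build_index() creates. The analyzer must match the one used at
# write time (WhitespaceAnalyzer), and the same "[\W\s]+|AND|NOT|OR" scrub is
# applied to the query so no QueryParser metacharacters leak through. The
# imports below are assumptions about the surrounding module.
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher

def search_code(file_dir, query_str, topk=5):
    directory = SimpleFSDirectory(Paths.get(file_dir + "/lucene_index/"))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    cleaned = re.sub("[\W\s]+|AND|NOT|OR", ' ', query_str.strip())
    query = QueryParser("code", WhitespaceAnalyzer()).parse(cleaned)
    hits = searcher.search(query, topk)
    return [(hit.score, searcher.doc(hit.doc).get("code")) for hit in hits.scoreDocs]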
def IndexDocs(self, documents): """ Index documents under the directory :Parameters: - `documents`: Documents to be indexed (List) """ # Get the Writer Configuration writerConfig = IndexWriterConfig(self.__analyzer) # Get index writer writer = IndexWriter(self.__indexDir, writerConfig) for document in documents: # Create a document that would we added to the index doc = Document() # Add a field to this document doc.add(TextField(Indexer.NAME, document['name'], Field.Store.YES)) doc.add( Field(Indexer.CONTENT, document['content'], self.__contentType)) doc.add( StringField(Indexer.DATE, document['date'], Field.Store.YES)) doc.add(StringField(Indexer.URL, document['url'], Field.Store.YES)) doc.add( TextField(Indexer.TAGS, self.__qualifyTags(document['tags']), Field.Store.YES)) doc.add( LongPoint(Indexer.TIMESTAMP, self.__getTimestamp(document['date']))) # Add or update the document to the index if not self.__boAppend: # New index, so we just add the document (no old document can be there): if self.__verbose: print("Adding " + document['name']) writer.addDocument(doc) else: # Existing index (an old copy of this document may have been indexed) so # we use updateDocument instead to replace the old one matching the exact # path, if present: if self.__verbose: print("Updating " + document['name']) writer.updateDocument(Term(Indexer.NAME, document['name']), doc) # Print index information and close writer print("Indexed %d documents (%d docs in index)" % (len(documents), writer.numDocs())) writer.close()
def index(self):
    if not (os.path.exists(self._dataDir) and os.path.isdir(self._dataDir)):
        raise IOError, "%s does not exist or is not a directory" % self._dataDir
    dir = SimpleFSDirectory(Paths.get(self._indexDir))
    writer = IndexWriter(dir, StandardAnalyzer(), True, IndexWriter.MaxFieldLength.LIMITED)
    writer.setUseCompoundFile(False)
    self.indexDirectory(writer, self._dataDir)
    numIndexed = writer.numDocs()
    writer.optimize()
    writer.close()
    dir.close()
    return numIndexed
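# Note (editor's addition): the method above mixes API generations. Paths.get
# is the Lucene 4+ style, while the IndexWriter(dir, analyzer, create,
# MaxFieldLength) constructor, setUseCompoundFile on the writer, and
# optimize() belong to the legacy pre-4.0 API (optimize() was replaced by
# forceMerge() in Lucene 4.0). Under a modern PyLucene this needs the
# IndexWriterConfig pattern used by the other snippets in this file.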
def indexer():
    '''Indexer: builds the movie index.'''
    lucene.initVM()
    indexDir = SimpleFSDirectory(File("index/"))
    writerConfig = IndexWriterConfig(Version.LUCENE_CURRENT, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    movies = MovieInfo.query.limit(10000).all()
    print("Index starting...")
    for n, l in enumerate(movies):
        doc = Document()
        doc.add(Field("name", l.name, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("shortcut", l.shortcut, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field('url', l.url, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        print("Item {} indexed...".format(n + 1))
    print("Index finished...")
    print("Closing index of %d docs..." % writer.numDocs())
    writer.close()
def main(): """Function to index negative situations and retrive based on input sentence""" all_sent_df = pd.read_csv("../data/sentiment_data.csv") neg = all_sent_df[all_sent_df["label"] == 1] all_neg_phrases = list(neg["phrase"]) with open("../data/negSituations.txt", "r") as fpointer: all_neg_situations = fpointer.readlines() all_neg_situations = map(lambda s: s.strip(), all_neg_situations) all_neg_phrases = map(lambda s: s.strip(), all_neg_phrases) lucene.initVM() analyzer = StandardAnalyzer() path = Paths.get('negSituationIndex') directory = SimpleFSDirectory(path) writer_config = IndexWriterConfig(analyzer) writer = IndexWriter(directory, writer_config) print(writer.numDocs()) # INDEXING ALL DOCUMENTS/ARTICLES IN THE CORPUS for each in all_neg_situations: document = Document() document.add(Field("negativeSituations", each, TextField.TYPE_STORED)) writer.addDocument(document) print(writer.numDocs()) writer.close() analyzer = StandardAnalyzer() reader = DirectoryReader.open(directory) searcher = IndexSearcher(reader) # QUERYING FOR A QUESTION with open("../data/negative_situation_to_retrieve.txt", "r") as fpointer: all_test_sent = fpointer.readlines() all_test_sent = map(lambda s: s.strip(), all_test_sent) query_parser = QueryParser("negativeSituations", analyzer) total_num = 0 tic = time.time() all_ans = [] for each in all_test_sent: total_num = total_num + 1 if total_num % 1000 == 0: print(total_num, time.time() - tic) query = query_parser.parse(query_parser.escape(each)) hits = searcher.search(query, 3) docs_scores = [hit.score for hit in hits.scoreDocs] current_ans = [] if docs_scores != []: for hit in hits.scoreDocs: doc_t = searcher.doc(hit.doc) doc_text = doc_t.get("negativeSituations") current_ans.append(doc_text) else: continue current_ans = list(set(current_ans)) all_ans.append(current_ans) print(all_ans)
class Indexer(object): # Creates index adds it to docs # indexDir Directory is where the index is created def __init__(self, indexDir): f = Paths.get(indexDir) self._dir = SimpleFSDirectory(f) analyzer = StandardAnalyzer() config = IndexWriterConfig(analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) self._writer = IndexWriter(self._dir, config) def close(self): self._writer.close() def getDoc(self, file): try: f = open(os.getcwd()+FILE_DIR+'/'+file, "r") try: c = [] s = BeautifulSoup(f, 'html.parser') text = s.findAll(text=True) c = filter(tag_vis, text) try: c = ' '.join(c) except Exception as e: c = b' '.join(c) except Exception as e: print(str(e)) return content = TextField("contents", c, Field.Store.YES) fileName = str(Paths.get(file)).split('/')[-1] fileName = fileName[:fileName.find(".")] filename = TextField("filename", fileName, Field.Store.YES) path = TextField("filepath", str(os.getcwd()+FILE_DIR+'/'+file), Field.Store.NO) doc = Document() doc.add(content) doc.add(filename) doc.add(path) return doc except Exception as e: print(type(Exception).__name__) print(str(e)) return def indexFile(self, file): if ( self.getDoc(file) is not None ): self._writer.addDocument(self.getDoc(file)) #pass in absolute path when calling this function def createIndex(self, path): for file in os.listdir(path): print(file) if os.path.isfile(path+"/"+file): self.indexFile(file) return self._writer.numDocs() def closeWriter(self): self._writer.close()
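# Typical usage of the Indexer class above (editor's sketch; FILE_DIR and the
# index location are assumed to be configured as in the original module):
#
#     indexer = Indexer("lucene_index")
#     count = indexer.createIndex(os.getcwd() + FILE_DIR)
#     print("%d documents indexed" % count)
#     indexer.closeWriter()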
lucene.initVM()
print "lucene version is:", lucene.VERSION

# Get the analyzer
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

# Get index storage
indexDir = SimpleFSDirectory(File("index/"))

# Get index writer
config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
writer = IndexWriter(indexDir, config)

print "%d docs in index" % writer.numDocs()

for d in data:
    rec = d['record']
    if not rec['product_name'] or not rec['uniq_id']:
        logging.info("Incomplete product ... skipping")
        logging.debug(rec)
        continue
    else:
        doc = Document()
        for k, v in rec.iteritems():
            if k in keys:
                doc.add(Field(k, v, Field.Store.YES, Field.Index.ANALYZED))
            else:
                if (k == 'product_specifications'):
                    specs = v['product_specification']
trainingFilePath = '/home/tarun/PE/Dataset/training_set.tsv'

lucene.initVM()

# ANALYZER
analyzer = StandardAnalyzer(util.Version.LUCENE_CURRENT)

# DIRECTORY
directory = SimpleFSDirectory(File(luceneIndexPath))

# INDEX WRITER
writerConfig = IndexWriterConfig(util.Version.LUCENE_CURRENT, analyzer)
writer = IndexWriter(directory, writerConfig)
print writer.numDocs()

# INDEXING ALL DOCUMENTS/ARTICLES IN THE CORPUS
for fileName in os.listdir(corpus):
    print fileName
    document = Document()
    article = os.path.join(corpus, fileName)
    content = open(article, 'r').read()
    document.add(Field("text", content, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(document)

print writer.numDocs()
writer.close()

# INDEX READER
reader = IndexReader.open(directory)
searcher = IndexSearcher(reader)
    del line[0:2]
    line = ' '.join(line)
    qterm = keyterm.replace("_", " ")
    if qterm not in line:
        line = qterm + ' ' + line
    doc.add(TextField("text", line, Field.Store.YES))
    return doc

lucene.initVM()
index_path = File(INDEX_DIR).toPath()
directory = SimpleFSDirectory.open(index_path)
analyzer = StandardAnalyzer()
config = IndexWriterConfig(analyzer)
writer = IndexWriter(directory, config)
print("Number of documents:", writer.numDocs())

for input_file in listdir(INPUT_DIR):
    print("Current file:", input_file)
    if input_file.endswith(".txt"):
        path = INPUT_DIR + input_file
        with open(path) as file:
            # index every non-empty line, including the first one
            for line in file:
                if len(line.strip()) != 0:
                    doc = create_document(line)
                    writer.addDocument(doc)

print("finally:", writer.numDocs())
print("Indexing done!")
analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
analyzer_ws = WhitespaceAnalyzer(Version.LUCENE_4_10_1)

std_path = "%s/lucene_full_standard/" % output_path
ws_path = "%s/lucene_full_ws/" % output_path

# these paths are directories, so remove stale indexes with shutil.rmtree
if os.path.exists(std_path):
    shutil.rmtree(std_path)
if os.path.exists(ws_path):
    shutil.rmtree(ws_path)

indexDir1 = SimpleFSDirectory(File(std_path))
indexDir2 = SimpleFSDirectory(File(ws_path))
writerConfig1 = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
writerConfig2 = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer_ws)
writer1 = IndexWriter(indexDir1, writerConfig1)
writer2 = IndexWriter(indexDir2, writerConfig2)

print "%d docs in index1" % writer1.numDocs()
print "%d docs in index2" % writer2.numDocs()
print "Reading lines from sys.stdin..."

ftypes = open(LUCENE_TYPES_FILE, "w")
for n, l in enumerate(sys.stdin):
    doc = Document()
    doc_lc = Document()
    fields = l.rstrip().split("\t")
    all_ = []
    if n == 0:
        sys.stdout.write("TYPES_HEADER")
    elif n == 1:
        sys.stdout.write("\n")
    for (idx, field) in enumerate(fields):
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, StringField, TextField, StoredField
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

if __name__ == "__main__":
    lucene.initVM()
    path = Paths.get('index')
    indexDir = SimpleFSDirectory(path)
    analyzer = StandardAnalyzer()
    writerConfig = IndexWriterConfig(analyzer)
    writer = IndexWriter(indexDir, writerConfig)
    print "%d docs in index" % writer.numDocs()
    print "Reading lines from sys.stdin..."
    todo = get_all_rawtext_ids()
    for n, i in enumerate(todo):
        try:
            html = get_rawtext_by_id(i).html
            root = LH.fromstring(html)
            text = root.text_content().strip()
        except:
            # print "Failed to parse doc"
            continue
        doc = Document()
        # print text
        doc.add(TextField("text", text, Field.Store.NO))
        doc.add(StoredField("id", i))
        writer.addDocument(doc)
writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
writer = IndexWriter(indexDir, writerConfig)

try:
    con = mdb.connect('localhost', 'root', '', 'cs246')
    cur = con.cursor()
    cur.execute("SELECT * FROM article_page;")
    rows = cur.fetchall()
    n = 0
    for row in rows:
        n = n + 1
        page_id = str(row[0])
        page_title = str(row[1]).replace('_', ' ')
        doc = Document()
        doc.add(Field("title", page_title, Field.Store.YES, Field.Index.ANALYZED_NO_NORMS))
        doc.add(Field("id", page_id, Field.Store.YES, Field.Index.NO))
        writer.addDocument(doc)
    print "total number of tuples", n
except mdb.Error, e:
    print "Error %d: %s" % (e.args[0], e.args[1])
    sys.exit(1)
finally:
    if con:
        con.close()

print "Created (%d docs in index)" % writer.numDocs()
print "Closing index of %d docs..." % writer.numDocs()
writer.close()
class IndexFiles(object): def __init__(self, indexDir): if not os.path.exists(indexDir): os.mkdir(indexDir) store = SimpleFSDirectory(File(indexDir)) analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND) self.writer = IndexWriter(store, config) def index(self, file, duplicates): exact = [ duplicate['duplicate'] for duplicate in duplicates if duplicate['sim'] == 1 ] near = [ duplicate['duplicate'] for duplicate in duplicates if duplicate['sim'] < 1 ] with open(file) as file: for document in file: data = json.loads(document) if (data['url'] in exact): continue doc = self.createDoc(data['url'], data['html'], data['url'] in near) self.writer.addDocument(doc) store_outlinks(data['url'], data['outlinks']) self.writer.commit() return self.writer.numDocs() def createDoc(self, url, html, duplicate): title, contents = self.parseHtml(url, html) doc = Document() doc.add(StringField("title", title, Field.Store.YES)) doc.add(StringField("url", url, Field.Store.YES)) doc.add( StringField("duplicate", str(duplicate).lower(), Field.Store.YES)) if len(contents) > 0: doc.add(TextField("contents", contents, Field.Store.YES)) else: print "Warning: No content in %s" % url return doc def close(self): self.writer.close() def parseHtml(self, url, html): soup = BeautifulSoup(html, 'lxml') title = self.getTitle(url, soup) body = self.getBody(soup) return title, body def getTitle(self, url, soup): if soup.title: title = soup.title.get_text().strip() elif soup.find("h1"): title = " ".join(soup.find("h1").get_text().split()) else: title = url.split("/")[-1] return title def getBody(self, soup): comments = soup.findAll(text=lambda text: isinstance(text, Comment)) [comment.extract() for comment in comments] [style.decompose() for style in soup.find_all('style')] [script.decompose() for script in soup.find_all('script')] if soup.body: return soup.body.get_text(" ", strip=True) else: return soup.get_text(" ", strip=True)
from org.apache.lucene.analysis.standard import StandardAnalyzer from org.apache.lucene.document import Document, Field, IntField, StringField, TextField from org.apache.lucene.index import IndexWriter, IndexWriterConfig from org.apache.lucene.store import SimpleFSDirectory from org.apache.lucene.util import Version LUCENE_TYPES={'i':IntField,'s':StringField,'t':TextField} if __name__ == "__main__": lucene.initVM() indexDir = SimpleFSDirectory(File("data/lucene_full_v1/")) writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer()) writer = IndexWriter(indexDir, writerConfig) print "%d docs in index" % writer.numDocs() print "Reading lines from sys.stdin..." header=[] for n, l in enumerate(sys.stdin): doc = Document() fields = l.rstrip().split("\t") #add one more field to header field set, which will index the concatenated set of all fields for general searches all_ = [] if len(fields) < 1 or len(fields[0]) == 0: continue for (idx,field) in enumerate(fields): if n == 0: typechar = field[-1] if typechar not in set(['t','s','i']): sys.stderr.write("unexpected type char in last character position of header field: %s\n" % (field)) exit(-1)
class QuestionLuceneSearch(): def __init__(self): self.env = lucene.initVM(initialheap='6g', maxheap='6g', vmargs=['-Djava.awt.headless=true']) self.vocab = None BooleanQuery.setMaxClauseCount(2048) if not os.path.exists(prm.index_folder): print('Creating index at', prm.index_folder) if prm.docs_path == prm.docs_path_term: add_terms = True else: add_terms = False self.create_index(prm.index_folder, prm.docs_path, add_terms) if prm.local_index_folder: print('copying index from', prm.index_folder, 'to', prm.local_index_folder) if os.path.exists(prm.local_index_folder): print('Folder', prm.local_index_folder, 'already exists! Doing nothing.') else: shutil.copytree(prm.index_folder, prm.local_index_folder) self.index_folder = prm.local_index_folder else: self.index_folder = prm.index_folder fsDir = MMapDirectory(Paths.get(prm.index_folder)) self.searcher = IndexSearcher(DirectoryReader.open(fsDir)) self.searcher.setSimilarity(BM25Similarity()) if prm.docs_path != prm.docs_path_term: if not os.path.exists(prm.index_folder_term): print('Creating index at', prm.index_folder_term) self.create_index(prm.index_folder_term, prm.docs_path_term, add_terms=True) if prm.local_index_folder_term: print('copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term) if os.path.exists(prm.local_index_folder_term): print('Folder', prm.local_index_folder_term, 'already exists! Doing nothing.') else: shutil.copytree(prm.index_folder_term, prm.local_index_folder_term) self.index_folder_term = prm.local_index_folder_term else: self.index_folder_term = prm.index_folder_term fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term)) self.searcher_term = IndexSearcher(DirectoryReader.open(fsDir_term)) self.analyzer = StandardAnalyzer() self.pool = ThreadPool(processes=prm.n_threads) self.cache = {} print('Loading Text-ID mapping...') self.text_id_map, self.id_text_map = self.get_text_id_map() def get_text_id_map(self): # get number of docs n_docs = self.searcher.getIndexReader().numDocs() text_id = {} id_text = {} query = MatchAllDocsQuery() hits = self.searcher.search(query, n_docs) for hit in hits.scoreDocs: doc = self.searcher.doc(hit.doc) idd = int(doc['id']) text = doc['text'] text_id[text] = idd id_text[idd] = text return text_id, id_text # def add_doc(self, doc_id, title, txt, add_terms): def add_doc(self, doc_id, txt, add_terms): doc = Document() txt = utils.clean(txt) if add_terms: txt_ = txt.lower() words_idx, words = utils.text2idx2([txt_], self.vocab, prm.max_terms_per_doc) words_idx = words_idx[0] words = words[0] doc.add(Field("id", str(doc_id), self.t1)) # doc.add(Field("title", title, self.t1)) doc.add(Field("text", txt, self.t2)) if add_terms: doc.add(Field("word_idx", ' '.join(map(str,words_idx)), self.t3)) doc.add(Field("word", '<&>'.join(words), self.t3)) self.writer.addDocument(doc) def create_index(self, index_folder, docs_path, add_terms=False): print('Loading Vocab...') if not self.vocab: self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words) os.mkdir(index_folder) self.t1 = FieldType() self.t1.setStored(True) self.t1.setIndexOptions(IndexOptions.DOCS) self.t2 = FieldType() self.t2.setStored(False) self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS) self.t3 = FieldType() self.t3.setStored(True) self.t3.setIndexOptions(IndexOptions.NONE) fsDir = MMapDirectory(Paths.get(index_folder)) writerConfig = IndexWriterConfig(StandardAnalyzer()) self.writer = IndexWriter(fsDir, writerConfig) print("%d docs in index" % self.writer.numDocs()) print("Indexing documents...") # import 
corpus_hdf5 # corpus = corpus_hdf5.MSMARCOCorpusHDF5(docs_path) import pickle with open(docs_path, "rb") as read_file: corpus = pickle.load(read_file) idx_cnt = 0 # for doc_id, txt in zip(corpus.get_id_iter(), corpus.get_text_iter()): # for doc_id, txt in corpus.items(): for txt in corpus: self.add_doc(idx_cnt, txt, add_terms) # not lowered if idx_cnt % 1000 == 0: print('indexing doc', idx_cnt) idx_cnt += 1 print("Index of %d docs..." % self.writer.numDocs()) self.writer.close() def search_multithread(self, qs, max_cand, max_full_cand, searcher): self.max_cand = max_cand self.max_full_cand = max_full_cand self.curr_searcher = searcher out = self.pool.map(self.search_multithread_part, qs) return out def search_multithread_part(self, q): if not self.env.isCurrentThreadAttached(): self.env.attachCurrentThread() if q in self.cache: return self.cache[q] else: try: q = q.replace('AND','\\AND').replace('OR','\\OR').replace('NOT','\\NOT') query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q)) except: print('Unexpected error when processing query:', str(q)) print('Using query "dummy".') q = 'dummy' query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q)) c = OrderedDict() hits = self.curr_searcher.search(query, self.max_cand) for i, hit in enumerate(hits.scoreDocs): doc = self.curr_searcher.doc(hit.doc) if i < self.max_full_cand: word_idx = list(map(int, doc['word_idx'].split(' '))) word = doc['word'].split('<&>') else: word_idx = [] word = [] # c[int(doc['id'])] = [word_idx, word] c[int(doc['id'])] = [word_idx, word, hit.score] # print(c) return c def search_singlethread(self, qs, max_cand, max_full_cand, curr_searcher): out = [] for q in qs: if q in self.cache: out.append(self.cache[q]) else: try: q = q.replace('AND','\\AND').replace('OR','\\OR').replace('NOT','\\NOT') query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q)) except: print('Unexpected error when processing query:', str(q)) print('Using query "dummy".') query = QueryParser("text", self.analyzer).parse(QueryParser.escape('dummy')) c = OrderedDict() hits = curr_searcher.search(query, max_cand) for i, hit in enumerate(hits.scoreDocs): doc = curr_searcher.doc(hit.doc) if i < max_full_cand: word_idx = list(map(int, doc['word_idx'].split(' '))) word = doc['word'].split('<&>') else: word_idx = [] word = [] # c[int(doc['id'])] = [word_idx, word] c[int(doc['id'])] = [word_idx, word, hit.score] out.append(c) return out def get_candidates(self, qs, max_cand, max_full_cand=None, save_cache=False, extra_terms=True): if not max_full_cand: max_full_cand = max_cand if prm.docs_path != prm.docs_path_term: max_cand2 = 0 else: max_cand2 = max_full_cand if prm.n_threads > 1: out = self.search_multithread(qs, max_cand, max_cand2, self.searcher) if (prm.docs_path != prm.docs_path_term) and extra_terms: terms = self.search_multithread(qs, max_full_cand, max_full_cand, self.searcher_term) else: out = self.search_singlethread(qs, max_cand, max_cand2, self.searcher) if (prm.docs_path != prm.docs_path_term) and extra_terms: terms = self.search_singlethread(qs, max_full_cand, max_full_cand, self.searcher_term) if (prm.docs_path != prm.docs_path_term) and extra_terms: for outt, termss in zip(out, terms): for cand_id, term in zip(list(outt.keys())[:max_full_cand], list(termss.values())): outt[cand_id] = term if save_cache: for q, c in zip(qs, out): if q not in self.cache: self.cache[q] = c return out def get_pair_scores(self, q, doc_int, save_cache=False, extra_terms=True): # if prm.n_threads > 1: # out = 
self.search_pair_score_multithread(qs_trailing_doc, self.searcher) # if (prm.docs_path != prm.docs_path_term) and extra_terms: # terms = self.search_pair_score_multithread(qs_trailing_doc, self.searcher_term) # else: # out = self.search_pair_score_singlethread(qs_trailing_doc, self.searcher) # if (prm.docs_path != prm.docs_path_term) and extra_terms: # terms = self.search_pair_score_singlethread(qs_trailing_doc, self.searcher_term) out = [] try: q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT') query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q)) # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q)) except: print('Unexpected error when processing query:', str(q)) print('Using query "dummy".') q = 'dummy' query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q)) # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q)) c = OrderedDict() exp = self.searcher.explain(query, doc_int) c[1] = exp out.append(c) return out def search_pair_score_singlethread(self, q, doc_int, searcher): out = [] try: q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT') query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q)) # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q)) except: print('Unexpected error when processing query:', str(q)) print('Using query "dummy".') q = 'dummy' query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q)) # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q)) c = OrderedDict() exp = searcher.explain(query, doc_int) c[1] = exp out.append(c) return out def search_pair_score_multithread(self, qs_trailing_doc, searcher): self.curr_searcher = searcher # out = self.pool.map(self.search_pair_score_multithread_part, product(qs,doc_int)) out = self.pool.map(self.search_pair_score_multithread_part, qs_trailing_doc) return out def search_pair_score_multithread_part(self, q_doc_int): # print(q_doc_int) spl=q_doc_int.split('<|endoftext|>') q = spl[0] print(q) doc_int = int(spl[1]) print(doc_int) if not self.env.isCurrentThreadAttached(): self.env.attachCurrentThread() try: q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT') query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q)) except: print('Unexpected error when processing query:', str(q)) print('Using query "dummy".') q = 'dummy' query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q)) c = OrderedDict() exp = self.curr_searcher.explain(query, doc_int) c[1] = exp return c
def main(indexDir, inputDir): """Creates a Lucene Index, and indexes every .json file it finds. It utilizes a stopwords.txt to filter out stop words""" lucene.initVM() logger.info("Loading stop words from stopwords.txt") f = open('stopwords.txt', 'r') stopwords = set([]) for line in f: stopwords.add(line.strip()) f.close() logger.debug('Stop words: %s' % str(stopwords)) temp = CharArraySet(Version.LUCENE_CURRENT, 1, True) for stopword in stopwords: temp.add(stopword) stopwords = temp # Create index logger.info("Creating Lucene index [%s]..." % indexDir) dir = SimpleFSDirectory(File(indexDir)) analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopwords) writerConfig = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer) writer = IndexWriter(dir, writerConfig) logger.info("Currently there are %d documents in the index..." % writer.numDocs()) # Index documents onlyfiles = [ f for f in listdir(inputDir) if isfile(join(inputDir, f)) and f.endswith('.json') ] for f in onlyfiles: try: journal_code = f.split('.')[0] f = join(inputDir, f) json_data = open(f) data = json.load(json_data) for entry in data: doc = Document() doc.add(Field("journal", journal_code, Field.Store.YES, Field.Index.NOT_ANALYZED)) doc.add(Field("url", entry['url'], Field.Store.YES, Field.Index.NOT_ANALYZED )) doc.add(Field("date", entry['date'], Field.Store.YES, Field.Index.NOT_ANALYZED )) doc.add(Field("title", entry['title'], Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(doc) json_data.close() except (IOError) as v: try: (code, message) = v except: code = 0 message = v logger.error("I/O Error: " + str(message) + " (" + str(code) + ")") logger.info("Indexed lines from stdin (%d documents in index)" % writer.numDocs()) # Wrap it up #logger.info("About to optimize index of %d documents..." % writer.numDocs()) #writer.optimize() #logger.info("...done optimizing index of %d documents" % writer.numDocs()) logger.info("Closing index of %d documents..." % writer.numDocs()) writer.close() reader = IndexReader.open(dir) with open('all.csv', 'wb') as csvfile: csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL) for i in xrange(0, reader.numDocs()): doc = reader.document(i) csvwriter.writerow([doc.get('journal'), doc.get('date'), doc.get('url').encode('utf8'), \ doc.get('title').strip().replace(',', '\,').encode('utf8')])
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.util import Version
from org.apache.lucene.index import FieldInfo, IndexWriter, IndexWriterConfig
from org.apache.lucene.document import Document, Field, FieldType

if __name__ == "__main__":
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    indexDir = "../pyFreya/freya/index/actual"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(dir, config)
    with open("data", 'r') as f:
        for doc in f.read().split("newDocSep"):
            docr = Document()
            for field in doc.split("csvSep"):
                fieldData = field.split("||")
                try:
                    docr.add(Field(fieldData[1], fieldData[2], Field.Store.YES, Field.Index.ANALYZED))
                except:
                    print "oops"
            print "\n"
            writer.addDocument(docr)
    print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % writer.numDocs()
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.commit()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    print >> sys.stderr, "...done closing index of %d documents" % writer.numDocs()
    writer.close()
class LuceneSearch(): def __init__(self): self.env = lucene.initVM(initialheap='28g', maxheap='28g', vmargs=['-Djava.awt.headless=true']) self.vocab = None BooleanQuery.setMaxClauseCount(2048) if not os.path.exists(prm.index_folder): print 'Creating index at', prm.index_folder if prm.docs_path == prm.docs_path_term: add_terms = True else: add_terms = False self.create_index(prm.index_folder, prm.docs_path, add_terms) if prm.local_index_folder: print 'copying index from', prm.index_folder, 'to', prm.local_index_folder if os.path.exists(prm.local_index_folder): print 'Folder', prm.local_index_folder, 'already exists! Doing nothing.' else: shutil.copytree(prm.index_folder, prm.local_index_folder) self.index_folder = prm.local_index_folder else: self.index_folder = prm.index_folder fsDir = MMapDirectory(Paths.get(prm.index_folder)) self.searcher = IndexSearcher(DirectoryReader.open(fsDir)) if prm.docs_path != prm.docs_path_term: if not os.path.exists(prm.index_folder_term): print 'Creating index at', prm.index_folder_term self.create_index(prm.index_folder_term, prm.docs_path_term, add_terms=True) if prm.local_index_folder_term: print 'copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term if os.path.exists(prm.local_index_folder_term): print 'Folder', prm.local_index_folder_term, 'already exists! Doing nothing.' else: shutil.copytree(prm.index_folder_term, prm.local_index_folder_term) self.index_folder_term = prm.local_index_folder_term else: self.index_folder_term = prm.index_folder_term fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term)) self.searcher_term = IndexSearcher( DirectoryReader.open(fsDir_term)) self.analyzer = StandardAnalyzer() self.pool = ThreadPool(processes=prm.n_threads) self.cache = {} print 'Loading Title-ID mapping...' self.title_id_map, self.id_title_map = self.get_title_id_map() def get_title_id_map(self): # get number of docs n_docs = self.searcher.getIndexReader().numDocs() title_id = {} id_title = {} query = MatchAllDocsQuery() hits = self.searcher.search(query, n_docs) for hit in hits.scoreDocs: doc = self.searcher.doc(hit.doc) idd = int(doc['id']) title = doc['title'] title_id[title] = idd id_title[idd] = title return title_id, id_title def add_doc(self, doc_id, title, txt, add_terms): doc = Document() txt = utils.clean(txt) if add_terms: txt_ = txt.lower() words_idx, words = utils.text2idx2([txt_], self.vocab, prm.max_terms_per_doc) words_idx = words_idx[0] words = words[0] doc.add(Field("id", str(doc_id), self.t1)) doc.add(Field("title", title, self.t1)) doc.add(Field("text", txt, self.t2)) if add_terms: doc.add(Field("word_idx", ' '.join(map(str, words_idx)), self.t3)) doc.add(Field("word", '<&>'.join(words), self.t3)) self.writer.addDocument(doc) def create_index(self, index_folder, docs_path, add_terms=False): print 'Loading Vocab...' if not self.vocab: self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words) os.mkdir(index_folder) self.t1 = FieldType() self.t1.setStored(True) self.t1.setIndexOptions(IndexOptions.DOCS) self.t2 = FieldType() self.t2.setStored(False) self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS) self.t3 = FieldType() self.t3.setStored(True) self.t3.setIndexOptions(IndexOptions.NONE) fsDir = MMapDirectory(Paths.get(index_folder)) writerConfig = IndexWriterConfig(StandardAnalyzer()) self.writer = IndexWriter(fsDir, writerConfig) print "%d docs in index" % self.writer.numDocs() print "Indexing documents..." 
doc_id = 0 import corpus_hdf5 corpus = corpus_hdf5.CorpusHDF5(docs_path) for txt in corpus.get_text_iter(): title = corpus.get_article_title(doc_id) self.add_doc(doc_id, title, txt, add_terms) if doc_id % 1000 == 0: print 'indexing doc', doc_id doc_id += 1 print "Index of %d docs..." % self.writer.numDocs() self.writer.close() def search_multithread(self, qs, max_cand, max_full_cand, searcher): self.max_cand = max_cand self.max_full_cand = max_full_cand self.curr_searcher = searcher out = self.pool.map(self.search_multithread_part, qs) return out def search_multithread_part(self, q): if not self.env.isCurrentThreadAttached(): self.env.attachCurrentThread() if q in self.cache: return self.cache[q] else: try: q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT') query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q)) except: print 'Unexpected error when processing query:', str(q) print 'Using query "dummy".' q = 'dummy' query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q)) c = OrderedDict() hits = self.curr_searcher.search(query, self.max_cand) for i, hit in enumerate(hits.scoreDocs): doc = self.curr_searcher.doc(hit.doc) if i < self.max_full_cand: word_idx = map(int, doc['word_idx'].split(' ')) word = doc['word'].split('<&>') else: word_idx = [] word = [] c[int(doc['id'])] = [word_idx, word] return c def search_singlethread(self, qs, max_cand, max_full_cand, curr_searcher): out = [] for q in qs: if q in self.cache: out.append(self.cache[q]) else: try: q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace( 'NOT', '\\NOT') query = QueryParser("text", self.analyzer).parse( QueryParser.escape(q)) except: print 'Unexpected error when processing query:', str(q) print 'Using query "dummy".' query = QueryParser("text", self.analyzer).parse( QueryParser.escape('dummy')) c = OrderedDict() hits = curr_searcher.search(query, max_cand) for i, hit in enumerate(hits.scoreDocs): doc = curr_searcher.doc(hit.doc) if i < max_full_cand: word_idx = map(int, doc['word_idx'].split(' ')) word = doc['word'].split('<&>') else: word_idx = [] word = [] c[int(doc['id'])] = [word_idx, word] out.append(c) return out def get_candidates(self, qs, max_cand, max_full_cand=None, save_cache=False, extra_terms=True): if not max_full_cand: max_full_cand = max_cand if prm.docs_path != prm.docs_path_term: max_cand2 = 0 else: max_cand2 = max_full_cand if prm.n_threads > 1: out = self.search_multithread(qs, max_cand, max_cand2, self.searcher) if (prm.docs_path != prm.docs_path_term) and extra_terms: terms = self.search_multithread(qs, max_full_cand, max_full_cand, self.searcher_term) else: out = self.search_singlethread(qs, max_cand, max_cand2, self.searcher) if (prm.docs_path != prm.docs_path_term) and extra_terms: terms = self.search_singlethread(qs, max_full_cand, max_full_cand, self.searcher_term) if (prm.docs_path != prm.docs_path_term) and extra_terms: for outt, termss in itertools.izip(out, terms): for cand_id, term in itertools.izip( outt.keys()[:max_full_cand], termss.values()): outt[cand_id] = term if save_cache: for q, c in itertools.izip(qs, out): if q not in self.cache: self.cache[q] = c return out
class LuceneSearch(): def __init__(self): self.env = lucene.initVM(initialheap='28g', maxheap='28g', vmargs=['-Djava.awt.headless=true']) self.vocab = None BooleanQuery.setMaxClauseCount(2048) if not os.path.exists(prm.index_folder): print 'Creating index at', prm.index_folder if prm.docs_path == prm.docs_path_term: add_terms = True else: add_terms = False self.create_index(prm.index_folder, prm.docs_path, add_terms) if prm.local_index_folder: print 'copying index from', prm.index_folder, 'to', prm.local_index_folder if os.path.exists(prm.local_index_folder): print 'Folder', prm.local_index_folder, 'already exists! Doing nothing.' else: shutil.copytree(prm.index_folder, prm.local_index_folder) self.index_folder = prm.local_index_folder else: self.index_folder = prm.index_folder fsDir = MMapDirectory(Paths.get(prm.index_folder)) self.searcher = IndexSearcher(DirectoryReader.open(fsDir)) if prm.docs_path != prm.docs_path_term: if not os.path.exists(prm.index_folder_term): print 'Creating index at', prm.index_folder_term self.create_index(prm.index_folder_term, prm.docs_path_term, add_terms=True) if prm.local_index_folder_term: print 'copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term if os.path.exists(prm.local_index_folder_term): print 'Folder', prm.local_index_folder_term, 'already exists! Doing nothing.' else: shutil.copytree(prm.index_folder_term, prm.local_index_folder_term) self.index_folder_term = prm.local_index_folder_term else: self.index_folder_term = prm.index_folder_term fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term)) self.searcher_term = IndexSearcher( DirectoryReader.open(fsDir_term)) self.analyzer = StandardAnalyzer() self.pool = ThreadPool(processes=prm.n_threads) self.cache = {} print 'Loading Title-ID mapping...' self.title_id_map, self.id_title_map = self.get_title_id_map() if prm.idf_path: print 'Loading IDF dictionary...' self.idf = pkl.load(open(prm.idf_path)) def get_title_id_map(self): # get number of docs n_docs = self.searcher.getIndexReader().numDocs() title_id = {} id_title = {} query = MatchAllDocsQuery() hits = self.searcher.search(query, n_docs) for hit in hits.scoreDocs: doc = self.searcher.doc(hit.doc) idd = int(doc['id']) title = doc['title'] title_id[title] = idd id_title[idd] = title return title_id, id_title def add_idf(self, txt): txt = utils.clean(txt) txt = txt.lower() df = set() for word in wordpunct_tokenize(txt): if word not in df: df.add(word) self.idf[word] += 1. def add_doc(self, doc_id, title, txt, add_terms): doc = Document() txt = utils.clean(txt) if add_terms: if prm.top_tfidf > 0: words_idx = [] words, _ = utils.top_tfidf(txt.lower(), self.idf, prm.top_tfidf, prm.min_term_freq) if len(words) == 0: words.append('unk') for w in words: if w in self.vocab: words_idx.append(self.vocab[w]) else: words_idx.append(-1) # unknown words. else: txt_ = txt.lower() words_idx, words = utils.text2idx2([txt_], self.vocab, prm.max_terms_per_doc) words_idx = words_idx[0] words = words[0] doc.add(Field("id", str(doc_id), self.t1)) doc.add(Field("title", title, self.t1)) doc.add(Field("text", txt, self.t2)) if add_terms: doc.add(Field("word_idx", ' '.join(map(str, words_idx)), self.t3)) doc.add(Field("word", '<&>'.join(words), self.t3)) self.writer.addDocument(doc) def create_index(self, index_folder, docs_path, add_terms=False): print 'Loading Vocab...' 
if not self.vocab: self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words) os.mkdir(index_folder) self.t1 = FieldType() self.t1.setStored(True) self.t1.setIndexOptions(IndexOptions.DOCS) self.t2 = FieldType() self.t2.setStored(False) self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS) self.t3 = FieldType() self.t3.setStored(True) self.t3.setIndexOptions(IndexOptions.NONE) if add_terms: if prm.top_tfidf > 0 or prm.idf_path: print 'Creating IDF dictionary...' self.idf = defaultdict(int) doc_id = 0 if docs_path.lower().endswith('.hdf5'): import corpus_hdf5 corpus = corpus_hdf5.CorpusHDF5(docs_path) for txt in corpus.get_text_iter(): self.add_idf(txt) if doc_id % 1000 == 0: print 'Creating IDF, doc', doc_id doc_id += 1 else: # ClueWeb09 import warc import gzip from bs4 import BeautifulSoup # list all files in the folder. paths = [] for root, directories, filenames in os.walk(docs_path): for filename in filenames: paths.append(os.path.join(root, filename)) for path in paths: with gzip.open(path, mode='rb') as gzf: for record in warc.WARCFile(fileobj=gzf): # remove html tags txt = BeautifulSoup( record.payload[:1000 * 1000], "lxml").get_text() # remove WARC headers. txt = '\n'.join(txt.split('\n')[10:]) self.add_idf(txt) if doc_id % 1000 == 0: print 'Creating IDF, doc', doc_id doc_id += 1 for key, val in self.idf.items(): self.idf[key] = math.log(float(doc_id) / val) pkl.dump(self.idf, open(prm.idf_path, 'wb')) fsDir = MMapDirectory(Paths.get(index_folder)) writerConfig = IndexWriterConfig(StandardAnalyzer()) self.writer = IndexWriter(fsDir, writerConfig) print "%d docs in index" % self.writer.numDocs() print "Indexing documents..." doc_id = 0 if docs_path.lower().endswith('.hdf5'): import corpus_hdf5 corpus = corpus_hdf5.CorpusHDF5(docs_path) for txt in corpus.get_text_iter(): title = corpus.get_article_title(doc_id) self.add_doc(doc_id, title, txt, add_terms) if doc_id % 1000 == 0: print 'indexing doc', doc_id doc_id += 1 else: # ClueWeb09 import warc import gzip from bs4 import BeautifulSoup # list all files in the folder. paths = [] for root, directories, filenames in os.walk(docs_path): for filename in filenames: paths.append(os.path.join(root, filename)) for path in paths: with gzip.open(path, mode='rb') as gzf: for record in warc.WARCFile(fileobj=gzf): if 'warc-trec-id' in record: title = record['warc-trec-id'] else: title = record['warc-record-id'] # remove html tags #txt = BeautifulSoup(record.payload[:1000*1000], "lxml").get_text() txt = record.payload[:1000 * 1000] # remove WARC headers. txt = '\n'.join(txt.split('\n')[10:]) self.add_doc(doc_id, title, txt, add_terms) if doc_id % 1000 == 0: print 'indexing doc', doc_id doc_id += 1 print "Index of %d docs..." % self.writer.numDocs() self.writer.close() def search_multithread(self, qs, max_cand, max_full_cand, searcher): self.max_cand = max_cand self.max_full_cand = max_full_cand self.curr_searcher = searcher out = self.pool.map(self.search_multithread_part, qs) return out def search_multithread_part(self, q): if not self.env.isCurrentThreadAttached(): self.env.attachCurrentThread() if q in self.cache: return self.cache[q] else: try: q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT') query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q)) except: print 'Unexpected error when processing query:', str(q) print 'Using query "dummy".' 
q = 'dummy' query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q)) c = OrderedDict() hits = self.curr_searcher.search(query, self.max_cand) for i, hit in enumerate(hits.scoreDocs): doc = self.curr_searcher.doc(hit.doc) if i < self.max_full_cand: word_idx = map(int, doc['word_idx'].split(' ')) word = doc['word'].split('<&>') else: word_idx = [] word = [] c[int(doc['id'])] = [word_idx, word] return c def search_singlethread(self, qs, max_cand, max_full_cand, curr_searcher): out = [] for q in qs: if q in self.cache: out.append(self.cache[q]) else: try: q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace( 'NOT', '\\NOT') query = QueryParser("text", self.analyzer).parse( QueryParser.escape(q)) except: print 'Unexpected error when processing query:', str(q) print 'Using query "dummy".' query = QueryParser("text", self.analyzer).parse( QueryParser.escape('dummy')) c = OrderedDict() hits = curr_searcher.search(query, max_cand) for i, hit in enumerate(hits.scoreDocs): doc = curr_searcher.doc(hit.doc) if i < max_full_cand: word_idx = map(int, doc['word_idx'].split(' ')) word = doc['word'].split('<&>') else: word_idx = [] word = [] c[int(doc['id'])] = [word_idx, word] out.append(c) return out def get_candidates(self, qs, max_cand, max_full_cand=None, save_cache=False, extra_terms=True): if not max_full_cand: max_full_cand = max_cand if prm.docs_path != prm.docs_path_term: max_cand2 = 0 else: max_cand2 = max_full_cand if prm.n_threads > 1: out = self.search_multithread(qs, max_cand, max_cand2, self.searcher) if (prm.docs_path != prm.docs_path_term) and extra_terms: terms = self.search_multithread(qs, max_full_cand, max_full_cand, self.searcher_term) else: out = self.search_singlethread(qs, max_cand, max_cand2, self.searcher) if (prm.docs_path != prm.docs_path_term) and extra_terms: terms = self.search_singlethread(qs, max_full_cand, max_full_cand, self.searcher_term) if (prm.docs_path != prm.docs_path_term) and extra_terms: for outt, termss in itertools.izip(out, terms): for cand_id, term in itertools.izip( outt.keys()[:max_full_cand], termss.values()): outt[cand_id] = term if save_cache: for q, c in itertools.izip(qs, out): if q not in self.cache: self.cache[q] = c return out
def main(index_dir, input_dir):
    """Creates a Lucene index and indexes every .json file it finds.

    Uses stopwords.txt to filter out stop words."""
    lucene.initVM()

    logger.info("Loading stop words from stopwords.txt")
    f = open('stopwords.txt', 'r')
    stopwords = set([])
    for line in f:
        stopwords.add(line.strip())
    f.close()
    logger.debug('Stop words: %s' % str(stopwords))
    temp = CharArraySet(1, True)
    for stopword in stopwords:
        temp.add(stopword)
    stopwords = temp

    # Create the index.
    logger.info("Creating Lucene index [%s]..." % index_dir)
    fs_dir = SimpleFSDirectory(Paths.get(index_dir))
    analyzer = StandardAnalyzer(stopwords)
    writerConfig = IndexWriterConfig(analyzer)
    writer = IndexWriter(fs_dir, writerConfig)
    logger.info("Currently there are %d documents in the index..." %
                writer.numDocs())

    # Index the documents.
    onlyfiles = [f for f in listdir(input_dir)
                 if isfile(join(input_dir, f)) and f.endswith('.json')]
    for f in onlyfiles:
        try:
            journal_code = f.split('.')[0]
            f = join(input_dir, f)
            json_data = open(f)
            data = json.load(json_data)
            for entry in data:
                doc = Document()
                doc.add(StringField("journal", journal_code, Field.Store.YES))
                doc.add(StringField("url", entry['url'], Field.Store.YES))
                doc.add(StringField("date", entry['date'], Field.Store.YES))
                doc.add(TextField("title", entry['title'], Field.Store.YES))
                writer.addDocument(doc)
            json_data.close()
        except IOError as v:
            try:
                (code, message) = v
            except (TypeError, ValueError):
                code = 0
                message = v
            logger.error("I/O Error: " + str(message) + " (" + str(code) + ")")
    logger.info("Indexed .json files (%d documents in index)" %
                writer.numDocs())

    # Wrap it up. IndexWriter.optimize() no longer exists in recent Lucene
    # releases, so the optimization step stays commented out:
    # logger.info("About to optimize index of %d documents..." % writer.numDocs())
    # writer.optimize()
    # logger.info("...done optimizing index of %d documents" % writer.numDocs())
    logger.info("Closing index of %d documents..." % writer.numDocs())
    writer.close()

    # Dump the stored fields to a CSV file.
    reader = DirectoryReader.open(fs_dir)
    with open('all.csv', 'w') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"',
                               quoting=csv.QUOTE_ALL)
        for i in range(0, reader.numDocs()):
            doc = reader.document(i)
            csvwriter.writerow([doc.get('journal'), doc.get('date'),
                                doc.get('url'),
                                doc.get('title').strip().replace(',', '\\,')])
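# A hedged invocation sketch (not part of the original snippet): the argument
# handling is illustrative, and the logger is assumed to be configured
# elsewhere in the script.
if __name__ == '__main__':
    import sys
    if len(sys.argv) != 3:
        print('usage: %s <index_dir> <input_dir>' % sys.argv[0])
        sys.exit(1)
    main(sys.argv[1], sys.argv[2])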
    return doc


# Initialize Lucene and the JVM.
lucene.initVM()

# Create a new directory. Since a SimpleFSDirectory is rather slow ...
directory = RAMDirectory()  # ... we use a RAMDirectory!

# Get and configure an IndexWriter. NoT (the maximum token count) is
# assumed to be defined earlier in the script.
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
analyzer = LimitTokenCountAnalyzer(analyzer, NoT)
config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
writer = IndexWriter(directory, config)

print "Number of indexed documents: %d\n" % writer.numDocs()
for input_file in listdir(INPUT_DIR):      # iterate over all input files
    print "Current file:", input_file
    if input_file.endswith(".txt"):        # consider only .txt files
        doc = create_document(input_file)  # build a Document from the file
        writer.addDocument(doc)            # add the document to the IndexWriter

print "\nNumber of indexed documents: %d" % writer.numDocs()
writer.close()
print "Indexing done!\n"
print "------------------------------------------------------"

# ---------------------------------------------------------------------------
# Retriever  (ASCII-art section banner in the original, truncated here)
# ---------------------------------------------------------------------------
import os
import sqlite3

import lucene
import pandas as pd
from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, StringField, TextField
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory

PATH = ''

if __name__ == "__main__":
    PATH = os.getcwd()
    lucene.initVM()
    indexDir = SimpleFSDirectory(Paths.get('index'))
    writerConfig = IndexWriterConfig(StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)

    print("%d docs in index" % writer.numDocs())
    print("Reading movies from imdb.db...")

    con = sqlite3.connect(PATH + '/imdb.db')
    df = pd.read_sql('select * from movies', con)
    con.close()

    for v in df.values:
        doc = Document()
        doc.add(StringField("id", str(v[0]), Field.Store.YES))
        doc.add(TextField("name", v[1], Field.Store.YES))
        doc.add(StringField("year", str(v[2]), Field.Store.YES))
        writer.addDocument(doc)

    print("Indexed %d movies (%d docs in index)" %
          (df.shape[0], writer.numDocs()))
    print("Closing index of %d docs..." % writer.numDocs())
    writer.close()
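    # Follow-up sketch (not in the original): querying the movie index built
    # above. The field names 'name' and 'year' match the indexing code; the
    # query string itself is illustrative.
    from org.apache.lucene.index import DirectoryReader
    from org.apache.lucene.queryparser.classic import QueryParser
    from org.apache.lucene.search import IndexSearcher

    reader = DirectoryReader.open(SimpleFSDirectory(Paths.get('index')))
    searcher = IndexSearcher(reader)
    query = QueryParser('name', StandardAnalyzer()).parse('godfather')
    hits = searcher.search(query, 10)
    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        print('%s (%s) score=%.3f' %
              (doc.get('name'), doc.get('year'), hit.score))
    reader.close()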
import sys

import lucene
from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, IntField, StringField, TextField
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

# Map a type character in the header row to a Lucene field class.
LUCENE_TYPES = {'i': IntField, 's': StringField, 't': TextField}

if __name__ == "__main__":
    lucene.initVM()
    indexDir = SimpleFSDirectory(File("lucene/"))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading lines from sys.stdin..."

    header = []
    for n, l in enumerate(sys.stdin):
        doc = Document()
        fields = l.rstrip().split("\t")
        for (idx, field) in enumerate(fields):
            if n == 0:
                # The first row is a header; the last character of each
                # column name encodes the field type.
                typechar = field[-1]
                if typechar not in set(['t', 's', 'i']):
                    sys.stderr.write("unexpected type char in last character "
                                     "position of header field: %s\n" % (field))
                    exit(-1)
                header.append([field, LUCENE_TYPES[typechar]])
            else:
                (fname, fieldtype) = header[idx]
                if fieldtype is IntField:
                    # Reconstructed from here on -- the original snippet is
                    # truncated at this line. Integer columns are parsed
                    # before being indexed.
                    doc.add(fieldtype(fname, int(field), Field.Store.YES))
                else:
                    doc.add(fieldtype(fname, field, Field.Store.YES))
        if n > 0:
            writer.addDocument(doc)

    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
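# Example input for the script above (hypothetical data; tabs shown as
# <TAB>). The trailing character of each header cell selects the field
# class: ...s -> StringField, ...t -> TextField, ...i -> IntField.
#
#   ids<TAB>titlet<TAB>yeari
#   tt0068646<TAB>The Godfather<TAB>1972
#   tt0071562<TAB>The Godfather Part II<TAB>1974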
# http://svn.apache.org/viewvc/lucene/pylucene/trunk/samples/IndexFiles.py?view=markup

json_file = sys.argv[1]
index_folder = sys.argv[2]

glog.setLevel(glog.INFO)

lucene.initVM()
store = SimpleFSDirectory(Paths.get(index_folder))
stop_words = CharArraySet(50, True)
c_analyzer = ClassicAnalyzer(stop_words)
analyzer = LimitTokenCountAnalyzer(c_analyzer, 1048576)
config = IndexWriterConfig(analyzer)
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
writer = IndexWriter(store, config)

print('%d docs in index' % writer.numDocs())
print('Indexing json files...')

# For the text field.
t1 = FieldType()
t1.setStored(False)
t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

with codecs.open(json_file, encoding='utf8') as f:
    for line in tqdm(f):
        line = line.strip()
        try:
            json_doc = json.loads(line)
        except:
            glog.warning('Error json parsing: {}'.format(line))
            continue
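        # The original snippet is cut off at this point. A plausible
        # continuation (hedged: the 'id' and 'text' keys are assumptions
        # about the JSON schema, and StringField is assumed to be imported
        # with the other document classes).
        doc = Document()
        doc.add(Field('id', str(json_doc.get('id', '')),
                      StringField.TYPE_STORED))
        doc.add(Field('text', json_doc.get('text', ''), t1))
        writer.addDocument(doc)

print('Closing index of %d docs...' % writer.numDocs())
writer.close()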
class LuceneSearch():
    """Index and search docs.

    Parameters
    ----------
    index_dir : str
        Index of the documents produced by Lucene.
    db_path : str
        File path of the SQLite database containing articles of the
        wikipedia dump (from DrQA).
    num_search_workers : int (optional), default=8
        Workers to use to accelerate searching.
    """

    def __init__(self,
                 index_dir: str,
                 db_path: str = None,
                 num_search_workers: int = 8) -> None:
        self.env = lucene.getVMEnv()  # pylint: disable=no-member
        if not self.env:
            self.env = lucene.initVM(
                initialheap='28g',  # pylint: disable=no-member
                maxheap='28g',
                vmargs=['-Djava.awt.headless=true'])

        self.num_search_workers = num_search_workers

        if not os.path.exists(index_dir):
            self.doc_db = DocDB(db_path=db_path)
            logger.info('Creating index at %s', index_dir)
            self._create_index(index_dir)

        fs_dir = MMapDirectory(Paths.get(index_dir))
        self.searcher = IndexSearcher(DirectoryReader.open(fs_dir))
        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=num_search_workers)

    def _create_index(self, index_dir: str) -> None:
        """Index documents.

        Parameters
        ----------
        index_dir : str
            The dir to store the index.
        """
        os.mkdir(index_dir)

        TITLE_FIELD = FieldType()  # pylint: disable=invalid-name
        TITLE_FIELD.setStored(True)
        TITLE_FIELD.setIndexOptions(IndexOptions.DOCS)

        TEXT_FIELD = FieldType()  # pylint: disable=invalid-name
        TEXT_FIELD.setStored(True)
        TEXT_FIELD.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        fs_dir = MMapDirectory(Paths.get(index_dir))
        writer_config = IndexWriterConfig(StandardAnalyzer())
        writer_config.setRAMBufferSizeMB(16384.0)  # 16g
        self.writer = IndexWriter(fs_dir, writer_config)
        logger.info("%d docs in index", self.writer.numDocs())
        logger.info("Indexing documents...")

        doc_ids = self.doc_db.get_doc_ids()
        for doc_id in tqdm(doc_ids, total=len(doc_ids)):
            text = self.doc_db.get_doc_text(doc_id)
            doc = Document()
            doc.add(Field("title", doc_id, TITLE_FIELD))
            doc.add(Field("text", text, TEXT_FIELD))
            self.writer.addDocument(doc)

        logger.info("Indexed %d docs.", self.writer.numDocs())
        self.writer.forceMerge(1)  # to increase search performance
        self.writer.close()

    def _search_multithread(
            self, queries: List[str],
            doc_max: int) -> List[List[Dict[str, Union[float, str]]]]:
        args = [(query, doc_max) for query in queries]
        queries_results = self.pool.starmap(self._search_multithread_part,
                                            args)
        return queries_results

    def _search_multithread_part(
            self, query: str,
            doc_max: int) -> List[Dict[str, Union[float, str]]]:
        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()

        try:
            query = QueryParser('text', self.analyzer).parse(
                QueryParser.escape(query))
        except Exception as exception:  # pylint: disable=broad-except
            logger.warning(
                colored(f'{exception}: {query}, use query dummy.', 'yellow'))
            query = QueryParser('text', self.analyzer).parse('dummy')

        query_results = []
        hits = self.searcher.search(query, doc_max)
        for hit in hits.scoreDocs:
            doc = self.searcher.doc(hit.doc)
            query_results.append({
                'score': hit.score,
                'title': doc['title'],
                'text': doc['text']
            })
        if not query_results:
            logger.warning(
                colored(
                    f'WARN: search engine returns no results for query: '
                    f'{query}.', 'yellow'))
        return query_results

    def _search_singlethread(
            self, queries: List[str],
            doc_max: int) -> List[List[Dict[str, Union[float, str]]]]:
        queries_result = []
        for query in queries:
            try:
                query = QueryParser('text', self.analyzer).parse(
                    QueryParser.escape(query))
            except Exception as exception:  # pylint: disable=broad-except
                logger.warning(
                    colored(f'{exception}: {query}, use query dummy.',
                            'yellow'))
                query = QueryParser('text', self.analyzer).parse('dummy')

            query_results = []
            hits = self.searcher.search(query, doc_max)
            for hit in hits.scoreDocs:
                doc = self.searcher.doc(hit.doc)
                query_results.append({
                    'score': hit.score,
                    'title': doc['title'],
                    'text': doc['text']
                })
            if not query_results:
                logger.warning(
                    colored(
                        f'WARN: search engine returns no results for query: '
                        f'{query}.', 'yellow'))
            queries_result.append(query_results)
        return queries_result

    def search(self,
               query: str,
               doc_max: int = 20) -> List[Dict[str, Union[float, str]]]:
        """Search a given query.

        Parameters
        ----------
        query : str
            Anything you want to search.
        doc_max : int
            Maximum number of results to return.

        Returns
        -------
        List[Dict[str, Union[float, str]]]
            Search results.
        """
        return self.batch_search([query], doc_max=doc_max)[0]

    def batch_search(
            self,
            queries: List[str],
            doc_max: int = 20) -> List[List[Dict[str, Union[float, str]]]]:
        """Search a list of queries.

        Parameters
        ----------
        queries : List[str]
            Queries list.
        doc_max : int, optional, default=20
            Maximum number of docs returned by the search engine.

        Returns
        -------
        List[List[Dict[str, Union[float, str]]]]
            Results returned by the search engine.
        """
        if self.num_search_workers > 1:
            result = self._search_multithread(queries, doc_max)
        else:
            result = self._search_singlethread(queries, doc_max)
        return result

    @staticmethod
    def pprint(search_result: List[Dict[str, Union[float, str]]]) -> None:
        """Print the results returned by the doc searcher.

        Parameters
        ----------
        search_result : List[Dict[str, Union[float, str]]]
            Results returned from the ranker.
        """
        headers = ['Rank', 'Title', 'Text', 'Score']
        table = prettytable.PrettyTable(headers)
        for i, result in enumerate(search_result):
            text, title = result['text'], result['title']
            text = text[:100] + ' ...' if len(text) > 100 else text
            title = title[:30] + ' ...' if len(title) > 30 else title
            table.add_row([i, title, text, '%.5g' % result['score']])
        print('Top Results:')
        print(table)
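# A short usage sketch for LuceneSearch (not part of the original). Both
# paths are placeholders; the DrQA-style SQLite db is only needed the first
# time, when the index is built.
if __name__ == '__main__':
    engine = LuceneSearch(index_dir='data/lucene_index',
                          db_path='data/wikipedia/docs.db')
    results = engine.search('capital of France', doc_max=5)
    LuceneSearch.pprint(results)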
class IndexFiles(object):

    def __init__(self, indexDir):
        if not os.path.exists(indexDir):
            os.mkdir(indexDir)

        store = SimpleFSDirectory(File(indexDir))
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        self.writer = IndexWriter(store, config)

    def index(self, file, duplicates):
        # Skip exact duplicates entirely; flag near-duplicates in the index.
        exact = [duplicate['duplicate'] for duplicate in duplicates
                 if duplicate['sim'] == 1]
        near = [duplicate['duplicate'] for duplicate in duplicates
                if duplicate['sim'] < 1]
        with open(file) as file:
            for document in file:
                data = json.loads(document)
                if data['url'] in exact:
                    continue
                doc = self.createDoc(data['url'], data['html'],
                                     data['url'] in near)
                self.writer.addDocument(doc)
                store_outlinks(data['url'], data['outlinks'])
        self.writer.commit()
        return self.writer.numDocs()

    def createDoc(self, url, html, duplicate):
        title, contents = self.parseHtml(url, html)
        doc = Document()
        doc.add(StringField("title", title, Field.Store.YES))
        doc.add(StringField("url", url, Field.Store.YES))
        doc.add(StringField("duplicate", str(duplicate).lower(),
                            Field.Store.YES))
        if len(contents) > 0:
            doc.add(TextField("contents", contents, Field.Store.YES))
        else:
            print "Warning: No content in %s" % url
        return doc

    def close(self):
        self.writer.close()

    def parseHtml(self, url, html):
        soup = BeautifulSoup(html, 'lxml')
        title = self.getTitle(url, soup)
        body = self.getBody(soup)
        return title, body

    def getTitle(self, url, soup):
        if soup.title:
            title = soup.title.get_text().strip()
        elif soup.find("h1"):
            title = " ".join(soup.find("h1").get_text().split())
        else:
            title = url.split("/")[-1]
        return title

    def getBody(self, soup):
        # Strip comments, styles, and scripts before extracting the text.
        comments = soup.findAll(text=lambda text: isinstance(text, Comment))
        [comment.extract() for comment in comments]
        [style.decompose() for style in soup.find_all('style')]
        [script.decompose() for script in soup.find_all('script')]
        if soup.body:
            return soup.body.get_text(" ", strip=True)
        else:
            return soup.get_text(" ", strip=True)
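# A minimal usage sketch for IndexFiles (not part of the original). The
# crawl file path and the duplicates list are illustrative; the crawl file
# is expected to hold one JSON object per line with 'url', 'html', and
# 'outlinks' keys, and store_outlinks(), called by index(), is assumed to
# be defined elsewhere in the project.
indexer = IndexFiles('crawl-index')
duplicates = [{'duplicate': 'http://example.com/copy', 'sim': 1.0}]
num_docs = indexer.index('crawl/pages.jsonl', duplicates)
print "%d docs in index" % num_docs
indexer.close()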