def create_index(directory, analyzer, documents_to_index):
    # Add the given Lucene documents to an index in `directory` using `analyzer`
    config = index.IndexWriterConfig(analyzer)
    index_writer = index.IndexWriter(directory, config)
    for doc in documents_to_index:
        index_writer.addDocument(doc)
    index_writer.close()
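# A minimal usage sketch for create_index, kept as an unused helper so it does not
# run on import. The index path, field name, and text below are illustrative
# assumptions, not values used elsewhere in this module; it also assumes the
# PyLucene JVM has already been started with lucene.initVM() once per process.
def _example_create_index_usage():
    from java.nio.file import Paths
    from org.apache.lucene import analysis, document, store

    example_analyzer = analysis.standard.StandardAnalyzer()
    example_directory = store.SimpleFSDirectory(Paths.get("/tmp/example.index"))
    example_doc = document.Document()
    # TextField is tokenized and (with Store.YES) also stored, which is enough for a smoke test
    example_doc.add(document.TextField("body", "hello lucene", document.Field.Store.YES))
    create_index(example_directory, example_analyzer, [example_doc])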
def create_index_for_wiki_sentence(filename, path, firstTime=False):
    logging.info('Start creating wiki_sentence index!')
    wiki_dict = get_wiki_data(path)
    logging.info('Start creating index!')
    # the filename argument is overridden; this index always uses the '_wiki_sentence' suffix
    filename = '_wiki_sentence'
    analyzer = analysis.standard.StandardAnalyzer()

    # Store the index on disk under HOMEPATH/IndexFiles_wiki_sentence.index
    base_dir = HOMEPATH
    INDEX_DIR = "IndexFiles" + filename + ".index"
    storeDir = os.path.join(base_dir, INDEX_DIR)
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    directory = SimpleFSDirectory(Paths.get(storeDir))

    if firstTime:
        config = index.IndexWriterConfig(analyzer)
        iwriter = index.IndexWriter(directory, config)
        for cnt, key in enumerate(wiki_dict.keys()):
            if cnt % 1000 == 0:
                logging.info('Preprocessed {} entries while creating the index by document.'.format(cnt))
            org_title = key[0]
            preprocessed_title = key[1]
            doc_id = key[2]
            sentence = wiki_dict[key]
            doc = create_document_by_document_sentence(org_title, preprocessed_title, doc_id, sentence)
            iwriter.addDocument(doc)
        iwriter.close()
    logging.info('Finished creating wiki_sentence index!')
    return directory
def createIndex_Stem_Lemma_SpacyIndex(self):
    print("In create index method")
    spacy_file = self.directory + "wiki_spacy_lemma_pos.json"

    # English analyzer (includes stemming) with classic tf-idf similarity
    my_analyzer = analysis.en.EnglishAnalyzer()
    my_config = index.IndexWriterConfig(my_analyzer)
    my_config.setSimilarity(ClassicSimilarity())
    my_writer = index.IndexWriter(self.in_directory_English_lemma, my_config)

    # Title field: stored, tokenized, with frequencies and positions
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # Body (lemma) field: stored, tokenized; frequencies and positions are kept for ranking
    t2 = FieldType()
    t2.setStored(True)
    t2.setTokenized(True)
    t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # Categories field: stored, tokenized; only document ids are indexed (no frequencies/positions)
    t3 = FieldType()
    t3.setStored(True)
    t3.setTokenized(True)
    t3.setIndexOptions(IndexOptions.DOCS)

    # Body POS field: stored, tokenized, with frequencies and positions
    t4 = FieldType()
    t4.setStored(True)
    t4.setTokenized(True)
    t4.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    nDocsAdded = 0
    entries = self.readJSONFromDisk(spacy_file)
    print("Len of file is", len(entries))
    for entry in entries:
        title = entry[0]
        lemma = entry[1]
        category = entry[2]
        pos = entry[3]
        doc = Document()
        doc.add(Field(self.TITLE, title, t1))
        doc.add(Field(self.TEXT, lemma, t2))
        doc.add(Field("Categories", category, t3))
        doc.add(Field("POS", pos, t4))
        my_writer.addDocument(doc)
        nDocsAdded += 1

    # commit and close so the documents are safely persisted in the index directory
    my_writer.commit()
    my_writer.close()
    print("Indexed %d documents with spacy." % nDocsAdded)
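# Layout assumed for wiki_spacy_lemma_pos.json, inferred from the per-entry unpacking
# in createIndex_Stem_Lemma_SpacyIndex above (the values shown are made up):
#
#   [
#     ["Some article title", "lemmatized body text ...", "first category, second category", "DT NN VBZ ..."],
#     ...
#   ]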
def createIndex_simple(self, input_files):
    # per-document parsing state
    cur_title = ""
    cur_body = ""
    cur_category = []
    file_counter = 0
    ip_file_counter = 1

    # Initialize standard analyzer & index writer
    my_analyzer = analysis.standard.StandardAnalyzer()
    my_config = index.IndexWriterConfig(my_analyzer)
    # ClassicSimilarity (tf-idf) is left disabled here, so the default similarity is used
    # my_config.setSimilarity(ClassicSimilarity())
    my_writer = index.IndexWriter(self.in_directory, my_config)

    # Title field: stored, tokenized, with frequencies and positions
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # Body field: stored, tokenized; frequencies and positions are kept for ranking
    t2 = FieldType()
    t2.setStored(True)
    t2.setTokenized(True)
    t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # Categories field: stored, tokenized; only document ids are indexed
    t3 = FieldType()
    t3.setStored(True)
    t3.setTokenized(True)
    t3.setIndexOptions(IndexOptions.DOCS)

    nDocsAdded = 0
    print("List of input files is", input_files)
    for input_file in input_files:
        with open(input_file, 'r', encoding='utf8') as f1:
            # Assumes each input file contains documents separated by titles denoted by [[xxx]]
            my_line = f1.readline()
            while my_line:
                if my_line.startswith("[[") and my_line.rstrip().endswith("]]"):
                    if cur_title != "":
                        # add the previous document to the index when the next title is reached
                        doc = Document()
                        doc.add(Field(self.TITLE, cur_title, t1))
                        doc.add(Field(self.TEXT, cur_body, t2))
                        doc.add(Field("Categories", self.listToString(cur_category), t3))
                        my_writer.addDocument(doc)
                        # increment counters and reset document variables
                        nDocsAdded += 1
                        cur_title = ""
                        cur_body = ""
                        cur_category = []
                        file_counter += 1
                    # store current title
                    cur_title = my_line[2:-3]
                elif my_line.startswith("CATEGORIES:"):
                    # categories are on a line starting with CATEGORIES: and are separated by ", "
                    cur_category = my_line[11:].strip().split(", ")
                else:
                    # everything else belongs to the body of the current document
                    cur_body += my_line
                # read next line
                my_line = f1.readline()
            file_counter += 1
            print("File counter", file_counter)
            # on EOF save the last document of this file to the index
            doc = Document()
            doc.add(Field(self.TITLE, cur_title, t1))
            doc.add(Field(self.TEXT, cur_body, t2))
            doc.add(Field("Categories", self.listToString(cur_category), t3))
            my_writer.addDocument(doc)
            nDocsAdded += 1
            cur_title = ""
            cur_body = ""
            cur_category = []
        ip_file_counter += 1

    # commit and close so the documents are safely persisted in the index directory
    my_writer.commit()
    my_writer.close()
    print("Indexed %d documents." % nDocsAdded)
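# Plain-text input format assumed by createIndex_simple above and createIndex_Stem below,
# inferred from the parsing loop (the article shown is a made-up illustration):
#
#   [[Example Title]]
#   CATEGORIES: first category, second category
#   First line of the article body.
#   More body text, up to the next [[...]] title line or end of file.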
def createIndex_Stem(self, input_files):
    # per-document parsing state
    cur_title = ""
    cur_body = ""
    cur_category = []
    file_counter = 0
    ip_file_counter = 1

    # Initialize English (stemming) analyzer & index writer with classic tf-idf similarity
    my_analyzer = analysis.en.EnglishAnalyzer()
    my_config = index.IndexWriterConfig(my_analyzer)
    my_config.setSimilarity(ClassicSimilarity())
    my_writer = index.IndexWriter(self.in_directory_English, my_config)

    # Title field: stored, tokenized, with frequencies and positions
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # Body field: stored, tokenized; frequencies and positions are kept for ranking
    t2 = FieldType()
    t2.setStored(True)
    t2.setTokenized(True)
    t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # Categories field: stored, tokenized; only document ids are indexed
    t3 = FieldType()
    t3.setStored(True)
    t3.setTokenized(True)
    t3.setIndexOptions(IndexOptions.DOCS)

    nDocsAdded = 0
    print("List of input files is", input_files)
    for input_file in input_files:
        with open(input_file, 'r', encoding='utf8') as f1:
            # Assumes each input file contains documents separated by titles denoted by [[xxx]]
            my_line = f1.readline()
            while my_line:
                if my_line.startswith("[[") and my_line.rstrip().endswith("]]"):
                    if cur_title != "":
                        # add the previous document to the index when the next title is reached
                        doc = Document()
                        doc.add(Field(self.TITLE, cur_title, t1))
                        doc.add(Field(self.TEXT, cur_body, t2))
                        doc.add(Field("Categories", self.listToString(cur_category), t3))
                        my_writer.addDocument(doc)
                        nDocsAdded += 1
                        cur_body = ""
                        cur_category = []
                        file_counter += 1
                    cur_title = my_line[2:-3]
                elif my_line.startswith("CATEGORIES:"):
                    cur_category = my_line[11:].strip().split(", ")
                else:
                    cur_body += my_line
                my_line = f1.readline()
            file_counter += 1
            # on EOF save the last document of this file to the index
            doc = Document()
            doc.add(Field(self.TITLE, cur_title, t1))
            doc.add(Field(self.TEXT, cur_body, t2))
            doc.add(Field("Categories", self.listToString(cur_category), t3))
            my_writer.addDocument(doc)
            nDocsAdded += 1
            cur_title = ""
            cur_body = ""
            cur_category = []
        ip_file_counter += 1

    my_writer.commit()
    my_writer.close()
    print("Indexed %d documents." % nDocsAdded)
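# A small verification sketch, not part of the original class: after one of the
# writers above has committed, the same directory can be reopened to confirm the
# document count. The directory argument is whatever FSDirectory the writer used
# (e.g. self.in_directory); DirectoryReader.open and numDocs are standard Lucene APIs.
def _example_count_indexed_docs(directory):
    from org.apache.lucene import index

    reader = index.DirectoryReader.open(directory)
    try:
        return reader.numDocs()
    finally:
        reader.close()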