def build_index(in_dir, out_dict, out_postings):
    """
    Build index from documents stored in the input directory,
    then output the dictionary file and postings file
    """
    print('Indexing...')
    stemmer = PorterStemmer()
    dictionaries = Dictionaries(out_dict)
    postings = Postings(out_postings)
    offset = 1
    for docID in os.listdir(in_dir):
        f = open(f'{in_dir}/{docID}', 'r')
        content_tokens = word_tokenize(f.read())
        f.close()
        for word in content_tokens:
            term = stemmer.stem(word=word).lower()
            if dictionaries.has_term(term):
                old_offset = dictionaries.get_offset(term)
                postings.add_docId_to_offset(old_offset, docID)
            else:
                dictionaries.add_term(term, offset)
                postings.add_doc_id(offset)
                postings.add_docId_to_offset(offset, docID)
                offset += 1
            dictionaries.increment_frequency(term)
    postings.save_to_file(dictionaries)
    dictionaries.save_to_file()
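# A minimal usage sketch for the indexer above. The directory and file names
# are hypothetical placeholders, and Dictionaries, Postings and PorterStemmer
# are assumed to be importable from the surrounding project:
#
#     build_index('training_data', 'dictionary.txt', 'postings.txt')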
def build_index(training_data_dir, dictionary_file, postings_file, is_debug):
    training_files = sorted(os.listdir(training_data_dir), key=lambda x: int(x))
    if is_debug:
        training_files = training_files[:DEBUG_LIMIT]

    dictionary = Dictionary(dictionary_file)
    postings = Postings(postings_file)

    for training_file in training_files:
        doc_id = int(training_file)
        doc_path = osp.join(training_data_dir, training_file)
        postings.not_list().add(doc_id)
        add_doc_to_index(doc_id, doc_path, dictionary, postings)

    postings.save()

    # turn line nos to byte offsets
    f = open(postings_file)
    current_line = 1
    f.readline()  # skip postings list containing all doc ids
    while True:
        term = dictionary.term_for_offset(current_line)
        dictionary.add_term(term, f.tell())
        line = f.readline()
        if not line:
            break
        current_line += 1

    dictionary.save()
def build_index(training_data_dir, dictionary_file, postings_file, is_debug):
    training_files = sorted(os.listdir(training_data_dir), key=lambda x: x)
    if is_debug:
        training_files = training_files[:DEBUG_LIMIT]

    dictionary = Dictionary(dictionary_file)
    postings = Postings(postings_file)

    for training_file in training_files:
        doc_id = training_file
        doc_path = osp.join(training_data_dir, training_file)
        add_doc_to_index(doc_id, doc_path, dictionary, postings)

    postings.save()

    # turn line nos to byte offsets
    f = open(postings_file)
    current_line = 0
    while True:
        term = dictionary.term_for_offset(current_line)
        dictionary.add_term(term, f.tell(), update_freq=False)
        line = f.readline()
        if not line:
            break
        current_line += 1

    dictionary.generate_idf(len(training_files))
    dictionary.save()
def build_index(directory, dictionary_file, postings_file):
    files = os.listdir(directory)
    dictionary = Dictionary(dictionary_file)
    postings = Postings(postings_file)
    stemmer = nltk.stem.porter.PorterStemmer()
    last = ''
    for doc_id in files:
        tf_list = {}
        line_number = 1
        offset = 0
        # Use linecache to get line
        line = linecache.getline(os.path.join(directory, doc_id), line_number)
        while line != '':
            # tokenize lines into sentences
            sentences = nltk.sent_tokenize(line)
            for sentence in sentences:
                # tokenize sentence
                tokens = nltk.word_tokenize(sentence)
                for token in tokens:
                    # apply stemming and case folding
                    stemmed_token = stemmer.stem(token).lower()
                    # if term already exists in dictionary, we find row number
                    if dictionary.has_term(stemmed_token):
                        offset = dictionary.get_offset(stemmed_token)
                        # If postings for that term already has doc id,
                        # then increment tf,
                        # Else increment df and add the doc id
                        if postings.has_doc_id(doc_id, offset):
                            postings.increment_tf(doc_id, offset)
                        else:
                            dictionary.increment_df(stemmed_token)
                            postings.add_doc_id(doc_id, offset)
                    # else, we add it to dictionary and postings
                    else:
                        offset = postings.add_new_term()
                        postings.add_doc_id(doc_id, offset)
                        dictionary.add_new_term(stemmed_token, offset)
                    # Keep track of tf values of all terms in doc
                    if stemmed_token in tf_list:
                        tf_list[stemmed_token] += 1
                    else:
                        tf_list[stemmed_token] = 1
            line_number += 1
            line = linecache.getline(os.path.join(directory, doc_id), line_number)
        # Store doc length
        dictionary.add_doc_length(doc_id, tf_list.values())
    # save data
    postings.save(dictionary)
    dictionary.save()
def __init__(self, company_id="demo"): self.graph_id = company_id self.orphans = set() self.orphan_list = list() self.node_map = dict() self.node_id_map = dict() self.db_info = dict() self.search_postings = Postings() # get utils for intents. with open(os.path.realpath("chatbot/intentUtils.json")) as data_file: self.graph_utils = json.load(data_file) # replace escaped slashes by a single slash. for class_string in self.graph_utils["class"]: if type(self.graph_utils["class"][class_string]) is str: self.graph_utils["class"][class_string] = self.graph_utils[ "class"][class_string].replace('\\\\', '\\')
def build_index(in_dir, out_dict, out_postings):
    """
    Build index from documents stored in the input directory,
    then output the dictionary file and postings file
    """
    print('Indexing...')
    stemmer = PorterStemmer()
    dictionaries = Dictionaries(out_dict)
    postings = Postings(out_postings)
    offset = 1
    count = len(os.listdir(in_dir))
    for docID in os.listdir(in_dir):
        f = open(f'{in_dir}/{docID}', 'r')
        content = f.read()
        f.close()
        sentences = sent_tokenize(content)
        doc_terms = []
        for sentence in sentences:
            for word in word_tokenize(sentence):
                term = stemmer.stem(word=word.lower())
                doc_terms.append(term)
        # Calculate weighted term frequencies for each term
        weighted_term_freqs = [(x[0], get_term_frequency_weight(x[1]))
                               for x in Counter(doc_terms).most_common()]
        # Calculate document vector length
        doc_length = math.sqrt(
            sum(map(lambda x: x[1] * x[1], weighted_term_freqs)))
        for term, normalised_tf in weighted_term_freqs:
            if dictionaries.has_term(term):
                old_offset = dictionaries.get_offset(term)
                postings.add_docId_tf_to_offset(old_offset, docID,
                                                normalised_tf / doc_length)
            else:
                dictionaries.add_term(term, offset)
                postings.add_doc_id(offset)
                postings.add_docId_tf_to_offset(offset, docID,
                                                normalised_tf / doc_length)
                offset += 1
    postings.save_to_file(dictionaries, count)
    dictionaries.save_to_file()
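# Usage sketch for the tf-weighted variant above. The paths are hypothetical
# placeholders; get_term_frequency_weight is a project helper whose definition
# is not shown here:
#
#     build_index('training_data', 'dictionary.txt', 'postings.txt')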
def run_search(dict_file, postings_file, queries_file, results_file):
    """
    Using the given dictionary file and postings file,
    perform searching on the given queries file and output the results to a file
    """
    print('Running search on the queries...')
    dictionaries = Dictionaries(dict_file)
    dictionaries.load()
    postings = Postings(postings_file)
    searcher = Searcher(dictionaries, postings)
    result_string = ''
    with open(queries_file, 'r') as f, open(results_file, 'w') as o:
        for i, query in enumerate(f):
            searcher.set_query(query.strip())
            output = searcher.evaluate_query()
            result_string += output.strip() + '\n'
            searcher.clear_postings()
        # the with-statement closes both files, so no explicit close() is needed
        o.write(result_string.strip())
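# Usage sketch for the search driver above, assuming the dictionary and
# postings files were produced by the matching build_index; the file names
# are hypothetical placeholders:
#
#     run_search('dictionary.txt', 'postings.txt', 'queries.txt', 'output.txt')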
def build_index(directory, dictionary_file, postings_file):
    files = os.listdir(directory)
    dictionary = Dictionary(dictionary_file)
    postings = Postings(postings_file)
    stemmer = nltk.stem.porter.PorterStemmer()
    last = ''
    for doc_id in files:
        postings.add_doc(doc_id)
        line_number = 1
        # Use linecache to get line
        line = linecache.getline(os.path.join(directory, doc_id), line_number)
        while line != '':
            # tokenize lines into sentences
            sentences = nltk.sent_tokenize(line)
            for sentence in sentences:
                # tokenize sentence
                tokens = nltk.word_tokenize(sentence)
                for token in tokens:
                    # apply stemming and case folding
                    stemmed_token = stemmer.stem(token).lower()
                    # if term already exists in dictionary, we find its row number
                    if dictionary.has_term(stemmed_token):
                        offset = dictionary.get_offset(stemmed_token)
                        result = postings.add_doc_id(doc_id, offset)
                        # Result indicates if the doc id is new
                        if result:
                            dictionary.increment_frequency(stemmed_token)
                    # else, we add it to dictionary and postings
                    else:
                        offset = postings.add_new_term()
                        postings.add_doc_id(doc_id, offset)
                        dictionary.add_new_term(stemmed_token, offset)
            line_number += 1
            line = linecache.getline(os.path.join(directory, doc_id), line_number)
    # save data
    postings.save(dictionary)
    dictionary.save()
def __init__(self, dictionary_file, postings_file):
    self.dictionary = Dictionary(dictionary_file)
    self.postings = Postings(postings_file)
    self.dictionary.load()
    self.all_docs = self.postings.load_list(0)
def build_extraction_postings(self, db_object, redis_object, extraction_indices):
    if self.db_info and self.db_info.get("mappings"):
        map_names = self.db_info.get("mappings", []) or []
        for map_name in map_names:
            # initializations
            mapping = dict()
            postings_object = Postings()
            # skip if entry for the map exists in redis
            map_value = redis_object.get(map_name)
            # in case the entry has not been populated before
            if not map_value:
                # get mapping from DB
                mapping = db_object["mappings"].find_one({"name": map_name})
                mapping.pop("_id")
                # store mapping in Redis
                redis_object.set(map_name, json.dumps(mapping))
            else:
                mapping = json.loads(map_value)
            entries = mapping.get("map")
            tokenized_entries = []
            fields_to_index = mapping.get("toIndex")
            # build postings
            for i, entry in enumerate(entries):
                # use active entries
                if entry.get("active"):
                    # merge all texts
                    stripped_text = utils.remove_non_alpha_num_chars(
                        " ".join(
                            filter(
                                lambda x: bool(x),
                                reduce(lambda x, y: x + y, [
                                    entry.get(field, []) or []
                                    if type(entry.get(field, []) or []) == list
                                    else [str(entry[field])]
                                    for field in fields_to_index
                                ], []))))[0]
                    # generate tokens
                    # (relies on Python 2's eager map(), so the lambda's
                    # side effects actually run)
                    if stripped_text:
                        map(
                            lambda x: postings_object.add_document_for_token(x, i),
                            set(utils.lemmatize_text(stripped_text.lower())))
                    if not map_value:
                        # construct tokens for all constituents of the entry
                        # and store in redis if not already there
                        tokenized_elements = map(
                            lambda x: sorted(
                                utils.lemmatize_text(
                                    utils.remove_non_alpha_num_chars(x)[0])),
                            filter(
                                lambda x: bool(x),
                                reduce(lambda x, y: x + y, [
                                    entry.get(field, []) or []
                                    if type(entry.get(field, []) or []) == list
                                    else [str(entry[field])]
                                    for field in fields_to_index
                                ], [])))
                        tokenized_entries.append(tokenized_elements)
                else:
                    if not map_value:
                        tokenized_entries.append(None)
            extraction_indices[map_name] = postings_object
            if not map_value:
                # set tokenized mappings in redis if not already there
                redis_object.set("tokenized" + map_name,
                                 json.dumps(tokenized_entries))
def __init__(self, fd, fp):
    self.dictionary = Dictionary(fd, load=True)
    self.postings = Postings(fp, mode='r')