import os
import os.path as osp


def build_index(training_data_dir, dictionary_file, postings_file, is_debug):
    training_files = sorted(os.listdir(training_data_dir))
    if is_debug:
        training_files = training_files[:DEBUG_LIMIT]

    dictionary = Dictionary(dictionary_file)
    postings = Postings(postings_file)
    for training_file in training_files:
        doc_id = training_file
        doc_path = osp.join(training_data_dir, training_file)
        add_doc_to_index(doc_id, doc_path, dictionary, postings)
    postings.save()

    # Second pass: turn line numbers into byte offsets so the dictionary can
    # seek directly to a term's postings line at query time.
    with open(postings_file) as f:
        current_line = 0
        while True:
            offset = f.tell()
            if not f.readline():
                break
            term = dictionary.term_for_offset(current_line)
            dictionary.add_term(term, offset, update_freq=False)
            current_line += 1

    dictionary.generate_idf(len(training_files))
    dictionary.save()
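# A small self-contained sketch (hypothetical helper, not from the source) of
# the pattern the second pass above relies on: recording f.tell() at the start
# of each line lets a reader later seek straight to line N instead of scanning
# the whole postings file.
def line_offsets(path):
    """Return a dict mapping each line number of `path` to its byte offset."""
    offsets = {}
    with open(path) as f:
        line_no = 0
        while True:
            pos = f.tell()
            if not f.readline():
                break
            offsets[line_no] = pos
            line_no += 1
    return offsets

# Usage: jump straight to the postings line stored at line 42.
# with open(postings_path) as f:
#     f.seek(line_offsets(postings_path)[42])
#     postings_line = f.readline()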
def test_dictionary_has_entry():
    d = Dictionary()
    assert not d.has_entry('asdf', 1)
    d.add_term('asdf', 1, 10)
    assert d.has_entry('asdf', 1)
    assert not d.has_entry('qwer', 1)
import os
import os.path as osp


def build_index(training_data_dir, dictionary_file, postings_file, is_debug):
    training_files = sorted(os.listdir(training_data_dir), key=int)
    if is_debug:
        training_files = training_files[:DEBUG_LIMIT]

    dictionary = Dictionary(dictionary_file)
    postings = Postings(postings_file)
    for training_file in training_files:
        doc_id = int(training_file)
        doc_path = osp.join(training_data_dir, training_file)
        postings.not_list().add(doc_id)
        add_doc_to_index(doc_id, doc_path, dictionary, postings)
    postings.save()

    # Second pass: turn line numbers into byte offsets so the dictionary can
    # seek directly to a term's postings line at query time.
    with open(postings_file) as f:
        f.readline()  # skip the first line: the postings list of all doc ids
        current_line = 1
        while True:
            offset = f.tell()
            if not f.readline():
                break
            term = dictionary.term_for_offset(current_line)
            dictionary.add_term(term, offset)
            current_line += 1

    dictionary.save()
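# Why the variant above also records every doc_id (the "not list"): boolean
# NOT can then be answered by set difference against that universe. A minimal
# sketch with plain Python sets; the function name is illustrative only.
def evaluate_not(term_postings, all_doc_ids):
    """NOT term == every document minus the documents containing the term."""
    return sorted(set(all_doc_ids) - set(term_postings))

# e.g. evaluate_not([2, 5], [1, 2, 3, 4, 5]) -> [1, 3, 4]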
import os
from collections import Counter
from os.path import basename, isfile, join


def build_index(dir_of_docs, dict_file, postings_file):
    docs = [
        f for f in os.listdir(dir_of_docs)
        if isfile(join(dir_of_docs, f)) and f.isdigit()
    ]
    sorted_doc_ids = sorted(docs, key=lambda x: int(basename(x)))

    dictionary = Dictionary()
    with PostingFile(postings_file, 'w+') as p_file:
        for doc_id in sorted_doc_ids:
            doc_path = join(dir_of_docs, doc_id)
            terms = process_file(doc_path)
            counter = Counter(terms)
            print("Indexing document " + str(doc_id) + "...")

            doc_id = int(doc_id)
            document_vector = {}
            for term, freq in counter.items():
                # New entries are always appended, so seek to the end first.
                p_file.file_obj.seek(0, os.SEEK_END)
                curr_ptr = p_file.file_obj.tell()
                if dictionary.has_term(term):
                    # Overwrite the term's previous posting entry so it points
                    # to the entry we are about to append.
                    prev_entry_ptr = dictionary.end_ptr_hash[term]
                    prev_entry = p_file.read_posting_entry(prev_entry_ptr)
                    p_file.write_posting_entry(prev_entry.doc_id,
                                               prev_entry.term_freq,
                                               curr_ptr,
                                               overwrite_pos=prev_entry_ptr)

                # Write the new entry to the end of the postings file.
                p_file.write_posting_entry(doc_id, freq)
                dictionary.add_term(term, doc_id, curr_ptr)

                # Build the document vector.
                document_vector[term] = calculate_tf_wt(freq)

            # Save the document length into the dictionary.
            document_length = calculate_document_length(document_vector)
            dictionary.doc_id_length_hash[doc_id] = document_length

    # Save dictionary to file.
    dictionary.save_dict_to_file(dict_file)
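# A hedged sketch of the two helpers assumed above (the source's own
# calculate_tf_wt and calculate_document_length may differ): standard
# logarithmic tf weighting and the Euclidean norm used for cosine
# normalization.
import math

def calculate_tf_wt(freq):
    """Log tf weighting: 1 + log10(tf) for tf > 0, else 0."""
    return 1 + math.log10(freq) if freq > 0 else 0.0

def calculate_document_length(document_vector):
    """Euclidean length of the weighted document vector."""
    return math.sqrt(sum(w * w for w in document_vector.values()))

# e.g. calculate_document_length({'a': calculate_tf_wt(10)}) -> 2.0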
import os


def build(training_dir, dict_file, postings_file):
    dictionary = Dictionary()

    # Read each file in the training dir.
    filepaths = []
    for filename in os.listdir(training_dir):
        filepaths.append(os.path.join(training_dir, filename))

    # Sort the filepaths according to doc_id.
    filepaths = sorted(filepaths, key=lambda x: int(os.path.basename(x)))

    # NOTE(michael): The list is built before the indexing loop so it can be
    # trimmed when testing, e.g. filepaths = filepaths[:10]

    with PostingsFile(postings_file, mode='w+',
                      entry_cls=PostingsFileEntryWithFrequencies) as postings:
        for filepath in filepaths:
            # TODO(michael): Making the assumption that doc names are ints.
            doc_id = int(os.path.basename(filepath))
            terms = process_file(filepath)
            for term in terms:
                # Create a postings file entry if none exists for the
                # `(term, doc_id)` pair.
                if not dictionary.has_entry(term, doc_id):
                    # Update the previous entry for the current term to point
                    # to the entry we are about to add.
                    if dictionary.get_frequency(term) != 0:
                        previous_node_location = dictionary.get_tail(term)
                        previous_entry = \
                            postings.get_entry(previous_node_location)
                        previous_entry.next_pointer = postings.pointer
                        postings.write_entry(previous_entry)

                    # Add a new postings file entry for the `(term, doc_id)` pair.
                    dictionary.add_term(term, doc_id, postings.pointer)
                    new_entry = PostingsFileEntryWithFrequencies(doc_id)
                    postings.write_entry(new_entry)

                # Increment the entry's term frequency.
                # NOTE(michael): We can safely use the tail pointer since we
                # process documents in order and not at random.
                current_term_location = dictionary.get_tail(term)
                current_term_entry = postings.get_entry(current_term_location)
                current_term_entry.term_freq += 1
                postings.write_entry(current_term_entry)

    # Write dictionary to file.
    with open(dict_file, 'w') as dictionary_file:
        dictionary_file.write(dictionary.to_json())
def test_dictionary_add_term_pointers():
    d = Dictionary()

    first_pointer = 0
    d.add_term('asdf', 1, first_pointer)
    assert_eq(1, d.get_frequency('asdf'))
    assert_eq(first_pointer, d.get_head('asdf'))
    assert_eq(first_pointer, d.get_tail('asdf'))

    second_pointer = 10
    d.add_term('asdf', 2, second_pointer)
    assert_eq(2, d.get_frequency('asdf'))
    assert_eq(first_pointer, d.get_head('asdf'))
    assert_eq(second_pointer, d.get_tail('asdf'))
def test_dictionary_add_term():
    d = Dictionary()

    first_pointer = 10
    d.add_term('asdf', 1, first_pointer)
    assert_eq(1, d.get_frequency('asdf'))
    assert_eq(first_pointer, d.get_head('asdf'))
    assert_eq(first_pointer, d.get_tail('asdf'))

    next_pointer = 20
    d.add_term('asdf', 2, next_pointer)
    assert_eq(2, d.get_frequency('asdf'))
    assert_eq(first_pointer, d.get_head('asdf'))
    assert_eq(next_pointer, d.get_tail('asdf'))

    third_pointer = 30
    d.add_term('qwer', 2, third_pointer)
    assert_eq(1, d.get_frequency('qwer'))
    assert_eq(third_pointer, d.get_head('qwer'))
    assert_eq(third_pointer, d.get_tail('qwer'))

    # Re-adding an existing `(term, doc_id)` pair is a no-op.
    fourth_pointer = 40
    d.add_term('asdf', 2, fourth_pointer)
    assert_eq(2, d.get_frequency('asdf'))
    assert_eq(first_pointer, d.get_head('asdf'))
    assert_eq(next_pointer, d.get_tail('asdf'))
def test_dictionary_to_json_from_json():
    d = Dictionary()
    d.add_term('asdf', 1, 1)
    d.add_term('asdf', 2, 1)
    d.add_term('qwer', 1, 1)
    d.add_term('zxcv', 1, 1)

    d2 = Dictionary.from_json(d.to_json())
    assert_eq(d2.all_docs(), d.all_docs())
    assert_eq(d2.all_terms(), d.all_terms())
    for term in ['asdf', 'qwer', 'zxcv']:
        assert_eq(d2.get_frequency(term), d.get_frequency(term))
        assert_eq(d2.get_head(term), d.get_head(term))
        assert_eq(d2.get_tail(term), d.get_tail(term))
import os
import pickle


def build_index(in_dir, out_dict, out_postings):
    """
    Build an index from documents stored in the input directory, then output
    the dictionary file and postings file.
    """
    print('indexing...')
    indexing_doc_files = sorted(map(int, os.listdir(in_dir)))
    dictionary = Dictionary(out_dict)

    temp_dictionary = dict()
    temp_dictionary[ALL_DOCS] = set()

    # For each document, get its terms and add them to the temporary
    # in-memory posting lists.
    for document in indexing_doc_files:
        temp_dictionary[ALL_DOCS].add((document, 0))
        terms = util.read_document(in_dir, document)
        for term in terms:
            if term not in temp_dictionary:
                temp_dictionary[term] = set()
            temp_dictionary[term].add((document, 0))

    # Save the dictionary on disk, recording each term's byte offset in the
    # postings file.
    with open(temp_file, 'wb') as temp_posting_file:
        for token, docs_set in sorted(temp_dictionary.items()):
            offset = temp_posting_file.tell()
            dictionary.add_term(token, len(docs_set), offset)
            pickle.dump(sorted(list(docs_set)), temp_posting_file)

    # Post-processing step to add skip pointers to the postings lists.
    skip_pointer = SkipPointer("ROOT_L")
    skip_pointer.set_skip_for_posting_list(out_postings, temp_file, dictionary)
    dictionary.save()
    os.remove(temp_file)
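# Reading a posting list back, a minimal sketch mirroring the write loop
# above: each term's postings list is one pickled object stored at the byte
# offset recorded in the dictionary, so retrieval is a seek plus a single
# load. (The function name and file layout are assumptions, not the source's
# API.)
import pickle

def read_posting_list(postings_path, offset):
    """Load the pickled postings list stored at `offset`."""
    with open(postings_path, 'rb') as f:
        f.seek(offset)
        return pickle.load(f)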
def test_dictionary_all_terms():
    d = Dictionary()
    assert_eq([], d.all_terms())
    d.add_term('asdf', 1, 1)
    assert_eq(['asdf'], d.all_terms())
    d.add_term('asdf', 2, 1)
    assert_eq(['asdf'], d.all_terms())
    d.add_term('qwer', 1, 1)
    d.add_term('zxcv', 1, 1)
    assert_eq(sorted(['asdf', 'qwer', 'zxcv']), sorted(d.all_terms()))
def test_dictionary_all_docs():
    d = Dictionary()
    assert_eq([], d.all_docs())
    d.add_term('asdf', 1, 1)
    assert_eq([1], d.all_docs())
    d.add_term('asdf', 2, 1)
    assert_eq([1, 2], d.all_docs())
    d.add_term('qwer', 1, 1)
    d.add_term('zxcv', 1, 1)
    assert_eq([1, 2], d.all_docs())
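# A minimal in-memory sketch (not from the source) of the linked-list
# Dictionary the tests above assume: each term carries a document frequency
# plus head/tail pointers into the postings file, and repeated
# `(term, doc_id)` pairs are no-ops. JSON round-tripping is omitted.
class SketchDictionary(object):
    def __init__(self):
        self.terms = {}  # term -> [frequency, head_pointer, tail_pointer]
        self.docs = {}   # term -> set of doc_ids already seen

    def has_entry(self, term, doc_id):
        return term in self.docs and doc_id in self.docs[term]

    def add_term(self, term, doc_id, pointer):
        if self.has_entry(term, doc_id):
            return  # repeated (term, doc_id) pairs leave the entry untouched
        self.docs.setdefault(term, set()).add(doc_id)
        if term not in self.terms:
            self.terms[term] = [1, pointer, pointer]
        else:
            entry = self.terms[term]
            entry[0] += 1       # document frequency
            entry[2] = pointer  # tail advances; head stays put

    def get_frequency(self, term):
        return self.terms[term][0] if term in self.terms else 0

    def get_head(self, term):
        return self.terms[term][1]

    def get_tail(self, term):
        return self.terms[term][2]

    def all_terms(self):
        return list(self.terms.keys())

    def all_docs(self):
        return sorted(set().union(*self.docs.values())) if self.docs else []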
import math
import os


def build(training_dir, dict_file, postings_file):
    dictionary = Dictionary()

    # Read each file in the training dir.
    filepaths = []
    for filename in os.listdir(training_dir):
        filepaths.append(os.path.join(training_dir, filename))

    # Sort the filepaths according to doc_id.
    filepaths = sorted(filepaths, key=lambda x: int(os.path.basename(x)))

    # NOTE(michael): The list is built before the indexing loop so it can be
    # trimmed when testing, e.g. filepaths = filepaths[:10]

    with PostingsFile(postings_file, mode='w+') as postings:
        for filepath in filepaths:
            terms = process_file(filepath)
            # TODO(michael): Making the assumption that doc names are ints.
            doc_id = int(os.path.basename(filepath))
            for term in terms:
                if not dictionary.has_entry(term, doc_id):
                    current_node_location = postings.pointer
                    if dictionary.get_frequency(term) != 0:
                        # Update the previous node in the linked list.
                        previous_node_location = dictionary.get_tail(term)
                        previous_entry = \
                            postings.get_entry(previous_node_location)
                        postings.write_entry(
                            previous_entry.doc_id,
                            current_node_location,
                            write_location=previous_node_location)

                    dictionary.add_term(term, doc_id, current_node_location)
                    postings.write_entry(
                        doc_id, write_location=current_node_location)

        # Add skip pointers to each term's postings list.
        for term in dictionary.all_terms():
            term_frequency = dictionary.get_frequency(term)
            skip_pointer_frequency = int(math.sqrt(term_frequency))

            # Don't bother if the list is too short.
            if skip_pointer_frequency < SKIP_POINTER_THRESHOLD:
                continue

            head = dictionary.get_head(term)
            entries = postings.get_entry_list_from_pointer(head)
            for idx in range(term_frequency):
                if idx % skip_pointer_frequency == 0:
                    skip_to = idx + skip_pointer_frequency

                    # Nothing to point to.
                    if skip_to >= term_frequency:
                        continue

                    current_entry = entries[idx]
                    skip_to_entry = entries[skip_to]

                    # Add the skip pointer.
                    postings.write_entry(
                        current_entry.doc_id,
                        current_entry.next_pointer,
                        skip_to_entry.own_pointer,
                        skip_to_entry.doc_id,
                        write_location=current_entry.own_pointer)

    # Write dictionary to file.
    with open(dict_file, 'w') as dictionary_file:
        dictionary_file.write(dictionary.to_json())
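# What the skip pointers written above buy at query time: with skips every
# sqrt(n) entries, an AND merge can leap over runs of non-matching doc_ids.
# A plain-list sketch for illustration only; the real code follows the
# pointers stored in the postings file entries.
def intersect_with_skips(p1, p2, skip):
    """Intersect two sorted doc_id lists, using skips of length `skip`."""
    result, i, j = [], 0, 0
    while i < len(p1) and j < len(p2):
        if p1[i] == p2[j]:
            result.append(p1[i])
            i, j = i + 1, j + 1
        elif p1[i] < p2[j]:
            # Follow the skip pointer only when it does not overshoot.
            if i + skip < len(p1) and p1[i + skip] <= p2[j]:
                i += skip
            else:
                i += 1
        else:
            if j + skip < len(p2) and p2[j + skip] <= p1[i]:
                j += skip
            else:
                j += 1
    return result

# e.g. intersect_with_skips([1, 3, 5, 7, 9, 11], [2, 7, 11], skip=2) -> [7, 11]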
import math
import os
from os.path import basename, isfile, join


def build_index(dir_of_docs, dict_file, postings_file):
    docs = [
        f for f in os.listdir(dir_of_docs)
        if isfile(join(dir_of_docs, f)) and f.isdigit()
    ]
    print(docs)
    sorted_doc_ids = sorted(docs, key=lambda x: int(basename(x)))
    print(sorted_doc_ids)

    dictionary = Dictionary()
    with PostingFile(postings_file, 'w+') as p_file:
        for doc_id in sorted_doc_ids:
            doc_path = join(dir_of_docs, doc_id)
            terms = process_file(doc_path)
            print(terms)
            print("There are " + str(len(terms)) + " terms")

            doc_id = int(doc_id)
            for term in terms:
                # New entries are always appended, so seek to the end first.
                p_file.file_obj.seek(0, os.SEEK_END)
                curr_ptr = p_file.file_obj.tell()
                if dictionary.has_term(term):
                    # Overwrite the term's previous posting entry so it points
                    # to the entry we are about to append.
                    prev_entry_ptr = dictionary.end_ptr_hash[term]
                    prev_entry = p_file.read_posting_entry(prev_entry_ptr)
                    p_file.write_posting_entry(prev_entry.doc_id, curr_ptr,
                                               overwrite_pos=prev_entry_ptr)

                # Write the new entry to the end of the postings file.
                p_file.write_posting_entry(doc_id)
                dictionary.add_term(term, doc_id, curr_ptr)

        # Add skip pointers to each term's postings list.
        for term in dictionary.get_all_terms():
            ptr = dictionary.get_start_ptr(term)
            p_list = p_file.get_posting_list_for_ptr(ptr)
            skip_distance = int(math.sqrt(len(p_list)))
            if skip_distance < SKIP_DIST_THRESHOLD:
                continue

            for idx in range(len(p_list)):
                if idx % skip_distance != 0:
                    continue
                curr_entry = p_list[idx]
                curr_ptr = ptr if idx == 0 else p_list[idx - 1].next_ptr
                skip_idx = idx + skip_distance
                if skip_idx < len(p_list):
                    skip_entry = p_list[skip_idx]
                    skip_ptr = p_list[skip_idx - 1].next_ptr
                    p_file.write_posting_entry(curr_entry.doc_id,
                                               curr_entry.next_ptr,
                                               skip_entry.doc_id,
                                               skip_ptr,
                                               overwrite_pos=curr_ptr)

        # Check that the dictionary and postings are consistent.
        print_term_to_postings(dictionary, p_file)

    # Save dictionary to file.
    dictionary.save_dict_to_file(dict_file)