def test_dictionary_all_terms():
    d = Dictionary()
    assert_eq([], d.all_terms())

    d.add_term('asdf', 1, 1)
    assert_eq(['asdf'], d.all_terms())

    # Adding an existing term for a new doc must not duplicate the term.
    d.add_term('asdf', 2, 1)
    assert_eq(['asdf'], d.all_terms())

    d.add_term('qwer', 1, 1)
    d.add_term('zxcv', 1, 1)
    assert_eq(
        sorted(['asdf', 'qwer', 'zxcv']),
        sorted(d.all_terms()))
def test_dictionary_to_json_from_json():
    d = Dictionary()
    d.add_term('asdf', 1, 1)
    d.add_term('asdf', 2, 1)
    d.add_term('qwer', 1, 1)
    d.add_term('zxcv', 1, 1)

    # Serializing and deserializing must reproduce the same dictionary.
    d2 = Dictionary.from_json(d.to_json())
    assert_eq(d2.all_docs(), d.all_docs())
    assert_eq(d2.all_terms(), d.all_terms())
    for term in ['asdf', 'qwer', 'zxcv']:
        assert_eq(d2.get_frequency(term), d.get_frequency(term))
        assert_eq(d2.get_head(term), d.get_head(term))
        assert_eq(d2.get_tail(term), d.get_tail(term))
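
# A minimal sketch of the Dictionary interface the tests above and build()
# below exercise. This is an illustrative assumption, not the repo's actual
# implementation (to_json()/from_json() are omitted for brevity). Each term
# maps to its document frequency plus head/tail pointers into the postings
# file's on-disk linked list.
class DictionarySketch(object):
    def __init__(self):
        self._terms = {}  # term -> {'freq', 'head', 'tail', 'docs'}

    def has_entry(self, term, doc_id):
        return term in self._terms and doc_id in self._terms[term]['docs']

    def add_term(self, term, doc_id, node_location):
        # The first occurrence of a term starts a new postings list; later
        # occurrences advance the tail pointer to the newest node.
        entry = self._terms.setdefault(term, {
            'freq': 0, 'head': node_location, 'tail': node_location,
            'docs': set()})
        entry['docs'].add(doc_id)
        entry['freq'] += 1
        entry['tail'] = node_location

    def all_terms(self):
        return list(self._terms)

    def all_docs(self):
        docs = set()
        for entry in self._terms.values():
            docs.update(entry['docs'])
        return sorted(docs)

    def get_frequency(self, term):
        return self._terms[term]['freq'] if term in self._terms else 0

    def get_head(self, term):
        return self._terms[term]['head']

    def get_tail(self, term):
        return self._terms[term]['tail']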
import math
import os


def build(training_dir, dict_file, postings_file):
    dictionary = Dictionary()

    # Read each file in the training dir.
    filepaths = []
    for filename in os.listdir(training_dir):
        filepaths.append(os.path.join(training_dir, filename))

    # Sort the filepaths according to doc_id.
    filepaths = sorted(filepaths, key=lambda x: int(os.path.basename(x)))

    # Paths are collected into a list first so the corpus size can be
    # capped while testing.
    # NOTE(michael): for testing.
    # filepaths = filepaths[:10]

    with PostingsFile(postings_file, mode='w+') as postings_file:
        for filepath in filepaths:
            terms = process_file(filepath)
            # TODO(michael): Assumes the filename is an integer doc_id.
            doc_id = int(os.path.basename(filepath))
            for term in terms:
                if not dictionary.has_entry(term, doc_id):
                    current_node_location = postings_file.pointer
                    if dictionary.get_frequency(term) != 0:
                        # Link the previous tail node of the postings list
                        # to the new node.
                        previous_node_location = dictionary.get_tail(term)
                        previous_entry = \
                            postings_file.get_entry(previous_node_location)
                        postings_file.write_entry(
                            previous_entry.doc_id,
                            current_node_location,
                            write_location=previous_node_location)

                    dictionary.add_term(term, doc_id, current_node_location)
                    postings_file.write_entry(
                        doc_id, write_location=current_node_location)

        # Skip pointers: every sqrt(df) nodes, point sqrt(df) nodes ahead
        # so boolean merges can leapfrog runs of non-matching doc_ids.
        for term in dictionary.all_terms():
            term_frequency = dictionary.get_frequency(term)
            skip_pointer_frequency = int(math.sqrt(term_frequency))

            # Not worth the overhead for short postings lists.
            if skip_pointer_frequency < SKIP_POINTER_THRESHOLD:
                continue

            head = dictionary.get_head(term)
            entries = postings_file.get_entry_list_from_pointer(head)
            for idx in xrange(term_frequency):
                if idx % skip_pointer_frequency == 0:
                    skip_to = idx + skip_pointer_frequency

                    # Nothing to point to.
                    if skip_to >= term_frequency:
                        continue

                    current_entry = entries[idx]
                    skip_to_entry = entries[skip_to]

                    # Rewrite the current node with the skip pointer added.
                    postings_file.write_entry(
                        current_entry.doc_id,
                        current_entry.next_pointer,
                        skip_to_entry.own_pointer,
                        skip_to_entry.doc_id,
                        write_location=current_entry.own_pointer)

    # Write dictionary to file.
    with open(dict_file, 'w') as dictionary_file:
        dictionary_file.write(dictionary.to_json())
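
# Example invocation (a sketch; the repo's actual CLI wiring may differ).
# Assumes a 'training/' corpus directory whose filenames are integer
# doc_ids, e.g. training/1, training/2, ...
if __name__ == '__main__':
    build('training/', 'dictionary.txt', 'postings.txt')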