Example 1
def build_index(training_data_dir, dictionary_file, postings_file, is_debug):
    training_files = sorted(os.listdir(training_data_dir))
    if is_debug:
        training_files = training_files[:DEBUG_LIMIT]

    dictionary = Dictionary(dictionary_file)
    postings = Postings(postings_file)
    for training_file in training_files:
        doc_id = training_file
        doc_path = osp.join(training_data_dir, training_file)
        add_doc_to_index(doc_id, doc_path, dictionary, postings)
    postings.save()

    # Second pass: convert each line number in the dictionary into the
    # byte offset at which that line starts in the postings file.
    with open(postings_file) as f:
        current_line = 0
        while True:
            offset = f.tell()
            if not f.readline():
                break
            term = dictionary.term_for_offset(current_line)
            dictionary.add_term(term, offset, update_freq=False)
            current_line += 1
    dictionary.generate_idf(len(training_files))
    dictionary.save()
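The second pass above converts the line numbers stored in the dictionary into byte offsets, so that the search phase can seek() straight to a term's postings line. A minimal, self-contained sketch of that idea (file name and contents are illustrative):

with open('postings.txt', 'w') as out:
    out.write('1 4 7\n2 3\n5 6 8\n')

# Record the byte offset at which each line starts.
offsets = []
with open('postings.txt') as f:
    while True:
        pos = f.tell()
        if not f.readline():
            break
        offsets.append(pos)

# A reader can now jump straight to any line without scanning.
with open('postings.txt') as f:
    f.seek(offsets[2])
    assert f.readline() == '5 6 8\n'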
Example 2
def test_dictionary_has_entry():
    d = Dictionary()
    assert not d.has_entry('asdf', 1)

    d.add_term('asdf', 1, 10)
    assert d.has_entry('asdf', 1)
    assert not d.has_entry('qwer', 1)
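This test pins down the contract the indexing code relies on: has_entry(term, doc_id) is true only for pairs previously registered with add_term. A hypothetical minimal Dictionary consistent with just this test (not the project's actual class):

class Dictionary(object):
    def __init__(self):
        self._entries = set()

    def add_term(self, term, doc_id, pointer=None):
        # Register the (term, doc_id) pair; the pointer is ignored here.
        self._entries.add((term, doc_id))

    def has_entry(self, term, doc_id):
        return (term, doc_id) in self._entries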
Example 3
def build_index(training_data_dir, dictionary_file, postings_file, is_debug):
    training_files = sorted(os.listdir(training_data_dir),
                            key=lambda x: int(x))
    if is_debug:
        training_files = training_files[:DEBUG_LIMIT]

    dictionary = Dictionary(dictionary_file)
    postings = Postings(postings_file)
    for training_file in training_files:
        doc_id = int(training_file)
        doc_path = osp.join(training_data_dir, training_file)
        postings.not_list().add(doc_id)
        add_doc_to_index(doc_id, doc_path, dictionary, postings)
    postings.save()

    # Second pass: convert each line number in the dictionary into the
    # byte offset at which that line starts in the postings file.
    with open(postings_file) as f:
        f.readline()  # skip the postings list containing all doc ids
        current_line = 1
        while True:
            offset = f.tell()
            if not f.readline():
                break
            term = dictionary.term_for_offset(current_line)
            dictionary.add_term(term, offset)
            current_line += 1
    dictionary.save()
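Compared with Example 1, this variant also maintains a postings list holding every doc id (the not_list, written as the first line of the postings file). Presumably that list exists so boolean NOT queries can be answered as a set difference against the universe of documents; a toy illustration with made-up values:

all_doc_ids = {1, 2, 3, 4, 5}        # the "not list": every indexed doc
postings_for_term = {2, 4}           # docs containing the term
assert sorted(all_doc_ids - postings_for_term) == [1, 3, 5]  # NOT term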
Example 4
def build_index(dir_of_docs, dict_file, postings_file):
    docs = [
        f for f in os.listdir(dir_of_docs)
        if isfile(join(dir_of_docs, f)) and f.isdigit()
    ]
    sorted_doc_ids = sorted(docs, key=lambda x: int(basename(x)))
    dictionary = Dictionary()
    with PostingFile(postings_file, 'w+') as p_file:
        for doc_id in sorted_doc_ids:
            doc_path = dir_of_docs + '/' + doc_id
            doc_id = int(doc_id)
            terms = process_file(doc_path)
            counter = Counter(terms)
            print "Indexing document " + str(doc_id) + "..."
            document_vector = {}
            for term, freq in counter.iteritems():
                # Append each new entry at the end of the postings file.
                p_file.file_obj.seek(0, os.SEEK_END)
                curr_ptr = p_file.file_obj.tell()

                if dictionary.has_term(term):
                    # Overwrite previous posting entry for the term
                    prev_entry_ptr = dictionary.end_ptr_hash[term]
                    prev_entry = p_file.read_posting_entry(prev_entry_ptr)
                    p_file.write_posting_entry(prev_entry.doc_id,
                                               prev_entry.term_freq,
                                               curr_ptr,
                                               overwrite_pos=prev_entry_ptr)

                # Write new entry to posting file at end
                p_file.write_posting_entry(doc_id, freq)

                dictionary.add_term(term, doc_id, curr_ptr)

                # Build document_vector
                document_vector[term] = calculate_tf_wt(freq)

            # print "Document vector: ", document_vector
            # Save document length into dictionary
            document_length = calculate_document_length(document_vector)
            # print "Document length: ", document_length
            dictionary.doc_id_length_hash[doc_id] = document_length

        # print "dictionary doc ids to length: ", dictionary.doc_id_length_hash
        # Check if the dictionary and postings are ok
        # print_term_to_postings(dictionary, p_file)

    p_file.close()

    # Save dictionary to file
    dictionary.save_dict_to_file(dict_file)
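The helpers calculate_tf_wt and calculate_document_length are not shown. A plausible sketch, assuming the usual log-frequency weight and Euclidean (cosine) document length; the real helpers may differ:

import math

def calculate_tf_wt(freq):
    # Log-frequency weighting: 1 + log10(tf) for tf > 0, else 0.
    return 1 + math.log10(freq) if freq > 0 else 0.0

def calculate_document_length(document_vector):
    # Euclidean norm of the weighted term vector.
    return math.sqrt(sum(w * w for w in document_vector.values()))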
Example 5
def build(training_dir, dict_file, postings_file):
    dictionary = Dictionary()

    # Read each file in the training dir.
    filepaths = []
    for filename in os.listdir(training_dir):
        filepaths.append(os.path.join(training_dir, filename))

    # Sort the filepaths according to doc_id
    filepaths = sorted(filepaths, key=lambda x: int(os.path.basename(x)))

    # The filepath list is materialised first so it can be truncated
    # here when testing on a smaller set of documents.
    # NOTE(michael): for testing.
    # filepaths = filepaths[:10]

    with PostingsFile(
            postings_file, mode='w+',
            entry_cls=PostingsFileEntryWithFrequencies) as postings_file:
        for filepath in filepaths:
            # TODO(michael): Making assumption that document is an int.
            doc_id = int(os.path.basename(filepath))
            terms = process_file(filepath)
            for term in terms:
                # Create postings file entry if entry does not exist for
                # `(term, doc_id)` pair.
                if not dictionary.has_entry(term, doc_id):
                    # Update the postings file entry of the current term's
                    # previous `(term, doc_id)` pair so that it points to
                    # the entry we are about to add.
                    if dictionary.get_frequency(term) != 0:
                        previous_node_location = dictionary.get_tail(term)
                        previous_entry = \
                            postings_file.get_entry(previous_node_location)
                        previous_entry.next_pointer = postings_file.pointer
                        postings_file.write_entry(previous_entry)

                    # Add new postings file entry for the `(term, doc_id)` pair.
                    dictionary.add_term(term, doc_id, postings_file.pointer)
                    new_entry = PostingsFileEntryWithFrequencies(doc_id)
                    postings_file.write_entry(new_entry)

                # Update postings file entry term frequency. (Increment).
                # NOTE(michael): We can safely use the tail pointer since we
                # process documents in order and not at random.
                current_term_location = dictionary.get_tail(term)
                current_term_entry = \
                    postings_file.get_entry(current_term_location)
                current_term_entry.term_freq += 1
                postings_file.write_entry(current_term_entry)

    # Write dictionary to file.
    with open(dict_file, 'w') as dictionary_file:
        dictionary_file.write(dictionary.to_json())
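The pattern above stores each term's postings as an on-disk linked list: every new entry is appended at the end of the file, and the previous tail entry is rewritten in place so its next pointer targets the new one. An in-memory analogue of the same bookkeeping (hypothetical names):

entries = []              # flat store of [doc_id, term_freq, next_index]
heads, tails = {}, {}     # term -> index of first / last posting

def append_posting(term, doc_id):
    entries.append([doc_id, 1, None])
    new_index = len(entries) - 1
    if term in tails:
        entries[tails[term]][2] = new_index  # patch the old tail's pointer
    else:
        heads[term] = new_index              # first posting for this term
    tails[term] = new_index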
Example 6
def test_dictionary_add_term_pointers():
    d = Dictionary()

    first_pointer = 0
    d.add_term('asdf', 1, first_pointer)
    assert_eq(1, d.get_frequency('asdf'))
    assert_eq(first_pointer, d.get_head('asdf'))
    assert_eq(first_pointer, d.get_tail('asdf'))

    second_pointer = 10
    d.add_term('asdf', 2, second_pointer)
    assert_eq(2, d.get_frequency('asdf'))
    assert_eq(first_pointer, d.get_head('asdf'))
    assert_eq(second_pointer, d.get_tail('asdf'))
Example 7
def test_dictionary_add_term():
    d = Dictionary()

    first_pointer = 10
    d.add_term('asdf', 1, first_pointer)
    assert_eq(1, d.get_frequency('asdf'))
    assert_eq(first_pointer, d.get_head('asdf'))
    assert_eq(first_pointer, d.get_tail('asdf'))

    next_pointer = 20
    d.add_term('asdf', 2, next_pointer)
    assert_eq(2, d.get_frequency('asdf'))
    assert_eq(first_pointer, d.get_head('asdf'))
    assert_eq(next_pointer, d.get_tail('asdf'))

    third_pointer = 30
    d.add_term('qwer', 2, third_pointer)
    assert_eq(1, d.get_frequency('qwer'))
    assert_eq(third_pointer, d.get_head('qwer'))
    assert_eq(third_pointer, d.get_tail('qwer'))

    fourth_pointer = 40
    d.add_term('asdf', 2, fourth_pointer)
    assert_eq(2, d.get_frequency('asdf'))
    assert_eq(first_pointer, d.get_head('asdf'))
    assert_eq(next_pointer, d.get_tail('asdf'))
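Taken together, these assertions fix the semantics of add_term: the first pointer seen for a term becomes its head, each new doc id moves the tail, and re-adding an existing (term, doc_id) pair is a no-op. A hypothetical minimal implementation that passes both tests (superseding the tiny sketch after Example 2):

class Dictionary(object):
    def __init__(self):
        self._docs = {}   # term -> set of doc ids
        self._head = {}   # term -> pointer of the first posting
        self._tail = {}   # term -> pointer of the last posting

    def add_term(self, term, doc_id, pointer):
        docs = self._docs.setdefault(term, set())
        if doc_id in docs:
            return        # duplicate (term, doc_id): leave pointers alone
        docs.add(doc_id)
        self._head.setdefault(term, pointer)
        self._tail[term] = pointer

    def get_frequency(self, term):
        return len(self._docs.get(term, ()))

    def get_head(self, term):
        return self._head[term]

    def get_tail(self, term):
        return self._tail[term]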
Example 8
def test_dictionary_to_json_from_json():
    d = Dictionary()
    d.add_term('asdf', 1, 1)
    d.add_term('asdf', 2, 1)
    d.add_term('qwer', 1, 1)
    d.add_term('zxcv', 1, 1)

    d2 = Dictionary.from_json(d.to_json())
    assert_eq(d2.all_docs(), d.all_docs())
    assert_eq(d2.all_terms(), d.all_terms())

    assert_eq(d2.get_frequency('asdf'), d.get_frequency('asdf'))
    assert_eq(d2.get_frequency('qwer'), d.get_frequency('qwer'))
    assert_eq(d2.get_frequency('zxcv'), d.get_frequency('zxcv'))

    assert_eq(d2.get_head('asdf'), d.get_head('asdf'))
    assert_eq(d2.get_head('qwer'), d.get_head('qwer'))
    assert_eq(d2.get_head('zxcv'), d.get_head('zxcv'))

    assert_eq(d2.get_tail('asdf'), d.get_tail('asdf'))
    assert_eq(d2.get_tail('qwer'), d.get_tail('qwer'))
    assert_eq(d2.get_tail('zxcv'), d.get_tail('zxcv'))
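A round trip like this only requires the per-term state to survive serialisation. Sketched against the minimal class above (hypothetical; the project's to_json/from_json will have their own layout):

import json

def to_json(d):
    return json.dumps({t: [sorted(d._docs[t]), d._head[t], d._tail[t]]
                       for t in d._docs})

def from_json(raw):
    d = Dictionary()
    for term, (docs, head, tail) in json.loads(raw).items():
        d._docs[term] = set(docs)
        d._head[term], d._tail[term] = head, tail
    return d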
Example 9
def build_index(in_dir, out_dict, out_postings):
    """
    build index from documents stored in the input directory,
    then output the dictionary file and postings file
    """
    print('indexing...')

    indexing_doc_files = sorted(map(int, os.listdir(in_dir)))
    dictionary = Dictionary(out_dict)

    temp_dictionary = dict()
    temp_dictionary[ALL_DOCS] = set()
    # For each document, get its terms and add them to the temporary
    # in-memory postings lists.
    for document in indexing_doc_files:
        temp_dictionary[ALL_DOCS].add((document, 0))

        terms = util.read_document(in_dir, document)
        for term in terms:
            temp_dictionary.setdefault(term, set()).add((document, 0))

    # Save dictionary on disk by getting offset in postings file
    with open(temp_file, 'wb') as temp_posting_file:
        for token, docs_set in sorted(temp_dictionary.items()):
            offset = temp_posting_file.tell()
            dictionary.add_term(token, len(docs_set), offset)
            pickle.dump(sorted(list(docs_set)), temp_posting_file)

    # Post-processing step: add skip pointers to the postings lists.
    skip_pointer = SkipPointer("ROOT_L")
    skip_pointer.set_skip_for_posting_list(out_postings, temp_file, dictionary)

    dictionary.save()
    os.remove(temp_file)
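The pickle-with-offsets trick is worth isolating: each sorted postings list is dumped at the current file position, and the offset recorded in the dictionary lets the search phase later load exactly one list. A self-contained sketch (file name and data are illustrative):

import pickle

offsets = {}
with open('temp_postings.bin', 'wb') as tmp:
    for term, docs in sorted({'a': {2, 1}, 'b': {3}}.items()):
        offsets[term] = tmp.tell()      # where this term's list begins
        pickle.dump(sorted(docs), tmp)

with open('temp_postings.bin', 'rb') as f:
    f.seek(offsets['b'])                # load a single postings list
    assert pickle.load(f) == [3]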
Example 10
def test_dictionary_all_terms():
    d = Dictionary()
    assert_eq([], d.all_terms())

    d.add_term('asdf', 1, 1)
    assert_eq(['asdf'], d.all_terms())

    d.add_term('asdf', 2, 1)
    assert_eq(['asdf'], d.all_terms())

    d.add_term('qwer', 1, 1)
    d.add_term('zxcv', 1, 1)
    assert_eq(sorted(['asdf', 'qwer', 'zxcv']), sorted(d.all_terms()))
Example 11
def test_dictionary_all_docs():
    d = Dictionary()
    assert_eq([], d.all_docs())

    d.add_term('asdf', 1, 1)
    assert_eq([1], d.all_docs())

    d.add_term('asdf', 2, 1)
    assert_eq([1, 2], d.all_docs())

    d.add_term('qwer', 1, 1)
    d.add_term('zxcv', 1, 1)
    assert_eq([1, 2], d.all_docs())
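On the minimal sketch from Example 7, these accessors amount to listing the vocabulary and taking the sorted union of doc ids across all terms (hypothetical helpers):

def all_terms(d):
    return sorted(d._docs)

def all_docs(d):
    return sorted(set().union(*d._docs.values()))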
Example 12
def build(training_dir, dict_file, postings_file):
    dictionary = Dictionary()

    # Read each file in the training dir.
    filepaths = []
    for filename in os.listdir(training_dir):
        filepaths.append(os.path.join(training_dir, filename))

    # Sort the filepaths according to doc_id
    filepaths = sorted(filepaths, key=lambda x: int(os.path.basename(x)))

    # The filepath list is materialised first so it can be truncated
    # here when testing on a smaller set of documents.
    # NOTE(michael): for testing.
    # filepaths = filepaths[:10]

    with PostingsFile(postings_file, mode='w+') as postings_file:
        for filepath in filepaths:
            terms = process_file(filepath)
            # TODO(michael): Making assumption that document is an int.
            doc_id = int(os.path.basename(filepath))

            for term in terms:
                if not dictionary.has_entry(term, doc_id):
                    current_node_location = postings_file.pointer

                    if dictionary.get_frequency(term) != 0:
                        # Update previous node in the linked list.
                        previous_node_location = dictionary.get_tail(term)
                        previous_entry = \
                            postings_file.get_entry(previous_node_location)
                        postings_file.write_entry(
                            previous_entry.doc_id,
                            current_node_location,
                            write_location=previous_node_location)

                    dictionary.add_term(term, doc_id, current_node_location)
                    postings_file.write_entry(
                        doc_id, write_location=current_node_location)

        # Skip pointers
        for term in dictionary.all_terms():
            term_frequency = dictionary.get_frequency(term)
            skip_pointer_frequency = int(math.sqrt(term_frequency))

            # Don't bother if too low.
            if skip_pointer_frequency < SKIP_POINTER_THRESHOLD:
                continue

            head = dictionary.get_head(term)
            entries = postings_file.get_entry_list_from_pointer(head)

            for idx in xrange(term_frequency):
                if idx % skip_pointer_frequency == 0:
                    skip_to = idx + skip_pointer_frequency

                    # Nothing to point to.
                    if skip_to >= term_frequency:
                        continue

                    current_entry = entries[idx]
                    skip_to_entry = entries[skip_to]

                    # Add skip pointer.
                    postings_file.write_entry(
                        current_entry.doc_id,
                        current_entry.next_pointer,
                        skip_to_entry.own_pointer,
                        skip_to_entry.doc_id,
                        write_location=current_entry.own_pointer)

    # Write dictionary to file.
    with open(dict_file, 'w') as dictionary_file:
        dictionary_file.write(dictionary.to_json())
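The skip distance here is the classic floor(sqrt(n)) heuristic: a list of n postings gets about sqrt(n) skips of length sqrt(n), which balances the number of skip tests against the entries saved per skip. A small helper showing which positions receive skip pointers under that rule (hypothetical name):

import math

def skip_positions(length):
    distance = int(math.sqrt(length))
    if distance == 0:
        return []
    return [(i, i + distance)           # (from index, to index)
            for i in range(0, length, distance)
            if i + distance < length]

assert skip_positions(9) == [(0, 3), (3, 6)]
assert skip_positions(16) == [(0, 4), (4, 8), (8, 12)]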
Example 13
def build_index(dir_of_docs, dict_file, postings_file):
    docs = [
        f for f in os.listdir(dir_of_docs)
        if isfile(join(dir_of_docs, f)) and f.isdigit()
    ]
    sorted_doc_ids = sorted(docs, key=lambda x: int(basename(x)))
    dictionary = Dictionary()
    with PostingFile(postings_file, 'w+') as p_file:
        for doc_id in sorted_doc_ids:
            doc_path = dir_of_docs + '/' + doc_id
            doc_id = int(doc_id)
            terms = process_file(doc_path)
            print "Indexing document " + str(doc_id) + "..."
            for term in terms:
                # Append each new entry at the end of the postings file.
                p_file.file_obj.seek(0, os.SEEK_END)
                curr_ptr = p_file.file_obj.tell()

                if dictionary.has_term(term):
                    # Overwrite previous posting entry for the term
                    prev_entry_ptr = dictionary.end_ptr_hash[term]
                    prev_entry = p_file.read_posting_entry(prev_entry_ptr)
                    p_file.write_posting_entry(prev_entry.doc_id,
                                               curr_ptr,
                                               overwrite_pos=prev_entry_ptr)

                # Write new entry to posting file at end
                p_file.write_posting_entry(doc_id)

                dictionary.add_term(term, doc_id, curr_ptr)

        for term in dictionary.get_all_terms():
            ptr = dictionary.get_start_ptr(term)
            p_list = p_file.get_posting_list_for_ptr(ptr)
            skip_distance = int(math.sqrt(len(p_list)))
            if skip_distance < SKIP_DIST_THRESHOLD:
                continue
            for idx in range(len(p_list)):
                if idx % skip_distance == 0:
                    curr_entry = p_list[idx]
                    if idx == 0:
                        curr_ptr = ptr
                    else:
                        curr_ptr = p_list[idx - 1].next_ptr
                    skip_idx = idx + skip_distance
                    if skip_idx < len(p_list):
                        skip_entry = p_list[idx + skip_distance]
                        skip_ptr = p_list[idx + skip_distance - 1].next_ptr
                        p_file.write_posting_entry(curr_entry.doc_id,
                                                   curr_entry.next_ptr,
                                                   skip_entry.doc_id,
                                                   skip_ptr,
                                                   overwrite_pos=curr_ptr)

        # Sanity-check that the dictionary and postings agree.
        print_term_to_postings(dictionary, p_file)

    # Save dictionary to file
    dictionary.save_dict_to_file(dict_file)