Example #1
    def setUp(self):
        self.emptyList = LinkedList()
        self.List = LinkedList()

        # populate list with 1, 2, 3, 4
        for i in range(1, 5):
            self.List.append(i)
Example #2
def merge_positional_indexes(
    before: LinkedList[Tuple[str, LinkedList[int]]],
    after: LinkedList[Tuple[str, LinkedList[int]]]
) -> LinkedList[Tuple[str, LinkedList[int]]]:
    result = LinkedList()
    before, after = before.get_head(), after.get_head()
    while before is not None and after is not None:
        before_id, before_positions = before.value
        after_id, after_positions = after.value
        if before_id == after_id:
            merge_result = merge_positions(before_positions, after_positions)
            if merge_result:
                result.append((before_id, merge_result))
            before = before.next()
            after = after.next()
        elif before_id < after_id:
            if before.skip() and before.skip().value[0] <= after_id:
                before = before.skip()
            else:
                before = before.next()
        elif after.skip() and after.skip().value[0] <= before_id:
            after = after.skip()
        else:
            after = after.next()
    return result
Example #3
def get_relevant_docs(
        query: str,
        dictionary: Dict[str, Tuple[float, Tuple[int, int], Tuple[int, int]]],
        vector_lengths: Dict[str, float],
        relevant_doc_ids: List[str],
        document_vectors_dictionary: Dict[str, Tuple[int, int]],
        postings_file: BinaryIO) -> LinkedList:
    query_vector = query_to_vector(query)
    if QUERY_EXPANSION:  # Requires non-normalized terms
        query_vector = query_expansion(query_vector)
    query_vector = normalized_vector(query_vector)
    if RELEVANT_FEEDBACK:  # Requires normalized terms
        query_vector = rocchio_algorithm(query_vector, relevant_doc_ids,
                                         document_vectors_dictionary, ALPHA,
                                         BETA, postings_file)
    scores = defaultdict(float)
    for term, factor in query_vector.items():
        if term not in dictionary:
            continue
        term_postings = load_postings_list(postings_file, dictionary, term)
        idf = dictionary[term][0]
        for doc_id, tf_d in term_postings:
            scores[doc_id] += factor * tf_d * idf
    normalized_scores = sorted(((doc_id, score / vector_lengths[doc_id])
                                for doc_id, score in scores.items()),
                               key=lambda x: x[1],
                               reverse=True)
    relevant_docs = (x[0] for x in normalized_scores if x[1] > THRESHOLD)
    output = LinkedList()
    output.extend(relevant_docs)
    return output
def linked_list_sum(list_a, list_b):
    # a new linked list to return the results in
    results = LinkedList()

    # get the starting values from the linked lists
    node_a, node_b = list_a.root, list_b.root
    # the carry holds any carryover (when the sum of two digits is greater than 9)
    # so it can be added to the next pair of values
    carry = 0

    # while there are still values in at least one list,
    # or there is a carry value greater than 0,
    # continue processing the nodes
    while node_a is not None or node_b is not None or carry > 0:
        # get the values from their respective linked lists,
        # taking care to avoid trying to access a null value
        val_a = 0 if node_a is None else node_a.value
        val_b = 0 if node_b is None else node_b.value

        # find the sum of the node values and the carry values
        new_val = val_a + val_b + carry

        # calculate carryover
        # if the sum is ten or greater, there will be carryover
        if new_val >= 10:
            # the carry is the int division of the value divided by 10
            carry = new_val // 10
            # the new value is the remainder of dividing the new value by 10
            new_val = new_val % 10
        else:
            carry = 0

        # add the new result to the results linked list
        results.add(new_val)

        # find the next nodes, if there are any remaining
        node_a = None if node_a is None else node_a.next
        node_b = None if node_b is None else node_b.next

    return results
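A hedged usage sketch for linked_list_sum: the Node class, the minimal LinkedList (with a root attribute and a tail-appending add method), and the from_digits helper below are hypothetical stand-ins that mirror only the interface the function actually touches; digits are stored least-significant first.

# Hypothetical stand-ins for illustration only: a minimal singly linked list
# exposing exactly what linked_list_sum uses (root, node.value, node.next, add).
class Node:
    def __init__(self, value):
        self.value = value
        self.next = None


class LinkedList:
    def __init__(self):
        self.root = None
        self._tail = None

    def add(self, value):
        # append at the tail so digits keep their least-significant-first order
        node = Node(value)
        if self.root is None:
            self.root = node
        else:
            self._tail.next = node
        self._tail = node


def from_digits(digits):
    ll = LinkedList()
    for d in digits:
        ll.add(d)
    return ll


# 342 + 465 = 807, with digits stored least-significant first
total = linked_list_sum(from_digits([2, 4, 3]), from_digits([5, 6, 4]))
digits, node = [], total.root
while node is not None:
    digits.append(node.value)
    node = node.next
print(digits)  # [7, 0, 8]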
Example #5
def merge_positions(before_positions: LinkedList[int],
                    after_positions: LinkedList[int]):
    result = LinkedList()
    before, after = before_positions.get_head(), after_positions.get_head()

    while before is not None and after is not None:
        before, after = cast(Node[int], before), cast(Node[int],
                                                      after)  # typecasting
        if before.value == after.value - 1:
            result.append(after.value)
            before = before.next()
            after = after.next()
        elif before.value < after.value - 1:
            if before.skip() and before.skip().value <= after.value - 1:
                before = before.skip()
            else:
                before = before.next()
        elif after.skip() and after.skip().value - 1 <= before.value:
            after = after.skip()
        else:
            after = after.next()
    return result
def perform_and(operand_a: LinkedList[int],
                operand_b: LinkedList[int]) -> LinkedList:
    """
    Returns all ids that are ids of operand a and operand b. Copied from HW2.
    """
    result = LinkedList()
    operand_a, operand_b = operand_a.get_head(), operand_b.get_head()
    while operand_a is not None and operand_b is not None:
        if operand_a.value == operand_b.value:
            result.append(operand_a.value)
            operand_a = operand_a.next()
            operand_b = operand_b.next()
        elif operand_a.value < operand_b.value:
            if operand_a.skip() and operand_a.skip().value <= operand_b.value:
                operand_a = operand_a.skip()
            else:
                operand_a = operand_a.next()
        elif operand_b.skip() and operand_b.skip().value <= operand_a.value:
            operand_b = operand_b.skip()
        else:
            operand_b = operand_b.next()
    return result
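The skip-pointer merge in perform_and can be illustrated without the LinkedList class. Below is a minimal, self-contained sketch over plain sorted Python lists, assuming skip pointers are placed every floor(sqrt(n)) positions (a common heuristic, not something stated in the examples above).

import math


def intersect_with_skips(a, b):
    """Intersect two sorted id lists, following a skip pointer whenever the
    jump target is still no larger than the other list's current value."""
    skip_a = max(1, math.isqrt(len(a)))
    skip_b = max(1, math.isqrt(len(b)))
    i = j = 0
    result = []
    while i < len(a) and j < len(b):
        if a[i] == b[j]:
            result.append(a[i])
            i += 1
            j += 1
        elif a[i] < b[j]:
            # take the skip only from a skip-pointer position and only if it
            # does not overshoot b[j]
            if i % skip_a == 0 and i + skip_a < len(a) and a[i + skip_a] <= b[j]:
                i += skip_a
            else:
                i += 1
        elif j % skip_b == 0 and j + skip_b < len(b) and b[j + skip_b] <= a[i]:
            j += skip_b
        else:
            j += 1
    return result


print(intersect_with_skips([1, 3, 5, 7, 9, 11], [3, 9, 11]))  # [3, 9, 11]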
def load_postings_list(postings_file: BinaryIO,
                       dictionary: Dict[str, Tuple[float, Tuple[int, int],
                                                   Tuple[int, int]]],
                       token: str) -> LinkedList[Tuple[str, float]]:
    """
    Loads postings list from postings file using the location provided
    by the dictionary.

    Returns an empty LinkedList if token is not in dictionary.
    """
    if token not in dictionary:
        return LinkedList()
    _, (offset, length), _ = dictionary[token]
    postings_file.seek(offset)
    pickled = postings_file.read(length)
    return pickle.loads(pickled)
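A self-contained sketch of the offset/length scheme load_postings_list relies on, using an in-memory buffer in place of the real postings file; the dictionary layout mirrors the (idf, (offset, length), ...) tuples used above, with the tokens and values invented for illustration.

import io
import pickle

postings_file = io.BytesIO()  # stand-in for the on-disk postings file
dictionary = {}

# Write one pickled postings list per token and remember where it landed.
for token, postings in {'cat': [('d1', 0.5), ('d3', 0.2)], 'dog': [('d2', 1.0)]}.items():
    offset = postings_file.tell()
    data = pickle.dumps(postings)
    postings_file.write(data)
    dictionary[token] = (0.0, (offset, len(data)), (0, 0))

# Reading back is exactly the seek/read/unpickle done by load_postings_list.
_, (offset, length), _ = dictionary['cat']
postings_file.seek(offset)
print(pickle.loads(postings_file.read(length)))  # [('d1', 0.5), ('d3', 0.2)]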
Example #8
    def __init__(self, vertices, edges=None, is_directed=True):
        """ Constructor

            + vertices: Number of vertices
            + edges: List with pairs (x,y) representing edges, where 0 <= x,y < vertices
            + is_directed: whether the graph is directed or not
        """
        self.v = vertices
        self.graph = {node: LinkedList() for node in range(vertices)}
        self.is_directed = is_directed
        if edges:
            for edge in edges:
                w = 1
                if len(edge) > 2:  # Has weight associated
                    w = edge[2]
                self.add_edge(edge[0], edge[1], w)
Example #9
def retrieve_phrase(
        dictionary: Dict[str, Tuple[float, Tuple[int, int],
                                    Tuple[int, int]]], postings_file: BinaryIO,
        tokens: List[str]) -> LinkedList[Tuple[str, LinkedList[int]]]:
    """
    Returns a LinkedList of documents that contain a specific phrase.

    :param dictionary: the combined TF-IDF and positional index dictionary
    :param postings_file: the read-only binary file descriptor
    :param tokens: tokens in phrase
    :return: A LinkedList of (document ID, positions) tuples.
    """
    if not tokens:
        return LinkedList()

    positional_index = load_positional_index(postings_file, dictionary,
                                             tokens[0])
    for token in tokens[1:]:
        next_positional_index = load_positional_index(postings_file,
                                                      dictionary, token)
        positional_index = merge_positional_indexes(positional_index,
                                                    next_positional_index)

    return positional_index
Example #10
class TestLinkedList(unittest.TestCase):  # doubly-linked list

    def setUp(self):
        self.emptyList = LinkedList()
        self.List = LinkedList()

        # populate list with 1, 2, 3, 4
        for i in range(1, 5):
            self.List.append(i)

    def test_repr(self):
        '''Test returning the linked list as a string literal.'''
        self.assertEqual(repr(self.emptyList), str(()))
        tup = (1, 3.14, 'foo', True)

        for i in tup:
            self.emptyList.append(i)

        self.assertEqual(repr(self.emptyList), str(tup))

    def test_len_size(self):
        '''Test returning the length of the list.'''
        for i in range(100):
            self.emptyList.insert(i)

        self.assertEqual(len(self.emptyList), 100)
        self.assertEqual(self.emptyList.size(), 100)

        self.emptyList.pop()
        self.assertEqual(len(self.emptyList), 99)
        self.assertEqual(self.emptyList.size(), 99)

        self.emptyList.shift()
        self.assertEqual(len(self.emptyList), 98)
        self.assertEqual(self.emptyList.size(), 98)

        self.emptyList.remove(4)
        self.assertEqual(len(self.emptyList), 97)
        self.assertEqual(self.emptyList.size(), 97)

    def test_list(self):
        '''Test returning the linked list as a list literal.'''
        self.assertEqual(list(self.emptyList), [])
        li = [1, 3.14, 'foo', True]

        for i in li:
            self.emptyList.append(i)

        self.assertEqual(list(self.emptyList), li)

    def test_tuple(self):
        '''Test returning the linked list as a tuple literal.'''
        self.assertEqual(tuple(self.emptyList), ())
        tup = (1, 3.14, 'foo', True)

        for i in tup:
            self.emptyList.append(i)

        self.assertEqual(tuple(self.emptyList), tup)

    def test_in_contains(self):
        '''Test returning T/F for whether the list contains certain values.'''
        self.assertTrue(2 in self.List)
        self.assertFalse(10 in self.List)

        self.assertTrue(self.List.contains(2))
        self.assertFalse(self.List.contains(10))

    def test_insert(self):
        '''Test inserting values at the head of the list.'''
        for i in range(1, 5):
            self.emptyList.insert(i)

        self.assertEqual(tuple(self.emptyList), (4, 3, 2, 1))

    def test_append(self):
        '''Test appending values at the end of the list.'''
        self.List.append(True)
        self.assertEqual(tuple(self.List), (1, 2, 3, 4, True))

    def test_shift(self):
        '''Test removing and returning the head of the list.'''
        self.assertEqual(self.List.shift(), 1)
        self.assertEqual(tuple(self.List), (2, 3, 4))
        self.assertEqual(self.List.shift(), 2)
        self.assertEqual(self.List.shift(), 3)
        self.assertEqual(self.List.shift(), 4)
        self.assertEqual(tuple(self.List), ())

        with self.assertRaises(IndexError):
            self.emptyList.shift()

    def test_pop(self):
        '''Test popping off and returning the tail of the list.'''
        self.assertEqual(self.List.pop(), 4)
        self.assertEqual(tuple(self.List), (1, 2, 3))
        self.assertEqual(self.List.pop(), 3)
        self.assertEqual(self.List.pop(), 2)
        self.assertEqual(self.List.pop(), 1)
        self.assertEqual(tuple(self.List), ())

        with self.assertRaises(IndexError):
            self.emptyList.pop()

    def test_shift_and_pop(self):
        '''Testing shifting and popping interchangeably.'''
        self.assertEqual(self.List.pop(), 4)
        self.assertEqual(tuple(self.List), (1, 2, 3))
        self.assertEqual(self.List.shift(), 1)
        self.assertEqual(tuple(self.List), (2, 3))
        self.assertEqual(self.List.pop(), 3)
        self.assertEqual(tuple(self.List), (2, ))
        self.assertEqual(self.List.shift(), 2)
        self.assertEqual(tuple(self.List), ())

        with self.assertRaises(IndexError):
            self.emptyList.pop()

        with self.assertRaises(IndexError):
            self.emptyList.shift()

    def test_remove(self):
        '''Testing finding and removing values from the list.'''
        for i in range(1, 5):
            self.List.append(i)

        self.List.remove(2)
        self.assertEqual(tuple(self.List), (1, 3, 4, 1, 2, 3, 4))

        with self.assertRaises(ValueError):
            self.List.remove(6)
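The suite above can be driven by the standard unittest runner; the import shown in the comment is an assumption about where the LinkedList under test lives.

# Assuming the doubly-linked list under test is importable, e.g.
#     from linked_list import LinkedList
if __name__ == '__main__':
    unittest.main()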
def perform_boolean_query(tokens: List[Tuple[str, Union[List[str], str]]],
                          dictionary: Dict[str, Tuple[float, Tuple[int, int],
                                                      Tuple[int, int]]],
                          postings_file: BinaryIO) -> LinkedList:
    """
    Returns a LinkedList of documents that satisfy a purely conjunctive boolean
    query.

    :param tokens: A List containing Tuples with the form
        (<'phrase' | 'nonphrase'>, <term>) where <term> is a query term/phrase
        -- phrases are Lists, single terms are strings.
    :param dictionary: the combined TF-IDF and positional index dictionary
    :param postings_file: the read-only binary file descriptor for the postings
        list file.
    :return: A LinkedList of document IDs that satisfy the boolean query.
    """
    def get_idf(token: Tuple[str, Union[List[str], str]]) -> float:
        """
        Helper function to get the IDF of a phrase/term.

        The IDF of a phrase is estimated by summing up the IDF of the
        individual terms. This works out as phrases usually produce small
        resultant postings lists, and a phrase with rare terms (high IDF) will
        produce smaller postings lists than a phrase with common terms.
        """
        term_type, phrase = token
        if term_type == TokenType.PHRASE:
            phrase = cast(List[str], phrase)
            # Returns the sum of the idfs of each term
            # (first item in the dictionary tuple)
            return sum(
                map(
                    lambda term: dictionary[term][0]
                    if term in dictionary else 0, phrase))
        elif term_type == TokenType.NON_PHRASE:
            term = cast(str, phrase)
            return dictionary[term][0] if term in dictionary else 0

    def get_postings_list(term_type: str, phrase: Union[List[str],
                                                        str]) -> LinkedList:
        """
        Helper function to get the postings list of a phrase/term.
        Returns empty LinkedList if phrase does not exist.
        """
        if term_type == TokenType.PHRASE:
            return retrieve_phrase(dictionary, postings_file, phrase)
        elif term_type == TokenType.NON_PHRASE:
            term = cast(str, phrase)
            return load_postings_list(postings_file, dictionary, term)

    # Guard against empty tokens list
    if not tokens:
        return LinkedList()

    # Optimization for faster AND computation -- process rarer (higher-IDF) terms first.
    tokens.sort(key=get_idf, reverse=True)

    # Generate a list of Postings lists
    resultant_list: LinkedList[int] = get_postings_list(*tokens[0])

    # Successively use AND on the tokens' postings lists
    for token in tokens[1:]:
        # Short circuit for empty LinkedList --
        # cannot be done easily when using `reduce`
        if not resultant_list:
            break
        resultant_list = perform_and(resultant_list, get_postings_list(*token))

    return resultant_list
Example #12
        quick_sort(array, lower, partition_at + 1)
        quick_sort(array, partition_at + 1, n)

    return array


def merge_sort(array, n=None):
    pass


def radix_sort(array, n=None):
    pass


if __name__ == '__main__':
    large_ll = LinkedList(2)
    cur_node = large_ll
    for i in [7, 5, 8, 4, 6, 1, 3]:
        cur_node.next_node = LinkedList(i)
        cur_node = cur_node.next_node

    unsorted_large_array = [2, 7, 5, 8, 4, 6, 1, 3]
    sorted_large_array = list(range(2 << 20))
    # bubble_sort(unsorted_large_array.copy())
    # bubble_sort(sorted_large_array.copy())
    # insertion_sort(unsorted_large_array.copy())
    # insertion_sort(sorted_large_array.copy())
    # insertion_sort_ll(large_ll)
    # selection_sort(unsorted_large_array.copy())
    # assert sorted_large_array == selection_sort(sorted_large_array.copy())
    print(quick_sort(unsorted_large_array.copy()))