def setUp(self):
    """Build the two fixtures used by the tests.

    ``self.emptyList`` is left without elements, while ``self.List`` receives
    the values 1, 2, 3, 4 through ``append`` in that order.
    """
    self.emptyList = LinkedList()
    self.List = LinkedList()
    for value in (1, 2, 3, 4):
        self.List.append(value)
def merge_positional_indexes(
        before: LinkedList[Tuple[str, LinkedList[int]]],
        after: LinkedList[Tuple[str, LinkedList[int]]]
) -> LinkedList[Tuple[str, LinkedList[int]]]:
    """
    Intersects two positional indexes on their document ids.

    Both inputs are walked in lock-step from their heads. When the two current
    entries carry the same id, their position lists are combined with
    `merge_positions`; the entry is kept only if that combined list is truthy.
    Otherwise the side with the smaller id is advanced.

    :param before entries of the form (<id>, <positions>) for the earlier term
    :param after entries of the form (<id>, <positions>) for the later term
    :return: a new LinkedList of (<id>, <merged positions>) entries
    """
    merged = LinkedList()
    left, right = before.get_head(), after.get_head()

    while not (left is None or right is None):
        left_id, left_positions = left.value
        right_id, right_positions = right.value

        if left_id == right_id:
            adjacent = merge_positions(left_positions, right_positions)
            if adjacent:
                merged.append((left_id, adjacent))
            left, right = left.next(), right.next()
            continue

        # Advance the lagging side. The node returned by skip() is taken only
        # when it does not overshoot the other side's id; otherwise step by one.
        # (skip() is presumably a skip pointer without side effects -- confirm.)
        if left_id < right_id:
            jump = left.skip()
            left = jump if jump and jump.value[0] <= right_id else left.next()
        else:
            jump = right.skip()
            right = jump if jump and jump.value[0] <= left_id else right.next()

    return merged
def get_relevant_docs(query: str,
                      dictionary: Dict[str, Tuple[float, Tuple[int, int], Tuple[int, int]]],
                      vector_lengths: Dict[str, float],
                      relevant_doc_ids: List[str],
                      document_vectors_dictionary: Dict[str, Tuple[int, int]],
                      postings_file: BinaryIO) -> LinkedList:
    """
    Ranks documents against a free-text query and returns the ids that score
    above THRESHOLD, best first.

    The query is turned into a vector, optionally expanded (QUERY_EXPANSION),
    normalized, and optionally adjusted with `rocchio_algorithm`
    (RELEVANT_FEEDBACK). Each query term found in the dictionary contributes
    factor * tf * idf to every document in its postings list; the accumulated
    score is then divided by the document's entry in `vector_lengths`.

    :param query the raw query string
    :param dictionary maps a term to a tuple whose first item is the term's idf
    :param vector_lengths maps a document id to the divisor applied to its score
    :param relevant_doc_ids ids handed to `rocchio_algorithm`
    :param document_vectors_dictionary handed to `rocchio_algorithm`
    :param postings_file the binary file the postings lists are read from
    :return: a LinkedList of document ids ordered by descending score
    """
    query_vector = query_to_vector(query)
    if QUERY_EXPANSION:
        # Expansion has to see the non-normalized terms.
        query_vector = query_expansion(query_vector)
    query_vector = normalized_vector(query_vector)
    if RELEVANT_FEEDBACK:
        # Feedback has to see the normalized terms.
        query_vector = rocchio_algorithm(query_vector, relevant_doc_ids,
                                         document_vectors_dictionary,
                                         ALPHA, BETA, postings_file)

    scores = defaultdict(float)
    for term, factor in query_vector.items():
        if term in dictionary:
            postings = load_postings_list(postings_file, dictionary, term)
            idf = dictionary[term][0]
            for doc_id, tf_d in postings:
                scores[doc_id] += factor * tf_d * idf

    ranked = [(doc_id, raw / vector_lengths[doc_id])
              for doc_id, raw in scores.items()]
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    output = LinkedList()
    output.extend(doc_id for doc_id, score in ranked if score > THRESHOLD)
    return output
def linked_list_sum(list_a, list_b):
    """Add two linked lists node by node, carrying overflow to the next node.

    Both lists are traversed from their ``root``. At each step the two node
    values (0 for a list that has already ended) and the pending carry are
    summed; a sum of 10 or more is split into a carry and a remainder. The
    resulting value is handed to ``add`` on a fresh ``LinkedList``, which is
    returned.
    """
    totals = LinkedList()
    cursor_a = list_a.root
    cursor_b = list_b.root
    carry = 0

    # Keep going while either list still has a node OR a carry is pending.
    while cursor_a is not None or cursor_b is not None or carry > 0:
        # An exhausted list contributes zero.
        digit_a = node_value = cursor_a.value if cursor_a is not None else 0
        digit_b = cursor_b.value if cursor_b is not None else 0
        column = digit_a + digit_b + carry

        if column >= 10:
            # Quotient moves on to the next column, remainder stays here.
            carry, column = divmod(column, 10)
        else:
            carry = 0

        totals.add(column)

        # Step forward only on the lists that still have nodes.
        if cursor_a is not None:
            cursor_a = cursor_a.next
        if cursor_b is not None:
            cursor_b = cursor_b.next

    return totals
def merge_positions(before_positions: LinkedList[int], after_positions: LinkedList[int]):
    """
    Collects every position in `after_positions` that is exactly one greater
    than some position in `before_positions`.

    :param before_positions positions of the earlier term
    :param after_positions positions of the later term
    :return: a new LinkedList holding the matching values from `after_positions`
    """
    matches = LinkedList()
    left, right = before_positions.get_head(), after_positions.get_head()

    while left is not None and right is not None:
        left, right = cast(Node[int], left), cast(Node[int], right)  # typecasting
        wanted = right.value - 1  # the `before` position that would be adjacent

        if left.value == wanted:
            matches.append(right.value)
            left, right = left.next(), right.next()
        elif left.value < wanted:
            # `before` lags: take the node from skip() unless it overshoots.
            jump = left.skip()
            left = jump if jump and jump.value <= wanted else left.next()
        else:
            # `after` lags: same idea on the other list.
            jump = right.skip()
            right = jump if jump and jump.value - 1 <= left.value else right.next()

    return matches
def perform_and(operand_a: LinkedList[int], operand_b: LinkedList[int]) -> LinkedList:
    """
    Intersects two postings lists: the result holds every value that occurs
    in both operand a and operand b. Copied from HW2.

    The walk advances whichever side currently holds the smaller value, so it
    relies on both operands being in ascending order (presumably guaranteed by
    the index builder -- verify against the caller).
    """
    common = LinkedList()
    left, right = operand_a.get_head(), operand_b.get_head()

    while not (left is None or right is None):
        if left.value == right.value:
            common.append(left.value)
            left, right = left.next(), right.next()
            continue

        # Move the lagging side, taking the node from skip() when that node
        # does not go past the other side's current value.
        if left.value < right.value:
            jump = left.skip()
            left = jump if jump and jump.value <= right.value else left.next()
        else:
            jump = right.skip()
            right = jump if jump and jump.value <= left.value else right.next()

    return common
def load_postings_list(postings_file: BinaryIO,
                       dictionary: Dict[str, Tuple[float, Tuple[int, int], Tuple[int, int]]],
                       token: str) -> LinkedList[Tuple[str, float]]:
    """
    Reads and unpickles the postings list of `token`.

    The dictionary entry of a token is a 3-tuple whose middle item is the
    (offset, length) of the pickled postings list inside `postings_file`.
    A token without a dictionary entry yields an empty LinkedList.

    NOTE(review): the bytes are passed to `pickle.loads`, so the postings file
    must come from a trusted source.
    """
    if token in dictionary:
        _first, (start, size), _last = dictionary[token]
        postings_file.seek(start)
        raw = postings_file.read(size)
        return pickle.loads(raw)
    return LinkedList()
def __init__(self, vertices, edges=None, is_directed=True):
    """
    Constructor
    + vertices: Number of vertices
    + edges: List with pairs (x,y) representing edges, where 0 <= x,y < vertices;
      an optional third item is used as the edge weight (1 when absent)
    + is_directed: whether the graph is directed or not
    """
    self.v = vertices

    # One LinkedList per vertex id 0 .. vertices-1.
    adjacency = {}
    for node in range(vertices):
        adjacency[node] = LinkedList()
    self.graph = adjacency

    self.is_directed = is_directed

    for edge in edges or ():
        weight = edge[2] if len(edge) > 2 else 1
        self.add_edge(edge[0], edge[1], weight)
def retrieve_phrase(
        dictionary: Dict[str, Tuple[float, Tuple[int, int], Tuple[int, int]]],
        postings_file: BinaryIO,
        tokens: List[str]) -> LinkedList[Tuple[str, LinkedList[int]]]:
    """
    Finds the documents that contain the given tokens as a phrase.

    The positional index of the first token is loaded and then merged, one
    token at a time and in order, with the positional index of each following
    token via `merge_positional_indexes`.

    :param dictionary passed through to `load_positional_index`
    :param postings_file the read-only binary file descriptor
    :param tokens tokens in phrase
    :return: the merged positional index; an empty LinkedList when `tokens`
        is empty.
    """
    if not tokens:
        return LinkedList()

    first_token, *following = tokens
    merged = load_positional_index(postings_file, dictionary, first_token)
    for token in following:
        merged = merge_positional_indexes(
            merged,
            load_positional_index(postings_file, dictionary, token))
    return merged
class TestLinkedList(unittest.TestCase):
    """Unit tests for ``LinkedList`` (a doubly-linked list)."""

    def setUp(self):
        """Create an empty list and a list populated with 1, 2, 3, 4."""
        self.emptyList = LinkedList()
        self.List = LinkedList()
        # populate list with 1, 2, 3, 4
        for i in range(1, 5):
            self.List.append(i)

    def test_repr(self):
        '''Test returning the linked list as a string literal.'''
        self.assertEqual(repr(self.emptyList), str(()))
        tup = (1, 3.14, 'foo', True)
        for i in tup:
            self.emptyList.append(i)
        self.assertEqual(repr(self.emptyList), str(tup))

    def test_len_size(self):
        '''Test returning the length of the list.'''
        for i in range(100):
            self.emptyList.insert(i)
        self.assertEqual(len(self.emptyList), 100)
        self.assertEqual(self.emptyList.size(), 100)
        self.emptyList.pop()
        self.assertEqual(len(self.emptyList), 99)
        self.assertEqual(self.emptyList.size(), 99)
        self.emptyList.shift()
        self.assertEqual(len(self.emptyList), 98)
        self.assertEqual(self.emptyList.size(), 98)
        self.emptyList.remove(4)
        self.assertEqual(len(self.emptyList), 97)
        self.assertEqual(self.emptyList.size(), 97)

    def test_tuple(self):
        '''Test returning the linked list as a tuple literal.'''
        self.assertEqual(tuple(self.emptyList), ())
        tup = (1, 3.14, 'foo', True)
        for i in tup:
            self.emptyList.append(i)
        self.assertEqual(tuple(self.emptyList), tup)

    def test_list(self):
        '''Test returning the linked list as a list literal.'''
        self.assertEqual(list(self.emptyList), [])
        li = [1, 3.14, 'foo', True]
        for i in li:
            self.emptyList.append(i)
        self.assertEqual(list(self.emptyList), li)

    def test_in_contains(self):
        '''Test returning T/F for whether the list contains certain values.'''
        self.assertTrue(2 in self.List)
        self.assertFalse(10 in self.List)
        self.assertTrue(self.List.contains(2))
        self.assertFalse(self.List.contains(10))

    def test_insert(self):
        '''Test inserting values at the head of the list.'''
        for i in range(1, 5):
            self.emptyList.insert(i)
        # Each insert goes to the head, so the order comes out reversed.
        self.assertEqual(tuple(self.emptyList), (4, 3, 2, 1))

    def test_append(self):
        '''Test appending values at the end of the list.'''
        self.List.append(True)
        self.assertEqual(tuple(self.List), (1, 2, 3, 4, True))

    def test_shift(self):
        '''Test removing and returning the head of the list.'''
        self.assertEqual(self.List.shift(), 1)
        self.assertEqual(tuple(self.List), (2, 3, 4))
        self.assertEqual(self.List.shift(), 2)
        self.assertEqual(self.List.shift(), 3)
        self.assertEqual(self.List.shift(), 4)
        self.assertEqual(tuple(self.List), ())
        with self.assertRaises(IndexError):
            self.emptyList.shift()

    def test_pop(self):
        '''Test popping off and returning the tail of the list.'''
        self.assertEqual(self.List.pop(), 4)
        self.assertEqual(tuple(self.List), (1, 2, 3))
        self.assertEqual(self.List.pop(), 3)
        self.assertEqual(self.List.pop(), 2)
        self.assertEqual(self.List.pop(), 1)
        self.assertEqual(tuple(self.List), ())
        with self.assertRaises(IndexError):
            self.emptyList.pop()

    # The method name (…_post rather than …_pop) is kept unchanged so that
    # existing test selections by name keep working.
    def test_shift_and_post(self):
        '''Test shifting and popping interchangeably.'''
        self.assertEqual(self.List.pop(), 4)
        self.assertEqual(tuple(self.List), (1, 2, 3))
        self.assertEqual(self.List.shift(), 1)
        self.assertEqual(tuple(self.List), (2, 3))
        self.assertEqual(self.List.pop(), 3)
        self.assertEqual(tuple(self.List), (2, ))
        self.assertEqual(self.List.shift(), 2)
        self.assertEqual(tuple(self.List), ())
        with self.assertRaises(IndexError):
            self.emptyList.pop()
        with self.assertRaises(IndexError):
            self.emptyList.shift()

    def test_remove(self):
        '''Test finding and removing values from the list.'''
        # Append 1..4 a second time so the list contains duplicates.
        for i in range(1, 5):
            self.List.append(i)
        self.List.remove(2)
        # Only the first occurrence of 2 is expected to disappear.
        self.assertEqual(tuple(self.List), (1, 3, 4, 1, 2, 3, 4))
        with self.assertRaises(ValueError):
            self.List.remove(6)
def perform_boolean_query(tokens: List[Tuple[str, Union[List[str], str]]],
                          dictionary: Dict[str, Tuple[float, Tuple[int, int], Tuple[int, int]]],
                          postings_file: BinaryIO) -> LinkedList:
    """
    Returns a LinkedList of documents that satisfy a purely conjunctive boolean query.

    The caller's `tokens` list is not modified; ordering is done on a copy.

    :param tokens: A List containing Tuples with the form
        (<'phrase' | 'nonphrase'>, <term>) where <term> is a query term/phrase --
        phrases are Lists, single terms are strings.
    :param dictionary the combined TF-IDF and positional index dictionary
    :param postings_file the read-only binary file descriptor for the postings list file.
    :return: A LinkedList of document IDs that satisfy the boolean query.
    """

    def get_idf(token: Tuple[str, Union[List[str], str]]) -> float:
        """
        Helper function to get the IDF of a phrase/term.
        The IDF of a phrase is estimated by summing up the IDF of the individual terms.
        This works out as phrases usually produce small resultant postings lists,
        and a phrase with rare terms (high IDF) will produce smaller postings lists
        than a phrase with common terms. Terms missing from the dictionary count as 0.
        """
        term_type, phrase = token
        if term_type == TokenType.PHRASE:
            phrase = cast(List[str], phrase)
            # Sum of the idfs of each term (first item in the dictionary tuple)
            return sum(dictionary[term][0] if term in dictionary else 0
                       for term in phrase)
        elif term_type == TokenType.NON_PHRASE:
            term = cast(str, phrase)
            return dictionary[term][0] if term in dictionary else 0

    def get_postings_list(term_type: str, phrase: Union[List[str], str]) -> LinkedList:
        """
        Helper function to get the postings list of a phrase/term.
        Returns empty LinkedList if phrase does not exist.
        """
        if term_type == TokenType.PHRASE:
            return retrieve_phrase(dictionary, postings_file, phrase)
        elif term_type == TokenType.NON_PHRASE:
            term = cast(str, phrase)
            return load_postings_list(postings_file, dictionary, term)

    # Guard against empty tokens list
    if not tokens:
        return LinkedList()

    # Optimization for faster AND computation -- start with the rarer terms
    # (highest IDF first). `sorted` works on a copy, so the caller's list keeps
    # its original order instead of being reordered as a hidden side effect.
    ordered_tokens = sorted(tokens, key=get_idf, reverse=True)

    # NOTE(review): perform_and compares node values directly, while the
    # annotations on retrieve_phrase / load_postings_list describe tuple
    # entries -- confirm the entries of both kinds are comparable as intended.
    resultant_list: LinkedList[int] = get_postings_list(*ordered_tokens[0])

    # Successively use AND on the tokens' postings lists
    for token in ordered_tokens[1:]:
        # Short circuit for empty LinkedList --
        # cannot be done easily when using `reduce`
        if not resultant_list:
            break
        resultant_list = perform_and(resultant_list, get_postings_list(*token))

    return resultant_list
quick_sort(array, lower, partition_at + 1) quick_sort(array, partition_at + 1, n) return array def merge_sort(array, n=None): pass def radix_sort(array, n=None): pass if __name__ == '__main__': large_ll = LinkedList(2) cur_node = large_ll for i in [7, 5, 8, 4, 6, 1, 3]: cur_node.next_node = LinkedList(i) cur_node = cur_node.next_node unsorted_large_array = [2, 7, 5, 8, 4, 6, 1, 3] sorted_large_array = list(range(2 << 20)) # bubble_sort(unsorted_large_array.copy()) # bubble_sort(sorted_large_array.copy()) # insertion_sort(unsorted_large_array.copy()) # insertion_sort(sorted_large_array.copy()) # insertion_sort_ll(large_ll) # selection_sort(unsorted_large_array.copy()) # assert sorted_large_array == selection_sort(sorted_large_array.copy()) print(quick_sort(unsorted_large_array.copy()))