def get_scores(self, terms):
     """Accumulate a length-normalised relevancy score for every matching file.

     For each query term found in ``self.term_freqs`` the weighted frequency
     (``self.get_wf``) of that term in each file is summed per file; every
     sum is then divided by the file's entry in ``self.doc_length``.

     Args:
         terms (list) : a list of str
     Returns:
         HashTableLinear : the score table keyed by filename. Note that the
         table itself is returned, not a list of tuples; its occupied
         ``hash_table`` slots are read here as (filename, score) pairs.
     """
     totals = HashTableLinear()
     for term in terms:
         # Terms that were never indexed contribute nothing.
         if term not in self.term_freqs:
             continue
         postings = self.term_freqs[term]
         for slot in postings.hash_table:
             # Empty slots of the underlying array are stored as None.
             if slot is None:
                 continue
             if slot[0] in totals:
                 totals[slot[0]] += self.get_wf(slot[1])
             else:
                 totals.put(slot[0], self.get_wf(slot[1]))
     # Normalise each accumulated weight by the document length.
     for slot in totals.hash_table:
         if slot is not None:
             totals[slot[0]] /= self.doc_length[slot[0]]
     return totals
Exemple #2
0
    def count_words(self, filename, words):
        """ Count words in a file and store the frequency of each word in the term_freqs hash table.

        Words should not contain stopwords. The total number of words in the
        file is also stored in the doc_length hash table.

        Each distinct word is tallied once up front, so the method makes a
        single pass over ``words`` instead of calling ``words.count`` for every
        occurrence, and each (filename, frequency) pair is stored once.

        Args:
            filename (str): the file name
            words (list): a list of words
        """
        # store total count of words in the doc_length hash table
        self.doc_length.put(filename, len(words))

        # Tally every distinct word; a dict keeps first-occurrence order, so
        # terms are inserted in the same order as before.
        frequencies = {}
        for word in words:
            frequencies[word] = frequencies.get(word, 0) + 1

        for word, word_frequency in frequencies.items():
            if self.term_freqs.contains(word):
                # Index [1] selects the inner per-file table; presumably item
                # access on this HashTable yields a (key, value) pair --
                # confirm against the HashTable implementation.
                self.term_freqs[word][1].put(filename, word_frequency)
            else:
                # First time this term is seen: create its per-file table.
                freq_hashtable = HashTable()
                freq_hashtable.put(filename, word_frequency)
                self.term_freqs.put(word, freq_hashtable)
 def count_words(self, filename, words):
     """Update the inverted index and the document length for one file.

     ``self.term_freqs`` maps each word to an inner hash table that maps
     file names to the number of times the word occurs in that file.
     ``self.doc_length`` maps each file name to its running word count.
     Both are incremented once per element of ``words``. Words should not
     contain stopwords.

     Args:
         filename (str) : the file name
         words (list) : a list of words
     """
     for word in words:
         if not self.term_freqs.contains(word):
             # Unseen term: start a fresh per-file table with a count of 1.
             per_file = HashTableLinear()
             per_file.put(filename, 1)
             self.term_freqs.put(word, per_file)
         elif self.term_freqs[word].contains(filename):
             self.term_freqs[word][filename] += 1
         else:
             self.term_freqs[word].put(filename, 1)

         # The document length grows by one for every word processed.
         if not self.doc_length.contains(filename):
             self.doc_length.put(filename, 1)
         else:
             self.doc_length[filename] += 1
Exemple #4
0
    def search(self, query):
        """ Search for the query terms in files

        The query is parsed with parse_words, duplicate terms are dropped,
        the remaining terms are scored with get_scores and ordered with rank.
        The first element of every ranked entry (the file name) is printed.

        Args:
            query (str): query input, "user input goes here"
        Returns:
            list: the ranked entries produced by rank(), in descending order
            of relevancy. Previously nothing was returned and the results
            were only printed.
        """
        # parse words
        filtered_query = self.parse_words([query])

        # remove duplicate words using a hash table
        word_table = HashTable()
        for word in filtered_query:
            word_table.put(word, word)

        # Index [0] of a looked-up item is used as the term; presumably item
        # access on this HashTable yields a (key, value) pair -- confirm
        # against the HashTable implementation.
        parsed_query_terms = [word_table[key][0] for key in word_table.keys()]

        # score the terms, then rank the resulting tuples
        results = self.rank(self.get_scores(parsed_query_terms))

        # rank's result is displayed in descending order on screen
        for a_tuple in results:
            print(a_tuple[0])

        return results
 def test_linear2(self):
     """Insert 22 single-character keys and check size, load factor and membership."""
     table = HashTableLinear()
     for code in range(22):
         table.put(chr(code), code)
     self.assertEqual(table.size(), 22)
     self.assertTrue(table.load_factor() <= 0.75)
     # Spot-check a few inserted keys, then one that was never inserted.
     for code in (0, 1, 19):
         self.assertTrue(table.contains(chr(code)))
     self.assertFalse(table.contains(chr(22)))
 def test_linear1(self):
     """Insert 11 numeric-string keys and check size, load factor and membership."""
     table = HashTableLinear()
     for number in range(11):
         table.put(str(number), number)
     self.assertEqual(table.size(), 11)
     self.assertTrue(table.load_factor() <= 0.75)
     # Spot-check a few inserted keys, then one that was never inserted.
     for key in ('0', '1', '10'):
         self.assertTrue(table.contains(key))
     self.assertFalse(table.contains('11'))
Exemple #7
0
    def get_scores(self, terms):
        """ Creates list of scores for each file in corpus.

        The score = (weighted frequency / total word count in file)
        Compute the score for each term in a query and sum all the scores.
        Query terms that are not present in term_freqs are skipped instead of
        being dereferenced, so an unknown word no longer aborts the query.

        Args:
            terms (list): a list of strings, the query terms
        Returns:
            list: one entry per file with a non-zero score, each being what
            score_table[filename] returns after the normalised score is stored
            (presumably a (filename, score) tuple -- confirm against HashTable)
        """
        score_table = HashTable()  # accumulates weighted frequency per file

        for query_term in terms:

            # a term that was never indexed cannot contribute to any score
            if not self.term_freqs.contains(query_term):
                continue

            # fetch the per-file table of this term from self.term_freqs
            query_term_table = self.term_freqs[query_term][1]

            # for each file in the table, add weighted frequency to its score
            for key in query_term_table.keys():  # key is a file name
                weighted_frequency = self.get_wf(query_term_table[key][1])
                if weighted_frequency == 0:
                    continue

                if score_table.contains(key):
                    # file already scored by an earlier term: accumulate
                    old_freq = score_table[key][1]
                    score_table.put(key, weighted_frequency + old_freq)
                else:
                    # first term that matches this file
                    score_table.put(key, weighted_frequency)

        # normalise every accumulated score by the document length
        score_list = []
        for key in score_table.keys():  # key is a filename
            normalized_score = score_table[key][1] / self.doc_length[key][1]
            score_table[key] = normalized_score
            score_list.append(score_table[key])

        return score_list
    def test_whole_functionality(self):
        """ Exercises HashTableLinear end to end: stop-word import, equality,
        repr, item access, membership, removal, size and collision count.
        """

        filename = 'stop_words.txt'
        hash_table = HashTableLinear()

        # Fill the table from the stop-word file; import_stopwords returns the table.
        hash_table = import_stopwords(filename, hash_table)

        # A missing key raises KeyError; a stop word is found via `in`.
        self.assertRaises(KeyError, hash_table.get, 'BubbleGum')
        self.assertTrue('to' in hash_table)

        # Two tables with the same single entry compare equal; a table never
        # equals a different table or a non-table value.
        second_hash = HashTableLinear()
        second_hash.put('three', 'three')
        third_hash = HashTableLinear()
        third_hash.put('three', 'three')
        self.assertEqual(second_hash, third_hash)
        self.assertNotEqual(hash_table, second_hash)
        self.assertNotEqual(hash_table, 5)
        # Expected repr: 11 slots, with 'three' stored in slot 4.
        expected = "Hash_val = 0: None\n" \
            "Hash_val = 1: None\n" \
            "Hash_val = 2: None\n" \
            "Hash_val = 3: None\n" \
            "Hash_val = 4: ('three', 'three')\n" \
            "Hash_val = 5: None\n" \
            "Hash_val = 6: None\n" \
            "Hash_val = 7: None\n" \
            "Hash_val = 8: None\n" \
            "Hash_val = 9: None\n" \
            "Hash_val = 10: None\n"

        self.assertEqual(expected, repr(second_hash))

        # Item assignment and item lookup.
        second_hash['four'] = 'four'
        self.assertEqual(second_hash['four'], 'four')
        second_hash['five'] = 'five'
        # The stop-word table is expected to hold 0 for 'from'
        # (presumably the value import_stopwords stores -- confirm there).
        self.assertEqual(0, hash_table.get('from'))

        # Membership before and after removal; removing a missing key raises.
        self.assertFalse(second_hash.contains('p'))
        self.assertTrue(second_hash.contains('five'))
        second_hash.remove('five')
        self.assertFalse(second_hash.contains('five'))
        self.assertRaises(KeyError, second_hash.remove, 'p')

        # third_hash was never touched after its single put.
        self.assertEqual(1, third_hash.size())

        self.assertEqual(0, third_hash.collisions())
Exemple #9
0
    def test_HashTableLinear(self):
        """Walk a HashTableLinear through put/get/contains/remove and check
        size and collision counts after each step.
        """
        t = HashTableLinear()

        # A fresh table is empty and raises KeyError for any lookup.
        self.assertEqual(t.size(), 0)
        self.assertFalse(t.contains('us'))
        self.assertRaises(KeyError, t.get, 'us')

        # First insert: retrievable through get() and item access, no collision.
        t.put('us', 'us')
        self.assertEqual(t.get('us'), 'us')
        self.assertEqual(t['us'], 'us')
        self.assertTrue(t.contains('us'))
        self.assertFalse(t.contains('say'))
        self.assertEqual(t.size(), 1)
        self.assertEqual(t.collisions(), 0)

        # Second insert is expected to register exactly one collision.
        t.put('say', 'say')
        self.assertEqual(t.get('say'), 'say')
        self.assertTrue(t.contains('say'))
        self.assertEqual(t.size(), 2)
        self.assertEqual(t.collisions(), 1)

        # Removing one key must not disturb the other; removing both empties the table.
        t.remove('say')
        self.assertFalse(t.contains('say'))
        self.assertTrue(t.contains('us'))
        t.remove('us')
        self.assertEqual(t.size(), 0)

        # Re-insert after removal, then add a third key.
        # NOTE(review): an earlier debug note claimed hash_string('the', 11) == 5
        # and a load factor of 2/11 after the first two puts -- neither is asserted here.
        t.put('us', 'us')
        t.put('say', 'say')
        t.put('the', 'the')

        self.assertTrue(t.contains('us'))
        self.assertTrue(t.contains('the'))
Exemple #10
0
    def test_linear4(self):
        """Fill the table with 22 keys, read some back, then remove them all."""
        table = HashTableLinear()
        for code in range(22):
            table.put(chr(code), code)
        self.assertEqual(table.size(), 22)
        self.assertTrue(table.load_factor() <= 0.75)
        # Item access returns the value stored for the key.
        for code in (0, 1, 19):
            self.assertEqual(table[chr(code)], code)

        # 'a' was never inserted.
        self.assertRaises(KeyError, table.get, 'a')

        for code in range(22):
            table.remove(chr(code))
        for code in (0, 1, 19):
            self.assertFalse(table.contains(chr(code)))

        self.assertRaises(KeyError, table.remove, 'a')
Exemple #11
0
    def search(self, query):
        """ Score and rank the indexed files against a free-text query.

            The query is parsed with parse_words, repeated terms are dropped
            (first occurrence wins), and the remaining terms are passed to
            get_scores and then rank.

            Args:
                query (str): query input: e.g. "computer science"
            Returns:
                list: whatever rank() returns for the scores -- documented as
                (files_path_name, score) tuples in descending order of
                relevancy, excluding files whose relevancy score is 0.
        """
        words = self.parse_words([query])
        seen = HashTableLinear()
        unique_terms = []
        for word in words:
            # Record membership before the put so only first sightings are kept.
            first_time = not seen.contains(word)
            seen.put(word, word)
            if first_time:
                unique_terms.append(word)

        return self.rank(self.get_scores(unique_terms))
Exemple #12
0
 def get_scores(self, terms):
     """Accumulate a length-normalised relevancy score for every matching file.

     The score = weighted frequency / the total word count in the file.
     The weighted frequencies of all query terms are summed per file and
     each sum is then divided by the file's entry in ``self.doc_length``.

     The normalisation pass works on a snapshot of the occupied slots:
     entries are updated by remove-and-put, and doing that while iterating
     the live ``slots`` array could visit a relocated entry a second time.

     Args:
         terms (list) : a list of str
     Returns:
         HashTableLinear : the score table keyed by filename (the table
         itself is returned, not a list of tuples)
     """
     scores = HashTableLinear()
     for term in terms:
         if not self.term_freqs.contains(term):
             continue
         hashtable = self.term_freqs[term]
         for entry in hashtable.slots:
             # Skip empty slots and anything the table no longer reports as a key.
             if entry is None or not hashtable.contains(entry[0]):
                 continue
             if scores.contains(entry[0]):
                 # remove() hands back the stored (key, value) pair.
                 key, val = scores.remove(entry[0])
                 scores.put(entry[0], val + self.get_wf(entry[1]))
             else:
                 scores.put(entry[0], self.get_wf(entry[1]))

     # Freeze the set of entries before mutating the table.
     occupied = [entry for entry in scores.slots if entry is not None]
     for entry in occupied:
         key, val = scores.remove(entry[0])
         scores.put(entry[0], val / self.doc_length.get(entry[0]))
     return scores
Exemple #13
0
    def count_words(self, file_path_name, words):
        """ Count words in a file and store the frequency of each word in the
            term_freqs hash table.

            The keys of term_freqs are words; the values are inner hash
            tables that map file names to the frequency of the word in that
            file. The total number of words is stored in doc_length.

            The list passed in is left untouched: frequencies are tallied in
            a single pass rather than by repeatedly counting and removing
            elements from ``words``.

        Args:
            file_path_name (str): the file name
            words (list): a list of words
        """
        self.doc_length.put(file_path_name, len(words))

        # A dict keeps first-occurrence order, so terms reach term_freqs in
        # the same order as with the previous remove-as-you-go loop.
        frequencies = {}
        for word in words:
            frequencies[word] = frequencies.get(word, 0) + 1

        for current_word, word_freq in frequencies.items():
            # If the word is already in term_freqs, retrieve its per-file
            # table; otherwise create a new one.
            if current_word in self.term_freqs:
                freq_hash = self.term_freqs.get(current_word)
            else:
                freq_hash = HashTableLinear()

            freq_hash.put(file_path_name, word_freq)
            self.term_freqs.put(current_word, freq_hash)
class SearchEngine:
    """Inverted-index search engine over the ``.txt`` files of one directory.

    Attributes:
        stopwords (HashMap) : a hash table containing stopwords
        doc_length (HashMap) : a hash table containing the total number of
                               words in each document
        term_freqs (HashMap) : a hash table of hash tables for each term.
                               Each inner table maps document names to the
                               frequency of the term in that document
    """
    def __init__(self, directory, stopwords):
        """Create the empty index tables and index every text file in ``directory``.

        Args:
            directory (str) : a directory name
            stopwords (HashMap) : a hash table containing stopwords
        """
        self.doc_length = HashTableLinear()
        self.term_freqs = HashTableLinear()
        self.stopwords = stopwords
        self.index_files(directory)

    def __repr__(self):
        return "doc_length: %s, term_freqs: %s, stopwords: %s"\
               % (self.doc_length, self.term_freqs, self.stopwords)

    def __eq__(self, other):
        """Two engines are equal when their doc_length and term_freqs tables are equal."""
        return isinstance(other, SearchEngine) and self.doc_length == other.doc_length\
            and self.term_freqs == other.term_freqs

    def read_file(self, infile):
        """A helper function to read a file.

        Args:
            infile (str) : the path to a file
        Returns:
            list : the lines of the file, line endings included
        """
        with open(infile, 'r') as handle:
            return handle.readlines()

    def parse_words(self, lines):
        """Split strings into lower-cased words and drop stopwords.

        Lines are split on single spaces. Tokens equal to a lone newline are
        discarded; the characters ``( ) . ,`` are removed from every other
        token and trailing whitespace is stripped.

        Args:
            lines (list) : a list of strings
        Returns:
            list : a list of words
        """
        cleaned = []
        for line in lines:
            for token in line.split(' '):
                if token == '\n':
                    continue
                token = token.lower()
                for mark in "().,":
                    token = token.replace(mark, '')
                cleaned.append(token.rstrip())
        return self.exclude_stopwords(cleaned)

    def exclude_stopwords(self, terms):
        """Exclude stopwords from the list of terms.

        Args:
            terms (list) : a list of str
        Returns:
            list : a list of str with stopwords removed
        """
        return [term for term in terms if term not in self.stopwords]

    def count_words(self, filename, words):
        """Update the inverted index and the document length for one file.

        ``term_freqs`` maps each word to an inner hash table that maps file
        names to the number of occurrences of the word in that file;
        ``doc_length`` maps each file name to its running word count. Both
        are incremented once per element of ``words``. Words should not
        contain stopwords.

        Args:
            filename (str) : the file name
            words (list) : a list of words
        """
        for word in words:
            if not self.term_freqs.contains(word):
                # Unseen term: start a fresh per-file table with a count of 1.
                per_file = HashTableLinear()
                per_file.put(filename, 1)
                self.term_freqs.put(word, per_file)
            elif self.term_freqs[word].contains(filename):
                self.term_freqs[word][filename] += 1
            else:
                self.term_freqs[word].put(filename, 1)

            if self.doc_length.contains(filename):
                self.doc_length[filename] += 1
            else:
                self.doc_length.put(filename, 1)

    def index_files(self, directory):
        """Index all ``.txt`` files directly inside the given directory.

        Args:
            directory (str) : the path of a directory
        """
        for name in os.listdir(directory):
            path = os.path.join(directory, name)
            if not os.path.isfile(path):
                continue
            if os.path.splitext(name)[1] != '.txt':
                continue
            # Files are keyed in the index by their joined path.
            self.count_words(path, self.parse_words(self.read_file(path)))

    def get_wf(self, tf):
        """Compute the weighted frequency ``1 + ln(tf)``.

        Args:
            tf (float) : term frequency
        Returns:
            float : the weighted frequency, or 0 when ``tf`` is not positive
        """
        return 1 + math.log(tf) if tf > 0 else 0

    def get_scores(self, terms):
        """Accumulate a length-normalised relevancy score for every matching file.

        The score = weighted frequency / the total word count in the file,
        summed over all query terms found in the index.

        Args:
            terms (list) : a list of str
        Returns:
            HashTableLinear : the score table keyed by filename; ``search``
            reads its ``hash_table`` slots as (filename, score) pairs
        """
        totals = HashTableLinear()
        for term in terms:
            if term not in self.term_freqs:
                continue
            postings = self.term_freqs[term]
            for slot in postings.hash_table:
                # Empty slots of the underlying array are stored as None.
                if slot is None:
                    continue
                if slot[0] in totals:
                    totals[slot[0]] += self.get_wf(slot[1])
                else:
                    totals.put(slot[0], self.get_wf(slot[1]))
        for slot in totals.hash_table:
            if slot is not None:
                totals[slot[0]] /= self.doc_length[slot[0]]
        return totals

    def rank(self, scores):
        """Rank files in descending order of relevancy.

        Args:
            scores (list) : (filename, score) tuples; falsy entries such as
                            ``None`` (empty table slots) are dropped
        Returns:
            list : a new list of (filename, score) tuples sorted by score,
            highest first
        """
        occupied = [entry for entry in scores if entry]
        return sorted(occupied, key=lambda entry: entry[1], reverse=True)

    def _query_terms(self, query):
        """Turn a query into the list of terms handed to ``get_scores``.

        A string is parsed like indexed text (lower-cased, punctuation and
        stopwords removed) and repeated terms are dropped, keeping the first
        occurrence. Any other iterable is taken to be a list of terms already
        and is passed through unchanged.
        """
        if not isinstance(query, str):
            return list(query)
        return list(dict.fromkeys(self.parse_words([query])))

    def search(self, query):
        """Search for the query terms in files and print the ranked matches.

        The query string is parsed into terms before scoring; previously the
        raw string was handed to ``get_scores``, which iterated over it one
        character at a time.

        Args:
            query (str) : query input
        Returns:
            list : (filename, score) tuples in descending order of relevancy
        """
        score_table = self.get_scores(self._query_terms(query))
        ranked = self.rank(score_table.hash_table)
        for entry in ranked:
            print(entry)
        return ranked
Exemple #15
0
class SearchEngine:
    """ Search engine class to build an inverted index of documents stored
        in a specified directory and provides a functionality to search
        documents with query terms.
        Attributes:
            stopwords (HashMap): a hash table containing stop words
            doc_length (HashMap): a hash table containing the total number of
                                  words in each document
            term_freqs (HashMap): a hash table of hash tables for each term.
                                  Each hash table contains the frequency of
                                  the term in documents (document names are
                                  the keys and the frequencies the values)
            file_list (list): paths of the indexed .txt files
    """
    def __init__(self, directory, stopwords):
        """ Initialize the data structure by taking a directory name and a
            hash table containing stopwords.
            Args:
                directory (str): a directory name
                stopwords (HashMap): a hash table containing stopwords
        """

        self.doc_length = HashTableLinear()
        self.term_freqs = HashTableLinear()
        self.stopwords = stopwords
        self.file_list = []
        self.index_files(directory)

    def __eq__(self, other):
        """ Compares the data structure to other"""
        return isinstance(other, SearchEngine) and \
            self.doc_length == other.doc_length and \
            self.term_freqs == other.term_freqs and \
            self.stopwords == other.stopwords

    def __repr__(self):
        """ How the data structure is represented"""
        return "SearchEngine Instance:\n" + str(self.term_freqs)

    def read_file(self, infile):
        """ Reads all lines of a file with their trailing newline removed
            Args:
                infile (str): the path to a file
            Returns:
                list: a list of str read from a file
        """
        with open(infile, 'r') as file:
            return [line.rstrip('\n') for line in file]

    def parse_words(self, lines):
        """ Splits strings into words on whitespace, removes every
            non-alphanumeric character and underscore, converts words to
            lower case and drops stopwords. A token made only of punctuation
            becomes an empty string and is kept.
            Args:
                lines (list): a list of strings
            Returns:
                list: a list of words
        """
        words = []
        for each in lines:
            # Raw string: '\W' is a regex class, not a string escape.
            words.extend(
                re.sub(r'[\W_]+', '', word).lower() for word in each.split())

        return self.exclude_stopwords(words)

    def exclude_stopwords(self, terms):
        """ Exclude stopwords from the list of terms
            Args:
                terms (list): list of terms to be cleaned of stop words
            Returns:
                  list: a list of str with stopwords removed
        """
        return [each for each in terms if each not in self.stopwords]

    def count_words(self, file_path_name, words):
        """ Count words in a file and store the frequency of each word in the
            term_freqs hash table. The keys of term_freqs are words; the
            values are inner hash tables mapping file names to the frequency
            of the word in that file. The total word count is stored in
            doc_length.

            The list passed in is left untouched: frequencies are tallied in
            a single pass rather than by repeatedly counting and removing
            elements from ``words``.

        Args:
            file_path_name (str): the file name
            words (list): a list of words
        """
        self.doc_length.put(file_path_name, len(words))

        # A dict keeps first-occurrence order, so terms reach term_freqs in
        # the same order as with the previous remove-as-you-go loop.
        frequencies = {}
        for word in words:
            frequencies[word] = frequencies.get(word, 0) + 1

        for current_word, word_freq in frequencies.items():
            # If the word is already in term_freqs, retrieve its per-file
            # table; otherwise create a new one.
            if current_word in self.term_freqs:
                freq_hash = self.term_freqs.get(current_word)
            else:
                freq_hash = HashTableLinear()

            freq_hash.put(file_path_name, word_freq)
            self.term_freqs.put(current_word, freq_hash)

    def index_files(self, directory):
        """ Processes a directory and makes an index of all the .txt files
            Args:
                directory (str): the directory being indexed
        """
        candidates = [os.path.join(directory, item)
                      for item in os.listdir(directory)]

        # Keep only regular files whose extension is exactly '.txt'.
        self.file_list = [
            path for path in candidates
            if os.path.isfile(path) and os.path.splitext(path)[1] == '.txt'
        ]

        for file in self.file_list:
            words = self.parse_words(self.read_file(file))
            self.count_words(file, words)

    def get_wf(self, tf):
        """ computes the weighted frequency 1 + ln(tf)
            Args:
                tf (float): term frequency
            Returns:
                float: the weighted frequency, or 0 when tf is not positive
        """
        return 1 + math.log(tf) if tf > 0 else 0

    def get_scores(self, terms):
        """ Creates a list of scores for each file in corpus
            The score = weighted frequency / the total word count in file,
            summed over all query terms. Terms that are not in the index are
            skipped; looking them up with get() would raise KeyError.

            Args:
                terms (list): a list of str
            Returns:
                list: a list of tuples, each containing the file_path_name
                      and its relevancy score; files scoring 0 are omitted
        """

        scores = HashTableLinear()
        for term in terms:
            if not self.term_freqs.contains(term):
                continue
            word_hash_table = self.term_freqs.get(term)
            for file in self.file_list:
                if not word_hash_table.contains(file):
                    continue
                weight = self.get_wf(word_hash_table[file])
                if scores.contains(file):
                    scores[file] += weight
                else:
                    scores[file] = weight

        score_list = []
        for file in self.file_list:
            if scores.contains(file) and scores[file] > 0:
                score_list.append((file, scores[file] / self.doc_length[file]))
        return score_list

    def rank(self, scores):
        """ Ranks files in the descending order of relevancy

            The list is sorted in place and the same list object is returned.
            The previous hand-written insertion sort never moved its cursor
            left, so it performed at most one swap per position and could
            return an unsorted list.

            Args:
                scores (list): a list of tuples: (file_path_name, score)
            Returns:
                list: a list of tuples (file_path_name, score) sorted in
                      descending order of relevancy; ties keep their
                      original relative order
        """
        scores.sort(key=lambda pair: pair[1], reverse=True)
        return scores

    def search(self, query):
        """ Search for the query terms in files
            Args:
                query (str): query input: e.g. "computer science"
            Returns:
                list: a list of tuples: (files_path_name, score) sorted in
                descending order or relevancy excluding files whose relevancy
                score is 0.
        """

        terms = self.parse_words([query])
        cleaned_terms = []
        hash_terms = HashTableLinear()
        for term in terms:
            # Keep only the first occurrence of each query term.
            if not hash_terms.contains(term):
                cleaned_terms.append(term)
            hash_terms.put(term, term)

        return self.rank(self.get_scores(cleaned_terms))

    def print_nice_results(self, scores):
        """ Prints each result as the file path immediately followed by its
            score, one result per line.
            Args:
                scores (list): list of (file_path_name, score) tuples
        """
        for each in scores:
            print(each[0] + str(each[1]))
Exemple #16
0
class SearchEngine:
    """ Builds and maintains an inverted index of documents stored in a specified directory and
    provides a functionality to search documents with query terms
    Attributes:
        directory (str): a directory name
        stopwords (HashTable): contains stopwords
        doc_length (HashTable): contains number of words in each document
        doc_freqs (HashTable): contains number of documents containing the term for each term
        term_freqs (HashTable): hash table of hash tables for each term, each hash table contains
                                the frequency of the term in documents (document names are the keys
                                and the frequencies are the values)
    """
    def __init__(self, directory, stopwords):
        """ Create the empty index tables and index `directory` right away
        Args:
            directory (str): a directory name, passed on to index_files()
            stopwords (HashTable): contains stopwords
        """
        # doc_freqs is created but not used in this assignment
        self.doc_length, self.doc_freqs, self.term_freqs = (
            HashTable(), HashTable(), HashTable())
        self.stopwords = stopwords
        # indexing comes last: it fills the tables created above
        self.index_files(directory)

    # PREPROCESSING ================================================================================

    def read_file(self, infile):
        """ Read a text file into a list of its lines
        Args:
            infile (str): the path to a file
        Returns:
            list: the lines of the file in order; line endings are kept
        """
        with open(infile, "r") as handle:
            # iterating a text file yields the same lines readlines() returns
            return list(handle)

    def parse_words(self, lines):
        """ Split strings into words, convert words to lower cases and remove newline characters,
        exclude stopwords
        Args:
            lines (list): a list of lists of strings
        Returns:
            list: a list of words
        """
        raw_words = []
        for line in lines:
            split_line = line.split(
                " ")  # split line looks like ["line", "1", "here"]
            raw_words.extend(split_line)

        # create new list with all words that aren't stop words
        filtered_words = [
            word.rstrip().lower() for word in raw_words
            if word not in self.stopwords
        ]

        return filtered_words

    def count_words(self, filename, words):
        """ Count words in a file and store the frequency of each word in the term_freqs hash table.
        Words should not contain stopwords. Also store the total count of words contained in the
        file in the doc_length hash table.
        Args:
            filename (str): the file name
            words (list): a list of words
        """
        # store total count of words in the doc_length hash table
        self.doc_length.put(filename, len(words))

        # iterate through each word
        for word in words:

            # calculate frequency of this word in this document
            word_frequency = words.count(
                word)  # returns number of occurences of this word in words

            # if word is already in term_freqs
            if self.term_freqs.contains(word):
                # add new ("doc1", freq) pair to term_freqs[word] (which is the lower hashtable)
                self.term_freqs[word][1].put(filename, word_frequency)

            # if word is not already in term_freqs
            else:
                # create new frequency hashtable for each term ("doc1", frequency)
                freq_hashtable = HashTable()
                freq_hashtable.put(filename, word_frequency)
                # put this newly created hash table into term_freqs hash table
                self.term_freqs.put(word, freq_hashtable)

    def index_files(self, directory):
        """ Index all text files in a given directory
        Args:
            directory (str) : the path of a directory
        """
        # get a list of files in the directory
        file_list = os.listdir(directory)

        # for each item in file_list, item is a filename
        for item in file_list:

            # construct full path of each file
            path = os.path.join(directory, item)

            # if item is not a file, skip it
            if not os.path.isfile(path) or item == "stop_words.txt":
                continue

            # split path into file extension and the rest
            parts = os.path.splitext(
                item)  # maybe change item stuff here to path

            # only process text files
            if parts[1] == ".txt":

                # process it
                item_lines = self.read_file(path)
                item_words = self.parse_words(item_lines)
                self.count_words(path, item_words)

    # SEARCHING ====================================================================================

    def get_wf(self, term_frequency):
        """ Computes the weighted frequency: 1 + ln(tf) for a positive term
        frequency, 0 otherwise
        Args:
            term_frequency (float): term frequency
        Returns:
            float: the weighted frequency
        """
        return 1 + math.log(term_frequency) if term_frequency > 0 else 0

    def get_scores(self, terms):
        """ Creates list of scores for each file in corpus.
        The score = (weighted frequency / total word count in file)
        Compute the score for each term in a query and sum all the scores.
        Query terms that do not occur in the index are ignored.
        Args:
            terms (list): a list of strings, the parsed words of a user query
        Returns:
            list: a list of tuples, each containing the filename and its relevancy score;
                  files with no matching term are not listed
        """
        # filename -> sum of weighted frequencies over all query terms.
        # NOTE: lookups on these hash tables yield (key, value) pairs, which
        # is why every read below takes element [1].
        score_table = HashTable()

        for query_term in terms:

            # a word that was never indexed cannot contribute to any score,
            # and looking it up would not give a frequency table
            if not self.term_freqs.contains(query_term):
                continue

            # per-document frequency table of this term
            query_term_table = self.term_freqs[query_term][1]

            for key in query_term_table.keys():  # key is a file name
                weighted_frequency = self.get_wf(query_term_table[key][1])
                if weighted_frequency == 0:
                    continue

                # a file already scored by an earlier term accumulates
                if score_table.contains(key):
                    weighted_frequency += score_table[key][1]
                score_table.put(key, weighted_frequency)

        # Normalise by document length. The result is built directly instead
        # of being written back into score_table, so the table is not
        # modified while its keys are being walked.
        score_list = []
        for key in score_table.keys():  # key is a filename
            normalized_score = score_table[key][1] / self.doc_length[key][1]
            score_list.append((key, normalized_score))

        return score_list

    def rank(self, scores):
        """ Orders scored files from most to least relevant
        Args:
            scores (list): list of tuples of (filename, score)
        Returns:
            list: a new list holding the same (filename, score) tuples sorted
                  by score in descending order; the input is left untouched
        """
        ordered = list(scores)
        ordered.sort(key=lambda pair: pair[1], reverse=True)
        return ordered

    def search(self, query):
        """ Search for the query terms in files and print the matching file
        names, most relevant first, one per line
        Args:
            query (str): query input, "user input goes here"
        Returns:
            list: the ranked result of rank(), i.e. the scored files in
                  descending order of relevancy
        """
        # parse words
        filtered_query = self.parse_words([query])

        # remove duplicate words using a hash table
        word_table = HashTable()
        for word in filtered_query:
            word_table.put(word, word)

        # lookups yield (key, value) pairs; element [0] is the word itself
        parsed_query_terms = [
            word_table[key][0] for key in word_table.keys()
        ]

        # score the files, then order them by relevancy
        results = self.rank(self.get_scores(parsed_query_terms))

        # rank's result is displayed in descending order on screen
        for a_tuple in results:
            print(a_tuple[0])

        # hand the ranking back as well, so callers can use it
        return results
    def test_basic(self):
        """ Tests basic functionality"""

        table = HashTableLinear()

        # the first word is stored and immediately checked for membership
        table.put('unless', 'unless')
        self.assertTrue(table.contains('unless'))

        # every remaining word is stored under itself, then read back in the
        # same order (the reads only have to succeed, nothing is asserted)
        remaining_words = (
            'every', 'being', 'elsewhere', 'nothing', 'hereby', 'latter',
            'and', 'afterwards', 'say', 'very', 'few', 'well', 'various',
            'make', 'regarding', 'take', 'give', 'whole', 'i', 'against',
            'can',
        )
        for word in remaining_words:
            table.put(word, word)

        for word in remaining_words:
            table.get(word)
Exemple #18
0
class SearchEngine:
    """
    doc_length: the number of words contained in document contained in a hashtable
    term_freqs: a hashtable containing each word in text file and an inner hashtable containing the frequency count
    stopwords: a hashtable containing stop words
    """
    def __init__(self, directory, stopwords):
        """Create the empty index tables and index `directory` right away
            Args:
                directory (str) : the path of a directory, passed to index_files
                stopwords : a hashtable containing stop words
        """
        self.doc_length, self.term_freqs = HashTableLinear(), HashTableLinear()
        self.stopwords = stopwords
        # must run last: indexing fills the two tables created above
        self.index_files(directory)

    def read_file(self, infile):
        """Read a file and break its whole content into
        whitespace-separated strings
            Args:
                infile (str) : the path to a file
            Returns:
                list : a list of str read from a file (no whitespace left in
                       any of them)
        """
        with open(infile, 'r') as source:
            return source.read().split()

    def parse_words(self, lines):
        """split strings into words
        Convert words to lower cases and remove new line chars.
        Exclude stopwords.
        Args:
            lines (list) : a list of strings
        Returns:
            list : a list of words
        """
        list1 = []
        for line in lines:
            words = line.lower()
            words = words.split()
            for word in words:
                list1.append(word)
        list1 = self.exclude_stopwords(list1)
        return list1

    def exclude_stopwords(self, terms):
        """exclude stopwords from the list of terms
        Args:
            terms (list) :
        Returns:
            list : a list of str with stopwords removed
        """
        list1 = []
        for i in terms:
            if not self.stopwords.contains(i):
                list1.append(i)
        return list1

    def count_words(self, filename, words):
        """
        Args:
            filename (str) : the file name
            words (list) : a list of words
        """
        for word in words:
            if word not in self.term_freqs:
                self.term_freqs[word] = HashTableLinear()
                self.term_freqs[word][filename] = 1
            else:
                if filename not in self.term_freqs[word]:
                    self.term_freqs[word][filename] = 1
                else:
                    self.term_freqs[word][filename] += 1
        self.doc_length.put(filename, len(words))

    def index_files(self, directory):
        """index all text files in a given directory
        Args:
            directory (str) : the path of a directory
        """
        file_list = os.listdir(directory)
        for item in file_list:
            val = os.path.join(directory, item)
            if os.path.isfile(val):
                parts = os.path.splitext(val)
                if parts[1] == '.txt':
                    words1 = self.read_file(val)
                    words2 = self.parse_words(words1)
                    self.count_words(val, words2)

    def get_wf(self, tf):
        """computes the weighted frequency: 1 + ln(tf) when tf is positive,
        otherwise 0
        Args:
            tf (float) : term frequency
        Returns:
            float : the weighted frequency
        """
        if not tf > 0:
            return 0
        return 1 + math.log(tf)

    def get_scores(self, terms):
        """creates a table of scores for each file that contains a query term
        The score = weighted frequency / the total word count in the file.
        Compute this score for each term in a query and sum all the scores.
        Terms that are not in the index are skipped.
        Args:
            terms (list) : a list of str
        Returns:
            HashTableLinear : maps each matching filename to its relevancy
                score; rank() reads its occupied slots as (filename, score)
        """
        scores = HashTableLinear()
        for term in terms:
            if not self.term_freqs.contains(term):
                continue
            hashtable = self.term_freqs[term]
            # occupied slots hold (filename, frequency) pairs
            for entry in hashtable.slots:
                if entry is None or not hashtable.contains(entry[0]):
                    continue
                filename = entry[0]
                weight = self.get_wf(entry[1])
                if scores.contains(filename):
                    # take the running total out and store the new sum
                    _, running_total = scores.remove(filename)
                    weight += running_total
                scores.put(filename, weight)

        # Work on a snapshot of the occupied slots: remove()/put() below may
        # move entries between slots, and walking the live slot list while
        # that happens could skip a file or divide its score twice.
        occupied = [entry for entry in scores.slots if entry is not None]
        for entry in occupied:
            filename = entry[0]
            _, total = scores.remove(filename)
            scores.put(filename, total / self.doc_length.get(filename))
        return scores

    def rank(self, scores):
        """lists the scored files in descending order of relevancy
        Args:
            scores : an object whose `slots` attribute is a sequence holding
                (filename, score) pairs or None for unused slots
        Returns:
            str : the filenames from most to least relevant, each one
                followed by a newline character
        """
        occupied = [slot for slot in scores.slots if slot is not None]
        occupied.sort(key=lambda slot: slot[1], reverse=True)
        return ''.join(slot[0] + '\n' for slot in occupied)

    def search(self, query):
        """ look the words of a query up in the indexed files
        The score table is printed before the ranking is returned.
        Args:
            query (str) : query input
        Returns:
            the result of rank() for the scores of the query words
        """
        query_terms = self.parse_words([query])
        score_table = self.get_scores(query_terms)
        print(score_table)
        ranking = self.rank(score_table)
        return ranking
Exemple #19
0
class SearchEngine:
    """
    Builds and maintains an inverted index of documents stored in a specified directory and
        provides a functionality to search documents with query terms.
    Attributes:
        directory (str) : a directory name
        stopwords (HashTableLinear) : a hash table containing stopwords
        doc_length (HashTableLinear) : a hash table containing the total number of words in each
            document
        doc_freqs (HashTableLinear) : a hash table containing the number of documents containing the
            term for each term
        term_freqs (HashTableLinear) : a hash table of hash tables for each term. Each hash table
            contains the frequency of the term in documents (document names are the keys and the
            frequencies are the values)
    """
    def __init__(self, directory, stopwords=None):
        """Create the empty index tables and index `directory` right away
        Args:
        directory (str) : a directory name
        stopwords : a container of stopwords (anything supporting `in`);
            when omitted, no word is treated as a stopword
        """
        # Known before indexing starts, so every method can rely on it.
        self.directory = directory
        self.doc_length = HashTableLinear(
        )  # Replace HashTableLinear() with your hash table.
        self.doc_freqs = HashTableLinear(
        )  # count_words() records each file's word count here
        self.term_freqs = HashTableLinear()
        # A fresh list per instance: a literal [] default in the signature
        # would be one object shared by every engine built without stopwords.
        self.stopwords = [] if stopwords is None else stopwords
        self.index_files(directory)

    def read_file(self, infile):
        """Read a text file line by line
        Args:
        infile (str) : the path to a file
        Returns:
        list : the lines of the file in order, line endings included
        """
        collected = []
        with open(infile, 'r') as reader:
            collected.extend(reader)
        return collected

    def parse_words(self, lines):
        """split strings into words
        Convert words to lower cases and remove new line chars.
        Exclude stopwords.
        Args:
        lines (list) : a list of strings
        Returns:
        list : a list of words
        """
        return_list = []
        for i in lines:
            i = i.lower()
            i = i.split()
            for j in i:
                j = j.replace('\n', '')
                if j not in self.stopwords:
                    return_list.append(j)
        return return_list

    def count_words(self, filename, words):
        """count words in a file and store the frequency of each
        word in the term_freqs hash table. Words should not contain stopwords.
        Also store the total count of words contained in the file
        in the doc_length hash table.
        Args:
        filename (str) : the file name
        words (list) : a list of words
        """
        #file_lines = self.read_file(filename)
        #str_list = self.parse_words(file_lines)
        for i in words:
            self.term_freqs.put(i, HashTableLinear())
        self.doc_freqs.put(filename, 0)
        for i in words:
            if i in self.term_freqs and filename in self.term_freqs[i]:
                self.term_freqs[i][filename] = self.term_freqs[i][filename] + 1
            elif i and i in self.term_freqs:
                self.term_freqs[i].put(filename, 1)
            elif i:
                self.term_freqs.put(i, HashTableLinear())
                self.term_freqs[i].put(filename, 1)
        self.doc_freqs[filename] = len(words)

    def index_files(self, directory):
        """index all text files in a given directory
        Args:
        directory (str) : the path of a directory
        """
        txt_list = os.listdir(directory)
        for i in txt_list:
            if ".txt" in i:
                j = self.parse_words(self.read_file(os.path.join(directory,
                                                                 i)))
                self.count_words(i, j)

    def get_weighted_freq(self, total_freq):
        """computes the weighted frequency: 1 + ln(total_freq) for a
        positive frequency, 0 otherwise
        Args:
        total_freq (float) : term frequency
        Returns:
        float : the weighted frequency
        """
        result = 0
        if total_freq > 0:
            result = 1 + math.log(total_freq)
        return result

    def get_scores(self, terms):
        """creates a list of scores for each file in corpus
        The score = weighted frequency / the total word count in the file.
        Compute this score for each term in a query and sum all the scores.
        Args:
        terms (list) : a list of str
        Returns:
        list : a list of tuples, each containing the filename and its relevancy score
        """
        scores = ([], [])
        for term in terms:
            if term not in self.term_freqs:
                raise ValueError
            for i in self.term_freqs[term].hash_table:
                if i and i.key not in scores[0]:
                    file = i.key
                    freq = i.data
                    scores[0].append(file)
                    freq = self.get_weighted_freq(freq) / self.doc_freqs[file]
                    scores[1].append(freq)
                elif i and i.key in scores[0]:
                    file = i.key
                    freq = i.data
                    freq = self.get_weighted_freq(freq) / self.doc_freqs[file]
                    ind = scores[0].index(i.key)
                    scores[1][ind] += freq
        return scores

    def rank(self, scores):
        """orders files by descending relevancy with a bubble sort
        Both lists are reordered in place and kept aligned.
        Args:
        scores (tuple) : two parallel lists (filenames, scores)
        Returns:
        tuple : the same object, with both lists sorted so that the scores
            are in descending order
        """
        names, values = scores[0], scores[1]
        last = len(names) - 1
        # extra credit implementation of sorting algorithm: `last` full
        # passes, each one swapping neighbours that are in the wrong order
        for _ in range(last):
            for left in range(last):
                right = left + 1
                if values[left] < values[right]:
                    names[left], names[right] = names[right], names[left]
                    values[left], values[right] = values[right], values[left]
        return scores

    def search(self, query):
        """
        Searches query and returns stuff.
        :param query:
        :return:
        """
        temp_list = query.lower()
        temp_list = temp_list.split()
        query_list = self.parse_words(temp_list)
        scores_tuple = self.get_scores(query_list)
        ranked_scores = self.rank(scores_tuple)
        str = ""
        for i in ranked_scores[0]:
            if i:
                str += "%s/%s \n" % (self.directory, i)
        return str