Example #1
    def vectorQuery(self, k):
        """ vector query processing, using the cosine similarity. """
        # ToDo: return top k pairs of (docID, similarity), ranked by their cosine similarity with the query in the descending order
        # You can use term frequency or TFIDF to construct the vectors
        result = {}
        ivObj = InvertedIndex()
        ivObj.load(self.filename)  # loading the InvertedIndex
        doc_set = set()
        term_idf_list = []
        for term in self.tokens:  # for every term in the query finding the document IDs where the term is present
            if term in self.index:
                doc_set = doc_set.union(set(self.index[term].posting.keys()))
            term_idf_list.append(
                ivObj.idf(term) * 1.0 /
                len(self.tokens))  # query weight: idf normalized by query length
        doc_list = list(doc_set)
        for docID in doc_list:  # Calculating tf-idf weights for each candidate document
            for term in self.tokens:
                # terms missing from the index contribute a weight of 0.0
                weight = ivObj.tfidf(term, docID) if term in self.index else 0.0
                result.setdefault(docID, []).append(weight)

        score_dict = {}
        term_idf_list_np = np.array(self.unitVector(
            term_idf_list))  # unit vector for the query
        for docID in doc_list:
            unit_result = self.unitVector(result[docID])
            unit_np = np.array(unit_result)
            score_dict[docID] = np.dot(
                term_idf_list_np,
                unit_np)  # dot product for query and each document
        final = sorted(score_dict.items(), key=itemgetter(1), reverse=True)
        similarity = final[:k]  # slicing also guards against k exceeding the number of matches
        return similarity  # list of (docID, cosine similarity) pairs in descending rank order
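The method above ranks documents by the dot product of unit vectors, which equals the cosine similarity of the underlying tf-idf vectors. A minimal standalone sketch of that computation, independent of the InvertedIndex class; the weights below are purely illustrative:

import numpy as np

def unit_vector(v):
    """Scale a vector to unit length; zero vectors are returned unchanged."""
    arr = np.array(v, dtype=float)
    norm = np.linalg.norm(arr)
    return arr if norm == 0 else arr / norm

# dot product of unit vectors == cosine similarity of the original vectors
query_weights = [0.30, 0.12, 0.05]   # illustrative query tf-idf weights
doc_weights = [0.25, 0.00, 0.10]     # illustrative document tf-idf weights
cosine = float(np.dot(unit_vector(query_weights), unit_vector(doc_weights)))
print(round(cosine, 4))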
Example #2
def main():

    #########
    # SETUP #
    #########

    # Get input args
    newsgroups_root_dir = argv[1]
    feat_def_path = argv[2]
    class_def_path = argv[3]
    training_data_path = argv[4]

    # Generate index
    #index_newsgroups(newsgroups_root_dir, "idx_save.pkl")
    ii = InvertedIndex()
    ii.load("idx_save.pkl")

    # Write out feature/term pairs to feat_def_path
    feature_id = 0
    with open(feat_def_path, 'w') as outf:
        for item in ii.items:
            outf.write(str(feature_id) + " " + str(item) + "\n")
            feature_id += 1

    # Read back in the feature/term pairs for later
    with open(feat_def_path, 'r') as inf:
        ft_pairs = inf.readlines()

    # Put the ft_pairs into a dictionary for quick lookup
    ft_dict = {}
    for pair in ft_pairs:
        ft_dict[pair.split()[1].strip()] = pair.split()[0]

    # Map the different newsgroups to a given class
    # This is fairly manual...
    with open(class_def_path, 'w') as outf:
        for group_dir in listdir(newsgroups_root_dir):
            outf.write(class_def_helper(group_dir) + " " + group_dir + "\n")

    ############################
    # TRAINING DATA GENERATION #
    ############################

    # Create the training data
    # For each document:
    # Find its containing folder, and extract class from class def
    # For each term in document
    # Compute tfidf, tf or idf
    current_file_id = 1
    with open(training_data_path + ".TFIDF", 'w') as outf:
        # Compute tf-idf
        # Go through each document in newsgroups dir
        for root, _, files in walk(newsgroups_root_dir):
            # Find and write out the class label
            local_dir = root.split(sep)[-1]

            # For each file...
            for file in files:
                outf.write(class_def_helper(local_dir) + " ")
                print(root, file)

                # Get the words from the doc
                stemmed_token_list = preprocess_doc(root + sep + file)

                # Put all the info into a set (for uniqueness)
                data_set = set()

                # Now that we've re-done the preprocessing, compute tf-idf for each term
                for word in stemmed_token_list:
                    # Skip blank stopwords
                    if word == "": continue

                    # Get the term ID
                    #outf.write(ft_dict[word] + ":")

                    # Calculate and write out TF-IDF
                    # Note current_file_id is our doc_id
                    tf = ii.find(word).posting[current_file_id].term_freq()
                    idf = ii.idf(word)
                    #outf.write(str(log10(1 + tf) * idf) + " ")
                    data_set.add(ft_dict[word] + ":" +
                                 str(log10(1 + tf) * idf))

                # Write newline to signify end of file
                #outf.write("\n")
                outf.write(" ".join(
                    sorted(data_set, key=lambda x: int(x.split(':')[0]))) +
                           "\n")
                outf.flush()

                # Increment our current doc
                current_file_id += 1

    current_file_id = 1
    with open(training_data_path + ".TF", 'w') as outf:
        # Compute tf
        # Go through each document in newsgroups dir
        for root, _, files in walk(newsgroups_root_dir):
            # Find and write out the class label
            local_dir = root.split(sep)[-1]

            # For each file...
            for file in files:
                outf.write(class_def_helper(local_dir) + " ")
                print(root, file)

                # Get the words from the doc
                stemmed_token_list = preprocess_doc(root + sep + file)

                # Put all the info into a set (for uniqueness)
                data_set = set()

                # Now that we've re-done the preprocessing, compute the term frequency for each term
                for word in stemmed_token_list:
                    # Skip blank stopwords
                    if word == "": continue

                    # Get the term ID
                    #outf.write(ft_dict[word] + ":")

                    # Write the TF
                    # Note current_file_id is our doc_id
                    # outf.write(str(ii.find(word).posting[
                    # current_file_id].term_freq()) + " ")
                    data_set.add(ft_dict[word] + ":" + str(
                        ii.find(word).posting[current_file_id].term_freq()))

                # Write newline to signify end of file
                # outf.write("\n")
                outf.write(" ".join(
                    sorted(data_set, key=lambda x: int(x.split(':')[0]))) +
                           "\n")
                # outf.flush()

                # Increment our current doc
                current_file_id += 1

    current_file_id = 1
    with open(training_data_path + ".IDF", 'w') as outf:
        # Compute idf
        # Go through each document in newsgroups dir
        for root, _, files in walk(newsgroups_root_dir):
            # Find and write out the class label
            local_dir = root.split(sep)[-1]

            # For each file...
            for file in files:
                outf.write(class_def_helper(local_dir) + " ")
                print(root, file)

                # Get the words from the doc
                stemmed_token_list = preprocess_doc(root + sep + file)

                # Put all the info into a set (for uniqueness)
                data_set = set()

                # Now that we've re-done all that, find idfs
                for word in stemmed_token_list:
                    # Skip blank stopwords
                    if word == "": continue

                    # Get the term ID
                    #outf.write(ft_dict[word] + ":" + str(ii.idf(word))
                    #    + " ")
                    data_set.add(ft_dict[word] + ":" + str(ii.idf(word)))

                # Write newline to signify end of file
                outf.write(" ".join(
                    sorted(data_set, key=lambda x: int(x.split(':')[0]))) +
                           "\n")
Example #3
class QueryProcessor:
    ##
    #
    #    @param         self
    #    @param         query
    #    @param         index
    #    @param         collection
    #    @return        None
    #    @brief         The constructor.
    #                   This process is expensive because it loads the entire pickled index into memory.
    #                   That is fine when executing a single query, but when running the full
    #                   evaluation use loadQuery instead.
    #    @exception     None documented yet
    ##
    def __init__(self, query, index_file, collection):
        ''' index is the inverted index; collection is the document collection'''
        self.raw_query = query
        self.index = InvertedIndex()
        self.index = self.index.loadData(index_file)
        self.docs = collection
        self.tokenizer = Tokenizer(
            known_words=set(self.index.get_items_inverted().keys()))
        if self.raw_query:
            self.processed_query = self.preprocessing(self.raw_query)

    ##
    #   @brief         This method is used to load the next query for evaluation
    #   @param         self
    #   @param         query
    #   @return        None
    #   @exception     None
    ##
    def loadQuery(self, query):
        self.raw_query = query
        self.processed_query = self.preprocessing(self.raw_query)

    ##
    #   @brief         This method applies the indexing preprocessing steps (spelling correction,
    #                  stopword removal, stemming) to a raw query
    #   @param         self
    #   @param         raw_query
    #   @return        list of processed query tokens
    #   @exception     None
    ##
    def preprocessing(self, raw_query):
        ''' apply the same preprocessing steps used by indexing,
            also use the provided spelling corrector. Note that
            spelling corrector should be applied before stopword
            removal and stemming (why?)'''
        return self.tokenizer.transpose_document_tokenized_stemmed_spelling(
            raw_query)

    ##
    #   @brief         This method does the boolean query processing
    #   @param         self
    #   @return        results:list[docID]
    #   @bug           Fixed
    #   @exception     None
    ##
    def booleanQuery(self):
        ''' boolean query processing; note that a query like "A B C" is transformed to "A AND B AND C" for retrieving posting lists and merge them'''
        ''' This method would likely be faster due to the use of hashes, but I wanted to do what was shown in the slides
            from functools import reduce
            docs = [set(self.index[w]) for w in self.processed_query]
            docs.sort(key=len) # notice it is still smart to order by size 
            return reduce(set.intersection,docs) 
        '''
        if len(self.processed_query) == 0:
            return []

        ## checks that all of our query words are in the index, if not return [] ##
        for w in self.processed_query:
            if not w in self.index.get_items_inverted():
                return []

        ## checks if we only have 1 term in the query and returns its posting list if we do ##
        if len(self.processed_query) == 1:
            return list(self.index.get_items_inverted()[
                self.processed_query[0]].get_posting_list().keys())

        #### document_ids is a list of lists containing only document ids ####
        document_ids = [
            list(self.index.get_items_inverted()[w].get_posting_list().keys())
            for w in self.processed_query
        ]

        # by sorting so that we start with the shortest list of documents we get a potential speed up
        document_ids.sort(key=len)
        results = document_ids[0]

        ## iterates through each query word and does the intersection of docids from its posting list with all those before it ##
        ## could be done faster if index was implemented as set or some other hash data structure
        for p in document_ids[1:]:
            intermediate = []
            i, j = 0, 0
            while i < len(results) and j < len(p):
                if int(results[i]) < int(p[j]):
                    i += 1
                elif int(results[i]) > int(p[j]):
                    j += 1
                else:
                    intermediate.append(p[j])
                    j += 1
                    i += 1
            results = intermediate

            ## checks if we have already found terms totally disjoint from one another
            if len(results) == 0:
                return results

        return results

    ##
    #   @brief         This method compute cosine similarity for two vectors
    #   @param         self
    #   @param         vec1
    #   @param         vec2
    #   @return        cosine score: float
    #   @exception     None
    ##
    def cosine_similarity(self, vec1, vec2):
        # "compute cosine similarity: (vec1*vec2)/(||vec1||*||vec2||)"
        AA, AB, BB = 0, 0, 0
        for i in range(len(vec1)):
            x = vec1[i]
            y = vec2[i]
            AA += x * x
            BB += y * y
            AB += x * y
        return round(AB / math.sqrt(AA * BB), 4)
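
    # Worked example (illustrative values, not part of the source):
    #   vec1 = [1, 2], vec2 = [2, 4]
    #   AA = 1 + 4 = 5, BB = 4 + 16 = 20, AB = 1*2 + 2*4 = 10
    #   cosine = 10 / sqrt(5 * 20) = 10 / 10 = 1.0  (parallel vectors)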

    ##
    #   @brief         This method computes the vector space model query
    #   @param         self
    #   @param         k
    #   @return        list of (docID, score) pairs, ranked by descending cosine similarity
    #   @bug           Fixed
    #   @exception     ValueError
    ##
    def vectorQuery(self, k):
        ''' vector query processing, using the cosine similarity. '''
        #ToDo: return top k pairs of (docID, similarity), ranked by their cosine similarity with the query in the descending order
        # You can use term frequency or TFIDF to construct the vectors
        if len(self.processed_query) == 0:
            all_docids = set()
            for _, v in self.index.get_items_inverted().items():
                all_docids.update(v.get_posting_list().keys())
            return [(str(id), 0)
                    for id in sorted(list(map(int, all_docids)))[:k]]

        query_words = list(set(self.processed_query))
        idfs = [self.index.idf(w) for w in query_words]

        # the assignment does not define what to do when k is larger than the corpus
        try:
            if k > self.index.get_total_number_Doc():
                raise ValueError('k is greater than number of documents')
        except ValueError as err:
            print(err.args)
            return

        # Below we define the behavior when none of the query words appear in any document.
        # This case was not defined in the instructions; no relevant documents seems most appropriate
        # (if a search engine found only 0-cosine matches it would return nothing relevant even if
        # you asked for the 50 most relevant), so the first k docIDs are returned with a score of 0.
        if set(idfs) == {0}:
            all_docids = set()
            for _, v in self.index.get_items_inverted().items():
                all_docids.update(v.get_posting_list().keys())
            return [(str(id), 0)
                    for id in sorted(list(map(int, all_docids)))[:k]]

        # remove any words with an idf of 0, since they do not appear in the corpus; this saves memory
        # converting back to lists is probably unnecessary, and leaving them as tuples may be more appropriate
        idfs, query_words = map(
            list,
            zip(*[i for i in list(zip(idfs, query_words)) if not i[0] == 0]))

        #Calculates tfs of relevant words
        query_term_counter = Counter(self.processed_query)
        query_tf_vector = [
            round(math.log10(query_term_counter[w] + 1), 4)
            for w in query_words
        ]

        #Other way of doing tf
        #query_tf_vector = [round(1 + math.log10(query_term_counter[w]),4) if query_term_counter[w] > 0 else 0 for w in query_words]

        ### if a term in the query does not appear in our inverted index, discount/forget that term
        #### postings is a list of posting-list dicts, one per remaining query word

        postings = [
            self.index.get_items_inverted()[w].get_posting_list()
            for w in query_words if w in self.index.get_items_inverted()
        ]

        document_ids = set().union(*postings)
        document_tfs = {d: [0] * len(query_words) for d in document_ids}

        for inx, term in enumerate(postings):
            for document_id, posting in term.items():
                #log normalization
                document_tfs[document_id][inx] = math.log10(
                    posting.term_freq() + 1)

                #Other
                # tf = posting.term_freq()
                # if tf > 0 :
                #     tf = 1 + math.log10(tf)
                # else:
                #     tf = 0
                # document_tfs[document_id][inx] = tf

        query_tfidf = np.multiply(query_tf_vector, idfs)

        cosines = Counter({
            d: self.cosine_similarity(query_tfidf, np.multiply(d_tf, idfs))
            for d, d_tf in document_tfs.items()
        })
        # this has to be a list, since dicts are not sorted
        # we need a consistent ordering when multiple documents share a score: sort on score first, then docID (very slow)
        # if we knew k or the number of documents up front, numpy could preallocate memory so we could copy instead of append
        temp_k = k
        scores = sorted(list(set(cosines.values())), reverse=True)
        ret = []
        for s in scores:
            docs_with_score_s = sorted(
                [int(d) for d, v in cosines.items() if v == s])
            if len(docs_with_score_s) >= temp_k:
                docs_with_score_s = docs_with_score_s[:temp_k]
                ret.extend([(str(d), s) for d in docs_with_score_s])
                temp_k = 0
                break
            else:
                temp_k = temp_k - len(docs_with_score_s)
                ret.extend([(str(d), s) for d in docs_with_score_s])
        if not temp_k == 0:
            all_docids = set()
            for _, v in self.index.get_items_inverted().items():
                all_docids.update(v.get_posting_list().keys())

            ret.extend([(str(j), 0) for j in sorted(
                list(map(int, all_docids.difference({i[0]
                                                     for i in ret}))))[:temp_k]
                        ])
        return ret
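A minimal driver for the class above, assuming this sketch lives in the same module as QueryProcessor; the collection loader, index file name, and query text are assumptions rather than part of the source:

# Hypothetical usage sketch for QueryProcessor; names and paths are illustrative only.
from cran import CranFile                      # assumed collection loader module

collection = CranFile("cran.all")              # assumed Cranfield collection file
qp = QueryProcessor("boundary layer transition experiments",
                    index_file="index_file.pkl",
                    collection=collection)

print(qp.booleanQuery())                       # docIDs containing every query term
print(qp.vectorQuery(5))                       # top-5 (docID, cosine) pairs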
Example #4
def test(index_loc, cran_loc, qrels_loc):
    ''' test your code thoroughly. put the testing cases here'''

    ##### SETUP ITEMS #####

    # Grab index file to restore II
    ii = InvertedIndex()
    ii.load(index_loc)

    # Get the document collection
    cf = CranFile(cran_loc)

    # Get ground-truth results from qrels.txt
    with open(qrels_loc) as f:
        qrels = f.readlines()

    # Index qrels into a dict
    qrel_dict = {}
    for qrel in qrels:
        qrel_split = qrel.split()
        if int(qrel_split[0]) in qrel_dict:
            qrel_dict[int(qrel_split[0])].append(int(qrel_split[1]))
        else:
            qrel_dict[int(qrel_split[0])] = [int(qrel_split[1])]

    ##### INITIAL TEST ITEMS #####
    print("TESTS BASED ON SUGGESTED TESTING POINTS")

    # Ensure tf is correct
    #   Find a random word and check TF value against what is manually done
    posting_list = ii.find("experiment").posting
    tf_vector = []
    for posting in posting_list:
        tf_vector.append(len(posting_list[posting].positions) \
            == posting_list[posting].term_freq())
    print("TF is computed correctly:", all(tf_vector))

    # Ensure idf is correct
    print("IDF is computed correctly:", log10(ii.nDocs / len(posting_list)) \
        == ii.idf("experiment"))

    # As both tf and idf are correct, and tf-idf is a product of the two,
    #   it is reasonable to assume tf-idf is computed correctly

    ##### BOOL QUERY TESTS #####

    # Here, I use very specific boolean queries to ensure that a
    #   limited number of documents are returned
    print("\nBOOL QUERY TESTS")

    # Ensure that the exact title of doc 8 matches for doc 8
    doc8 = "measurements of the effect of two-dimensional and three-dimensional roughness elements on boundary layer transition"
    qp1 = QueryProcessor(doc8, ii, cf)
    print("Bool query matches on exact title:", qp1.booleanQuery() == [8])

    # Ensure that bool query matches very specific AND query
    qp2 = QueryProcessor("hugoniot and infinitesimally", ii, cf)
    print(
        "Bool query matches on specific AND query ('hugoniot and infinitesimally'):",
        qp2.booleanQuery() == [329])

    # Test that an OR query is handled properly
    #   Both gravel and stagnation have completely distinct postings lists.
    #   OR should merge them.
    gravel_postings = ii.find("gravel").sorted_postings[:]
    stag_postings = ii.find("stagnat").sorted_postings[:]
    gravel_postings.extend(stag_postings)
    qp3 = QueryProcessor("gravel or stagnation", ii, cf)
    print("Bool query successfully handles OR ('gravel or stagnation'):",
          qp3.booleanQuery() == sorted(gravel_postings))

    # Test that NOT is handled properly
    #   The posting list for "diameter" is a subset of "slipstream" postings
    #   (oddly enough). To test this works, do "slipstream and not diameter"
    #   and we should get slipstream's postings minus those of diameter.
    slip_postings = ii.find("slipstream").sorted_postings[:]
    diam_postings = ii.find("diamet").sorted_postings[:]
    slip_not_diam = [t for t in slip_postings if t not in diam_postings]
    print("Bool query successfully handles NOT ('slipstream and not diameter'):",
        QueryProcessor("slipstream and not diameter", ii, cf).booleanQuery() \
          == slip_not_diam)

    # Ensure AND/OR order doesn't matter
    print("Bool query can handle query regardless of AND order ('a and b' = 'b and a'):",
        QueryProcessor("slipstream and diameter", ii, cf).booleanQuery() \
          == QueryProcessor("diameter and slipstream", ii, cf).booleanQuery())
    print("Bool query can handle query regardless of OR order ('a or b' = 'b or a'):",
        QueryProcessor("slipstream or diameter", ii, cf).booleanQuery() \
          == QueryProcessor("diameter or slipstream", ii, cf).booleanQuery())

    # Ensure that the presence of parens does not change query results
    print("Bool query can handle query regardless of parens ('slipstream and diameter'):",
        QueryProcessor("slipstream and diameter", ii, cf).booleanQuery() \
          == QueryProcessor("(slipstream and diameter)", ii, cf).booleanQuery())

    # Ensure parentheses do not change order of processing for AND-AND and OR-OR queries
    print("Bool query AND is accociative ('(a and b) and c' = 'a and (b and c)'):",
        QueryProcessor("(slipstream and diameter) and thrust", ii, cf).booleanQuery() \
          == QueryProcessor("slipstream and (diameter and thrust)", ii, cf).booleanQuery())
    print("Bool query OR is accociative ('(a or b) or c' = 'a or (b or c)'):",
        QueryProcessor("(slipstream or diameter) or thrust", ii, cf).booleanQuery() \
          == QueryProcessor("slipstream or (diameter or thrust)", ii, cf).booleanQuery())

    # Ensure parentheses properly group items
    #   Tested by doing the query "manually" by adding/orring the correct terms
    part_one = QueryProcessor("conduction and cylinder and gas", ii,
                              cf).booleanQuery()
    part_two = QueryProcessor("radiation and gas", ii, cf).booleanQuery()
    part_one.extend(part_two)
    expected_result = QueryProcessor("hugoniot", ii, cf).booleanQuery()
    expected_result.extend(part_one)
    print("Bool query parens successfully group conflicting operators:",
        QueryProcessor("(conduction and cylinder and gas) or (radiation and gas) or hugoniot", ii, cf).booleanQuery() \
          == sorted(list(set(expected_result))))

    ##### VECTOR QUERY TESTS #####

    # For this, just ensure that most of the results are in the expected list
    print("\nVECTOR QUERY TESTS")

    # Ensure vector query can match on exact title
    print("Vector query matches on exact title:",
          qp1.vectorQuery(1)[0][0] == 8)

    # Try a few example queries from query.text
    #   As long as one-fifth of the top 10 are in gt_result, call it a pass
    # Note that queries with larger answer sets were chosen to
    #   ensure there were enough to get to one-fifth of ten
    qc = loadCranQry("query.text")
    poss_queries = list(qc)

    # Query 001
    result = QueryProcessor(qc["001"].text, ii, cf).vectorQuery(10)
    gt_result = qrel_dict[poss_queries.index("001") + 1]
    correct_vector = list(map(lambda x: x in gt_result,
                              [x[0] for x in result]))
    print("Vector query is at least one-fifth correct for query 001:",
          sum(correct_vector) > 2)

    # Query 128
    result = QueryProcessor(qc["128"].text, ii, cf).vectorQuery(10)
    gt_result = qrel_dict[poss_queries.index("128") + 1]
    correct_vector = list(map(lambda x: x in gt_result,
                              [x[0] for x in result]))
    print("Vector query is at least one-fifth correct for query 128:",
          sum(correct_vector) > 2)

    # Query 226
    result = QueryProcessor(qc["226"].text, ii, cf).vectorQuery(10)
    gt_result = qrel_dict[poss_queries.index("226") + 1]
    correct_vector = list(map(lambda x: x in gt_result,
                              [x[0] for x in result]))
    print("Vector query is at least one-fifth correct for query 226:",
          sum(correct_vector) > 2)

    # Query 196
    result = QueryProcessor(qc["196"].text, ii, cf).vectorQuery(10)
    gt_result = qrel_dict[poss_queries.index("196") + 1]
    correct_vector = list(map(lambda x: x in gt_result,
                              [x[0] for x in result]))
    print("Vector query is at least one-fifth correct for query 196:",
          sum(correct_vector) > 2)

    # Query 291
    result = QueryProcessor(qc["291"].text, ii, cf).vectorQuery(10)
    gt_result = qrel_dict[poss_queries.index("291") + 1]
    correct_vector = list(map(lambda x: x in gt_result,
                              [x[0] for x in result]))
    print("Vector query is at least one-fifth correct for query 291:",
          sum(correct_vector) > 2)
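A hedged invocation sketch for the test harness above; the concrete file names depend on the local Cranfield setup and are assumptions:

if __name__ == "__main__":
    # File names below are illustrative; substitute the local index pickle,
    # the Cranfield collection file, and the relevance judgments file.
    test("index_file.pkl", "cran.all", "qrels.text")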