Example #1
    def __init__(self):
        self.parser = Parser()
        self.bids = Crawler().finished_set

        if os.path.exists(os.path.join(Matrix.dirname, 'dictionary.pkl')):
            self.dictionary = None
        else:
            self.dictionary = self.load_dictionary()

        if os.path.exists(os.path.join(Matrix.dirname, 'higher_tier_dictionary.pkl')):
            self.higher_tier_dictionary = None
        else:
            self.higher_tier_dictionary = self.load_higher_tier_dictionary()

        self.term_to_row = self.load_term_to_row()
        self.bid_to_col = self.load_bid_to_col()

        self.N = len(self.bid_to_col)    # Total number of documents
        self.M = len(self.term_to_row)    # Total number of terms

        self.col_to_bid = self.load_col_to_bid()
        self.row_to_term = self.load_row_to_term()    # Used in similar search
        self.term_bid_matrix = self.load_term_bid_matrix()
        self.title_term_bid_matrix = self.load_title_term_bid_matrix()
        # CSR sparse matrices used for slicing term rows
        self.term_bid_matrix_csr = self.term_bid_matrix.tocsr()
        self.title_term_bid_matrix_csr = self.title_term_bid_matrix.tocsr()
        self.stop_words = self.load_stop_words()

        # Tiers, tier[0] is the lowest tier
        self.mats_csc = (self.term_bid_matrix,
                         self.title_term_bid_matrix)
        self.mats_csr = (self.term_bid_matrix_csr,
                         self.title_term_bid_matrix_csr)
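Each load_* call above follows a load-or-build cache pattern: the structure is unpickled when a cached file exists and rebuilt (then marked for saving) otherwise; Example #3 shows the actual loading decorator. A minimal self-contained sketch of that pattern, where load_or_build and build_index are hypothetical names, not part of the original code:

import os
import pickle

def load_or_build(filename, build):
    """Return the pickled object at filename if it exists; otherwise build it, cache it, and return it."""
    if os.path.exists(filename):
        with open(filename, 'rb') as f:
            return pickle.load(f)
    obj = build()
    with open(filename, 'wb') as f:
        pickle.dump(obj, f)
    return obj

# Hypothetical usage: index = load_or_build('index.pkl', build_index)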
Example #2
    def __init__(self, documents=None, transforms=(TFIDF, LSA)):
        self.collection_of_document_term_vectors = []
        self.parser = Parser()
        if documents:
            self._build(documents, transforms)
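Each item in transforms is applied in _build (see Example #4) as transform(matrix).transform(): a class constructed with the current matrix that exposes a transform() method returning the new matrix. A minimal sketch of that interface; ScaleTransform is a hypothetical stand-in for TFIDF or LSA, not part of the original code:

class ScaleTransform:
    """Hypothetical stand-in for TFIDF/LSA with the interface _build expects."""
    def __init__(self, matrix):
        self.matrix = matrix

    def transform(self):
        # A real transform (e.g. TFIDF) would reweight or project the matrix;
        # this sketch just doubles every term count.
        return [[2 * value for value in row] for row in self.matrix]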
Example #3
class Matrix(object):
    k1, k2, b = 1.5, 1.5, 0.75    # BM25 parameters
    modified = {'dictionary': False, 'higher_tier_dictionary': False}    # Tracks modified members; the loading decorator adds the remaining keys
    dirname = os.path.dirname(os.path.realpath(__file__))
    def __init__(self):
        self.parser = Parser()
        self.bids = Crawler().finished_set

        if os.path.exists(os.path.join(Matrix.dirname, 'dictionary.pkl')):
            self.dictionary = None
        else:
            self.dictionary = self.load_dictionary()

        if os.path.exists(os.path.join(Matrix.dirname, 'higher_tier_dictionary.pkl')):
            self.higher_tier_dictionary = None
        else:
            self.higher_tier_dictionary = self.load_higher_tier_dictionary()

        self.term_to_row = self.load_term_to_row()
        self.bid_to_col = self.load_bid_to_col()

        self.N = len(self.bid_to_col)    # Total number of documents
        self.M = len(self.term_to_row)    # Total number of terms

        self.col_to_bid = self.load_col_to_bid()
        self.row_to_term = self.load_row_to_term()    # Used in similar search
        self.term_bid_matrix = self.load_term_bid_matrix()
        self.title_term_bid_matrix = self.load_title_term_bid_matrix()
        # CSR sparse matrices used for slicing term rows
        self.term_bid_matrix_csr = self.term_bid_matrix.tocsr()
        self.title_term_bid_matrix_csr = self.title_term_bid_matrix.tocsr()
        self.stop_words = self.load_stop_words()

        # Tiers, tier[0] is the lowest tier
        self.mats_csc = (self.term_bid_matrix,
                         self.title_term_bid_matrix)
        self.mats_csr = (self.term_bid_matrix_csr,
                         self.title_term_bid_matrix_csr)

    def __del__(self):
        self.save_dictionary()
        self.save_higher_tier_dictionary()
        self.save_term_to_row()
        self.save_bid_to_col()
        self.save_term_bid_matrix()
        self.save_title_term_bid_matrix()
        self.save_row_to_term()
        self.save_col_to_bid()
        self.save_stop_words()

    def saving(func):
        @wraps(func)
        def wrapper(self, *args, **kwargs):
            stuff = func.__name__[5:]    # Strip the 'save_' prefix to get the member name
            if Matrix.modified[stuff]:
                filename = os.path.join(Matrix.dirname, stuff + '.pkl')
                print('Saving {} ...'.format(stuff))
                with open(filename, 'wb') as f:    # 'wb', not 'rb': the pickle is being written
                    pickle.dump(getattr(self, stuff), f)
                print('{} saved.'.format(stuff))
        return wrapper

    def loading(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            stuff = func.__name__[5:]    # Strip the 'load_' prefix to get the member name
            filename = os.path.join(Matrix.dirname, stuff + '.pkl')
            if os.path.exists(filename):
                print('Loading {} ...'.format(stuff))
                with open(filename, 'rb') as f:
                    Matrix.modified[stuff] = False
                    r = pickle.load(f)
                print('{} loaded.'.format(stuff))
            else:
                print('Building {} ...'.format(stuff))
                r = func(*args, **kwargs)
                Matrix.modified[stuff] = True
                print('{} built.'.format(stuff))
            return r
        return wrapper

    @loading
    def load_dictionary(self):
        dictionary = defaultdict(Counter)    # {term -> {bid -> frequency}}
        for bid in self.bids:
            path = os.path.join('text', bid + '.txt')
            try:
                with open(path, 'r') as f:
                    text = f.read()
            except FileNotFoundError as E:
                with open('parser.log', 'a') as f:
                    f.write('{}\n'.format(E))
                continue    # Skip books whose text file is missing

            for token in self.parser.parse(text):
                dictionary[token][bid] += 1

        return dictionary

    @loading
    def load_higher_tier_dictionary(self):
        higher_tier_dictionary = defaultdict(Counter)
        conn = sqlite3.connect('books.db')
        c = conn.cursor()
        c.execute('SELECT bid, title, author, tags FROM books')
        for bid, title, author, tags in c:    # Iterate over the cursor directly
            text = '{} {} {}'.format(title, author, tags)
            for token in self.parser.parse(text):
                higher_tier_dictionary[token][str(bid)] += 1
        conn.close()

        return higher_tier_dictionary

    @loading
    def load_term_to_row(self):
        return {term: row for row, term in enumerate(self.dictionary)}

    @loading
    def load_bid_to_col(self):
        return {bid: col for col, bid in enumerate(self.bids)}

    @loading
    def load_term_bid_matrix(self):
        return self._dict_to_matrix(self.dictionary)

    @loading
    def load_title_term_bid_matrix(self):
        return self._dict_to_matrix(self.higher_tier_dictionary)

    @loading
    def load_row_to_term(self):
        return {row: term for term, row in self.term_to_row.items()}

    @loading
    def load_col_to_bid(self):
        return {col: bid for bid, col in self.bid_to_col.items()}

    @loading
    def load_stop_words(self, N=25000):
        """Return a boolean vector in which True entries stand for lower occurence words,
        while False entries stand for higher occurence words.
        N is the threshold values that how many stop words will be selected"""
        def df(term):
            return len(self.dictionary[term])
        def nlargest_df(N):
            return heapq.nlargest(N, ((df(term), term) for term in self.dictionary))
        v = np.ones((self.M, 1), dtype=bool)    # High occurence words (stop words) vector
        for df, term in nlargest_df(N):
            v[self.term_to_row[term], 0] = False
        return v

    def _dict_to_matrix(self, dictionary):
        """Get term/bid matrix."""
        m = sparse.dok_matrix((self.M, self.N), dtype=int)
        for term, bids in dictionary.items():
            row = self.term_to_row[term]
            for bid, freq in bids.items():
                col = self.bid_to_col[bid]
                m[row, col] = freq
        return m.tocsc()

    @saving
    def save_term_bid_matrix(self):
        pass

    @saving
    def save_title_term_bid_matrix(self):
        pass

    @saving
    def save_term_to_row(self):
        pass

    @saving
    def save_bid_to_col(self):
        pass

    @saving
    def save_dictionary(self):
        pass

    @saving
    def save_higher_tier_dictionary(self):
        pass

    @saving
    def save_row_to_term(self):
        pass

    @saving
    def save_col_to_bid(self):
        pass

    @saving
    def save_stop_words(self):
        pass

    def tf(self, freq):
        return 1 + np.log10(freq)

    def idf(self, term, tier=0):
        matrix = self.mats_csr[tier]
        def df(term):
            row_num = self.term_to_row[term]
            row = matrix.getrow(row_num)
            return row.nnz
        return np.log10(self.N / df(term))
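        # Worked example with hypothetical numbers: for N = 1000 documents, a term with
        # freq = 4 in one document and df = 10 across the collection gets
        # tf = 1 + log10(4) ≈ 1.602 and idf = log10(1000 / 10) = 2, so tf * idf ≈ 3.204.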

    def make_query_vector(self, query_terms, tier):
        """Generate a query tf-idf vector from the query term counts.
        Parameter query_terms is a Counter mapping term -> frequency."""
        q = sparse.dok_matrix((1, self.M))    # 1 x M vector
        # Build the query tf-idf vector
        for term, freq in query_terms.items():
            q[0, self.term_to_row[term]] = self.tf(freq) * self.idf(term, tier)
        q = normalize(q.tocsr(), axis=1, copy=False)
        return q

    def boolean_search(self, query_terms, tier):
        """Search a query according to its ocurrence in the documents and return matched results.
        When tier is 'l', search the lower tier, and if tier is 'h' search the higher tier.
        Return the column indices of matched books."""
        matrix = self.mats_csr[tier]
        docs = np.zeros(self.N, dtype=bool)    # Get union of docs containing query
        for term in query_terms:
            row = self.term_to_row.get(term)
            if row is None:
                continue
            else:
                np.logical_or(docs, matrix.getrow(row).toarray()[0], docs)
        return docs.nonzero()[0]

    def make_matched_matrix(self, cols, tier):
        matrix = self.mats_csc[tier]
        # Pick the matched columns
        m = matrix[:, cols]
        m.data = self.tf(m.data)
        m = normalize(m.T, axis=1, copy=False).T    # copy=False avoids a copy for speed
        return m

    def tiered_search(self, query, K=50):
        query_terms = self.parser.parse_query(query)

        # Higher tier
        try:
            hres = heapq.nlargest(K, self.cos_sim_search(query_terms, tier=1))
        except KeyError:    # Terms don't occur in titles; fall through to the lower tier
            hres = []
        lres = []
        if len(hres) < K:    # Lower tier
            try:
                hres_bids = [r[1] for r in hres]
                # Skip docs that have already been retrieved from the higher tier
                lres = heapq.nlargest(K - len(hres),    # Find at most K docs
                                      filter(lambda pair: pair[1] not in hres_bids,
                                             self.cos_sim_search(query_terms, tier=0)))
            except KeyError:    # Terms don't occur in any doc; return nothing
                return ()
        return (int(r[1]) for r in itertools.chain(hres, lres))    # Return the K highest-ranked bids

    def cos_sim_search(self, query_terms, tier=0):
        """Return an unsorted iterator of (similarity, bid) tuples given the query terms.
        If qv is not provided, it will be computed.
        Default tier is 0"""
        qv = self.make_query_vector(query_terms, tier)
        cols = self.boolean_search(query_terms, tier)
        m = self.make_matched_matrix(cols, tier)
        cos_sims = qv.dot(m).toarray()[0]
        bids = (self.col_to_bid[col] for col in cols)
        return zip(cos_sims, bids)

    def find_most_similar(self, bids, K_sim=14):
        """Return the bid of the most similar book to parameter bid except the given bid."""
        termv = sparse.csc_matrix((self.M, 1), dtype=int)
        for bid in bids:
            col_num = self.bid_to_col.get(str(bid))
            if col_num is not None:
                termv = termv + self.term_bid_matrix.getcol(col_num)
        if termv.nnz == 0:
            return ()
        termva = termv.toarray()    # Generate a vector for terms
        stop_words_removed = np.logical_and(termva, self.stop_words)
        nonzero = stop_words_removed.nonzero()[0]    # Nonzero indices
        rest_term_rows = self.term_bid_matrix_csr[nonzero]
        docs = np.zeros(self.N, dtype=bool)
        for row in rest_term_rows:
            np.logical_or(docs, row.toarray()[0], docs)
        cols = docs.nonzero()[0]
        matched_matrix = self.term_bid_matrix[:, cols]
        termv.data = self.tf(termv.data) * np.array([self.idf(self.row_to_term[row])
                                                     for row in termv.indices])
        termv = normalize(termv.T, axis=1, copy=False)
        matched_matrix.data = self.tf(matched_matrix.data)
        matched_matrix = normalize(matched_matrix.T, axis=1, copy=False).T
        cos_sims = termv.dot(matched_matrix).toarray()[0]
        found_bids = (self.col_to_bid[col] for col in cols)
        return islice((int(r[1])
                       for r in heapq.nlargest(K_sim, zip(cos_sims, found_bids))
                       if int(r[1]) not in bids),
                       9)

    def find_most_similar_tags(self, bid, K_sim=10):
        conn = sqlite3.connect('books.db')
        c = conn.cursor()
        c.execute('SELECT tags FROM books WHERE bid = ?', (int(bid),))
        tags = c.fetchone()[0]
        conn.close()
        query_terms = self.parser.parse_query(tags)
        return (int(r[1]) for r in heapq.nlargest(K_sim, self.cos_sim_search(query_terms)))

    def bm25_search(self, query):
        pass
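A minimal usage sketch for this class, assuming the project's Parser and Crawler classes, the text/ corpus directory, and books.db are in place; the query string below is made up for illustration.

matrix = Matrix()
top_bids = list(matrix.tiered_search('machine learning', K=50))    # Up to K highest-ranked bids
similar = list(matrix.find_most_similar(top_bids[:1]))             # Books similar to the top hit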
Example #4
class VectorSpace:
    collection_of_document_term_vectors = []
    vector_index_to_keyword_mapping = []
    parser = None

    def __init__(self, documents=None, transforms=(TFIDF, LSA)):
        self.collection_of_document_term_vectors = []
        self.parser = Parser()
        if documents:
            self._build(documents, transforms)

    def _build(self, documents, transforms):
        """
            Create the vector space for the passed document strings
        """
        self.vector_index_to_keyword_mapping = self._get_vector_keyword_index(documents)

        pprint(self.vector_index_to_keyword_mapping)
        matrix = [self._make_vector(document) for document in documents]
        matrix = reduce(lambda matrix, transform: transform(matrix).transform(), transforms, matrix)
        self.collection_of_document_term_vectors = matrix


    def _get_vector_keyword_index(self, document_list):
        """
            Return a dictionary mapping each word to its position in the list of stems
        """
        vocabulary_list = self.parser.tokenise_and_remove_stop_words(document_list)
        unique_vocabulary_list = self._remove_duplicates(vocabulary_list)

        # Associate a position with each keyword; the position maps to the
        # dimension of the vector used to represent that word
        return {word: offset for offset, word in enumerate(unique_vocabulary_list)}


    def related(self, document_id):
        """ find documents that are related to the document indexed by passed Id within the document Vectors"""
        ratings = [self._cosine(self.collection_of_document_term_vectors[document_id], document_vector) for
                   document_vector in self.collection_of_document_term_vectors]
        ratings.sort(reverse=True)
        return ratings


    def search(self, search_list):
        """ Search for documents that match based on a list of terms """
        query_vector = self._build_query_vector(search_list)
        ratings = [self._cosine(query_vector, document_vector) for document_vector in self.collection_of_document_term_vectors]
        ratings.sort(reverse=True)
        return ratings

    def _make_vector(self, word_string):
        """ @pre: unique(vectorIndex) """

        vector = [0] * len(self.vector_index_to_keyword_mapping)

        word_list = self.parser.tokenise_and_remove_stop_words(word_string.split(" "))

        # Term-count model
        for word in word_list:
            vector[self.vector_index_to_keyword_mapping[word]] += 1
        return vector


    def _build_query_vector(self, term_list):
        """
        convert query string into a term vector
        """
        query = self._make_vector(" ".join(term_list))
        return query


    def _remove_duplicates(self, items):
        """
         Remove duplicate words
        """
        return set(items)


    def _cosine(self, vector1, vector2):
        """ Compare documents in the concept space by the cosine of their vectors:
            cosine = ( V1 * V2 ) / ( ||V1|| * ||V2|| ) """
        dot_val = dot(vector1, vector2)
        norms = norm(vector1) * norm(vector2)
        if norms == 0.0:
            return 0.0
        else:
            return float(dot_val / norms)
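A minimal usage sketch, assuming Parser and TFIDF from this project are importable; the two sample documents are made up for illustration.

documents = ['the quick brown fox', 'jumped over the lazy dog']
space = VectorSpace(documents, transforms=[TFIDF])
print(space.search(['fox']))    # Cosine similarities sorted descending (the sort drops document association)
print(space.related(0))         # Document 0's similarity to every document, sorted descending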