Exemple #1
0
    def set_doc_vector(self):
        """
        Compute and store a normalised term-weight vector for every document.

        Each document in the ``self.db`` shelf gets a vector of length
        ``self.vec_length``. For every keyword of the ``term2id`` mapping
        that also occurs in the document's ``unique_terms_freq``, the slot
        given by the keyword's integer id is set to
        ``mathutils.calculate_term_weight(tf, doc_freq[kw], self.vec_length)``.
        The vector is then passed through ``mathutils.normalise_vector``,
        printed, and stored in the ``documentVectors`` shelf under the
        document's key.

        Every shelf is closed when the method returns or raises, so no
        handle (in particular the written ``documentVectors`` one) is left
        open.
        """
        # Local import keeps this method self-contained.
        from contextlib import closing

        # NOTE(review): the result of this call was never used; presumably it
        # only builds a lazy generator, making the call a no-op. It is kept in
        # case it has side effects -- confirm before removing.
        self.yield_documents()

        # 'c' opens read/write and creates the file when missing, replacing
        # the former "open with 'c' and drop the handle, then reopen with 'w'".
        # Assumes the objects returned by shelve(...) expose close(), as
        # standard shelves do -- TODO confirm.
        with closing(shelve('doc_frequencies', 'w')) as doc_freq, \
                closing(shelve('documentVectors', 'c')) as dv, \
                closing(shelve('temp/terms_to_integer', 'r')) as keyword_db, \
                closing(shelve(self.db, 'w')) as db:
            keywords = keyword_db['term2id']
            for document in db.itervalues():
                doc_terms = document.unique_terms_freq

                doc_vector = zeros(self.vec_length)
                for kw in keywords:
                    # Direct membership test; avoids scanning a keys() list.
                    if kw in doc_terms:
                        doc_vector[keywords[kw]] = mathutils.calculate_term_weight(
                            doc_terms[kw], doc_freq[kw], self.vec_length)

                doc_vector = mathutils.normalise_vector(doc_vector)
                print(doc_vector)
                dv[document.key] = doc_vector
Exemple #2
0
def query_parser(query):
    """
    Turn a query string into a normalised term-count vector.

    The query is split on whitespace and every word is stemmed with
    ``porter_stemmer``. Stems that are not in ``keywords`` are dropped.
    Each remaining stem is mapped to its integer id through
    ``keyword_database['term2id']`` and that slot of a ``vec_length``-sized
    vector is set to the number of times the stem occurs in the query.

    Returns the vector after ``mathutils.normalise_vector`` has been
    applied to it.
    """
    stems = (porter_stemmer.stem(word) for word in query.split())
    # Count only stems that belong to the basis vector.
    tfs = Counter(stem for stem in stems if stem in keywords)

    query_vec = zeros(vec_length)
    if tfs:
        # Fetch the mapping once instead of on every loop iteration; it is
        # only touched when at least one query term survived the filter.
        term2id = keyword_database['term2id']
        for term, count in tfs.items():
            query_vec[term2id[term]] = count
    return mathutils.normalise_vector(query_vec)
Exemple #3
0
def query_parser(query):
    """
    Turn a query string into a normalised term-count vector.

    The query is split on whitespace and every word is stemmed with
    ``porter_stemmer``. Stems that are not in ``keywords`` are dropped.
    Each remaining stem is mapped to its integer id through
    ``keyword_database['term2id']`` and that slot of a ``vec_length``-sized
    vector is set to the number of times the stem occurs in the query.

    Returns the vector after ``mathutils.normalise_vector`` has been
    applied to it.
    """
    stems = (porter_stemmer.stem(word) for word in query.split())
    # Count only stems that belong to the basis vector.
    tfs = Counter(stem for stem in stems if stem in keywords)

    query_vec = zeros(vec_length)
    if tfs:
        # Fetch the mapping once instead of on every loop iteration; it is
        # only touched when at least one query term survived the filter.
        term2id = keyword_database['term2id']
        for term, count in tfs.items():
            query_vec[term2id[term]] = count
    return mathutils.normalise_vector(query_vec)