def set_doc_vector(self):
    """
    Iterates through the documents and sets their document vectors.

    For every document stored in the ``self.db`` shelf a vector of length
    ``self.vec_length`` is built. Each keyword of the ``term2id`` mapping
    (read from ``temp/terms_to_integer``) that also occurs in the
    document's ``unique_terms_freq`` receives, at the keyword's integer
    id, the weight returned by ``mathutils.calculate_term_weight``. The
    vector is then passed through ``mathutils.normalise_vector``, printed,
    and stored in the ``documentVectors`` shelf under ``document.key``.

    All stores opened here are closed again before returning, including
    when an error interrupts the loop.

    NOTE(review): assumes the ``shelve`` callable used in this file is an
    alias of ``shelve.open`` (the flag arguments and the ``itervalues``
    call suggest so), so the returned objects provide ``close()`` --
    confirm.
    """
    # NOTE(review): the result of this call is never consumed here; the call
    # is kept only in case it has side effects -- confirm and drop.
    self.yield_documents()

    opened = []
    try:
        doc_freq = shelve('doc_frequencies', 'w')
        opened.append(doc_freq)

        # 'c' creates the store when it is missing and opens it read/write,
        # so one handle replaces the former create-then-reopen pair.
        dv = shelve('documentVectors', 'c')
        opened.append(dv)

        keyword_database = shelve('temp/terms_to_integer', 'r')
        opened.append(keyword_database)
        keywords = keyword_database['term2id']

        db = shelve(self.db, 'w')
        opened.append(db)

        for document in db.itervalues():
            doc_terms = document.unique_terms_freq
            doc_vector = zeros(self.vec_length)
            for kw in keywords:
                # Keywords absent from the document keep their zero entry.
                if kw in doc_terms:
                    doc_vector[keywords[kw]] = mathutils.calculate_term_weight(
                        doc_terms[kw], doc_freq[kw], self.vec_length)
            doc_vector = mathutils.normalise_vector(doc_vector)
            print(doc_vector)
            dv[document.key] = doc_vector
    finally:
        # Close in reverse order of opening so pending writes are flushed.
        for handle in reversed(opened):
            handle.close()
def query_parser(query):
    """
    Turn a query string into a normalised query vector.

    The query string is split on whitespace into words, and every word is
    stemmed with ``porter_stemmer``. Stems that are not present in
    ``keywords`` (our basis vector) are dropped. Each remaining stem is
    mapped to its integer id through ``keyword_database['term2id']``, and
    the number of times the stem occurs in the query is written at that
    position of a vector of length ``vec_length``.

    Returns the vector after it has been passed through
    ``mathutils.normalise_vector``.

    NOTE(review): when no stem of the query is found in ``keywords`` the
    vector stays all-zero; how ``normalise_vector`` treats that input is
    not visible here -- confirm.
    """
    stems = [porter_stemmer.stem(word) for word in query.split()]
    term_counts = Counter(stem for stem in stems if stem in keywords)

    query_vec = zeros(vec_length)
    if term_counts:
        # Read the mapping once rather than once per term; as before, it is
        # only touched when at least one query term survived the filter.
        term2id = keyword_database['term2id']
        for term, count in term_counts.items():
            query_vec[term2id[term]] = count

    return mathutils.normalise_vector(query_vec)