def update_query(query, relevant=None, irrelevant=None, alpha=0.9, beta=0.5, gamma=0.1):
    """
    Update query in our db using the Rocchio relevance-feedback algorithm.

    Note: the query string itself is not updated; the stored query vector is,
    and the cached results are refreshed via set_query_results().

    :param query: query string; must be non-empty and previously seen
    :param relevant: list of doc_ids judged relevant
    :param irrelevant: list of doc_ids judged irrelevant
    :param alpha: weight of original query.
        NOTE(review): accepted but currently unused -- the original query
        vector is kept at weight 1. Applying it here would also require
        compensating in undo_update; confirm intended semantics first.
    :param beta: weight of relevant docs
    :param gamma: weight of irrelevant docs
    :return: True if successful, False if unsuccessful (unknown query)
    """
    if relevant is None:
        relevant = []
    if irrelevant is None:
        irrelevant = []
    assert (query != "")
    query_map = SqliteDict(query_map_path)
    try:
        q0 = query_map[query]
    except KeyError:
        # Can't update queries we've never seen
        query_map.close()
        return False
    if not isinstance(q0, DictVector):
        q0 = DictVector(q0)
    doc_vec_db = SqliteDict(doc_vecs_db_path)
    try:
        # Pull the query vector toward the centroid of the relevant docs.
        Nr = len(relevant)
        for doc_id in relevant:
            try:
                doc_vec = doc_vec_db[doc_id]
            except KeyError:
                # Unknown doc_id: skip it rather than abort the whole update.
                continue
            if not isinstance(doc_vec, DictVector):
                doc_vec = DictVector(doc_vec)
            q0 = q0 + (beta / Nr) * doc_vec
        # ...and push it away from the centroid of the irrelevant docs.
        Ni = len(irrelevant)
        for doc_id in irrelevant:
            try:
                doc_vec = doc_vec_db[doc_id]
            except KeyError:
                continue
            if not isinstance(doc_vec, DictVector):
                doc_vec = DictVector(doc_vec)
            q0 = q0 - (gamma / Ni) * doc_vec
    finally:
        doc_vec_db.close()  # fix: this handle was never closed
    query_map[query] = q0
    query_map.commit()
    query_map.close()  # fix: handle leaked on the success path
    set_query_results(query)
    return True  # fix: docstring promises True on success; function returned None
def undo_update(query, relevant=None, irrelevant=None, alpha=0.9, beta=0.5, gamma=0.1):
    """
    Undo a previous update_query call, e.g. when a user decides a post
    marked relevant isn't actually relevant.

    Reverses the Rocchio feedback terms: subtracts the relevant-doc
    contribution and adds back the irrelevant-doc contribution.

    :param query: query string; must be non-empty and previously seen
    :param relevant: list of doc_ids whose "relevant" feedback is being undone
    :param irrelevant: list of doc_ids whose "irrelevant" feedback is being undone
    :param alpha: weight of original query.
        NOTE(review): accepted but currently unused, matching update_query.
    :param beta: weight of relevant docs
    :param gamma: weight of irrelevant docs
    :return: True if successful, False if unsuccessful (unknown query)
    """
    if relevant is None:
        relevant = []
    if irrelevant is None:
        irrelevant = []
    assert (query != "")
    query_map = SqliteDict(query_map_path)
    try:
        q0 = query_map[query]
    except KeyError:
        # Can't update queries we've never seen
        query_map.close()
        return False
    if not isinstance(q0, DictVector):
        q0 = DictVector(q0)
    doc_vec_db = SqliteDict(doc_vecs_db_path)
    try:
        # Remove the relevant-doc contribution that update_query added...
        Nr = len(relevant)
        for doc_id in relevant:
            try:
                doc_vec = doc_vec_db[doc_id]
            except KeyError:
                # Unknown doc_id: skip it rather than abort the whole undo.
                continue
            if not isinstance(doc_vec, DictVector):
                doc_vec = DictVector(doc_vec)
            q0 = q0 - (beta / Nr) * doc_vec
        # ...and restore the irrelevant-doc contribution it subtracted.
        Ni = len(irrelevant)
        for doc_id in irrelevant:
            try:
                doc_vec = doc_vec_db[doc_id]
            except KeyError:
                continue
            if not isinstance(doc_vec, DictVector):
                doc_vec = DictVector(doc_vec)
            q0 = q0 + (gamma / Ni) * doc_vec
    finally:
        doc_vec_db.close()  # fix: this handle was never closed
    query_map[query] = q0
    query_map.commit()
    query_map.close()  # fix: handle leaked on the success path
    set_query_results(query)
    return True  # fix: docstring promises True on success; function returned None
def testSubtraction(self):
    """Subtracting cancels shared keys and negates keys only in the subtrahend."""
    subtrahend = DictVector()
    subtrahend["a"] = 1
    subtrahend["e"] = 2
    diff = self.vec - subtrahend
    # "a" is in both operands (1 - 1); "e" is only in the subtrahend (0 - 2).
    self.assertEqual(diff["a"], 0)
    self.assertEqual(diff["e"], -2)
def testAdditionWithNonepmty(self):
    """Overlapping keys are summed; keys present in only one operand pass through."""
    # NOTE(review): method name has a typo ("Nonepmty"); kept to avoid renaming.
    addend = DictVector()
    addend["a"] = 1
    addend["b"] = 2
    total = self.vec + addend
    self.assertEqual(total["a"], 2)
    self.assertEqual(total["b"], 4)
    self.assertEqual(total["c"], 3)
def testWithDifferentKeys(self):
    """Addition merges disjoint keys and sums the shared ones."""
    addend = DictVector()
    addend["a"] = 1
    addend["e"] = 2
    merged = self.vec + addend
    # "a" is summed; "b"/"c" come only from the fixture; "e" only from addend.
    self.assertEqual(merged["a"], 2)
    self.assertEqual(merged["b"], 2)
    self.assertEqual(merged["c"], 3)
    self.assertEqual(merged["e"], 2)
def tfidf_from_doc(self, doc: Document, doc_freqs: DocFreqs) -> dict:
    """
    Compute the tf-idf vector for a document.

    Term frequencies come from the query expander when one is configured,
    otherwise from the module-level compute_tf helper.

    :param doc: document to vectorize
    :param doc_freqs: corpus document frequencies (provides get_num_docs())
    :return: DictVector of term -> tf-idf weight
    """
    if self.query_expander is not None:
        tf = self.query_expander.compute_tf(doc, self.term_weights)
    else:
        tf = compute_tf(doc, doc_freqs, self.term_weights)
    # Delegate the tf -> tf-idf weighting to the shared helper so the
    # smoothed-idf formula lives in one place (it was duplicated inline,
    # identical to tfidf_from_tf).
    return self.tfidf_from_tf(tf, doc_freqs)
def tfidf_from_tf(self, tf, doc_freqs):
    """
    Convert a term-frequency map into a tf-idf DictVector.

    Uses smoothed idf: log(N / (1 + df)) with N the corpus size, so terms
    absent from doc_freqs (df == 0) do not divide by zero.

    :param tf: mapping of term -> term frequency
    :param doc_freqs: corpus document frequencies (provides get_num_docs())
    :return: DictVector of term -> tf-idf weight
    """
    num_docs = doc_freqs.get_num_docs()
    weights = {
        term: freq * np.log(num_docs / (1 + doc_freqs[term]))
        for term, freq in tf.items()
    }
    return DictVector(weights)
def testConstructor(self):
    """A DictVector built from a plain dict exposes the same key/value pairs."""
    source = {"a": 1, "b": 2}
    built = DictVector(source)
    self.assertEqual(built["a"], 1)
    self.assertEqual(built["b"], 2)
def setUp(self) -> None:
    """Build the shared fixture vector {a: 1, b: 2, c: 3} used by the tests."""
    self.vec = DictVector()
    for key, value in {"a": 1, "b": 2, "c": 3}.items():
        self.vec[key] = value
def query2vec(query):
    """
    Turn a raw query string into a DictVector of tf-idf weights.

    :param query: query string
    :return: DictVector built from the processor's tf-idf map
    """
    return DictVector(processer.tfidf_from_query(query, doc_freqs))
# On-disk database directory, created next to the Flask app if missing.
db_path = os.path.join(app.root_path, "db")
if not os.path.exists(db_path):
    os.makedirs(db_path)
# query db stores the results of queries for later use
query_db_path = os.path.join(db_path, "queries.db")
# query map is a map of queries to vectors
query_map_path = os.path.join(db_path, "query_map.db")
# doc vecs db maps doc_id -> document tf-idf vector
doc_vecs_db_path = os.path.join(db_path, "doc_vecs.db")

# Upload all doc vectors to db.
# Only inserts doc_ids not already present (EAFP: a successful lookup
# means the entry exists and is left untouched).
doc_vec_db = SqliteDict(doc_vecs_db_path)
for doc_id, doc_vec in docs_tfidf:
    try:
        # NOTE(review): `vec` is only probed to detect an existing entry;
        # its value is never used.
        vec = doc_vec_db[doc_id]
    except KeyError:
        doc_vec_db[doc_id] = DictVector(doc_vec)
doc_vec_db.commit()
doc_vec_db.close()

# Convert document to vector, then upload
def upload_doc_vec(doc):
    #TODO Maybe implement this if time permits
    return

@app.route("/docs", methods=["POST"])
def upload_doc():
    # Parses the POSTed JSON body into a document record and stores it in docs.db.
    doc_db = SqliteDict(os.path.join(db_path, "docs.db"))
    data = json.loads(request.data)
    doc = {