def word_list():
    """Return the stored words as JSON, replacing them first on PUT.

    On a PUT request the replacement list is read from the ``word_list``
    form field or, when that is absent or empty, from the ``word_list`` key
    of the JSON body (parsed regardless of content type). A non-empty list
    deletes every stored ``Word`` and saves one ``Word`` per submitted
    entry. A missing or empty list leaves the stored words untouched
    instead of failing on ``len(None)``.

    Returns:
        A JSON response holding the ``word`` attribute of every stored
        ``Word``, ordered by ``_id``.
    """
    print(flask.request.method)
    if flask.request.method == 'PUT':
        submitted = flask.request.form.get('word_list')
        if not submitted:
            payload = flask.request.get_json(force=True)
            # Only a JSON object can carry a ``word_list`` key; any other
            # body (null, list, scalar) is treated as "no list supplied".
            submitted = (
                payload.get('word_list') if isinstance(payload, dict) else None
            )

        # Materialise before deleting so a non-iterable value raises while
        # the stored words are still intact.
        # NOTE(review): a form-supplied value is presumably a single string,
        # in which case this yields one entry per character, as the original
        # loop did. Confirm whether ``form.getlist`` was intended.
        new_words = list(submitted) if submitted else []

        if new_words:
            # Remove all words, then create the new word list.
            Word.objects().delete()
            for word in new_words:
                Word(word=word).save()

    return flask.jsonify(
        [word.word for word in Word.objects.all().order_by('_id')])
def calc_sim(self, document, query_document):
    """Score how similar ``document`` is to ``query_document``.

    For each entry of ``query_document.words`` that has a matching ``Word``
    record, the weight of that word in both documents is obtained from
    ``self.calc_weight``. The result is the sum of the weight products
    divided by the product of the square roots of the summed squared
    weights, with 0.0001 added to the denominator so it is never zero.

    Args:
        document: The document being scored.
        query_document: The query; its ``words`` attribute is iterated.

    Returns:
        The similarity score; ``0.0`` when no query word has a ``Word``
        record.
    """
    dot_product = 0
    document_norm_sq = 0
    query_norm_sq = 0

    for term in query_document.words:
        query_word = Word.objects(text=term).first()
        if query_word:
            # Weight in the document first, then in the query, matching the
            # original call order.
            document_weight = self.calc_weight(query_word, document)
            query_weight = self.calc_weight(query_word, query_document)

            dot_product += document_weight * query_weight
            document_norm_sq += document_weight ** 2
            query_norm_sq += query_weight ** 2

    # The small constant keeps the division defined when either norm is 0.
    denominator = (
        math.sqrt(document_norm_sq) * math.sqrt(query_norm_sq)
    ) + 0.0001
    return dot_product / denominator
while True: query_text = input("Search:") query_hash = text_processor.hash(query_text) query_words = text_processor.tokenize(query_text) print("Words:", query_words) query_document = Document(hash=query_hash, text=query_text, words=query_words) documents = [] for word_text in query_document.words: word = Word.objects(text=word_text).first() if not word: print("Word {word} not found!".format(word=word_text)) continue print("[{word}] - {count} documents found.".format( word=word_text, count=len(word.documents))) documents.extend(word.documents) documents = set(documents) top_documents = [] for document in tqdm(documents, total=len(documents)): sim_document_query = text_processor.calc_sim(document, query_document) if sim_document_query > 0: