# NOTE(review): this chunk begins mid-function; the enclosing `def` line is
# outside the visible source.  The signature below is inferred from the free
# names used in the body (docs, topics, passes, save_filename) and from the
# parallel LSA() helper — TODO confirm against the full file.
def LDA(docs, topics, passes, save_filename):
    """Fit an LDA topic model over *docs*, save it, and return its parts.

    Returns the (model, corpus, dictionary) triple produced by
    LatentDirichletAllocation.compute; the same triple is also persisted
    to *save_filename* via save_to_file.
    """
    lda = LatentDirichletAllocation(docs, workers=3)
    model, corpus, dictionary = lda.compute(topics, passes)
    lda.save_to_file(save_filename, model, corpus, dictionary)
    return model, corpus, dictionary


def LSA(docs, topics, save_filename):
    """Fit an LSA/LSI topic model over *docs*.

    Delegates entirely to LatentSemanticAnalyser.compute and returns
    whatever it yields; *save_filename* is forwarded to it.
    """
    lsi = LatentSemanticAnalyser(docs)
    return lsi.compute(topics, save_filename)


# --- script: pull all questions for one community out of the database ---
dbmg = DatabaseHelper(connection_string)
# NOTE(review): presumably get_community_id returns a value usable directly as
# the query-parameter argument of select_query — confirm whether the helper
# expects a bare value or a tuple.
questions = dbmg.select_query("""select * from question join forum_details on question.forum_details_id = forum_details.forum_details_id where community_id = %s """, dbmg.get_community_id('Business'), fetch_to_dict=True)
dbmg.close()

questions_contents = []
for question in questions:
    # NOTE(review): the loop body lies past this chunk; given the
    # `questions_contents` accumulator just above, it presumably collects
    # question['content'] — confirm against the full file.
    questions_contents.append(question['content'])
# NOTE(review): this chunk begins mid-method; the enclosing `def` (which
# supplies `self` and `working_docs`) is outside the visible source, and the
# statement nesting below is reconstructed from a flattened original —
# confirm against the full file.
#
# For each working document, write the document and every other known
# document whose word-vector similarity exceeds 0.95 to a per-run result
# file.  `with` guarantees the file is closed (the original leaked the
# handle by never calling close()).
with open("similarity_res_th_{}.txt".format(self.__id), 'w+') as out_file:
    for doc in working_docs:
        out_file.write("NEW_QUESTION" + doc + "\n")
        for other_doc in self.__docs:
            if doc != other_doc:
                # Restrict both token sets to words the model knows;
                # n_similarity fails on out-of-vocabulary tokens.
                s1 = set(doc.split()).intersection(self.__model.wv.vocab)
                s2 = set(other_doc.split()).intersection(self.__model.wv.vocab)
                similarity = self.__model.n_similarity(s1, s2)
                if similarity > 0.95:
                    print(similarity)
                    out_file.write(str(similarity) + other_doc + "\n")

# --- script: load questions and build TaggedDocuments for doc2vec training ---
connect_string = "dbname=uoa-nlp user=admin"
db = DatabaseHelper(connect_string)
questions_db = db.select_query("select question_id, content from question", None, fetch_to_dict=True)

questions_content = []
tagged_questions = []
tokenizer = InputPreprocessor(None)

# Tokenize the data before applying the model; rows with NULL content are
# skipped entirely.
for question in questions_db:
    if question['content'] is not None:
        questions_content.append(question['content'])
        tokens = tokenizer.tokenize(question['content'])
        # BUG FIX: the row key is 'question_id' (see the SELECT above);
        # the original 'questiony_id' raised KeyError on every row.
        tagged_questions.append(doc2vec.TaggedDocument(words=tokens, tags=[question['question_id']]))