def apply_tfidf_to_connections(graph, class_visitors):
    # Annotate each edge with the TF-IDF similarity of its endpoint classes.
    edges = graph.edges()
    tf_idf = TfIdf()
    for src, dst in edges:
        source = class_visitors[src].get_merge_of_entities()
        destination = class_visitors[dst].get_merge_of_entities()
        similarity = round(tf_idf.apply_tfidf_to_pair(source, destination), 2)
        logging.info(f"{similarity} {src} - {dst}")
        graph[src][dst][str(WeightType.TF_IDF)] = similarity
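
# The TfIdf class above is project-specific. A minimal sketch of what an
# apply_tfidf_to_pair(source, destination) could compute, assuming the two
# arguments are token lists and the score is the cosine similarity of their
# TF-IDF vectors over a two-document corpus (all names below are hypothetical):
import math
from collections import Counter

def apply_tfidf_to_pair_sketch(source_tokens, dest_tokens):
    docs = [Counter(source_tokens), Counter(dest_tokens)]
    vocab = set(source_tokens) | set(dest_tokens)
    # Smoothed idf (sklearn-style), idf = ln((1 + N) / (1 + df)) + 1 with
    # N = 2, so terms shared by both documents still carry weight.
    idf = {t: math.log(3 / (1 + sum(1 for d in docs if t in d))) + 1
           for t in vocab}
    vecs = [{t: d[t] * idf[t] for t in vocab} for d in docs]
    dot = sum(vecs[0][t] * vecs[1][t] for t in vocab)
    norms = [math.sqrt(sum(v * v for v in vec.values())) for vec in vecs]
    return dot / (norms[0] * norms[1]) if all(norms) else 0.0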
def load_from_model(self, model_name):
    """
    Loads the individual classification-class dictionaries from the given
    model, starts the GUI, waits for the button press and then classifies
    the entered sentence.

    :param model_name: the model from which the data should be loaded.
    """
    with open(model_name, "r") as read_file:
        json_load = json.load(read_file)
    if json_load["namepriz"] == "BagOfWords":
        self.priz_metoda = BagOfWords()
    elif json_load["namepriz"] == "TfIdf":
        self.priz_metoda = TfIdf()
    elif json_load["namepriz"] == "NGram":
        self.priz_metoda = NGram()
    self.priz_metoda.words = json_load["words"]
    self.priz_metoda.klas_tridy = json_load["klas_tridy"]
    self.priz_metoda.prior = json_load["prior"]
    if json_load["nameklas"] == "NaiveBayes":
        self.klasifikator = NaiveBayes(self.priz_metoda)
    elif json_load["nameklas"] == "NN":
        self.klasifikator = NN(self.priz_metoda)
    self.top.title("Classify")
    self.top.geometry('400x300')
    buttonCommit = Button(self.top, height=1, width=10, text="Commit",
                          command=lambda: self.retrieve_input())
    self.text1.pack()
    buttonCommit.pack()
    self.label.pack()
    self.top.mainloop()
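
# Judging from the keys read above, the model file is JSON shaped roughly like
# the dict below. The values are hypothetical placeholders; only the key names
# and the accepted "namepriz"/"nameklas" strings come from the loader itself:
example_model = {
    "namepriz": "TfIdf",        # or "BagOfWords" / "NGram"
    "nameklas": "NaiveBayes",   # or "NN"
    "words": {},                # per-class word dictionaries (shape assumed)
    "klas_tridy": [],           # classification classes (shape assumed)
    "prior": {},                # class priors (shape assumed)
}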
def main(preprocessed_node_path, argument_path, dictionary_path, tfidf_path):
    # Wrap the passed-in arguments as Paths (the original referenced an
    # undefined `args` namespace here instead of the parameters).
    preprocessed_node_path = Path(preprocessed_node_path)
    argument_path = Path(argument_path)
    dictionary_path = Path(dictionary_path)
    tfidf_path = Path(tfidf_path)

    # argument_generator_getter = lambda: utils.load(argument_path)
    # argument_nodes_ids = set(
    #     node_id
    #     for argument in argument_generator_getter()
    #     for node_id in argument[0].values())
    # Use the set of ids to select only the relevant nodes
    # (and not train nlp models on all documents).
    # preprocessed_node_generator_getter = lambda: filter(
    #     lambda node: node['id'] in argument_nodes_ids,
    #     utils.load(preprocessed_node_path))

    dictionary = pkl.load(dictionary_path.open('rb'))
    # tfidf = text.fit_tfidf(preprocessed_node_generator_getter,
    #                        dictionary,
    #                        verbose=True)
    tfidf = TfIdf()
    tfidf.fit(dictionary.dictionary)
    tfidf.save(tfidf_path)
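
# A sketch under the assumption that `dictionary.dictionary` is a gensim
# corpora.Dictionary: gensim's TfidfModel can build its IDF table directly
# from a Dictionary's stored document frequencies without re-scanning the
# corpus, which may be what fit() does here.
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

def fit_tfidf_from_dictionary(dictionary: Dictionary) -> TfidfModel:
    # TfidfModel(dictionary=...) derives idf from dictionary.dfs.
    return TfidfModel(dictionary=dictionary)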
def __init__(self):
    self.question = ''
    self.response = ''
    self.sentence = ''
    self.question_type = ''
    self.query_process = QueryProcess()
    with open("Design-History.txt", 'r') as d:
        document = d.readlines()
    replacer = RegexReplacer()
    document = replacer.replace("".join(document))
    # Documents are delimited by '@' in the source file.
    documents = document.split('@')
    self.documents = [d.strip('\n').lower() for d in documents]
    tokenizer = RegexpTokenizer(
        r"[\d-]+\w+|[A-Z][.A-Z]+\b\.*|[\w\-\']+|'.*'")
    self.documents_tokens = [tokenizer.tokenize(d) for d in self.documents]
    tf = TfIdf()
    self.tfidf = tf.tfidf(self.documents_tokens)
def load_from_parametres(self):
    """
    Checks the command-line parameters and assigns them to the
    corresponding attributes.
    """
    if os.path.isfile(sys.argv[1]):
        self.classif = sys.argv[1]
    else:
        print("The classification-classes parameter is not a file")
        sys.exit(-1)
    if os.path.isfile(sys.argv[2]):
        self.tran_file = sys.argv[2]
    else:
        print("The training-set parameter is not a file")
        sys.exit(-1)
    if os.path.isfile(sys.argv[3]):
        self.test_file = sys.argv[3]
    else:
        print("The test-set parameter is not a file")
        sys.exit(-1)
    if sys.argv[4] == "bow":
        self.priz_metoda = BagOfWords()
    elif sys.argv[4] == "tfidf":
        self.priz_metoda = TfIdf()
    elif sys.argv[4] == "ngram":
        self.priz_metoda = NGram()
    else:
        print("Unknown feature-extraction method")
        sys.exit(-1)
    if sys.argv[5] == "bayes":
        self.klasifikator = NaiveBayes(self.priz_metoda)
    elif sys.argv[5] == "nn":
        self.klasifikator = NN(self.priz_metoda)
    else:
        print("Unknown classifier")
        sys.exit(-1)
    self.modelname = sys.argv[6]
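
# Expected invocation, reconstructed from the argv positions checked above
# (the script and file names are hypothetical):
#   python classifier.py classes.txt train.txt test.txt tfidf bayes model.json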
def getData(self):
    for relation in self.data.keys():
        for word in self.data[relation]:
            # Try three different methods to find the word in DBpedia.
            # 1st try: capitalize the word, because DBpedia resource
            # names are capitalized.
            results = DBpedia.Query(word[0].capitalize())
            if not results["results"]["bindings"]:
                # 2nd try: 'Bass' does not exist, but 'Bass_(fish)' does,
                # so append '_(term)' to the word.
                results = DBpedia.Query(
                    word[0].capitalize() + "_(" + self.term + ")")
            if not results["results"]["bindings"]:
                # 3rd try: 'schrod' does not exist, but 'Scrod' (the same
                # word) does; use Wikipedia search to correct the word.
                wikiTerms = wikipedia.search(word[0].capitalize())
                # Replace " " with "_" because words in DBpedia are
                # separated by "_".
                wikiWord = wikiTerms[0].replace(" ", "_")
                results = DBpedia.Query(wikiWord)
            # If the word cannot be found in DBpedia, its weight is None.
            if not results["results"]["bindings"]:
                if relation not in self.DBdata:
                    self.DBdata[relation] = []
                self.DBdata[relation].append([word[0], None])
            # Add the retrieved abstracts to the DBdata dictionary.
            for result in results["results"]["bindings"]:
                if relation not in self.DBdata:
                    self.DBdata[relation] = []
                self.DBdata[relation].append(
                    [word[0], result["abstract"]["value"]])
    # Use the TfIdf class to compute tf-idf scores and cosine similarity
    # on the comment boxes retrieved from DBpedia.
    TfIdf_CosineSimilarityData = TfIdf(self.DBdata, self.term).getData()
    return TfIdf_CosineSimilarityData
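
# DBpedia.Query is a project helper; judging from the keys accessed above, it
# returns the JSON of a SPARQL query selecting dbo:abstract. A sketch of such
# a helper with SPARQLWrapper (the endpoint and query wording are assumptions):
from SPARQLWrapper import SPARQLWrapper, JSON

def query_abstract(resource_name):
    sparql = SPARQLWrapper("https://dbpedia.org/sparql")
    sparql.setQuery(f"""
        SELECT ?abstract WHERE {{
            dbr:{resource_name} dbo:abstract ?abstract .
            FILTER (lang(?abstract) = 'en')
        }}
    """)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()  # -> {"results": {"bindings": [...]}}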
print "False Positive - ", falsep print "False Negative - ", falsen precision = truep * 1.0 / (truep + falsep) recall = truep * 1.0 / (truep + falsen) f_score = (precision * recall * 2) / (precision + recall) accuracy = (truep + truen) * 1.0 / len(y) print "Precision is", precision print "Recall is", recall print "F Score is", f_score print "Accuracy is ", accuracy # Launch Codes # Step 1 - Data is read from file "duplicate_sample.in" in same directory ld = LoadData() tfidf = TfIdf() # Step 2 - Verify correct loading ld.load_statistics() # Step 3 - Parse questions (stop word removal, stemming) ld.parse_questions() # Step 4 - Create tf-idf matrix for all documents tfidf.create_tfidf_matrix(ld.get_rawsamples()) tfidf.create_tfidf_topics(ld.get_rawsamples()) # ML Method if not os.path.isfile("lsi_scores_train.txt"): # Checking if LSI scores are already stored in file! print "Writing LSI Scores to files" ld.write_lsi() if not os.path.isfile("lda_scores_train.txt"): # Checking if LDA scores are already stored in file! print "Writing LDA Scores to files" ld.write_lda()
        entities_column = 1
        entity_id_dict = {}
        for row in cursor:
            count_of_id = row[count_of_id_column]
            entity = row[entities_column]
            entity_id_dict[entity] = count_of_id
        return entity_id_dict

    def get_total_documents(self):
        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        query = 'select count(distinct(id)) from "IdEntity"'
        cursor.execute(query)
        count_of_distinct_id_column = 0
        total_documents_count = 0
        for row in cursor:
            total_documents_count = row[count_of_distinct_id_column]
        return total_documents_count


###############
EntityIdIndexObj = EntityIdIndexer()
entity_count_id_dict = EntityIdIndexObj.build_tf()
total_documents_count = EntityIdIndexObj.get_total_documents()
TfIdfObj = TfIdf()
entity_tfidf_obj = TfIdfObj.computeTfIdf(entity_count_id_dict,
                                         total_documents_count)
TfIdfObj.write_to_db(entity_tfidf_obj)
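
# computeTfIdf above is project-specific. Assuming entity_count_id_dict maps
# each entity to the number of distinct documents containing it (its document
# frequency), a minimal sketch of an entity-level idf weighting could be
# (the function name and semantics here are assumptions):
import math

def compute_entity_idf(entity_doc_counts, total_documents):
    # idf = log(N / df); entities appearing in every document get weight 0.
    return {entity: math.log(total_documents / df)
            for entity, df in entity_doc_counts.items() if df > 0}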
csvFile2 = tfList.createFile("tf-idf_list.csv")
csvWriter1 = csv.writer(csvFile1, delimiter=',', quotechar='|')
csvWriter2 = csv.writer(csvFile2, delimiter=',', quotechar='|')
tfCloud = TfCloud()

txtStr = txt1File.read()
txtStr = txtStr.lower()
docStr = doc1File.read()
docStr = docStr.lower()
pdfStr = pdf1.convert_pdf_to_txt()
pdfStr = pdfStr.lower()  # str.lower() returns a new string; assign the result

tfidf = TfIdf()

# Stopwords: note this rebinds the name `stopwords`, shadowing the
# nltk.corpus.stopwords module from here on.
stopwords = set(stopwords.words('english'))
academicStopwords = set(line.strip() for line in open('acStopWords.txt'))
academicStopwords = academicStopwords.union(set(['mr', 'mrs', 'one', 'two', 'said']))

words1 = word_tokenize(txtStr)
words2 = word_tokenize(docStr)
words3 = word_tokenize(pdfStr)
files = [words1, words2, words3]
wordsFiltered = []
wordcount = {}
for file in files:
    # (snippet is truncated here in the source)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

if __name__ == '__main__':
    startTime = datetime.datetime.now()
    documentList = ["./texts/t11.txt", "./texts/t22.txt"]
    # documentList = ["./texts/test_shak1.txt"]
    # documentList = ["./texts/shak.txt"]
    totalDocs = len(documentList)

    # Add language check on init and load correct stopwords list
    stopList = stopwords.words('english')

    # Init weighting libraries (bound to lowercase names so the TfIdf, LSI
    # and LDA classes themselves are not shadowed by their instances)
    tfidf = TfIdf(documentList, stopList)
    lsi = LSI(documentList, stopList)
    lda = LDA(documentList, stopList)

    # Loop reading queries from stdin
    print("Ready ")
    while True:
        try:
            line = sys.stdin.readline()
            print(tfidf.runQuery(line))
            print(lsi.runQuery(line))
            print(lda.runQuery(line))
        except KeyboardInterrupt:
            break
        if not line:
            break
def test_similarity(self):
    table = TfIdf()
    # Training corpus: three documents
    table.add_document("doc1", [
        "The", "game", "of", "life", "is", "a", "game", "of",
        "everlasting", "learning"
    ])
    table.add_document(
        "doc2", ["The", "unexamined", "life", "is", "not", "worth", "living"])
    table.add_document("doc3", ["Never", "stop", "learning"])
    table.calculate_tf()
    table.calculate_idf()
    table.calculate_tf_idf()
    sims = table.similarities(["life", "learning"])
    return sims
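
# A worked check of the raw quantities for this corpus, assuming the classic
# definitions tf(t, d) = count(t, d) / len(d) and idf(t) = ln(N / df(t))
# (the TfIdf class here may use different variants):
#   "learning": df = 2 (doc1, doc3), idf = ln(3/2) ≈ 0.405
#     tf in doc1 = 1/10 = 0.1    -> tf-idf ≈ 0.0405
#     tf in doc3 = 1/3  ≈ 0.333  -> tf-idf ≈ 0.135
#   "life": df = 2 (doc1, doc2), idf ≈ 0.405
#     tf in doc1 = 1/10, tf in doc2 = 1/7 ≈ 0.143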
def tfidf(self):
    print("Starting Baseline run...")
    tfidf = TfIdf()
    tfidf.index_folder_location = self.index_folder
    tfidf.loadIndex()
    tfidf.processQueries()
    tfidf.fetchInvertedList()
    tfidf.calculateDocumentLength()
    for query_id, query in tfidf.queries.items():
        querylist = query.split()
        tfidf.computeScore(querylist)
        tfidf.saveResults(query_id, 'TfIdfModel')
    print("Baseline run completed successfully!")

    print("Starting run for stemmed corpus...")
    tfidf = TfIdf()
    tfidf.index_folder_location = self.stemmed_index_folder
    tfidf.loadIndex()
    tfidf.processStemmedQueries()
    tfidf.fetchInvertedList()
    tfidf.calculateDocumentLength()
    for query_id, query in tfidf.queries.items():
        querylist = query.split()
        tfidf.computeScore(querylist)
        tfidf.saveResults(query_id, 'TfIdfModel_Stemmed')
    print("Run for stemmed corpus completed successfully!")

    print("Starting run for stopping with no stemming...")
    tfidf = TfIdf()
    tfidf.stopping_required = True
    tfidf.index_folder_location = self.stopped_index_folder
    tfidf.loadIndex()
    tfidf.processQueries()
    tfidf.fetchInvertedList()
    tfidf.calculateDocumentLength()
    for query_id, query in tfidf.queries.items():
        querylist = query.split()
        tfidf.computeScore(querylist)
        tfidf.saveResults(query_id, 'TfIdfModel_Stopped')
    print("Run for stopping with no stemming completed successfully!")
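
# The three runs above differ only in the index folder, the query loader, the
# run label, and the stopping flag, so they could be collapsed into one helper
# (a sketch against the same TfIdf API used above; the helper name is new):
def run_tfidf(index_folder, run_label, stemmed=False, stopping=False):
    tfidf = TfIdf()
    tfidf.stopping_required = stopping
    tfidf.index_folder_location = index_folder
    tfidf.loadIndex()
    if stemmed:
        tfidf.processStemmedQueries()
    else:
        tfidf.processQueries()
    tfidf.fetchInvertedList()
    tfidf.calculateDocumentLength()
    for query_id, query in tfidf.queries.items():
        tfidf.computeScore(query.split())
        tfidf.saveResults(query_id, run_label)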
def __init__(self, query, documents):
    self.query = query
    self.documents = documents
    ti = TfIdf()
    self.tfidf = ti.tfidf(self.documents)