Example no. 1
def apply_tfidf_to_connections(graph, class_visitors):

    edges = graph.edges()

    tf_idf = TfIdf()
    for src, dst in edges:
        source = class_visitors[src].get_merge_of_entities()
        destination = class_visitors[dst].get_merge_of_entities()

        similarity = round(tf_idf.apply_tfidf_to_pair(source, destination), 2)
        logging.info(f"{similarity} {src} - {dst}")

        graph[src][dst][str(WeightType.TF_IDF)] = similarity
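
The TfIdf class used above is defined elsewhere in the project. A minimal stand-in for apply_tfidf_to_pair, assuming it returns the cosine similarity of the two entities' tf-idf vectors (scikit-learn is used here purely for illustration), could look like this:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class TfIdf:
    """Illustrative stand-in; the real class is not shown in this excerpt."""

    def apply_tfidf_to_pair(self, source, destination):
        # Vectorize the two entity texts and return the cosine similarity of their tf-idf vectors.
        matrix = TfidfVectorizer().fit_transform([source, destination])
        return cosine_similarity(matrix[0], matrix[1])[0][0]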
Example no. 2
    def load_from_model(self, model_name):
        """
        The method loads the individual classification-class dictionaries from the given model and starts the GUI, waits for the button press, and then classifies the entered sentence.
        :param model_name: the model from which the individual data should be loaded.
        """

        with open(model_name, "r") as read_file:
            json_load = json.load(read_file)
            if json_load["namepriz"] == "BagOfWords":
                self.priz_metoda = BagOfWords()
            elif json_load["namepriz"] == "TfIdf":
                self.priz_metoda = TfIdf()
            elif json_load["namepriz"] == "NGram":
                self.priz_metoda = NGram()

            self.priz_metoda.words = json_load["words"]
            self.priz_metoda.klas_tridy = json_load["klas_tridy"]
            self.priz_metoda.prior = json_load["prior"]

            if json_load["nameklas"] == "NaiveBayes":
                self.klasifikator = NaiveBayes(self.priz_metoda)
            elif json_load["nameklas"] == "NN":
                self.klasifikator = NN(self.priz_metoda)

        self.top.title("Classify")
        self.top.geometry('400x300')
        buttonCommit = Button(self.top, height=1, width=10, text="Commit",
                              command=lambda: self.retrieve_input())
        self.text1.pack()
        buttonCommit.pack()
        self.label.pack()
        self.top.mainloop()
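
The model file itself is only visible through the keys read above. A hypothetical file with that layout (the value structures are assumptions) could be produced like this:

import json

# Hypothetical model file matching the keys read by load_from_model above;
# the value structures depend on BagOfWords/TfIdf/NGram and are assumed here.
model = {
    "namepriz": "TfIdf",        # feature method: "BagOfWords", "TfIdf" or "NGram"
    "nameklas": "NaiveBayes",   # classifier: "NaiveBayes" or "NN"
    "words": {},                # vocabulary statistics (structure assumed)
    "klas_tridy": [],           # classification classes (structure assumed)
    "prior": {},                # class priors (structure assumed)
}

with open("model.json", "w") as f:
    json.dump(model, f)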
Example no. 3
def main(preprocessed_node_path, argument_path, dictionary_path, tfidf_path):
    preprocessed_node_path = Path(preprocessed_node_path)
    argument_path = Path(argument_path)
    dictionary_path = Path(dictionary_path)
    tfidf_path = Path(tfidf_path)

    #argument_generator_getter = lambda: utils.load(argument_path)

    #argument_nodes_ids = set((
    #        node_id 
    #        for argument in argument_generator_getter()
    #        for node_id in argument[0].values())) 

    # Use the set of ids to select only the relevant nodes
    # (and not train nlp models on all documents).
    #preprocessed_node_generator_getter = lambda : filter(
    #        lambda node: node['id'] in argument_nodes_ids,
    #        utils.load( preprocessed_node_path))

    dictionary = pkl.load(dictionary_path.open('rb'))

    #tfidf = text.fit_tfidf(preprocessed_node_generator_getter,
    #        dictionary,
    #        verbose = True)
    tfidf = TfIdf()
    tfidf.fit(dictionary.dictionary)
    tfidf.save(tfidf_path)
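
The command-line parsing that feeds main is not part of this excerpt; a sketch of argparse wiring consistent with the parameter names above (assuming main receives plain strings) might be:

import argparse

# Hypothetical entry point; the real one is not shown in this excerpt.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('preprocessed_node_path')
    parser.add_argument('argument_path')
    parser.add_argument('dictionary_path')
    parser.add_argument('tfidf_path')
    args = parser.parse_args()
    main(args.preprocessed_node_path, args.argument_path,
         args.dictionary_path, args.tfidf_path)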
Example no. 4
    def __init__(self):
        self.question = ''
        self.response = ''
        self.sentence = ''
        self.question_type = ''

        self.query_process = QueryProcess()

        with open("Design-History.txt", 'r') as d:
            document = d.readlines()

        replacer = RegexReplacer()

        document = replacer.replace("".join(document))
        documents = document.split('@')
        self.documents = [d.strip('\n').lower() for d in documents]

        tokenizer = RegexpTokenizer(
            r"[\d-]+\w+|[A-Z][.A-Z]+\b\.*|[\w\-\']+|'.*'")

        self.documents_tokens = [tokenizer.tokenize(d) for d in self.documents]

        tf = TfIdf()
        self.tfidf = tf.tfidf(self.documents_tokens)
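
For reference, the tokenizer pattern above keeps hyphenated numbers, dotted acronyms, hyphenated or apostrophized words, and quoted spans as single tokens; it can be exercised on a made-up sentence:

from nltk.tokenize import RegexpTokenizer

# Demonstration of the tokenizer pattern used in __init__ above (the sentence is made up).
tokenizer = RegexpTokenizer(r"[\d-]+\w+|[A-Z][.A-Z]+\b\.*|[\w\-\']+|'.*'")
tokens = tokenizer.tokenize("the U.S.A. design review on 2021-03 covered 'as built' drawings")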
Example no. 5
    def load_from_parametres(self):
        """
        The method checks the input parameters and assigns them to the appropriate variables.
        """
        if os.path.isfile(sys.argv[1]):
            self.classif = sys.argv[1]
        else:
            print("Zadany parametr pro klasifikacni tridy neni souborem")
            sys.exit(-1)

        if os.path.isfile(sys.argv[2]):
            self.tran_file = sys.argv[2]
        else:
            print("Zadany parametr pro trenovaci mnozinu neni souborem")
            sys.exit(-1)

        if os.path.isfile(sys.argv[3]):
            self.test_file = sys.argv[3]
        else:
            print("Zadany parametr pro testovaci mnozinu neni souborem")
            sys.exit(-1)

        if sys.argv[4] == "bow":
            self.priz_metoda = BagOfWords()
        elif sys.argv[4] == "tfidf":
            self.priz_metoda = TfIdf()
        elif sys.argv[4] == "ngram":
            self.priz_metoda = NGram()
        else:
            print("Neznama priznakova metoda")
            sys.exit(-1)

        if sys.argv[5] == "bayes":
            self.klasifikator = NaiveBayes(self.priz_metoda)
        elif sys.argv[5] == "nn":
            self.klasifikator = NN(self.priz_metoda)
        else:
            print("Neznamy klasifikator")
            sys.exit(-1)

        self.modelname = sys.argv[6]
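
The method expects six positional arguments: a classification-classes file, a training file, a test file, a feature method (bow, tfidf or ngram), a classifier (bayes or nn), and a model name. A hypothetical invocation (the script and file names are placeholders):

python classifier.py classes.txt train.txt test.txt tfidf bayes model.json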
Example no. 6
    def getData(self):
        for relation in self.data.keys():
            for word in self.data[relation]:
                '''Try 3 times with 3 different methods to find the word in DBpedia'''
                '''1st try || capitalize each word, because in DBpedia the word needs to be capitalized'''
                results = DBpedia.Query(word[0].capitalize())
                if not results["results"]["bindings"]:
                    ''' 2nd try || the word 'Bass' doesn't exist, but Bass_(fish) does;
                       append '_(term)' to the end of the word '''
                    results = DBpedia.Query(word[0].capitalize() + "_(" +
                                            self.term + ")")

                    if not results["results"]["bindings"]:
                        '''3rd try || the word schrod doesn't exist, but Scrod (the same word) does;
                           use wikipedia to correct the word '''
                        wikiTerms = wikipedia.search(word[0].capitalize())
                        #replace " " with "_" because the words in DBpedia are seperated by this char "_"
                        wikiWord = wikiTerms[0].replace(" ", "_")
                        results = DBpedia.Query(wikiWord)
                        '''If the word can't be found in DBpedia, the weight is None'''
                        if not results["results"]["bindings"]:
                            if relation in self.DBdata:
                                self.DBdata[relation].append([word[0], None])
                            else:
                                self.DBdata[relation] = []
                                self.DBdata[relation].append([word[0], None])

                #add the data to DBdata dictionary
                for result in results["results"]["bindings"]:
                    if relation in self.DBdata:
                        self.DBdata[relation].append(
                            [word[0], result["abstract"]["value"]])
                    else:
                        self.DBdata[relation] = []
                        self.DBdata[relation].append(
                            [word[0], result["abstract"]["value"]])
        '''Uses TfIdf class to compute tf-idf scores and CosineSimilarity on 
            the CommentBoxes that we retrieve from DBpedia'''
        TfIdf_CosineSimilarityData = TfIdf(self.DBdata, self.term).getData()
        return TfIdf_CosineSimilarityData
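
The DBpedia helper used in getData is not shown. Assuming it wraps the public SPARQL endpoint and returns the JSON result whose results["results"]["bindings"] / result["abstract"]["value"] shape is accessed above, a minimal stand-in via SPARQLWrapper could be:

from SPARQLWrapper import SPARQLWrapper, JSON

class DBpedia:
    """Illustrative stand-in for the DBpedia helper; the real one is not in this excerpt."""

    @staticmethod
    def Query(resource):
        # Fetch the English abstract of the given DBpedia resource.
        sparql = SPARQLWrapper("http://dbpedia.org/sparql")
        sparql.setQuery(
            'SELECT ?abstract WHERE { '
            '<http://dbpedia.org/resource/' + resource + '> '
            '<http://dbpedia.org/ontology/abstract> ?abstract . '
            'FILTER (lang(?abstract) = "en") }')
        sparql.setReturnFormat(JSON)
        return sparql.query().convert()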
Example no. 7
            print "False Positive - ", falsep
            print "False Negative - ", falsen
            precision = truep * 1.0 / (truep + falsep)
            recall = truep * 1.0 / (truep + falsen)
            f_score = (precision * recall * 2) / (precision + recall)
            accuracy = (truep + truen) * 1.0 / len(y)
            print "Precision is", precision
            print "Recall is", recall
            print "F Score is", f_score
            print "Accuracy is ", accuracy

# Launch Codes

# Step 1 - Data is read from file "duplicate_sample.in" in same directory
ld = LoadData()
tfidf = TfIdf()
# Step 2 - Verify correct loading
ld.load_statistics()
# Step 3 - Parse questions (stop word removal, stemming)
ld.parse_questions()
# Step 4 - Create tf-idf matrix for all documents
tfidf.create_tfidf_matrix(ld.get_rawsamples())
tfidf.create_tfidf_topics(ld.get_rawsamples())

# ML Method
if not os.path.isfile("lsi_scores_train.txt"):  # Checking if LSI scores are already stored in file!
    print "Writing LSI Scores to files"
    ld.write_lsi()
if not os.path.isfile("lda_scores_train.txt"):  # Checking if LDA scores are already stored in file!
    print "Writing LDA Scores to files"
    ld.write_lda()
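
For reference, the metrics printed above reduce to precision = TP / (TP + FP), recall = TP / (TP + FN), F1 = 2PR / (P + R) and accuracy = (TP + TN) / N; a small check with made-up counts:

# Worked example of the metrics computed above, with made-up counts.
truep, truen, falsep, falsen = 80, 90, 20, 10
precision = truep * 1.0 / (truep + falsep)                            # 80 / 100 = 0.80
recall = truep * 1.0 / (truep + falsen)                               # 80 / 90  ~ 0.889
f_score = (precision * recall * 2) / (precision + recall)             # ~ 0.842
accuracy = (truep + truen) * 1.0 / (truep + truen + falsep + falsen)  # 170 / 200 = 0.85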
Example no. 8
        entities_column = 1
        entity_id_dict = {}
        for row in cursor:
            count_of_id = row[count_of_id_column]
            entity = row[entities_column]
            entity_id_dict[entity] = count_of_id

        return entity_id_dict

    def get_total_documents(self):
        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        query = 'select count(distinct(id)) from "IdEntity" '
        cursor.execute(query)
        count_of_distinct_id_column = 0
        total_documents_count = 0
        for row in cursor:
            total_documents_count = row[count_of_distinct_id_column]

        return total_documents_count


###############
EntityIdIndexObj = EntityIdIndexer()
entity_count_id_dict = EntityIdIndexObj.build_tf()
total_documents_count = EntityIdIndexObj.get_total_documents()
TfIdfObj = TfIdf()
entity_tfidf_obj = TfIdfObj.computeTfIdf(entity_count_id_dict,
                                         total_documents_count)
TfIdfObj.write_to_db(entity_tfidf_obj)
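
TfIdf.computeTfIdf is not shown here. Given an entity-to-document-count mapping and the total number of documents, a minimal stand-in under the assumption that it applies an inverse-document-frequency style weighting could be:

import math

# Hypothetical stand-in for TfIdf.computeTfIdf as called above; the real implementation may differ.
def compute_tfidf(entity_count_id_dict, total_documents_count):
    return {
        entity: math.log(float(total_documents_count) / count)
        for entity, count in entity_count_id_dict.items()
    }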
Example no. 9
csvFile2 = tfList.createFile("tf-idf_list.csv")
csvWriter1 = csv.writer(csvFile1, delimiter=',', quotechar='|')
csvWriter2 = csv.writer(csvFile2, delimiter=',', quotechar='|')

tfCloud = TfCloud()

txtStr = txt1File.read()
txtStr = txtStr.lower()
docStr = doc1File.read()
docStr = docStr.lower()

pdfStr = pdf1.convert_pdf_to_txt()
pdfStr = pdfStr.lower()


tfidf = TfIdf()


# Stopwords
stopwords = set(stopwords.words('english'))
academicStopwords = set(line.strip() for line in open('acStopWords.txt'))
academicStopwords = academicStopwords.union(set(['mr','mrs','one','two','said']))
words1 = word_tokenize(txtStr)
words2 = word_tokenize(docStr)
words3 = word_tokenize(pdfStr)

files = [words1, words2, words3]
wordsFiltered = []
wordcount = {}

for file in files:
Example no. 10
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

if __name__ == '__main__':
	startTime = datetime.datetime.now()  
	documentList = ["./texts/t11.txt","./texts/t22.txt"] 
	# documentList = ["./texts/test_shak1.txt"] 
	# documentList = ["./texts/shak.txt"] 
	totalDocs = len(documentList) 
	# Add language check on init and load correct stopwords list   
	stopList = stopwords.words('english') 
	# Init weighting libraries 
	TfIdf = TfIdf(documentList, stopList) 
	LSI = LSI(documentList, stopList) 
	LDA = LDA(documentList, stopList) 
	# Loop to get this argument 
	print "Ready " 
	while 1:
		try:
			line = sys.stdin.readline()
			print (TfIdf.runQuery(line)) 
			print (LSI.runQuery(line)) 
			print (LDA.runQuery(line)) 
		except KeyboardInterrupt:
			break
		if not line:
			break 



Example no. 12
    def test_similarity(self):
        table = TfIdf()
        # Training corpus: three documents
        table.add_document("doc1", [
            "The", "game", "of", "life", "is", "a", "game", "of",
            "everlasting", "learning"
        ])
        table.add_document(
            "doc2",
            ["The", "unexamined", "life", "is", "not", "worth", "living"])
        table.add_document("doc3", ["Never", "stop", "learning"])

        table.calculate_tf()
        table.calculate_idf()
        table.calculate_tf_idf()

        sims = table.similarities(["life", "learning"])
        return sims
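
The tf and idf conventions of this TfIdf class are not visible here. Under the common convention tf = count / len(doc) and idf = 1 + log(N / df), the per-document weights of the query terms can be checked by hand:

import math

# Hand computation for the query terms over the three documents above,
# assuming tf = count/len(doc) and idf = 1 + log(N/df); the class may use other conventions.
docs = {
    "doc1": ["The", "game", "of", "life", "is", "a", "game", "of",
             "everlasting", "learning"],
    "doc2": ["The", "unexamined", "life", "is", "not", "worth", "living"],
    "doc3": ["Never", "stop", "learning"],
}
N = len(docs)
for term in ["life", "learning"]:
    df = sum(term in doc for doc in docs.values())
    idf = 1 + math.log(float(N) / df)
    for name, doc in docs.items():
        print(name, term, round(doc.count(term) / float(len(doc)) * idf, 4))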
Example no. 13
    def tfidf(self):
        print("Starting Baseline run...")
        tfidf = TfIdf()
        tfidf.index_folder_location = self.index_folder
        tfidf.loadIndex()
        tfidf.processQueries()
        tfidf.fetchInvertedList()
        tfidf.calculateDocumentLength()

        for query_id, query in tfidf.queries.items():
            querylist = query.split()
            tfidf.computeScore(querylist)
            tfidf.saveResults(query_id, 'TfIdfModel')
        print("Baseline run completed successfully!")

        print("Starting run for stemmed corpus...")
        tfidf = TfIdf()
        tfidf.index_folder_location = self.stemmed_index_folder
        tfidf.loadIndex()
        tfidf.processStemmedQueries()
        tfidf.fetchInvertedList()
        tfidf.calculateDocumentLength()

        for query_id, query in tfidf.queries.items():
            querylist = query.split()
            tfidf.computeScore(querylist)
            tfidf.saveResults(query_id, 'TfIdfModel_Stemmed')

        print("Run for stemmed corpus completed successfully!")

        print("Starting run for stopping with no stemming...")
        tfidf = TfIdf()
        tfidf.stopping_required = True
        tfidf.index_folder_location = self.stopped_index_folder
        tfidf.loadIndex()
        tfidf.processQueries()
        tfidf.fetchInvertedList()
        tfidf.calculateDocumentLength()

        for query_id, query in tfidf.queries.items():
            querylist = query.split()
            tfidf.computeScore(querylist)
            tfidf.saveResults(query_id, 'TfIdfModel_Stopped')
        print("Run for stopping with no stemming completed successfully!")
Example no. 14
    def __init__(self, query, documents):
        self.query = query
        self.documents = documents
        ti = TfIdf()
        self.tfidf = ti.tfidf(self.documents)