-
Notifications
You must be signed in to change notification settings - Fork 0
/
comparison.py
83 lines (70 loc) · 3.17 KB
/
comparison.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# # nltk.download()
# # nltk.download('wordnet')
# from itertools import product
# from nltk.corpus import wordnet as wn
# # nltk.download(wordnet, 'C:\\Users\\Abdul\\AppData\\Local\\Programs\\Python\\Python36-32\\nltk_data')
import numpy
from PyDictionary import PyDictionary
import random
# Open both sample documents read-only; the handles are consumed further down
# in this script by key_stats().
file_0, file_2 = (
    open(sample_path, "r")
    for sample_path in (
        "comparison_nlp/nlp/sample_text.txt",
        "comparison_nlp/nlp/sample_text_2.txt",
    )
)
# print(wn.synsets("ball")[0].lemma_names())
def key_stats(file_1):
    """Return the words that occur unusually often in a text.

    Each whitespace-separated token has surrounding punctuation removed and
    is lower-cased. Tokens that end up empty, or that are in the built-in
    list of very common English words, are ignored. A remaining word is
    reported when its count is at least ``mean + 0.2 * std`` of all counts.

    Args:
        file_1: Iterable yielding lines of text, e.g. an open text file.

    Returns:
        A list of ``[word, count]`` pairs in order of first appearance.
        The list is empty when the input contains no countable words.
    """
    # Stored lower-case because tokens are lower-cased before the lookup
    # (an upper-case 'I' entry could never match).
    common_words = frozenset([
        'a', 'about', 'all', 'is', 'also', 'and', 'as', 'at', 'be', 'because',
        'but', 'by', 'can', 'come', 'could', 'day', 'do', 'even', 'find',
        'first', 'for', 'from', 'get', 'give', 'go', 'have', 'he', 'her',
        'here', 'him', 'his', 'how', 'i', 'if', 'in', 'into', 'it', 'its',
        'just', 'know', 'like', 'look', 'make', 'man', 'many', 'me', 'more',
        'my', 'new', 'no', 'not', 'now', 'of', 'on', 'one', 'only', 'or',
        'other', 'our', 'out', 'people', 'say', 'see', 'she', 'so', 'some',
        'take', 'tell', 'than', 'that', 'the', 'their', 'them', 'then',
        'there', 'these', 'they', 'thing', 'think', 'this', 'those', 'time',
        'to', 'two', 'up', 'use', 'very', 'want', 'way', 'we', 'well', 'what',
        'when', 'which', 'who', 'will', 'with', 'would', 'year', 'you', 'your',
    ])
    # Strip all of these in a single call: stripping them one at a time in a
    # fixed order leaves punctuation behind on tokens such as '"word."'.
    strip_chars = ',.?-:"\'[]'

    words = {}
    for line in file_1:
        for token in line.split():
            word = token.strip(strip_chars).lower()
            # Skip tokens that were nothing but punctuation, and stop words.
            if not word or word in common_words:
                continue
            words[word] = words.get(word, 0) + 1

    if not words:
        # Nothing to rank; also avoids numpy's mean-of-empty warning / NaN.
        return []

    list_numbers = list(words.values())
    threshold = numpy.mean(list_numbers) + 0.2 * numpy.std(list_numbers)
    return [[word, count] for word, count in words.items() if count >= threshold]
def text_comparison(output_1, output_2):
    """Score how strongly two key-word lists overlap.

    Every pair of words (one from each list) in which one word is a
    substring of the other contributes the sum of both counts to the score.
    The score is then divided by the total of all counts in both lists.
    Because a word may match several words of the other list, the result is
    not limited to 1.

    Args:
        output_1: Sequence of ``[word, count]`` entries, as returned by
            ``key_stats``.
        output_2: Second sequence of ``[word, count]`` entries.

    Returns:
        The overlap score, or ``0.0`` when the counts of both lists add up
        to zero (e.g. both lists are empty).
    """
    sum_index = (
        sum(entry[1] for entry in output_1)
        + sum(entry[1] for entry in output_2)
    )
    if not sum_index:
        # No counted words at all: report "no overlap" rather than dividing
        # by zero.
        return 0.0

    proximity_index = 0
    for entry in output_1:
        for other_entry in output_2:
            if entry[0] in other_entry[0] or other_entry[0] in entry[0]:
                proximity_index += entry[1] + other_entry[1]
    return proximity_index / sum_index
# Extract the prominent words of each sample document and make sure both
# file handles are released once they have been read.
try:
    t = key_stats(file_0)
    b = key_stats(file_2)
finally:
    file_0.close()
    file_2.close()
print(text_comparison(t, b))

# def respond(wordList):
#     output = ""
#     for word in wordList:
#         output = (output + " " + (random.choice(dictionary.synonym(word, "html.parser"))))
#     return output

# Look up synonyms for a fixed set of sample words. (An argument-less
# PyDictionary() that used to be created first was overwritten before any
# use, so it has been dropped.)
dictionary = PyDictionary("hotel", "ambush", "joy", "perceptive")
print(dictionary.getSynonyms())
# print(respond(['chicken', 'food']))