/
Ex3_LOC.py
83 lines (68 loc) · 2.71 KB
/
Ex3_LOC.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from collections import Counter
import nltk
from nltk.corpus import PlaintextCorpusReader
from Ex2 import preprocess, stem
from math import log
# Root directory of the corpus. File ids are appended to it by plain string
# concatenation (no path join), so the trailing slash is required.
corpusdir = 'corpus/corpus-gutenberg/'
class TextVector:
    """
    Vector representation of a single corpus document.

    Stores the document's file id together with the dictionary that
    ``build_vectors`` returns for the file located at ``corpusdir + file_id``.
    """

    def __init__(self, file_id):
        """
        :param file_id: id of the document, relative to ``corpusdir``
        """
        self.file_id = file_id
        document_path = corpusdir + file_id
        self.words = build_vectors(document_path)
def build_vectors(text):
    """
    Build the weight dictionary of a single document.

    Every token is mapped to its relative frequency (occurrences divided by
    the total number of tokens in the document). Every bigram, keyed by its
    ``(first, second)`` tuple, is mapped to its raw occurrence count.

    :param text: path of the text file to read
    :return: dict mapping tokens to relative frequencies and bigram tuples
             to occurrence counts
    """
    # Plain text mode already applies universal newlines; the legacy 'rU'
    # flag is rejected by Python 3.11+. The context manager also guarantees
    # that the file handle is closed.
    with open(text, 'r') as source:
        raw = source.read()
    tokens = nltk.word_tokenize(raw)
    # Total number of tokens, used to normalise raw counts into frequencies.
    # An empty document yields no tokens, so no division takes place.
    amount_of_words = len(tokens)
    dictionary = {
        token: count / amount_of_words
        for token, count in Counter(tokens).items()
    }
    # most_common() yields (bigram, count) pairs: unpack them so that the
    # bigram itself is the key and its real count is the stored value.
    bigram_counts = nltk.FreqDist(nltk.bigrams(tokens))
    for bigram, count in bigram_counts.most_common():
        dictionary[bigram] = count
    return dictionary
class TextCollection:
    """
    Vector space built from every document found under ``corpusdir``.

    ``corpus`` is the NLTK plaintext reader over the corpus directory and
    ``Text_vectors`` holds one ``TextVector`` per file id it reports.
    """

    def __init__(self):
        # The reader is configured with the exercise 2 tokenizer.
        self.corpus = PlaintextCorpusReader(
            corpusdir,
            '.*/*',
            word_tokenizer=preprocess,
        )
        # One vector per document of the corpus, in fileids() order.
        self.Text_vectors = [
            TextVector(file_id) for file_id in self.corpus.fileids()
        ]
def search_word_in_vector(text_collection, word):
    """
    Return the documents that contain a word, ranked by tf-idf weight.

    The query word is stemmed, every vector whose ``words`` dictionary
    contains the stem is collected, and the matches are ordered by
    ``tf * idf`` with the most relevant document first. Documents whose
    weight is 0 (which happens when every document of the collection
    matches, making the idf 0) are dropped.

    :param text_collection: collection exposing a ``Text_vectors`` list
    :param word: the query word
    :return: list of matching vectors, highest tf-idf weight first; empty
             when no document contains the word
    """
    # NOTE(review): the query is stemmed, but build_vectors stores the raw
    # nltk.word_tokenize tokens -- confirm both sides are normalised alike.
    stemmed_word = stem([word])[0]
    vectors = text_collection.Text_vectors
    document_match = [
        vector for vector in vectors if stemmed_word in vector.words
    ]
    if not document_match:
        return document_match

    # Number of matching documents, reported on stdout.
    print(len(document_match))
    idf = log(len(vectors) / len(document_match))

    def weight(document):
        # tf-idf weight of the query word in this document.
        return document.words.get(stemmed_word) * idf

    # Drop documents with weight 0, then rank with the most relevant first.
    relevant = [document for document in document_match if weight(document) != 0]
    return sorted(relevant, key=weight, reverse=True)
def main():
    """
    Run the interactive search prompt.

    Builds the text collection once, then repeatedly asks for a word and
    prints the file ids of the documents returned by
    ``search_word_in_vector``. The loop ends cleanly when the input stream
    is closed (EOF) or the user interrupts it with Ctrl-C.
    """
    text_collection = TextCollection()
    while True:
        try:
            query = input("Insert a word: \n")
        except (EOFError, KeyboardInterrupt):
            # No more input: leave the prompt instead of dying with a traceback.
            return
        print("The recommended documents are (sorted by relevance): \n")
        for document in search_word_in_vector(text_collection, query):
            print(document.file_id)
        print('\n')
# Start the interactive prompt only when this file is executed as a script,
# so that importing the module does not block on user input.
if __name__ == "__main__":
    main()