-
Notifications
You must be signed in to change notification settings - Fork 0
/
collocations_file_tetra.py
54 lines (45 loc) · 2.01 KB
/
collocations_file_tetra.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from collocations_common_tetra import count_collocations_tetra, trim_word
from stemming.porter2 import stem
def find_collocations_tetra(file_name, data, popular_word):
    """Collect four-word collocations from a text file.

    The file is read once, split on whitespace, and every token is
    normalised with ``trim_word``.  Each window of four consecutive words
    is then examined; a window is recorded only when every word in it

    * is non-empty,
    * is not one of the most common words of the text, and
    * starts with a lowercase character.

    Qualifying windows are lower-cased, stemmed and handed to
    ``count_collocations_tetra`` together with ``data``.  The last window
    of the text goes through exactly the same filter and stemming as all
    the others.

    Args:
        file_name: Path of the text file to analyse.
        data: Collocation container passed to ``count_collocations_tetra``
            (presumably updated in place; confirm against that helper).
        popular_word: How many of the most frequent words are treated as
            too common to take part in a collocation.

    Returns:
        A ``(collocations, most_common_words, file_content)`` tuple, where
        ``collocations`` is the ``data`` object that was passed in.
    """
    # The context manager closes the file even if reading fails.
    with open(file_name, 'r') as text_file:
        file_content = text_file.read()

    most_common_words = find_most_common_words(file_content, popular_word)
    # Membership is tested for every word of the text, so use a set for
    # the lookups; the list itself is still what gets returned.
    common_lookup = set(most_common_words)
    collocations = data

    # Splitting the whole content yields the same word sequence as
    # splitting line by line, without re-reading the file.
    words = [trim_word(word) for word in file_content.split()]

    # zip() stops at the shortest slice, so this visits every complete
    # window of four consecutive words, the final one included, and
    # nothing at all for texts shorter than four words.
    for window in zip(words, words[1:], words[2:], words[3:]):
        if all(
            word and word not in common_lookup and word[0].islower()
            for word in window
        ):
            first, second, third, fourth = (
                stem(word.lower()) for word in window
            )
            count_collocations_tetra(collocations, first, second, third, fourth)

    return collocations, most_common_words, file_content
def find_most_common_words(text, count):
    """Return the ``count`` most frequent words of ``text``.

    The text is split on whitespace and every token is normalised with
    ``trim_word`` before it is tallied.

    Args:
        text: The text to analyse.
        count: Maximum number of words to return (used as a slice bound).

    Returns:
        A list of words ordered by descending frequency.  The sort is
        stable, so words with equal frequency keep the order in which
        they first appeared (given insertion-ordered dicts).
    """
    frequencies = {}
    for raw_token in text.split():
        token = trim_word(raw_token)
        frequencies[token] = frequencies.get(token, 0) + 1

    ranking = list(frequencies)
    ranking.sort(key=lambda token: frequencies[token], reverse=True)
    return ranking[:count]
def find(file_name, data=None, popular_word=0):
    """Find four-word collocations in ``file_name``.

    Convenience wrapper around ``find_collocations_tetra``.

    Args:
        file_name: Path of the text file to analyse.
        data: Existing collocation container to extend.  A new ``dict``
            is created only when this is ``None``, so an empty container
            supplied by the caller is used as-is rather than replaced.
        popular_word: How many of the most frequent words are treated as
            too common to take part in a collocation.  The default of 0
            excludes nothing.

    Returns:
        Whatever ``find_collocations_tetra`` returns: a
        ``(collocations, most_common_words, file_content)`` tuple.
    """
    collocations = dict() if data is None else data
    # find_collocations_tetra requires the popular-word count as its
    # third argument; omitting it raises TypeError.
    return find_collocations_tetra(file_name, collocations, popular_word)