-
Notifications
You must be signed in to change notification settings - Fork 0
/
collocations_wikipedia_tetra.py
56 lines (46 loc) · 1.99 KB
/
collocations_wikipedia_tetra.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from collocations_common_tetra import count_collocations_tetra, trim_word
from stemming.porter2 import stem
import wikipedia
def find_most_common_words(text, count):
    """Return the ``count`` most frequent words of ``text``.

    The text is split on whitespace and every token is normalised with
    ``trim_word`` before it is tallied.  The distinct words are then
    ordered by descending number of occurrences; the sort is stable, so
    words with equal counts keep the iteration order of the tally dict.

    Args:
        text: The text whose words are counted.
        count: Upper bound of the slice taken from the ranked word list.

    Returns:
        A list with the first ``count`` words of the frequency ranking.
    """
    frequencies = {}
    for token in text.split():
        trimmed = trim_word(token)
        # A missing entry counts as zero occurrences so far.
        frequencies[trimmed] = frequencies.get(trimmed, 0) + 1

    ranking = list(frequencies)
    ranking.sort(key=frequencies.get, reverse=True)
    return ranking[:count]
def find_collocations_tetra(text, data, popular_word):
    """Count four-word collocations of ``text`` into ``data``.

    Every run of four consecutive whitespace-separated words (each one
    normalised with ``trim_word``) is a candidate.  A candidate is counted
    only when all four words

    * are non-empty,
    * start with a lowercase character, and
    * are not among the ``popular_word`` most frequent words of ``text``.

    Accepted candidates are lowercased and stemmed word by word and handed
    to ``count_collocations_tetra`` together with ``data``.

    The final window of the text goes through exactly the same filter and
    stemming as every other window, and texts shorter than four words yield
    no candidates at all (no ``None`` placeholders are ever passed on).

    Args:
        text: The text to analyse.
        data: Collocation store forwarded to ``count_collocations_tetra``
            (presumably updated in place -- confirm in
            ``collocations_common_tetra``).
        popular_word: How many of the most frequent words of ``text`` are
            excluded from collocations.

    Returns:
        A tuple ``(data, most_common_words)`` where ``most_common_words``
        is the list returned by ``find_most_common_words``.
    """
    most_common_words = find_most_common_words(text, popular_word)
    # Build the lookup set once; the list itself is still returned as-is.
    excluded = set(most_common_words)
    collocations = data

    words = [trim_word(word) for word in text.split()]

    # Slide a four-word window over the text.  zip() stops at the shortest
    # slice, so the last window is included and short texts produce nothing.
    for window in zip(words, words[1:], words[2:], words[3:]):
        if all(w and w[0].islower() and w not in excluded for w in window):
            count_collocations_tetra(
                collocations, *[stem(w.lower()) for w in window]
            )

    return collocations, most_common_words
def find(file_name, data=None, popular_word=0):
    """Collect four-word collocations from a list of Wikipedia articles.

    ``file_name`` names a text file with one article title per line.  Each
    title is fetched with ``wikipedia.page`` and the article content is fed
    to ``find_collocations_tetra``, accumulating into a single store.

    Args:
        file_name: Path of the file listing article titles, one per line.
            Surrounding whitespace is stripped and blank lines are skipped.
        data: Existing collocation store to extend.  A new ``dict`` is
            created only when this is ``None``, so an empty store supplied
            by the caller is reused rather than replaced.
        popular_word: Number of most frequent words of each article that
            ``find_collocations_tetra`` excludes from collocations.  The
            default of 0 excludes nothing.

    Returns:
        The collocation store after all articles have been processed.

    Raises:
        Whatever ``open`` or ``wikipedia.page`` raise; errors are not
        suppressed here.
    """
    collocations = dict() if data is None else data

    # The context manager guarantees the title file is closed, also on error.
    with open(file_name, 'r') as articles:
        for line in articles:
            title = line.strip()
            if not title:
                continue
            wiki = wikipedia.page(title)
            # find_collocations_tetra returns (store, most_common_words);
            # only the store is carried over to the next article.
            collocations, _ = find_collocations_tetra(
                wiki.content, collocations, popular_word
            )

    return collocations