Ejemplo n.º 1
0
def find_collocations_tetra(file_name, data, popular_word):
    """Count four-word collocations found in a text file.

    The file is read once.  Its most frequent words, as returned by
    ``find_most_common_words``, are treated as stop words.  Every run of
    four consecutive words is reported to ``count_collocations_tetra`` when
    each of its words (after ``trim_word``) is non-empty, is not a stop word
    and starts with a lowercase character.  Qualifying words are lowercased
    and passed through ``stem`` before being counted.

    Args:
        file_name: Path of the text file to analyse.
        data: Collocation store handed to ``count_collocations_tetra``
            (presumably updated in place -- confirm against that helper).
        popular_word: Number of most frequent words to treat as stop words;
            forwarded as the ``count`` argument of
            ``find_most_common_words``.

    Returns:
        A tuple ``(data, most_common_words, file_content)``.
    """
    # The context manager guarantees the handle is closed, even on error.
    with open(file_name, 'r') as text_file:
        file_content = text_file.read()

    most_common_words = find_most_common_words(file_content, popular_word)
    # A set keeps the per-word stop-word test O(1); the ordered list itself
    # is still what gets returned to the caller.
    stop_words = set(most_common_words)

    # Splitting the whole content yields the same word sequence as splitting
    # line by line, so the file does not have to be re-read.
    words = [trim_word(raw_word) for raw_word in file_content.split()]

    # Four staggered views of the word list produce every window of four
    # consecutive words, including the last one, so all windows go through
    # the same filter and the same lowercasing/stemming.
    for window in zip(words, words[1:], words[2:], words[3:]):
        if all(w and w not in stop_words and w[0].islower() for w in window):
            count_collocations_tetra(data, *(stem(w.lower()) for w in window))

    return data, most_common_words, file_content
Ejemplo n.º 2
0
def find_collocations_tetra(text, data, popular_word):
    """Count four-word collocations found in ``text``.

    The most frequent words of ``text``, as returned by
    ``find_most_common_words``, are treated as stop words.  Every run of
    four consecutive whitespace-separated words is reported to
    ``count_collocations_tetra`` when each of its words (after
    ``trim_word``) is non-empty, is not a stop word and starts with a
    lowercase character.  Qualifying words are lowercased and passed through
    ``stem`` before being counted.

    Args:
        text: The text to analyse.
        data: Collocation store handed to ``count_collocations_tetra``
            (presumably updated in place -- confirm against that helper).
        popular_word: Number of most frequent words to treat as stop words;
            forwarded as the ``count`` argument of
            ``find_most_common_words``.

    Returns:
        A tuple ``(data, most_common_words)``.
    """
    most_common_words = find_most_common_words(text, popular_word)
    # Set membership is O(1); the ordered list is still returned unchanged.
    stop_words = set(most_common_words)

    def is_candidate(word):
        # A word qualifies when it is non-empty, not a stop word and starts
        # with a lowercase character.
        return bool(word) and word not in stop_words and word[0].islower()

    words = [trim_word(raw_word) for raw_word in text.split()]

    # Visit every window of four consecutive words.  The final window is
    # handled by the same loop, so it is filtered and stemmed like the rest
    # and nothing is counted when the text has fewer than four words.
    for start in range(len(words) - 3):
        window = words[start:start + 4]
        if all(is_candidate(word) for word in window):
            count_collocations_tetra(
                data, *(stem(word.lower()) for word in window))

    return data, most_common_words
Ejemplo n.º 3
0
def find_most_common_words(text, count):
    """Return the most frequent words of ``text``, most frequent first.

    ``text`` is split on whitespace and every token is passed through
    ``trim_word`` before being counted.

    Args:
        text: The text whose words are counted.
        count: Upper bound applied as a slice (``[:count]``) to the ranked
            word list.

    Returns:
        A list of words ordered by descending frequency.  Words with equal
        frequency keep the order in which they first appeared.
    """
    from collections import Counter

    frequencies = Counter(trim_word(raw_word) for raw_word in text.split())
    # sorted() is stable even with reverse=True, so ties stay in first-seen
    # order; slicing (rather than most_common) keeps the original handling of
    # None or negative ``count`` values.
    ranked = sorted(frequencies, key=frequencies.get, reverse=True)
    return ranked[:count]
Ejemplo n.º 4
0
def find_most_common_words(text, count):
    """Return the most frequent words of ``text``, most frequent first.

    ``text`` is split on whitespace and every token is passed through
    ``trim_word`` before being counted.

    Args:
        text: The text whose words are counted.
        count: Upper bound applied as a slice (``[:count]``) to the ranked
            word list.

    Returns:
        A list of words ordered by descending frequency.  Words with equal
        frequency keep the order in which they first appeared.
    """
    frequencies = {}
    for raw_word in text.split():
        trimmed = trim_word(raw_word)
        # One lookup with a default replaces the membership test on .keys()
        # followed by a second indexing operation.
        frequencies[trimmed] = frequencies.get(trimmed, 0) + 1

    # Stable sort: words with the same frequency stay in first-seen order.
    ranked = sorted(frequencies, key=frequencies.get, reverse=True)
    return ranked[:count]
Ejemplo n.º 5
0
def find_collocations_tetra(file_name, data, popular_word):
    """Count four-word collocations found in a text file.

    The file is read once.  Its most frequent words, as returned by
    ``find_most_common_words``, are treated as stop words.  Every run of
    four consecutive words is reported to ``count_collocations_tetra`` when
    each of its words (after ``trim_word``) is non-empty, is not a stop word
    and starts with a lowercase character.  Qualifying words are lowercased
    and passed through ``stem`` before being counted.

    Args:
        file_name: Path of the text file to analyse.
        data: Collocation store handed to ``count_collocations_tetra``
            (presumably updated in place -- confirm against that helper).
        popular_word: Number of most frequent words to treat as stop words;
            forwarded as the ``count`` argument of
            ``find_most_common_words``.

    Returns:
        A tuple ``(data, most_common_words, file_content)``.
    """
    from collections import deque

    # The context manager guarantees the handle is closed, even on error.
    with open(file_name, 'r') as text_file:
        file_content = text_file.read()

    most_common_words = find_most_common_words(file_content, popular_word)
    # Set membership is O(1); the ordered list is still returned unchanged.
    stop_words = set(most_common_words)

    def is_candidate(word):
        # A word qualifies when it is non-empty, not a stop word and starts
        # with a lowercase character.
        return bool(word) and word not in stop_words and word[0].islower()

    # Bounded deque: appending a fifth word silently drops the oldest one,
    # so it always holds the most recent (up to) four words.
    window = deque(maxlen=4)
    for raw_word in file_content.split():
        window.append(trim_word(raw_word))
        # Each full window is examined exactly once, as soon as its last
        # word arrives.  This covers the final window too, so it is filtered
        # and stemmed like every other one.
        if len(window) == 4 and all(is_candidate(word) for word in window):
            count_collocations_tetra(
                data, *(stem(word.lower()) for word in window))

    return data, most_common_words, file_content