Example #1
from nltk.tokenize import sent_tokenize, word_tokenize

def calc():
    # Splitting the combined text into sentences
    # (centralize_texts and text__word are project modules; see Example #8)
    text = centralize_texts.run()
    tokenized_text = sent_tokenize(text)

    # Splitting each sentence into words
    tokenized_word_arr = []

    for sentence in tokenized_text:
        tokenized_word = word_tokenize(sentence)
        tokenized_word_arr.append(tokenized_word)

    # Searching for dialogue markers: a dialogue sentence starts with a dash
    # and its first word is capitalized (i.e. its first letter is not a
    # lowercase Russian letter)
    lowercase_letters = ['а', 'б', 'в', 'г', 'д', 'е', 'ё', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с',
                         'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я']

    dialogs = []

    for sentence in tokenized_word_arr:
        # len(sentence) > 1 guards against an IndexError on one-token sentences
        if len(sentence) > 1 and sentence[0] in ('-', '–', '—'):
            if sentence[1][0] not in lowercase_letters:
                dialogs.append(sentence)

    # Calculating the relative indicator: dialogue sentences per 100 words
    relative_indicator = round(len(dialogs) * 100 / text__word.calc(), 2)

    # Returning the result
    return relative_indicator
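
A minimal standalone sketch of the same dash-plus-capital heuristic, assuming NLTK and its punkt tokenizer data are installed; the sample text and the is_dialogue helper are illustrative and not part of the project above.

from nltk.tokenize import sent_tokenize, word_tokenize

# Lowercase Russian alphabet; a dialogue line's first word should NOT
# start with one of these letters (i.e. it should be capitalized)
LOWERCASE_RU = set('абвгдеёжзийклмнопрстуфхцчшщъыьэюя')

def is_dialogue(tokens):
    # A dialogue sentence starts with a dash followed by a capitalized word
    return (len(tokens) > 1
            and tokens[0] in ('-', '–', '—')
            and tokens[1][0] not in LOWERCASE_RU)

sample = '– Привет! Он шёл домой. – Куда ты идёшь?'
sentences = [word_tokenize(s) for s in sent_tokenize(sample)]
print(sum(is_dialogue(s) for s in sentences))  # expected: 2 for this sample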
Example #2
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords

def calc():
    # Copying data from the other function (centralize_texts is a project module)
    text = centralize_texts.run().lower()
    tokenized_text = sent_tokenize(text)

    tokenized_word_arr = []

    for sentence in tokenized_text:
        tokenized_word = word_tokenize(sentence)
        tokenized_word_arr.append(tokenized_word)

    # Removing punctuation marks
    tokenizer = RegexpTokenizer(r'\w+')
    text_without_punctuation_signs = tokenizer.tokenize(text)

    # Removing stop words
    stop_words = stopwords.words('russian')
    stop_words.extend(['не', 'это', 'что', 'именно', 'эта', 'лишь', 'очень', 'либо', 'или', 'ru', 'которые', 'конец',
                       'которая', 'который', 'ибо', 'см', 'n', 'например', 'является', '', 'а', 'б', 'в', 'г', 'д', 'е',
                       'ё', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч',
                       'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я'])

    filtered_sent = []

    for word in text_without_punctuation_signs:
        if word not in stop_words:
            filtered_sent.append(word)
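
The filtering steps above can be exercised on their own. A minimal sketch, assuming the NLTK stopwords corpus is downloaded; the sample string is invented. A set is used for the stop words because membership tests against a list are O(n) on every word.

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

text = 'это очень интересная книга'

# \w+ keeps only word characters, dropping punctuation in one pass
words = RegexpTokenizer(r'\w+').tokenize(text)

stop_words = set(stopwords.words('russian'))
filtered = [w for w in words if w not in stop_words]
print(filtered)  # common words such as 'это' are dropped; exact output depends on the NLTK list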
Example #3
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

def calc():
    # Copying data from the other function (centralize_texts is a project module)
    text = centralize_texts.run().lower()

    # Removing punctuation marks
    tokenizer = RegexpTokenizer(r'\w+')
    text_without_punctuation_signs = tokenizer.tokenize(text)

    # Removing stop words
    stop_words = stopwords.words('russian')
    stop_words.extend(['не', 'это', 'что', 'именно', 'эта', 'лишь', 'очень', 'либо', 'или', 'ru', 'которые', 'конец',
                       'которая', 'который', 'ибо', 'см', 'n', 'например', 'является', '1', '2', '3', '4', '5', '6',
                       '7', '8', '9', '0', '', 'а', 'б', 'в', 'г', 'д', 'е', 'ё', 'ж', 'з', 'и', 'й', 'к', 'л', 'м',
                       'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я'])

    filtered_sent = []

    for w in text_without_punctuation_signs:
        if w not in stop_words:
            filtered_sent.append(w)

    # Searching for adjectives: words ending in common Russian adjective
    # suffixes (a rough heuristic rather than real morphological analysis)
    adjective_endings = ('ая', 'ое', 'ее', 'ой', 'ые', 'ие', 'ый')

    adjectives = []

    for word in filtered_sent:
        if word.endswith(adjective_endings):
            adjectives.append(word)

    # Removing pronouns that share these endings (forms of 'который' and
    # 'какой'); building a new list avoids the bug of removing items from
    # a list while iterating over it
    adjectives = [word for word in adjectives
                  if not word.startswith(('котор', 'как'))]

    # Calculating the relative indicator: adjectives per 100 words
    relative_indicator = round(len(adjectives) * 100 / text__word.calc(), 2)

    # Returning the result
    return relative_indicator
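
The suffix test is easy to try in isolation. A minimal sketch with an invented word list; a real morphological analyzer would be more accurate than suffix matching, but the heuristic needs no extra data or models.

ADJECTIVE_ENDINGS = ('ая', 'ое', 'ее', 'ой', 'ые', 'ие', 'ый')
EXCLUDED_PREFIXES = ('котор', 'как')

words = ['красная', 'синее', 'которая', 'какое', 'дом', 'большие']

# str.endswith and str.startswith accept a tuple of alternatives
adjectives = [w for w in words
              if w.endswith(ADJECTIVE_ENDINGS)
              and not w.startswith(EXCLUDED_PREFIXES)]
print(adjectives)  # ['красная', 'синее', 'большие']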
Example #4
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.probability import FreqDist

def calc(analysed_word):
    # Adding the text (centralize_texts is a project module)
    text = centralize_texts.run().lower()

    # Removing punctuation marks
    tokenizer = RegexpTokenizer(r'\w+')
    text_without_punctuation_signs = tokenizer.tokenize(text)

    # Removing stop words
    stop_words = stopwords.words('russian')
    stop_words.extend([
        'не', 'это', 'что', 'именно', 'эта', 'лишь', 'очень', 'либо', 'или',
        'ru', 'которые', 'которая', 'который', 'ибо', 'см', 'n', 'например',
        'является', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '', 'а',
        'б', 'в', 'г', 'д', 'е', 'ё', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н',
        'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы',
        'ь', 'э', 'ю', 'я'
    ])

    filtered_sent = []

    for word in text_without_punctuation_signs:
        if word not in stop_words:
            filtered_sent.append(word)

    # Computing the most frequent words
    fdist = FreqDist(filtered_sent)
    top_words = fdist.most_common(1000)

    # Looking up the frequency of the marker word; None means it was not
    # found among the top 1000 words
    returned_result = None

    for word_value, word_freq in top_words:
        if word_value == analysed_word:
            returned_result = word_freq

    # Calculating the relative indicator of the marker word:
    # occurrences per 100 words, 0 if the word was not found
    relative_indicator = 0

    if returned_result is not None:
        relative_indicator = returned_result * 100 / text__word.calc()

    # Returning the result
    return round(relative_indicator, 3)
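
FreqDist is a Counter subclass, so it also supports direct dictionary-style lookup, which avoids scanning most_common and the 1000-word cutoff. A minimal sketch with an invented token list:

from nltk.probability import FreqDist

fdist = FreqDist(['книга', 'дом', 'книга', 'окно'])
print(fdist['книга'])  # 2
print(fdist['нет'])    # 0; missing words return 0, so no sentinel value is needed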
Example #5
from nltk.tokenize import RegexpTokenizer

def calc():
    # Copying data from the other function (centralize_texts is a project module)
    text = centralize_texts.run()

    # Removing punctuation; \w+ keeps only word characters, so the remaining
    # tokens are exactly the words of the text
    tokenizer = RegexpTokenizer(r'\w+')
    text_without_punctuation_signs = tokenizer.tokenize(text)

    # Returning the word count
    return len(text_without_punctuation_signs)
Example #6
from nltk.tokenize import sent_tokenize, word_tokenize

def adding():
    # Copying data from the other function (centralize_texts is a project module)
    text = centralize_texts.run()
    tokenized_text = sent_tokenize(text)

    tokenized_word_arr = []

    for sentence in tokenized_text:
        tokenized_word = word_tokenize(sentence)
        tokenized_word_arr.append(tokenized_word)

    # Rebuilding a single string from the token lists; each element of
    # tokenized_word_arr is a list of tokens, so it must be joined first
    # (concatenating a list to a string would raise a TypeError)
    tokenized_word_arr_without_tokenized = ''

    for sentence_tokens in tokenized_word_arr:
        tokenized_word_arr_without_tokenized += ' '.join(sentence_tokens) + ' '

    # Returning the result
    return tokenized_word_arr_without_tokenized
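
Note that joining tokens with spaces does not restore the original text exactly: the tokenizer splits punctuation into separate tokens, so the rebuilt string gains spaces around punctuation. A minimal illustration:

from nltk.tokenize import word_tokenize

tokens = word_tokenize('Привет, мир!')
print(tokens)            # ['Привет', ',', 'мир', '!']
print(' '.join(tokens))  # 'Привет , мир !' (spacing differs from the input)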
Example #7
from nltk.tokenize import sent_tokenize

def calc():
    # Counting sentences in the combined text (centralize_texts is a project module)
    text = centralize_texts.run()
    tokenized_text = sent_tokenize(text)

    return len(tokenized_text)
Example #8
# NOTE: these two import paths are assumptions; the original snippet calls
# centralize_texts.run() and text__sentence.calc() below, but their import
# lines are cut off
from private_apps.AI_fiction_or_scientific.modules import centralize_texts
from private_apps.AI_fiction_or_scientific.modules.main_indicators import text__sentence

from private_apps.AI_fiction_or_scientific.modules.additional_indicators import text__word
from private_apps.AI_fiction_or_scientific.modules.additional_indicators import text__sign_abs

from private_apps.AI_fiction_or_scientific.modules.main_indicators import text__sign_rel
from private_apps.AI_fiction_or_scientific.modules.main_indicators import sentence__word
from private_apps.AI_fiction_or_scientific.modules.main_indicators import word__letter
from private_apps.AI_fiction_or_scientific.modules.main_indicators import text__adjective
from private_apps.AI_fiction_or_scientific.modules.main_indicators import text__number
from private_apps.AI_fiction_or_scientific.modules.main_indicators import text__dialog
from private_apps.AI_fiction_or_scientific.modules.main_indicators import text__marker

from private_apps.AI_fiction_or_scientific.modules.results import result_recorder
from private_apps.AI_fiction_or_scientific.modules.results.result_calculator import result_generator

# 1. CENTRALIZING MULTIPLE TEXTS INTO ONE
centralize_texts.run()

# 2. GETTING THE MAIN AND ADDITIONAL ANALYSIS INDICATORS
print('ANALYSIS RESULT')

# 2.1. Getting the number of sentences in the text(s)
print('1. Number of sentences: ' + str(text__sentence.calc()) +
      ' per text')

# 2.2. Getting the number of words in the text(s)
print('2. Number of words: ' + str(text__word.calc()) + ' per text')

# 2.3. Getting the number of punctuation marks in the text(s)
print('3. Absolute number of punctuation marks: ' + str(text__sign_abs.calc()) +
      ' per text')