def do_text_preprocessing(self, text, language_shortening):
    # Lemmatize the tokenized text and drop the loaded stop words,
    # then tokenize the joined result again.
    text = self.majka_lemmatizers[
        language_shortening].lemmatization_and_loaded_stop_words_removal_in_array(
            textPreprocessing.tokenize_text(text),
            self.stop_words[language_shortening])
    print(text)
    return textPreprocessing.tokenize_text(text)
Example #2
def find_word_pairs_extended(text1, text2, majka_lemmatizer_text1, majka_lemmatizer_text2, stop_words_file1, stop_words_file2, alias_dict):
    # text1 = majka_lemmatizer_text1.lemmatization_and_stop_words_removal_not_included(
    #    textPreprocessing.tokenize_text(text1), stop_words_file1)

    arrays = []
    word_pairs = dict()

    array1 = textPreprocessing.tokenize_text(text1)
    arrays.append(textPreprocessing.tokenize_text(text2))

    # Also pair text1 against every known alias of text2.
    if text2 in alias_dict:
        for alias in alias_dict[text2]:
            # alias = majka_lemmatizer_text2.lemmatization_and_stop_words_removal_not_included(
            #    textPreprocessing.tokenize_text(alias), stop_words_file2)
            arrays.append(textPreprocessing.tokenize_text(alias))

    # Pair tokens position by position, up to the shorter sentence length.
    for sentence in arrays:
        min_length = min(len(array1), len(sentence))

        for i in range(min_length):
            if array1[i] not in word_pairs:
                word_pairs[array1[i]] = []
            word_pairs[array1[i]].append(sentence[i])

    return word_pairs
Example #3
    def find_dictionary_docs(self, preprocessed_text):
        found_dictionary_docs = None

        for token in textPreprocessing.tokenize_text(preprocessed_text):
            # Sum the tf-idf contributions of this token per document.
            help_dictionary_docs = dict()
            for score in self.tf_idf_scores:
                if token in score:
                    for doc, score_value in score[token].items():
                        if doc not in help_dictionary_docs:
                            help_dictionary_docs[doc] = score_value
                        else:
                            help_dictionary_docs[doc] += score_value

            # Keep only documents that contain every token seen so far.
            if found_dictionary_docs is None:
                found_dictionary_docs = help_dictionary_docs.copy()
            else:
                found_dictionary_docs = {
                    key: value
                    for key, value in found_dictionary_docs.items()
                    if key in help_dictionary_docs
                }

        # Order the surviving documents by score, highest first.
        sorted_dictionary = {
            key: value
            for key, value in sorted(found_dictionary_docs.items(),
                                     key=lambda item: item[1],
                                     reverse=True)
        }
        return sorted_dictionary
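
For orientation, a minimal sketch of the shape self.tf_idf_scores appears to have here: an iterable of mappings from a token to per-document tf-idf scores. The tokens, document ids and values below are invented for illustration only.

# Assumed shape only; tokens, ids and scores are illustrative.
tf_idf_scores = [
    {"prague": {"en-12": 0.42, "cs-7": 0.31}},
    {"castle": {"en-12": 0.18, "en-90": 0.25}},
]
# For the query tokens ["prague", "castle"], only documents that carry
# both tokens (here "en-12") survive the per-token intersection step.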
Example #4
def find_word_pairs(text1, text2, majka_lemmatizer_text1, majka_lemmatizer_text2, stop_words_file1, stop_words_file2):
    # text1 = majka_lemmatizer_text1.lemmatization_and_stop_words_removal_not_included(
    #    textPreprocessing.tokenize_text(text1), stop_words_file1)
    # text2 = majka_lemmatizer_text2.lemmatization_and_stop_words_removal_not_included(
    #    textPreprocessing.tokenize_text(text2), stop_words_file2)

    array1 = textPreprocessing.tokenize_text(text1)
    array2 = textPreprocessing.tokenize_text(text2)

    # Pair tokens position by position, up to the shorter sentence length.
    word_pairs = dict()
    min_length = min(len(array1), len(array2))

    for i in range(min_length):
        word_pairs[array1[i]] = array2[i]

    return word_pairs
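
A rough usage sketch: because the lemmatization calls above are commented out, the lemmatizer and stop-word arguments are currently unused, so the call below passes None for them; the sentences and the assumption of whitespace-style tokenization are illustrative only.

# Hypothetical call; lemmatizer/stop-word arguments are unused in the active code path.
pairs = find_word_pairs("Prague castle tour", "Prazsky hrad prohlidka",
                        None, None, None, None)
# Assuming whitespace tokenization, tokens are paired by position:
# {"Prague": "Prazsky", "castle": "hrad", "tour": "prohlidka"}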
    def parse_lemmatizer(self, lifo_stack, dictionary_base,
                         language_array_shortenings):

        for character, dictionary in dictionary_base.items():
            if character in language_array_shortenings:
                # Leaf reached: the stack spells out the source word and
                # `dictionary` holds its stored translation for this language.
                word = ''.join(lifo_stack)

                translation = self.translator.translate(
                    dictionary, src=character, dest=self.base_language)
                translation_text = self.majka_lemmatizers[character].\
                    lemmatization_and_loaded_stop_words_removal_in_array(
                        textPreprocessing.tokenize_text(translation.text),
                        self.stop_words[character]).lower()
                word_text = self.majka_lemmatizers[character].\
                    lemmatization_and_loaded_stop_words_removal_in_array(
                        textPreprocessing.tokenize_text(word),
                        self.stop_words[character]).lower()

                if translation_text == "" or word_text == "":
                    self.statistics['missed'][character] += 1
                    continue

                print(translation_text + " " + word_text)

                if translation_text != word_text:
                    self.statistics['negative'][character] += 1
                else:
                    self.statistics['positive'][character] += 1
            else:
                # Inner node: the key is the next character of the word.
                lifo_stack.append(character)
                self.parse_lemmatizer(lifo_stack, dictionary,
                                      language_array_shortenings)
                lifo_stack.pop()
Example #6
    def characters(self, content):
        # SAX callback: resolve the title to a document id, then collect the
        # lemmatized page text once the page is marked for indexing.
        if self.currentData == "title" and self.on_page_to_do and re.search(
                r"^MediaWiki:", content) is None:
            if content in self.titles_indexes[self.language_shortening]:
                self.document_identifier = str(
                    self.titles_indexes[self.language_shortening][content])
                self.lang_document_identifier = (self.language_shortening + "-" +
                                                 self.document_identifier)
                self.prepared_for_indexing = True

        if self.currentData == 'text' and self.prepared_for_indexing:
            self.page_content = self.lemmatizer.lemmatization_and_stop_words_removal_not_included(
                textPreprocessing.tokenize_text(content),
                'stop_words/en_stop_words.json')
            if self.page_content != "":
                self.text_content += self.page_content
def index_words(index, check_duplicates, text, document_identifier, dest_lang_shortening):
    for word in textPreprocessing.tokenize_text(text):
        word = str(word)
        if word in index[dest_lang_shortening]:
            # Only count a document once per word, then refresh the document frequency.
            if document_identifier not in check_duplicates[dest_lang_shortening][word]['doc']:
                check_duplicates[dest_lang_shortening][word]['doc'].add(document_identifier)
                index[dest_lang_shortening][word]['doc'].append(document_identifier)
                index[dest_lang_shortening][word]['df'] = len(index[dest_lang_shortening][word]['doc'])
        else:
            # First occurrence of the word: initialise its posting list and df.
            index[dest_lang_shortening][word] = {}
            index[dest_lang_shortening][word]['doc'] = []
            index[dest_lang_shortening][word]['df'] = 1

            check_duplicates[dest_lang_shortening][word] = {}
            check_duplicates[dest_lang_shortening][word]['doc'] = set()
            check_duplicates[dest_lang_shortening][word]['doc'].add(document_identifier)
            index[dest_lang_shortening][word]['doc'].append(document_identifier)
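
A rough usage sketch of the structures index_words appears to expect; the language code, document id and whitespace-style tokenization are assumptions for illustration only.

# Hypothetical setup with pre-created per-language dictionaries:
index = {"en": {}}
check_duplicates = {"en": {}}
index_words(index, check_duplicates, "castle near the castle", "en-12", "en")
# Assuming whitespace tokenization, index["en"]["castle"] would end up as
# {"doc": ["en-12"], "df": 1}; the check_duplicates set keeps the repeated
# token from appending the same document twice.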
def index_words_term_freq_doc_freq(index, doc_freq_index, text, document_identifier, dest_lang_shortening):
    tokenized_text = textPreprocessing.tokenize_text(text)
    # Record the document length and count this document towards the per-language
    # total (assumes the "_docs" counter should grow by one per indexed document).
    doc_freq_index[dest_lang_shortening][document_identifier] = len(tokenized_text)
    index[dest_lang_shortening + "_docs"] = index[dest_lang_shortening + "_docs"] + 1

    for word in tokenized_text:
        word = str(word)
        if word in index[dest_lang_shortening]:
            if document_identifier not in index[dest_lang_shortening][word]['doc']:
                # New document for this word: start its term frequency at 1 and refresh df.
                index[dest_lang_shortening][word]['doc'][document_identifier] = 1
                index[dest_lang_shortening][word]['df'] = len(index[dest_lang_shortening][word]['doc'])
            else:
                # Same document again: bump the term frequency.
                index[dest_lang_shortening][word]['doc'][document_identifier] += 1
        else:
            # First occurrence of the word anywhere: initialise its entry.
            index[dest_lang_shortening][word] = {}
            index[dest_lang_shortening][word]['doc'] = {}
            index[dest_lang_shortening][word]['df'] = 1
            index[dest_lang_shortening][word]['doc'][document_identifier] = 1
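
A rough usage sketch of the structures this indexer appears to expect; the language code, document id and whitespace-style tokenization are illustrative assumptions, not part of the original code.

# Hypothetical setup (the "_docs" counter and per-language dicts must pre-exist):
index = {"cs": {}, "cs_docs": 0}
doc_freq_index = {"cs": {}}
index_words_term_freq_doc_freq(index, doc_freq_index, "hrad hrad most", "cs-3", "cs")
# Assuming whitespace tokenization, index["cs"]["hrad"] would end up as
# {"doc": {"cs-3": 2}, "df": 1} and doc_freq_index["cs"]["cs-3"] as 3.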
def find_statistics_information_connection(connection_file, base_language_shortening,
                                           language_array_shortenings):
    connection_data = load_as_json(connection_file)

    max_tokens = 25
    number_records = 0
    statistic_for_languages = dict()

    # Prepare per-language counters: number of connected records and a
    # token-length histogram.
    for language_shortening in language_array_shortenings:
        statistic_for_languages[language_shortening] = dict()
        statistic_for_languages[language_shortening]['records'] = 0
        statistic_for_languages[language_shortening]['tokens'] = dict()
        for i in range(1, max_tokens):
            statistic_for_languages[language_shortening]['tokens'][str(i)] = 0

    for record in connection_data[base_language_shortening]:
        if 'title' in record and record['title'] != "":
            number_records += 1

            for language_shortening in language_array_shortenings:
                if language_shortening in record and record[language_shortening] != "":
                    statistic_for_languages[language_shortening]['records'] += 1
                    number_tokens = len(textPreprocessing.tokenize_text(record[language_shortening]))

                    if number_tokens < max_tokens:
                        statistic_for_languages[language_shortening]['tokens'][str(number_tokens)] += 1
                    else:
                        print('Number of tokens for ' + language_shortening + ' exceeds the limit: ' + str(number_tokens))

    print("Number of records: " + str(number_records))
    for language_shortening in language_array_shortenings:
        print("Number of connections in " + language_shortening + " is: " +
              str(statistic_for_languages[language_shortening]['records']))
        for i in range(1, max_tokens):
            print(str(i) + "-token frequency is: " +
                  str(statistic_for_languages[language_shortening]['tokens'][str(i)]))
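
The connection file seems to be a JSON object keyed by the base language, holding records with a 'title' plus one field per target language; the snippet below only illustrates that assumed shape with invented values.

# Assumed shape only; titles and translations are made up.
connection_data = {
    "en": [
        {"title": "Prague Castle", "cs": "Prazsky hrad", "de": "Prager Burg"},
        {"title": "Charles Bridge", "cs": "Karluv most", "de": ""},
    ]
}
# For this data the function would report 2 records, 2 Czech connections
# and 1 German connection, plus the token-length histograms.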
    def find_foreign_equivalents(self, text, initial_language_shortening,
                                 dest_language_shortenings):
        results = dict()
        for language_shortening in dest_language_shortenings:
            results[language_shortening] = ""

        tokens = textPreprocessing.tokenize_text(text)
        # self.do_text_preprocessing(text, initial_language_shortening)

        # Look every token up in the character tree of the source language and
        # collect its equivalents for each destination language.
        for token in tokens:
            result_part_dict = self.character_trees[
                initial_language_shortening].find_word_more_languages(
                    token, dest_language_shortenings)
            if result_part_dict is not None:
                for language_shortening, result_part in result_part_dict.items():
                    results[language_shortening] += " " + result_part

        for language_shortening, result in results.items():
            if result != "":
                print(language_shortening + ": " + result)
def find_statistics_information_from_alias_files_extended(alias_files):
    statistic_for_language = dict()
    max_tokens_base = 10
    max_tokens_aliases = 20
    max_number_aliases = 40

    for alias_file in alias_files:
        language_shortening = alias_file.split('_')[0]
        alias_data = load_as_json(alias_file)

        # Per-language histograms: base-name token counts, alias token counts
        # and the number of aliases per base name.
        statistic_for_language[language_shortening] = dict()
        statistic_for_language[language_shortening]['records'] = 0
        statistic_for_language[language_shortening]['tokens_base'] = dict()
        statistic_for_language[language_shortening]['tokens_aliases'] = dict()
        statistic_for_language[language_shortening]['number_aliases'] = dict()

        for i in range(1, max_tokens_base):
            statistic_for_language[language_shortening]['tokens_base'][str(i)] = 0

        for i in range(1, max_tokens_aliases):
            statistic_for_language[language_shortening]['tokens_aliases'][str(i)] = 0

        for i in range(1, max_number_aliases):
            statistic_for_language[language_shortening]['number_aliases'][str(i)] = 0

        for base_name, array_aliases in alias_data.items():
            if base_name != "":
                if len(array_aliases) < max_number_aliases:
                    statistic_for_language[language_shortening]['number_aliases'][str(len(array_aliases))] += 1
                else:
                    print("Number of aliases for " + language_shortening + " exceeds the limit: " + str(len(array_aliases)))

                statistic_for_language[language_shortening]['records'] += 1
                base_tokens = len(textPreprocessing.tokenize_text(base_name))
                if base_tokens < max_tokens_base:
                    statistic_for_language[language_shortening]['tokens_base'][str(base_tokens)] += 1
                else:
                    print("Number of base tokens for " + language_shortening + " exceeds the limit: " + str(base_tokens))

                for alias in array_aliases:
                    alias_tokens = len(textPreprocessing.tokenize_text(alias))
                    if alias_tokens < max_tokens_aliases:
                        statistic_for_language[language_shortening]['tokens_aliases'][str(alias_tokens)] += 1
                    else:
                        print("Number of alias tokens for " + language_shortening + " exceeds the limit: " + str(alias_tokens))

    for language_shortening in statistic_for_language:
        print('\n')
        print("Number of records for " + language_shortening + " containing aliases: " +
              str(statistic_for_language[language_shortening]['records']))

        for i in range(1, max_tokens_base):
            print(str(i) + "-token base-name frequency: " +
                  str(statistic_for_language[language_shortening]['tokens_base'][str(i)]))

        for i in range(1, max_tokens_aliases):
            print(str(i) + "-token alias frequency: " +
                  str(statistic_for_language[language_shortening]['tokens_aliases'][str(i)]))

        for i in range(1, max_number_aliases):
            print("Base names with " + str(i) + " aliases: " +
                  str(statistic_for_language[language_shortening]['number_aliases'][str(i)]))
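
The alias files appear to be JSON objects mapping a base name to a list of its aliases, with the language code taken from the file-name prefix (alias_file.split('_')[0]); the file name and values below are invented for illustration.

# Assumed shape of a file such as "cs_aliases.json" (hypothetical name):
alias_data = {
    "Prazsky hrad": ["Hrad", "Prague Castle"],
    "Karluv most": ["Kamenny most"],
}
# For this data the function would count 2 records, one base name with
# 2 aliases and one with 1, plus token-length histograms for names and aliases.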
    def parse_lemmatizer_ext_min(self, lifo_stack, dictionary_base,
                                 language_array_shortenings, min_frequency):

        for character, dictionary in dictionary_base.items():
            if character in language_array_shortenings:
                # Leaf reached: the stack spells out the source word and
                # `dictionary` maps candidate foreign words to their frequencies.
                word = ''.join(lifo_stack)
                word_text = self.majka_lemmatizers[character].\
                    lemmatization_and_loaded_stop_words_removal_in_array(
                        textPreprocessing.tokenize_text(word),
                        self.stop_words[character]).lower()

                if word_text == "":
                    self.statistics['missed'][character] += 1
                    continue

                # Pick the most frequent foreign candidate(s).
                max_frequency = 0
                array_of_equal = list()
                for foreign_word, frequency in dictionary.items():
                    if frequency > max_frequency:
                        max_frequency = frequency
                        array_of_equal = [foreign_word]
                    elif frequency == max_frequency:
                        array_of_equal.append(foreign_word)

                # Skip candidates that do not reach the required frequency.
                if max_frequency < min_frequency:
                    self.statistics['missed'][character] += 1
                    continue

                # An ambiguous winner counts as a negative result.
                if len(array_of_equal) > 1:
                    self.statistics['negative'][character] += 1
                    continue

                translation = self.translator.translate(
                    array_of_equal[0], src=character, dest=self.base_language)
                translation_text = self.majka_lemmatizers[character].\
                    lemmatization_and_loaded_stop_words_removal_in_array(
                        textPreprocessing.tokenize_text(translation.text),
                        self.stop_words[character]).lower()

                if translation_text == "" or word_text == "":
                    self.statistics['missed'][character] += 1
                    continue

                print(translation_text + " " + word_text)

                if translation_text != word_text:
                    self.statistics['negative'][character] += 1
                else:
                    self.statistics['positive'][character] += 1
            else:
                # Inner node: the key is the next character of the word.
                lifo_stack.append(character)
                self.parse_lemmatizer_ext_min(lifo_stack, dictionary,
                                              language_array_shortenings,
                                              min_frequency)
                lifo_stack.pop()
Example #13
    def text_preprocessing(self, text, stopwords):
        # Tokenize the text, then lemmatize it and remove the given stop words.
        return self.lemmatizer.lemmatization_and_stop_words_removal_in_array_all_majka(
            textPreprocessing.tokenize_text(text), stopwords)