Esempio n. 1
0
import time;
from textprocessor import TextProcessor
from bayes_classifier import BayesClassifier

# Value passed as ``form_file`` to TextProcessor.create_dictionary.
form_file = 'formy'
# Value passed as ``polish_texts`` to TextProcessor.improve_dictionary.
polish_texts = ['dramat', 'popul', 'proza', 'publ', 'wp']
# Location passed to both dictionary-building calls in the script.
path_to_files = './data/'
# Not read anywhere in the visible part of the script.
# NOTE(review): confirm whether this flag is still needed.
show_simmilar = True
# Upper slice bound for the extra suggestions: the script prints
# unmapped_words[1:num_of_simmilar], i.e. at most num_of_simmilar - 1 words.
num_of_simmilar = 4

if __name__ == '__main__':
    # Interactive check of a single word: build the word dictionary, read one
    # word from stdin and, when it is missing from the dictionary, print
    # candidate words returned by BayesClassifier.
    textprocessor = TextProcessor()
    textprocessor.create_dictionary(path_to_file = path_to_files, form_file=form_file)
    textprocessor.improve_dictionary(path_to_files = path_to_files, polish_texts=polish_texts)
    dict_of_words = textprocessor.dict_of_words;

    # The prompts and messages below are in Polish and are part of the
    # program's output.
    input_word = input('Napisz pojedyncze slowo:\n')
    # Timestamp taken after the prompt returns; it is not read in the visible
    # part of the script.
    start = time.time();
    # The lookup below uses the map_chars() result, so dict_of_words is
    # presumably keyed by the same mapped form -- TODO confirm in TextProcessor.
    input_word = textprocessor.map_chars(input_word)
    if not input_word in dict_of_words:
        # Word not found: ask the classifier for candidates.
        bayes_classifier = BayesClassifier()
        simmilar_words = bayes_classifier.calculate(f'{input_word}', dict_of_words)
        # Convert every candidate with unmap_words() before printing it.
        unmapped_words = []
        for word in simmilar_words:
            unmapped_words.append(textprocessor.unmap_words(word))
        print(f'Slowo nie wystepuje w polskim jezyku.')
        # NOTE(review): raises IndexError when calculate() yields no candidates.
        print(f'Moze chodzilo o \'{unmapped_words[0]}\'?')
        show_hints = input('Chcesz zobaczyc inne mozliwosci? t/n\n')
        # Only the exact answer 't' shows the remaining candidates.
        if(show_hints == 't'):
            # The slice starts after the first suggestion and stops before
            # index num_of_simmilar, so at most num_of_simmilar - 1 extra
            # words are printed.
            print(f'Inne możliwosci {unmapped_words[1:num_of_simmilar]}\n')
    else:
Esempio n. 2
0
        'com', 'eu', 'pl', 'telfax', 'office', 'burg', 'poland'
    ]


# Text file that the script below reads line by line.
fileName = './data/pap.txt'
# Not read in the visible part of the script; presumably the index of a note
# to inspect -- TODO confirm against the rest of the file.
note_idx = 98

# Reference (LSA presentation): https://www.datascienceassn.org/sites/default/files/users/user1/lsa_presentation_final.pdf

if __name__ == '__main__':
    # Script entry point: preprocess the lines of ``fileName`` with
    # TextProcessor, then start building a document-term matrix.
    start = time.time()

    ### PREPROCESSING

    textProcessor = TextProcessor()
    textProcessor.create_dictionary("data", "odm.txt")

    lineWords = []
    # Read inside a context manager so the file handle is closed as soon as
    # the loop finishes or raises, instead of being left open.
    with open(fileName, 'r', encoding='utf-8') as source_file:
        for line in source_file:
            # Drop every '#', then trim newline characters and spaces from
            # both ends of the line.
            read_line = line.replace('#', '').strip('\n').strip(' ')
            # Lines made up only of digits are skipped (presumably note
            # numbers -- TODO confirm against the data file). Empty lines
            # are NOT skipped, because ''.isdigit() is False.
            if not read_line.isdigit():
                lineWords.append(textProcessor.preprocess(read_line))

    textProcessor.create_frequency_dict(lineWords)
    # Not filled in the visible part of the script; kept because later code
    # may rely on it.
    preprocessed = []
    textProcessor.pre_process_vol_2(lineWords)

    ### Document-term matrix

    # get_stop_list and CountVectorizer come from outside the visible code.
    stop_list = get_stop_list()
    ct_vectorizer = CountVectorizer(min_df=1, stop_words=stop_list)