# Interactive single-word lookup: builds a dictionary with TextProcessor and,
# when the typed word is not in it, prints the candidates that
# BayesClassifier.calculate returns for that word.
import time

from textprocessor import TextProcessor
from bayes_classifier import BayesClassifier

# Arguments handed to TextProcessor.create_dictionary / improve_dictionary.
form_file = 'formy'
polish_texts = ['dramat', 'popul', 'proza', 'publ', 'wp']
path_to_files = './data/'

# Not read anywhere in the visible part of this script.
show_simmilar = True
# Upper slice bound for the extra suggestions printed on request.
num_of_simmilar = 4

if __name__ == '__main__':
    textprocessor = TextProcessor()
    textprocessor.create_dictionary(path_to_file=path_to_files, form_file=form_file)
    textprocessor.improve_dictionary(path_to_files=path_to_files, polish_texts=polish_texts)
    dict_of_words = textprocessor.dict_of_words

    input_word = input('Napisz pojedyncze slowo:\n')
    # Taken after the prompt returns, so the user's typing time is excluded.
    # The value is not consumed in the visible part of the script.
    start = time.time()
    input_word = textprocessor.map_chars(input_word)

    if input_word not in dict_of_words:
        bayes_classifier = BayesClassifier()
        simmilar_words = bayes_classifier.calculate(f'{input_word}', dict_of_words)
        # Convert every candidate back with unmap_words before showing it.
        unmapped_words = [textprocessor.unmap_words(word) for word in simmilar_words]

        print('Slowo nie wystepuje w polskim jezyku.')
        # Only offer suggestions when there is at least one; indexing [0] on
        # an empty result would raise IndexError.
        if unmapped_words:
            print(f"Moze chodzilo o '{unmapped_words[0]}'?")
            show_hints = input('Chcesz zobaczyc inne mozliwosci? t/n\n')
            if show_hints == 't':
                print(f'Inne możliwosci {unmapped_words[1:num_of_simmilar]}\n')
    else:
        # NOTE(review): the body of this branch is missing from the available
        # source, and its pairing with the dictionary-membership test is an
        # assumption. Kept as an explicit no-op until the original is restored.
        pass
'com', 'eu', 'pl', 'telfax', 'office', 'burg', 'poland' ] fileName = "./data/pap.txt" note_idx = 98 #https://www.datascienceassn.org/sites/default/files/users/user1/lsa_presentation_final.pdf?fbclid=IwAR3ax6JNemqmWzfau24-UwePT7isOEDP5mAE3jbCQG92dITVVwV9ZS7CYiA if __name__ == '__main__': start = time.time() ### PREPROCESSING textProcessor = TextProcessor() textProcessor.create_dictionary("data", "odm.txt") lineWords = [] for line in open(fileName, 'r', encoding='utf-8'): read_line = line.replace('#', '').strip('\n').strip(' ') if not read_line.isdigit(): lineWords.append(textProcessor.preprocess(read_line)) textProcessor.create_frequency_dict(lineWords) preprocessed = [] textProcessor.pre_process_vol_2(lineWords) ### Document-term matrix stop_list = get_stop_list() ct_vectorizer = CountVectorizer(min_df=1, stop_words=stop_list)