import re
import sys
import timeit
import winsound

# Corrector, Cutter, Stemmer, generateEmotMap, generateSentiMap and
# erase_question_sentence are defined elsewhere in this project.


def main(inputFile):
    # Build the lexical resources needed for Indonesian (bahasa) preprocessing.
    print("Start making emoticon map")
    map_emoticon = generateEmotMap('emoticon_id.txt')
    print("Finished")
    print("Start making senti map")
    map_senti = generateSentiMap(['boosterwords_id.txt', 'idioms_id.txt',
                                  'negatingword.txt', 'sentiwords_id.txt'])
    print("Finished")
    print("Start making abbreviation dictionary for bahasa")
    corrector = Corrector('singkatan.dic')
    print("Finished")
    print("Start making stopword dictionary for bahasa")
    cutter = Cutter('stopword.txt')
    print("Finished")
    print("Start making stemmer for bahasa")
    stemmer = Stemmer()
    print("Finished")

    output_file = sys.argv[2] + '.txt'
    file_read = open(str(inputFile), "r", encoding='utf-8')
    file_write = open(output_file, "w", encoding='utf-8')

    start = timeit.default_timer()
    review_number = 0
    for line in file_read.readlines():
        # Each input line holds the user rating, the '<>' separator, then the review text.
        review_number += 1
        user_rating = line.split('<>')[0]
        sentence_number = 0
        header_string = 'REVIEW-' + str(review_number) + ' ' + str(user_rating)
        body_string = ''
        # file_write.write('REVIEW-' + str(review_number) + ' [rating] ' + str(user_rating) + '\n')
        review = line.split('<>')[1]
        review = erase_question_sentence(review)  # Erase question sentences

        # Split the review into sentences, then normalize each one.
        for i, sentence in enumerate(re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', review)):
            print("Processing sentence " + str(i + 1) + ' from review ' + str(review_number))
            sentence = sentence.lower()
            sentence = corrector.correct(sentence, map_emoticon, map_senti).strip()
            sentence = cutter.cut(sentence, map_emoticon, map_senti).strip()
            sentence = stemmer.stem(sentence, map_emoticon, map_senti).strip()
            if sentence != '':
                # file_write.write(sentence + "\n")
                body_string = body_string + sentence + '\n'
                sentence_number += 1

        header_string = header_string + ' ' + str(sentence_number) + '\n'
        output_string = header_string + body_string
        file_write.write(output_string)

    stop = timeit.default_timer()
    print("Running time: " + str(stop - start))
    print("Finished.\nOutput file: " + output_file)
    file_read.close()
    file_write.close()

    # Beep when done (Windows only).
    duration = 1000  # millisecond
    freq = 440  # Hz
    winsound.Beep(freq, duration)
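
# A minimal sketch of the assumed command-line entry point (not part of the original
# source): main() reads the output basename from sys.argv[2], so the script is
# presumably run with the input file and the output basename as arguments.
if __name__ == '__main__':
    if len(sys.argv) < 3:
        sys.exit("usage: python " + sys.argv[0] + " <input_file> <output_basename>")
    main(sys.argv[1])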
''' feature extraction '''
para_vector = []
stemmed_sent = []
para_sents = words_list(file, stemmed_sent)
fd = nltk.FreqDist(stemmed_sent)

# Finding thematic words: keep the ten most frequent stemmed tokens.
thematic_words = sorted(fd.items(), key=operator.itemgetter(1))
ten_thematic_words = thematic_words[-10:]
thematics = []
for i in range(len(ten_thematic_words)):
    thematics.append(ten_thematic_words[-(i + 1)])
dict_thematics = dict(thematics)
# print(thematics)

# Finding the title words (the first element of para_sents is treated as the title).
title_words = []
for w in para_sents[0]:
    if w not in stop_words:
        title_words.append(stemmer.stem(w))

# Compute sentence lengths to find the maximum sentence length in the paragraph.
len_sents = []
for sentence in para_sents[1:]:
    len_sents.append(len(sentence))

# Finding sentence location and building a feature vector per sentence.
sentense_number = 0
for sentence in para_sents[1:]:
    if para_sents.index(sentence) == 1:  # Checks if it is the first sentence
        sent_vector = [1, 1 / (len(para_sents) - 1)]
    else:
        sent_vector = [0]
        sent_vector.append(para_sents.index(sentence) / (len(para_sents) - 1))
    # Length feature: sentence length relative to the longest sentence in the paragraph.
    sent_vector.append(len(sentence) / max(len_sents))
class Lemmatizer(object):
    def __init__(self, words_file=default_words, verbs_file=default_verbs, joined_verb_parts=True):
        self.verbs = {}
        self.stemmer = Stemmer()
        tokenizer = WordTokenizer(words_file=words_file, verbs_file=verbs_file)
        self.words = tokenizer.words

        if verbs_file:
            self.verbs['است'] = '#است'
            # Map every conjugated form back to its "past#present" verb root.
            for verb in tokenizer.verbs:
                for tense in self.conjugations(verb):
                    self.verbs[tense] = verb
            if joined_verb_parts:
                # Also index compound verb forms (verb parts joined with '_').
                for verb in tokenizer.verbs:
                    bon = verb.split('#')[0]
                    for after_verb in tokenizer.after_verbs:
                        self.verbs[bon + 'ه_' + after_verb] = verb
                        self.verbs['ن' + bon + 'ه_' + after_verb] = verb
                    for before_verb in tokenizer.before_verbs:
                        self.verbs[before_verb + '_' + bon] = verb

    def lemmatize(self, word, pos=''):
        if not pos and word in self.words:
            return word
        if (not pos or pos == 'V') and word in self.verbs:
            return self.verbs[word]
        if pos.startswith('AJ') and word[-1] == 'ی':
            return word
        if pos == 'PRO':
            return word
        if word in self.words:
            return word
        # Fall back to the stemmer; keep the stem only if it is a known word.
        stem = self.stemmer.stem(word)
        if stem and stem in self.words:
            return stem
        return word

    def conjugations(self, verb):
        past, present = verb.split('#')
        ends = ['م', 'ی', '', 'یم', 'ید', 'ند']

        if verb == '#هست':
            return ['هست' + end for end in ends] + ['نیست' + end for end in ends]

        past_simples = [past + end for end in ends]
        past_imperfects = ['می' + item for item in past_simples]
        ends = ['هام', 'های', 'ه', 'هایم', 'هاید', 'هاند']
        past_narratives = [past + end for end in ends]
        imperatives = ['ب' + present, 'ن' + present]

        if present.endswith('ا') or present in ('آ', 'گو'):
            present = present + 'ی'

        ends = ['م', 'ی', 'د', 'یم', 'ید', 'ند']
        present_simples = [present + end for end in ends]
        present_imperfects = ['می' + item for item in present_simples]
        present_subjunctives = [item if item.startswith('ب') else 'ب' + item for item in present_simples]
        present_not_subjunctives = ['ن' + item for item in present_simples]

        with_nots = lambda items: items + list(map(lambda item: 'ن' + item, items))
        aa_refinement = lambda items: list(map(lambda item: item.replace('بآ', 'بیا').replace('نآ', 'نیا'), items)) \
            if items[0].startswith('آ') else items
        return aa_refinement(
            with_nots(past_simples) + with_nots(present_simples) + with_nots(past_imperfects)
            + with_nots(past_narratives) + with_nots(present_simples) + with_nots(present_imperfects)
            + present_subjunctives + present_not_subjunctives + imperatives)
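
# A minimal usage sketch (not part of the original source). It assumes the word and
# verb lists referenced by default_words and default_verbs are available, as in hazm;
# the sample tokens below are only illustrative.
lemmatizer = Lemmatizer()
print(lemmatizer.lemmatize('کتابها'))          # a noun not in the word list falls back to the stemmer
print(lemmatizer.lemmatize('میروم', pos='V'))  # a verb form is resolved via the conjugation table built in __init__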
# words.append("наука учённые инженер врач открытие исследование разработка создание грант установка") words.append( "спорт соревнования олимпиада спортсмен рекорд достижение медаль награждение футбол воллейбол плавание" " стадион бег баскетбол гонки прыжок падение тренер сезон турнир финал лига чемпион" ) TP = textProcessor() ST = Stemmer() words = [w.lower() for w in words] # Переводим все строки в нижний регистр words = TP.remove_symbols(words) # Удаляем стоп-символы words = TP.remove_stopwords(words) # Удаляем стоп-слова stemmed = [] for sentence in words: s = [ST.stem(i) for i in sentence] # Производится стемминг stemmed.append(s) keys = TP.remove_unique( stemmed, freq) # Удаление слов, встречающихся во всех документах более freq раз/ # По умолчанию частота freq=1 равна еденице # Получаем массив ключевых слов - термов table, disp_table = TP.table_generator( keys, stemmed) # Формируем частотную матрицу - table # И таблицу частоты встречаемости - disp_table LA = numpy.linalg freqMatrix = numpy.array(table) terms, s, docs = LA.svd(freqMatrix, full_matrices=False) assert numpy.allclose(freqMatrix,