Example #1
import re
import sys
import timeit
import winsound

def main(inputFile):
  print("Start making emoticon map")
  map_emoticon = generateEmotMap('emoticon_id.txt')
  print("Finished")

  print("Start making senti map")
  map_senti = generateSentiMap(['boosterwords_id.txt', 'idioms_id.txt', 'negatingword.txt', 'sentiwords_id.txt'])
  print("Finished")
  
  print("Start making abbreviation dictionary for bahasa")
  corrector = Corrector('singkatan.dic')
  print("Finished")

  print("Start making stopword dictionary for bahasa")
  cutter = Cutter('stopword.txt')
  print("Finished")

  print("Start making stemmer for bahasa")
  stemmer = Stemmer()
  print("Finished")
  
  output_file = sys.argv[2] + '.txt'
  file_read = open(str(inputFile), "r", encoding='utf-8')
  file_write = open(output_file, "w", encoding='utf-8')
  start = timeit.default_timer()
  review_number = 0
  for line in file_read.readlines():
    review_number += 1
    user_rating = line.split('<>')[0]
    sentence_number = 0
    header_string = 'REVIEW-' + str(review_number) + ' ' + str(user_rating)
    body_string = ''
    # file_write.write('REVIEW-' + str(review_number) + ' [rating] ' + str(user_rating) + '\n')
    review = line.split('<>')[1]
    review = erase_question_sentence(review) # Erase question sentence
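    # Split the review into sentences on '.' or '?', while skipping periods that
    # belong to dotted abbreviations or single-letter initials.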
    for i, sentence in enumerate(re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', review)):
      print("Processing sentence " + str(i+1) + ' from review ' + str(review_number))
      sentence = sentence.lower()
      sentence = corrector.correct(sentence, map_emoticon, map_senti).strip()
      sentence = cutter.cut(sentence, map_emoticon, map_senti).strip()
      sentence = stemmer.stem(sentence, map_emoticon, map_senti).strip()
      if sentence != '':
        # file_write.write(sentence + "\n")
        body_string = body_string + sentence + '\n'
        sentence_number += 1
    header_string = header_string + ' ' + str(sentence_number) + '\n'
    output_string = header_string + body_string
    file_write.write(output_string)
  stop = timeit.default_timer()
  print("Running time: " + str(stop - start))
  print("Finished.\nOutput file: " + output_file)
  file_read.close()
  file_write.close()
  duration = 1000  # millisecond
  freq = 440  # Hz
  winsound.Beep(freq, duration)
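A minimal sketch of how main might be driven from the command line, assuming the surrounding module uses the usual entry-point guard; the script and file names are illustrative, and only the use of sys.argv[2] for the output name comes from the code above.

if __name__ == '__main__':
  # Illustrative invocation (hypothetical file names):
  #   python preprocess.py reviews_id.txt preprocessed
  main(sys.argv[1])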
Example #2
    ''' feature extraction '''
    para_vector = []
    stemmed_sent = []
    para_sents = words_list(file, stemmed_sent)
    fd = nltk.FreqDist(stemmed_sent)  #finding thematic words
    thematic_words = sorted(fd.items(), key=operator.itemgetter(1))
    ten_thematic_words = thematic_words[-10:]
    thematics = []
    for i in range(len(ten_thematic_words)):
        thematics.append(ten_thematic_words[-(i + 1)])
    dict_thematics = dict(thematics)  # map each thematic word to its frequency
    #print(thematics)
    title_words = []  #finding the title words
    for w in para_sents[0]:
        if w not in stop_words:
            title_words.append(stemmer.stem(w))
    len_sents = []
    # Collect sentence lengths to find the maximum sentence length in the paragraph.
    for sentence in para_sents[1:]:
        len_sents.append(len(sentence))

    sentense_number = 0  # finding sentence location
    for sentence in para_sents[1:]:
        if para_sents.index(sentence) == 1:  # checks whether this is the first sentence
            sent_vector = [1, 1 / (len(para_sents) - 1)]
        else:
            sent_vector = [0]
            sent_vector.append(para_sents.index(sentence) / (len(para_sents) - 1))
        # Assumed completion: the original snippet is cut off after this division;
        # normalising by the longest sentence (max of len_sents above) matches the
        # length feature described earlier.
        sent_vector.append(len(sentence) / max(len_sents))
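A small self-contained illustration of the position and length features this loop computes, using made-up sentence data; none of these names appear in the original.

para = [['Title'], ['First', 'sentence'], ['A', 'much', 'longer', 'second', 'sentence']]
max_len = max(len(s) for s in para[1:])
for idx, sent in enumerate(para[1:], start=1):
    is_first = 1 if idx == 1 else 0       # 1 only for the first sentence
    position = idx / (len(para) - 1)      # relative position in the paragraph
    rel_length = len(sent) / max_len      # length relative to the longest sentence
    print([is_first, position, rel_length])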
Example #3
class Lemmatizer(object):

    def __init__(self, words_file=default_words, verbs_file=default_verbs, joined_verb_parts=True):
        self.verbs = {}
        self.stemmer = Stemmer()

        tokenizer = WordTokenizer(words_file=words_file, verbs_file=verbs_file)  # use the caller-supplied word list
        self.words = tokenizer.words

        if verbs_file:
            self.verbs['است'] = '#است'
            for verb in tokenizer.verbs:
                for tense in self.conjugations(verb):
                    self.verbs[tense] = verb
            if joined_verb_parts:
                for verb in tokenizer.verbs:
                    bon = verb.split('#')[0]
                    for after_verb in tokenizer.after_verbs:
                        self.verbs[bon + 'ه_' + after_verb] = verb
                        self.verbs['ن' + bon + 'ه_' + after_verb] = verb
                    for before_verb in tokenizer.before_verbs:
                        self.verbs[before_verb + '_' + bon] = verb

    def lemmatize(self, word, pos=''):
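        # Lookup order: return known words as-is, map conjugated verbs to their
        # 'past#present' form, pass adjectives ending in 'ی' and pronouns through
        # unchanged, then fall back to the stemmer.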
        if not pos and word in self.words:
            return word

        if (not pos or pos == 'V') and word in self.verbs:
            return self.verbs[word]

        if pos.startswith('AJ') and word[-1] == 'ی':
            return word

        if pos == 'PRO':
            return word

        if word in self.words:
            return word

        stem = self.stemmer.stem(word)
        if stem and stem in self.words:
            return stem

        return word

    def conjugations(self, verb):

        past, present = verb.split('#')
        ends = ['م', 'ی', '', 'یم', 'ید', 'ند']

        if verb == '#هست':
            return ['هست' + end for end in ends] + ['نیست' + end for end in ends]

        past_simples = [past + end for end in ends]
        past_imperfects = ['می‌' + item for item in past_simples]
        ends = ['ه‌ام', 'ه‌ای', 'ه', 'ه‌ایم', 'ه‌اید', 'ه‌اند']
        past_narratives = [past + end for end in ends]

        imperatives = ['ب' + present, 'ن' + present]

        if present.endswith('ا') or present in ('آ', 'گو'):
            present = present + 'ی'

        ends = ['م', 'ی', 'د', 'یم', 'ید', 'ند']  # personal endings for the present stem
        present_simples = [present + end for end in ends]
        present_imperfects = ['می‌' + item for item in present_simples]
        present_subjunctives = [item if item.startswith('ب') else 'ب' + item for item in present_simples]
        present_not_subjunctives = ['ن' + item for item in present_simples]

        with_nots = lambda items: items + list(map(lambda item: 'ن' + item, items))
        aa_refinement = lambda items: list(map(lambda item: item.replace('بآ', 'بیا').replace('نآ', 'نیا'), items)) if \
            items[0].startswith('آ') else items
        return aa_refinement(
            with_nots(past_simples) + with_nots(present_simples) + with_nots(past_imperfects) + with_nots(
                past_narratives) + with_nots(present_simples) + with_nots(
                present_imperfects) + present_subjunctives + present_not_subjunctives + imperatives)
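A short usage sketch: this class appears to be the hazm Lemmatizer, and the outputs in the comments are the results typically documented for the default word and verb lists (treat them as assumptions if the data files differ).

lemmatizer = Lemmatizer()
print(lemmatizer.lemmatize('کتاب‌ها'))  # کتاب
print(lemmatizer.lemmatize('می‌روم'))   # رفت#رو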
Example #4
# words.append("наука учённые инженер врач открытие исследование разработка создание грант установка")
words.append(
    "спорт соревнования олимпиада спортсмен рекорд достижение медаль награждение футбол воллейбол плавание"
    " стадион бег баскетбол гонки прыжок падение тренер сезон турнир финал лига чемпион"
)

TP = textProcessor()
ST = Stemmer()

words = [w.lower() for w in words]  # Convert every string to lower case
words = TP.remove_symbols(words)  # Remove stop symbols
words = TP.remove_stopwords(words)  # Remove stopwords

stemmed = []
for sentence in words:
    s = [ST.stem(i) for i in sentence]  # Stem every token in the sentence
    stemmed.append(s)

keys = TP.remove_unique(
    stemmed,
    freq)  # Remove words that occur in every document more than freq times
# (by default freq = 1); the result is the array of key words (terms)

table, disp_table = TP.table_generator(
    keys, stemmed)  # Build the frequency matrix (table)
# and the occurrence-frequency table (disp_table)
LA = numpy.linalg
freqMatrix = numpy.array(table)
terms, s, docs = LA.svd(freqMatrix, full_matrices=False)
# The snippet is truncated here; the standard check reconstructs the matrix
# from the SVD factors (assumed completion).
assert numpy.allclose(freqMatrix,
                      numpy.dot(terms, numpy.dot(numpy.diag(s), docs)))
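A short follow-on sketch of how these factors are typically used for latent semantic analysis: keep the top k singular values and project terms and documents into the reduced space. The value of k and the variable names below are illustrative, not from the original script.

k = 2  # number of latent dimensions to keep (illustrative)
terms_k = terms[:, :k]                               # term vectors in the reduced space
docs_k = numpy.dot(numpy.diag(s[:k]), docs[:k, :])   # document vectors in the reduced space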