Example 1
def _pos_tag(words_all):
    """POS-tag each tokenized sentence in words_all; returns one tag list per sentence."""
    from nltk import pos_tag as pt

    n_sent = len(words_all)
    pos_tags_all = [[] for _ in range(n_sent)]

    for i, words in enumerate(words_all):
        # pos_tag returns a list of (token, tag) tuples for one sentence
        pos_tags_all[i] = pt(words)

    return pos_tags_all
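
A minimal usage sketch, assuming the caller passes a list of already-tokenized sentences (the sample sentences below are purely illustrative):

from nltk import word_tokenize

sentences = ["The cat sat on the mat.", "Dogs bark."]
words_all = [word_tokenize(s) for s in sentences]
tagged = _pos_tag(words_all)  # one list of (token, tag) tuples per input sentence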
Example 2
def add_lemmatizer():
    # word_topic_file, word_topic_lexeme_file, max_line_num, id_word_dict,
    # word_id_dict, word_lexeme_id_dict, pt (nltk.pos_tag), penn_to_wn and
    # WordNetLemmatizer (from nltk.stem) come from the top of this module.
    in_fp = open(word_topic_file)
    out_fp = open(word_topic_lexeme_file, 'w')
    wnl = WordNetLemmatizer()

    line_num = 0
    while line_num < max_line_num:
        line = in_fp.readline()
        if not line:  # stop early if the file is shorter than max_line_num
            break
        line = line.strip()
        line_words = line.split(' ')
        line_write = ''
        for words in line_words:
            # each token has the form "<word_id>:<topic_id>"
            word_topic = words.split(':')
            word_id = word_topic[0]
            topic_id = word_topic[1]
            line_write += word_id + ':' + topic_id + ':'
            if word_id in id_word_dict:
                word = id_word_dict[word_id]
                if word in word_lexeme_id_dict:
                    # lemma id for this word is already cached
                    line_write += word_lexeme_id_dict[word] + ' '
                else:
                    # POS-tag the single word, map the Penn tag to a WordNet
                    # POS and lemmatize it
                    tag = pt([word])[0][1]
                    lexeme = wnl.lemmatize(word, penn_to_wn(tag))
                    if lexeme in word_id_dict:
                        lexeme_id = word_id_dict[lexeme]
                        word_lexeme_id_dict[word] = lexeme_id
                        line_write += lexeme_id + ' '
                    else:
                        # the lemma is not in the vocabulary: fall back to the word id
                        word_lexeme_id_dict[word] = word_id
                        line_write += word_id + ' '
        line_write = line_write.strip()
        out_fp.write(line_write)
        if line_num < max_line_num - 1:
            out_fp.write('\n')
        line_num += 1
        if line_num % 1000 == 0:
            print('line:', line_num)
    in_fp.close()
    out_fp.close()
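
The helper penn_to_wn used above is not shown in this excerpt. A minimal sketch of such a mapping from Penn Treebank tags to the WordNet POS constants accepted by WordNetLemmatizer (an assumption about the original, defaulting to noun):

from nltk.corpus import wordnet

def penn_to_wn(tag):
    # map a Penn Treebank tag to a WordNet POS constant (assumed behaviour)
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN  # default for nouns and anything else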
Example 4
def word_freq(self):
    """
    Single-word frequency without any context. The resulting distribution is
    used to report the top 100 most repeated words; tokens that are not
    tagged as nouns, and words in self.loose_words, are filtered out first.
    :return: the frequency distribution (nltk.FreqDist)
    """
    # pt is nltk.pos_tag; FreqDist comes from nltk
    classified_text = pt(self.clean_words)
    noun_descriptor = [word for word, pos in classified_text if pos == 'NN']
    revised_noun_descriptor = [word for word in noun_descriptor if word not in self.loose_words]
    self.fdist = FreqDist(revised_noun_descriptor)
    return self.fdist
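
A usage sketch tying this to the top-100 report mentioned in the docstring; the instance name analyzer is hypothetical and stands for an object of the class this method belongs to:

fdist = analyzer.word_freq()
for word, count in fdist.most_common(100):  # FreqDist.most_common works like Counter.most_common
    print(word, count)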
Example 5
# text = fsm_m1.read()
# module_test = 1

# print("%-20s %-8s %s\n" %("Token", "Tag", "Corrected Tag"))

# for sent in tagged_data:
#     for word in sent:
#         print("%-20s %-8s %s" %(word[0], word[1], word[1]))

# fsm_m1.close()

# aliases assumed for the names used below
import re
from nltk import pos_tag as pt
from nltk import tokenize as tk

with open('data/FMFS_Module_1_Verified_Post_Verbatim_TScript.txt', mode='r') as f:
    # skip the first three lines (details about the module)
    for i in range(3):
        f.readline()

    module_text = f.read()
    # add spaces around "/" between letters and after sentence-ending periods
    module_text = re.sub(r'([A-Za-z])/([a-zA-Z])', r'\1 / \2', module_text)
    module_text = re.sub(r'([a-z])\.([A-Z])', r'\1. \2', module_text)
    sentences = tk.sent_tokenize(module_text)

    tagged_data = []

    for sent in sentences:
        tagged_data.append(pt(tk.word_tokenize(sent)))
    # print(tagged_data)
    for sent in tagged_data:
        for token in sent:
            print("%-25s %-4s" % (token[0], token[1]))
Example 6
    # tail of a character-filter helper (its def line is not part of this excerpt):
    # keep digits, ASCII letters, tab and space
    return (48 <= order <= 57) or (65 <= order <= 90) or (97 <= order <= 122) or order == 9 or order == 32

def intersection(keywords, text):
    # build a binary presence vector: 1 if the keyword occurs in the text, else 0
    resultVec = []
    textSet = set(text)
    for i in keywords:
        if i in textSet:
            resultVec.append(1)
        else:
            resultVec.append(0)
    return resultVec

if __name__ == '__main__':
    # loadModule, loadWords, asciify and movieReviewer are defined earlier in
    # this script; os and argv (from sys) are imported at the top (not shown)
    loadModule(argv[1])
    os.system("echo -n 'loading keywords...\t\t'")
    wordList = loadWords(argv[1])
    os.system("echo -n '[done]\n'")

    review = asciify(open(input("Please enter the review location: "), 'r').read()).split()
    if argv[1] == 'P':
        # in POS mode, replace each token with its (token, tag) pair
        review = pt(review)
    X = intersection(wordList, review)
    result = movieReviewer.predict(X)[0]

    if result > 0:
        print("Good movie!")
        # print(result)
    else:
        print("BAD movie!")
        # print(result)
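
A quick sanity check of the intersection() helper above; it returns a 0/1 vector aligned with the keyword list:

keywords = ['good', 'bad', 'boring']
review_tokens = 'a good but boring movie'.split()
print(intersection(keywords, review_tokens))  # -> [1, 0, 1]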
Example 7
        # tail of the same intersection() helper shown in the previous example;
        # its first lines are not part of this excerpt
        if i in textSet:
            resultVec.append(1)
        else:
            resultVec.append(0)
    return resultVec

os.system("echo -n 'loading training files:\n[05%'")
files = os.listdir('./train/pos')[:3500]
total = float(len(files)) * 2
done = 0
progress = 0

for i in files:
    f = asciify(open('./train/pos/' + i, 'r').read()).split()
    if argv[1] == 'P':
        f = pt(f)
    X.append(intersection(wordList, f))
    y.append(1)
    done += 1
    if (done / total * 100 >= progress + 5):
        progress += 5
        os.system("echo -n '\b\b\b=%d%%'" % progress)

files = os.listdir('./train/neg')[:3500]

# same as above, but the negative reviews are labelled -1
for i in files:
    f = asciify(open('./train/neg/' + i, 'r').read()).split()
    if argv[1] == 'P':
        f = pt(f)
    X.append(intersection(wordList, f))
    y.append(-1)
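
X and y built above are the training vectors and labels. Assuming movieReviewer is a scikit-learn-style classifier (an assumption; it is defined elsewhere in the script, and LinearSVC below is only a hypothetical choice), fitting would look roughly like:

from sklearn.svm import LinearSVC  # hypothetical choice of classifier

movieReviewer = LinearSVC()
movieReviewer.fit(X, y)  # X: list of 0/1 keyword vectors, y: +1 / -1 labels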
Example 8
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 12 15:32:07 2013

@author: ibews
"""
from nltk import pos_tag as pt

t = ['Dies', 'ist', 'eine', 'Beispiel', 'Game', '.']

# lengths of the tokens that contain the letter "a"
a = [len(x) for x in t if "a" in x]

# What is meant by "types of length 4"?
# Is a list of the types of "words with length == 4" what is being asked for?
b = sorted(set(x[1] for x in pt(t) if len(x[0]) == 4))