Example #1
def dict_ingest(path_to_dict):
    noun = []
    verb = []
    adjective = []
    adverb = []
    miscel = []
    # Bucket each dictionary word by the first part of speech that matches.
    with open(path_to_dict, 'r') as f:  # close the file automatically
        for line in f:
            word = line.strip()
            if en.is_noun(word):
                noun.append(word)
            elif en.is_verb(word):
                verb.append(word)
            elif en.is_adjective(word):
                adjective.append(word)
            elif en.is_adverb(word):
                adverb.append(word)
            else:
                miscel.append(word)
    print(noun[:5])
    print(verb[:5])
    print(adjective[:5])
    print(adverb[:5])
    print(miscel[:5])
    return noun, verb, adjective, adverb, miscel
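A minimal usage sketch, assuming the NodeBox Linguistics en module is importable and words.txt is a hypothetical one-word-per-line dictionary file:

import en  # NodeBox Linguistics

nouns, verbs, adjectives, adverbs, misc = dict_ingest('words.txt')
print(len(nouns))  # how many dictionary entries en.is_noun() recognized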
Example #2
def simplify_word(a):

    # If the word conjugates as a verb, return its present tense.
    try:
        try_present_verb = en.verb.present(a)
        if en.is_verb(try_present_verb):
            return try_present_verb
    except Exception:  # not a known verb; keep checking
        pass

    # If the word is a noun, return its singular form.
    try_singular_noun = en.noun.singular(a)
    if en.is_noun(try_singular_noun):
        return try_singular_noun

    # Keep words already recognizable as a noun, verb, adjective, adverb, or connective.
    if en.is_noun(a) or en.is_verb(a) or en.is_adjective(a) or en.is_adverb(a) or en.is_connective(a):
        return a

    return ''
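A quick usage sketch (assuming import en; the outputs are what the checks above should produce for common words, not verified against a specific word list):

print(simplify_word('running'))  # expected 'run': verb reduced to present tense
print(simplify_word('cats'))     # expected 'cat': noun reduced to singular
print(simplify_word('qwerty'))   # expected '': unrecognized words are dropped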
Example #3
def is_a_expression(self, word):
    return self.is_a_hash_tag(word) \
           or self.is_negation(word) \
           or en.is_noun(word) \
           or en.is_adjective(word) \
           or en.is_verb(word) \
           or en.is_adverb(word) \
           or self.is_orality(word)
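The project-specific helpers (is_a_hash_tag, is_negation, is_orality) come from the surrounding class; the reusable part is the open-class word test. A standalone sketch of just that part (hypothetical function name):

import en

def is_open_class(word):
    # True only for content words: nouns, adjectives, verbs, adverbs.
    return en.is_noun(word) or en.is_adjective(word) \
        or en.is_verb(word) or en.is_adverb(word)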
Example #4
    def giveNearestEmotion(self, word):
        if en.is_verb(word):
            return en.verb.is_emotion(word, boolean=False)

        if en.is_adverb(word):
            return en.adverb.is_emotion(word, boolean=False)

        if en.is_adjective(word):
            return en.adjective.is_emotion(word, boolean=False)

        return en.noun.is_emotion(word, boolean=False)
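Per the keyword used above, boolean=False makes is_emotion() return the nearest emotion word rather than True/False. A hedged usage sketch of the underlying call (the sample word and result are illustrative, not verified against the library's emotion list):

import en

print(en.adjective.is_emotion('furious', boolean=False))  # e.g. 'anger', if listed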
Example #5
File: views.py Project: mitnk/mc
def get_gloss(word):
    if en.is_verb(word):
        return en.verb.gloss(word)
    elif en.is_adjective(word):
        return en.adjective.gloss(word)
    elif en.is_adverb(word):
        return en.adverb.gloss(word)
    elif en.is_noun(word):
        return en.noun.gloss(word)
    else:
        return en.wordnet.gloss(word)
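A usage sketch, assuming the gloss() functions return WordNet definition strings, which is what this view appears to rely on:

import en

print(get_gloss('run'))    # verb branch wins: en.verb.gloss('run')
print(get_gloss('happy'))  # adjective branch: en.adjective.gloss('happy')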
Example #6
from en import is_noun, is_adjective, is_adverb, noun  # NodeBox Linguistics

def get_article(word, tokens, index):
    article_index = index - 1

    if index <= 0:
        return tokens[0]

    if not is_noun(word) and not is_adjective(word) and not is_adverb(word):
        return tokens[article_index]

    if tokens[article_index] == 'a' or tokens[article_index] == 'an':
        proper_article = noun.article(word).split()[0]
        return proper_article

    return tokens[article_index]
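A usage sketch, assuming noun.article() returns the word prefixed with its indefinite article (e.g. 'an apple'), so .split()[0] extracts just the article:

tokens = ['a', 'apple', 'fell']
print(get_article('apple', tokens, 1))  # -> 'an', correcting the original 'a'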
Example #7
def simplify_word(a):
    # Keep words already recognizable as a noun, verb, adjective, adverb, or connective.
    if en.is_noun(a) or en.is_verb(a) or en.is_adjective(a) or en.is_adverb(a) or en.is_connective(a):
        return a

    # If the word conjugates as a verb, return its present tense.
    try:
        try_present_verb = en.verb.present(a)
        if en.is_verb(try_present_verb):
            return try_present_verb
    except Exception:  # not a known verb; keep checking
        pass

    # If the word is a noun, return its singular form.
    try_singular_noun = en.noun.singular(a)
    if en.is_noun(try_singular_noun):
        return try_singular_noun

    otherwordlist.append(a)  # module-level list of unrecognized words
    return a
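Unlike Example #2, this variant first keeps any word that is already a recognized part of speech, only then tries verb and noun normalization, and on failure returns the word unchanged (recording it in the module-level otherwordlist) rather than returning an empty string.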
Example #8
def convertVerb(srclst):
    dstlst = []
    itemnew = ""
    for item in srclst:
        # print(item)  # uncomment to find the word that makes the library fail
        if en.is_verb(item) \
            and (not en.is_noun(item)) \
            and (not en.is_adjective(item)) \
            and (not en.is_adverb(item)) \
            and (item not in WIERDWORDS):
            try:
                itemnew = en.verb.present(item)
            except Exception:
                print("unrecognized word: " + item)
                itemnew = item
        else:
            itemnew = item
        dstlst.append(itemnew)
    return dstlst
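A usage sketch, assuming import en and a module-level WIERDWORDS collection (spelled as in the source) of words the library should leave untouched:

WIERDWORDS = set()  # hypothetical: words known to trip up the library
print(convertVerb(['walked', 'dog', 'quickly']))  # e.g. ['walk', 'dog', 'quickly']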
Example #9
def simplify_word(a):

    # If the word conjugates as a verb, return its present tense.
    try:
        try_present_verb = en.verb.present(a)
        if en.is_verb(try_present_verb):
            return try_present_verb
    except Exception:  # not a known verb; keep checking
        pass

    # If the word is a noun, return its singular form.
    try_singular_noun = en.noun.singular(a)
    if en.is_noun(try_singular_noun):
        return try_singular_noun

    # Keep words already recognizable as a noun, verb, adjective, adverb, or connective.
    if en.is_noun(a) or en.is_verb(a) or en.is_adjective(a) or en.is_adverb(a) or en.is_connective(a):
        return a

    otherwordlist.append(a)  # module-level list of unrecognized words
    return a
Example #10
from en import is_noun, is_verb, is_adjective, is_adverb  # NodeBox Linguistics

def valid_pos(word):
    # Reject short words that no part-of-speech check recognizes; long words pass regardless.
    if (not is_noun(word) and not is_verb(word) and not is_adjective(word)
            and not is_adverb(word) and len(word) < 7):
        return False

    return True
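A quick usage sketch (results assume the POS checks fail for these made-up strings):

print(valid_pos('qwrt'))      # False: unrecognized and shorter than 7 letters
print(valid_pos('qwrtyuio'))  # True: unrecognized words of 7+ letters pass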
Example #11
def is_major(word):
    return en.is_verb(word) or en.is_adjective(word) or \
           en.is_adverb(word) or (word in MODAL_VERBS)
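MODAL_VERBS comes from the surrounding project; a minimal sketch of what it plausibly contains:

MODAL_VERBS = {'can', 'could', 'may', 'might', 'must',
               'shall', 'should', 'will', 'would'}  # hypothetical contents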
Example #12
def get_frequncy_dist(dir_path):
    files = os.listdir(dir_path)

    all_words = 0
    words_wt_freq = {}
    # Gather word frequencies from every .srt subtitle file in the directory.
    for filename in files:
        if (filename.endswith('.srt')):
            file_handler = open(os.path.join(dir_path, filename), 'r')
            for line in file_handler:
                for word in line.strip().split():
                    sword = word.strip(punctuation)
                    if (sword.isalpha()):
                        lword = sword.lower()
                        words_wt_freq[lword] = words_wt_freq.get(lword, 0) + 1
                        all_words += 1
            file_handler.close()
    logger.debug('# all words: ' + str(all_words - 1))
    logger.debug('# unique words: ' + str(len(words_wt_freq.keys())))
    lexical_diversity_for_freq(words_wt_freq.values())
    
    lemmatized_words_wt_freq = {}
    lemmatizer = nltk.WordNetLemmatizer()  # build once instead of once per word
    for word in words_wt_freq.keys():
        lemmatized_word = lemmatizer.lemmatize(word)
        if (word != lemmatized_word and lemmatized_word is not None):
            lemmatized_words_wt_freq[lemmatized_word] = lemmatized_words_wt_freq.get(lemmatized_word, 0) + words_wt_freq.get(word)
            #print(lemmatized_word, word)
        else:
            lemmatized_words_wt_freq[word] = words_wt_freq.get(word)
    lemmatized_size = len(lemmatized_words_wt_freq.keys())
    logger.debug('# words after lemmatized: ' + str(lemmatized_size) + " diff: " + str(len(words_wt_freq.keys()) - lemmatized_size))
    lexical_diversity_for_freq(lemmatized_words_wt_freq.values())
    words_wt_freq = {} # Save memory

    
    stopwords_en = stopwords.words('english')
    male_names = names.words('male.txt')
    female_names = names.words('female.txt')
    comparative = swadesh.words('en')
    ignore_list = []
    ignore_list.extend(stopwords_en)
    ignore_list.extend(male_names)
    ignore_list.extend(female_names)
    ignore_list.extend(comparative)
    filtered_words = []

    out_file = open(os.path.join(dir_path, 'wfd.csv'), 'w')
    out_file.write('Word, Type, Frequency\n')
        
    for word in lemmatized_words_wt_freq.keys():
        if len(word) > 2 and word not in ignore_list:
            filtered_words.append(word)   
        else:
            out_file.write(word + ',stop words,' + str(lemmatized_words_wt_freq.get(word)) + '\n')
    logger.debug('# words after filtering stop words: ' + str(len(filtered_words)) + " diff: " + str(len(lemmatized_words_wt_freq.keys()) - len(filtered_words)))
    ignore_list = [] #save memory

    # WordNet has about 155k words; drop anything it does not know.
    usual_words = []
    for word in  filtered_words:
        if (len(wordnet.synsets(word)) != 0):
            usual_words.append(word)
        else:
            out_file.write(word + ',not in wordnet,' + str(lemmatized_words_wt_freq.get(word)) + '\n')
    logger.debug('# words after filtering unused words: ' + str(len(usual_words)) + " diff: " + str(lemmatized_size - len(usual_words)))
    filtered_words = [] # save memory 

    tag_filtered_words_wt_freq = {}
    words_wt_tags = nltk.pos_tag(usual_words)
    for (word, tag) in words_wt_tags:
        if (tag not in ['EX', 'DET', 'CNJ', 'FW', 'MD', 'NP', 'NUM', 'PRO', 'P', 'TO', 'UH', 'WH', 'WP', 'NNP', 'MOD']):
            if(en.is_adverb(word)):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]  
                #print ('ADV,' + word)
            elif (en.is_adjective(word)):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]  
                #print ('ADJ,' + word)
            elif (en.is_verb(word)):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]  
                #print ('VB,' + word)
            elif (en.is_noun(word)):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]  
                #print ('N,' + word) 
            else:
                if (tag in ['VBZ', 'NNS']):
                    if word.endswith('s'):
                        new_word = word[:-1]
                        tag_filtered_words_wt_freq[new_word] = lemmatized_words_wt_freq[word] + tag_filtered_words_wt_freq.get(new_word, 0)
                        #print (word , new_word,tag)    
                elif (tag == 'VBG'):
                    new_word = en.verb.infinitive(word)
                    if new_word is not None and word != new_word:
                        tag_filtered_words_wt_freq[new_word] = lemmatized_words_wt_freq[word] + tag_filtered_words_wt_freq.get(new_word, 0)
                elif (tag == 'JJS'):
                    if word.endswith('est'):
                        new_word = word[:-3]
                        tag_filtered_words_wt_freq[new_word] = lemmatized_words_wt_freq[word] + tag_filtered_words_wt_freq.get(new_word, 0)     
                else:
                    tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]        
                    #print (word,tag)   
        else:
            out_file.write(word + ',unwanted pos,' + str(lemmatized_words_wt_freq.get(word)) + '\n')
    logger.debug('# words after filtering unwanted pos: ' + str(len(tag_filtered_words_wt_freq.keys())) + " diff: " + str(len(usual_words) - len(tag_filtered_words_wt_freq.keys())))
    lexical_diversity_for_freq(tag_filtered_words_wt_freq.values())
    lemmatized_words_wt_freq = {} # save memory
    usual_words = [] #save memory

    basic_english_vocab = en.basic.words
    non_basic_words = set(tag_filtered_words_wt_freq.keys()).difference(basic_english_vocab)
    non_basic_words_wt_freq = {}
    for non_basic_word in non_basic_words:
        non_basic_words_wt_freq[non_basic_word] = tag_filtered_words_wt_freq[non_basic_word] 
    words_in_both = set(tag_filtered_words_wt_freq.keys()).intersection(basic_english_vocab)
    for word in words_in_both:
        out_file.write(word + ',en.basic.words,' + str(tag_filtered_words_wt_freq.get(word)) + '\n')
    logger.debug('# words after filtering basic words: ' + str(len(non_basic_words_wt_freq.keys())) + " diff: " + str(len(tag_filtered_words_wt_freq.keys()) - len(non_basic_words_wt_freq.keys())))
    lexical_diversity_for_freq(non_basic_words_wt_freq.values())
    tag_filtered_words_wt_freq = {} #save memory


    fh = open(os.path.join(base.app_root(), 'etc', 'basic_words.csv'), 'r')
    my_words = [word.lower() for line in fh for word in line.strip().split()]
    fh.close()
    new_words = set(non_basic_words).difference(my_words)
    words_in_both = set(non_basic_words).intersection(my_words)
    for word in words_in_both:
        out_file.write(word + ',en.basic.words.mine,' + str(non_basic_words_wt_freq.get(word)) + '\n')    
    new_words_wt_freq = {}
    for new_word in new_words:
        new_words_wt_freq[new_word] = non_basic_words_wt_freq[new_word] 
    logger.debug('# words after filtering my words: ' + str(len(new_words_wt_freq.keys())) + " diff: " + str(len(non_basic_words_wt_freq.keys()) - len(new_words_wt_freq.keys())))
    lexical_diversity_for_freq(new_words_wt_freq.values())
    
    sorted_words = sorted(new_words_wt_freq.items(), key=itemgetter(1, 0))
    for (word, frequency) in sorted_words:
        out_file.write(word + ',lexicon,' + str(frequency) + '\n')
    out_file.close()
    
    return new_words_wt_freq
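A usage sketch for the whole pipeline, assuming logger, base.app_root(), lexical_diversity_for_freq(), and the NLTK corpora imports are wired up as in the surrounding project (the directory path is hypothetical):

freq = get_frequncy_dist('C:\\subtitles')  # scans the .srt files in that folder
for word, count in sorted(freq.items(), key=lambda kv: -kv[1])[:10]:
    print(word, count)  # the ten most frequent surviving lexicon words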