Example #1
    def lemma(self, text):

        lemmatizer = nltk.WordNetLemmatizer()
        text = [[lemmatizer.lemmatize(w) for w in doc] for doc in text]

        return text
Example #2

#joining all strings into one string
rawText = " ".join(lst)

#tokenize rawtext words
tokens = nltk.word_tokenize(rawText)
text = nltk.Text(tokens)

# Remove extra chars and remove stop words.
stopWords = prepareStopWords()
text_content = [word for word in text if word not in stopWords]
# Remove any entries where the len is zero.
text_content = [s for s in text_content if len(s) != 0]
#get the lemmas of each word to reduce the number of similar words
WNL = nltk.WordNetLemmatizer()
text_content = [WNL.lemmatize(t) for t in text_content]

# =============================================================================
# UNIGRAM
# =============================================================================

unigram_strg = " ".join(text_content)

# Setting word cloud params and plotting the word cloud.
WC_height = 500
WC_width = 1000
WC_max_words = 100

unigram_wordcloud = WordCloud(max_font_size=50,
                              max_words=100,
                              height=WC_height,
                              width=WC_width).generate(unigram_strg)  # build the unigram word cloud from the text above

Example #3
def raw_preprocess(textfile):
    #order of functions:
    #read in x
    #tokenise x
    #normalize x
    #perform tests on most common words x
    #contractions (needs to come before removal of punctuation and negation) x
    #remove stopwords x
    #negation (needs to come before removal of punctuation) x
    #lemmatize x
    #remove punctuation x
    #remove other words x
    #tag x 4: default, RE, unigram, bigram x

    domain = open(textfile)  #read in data from text file
    raw_domain = domain.read()

    tokens = nltk.word_tokenize(raw_domain)  #tokenise
    words = [w.lower() for w in tokens]  #normalize

    wnl = nltk.WordNetLemmatizer()

    #find the lexical diversity of the debate
    def lex_diversity(text):
        return len(set(text)) / len(text) * 100

    #find the most common collocations within the debate
    def collocations(text):
        finder = BigramCollocationFinder.from_words(text)
        return sorted(
            finder.above_score(bigram_measures.raw_freq,
                               3.0 / len(tuple(nltk.bigrams(text)))))

    #process and replace contracted words with full phrases
    def contractions(text):
        #list of contractions
        contractions = [['don', 't', 'do', 'not'], ['can', 't', 'can', 'not'],
                        ['isn', 't', 'is', 'not'], ['aren', 't', 'are', 'not'],
                        ['wasn', 't', 'was', 'not'],
                        ['weren', 't', 'were', 'not'],
                        ['hasn', 't', 'has', 'not'],
                        ['haven', 't', 'have', 'not'],
                        ['hadn', 't', 'had',
                         'not'], ['won', 't', 'will', 'not'],
                        ['wouldn', 't', 'would', 'not'],
                        ['doesn', 't', 'does', 'not'],
                        ['didn', 't', 'did', 'not'],
                        ['couldn', 't', 'could', 'not'],
                        ['shouldn', 't', 'should', 'not'],
                        ['mightn', 't', 'might', 'not'],
                        ['mustn', 't', 'must', 'not'],
                        ['would', 've', 'would', 'have'],
                        ['could', 've', 'could', 'have'],
                        ['should', 've', 'should', 'have'],
                        ['I', 'm', 'I', 'am'], ['I', 'll', 'I', 'will'],
                        ['I', 'd', 'I', 'would'], ['I', 've', 'I', 'have'],
                        ['I', 'd', 'I', 'had'], ['you', 're', 'you', 'are'],
                        ['you', 'll', 'you', 'will'],
                        ['you', 'd', 'you', 'would'],
                        ['you', 've', 'you',
                         'have'], ['you', 'd', 'you', 'had'],
                        ['he', 's', 'he', 'is'], ['he', 'll', 'he', 'will'],
                        ['he', 'd', 'he', 'would'], ['he', 's', 'he', 'has'],
                        ['he', 'd', 'he', 'had'], ['she', 's', 'she', 'is'],
                        ['she', 'll', 'she', 'will'],
                        ['she', 'd', 'she', 'would'],
                        ['she', 's', 'she', 'has'], ['she', 'd', 'she', 'had'],
                        ['it', 's', 'it', 'is'], ['it', 'll', 'it', 'will'],
                        ['it', 'd', 'it', 'would'], ['it', 's', 'it', 'has'],
                        ['it', 'd', 'it', 'had'], ['we', 're', 'we', 'are'],
                        ['we', 'll', 'we', 'will'], ['we', 'd', 'we', 'would'],
                        ['we', 've', 'we', 'have'], ['we', 'd', 'we', 'had'],
                        ['they', 're', 'they', 'are'],
                        ['they', 'll', 'they', 'will'],
                        ['they', 'd', 'they', 'would'],
                        ['they', 've', 'they', 'have'],
                        ['they', 'd', 'they', 'had'],
                        ['that', 's', 'that', 'is'],
                        ['that', 'll', 'that', 'will'],
                        ['that', 'd', 'that', 'would'],
                        ['that', 's', 'that', 'has'],
                        ['that', 'd', 'that',
                         'had'], ['who', 's', 'who', 'is'],
                        ['who', 'll', 'who',
                         'will'], ['who', 'd', 'who', 'would'],
                        ['who', 's', 'who', 'has'], ['who', 'd', 'who', 'had'],
                        ['what', 's', 'what', 'is'],
                        ['what', 're', 'what', 'are'],
                        ['what', 'll', 'what', 'will'],
                        ['what', 'd', 'what', 'would'],
                        ['what', 's', 'what', 'has'],
                        ['what', 'd', 'what', 'had'],
                        ['where', 's', 'where', 'is'],
                        ['where', 're', 'where', 'are'],
                        ['where', 'll', 'where', 'will'],
                        ['where', 'd', 'where', 'would'],
                        ['where', 's', 'where', 'has'],
                        ['where', 'd', 'where', 'had'],
                        ['when', 's', 'when', 'is'],
                        ['when', 're', 'when', 'are'],
                        ['when', 'll', 'when', 'will'],
                        ['when', 'd', 'when', 'would'],
                        ['when', 's', 'when', 'has'],
                        ['when', 'd', 'when',
                         'had'], ['why', 's', 'why', 'is'],
                        ['why', 're', 'why', 'are'],
                        ['why', 'll', 'why', 'will'],
                        ['why', 'd', 'why',
                         'would'], ['why', 's', 'why', 'has'],
                        ['why', 'd', 'why', 'had'], ['how', 's', 'how', 'is'],
                        ['how', 're', 'how', 'are'],
                        ['how', 'll', 'how', 'will'],
                        ['how', 'd', 'how',
                         'would'], ['how', 's', 'how', 'has'],
                        ['how', 'd', 'how', 'had']]

        location = 0
        for word in text:
            if word == '’':
                for (before, after, newbefore, newafter) in contractions:
                    if text[location - 1] == before:
                        if text[location + 1] == after:
                            text[location - 1] = newbefore
                            text[location + 1] = newafter
            location += 1
        return text

    #append NOT_ to each word that follows 'not' within a sentence or clause
    def negation(text):  # apply negation
        for w in range(0, len(text)):  #for word in the text
            if (text[w] == 'not'):  #if that word is not - add more??
                n = 1
                not_ = w
                while True:  #while still in sentence/word clause and not at end of text
                    text[not_ +
                         n] = 'NOT_' + text[not_ + n]  #add 'NOT' to each word
                    n += 1
                    if ((not_ + n) >= len(text)): break
                    if text[not_ + n] in ('NOT_.', '.', 'NOT_?', '?', 'NOT_!',
                                          '!', 'NOT_,', ',', 'NOT_:', ':',
                                          'NOT_;',
                                          ';'):  #until end of sentence
                        break
        return text

    #common, potentially skewing, words in text
    words_to_remove = ['baroness', 'lord', 'lords', 'noble']

    words = contractions(
        words)  #apply above function: deal with contractions and apostrophes
    words = [
        word for word in words
        if word not in stopwords.words('english') or word == 'not'
    ]  #remove stopwords but keep 'not'
    words = negation(words)  #deal with negations in the text
    words = [wnl.lemmatize(w) for w in words]  #lemmatize
    words = [
        word for word in words if word.isalpha() or word.startswith('NOT_')
    ]  #remove punctuation
    #print("distinct words: ", distinct_words(words))
    #print("lexical diversity: ",lex_diversity(words),'%')
    #print("collocations: ",collocations(words))
    words = [word for word in words if word not in words_to_remove]
    words = [
        re.sub(r'^NOT_-?[0-9]+$', 'NOT_', word) for word in words
    ]  #strip numbers attached to the NOT_ prefix (uses the re module)
    words = [re.sub(r'^NOT_(\.|,|!|\?)$', 'NOT_', word) for word in words]  #strip punctuation attached to the NOT_ prefix
    words = [word for word in words if word != 'NOT_']  #drop bare 'NOT_' tokens

    #print(words)
    #TAG REMAINING WORDS

    #find more frequent (therefore default) tagger
    bts = brown.tagged_sents(categories='news', tagset='universal')
    tags = [
        t
        for (w, t) in brown.tagged_words(categories='news', tagset='universal')
    ]
    fd = nltk.FreqDist(tags)
    fd.most_common(1)  #most common is NOUN

    #regular expressions tagger
    patterns = [
        (r'.+ing$', 'VERB'),  #	gerunds
        (r'.+ed$', 'VERB'),  #    past tense of verbs
        (r'.+es$', 'VERB'),  #    present tense
        (r'.+ould$', 'VERB'),  #    modal verb
        (r'.+\'s$', 'NOUN'),  #    possessive
        (r'.+s$', 'NOUN'),  #	 plural nouns
        (r'^-?[0-9]+$', 'NUM'),  #     cardinal numbers
        (r'.+ly$', 'ADV'),  #     adverbs
        (r'(^the$|^a$|^wh)', 'DET'),  # determiner
        (r'^[A-Z]', 'NOUN'),  #     proper names
        (r'(^he$|^she$|^they$|^him$|^her$|^his$|^hers$|^theirs$)',
         'PRO'),  #personal pronouns
        (r'\b(at|in|of)(?:\w+\s){0,3}([A-Z]\w+)', 'PPO'),  # prepositions
        (r'(^can$|^may$|^must$|^should$|^would$|^could$)', 'MOD'),  #modals
        (r'(\.|\,|!\?)', '.')
    ]

    #unigram
    size = int(len(bts) * 0.9)
    train = bts[:size]
    test = bts[size:]
    unigram_tagger = nltk.UnigramTagger(train)
    unigram_tagger.evaluate(test)

    #bring in context with n-gram taggers (note the sparse-data problem)
    #bigram
    bg_tag = nltk.BigramTagger(train)
    bg_tag.evaluate(test)
    bg_tagged_sents = [bg_tag.tag(s) for s in words]
    bad_tags = [s for s in bg_tagged_sents if None in [tag for (w, tag) in s]]
    bad_tags[0]

    #combine different taggers to get the most accurate
    t0 = nltk.DefaultTagger('NOUN')
    t1 = nltk.RegexpTagger(patterns, backoff=t0)
    t2 = nltk.UnigramTagger(train, backoff=t1)
    t3 = nltk.BigramTagger(train, backoff=t2)
    t3.evaluate(test)

    tagged_words = [t3.tag(words)]

    #count up the NOUN/VERB/ADV/ADJ in the text
    #c = tuple(i for i in tagged_words[0])
    #d = Counter(elem[1] for elem in c)
    #print(d)
    #print(tagged_words)
    return tagged_words
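A usage sketch for this example (the filename here is hypothetical; raw_preprocess expects a plain-text transcript):

tagged = raw_preprocess('debate_transcript.txt')
print(tagged[0][:10])  # first ten (word, tag) pairs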
Example #4
def lemmatize(tokens):
    lemmatizer = nltk.WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token, pos=get_wordnet_pos(token)) for token in tokens]
    return tokens
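This example relies on a get_wordnet_pos helper that is not included in the snippet. A minimal sketch, assuming it tags the single token with nltk.pos_tag and maps the tag to a WordNet POS constant, defaulting to noun:

import nltk
from nltk.corpus import wordnet

def get_wordnet_pos(token):
    # first letter of the Penn Treebank tag assigned to this single token
    tag = nltk.pos_tag([token])[0][1][0].upper()
    # map J/N/V/R to WordNet constants; anything else is treated as a noun
    tag_map = {'J': wordnet.ADJ, 'N': wordnet.NOUN, 'V': wordnet.VERB, 'R': wordnet.ADV}
    return tag_map.get(tag, wordnet.NOUN)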
Example #5
import nltk
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()
print('\ndir(wn):')
print(dir(wn))
Example #6
def data_predict(text):
    print('---------------------'+text+'-------------------')
    fullCorpus = pd.read_csv('data/final.tsv', sep='|', header=None)
    fullCorpus.columns = ['label', 'body_text']
    fullCorpus.head()

    print("input data has {} rows columns".format(len(fullCorpus), len(fullCorpus.columns)))

    print("Out of {} rows , {} are happy , {} are sad , {} are angry , {} are calm".format(len(fullCorpus),
                                                             len(fullCorpus[fullCorpus['label'] == 'happy']),
                                                             len(fullCorpus[fullCorpus['label'] == 'sad']),
                                                             len(fullCorpus[fullCorpus['label'] == 'angry']),
                                                             len(fullCorpus[fullCorpus['label'] == 'calm'])))

    print("Number of null in label: {}".format(fullCorpus['label'].isnull().sum()))
    print("Number of null in label: {}".format(fullCorpus['body_text'].isnull().sum()))

    def convert_to_lowercase(text):
        input_str = text
        input_str = input_str.lower()
        return input_str

    fullCorpus['body_text_lower'] = fullCorpus['body_text'].apply(lambda x: convert_to_lowercase(x))

    # print separate file for lowercase output
    print('----------------------------Print in the lowercase.txt-----------------------------------------')
    file = open('pre-process_output/lowercase.txt', 'w',encoding="utf-8")
    lowercased = fullCorpus['body_text_lower']
    for index, val in lowercased.iteritems():
        line = str(index) + '\t' + str(val)
        file.write(line + '\n')
    file.close()

    def remove_numbers(text):
        result = re.sub(r'\d+', '', text)
        return result

    fullCorpus['body_text_no_numbers'] = fullCorpus['body_text_lower'].apply(lambda x: remove_numbers(x))
    # print('finished')

    # print separate file for without-numbers output
    print('----------------------------Print in the no_numbers.txt-----------------------------------------')
    file = open('pre-process_output/no_numbers.txt', 'w',encoding="utf-8")
    no_numbers = fullCorpus['body_text_no_numbers']
    for index, val in no_numbers.iteritems():
        line = str(index) + '\t' + str(val)
        file.write(line + '\n')
    file.close()

    # remove punctuation
    string.punctuation
    def remove_punct(text):
        text_nopunct = "".join([char for char in text if char not in string.punctuation])
        return text_nopunct
    fullCorpus['body_text_no_punctuation'] = fullCorpus['body_text_no_numbers'].apply(lambda x: remove_punct(x))
    fullCorpus.head(25)

    # print separate file for remove-punctuation output
    print('----------------------------Print in the no_punctuation.txt-----------------------------------------')
    file = open('pre-process_output/no_punctuation.txt', 'w',encoding="utf-8")
    punctuationed = fullCorpus['body_text_no_punctuation']
    for index, val in punctuationed.iteritems():
        line = str(index)+'\t'+str(val)
        file.write(line+'\n')
    file.close()


    # tokenization
    def tokenize(text):
        tokens = re.split(r'\W+', text)
        return tokens
    fullCorpus['body_text_tokenized'] = fullCorpus['body_text_no_punctuation'].apply(lambda x: tokenize(x.lower()))
    fullCorpus.head()

    # print separate file for tokenize output
    print('----------------------------Print in the tokenize.txt-----------------------------------------')
    file = open('pre-process_output/tokenize.txt', 'w',encoding="utf-8")
    tokenized = fullCorpus['body_text_tokenized']
    for index, val in tokenized.iteritems():
        line = str(index)+'\t'+str(val)
        file.write(line+'\n')
    file.close()


    # stop word
    stopword = nltk.corpus.stopwords.words('english')
    def remove_stopwords(tokenized_list):
        # for word in tokenized_list:
        #     if word not in stopword:
        #         print(word)
        text = [word for word in tokenized_list if word not in stopword]
        return text
    fullCorpus['body_text_nostop'] = fullCorpus['body_text_tokenized'].apply(lambda x: remove_stopwords(x))
    fullCorpus.head()

    # print separate file for remove-stopwords output
    print('----------------------------Print in the nostop.txt-----------------------------------------')
    file = open('pre-process_output/nostop.txt', 'w',encoding="utf-8")
    nostoped = fullCorpus['body_text_nostop']
    for index, val in nostoped.iteritems():
        line = str(index)+'\t'+str(val)
        file.write(line+'\n')
    file.close()


    #lemmatizing
    wn = nltk.WordNetLemmatizer()

    def clean_text(tokeized_text):
        text = [wn.lemmatize(word) for word in tokeized_text]
        return text

    def join_text(sentence):
        return ' '.join(sentence)

    def data_lematization():
        fullCorpus['body_text_lemmatized'] = fullCorpus['body_text_nostop'].apply(lambda x: clean_text(x))
        fullCorpus['body_text_lemmatized'] = fullCorpus['body_text_lemmatized'].apply(lambda x: join_text(x))
    data_lematization()

    # print separate file for lemmatization output
    print('----------------------------Print in the lemmatized.txt-----------------------------------------')
    file = open('pre-process_output/lemmatized.txt', 'w',encoding="utf-8")
    cleaned=fullCorpus['body_text_lemmatized']
    for index, val in cleaned.iteritems():
        line = str(index)+'\t'+val
        file.write(line+'\n')
    file.close()


    # separate training and testing data
    trainData = fullCorpus['body_text_lemmatized'][:1500]
    testData = fullCorpus['body_text_lemmatized'][1500:]
    train_labels = fullCorpus['label'][:1500]
    test_labels = fullCorpus['label'][1500:]

    # print train data and test data
    print("Train data count:\n", train_labels.value_counts())
    print("Test data count:\n", test_labels.value_counts())

    #extract Unigrams
    unigram_vectorized = CountVectorizer(stop_words="english",analyzer="word",ngram_range=(1,1), max_df=1.0,min_df=1,max_features=None)
    count_unigram = unigram_vectorized.fit(trainData)
    unigrams = unigram_vectorized.transform(trainData)

    #extract Bigrams
    bigram_vectorized = CountVectorizer(stop_words="english",analyzer="word",ngram_range=(2,2), max_df=1.0,min_df=1,max_features=None)
    count_bigram = bigram_vectorized.fit(trainData)
    bigrams = bigram_vectorized.transform(trainData)

    #extract Trigrams
    trigram_vectorized = CountVectorizer(stop_words="english",analyzer="word",ngram_range=(3,3), max_df=1.0,min_df=1,max_features=None)
    count_trigram = trigram_vectorized.fit(trainData)
    trigrams = trigram_vectorized.transform(trainData)

    # unigram,bigram and trigram as together
    full_vectorized = CountVectorizer(stop_words="english",analyzer="word",ngram_range=(1,3), max_df=1.0,min_df=1,max_features=None)
    count_full = full_vectorized.fit(trainData)
    full = full_vectorized.transform(trainData)

    # unigram frequency
    unigram_fre = TfidfTransformer().fit(unigrams)
    transformer_unigrams = unigram_fre.transform(unigrams)

    #Bigram frequency
    bigram_fre = TfidfTransformer().fit(bigrams)
    transformer_bigrams = bigram_fre.transform(bigrams)

    #Trigram frequency
    trigram_fre = TfidfTransformer().fit(trigrams)
    transformer_trigrams = trigram_fre.transform(trigrams)

    #full set frequency
    full_fre = TfidfTransformer().fit(full)
    transformer_full = full_fre.transform(full)

    # define model for n-grams
    u_model = MultinomialNB().fit(transformer_unigrams,train_labels)
    b_model = MultinomialNB().fit(transformer_bigrams,train_labels)
    t_model = MultinomialNB().fit(transformer_trigrams,train_labels)
    f_model = MultinomialNB().fit(transformer_full,train_labels)


    # sample for testing comment
    test_sample = [text]

    #Unigram probability counting
    uni_test_CountVectorized = unigram_vectorized.transform(test_sample)
    uni_test_fre = unigram_fre.transform(uni_test_CountVectorized)
    unimodel_test_CountVectorized = unigram_vectorized.transform(testData)
    unimodel_test_fre = unigram_fre.transform(unimodel_test_CountVectorized)

    #Bigram probability counting
    bi_test_CountVectorized = bigram_vectorized.transform(test_sample)
    bi_test_fre = bigram_fre.transform(bi_test_CountVectorized)
    bimodel_test_CountVectorized = bigram_vectorized.transform(testData)
    bimodel_test_fre = bigram_fre.transform(bimodel_test_CountVectorized)

    #Trigram probability counting
    tri_test_CountVectorized = trigram_vectorized.transform(test_sample)
    tri_test_fre = trigram_fre.transform(tri_test_CountVectorized)
    trimodel_test_CountVectorized = trigram_vectorized.transform(testData)
    trimodel_test_fre = trigram_fre.transform(trimodel_test_CountVectorized)

    #Full set probability counting
    full_test_CountVectorized = full_vectorized.transform(test_sample)
    full_test_fre = full_fre.transform(full_test_CountVectorized)
    fullmodel_test_CountVectorized = full_vectorized.transform(testData)
    fullmodel_test_fre = full_fre.transform(fullmodel_test_CountVectorized)


    # output final results
    print("\n        Unigram Model result:")
    unigram_result = u_model.predict(uni_test_fre)
    print(unigram_result)
    unigrammodel_result = u_model.predict(unimodel_test_fre)
    print (classification_report(test_labels, unigrammodel_result))

    print("\n            Bigram Model result:")
    bigram_result = b_model.predict(bi_test_fre)
    print(bigram_result)
    bigrammodel_result = b_model.predict(bimodel_test_fre)
    print (classification_report(test_labels, bigrammodel_result))

    print("\n            Trigram Model result:")
    trigram_result = t_model.predict(tri_test_fre)
    print(trigram_result)
    trigrammodel_result = t_model.predict(trimodel_test_fre)
    print (classification_report(test_labels, trigrammodel_result))

    print("\n            Full Model result:")
    full_set_result = f_model.predict(full_test_fre)
    print(full_set_result)
    full_model_result = f_model.predict(fullmodel_test_fre)
    print (classification_report(test_labels, full_model_result))
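A usage sketch for this example, assuming data/final.tsv and the pre-process_output/ directory exist as the function expects (the sample sentence is made up):

data_predict("i am so happy with how everything turned out today")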
Example #7
import re
import string
import typing # for TypeAliases/pseudotypes such as Match
from typing import *
import nltk
from toolz import pipe

_LEMMATISER = nltk.WordNetLemmatizer()

def functional_cleaner(ticket_description: str) -> List[str]:
    """
    Takes a ticket description as an argument, passes it through a functional
    pipeline that sanitises, tokenises and lemmatises it, and returns the result
    as a list of strings.
    """
    s_list = re.split(r'\s+', ticket_description.lower()) # the input list (*TO LOWER CASE!*)
    return text_cleaning_pipeline(s_list)


def text_cleaning_pipeline(s_list: List[str]) -> List[str]:
    """
    The actual pipeline through which the list of unigrams is passed.
    """
    return pipe( 
            s_list, # ie the argument being passed through the pipeline
            transform_relevant_URLs_into_tags,
            remove_twitter_contacts,
            remove_email_addresses,
            remove_URLs,
            remove_non_printable_hex,
            remove_stopwords,
Example #8
bad_symbols = re.compile(
    '[^0-9a-z #+_]')  #regex matching the special characters to remove
stopword = [
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
    "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
    'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers',
    'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',
    'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',
    "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be',
    'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did',
    'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as',
    'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'between',
    'to', 'from', 'then', 'here', 'there', 'when', 'where', 'so', 'than'
]  #Identify stopwords to remove, you can identify your own here
ps = nltk.PorterStemmer()  #Load word stemmer
wn = nltk.WordNetLemmatizer()  # Load word lemmatizer


def clean(text):
    text = text.lower()  # lowercase text
    text = re.sub('[0-9]+', '', text)
    text = BeautifulSoup(text, "html.parser").text  # HTML decoding
    text = replace_with_space.sub(
        ' ', text)  # replace symbols matched by replace_with_space with a space
    text = bad_symbols.sub(
        '', text)  # delete symbols matched by bad_symbols from text
    return text  #function that conducts all text cleaning


def contract(text):
    text = [wn.lemmatize(word) for word in text]
    return text  # return the lemmatized tokens

Example #9
def lemmatize(text):
    wnl = nltk.WordNetLemmatizer()
    return [(wnl.lemmatize(w), t) for (w, t) in text]
Example #10
def cleanThisTextLemmatized(rawText, removeProperNouns=False):
    print("Tokenize")
    tokenized = nltk.word_tokenize(rawText)
    tokenized = [w.lower() for w in tokenized if w.isalpha()]
    #list of words.
    #-punctuation is in its own element
    #-breaks up "It's" -> ["It", "'s']
    #-does not stem plurals

    print("Remove stop words")
    from nltk.corpus import stopwords
    swEn = stopwords.words('english')  #includes "don" "s".  all lowercase
    tokenized_noSW = [w for w in tokenized if w not in swEn]

    print("Part-of-speech tagging (wait...)")
    pos_tagged = nltk.pos_tag(tokenized_noSW)
    #default tags from https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
    #tag_types = set(t for (w,t) in pos_tagged)

    pos_tagged_simple = [(w, get_wordnet_pos(w, t)) for (w, t) in pos_tagged]

    #mistagged = [(w,t) for (w,t) in pos_tagged if len(t)<2]
    #this is often empty, but some common words like "young" are not tagged correctly (e.g. as "IN")

    #make all nouns
    #pos_tagged_simple2 = [(w, t if t != '' else wn.NOUN) for (w, t) in pos_tagged_simple]

    #mistagged_freq = nltk.FreqDist((w,t) for (w, t) in pos_tagged_simple if t == '')
    #print(mistagged_freq.most_common())
    # #brown corpus categories:
    # # [u'mystery', u'belles_lettres', u'humor', u'government', u'fiction', u'reviews', u'religion', u'romance', u'science_fiction', u'adventure', u'editorial', u'hobbies', u'lore', u'news', u'learned']
    # brown_tagged = nltk.corpus.brown.tagged_words(categories='fiction', tagset='universal')
    # print([(w,t) for (w,t) in brown_tagged])

    #nText = nltk.Text(rawText)

    print("Lemmatize")
    wnl = nltk.WordNetLemmatizer()  #WordNet's lemmatizer
    lemmatized = [
        wnl.lemmatize(w, t) for (w, t) in pos_tagged_simple if t != ''
    ]

    #OLD: a lot of errors when the word could be used as an adjective
    #assert('.' not in tokenized)
    #wnl = nltk.WordNetLemmatizer()  #WordNet's lemmatizer
    #lemmatized = [wnl.lemmatize(t.lower()) for t in tokenized]  #wishlist: use POS tagger to improve this

    #leaves some noise: remembered (POS issue?), unicode versions of words

    if removeProperNouns:
        print("Remove proper nouns")
        text_vocab = set(w.lower() for w in lemmatized if w.isalpha())
        english_vocab = set(w.lower() for w in nltk.corpus.words.words())
        unusual = text_vocab.difference(english_vocab)
        #not completely OK.  misses girls, constitutes

        removed_proper = [w for w in lemmatized if w not in unusual]

        return removed_proper
    else:
        return lemmatized
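This example assumes a two-argument get_wordnet_pos helper that is not shown. A minimal sketch, assuming it maps a Penn Treebank tag to a WordNet POS constant and returns an empty string when there is no mapping, which the comprehension above then filters out with t != '':

from nltk.corpus import wordnet

def get_wordnet_pos(word, treebank_tag):
    # the word is accepted only to match the call site; the mapping depends on the tag alone
    mapping = {'J': wordnet.ADJ, 'N': wordnet.NOUN, 'V': wordnet.VERB, 'R': wordnet.ADV}
    # '' signals "no usable WordNet POS" so the caller can skip the token
    return mapping.get(treebank_tag[0], '')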
Example #11
def cleanText(text, acronymFile, topic_id):
    #Convert text into lowercase
    text = text.lower()
    text = ' '+text    # insert space at the beginning, because text can start with acronyms, and acronym search assumes a whitespace symbol in front of acronyms
    text    = re.sub('[\s]',' ',text)   # replace all \s=[ \t\n\r\f\v] symbols with whitespace ' '
    #Load dictionary of acronyms and abbrs
    if not os.path.isfile(acronymFile):
        pass
        #print('***Acronym/abbreviation file does not exist! Acronyms and abbrs will not be expanded! (In normalize.py)***')
    else:
        temp_reader = open(acronymFile,'r')
        abbr_list   = json.load(temp_reader)
        temp_reader.close()
        #Expand abbreviations and acronyms
        for abbr in abbr_list:
            if abbr in text:
                text = text.replace(abbr,abbr_list[abbr])
    #protect unidentified (not in our list) abbrs and acronyms by replacing periods with underscores, A.B.C.D.E.F. => A_B_C_D_E_F_
    text    = re.sub('_',' ',text)                              # remove underscores before Abbr/Acronym protection
    #text    = re.sub(' ii ',' 2 ',text)
    #text    = re.sub(' iii ',' 3 ',text)
    #text    = re.sub(' iv ',' 4 ',text)
    #text    = re.sub(' vi ',' 6 ',text)
    #text    = re.sub(' vii ',' 7 ',text)
    #text    = re.sub(' viii ',' 8 ',text)
    #text    = re.sub(' ix ',' 9 ',text)
    text    = re.sub(' [a-z]\. ',' ',text)                      # Remove single characters with period, Ex: 'M. Jordan' =>' Jordan'
    text    = re.sub(' [a-z]\.\'s ',' ',text)                   # Ex: ' T.'s '=>' '
    text    = re.sub('([a-z])\.([a-z])','\g<1>_\g<2>',text)     # Ex: A.B. => A_B., A.B.C. => A_B.C., A.B.C.D. => A_B.C_D., A.B.C.D.E. => A_B.C_D.E.,
    text    = re.sub('_([a-z])\.','_\g<1>_',text)               # Ex: (after preprocessing of previous line) A_B. => A_B_, A_B.C. => A_B_C., A_B.C_D. => A_B_C_D_, A_B.C_D.E. => A_B_C_D_E, will cover all even number of letter abbs
    text    = re.sub('_([a-z])\.','_\g<1>_',text)               # Ex: (after preprocessing of two previous lines) A_B_C. => A_B_C_, A_B_C_D_E. => A_B_C_D_E_, will cover all odd number of letters abbs

    #Remove phrases enclosed in brackets, e.g.: (word1 word2), <word3 word4>, etc. Non-greedy approach
    text    = re.sub('(\[.*?\]|\(.*?\)|\{.*?\}|<.*?>)','',text)
    text    = re.sub(' +',' ',text)     #remove multiple spaces

    #Separate floating numbers with 'point', e.g.: 1.4 => 1 point 4, .5 => 0 point 5
    #text    = re.sub('(\d)\.(\d)','\g<1> point \g<2>',text)
    #text   = re.sub('( +)\.(\d)',' 0 point \g<2>',text)

    #Protect decimal points
    text    = re.sub('(\d)\.(\d)','\g<1>_\g<2>',text)

    #Join thousands, eg: 1,000,000 => 1000000
    text    = re.sub('(\d),(\d)','\g<1>\g<2>',text)

    #Put space between decimal and non decimal, eg: 50%=>5 %, 4am=>4 am, 100$=>100 $, $1=>1 $,  #5=># 5, 0s => 0 s $
    #text    = re.sub('(\d)(\D)','\g<1> \g<2>',text)
    #text    = re.sub('(\$)(\d+)','\g<2> \g<1>',text)
    #text    = re.sub('([^\s\d]+)(\d)','\g<1> \g<2>',text)
    #text    = re.sub(' +',' ',text)     #remove multiple spaces

    #Replace symbols, eg: 2 $=>2 dollars,1 $=>1 dollars, 5 %=>5 percents, G&G =>G and G,  etc.
    #text    = re.sub('\$','dollars', text)
    #text   = re.sub('','euros', text)
    #text   = re.sub('','pounds', text)
    #text   = re.sub('','yuans', text)
    #text    = re.sub('%','percents', text)
    #text    = re.sub(' +',' ',text)            #remove multiple spaces
    text    = re.sub(' &amp; ',' and ', text)   #TDT2 uses '&AMP;' symbol for ampersand
    text    = re.sub(' & ',' and ', text)       #TDT2 uses '&AMP;' symbol for ampersand
    text    = re.sub(' +',' ',text)             #remove multiple spaces

    #Math equations, eg: 4 + 5 = 9, 4 plus 5 equals 9, 1/1 => 1 over 1
    #text    = re.sub('(\d) *\+ *(\d)','\g<1> plus \g<2>', text)
    #text    = re.sub('(\d) *- *(\d)','\g<1> minus \g<2>', text)
    #text    = re.sub('(\d) *\* *(\d)','\g<1> product \g<2>', text)
    #text    = re.sub('(\d) */ *(\d)','\g<1> over \g<2>', text)
    #text    = re.sub('(\d) *= *(\d)','\g<1> equals \g<2>', text)
    #text    = re.sub('(\d) *> *(\d)','\g<1> more than \g<2>', text)
    #text    = re.sub('(\d) *< *(\d)','\g<1> less than \g<2>', text)
    #text   = re.sub('(\d) >= (\d)','\g<1> more or equal to \g<2>', text)
    #text   = re.sub('(\d) <= (\d)','\g<1> less or equal to \g<2>', text)
    #text    = re.sub(' +',' ',text)     #remove multiple spaces

    #Fill some words
    #text    = re.sub("( |^)\'cause( |$)",' because ',text)
    #text    = re.sub("\'ll( |$)",' will ', text)
    #text    = re.sub("( |^)can\'t( |$)",' can not ', text)
    #text    = re.sub('( |^)cannot( |$)',' can not ', text)
    #text    = re.sub("\'re( |$)",' are ', text)
    #text    = re.sub("( |^)he\'s( |$)",' he is ', text)
    #text    = re.sub("( |^)she\'s( |$)",' she is ', text)
    #text    = re.sub("it\'s",'it is', text)
    #text    = re.sub("( |^)i\'m( |$)",' i am ', text)
    #text    = re.sub("( |^)isn\'t( |$)",' is not ', text)
    #text    = re.sub("( |^)aren\'t( |$)",' are not ', text)
    #text    = re.sub("( |^)doesn\'t( |$)",' does not ', text)
    #text    = re.sub("( |^)don\'t( |$)",' do not ', text)
    #text    = re.sub("( |^)didn\'t( |$)",' did not ', text)
    #text    = re.sub("( |^)wasn\'t( |$)",' was not ', text)
    #text    = re.sub("( |^)won\'t( |$)",' will not ', text)
    #text    = re.sub("( |^)shouldn\'t( |$)",' should not ', text)
    #text    = re.sub("( |^)haven\'t( |$)",' have not ', text)
    #text    = re.sub("( |^)hasn\'t( |$)",' has not ', text)
    #text    = re.sub("( |^)couldn\'t( |$)",' could not ', text)
    #text    = re.sub("( |^)wouldn\'t( |$)",' would not ', text)
#   #text    = re.sub("i\'d",'i would', text)
#   #text    = re.sub("i\'d",'i had', text)

    #Replaces other symbols with whitespace, symbols untouched: NONE
    #text    = re.sub('(\++|-+|,+|\*+|:+|/+|>+|<+|=+|\^+|%+|\$+|#+|@+|\|+|~+|`+|\"+|`+|\'s|\(+|\)+|\[+|\]+|{+|}+|<+|>+|&lr|&ur|&qc|&qr|\'+)',' ',text)
    #text    = re.sub('(\++|-+|,+|\*+|:+|/+|>+|<+|=+|\^+|%+|\$+|#+|@+|\|+|~+|`+|\"+|`+|\(+|\)+|\[+|\]+|{+|}+|<+|>+|&lr|&ur|&qc|&qr| \' |\'\'+)',' ',text)
    text    = re.sub('(,+|/+|>+|<+|\^+|#+|@+|\|+|~+|`+|\"+|`+|\(+|\)+|\[+|\]+|{+|}+|&lr|&ur|&qc|&qr| \' |\'\'+|-+|\++)',' ',text)
    text    = re.sub(' +',' ',text) #replace multiple spaces with single space
    text    = re.sub('(^\'+ | \'+ | \'+ +\'+ | \'+$)',' ',text) # remove quotes
    text    = re.sub(' +',' ',text) #replace multiple spaces with single space

    #Collapse multiple periods, e.g.: '...' => '.'
    text    = re.sub('\.\.+','.',text)
    text    = re.sub(' +',' ',text)     #replace multiple spaces with single space

    #num to words conversion
    #for word in text.split():
    #    if str(word.encode('utf-8')).isdigit():
    #        text = text.replace(word, num2words(float(word)),1)
    #text    = re.sub('\d+',' ',text)   # remove all the numbers

    #Replace the rest symbols with the period, in order to split sentences by period later.
    text    = re.sub('(!+|\?+|;+|:+|\.+)','.', text)
    text    = re.sub(' +',' ',text)     #remove multiple spaces

    #Remove leading and trailing whitespace
    text = text.strip()

    # Stemming and lemmatization
    if 0:
        porter  = nltk.PorterStemmer()
        text    = [porter.stem(t) for t in text.split()]
        wnl     = nltk.WordNetLemmatizer()
        text    = [wnl.lemmatize(t) for t in text]
        text    = ' '.join(text)

    #Convert to one line per sentence manner
    text    = text.replace('.','\n')
    text    = re.sub(' +',' ',text)     #remove multiple spaces
    text    = text.splitlines()
    for line in text:
        if line.strip() != '':
            line_tmp = re.sub(' +',' ',line)
            line_tmp = re.sub('_','.',line_tmp)     # put back periods for abbrs/acros and floating point numbers
            text[text.index(line)] = topic_id+' '+line_tmp.strip()
        else:
            text[text.index(line)] = ''

    text = [line for line in text if line.strip() != '']

    text = '\n'.join(text)

    return text.strip()
Example #12
for i in stopwords.words('english'):
    sw_dict[i] = 1

# In[19]:

# In[20]:

import re
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from stemming.porter2 import stem

lemi_words = []
stem_words = []
lmtzr = nltk.WordNetLemmatizer().lemmatize
ps = PorterStemmer()


def do_lemitise(word):
    old_word = word
    word = lmtzr(word)
    if word != old_word:
        lemi_words.append((old_word, word))

    return word


def do_stem(word):
    old_word = word
    word = stem(word)
    if word != old_word:
        stem_words.append((old_word, word))

    return word

Example #13
def lemmatizer():
    wnl = nltk.WordNetLemmatizer()
    return wnl
Example #14
def main():
    if len(sys.argv) < 2:
        return usage()
    print("[.] Checking dependenices")
    try:
        check_dependencies()
    except ValueError as e:
        print("  [-] Checking failed:")
        print("    |", e)
        return

    path_to_script = os.path.dirname(os.path.realpath(__file__))
    path_to_nltk = os.path.join(path_to_script, "nltk_data")
    nltk.data.path = [path_to_nltk] + nltk.data.path

    path_to_book = os.path.abspath(sys.argv[1])
    if os.path.exists(sys.argv[1]) == False:
        print("[-] Wrong path to book: {}".format(path_to_book))
        sys.exit()

    book_name = os.path.basename(path_to_book)
    book_name_without_ex = os.path.splitext(book_name)[0]
    result_dir_name = "{}-WordWised".format(book_name_without_ex)
    result_dir_path = os.path.join(os.path.dirname(path_to_book),
                                   result_dir_name)
    new_book_path = os.path.join(result_dir_path, book_name)

    if os.path.exists(result_dir_path):
        shutil.rmtree(result_dir_path)

    if not os.path.exists(result_dir_path):
        os.makedirs(result_dir_path)

    print("[.] Getting ASIN")
    try:
        book_asin = get_book_asin(path_to_book)
    except WiseException as e:
        print("  [-] Can't get ASIN:")
        for item in e.desc:
            print("    |", item)
        return

    if book_asin is None:
        print("  [-] The original book doesn't have ASIN:")
        print("    [.] Converting mobi 2 mobi to generate ASIN")
        try:
            cmd_str = "{} {} {}".format('ebook-convert', path_to_book,
                                        new_book_path)
            out = subprocess.check_output(cmd_str, shell=True)
        except Exception as e:
            print("    [-] Failed to convert mobi 2 mobi:")
            print("      |", e)
            return
        path_to_book = new_book_path
        book_asin = get_book_asin(path_to_book)
    else:
        shutil.copyfile(path_to_book, new_book_path)

    print("[.] Getting rawml content of the book")
    try:
        book_content = get_rawml_content(path_to_book)
    except WiseException as e:
        print("  [-] Can't get rawml content:")
        print("    |", e)
        return

    sdr_dir_name = "{}.sdr".format(book_name_without_ex)
    sdr_dir_path = os.path.join(result_dir_path, sdr_dir_name)
    if not os.path.exists(sdr_dir_path):
        os.makedirs(sdr_dir_path)

    LangLayerDb = LanguageLayerDB(sdr_dir_path, book_asin)

    print("[.] Collecting words")
    parser = RawmlRarser(book_content)
    words = parser.parse()
    count = len(words)
    if count == 0:
        print("[.] There are no suitable words in the book")
        return
    else:
        print("[.] Count of words: {}".format(count))

    lookup = {}
    senses_path = get_resource_path("lemmas.csv")
    with open(senses_path, 'rb') as f:
        f = f.read().decode('utf-8')
        for line in f.splitlines():
            l = line.strip()
            if l[0] == '"':
                continue
            word, sense_id = l.split(',')
            lookup[word] = sense_id

    lemmatizer = nltk.WordNetLemmatizer()
    prfx = "[.] Processing words: "
    print_progress(0, count, prefix=prfx, suffix='')
    LangLayerDb.start_transaction()
    if DEBUG == True:
        f = open('log.txt', 'a')
    for i, gloss in enumerate(words):
        word_offset = gloss[0]
        word = gloss[1]
        word = word.lower()
        pos_tag = nltk.pos_tag([word])[0][1]
        pos_tag_wordnet = get_wordnet_pos(pos_tag)
        word = lemmatizer.lemmatize(word, pos=pos_tag_wordnet)
        if word in lookup:
            sense_id = lookup[word]
            if DEBUG == True:
                f.write("{} - {} - {}\n".format(word_offset, word, sense_id))
            LangLayerDb.add_gloss(word_offset, sense_id)
        print_progress(i + 1, count, prefix=prfx, suffix='')

    if DEBUG == True:
        f.close()
    LangLayerDb.end_transaction()
    LangLayerDb.close_db()

    print("[.] Success!")
    print(
        "Now copy this folder: \"{}\" to your Kindle".format(result_dir_path))
Example #15
def non_punkt_normalize(txt):
    return nltk.WordNetLemmatizer().lemmatize(txt)

Example #16
def tokenize_corpus(path, train=True):

  # used to stem words later on/get roots of words
  porter = nltk.PorterStemmer() # also lancaster stemmer
  # used to get lemmas (complete words themselves) of words
  wnl = nltk.WordNetLemmatizer()
  # list of stopwords, can print to view
  stopWords = stopwords.words("english")
  classes = []
  samples = []
  docs = []
  if train == True:
    words = {}
  f = open(path, 'r', encoding='latin1')
  lines = f.readlines()

  for line in lines:
    # separating serial, review and label
    classes.append(line.rsplit()[-1])
    samples.append(line.rsplit()[0])
    raw = ' '.join(line.rsplit()[1:-1])
    # remove noisy characters; tokenize - specified at the start
    raw = re.sub('[%s]' % ''.join(chars), ' ', raw)
    tokens = word_tokenize(raw)
    # make lower case - !! consider all capitals as more positive or negative?
    # !! what if we didn't do this?
    tokens = [w.lower() for w in tokens]
    # removing stopwords
    # using python stopwords scripts - !! add manually to stopwords?
    # !! what if we didn't do this?
    tokens = [w for w in tokens if w not in stopWords]
    # first lemmatize then stem, significance?
    # !! what if we didn't do this? - lemmatize
    tokens = [wnl.lemmatize(t) for t in tokens]
    # !! what if we didn't do this? - stem
    tokens = [porter.stem(t) for t in tokens] 

    # ---------------------------------------------------------
    # !! add bigram collocations here
    # tokens = bigram_tokenize_corpus(tokens, BigramAssocMeasures.chi_sq, 200)
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    # !! experiment with window size for accuracy
    finder = BigramCollocationFinder.from_words(tokens, window_size = 3)
    # !! experiment with frequency count for accuracy 
    finder.apply_freq_filter(2)
    # !! understand pmi filter 
    # bigrams = finder.nbest(bigram_measures.pmi, 10)
    
    #for k,v in finder.ngram_fd.items():
        #print(k,v)

    for k in finder.ngram_fd.items():
      print(k)

    # ---------------------------------------------------------

    if train == True:
        # add to word count frequency
      for t in tokens: 
        try:
            words[t] = words[t]+1
        except:
            words[t] = 1
    docs.append(tokens)

  # docs: consist of a long list of tokens as they
  # appear in the text ie may be repeated
  # words: dictionary - has the frequency of the vocabulary

  if train == True:
     return(docs, classes, samples, words)
  else:
     return(docs, classes, samples)
Example #17
def lemmatization():
    wnl = nltk.WordNetLemmatizer()
    return [wnl.lemmatize(t) for t in tokens]
Example #18
    def fullTokenize(self, text, removalPattern=False, gensim=False):
        '''
        A method to process texts by tokenising, lemmatising and retrieving noun phrases.
        The method
        a. removes useless stuff (stopwords and the like)
        b. lemmatises words (removing plurals and verbal tenses)
        c. detects noun-phrases and joins them with hyphens
        ---------------------------
        Mandatory Arguments:
        i. text : a string to be processed
        
        KeyWord Arguments:
        i. removalPattern : a regex pattern for sentences to be discarded
        ---------------------------
        Returns:
        finalText : a tokenised and lemmatised list of the input text
        '''
        ## split the text into sentences
        if removalPattern:
            sentences = [ p for p in nltk.sent_tokenize(text) if ( not\
                ('ociety' in str(p) and ('©' in str(p) or 'ublished' in p) and len(p.split()) < 10) and \
                removalPattern.search( p + ' ') == None) ] # NLTK default sentence segmenter
        else:
            sentences = [ p for p in nltk.sent_tokenize(text) if ( not\
                                                                  ('ociety' in str(p) and ('©' in str(p) or 'ublished' in p) and len(p.split()) < 10) ) ] # NLTK default sentence segmenter
        sentences = [nltk.word_tokenize(sent)
                     for sent in sentences]  # NLTK word tokenizer
        sentences = [nltk.pos_tag(sent)
                     for sent in sentences]  # NLTK POS tagger

        # lemmatizing stuff
        sentences = [[(nltk.WordNetLemmatizer().lemmatize(
            w, self.Lemmatizer[t[0].lower()]), t) for w, t in sent]
                     for sent in sentences]

        ## retain the nounphrases as a single-hypened word
        finalText = []
        for sentence in sentences:
            ## tag the sentence with the chunker
            result = self.chunker.tagger.tag(sentence)
            result = filter(lambda _: _[0][1] != 'DT', result)

            ## merge the nounphrases
            phrase = []
            previous = 'O'
            for w0, t in result:
                if t == 'I-NP' and previous != 'O':

                    phrase[-1] += '-%s' % w0[0]

                else:
                    phrase.append(w0[0])

                previous = t
            ## remove stopstuff
            filterStop = filter(
                lambda x: '-' in x and
                (len(set(x.split('-')) & set(stopwords.words('english'))) <
                 len(x.split('-')) / 2) or not ('-' in x), phrase)
            phrase = list(filterStop)
            finalText.extend(phrase)

        return finalText
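The method above assumes a self.Lemmatizer lookup (and a self.chunker) defined elsewhere in the class. A minimal sketch of that lookup, assuming it maps the first letter of a lowercased POS tag to a WordNet POS constant and falls back to noun:

from collections import defaultdict
from nltk.corpus import wordnet

# hypothetical stand-in for self.Lemmatizer: keyed by the first letter of the
# lowercased Penn Treebank tag; unknown tags default to noun so that
# WordNetLemmatizer.lemmatize always receives a valid pos argument
Lemmatizer = defaultdict(lambda: wordnet.NOUN,
                         {'j': wordnet.ADJ, 'n': wordnet.NOUN,
                          'v': wordnet.VERB, 'r': wordnet.ADV})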
Example #19
def extract_filled_templates2(sentence, stem):
    impact_word = ""
    cause_subject = ""
    cause_object = ""
    Predictor = ""
    imp_sub_sentence = ""
    impact_subjects = []
    cause_object_word = ""

    lmtzr = nltk.WordNetLemmatizer()
    word_tokens = word_tokenize(sentence)
    # lemmas = [lmtzr.lemmatize(token, 'v') for token in word_tokens]
    # lemma_set = set(lemmas)

    # for item in lemma_set:
    #     print("stem is ",stem,"item is ",item)
    #     if stem in item:
    #         matching_template = item

    matching_template = stem
    word_index = word_tokens.index(matching_template)
    # split the sentence into the parts before and after the template verb, treating 'by' as the active/passive voice marker

    sub_sentence, obj_sentence = get_subject_object(word_index, word_tokens)
    # print(sub_sentence)
    # print(obj_sentence)

    # include grammar for extracting head level noun phrases in the template
    grammar = r"""
         NP: {<DT|PP\$>?<JJ>*<NN>}   # chunk determiner/possessive, adjectives and noun
         {<NNP>+}                # chunk sequences of proper nouns
         {<NN>+}                 # chunk consecutive nouns
         {<NNS>+}
         {<NNPS>+}
         """
    cp = nltk.RegexpParser(
        grammar)  # Define Parser for extracting noun phrases

    sub_tagged_sent = nltk.pos_tag(sub_sentence.split())
    sub_tokens = word_tokenize(sub_sentence)
    sub_parsed_sent = cp.parse(sub_tagged_sent)
    predictors = set([
        get_noun_phrases(npstr, sentence)
        for npstr in extract_np(sub_parsed_sent)
    ])
    # predictors = [get_noun_phrases(entry,sub_sentence) for entry in sub_tagged_sent]

    parsetree = get_parse_tree(sentence)
    for triple in parsetree:
        # if 'VB' in triple[0][1] and triple[1] in ['nsubj']:
        if stem in triple[0][0]:
            predictor_word = triple[2][0]
            # print("Predictor word is ",predictor_word)
            break
    # for entry in predictors:
    #     if predictor_word in entry:
    #         Predictor = entry
    Predictor = get_noun_phrases(predictor_word, sub_sentence)

    #

    parsetree_sub = get_parse_tree(obj_sentence)
    impact_word = ""
    for triple in parsetree_sub:
        # print("triple in sub sentence is ",triple)
        if 'VB' in triple[0][1] and triple[1] == 'nsubj':
            if (triple[0][0] in ['was', 'is']):
                continue
            impact_word = triple[0][0]
            # print("impact_word found is ************",impact_word)
            word_tokens = word_tokenize(obj_sentence)
            # lemmas = [lmtzr.lemmatize(token, 'v') for token in word_tokens]
            word_index_imp = word_tokens.index(impact_word)
            imp_sub_sentence, imp_obj_sentence = get_subject_object(
                word_index_imp, word_tokens)

        if triple[0][0] == impact_word and triple[1] == 'nsubj':
            cause_subject = get_noun_phrases(triple[2][0], imp_sub_sentence)

        elif triple[0][0] == impact_word and triple[1] == 'dobj':
            cause_object = get_full_word(triple[2][0], parsetree_sub)
            break

        # elif impact_word != "" and triple[2][1] == 'NN':
        #     cause_object = get_full_word(triple[2][0],parsetree_sub)
        #     break

    # if cause_subject == "":
    #     obj_tagged_sent = nltk.pos_tag(imp_sub_sentence.split())
    #     obj_parsed_sent = cp.parse(obj_tagged_sent)
    #     impact_subjects = set([get_noun_phrases(npstr,imp_sub_sentence) for npstr in extract_np(obj_parsed_sent)])

    if impact_word == "" or cause_object == "":
        doc_sub = nlp(obj_sentence)
        for triple in parsetree_sub:
            if triple[1] in ['pobj', 'dobj']:
                cause_object_word = triple[2][0]
                # print("cause_object word is **",cause_object_word)
                break

        for entry in doc_sub.noun_chunks:
            if cause_object_word in entry.text:
                cause_object = get_full_word(entry.text, parsetree_sub)
            else:
                impact_subjects.append(entry.text)

    print("Predictor:", predictors)
    # print("Predictor:", Predictor)
    # print("impact_subjects",impact_subjects)
    print("cause_subject:", cause_subject)
    print("Impact:", impact_word)
    print("cause_object:", cause_object)
Example #20
data = fp.read()

#to tokenize input text into sentences

#print('\n-----\n'.join(tokenizer.tokenize(data)))# splits text into sentences

#to tokenize the tokenized sentences into words

tokens = nltk.wordpunct_tokenize(data)
text = nltk.Text(tokens)
words = [w.lower() for w in text]
print(data)  #print the raw input text

#for a in data:
#print (a)
for synset in wn.synsets(tokens[30]):
    for lemma in synset.lemmas():
        print(lemma.name())

stemmer = PorterStemmer()
print(stemmer.stem(tokens[38]))

lemmetizer = nltk.WordNetLemmatizer()
print(lemmetizer.lemmatize('Breathing', pos='v'))

print(pos_tag(words))

print(list(nltk.ngrams('hey all li lee'.split(), 3)))  #word trigrams

# nltk.download('all')
print(nltk.ne_chunk(pos_tag(nltk.wordpunct_tokenize("heya ola hows it"))))
Example #21
def do_lemmitizing(filtered_sentence):
    lemma = nltk.WordNetLemmatizer()
    lemmas = []
    for i in filtered_sentence:
        lemmas.append(lemma.lemmatize(i))
    return lemmas
Example #22
# we could use a tokenizer instead of split
description = nltk.word_tokenize(description)

# if we use split, words like "shouldn't" are not separated into "should" and "not", but word_tokenize() does separate them
# %%
# remove unnecessary words (stopwords)
description = [
    word for word in description if not word in set(stopwords.words("english"))
]

# %%
# lemmatization: loved => love,   gitmeyecegim ("I will not go") => git ("go")

import nltk as nlp

lemma = nlp.WordNetLemmatizer()
description = [lemma.lemmatize(word) for word in description]

description = " ".join(description)

#%%
description_list = []
for description in data.description:
    description = re.sub("[^a-zA-Z]", " ", description)
    description = description.lower()  # convert uppercase to lowercase
    description = nltk.word_tokenize(description)
    #description = [ word for word in description if not word in set(stopwords.words("english"))]
    lemma = nlp.WordNetLemmatizer()
    description = [lemma.lemmatize(word) for word in description]
    description = " ".join(description)
    description_list.append(description)
Example #23
import nltk
from nltk.corpus import stopwords

wordnet = nltk.WordNetLemmatizer()
stoplist = stopwords.words('english')


def normalize_token(token):
    """
    Convert token to lowercase and lemmatize it using WordNet.

    Parameters
    ----------
    token : str

    Returns
    -------
    token : str
    """
    return wordnet.lemmatize(token.lower())


def filter_token(token):
    """
    Evaluate whether or not to retain ``token``.

    Parameters
    ----------
    token : str

    Returns
Example #24
    def __init__(self):
        NltkNormalizer.install_nltk_corpora('averaged_perceptron_tagger')
        self.normalizer = NltkNormalizer()
        self.lem = nltk.WordNetLemmatizer()
        self.tagger = nltk.PerceptronTagger()
        self.translation_dict = {'J': wn.ADJ, 'N': wn.NOUN, 'R': wn.ADV, 'V': wn.VERB}
Example #25
from __future__ import division
from numpy.ma import sort
import re
import numpy
from scipy import spatial
from numpy import savetxt, loadtxt
import nltk
from opinion_mining.AMC_preprocess import domain_preprocess

f = open(
    r'E:\python_workplace\Opinion Mining (LML)\Data\English_stopwords.txt',
    encoding='utf-8')
stopwords = set(line.strip() for line in f.readlines())  # read in the stopwords
lemitaion = nltk.WordNetLemmatizer()
f.close()
ignorechars = ''',:'.;!()#'''


def pre_proc(C):
    C = [w.translate(str.maketrans('', '', ignorechars)) for w in C]  # strip ignored characters from each word
    C = [
        lemitaion.lemmatize(w) for w in C if w not in stopwords and len(w) >= 3
    ]
    C = [
        lemitaion.lemmatize(w, pos='v') for w in C
        if w not in stopwords and len(w) >= 3
    ]
    C = [w for w in C if w not in stopwords and len(w) >= 3]
    return C
Example #26
import nltk

stopwords = nltk.corpus.stopwords.words('english')


def stopwords_remove(text):
    no_stopwords = [word for word in text if word not in stopwords]
    return no_stopwords


# ### Sub 4: Lemmatizing Words

# In[5]:

lemwords = nltk.WordNetLemmatizer()


def lemmatizer(text):
    lemmatized = [lemwords.lemmatize(word) for word in text]
    return lemmatized


# ### Cleaning Data (with previous four functions)

# In[6]:


def clean_data(text):
    return lemmatizer(stopwords_remove(tokenizer(punctuation_remove(text))))
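clean_data also calls punctuation_remove and tokenizer, which were defined in earlier notebook cells not shown here. A minimal sketch, assuming they follow the same pattern as the two helpers above:

import re
import string

def punctuation_remove(text):
    # drop every character listed in string.punctuation
    return "".join(char for char in text if char not in string.punctuation)

def tokenizer(text):
    # lowercase and split on runs of non-word characters
    return re.split(r'\W+', text.lower())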
Example #27
# Data Preparation and Preprocessing

print("Loading Data Sources")
#Load Data Source
data_path = os.getcwd() + '\\lsa\\links.json'
print("Data source : " + data_path)
data = pd.read_json(data_path)
data.head()

temp_data = pd.read_json(data_path)
question_data = temp_data['MESSAGE']
#Create Stop Word
newstopwords = set(stopwords.words('english'))
#define Wordnet Lemmatizer
WNlemma = nltk.WordNetLemmatizer()


#Create Preprocessing Function
def pre_process(text):
    tokens = nltk.word_tokenize(text)
    tokens = [WNlemma.lemmatize(t) for t in tokens]
    tokens = [t for t in tokens if t not in string.punctuation]
    tokens = [word for word in tokens if word.lower() not in newstopwords]
    # bigr = nltk.bigrams(tokens[:10])
    # trigr = nltk.trigrams(tokens[:10])
    return (tokens)


#greeting function
# GREETING_INPUTS = ("hello", "hi", "greetings", "hello i need help", "good day","hey","i need help", "greetings")
Example #28
def get_keywords_phrases(text):
    try:
        text = encode_ignore(text)

        lemmatizer = nltk.WordNetLemmatizer()
        # stemmer = nltk.stem.porter.PorterStemmer()

        # Based on... Extract key phrases with NLTK ... https://gist.github.com/alexbowe/879414
        # This gist is part of a blog post (http://alexbowe.com/au-naturale/)
        # in which the paper is cited:
        # S. N. Kim, T. Baldwin, and M.-Y. Kan. Evaluating n-gram based evaluation metrics for automatic
        # keyphrase extraction. Technical report, University of Melbourne, Melbourne 2010.
        grammar = r"""
            NBAR:
                {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns

            NP:
                {<NBAR>}
                {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
        """
        chunker = nltk.RegexpParser(grammar)

        # POS tagging
        postoks = get_pos_tags(text)

        this_tree = chunker.parse(postoks)

        from nltk.corpus import stopwords
        stopwords = stopwords.words('english')

        def leaves(tree):
            """Finds NP (nounphrase) leaf nodes of a chunk tree."""
            # for subtree in tree.subtrees(filter = lambda t: t.node=='NP'):
            for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
                yield subtree.leaves()

        def normalise(word):
            """Normalises words to lowercase and stems and lemmatizes it."""
            word = word.lower()
            # word = stemmer.stem_word(word)
            word = lemmatizer.lemmatize(word)
            return word

        def acceptable_word(word):
            """Checks conditions for acceptable word: length, stopword."""
            accepted = bool(2 <= len(word) <= 40
                            and word.lower() not in stopwords)
            return accepted

        def get_terms(tree):
            """a generator for the normalized, acceptable, leaf terms"""
            for leaf in leaves(tree):
                this_term = [
                    normalise(w) for w, t in leaf if acceptable_word(w)
                ]
                yield this_term

        terms = get_terms(this_tree)
        phrases = []
        terms_freq_dict = {}
        for termList in terms:
            phrase = " ".join([str(term) for term in termList])
            phrases.append(phrase)
            if phrase not in terms_freq_dict:
                terms_freq_dict[phrase] = 1
            else:
                terms_freq_dict[phrase] += 1
        sorted_tfd = sorted(list(terms_freq_dict.items()),
                            key=operator.itemgetter(1),
                            reverse=True)
        return sorted_tfd
    except Exception as e:
        error_msg = "processing error: " + str(e)
        return {'keyWordsPhrases': error_msg}
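encode_ignore and get_pos_tags are not part of this snippet. A minimal sketch, assuming the former strips characters that cannot be encoded as ASCII and the latter simply tokenises and POS-tags the text with NLTK:

import nltk

def encode_ignore(text):
    # drop characters that cannot be represented in ASCII
    return text.encode('ascii', errors='ignore').decode('ascii')

def get_pos_tags(text):
    # tokenise and return NLTK's (word, Penn Treebank tag) pairs
    return nltk.pos_tag(nltk.word_tokenize(text))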
Example #29
import os
from functools import lru_cache

import pickle

import nltk
from nltk.stem.porter import PorterStemmer

_max_ppdb_score = 10.0
_min_ppdb_score = -_max_ppdb_score


_wnl = nltk.WordNetLemmatizer()

def normalize_word(w):
    return _wnl.lemmatize(w).lower()


def get_tokenized_lemmas(s):
    return [normalize_word(t) for t in nltk.word_tokenize(s)]


@lru_cache(maxsize=1)
def get_ppdb_data():
    with open('ppdb.pickle', 'rb') as f:
        return pickle.load(f)


_stemmer = PorterStemmer()

Example #30
# don't store credentials within the repository
if os.path.exists('credentials.py'):
    import credentials
    alpow.sftp = credentials.sftp
    # p(alpow.sftp);
    p('sftp online:', ftpls()[0], '\n\n\n')
else:
    alpow.useFTP = False
    sendimages2ftp = 0
# }{
np.random.seed(1983)

nltk.download('stopwords')
nltk.download('wordnet')
alpow.stop_words = nltk.corpus.stopwords.words('english')
alpow.lemma = nltk.WordNetLemmatizer()
alpow.token = ToktokTokenizer()


def load(fn='allVars', onlyIfNotSet=1):
    fns = fn.split(',')
    for fn in fns:
        fn = fn.strip(', \n')
        ok = 1
        if (len(fn) == 0):
            continue
        if (onlyIfNotSet):
            if fn in globals().keys():
                # override empty lists, dict, dataframe and items
                if type(globals()[fn]) == type:
                    continue