Example no. 1
#preprocessing of the main text of the ad

#convert the main text to lower case
df['description'] = df['description'].apply(
    lambda x: " ".join(word.lower() for word in x.split()))  #" " between words
#removing punctuation in the main text
df['description'] = df['description'].str.replace(r'[^\w\s]', '', regex=True)
#remove stop words in the main text (assumes NLTK stop words; Russian here, to match the stemmer below)
from nltk.corpus import stopwords
stop = set(stopwords.words('russian'))
df['description'] = df['description'].apply(
    lambda x: " ".join(x for x in x.split() if x not in stop))

#stemming
##from nltk.stem import PorterStemmer
##st = PorterStemmer()
from nltk.stem.snowball import SnowballStemmer
st = SnowballStemmer("russian")
df['description'] = df['description'].apply(
    lambda x: " ".join([st.stem(word) for word in x.split()]))
#print(st.stem("перепрыгивающий"))
#results in перепрыгива
##############################################################################
dataT = df['description']
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

#count_vect = CountVectorizer(max_features=10)
#dataT_counts = count_vect.fit_transform(dataT)
#print("countvect",dataT_counts.toarray().sum(axis=1))
#tfidf_transformer = TfidfTransformer(use_idf=True)
Example no. 2
import re
from nltk.stem.snowball import SnowballStemmer

def stemTokenize(doc):
    stemmer = SnowballStemmer('english')
    return [stemmer.stem(word) for word in re.findall(r'\b\w+\b', doc)]
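# e.g. stemTokenize("The runners were running") -> ['the', 'runner', 'were', 'run']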
Example no. 3
import sklearn.svm as sksvm
import sklearn.linear_model as sklin
import inspect
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
import matplotlib.pyplot as plt
import pandas as pd
import os
from itertools import compress
import logging
import gensim
from gensim.models.keyedvectors import KeyedVectors
from nltk.stem.snowball import SnowballStemmer

setwd = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))+'/'
prewd = os.path.abspath(os.path.join(setwd, os.pardir))

stemmer = SnowballStemmer("french")
models = gensim.models.Word2Vec.load(prewd+'/data/stemmed_frwiki.bin')


# In[4]:


class suffix:
    msuffix = '-an,-and, -ant, -ent, -in, -int, -om, -ond, -ont,-eau, -au, -aud, -aut, -o, -os, -ot,-ai, -ais, -ait, -es, -et,-ou, -out, -out, -oux,-i, -il, -it, -is,-y,-at, -as, -ois,-oit,-u,-us,-ut,-eu,-er,-age, -ege, -ème, -ome,-òme, -aume, -isme,-as, -is, -os, -us, -ex,-it, -est,-al, -el, -il, -ol, -eul, -all,-if, -ef,-ac, -ic, -oc, -uc,-am, -um, -en,-air, -er, -erf, -ert, -ar, -arc, -ars, -art, -our, -ours, -or, -ord, -ors, -ort, -ir, -oir,-eur,-ail, -eil, -euil, -ueil,-ing'
    msuffix = msuffix.split(',')

    fsuffix = 'aie, -oue, -eue, -ion, -te, -ée, -ie, -ue, -asse, -ace, -esse, -ece, -aisse, -isse,-ice, -ousse, -ance, -anse, -ence, -once,-enne, -onne, -une, -ine, -aine, -eine, -erne,-ande, -ende, -onde, -ade, -ude, -arde, -orde,-euse, -ouse, -ase, -aise, -ese, -oise, -ise, -yse, -ose, -use,-ache, -iche, -eche, -oche, -uche, -ouche, -anche,-ave, -eve, -ive,-iere, -ure, -eure,-ette, -ete, -ête, -atte, -otte, -oute, -orte, -ante, -ente, -inte, -onte,-alle, -elle, -ille, -olle,-aille, -eille, -ouille,-appe, -ampe, -ombe,-igue'
    fsuffix = fsuffix.split(',')

    ms = []
    for i in range(0,len(msuffix)):
Example no. 4
#!/usr/bin/env python
"""coOccuranceMapper.py"""

import sys
import re
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")

stopwords = [
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your",
    "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she",
    "her", "hers", "herself", "it", "its", "itself", "they", "them", "their",
    "theirs", "themselves", "what", "which", "who", "whom", "this", "that",
    "these", "those", "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an",
    "the", "and", "but", "if", "or", "because", "as", "until", "while", "of",
    "at", "by", "for", "with", "about", "against", "between", "into",
    "through", "during", "before", "after", "above", "below", "to", "from",
    "up", "down", "in", "out", "on", "off", "over", "under", "again",
    "further", "then", "once", "here", "there", "when", "where", "why", "how",
    "all", "any", "both", "each", "few", "more", "most", "other", "some",
    "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too",
    "very", "s", "t", "can", "will", "just", "don", "should", "now"
]

# MUST MANUALLY CHANGE FOR EACH SUB-TOPIC
topten = [
    'train', 'get', 'like', 'im', 'go', 'station', 'time', 'one', 'dont',
    'peopl'
]
Example no. 5
s = s.value_counts()
s[:5]


# Keywords occur in frequencies ranging from 1 to 610. We do not have any use for keywords that occur only once. Therefore, these can be safely removed. Finally, we will convert every word to its stem so that words such as *Dogs* and *Dog* are considered the same.
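# For instance (a quick check, assuming NLTK's SnowballStemmer as used below), both
# inflections collapse to the same stem:
#
#     SnowballStemmer('english').stem('Dogs')  # 'dog'
#     SnowballStemmer('english').stem('Dog')   # 'dog'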

# In[ ]:


s = s[s > 1]


# In[ ]:


stemmer = SnowballStemmer('english')
stemmer.stem('dogs')


# In[ ]:


def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words


# In[ ]:
Example no. 6
"""
This is main script to extract all features from books 
(all books/pages for training and prediction must go through this)
"""

#import necessary libraries
from __future__ import print_function  #need this to print to file
import xml.etree.ElementTree as ET  #need for parsing XML file of book
import string  #need for testing if punctuation in word
from nltk.stem.snowball import SnowballStemmer  #need to stem words for cookWords and measureWords features
stemmer = SnowballStemmer("english")  #set up stemmer
import os
from os import listdir  #need for reading all files from folder
import csv
import itertools  #need for condensing list


#function to parse a book and print its features to a file
#parameters: takes an XML book file, e.g. 'foodNewsletter.xml',
#and an output file name; if that file does not exist, it will be created
#Makes a new output file for each XML book (if a new output file name is given for each book)
def parsePrint(xmlBk, f, cookWords, measures, foods):
    with open(xmlBk, 'rb') as xml_bk:
        tree = ET.parse(xml_bk)  #parse xml
        pages = tree.findall(
            ".//OBJECT")  #store all the bk pages in list called 'pages'
        count = 0  #count number of iterations of for loop
        avgP = avgPunc(pages)
        avgW = avgWord(pages)
        for p in pages:
            count += 1
Example no. 7
print(stem_out)

#Process stem analysis
eng_tokens_stem = []
print("TOKEN\t\tSTEM")
for eng_token in eng_tokens:
    eng_token_stem_radix = eng_porter_stemmer.stem(eng_token)
    eng_tokens_stem.append(eng_token_stem_radix)
    print("%s\t\t%s" % (eng_token, eng_token_stem_radix))

print("*** ENG - Snowball Stemmer algorithm ***")
from nltk.stem.snowball import SnowballStemmer
print("Original phrase")
print(eng_textToAnalize_03)
eng_tokens = word_tokenize(eng_textToAnalize_03)
eng_snowball_stemmer = SnowballStemmer("english")
#Process Snowball stem analysis
eng_tokens_snowball_stem = []
print("TOKEN\t\tSTEM")
for eng_token in eng_tokens:
    eng_token_stem_radix = eng_snowball_stemmer.stem(eng_token)
    eng_tokens_snowball_stem.append(eng_token_stem_radix)
    print("%s\t\t%s" % (eng_token, eng_token_stem_radix))

print("*** ENG - Lancaster Stemmer algorithm ***")
'''

'''
from nltk.stem import LancasterStemmer
print("Original phrase")
print(eng_textToAnalize_03)
Example no. 8
    def extract_features(self, Req_list, score_target, export=True, corpal=True):

        nlp = spacy.load('de')
        stemmer = SnowballStemmer("german")
        stop = stopwords.words('german')
        features = pd.DataFrame()
        # create first column of dataframe by allocating requirement list to it; one requirement per line
        features['req'] = Req_list
        # get text, tag_ and pos_ attributes for each word
        features['req_nlp'] = features['req'].apply(lambda x: nlp(x))
        features['tags'] = features['req_nlp'].apply(lambda x: [(w.text, w.tag_, w.pos_) for w in x])

        # Analysis using NLTK
        # Split sentences then count number in each requirement
        features['sentences_by_nltk'] = features['req'].apply(lambda x: nltk.sent_tokenize(x, 'german'))
        features['sentence_nb_by_nltk'] = features['req'].apply(lambda x: len(nltk.sent_tokenize(x, 'german')))
        # analysis with spacy
        features['sentences_by_nlp'] = features['req_nlp'].apply(lambda x: [sent.string.strip() for sent in x.sents])
        features['sentence_nb_by_nlp'] = features['req_nlp'].apply(
            lambda x: len([sent.string.strip() for sent in x.sents]))

        # number of sentences per requirement
        features['sentences'] = features.apply(lambda x: self.select_sentences(x), axis=1)
        features['sentences_nb'] = features.apply(lambda x: self.select_sentences(x, "y"), axis=1)
        features['sentences_tagged'] = features['sentences'].apply(lambda x: [self.tag_sentence(nlp, w) for w in x])

        # Calculating Readability-Index
        # words in requirement
        features['words_nb'] = features['req'].apply(lambda x: len(x.split()))
        # words per sentence
        features['WPS'] = features['words_nb'] / features['sentences_nb']
        # syllables per word
        features['SPW'] = features['req'].apply(lambda x: self.compute_SPW(x))
        # flesch index
        features['Flesch_Index'] = features.apply(lambda x: round((180 - x['WPS'] - (58.5 * x['SPW']))), axis=1)
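        # Worked example (illustrative): one sentence of 10 words (WPS = 10) with
        # 1.5 syllables per word (SPW = 1.5) gives round(180 - 10 - 58.5 * 1.5) = 82.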
        # Analyzing punctuation
        features['internal_punctuation'] = features['tags'].apply(lambda x: self.count_punctuation(x))
        features['comma'] = features['tags'].apply(lambda x: self.count_comma(x))
        features['weird_words'] = features['tags'].apply(lambda x: self.count_weird_words(x))

        # Analyzing and counting specific words and list containing words
        features['beispiel'] = features['tags'].apply(lambda x: self.search_specific_words(x, 'beispiel'))
        features['circa'] = features['tags'].apply(lambda x: self.search_specific_words(x, 'circa'))
        features['wenn'] = features['tags'].apply(lambda x: self.search_specific_words(x, 'wenn'))
        features['aber'] = features['tags'].apply(lambda x: self.search_specific_words(x, 'aber'))
        features['max_min_presence'] = features['req'].apply(lambda x: self.check_max_min_presence(x))
        features['Nb_of_Umsetzbarkeit_conj'] = features['tags'].apply(lambda x: self.time_logical_conj(x))
        features['measurement_values'] = features['tags'].apply(lambda x: self.search_measurements_indicators(x))
        features['numerical_values'] = features['tags'].apply(lambda x: self.search_numerical_value(x))
        features['polarity'] = features['req'].map(lambda text: TextBlobDE(text).sentiment.polarity)

        # Analyzing passive and active and auxiliary attributes at the beginning of a requirement
        features['passive_global'] = features['tags'].apply(lambda x: self.passive_detection(x))
        features['passive_per_sentence'] = features['sentences_tagged'].apply(
            lambda x: [self.passive_detection(s) for s in x])
        features['passive_percent'] = features['passive_per_sentence'].apply(
            lambda x: (sum([y == "yes" for y in x]) / len(x)))
        features['Aux_Start'] = features['tags'].apply(lambda x: self.aux_1st(x))
        features['Aux_Start_per_sentence'] = features['sentences_tagged'].apply(lambda x: [self.aux_1st(s) for s in x])

        # Analyzing conjunctions, verbs and auxiliaries
        features['Sub_Conj'] = features['tags'].apply(lambda x: self.count_subordinate_conjunction(x))
        features['Comp_conj'] = features['tags'].apply(lambda x: self.count_comp_coor_conjunction(x))
        features['Nb_of_verbs'] = features['tags'].apply(lambda x: self.count_verb(x))
        features['Nb_of_auxiliary'] = features['tags'].apply(lambda x: self.count_aux(x))
        features['werden'] = features['req'].apply(lambda x: self.count_werden(x))

        # same functions as previous block but analysis made for each sentence on one requirement
        features['Sub_Conj_pro_sentece'] = features['sentences_tagged'].apply(
            lambda x: [self.count_subordinate_conjunction(s) for s in x])
        features['Comp_conj_pro_sentence'] = features['sentences_tagged'].apply(
            lambda x: [self.count_comp_coor_conjunction(s) for s in x])
        features['Nb_of_verbs_pro_sentence'] = features['sentences_tagged'].apply(
            lambda x: [self.count_verb(s) for s in x])
        features['Nb_of_auxiliary_pro_sentence'] = features['sentences_tagged'].apply(
            lambda x: [self.count_aux(s) for s in x])
        features['werden_pro_sentence'] = features['sentences'].apply(lambda x: [self.count_werden(s) for s in x])

        features['formal_global'] = features['req'].apply(lambda x: self.contain_Muss_Darf_nicht(stemmer, x))
        features['formal_per_sentence'] = features['sentences'].apply(
            lambda x: [self.contain_Muss_Darf_nicht(stemmer, s) for s in x])
        features['formal_percent'] = features['formal_per_sentence'].apply(
            lambda x: (sum([y == "yes" for y in x]) / len(x)))
        features['entities'] = features['req_nlp'].apply(lambda x: self.entities_label(x))

        # Graphical representation of the vocabulary of requirements corpus
        if corpal:
            self.Corpus_Analysis(Req_list, stop)

        if export:
            my_path = Path(u"/Users/selina/Code/Python/Thesis/src/Features/" + 'export_features')
            # my_path = Path(u"/Users/selina/Documents/UNI/Thesis/Code/Features/" + 'export_features')
            g_Dirpath = os.path.abspath(my_path)
            dataFile = os.path.join(g_Dirpath, 'Features_Export.xlsx')
            print("Create Excel export file: %s" % (dataFile))
            features[0:5000].to_excel(dataFile, index=False)
            print("\nFeatures_Export XLS-file created and data copied.")

        return features, features.sentences_tagged
Example no. 9
    def __init__(self):

        self.df = pd.DataFrame()
        self.stemmer = SnowballStemmer("german")
Example no. 10
def load_references(input_file,
                    sep_doc_id=':',
                    sep_ref_keyphrases=',',
                    normalize_reference=False,
                    language="en",
                    encoding='utf-8'):
    """Load a reference file. Reference file can be either in json format or in
    the SemEval-2010 official format.

    Args:
        input_file (str): path to the reference file.
        sep_doc_id (str): the separator used for doc_id in reference file,
            defaults to ':'.
        sep_ref_keyphrases (str): the separator used for keyphrases in
            reference file, defaults to ','.
        normalize_reference (bool): whether to normalize the reference
            keyphrases using stemming, default to False.
        language (str): language of the input documents (used for computing the
            stems), defaults to 'en' (english).
        encoding (str): file encoding, default to utf-8.
    """

    logging.info('loading reference keyphrases from {}'.format(input_file))

    references = defaultdict(list)

    # open input file
    with codecs.open(input_file, 'r', encoding) as f:

        # load json data
        if input_file.endswith('.json'):
            references = json.load(f)
            for doc_id in references:
                references[doc_id] = [
                    keyphrase for variants in references[doc_id]
                    for keyphrase in variants
                ]
        # or load SemEval-2010 file
        else:
            for line in f:
                cols = line.strip().split(sep_doc_id)
                doc_id = cols[0].strip()
                keyphrases = cols[1].strip().split(sep_ref_keyphrases)
                for v in keyphrases:
                    if '+' in v:
                        for s in v.split('+'):
                            references[doc_id].append(s)
                    else:
                        references[doc_id].append(v)

        # normalize reference if needed
        if normalize_reference:

            # initialize stemmer
            stemmer = SnowballStemmer("porter")
            if language != 'en':
                stemmer = SnowballStemmer(ISO_to_language[language],
                                          ignore_stopwords=True)

            for doc_id in references:
                for i, keyphrase in enumerate(references[doc_id]):
                    stems = [stemmer.stem(w) for w in keyphrase.split()]
                    references[doc_id][i] = ' '.join(stems)

    return references
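
# A usage sketch (hypothetical path; assumes the SemEval-2010 reference format
# described in the docstring above):
#
#     refs = load_references('data/test.reference.final',
#                            normalize_reference=True,
#                            language='en')
#     # refs maps each doc_id to its list of (stemmed) reference keyphrases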
Example no. 11
def main(data_file, seed):

    # set seed
    np.random.seed(seed)

    # load in a pd.df
    data = [json.loads(line) for line in data_file]
    df = pd.DataFrame.from_dict(data)

    # make directory for images
    if not os.path.exists(IMAGES_DIRECTORY):
        os.mkdir(IMAGES_DIRECTORY)
    # make directory for representative words
    if not os.path.exists(REP_DIRECTORY):
        os.mkdir(REP_DIRECTORY)

    print_header('3.2.1 Popular Products and Frequent Reviewers', 50)

    ## 3.2.1 get top 10 products
    top_10_products = df['asin'].value_counts().head(10).reset_index().rename(
        columns={
            'index': 'productID',
            'asin': 'reviewCount'
        })
    print_header('Top 10 products', char='-')
    print(top_10_products)

    #     productID  reviewCount
    # 0  B005SUHPO6          836
    # 1  B0042FV2SI          690
    # 2  B008OHNZI0          657
    # 3  B009RXU59C          634
    # 4  B000S5Q9CA          627
    # 5  B008DJIIG8          510
    # 6  B0090YGJ4I          448
    # 7  B009A5204K          434
    # 8  B00BT7RAPG          431
    # 9  B0015RB39O          424

    ## 3.2.1 get top 10 reviewers
    top_10_reviewers = df['reviewerID'].value_counts().head(
        10).reset_index().rename(columns={
            'index': 'reviewerID',
            'reviewerID': 'reviewCount'
        })
    print_header('Top 10 reviewers', char='-')
    print(top_10_reviewers)

    #        reviewerID  reviewCount
    # 0  A2NYK9KWFMJV4Y          152
    # 1  A22CW0ZHY3NJH8          138
    # 2  A1EVV74UQYVKRY          137
    # 3  A1ODOGXEYECQQ8          133
    # 4  A2NOW4U7W3F7RI          132
    # 5  A36K2N527TXXJN          124
    # 6  A1UQBFCERIP7VJ          112
    # 7   A1E1LEVQ9VQNK          109
    # 8  A18U49406IPPIJ          109
    # 9   AYB4ELCS5AM8P          107

    ## 3.2.2 Sentence segmentation
    print_header('3.2.2 Sentence Segmentation', 50)

    df['sentences'] = df['reviewText'].apply(segment_sent)
    df['sentenceCount'] = df['sentences'].apply(len)

    # plotting for number of sentences
    plot_bar(df['sentenceCount'], \
            title = 'Distribution of Number of Sentences for Each Review', \
            x_label = "Sentence Count", y_label = "Review Count", countplot = False)

    plot_bar(df['sentenceCount'].clip(0, 50), \
            title = 'Distribution of Number of Sentences for Each Review (Clipped)', \
            x_label = "Sentence Count (Clipped)", y_label = "Review Count", countplot = True)

    # get 5 random reviews to do sentence segmentation and display results
    reviews = df['reviewText']
    _seed = 43  # To give us an interesting result
    random_reviews = reviews.sample(5, random_state=_seed)
    random_reviews = pd.DataFrame(
        random_reviews,
        columns=['reviewText']).reset_index().drop(columns=['index'])
    random_reviews['segmentedSentences'] = random_reviews['reviewText'].apply(
        segment_sent)
    print(
        "5 Randomly selected reviews before and after sentence segmentation:")
    print(random_reviews)

    ## 3.2.3 Tokenization and Stemming
    print_header('3.2.3 Tokenization and Stemming', 50)

    df['tokenizedSentences'] = df['sentences'].apply(
        lambda sentences: [tokenize(sentence) for sentence in sentences])
    df['tokens'] = df['tokenizedSentences'].apply(flatten)

    ### No Stemming
    print_header('No Stemming', char='-')
    df['words'] = df['tokens'].apply(
        lambda tokens: [token.lower() for token in tokens])
    df['words'] = df['words'].apply(
        lambda tokens: [token for token in tokens if is_word(token)])
    df['uniqueWords'] = df['words'].apply(set)
    df['wordCount'] = df['uniqueWords'].apply(len)

    # token = {normal_word, emoji, stopword, punctuation}
    # word = {normal_word, emoji}

    plot_bar(
        df['wordCount'],
        title=
        'Distribution of Number of Words for Each Review Without Stemming',
        x_label="Word Count",
        y_label="Review Count",
        countplot=False)
    plot_bar(
        df['wordCount'].clip(0, 300),
        title=
        'Distribution of Number of Words for Each Review Without Stemming (Clipped)',
        x_label="Word Count (Clipped)",
        y_label="Review Count",
        countplot=False)

    words = flatten(df['words'])
    words_unique = flatten(df['uniqueWords'])

    top_20_words = pd.DataFrame.from_dict(Counter(words), orient='index').\
                reset_index().rename(columns = {'index': 'Word', 0: 'Count'}).\
                sort_values(['Count'], ascending = False).head(20).\
                reset_index().drop(columns = ['index'])

    print_header('Top 20 Words Without Stemming', char='-')
    print(top_20_words)

    ### With Stemming
    print_header('With Stemming', char='-')
    stemmer = SnowballStemmer("english")
    df['stemmedWords'] = df['words'].apply(
        lambda tokens: [stemmer.stem(token) for token in tokens])
    df['uniqueStemmedWords'] = df['stemmedWords'].apply(set)
    df['stemmedWordCount'] = df['uniqueStemmedWords'].apply(len)

    plot_bar(df['stemmedWordCount'], \
            title = 'Distribution of Number of Words for Each Review With Stemming', \
            x_label = "Stemmed Word Count", y_label = "Review Count", countplot = False)
    plot_bar(df['stemmedWordCount'].clip(0, 300), \
            title = 'Distribution of Number of Words for Each Review With Stemming (Clipped)', \
            x_label = "Word Count (Clipped)", y_label = "Review Count", countplot = False)

    plot_bar_overlap(df, ['wordCount', 'stemmedWordCount'], \
            title = 'Distribution of Number of Words for Each Review', \
            x_label = "Word Count", y_label = "Review Count", countplot = False)

    plot_bar_overlap(df[['wordCount', 'stemmedWordCount']].clip(0, 300), ['wordCount', 'stemmedWordCount'], \
            title = 'Distribution of Number of Words for Each Review (Clipped)', \
            x_label = "Word Count", y_label = "Review Count", countplot = False)

    stemmed_words = flatten(df['stemmedWords'])
    stemmed_words_unique = flatten(df['uniqueStemmedWords'])

    top_20_stemmed_words = pd.DataFrame.from_dict(Counter(stemmed_words), orient='index').\
                reset_index().rename(columns = {'index': 'Word', 0: 'Count'}).\
                sort_values(['Count'], ascending = False).head(20).\
                reset_index().drop(columns = ['index'])

    print_header('Top 20 Words with Stemming', char='-')
    print(top_20_stemmed_words)

    print_header('3.2.4 POS Tagging', 50)

    tokenized_sentences = pd.Series(flatten(df['tokenizedSentences']))
    print('Total Number of Sentences: ' + str(len(tokenized_sentences)))

    random_5_sentences = tokenized_sentences.sample(5, random_state=seed)
    random_5_df = pd.DataFrame(
        random_5_sentences,
        columns=['sentence']).reset_index().drop(columns=['index'])
    random_5_df['posTagged'] = random_5_df['sentence'].apply(pos_tag)
    print('=' * 30)
    print(random_5_df)
    print('=' * 30)

    # 3.3 Development of a Noun Phrase Summarizer
    print_header('3.3 Development of a Noun Phrase Summarizer', 50)

    df['posTagged'] = df['tokenizedSentences'].apply(
        lambda tokenizedSentences:
        [pos_tag(sentence) for sentence in tokenizedSentences])
    df['nounPhrases'] = df['posTagged'].apply(
        lambda posTagged: [
            phrase.lower()
            for phrase in flatten([extract_NP(tag) for tag in posTagged])
        ])
    df[['reviewText', 'posTagged', 'nounPhrases']].head()

    # Including single noun phrases
    print_header('Including single noun phrases', char='-')
    noun_phrases = pd.DataFrame.from_dict(Counter(flatten(df['nounPhrases'])), orient='index').\
                    reset_index().rename(columns = {'index': 'Noun Phrase', 0: 'Count'}).\
                    sort_values(['Count'], ascending = False)
    top_20_noun_phrases = noun_phrases.head(20).reset_index().drop(
        columns=['index'])

    print_header('Top 20 Noun Phrases Including Single Noun Phrases', char='-')
    print(top_20_noun_phrases)

    df['nounPhrasesExcludeSingle'] = df['nounPhrases'].apply(
        lambda noun_phrases: [
            noun_phrase for noun_phrase in noun_phrases
            if len(noun_phrase.split()) > 1
        ])
    noun_phrases = pd.DataFrame.from_dict(Counter(flatten(df['nounPhrasesExcludeSingle'])), orient='index').\
                    reset_index().rename(columns = {'index': 'Noun Phrase', 0: 'Count'}).\
                    sort_values(['Count'], ascending = False)
    top_20_noun_phrases = noun_phrases.head(20).reset_index().drop(
        columns=['index'])

    print_header('Top 20 Noun Phrases Excluding Single Noun Phrases', char='-')
    print(top_20_noun_phrases)

    products = df['asin'].value_counts().head(3).index
    products_np_top1 = df[df['asin'] == products[0]]
    products_np_top2 = df[df['asin'] == products[1]]
    products_np_top3 = df[df['asin'] == products[2]]

    print_representative_np(products_np_top1, product=products[0], n=30)
    print_representative_np(products_np_top2, product=products[1], n=30)
    print_representative_np(products_np_top3, product=products[2], n=30)

    random_5_reviews = df[['reviewText', 'posTagged',
                           'nounPhrases']].sample(5, random_state=seed)
    random_5_reviews['nounPhrasesLen'] = random_5_reviews['nounPhrases'].apply(
        len)

    print_header('Noun Phrase Detector Evaluation for Random 5 Reviews',
                 char='-')
    print(random_5_reviews)

    # 3.4. Sentiment Word Detection
    print(
        str(datetime.datetime.now()).split('.')[0] +
        ': Start processing sentiment word detection')

    # Without Stemming and Without Negation
    sentiment_score(df, "./rep_words/ns_nn.csv")

    # With Stemming and Without Negation
    sentiment_score(df, "./rep_words/s_nn.csv", stemmer=stemmer)

    # Without Stemming and With Negation
    sentiment_score(df, "./rep_words/ns_n.csv", convert_neg=True)

    # With Stemming and With Negation
    sentiment_score(df,
                    "./rep_words/s_n.csv",
                    stemmer=stemmer,
                    convert_neg=True)
Example no. 12
def hello_world():
    if request.method == "GET":
        return redirect("/app/index.html")
    else:
        pprint.pprint(request.form)
        pprint.pprint(request.files)

        #Language check
        if request.form['language'] not in ['english', 'dutch']:
            return jsonify(status='error', message="Invalid language!")

        #Input normalization
        if request.form['upload_option'] == 'text_field':
            input_text = request.form['upload_textarea']
        elif request.form['upload_option'] == 'url':
            page_text = requests.get(request.form['upload_url']).text
            soup = BeautifulSoup(page_text, "html.parser")
            input_text = soup.text
        elif request.form['upload_option'] == 'file':
            input_text = UnicodeDammit(
                request.files.get('upload_file').read()).unicode_markup

        #Stemmer selection
        if request.form['stemmer'] == 'no_stemmer':
            stemmer = None
        elif request.form['stemmer'] == 'porter':
            if request.form['language'] != 'english':
                return jsonify(status='error',
                               message="Invalid language for stemmer porter!")
            stemmer = PorterStemmer()
        elif request.form['stemmer'] == 'snowball':
            stemmer = SnowballStemmer(request.form['language'])
        else:
            return jsonify(status='error', message="Invalid stemmer!")

        #Lemmatizer selection
        if request.form['lemmatizer'] == 'lemmatizer_off':
            lemmatizer = None
        elif request.form['language'] == 'english':
            lemmatizer = lemmatizer_en
        else:
            lemmatizer = lemmatizer_nl

        #Stopwords selection
        if request.form['stopwords'] == 'no_stopwords':
            stopwords = None
        elif request.form['stopwords'] == 'our_stopwords':
            stopwords = obo.stopwords
        elif request.form['stopwords'] == 'custom_stopwords':
            custom_stopword_text = UnicodeDammit(
                request.files.get(
                    'custom_stopword_file').read()).unicode_markup
            stopwords = obo.stripNonAlphaNum(custom_stopword_text)

        #Process the text
        input_text_word_count = 0
        resulting_text = ""
        final_wordlist = []
        for word_type, word in text_processor.parse_text(input_text):
            if word_type == "non-word":
                resulting_text += word
            else:
                input_text_word_count += 1
                processed_word = word
                if stemmer:
                    processed_word = stemmer.stem(processed_word)
                if lemmatizer:
                    processed_word = lemmatizer(processed_word)
                if not stopwords or processed_word not in stopwords:
                    if request.form['exclude_vowels'] == 'exclude_vowels_yes':
                        if request.form['language'] == 'english':
                            regex = re_vowel_en
                        else:
                            regex = re_vowel_nl
                        processed_word = regex.sub("", processed_word)
                    resulting_text += processed_word
                    final_wordlist.append(processed_word)

        dictionary = obo.wordListToFreqDict(final_wordlist)
        sorteddict = obo.sortFreqDict(dictionary)

        ignore_results_amount = int(request.form['ignore_results_amount'])

        if ignore_results_amount > 0:
            initial_index = ignore_results_amount
            ignored_words = [word for rank, word in sorteddict[:initial_index]]
            sorteddict = sorteddict[initial_index:]
            new_text = ""
            new_wordlist = []
            for word_type, word in text_processor.parse_text(resulting_text):
                if word_type == "non-word":
                    new_text += word
                elif word not in ignored_words:
                    new_text += word
                    new_wordlist.append(word)
            resulting_text = new_text
            final_wordlist = new_wordlist

        else:
            initial_index = 0

        #Do the math!
        input_text_char_count = len(input_text)
        word_count = len(final_wordlist)
        distinct_words_count = len(sorteddict)
        words = []
        frequencies = []
        word_cloud = []
        for frequency, word in sorteddict:
            words.append(word)
            frequencies.append(frequency)
            word_cloud.append([word, frequency])

        acum_perc = Decimal(0)
        percentages = []
        acum_perc_list = []
        for freq in frequencies:
            perc = Decimal((freq * 100.0) / word_count)
            percentages.append(round(perc, 2))
            acum_perc += perc
            acum_perc_list.append(round(acum_perc, 2))

        logarithms = []
        for i in range(len(sorteddict)):
            logarithms.append((math.log(i + 1), math.log(frequencies[i])))

        #Calculate Linear regression
        #http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.lstsq.html#numpy.linalg.lstsq
        x = numpy.array([math.log(f) for f in frequencies])
        y = numpy.array(
            [math.log(rank) for rank in range(1, distinct_words_count + 1)])
        A = numpy.vstack([x, numpy.ones(len(x))]).T
        m, c = numpy.linalg.lstsq(A, y, rcond=None)[0]
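        # For Zipf-like text the fitted slope m of log(rank) against log(frequency)
        # is close to -1; c is the intercept used below to place the regression line.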

        #Calculate the regression line start and end,
        #  and sort making the start be the one with the lower X value
        #  (highcharts requires this)
        regline_start = (0, c)
        regline_end = (math.log(distinct_words_count),
                       math.log(distinct_words_count) * m + c)
        regression_line = {'start': regline_start, 'end': regline_end}

        return jsonify(status='success',
                       words=words,
                       frequencies=frequencies,
                       percentages=percentages,
                       acum_perc_list=acum_perc_list,
                       logarithms=logarithms,
                       regression_line=regression_line,
                       resulting_text=resulting_text,
                       input_text_char_count=input_text_char_count,
                       input_text_word_count=input_text_word_count,
                       output_text_word_count=word_count,
                       word_cloud=word_cloud,
                       sorteddict=sorteddict)
Example no. 13
best_models = []
dummy_models = []
model = None  # Place holder
# Process each language independently
for lang in reviews_by_language.keys():
    print("PROCESSING ", lang)
    stem_lang = get_stemmer_lang(lang)
    if stem_lang is None:
        # Use default analyser if there is no matching stemmer for this language
        analyzer_for_lang = 'word'
    else:
        # Language has a stemmer
        analyzer_for_lang = stemmed_words
        # Redefine stemmer with specified language
        stemmer = SnowballStemmer(stem_lang)
    stem_vectorizer = CountVectorizer(analyzer=analyzer_for_lang,
                                      ngram_range=(2, 2))
    try:
        tokens = stem_vectorizer.fit_transform(reviews_by_language[lang]["x"])
    except Exception:
        # On tokeniser error, skip the language
        continue
    X = np.array(tokens.toarray())
    y = np.array(reviews_by_language[lang]["y"])
    # use this line instead of the above one for early access models
    # y = np.array(reviews_by_language[lang]["z"])

    # Skip languages with less than 5 reviews (not possible with k-fold)
    # this may trigger depending on sampling size used
    if len(X) < 5:
Example no. 14
from nltk import bigrams, trigrams
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer

def tokeAndClean(str, bgrams = False, tgrams = False, stopwords = stopwords.words('english'), ngramMinFreq = 2, stemming = True, stemmer = SnowballStemmer('english')):
	tokenizer = RegexpTokenizer("[\w']+")

	tokens = tokenizer.tokenize(str)
	# lower-cases everything, removes words of 2 letters or fewer
	tokens = [token.lower() for token in tokens if len(token) > 2]
	
	if stemming:
		try:
			tokens = [stemmer.stem(token) for token in tokens if len(token) > 2]
	# Snowball sometimes raises an error on very short tokens, so fall back to a stricter length filter
		except:
			tokens = [stemmer.stem(token) for token in tokens if len(token) > 3]

	def cleanNGram(ngrams):
		out = [' '.join(token) for token in ngrams]
		# includes only those ngrams which occur at least ngramMinFreq times
		out = [ngram for ngram in out if out.count(ngram) >= ngramMinFreq]
		return out
	
	# adds cleaned bigrams and trigrams if necessary
	if(bgrams): tokens.extend(cleanNGram(bigrams(tokens)))
	if(tgrams): tokens.extend(cleanNGram(trigrams(tokens)))
		
	
	return tokens
Example no. 15
def get_answer(question, story):
    """
    :param question: dict
    :param story: dict
    :return: str


    question is a dictionary with keys:
        dep -- A list of dependency graphs for the question sentence.
        par -- A list of constituency parses for the question sentence.
        text -- The raw text of story.
        sid --  The story id.
        difficulty -- easy, medium, or hard
        type -- whether you need to use the 'sch' or 'story' versions
                of the .
        qid  --  The id of the question.


    story is a dictionary with keys:
        story_dep -- list of dependency graphs for each sentence of
                    the story version.
        sch_dep -- list of dependency graphs for each sentence of
                    the sch version.
        sch_par -- list of constituency parses for each sentence of
                    the sch version.
        story_par -- list of constituency parses for each sentence of
                    the story version.
        sch --  the raw text for the sch version.
        text -- the raw text for the story version.
        sid --  the story id


    """
    ###     Your Code Goes Here         ###
    # Our tools

    stemmer = SnowballStemmer("english")
    chunker = nltk.RegexpParser(GRAMMAR)
    lmtzr = WordNetLemmatizer()

    driver = QABase()

    # question["qid"] returns the form: "fables-04-7"
    q = driver.get_question(question["qid"])
    current_story = driver.get_story(q["sid"])

    #############################################
    # if question["qid"] == 'blogs-03-1':
    #     print(question["text"])
    #     print(sent_tokenized_text[0])
    #     print("++++++++++++++++++++++++++++++++++++++++++++++")
    ############################################

    stopwords = set(nltk.corpus.stopwords.words("english"))


    if (question["difficulty"] == 'Easy'):



        if question["type"] != 'Story':
            sentences = get_sentences(current_story["sch"])
            text = story["sch"]
            text = nltk.sent_tokenize(text)

        else:
            sentences = get_sentences(current_story["text"])
            text = story["text"]
            text = nltk.sent_tokenize(text)

        Q = nltk.word_tokenize(question["text"].lower())
        # print(Q)

        all_stemmed_sentences = []
        for sent in sentences:
            temp_sent = []
            for w, pos in sent:
                temp_sent.append((stemmer.stem(w), pos))
            all_stemmed_sentences.append(temp_sent)
        stop_words = set(nltk.corpus.stopwords.words("english"))
        qbow = get_bow(get_sentences(question["text"])[0], stopwords)
        stemmed_qbow = []
        for w in qbow:
            stemmed_qbow.append(stemmer.stem(w))
        stemmed_qbow = set(stemmed_qbow)
        best_idx = best_overlap_index(stemmed_qbow, all_stemmed_sentences, stop_words, question)
        # print(question["qid"], best_idx)

        # tokenize questions, also removing punctuations to extract keywords
        tokenizer = RegexpTokenizer(r'\w+')
        tokenized_question_text = tokenizer.tokenize(question["text"])
        tagged_tokenized_question_text = nltk.pos_tag(tokenized_question_text)

        # remove stopwords
        tagged_keywords_list = []

        for word, tag in tagged_tokenized_question_text:
            if word not in stopwords:
                tagged_keywords_list.append((word, tag))

        # lemmatize keywords
        lemmatized_keywords_list = []
        for keyword, tag in tagged_keywords_list:
            lemmatized_keywords_list.append(stemmer.stem(keyword))

        #####################################################
        # if question["qid"] == 'fables-04-6':
        #     print("text:", text)
        #     print("best index:", best_idx)
        #     print("qid:", question["qid"])
        #     print(text[best_idx])
        #     print("==============================")
        #     print(get_sentences("".join(text)))
        #####################################################


        best_sent = get_sentences(text[best_idx])

        # Find the sentences that have all of our keywords in them
        # Last time, 2nd arg is sentences = get_sentences(text) which returns tuple of each word
        target_sentences = find_sentences(lemmatized_keywords_list, best_sent)
        # Extract the candidate locations from these sentences
        candidates_forest = find_candidates(target_sentences, chunker, question["text"])

        if len(candidates_forest) == 0:
            answer = doBaseline(question, story)
        else:

            possible_answers_list = []

            # locations is a list of trees
            for candidate in candidates_forest:
                # candidate.draw()
                possible_answers_list.append(" ".join([token[0] for token in candidate.leaves()]))
            answer = " ".join(possible_answers_list)

            ###########################################
            # currently, possible_answer contains the actual needed answer,
            # plus some garbage words around it from chunking,
            # we might be able to filter this out SOMEHOW
            # possible_answer is a list of strings
            ###########################################


    elif question["difficulty"] == 'Medium':

        if question["type"] != 'Story':
            sentences = get_sentences(current_story["sch"])
        else:
            sentences = get_sentences(current_story["text"])

        Q = nltk.word_tokenize(question["text"].lower())
        # print(Q)

        all_stemmed_sentences = []
        for sent in sentences:
            temp_sent = []
            for w, pos in sent:
                temp_sent.append((stemmer.stem(w), pos))
            all_stemmed_sentences.append(temp_sent)
        stop_words = set(nltk.corpus.stopwords.words("english"))
        qbow = get_bow(get_sentences(question["text"])[0], stopwords)
        stemmed_qbow = []
        for w in qbow:
            stemmed_qbow.append(stemmer.stem(w))
        stemmed_qbow = set(stemmed_qbow)
        best_idx = best_overlap_index(stemmed_qbow, all_stemmed_sentences, stop_words, question)
        # print(question["qid"], best_idx)

        if question["type"] != 'Story':
            tree = current_story["sch_par"][best_idx]
        else:
            tree = current_story["story_par"][best_idx]

        #############################################
        # if question["qid"] == 'blogs-03-13':
        #     print(Q)
        #     print(tree)
        #     print("++++++++++++++++++++++++++++++++++++++++++++++")
        ############################################
        # print(tree)
        # Create our pattern

        #########################################
        # MAKE PATTERN FIT FOR TYPE OF QUESTION #
        #########################################
        # print(Q[0])
        if Q[0] == 'where' or Q[0] == 'when':
            pattern = nltk.ParentedTree.fromstring("(VP (*) (PP))")
        elif Q[0] == 'who':
            pattern = nltk.ParentedTree.fromstring("(NP)")
        elif Q[0] == 'what':
            pattern = nltk.ParentedTree.fromstring("(NP)")
        elif Q[0] == 'why':
            pattern = nltk.ParentedTree.fromstring("(SBAR)")
        elif Q[0] == 'how':
            pattern = nltk.ParentedTree.fromstring("(RB)")

        # don't know how to deal with 'did' questions
        elif Q[0] == 'did':
            pattern = nltk.ParentedTree.fromstring("(S)")

        subtree1 = pattern_matcher(pattern, tree)

        ############################################
        # if question["qid"] == 'blogs-03-13':
        #     print("subtree1")
        #     print(subtree1)
        ############################################
        if subtree1 is None:
            #######################################
            answer = doBaseline(question, story)
            # answer = "doBaseline"
            #######################################
        else:
            # create a new pattern to match a smaller subset of subtrees
            if Q[0] == 'where' or Q[0] == 'when':
                pattern = nltk.ParentedTree.fromstring("(VP)")
            elif Q[0] == 'who':
                pattern = nltk.ParentedTree.fromstring("(NP)")
            elif Q[0] == 'what':
                pattern = nltk.ParentedTree.fromstring("(NP)")
            elif Q[0] == 'why':
                pattern = nltk.ParentedTree.fromstring("(SBAR (IN) (S))")
            elif Q[0] == 'how':
                pattern = nltk.ParentedTree.fromstring("(RB)")

            # don't know how to deal with 'did' questions
            elif Q[0] == 'did':
                pattern = nltk.ParentedTree.fromstring("(S)")


            # Find and make the answer
            # print(subtree)
            subtree2 = pattern_matcher(pattern, subtree1)
            if subtree2 is None:
                #######################################
                answer = doBaseline(question, story)
                # answer = "doBaseline"
                #######################################
            else:
                answer = " ".join(subtree2.leaves())

            ############################################
            # if question["qid"] == 'mc500.train.18.18':
            #     print("subtree2")
            #     print(subtree2)
            ############################################
            # cheat for dealing with 'did' questions
            if Q[0] == 'did':
                answer = "yes"

    elif question["difficulty"] == 'Hard':

        answer = "h"


    elif question["difficulty"] == 'Discourse':

        answer = "h"


    else:
        #########################################
        answer = doBaseline(question, story)
        # answer = "doBaseline"
        #########################################

    ###     End of Your Code         ###
    return answer
Example no. 16
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib

from rest import models
from django.conf import settings

######---------- VERSION 1.0 ----------######

medicines = pd.read_csv("data/baseDatos-completa.csv", header=0, delimiter=",", encoding = "utf-8")  #Obtaining the medicines names from the file
medicines_list = list(set([w.lower() for w in medicines["nombre-marca"]])) 						#Putting them tidily into a list
medicines = re.compile(r"\b" + r"\b|\b".join(map(re.escape, medicines_list)) + r"\b") 			#Then making it into a regex
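# e.g. for the (illustrative) list ['aspirina', 'ibuprofeno'] the compiled pattern is r"\baspirina\b|\bibuprofeno\b"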

stops = set(stopwords.words("spanish")) #Quicker to search in a set, so putting the stopwords in it

stemmer = SnowballStemmer("spanish")    #Initializing stemmer

forest = joblib.load('classifier/logistic_regression')	#Loading already trained logistic regression and initializing vectorizer
vectorizer = joblib.load('classifier/vectorizer')

#TOO MANY ISSUES WITH STREAMING, still here for archiving purposes
#class MyStreamListener(tp.StreamListener):	#Streamer for tweets
#	def on_status(self, status):	#What to do when it gets a tweet, we just classify it
#		classified = classify(status)
#
#	def on_error(self, status_code):	#In case of error, print code on screen
#		print(status_code)
#		return True

class Tweet:	#Tweet class for quicker and easier manipulation
	def __init__(self, url, text, medicines):	#Initially tweets only have their url, their cleaned text and the medicines found
Example no. 17
        meaningful_words = [w for w in words if not w in stops]
        clean = []
        for word in meaningful_words:
            clean.append(SnowballStemmer("english").stem(word))
        patrick_repub.append(clean)
        jasper_repub.append(1)
        print(patrick_repub, 'hi')


loop = asyncio.get_event_loop()
loop.run_until_complete(wait(print(patrick_repub)))

patrick_demo = []
jasper_demo = []

for tweet in collection.distinct('Text', {'Classification': 0}):
    meaningful_words = []
    nonum = re.sub(r"[\d*]", "number ", tweet)
    letters_only = re.sub("[^a-zA-Z]", " ", nonum)
    nourlwords = re.sub(r'^https?:\/\/.*[\r\n]*',
                        'http ',
                        letters_only,
                        flags=re.MULTILINE)
    words = nourlwords.lower().split()
    stops = set(stopwords.words("english"))
    meaningful_words = [w for w in words if not w in stops]
    clean = []
    for word in meaningful_words:
        clean.append(SnowballStemmer("english").stem(word))
    patrick_demo.append(clean)
    jasper_demo.append(0)
Example no. 18
#import fastcluster
import re
import nltk
import pandas as pd
import scipy.cluster.hierarchy as hcluster
import matplotlib.pylab as plt
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.snowball import SnowballStemmer
import numpy as np

df = pd.read_csv(r'D:\mCaas\Top 3 ques\Top3Data.csv')

quesdf = df.dropna(subset=['Query_Str'])
quesdf["DateTime"] = pd.to_datetime(quesdf["DateTime"])
# load nltk's English stopwords as variable called 'stopwords'
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords)

stemmer = SnowballStemmer(
    "english", ignore_stopwords=True
)  # stems words, e.g. 'running' -> 'run'; ignore_stopwords=True leaves stop words like 'having' unstemmed
print(stemmer.stem('running'))
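# prints 'run'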


#here I define a tokenizer and stemmer which returns the set of stems in the text that it is passed
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [
        word for sent in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sent)
    ]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    # stem the filtered tokens (assumed completion of the truncated snippet)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems
Example no. 19
# NLTK
# Removing stop words
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                     ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

# In[26]:

# Stemming Code

import nltk
nltk.download()

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)


class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])


stemmed_count_vect = StemmedCountVectorizer(stop_words='english')
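# With stemming, inflected forms such as 'run', 'runs' and 'running' all map to
# the single feature 'run', so the vocabulary the classifier sees is smaller.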

text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect),
                             ('tfidf', TfidfTransformer()),
                             ('mnb', MultinomialNB(fit_prior=False))])

text_mnb_stemmed = text_mnb_stemmed.fit(twenty_train.data, twenty_train.target)
Example no. 20
 def __init__(self):
     self.stemmer = SnowballStemmer("english", ignore_stopwords=True)
Example no. 21
import pandas as pd
import numpy as np
from collections import Counter
from clasificator import KNN_classifier
from sklearn.neighbors import KNeighborsClassifier
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import f1_score
import re

lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer('italian')
stop_words = set(stopwords.words('italian'))


def tokenize(text):
    '''
    Generic wrapper around different tokenization methods.
    '''
    text = str(text)
    text = text.lower()
    text = text.strip()  # strip leading/trailing whitespace
    text = text.replace('{html}', "")
    text = re.sub(r'@[A-Z0-9a-z_:!@#$%^&()=+,.></?;|]+', '', text)
    text = re.sub(r'#[A-Z0-9a-z_:!@#$%^&()=+,.></?;|]+', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\d+', '', text)
Example no. 22
Stemmer to Snowball Stemmer
'''
# Import the toolkit and the full Porter Stemmer library
import nltk
from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer

print("---------- Create string vector and apply porterstemmer method:", "\n")

#PorterStemmer method
p_stemmer = PorterStemmer()

words = ['run','runner','running','ran','runs','easily','fairly','consolingly']

for word in words:
    print(word+' --> '+p_stemmer.stem(word))

print("---------- Apply snowballstemmer method:", "\n")

# The Snowball Stemmer requires that you pass a language parameter
s_stemmer = SnowballStemmer(language='english')    
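# SnowballStemmer.languages lists all the languages this stemmer supports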

for word in words:
    print(word+' --> '+s_stemmer.stem(word))

print("---------- Create sentence and apply porterstemmer method:", "\n")

phrase = 'I am meeting him tomorrow at the meeting'
for word in phrase.split():
    print(word+' --> '+p_stemmer.stem(word))    
Example no. 23
This function is used within preprocess(), which is used as a pre-processing chain: in this case we simply add a lowercasing feature for all the
tokens that are not emoticons (e.g. :D doesn’t become :d).
"""
cle = sys.argv[1]
if cle == '-h':
    print('passez en argument la clé pour trouver les élément dans la base')

database = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)
listOfTweets = getTweetsByHash(cle, database)

punctuation = list(string.punctuation)
stop = stopwords.words('french') + punctuation + [
    'via', 'le', 'les', 'a', 'rt'
]  # List of tokens to remove

stemmer1 = SnowballStemmer('french')
stemmer2 = FrenchStemmer()

count_stop = Counter()  # Initialize a counter
count_stem1 = Counter()  # Initialize a counter
count_stem2 = Counter()  # Initialize a counter
for tweet in listOfTweets:
    try:
        tweetText = getTweetText(tweet)
        print(tweetText)
        tokens = preprocess(tweetText)  # Tokenize the text
        print('tokens')
        print(tokens)
        terms_stem = [stemmer1.stem(term) for term in tokens]
        print('stem sans stop')
        print(terms_stem)
Example no. 24
def wordStemmingSnowball(word):
    stemmer = SnowballStemmer("english")
    stem = str(stemmer.stem(word))
    return stem
Example no. 25
ABREVIATIONS_DICT = {
    "'m": ' am',
    "'ve": ' have',
    "'ll": " will",
    "'d": " would",
    "'s": " is",
    "'re": " are",
    "  ": " ",
    "' s": " is",

    # debatable between and/or
    "/": " and "
}

STOPWORDS_SET = set(stopwords.words('english'))
SNOWBALL = SnowballStemmer('english')
WORDNET = WordNetLemmatizer()


def find_stop_words(corpus):
    '''
    takes in a normalized corpus and returns stop words in pandas Series
    '''
    unpacked_list = [word for document in corpus for word in document.split()]

    return pd.Series(unpacked_list).value_counts()


# I question the need for this but let's just do it for now
def _multiple_replace(text, adict=ABREVIATIONS_DICT):
    import re
Example no. 26
import string  #need for testing if punctuation in word
from nltk.stem.snowball import SnowballStemmer


def parseOutText(f):
    """ given an opened email file f, parse out all text below the
        metadata block at the top
        (in Part 2, you will also add stemming capabilities)
        and return a string that contains all the words
        in the email (space-separated)

        example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)
        f - one email passed in

        """
    print('\nBegin parse_out_email_text.py parseOutText function\n')

    myReturnString = ''
    f.seek(0)  ### go back to beginning of file (annoying)
    all_text = f.read()
    # print("all_text - begin .............................")
    # print(all_text)
    # print("all_text - end .............................\n")
    # print("type(all_text) - {}\n".format(type(all_text)))
    # type(all_text) - <class 'str'>

    ### split off metadata

    content = all_text.split("X-FileName:")  # split on text in email
    # example from email -    X-FileName: Stokley, Chris (Non-Privileged).pst

    #print("len(content) - {}\n".format(len(content)))

    # print("content[0] - {}".format(content[0]))
    # print("type(content[0]) - {}".format(type(content[0])))
    #        type(content[0]) - <class 'str'>

    # content[1] - With original punctuation from email
    # print("content[1] - begin .....")
    # print(content[1])
    # print("content[1] - end .....\n")
    # print("type(content[1]) - {}".format(type(content[1])))
    #        type(content[1]) - <class 'str'>

    words = ""
    if len(content) > 1:
        ### remove punctuation
        # text_string = content[1].translate(string.maketrans("", ""), string.punctuation) # only works in older Python 2

        # print("string.punctuation - {}\n".format(string.punctuation))
        #      string.punctuation - !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~  # all of these - None
        # print("type(string.punctuation) - {}\n".format(type(string.punctuation)))
        #        type(string.punctuation) - <class 'str'>

        # print('str.maketrans("", "", string.punctuation) - ')
        # print(str.maketrans("", "", string.punctuation))
        # {64: None, 124: None, 125: None, 91: None, 92: ....
        # Python documentation - dictionary mapping Unicode ordinals (integers) or characters (strings of length 1) to Unicode ordinals, strings (of arbitrary lengths) or None.
        # print("type(str.maketrans("", "", string.punctuation)) - {}\n".format(type(str.maketrans("", "", string.punctuation))))
        #        type(str.maketrans(, , string.punctuation)) - <class 'dict'>

        text_string = content[1].translate(
            str.maketrans("", "", string.punctuation))

        # Without original punctuation from email
        # print("text_string (punctuation stripped out) - ")
        # print(text_string)
        # print()
        # print("type(text_string) - {}\n".format(type(text_string)))
        #        type(text_string) - <class 'str'>

        ### project part 2: comment out the line below
        words = text_string
        # print("words - ")
        # print(words)
        # print()
        # print("type(words) - {}\n".format(type(words)))
        #        type(words) - <class 'str'>

        ### split the text string into individual words, stem each word,
        ### and append the stemmed word to words (make sure there's a single
        ### space between each stemmed word)

        mySplitOutput = text_string.split()
        # print("mySplitOutput - {}\n".format(mySplitOutput))
        #      mySplitOutput - ['Hi', 'Everyone', 'If', 'you', 'can', 'read', 'this', 'message', 'youre', 'properly', 'using', 'parseOutText', 'Please', 'proceed', 'to', 'the', 'next', 'part', 'of', 'the', 'project']
        # print("type(mySplitOutput) - {}\n".format(type(mySplitOutput)))
        #        type(mySplitOutput) - <class 'list'>

        # done AFTER stemming
        # vectorizer = CountVectorizer()
        myStemmer = SnowballStemmer('english')
        # print("myStemmer - {}".format(myStemmer))
        #       myStemmer - <nltk.stem.snowball.SnowballStemmer object at 0x10b4b57f0>
        # print("type(myStemmer) - {}\n".format(type(myStemmer)))
        #        type(myStemmer) - <class 'nltk.stem.snowball.SnowballStemmer'>
        for myWord in mySplitOutput:
            # print("myWord - {}".format(myWord))
            # print("type(myWord) - {}\n".format(type(myWord)))
            #        type(myWord) - <class 'str'>
            myStemmedWord = myStemmer.stem(myWord)
            # print("myStemmedWord - {}\n".format(myStemmedWord))
            # print("type(myStemmedWord) - {}\n".format(type(myStemmedWord)))
            #        type(myStemmedWord) - <class 'str'>
            # print("{} - {}".format(myWord, myStemmedWord))
            myReturnString = myReturnString + myStemmedWord + ' '
        # print()

        print('\nEnd parse_out_email_text.py parseOutText function\n')

    return myReturnString
Example no. 27
 def prepareParams(self):
     self.stopwords = set(stopwords.words('english'))
     self.dataFile = STYLE_WITH_DESC_N_TITLE
     self.indexFile = INVERTED_IDX_FILE
     self.stemmer = SnowballStemmer('english')  #PorterStemmer()
     self.lemmatizer = WordNetLemmatizer()
Example no. 28
 def __init__(self, language):
     self.s = sume.ConceptBasedILPSummarizer(" ", language)
     self.LANGUAGE = language
     self.stoplist = set(stopwords.words(self.LANGUAGE))
     self.stemmer = SnowballStemmer(self.LANGUAGE)
Example no. 29
 def __init__(self, articles):
     self.searcher = articlesearch.ArticleSearch(articles)
     self.keys = keywords.KeyWords()
     self.stemmer = SnowballStemmer("english")
    def features(self, tokens, index, history):
        # for more details see: http://nlpforhackers.io/named-entity-extraction/
        """
        `tokens`  = a POS-tagged sentence [(w1, t1), ...]
        `index`   = the index of the token we want to extract features for
        `history` = the previous predicted IOB tags
        """

        # init the stemmer
        stemmer = SnowballStemmer('english')

        # Pad the sequence with placeholders
        tokens = [
            ('[START2]', '[START2]'), ('[START1]', '[START1]')
        ] + list(tokens) + [('[END1]', '[END1]'), ('[END2]', '[END2]')]
        history = ['[START2]', '[START1]'] + list(history)

        # shift the index with 2, to accommodate the padding
        index += 2

        word, pos = tokens[index]
        prevword, prevpos = tokens[index - 1]
        prevprevword, prevprevpos = tokens[index - 2]
        nextword, nextpos = tokens[index + 1]
        nextnextword, nextnextpos = tokens[index + 2]
        previob = history[index - 1]
        contains_dash = '-' in word
        contains_dot = '.' in word
        allascii = all(c in string.ascii_lowercase for c in word)

        allcaps = word == word.capitalize()
        capitalized = word[0] in string.ascii_uppercase

        prevallcaps = prevword == prevword.capitalize()
        prevcapitalized = prevword[0] in string.ascii_uppercase

        nextallcaps = nextword == nextword.capitalize()
        nextcapitalized = nextword[0] in string.ascii_uppercase

        f = {
            'word': word,
            'lemma': stemmer.stem(word),
            'pos': pos,
            'all-ascii': allascii,
            'next-word': nextword,
            'next-lemma': stemmer.stem(nextword),
            'next-pos': nextpos,
            'next-next-word': nextnextword,
            'nextnextpos': nextnextpos,
            'prev-word': prevword,
            'prev-lemma': stemmer.stem(prevword),
            'prev-pos': prevpos,
            'prev-prev-word': prevprevword,
            'prev-prev-pos': prevprevpos,
            'prev-iob': previob,
            'contains-dash': contains_dash,
            'contains-dot': contains_dot,
            'all-caps': allcaps,
            'capitalized': capitalized,
            'prev-all-caps': prevallcaps,
            'prev-capitalized': prevcapitalized,
            'next-all-caps': nextallcaps,
            'next-capitalized': nextcapitalized,
        }

        return f