# Preprocessing of the main ad text

# Lowercase the main text
df['description'] = df['description'].apply(
    lambda x: " ".join(word.lower() for word in x.split()))

# Remove punctuation from the main text (regex=True keeps the pattern a regex
# on newer pandas, where the default changed)
df['description'] = df['description'].str.replace(r'[^\w\s]', '', regex=True)

# Remove stop words from the main text
df['description'] = df['description'].apply(
    lambda x: " ".join(word for word in x.split() if word not in stop))

# Stemming
# from nltk.stem import PorterStemmer
# st = PorterStemmer()
from nltk.stem.snowball import SnowballStemmer
st = SnowballStemmer("russian")
df['description'] = df['description'].apply(
    lambda x: " ".join(st.stem(word) for word in x.split()))
# print(st.stem("перепрыгивающий"))  # results in перепрыгива

##############################################################################
dataT = df['description']

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# count_vect = CountVectorizer(max_features=10)
# dataT_counts = count_vect.fit_transform(dataT)
# print("countvect", dataT_counts.toarray().sum(axis=1))
# tfidf_transformer = TfidfTransformer(use_idf=True)
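# The commented-out vectorizer lines above hint at the next step: turning the
# stemmed descriptions into TF-IDF features. A minimal sketch of that step
# (the max_features value and the `target` column are assumptions, not part
# of the original):
tfidf_vect = TfidfVectorizer(max_features=10000)
X = tfidf_vect.fit_transform(dataT)
# With a label column, the usual split would follow:
# X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.2)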
import re
from nltk.stem.snowball import SnowballStemmer


def stemTokenize(doc):
    stemmer = SnowballStemmer('english')
    return [stemmer.stem(word) for word in re.findall(r'\b\w+\b', doc)]
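# stemTokenize has the signature scikit-learn expects of a custom tokenizer,
# so one plausible (hypothetical, not shown in the original) use is plugging
# it into a vectorizer for a stem-aware bag of words:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(tokenizer=stemTokenize)
X = vect.fit_transform(["Dogs are running", "A dog runs"])
print(vect.get_feature_names_out())  # e.g. ['a', 'are', 'dog', 'run'] on recent scikit-learn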
import sklearn.svm as sksvm
import sklearn.linear_model as sklin
import inspect
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
import matplotlib.pyplot as plt
import pandas as pd
import os
from itertools import compress
import logging, gensim, os
from gensim.models.keyedvectors import KeyedVectors
from nltk.stem.snowball import SnowballStemmer

setwd = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) + '/'
prewd = os.path.abspath(os.path.join(setwd, os.pardir))
stemmer = SnowballStemmer("french")
models = gensim.models.Word2Vec.load(prewd + '/data/stemmed_frwiki.bin')


# In[4]:

class suffix:
    # Typical masculine and feminine French noun endings (a few mis-encoded
    # dashes and accents from the original are repaired here).
    msuffix = ('-an,-and, -ant, -ent, -in, -int, -om, -ond, -ont,-eau, -au, -aud, -aut, -o, -os, -ot,'
               '-ai, -ais, -ait, -es, -et,-ou, -out, -out, -oux,-i, -il, -it, -is,-y,-at, -as, -ois,-oit,'
               '-u,-us,-ut,-eu,-er,-age, -ege, -ème, -ome,-ôme, -aume, -isme,-as, -is, -os, -us, -ex,'
               '-it, -est,-al, -el, -il, -ol, -eul, -all,-if, -ef,-ac, -ic, -oc, -uc,-am, -um, -en,'
               '-air, -er, -erf, -ert, -ar, -arc, -ars, -art, -our, -ours, -or, -ord, -ors, -ort, -ir, -oir,'
               '-eur,-ail, -eil, -euil, -ueil,-ing')
    msuffix = msuffix.split(',')
    fsuffix = ('aie, -oue, -eue, -ion, -te, -ée, -ie, -ue, -asse, -ace, -esse, -ece, -aisse, -isse,'
               '-ice, -ousse, -ance, -anse, -ence, -once,-enne, -onne, -une, -ine, -aine, -eine, -erne,'
               '-ande, -ende, -onde, -ade, -ude, -arde, -orde,'
               '-euse, -ouse, -ase, -aise, -ese, -oise, -ise, -yse, -ose, -use,'
               '-ache, -iche, -eche, -oche, -uche, -ouche, -anche,-ave, -eve, -ive,-iere, -ure, -eure,'
               '-ette, -ete, -ête, -atte, -otte, -oute, -orte, -ante, -ente, -inte, -onte,'
               '-alle, -elle, -ille, -olle,-aille, -eille, -ouille,-appe, -ampe, -ombe,-igue')
    fsuffix = fsuffix.split(',')
    ms = []
    for i in range(0, len(msuffix)):
#!/usr/bin/env python
"""coOccuranceMapper.py"""

import sys
import re
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")

stopwords = [
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
    "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself",
    "she", "her", "hers", "herself", "it", "its", "itself", "they", "them",
    "their", "theirs", "themselves", "what", "which", "who", "whom", "this",
    "that", "these", "those", "am", "is", "are", "was", "were", "be", "been",
    "being", "have", "has", "had", "having", "do", "does", "did", "doing",
    "a", "an", "the", "and", "but", "if", "or", "because", "as", "until",
    "while", "of", "at", "by", "for", "with", "about", "against", "between",
    "into", "through", "during", "before", "after", "above", "below", "to",
    "from", "up", "down", "in", "out", "on", "off", "over", "under", "again",
    "further", "then", "once", "here", "there", "when", "where", "why",
    "how", "all", "any", "both", "each", "few", "more", "most", "other",
    "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than",
    "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"
]

# MUST MANUALLY CHANGE FOR EACH SUB-TOPIC
topten = [
    'train', 'get', 'like', 'im', 'go', 'station', 'time', 'one', 'dont',
    'peopl'
]
s = s.value_counts()
s[:5]

# Keywords occur in frequencies ranging from 1 to 610. We do not have any use
# for keywords that occur only once, so these can safely be removed. Finally,
# we will convert every word to its stem so that words such as *Dogs* and
# *Dog* are considered the same.

# In[ ]:

s = s[s > 1]

# In[ ]:

stemmer = SnowballStemmer('english')
stemmer.stem('dogs')

# In[ ]:

def filter_keywords(x):
    words = []
    for i in x:
        if i in s:  # membership test against the index of the counts Series
            words.append(i)
    return words

# In[ ]:
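# The application step for filter_keywords is not shown above; a plausible
# continuation (the 'keywords' column name is an assumption) filters the
# column and then stems every surviving keyword:
df['keywords'] = df['keywords'].apply(filter_keywords)
df['keywords'] = df['keywords'].apply(lambda kws: [stemmer.stem(k) for k in kws])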
""" This is main script to extract all features from books (all books/pages for training and prediction must go through this) """ #import necessary libraries from __future__ import print_function #need this to print to file import xml.etree.ElementTree as ET #need for parsing XML file of book import string #need for testing if punctuation in word from nltk.stem.snowball import SnowballStemmer #need to stem words for cookWords and measureWords features stemmer = SnowballStemmer("english") #set up stemmer import os from os import listdir #need for reading all files from folder import csv import itertools #need for condensing list #function to parse a book and print its features to a file #parameters: takes a XML book file; like in this form: 'foodNewsletter.xml' #and takes file name; if file DNE, will make file of that name #Will make new output file for each xml book (if give new output file name for each book) def parsePrint(xmlBk, f, cookWords, measures, foods): with open(xmlBk, 'rb') as xml_bk: tree = ET.parse(xml_bk) #parse xml pages = tree.findall( ".//OBJECT") #store all the bk pages in list called 'pages' count = 0 #count number of iterations of for loop avgP = avgPunc(pages) avgW = avgWord(pages) for p in pages: count += 1
print(stem_out)

# Process stem analysis
eng_tokens_stem = []
print("TOKEN\t\tSTEM")
for eng_token in eng_tokens:
    eng_token_stem_radix = eng_porter_stemmer.stem(eng_token)
    eng_tokens_stem.append(eng_token_stem_radix)
    print("%s\t\t%s" % (eng_token, eng_token_stem_radix))

print("*** ENG - Snowball Stemmer algorithm ***")
from nltk.stem.snowball import SnowballStemmer

print("Original phrase")
print(eng_textToAnalize_03)
eng_tokens = word_tokenize(eng_textToAnalize_03)
eng_snowball_stemmer = SnowballStemmer("english")

# Process Snowball stem analysis
eng_tokens_snowball_stem = []
print("TOKEN\t\tSTEM")
for eng_token in eng_tokens:
    eng_token_stem_radix = eng_snowball_stemmer.stem(eng_token)
    eng_tokens_snowball_stem.append(eng_token_stem_radix)
    print("%s\t\t%s" % (eng_token, eng_token_stem_radix))

print("*** ENG - Lancaster Stemmer algorithm ***")
from nltk.stem import LancasterStemmer

print("Original phrase")
print(eng_textToAnalize_03)
def extract_features(self, Req_list, score_target, export=True, corpal=True):
    nlp = spacy.load('de')
    stemmer = SnowballStemmer("german")
    stop = stopwords.words('german')
    features = pd.DataFrame()

    # Create the first column of the dataframe from the requirement list;
    # one requirement per line.
    features['req'] = Req_list

    # Get text, tag_ and pos_ attributes for each word.
    features['req_nlp'] = features['req'].apply(lambda x: nlp(x))
    features['tags'] = features['req_nlp'].apply(
        lambda x: [(w.text, w.tag_, w.pos_) for w in x])

    # Analysis using NLTK: split sentences, then count them per requirement.
    features['sentences_by_nltk'] = features['req'].apply(
        lambda x: nltk.sent_tokenize(x, 'german'))
    features['sentence_nb_by_nltk'] = features['req'].apply(
        lambda x: len(nltk.sent_tokenize(x, 'german')))

    # Analysis with spaCy.
    features['sentences_by_nlp'] = features['req_nlp'].apply(
        lambda x: [sent.string.strip() for sent in x.sents])
    features['sentence_nb_by_nlp'] = features['req_nlp'].apply(
        lambda x: len([sent.string.strip() for sent in x.sents]))

    # Number of sentences per requirement.
    features['sentences'] = features.apply(lambda x: self.select_sentences(x), axis=1)
    features['sentences_nb'] = features.apply(lambda x: self.select_sentences(x, "y"), axis=1)
    features['sentences_tagged'] = features['sentences'].apply(
        lambda x: [self.tag_sentence(nlp, w) for w in x])

    # Calculating the readability index.
    # Words per requirement.
    features['words_nb'] = features['req'].apply(lambda x: len(x.split()))
    # Words per sentence.
    features['WPS'] = features['words_nb'] / features['sentences_nb']
    # Syllables per word.
    features['SPW'] = features['req'].apply(lambda x: self.compute_SPW(x))
    # Flesch index.
    features['Flesch_Index'] = features.apply(
        lambda x: round((180 - x['WPS'] - (58.5 * x['SPW']))), axis=1)

    # Analyzing punctuation.
    features['internal_punctuation'] = features['tags'].apply(lambda x: self.count_punctuation(x))
    features['comma'] = features['tags'].apply(lambda x: self.count_comma(x))
    features['weird_words'] = features['tags'].apply(lambda x: self.count_weird_words(x))

    # Analyzing and counting specific words and lists of words.
    features['beispiel'] = features['tags'].apply(lambda x: self.search_specific_words(x, 'beispiel'))
    features['circa'] = features['tags'].apply(lambda x: self.search_specific_words(x, 'circa'))
    features['wenn'] = features['tags'].apply(lambda x: self.search_specific_words(x, 'wenn'))
    features['aber'] = features['tags'].apply(lambda x: self.search_specific_words(x, 'aber'))
    features['max_min_presence'] = features['req'].apply(lambda x: self.check_max_min_presence(x))
    features['Nb_of_Umsetzbarkeit_conj'] = features['tags'].apply(lambda x: self.time_logical_conj(x))
    features['measurement_values'] = features['tags'].apply(lambda x: self.search_measurements_indicators(x))
    features['numerical_values'] = features['tags'].apply(lambda x: self.search_numerical_value(x))
    features['polarity'] = features['req'].map(lambda text: TextBlobDE(text).sentiment.polarity)

    # Analyzing passive/active voice and auxiliary attributes at the
    # beginning of a requirement.
    features['passive_global'] = features['tags'].apply(lambda x: self.passive_detection(x))
    features['passive_per_sentence'] = features['sentences_tagged'].apply(
        lambda x: [self.passive_detection(s) for s in x])
    features['passive_percent'] = features['passive_per_sentence'].apply(
        lambda x: (sum([y == "yes" for y in x]) / len(x)))
    features['Aux_Start'] = features['tags'].apply(lambda x: self.aux_1st(x))
    features['Aux_Start_per_sentence'] = features['sentences_tagged'].apply(
        lambda x: [self.aux_1st(s) for s in x])

    # Analyzing conjunctions, verbs, and auxiliaries.
    features['Sub_Conj'] = features['tags'].apply(lambda x: self.count_subordinate_conjunction(x))
    features['Comp_conj'] = features['tags'].apply(lambda x: self.count_comp_coor_conjunction(x))
    features['Nb_of_verbs'] = features['tags'].apply(lambda x: self.count_verb(x))
    features['Nb_of_auxiliary'] = features['tags'].apply(lambda x: self.count_aux(x))
    features['werden'] = features['req'].apply(lambda x: self.count_werden(x))

    # Same functions as in the previous block, but applied to each sentence
    # of a requirement.
    features['Sub_Conj_pro_sentece'] = features['sentences_tagged'].apply(
        lambda x: [self.count_subordinate_conjunction(s) for s in x])
    features['Comp_conj_pro_sentence'] = features['sentences_tagged'].apply(
        lambda x: [self.count_comp_coor_conjunction(s) for s in x])
    features['Nb_of_verbs_pro_sentence'] = features['sentences_tagged'].apply(
        lambda x: [self.count_verb(s) for s in x])
    features['Nb_of_auxiliary_pro_sentence'] = features['sentences_tagged'].apply(
        lambda x: [self.count_aux(s) for s in x])
    features['werden_pro_sentence'] = features['sentences'].apply(
        lambda x: [self.count_werden(s) for s in x])
    features['formal_global'] = features['req'].apply(
        lambda x: self.contain_Muss_Darf_nicht(stemmer, x))
    features['formal_per_sentence'] = features['sentences'].apply(
        lambda x: [self.contain_Muss_Darf_nicht(stemmer, s) for s in x])
    features['formal_percent'] = features['formal_per_sentence'].apply(
        lambda x: (sum([y == "yes" for y in x]) / len(x)))
    features['entities'] = features['req_nlp'].apply(lambda x: self.entities_label(x))

    # Graphical representation of the vocabulary of the requirements corpus.
    if corpal:
        self.Corpus_Analysis(Req_list, stop)

    if export:
        my_path = Path(u"/Users/selina/Code/Python/Thesis/src/Features/" + 'export_features')
        # my_path = Path(u"/Users/selina/Documents/UNI/Thesis/Code/Features/" + 'export_features')
        g_Dirpath = os.path.abspath(my_path)
        # os.path.join instead of a hard-coded '\\', which would break on the
        # macOS path above
        dataFile = os.path.join(g_Dirpath, 'Features_Export.xlsx')
        print("Create Excel export file: %s" % (dataFile))
        features[0:5000].to_excel(dataFile, index=False)
        print("\nFeatures_Export XLS-file created and data copied.")

    return features, features.sentences_tagged
def __init__(self):
    self.df = pd.DataFrame()
    self.stemmer = SnowballStemmer("german")
def load_references(input_file,
                    sep_doc_id=':',
                    sep_ref_keyphrases=',',
                    normalize_reference=False,
                    language="en",
                    encoding='utf-8'):
    """Load a reference file. Reference files can be either in json format
    or in the SemEval-2010 official format.

    Args:
        input_file (str): path to the reference file.
        sep_doc_id (str): the separator used for doc_id in the reference
            file, defaults to ':'.
        sep_ref_keyphrases (str): the separator used for keyphrases in the
            reference file, defaults to ','.
        normalize_reference (bool): whether to normalize the reference
            keyphrases using stemming, defaults to False.
        language (str): language of the input documents (used for computing
            the stems), defaults to 'en' (English).
        encoding (str): file encoding, defaults to utf-8.
    """
    logging.info('loading reference keyphrases from {}'.format(input_file))

    references = defaultdict(list)

    # open the input file
    with codecs.open(input_file, 'r', encoding) as f:

        # load json data
        if input_file.endswith('.json'):
            references = json.load(f)
            for doc_id in references:
                references[doc_id] = [
                    keyphrase for variants in references[doc_id]
                    for keyphrase in variants
                ]
        # or load a SemEval-2010 file
        else:
            for line in f:
                cols = line.strip().split(sep_doc_id)
                doc_id = cols[0].strip()
                keyphrases = cols[1].strip().split(sep_ref_keyphrases)
                for v in keyphrases:
                    if '+' in v:
                        for s in v.split('+'):
                            references[doc_id].append(s)
                    else:
                        references[doc_id].append(v)

    # normalize the reference keyphrases if needed
    if normalize_reference:

        # initialize the stemmer ("porter" selects NLTK's Porter variant)
        stemmer = SnowballStemmer("porter")
        if language != 'en':
            stemmer = SnowballStemmer(ISO_to_language[language],
                                      ignore_stopwords=True)

        for doc_id in references:
            for i, keyphrase in enumerate(references[doc_id]):
                stems = [stemmer.stem(w) for w in keyphrase.split()]
                references[doc_id][i] = ' '.join(stems)

    return references
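# A minimal usage sketch (hypothetical: 'references.txt' is an illustrative
# path, and ISO_to_language is assumed to map ISO codes to NLTK language
# names, as the function above implies):
refs = load_references('references.txt', normalize_reference=True)
for doc_id, keyphrases in list(refs.items())[:3]:
    print(doc_id, keyphrases)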
def main(data_file, seed):
    # set the seed
    np.random.seed(seed)

    # load the data into a pd.DataFrame
    data = [json.loads(line) for line in data_file]
    df = pd.DataFrame.from_dict(data)

    # make a directory for images
    if not os.path.exists(IMAGES_DIRECTORY):
        os.mkdir(IMAGES_DIRECTORY)

    # make a directory for representative words
    if not os.path.exists(REP_DIRECTORY):
        os.mkdir(REP_DIRECTORY)

    print_header('3.2.1 Popular Products and Frequent Reviewers', 50)

    ## 3.2.1 get the top 10 products
    top_10_products = df['asin'].value_counts().head(10).reset_index().rename(
        columns={
            'index': 'productID',
            'asin': 'reviewCount'
        })
    print_header('Top 10 products', char='-')
    print(top_10_products)
    #    productID  reviewCount
    # 0  B005SUHPO6          836
    # 1  B0042FV2SI          690
    # 2  B008OHNZI0          657
    # 3  B009RXU59C          634
    # 4  B000S5Q9CA          627
    # 5  B008DJIIG8          510
    # 6  B0090YGJ4I          448
    # 7  B009A5204K          434
    # 8  B00BT7RAPG          431
    # 9  B0015RB39O          424

    ## 3.2.1 get the top 10 reviewers
    top_10_reviewers = df['reviewerID'].value_counts().head(
        10).reset_index().rename(columns={
            'index': 'reviewerID',
            'reviewerID': 'reviewCount'
        })
    print_header('Top 10 reviewers', char='-')
    print(top_10_reviewers)
    #        reviewerID  reviewCount
    # 0  A2NYK9KWFMJV4Y          152
    # 1  A22CW0ZHY3NJH8          138
    # 2  A1EVV74UQYVKRY          137
    # 3  A1ODOGXEYECQQ8          133
    # 4  A2NOW4U7W3F7RI          132
    # 5  A36K2N527TXXJN          124
    # 6  A1UQBFCERIP7VJ          112
    # 7   A1E1LEVQ9VQNK          109
    # 8  A18U49406IPPIJ          109
    # 9   AYB4ELCS5AM8P          107

    ## 3.2.2 Sentence segmentation
    print_header('3.2.2 Sentence Segmentation', 50)
    df['sentences'] = df['reviewText'].apply(segment_sent)
    df['sentenceCount'] = df['sentences'].apply(len)

    # plot the distribution of sentence counts
    plot_bar(df['sentenceCount'],
             title='Distribution of Number of Sentences for Each Review',
             x_label="Sentence Count", y_label="Review Count",
             countplot=False)
    plot_bar(df['sentenceCount'].clip(0, 50),
             title='Distribution of Number of Sentences for Each Review (Clipped)',
             x_label="Sentence Count (Clipped)", y_label="Review Count",
             countplot=True)

    # take 5 random reviews, run sentence segmentation, and display results
    reviews = df['reviewText']
    _seed = 43  # to give us an interesting result
    random_reviews = reviews.sample(5, random_state=_seed)
    random_reviews = pd.DataFrame(
        random_reviews,
        columns=['reviewText']).reset_index().drop(columns=['index'])
    random_reviews['segmentedSentences'] = random_reviews['reviewText'].apply(segment_sent)
    print("5 randomly selected reviews before and after sentence segmentation:")
    print(random_reviews)

    ## 3.2.3 Tokenization and Stemming
    print_header('3.2.3 Tokenization and Stemming', 50)
    df['tokenizedSentences'] = df['sentences'].apply(
        lambda sentences: [tokenize(sentence) for sentence in sentences])
    df['tokens'] = df['tokenizedSentences'].apply(flatten)

    ### No stemming
    print_header('No Stemming', char='-')
    df['words'] = df['tokens'].apply(
        lambda tokens: [token.lower() for token in tokens])
    df['words'] = df['words'].apply(
        lambda tokens: [token for token in tokens if is_word(token)])
    df['uniqueWords'] = df['words'].apply(set)
    df['wordCount'] = df['uniqueWords'].apply(len)
    # token = {normal_word, emoji, stopword, punctuation}
    # word  = {normal_word, emoji}
    plot_bar(df['wordCount'],
             title='Distribution of Number of Words for Each Review Without Stemming',
             x_label="Word Count", y_label="Review Count", countplot=False)
    plot_bar(df['wordCount'].clip(0, 300),
             title='Distribution of Number of Words for Each Review Without Stemming (Clipped)',
             x_label="Word Count (Clipped)", y_label="Review Count",
             countplot=False)
    words = flatten(df['words'])
    words_unique = flatten(df['uniqueWords'])
    top_20_words = pd.DataFrame.from_dict(Counter(words), orient='index').\
        reset_index().rename(columns={'index': 'Word', 0: 'Count'}).\
        sort_values(['Count'], ascending=False).head(20).\
        reset_index().drop(columns=['index'])
    print_header('Top 20 Words Without Stemming', char='-')
    print(top_20_words)

    ### With stemming
    print_header('With Stemming', char='-')
    stemmer = SnowballStemmer("english")
    df['stemmedWords'] = df['words'].apply(
        lambda tokens: [stemmer.stem(token) for token in tokens])
    df['uniqueStemmedWords'] = df['stemmedWords'].apply(set)
    df['stemmedWordCount'] = df['uniqueStemmedWords'].apply(len)
    plot_bar(df['stemmedWordCount'],
             title='Distribution of Number of Words for Each Review With Stemming',
             x_label="Stemmed Word Count", y_label="Review Count",
             countplot=False)
    plot_bar(df['stemmedWordCount'].clip(0, 300),
             title='Distribution of Number of Words for Each Review With Stemming (Clipped)',
             x_label="Word Count (Clipped)", y_label="Review Count",
             countplot=False)
    plot_bar_overlap(df, ['wordCount', 'stemmedWordCount'],
                     title='Distribution of Number of Words for Each Review',
                     x_label="Word Count", y_label="Review Count",
                     countplot=False)
    plot_bar_overlap(df[['wordCount', 'stemmedWordCount']].clip(0, 300),
                     ['wordCount', 'stemmedWordCount'],
                     title='Distribution of Number of Words for Each Review (Clipped)',
                     x_label="Word Count", y_label="Review Count",
                     countplot=False)
    stemmed_words = flatten(df['stemmedWords'])
    stemmed_words_unique = flatten(df['uniqueStemmedWords'])
    top_20_stemmed_words = pd.DataFrame.from_dict(Counter(stemmed_words), orient='index').\
        reset_index().rename(columns={'index': 'Word', 0: 'Count'}).\
        sort_values(['Count'], ascending=False).head(20).\
        reset_index().drop(columns=['index'])
    print_header('Top 20 Words with Stemming', char='-')
    print(top_20_stemmed_words)

    ## 3.2.4 POS Tagging
    print_header('3.2.4 POS Tagging', 50)
    tokenized_sentences = pd.Series(flatten(df['tokenizedSentences']))
    print('Total Number of Sentences: ' + str(len(tokenized_sentences)))
    random_5_sentences = tokenized_sentences.sample(5, random_state=seed)
    random_5_df = pd.DataFrame(
        random_5_sentences,
        columns=['sentence']).reset_index().drop(columns=['index'])
    random_5_df['posTagged'] = random_5_df['sentence'].apply(pos_tag)
    print('=' * 30)
    print(random_5_df)
    print('=' * 30)

    # 3.3 Development of a Noun Phrase Summarizer
    print_header('3.3 Development of a Noun Phrase Summarizer', 50)
    df['posTagged'] = df['tokenizedSentences'].apply(
        lambda tokenizedSentences: [pos_tag(sentence) for sentence in tokenizedSentences])
    df['nounPhrases'] = df['posTagged'].apply(
        lambda posTagged: [np.lower() for np in flatten([extract_NP(tag) for tag in posTagged])])
    df[['reviewText', 'posTagged', 'nounPhrases']].head()

    # Including single noun phrases
    print_header('Including single noun phrases', char='-')
    noun_phrases = pd.DataFrame.from_dict(Counter(flatten(df['nounPhrases'])), orient='index').\
        reset_index().rename(columns={'index': 'Noun Phrase', 0: 'Count'}).\
        sort_values(['Count'], ascending=False)
    top_20_noun_phrases = noun_phrases.head(20).reset_index().drop(columns=['index'])
    print_header('Top 20 Noun Phrases Including Single Noun Phrases', char='-')
    print(top_20_noun_phrases)

    df['nounPhrasesExcludeSingle'] = df['nounPhrases'].apply(
        lambda noun_phrases: [
            noun_phrase for noun_phrase in noun_phrases
            if len(noun_phrase.split()) > 1
        ])
    noun_phrases = pd.DataFrame.from_dict(Counter(flatten(df['nounPhrasesExcludeSingle'])), orient='index').\
        reset_index().rename(columns={'index': 'Noun Phrase', 0: 'Count'}).\
        sort_values(['Count'], ascending=False)
    top_20_noun_phrases = noun_phrases.head(20).reset_index().drop(columns=['index'])
    print_header('Top 20 Noun Phrases Excluding Single Noun Phrases', char='-')
    print(top_20_noun_phrases)

    products = df['asin'].value_counts().head(3).index
    products_np_top1 = df[df['asin'] == products[0]]
    products_np_top2 = df[df['asin'] == products[1]]
    products_np_top3 = df[df['asin'] == products[2]]
    print_representative_np(products_np_top1, product=products[0], n=30)
    print_representative_np(products_np_top2, product=products[1], n=30)
    print_representative_np(products_np_top3, product=products[2], n=30)

    random_5_reviews = df[['reviewText', 'posTagged', 'nounPhrases']].sample(5, random_state=seed)
    random_5_reviews['nounPhrasesLen'] = random_5_reviews['nounPhrases'].apply(len)
    print_header('Noun Phrase Detector Evaluation for Random 5 Reviews', char='-')
    print(random_5_reviews)

    # 3.4 Sentiment Word Detection
    print(str(datetime.datetime.now()).split('.')[0] +
          ': Start processing sentence segmentation')
    # Without stemming and without negation
    sentiment_score(df, "./rep_words/ns_nn.csv")
    # With stemming and without negation
    sentiment_score(df, "./rep_words/s_nn.csv", stemmer=stemmer)
    # Without stemming and with negation
    sentiment_score(df, "./rep_words/ns_n.csv", convert_neg=True)
    # With stemming and with negation
    sentiment_score(df, "./rep_words/s_n.csv", stemmer=stemmer,
                    convert_neg=True)
def hello_world():
    if request.method == "GET":
        return redirect("/app/index.html")
    else:
        pprint.pprint(request.form)
        pprint.pprint(request.files)

        # Language check
        if request.form['language'] not in ['english', 'dutch']:
            return jsonify(status='error', message="Invalid language!")

        # Input normalization
        if request.form['upload_option'] == 'text_field':
            input_text = request.form['upload_textarea']
        elif request.form['upload_option'] == 'url':
            page_text = requests.get(request.form['upload_url']).text
            soup = BeautifulSoup(page_text, "html.parser")
            input_text = soup.text
        elif request.form['upload_option'] == 'file':
            input_text = UnicodeDammit(
                request.files.get('upload_file').read()).unicode_markup

        # Stemmer selection
        if request.form['stemmer'] == 'no_stemmer':
            stemmer = None
        elif request.form['stemmer'] == 'porter':
            if request.form['language'] != 'english':
                return jsonify(status='error',
                               message="Invalid language for stemmer porter!")
            stemmer = PorterStemmer()
        elif request.form['stemmer'] == 'snowball':
            stemmer = SnowballStemmer(request.form['language'])
        else:
            return jsonify(status='error', message="Invalid stemmer!")

        # Lemmatizer selection
        if request.form['lemmatizer'] == 'lemmatizer_off':
            lemmatizer = None
        elif request.form['language'] == 'english':
            lemmatizer = lemmatizer_en
        else:
            lemmatizer = lemmatizer_nl

        # Stopwords selection
        if request.form['stopwords'] == 'no_stopwords':
            stopwords = None
        elif request.form['stopwords'] == 'our_stopwords':
            stopwords = obo.stopwords
        elif request.form['stopwords'] == 'custom_stopwords':
            custom_stopword_text = UnicodeDammit(
                request.files.get('custom_stopword_file').read()).unicode_markup
            stopwords = obo.stripNonAlphaNum(custom_stopword_text)

        # Process the text
        input_text_word_count = 0
        resulting_text = ""
        final_wordlist = []
        for word_type, word in text_processor.parse_text(input_text):
            if word_type == "non-word":
                resulting_text += word
            else:
                input_text_word_count += 1
                processed_word = word
                if stemmer:
                    processed_word = stemmer.stem(processed_word)
                if lemmatizer:
                    processed_word = lemmatizer(processed_word)
                if not stopwords or processed_word not in stopwords:
                    if request.form['exclude_vowels'] == 'exclude_vowels_yes':
                        if request.form['language'] == 'english':
                            regex = re_vowel_en
                        else:
                            regex = re_vowel_nl
                        processed_word = regex.sub("", processed_word)
                    resulting_text += processed_word
                    final_wordlist.append(processed_word)

        dictionary = obo.wordListToFreqDict(final_wordlist)
        sorteddict = obo.sortFreqDict(dictionary)

        ignore_results_amount = int(request.form['ignore_results_amount'])
        if ignore_results_amount > 0:
            initial_index = ignore_results_amount
            ignored_words = [word for rank, word in sorteddict[:initial_index]]
            sorteddict = sorteddict[initial_index:]
            new_text = ""
            new_wordlist = []
            for word_type, word in text_processor.parse_text(resulting_text):
                if word_type == "non-word":
                    new_text += word
                elif word not in ignored_words:
                    new_text += word
                    new_wordlist.append(word)
            resulting_text = new_text
            final_wordlist = new_wordlist
        else:
            initial_index = 0

        # Do the math!
        input_text_char_count = len(input_text)
        word_count = len(final_wordlist)
        distinct_words_count = len(sorteddict)

        words = []
        frequencies = []
        word_cloud = []
        for frequency, word in sorteddict:
            words.append(word)
            frequencies.append(frequency)
            word_cloud.append([word, frequency])

        acum_perc = Decimal(0)
        percentages = []
        acum_perc_list = []
        for freq in frequencies:
            perc = Decimal((freq * 100.0) / word_count)
            percentages.append(round(perc, 2))
            acum_perc += perc
            acum_perc_list.append(round(acum_perc, 2))

        logarithms = []
        for i in range(len(sorteddict)):
            logarithms.append((math.log(i + 1), math.log(frequencies[i])))

        # Calculate the linear regression of log(rank) on log(frequency);
        # under Zipf's law the slope m should be close to -1.
        # http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.lstsq.html#numpy.linalg.lstsq
        x = numpy.array([math.log(f) for f in frequencies])
        y = numpy.array(
            [math.log(rank) for rank in range(1, distinct_words_count + 1)])
        A = numpy.vstack([x, numpy.ones(len(x))]).T
        m, c = numpy.linalg.lstsq(A, y)[0]

        # Calculate the regression line's start and end, and sort so that the
        # start is the point with the lower X value (Highcharts requires this)
        regline_start = (0, c)
        regline_end = (math.log(distinct_words_count),
                       math.log(distinct_words_count) * m + c)
        regression_line = {'start': regline_start, 'end': regline_end}

        return jsonify(status='success',
                       words=words,
                       frequencies=frequencies,
                       percentages=percentages,
                       acum_perc_list=acum_perc_list,
                       logarithms=logarithms,
                       regression_line=regression_line,
                       resulting_text=resulting_text,
                       input_text_char_count=input_text_char_count,
                       input_text_word_count=input_text_word_count,
                       output_text_word_count=word_count,
                       word_cloud=word_cloud,
                       sorteddict=sorteddict)
best_models = []
dummy_models = []
model = None  # placeholder

# Process each language independently
for lang in reviews_by_language.keys():
    print("PROCESSING ", lang)
    stem_lang = get_stemmer_lang(lang)
    if stem_lang is None:
        # Use the default analyzer if there is no matching stemmer for this
        # language
        analyzer_for_lang = 'word'
    else:
        # The language has a stemmer; redefine the stemmer with the
        # specified language
        analyzer_for_lang = stemmed_words
        stemmer = SnowballStemmer(stem_lang)
    stem_vectorizer = CountVectorizer(analyzer=analyzer_for_lang,
                                      ngram_range=(2, 2))
    try:
        tokens = stem_vectorizer.fit_transform(reviews_by_language[lang]["x"])
    except Exception:
        # On a tokenizer error, skip the language
        continue
    X = np.array(tokens.toarray())
    y = np.array(reviews_by_language[lang]["y"])
    # use this line instead of the one above for early-access models
    # y = np.array(reviews_by_language[lang]["z"])

    # Skip languages with fewer than 5 reviews (k-fold is not possible);
    # this may trigger depending on the sampling size used
    if len(X) < 5:
# Note: the original named the first parameter `str`, shadowing the builtin;
# it is renamed to `text` here.
def tokeAndClean(text, bgrams=False, tgrams=False,
                 stopwords=stopwords.words('english'), ngramMinFreq=2,
                 stemming=True, stemmer=SnowballStemmer('english')):
    tokenizer = RegexpTokenizer(r"[\w']+")
    tokens = tokenizer.tokenize(text)
    # lower-case everything and remove words shorter than 3 letters
    tokens = [token.lower() for token in tokens if len(token) > 2]
    if stemming:
        try:
            tokens = [stemmer.stem(token) for token in tokens if len(token) > 2]
        # Snowball sometimes raises a strange error when tokens are too
        # short, so retry with a stricter length cutoff
        except:
            tokens = [stemmer.stem(token) for token in tokens if len(token) > 3]

    def cleanNGram(ngrams):
        out = [' '.join(token) for token in ngrams]
        # include only those n-grams which occur at least ngramMinFreq times
        out = [ngram for ngram in out if out.count(ngram) >= ngramMinFreq]
        return out

    # add cleaned bigrams and trigrams if necessary
    if bgrams:
        tokens.extend(cleanNGram(bigrams(tokens)))
    if tgrams:
        tokens.extend(cleanNGram(trigrams(tokens)))
    return tokens
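# A quick, hypothetical call (assumes nltk's bigrams/trigrams are imported
# elsewhere in the original module); with the default ngramMinFreq=2, only
# repeated bigrams survive alongside the stemmed tokens:
doc = "the quick brown fox jumps over the lazy dog the quick brown fox"
print(tokeAndClean(doc, bgrams=True))
# e.g. [..., 'the quick', 'quick brown', 'brown fox'] at the end of the list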
def get_answer(question, story):
    """
    :param question: dict
    :param story: dict
    :return: str

    question is a dictionary with keys:
        dep -- A list of dependency graphs for the question sentence.
        par -- A list of constituency parses for the question sentence.
        text -- The raw text of story.
        sid -- The story id.
        difficulty -- easy, medium, or hard
        type -- whether you need to use the 'sch' or 'story' versions
                of the question.
        qid -- The id of the question.

    story is a dictionary with keys:
        story_dep -- list of dependency graphs for each sentence of
                     the story version.
        sch_dep -- list of dependency graphs for each sentence of
                   the sch version.
        sch_par -- list of constituency parses for each sentence of
                   the sch version.
        story_par -- list of constituency parses for each sentence of
                     the story version.
        sch -- the raw text for the sch version.
        text -- the raw text for the story version.
        sid -- the story id
    """
    ### Your Code Goes Here ###

    # Our tools
    stemmer = SnowballStemmer("english")
    chunker = nltk.RegexpParser(GRAMMAR)
    lmtzr = WordNetLemmatizer()
    driver = QABase()

    # question["qid"] returns the form: "fables-04-7"
    q = driver.get_question(question["qid"])
    current_story = driver.get_story(q["sid"])

    #############################################
    # if question["qid"] == 'blogs-03-1':
    #     print(question["text"])
    #     print(sent_tokenized_text[0])
    #############################################

    stopwords = set(nltk.corpus.stopwords.words("english"))

    if question["difficulty"] == 'Easy':
        if question["type"] != 'Story':
            sentences = get_sentences(current_story["sch"])
            text = story["sch"]
            text = nltk.sent_tokenize(text)
        else:
            sentences = get_sentences(current_story["text"])
            text = story["text"]
            text = nltk.sent_tokenize(text)

        Q = nltk.word_tokenize(question["text"].lower())
        # print(Q)

        all_stemmed_sentences = []
        for sent in sentences:
            temp_sent = []
            for w, pos in sent:
                temp_sent.append((stemmer.stem(w), pos))
            all_stemmed_sentences.append(temp_sent)

        stop_words = set(nltk.corpus.stopwords.words("english"))
        qbow = get_bow(get_sentences(question["text"])[0], stopwords)
        stemmed_qbow = []
        for w in qbow:
            stemmed_qbow.append(stemmer.stem(w))
        stemmed_qbow = set(stemmed_qbow)
        best_idx = best_overlap_index(stemmed_qbow, all_stemmed_sentences,
                                      stop_words, question)
        # print(question["qid"], best_idx)

        # tokenize the question, removing punctuation, to extract keywords
        tokenizer = RegexpTokenizer(r'\w+')
        tokenized_question_text = tokenizer.tokenize(question["text"])
        tagged_tokenized_question_text = nltk.pos_tag(tokenized_question_text)

        # remove stopwords
        tagged_keywords_list = []
        for word, tag in tagged_tokenized_question_text:
            if word not in stopwords:
                tagged_keywords_list.append((word, tag))

        # lemmatize (stem) the keywords
        lemmatized_keywords_list = []
        for keyword, tag in tagged_keywords_list:
            lemmatized_keywords_list.append(stemmer.stem(keyword))

        #####################################################
        # if question["qid"] == 'fables-04-6':
        #     print("text:", text)
        #     print("best index:", best_idx)
        #     print("qid:", question["qid"])
        #     print(text[best_idx])
        #####################################################

        best_sent = get_sentences(text[best_idx])

        # Find the sentences that have all of our keywords in them.
        # Last time, the 2nd arg was sentences = get_sentences(text),
        # which returns a tuple for each word.
        target_sentences = find_sentences(lemmatized_keywords_list, best_sent)

        # Extract the candidate locations from these sentences
        candidates_forest = find_candidates(target_sentences, chunker,
                                            question["text"])
        if len(candidates_forest) == 0:
            answer = doBaseline(question, story)
        else:
            possible_answers_list = []
            # candidates_forest is a list of trees
            for candidate in candidates_forest:
                # candidate.draw()
                possible_answers_list.append(
                    " ".join([token[0] for token in candidate.leaves()]))
            answer = " ".join(possible_answers_list)
            ###########################################
            # currently, possible_answer contains the actual needed answer,
            # plus some garbage words around it from chunking;
            # we might be able to filter this out SOMEHOW
            # possible_answer is a list of strings
            ###########################################

    elif question["difficulty"] == 'Medium':
        if question["type"] != 'Story':
            sentences = get_sentences(current_story["sch"])
        else:
            sentences = get_sentences(current_story["text"])

        Q = nltk.word_tokenize(question["text"].lower())
        # print(Q)

        all_stemmed_sentences = []
        for sent in sentences:
            temp_sent = []
            for w, pos in sent:
                temp_sent.append((stemmer.stem(w), pos))
            all_stemmed_sentences.append(temp_sent)

        stop_words = set(nltk.corpus.stopwords.words("english"))
        qbow = get_bow(get_sentences(question["text"])[0], stopwords)
        stemmed_qbow = []
        for w in qbow:
            stemmed_qbow.append(stemmer.stem(w))
        stemmed_qbow = set(stemmed_qbow)
        best_idx = best_overlap_index(stemmed_qbow, all_stemmed_sentences,
                                      stop_words, question)
        # print(question["qid"], best_idx)

        if question["type"] != 'Story':
            tree = current_story["sch_par"][best_idx]
        else:
            tree = current_story["story_par"][best_idx]

        #############################################
        # if question["qid"] == 'blogs-03-13':
        #     print(Q)
        #     print(tree)
        #############################################

        # Create our pattern
        #########################################
        # MAKE PATTERN FIT FOR TYPE OF QUESTION #
        #########################################
        if Q[0] == 'where' or Q[0] == 'when':
            pattern = nltk.ParentedTree.fromstring("(VP (*) (PP))")
        elif Q[0] == 'who':
            pattern = nltk.ParentedTree.fromstring("(NP)")
        elif Q[0] == 'what':
            pattern = nltk.ParentedTree.fromstring("(NP)")
        elif Q[0] == 'why':
            pattern = nltk.ParentedTree.fromstring("(SBAR)")
        elif Q[0] == 'how':
            pattern = nltk.ParentedTree.fromstring("(RB)")
        # we don't know how to deal with 'did' questions yet
        elif Q[0] == 'did':
            pattern = nltk.ParentedTree.fromstring("(S)")

        subtree1 = pattern_matcher(pattern, tree)

        if subtree1 is None:
            answer = doBaseline(question, story)
        else:
            # create a new pattern to match a smaller subset of subtrees
            if Q[0] == 'where' or Q[0] == 'when':
                pattern = nltk.ParentedTree.fromstring("(VP)")
            elif Q[0] == 'who':
                pattern = nltk.ParentedTree.fromstring("(NP)")
            elif Q[0] == 'what':
                pattern = nltk.ParentedTree.fromstring("(NP)")
            elif Q[0] == 'why':
                pattern = nltk.ParentedTree.fromstring("(SBAR (IN) (S))")
            elif Q[0] == 'how':
                pattern = nltk.ParentedTree.fromstring("(RB)")
            elif Q[0] == 'did':
                pattern = nltk.ParentedTree.fromstring("(S)")

            # Find and make the answer
            subtree2 = pattern_matcher(pattern, subtree1)
            if subtree2 is None:
                answer = doBaseline(question, story)
            else:
                answer = " ".join(subtree2.leaves())

        # cheat for dealing with 'did' questions
        if Q[0] == 'did':
            answer = "yes"

    elif question["difficulty"] == 'Hard':
        answer = "h"
    elif question["difficulty"] == 'Discourse':
        answer = "h"
    else:
        answer = doBaseline(question, story)

    ### End of Your Code ###
    return answer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib  # note: on modern scikit-learn, use `import joblib` instead
from rest import models
from django.conf import settings

######---------- VERSION 1.0 ----------######

medicines = pd.read_csv("data/baseDatos-completa.csv", header=0,
                        delimiter=",", encoding="utf-8")
# Obtain the medicine names from the file and put them tidily into a list
medicines_list = list(set([w.lower() for w in medicines["nombre-marca"]]))
# Then turn the list into a regex
medicines = re.compile(r"\b" + r"\b|\b".join(map(re.escape, medicines_list)) + r"\b")

stops = set(stopwords.words("spanish"))  # quicker to search in a set, so the stopwords go into one
stemmer = SnowballStemmer("spanish")  # initialize the stemmer

# Load the already-trained logistic regression and initialize the vectorizer
forest = joblib.load('classifier/logistic_regression')
vectorizer = joblib.load('classifier/vectorizer')

# TOO MANY ISSUES WITH STREAMING, still here for archiving purposes
# class MyStreamListener(tp.StreamListener):  # streamer for tweets
#     def on_status(self, status):  # when it gets a tweet, we just classify it
#         classified = classify(status)
#
#     def on_error(self, status_code):  # in case of error, print the code on screen
#         print(status_code)
#         return True


class Tweet:
    # Tweet class for quicker and easier manipulation
    def __init__(self, url, text, medicines):
        # Initially tweets only have their url, their cleaned text, and the
        # medicines found
meaningful_words = [w for w in words if w not in stops]
clean = []
for word in meaningful_words:
    clean.append(SnowballStemmer("english").stem(word))
patrick_repub.append(clean)
jasper_repub.append(1)

# note: the original wrapped print() in loop.run_until_complete(wait(...)),
# but print() returns None rather than an awaitable, so a plain print is
# used here
print(patrick_repub, 'hi')
loop = asyncio.get_event_loop()

patrick_demo = []
jasper_demo = []
for tweet in collection.distinct('Text', {'Classification': 0}):
    meaningful_words = []
    nonum = re.sub(r"[\d*]", "number ", tweet)
    letters_only = re.sub("[^a-zA-Z]", " ", nonum)
    nourlwords = re.sub(r'^https?:\/\/.*[\r\n]*', 'http ', letters_only,
                        flags=re.MULTILINE)
    words = nourlwords.lower().split()
    stops = set(stopwords.words("english"))
    meaningful_words = [w for w in words if w not in stops]
    clean = []
    for word in meaningful_words:
        clean.append(SnowballStemmer("english").stem(word))
    patrick_demo.append(clean)
    jasper_demo.append(0)
# import fastcluster
import scipy.cluster.hierarchy as hcluster
import matplotlib.pylab as plt
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# raw string so the backslashes in the path are not treated as escapes
df = pd.read_csv(r'D:\mCaas\Top 3 ques\Top3Data.csv')
quesdf = df.dropna(subset=['Query_Str'])
quesdf["DateTime"] = pd.to_datetime(quesdf["DateTime"])

# load nltk's English stopwords as a variable called 'stopwords'
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords)

# stems words (e.g. 'running' -> 'run'), ignoring stopwords like 'having'
stemmer = SnowballStemmer("english", ignore_stopwords=True)
print(stemmer.stem('running'))


# here I define a tokenizer and stemmer which returns the set of stems in
# the text that it is passed
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word, to ensure that punctuation
    # is caught as its own token
    tokens = [
        word for sent in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sent)
    ]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g. numeric tokens,
    # raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
# NLTK
# Removing stop words
from sklearn.pipeline import Pipeline

text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])

# In[26]:

# Stemming code
import nltk
nltk.download()  # opens the interactive downloader; nltk.download('stopwords') fetches just what is needed

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)


class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        # wrap the standard analyzer so that every token it emits is stemmed
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: [stemmer.stem(w) for w in analyzer(doc)]


stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect),
                             ('tfidf', TfidfTransformer()),
                             ('mnb', MultinomialNB(fit_prior=False))])

text_mnb_stemmed = text_mnb_stemmed.fit(twenty_train.data, twenty_train.target)
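# A natural next step (not shown above) is scoring the stemmed pipeline on
# the held-out split; `twenty_test` is an assumption mirroring
# `twenty_train`, e.g. fetch_20newsgroups(subset='test'):
import numpy as np

predicted_mnb_stemmed = text_mnb_stemmed.predict(twenty_test.data)
print(np.mean(predicted_mnb_stemmed == twenty_test.target))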
def __init__(self):
    self.stemmer = SnowballStemmer("english", ignore_stopwords=True)
import pandas as pd
import numpy as np
from collections import Counter
from clasificator import KNN_classifier
from sklearn.neighbors import KNeighborsClassifier
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import f1_score
import re

lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer('italian')
stop_words = set(stopwords.words('italian'))


def tokenize(text):
    '''
    Generic wrapper around different tokenization methods.
    '''
    text = str(text)
    text = text.lower()
    text = text.strip()  # strip surrounding whitespace
    text = text.replace('{html}', "")
    text = re.sub(r'@[A-Z0-9a-z_:!@#$%^&()=+,.></?;|]+', '', text)  # mentions
    text = re.sub(r'#[A-Z0-9a-z_:!@#$%^&()=+,.></?;|]+', '', text)  # hashtags
    text = re.sub(r'http\S+', '', text)  # URLs
    text = re.sub(r'\d+', '', text)  # digits
Stemmer to Snowball Stemmer
'''

# Import the toolkit and the full Porter Stemmer library
import nltk
from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer

print("---------- Create string vector and apply porterstemmer method:", "\n")

# PorterStemmer method
p_stemmer = PorterStemmer()
words = ['run', 'runner', 'running', 'ran', 'runs', 'easily', 'fairly',
         'consolingly']
for word in words:
    print(word + ' --> ' + p_stemmer.stem(word))

print("---------- Apply snowballstemmer method:", "\n")

# The Snowball Stemmer requires that you pass a language parameter
s_stemmer = SnowballStemmer(language='english')
for word in words:
    print(word + ' --> ' + s_stemmer.stem(word))

print("---------- Create sentence and apply porterstemmer method:", "\n")

phrase = 'I am meeting him tomorrow at the meeting'
for word in phrase.split():
    print(word + ' --> ' + p_stemmer.stem(word))
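# For reference, the two stemmers mostly agree on this word list but diverge
# on the adverbs: Porter leaves 'fairly' as 'fairli' while Snowball reduces
# it to 'fair' (Snowball is a refinement of Porter, sometimes called
# Porter2), which is the point of the comparison above.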
    This function is used within preprocess(), which is used as a
    pre-processing chain: in this case we simply add a lowercasing feature
    for all the tokens that are not emoticons (e.g. :D doesn't become :d).
    """

cle = sys.argv[1]
if cle == '-h':
    print('pass as an argument the key used to find the items in the database')
database = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)
listOfTweets = getTweetsByHash(cle, database)
punctuation = list(string.punctuation)
stop = stopwords.words('french') + punctuation + [
    'via', 'le', 'les', 'a', 'rt'
]  # list of tokens to discard
stemmer1 = SnowballStemmer('french')
stemmer2 = FrenchStemmer()
count_stop = Counter()   # initialize a counter
count_stem1 = Counter()  # initialize a counter
count_stem2 = Counter()  # initialize a counter
for tweet in listOfTweets:
    try:
        tweetText = getTweetText(tweet)
        print(tweetText)
        tokens = preprocess(tweetText)  # tokenize the text
        print('tokens')
        print(tokens)
        terms_stem = [stemmer1.stem(term) for term in tokens]
        print('stems without stopwords')
        print(terms_stem)
def wordStemmingSnowball(word):
    stemmer = SnowballStemmer("english")
    stem = str(stemmer.stem(word))
    return stem
ABREVIATIONS_DICT = {
    "'m": ' am',
    "'ve": ' have',
    "'ll": " will",
    "'d": " would",
    "'s": " is",
    "'re": " are",
    " ": " ",  # likely a non-breaking space in the original source
    "' s": " is",
    # debatable between and/or
    "/": " and "
}

STOPWORDS_SET = set(stopwords.words('english'))
SNOWBALL = SnowballStemmer('english')
WORDNET = WordNetLemmatizer()


def find_stop_words(corpus):
    '''
    takes in a normalized corpus and returns the frequency of every word as
    a pandas Series (the most frequent are candidates for stop words)
    '''
    unpacked_list = [word for document in corpus for word in document.split()]
    return pd.Series(unpacked_list).value_counts()


# I question the need for this, but let's just do it for now
def _multiple_replace(text, adict=ABREVIATIONS_DICT):
    import re
def parseOutText(f):
    """ given an opened email file f, parse out all text below the
        metadata block at the top
        (in Part 2, you will also add stemming capabilities)
        and return a string that contains all the words in the email
        (space-separated)

        example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)

        f - one email passed in
        """
    print('\nBegin parse_out_email_text.py parseOutText function\n')

    myReturnString = ''

    f.seek(0)  # go back to the beginning of the file (annoying)
    all_text = f.read()  # the whole email as one str

    ### split off the metadata; emails contain a line of the form
    ### "X-FileName: Stokley, Chris (Non-Privileged).pst"
    content = all_text.split("X-FileName:")
    # content[0] holds the metadata, content[1] the body with its original
    # punctuation; both are str

    words = ""
    if len(content) > 1:
        ### remove punctuation
        # string.punctuation is !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ and
        # str.maketrans("", "", string.punctuation) builds a dict mapping
        # each punctuation character's Unicode ordinal to None
        # (the old Python 2 form string.maketrans no longer works)
        text_string = content[1].translate(
            str.maketrans("", "", string.punctuation))

        ### project part 2: comment out the line below
        words = text_string

        ### split the text string into individual words, stem each word,
        ### and append the stemmed word to words (make sure there's a single
        ### space between each stemmed word)
        mySplitOutput = text_string.split()
        # e.g. ['Hi', 'Everyone', 'If', 'you', 'can', 'read', 'this',
        #       'message', 'youre', 'properly', 'using', 'parseOutText', ...]

        # vectorizing with CountVectorizer() is done AFTER stemming
        myStemmer = SnowballStemmer('english')
        for myWord in mySplitOutput:
            myStemmedWord = myStemmer.stem(myWord)
            myReturnString = myReturnString + myStemmedWord + ' '

    print('\nEnd parse_out_email_text.py parseOutText function\n')
    return myReturnString
def prepareParams(self):
    self.stopwords = set(stopwords.words('english'))
    self.dataFile = STYLE_WITH_DESC_N_TITLE
    self.indexFile = INVERTED_IDX_FILE
    self.stemmer = SnowballStemmer('english')  # PorterStemmer() is an alternative
    self.lemmatizer = WordNetLemmatizer()
def __init__(self, language):
    self.s = sume.ConceptBasedILPSummarizer(" ", language)
    self.LANGUAGE = language
    self.stoplist = set(stopwords.words(self.LANGUAGE))
    self.stemmer = SnowballStemmer(self.LANGUAGE)
def __init__(self, articles):
    self.searcher = articlesearch.ArticleSearch(articles)
    self.keys = keywords.KeyWords()
    self.stemmer = SnowballStemmer("english")
def features(self, tokens, index, history):
    # for more details see: http://nlpforhackers.io/named-entity-extraction/
    """
    `tokens`  = a POS-tagged sentence [(w1, t1), ...]
    `index`   = the index of the token we want to extract features for
    `history` = the previous predicted IOB tags
    """
    # init the stemmer
    stemmer = SnowballStemmer('english')

    # Pad the sequence with placeholders
    tokens = [('[START2]', '[START2]'), ('[START1]', '[START1]')] + \
        list(tokens) + [('[END1]', '[END1]'), ('[END2]', '[END2]')]
    history = ['[START2]', '[START1]'] + list(history)

    # shift the index by 2 to accommodate the padding
    index += 2

    word, pos = tokens[index]
    prevword, prevpos = tokens[index - 1]
    prevprevword, prevprevpos = tokens[index - 2]
    nextword, nextpos = tokens[index + 1]
    nextnextword, nextnextpos = tokens[index + 2]
    previob = history[index - 1]

    contains_dash = '-' in word
    contains_dot = '.' in word
    # the original wrote all([True for c in word if c in ...]), which is
    # always True because the comprehension filters; this tests it properly
    allascii = all(c in string.ascii_lowercase for c in word)

    # note: capitalize() lowercases everything after the first letter, so
    # this actually tests Title Case rather than ALL CAPS
    allcaps = word == word.capitalize()
    capitalized = word[0] in string.ascii_uppercase

    prevallcaps = prevword == prevword.capitalize()
    prevcapitalized = prevword[0] in string.ascii_uppercase

    # the original checked prevword here (a copy-paste slip); these features
    # should look at nextword
    nextallcaps = nextword == nextword.capitalize()
    nextcapitalized = nextword[0] in string.ascii_uppercase

    f = {
        'word': word,
        'lemma': stemmer.stem(word),
        'pos': pos,
        'all-ascii': allascii,

        'next-word': nextword,
        'next-lemma': stemmer.stem(nextword),
        'next-pos': nextpos,

        'next-next-word': nextnextword,
        'nextnextpos': nextnextpos,

        'prev-word': prevword,
        'prev-lemma': stemmer.stem(prevword),
        'prev-pos': prevpos,

        'prev-prev-word': prevprevword,
        'prev-prev-pos': prevprevpos,

        'prev-iob': previob,

        'contains-dash': contains_dash,
        'contains-dot': contains_dot,

        'all-caps': allcaps,
        'capitalized': capitalized,

        'prev-all-caps': prevallcaps,
        'prev-capitalized': prevcapitalized,

        'next-all-caps': nextallcaps,
        'next-capitalized': nextcapitalized,
    }

    return f
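# A minimal, hypothetical smoke test: in the original this is a method, so
# `self` is passed explicitly here; the sentence and history are illustrative.
import nltk

sentence = nltk.pos_tag(nltk.word_tokenize("Mark lives in London"))
feats = features(None, sentence, index=3, history=['B-PER', 'O', 'O'])
print(feats['word'], feats['lemma'], feats['capitalized'])
# London london True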