def stem(self, query_df: pd.DataFrame, col: str):
    porter = PorterStemmer()
    query_df[col] = query_df[col].apply(
        lambda query: [porter.stem(word) for word in query])
    return query_df
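A minimal usage sketch for the method above, assuming it lives on a preprocessing class and that the target column already holds token lists; the DataFrame, column name, and tokens below are invented for illustration, and the same transformation is shown inline for brevity:

import pandas as pd
from nltk.stem import PorterStemmer

# Hypothetical frame whose "query" column holds pre-tokenized queries.
df = pd.DataFrame({"query": [["running", "shoes"], ["faster", "runners"]]})

porter = PorterStemmer()
df["query"] = df["query"].apply(lambda q: [porter.stem(w) for w in q])
print(df["query"].tolist())  # roughly [['run', 'shoe'], ['faster', 'runner']]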
def stemmer(text):
    """Apply stemming to the input text."""
    ps = PorterStemmer()
    return ps.stem(text)
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()

example_words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]
##for w in example_words:
##    print(ps.stem(w))

new_text = "It is very important to be pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."

words = word_tokenize(new_text)
for w in words:
    print(ps.stem(w))
def calc_word_freq(
    df: dd.DataFrame,
    top_words: int = 30,
    stopword: bool = True,
    lemmatize: bool = False,
    stem: bool = False,
) -> Dict[str, Any]:
    """
    Parse a categorical column of text data into words, then compute the
    frequency distribution of words and the total number of words.

    Parameters
    ----------
    df
        Groupby-count on the categorical column as a dataframe
    top_words
        Number of highest-frequency words to show in the wordcloud and
        word frequency bar chart
    stopword
        If True, remove stop words, else keep them
    lemmatize
        If True, lemmatize the words before computing the word frequencies, else don't
    stem
        If True, extract the stem of the words before computing the word frequencies, else don't
    """
    col = df.columns[0]
    if stopword:
        # use a regex to replace stop words and non-alphanumeric characters with empty string
        df[col] = df[col].str.replace(fr"\b(?:{'|'.join(ess)})\b|[^\w+ ]", "")
    else:
        df[col] = df[col].str.replace(r"[^\w+ ]", "")

    # convert to lowercase and split
    df[col] = df[col].str.lower().str.split()
    # "explode()" to "stack" all the words in a list into a new column
    df = df.explode(col)

    # lemmatize and stem
    if lemmatize or stem:
        df[col] = df[col].dropna()
    if lemmatize:
        lem = WordNetLemmatizer()
        df[col] = df[col].apply(lem.lemmatize, meta="object")
    if stem:
        porter = PorterStemmer()
        df[col] = df[col].apply(porter.stem, meta="object")

    # counts of words, excludes null values
    word_cnts = df.groupby(col)[df.columns[1]].sum()
    # total number of words
    nwords = word_cnts.sum()
    # total unique words
    nuniq_words = word_cnts.shape[0]
    # words with the highest frequency
    fnl_word_cnts = word_cnts.nlargest(n=top_words)

    return {"word_cnts": fnl_word_cnts, "nwords": nwords, "nuniq_words": nuniq_words}
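A hedged usage sketch for calc_word_freq: it assumes the surrounding module already provides the `ess` stop-word list and the NLTK imports, and that the input is a Dask groupby-count frame with the text column first and its counts second. The column names and sample data below are invented, and the exact behaviour of the stop-word regex depends on your pandas/dask versions:

import dask.dataframe as dd
import pandas as pd

# Hypothetical raw data with one free-text column.
raw = pd.DataFrame({"review": ["The cats were running", "A cat runs fast", "Dogs run too"]})
ddf = dd.from_pandas(raw, npartitions=1)

# calc_word_freq expects a groupby-count frame: the text column first, its counts second.
grouped = ddf.groupby("review").size().reset_index().rename(columns={0: "count"})

stats = calc_word_freq(grouped, top_words=10, stem=True)
print(stats["word_cnts"].compute())  # dask series of the top stemmed words
print(stats["nwords"].compute())     # total word count (dask scalar)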
def preprocess_story(story,
                     stem=True,
                     remove_stop_words=True,
                     remove_punctuation=True,
                     metaparagraph_size=5):
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))

    # Split into a list of paragraphs
    paragraphs = story.split("<newline>")
    simplified_paragraphs = []
    untokenized_paragraphs = []
    par_index = 0

    # Loop through paragraphs
    while par_index < len(paragraphs):
        meta_paragraph = []
        # Combine small paragraphs into meta_paragraphs with at least some minimum number of sentences
        while par_index < len(paragraphs) and len(meta_paragraph) < metaparagraph_size:
            paragraph = paragraphs[par_index]
            # Split paragraph into a list of sentences
            sentences = nltk.sent_tokenize(paragraph)
            meta_paragraph += sentences
            par_index += 1

        meta_paragraph_unprocessed = meta_paragraph
        if remove_stop_words:
            meta_paragraph = [
                sentence.replace("<num>", " ") for sentence in meta_paragraph
            ]

        # For the tokenized version, split each sentence into a list of words
        paragraph_tokenized = [
            nltk.word_tokenize(sentence) for sentence in meta_paragraph
        ]

        # Extra preprocessing
        if remove_stop_words:
            paragraph_tokenized = [[
                word for word in sentence if word not in stop_words
            ] for sentence in paragraph_tokenized]
        if remove_punctuation:
            paragraph_tokenized = [[
                regex.sub(r'[\p{P}\p{Sm}`]+', '', word) for word in sentence
            ] for sentence in paragraph_tokenized]
            paragraph_tokenized = [[word for word in sentence if word != ""]
                                   for sentence in paragraph_tokenized]
        if stem:
            paragraph_tokenized = [[stemmer.stem(word) for word in sentence]
                                   for sentence in paragraph_tokenized]

        if len(meta_paragraph) < metaparagraph_size and len(untokenized_paragraphs) > 0:
            untokenized_paragraphs[-1] += meta_paragraph_unprocessed
            simplified_paragraphs[-1] += paragraph_tokenized
        else:
            if len(meta_paragraph) != 0:
                untokenized_paragraphs.append(meta_paragraph_unprocessed)
                simplified_paragraphs.append(paragraph_tokenized)

    return untokenized_paragraphs, simplified_paragraphs
def stemmingWords(sent):
    PS = PorterStemmer()
    stemmed_sent = []
    for w in sent:
        stemmed_sent.append(PS.stem(w))
    return stemmed_sent
e.g. to read files, preprocess text, etc.
"""
import sys
import platform
from os import system
from os import listdir
from os.path import isfile, join
from string import punctuation

from nltk import pos_tag, sent_tokenize
from nltk.corpus import wordnet, stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer()  # Initialize lemmatizer once.
stemmer = PorterStemmer()  # Initialize Porter's stemmer once.
stop_words = set(stopwords.words('english')).union([  # Augment the stopwords set.
    'don', 'didn', 'doesn', 'aren', 'ain', 'hadn', 'hasn', 'mightn', 'mustn',
    'couldn', 'shouldn', 'dont', 'didnt', 'doesnt', 'arent', 'aint', 'hadnt',
    'hasnt', 'may', 'mightve', 'couldnt', 'shouldnt', 'shouldnot', 'shouldntve',
    'mustnt', 'would', 'woulda', 'wouldany', 'wouldnot', 'woudnt', 'wouldve',
    'must', 'could', 'can', 'have', 'has', 'do', 'does', 'did', 'are', 'is',
    'ive', 'cant', 'thats', 'isnt', 'youre', 'wont', 'from', 'subject', 'hes',
    'etc', 'edu', 'com', 'org', 've', 'll', 'd', 're', 't', 's'])


def get_wordnet_tag(tag):
    """
def process_email(filename):
    import re
    import numpy as np
    import pandas as pd
    from nltk.stem import PorterStemmer

    stemmer = PorterStemmer()

    # ==========================================================================
    # Process email function
    # ==========================================================================
    email_pattern = r'[A-Z0-9._%+-]+@[A-Z0-9._%+-]+\.[A-Z]{2,4}'
    email_regex = re.compile(email_pattern, flags=re.IGNORECASE)

    # url_pattern = r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*)*\/?'
    url_pattern = r'(http|https)://[^\s]*'
    url_regex = re.compile(url_pattern, flags=re.IGNORECASE)

    number_pattern = r'[0-9]+'
    number_regex = re.compile(number_pattern)

    dollar_pattern = r'[$]+'
    dollar_regex = re.compile(dollar_pattern)

    http_pattern = r'<[^<>]+>'
    http_regex = re.compile(http_pattern)

    nonword_pattern = r'[^a-zA-Z0-9]'
    nonword_regex = re.compile(nonword_pattern)

    email_list = []

    # Get the body of the email
    line = get_email_text(filename)

    # Process the body of the email.
    line = line.lower()
    line = http_regex.sub(' ', line)
    line = email_regex.sub('emailaddr', line)
    line = url_regex.sub('httpaddr', line)
    line = number_regex.sub('number', line)
    line = dollar_regex.sub('dollar', line)
    line = nonword_regex.sub(' ', line)

    listline = line.split()
    newline = []
    for word in listline:
        word = word.strip()
        word = stemmer.stem(word)
        newline.append(word)
    # print(line)
    email_list.extend(newline)
    # print(email_list)

    vocab_filename = '../vocab.txt'
    b = pd.read_table(vocab_filename, header=None)
    vocab = pd.DataFrame(b)
    vocab = pd.Series(vocab[1])
    invocab = np.array(vocab[vocab.isin(email_list)].index)
    # print(invocab)

    x = np.zeros(len(vocab))
    x[invocab] = 1

    # Sanity checks
    # print(invocab.shape)
    # print(x.shape)
    # print(x[x==1].shape)
    return x
def load_and_cache_examples(args, task, tokenizer, evaluate=False, mode=None):
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache.
        torch.distributed.barrier()

    processor = processors[task]()
    output_mode = output_modes[task]

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        'cached_{}_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length), str(task)))
    if os.path.exists(cached_features_file) and not args.overwrite_cache and False:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
            # HACK(label indices are swapped in RoBERTa pretrained model)
            label_list[1], label_list[2] = label_list[2], label_list[1]
        if mode == 'train':
            examples = processor.get_train_examples(
                os.path.join(args.data_dir, args.train_file))
        elif mode == 'eval':
            examples = processor.get_dev_examples(
                os.path.join(args.data_dir, args.dev_file))
        elif mode == 'predict':
            examples = processor.get_test_examples(
                os.path.join(args.data_dir, args.test_file))
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
            pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
        )

        if args.use_matchings:
            assert len(features) == len(examples)
            if args.do_eval or args.do_train:
                with open(os.path.join(args.data_dir, 'train_instances_train'), 'rb') as f:
                    train_examples = pickle.load(f)
            if args.do_predict:
                with open(os.path.join(args.data_dir, 'train_instances'), 'rb') as f:
                    train_examples = pickle.load(f)
            ps = PorterStemmer()
            for i in range(len(features)):
                features[i].matchings = get_matchings(examples[i].text_a,
                                                      train_examples, ps)

        if args.local_rank in [-1, 0] and False:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache.
        torch.distributed.barrier()

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)

    if args.use_matchings:
        all_matchings = torch.tensor([f.matchings for f in features], dtype=torch.float)
        dataset = TensorDataset(all_input_ids, all_attention_mask,
                                all_token_type_ids, all_labels, all_matchings)
    else:
        dataset = TensorDataset(all_input_ids, all_attention_mask,
                                all_token_type_ids, all_labels)
    return dataset
def stemming(self, bow):
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in bow]
def extract_cooccurence():
    global cfd
    if len(sys.argv) > 1:
        # Define the data path
        data_path = sys.argv[1]
    start_time = time.time()
    list_of_file = sorted(glob.glob(data_path))

    cfd = nltk.ConditionalFreqDist()
    list_freq = nltk.FreqDist()
    stop = set(stopwords.words('english'))
    if not STOP_FLAG:
        stop = []
    ps = PorterStemmer()

    for index, fname in enumerate(list_of_file):
        print("No.{} File: {}".format(index, fname))
        with open(fname, encoding='latin') as file:
            raw = file.read()
            # Extract all the <TEXT> fields
            result = re.findall(r'<TEXT>(.*?)</TEXT>', raw, re.DOTALL)
            texts = ''.join(result)
            # Tokenize
            tokens = word_tokenize(texts)
            # Keep alphabetical tokens in lower case and filter by stopwords
            tokens_norm = [t.lower() for t in tokens
                           if t.isalpha() and (t.lower() not in stop)]
            # Count the frequency of each word
            list_freq = nltk.FreqDist(tokens_norm)
            # Token neighbour window, pre-filled with empty placeholders
            wnd = [''] * WND_SIZE
            for t in tokens_norm:
                wnd.append(t)
                wnd = wnd[-WND_SIZE:]
                # Add to the conditional frequency table
                add_conditional_frequence_table(wnd)

    print("Time1: {}".format(time.time() - start_time))

    cfd_filter = nltk.ConditionalFreqDist()
    # Filter by MIN_COOCC and calculate the score
    # Calculate cfd.N()
    cfd_N = list_freq.N() * TERM_DISTANCE * 2
    for term_i in cfd:
        cfd_filter[term_i] = nltk.FreqDist(
            {term_j: score_term_in_term(term_j, term_i, cfd_N)
             for term_j in cfd[term_i] if cfd[term_i][term_j] > MIN_COOCC})
        # Drop the term's co-occurrence with itself (pop is a method call)
        cfd[term_i].pop(term_i, None)

    print("Time2: {}".format(time.time() - start_time))

    cfd_topn = nltk.ConditionalFreqDist()
    # Get the TOP N
    for w in cfd_filter:
        cfd_topn[w] = nltk.FreqDist(dict(cfd_filter[w].most_common(DOUBLE_TOP_N)))

    print("Time3: {}".format(time.time() - start_time))
    print("Time4: {}".format(time.time() - start_time))

    file_tag = {
        'dist': '_dist' + str(TERM_DISTANCE),
        'min': '_min' + str(MIN_COOCC),
        'top': '_top' + str(TOP_N),
        'stop': '_stp' if STOP_FLAG else '',
        'pmi': '_pmi' if PMI_FLAG else ''
    }
    ujson.dump(
        cfd_topn,
        open("/Users/jeanneluo/Downloads/ap_cfd{dist}{min}{top}{stop}{pmi}.json".format(**file_tag), "w"),
        double_precision=3)

    print("Time5: {}".format(time.time() - start_time))
    pdb.set_trace()
    return cfd_topn
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

lem = WordNetLemmatizer()

from nltk.stem.porter import PorterStemmer

stem = PorterStemmer()

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

file = open("articles/computer.txt", "rt")
contents = file.read()
file.close()

tokenized_text = sent_tokenize(contents)
# print("TOKENIZED TEXT ------------------------------------------------------------")
# print(tokenized_text)

tokenized_word = word_tokenize(contents)
print("TOKENIZED WORD ------------------------------------------------------------")
print(tokenized_word)
def stemming(file):
    print("Performing Stemming ...")
    # Using PorterStemmer from nltk for stemming
    ps = PorterStemmer()

    # Path for output file
    output_file = 'Data/stemmed_Lemetized_comments.csv'

    # Reading data from input file
    data = pd.read_csv(file)
    date = list(data.Date)
    label = list(data.Tag)
    thread = list(data.Comment_thread_id)
    comment_list = list(data.comments)
    comment_pos = list(data.comment_position)

    tokenized_comments_list = []
    pos_tagged_comments = []
    stemmed_tokens = []
    stemmed_comments = []
    tokenized_comments = []

    # Iterating through each comment one by one and stemming
    for cmt in comment_list:
        comment = ''
        if str(cmt) == 'nan':
            tokenized_comments = []
        else:
            tokenized_comments = word_tokenize(cmt)
        for word in tokenized_comments:
            # Stemming the tokenized words and appending to a new string comment
            stemmed_word = ps.stem(word)
            comment = comment + " " + stemmed_word
        # Appending the stemmed comment to the processed comments list
        stemmed_comments.append(comment)
        tokenized_comments.clear()

    i = 0
    # Output file creation to store the stemmed comments
    out = open(output_file, 'w', newline='', encoding='utf8')
    fieldnames = [
        'Date', 'Comment_thread_id', 'comment_position', 'Tag', 'comments'
    ]
    writer = csv.DictWriter(out, fieldnames)
    writer.writerow({
        'Date': 'Date',
        'Comment_thread_id': 'Comment_thread_id',
        'comment_position': 'comment_position',
        'Tag': 'Tag',
        'comments': 'comments'
    })

    # Iterating through the stemmed comments and storing them in the output csv file
    for cmt in stemmed_comments:
        cmt = cmt.replace('\\n', '')
        writer.writerow({
            'Date': date[i],
            'Comment_thread_id': thread[i],
            'comment_position': comment_pos[i],
            'Tag': label[i],
            'comments': cmt
        })
        i = i + 1

    print("Stemming successfully done")
    # Return the file name of the output file
    return output_file
def stemming(text):
    ps = PorterStemmer()
    text = [ps.stem(word) for word in text]
    return text
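Because this helper iterates over `text`, it expects a list of tokens rather than a raw string (a raw string would be stemmed character by character). A small sketch of the intended call, assuming NLTK's tokenizer is available:

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

tokens = word_tokenize("The runners were running happily")
print(stemming(tokens))  # roughly ['the', 'runner', 'were', 'run', 'happili']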
    # Loop to gather content from the doc
    for i in f:
        # Convert to lower case
        s += ' '.join(i.split()).lower()
    return s


# Main Code
# Create object for PorterStemmer
PorterS = PorterStemmer()

# Stop words
stop_words = set(stopwords.words('english'))

# Call method to read the file; pass your own path as the parameter
# instead of '/content/drive/My Drive/Colab Notebooks/result1.txt'
document = readFile('.txt')

# Token converter
word_tokens = word_token(document)

# Stem content (using the PorterS object created above)
filter_sentence = [PorterS.stem(w) for w in word_tokens if w not in stop_words]

print("File content before stem:\n", word_tokens)
def __init__(self):
    self.stop_words = stopwords.words('english')
    self.stemmer = PorterStemmer()
class Bayes_stemmed_porter(Bayes_stemmed):
    STEMMER = PorterStemmer()
def stem_text(text):
    ps = PorterStemmer()
    stemmed_text = []
    for word in text.split(" "):
        stemmed_text.append(ps.stem(word))
    return " ".join(stemmed_text)
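A quick usage sketch; because the function splits on single spaces, punctuation stays attached to the neighbouring word, so the sample input here is kept punctuation-free:

from nltk.stem import PorterStemmer

print(stem_text("stemming reduces related words to a common base"))
# roughly: "stem reduc relat word to a common base"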
def __steeming(self):
    porter = PorterStemmer()
    return [porter.stem(word) for word in self.tokenized]
def preprocess(tweets, text_options):
    # Translate emojis for all tweets (this is the default in parsing)
    emojis = {}
    with open('data/emojilist5.csv', 'r') as f:
        for line in f:
            unic = line.split(',')[0].lower()
            trans = line.split(',')[1]
            emojis[unic] = trans
    tweets = tweets.apply(emojify, args=(emojis, ))

    # Metadata information clean for all tweets
    pat1 = r'https?://[A-Za-z0-9./]+'
    pat2 = r'www\\.[^ ]+'
    combined_pat = r'|'.join((pat1, pat2))
    url_pat = re.compile(combined_pat)
    pat3 = r'\\u[^ ]+'
    unicode_pat = re.compile(pat3)
    pat4 = r'@[A-Za-z0-9_]+'
    mention_pat = re.compile(pat4)
    tweets = tweets.apply(metadata_clean, args=(url_pat, unicode_pat, mention_pat))

    # Expand negations
    if text_options['negation_expand'] is True:
        negations_dic = {
            "isn't": "is not", "aren't": "are not", "wasn't": "was not",
            "weren't": "were not", "haven't": "have not", "hasn't": "has not",
            "hadn't": "had not", "won't": "will not", "wouldn't": "would not",
            "don't": "do not", "doesn't": "does not", "didn't": "did not",
            "can't": "can not", "couldn't": "could not", "shouldn't": "should not",
            "mightn't": "might not", "mustn't": "must not", "shan't": "shall not",
            "ain't": "am not"
        }
        neg_expand_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')
        tweets = tweets.apply(negation_expand, args=(neg_expand_pattern, negations_dic))

    # Remove punctuation
    if text_options['punctuation_remove'] is True:
        tweets = tweets.apply(punctuation_remove)

    # Remove metadata: hashtags, urls, mentions, unicode
    if text_options['metadata_remove'] is True:
        tweets = tweets.apply(metadata_remove)

    # Remove emojis from tweets
    if text_options['emoji_remove'] is True:
        tweets = tweets.apply(emoji_remove)

    # Remove digits
    if text_options['digits_remove'] is True:
        tweets = tweets.apply(digits_remove)

    # Mark negations
    if text_options['negation_mark'] is True:
        neg_words = [
            'not', 'never', 'no', 'nothing', 'noone', 'nowhere', 'none',
            'isnt', 'arent', 'wasnt', 'werent', 'havent', 'hasnt', 'hadnt',
            'wont', 'wouldnt', 'dont', 'doesnt', 'didnt', 'cant', 'couldnt',
            'shouldnt', 'mightnt', 'mustnt', 'shant', 'aint'
        ]
        neg_mark_pattern = re.compile(r'\b(' + '|'.join(neg_words) + r')\b')
        tweets = tweets.apply(negation_marking, args=(neg_mark_pattern, ))

    # Normalization
    if text_options['normalize'] is True:
        repeat_pattern = re.compile(r"(.)\1{2,}")
        tweets = tweets.apply(normalize_text, args=(repeat_pattern, ))

    # Remove stopwords: done before stemming and after negation marking
    # (the stemmer stems stopwords; if done before negation marking, some negation words would be removed)
    if text_options['stopwords_remove'] is True:
        tweets = tweets.apply(stopwords_remove)

    # Stemming. Note: will convert to lowercase and also stem stopwords (such as 'was' to 'wa')
    if text_options['stemming'] is True:
        ps = PorterStemmer()
        tweets = tweets.apply(stemming_apply, args=(ps, ))

    # Lowercasing of text
    if text_options['lower'] is True:
        tweets = tweets.str.lower()

    return tweets
def __init__(self):
    self.porterStemmer = PorterStemmer()
def fillblanks(self, sent):
    """Main method: from a received subtitle, generates a sentence with a
    missing verb for which the student has to find the right verb among
    other choices.
    """
    ext = ["'s", "'re", "is", "are", "'ve", "'m", "am"]

    # Look for the verbs in the text by POS-tagging with nltk
    p = []
    for s in sent:
        token = nltk.word_tokenize(s)
        pos = nltk.pos_tag(token)
        p.append(pos)

    # Build a list of POS-tagged verbs
    f = []
    for i in range(0, len(p)):
        for l in range(0, len(p[i])):
            if p[i][l][1] in list(M.modes.keys()):
                f.append(p[i][l])

    # Remove duplicates with a set()
    g = []
    for i in range(len(f)):
        if f[i][0] in ext:
            continue
        else:
            g.append(f[i])
    g = dict(set(g))

    # Create the sentences with the blanks
    ans = []
    fill = []
    for s in sent:
        for w in g.keys():
            if w in s:
                m = s.replace(w, ' ___________ ')
                fill.append(m)
                ans.append(w)

    prefinal = np.column_stack((fill, ans))
    prefinal = pd.DataFrame(prefinal, columns=['Phrase', 'Answer'])
    answers = pd.DataFrame(prefinal.iloc[:, 1])
    answers['2'] = None
    answers['3'] = None
    answers['4'] = None

    # Iterate over all the answers, look up the POS tag from the set g[]
    # and assign the synonyms to it
    stem = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    for i in range(len(answers)):
        synonyms = []
        antonyms = []
        word = answers.iloc[i, 0]
        word_tag = g[word]
        # Find the synonyms and antonyms of each word
        for syn in wordnet.synsets(word):
            for l in syn.lemmas():
                p1 = re.compile('[A-Za-z]+_[A-Za-z]+')
                p2 = re.compile('[A-Za-z]+_[A-Za-z]+_[A-Za-z]+')
                p3 = re.compile('[A-Za-z]+_[A-Za-z]+_[A-Za-z]+_[A-Za-z]+')
                p4 = re.compile('[A-Za-z]+-[A-Za-z]+-[A-Za-z]+')
                if any([
                        re.match(l.name(), word, re.IGNORECASE),
                        re.match(word, l.name(), re.IGNORECASE),
                        l.name() == stem.stem(word),
                        l.name() == lemmatizer.lemmatize(word, 'v'),
                        p1.match(l.name()),
                        p2.match(l.name()),
                        p3.match(l.name()),
                        p4.match(l.name())
                ]):
                    continue
                synonyms.append(l.name())
                if l.antonyms():
                    antonyms.append(l.antonyms()[0].name())

        # Check that the list of synonyms is not empty
        if len(synonyms) != 0:
            answers.iloc[i, 1] = self.fill(synonyms[0], word_tag)
            answers.iloc[i, 2] = self.fill(synonyms[1], word_tag)
        else:
            continue

    # Concatenate the answers into a data frame
    final = np.column_stack((prefinal, answers))
    final = pd.DataFrame(final,
                         columns=['Phrase', 'Answer', '1', '2', '3', '4'])
    return final
def stemming_tokenizer(text):
    stemmer = PorterStemmer()
    temp = text.replace("\\r", " ")
    temp = temp.replace("\\n", " ")
    return [stemmer.stem(w) for w in word_tokenize(temp)]
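A tokenizer like this is commonly plugged into a scikit-learn vectorizer so features are built on stems; a sketch of that pattern (scikit-learn is not imported in the snippet above, so this is an assumption about how it gets used):

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["Cats are running around", "A cat ran past the running dogs"]

# token_pattern=None silences the warning that the default pattern is unused
vec = TfidfVectorizer(tokenizer=stemming_tokenizer, token_pattern=None)
X = vec.fit_transform(docs)
print(vec.get_feature_names_out())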
we must protect and nurture and build on. If we are not free, no one will respect us.
My second vision for India’s development. For fifty years we have been a developing nation.
It is time we see ourselves as a developed nation. We are among the top 5 nations of the world
in terms of GDP. We have a 10 percent growth rate in most areas. Our poverty levels are falling.
Our achievements are being globally recognised today. Yet we lack the self-confidence to
see ourselves as a developed nation, self-reliant and self-assured. Isn’t this incorrect?
I have a third vision. India must stand up to the world. Because I believe that unless India
stands up to the world, no one will respect us. Only strength respects strength. We must be
strong not only as a military power but also as an economic power. Both must go hand-in-hand.
My good fortune was to have worked with three great minds. Dr. Vikram Sarabhai of the Dept. of
space, Professor Satish Dhawan, who succeeded him and Dr. Brahm Prakash, father of nuclear
material. I was lucky to have worked with all three of them closely and consider this the great
opportunity of my life. I see four milestones in my career '''

sp = PorterStemmer()
wordnet = WordNetLemmatizer()
sentences = nltk.sent_tokenize(para)

corpus = []
for i in range(len(sentences)):
    review = re.sub('[^a-zA-Z]', ' ', sentences[i])
    review = review.lower()
    review = review.split()
    review = [
        wordnet.lemmatize(word) for word in review
        if word not in set(stopwords.words('english'))
    ]
    review = ' '.join(review)
    corpus.append(review)
def clean_tweets(data, save_to_file=False, stopwords_stemming=False,
                 path="resources/Sentiment140_clean.csv"):
    """
    This function prepares and cleans all tweets within the given data frame:
    contractions handling, lowercasing, deleting special signs, @-usernames,
    URLs, word stemming, stopwords removal, ...

    :param data: pandas data frame containing raw Sentiment140 tweets
    :param save_to_file: option to save cleaned data
    :param stopwords_stemming: set True if stopwords removal and word stemming should be applied
    :param path: path to save cleaned data
    :return: cleaned/prepared pandas data frame containing tweets
    """
    tweets = []
    labels = []
    data['label'].replace([4], 1, inplace=True)
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()

    for i in range(len(data.text)):
        if (i + 1) % 100000 == 0:
            print(f"INFO: {i + 1} of {len(data.text)} tweets have been processed.")

        # apply different cleaning steps on each tweet
        tweet = data.text[i]
        label = data.label[i]
        tweet = tweet.lower()
        tweet = re.sub(r"@\S+", "", tweet)
        tweet = re.sub(r"http\S+", "", tweet)
        tweet = re.sub(r"www.\S+", "", tweet)
        tweet = " ".join([contractions_mapping.get(i, i) for i in tweet.split()])
        tweet = re.sub("[^a-zA-Z]", " ", tweet)

        # tokenize tweet
        word_tokens = word_tokenize(tweet)

        # if parameter is set: apply stopwords removal and stemming
        if stopwords_stemming is True:
            words_temp = [w for w in word_tokens if w not in stop_words]
            word_tokens = [ps.stem(w) for w in words_temp]

        # put words back together into a tweet (eliminates 'double spaces' which might have been created)
        tweet = []
        for w in word_tokens:
            tweet.append(w)
        tweet = " ".join(tweet)

        tweets.append(tweet)
        labels.append(label)

    print(f"INFO: {len(data.text)} of {len(data.text)} tweets have been processed.")

    # concat to data frame and drop empty entries
    data = pd.DataFrame({'label': labels, 'text': tweets})
    data_clean = drop_empty_entries(data)

    # save csv file if demanded
    if save_to_file is True:
        data_clean.to_csv(path, encoding="utf-8")
        print(f"INFO: cleaned tweets saved to path: {path}")

    return data_clean
def form_full_questions(self, candidate, jsondata, tagged):
    """
    flag : is a flag to distinguish the blank ques and actual ques
        0 = blank ques
        1 = actual ques
    ans : is a flag to keep track of the modified answer
        0 = no change in answer
        ans = <other than 0> is the modified ans
    """
    full_ques = candidate['Question']
    sentence = candidate['Sentence']
    answer = candidate['Answer']
    flag = 0
    ans = 0
    new_full_ques = []
    pattern_strings = self.pattern_verb_noun(candidate['Sentence'], jsondata)

    for word, pos in tagged:
        # check if the word is in the answer
        if (answer.find(word)) >= 0:
            # check if the blank is at the start of the sentence
            if (flag == 0) and ((sentence.find(word)) == 0):
                if ((('NN' == pos) or ('NNP' == pos) or ('NNPS' == pos))
                        and 'PERSON' in jsondata) and (word in jsondata['PERSON']):
                    full_ques = full_ques.replace("_____", 'Who')
                    full_ques = full_ques + "?"
                    flag = 1
                elif (('LOCATION' in jsondata and (word in jsondata['LOCATION']))
                        or ('GPE' in jsondata and (word in jsondata['GPE']))):
                    full_ques = full_ques.replace("_____", 'Where')
                    full_ques = full_ques + "?"
                    flag = 1
                elif ('NN' == pos) or ('NNP' == pos) or ('NNPS' == pos):
                    full_ques = full_ques.replace("_____", 'What')
                    full_ques = full_ques + "?"
                    flag = 1

            if flag == 0 and (len(pattern_strings) > 0):
                for pattern_string_no in range(len(pattern_strings)):
                    if ((('NN' == pos) or ('NNP' == pos) or ('NNPS' == pos))
                            and 'PERSON' in jsondata) and (word in jsondata['PERSON']):
                        individual_words = pattern_strings[pattern_string_no].split()
                        verb = [
                            word for word in individual_words
                            if word not in jsondata['PERSON']
                        ]
                        print("Verb : ", str(verb))
                        print("pattern_strings[pattern_string_no] : ",
                              str(pattern_strings[pattern_string_no]))
                        full_ques = sentence.replace(
                            str(pattern_strings[pattern_string_no]), '')
                        full_ques = "What " + str(verb[0]) + " " + str(full_ques).lower() + "?"
                        print("word : " + word + " pos : " + pos)
                        flag = 1

            if flag == 0:
                pattern_strings, verbs, nouns = self.pattern_verb_dt_adj_noun(
                    candidate['Sentence'], jsondata)
                print("pattern_strings : ")
                print(pattern_strings)
                if len(pattern_strings) > 0:
                    if (('LOCATION' in jsondata and (word in jsondata['LOCATION']))
                            or ('GPE' in jsondata and (word in jsondata['GPE']))):
                        for pattern_string_no in range(len(pattern_strings)):
                            if pattern_strings[pattern_string_no].find(answer) >= 0:
                                print("pattern_string_no : ", pattern_string_no)
                                individual_words = pattern_strings[pattern_string_no].split()
                                print("individual_words :")
                                print(individual_words)
                                verb = [
                                    word for word in individual_words
                                    if word in verbs
                                ]
                                print("Verb : ", str(verb))
                                print("pattern_strings[pattern_string_no] : ",
                                      pattern_strings[pattern_string_no])
                                full_ques = sentence.replace(
                                    pattern_strings[pattern_string_no], '')
                                full_ques = "Where " + str(verb[0]) + " " + str(full_ques).lower() + "?"
                                noun = [
                                    word for word in individual_words
                                    if word in nouns
                                ]
                                ps = PorterStemmer()
                                ans = ps.stem(word)
                                # print("word : " + word + " pos : " + pos)
                                flag = 1

    new_full_ques.append(full_ques)
    print(new_full_ques)
    return new_full_ques, ans, flag


# if __name__ == '__main__':
#     form_full_questions("Ahmedabad is walking run a very good city")
# To find the frequency of the top 10 words
fdist1 = fdist.most_common(10)
fdist1

#--------------------------------------------------
### Stemming
#--------------------------------------------------
# Stemming is the process of reducing words to their root form.

# Approach 1: Importing PorterStemmer from the nltk library
# Checking for the word 'waiting'
from nltk.stem import PorterStemmer
pst = PorterStemmer()
pst.stem("waiting")

# Categorizing the list of words
stm = ["waited", "waiting", "waits"]
for word in stm:
    print(word + ":" + pst.stem(word))

# Approach 2: Importing LancasterStemmer from nltk
from nltk.stem import LancasterStemmer
lst = LancasterStemmer()
stm = ["giving", "given", "given", "gave"]
for word in stm:
    print(word + ":" + lst.stem(word))
def stemming_tweets(self, tweet):
    ps = PorterStemmer()
    tweets_stemming = ps.stem(tweet)
    return tweets_stemming
see ourselves as a developed nation, self-reliant and self-assured. Isn’t this incorrect?
I have a third vision. India must stand up to the world. Because I believe that unless India
stands up to the world, no one will respect us. Only strength respects strength. We must be
strong not only as a military power but also as an economic power. Both must go hand-in-hand.
My good fortune was to have worked with three great minds. Dr. Vikram Sarabhai of the Dept. of
space, Professor Satish Dhawan, who succeeded him and Dr. Brahm Prakash, father of nuclear
material. I was lucky to have worked with all three of them closely and consider this the great
opportunity of my life. I see four milestones in my career"""

# Cleaning the text
import re  # for regular expressions
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

stemmer = PorterStemmer()
lemma = WordNetLemmatizer()
sentences = nltk.sent_tokenize(paragraph)

cleaned_sentences = []
for i in range(len(sentences)):
    # replaces all characters except a-z and A-Z letters with spaces
    review = re.sub('[^a-zA-Z]', " ", sentences[i])
    review = review.lower()  # lowering uppercase characters
    review = review.split()  # splitting on spaces creates a list of words
def processEmail(email_contents):
    vocabList = getVocabList()
    word_indices = []

    # ========================== Preprocess Email ===========================
    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with the
    # full headers
    # hdrstart = email_contents.find("\n\n")
    # if hdrstart:
    #     email_contents = email_contents[hdrstart:]

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML
    # Looks for any expression that starts with < and ends with > and does not
    # contain any < or > inside the tag, and replaces it with a space
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)

    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.sub('[0-9]+', 'number', email_contents)

    # Handle URLS
    # Look for strings starting with http:// or https://
    email_contents = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email_contents)

    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email_contents)

    # Handle $ sign
    email_contents = re.sub('[$]+', 'dollar', email_contents)

    # Tokenize and get rid of any punctuation
    # [str, email_contents] = ...
    #     strtok(email_contents, ...
    #            [' @$/#.-:&*+=[]?!(){},''">_<;%' char(10) char(13)]);
    email_contents = re.split(r'[@$/#.-:&\*\+=\[\]?!(){},\'\'\">_<;%\s]+',
                              email_contents)
    # print(email_contents)

    # Output the email to screen as well
    # print('\n==== Processed Email ====\n\n')

    # Process file
    l = 0
    for token in email_contents:
        # Remove any non-alphanumeric characters
        token = re.sub('[^a-zA-Z0-9]', '', token)

        # Stem the word
        token = PorterStemmer().stem(token.strip())

        # Skip the word if it is too short
        if len(token) < 1:
            continue

        idx = vocabList[token] if token in vocabList else 0

        # Only add entries which are in vocabList,
        # i.e. those with idx != 0, given that idx is assigned 0
        # if the token is not found in vocabList
        if idx > 0:
            word_indices.append(idx)

        # Print to screen, ensuring that the output lines are not too long
        if l + len(token) + 1 > 78:
            print("")
            l = 0
        print(token)
        l = l + len(token) + 1

    # Print footer
    # print('\n\n=========================\n')

    return word_indices