def stemming_phrases(self, phrase):
    # Stem each whitespace-separated word and rejoin into a phrase.
    porter_stemmer = PorterStemmer()
    tokens = phrase.split(" ")
    stemmed_words = []
    for word in tokens:
        stemmed_words.append(porter_stemmer.stem(word))
    return " ".join(stemmed_words)
def api():
    content = request.json
    text = content["message"]
    nltk.download('punkt')
    nltk.download('stopwords')
    related_news = relatednews(text)

    # Load the fitted vectorizer and classifier.
    with open('NBVocab.pkl', 'rb') as vocab_file:
        cv = joblib.load(vocab_file)
    with open('model.pkl', 'rb') as model_file:
        clf = joblib.load(model_file)

    ps = PorterStemmer()
    sw = set(stopwords.words('english'))
    sw.remove('not')
    sw.remove('no')
    sw.add('\n')

    text = text.lower()
    # [A-Za-z]+ keeps alphabetic tokens only ([A-z] would also match [, \, ], ^, _ and `).
    tokenizer = RegexpTokenizer('[A-Za-z]+')
    word_list = tokenizer.tokenize(text)
    clean_list = [w for w in word_list if w not in sw]
    stemmed_list = [ps.stem(w) for w in clean_list]
    clean_text = ' '.join(stemmed_list)

    X_vec = cv.transform([clean_text])
    pred = clf.predict(X_vec)[0]
    return jsonify({"prediction": pred, "related_news": related_news})
def removepunct_tokenize_stem(text):
    # Remove punctuation, tokenize, then stem each token.
    text = "".join([ch for ch in text if ch not in string.punctuation])
    tokens = word_tokenize(text)
    stemmer = PorterStemmer()
    final = [stemmer.stem(item) for item in tokens]
    return final
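# Usage sketch for removepunct_tokenize_stem (the sample sentence is
# illustrative, not from the original source). Assumes:
#   import string
#   from nltk.tokenize import word_tokenize
#   from nltk.stem import PorterStemmer
# and that nltk's 'punkt' tokenizer data has been downloaded.
print(removepunct_tokenize_stem("Running, jumping, and swimming!"))
# -> ['run', 'jump', 'and', 'swim']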
def doStemming(tokens):
    # Stem every token in the list with the Porter stemmer.
    ps = PorterStemmer()
    stemmed_tokens = []
    for w in tokens:
        stemmed_tokens.append(ps.stem(w))
    return stemmed_tokens
def generate_frequencies(labeled_data, filter_threshold=0.03):
    stemmer = PorterStemmer()
    stop_words = stopwords.words('english')
    # dict(category_name, {num_docs : int, counts : Counter(words)})
    categories = dict()
    # word_tokenize = lambda x: RegexpTokenizer(r'\w+').tokenize(x)
    for doc in labeled_data:
        # Some of the labels are inconsistent in case.
        category = doc["Category"].lower()
        # if category == 'uninformative':
        #     continue
        if category not in categories:
            categories[category] = {'num_docs': 1, 'counts': Counter()}
        else:
            categories[category]['num_docs'] += 1

        # Use word_tokenize to parse words; it leaves non-word tokens like
        # '?' and '`' in the input, so those are filtered out below.
        message = doc["message"].lower().strip()
        message = word_tokenize(message)

        # Keep each token and, when wordsegment can split it (e.g. hashtags),
        # also add its segments.
        segmented_message = []
        for wd in message:
            segmented_message.append(wd)
            segments = wordsegment.segment(wd)
            if len(segments) > 1:
                segmented_message.extend(segments)

        # Stem tokens that are not stopwords and contain at least one
        # alphanumeric character.
        processed_message = [stemmer.stem(wd) for wd in segmented_message
                             if wd not in stop_words
                             and any(ch.isalnum() for ch in wd)]
        for wd in processed_message:
            categories[category]['counts'][wd] += 1

    term_freqs = deepcopy(categories)
    doc_freqs = Counter()
    for cat in categories:
        category = categories[cat]
        for wd in category['counts']:
            # Calculate term frequency % (within a single category).
            # Note: could also use the number of times a word appears
            # across all categories.
            count = category['counts'][wd]
            freq = count / category['num_docs']
            if freq < filter_threshold:
                del term_freqs[cat]['counts'][wd]
            # Increase document frequency (here "doc" refers to category).
            # Each word appears only once per category's counts, so this is
            # the number of categories a word appears in.
            doc_freqs[wd] += 1
    return term_freqs, doc_freqs
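# Usage sketch for generate_frequencies on toy data (illustrative, not from
# the original source). Assumes the imports the function relies on:
#   from collections import Counter
#   from copy import deepcopy
#   from nltk import word_tokenize
#   from nltk.corpus import stopwords
#   from nltk.stem import PorterStemmer
#   import wordsegment
if __name__ == "__main__":
    wordsegment.load()  # wordsegment requires loading its data once
    toy_data = [
        {"Category": "Food", "message": "loving this pizza #bestpizza"},
        {"Category": "food", "message": "pizza again for dinner"},
    ]
    term_freqs, doc_freqs = generate_frequencies(toy_data, filter_threshold=0.5)
    # term_freqs keeps only stems whose per-document frequency is >= 0.5
    print(term_freqs["food"]["counts"])
    # doc_freqs counts how many categories each stem appears in
    print(doc_freqs.most_common(5))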
def tokenizer_porter(text):
    """
    Split the comment text into single words and return the stem of each word.
    :param text: raw comment text
    :return: list of stemmed tokens
    """
    porter = PorterStemmer()
    return [porter.stem(word) for word in text.split()]
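# Usage sketch: tokenizer_porter has the typical shape of a custom tokenizer
# for a scikit-learn vectorizer (illustrative, not from the original source).
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(tokenizer=tokenizer_porter, lowercase=True)
X = tfidf.fit_transform(["runners like running so they run"])
print(tfidf.vocabulary_)  # maps stems such as 'runner' and 'run' to column indices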
def doStemming(tokens):
    ps = PorterStemmer()
    stemmed_tokens = []
    for w in tokens:
        stemmed_tokens.append(ps.stem(w))
    print("After Stemming =", stemmed_tokens)
    return stemmed_tokens
def stemmer(tokens):
    '''
    Simple stemming loop for general use throughout the project.
    Stems all tokens in a list.
    '''
    ps = PorterStemmer()
    stemmed = list()
    for t in tokens:
        stemmed.append(ps.stem(t))
    return stemmed
def get_tokens(self, text, n=-1):
    article = TextSplitter().text_splitter(text)
    stemmer = PorterStemmer()
    stems = FreqDist()
    stop_words = set(stopwords.words(DOC_LANG))
    for word in article:
        lower_word = word.lower()
        # filter the stop words
        if lower_word not in stop_words:
            stems[stemmer.stem(lower_word)] += 1
    # top n frequency items (all items if n == -1)
    if n == -1:
        return stems.most_common()
    return stems.most_common(n)
def create_matrix(tweets: List, name: str = 'oscar pistorius') -> csr_matrix:
    matrix_loc = Path('data', name, 'tf_idf_matrix.pickle')
    if matrix_loc.exists():
        logger.info("Matrix exists! loading...")
        with matrix_loc.open('rb') as f:
            matrix = pickle.loads(f.read())
        return matrix

    stemmer = PorterStemmer()
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True)
    stop_words = set(stopwords.words('english'))

    texts = []
    for tweet in tqdm(tweets, desc="(create_matrix) iterating over tweets..."):
        text = tweet.text
        tokens = tokenizer.tokenize(text)
        text_proc = []
        for token in tokens:
            token = token.strip()
            if len(token) < 3:
                continue
            elif token in stop_words:
                continue
            elif nlp_utils.match_url(token):
                continue
            elif token in string.punctuation:
                continue
            # elif token.startswith(("#", "$")):
            #     continue
            token = token.translate({ord(k): "" for k in string.punctuation})
            token = stemmer.stem(token)
            token = token.strip()
            if token == "":
                continue
            text_proc.append(token)
        texts.append(text_proc)

    # The texts are already tokenized, so pass them through unchanged.
    vectorizer = TfidfVectorizer(analyzer="word", tokenizer=lambda x: x,
                                 lowercase=False)
    m = vectorizer.fit_transform(texts)

    logger.info("Saving computed matrix...")
    with matrix_loc.open('wb') as f:
        f.write(pickle.dumps(m))
    return m
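# Usage sketch for create_matrix (illustrative, not from the original source).
# The tweet objects only need a .text attribute here, so a namedtuple stands
# in; the output directory must exist because the function pickles the matrix
# into it.
from collections import namedtuple

Tweet = namedtuple("Tweet", ["text"])
tweets = [Tweet("Tokenizing tweets with @nltk is straightforward #nlp"),
          Tweet("Stemming reduces tokens to their root forms")]
Path('data', 'demo').mkdir(parents=True, exist_ok=True)
matrix = create_matrix(tweets, name='demo')
print(matrix.shape)  # (2, vocabulary_size)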
def textprocessing(text):
    text = str(text)
    stemmer = PorterStemmer()
    # str.replace returns a new string, so the result must be assigned.
    text = text.replace('`', "")
    text = text.replace("\"", "")
    # Drop special characters and single-letter tokens.
    re_sp = re.sub(r'\s*(?:([^a-zA-Z0-9._\s "])|\b(?:[a-z])\b)', " ", text.lower())
    text = re.sub("[!@#$%\n^'*)\\(-=]", " ", re_sp)
    # Keep only words longer than 3 characters.
    no_char = ' '.join([w for w in text.split() if len(w) > 3]).strip()
    stop_words = set(stopwords.words('english'))
    filtered_sp = [w for w in no_char.split(" ") if w not in stop_words]
    stemmed_sp = [stemmer.stem(item) for item in filtered_sp]
    return ' '.join(stemmed_sp)
def pipeline_csv(headlines):
    headlines['headline'] = headlines['headline'].apply(nltk.word_tokenize)

    # Stem, then lemmatize each token.
    stemmer = PorterStemmer()
    headlines['headline'] = headlines['headline'].apply(
        lambda x: [stemmer.stem(y) for y in x])
    lemmatizer = nltk.WordNetLemmatizer()
    headlines['headline'] = headlines['headline'].apply(
        lambda x: [lemmatizer.lemmatize(y) for y in x])

    # Filter stopwords; the stopwords are stemmed so they match the
    # already-stemmed tokens.
    stopwords = nltk.corpus.stopwords.words('english')
    stemmed_stops = [stemmer.stem(t) for t in stopwords]
    headlines['headline'] = headlines['headline'].apply(
        lambda x: [stemmer.stem(y) for y in x if y not in stemmed_stops])

    # Drop very short tokens and join back into a string.
    headlines['headline'] = headlines['headline'].apply(
        lambda x: [e for e in x if len(e) >= 3])
    headlines['headline'] = headlines['headline'].str.join(" ")
    return headlines
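# Usage sketch for pipeline_csv on a one-row frame (illustrative, not from
# the original source). Assumes pandas plus nltk's 'punkt', 'wordnet' and
# 'stopwords' data.
import pandas as pd

df = pd.DataFrame({'headline': ["Stocks are rallying after the announcement"]})
print(pipeline_csv(df)['headline'].iloc[0])  # e.g. "stock ralli announc"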
def stemmed_words(doc):
    '''
    This function is normally passed as a sklearn vectorizer's analyzer so
    that tokenization and stemming are performed when the vectorizer runs
    Inputs:
        doc: the untokenized text body of a document
    Returns:
        A generator over the stemmed tokens of the document
    E.g. vectorizer = CountVectorizer(lowercase=True, analyzer=stemmed_words)
    '''
    stemmer = PorterStemmer()
    analyzer = TfidfVectorizer().build_analyzer()
    return (stemmer.stem(w) for w in analyzer(doc))
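# Usage sketch following the docstring's example (illustrative, not from the
# original source; get_feature_names_out needs scikit-learn >= 1.0).
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(lowercase=True, analyzer=stemmed_words)
X = vectorizer.fit_transform(["cats chasing cats", "the dog chased a cat"])
print(vectorizer.get_feature_names_out())  # stems, e.g. 'cat', 'chase', 'dog'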
def __init__(self, corpus, expanded_urls):
    self.tokenizer = TweetTokenizer()
    self.stemmer = PorterStemmer()
    self.stopwords = stopwords.words('english')
    self.corpus = corpus
    self.expanded_urls = expanded_urls
    self.re_url = r'http\S+'
    self.punctuation = string.punctuation
    # Local installation of the Stanford POS tagger.
    self.stanford_pos_pwd = '/Users/mquezada/stanford-postagger-full-2015-12-09/'
    self.stanford_pos = StanfordPOSTagger(
        self.stanford_pos_pwd + 'models/english-left3words-distsim.tagger',
        self.stanford_pos_pwd + 'stanford-postagger.jar')
    self.tag_vocab = defaultdict(Counter)
    self.tag_token = dict()
    self.vocab = defaultdict(set)
    self.tags = Counter()
def clean_text(text):
    # Strip HTML tags.
    text = re.sub(re.compile('<.*?>'), '', text)
    stop_words = set(stopwords.words('english'))  # obtain the stop words
    stemmer = PorterStemmer()
    good_words = []  # save the correct words to consider as tokens
    tokenizer = RegexpTokenizer(r"[\w']+")  # tokenizer that recognizes word tokens
    words = tokenizer.tokenize(text)  # tokenize the text
    for word in words:
        # keep alphabetic words that are not stop words
        if word.lower() not in stop_words and word.isalpha():
            word = stemmer.stem(word)  # stem the word
            good_words.append(word.lower())  # insert the good token in lower case
    return good_words
def __init__(self, data_loader):
    self.data = data_loader
    self.tokenizer = TweetTokenizer()
    self.stemmer = PorterStemmer()
    self.stopwords = stopwords.words('english')
    self.re_url = r'http\S+'
    self.punctuation = string.punctuation
    self.vocab = defaultdict(set)
def extract_keywords(sentence):
    sentence = sentence.lower()
    # Keep negation and intensity words that would otherwise be removed
    # as stopwords.
    not_stopw = ["no", "nor", "not", "over", "under", "again", "further",
                 "but", "against", "too", "very"]
    stopw = stopwords.words('english')
    for x in not_stopw:
        stopw.remove(x)
    pattern = re.compile(r'\b(' + r'|'.join(stopw) + r')\b\s*')
    sentence = sentence.replace('\n', '')
    sentence = sentence.replace("n't", " not")
    sentence = clean_string(sentence)
    sentence = pattern.sub('', sentence)
    stemmer = Stemmer()
    s = [stemmer.stem(w) for w in sentence.split()]
    # Build bigrams of consecutive stems; return unigrams + bigrams.
    b = zip(*[s[i:] for i in [0, 1]])
    b = [bigram[0] + " " + bigram[1] for bigram in b]
    return s + b
def process_text(text):
    # Lowercase
    text = text.lower()
    # Remove URLs anywhere in the text
    text = re.sub(r'https?:\/\/\S+', "", text)
    # Extract alphanumeric tokens
    tokens = re.findall(r'\w+', text)
    # Remove stopwords
    list_stopwords = stopwords.words("portuguese")
    tokens = [word for word in tokens if word not in list_stopwords]
    # Stemming (note: PorterStemmer targets English; nltk's
    # SnowballStemmer("portuguese") or RSLPStemmer fit Portuguese better)
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    return " ".join(tokens)
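# Usage sketch (illustrative; the sample sentence is not from the original
# source). Assumes nltk's 'stopwords' data has been downloaded.
print(process_text("Visite https://example.com para ler as notícias de hoje"))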
def getCleanedReview(review):
    review = review.replace('<br /><br />', " ")

    # Tokenization of text
    tokenizer = RegexpTokenizer(r'\w+')
    wordsList = tokenizer.tokenize(review)
    wordsList = [word.lower() for word in wordsList]

    # Removing stopwords
    sw = set(stopwords.words('english'))
    wordsList = [word for word in wordsList if word not in sw]

    # Text stemming
    ps = PorterStemmer()
    wordsList = [ps.stem(word) for word in wordsList]

    # Return clean review
    cleaned_review = " ".join(wordsList)
    return cleaned_review
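# Usage sketch on an IMDB-style review (illustrative, not from the original
# source). Assumes nltk's 'stopwords' data has been downloaded.
print(getCleanedReview("This movie was amazing!<br /><br />Loved the acting."))
# -> "movi amaz love act"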
def __get_stemmer(self, stemmer, lang):
    """
    stemmer (str): method for stemming, can be either 'snowball' or 'porter'
    lang (str): language code, either 'da' or 'en'
    """
    lang_dict = {"da": "danish", "en": "english"}
    if lang in lang_dict:
        lang = lang_dict[lang]
    else:
        raise ValueError(f"language {lang} not in language dict for stemmer")
    if stemmer == "porter":
        ps = PorterStemmer()
        self.stemmer = ps.stem
    elif stemmer == "snowball":
        ss = SnowballStemmer(lang)
        self.stemmer = ss.stem
    elif not callable(self.stemmer):
        raise TypeError("stemmer should be 'porter', 'snowball' or a "
                        f"callable, not a type: {type(self.stemmer)}")
class GigawordParser(StreamParser):
    STEMMERS = {
        "eng": PorterStemmer(ignore_stopwords=False),
        "spa": SpanishStemmer(),
    }

    def __init__(self, language):
        self.next_id = 0
        self.language = language
        self.stemmer = self.STEMMERS.get(language)
        if self.stemmer is None:
            raise Exception("Unsupported language %s" % language)

    def init_id_counter(self, initial):
        self.next_id = initial

    def new_id(self):
        new_id = self.next_id
        self.next_id += 1
        return new_id

    def parse_raw(self, xml_str):
        xml = minidom.parseString(xml_str)
        # The language code matches the STEMMERS key ("spa", not "es").
        if self.language == "spa":
            try:
                url = "gigaword:" + xml.getElementsByTagName(
                    "DOC")[0].attributes["id"].value
                title = xml.getElementsByTagName(
                    "HEADLINE")[0].firstChild.nodeValue
            except (IndexError, KeyError, AttributeError):
                url = "<NONE>"
                title = "<NONE>"
        else:
            url = "<NONE>"
            title = "<NONE>"
        text = stringio.StringIO()
        for node in xml.getElementsByTagName("TEXT")[0].childNodes:
            if len(node.childNodes) > 0:
                text.write(node.firstChild.nodeValue)
        content = text.getvalue()
        terms = text_to_terms(content, self.language)
        return RuwacDocument(self.new_id(), url, title, content, terms)
def stemVector(vector, method="lemmatize"):
    # Apply the chosen normalizer to every token in the vector.
    output = []
    if method == 'lemmatize':
        wnl = WordNetLemmatizer()
        for i in vector:
            output.append(wnl.lemmatize(i))
    elif method == 'snowball':
        st = EnglishStemmer()
        for i in vector:
            output.append(st.stem(i))
    elif method == 'porter':
        st = PorterStemmer()
        for i in vector:
            output.append(st.stem(i))
    elif method == 'lancaster':
        st = LancasterStemmer()
        for i in vector:
            output.append(st.stem(i))
    return output
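# Usage sketch comparing the four methods (illustrative, not from the
# original source). Assumes:
#   from nltk.stem import WordNetLemmatizer, PorterStemmer, LancasterStemmer
#   from nltk.stem.snowball import EnglishStemmer
# plus nltk's 'wordnet' data for the lemmatizer.
words = ["studies", "studying", "cries"]
for method in ("lemmatize", "snowball", "porter", "lancaster"):
    print(method, stemVector(words, method=method))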
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer
from gensim.utils import simple_preprocess
import pickle
import itertools

###############################################################################
### Fetch the dataset
###############################################################################
_20news = fetch_20newsgroups(subset="all")
print("Dataset 20NEWS loaded...")
data = _20news.data
target = _20news.target

###############################################################################
### Pre-process the dataset
###############################################################################
print("Pre-processing the dataset...")
stemmer = PorterStemmer()  # Define the type of stemmer to use
additional_stop_words = [
    'edu', 'com', 'gov', 'ca', 'mit', 'uk', 'subject', 'lines',
    'organization', 'writes', 'msg', 'article', 'university', 'does',
    'posting', 'thanks', 'don', 'know', 'help', 'use', 'copy'
]
stop_words = ENGLISH_STOP_WORDS.union(additional_stop_words)
# Stem the stop words for larger detection
stop_words = set([stemmer.stem(word) for word in stop_words])

processed_data = []
id_to_delete = []
for i, doc in enumerate(data):
    tokenized_doc = list(simple_preprocess(doc, deacc=True, min_len=2))
    stemmed_doc = []
    for word in tokenized_doc:
        stemmed_word = stemmer.stem(word)
        if stemmed_word not in stop_words:
            stemmed_doc.append(stemmed_word)
""" TFIDF based baseline normalization. """ from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics import accuracy_score from sklearn.metrics.pairwise import cosine_similarity from ontology import read_ontology import numpy from write import write from read import read import settings import sys from nltk.stem.snowball import PorterStemmer stemmer = PorterStemmer() from nltk import word_tokenize from baseline import build_tfidf def normalize(in_path, out_path): print 'Reading concept data' concepts = list(read_ontology()) concept_ids, concept_names, concept_map, concept_vectors, tfidf_vectorizer = build_tfidf(concepts) reverse_concept_map = {concept_ids[i]:concept_names[i] for i in range(len(concept_names))} print 'Making predictions' devel_data = read(in_path) devel_tuples = [] for entity_id, data in devel_data.items():
def __init__(self):
    self.api_key = KEY
    self.tokenizer = nltk.WordPunctTokenizer()
    self.stm = PorterStemmer()
class ApiClient(object):
    API_URL = "http://api.rottentomatoes.com/api/public/v1.0/movies.json"
    MOVIE_URL = "http://api.rottentomatoes.com/api/public/v1.0/movies/{}.json"

    def __init__(self):
        self.api_key = KEY
        self.tokenizer = nltk.WordPunctTokenizer()
        self.stm = PorterStemmer()

    def _load(self, **kwargs):
        """ Loads list of movies via filter """
        params = dict(kwargs)
        params["apikey"] = self.api_key
        response = requests.get(self.API_URL, params=params).json()
        if response and "Error" in response:
            raise ValueError(response.get("Error", "Unknown error"))
        else:
            return response

    def _load_movie(self, movie_id, **kwargs):
        """ Loads extra movie information such as directors, genres, etc. """
        params = dict(kwargs)
        params["apikey"] = self.api_key
        response = requests.get(self.MOVIE_URL.format(str(movie_id)),
                                params=params).json()
        if response and "Error" in response:
            raise ValueError(response.get("Error", "Unknown error"))
        else:
            return response

    def normalize(self, text):
        tokens = list()
        for token in self.tokenizer.tokenize(text.lower()):
            # Excludes stopwords, punctuation; stemming
            if token in stopwords.words('english'):
                continue
            token = self.stm.stem(token)
            if token.isalpha():
                tokens.append(token)
        return tokens

    def get_extra_params(self, movie_id, movie):
        """ Saves extra features of movie """
        m = self._load_movie(movie_id)
        if (m.has_key('genres') and m.has_key('runtime')
                and m.has_key('critics_consensus') and m.has_key('abridged_cast')
                and m.has_key('abridged_directors') and m.has_key('studio')):
            movie.genres = m.get("genres")
            movie.runtime = m.get("runtime")
            movie.critics_consensus = self.normalize(m.get("critics_consensus"))
            movie.abridged_cast_names = [ac['name'] for ac in m.get("abridged_cast")]
            try:
                movie.first_director = m.get("abridged_directors")[0]['name']
            # This never happened: check type of exception
            except ValueError:
                return False
            movie.studio = m.get("studio")
            return True
        return False

    def search_movies(self, keyword, movie_ids, page_limit=50):
        #DBG
        logging.debug("Searching movies by keyword '%s'", keyword)
        # Get list of movies
        response = self._load(q=keyword, page_limit=1, page=1)
        n = response.get("total")
        # Load all 25 pages x 50 movies
        for i in xrange(min(n / page_limit, 25)):
            response = self._load(q=keyword, page_limit=page_limit, page=i + 1)
            if response:
                movies = response.get("movies")
                if movies:
                    for result in movies:
                        movie_id = result.get("id")
                        print movie_id
                        if not movie_id or movie_id in movie_ids:
                            continue
                        movie_ids.add(movie_id)
                        title = result.get("title")
                        synopsis = result.get("synopsis")
                        # Convert rating into linear scale [0-4]
                        rating = self.set_rating(result.get("mpaa_rating"))
                        if title and rating >= 0:
                            movie = Movie(movie_id, title)
                            if not synopsis:
                                movie.synopsis = ['EMPTY']
                            else:
                                movie.synopsis = self.normalize(synopsis)
                            movie.mpaa_rating = rating
                            # Load extra movie information
                            if self.get_extra_params(movie_id, movie):
                                yield movie

    @staticmethod
    def set_rating(rating):
        if rating == 'G':
            return 0
        elif rating == 'PG':
            return 1
        elif rating == 'PG-13':
            return 2
        elif rating == 'R':
            return 3
        elif rating == 'NC-17':
            return 4
        else:
            return -1
from nltk.stem.snowball import PorterStemmer
import food_detection_root
import os
import codecs

stemmer = PorterStemmer()
path = food_detection_root.ROOT_DIR + os.path.sep + 'data' + os.path.sep
what_food_list_file = codecs.open(path + "list - what_food.txt",
                                  encoding='utf-8')
what_food_list = what_food_list_file.read().splitlines()
what_food_list_file.close()

# Stem every entry in the food list.
stemmed_list = list()
for word in what_food_list:
    stemmed_word = stemmer.stem(word)
    stemmed_list.append(stemmed_word)

# Append the stemmed entries to the output list file.
what_food_stemmed_list_file = codecs.open(path + "list - stemmed_what_food.txt",
                                          encoding='utf-8', mode='a')
for word in stemmed_list:
    what_food_stemmed_list_file.write(word + "\n")
what_food_stemmed_list_file.close()
def removepunct_tokenize_stem(text):
    # Remove punctuation, tokenize, then stem each token via stem_tokens.
    text = "".join([ch for ch in text if ch not in string.punctuation])
    tokens = word_tokenize(text)
    stemmer = PorterStemmer()
    final = stem_tokens(tokens, stemmer)
    return final
import csv
import numpy as np
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer
from gensim.utils import simple_preprocess

data = []
target = []
with open('data/dbpedia/test.csv', 'r', encoding='utf-8') as f:
    csv_file = csv.reader(f, delimiter=',')
    for row in csv_file:
        target.append(int(row[0]))  # Class index
        # Text description (ignore the entity name)
        data.append(row[2].encode('utf-8', 'ignore'))

data = np.asarray(data)
target = np.asarray(target)
target = target - 1  # Labels starting from 0
print("Dataset DBPEDIA loaded...")

###############################################################################
### Pre-process the dataset
###############################################################################
print("Pre-processing the dataset...")
stemmer = PorterStemmer()  # Define the type of stemmer to use
additional_stop_words = []
stop_words = ENGLISH_STOP_WORDS.union(additional_stop_words)
# Stem the stop words for larger detection
stop_words = set([stemmer.stem(word) for word in stop_words])

processed_data = []
id_to_delete = []
for i, doc in enumerate(data):
    tokenized_doc = list(simple_preprocess(doc, deacc=True, min_len=2))
    stemmed_doc = []
    for word in tokenized_doc:
        stemmed_word = stemmer.stem(word)
        if stemmed_word not in stop_words:
            stemmed_doc.append(stemmed_word)
    if stemmed_doc == []:
        # Empty document after pre-processing: to be removed
        id_to_delete.append(i)
def tokenize(text):
    # Tokenize with nltk, then stem each token with the Porter stemmer.
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, PorterStemmer())
    return stems
    # Collect words that occur fewer than twice, then strip them from
    # every line.
    for key, value in dic.items():
        if value < 2:
            to_rm.append(key)
    pattern = re.compile(r'\b(' + r'|'.join(to_rm) + r')\b\s*')
    if to_rm:
        for i in range(len(lines)):
            lines[i] = pattern.sub('', lines[i])
    return lines


if __name__ == "__main__":
    # Keep negation/intensity words that would otherwise be removed
    # as stopwords.
    not_stopw = ["no", "nor", "not", "over", "under", "again", "further",
                 "but", "against", "too", "very"]
    stopw = stopwords.words('english')
    for x in not_stopw:
        stopw.remove(x)
    stemmer = Stemmer()
    pattern = re.compile(r'\b(' + r'|'.join(stopw) + r')\b\s*')
    script, fin, fout = sys.argv
    with open(fin, 'r') as f_in:
        lines = f_in.readlines()
    # Each input line is "<grade>\t<text>".
    grades = []
    for i in range(len(lines)):
        line = lines[i].split("\t")
        grades.append(line[0])
        lines[i] = line[1].replace("\n", "")
        lines[i] = cls(lines[i])
    for i in range(len(lines)):
        lines[i] = lines[i].replace("n't", " not")
    for i in range(len(lines)):
        lines[i] = lines[i].lower()
        lines[i] = pattern.sub('', lines[i])