def q02_tokenize(path):
    """Lower-case the training documents and split each into Treebank tokens.

    Returns a pandas Series where each element is the list of tokens for one
    training document loaded via ``q01_load_data``.
    """
    _, train_docs, _, _, _ = q01_load_data(path)
    word_splitter = TreebankWordTokenizer()
    # Vectorized lower-casing, then per-document tokenization.
    lowered = pd.Series(train_docs).str.lower()
    return lowered.apply(lambda doc: word_splitter.tokenize(str(doc)))
def q04_count_vectors(path, ranges=(1, 2), max_df=0.5, min_df=2):
    """Build bag-of-n-grams count matrices for the train and test splits.

    A CountVectorizer (Treebank tokenization, English stop words) is fitted on
    the training texts only; both splits are then transformed with it.
    Returns ``(train_matrix, test_matrix)`` as sparse document-term matrices.
    """
    _, train_texts, test_texts, _, _ = q01_load_data(path)
    vectorizer = CountVectorizer(
        decode_error='ignore',
        tokenizer=TreebankWordTokenizer().tokenize,
        ngram_range=ranges,
        max_df=max_df,
        min_df=min_df,
        stop_words='english',
    )
    # Fit on train only so the vocabulary never leaks from the test split.
    vectorizer.fit(train_texts)
    return vectorizer.transform(train_texts), vectorizer.transform(test_texts)
def q02_tokenize(path):
    """Lower-case the training documents and tokenize each with TreebankWordTokenizer.

    Returns a pandas Series where each element is the list of tokens for one
    training document loaded via ``q01_load_data``.
    """
    # Replaced the leftover placeholder docstring and removed the
    # commented-out debugging line (X_train.head(20)).
    twenty_train, X_train, X_test, y_train, y_test = q01_load_data(path)
    tokenizer = TreebankWordTokenizer()
    lowered = pd.Series(X_train).apply(lambda row: row.lower())
    return lowered.apply(lambda row: tokenizer.tokenize(str(row)))
def q03_stop_word_stemmer(path):
    """Tokenize the training texts, drop stop words, and Porter-stem the rest.

    Returns a single flat list of stemmed tokens across all training documents.

    Bug fixed: the original iterated each raw string directly
    (``[i for i in row if i not in stop]``), which yields *characters*, not
    words — so stop-word removal and stemming operated on single letters.
    The texts are now tokenized first, matching the sibling implementation.
    The quadratic ``text = text + tokens`` accumulation is replaced with
    ``list.extend``.
    """
    p_stemmer = PorterStemmer()
    data, X_train, X_test, y_train, y_test = q01_load_data(path)
    tokenizer = TreebankWordTokenizer()
    docs = pd.Series(X_train).astype(str)
    text = []
    for doc in docs:
        # Tokenize first; then filter stop words and stem, preserving the
        # original order (stop-word removal before stemming).
        tokens = tokenizer.tokenize(doc)
        text.extend(p_stemmer.stem(tok) for tok in tokens if tok not in stop)
    return text
def q02_tokenize(path):
    """Lower-case the training texts and tokenize them with TreebankWordTokenizer.

    Returns a pandas Series of token lists, one per training document.

    Cleanups: the original mutated the Series in place through a manual
    counter and ``.iloc`` while iterating it, and also lower-cased ``X_test``
    without ever using the result — both replaced/removed.
    """
    file, X_train, X_test, y_train, y_test = q01_load_data(path)
    tree_bank_tokenizer = TreebankWordTokenizer()
    X_train = pd.Series(X_train).apply(lambda x: x.lower())
    # apply() builds the tokenized result in one pass instead of rewriting
    # each element via X_train.iloc[i] inside a hand-rolled index loop.
    return X_train.apply(lambda row: tree_bank_tokenizer.tokenize(str(row)))
def q04_count_vectors(path, ranges=(1, 2), max_df=0.5, min_df=2):
    """Vectorize the train/test texts into n-gram count matrices.

    Fits a CountVectorizer (Treebank tokenization, English stop words,
    ``ngram_range=ranges``) on the training split and transforms both splits.
    Returns ``(train_matrix, test_matrix)``.
    """
    data, X_train, X_test, y_train, y_test = q01_load_data(path)
    train_series = pd.Series(X_train)
    word_tokenizer = TreebankWordTokenizer()
    # All options passed at construction time (equivalent to set_params).
    counter = CountVectorizer(
        decode_error='ignore',
        tokenizer=word_tokenizer.tokenize,
        stop_words='english',
        ngram_range=ranges,
        max_df=max_df,
        min_df=min_df,
    )
    train_matrix = counter.fit_transform(train_series)
    test_matrix = counter.transform(X_test)
    return train_matrix, test_matrix
def q03_stop_word_stemmer(path):
    """Tokenize each lower-cased training text, remove stop words, stem the rest.

    Returns a list of lists: one list of Porter-stemmed tokens per training
    document, in the original document order.
    """
    data, X_train, X_test, y_train, y_test = q01_load_data(path)
    docs = pd.Series(X_train).apply(lambda text: text.lower())
    splitter = TreebankWordTokenizer()
    stemmer = PorterStemmer()
    stemmed_docs = []
    for doc in docs:
        # Stop-word filtering happens before stemming, per document.
        kept = [tok for tok in splitter.tokenize(str(doc)) if tok not in stop]
        stemmed_docs.append([stemmer.stem(tok) for tok in kept])
    return stemmed_docs
def q02_tokenize(path):
    """Lower-case the training texts and tokenize them with TreebankWordTokenizer.

    Returns a pandas Series of token lists, one per training document.

    Cleanups: removed the dead ``Ftokens = pd.Series()`` assignment (it was
    immediately overwritten, and constructing an empty Series without a dtype
    is deprecated in pandas) and replaced the manual append loop with a list
    comprehension.
    """
    data, X_train, X_test, y_train, y_test = q01_load_data(path)
    X_train = pd.Series(X_train).apply(lambda x: x.lower())
    tree = TreebankWordTokenizer()
    return pd.Series([tree.tokenize(str(doc)) for doc in X_train])