Example #1
import pandas as pd
from nltk.tokenize import TreebankWordTokenizer

def q02_tokenize(path):
    df, X_train, X_test, y_train, y_test = q01_load_data(path)
    # Lower-case the training documents, then split each one into tokens.
    x_train_lower = pd.Series(X_train).str.lower()
    tokenizer = TreebankWordTokenizer()
    tokens = x_train_lower.apply(lambda row: tokenizer.tokenize(str(row)))
    return tokens
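For reference, a minimal standalone sketch of what TreebankWordTokenizer does to a single string. The sentence is made up; the expected output in the comment follows NLTK's Penn Treebank conventions, under which punctuation and the sentence-final period become separate tokens:

from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
# Punctuation is split off; '$' and the trailing '.' become their own tokens.
print(tokenizer.tokenize("Good muffins cost $3.88 in New York."))
# ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.']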
Example #2
from nltk.tokenize import TreebankWordTokenizer
from sklearn.feature_extraction.text import CountVectorizer

def q04_count_vectors(path, ranges=(1, 2), max_df=0.5, min_df=2):
    data, X_train, X_test, y_train, y_test = q01_load_data(path)
    tokenizer = TreebankWordTokenizer()
    tf = CountVectorizer(decode_error='ignore', tokenizer=tokenizer.tokenize,
                         ngram_range=ranges, max_df=max_df, min_df=min_df,
                         stop_words='english')
    # Learn the vocabulary on the training split only, then reuse it
    # for both splits so the feature columns line up.
    tf.fit(X_train)
    X_train_counts = tf.transform(X_train)
    X_test_counts = tf.transform(X_test)
    return X_train_counts, X_test_counts
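A minimal toy sketch of the same fit-on-train / transform-on-test pattern, with made-up sentences, showing why the two returned matrices always have identical columns:

from sklearn.feature_extraction.text import CountVectorizer

train = ["the cat sat on the mat", "the cat ran away"]
test = ["the dog sat down"]
vect = CountVectorizer(ngram_range=(1, 2), stop_words='english')
X_tr = vect.fit_transform(train)  # the vocabulary is learned here only
X_te = vect.transform(test)       # 'dog' was never seen, so it is dropped
print(X_tr.shape[1] == X_te.shape[1])  # True: same feature columns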
Example #3
import pandas as pd
from nltk.tokenize import TreebankWordTokenizer

def q02_tokenize(path):
    twenty_train, X_train, X_test, y_train, y_test = q01_load_data(path)
    tokenizer = TreebankWordTokenizer()
    # Lower-case each training document, then tokenize it.
    lowered = pd.Series(X_train).apply(lambda row: row.lower())
    tokenized = lowered.apply(lambda row: tokenizer.tokenize(str(row)))
    return tokenized
Example #4
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TreebankWordTokenizer

stop = set(stopwords.words('english'))

def q03_stop_word_stemmer(path):
    p_stemmer = PorterStemmer()
    tokenizer = TreebankWordTokenizer()
    data, X_train, X_test, y_train, y_test = q01_load_data(path)
    X_train = pd.Series(X_train).astype(str)
    # Tokenize before filtering: iterating over the raw string would
    # compare single characters, not words, against the stop list.
    filtered = X_train.apply(
        lambda row: [w for w in tokenizer.tokenize(row) if w not in stop])
    # Stem the surviving tokens and flatten them into one list.
    text = []
    for tokens in filtered:
        text += [p_stemmer.stem(w) for w in tokens]
    return text
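A standalone sketch of the stop-word-plus-stemming step on a made-up token list; the exact stems come from NLTK's PorterStemmer and should look roughly like the comment:

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

stop = set(stopwords.words('english'))  # needs nltk.download('stopwords') once
stemmer = PorterStemmer()
tokens = ['the', 'runners', 'were', 'running', 'quickly']
# 'the' and 'were' are stop words; the rest are reduced to their stems.
print([stemmer.stem(w) for w in tokens if w not in stop])
# ['runner', 'run', 'quickli']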
Example #5
import pandas as pd
from nltk.tokenize import TreebankWordTokenizer

def q02_tokenize(path):
    file, X_train, X_test, y_train, y_test = q01_load_data(path)
    tree_bank_tokenizer = TreebankWordTokenizer()
    # Lower-case the training documents, then replace each one in place
    # with its list of tokens.
    X_train = pd.Series(X_train).apply(lambda x: x.lower())
    for i, row in enumerate(X_train):
        X_train.iloc[i] = tree_bank_tokenizer.tokenize(str(row))
    return X_train
Example #6
import pandas as pd
from nltk.tokenize import TreebankWordTokenizer
from sklearn.feature_extraction.text import CountVectorizer

def q04_count_vectors(path, ranges=(1, 2), max_df=0.5, min_df=2):
    data, X_train, X_test, y_train, y_test = q01_load_data(path)
    X_train = pd.Series(X_train)
    # Same idea as Example #2, but the options are set via set_params.
    vect = CountVectorizer(decode_error='ignore')
    tokenizer = TreebankWordTokenizer()
    vect.set_params(tokenizer=tokenizer.tokenize,
                    stop_words='english',
                    ngram_range=ranges,
                    max_df=max_df,
                    min_df=min_df)
    train_transformed = vect.fit_transform(X_train)
    test_transformed = vect.transform(X_test)
    return train_transformed, test_transformed
Example #7
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TreebankWordTokenizer

stop = set(stopwords.words('english'))

def q03_stop_word_stemmer(path):
    data, X_train, X_test, y_train, y_test = q01_load_data(path)
    X_train = pd.Series(X_train).apply(lambda x: x.lower())
    tree = TreebankWordTokenizer()
    ps = PorterStemmer()
    # For each document: tokenize, drop stop words, stem what remains.
    # Unlike Example #4, this keeps one token list per document.
    stemmed_docs = []
    for doc in X_train:
        tokens = tree.tokenize(str(doc))
        kept = [w for w in tokens if w not in stop]
        stemmed_docs.append([ps.stem(w) for w in kept])
    return stemmed_docs
Example #8
import pandas as pd
from nltk.tokenize import TreebankWordTokenizer

def q02_tokenize(path):
    data, X_train, X_test, y_train, y_test = q01_load_data(path)
    X_train = pd.Series(X_train).apply(lambda x: x.lower())
    tree = TreebankWordTokenizer()
    # Collect each document's token list, then wrap them in a Series.
    token_lists = []
    for doc in X_train:
        token_lists.append(tree.tokenize(str(doc)))
    return pd.Series(token_lists)
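All of the examples above depend on q01_load_data, which is not shown here. Purely as a hypothetical sketch of what it might look like, assuming the data is a CSV with one text column and one label column (both column names are made up) split with scikit-learn's train_test_split:

import pandas as pd
from sklearn.model_selection import train_test_split

def q01_load_data(path):
    # Hypothetical reconstruction: the column names 'text' and 'label'
    # are assumptions, not taken from the original exercise.
    data = pd.read_csv(path)
    X_train, X_test, y_train, y_test = train_test_split(
        data['text'], data['label'], test_size=0.2, random_state=42)
    return data, X_train, X_test, y_train, y_test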