# Example #1
# 0
def build_model(clf):
    """Fit *clf* on TF-IDF features built from ``data/data-2.csv``.

    Each line of the CSV is expected to be ``text,label``; labels equal to
    ``'m'`` are mapped to 1, everything else to 0.  Half of the data is held
    out (``test_size=0.5``); only the training half is used for fitting.

    Parameters
    ----------
    clf : estimator with a ``fit(X, y)`` method.

    Returns
    -------
    tuple
        ``(clf, vectorizer)`` — the fitted classifier and the fitted
        TfidfVectorizer (so callers can transform new text consistently).
    """
    DATA_PATH = "data/data-2.csv"

    X, y = [], []
    with open(DATA_PATH, 'r') as f:
        for line in f:
            # Parse the line once instead of splitting it twice.
            parts = line.split(',')
            X.append(parts[0].lower())
            y.append(parts[1].strip('\n'))

    # Binary target: 1 for label 'm', 0 otherwise.
    y = np.where(np.array(y) == 'm', 1, 0)

    # NOTE(review): the original source had a censored identifier here
    # ("f**k"); the call signature matches
    # sklearn.model_selection.train_test_split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

    # Instantiate the vectorizer locally.  The original did
    # `global tf; tf = tf()`, which replaced the module-level TfidfVectorizer
    # class with an instance and broke every subsequent call.
    vectorizer = tf()
    X_train_vec = vectorizer.fit_transform(X_train)

    clf.fit(X_train_vec, y_train)
    return clf, vectorizer
# Example #2
# 0
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer as tf
from sklearn.model_selection import train_test_split

# One-step wrapper pipelines around the project's pre-processing
# transformers, so each data-access stage composes with the rest of the
# sklearn Pipeline machinery.
get_train_data = Pipeline([
    ('get_train_data', pre_prop.GetTraindata()),
])

get_train_label_data = Pipeline([
    ('get_train_label_data', pre_prop.GetTrainlabel()),
])

get_test_data = Pipeline([
    ('get_test_data', pre_prop.GetTestdata()),
])

get_test_label_data = Pipeline([
    ('get_test_label_data', pre_prop.GetTestlabel()),
])

# TF-IDF term-frequency step, configured from the project config module.
get_term_frequency = Pipeline([
    ('get_term_frequency', tf(stop_words=cfg.STOP_WORDS, max_df=cfg.MAX_DF)),
])
    # NOTE(review): fragment of a larger evaluation function — its `def` is
    # not visible in this chunk; `clf`, `X_test`, `y_test` and `name` are
    # bound by the missing enclosing scope.  Python 2 print statements.
    # Macro-averaged F1 on the held-out split; pos_label is ignored when
    # average='macro'.
    res = f1_score(y_test, clf.predict(X_test), pos_label=None, average='macro')
    print 'f1 macro:', res
    print
    # color = cm(1. * i / NUM_COLORS)  # color will now be an RGBA tuple
    # cm = plt.get_cmap('gist_rainbow')
    # fig = plt.figure(figsize=(8.0, 5.0))
    # ax = fig.add_subplot(111)
    # # ax.set_color_cycle([cm(1. * i / NUM_COLORS) for i in range(NUM_COLORS)])
    # ax.plot(range(len(scores)), scores, label=str(threshold))
    # ax.text(len(scores) - 1, scores[len(scores) - 1], threshold, fontsize='smaller')
    # plt.show()
    print name
    return res


# NOTE(review): Python 2 script fragment.  `run`, `cv`, `svc` and `lr` are
# defined elsewhere in the full file — presumably CountVectorizer and
# classifier aliases; confirm against the original source.
vec_list = [tf(), cv()]
clf_list = [svc(), lr()]
# np.arange is end-exclusive: thresholds 0.5, 1.0, 1.5, 2.0, 2.5.
threshold_list = np.arange(0.5, 3, 0.5)
print len(threshold_list)
# results_size = (len(vec_list), len(clf_list),len(threshold_list))
# results = np.zeros(results_size, dtype = np.float)
# a, b, c = range(3), range(3), range(3)
# def my_func(x, y, z):
#     return (x + y + z) / 3.0, x * y * z, max(x, y, z)

# Evaluate `run` over the full threshold x vectorizer x classifier grid:
# np.ix_ builds an open mesh and np.vectorize broadcasts `run` across it.
grids = np.vectorize(run)(*np.ix_(threshold_list, vec_list, clf_list))
# mean_grid, product_grid, max_grid = grids
print len(grids)
try:
    print grids.shape
except:
# Example #4
# 0
    # NOTE(review): fragment of a larger chatbot-style function — its `def`
    # is not visible in this chunk; `nlp`, `nueva_cadena`, `sentencias`,
    # `fin`, `tf` and `cs` are bound by the missing enclosing scope
    # (`cs` is presumably sklearn's cosine_similarity — confirm).
    deriva = SnowballStemmer(language='spanish')

    # Lemmatize with the spaCy-like `nlp` pipeline, then stem each lemma and
    # rebuild the query string.
    aux2 = nlp(str(nueva_cadena))
    nueva_cadena = ""
    for token in aux2:
        deri = deriva.stem(str(token.lemma_))
        nueva_cadena += deri + " "

    sentencias.append("")

    # Now append the processed query to the output corpus.
    print(nueva_cadena)
    fin.append(nueva_cadena)

    # TF-IDF transform of the whole corpus (the query is the last row).
    tfuno = tf()
    tfdos = tfuno.fit_transform(raw_documents=fin)
    validador = cs(tfdos[-1], tfdos)

    # Find the closest match: [-1] is the query itself (similarity 1.0),
    # so [-2] is the best real candidate.
    idx = validador.argsort()[0][-2]
    print("este es el idx", idx)

    flat = validador.flatten()
    flat.sort()
    req_tfidf = flat[-2]
    print(req_tfidf)

    # NOTE(review): threshold `<= 0.7` looks high for a "no match" cutoff —
    # confirm intent against the full source.
    if (req_tfidf <= 0.7):
        print("lo siento no puedo identicar que es lo que deseas")
    else: