コード例 #1
0
def random_forest_generator(data_filename, replacer, train_ratio=0.8):
    """Train a random-forest text classifier from a CSV file and save it.

    The CSV is read with pandas; its ``content`` column supplies the texts
    and its ``state`` column supplies the labels. Texts are passed through
    ``preprocess``, turned into bag-of-words counts, re-weighted with TF-IDF,
    split by ``Worker.sliceData`` and used to fit a 30-tree
    ``RandomForestClassifier``.

    Side effects:
        Creates ``./var/model`` if it does not exist and writes three
        joblib files there: ``forest_0`` (the classifier), ``bow_0`` (the
        fitted ``CountVectorizer``) and ``tfidf_0`` (the fitted
        ``TfidfTransformer``).

    Args:
        data_filename: Path (or anything ``pd.read_csv`` accepts) of a CSV
            with ``state`` and ``content`` columns.
        replacer: Forwarded unchanged to ``preprocess``.
        train_ratio: Forwarded to ``Worker.sliceData``; presumably the
            fraction of rows used for training -- confirm against Worker.

    Returns:
        The fitted ``RandomForestClassifier``.

    NOTE(review): the vectorizer and TF-IDF weights are fitted on every row
    before the split, so held-out rows influence the features -- confirm
    this is intended.
    NOTE(review): ``bow_0`` / ``tfidf_0`` are the same paths written by
    ``logistic_regression_generator``; the later call overwrites them.
    """
    import os

    # Load the raw corpus.
    data = pd.read_csv(data_filename)
    data_labels = data['state'].tolist()
    data_texts = data['content']

    # Make sure the output directory exists *before* the expensive training,
    # otherwise joblib.dump fails with FileNotFoundError after all the work.
    os.makedirs('./var/model', exist_ok=True)

    # Vectorize: bag-of-words counts, then TF-IDF weighting.
    bow_transformer = CountVectorizer()
    word_list = preprocess(data_texts, replacer)
    vectorized_data = bow_transformer.fit_transform(word_list)
    tfidf_transformer = TfidfTransformer().fit(vectorized_data)
    worker = Worker(bow=bow_transformer, tfidf=tfidf_transformer)
    # Dense array is handed to Worker.sliceData, as in the original code.
    vectorized_data_list = tfidf_transformer.transform(vectorized_data).toarray()

    all_data = worker.sliceData(vectorized_data_list, data_labels, train_ratio)
    train_data = all_data['train_data']
    train_labels = all_data['train_labels']
    test_data = all_data['test_data']
    test_labels = all_data['test_labels']

    # Build and evaluate the model (n_jobs=-1: use all available cores).
    forest = RandomForestClassifier(n_estimators=30, n_jobs=-1)
    forest = forest.fit(train_data, train_labels)
    pred = forest.predict(test_data)
    # Return value is not used here; presumably this reports the metrics --
    # confirm against Worker.calculate_result.
    worker.calculate_result(test_labels, pred)

    # Persist the model together with the transformers needed to reuse it.
    joblib.dump(forest, './var/model/forest_0', compress=3)
    joblib.dump(bow_transformer, './var/model/bow_0', compress=3)
    joblib.dump(tfidf_transformer, './var/model/tfidf_0', compress=3)
    return forest
コード例 #2
0
def logistic_regression_generator(data_filename, replacer, train_ratio=0.8):
    """Train a logistic-regression text classifier from a CSV file and save it.

    The CSV is read with pandas; its ``content`` column supplies the texts
    and its ``state`` column supplies the labels. Texts are passed through
    ``preprocess``, turned into bag-of-words counts, re-weighted with TF-IDF,
    split by ``Worker.sliceData`` and used to fit an L2-penalised
    ``LogisticRegression`` (``C=30``, ``random_state=0``).

    Side effects:
        Prints ``"generator prediction:"`` before evaluating, creates
        ``./var/model`` if it does not exist and writes three joblib files
        there: ``lr_0`` (the classifier), ``bow_0`` (the fitted
        ``CountVectorizer``) and ``tfidf_0`` (the fitted
        ``TfidfTransformer``).

    Args:
        data_filename: Path (or anything ``pd.read_csv`` accepts) of a CSV
            with ``state`` and ``content`` columns.
        replacer: Forwarded unchanged to ``preprocess``.
        train_ratio: Forwarded to ``Worker.sliceData``; presumably the
            fraction of rows used for training -- confirm against Worker.

    Returns:
        The fitted ``LogisticRegression`` instance.

    NOTE(review): the vectorizer and TF-IDF weights are fitted on every row
    before the split, so held-out rows influence the features -- confirm
    this is intended.
    NOTE(review): ``bow_0`` / ``tfidf_0`` are the same paths written by
    ``random_forest_generator``; the later call overwrites them.
    """
    import os

    # Load the raw corpus.
    data = pd.read_csv(data_filename)
    data_labels = data['state'].tolist()
    data_texts = data['content']

    # Make sure the output directory exists *before* training, otherwise
    # joblib.dump fails with FileNotFoundError after all the work is done.
    os.makedirs('./var/model', exist_ok=True)

    # Vectorize: bag-of-words counts, then TF-IDF weighting.
    bow_transformer = CountVectorizer()
    word_list = preprocess(data_texts, replacer)
    vectorized_data = bow_transformer.fit_transform(word_list)
    tfidf_transformer = TfidfTransformer().fit(vectorized_data)
    worker = Worker(bow=bow_transformer, tfidf=tfidf_transformer)
    # Dense array is handed to Worker.sliceData, as in the original code.
    vectorized_data_list = tfidf_transformer.transform(vectorized_data).toarray()

    all_data = worker.sliceData(vectorized_data_list, data_labels, train_ratio)
    train_data = all_data['train_data']
    train_labels = all_data['train_labels']
    test_data = all_data['test_data']
    test_labels = all_data['test_labels']

    # Build and evaluate the model.
    lr = LogisticRegression(C=30, penalty='l2', random_state=0)
    lr.fit(train_data, train_labels)
    pred = lr.predict(test_data)
    print("generator prediction:")
    # Return value is not used here; presumably this reports the metrics --
    # confirm against Worker.calculate_result.
    worker.calculate_result(test_labels, pred)

    # Persist the model together with the transformers needed to reuse it.
    joblib.dump(lr, './var/model/lr_0', compress=3)
    joblib.dump(bow_transformer, './var/model/bow_0', compress=3)
    joblib.dump(tfidf_transformer, './var/model/tfidf_0', compress=3)
    return lr