import numpy as np

import hashfeatures
import preprocess


def get_total_features(search_frame, first_col_name='product_title', second_col_name='search_term'):
    # Hash each text column separately, then concatenate the two matrices.
    feature_hash_first_text = hashfeatures.FeatureHash()
    feature_hash_second_text = hashfeatures.FeatureHash(max_feature_num=100)
    first_text_list = preprocess.text_clean_pipeline_list(list(search_frame[first_col_name].values))
    second_text_list = preprocess.text_clean_pipeline_list(list(search_frame[second_col_name].values))
    first_feature_set = feature_hash_first_text.get_feature_set(first_text_list)
    second_feature_set = feature_hash_second_text.get_feature_set(second_text_list)
    final_consolidated_feature_list = np.hstack((first_feature_set, second_feature_set))
    return final_consolidated_feature_list
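These examples depend on the local hashfeatures module, which is not shown here. A minimal sketch of what FeatureHash might look like, assuming it wraps scikit-learn's HashingVectorizer; the default width of 5000 features is a guess, not taken from the source:

from sklearn.feature_extraction.text import HashingVectorizer


class FeatureHash:
    # Hashes raw text into a fixed-width numeric feature matrix.
    def __init__(self, max_feature_num=5000):  # default width is an assumption
        self.vectorizer = HashingVectorizer(n_features=max_feature_num,
                                            alternate_sign=False)

    def get_feature_set(self, text_list):
        # Dense (n_samples, max_feature_num) array, ready for np.hstack.
        return self.vectorizer.transform(text_list).toarray()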
Example no. 2
import numpy as np

import hashfeatures
import preprocess


def get_full_feature_set(dataframe):
    title_list = list(dataframe['Title'].values)
    body_list = list(dataframe['BodyMarkdown'].values)
    clean_title_list = preprocess.text_clean_pipeline_list(title_list)
    clean_body_list = preprocess.text_clean_pipeline_list(body_list)
    # Titles are short, so 100 hashed features suffice; bodies get 400.
    title_feature = hashfeatures.FeatureHash(max_feature_num=100)
    body_feature = hashfeatures.FeatureHash(max_feature_num=400)
    title_hash_features = title_feature.get_feature_set(clean_title_list)
    body_hash_features = body_feature.get_feature_set(clean_body_list)
    # Drop the raw text columns; whatever remains in the frame must be
    # numeric so it can be stacked alongside the hashed features.
    del dataframe['Title']
    del dataframe['BodyMarkdown']
    full_feature_set = np.hstack(
        (title_hash_features, body_hash_features, dataframe.values))
    return full_feature_set
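A hypothetical call, to show the shape contract: every column left in the frame after the two del statements must be numeric, because np.hstack concatenates them with the hashed features. The column names besides Title and BodyMarkdown are made up for illustration:

import pandas as pd

frame = pd.DataFrame({'Title': ['How do I parse JSON in Python?'],
                      'BodyMarkdown': ['I have a string and ...'],
                      'ReputationAtPostCreation': [120]})  # hypothetical numeric column
features = get_full_feature_set(frame)
print(features.shape)  # (1, 501): 100 title + 400 body + 1 numeric column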
Example no. 3
import pandas as pd
import preprocess
from textfeatures import TfIdf
from textfeatures import BagOfWords
from textfeatures import FeatureHash
from textfeatures import DocumentVector
import time

input_data_filepath = 'data/train_input.csv'
input_label_filepath = 'data/train_output.csv'

thread_dataframe = pd.read_csv(input_data_filepath)
label_dataframe = pd.read_csv(input_label_filepath)

conversation_list = list(thread_dataframe['conversation'].values)
cleaned_conversation_list = preprocess.text_clean_pipeline_list(conversation_list)

start = time.time()

preprocess.write_sentences_to_file(cleaned_conversation_list, 'data/cleaned_conversation.txt')

# Alternative featurisers, instantiated alongside the Doc2Vec approach.
tf_idf = TfIdf()
bag_of_words = BagOfWords()
feature_hash = FeatureHash()

doc2vec_feat = DocumentVector(filepath='data/cleaned_conversation.txt')

# Train the Doc2Vec feature model on the cleaned conversations and persist it.
doc2vec_feat.fit_feature_model()
doc2vec_feat.save_feature_model('trained_model/doc2vec_feature_model.bin')

feature_hash.fit_feature_model(preprocess.tokenize_string_list(cleaned_conversation_list, separator=' '))
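DocumentVector is another local class the page omits. A plausible minimal version, assuming it wraps gensim's Doc2Vec and treats each line of the input file as one document:

from gensim.models.doc2vec import Doc2Vec, TaggedDocument


class DocumentVector:
    # Learns a fixed-size embedding per document from a text file.
    def __init__(self, filepath, vector_size=100):  # vector_size is an assumption
        with open(filepath) as f:
            self.documents = [TaggedDocument(line.split(), [i])
                              for i, line in enumerate(f)]
        self.model = Doc2Vec(vector_size=vector_size, min_count=2, epochs=20)

    def fit_feature_model(self):
        self.model.build_vocab(self.documents)
        self.model.train(self.documents,
                         total_examples=self.model.corpus_count,
                         epochs=self.model.epochs)

    def save_feature_model(self, path):
        self.model.save(path)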
Example no. 4
import pandas as pd
import preprocess
import hashfeatures

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC


def get_classifiers():
    # Reconstructed head of the function; the page truncates the snippet,
    # so the exact estimator settings are assumptions.
    rf = RandomForestClassifier(n_estimators=51, random_state=42)
    linear_svm = LinearSVC(random_state=42)
    classifier_list = [rf, linear_svm]
    classifier_name_list = ['Random Forest', 'Linear SVM']
    return classifier_list, classifier_name_list


def report_classification_metrics(trained_model, X_test, y_test):
    predicted_values = trained_model.predict(X_test)
    print(metrics.classification_report(y_test, predicted_values))
    print(metrics.accuracy_score(y_test, predicted_values))


filename = 'C:\\Users\\rupachak\\Desktop\\Kaggle Data\\DonorsChoose\\train.csv'
train_frame = pd.read_csv(filename)
train_frame = train_frame.head(25000)
class_labels = list(train_frame['project_is_approved'].values)
summary_text = preprocess.text_clean_pipeline_list(
    list(train_frame['project_resource_summary'].values))
feature_set = hashfeatures.FeatureHash(
    max_feature_num=2000).get_feature_set(summary_text)

del train_frame
del summary_text
X_train, X_test, y_train, y_test = train_test_split(feature_set,
                                                    class_labels,
                                                    test_size=0.2,
                                                    random_state=42)
del class_labels
del feature_set

classifier_list, classifier_name_list = get_classifiers()
for classifier, classifier_name in zip(classifier_list, classifier_name_list):
    classifier.fit(X_train, y_train)
    report_classification_metrics(classifier, X_test, y_test)
Example no. 5
import pandas as pd
import preprocess
import hashfeatures

from sklearn import metrics
from sklearn.ensemble import (RandomForestClassifier, BaggingClassifier,
                              ExtraTreesClassifier, AdaBoostClassifier,
                              GradientBoostingClassifier)
from sklearn.model_selection import train_test_split


def get_ensemble_models():
    # The first three estimators are reconstructed; the page truncates the
    # snippet, so their exact settings are assumptions.
    rf = RandomForestClassifier(n_estimators=51, random_state=42)
    bagg = BaggingClassifier(n_estimators=51, random_state=42)
    extra = ExtraTreesClassifier(n_estimators=51, random_state=42)
    ada = AdaBoostClassifier(n_estimators=51, random_state=42)
    grad = GradientBoostingClassifier(n_estimators=101, random_state=42)
    classifier_list = [rf, bagg, extra, ada, grad]
    classifier_name_list = ['Random Forests', 'Bagging', 'Extra Trees', 'AdaBoost', 'Gradient Boost']
    return classifier_list, classifier_name_list


def print_evaluation_metrics(trained_model, trained_model_name, X_test, y_test):
    print('--------- For Model : ------------', trained_model_name)
    predicted_values = trained_model.predict(X_test)
    print(metrics.classification_report(y_test, predicted_values))
    print('Accuracy Score : ', metrics.accuracy_score(y_test, predicted_values))
    print('---------------------------------------\n')


filename = 'train.csv'
author_frame = pd.read_csv(filename)
class_labels = list(author_frame['author'].values)
del author_frame['id']
del author_frame['author']
text_list = list(author_frame['text'].values)
cleaned_text_list = preprocess.text_clean_pipeline_list(text_list)
feat_hash = hashfeatures.FeatureHash(max_feature_num=1000)
text_features = feat_hash.get_feature_set(cleaned_text_list)
X_train, X_test, y_train, y_test = train_test_split(text_features, class_labels, test_size=0.2, random_state=42)
classifier_list, classifier_name_list = get_ensemble_models()
for classifier, classifier_name in zip(classifier_list, classifier_name_list):
    classifier.fit(X_train, y_train)
    print_evaluation_metrics(classifier, classifier_name, X_test, y_test)

Example no. 6

script_file_path = 'C:\\Users\\rupachak\\Desktop\\Kaggle Data\\Seinfield Scripts\\scripts.csv'
script_frame = pd.read_csv(script_file_path)
character_group_series = script_frame['Character'].value_counts()
# Keep only characters with more than 300 lines of dialogue.
filtered_character_list = []
for character, count in character_group_series.items():
    if count > 300:
        filtered_character_list.append(character)

filtered_script_frame = script_frame[script_frame['Character'].isin(
    filtered_character_list)]
del script_frame
character_list = list(filtered_script_frame['Character'].values)
dialogue_list = preprocess.text_clean_pipeline_list(
    list(filtered_script_frame['Dialogue'].values))
hash_feature_set = hashfeatures.FeatureHash(
    max_feature_num=1000).get_feature_set(dialogue_list)
del filtered_script_frame
X_train, X_test, y_train, y_test = train_test_split(hash_feature_set,
                                                    character_list,
                                                    test_size=0.2,
                                                    random_state=42)
del character_list
del dialogue_list
del hash_feature_set
classifier_list, classifier_name_list = get_classifiers()
for classifier, classifier_name in zip(classifier_list, classifier_name_list):
    classifier.fit(X_train, y_train)
    report_classification_metrics(classifier, X_test, y_test)
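Every example funnels raw text through preprocess.text_clean_pipeline_list, which is also not shown. A plausible minimal version, assuming the pipeline just normalises case, punctuation, digits, and whitespace:

import re
import string


def text_clean_pipeline_list(text_list):
    # Lowercase, strip punctuation and digits, collapse repeated whitespace.
    cleaned = []
    for text in text_list:
        text = str(text).lower()
        text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)
        text = re.sub(r'\d+', ' ', text)
        cleaned.append(' '.join(text.split()))
    return cleaned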