import numpy as np

import hashfeatures
import preprocess


def get_total_features(search_frame, first_col_name='product_title',
                       second_col_name='search_term'):
    # Hash each text column separately, then concatenate the two hashed
    # blocks column-wise into a single feature matrix.
    feature_hash_first_text = hashfeatures.FeatureHash()
    feature_hash_second_text = hashfeatures.FeatureHash(max_feature_num=100)
    first_text_list = preprocess.text_clean_pipeline_list(
        list(search_frame[first_col_name].values))
    second_text_list = preprocess.text_clean_pipeline_list(
        list(search_frame[second_col_name].values))
    first_feature_set = feature_hash_first_text.get_feature_set(first_text_list)
    second_feature_set = feature_hash_second_text.get_feature_set(second_text_list)
    final_consolidated_feature_list = np.hstack((first_feature_set,
                                                 second_feature_set))
    return final_consolidated_feature_list
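The hashfeatures.FeatureHash wrapper used throughout this section is not shown here. A minimal sketch of what it might look like follows, assuming it is backed by scikit-learn's HashingVectorizer and returns a dense array (the np.hstack calls above and below require dense inputs); the default of 5000 features is an assumption, and the real implementation may differ.

# Hypothetical sketch of hashfeatures.FeatureHash (assumption: it wraps
# scikit-learn's HashingVectorizer and densifies the output).
from sklearn.feature_extraction.text import HashingVectorizer


class FeatureHash:
    def __init__(self, max_feature_num=5000):  # default size is an assumption
        self.vectorizer = HashingVectorizer(n_features=max_feature_num,
                                            alternate_sign=False)

    def get_feature_set(self, text_list):
        # Densify so callers can np.hstack the result with other blocks.
        return self.vectorizer.fit_transform(text_list).toarray()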
import numpy as np

import hashfeatures
import preprocess


def get_full_feature_set(dataframe):
    # Hash the title and body text separately, drop the raw text columns,
    # and append the remaining (numeric) columns to the hashed features.
    title_list = list(dataframe['Title'].values)
    body_list = list(dataframe['BodyMarkdown'].values)
    clean_title_list = preprocess.text_clean_pipeline_list(title_list)
    clean_body_list = preprocess.text_clean_pipeline_list(body_list)
    title_feature = hashfeatures.FeatureHash(max_feature_num=100)
    body_feature = hashfeatures.FeatureHash(max_feature_num=400)
    title_hash_features = title_feature.get_feature_set(clean_title_list)
    body_hash_features = body_feature.get_feature_set(clean_body_list)
    del dataframe['Title']
    del dataframe['BodyMarkdown']
    full_feature_set = np.hstack(
        (title_hash_features, body_hash_features, dataframe.values))
    return full_feature_set
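The preprocess.text_clean_pipeline_list helper is likewise external to this section and is used by every script below. A plausible minimal version, assuming it lowercases, strips punctuation, and drops stopwords (the actual pipeline may do more, e.g. stemming or HTML stripping):

# Hypothetical sketch of preprocess.text_clean_pipeline_list.
import re
import string

# A tiny built-in stopword list keeps the sketch self-contained; the real
# pipeline presumably uses a fuller list (e.g. NLTK's).
STOPWORDS = {'a', 'an', 'the', 'and', 'or', 'is', 'are', 'to', 'of', 'in'}


def text_clean_pipeline_list(text_list):
    cleaned = []
    for text in text_list:
        text = str(text).lower()
        text = re.sub('[' + re.escape(string.punctuation) + ']', ' ', text)
        tokens = [t for t in text.split() if t not in STOPWORDS]
        cleaned.append(' '.join(tokens))
    return cleaned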
import time

import pandas as pd

import preprocess
from textfeatures import BagOfWords
from textfeatures import DocumentVector
from textfeatures import FeatureHash
from textfeatures import TfIdf

input_data_filepath = 'data/train_input.csv'
input_label_filepath = 'data/train_output.csv'

thread_dataframe = pd.read_csv(input_data_filepath)
label_dataframe = pd.read_csv(input_label_filepath)

conversation_list = list(thread_dataframe['conversation'].values)
cleaned_conversation_list = preprocess.text_clean_pipeline_list(conversation_list)

# Time the file write and the feature-model fitting below.
start = time.time()
preprocess.write_sentences_to_file(cleaned_conversation_list,
                                   'data/cleaned_conversation.txt')

# Instantiate the featurizers; TfIdf and BagOfWords are created here for
# later use alongside the hashed and Doc2Vec features.
tf_idf = TfIdf()
bag_of_words = BagOfWords()
feature_hash = FeatureHash()

doc2vec_feat = DocumentVector(filepath='data/cleaned_conversation.txt')
doc2vec_feat.fit_feature_model()
doc2vec_feat.save_feature_model('trained_model/doc2vec_feature_model.bin')

feature_hash.fit_feature_model(
    preprocess.tokenize_string_list(cleaned_conversation_list, separator=' '))
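If save_feature_model delegates to gensim's Doc2Vec.save (an assumption; DocumentVector is a project-local wrapper whose persistence format is not shown), the saved model could later be reloaded to infer a vector for a new, unseen conversation:

# Assumes the .bin file is a gensim Doc2Vec model saved via model.save();
# if DocumentVector persists in another format, adapt accordingly.
from gensim.models.doc2vec import Doc2Vec

model = Doc2Vec.load('trained_model/doc2vec_feature_model.bin')
new_conversation = 'hi i need help resetting my password'
vector = model.infer_vector(new_conversation.split())
print(vector.shape)  # one dense embedding per document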
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

import hashfeatures
import preprocess


def get_classifiers():
    # The top of this function was truncated in the source; the two
    # classifier definitions below are reconstructed assumptions.
    rf = RandomForestClassifier(n_estimators=51, random_state=42)
    linear_svm = LinearSVC(random_state=42)
    classifier_list = [rf, linear_svm]
    classifier_name_list = ['Random Forest', 'Linear SVM']
    return classifier_list, classifier_name_list


def report_classification_metrics(trained_model, X_test, y_test):
    predicted_values = trained_model.predict(X_test)
    print(metrics.classification_report(y_test, predicted_values))
    print(metrics.accuracy_score(y_test, predicted_values))


filename = 'C:\\Users\\rupachak\\Desktop\\Kaggle Data\\DonorsChoose\\train.csv'
train_frame = pd.read_csv(filename)
train_frame = train_frame.head(25000)
class_labels = list(train_frame['project_is_approved'].values)
summary_text = preprocess.text_clean_pipeline_list(
    list(train_frame['project_resource_summary'].values))
feature_set = hashfeatures.FeatureHash(
    max_feature_num=2000).get_feature_set(summary_text)
# Free the large intermediates before training.
del train_frame
del summary_text
X_train, X_test, y_train, y_test = train_test_split(feature_set, class_labels,
                                                    test_size=0.2, random_state=42)
del class_labels
del feature_set
classifier_list, classifier_name_list = get_classifiers()
for classifier, classifier_name in zip(classifier_list, classifier_name_list):
    classifier.fit(X_train, y_train)
    # The original snippet ends after fit(); reporting metrics here matches
    # the pattern used by the other scripts in this section.
    report_classification_metrics(classifier, X_test, y_test)
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              ExtraTreesClassifier, GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.model_selection import train_test_split

import hashfeatures
import preprocess


def get_ensemble_models():
    # The first three classifier definitions were truncated in the source;
    # they are reconstructed here with assumed hyperparameters.
    rf = RandomForestClassifier(n_estimators=51, random_state=42)
    bagg = BaggingClassifier(n_estimators=51, random_state=42)
    extra = ExtraTreesClassifier(n_estimators=51, random_state=42)
    ada = AdaBoostClassifier(n_estimators=51, random_state=42)
    grad = GradientBoostingClassifier(n_estimators=101, random_state=42)
    classifier_list = [rf, bagg, extra, ada, grad]
    classifier_name_list = ['Random Forests', 'Bagging', 'Extra Trees',
                            'AdaBoost', 'Gradient Boost']
    return classifier_list, classifier_name_list


def print_evaluation_metrics(trained_model, trained_model_name, X_test, y_test):
    print('--------- For Model : ------------', trained_model_name)
    predicted_values = trained_model.predict(X_test)
    print(metrics.classification_report(y_test, predicted_values))
    print('Accuracy Score : ', metrics.accuracy_score(y_test, predicted_values))
    print('---------------------------------------\n')


filename = 'train.csv'
author_frame = pd.read_csv(filename)
class_labels = list(author_frame['author'].values)
del author_frame['id']
del author_frame['author']
text_list = list(author_frame['text'].values)
cleaned_text_list = preprocess.text_clean_pipeline_list(text_list)
feat_hash = hashfeatures.FeatureHash(max_feature_num=1000)
text_features = feat_hash.get_feature_set(cleaned_text_list)
X_train, X_test, y_train, y_test = train_test_split(text_features, class_labels,
                                                    test_size=0.2, random_state=42)
classifier_list, classifier_name_list = get_ensemble_models()
for classifier, classifier_name in zip(classifier_list, classifier_name_list):
    classifier.fit(X_train, y_train)
    print_evaluation_metrics(classifier, classifier_name, X_test, y_test)
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

import hashfeatures
import preprocess


def get_classifiers():
    # Only the tail of this script survived; this helper is reconstructed
    # to match the identical one in the DonorsChoose script above (assumption).
    rf = RandomForestClassifier(n_estimators=51, random_state=42)
    linear_svm = LinearSVC(random_state=42)
    return [rf, linear_svm], ['Random Forest', 'Linear SVM']


def report_classification_metrics(trained_model, X_test, y_test):
    predicted_values = trained_model.predict(X_test)
    print(metrics.classification_report(y_test, predicted_values))
    print(metrics.accuracy_score(y_test, predicted_values))


script_file_path = 'C:\\Users\\rupachak\\Desktop\\Kaggle Data\\Seinfield Scripts\\scripts.csv'
script_frame = pd.read_csv(script_file_path)
character_group_series = script_frame['Character'].value_counts()
filtered_character_list = []
# Keep only characters with more than 300 lines of dialogue.
for character, count in character_group_series.items():  # iteritems() was removed in pandas 2.0
    if count > 300:
        filtered_character_list.append(character)
filtered_script_frame = script_frame[script_frame['Character'].isin(
    filtered_character_list)]
del script_frame
character_list = list(filtered_script_frame['Character'].values)
dialogue_list = preprocess.text_clean_pipeline_list(
    list(filtered_script_frame['Dialogue'].values))
hash_feature_set = hashfeatures.FeatureHash(
    max_feature_num=1000).get_feature_set(dialogue_list)
del filtered_script_frame
X_train, X_test, y_train, y_test = train_test_split(hash_feature_set, character_list,
                                                    test_size=0.2, random_state=42)
del character_list
del dialogue_list
del hash_feature_set
classifier_list, classifier_name_list = get_classifiers()
for classifier, classifier_name in zip(classifier_list, classifier_name_list):
    classifier.fit(X_train, y_train)
    report_classification_metrics(classifier, X_test, y_test)
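As a side note on the character-filtering step, the explicit counting loop admits a more compact pandas form with identical behavior; a small sketch:

# Boolean filter on the value counts instead of an explicit loop.
frequent = character_group_series[character_group_series > 300]
filtered_character_list = frequent.index.tolist()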