def get_complete_feature_set(dataframe):
    summary_list = list(dataframe['Summary'].values)
    text_list = list(dataframe['Text'].values)
    summary_features = hashfeatures.FeatureHash(
        max_feature_num=100).get_feature_set(summary_list)
    text_features = hashfeatures.FeatureHash(
        max_feature_num=400).get_feature_set(text_list)
    consolidated_feature_list = np.hstack((summary_features, text_features))
    return consolidated_feature_list
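# Every snippet in this file leans on two local modules that are not shown
# here: `hashfeatures` (text hashing) and `preprocess` (text cleaning).  The
# class below is only a minimal sketch of the FeatureHash interface those
# snippets assume -- FeatureHash(max_feature_num=N).get_feature_set(list_of_texts)
# returning an (n_samples, N) array -- built on scikit-learn's HashingVectorizer.
# The real module may differ; the default width of 1000 is a guess.
from sklearn.feature_extraction.text import HashingVectorizer


class FeatureHash(object):
    def __init__(self, max_feature_num=1000):
        # Fixed-width hashing vectorizer; no vocabulary has to be fitted.
        self.vectorizer = HashingVectorizer(n_features=max_feature_num)

    def get_feature_set(self, text_list):
        # Densify so callers can np.hstack the result with other feature blocks.
        return self.vectorizer.transform(text_list).toarray()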
def get_total_features(search_frame, first_col_name='product_title',
                       second_col_name='search_term'):
    feature_hash_first_text = hashfeatures.FeatureHash()
    feature_hash_second_text = hashfeatures.FeatureHash(max_feature_num=100)
    first_text_list = preprocess.text_clean_pipeline_list(
        list(search_frame[first_col_name].values))
    second_text_list = preprocess.text_clean_pipeline_list(
        list(search_frame[second_col_name].values))
    first_feature_set = feature_hash_first_text.get_feature_set(first_text_list)
    second_feature_set = feature_hash_second_text.get_feature_set(second_text_list)
    final_consolidated_feature_list = np.hstack((first_feature_set, second_feature_set))
    return final_consolidated_feature_list
def get_full_hash_features(dataframe):
    title_features = hashfeatures.FeatureHash(
        max_feature_num=20).get_feature_set(list(dataframe['title'].values))
    desc_features = hashfeatures.FeatureHash(
        max_feature_num=450).get_feature_set(
            list(dataframe['description'].values))
    attr_features = hashfeatures.FeatureHash(
        max_feature_num=100).get_feature_set(list(dataframe['attrs'].values))
    # Drop the raw text columns so the remaining frame holds only numeric features.
    dataframe.drop(['title', 'description', 'attrs'], axis=1, inplace=True)
    full_features = np.hstack(
        (title_features, desc_features, attr_features, dataframe.values))
    return full_features
def get_full_feature_set(dataframe):
    title_list = list(dataframe['Title'].values)
    body_list = list(dataframe['BodyMarkdown'].values)
    clean_title_list = preprocess.text_clean_pipeline_list(title_list)
    clean_body_list = preprocess.text_clean_pipeline_list(body_list)
    title_feature = hashfeatures.FeatureHash(max_feature_num=100)
    body_feature = hashfeatures.FeatureHash(max_feature_num=400)
    title_hash_features = title_feature.get_feature_set(clean_title_list)
    body_hash_features = body_feature.get_feature_set(clean_body_list)
    del dataframe['Title']
    del dataframe['BodyMarkdown']
    full_feature_set = np.hstack(
        (title_hash_features, body_hash_features, dataframe.values))
    return full_feature_set
def get_hash_features(dataframe, column_dim_dict):
    # Hash every text column named in column_dim_dict into the number of
    # dimensions given as its value, then stack the blocks column-wise.
    text_features_master_list = []
    for key in column_dim_dict.keys():
        text_list = list(dataframe[key].values)
        text_features = hashfeatures.FeatureHash(
            max_feature_num=column_dim_dict[key]).get_feature_set(text_list)
        text_features_master_list.append(text_features)
    return reduce(lambda x, y: np.hstack((x, y)), text_features_master_list)
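# Hypothetical usage of get_hash_features; the frame, the column names and the
# widths below are invented purely for illustration.
demo_frame = pd.DataFrame({'question': ['how do i hash text', 'what is a dataframe'],
                           'answer': ['use the hashing trick', 'a labelled 2d table']})
demo_features = get_hash_features(demo_frame, {'question': 50, 'answer': 200})
# demo_features has shape (2, 250); the order of the 50- and 200-dimensional
# blocks follows the dict's iteration order.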
def get_label_encoded_features(rent_frame, text_columns):
    description_list = list(rent_frame['description'].values)
    features_list = list(rent_frame['features'].values)
    # Each 'features' entry is a list of amenity strings; join it into one document.
    features_list = map(lambda s: " ".join(s), features_list)
    address_list = list(rent_frame['display_address'].values)
    street_list = list(rent_frame['street_address'].values)
    # One 150-dimensional hasher is reused for all four text columns.
    feature_hash = hashfeatures.FeatureHash(max_feature_num=150)
    description_hash = feature_hash.get_feature_set(description_list)
    features_list_hash = feature_hash.get_feature_set(features_list)
    address_hash = feature_hash.get_feature_set(address_list)
    street_hash = feature_hash.get_feature_set(street_list)
    rent_frame.drop(text_columns, axis=1, inplace=True)
    numerical_features = rent_frame.values
    return numpy.hstack((numerical_features, description_hash, features_list_hash,
                         address_hash, street_hash))
def get_hashed_features(tweet_list):
    feat = hashfeatures.FeatureHash(max_feature_num=5000)
    hash_feature_set = feat.get_feature_set(tweet_list)
    return hash_feature_set
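# Illustrative call with made-up tweets: each document becomes one
# 5000-dimensional hashed vector.
sample_tweets = ['just setting up my twttr', 'the hashing trick keeps features cheap']
sample_tweet_features = get_hashed_features(sample_tweets)
# sample_tweet_features.shape == (2, 5000)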
    # Tail of get_classifiers(); the rest of that function is not part of this snippet.
    return classifier_list, classifier_name_list


def report_classification_metrics(trained_model, X_test, y_test):
    predicted_values = trained_model.predict(X_test)
    print metrics.classification_report(y_test, predicted_values)
    print metrics.accuracy_score(y_test, predicted_values)


filename = 'C:\\Users\\rupachak\\Desktop\\Kaggle Data\\DonorsChoose\\train.csv'
train_frame = pd.read_csv(filename)
train_frame = train_frame.head(25000)
class_labels = list(train_frame['project_is_approved'].values)
summary_text = preprocess.text_clean_pipeline_list(
    list(train_frame['project_resource_summary'].values))
feature_set = hashfeatures.FeatureHash(
    max_feature_num=2000).get_feature_set(summary_text)
del train_frame
del summary_text
X_train, X_test, y_train, y_test = train_test_split(feature_set, class_labels,
                                                    test_size=0.2, random_state=42)
del class_labels
del feature_set
classifier_list, classifier_name_list = get_classifiers()
for classifier, classifier_name in zip(classifier_list, classifier_name_list):
    classifier.fit(X_train, y_train)
    print "---------- For Classifier: ", classifier_name, " --------------------\n"
    report_classification_metrics(classifier, X_test, y_test)
    # Tail of get_ensemble_models(); rf, bagg and extra are built in the part of
    # the function not shown here (a sketched reconstruction follows this snippet).
    ada = AdaBoostClassifier(n_estimators=51, random_state=42)
    grad = GradientBoostingClassifier(n_estimators=101, random_state=42)
    classifier_list = [rf, bagg, extra, ada, grad]
    classifier_name_list = ['Random Forests', 'Bagging', 'Extra Trees',
                            'AdaBoost', 'Gradient Boost']
    return classifier_list, classifier_name_list


def print_evaluation_metrics(trained_model, trained_model_name, X_test, y_test):
    print '--------- For Model : ------------', trained_model_name
    predicted_values = trained_model.predict(X_test)
    print metrics.classification_report(y_test, predicted_values)
    print "Accuracy Score : ", metrics.accuracy_score(y_test, predicted_values)
    print "---------------------------------------\n"


filename = 'train.csv'
author_frame = pd.read_csv(filename)
class_labels = list(author_frame['author'].values)
del author_frame['id']
del author_frame['author']
text_list = list(author_frame['text'].values)
cleaned_text_list = preprocess.text_clean_pipeline_list(text_list)
feat_hash = hashfeatures.FeatureHash(max_feature_num=1000)
text_features = feat_hash.get_feature_set(cleaned_text_list)
X_train, X_test, y_train, y_test = train_test_split(text_features, class_labels,
                                                    test_size=0.2, random_state=42)
classifier_list, classifier_name_list = get_ensemble_models()
for classifier, classifier_name in zip(classifier_list, classifier_name_list):
    classifier.fit(X_train, y_train)
    print_evaluation_metrics(classifier, classifier_name, X_test, y_test)
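# The snippet above starts in the middle of get_ensemble_models(), so rf, bagg
# and extra come from a part of the function that is not shown.  Judging from
# classifier_name_list, a plausible -- but assumed -- reconstruction of the
# whole function is sketched below; the n_estimators values for the first
# three models are guesses.
from sklearn.ensemble import (RandomForestClassifier, BaggingClassifier,
                              ExtraTreesClassifier, AdaBoostClassifier,
                              GradientBoostingClassifier)


def get_ensemble_models():
    rf = RandomForestClassifier(n_estimators=101, random_state=42)
    bagg = BaggingClassifier(n_estimators=51, random_state=42)
    extra = ExtraTreesClassifier(n_estimators=51, random_state=42)
    ada = AdaBoostClassifier(n_estimators=51, random_state=42)
    grad = GradientBoostingClassifier(n_estimators=101, random_state=42)
    classifier_list = [rf, bagg, extra, ada, grad]
    classifier_name_list = ['Random Forests', 'Bagging', 'Extra Trees',
                            'AdaBoost', 'Gradient Boost']
    return classifier_list, classifier_name_list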
    # Tail of fill_nan_in_string(); word_dict and column come from the per-column
    # loop in the part of the function not shown here.
        top_keys = get_top_n_dict_keys(word_dict, top_n=7)
        top_key_string = " ".join(top_keys)
        # Replace missing values in this column with the most frequent words.
        dataframe[column] = map(lambda x: top_key_string if x is np.nan else x,
                                dataframe[column].values)
    return dataframe


filename = 'training.csv'
train_frame = pd.read_csv(filename)
name_desc_cap_frame = train_frame[['name', 'description', 'caption']]
target_class_labels = train_frame['good'].values
train_frame.drop(['name', 'description', 'caption', 'good'], axis=1, inplace=True)
name_desc_cap_frame = fill_nan_in_string(name_desc_cap_frame)
name_features = hashfeatures.FeatureHash(max_feature_num=100).get_feature_set(
    name_desc_cap_frame['name'].values)
desc_features = hashfeatures.FeatureHash(max_feature_num=500).get_feature_set(
    name_desc_cap_frame['description'].values)
caption_features = hashfeatures.FeatureHash(
    max_feature_num=200).get_feature_set(name_desc_cap_frame['caption'].values)
# Impute missing numeric values before stacking with the hashed text features.
train_features = Imputer().fit_transform(train_frame.values)
final_features = np.hstack(
    (name_features, desc_features, caption_features, train_features))
X_train, X_test, y_train, y_test = train_test_split(final_features, target_class_labels,
                                                    test_size=0.1, random_state=42)
classifier_list, classifier_name_list = get_ensemble_models()
for classifier, classifier_name in zip(classifier_list, classifier_name_list):
    classifier.fit(X_train, y_train)
    print_evaluation_metrics(classifier, classifier_name, X_test, y_test)
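# fill_nan_in_string relies on a get_top_n_dict_keys helper that is not shown.
# A minimal sketch, assuming word_dict maps each word to its frequency:
def get_top_n_dict_keys(word_dict, top_n=5):
    # Sort by count, highest first, and keep only the keys of the top entries.
    sorted_items = sorted(word_dict.items(), key=lambda item: item[1], reverse=True)
    return [key for key, _ in sorted_items[:top_n]]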
    # Closing arguments of a regression-metric print cut off at the start of this snippet.
        y_test, predicted_values)
    print "Median Absolute Error : ", metrics.median_absolute_error(
        y_test, predicted_values)
    print "Mean Squared Error : ", metrics.mean_squared_error(
        y_test, predicted_values)
    print "R2 Score : ", metrics.r2_score(y_test, predicted_values)
    print "---------------------------------------\n"


filename = 'train.csv'
train_frame = pd.read_csv(filename)
columns_to_delete = ['Id', 'LocationRaw', 'ContractType']
train_frame.drop(columns_to_delete, axis=1, inplace=True)
train_frame.dropna(inplace=True)
title_features = hashfeatures.FeatureHash(max_feature_num=100).get_feature_set(
    list(train_frame['Title'].values))
description_features = hashfeatures.FeatureHash(
    max_feature_num=900).get_feature_set(
        list(train_frame['FullDescription'].values))
target_values = train_frame['SalaryNormalized'].values
train_frame.drop(['Title', 'FullDescription', 'SalaryNormalized'], axis=1, inplace=True)
train_frame = label_encode_frame(train_frame)
final_feature_set = np.hstack(
    (title_features, description_features, train_frame.values))
X_train, X_test, y_train, y_test = train_test_split(final_feature_set, target_values,
                                                    test_size=0.2, random_state=42)
regressor_list, regressor_name_list = get_ensemble_models()
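# label_encode_frame is called above (and in the next snippet) but defined
# elsewhere.  A minimal sketch of what it presumably does -- turn every string
# column into integer codes with sklearn's LabelEncoder:
from sklearn.preprocessing import LabelEncoder


def label_encode_frame(dataframe):
    for column in dataframe.columns:
        if dataframe[column].dtype == object:
            # Cast to str first so missing values get a code instead of raising.
            dataframe[column] = LabelEncoder().fit_transform(
                dataframe[column].values.astype(str))
    return dataframe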
    # Tail of get_data_frame(); a sketch of the rest of the loader follows this snippet.
    return pd.DataFrame(text_list)


text_filename = 'training_text'
mutation_filename = 'training_variants'
text_frame = get_data_frame(text_filename)
mutation_frame = pd.read_csv(mutation_filename)
mutation_frame['ID'] = map(lambda x: int(x), mutation_frame['ID'].values)
final_frame = pd.merge(text_frame, mutation_frame, left_on='ID',
                       right_on='ID', how='outer')
class_labels = list(final_frame['Class'].values)
gene_text = list(final_frame['Text'].values)
gene_features = hashfeatures.FeatureHash(
    max_feature_num=5000).get_feature_set(gene_text)
del final_frame['Class']
del final_frame['Text']
del final_frame['ID']
final_frame = label_encode_frame(final_frame)
final_feature_set = np.hstack((gene_features, final_frame.values))
X_train, X_test, y_train, y_test = train_test_split(final_feature_set, class_labels,
                                                    test_size=0.2, random_state=42)
classifier_list, classifier_name_list = get_ensemble_models()
for classifier, classifier_name in zip(classifier_list, classifier_name_list):
    classifier.fit(X_train, y_train)
    print_evaluation_metrics(classifier, classifier_name, X_test, y_test)
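# get_data_frame (its closing return opens the snippet above) reads the
# free-text file.  The sketch below assumes the Kaggle "training_text" layout
# in which a header line is followed by rows of the form ID||Text; if the real
# file differs, the parsing would too.
def get_data_frame(filename):
    text_list = []
    with open(filename) as text_file:
        next(text_file)  # skip the header line
        for line in text_file:
            doc_id, _, text = line.partition('||')
            text_list.append({'ID': int(doc_id), 'Text': text.strip()})
    return pd.DataFrame(text_list)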
script_file_path = 'C:\\Users\\rupachak\\Desktop\\Kaggle Data\\Seinfield Scripts\\scripts.csv'
script_frame = pd.read_csv(script_file_path)
character_group_series = script_frame['Character'].value_counts()
# Keep only characters with more than 300 lines of dialogue.
filtered_character_list = []
for character, count in character_group_series.iteritems():
    if count > 300:
        filtered_character_list.append(character)
filtered_script_frame = script_frame[script_frame['Character'].isin(
    filtered_character_list)]
del script_frame
character_list = list(filtered_script_frame['Character'].values)
dialogue_list = preprocess.text_clean_pipeline_list(
    list(filtered_script_frame['Dialogue'].values))
hash_feature_set = hashfeatures.FeatureHash(
    max_feature_num=1000).get_feature_set(dialogue_list)
del filtered_script_frame
X_train, X_test, y_train, y_test = train_test_split(hash_feature_set, character_list,
                                                    test_size=0.2, random_state=42)
del character_list
del dialogue_list
del hash_feature_set
classifier_list, classifier_name_list = get_classifiers()
for classifier, classifier_name in zip(classifier_list, classifier_name_list):
    classifier.fit(X_train, y_train)
    report_classification_metrics(classifier, X_test, y_test)
    # Tail of get_ensemble_models(); the rest of the name list and the model
    # construction are not part of this snippet.
        'Gradient Boost'
    ]
    return classifier_list, classifier_name_list


def print_evaluation_metrics(trained_model, trained_model_name, X_test, y_test):
    print '--------- For Model : ', trained_model_name
    predicted_values = trained_model.predict(X_test)
    print metrics.classification_report(y_test, predicted_values)
    print "Accuracy Score : ", metrics.accuracy_score(y_test, predicted_values)
    print "---------------------------------------\n"


filename = 'train.csv'
imperial_frame = pd.read_csv(filename)
feature_hash = hashfeatures.FeatureHash(max_feature_num=5000)
insult_features = feature_hash.get_feature_set(
    list(imperial_frame['Comment'].values))
class_labels = list(imperial_frame['Insult'].values)
# Embed the hashed features with totally random trees; fit_transform returns a
# sparse one-hot encoding of the leaves each sample lands in.
rf_embed_features = RandomTreesEmbedding(n_estimators=151, random_state=42)
insult_features = rf_embed_features.fit_transform(insult_features)
X_train, X_test, y_train, y_test = train_test_split(insult_features, class_labels,
                                                    test_size=0.1, random_state=42)
classifier_list, classifier_name_list = get_ensemble_models()
for classifier, classifier_name in zip(classifier_list, classifier_name_list):
    classifier.fit(X_train, y_train)
    print_evaluation_metrics(classifier, classifier_name, X_test, y_test)