n2 = '100000'
strong_ext = '_chi2_strong_' + vectorizer
weak_ext = '_chi2_weak_' + vectorizer

# Load the base feature data and labels for both splits
train_features, train_labels = load_data.load_feature_data(0, test_train='train')
len_train = len(train_labels)
test_features, test_labels = load_data.load_feature_data(0, test_train='test')
all_labels = np.append(train_labels, test_labels)

## Do better Normalization on Customer Features
all_features = np.vstack((train_features, test_features))
float_feats = [[float(i) for i in row] for row in all_features]  # Turn values to floating point
new_feats = preprocessing.normalize(float_feats, norm='l1', axis=0)
train_features = new_feats[:len_train]
test_features = new_feats[len_train:]
### Done Normalizing

# Load email/subject text features: base vectorizer, n1/n2-limited variants,
# and the chi2 'strong'/'weak' selections
train_email_features, test_email_features = load_data.load_email_data(0, vectorizer, stemmer=stem, vectorizer=v2)
train_subject_features, test_subject_features = load_data.load_subject_data(0, vectorizer, stemmer=stem, vectorizer=v2)
train_email_features_n1, test_email_features_n1 = load_data.load_email_data(0, n1 + v2, stemmer=stem, vectorizer=v2)
train_subject_features_n1, test_subject_features_n1 = load_data.load_subject_data(0, n1 + v2, stemmer=stem, vectorizer=v2)
train_email_features_n2, test_email_features_n2 = load_data.load_email_data(0, n2 + v2, stemmer=stem, vectorizer=v2)
train_subject_features_n2, test_subject_features_n2 = load_data.load_subject_data(0, n2 + v2, stemmer=stem, vectorizer=v2)
train_email_strong_features, test_email_strong_features = load_data.load_email_data(0, strong_ext, stemmer=stem, vectorizer=vectorizer)
train_subject_strong_features, test_subject_strong_features = load_data.load_subject_data(0, strong_ext, stemmer=stem, vectorizer=vectorizer)
train_email_weak_features, test_email_weak_features = load_data.load_email_data(0, weak_ext, stemmer=stem, vectorizer=vectorizer)
train_subject_weak_features, test_subject_weak_features = load_data.load_subject_data(0, weak_ext, stemmer=stem, vectorizer=vectorizer)
logging.info("All Data Loaded")

for t in ['email']:
    for s in ['n2']:
        if t == 'email':
            if s == 'strong':
                trainer = train_email_strong_features
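# ---------------------------------------------------------------------------
# Illustrative aside: the '_chi2_strong_' / '_chi2_weak_' extensions above
# suggest feature sets that were pre-selected with a chi-squared test. The
# snippet below is only a minimal sketch of how such a selection could be
# produced with scikit-learn; SelectKBest/chi2, the toy documents, and k=2
# are assumptions for illustration, not the project's actual pipeline.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2

_toy_docs = ["cheap meds now", "meeting agenda attached", "win cash now"]
_toy_labels = [1, 0, 1]
_counts = CountVectorizer().fit_transform(_toy_docs)        # term-count matrix
_selector = SelectKBest(chi2, k=2)                          # keep the 2 highest-scoring terms
_selected = _selector.fit_transform(_counts, _toy_labels)   # chi2-selected feature matrix
# ---------------------------------------------------------------------------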
stemmers = ['RegexpStemmer', 'LancasterStemmer', 'PorterStemmer']

for vectorizer in vectorizers:
    for stem in stemmers:
        train_features, train_labels = load_data.load_feature_data(0, test_train='train')
        len_train = len(train_labels)
        test_features, test_labels = load_data.load_feature_data(0, test_train='test')
        all_labels = np.append(train_labels, test_labels)

        ## Do better Normalization on Customer Features
        all_features = np.vstack((train_features, test_features))
        float_feats = [[float(i) for i in row] for row in all_features]  # Turn values to floating point
        new_feats = preprocessing.normalize(float_feats, norm='l1', axis=0)
        train_features = new_feats[:len_train]
        test_features = new_feats[len_train:]
        ### Done Normalizing

        train_email_features, test_email_features = load_data.load_email_data(0, vectorizer, stemmer=stem, vectorizer=vectorizer)
        train_subject_features, test_subject_features = load_data.load_subject_data(0, vectorizer, stemmer=stem, vectorizer=vectorizer)

        logging.info("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
        logging.info("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
        logging.info("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
        logging.info("VECTORIZER = %s" % vectorizer)
        logging.info("STEMMER = %s" % stem)
        logging.info("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
        logging.info("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
        logging.info("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
        logging.info("All Data Loaded")

        for t in ['both', 'all', 'subject', 'email', 'normal']:
            if t == 'email':
                trainer = train_email_features
                test = test_email_features
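# ---------------------------------------------------------------------------
# Minimal sketch of the column-wise L1 normalization applied to the stacked
# customer features above: preprocessing.normalize(..., norm='l1', axis=0)
# rescales each feature column so its absolute values sum to 1, and the rows
# are then split back into the original train/test blocks. The toy matrices
# below are illustrative only, not project data.
import numpy as np
from sklearn import preprocessing

_toy_train = np.array([[1.0, 10.0], [3.0, 30.0]])
_toy_test = np.array([[4.0, 60.0]])
_stacked = np.vstack((_toy_train, _toy_test))
_normalized = preprocessing.normalize(_stacked, norm='l1', axis=0)
# Each column now sums to 1, e.g. column 0 -> [0.125, 0.375, 0.5]
_train_norm = _normalized[:len(_toy_train)]
_test_norm = _normalized[len(_toy_train):]
# ---------------------------------------------------------------------------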