コード例 #1
0
ファイル: exploration2.py プロジェクト: jmrohnson/DataScience
  # String prefix; concatenated with v2 below to form the second argument of
  # the "_n2" loader calls.
  # NOTE(review): presumably a feature/vocabulary size -- confirm in load_data.
  n2 = '100000'
  # Strings passed as the second argument of the "strong" / "weak" loader
  # calls below; both are derived from the current vectorizer name.
  strong_ext = '_chi2_strong_' + vectorizer
  weak_ext = '_chi2_weak_' + vectorizer
  # Load the train and test feature/label sets; len_train records how many
  # rows belong to the training set so the stacked array can be split again.
  train_features, train_labels = load_data.load_feature_data(0, test_train='train')
  len_train= len(train_labels)
  test_features, test_labels = load_data.load_feature_data(0, test_train='test')
  # Train labels followed by test labels, in the same order as the stacked
  # feature rows built below.
  all_labels = np.append(train_labels, test_labels)
  ## Normalize the customer features.
  ## Train and test rows are stacked and normalized in a single call, so the
  ## scaling is computed over both sets together.
  all_features = np.vstack((train_features, test_features))
  float_feats = [[float(i) for i in row] for row in all_features]  # Convert every value to float
  # NOTE(review): presumably sklearn.preprocessing.normalize, in which case
  # axis=0 with norm='l1' scales each feature column independently -- confirm.
  new_feats = preprocessing.normalize(float_feats, norm='l1', axis=0)
  # Split the normalized rows back into the original train / test partitions.
  train_features = new_feats[:len_train]
  test_features = new_feats[len_train:]
  ### Done normalizing
  # Load email-body and subject-line features for several variants. The
  # variants differ in the second positional argument: vectorizer, n1 + v2,
  # n2 + v2, strong_ext and weak_ext. n1, v2, stem and vectorizer are defined
  # outside this fragment.
  train_email_features, test_email_features = load_data.load_email_data(0, vectorizer, stemmer=stem, vectorizer=v2)
  train_subject_features, test_subject_features = load_data.load_subject_data(0, vectorizer, stemmer=stem, vectorizer=v2)
  train_email_features_n1, test_email_features_n1 = load_data.load_email_data(0, n1+v2, stemmer=stem, vectorizer=v2)
  train_subject_features_n1, test_subject_features_n1 = load_data.load_subject_data(0, n1+v2, stemmer=stem, vectorizer=v2)
  train_email_features_n2, test_email_features_n2 = load_data.load_email_data(0, n2+v2, stemmer=stem, vectorizer=v2)
  train_subject_features_n2, test_subject_features_n2 = load_data.load_subject_data(0, n2+v2, stemmer=stem, vectorizer=v2)
  # NOTE(review): the strong/weak loads pass vectorizer=vectorizer, whereas the
  # loads above pass vectorizer=v2 -- confirm this difference is intentional.
  train_email_strong_features, test_email_strong_features = load_data.load_email_data(0, strong_ext, stemmer=stem, vectorizer=vectorizer)
  train_subject_strong_features, test_subject_strong_features = load_data.load_subject_data(0, strong_ext, stemmer=stem, vectorizer=vectorizer)
  train_email_weak_features, test_email_weak_features = load_data.load_email_data(0, weak_ext, stemmer=stem, vectorizer=vectorizer)
  train_subject_weak_features, test_subject_weak_features = load_data.load_subject_data(0, weak_ext, stemmer=stem, vectorizer=vectorizer)

  logging.info("All Data Loaded")
  for t in ['email']:
    for s in ['n2']:
      if t == 'email':
        if s =='strong':
          trainer = train_email_strong_features
コード例 #2
0
ファイル: exploration3.py プロジェクト: jmrohnson/DataScience
  stemmers = ['RegexpStemmer', 'LancasterStemmer', 'PorterStemmer']
  for vectorizer in vectorizers:
    for stem in stemmers:
      # Load the train and test feature/label sets again on every
      # (vectorizer, stem) iteration; none of the arguments to these two
      # calls depend on the loop variables.
      train_features, train_labels = load_data.load_feature_data(0, test_train='train')
      # Number of training rows, used to split the stacked array back apart.
      len_train= len(train_labels)
      test_features, test_labels = load_data.load_feature_data(0, test_train='test')
      # Train labels followed by test labels, in the same order as the
      # stacked feature rows built below.
      all_labels = np.append(train_labels, test_labels)
      ## Normalize the customer features.
      ## Train and test rows are stacked and normalized in a single call, so
      ## the scaling is computed over both sets together.
      all_features = np.vstack((train_features, test_features))
      float_feats = [[float(i) for i in row] for row in all_features]  # Convert every value to float
      # NOTE(review): presumably sklearn.preprocessing.normalize, in which
      # case axis=0 with norm='l1' scales each feature column independently
      # -- confirm.
      new_feats = preprocessing.normalize(float_feats, norm='l1', axis=0)
      # Split the normalized rows back into the original train / test parts.
      train_features = new_feats[:len_train]
      test_features = new_feats[len_train:]
      ### Done normalizing
      # Load email-body and subject-line features for the current
      # vectorizer / stemmer combination.
      train_email_features, test_email_features = load_data.load_email_data(0, vectorizer, stemmer=stem, vectorizer=vectorizer)
      train_subject_features, test_subject_features = load_data.load_subject_data(0, vectorizer, stemmer=stem, vectorizer=vectorizer)

      # Banner in the log marking which vectorizer / stemmer combination the
      # log lines that follow belong to.
      logging.info("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
      logging.info("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
      logging.info("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
      logging.info("VECTORIZER = %s" % vectorizer)
      logging.info("STEMMER = %s" % stem)
      logging.info("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
      logging.info("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
      logging.info("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
      
      logging.info("All Data Loaded")
      for t in ['both', 'all', 'subject', 'email', 'normal']:
        if t == 'email':
          trainer = train_email_features
          test = test_email_features