Example 1
  # Feature-count and chi2 feature-set extension strings used in the loads below
  n1 = '1000'
  n2 = '100000'
  strong_ext = '_chi2_strong_' + vectorizer
  weak_ext = '_chi2_weak_' + vectorizer
  train_features, train_labels = load_data.load_feature_data(0, test_train='train')
  len_train = len(train_labels)
  test_features, test_labels = load_data.load_feature_data(0, test_train='test')
  all_labels = np.append(train_labels, test_labels)
  ## Normalize customer features column-wise over the combined train and test sets
  all_features = np.vstack((train_features, test_features))
  float_feats = [[float(i) for i in row] for row in all_features]  # Convert values to floating point
  new_feats = preprocessing.normalize(float_feats, norm='l1', axis=0)
  train_features = new_feats[:len_train]
  test_features = new_feats[len_train:]
  ### Done normalizing
  # Load pre-vectorized email/subject features: the base set, the n1 and n2
  # feature-count variants, and the chi2-selected strong/weak sets
  train_email_features, test_email_features = load_data.load_email_data(0, vectorizer, stemmer=stem, vectorizer=v2)
  train_subject_features, test_subject_features = load_data.load_subject_data(0, vectorizer, stemmer=stem, vectorizer=v2)
  train_email_features_n1, test_email_features_n1 = load_data.load_email_data(0, n1+v2, stemmer=stem, vectorizer=v2)
  train_subject_features_n1, test_subject_features_n1 = load_data.load_subject_data(0, n1+v2, stemmer=stem, vectorizer=v2)
  train_email_features_n2, test_email_features_n2 = load_data.load_email_data(0, n2+v2, stemmer=stem, vectorizer=v2)
  train_subject_features_n2, test_subject_features_n2 = load_data.load_subject_data(0, n2+v2, stemmer=stem, vectorizer=v2)
  train_email_strong_features, test_email_strong_features = load_data.load_email_data(0, strong_ext, stemmer=stem, vectorizer=vectorizer)
  train_subject_strong_features, test_subject_strong_features = load_data.load_subject_data(0, strong_ext, stemmer=stem, vectorizer=vectorizer)
  train_email_weak_features, test_email_weak_features = load_data.load_email_data(0, weak_ext, stemmer=stem, vectorizer=vectorizer)
  train_subject_weak_features, test_subject_weak_features = load_data.load_subject_data(0, weak_ext, stemmer=stem, vectorizer=vectorizer)

  logging.info("All Data Loaded")
  for t in ['email']:
    for s in ['n2']:
      if t == 'email':
        if s == 'strong':
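
Examples 1 and 2 share the same normalization idiom: stack the train and test feature matrices so every column is scaled by a single common factor, normalize, then split back at the original boundary. A self-contained sketch of just that step (the toy matrices and sizes are invented for illustration):

  import numpy as np
  from sklearn import preprocessing

  # Toy stand-ins for the real train/test customer-feature matrices
  train_features = np.array([[1, 200], [3, 400]])
  test_features = np.array([[2, 100], [4, 300]])
  len_train = len(train_features)

  # Stack, scale every column to unit L1 norm, then split back apart
  all_features = np.vstack((train_features, test_features)).astype(float)
  new_feats = preprocessing.normalize(all_features, norm='l1', axis=0)
  train_features = new_feats[:len_train]
  test_features = new_feats[len_train:]

  # Every column of the stacked matrix now sums to 1
  assert np.allclose(new_feats.sum(axis=0), 1.0)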
Example 2
  vectorizers = ['TfidfVectorizer', 'HashingVectorizer']
  stemmers = ['RegexpStemmer', 'LancasterStemmer', 'PorterStemmer']
  for vectorizer in vectorizers:
    for stem in stemmers:
      train_features, train_labels = load_data.load_feature_data(0, test_train='train')
      len_train = len(train_labels)
      test_features, test_labels = load_data.load_feature_data(0, test_train='test')
      all_labels = np.append(train_labels, test_labels)
      ## Normalize customer features column-wise over the combined train and test sets
      all_features = np.vstack((train_features, test_features))
      float_feats = [[float(i) for i in row] for row in all_features]  # Convert values to floating point
      new_feats = preprocessing.normalize(float_feats, norm='l1', axis=0)
      train_features = new_feats[:len_train]
      test_features = new_feats[len_train:]
      ### Done normalizing
      train_email_features, test_email_features = load_data.load_email_data(0, vectorizer, stemmer=stem, vectorizer=vectorizer)
      train_subject_features, test_subject_features = load_data.load_subject_data(0, vectorizer, stemmer=stem, vectorizer=vectorizer)

      logging.info("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
      logging.info("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
      logging.info("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
      logging.info("VECTORIZER = %s" % vectorizer)
      logging.info("STEMMER = %s" % stem)
      logging.info("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
      logging.info("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
      logging.info("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
      
      logging.info("All Data Loaded")
      for t in ['both', 'all', 'subject', 'email', 'normal']:
        if t == 'email':
          trainer = train_email_features
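
Example 2 only sweeps stemmer and vectorizer names; how load_data combines them is not shown. One common way to wire an NLTK stemmer into a scikit-learn vectorizer is a custom analyzer, sketched below under that assumption (the sample documents are invented):

  from nltk.stem import PorterStemmer
  from sklearn.feature_extraction.text import TfidfVectorizer

  stemmer = PorterStemmer()
  base_analyzer = TfidfVectorizer().build_analyzer()

  def stemmed_analyzer(doc):
    # Run the stock analyzer (lowercasing, tokenizing), then stem each token
    return [stemmer.stem(token) for token in base_analyzer(doc)]

  vectorizer = TfidfVectorizer(analyzer=stemmed_analyzer)
  X = vectorizer.fit_transform(['running runs', 'ran running'])
  print(sorted(vectorizer.vocabulary_))  # stemmed vocabulary: ['ran', 'run']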
Example 3
            observed = classifier.classify(feats)
            testsets[observed].add(i)
  print('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
  print('pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos']))
  print('pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos']))
  print('neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg']))
  print('neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg']))
  classifier.show_most_informative_features()

def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=400):
  bigram_finder = BigramCollocationFinder.from_words(words)
  bigrams = bigram_finder.nbest(score_fn, n)
  return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])

if __name__ == "__main__":
  train_subject, test_subject = load_data.load_subject_data(0, extension='unVectorized', stemmer='PorterStemmer', vectorizer='unVectorized')
  train_email, test_email = load_data.load_email_data(0, extension='unVectorized', stemmer='PorterStemmer', vectorizer='unVectorized')
  train_features, train_labels = load_data.load_feature_data(0, test_train='train')
  test_features, test_labels = load_data.load_feature_data(0, test_train='test')
  train_email_features = [email.split(' ') for email in train_email]
  test_email_features = [email.split(' ') for email in test_email]
  train_subject_features = [subject.split(' ') for subject in train_subject]
  test_subject_features = [subject.split(' ') for subject in test_subject]
  for email_group in ['email', 'subject']:
    for wordType in ['single', 'bigram', 'trigram']:
      logging.info("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
      logging.info("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
      logging.info("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
      logging.info("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
      logging.info("Getting Word Stuff for %s and %s" % (email_group, wordType))
      if email_group == 'email':
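
best_bigram_word_feats in Example 3 builds an NLTK-style feature dict from every unigram plus the n bigrams scoring highest under the chi-squared collocation measure. A minimal usage sketch (the token list is invented; the definition is repeated so the sketch runs on its own):

  import itertools
  from nltk.collocations import BigramCollocationFinder
  from nltk.metrics import BigramAssocMeasures

  # Same definition as in Example 3
  def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=400):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])

  tokens = ['free', 'money', 'now', 'free', 'money']
  feats = best_bigram_word_feats(tokens, n=2)
  # Keys mix single words and the two best-scoring bigram tuples, all mapped to True
  print(sorted(feats, key=str))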
Example 4
    #     trainer = train_features
    #     test = test_features
    #     split_train, split_labels = segment_for_even_distribution(trainer, train_labels)
    #     logging.info("============================================================")
    #     logging.info("Grid Search SVM on Customer Info")
    #     svmTrainandPrintWithGridSearch(split_train, split_labels, test_email_features, test_labels)
    # # Split up the data into better parts

    # for stem in ['PorterStemmer', 'RegexpStemmer', 'LancasterStemmer']:
    #   for vectorizer in ['TfidfVectorizer', 'HashingVectorizer']:
    for n in [100, 1000, 10000, 100000, 1000000, 10000000, 100000000]:
        stem = "RegexpStemmer"
        vectorizer = "HashingVectorizer"
        logging.info("Loading Data for %s, %s, %i:" % (stem, vectorizer, n))
        train_email_features, test_email_features = load_data.load_email_data(
            0, str(n) + vectorizer, stemmer=stem, vectorizer=vectorizer, num_features=n
        )
        train_subject_features, test_subject_features = load_data.load_subject_data(
            0, str(n) + vectorizer, stemmer=stem, vectorizer=vectorizer, num_features=n
        )
        for textType in ["both"]:
            logging.info("Building Model for %s " % textType)
            if textType == "email":
                train, test = train_email_features, test_email_features
            elif textType == "subject":
                train, test = train_subject_features, test_subject_features
            elif textType == "both":
                train = scipy.sparse.hstack([train_subject_features, train_email_features])
                test = scipy.sparse.hstack([test_subject_features, test_email_features])
                train = scipy.sparse.csr_matrix(train)
                test = scipy.sparse.csr_matrix(test)
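
In the 'both' branch above, scipy.sparse.hstack lays the subject and email matrices side by side but returns a COO matrix, which supports no indexing; converting to CSR restores efficient row access before training. A toy sketch:

  import numpy as np
  import scipy.sparse

  # Toy stand-ins for the subject and email feature matrices
  subject = scipy.sparse.csr_matrix(np.array([[1, 0], [0, 2]]))
  email = scipy.sparse.csr_matrix(np.array([[0, 3], [4, 0]]))

  both = scipy.sparse.hstack([subject, email])
  print(both.format)  # prints 'coo': stacking returns a COO matrix

  both = scipy.sparse.csr_matrix(both)
  print(both[0].toarray())  # [[1 0 0 3]], CSR supports row slicing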