# Example #1
def buildEmailText(requests, rebuild = 1, stemmer='PorterStemmer', vectorizer='TfidfVectorizer', num_features = None):
  """Load raw e-mail text (from the chunk files or the pickle cache), align it
  with the given requests, and forward the texts to preProcessText.

  Parameters:
    requests     -- request records; column 0 is taken as the request id
                    (see select_data(requests, 0) below).
    rebuild      -- truthy: re-read the email_text chunk files and refresh the
                    pickle cache; falsy: load the cached pickle instead.
    stemmer, vectorizer, num_features -- forwarded unchanged to preProcessText.

  Returns whatever preProcessText returns for the assembled list of texts.
  """
  rawText = {}
  if rebuild:
    # NOTE(review): the chunk count (54) is hard-coded -- confirm it matches
    # the number of email_text_tmp_test_*.txt files on disk (the __main__
    # block below iterates only 44).
    for i in range(0, 54):
      data_file = data_directory + 'email_text/email_text_tmp_test_' + str(i) + '.txt'
      print("Reading data from %s" % data_file)
      emails = read_email_file(data_file)
      rawText.update(emails)
    # 'with' guarantees the handle is closed even if pickling raises.
    with open('pickle_data/rawEmailText.pkl', 'w') as f:
      pickle.dump(rawText, f)
  else:
    with open('pickle_data/rawEmailText.pkl', 'r') as f:
      rawText = pickle.load(f)
  # Build one [req_id] row per request, then let append_data attach the
  # matching e-mail text as a second column.
  requests_only = select_data(requests, 0)
  requests_only = [[int(req_id)] for req_id in requests_only]
  append_data(requests_only, rawText)
  # Requests with no matching e-mail text get an empty string so the
  # row[1] indexing below never fails.
  for row in requests_only:
    if len(row) < 2:
      row.append("")
  rawText = [row[1] for row in requests_only]
  print("New Raw Text Array %i" % len(rawText))
  print("Got data from files: ")
  print(len(rawText))
  return preProcessText(rawText, stemmer=stemmer, vectorizer=vectorizer, num_features=num_features)
        # NOTE(review): orphaned fragment -- the `def` header, docstring, the
        # initialisation of `stemmed_words`/`word_list`, and the first branches
        # of this stemmer-selection chain (e.g. the 'PorterStemmer' case) are
        # missing above this line; the file appears truncated/spliced here.
        # Restore the missing header before using this code.
        stemmer = LancasterStemmer()
    elif stemmer_type == 'RegexpStemmer':
        # Regex stemmer: strips trailing 'ing', 's', or 'e' from words of
        # length >= 3 (min=3 protects very short tokens).
        stemmer = RegexpStemmer('ing$|s$|e$', min=3)

    # Apply the selected stemmer to every token and return the stemmed list.
    for word in word_list:
        stemmed_words.append(stemmer.stem(word))
    return stemmed_words



if __name__ == "__main__":
  # Tokenize, lowercase, stop-word-filter, and stem every e-mail from the
  # chunk files, then pickle the cleaned text keyed by request id.
  final_out = {}
  # NOTE(review): chunk count (44) is hard-coded -- confirm it matches the
  # number of email_text_tmp_test_*.txt files (buildEmailText above uses 54).
  for i in range(0, 44):
    data_file = data_directory + 'email_text/email_text_tmp_test_' + str(i) + '.txt'
    print("Reading data from %s" % data_file)
    emails = read_email_file(data_file)
    for req_id in emails:
      text = emails[req_id]
      tokens = word_tokenize(text)
      tokens = [str(t).lower() for t in tokens]
      # presumably mutates the token list in place -- verify remove_stopwords
      remove_stopwords(tokens)
      stemmed_words = stemming(tokens, 'PorterStemmer')
      final_out[req_id] = ' '.join(stemmed_words)
      # (removed dead `i += 1`: rebinding a for-loop variable has no effect
      # on the iteration and only misleads the reader)

  # 'with' closes the handle even if pickling raises.
  with open('testEmailText.pkl', 'w') as f:
    pickle.dump(final_out, f)