aff_word_num_in_title += 1 if is_a_in_b(aff_word, content): aff_word_num_in_content += 1 aff_word_proportion_in_title = float(aff_word_num_in_title) / float(len(self.affiliation_word_list) + 1) aff_word_proportion_in_content = float(aff_word_num_in_content) / float( len(self.affiliation_word_list) + 1 ) if tem_email_model.max_affiliation_proportion_in_title < aff_word_proportion_in_title: tem_email_model.max_affiliation_proportion_in_title = aff_word_proportion_in_title if tem_email_model.max_affiliation_proportion_in_content < aff_word_proportion_in_content: tem_email_model.max_affiliation_proportion_in_content = aff_word_proportion_in_content self.email_email_model_dict[email_addr] = tem_email_model def write_feature_file(person_dict_list, feature_filename): with open(feature_filename, "w") as feature_file: for person_dict in person_dict_list: person = Person(person_dict, "../resource/Top1000_mail_list/") if not person.get_right_email_list() or not person.google_item_dict_list: continue for email_addr, email_model in person.email_email_model_dict.items(): feature_file.write(email_model.get_feature_line() + "\n") if __name__ == "__main__": train_person_num = 200 top_person_list = get_top_person_names(250) write_feature_file(top_person_list[:train_person_num], "../svm_light/email/train.dat") write_feature_file(top_person_list[train_person_num:], "../svm_light/email/test.dat")
def get_train_test_file(self, test_start, test_end):
    """Write the train and test feature files for one held-out range.

    The list returned by ``get_top_person_names(self.tot_data_num)`` is
    split by position: entries ``[test_start, test_end)`` are written to
    the test file, and all other entries (kept in their original order)
    are written to the train file. Both file names are derived from
    ``test_start`` via ``self.get_train_file_name`` and
    ``self.get_test_file_name``.

    Args:
        test_start: Index of the first held-out entry; also used to name
            both output files and echoed in the progress message.
        test_end: Index one past the last held-out entry.
    """
    # A single pre-formatted argument prints the same text whether ``print``
    # is the Python 2 statement or the Python 3 function; the bare statement
    # form used previously is a SyntaxError on Python 3.
    print('writing train and test file %s' % (test_start,))

    top_person_list = get_top_person_names(self.tot_data_num)
    held_out = top_person_list[test_start:test_end]
    remainder = top_person_list[:test_start] + top_person_list[test_end:]

    write_feature_file(remainder, self.get_train_file_name(test_start))
    write_feature_file(held_out, self.get_test_file_name(test_start))