Beispiel #1
0
                        aff_word_num_in_title += 1
                    if is_a_in_b(aff_word, content):
                        aff_word_num_in_content += 1
                aff_word_proportion_in_title = float(aff_word_num_in_title) / float(len(self.affiliation_word_list) + 1)
                aff_word_proportion_in_content = float(aff_word_num_in_content) / float(
                    len(self.affiliation_word_list) + 1
                )
                if tem_email_model.max_affiliation_proportion_in_title < aff_word_proportion_in_title:
                    tem_email_model.max_affiliation_proportion_in_title = aff_word_proportion_in_title
                if tem_email_model.max_affiliation_proportion_in_content < aff_word_proportion_in_content:
                    tem_email_model.max_affiliation_proportion_in_content = aff_word_proportion_in_content

                self.email_email_model_dict[email_addr] = tem_email_model


def write_feature_file(person_dict_list, feature_filename):
    """Write one feature line per email model to ``feature_filename``.

    A ``Person`` is built for every entry of ``person_dict_list``. A person is
    skipped when ``get_right_email_list()`` returns a falsy value or when
    ``google_item_dict_list`` is falsy. For every remaining person, each model
    stored in ``email_email_model_dict`` contributes the string returned by
    ``get_feature_line()`` followed by a newline.

    Args:
        person_dict_list: Iterable of person records handed to ``Person``.
        feature_filename: Path of the output file. It is opened in ``"w"``
            mode, so existing content is replaced and the file is created
            even when no line ends up being written.
    """
    with open(feature_filename, "w") as feature_file:
        for person_dict in person_dict_list:
            person = Person(person_dict, "../resource/Top1000_mail_list/")
            if not person.get_right_email_list() or not person.google_item_dict_list:
                continue
            # Only the models are written; the address keys are not needed here.
            for email_model in person.email_email_model_dict.values():
                feature_file.write(email_model.get_feature_line() + "\n")


if __name__ == "__main__":
    # The list returned by get_top_person_names(250) is split at index
    # train_person_num: the head is written to train.dat, the tail to test.dat.
    train_person_num = 200
    top_person_list = get_top_person_names(250)
    for person_slice, out_path in (
        (top_person_list[:train_person_num], "../svm_light/email/train.dat"),
        (top_person_list[train_person_num:], "../svm_light/email/test.dat"),
    ):
        write_feature_file(person_slice, out_path)
Beispiel #2
0
 def get_train_test_file(self, test_start, test_end):
     """Write the train and test feature files for one held-out window.

     The list returned by ``get_top_person_names(self.tot_data_num)`` is
     split by slicing: ``[test_start:test_end]`` goes to the test file and
     everything outside that slice goes to the train file. Both file names
     are obtained from ``test_start`` via ``get_train_file_name`` and
     ``get_test_file_name``.
     """
     # Single-argument parenthesised form: valid on both Python 2 and 3 and
     # emits the same text as the former ``print a, b`` statement.
     print('writing train and test file %s' % (test_start,))
     top_person_list = get_top_person_names(self.tot_data_num)
     train_person_list = top_person_list[:test_start] + top_person_list[test_end:]
     write_feature_file(train_person_list, self.get_train_file_name(test_start))
     test_person_list = top_person_list[test_start:test_end]
     write_feature_file(test_person_list, self.get_test_file_name(test_start))