Example 1
    def gen_train_samps(self, train_samp_params):
        fname_samps_train = train_samp_params['samp_dir'] + os.sep + 'train.samp'
        fname_term_set_train = train_samp_params['term_set_dir'] + os.sep + 'term.set'

        doc_class_list_train, doc_str_token_train, doc_terms_list_train, term_set_train = \
        self.gen_doc_terms_list(train_samp_params, train_opt = 1)
        pytc.save_term_set(term_set_train, fname_term_set_train)

        # unigram tokens, passed separately as input for building the other rule-based features
        doc_uni_token_train = pytc.gen_N_gram(doc_str_token_train, 'uni')

        # map terms and class names to 1-based integer ids
        term_dict = dict(zip(term_set_train, range(1, len(term_set_train) + 1)))
        class_dict = dict(zip(train_samp_params['class_name'], range(1, 1 + len(train_samp_params['class_name']))))

        if self.term_weight=='TFIDF':
            doc_num_train = len(doc_class_list_train)
            df_term_train = pytc.stat_df_term(term_set_train,doc_terms_list_train)
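            # presumably idf = log(doc_num / df), stored as a list parallel to term_set_train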
            idf_term_train = pytc.stat_idf_term(doc_num_train,df_term_train)
        else:
            idf_term_train = []

        train_embeddings = []

        print "building training samps......"
        samp_list_train, class_list_train = pytc.build_samps(term_dict, class_dict, doc_class_list_train,
        doc_terms_list_train, doc_uni_token_train, self.term_weight, self.rule_feature, idf_term_train, train_embeddings)
        print "saving training samps......"
        pytc.save_samps(samp_list_train, class_list_train, fname_samps_train)
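
A minimal usage sketch (hypothetical: the directory names and the 'builder' instance below are illustrative; only the dictionary keys are taken from the method body):

    # hypothetical parameter dict; the key names mirror those read inside
    # gen_train_samps and gen_doc_terms_list ('pos' / 'tag' file lists are
    # needed only when pos_gram / tag_gram are configured)
    train_samp_params = {
        'raw_dir': 'corpus' + os.sep + 'train',   # annotated raw data, one file per class
        'samp_dir': 'samps',                      # train.samp is written here
        'term_set_dir': 'samps',                  # term.set is written here
        'class_name': ['neg', 'pos'],             # class labels, mapped to ids 1..N
        'token': ['neg_fenci', 'pos_fenci'],      # tokenized input files, one per class
    }
    builder.gen_train_samps(train_samp_params)    # 'builder' is an instance of this feature class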
Example 2
    def gen_test_samps(self, test_samp_params):
        fname_term_set = test_samp_params['term_set_dir'] + os.sep + 'term.set'
        fname_samps_test = test_samp_params['samp_dir'] +os.sep+'test.samp'
        # if not os.path.isfile(fname_term_set):
        #     print "can't find term set file."
        #     return
        if not os.path.exists(test_samp_params['raw_dir']):
            print "test dir does not exist, creating an empty one."
            os.mkdir(test_samp_params['raw_dir'])

        doc_class_list_test, doc_str_token_test, doc_terms_list_test, term_set_test = \
        self.gen_doc_terms_list(test_samp_params, train_opt = 0)

        # unigram tokens, used for building the other rule-based features
        doc_uni_token_test = pytc.gen_N_gram(doc_str_token_test, 'uni')

        term_set_train = pytc.load_term_set(fname_term_set)
        # map terms and class names to the same 1-based ids used at training time
        term_dict = dict(zip(term_set_train, range(1, len(term_set_train) + 1)))
        class_dict = dict(zip(test_samp_params['class_name'], range(1, 1 + len(test_samp_params['class_name']))))
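        # class id 0 is reserved for documents labeled 'test' (unlabeled at prediction time)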
        class_dict['test'] = 0

        if self.term_weight=='TFIDF':
            doc_num_test = len(doc_class_list_test)
            df_term_test = pytc.stat_df_term(term_set_test,doc_terms_list_test)
            idf_term_test = pytc.stat_idf_term(doc_num_test,df_term_test)
        else:
            idf_term_test = []

        test_embeddings = []
        print "building testing samps......"
        samp_list_test, class_list_test = pytc.build_samps(term_dict, class_dict, doc_class_list_test,
        doc_terms_list_test, doc_uni_token_test, self.term_weight, self.rule_feature, idf_term_test, test_embeddings)
        print "saving testing samps......"
        pytc.save_samps(samp_list_test, class_list_test, fname_samps_test)
Example 3
    input_dir = 'corpus' + os.sep + 'unlabeled_data'
    # input_dir = 'corpus' + os.sep + 'book2'
    pos_fenci_lines = [
        re.sub('#.*?#', '', x.strip())
        for x in open(input_dir + os.sep + 'pos_fenci').readlines()
    ]
    neg_fenci_lines = [
        re.sub('#.*?#', '', x.strip())
        for x in open(input_dir + os.sep + 'neg_fenci').readlines()
    ]
    random.shuffle(pos_fenci_lines)
    # downsample the positive documents so the two classes are balanced
    pos_fenci_lines = pos_fenci_lines[:len(neg_fenci_lines)]

    for ngram in ngram_list:
        pos_terms_list = pytc.gen_N_gram(pos_fenci_lines, ngram)
        neg_terms_list = pytc.gen_N_gram(neg_fenci_lines, ngram)
        term_set = gen_term_set(pos_terms_list + neg_terms_list)

        term_pos_freq = {}.fromkeys(term_set, 0)  # term frequency (or document frequency) of each term in the positive class
        term_neg_freq = {}.fromkeys(term_set, 0)  # term frequency (or document frequency) of each term in the negative class

        pos_df = len(pos_terms_list)    # number of positive documents
        neg_df = len(neg_terms_list)    # number of negative documents

        print "pos_df=", pos_df
        print "neg_df=", neg_df

        for lst in pos_terms_list:
            if metric == 'DF':
                lst = list(set(lst))
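            # the snippet is truncated here; a plausible continuation (an
            # assumption, not the original code), accumulating TF, or DF
            # after the dedup above:
            for term in lst:
                term_pos_freq[term] += 1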
Example 4
    def gen_doc_terms_list(self, samp_params, train_opt = 0):
        doc_class_list = []

        # per-order n-gram term lists and vocabularies for each feature type,
        # keyed by gram name ('uni', 'bis', ...)
        doc_token_list, token_set = {}, {}
        doc_pos_list, pos_set = {}, {}
        doc_tag_list, tag_set = {}, {}
        doc_character_list, character_set = {}, {}


        doc_str_token, doc_class_list = pytc.read_annotated_data([samp_params['raw_dir'] + \
            os.sep + x for x in samp_params['token']], samp_params['class_name'])

        if len(self.pos_gram.keys()) > 0:
            doc_str_pos, doc_class_list = pytc.read_annotated_data([samp_params['raw_dir'] + os.sep + x \
                for x in samp_params['pos']], samp_params['class_name'])

        if len(self.tag_gram.keys()) > 0:
            doc_str_tag, doc_class_list = pytc.read_annotated_data([samp_params['raw_dir'] + os.sep + x \
                for x in samp_params['tag']], samp_params['class_name'])

        for gram_key in ['uni', 'bis', 'tri', 'quat', 'five', 'six']:
            if self.token_gram.has_key(gram_key):
                doc_token_list[gram_key] = pytc.gen_N_gram(doc_str_token, gram_key)
                token_set[gram_key] = pytc.get_term_set(doc_token_list[gram_key])
                params = self.token_gram[gram_key]

                # if we are in the training procedure
                if train_opt == 1 and len(doc_token_list[gram_key]) > 0:
                    print "token " + gram_key, "len(token_set)=",len(token_set[gram_key])
                    token_set[gram_key] = pytc.feature_selection_all(doc_token_list[gram_key], doc_class_list, samp_params['class_name'],
                        token_set[gram_key], params['fs_opt'], params['df'], params['fs_method'], params['fs_num'])
                    print "token " + gram_key, "after feature selection, len(token_set) =", len(token_set[gram_key])

            if self.pos_gram.has_key(gram_key):
                doc_pos_list[gram_key] = pytc.gen_N_gram(doc_str_pos, gram_key)
                pos_set[gram_key] = pytc.get_term_set(doc_pos_list[gram_key])
                params = self.pos_gram[gram_key]
                # if we are in the training procedure
                if train_opt == 1 and len(doc_pos_list[gram_key]) > 0:
                    print "pos " + gram_key, "len(pos_set)=",len(pos_set[gram_key])
                    pos_set[gram_key] = pytc.feature_selection_all(doc_pos_list[gram_key], doc_class_list, samp_params['class_name'],
                        pos_set[gram_key], params['fs_opt'], params['df'], params['fs_method'], params['fs_num'])
                    print "pos " + gram_key, "after feature selection, len(pos_set) =", len(pos_set[gram_key])

            if self.tag_gram.has_key(gram_key):
                doc_tag_list[gram_key] = pytc.gen_N_gram(doc_str_tag, gram_key)
                tag_set[gram_key] = pytc.get_term_set(doc_tag_list[gram_key])
                params = self.tag_gram[gram_key]

                # if we are in the training procedure
                if train_opt == 1 and len(doc_tag_list[gram_key]) > 0:
                    print "tag " + gram_key, "len(tag_set)=",len(tag_set[gram_key])
                    tag_set[gram_key] = pytc.feature_selection_all(doc_tag_list[gram_key], doc_class_list, samp_params['class_name'],
                        tag_set[gram_key], params['fs_opt'], params['df'], params['fs_method'], params['fs_num'])
                    print "tag " + gram_key, "after feature selection, len(tag_set) =", len(tag_set[gram_key])

            if self.character_gram.has_key(gram_key):
                doc_character_list[gram_key] = pytc.gen_character_ngram_list(doc_str_token, gram_key)
                character_set[gram_key] = pytc.get_term_set(doc_character_list[gram_key])
                params = self.character_gram[gram_key]

                # if we are in the training procedure
                if train_opt == 1 and len(doc_character_list[gram_key]) > 0:
                    print "character " + gram_key, "len(character_set)=",len(character_set[gram_key])
                    character_set[gram_key] = pytc.feature_selection_all(doc_character_list[gram_key], doc_class_list, samp_params['class_name'],
                        character_set[gram_key], params['fs_opt'], params['df'], params['fs_method'], params['fs_num'])
                    print "character " + gram_key, "after feature selection, len(character_set) =", len(character_set[gram_key])

        # merge the selected vocabularies and, per document, concatenate the
        # corresponding term lists, in a fixed gram order
        doc_terms_list, term_set = [], []
        for gram_key in ['uni', 'bis', 'tri', 'quat', 'five', 'six']:
            if self.token_gram.has_key(gram_key):
                term_set += token_set[gram_key]
                if len(doc_terms_list) == 0:
                    doc_terms_list = doc_token_list[gram_key]
                else:
                    pytc.get_joint_sets(doc_terms_list, doc_token_list[gram_key])

            if self.pos_gram.has_key(gram_key):
                term_set += pos_set[gram_key]
                if len(doc_terms_list) == 0:
                    doc_terms_list = doc_pos_list[gram_key]
                else:
                    pytc.get_joint_sets(doc_terms_list, doc_pos_list[gram_key])

            if self.tag_gram.has_key(gram_key):
                term_set += tag_set[gram_key]
                if len(doc_terms_list) == 0:
                    doc_terms_list = doc_tag_list[gram_key]
                else:
                    pytc.get_joint_sets(doc_terms_list, doc_tag_list[gram_key])

            if self.character_gram.has_key(gram_key):
                term_set += character_set[gram_key]
                if len(doc_terms_list) == 0:
                    doc_terms_list = doc_character_list[gram_key]
                else:
                    pytc.get_joint_sets(doc_terms_list, doc_character_list[gram_key])


        return doc_class_list, doc_str_token, doc_terms_list, term_set
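
The merge loop above relies on pytc.get_joint_sets mutating doc_terms_list in place. A minimal sketch of the assumed semantics (an illustration, not the actual pytc implementation):

    def get_joint_sets(doc_terms_list, doc_terms_list_new):
        # assumed behavior: extend each document's term list with the
        # corresponding list from the next n-gram order, in place
        for i in range(len(doc_terms_list)):
            doc_terms_list[i] += doc_terms_list_new[i]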
Example 5
    for test_data_name in test_data_list:
        test_dir = base_dir + os.sep + test_data_name
        output_dir = test_dir + os.sep + 'rule_distant'
        if not os.path.exists(output_dir):
            os.mkdir(output_dir)

        fenci_fname = 'test_fenci'
        rule_score_fname = 'rule_score.txt'
        rule_result_fname = 'rule_result.txt'

        test_fenci_lines = [x.strip() for x in open(test_dir + os.sep + fenci_fname).readlines()]
        print "一共" + str(len(test_fenci_lines)) + "篇文档......"

        final_score = [0.0] * len(test_fenci_lines)

        terms_list = pytc.gen_N_gram(test_fenci_lines, 'uni')
        for j in range(len(terms_list)):
            score = distant_dict_score(terms_list[j], senti_dict)
            final_score[j] += score

        tools.write_score_file(final_score, output_dir + os.sep + rule_score_fname)

        num = 0  # decision threshold (presumably score > num -> 'pos', else 'neg')
        tools.classify_2_way(output_dir + os.sep + rule_score_fname, output_dir + os.sep + rule_result_fname, num)

        # performance evaluation
        result = [x.strip() for x in open(output_dir + os.sep + rule_result_fname).readlines()]
        label = [x.strip() for x in open(test_dir + os.sep + 'test_label').readlines()]
        class_dict = {'1': 'neg', '2': 'pos'}
        result_dict = performance.demo_performance(result, label, class_dict)
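
For reference, a minimal sketch of a dictionary-based scorer consistent with how distant_dict_score is called above (an assumption; the project's real implementation may differ):

    def distant_dict_score(term_list, senti_dict):
        # sum the sentiment weights of the terms found in the dictionary;
        # a positive total leans 'pos', a negative total leans 'neg'
        return sum(senti_dict.get(term, 0.0) for term in term_list)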