Code example #1
0
    def cal_sentence(self, sentence):
        '''Compute rule-based sentiment features for one complete sentence.

        The sentence is split into sub-sentence units on the sub-clause
        punctuation list, each unit is scored by ``cal_subsent``, and the
        per-unit results are aggregated into a single feature dict.
        '''
        result = {
            'score': 0,
            'pos_ct': 0,
            'neg_ct': 0,
            'pos_sub': 0,
            'neg_sub': 0,
            'deny_ct': 0,
            'degree_ct': 0,
            'score_detail': ''
        }

        # Split again into sub-sentence units.
        units = tools.cut_sentence(sentence, self.sub_puncs_list)
        result['sub_ct'] = len(units)  # number of sub-sentence units

        # Keys that are summed (or, for score_detail, concatenated)
        # straight from each sub-sentence result.
        summed_keys = ('score', 'pos_ct', 'neg_ct', 'degree_ct',
                       'score_detail')
        for unit in units:
            unit_res = self.cal_subsent(unit)
            for key in summed_keys:
                result[key] += unit_res[key]
            # Count sub-sentences with a strictly positive / negative score
            # (a score of exactly 0 counts toward neither).
            if unit_res['score'] > 0:
                result['pos_sub'] += 1
            elif unit_res['score'] < 0:
                result['neg_sub'] += 1

        return result
Code example #2
0
File: binmt.py  Project: miradel51/THUMT-1
    def get_inputs_batch(self, xp, yp, xm, ym):
        '''Build one mixed batch for semi-supervised NMT training.

        Combines real parallel sentence pairs with pseudo-pairs obtained by
        translating monolingual sentences in both directions (presumably a
        back-translation / dual-learning setup -- confirm against the
        training loop).

        :type xp: numpy array
        :param xp: the indexed source sentences in parallel corpus

        :type yp: numpy array
        :param yp: the indexed target sentences in parallel corpus

        :type xm: numpy array
        :param xm: the indexed source sentences in monolingual corpus

        :type ym: numpy array
        :param ym: the indexed target sentences in monolingual corpus

        :returns: (new_x, new_x_mask, new_y, new_y_mask, valid) --
            time-major, zero-padded index matrices with matching 0/1
            float masks, plus a float32 validity flag per pair.
        '''
        # preparation
        # The EOS indices double as the terminator searched for when
        # truncating sentences below; UNK indices pad missing samples.
        null_x = self.config['index_eos_src']
        null_y = self.config['index_eos_trg']
        unk_x = self.config['index_unk_src']
        unk_y = self.config['index_unk_trg']
        sample_n = self.config['sample_num']
        bs = self.config['batchsize']
        x = []
        y = []
        valid = []
        # trans_xs: translations of monolingual source -> target-side tokens
        # trans_ys: translations of monolingual target -> source-side tokens
        trans_xs = []
        trans_ys = []

        # Sample `sample_n` translation hypotheses per monolingual sentence
        # in each direction (columns of xm/ym are sentences).
        for i in range(bs):
            trans_x = self.fwd_nmt.translate(tools.cut_sentence(
                xm[:, i], null_x),
                                             sample_n,
                                             return_array=True)
            trans_y = self.bwd_nmt.translate(tools.cut_sentence(
                ym[:, i], null_y),
                                             sample_n,
                                             return_array=True)
            # Pad with single-UNK sentences when the translator returned
            # fewer than sample_n hypotheses, so indexing by
            # i * sample_n + j below stays aligned.
            while len(trans_x) < sample_n:
                trans_x.append([unk_y])
            while len(trans_y) < sample_n:
                trans_y.append([unk_x])
            for xx in trans_x:
                trans_xs.append(xx)
            for yy in trans_y:
                trans_ys.append(yy)

        # Parallel pairs: truncate each sentence at its first EOS
        # (inclusive); real parallel data is always marked valid.
        for i in range(bs):
            indx = numpy.where(xp[:, i] == null_x)[0][0]
            x.append(xp[:indx + 1, i])
            indx = numpy.where(yp[:, i] == null_y)[0][0]
            y.append(yp[:indx + 1, i])
            valid.append(1)

        # Pseudo-pairs from monolingual source sentences: pair each with its
        # sampled translations, capped at 80 tokens; validity is decided by
        # is_valid (semantics defined elsewhere in this class).
        for i in range(bs):
            now_x = xm[:numpy.where(xm[:, i] == null_x)[0][0] + 1, i]
            for j in range(sample_n):
                x.append(now_x)
                now_len = min(80, len(trans_xs[i * sample_n + j]))
                y.append(trans_xs[i * sample_n + j][:now_len])
                valid.append(
                    self.is_valid(trans_xs[i * sample_n + j], null_y, unk_y))

        # Pseudo-pairs from monolingual target sentences, mirror of above.
        for i in range(bs):
            now_y = ym[:numpy.where(ym[:, i] == null_y)[0][0] + 1, i]
            for j in range(sample_n):
                y.append(now_y)
                now_len = min(80, len(trans_ys[i * sample_n + j]))
                x.append(trans_ys[i * sample_n + j][:now_len])
                valid.append(
                    self.is_valid(trans_ys[i * sample_n + j], null_x, unk_x))

        # Pack the ragged sentence lists into zero-padded time-major
        # matrices with matching 0/1 masks (shape: max_len x n_pairs).
        max_x = max([len(xx) for xx in x])
        max_y = max([len(yy) for yy in y])
        valid = numpy.asarray(valid, dtype='float32')
        new_x = numpy.zeros((max_x, len(x)), dtype='int64')
        new_y = numpy.zeros((max_y, len(y)), dtype='int64')
        new_x_mask = numpy.zeros((max_x, len(x)), dtype='float32')
        new_y_mask = numpy.zeros((max_y, len(y)), dtype='float32')

        for i in range(len(x)):
            for j in range(len(x[i])):
                new_x[j][i] = x[i][j]
                new_x_mask[j][i] = 1.

        for i in range(len(y)):
            for j in range(len(y[i])):
                new_y[j][i] = y[i][j]
                new_y_mask[j][i] = 1.
        # NOTE(review): leftover debug output (Python 2 print statement) --
        # consider removing or routing through logging.
        print new_x.shape, new_x_mask.shape, new_y.shape, new_y_mask.shape, valid.shape
        return new_x, new_x_mask, new_y, new_y_mask, valid
Code example #3
0
    def cal_document(self, doc, normalize_opt='none'):
        '''Compute rule-based sentiment features for a whole document.

        The document is cut into sentences, each sentence is scored by
        ``cal_sentence``, and the per-sentence features are accumulated.
        ``normalize_opt`` selects the denominator used to normalize the
        accumulated score: 'senti_word_num' (positive + negative word
        count), 'sent_num', 'subsent_num', 'word_num', or 'none' (no
        normalization).  Emoticon features from ``cal_face`` are merged in
        and ``final_score`` = text score + face score.
        '''
        doc_dict = {
            'score': 0,
            'pos_ct': 0,
            'neg_ct': 0,
            'pos_sub': 0,
            'neg_sub': 0,
            'deny_ct': 0,
            'sub_ct': 0,
            'degree_ct': 0,
            'score_detail': ''
        }

        # Split the document into sentences.
        sentences = tools.cut_sentence(doc, self.puncs_list)

        doc_dict['doc_len'] = tools.cal_len(doc)
        doc_dict['word_num'] = len(doc)
        doc_dict['sent_num'] = len(sentences)  # number of sentences

        # Keys summed (score_detail: concatenated) from every sentence.
        summed_keys = ('score', 'pos_ct', 'neg_ct', 'pos_sub', 'neg_sub',
                       'sub_ct', 'degree_ct', 'score_detail')
        for sent in sentences:
            sent_res = self.cal_sentence(sent)
            for key in summed_keys:
                doc_dict[key] += sent_res[key]

        # Normalize the score by the denominator the option selects;
        # unknown options and 'none' leave the score untouched, as does a
        # zero denominator.
        denominators = {
            'senti_word_num': doc_dict['pos_ct'] + doc_dict['neg_ct'],
            'sent_num': doc_dict['sent_num'],
            'subsent_num': doc_dict['sub_ct'],
            'word_num': doc_dict['word_num'],
        }
        denom = denominators.get(normalize_opt)
        if denom:
            doc_dict['score'] = doc_dict['score'] / denom

        # Merge in emoticon features and combine the two scores.
        face_dict = self.cal_face(doc)
        doc_dict.update(face_dict)
        doc_dict['final_score'] = doc_dict['score'] + doc_dict['face_score']
        return doc_dict
Code example #4
0
            # Split the 3-channel target tensor into per-token columns:
            # rule index, parent index, parent timestep (presumably
            # tree-structured decoder targets -- confirm against the model).
            y_rule_idx, y_parent_idx, y_parent_t = y[:, :, 0], y[:, :,
                                                                 1], y[:, :, 2]

            # sample
            # Periodically log training progress and sample translations.
            if data.num_iter % config['sample_freq'] == 0:
                logging.info(
                    '%d iterations passed, %d sentences trained' %
                    (data.num_iter, data.num_iter * config['batchsize']))
                logging.info('sampling')
                if config['sample_sentence']:
                    # A fixed probe sentence is configured: translate it.
                    xs = data.toindex_source(
                        config['sample_sentence'].split(' '))
                    logging.info('source: %s' % data.print_source(xs))
                    sample, probs = model.sample(
                        tools.cut_sentence(xs, config['index_eos_src']),
                        config['sample_length'])
                    logging.info('output: %s\n' % data.print_target(sample[0]))
                else:
                    # Otherwise sample a few sentences from the current
                    # training batch (columns of x are sentences).
                    for i in range(min(x.shape[1], config['sample_times'])):
                        logging.info('source: %s' % data.print_source(x[:, i]))
                        logging.info('target: %s' % data.print_target(y[:, i]))
                        sample, probs = model.sample(
                            tools.cut_sentence(x[:, i],
                                               config['index_eos_src']),
                            config['sample_length'])
                        logging.info('output: %s\n' %
                                     data.print_target(sample[0]))

            # save checkpoint
            if data.num_iter % config['checkpoint_freq'] == 0: