def cal_sentence(self, sentence):
    """Compute rule-based sentiment features for one full sentence.

    The sentence is split into sub-sentence units and the per-unit
    features produced by ``cal_subsent`` are accumulated into one dict.

    :param sentence: one sentence (as produced by the document-level split)
    :return: dict of accumulated feature counts and the summed score
    """
    result = {
        'score': 0,
        'pos_ct': 0,
        'neg_ct': 0,
        'pos_sub': 0,       # sub-sentences with a positive score
        'neg_sub': 0,       # sub-sentences with a negative score
        'deny_ct': 0,       # initialised only; not accumulated from sub-units here
        'degree_ct': 0,
        'score_detail': '',
    }
    # Split once more, into sub-sentence units.
    units = tools.cut_sentence(sentence, self.sub_puncs_list)
    result['sub_ct'] = len(units)  # number of sub-sentence units
    for unit in units:
        feats = self.cal_subsent(unit)
        for key in ('score', 'pos_ct', 'neg_ct', 'degree_ct', 'score_detail'):
            result[key] += feats[key]
        # A unit's score can only be positive OR negative, never both.
        if feats['score'] > 0:
            result['pos_sub'] += 1
        elif feats['score'] < 0:
            result['neg_sub'] += 1
    return result
def get_inputs_batch(self, xp, yp, xm, ym): ''' Getting a batch for semi-supervised training. :type xp: numpy array :param xp: the indexed source sentences in parallel corpus :type yp: numpy array :param yp: the indexed target sentences in parallel corpus :type xm: numpy array :param xm: the indexed source sentences in monolingual corpus :type ym: numpy array :param ym: the indexed target sentences in monolingual corpus ''' # preparation null_x = self.config['index_eos_src'] null_y = self.config['index_eos_trg'] unk_x = self.config['index_unk_src'] unk_y = self.config['index_unk_trg'] sample_n = self.config['sample_num'] bs = self.config['batchsize'] x = [] y = [] valid = [] trans_xs = [] trans_ys = [] for i in range(bs): trans_x = self.fwd_nmt.translate(tools.cut_sentence( xm[:, i], null_x), sample_n, return_array=True) trans_y = self.bwd_nmt.translate(tools.cut_sentence( ym[:, i], null_y), sample_n, return_array=True) while len(trans_x) < sample_n: trans_x.append([unk_y]) while len(trans_y) < sample_n: trans_y.append([unk_x]) for xx in trans_x: trans_xs.append(xx) for yy in trans_y: trans_ys.append(yy) for i in range(bs): indx = numpy.where(xp[:, i] == null_x)[0][0] x.append(xp[:indx + 1, i]) indx = numpy.where(yp[:, i] == null_y)[0][0] y.append(yp[:indx + 1, i]) valid.append(1) for i in range(bs): now_x = xm[:numpy.where(xm[:, i] == null_x)[0][0] + 1, i] for j in range(sample_n): x.append(now_x) now_len = min(80, len(trans_xs[i * sample_n + j])) y.append(trans_xs[i * sample_n + j][:now_len]) valid.append( self.is_valid(trans_xs[i * sample_n + j], null_y, unk_y)) for i in range(bs): now_y = ym[:numpy.where(ym[:, i] == null_y)[0][0] + 1, i] for j in range(sample_n): y.append(now_y) now_len = min(80, len(trans_ys[i * sample_n + j])) x.append(trans_ys[i * sample_n + j][:now_len]) valid.append( self.is_valid(trans_ys[i * sample_n + j], null_x, unk_x)) max_x = max([len(xx) for xx in x]) max_y = max([len(yy) for yy in y]) valid = numpy.asarray(valid, dtype='float32') 
new_x = numpy.zeros((max_x, len(x)), dtype='int64') new_y = numpy.zeros((max_y, len(y)), dtype='int64') new_x_mask = numpy.zeros((max_x, len(x)), dtype='float32') new_y_mask = numpy.zeros((max_y, len(y)), dtype='float32') for i in range(len(x)): for j in range(len(x[i])): new_x[j][i] = x[i][j] new_x_mask[j][i] = 1. for i in range(len(y)): for j in range(len(y[i])): new_y[j][i] = y[i][j] new_y_mask[j][i] = 1. print new_x.shape, new_x_mask.shape, new_y.shape, new_y_mask.shape, valid.shape return new_x, new_x_mask, new_y, new_y_mask, valid
def cal_document(self, doc, normalize_opt='none'):
    """Compute rule-based sentiment features for a whole document.

    The document is split into sentences; per-sentence features from
    ``cal_sentence`` are accumulated, the score is optionally normalised,
    and emoticon features from ``cal_face`` are merged in.

    :param doc: the document (split into sentences via tools.cut_sentence;
        ``len(doc)`` is recorded as word_num — presumably doc is a token
        sequence, TODO confirm)
    :param normalize_opt: one of 'senti_word_num', 'sent_num',
        'subsent_num', 'word_num', 'none' — what to divide the score by
    :return: dict of accumulated features, including 'final_score'
    """
    document_dict = {
        'score': 0,
        'pos_ct': 0,
        'neg_ct': 0,
        'pos_sub': 0,
        'neg_sub': 0,
        'deny_ct': 0,   # initialised only; not accumulated from sentences
        'sub_ct': 0,
        'degree_ct': 0,
        'score_detail': ''
    }
    # split into sentences
    sentence_list = tools.cut_sentence(doc, self.puncs_list)
    document_dict['doc_len'] = tools.cal_len(doc)
    document_dict['word_num'] = len(doc)
    document_dict['sent_num'] = len(sentence_list)  # number of sentences
    for sent in sentence_list:
        feats = self.cal_sentence(sent)
        for key in ('score', 'pos_ct', 'neg_ct', 'pos_sub', 'neg_sub',
                    'sub_ct', 'degree_ct', 'score_detail'):
            document_dict[key] += feats[key]
    # Normalisation options are mutually exclusive.  float() guards
    # against Python-2 integer truncation when every accumulated
    # sub-score happens to be an int; a no-op for float scores.
    if normalize_opt == 'senti_word_num':
        senti_ct = document_dict['pos_ct'] + document_dict['neg_ct']
        if senti_ct > 0:
            document_dict['score'] = float(document_dict['score']) / senti_ct
    elif normalize_opt == 'sent_num':
        if document_dict['sent_num'] > 0:
            document_dict['score'] = float(
                document_dict['score']) / document_dict['sent_num']
    elif normalize_opt == 'subsent_num':
        if document_dict['sub_ct'] > 0:
            document_dict['score'] = float(
                document_dict['score']) / document_dict['sub_ct']
    elif normalize_opt == 'word_num':
        if document_dict['word_num'] > 0:
            document_dict['score'] = float(
                document_dict['score']) / document_dict['word_num']
    # 'none' (or any unknown option): leave the raw accumulated score.
    # merge emoticon features and combine into the final score
    doc_face_dict = self.cal_face(doc)
    document_dict.update(doc_face_dict)
    document_dict['final_score'] = document_dict['score'] + document_dict[
        'face_score']
    return document_dict
y_rule_idx, y_parent_idx, y_parent_t = y[:, :, 0], y[:, :, 1], y[:, :, 2] # sample if data.num_iter % config['sample_freq'] == 0: logging.info( '%d iterations passed, %d sentences trained' % (data.num_iter, data.num_iter * config['batchsize'])) logging.info('sampling') if config['sample_sentence']: xs = data.toindex_source( config['sample_sentence'].split(' ')) logging.info('source: %s' % data.print_source(xs)) sample, probs = model.sample( tools.cut_sentence(xs, config['index_eos_src']), config['sample_length']) logging.info('output: %s\n' % data.print_target(sample[0])) else: for i in range(min(x.shape[1], config['sample_times'])): logging.info('source: %s' % data.print_source(x[:, i])) logging.info('target: %s' % data.print_target(y[:, i])) sample, probs = model.sample( tools.cut_sentence(x[:, i], config['index_eos_src']), config['sample_length']) logging.info('output: %s\n' % data.print_target(sample[0])) # save checkpoint if data.num_iter % config['checkpoint_freq'] == 0: