def next_batch(self):
    '''
    Build the next mini-batch and return it as a dict of model inputs.

    Increments ``self.nN``, fetches the next slice of samples through
    ``batch_range`` (which also yields the new ``self.idx``), pads the text,
    left-context, right-context and aspect index lists with ``self.pad_idx``
    via ``add_pad`` and derives lengths, bitmaps, position features and flat
    indices from them.

    NOTE(review): ``add_pad`` is called only for its effect on the lists that
    are passed in (its return value is ignored) -- presumably it pads them in
    place; confirm against its definition.

    Returns:
        dict with the keys ``text_idxes``, ``left_ctx_idxes``,
        ``right_ctx_idxes``, ``batch_ids``, ``aspect_idxes``, ``labels``,
        ``text_lens``, ``left_lens``, ``right_lens``, ``aspect_lens``,
        ``text_reverse_lens``, ``aspect_subs``, ``left_subs``,
        ``right_subs``, ``text_bitmap``, ``left_bitmap``, ``right_bitmap``,
        ``f_asp_sub``, ``b_asp_sub``, ``f_left_sub``, ``b_left_sub``,
        ``f_right_sub``, ``b_right_sub``, ``alpha_adj``, ``abs_poses``,
        ``left_abs_poses``, ``right_abs_poses``, ``pos_ids``,
        ``left_pos_ids`` and ``right_pos_ids``.
    '''

    def shifted_ids(distances, start_id):
        # Map the distances 1, 2, ... onto the ids start_id, start_id + 1, ...
        # (same numpy round-trip as before, so element types are unchanged).
        ids = np.array(distances)
        ids += start_id - 1
        return ids.tolist()

    def padded_span(bias, span, width, filler):
        # Flat indices bias+span[0] .. bias+span[1]-1, right-filled with
        # `filler` up to `width` entries (never truncated).
        idxs = list(range(bias + span[0], bias + span[1]))
        idxs.extend([filler] * (width - len(idxs)))
        return idxs

    self.nN = self.nN + 1
    # The seventh value returned by batch_range is not used by this method.
    rins, lab, ret_ids, rinlens, rmaxlens, self.idx, _ = batch_range(
        self.batch_size,
        self.idx,
        self.nsamps,
        self.rand_idx,
        self.class_num,
        self.labels,
        self.ids,
        [self.texts, self.aspsubs, self.aspects, self.left_ctx,
         self.right_ctx, self.leftsubs, self.rightsubs]
    )
    fsents = rins[0]
    asubs = rins[1]
    asps = rins[2]
    left_ctx = rins[3]
    right_ctx = rins[4]
    lsubs = rins[5]
    rsubs = rins[6]

    # Raw sentence lengths; text_lens counts one more position when eos is on.
    reverse_lens = [entry[0] for entry in rinlens[0]]
    if self.eos:
        seq_lens = [raw_len + 1 for raw_len in reverse_lens]
    else:
        seq_lens = list(reverse_lens)
    left_seq_lens = [entry[0] for entry in rinlens[3]]
    right_seq_lens = [entry[0] for entry in rinlens[4]]

    # Texts get one slot more than the longest sentence.
    max_len = rmaxlens[0] + 1
    left_max_len = rmaxlens[3]
    right_max_len = rmaxlens[4]
    add_pad(
        inputs=[fsents, left_ctx, right_ctx],
        max_lens=[max_len, left_max_len, right_max_len],
        pad_idx=self.pad_idx
    )

    sent_bitmap = bitmap_by_padid(fsents, self.pad_idx)
    left_sent_bitmap = bitmap_by_padid(left_ctx, self.pad_idx)
    right_sent_bitmap = bitmap_by_padid(right_ctx, self.pad_idx)

    # alpha_adj: per row, leading 1.0 entries become 0.0 and the first entry
    # that is not 1.0 becomes 1.0; the rest of the row is left as copied.
    # Only the text variant is returned, so only that one is built.
    alpha_adj = copy.deepcopy(sent_bitmap)
    for row in alpha_adj:
        for col, flag in enumerate(row):
            if flag == 1.0:
                row[col] = 0.0
            else:
                row[col] = 1.0
                break

    left_start = self.pos_id_range['left_start']
    right_start = self.pos_id_range['right_start']
    asp_pid = self.pos_id_range['asp']
    oos_pid = self.pos_id_range['oos']

    abs_poses = []
    left_abs_poses = []
    right_abs_poses = []
    pos_ids = []
    left_pos_ids = []
    right_pos_ids = []
    asp_lens = []
    asp_size = 0
    for x, sl in enumerate(seq_lens):
        asub = asubs[x]
        al = asub[1] - asub[0]
        asp_lens.append(al)
        if al > asp_size:
            asp_size = al

        # Distances to the aspect: asub[0], ..., 1 on the left side and
        # 1, ..., sl - asub[1] on the right side.
        lt_pos = list(range(asub[0], 0, -1))
        rt_pos = list(range(1, sl - asub[1] + 1))
        lt_pid = shifted_ids(lt_pos, left_start)
        rt_pid = shifted_ids(rt_pos, right_start)

        # Number of filler entries for the text / left / right variants.
        text_pad = max_len - sl
        left_pad = left_max_len - left_seq_lens[x]
        right_pad = right_max_len - right_seq_lens[x]

        abs_poses.append(lt_pos + [0] * al + rt_pos + [0] * text_pad)
        left_abs_poses.append(lt_pos + [0] * left_pad)
        right_abs_poses.append(rt_pos + [0] * right_pad)

        pos_ids.append(
            lt_pid + [asp_pid] * al + rt_pid + [oos_pid] * text_pad
        )
        left_pos_ids.append(lt_pid + [oos_pid] * left_pad)
        right_pos_ids.append(rt_pid + [oos_pid] * right_pad)

    add_pad(
        inputs=[asps],
        max_lens=[asp_size],
        pad_idx=self.pad_idx
    )

    # Flat indices into the concatenation of all padded sentences: sentence k
    # starts at k * max_len and its last slot is used as the filler index.
    asp_subs = []
    left_subs = []
    right_subs = []
    f_asp_subs = []
    b_asp_subs = []
    f_left_subs = []
    b_left_subs = []
    f_right_subs = []
    b_right_subs = []
    for k in range(len(fsents)):
        bias = k * max_len
        filler = bias + max_len - 1
        asub = asubs[k]
        lsub = lsubs[k]
        rsub = rsubs[k]

        # f_* is the last index of each span, b_* the first.
        f_asp_subs.append(bias + asub[1] - 1)
        b_asp_subs.append(bias + asub[0])
        f_left_subs.append(bias + lsub[1] - 1)
        b_left_subs.append(bias + lsub[0])
        f_right_subs.append(bias + rsub[1] - 1)
        b_right_subs.append(bias + rsub[0])

        asp_subs.append(padded_span(bias, asub, asp_size, filler))
        left_subs.append(padded_span(bias, lsub, left_max_len, filler))
        right_subs.append(padded_span(bias, rsub, right_max_len, filler))

    ret_data = {
        'text_idxes': fsents,
        'left_ctx_idxes': left_ctx,
        'right_ctx_idxes': right_ctx,
        'batch_ids': ret_ids,
        'aspect_idxes': asps,
        'labels': lab,
        'text_lens': seq_lens,
        'left_lens': left_seq_lens,
        'right_lens': right_seq_lens,
        'aspect_lens': asp_lens,
        'text_reverse_lens': reverse_lens,
        # indices of the padded aspect inside the concatenation of all
        # padded sentences
        'aspect_subs': asp_subs,
        'left_subs': left_subs,
        'right_subs': right_subs,
        'text_bitmap': sent_bitmap,
        'left_bitmap': left_sent_bitmap,
        'right_bitmap': right_sent_bitmap,
        'f_asp_sub': f_asp_subs,
        'b_asp_sub': b_asp_subs,
        'f_left_sub': f_left_subs,
        'b_left_sub': b_left_subs,
        'f_right_sub': f_right_subs,
        'b_right_sub': b_right_subs,
        'alpha_adj': alpha_adj,
        'abs_poses': abs_poses,
        'left_abs_poses': left_abs_poses,
        'right_abs_poses': right_abs_poses,
        'pos_ids': pos_ids,
        'left_pos_ids': left_pos_ids,
        'right_pos_ids': right_pos_ids
    }
    return ret_data
def get_data(sample, class_num, pad_idx, eos):
    '''
    Wrap a single sample as a batch-of-one dict of model inputs.

    Args:
        sample: object providing ``id``, ``text_idxes``,
            ``left_context_idxes``, ``right_context_idxes``,
            ``aspect_idxes``, ``aspect_wordpos``, ``left_wordpos`` and
            ``label``.
        class_num: length of the one-hot label vector.
        pad_idx: padding index handed to ``add_pad`` / ``bitmap_by_padid``.
        eos: accepted for interface compatibility; not used in this function.

    Returns:
        dict whose values are one-element lists (one entry per sample).

    NOTE(review): the sample's own list objects are placed in the result and
    passed to ``add_pad`` without copying; if ``add_pad`` pads in place the
    sample's left/right context lists are modified -- confirm.
    '''
    ids = [sample.id]

    texts = [sample.text_idxes]
    seq_lens = [len(sample.text_idxes)]

    left_ctx = [sample.left_context_idxes]
    left_seq_lens = [len(sample.left_context_idxes)]
    # Left context followed by the aspect, stored back to front.
    reversed_left_asp = sample.left_context_idxes + sample.aspect_idxes
    reversed_left_asp.reverse()
    left_ctx_asp = [reversed_left_asp]

    right_ctx = [sample.right_context_idxes]
    right_seq_lens = [len(sample.right_context_idxes)]
    right_ctx_asp = [sample.aspect_idxes + sample.right_context_idxes]

    aspects = [sample.aspect_idxes]
    asp_lens = [len(sample.aspect_idxes)]
    # Collected as before, although the 'aspect_subs' entry is not returned.
    aspsubs = [sample.aspect_wordpos]

    # Left positions run 0 .. left_wordpos[1]-1; right positions follow the
    # aspect inside the "aspect + right context" sequence.
    leftsubs = [range(sample.left_wordpos[1])]
    rightsubs = [
        range(len(sample.aspect_idxes),
              len(sample.right_context_idxes) + len(sample.aspect_idxes))
    ]

    # One-hot label.
    one_hot = [0.0] * class_num
    one_hot[sample.label] = 1.0
    labels = [one_hot]

    # Lengths above were taken before padding; each context gets one extra
    # slot here.
    add_pad(
        inputs=[left_ctx, right_ctx],
        max_lens=[left_seq_lens[0] + 1, right_seq_lens[0] + 1],
        pad_idx=pad_idx
    )

    text_bitmap = bitmap_by_padid(texts, pad_idx)
    left_bitmap = bitmap_by_padid(left_ctx, pad_idx)
    right_bitmap = bitmap_by_padid(right_ctx, pad_idx)

    return {
        'text_idxes': texts,
        'left_ctx_idxes': left_ctx,
        'right_ctx_idxes': right_ctx,
        'left_ctx_asp': left_ctx_asp,
        'right_ctx_asp': right_ctx_asp,
        'batch_ids': ids,
        'aspect_idxes': aspects,
        'labels': labels,
        'text_lens': seq_lens,
        'left_lens': left_seq_lens,
        'right_lens': right_seq_lens,
        'aspect_lens': asp_lens,
        # 'aspect_subs': asp_subs,  # indices of the padded aspect inside the
        #                           # concatenation of all padded sentences
        'left_subs': leftsubs,
        'right_subs': rightsubs,
        'text_bitmap': text_bitmap,
        'left_bitmap': left_bitmap,
        'right_bitmap': right_bitmap
    }
def next_batch(self):
    '''
    Build the next mini-batch (texts and aspects only) as a dict.

    Increments ``self.nN``, obtains the next slice of samples and the new
    ``self.idx`` from ``batch_range``, pads texts and aspects with
    ``self.pad_idx`` through ``add_pad`` and derives lengths, the text
    bitmap, ``alpha_adj`` and the flat aspect indices.

    Returns:
        dict with the keys ``text_idxes``, ``batch_ids``, ``aspect_idxes``,
        ``labels``, ``text_lens``, ``text_lens_float32``, ``aspect_lens``,
        ``aspect_lens_float32``, ``text_reverse_lens``, ``aspect_subs``,
        ``text_bitmap``, ``f_asp_sub``, ``b_asp_sub`` and ``alpha_adj``.
    '''
    self.nN = self.nN + 1
    (rins, lab, ret_ids, rinlens, rmaxlens,
     self.idx, rinlens_float32) = batch_range(
        self.batch_size,
        self.idx,
        self.nsamps,
        self.rand_idx,
        self.class_num,
        self.labels,
        self.ids,
        [self.texts, self.aspsubs, self.aspects]
    )
    fsents, asubs, asps = rins[0], rins[1], rins[2]

    # Raw lengths, and lengths counting one more position when eos is on.
    reverse_lens = [entry[0] for entry in rinlens[0]]
    if self.eos:
        seq_lens = [raw_len + 1 for raw_len in reverse_lens]
    else:
        seq_lens = list(reverse_lens)

    # Texts get one slot more than the longest sentence.
    max_len = rmaxlens[0] + 1
    add_pad(inputs=[fsents], max_lens=[max_len], pad_idx=self.pad_idx)
    sent_bitmap = bitmap_by_padid(fsents, self.pad_idx)

    # Per row: leading 1.0 entries become 0.0, the first other entry becomes
    # 1.0, and the remainder of the row stays as copied.
    alpha_adj = copy.deepcopy(sent_bitmap)
    for row in alpha_adj:
        for col, flag in enumerate(row):
            if flag != 1.0:
                row[col] = 1.0
                break
            row[col] = 0.0

    # Aspect span lengths and the widest span in the batch.
    asp_lens = [asubs[x][1] - asubs[x][0] for x in range(len(seq_lens))]
    asp_size = max([0] + asp_lens)

    add_pad(inputs=[asps], max_lens=[asp_size], pad_idx=self.pad_idx)

    # Flat indices into the concatenation of all padded sentences; short
    # aspect spans are filled with the index of the sentence's last slot.
    asp_subs = []
    f_asp_subs = []
    b_asp_subs = []
    for k in range(len(fsents)):
        bias = k * max_len
        first, stop = asubs[k][0], asubs[k][1]
        span = list(range(bias + first, bias + stop))
        span += [bias + max_len - 1] * (asp_size - len(span))
        f_asp_subs.append(bias + stop - 1)
        b_asp_subs.append(bias + first)
        asp_subs.append(span)

    seq_lens_float32 = [[float(length)] for length in seq_lens]
    asp_lens_float32 = [[float(length)] for length in asp_lens]

    return {
        'text_idxes': fsents,
        'batch_ids': ret_ids,
        'aspect_idxes': asps,
        'labels': lab,
        'text_lens': seq_lens,
        'text_lens_float32': seq_lens_float32,
        'aspect_lens': asp_lens,
        'aspect_lens_float32': asp_lens_float32,
        'text_reverse_lens': reverse_lens,
        # indices of the padded aspect inside the concatenation of all
        # padded sentences
        'aspect_subs': asp_subs,
        'text_bitmap': sent_bitmap,
        'f_asp_sub': f_asp_subs,
        'b_asp_sub': b_asp_subs,
        'alpha_adj': alpha_adj
    }
def get_data(sample, class_num, pad_idx, eos):
    '''
    Wrap a single sample as a batch-of-one dict of model inputs, including
    position spans, position bigrams and an aspect mask.

    Args:
        sample: object providing ``id``, ``text_idxes``,
            ``left_context_idxes``, ``right_context_idxes``,
            ``aspect_idxes``, ``aspect_wordpos``, ``left_wordpos`` and
            ``label``.
        class_num: length of the one-hot label vector.
        pad_idx: padding index handed to ``add_pad`` / ``bitmap_by_padid``.
        eos: accepted for interface compatibility; not used in this function.

    Returns:
        dict whose values are one-element lists (one entry per sample).

    NOTE(review): the sample's own list objects are passed to ``add_pad``
    without copying; if ``add_pad`` pads in place the sample is modified --
    confirm.
    '''
    ids = [sample.id]

    # All lengths are taken before any padding is applied.
    texts = [sample.text_idxes]
    seq_lens = [len(sample.text_idxes)]

    left_ctx = [sample.left_context_idxes]
    left_seq_lens = [len(sample.left_context_idxes)]
    # Left context followed by the aspect, stored back to front.
    reversed_left_asp = sample.left_context_idxes + sample.aspect_idxes
    reversed_left_asp.reverse()
    left_ctx_asp = [reversed_left_asp]

    right_ctx = [sample.right_context_idxes]
    right_seq_lens = [len(sample.right_context_idxes)]
    right_ctx_asp = [sample.aspect_idxes + sample.right_context_idxes]

    left_ctxasp_lens = [len(left_ctx_asp[0])]
    right_ctxasp_lens = [len(right_ctx_asp[0])]

    aspects = [sample.aspect_idxes]
    asp_lens = [len(sample.aspect_idxes)]

    asub = sample.aspect_wordpos
    lsub = sample.left_wordpos
    # NOTE(review): the "right" span is taken from aspect_wordpos, exactly as
    # before. Another batching routine in this code base fills its right
    # spans from sample.right_wordpos, so this may be a copy-paste slip --
    # confirm before changing, since it alters 'right_subs'/'right_ngrams'.
    rsub = sample.aspect_wordpos

    # One-hot label.
    one_hot = [0.0] * class_num
    one_hot[sample.label] = 1.0
    labels = [one_hot]

    add_pad(
        inputs=[texts, left_ctx_asp, right_ctx_asp, left_ctx, right_ctx],
        max_lens=[
            seq_lens[0] + 1,
            seq_lens[0] + 1,
            seq_lens[0] + 1,
            left_seq_lens[0],
            right_seq_lens[0]
        ],
        pad_idx=pad_idx
    )

    sent_bitmap = bitmap_by_padid(texts, pad_idx)
    left_sent_bitmap = bitmap_by_padid(left_ctx, pad_idx)
    right_sent_bitmap = bitmap_by_padid(right_ctx, pad_idx)

    max_len = seq_lens[0] + 1
    left_max_len = left_seq_lens[0] + 2
    right_max_len = right_seq_lens[0] + 2
    # Index of the extra slot at the end of the padded text; used as filler.
    filler = max_len - 1

    def padded_span(span, width):
        # Positions span[0] .. span[1]-1, right-filled with `filler` up to
        # `width` entries (never truncated). Built as a real list so it can
        # be extended regardless of what range() returns.
        idxs = list(range(span[0], span[1]))
        idxs.extend([filler] * (width - len(idxs)))
        return idxs

    def bigrams(idxs):
        # Consecutive position pairs; a pair counts as length 1 when either
        # member is the filler index, otherwise 2.
        grams = [[a, b] for a, b in zip(idxs, idxs[1:])]
        gram_lens = [1 if (a == filler or b == filler) else 2
                     for a, b in grams]
        return grams, gram_lens

    # The aspect span itself is not filled up.
    asp_sub = list(range(asub[0], asub[1]))
    left_sub = padded_span(lsub, left_max_len)
    right_sub = padded_span(rsub, right_max_len)
    left_ngram, left_ngram_len = bigrams(left_sub)
    right_ngram, right_ngram_len = bigrams(right_sub)

    # 1.0 for each context token, 0.5 for each aspect token, then one 0.0.
    asp_mask = [
        [1.0] * left_seq_lens[0]
        + [0.5] * asp_lens[0]
        + [1.0] * right_seq_lens[0]
        + [0.0]
    ]

    ret_data = {
        'text_idxes': texts,
        'left_ctx_idxes': left_ctx,
        'right_ctx_idxes': right_ctx,
        'left_ctx_asp': left_ctx_asp,
        'right_ctx_asp': right_ctx_asp,
        'batch_ids': ids,
        'aspect_idxes': aspects,
        'labels': labels,
        'text_lens': seq_lens,
        'left_lens': left_seq_lens,
        'right_lens': right_seq_lens,
        'left_ca_lens': left_ctxasp_lens,
        'right_ca_lens': right_ctxasp_lens,
        'left_ngram_lens': [left_ngram_len],
        'right_ngram_lens': [right_ngram_len],
        'aspect_lens': asp_lens,
        # 'aspect_subs': asp_subs,  # indices of the padded aspect inside the
        #                           # concatenation of all padded sentences
        'left_subs': [left_sub],
        'right_subs': [right_sub],
        'asp_subs': [asp_sub],
        'left_ngrams': [left_ngram],
        'right_ngrams': [right_ngram],
        'text_bitmap': sent_bitmap,
        'left_bitmap': left_sent_bitmap,
        'right_bitmap': right_sent_bitmap,
        'asp_mask': asp_mask
    }
    return ret_data
def next_batch(self):
    '''
    Build the next batch from one bucket of samples and return it as a dict.

    The bucket is ``self.len_dic[self.key_list[self.rand_idx[self.idx]]]``;
    it is shuffled, its samples are copied into the ``self.*`` lists, and
    ``batch_all`` / ``add_pad`` / ``bitmap_by_padid`` are used to derive the
    padded index lists, lengths, bitmaps, position features, masks and
    window indices that are returned. ``self.idx`` is advanced by one.

    NOTE(review): this body relies on ``range`` returning a list
    (``.reverse()`` / ``.append()`` are called on its result) and on
    ``xrange``, i.e. it is written for Python 2.
    '''
    # Reset the per-batch accumulators kept on the instance.
    self.labels = []
    self.texts = []
    self.aspects = []
    self.leftsubs = []
    self.rightsubs = []
    self.left_ctx = []
    self.left_ctx_asp = []
    self.right_ctx = []
    self.right_ctx_asp = []
    self.aspsubs = []
    self.left_aspsubs = []
    self.right_aspsubs = []
    self.ids = []
    samplelist = self.len_dic[self.key_list[self.rand_idx[self.idx]]]
    # NOTE(review): shuffles the list stored in self.len_dic in place.
    random.shuffle(samplelist)
    for sample in samplelist:
        self.ids.append(sample.id)
        self.texts.append(sample.text_idxes)
        self.left_ctx.append(sample.left_context_idxes)
        # left context + aspect, stored back to front
        left_tmp = sample.left_context_idxes + sample.aspect_idxes
        left_tmp.reverse()
        self.left_ctx_asp.append(left_tmp)
        right_tmp = sample.aspect_idxes+sample.right_context_idxes
        self.right_ctx_asp.append(right_tmp)
        self.right_ctx.append(sample.right_context_idxes)
        self.aspects.append(sample.aspect_idxes)
        self.aspsubs.append(sample.aspect_wordpos)
        self.left_aspsubs.append([0,(len(sample.aspect_idxes))])
        self.right_aspsubs.append([0,(len(sample.aspect_idxes))])
        self.leftsubs.append(sample.left_wordpos)
        self.rightsubs.append(sample.right_wordpos)
        self.labels.append(sample.label)
    rins, lab, rinlens, rmaxlens, rinlens_float32 = batch_all(
        [self.texts, self.aspsubs, self.aspects, self.left_ctx, self.right_ctx, self.leftsubs, self.rightsubs, self.left_ctx_asp, self.right_ctx_asp, self.left_aspsubs, self.right_aspsubs],
        self.labels,
        self.class_num,
    )
    self.idx += 1
    # rins / rinlens / rmaxlens are indexed in the order of the list passed
    # to batch_all above.
    fsents = rins[0]
    asubs = rins[1]
    asps = rins[2]
    left_ctx = rins[3]
    right_ctx = rins[4]
    lsubs = rins[5]
    rsubs = rins[6]
    l_ctx_asp = rins[7]
    r_ctx_asp = rins[8]
    # NOTE(review): l_asubs / r_asubs are not used further down.
    l_asubs = rins[9]
    r_asubs = rins[10]
    # context bitmap.
    sent_bitmap = []
    # row sentence lengths.
    sequence_lengs = rinlens[0]
    left_sequence_lengs = rinlens[3]
    right_sequence_lengs = rinlens[4]
    l_ctx_asp_len =rinlens[7]
    r_ctx_asp_len = rinlens[8]
    seq_lens = []
    left_seq_lens = []
    right_seq_lens = []
    l_ca_len = []
    r_ca_len = []
    reverse_lens = []
    for x in xrange(len(sequence_lengs)):
        nl = sequence_lengs[x][0]
        if self.eos:
            # add the <eos>
            nl += 1
        seq_lens.append(nl)
        # reverse_lens keeps the raw length (without the eos increment)
        reverse_lens.append(sequence_lengs[x][0])
        left_seq_lens.append(left_sequence_lengs[x][0])
        right_seq_lens.append(right_sequence_lengs[x][0])
        l_ca_len.append(l_ctx_asp_len[x][0])
        r_ca_len.append(r_ctx_asp_len[x][0])
    left_max_len = rmaxlens[3]
    right_max_len = rmaxlens[4]
    # pad index
    # NOTE(review): left_ctx / right_ctx are padded to rmaxlens[0] here, while
    # the pad entries of the left/right position lists below are sized with
    # rmaxlens[3] / rmaxlens[4] -- confirm this is intended.
    add_pad(
        inputs=[fsents, left_ctx, right_ctx,l_ctx_asp,r_ctx_asp],
        max_lens=[rmaxlens[0]+1, rmaxlens[0], rmaxlens[0],rmaxlens[0]+1,rmaxlens[0]+1],
        pad_idx=self.pad_idx
    )
    max_len = rmaxlens[0]+1
    sent_bitmap = bitmap_by_padid(fsents, self.pad_idx)
    left_sent_bitmap = bitmap_by_padid(left_ctx, self.pad_idx)
    right_sent_bitmap = bitmap_by_padid(right_ctx, self.pad_idx)
    # alpha_adj: per row, leading 1.0 entries become 0.0 and the first entry
    # that is not 1.0 becomes 1.0; the rest of the row stays as copied.
    alpha_adj = copy.deepcopy(sent_bitmap)
    for row in alpha_adj:
        for i in range(len(row)):
            if row[i] == 1.0:
                row[i] = 0.0
            else:
                row[i] = 1.0
                break
    # NOTE(review): left_alpha_adj / right_alpha_adj are built the same way
    # but are not used or returned afterwards.
    left_alpha_adj = copy.deepcopy(left_sent_bitmap)
    for row in left_alpha_adj:
        for i in range(len(row)):
            if row[i] == 1.0:
                row[i] = 0.0
            else:
                row[i] = 1.0
                break
    right_alpha_adj = copy.deepcopy(right_sent_bitmap)
    for row in right_alpha_adj:
        for i in range(len(row)):
            if row[i] == 1.0:
                row[i] = 0.0
            else:
                row[i] = 1.0
                break
    # count the aspect lens, and size
    # count the memory size
    abs_poses = []
    left_abs_poses = []
    right_abs_poses = []
    pos_ids = []
    left_pos_ids = []
    right_pos_ids = []
    asp_lens = []
    asp_size = 0
    # NOTE(review): mem_size is tracked but not used afterwards.
    mem_size = 0
    for x in xrange(len(seq_lens)):
        sl = seq_lens[x]
        left_l = left_seq_lens[x]
        right_l = right_seq_lens[x]
        asub = asubs[x]
        al = asub[1] - asub[0]
        asp_lens.append(al)
        if al > asp_size:
            asp_size = al
        ms = sl - al
        if ms > mem_size:
            mem_size = ms
        # count the position
        # left: distances asub[0], ..., 1 (reversed 1..asub[0])
        lt_pos = range(asub[0] + 1)[1:]
        lt_pid = range(asub[0] + 1)[1:]
        lt_pos.reverse()
        lt_pid.reverse()
        # shift the distances so that distance 1 maps to id left_start
        left_start = self.pos_id_range['left_start']
        tmp_lt_pid = np.array(lt_pid)
        tmp_lt_pid += left_start - 1
        lt_pid = tmp_lt_pid.tolist()
        # right: distances 1, ..., sl - asub[1]
        rt_pos = range(sl - asub[1] + 1)[1:]
        rt_pid = range(sl - asub[1] + 1)[1:]
        right_start = self.pos_id_range['right_start']
        tmp_rt_pid = np.array(rt_pid)
        tmp_rt_pid += right_start - 1
        rt_pid = tmp_rt_pid.tolist()
        # aspect
        asp_pos = [0 for _ in range(al)]
        asp_pid = [self.pos_id_range['asp'] for _ in range(al)]
        # the rest: entries for the pad positions.
        lest_pos = [0 for _ in range(max_len - sl)]
        l_lest_pos = [0 for _ in range(rmaxlens[3] - left_l)]
        r_lest_pos = [0 for _ in range(rmaxlens[4] - right_l)]
        lest_pid = [self.pos_id_range['oos'] for _ in range(max_len - sl)]
        l_lest_pid = [self.pos_id_range['oos'] for _ in range(rmaxlens[3] - left_l)]
        r_lest_pid = [self.pos_id_range['oos'] for _ in range(rmaxlens[4] - right_l)]
        # build the rets.
        # abs_pos is assembled before lt_pos / rt_pos are extended below, so
        # it does not contain their pad entries.
        abs_pos = []
        abs_pos.extend(lt_pos)
        abs_pos.extend(asp_pos)
        abs_pos.extend(rt_pos)
        abs_pos.extend(lest_pos)
        abs_poses.append(abs_pos)
        lt_pos.extend(l_lest_pos)
        left_abs_poses.append(lt_pos)
        rt_pos.extend(r_lest_pos)
        right_abs_poses.append(rt_pos)
        pos_id = []
        pos_id.extend(lt_pid)
        pos_id.extend(asp_pid)
        pos_id.extend(rt_pid)
        pos_id.extend(lest_pid)
        pos_ids.append(pos_id)
        lt_pid.extend(l_lest_pid)
        left_pos_ids.append(lt_pid)
        rt_pid.extend(r_lest_pid)
        right_pos_ids.append(rt_pid)
    add_pad(
        inputs=[asps],
        max_lens=[asp_size],
        pad_idx=self.pad_idx
    )
    # asp_mask: 1.0 per left-context token, 0.5 per aspect token, 1.0 per
    # right-context token, followed by a single 0.0.
    asp_mask = []
    for x in range(len(seq_lens)):
        asp_mask.append([])
    for i in range(len(seq_lens)):
        for x in range(left_seq_lens[i]):
            asp_mask[i].append(1.0)
        for x in range(asp_lens[i]):
            asp_mask[i].append(0.5)
        for x in range(right_seq_lens[i]):
            asp_mask[i].append(1.0)
        asp_mask[i].append(0.0)
    # asp_pos: 0 per left-context token, 1 per aspect token, 0 per
    # right-context token. (The name was also used for a temporary list in
    # the position loop above; this binding is the one that is returned.)
    asp_pos = []
    for x in range(len(seq_lens)):
        asp_pos.append([])
    for i in range(len(seq_lens)):
        for x in range(left_seq_lens[i]):
            asp_pos[i].append(0)
        for x in range(asp_lens[i]):
            asp_pos[i].append(1)
        for x in range(right_seq_lens[i]):
            asp_pos[i].append(0)
    # left_a_mask: asp_lens[i] ones, then zeros up to seq_lens[i] entries.
    left_a_mask=[]
    for x in range(len(seq_lens)):
        left_a_mask.append([])
    for i in range(len(seq_lens)):
        for x in range(asp_lens[i]):
            left_a_mask[i].append(1)
        for x in range(seq_lens[i] - asp_lens[i]):
            left_a_mask[i].append(0)
    # left_mask: the complement of left_a_mask (zeros first, then ones).
    left_mask = []
    for x in range(len(seq_lens)):
        left_mask.append([])
    for i in range(len(seq_lens)):
        for x in range(asp_lens[i]):
            left_mask[i].append(0)
        for x in range(seq_lens[i] - asp_lens[i]):
            left_mask[i].append(1)
    # left_mask2: left_seq_lens[i] ones, then zeros up to seq_lens[i] entries.
    left_mask2 = []
    for x in range(len(seq_lens)):
        left_mask2.append([])
    for i in range(len(seq_lens)):
        for x in range(left_seq_lens[i]):
            left_mask2[i].append(1)
        for x in range(seq_lens[i] - left_seq_lens[i]):
            left_mask2[i].append(0)
    # right_mask2: right_seq_lens[i] ones, then zeros up to seq_lens[i] entries.
    right_mask2 = []
    for x in range(len(seq_lens)):
        right_mask2.append([])
    for i in range(len(seq_lens)):
        for x in range(right_seq_lens[i]):
            right_mask2[i].append(1)
        for x in range(seq_lens[i] - right_seq_lens[i]):
            right_mask2[i].append(0)
    # left_asp_mask: l_ca_len[i] ones, then zeros up to seq_lens[i] entries.
    left_asp_mask =[]
    for x in range(len(seq_lens)):
        left_asp_mask.append([])
    for i in range(len(seq_lens)):
        for x in range(l_ca_len[i]):
            left_asp_mask[i].append(1)
        for x in range(seq_lens[i] - l_ca_len[i]):
            left_asp_mask[i].append(0)
    # right_asp_mask: r_ca_len[i] ones, then zeros up to seq_lens[i] entries.
    right_asp_mask = []
    for x in range(len(seq_lens)):
        right_asp_mask.append([])
    for i in range(len(seq_lens)):
        for x in range(r_ca_len[i]):
            right_asp_mask[i].append(1)
        for x in range(seq_lens[i] - r_ca_len[i]):
            right_asp_mask[i].append(0)
    # build the subs.
    # Flat indices into the concatenation of all padded sentences: sentence k
    # starts at k * max_len; bias + max_len - 1 (its last slot) is the filler.
    asp_subs = []
    left_subs = []
    right_subs = []
    f_asp_subs = []
    b_asp_subs = []
    f_left_subs = []
    b_left_subs = []
    f_right_subs = []
    b_right_subs = []
    window_subs = []
    window_lens = []
    for k in xrange(len(fsents)):
        bias = k * max_len
        asp_sub = []
        left_sub = []
        right_sub = []
        asub = asubs[k]
        lsub = lsubs[k]
        rsub = rsubs[k]
        # test
        # print bias
        # print asub
        # print seq_lens[k]
        # test
        asp_sub = range(bias + asub[0], bias + asub[1])
        left_sub = range(bias+lsub[0],bias + lsub[1])
        right_sub = range(bias + rsub[0], bias + rsub[1])
        # f_* is the last index of each span, b_* the first.
        # NOTE(review): the f_*/b_* lists and left_subs/right_subs are built
        # here, but their entries in ret_data are commented out.
        f_asp_subs.append(bias + asub[1] - 1)
        b_asp_subs.append(bias + asub[0])
        f_left_subs.append(bias + lsub[1] -1)
        b_left_subs.append(bias + lsub[0])
        f_right_subs.append(bias + rsub[1] - 1)
        b_right_subs.append(bias + rsub[0])
        window_sub = []
        window_len = []
        window_size = 5
        # NOTE(review): w is created once per sentence, outside the loop over
        # x, so every entry appended to window_sub is the same list object,
        # which keeps growing across positions -- confirm this is intended.
        w = []
        for x in range(len(fsents[k])):
            lenth = 0
            for s in range(1,window_size):
                if x - s<0:
                    left_1 = bias + max_len - 1
                else:
                    # NOTE(review): reads fsents[k][x - 2] for every s (the
                    # offset does not depend on s) and adds bias to the value
                    # stored in fsents rather than to a position -- confirm.
                    left_1 = bias + fsents[k][x - 2]
                    lenth += 1
                w.append(left_1)
            for s in range(1, window_size):
                if x + s >= max_len:
                    right_1 = bias + max_len - 1
                else:
                    # NOTE(review): reads fsents[k][x + 1] for every s; same
                    # remark as for the left side.
                    right_1 = bias + fsents[k][x + 1]
                    lenth += 1
                w.append(right_1)
            # lenth counts the in-range neighbours; it is forced to at least 1.
            if lenth == 0:
                lenth +=1
            window_len.append(lenth)
            window_sub.append(w)
        window_subs.append(window_sub)
        window_lens.append(window_len)
        # Fill each span up to its batch-wide width with the filler index.
        aslen = len(asp_sub)
        leftlen = len(left_sub)
        rightlen = len(right_sub)
        while aslen < asp_size:
            asp_sub.append(bias + max_len - 1)
            aslen = aslen + 1
        while leftlen < left_max_len:
            left_sub.append(bias + max_len - 1)
            leftlen = leftlen + 1
        while rightlen < right_max_len:
            right_sub.append(bias + max_len - 1)
            rightlen = rightlen + 1
        asp_subs.append(asp_sub)
        left_subs.append(left_sub)
        right_subs.append(right_sub)
    ret_data = {
        'text_idxes' : fsents,
        'left_ctx_idxes': left_ctx,
        'right_ctx_idxes': right_ctx,
        'left_ctx_asp': l_ctx_asp,
        'right_ctx_asp':r_ctx_asp,
        'batch_ids' : self.ids,
        'aspect_idxes' : asps,
        'labels' : lab,
        'text_lens' : seq_lens,
        'left_lens': left_seq_lens,
        'right_lens': right_seq_lens,
        'left_ca_lens':l_ca_len,
        'right_ca_lens':r_ca_len,
        'aspect_lens' : asp_lens,
        'text_reverse_lens' : reverse_lens,
        # indices of the padded aspect inside the concatenation of all
        # padded sentences
        'aspect_subs' : asp_subs,
        'window_subs' : window_subs,
        'window_lens' : window_lens,
        # 'left_subs' : left_subs,
        # 'right_subs' : right_subs,
        'text_bitmap' : sent_bitmap,
        'left_bitmap': left_sent_bitmap,
        'right_bitmap': right_sent_bitmap,
        # 'f_asp_sub' : f_asp_subs,
        # 'b_asp_sub' : b_asp_subs,
        # 'f_left_sub': f_left_subs,
        # 'b_left_sub': b_left_subs,
        # 'f_right_sub': f_right_subs,
        # 'b_right_sub': b_right_subs,
        'alpha_adj' : alpha_adj,
        'abs_poses' : abs_poses,
        'left_abs_poses' : left_abs_poses,
        'right_abs_poses' : right_abs_poses,
        'pos_ids' : pos_ids,
        'left_pos_ids' : left_pos_ids,
        'right_pos_ids' : right_pos_ids,
        'asp_mask': asp_mask,
        'left_a_mask': left_a_mask,
        'left_mask': left_mask,
        'left_mask2':left_mask2,
        'right_mask2': right_mask2,
        'left_asp_mask': left_asp_mask,
        'right_asp_mask': right_asp_mask,
        'asp_pos': asp_pos
    }
    return ret_data