def word_to_extra(words_data, w, w_range, h_size):
    """Build per-sentence training extras plus the set of weight ids touched.

    Parameters
    ----------
    words_data : sequence of per-sentence word dicts; entries equal to -2 or
        empty/falsy are sentinels and are skipped.
    w : dict of weight matrices; entries for unseen ids are lazily
        initialised here (together with their Adam moment buffers in Global).
    w_range : window range; sentences with a weight-position group wider than
        abs(w_range / 2) are skipped entirely.
    h_size : hidden size used when initialising new weight matrices.

    Returns
    -------
    (wd_extra, wpresent) : list of per-sentence feature dicts, and the
        de-duplicated list of weight ids encountered.
    """
    wd_extra = []
    wpresent = []
    for iword in range(len(words_data)):
        # BUG FIX: the original condition used `and`, which is never true
        # (-2 is truthy, so `not words_data[iword]` is False when the entry
        # equals -2).  `or` makes both sentinel forms actually skip.
        if words_data[iword] == -2 or not words_data[iword]:
            continue
        h_index, h_vect, wp, Word_ids, w_size, dep_order, hh_index, p = wd_preprocess(
            words_data[iword])
        if h_index == -1:
            # preprocessing failed for this sentence
            continue
        cflag = 0
        for i in wp:
            # skip sentences whose weight-position group exceeds the window
            if len(i) > abs(w_range / 2):
                cflag = 1
                break
            for j in i:
                wpresent.append(j)
                # lazily create the weight and its Adam moment buffers for
                # ids we have not seen before
                if type(w[j]) != np.ndarray:
                    w[j] = init_weight(h_size, h_size)
                    Global.m[j] = zero_weight(h_size, h_size)
                    Global.v[j] = zero_weight(h_size, h_size)
                    Global.lr[j] = neta
        if cflag == 1:
            continue
        # de-duplicate the accumulated weight ids
        wpresent = list(set(wpresent))
        Word_vects = []
        try:
            for i in sorted(words_data[iword]):
                Word_vects.append(
                    get_word_vect(words_data[iword][i]['word'].lower(),
                                  Global.v_size))
        except KeyError:
            # malformed entry (missing 'word' key): drop the whole sentence
            continue
        wd_extra.append({
            'w_size': w_size,
            'h_index': h_index,
            'h_vect': h_vect,
            'Word_vects': Word_vects,
            'wp': wp,
            "dep_order": dep_order,
            "hh_index": hh_index,
            'Word_ids': Word_ids,
            'p': p
        })
    return wd_extra, wpresent
def get_vect_by_wd_dep(flag, words_data, mtype='normal'):
    """Encode a sentence with the hierarchical RAE and return phrase chunks
    keyed by dependency structure.

    Parameters
    ----------
    flag : only 't' triggers processing; any other value returns None.
    words_data : per-word feature dicts (must carry a 'word' key).
    mtype : 'normal' or 'deep' — selects which RAE encoder to run.

    Returns
    -------
    (chunks, chunks_vect) : dicts mapping a running index to the chunk text
        and its encoded vector; None when flag != 't'.
    """
    if flag == 't':
        Word_ids = get_words_id(words_data)
        Word_vects = []
        for i in sorted(Word_ids):
            Word_vects.append(
                preprocess(
                    get_word_vect(words_data[i]['word'].lower(),
                                  Global.v_size)))
        w_size = len(Word_ids)
        p = get_parents(words_data)
        d = get_dep(words_data)
        dep_order, d1 = pdep_2_deporder_dep(p, d)
        h_index, h_vect, wp, _ = dep_2_hid_var(p, dep_order, d1, Word_ids)
        # word vectors first, then slots for the hidden (phrase) vectors
        vect = Word_vects + [None for i in range(len(h_vect))]
        del Word_vects
        # FIX: close the weight file deterministically instead of leaking
        # the handle from pickle.load(open(...)).
        with open(Global.wfname, 'rb') as wf:
            w = pickle.load(wf)
        if mtype == 'normal':
            RAE_adam_herical.rae_encoding(vect=vect,
                                          w=w,
                                          w_size=w_size,
                                          h_vect=h_vect,
                                          wp=wp)
        elif mtype == 'deep':
            RAE_adam_herical_deep1.rae_encoding(vect=vect,
                                                w=w,
                                                w_size=w_size,
                                                h_vect=h_vect,
                                                wp=wp)
        chunks = {}
        chunks_vect = {}
        # single-word chunks come straight from the leaf vectors
        for i in range(len(Word_ids)):
            chunks[i] = words_data[Word_ids[i]]['word']
            chunks_vect[i] = vect[i]
        rev_h_index = {v: k for k, v in h_index.items()}
        # FIX: the original `count = i + 1` relied on the leaked loop
        # variable and raised NameError for empty Word_ids; len(Word_ids)
        # is the same value and is always defined.
        count = len(Word_ids)
        # multi-word chunks: join the words covered by each hidden node
        for i in h_vect:
            if len(i) > 1:
                chunks_vect[count] = vect[h_index[Word_ids[min(i)]]]
                chunks[count] = ' '.join([
                    words_data[rev_h_index[j]]['word']
                    if j >= len(Word_ids) else words_data[Word_ids[j]]['word']
                    for j in i
                ])
                count += 1
        return chunks, chunks_vect
def get_chk_vect_by_wd(flag, words_data, mtype='normal'):
    """Encode a sentence with the hierarchical RAE and return its chunks
    (as produced by get_chunks) with the vector of each chunk's head node.

    Parameters
    ----------
    flag : only 't' triggers processing; any other value returns None.
    words_data : per-word feature dicts (must carry a 'word' key).
    mtype : 'normal' or 'deep' — selects which RAE encoder to run.

    Returns
    -------
    (chunks, chunks_vect) : dicts keyed by chunk index with the chunk text
        and the encoded vector of its main node; None when flag != 't'.
    """
    if flag == 't':
        Word_ids = get_words_id(words_data)
        Word_vects = []
        for i in sorted(Word_ids):
            Word_vects.append(
                preprocess(
                    get_word_vect(words_data[i]['word'].lower(),
                                  Global.v_size)))
        w_size = len(Word_ids)
        p = get_parents(words_data)
        d = get_dep(words_data)
        dep_order, d1 = pdep_2_deporder_dep(p, d)
        h_index, h_vect, wp, _ = dep_2_hid_var(p, dep_order, d1, Word_ids)
        # word vectors first, then slots for the hidden (phrase) vectors
        vect = Word_vects + [None for i in range(len(h_vect))]
        del Word_vects
        # FIX: close the weight file deterministically instead of leaking
        # the handle from pickle.load(open(...)).
        with open(Global.wfname, 'rb') as wf:
            w = pickle.load(wf)
        if mtype == 'normal':
            RAE_adam_herical.rae_encoding(vect=vect,
                                          w=w,
                                          w_size=w_size,
                                          h_vect=h_vect,
                                          wp=wp)
        elif mtype == 'deep':
            RAE_adam_herical_deep1.rae_encoding(vect=vect,
                                                w=w,
                                                w_size=w_size,
                                                h_vect=h_vect,
                                                wp=wp)
        chks = get_chunks(words_data)
        chks_main = get_chunk_main(chks, dep_order)
        chunks = {}
        chunks_vect = {}
        for c in range(len(chks)):
            chunks[c] = ' '.join([words_data[i]['word'] for i in chks[c]])
        del words_data
        # map each chunk's main node to its encoded hidden vector
        for c in range(len(chks_main)):
            ind = h_index[chks_main[c]]
            chunks_vect[c] = vect[ind]
        return chunks, chunks_vect
def get_chk_vect(flag, line, mtype='normal'):
    """Parse a raw text line with SENNA, encode it with the hierarchical
    RAE, and return dependency-derived chunks with their vectors.

    Parameters
    ----------
    flag : only 't' triggers processing; any other value returns None.
    line : raw input sentence (string).
    mtype : 'normal' or 'deep' — selects which RAE encoder to run.

    Returns
    -------
    (chunks, chunks_vect) : dicts keyed by chunk order with the chunk text
        and its encoded vector; None when flag != 't'.
    """
    if flag == 't':
        line = line_processing(line)
        words_data = extract_feature_using_senna(line)
        p = get_parents(words_data)
        d = get_dep(words_data)
        Word_ids = get_words_id(words_data)
        # attach the preprocessed embedding to each word entry
        for i in Word_ids:
            words_data[i]['vect'] = preprocess(
                get_word_vect(words_data[i]['word'].lower(), Global.v_size))
        w_size = len(Word_ids)
        dep_order, d1 = pdep_2_deporder_dep(p, d)
        h_index, h_vect, wp, _ = dep_2_hid_var(p, dep_order, d1, Word_ids)
        Word_vects = get_words_vect(words_data, Word_ids, Global.v_size)
        # word vectors first, then slots for the hidden (phrase) vectors
        vect = Word_vects + [None for i in h_vect]
        del Word_vects
        # FIX: close the weight file deterministically instead of leaking
        # the handle from pickle.load(open(...)).
        with open(Global.wfname, 'rb') as wf:
            w = pickle.load(wf)
        if mtype == 'normal':
            RAE_adam_herical.rae_encoding(vect=vect,
                                          w=w,
                                          w_size=w_size,
                                          h_vect=h_vect,
                                          wp=wp)
        elif mtype == 'deep':
            RAE_adam_herical_deep1.rae_encoding(vect=vect,
                                                w=w,
                                                w_size=w_size,
                                                h_vect=h_vect,
                                                wp=wp)
        chks = get_chunks_by_dep(Word_ids, h_index, h_vect)
        chunks = {}
        chunks_vect = {}
        count = 0
        # emit chunks in dependency order with their hidden-node vectors
        order = get_order(d1, w_size)
        for m in order:
            chunks[count] = ' '.join([words_data[i]['word'] for i in chks[m]])
            chunks_vect[count] = vect[h_index[m]]
            count += 1
        return chunks, chunks_vect
cflag = 0 # for i in wp: # if len(i) > abs(w_range / 2): # cflag = 1 # break # for j in i: # if type(w[j]) != np.ndarray: # w[j] = init_weight(h_size) # if cflag == 1: # continue Word_vects = [] for i in sorted(words_data[iword]): Word_vects.append( get_word_vect(words_data[iword][i]['word'].lower(), Global.v_size)) wd_extra.append({ 'w_size': w_size, 'h_index': h_index, 'h_vect': h_vect, 'Word_vects': Word_vects, 'wp': wp, "dep_order": dep_order, "hh_index": hh_index, 'Word_ids': Word_ids, 'p': p }) for i in w: if type(w[i]) == np.ndarray: w1[i] = w[i].copy() + e