def get_tokens(words):
    """Find the legitimate words in `words` and return them as a list."""
    valid_words = []
    for word in words:
        if data_helpers.is_word(word) and word in model.vocab:
            valid_words.append(word)
    return valid_words
def get_legit_word(words, flag):
    """Return a legitimate word from `words`; return invalid_word ('UNK') if there is none."""
    if flag == 0:
        for word in reversed(words):  # scan back to front
            if word in [".", "!"]:
                return invalid_word
            if data_helpers.is_word(word):
                return word
        return invalid_word
    if flag == 1:
        for word in words:  # scan front to back
            if word in [".", "!"]:
                return invalid_word
            if data_helpers.is_word(word):
                return word
        return invalid_word
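# A minimal usage sketch for the two helpers above, assuming the module
# globals `tokenizer`, `data_helpers`, `model`, and `invalid_word` are
# already initialized elsewhere in the project; the sample sentence is
# illustrative only.
def _demo_word_helpers():
    tokens = tokenizer.tokenize("the drug caused severe nausea")
    print(get_legit_word(tokens, 1))  # first legit word scanning left-to-right
    print(get_legit_word(tokens, 0))  # first legit word scanning right-to-left
    print(get_tokens(tokens))         # every legit token that is also in model.vocab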
def get_right_word(message, start):
    """Collect the chunk of text (roughly one word) to the right of offset `start` and tokenize it."""
    i = start
    is_space = 0
    chunk = ""  # renamed from `str` to avoid shadowing the builtin
    while i < len(message):
        if message[i].isspace() and is_space == 1 and chunk.strip():
            break
        if message[i].isspace() and is_space == 1 and not data_helpers.is_word(chunk):
            is_space = 0
        if message[i].isspace():
            is_space = 1
        chunk += message[i]
        i += 1
    return tokenizer.tokenize(chunk)
def get_left_word(message, start):
    """Collect the chunk of text (roughly one word) to the left of offset `start` and tokenize it."""
    i = start - 1
    is_space = 0
    chunk = ""
    while i > -1:
        if message[i].isspace() and is_space == 1 and chunk.strip():
            break
        if message[i].isspace() and is_space == 1 and not data_helpers.is_word(chunk):
            is_space = 0
        if message[i].isspace():
            is_space = 1
        chunk += message[i]
        i -= 1
    chunk = chunk[::-1]  # characters were gathered right-to-left, so reverse
    return tokenizer.tokenize(chunk)
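# Hedged sketch of how the two scanners are combined downstream: the feature
# extractors pair get_left_word/get_right_word with get_legit_word to grab
# the single legitimate word on either side of an entity span. `start` and
# `end` are character offsets into `message`, as in the extractors below.
def _neighbor_words(message, start, end):
    left = get_legit_word(get_left_word(message, start), 0)   # word before the span
    right = get_legit_word(get_right_word(message, end), 1)   # word after the span
    return left, right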
def lexical_level_features(df):
    """Generate word-level features for every row of `df`.

    The output for one sample is a matrix in which each row is the feature
    of one sliding window, laid out as [WF WF WF PF PF] (three word features
    and two position features). Every sentence is padded to Sequence_length
    tokens, so the overall shape is
    [Sequence_length, 3 * embedding_size + 2 * distance_dim], i.e. [204, 160].

    pos_vec, beg_emb, end_emb and extra_emb are randomly initialized up
    front. Note a common misconception here: the word embeddings and these
    randomly initialized vectors are all just inputs, and they only need to
    be distinct from one another (comparing random initialization against
    word2vec-derived vectors would be worth exploring). What actually does
    the work is the trained weights of the network.
    """
    for index, row in df.iterrows():
        try:
            # if index >= count:
            #     break
            print("======================================")
            print(index)
            message = row['Message'].lower()
            if not message:
                continue  # skip empty messages
            if row['drug-offset-start'] < row['sideEffect-offset-start']:
                start = (row['drug-offset-start'], row['drug-offset-end'])
            else:
                start = (row['sideEffect-offset-start'],
                         row['sideEffect-offset-end'])
            # entity e1: whichever mention starts first
            if row['drug-offset-end'] > row['sideEffect-offset-end']:
                end = (row['drug-offset-start'], row['drug-offset-end'])
            else:
                end = (row['sideEffect-offset-start'],
                       row['sideEffect-offset-end'])
            # entity e2: whichever mention ends last
            sent = get_sentences(message)      # (begin, end) offsets of each sentence
            start1, start2 = start[0], end[0]  # start offsets of the two entities
            end1, end2 = start[1], end[1]      # end offsets of the two entities
            beg = -1
            fin = -1  # so a message with no matching sentence yields empty spans
            for l, r in sent:
                if (start1 >= l and start1 <= r) \
                        or (end1 >= l and end1 <= r) \
                        or (start2 >= l and start2 <= r) \
                        or (end2 >= l and end2 <= r):
                    # keep the sentence as long as one of the entities touches it
                    if beg == -1:
                        beg = l
                    fin = r
            print(message[beg:fin])  # the sentence span containing the entities
            entity1, entity2 = message[start1:end1], message[start2:end2]  # the two entities
            l1 = [get_legit_word([word], 1)  # pick out the words of each entity
                  for word in tokenizer.tokenize(entity1)]
            l2 = [get_legit_word([word], 1)
                  for word in tokenizer.tokenize(entity2)]
            # TODO add PCA for phrases
            temp = np.zeros(FLAGS.embedding_size)
            valid_words = 0
            print(entity1)
            print(l1)
            for word in l1:
                if word != "UNK" and data_helpers.is_word(word) and word in model.vocab:
                    valid_words += 1
                    temp = np.add(temp, word2vec(word))
            if valid_words == 0:
                continue
            l1 = temp / float(valid_words)  # l1: entity-1 vector, averaged over its words
            temp = np.zeros(FLAGS.embedding_size)
            valid_words = 0
            print(entity2)
            print(l2)
            for word in l2:
                if word != "UNK" and data_helpers.is_word(word) and word in model.vocab:
                    valid_words += 1
                    temp = np.add(temp, word2vec(word))
            if valid_words == 0:
                continue
            l2 = temp / float(valid_words)  # l2: entity-2 vector, averaged over its words
            # lword1/2 and rword1/2 are computed but never used afterwards...
            lword1 = get_legit_word(get_left_word(message, start1), 0)
            lword2 = get_legit_word(get_left_word(message, start2), 0)
            rword1 = get_legit_word(get_right_word(message, end1), 1)
            rword2 = get_legit_word(get_right_word(message, end2), 1)
            if lword1 in model.vocab:
                lword1 = word2vec(lword1)  # the word left of start1, as a vector
            if lword2 in model.vocab:
                lword2 = word2vec(lword2)
            if rword1 in model.vocab:
                rword1 = word2vec(rword1)
            if rword2 in model.vocab:
                rword2 = word2vec(rword2)
            # l3 = np.divide(np.add(lword1, rword1), 2.0)
            # l4 = np.divide(np.add(lword2, rword2), 2.0)
            print(lword1, lword2)
            print(rword1, rword2)
            # tokens in between
            l_tokens = []
            r_tokens = []
            if beg != -1:
                l_tokens = get_tokens(tokenizer.tokenize(message[beg:start1]))
            if fin != -1:
                r_tokens = get_tokens(tokenizer.tokenize(message[end2:fin]))
            in_tokens = get_tokens(tokenizer.tokenize(message[end1:start2]))
            print(l_tokens, in_tokens, r_tokens)
            tot_tokens = len(l_tokens) + len(in_tokens) + len(r_tokens) + 2
            while tot_tokens < FLAGS.sequence_length:
                r_tokens.append("UNK")
                tot_tokens += 1
            # pad the sentence up to FLAGS.sequence_length (204) tokens
            # left tokens
            l_matrix = []
            l_len = len(l_tokens)
            r_len = len(r_tokens)
            m_len = len(in_tokens)
            for idx, token in enumerate(l_tokens):
                word_vec = word2vec(token)
                pv1 = pos_vec[pivot + (idx - l_len)]
                pv2 = pos_vec[pivot + (idx - l_len - 1 - m_len)]
                l_matrix.append([word_vec, pv1, pv2])
            # middle tokens
            in_matrix = []
            for idx, token in enumerate(in_tokens):
                word_vec, pv1, pv2 = word2vec(token), pos_vec[idx + 1], \
                    pos_vec[idx - m_len + pivot]
                in_matrix.append([word_vec, pv1, pv2])
            # right tokens
            r_matrix = []
            for idx, token in enumerate(r_tokens):
                if token == "UNK":
                    word_vec, pv1, pv2 = extra_emb, pos_vec[idx + m_len + 2], pos_vec[idx + 1]
                else:
                    word_vec, pv1, pv2 = word2vec(token), pos_vec[idx + m_len + 2], pos_vec[idx + 1]
                r_matrix.append([word_vec, pv1, pv2])
            tri_gram = []
            llen = len(l_matrix)
            mlen = len(in_matrix)
            rlen = len(r_matrix)
            dist = llen + 1
            if llen > 0:
                if llen > 1:
                    ta = np.hstack((beg_emb, l_matrix[0][0], l_matrix[1][0],
                                    l_matrix[0][1], l_matrix[0][2]))  # concatenate into one row vector
                    tri_gram.append(ta)
                    for i in range(1, len(l_matrix) - 1):
                        ta = np.hstack((l_matrix[i - 1][0], l_matrix[i][0],
                                        l_matrix[i + 1][0], l_matrix[i][1],
                                        l_matrix[i][2]))
                        tri_gram.append(ta)
                    ta = np.hstack((l_matrix[llen - 2][0], l_matrix[llen - 1][0], l1,
                                    l_matrix[llen - 1][1], l_matrix[llen - 2][2]))
                    tri_gram.append(ta)
                else:
                    tri_gram.append(
                        np.hstack((beg_emb, l_matrix[0][0], l1,
                                   l_matrix[0][1], l_matrix[0][2])))
                if mlen > 0:
                    tri_gram.append(
                        np.hstack((l_matrix[llen - 1][0], l1, in_matrix[0][0],
                                   pos_vec[0], pos_vec[pivot - dist])))
                else:
                    tri_gram.append(
                        np.hstack((l_matrix[llen - 1][0], l1, l2,
                                   pos_vec[0], pos_vec[pivot - dist])))
            else:
                if mlen > 0:
                    tri_gram.append(
                        np.hstack((beg_emb, l1, in_matrix[0][0],
                                   pos_vec[0], pos_vec[pivot - dist])))
                else:
                    tri_gram.append(
                        np.hstack((beg_emb, l1, l2,
                                   pos_vec[0], pos_vec[pivot - dist])))
            if mlen > 0:
                if mlen > 1:
                    tri_gram.append(
                        np.hstack((l1, in_matrix[0][0], in_matrix[1][0],
                                   in_matrix[0][1], in_matrix[0][2])))
                    for i in range(1, len(in_matrix) - 1):
                        tri_gram.append(
                            np.hstack((in_matrix[i - 1][0], in_matrix[i][0],
                                       in_matrix[i + 1][0], in_matrix[i][1],
                                       in_matrix[i][2])))
                    tri_gram.append(
                        np.hstack((in_matrix[mlen - 2][0], in_matrix[mlen - 1][0], l2,
                                   in_matrix[mlen - 1][1], in_matrix[mlen - 2][2])))
                else:
                    tri_gram.append(
                        np.hstack((l1, in_matrix[0][0], l2,
                                   in_matrix[0][1], in_matrix[0][2])))
                if rlen > 0:
                    tri_gram.append(
                        np.hstack((in_matrix[mlen - 1][0], l2, r_matrix[0][0],
                                   pos_vec[dist], pos_vec[0])))
                else:
                    tri_gram.append(
                        np.hstack((in_matrix[mlen - 1][0], l2, end_emb,
                                   pos_vec[dist], pos_vec[0])))
            else:
                if rlen > 0:
                    tri_gram.append(
                        np.hstack((l1, l2, r_matrix[0][0],
                                   pos_vec[dist], pos_vec[0])))
                else:
                    tri_gram.append(
                        np.hstack((l1, l2, end_emb, pos_vec[dist], pos_vec[0])))
            if rlen > 0:
                if rlen > 1:
                    tri_gram.append(
                        np.hstack((l2, r_matrix[0][0], r_matrix[1][0],
                                   r_matrix[0][1], r_matrix[0][2])))
                    for i in range(1, len(r_matrix) - 1):
                        tri_gram.append(
                            np.hstack((r_matrix[i - 1][0], r_matrix[i][0],
                                       r_matrix[i + 1][0], r_matrix[i][1],
                                       r_matrix[i][2])))
                    tri_gram.append(
                        np.hstack((r_matrix[rlen - 2][0], r_matrix[rlen - 1][0], end_emb,
                                   r_matrix[rlen - 1][1], r_matrix[rlen - 2][2])))
                else:
                    tri_gram.append(
                        np.hstack((l2, r_matrix[0][0], end_emb,
                                   r_matrix[0][1], r_matrix[0][2])))
            # tri_gram.append(np.hstack((l1, in_matrix[0][0], in_matrix[1][0],
            #                            in_matrix[0][1], in_matrix[0][2])))
            #
            # for idx in range(1, mlen - 1):
            #     tri_gram.append(
            #         np.hstack((in_matrix[idx - 1][0], in_matrix[idx][0],
            #                    in_matrix[idx + 1][0], in_matrix[idx][1],
            #                    in_matrix[idx][2])))
            # tri_gram.append(
            #     np.hstack((in_matrix[mlen - 2][0], in_matrix[mlen - 1][0], l2,
            #                in_matrix[mlen - 1][1], in_matrix[mlen - 1][2])))
            # tri_gram.append(np.hstack((in_matrix[mlen - 1][0], l2, end_emb,
            #                            pos_vec_entities[2], pos_vec_entities[3])))
            print("======================================")
            # lf = np.vstack((l1, l2, l3, l4))
            relation = row['relType']
            print(np.asarray(tri_gram).shape)
            if relation == "valid":
                y = [0.0, 1.0]
            else:
                y = [1.0, 0.0]
            yield np.asarray((np.asarray(tri_gram), np.asarray(y)))
        except Exception:
            traceback.print_exc()
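# The docstring of lexical_level_features says pos_vec, beg_emb, end_emb and
# extra_emb are randomly initialized inputs. A minimal sketch of that setup,
# assuming embedding_size = 50 and distance_dim = 5 (so each window row is
# 3 * 50 + 2 * 5 = 160 wide, matching the [204, 160] shape above); the real
# project may size or seed these differently.
def _init_random_inputs(seq_len=204, emb_size=50, dist_dim=5, seed=42):
    rng = np.random.RandomState(seed)
    pos_vec = rng.uniform(-1.0, 1.0, (2 * seq_len, dist_dim))  # one row per signed distance
    pivot = seq_len  # distance 0 maps to the middle row, so negative offsets index below it
    beg_emb = rng.uniform(-1.0, 1.0, emb_size)    # pseudo-token before the sentence
    end_emb = rng.uniform(-1.0, 1.0, emb_size)    # pseudo-token after the sentence
    extra_emb = rng.uniform(-1.0, 1.0, emb_size)  # vector used for "UNK" padding
    return pos_vec, pivot, beg_emb, end_emb, extra_emb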
def generate_vector(message, start1, end1, start2, end2):
    """Build the tri-gram feature matrix for one message given the character offsets of the two entities; return None if either entity has no usable words."""
    sent = get_sentences(message)
    beg = -1
    fin = -1  # so a message with no matching sentence yields empty spans
    for l, r in sent:
        if (start1 >= l and start1 <= r) or (end1 >= l and end1 <= r) \
                or (start2 >= l and start2 <= r) or (end2 >= l and end2 <= r):
            if beg == -1:
                beg = l
            fin = r
    print(message[beg:fin])
    entity1, entity2 = message[start1:end1], message[start2:end2]
    l1 = [get_legit_word([word], 1) for word in tokenizer.tokenize(entity1)]
    l2 = [get_legit_word([word], 1) for word in tokenizer.tokenize(entity2)]
    # TODO add PCA for phrases
    temp = np.zeros(FLAGS.embedding_size)
    valid_words = 0
    print(entity1)
    print(l1)
    for word in l1:
        if word != "UNK" and data_helpers.is_word(word) and word in model.vocab:
            valid_words += 1
            temp = np.add(temp, word2vec(word))
    if valid_words == 0:
        return None
    l1 = temp / float(valid_words)
    temp = np.zeros(FLAGS.embedding_size)
    valid_words = 0
    print(entity2)
    print(l2)
    for word in l2:
        if word != "UNK" and data_helpers.is_word(word) and word in model.vocab:
            valid_words += 1
            temp = np.add(temp, word2vec(word))
    if valid_words == 0:
        return None
    lword1 = lword2 = rword1 = rword2 = np.zeros(50)
    l2 = temp / float(valid_words)
    if get_legit_word(get_left_word(message, start1), 0) in model.vocab:
        lword1 = word2vec(get_legit_word(get_left_word(message, start1), 0))
    if get_legit_word(get_left_word(message, start2), 0) in model.vocab:
        lword2 = word2vec(get_legit_word(get_left_word(message, start2), 0))
    if get_legit_word(get_right_word(message, end1), 1) in model.vocab:
        rword1 = word2vec(get_legit_word(get_right_word(message, end1), 1))
    if get_legit_word(get_right_word(message, end2), 1) in model.vocab:
        rword2 = word2vec(get_legit_word(get_right_word(message, end2), 1))
    # l3 = np.divide(np.add(lword1, rword1), 2.0)
    # l4 = np.divide(np.add(lword2, rword2), 2.0)
    print(get_legit_word(get_left_word(message, start1), 0),
          get_legit_word(get_left_word(message, start2), 0))
    print(get_legit_word(get_right_word(message, end1), 1),
          get_legit_word(get_right_word(message, end2), 1))
    # tokens in between
    l_tokens = []
    r_tokens = []
    if beg != -1:
        l_tokens = get_tokens(tokenizer.tokenize(message[beg:start1]))
    if fin != -1:
        r_tokens = get_tokens(tokenizer.tokenize(message[end2:fin]))
    in_tokens = get_tokens(tokenizer.tokenize(message[end1:start2]))
    print(l_tokens, in_tokens, r_tokens)
    tot_tokens = len(l_tokens) + len(in_tokens) + len(r_tokens) + 2
    while tot_tokens < FLAGS.sequence_length:
        r_tokens.append("UNK")
        tot_tokens += 1
    # left tokens
    l_matrix = []
    l_len = len(l_tokens)
    r_len = len(r_tokens)
    m_len = len(in_tokens)
    for idx, token in enumerate(l_tokens):
        word_vec, pv1, pv2 = word2vec(token), pos_vec[pivot + (idx - l_len)], \
            pos_vec[pivot + (idx - l_len - 1 - m_len)]
        l_matrix.append([word_vec, pv1, pv2])
    # middle tokens
    in_matrix = []
    for idx, token in enumerate(in_tokens):
        # offset by pivot, matching lexical_level_features above
        word_vec, pv1, pv2 = word2vec(token), pos_vec[idx + 1], pos_vec[idx - m_len + pivot]
        in_matrix.append([word_vec, pv1, pv2])
    # right tokens
    r_matrix = []
    for idx, token in enumerate(r_tokens):
        if token == "UNK":
            word_vec, pv1, pv2 = extra_emb, pos_vec[idx + m_len + 2], pos_vec[idx + 1]
        else:
            word_vec, pv1, pv2 = word2vec(token), pos_vec[idx + m_len + 2], pos_vec[idx + 1]
        r_matrix.append([word_vec, pv1, pv2])
    tri_gram = []
    llen = len(l_matrix)
    mlen = len(in_matrix)
    rlen = len(r_matrix)
    dist = llen + 1
    if llen > 0:
        if llen > 1:
            tri_gram.append(
                np.hstack((beg_emb, l_matrix[0][0], l_matrix[1][0],
                           l_matrix[0][1], l_matrix[0][2])))
            for i in range(1, len(l_matrix) - 1):
                tri_gram.append(
                    np.hstack((l_matrix[i - 1][0], l_matrix[i][0],
                               l_matrix[i + 1][0], l_matrix[i][1],
                               l_matrix[i][2])))
            tri_gram.append(
                np.hstack((l_matrix[llen - 2][0], l_matrix[llen - 1][0], l1,
                           l_matrix[llen - 1][1], l_matrix[llen - 2][2])))
        else:
            tri_gram.append(
                np.hstack((beg_emb, l_matrix[0][0], l1,
                           l_matrix[0][1], l_matrix[0][2])))
        if mlen > 0:
            tri_gram.append(
                np.hstack((l_matrix[llen - 1][0], l1, in_matrix[0][0],
                           pos_vec[0], pos_vec[pivot - dist])))
        else:
            tri_gram.append(
                np.hstack((l_matrix[llen - 1][0], l1, l2,
                           pos_vec[0], pos_vec[pivot - dist])))
    else:
        if mlen > 0:
            tri_gram.append(
                np.hstack((beg_emb, l1, in_matrix[0][0],
                           pos_vec[0], pos_vec[pivot - dist])))
        else:
            tri_gram.append(
                np.hstack((beg_emb, l1, l2, pos_vec[0], pos_vec[pivot - dist])))
    if mlen > 0:
        if mlen > 1:
            tri_gram.append(
                np.hstack((l1, in_matrix[0][0], in_matrix[1][0],
                           in_matrix[0][1], in_matrix[0][2])))
            for i in range(1, len(in_matrix) - 1):
                tri_gram.append(
                    np.hstack((in_matrix[i - 1][0], in_matrix[i][0],
                               in_matrix[i + 1][0], in_matrix[i][1],
                               in_matrix[i][2])))
            tri_gram.append(
                np.hstack((in_matrix[mlen - 2][0], in_matrix[mlen - 1][0], l2,
                           in_matrix[mlen - 1][1], in_matrix[mlen - 2][2])))
        else:
            tri_gram.append(
                np.hstack((l1, in_matrix[0][0], l2,
                           in_matrix[0][1], in_matrix[0][2])))
        if rlen > 0:
            tri_gram.append(
                np.hstack((in_matrix[mlen - 1][0], l2, r_matrix[0][0],
                           pos_vec[dist], pos_vec[0])))
        else:
            tri_gram.append(
                np.hstack((in_matrix[mlen - 1][0], l2, end_emb,
                           pos_vec[dist], pos_vec[0])))
    else:
        if rlen > 0:
            tri_gram.append(
                np.hstack((l1, l2, r_matrix[0][0], pos_vec[dist], pos_vec[0])))
        else:
            tri_gram.append(
                np.hstack((l1, l2, end_emb, pos_vec[dist], pos_vec[0])))
    if rlen > 0:
        if rlen > 1:
            tri_gram.append(
                np.hstack((l2, r_matrix[0][0], r_matrix[1][0],
                           r_matrix[0][1], r_matrix[0][2])))
            for i in range(1, len(r_matrix) - 1):
                tri_gram.append(
                    np.hstack((r_matrix[i - 1][0], r_matrix[i][0],
                               r_matrix[i + 1][0], r_matrix[i][1],
                               r_matrix[i][2])))
            tri_gram.append(
                np.hstack((r_matrix[rlen - 2][0], r_matrix[rlen - 1][0], end_emb,
                           r_matrix[rlen - 1][1], r_matrix[rlen - 2][2])))
        else:
            tri_gram.append(
                np.hstack((l2, r_matrix[0][0], end_emb,
                           r_matrix[0][1], r_matrix[0][2])))
    # tri_gram.append(np.hstack((l1, in_matrix[0][0], in_matrix[1][0],
    #                            in_matrix[0][1], in_matrix[0][2])))
    #
    # for idx in range(1, mlen - 1):
    #     tri_gram.append(
    #         np.hstack((in_matrix[idx - 1][0], in_matrix[idx][0],
    #                    in_matrix[idx + 1][0], in_matrix[idx][1],
    #                    in_matrix[idx][2])))
    # tri_gram.append(
    #     np.hstack((in_matrix[mlen - 2][0], in_matrix[mlen - 1][0], l2,
    #                in_matrix[mlen - 1][1], in_matrix[mlen - 1][2])))
    # tri_gram.append(np.hstack((in_matrix[mlen - 1][0], l2, end_emb,
    #                            pos_vec_entities[2], pos_vec_entities[3])))
    print("======================================")
    # lf = np.vstack((l1, l2, l3, l4))
    print(np.asarray(tri_gram).shape)
    return np.asarray(tri_gram)
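# Hedged sketch of calling generate_vector directly; the message and the
# character offsets of the two entity mentions are made up for illustration.
def _demo_generate_vector():
    msg = "i took ibuprofen and then got a headache."
    tri_gram = generate_vector(msg, 7, 16, 32, 40)  # (start1, end1, start2, end2)
    if tri_gram is not None:  # None means an entity had no usable words
        print(tri_gram.shape)  # (num_windows, 3 * embedding_size + 2 * distance_dim)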
def lexical_level_features(df):
    # Earlier iteration of the extractor: yields only the four lexical
    # vectors (the two entity averages plus the two neighbour-word averages).
    for index, row in df.iterrows():
        try:
            if index >= count:
                break
            print("======================================")
            print(index)
            message = row['Message']
            if not message:
                continue
            if row['drug-offset-start'] < row['sideEffect-offset-start']:
                start = (row['drug-offset-start'], row['drug-offset-end'])
            else:
                start = (row['sideEffect-offset-start'],
                         row['sideEffect-offset-end'])
            if row['drug-offset-end'] > row['sideEffect-offset-end']:
                end = (row['drug-offset-start'], row['drug-offset-end'])
            else:
                end = (row['sideEffect-offset-start'],
                       row['sideEffect-offset-end'])
            print(message)
            start1, start2 = start[0], end[0]
            end1, end2 = start[1], end[1]
            entity1, entity2 = message[start1:end1], message[start2:end2]
            l1 = [get_legit_word([word], 1) for word in tokenizer.tokenize(entity1)]
            l2 = [get_legit_word([word], 1) for word in tokenizer.tokenize(entity2)]
            # TODO add PCA for phrases
            temp = np.zeros(FLAGS.embedding_size)
            valid_words = 0
            print(entity1)
            print(l1)
            for word in l1:
                if word != "UNK" and data_helpers.is_word(word):
                    valid_words += 1
                    temp = np.add(temp, w2v(word))
            if valid_words == 0:  # guard against dividing by zero
                continue
            l1 = temp / float(valid_words)
            temp = np.zeros(FLAGS.embedding_size)
            valid_words = 0
            print(entity2)
            print(l2)
            for word in l2:
                if word != "UNK" and data_helpers.is_word(word):
                    valid_words += 1
                    temp = np.add(temp, w2v(word))
            if valid_words == 0:
                continue
            l2 = temp / float(valid_words)
            lword1 = w2v(get_legit_word(get_left_word(message, start1), 0))
            lword2 = w2v(get_legit_word(get_left_word(message, start2), 0))
            rword1 = w2v(get_legit_word(get_right_word(message, end1), 1))
            rword2 = w2v(get_legit_word(get_right_word(message, end2), 1))
            l3 = np.divide(np.add(lword1, rword1), 2.0)
            l4 = np.divide(np.add(lword2, rword2), 2.0)
            print(get_legit_word(get_left_word(message, start1), 0),
                  get_legit_word(get_left_word(message, start2), 0))
            print(get_legit_word(get_right_word(message, end1), 1),
                  get_legit_word(get_right_word(message, end2), 1))
            print("======================================")
            lf = np.vstack((l1, l2, l3, l4))
            relation = row['relType']
            if relation == "valid":
                y = [0, 1]
            else:
                y = [1, 0]
            yield np.asarray((lf, y))
        except Exception as e:
            print(e)
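# Hedged sketch of consuming the generator: both versions of
# lexical_level_features yield (features, label) pairs, so a caller can
# materialize a dataset like this (`df` is the annotated DataFrame loaded
# elsewhere in the project).
def _collect_dataset(df):
    pairs = list(lexical_level_features(df))
    x = np.asarray([features for features, _ in pairs])
    y = np.asarray([label for _, label in pairs])
    return x, y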