def calc_diffscore(blocks1, blocks2): scores = list() for i in range(len(blocks1)): s1 = blocks1[i].text s2 = blocks2[i].text if not s1: s1 = "" w1 = 0 else: s1 = str(s1).strip("\t\n ") w1 = len(s1) if not s2: s2 = "" w2 = 0 else: s2 = str(s2).strip("\t\n ") w2 = len(s2) if w1 == 0 and w2 == 0: scores.append(0.0) else: score = (w1 + w2 - 2 * pylcs.lcs(s1, s2)) / (w1 + w2) scores.append(score) return scores
def train_evaluate(self, batch_pred_tag, batch_text, batch_arguments,
                   text_map_seg_idxs, seg_idx_map_bert_idxs,
                   bert_idx_map_seg_idxs, seg_idx_map_texts, raw_texts):
    """Accumulate evaluation statistics over one batch.

    Returns the raw accumulators ``(X, Y, Z)`` where X is the summed
    LCS-based match score, Y the number of predicted arguments and Z the
    number of gold arguments; callers derive f1 = 2X/(Y+Z),
    precision = X/Y, recall = X/Z.  (Close to, but not guaranteed
    identical to, the official scorer.)
    """
    X, Y, Z = 1e-10, 1e-10, 1e-10
    batch = zip(batch_pred_tag, batch_text, batch_arguments,
                text_map_seg_idxs, seg_idx_map_bert_idxs,
                bert_idx_map_seg_idxs, seg_idx_map_texts, raw_texts)
    for (pred_tag, text, arguments, text_map_seg_idx, seg_idx_map_bert_idx,
         bert_idx_map_seg_idx, seg_idx_map_text, raw_text) in batch:
        # Gold arguments inverted so the (event, role) pair is the key.
        gold = {v: k for k, v in arguments.items()}
        predicted = bert_extract_arguments(
            text, pred_tag, self.schema,
            class_id=self.class_id,
            text_map_seg_idx=text_map_seg_idx,
            seg_idx_map_bert_idx=seg_idx_map_bert_idx,
            bert_idx_map_seg_idx=bert_idx_map_seg_idx,
            seg_idx_map_text=seg_idx_map_text,
            raw_text=raw_text)
        pred = {v: k for k, v in predicted.items()}
        Y += len(pred)
        Z += len(gold)
        for role_key, argument in pred.items():
            if role_key in gold:
                # Longest common subsequence length as a soft-match measure.
                overlap = pylcs.lcs(argument, gold[role_key])
                X += 2. * overlap / (len(argument) + len(gold[role_key]))
    return X, Y, Z
def evaluate(data):
    """Evaluation routine (close to, but not necessarily identical to,
    the official scorer)."""
    X, Y, Z = 1e-10, 1e-10, 1e-10
    for text, arguments in tqdm(data):
        # Gold labels inverted so (event_type, role) is the key, e.g.
        # {('org-layoff', 'headcount'): '4000'}.
        gold = {v: k for k, v in arguments.items()}
        # Predictions come back keyed by the argument text; invert them
        # the same way for comparison.
        predicted = extract_arguments(text)
        pred = {v: k for k, v in predicted.items()}
        Y += len(pred)   # predicted-argument count
        Z += len(gold)   # gold-argument count
        for role_key, argument in pred.items():
            if role_key in gold:
                # LCS length measures how well the two spans overlap;
                # a perfect span match contributes exactly 1.
                overlap = pylcs.lcs(argument, gold[role_key])
                X += 2. * overlap / (len(argument) + len(gold[role_key]))
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    return f1, precision, recall
def get_stop_id(stop_name):
    """Return the StopId of the stop whose name shares the longest common
    subsequence with ``stop_name``, or -1 when no stop overlaps at all.

    Ties keep the first stop encountered (strict ``>`` comparison).
    """
    best_len, best_id = 0, -1
    for stop in stops:
        overlap = pylcs.lcs(stop["Name"], stop_name)
        if overlap > best_len:
            best_len, best_id = overlap, stop["StopId"]
    return best_id
def compute_acc_list(true_list, pred_list):
    """Per-pair precision/recall/F1 based on LCS length.

    For each (truth, prediction) pair: precision = LCS/len(prediction),
    recall = LCS/len(truth) (0.0 when the respective string is empty),
    and the harmonic F1 (0.0 when there is no overlap at all).

    :return: three parallel lists (precision, recall, f1)
    """
    assert len(true_list) == len(pred_list)
    p_list, r_list, f1_list = [], [], []
    for truth, pred in zip(true_list, pred_list):
        overlap = pylcs.lcs(truth, pred)
        precision = float(overlap) / len(pred) if len(pred) > 0 else 0.0
        recall = float(overlap) / len(truth) if len(truth) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if overlap > 0 else 0.0
        p_list.append(precision)
        r_list.append(recall)
        f1_list.append(f1)
    return p_list, r_list, f1_list
def longest_common_subsequence(self, config, sentence1, sentence2):
    """Length of the longest common subsequence (default) or longest
    common substring of the two sentences' texts.

    Subsequence example: ("We ate a delicious pizza",
    "We ate a not so delicious pizza") -> "We ate a delicious pizza".
    Substring example: same inputs -> " delicious pizza".

    :param config: dict; optional key 'mode' selects 'subsequence'
                   (default) or 'substring'
    :param sentence1: pair whose second element is the sentence text
    :param sentence2: pair whose second element is the sentence text
    :return: int length of the common subsequence/substring
    """
    # dict.get with a default replaces the `'mode' in config` ternary.
    mode = config.get('mode', 'subsequence')
    if mode == 'subsequence':
        return pylcs.lcs(sentence1[1], sentence2[1])
    # Any other value falls through to substring mode, as before.
    return pylcs.lcs2(sentence1[1], sentence2[1])
def checkPatternSimilarity(parentPatternsDict, toCheckDict):
    """Compute the LCS-based similarity of each new pattern to every
    parent pattern.

    :param parentPatternsDict: {key: (pattern_string, ...)} reference patterns
    :param toCheckDict: {key: (pattern_string, ...)} patterns of the new program
    :return: nested dict {toCheckKey: {parentKey: lcs_len / len(check_pattern)}},
             or -1 when ``toCheckDict`` is empty/falsy
    """
    if not toCheckDict:
        # Typo fixed: "doest not" -> "does not".
        print("The new program does not contain OpenMP Code")
        return -1
    _res = defaultdict(dict)
    for toCheckKey, toCheckValue in toCheckDict.items():
        for parKey, parValue in parentPatternsDict.items():
            lcs_len = pylcs.lcs(toCheckValue[0], parValue[0])
            # NOTE(review): an empty pattern string would divide by zero —
            # assumed non-empty upstream; confirm with callers.
            lcs_perc = lcs_len / len(toCheckValue[0])
            _res[toCheckKey][parKey] = lcs_perc
    return _res
def producer2(queue, datum):
    """Worker: for each JADER drug name, rank the DrugBank names by a
    combined similarity score and push the 10 best candidates on ``queue``.

    Score = (edit_distance - LCS) / log(len(candidate)); lower is better.

    :param queue: queue receiving ``[drugJader, candidates]`` items
    :param datum: pair ``(segdrugJader, drugBank)`` — names to match and
                  the candidate pool
    """
    segdrugJader, drugBank = datum
    for jader_name in segdrugJader:
        # NOTE(review): np.log(len(name)) is 0 for one-character candidate
        # names, which would divide by zero — assumed len >= 2; confirm.
        raw_scores = [
            (ed.eval(jader_name, bank_name) - pylcs.lcs(jader_name, bank_name))
            / np.log(len(bank_name))
            for bank_name in drugBank
        ]
        scores = np.asarray(raw_scores)
        order = np.argsort(scores)
        # Sanity check: argsort produced an ascending ordering.
        assert scores[order[0]] <= scores[order[1]]
        candidates = [drugBank[i] for i in order[:10]]
        queue.put([jader_name, candidates])
def evaluate(data, model, CRF):
    """Evaluation routine (close to, but not necessarily identical to,
    the official scorer)."""
    X, Y, Z = 1e-10, 1e-10, 1e-10
    for text, arguments in tqdm(data):
        # Gold arguments inverted so the (event, role) pair is the key.
        gold = {v: k for k, v in arguments.items()}
        predicted = extract_arguments(text, model, CRF)
        pred = {v: k for k, v in predicted.items()}
        Y += len(pred)
        Z += len(gold)
        for role_key, argument in pred.items():
            if role_key in gold:
                # LCS length as a soft span-match measure.
                overlap = pylcs.lcs(argument, gold[role_key])
                X += 2. * overlap / (len(argument) + len(gold[role_key]))
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    return f1, precision, recall
def evaluate(data):
    """Evaluation routine (close to, but not necessarily identical to,
    the official scorer)."""
    X, Y, Z = 1e-10, 1e-10, 1e-10
    for text, arguments in tqdm(data):
        for subject, pk_v in arguments.items():
            # pk_v is an iterable of gold (key, value) pairs for this
            # subject; dict() is equivalent to the original comprehension.
            gold = dict(pk_v)
            predicted = extract_arguments(text, subject)
            # Invert predictions so they are keyed the same way as gold.
            pred = {v: k for k, v in predicted.items()}
            Y += len(pred)
            Z += len(gold)
            for label, span in pred.items():
                if label in gold:
                    # LCS length as a soft span-match measure.
                    overlap = pylcs.lcs(span, gold[label])
                    X += 2. * overlap / (len(span) + len(gold[label]))
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    return f1, precision, recall
def evaluate(self, batch_pred_tag, batch_text, batch_arguments,
             text_map_seg_idxs, seg_idx_map_bert_idxs,
             bert_idx_map_seg_idxs, seg_idx_map_texts, raw_texts):
    """Accumulate evaluation statistics over one batch.

    Returns ``(X, Y, Z)``: X is the summed per-argument LCS-based F1,
    Y the number of predicted arguments, Z the number of gold arguments;
    callers derive f1 = 2X/(Y+Z), precision = X/Y, recall = X/Z.
    (Close to, but not guaranteed identical to, the official scorer.)
    """
    X, Y, Z = 1e-10, 1e-10, 1e-10
    batch = zip(batch_pred_tag, batch_text, batch_arguments,
                text_map_seg_idxs, seg_idx_map_bert_idxs,
                bert_idx_map_seg_idxs, seg_idx_map_texts, raw_texts)
    for (pred_tag, text, arguments, text_map_seg_idx, seg_idx_map_bert_idx,
         bert_idx_map_seg_idx, seg_idx_map_text, raw_text) in batch:
        # Gold arguments inverted so the (event, role) pair is the key.
        gold = {v: k for k, v in arguments.items()}
        predicted = bert_extract_arguments(
            text, pred_tag, self.schema,
            class_id=self.class_id,
            text_map_seg_idx=text_map_seg_idx,
            seg_idx_map_bert_idx=seg_idx_map_bert_idx,
            bert_idx_map_seg_idx=bert_idx_map_seg_idx,
            seg_idx_map_text=seg_idx_map_text,
            raw_text=raw_text)
        pred = {v: k for k, v in predicted.items()}
        Y += len(pred)
        Z += len(gold)
        for role_key, argument in pred.items():
            if role_key in gold:
                # Gold span is the part before the first underscore.
                gold_span = gold[role_key].split('_')[0]
                # LCS length as a soft span-match measure; the small
                # epsilon keeps the F1 denominator non-zero when the
                # overlap is 0 (added AFTER the division, as originally).
                overlap = pylcs.lcs(argument, gold_span)
                precision = overlap / len(argument) + 0.000001
                recall = overlap / len(gold_span) + 0.000001
                X += 2 * precision * recall / (precision + recall)
    return X, Y, Z
def evaluate(batch_pred_tag, batch_text, batch_arguments):
    """Accumulate evaluation statistics over one batch.

    Returns ``(X, Y, Z)``: summed LCS match score, predicted-argument
    count and gold-argument count; callers derive f1 = 2X/(Y+Z),
    precision = X/Y, recall = X/Z.  (Close to the official scorer.)
    """
    X, Y, Z = 1e-10, 1e-10, 1e-10
    for pred_tag, text, arguments in zip(batch_pred_tag, batch_text,
                                         batch_arguments):
        # Gold arguments inverted so the (event, role) pair is the key.
        gold = {v: k for k, v in arguments.items()}
        predicted = bert_extract_arguments(text, pred_tag, schema)
        pred = {v: k for k, v in predicted.items()}
        Y += len(pred)
        Z += len(gold)
        for role_key, argument in pred.items():
            if role_key in gold:
                # LCS length as a soft span-match measure.
                overlap = pylcs.lcs(argument, gold[role_key])
                X += 2. * overlap / (len(argument) + len(gold[role_key]))
    return X, Y, Z
def search(self, key, value):
    """Return True if this representative matches the search criterion.

    :param key: property name — one of 'source', 'name', 'chamber',
                'alive', 'party', 'state', 'district', 'active'
    :param value: value searched for (type depends on ``key``)
    :return: True/False; an unknown key prints a warning and returns
             False.  NOTE(review): a 'chamber' value other than
             'House'/'Senate' falls through and implicitly returns
             None — confirm this is intended.
    """
    if key == 'source':
        return value in self.sources.values()
    elif key == 'name':
        # Compare lowercase a-z letters only; it is a match when every
        # letter of the query appears in order within the name
        # (i.e. the LCS consumes the whole query).
        query = ''.join(ch for ch in value.lower() if 'a' <= ch <= 'z')
        name = ''.join(ch for ch in self.basics['name'].lower()
                       if 'a' <= ch <= 'z')
        return pylcs.lcs(query, name) == len(query)
    elif key == 'chamber':
        if value == 'House':
            return self.basics['title'] == 'Representative'
        elif value == 'Senate':
            return self.basics['title'] == 'Senator'
    elif key == 'alive':
        return not self.basics['death'] == value
    elif key == 'party':
        return value == self.get_current_party()
    elif key == 'state':
        return us.states.lookup(value).name == self.get_state()
    elif key == 'district':
        state, dist = value
        state_name = us.states.lookup(state).name
        return state_name == self.get_state() and dist == self.get_district()
    elif key == 'active':
        return value == self.get_active()
    else:
        print('Unknown property for representative. Returning False')
        return False
def pseudo_summary(texts):
    """Greedily build a pseudo-labelled summary from a list of sentences.

    Each round moves the sentence whose transfer maximizes the LCS
    between the remaining source and the grown target, stopping when a
    single source sentence remains or the target/source length ratio
    exceeds ``summary_rate``.  Returns ``(source, target)`` with the
    shorter text as the target.
    """
    source_idxs, target_idxs = list(range(len(texts))), []
    while True:
        # Score every candidate move: the LCS between the source without
        # the sentence and the target with it.
        sims = []
        for idx in source_idxs:
            trial_source = gather_join(
                texts, [j for j in source_idxs if j != idx])
            trial_target = gather_join(texts, sorted(target_idxs + [idx]))
            sims.append(pylcs.lcs(trial_source, trial_target))
        best = source_idxs[np.argmax(sims)]
        source_idxs.remove(best)
        target_idxs = sorted(target_idxs + [best])
        source = gather_join(texts, source_idxs)
        target = gather_join(texts, target_idxs)
        if (len(source_idxs) == 1
                or 1.0 * len(target) / len(source) > summary_rate):
            break
    # Convention: the shorter side is the summary.
    if len(source) < len(target):
        source, target = target, source
    return source, target
def compute_normalized(s1, s2):
    """Normalized LCS similarity in [0, 1]: the LCS length divided by the
    length of the longer string (1.0 for identical strings).

    :param s1: first string
    :param s2: second string
    :return: float similarity; two empty strings are treated as
             identical and return 1.0 (previously ZeroDivisionError)
    """
    max_len = max(len(s1), len(s2))
    if max_len == 0:
        # Both strings empty: identical by definition.
        return 1.0
    # float() replaces the non-idiomatic .__float__() call.
    lcs_len = float(pylcs.lcs(s1, s2))
    return 1 - (max_len - lcs_len) / max_len
def lcs(s1, s2):
    """Return the length of the longest common subsequence of ``s1`` and
    ``s2`` (thin wrapper around ``pylcs.lcs``)."""
    return pylcs.lcs(s1, s2)
def pylcs_len(self, A, B):
    """Return the length of the longest common subsequence of strings
    ``A`` and ``B`` (delegates to ``pylcs.lcs``)."""
    return pylcs.lcs(A, B)
def lexsim(val1, val2):
    """Lexical similarity (as a percentage) combining the LCS ratio and
    the edit distance: 100 * (LCS / max_len) / edit_distance.

    NOTE(review): identical strings give edit_distance 0 and would divide
    by zero — confirm callers never compare equal values.
    """
    # Longest-common-subsequence ratio (LCSR).
    lcsr = pylcs.lcs(val1, val2) / max(len(val1), len(val2))
    return lcsr / edit_distance(val1, val2) * 100
def test_lcs():
    """Sanity checks for pylcs.lcs (longest common subsequence length)."""
    # No characters in common.
    assert pylcs.lcs("aaa", "bbb") == 0
    # "aaa" is a subsequence of "aabbbaa".
    assert pylcs.lcs("aaa", "aabbbaa") == 3
    # Non-ASCII text is supported; disjoint strings still score 0.
    assert pylcs.lcs("你好", "中国") == 0
    # Only the two shared characters count.
    assert pylcs.lcs("aaa你好", "你好呀") == 2
n2 = len(x2) S = np.zeros((n1, n2)) for i in range(n1): for j in range(n2): if x1[i] == x2[j]: S[i, j] = s else: S[i, j] = -s H = np.zeros((n1 + 1, n2 + 1)) max2 = np.zeros(n1 + 1) for j in range(1, n2 + 1): for t in range(n1 + 1): max2[t] = max(max2[t], H[t, j - 1]) - w max1 = 0 for i in range(1, n1 + 1): max1 = max(H[i - 1, j], max1) - w H[i, j] = max(max(H[i - 1, j - 1] + S[i - 1, j - 1], max1), max2[i], 0) return np.max(H) x = 'aaabaaab' y = 'aaabaaabaaababba' a = pylcs.lcs(x, y) / max(len(x), len(y)) print(a) a = pylcs.smith_w('aaabaaabaaabaaab', 'aaabaaabaaababba', 0.03, 0.03) print(a) c = pylcs.get_lcs([45, 65], []) print(c)
def compare_password(self, password, new_password):
    """Return True when the new password is sufficiently different from
    the old one, i.e. their LCS covers less than 80% of the new
    password's length.
    """
    overlap = pylcs.lcs(password, new_password)
    return overlap / len(new_password) < 0.8
def cal_lcs(text1, text2):
    """Normalized LCS similarity: LCS length divided by the length of the
    longer text.

    NOTE(review): two empty strings divide by zero — assumed non-empty
    input; confirm with callers.
    """
    longer = max(len(text1), len(text2))
    return pylcs.lcs(text1, text2) / longer