def fun_1_5_4():
    """Print sample results of NLTK's binary distance and Masi distance.

    The two nested functions are not called; they are kept as a readable
    reference of the algorithms whose ``nltk.metrics`` counterparts are
    invoked below.
    """
    from nltk import metrics

    # Binary distance is a string-similarity metric. It returns 0.0 if the
    # two labels are identical; otherwise it returns 1.0.
    def binary_distance(label1, label2):
        return 0.0 if label1 == label2 else 1.0

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(metrics.binary_distance((10, 20, 30, 40), (30, 50, 70)))

    # When multiple labels are present, Masi distance is based on partial
    # agreement. The Python code of the masi distance algorithm included in
    # the nltk.metrics package is as follows.
    def masi_distance(label1, label2):
        len_intersection = len(label1.intersection(label2))
        len_union = len(label1.union(label2))
        len_label1 = len(label1)
        len_label2 = len(label2)
        if len_label1 == len_label2 and len_label1 == len_intersection:
            m = 1
        elif len_intersection == min(len_label1, len_label2):
            m = 0.67
        elif len_intersection > 0:
            m = 0.33
        else:
            m = 0
        return 1 - (len_intersection / float(len_union)) * m

    # The algorithm calls .intersection()/.union() on its first argument, so
    # the labels must be sets; tuples would raise AttributeError.
    print(metrics.masi_distance({10, 20, 30, 40}, {30, 50, 70}))
def multi_metrics(multi_classifier, test_feats):
    """Compute per-label precision/recall and the average Masi distance.

    Args:
        multi_classifier: object providing ``classify(feat)``, which returns
            the collection of guessed labels for one feature set, and
            ``labels()``, which yields every label to report on.
        test_feats: iterable of ``(feat, labels)`` pairs, where ``labels``
            is the collection of reference labels for ``feat``.

    Returns:
        A ``(precisions, recalls, avg_md)`` tuple. ``precisions`` and
        ``recalls`` are dicts keyed by label; ``avg_md`` is the mean Masi
        distance between reference and guessed labels, or ``0.0`` when
        ``test_feats`` is empty.
    """
    mds = []
    # For each label, the indexes of the instances that carry it in the
    # reference data (refsets) and in the classifier output (testsets).
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feat, labels) in enumerate(test_feats):
        for label in labels:
            refsets[label].add(i)

        guessed = multi_classifier.classify(feat)

        for label in guessed:
            testsets[label].add(i)

        mds.append(metrics.masi_distance(set(labels), guessed))

    # Guard against an empty evaluation set instead of dividing by zero.
    avg_md = sum(mds) / float(len(mds)) if mds else 0.0

    precisions = {}
    recalls = {}

    for label in multi_classifier.labels():
        precisions[label] = metrics.precision(refsets[label], testsets[label])
        recalls[label] = metrics.recall(refsets[label], testsets[label])

    return precisions, recalls, avg_md
def masi(tk1, tk2):
    """Return the Masi distance between two iterables, compared as sets."""
    first = set(tk1)
    second = set(tk2)
    return masi_distance(first, second)
# string jaro_similarity
# jaro_winkler_similarity
def avg_masi_distance(multi_classifier, multi_label_feats):
    """Return the mean Masi distance between reference and guessed labels.

    Each ``(feat, labels)`` pair in ``multi_label_feats`` is scored by
    comparing ``labels`` with ``multi_classifier.classify(feat)``. An empty
    input yields ``0.0``.
    """
    distances = [
        masi_distance(labels, multi_classifier.classify(feat))
        for feat, labels in multi_label_feats
    ]

    if not distances:
        return 0.0

    return float(sum(distances)) / len(distances)
def score(self, lbl_types, ref_types, stemmed_word):
    """Return the Masi distance between ``lbl_types`` and ``ref_types``.

    An empty ``ref_types`` scores ``1`` without any comparison. When
    ``stemmed_word`` is truthy, ``ref_types`` is first replaced by the
    result of ``self.replace_stem(stemmed_word, ref_types, lbl_types)``.
    """
    # HACK: a reference can be empty once the rules have been applied
    # (seen with ref 23643, the "A*D" case from the csv file).
    if len(ref_types) == 0:
        return 1

    if stemmed_word:
        ref_types = self.replace_stem(stemmed_word, ref_types, lbl_types)

    return masi_distance(lbl_types, ref_types)
def masi_distance_chunk(D):
    '''
    Computes the masi distance for every row of tokenized question pairs
    and stores it in the column named by MASI_DISTANCE.

    The cells at positions 0 and 1 of each row taken from the
    Q_WORD_TOKENIZED columns are parsed with literal_eval and compared as
    sets. D is modified in place and returned; an empty D is returned
    untouched.
    '''
    def _pair_distance(row):
        return masi_distance(set(literal_eval(row[0])),
                             set(literal_eval(row[1])))

    if len(D) == 0:
        return D

    token_pairs = D.loc[:, Q_WORD_TOKENIZED]
    D[MASI_DISTANCE] = token_pairs.apply(_pair_distance, axis=1)
    return D
# Demo of several nltk.metrics scores on two small label sequences.
training = 'PERSON OTHER PERSON OTHER OTHER ORGANIZATION'.split()
testing = 'PERSON OTHER OTHER OTHER OTHER OTHER'.split()

print(accuracy(training, testing))

# The set-based scores below work on the distinct labels only.
trainset, testset = set(training), set(testing)

print(precision(trainset, testset))  # precision
print(recall(trainset, testset))  # recall
print(f_measure(trainset, testset))

# Edit distance: copying costs 0; substitution, deletion and insertion
# each cost 1.
from nltk.metrics import edit_distance

print(edit_distance('relate', 'relation'))
print(edit_distance('suggestion', 'calculation'))

# Similarity measurement with the Jaccard coefficient, |X ∩ Y| / |X ∪ Y|.
from nltk.metrics import jaccard_distance

X = {10, 20, 30, 40}
Y = {20, 30, 60}
print(jaccard_distance(X, Y))

# Binary distance: 0.0 when the labels are identical, otherwise 1.0.
from nltk.metrics import binary_distance

print(binary_distance(X, Y))

# Masi distance, for the case where multiple labels are present.
from nltk.metrics import masi_distance

print(masi_distance(X, Y))
def _masi_distance(self, s1, s2):
    """Return the Masi distance between ``s1`` and ``s2`` taken as sets."""
    left = set(s1)
    right = set(s2)
    return masi_distance(left, right)