Example #1
0
def fun_1_5_4():
    """Print NLTK's binary and MASI distances for two sample label collections.

    The nested ``binary_distance`` and ``masi_distance`` functions are
    illustrative copies of the algorithms and are never called; the printed
    values come from ``nltk.metrics``.
    """
    from nltk import metrics

    # Binary distance is a string-similarity metric: it returns 0.0 when the
    # two labels are identical and 1.0 otherwise.
    def binary_distance(label1, label2):
        return 0.0 if label1 == label2 else 1.0

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(metrics.binary_distance((10, 20, 30, 40), (30, 50, 70)))

    # When several labels are present, the MASI distance is based on partial
    # agreement. The masi distance algorithm shipped in the nltk.metrics
    # package is reproduced below.
    def masi_distance(label1, label2):
        len_intersection = len(label1.intersection(label2))
        len_union = len(label1.union(label2))
        len_label1 = len(label1)
        len_label2 = len(label2)
        if len_label1 == len_label2 and len_label1 == len_intersection:
            m = 1
        elif len_intersection == min(len_label1, len_label2):
            m = 0.67
        elif len_intersection > 0:
            m = 0.33
        else:
            m = 0
        return 1 - (len_intersection / float(len_union)) * m

    # The algorithm calls .intersection()/.union() on its arguments, so the
    # labels must be sets; tuples would raise AttributeError.
    print(metrics.masi_distance({10, 20, 30, 40}, {30, 50, 70}))
Example #2
0
def multi_metrics(multi_classifier, test_feats):
	"""Compute per-label precision/recall and the average MASI distance.

	Args:
		multi_classifier: object exposing ``classify(feat)`` (its result is
			iterated as labels and passed straight to
			``metrics.masi_distance``, so presumably a set -- confirm
			against the classifier) and ``labels()``.
		test_feats: iterable of ``(feat, labels)`` pairs.

	Returns:
		A ``(precisions, recalls, avg_md)`` tuple: two dicts keyed by each
		label from ``multi_classifier.labels()`` and the mean MASI distance
		over all instances. ``avg_md`` is 0.0 when ``test_feats`` is empty.
	"""
	mds = []
	# label -> indices of the instances carrying that label (gold / guessed)
	refsets = collections.defaultdict(set)
	testsets = collections.defaultdict(set)

	for i, (feat, labels) in enumerate(test_feats):
		for label in labels:
			refsets[label].add(i)

		guessed = multi_classifier.classify(feat)

		for label in guessed:
			testsets[label].add(i)

		mds.append(metrics.masi_distance(set(labels), guessed))

	# Guard against ZeroDivisionError when there is nothing to evaluate.
	avg_md = sum(mds) / float(len(mds)) if mds else 0.0
	precisions = {}
	recalls = {}

	for label in multi_classifier.labels():
		precisions[label] = metrics.precision(refsets[label], testsets[label])
		recalls[label] = metrics.recall(refsets[label], testsets[label])

	return precisions, recalls, avg_md
def multi_metrics(multi_classifier, test_feats):
	"""Evaluate a multi-label classifier on labelled feature sets.

	Args:
		multi_classifier: object providing ``classify(feat)`` and
			``labels()``. The ``classify`` result is iterated and handed
			unchanged to ``metrics.masi_distance`` (presumably a set of
			labels -- confirm against the classifier).
		test_feats: iterable of ``(feat, labels)`` pairs.

	Returns:
		``(precisions, recalls, avg_md)``: per-label precision and recall
		dicts for every label in ``multi_classifier.labels()``, and the
		average MASI distance, which is 0.0 when ``test_feats`` is empty.
	"""
	mds = []
	# Map each label to the set of instance indices that carry it.
	refsets = collections.defaultdict(set)
	testsets = collections.defaultdict(set)

	for i, (feat, labels) in enumerate(test_feats):
		for label in labels:
			refsets[label].add(i)

		guessed = multi_classifier.classify(feat)

		for label in guessed:
			testsets[label].add(i)

		mds.append(metrics.masi_distance(set(labels), guessed))

	if mds:
		avg_md = sum(mds) / float(len(mds))
	else:
		# No instances: avoid dividing by zero and report 0.0.
		avg_md = 0.0

	precisions = {}
	recalls = {}

	for label in multi_classifier.labels():
		precisions[label] = metrics.precision(refsets[label], testsets[label])
		recalls[label] = metrics.recall(refsets[label], testsets[label])

	return precisions, recalls, avg_md
Example #4
0
def masi(tk1, tk2):
    """Return ``masi_distance`` of the two token collections, each converted to a set."""
    first_tokens = set(tk1)
    second_tokens = set(tk2)
    return masi_distance(first_tokens, second_tokens)


# string jaro_similarity
# jaro_winkler_similarity
Example #5
0
def avg_masi_distance(multi_classifier, multi_label_feats):
    """Return the mean MASI distance between gold labels and classifier output.

    Args:
        multi_classifier: object whose ``classify(feat)`` result is compared
            against the gold labels with ``masi_distance``.
        multi_label_feats: iterable of ``(feat, labels)`` pairs.

    Returns:
        The average distance as a float, or 0.0 when there are no pairs.
    """
    distances = [
        masi_distance(labels, multi_classifier.classify(feat))
        for feat, labels in multi_label_feats
    ]

    # Nothing to average: report 0.0 instead of dividing by zero.
    if not distances:
        return 0.0

    return float(sum(distances)) / len(distances)
Example #6
0
def avg_masi_distance(multi_classifier, multi_label_feats):
	"""Average the MASI distance over ``(feat, labels)`` pairs.

	For each pair, ``masi_distance`` is applied to the gold ``labels`` and
	to ``multi_classifier.classify(feat)``. Returns 0.0 when
	``multi_label_feats`` yields no pairs.
	"""
	scores = [
		masi_distance(gold, multi_classifier.classify(feats))
		for feats, gold in multi_label_feats
	]
	# The conditional keeps the empty case from dividing by zero.
	return float(sum(scores)) / len(scores) if scores else 0.0
Example #7
0
    def score(self, lbl_types, ref_types, stemmed_word):
        """Return the MASI distance between the label types and reference types.

        An empty ``ref_types`` scores 1 outright. When ``stemmed_word`` is
        truthy, ``ref_types`` is first rewritten through
        ``self.replace_stem`` before the distance is computed.
        """
        # Hack: reference 23643 ends up empty after the rules are applied
        # (the "A*D" case from the csv file), so empty references score 1.
        if len(ref_types) == 0:
            return 1

        references = ref_types
        if stemmed_word:
            references = self.replace_stem(stemmed_word, references, lbl_types)

        return masi_distance(lbl_types, references)
def masi_distance_chunk(D):
    """Add a MASI-distance column for tokenized question pairs.

    For every row of ``D.loc[:, Q_WORD_TOKENIZED]``, the first two entries
    are parsed with ``literal_eval`` (presumably string representations of
    token lists -- confirm against the producer), converted to sets and
    compared with ``masi_distance``. The result is stored in the column
    named by ``MASI_DISTANCE``. An empty ``D`` is returned untouched.

    Returns:
        ``D`` itself, modified in place when it is non-empty.
    """
    # Nothing to compute for an empty chunk.
    if len(D) == 0:
        return D

    def _pair_distance(row):
        # Compare the two parsed token collections as sets.
        first_tokens = set(literal_eval(row[0]))
        second_tokens = set(literal_eval(row[1]))
        return masi_distance(first_tokens, second_tokens)

    D[MASI_DISTANCE] = D.loc[:, Q_WORD_TOKENIZED].apply(_pair_distance, axis=1)
    return D
Example #9
0
from nltk.metrics import (
    binary_distance,
    edit_distance,
    jaccard_distance,
    masi_distance,
)

# Two label sequences compared position by position.
training = 'PERSON OTHER PERSON OTHER OTHER ORGANIZATION'.split()
testing = 'PERSON OTHER OTHER OTHER OTHER OTHER'.split()

print(accuracy(training, testing))

# The set-based scores work on the distinct labels of each sequence.
trainset = set(training)
testset = set(testing)
print(precision(trainset, testset))  # precision
print(recall(trainset, testset))  # recall
print(f_measure(trainset, testset))

# Edit distance: copying costs 0; substitution, deletion and insertion cost 1.
print(edit_distance('relate', 'relation'))
print(edit_distance('suggestion', 'calculation'))

# Similarity measurement based on the Jaccard coefficient, |X ∩ Y| / |X ∪ Y|.
X = {10, 20, 30, 40}
Y = {20, 30, 60}
print(jaccard_distance(X, Y))

# Binary distance: 0.0 when the labels are identical, 1.0 otherwise.
print(binary_distance(X, Y))

# MASI distance, for the case where multiple labels are present.
print(masi_distance(X, Y))
Example #10
0
 def _masi_distance(self, s1, s2):
     """Return ``masi_distance`` of ``s1`` and ``s2`` after converting both to sets."""
     left = set(s1)
     right = set(s2)
     return masi_distance(left, right)