def cosine_test(train_text, test_text, num1, num2):
    """Score the TF-IDF similarity between two texts.

    Both texts are lower-cased and vectorised together with a fresh
    ``TfidfVectorizer``; the raw score is the dot product of the two
    resulting rows (a cosine when the vectoriser L2-normalises its rows,
    which is scikit-learn's default).

    Args:
        train_text: First text to compare.
        test_text: Second text to compare.
        num1: Forwarded unchanged to ``balance_result``.
        num2: Forwarded unchanged to ``balance_result``.

    Returns:
        ``0.0`` when either text is empty/``None`` or when neither text
        yields a usable token; otherwise the value ``balance_result``
        produces from the raw similarity.
    """
    if not train_text or not test_text:
        return 0.0

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(
            [train_text.lower(), test_text.lower()])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no token
        # survives tokenisation; score that the same as empty input.
        return 0.0

    # ``.toarray()`` replaces the ``.A`` shorthand, which recent SciPy
    # releases no longer provide on sparse matrices.  ``.dot`` is an
    # explicit matrix product; element [0, 1] is row 0 against row 1.
    result = tfidf_matrix.dot(tfidf_matrix.T).toarray()[0, 1]

    # NOTE(review): the meaning of the ``False`` flag is defined by
    # balance_result, which is not visible from this block.
    return balance_result(num1, num2, False, result)
def abbr_test(train_examples, test_examples, num1, num2):
    """Score abbreviation matches between test data and training data.

    The mode is chosen by the type of ``test_examples``:

    * ``str`` (metadata/label mode): ``train_examples`` is matched as a
      single string against the patterns returned by
      ``get_abbr_patterns(test_examples)``.  Returns ``1`` if any pattern
      matches and ``0`` otherwise; ``balance_result`` is not applied.
    * anything else (example mode): each distinct test example for which
      ``str.isupper()`` is true counts as one hit when at least one of its
      patterns matches at least one distinct training example.  The raw
      score ``2 * hits / (len(train set) + len(test set))`` is then passed
      to ``balance_result``.

    Args:
        train_examples: A string in label mode, otherwise an iterable of
            strings.
        test_examples: A string (label mode) or an iterable of strings.
        num1: Forwarded unchanged to ``balance_result``.
        num2: Forwarded unchanged to ``balance_result``.

    Returns:
        ``0``/``1`` in label mode.  In example mode, ``0.0`` when either
        distinct set has more than 50 entries or when both are empty;
        otherwise the value returned by ``balance_result``.
    """
    if isinstance(test_examples, str):
        patterns = get_abbr_patterns(test_examples)
        return 1 if any(p.match(train_examples) for p in patterns) else 0

    train_example_set = set(train_examples)
    test_example_set = set(test_examples)

    # Large sets are skipped outright (presumably to bound the
    # pattern x example matching cost -- confirm with the original author).
    if len(test_example_set) > 50 or len(train_example_set) > 50:
        return 0.0

    total = len(train_example_set) + len(test_example_set)
    if total == 0:
        # Nothing to compare; avoids dividing by zero below.
        return 0.0

    count_matches = 0
    for test_example in test_example_set:
        if not test_example.isupper():
            continue
        patterns = get_abbr_patterns(test_example)
        # One hit per test example, no matter how many patterns or
        # training examples match.
        if any(pattern.match(train_example)
               for pattern in patterns
               for train_example in train_example_set):
            count_matches += 1

    result = count_matches * 2.0 / total
    return balance_result(num1, num2, False, result)
def cosine_test(train_text, test_text, num1, num2):
    """Score the TF-IDF similarity between two texts.

    Both texts are lower-cased and vectorised together with a fresh
    ``TfidfVectorizer``; the raw score is the dot product of the two
    resulting rows (a cosine when the vectoriser L2-normalises its rows,
    which is scikit-learn's default).

    Args:
        train_text: First text to compare.
        test_text: Second text to compare.
        num1: Forwarded unchanged to ``balance_result``.
        num2: Forwarded unchanged to ``balance_result``.

    Returns:
        ``0.0`` when either text is empty/``None`` or when neither text
        yields a usable token; otherwise the value ``balance_result``
        produces from the raw similarity.
    """
    if not train_text or not test_text:
        return 0.0

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(
            [train_text.lower(), test_text.lower()])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no token
        # survives tokenisation; score that the same as empty input.
        return 0.0

    # ``.toarray()`` replaces the ``.A`` shorthand, which recent SciPy
    # releases no longer provide on sparse matrices.  ``.dot`` is an
    # explicit matrix product; element [0, 1] is row 0 against row 1.
    result = tfidf_matrix.dot(tfidf_matrix.T).toarray()[0, 1]

    # NOTE(review): the meaning of the ``False`` flag is defined by
    # balance_result, which is not visible from this block.
    return balance_result(num1, num2, False, result)
def mann_whitney_u_test(train_examples, test_examples, num1, num2):
    """Score two samples with the Mann-Whitney U test.

    Element ``[1]`` of the ``mannwhitneyu`` result (the p-value in SciPy's
    API) is passed to ``balance_result`` together with ``num1``/``num2``.

    Args:
        train_examples: First sample; must support ``len()``.
        test_examples: Second sample; must support ``len()``.
        num1: Forwarded unchanged to ``balance_result``.
        num2: Forwarded unchanged to ``balance_result``.

    Returns:
        ``0`` when either sample has fewer than two values or when a
        ``ValueError`` is raised while scoring; otherwise the value
        returned by ``balance_result``.
    """
    if len(train_examples) <= 1 or len(test_examples) <= 1:
        return 0

    try:
        result = mannwhitneyu(train_examples, test_examples)[1]
        # balance_result stays inside the ``try`` so that a ValueError it
        # raises is still swallowed, exactly as before.
        return balance_result(num1, num2, True, result)
    except ValueError as e:
        # Deliberate best-effort: an unusable sample scores 0 instead of
        # aborting the caller.  ``logging.warning`` replaces the deprecated
        # ``logging.warn`` alias (removed in Python 3.13).
        logging.warning("IGNORE EXCEPTION: %s", e)
        return 0
def coverage_test(train_examples, test_examples, num1, num2):
    """Score how much the two samples' 25th-75th percentile ranges overlap.

    The raw score is ``overlap / span``, where ``overlap`` is the length of
    the intersection of the two ranges and ``span`` is the distance from
    the smaller lower bound to the larger upper bound.

    Args:
        train_examples: First numeric sample.
        test_examples: Second numeric sample.
        num1: Forwarded unchanged to ``balance_result``.
        num2: Forwarded unchanged to ``balance_result``.

    Returns:
        ``0`` when either sample has fewer than two values, when the two
        ranges do not intersect, or when the combined span is zero;
        otherwise the value returned by ``balance_result``.
    """
    if len(train_examples) <= 1 or len(test_examples) <= 1:
        return 0

    train_hi = percentile(train_examples, 75)
    train_lo = percentile(train_examples, 25)
    test_hi = percentile(test_examples, 75)
    test_lo = percentile(test_examples, 25)

    span_hi = max(train_hi, test_hi)
    span_lo = min(train_lo, test_lo)

    # Disjoint ranges, or a degenerate zero-width span, carry no coverage.
    if test_lo > train_hi or train_lo > test_hi:
        return 0
    if span_hi == span_lo:
        return 0

    overlap = min(train_hi, test_hi) - max(train_lo, test_lo)
    result = overlap * 1.0 / (span_hi - span_lo)
    return balance_result(num1, num2, True, result)
def coverage_test(train_examples, test_examples, num1, num2):
    """Score how much the full value ranges of two samples overlap.

    Each sample's range runs from its 0th to its 100th percentile.  The raw
    score is the length of the intersection of the two ranges divided by
    the length of the range that encloses both.

    Args:
        train_examples: First numeric sample.
        test_examples: Second numeric sample.
        num1: Forwarded unchanged to ``balance_result``.
        num2: Forwarded unchanged to ``balance_result``.

    Returns:
        ``0`` when either sample has fewer than two values, when the ranges
        are disjoint, or when the enclosing range has zero width; otherwise
        the value returned by ``balance_result``.
    """
    enough_data = len(train_examples) > 1 and len(test_examples) > 1
    if not enough_data:
        return 0

    train_max = percentile(train_examples, 100)
    train_min = percentile(train_examples, 0)
    test_max = percentile(test_examples, 100)
    test_min = percentile(test_examples, 0)

    upper = max(train_max, test_max)
    lower = min(train_min, test_min)

    ranges_disjoint = test_min > train_max or train_min > test_max
    if ranges_disjoint or upper == lower:
        return 0

    shared_top = min(train_max, test_max)
    shared_bottom = max(train_min, test_min)
    result = (shared_top - shared_bottom) * 1.0 / (upper - lower)
    return balance_result(num1, num2, True, result)
def welch_test(train_examples, test_examples, num1, num2):
    """Score two samples with Welch's unequal-variance t-test.

    Element ``[1]`` of the ``ttest_ind`` result (the p-value in SciPy's
    API) is passed to ``balance_result`` together with ``num1``/``num2``.

    Args:
        train_examples: First numeric sample; must support ``len()``.
        test_examples: Second numeric sample; must support ``len()``.
        num1: Forwarded unchanged to ``balance_result``.
        num2: Forwarded unchanged to ``balance_result``.

    Returns:
        ``0`` when either sample has fewer than two values; otherwise the
        value returned by ``balance_result``.
    """
    if len(train_examples) <= 1 or len(test_examples) <= 1:
        return 0

    # ``equal_var`` must be passed by keyword: the third positional
    # parameter of scipy.stats.ttest_ind is ``axis``, so a positional
    # ``False`` selected axis 0 and left the pooled-variance (Student)
    # test in effect instead of Welch's.
    # NOTE(review): assumes ``ttest_ind`` is scipy.stats.ttest_ind.
    result = ttest_ind(train_examples, test_examples, equal_var=False)[1]
    return balance_result(num1, num2, True, result)
def kolmogorov_smirnov_test(train_examples, test_examples, num1, num2):
    """Score two samples with the two-sample Kolmogorov-Smirnov test.

    Element ``[1]`` of the ``ks_2samp`` result (the p-value in SciPy's API)
    is passed to ``balance_result`` together with ``num1``/``num2``.

    Args:
        train_examples: First numeric sample; must support ``len()``.
        test_examples: Second numeric sample; must support ``len()``.
        num1: Forwarded unchanged to ``balance_result``.
        num2: Forwarded unchanged to ``balance_result``.

    Returns:
        ``0`` when either sample has fewer than two values; otherwise the
        value returned by ``balance_result``.
    """
    if len(train_examples) <= 1 or len(test_examples) <= 1:
        return 0

    p_value = ks_2samp(train_examples, test_examples)[1]
    return balance_result(num1, num2, True, p_value)
def jaccard_test(train_examples, test_examples, num1, num2):
    """Score two example collections via ``jaccard_similarity``.

    Args:
        train_examples: First collection, handed to ``jaccard_similarity``.
        test_examples: Second collection, handed to ``jaccard_similarity``.
        num1: Forwarded unchanged to ``balance_result``.
        num2: Forwarded unchanged to ``balance_result``.

    Returns:
        The value ``balance_result`` produces from the similarity.
    """
    # NOTE(review): the meaning of the ``False`` flag is defined by
    # balance_result, which is not visible from this block.
    return balance_result(
        num1,
        num2,
        False,
        jaccard_similarity(train_examples, test_examples),
    )
def mann_whitney_u_test(train_examples, test_examples, num1, num2):
    """Score two samples with the Mann-Whitney U test.

    Element ``[1]`` of the ``mannwhitneyu`` result (the p-value in SciPy's
    API) is passed to ``balance_result`` together with ``num1``/``num2``.

    Args:
        train_examples: First sample; must support ``len()``.
        test_examples: Second sample; must support ``len()``.
        num1: Forwarded unchanged to ``balance_result``.
        num2: Forwarded unchanged to ``balance_result``.

    Returns:
        ``0`` when either sample has fewer than two values or when
        ``mannwhitneyu`` rejects the data with a ``ValueError``; otherwise
        the value returned by ``balance_result``.
    """
    if len(train_examples) <= 1 or len(test_examples) <= 1:
        return 0

    try:
        result = mannwhitneyu(train_examples, test_examples)[1]
    except ValueError as e:
        # Some SciPy versions raise ValueError for degenerate input (for
        # example when every value is identical).  Score that as 0 rather
        # than aborting the caller.
        import logging  # local import keeps this block self-contained

        logging.warning("IGNORE EXCEPTION: %s", e)
        return 0

    return balance_result(num1, num2, True, result)