def splitDirections(self, directions_fn, scores_fn, names_fn, low_threshold, high_threshold):
    """Split directions into a high-scoring and a low-scoring group.

    An entry whose score is >= high_threshold goes to the high group; any
    other entry whose score is >= low_threshold goes to the low group;
    entries scoring below low_threshold are left out of both.

    Parameters:
        directions_fn: file handed to dt.importVectors.
        scores_fn: file handed to dt.importString; every entry is stripped
            and parsed with float().
        names_fn: file handed to dt.importString.
        low_threshold: inclusive lower bound of the low group.
        high_threshold: inclusive lower bound of the high group.

    Returns:
        (high_direction_names, low_direction_names,
         high_directions, low_directions). Within each group the entries
        follow the reversed output of dt.sortByArray(indexes, scores).
    """
    directions = dt.importVectors(directions_fn)
    score_values = [float(text.strip()) for text in dt.importString(scores_fn)]
    names = dt.importString(names_fn)

    high_positions, high_values = [], []
    low_positions, low_values = [], []
    for position, value in enumerate(score_values):
        if value >= high_threshold:
            high_positions.append(position)
            high_values.append(value)
        elif value >= low_threshold:
            low_positions.append(position)
            low_values.append(value)

    # presumably this yields best score first -- confirm against
    # dt.sortByArray's ordering and tie handling.
    ranked_high = dt.sortByArray(high_positions, high_values)
    ranked_low = dt.sortByArray(low_positions, low_values)
    ranked_high.reverse()
    ranked_low.reverse()

    # The first six characters of every name are cut off
    # (presumably a fixed-width prefix -- TODO confirm).
    high_directions = [directions[position] for position in ranked_high]
    high_direction_names = [names[position][6:] for position in ranked_high]
    low_directions = [directions[position] for position in ranked_low]
    low_direction_names = [names[position][6:] for position in ranked_low]

    return high_direction_names, low_direction_names, high_directions, low_directions
def splitDirections(self, directions_fn, scores_fn, names_fn, low_threshold, high_threshold):
    """Split directions into a high-scoring and a low-scoring group.

    An entry whose score is >= high_threshold goes to the high group; any
    other entry whose score is >= low_threshold goes to the low group;
    entries scoring below low_threshold are left out of both.

    Parameters:
        directions_fn: file handed to dt.importVectors.
        scores_fn: file handed to dt.importString; every entry is stripped
            and parsed with float().
        names_fn: file handed to dt.importString.
        low_threshold: inclusive lower bound of the low group.
        high_threshold: inclusive lower bound of the high group.

    Returns:
        (high_direction_names, low_direction_names,
         high_directions, low_directions). Within each group the entries
        follow the reversed output of dt.sortByArray(indexes, scores).
    """
    directions = dt.importVectors(directions_fn)
    score_values = [float(text.strip()) for text in dt.importString(scores_fn)]
    names = dt.importString(names_fn)

    high_positions, high_values = [], []
    low_positions, low_values = [], []
    for position, value in enumerate(score_values):
        if value >= high_threshold:
            high_positions.append(position)
            high_values.append(value)
        elif value >= low_threshold:
            low_positions.append(position)
            low_values.append(value)

    # presumably this yields best score first -- confirm against
    # dt.sortByArray's ordering and tie handling.
    ranked_high = dt.sortByArray(high_positions, high_values)
    ranked_low = dt.sortByArray(low_positions, low_values)
    ranked_high.reverse()
    ranked_low.reverse()

    # The first six characters of every name are cut off
    # (presumably a fixed-width prefix -- TODO confirm).
    high_directions = [directions[position] for position in ranked_high]
    high_direction_names = [names[position][6:] for position in ranked_high]
    low_directions = [directions[position] for position in ranked_low]
    low_direction_names = [names[position][6:] for position in ranked_low]

    return high_direction_names, low_direction_names, high_directions, low_directions
def __init__(self, direction_fn, ppmi_fn, phrases_fn, phrases_to_check_fn, fn):
    """Score every direction against its PPMI row and write the results.

    The PPMI matrix is transposed so that row i lines up with line i of
    direction_fn and with phrases[i]. For each processed line the method
    collects Spearman and Kendall correlations (with p-values), a gini
    score, an NDCG score, a placeholder MAP value of 0 and the mean of the
    non-zero PPMI entries, then writes one file per metric under
    "RuleType/".

    Parameters:
        direction_fn: text file, one whitespace-separated float vector
            per line.
        ppmi_fn: file handed to dt.importLabels.
        phrases_fn: file handed to dt.importString.
        phrases_to_check_fn: "" to process every line; otherwise a file of
            phrases, and only lines whose phrase (minus its first six
            characters) equals one of them are processed.
        fn: name fragment inserted into every output file name.
    """
    ppmi = np.asarray(dt.importLabels(ppmi_fn))
    phrases = dt.importString(phrases_fn)

    filter_active = phrases_to_check_fn != ""
    selected = set()
    if filter_active:
        wanted = dt.importString(phrases_to_check_fn)
        selected = set(
            position
            for target in wanted
            for position, phrase in enumerate(phrases)
            if target == phrase[6:]
        )

    ppmi = ppmi.transpose()
    print(str(len(ppmi)) + " " + str(len(ppmi[0])))

    scores = []
    pvalues = []
    scores_kendall = []
    pvalues_kendall = []
    agini = []
    amap = []
    andcg = []
    averages = []

    with open(direction_fn) as f:
        for counter, line in enumerate(f):
            if filter_active and counter not in selected:
                continue

            direction = [float(token) for token in line.split()]
            ppmi_row = ppmi[counter]

            # NOTE(review): np.argsort returns the permutation that sorts
            # its input, not the rank of each element -- confirm that this
            # is what the correlations below are meant to compare.
            direction_rank = np.argsort(direction)
            ppmi_rank = np.argsort(ppmi_row)

            # Only positions with a non-zero PPMI value take part.
            active = [d for d, value in enumerate(ppmi_row) if value != 0]
            total = sum(ppmi_row[d] for d in active)
            amt = len(active)
            new_direction = [direction_rank[d] for d in active]
            new_ppmi = [ppmi_rank[d] for d in active]
            # NOTE(review): a row without non-zero entries makes amt zero.
            average = total / amt

            min_max_scaler = preprocessing.MinMaxScaler()
            normalized_ppmi = min_max_scaler.fit_transform(ppmi_row)
            normalized_dir = min_max_scaler.fit_transform(direction)
            ginis = gini(normalized_ppmi, normalized_dir)

            ranked_ppmi = dt.sortByArray(new_ppmi, new_direction)
            nr_ppmi = min_max_scaler.fit_transform(ranked_ppmi)
            ndcgs = ndcg_at_k(nr_ppmi, len(nr_ppmi))

            # MAP is not computed; a constant 0 is recorded in its place.
            map_placeholder = 0

            rho, pvalue = spearmanr(new_ppmi, new_direction)
            rhok, pvaluek = kendalltau(new_ppmi, new_direction)

            scores.append(rho)
            pvalues.append(pvalue)
            scores_kendall.append(rhok)
            pvalues_kendall.append(pvaluek)
            andcg.append(ndcgs)
            agini.append(ginis)
            amap.append(map_placeholder)
            averages.append(average)
            print(" ".join([phrases[counter] + ":", str(map_placeholder), str(ginis)]))

    # NOTE(review): the ".names" file always receives the full phrase
    # list, while the metric files hold one entry per processed line, so
    # they differ in length whenever the phrase filter is active.
    outputs = [
        (scores, "RuleType/s", ".score"),
        (pvalues, "RuleType/s", ".pvalue"),
        (scores_kendall, "RuleType/k", ".score"),
        (pvalues_kendall, "RuleType/k", ".pvalue"),
        (phrases, "RuleType/", ".names"),
        (averages, "RuleType/", ".averages"),
        (agini, "RuleType/gn", ".score"),
        (andcg, "RuleType/ndcg", ".score"),
        (amap, "RuleType/map", ".score"),
    ]
    for values, prefix, suffix in outputs:
        dt.write1dArray(values, prefix + fn + suffix)
def __init__(self, direction_fn, ppmi_fn, phrases_fn, phrases_to_check_fn, fn):
    """Score every direction against its PPMI row and write the results.

    The PPMI matrix is transposed so that row i lines up with line i of
    direction_fn and with phrases[i]. For each processed line the method
    collects Spearman and Kendall correlations (with p-values), a gini
    score, an NDCG score, a placeholder MAP value of 0 and the mean of the
    non-zero PPMI entries, then writes one file per metric under
    "RuleType/".

    Parameters:
        direction_fn: text file, one whitespace-separated float vector
            per line.
        ppmi_fn: file handed to dt.importLabels.
        phrases_fn: file handed to dt.importString.
        phrases_to_check_fn: "" to process every line; otherwise a file of
            phrases, and only lines whose phrase (minus its first six
            characters) equals one of them are processed.
        fn: name fragment inserted into every output file name.
    """
    ppmi = np.asarray(dt.importLabels(ppmi_fn))
    phrases = dt.importString(phrases_fn)

    filter_active = phrases_to_check_fn != ""
    selected = set()
    if filter_active:
        wanted = dt.importString(phrases_to_check_fn)
        selected = set(
            position
            for target in wanted
            for position, phrase in enumerate(phrases)
            if target == phrase[6:]
        )

    ppmi = ppmi.transpose()
    print(str(len(ppmi)) + " " + str(len(ppmi[0])))

    scores = []
    pvalues = []
    scores_kendall = []
    pvalues_kendall = []
    agini = []
    amap = []
    andcg = []
    averages = []

    with open(direction_fn) as f:
        for counter, line in enumerate(f):
            if filter_active and counter not in selected:
                continue

            direction = [float(token) for token in line.split()]
            ppmi_row = ppmi[counter]

            # NOTE(review): np.argsort returns the permutation that sorts
            # its input, not the rank of each element -- confirm that this
            # is what the correlations below are meant to compare.
            direction_rank = np.argsort(direction)
            ppmi_rank = np.argsort(ppmi_row)

            # Only positions with a non-zero PPMI value take part.
            active = [d for d, value in enumerate(ppmi_row) if value != 0]
            total = sum(ppmi_row[d] for d in active)
            amt = len(active)
            new_direction = [direction_rank[d] for d in active]
            new_ppmi = [ppmi_rank[d] for d in active]
            # NOTE(review): a row without non-zero entries makes amt zero.
            average = total / amt

            min_max_scaler = preprocessing.MinMaxScaler()
            normalized_ppmi = min_max_scaler.fit_transform(ppmi_row)
            normalized_dir = min_max_scaler.fit_transform(direction)
            ginis = gini(normalized_ppmi, normalized_dir)

            ranked_ppmi = dt.sortByArray(new_ppmi, new_direction)
            nr_ppmi = min_max_scaler.fit_transform(ranked_ppmi)
            ndcgs = ndcg_at_k(nr_ppmi, len(nr_ppmi))

            # MAP is not computed; a constant 0 is recorded in its place.
            map_placeholder = 0

            rho, pvalue = spearmanr(new_ppmi, new_direction)
            rhok, pvaluek = kendalltau(new_ppmi, new_direction)

            scores.append(rho)
            pvalues.append(pvalue)
            scores_kendall.append(rhok)
            pvalues_kendall.append(pvaluek)
            andcg.append(ndcgs)
            agini.append(ginis)
            amap.append(map_placeholder)
            averages.append(average)
            print(" ".join([phrases[counter] + ":", str(map_placeholder), str(ginis)]))

    # NOTE(review): the ".names" file always receives the full phrase
    # list, while the metric files hold one entry per processed line, so
    # they differ in length whenever the phrase filter is active.
    outputs = [
        (scores, "RuleType/s", ".score"),
        (pvalues, "RuleType/s", ".pvalue"),
        (scores_kendall, "RuleType/k", ".score"),
        (pvalues_kendall, "RuleType/k", ".pvalue"),
        (phrases, "RuleType/", ".names"),
        (averages, "RuleType/", ".averages"),
        (agini, "RuleType/gn", ".score"),
        (andcg, "RuleType/ndcg", ".score"),
        (amap, "RuleType/map", ".score"),
    ]
    for values, prefix, suffix in outputs:
        dt.write1dArray(values, prefix + fn + suffix)