def splitDirections(self, directions_fn, scores_fn, names_fn,
                        low_threshold, high_threshold):
        directions = dt.importVectors(directions_fn)
        scores = dt.importString(scores_fn)
        names = dt.importString(names_fn)

        for s in range(len(scores)):
            scores[s] = float(scores[s].strip())

        high_direction_indexes = []
        high_direction_scores = []
        low_direction_indexes = []
        low_direction_scores = []

        for s in range(len(scores)):
            if scores[s] >= high_threshold:
                high_direction_indexes.append(s)
                high_direction_scores.append(scores[s])
            elif scores[s] >= low_threshold:
                low_direction_indexes.append(s)
                low_direction_scores.append(scores[s])

        sorted_h_indexes = dt.sortByArray(high_direction_indexes,
                                          high_direction_scores)
        sorted_l_indexes = dt.sortByArray(low_direction_indexes,
                                          low_direction_scores)
        sorted_h_indexes.reverse()
        sorted_l_indexes.reverse()
        high_direction_names = []
        low_direction_names = []
        high_directions = []
        low_directions = []
        for s in sorted_h_indexes:
            high_directions.append(directions[s])
            high_direction_names.append(names[s][6:])
        for s in sorted_l_indexes:
            low_directions.append(directions[s])
            low_direction_names.append(names[s][6:])

        return high_direction_names, low_direction_names, high_directions, low_directions
Esempio n. 2
0
    def splitDirections(self, directions_fn, scores_fn, names_fn, low_threshold, high_threshold):
        directions = dt.importVectors(directions_fn)
        scores = dt.importString(scores_fn)
        names = dt.importString(names_fn)

        for s in range(len(scores)):
            scores[s] = float(scores[s].strip())

        high_direction_indexes = []
        high_direction_scores = []
        low_direction_indexes = []
        low_direction_scores = []

        for s in range(len(scores)):
            if scores[s] >= high_threshold:
                high_direction_indexes.append(s)
                high_direction_scores.append(scores[s])
            elif scores[s] >= low_threshold:
                low_direction_indexes.append(s)
                low_direction_scores.append(scores[s])

        sorted_h_indexes = dt.sortByArray(high_direction_indexes,   high_direction_scores)
        sorted_l_indexes = dt.sortByArray(low_direction_indexes , low_direction_scores)
        sorted_h_indexes.reverse()
        sorted_l_indexes.reverse()
        high_direction_names = []
        low_direction_names = []
        high_directions = []
        low_directions = []
        for s in sorted_h_indexes:
            high_directions.append(directions[s])
            high_direction_names.append(names[s][6:])
        for s in sorted_l_indexes:
            low_directions.append(directions[s])
            low_direction_names.append(names[s][6:])

        return high_direction_names, low_direction_names, high_directions, low_directions
Esempio n. 3
0
    def __init__(self, direction_fn, ppmi_fn, phrases_fn, phrases_to_check_fn, fn):
        ppmi = dt.importLabels(ppmi_fn)
        ppmi = np.asarray(ppmi)
        phrases = dt.importString(phrases_fn)

        indexes_to_get = []
        if phrases_to_check_fn != "":
            phrases_to_check = dt.importString(phrases_to_check_fn)
            for pc in range(len(phrases_to_check)):
                for p in range(len(phrases)):
                    if phrases_to_check[pc] == phrases[p][6:]:
                        indexes_to_get.append(p)
        indexes_to_get.sort()
        ppmi = ppmi.transpose()
        print len(ppmi), len(ppmi[0])
        scores = []
        pvalues = []
        scores_kendall = []
        pvalues_kendall = []
        agini = []
        agini1 = []
        angini1 = []
        angini = []
        amap = []
        andcg = []
        counter = 0
        averages = []
        with open(direction_fn) as f:
            for line in f:
                exists = True
                if phrases_to_check_fn != "":
                    exists = False
                    for i in indexes_to_get:
                        if i == counter:
                            exists = True
                            break
                if exists:
                    total = 0
                    amt = 0
                    direction = line.split()
                    for d in range(len(direction)):
                        direction[d] = float(direction[d])
                    new_direction = []
                    new_ppmi = []
                    direction_rank = np.argsort(direction)
                    ppmi_rank = np.argsort(ppmi[counter])
                    for d in range(len(ppmi[counter])):
                        if ppmi[counter][d] != 0:
                            total += ppmi[counter][d]
                            amt += 1
                            new_direction.append(direction_rank[d])
                            new_ppmi.append(ppmi_rank[d])
                    average = total / amt

                    min_max_scaler = preprocessing.MinMaxScaler()
                    normalized_ppmi = min_max_scaler.fit_transform(ppmi[counter])
                    normalized_dir = min_max_scaler.fit_transform(direction)

                    ginis = gini(normalized_ppmi, normalized_dir)

                    ranked_ppmi = dt.sortByArray(new_ppmi, new_direction)
                    nr_ppmi = min_max_scaler.fit_transform(ranked_ppmi)
                    ndcgs = ndcg_at_k(nr_ppmi, len(nr_ppmi))

                    #binarizer = preprocessing.Binarizer()
                    #binary_ppmi = binarizer.transform(normalized_ppmi)
                    #normalized_dir = np.ndarray.tolist(normalized_dir)
                    map = 0#average_precision_score(normalized_ppmi, normalized_dir)

                    rho, pvalue = spearmanr(new_ppmi, new_direction)
                    rhok, pvaluek = kendalltau(new_ppmi, new_direction)

                    scores.append(rho)
                    pvalues.append(pvalue)
                    scores_kendall.append(rhok)
                    pvalues_kendall.append(pvaluek)
                    andcg.append(ndcgs)
                    agini.append(ginis)
                    amap.append(map)
                    averages.append(average)
                    print phrases[counter] + ":", map, ginis

                counter += 1
        dt.write1dArray(scores, "RuleType/s" + fn + ".score")
        dt.write1dArray(pvalues, "RuleType/s" + fn + ".pvalue")
        dt.write1dArray(scores_kendall, "RuleType/k" + fn + ".score")
        dt.write1dArray(pvalues_kendall, "RuleType/k" + fn + ".pvalue")
        dt.write1dArray(phrases, "RuleType/" + fn + ".names")
        dt.write1dArray(averages, "RuleType/" + fn + ".averages")
        dt.write1dArray(agini, "RuleType/gn" + fn + ".score")
        dt.write1dArray(andcg, "RuleType/ndcg" + fn + ".score")
        dt.write1dArray(amap, "RuleType/map" + fn + ".score")
    def __init__(self, direction_fn, ppmi_fn, phrases_fn, phrases_to_check_fn,
                 fn):
        ppmi = dt.importLabels(ppmi_fn)
        ppmi = np.asarray(ppmi)
        phrases = dt.importString(phrases_fn)

        indexes_to_get = []
        if phrases_to_check_fn != "":
            phrases_to_check = dt.importString(phrases_to_check_fn)
            for pc in range(len(phrases_to_check)):
                for p in range(len(phrases)):
                    if phrases_to_check[pc] == phrases[p][6:]:
                        indexes_to_get.append(p)
        indexes_to_get.sort()
        ppmi = ppmi.transpose()
        print len(ppmi), len(ppmi[0])
        scores = []
        pvalues = []
        scores_kendall = []
        pvalues_kendall = []
        agini = []
        agini1 = []
        angini1 = []
        angini = []
        amap = []
        andcg = []
        counter = 0
        averages = []
        with open(direction_fn) as f:
            for line in f:
                exists = True
                if phrases_to_check_fn != "":
                    exists = False
                    for i in indexes_to_get:
                        if i == counter:
                            exists = True
                            break
                if exists:
                    total = 0
                    amt = 0
                    direction = line.split()
                    for d in range(len(direction)):
                        direction[d] = float(direction[d])
                    new_direction = []
                    new_ppmi = []
                    direction_rank = np.argsort(direction)
                    ppmi_rank = np.argsort(ppmi[counter])
                    for d in range(len(ppmi[counter])):
                        if ppmi[counter][d] != 0:
                            total += ppmi[counter][d]
                            amt += 1
                            new_direction.append(direction_rank[d])
                            new_ppmi.append(ppmi_rank[d])
                    average = total / amt

                    min_max_scaler = preprocessing.MinMaxScaler()
                    normalized_ppmi = min_max_scaler.fit_transform(
                        ppmi[counter])
                    normalized_dir = min_max_scaler.fit_transform(direction)

                    ginis = gini(normalized_ppmi, normalized_dir)

                    ranked_ppmi = dt.sortByArray(new_ppmi, new_direction)
                    nr_ppmi = min_max_scaler.fit_transform(ranked_ppmi)
                    ndcgs = ndcg_at_k(nr_ppmi, len(nr_ppmi))

                    #binarizer = preprocessing.Binarizer()
                    #binary_ppmi = binarizer.transform(normalized_ppmi)
                    #normalized_dir = np.ndarray.tolist(normalized_dir)
                    map = 0  #average_precision_score(normalized_ppmi, normalized_dir)

                    rho, pvalue = spearmanr(new_ppmi, new_direction)
                    rhok, pvaluek = kendalltau(new_ppmi, new_direction)

                    scores.append(rho)
                    pvalues.append(pvalue)
                    scores_kendall.append(rhok)
                    pvalues_kendall.append(pvaluek)
                    andcg.append(ndcgs)
                    agini.append(ginis)
                    amap.append(map)
                    averages.append(average)
                    print phrases[counter] + ":", map, ginis

                counter += 1
        dt.write1dArray(scores, "RuleType/s" + fn + ".score")
        dt.write1dArray(pvalues, "RuleType/s" + fn + ".pvalue")
        dt.write1dArray(scores_kendall, "RuleType/k" + fn + ".score")
        dt.write1dArray(pvalues_kendall, "RuleType/k" + fn + ".pvalue")
        dt.write1dArray(phrases, "RuleType/" + fn + ".names")
        dt.write1dArray(averages, "RuleType/" + fn + ".averages")
        dt.write1dArray(agini, "RuleType/gn" + fn + ".score")
        dt.write1dArray(andcg, "RuleType/ndcg" + fn + ".score")
        dt.write1dArray(amap, "RuleType/map" + fn + ".score")