Example #1
def match_entities(entity_fn, t_entity_fn, entities_fn, classification):
    # Import the full entity-name list, the target names, and the entity vectors
    names = dt.import1dArray(entity_fn)
    t_names = dt.import1dArray(t_entity_fn)
    entities = dt.import2dArray(entities_fn)
    amount_found = 0
    # Normalize the names on both sides so they can be compared directly
    for n in range(len(names)):
        names[n] = dt.removeEverythingFromString(names[n])
    for n in range(len(t_names)):
        t_names[n] = dt.removeEverythingFromString(t_names[n])
    # Record the index of every target name found in the full list
    matched_ids = []
    for n in range(len(t_names)):
        for ni in range(len(names)):
            if t_names[n] == names[ni]:
                print(t_names[n])
                matched_ids.append(ni)
                amount_found += 1
                break
    # Collect the entity vectors that correspond to the matched names
    matched_entities = []
    for e in matched_ids:
        matched_entities.append(entities[e])

    print("Amount found", amount_found)
    dt.write2dArray(matched_entities, entities_fn[:len(entities_fn)-4] + "-" + classification + ".txt")
Example #2
def plotClusters(filename):
    names = dt.import1dArray("Plots/Top174OrderedByOriginalList.txt")
    space = dt.import2dArray("Plots/Top174" + filename + ".space")
    cluster_names = dt.import1dArray(
        "Clusters/films100N0.6H25L3CutLeastSimilarHIGH0.75,0.67.names")

    #svd = TruncatedSVD(n_components=2, random_state=42)

    # Plot dimensions 8 and 9 of the space against each other
    cx = 8
    cy = 9
    x = []
    y = []
    for s in space[cx]:
        x.append(s)
    for s in space[cy]:
        y.append(s)

    #svd_space = svd.fit_transform(space)

    fig, ax = plt.subplots()
    ax.scatter(x, y, picker=True)
    #for i, name in enumerate(found_names):
    #    ax.annotate(name, (x[i], y[i]))
    ax.set_xlabel(cluster_names[cx])
    ax.set_ylabel(cluster_names[cy])

    def onpick3(event):
        ind = event.ind
        print('onpick3 scatter:', names[ind[0]])

    fig.canvas.mpl_connect('pick_event', onpick3)

    plt.show()
Example #3
def makeTopVectors(filename):

    vectors = dt.import2dArray("Rankings/" + filename + ".space")
    top250names = dt.import1dArray("filmdata/top250.txt")
    film_names = dt.import1dArray("filmdata/filmNames.txt")

    indexes = []
    ordered_names = []
    for f in range(len(film_names)):
        for t in top250names:
            if film_names[f] == t:
                indexes.append(f)
                ordered_names.append(t)

    # For each dimension of the space, keep only the values at the matched indexes
    top_vectors = []
    for v in range(len(vectors)):
        top_vectors.append([])
        for i in indexes:
            top_vectors[v].append(vectors[v][i])

    dt.write2dArray(top_vectors, "Plots/Top174" + filename + ".space")
    dt.write1dArray(ordered_names, "Plots/Top174OrderedByOriginalList.txt")
Example #4
def binaryClusterTerm(cluster_names_fn, fn):
    all_cluster_output = []
    cluster_names = dt.import1dArray(cluster_names_fn)
    for cn in cluster_names:
        binary = dt.import1dArray(
            "../data/movies/bow/binary/phrases/class-" + cn, "i")
        all_cluster_output.append(binary)
    dt.write2dArray(all_cluster_output,
                    "../data/movies/finetune/" + fn + "ClusterTerm.txt")
Example #5
def PPMI(cluster_names_fn, fn):
    all_cluster_output = []
    cluster_names = dt.import1dArray(cluster_names_fn)
    for cn in cluster_names:
        binary = dt.import1dArray("../data/movies/bow/ppmi/class-class-" + cn,
                                  "f")
        all_cluster_output.append(binary)
    dt.write2dArray(all_cluster_output,
                    "../data/movies/finetune/" + fn + "PPMI.txt")
Example #6
def normalizedTermFrequency(cluster_names_fn, fn):
    all_cluster_output = []
    cluster_names = dt.import1dArray(cluster_names_fn)
    for cn in cluster_names:
        binary = dt.import1dArray(
            "../data/movies/bow/frequency/phrases/class-" + cn, "i")
        all_cluster_output.append(binary)
    new_output = dt.scaleSpaceUnitVector(
        all_cluster_output,
        "../data/movies/finetune/" + fn + "NormalizedTermFrequency.txt")
Example #7
def convertEntityNamesToIDS(ID_fn, all_names_fn, individual_names_fn, output_fn):
    # Load the IDs, the full list of names, and the subset of names to match
    ids = dt.import1dArray(ID_fn)
    all_names = dt.import1dArray(all_names_fn)
    individual_names = dt.import1dArray(individual_names_fn)
    indexes = []

    # Record the index of every name that also appears in the subset
    for n in range(len(all_names)):
        for name in individual_names:
            if all_names[n] == name:
                indexes.append(n)
    dt.write1dArray(np.asarray(ids)[indexes], output_fn)
def saveClusters(directions_fn,
                 scores_fn,
                 names_fn,
                 filename,
                 amt_of_dirs,
                 data_type,
                 cluster_amt,
                 rewrite_files=False,
                 algorithm="meanshift_k"):

    dict_fn = "../data/" + data_type + "/cluster/dict/" + filename + ".txt"
    cluster_directions_fn = "../data/" + data_type + "/cluster/clusters/" + filename + ".txt"

    all_fns = [dict_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", saveClusters.__name__)
        return
    else:
        print("Running task", saveClusters.__name__)

    p_dir = np.asarray(dt.import2dArray(directions_fn))
    p_names = np.asarray(dt.import1dArray(names_fn, "s"))
    p_scores = dt.import1dArray(scores_fn, "f")

    # Sort by score and keep only the top amt_of_dirs highest-scoring directions
    ids = np.argsort(p_scores)

    p_dir = np.flipud(p_dir[ids])[:amt_of_dirs]
    p_names = np.flipud(p_names[ids])[:amt_of_dirs]
    if algorithm == "meanshift":
        labels = meanShift(p_dir)
    else:
        labels = kMeans(p_dir, cluster_amt)
    unique, counts = np.unique(labels, return_counts=True)

    clusters = []
    dir_clusters = []
    for i in range(len(unique)):
        clusters.append([])
        dir_clusters.append([])
    for i in range(len(labels)):
        clusters[labels[i]].append(p_names[i])
        dir_clusters[labels[i]].append(p_dir[i])
    cluster_directions = []
    for l in range(len(dir_clusters)):
        cluster_directions.append(dt.mean_of_array(dir_clusters[l]))

    print("------------------------")
    for c in clusters:
        print(c)
    print("------------------------")

    dt.write2dArray(clusters, dict_fn)
    dt.write2dArray(cluster_directions, cluster_directions_fn)
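
A minimal, self-contained sketch of the clustering step above, assuming the project's kMeans helper wraps scikit-learn's KMeans (the actual kMeans/meanShift helpers are not shown here, so this is illustrative only):

import numpy as np
from sklearn.cluster import KMeans

def cluster_directions(directions, cluster_amt):
    # Group direction vectors and return the labels plus one mean direction per cluster
    directions = np.asarray(directions)
    labels = KMeans(n_clusters=cluster_amt, random_state=0).fit_predict(directions)
    means = []
    for label in np.unique(labels):
        # Average all the directions assigned to this cluster
        means.append(directions[labels == label].mean(axis=0))
    return labels, np.asarray(means)

labels, cluster_means = cluster_directions(np.random.rand(50, 10), cluster_amt=5)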
Example #9
def writeFromMultiClass(multi_class_fn, output_folder, entity_names_fn, data_type, classify_name):
    # Get the entities we have phrases for
    entity_names = dt.import1dArray(entity_names_fn)

    # Import multi classes
    multi_class = dt.import1dArray(multi_class_fn)
    class_names = []
    class_val = []
    highest_class = 0

    for line in multi_class:
        cn, cv = re.split(r'\t+', line)
        cv = int(cv)
        class_names.append(cn)
        class_val.append(cv)
        if cv > highest_class:
            highest_class = cv



    matched_entity_names = list(set(entity_names).intersection(class_names))
    matched_entity_names.sort()
    dt.write1dArray(matched_entity_names, "../data/" + data_type + "/classify/"+classify_name+"/available_entities.txt")


    indexes_to_delete = []

    for n in range(len(class_names)):
        found = False
        for en in range(len(matched_entity_names)):
            if class_names[n] == matched_entity_names[en]:
                found=True
                break
        if found is False:
            indexes_to_delete.append(n)

    class_val = np.delete(class_val, indexes_to_delete)

    classes = []
    print("Found " + str(highest_class) + " classes")
    for e in range(len(matched_entity_names)):
        class_a = [0] * highest_class
        class_a[class_val[e]-1] = 1
        classes.append(class_a)
    dt.write2dArray(classes, "../data/"+data_type+"/classify/"+classify_name+"/class-all")
    print("Wrote class all")
    classes = np.asarray(classes).transpose()


    for cn in range(len(classes)):
        dt.write1dArray(classes[cn], "../data/"+data_type+"/classify/"+classify_name+"/class-"+str(cn))
        print("Wrote", "class-"+str(cn))
Example #10
def trimRankings(rankings_fn, available_indexes_fn, names, folder_name):
    # Import the indexes as integers so they can be used with take() below
    available_indexes = dt.import1dArray(available_indexes_fn, "i")
    rankings = np.asarray(dt.import2dArray(rankings_fn))
    names = dt.import1dArray(names)
    trimmed_rankings = []
    for r in range(len(rankings)):
        trimmed = rankings[r].take(available_indexes)
        trimmed_rankings.append(trimmed)
    for a in range(len(trimmed_rankings)):
        print("Writing", names[a])
        dt.write1dArray(trimmed_rankings[a], folder_name + "class-" + names[a])
    print("Writing", rankings_fn[-6:])
    dt.write2dArray(trimmed_rankings, folder_name + "class-" + rankings_fn[-6:])
Example #11
def randomAll(cluster_names_fn, fn):
    all_cluster_output = []
    cluster_names = dt.import1dArray(cluster_names_fn)
    for cn in cluster_names:
        binary = np.asarray(
            dt.import1dArray(
                "../data/movies/bow/frequency/phrases/class-" + cn, "f"))
        random_binary = []
        for b in binary:
            # randint needs integer bounds, so cast the (float) maximum frequency
            random_binary.append(randint(0, int(np.amax(binary))))
        all_cluster_output.append(random_binary)
    dt.write2dArray(all_cluster_output,
                    "../data/movies/finetune/" + fn + "RandomAll.txt")
Example #12
def makeTopVectorsDirections(filename):
    vectors = dt.import2dArray("Directions/" + filename + "Cut.directions")
    top250names = dt.import1dArray("filmdata/top250.txt")
    filmnames = dt.import1dArray("filmdata/filmNames.txt")

    top250vectors = []

    for f in range(len(filmnames)):
        for t in range(len(top250names)):
            if filmnames[f] == top250names[t]:
                top250vectors.append(vectors[t])

    dt.write2dArray(top250vectors,
                    "../data/movies/plot/t250" + filename + ".directions")
Example #13
def binaryInCluster(cluster_dict_fn, fn):
    cluster = dt.readArrayDict(cluster_dict_fn)
    all_cluster_output = []
    for key, items in cluster.items():
        init_binary = dt.import1dArray(
            "../data/movies/bow/binary/phrases/" + key, "i")
        for i in items:
            binary = dt.import1dArray("../data/movies/bow/binary/phrases/" + i,
                                      "i")
            for j in range(len(init_binary)):
                if binary[j] == 1:
                    init_binary[j] = 1
        all_cluster_output.append(init_binary)
    dt.write2dArray(all_cluster_output,
                    "../data/movies/finetune/" + fn + "InCluster.txt")
Example #14
def logisticRegression(cluster_names_fn,
                       ranking_fn,
                       file_name,
                       do_p=False,
                       data_type="movies",
                       rewrite_files=False,
                       limit_entities=False,
                       classification="genres",
                       lowest_amt=0,
                       highest_amt=2147000000,
                       sparse_freqs_fn=None,
                       bow_names_fn=None):
    lr_fn = "../data/" + data_type + "/finetune/boc/" + file_name + ".txt"
    all_fns = [lr_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", logisticRegression.__name__)
        return
    else:
        print("Running task", logisticRegression.__name__)

    if limit_entities is False:
        classification = "all"

    cluster_names = dt.import2dArray(cluster_names_fn, "s")
    bow_names = dt.import1dArray(bow_names_fn, "s")
    sparse_freqs = dt.import2dArray(sparse_freqs_fn, return_sparse=True)

    frq = getLROnBag(cluster_names, data_type, lowest_amt, highest_amt,
                     classification, file_name, bow_names, sparse_freqs)

    dt.write2dArray(frq, lr_fn)
    return frq
Example #15
def maxNonZero(cluster_names_fn, fn):
    all_cluster_output = []
    cluster_names = dt.import1dArray(cluster_names_fn)
    for cn in cluster_names:
        binary = np.asarray(
            dt.import1dArray(
                "../data/movies/bow/frequency/phrases/class-" + cn, "f"))
        max_values = []
        for b in binary:
            # Replace every non-zero frequency with the maximum frequency for this term
            if b > 0:
                max_values.append(np.amax(binary))
            else:
                max_values.append(0)
        all_cluster_output.append(max_values)
    dt.write2dArray(all_cluster_output,
                    "../data/movies/finetune/" + fn + "MaxNonZero.txt")
Example #16
def plotSVD(filename):
    names = dt.import1dArray("Plots/Top174OrderedByOriginalList.txt")
    space = dt.import2dArray("Plots/Top174" + filename + ".space")

    # Transpose so that rows correspond to entities rather than dimensions
    space = np.asarray(space).transpose()
    space = space.tolist()
    svd = TruncatedSVD(n_components=2, random_state=42)
    svd_space = svd.fit_transform(space)

    x = []
    y = []

    for s in svd_space:
        print(s)
        x.append(s[0])
        y.append(s[1])

    fig, ax = plt.subplots()
    ax.scatter(x, y, picker=True)

    # for i, name in enumerate(found_names):
    #    ax.annotate(name, (x[i], y[i]))

    def onpick3(event):
        ind = event.ind
        print('onpick3 scatter:', names[ind[0]])

    fig.canvas.mpl_connect('pick_event', onpick3)

    plt.show()
Example #17
def pavTermFrequency(ranking_fn, cluster_names_fn, fn, do_p):
    ranking = dt.import2dArray(ranking_fn)
    names = dt.import1dArray(cluster_names_fn)
    frq = []

    for name in names:
        frq.append(readFreq(name))

    pav_classes = []

    for f in range(len(frq)):
        print(names[f])
        x = np.asarray(frq[f])
        y = ranking[f]

        # Fit an isotonic (PAV) regression of the ranking onto the frequencies
        ir = IsotonicRegression()
        y_ = ir.fit_transform(x, y)
        pav_classes.append(y_)
        if do_p:
            plot(x, y, y_)
        print(f)

    dt.write2dArray(
        pav_classes,
        "../data/movies/finetune/" + fn + "PavTermFrequency.txt")
    return pav_classes
Example #18
def PPMIFT(cluster_names_fn,
           ranking_fn,
           file_name,
           do_p=False,
           data_type="movies",
           rewrite_files=False,
           limit_entities=False,
           classification="genres",
           lowest_amt=0,
           highest_amt=2147000000):
    ppmift_fn = "../data/" + data_type + "/finetune/" + file_name + ".txt"
    all_fns = [ppmift_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", PPMIFT.__name__)
        return
    else:
        print("Running task", PPMIFT.__name__)
    if limit_entities is False:
        classification = "all"

    ranking = dt.import2dArray(ranking_fn)
    names = dt.import1dArray(cluster_names_fn)
    frq = []
    counter = 0

    for name in names:
        name = name.split()[0]
        if ":" in name:
            name = name[:-1]
        frq.append(
            readPPMI(name, data_type, lowest_amt, highest_amt, classification))

    dt.write2dArray(frq, ppmift_fn)
    return frq
    def runLR(self, property_name, y=None):
        if y is None:
            y = dt.import1dArray(
                "../data/" + self.data_type + "/bow/binary/phrases/class-" +
                property_name + "-" + str(self.lowest_amt) + "-" +
                str(self.higher_amt) + "-" + self.classification,
                file_type="i")
        else:
            y = y[0]
        for i in range(len(y)):
            if y[i] >= 1:
                y[i] = 1
        #x_train, y_train = dt.balanceClasses(x_train, y_train)
        clf = linear_model.LogisticRegression(class_weight="balanced",
                                              dual=False)
        clf.fit(self.x_train, y)
        direction = clf.coef_.tolist()[0]
        y_pred = clf.predict(self.x_test)
        y_pred = y_pred.tolist()
        f1 = f1_score(y[:len(y_pred)], y_pred)
        kappa_score = cohen_kappa_score(y[:len(y_pred)], y_pred)
        acc = accuracy_score(y[:len(y_pred)], y_pred)
        TP, FP, TN, FN = self.perf_measure(y, y_pred)

        #ppmi_score, ppmi_ratio = get_ppmi_score(y_pred, property_name)

        return kappa_score, f1, direction, acc, 0, TP, FP, TN, FN
    def runSVM(self, property_name, y=None):
        if y is None:
            y = dt.import1dArray(
                "../data/" + self.data_type + "/bow/binary/phrases/class-" +
                property_name + "-" + str(self.lowest_amt) + "-" +
                str(self.higher_amt) + "-" + self.classification,
                file_type="i")
        else:
            y = y[0]
        for i in range(len(y)):
            if y[i] >= 1:
                y[i] = 1
        #x_train, y_train = dt.balanceClasses(x_train, y_train)
        clf = svm.LinearSVC(class_weight="balanced", dual=False)
        #if len(self.x_train) !=
        clf.fit(self.x_train, y)
        direction = clf.coef_.tolist()[0]
        y_pred = clf.predict(self.x_test)
        y_pred = y_pred.tolist()
        f1 = f1_score(y[:len(y_pred)], y_pred)
        kappa_score = cohen_kappa_score(y[:len(y_pred)], y_pred)
        acc = accuracy_score(y[:len(y_pred)], y_pred)

        TP, FP, TN, FN = self.perf_measure(y, y_pred)
        print("TP", TP, "FP", FP, "TN", TN, "FN", FN)

        return kappa_score, f1, direction, acc, 0, TP, FP, TN, FN
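
A self-contained sketch of the pattern runLR/runSVM follow: fit a class-balanced linear model on binarized labels and report Cohen's kappa and F1 (toy data only; the real methods read their labels and splits from the project's bow files):

import numpy as np
from sklearn import svm
from sklearn.metrics import cohen_kappa_score, f1_score
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(0)
x = rng.rand(200, 20)
y = (rng.rand(200) > 0.8).astype(int)  # imbalanced binary labels

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)
clf = svm.LinearSVC(class_weight="balanced", dual=False)
clf.fit(x_train, y_train)
direction = clf.coef_[0]  # the learned direction vector
y_pred = clf.predict(x_test)
print("kappa", cohen_kappa_score(y_test, y_pred), "f1", f1_score(y_test, y_pred))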
def getAllRankings(directions_fn,
                   vectors_fn,
                   cluster_names_fn,
                   vector_names_fn,
                   percent,
                   percentage_increment,
                   by_vector,
                   fn,
                   discrete=True,
                   data_type="movies",
                   rewrite_files=False):

    labels_fn = "../data/" + data_type + "/rank/labels/" + fn + ".txt"
    rankings_fn = "../data/" + data_type + "/rank/numeric/" + fn + ".txt"
    discrete_labels_fn = "../data/" + data_type + "/rank/discrete/" + fn + ".txt"

    all_fns = [rankings_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        for f in all_fns:
            print(f, "Already exists")
        print("Skipping task", "getAllRankings")
        return
    else:
        print("Running task", "getAllRankings")

    directions = dt.import2dArray(directions_fn)
    vectors = dt.import2dArray(vectors_fn)
    cluster_names = dt.import1dArray(cluster_names_fn)
    vector_names = dt.import1dArray(vector_names_fn)
    rankings = getRankings(directions, vectors, cluster_names, vector_names)
    rankings = np.asarray(rankings)
    if discrete:
        labels = createLabels(rankings, percent)
        labels = np.asarray(labels)
        discrete_labels = createDiscreteLabels(rankings, percentage_increment)
        discrete_labels = np.asarray(discrete_labels)
    if by_vector:
        if discrete:
            labels = labels.transpose()
            discrete_labels = discrete_labels.transpose()
        rankings = rankings.transpose()
    if discrete:
        dt.write2dArray(labels, labels_fn)

    dt.write2dArray(rankings, rankings_fn)
    if discrete:
        dt.write2dArray(discrete_labels, discrete_labels_fn)
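
The getRankings helper used above is not shown here; under the usual interpretation of these "directions" (ranking entities by the dot product of their vectors with each direction), a minimal stand-in looks like this (an illustrative assumption, not the project's actual implementation):

import numpy as np

def rank_entities(directions, vectors):
    # Score every entity vector against every direction: one row of scores per direction
    return np.dot(np.asarray(directions), np.asarray(vectors).T)

scores = rank_entities(np.random.rand(3, 5), np.random.rand(10, 5))
print(scores.shape)  # (3 directions, 10 entities)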
def fixCutoffFormatting(cutoff_fn, file_name):
    cutoff = dt.import1dArray(cutoff_fn)
    for c in range(len(cutoff)):
        cutoff[c] = cutoff[c].split()
        for i in range(len(cutoff[c])):
            cutoff[c][i] = int(dt.stripPunctuation(cutoff[c][i]))
    dt.write2dArray(cutoff,
                    "../data/movies/rules/cutoff/" + file_name + ".txt")
def main(data_type, vector_size, window_size, min_count, sampling_threshold, negative_size,
                               train_epoch, dm, worker_count, train_wv, concatenate_wv, use_hierarchical_softmax):
    file_name = "Doc2Vec" + " VS" + str(vector_size) + " WS" + str(window_size) + " MC" + str(min_count) + " ST" + str(
        sampling_threshold) + \
                " NS" + str(negative_size) + " TE" + str(train_epoch) + " DM" + str(dm) + " WC" + str(
        worker_count) + "spacy"
    " NS" + str(negative_size) + " TE" + str(train_epoch) + " DM" + str(dm) + " WC" + str(worker_count) + \
    " TW" + str(train_wv) + " CW" + str(concatenate_wv) + " HS" + str(use_hierarchical_softmax)

    corpus_fn = "../data/raw/" + data_type + "/corpus_processed.txt"

    if os.path.exists(corpus_fn) is False:
        x_train = np.load("../data/raw/" + data_type + "/x_train_w.npy")
        x_test = np.load("../data/raw/" + data_type + "/x_test_w.npy")
        corpus = np.concatenate((x_train, x_test), axis=0)
        text_corpus = np.empty(len(corpus), dtype=object)
        for i in range(len(corpus)):
            text_corpus[i] = " ".join(corpus[i])
            print(text_corpus[i])
        dt.write1dArray(text_corpus, corpus_fn)

    embedding_fn = "/home/tom/Downloads/glove.6B/glove.6B.300d.txt"

    model_fn = "../data/" + data_type + "/doc2vec/" + file_name + ".bin"
    vector_fn = "../data/" + data_type + "/nnet/spaces/" + file_name + ".npy"
    score_fn = "../data/" + data_type + "/doc2vec/" + file_name + "catacc.score"

    if os.path.exists(model_fn):
        print("Imported model")
        model = g.utils.SaveLoad.load(model_fn)
    elif file_name[:7] == "Doc2Vec":
        model = doc2Vec(embedding_fn, corpus_fn, vector_size, window_size, min_count, sampling_threshold,
                        negative_size, train_epoch, dm, worker_count, train_wv, concatenate_wv, use_hierarchical_softmax)
        model.save(model_fn)

    if os.path.exists(vector_fn) is False:
        vectors = []
        for d in range(len(model.docvecs)):
            vectors.append(model.docvecs[d])
        np.save(vector_fn, vectors)
    else:
        print("Imported vectors")
        vectors = np.load(vector_fn)

    if os.path.exists(score_fn) is False or file_name[:7] != "Doc2Vec":
        print("Getting score")
        if data_type == "sentiment":
            classes = dt.import1dArray("../data/" + data_type + "/classify/" + data_type + "/class-all", "i")
            x_train, y_train, x_test, y_test = sentiment.getSplits(vectors, classes)
            scores = linearSVMScore(x_train, y_train, x_test, y_test)
        else:
            classes = dt.import2dArray("../data/" + data_type + "/classify/" + data_type + "/class-all", "i")
            x_train, y_train, x_test, y_test = newsgroups.getSplits(vectors, classes)
            scores = multiClassLinearSVM(x_train, y_train, x_test, y_test)
        print(scores)
        dt.write1dArray(scores, score_fn)
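
A minimal sketch of training document vectors with gensim's Doc2Vec, the model that main() above builds (parameter names follow recent gensim releases; the two toy documents stand in for the project's corpus_processed.txt):

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

docs = [
    TaggedDocument(words=["a", "quiet", "film"], tags=[0]),
    TaggedDocument(words=["loud", "action", "movie"], tags=[1]),
]

model = Doc2Vec(docs, vector_size=50, window=5, min_count=1,
                negative=5, epochs=20, dm=1, workers=2)
model.save("toy_doc2vec.bin")
print(model.dv[0])  # the learned vector for the first document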
Example #24
def obtainNDCG(self):
    # For each rank vector, compute the NDCG score against the word's PPMI values
    ndcgs = np.empty(len(self.names))
    for n in range(len(self.names)):
        ppmi = np.asarray(dt.import1dArray(
            "../data/" + self.data_type + "/bow/ppmi/class-" + self.names[n] + "-" +
            str(self.lowest_amt) + "-" + str(self.highest_amt) + "-" + str(self.classification), "f"))
        sorted_indices = np.argsort(self.ranks)[::-1]
        score = ndcg.ndcg_from_ranking(ppmi, sorted_indices)
        ndcgs[n] = score
    return ndcgs
Example #25
def pavPPMI(cluster_names_fn,
            ranking_fn,
            file_name,
            do_p=False,
            data_type="movies",
            rewrite_files=False,
            limit_entities=False,
            classification="genres",
            lowest_amt=0,
            highest_amt=2147000000):
    pavPPMI_fn = "../data/" + data_type + "/finetune/" + file_name + ".txt"
    all_fns = [pavPPMI_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", pavPPMI.__name__)
        return
    else:
        print("Running task", pavPPMI.__name__)
    print("certainly still running that old pavPPMI task, yes sir")
    if limit_entities is False:
        classification = "all"

    ranking = dt.import2dArray(ranking_fn)
    names = dt.import1dArray(cluster_names_fn)
    frq = []
    counter = 0

    for name in names:
        name = name.split()[0]
        if ":" in name:
            name = name[:-1]
        frq.append(
            readPPMI(name, data_type, lowest_amt, highest_amt, classification))

    pav_classes = []

    for f in range(len(frq)):
        try:
            print(names[f])
            x = np.asarray(frq[f])
            y = ranking[f]

            ir = IsotonicRegression()
            y_ = ir.fit_transform(x, y)
            pav_classes.append(y_)
            if do_p:
                plot(x, y, y_)
        except ValueError:
            print(names[f], "len ppmi", len(frq[f]),
                  "len ranking", len(ranking[f]))
            exit()
        print(f)

    dt.write2dArray(pav_classes, pavPPMI_fn)
    return pav_classes
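
A self-contained sketch of the isotonic-regression (PAV) step used above, which fits a monotone mapping from the PPMI values to the ranking values (toy arrays stand in for frq[f] and ranking[f]):

import numpy as np
from sklearn.isotonic import IsotonicRegression

x = np.array([0.0, 1.0, 2.0, 3.0, 4.0, 5.0])  # stand-in for the PPMI values
y = np.array([0.1, 0.4, 0.2, 0.8, 0.7, 1.0])  # stand-in for the ranking values

ir = IsotonicRegression()
y_fit = ir.fit_transform(x, y)  # monotonically non-decreasing fit of y against x
print(y_fit)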
Example #26
def getAvailableEntities(entity_names_fns, data_type, classification):
    entity_names = []
    for e in entity_names_fns:
        entity_names.append(dt.import1dArray(e))
    # Collect the unique names across all of the input files
    name_dict = {}
    for entity_name in entity_names:
        for name in entity_name:
            name_dict[name] = 0
    available_entities = []
    for key in name_dict:
        available_entities.append(key)
    dt.write1dArray(available_entities, "../data/"+data_type+"/classify/"+classification+"/available_entities.txt")
def getAllPhraseRankings(directions_fn=None,
                         vectors_fn=None,
                         property_names_fn=None,
                         vector_names_fn=None,
                         fn="no filename",
                         percentage_increment=1,
                         scores_fn=None,
                         top_amt=0,
                         discrete=False,
                         data_type="movies",
                         rewrite_files=False):
    rankings_fn_all = "../data/" + data_type + "/rank/numeric/" + fn + "ALL.txt"

    all_fns = [rankings_fn_all]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", "getAllPhraseRankings")
        return
    else:
        print("Running task", "getAllPhraseRankings")

    directions = dt.import2dArray(directions_fn)
    vectors = dt.import2dArray(vectors_fn)
    property_names = dt.import1dArray(property_names_fn)
    vector_names = dt.import1dArray(vector_names_fn)
    if top_amt != 0:
        scores = dt.import1dArray(scores_fn, "f")
        directions = dt.sortByReverseArray(directions, scores)[:top_amt]
        property_names = dt.sortByReverseArray(property_names,
                                               scores)[:top_amt]

    rankings = getRankings(directions, vectors, property_names, vector_names)
    if discrete:
        discrete_labels = createDiscreteLabels(rankings, percentage_increment)
        discrete_labels = np.asarray(discrete_labels)
    for a in range(len(rankings)):
        rankings[a] = np.around(rankings[a], decimals=4)
    #dt.write1dArray(property_names, "../data/movies/bow/names/top5kof17k.txt")

    dt.write2dArray(rankings, rankings_fn_all)
Example #28
def plotTopVectors(filename):

    names = dt.import1dArray(
        "../data/movies/plot/Top174OrderedByOriginalList.txt")
    space = dt.import2dArray("../data/movies/plot/Top174" + filename +
                             ".space")

    svd = TruncatedSVD(n_components=2, random_state=42)

    svd_space = svd.fit_transform(space)
    pl.plot(space[0], 'rx')
    pl.show()
    """
Example #29
def writeBagOfClusters(cluster_dict, data_type, lowest_amt, highest_amt,
                       classification, fn):
    bag_of_clusters = []
    # Note, prior we used the PPMI values directly here somehow...
    loc = "../data/" + data_type + "/bow/frequency/phrases/"
    final_fn = ""
    for c in range(len(cluster_dict)):
        # Remove the colons
        for f in range(len(cluster_dict[c])):
            if ":" in cluster_dict[c][f]:
                cluster_dict[c][f] = cluster_dict[c][f][:-1]
        # Add all of the frequencies together to make a bag-of-clusters
        p1 = loc + "class-" + cluster_dict[c][0]
        p2 = "-" + str(lowest_amt) + "-" + str(
            highest_amt) + "-" + classification
        accum_freqs = [0.0] * len(dt.import1dArray(p1 + p2))
        counter = 0
        # For all the cluster terms
        for f in cluster_dict[c]:
            if ":" in f:
                f = f[:-1]
            # Import the class
            class_to_add = dt.import1dArray(
                loc + "class-" + f + "-" + str(lowest_amt) + "-" +
                str(highest_amt) + "-" + classification, "f")
            # Add the current class to the older one
            accum_freqs = np.add(accum_freqs, class_to_add)
            counter += 1
        # Append this cluster's frequencies to the group of them
        bag_of_clusters.append(accum_freqs)
    # Obtain the PPMI values for these frequencies
    ppmi_fn = "../data/" + data_type + "/bow/ppmi/" + "class-" + final_fn + str(
        lowest_amt) + "-" + str(highest_amt) + "-" + classification
    bag_csr = sp.csr_matrix(np.asarray(bag_of_clusters))
    ppmi_csr = mt.convertPPMI(bag_csr)
    dt.write2dArray(ppmi_csr,
                    "../data/" + data_type + "/bow/ppmi/" + fn + ".txt")
    return ppmi_csr
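
A brief, self-contained sketch of converting a count matrix to PPMI, the transformation mt.convertPPMI applies above (an illustrative numpy version using the standard PPMI definition; the project's own implementation may differ, e.g. in sparse handling):

import numpy as np

def ppmi(counts):
    # Positive pointwise mutual information of a count matrix
    counts = np.asarray(counts, dtype=float)
    total = counts.sum()
    row_p = counts.sum(axis=1, keepdims=True) / total
    col_p = counts.sum(axis=0, keepdims=True) / total
    joint_p = counts / total
    with np.errstate(divide="ignore", invalid="ignore"):
        pmi = np.log(joint_p / (row_p * col_p))
    pmi[~np.isfinite(pmi)] = 0.0
    return np.maximum(pmi, 0.0)

print(ppmi([[4, 0, 1], [1, 2, 2]]))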
Example #30
def obtainKappaOrNDCG(self):
    # For each rank vector, obtain either the Kappa score against the binary word
    # occurrences, or the NDCG score against the PPMI values
    scores = np.empty(len(self.names))
    for n in range(len(self.names)):
        if self.types[n] == 0:
            clf = svm.LinearSVC()
            occ = np.asarray(dt.import1dArray(
                "../data/" + self.data_type + "/bow/binary/phrases/class-" + self.names[n] + "-" +
                str(self.lowest_amt) + "-" + str(self.highest_amt) + "-" + str(self.classification), "f"))
            clf.fit(self.ranks, occ)
            y_pred = clf.predict(self.ranks)
            scores[n] = cohen_kappa_score(occ, y_pred)
        else:
            ppmi = np.asarray(dt.import1dArray(
                "../data/" + self.data_type + "/bow/ppmi/class-" + self.names[n] + "-" +
                str(self.lowest_amt) + "-" + str(self.highest_amt) + "-" + str(self.classification), "f"))
            sorted_indices = np.argsort(self.ranks)[::-1]
            scores[n] = ndcg.ndcg_from_ranking(ppmi, sorted_indices)
    return scores