コード例 #1
0
def getWordVectors(vector_save_fn, words_fn, wvn, wv_amt, svm_dir_fn=None):
    """Build one GloVe vector per word and save them, unless already saved.

    If ``vector_save_fn`` already exists the function only prints a notice.
    Otherwise the GloVe text file is converted to word2vec format, every
    word listed in ``words_fn`` is looked up, and the resulting vectors are
    written to ``vector_save_fn``; the word list itself is written to
    ``wvn``.

    Args:
        vector_save_fn: Output path for the vectors; its existence is also
            the "work already done" marker.
        words_fn: Path of the word list, read with ``import1dArray``.
        wvn: Output path the word list is written to.
        wv_amt: GloVe dimensionality. Selects ``glove.6B.<wv_amt>d.txt`` and
            sizes the zero vector used for out-of-vocabulary words.
        svm_dir_fn: Optional path of a 2-D array read with
            ``import2dArray``. When given, row ``w`` is appended to the
            vector of word ``w``.
    """
    if os.path.exists(vector_save_fn):
        print("Already got word vectors", vector_save_fn)
        return

    # NOTE(review): the GloVe location is hard-coded to one machine's
    # download directory; left unchanged to keep the interface stable.
    glove_file = datapath('/home/tom/Downloads/glove.6B/glove.6B.' +
                          str(wv_amt) + 'd.txt')
    tmp_file = get_tmpfile(
        "/home/tom/Downloads/glove.6B/test_word2vec.txt")
    glove2word2vec(glove_file, tmp_file)

    # Load the directions only when a file was supplied. Previously the
    # default ``None`` was passed straight to import2dArray.
    svm_dir = None if svm_dir_fn is None else import2dArray(svm_dir_fn)
    all_vectors = KeyedVectors.load_word2vec_format(tmp_file)
    words = import1dArray(words_fn)

    vectors = []
    for w, word in enumerate(words):
        try:
            word_vector = all_vectors.get_vector(word)
        except KeyError:
            # Out-of-vocabulary word: a zero vector of the full output
            # width (the direction part is zeroed as well).
            width = wv_amt if svm_dir is None else wv_amt + len(svm_dir[0])
            vectors.append(np.zeros(width))
            continue
        if svm_dir is None:
            vectors.append(word_vector)
        else:
            vectors.append(np.concatenate([word_vector, svm_dir[w]]))

    write2dArray(vectors, vector_save_fn)
    write1dArray(words, wvn)
コード例 #2
0
def compileSVMResults(file_name, chunk_amt, data_type):
    """Merge per-chunk SVM direction and kappa files into single files.

    Each of the ``chunk_amt`` chunks is expected at
    ``../data/<data_type>/svm/{directions,kappa}/<file_name> CID<c> CAMT<chunk_amt>.txt``.
    The function blocks, polling every 10 seconds, until all of those
    files exist, then concatenates them in chunk order into
    ``<file_name>.txt`` in the respective directory. If the merged
    directions file already exists nothing is done.

    Args:
        file_name: Base name shared by the chunk files and the merged
            output.
        chunk_amt: Number of chunks to wait for and merge.
        data_type: Dataset directory name under ``../data/``.
    """
    directions_dir = "../data/" + data_type + "/svm/directions/"
    kappa_dir = "../data/" + data_type + "/svm/kappa/"

    if fileExists(directions_dir + file_name + ".txt"):
        print("Skipping compile")
        return

    print("Compiling SVM results")
    chunk_suffixes = [
        " CID" + str(c) + " CAMT" + str(chunk_amt) + ".txt"
        for c in range(chunk_amt)
    ]
    directions = [directions_dir + file_name + s for s in chunk_suffixes]
    kappa = [kappa_dir + file_name + s for s in chunk_suffixes]

    # Wait for every chunk that is read below. The kappa chunks are
    # included: previously only the direction chunks were awaited, so a
    # kappa file could be opened before it had been created.
    for chunk_fn in directions + kappa:
        while not fileExists(chunk_fn):
            time.sleep(10)
    # Extra pause after the last file appears; presumably gives the
    # producer time to finish writing it.
    time.sleep(10)

    di = []
    for d in directions:
        di.extend(import2dArray(d))
    ka = []
    for k in kappa:
        ka.extend(import1dArray(k))

    write2dArray(di, directions_dir + file_name + ".txt")
    write1dArray(ka, kappa_dir + file_name + ".txt")
コード例 #3
0
def convertToPPMI(freq_arrays_fn, term_names_fn):
    """Convert a term-by-entity frequency matrix to PPMI and save it.

    For a term ``t`` and entity ``e`` with non-zero frequency the value is
    ``max(0, log(p(t, e) / (p(t) * p(e))))`` where every probability is
    the corresponding count divided by the sum of the whole matrix. Cells
    with zero frequency get ``0.0``.

    One file per term is written to
    ``../data/movies/bow/ppmi/class-<term name>`` and the full matrix to
    ``../data/movies/bow/ppmi/class-all``.

    Args:
        freq_arrays_fn: Path of the frequency matrix (one row per term),
            read with ``import2dArray(..., "i")``.
        term_names_fn: Path of the term names, one per matrix row.
    """
    freq_arrays = np.asarray(import2dArray(freq_arrays_fn, "i"))
    term_names = import1dArray(term_names_fn)

    # Accumulate in float64 so large integer counts cannot overflow.
    overall = float(freq_arrays.sum(dtype=np.float64))
    # Per-entity (column) probabilities, sized from the data. The previous
    # fixed 15000-slot cache raised IndexError on wider matrices.
    entity_p = freq_arrays.sum(axis=0, dtype=np.float64) / overall

    ppmi_arrays = []
    for t, term_freqs in enumerate(freq_arrays):
        term_p = term_freqs.sum(dtype=np.float64) / overall
        ppmi_array = []
        for e, freq in enumerate(term_freqs):
            ppmi = 0.0
            if freq != 0:
                proba = (freq / overall) / (entity_p[e] * term_p)
                ppmi = np.amax([0.0, np.log(proba)])
            ppmi_array.append(ppmi)
        print(ppmi_array)
        ppmi_arrays.append(ppmi_array)
        write1dArray(ppmi_array,
                     "../data/movies/bow/ppmi/class-" + term_names[t])
    write2dArray(ppmi_arrays, "../data/movies/bow/ppmi/class-all")
コード例 #4
0
def writeClassAll(class_fn, full_phrases_fn, phrases_used_fn, file_name):
    """Keep only the rows whose phrase is in the "used" list and save them.

    Row ``p`` of the array in ``class_fn`` is taken to belong to entry
    ``p`` of ``full_phrases_fn``. Rows are kept in their original order.

    Args:
        class_fn: Path of the 2-D array, read with ``import2dArray``.
        full_phrases_fn: Path of the phrase list aligned with the rows.
        phrases_used_fn: Path of the phrases to keep.
        file_name: Output path for the filtered 2-D array.
    """
    full_phrases = import1dArray(full_phrases_fn)
    # The array is used as loaded (not transposed).
    ppmi = import2dArray(class_fn)
    # Set membership replaces the inner linear scan over phrases_used;
    # assumes the phrases are hashable (e.g. strings).
    wanted = set(import1dArray(phrases_used_fn))
    new_ppmi = [
        ppmi[p] for p, phrase in enumerate(full_phrases) if phrase in wanted
    ]
    write2dArray(new_ppmi, file_name)
コード例 #5
0
def scaleSpaceUnitVector(space, file_name):
    """Normalize each column of ``space`` and save the result.

    ``space`` is transposed so that iteration runs over its columns; each
    one is passed through ``normalize`` unless its elements sum to zero,
    in which case it is stored unchanged. The vectors are written in that
    transposed orientation (one former column per row).

    Args:
        space: 2-D array-like.
        file_name: Output path, written with ``write2dArray``.
    """
    columns = np.asarray(space).transpose()
    print(len(columns), len(columns[0]))
    # NOTE(review): the guard tests the element sum, so a non-zero vector
    # whose entries cancel out (e.g. [1, -1]) is also left unscaled.
    # Kept as is because it depends on what `normalize` divides by.
    scaled_vector = [
        normalize(v) if np.sum(v) != 0 else v for v in columns
    ]
    # The former re-transpose of the local array after the loop had no
    # effect on the output or on the caller's data and was dropped.
    write2dArray(scaled_vector, file_name)
コード例 #6
0
def concatenateArrays(arrays, file_name):
    """Stack several arrays along axis 0 and save the result.

    Args:
        arrays: Non-empty sequence of arrays that agree on every dimension
            except the first.
        file_name: Output path, written with ``write2dArray``.

    Raises:
        ValueError: If ``arrays`` is empty or the shapes are incompatible.
    """
    # One call joins all inputs at once; re-concatenating inside a loop
    # copied the growing result on every iteration.
    new_array = np.concatenate(arrays, axis=0)
    write2dArray(new_array, file_name)
コード例 #7
0
def removeIndexes(file_name, indexes, type="f"):
    """Delete rows from a stored 2-D array and save the remainder.

    The output path is ``file_name`` with its last four characters
    replaced by ``"removedind.txt"``.

    Args:
        file_name: Path of the source array, read with ``import2dArray``.
        indexes: Row index or indexes to delete (axis 0).
        type: Type code forwarded to ``import2dArray``.
    """
    source_rows = import2dArray(file_name, type)
    kept_rows = np.delete(source_rows, indexes, axis=0)
    output_fn = file_name[:-4] + "removedind.txt"
    write2dArray(kept_rows, output_fn)
コード例 #8
0
def scaleSpace(space, lower_bound, upper_bound, file_name):
    """Min-max scale ``space`` into a target range, save it and return it.

    Args:
        space: 2-D array-like to scale.
        lower_bound: Lower end of the target feature range.
        upper_bound: Upper end of the target feature range.
        file_name: Output path, written with ``write2dArray``.

    Returns:
        The scaled array produced by ``MinMaxScaler.fit_transform``.
    """
    target_range = (lower_bound, upper_bound)
    scaler = MinMaxScaler(feature_range=target_range, copy=True)
    scaled = scaler.fit_transform(space)
    write2dArray(scaled, file_name)
    return scaled
コード例 #9
0
"""

DATA EDITING TASKS

"""


def splitData(training_data, movie_vectors, movie_labels):
    """Split vectors and labels into a leading train part and a trailing test part.

    Args:
        training_data: Number of leading items that form the training set.
        movie_vectors: Sliceable sequence of feature vectors.
        movie_labels: Sliceable sequence of labels aligned with the vectors.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)`` of numpy arrays.
    """
    head = slice(None, training_data)
    tail = slice(training_data, None)
    paired = (movie_vectors, movie_labels)
    train_part = [np.asarray(seq[head]) for seq in paired]
    test_part = [np.asarray(seq[tail]) for seq in paired]
    return (*train_part, *test_part)


"""
a = import2dArray("D:\Eclipse\MDS/class-all-30-18836-alldm", "f")

a = np.nan_to_num(a)

write2dArray(a, "class-all-30-18836-alldmnTn")
"""
"""
mds = import2dArray("../data/newsgroups/nnet/spaces/mds.txt")

mds = mds.transpose()

write2dArray(mds, "../data/newsgroups/nnet/spaces/mds.txt")
"""