Example #1
from numpy import zeros

import cleansing    # Project-local module (used below via cleansing.clean).
import vocabulary   # Project-local module (used below via vocabulary.Vocabulary).
# CooccurMatrix is assumed importable from the surrounding project.


def learn_triplets_cooccur_mat(file_in, co_mat_file):
    # Load the previously learned co-occurrence matrix and its vocabulary.
    learned_co_mat = CooccurMatrix()
    learned_co_mat.load(co_mat_file)
    np_voc = learned_co_mat.vocabulary
    np1_matrix = learned_co_mat.matrix

    # Collect every NP1 word from the triplets file that also appears in the
    # similarity vocabulary.
    np1_all = vocabulary.Vocabulary()
    with open(file_in, 'r') as f:
        for line in f:
            if line[0] != '<':  # Skip markup lines.
                line = line[:-2].lower()  # Drop the trailing two characters and lowercase.
                triplets = line.split('|')
                np1 = cleansing.clean(triplets[0].split())

                # Delete words not in the similarity vocabulary.
                np1_new = [w for w in np1 if np_voc.contain(w)]
                for w in np1_new:
                    np1_all.add(w)

    # Extract the pairwise-similarity sub-matrix for the collected words.
    num_np1 = np1_all.size()
    similarity_mat_np1 = zeros([num_np1, num_np1])
    for i in range(num_np1):
        for j in range(num_np1):
            similarity_mat_np1[i, j] = np1_matrix[
                np_voc.get_word_index(np1_all.get_word(i)),
                np_voc.get_word_index(np1_all.get_word(j))]

    return CooccurMatrix(similarity_mat_np1, np1_all)
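A minimal usage sketch follows; the file paths and the stored matrix name are hypothetical placeholders, not part of the original example:

# Hypothetical usage: both paths below are placeholders.
story_mat = learn_triplets_cooccur_mat('../data/story_triplets.txt',
                                       '../mat/np1_cooccur')
# Assuming the CooccurMatrix constructor stores its arguments as
# .matrix and .vocabulary, as the load() path above suggests.
print(story_mat.matrix.shape)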
Example #2
import codecs
import glob
from math import sqrt

import numpy as np

# CooccurMatrix and learn_story_histogram are assumed importable from the
# surrounding project.


def learn_story_distances(triplets_file_path, co_mat_file, use_similarity=True,
                          min_similarity=None, output_file=False):
    # Load learned similarity matrix.
    learned_co_mat = CooccurMatrix()
    learned_co_mat.load(co_mat_file)
    np_voc = learned_co_mat.vocabulary
    np1_matrix = learned_co_mat.matrix
    if min_similarity is not None:
        # Zero out similarities below the requested threshold.
        np1_matrix = (np1_matrix >= min_similarity) * np1_matrix

    files = glob.glob(triplets_file_path)
    files.sort()
    print(len(files))
    file_num = len(files)

    # Calculate word histogram for each story.
    np_num = np_voc.size()
    hist = np.zeros([file_num, np_num])
    for count, file_in in enumerate(files):
        (h, wordlist) = learn_story_histogram(file_in, np_voc)
        hist[count, :] = h

    # Calculate pair-wise distance between stories.
    dist = np.zeros([file_num, file_num])
    for i in range(file_num):
        for j in range(file_num):
            dif = hist[i, :] - hist[j, :]
            if use_similarity:
                # Quadratic-form distance weighted by word similarity.
                sq = np.dot(np.dot(dif, np1_matrix), dif.T)
                if sq < 0:
                    # Guard against small negative values from floating-point error.
                    sq = 0
                dist[i, j] = sqrt(sq)
            else:
                dist[i, j] = sqrt(np.dot(dif, dif.T))

    # Label each story with its file's base name, minus the extension.
    labels = []
    for filename in files:
        labels.append(filename.split('/')[-1][:-4])

    if output_file:
        np.savetxt('../mat/histogram.txt', hist)
        np.savetxt('../mat/distance.txt', dist)
        with codecs.open('../mat/filename.txt', 'w', encoding='ISO-8859-1') as f:
            for l in labels:
                f.write(l + '\n')
    return (dist, labels)
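The similarity-weighted distance computed above is dist(i, j) = sqrt((h_i - h_j) * S * (h_i - h_j)^T), where h_i is story i's word histogram and S is the (optionally thresholded) similarity matrix. A hypothetical call, with placeholder paths:

# Hypothetical usage: the glob pattern and matrix path are placeholders.
dist, labels = learn_story_distances('../data/stories/*.txt',
                                     '../mat/np1_cooccur',
                                     use_similarity=True,
                                     min_similarity=0.5)
print(dist.shape)   # (file_num, file_num), symmetric
print(labels[:3])   # Base names of the first three story files.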