Example #1
def learn_triplets_cooccur_mat(triplets_file_path):
    files = glob.glob(triplets_file_path)
    np_voc = vocabulary.Vocabulary()
    vp_voc = vocabulary.Vocabulary()
    np_voc.load('../mat/np1.voc')
    vp_voc.load('../mat/np2.voc')
    num_np = np_voc.size()
    num_vp = vp_voc.size()
    cooccur_mat = zeros([num_np, num_vp])
    for file_in in files:
        with open(file_in, 'r') as f:
            for line in f:
                if (line[0] != '<'):
                    line = (line[:-2]).lower()
                    triplets = line.split('|')
                    np1 = cleansing.clean(triplets[0].split())
                    vp = cleansing.clean(triplets[1].split())
                    np2 = cleansing.clean(triplets[2].split())
                    for w in np2:
                        vp.append(w)
                    np1_new = [w for w in np1 if np_voc.contain(w)]
                    vp_new = [w for w in vp if vp_voc.contain(w)]

                    pairs = [(np_voc.get_word_index(u),
                              vp_voc.get_word_index(v)) for u in np1_new
                             for v in vp_new]
                    for pair in pairs:
                        cooccur_mat[pair[0], pair[1]] += 1
    return cooccur_mat
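A minimal usage sketch for this function, assuming the same module-level imports (glob, numpy, vocabulary, cleansing) as the example itself. The glob pattern and output path are hypothetical; a matrix written out like this is what main() in Example #2 reads back with np.loadtxt.

# Hypothetical paths; adjust to the local corpus layout.
mat = learn_triplets_cooccur_mat('../triplets/*.txt')
np.savetxt('../cooccurrence/cooccur_mat.txt', mat)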
Example #2
def main():
    matrix = np.loadtxt('../cooccurrence/cooccur_mat_final.txt')
    #matrix = matrix/10
    biclust_result = do_biclustering(matrix)
    #print(biclust_result)
    bb.write_biclusters(biclust_result, "biclust_result.txt")
    # write the top words of each bicluster to a file
    np_voc = vocabulary.Vocabulary()
    vp_voc = vocabulary.Vocabulary()
    np_voc.load('../mat/np.voc')
    vp_voc.load('../mat/vp.voc')
    count = 0
    with open("biclust_result_word.txt", 'w') as outfile:
        for bicluster in biclust_result:
            sub_matrix = matrix[np.ix_(bicluster.rows, bicluster.cols)]
            sum_np = sub_matrix.sum(axis=1)
            sum_vp = sub_matrix.sum(axis=0)
            dist_np = sum_np / (sum_np.sum())
            dist_vp = sum_vp / (sum_vp.sum())
            top_nps = np.argsort(dist_np)
            top_vps = np.argsort(dist_vp)

            # reverse so that the indices are ordered in descending order.
            top_nps = top_nps[::-1]
            top_vps = top_vps[::-1]

            #print('\n---BICLUSTER---')
            outfile.write('\n---BICLUSTER {0}---\n'.format(count))
            count += 1
            #print('Top 20 NPs:')
            outfile.write('Top 20 NPs:\n')
            # cap at the top 20 NPs; index 0 is the highest-ranked item.
            for i in range(min(20, len(bicluster.rows))):
                r_idx = top_nps[i]
                wid = bicluster.rows[r_idx]
                #print('{0} \t {1}'.format(np_voc.get_word(wid), dist_np[r_idx]))
                outfile.write('{0} \t {1}'.format(np_voc.get_word(wid),
                                                  dist_np[r_idx]))
                outfile.write('\n')

            #print('Top 20 VPs:')
            outfile.write('Top 20 VPs:\n')
            # cap at the top 20 VPs; index 0 is the highest-ranked item.
            for i in range(min(20, len(bicluster.cols))):
                c_idx = top_vps[i]
                wid = bicluster.cols[c_idx]
                #print('{0} \t {1}'.format(vp_voc.get_word(wid), dist_vp[c_idx]))
                outfile.write('{0} \t {1}'.format(vp_voc.get_word(wid),
                                                  dist_vp[c_idx]))
                outfile.write('\n')

            #nps = [np_voc.get_word(r) for r in bicluster.rows]
            #outfile.write(" ".join(nps))
            #outfile.write('\n')

            #vps = [vp_voc.get_word(c) for c in bicluster.cols]
            #outfile.write(" ".join(vps))
            #outfile.write('\n')
            #outfile.write('\n')
    return
Example #3
def learn_triplets_cooccur_mat(file_in, co_mat_file):
    learned_co_mat = CooccurMatrix()
    learned_co_mat.load(co_mat_file)
    np_voc = learned_co_mat.vocabulary
    np1_matrix = learned_co_mat.matrix

    np1_all = vocabulary.Vocabulary()
    with open(file_in, 'r') as f:
        for line in f:
            if (line[0] != '<'):
                line = (line[:-2]).lower()
                triplets = line.split('|')
                np1 = cleansing.clean(triplets[0].split())

                # Delete words not in the similarity vocabulary.
                np1_new = [w for w in np1 if np_voc.contain(w)]
                for w in np1_new:
                    np1_all.add(w)

    num_np1 = np1_all.size()
    similarity_mat_np1 = zeros([num_np1, num_np1])
    for i in range(num_np1):
        for j in range(num_np1):
            similarity_mat_np1[i, j] = np1_matrix[
                np_voc.get_word_index(np1_all.get_word(i)),
                np_voc.get_word_index(np1_all.get_word(j))]

    return CooccurMatrix(similarity_mat_np1, np1_all)
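A usage sketch with hypothetical file names. CooccurMatrix.save is assumed to take a filename prefix and write the .npy/.voc pair that the load method in Example #11 reads back, as in Example #6.

# Hypothetical triplet file and co-occurrence matrix prefix.
restricted = learn_triplets_cooccur_mat('../triplets/story1.txt', '../mat/np_co_mat')
restricted.save('../mat/np1_restricted_co_mat')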
Example #4
def build_sub_contextual_matrix_and_save(
        whole_vocab, story_files, vocab_filename, matrix_filename):
    """Build a contextual distribution matrix on a subset of corpus."""

    start_time = time.time()

    stories = []
    words = []
    for story_file in story_files:
        story_words = read_story(story_file)
        words.extend(story_words)
        stories.append(story_words)

    sub_matrix_vocab = vocabulary.Vocabulary()
    sub_matrix_vocab.build_by_list(list(set(words)))

    logging.debug('Unique words in this batch: {0}.'.format(sub_matrix_vocab.size()))
    logging.debug('Calculating frequencies...')

    sub_contextual_mat = np.zeros((sub_matrix_vocab.size(), whole_vocab.size()))

    for i in range(0, len(stories)):
        story_words = stories[i]

        # count word frequencies in the story
        story_word_freq = {}
        for w in story_words:
            if w in story_word_freq:
                story_word_freq[w] = story_word_freq[w] + 1
            else:
                story_word_freq[w] = 1

        frequencies = [(u, v, story_word_freq[u]*story_word_freq[v])
            for u in story_word_freq.keys() for v in story_word_freq.keys()]

        # fill in the contextual distribution matrix
        for (u, v, frequency) in frequencies:
            try:
                u_id = sub_matrix_vocab.get_word_index(u)
                v_id = whole_vocab.get_word_index(v)
                if u == v:
                    sub_contextual_mat[u_id, v_id] += math.sqrt(frequency)
                else:
                    sub_contextual_mat[u_id, v_id] += frequency
            except ValueError:
                continue

    # Here we do not normalize the sub contextual matrix.
    # It is normalized later, when building the co-occurrence matrix of interest.
    # See: calculate_cooccur_matrix_by_submatrices().

    # save sub matrices and vocabularies.
    sub_matrix_vocab.save(vocab_filename)
    save_matrix(matrix_filename, sub_contextual_mat)

    end_time = time.time()
    logging.debug("Time: %g seconds" % (end_time - start_time))

    return sub_contextual_mat
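A batching sketch around this function, assuming the full corpus vocabulary has already been built and saved (see build_vocabulary in Example #8) and that save_matrix writes a .npy file, so the .voc/.npy pairing used in Example #10 holds. All paths and the batch size are hypothetical.

# Hypothetical corpus layout and batch size.
story_files = sorted(glob.glob('../stories/*.tpt'))

whole_vocab = vocabulary.Vocabulary()
whole_vocab.load('../dist/corpus_vocabulary.voc')

batch_size = 100
for start in range(0, len(story_files), batch_size):
    batch = story_files[start:start + batch_size]
    build_sub_contextual_matrix_and_save(
        whole_vocab, batch,
        '../dist/sub_{0}.voc'.format(start),
        '../dist/sub_{0}'.format(start))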
Example #5
def delete_words_in_vocab(old_vocab, to_delete):
    voc_old = vocabulary.Vocabulary()
    voc_old.load(old_vocab)
    print('Before deletion: {0}'.format(voc_old.size()))

    voc_to_delete = vocabulary.Vocabulary()
    voc_to_delete.load(to_delete)
    print('To delete: {0}'.format(voc_to_delete.size()))

    new_word_list = [
        w for w in voc_old.word_list if w not in voc_to_delete.word_list
    ]
    voc_new = vocabulary.Vocabulary()
    voc_new.build_by_list(new_word_list)
    print('After deletion: {0}'.format(len(new_word_list)))

    return voc_new
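A usage sketch with hypothetical vocabulary files; Vocabulary.save, as used in Example #4, is assumed to take the output path.

# Hypothetical vocabulary files.
pruned = delete_words_in_vocab('../mat/np.voc', '../mat/stopwords.voc')
pruned.save('../mat/np_pruned.voc')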
Example #6
def calculate_cooccur_mat_by_submatrices_and_save(vocab_file_of_interest, full_vocab_size, files):
    # Load Vocabulary of interest
    vocab_of_interest = vocabulary.Vocabulary()
    vocab_of_interest.load(vocab_file_of_interest)

    mat = calculate_ooccur_mat_by_submatrices(vocab_of_interest, full_vocab_size, files)

    # Save the co-occurrence matrix together with its vocabulary.
    mat.save(vocab_file_of_interest.replace('.voc', '_co_mat'))
    return
Example #7
def main():
    if (len(sys.argv) != 3):
        print('Please give matrix file and vocabulary file.')
        print('Usage: python matrix_viewer.py matrix_file vocabulary_file')
        return

    voc = vocabulary.Vocabulary()
    try:
        matrix = np.load(sys.argv[1])
        voc.load(sys.argv[2])
    except Exception as e:
        print('Failed to load matrix or vocabulary')
        print('Exception: {0}'.format(e))
        return
Example #8
def build_vocabulary(story_files):
    """Build a vocabulary from a set of given tpt files."""
    voc = vocabulary.Vocabulary()

    for story_file in story_files:
        words = read_story(story_file)
        for w in words:
            voc.add(w)

    logging.debug('Vocabulary obtained: {0} words.'.format(voc.size()))
    logging.debug('Print 15 words:')
    for i in range(0, 15):
        logging.debug('{0}: {1}'.format(i, voc.get_word(i)))
    return voc
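A usage sketch that builds the corpus vocabulary from all story files and saves it under the name expected by generate_similarity_matrix in Example #10; the story directory is hypothetical.

# Hypothetical story directory.
corpus_vocab = build_vocabulary(glob.glob('../stories/*.tpt'))
corpus_vocab.save('../dist/corpus_vocabulary.voc')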
Example #9
def build_vocabulary(input_triplet_files, word_type='ALL'):
    vocab = vocabulary.Vocabulary()
    for triplet_file in input_triplet_files:
        with open(triplet_file, 'r') as f:
            for line in f:
                if (line[0] != '<'):
                    line = (line[:-2]).lower()
                    triplets = line.split('|')
                    words = []
                    if (word_type == 'NP1' or word_type == 'ALL'):
                        words.extend(cleansing.clean(triplets[0].split()))
                    if (word_type == 'VP' or word_type == 'ALL'):
                        words.extend(cleansing.clean(triplets[1].split()))
                    if (word_type == 'NP2' or word_type == 'ALL'):
                        words.extend(cleansing.clean(triplets[2].split()))

                    for w in words:
                        vocab.add(w)
    logging.info('Vocabulary: {0}, {1}.'.format(word_type, vocab.size()))
    return vocab
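A usage sketch of the word_type filter, producing separate NP1 and VP vocabularies from the same triplet files; the paths are hypothetical.

# Hypothetical triplet files.
triplet_files = glob.glob('../triplets/*.txt')
np1_voc = build_vocabulary(triplet_files, word_type='NP1')
vp_voc = build_vocabulary(triplet_files, word_type='VP')
np1_voc.save('../mat/np1.voc')
vp_voc.save('../mat/vp.voc')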
Example #10
def generate_similarity_matrix(distribution_matrix_dir, vocabularies_of_interest):
    logging.basicConfig(level=logging.DEBUG)

    mat_filenames = glob.glob('{0}/*.npy'.format(distribution_matrix_dir))
    mat_filenames.sort()

    # list of pairs (vocabulary_filename, matrix_filename)
    files = []
    for mat_filename in mat_filenames:
        files.append((mat_filename.replace('.npy', '.voc'), mat_filename))
    logging.debug('Total # of sub matrices: {0}.'.format(len(files)))

    # load the full vocabulary
    full_vocab = vocabulary.Vocabulary()
    full_vocab.load('{0}/corpus_vocabulary.voc'.format(distribution_matrix_dir))
    logging.debug('Size of whole vocabulary: {0}.'.format(full_vocab.size()))

    # read the NP list and VP list
    for vocab_file in vocabularies_of_interest:
        logging.debug('Calculating similarity matrix for \'{0}\'...'.format(
            vocab_file))
        cooccur_mat.calculate_cooccur_mat_by_submatrices_and_save(
            vocab_file, full_vocab.size(), files)
    return
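A usage sketch with a hypothetical distribution-matrix directory; the directory is assumed to contain the per-batch .npy/.voc pairs plus corpus_vocabulary.voc, as produced in Examples #4 and #8.

# Hypothetical directory and vocabularies of interest.
generate_similarity_matrix('../dist', ['../mat/np1.voc', '../mat/vp.voc'])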
Example #11
def load(self, filename):
    self.matrix = np.load(filename + ".npy")
    self.vocabulary = vocabulary.Vocabulary()
    self.vocabulary.load(filename + ".voc")
Example #12
def calculate_ooccur_mat_by_submatrices(vocab_of_interest, full_vocab_size, files):
    logging.debug('Total # of words: {0}.'.format(vocab_of_interest.size()))

    # Build the contextual distribution matrix for the vocabulary of interest.
    contextual_mat = np.zeros((vocab_of_interest.size(), full_vocab_size))

    num_it = 0
    for (voc_filename, mat_filename) in files:
        num_it += 1

        # First read the sub vocabulary.
        sub_matrix_vocab = vocabulary.Vocabulary()
        sub_matrix_vocab.load(voc_filename)

        # Only load sub matrix if the matrix has the distribution we want.
        has_target_word = False
        target_words_rows = []
        for i in range(0, vocab_of_interest.size()):
            if sub_matrix_vocab.contain(vocab_of_interest.get_word(i)):
                # get the row id of the word of interest in the sub matrix.
                row_id = sub_matrix_vocab.get_word_index(vocab_of_interest.get_word(i))
                target_words_rows.append(row_id)
                has_target_word = True
            else:
                # if the word of interest is not in this sub vocabulary,
                # mark its row as -1.
                target_words_rows.append(-1)

        if has_target_word:
            sub_matrix = np.load(mat_filename)
            for i in range(0, vocab_of_interest.size()):
                row_id = target_words_rows[i]
                if row_id != -1:
                    contextual_mat[i] += sub_matrix[row_id]

    # remove the words with no distribution information.
    row_sums = contextual_mat.sum(axis=1)
    rows_to_delete = []
    for i in range(0, vocab_of_interest.size()):
        if row_sums[i] == 0:
            rows_to_delete.append(i)
            logging.warning('No contextual information for: {0}'.format(vocab_of_interest.get_word(i)))
    logging.warning('# of words deleted: {0}'.format(len(rows_to_delete)))

    # delete corresponding rows from matrix.
    contextual_mat = np.delete(contextual_mat, rows_to_delete, 0)
    row_sums = np.delete(row_sums, rows_to_delete, 0)

    # build new vocabulary after deletion.
    vocab_after_delete = vocabulary.Vocabulary()
    for w in vocab_of_interest.word_list:
        if vocab_of_interest.get_word_index(w) not in rows_to_delete:
            vocab_after_delete.add(w)

    # normalize.
    contextual_mat = contextual_mat / row_sums.reshape(-1, 1)

    # Build the co-occurrence matrix by contextual matrix.
    co_mat = calculate_cooccur_matrix(contextual_mat)
    logging.debug(co_mat)

    return CooccurMatrix(co_mat, vocab_after_delete)