def learn_triplets_cooccur_mat(triplets_file_path):
    """Build the NP1-vs-(VP+NP2) co-occurrence count matrix from triplet files."""
    files = glob.glob(triplets_file_path)
    np_voc = vocabulary.Vocabulary()
    vp_voc = vocabulary.Vocabulary()
    np_voc.load('../mat/np1.voc')
    vp_voc.load('../mat/np2.voc')
    num_np = np_voc.size()
    num_vp = vp_voc.size()
    cooccur_mat = zeros([num_np, num_vp])
    for file_in in files:
        with open(file_in, 'r') as f:
            for line in f:
                if line[0] != '<':
                    line = line[:-2].lower()
                    triplets = line.split('|')
                    np1 = cleansing.clean(triplets[0].split())
                    vp = cleansing.clean(triplets[1].split())
                    np2 = cleansing.clean(triplets[2].split())
                    # NP2 words are counted together with the VP words.
                    vp.extend(np2)
                    # Keep only words that appear in the loaded vocabularies.
                    np1_new = [w for w in np1 if np_voc.contain(w)]
                    vp_new = [w for w in vp if vp_voc.contain(w)]
                    pairs = [(np_voc.get_word_index(u), vp_voc.get_word_index(v))
                             for u in np1_new for v in vp_new]
                    for (u_id, v_id) in pairs:
                        cooccur_mat[u_id, v_id] += 1
    return cooccur_mat
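A minimal driver sketch for the function above; the glob pattern is an assumption, while the output path is the one that main() below reads with np.loadtxt:

import numpy as np

if __name__ == '__main__':
    # Hypothetical input pattern; adjust to wherever the triplet files live.
    mat = learn_triplets_cooccur_mat('../triplets/*.txt')
    # Plain-text output so it can be reloaded later with np.loadtxt().
    np.savetxt('../cooccurrence/cooccur_mat_final.txt', mat)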
def main():
    matrix = np.loadtxt('../cooccurrence/cooccur_mat_final.txt')
    #matrix = matrix/10
    biclust_result = do_biclustering(matrix)
    #print(biclust_result)
    bb.write_biclusters(biclust_result, "biclust_result.txt")

    # Print the words of each bicluster.
    np_voc = vocabulary.Vocabulary()
    vp_voc = vocabulary.Vocabulary()
    np_voc.load('../mat/np.voc')
    vp_voc.load('../mat/vp.voc')

    count = 0
    with open("biclust_result_word.txt", 'w') as outfile:
        for bicluster in biclust_result:
            sub_matrix = matrix[np.ix_(bicluster.rows, bicluster.cols)]
            sum_np = sub_matrix.sum(axis=1)
            sum_vp = sub_matrix.sum(axis=0)
            dist_np = sum_np / sum_np.sum()
            dist_vp = sum_vp / sum_vp.sum()
            top_nps = np.argsort(dist_np)
            top_vps = np.argsort(dist_vp)
            # Reverse so that the indices are ordered by descending probability.
            top_nps = top_nps[::-1]
            top_vps = top_vps[::-1]

            outfile.write('\n---BICLUSTER {0}---\n'.format(count))
            count += 1

            # Write at most the 20 most probable NPs of the bicluster.
            outfile.write('Top 20 NPs:\n')
            for i in range(min(20, len(bicluster.rows))):
                r_idx = top_nps[i]
                wid = bicluster.rows[r_idx]
                outfile.write('{0} \t {1}'.format(np_voc.get_word(wid), dist_np[r_idx]))
                outfile.write('\n')

            # Write at most the 20 most probable VPs of the bicluster.
            outfile.write('Top 20 VPs:\n')
            for i in range(min(20, len(bicluster.cols))):
                c_idx = top_vps[i]
                wid = bicluster.cols[c_idx]
                outfile.write('{0} \t {1}'.format(vp_voc.get_word(wid), dist_vp[c_idx]))
                outfile.write('\n')

            #nps = [np_voc.get_word(r) for r in bicluster.rows]
            #outfile.write(" ".join(nps))
            #outfile.write('\n')
            #vps = [vp_voc.get_word(c) for c in bicluster.cols]
            #outfile.write(" ".join(vps))
            #outfile.write('\n')
            #outfile.write('\n')
    return
def learn_triplets_cooccur_mat(file_in, co_mat_file):
    learned_co_mat = CooccurMatrix()
    learned_co_mat.load(co_mat_file)
    np_voc = learned_co_mat.vocabulary
    np1_matrix = learned_co_mat.matrix

    np1_all = vocabulary.Vocabulary()
    with open(file_in, 'r') as f:
        for line in f:
            if line[0] != '<':
                line = line[:-2].lower()
                triplets = line.split('|')
                np1 = cleansing.clean(triplets[0].split())
                # Delete words not in the similarity vocabulary.
                np1_new = [w for w in np1 if np_voc.contain(w)]
                for w in np1_new:
                    np1_all.add(w)

    num_np1 = np1_all.size()
    similarity_mat_np1 = zeros([num_np1, num_np1])
    for i in range(num_np1):
        for j in range(num_np1):
            similarity_mat_np1[i, j] = np1_matrix[
                np_voc.get_word_index(np1_all.get_word(i)),
                np_voc.get_word_index(np1_all.get_word(j))]
    return CooccurMatrix(similarity_mat_np1, np1_all)
def build_sub_contextual_matrix_and_save(
        whole_vocab, story_files, vocab_filename, matrix_filename):
    """Build a contextual distribution matrix on a subset of the corpus."""
    start_time = time.time()

    stories = []
    words = []
    for story_file in story_files:
        story_words = read_story(story_file)
        words.extend(story_words)
        stories.append(story_words)

    sub_matrix_vocab = vocabulary.Vocabulary()
    sub_matrix_vocab.build_by_list(list(set(words)))
    logging.debug('Unique words in this batch: {0}.'.format(sub_matrix_vocab.size()))

    logging.debug('Calculating frequencies...')
    sub_contextual_mat = np.zeros((sub_matrix_vocab.size(), whole_vocab.size()))
    for i in range(0, len(stories)):
        story_words = stories[i]
        # Count word frequencies in the story.
        story_word_freq = {}
        for w in story_words:
            story_word_freq[w] = story_word_freq.get(w, 0) + 1
        frequencies = [(u, v, story_word_freq[u] * story_word_freq[v])
                       for u in story_word_freq.keys()
                       for v in story_word_freq.keys()]
        # Fill in the contextual distribution matrix.
        for (u, v, frequency) in frequencies:
            try:
                u_id = sub_matrix_vocab.get_word_index(u)
                v_id = whole_vocab.get_word_index(v)
                if u == v:
                    sub_contextual_mat[u_id, v_id] += math.sqrt(frequency)
                else:
                    sub_contextual_mat[u_id, v_id] += frequency
            except ValueError:
                continue

    # The sub contextual matrix is not normalized here. Normalization happens
    # when building the co-occurrence matrix of interest.
    # See: calculate_cooccur_matrix_by_submatrices().

    # Save the sub matrix and its vocabulary.
    sub_matrix_vocab.save(vocab_filename)
    save_matrix(matrix_filename, sub_contextual_mat)

    end_time = time.time()
    logging.debug("Time: %g seconds" % (end_time - start_time))
    return sub_contextual_mat
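A usage sketch, assuming the corpus is processed in fixed-size batches; the batch size, the output paths, and the helper name build_all_sub_matrices are hypothetical (build_vocabulary and read_story appear elsewhere in this section):

def build_all_sub_matrices(story_files, out_dir, batch_size=100):
    # One sub contextual matrix (plus its vocabulary) is saved per batch.
    whole_vocab = build_vocabulary(story_files)
    whole_vocab.save('{0}/corpus_vocabulary.voc'.format(out_dir))
    for start in range(0, len(story_files), batch_size):
        batch = story_files[start:start + batch_size]
        build_sub_contextual_matrix_and_save(
            whole_vocab, batch,
            '{0}/sub_{1:04d}.voc'.format(out_dir, start),
            '{0}/sub_{1:04d}'.format(out_dir, start))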
def delete_words_in_vocab(old_vocab, to_delete):
    voc_old = vocabulary.Vocabulary()
    voc_old.load(old_vocab)
    print('Before delete: {0}'.format(voc_old.size()))

    voc_to_delete = vocabulary.Vocabulary()
    voc_to_delete.load(to_delete)
    print('To delete: {0}'.format(voc_to_delete.size()))

    new_word_list = [w for w in voc_old.word_list
                     if w not in voc_to_delete.word_list]
    voc_new = vocabulary.Vocabulary()
    voc_new.build_by_list(new_word_list)
    print('After deletion: {0}'.format(len(new_word_list)))
    return voc_new
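A small usage sketch; all three vocabulary paths below are hypothetical:

# Remove stop words from an NP vocabulary and save the filtered result.
filtered = delete_words_in_vocab('../mat/np.voc', '../mat/stopwords.voc')
filtered.save('../mat/np_filtered.voc')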
def calculate_cooccur_mat_by_submatrices_and_save(vocab_file_of_interest,
                                                  full_vocab_size, files):
    # Load the vocabulary of interest.
    vocab_of_interest = vocabulary.Vocabulary()
    vocab_of_interest.load(vocab_file_of_interest)

    mat = calculate_ooccur_mat_by_submatrices(vocab_of_interest,
                                              full_vocab_size, files)

    # Save the matrix together with its (possibly reduced) vocabulary.
    mat.save(vocab_file_of_interest.replace('.voc', '_co_mat'))
    return
def main():
    if len(sys.argv) != 3:
        print('Please give matrix file and vocabulary file.')
        print('Usage: python matrix_viewer.py matrix_file vocabulary_file')
        return

    voc = vocabulary.Vocabulary()
    try:
        matrix = np.load(sys.argv[1])
        voc.load(sys.argv[2])
    except Exception as e:
        print('Failed to load matrix or vocabulary')
        print('Exception: {0}'.format(e))
        return
def build_vocabulary(story_files):
    """Build a vocabulary from a set of given tpt files."""
    voc = vocabulary.Vocabulary()
    for story_file in story_files:
        words = read_story(story_file)
        for w in words:
            voc.add(w)
    logging.debug('Vocabulary obtained: {0} words.'.format(voc.size()))
    logging.debug('Print 15 words:')
    # Guard against vocabularies with fewer than 15 words.
    for i in range(0, min(15, voc.size())):
        logging.debug('{0}: {1}'.format(i, voc.get_word(i)))
    return voc
def build_vocabulary(input_triplet_files, word_type='ALL'):
    vocab = vocabulary.Vocabulary()
    for triplet_file in input_triplet_files:
        with open(triplet_file, 'r') as f:
            for line in f:
                if line[0] != '<':
                    line = line[:-2].lower()
                    triplets = line.split('|')
                    words = []
                    if word_type == 'NP1' or word_type == 'ALL':
                        words.extend(cleansing.clean(triplets[0].split()))
                    if word_type == 'VP' or word_type == 'ALL':
                        words.extend(cleansing.clean(triplets[1].split()))
                    if word_type == 'NP2' or word_type == 'ALL':
                        words.extend(cleansing.clean(triplets[2].split()))
                    for w in words:
                        vocab.add(w)
    logging.info('Vocabulary: {0}, {1}.'.format(word_type, vocab.size()))
    return vocab
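One possible way to produce the vocabulary files loaded elsewhere in this section; the triplet glob and the exact word_type-to-file mapping are assumptions:

triplet_files = glob.glob('../triplets/*.txt')  # hypothetical location
build_vocabulary(triplet_files, 'NP1').save('../mat/np1.voc')
build_vocabulary(triplet_files, 'VP').save('../mat/vp.voc')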
def generate_similarity_matrix(distribution_matrix_dir, vocabularies_of_interest):
    logging.basicConfig(level=logging.DEBUG)

    mat_filenames = glob.glob('{0}/*.npy'.format(distribution_matrix_dir))
    mat_filenames.sort()
    # List of pairs (vocabulary_filename, matrix_filename).
    files = []
    for mat_filename in mat_filenames:
        files.append((mat_filename.replace('.npy', '.voc'), mat_filename))
    logging.debug('Total # of sub matrices: {0}.'.format(len(files)))

    # Load the full vocabulary.
    full_vocab = vocabulary.Vocabulary()
    full_vocab.load('{0}/corpus_vocabulary.voc'.format(distribution_matrix_dir))
    logging.debug('Size of whole vocabulary: {0}.'.format(full_vocab.size()))

    # Read the NP list and VP list.
    for vocab_file in vocabularies_of_interest:
        logging.debug('Calculating similarity matrix for \'{0}\'...'.format(
            vocab_file))
        cooccur_mat.calculate_cooccur_mat_by_submatrices_and_save(
            vocab_file, full_vocab.size(), files)
    return
def load(self, filename):
    self.matrix = np.load(filename + ".npy")
    self.vocabulary = vocabulary.Vocabulary()
    self.vocabulary.load(filename + ".voc")
def calculate_ooccur_mat_by_submatrices(vocab_of_interest, full_vocab_size, files):
    logging.debug('Total # of words: {0}.'.format(vocab_of_interest.size()))

    # Build the contextual distribution matrix for the words of interest.
    contextual_mat = np.zeros((vocab_of_interest.size(), full_vocab_size))
    num_it = 0
    for (voc_filename, mat_filename) in files:
        num_it += 1
        # First read the sub vocabulary.
        sub_matrix_vocab = vocabulary.Vocabulary()
        sub_matrix_vocab.load(voc_filename)
        # Only load the sub matrix if it contains a distribution we want.
        has_target_word = False
        target_words_rows = []
        for i in range(0, vocab_of_interest.size()):
            if sub_matrix_vocab.contain(vocab_of_interest.get_word(i)):
                # Get the row id of the word of interest in the sub matrix.
                row_id = sub_matrix_vocab.get_word_index(vocab_of_interest.get_word(i))
                target_words_rows.append(row_id)
                has_target_word = True
            else:
                # If the word of interest is not in this sub vocabulary, mark it as -1.
                target_words_rows.append(-1)
        if has_target_word:
            sub_matrix = np.load(mat_filename)
            for i in range(0, vocab_of_interest.size()):
                row_id = target_words_rows[i]
                if row_id != -1:
                    contextual_mat[i] += sub_matrix[row_id]

    # Remove the words with no distribution information.
    row_sums = contextual_mat.sum(axis=1)
    rows_to_delete = []
    for i in range(0, vocab_of_interest.size()):
        if row_sums[i] == 0:
            rows_to_delete.append(i)
            logging.warning('No contextual information for: {0}'.format(
                vocab_of_interest.get_word(i)))
    logging.warning('# of words deleted: {0}'.format(len(rows_to_delete)))

    # Delete the corresponding rows from the matrix.
    contextual_mat = np.delete(contextual_mat, rows_to_delete, 0)
    row_sums = np.delete(row_sums, rows_to_delete, 0)

    # Build the new vocabulary after deletion.
    vocab_after_delete = vocabulary.Vocabulary()
    for w in vocab_of_interest.word_list:
        if vocab_of_interest.get_word_index(w) not in rows_to_delete:
            vocab_after_delete.add(w)

    # Normalize each row into a distribution.
    contextual_mat = contextual_mat / row_sums.reshape(-1, 1)

    # Build the co-occurrence matrix from the contextual matrix.
    co_mat = calculate_cooccur_matrix(contextual_mat)
    logging.debug(co_mat)
    return CooccurMatrix(co_mat, vocab_after_delete)
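A lookup sketch on a saved result; the file name follows the .voc to _co_mat convention used in calculate_cooccur_mat_by_submatrices_and_save above, and the two query words are placeholders:

co = CooccurMatrix()
co.load('../mat/np_co_mat')  # hypothetical file produced from ../mat/np.voc
i = co.vocabulary.get_word_index('police')   # placeholder word
j = co.vocabulary.get_word_index('officer')  # placeholder word
print('similarity: {0}'.format(co.matrix[i, j]))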