def main():
    """
    Align two sparse matrices by intersecting their columns.

    Loads two pickled count spaces, restricts both co-occurrence matrices
    to the columns they share, optionally L2-normalizes the row vectors,
    and saves the two aligned spaces in pickle format.
    """

    # Get the arguments
    args = docopt('''Align two sparse matrices by intersecting their columns.

    Usage:
        count_alignment_intersect.py [-l] <outPath1> <outPath2> <spacePrefix1> <spacePrefix2>

        <outPath1> = output path for aligned space 1
        <outPath2> = output path for aligned space 2
        <spacePrefix1> = path to pickled space1 without suffix
        <spacePrefix2> = path to pickled space2 without suffix

    Options:
        -l, --len normalize final vectors to unit length

    ''')

    is_len = args['--len']
    spacePrefix1 = args['<spacePrefix1>']
    spacePrefix2 = args['<spacePrefix2>']
    outPath1 = args['<outPath1>']
    outPath2 = args['<outPath2>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load the two pickled spaces and their row/column index mappings
    space1 = load_pkl_files(spacePrefix1)
    space2 = load_pkl_files(spacePrefix2)
    id2row1 = space1.get_id2row()
    id2row2 = space2.get_id2row()
    id2column1 = space1.get_id2column()
    id2column2 = space2.get_id2column()
    column2id1 = space1.get_column2id()
    column2id2 = space2.get_column2id()

    # Columns present in both spaces. Sorted so the output column order is
    # deterministic across runs: plain set iteration order varies between
    # processes (hash randomization), which made the saved spaces
    # irreproducible. Both spaces use the same ordering, so they stay
    # mutually aligned either way.
    intersected_columns = sorted(set(id2column1).intersection(id2column2))
    intersected_columns_id1 = [column2id1[item] for item in intersected_columns]
    intersected_columns_id2 = [column2id2[item] for item in intersected_columns]

    # Restrict both co-occurrence matrices to the shared columns
    reduced_matrix1 = space1.get_cooccurrence_matrix()[:, intersected_columns_id1].get_mat()
    reduced_matrix2 = space2.get_cooccurrence_matrix()[:, intersected_columns_id2].get_mat()

    if is_len:
        # L2-normalize vectors (row-wise); zero norms are replaced by 1.0
        # so that all-zero rows are left unchanged instead of producing NaNs
        l2norm1 = linalg.norm(reduced_matrix1, axis=1, ord=2)
        l2norm2 = linalg.norm(reduced_matrix2, axis=1, ord=2)
        l2norm1[l2norm1 == 0.0] = 1.0  # Convert 0 values to 1
        l2norm2[l2norm2 == 0.0] = 1.0  # Convert 0 values to 1
        reduced_matrix1 /= l2norm1.reshape(len(l2norm1), 1)
        reduced_matrix2 /= l2norm2.reshape(len(l2norm2), 1)

    # Rebuild the two spaces over the shared, ordered column set
    reduced_space1 = Space(SparseMatrix(reduced_matrix1), id2row1, intersected_columns)
    reduced_space2 = Space(SparseMatrix(reduced_matrix2), id2row2, intersected_columns)

    if reduced_space1.get_id2column() != reduced_space2.get_id2column():
        sys.exit('Two spaces not properly aligned!')

    # Save the Space object in pickle format
    save_pkl_files(reduced_space1, outPath1 + '.sm', save_in_one_file=True)
    save_pkl_files(reduced_space2, outPath2 + '.sm', save_in_one_file=True)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
def main():
    """
    Get second-order pairs from pair file.

    Builds a co-occurrence matrix from word pairs, then for each
    low-frequency target samples new ("second-order") context words from
    the aggregated context distribution of its first-order contexts, and
    writes the sampled pairs to a file.
    """

    # Get the arguments
    args = docopt("""Get second-order pairs from pair file.

    Usage:
        second_order_pairs.py <pairFile> <samplesize> <freqThr> <outPath>

    Arguments:
        <pairFile> = path to training pairs with each line in the format 'word1 word2'
        <samplesize> = number of new pairs per target, expressed as percentage of old pairs (1.0 extracts equally many new pairs as old pairs)
        <freqThr> = co-occurrence frequency threshold over which no pairs are extracted
        <outPath> = output path for extracted pairs

    Note:
        Pairs are not yet switched or shuffled.

    """)

    pairFile = args['<pairFile>']
    samplesize = float(args['<samplesize>'])
    freqThr = int(args['<freqThr>'])
    outPath = args['<outPath>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Build vocabulary
    logging.info("Building vocabulary")
    freqs = defaultdict(int)  # target frequency = number of pair lines it heads
    targets = set()
    contexts = set()
    with codecs.open(pairFile, 'r', 'utf-8') as f_in:
        for line in f_in:
            line = line.strip().split(' ')
            word = line[0]
            freqs[word] += 1
            contextWord = line[1]
            targets.add(word)
            contexts.add(contextWord)

    # Targets and contexts share one index space so context rows can be
    # looked up in the same matrix later
    vocabulary = targets.union(contexts)
    w2i = {w: i for i, w in enumerate(vocabulary)}

    # Initialize co-occurrence matrix as dictionary
    cooc_mat = defaultdict(int)

    # Get counts
    logging.info("Counting context words")
    with codecs.open(pairFile, 'r', 'utf-8') as f_in:
        for line in f_in:
            line = line.strip().split(' ')
            word = line[0]
            windex = w2i[word]
            contextWord = line[1]
            cindex = w2i[contextWord]
            cooc_mat[(windex, cindex)] += 1

    # Convert dictionary to sparse matrix
    logging.info("Converting dictionary to matrix")
    cooc_mat_sparse = dok_matrix((len(vocabulary), len(vocabulary)),
                                 dtype=float)
    try:
        cooc_mat_sparse.update(cooc_mat)
    except NotImplementedError:
        # newer scipy versions forbid dok.update(); fall back to the
        # private method
        cooc_mat_sparse._update(cooc_mat)

    # Make space; row/column labels are stored as utf-8 byte strings
    vocabulary = [v.encode('utf-8') for v in vocabulary]
    countSpace = Space(SparseMatrix(cooc_mat_sparse), vocabulary, vocabulary)
    id2row = countSpace.get_id2row()
    row2id = countSpace.get_row2id()
    id2column = countSpace.get_id2column()
    column2id = countSpace.get_column2id()
    # Map a column id to the row id of the same word (rows == columns here)
    cid2rid = {cid: row2id[c]
               for cid, c in enumerate(id2column) if c in row2id}
    space = countSpace
    matrix = space.get_cooccurrence_matrix().get_mat()

    t2c = {}

    # Sample new pairs
    logging.info("Sampling new pairs")
    for i, target in enumerate(id2row):
        freq = freqs[target.decode('utf8')]
        if freq > freqThr:
            # Frequent targets have enough first-order evidence; skip
            continue

        # Get counts as matrix
        m = space.get_row(target).get_mat()
        # Get nonzero indexes
        nonzeros = list(m.nonzero()[1])
        if nonzeros == []:
            continue
        data = m.data

        # Build second-order vector: sum of the context rows, weighted by
        # the target's first-order co-occurrence counts
        contextrowids = [cid2rid[n] for n in nonzeros]
        contextrows = matrix[contextrowids]
        vector2nd = csr_matrix(
            contextrows.multiply(data.reshape(-1, 1)).sum(axis=0))

        # Sample from second-order vector, proportionally to its counts
        nonzeros = list(vector2nd.nonzero()[1])
        if nonzeros == []:
            continue
        data = vector2nd.data
        sampling_probs = data / np.sum(data)
        samplesize_absolute = int(samplesize * freq)
        t_contexts = list(
            np.random.choice(nonzeros,
                             size=samplesize_absolute,
                             replace=True,
                             p=sampling_probs))
        # Drop self-pairs (target sampled as its own context)
        t2c[target] = [id2column[c] for c in t_contexts
                       if id2column[c] != target]

    # Export new pairs
    logging.info("Exporting new pairs")
    # Fix: the output must be opened with an explicit utf-8 encoding
    # (matching the input), and the byte-string labels must be decoded
    # before joining — ' '.join() on byte strings raises TypeError on
    # Python 3.
    with codecs.open(outPath, 'w', 'utf-8') as f_out:
        for t in t2c:
            t_str = t.decode('utf-8')
            for c in t2c[t]:
                f_out.write(' '.join((t_str, c.decode('utf-8'))) + '\n')

    logging.info("--- %s seconds ---" % (time.time() - start_time))