def main():
    """
    Turn an EPMI matrix stored in npz format into a shifted PPMI (SPPMI)
    space and pickle it.

    The sparse matrix is read from ``<spacePrefix>.npz`` and the row and
    column labels from ``<spacePrefix>.words.vocab`` and
    ``<spacePrefix>.contexts.vocab``. The stored values are log-transformed
    and shifted by ``log(k)``; values that are not strictly positive are
    removed from the matrix. The result is saved with ``save_pkl_files``
    under ``<outPath> + 'ppmi.sm'``.
    """

    # Parse the command line
    args = docopt('''Transform EPMI matrix in npz format to SPPMI space and save as pickle file.

    Usage:
        transform_matrix_epmi2sppmi.py <spacePrefix> <outPath> <k>

        <spacePrefix> = path to npz without suffix
        <outPath> = output path for space
        <k> = shifting parameter

    ''')

    prefix = args['<spacePrefix>']
    out_path = args['<outPath>']
    shift = int(args['<k>'])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    def read_labels(path):
        # One label per line, surrounding whitespace removed
        with open(path) as handle:
            return [entry.strip() for entry in handle if entry]

    # Rebuild the CSR matrix from the arrays stored in the npz archive
    with np.load(prefix + '.npz') as archive:
        csr_parts = (archive['data'], archive['indices'], archive['indptr'])
        matrix = csr_matrix(csr_parts, shape=archive['shape'])

    row_labels = read_labels(prefix + '.words.vocab')
    column_labels = read_labels(prefix + '.contexts.vocab')

    # EPMI -> PMI: take the logarithm of the stored values
    matrix.data = np.log(matrix.data)

    # PMI -> shifted PMI
    matrix.data -= np.log(shift)

    # Shifted PMI -> SPPMI: zero out everything that is not positive and
    # drop the explicit zeros from the sparse structure
    matrix.data[matrix.data <= 0] = 0.0
    matrix.eliminate_zeros()

    # Wrap the matrix and its labels in a space
    sppmi_space = Space(SparseMatrix(matrix), row_labels, column_labels)

    # Save the Space object in pickle format
    save_pkl_files(sppmi_space, out_path + 'ppmi.sm', save_in_one_file=True)

    logging.info("--- %s seconds ---", time.time() - start_time)
def main():
    """
    Convert temporal referencing matrix to regular (binned) matrix.

    Loads the pickled space at ``<spacePrefix>`` and keeps only the rows
    whose label either contains no ``'_'`` or has the form
    ``<word>_<ref>``. Kept rows are relabelled with the part before the
    underscore. The reduced matrix is saved to ``<outPath>`` either in w2v
    format (``-w``) or as a sparse matrix space (``-s``).

    Raises:
        ValueError: if no row of the input space matches the selection.
    """

    # Get the arguments
    args = docopt("""Convert temporal referencing matrix to regular (binned) matrix.

    Usage:
        tr2bin.py (-w | -s) <spacePrefix> <ref> <outPath>

        <spacePrefix> = path to pickled space without suffix
        <ref> = reference string
        <outPath> = output path for result file

    Options:
        -w, --w2v   save in w2v format
        -s, --sps   save in sparse matrix format

    """)

    is_w2v = args['--w2v']
    is_sps = args['--sps']
    spacePrefix = args['<spacePrefix>']
    ref = args['<ref>']
    outPath = args['<outPath>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load space
    space = load_pkl_files(spacePrefix)
    matrix = space.get_cooccurrence_matrix().get_mat()
    id2row = space.get_id2row()
    id2column = space.get_id2column()

    # Select rows: plain words (no '_') and words tagged with the reference
    # string; the tag is stripped from the new row label
    targets = []
    indices = []
    for i, w in enumerate(id2row):
        spl = w.split('_')
        if len(spl) == 1 or (len(spl) == 2 and spl[1] == ref):
            targets.append(spl[0])
            indices.append(i)

    # Fail with a clear message instead of an opaque unpacking error when
    # nothing was selected
    if not indices:
        raise ValueError(
            "No rows in space '%s' are untagged or tagged with reference '%s'"
            % (spacePrefix, ref))

    new_matrix = matrix[indices, :]

    # Save the Space object; the usage pattern guarantees exactly one of
    # the two formats is requested
    if is_w2v:
        new_space = Space(DenseMatrix(new_matrix), targets, id2column)
        save_pkl_files(new_space, outPath, save_in_one_file=True,
                       save_as_w2v=True)
    elif is_sps:
        new_space = Space(SparseMatrix(new_matrix), targets, id2column)
        save_pkl_files(new_space, outPath, save_in_one_file=True,
                       save_as_w2v=False)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
def main():
    """
    Convert txt matrix to w2v matrix and save.

    Reads ``<spacePrefix>.txt``, a space-delimited text file whose first
    column holds the target labels and whose remaining columns hold the
    vector values, and saves it to ``<outPath>`` in w2v format via
    ``save_pkl_files``.
    """

    # Get the arguments
    args = docopt('''Convert txt matrix to w2v matrix and save.

    Usage:
        convert_matrix_txt2w2v.py <spacePrefix> <outPath>

        <spacePrefix> = path to npz without suffix
        <outPath> = output path for space

    ''')

    spacePrefix = args['<spacePrefix>']
    outPath = args['<outPath>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Every cell is read as a Python object so that labels and numbers can
    # share one array. comments=None is the documented way to tell loadtxt
    # that no character starts a comment (labels may contain '#').
    space_array = np.loadtxt(spacePrefix + '.txt', dtype=object,
                             delimiter=' ', skiprows=0, comments=None,
                             encoding='utf-8')

    targets = space_array[:, 0].flatten()
    # The `np.float` alias was deprecated and later removed from NumPy;
    # the builtin `float` yields the same float64 dtype
    values = space_array[:, 1:].astype(float)

    # Create new space (no column labels are available in the txt format)
    denseSpace = Space(DenseMatrix(coo_matrix(values)), list(targets), [])

    # Save the Space object in w2v format
    save_pkl_files(denseSpace, outPath, save_in_one_file=True,
                   save_as_w2v=True)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
def main():
    """
    Align two sparse matrices by intersecting their columns.

    Loads the two pickled spaces, keeps only the columns whose labels occur
    in both, optionally L2-normalizes the rows (``-l``) and saves the two
    reduced spaces to ``<outPath1> + '.sm'`` and ``<outPath2> + '.sm'``.
    The shared columns are put in sorted label order so that the layout of
    the output does not depend on set iteration order.

    Exits via ``sys.exit`` if the two reduced spaces end up with different
    column labels.
    """

    # Get the arguments
    args = docopt('''Align two sparse matrices by intersecting their columns.

    Usage:
        count_alignment_intersect.py [-l] <outPath1> <outPath2> <spacePrefix1> <spacePrefix2>

        <outPath1> = output path for aligned space 1
        <outPath2> = output path for aligned space 2
        <spacePrefix1> = path to pickled space1 without suffix
        <spacePrefix2> = path to pickled space2 without suffix

    Options:
        -l, --len   normalize final vectors to unit length

    ''')

    is_len = args['--len']
    spacePrefix1 = args['<spacePrefix1>']
    spacePrefix2 = args['<spacePrefix2>']
    outPath1 = args['<outPath1>']
    outPath2 = args['<outPath2>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load the spaces and their label mappings
    space1 = load_pkl_files(spacePrefix1)
    space2 = load_pkl_files(spacePrefix2)
    id2row1 = space1.get_id2row()
    id2row2 = space2.get_id2row()
    id2column1 = space1.get_id2column()
    id2column2 = space2.get_id2column()
    column2id1 = space1.get_column2id()
    column2id2 = space2.get_column2id()

    # Shared columns, sorted for a reproducible column order
    intersected_columns = sorted(set(id2column1).intersection(id2column2))
    intersected_columns_id1 = [column2id1[item]
                               for item in intersected_columns]
    intersected_columns_id2 = [column2id2[item]
                               for item in intersected_columns]

    # Restrict both matrices to the shared columns (same order in both)
    reduced_matrix1 = space1.get_cooccurrence_matrix(
    )[:, intersected_columns_id1].get_mat()
    reduced_matrix2 = space2.get_cooccurrence_matrix(
    )[:, intersected_columns_id2].get_mat()

    if is_len:
        # L2-normalize vectors
        l2norm1 = linalg.norm(reduced_matrix1, axis=1, ord=2)
        l2norm2 = linalg.norm(reduced_matrix2, axis=1, ord=2)
        # Zero rows keep norm 1 so the division leaves them untouched
        l2norm1[l2norm1 == 0.0] = 1.0
        l2norm2[l2norm2 == 0.0] = 1.0
        reduced_matrix1 /= l2norm1.reshape(len(l2norm1), 1)
        reduced_matrix2 /= l2norm2.reshape(len(l2norm2), 1)

    reduced_space1 = Space(SparseMatrix(reduced_matrix1), id2row1,
                           intersected_columns)
    reduced_space2 = Space(SparseMatrix(reduced_matrix2), id2row2,
                           intersected_columns)

    if reduced_space1.get_id2column() != reduced_space2.get_id2column():
        sys.exit('Two spaces not properly aligned!')

    # Save the Space objects in pickle format
    save_pkl_files(reduced_space1, outPath1 + '.sm', save_in_one_file=True)
    save_pkl_files(reduced_space2, outPath2 + '.sm', save_in_one_file=True)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
def main():
    """
    Build the smoothed and shifted (P)PMI matrix of a co-occurrence space.

    Context smoothing follows
       Omer Levy, Yoav Goldberg, and Ido Dagan. 2015. Improving distributional similarity with lessons learned from word embeddings. Trans. ACL, 3.

    The space is loaded from ``<dsm_prefix>``, each cell is rescaled by the
    inverted row sum and the inverted smoothed column probability,
    log-transformed and shifted by ``log(k)``; cells that are not strictly
    positive are dropped. With ``-l`` the rows are L2-normalized. The
    result is saved under ``<outPath> + ".ppmi.sm"``.
    """

    # Parse the command line
    args = docopt('''Compute the smoothed and shifted (P)PMI matrix from a co-occurrence matrix and save it in pickle format.

    Usage:
        ppmi.py [-l] <dsm_prefix> <k> <alpha> <outPath>

        <dsm_prefix> = the prefix for the input files (.sm for the matrix, .rows and .cols) and output files (.ppmi)
        <k> = shifting parameter
        <alpha> = smoothing parameter
        <outPath> = output path for space

    Options:
        -l, --len   normalize final vectors to unit length

    ''')

    normalize = args['--len']
    prefix = args['<dsm_prefix>']
    shift = int(args['<k>'])
    smoothing = float(args['<alpha>'])
    out_path = args['<outPath>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load the input space and remember its labels
    dsm = load_pkl_files(prefix)
    row_labels = dsm.get_id2row()
    column_labels = dsm.get_id2column()

    # Marginal sums of the (positive) co-occurrence counts
    counts = dsm.cooccurrence_matrix
    counts.assert_positive()
    row_totals = counts.sum(axis=1)
    column_totals = counts.sum(axis=0)

    # Smoothed context distribution: column sums raised to alpha, renormalized
    smoothed_totals = np.power(column_totals, smoothing)
    column_probs = smoothed_totals / smoothed_totals.sum()

    # Invert the non-zero marginals so that scaling acts as a division
    inverse_rows = nonzero_invert(row_totals)
    inverse_columns = nonzero_invert(column_probs)

    # EPMI weighting (no logarithm yet): scale rows, then columns
    weighted = counts.scale_rows(inverse_rows).scale_columns(inverse_columns)

    # Logarithm of the stored values
    weighted.mat.data = np.log(weighted.mat.data)

    # Shift by log(k)
    weighted.mat.data -= np.log(shift)

    # Keep positive values only and drop the explicit zeros
    weighted.mat.data[weighted.mat.data <= 0] = 0.0
    weighted.mat.eliminate_zeros()

    result = weighted.get_mat()

    if normalize:
        # L2-normalize the rows; zero rows get norm 1 and stay unchanged
        row_norms = linalg.norm(result, axis=1, ord=2)
        row_norms[row_norms == 0.0] = 1.0
        result /= row_norms.reshape(len(row_norms), 1)

    ppmi_space = Space(SparseMatrix(result), row_labels, column_labels)

    # Save the Space object in pickle format
    save_pkl_files(ppmi_space, out_path + ".ppmi.sm", save_in_one_file=False)

    logging.info("--- %s seconds ---", time.time() - start_time)
def main():
    """
    Build a count-based vector space from a corpus.

    The corpus under ``<corpDir>`` is read twice through
    ``PathLineSentences_mod``: once to collect the vocabulary and once to
    count, for every word, the words occurring within ``<windowSize>``
    positions on either side in the same sentence. With ``-l`` the rows
    are L2-normalized. The resulting space is saved to ``<outPath>``.
    """

    # Parse the command line
    args = docopt("""Make count-based vector space from corpus.

    Usage:
        count.py [-l] <windowSize> <corpDir> <outPath> <lowerBound> <upperBound>

    Arguments:

        <corpDir> = path to corpus directory with zipped files, each sentence in form 'year\tword1 word2 word3...'
        <outPath> = output path for vectors
        <windowSize> = the linear distance of context words to consider in each direction
        <lowerBound> = lower bound for time period
        <upperBound> = upper bound for time period

    Options:
        -l, --len   normalize final vectors to unit length

    """)

    normalize = args['--len']
    corpus_dir = args['<corpDir>']
    out_path = args['<outPath>']
    window_size = int(args['<windowSize>'])
    lower = int(args['<lowerBound>'])
    upper = int(args['<upperBound>'])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # First pass: collect the vocabulary
    logging.info("Building vocabulary")
    vocab_corpus = PathLineSentences_mod(corpus_dir, lowerBound=lower,
                                         upperBound=upper)
    seen = set()
    for sentence in vocab_corpus:
        # One-word sentences are skipped to avoid zero-vectors
        if len(sentence) > 1:
            seen.update(sentence)
    vocabulary = list(seen)
    word_index = {word: position for position, word in enumerate(vocabulary)}

    # Co-occurrence counts keyed by (target index, context index)
    pair_counts = defaultdict(int)

    # Second pass: count context words
    corpus = PathLineSentences_mod(corpus_dir, lowerBound=lower,
                                   upperBound=upper)
    logging.info("Counting context words")
    for sentence in corpus:
        for position, word in enumerate(sentence):
            left = max(position - window_size, 0)
            right = min(position + window_size, len(sentence))
            context = (sentence[left:position]
                       + sentence[position + 1:right + 1])
            # An empty window means a one-word sentence: nothing to count
            if context:
                target_id = word_index[word]
                for neighbour in context:
                    pair_counts[(target_id, word_index[neighbour])] += 1

    # Convert dictionary to sparse matrix
    logging.info("Converting dictionary to matrix")
    size = len(vocabulary)
    cooc_sparse = dok_matrix((size, size), dtype=float)
    try:
        cooc_sparse.update(pair_counts)
    except NotImplementedError:
        # Fall back to the private updater when the public one is not
        # implemented
        cooc_sparse._update(pair_counts)

    if normalize:
        # L2-normalize the rows; zero rows get norm 1 and stay unchanged
        row_norms = linalg.norm(cooc_sparse, axis=1, ord=2)
        row_norms[row_norms == 0.0] = 1.0
        cooc_sparse /= row_norms.reshape(len(row_norms), 1)

    # Rows and columns share the same utf-8 encoded labels
    vocabulary = [entry.encode('utf-8') for entry in vocabulary]
    count_space = Space(SparseMatrix(cooc_sparse), vocabulary, vocabulary)

    # Save the Space object in pickle format
    save_pkl_files(count_space, out_path, save_in_one_file=False)

    logging.info("Corpus has size %d" % corpus.corpusSize)
    logging.info("--- %s seconds ---", time.time() - start_time)
def main():
    """
    Reduce a (normally PPMI) matrix with truncated SVD, following
       Omer Levy, Yoav Goldberg, and Ido Dagan. 2015. Improving distributional similarity with lessons learned from word embeddings. Trans. ACL, 3.

    The space at ``<dsm_prefix>`` is decomposed with ``randomized_svd``
    into ``<dim>`` components. The left singular vectors are weighted by
    the singular values raised to ``<gamma>`` and, with ``-l``,
    L2-normalized row-wise. The dense result is saved in w2v format under
    ``<outPath> + ".svd.dm"``.
    """

    # Parse the command line
    args = docopt('''Perform dimensionality reduction on a (normally PPMI) matrix by applying truncated SVD and save it in pickle format.

    Usage:
        svd.py [-l] <dsm_prefix> <dim> <gamma> <outPath>

        <dsm_prefix> = the prefix for the input files (.sm for the matrix, .rows and .cols) and output files (.svd)
        <dim> = dimensionality of low-dimensional output vectors
        <gamma> = eigenvalue weighting parameter
        <outPath> = output path for space

    Options:
        -l, --len   normalize final vectors to unit length

    ''')

    normalize = args['--len']
    prefix = args['<dsm_prefix>']
    n_dims = int(args['<dim>'])
    weight_exp = float(args['<gamma>'])
    out_path = args['<outPath>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load the input space; only the row labels are carried over
    dsm = load_pkl_files(prefix)
    row_labels = dsm.get_id2row()
    source = dsm.get_cooccurrence_matrix()

    # Truncated SVD
    left, singular, _ = randomized_svd(source.get_mat(), n_components=n_dims,
                                       n_iter=5, transpose=False)

    # Weight the left singular vectors. `singular` is the flattened
    # diagonal, so broadcasting it over the columns of `left` equals
    # multiplying by the (powered) diagonal matrix.
    if weight_exp == 0.0:
        reduced = left
    elif weight_exp == 1.0:
        reduced = singular * left
    else:
        reduced = np.power(singular, weight_exp) * left

    if normalize:
        # L2-normalize the rows; zero rows get norm 1 and stay unchanged
        row_norms = np.linalg.norm(reduced, axis=1, ord=2)
        row_norms[row_norms == 0.0] = 1.0
        reduced /= row_norms.reshape(len(row_norms), 1)

    svd_space = Space(DenseMatrix(reduced), row_labels, [])

    # Save the Space object in w2v format
    save_pkl_files(svd_space, out_path + ".svd.dm", save_in_one_file=True,
                   save_as_w2v=True)

    logging.info("--- %s seconds ---", time.time() - start_time)
def main():
    """
    Create two aligned low-dimensional vector spaces by sparse random indexing from two co-occurrence matrices as described in:
       Pierpaolo Basile, Annalina Caputo and Giovanni Semeraro, 2014. Analysing Word Meaning over Time by Exploiting Temporal Random Indexing.

    Both input spaces are projected with the same set of random context
    ("elemental") vectors, one per column label in the union of both
    spaces. A target vector is the sum of the elemental vectors of its
    non-zero contexts, weighted by the co-occurrence values. If ``<t>`` is
    not ``None``, the weights of contexts whose relative frequency exceeds
    ``t`` are scaled by ``sqrt(t / f)``. With ``-l`` the rows are
    L2-normalized. The two aligned spaces and the elemental space are saved
    under ``<outPath1>``, ``<outPath2>`` and ``<outPathElement>`` with the
    suffix ``.dm``.
    """

    # Get the arguments
    args = docopt('''Create two aligned low-dimensional vector spaces by sparse random indexing from two co-occurrence matrices.

    Usage:
        srv_align.py [-l] (-s <seeds> | -a) <dim> <t> <outPath1> <outPath2> <outPathElement> <spacePrefix1> <spacePrefix2>

        <samplesize> = number negative samples, expressed as percentage of positive samples
        <negAlpha> = smoothing parameter for negative sampling
        <seeds> = number of non-zero values in each random vector
        <dim> = number of dimensions for random vectors
        <t> = threshold for downsampling (if t=None, no subsampling is applied)
        <outPath1> = output path for aligned space 1
        <outPath2> = output path for aligned space 2
        <spacePrefix1> = path to pickled space without suffix
        <spacePrefix2> = path to pickled space without suffix
        <outPathElement> = output path for elemental space (context vectors)

    Options:
        -l, --len   normalize final vectors to unit length
        -s, --see   specify number of seeds manually
        -a, --aut   calculate number of seeds automatically as proposed in [1,2]

    References:
        [1] Ping Li, T. Hastie and K. W. Church, 2006,
           "Very Sparse Random Projections".
           http://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf
        [2] D. Achlioptas, 2001, "Database-friendly random projections",
           http://www.cs.ucsc.edu/~optas/papers/jl.pdf

    ''')

    is_len = args['--len']
    is_seeds = args['--see']
    if is_seeds:
        seeds = int(args['<seeds>'])
    is_aut = args['--aut']
    dim = int(args['<dim>'])
    if args['<t>'] == 'None':
        t = None
    else:
        t = float(args['<t>'])
    outPath1 = args['<outPath1>']
    outPath2 = args['<outPath2>']
    outPathElement = args['<outPathElement>']
    spacePrefix1 = args['<spacePrefix1>']
    spacePrefix2 = args['<spacePrefix2>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load input spaces
    space1 = load_pkl_files(spacePrefix1)
    space2 = load_pkl_files(spacePrefix2)
    matrix1 = csc_matrix(space1.get_cooccurrence_matrix().get_mat())
    matrix2 = csc_matrix(space2.get_cooccurrence_matrix().get_mat())

    # Get mappings between rows/columns and words
    id2row1 = space1.get_id2row()
    id2row2 = space2.get_id2row()
    id2column1 = space1.get_id2column()
    id2column2 = space2.get_id2column()

    # Get union of rows and columns in both spaces
    unified_rows = sorted(set(id2row1).union(id2row2))
    unified_columns = sorted(set(id2column1).union(id2column2))
    columns_diff1 = list(set(unified_columns) - set(id2column1))
    columns_diff2 = list(set(unified_columns) - set(id2column2))

    # Get mappings of indices of columns in original spaces to indices of
    # columns in unified space
    c2i = {w: i for i, w in enumerate(unified_columns)}
    cj2i1 = {j: c2i[w] for j, w in enumerate(id2column1 + columns_diff1)}
    cj2i2 = {j: c2i[w] for j, w in enumerate(id2column2 + columns_diff2)}

    if t is not None:
        # The context frequencies needed for subsampling are computed over
        # the sum of both spaces, so both matrices are first brought to the
        # same (unified) shape and ordering.
        rows_diff1 = list(set(unified_rows) - set(id2row1))
        rows_diff2 = list(set(unified_rows) - set(id2row2))

        r2i = {w: i for i, w in enumerate(unified_rows)}
        rj2i1 = {j: r2i[w] for j, w in enumerate(id2row1 + rows_diff1)}
        rj2i2 = {j: r2i[w] for j, w in enumerate(id2row2 + rows_diff2)}

        # Build spaces with unified COLUMNS: first concatenate matrix and
        # empty columns for the additional context words, then order the
        # columns according to unified_columns
        new_columns1 = csc_matrix((len(id2row1), len(columns_diff1)))
        unified_matrix1 = hstack(
            (matrix1, new_columns1))[:, sorted(cj2i1, key=cj2i1.get)]

        new_columns2 = csc_matrix((len(id2row2), len(columns_diff2)))
        unified_matrix2 = hstack(
            (matrix2, new_columns2))[:, sorted(cj2i2, key=cj2i2.get)]

        # Build spaces with unified ROWS
        new_rows1 = csc_matrix((len(rows_diff1), len(unified_columns)))
        final_unified_matrix1 = csc_matrix(
            vstack((unified_matrix1, new_rows1)))[sorted(rj2i1,
                                                         key=rj2i1.get)]

        new_rows2 = csc_matrix((len(rows_diff2), len(unified_columns)))
        final_unified_matrix2 = csc_matrix(
            vstack((unified_matrix2, new_rows2)))[sorted(rj2i2,
                                                         key=rj2i2.get)]

        # Add up final unified matrices
        common_unified_matrix = np.add(final_unified_matrix1,
                                       final_unified_matrix2)

        # Get number of total occurrences of any word
        totalOcc = np.sum(common_unified_matrix)

        # Define function for downsampling
        def _downsample(f):
            return np.sqrt(float(t) / f) if f > t else 1.0

        # otypes is given explicitly because np.vectorize cannot infer the
        # output type from an empty input (a row without non-zero cells)
        downsample = np.vectorize(_downsample, otypes=[float])

        # Get total normalized co-occurrence frequency of all contexts in
        # both spaces
        context_freqs = np.array(
            common_unified_matrix.sum(axis=0) / totalOcc)[0]

    # Generate ternary random vectors
    if is_seeds:
        elementalMatrix = lil_matrix((len(unified_columns), dim))
        # Generate base vector for random vectors
        # Note: Make sure that number of seeds is not greater than dimensions
        baseVector = np.zeros(dim)
        # Floor division keeps the bounds integral on Python 3 as well
        half = seeds // 2
        for i in range(0, half):
            baseVector[i] = 1.0
        for i in range(half, seeds):
            baseVector[i] = -1.0
        # To-do: make this more efficient by generating random indices for
        # a whole array
        for i in range(len(unified_columns)):
            np.random.shuffle(baseVector)
            elementalMatrix[i] = baseVector
    elif is_aut:
        elementalMatrix = sparse_random_matrix(dim, len(unified_columns)).T

    # Initialize target vectors
    alignedMatrix1 = np.zeros((len(id2row1), dim))
    alignedMatrix2 = np.zeros((len(id2row2), dim))

    # Iterate over rows of space, find context words and update aligned
    # matrix with low-dimensional random vectors of these context words
    for (space, id2row, cj2i, alignedMatrix) in [
            (space1, id2row1, cj2i1, alignedMatrix1),
            (space2, id2row2, cj2i2, alignedMatrix2)]:
        # Iterate over targets
        for i, target in enumerate(id2row):
            # Get co-occurrence values as matrix
            m = space.get_row(target).get_mat()
            # Map the non-zero column indices to unified column indices
            nonzeros = [cj2i[j] for j in m.nonzero()[1]]
            data = m.data
            pos_context_vectors = elementalMatrix[nonzeros]
            if t is not None:
                # Apply subsampling. A new array is created instead of
                # scaling in place so the row matrix returned by the space
                # is left untouched.
                rfs = downsample(context_freqs[nonzeros])
                data = data * rfs
            # Weight context vectors by occurrence frequency
            pos_context_vectors = pos_context_vectors.multiply(
                data.reshape(-1, 1))
            # Add up context vectors and store as row for target
            alignedMatrix[i] = np.sum(pos_context_vectors, axis=0)

    if is_len:
        # L2-normalize vectors
        l2norm1 = np.linalg.norm(alignedMatrix1, axis=1, ord=2)
        l2norm2 = np.linalg.norm(alignedMatrix2, axis=1, ord=2)
        # Zero rows keep norm 1 so the division leaves them untouched
        l2norm1[l2norm1 == 0.0] = 1.0
        l2norm2[l2norm2 == 0.0] = 1.0
        alignedMatrix1 /= l2norm1.reshape(len(l2norm1), 1)
        alignedMatrix2 /= l2norm2.reshape(len(l2norm2), 1)

    # Make spaces
    alignedSpace1 = Space(DenseMatrix(alignedMatrix1), id2row1, [])
    alignedSpace2 = Space(DenseMatrix(alignedMatrix2), id2row2, [])
    elementalSpace = Space(SparseMatrix(elementalMatrix), unified_columns, [])

    # Save the Space objects in pickle format
    save_pkl_files(alignedSpace1, outPath1 + '.dm', save_in_one_file=False)
    save_pkl_files(alignedSpace2, outPath2 + '.dm', save_in_one_file=False)
    save_pkl_files(elementalSpace, outPathElement + '.dm',
                   save_in_one_file=False)

    logging.info("--- %s seconds ---" % (time.time() - start_time))