Esempio n. 1
0
def main():
    """
    Convert temporal referencing matrix to regular (binned) matrix.

    Rows are kept when their label either contains no underscore or has the
    form ``<word>_<ref>``; the part before the underscore becomes the new row
    label. The selected rows are saved either in w2v format or as a sparse
    matrix, depending on the command-line flag.
    """

    # Parse the command line
    args = docopt(
        """Convert temporal referencing matrix to regular (binned) matrix.

    Usage:
        tr2bin.py (-w | -s) <spacePrefix> <ref> <outPath>

        <spacePrefix> = path to pickled space without suffix
        <ref> = reference string
        <outPath> = output path for result file

    Options:
        -w, --w2v   save in w2v format
        -s, --sps   save in sparse matrix format
        
    """)

    spacePrefix = args['<spacePrefix>']
    ref = args['<ref>']
    outPath = args['<outPath>']
    is_w2v = args['--w2v']
    is_sps = args['--sps']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load the input space and its row/column labels
    space = load_pkl_files(spacePrefix)
    matrix = space.get_cooccurrence_matrix().get_mat()
    id2row = space.get_id2row()
    id2column = space.get_id2column()

    # Collect (plain target, row index) pairs for untagged rows and for rows
    # tagged with the requested reference string
    selected = []
    for row_index, label in enumerate(id2row):
        parts = label.split('_')
        untagged = len(parts) == 1
        tagged_with_ref = len(parts) == 2 and parts[1] == ref
        if untagged or tagged_with_ref:
            selected.append((parts[0], row_index))
    targets, indices = zip(*selected)

    # Keep only the selected rows, all columns
    new_matrix = matrix[list(indices), :]

    # Save the Space object in the requested format
    if is_w2v:
        dense_space = Space(DenseMatrix(new_matrix), list(targets), id2column)
        save_pkl_files(dense_space,
                       outPath,
                       save_in_one_file=True,
                       save_as_w2v=True)
    if is_sps:
        sparse_space = Space(SparseMatrix(new_matrix), list(targets),
                             id2column)
        save_pkl_files(sparse_space,
                       outPath,
                       save_in_one_file=True,
                       save_as_w2v=False)

    elapsed = time.time() - start_time
    logging.info("--- %s seconds ---" % elapsed)
Esempio n. 2
0
def main():
    """
    Align two sparse matrices by intersecting their columns.

    Both spaces are reduced to the columns they have in common (in one shared
    order), optionally L2-normalized row-wise, and saved as two new spaces.
    """

    # Parse the command line
    args = docopt('''Align two sparse matrices by intersecting their columns.

    Usage:
        count_alignment_intersect.py [-l] <outPath1> <outPath2> <spacePrefix1> <spacePrefix2>

        <outPath1> = output path for aligned space 1
        <outPath2> = output path for aligned space 2
        <spacePrefix1> = path to pickled space1 without suffix
        <spacePrefix2> = path to pickled space2 without suffix

    Options:
        -l, --len   normalize final vectors to unit length
    
    ''')

    outPath1 = args['<outPath1>']
    outPath2 = args['<outPath2>']
    spacePrefix1 = args['<spacePrefix1>']
    spacePrefix2 = args['<spacePrefix2>']
    is_len = args['--len']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load both spaces and their label mappings
    space1 = load_pkl_files(spacePrefix1)
    space2 = load_pkl_files(spacePrefix2)
    id2row1 = space1.get_id2row()
    id2row2 = space2.get_id2row()
    id2column1 = space1.get_id2column()
    id2column2 = space2.get_id2column()
    column2id1 = space1.get_column2id()
    column2id2 = space2.get_column2id()

    # Columns present in both spaces; the same list (and therefore the same
    # order) is used for both reduced matrices
    intersected_columns = list(set(id2column1).intersection(id2column2))
    column_ids1 = []
    column_ids2 = []
    for column in intersected_columns:
        column_ids1.append(column2id1[column])
        column_ids2.append(column2id2[column])

    cooccurrence1 = space1.get_cooccurrence_matrix()
    reduced_matrix1 = cooccurrence1[:, column_ids1].get_mat()
    cooccurrence2 = space2.get_cooccurrence_matrix()
    reduced_matrix2 = cooccurrence2[:, column_ids2].get_mat()

    if is_len:
        # L2-normalize vectors; zero norms are replaced by 1 so that all-zero
        # rows are left unchanged instead of being divided by zero
        norms1 = linalg.norm(reduced_matrix1, axis=1, ord=2)
        norms1[norms1 == 0.0] = 1.0
        reduced_matrix1 /= norms1.reshape(len(norms1), 1)

        norms2 = linalg.norm(reduced_matrix2, axis=1, ord=2)
        norms2[norms2 == 0.0] = 1.0
        reduced_matrix2 /= norms2.reshape(len(norms2), 1)

    reduced_space1 = Space(SparseMatrix(reduced_matrix1), id2row1,
                           intersected_columns)
    reduced_space2 = Space(SparseMatrix(reduced_matrix2), id2row2,
                           intersected_columns)

    # Sanity check: both spaces must expose identical column labels
    columns_match = (reduced_space1.get_id2column() ==
                     reduced_space2.get_id2column())
    if not columns_match:
        sys.exit('Two spaces not properly aligned!')

    # Save the Space objects in pickle format
    for reduced_space, out_prefix in ((reduced_space1, outPath1),
                                      (reduced_space2, outPath2)):
        save_pkl_files(reduced_space, out_prefix + '.sm',
                       save_in_one_file=True)

    elapsed = time.time() - start_time
    logging.info("--- %s seconds ---" % elapsed)
Esempio n. 3
0
def main():
    """
    Compute k nearest neighbors for targets.

    Targets are read from one column of a tab-separated test set if given,
    otherwise every row of the space is used. For each target the neighbours
    returned by the space are written to ``<outPath>.csv`` together with their
    cosine distance (1 - cosine similarity). Targets missing from the space
    are written with a single ('nan', nan) entry.
    """

    # Get the arguments
    args = docopt("""Compute  k nearest neighbors for targets.

    Usage:
        knn.py <spacePrefix1> <k> <outPath> [<testset> <co>]

        <spacePrefix1> = path to pickled space without suffix
        <testset> = path to file with tab-separated word pairs
        <co> = column index for targets
        <k> = parameter k (k nearest neighbors)
        <outPath> = output path for result file

    Note:
        ...
        
    """)

    spacePrefix1 = args['<spacePrefix1>']
    testset = args['<testset>']
    outPath = args['<outPath>']
    k = int(args['<k>'])
    # <testset> and <co> are optional, and docopt reports absent positional
    # arguments as None. Converting unconditionally made the script crash with
    # a TypeError whenever no test set was given. Fall back to the first
    # column when <co> is omitted.
    co = int(args['<co>']) if args['<co>'] is not None else 0

    logging.config.dictConfig({'version': 1, 'disable_existing_loggers': True,})
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load spaces
    space1 = load_pkl_files(spacePrefix1)

    if testset is not None:
        with codecs.open(testset, 'r', 'utf8') as f_in:
            targets = [line.strip().split('\t')[co] for line in f_in]
    else:
        # If no test set is provided, compute values for all targets in the space
        targets = [target.decode('utf8') for target in space1.get_row2id()]

    target2neighbors = {}
    for t1 in targets:
        try:
            neighbors1 = space1.get_neighbours(t1.encode('utf8'), k, CosSimilarity())
            # Drop the first entry, presumably the target itself.
            # NOTE(review): this leaves k-1 neighbours in the output - confirm
            # that this is intended.
            del neighbors1[0]
        except KeyError:
            # Target is not in the space
            neighbors1 = [('nan', float('nan'))]

        target2neighbors[t1] = neighbors1

    with codecs.open(outPath + '.csv', 'w', 'utf-8') as f_out:
        for t1 in targets:
            # Convert cosine similarity to cosine distance, export nearest neighbors
            print >> f_out, t1+'\t'+' '.join([str((n,1-v)) for (n,v) in target2neighbors[t1]])

    logging.info("--- %s seconds ---" % (time.time() - start_time))
Esempio n. 4
0
def main():
    """
    Compute local neighborhood distance for target pairs from two vector spaces.

    For each pair (t1, t2) the k nearest neighbours of t1 in space1 and of t2
    in space2 are collected. The neighbours that exist in both spaces (and are
    not one of the targets) form a shared list; the cosine distance between
    the two similarity vectors over that list is the score. Pairs with a
    target missing from its space get 'nan'.
    """

    # Parse the command line
    args = docopt(
        """Compute local neighborhood distance for target pairs from two vector spaces.

    Usage:
        lnd.py [(-f | -s)] <spacePrefix1> <spacePrefix2> <k> <outPath> [<testset>]

        <spacePrefix1> = path to pickled space without suffix
        <spacePrefix2> = path to pickled space without suffix
        <testset> = path to file with tab-separated word pairs
        <k> = parameter k (k nearest neighbors)
        <outPath> = output path for result file

    Options:
        -f, --fst   write only first target in output file
        -s, --scd   write only second target in output file
        
    """)

    spacePrefix1 = args['<spacePrefix1>']
    spacePrefix2 = args['<spacePrefix2>']
    testset = args['<testset>']
    outPath = args['<outPath>']
    k = int(args['<k>'])
    is_fst = args['--fst']
    is_scd = args['--scd']

    logging.config.dictConfig({
        'version': 1,
        'disable_existing_loggers': True,
    })
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load spaces
    space1 = load_pkl_files(spacePrefix1)
    space2 = load_pkl_files(spacePrefix2)

    if testset is not None:
        # First column is looked up in space1, second column in space2
        targets = []
        with codecs.open(testset, 'r', 'utf8') as f_in:
            for line in f_in:
                fields = line.strip().split('\t')
                targets.append((fields[0], fields[1]))
    else:
        # Without a test set, pair every target found in both spaces with itself
        rows1 = set([target.decode('utf8') for target in space1.get_row2id()])
        rows2 = [target.decode('utf8') for target in space2.get_row2id()]
        target_intersection = rows1.intersection(rows2)
        targets = zip(target_intersection, target_intersection)

    scores = {}
    neighborUnionSizes = {}
    for t1, t2 in targets:
        pair = (t1, t2)
        key1 = t1.encode('utf8')
        key2 = t2.encode('utf8')

        # Get nearest neighbors; a missing target yields 'nan' for the pair
        try:
            neighbors1 = space1.get_neighbours(key1, k, CosSimilarity())
            neighbors2 = space2.get_neighbours(key2, k, CosSimilarity())
        except KeyError:
            scores[pair] = 'nan'
            neighborUnionSizes[pair] = 'nan'
            continue

        # Keep neighbours known to both spaces, excluding the targets
        excluded = [key1, key2]
        candidates = []
        for name, _ in neighbors1 + neighbors2:
            if name not in space1.row2id:
                continue
            if name not in space2.row2id:
                continue
            if name in excluded:
                continue
            candidates.append(name)
        neighborUnion = list(set(candidates))

        # Similarity of each target to every shared neighbour, per space
        simVec1 = []
        for neighbor in neighborUnion:
            simVec1.append(space1.get_sim(key1, neighbor, CosSimilarity()))
        simVec2 = []
        for neighbor in neighborUnion:
            simVec2.append(space2.get_sim(key2, neighbor, CosSimilarity()))

        # Compute cosine distance of the two similarity vectors
        scores[pair] = spatial.distance.cosine(simVec1, simVec2)
        neighborUnionSizes[pair] = len(neighborUnion)

    with codecs.open(outPath + '.csv', 'w', 'utf-8') as f_out:
        for (t1, t2) in targets:
            if is_fst:  # output only first target string
                label = t1
            elif is_scd:  # output only second target string
                label = t2
            else:  # standard outputs both target strings
                label = '%s,%s' % (t1, t2)
            score_text = str(float(scores[(t1, t2)]))
            size_text = str(neighborUnionSizes[(t1, t2)])
            print >> f_out, '\t'.join((label, score_text, size_text))

    elapsed = time.time() - start_time
    logging.info("--- %s seconds ---" % elapsed)
def main():
    """
    Compute cosine distance for target pairs from two vector spaces.

    The row of the first target is taken from space1 and the row of the second
    target from space2. With ``--out`` a target missing from its space yields
    'nan' for the pair; without it the KeyError propagates.
    """

    # Parse the command line
    args = docopt(
        """Compute cosine distance for target pairs from two vector spaces.

    Usage:
        displacement.py [-b] [-o] <spacePrefix1> <spacePrefix2> <outPath> [<testset>]

        <spacePrefix1> = path to pickled space without suffix
        <spacePrefix2> = path to pickled space without suffix
        <testset> = path to file with tab-separated word pairs
        <outPath> = output path for result file

    Options:
        -b, --bot   output both targets in one column
        -o, --out   add nan to output if target is not in vocabulary

     Note:
        Important: spaces must be already aligned (columns in same order)! Default outputs only second target in case of target mismatch; you may want to change this for different purposes.
        
    """)

    spacePrefix1 = args['<spacePrefix1>']
    spacePrefix2 = args['<spacePrefix2>']
    testset = args['<testset>']
    outPath = args['<outPath>']
    is_both = args['--bot']
    is_out = args['--out']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load spaces
    space1 = load_pkl_files(spacePrefix1)
    space2 = load_pkl_files(spacePrefix2)

    if testset is not None:
        # First column is looked up in space1, second column in space2
        targets = []
        with codecs.open(testset, 'r', 'utf-8') as f_in:
            for line in f_in:
                fields = line.strip().split('\t')
                targets.append((fields[0], fields[1]))
    else:
        # Without a test set, pair every target found in both spaces with itself
        rows1 = set([target.decode('utf-8') for target in space1.get_row2id()])
        rows2 = [target.decode('utf-8') for target in space2.get_row2id()]
        target_intersection = rows1.intersection(rows2)
        targets = zip(target_intersection, target_intersection)

    scores = {}
    # Iterate over target pairs
    for t1, t2 in targets:
        pair = (t1, t2)
        vectors = []
        missing = False

        # Fetch and convert the row for each side in turn
        for space, word in ((space1, t1), (space2, t2)):
            try:
                row = space.get_row(word.encode('utf8'))
            except KeyError:
                if not is_out:
                    # Unknown targets are only tolerated with --out
                    raise
                missing = True
                break

            # Assume it is scipy sparse matrix, if not, assume numpy matrix
            try:
                vectors.append(row.get_mat().toarray()[0].tolist())
            except AttributeError:
                vectors.append(row.get_mat().tolist()[0])

        if missing:
            scores[pair] = 'nan'
            continue

        # Compute cosine distance of vectors
        scores[pair] = spatial.distance.cosine(vectors[0], vectors[1])

    with codecs.open(outPath + '.csv', 'w', 'utf-8') as f_out:
        for (t1, t2) in targets:
            if is_both:
                label = '%s,%s' % (t1, t2)
            elif t1 == t2:
                label = t1
            else:
                # On mismatch only the second target is written
                label = t2
            print >> f_out, '\t'.join((label, str(float(scores[(t1, t2)]))))

    elapsed = time.time() - start_time
    logging.info("--- %s seconds ---" % elapsed)
Esempio n. 6
0
def main():
    """
    Compute entropy for rows of targets from vector space.

    Targets come from the first column of the test set if given, otherwise
    every row of the space is used. For each target the base-2 entropy of the
    row's stored (non-zero) values is written to ``<outPath>.csv``. With
    ``--nrm`` the entropy is divided by log2 of the number of non-zero entries
    in the row. Targets missing from the space are written as nan, as are
    normalized values whose divisor is zero.
    """

    # Get the arguments
    args = docopt("""Compute entropy for rows of targets from vector space.

    Usage:
        entropy.py [-n] <spacePrefix> <outPath> [<testset>]

        <spacePrefix> = path to pickled space without suffix
        <outPath> = output path for result file
        <testset> = path to file with targets in first column
        
    Options:
        -n, --nrm  normalize values by log of number of types

    """)

    is_norm = args['--nrm']
    spacePrefix = args['<spacePrefix>']
    outPath = args['<outPath>']
    testset = args['<testset>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    space = load_pkl_files(spacePrefix)

    if testset is not None:
        # Targets are taken from the first column of the test set
        with codecs.open(testset, 'r', 'utf-8') as f_in:
            targets = [line.strip().split('\t')[0] for line in f_in]
    else:
        # If no test set is provided, compute values for all targets
        targets = [target.decode('utf-8') for target in space.get_row2id()]

    scores = {}
    norms = {}
    for v in targets:

        try:
            row = space.get_row(v.encode('utf8'))
        except KeyError:
            # Target is not in the space
            scores[v] = 'nan'
            norms[v] = 'nan'
            continue

        # Get all counts in row (non-zero elements)
        counts = row.get_mat().data

        # Compute entropy of row
        scores[v] = entropy(counts, base=2)

        if is_norm:
            # Get number of non-zero elements in row
            types = row.get_mat().getnnz()
            norms[v] = np.log2(types)

    with codecs.open(outPath + '.csv', 'w', 'utf-8') as f_out:
        for word in targets:
            value = float(scores[word])
            if is_norm:
                norm = float(norms[word])
                # A row with a single non-zero entry has log2(1) == 0; plain
                # float division would raise ZeroDivisionError and abort the
                # whole run, so the undefined ratio is written as nan instead.
                if norm == 0.0:
                    value = float('nan')
                else:
                    value = value / norm
            print >> f_out, '\t'.join((word, str(value)))

    logging.info("--- %s seconds ---" % (time.time() - start_time))
Esempio n. 7
0
def main():
    """
    Compute the smoothed and shifted (P)PMI matrix from a co-occurrence matrix. Smoothing is performed as described in

      Omer Levy, Yoav Goldberg, and Ido Dagan. 2015. Improving distributional similarity with lessons learned from word embeddings. Trans. ACL, 3.

    The column sums are raised to the power alpha before being turned into
    probabilities, log(k) is subtracted from the log-weighted values, and
    non-positive results are removed from the sparse matrix.
    """

    # Parse the command line
    args = docopt(
        '''Compute the smoothed and shifted (P)PMI matrix from a co-occurrence matrix and save it in pickle format.

    Usage:
        ppmi.py [-l] <dsm_prefix> <k> <alpha> <outPath>

        <dsm_prefix> = the prefix for the input files (.sm for the matrix, .rows and .cols) and output files (.ppmi)
        <k> = shifting parameter
        <alpha> = smoothing parameter
        <outPath> = output path for space

    Options:
        -l, --len   normalize final vectors to unit length

    ''')

    dsm_prefix = args['<dsm_prefix>']
    k = int(args['<k>'])
    alpha = float(args['<alpha>'])
    outPath = args['<outPath>']
    is_len = args['--len']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Get space with sparse matrix
    dsm = load_pkl_files(dsm_prefix)
    id2row = dsm.get_id2row()
    id2column = dsm.get_id2column()

    # Marginal sums of the co-occurrence matrix
    cooccurrence = dsm.cooccurrence_matrix
    cooccurrence.assert_positive()
    row_totals = cooccurrence.sum(axis=1)
    column_totals = cooccurrence.sum(axis=0)

    # Smoothed context distribution P_alpha(c)
    smoothed_totals = np.power(column_totals, alpha)
    context_probs = smoothed_totals / smoothed_totals.sum()

    # Inverted marginals used as scaling factors
    inverse_rows = nonzero_invert(row_totals)
    inverse_columns = nonzero_invert(context_probs)

    # Apply epmi weighting (without log)
    weighted = cooccurrence.scale_rows(inverse_rows)
    weighted = weighted.scale_columns(inverse_columns)

    # Log weighting followed by the shift by log(k)
    weighted.mat.data = np.log(weighted.mat.data) - np.log(k)

    # Clip non-positive values to zero and drop them from the sparse storage
    stored_values = weighted.mat.data
    stored_values[stored_values <= 0] = 0.0
    weighted.mat.eliminate_zeros()

    ppmi_matrix = weighted.get_mat()

    if is_len:
        # L2-normalize vectors; zero norms become 1 to avoid division by zero
        row_norms = linalg.norm(ppmi_matrix, axis=1, ord=2)
        row_norms[row_norms == 0.0] = 1.0
        ppmi_matrix /= row_norms.reshape(len(row_norms), 1)

    ppmi_space = Space(SparseMatrix(ppmi_matrix), id2row, id2column)

    # Save the Space object in pickle format
    save_pkl_files(ppmi_space, outPath + ".ppmi.sm", save_in_one_file=False)
    elapsed = time.time() - start_time
    logging.info("--- %s seconds ---" % elapsed)
Esempio n. 8
0
def main():
    """
    Compute cosine distance for target pairs from two vector spaces.

    The row of the first target is taken from space1 and the row of the second
    target from space2. A pair with a target missing from its space is written
    with a nan score.
    """

    # Parse the command line
    args = docopt(
        """Compute cosine distance for target pairs from two vector spaces.

    Usage:
        cd.py [(-f | -s)] <spacePrefix1> <spacePrefix2> <outPath> [<testset>]

        <spacePrefix1> = path to pickled space without suffix
        <spacePrefix2> = path to pickled space without suffix
        <testset> = path to file with tab-separated word pairs
        <outPath> = output path for result file

    Options:
        -f, --fst   write only first target in output file
        -s, --scd   write only second target in output file

     Note:
         Important: spaces must be already aligned (columns in same order)!
        
    """)

    spacePrefix1 = args['<spacePrefix1>']
    spacePrefix2 = args['<spacePrefix2>']
    testset = args['<testset>']
    outPath = args['<outPath>']
    is_fst = args['--fst']
    is_scd = args['--scd']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load spaces
    space1 = load_pkl_files(spacePrefix1)
    space2 = load_pkl_files(spacePrefix2)

    if testset is not None:
        # First column is looked up in space1, second column in space2
        targets = []
        with codecs.open(testset, 'r', 'utf-8') as f_in:
            for line in f_in:
                fields = line.strip().split('\t')
                targets.append((fields[0], fields[1]))
    else:
        # Without a test set, pair every target found in both spaces with itself
        rows1 = set([target.decode('utf-8') for target in space1.get_row2id()])
        rows2 = [target.decode('utf-8') for target in space2.get_row2id()]
        target_intersection = rows1.intersection(rows2)
        targets = zip(target_intersection, target_intersection)

    scores = {}
    for t1, t2 in targets:
        pair = (t1, t2)

        # Get row vectors; a missing target yields 'nan' for the pair
        try:
            row1 = space1.get_row(t1.encode('utf8'))
            row2 = space2.get_row(t2.encode('utf8'))
        except KeyError:
            scores[pair] = 'nan'
            continue

        # Convert both rows to plain Python lists
        dense1 = csr_matrix(row1.get_mat()).toarray()
        row_vector1 = dense1[0].tolist()
        dense2 = csr_matrix(row2.get_mat()).toarray()
        row_vector2 = dense2[0].tolist()

        # Compute cosine distance of vectors
        scores[pair] = spatial.distance.cosine(row_vector1, row_vector2)

    with codecs.open(outPath + '.csv', 'w', 'utf-8') as f_out:
        for (t1, t2) in targets:
            if is_fst:  # output only first target string
                label = t1
            elif is_scd:  # output only second target string
                label = t2
            else:  # standard outputs both target strings
                label = '%s,%s' % (t1, t2)
            print >> f_out, '\t'.join((label, str(float(scores[(t1, t2)]))))

    elapsed = time.time() - start_time
    logging.info("--- %s seconds ---" % elapsed)
Esempio n. 9
0
def main():
    """
    Perform dimensionality reduction on a (normally PPMI) matrix by applying truncated SVD as described in

      Omer Levy, Yoav Goldberg, and Ido Dagan. 2015. Improving distributional similarity with lessons learned from word embeddings. Trans. ACL, 3.

    The left singular vectors are weighted by the singular values raised to
    the power gamma, optionally L2-normalized, and saved as a dense space.
    """

    # Parse the command line
    args = docopt(
        '''Perform dimensionality reduction on a (normally PPMI) matrix by applying truncated SVD and save it in pickle format.

    Usage:
        svd.py [-l] <dsm_prefix> <dim> <gamma> <outPath>

        <dsm_prefix> = the prefix for the input files (.sm for the matrix, .rows and .cols) and output files (.svd)
        <dim> = dimensionality of low-dimensional output vectors
        <gamma> = eigenvalue weighting parameter
        <outPath> = output path for space

    Options:
        -l, --len   normalize final vectors to unit length

    ''')

    dsm_prefix = args['<dsm_prefix>']
    dim = int(args['<dim>'])
    gamma = float(args['<gamma>'])
    outPath = args['<outPath>']
    is_len = args['--len']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Get space with sparse matrix
    dsm = load_pkl_files(dsm_prefix)
    id2row = dsm.get_id2row()

    # Get matrix from space
    cooccurrence = dsm.get_cooccurrence_matrix()

    # Apply SVD
    left_vectors, singular_values, _right_vectors = randomized_svd(
        cooccurrence.get_mat(),
        n_components=dim,
        n_iter=5,
        transpose=False)

    # Weight matrix. Multiplying by the flat vector of singular values scales
    # the columns, which is equivalent to a dot product with the diagonal
    # matrix built from them.
    if gamma == 0.0:
        reduced = left_vectors
    else:
        if gamma == 1.0:
            weights = singular_values
        else:
            weights = np.power(singular_values, gamma)
        reduced = weights * left_vectors

    if is_len:
        # L2-normalize vectors; zero norms become 1 to avoid division by zero
        row_norms = np.linalg.norm(reduced, axis=1, ord=2)
        row_norms[row_norms == 0.0] = 1.0
        reduced /= row_norms.reshape(len(row_norms), 1)

    reduced_space = Space(DenseMatrix(reduced), id2row, [])

    # Save the Space object in pickle format
    save_pkl_files(reduced_space,
                   outPath + ".svd.dm",
                   save_in_one_file=True,
                   save_as_w2v=True)
    elapsed = time.time() - start_time
    logging.info("--- %s seconds ---" % elapsed)
Esempio n. 10
0
def main():
    """
    Create two aligned low-dimensional vector spaces by sparse random indexing from two co-occurrence matrices as described in:
       Pierpaolo Basile, Annalina Caputo and Giovanni Semeraro, 2014. Analysing Word Meaning over Time by Exploiting Temporal Random Indexing.

    Steps performed by this function:
      1. Load both input spaces and build the sorted union of their column
         labels (and, when a threshold t is given, of their row labels).
      2. When t is given, sum both matrices in the unified layout and derive
         the relative frequency of every context for downsampling.
      3. Generate one random vector of length <dim> per unified column, either
         from a shuffled +1/-1 base vector (-s) or via sparse_random_matrix
         (-a).
      4. For every row of each space, add up the random vectors of its
         non-zero contexts, weighted by the (optionally downsampled) values.
      5. Optionally L2-normalize, then save both aligned spaces and the
         elemental (context vector) space.
    """

    # Get the arguments
    # NOTE(review): the help text below describes <samplesize> and <negAlpha>,
    # which are neither part of the usage pattern nor read by this function.
    args = docopt(
        '''Create two aligned low-dimensional vector spaces by sparse random indexing from two co-occurrence matrices.

    Usage:
        srv_align.py [-l] (-s <seeds> | -a) <dim> <t> <outPath1> <outPath2> <outPathElement> <spacePrefix1> <spacePrefix2>

        <samplesize> = number negative samples, expressed as percentage of positive samples
        <negAlpha> = smoothing parameter for negative sampling
        <seeds> = number of non-zero values in each random vector
        <dim> = number of dimensions for random vectors
        <t> = threshold for downsampling (if t=None, no subsampling is applied)
        <outPath1> = output path for aligned space 1
        <outPath2> = output path for aligned space 2
        <spacePrefix1> = path to pickled space without suffix
        <spacePrefix2> = path to pickled space without suffix
        <outPathElement> = output path for elemental space (context vectors)

    Options:
        -l, --len   normalize final vectors to unit length
        -s, --see   specify number of seeds manually
        -a, --aut   calculate number of seeds automatically as proposed in [1,2]
  
    References:
        [1] Ping Li, T. Hastie and K. W. Church, 2006,
           "Very Sparse Random Projections".
           http://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf
        [2] D. Achlioptas, 2001, "Database-friendly random projections",
           http://www.cs.ucsc.edu/~optas/papers/jl.pdf

    ''')

    is_len = args['--len']
    is_seeds = args['--see']
    # `seeds` is only defined when -s was given; it is only used in the
    # matching `if is_seeds:` branch further down
    if is_seeds:
        seeds = int(args['<seeds>'])
    is_aut = args['--aut']
    dim = int(args['<dim>'])
    # The literal string 'None' on the command line disables downsampling
    if args['<t>'] == 'None':
        t = None
    else:
        t = float(args['<t>'])
    outPath1 = args['<outPath1>']
    outPath2 = args['<outPath2>']
    outPathElement = args['<outPathElement>']
    spacePrefix1 = args['<spacePrefix1>']
    spacePrefix2 = args['<spacePrefix2>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load input spaces
    space1 = load_pkl_files(spacePrefix1)
    space2 = load_pkl_files(spacePrefix2)
    matrix1 = csc_matrix(space1.get_cooccurrence_matrix().get_mat())
    matrix2 = csc_matrix(space2.get_cooccurrence_matrix().get_mat())

    # Get mappings between rows/columns and words
    id2row1 = space1.get_id2row()
    id2row2 = space2.get_id2row()
    # NOTE(review): row2id_1 and row2id_2 are not used anywhere below
    row2id_1 = space1.get_row2id()
    row2id_2 = space2.get_row2id()
    id2column1 = space1.get_id2column()
    id2column2 = space2.get_id2column()

    # Get union of rows and columns in both spaces
    unified_rows = sorted(list(set(id2row1).union(id2row2)))
    unified_columns = sorted(list(set(id2column1).union(id2column2)))
    # Columns of the union that each individual space is missing
    columns_diff1 = list(set(unified_columns) - set(id2column1))
    columns_diff2 = list(set(unified_columns) - set(id2column2))

    # Get mappings of indices of columns in original spaces to indices of columns in unified space
    # (position j refers to the original columns followed by the missing ones)
    c2i = {w: i for i, w in enumerate(unified_columns)}
    cj2i1 = {j: c2i[w] for j, w in enumerate(id2column1 + columns_diff1)}
    cj2i2 = {j: c2i[w] for j, w in enumerate(id2column2 + columns_diff2)}

    # Everything in this branch only serves to compute `context_freqs` and
    # `downsample`, which are used for subsampling in the main loop below
    if t != None:
        # Rows of the union that each individual space is missing
        rows_diff1 = list(set(unified_rows) - set(id2row1))
        rows_diff2 = list(set(unified_rows) - set(id2row2))

        # Same index mapping as for the columns, now for the rows
        r2i = {w: i for i, w in enumerate(unified_rows)}
        rj2i1 = {j: r2i[w] for j, w in enumerate(id2row1 + rows_diff1)}
        rj2i2 = {j: r2i[w] for j, w in enumerate(id2row2 + rows_diff2)}

        # Build spaces with unified COLUMNS
        new_columns1 = csc_matrix(
            (len(id2row1), len(columns_diff1)
             ))  # Get empty columns for additional context words
        unified_matrix1 = hstack(
            (matrix1, new_columns1)
        )[:, sorted(
            cj2i1, key=cj2i1.get
        )]  # First concatenate matrix and empty columns and then order columns according to unified_columns

        new_columns2 = csc_matrix((len(id2row2), len(columns_diff2)))
        unified_matrix2 = hstack(
            (matrix2, new_columns2))[:, sorted(cj2i2, key=cj2i2.get)]

        # Build spaces with unified ROWS (append empty rows, then reorder
        # according to unified_rows)
        new_rows1 = csc_matrix((len(rows_diff1), len(unified_columns)))
        final_unified_matrix1 = csc_matrix(vstack(
            (unified_matrix1, new_rows1)))[sorted(rj2i1, key=rj2i1.get)]

        new_rows2 = csc_matrix((len(rows_diff2), len(unified_columns)))
        final_unified_matrix2 = csc_matrix(vstack(
            (unified_matrix2, new_rows2)))[sorted(rj2i2, key=rj2i2.get)]

        # Add up final unified matrices
        common_unified_matrix = np.add(final_unified_matrix1,
                                       final_unified_matrix2)

        # Get number of total occurrences of any word
        totalOcc = np.sum(common_unified_matrix)

        # Define function for downsampling: frequencies above the threshold t
        # get the factor sqrt(t / f), all others keep the factor 1.0
        downsample = lambda f: np.sqrt(float(t) / f) if f > t else 1.0
        downsample = np.vectorize(downsample)

        # Get total normalized co-occurrence frequency of all contexts in both spaces
        context_freqs = np.array(common_unified_matrix.sum(axis=0) /
                                 totalOcc)[0]

    ## Generate ternary random vectors
    if is_seeds:
        elementalMatrix = lil_matrix((len(unified_columns), dim))
        # Generate base vector for random vectors
        baseVector = np.zeros(
            dim
        )  # Note: Make sure that number of seeds is not greater than dimensions
        # First half of the seeds is set to +1, the rest to -1.
        # NOTE(review): `seeds / 2` relies on integer division (Python 2);
        # under Python 3 it is a float and range() would raise TypeError.
        for i in range(0, seeds / 2):
            baseVector[i] = 1.0
        for i in range(seeds / 2, seeds):
            baseVector[i] = -1.0
        # Each context gets its own random permutation of the base vector
        for i in range(
                len(unified_columns)
        ):  # To-do: make this more efficient by generating random indices for a whole array
            np.random.shuffle(baseVector)
            elementalMatrix[i] = baseVector
    if is_aut:
        # Transposed so that there is one row of length dim per unified column
        elementalMatrix = sparse_random_matrix(dim, len(unified_columns)).T

    # Initialize target vectors
    alignedMatrix1 = np.zeros((len(id2row1), dim))
    alignedMatrix2 = np.zeros((len(id2row2), dim))

    # Iterate over rows of space, find context words and update aligned matrix with low-dimensional random vectors of these context words
    for (space, id2row, cj2i,
         alignedMatrix) in [(space1, id2row1, cj2i1, alignedMatrix1),
                            (space2, id2row2, cj2i2, alignedMatrix2)]:
        # Iterate over targets
        for i, target in enumerate(id2row):
            # Get co-occurrence values as matrix
            m = space.get_row(target).get_mat()
            # Get nonzero indexes and translate the original column indices
            # into indices of the unified column layout
            nonzeros = m.nonzero()
            nonzeros = [cj2i[j] for j in nonzeros[1]]
            data = m.data
            pos_context_vectors = elementalMatrix[nonzeros]
            if t != None:
                # Apply subsampling
                rfs = context_freqs[nonzeros]
                rfs = downsample(rfs)
                # In-place update of m.data.
                # NOTE(review): whether this also changes the matrix held by
                # `space` depends on whether get_row() returns a copy - confirm.
                data *= rfs
            # Weight context vectors by occurrence frequency
            pos_context_vectors = pos_context_vectors.multiply(
                data.reshape(-1, 1))
            # Add up context vectors and store as row for target
            alignedMatrix[i] = np.sum(pos_context_vectors, axis=0)

    if is_len:
        # L2-normalize vectors
        l2norm1 = np.linalg.norm(alignedMatrix1, axis=1, ord=2)
        l2norm2 = np.linalg.norm(alignedMatrix2, axis=1, ord=2)
        l2norm1[l2norm1 == 0.0] = 1.0  # Convert 0 values to 1
        l2norm2[l2norm2 == 0.0] = 1.0  # Convert 0 values to 1
        alignedMatrix1 /= l2norm1.reshape(len(l2norm1), 1)
        alignedMatrix2 /= l2norm2.reshape(len(l2norm2), 1)

    # Make spaces (the dense spaces are created with an empty column list)
    alignedSpace1 = Space(DenseMatrix(alignedMatrix1), id2row1, [])
    alignedSpace2 = Space(DenseMatrix(alignedMatrix2), id2row2, [])
    elementalSpace = Space(SparseMatrix(elementalMatrix), unified_columns, [])

    # Save the Space objects in pickle format
    save_pkl_files(alignedSpace1, outPath1 + '.dm', save_in_one_file=False)
    save_pkl_files(alignedSpace2, outPath2 + '.dm', save_in_one_file=False)
    save_pkl_files(elementalSpace,
                   outPathElement + '.dm',
                   save_in_one_file=False)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
Esempio n. 11
0
def main():
    """
    Compute number of context types for all rows of a vector space and save their scores.

    The number of context types of a target is the number of non-zero entries
    in its row of the loaded space. One ``target<TAB>score`` line is written
    to ``<outPath>.csv`` per target, in the order the targets were read.
    Targets that are not found in the space get the score ``nan``. With
    ``-n``, every score is divided by ``<normConst>``.
    """

    # Get the arguments
    args = docopt(
        """Compute number of context types for all rows of a vector space and save their scores.

    Usage:
        types.py [(-n <normConst>)] <spacePrefix> <outPath> [<testset>]

        <spacePrefix> = path to pickled space without suffix
        <outPath> = output path for result file
        <testset> = path to file with targets in first column
        <normConst> = normalization constant

    Options:
        -n, --nrm  normalize values by normalization constant
        
    """)

    is_norm = args['--nrm']
    if is_norm:
        normConst = float(args['<normConst>'])
    spacePrefix = args['<spacePrefix>']
    outPath = args['<outPath>']
    testset = args['<testset>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    space = load_pkl_files(spacePrefix)

    if testset is not None:
        # Targets are taken from the first tab-separated column of the test set
        with codecs.open(testset, 'r', 'utf-8') as f_in:
            targets = [line.strip().split('\t')[0] for line in f_in]
    else:
        # If no test set is provided, compute values for all targets
        targets = [target.decode('utf-8') for target in space.get_row2id()]

    scores = {}
    # Iterate over targets
    for v in targets:

        try:
            row = space.get_row(v.encode('utf8'))
        except KeyError:
            # Target is not a row of the space
            scores[v] = float('nan')
            continue

        # Get number of non-zero elements in row
        scores[v] = row.get_mat().getnnz()

    with codecs.open(outPath + '.csv', 'w', 'utf-8') as f_out:
        for word in targets:
            # Normalize into a local value: writing the result back into
            # `scores` would divide a target again for every further time it
            # occurs in `targets` (test sets may list a target repeatedly).
            score = float(scores[word])
            if is_norm:
                score = score / normConst
            # write() emits the same line as the former print statement
            f_out.write('\t'.join((word, str(score))) + '\n')

    logging.info("--- %s seconds ---" % (time.time() - start_time))