Example #1
0
def main():
    """
    Align two sparse matrices by intersecting their columns.
    """

    # Parse command-line arguments (usage string must stay exactly as docopt expects)
    args = docopt('''Align two sparse matrices by intersecting their columns.

    Usage:
        count_alignment_intersect.py [-l] <outPath1> <outPath2> <spacePrefix1> <spacePrefix2>

        <outPath1> = output path for aligned space 1
        <outPath2> = output path for aligned space 2
        <spacePrefix1> = path to pickled space1 without suffix
        <spacePrefix2> = path to pickled space2 without suffix

    Options:
        -l, --len   normalize final vectors to unit length
    
    ''')

    normalize = args['--len']
    prefixes = (args['<spacePrefix1>'], args['<spacePrefix2>'])
    out_paths = (args['<outPath1>'], args['<outPath2>'])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load both pickled spaces and determine the columns they share.
    space_a = load_pkl_files(prefixes[0])
    space_b = load_pkl_files(prefixes[1])
    shared_columns = list(
        set(space_a.get_id2column()).intersection(space_b.get_id2column()))

    # Restrict each space to the shared columns (same column order for both),
    # optionally L2-normalizing each row vector afterwards.
    reduced_spaces = []
    for space in (space_a, space_b):
        col2id = space.get_column2id()
        keep_ids = [col2id[col] for col in shared_columns]
        mat = space.get_cooccurrence_matrix()[:, keep_ids].get_mat()
        if normalize:
            row_norms = linalg.norm(mat, axis=1, ord=2)
            row_norms[row_norms == 0.0] = 1.0  # avoid dividing all-zero rows by zero
            mat /= row_norms.reshape(len(row_norms), 1)
        reduced_spaces.append(
            Space(SparseMatrix(mat), space.get_id2row(), shared_columns))

    # Sanity check: both reduced spaces must now have identical column labels.
    if reduced_spaces[0].get_id2column() != reduced_spaces[1].get_id2column():
        sys.exit('Two spaces not properly aligned!')

    # Save the Space object in pickle format
    save_pkl_files(reduced_spaces[0], out_paths[0] + '.sm', save_in_one_file=True)
    save_pkl_files(reduced_spaces[1], out_paths[1] + '.sm', save_in_one_file=True)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
Example #2
0
def main():
    """
    Get second-order pairs from pair file.

    Reads whitespace-separated 'word1 word2' training pairs, builds a full
    word-by-word co-occurrence matrix over the joint target/context
    vocabulary, and for every low-frequency target samples replacement
    context words from its second-order vector (the count-weighted sum of
    its contexts' own co-occurrence rows). Sampled pairs are written to
    <outPath>, one 'target context' pair per line.
    """

    # Get the arguments
    args = docopt("""Get second-order pairs from pair file.

    Usage:
        second_order_pairs.py <pairFile> <samplesize> <freqThr> <outPath>
        
    Arguments:
       
        <pairFile> = path to training pairs with each line in the format 'word1 word2'
        <samplesize> = number of new pairs per target, expressed as percentage of old pairs (1.0 extracts equally many new pairs as old pairs)
        <freqThr> = co-occurrence frequency threshold over which no pairs are extracted
        <outPath> = output path for extracted pairs

    Note:
        Pairs are not yet switched or shuffled.

    """)

    pairFile = args['<pairFile>']
    samplesize = float(args['<samplesize>'])
    freqThr = int(args['<freqThr>'])
    outPath = args['<outPath>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Build vocabulary
    logging.info("Building vocabulary")
    # freqs counts only first-column (target) occurrences, not context ones.
    freqs = defaultdict(int)
    targets = set()
    contexts = set()
    with codecs.open(pairFile, 'r', 'utf-8') as f_in:
        for line in f_in:
            line = line.strip().split(' ')
            word = line[0]
            freqs[word] += 1
            contextWord = line[1]
            targets.add(word)
            contexts.add(contextWord)

    # Joint vocabulary so the matrix is square; note set iteration order
    # makes the word->index assignment nondeterministic across runs.
    vocabulary = targets.union(contexts)
    w2i = {w: i for i, w in enumerate(vocabulary)}

    # Initialize co-occurrence matrix as dictionary
    cooc_mat = defaultdict(lambda: 0)

    # Get counts (second pass over the pair file)
    logging.info("Counting context words")
    with codecs.open(pairFile, 'r', 'utf-8') as f_in:
        for line in f_in:
            line = line.strip().split(' ')
            word = line[0]
            windex = w2i[word]
            contextWord = line[1]
            cindex = w2i[contextWord]
            cooc_mat[(windex, cindex)] += 1

    # Convert dictionary to sparse matrix
    logging.info("Converting dictionary to matrix")
    cooc_mat_sparse = dok_matrix((len(vocabulary), len(vocabulary)),
                                 dtype=float)
    try:
        cooc_mat_sparse.update(cooc_mat)
    except NotImplementedError:
        # Some scipy versions block dict.update on dok_matrix; fall back to
        # the private _update. NOTE(review): private API — may break on
        # newer scipy releases.
        cooc_mat_sparse._update(cooc_mat)

    # Make space. Row/column labels are UTF-8 *bytes* from here on, while
    # freqs keys remain unicode strings — hence the decode at lookup below.
    vocabulary = [v.encode('utf-8') for v in vocabulary]
    countSpace = Space(SparseMatrix(cooc_mat_sparse), vocabulary, vocabulary)
    id2row = countSpace.get_id2row()
    row2id = countSpace.get_row2id()
    id2column = countSpace.get_id2column()
    column2id = countSpace.get_column2id()
    # Map column index -> row index for the same word (square matrix, so
    # presumably every column label is also a row label — the guard keeps
    # this safe either way).
    cid2rid = {
        cid: row2id[c]
        for cid, c in enumerate(id2column) if c in row2id
    }

    space = countSpace
    matrix = space.get_cooccurrence_matrix().get_mat()
    # target (bytes) -> list of sampled context labels (bytes)
    t2c = {}
    # Sample new pairs
    logging.info("Sampling new pairs")
    for i, target in enumerate(id2row):
        # Only low-frequency targets (freq <= freqThr) get second-order pairs.
        freq = freqs[target.decode('utf8')]
        if freq > freqThr:
            continue
        # Get counts as matrix
        m = space.get_row(target).get_mat()
        # Get nonzero indexes
        nonzeros = list(m.nonzero()[1])
        if nonzeros == []:
            continue
        # NOTE(review): assumes m.data is ordered consistently with
        # m.nonzero()[1] — holds for canonical CSR, TODO confirm for the
        # matrix type Space returns.
        data = m.data
        # build second-order vector: count-weighted sum of the rows of this
        # target's context words
        contextrowids = [cid2rid[n] for n in nonzeros]
        contextrows = matrix[contextrowids]
        vector2nd = csr_matrix(
            contextrows.multiply(data.reshape(-1, 1)).sum(axis=0))
        # sample from second-order vector, with replacement, proportionally
        # to the second-order weights
        nonzeros = list(vector2nd.nonzero()[1])
        if nonzeros == []:
            continue
        data = vector2nd.data
        sampling_probs = data / np.sum(data)
        # samplesize is a fraction of the target's original pair count
        samplesize_absolute = int(samplesize * freq)
        t_contexts = list(
            np.random.choice(nonzeros,
                             size=samplesize_absolute,
                             replace=True,
                             p=sampling_probs))
        # Drop self-pairs (target sampled as its own context)
        t2c[target] = [
            id2column[c] for c in t_contexts if id2column[c] != target
        ]

    # Export new pairs
    logging.info("Exporting new pairs")
    # NOTE(review): codecs.open here has no explicit encoding (unlike the
    # readers above), and t/c are bytes while ' ' is str — this join only
    # works on Python 2; on Python 3 it would raise TypeError. Confirm the
    # intended interpreter before changing.
    with codecs.open(outPath, 'w') as f_out:
        for t in t2c:
            for c in t2c[t]:
                f_out.write(' '.join((t, c)) + '\n')

    logging.info("--- %s seconds ---" % (time.time() - start_time))