Exemple #1
0
def train_embeddings(index, matrix, tmpdir, shard_size=IDEAL_SHARD_SIZE):
    try:
        from . import swivel
    except (SystemError, ImportError):
        import swivel
    import tensorflow as tf

    assert matrix.shape[0] == matrix.shape[1]
    assert len(index) <= matrix.shape[0]
    outlier_threshold = numpy.percentile(matrix.data, 99)
    matrix.data[matrix.data > outlier_threshold] = outlier_threshold
    nshards = len(index) // shard_size
    if nshards * shard_size < len(index):
        nshards += 1
        shard_size = len(index) // nshards
        nshards = len(index) // shard_size
    remainder = len(index) - nshards * shard_size
    if remainder > 0:
        lengths = matrix.indptr[1:] - matrix.indptr[:-1]
        filtered = sorted(numpy.argsort(lengths)[remainder:])
    else:
        filtered = list(range(len(index)))
    if len(filtered) < matrix.shape[0]:
        print("Truncating the sparse matrix...")
        matrix = matrix[filtered, :][:, filtered]
    meta_index = []
    for i, j in enumerate(filtered):
        meta_index.append((index[j], matrix[i, i]))
    index = [mi[0] for mi in meta_index]
    with tempfile.TemporaryDirectory(prefix="hercules_labours_", dir=tmpdir or None) as tmproot:
        print("Writing Swivel metadata...")
        vocabulary = "\n".join(index)
        with open(os.path.join(tmproot, "row_vocab.txt"), "w") as out:
            out.write(vocabulary)
        with open(os.path.join(tmproot, "col_vocab.txt"), "w") as out:
            out.write(vocabulary)
        del vocabulary
        bool_sums = matrix.indptr[1:] - matrix.indptr[:-1]
        bool_sums_str = "\n".join(map(str, bool_sums.tolist()))
        with open(os.path.join(tmproot, "row_sums.txt"), "w") as out:
            out.write(bool_sums_str)
        with open(os.path.join(tmproot, "col_sums.txt"), "w") as out:
            out.write(bool_sums_str)
        del bool_sums_str
        reorder = numpy.argsort(-bool_sums)

        print("Writing Swivel shards...")
        for row in range(nshards):
            for col in range(nshards):
                def _int64s(xs):
                    return tf.train.Feature(
                        int64_list=tf.train.Int64List(value=list(xs)))

                def _floats(xs):
                    return tf.train.Feature(
                        float_list=tf.train.FloatList(value=list(xs)))

                indices_row = reorder[row::nshards]
                indices_col = reorder[col::nshards]
                shard = matrix[indices_row][:, indices_col].tocoo()

                example = tf.train.Example(features=tf.train.Features(feature={
                    "global_row": _int64s(indices_row),
                    "global_col": _int64s(indices_col),
                    "sparse_local_row": _int64s(shard.row),
                    "sparse_local_col": _int64s(shard.col),
                    "sparse_value": _floats(shard.data)}))

                with open(os.path.join(tmproot, "shard-%03d-%03d.pb" % (row, col)), "wb") as out:
                    out.write(example.SerializeToString())
        print("Training Swivel model...")
        swivel.FLAGS.submatrix_rows = shard_size
        swivel.FLAGS.submatrix_cols = shard_size
        if len(meta_index) <= IDEAL_SHARD_SIZE / 16:
            embedding_size = 50
            num_epochs = 20000
        elif len(meta_index) <= IDEAL_SHARD_SIZE:
            embedding_size = 50
            num_epochs = 10000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 2:
            embedding_size = 60
            num_epochs = 5000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 4:
            embedding_size = 70
            num_epochs = 4000
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 10:
            embedding_size = 80
            num_epochs = 2500
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 25:
            embedding_size = 100
            num_epochs = 500
        elif len(meta_index) <= IDEAL_SHARD_SIZE * 100:
            embedding_size = 200
            num_epochs = 300
        else:
            embedding_size = 300
            num_epochs = 200
        swivel.FLAGS.embedding_size = embedding_size
        swivel.FLAGS.input_base_path = tmproot
        swivel.FLAGS.output_base_path = tmproot
        swivel.FLAGS.loss_multiplier = 1.0 / shard_size
        swivel.FLAGS.num_epochs = num_epochs
        # Tensorflow 1.5 parses sys.argv unconditionally *applause*
        argv_backup = sys.argv[1:]
        del sys.argv[1:]
        swivel.main(None)
        sys.argv.extend(argv_backup)
        print("Reading Swivel embeddings...")
        embeddings = []
        with open(os.path.join(tmproot, "row_embedding.tsv")) as frow:
            with open(os.path.join(tmproot, "col_embedding.tsv")) as fcol:
                for i, (lrow, lcol) in enumerate(zip(frow, fcol)):
                    prow, pcol = (l.split("\t", 1) for l in (lrow, lcol))
                    assert prow[0] == pcol[0]
                    erow, ecol = \
                        (numpy.fromstring(p[1], dtype=numpy.float32, sep="\t")
                         for p in (prow, pcol))
                    embeddings.append((erow + ecol) / 2)
    return meta_index, embeddings
Exemple #2
0
def train_embeddings(coocc_tree, tmpdir, shard_size=4096):
    from scipy.sparse import csr_matrix
    try:
        from . import swivel
    except (SystemError, ImportError):
        import swivel
    import tensorflow as tf

    index = coocc_tree["index"]
    nshards = len(index) // shard_size
    if nshards * shard_size < len(index):
        nshards += 1
        shard_size = len(index) // nshards
        nshards = len(index) // shard_size
    remainder = len(index) - nshards * shard_size
    if remainder > 0:
        lengths = numpy.array([len(cd) for cd in coocc_tree["matrix"]])
        filtered = sorted(numpy.argsort(lengths)[remainder:])
    else:
        filtered = list(range(len(index)))
    print("Reading the sparse matrix...")
    data = []
    indices = []
    indptr = [0]
    for row, cd in enumerate(coocc_tree["matrix"]):
        if row >= len(index):
            break
        for col, val in sorted(cd.items()):
            data.append(val)
            indices.append(col)
        indptr.append(indptr[-1] + len(cd))
    matrix = csr_matrix((data, indices, indptr),
                        shape=(len(index), len(index)))
    if len(filtered) < len(index):
        matrix = matrix[filtered, :][:, filtered]
    meta_index = []
    for i, j in enumerate(filtered):
        meta_index.append((index[j], matrix[i, i]))
    index = [mi[0] for mi in meta_index]
    with tempfile.TemporaryDirectory(prefix="hercules_labours_",
                                     dir=tmpdir or None) as tmproot:
        print("Writing Swivel metadata...")
        vocabulary = "\n".join(index)
        with open(os.path.join(tmproot, "row_vocab.txt"), "w") as out:
            out.write(vocabulary)
        with open(os.path.join(tmproot, "col_vocab.txt"), "w") as out:
            out.write(vocabulary)
        del vocabulary
        bool_sums = matrix.indptr[1:] - matrix.indptr[:-1]
        bool_sums_str = "\n".join(map(str, bool_sums.tolist()))
        with open(os.path.join(tmproot, "row_sums.txt"), "w") as out:
            out.write(bool_sums_str)
        with open(os.path.join(tmproot, "col_sums.txt"), "w") as out:
            out.write(bool_sums_str)
        del bool_sums_str
        reorder = numpy.argsort(-bool_sums)

        print("Writing Swivel shards...")
        for row in range(nshards):
            for col in range(nshards):

                def _int64s(xs):
                    return tf.train.Feature(int64_list=tf.train.Int64List(
                        value=list(xs)))

                def _floats(xs):
                    return tf.train.Feature(float_list=tf.train.FloatList(
                        value=list(xs)))

                indices_row = reorder[row::nshards]
                indices_col = reorder[col::nshards]
                shard = matrix[indices_row][:, indices_col].tocoo()

                example = tf.train.Example(features=tf.train.Features(
                    feature={
                        "global_row": _int64s(indices_row),
                        "global_col": _int64s(indices_col),
                        "sparse_local_row": _int64s(shard.row),
                        "sparse_local_col": _int64s(shard.col),
                        "sparse_value": _floats(shard.data)
                    }))

                with open(
                        os.path.join(tmproot,
                                     "shard-%03d-%03d.pb" % (row, col)),
                        "wb") as out:
                    out.write(example.SerializeToString())
        print("Training Swivel model...")
        swivel.FLAGS.submatrix_rows = shard_size
        swivel.FLAGS.submatrix_cols = shard_size
        if len(meta_index) < 10000:
            embedding_size = 50
            num_epochs = 200
        elif len(meta_index) < 100000:
            embedding_size = 100
            num_epochs = 250
        elif len(meta_index) < 500000:
            embedding_size = 200
            num_epochs = 300
        else:
            embedding_size = 300
            num_epochs = 200
        swivel.FLAGS.embedding_size = embedding_size
        swivel.FLAGS.input_base_path = tmproot
        swivel.FLAGS.output_base_path = tmproot
        swivel.FLAGS.loss_multiplier = 1.0 / shard_size
        swivel.FLAGS.num_epochs = num_epochs
        swivel.main(None)
        print("Reading Swivel embeddings...")
        embeddings = []
        with open(os.path.join(tmproot, "row_embedding.tsv")) as frow:
            with open(os.path.join(tmproot, "col_embedding.tsv")) as fcol:
                for i, (lrow, lcol) in enumerate(zip(frow, fcol)):
                    prow, pcol = (l.split("\t", 1) for l in (lrow, lcol))
                    assert prow[0] == pcol[0]
                    erow, ecol = \
                        (numpy.fromstring(p[1], dtype=numpy.float32, sep="\t")
                         for p in (prow, pcol))
                    embeddings.append((erow + ecol) / 2)
    return meta_index, embeddings