예제 #1
0
파일: tsnetsv.py 프로젝트: 52nlp/textSNE
def main(args):
    """Project token vectors to 2-D with t-SNE and write them out as TSV.

    ``args`` is an argv-style sequence: ``args[0]`` is skipped and the rest
    is parsed by the parser returned from ``_argparser()``.

    Every line of ``argp.input`` holds an optional leading token followed by
    float cells, separated by tabs (or by any whitespace when
    ``argp.whitespace_cells`` is set).  When ``argp.no_tokens`` is set the
    whole line is read as cells and the 1-based line number is used as the
    label instead.

    For each input row one line is written to ``argp.output`` containing the
    label and the two t-SNE coordinates, tab-separated, with each coordinate
    rescaled onto the range [0, 1].

    Returns 0 on success, or -1 (after reporting on stderr) when an input
    line cannot be parsed.
    """
    argp = _argparser().parse_args(args[1:])

    ### Build the numpy array to be used by the library
    # A separator of None makes str.split break on any run of whitespace.
    cell_sep = None if argp.whitespace_cells else '\t'
    # Keep the tokens along to align them later
    toks = []
    toks_vals = []
    for line_num, line in enumerate((l.rstrip('\n') for l in argp.input),
            start=1):
        try:
            if not argp.no_tokens:
                # Unpacking raises ValueError when the separator is missing.
                tok, vals_str = line.split(cell_sep, 1)
            else:
                # Without tokens the line number stands in as the label.
                tok, vals_str = line_num, line
            tok_vals = [float(v) for v in vals_str.split(cell_sep)]
        except ValueError:
            # stderr.write works on both Python 2 and 3, unlike the
            # ``print >> stderr`` statement.
            stderr.write('ERROR: Failed to read input line %s "%s"\n'
                    % (line_num, line, ))
            stderr.write('ERROR: Is the input perhaps separated by '
                    'spaces instead of tabs? If so, try the -w flag\n')
            return -1

        toks.append(tok)
        toks_vals.append(tok_vals)
    toks_vals_array = array(toks_vals)
    # Drop the Python-level copy before the memory-hungry t-SNE step.
    del toks_vals

    ### Perform t-SNE (this is heavy stuff)
    # TODO: A majority of the below arguments could be program arguments
    tsne_output = tsne(toks_vals_array, no_dims=2, perplexity=30, initial_dims=30)
    del toks_vals_array

    ### Re-format and produce TSV output
    # Pull the two coordinate columns out once instead of re-scanning the
    # t-SNE output for every min/max.
    xs = [x for x, _ in tsne_output]
    ys = [y for _, y in tsne_output]
    min_x, max_x = min(xs), max(xs)
    min_y, max_y = min(ys), max(ys)

    # Normalise the dimensions onto the range [0, 1].  If every point shares
    # a coordinate the span is zero; fall back to 1.0 so that axis collapses
    # to 0.0 instead of dividing by zero.
    x_span = float(max_x - min_x) or 1.0
    y_span = float(max_y - min_y) or 1.0

    ### And write it all out
    for token, x, y in zip(toks, xs, ys):
        argp.output.write('{0}\t{1}\t{2}\n'.format(
            token, (x - min_x) / x_span, (y - min_y) / y_span))

    return 0
예제 #2
0
def main(argv=None):
    """Load one or two vector files, embed them with t-SNE and plot them.

    ``argv`` is the list of command-line arguments to parse; when it is
    None, ``sys.argv[1:]`` is used.  Rows read from ``args.file`` are
    rendered in green and rows from the optional ``args.file2`` in gray.
    The figure is saved to ``args.out`` at 600 dpi when that option is set;
    otherwise ``plt.show()`` is called.
    """
    cli_args = sys.argv[1:] if argv is None else argv
    args = parser.parse_args(cli_args)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    titles, x = getData(args.file)

    # Rows before this index belong to the first file.
    file_size1 = len(titles)

    has_second_file = args.file2 is not None
    if has_second_file:
        titles2, x2 = getData(args.file2)
        titles.extend(titles2)
        x = np.vstack((x, x2))

    # NOTE: input scaling (scaleData) is disabled here.

    log.info('performing t-SNE')
    out = tsne(x, no_dims=2, perplexity=30, initial_dims=100, use_pca=False)

    def labelled(names, coords):
        # Pair each title with the first two columns of its embedded row.
        return [(name, row[0], row[1]) for name, row in zip(names, coords)]

    points = [('green', labelled(titles[:file_size1], out[:file_size1, :]))]
    if has_second_file:
        points.append(
            ('gray', labelled(titles[file_size1:], out[file_size1:, :])))

    # NOTE: PCA and MDS projections were alternatives to t-SNE at this
    # point; both are disabled.

    log.info('rendering result')
    render_points(points, 20, 20)

    if not args.out:
        plt.show()
    else:
        plt.savefig(args.out, dpi=600)

    log.info('finished')
예제 #3
0
def main(argv=None):
    """Embed the vectors from one or two files with t-SNE and render them.

    ``argv`` holds the command-line arguments to parse and defaults to
    ``sys.argv[1:]`` when None.  Points from ``args.file`` are drawn in
    green; points from ``args.file2``, if given, in gray.  The result is
    written to ``args.out`` (600 dpi) when set, else ``plt.show()`` is
    called.
    """
    if argv is None:
        argv = sys.argv[1:]
    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    titles, x = getData(args.file)
    file_size1 = len(titles)

    # Each colour group is described by the slice of rows it covers.
    colour_groups = [('green', slice(None, file_size1))]
    if args.file2 is not None:
        titles2, x2 = getData(args.file2)
        titles.extend(titles2)
        x = np.vstack((x, x2))
        colour_groups.append(('gray', slice(file_size1, None)))

    # NOTE: input scaling (scaleData) is disabled here.

    log.info('performing t-SNE')
    out = tsne(x, no_dims=2, perplexity=30, initial_dims=100, use_pca=False)

    points = []
    for colour, rows in colour_groups:
        labelled = []
        for title, point in zip(titles[rows], out[rows, :]):
            labelled.append((title, point[0], point[1]))
        points.append((colour, labelled))

    # NOTE: PCA and MDS projections were alternatives to t-SNE at this
    # point; both are disabled.

    log.info('rendering result')
    render_points(points, 20, 20)

    if args.out:
        plt.savefig(args.out, dpi=600)
    else:
        plt.show()

    log.info('finished')
예제 #4
0
파일: test.py 프로젝트: 52nlp/textSNE
#!/usr/bin/env python
"""Embed the word vectors in testdata/english-embeddings.turian.txt.gz with
t-SNE and render the result to test-output.rendered.png."""

import string, numpy, gzip

# Each data line is "<title> <float> <float> ...", whitespace-separated.
titles, x = [], []
# The with-block closes the archive even if a row fails to parse.
with gzip.open("testdata/english-embeddings.turian.txt.gz", "rb") as vectors_file:
    for line in vectors_file:
        # str.split replaces the deprecated string.split module function.
        toks = line.split()
        if not toks:
            # Skip blank lines rather than failing on toks[0].
            continue
        titles.append(toks[0])
        x.append([float(f) for f in toks[1:]])
x = numpy.array(x)

# Disabled alternatives, kept for reference:
#from tsne import tsne
from calc_tsne import tsne
#out = tsne(x, no_dims=2, perplexity=30, initial_dims=30, USE_PCA=False)
#out = tsne(x, no_dims=2, perplexity=30, initial_dims=30, use_pca=False)
out = tsne(x, no_dims=2, perplexity=30, initial_dims=30)

import render
# Label every embedded point with its title and draw them all on one image.
render.render([(title, point[0], point[1]) for title, point in zip(titles, out)], "test-output.rendered.png", width=3000, height=1800)
예제 #5
0
파일: test.py 프로젝트: xiaominlan/textSNE
#!/usr/bin/env python
"""Embed the word vectors in testdata/english-embeddings.turian.txt.gz with
t-SNE and render the result to test-output.rendered.png."""

import string, numpy, gzip

# Each data line is "<title> <float> <float> ...", whitespace-separated.
titles, x = [], []
# The with-block closes the archive even if a row fails to parse.
with gzip.open("testdata/english-embeddings.turian.txt.gz", "rb") as vectors_file:
    for line in vectors_file:
        # str.split replaces the deprecated string.split module function.
        toks = line.split()
        if not toks:
            # Skip blank lines rather than failing on toks[0].
            continue
        titles.append(toks[0])
        x.append([float(f) for f in toks[1:]])
x = numpy.array(x)

# Disabled alternatives, kept for reference:
#from tsne import tsne
from calc_tsne import tsne
#out = tsne(x, no_dims=2, perplexity=30, initial_dims=30, USE_PCA=False)
#out = tsne(x, no_dims=2, perplexity=30, initial_dims=30, use_pca=False)
out = tsne(x, no_dims=2, perplexity=30, initial_dims=30)

import render

# Label every embedded point with its title and draw them all on one image.
render.render([(title, point[0], point[1])
               for title, point in zip(titles, out)],
              "test-output.rendered.png",
              width=3000,
              height=1800)