Exemple #1
0
    def test_present_embeddings_run_server(self):
        """Check present_embeddings() when its second argument is True.

        ``shutil.which`` is replaced with a stub that always returns None and
        the ``BROWSER`` environment variable is blanked while the test runs;
        both are put back in the ``finally`` clause. The test verifies the
        three ``id2vec*`` files written to the temporary directory and the
        projector URL printed to stdout.

        NOTE(review): the True flag presumably asks present_embeddings to
        start the web server / open a browser - confirm against its signature.
        """
        def sweded_which(prog):
            # Pretend that no executable can ever be located.
            return None

        which = shutil.which
        shutil.which = sweded_which
        # Keep None when BROWSER is absent so that the cleanup can remove the
        # variable again instead of leaving an empty value behind.
        browser = os.environ.get("BROWSER")
        os.environ["BROWSER"] = ""

        try:
            with tempfile.TemporaryDirectory(
                    prefix="sourced.ml-test-") as tmpdir:
                with captured_output() as (stdout, _, _):
                    present_embeddings(tmpdir, True, ["one"],
                                       [str(i) for i in range(5)],
                                       [(i, i) for i in range(5)])
                    with open(os.path.join(tmpdir, "id2vec.json")) as fin:
                        # Only checks that the file parses as JSON.
                        json.load(fin)
                    with open(os.path.join(tmpdir, "id2vec_meta.tsv")) as fin:
                        self.assertEqual(fin.read(), "0\n1\n2\n3\n4\n")
                    with open(os.path.join(tmpdir, "id2vec_data.tsv")) as fin:
                        self.assertEqual(fin.read(),
                                         "0\t0\n1\t1\n2\t2\n3\t3\n4\t4\n")
                self.assertIn(
                    "\thttp://projector.tensorflow.org/?config=http://0.0.0.0:8000/id2vec.json\n",
                    stdout.getvalue())
        finally:
            shutil.which = which
            # Restore the environment exactly as it was found.
            if browser is None:
                os.environ.pop("BROWSER", None)
            else:
                os.environ["BROWSER"] = browser
            web_server.stop()
Exemple #2
0
 def test_present_embeddings(self):
     """Call present_embeddings() with a False second argument and check its files.

     The output path is two directory levels below a fresh temporary
     directory. Afterwards the JSON file must parse, and the metadata and
     data TSV files must match the expected text exactly.
     """
     expected_meta = "one\ttwo\n0\tx\n1\tx\n2\tx\n3\tx\n4\tx\n"
     expected_data = "0\t0\n1\t1\n2\t2\n3\t3\n4\t4\n"
     with tempfile.TemporaryDirectory(prefix="sourced.ml-test-") as root:
         outdir = os.path.join(root, "1", "2")
         rows = [(str(n), "x") for n in range(5)]
         vectors = [(n, n) for n in range(5)]
         present_embeddings(outdir, False, ["one", "two"], rows, vectors)

         with open(os.path.join(outdir, "id2vec.json")) as config:
             # Only checks that the file parses as JSON.
             json.load(config)
         with open(os.path.join(outdir, "id2vec_meta.tsv")) as meta:
             self.assertEqual(meta.read(), expected_meta)
         with open(os.path.join(outdir, "id2vec_data.tsv")) as data:
             self.assertEqual(data.read(), expected_data)
Exemple #3
0
def projector_entry(args):
    """Load an Id2Vec model and present at most MAX_TOKENS of its embeddings.

    Attributes read from ``args``:
        log_level: passed to the model constructors.
        input: source of the Id2Vec model.
        docfreq: optional source of a DocumentFrequencies model; falsy to skip.
        output: first argument given to ``projector.present_embeddings``.
        no_browser: when falsy, ``projector.wait()`` is called at the end.

    Token selection:
        * Fewer than MAX_TOKENS embeddings: all of them are used; with a
          docfreq model, tokens missing from it get a frequency of 0.
        * Otherwise, with a docfreq model: tokens missing from it are dropped
          and the MAX_TOKENS entries with the largest (frequency, index)
          pairs are kept.
        * Otherwise: MAX_TOKENS indices are sampled without replacement using
          a fixed seed.

    When frequencies are available, each token is paired with its frequency
    rendered as a string and a "docfreq" label is added.
    """
    MAX_TOKENS = 10000  # hardcoded in Tensorflow Projector

    log = logging.getLogger("id2vec_projector")
    id2vec = Id2Vec(log_level=args.log_level).load(source=args.input)
    if args.docfreq:
        from sourced.ml.models import DocumentFrequencies
        df = DocumentFrequencies(log_level=args.log_level).load(source=args.docfreq)
    else:
        df = None
    if len(id2vec) < MAX_TOKENS:
        tokens = numpy.arange(len(id2vec), dtype=int)
        if df is not None:
            freqs = [df.get(id2vec.tokens[i], 0) for i in tokens]
        else:
            freqs = None
    else:
        if df is not None:
            log.info("Filtering tokens through docfreq")
            items = []
            for token, idx in id2vec.items():
                try:
                    items.append((df[token], idx))
                except KeyError:
                    # Tokens unknown to the docfreq model are skipped.
                    continue
            log.info("Sorting")
            items.sort(reverse=True)
            tokens = [i[1] for i in items[:MAX_TOKENS]]
            freqs = [i[0] for i in items[:MAX_TOKENS]]
        else:
            log.warning("You have not specified --df => picking random %d tokens", MAX_TOKENS)
            # A private generator seeded with the same value draws the same
            # sample as seeding the global one, but leaves the process-wide
            # numpy.random state untouched.
            rng = numpy.random.RandomState(777)
            tokens = rng.choice(
                numpy.arange(len(id2vec), dtype=int), MAX_TOKENS, replace=False)
            freqs = None
    log.info("Gathering the embeddings")
    embeddings = numpy.vstack([id2vec.embeddings[i] for i in tokens])
    # From here on `tokens` holds the token strings instead of their indices.
    tokens = [id2vec.tokens[i] for i in tokens]
    labels = ["subtoken"]
    if freqs is not None:
        labels.append("docfreq")
        tokens = list(zip(tokens, (str(i) for i in freqs)))
    import sourced.ml.utils.projector as projector
    projector.present_embeddings(args.output, not args.no_browser, labels, tokens, embeddings)
    if not args.no_browser:
        projector.wait()