Example #1
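This entry point merges the row and column embeddings produced by Swivel: each TSV line holds a token followed by its tab-separated vector, the two vectors are averaged per token, and the result is saved as an Id2Vec model.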
def id2vec_postprocess(args):
    """
    Merges row and column embeddings produced by Swivel and writes the Id2Vec
    model.

    :param args: :class:`argparse.Namespace` with "swivel_data" \
                 and "output". The embedding text files are read from \
                 `swivel_data` and the model is written to \
                 `output`.
    :return: None
    """
    log = logging.getLogger("postproc")
    log.info("Parsing the embeddings at %s...", args.swivel_data)
    tokens = []
    embeddings = []
    swd = args.swivel_data
    with open(os.path.join(swd, "row_embedding.tsv")) as frow:
        with open(os.path.join(swd, "col_embedding.tsv")) as fcol:
            for i, (lrow, lcol) in enumerate(zip(frow, fcol)):
                # Show a progress counter every 10000 lines.
                if i % 10000 == (10000 - 1):
                    sys.stdout.write("%d\r" % (i + 1))
                    sys.stdout.flush()
                # Each line is "<token>\t<float>\t<float>...": split off the
                # token, then parse the tab-separated vector.
                prow, pcol = (l.split("\t", 1) for l in (lrow, lcol))
                assert prow[0] == pcol[0]
                tokens.append(prow[0][:TokenParser.MAX_TOKEN_LENGTH])
                erow, ecol = \
                    (numpy.fromstring(p[1], dtype=numpy.float32, sep="\t")
                     for p in (prow, pcol))
                # Average Swivel's row and column embeddings for each token.
                embeddings.append((erow + ecol) / 2)
    log.info("Generating numpy arrays...")
    embeddings = numpy.array(embeddings, dtype=numpy.float32)
    log.info("Writing %s...", args.output)
    Id2Vec().construct(embeddings=embeddings, tokens=tokens).save(args.output)
Example #2
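Apparently the constructor of the SimilarRepositories class used in Example #7: it loads (or accepts) the Id2Vec, DocumentFrequencies and BOW models, verifies that the BOW matrix matches the id2vec vocabulary, and builds a WMD engine over the embeddings. Helpers such as create_backend and the class-level self._log come from the surrounding module and are not shown in this excerpt.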
def __init__(self,
             id2vec=None,
             df=None,
             nbow=None,
             prune_df_threshold=1,
             wmd_cache_centroids=True,
             wmd_kwargs: Dict[str, Any] = None,
             languages: Tuple[List, bool] = (None, False),
             engine_kwargs: Dict[str, Any] = None):
    backend = create_backend()
    if id2vec is None:
        self._id2vec = Id2Vec().load(backend=backend)
    else:
        assert isinstance(id2vec, Id2Vec)
        self._id2vec = id2vec
    self._log.info("Loaded id2vec model: %s", self._id2vec)
    # df may be None (load the default model), False (disable document
    # frequencies) or a ready DocumentFrequencies instance.
    if df is None or df is False:
        if df is not False:
            self._df = DocumentFrequencies().load(backend=backend)
        else:
            self._df = None
            self._log.warning("Disabled document frequencies - you will "
                              "not be able to query custom repositories.")
    else:
        assert isinstance(df, DocumentFrequencies)
        self._df = df
    if self._df is not None:
        self._df = self._df.prune(prune_df_threshold)
    self._log.info("Loaded document frequencies: %s", self._df)
    if nbow is None:
        self._bow = BOW().load(backend=backend)
    else:
        assert isinstance(nbow, BOW)
        self._bow = nbow
    self._log.info("Loaded BOW model: %s", self._bow)
    assert self._bow.get_dep("id2vec")["uuid"] == self._id2vec.meta["uuid"]
    if len(self._id2vec) != self._bow.matrix.shape[1]:
        raise ValueError(
            "Models do not match: id2vec has %s tokens while nbow has %s" %
            (len(self._id2vec), self._bow.matrix.shape[1]))
    self._log.info("Creating the WMD engine...")
    self._wmd = WMD(self._id2vec.embeddings, self._bow,
                    **(wmd_kwargs or {}))
    if wmd_cache_centroids:
        self._wmd.cache_centroids()
    self._languages = languages
    self._engine_kwargs = engine_kwargs
Example #3
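This entry point exports an Id2Vec model to the TensorFlow Embedding Projector. Because the Projector is limited to 10000 points, the tokens are taken verbatim for small vocabularies, ranked by document frequency when --docfreq is given, or sampled at random otherwise.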
def projector_entry(args):
    MAX_TOKENS = 10000  # hardcoded in Tensorflow Projector

    log = logging.getLogger("id2vec_projector")
    id2vec = Id2Vec(log_level=args.log_level).load(source=args.input)
    if args.docfreq:
        from sourced.ml.models import DocumentFrequencies
        df = DocumentFrequencies(log_level=args.log_level).load(source=args.docfreq)
    else:
        df = None
    if len(id2vec) < MAX_TOKENS:
        tokens = numpy.arange(len(id2vec), dtype=int)
        if df is not None:
            freqs = [df.get(id2vec.tokens[i], 0) for i in tokens]
        else:
            freqs = None
    else:
        if df is not None:
            log.info("Filtering tokens through docfreq")
            items = []
            for token, idx in id2vec.items():
                try:
                    items.append((df[token], idx))
                except KeyError:
                    continue
            log.info("Sorting")
            items.sort(reverse=True)
            tokens = [i[1] for i in items[:MAX_TOKENS]]
            freqs = [i[0] for i in items[:MAX_TOKENS]]
        else:
            log.warning("You have not specified --df => picking random %d tokens", MAX_TOKENS)
            numpy.random.seed(777)
            tokens = numpy.random.choice(
                numpy.arange(len(id2vec), dtype=int), MAX_TOKENS, replace=False)
            freqs = None
    log.info("Gathering the embeddings")
    embeddings = numpy.vstack([id2vec.embeddings[i] for i in tokens])
    tokens = [id2vec.tokens[i] for i in tokens]
    labels = ["subtoken"]
    if freqs is not None:
        labels.append("docfreq")
        tokens = list(zip(tokens, (str(i) for i in freqs)))
    import sourced.ml.utils.projector as projector
    projector.present_embeddings(args.output, not args.no_browser, labels, tokens, embeddings)
    if not args.no_browser:
        projector.wait()
Example #4
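An evaluation entry point: it keeps the tokens shared by all the supplied Id2Vec models, maps the dataset identifiers to their embeddings, and fits a logistic regression (with a small grid search over C) to predict the UAST role, reporting the score and the best C per embedding.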
def id2role_eval(args):
    """
    Build a simple log-regression model to predict a Role of UAST node by its identifier embedding.
    It creates a report about embedding quality.
    To collect the dataset please use repos2roleids entry point.
    """
    log = logging.getLogger("id2role_eval")

    models = {}
    common_tokens = None
    for path in handle_input_arg(args.models, log):
        name = os.path.split(path)[1]
        id2vec = Id2Vec().load(path)
        id2vec.construct(id2vec.embeddings,
                         [t.split(".")[-1] for t in id2vec.tokens])
        models[name] = id2vec
        if common_tokens is None:
            common_tokens = set(models[name].tokens)
        else:
            common_tokens &= set(models[name].tokens)
    log.info("Common tokens in all models: %d" % len(common_tokens))

    tuned_parameters = [{'C': [10**x for x in range(-7, -1)]}]
    # Load data and preprocess
    log.info("Data loading...")
    df = load_dataset(args.dataset)
    df_ids = set(df["identifier"])
    valid_tokens = list(set(df_ids) & common_tokens)
    df = df[df["identifier"].isin(valid_tokens)]
    # Count identifiers in dataset
    log.info("Have embeddings only for %d tokens from %d in your dataset" %
             (len(valid_tokens), len(df_ids)))
    df_unique = df.groupby("identifier").agg(
        lambda x: x.value_counts().index[0])
    df_unique["identifier"] = df_unique.index
    # Exclude rare roles
    vc = df["role"].value_counts()
    del df
    rare = set(vc[vc < 10].index)
    log.info("%d rare roles excluded. " % len(rare))
    df_unique = df_unique[~df_unique["role"].isin(rare)]
    log.debug("Convert words to its embeddings")
    Xs, y = identifiers_to_datasets(df_unique, models, log)

    final_report = pandas.DataFrame(
        columns=["embedding name", "score", "best C value"])
    for name in tqdm(Xs):
        log.info("{}...".format(name))
        best_values = get_quality(Xs[name],
                                  y,
                                  LogisticRegression(class_weight="balanced",
                                                     random_state=args.seed),
                                  tuned_parameters=tuned_parameters,
                                  seed=args.seed,
                                  log=log)
        final_report = final_report.append(
            {
                "embedding name": name,
                "score": best_values[0],
                "best C value": best_values[1]["C"]
            },
            ignore_index=True)

    print("Pairs number: %d.\n" % len(valid_tokens))
    print(final_report)
Example #5
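A unit-test fixture that loads a reference Id2Vec model from a path defined in the test suite's paths module.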
def setUp(self):
    self.model = Id2Vec().load(source=paths.ID2VEC)
Example #6
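A test helper that reloads the model apparently written by the post-processing step (see Example #1) and checks the vocabulary size and the embedding dimensionality.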
def check_postproc_results(obj, id2vec_loc):
    id2vec = Id2Vec().load(source=id2vec_loc)
    obj.assertEqual(len(id2vec.tokens), VOCAB)
    obj.assertEqual(id2vec.embeddings.shape, (VOCAB, 10))
Example #7
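A command-line driver that ties the previous pieces together: it parses the model locations and query options, optionally pre-loads the Id2Vec, DocumentFrequencies and BOW models, builds a SimilarRepositories instance (see Example #2) and prints the requested number of nearest repositories with their rates.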
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("input", help="Repository URL or path or name.")
    parser.add_argument("--log-level", default="INFO",
                        choices=logging._nameToLevel,
                        help="Logging verbosity.")
    parser.add_argument("--id2vec", default=None,
                        help="id2vec model URL or path.")
    parser.add_argument("--df", default=None,
                        help="Document frequencies URL or path.")
    parser.add_argument("--bow", default=None,
                        help="BOW model URL or path.")
    parser.add_argument("--prune-df", default=20, type=int,
                        help="Minimum number of times an identifier must occur in the dataset "
                             "to be taken into account.")
    parser.add_argument("--vocabulary-min", default=50, type=int,
                        help="Minimum number of words in a bag.")
    parser.add_argument("--vocabulary-max", default=500, type=int,
                        help="Maximum number of words in a bag.")
    parser.add_argument("-n", "--nnn", default=10, type=int,
                        help="Number of nearest neighbours.")
    parser.add_argument("--early-stop", default=0.1, type=float,
                        help="Maximum fraction of the nBOW dataset to scan.")
    parser.add_argument("--max-time", default=300, type=int,
                        help="Maximum time to spend scanning in seconds.")
    parser.add_argument("--skipped-stop", default=0.95, type=float,
                        help="Minimum fraction of skipped samples to stop.")
    languages = ["Java", "Python", "Go", "JavaScript", "TypeScript", "Ruby", "Bash", "Php"]
    parser.add_argument(
        "-l", "--languages", nargs="+", choices=languages,
        default=None,  # The default value for --languages must be None:
        # otherwise, Parquet files without a 'lang' column could not be
        # processed with any --languages value.
        help="The programming languages to analyse.")
    parser.add_argument("--blacklist-languages", action="store_true",
                        help="Exclude the languages in --languages from the analysis "
                             "instead of filtering by default.")
    parser.add_argument(
        "-s", "--spark", default=SparkDefault.MASTER_ADDRESS,
        help="Spark's master address.")
    parser.add_argument("--bblfsh", default=EngineDefault.BBLFSH,
                        help="Babelfish server's address.")
    parser.add_argument("--engine", default=EngineDefault.VERSION,
                        help="source{d} jgit-spark-connector version.")
    args = parser.parse_args()
    setup_logging(args.log_level)
    backend = create_backend()
    if args.id2vec is not None:
        args.id2vec = Id2Vec().load(source=args.id2vec, backend=backend)
    if args.df is not None:
        args.df = DocumentFrequencies().load(source=args.df, backend=backend)
    if args.bow is not None:
        args.bow = BOW().load(source=args.bow, backend=backend)
    sr = SimilarRepositories(
        id2vec=args.id2vec, df=args.df, nbow=args.bow,
        prune_df_threshold=args.prune_df,
        wmd_cache_centroids=False,  # useless for a single query
        wmd_kwargs={"vocabulary_min": args.vocabulary_min,
                    "vocabulary_max": args.vocabulary_max},
        languages=(args.languages, args.blacklist_languages),
        engine_kwargs={"spark": args.spark,
                       "bblfsh": args.bblfsh,
                       "engine": args.engine},
    )
    neighbours = sr.query(
        args.input, k=args.nnn, early_stop=args.early_stop,
        max_time=args.max_time, skipped_stop=args.skipped_stop)
    for index, rate in neighbours:
        print("%48s\t%.2f" % (index, rate))