def id2vec_postprocess(args):
    """
    Merges row and column embeddings produced by Swivel and writes the Id2Vec model.

    :param args: :class:`argparse.Namespace` with "swivel_data" and "output". \
        The text files are read from `swivel_data` and the model is written to \
        `output`.
    :return: None
    """
    log = logging.getLogger("postproc")
    log.info("Parsing the embeddings at %s...", args.swivel_data)
    tokens = []
    embeddings = []
    swd = args.swivel_data
    with open(os.path.join(swd, "row_embedding.tsv")) as frow:
        with open(os.path.join(swd, "col_embedding.tsv")) as fcol:
            for i, (lrow, lcol) in enumerate(zip(frow, fcol)):
                if i % 10000 == (10000 - 1):
                    sys.stdout.write("%d\r" % (i + 1))
                    sys.stdout.flush()
                prow, pcol = (l.split("\t", 1) for l in (lrow, lcol))
                assert prow[0] == pcol[0]
                tokens.append(prow[0][:TokenParser.MAX_TOKEN_LENGTH])
                erow, ecol = \
                    (numpy.fromstring(p[1], dtype=numpy.float32, sep="\t")
                     for p in (prow, pcol))
                embeddings.append((erow + ecol) / 2)
    log.info("Generating numpy arrays...")
    embeddings = numpy.array(embeddings, dtype=numpy.float32)
    log.info("Writing %s...", args.output)
    Id2Vec().construct(embeddings=embeddings, tokens=tokens).save(args.output)
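# A minimal usage sketch for id2vec_postprocess, assuming Swivel has already written
# row_embedding.tsv and col_embedding.tsv into ./swivel_output (both paths are
# hypothetical):
def _example_id2vec_postprocess():
    import argparse
    args = argparse.Namespace(swivel_data="./swivel_output", output="./id2vec.asdf")
    # Averages the row/column embeddings token by token and saves the Id2Vec model.
    id2vec_postprocess(args)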
def __init__(self, id2vec=None, df=None, nbow=None, prune_df_threshold=1,
             wmd_cache_centroids=True, wmd_kwargs: Dict[str, Any] = None,
             languages: Tuple[List, bool] = (None, False),
             engine_kwargs: Dict[str, Any] = None):
    backend = create_backend()
    if id2vec is None:
        self._id2vec = Id2Vec().load(backend=backend)
    else:
        assert isinstance(id2vec, Id2Vec)
        self._id2vec = id2vec
    self._log.info("Loaded id2vec model: %s", self._id2vec)
    if df is None:
        self._df = DocumentFrequencies().load(backend=backend)
    elif df is False:
        self._df = None
        self._log.warning("Disabled document frequencies - you will "
                          "not be able to query custom repositories.")
    else:
        assert isinstance(df, DocumentFrequencies)
        self._df = df
    if self._df is not None:
        self._df = self._df.prune(prune_df_threshold)
    self._log.info("Loaded document frequencies: %s", self._df)
    if nbow is None:
        self._bow = BOW().load(backend=backend)
    else:
        assert isinstance(nbow, BOW)
        self._bow = nbow
    self._log.info("Loaded BOW model: %s", self._bow)
    assert self._bow.get_dep("id2vec")["uuid"] == self._id2vec.meta["uuid"]
    if len(self._id2vec) != self._bow.matrix.shape[1]:
        raise ValueError(
            "Models do not match: id2vec has %s tokens while nbow has %s" %
            (len(self._id2vec), self._bow.matrix.shape[1]))
    self._log.info("Creating the WMD engine...")
    self._wmd = WMD(self._id2vec.embeddings, self._bow, **(wmd_kwargs or {}))
    if wmd_cache_centroids:
        self._wmd.cache_centroids()
    self._languages = languages
    self._engine_kwargs = engine_kwargs
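# A hedged construction sketch (hypothetical model paths), mirroring how main() below
# wires preloaded models into SimilarRepositories instead of letting it fetch them
# from the default backend:
def _example_similar_repositories():
    id2vec = Id2Vec().load(source="./id2vec.asdf")
    df = DocumentFrequencies().load(source="./docfreq.asdf")
    bow = BOW().load(source="./bow.asdf")
    return SimilarRepositories(
        id2vec=id2vec, df=df, nbow=bow, prune_df_threshold=20,
        wmd_kwargs={"vocabulary_min": 50, "vocabulary_max": 500})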
def projector_entry(args):
    MAX_TOKENS = 10000  # hardcoded in Tensorflow Projector
    log = logging.getLogger("id2vec_projector")
    id2vec = Id2Vec(log_level=args.log_level).load(source=args.input)
    if args.docfreq:
        from sourced.ml.models import DocumentFrequencies
        df = DocumentFrequencies(log_level=args.log_level).load(source=args.docfreq)
    else:
        df = None
    if len(id2vec) < MAX_TOKENS:
        tokens = numpy.arange(len(id2vec), dtype=int)
        if df is not None:
            freqs = [df.get(id2vec.tokens[i], 0) for i in tokens]
        else:
            freqs = None
    else:
        if df is not None:
            log.info("Filtering tokens through docfreq")
            items = []
            for token, idx in id2vec.items():
                try:
                    items.append((df[token], idx))
                except KeyError:
                    continue
            log.info("Sorting")
            items.sort(reverse=True)
            tokens = [i[1] for i in items[:MAX_TOKENS]]
            freqs = [i[0] for i in items[:MAX_TOKENS]]
        else:
            log.warning("You have not specified --df => picking random %d tokens",
                        MAX_TOKENS)
            numpy.random.seed(777)
            tokens = numpy.random.choice(
                numpy.arange(len(id2vec), dtype=int), MAX_TOKENS, replace=False)
            freqs = None
    log.info("Gathering the embeddings")
    embeddings = numpy.vstack([id2vec.embeddings[i] for i in tokens])
    tokens = [id2vec.tokens[i] for i in tokens]
    labels = ["subtoken"]
    if freqs is not None:
        labels.append("docfreq")
        tokens = list(zip(tokens, (str(i) for i in freqs)))
    import sourced.ml.utils.projector as projector
    projector.present_embeddings(args.output, not args.no_browser, labels,
                                 tokens, embeddings)
    if not args.no_browser:
        projector.wait()
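# A minimal sketch of driving projector_entry directly instead of through the CLI
# (all paths are hypothetical; docfreq=None skips the document-frequency filtering):
def _example_projector():
    import argparse
    import logging
    args = argparse.Namespace(input="./id2vec.asdf", docfreq=None, output="./projector",
                              no_browser=True, log_level=logging.INFO)
    projector_entry(args)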
def id2role_eval(args):
    """
    Builds a simple logistic regression model to predict the Role of a UAST node
    from its identifier embedding, and creates a report about the embedding quality.
    To collect the dataset, please use the repos2roleids entry point.
    """
    log = logging.getLogger("id2role_eval")
    models = {}
    common_tokens = None
    for path in handle_input_arg(args.models, log):
        name = os.path.split(path)[1]
        id2vec = Id2Vec().load(path)
        id2vec.construct(id2vec.embeddings, [t.split(".")[-1] for t in id2vec.tokens])
        models[name] = id2vec
        if common_tokens is None:
            common_tokens = set(models[name].tokens)
        else:
            common_tokens &= set(models[name].tokens)
    log.info("Common tokens in all models: %d" % len(common_tokens))
    tuned_parameters = [{"C": [10**x for x in range(-7, -1)]}]

    # Load data and preprocess
    log.info("Data loading...")
    df = load_dataset(args.dataset)
    df_ids = set(df["identifier"])
    valid_tokens = list(set(df_ids) & common_tokens)
    df = df[df["identifier"].isin(valid_tokens)]

    # Count identifiers in the dataset
    log.info("Have embeddings only for %d tokens from %d in your dataset" %
             (len(valid_tokens), len(df_ids)))
    df_unique = df.groupby("identifier").agg(lambda x: x.value_counts().index[0])
    df_unique["identifier"] = df_unique.index

    # Exclude rare roles
    vc = df["role"].value_counts()
    del df
    rare = set(vc[vc < 10].index)
    log.info("%d rare roles excluded." % len(rare))
    df_unique = df_unique[~df_unique["role"].isin(rare)]

    log.debug("Converting words to their embeddings")
    Xs, y = identifiers_to_datasets(df_unique, models, log)
    final_report = pandas.DataFrame(columns=["embedding name", "score", "best C value"])
    for name in tqdm(Xs):
        log.info("{}...".format(name))
        best_values = get_quality(
            Xs[name], y,
            LogisticRegression(class_weight="balanced", random_state=args.seed),
            tuned_parameters=tuned_parameters, seed=args.seed, log=log)
        final_report = final_report.append(
            {"embedding name": name,
             "score": best_values[0],
             "best C value": best_values[1]["C"]},
            ignore_index=True)
    print("Pairs number: %d.\n" % len(valid_tokens))
    print(final_report)
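# A hedged call sketch for id2role_eval (model and dataset paths are hypothetical);
# it expects one or more Id2Vec models to compare and the dataset produced by the
# repos2roleids entry point:
def _example_id2role_eval():
    import argparse
    args = argparse.Namespace(models=["./id2vec_a.asdf", "./id2vec_b.asdf"],
                              dataset="./roleids.csv.gz", seed=42)
    id2role_eval(args)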
def setUp(self):
    self.model = Id2Vec().load(source=paths.ID2VEC)
def check_postproc_results(obj, id2vec_loc):
    id2vec = Id2Vec().load(source=id2vec_loc)
    obj.assertEqual(len(id2vec.tokens), VOCAB)
    obj.assertEqual(id2vec.embeddings.shape, (VOCAB, 10))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("input", help="Repository URL or path or name.")
    parser.add_argument("--log-level", default="INFO", choices=logging._nameToLevel,
                        help="Logging verbosity.")
    parser.add_argument("--id2vec", default=None, help="id2vec model URL or path.")
    parser.add_argument("--df", default=None, help="Document frequencies URL or path.")
    parser.add_argument("--bow", default=None, help="BOW model URL or path.")
    parser.add_argument("--prune-df", default=20, type=int,
                        help="Minimum number of times an identifier must occur in the "
                             "dataset to be taken into account.")
    parser.add_argument("--vocabulary-min", default=50, type=int,
                        help="Minimum number of words in a bag.")
    parser.add_argument("--vocabulary-max", default=500, type=int,
                        help="Maximum number of words in a bag.")
    parser.add_argument("-n", "--nnn", default=10, type=int,
                        help="Number of nearest neighbours.")
    parser.add_argument("--early-stop", default=0.1, type=float,
                        help="Maximum fraction of the nBOW dataset to scan.")
    parser.add_argument("--max-time", default=300, type=int,
                        help="Maximum time to spend scanning in seconds.")
    parser.add_argument("--skipped-stop", default=0.95, type=float,
                        help="Minimum fraction of skipped samples to stop.")
    languages = ["Java", "Python", "Go", "JavaScript", "TypeScript", "Ruby", "Bash", "Php"]
    parser.add_argument(
        "-l", "--languages", nargs="+", choices=languages, default=None,
        # The default for --languages must stay None: otherwise parquet files without
        # a "lang" column would fail to process with any --languages value.
        help="The programming languages to analyse.")
    parser.add_argument("--blacklist-languages", action="store_true",
                        help="Exclude the languages listed in --languages from the "
                             "analysis instead of restricting the analysis to them.")
    parser.add_argument(
        "-s", "--spark", default=SparkDefault.MASTER_ADDRESS, help="Spark's master address.")
    parser.add_argument("--bblfsh", default=EngineDefault.BBLFSH,
                        help="Babelfish server's address.")
    parser.add_argument("--engine", default=EngineDefault.VERSION,
                        help="source{d} jgit-spark-connector version.")
    args = parser.parse_args()
    setup_logging(args.log_level)
    backend = create_backend()
    if args.id2vec is not None:
        args.id2vec = Id2Vec().load(source=args.id2vec, backend=backend)
    if args.df is not None:
        args.df = DocumentFrequencies().load(source=args.df, backend=backend)
    if args.bow is not None:
        args.bow = BOW().load(source=args.bow, backend=backend)
    sr = SimilarRepositories(
        id2vec=args.id2vec, df=args.df, nbow=args.bow,
        prune_df_threshold=args.prune_df,
        wmd_cache_centroids=False,  # useless for a single query
        wmd_kwargs={"vocabulary_min": args.vocabulary_min,
                    "vocabulary_max": args.vocabulary_max},
        languages=(args.languages, args.blacklist_languages),
        engine_kwargs={"spark": args.spark,
                       "bblfsh": args.bblfsh,
                       "engine": args.engine},
    )
    neighbours = sr.query(
        args.input, k=args.nnn, early_stop=args.early_stop,
        max_time=args.max_time, skipped_stop=args.skipped_stop)
    for index, rate in neighbours:
        print("%48s\t%.2f" % (index, rate))
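# A hedged example of invoking main() programmatically; the repository URL and model
# paths are placeholders, and normally this script is run from the shell instead:
def _example_main():
    import sys
    sys.argv = ["similar_repositories", "https://github.com/apache/spark",
                "--id2vec", "./id2vec.asdf", "--df", "./docfreq.asdf",
                "--bow", "./bow.asdf", "-n", "10"]
    main()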