def __init__(self, topics=None, docfreq=None, bow=None,
             verbosity=logging.DEBUG, prune_df_threshold=1,
             gcs_bucket=None, initialize_environment=True,
             repo2bow_kwargs=None):
    if initialize_environment:
        initialize()
    self._log = logging.getLogger("topic_detector")
    self._log.setLevel(verbosity)
    if gcs_bucket:
        backend = create_backend(args="bucket=" + gcs_bucket)
    else:
        backend = create_backend()
    if topics is None:
        self._topics = Topics(log_level=verbosity).load(backend=backend)
    else:
        assert isinstance(topics, Topics)
        self._topics = topics
    self._log.info("Loaded topics model: %s", self._topics)
    # docfreq=None loads the default model, docfreq=False disables it,
    # anything else must be a ready DocumentFrequencies instance.
    if docfreq is None:
        self._docfreq = DocumentFrequencies(log_level=verbosity).load(
            source=self._topics.dep("docfreq")["uuid"], backend=backend)
    elif docfreq is False:
        self._docfreq = None
        self._log.warning("Disabled document frequencies - you will "
                          "not be able to query custom repositories.")
    else:
        assert isinstance(docfreq, DocumentFrequencies)
        self._docfreq = docfreq
    if self._docfreq is not None:
        self._docfreq = self._docfreq.prune(prune_df_threshold)
    self._log.info("Loaded docfreq model: %s", self._docfreq)
    if bow is not None:
        assert isinstance(bow, BOWBase)
        self._bow = bow
        if self._topics.matrix.shape[1] != self._bow.matrix.shape[1]:
            raise ValueError(
                "Models do not match: topics has %s tokens while bow has %s"
                % (self._topics.matrix.shape[1], self._bow.matrix.shape[1]))
        self._log.info("Attached BOW model: %s", self._bow)
    else:
        self._bow = None
        self._log.warning("No BOW cache was loaded.")
    if self._docfreq is not None:
        self._repo2bow = Repo2BOW(
            {t: i for i, t in enumerate(self._topics.tokens)},
            self._docfreq, **(repo2bow_kwargs or {}))
    else:
        self._repo2bow = None
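# A minimal usage sketch of the constructor above. The bucket name and repository
# URL are placeholders; query()'s signature follows the CLI wrapper further below
# (sr.query(args.input, size=args.nnn)).
detector = TopicDetector(gcs_bucket="my-models-bucket")
for topic, relevance in detector.query("https://github.com/src-d/modelforge", size=5):
    print("%64s\t%.2f" % (topic, relevance))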
def __init__(self, id2vec=None, df=None, nbow=None, verbosity=logging.DEBUG,
             wmd_cache_centroids=True, wmd_kwargs=None, gcs_bucket=None,
             repo2nbow_kwargs=None, initialize_environment=True):
    if initialize_environment:
        initialize()
    self._log = logging.getLogger("similar_repos")
    self._log.setLevel(verbosity)
    if gcs_bucket:
        backend = create_backend(args="bucket=" + gcs_bucket)
    else:
        backend = create_backend()
    if id2vec is None:
        self._id2vec = Id2Vec(log_level=verbosity, backend=backend)
    else:
        assert isinstance(id2vec, Id2Vec)
        self._id2vec = id2vec
    self._log.info("Loaded id2vec model: %s", self._id2vec)
    # df=None loads the default model, df=False disables it,
    # anything else must be a ready DocumentFrequencies instance.
    if df is None:
        self._df = DocumentFrequencies(log_level=verbosity, backend=backend)
    elif df is False:
        self._df = None
        self._log.warning("Disabled document frequencies - you will "
                          "not be able to query custom repositories.")
    else:
        assert isinstance(df, DocumentFrequencies)
        self._df = df
    self._log.info("Loaded document frequencies: %s", self._df)
    if nbow is None:
        self._nbow = NBOW(log_level=verbosity, backend=backend)
    else:
        assert isinstance(nbow, NBOW)
        self._nbow = nbow
    self._log.info("Loaded nBOW model: %s", self._nbow)
    self._repo2nbow = Repo2nBOW(self._id2vec, self._df, log_level=verbosity,
                                **(repo2nbow_kwargs or {}))
    self._log.info("Creating the WMD engine...")
    self._wmd = WMD(self._id2vec.embeddings, self._nbow, verbosity=verbosity,
                    **(wmd_kwargs or {}))
    if wmd_cache_centroids:
        self._wmd.cache_centroids()
def __init__(self, id2vec=None, df=None, nbow=None, prune_df_threshold=1,
             verbosity=logging.DEBUG, wmd_cache_centroids=True, wmd_kwargs=None,
             gcs_bucket=None, repo2nbow_kwargs=None, initialize_environment=True):
    if initialize_environment:
        initialize()
    self._log = logging.getLogger("similar_repos")
    self._log.setLevel(verbosity)
    if gcs_bucket:
        backend = create_backend(args="bucket=" + gcs_bucket)
    else:
        backend = create_backend()
    if id2vec is None:
        self._id2vec = Id2Vec(log_level=verbosity).load(backend=backend)
    else:
        assert isinstance(id2vec, Id2Vec)
        self._id2vec = id2vec
    self._log.info("Loaded id2vec model: %s", self._id2vec)
    # df=None loads the default model, df=False disables it,
    # anything else must be a ready DocumentFrequencies instance.
    if df is None:
        self._df = DocumentFrequencies(log_level=verbosity).load(backend=backend)
    elif df is False:
        self._df = None
        self._log.warning("Disabled document frequencies - you will "
                          "not be able to query custom repositories.")
    else:
        assert isinstance(df, DocumentFrequencies)
        self._df = df
    if self._df is not None:
        self._df = self._df.prune(prune_df_threshold)
    self._log.info("Loaded document frequencies: %s", self._df)
    if nbow is None:
        self._nbow = NBOW(log_level=verbosity).load(backend=backend)
    else:
        assert isinstance(nbow, NBOW)
        self._nbow = nbow
    self._log.info("Loaded nBOW model: %s", self._nbow)
    self._repo2nbow = Repo2nBOW(
        self._id2vec, self._df, log_level=verbosity, **(repo2nbow_kwargs or {}))
    assert self._nbow.dep("id2vec")["uuid"] == self._id2vec.meta["uuid"]
    if len(self._id2vec) != self._nbow.matrix.shape[1]:
        raise ValueError("Models do not match: id2vec has %s tokens while nbow "
                         "has %s" % (len(self._id2vec), self._nbow.matrix.shape[1]))
    self._log.info("Creating the WMD engine...")
    self._wmd = WMD(self._id2vec.embeddings, self._nbow, verbosity=verbosity,
                    **(wmd_kwargs or {}))
    if wmd_cache_centroids:
        self._wmd.cache_centroids()
def test_create_backend_invalid_args(self):
    backup = back.config.BACKEND_ARGS
    back.config.BACKEND_ARGS = "lalala"
    try:
        with self.assertRaises(ValueError):
            back.create_backend("Bar")
    finally:
        # Restore even if assertRaises fails, so later tests are not polluted.
        back.config.BACKEND_ARGS = backup
    backup = back.config.BACKEND_ARGS
    back.config.BACKEND_ARGS = ""

    class Bar(back.StorageBackend):
        NAME = "Bar"

    back.register_backend(Bar)
    git_index = ind.GitIndex(index_repo=self.default_url, cache=self.cached_path)
    try:
        self.assertIsInstance(back.create_backend("Bar", git_index), Bar)
    finally:
        back.config.BACKEND_ARGS = backup
def __init__(self, id2vec=None, docfreq=None, gcs_bucket=None, **kwargs):
    if gcs_bucket:
        backend = create_backend("gcs", "bucket=" + gcs_bucket)
    else:
        backend = None
    self._id2vec = kwargs["id2vec"] = Id2Vec().load(id2vec or None, backend=backend)
    self._df = kwargs["docfreq"] = DocumentFrequencies().load(
        docfreq or None, backend=backend)
    super(Repo2nBOWTransformer, self).__init__(**kwargs)
def __init__(self, id2vec=None, docfreq=None, gcs_bucket=None, **kwargs):
    if gcs_bucket:
        backend = create_backend("gcs", "bucket=" + gcs_bucket)
    else:
        backend = None
    self._id2vec = kwargs["id2vec"] = Id2Vec().load(id2vec or None, backend=backend)
    self._df = kwargs["docfreq"] = DocumentFrequencies().load(docfreq or None,
                                                              backend=backend)
    prune_df = kwargs.pop("prune_df", 1)
    if prune_df > 1:
        # Keep kwargs["docfreq"] in sync so the parent class sees the pruned model.
        self._df = kwargs["docfreq"] = self._df.prune(prune_df)
    super().__init__(**kwargs)
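# A hypothetical instantiation of the transformer above. The model paths and the
# prune_df threshold are placeholders; any extra keyword arguments required by the
# parent class would have to be supplied as well.
transformer = Repo2nBOWTransformer(id2vec="id2vec.asdf", docfreq="docfreq.asdf",
                                   prune_df=20)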
def test_create_backend_invalid_args(self):
    backup = backends.config.BACKEND_ARGS
    backends.config.BACKEND_ARGS = "lalala"
    try:
        with self.assertRaises(ValueError):
            backends.create_backend("Bar")
    finally:
        backends.config.BACKEND_ARGS = backup
    backup = backends.config.BACKEND_ARGS
    backends.config.BACKEND_ARGS = ""

    class Bar(backends.StorageBackend):
        NAME = "Bar"

    backends.register_backend(Bar)
    try:
        self.assertIsInstance(backends.create_backend("Bar"), Bar)
    finally:
        backends.config.BACKEND_ARGS = backup
def __init__(self, id2vec=None, df=None, nbow=None, prune_df_threshold=1,
             wmd_cache_centroids=True, wmd_kwargs: Dict[str, Any] = None,
             languages: Tuple[List, bool] = (None, False),
             engine_kwargs: Dict[str, Any] = None):
    # self._log is used below but never created here; it is assumed to come from
    # a base class or a class-level attribute.
    backend = create_backend()
    if id2vec is None:
        self._id2vec = Id2Vec().load(backend=backend)
    else:
        assert isinstance(id2vec, Id2Vec)
        self._id2vec = id2vec
    self._log.info("Loaded id2vec model: %s", self._id2vec)
    # df=None loads the default model, df=False disables it,
    # anything else must be a ready DocumentFrequencies instance.
    if df is None:
        self._df = DocumentFrequencies().load(backend=backend)
    elif df is False:
        self._df = None
        self._log.warning("Disabled document frequencies - you will "
                          "not be able to query custom repositories.")
    else:
        assert isinstance(df, DocumentFrequencies)
        self._df = df
    if self._df is not None:
        self._df = self._df.prune(prune_df_threshold)
    self._log.info("Loaded document frequencies: %s", self._df)
    if nbow is None:
        self._bow = BOW().load(backend=backend)
    else:
        assert isinstance(nbow, BOW)
        self._bow = nbow
    self._log.info("Loaded BOW model: %s", self._bow)
    assert self._bow.get_dep("id2vec")["uuid"] == self._id2vec.meta["uuid"]
    if len(self._id2vec) != self._bow.matrix.shape[1]:
        raise ValueError(
            "Models do not match: id2vec has %s tokens while nbow has %s"
            % (len(self._id2vec), self._bow.matrix.shape[1]))
    self._log.info("Creating the WMD engine...")
    self._wmd = WMD(self._id2vec.embeddings, self._bow, **(wmd_kwargs or {}))
    if wmd_cache_centroids:
        self._wmd.cache_centroids()
    self._languages = languages
    self._engine_kwargs = engine_kwargs
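# A hedged usage sketch of the constructor above: the keyword values mirror the CLI
# defaults in main() further below, and the repository URL is a placeholder.
sr = SimilarRepositories(wmd_kwargs={"vocabulary_min": 50, "vocabulary_max": 500},
                         languages=(["Java", "Python"], False))
neighbours = sr.query("https://github.com/src-d/modelforge", k=10,
                      early_stop=0.1, max_time=300, skipped_stop=0.95)
for index, rate in neighbours:
    print("%48s\t%.2f" % (index, rate))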
def test_auto(self):
    class FakeModel(GenericModel):
        NAME = "docfreq"

    def route(url):
        if GCSBackend.INDEX_FILE in url:
            return '{"models": {"docfreq": {' \
                   '"f64bacd4-67fb-4c64-8382-399a8e7db52a": ' \
                   '{"url": "https://xxx"}, ' \
                   '"default": "f64bacd4-67fb-4c64-8382-399a8e7db52a"' \
                   '}}}'.encode()
        self.assertEqual("https://xxx", url)
        with open(get_path(self.DOCFREQ_PATH), "rb") as fin:
            return fin.read()

    modelforge.gcs_backend.requests = FakeRequests(route)
    model = FakeModel(backend=create_backend())
    self._validate_meta(model)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("input", help="Repository URL or path or name.")
    parser.add_argument("--log-level", default="INFO", choices=logging._nameToLevel,
                        help="Logging verbosity.")
    parser.add_argument("--topics", default=None, help="Topic model URL or path.")
    parser.add_argument("--df", default=None,
                        help="Document frequencies URL or path.")
    parser.add_argument("--bow", default=None, help="BOW model URL or path.")
    parser.add_argument("--bblfsh", default=None, help="Babelfish server address.")
    parser.add_argument(
        "--timeout", type=int, default=None,
        help="Babelfish timeout - longer requests are dropped. Default is %s." %
             DEFAULT_BBLFSH_TIMEOUT)
    parser.add_argument("--gcs", default=None, help="GCS bucket to use.")
    parser.add_argument("--linguist", default=None,
                        help="Path to src-d/enry or github/linguist.")
    parser.add_argument(
        "--prune-df", default=20, type=int,
        help="Minimum number of times an identifier must occur in different "
             "documents to be taken into account.")
    parser.add_argument("-n", "--nnn", default=10, type=int,
                        help="Number of topics to print.")
    parser.add_argument("-f", "--format", default="human",
                        choices=["json", "human"], help="Output format.")
    args = parser.parse_args()
    if args.linguist is None:
        args.linguist = "./enry"
    initialize(args.log_level, enry=args.linguist)
    if args.gcs:
        backend = create_backend(args="bucket=" + args.gcs)
    else:
        backend = create_backend()
    if args.topics is not None:
        args.topics = Topics(log_level=args.log_level).load(
            source=args.topics, backend=backend)
    if args.df is not None:
        args.df = DocumentFrequencies(log_level=args.log_level).load(
            source=args.df, backend=backend)
    if args.bow is not None:
        args.bow = BOWBase(log_level=args.log_level).load(
            source=args.bow, backend=backend)
    sr = TopicDetector(topics=args.topics, docfreq=args.df, bow=args.bow,
                       verbosity=args.log_level, prune_df_threshold=args.prune_df,
                       gcs_bucket=args.gcs,
                       repo2bow_kwargs={"linguist": args.linguist,
                                        "bblfsh_endpoint": args.bblfsh,
                                        "timeout": args.timeout})
    topics = sr.query(args.input, size=args.nnn)
    if args.format == "json":
        json.dump({"repository": args.input, "topics": topics}, sys.stdout)
    elif args.format == "human":
        for t, r in topics:
            print("%64s" % t, "%.2f" % r, sep="\t")
def setUp(self):
    self.backend = create_backend()
def setUp(self):
    ind.git = fake_git
    ind.Repo = fake_git.FakeRepo
    fake_git.FakeRepo.reset(self.default_index)
    self.backend = create_backend(git_index=ind.GitIndex(
        remote=self.default_url, cache=self.cached_path))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("input", help="Repository URL or path or name.")
    parser.add_argument("--log-level", default="INFO", choices=logging._nameToLevel,
                        help="Logging verbosity.")
    parser.add_argument("--id2vec", default=None, help="id2vec model URL or path.")
    parser.add_argument("--df", default=None,
                        help="Document frequencies URL or path.")
    parser.add_argument("--bow", default=None, help="BOW model URL or path.")
    parser.add_argument("--prune-df", default=20, type=int,
                        help="Minimum number of times an identifier must occur "
                             "in the dataset to be taken into account.")
    parser.add_argument("--vocabulary-min", default=50, type=int,
                        help="Minimum number of words in a bag.")
    parser.add_argument("--vocabulary-max", default=500, type=int,
                        help="Maximum number of words in a bag.")
    parser.add_argument("-n", "--nnn", default=10, type=int,
                        help="Number of nearest neighbours.")
    parser.add_argument("--early-stop", default=0.1, type=float,
                        help="Maximum fraction of the nBOW dataset to scan.")
    parser.add_argument("--max-time", default=300, type=int,
                        help="Maximum time to spend scanning in seconds.")
    parser.add_argument("--skipped-stop", default=0.95, type=float,
                        help="Minimum fraction of skipped samples to stop.")
    languages = ["Java", "Python", "Go", "JavaScript", "TypeScript", "Ruby",
                 "Bash", "Php"]
    parser.add_argument(
        "-l", "--languages", nargs="+", choices=languages, default=None,
        # The default for --languages must stay None: otherwise Parquet files
        # without a "lang" column would fail to process with any --languages
        # argument.
        help="The programming languages to analyse.")
    parser.add_argument("--blacklist-languages", action="store_true",
                        help="Exclude the languages in --languages from the "
                             "analysis instead of filtering by default.")
    parser.add_argument("-s", "--spark", default=SparkDefault.MASTER_ADDRESS,
                        help="Spark's master address.")
    parser.add_argument("--bblfsh", default=EngineDefault.BBLFSH,
                        help="Babelfish server's address.")
    parser.add_argument("--engine", default=EngineDefault.VERSION,
                        help="source{d} jgit-spark-connector version.")
    args = parser.parse_args()
    setup_logging(args.log_level)
    backend = create_backend()
    if args.id2vec is not None:
        args.id2vec = Id2Vec().load(source=args.id2vec, backend=backend)
    if args.df is not None:
        args.df = DocumentFrequencies().load(source=args.df, backend=backend)
    if args.bow is not None:
        args.bow = BOW().load(source=args.bow, backend=backend)
    sr = SimilarRepositories(
        id2vec=args.id2vec, df=args.df, nbow=args.bow,
        prune_df_threshold=args.prune_df,
        wmd_cache_centroids=False,  # useless for a single query
        wmd_kwargs={"vocabulary_min": args.vocabulary_min,
                    "vocabulary_max": args.vocabulary_max},
        languages=(args.languages, args.blacklist_languages),
        engine_kwargs={"spark": args.spark,
                       "bblfsh": args.bblfsh,
                       "engine": args.engine},
    )
    neighbours = sr.query(
        args.input, k=args.nnn, early_stop=args.early_stop,
        max_time=args.max_time, skipped_stop=args.skipped_stop)
    for index, rate in neighbours:
        print("%48s\t%.2f" % (index, rate))
def load(self, source: Union[str, BinaryIO, "Model"] = None, cache_dir: str = None,
         backend: StorageBackend = None, lazy=False) -> "Model":
    """
    Build a new Model instance.

    :param source: UUID, file system path, file object or a URL; None means auto.
    :param cache_dir: The directory where to store the downloaded model.
    :param backend: Remote storage backend to use if ``source`` is a UUID or a URL.
    :param lazy: Do not really load numpy arrays into memory. Instead, mmap() them. \
        The user is expected to call Model.close() when the tree is no longer needed.
    :return: self
    """
    if isinstance(source, Model):
        if not isinstance(source, type(self)):
            raise TypeError("Incompatible model instance: %s <> %s" %
                            (type(source), type(self)))
        self.__dict__ = source.__dict__
        return self
    if backend is not None and not isinstance(backend, StorageBackend):
        raise TypeError("backend must be an instance of "
                        "modelforge.storage_backend.StorageBackend")
    self._source = str(source)
    generic = self.NAME == self.GENERIC_NAME
    try:
        if source is None or (isinstance(source, str) and not os.path.isfile(source)):
            if cache_dir is None:
                if not generic:
                    cache_dir = os.path.join(vendor_cache_dir(), self.NAME)
                else:
                    cache_dir = tempfile.mkdtemp(prefix="modelforge-")
            try:
                uuid.UUID(source)
                is_uuid = True
            except (TypeError, ValueError):
                is_uuid = False
            model_id = self.DEFAULT_NAME if not is_uuid else source
            file_name = model_id + self.DEFAULT_FILE_EXT
            file_name = os.path.join(cache_dir, file_name)
            if os.path.exists(file_name) and (not source or not os.path.exists(source)):
                source = file_name
            elif source is None or is_uuid:
                if backend is None:
                    try:
                        backend = create_backend()
                    except ValueError as e:
                        raise ValueError(
                            "A backend must be set to load a UUID or the default "
                            "model. The attempt to create a backend with default "
                            "parameters failed.") from e
                index = backend.index.contents
                config = index["models"]
                if not generic:
                    if not is_uuid:
                        model_id = index["meta"][self.NAME][model_id]
                    source = config[self.NAME][model_id]
                else:
                    if not is_uuid:
                        raise ValueError("File path, URL or UUID is needed.")
                    for models in config.values():
                        if source in models:
                            source = models[source]
                            break
                    else:
                        raise FileNotFoundError("Model %s not found." % source)
                source = source["url"]
            if re.match(r"\w+://", source):
                download_http(source, file_name, self._log)
                self._source = source
                source = file_name
        if isinstance(source, str):
            size = os.stat(source).st_size
        else:
            self._source = "<file object>"
            pos = source.tell()
            size = source.seek(0, os.SEEK_END) - pos
            source.seek(pos, os.SEEK_SET)
        self._log.info("Reading %s (%s)...", source, humanize.naturalsize(size))
        model = asdf.open(source, copy_arrays=not lazy, lazy_load=lazy)
        try:
            tree = model.tree
            self._meta = tree["meta"]
            self._initial_version = list(self.version)
            if not generic:
                meta_name = self._meta["model"]
                matched = self.NAME == meta_name
                if not matched:
                    needed = {self.NAME}
                    for child in type(self).__subclasses__():
                        needed.add(child.NAME)
                        matched |= child.NAME == meta_name
                    if not matched:
                        raise ValueError(
                            "The supplied model is of the wrong type: needed "
                            "%s, got %s." % (needed, meta_name))
            self._load_tree(tree)
        finally:
            if not lazy:
                model.close()
            else:
                self._asdf = model
    finally:
        if generic and cache_dir is not None:
            shutil.rmtree(cache_dir)
    self._size = size
    return self
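# A sketch of the load() entry points implemented above. The file name, UUID and
# ".asdf" extension are placeholders; DocumentFrequencies stands in for any Model
# subclass.
df = DocumentFrequencies().load()  # default model, resolved through the backend index
df = DocumentFrequencies().load("docfreq.asdf")  # local file path
df = DocumentFrequencies().load("f64bacd4-67fb-4c64-8382-399a8e7db52a",
                                backend=create_backend())  # UUID lookup
lazy_df = DocumentFrequencies().load("docfreq.asdf", lazy=True)  # mmap the arrays
lazy_df.close()  # lazy models must be closed once the tree is no longer needed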
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("input", help="Repository URL or path or name.")
    parser.add_argument("--log-level", default="INFO", choices=logging._nameToLevel,
                        help="Logging verbosity.")
    parser.add_argument("--id2vec", default=None, help="id2vec model URL or path.")
    parser.add_argument("--df", default=None,
                        help="Document frequencies URL or path.")
    parser.add_argument("--nbow", default=None, help="nBOW model URL or path.")
    parser.add_argument("--no-cache-centroids", action="store_true",
                        help="Do not cache WMD centroids.")
    parser.add_argument("--bblfsh", default=None, help="Babelfish server address.")
    parser.add_argument(
        "--timeout", type=int, default=Repo2Base.DEFAULT_BBLFSH_TIMEOUT,
        help="Babelfish timeout - longer requests are dropped.")
    parser.add_argument("--gcs", default=None, help="GCS bucket to use.")
    parser.add_argument("--linguist", default=None,
                        help="Path to github/linguist or src-d/enry.")
    parser.add_argument("--vocabulary-min", default=50, type=int,
                        help="Minimum number of words in a bag.")
    parser.add_argument("--vocabulary-max", default=500, type=int,
                        help="Maximum number of words in a bag.")
    parser.add_argument("-n", "--nnn", default=10, type=int,
                        help="Number of nearest neighbours.")
    parser.add_argument("--early-stop", default=0.1, type=float,
                        help="Maximum fraction of the nBOW dataset to scan.")
    parser.add_argument("--max-time", default=300, type=int,
                        help="Maximum time to spend scanning in seconds.")
    parser.add_argument("--skipped-stop", default=0.95, type=float,
                        help="Minimum fraction of skipped samples to stop.")
    args = parser.parse_args()
    if args.linguist is None:
        args.linguist = "./enry"
    initialize(args.log_level, enry=args.linguist)
    if args.gcs:
        backend = create_backend(args="bucket=" + args.gcs)
    else:
        backend = create_backend()
    if args.id2vec is not None:
        args.id2vec = Id2Vec(source=args.id2vec, backend=backend)
    if args.df is not None:
        args.df = DocumentFrequencies(source=args.df, backend=backend)
    if args.nbow is not None:
        args.nbow = NBOW(source=args.nbow, backend=backend)
    sr = SimilarRepositories(id2vec=args.id2vec, df=args.df, nbow=args.nbow,
                             verbosity=args.log_level,
                             wmd_cache_centroids=not args.no_cache_centroids,
                             gcs_bucket=args.gcs,
                             repo2nbow_kwargs={"linguist": args.linguist,
                                               "bblfsh_endpoint": args.bblfsh,
                                               "timeout": args.timeout},
                             wmd_kwargs={"vocabulary_min": args.vocabulary_min,
                                         "vocabulary_max": args.vocabulary_max})
    neighbours = sr.query(args.input, k=args.nnn, early_stop=args.early_stop,
                          max_time=args.max_time, skipped_stop=args.skipped_stop)
    for index, rate in neighbours:
        print("%48s\t%.2f" % (index, rate))