Ejemplo n.º 1
0
def parseCorpus(workspace_path):
    """Collect metadata for every encoded corpus found in a workspace.

    Scans ``<workspace_path>/corpus/encoded`` and, for every entry in that
    directory, opens its ``encoded.db`` database (which must already exist)
    and builds a summary dictionary. Each summary is also stored in the
    module-level ``cached_corpuses`` mapping, keyed by the SHA-256 of the
    workspace name concatenated with the entry name.

    Args:
      workspace_path: A pathlib-style path to the workspace root.

    Returns:
      A list of dictionaries with the keys 'path', 'sha', 'datapoint_count',
      'summary' and 'models'. The list is empty when the workspace has no
      ``corpus/encoded`` directory.
    """
    corpus_path = workspace_path / "corpus" / "encoded"
    if not corpus_path.exists():
        return []

    corpuses = []
    for corpus_sha in corpus_path.iterdir():
        encoded_db = encoded.EncodedContentFiles(
            "sqlite:///{}".format(corpus_sha / "encoded.db"), must_exist=True)
        sha = str(corpus_sha.stem)
        # Read the size once; it is used for both the count and the summary.
        datapoint_count = encoded_db.size
        corpus = {
            # iterdir() already yields entries prefixed with corpus_path.
            # Joining them onto corpus_path again would duplicate the prefix
            # whenever workspace_path is relative.
            'path': corpus_sha,
            'sha': sha,
            'datapoint_count': datapoint_count,
            'summary': "{} datapoint corpus, {}".format(datapoint_count, sha),
            'models': parseModels(workspace_path, sha),
        }
        # Item assignment mutates the module-level cache in place, so no
        # `global` declaration is required.
        cache_key = crypto.sha256_str(
            str(workspace_path.name) + str(corpus_sha.name))
        cached_corpuses[cache_key] = corpus
        corpuses.append(corpus)
    return corpuses
Ejemplo n.º 2
0
def encoded_contentfile(corpus_id: int, encoded_id: int):
    """Render the dashboard page for one encoded contentfile of a corpus.

    Looks up the URL of the corpus's encoded database, loads the requested
    encoded contentfile from it, decodes its indices with the vocabulary
    stored in the database's meta table, and renders the
    "encoded_contentfile.html" template.

    Args:
      corpus_id: ID of the dashboard_db.Corpus row to look up.
      encoded_id: ID of the encoded.EncodedContentFile row to display.

    Returns:
      The value returned by flask.render_template().
    """
    url_query = db.session.query(dashboard_db.Corpus.encoded_url)
    (encoded_url, ) = url_query.filter(
        dashboard_db.Corpus.id == corpus_id).one()

    encoded_db = encoded.EncodedContentFiles(encoded_url, must_exist=True)

    with encoded_db.Session() as session:
        contentfile_query = session.query(encoded.EncodedContentFile).filter(
            encoded.EncodedContentFile.id == encoded_id)
        contentfile = contentfile_query.limit(1).one()
        indices = contentfile.indices_array

        # Invert the meta-table vocabulary so that it can be looked up by the
        # values stored in the indices array.
        meta_vocab = encoded.EncodedContentFiles.GetVocabFromMetaTable(session)
        inverted_vocab = {value: key for key, value in meta_vocab.items()}
        tokens = [inverted_vocab[index] for index in indices]

        encoded_cf = {
            "id": contentfile.id,
            "tokencount": humanize.Commas(contentfile.tokencount),
            "indices": indices,
            "text": "".join(tokens),
            "tokens": tokens,
        }
        vocab = {
            "table": list(inverted_vocab.items()),
            "size": len(inverted_vocab),
        }

    data = GetBaseTemplateArgs()
    data["encoded"] = encoded_cf
    data["vocab"] = vocab
    data["urls"]["view_encoded_file"] = f"/corpus/{corpus_id}/encoded/random/"
    return flask.render_template("encoded_contentfile.html", **data)
Ejemplo n.º 3
0
def main(*args):
    """Load the encoded database and tokenizer, extract features and train.

    Opens the existing encoded database at ENCODED_DB_PATH, loads the
    tokenizer stored at TOKENIZER_PATH, and hands the feature vectors from
    get_data_features() to Train().

    Args:
      *args: Unused.
    """
    db_url = f"sqlite:///{ENCODED_DB_PATH}"
    db = encoded.EncodedContentFiles(url=db_url, must_exist=True)

    tokenizer_file = pathlib.Path(TOKENIZER_PATH).resolve()
    tokenizer = tokenizers.TokenizerBase.FromFile(tokenizer_file)

    Train(get_data_features(db, tokenizer))
Ejemplo n.º 4
0
def random_encoded_contentfile(corpus_id: int):
    """Redirect to the page of a randomly selected encoded contentfile.

    Looks up the URL of the corpus's encoded database, picks the ID of one
    encoded contentfile using the database's Random() ordering, and issues
    an HTTP 302 redirect to that contentfile's page.

    Args:
      corpus_id: ID of the dashboard_db.Corpus row to look up.

    Returns:
      The value returned by flask.redirect().
    """
    url_query = db.session.query(dashboard_db.Corpus.encoded_url)
    (encoded_url, ) = url_query.filter(
        dashboard_db.Corpus.id == corpus_id).one()

    encoded_db = encoded.EncodedContentFiles(encoded_url, must_exist=True)

    with encoded_db.Session() as session:
        id_query = session.query(encoded.EncodedContentFile.id)
        (random_id, ) = id_query.order_by(encoded_db.Random()).limit(1).one()

    target = f"/corpus/{corpus_id}/encoded/{random_id}/"
    return flask.redirect(target, code=302)
Ejemplo n.º 5
0
  def _CreateTokenizer(self) -> tokenizers.TokenizerBase:
    """Create a tokenizer and write it to self.tokenizer_path.

    When the config sets "pre_encoded_corpus_url", the tokenizer is derived
    from that encoded database; otherwise it is built from the text corpus.

    Returns:
      The newly created tokenizer.
    """
    # Fetched unconditionally, although only the text-based branch uses it.
    # NOTE(review): could be deferred to that branch if GetTextCorpus() has no
    # side effects that the pre-encoded path relies on - confirm.
    text = self.GetTextCorpus(shuffle=False)
    cfg = self.config

    if not cfg.HasField("pre_encoded_corpus_url"):
      tokenizer = tokenizers.FromText(
        cfg.tokenizer, cfg.contentfile_separator, text
      )
    else:
      pre_encoded_db = encoded.EncodedContentFiles(
        cfg.pre_encoded_corpus_url, self.pre_train
      )
      tokenizer = WordTokenizerFromEncodedDb(cfg.tokenizer, pre_encoded_db)

    tokenizer.ToFile(self.tokenizer_path)
    return tokenizer
Ejemplo n.º 6
0
    def _CreateAtomizer(self) -> atomizers.AtomizerBase:
        """Create an atomizer and write it to self.atomizer_path.

        The kind of atomizer is selected by whichever of these config fields
        is set, checked in this order: "ascii_character_atomizer",
        "greedy_multichar_atomizer", "pre_encoded_corpus_url".

        Returns:
          The newly created atomizer.

        Raises:
          NotImplementedError: If none of the fields above is set.
        """
        app.Log(1, "Deriving atomizer from preprocessed corpus")
        text = self.GetTextCorpus(shuffle=False)
        cfg = self.config

        if cfg.HasField("ascii_character_atomizer"):
            atomizer = atomizers.AsciiCharacterAtomizer.FromText(text)
        elif cfg.HasField("greedy_multichar_atomizer"):
            multichar_tokens = set(cfg.greedy_multichar_atomizer.tokens)
            atomizer = atomizers.GreedyAtomizer.FromText(text,
                                                         multichar_tokens)
        else:
            if not cfg.HasField("pre_encoded_corpus_url"):
                raise NotImplementedError
            pre_encoded_db = encoded.EncodedContentFiles(
                cfg.pre_encoded_corpus_url)
            atomizer = GreedyAtomizerFromEncodedDb(pre_encoded_db)

        atomizer.ToFile(self.atomizer_path)
        return atomizer
Ejemplo n.º 7
0
def _plot_feature_value_distribution(samples, name):
  """Bucket all feature values of `samples` into multiples of 4 and plot them.

  Args:
    samples: Iterable of 3-tuples whose last element is a mapping of feature
      values; the first two elements are ignored.
    name: Third argument passed to distributions.GenericDistribution.
  """
  flat_vals = []
  for _, _, vec in samples:
    for v in vec.values():
      try:
        flat_vals.append(4 * int(v // 4))
      except (TypeError, ValueError, ArithmeticError):
        # Values that cannot be bucketed (non-numeric, NaN, infinite) are
        # skipped on purpose; anything else is a real error and propagates.
        continue
  distributions.GenericDistribution(flat_vals, "feature_vals", name).plot()


def main(*args):
  """Plot the distribution of bucketed feature values per feature space.

  For each feature space, features are read from the encoded database at
  ENCODED_DB_PATH through get_data_features() and plotted. This happens twice:
  once with the default arguments and once with an extra argument of 768,
  which is plotted under the name "<feature space>_768".

  Args:
    *args: Unused.
  """
  db = encoded.EncodedContentFiles(url="sqlite:///{}".format(ENCODED_DB_PATH), must_exist=True)
  tokenizer = tokenizers.TokenizerBase.FromFile(pathlib.Path(TOKENIZER_PATH).resolve())

  # A tuple keeps the plotting order identical between runs; iterating a set
  # of strings depends on hash randomization.
  feature_spaces = ("GreweFeatures", "AutophaseFeatures", "InstCountFeatures")

  for fspace in feature_spaces:
    _plot_feature_value_distribution(get_data_features(fspace, db, tokenizer), fspace)

  # NOTE(review): the meaning of the extra 768 argument is defined by
  # get_data_features; confirm against its signature.
  for fspace in feature_spaces:
    _plot_feature_value_distribution(
      get_data_features(fspace, db, tokenizer, 768), "{}_768".format(fspace)
    )
  return
Ejemplo n.º 8
0
def abc_pre_encoded() -> str:
    """Test fixture that yields the URL of a database of two encoded files.

    The database is created as "encoded.db" inside a temporary directory,
    which stays in place for as long as the fixture is suspended at its yield.
    """
    # (data, tokencount) of each encoded content file added to the database.
    contentfiles = (
        ("0.1.2.0.1", 5),
        ("2.2.2", 3),
    )
    with tempfile.TemporaryDirectory(
            prefix="phd_deeplearning_clgen_corpuses_") as d:
        db = encoded.EncodedContentFiles(f"sqlite:///{d}/encoded.db")
        with db.Session(commit=True) as s:
            for data, tokencount in contentfiles:
                row = encoded.EncodedContentFile(
                    data=data,
                    tokencount=tokencount,
                    encoding_time_ms=10,
                    wall_time_ms=10,
                    date_added=datetime.datetime.utcnow(),
                )
                s.add(row)
        yield db.url
Ejemplo n.º 9
0
def temp_db() -> encoded.EncodedContentFiles:
    """A test fixture which yields an EncodedContentFiles db in a fresh tempdir.

    The database URL points at "test.db" inside a temporary directory, which
    stays in place for as long as the fixture is suspended at its yield.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        db_url = f"sqlite:///{tmpdir}/test.db"
        db = encoded.EncodedContentFiles(db_url)
        yield db
Ejemplo n.º 10
0
    def __init__(self, config: corpus_pb2.Corpus):
        """Instantiate a corpus from a proto config.

        Resolves the content, preprocessed and encoded IDs of the corpus,
        creates the matching cache directories, opens the preprocessed and
        encoded databases, and links the cache directories together with
        symlinks.

        If this is a new corpus, a number of files will be created, which may
        take some time.

        Args:
          config: A Corpus message.

        Raises:
          TypeError: If the config argument is not a Corpus proto.
          UserError: If the config sets "content_id" but no preprocessed
            database file exists for it.
          EmptyCorpusException: In case the corpus contains no data (not
            raised directly here; presumably by a helper - TODO confirm).
        """
        if not isinstance(config, corpus_pb2.Corpus):
            t = type(config).__name__
            raise TypeError(f"Config must be a Corpus proto. Received: '{t}'")

        # Make a local copy of the configuration.
        self.config = corpus_pb2.Corpus()
        self.config.CopyFrom(AssertConfigIsValid(config))
        self._atomizer = None
        self._created = False
        self.dashboard_db = dashboard_db.GetDatabase()
        self._dashboard_db_id: typing.Optional[int] = None  # Set in Create()

        # An in-memory cache of the encoded contentfiles indices arrays.
        # Set and used in GetTrainingData().
        self._indices_arrays: typing.Optional[typing.List[np.array]] = None

        cache.cachepath("corpus").mkdir(parents=True, exist_ok=True)
        hc = hashcache.HashCache(cache.cachepath("hashcache.db"), "sha1")
        self.content_id = ResolveContentId(self.config, hc)
        # Database of pre-processed files.
        preprocessed_id = ResolvePreprocessedId(self.content_id, self.config)
        cache.cachepath("corpus", "preprocessed",
                        preprocessed_id).mkdir(exist_ok=True, parents=True)
        preprocessed_db_path = cache.cachepath("corpus", "preprocessed",
                                               preprocessed_id,
                                               "preprocessed.db")
        # A config that names a content_id must refer to a preprocessed
        # database file that already exists.
        if (self.config.HasField("content_id")
                and not preprocessed_db_path.is_file()):
            raise errors.UserError(
                f"Content ID not found: '{self.content_id}'")
        self.preprocessed = preprocessed.PreprocessedContentFiles(
            f"sqlite:///{preprocessed_db_path}")
        # Create symlink to contentfiles.
        # The "sqlite:///" prefix is sliced off the URL to recover the
        # database path; the link is placed next to the database file.
        symlink = (
            pathlib.Path(self.preprocessed.url[len("sqlite:///"):]).parent /
            "contentfiles")
        if not symlink.is_symlink():
            if config.HasField("local_directory"):
                os.symlink(
                    str(
                        ExpandConfigPath(
                            config.local_directory,
                            path_prefix=FLAGS.clgen_local_path_prefix)),
                    symlink,
                )
            elif config.HasField("local_tar_archive"):
                os.symlink(
                    str(
                        ExpandConfigPath(
                            config.local_tar_archive,
                            path_prefix=FLAGS.clgen_local_path_prefix,
                        )),
                    symlink,
                )
        # Database of encoded preprocessed files.
        encoded_id = ResolveEncodedId(self.content_id, self.config)
        cache.cachepath("corpus", "encoded", encoded_id).mkdir(exist_ok=True,
                                                               parents=True)
        db_path = cache.cachepath("corpus", "encoded", encoded_id,
                                  "encoded.db")
        # TODO(github.com/ChrisCummins/clgen/issues/130): Refactor this conditional
        # logic by making Corpus an abstract class and creating concrete subclasses
        # for the different types of corpus.
        # A pre-encoded corpus is opened from the URL given in the config
        # instead of the encoded.db in the cache directory.
        if self.config.HasField("pre_encoded_corpus_url"):
            self.encoded = encoded.EncodedContentFiles(
                config.pre_encoded_corpus_url)
        else:
            self.encoded = encoded.EncodedContentFiles(f"sqlite:///{db_path}")
        self.atomizer_path = cache.cachepath("corpus", "encoded", encoded_id,
                                             "atomizer.pkl")
        # Create symlink to preprocessed files.
        # TODO(github.com/ChrisCummins/clgen/issues/130): Refactor this conditional
        # logic after splitting Corpus class.
        if not self.config.HasField("pre_encoded_corpus_url"):
            symlink = (
                pathlib.Path(self.encoded.url[len("sqlite:///"):]).parent /
                "preprocessed")
            if not symlink.is_symlink():
                # The link target is expressed relative to the directory of
                # the encoded database.
                os.symlink(
                    os.path.relpath(
                        pathlib.Path(
                            self.preprocessed.url[len("sqlite:///"):]).parent,
                        pathlib.Path(
                            self.encoded.url[len("sqlite:///"):]).parent,
                    ),
                    symlink,
                )
        self.hash = encoded_id
        self.cache = cache.mkcache("corpus", "encoded", encoded_id)
Ejemplo n.º 11
0
    def __init__(self, config: corpus_pb2.Corpus):
        """Instantiate a corpus from a proto config.

        Resolves the content, preprocessed and encoded IDs of the corpus,
        creates the matching cache directories, opens the preprocessed and
        encoded databases, and links the cache directories together with
        symlinks.

        If this is a new corpus, a number of files will be created, which may
        take some time.

        Args:
          config: A Corpus message.

        Raises:
          TypeError: If the config argument is not a Corpus proto.
          UserError: If the config sets 'content_id' but no preprocessed
            database file exists for it.
        """
        if not isinstance(config, corpus_pb2.Corpus):
            t = type(config).__name__
            raise TypeError(f"Config must be a Corpus proto. Received: '{t}'")

        # Work on a validated private copy of the configuration.
        validated_config = AssertConfigIsValid(config)
        self.config = corpus_pb2.Corpus()
        self.config.CopyFrom(validated_config)
        self._atomizer = None
        self._created = False

        cache.cachepath('corpus').mkdir(parents=True, exist_ok=True)
        hash_cache = hashcache.HashCache(cache.cachepath('hashcache.db'),
                                         'sha1')
        self.content_id = ResolveContentId(self.config, hash_cache)

        # Database of pre-processed files.
        preprocessed_id = ResolvePreprocessedId(self.content_id, self.config)
        preprocessed_cache_dir = cache.cachepath('corpus', 'preprocessed',
                                                 preprocessed_id)
        preprocessed_cache_dir.mkdir(exist_ok=True, parents=True)
        preprocessed_db_path = cache.cachepath('corpus', 'preprocessed',
                                               preprocessed_id,
                                               'preprocessed.db')
        content_id_given = self.config.HasField('content_id')
        if content_id_given and not preprocessed_db_path.is_file():
            raise errors.UserError(
                f"Content ID not found: '{self.content_id}'")
        self.preprocessed = preprocessed.PreprocessedContentFiles(
            preprocessed_db_path)

        # Link the original contentfiles next to the preprocessed database.
        # Only the first field that is set is used.
        contentfiles_link = (self.preprocessed.database_path.parent /
                             'contentfiles')
        if not contentfiles_link.is_symlink():
            for field in ('local_directory', 'local_tar_archive'):
                if config.HasField(field):
                    source = ExpandConfigPath(
                        getattr(config, field),
                        path_prefix=FLAGS.clgen_local_path_prefix)
                    os.symlink(str(source), contentfiles_link)
                    break

        # Database of encoded preprocessed files.
        encoded_id = ResolveEncodedId(self.content_id, self.config)
        encoded_cache_dir = cache.cachepath('corpus', 'encoded', encoded_id)
        encoded_cache_dir.mkdir(exist_ok=True, parents=True)
        encoded_db_path = cache.cachepath('corpus', 'encoded', encoded_id,
                                          'encoded.db')
        self.encoded = encoded.EncodedContentFiles(encoded_db_path)
        self.atomizer_path = cache.cachepath('corpus', 'encoded', encoded_id,
                                             'atomizer.pkl')

        # Link the preprocessed directory next to the encoded database, using
        # a target relative to the encoded database's directory.
        encoded_dir = self.encoded.database_path.parent
        preprocessed_link = encoded_dir / 'preprocessed'
        if not preprocessed_link.is_symlink():
            relative_target = os.path.relpath(
                self.preprocessed.database_path.parent, encoded_dir)
            os.symlink(relative_target, preprocessed_link)

        self.hash = encoded_id
        self.cache = cache.mkcache('corpus', 'encoded', encoded_id)
Ejemplo n.º 12
0
def encoded_db(tempdir: pathlib.Path) -> encoded.EncodedContentFiles:
    """Return an EncodedContentFiles database with the URL <tempdir>/encoded.

    Args:
      tempdir: Directory in which the database is placed.
    """
    url = f"sqlite:///{tempdir}/encoded"
    return encoded.EncodedContentFiles(url)
Ejemplo n.º 13
0
  def __init__(self, config: typing.Union[corpus_pb2.Corpus, corpus_pb2.PreTrainCorpus]):
    """Instantiate a corpus from a proto config.

    Resolves the content, preprocessed and encoded IDs of the corpus, opens
    the preprocessed and encoded databases, and records the cache location.
    Directory creation, symlink creation and commit saving are performed only
    when environment.WORLD_RANK == 0; each such section is followed by a
    distrib.barrier() call that every rank executes.

    If this is a new corpus, a number of files will be created, which may
    take some time.

    Args:
      config: A Corpus or PreTrainCorpus message.

    Raises:
      TypeError: If the config argument is neither a Corpus nor a
        PreTrainCorpus proto.
      ValueError: If the config sets "content_id" but no preprocessed
        database file exists for it.
      EmptyCorpusException: In case the corpus contains no data (not raised
        directly here; presumably by a helper - TODO confirm).
    """
    if not isinstance(config, corpus_pb2.Corpus) and not isinstance(config, corpus_pb2.PreTrainCorpus):
      raise TypeError(f"Config must be a Corpus proto. Received: '{type(config).__name__}'")

    # Make a local copy of the configuration.
    # self.pre_train records which of the two proto types was supplied.
    if isinstance(config, corpus_pb2.Corpus):
      self.config    = corpus_pb2.Corpus()
      self.pre_train = False
    else:
      self.config    = corpus_pb2.PreTrainCorpus()
      self.pre_train = True

    self.config.CopyFrom(AssertConfigIsValid(config))
    self._tokenizer = None
    self._created = False

    # An in-memory cache of the encoded contentfiles indices arrays.
    # Set and used in GetTrainingData().
    self._indices_arrays: typing.Optional[typing.List[np.array]] = None

    # Only rank 0 creates the directory.
    # NOTE(review): distrib.barrier() presumably makes the other ranks wait
    # for it - confirm against the distrib module.
    if environment.WORLD_RANK == 0:
      cache.cachepath("corpus").mkdir(parents=True, exist_ok=True)
    distrib.barrier()
    self.content_id = ResolveContentId(self.config)
    # Database of pre-processed files.
    preprocessed_id = ResolvePreprocessedId(self.content_id, self.config)
    if environment.WORLD_RANK == 0:
      cache.cachepath("corpus", "preprocessed", preprocessed_id).mkdir(exist_ok=True, parents=True)
    distrib.barrier()
    preprocessed_db_path = cache.cachepath("corpus", "preprocessed",
                                           preprocessed_id, "preprocessed.db")

    # A config that names a content_id must refer to a preprocessed database
    # file that already exists.
    if self.config.HasField("content_id") and not preprocessed_db_path.is_file():
      raise ValueError(f"Content ID not found: '{self.content_id}'")
    self.preprocessed = preprocessed.PreprocessedContentFiles(
      f"sqlite:///{preprocessed_db_path}"
    )
    # Create symlink to contentfiles.
    # The "sqlite:///" prefix is sliced off the URL to recover the database
    # path; the link is placed next to the database file.
    if environment.WORLD_RANK == 0:
      symlink = (pathlib.Path(self.preprocessed.url[len("sqlite:///") :]).parent / "contentfiles")
      if not symlink.is_symlink():
        if config.HasField("local_directory"):
          os.symlink(
            str(ExpandConfigPath(config.local_directory,   path_prefix=FLAGS.clgen_local_path_prefix)),
            symlink,
          )
        elif config.HasField("local_tar_archive"):
          os.symlink(
            str(ExpandConfigPath(config.local_tar_archive, path_prefix=FLAGS.clgen_local_path_prefix)),
            symlink,
          )
        elif config.HasField("bq_database"):
          os.symlink(
            str(ExpandConfigPath(config.bq_database, path_prefix=FLAGS.clgen_local_path_prefix)),
            symlink,
          )
        # NOTE(review): the "fetch_github" case below is disabled; no symlink
        # is created for such configs.
        # elif config.HasField("fetch_github"):
        #   os.symlink(
        #     str(ExpandConfigPath(config.fetch_github, path_prefix=FLAGS.clgen_local_path_prefix)),
        #     symlink,
        #   )
    distrib.barrier()
    # Database of encoded preprocessed files.
    encoded_id = ResolveEncodedId(self.content_id, self.config)
    if environment.WORLD_RANK == 0:
      cache.cachepath("corpus", "encoded", encoded_id).mkdir(exist_ok=True, parents=True)
    distrib.barrier()
    db_path = cache.cachepath("corpus", "encoded", encoded_id, "encoded.db")
    # A pre-encoded corpus is opened from the URL given in the config instead
    # of the encoded.db in the cache directory.
    if self.config.HasField("pre_encoded_corpus_url"):
      self.encoded = encoded.EncodedContentFiles(config.pre_encoded_corpus_url, self.pre_train)
    else:
      self.encoded = encoded.EncodedContentFiles(f"sqlite:///{db_path}", self.pre_train)
    self.tokenizer_path = cache.cachepath(
      "corpus", "encoded", encoded_id, "tokenizer.pkl"
    )
    # Create symlink to preprocessed files (skipped for pre-encoded corpora).
    # The link target is expressed relative to the encoded database's
    # directory.
    if environment.WORLD_RANK == 0 and not self.config.HasField("pre_encoded_corpus_url"):
      symlink = (pathlib.Path(self.encoded.url[len("sqlite:///") :]).parent / "preprocessed")
      if not symlink.is_symlink():
        os.symlink(
          os.path.relpath(
            pathlib.Path(self.preprocessed.url[len("sqlite:///") :]).parent,
            pathlib.Path(self.encoded.url[len("sqlite:///") :]).parent,
            ),
          symlink,
        )
    self.hash = encoded_id
    self.cache = cache.mkcache("corpus", "encoded", encoded_id)
    # Rank 0 calls commit.saveCommit() for both the encoded cache directory
    # and the matching preprocessed directory.
    if environment.WORLD_RANK == 0:
      commit.saveCommit(self.cache.path)
      commit.saveCommit(self.cache.path.parent.parent / "preprocessed" / preprocessed_id)
    distrib.barrier()
    l.logger().info("Initialized {}train corpus in {}".format("pre_" if self.pre_train else "", self.cache.path))
    return