Example #1
 def get_samples_features(
         self) -> typing.List[typing.Tuple[str, typing.Dict[str, float]]]:
     """Return compiling samples with feature vectors"""
     with self.Session() as s:
         return [(x.text, extractor.RawToDictFeats(x.feature_vector))
                 for x in s.query(Sample).filter(
                     Sample.compile_status == True).yield_per(1000)]
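A minimal sketch of how the (text, feature-dict) pairs returned above might be consumed, for instance to average every feature across the compiled samples. The db handle and the average_features helper name are assumptions for illustration, not part of the original code.

from collections import defaultdict

def average_features(db):
    # Hypothetical consumer: average every feature over all compiled samples,
    # assuming db exposes get_samples_features() as defined above.
    sums, counts = defaultdict(float), defaultdict(int)
    for _, feats in db.get_samples_features():
        for name, value in feats.items():
            sums[name] += value
            counts[name] += 1
    return {name: sums[name] / counts[name] for name in sums}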
Example #2
def ContentFeat(db_feat: typing.Tuple[str, str]) -> typing.Dict[str, float]:
    """
  Multiprocessing Worker calculates contentfile hash
  of file and returns it.
  """
    if len(db_feat) == 2:
        _, feats = db_feat
    else:
        _, _, feats = db_feat
    try:
        return extractor.RawToDictFeats(feats)
    except Exception as e:
        l.logger().warn(e)
        return None
Example #3
def ContentHash(
    db_feat: typing.Tuple[str,
                          str]) -> typing.Tuple[str, typing.Dict[str, float]]:
    """
  Multiprocessing Worker calculates contentfile hash
  of file and returns it.
  """
    if len(db_feat) == 2:
        src, feats = db_feat
        include = None
    else:
        src, include, feats = db_feat
    try:
        return opencl.ContentHash(src), extractor.RawToDictFeats(feats)
    except Exception as e:
        l.logger().warn(e)
        return None
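Both workers above are meant to be mapped over a multiprocessing pool, which Example #5 below does with imap_unordered. A minimal driver sketch under that assumption; the hash_all name and the db_feats argument (a list of (src, feats) tuples) are hypothetical.

import multiprocessing

def hash_all(db_feats):
    # Hypothetical driver: fan ContentHash out over a worker pool and keep only
    # the results that parsed successfully (the worker returns None on failure).
    with multiprocessing.Pool() as pool:
        pairs = [r for r in pool.imap_unordered(ContentHash, db_feats) if r is not None]
    return dict(pairs)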
Example #4
  def endSample(self) -> None:
    """Write final summed data about sampling session."""
    ## Flush the final queue, if it exists.
    with self.db.Session(commit = True) as s:
      for sample in self.flush_queue:
        s.add(sample)
        if self.plot_sample_status:
          self.saturation_monitor.register(sample.id)
          self.saturation_monitor.plot()
      s.commit()
      self.sample_id += len(self.flush_queue)
      self.flush_queue = []

    # Create feature vector plots
    db_path = pathlib.Path(self.db.url[len("sqlite:///"):]).parent
    # feature_monitor = monitors.CategoricalDistribMonitor(db_path, "samples_feature_vector_distribution")

    feature_monitors = {
      ftype: monitors.CategoricalDistribMonitor(
                        db_path,
                        "{}_distribution".format(ftype)
                      )
      for ftype in extractor.extractors.keys()
    }

    # for sample in self.db.correct_samples:
    #   if sample.feature_vector:
    #     feature_monitor.register({l.split(':')[0:-1]: float(l.split(':')[-1])  for l in sample.feature_vector.split('\n')}) # This used to work only for Grewe. Needs expanding, see lm_data_generator.
    # feature_monitor.plot()

    for sample in self.db.correct_samples:
      if sample.feature_vector:
        features = extractor.RawToDictFeats(sample.feature_vector)
        for ftype, fvector in features.items():
          feature_monitors[ftype].register(fvector)

    for mon in feature_monitors.values():
      mon.plot()

    with self.db.Session() as session:
      compiled_count = session.query(samples_database.Sample.compile_status).filter_by(compile_status = 1).count()
    try:
      r = [
        'compilation rate: {}'.format(compiled_count / self.sample_id),
        'total compilable samples: {}'.format(compiled_count),
        'average feature vector: \n{}'.format('\n'.join(["{}:\n{}".format(ft, fm.getStrData()) for ft, fm in feature_monitors.items()]))
      ]
    except ZeroDivisionError:
      r = [
        'compilation rate: +/-inf',
        'total compilable samples: {}'.format(compiled_count),
        'average feature vector: \n{}'.format('\n'.join(["{}:\n{}".format(ft, fm.getStrData()) for ft, fm in feature_monitors.items()]))
      ]
    with self.db.Session(commit = True) as session:
      exists = session.query(samples_database.SampleResults.key).filter_by(key = "meta").scalar() is not None
      if exists:
        entry = session.query(samples_database.SampleResults).filter_by(key = "meta").first()
        entry.results = "\n".join(r)
      else:
        session.add(samples_database.SampleResults(key = "meta", results = "\n".join(r)))
    return
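A hedged sketch of how the "meta" summary row written at the end of endSample() could be read back later. Only the SampleResults model and the key come from the code above; the read_meta helper is an assumption.

def read_meta(db):
    # Hypothetical reader for the summary row endSample() stores under key "meta".
    with db.Session() as s:
        row = s.query(samples_database.SampleResults).filter_by(key = "meta").first()
        return row.results if row is not None else None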
Example #5
  def Import(
    self,
    session: sqlutil.Session,
    preprocessed_db: preprocessed.PreprocessedContentFiles,
    tokenizer: tokenizers.TokenizerBase,
    contentfile_separator: str,
  ) -> None:
    """Encode the successfully preprocessed contentfiles and import them into this database."""
    # if environment.WORLD_RANK == 0:
    if environment.WORLD_SIZE > 1:
      preprocessed_db = preprocessed_db.replicated
    with preprocessed_db.Session() as p_session:
      query = p_session.query(preprocessed.PreprocessedContentFile).filter(
        preprocessed.PreprocessedContentFile.preprocessing_succeeded == True,
      )
      done = set([int(x.id) for x in session.query(EncodedContentFile).all()])
      total_jobs = query.count() # - len(done)
      l.logger().info("Encoding {} of {} preprocessed files"
                          .format(
                              humanize.intcomma(total_jobs),
                              humanize.intcomma(
                                p_session.query(preprocessed.PreprocessedContentFile)
                                .filter(preprocessed.PreprocessedContentFile.preprocessing_succeeded == True)
                                .count()
                              )
                          )
                        )
      chunk, idx = 2000000, 0
      if environment.WORLD_SIZE > 1:
        bar = distrib.ProgressBar(total = total_jobs, offset = idx, desc = "Encoding DB")
      else:
        bar = tqdm.tqdm(total = total_jobs, desc = "Encoding DB", leave = True)
      wall_time_start = time.time()
      while idx < total_jobs:
        try:
          # Skip contentfiles that have already been encoded in a previous run.
          if done:
            batch = []
            for f in query.limit(chunk).offset(idx).all():
              if f.id not in done:
                batch.append(f)
              else:
                idx += 1
                # done.remove(f.id)
          else:
            batch = query.limit(chunk).offset(idx).all()
          pool = multiprocessing.Pool()
          last_commit = time.time()
          for encoded_cf in pool.imap_unordered(
                              functools.partial(EncoderWorker,
                                                tokenizer = tokenizer,
                                                contentfile_separator = contentfile_separator,
                                                is_pre_train = self.is_pre_train,
                                                ),
                              batch
                            ):
            wall_time_end = time.time()
            if encoded_cf:
              encoded_cf.wall_time_ms = int(
                (wall_time_end - wall_time_start) * 1000
              )
              session.add(encoded_cf)
              self.length_monitor.register(encoded_cf.tokencount)
              if not self.is_pre_train:
                self.token_monitor.register([tokenizer.decoder[int(x)] for x in encoded_cf.data.split('.')])

                dict_features = extractor.RawToDictFeats(encoded_cf.feature_vector)
                if dict_features:
                  for key, value in dict_features.items():
                    self.feature_monitors[key].register(value)
            wall_time_start = wall_time_end
            if wall_time_end - last_commit > 10:
              session.commit()
              last_commit = wall_time_end
            idx += 1
            bar.update(idx - bar.n)
          pool.close()
        except KeyboardInterrupt as e:
          pool.terminate()
          self.length_monitor.plot()
          if not self.is_pre_train:
            self.token_monitor.plot()
            for m in self.feature_monitors.values():
              m.plot()
          raise e
        except Exception as e:
          l.logger().error(e, ddp_nodes = True)
          pool.terminate()
          self.length_monitor.plot()
          if not self.is_pre_train:
            self.token_monitor.plot()
            for m in self.feature_monitors.values():
              m.plot()
          raise e
      self.length_monitor.plot()
      if not self.is_pre_train:
        self.token_monitor.plot()
        for m in self.feature_monitors.values():
          m.plot()
      session.commit()
      if environment.WORLD_SIZE > 1:
        bar.finalize(idx)
    return
Example #6
  def features(self) -> typing.Dict[str, float]:
    """Return this record's feature vector parsed into a dictionary."""
    return extractor.RawToDictFeats(self.feature_vector)
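A minimal sketch of how the features() accessor might be used, for example to count how often each feature name appears among compiled samples. The session argument and the Sample query mirror Example #1; everything else is an assumption.

def feature_name_counts(session):
    # Hypothetical helper: tally every feature name seen across compiled samples.
    counts = {}
    for sample in session.query(Sample).filter(Sample.compile_status == True):
        for name in sample.features().keys():
            counts[name] = counts.get(name, 0) + 1
    return counts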