def cb(frame_len, orth):
    """
    Per-sequence callback: skips sequences which reach the configured length limits,
    otherwise accumulates the counters and the symbol set in ``Stats``.

    :param frame_len: frame length of the sequence, compared against ``options.max_seq_frame_len``
    :param orth: orthography of the sequence, passed to ``parse_orthography``
    """
    # Length limits are exclusive: a sequence which hits the limit exactly is dropped too.
    if frame_len >= options.max_seq_frame_len:
        return
    symbols = parse_orthography(orth)
    if len(symbols) >= options.max_seq_orth_len:
        return

    Stats.count += 1
    Stats.total_frame_len += frame_len

    def log_orth(prefix):
        # Joined lazily, i.e. only when one of the options below actually asks for output.
        print(prefix, "".join(symbols), file=log.v3)

    if options.dump_orth_syms:
        log_orth("Orth:")
    if options.filter_orth_sym and options.filter_orth_sym in symbols:
        log_orth("Found orth:")
    if options.filter_orth_syms_seq:
        wanted_seq = parse_orthography_into_symbols(
            options.filter_orth_syms_seq)
        if found_sub_seq(wanted_seq, symbols):
            log_orth("Found orth:")

    Stats.orth_syms_set.update(symbols)
    Stats.total_orth_len += len(symbols)

    # Progress report, emitted only if more than 2 seconds passed since the last one.
    if time.time() - Stats.process_last_time <= 2:
        return
    Stats.process_last_time = time.time()
    if options.collect_time:
        label = "Collect process, total frame len so far:"
        # frame_time is divided by 1000 before use -- presumably given in milliseconds.
        shown = hms(Stats.total_frame_len * (options.frame_time / 1000.0))
    else:
        label = "Collect process, total orth len so far:"
        shown = human_size(Stats.total_orth_len)
    print(label, shown, file=log.v3)
Esempio n. 2
0
  def _callback(self, orth):
    """
    Corpus callback: parses one orthography into words and adds them to the running statistics.

    :param str orth:
    """
    words = parse_orthography(orth, prefix=[], postfix=[], word_based=True)
    self.seq_count += 1
    if self.options.dump_orth:
      print("Orth:", words, file=log.v3)

    self.words.update(words)
    self.total_word_len += len(words)

    # Progress report, emitted only if more than 2 seconds passed since the last one.
    seconds_since_report = time.time() - self.process_last_time
    if seconds_since_report <= 2:
      return
    self.process_last_time = time.time()
    print("Collect process, total word len so far:", human_size(self.total_word_len), file=log.v3)
Esempio n. 3
0
  def __init__(self, options, iter_corpus):
    """
    Initializes the counters, runs ``iter_corpus`` with ``self._callback`` and logs the summary.

    :param options: argparse.Namespace
    :param iter_corpus: callable which gets ``self._callback`` as its only argument;
      presumably invokes it once per sequence orthography
    """

    self.options = options
    self.seq_count = 0
    self.words = set()
    self.total_word_len = 0
    self.process_last_time = time.time()

    iter_corpus(self._callback)

    print("Total word len:", self.total_word_len, "(%s)" % human_size(self.total_word_len), file=log.v3)
    # seq_count is still 0 if no sequence was counted (e.g. empty corpus);
    # report NaN then instead of failing with ZeroDivisionError after all the work is done.
    if self.seq_count:
      average_len = float(self.total_word_len) / self.seq_count
    else:
      average_len = float("nan")
    print("Average orth len:", average_len, file=log.v3)
    print("Num word symbols:", len(self.words), file=log.v3)
def collect_stats(options, iter_corpus):
    """
    Collects orthography symbol statistics via ``iter_corpus``, logs a summary,
    and writes the sorted symbol set to ``options.output`` if that is set.

    :param options: argparse.Namespace
    :param iter_corpus: callable which gets the callback ``cb(frame_len, orth)`` as its only argument;
      presumably invokes it once per sequence
    """
    orth_symbols_filename = options.output
    if orth_symbols_filename:
        # Never overwrite an existing symbols file.
        assert not os.path.exists(orth_symbols_filename)

    class Stats:
        """Mutable counters shared between the callback and the summary below."""
        count = 0
        process_last_time = time.time()
        total_frame_len = 0
        total_orth_len = 0
        orth_syms_set = set()

    def ratio(numerator, denominator):
        """Float division which yields NaN instead of raising for a zero denominator."""
        if not denominator:
            return float("nan")
        return float(numerator) / denominator

    # Optionally pre-seed the symbol set with whole character ranges.
    if options.add_numbers:
        Stats.orth_syms_set.update(map(chr, range(ord("0"), ord("9") + 1)))
    if options.add_lower_alphabet:
        Stats.orth_syms_set.update(map(chr, range(ord("a"), ord("z") + 1)))
    if options.add_upper_alphabet:
        Stats.orth_syms_set.update(map(chr, range(ord("A"), ord("Z") + 1)))

    # The filter sequence does not change between callback invocations, so parse it only once.
    filter_seq = None
    if options.filter_orth_syms_seq:
        filter_seq = parse_orthography_into_symbols(
            options.filter_orth_syms_seq)

    def cb(frame_len, orth):
        """
        Per-sequence callback: skips sequences which reach the length limits,
        otherwise accumulates the counters and the symbol set in ``Stats``.
        """
        if frame_len >= options.max_seq_frame_len:
            return
        orth_syms = parse_orthography(orth)
        if len(orth_syms) >= options.max_seq_orth_len:
            return

        Stats.count += 1
        Stats.total_frame_len += frame_len

        if options.dump_orth_syms:
            print("Orth:", "".join(orth_syms), file=log.v3)
        if options.filter_orth_sym:
            if options.filter_orth_sym in orth_syms:
                print("Found orth:", "".join(orth_syms), file=log.v3)
        if options.filter_orth_syms_seq:
            if found_sub_seq(filter_seq, orth_syms):
                print("Found orth:", "".join(orth_syms), file=log.v3)
        Stats.orth_syms_set.update(orth_syms)
        Stats.total_orth_len += len(orth_syms)

        # Show some progress if it takes long.
        if time.time() - Stats.process_last_time > 2:
            Stats.process_last_time = time.time()
            if options.collect_time:
                print("Collect process, total frame len so far:",
                      hms(Stats.total_frame_len *
                          (options.frame_time / 1000.0)),
                      file=log.v3)
            else:
                print("Collect process, total orth len so far:",
                      human_size(Stats.total_orth_len),
                      file=log.v3)

    iter_corpus(cb)

    if options.remove_symbols:
        filter_syms = parse_orthography_into_symbols(options.remove_symbols)
        Stats.orth_syms_set -= set(filter_syms)

    if options.collect_time:
        print("Total frame len:",
              Stats.total_frame_len,
              "time:",
              hms(Stats.total_frame_len * (options.frame_time / 1000.0)),
              file=log.v3)
    else:
        print("No time stats (--collect_time False).", file=log.v3)
    # No newline here: the next print completes this line (fraction or empty).
    print("Total orth len:",
          Stats.total_orth_len,
          "(%s)" % human_size(Stats.total_orth_len),
          end=' ',
          file=log.v3)
    if options.collect_time:
        # total_frame_len can be 0 (nothing counted); ratio() reports NaN then.
        print("fraction:",
              ratio(Stats.total_orth_len, Stats.total_frame_len),
              file=log.v3)
    else:
        print("", file=log.v3)
    # count is 0 if the corpus was empty or every sequence was skipped by the length limits.
    print("Average orth len:",
          ratio(Stats.total_orth_len, Stats.count),
          file=log.v3)
    print("Num symbols:", len(Stats.orth_syms_set), file=log.v3)

    if orth_symbols_filename:
        # Context manager so the file gets closed even if a write fails.
        with open(orth_symbols_filename, "wb") as orth_syms_file:
            for orth_sym in sorted(Stats.orth_syms_set):
                orth_syms_file.write(
                    b"%s\n" % unicode(orth_sym).encode("utf8"))
        print("Wrote orthography symbols to",
              orth_symbols_filename,
              file=log.v3)
    else:
        print("Provide --output to save the symbols.", file=log.v3)