Example #1
def plot_speech_features(feature_matrix,
                         file_name=None,
                         vertical=True,
                         ax=None,
                         length=8.0,
                         dpi=100):
    """Plot speech feature matrix.

  Args:
    feature_matrix: a two-dimensional numpy array of values between zero and one,
      where rows correspond to source words, and columns correspond to target words
    file_name: the name of the file to which we write the attention; if not given, the plt context will be left un-closed
    vertical: if True, the time dimension will be projected onto the y axis, otherwise the x axis
    ax: if given, draw on this matplotlib axis; otherwise create a new figure
    length: figure length (if ax is not given)
    dpi: plot resolution
  """
    if not ax:
        plt.subplots(figsize=(1.0, length))
    if vertical: feature_matrix = feature_matrix[:, ::-1].T
    if ax:
        ax.pcolor(feature_matrix, cmap=plt.cm.jet, vmin=-1, vmax=1)
        ax.axis('off')
    else:
        plt.pcolor(feature_matrix, cmap=plt.cm.jet, vmin=-1, vmax=1)
        plt.axis('off')
    if file_name is not None:
        utils.make_parent_dir(file_name)
        plt.savefig(file_name, dpi=dpi)
        plt.close()
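
Every example on this page calls utils.make_parent_dir before writing a file. The helper itself is not shown here; the following is a minimal sketch of what it presumably does, assuming it simply wraps os.makedirs on the directory part of the path (the actual xnmt.utils implementation may differ):

import os

def make_parent_dir(file_name):
    """Create the parent directory of file_name if it does not already exist."""
    parent = os.path.dirname(file_name)
    if parent:
        os.makedirs(parent, exist_ok=True)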
Example #2
def set_out_file(out_file, exp_name):
    """
  Set the file to log to. Before calling this, logs are only passed to stdout/stderr.
  Args:
    out_file: file name
    exp_name: name of experiment
  """
    unset_out_file()
    utils.make_parent_dir(out_file)
    with open(out_file, mode="w") as f_out:
        for line in _preamble_content:
            f_out.write(f"{line}\n")
    fh = logging.FileHandler(out_file, encoding="utf-8")
    fh.setLevel(settings.LOG_LEVEL_FILE)
    fh.setFormatter(MainFormatter())
    logger.addHandler(fh)
    logger_file.addHandler(fh)
    yaml_fh = logging.FileHandler(f"{out_file}.yaml",
                                  mode='w',
                                  encoding="utf-8")
    yaml_fh.setLevel(logging.DEBUG)
    yaml_fh.setFormatter(YamlFormatter())
    yaml_logger.addHandler(yaml_fh)
    tensorboard_writer.set_out_file(f"{out_file}.tb", exp_name=exp_name)
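
A usage sketch for set_out_file; the log path and experiment name are invented for illustration, and the module-level logger objects shown above are assumed to be configured:

# hypothetical call; "logs/exp1.log" and "exp1" are illustrative values
set_out_file("logs/exp1.log", exp_name="exp1")
logger.info("logged to stdout/stderr and to logs/exp1.log from here on")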
Example #3
 def conclude_report(self) -> None:
     if self.hyp_sents:
         ref_filename = os.path.join(self.report_path, "tmp",
                                     "compare-mt.ref")
         out_filename = os.path.join(self.report_path, "tmp",
                                     "compare-mt.out")
         utils.make_parent_dir(out_filename)
         with open(ref_filename, "w") as fout:
             for l in self.ref_sents:
                 fout.write(f"{l.strip()}\n")
         with open(out_filename, "w") as fout:
             for l in self.hyp_sents:
                 fout.write(f"{l.strip()}\n")
         import xnmt.thirdparty.comparemt.compare_mt as compare_mt
         args = utils.ArgClass(ref_file=ref_filename,
                               out_file=out_filename,
                               out2_file=self.out2_file,
                               train_file=self.train_file,
                               train_counts=self.train_counts,
                               alpha=self.alpha,
                               ngram=self.ngram,
                               ngram_size=self.ngram_size,
                               sent_size=self.sent_size)
         out_lines = compare_mt.main(args)
         report_filename = os.path.join(self.report_path, "compare-mt.txt")
         utils.make_parent_dir(report_filename)
         with open(report_filename, "w") as fout:
             for l in out_lines:
                 fout.write(f"{l}\n")
         self.hyp_sents, self.ref_sents, self.src_sents = [], [], []
Example #4
  def perform_inference(self, generator: 'models.GeneratorModel', src_file: str = None, trg_file: str = None) \
          -> None:
    """
    Perform inference.

    Args:
      generator: the model to be used
      src_file: path of input src file to be translated
      trg_file: path of file where trg translations will be written
    """
    src_file = src_file or self.src_file
    trg_file = trg_file or self.trg_file
    utils.make_parent_dir(trg_file)

    logger.info(f'Performing inference on {src_file}')

    ref_corpus, src_corpus = self._read_corpus(generator, src_file, mode=self.mode, ref_file=self.ref_file)

    event_trigger.set_train(False)

    ref_scores = None
    if self.mode == 'score':
      ref_scores = self._compute_losses(generator, ref_corpus, src_corpus, self.max_num_sents)
      self._write_rescored_output(ref_scores, self.ref_file, trg_file)

    if self.mode == 'forceddebug':
      ref_scores = self._compute_losses(generator, ref_corpus, src_corpus, self.max_num_sents)

    if self.mode != 'score':
      self._generate_output(generator=generator, forced_ref_corpus=ref_corpus, assert_scores=ref_scores,
                            src_corpus=src_corpus, trg_file=trg_file, batcher=self.batcher,
                            max_src_len=self.max_src_len)
Example #5
 def __init__(self,
              report_path: str,
              src_vocab=Ref(Path("model.src_reader.vocab"))):
     self.src_vocab = src_vocab
     self.logger = logging.getLogger("segmenting_reporter")
     utils.make_parent_dir(report_path)
     self.logger.addHandler(logging.StreamHandler(open(report_path, "w")))
     self.logger.setLevel("INFO")
Example #6
 def write_html(self) -> None:
     html_str = "\n".join(self.html_contents)
     soup = bs(html_str, "lxml")
     pretty_html = soup.prettify()
     html_file_name = os.path.join(self.report_path,
                                   f"{self.report_name}.html")
     utils.make_parent_dir(html_file_name)
     with open(html_file_name, 'w', encoding='utf-8') as f:
         f.write(pretty_html)
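
write_html relies on bs, presumably BeautifulSoup imported at module level as "from bs4 import BeautifulSoup as bs". A self-contained sketch of the same prettify step:

from bs4 import BeautifulSoup as bs

html_str = "<html><body><p>hello</p></body></html>"
soup = bs(html_str, "lxml")  # the "lxml" parser requires the lxml package
print(soup.prettify())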
Example #7
def plot_attention(src_words,
                   trg_words,
                   attention_matrix,
                   file_name,
                   size_x=8.0,
                   size_y=8.0):
    """This takes in source and target words and an attention matrix (in numpy format)
  and prints a visualization of this to a file.

  Args:
    src_words: a list of words in the source
    trg_words: a list of target words
    attention_matrix: a two-dimensional numpy array of values between zero and one,
      where rows correspond to source words, and columns correspond to target words
    file_name: the name of the file to which we write the attention
    size_x: width of the main plot
    size_y: height of the plot
  """
    trg_words = [unidecode(w) for w in trg_words]
    src_is_speech = isinstance(src_words, np.ndarray)
    max_len = len(''.join(trg_words))
    if not src_is_speech:
        max_len = max(max_len, len(''.join(src_words)))
        src_words = [unidecode(w) for w in src_words]
    if max_len > 150: matplotlib.rc('font', size=5)
    elif max_len > 50: matplotlib.rc('font', size=7)
    dpi = 100 if max_len <= 150 else 150
    fig, axs = plt.subplots(
        nrows=1,
        ncols=2 if src_is_speech else 1,
        figsize=(size_x + (1.0 if src_is_speech else 0.0), size_y),
        gridspec_kw={'width_ratios': [1, size_x]} if src_is_speech else None)
    ax = axs[1] if src_is_speech else axs
    # put the major ticks at the middle of each cell
    ax.set_xticks(np.arange(attention_matrix.shape[1]) + 0.5, minor=False)
    ax.set_yticks(np.arange(attention_matrix.shape[0]) + 0.5, minor=False)
    ax.invert_yaxis()
    if src_is_speech: plt.yticks([], [])

    # label axes by words
    ax.set_xticklabels(trg_words, minor=False)
    if not src_is_speech: ax.set_yticklabels(src_words, minor=False)
    ax.xaxis.tick_top()

    # draw the heatmap
    plt.pcolor(attention_matrix, cmap=plt.cm.Blues, vmin=0, vmax=1)
    plt.colorbar()

    if src_is_speech:
        ax = axs[0]
        plot_speech_features(feature_matrix=src_words, ax=ax, dpi=dpi)
        fig.tight_layout()

    utils.make_parent_dir(file_name)
    plt.savefig(file_name, dpi=dpi)
    plt.close()
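
A usage sketch for plot_attention with toy data; the word lists, the random attention matrix, and the output path are invented for illustration:

import numpy as np

src_words = ["das", "ist", "ein", "Test"]
trg_words = ["this", "is", "a", "test"]
# rows correspond to source words, columns to target words
attention_matrix = np.random.uniform(size=(len(src_words), len(trg_words)))
plot_attention(src_words, trg_words, attention_matrix,
               file_name="plots/attention.png")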
Example #8
 def setUp(self):
     events.clear()
     xnmt.resolved_serialize_params = {}
     yaml.add_representer(DummyArgClass, xnmt.init_representer)
     yaml.add_representer(DummyArgClass2, xnmt.init_representer)
     self.out_dir = os.path.join("test", "tmp")
     # "asdf" is a dummy filename; make_parent_dir only creates the directory part
     utils.make_parent_dir(os.path.join(self.out_dir, "asdf"))
     self.model_file = os.path.join(self.out_dir, "saved.mod")
     param_collections.ParamManager.init_param_col()
     param_collections.ParamManager.param_col.model_file = self.model_file
Example #9
 def conclude_report(self) -> None:
     if self.hyp_sents:
         html_filename = os.path.join(self.report_path, "charcut.html")
         utils.make_parent_dir(html_filename)
         args = utils.ArgClass(html_output_file=html_filename,
                               match_size=self.match_size,
                               alt_norm=self.alt_norm)
         aligned_segs = charcut.load_input_segs(cand_segs=self.hyp_sents,
                                                ref_segs=self.ref_sents,
                                                src_segs=self.src_sents)
         charcut.run_on(aligned_segs, args)
         self.hyp_sents, self.ref_sents, self.src_sents = [], [], []
Example #10
    def create_sent_report(self, segment_actions, src, **kwargs):
        if self.report_fp is None:
            report_path = os.path.join(self.report_path, "segment.txt")
            utils.make_parent_dir(report_path)
            self.report_fp = open(report_path, "w")

        actions = segment_actions[0]
        src = src.str_tokens()
        words = []
        start = 0
        for end in actions:
            words.append("".join(str(src[start:end + 1])))
            start = end + 1
        print(" ".join(words), file=self.report_fp)
Example #11
    def create_sent_report(self, segment_actions, src: sent.Sentence,
                           **kwargs):
        if self.report_fp is None:
            utils.make_parent_dir(self.report_path)
            self.report_fp = open(self.report_path, "w")

        actions = segment_actions[0]
        src = src.str_tokens()
        words = []
        start = 0
        for end in actions:
            if start < end + 1:
                words.append("".join(map(str, src[start:end + 1])))
            start = end + 1
        print(" ".join(words), file=self.report_fp)
Example #12
 def run_preproc_task(self, overwrite: bool = False) -> None:
     tokenizers = {
         my_opts["filenum"]: list(my_opts["tokenizers"])
         for my_opts in self.specs
     }
     for file_num, (in_file, out_file) in enumerate(
             zip(self.in_files, self.out_files)):
         if overwrite or not os.path.isfile(out_file):
             utils.make_parent_dir(out_file)
             my_tokenizers = tokenizers.get(file_num, tokenizers["all"])
             with open(out_file, "w", encoding='utf-8') as out_stream, \
                  open(in_file, "r", encoding='utf-8') as in_stream:
                 for tokenizer in my_tokenizers:
                     in_stream = tokenizer.tokenize_stream(in_stream)
                 for line in in_stream:
                     out_stream.write(f"{line}\n")
Example #13
    def __init__(self,
                 report_path: str = None,
                 src_vocab=Ref(Path("model.src_reader.vocab")),
                 trg_vocab=Ref(Path("model.trg_reader.vocab"))):
        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab
        self.logger = logging.getLogger("simult")

        if report_path is not None:
            utils.make_parent_dir(report_path)
            stream = open(report_path, "w")
        else:
            stream = sys.stderr

        self.logger.addHandler(logging.StreamHandler(stream))
        self.logger.setLevel("INFO")
Example #14
 def run_preproc_task(self, overwrite: bool = False) -> None:
     normalizers = {
         my_opts["filenum"]: list(my_opts["normalizers"])
         for my_opts in self.specs
     }
     for i, (in_file,
             out_file) in enumerate(zip(self.in_files, self.out_files)):
         if overwrite or not os.path.isfile(out_file):
             utils.make_parent_dir(out_file)
             my_normalizers = normalizers.get(i, normalizers["all"])
             with open(out_file, "w", encoding='utf-8') as out_stream, \
                  open(in_file, "r", encoding='utf-8') as in_stream:
                 for line in in_stream:
                     line = line.strip()
                     for normalizer in my_normalizers:
                         line = normalizer.normalize(line)
                     out_stream.write(line + "\n")
Example #15
 def run_preproc_task(self, overwrite: bool = False) -> None:
     filters = {
         my_opts["filenum"]: list(my_opts["filters"])
         for my_opts in self.specs
     }
     for i, (in_file,
             out_file) in enumerate(zip(self.in_files, self.out_files)):
         if overwrite or not os.path.isfile(out_file):
             utils.make_parent_dir(out_file)
             with open(out_file, "w", encoding='utf-8') as out_stream, \
                  open(in_file, "r", encoding='utf-8') as in_stream:
                 vocab = {}
                 for line in in_stream:
                     for word in line.strip().split():
                         vocab[word] = vocab.get(word, 0) + 1
                 for my_filter in filters.get(i, filters["all"]):
                     vocab = my_filter.filter(vocab)
                 for word in vocab:
                     out_stream.write(word + "\n")
Example #16
    def __init__(self,
                 train_files: Sequence[str],
                 vocab_size: numbers.Integral,
                 overwrite: bool = False,
                 model_prefix: str = 'sentpiece',
                 output_format: str = 'piece',
                 model_type: str = 'bpe',
                 hard_vocab_limit: bool = True,
                 encode_extra_options: Optional[str] = None,
                 decode_extra_options: Optional[str] = None) -> None:
        """
    This will initialize and train the sentencepiece tokenizer.

    If overwrite is set to False, learned model will not be overwritten, even if parameters
    are changed.

    "File" output for Sentencepiece written to StringIO temporarily before being written to disk.

    """
        self.model_prefix = model_prefix
        self.output_format = output_format
        self.input_format = output_format
        self.overwrite = overwrite
        self.encode_extra_options = [
            '--extra_options=' + encode_extra_options
        ] if encode_extra_options else []
        self.decode_extra_options = [
            '--extra_options=' + decode_extra_options
        ] if decode_extra_options else []

        utils.make_parent_dir(model_prefix)
        self.sentpiece_train_args = [
            '--input=' + ','.join(train_files),
            '--model_prefix=' + str(model_prefix),
            '--vocab_size=' + str(vocab_size),
            '--hard_vocab_limit=' + str(hard_vocab_limit).lower(),
            '--model_type=' + str(model_type)
        ]

        self.sentpiece_processor = None
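
The args list built in __init__ is presumably passed to SentencePiece's trainer later on. A hedged sketch of that step, assuming the standard sentencepiece Python API and illustrative file names:

import sentencepiece as spm

# illustrative flags, mirroring self.sentpiece_train_args above
train_args = ["--input=train.txt", "--model_prefix=sentpiece",
              "--vocab_size=8000", "--hard_vocab_limit=true",
              "--model_type=bpe"]
spm.SentencePieceTrainer.Train(" ".join(train_args))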
Example #17
 def setUp(self):
     yaml.add_representer(DummyClass, xnmt.init_representer)
     self.out_dir = "test/tmp"
     utils.make_parent_dir(f"{self.out_dir}/asdf")
Example #18
 def run_preproc_task(self, overwrite: bool = False) -> None:
     extractor = self.specs
     for in_file, out_file in zip(self.in_files, self.out_files):
         if overwrite or not os.path.isfile(out_file):
             utils.make_parent_dir(out_file)
             extractor.extract_to(in_file, out_file)