Code Example #1
def plot_speech_features(feature_matrix,
                         file_name=None,
                         vertical=True,
                         ax=None,
                         length=8.0,
                         dpi=100):
    """Plot speech feature matrix.

  Args:
    feature_matrix: a two-dimensional numpy array of feature values,
      where rows correspond to time steps and columns to feature dimensions
    file_name: the name of the file to which we write the plot; if not given, the plt context will be left un-closed
    vertical: if True, the time dimension will be projected onto the y axis, otherwise the x axis
    ax: if given, draw on this matplotlib axis; otherwise create a new figure
    length: figure length (if ax is not given)
    dpi: plot resolution
  """
    if not ax:
        # no axis given: create a new current figure; drawing below goes
        # through the implicit pyplot state, so the returned handles are unused
        plt.subplots(figsize=(1.0, length))
    if vertical: feature_matrix = feature_matrix[:, ::-1].T
    if ax:
        ax.pcolor(feature_matrix, cmap=plt.cm.jet, vmin=-1, vmax=1)
        ax.axis('off')
    else:
        plt.pcolor(feature_matrix, cmap=plt.cm.jet, vmin=-1, vmax=1)
        plt.axis('off')
    if file_name is not None:
        utils.make_parent_dir(file_name)
        plt.savefig(file_name, dpi=dpi)
        plt.close()
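A minimal usage sketch for this helper (assuming the enclosing module's imports, e.g. numpy as np and matplotlib.pyplot as plt, are available; the file path is hypothetical):

import numpy as np

# 40 time steps x 13 feature dimensions, random values purely for illustration
features = np.random.rand(40, 13)
plot_speech_features(features, file_name="plots/speech.png")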
Code Example #2
def set_out_file(out_file, exp_name):
    """
  Set the file to log to. Before calling this, logs are only passed to stdout/stderr.
  Args:
    out_file: file name
    exp_name: name of experiment
  """
    unset_out_file()
    utils.make_parent_dir(out_file)
    with open(out_file, mode="w") as f_out:
        for line in _preamble_content:
            f_out.write(f"{line}\n")
    fh = logging.FileHandler(out_file, encoding="utf-8")
    fh.setLevel(settings.LOG_LEVEL_FILE)
    fh.setFormatter(MainFormatter())
    logger.addHandler(fh)
    logger_file.addHandler(fh)
    yaml_fh = logging.FileHandler(f"{out_file}.yaml",
                                  mode='w',
                                  encoding="utf-8")
    yaml_fh.setLevel(logging.DEBUG)
    yaml_fh.setFormatter(YamlFormatter())
    yaml_logger.addHandler(yaml_fh)
    tensorboard_writer.set_out_file(f"{out_file}.tb", exp_name=exp_name)
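A hypothetical call sketch (log path and experiment name are invented):

set_out_file("logs/exp1.log", exp_name="exp1")
# from here on, log records go to logs/exp1.log, YAML-formatted records
# to logs/exp1.log.yaml, and TensorBoard events to logs/exp1.log.tb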
Code Example #3
File: reports.py Project: rezahaffari/xnmt
 def conclude_report(self) -> None:
     if self.hyp_sents:
         ref_filename = os.path.join(self.report_path, "tmp",
                                     "compare-mt.ref")
         out_filename = os.path.join(self.report_path, "tmp",
                                     "compare-mt.out")
         utils.make_parent_dir(out_filename)
         with open(ref_filename, "w") as fout:
             for l in self.ref_sents:
                 fout.write(f"{l.strip()}\n")
         with open(out_filename, "w") as fout:
             for l in self.hyp_sents:
                 fout.write(f"{l.strip()}\n")
         import xnmt.thirdparty.comparemt.compare_mt as compare_mt
         args = utils.ArgClass(ref_file=ref_filename,
                               out_file=out_filename,
                               out2_file=self.out2_file,
                               train_file=self.train_file,
                               train_counts=self.train_counts,
                               alpha=self.alpha,
                               ngram=self.ngram,
                               ngram_size=self.ngram_size,
                               sent_size=self.sent_size)
         out_lines = compare_mt.main(args)
         report_filename = os.path.join(self.report_path, "compare-mt.txt")
         utils.make_parent_dir(report_filename)
         with open(report_filename, "w") as fout:
             for l in out_lines:
                 fout.write(f"{l}\n")
         self.hyp_sents, self.ref_sents, self.src_sents = [], [], []
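utils.ArgClass above (also used in example #9) acts as an argparse-style options container; a plausible minimal equivalent, assuming it simply exposes keyword arguments as attributes:

class ArgClass:
    """Sketch: stores keyword arguments as attributes, like argparse.Namespace."""
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)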
Code Example #4
  def perform_inference(self, generator: 'models.GeneratorModel', src_file: str = None, trg_file: str = None) \
          -> None:
    """
    Perform inference.

    Args:
      generator: the model to be used
      src_file: path of input src file to be translated
      trg_file: path of file where trg translations will be written
    """
    src_file = src_file or self.src_file
    trg_file = trg_file or self.trg_file
    utils.make_parent_dir(trg_file)

    logger.info(f'Performing inference on {src_file}')

    ref_corpus, src_corpus = self._read_corpus(generator, src_file, mode=self.mode, ref_file=self.ref_file)

    event_trigger.set_train(False)

    ref_scores = None
    if self.mode == 'score':
      ref_scores = self._compute_losses(generator, ref_corpus, src_corpus, self.max_num_sents)
      self._write_rescored_output(ref_scores, self.ref_file, trg_file)

    if self.mode == 'forceddebug':
      ref_scores = self._compute_losses(generator, ref_corpus, src_corpus, self.max_num_sents)

    if self.mode != 'score':
      self._generate_output(generator=generator, forced_ref_corpus=ref_corpus, assert_scores=ref_scores,
                            src_corpus=src_corpus, trg_file=trg_file, batcher=self.batcher,
                            max_src_len=self.max_src_len)
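A hedged usage sketch (inference stands for an instance of the enclosing class; model and the file paths are hypothetical):

# translate a test set with a trained generator model
inference.perform_inference(generator=model,
                            src_file="data/test.src",
                            trg_file="out/test.hyp")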
Code Example #5
File: reporter.py Project: seeledu/xnmt-devel
 def __init__(self,
              report_path: str,
              src_vocab=Ref(Path("model.src_reader.vocab"))):
     self.src_vocab = src_vocab
     self.logger = logging.getLogger("segmenting_reporter")
     utils.make_parent_dir(report_path)
     self.logger.addHandler(logging.StreamHandler(open(report_path, "w")))
     self.logger.setLevel("INFO")
Code Example #6
File: reports.py Project: rezahaffari/xnmt
 def write_html(self) -> None:
     html_str = "\n".join(self.html_contents)
     soup = bs(html_str, "lxml")
     pretty_html = soup.prettify()
     html_file_name = os.path.join(self.report_path,
                                   f"{self.report_name}.html")
     utils.make_parent_dir(html_file_name)
     with open(html_file_name, 'w', encoding='utf-8') as f:
         f.write(pretty_html)
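bs here is presumably BeautifulSoup; the corresponding import would look like:

from bs4 import BeautifulSoup as bs  # the "lxml" parser must also be installed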
Code Example #7
def plot_attention(src_words,
                   trg_words,
                   attention_matrix,
                   file_name,
                   size_x=8.0,
                   size_y=8.0):
    """This takes in source and target words and an attention matrix (in numpy format)
  and prints a visualization of this to a file.

  Args:
    src_words: a list of words in the source, or a numpy array of speech features
    trg_words: a list of target words
    attention_matrix: a two-dimensional numpy array of values between zero and one,
      where rows correspond to source words, and columns correspond to target words
    file_name: the name of the file to which we write the attention
    size_x: width of the main plot
    size_y: height of the plot
  """
    trg_words = [unidecode(w) for w in trg_words]
    src_is_speech = isinstance(src_words, np.ndarray)
    max_len = len(''.join(trg_words))
    if not src_is_speech:
        max_len = max(max_len, len(''.join(src_words)))
        src_words = [unidecode(w) for w in src_words]
    if max_len > 150: matplotlib.rc('font', size=5)
    elif max_len > 50: matplotlib.rc('font', size=7)
    dpi = 100 if max_len <= 150 else 150
    fig, axs = plt.subplots(
        nrows=1,
        ncols=2 if src_is_speech else 1,
        figsize=(size_x + (1.0 if src_is_speech else 0.0), size_y),
        gridspec_kw={'width_ratios': [1, size_x]} if src_is_speech else None)
    ax = axs[1] if src_is_speech else axs
    # put the major ticks at the middle of each cell
    ax.set_xticks(np.arange(attention_matrix.shape[1]) + 0.5, minor=False)
    ax.set_yticks(np.arange(attention_matrix.shape[0]) + 0.5, minor=False)
    ax.invert_yaxis()
    if src_is_speech: plt.yticks([], [])

    # label axes by words
    ax.set_xticklabels(trg_words, minor=False)
    if not src_is_speech: ax.set_yticklabels(src_words, minor=False)
    ax.xaxis.tick_top()

    # draw the heatmap
    plt.pcolor(attention_matrix, cmap=plt.cm.Blues, vmin=0, vmax=1)
    plt.colorbar()

    if src_is_speech:
        ax = axs[0]
        plot_speech_features(feature_matrix=src_words, ax=ax, dpi=dpi)
        fig.tight_layout()

    utils.make_parent_dir(file_name)
    plt.savefig(file_name, dpi=dpi)
    plt.close()
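A minimal usage sketch with toy data (the attention weights are random, purely for illustration; rows correspond to source words and columns to target words, as the docstring specifies):

import numpy as np

src = ["das", "ist", "gut"]
trg = ["this", "is", "good"]
attn = np.random.rand(len(src), len(trg))  # values in [0, 1]
plot_attention(src, trg, attn, file_name="plots/attention.png")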
Code Example #8
 def setUp(self):
     events.clear()
     xnmt.resolved_serialize_params = {}
     yaml.add_representer(DummyArgClass, xnmt.init_representer)
     yaml.add_representer(DummyArgClass2, xnmt.init_representer)
     self.out_dir = os.path.join("test", "tmp")
     utils.make_parent_dir(os.path.join(self.out_dir, "asdf"))
     self.model_file = os.path.join(self.out_dir, "saved.mod")
     param_collections.ParamManager.init_param_col()
     param_collections.ParamManager.param_col.model_file = self.model_file
Code Example #9
File: reports.py Project: rezahaffari/xnmt
 def conclude_report(self) -> None:
     if self.hyp_sents:
         html_filename = os.path.join(self.report_path, "charcut.html")
         utils.make_parent_dir(html_filename)
         args = utils.ArgClass(html_output_file=html_filename,
                               match_size=self.match_size,
                               alt_norm=self.alt_norm)
         aligned_segs = charcut.load_input_segs(cand_segs=self.hyp_sents,
                                                ref_segs=self.ref_sents,
                                                src_segs=self.src_sents)
         charcut.run_on(aligned_segs, args)
         self.hyp_sents, self.ref_sents, self.src_sents = [], [], []
Code Example #10
    def create_sent_report(self, segment_actions, src, **kwargs):
        if self.report_fp is None:
            report_path = os.path.join(self.report_path, "segment.txt")
            utils.make_parent_dir(report_path)
            self.report_fp = open(report_path, "w")

        actions = segment_actions[0]
        src = src.str_tokens()
        words = []
        start = 0
        for end in actions:
            words.append("".join(str(src[start:end + 1])))
            start = end + 1
        print(" ".join(words), file=self.report_fp)
Code Example #11
File: reports.py Project: rezahaffari/xnmt
    def create_sent_report(self, segment_actions, src: sent.Sentence,
                           **kwargs):
        if self.report_fp is None:
            utils.make_parent_dir(self.report_path)
            self.report_fp = open(self.report_path, "w")

        actions = segment_actions[0]
        src = src.str_tokens()
        words = []
        start = 0
        for end in actions:
            if start < end + 1:
                words.append("".join(map(str, src[start:end + 1])))
            start = end + 1
        print(" ".join(words), file=self.report_fp)
Code Example #12
 def run_preproc_task(self, overwrite: bool = False) -> None:
     tokenizers = {
         my_opts["filenum"]: list(my_opts["tokenizers"])
         for my_opts in self.specs
     }
     for file_num, (in_file, out_file) in enumerate(
             zip(self.in_files, self.out_files)):
         if overwrite or not os.path.isfile(out_file):
             utils.make_parent_dir(out_file)
             my_tokenizers = tokenizers.get(file_num, tokenizers["all"])
             with open(out_file, "w", encoding='utf-8') as out_stream, \
                  open(in_file, "r", encoding='utf-8') as in_stream:
                 for tokenizer in my_tokenizers:
                     in_stream = tokenizer.tokenize_stream(in_stream)
                 for line in in_stream:
                     out_stream.write(f"{line}\n")
Code Example #13
    def __init__(self,
                 report_path: str = None,
                 src_vocab=Ref(Path("model.src_reader.vocab")),
                 trg_vocab=Ref(Path("model.trg_reader.vocab"))):
        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab
        self.logger = logging.getLogger("simult")

        if report_path is not None:
            utils.make_parent_dir(report_path)
            stream = open(report_path, "w")
        else:
            stream = sys.stderr

        self.logger.addHandler(logging.StreamHandler(stream))
        self.logger.setLevel("INFO")
Code Example #14
 def run_preproc_task(self, overwrite: bool = False) -> None:
     normalizers = {
         my_opts["filenum"]: list(my_opts["normalizers"])
         for my_opts in self.specs
     }
     for i, (in_file,
             out_file) in enumerate(zip(self.in_files, self.out_files)):
         if overwrite or not os.path.isfile(out_file):
             utils.make_parent_dir(out_file)
             my_normalizers = normalizers.get(i, normalizers["all"])
             with open(out_file, "w", encoding='utf-8') as out_stream, \
                  open(in_file, "r", encoding='utf-8') as in_stream:
                 for line in in_stream:
                     line = line.strip()
                     for normalizer in my_normalizers:
                         line = normalizer.normalize(line)
                     out_stream.write(line + "\n")
Code Example #15
 def run_preproc_task(self, overwrite: bool = False) -> None:
     filters = {
         my_opts["filenum"]: list(my_opts["filters"])
         for my_opts in self.specs
     }
     for i, (in_file,
             out_file) in enumerate(zip(self.in_files, self.out_files)):
         if overwrite or not os.path.isfile(out_file):
             utils.make_parent_dir(out_file)
             with open(out_file, "w", encoding='utf-8') as out_stream, \
                  open(in_file, "r", encoding='utf-8') as in_stream:
                 vocab = {}
                 for line in in_stream:
                     for word in line.strip().split():
                         vocab[word] = vocab.get(word, 0) + 1
                 for my_filter in filters.get(i, filters["all"]):
                     vocab = my_filter.filter(vocab)
                 for word in vocab:
                     out_stream.write(word + "\n")
Code Example #16
    def __init__(self,
                 train_files: Sequence[str],
                 vocab_size: numbers.Integral,
                 overwrite: bool = False,
                 model_prefix: str = 'sentpiece',
                 output_format: str = 'piece',
                 model_type: str = 'bpe',
                 hard_vocab_limit: bool = True,
                 encode_extra_options: Optional[str] = None,
                 decode_extra_options: Optional[str] = None) -> None:
        """
    This will initialize and train the sentencepiece tokenizer.

    If overwrite is set to False, learned model will not be overwritten, even if parameters
    are changed.

    "File" output for Sentencepiece written to StringIO temporarily before being written to disk.

    """
        self.model_prefix = model_prefix
        self.output_format = output_format
        self.input_format = output_format
        self.overwrite = overwrite
        self.encode_extra_options = [
            '--extra_options=' + encode_extra_options
        ] if encode_extra_options else []
        self.decode_extra_options = [
            '--extra_options=' + decode_extra_options
        ] if decode_extra_options else []

        utils.make_parent_dir(model_prefix)
        self.sentpiece_train_args = [
            '--input=' + ','.join(train_files),
            '--model_prefix=' + str(model_prefix),
            '--vocab_size=' + str(vocab_size),
            '--hard_vocab_limit=' + str(hard_vocab_limit).lower(),
            '--model_type=' + str(model_type)
        ]

        self.sentpiece_processor = None
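A hypothetical construction sketch (the class name and file paths are assumptions; the flags assembled in sentpiece_train_args follow the sentencepiece trainer's command-line format):

tok = SentencepieceTokenizer(train_files=["data/train.en"],
                             vocab_size=8000,
                             model_prefix="models/sentpiece",
                             model_type="bpe")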
Code Example #17
 def setUp(self):
     yaml.add_representer(DummyClass, xnmt.init_representer)
     self.out_dir = "test/tmp"
     utils.make_parent_dir(f"{self.out_dir}/asdf")
Code Example #18
 def run_preproc_task(self, overwrite: bool = False) -> None:
     extractor = self.specs
     for in_file, out_file in zip(self.in_files, self.out_files):
         if overwrite or not os.path.isfile(out_file):
             utils.make_parent_dir(out_file)
             extractor.extract_to(in_file, out_file)