def __init__(self,
                 name,
                 pipeline_parser,
                 pipeline_config_key,
                 options_fn=None):
        """Create the argument parser for one named pipeline.

        Args:
            name: Pipeline name. It is passed to
                ``configargparse.get_argument_parser`` and shown in the
                program name as ``kiwi <name>``.
            pipeline_parser: Stored on the instance; not used in this method.
            pipeline_config_key: Stored on the instance with every dash
                replaced by an underscore.
            options_fn: Optional callable; when given it is called with the
                newly created parser.
        """
        self.name = name
        self._pipeline_parser = pipeline_parser
        self._pipeline_config_key = pipeline_config_key.replace('-', '_')

        program_name = 'kiwi {}'.format(self.name)
        self._parser = configargparse.get_argument_parser(
            self.name,
            prog=program_name,
            add_help=False,
            config_file_parser_class=configargparse.YAMLConfigFileParser,
            ignore_unknown_config_file_keys=False,
        )

        # Every pipeline parser accepts an optional config file.
        config_option = dict(
            required=False,
            is_config_file=True,
            type=PathType(exists=True),
            help='Load config file from path',
        )
        self._parser.add('--config', **config_option)

        if options_fn is None:
            return
        options_fn(self._parser)
def _add_predicting_data_file_opts(parser):
    """Register the required test-set file options on ``parser``.

    Args:
        parser: Parser on which a 'data' argument group is created.

    Returns:
        The newly created 'data' argument group.
    """
    group = parser.add_argument_group('data')

    # The help texts previously said "validation" although these options
    # name the test files.
    group.add_argument(
        '--test-source',
        type=PathType(exists=True),
        required=True,
        help='Path to test source file',
    )
    group.add_argument(
        '--test-target',
        type=PathType(exists=True),
        required=True,
        help='Path to test target file',
    )
    return group
def _add_data_options(data_parser):
    """Register the Predictor-Estimator specific data options.

    Adds a 'PredEst data' argument group to ``data_parser`` holding the two
    options that point to extra data for vocabulary creation.

    Args:
        data_parser: Parser that receives the new argument group.
    """
    predest_group = data_parser.add_argument_group(
        'PredEst data',
        description='Predictor Estimator specific data options. (POSTECH)',
    )

    # Source and target variants share the same type and help text.
    extra_vocab_help = (
        'Optionally load more data which is used only for vocabulary '
        'creation. Path to additional Data'
        '(Predictor)'
    )
    for option in ('--extend-source-vocab', '--extend-target-vocab'):
        predest_group.add(
            option, type=PathType(exists=True), help=extra_vocab_help
        )
Example #4
0
def add_predicting_data_file_opts(parser):
    """Register the test-set file options used at prediction time.

    Args:
        parser: Parser on which a 'data' argument group is created.

    Returns:
        The newly created 'data' argument group.
    """
    group = parser.add_argument_group('data')

    # Source and target are the only required files. Their help texts used
    # to say "validation" although the options name the test files.
    group.add_argument(
        '--test-source',
        type=PathType(exists=True),
        required=True,
        help='Path to test source file',
    )
    group.add_argument(
        '--test-target',
        type=PathType(exists=True),
        required=True,
        help='Path to test target file',
    )

    # Optional files, all typed as PathType(exists=True). The PoS help texts
    # used to say "training" although the options name the test files.
    optional_files = (
        (
            '--test-alignments',
            'Path to test alignments between source and target.',
        ),
        ('--test-source-pos', 'Path to test PoS tags file for source'),
        ('--test-target-pos', 'Path to test PoS tags file for target'),
        (
            '--test-target-parse',
            'Path to test dependency parsing file for target '
            '(tabular format)',
        ),
        (
            '--test-target-ngram',
            'Path to test highest order ngram file for target (tabular '
            'format)',
        ),
        (
            '--test-target-stacked',
            'Path to test stacked predictions file for target (tabular '
            'format)',
        ),
    )
    for flag, help_text in optional_files:
        group.add_argument(flag, type=PathType(exists=True), help=help_text)

    return group
Example #5
0
def jackknife_opts(parser):
    """Register the jackknifing options on ``parser``.

    Creates a 'jackknifing' argument group with the number of folds
    (``--splits``, default 5) and an optional ``--train-config`` path.

    Args:
        parser: Parser that receives the new argument group.
    """
    jackknife_group = parser.add_argument_group('jackknifing')

    splits_spec = dict(
        required=False,
        type=int,
        default=5,
        help='Jackknife with X folds.',
    )
    jackknife_group.add('--splits', **splits_spec)

    train_config_spec = dict(
        required=False,
        type=PathType(exists=True),
        help='Path to config file with model parameters.',
    )
    jackknife_group.add('--train-config', **train_config_spec)
def add_training_options(training_parser):
    """Register the Predictor-Estimator training options.

    ``add_pretraining_options`` is applied to ``training_parser`` first; the
    options below are then added, in order, to a new
    'predictor-estimator training' argument group.

    Args:
        training_parser: Parser that receives the options.
    """
    add_pretraining_options(training_parser)

    group = training_parser.add_argument_group(
        'predictor-estimator training',
        description=(
            'Predictor Estimator (POSTECH). These settings are used '
            ' to train the Predictor. They will be ignored if training a '
            ' Predictor-Estimator and the `load-model` flag is set.'
        ),
    )

    def switch(flag, help_text, default=False):
        # Boolean option: a bare `--flag` yields True (const); an explicit
        # value is converted with strtobool.
        group.add_argument(
            flag,
            type=lambda x: bool(strtobool(x)),
            nargs='?',
            const=True,
            default=default,
            help=help_text,
        )

    def model_path(flag, help_text):
        group.add_argument(flag, type=PathType(exists=True), help=help_text)

    def number(flag, kind, default, help_text):
        group.add_argument(flag, type=kind, default=default, help=help_text)

    # What the estimator predicts.
    switch(
        '--start-stop',
        'Append start and stop symbols to estimator feature sequence.',
    )
    switch(
        '--predict-gaps',
        'Predict Gap Tags. Requires `train-gap-tags`, `valid-'
        'gap-tags` to be set.',
    )
    switch(
        '--predict-target',
        'Predict Target Tags. Requires `train-target-tags`, `valid-'
        'target-tags` to be set.',
        default=True,
    )
    switch(
        '--predict-source',
        'Predict Source Tags. Requires `train-source-tags`, `valid-'
        'source-tags` to be set.',
    )

    # Pretrained predictors.
    model_path(
        '--load-pred-source',
        'If set, model architecture and vocabulary parameters are '
        'ignored. Load pretrained predictor tgt->src.',
    )
    model_path(
        '--load-pred-target',
        'If set, model architecture and vocabulary parameters are '
        'ignored. Load pretrained predictor src->tgt.',
    )

    # Estimator architecture.
    number('--rnn-layers-est', int, 2, 'Layers in Estimator RNN')
    number('--dropout-est', float, 0.0, 'Dropout in estimator')
    number('--hidden-est', int, 100, 'Size of hidden layers in LSTM')
    switch(
        '--mlp-est',
        """Pass the Estimator input through a linear layer
        reducing dimensionality before RNN.""",
    )

    # Sentence level objectives.
    switch(
        '--sentence-level',
        """Predict Sentence Level Scores.
        Requires setting `train-sentence-scores, valid-sentence-scores`""",
    )
    switch(
        '--sentence-ll',
        """Use probabilistic Loss for sentence scores instead of
        squared error. If set, the model will output mean and variance of
        a truncated Gaussian distribution over the interval [0, 1], and use
        the NLL of ground truth `hter` as the loss.
        This seems to improve performance, and gives you uncertainty estimates
        for sentence level predictions as a byproduct.
        If `sentence-level == False`, this is without effect.
        """,
    )
    switch(
        '--sentence-ll-predict-mean',
        """If `sentence-ll == True`, by default the prediction for `hter`
        will be the mean of the Guassian /before truncation/. After truncation,
        this will be the mode of the distribution, but not the mean as
        truncated Gaussian is skewed to one side. set this to `True` to use
        the True mean after truncation for prediction.
        """,
    )
    switch('--use-probs', 'Predict scores as product/sum of word level probs')

    # Additional objectives.
    switch(
        '--binary-level',
        """Predict binary sentence labels indicating `hter == 0.0`
        Requires setting `train-sentence-scores`, `valid-sentence-scores`""",
    )
    switch(
        '--token-level',
        """Continue training the predictor on the postedited text.
        If set, will do an additional forward pass through the predictor
        Using the SRC, PE pair and add the `Predictor` loss for the tokens
        in the postedited text PE. Recommended if you have access to PE.
        Requires setting `train-pe`, `valid-pe`""",
    )

    # Relative weights of the BAD label for each output.
    number(
        '--target-bad-weight',
        float,
        3.0,
        'Relative weight for target bad labels.',
    )
    number(
        '--gaps-bad-weight',
        float,
        3.0,
        'Relative weight for gaps bad labels.',
    )
    number(
        '--source-bad-weight',
        float,
        3.0,
        'Relative weight for source bad labels.',
    )
def _add_training_data_file_opts(parser):
    """Register the training and validation file options on ``parser``.

    Two argument groups are created: 'data' for the training files and
    'validation data' for the validation files and the ``--split`` option.
    Only ``--train-source`` is required.

    Args:
        parser: Parser that receives the two argument groups.
    """
    group = parser.add_argument_group('data')

    group.add_argument(
        '--train-source',
        type=PathType(exists=True),
        required=True,
        help='Path to training source file',
    )
    # The tag help texts previously said "validation" although these options
    # name the training label files.
    optional_train_files = (
        ('--train-target', 'Path to training target file'),
        (
            '--train-source-tags',
            'Path to training label file for source (WMT18 format)',
        ),
        ('--train-target-tags', 'Path to training label file for target'),
        ('--train-pe', 'Path to file containing post-edited target.'),
        (
            '--train-sentence-scores',
            'Path to file containing sentence level scores.',
        ),
    )
    for flag, help_text in optional_train_files:
        group.add_argument(flag, type=PathType(exists=True), help=help_text)

    valid_group = parser.add_argument_group('validation data')

    valid_group.add_argument(
        '--split',
        type=float,
        help='Split Train dataset in case that no validation set is given.',
    )

    # None of the validation files is required.
    valid_group.add_argument(
        '--valid-source',
        type=PathType(exists=True),
        help='Path to validation source file',
    )
    valid_group.add_argument(
        '--valid-target',
        type=PathType(exists=True),
        help='Path to validation target file',
    )
    # Alignments are typed as a plain string, unlike the other files.
    valid_group.add_argument(
        '--valid-alignments',
        type=str,
        help='Path to valid alignments between source and target.',
    )
    optional_valid_files = (
        (
            '--valid-source-tags',
            'Path to validation label file for source (WMT18 format)',
        ),
        ('--valid-target-tags', 'Path to validation label file for target'),
        ('--valid-pe', 'Path to file containing postedited target.'),
        (
            '--valid-sentence-scores',
            'Path to file containing sentence level scores.',
        ),
    )
    for flag, help_text in optional_valid_files:
        valid_group.add_argument(
            flag, type=PathType(exists=True), help=help_text
        )
def _add_vocabulary_opts(parser):
    """Register the vocabulary and embeddings options on ``parser``.

    All options are added to a new 'vocabulary options' argument group.

    Args:
        parser: Parser that receives the new argument group.
    """
    vocab_group = parser.add_argument_group(
        'vocabulary options',
        description=(
            'Options for loading vocabulary from a previous run. '
            'This is used for e.g. training a source predictor via predict-'
            'inverse: True ; If set, other vocab options are ignored'
        ),
    )

    # (flag, default, help) for the integer-valued vocabulary limits.
    vocab_limits = (
        ('--source-vocab-size', None, 'Size of the source vocabulary.'),
        ('--target-vocab-size', None, 'Size of the target vocabulary.'),
        (
            '--source-vocab-min-frequency',
            1,
            'Min word frequency for source vocabulary.',
        ),
        (
            '--target-vocab-min-frequency',
            1,
            'Min word frequency for target vocabulary.',
        ),
    )
    for flag, default, help_text in vocab_limits:
        vocab_group.add_argument(
            flag, type=int, default=default, help=help_text
        )

    # start: add source-embeddings and target-embeddings to the
    # predictor-estimator
    vocab_group.add_argument(
        '--embeddings-format',
        type=str,
        default='polyglot',
        choices=['polyglot', 'word2vec', 'fasttext', 'glove', 'text'],
        help=(
            'Word embeddings format. '
            'See README for specific formatting instructions.'
        ),
    )
    vocab_group.add_argument(
        '--embeddings-binary',
        type=lambda x: bool(strtobool(x)),
        nargs='?',
        const=True,
        default=False,
        help='Load embeddings stored in binary.',
    )
    embedding_files = (
        ('--source-embeddings', 'Path to word embeddings file for source.'),
        ('--target-embeddings', 'Path to word embeddings file for target.'),
    )
    for flag, help_text in embedding_files:
        vocab_group.add_argument(
            flag, type=PathType(exists=True), help=help_text
        )
    # end
Example #9
0
def add_training_data_file_opts(parser):
    """Register the training and validation file options on ``parser``.

    All options go into a single 'data' argument group. Source, target and
    alignments are required for both the training and the validation set.

    Args:
        parser: Parser on which the 'data' argument group is created.

    Returns:
        The newly created 'data' argument group.
    """
    group = parser.add_argument_group('data')

    def add_path(flag, help_text, required=False):
        # Option typed as PathType(exists=True).
        group.add_argument(
            flag,
            type=PathType(exists=True),
            required=required,
            help=help_text,
        )

    def add_alignments(flag, help_text):
        # Alignments are typed as a plain string, unlike the other files.
        group.add_argument(flag, type=str, required=True, help=help_text)

    # Training set.
    add_path('--train-source', 'Path to training source file', required=True)
    add_path('--train-target', 'Path to training target file', required=True)
    add_alignments(
        '--train-alignments',
        'Path to train alignments between source and target.',
    )
    add_path(
        '--train-source-tags',
        'Path to training label file for source (WMT18 format)',
    )
    add_path('--train-target-tags', 'Path to training label file for target')

    # Validation set.
    add_path(
        '--valid-source', 'Path to validation source file', required=True
    )
    add_path(
        '--valid-target', 'Path to validation target file', required=True
    )
    add_alignments(
        '--valid-alignments',
        'Path to valid alignments between source and target.',
    )
    add_path(
        '--valid-source-tags',
        'Path to validation label file for source (WMT18 format)',
    )
    add_path(
        '--valid-target-tags', 'Path to validation label file for target'
    )

    # The help texts previously said "training" although these options name
    # the validation PoS files.
    add_path(
        '--valid-source-pos', 'Path to validation PoS tags file for source'
    )
    add_path(
        '--valid-target-pos', 'Path to validation PoS tags file for target'
    )

    return group
Example #10
0
def add_vocabulary_opts(parser):
    """Register the vocabulary and embeddings options on ``parser``.

    Args:
        parser: Parser on which a 'vocabulary options' group is created.

    Returns:
        The newly created 'vocabulary options' argument group.
    """
    vocab_group = parser.add_argument_group('vocabulary options')

    # (flag, default, help) for the integer-valued vocabulary limits.
    vocab_limits = (
        ('--source-vocab-size', None, 'Size of the source vocabulary.'),
        ('--target-vocab-size', None, 'Size of the target vocabulary.'),
        (
            '--source-vocab-min-frequency',
            1,
            'Min word frequency for source vocabulary.',
        ),
        (
            '--target-vocab-min-frequency',
            1,
            'Min word frequency for target vocabulary.',
        ),
    )
    for flag, default, help_text in vocab_limits:
        vocab_group.add_argument(
            flag, type=int, default=default, help=help_text
        )

    # Boolean switches: a bare flag yields True, an explicit value is
    # converted with strtobool.
    embedding_vocab_switches = (
        (
            '--keep-rare-words-with-embeddings',
            'Keep words that occur less then min-frequency but '
            'are in embeddings vocabulary.',
        ),
        (
            '--add-embeddings-vocab',
            'Add words from embeddings vocabulary to source/target '
            'vocabulary.',
        ),
    )
    for flag, help_text in embedding_vocab_switches:
        vocab_group.add_argument(
            flag,
            type=lambda x: bool(strtobool(x)),
            nargs='?',
            const=True,
            default=False,
            help=help_text,
        )

    vocab_group.add_argument(
        '--embeddings-format',
        type=str,
        default='polyglot',
        choices=['polyglot', 'word2vec', 'fasttext', 'glove', 'text'],
        help=(
            'Word embeddings format. '
            'See README for specific formatting instructions.'
        ),
    )
    vocab_group.add_argument(
        '--embeddings-binary',
        type=lambda x: bool(strtobool(x)),
        nargs='?',
        const=True,
        default=False,
        help='Load embeddings stored in binary.',
    )

    embedding_files = (
        ('--source-embeddings', 'Path to word embeddings file for source.'),
        ('--target-embeddings', 'Path to word embeddings file for target.'),
    )
    for flag, help_text in embedding_files:
        vocab_group.add_argument(
            flag, type=PathType(exists=True), help=help_text
        )

    return vocab_group
Example #11
0
# Module-level parser, obtained from configargparse under the name 'search'.
parser = configargparse.get_argument_parser('search')

# Optional experiment name.
parser.add_argument(
    '-e',
    '--experiment-name',
    required=False,
    help='MLflow will log this run under this experiment name, '
    'which appears as a separate section in the UI. It '
    'will also be used in some messages and files.',
)
# Required config path. Note is_config_file=False: the option is registered
# as a regular argument rather than as a configargparse config-file option.
parser.add(
    '-c',
    '--config',
    required=True,
    is_config_file=False,
    type=PathType(exists=True),
    help='Load config file from path',
)

# Positional model name, restricted to the keys of Model.subclasses.
group = parser.add_argument_group('models')
group.add_argument('model_name', choices=Model.subclasses.keys())


def get_action(option):
    """Find the ``train.parser`` action that ``option`` is a config key of.

    Args:
        option: Config key to look up.

    Returns:
        The first action in ``train.parser._actions`` whose possible config
        keys contain ``option``, or None when there is no such action.
    """
    train_parser = train.parser
    matching_actions = (
        candidate
        for candidate in train_parser._actions
        if option in train_parser.get_possible_config_keys(candidate)
    )
    return next(matching_actions, None)


def split_options(options):
Example #12
0
def add_training_data_file_opts(parser):
    """Register the training and validation file options on ``parser``.

    Two argument groups are created: 'data' for the training files and
    'validation data' for the validation files. No option is required.

    Args:
        parser: Parser that receives the two argument groups.
    """

    def add_paths(target_group, entries):
        # Register each (flag, help) pair as a PathType(exists=True) option.
        for flag, help_text in entries:
            target_group.add_argument(
                flag, type=PathType(exists=True), help=help_text
            )

    group = parser.add_argument_group('data')

    add_paths(
        group,
        (
            ('--train-source', 'Path to training source file'),
            ('--train-target', 'Path to training target file'),
        ),
    )
    # Alignments are typed as a plain string, unlike the other files.
    group.add_argument(
        '--train-alignments',
        type=str,
        help='Path to train alignments between source and target.',
    )
    # The tag help texts previously said "validation" although these options
    # name the training label files.
    add_paths(
        group,
        (
            (
                '--train-source-tags',
                'Path to training label file for source (WMT18 format)',
            ),
            ('--train-target-tags', 'Path to training label file for target'),
            (
                '--train-source-pos',
                'Path to training PoS tags file for source',
            ),
            (
                '--train-target-pos',
                'Path to training PoS tags file for target',
            ),
            (
                '--train-target-parse',
                'Path to training dependency parsing file for target '
                '(tabular format)',
            ),
            (
                '--train-target-ngram',
                'Path to training highest order ngram file for target '
                '(tabular format)',
            ),
            (
                '--train-target-stacked',
                'Path to training stacked predictions file for target '
                '(tabular format)',
            ),
        ),
    )

    group = parser.add_argument_group('validation data')

    add_paths(
        group,
        (
            ('--valid-source', 'Path to validation source file'),
            ('--valid-target', 'Path to validation target file'),
        ),
    )
    group.add_argument(
        '--valid-alignments',
        type=str,
        help='Path to valid alignments between source and target.',
    )
    # The PoS help texts previously said "training" although these options
    # name the validation files.
    add_paths(
        group,
        (
            (
                '--valid-source-tags',
                'Path to validation label file for source (WMT18 format)',
            ),
            (
                '--valid-target-tags',
                'Path to validation label file for target',
            ),
            (
                '--valid-source-pos',
                'Path to validation PoS tags file for source',
            ),
            (
                '--valid-target-pos',
                'Path to validation PoS tags file for target',
            ),
            (
                '--valid-target-parse',
                'Path to validation dependency parsing file for target '
                '(tabular format)',
            ),
            (
                '--valid-target-ngram',
                'Path to validation highest order ngram file for target '
                '(tabular format)',
            ),
            (
                '--valid-target-stacked',
                'Path to validation stacked predictions file for target '
                '(tabular format)',
            ),
        ),
    )
Example #13
0
def evaluate_opts(parser):
    """Register the evaluation options on ``parser``.

    All options are added to a new 'Evaluation of WMT Quality Estimation'
    argument group: input type and formats, gold standard files and
    prediction files.

    Args:
        parser: Parser that receives the new argument group.
    """
    group = parser.add_argument_group("Evaluation of WMT Quality Estimation")

    group.add_argument(
        "--type",
        help="Input type for prediction file",
        choices=["probs", "tags"],
        type=str,
        default="probs",
    )
    group.add_argument(
        "--format",
        help="Input format for gold files",
        choices=["wmt17", "wmt18"],
        type=str,
        default="wmt17",
    )
    # The help text used to claim this defaults to the value of --format,
    # but the registered default is the fixed string "wmt18" (while --format
    # defaults to "wmt17"); the text now states the actual default.
    group.add_argument(
        "--pred-format",
        help="Input format for predicted files. Defaults to wmt18.",
        choices=["wmt17", "wmt18"],
        type=str,
        default="wmt18",
    )
    # No explicit default is given for --sents-avg.
    group.add_argument(
        "--sents-avg",
        help="Obtain scores for sentences by averaging over tags or "
        "probabilities.",
        choices=["probs", "tags"],
        type=str,
    )

    # Gold files.
    group.add_argument(
        "--gold-sents",
        help="Sentences gold standard. ",
        type=PathType(exists=True),
        required=False,
    )
    group.add_argument(
        "--gold-target",
        help="Target tags gold standard, or target and gaps "
        'if format == "wmt18".',
        type=PathType(exists=True),
        required=False,
    )
    group.add_argument(
        "--gold-source",
        help="Source tags gold standard.",
        type=PathType(exists=True),
        required=False,
    )
    group.add_argument(
        "--gold-cal",
        help="Target Tags to calibrate.",
        type=PathType(exists=True),
        required=False,
    )

    # Prediction files. Most of these accept one or more paths (nargs="+").
    group.add_argument(
        "--input-dir",
        help="Directory with prediction files generated by predict pipeline. "
        "Setting this argument will evaluate all predictions for "
        "which a gold file is set.",
        nargs="+",
        type=PathType(exists=True),
    )
    group.add_argument(
        "--pred-sents",
        help="Sentences HTER predictions.",
        type=PathType(exists=True),
        nargs="+",
        required=False,
    )
    group.add_argument(
        "--pred-target",
        help="Target predictions; can be tags or probabilities (of BAD). "
        "See --type.",
        type=PathType(exists=True),
        nargs="+",
        required=False,
    )
    group.add_argument(
        "--pred-gaps",
        help="Gap predictions; can be tags or probabilities (of BAD). "
        "(see --type). Use this option for files that only contain gap "
        "tags.",
        type=PathType(exists=True),
        nargs="+",
        required=False,
    )
    group.add_argument(
        "--pred-source",
        help="Source predictions. can be tags or probabilities (of BAD). "
        " See --type.",
        type=PathType(exists=True),
        nargs="+",
        required=False,
    )
    # Unlike the other prediction options, --pred-cal takes a single path.
    group.add_argument(
        "--pred-cal",
        help="Target Predictions to calibrate.",
        type=PathType(exists=True),
        required=False,
    )