def __init__(self, name, pipeline_parser, pipeline_config_key, options_fn=None):
    """Build the command-line parser for a named pipeline.

    A configargparse parser is registered under ``name`` (with YAML config
    file support and no automatic ``--help``); when ``options_fn`` is given
    it is called to attach the pipeline-specific options.
    """
    self.name = name
    self._pipeline_parser = pipeline_parser
    # CLI keys are dash-separated; the internal config key uses underscores.
    self._pipeline_config_key = pipeline_config_key.replace('-', '_')

    parser_kwargs = dict(
        prog='kiwi {}'.format(self.name),
        add_help=False,
        config_file_parser_class=configargparse.YAMLConfigFileParser,
        ignore_unknown_config_file_keys=False,
    )
    self._parser = configargparse.get_argument_parser(self.name, **parser_kwargs)

    self._parser.add(
        '--config',
        is_config_file=True,
        required=False,
        type=PathType(exists=True),
        help='Load config file from path',
    )

    if options_fn is not None:
        options_fn(self._parser)
def _add_predicting_data_file_opts(parser):
    """Add the test-data file options used by the predict pipeline.

    Args:
        parser: argument parser (configargparse-style) to extend.

    Returns:
        The created 'data' argument group.
    """
    # Data options
    group = parser.add_argument_group('data')
    group.add_argument(
        '--test-source',
        type=PathType(exists=True),
        required=True,
        # Fixed: help used to say "validation source file" (copy-paste).
        help='Path to test source file',
    )
    group.add_argument(
        '--test-target',
        type=PathType(exists=True),
        required=True,
        help='Path to test target file',
    )
    return group
def _add_data_options(data_parser):
    """Add Predictor-Estimator (POSTECH) specific data options."""
    group = data_parser.add_argument_group(
        'PredEst data',
        description='Predictor Estimator specific data '
        'options. (POSTECH)',
    )
    group.add(
        '--extend-source-vocab',
        type=PathType(exists=True),
        # Fixed: the concatenated help string was missing a space and
        # rendered as "additional Data(Predictor)".
        help='Optionally load more data which is used only for vocabulary '
        'creation. Path to additional Data '
        '(Predictor)',
    )
    group.add(
        '--extend-target-vocab',
        type=PathType(exists=True),
        help='Optionally load more data which is used only for vocabulary '
        'creation. Path to additional Data '
        '(Predictor)',
    )
def add_predicting_data_file_opts(parser):
    """Add test-data file options (plus PoS / parse / ngram / stacked extras).

    Args:
        parser: argument parser to extend.

    Returns:
        The created 'data' argument group.
    """
    # Data options
    group = parser.add_argument_group('data')
    group.add_argument(
        '--test-source',
        type=PathType(exists=True),
        required=True,
        # Fixed: help used to say "validation source file" (copy-paste).
        help='Path to test source file',
    )
    group.add_argument(
        '--test-target',
        type=PathType(exists=True),
        required=True,
        help='Path to test target file',
    )
    group.add_argument(
        '--test-alignments',
        type=PathType(exists=True),
        help='Path to test alignments between source and target.',
    )
    group.add_argument(
        '--test-source-pos',
        type=PathType(exists=True),
        # Fixed: help used to say "training PoS tags" (copy-paste).
        help='Path to test PoS tags file for source',
    )
    group.add_argument(
        '--test-target-pos',
        type=PathType(exists=True),
        help='Path to test PoS tags file for target',
    )
    group.add_argument(
        '--test-target-parse',
        type=PathType(exists=True),
        help='Path to test dependency parsing file for target (tabular format)',
    )
    group.add_argument(
        '--test-target-ngram',
        type=PathType(exists=True),
        help='Path to test highest order ngram file for target (tabular '
        'format)',
    )
    group.add_argument(
        '--test-target-stacked',
        type=PathType(exists=True),
        help='Path to test stacked predictions file for target (tabular '
        'format)',
    )
    return group
def jackknife_opts(parser):
    """Register jackknifing (training-loop) options on *parser*."""
    jack_group = parser.add_argument_group('jackknifing')
    jack_group.add(
        '--splits',
        type=int,
        default=5,
        required=False,
        help='Jackknife with X folds.',
    )
    jack_group.add(
        '--train-config',
        type=PathType(exists=True),
        required=False,
        help='Path to config file with model parameters.',
    )
def add_training_options(training_parser):
    """Add Predictor-Estimator (POSTECH) training options.

    Also pulls in the pretraining options via ``add_pretraining_options``.
    All boolean flags accept an optional truthy/falsy string value
    (parsed via ``strtobool``) and default to the flag being present
    meaning True (``nargs='?', const=True``).
    """
    add_pretraining_options(training_parser)
    group = training_parser.add_argument_group(
        'predictor-estimator training',
        description='Predictor Estimator (POSTECH). These settings are used '
        ' to train the Predictor. They will be ignored if training a '
        ' Predictor-Estimator and the `load-model` flag is set.',
    )
    group.add_argument(
        '--start-stop',
        type=lambda x: bool(strtobool(x)),
        nargs='?',
        const=True,
        default=False,
        help='Append start and stop symbols to estimator feature sequence.',
    )
    group.add_argument(
        '--predict-gaps',
        type=lambda x: bool(strtobool(x)),
        nargs='?',
        const=True,
        default=False,
        help='Predict Gap Tags. Requires `train-gap-tags`, `valid-'
        'gap-tags` to be set.',
    )
    group.add_argument(
        '--predict-target',
        type=lambda x: bool(strtobool(x)),
        nargs='?',
        const=True,
        default=True,
        help='Predict Target Tags. Requires `train-target-tags`, `valid-'
        'target-tags` to be set.',
    )
    group.add_argument(
        '--predict-source',
        type=lambda x: bool(strtobool(x)),
        nargs='?',
        const=True,
        default=False,
        help='Predict Source Tags. Requires `train-source-tags`, `valid-'
        'source-tags` to be set.',
    )
    group.add_argument(
        '--load-pred-source',
        type=PathType(exists=True),
        help='If set, model architecture and vocabulary parameters are '
        'ignored. Load pretrained predictor tgt->src.',
    )
    group.add_argument(
        '--load-pred-target',
        type=PathType(exists=True),
        help='If set, model architecture and vocabulary parameters are '
        'ignored. Load pretrained predictor src->tgt.',
    )
    group.add_argument(
        '--rnn-layers-est', type=int, default=2, help='Layers in Estimator RNN'
    )
    group.add_argument(
        '--dropout-est', type=float, default=0.0, help='Dropout in estimator'
    )
    group.add_argument(
        '--hidden-est',
        type=int,
        default=100,
        help='Size of hidden layers in LSTM',
    )
    group.add_argument(
        '--mlp-est',
        type=lambda x: bool(strtobool(x)),
        nargs='?',
        const=True,
        default=False,
        help="""Pass the Estimator input through a linear layer
        reducing dimensionality before RNN.""",
    )
    group.add_argument(
        '--sentence-level',
        type=lambda x: bool(strtobool(x)),
        nargs='?',
        const=True,
        default=False,
        help="""Predict Sentence Level Scores.
        Requires setting `train-sentence-scores, valid-sentence-scores`""",
    )
    group.add_argument(
        '--sentence-ll',
        type=lambda x: bool(strtobool(x)),
        nargs='?',
        const=True,
        default=False,
        help="""Use probabilistic Loss for sentence scores instead of
        squared error. If set, the model will output mean and variance of
        a truncated Gaussian distribution over the interval [0, 1], and
        use the NLL of ground truth `hter` as the loss.
        This seems to improve performance, and gives you uncertainty
        estimates for sentence level predictions as a byproduct.
        If `sentence-level == False`, this is without effect.
        """,
    )
    group.add_argument(
        '--sentence-ll-predict-mean',
        type=lambda x: bool(strtobool(x)),
        nargs='?',
        const=True,
        default=False,
        # Fixed typo in help: "Guassian" -> "Gaussian".
        help="""If `sentence-ll == True`, by default the prediction for
        `hter` will be the mean of the Gaussian /before truncation/.
        After truncation, this will be the mode of the distribution,
        but not the mean as truncated Gaussian is skewed to one side.
        set this to `True` to use the True mean after truncation
        for prediction.
        """,
    )
    group.add_argument(
        '--use-probs',
        type=lambda x: bool(strtobool(x)),
        nargs='?',
        const=True,
        default=False,
        help='Predict scores as product/sum of word level probs',
    )
    group.add_argument(
        '--binary-level',
        type=lambda x: bool(strtobool(x)),
        nargs='?',
        const=True,
        default=False,
        help="""Predict binary sentence labels indicating `hter == 0.0`
        Requires setting `train-sentence-scores`,
        `valid-sentence-scores`""",
    )
    group.add_argument(
        '--token-level',
        type=lambda x: bool(strtobool(x)),
        nargs='?',
        const=True,
        default=False,
        help="""Continue training the predictor on the postedited text.
        If set, will do an additional forward pass through the predictor
        Using the SRC, PE pair and add the `Predictor` loss for the
        tokens in the postedited text PE. Recommended if you have
        access to PE. Requires setting `train-pe`, `valid-pe`""",
    )
    group.add_argument(
        '--target-bad-weight',
        type=float,
        default=3.0,
        help='Relative weight for target bad labels.',
    )
    group.add_argument(
        '--gaps-bad-weight',
        type=float,
        default=3.0,
        help='Relative weight for gaps bad labels.',
    )
    group.add_argument(
        '--source-bad-weight',
        type=float,
        default=3.0,
        help='Relative weight for source bad labels.',
    )
def _add_training_data_file_opts(parser):
    """Add training and validation data-file options.

    Validation files are optional because ``--split`` can carve a
    validation set out of the training data.
    """
    # Data options
    group = parser.add_argument_group('data')
    group.add_argument(
        '--train-source',
        type=PathType(exists=True),
        required=True,
        help='Path to training source file',
    )
    group.add_argument(
        '--train-target',
        type=PathType(exists=True),
        help='Path to training target file',
    )
    group.add_argument(
        '--train-source-tags',
        type=PathType(exists=True),
        # Fixed: help used to say "validation label file" (copy-paste).
        help='Path to training label file for source (WMT18 format)',
    )
    group.add_argument(
        '--train-target-tags',
        type=PathType(exists=True),
        help='Path to training label file for target',
    )
    group.add_argument(
        '--train-pe',
        type=PathType(exists=True),
        help='Path to file containing post-edited target.',
    )
    group.add_argument(
        '--train-sentence-scores',
        type=PathType(exists=True),
        help='Path to file containing sentence level scores.',
    )

    valid_group = parser.add_argument_group('validation data')
    valid_group.add_argument(
        '--split',
        type=float,
        help='Split Train dataset in case that no validation set is given.',
    )
    valid_group.add_argument(
        '--valid-source',
        type=PathType(exists=True),
        help='Path to validation source file',
    )
    valid_group.add_argument(
        '--valid-target',
        type=PathType(exists=True),
        help='Path to validation target file',
    )
    valid_group.add_argument(
        '--valid-alignments',
        type=str,
        help='Path to valid alignments between source and target.',
    )
    valid_group.add_argument(
        '--valid-source-tags',
        type=PathType(exists=True),
        help='Path to validation label file for source (WMT18 format)',
    )
    valid_group.add_argument(
        '--valid-target-tags',
        type=PathType(exists=True),
        help='Path to validation label file for target',
    )
    valid_group.add_argument(
        '--valid-pe',
        type=PathType(exists=True),
        help='Path to file containing postedited target.',
    )
    valid_group.add_argument(
        '--valid-sentence-scores',
        type=PathType(exists=True),
        help='Path to file containing sentence level scores.',
    )
def _add_vocabulary_opts(parser):
    """Add vocabulary and embeddings options for the predictor-estimator."""
    group = parser.add_argument_group(
        'vocabulary options',
        description='Options for loading vocabulary from a previous run. '
        'This is used for e.g. training a source predictor via predict-'
        'inverse: True ; If set, other vocab options are ignored',
    )
    group.add_argument(
        '--source-vocab-size',
        type=int,
        default=None,
        help='Size of the source vocabulary.',
    )
    group.add_argument(
        '--target-vocab-size',
        type=int,
        default=None,
        help='Size of the target vocabulary.',
    )
    group.add_argument(
        '--source-vocab-min-frequency',
        type=int,
        default=1,
        help='Min word frequency for source vocabulary.',
    )
    group.add_argument(
        '--target-vocab-min-frequency',
        type=int,
        default=1,
        help='Min word frequency for target vocabulary.',
    )
    # Embeddings options added for the predictor-estimator
    # (source-embeddings / target-embeddings).
    # (Replaces former bare-string pseudo-comments, which were no-op
    # statements, not real comments.)
    group.add_argument(
        '--embeddings-format',
        type=str,
        default='polyglot',
        choices=['polyglot', 'word2vec', 'fasttext', 'glove', 'text'],
        help='Word embeddings format. '
        'See README for specific formatting instructions.',
    )
    group.add_argument(
        '--embeddings-binary',
        type=lambda x: bool(strtobool(x)),
        nargs='?',
        const=True,
        default=False,
        help='Load embeddings stored in binary.',
    )
    group.add_argument(
        '--source-embeddings',
        type=PathType(exists=True),
        help='Path to word embeddings file for source.',
    )
    group.add_argument(
        '--target-embeddings',
        type=PathType(exists=True),
        help='Path to word embeddings file for target.',
    )
def add_training_data_file_opts(parser):
    """Add training and validation data-file options.

    Returns:
        The created 'data' argument group.
    """
    # Data options
    group = parser.add_argument_group('data')
    group.add_argument(
        '--train-source',
        type=PathType(exists=True),
        required=True,
        help='Path to training source file',
    )
    group.add_argument(
        '--train-target',
        type=PathType(exists=True),
        required=True,
        help='Path to training target file',
    )
    group.add_argument(
        '--train-alignments',
        type=str,
        required=True,
        help='Path to train alignments between source and target.',
    )
    group.add_argument(
        '--train-source-tags',
        type=PathType(exists=True),
        help='Path to training label file for source (WMT18 format)',
    )
    group.add_argument(
        '--train-target-tags',
        type=PathType(exists=True),
        help='Path to training label file for target',
    )
    group.add_argument(
        '--valid-source',
        type=PathType(exists=True),
        required=True,
        help='Path to validation source file',
    )
    group.add_argument(
        '--valid-target',
        type=PathType(exists=True),
        required=True,
        help='Path to validation target file',
    )
    group.add_argument(
        '--valid-alignments',
        type=str,
        required=True,
        help='Path to valid alignments between source and target.',
    )
    group.add_argument(
        '--valid-source-tags',
        type=PathType(exists=True),
        help='Path to validation label file for source (WMT18 format)',
    )
    group.add_argument(
        '--valid-target-tags',
        type=PathType(exists=True),
        help='Path to validation label file for target',
    )
    group.add_argument(
        '--valid-source-pos',
        type=PathType(exists=True),
        # Fixed: help used to say "training PoS tags file" (copy-paste).
        help='Path to validation PoS tags file for source',
    )
    group.add_argument(
        '--valid-target-pos',
        type=PathType(exists=True),
        help='Path to validation PoS tags file for target',
    )
    return group
def add_vocabulary_opts(parser):
    """Add vocabulary-size, frequency and embeddings options.

    Returns:
        The created 'vocabulary options' argument group.
    """
    group = parser.add_argument_group('vocabulary options')
    group.add_argument(
        '--source-vocab-size',
        type=int,
        default=None,
        help='Size of the source vocabulary.',
    )
    group.add_argument(
        '--target-vocab-size',
        type=int,
        default=None,
        help='Size of the target vocabulary.',
    )
    group.add_argument(
        '--source-vocab-min-frequency',
        type=int,
        default=1,
        help='Min word frequency for source vocabulary.',
    )
    group.add_argument(
        '--target-vocab-min-frequency',
        type=int,
        default=1,
        help='Min word frequency for target vocabulary.',
    )
    group.add_argument(
        '--keep-rare-words-with-embeddings',
        type=lambda x: bool(strtobool(x)),
        nargs='?',
        const=True,
        default=False,
        # Fixed typo in help: "less then" -> "less than".
        help='Keep words that occur less than min-frequency but '
        'are in embeddings vocabulary.',
    )
    group.add_argument(
        '--add-embeddings-vocab',
        type=lambda x: bool(strtobool(x)),
        nargs='?',
        const=True,
        default=False,
        help='Add words from embeddings vocabulary to source/target '
        'vocabulary.',
    )
    group.add_argument(
        '--embeddings-format',
        type=str,
        default='polyglot',
        choices=['polyglot', 'word2vec', 'fasttext', 'glove', 'text'],
        help='Word embeddings format. '
        'See README for specific formatting instructions.',
    )
    group.add_argument(
        '--embeddings-binary',
        type=lambda x: bool(strtobool(x)),
        nargs='?',
        const=True,
        default=False,
        help='Load embeddings stored in binary.',
    )
    group.add_argument(
        '--source-embeddings',
        type=PathType(exists=True),
        help='Path to word embeddings file for source.',
    )
    group.add_argument(
        '--target-embeddings',
        type=PathType(exists=True),
        help='Path to word embeddings file for target.',
    )
    return group
# Top-level parser for the `search` pipeline.
parser = configargparse.get_argument_parser('search')
parser.add_argument(
    '-e',
    '--experiment-name',
    required=False,
    help='MLflow will log this run under this experiment name, '
    'which appears as a separate section in the UI. It '
    'will also be used in some messages and files.',
)
parser.add(
    '-c',
    '--config',
    required=True,
    # NOTE(review): is_config_file=False means configargparse will NOT load
    # this file itself — presumably the search pipeline reads and expands it
    # manually. Confirm this is intentional.
    is_config_file=False,
    type=PathType(exists=True),
    help='Load config file from path',
)
group = parser.add_argument_group('models')
# Positional argument: which registered model to search over.
group.add_argument('model_name', choices=Model.subclasses.keys())


def get_action(option):
    """Return the train-parser action that accepts `option` as one of its
    config-file keys, or None if no action matches."""
    # NOTE: reaches into the private `_actions` list of the train parser.
    for action in train.parser._actions:
        if option in train.parser.get_possible_config_keys(action):
            return action
    return None


def split_options(options):
def add_training_data_file_opts(parser):
    """Add training and validation data-file options, including
    PoS / dependency-parse / ngram / stacked-prediction extras.
    """
    # Data options
    group = parser.add_argument_group('data')
    group.add_argument(
        '--train-source',
        type=PathType(exists=True),
        help='Path to training source file',
    )
    group.add_argument(
        '--train-target',
        type=PathType(exists=True),
        help='Path to training target file',
    )
    group.add_argument(
        '--train-alignments',
        type=str,
        help='Path to train alignments between source and target.',
    )
    group.add_argument(
        '--train-source-tags',
        type=PathType(exists=True),
        # Fixed: help used to say "validation label file" (copy-paste).
        help='Path to training label file for source (WMT18 format)',
    )
    group.add_argument(
        '--train-target-tags',
        type=PathType(exists=True),
        help='Path to training label file for target',
    )
    group.add_argument(
        '--train-source-pos',
        type=PathType(exists=True),
        help='Path to training PoS tags file for source',
    )
    group.add_argument(
        '--train-target-pos',
        type=PathType(exists=True),
        help='Path to training PoS tags file for target',
    )
    group.add_argument(
        '--train-target-parse',
        type=PathType(exists=True),
        help='Path to training dependency parsing file for target (tabular '
        'format)',
    )
    group.add_argument(
        '--train-target-ngram',
        type=PathType(exists=True),
        help='Path to training highest order ngram file for target (tabular '
        'format)',
    )
    group.add_argument(
        '--train-target-stacked',
        type=PathType(exists=True),
        help='Path to training stacked predictions file for target (tabular '
        'format)',
    )

    valid_group = parser.add_argument_group('validation data')
    valid_group.add_argument(
        '--valid-source',
        type=PathType(exists=True),
        help='Path to validation source file',
    )
    valid_group.add_argument(
        '--valid-target',
        type=PathType(exists=True),
        help='Path to validation target file',
    )
    valid_group.add_argument(
        '--valid-alignments',
        type=str,
        help='Path to valid alignments between source and target.',
    )
    valid_group.add_argument(
        '--valid-source-tags',
        type=PathType(exists=True),
        help='Path to validation label file for source (WMT18 format)',
    )
    valid_group.add_argument(
        '--valid-target-tags',
        type=PathType(exists=True),
        help='Path to validation label file for target',
    )
    valid_group.add_argument(
        '--valid-source-pos',
        type=PathType(exists=True),
        # Fixed: help used to say "training PoS tags file" (copy-paste).
        help='Path to validation PoS tags file for source',
    )
    valid_group.add_argument(
        '--valid-target-pos',
        type=PathType(exists=True),
        help='Path to validation PoS tags file for target',
    )
    valid_group.add_argument(
        '--valid-target-parse',
        type=PathType(exists=True),
        help='Path to validation dependency parsing file for target (tabular '
        'format)',
    )
    valid_group.add_argument(
        '--valid-target-ngram',
        type=PathType(exists=True),
        help='Path to validation highest order ngram file for target (tabular '
        'format)',
    )
    valid_group.add_argument(
        '--valid-target-stacked',
        type=PathType(exists=True),
        help='Path to validation stacked predictions file for target (tabular '
        'format)',
    )
def evaluate_opts(parser):
    """Add options for evaluating WMT Quality Estimation predictions.

    Registers gold-standard file options, prediction file options, and
    format/type selectors on *parser*.
    """
    # Evaluation options
    group = parser.add_argument_group("Evaluation of WMT Quality Estimation")
    group.add_argument(
        "--type",
        help="Input type for prediction file",
        choices=["probs", "tags"],
        type=str,
        default="probs",
    )
    group.add_argument(
        "--format",
        help="Input format for gold files",
        choices=["wmt17", "wmt18"],
        type=str,
        default="wmt17",
    )
    group.add_argument(
        "--pred-format",
        # NOTE(review): the help text claims the default follows --format,
        # but the default here is hard-coded to "wmt18" — confirm whether
        # default=None (fall back to --format downstream) was intended.
        help="Input format for predicted files. Defaults to the same as "
        "--format.",
        choices=["wmt17", "wmt18"],
        type=str,
        default="wmt18",
    )
    group.add_argument(
        "--sents-avg",
        help="Obtain scores for sentences by averaging over tags or "
        "probabilities.",
        choices=["probs", "tags"],
        type=str,
        # default=None
    )
    # Gold files.
    group.add_argument(
        "--gold-sents",
        help="Sentences gold standard. ",
        type=PathType(exists=True),
        required=False,
    )
    group.add_argument(
        "--gold-target",
        help="Target tags gold standard, or target and gaps "
        'if format == "wmt18".',
        type=PathType(exists=True),
        required=False,
    )
    group.add_argument(
        "--gold-source",
        help="Source tags gold standard.",
        type=PathType(exists=True),
        required=False,
    )
    group.add_argument(
        "--gold-cal",
        help="Target Tags to calibrate.",
        type=PathType(exists=True),
        required=False,
    )
    # Prediction Files
    group.add_argument(
        "--input-dir",
        help="Directory with prediction files generated by predict pipeline. "
        "Setting this argument will evaluate all predictions for "
        "which a gold file is set.",
        nargs="+",
        type=PathType(exists=True),
        # required=True
    )
    group.add_argument(
        "--pred-sents",
        help="Sentences HTER predictions.",
        type=PathType(exists=True),
        nargs="+",
        required=False,
    )
    group.add_argument(
        "--pred-target",
        help="Target predictions; can be tags or probabilities (of BAD). "
        "See --type.",
        type=PathType(exists=True),
        nargs="+",
        required=False,
    )
    group.add_argument(
        "--pred-gaps",
        help="Gap predictions; can be tags or probabilities (of BAD). "
        "(see --type). Use this option for files that only contain gap "
        "tags.",
        type=PathType(exists=True),
        nargs="+",
        required=False,
    )
    group.add_argument(
        "--pred-source",
        help="Source predictions. can be tags or probabilities (of BAD). "
        " See --type.",
        type=PathType(exists=True),
        nargs="+",
        required=False,
    )
    group.add_argument(
        "--pred-cal",
        help="Target Predictions to calibrate.",
        type=PathType(exists=True),
        required=False,
    )