Example #1
class TLSCMDConfig:
    key: str = field(
        "Path to key file", default="server.key",
    )
    cert: str = field(
        "Path to cert file", default="server.pem",
    )
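Note: `field` in these examples is DFFML's wrapper around `dataclasses.field`. A minimal, hypothetical sketch of how such a wrapper could work, assuming the description and the CLI-oriented keywords seen below (`action`, `required`, `labeled`) are stored in the field's metadata:

import dataclasses
from typing import Any

def field(description: str, **kwargs: Any) -> Any:
    # Hypothetical re-implementation for illustration only: stash the
    # help description (and any non-dataclass keywords) in metadata.
    metadata = kwargs.pop("metadata", None) or {}
    metadata["description"] = description
    for extra in ("action", "required", "labeled"):
        if extra in kwargs:
            metadata[extra] = kwargs.pop(extra)
    return dataclasses.field(metadata=metadata, **kwargs)

@dataclasses.dataclass
class DemoTLSConfig:
    key: str = field("Path to key file", default="server.key")
    cert: str = field("Path to cert file", default="server.pem")

for f in dataclasses.fields(DemoTLSConfig):
    print(f.name, "->", f.metadata["description"])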
Example #2
class DNNClassifierModelConfig:
    predict: Feature = field("Feature name holding predict value")
    classifications: List[str] = field("Options for value of classification")
    features: Features = field("Features to train on")
    clstype: Type = field("Data type of classifications values", default=str)
    batchsize: int = field(
        "Number repos to pass through in an epoch", default=20
    )
    shuffle: bool = field("Randomise order of repos in a batch", default=True)
    steps: int = field("Number of steps to train the model", default=3000)
    epochs: int = field(
        "Number of iterations to pass over all repos in a source", default=30
    )
    directory: str = field(
        "Directory where state should be saved",
        default=os.path.join(
            os.path.expanduser("~"), ".cache", "dffml", "tensorflow"
        ),
    )
    hidden: List[int] = field(
        "List length is the number of hidden layers in the network. Each entry in the list is the number of nodes in that hidden layer",
        default_factory=lambda: [12, 40, 15],
    )

    def __post_init__(self):
        self.classifications = list(map(self.clstype, self.classifications))
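The `__post_init__` cast is simple to see in isolation: every classification label is converted to the configured `clstype`. A standalone illustration:

# Standalone illustration of the __post_init__ conversion above
clstype = int
classifications = ["0", "1", "2"]
classifications = list(map(clstype, classifications))
assert classifications == [0, 1, 2]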
Example #3
class SLRConfig:
    predict: Feature = field("Label or the value to be predicted")
    features: Features = field("Features to train on")
    directory: str = field(
        "Directory where state should be saved",
        default=os.path.join(os.path.expanduser("~"), ".cache", "dffml",
                             "scratch"),
    )
Example #4
class MultiCommCMDConfig:
    mc_config: str = field(
        "MultiComm config directory", default=None,
    )
    mc_atomic: bool = field(
        "No routes other than dataflows registered at startup",
        action="store_true",
        default=False,
    )
Example #5
class DataFlowSourceConfig:
    source: BaseSource = field("Source to wrap")
    dataflow: DataFlow = field("DataFlow to use for preprocessing")
    features: Features = field(
        "Features to pass as definitions to each context from each "
        "record to be preprocessed",
        default=Features(),
    )
    inputs: List[str] = field(
        "Other inputs to add under each ctx (record's key will " +
        "be used as the context)",
        action=ParseInputsAction,
        default_factory=lambda: [],
    )
    record_def: str = field(
        "Definition to be used for record.key."
        "If set, record.key will be added to the set of inputs "
        "under each context (which is also the record's key)",
        default=None,
    )
    length: str = field("Definition name to add as source length",
                        default=None)
    all_for_single: bool = field(
        "Run all records through dataflow before grabing "
        "results of desired record on a call to record()",
        default=False,
    )
    no_strict: bool = field(
        "Do not exit on operation exceptions, just log errors",
        default=False,
    )
    orchestrator: BaseOrchestrator = field(
        "Orchestrator",
        default=MemoryOrchestrator.withconfig({}),
    )
Example #6
class DNNClassifierModelConfig(TensorflowBaseConfig):
    classifications: List[str] = field("Options for value of classification",
                                       default=None)
    clstype: Type = field("Data type of classifications values", default=str)
    batchsize: int = field("Number of records to pass through in an epoch",
                           default=20)
    shuffle: bool = field("Randomise order of records in a batch",
                          default=True)

    def __post_init__(self):
        self.classifications = list(map(self.clstype, self.classifications))
Example #7
class PyTorchPreTrainedModelConfig(PyTorchModelConfig):
    pretrained: bool = field(
        "Load Pre-trained model weights",
        default=True,
    )
    trainable: bool = field("Tweak pretrained model by training again",
                            default=False)
    add_layers: bool = field(
        "Add layers on top of pretrained model",
        default=False,
    )
    layers: dict = field("Extra layers to be added on top of pretrained model",
                         default=None)
Example #8
class PyTorchPreTrainedModelConfig(PyTorchModelConfig):
    pretrained: bool = field(
        "Load Pre-trained model weights", default=True,
    )
    trainable: bool = field(
        "Tweak pretrained model by training again", default=False
    )
    add_layers: bool = field(
        "Replace the last layer of the pretrained model", default=False,
    )
    layers: dict = field(
        "Extra layers to replace the last layer of the pretrained model",
        default=None,
    )
Example #9
class ServerConfig(TLSCMDConfig, MultiCommCMDConfig):
    port: int = field(
        "Port to bind to",
        default=8080,
    )
    addr: str = field(
        "Address to bind to",
        default="127.0.0.1",
    )
    upload_dir: str = field(
        "Directory to store uploaded files in",
        default=None,
    )
    static: str = field(
        "Directory to serve static content from",
        default=None,
    )
    js: bool = field(
        "Serve JavaScript API file at /api.js",
        default=False,
        action="store_true",
    )
    insecure: bool = field(
        "Start without TLS encryption",
        action="store_true",
        default=False,
    )
    cors_domains: List[str] = field(
        "Domains to allow CORS for (see keys in defaults dict for aiohttp_cors.setup)",
        default_factory=lambda: [],
    )
    models: Model = field(
        "Models configured on start",
        default_factory=lambda: AsyncContextManagerList(),
        action=list_action(AsyncContextManagerList),
        labeled=True,
    )
    sources: Sources = field(
        "Sources configured on start",
        default_factory=lambda: Sources(),
        action=list_action(Sources),
        labeled=True,
    )
    redirect: List[str] = field(
        "list of METHOD SOURCE_PATH DESTINATION_PATH pairs, number of elements must be divisible by 3",
        action=ParseRedirectsAction,
        default_factory=lambda: [],
    )
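The `redirect` format is easiest to see by example. A hypothetical value satisfying the divisible-by-3 rule, describing two redirects:

# Hypothetical redirect list: two METHOD SOURCE_PATH DESTINATION_PATH triples
redirect = [
    "GET", "/", "/index.html",
    "POST", "/upload", "/api/upload",
]
assert len(redirect) % 3 == 0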
Example #10
class FakeTestingConfig2:
    name: str = field("Name of FakeTesting2")
    num: float
    features: Features = Features(
        DefFeature("default", int, 1), DefFeature("features", int, 10)
    )
    label: str = "unlabeled"
Example #11
def scikit_doc_to_field(type_str, param):
    default = param.default
    if default is inspect.Parameter.empty:
        default = scikit_get_default(type_str)

    type_cls = Any

    # Set of choices
    if "{'" in type_str and "'}" in type_str:
        type_cls = str
    elif "{" in type_str and "}" in type_str:
        type_cls = int
        if "." in type_str:
            type_cls = float
    else:
        type_split = list(
            map(lambda x: x.lower(),
                type_str.replace(",", "").split()))
        for scikit_type_name, python_type in SCIKIT_DOCS_TYPE_MAP.items():
            if scikit_type_name in type_split:
                type_cls = python_type

    if type_cls == Any and default is not None:
        type_cls = type(default)

    return type_cls, field(type_str, default=default)
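A hypothetical call, assuming `SCIKIT_DOCS_TYPE_MAP` maps docstring words such as "float" to the corresponding Python types and `scikit_get_default` supplies fallback defaults:

import inspect

# A parameter whose scikit-learn docstring declares "float, optional"
# and whose signature default is 1.0
param = inspect.Parameter(
    "alpha", inspect.Parameter.POSITIONAL_OR_KEYWORD, default=1.0
)
type_cls, alpha_field = scikit_doc_to_field("float, optional", param)
assert type_cls is float  # "float" appears in the split type string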
Example #12
class FakeTestingConfig:
    num: float
    files: List[str]
    features: Features
    name: str = field("Name of FakeTesting")
    label: str = "unlabeled"
    readonly: bool = False
    source: BaseSource = JSONSource
Example #13
class CreateTLSClientConfig:
    bits: int = field(
        "Number of bits to use for key",
        default=4096,
    )
    key: str = field(
        "Path to client key file",
        default="client.key",
    )
    cert: str = field(
        "Path to client cert file",
        default="client.pem",
    )
    csr: str = field(
        "Path to client csr file",
        default="client.csr",
    )
    server_key: str = field(
        "Path to server key file",
        default="server.key",
    )
    server_cert: str = field(
        "Path to server cert file",
        default="server.pem",
    )
Example #14
def inspect_pytorch_params(cls: Callable):
    parameters = inspect.signature(cls).parameters
    args = {}

    for param_name, param in parameters.items():
        args[param_name] = (
            param.annotation,
            field(
                param_name,
                default=param.default
                if param.default is not inspect.Parameter.empty else None,
            ),
        )

    return args
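A hypothetical usage sketch: for any callable, the helper returns an `(annotation, field)` pair per constructor parameter, substituting None when the parameter has no default:

class DemoLayer:
    def __init__(self, size: int = 3, bias: bool = True):
        self.size, self.bias = size, bias

args = inspect_pytorch_params(DemoLayer)
# args == {
#     "size": (int, field("size", default=3)),
#     "bias": (bool, field("bias", default=True)),
# }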
Example #15
class TensorflowBaseConfig:
    predict: Feature = field("Feature name holding target values")
    features: Features = field("Features to train on")
    directory: pathlib.Path = field("Directory where state should be saved")
    steps: int = field("Number of steps to train the model", default=3000)
    epochs: int = field(
        "Number of iterations to pass over all records in a source", default=30
    )
    hidden: List[int] = field(
        "List length is the number of hidden layers in the network. Each entry in the list is the number of nodes in that hidden layer",
        default_factory=lambda: [12, 40, 15],
    )
Example #16
class DNNRegressionModelConfig:
    predict: str = field("Feature name holding target values")
    features: Features = field("Features to train on")
    steps: int = field("Number of steps to train the model", default=3000)
    epochs: int = field(
        "Number of iterations to pass over all repos in a source", default=30)
    directory: str = field(
        "Directory where state should be saved",
        default=os.path.join(os.path.expanduser("~"), ".cache", "dffml",
                             "tensorflow"),
    )
    hidden: List[int] = field(
        "List length is the number of hidden layers in the network. Each entry in the list is the number of nodes in that hidden layer",
        default_factory=lambda: [12, 40, 15],
    )
Example #17
class HFClassificationModelConfig:
    features: Features = field("Features to train on")
    predict: Feature = field("Feature holding target values")
    label_list: List[str] = field("List of target labels")
    cache_dir: str = field(
        "Directory to store the pre-trained models downloaded from s3")
    model_name_or_path: str = field(
        "Path to pretrained model or model identifier from huggingface.co/models",
    )
    directory: str = field(
        "The output directory where the model predictions and checkpoints will be written.",
    )
    logging_dir: str = field("Tensorboard log dir.")
    from_pt: bool = field(
        "Whether to load model from pytorch checkpoint or .bin file",
        default=False,
    )
    clstype: Type = field("Data type of classifications values", default=str)
    max_seq_length: int = field(
        "The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.",
        default=128,
    )
    overwrite_cache: bool = field(
        "Overwrite the cached training and evaluation sets",
        default=False,
    )

    config_name: str = field(
        "Pretrained config name or path if not the same as model_name",
        default=None,
    )
    tokenizer_name: str = field(
        "Pretrained tokenizer name or path if not the same as model_name",
        default=None,
    )
    use_fast: bool = field("Set this flag to use fast tokenization.",
                           default=False)
    doc_stride: int = field(
        "When splitting up a long document into chunks, how much stride to take between chunks.",
        default=128,
    )
    optimizer_name: str = field(
        'Name of a Tensorflow optimizer among "adadelta, adagrad, adam, adamax, ftrl, nadam, rmsprop, sgd, adamw"',
        default="adam",
    )
    loss_name: str = field(
        "Name of a Tensorflow loss. For the list see: https://www.tensorflow.org/api_docs/python/tf/keras/losses",
        default="SparseCategoricalCrossentropy",
    )
    gpus: str = field(
        "List of gpu devices. If only one, switch to single gpu strategy, if None takes all availabel gpus",
        default="0",
    )

    end_lr: float = field("End learning rate for optimizer", default=0)
    debug: bool = field(
        "Activate the trace to record computation graphs and profiling information",
        default=False,
    )
    overwrite_directory: bool = field(
        "Overwrite the content of the output directory.Use this to continue training if directory points to a checkpoint directory.",
        default=False,
    )

    evaluate_during_training: bool = field(
        "Run evaluation during training at each logging step.",
        default=False,
    )

    per_device_train_batch_size: int = field(
        "Batch size per GPU/TPU core/CPU for training.",
        default=8,
    )
    per_device_eval_batch_size: int = field(
        "Batch size per GPU/TPU core/CPU for evaluation.",
        default=8,
    )

    gradient_accumulation_steps: int = field(
        "Number of updates steps to accumulate before performing a backward/update pass.",
        default=1,
    )

    learning_rate: float = field(
        "The initial learning rate for Adam.",
        default=5e-5,
    )
    weight_decay: float = field(
        "Weight decay if we apply some.",
        default=0.0,
    )
    adam_epsilon: float = field(
        "Epsilon for Adam optimizer.",
        default=1e-8,
    )
    max_grad_norm: float = field(
        "Max gradient norm.",
        default=1.0,
    )

    num_train_epochs: float = field(
        "Total number of training epochs to perform.",
        default=1,
    )
    max_steps: int = field(
        "If > 0: set total number of training steps to perform. Override num_train_epochs.",
        default=-1,
    )
    warmup_steps: int = field(
        "Linear warmup over warmup_steps.",
        default=0,
    )

    logging_first_step: bool = field(
        "Log and eval the first global_step",
        default=False,
    )
    logging_steps: int = field(
        "Log every X updates steps.",
        default=500,
    )
    save_steps: int = field(
        "Save checkpoint every X updates steps.",
        default=500,
    )
    save_total_limit: int = field(
        "Limit the total amount of checkpoints.Deletes the older checkpoints in the directory. Default is unlimited checkpoints",
        default=None,
    )
    no_cuda: bool = field(
        "Do not use CUDA even when it is available",
        default=False,
    )
    seed: int = field(
        "random seed for initialization",
        default=42,
    )

    fp16: bool = field(
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
        default=False,
    )
    fp16_opt_level: str = field(
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
        default="O1",
    )

    local_rank: int = field(
        "For distributed training: local_rank",
        default=-1,
    )
    dataloader_drop_last: bool = field(
        "Drop the last incomplete batch if the length of the dataset is not divisible by the batch size",
        default=False,
    )
    past_index: int = field(
        "Some models can make use of the past hidden states for their predictions. If this argument is set to a positive int, the `Trainer` will use the corresponding output (usually index 2) as the past state and feed it to the model at the next training step under the keyword argument `mems` ",
        default=-1,
    )

    def to_json_string(self):
        config_dict = export(self)
        for key in ["features", "predict", "clstype"]:
            config_dict.pop(key)
        return json.dumps(config_dict, indent=2)

    def __post_init__(self):
        self.output_dir = self.directory
        self.tf = importlib.import_module("tensorflow")
        self.label_list = list(map(self.clstype, self.label_list))
        self.task_name = "sst-2"
        self.mode = "text-classification"
        if len(self.features) > 1:
            raise ValueError("Found more than one feature to train on")
        if self.fp16:
            self.tf.config.optimizer.set_experimental_options(
                {"auto_mixed_precision": True})
        if len(self.gpus.split(",")) > 1:
            self.n_replicas = len(
                [f"/gpu:{gpu}" for gpu in self.gpus.split(",")])
            self.strategy = self.tf.distribute.MirroredStrategy(
                devices=[f"/gpu:{gpu}" for gpu in self.gpus.split(",")])
        elif self.no_cuda:
            self.n_replicas = 1
            self.strategy = self.tf.distribute.OneDeviceStrategy(
                device="/cpu:0")
        else:
            self.n_replicas = len(self.gpus.split(","))
            self.strategy = self.tf.distribute.OneDeviceStrategy(
                device="/gpu:" + self.gpus.split(",")[0])
        self.train_batch_size = self.per_device_train_batch_size * max(
            1, self.n_replicas)
        self.eval_batch_size = self.per_device_eval_batch_size * max(
            1, self.n_replicas)
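The effective batch sizes computed at the end of `__post_init__` scale the per-device values by the replica count derived from the `gpus` string:

# Replica count and effective batch size, as computed above
gpus = "0,1,2,3"
per_device_train_batch_size = 8
n_replicas = len(gpus.split(","))  # four devices
train_batch_size = per_device_train_batch_size * max(1, n_replicas)
assert train_batch_size == 32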
Example #18
class NERModelConfig:
    sid: Feature = field(
        "Unique Id to identify words of each sentence (Sentence ID)"
    )
    words: Feature = field("Tokens to train NER model")
    predict: Feature = field("NER Tags (B-MISC, I-PER, O etc.) for tokens")
    model_architecture_type: str = field(
        "Model architecture selected in the : "
        + ", ".join(ORIGINAL_NER_MODELS.keys())
    )
    model_name_or_path: str = field(
        "Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS)
    )
    output_dir: str = field(
        "The output directory where the model checkpoints will be written",
        default=str(
            pathlib.Path("~", ".cache", "dffml", "transformers", "checkpoints")
        ),
    )
    config_name: str = field(
        "Pretrained config name or path if not the same as model_name",
        default=None,
    )
    tokenizer_name: str = field(
        "Pretrained tokenizer name or path if not the same as model_name",
        default=None,
    )
    cache_dir: str = field(
        "Directory to store the pre-trained models downloaded from s3",
        default=str(pathlib.Path("~", ".cache", "dffml", "transformers")),
    )
    max_seq_length: int = field(
        "The maximum total input sentence length after tokenization.Sequences longer than this will be truncated, sequences shorter will be padded",
        default=128,
    )
    max_steps: int = field(
        "If greater than zero then sets total number of training steps to perform. Overrides `epochs`",
        default=0,
    )
    use_fp16: bool = field(
        "Whether to use 16-bit (mixed) precision instead of 32-bit",
        default=False,
    )
    ner_tags: List[str] = field(
        "List of all distinct NER Tags",
        default_factory=lambda: [
            "O",
            "B-MISC",
            "I-MISC",
            "B-PER",
            "I-PER",
            "B-ORG",
            "I-ORG",
            "B-LOC",
            "I-LOC",
        ],
    )
    do_lower_case: bool = field(
        "Set this flag if using uncased model.", default=False
    )
    gradient_accumulation_steps: int = field(
        "Number of updates steps to accumulate before performing a backward pass.",
        default=1,
    )
    learning_rate: float = field(
        "The initial learning rate for Adam", default=5e-5
    )
    weight_decay: float = field("Weight decay", default=0.0)
    adam_epsilon: float = field("Epsilon for Adam optimizer", default=1e-8)
    max_grad_norm: float = field("Max gradient norm.", default=1.0)
    epochs: int = field(
        "Total number of training epochs to perform.", default=1
    )
    warmup_steps: int = field("Linear warmup over warmup_steps.", default=0)
    save_steps: int = field(
        "Save checkpoint every X update steps.", default=10
    )
    seed: int = field("Random seed for initialization", default=2020)
    gpus: str = field(
        "List of gpu devices. If only one, switch to single gpu strategy, if None takes all availabel gpus",
        default="0",
    )
    tpu: str = field(
        "The Cloud TPU to use for training. This should be either the name used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 url",
        default=None,
    )
    num_tpu_cores: int = field("Total number of TPU cores to use.", default=8)
    per_device_train_batch_size: int = field(
        "Batch size per GPU/CPU/TPU for training", default=8
    )
    per_device_eval_batch_size: int = field(
        "Batch size per GPU/CPU/TPU for assessing accuracy", default=8
    )
    no_cuda: bool = field("Avoid using CUDA when available", default=False)
    eval_all_checkpoints: bool = field(
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
        default=False,
    )

    def __post_init__(self):
        self.tf = importlib.import_module("tensorflow")
        if self.use_fp16:
            self.tf.config.optimizer.set_experimental_options(
                {"auto_mixed_precision": True}
            )
        if self.tpu:
            resolver = self.tf.distribute.cluster_resolver.TPUClusterResolver(
                tpu=self.tpu
            )
            self.tf.config.experimental_connect_to_cluster(resolver)
            self.tf.tpu.experimental.initialize_tpu_system(resolver)
            self.strategy = self.tf.distribute.experimental.TPUStrategy(
                resolver
            )
            self.n_device = self.num_tpu_cores
        elif len(self.gpus.split(",")) > 1:
            self.n_device = len(
                [f"/gpu:{gpu}" for gpu in self.gpus.split(",")]
            )
            self.strategy = self.tf.distribute.MirroredStrategy(
                devices=[f"/gpu:{gpu}" for gpu in self.gpus.split(",")]
            )
        elif self.no_cuda:
            self.n_device = 1
            self.strategy = self.tf.distribute.OneDeviceStrategy(
                device="/cpu:0"
            )
        else:
            self.n_device = len(self.gpus.split(","))
            self.strategy = self.tf.distribute.OneDeviceStrategy(
                device="/gpu:" + self.gpus.split(",")[0]
            )
Example #19
class XGBRegressorModelConfig:
    directory: pathlib.Path = field("Directory where model should be saved")
    features: Features = field("Features on which we train the model")
    predict: Feature = field("Value to be predicted")
    learning_rate: float = field("Learning rate to train with", default=0.05)
    n_estimators: int = field(
        "Number of gradient boosted trees. Equivalent to the number of boosting rounds",
        default=1000,
    )
    max_depth: int = field("Maximum tree depth for base learners", default=6)
    subsample: float = field("Subsample ratio of the training instance",
                             default=1)
    gamma: float = field(
        "Minimium loss reduction required to make a furthre partition on a leaf node",
        default=0,
    )
    n_jobs: int = field("Number of parallel threads used to run xgboost",
                        default=-1)
    colsample_bytree: float = field(
        "Subsample ratio of columns when constructing each tree", default=1)
    booster: str = field(
        "Specify which booster to use: gbtree, gblinear or dart",
        default="gbtree",
    )
    min_child_weight: float = field(
        "Minimum sum of instance weight(hessian) needed in a child", default=0)
    reg_lambda: float = field(
        "L2 regularization term on weights. Increasing this value will make model more conservative",
        default=1,
    )
    reg_alpha: float = field(
        "L1 regularization term on weights. Increasing this value will make model more conservative",
        default=0,
    )
Example #20
class TextClassifierConfig:
    predict: Feature = field("Feature name holding classification value")
    classifications: List[str] = field("Options for value of classification")
    features: Features = field("Features to train on")
    trainable: bool = field("Tweak pretrained model by training again",
                            default=True)
    batch_size: int = field("Batch size", default=120)
    max_seq_length: int = field(
        "Length of sentence, used in preprocessing of input for bert embedding",
        default=256,
    )
    add_layers: bool = field("Add layers on top of pretrained model/layer",
                             default=False)
    embedType: str = field(
        "Type of pretrained embedding model, required to be set to `bert` to use bert pretrained embedding",
        default=None,
    )
    layers: List[str] = field(
        "Extra layers to be added on top of pretrained model", default=None)
    model_path: str = field(
        "Pretrained model path/url",
        default="https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim-with-oov/1",
    )
    optimizer: str = field("Optimizer used by model", default="adam")
    metrics: str = field("Metric used to evaluate model", default="accuracy")
    clstype: Type = field("Data type of classifications values", default=str)
    epochs: int = field(
        "Number of iterations to pass over all records in a source",
        default=10)
    directory: str = field(
        "Directory where state should be saved",
        default=os.path.join(os.path.expanduser("~"), ".cache", "dffml",
                             "tensorflow_hub"),
    )

    def __post_init__(self):
        self.classifications = list(map(self.clstype, self.classifications))
        if self.add_layers:
            # Temporary solution
            self.layers = parse_layers(self.layers)
Example #21
class QAModelConfig:
    model_type: str = field("Model type in the list: " +
                            ", ".join(MODEL_TYPES))
    model_name_or_path: str = field(
        "Path to pretrained model or model identifier from huggingface.co/models",
    )
    output_dir: str = field(
        "The output directory where the model checkpoints and predictions will be written.",
    )
    cache_dir: str = field(
        "Where do you want to store the pre-trained models downloaded from s3",
    )
    log_dir: str = field("Directory used by SummaryWriter for logging")
    tokenizer_name: str = field(
        "Pretrained tokenizer name or path if not the same as model_name",
        default=None,
    )
    from_tf: bool = field(
        "Whether to load model from tensorflow checkpoint or .h5 file",
        default=False,
    )
    config_name: str = field(
        "Pretrained config name or path if not the same as model_name",
        default=None,
    )
    null_score_diff_threshold: float = field(
        "If null_score - best_non_null is greater than the threshold predict null.",
        default=0.0,
    )
    max_seq_length: int = field(
        "The maximum total input sequence length after WordPiece tokenization. Sequences longer than this will be truncated, and sequences shorter than this will be padded.",
        default=384,
    )
    doc_stride: int = field(
        "When splitting up a long document into chunks, how much stride to take between chunks.",
        default=128,
    )
    max_query_length: int = field(
        "The maximum number of tokens for the question. Questions longer than this will be truncated to this length",
        default=64,
    )
    do_lower_case: bool = field("Set this flag while using uncased model",
                                default=False)
    per_gpu_train_batch_size: int = field(
        "Batch size per GPU/CPU for training", default=8)
    per_gpu_eval_batch_size: int = field(
        "Batch size per GPU/CPU for evaluation", default=8)
    learning_rate: float = field("The initial learning rate for Adam",
                                 default=5e-5)
    gradient_accumulation_steps: int = field(
        "Number of updates steps to accumulate before performing a backward/update pass",
        default=1,
    )
    weight_decay: float = field("Weight decay if we apply some.", default=0.0)
    adam_epsilon: float = field("Epsilon for Adam optimizer", default=1e-8)
    max_grad_norm: float = field("Max gradient norm.", default=1.0)
    num_train_epochs: float = field(
        "Total number of training epoches to perform", default=1.0)
    max_steps: int = field(
        "If > 0: set total number of training steps to perform. Override num_train_epochs.",
        default=-1,
    )
    warmup_steps: int = field("Linear warmup over warmup_steps.", default=0)
    n_best_size: int = field(
        "The total number of n-best predictions to generate", default=20)
    max_answer_length: int = field(
        "The maximum length of an answer that can be generated. This is needed because the start and end predictions are not conditioned on one another.",
        default=30,
    )
    lang_id: int = field(
        "language id of input for language-specific xlm models (see tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)",
        default=0,
    )
    logging_steps: int = field("Log every X updates steps.", default=500)
    save_steps: int = field("Save checkpoint every X update steps",
                            default=500)
    no_cuda: bool = field("Whether not to use CUDA when available",
                          default=False)
    overwrite_output_dir: bool = field(
        "Overwrite the content of the output directory", default=False)
    seed: int = field("random seed for initialization", default=2020)
    local_rank: int = field("local_rank for distributed training on gpus",
                            default=-1)
    fp16: bool = field(
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
        default=False,
    )
    fp16_opt_level: str = field(
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details at https://nvidia.github.io/apex/amp.html",
        default="O1",
    )
    threads: int = field("Multiple threads for converting example to features",
                         default=1)

    def __post_init__(self):
        if self.doc_stride >= self.max_seq_length - self.max_query_length:
            logger.warning(
                "WARNING - You've set a doc stride which may be superior to the document length in some "
                "examples. This could result in errors when building features from the examples. Please reduce the doc "
                "stride or increase the maximum length to ensure the features are correctly built."
            )
        if self.local_rank == -1 or self.no_cuda:
            device = torch.device("cuda" if torch.cuda.is_available()
                                  and not self.no_cuda else "cpu")
            self.n_gpu = 0 if self.no_cuda else torch.cuda.device_count()
        else:
            torch.cuda.set_device(self.local_rank)
            device = torch.device("cuda", self.local_rank)
            torch.distributed.init_process_group(backend="nccl")
            self.n_gpu = 1
        self.device = device
        set_seed(self.seed, self.n_gpu)

        if self.local_rank not in [-1, 0]:
            torch.distributed.barrier()

        self.model_type = self.model_type.lower()
Example #22
class CMDTestConfig(CMDConfig):
    nope: bool = field("test field", default=False)
Example #23
class PyTorchModelConfig:
    predict: Feature = field("Feature name holding classification value")
    features: Features = field("Features to train on")
    directory: pathlib.Path = field("Directory where state should be saved")
    classifications: List[str] = field(
        "Options for value of classification", default=None
    )
    clstype: Type = field("Data type of classifications values", default=str)
    imageSize: int = field(
        "Common size for all images to resize and crop to", default=None
    )
    enableGPU: bool = field("Utilize GPUs for processing", default=False)
    epochs: int = field(
        "Number of iterations to pass over all records in a source", default=20
    )
    batch_size: int = field("Batch size", default=32)
    validation_split: float = field(
        "Split training data for Validation", default=0.0
    )
    patience: int = field(
        "Early stops the training if validation loss doesn't improve after a given patience",
        default=5,
    )
    loss: PyTorchLoss = field(
        "Loss Functions available in PyTorch",
        default=CrossEntropyLossFunction,
    )
    optimizer: str = field(
        "Optimizer Algorithms available in PyTorch", default="SGD"
    )
    normalize_mean: List[float] = field(
        "Mean values for normalizing Tensor image", default=None
    )
    normalize_std: List[float] = field(
        "Standard Deviation values for normalizing Tensor image", default=None
    )

    def __post_init__(self):
        if self.classifications is not None:
            self.classifications = list(
                map(self.clstype, self.classifications)
            )
Example #24
    (
        "scikitac",
        "AgglomerativeClustering",
        AgglomerativeClustering,
        applicable_features,
    ),
    ("scikitoptics", "OPTICS", OPTICS, applicable_features),
]:
    estimator_type = cls._estimator_type
    config_fields = dict()
    if estimator_type in supervised_estimators:
        parentContext = ScikitContext
        parentModel = Scikit
        config_fields["predict"] = (
            Feature,
            field("Label or the value to be predicted"),
        )
    elif estimator_type in unsupervised_estimators:
        parentContext = ScikitContextUnsprvised
        parentModel = ScikitUnsprvised
        config_fields["tcluster"] = (
            Feature,
            field(
                "True cluster label for evaluating clustering models",
                default=None,
            ),
        )
    dffml_config_properties = {
        **{
            "directory": (
                pathlib.Path,
Example #25
    ("scikitbirch", "Birch", Birch),
    ("scikitmbkmeans", "MiniBatchKMeans", MiniBatchKMeans),
    ("scikitap", "AffinityPropagation", AffinityPropagation),
    ("scikims", "MeanShift", MeanShift),
    ("scikitsc", "SpectralClustering", SpectralClustering),
    ("scikitac", "AgglomerativeClustering", AgglomerativeClustering),
    ("scikitoptics", "OPTICS", OPTICS),
]:
    estimator_type = cls._estimator_type
    config_fields = dict()
    if estimator_type in supervised_estimators:
        parentContext = ScikitContext
        parentModel = Scikit
        config_fields["predict"] = (
            Feature,
            field("Label or the value to be predicted"),
        )
    elif estimator_type in unsupervised_estimators:
        parentContext = ScikitContextUnsprvised
        parentModel = ScikitUnsprvised
        config_fields["tcluster"] = (
            Feature,
            field(
                "True cluster label for evaluating clustering models",
                default=None,
            ),
        )
    dffml_config_properties = {
        **{
            "directory": (
                pathlib.Path,
Example #26
class PyTorchNeuralNetworkConfig(PyTorchModelConfig):
    network: Network = field("Model", default=None)
Example #27
    ),
    ("scikitridge", "Ridge", Ridge, applicable_features),
    ("scikitlars", "Lars", Lars, applicable_features),
]:

    dffml_config = mkscikit_config_cls(
        name + "ModelConfig",
        cls,
        properties={
            "directory": (
                str,
                field(
                    "Directory where state should be saved",
                    default=os.path.join(
                        os.path.expanduser("~"),
                        ".cache",
                        "dffml",
                        f"scikit-{entry_point_name}",
                    ),
                ),
            ),
            "predict": (str, field("Label or the value to be predicted")),
            "features": (Features, field("Features to train on")),
        },
    )

    dffml_cls_ctx = type(
        name + "ModelContext",
        (ScikitContext, ),
        {"applicable_features": applicable_features_function},
    )
Example #28
class FakeSubCMDConfig:
    test: str = field("test field")
Example #29
class CreateTLSServerConfig(TLSCMDConfig):
    bits: int = field(
        "Number of bits to use for key",
        default=4096,
    )
Example #30
class MiscServicesConfig:
    integer: int = field(
        f"Port to do nothing with",
        default=0,
        required=True,
    )