def train_supervised(*kargs, **kwargs):
    """
    Train a supervised model and return a model object.

    input must be a filepath. The input text does not need to be tokenized
    as per the tokenize function, but it must be preprocessed and encoded
    as UTF-8. You might want to consult standard preprocessing scripts such
    as tokenizer.perl mentioned here: http://www.statmt.org/wmt07/baseline.html

    The input file must contain at least one label per line. For an
    example, consult the datasets that are part of the fastText repository,
    such as the one pulled by classification-example.sh.
    """
    supervised_default = unsupervised_default.copy()
    supervised_default.update({
        'lr': 0.1,
        'minCount': 1,
        'minn': 0,
        'maxn': 0,
        'loss': "softmax",
        'model': "supervised"
    })

    arg_names = [
        'input', 'lr', 'dim', 'ws', 'epoch', 'minCount', 'minCountLabel',
        'minn', 'maxn', 'neg', 'wordNgrams', 'loss', 'bucket', 'thread',
        'lrUpdateRate', 't', 'label', 'verbose', 'pretrainedVectors'
    ]
    params = read_args(kargs, kwargs, arg_names, supervised_default)
    a = _build_args(params)
    ft = _FastText(args=a)
    fasttext.train(ft.f, a)
    return ft
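A minimal usage sketch for this variant, assuming a labeled training file in the format the docstring describes; data.train.txt and its contents are hypothetical:

# Each line of data.train.txt carries at least one __label__ prefix, e.g.:
#   __label__positive great movie , really enjoyed it
#   __label__negative terrible plot and worse acting
model = train_supervised(input="data.train.txt", epoch=25, lr=0.5)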
Example 2
def train_unsupervised(*kargs, **kwargs):
    """
    Train an unsupervised model and return a model object.

    input must be a filepath. The input text does not need to be tokenized
    as per the tokenize function, but it must be preprocessed and encoded
    as UTF-8. You might want to consult standard preprocessing scripts such
    as tokenizer.perl mentioned here: http://www.statmt.org/wmt07/baseline.html

    The input file must not contain any labels or use the specified label prefix
    unless it is ok for those words to be ignored. For an example consult the
    dataset pulled by the example script word-vector-example.sh, which is
    part of the fastText repository.
    """
    arg_names = [
        'input', 'model', 'lr', 'dim', 'ws', 'epoch', 'minCount',
        'minCountLabel', 'minn', 'maxn', 'neg', 'wordNgrams', 'loss', 'bucket',
        'thread', 'lrUpdateRate', 't', 'label', 'verbose', 'pretrainedVectors'
    ]
    args, manually_set_args = read_args(kargs, kwargs, arg_names,
                                        unsupervised_default)
    a = _build_args(args, manually_set_args)
    ft = _FastText(args=a)
    fasttext.train(ft.f, a)
    return ft
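A usage sketch for the unsupervised path, assuming a preprocessed UTF-8 corpus at the hypothetical path data.txt; get_word_vector is the per-word lookup exposed by fastText model objects:

model = train_unsupervised(input="data.txt", model="skipgram")
vec = model.get_word_vector("king")  # vector of length dim (100 by default)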
Example 3
def train_unsupervised(
    input,
    model="skipgram",
    lr=0.05,
    dim=100,
    ws=5,
    epoch=5,
    minCount=5,
    minCountLabel=0,
    minn=3,
    maxn=6,
    neg=5,
    wordNgrams=1,
    loss="ns",
    bucket=2000000,
    thread=12,
    lrUpdateRate=100,
    t=1e-4,
    label="__label__",
    verbose=2,
    pretrainedVectors="",
    incr=False,
    inputModel=""
):
    """
    Train an unsupervised model and return a model object.

    input must be a filepath. The input text does not need to be tokenized
    as per the tokenize function, but it must be preprocessed and encoded
    as UTF-8. You might want to consult standard preprocessing scripts such
    as tokenizer.perl mentioned here: http://www.statmt.org/wmt07/baseline.html

    The input file must not contain any labels or use the specified label prefix
    unless it is ok for those words to be ignored. For an example consult the
    dataset pulled by the example script word-vector-example.sh, which is
    part of the fastText repository.
    """
    a = _build_args(locals())
    ft = _FastText()
    fasttext.train(ft.f, a)
    return ft
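This variant adds incr and inputModel parameters, which suggest incremental training that warm-starts from a previously saved model. A sketch under that assumption; both paths are hypothetical and the semantics of incr are inferred from the signature, not confirmed by the source:

# Continue training an existing model on new data (assumed behavior).
model = train_unsupervised(
    input="new_data.txt",
    model="skipgram",
    incr=True,
    inputModel="model_old.bin",
)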
Example 4
def train_supervised(
    input,
    lr=0.1,
    dim=100,
    ws=5,
    epoch=5,
    minCount=1,
    minCountLabel=0,
    minn=0,
    maxn=0,
    neg=5,
    wordNgrams=1,
    loss="softmax",
    bucket=2000000,
    thread=12,
    lrUpdateRate=100,
    t=1e-4,
    label="__label__",
    verbose=2,
    pretrainedVectors="",
    incr=False,
    inputModel=""
):
    """
    Train a supervised model and return a model object.

    input must be a filepath. The input text does not need to be tokenized
    as per the tokenize function, but it must be preprocessed and encoded
    as UTF-8. You might want to consult standard preprocessing scripts such
    as tokenizer.perl mentioned here: http://www.statmt.org/wmt07/baseline.html

    The input file must contain at least one label per line. For an
    example, consult the datasets that are part of the fastText repository,
    such as the one pulled by classification-example.sh.
    """
    model = "supervised"
    a = _build_args(locals())
    ft = _FastText()
    fasttext.train(ft.f, a)
    return ft
Example 5
def train_unsupervised(
    input,
    model="skipgram",
    lr=0.05,
    dim=100,
    ws=5,
    epoch=5,
    minCount=5,
    minCountLabel=0,
    minn=3,
    maxn=6,
    neg=5,
    wordNgrams=1,
    loss="ns",
    bucket=2000000,
    thread=12,
    lrUpdateRate=100,
    t=1e-4,
    label="__label__",
    verbose=2,
    pretrainedVectors="",
    saveOutput=0
):
    """
    Train an unsupervised model and return a model object.

    input must be a filepath. The input text does not need to be tokenized
    as per the tokenize function, but it must be preprocessed and encoded
    as UTF-8. You might want to consult standard preprocessing scripts such
    as tokenizer.perl mentioned here: http://www.statmt.org/wmt07/baseline.html

    The input file must not contain any labels or use the specified label prefix
    unless it is ok for those words to be ignored. For an example consult the
    dataset pulled by the example script word-vector-example.sh, which is
    part of the fastText repository.
    """
    a = _build_args(locals())
    ft = _FastText()
    fasttext.train(ft.f, a)
    return ft
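This variant exposes saveOutput, mirroring fastText's -saveOutput option, which controls whether the output parameters are persisted alongside the input vectors. A sketch, with data.txt again a hypothetical path:

model = train_unsupervised(input="data.txt", model="cbow", saveOutput=1)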
Example 6
def train_supervised(
    input,
    lr=0.1,
    dim=100,
    ws=5,
    epoch=5,
    minCount=1,
    minCountLabel=0,
    neg=5,
    wordNgrams=1,
    loss="softmax",
    bucket=2000000,
    thread=12,
    lrUpdateRate=100,
    t=1e-4,
    label="__label__",
    verbose=2,
    pretrainedVectors="",
    saveOutput=0
):
    """
    Train a supervised model and return a model object.

    input must be a filepath. The input text does not need to be tokenized
    as per the tokenize function, but it must be preprocessed and encoded
    as UTF-8. You might want to consult standard preprocessing scripts such
    as tokenizer.perl mentioned here: http://www.statmt.org/wmt07/baseline.html

    The input file must contain at least one label per line. For an
    example, consult the datasets that are part of the fastText repository,
    such as the one pulled by classification-example.sh.
    """
    minn = 0
    maxn = 0
    model = "supervised"
    a = _build_args(locals())
    ft = _FastText()
    fasttext.train(ft.f, a)
    return ft
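A usage sketch for this supervised variant, assuming a labeled file as described in the docstring (cooking.train is a hypothetical path); predict is the classification entry point on fastText model objects, returning labels and their probabilities:

model = train_supervised(input="cooking.train", wordNgrams=2)
labels, probs = model.predict("Which baking dish is best for banana bread ?")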