def train_supervised(*kargs, **kwargs):
    """
    Train a supervised model and return a model object.

    input must be a filepath. The input text does not need to be tokenized
    as per the tokenize function, but it must be preprocessed and encoded
    as UTF-8. You might want to consult standard preprocessing scripts such
    as tokenizer.perl mentioned here:
    http://www.statmt.org/wmt07/baseline.html

    The input file must contain at least one label per line. For an
    example consult the example datasets which are part of the fastText
    repository such as the dataset pulled by classification-example.sh.
    """
    # Supervised defaults are the unsupervised defaults with a handful of
    # overrides appropriate for classification.
    supervised_default = unsupervised_default.copy()
    supervised_default.update({
        'lr': 0.1,
        'minCount': 1,
        'minn': 0,
        'maxn': 0,
        'loss': "softmax",
        'model': "supervised",
    })

    # Names accepted both positionally and as keywords by read_args.
    arg_names = [
        'input', 'lr', 'dim', 'ws', 'epoch', 'minCount', 'minCountLabel',
        'minn', 'maxn', 'neg', 'wordNgrams', 'loss', 'bucket', 'thread',
        'lrUpdateRate', 't', 'label', 'verbose', 'pretrainedVectors',
    ]
    params = read_args(kargs, kwargs, arg_names, supervised_default)
    built_args = _build_args(params)
    model = _FastText(args=built_args)
    fasttext.train(model.f, built_args)
    return model
def train_unsupervised(*kargs, **kwargs):
    """
    Train an unsupervised model and return a model object.

    input must be a filepath. The input text does not need to be tokenized
    as per the tokenize function, but it must be preprocessed and encoded
    as UTF-8. You might want to consult standard preprocessing scripts such
    as tokenizer.perl mentioned here:
    http://www.statmt.org/wmt07/baseline.html

    The input field must not contain any labels or use the specified label
    prefix unless it is ok for those words to be ignored. For an example
    consult the dataset pulled by the example script
    word-vector-example.sh, which is part of the fastText repository.
    """
    # Names accepted both positionally and as keywords by read_args.
    arg_names = [
        'input', 'model', 'lr', 'dim', 'ws', 'epoch', 'minCount',
        'minCountLabel', 'minn', 'maxn', 'neg', 'wordNgrams', 'loss',
        'bucket', 'thread', 'lrUpdateRate', 't', 'label', 'verbose',
        'pretrainedVectors',
    ]
    parsed, explicitly_set = read_args(
        kargs, kwargs, arg_names, unsupervised_default
    )
    built_args = _build_args(parsed, explicitly_set)
    model = _FastText(args=built_args)
    fasttext.train(model.f, built_args)
    return model
def train_unsupervised(
    input,
    model="skipgram",
    lr=0.05,
    dim=100,
    ws=5,
    epoch=5,
    minCount=5,
    minCountLabel=0,
    minn=3,
    maxn=6,
    neg=5,
    wordNgrams=1,
    loss="ns",
    bucket=2000000,
    thread=12,
    lrUpdateRate=100,
    t=1e-4,
    label="__label__",
    verbose=2,
    pretrainedVectors="",
    incr=False,
    inputModel="",
):
    """
    Train an unsupervised model and return a model object.

    input must be a filepath. The input text does not need to be tokenized
    as per the tokenize function, but it must be preprocessed and encoded
    as UTF-8. You might want to consult standard preprocessing scripts such
    as tokenizer.perl mentioned here:
    http://www.statmt.org/wmt07/baseline.html

    The input field must not contain any labels or use the specified label
    prefix unless it is ok for those words to be ignored. For an example
    consult the dataset pulled by the example script
    word-vector-example.sh, which is part of the fastText repository.
    """
    # locals() is captured before any new local is bound, so it holds
    # exactly the function parameters.
    built_args = _build_args(locals())
    model_obj = _FastText()
    fasttext.train(model_obj.f, built_args)
    return model_obj
def train_supervised(
    input,
    lr=0.1,
    dim=100,
    ws=5,
    epoch=5,
    minCount=1,
    minCountLabel=0,
    minn=0,
    maxn=0,
    neg=5,
    wordNgrams=1,
    loss="softmax",
    bucket=2000000,
    thread=12,
    lrUpdateRate=100,
    t=1e-4,
    label="__label__",
    verbose=2,
    pretrainedVectors="",
    incr=False,
    inputModel="",
):
    """
    Train a supervised model and return a model object.

    input must be a filepath. The input text does not need to be tokenized
    as per the tokenize function, but it must be preprocessed and encoded
    as UTF-8. You might want to consult standard preprocessing scripts such
    as tokenizer.perl mentioned here:
    http://www.statmt.org/wmt07/baseline.html

    The input file must contain at least one label per line. For an
    example consult the example datasets which are part of the fastText
    repository such as the dataset pulled by classification-example.sh.
    """
    # Force the model type; must be bound before locals() is captured so
    # it is included in the argument dict.
    model = "supervised"
    built_args = _build_args(locals())
    model_obj = _FastText()
    fasttext.train(model_obj.f, built_args)
    return model_obj
def train_unsupervised(
    input,
    model="skipgram",
    lr=0.05,
    dim=100,
    ws=5,
    epoch=5,
    minCount=5,
    minCountLabel=0,
    minn=3,
    maxn=6,
    neg=5,
    wordNgrams=1,
    loss="ns",
    bucket=2000000,
    thread=12,
    lrUpdateRate=100,
    t=1e-4,
    label="__label__",
    verbose=2,
    pretrainedVectors="",
    saveOutput=0,
):
    """
    Train an unsupervised model and return a model object.

    input must be a filepath. The input text does not need to be tokenized
    as per the tokenize function, but it must be preprocessed and encoded
    as UTF-8. You might want to consult standard preprocessing scripts such
    as tokenizer.perl mentioned here:
    http://www.statmt.org/wmt07/baseline.html

    The input file must not contain any labels or use the specified label
    prefix unless it is ok for those words to be ignored. For an example
    consult the dataset pulled by the example script
    word-vector-example.sh, which is part of the fastText repository.
    """
    # locals() is captured before any new local is bound, so it holds
    # exactly the function parameters.
    built_args = _build_args(locals())
    model_obj = _FastText()
    fasttext.train(model_obj.f, built_args)
    return model_obj
def train_supervised(
    input,
    lr=0.1,
    dim=100,
    ws=5,
    epoch=5,
    minCount=1,
    minCountLabel=0,
    neg=5,
    wordNgrams=1,
    loss="softmax",
    bucket=2000000,
    thread=12,
    lrUpdateRate=100,
    t=1e-4,
    label="__label__",
    verbose=2,
    pretrainedVectors="",
    saveOutput=0,
    minn=0,
    maxn=0,
):
    """
    Train a supervised model and return a model object.

    input must be a filepath. The input text does not need to be tokenized
    as per the tokenize function, but it must be preprocessed and encoded
    as UTF-8. You might want to consult standard preprocessing scripts such
    as tokenizer.perl mentioned here:
    http://www.statmt.org/wmt07/baseline.html

    The input file must contain at least one label per line. For an
    example consult the example datasets which are part of the fastText
    repository such as the dataset pulled by classification-example.sh.

    minn/maxn control subword (character n-gram) lengths; they default to
    0 (disabled), matching the previous hard-coded behavior, but may now
    be overridden by keyword like in the unsupervised variant.
    """
    # Generalization: minn/maxn were hard-coded to 0 in the body; they are
    # now trailing keyword parameters with the same defaults, so existing
    # positional and keyword callers are unaffected.
    # Force the model type; must be bound before locals() is captured so
    # it is included in the argument dict.
    model = "supervised"
    a = _build_args(locals())
    ft = _FastText()
    fasttext.train(ft.f, a)
    return ft