def fit(self):
    '''
    Train the model and save it to the output path.
    :return:
    '''
    fastText.train_supervised(self.train_input, label="__label__").save_model(self.output)
def test_fasttext():
    import os
    from fastText import train_supervised

    data_dir = "/home/davidyu/stock/scripts/davidyu_stock/scripts/analysis/news_report"
    train_data = os.path.join(data_dir, "train_text.txt")
    valid_data = os.path.join(data_dir, "valid_text.txt")
    # the first model is discarded; only the softmax variant is evaluated
    model = train_supervised(input=train_data, epoch=50, lr=0.5, wordNgrams=2,
                             dim=100, verbose=2, minCount=1, thread=1)
    model = train_supervised(input=train_data, epoch=50, lr=0.5, wordNgrams=2,
                             dim=100, verbose=2, minCount=1, thread=1, loss="softmax")
    print_results(*model.test(valid_data))
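# Several of these snippets call a print_results helper without defining it.
# The standalone example near the end of this section defines one; its
# definition is reproduced here for reference:
def print_results(N, p, r):
    # N is the number of evaluated examples; p and r are precision@1 and recall@1
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(1, p))
    print("R@{}\t{:.3f}".format(1, r))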
def train_model(fname_prefix: str, out_fname: str, label_prefix: str = "__label__", **kwargs):
    # Train the model
    import fastText
    import logging

    params = {
        "dim": kwargs.get("dim", 300),
        "epoch": kwargs.get("epoch", 1000),
        "wordNgrams": kwargs.get("wordNgrams", 2),
        "verbose": kwargs.get("verbose", 2),
        "minCount": kwargs.get("minCount", 15),
        "minCountLabel": kwargs.get("minCountLabel", 5),
        "lr": kwargs.get("lr", 0.1),
        "neg": kwargs.get("neg", 10),
        "thread": kwargs.get("thread", 16),
        "loss": kwargs.get("loss", "ns"),
        "t": kwargs.get("t", 1e-5),
    }
    logging.info("Training fastText model", extra={"params": params})
    model = fastText.train_supervised(input="%s.train" % fname_prefix, label=label_prefix, **params)
    logging.info("Writing model to disk", extra={"output_file": out_fname})
    model.save_model(out_fname)
    return model
def train_sup(mode=fasttextConfig.create_data_word):
    # train supervised model
    print('start train supervised fasttext model')
    # resolve input/output paths for word- or char-level data
    if mode == fasttextConfig.create_data_word:
        input_path = fileConfig.dir_fasttext + fileConfig.file_fasttext_sup_train_word_data
        output_path = fileConfig.dir_fasttext + fileConfig.file_fasttext_sup_word_model
        test_path = fileConfig.dir_fasttext + fileConfig.file_fasttext_sup_test_word_data
    elif mode == fasttextConfig.create_data_char:
        input_path = fileConfig.dir_fasttext + fileConfig.file_fasttext_sup_train_char_data
        output_path = fileConfig.dir_fasttext + fileConfig.file_fasttext_sup_char_model
        test_path = fileConfig.dir_fasttext + fileConfig.file_fasttext_sup_test_char_data
    else:
        raise ValueError("unknown mode: %s" % mode)
    # train, evaluate, and save the model
    model = fastText.train_supervised(input=input_path, dim=200, epoch=100, lr=1.0,
                                      wordNgrams=3, ws=7, verbose=2, minCount=1,
                                      thread=8, loss='hs')
    print_results(*model.test(test_path))
    model.save_model(output_path)
    print("train sup fasttext finish")
def fit(session, uid, path):
    labeled_text = dal.get_text_labeled_text(session, uid)
    # not sure if this is the right prerequisite for fastText
    if len(labeled_text['targets']) < 2:
        return
    # preprocess training data one line at a time in order to limit pipe buffer issues
    preprocessed_data = [preprocess(datum) for datum in labeled_text['data']]
    # create a new temporary training data file
    fd, train_path = tempfile.mkstemp()
    # close the temporary training data file descriptor as we don't need it
    os.close(fd)
    # fill the temporary training data file
    with open(train_path, 'w') as f:
        for (target, datum) in zip(labeled_text['targets'], preprocessed_data):
            f.write(label_prefix + target + " " + datum + "\n")
    # train the fasttext model
    model = fastText.train_supervised(input=train_path)
    # compress the fasttext model to save space
    # (disabled for now because it requires at least 256 rows)
    # model.quantize(input=train_path)
    # delete the temporary training data file
    os.unlink(train_path)
    # serialize the model out to the temporary model file
    model.save_model(path)
def fasttext_train(input, output):
    import fastText as ft

    # dim must match the dimensionality of the pretrained vectors (300 here)
    model = ft.train_supervised(
        input=input, dim=300,
        pretrainedVectors='wiki-news-300d-1M-subword.vec')
    model.save_model(output)
def fit(self, X, y):
    if not isinstance(X, np.ndarray):
        X = np.array(X)
    if not isinstance(y, np.ndarray):
        y = np.array(y)
    path = data_to_temp(X, self.label, y)
    self._num_classes = len(np.unique(y))
    self._model = fastText.train_supervised(path,
                                            lr=self.lr,
                                            dim=self.dim,
                                            ws=self.ws,
                                            epoch=self.epoch,
                                            minCount=self.minCount,
                                            minCountLabel=self.minCountLabel,
                                            minn=self.minn,
                                            maxn=self.maxn,
                                            neg=self.neg,
                                            wordNgrams=self.wordNgrams,
                                            loss=self.loss,
                                            bucket=self.bucket,
                                            thread=self.thread,
                                            lrUpdateRate=self.lrUpdateRate,
                                            t=self.t,
                                            label=self.label,
                                            verbose=self.verbose)
    os.remove(path)
    # dump the output-layer (softmax) matrix and keep it as per-class embeddings
    fd, path = tempfile.mkstemp()
    os.close(fd)  # close the descriptor; we only need the path
    self._model.save_softmax(path)
    self.class_embeddings = pd.read_csv(
        path, skiprows=[0], delimiter=' ').dropna(axis=1)
    os.remove(path)
    return self
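# The data_to_temp helper used by the sklearn-style wrappers above is not shown.
# A minimal sketch of what it presumably does -- write one
# "<label_prefix><y> <text>" line per sample to a temporary file and return the
# path; the exact signature and file format are assumptions:
import os
import tempfile


def data_to_temp(X, label_prefix, y):
    fd, path = tempfile.mkstemp()
    with os.fdopen(fd, 'w') as f:
        for text, target in zip(X, y):
            f.write("%s%s %s\n" % (label_prefix, target, text))
    return path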
def trainFastText(self, sents, labls, argsOut):
    if argsOut.load:
        # load given model instead of training
        print("Loading fastText model {0} ..".format(argsOut.load))
        return load_model(argsOut.load)
    outFile = "ft_runTrain{0}.txt".format(time.time())
    labls = self.roundLabelsRegress(labls)
    # prefix each sentence with its label; sentences are assumed to end with a newline
    for i in range(0, len(sents)):
        sents[i] = "__label__{0} {1}".format(labls[i], sents[i])
    with open(outFile, mode="w", encoding="utf-8") as file:
        file.writelines(sents)
    # Get model
    model = train_supervised(input=outFile, epoch=argsOut.epochs, dim=argsOut.dim,
                             bucket=10000000, lr=argsOut.lr,
                             wordNgrams=argsOut.wordNgrams, verbose=2, minCount=1)
    os.remove(outFile)
    return model
def train():
    if FLAGS.word_level:
        train_file = 'data/train_w.fast'
        dev_file = 'data/dev_w.fast'
        model_file = 'temp/ml/fast_model_w.bin'
    else:
        train_file = 'data/train_c.fast'
        dev_file = 'data/dev_c.fast'
        model_file = 'temp/ml/fast_model_c.bin'
    model = fastText.train_supervised(input=train_file, dim=100, epoch=15, thread=40,
                                      minCount=10, loss='softmax', wordNgrams=2)
    model.save_model(model_file)
    # model.test returns (number of examples, precision@1, recall@1)
    n_examples, p, r = model.test(train_file)
    f1 = (p * r * 2) / (p + r)
    print('Train: n:%d p:%.5f r:%.5f f1:%.5f' % (n_examples, p, r, f1))
    n_examples, p, r = model.test(dev_file)
    f1 = (p * r * 2) / (p + r)
    print('Dev: n:%d p:%.5f r:%.5f f1:%.5f' % (n_examples, p, r, f1))
def fit(self, X, y):
    '''
    Train the fastText classifier on the provided dataset. If a file with a
    serialized model trained on the exact same dataset exists, just load the
    model; otherwise train the classifier and save the model.
    :param X: the set of instances in the dataset
    :param y: the set of labels in the dataset
    :return: None
    '''
    file_identifier = self._get_identifier_for_model_file(X, y)
    serialized_file = os.path.join(self.folder, file_identifier)
    if os.path.isfile(serialized_file):
        print("Loading the model from file " + str(serialized_file))
        # the file already exists, so fitting means just loading the model
        self.model = load_model(serialized_file)
    else:
        print("Training the classifier " + str(self.name))
        # the file does not exist, so we have to train the model
        with open(SWAP_FILE, 'w', encoding='utf8') as f:
            for tweet, label in zip(X, y):
                f.write(LABEL_IDENTIFIER + str(label) + SEPARATOR + tweet.strip() + '\n')
        self.model = train_supervised(SWAP_FILE, **self.params)
        os.remove(SWAP_FILE)  # delete the swap file after use
        # lastly, save the model; create the classifier's folder first if needed
        os.makedirs(self.folder, exist_ok=True)
        self.model.save_model(serialized_file)
def fit(self, X, y):
    # Fit model
    if not isinstance(X, np.ndarray):
        X = np.array(X)
    if not isinstance(y, np.ndarray):
        y = np.array(y)
    path = data_to_temp(X, self.label, y)
    self._num_classes = len(np.unique(y))
    self._model = fastText.train_supervised(path,
                                            lr=self.lr,
                                            dim=self.dim,
                                            ws=self.ws,
                                            epoch=self.epoch,
                                            minCount=self.minCount,
                                            minCountLabel=self.minCountLabel,
                                            minn=self.minn,
                                            maxn=self.maxn,
                                            neg=self.neg,
                                            wordNgrams=self.wordNgrams,
                                            loss=self.loss,
                                            bucket=self.bucket,
                                            thread=self.thread,
                                            lrUpdateRate=self.lrUpdateRate,
                                            t=self.t,
                                            label=self.label,
                                            verbose=self.verbose)
    os.remove(path)
    return self
def _train_tagging_model(self, output, quantization=False):
    """Train a tagging model using an annotated file and save it to output.

    Pre-processing of the input file must have been done prior to training.
    The training file should be utf-8. The model can be quantized to reduce
    memory usage (but quantization is quite expensive).
    """
    log.info('Start training model.')
    new_model = fastText.train_supervised(input=self.tempfilename,
                                          epoch=self.epoch,
                                          lr=self.lr,
                                          wordNgrams=self.wordNgrams,
                                          minCount=self.minCount,
                                          dim=self.dim,
                                          loss=self.loss,
                                          thread=self.thread,
                                          neg=self.neg)
    log.info('Training model done.')
    if quantization:
        log.info('Start quantization.')
        new_model.quantize(thread=self.thread, retrain=False)
        log.info('Quantization done.')
    model_file = '{}/{}.bin'.format(resources_path, output)
    new_model.save_model(model_file)
    log.info('Model saved at {}.'.format(model_file))
    return model_file
def get_predict(train_path, test_path):
    """
    :param train_path: preprocessed data ready for training
    :param test_path: preprocessed, unlabeled data to predict on
    """
    import fastText

    classifier = fastText.train_supervised(
        train_path,
        lr=0.3,
        dim=200,
        epoch=8,
        minn=1,
        maxn=4,
        wordNgrams=2,
        loss='hs',
    )
    test_data = np.load(test_path)
    li = []
    for i in test_data:
        pre = classifier.predict(i[:-1], k=10)
        # each sentence maps to a 2-D array of rows like [['价格', '0', '0.65184933'], ...]
        result = handle_ft_predict_result(pre)
        li.append(result)
    loc_predict_probability = rename(test_path, 'probability_fastT_subOnly.npy')
    np.save(loc_predict_probability, li)
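# handle_ft_predict_result is not defined in these snippets. Judging from the
# comment above, a plausible sketch; the exact output layout, including the
# meaning of the middle column, is an assumption:
def handle_ft_predict_result(pre):
    labels, probs = pre  # fastText's predict returns (labels, probabilities)
    # strip the "__label__" prefix; the middle column mirrors the rank index
    return [[label.replace("__label__", ""), str(idx), str(prob)]
            for idx, (label, prob) in enumerate(zip(labels, probs))]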
def process(self, data_file):
    json_object = self.load_json(data_file)
    json_object = self.replace_entities(json_object)
    self.load_answers(json_object)
    self.create_labeled_questions_file(json_object, "fasttext_data.txt")
    self.fast_text_model = fastText.train_supervised("fasttext_data.txt", epoch=50)
def train(self, data, labels, dim=100, ng=2, epoch=10):
    if len(data) != len(labels):
        raise ValueError("Length of data (" + str(len(data)) +
                         ") does not match length of labels (" + str(len(labels)) + ")")
    # Convert training data into strings for fastText
    mapped_report_strs = []
    for i in range(len(data)):
        report_string = data[i].replace("\n", " ")
        label = " __label__" + str(labels[i])
        mapped_report_strs.append(report_string + label)
    shuffle(mapped_report_strs)
    # Write strings to a temp file
    train_path = "./MODEL_TRAIN_TEMP.bin"
    with open(train_path, 'w') as outfile:
        for mrs in mapped_report_strs:
            outfile.write(mrs)
            outfile.write("\n")
    # Train fastText model
    self.model = fastText.train_supervised(train_path, dim=dim, epoch=epoch,
                                           thread=4, wordNgrams=ng)
    # Delete temp file
    os.remove(train_path)
def fasttext_classify(data, extra_params={}):
    class_to_predict = 'type'  # product importance
    data[class_to_predict] = data[class_to_predict].map(lambda s: s.replace(" ", ""))
    data_for_fasttext = data['text'] + ' __label__' + data[class_to_predict]
    data_for_fasttext = shuffle(data_for_fasttext, random_state=77)
    num_records = len(data_for_fasttext)
    data_train = data_for_fasttext[:int(0.85 * num_records)]
    data_test = data_for_fasttext[int(0.85 * num_records):]
    data_train.to_csv(TRAIN_PATH, sep='\t', header=False, index=False)
    data_test.to_csv(TEST_PATH, sep='\t', header=False, index=False)
    model = fastText.train_supervised(TRAIN_PATH, **extra_params)
    # model.save_model(MODEL_PATH)
    # model.test returns (N, precision@1, recall@1)
    print('Training accuracy:')
    train_accuracy = model.test(TRAIN_PATH)
    print(train_accuracy[-1])
    print('Test accuracy:')
    test_accuracy = model.test(TEST_PATH)
    print(test_accuracy[-1])
    y_pred = []
    y_true = []
    for test_item in data_test:
        test_text, test_label = test_item.split('__label__')
        # predict returns (labels, probabilities); take the top label string
        y_pred.append(model.predict(test_text)[0][0])
        y_true.append('__label__' + test_label)
    print('F1 score: ' + str(f1_score(y_true, y_pred, average='weighted')))
    return test_accuracy[-1]  # last element of the (N, p, r) tuple
def train_model(self, traverse_list):
    """
    Used for the LR and FT models, as each step of the model outputs a probability.
    :param traverse_list:
    :return: None
    """
    for i, node in enumerate(traverse_list):
        print("Node key: " + node.key)
        print("Child keys:")
        print(node.child.keys())
        print("Trained {0} models".format(i))
        if len(node.child.keys()) <= 2:
            continue
        if node.key == "*":
            continue
        if not node.child:
            continue
        train_data = self.get_train_data(node)
        if self.tree_type == "FT":
            node.classifier = train_supervised(input=train_data)
        elif self.tree_type == "LR":
            node.classifier = LogisticRegression(multi_class='multinomial', solver='newton-cg')
            node.classifier.fit(train_data[0], train_data[1])
        elif self.tree_type == "SVM":
            node.classifier = LinearSVC()
            node.classifier.fit(train_data[0], train_data[1])
def train(self):
    master = master_file.MasterFile()
    data = master.get_training_data()
    traning = traning_file.TraningFile()
    traning.reset()
    hasText = False
    for uuid in data:
        for name in data[uuid]:
            traning.append('__label__' + uuid + ' , ' +
                           app.get_wakati(data[uuid][name]['title']))
            hasText = True
    if hasText is False:
        return
    self._model = train_supervised(input=app.get_output_train_text(),
                                   epoch=200, lr=0.7, wordNgrams=2, loss="hs", dim=100)
    with self._lock:
        self.__print_results(*self._model.test(app.get_output_train_text()))
        self._model.save_model(app.get_output_fasttext_model())
def trainModel(request):
    if request.method == 'POST':
        epoch = int(request.POST['epochValue'])
        lr = float(request.POST['learningRateValue'])
        user = request.session['userEmail']
        user = user.split('@')
        # Train the model; pass lr and epoch as keyword arguments so that epoch
        # is not silently consumed as the positional dim parameter
        model = fastText.train_supervised(
            settings.BASE_DIR + '/media/temp/' + user[0] + 'trainData.txt',
            lr=lr, epoch=epoch)
        # Evaluate on the held-out test file
        result = model.test(settings.BASE_DIR + '/media/temp/' + user[0] + 'testData.txt')
        precision = float(result[1])
        recall = float(result[2])
        # Compute the F-measure (harmonic mean of precision and recall)
        Fmeasure = 2 * ((precision * recall) / (precision + recall))
        Fmeasure = "{:.2%}".format(Fmeasure)
        # Save model
        model.save_model(settings.BASE_DIR + '/media/temp/' + user[0] + 'trainedModel.bin')
        data = {'status': Fmeasure}
        return JsonResponse(data)
def run(ps, i):
    # sample random hyperparameters
    lr = random.uniform(0, 1)
    epoch = round(random.uniform(5, 50))
    wordNgrams = round(random.uniform(1, 5))
    minCount = round(random.uniform(1, 10))
    model = fastText.train_supervised(input=ps[0], lr=lr, epoch=epoch,
                                      wordNgrams=wordNgrams, minCount=minCount)
    # done this way because we want the per-example predictions,
    # not just an aggregate metric from the model
    preds = Path(ps[1]).read_text().split('\n')
    truth = []
    output = []
    for p in preds:
        label = p[:10]  # assumes labels are exactly "__label__X" (10 characters)
        text = p[11:]
        truth.append(label)
        output.append(model.predict(text)[0][0])
    rpt = sklearn.metrics.classification_report(truth, output, output_dict=True)
    rpt['lr'] = lr
    rpt['epoch'] = epoch
    rpt['wordNgrams'] = wordNgrams
    rpt['minCount'] = minCount
    rpt['kappa'] = sklearn.metrics.cohen_kappa_score(truth, output)
    Path(ps[0].replace('train.csv', f"{i}_results.json")).write_text(json.dumps(rpt))
def get_predict(train_path, test_path):
    """
    :param train_path: preprocessed data ready for training
    :param test_path: preprocessed, unlabeled data to predict on
    """
    import fastText

    classifier = fastText.train_supervised(
        train_path,
        lr=0.3,
        dim=10,
        epoch=10,
        minn=1,
        maxn=4,
        wordNgrams=1,
        loss='hs',
    )
    test_data = np.load(test_path)
    li = []
    for i in test_data:
        pre = classifier.predict(i[:-1], k=10)
        # each sentence maps to a 2-D array of rows like [['价格', '0', '0.65184933'], ...]
        result = handle_ft_predict_result(pre)
        li.append(result)
    loc_predict_probability = rename(test_path, '/probability_fastT_multiLabel.npy')
    np.save(loc_predict_probability, li)
def fasttext_train_valid(train_X, train_y, model_dir, model_name, **kwargs):
    """
    :param train_X: training data file [type: txt]
    :param train_y: validation data file [type: txt]
    :param model_dir: directory the model is saved to [type: string]
    :param model_name: name for the model
    :param **kwargs: training parameters for the model
    :return clf, precision, recall: the classifier and its precision and recall scores
    """
    clf = train_supervised(input=train_X, **kwargs)
    result = clf.test(train_y)
    precision = result[1]
    recall = result[2]
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    __model = model_dir + model_name
    clf.save_model('%s.bin' % __model)
    __record = {
        'train_X': train_X,
        'train_y': train_y,
        'model_path': __model,
        'training_parameter': kwargs,
        'precision': precision,
        'recall': recall
    }
    pprint.pprint(__record, width=1)
    return clf, precision, recall
def model():
    # model = fastText.train_supervised(path + 'train_set.txt', label='__myprefix__',
    #                                   bucket=400000, wordNgrams=2, minCount=3,
    #                                   lr=1, lrUpdateRate=0)
    model = fastText.train_supervised(path + 't_train_set.txt', label='__myprefix__',
                                      bucket=39759, wordNgrams=3, minCount=3,
                                      lr=1, lrUpdateRate=200, dim=128)
    result = model.test(path + 't_test_set.txt')
    print(result)
    # model.save_model(model_path + 'model')
    true_labels = []
    all_words = []
    f = open(path + 't_test_set.txt', 'r')
    for line in f:
        words, labels = model.get_line(line.strip())
        if len(labels) == 0:
            continue
        all_words.append(" ".join(words))
        true_labels += [labels]
    predictions, _ = model.predict(all_words)
    # predictions are tuples while the true labels are lists, so compare
    # the top label of each rather than the containers themselves
    n = 0
    for i in range(len(true_labels)):
        if predictions[i][0] == true_labels[i][0]:
            n += 1
    print(n / len(true_labels))
def train():
    # (the training corpus must be prepared beforehand)
    ftrain = 'reviews_fasttext_train.txt'
    ftest = 'reviews_fasttext_test.txt'
    # train the model
    classifier = fastText.train_supervised(ftrain, label="__label__")
    classifier.save_model("reviews_fasttext.bin")
def retrain(self):
    if len(self.data) < self.config['models']['fast_text']['min_train_data']:
        return
    self.log.info('Retraining bilinear model, explore_rate = %.2f' % self.explore_rate)
    train_file = tempfile.NamedTemporaryFile(mode='w')
    # first pass: for every (file, tactic-prefix) pair, remember the tactic
    # that solved it with the lowest rlimit
    best_tactic = {}
    for scored_candidate, status in self.data:
        smt_file = scored_candidate.benchmarks[0].file
        tactics = get_tactics(scored_candidate.t)
        if status != ScoredCandidateStatus.SOLVED:
            continue
        for i, tactic in enumerate(tactics):
            fast_text_line = self.encode(tactics[:i])
            key = (smt_file, fast_text_line)
            if key not in best_tactic or best_tactic[key][1] > scored_candidate.rlimit:
                best_tactic[key] = (tactic.s, scored_candidate.rlimit)
            entry = '__label__%s %s' % (tactic.s, fast_text_line)
            train_file.write(entry + '\n')
    # second pass: emit discretized-parameter labels for the best candidates
    for scored_candidate, status in self.data:
        smt_file = scored_candidate.benchmarks[0].file
        tactics = get_tactics(scored_candidate.t)
        if status != ScoredCandidateStatus.SOLVED:
            continue
        for i, tactic in enumerate(tactics):
            params = self.strategy_enum.extract_params([tactic])[0]
            disc_idx = self.map_to_discretized(tactic.s, params)
            fast_text_line = self.encode(tactics[:i])
            key = (smt_file, fast_text_line)
            if best_tactic[key][1] == scored_candidate.rlimit:
                entry = '__label__%s_%d %s' % (tactic.s, disc_idx, fast_text_line)
                train_file.write(entry + '\n')
    train_file.flush()
    os.fsync(train_file.fileno())
    self.log.info('Created dataset of %d entries' % len(best_tactic))
    self.bilinear_model = fastText.train_supervised(
        input=train_file.name,
        epoch=self.config['models']['fast_text']['epoch'],
        lr=self.config['models']['fast_text']['lr'],
        wordNgrams=self.config['models']['fast_text']['ngrams'],
        verbose=self.config['models']['fast_text']['verbose'],
        minCount=self.config['models']['fast_text']['min_count'],
        dim=self.config['models']['fast_text']['dim'],
    )
def train():
    model = fastText.train_supervised(path + "/materials/fastTexttrain.input",
                                      dim=108, epoch=100, lr=0.001,
                                      pretrainedVectors=path + "/materials/umls.embeddings")
    model.save_model(path + "/materials/fastTexttrain.model")
    print_results(*model.test(path + "/materials/fastTextval.input"))
def fasttext(input, epoch=25, wordNgrams=2):
    model = train_supervised(input=input, epoch=epoch, lr=0.9,
                             wordNgrams=wordNgrams, verbose=2, minCount=1)
    return model
def _create_model(self):
    self.info('creating fastText model')
    trainpath = os.path.join(self.datadir, self.TRAIN_FILE)
    modelpath = os.path.join(self.datadir, self.MODEL_FILE)
    # keep only recognized fastText parameters, coercing each to its expected type
    params = {param: self.FASTTEXT_PARAMS[param](val)
              for param, val in self.params.items()
              if param in self.FASTTEXT_PARAMS}
    self._model = fastText.train_supervised(trainpath, **params)
    self._model.save_model(modelpath)
def build_supervised_model(data, kwargs):
    kwargs = default_kwargs(kwargs)
    with tempfile.NamedTemporaryFile(delete=False) as tmpf:
        for line in data:
            line = "__label__" + line.strip() + "\n"
            tmpf.write(line.encode("UTF-8"))
        tmpf.flush()
        model = train_supervised(input=tmpf.name, **kwargs)
    return model
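# A small usage sketch for the helper above, assuming each input line starts
# with its label token (so the "__label__" prefix lands on the label) and that
# default_kwargs fills in any missing training parameters:
data = [
    "positive great service and friendly staff",
    "negative slow shipping and a damaged box",
]
model = build_supervised_model(data, {"epoch": 25, "lr": 1.0})
print(model.predict("friendly staff and fast shipping"))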
def main(argv):
    input_file = argv[0]
    output_file = argv[1]
    model = ft.train_supervised(input=input_file, dim=200, epoch=30, lr=0.1,
                                label='__label__', thread=8)
    model.save_model(output_file)
def sup_test(self):
    def check(output_local, test_local, n_local, p1_local, r1_local, size_local, lessthan):
        test_args = self.default_test_args(output_local, test_local)
        test_output = self.get_test_output(test_args)
        self.assertEqual(
            str(test_output[0]), str(n_local),
            "N: Want: " + str(n_local) + " Is: " + str(test_output[0])
        )
        self.assertTrue(
            float(test_output[1]) >= float(p1_local),
            "p1: Want: " + str(p1_local) + " Is: " + str(test_output[1])
        )
        self.assertTrue(
            float(test_output[2]) >= float(r1_local),
            "r1: Want: " + str(r1_local) + " Is: " + str(test_output[2])
        )
        path_size = self.get_path_size(output_local)
        if lessthan:
            self.assertTrue(
                path_size <= size_local,
                "Size: Want at most: " + str(size_local) + " Is: " + str(path_size)
            )
        else:
            self.assertTrue(
                path_size == size_local,
                "Size: Want: " + str(size_local) + " Is: " + str(path_size)
            )

    train, test, output = self.build_paths(dataset + ".train", dataset + ".test", dataset)
    model = train_supervised(input=train, dim=10, lr=lr, wordNgrams=2, minCount=1,
                             bucket=10000000, epoch=5, thread=self.num_thread())
    model.save_model(output)
    check(output, test, n, p1, r1, size, False)
    # Exercising
    model.predict("hello world")
    model.quantize(input=train, retrain=True, cutoff=100000, qnorm=True)
    model.save_model(output + ".ftz")
    # Exercising
    model.predict("hello world")
    check(output + ".ftz", test, n, p1_q, r1_q, quant_size, True)
def sup_test(self):
    def get_path_size(path):
        path_size = subprocess.check_output(["stat", "-c", "%s", path]).decode('utf-8')
        return int(path_size)

    def check(model, model_filename, test, lessthan, msg_prefix=""):
        lines, labels = read_labels(test["data"])
        predictions = []
        for line in lines:
            pred_label, _ = model.predict(line)
            predictions.append(pred_label)
        p1_local_out, r1_local_out = util.test(predictions, labels)
        self.assertEqual(
            len(predictions), test["n"],
            msg_prefix + "N: Want: " + str(test["n"]) + " Is: " + str(len(predictions))
        )
        self.assertTrue(
            p1_local_out >= test["p1"],
            msg_prefix + "p1: Want: " + str(test["p1"]) + " Is: " + str(p1_local_out)
        )
        self.assertTrue(
            r1_local_out >= test["r1"],
            msg_prefix + "r1: Want: " + str(test["r1"]) + " Is: " + str(r1_local_out)
        )
        path_size = get_path_size(model_filename)
        size_msg = str(test["size"]) + " Is: " + str(path_size)
        if lessthan:
            self.assertTrue(path_size <= test["size"],
                            msg_prefix + "Size: Want at most: " + size_msg)
        else:
            self.assertTrue(path_size == test["size"],
                            msg_prefix + "Size: Want: " + size_msg)

    output = os.path.join(tempfile.mkdtemp(), configuration["dataset"])
    model = train_supervised(**configuration["train_args"])
    model.save_model(output + ".bin")
    check(model, output + ".bin", configuration["test"], False)
    model.quantize(**configuration["quant_args"])
    model.save_model(output + ".ftz")
    check(model, output + ".ftz", configuration["quant_test"], True, "Quant: ")
def do_fasttext(text, stars):
    # remove common words
    text_cleaned = []
    list_stopWords = list(set(stopwords.words('english')))
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']',
                            '&', '!', '*', '@', '#', '$', '%']
    d = enchant.Dict("en_US")
    for line in text:
        # tokenize
        list_words = word_tokenize(line.lower())
        # remove punctuation
        list_words = [word for word in list_words if word not in english_punctuations]
        # drop words that are not common English words (enchant dictionary check)
        # list_words = [word for word in list_words if wordnet.synsets(word)]
        list_words = [word for word in list_words if d.check(word)]
        # filter stopwords
        filtered_words = [w for w in list_words if w not in list_stopWords]
        text_cleaned.append(" ".join(filtered_words))
    # split into training and test sets; the test set is 20%
    # x_train, x_test, y_train, y_test = train_test_split(text, stars, test_size=0.2)
    x_train, x_test, y_train, y_test = train_test_split(text_cleaned, stars, test_size=0.2)
    # generate training and test data in the format fastText expects
    dump_file(x_train, y_train, "yelp_train.txt")
    dump_file(x_test, y_test, "yelp_test.txt")
    model = train_supervised(
        input="yelp_train.txt", epoch=20, lr=0.6, wordNgrams=2, verbose=2, minCount=1
    )

    def print_results(N, p, r):
        print("N\t" + str(N))
        print("P@{}\t{:.3f}".format(1, p))
        print("R@{}\t{:.3f}".format(1, r))

    print_results(*model.test("yelp_test.txt"))
def check(data):
    third = int(len(data) / 3)
    train_data = data[:2 * third]
    valid_data = data[third:]
    with tempfile.NamedTemporaryFile(delete=False) as tmpf, \
            tempfile.NamedTemporaryFile(delete=False) as tmpf2:
        for line in train_data:
            tmpf.write(("__label__" + line.strip() + "\n").encode("UTF-8"))
        tmpf.flush()
        for line in valid_data:
            tmpf2.write(("__label__" + line.strip() + "\n").encode("UTF-8"))
        tmpf2.flush()
        model = train_supervised(input=tmpf.name, **kwargs)
        true_labels = []
        all_words = []
        with open(tmpf2.name, 'r') as fid:
            for line in fid:
                if sys.version_info < (3, 0):
                    line = line.decode("UTF-8")
                if len(line.strip()) == 0:
                    continue
                words, labels = model.get_line(line.strip())
                if len(labels) == 0:
                    continue
                all_words.append(" ".join(words))
                true_labels += [labels]
        # per-example predictions should agree with model.test's aggregates
        predictions, _ = model.predict(all_words)
        p, r = util.test(predictions, true_labels)
        N = len(predictions)
        Nt, pt, rt = model.test(tmpf2.name)
        self.assertEqual(N, Nt)
        self.assertEqual(p, pt)
        self.assertEqual(r, rt)
def fit(self, X, y): """Fits the classifier Parameters ---------- X : array-like, shape = [n_samples, n_features] The training input samples. y : array-like, shape = [n_samples] The target values. An array of int. Returns ------- self : object Returns self. """ # Check that X and y have correct shape self._validate_x(X) y = self._validate_y(y) # Store the classes seen during fit self.classes_ = unique_labels(y) self.num_classes_ = len(self.classes_) self.class_labels_ = [ '__label__{}'.format(lbl) for lbl in self.classes_] # Dump training set to a fasttext-compatible file temp_trainset_fpath = temp_dataset_fpath() input_col = self._input_col(X) dump_xy_to_fasttext_format(input_col, y, temp_trainset_fpath) # train self.model = train_supervised( input=temp_trainset_fpath, **self.kwargs) # Return the classifier try: os.remove(temp_trainset_fpath) except FileNotFoundError: # pragma: no cover pass return self
def sup_test(self):
    def get_path_size(path):
        path_size = subprocess.check_output(["stat", "-c", "%s", path]).decode('utf-8')
        return int(path_size)

    def check(model, model_filename, test, lessthan, msg_prefix=""):
        N_local_out, p1_local_out, r1_local_out = model.test(test["data"])
        self.assertEqual(
            N_local_out, test["n"],
            msg_prefix + "N: Want: " + str(test["n"]) + " Is: " + str(N_local_out)
        )
        self.assertTrue(
            p1_local_out >= test["p1"],
            msg_prefix + "p1: Want: " + str(test["p1"]) + " Is: " + str(p1_local_out)
        )
        self.assertTrue(
            r1_local_out >= test["r1"],
            msg_prefix + "r1: Want: " + str(test["r1"]) + " Is: " + str(r1_local_out)
        )
        path_size = get_path_size(model_filename)
        size_msg = str(test["size"]) + " Is: " + str(path_size)
        if lessthan:
            self.assertTrue(path_size <= test["size"],
                            msg_prefix + "Size: Want at most: " + size_msg)
        else:
            self.assertTrue(path_size == test["size"],
                            msg_prefix + "Size: Want: " + size_msg)

    configuration["args"]["input"] = os.path.join(data_dir, configuration["args"]["input"])
    configuration["quant_args"]["input"] = configuration["args"]["input"]
    configuration["test"]["data"] = os.path.join(data_dir, configuration["test"]["data"])
    configuration["quant_test"]["data"] = configuration["test"]["data"]
    output = os.path.join(tempfile.mkdtemp(), configuration["dataset"])
    print()
    model = train_supervised(**configuration["args"])
    model.save_model(output + ".bin")
    check(model, output + ".bin", configuration["test"], False, msg_prefix="Supervised: ")
    print()
    model.quantize(**configuration["quant_args"])
    model.save_model(output + ".ftz")
    check(model, output + ".ftz", configuration["quant_test"], True, msg_prefix="Quantized: ")
import os

from fastText import train_supervised


def print_results(N, p, r):
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(1, p))
    print("R@{}\t{:.3f}".format(1, r))


if __name__ == "__main__":
    train_data = os.path.join(os.getenv("DATADIR", ''), 'cooking.train')
    valid_data = os.path.join(os.getenv("DATADIR", ''), 'cooking.valid')

    # train_supervised uses the same arguments and defaults as the fastText cli
    model = train_supervised(
        input=train_data, epoch=25, lr=1.0, wordNgrams=2, verbose=2, minCount=1
    )
    print_results(*model.test(valid_data))

    model = train_supervised(
        input=train_data, epoch=25, lr=1.0, wordNgrams=2, verbose=2, minCount=1,
        loss="hs"
    )
    print_results(*model.test(valid_data))
    model.save_model("cooking.bin")

    model.quantize(input=train_data, qnorm=True, retrain=True, cutoff=100000)
    print_results(*model.test(valid_data))
    model.save_model("cooking.ftz")
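# Once saved, either artifact can be loaded back and queried. A short sketch;
# the query sentence is made up for illustration:
from fastText import load_model

model = load_model("cooking.bin")
labels, probs = model.predict("Which baking dish is best for banana bread ?", k=3)
print(labels, probs)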
print("f1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) scores = cross_val_score(clf, x, y, cv = 5,scoring='accuracy') #print scores print("accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) if __name__ == '__main__': SogouTCE_kv=load_SogouTCE() #labels=load_url(SogouTCE_kv) x,y=load_selecteddata(SogouTCE_kv) stopwords=load_stopwords() #切割token x=[ [word for word in line.split() if word not in stopwords] for line in x] # 分割训练集和测试集 x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2) #按照fasttest的要求生成训练数据和测试数据 dump_file(x_train,y_train,"../data/sougou_train.txt") dump_file(x_test, y_test, "../data/sougou_test.txt") # train_supervised uses the same arguments and defaults as the fastText cli model = train_supervised( input="../data/sougou_train.txt", epoch=25, lr=0.9, wordNgrams=2, verbose=2, minCount=1 ) print_results(*model.test("../data/sougou_test.txt"))