Example #1
    def fit(self):
        '''
        Train the model.
        :return: None
        '''
        fastText.train_supervised(self.train_input,
                                  label="__label__").save_model(self.output)
Example #2
def test_fasttext():
    import os
    from fastText import train_supervised

    data_dir = "/home/davidyu/stock/scripts/davidyu_stock/scripts/analysis/news_report"
    train_data = os.path.join(data_dir, "train_text.txt")
    valid_data = os.path.join(data_dir, "valid_text.txt")

    model = train_supervised(input=train_data,
                             epoch=50,
                             lr=0.5,
                             wordNgrams=2,
                             dim=100,
                             verbose=2,
                             minCount=1,
                             thread=1)
    # retrain with an explicit softmax loss; the first model above is discarded
    model = train_supervised(input=train_data,
                             epoch=50,
                             lr=0.5,
                             wordNgrams=2,
                             dim=100,
                             verbose=2,
                             minCount=1,
                             thread=1,
                             loss="softmax")

    print_results(*model.test(valid_data))
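print_results isn't defined in this snippet; the standard helper from the fastText examples (it also appears in Example #38 below) makes it self-contained:

def print_results(N, p, r):
    # N is the number of evaluated samples; p and r are precision@1 and recall@1
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(1, p))
    print("R@{}\t{:.3f}".format(1, r))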
Example #3
def train_model(fname_prefix: str,
                out_fname: str,
                label_prefix: str = "__label__",
                **kwargs):
    # Train the model
    import logging

    import fastText

    params = {
        "dim": kwargs.get("dim", 300),
        "epoch": kwargs.get("epoch", 1000),
        "wordNgrams": kwargs.get("wordNgrams", 2),
        "verbose": kwargs.get("verbose", 2),
        "minCount": kwargs.get("minCount", 15),
        "minCountLabel": kwargs.get("minCountLabel", 5),
        "lr": kwargs.get("lr", 0.1),
        "neg": kwargs.get("neg", 10),
        "thread": kwargs.get("thread", 16),
        "loss": kwargs.get("loss", "ns"),
        "t": kwargs.get("t", 1e-5)
    }

    logging.info("Training fastText model", extra={"params": params})
    model = fastText.train_supervised(input="%s.train" % fname_prefix,
                                      label=label_prefix,
                                      **params)
    logging.info("Writing model to disk", extra={"output_file": out_fname})
    model.save_model(out_fname)
    return model
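A hypothetical invocation; the file prefix and overrides below are illustrative only (the function reads "<fname_prefix>.train"):

# hypothetical usage: trains on "reviews.train", writes "reviews.bin"
model = train_model("reviews", "reviews.bin", epoch=500, dim=100)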
Example #4
def train_sup(mode=fasttextConfig.create_data_word):
    # train supervised model
    print('start train supervised fasttext model')
    # init path
    if mode == fasttextConfig.create_data_word:
        input_path = fileConfig.dir_fasttext + fileConfig.file_fasttext_sup_train_word_data
        output_path = fileConfig.dir_fasttext + fileConfig.file_fasttext_sup_word_model
        test_path = fileConfig.dir_fasttext + fileConfig.file_fasttext_sup_test_word_data
    elif mode == fasttextConfig.create_data_char:
        input_path = fileConfig.dir_fasttext + fileConfig.file_fasttext_sup_train_char_data
        output_path = fileConfig.dir_fasttext + fileConfig.file_fasttext_sup_char_model
        test_path = fileConfig.dir_fasttext + fileConfig.file_fasttext_sup_test_char_data
    # train the model
    model = fastText.train_supervised(input=input_path,
                                      dim=200,
                                      epoch=100,
                                      lr=1.0,
                                      wordNgrams=3,
                                      ws=7,
                                      verbose=2,
                                      minCount=1,
                                      thread=8,
                                      loss='hs')
    print_results(*model.test(test_path))
    model.save_model(output_path)
    print("train sup fasttext finish")
Example #5
def fit(session, uid, path):
    labeled_text = dal.get_text_labeled_text(session, uid)

    # not sure if this is the right prereq for fasttext
    if len(labeled_text['targets']) < 2:
        return

    # preprocess training data one line at a time in order to limit pipe buffer issues
    preprocessed_data = [preprocess(datum) for datum in labeled_text['data']]

    # create a new temporary training data file
    fd, train_path = tempfile.mkstemp()

    # close the temporary training data file descriptor as we don't need it
    os.close(fd)

    # fill the temporary training data file
    with open(train_path, 'w') as f:
        for (target, datum) in zip(labeled_text['targets'], preprocessed_data):
            f.write(label_prefix + target + " " + datum + "\n")

    # train the fasttext model
    model = fastText.train_supervised(input=train_path)

    # compress the fasttext model to save space (disabled for now because it requires at least 256 rows)
    #model.quantize(input=train_path)

    # delete the temporary training data file
    os.unlink(train_path)

    # serialize the model out to the temporary model file
    model.save_model(path)
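preprocess and label_prefix come from the surrounding module and aren't shown; a minimal sketch of plausible definitions, purely as an assumption:

label_prefix = "__label__"  # assumed module-level constant

def preprocess(text):
    # hypothetical: collapse whitespace so each training example stays on one line
    return " ".join(text.split())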
Example #6
def fasttext_train(input, output):
    import fastText as ft
    model = ft.train_supervised(
        input=input,
        dim=300,
        pretrainedVectors='wiki-news-300d-1M-subword.vec')
    model.save_model(output)
Example #7
    def fit(self, X, y):
        if not isinstance(X, np.ndarray):
            X = np.array(X)
        if not isinstance(y, np.ndarray):
            y = np.array(y)

        path = data_to_temp(X, self.label, y)
        self._num_classes = len(np.unique(y))
        self._model = fastText.train_supervised(path,
                                                lr=self.lr,
                                                dim=self.dim,
                                                ws=self.ws,
                                                epoch=self.epoch,
                                                minCount=self.minCount,
                                                minCountLabel=self.minCountLabel,
                                                minn=self.minn,
                                                maxn=self.maxn,
                                                neg=self.neg,
                                                wordNgrams=self.wordNgrams,
                                                loss=self.loss,
                                                bucket=self.bucket,
                                                thread=self.thread,
                                                lrUpdateRate=self.lrUpdateRate,
                                                t=self.t,
                                                label=self.label,
                                                verbose=self.verbose)
        os.remove(path)
        fd, path = tempfile.mkstemp()
        self._model.save_softmax(path)
        self.class_embeddings = pd.read_csv(
            path, skiprows=[0], delimiter=' ').dropna(axis=1)
        os.remove(path)
        return self
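data_to_temp (also used in Example #11) isn't shown; a minimal sketch, assuming it dumps (X, y) to a temp file in fastText's "__label__<y> <x>" format and returns the path:

import os
import tempfile

def data_to_temp(X, label, y):
    # hypothetical helper: one "<label><y> <text>" line per sample
    fd, path = tempfile.mkstemp()
    with os.fdopen(fd, "w") as f:
        for text, target in zip(X, y):
            f.write("{}{} {}\n".format(label, target, text))
    return path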
Example #8
    def trainFastText(self, sents, labls, argsOut):

        if argsOut.load:
            # load given model
            print("Loading fastText model {0} ..".format(argsOut.load))
            return load_model(argsOut.load)

        outFile = "ft_runTrain{0}.txt".format(time.time())

        labls = self.roundLabelsRegress(labls)

        for i in range(0, len(sents)):
            sents[i] = "__label__{0} {1}".format(labls[i], sents[i])

        with open(outFile, mode="w", encoding="utf-8") as file:
            # writelines does not add newlines; sents are assumed to end with "\n"
            file.writelines(sents)

        # Get model
        model = train_supervised(input=outFile,
                                 epoch=argsOut.epochs,
                                 dim=argsOut.dim,
                                 bucket=10000000,
                                 lr=argsOut.lr,
                                 wordNgrams=argsOut.wordNgrams,
                                 verbose=2,
                                 minCount=1)
        os.remove(outFile)

        return model
Example #9
def train():
    if FLAGS.word_level:
        train_file = 'data/train_w.fast'
        dev_file = 'data/dev_w.fast'
        model_file = 'temp/ml/fast_model_w.bin'
    else:
        train_file = 'data/train_c.fast'
        dev_file = 'data/dev_c.fast'
        model_file = 'temp/ml/fast_model_c.bin'
    model = fastText.train_supervised(input=train_file,
                                      dim=100,
                                      epoch=15,
                                      thread=40,
                                      minCount=10,
                                      loss='softmax',
                                      wordNgrams=2)
    model.save_model(model_file)

    # model.test returns (N, precision@1, recall@1); N is the sample count, not a vocabulary size
    n_samples, p, r = model.test(train_file)
    f1 = (p * r * 2) / (p + r)
    print('Train: n:%d  p:%.5f   r:%.5f   f1:%.5f' % (n_samples, p, r, f1))

    n_samples, p, r = model.test(dev_file)
    f1 = (p * r * 2) / (p + r)
    print('Dev: n:%d  p:%.5f   r:%.5f   f1:%.5f' % (n_samples, p, r, f1))
Example #10
    def fit(self, X, y):
        '''
        Train the fastText classifier on the provided dataset. If a file with a
        serialized model trained on this exact dataset already exists, just load
        it; otherwise train the classifier and save the model.
        :param X: the set of instances in the dataset
        :param y: the set of labels in the dataset
        :return: None
        '''
        file_identifier = self._get_identifier_for_model_file(X, y)
        serialized_file = os.path.join(self.folder, file_identifier)

        if os.path.isfile(serialized_file):
            print("Loading the model from file " + str(serialized_file))
            # file already exists, fit means just loading the model
            self.model = load_model(serialized_file)

        else:
            print("Training the classifier " + str(self.name))
            # means that file does not exist, we have to train the model

            with open(SWAP_FILE, 'w', encoding='utf8') as f:
                for tweet, label in zip(X, y):
                    f.write(LABEL_IDENTIFIER + str(label) + SEPARATOR +
                            tweet.strip() + '\n')

            self.model = train_supervised(SWAP_FILE, **self.params)
            os.remove(SWAP_FILE)  # DELETE SWAP FILE AFTER USAGE

            # lastly, save the model; create the classifier's folder first if it doesn't exist

            os.makedirs(self.folder, exist_ok=True)
            self.model.save_model(serialized_file)
Example #11
    def fit(self, X, y):
        # Fit model
        if not isinstance(X, np.ndarray):
            X = np.array(X)
        if not isinstance(y, np.ndarray):
            y = np.array(y)

        path = data_to_temp(X, self.label, y)
        self._num_classes = len(np.unique(y))
        self._model = fastText.train_supervised(path,
                                                lr=self.lr,
                                                dim=self.dim,
                                                ws=self.ws,
                                                epoch=self.epoch,
                                                minCount=self.minCount,
                                                minCountLabel=self.minCountLabel,
                                                minn=self.minn,
                                                maxn=self.maxn,
                                                neg=self.neg,
                                                wordNgrams=self.wordNgrams,
                                                loss=self.loss,
                                                bucket=self.bucket,
                                                thread=self.thread,
                                                lrUpdateRate=self.lrUpdateRate,
                                                t=self.t,
                                                label=self.label,
                                                verbose=self.verbose)
        os.remove(path)
        return self
Example #12
    def _train_tagging_model(self, output, quantization=False):
        """Train a tagging model using an annotated file and save it to output.

        Pre-processing of the input file must have been done prior to training.
        Training file should be utf-8.
        The model can be quantized to reduce memory usage
        (but quantization is quite expensive).
        """
        log.info('Start training model.')
        new_model = fastText.train_supervised(input=self.tempfilename,
                                              epoch=self.epoch,
                                              lr=self.lr,
                                              wordNgrams=self.wordNgrams,
                                              minCount=self.minCount,
                                              dim=self.dim,
                                              loss=self.loss,
                                              thread=self.thread,
                                              neg=self.neg)
        log.info('Training model done.')
        if quantization:
            log.info('Start quantization.')
            new_model.quantize(thread=self.thread, retrain=False)
            log.info('Quantization done.')
        model_file = '{}/{}.bin'.format(resources_path, output)
        new_model.save_model(model_file)
        log.info('Model saved at {}.'.format(model_file))
        return model_file
Example #13
def get_predict(train_path, test_path):
    """
    :param train_path:      处理好的可以用来训练的数据
    :param test_path:       处理好的用来预测的无类标数据
    """
    import fastText
    classifier = fastText.train_supervised(
        train_path,
        lr=0.3,
        dim=200,
        epoch=8,
        minn=1,
        maxn=4,
        wordNgrams=2,
        loss='hs',
    )
    test_data = np.load(test_path)
    li = []
    j = 0
    for i in test_data:
        j += 1
        pre = classifier.predict(i[:-1], k=10)
        result = handle_ft_predict_result(
            pre)  # one 2-D array per sentence, e.g. [['价格', '0', '0.65184933'], ...]
        li.append(result)
    loc_predict_probability = rename(test_path,
                                     'probability_fastT_subOnly.npy')
    np.save(loc_predict_probability, li)
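handle_ft_predict_result (shared with Example #22) isn't shown; judging by the comment above, a rough sketch that pairs each predicted label with its probability, stripping the '__label__' prefix. The exact row layout is an assumption:

def handle_ft_predict_result(pre):
    # hypothetical: pre is the (labels, probs) pair from classifier.predict(..., k=10)
    labels, probs = pre
    return [[label.replace('__label__', ''), str(prob)]
            for label, prob in zip(labels, probs)]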
Example #14
    def process(self, data_file):
        json_object = self.load_json(data_file)
        json_object = self.replace_entities(json_object)
        self.load_answers(json_object)
        self.create_labeled_questions_file(json_object, "fasttext_data.txt")
        self.fast_text_model = fastText.train_supervised("fasttext_data.txt",
                                                         epoch=50)
Example #15
    def train(self, data, labels, dim=100, ng=2, epoch=10):
        if len(data) != len(labels):
            raise ValueError("Length of data (" + str(len(data)) +
                             ") does not match length of labels (" +
                             str(len(labels)) + ")")

        # Convert training data into strings for fastText
        mapped_report_strs = []
        for i in range(len(data)):
            report_string = data[i].replace("\n", " ")
            label = " __label__" + str(labels[i])
            mapped_report_strs.append(report_string + label)
        shuffle(mapped_report_strs)

        # Write strings to a temp file (plain text, despite the .bin extension)
        train_path = "./MODEL_TRAIN_TEMP.bin"
        with open(train_path, 'w') as outfile:
            for mrs in mapped_report_strs:
                outfile.write(mrs)
                outfile.write("\n")

        # Train fastText model
        self.model = fastText.train_supervised(train_path,
                                               dim=dim,
                                               epoch=epoch,
                                               thread=4,
                                               wordNgrams=ng)

        # Delete temp file
        os.remove(train_path)
Example #16
def fasttext_classify(data, extra_params={}):
  class_to_predict = 'type'  # product importance
  data[class_to_predict] = data[class_to_predict].map(lambda s: s.replace(" ", ""))
  data_for_fasttext = data['text'] + ' __label__' + data[class_to_predict]
  data_for_fasttext = shuffle(data_for_fasttext, random_state=77)

  num_records = len(data_for_fasttext)
  data_train = data_for_fasttext[:int(0.85 * num_records)]
  data_test = data_for_fasttext[int(0.85 * num_records):]

  data_train.to_csv(TRAIN_PATH, sep='\t', header=False, index=False)
  data_test.to_csv(TEST_PATH, sep='\t', header=False, index=False)

  model = fastText.train_supervised(TRAIN_PATH, **extra_params)
  #model.save_model(MODEL_PATH)
  print('Training accuracy:')
  train_accuracy = model.test(TRAIN_PATH)
  print(train_accuracy[-1])

  print('Test accuracy:')
  test_accuracy = model.test(TEST_PATH)
  print(test_accuracy[-1])

  y_pred = []
  y_true = []
  for test_item in data_test:
    test_text, test_label = test_item.split('__label__')
    y_pred.append(model.predict(test_text)[0][0])  # take the top label string, not the label tuple
    y_true.append('__label__' + test_label)

  print('F1 score: ' + str(f1_score(y_true, y_pred, average='weighted')))

  return test_accuracy[-1]  # model.test returns (N, precision, recall); with one label per line this equals accuracy
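A hypothetical call; it assumes a DataFrame with 'text' and 'type' columns and module-level TRAIN_PATH/TEST_PATH constants:

import pandas as pd

# hypothetical usage; extra_params is forwarded to train_supervised
df = pd.DataFrame({'text': ['great phone', 'poor support'],
                   'type': ['positive', 'negative']})
accuracy = fasttext_classify(df, extra_params={'epoch': 25, 'wordNgrams': 2})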
Example #17
    def train_model(self, traverse_list):
        """
        This is used for the LR and FT models, as each step of the model outputs a probability.
        :param traverse_list:
        :return: None
        """
        for i, node in enumerate(traverse_list):
            print("Node key: " + node.key)
            print("Child keys:")
            print(node.child.keys())
            print("Trained {0} models".format(i))
            if len(node.child.keys()) <= 2:
                continue
            if node.key == "*":
                continue
            if not node.child:
                continue
            train_data = self.get_train_data(node)
            if self.tree_type == "FT":
                node.classifier = train_supervised(input=train_data)
            elif self.tree_type == "LR":
                node.classifier = LogisticRegression(multi_class='multinomial', solver='newton-cg')
                node.classifier.fit(train_data[0], train_data[1])
            elif self.tree_type == "SVM":
                node.classifier = LinearSVC()
                node.classifier.fit(train_data[0], train_data[1])
Example #18
    def train(self):
        master = master_file.MasterFile()
        data = master.get_training_data()

        traning = traning_file.TraningFile()
        traning.reset()

        hasText = False
        for uuid in data:
            for name in data[uuid]:
                traning.append('__label__' + uuid + ' , ' +
                               app.get_wakati(data[uuid][name]['title']))
                hasText = True

        if hasText is False:
            return

        self._model = train_supervised(input=app.get_output_train_text(),
                                       epoch=200,
                                       lr=0.7,
                                       wordNgrams=2,
                                       loss="hs",
                                       dim=100)
        with self._lock:
            self.__print_results(
                *self._model.test(app.get_output_train_text()))
            self._model.save_model(app.get_output_fasttext_model())
Example #20
def trainModel(request):
    if request.method == 'POST':
        epoch = int(request.POST['epochValue'])
        lr = float(request.POST['learningRateValue'])
        user = request.session['userEmail']
        user = user.split('@')
        """print('epoch:')
        print(epoch)
        print('lr')
        print(lr)"""
        #Train the model (pass hyper-parameters by keyword; the positional
        #order of train_supervised's optional arguments is easy to get wrong)
        model = fastText.train_supervised(
            input=settings.BASE_DIR + '/media/temp/' + user[0] + 'trainData.txt',
            lr=lr,
            epoch=epoch)

        #Finished test
        result = model.test(settings.BASE_DIR + '/media/temp/' + user[0] +
                            'testData.txt')
        precision = float(result[1])
        recall = float(result[2])

        #Compute F1 score from precision and recall
        Fmeasure = 2 * ((precision * recall) / (precision + recall))
        Fmeasure = "{:.2%}".format(Fmeasure)

        #Save model
        model.save_model(settings.BASE_DIR + '/media/temp/' + user[0] +
                         'trainedModel.bin')

        data = {'status': Fmeasure}
    else:
        # avoid a NameError on non-POST requests
        data = {'status': 'invalid request'}

    return JsonResponse(data)
Example #21
def run(ps, i):
    lr = random.uniform(0, 1)
    epoch = round(random.uniform(5, 50))
    wordNgrams = round(random.uniform(1, 5))
    minCount = round(random.uniform(1, 10))
    model = fastText.train_supervised(input=ps[0],
                                      lr=lr,
                                      epoch=epoch,
                                      wordNgrams=wordNgrams,
                                      minCount=minCount)

    # read the validation file directly to get per-line predictions,
    # not just an aggregate metric from the model
    preds = Path(ps[1]).read_text().split('\n')

    truth = []
    output = []
    for p in preds:
        # assumes fixed-width labels like "__label__0" (10 chars) plus a space
        label = p[:10]
        text = p[11:]
        truth.append(label)
        output.append(model.predict(text)[0][0])

    rpt = sklearn.metrics.classification_report(truth,
                                                output,
                                                output_dict=True)

    rpt['lr'] = lr
    rpt['epoch'] = epoch
    rpt['wordNgrams'] = wordNgrams
    rpt['minCount'] = minCount
    rpt['kappa'] = sklearn.metrics.cohen_kappa_score(truth, output)
    Path(ps[0].replace('train.csv',
                       f"{i}_results.json")).write_text(json.dumps(rpt))
Example #22
def get_predict(train_path, test_path):
    """
    :param train_path:      处理好的可以用来训练的数据
    :param test_path:       处理好的用来预测的无类标数据
    """
    import fastText
    classifier = fastText.train_supervised(
        train_path,
        lr=0.3,
        dim=10,
        epoch=10,
        minn=1,
        maxn=4,
        wordNgrams=1,
        loss='hs',
    )
    test_data = np.load(test_path)
    li = []
    j = 0
    for i in test_data:
        j += 1
        pre = classifier.predict(i[:-1], k=10)
        result = handle_ft_predict_result(
            pre)  # one 2-D array per sentence, e.g. [['价格', '0', '0.65184933'], ...]
        li.append(result)
    loc_predict_probability = rename(test_path,
                                     '/probability_fastT_multiLabel.npy')
    np.save(loc_predict_probability, li)
Example #23
def fasttext_train_valid(train_X, train_y, model_dir, model_name, **kwargs):
    """
    :param train_X: 训练数据[type:txt]
    :param train_y: 验证数据[type:txt]
    :param model_dir: 模型保存的文件夹[type:string]
    :param model_name: 模型的命名
    :parma **kwargs: 模型训练的参数
    :return clf, precision, recall: 返回分类器,precision表现和recall表现
    """
    clf = train_supervised(input=train_X, **kwargs)
    result = clf.test(train_y)
    precision = result[1]
    recall = result[2]

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    __model = os.path.join(model_dir, model_name)
    clf.save_model('%s.bin' % __model)

    __record = {
        'train_X': train_X,
        'train_y': train_y,
        'model_path': __model,
        'training_parameter': kwargs,
        'precision': precision,
        'recall': recall
    }
    pprint.pprint(__record, width=1)
    return clf, precision, recall
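A hypothetical call, with paths and hyper-parameters chosen for illustration; the kwargs go straight to train_supervised:

clf, precision, recall = fasttext_train_valid(
    'data/train.txt', 'data/valid.txt', 'models/', 'news_clf',
    epoch=25, lr=0.5, wordNgrams=2)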
Example #24
def model():
    # model = fastText.train_supervised(path + 'train_set.txt', label='__myprefix__',bucket=400000
    #                                        ,wordNgrams=2,minCount=3,lr=1,lrUpdateRate=0)
    model = fastText.train_supervised(path + 't_train_set.txt', label='__myprefix__',
                                      bucket=39759, wordNgrams=3, minCount=3,
                                      lr=1, lrUpdateRate=200, dim=128)
    result = model.test(path + 't_test_set.txt')
    print(result)
    # model.save_model(model_path + 'model')

    true_labels = []
    all_words = []
    with open(path + 't_test_set.txt', 'r') as f:
        for line in f:
            words, labels = model.get_line(line.strip())
            if len(labels) == 0:
                continue
            all_words.append(" ".join(words))
            true_labels += [labels]
    predictions, _ = model.predict(all_words)

    n = 0
    for i in range(len(true_labels)):
        if predictions[i] == true_labels[i]:
            n += 1
    print(n / len(true_labels))
Example #25
def train():
    # (prepare the training corpus beforehand)
    ftrain = 'reviews_fasttext_train.txt'
    ftest = 'reviews_fasttext_test.txt'

    # train the model
    classifier = fastText.train_supervised(ftrain, label="__label__")
    classifier.save_model("reviews_fasttext.bin")
Example #26
    def retrain(self):
        if len(self.data) < self.config['models']['fast_text']['min_train_data']:
            return

        self.log.info('Retraining bilinear model, explore_rate = %.2f' %
                      self.explore_rate)

        train_file = tempfile.NamedTemporaryFile(mode='w')

        best_tactic = {}
        for scored_candidate, status in self.data:
            smt_file = scored_candidate.benchmarks[0].file
            tactics = get_tactics(scored_candidate.t)

            if status != ScoredCandidateStatus.SOLVED:
                continue

            for i, tactic in enumerate(tactics):
                fast_text_line = self.encode(tactics[:i])
                key = (smt_file, fast_text_line)
                if key not in best_tactic or best_tactic[key][1] > scored_candidate.rlimit:
                    best_tactic[key] = (tactic.s, scored_candidate.rlimit)
                    entry = '__label__%s %s' % (tactic.s, fast_text_line)
                    train_file.write(entry + '\n')

        for scored_candidate, status in self.data:
            smt_file = scored_candidate.benchmarks[0].file
            tactics = get_tactics(scored_candidate.t)

            if status != ScoredCandidateStatus.SOLVED:
                continue

            for i, tactic in enumerate(tactics):
                params = self.strategy_enum.extract_params([tactic])[0]
                disc_idx = self.map_to_discretized(tactic.s, params)
                fast_text_line = self.encode(tactics[:i])
                key = (smt_file, fast_text_line)

                if best_tactic[key][1] == scored_candidate.rlimit:
                    entry = '__label__%s_%d %s' % (tactic.s, disc_idx,
                                                   fast_text_line)
                    train_file.write(entry + '\n')
        train_file.flush()
        os.fsync(train_file.fileno())

        self.log.info('Created dataset of %d entries' % len(best_tactic))

        self.bilinear_model = fastText.train_supervised(
            input=train_file.name,
            epoch=self.config['models']['fast_text']['epoch'],
            lr=self.config['models']['fast_text']['lr'],
            wordNgrams=self.config['models']['fast_text']['ngrams'],
            verbose=self.config['models']['fast_text']['verbose'],
            minCount=self.config['models']['fast_text']['min_count'],
            dim=self.config['models']['fast_text']['dim'],
        )
Example #27
def train():
    model = fastText.train_supervised(path + "/materials/fastTexttrain.input",
                                      dim=108,
                                      epoch=100,
                                      lr=0.001,
                                      pretrainedVectors=path +
                                      "/materials/umls.embeddings")
    model.save_model(path + "/materials/fastTexttrain.model")
    print_results(*model.test(path + "/materials/fastTextval.input"))
Example #28
def fasttext(input, epoch=25, wordNgrams=2):

    model = train_supervised(input=input,
                             epoch=epoch,
                             lr=0.9,
                             wordNgrams=wordNgrams,
                             verbose=2,
                             minCount=1)
    return model
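A hypothetical call; the positional argument is the path to a fastText-format training file:

model = fasttext('data/train.txt', epoch=50, wordNgrams=3)
model.save_model('model.bin')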
Example #29
    def _create_model(self):
        self.info('creating fastText model')
        trainpath = os.path.join(self.datadir, self.TRAIN_FILE)
        modelpath = os.path.join(self.datadir, self.MODEL_FILE)
        params = {param: self.FASTTEXT_PARAMS[param](val)
                  for param, val in self.params.items()
                  if param in self.FASTTEXT_PARAMS}
        self._model = fastText.train_supervised(trainpath, **params)
        self._model.save_model(modelpath)
Example #30
def build_supervised_model(data, kwargs):
    kwargs = default_kwargs(kwargs)
    with tempfile.NamedTemporaryFile(delete=False) as tmpf:
        for line in data:
            line = "__label__" + line.strip() + "\n"
            tmpf.write(line.encode("UTF-8"))
        tmpf.flush()
        model = train_supervised(input=tmpf.name, **kwargs)
    return model
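default_kwargs isn't shown; a minimal sketch, assuming it layers caller overrides on top of library-style defaults:

def default_kwargs(kwargs):
    # hypothetical helper: sensible training defaults, caller wins on conflict
    defaults = {"epoch": 25, "lr": 0.5, "wordNgrams": 2, "minCount": 1}
    defaults.update(kwargs or {})
    return defaults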
Example #31
def main(argv):
    input_file = argv[0]
    output_file = argv[1]
    model = ft.train_supervised(input=input_file,
                                dim=200,
                                epoch=30,
                                lr=0.1,
                                label='__label__',
                                thread=8)
    model.save_model(output_file)
Example #32
    def sup_test(self):
        def check(
            output_local, test_local, n_local, p1_local, r1_local, size_local,
            lessthan
        ):
            test_args = self.default_test_args(output_local, test_local)
            test_output = self.get_test_output(test_args)
            self.assertEqual(
                str(test_output[0]),
                str(n_local),
                "N: Want: " + str(n_local) + " Is: " + str(test_output[0])
            )
            self.assertTrue(
                float(test_output[1]) >= float(p1_local),
                "p1: Want: " + str(p1_local) + " Is: " + str(test_output[1])
            )
            self.assertTrue(
                float(test_output[2]) >= float(r1_local),
                "r1: Want: " + str(r1_local) + " Is: " + str(test_output[2])
            )
            path_size = self.get_path_size(output_local)
            if lessthan:
                self.assertTrue(
                    path_size <= size_local, "Size: Want at most: " +
                    str(size_local) + " Is: " + str(path_size)
                )
            else:
                self.assertTrue(
                    path_size == size_local,
                    "Size: Want: " + str(size_local) + " Is: " + str(path_size)
                )

        train, test, output = self.build_paths(
            dataset + ".train", dataset + ".test", dataset
        )
        model = train_supervised(
            input=train,
            dim=10,
            lr=lr,
            wordNgrams=2,
            minCount=1,
            bucket=10000000,
            epoch=5,
            thread=self.num_thread()
        )
        model.save_model(output)
        check(output, test, n, p1, r1, size, False)
        # Exercising
        model.predict("hello world")
        model.quantize(input=train, retrain=True, cutoff=100000, qnorm=True)
        model.save_model(output + ".ftz")
        # Exercising
        model.predict("hello world")
        check(output + ".ftz", test, n, p1_q, r1_q, quant_size, True)
Example #33
    def sup_test(self):
        def get_path_size(path):
            path_size = subprocess.check_output(["stat", "-c", "%s",
                                                 path]).decode('utf-8')
            path_size = int(path_size)
            return path_size

        def check(model, model_filename, test, lessthan, msg_prefix=""):
            lines, labels = read_labels(test["data"])
            predictions = []
            for line in lines:
                pred_label, _ = model.predict(line)
                predictions.append(pred_label)
            p1_local_out, r1_local_out = util.test(predictions, labels)
            self.assertEqual(
                len(predictions), test["n"], msg_prefix + "N: Want: " +
                str(test["n"]) + " Is: " + str(len(predictions))
            )
            self.assertTrue(
                p1_local_out >= test["p1"], msg_prefix + "p1: Want: " +
                str(test["p1"]) + " Is: " + str(p1_local_out)
            )
            self.assertTrue(
                r1_local_out >= test["r1"], msg_prefix + "r1: Want: " +
                str(test["r1"]) + " Is: " + str(r1_local_out)
            )
            path_size = get_path_size(model_filename)
            size_msg = str(test["size"]) + " Is: " + str(path_size)
            if lessthan:
                self.assertTrue(
                    path_size <= test["size"],
                    msg_prefix + "Size: Want at most: " + size_msg
                )
            else:
                self.assertTrue(
                    path_size == test["size"],
                    msg_prefix + "Size: Want: " + size_msg
                )

        output = os.path.join(tempfile.mkdtemp(), configuration["dataset"])
        model = train_supervised(**configuration["train_args"])
        model.save_model(output + ".bin")
        check(model, output + ".bin", configuration["test"], False)
        model.quantize(**configuration["quant_args"])
        model.save_model(output + ".ftz")
        check(
            model, output + ".ftz", configuration["quant_test"], True, "Quant: "
        )
Example #34
def do_fasttext(text, stars):
    # remove common words (stopwords)
    text_cleaned = []

    list_stopWords = list(set(stopwords.words('english')))
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']
    d = enchant.Dict("en_US")

    for line in text:

        # tokenize
        list_words = word_tokenize(line.lower())
        # remove punctuation
        list_words = [word for word in list_words if word not in english_punctuations]
        # drop uncommon English words (wordnet variant commented out; enchant used instead)
        #list_words = [word for word in list_words if wordnet.synsets(word) ]
        list_words = [word for word in list_words if d.check(word)]
        # filter out stopwords
        filtered_words = [w for w in list_words if w not in list_stopWords]
        text_cleaned.append(" ".join(filtered_words))

    # split into training and test sets; the test set takes 20%
    #x_train, x_test, y_train, y_test = train_test_split(text, stars, test_size=0.2)
    x_train, x_test, y_train, y_test = train_test_split(text_cleaned, stars, test_size=0.2)

    # generate training and test data in the format fastText expects
    dump_file(x_train, y_train, "yelp_train.txt")
    dump_file(x_test, y_test, "yelp_test.txt")

    model = train_supervised(
        input="yelp_train.txt", epoch=20, lr=0.6, wordNgrams=2, verbose=2, minCount=1
    )

    def print_results(N, p, r):
        print("N\t" + str(N))
        print("P@{}\t{:.3f}".format(1, p))
        print("R@{}\t{:.3f}".format(1, r))

    print_results(*model.test("yelp_test.txt"))
Example #35
    def check(data):
        third = int(len(data) / 3)
        train_data = data[:2 * third]
        valid_data = data[third:]
        with tempfile.NamedTemporaryFile(
            delete=False
        ) as tmpf, tempfile.NamedTemporaryFile(delete=False) as tmpf2:
            for line in train_data:
                tmpf.write(
                    ("__label__" + line.strip() + "\n").encode("UTF-8")
                )
            tmpf.flush()
            for line in valid_data:
                tmpf2.write(
                    ("__label__" + line.strip() + "\n").encode("UTF-8")
                )
            tmpf2.flush()
            model = train_supervised(input=tmpf.name, **kwargs)
            true_labels = []
            all_words = []
            with open(tmpf2.name, 'r') as fid:
                for line in fid:
                    if sys.version_info < (3, 0):
                        line = line.decode("UTF-8")
                    if len(line.strip()) == 0:
                        continue
                    words, labels = model.get_line(line.strip())
                    if len(labels) == 0:
                        continue
                    all_words.append(" ".join(words))
                    true_labels += [labels]
            predictions, _ = model.predict(all_words)
            p, r = util.test(predictions, true_labels)
            N = len(predictions)
            Nt, pt, rt = model.test(tmpf2.name)
            self.assertEqual(N, Nt)
            self.assertEqual(p, pt)
            self.assertEqual(r, rt)
Example #36
    def fit(self, X, y):
        """Fits the classifier

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples.
        y : array-like, shape = [n_samples]
            The target values. An array of int.

        Returns
        -------
        self : object
            Returns self.
        """
        # Check that X and y have correct shape
        self._validate_x(X)
        y = self._validate_y(y)
        # Store the classes seen during fit
        self.classes_ = unique_labels(y)
        self.num_classes_ = len(self.classes_)
        self.class_labels_ = [
            '__label__{}'.format(lbl) for lbl in self.classes_]
        # Dump training set to a fasttext-compatible file
        temp_trainset_fpath = temp_dataset_fpath()
        input_col = self._input_col(X)
        dump_xy_to_fasttext_format(input_col, y, temp_trainset_fpath)
        # train
        self.model = train_supervised(
            input=temp_trainset_fpath, **self.kwargs)
        # Return the classifier
        try:
            os.remove(temp_trainset_fpath)
        except FileNotFoundError:  # pragma: no cover
            pass
        return self
Example #37
    def sup_test(self):
        def get_path_size(path):
            path_size = subprocess.check_output(["stat", "-c", "%s",
                                                 path]).decode('utf-8')
            path_size = int(path_size)
            return path_size

        def check(model, model_filename, test, lessthan, msg_prefix=""):
            N_local_out, p1_local_out, r1_local_out = model.test(test["data"])
            self.assertEqual(
                N_local_out, test["n"], msg_prefix + "N: Want: " +
                str(test["n"]) + " Is: " + str(N_local_out)
            )
            self.assertTrue(
                p1_local_out >= test["p1"], msg_prefix + "p1: Want: " +
                str(test["p1"]) + " Is: " + str(p1_local_out)
            )
            self.assertTrue(
                r1_local_out >= test["r1"], msg_prefix + "r1: Want: " +
                str(test["r1"]) + " Is: " + str(r1_local_out)
            )
            path_size = get_path_size(model_filename)
            size_msg = str(test["size"]) + " Is: " + str(path_size)
            if lessthan:
                self.assertTrue(
                    path_size <= test["size"],
                    msg_prefix + "Size: Want at most: " + size_msg
                )
            else:
                self.assertTrue(
                    path_size == test["size"],
                    msg_prefix + "Size: Want: " + size_msg
                )

        configuration["args"]["input"] = os.path.join(
            data_dir, configuration["args"]["input"]
        )
        configuration["quant_args"]["input"] = configuration["args"]["input"]
        configuration["test"]["data"] = os.path.join(
            data_dir, configuration["test"]["data"]
        )
        configuration["quant_test"]["data"] = configuration["test"]["data"]
        output = os.path.join(tempfile.mkdtemp(), configuration["dataset"])
        print()
        model = train_supervised(**configuration["args"])
        model.save_model(output + ".bin")
        check(
            model,
            output + ".bin",
            configuration["test"],
            False,
            msg_prefix="Supervised: "
        )
        print()
        model.quantize(**configuration["quant_args"])
        model.save_model(output + ".ftz")
        check(
            model,
            output + ".ftz",
            configuration["quant_test"],
            True,
            msg_prefix="Quantized: "
        )
Example #38
import os
from fastText import train_supervised


def print_results(N, p, r):
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(1, p))
    print("R@{}\t{:.3f}".format(1, r))

if __name__ == "__main__":
    train_data = os.path.join(os.getenv("DATADIR", ''), 'cooking.train')
    valid_data = os.path.join(os.getenv("DATADIR", ''), 'cooking.valid')

    # train_supervised uses the same arguments and defaults as the fastText cli
    model = train_supervised(
        input=train_data, epoch=25, lr=1.0, wordNgrams=2, verbose=2, minCount=1
    )
    print_results(*model.test(valid_data))

    model = train_supervised(
        input=train_data, epoch=25, lr=1.0, wordNgrams=2, verbose=2, minCount=1,
        loss="hs"
    )
    print_results(*model.test(valid_data))
    model.save_model("cooking.bin")

    model.quantize(input=train_data, qnorm=True, retrain=True, cutoff=100000)
    print_results(*model.test(valid_data))
    model.save_model("cooking.ftz")
Example #39
    print("f1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    scores = cross_val_score(clf, x, y, cv=5, scoring='accuracy')
    print("accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

if __name__ == '__main__':
    SogouTCE_kv = load_SogouTCE()

    #labels=load_url(SogouTCE_kv)

    x, y = load_selecteddata(SogouTCE_kv)

    stopwords = load_stopwords()

    # tokenize and filter out stopwords
    x = [[word for word in line.split() if word not in stopwords] for line in x]

    # split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    # generate training and test data in the format fastText expects
    dump_file(x_train,y_train,"../data/sougou_train.txt")
    dump_file(x_test, y_test, "../data/sougou_test.txt")

    # train_supervised uses the same arguments and defaults as the fastText cli
    model = train_supervised(
        input="../data/sougou_train.txt",
        epoch=25, lr=0.9, wordNgrams=2, verbose=2, minCount=1
    )
    print_results(*model.test("../data/sougou_test.txt"))