Code Example #1
 def train_model(self):
     classify_model = None
     if os.path.exists(self.preTrained_vectors):
         logging.info("存在预训练的词向量,从本地加载词向量进行训练...")
         classify_model = fasttext.supervised(
             self.fasttext_train_file,
             self.model_file[0:-4],
             lr=0.1,
             epoch=100,
             dim=self.fasttext_dim,
             bucket=50000000,
             loss='softmax',
             thread=56,
             min_count=3,
             word_ngrams=4,
             pretrained_vectors=self.preTrained_vectors,
             silent=False)
     else:
         logging.info("不存在预训练的词向量,重头开始训练...")
         classify_model = fasttext.supervised(self.fasttext_train_file,
                                              self.model_file[0:-4],
                                              lr=0.1,
                                              epoch=100,
                                              dim=self.fasttext_dim,
                                              bucket=50000000,
                                              loss='softmax',
                                              thread=56,
                                              min_count=3,
                                              word_ngrams=4,
                                              silent=False)
     return classify_model
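
The two branches above differ only in the pretrained_vectors argument. A minimal, non-duplicated sketch of the same logic (assuming the same attributes and the legacy fasttext.supervised API):

 def train_model(self):
     # Shared hyperparameters for both branches.
     params = dict(lr=0.1, epoch=100, dim=self.fasttext_dim, bucket=50000000,
                   loss='softmax', thread=56, min_count=3, word_ngrams=4,
                   silent=False)
     if os.path.exists(self.preTrained_vectors):
         logging.info("Pre-trained word vectors found; loading them for training...")
         params['pretrained_vectors'] = self.preTrained_vectors
     else:
         logging.info("No pre-trained word vectors found; training from scratch...")
     return fasttext.supervised(self.fasttext_train_file,
                                self.model_file[0:-4], **params)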
Code Example #2
def train(attrs, path):
    for attr in attrs:
        print('Start training ---', attr)
        train_file = path + 'train-cut/' + attr + '_train.txt'
        model = path + 'model-cut/' + attr + '_model'
        fasttext.supervised(train_file, model, label_prefix='__label__')
        print('Training finished ---', attr)
Code Example #3
def train():
    fasttext.supervised("/tmp/xinlang.train",
                        "/tmp/xinlang.model",
                        label_prefix="__label__",
                        lr=1,
                        dim=200,
                        word_ngrams=2,
                        bucket=10000000,
                        epoch=20)
Code Example #4
def train_fasttext(data_path="./data/question/disease",
                   model_path="./data/fasttext.model"):
    """
    This function is used to train the fasttext classifier
    :param data_path: string, the path of training data.
    :param model_path: string, the path to save the trained model of fasttext.
    :return:
    """
    path = preprocess.generate_train_text(data_path)
    fasttext.supervised(path, model_path, label_prefix="__label__")
Code Example #5
 def train(self, input_file, output_file):
     # TODO: access label prefix info from create_training_data.py
     #       or create new sub-command to create training data
     fasttext.supervised(input_file,
                         output_file,
                         label_prefix='__LABEL__',
                         dim=300,
                         min_count=1,
                         thread=2,
                         silent=0)
Code Example #6
 def train(self):
     print('--- starting training ---')
     if os.path.exists(self.pretrained_vec_file_name):
         print('found pretrained word vector')
         fasttext.supervised(
             self.training_file,
             self.model_file,
             pretrained_vectors=self.pretrained_vec_file_name,
             dim=self.dim)
     else:
         fasttext.supervised(self.training_file, self.model_file)
     print('--- finished training ---')
Code Example #7
File: rojak.py Project: IndonesiaHaze/rojak
def train(data, output):
    """Train Rojak"""
    # TODO: access label prefix info from create_training_data.py
    #       or create new sub-command to create training data
    # TODO: access training fasttext model using class wrapper
    fasttext.supervised(data,
                        output,
                        label_prefix='__LABEL__',
                        dim=300,
                        min_count=1,
                        thread=2,
                        silent=0)
Code Example #8
 def fit(self, features, labels):
     """Trains the fasttext classifier on the provided data and outputs the results."""
     store_data_in_fasttext_file_format(
         os.path.join(self.model_dir, "train.txt"), features, labels)
     fasttext.supervised(os.path.join(self.model_dir, "train.txt"),
                         os.path.join(self.model_dir, "cv_model"),
                         label_prefix='__label__',
                         bucket=2000000,
                         epoch=10,
                         dim=300,
                         lr=0.005)
     self.model = fasttext.load_model(
         os.path.join(self.model_dir, 'cv_model.bin'))
     return self
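
store_data_in_fasttext_file_format is defined elsewhere in that project; a plausible sketch, assuming features is an iterable of already-tokenized text strings and labels the matching class names (the signature is an assumption, the format is the standard __label__ convention):

def store_data_in_fasttext_file_format(file_path, features, labels):
    # Assumed helper: write one "__label__<label> <text>" line per example,
    # which is the input format the legacy fasttext wrapper expects.
    with open(file_path, "w") as f:
        for text, label in zip(features, labels):
            f.write("__label__{} {}\n".format(label, text))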
Code Example #9
 def fit(self):
     print("Henter inn tekst")
     self.trainFolder2fasttext()
     print("Starter trening")
     if self.wikiVec == True:
         print("Kjører test med forhåndstrente Embeddings")
         self.model = fasttext.supervised(input_file=self.tmp_ft_file_path, output='model', epoch=self.epochs,
                                          lr=self.learningRate, lr_update_rate=self.lrUpdate, loss=self.lossFunction,
                                          ws=self.wordWindow, pretrained_vectors = self.wikiPath)
     else:
         self.model = fasttext.supervised(input_file=self.tmp_ft_file_path, output='model', epoch=self.epochs,
                                          lr=self.learningRate, lr_update_rate=self.lrUpdate, loss=self.lossFunction,
                                          ws=self.wordWindow)
     os.remove(self.tmp_ft_file_path)
Code Example #10
    def test_train_classifier(self):
        # set params
        dim = 10
        lr = 0.005
        epoch = 1
        min_count = 1
        word_ngrams = 3
        bucket = 2000000
        thread = 4
        silent = 1
        label_prefix = '__label__'

        # Train the classifier
        model = ft.supervised(input_file, output, dim=dim, lr=lr, epoch=epoch,
                min_count=min_count, word_ngrams=word_ngrams, bucket=bucket,
                thread=thread, silent=silent, label_prefix=label_prefix)

        # Make sure the model is generated correctly
        self.assertEqual(model.dim, dim)
        self.assertEqual(model.epoch, epoch)
        self.assertEqual(model.min_count, min_count)
        self.assertEqual(model.word_ngrams, word_ngrams)
        self.assertEqual(model.bucket, bucket)

        # Read labels from the input_file
        labels = read_labels_from_input(input_file, label_prefix)

        # Make sure labels are loaded correctly
        self.assertTrue(sorted(model.labels) == sorted(labels))

        # Make sure .bin and .vec are generated
        self.assertTrue(path.isfile(output + '.bin'))
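
The test above relies on a read_labels_from_input helper that is not shown. A hypothetical implementation, assuming one example per line with whitespace-separated tokens and labels marked by label_prefix:

def read_labels_from_input(input_file, label_prefix):
    # Hypothetical helper: collect the distinct labels found in the
    # training file, stripping the label prefix from each token.
    labels = set()
    with open(input_file) as f:
        for line in f:
            for token in line.split():
                if token.startswith(label_prefix):
                    labels.add(token[len(label_prefix):])
    return list(labels)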
Code Example #11
def train_model(lines,
                filename='/tmp/model.train',
                output='model/model',
                dim=100,
                lr=0.1,
                epoch=6,
                min_count=1,
                word_ngrams=1,
                bucket=1000000,
                thread=4,
                silent=1,
                label_prefix='__label__',
                remove_after=False):
    save_file(lines, filename)

    mkdir_p(os.path.dirname(output))

    classifier = ft.supervised(filename,
                               output,
                               dim=dim,
                               lr=lr,
                               epoch=epoch,
                               min_count=min_count,
                               word_ngrams=word_ngrams,
                               bucket=bucket,
                               thread=thread,
                               silent=silent,
                               label_prefix=label_prefix)

    if remove_after:
        os.remove(filename)
        os.remove(output + '.bin')

    return classifier
Code Example #12
def fast_text(tweets, test_tweets):
    """
    DESCRIPTION: 
            Applies FastText Algorithm
    INPUT: 
            tweets: Dataframe of train tweets
            test_tweets: Dataframe of test tweets
    OUTPUT: 
            labels: list of predicted labels of 1 or -1
    """

    tweets['sentiment'] = change_label(tweets['sentiment'])

    write_tweets_with_fasttext_labels(tweets)

    classifier = fasttext.supervised(FASTTEXT_TRAIN_FILE, FASTTEXT_MODEL,
                                     label_prefix='__label__',
                                     epoch=algorithm['params']['epochs'],
                                     dim=algorithm['params']['we_features'],
                                     ws=algorithm['params']['window_size'],
                                     lr=algorithm['params']['learning_rate'])

    test_tweets = transform_test_tweets(test_tweets)

    labels = classifier.predict(test_tweets)
    labels = transform_labels(labels)
    return labels
Code Example #13
def train_fasttext(train_file):
    logging.info("start training FT model...")
    temp_ft_model = fasttext.supervised(train_file,
                                        TEMP_FT_FILE,
                                        label_prefix='__label__')
    logging.info('training ft finished!')
    return temp_ft_model
Code Example #14
 def train(self, txt_path, config=DEF_CONFIG):
     if self.mode == "skipgram":
         self.model = fasttext.skipgram(txt_path, self.model_path, **config)
     elif self.mode == "cbow":
         self.model = fasttext.cbow(txt_path, self.model_path, **config)
     elif self.mode == "supervised":
         self.model = fasttext.supervised(txt_path, self.model_path, **config)
Code Example #15
	def _trainModel(self, train_x, train_y, tag_level):
		if not self._loadConfig():
			sys.exit(1)

		# Create a temporary fasttext train file from the sklearn train data
		train_file_name = convertSkTrainFileToFastTextFile(train_x, train_y, self._model_path, self._label_prefix)
		if train_file_name is None:
			print('failed to convert the train file')
			sys.exit(1)

		try:
			if '1' == tag_level:
				model_name = self._level1_tag_model_name
			elif '2' == tag_level:
				model_name = self._level2_tag_model_name
			else:
				print('Error tag_level: ' + tag_level)
				sys.exit(1)

			self._fasttext = fasttext.supervised(train_file_name, model_name,
				label_prefix=self._label_prefix)

		except Exception as e:
			print('failed to train the fasttext model. ' + str(e))
			sys.exit(1)
Code Example #16
File: build_model.py Project: PeanutsGroup/nlp-demo
def train_model(tdata, model):
    classifier = fasttext.supervised(tdata + '.train', model)
    result = classifier.test(tdata + '.test')
    print('P@1:', result.precision)
    print('R@1:', result.recall)
    print('Number of examples:', result.nexamples)
    return result
Code Example #17
def train(data):

    x_test = data[2]
    y_test = data[3]
    clf = fasttext.supervised('data/train_ft.txt',
                              'model',
                              dim=256,
                              ws=5,
                              neg=5,
                              epoch=100,
                              min_count=10,
                              lr=0.1,
                              lr_update_rate=1000,
                              bucket=200000)

    # Use predict() to classify the test samples
    labels = clf.predict(x_test)

    y_preds = np.array(labels).flatten().astype(int)

    # Inspect the predictions
    print(len(y_test))
    print(y_test)
    print(len(y_preds))
    print(y_preds)

    # AUC score
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_preds, pos_label=1)
    print(metrics.auc(fpr, tpr))
Code Example #18
def train():
    classifier = fasttext.supervised('train.txt',
                                     'model',
                                     label_prefix='__label__')
    result = classifier.test('test.txt')
    print('Precision:', result.precision)
    print('Recall:', result.recall)
Code Example #19
File: ft.py Project: bobflagg/sentiment-analysis
    def fit(self, X, y):
        # Check that X and y have correct shape
        #X, y = check_X_y(X, y)
        # Store the classes seen during fit
        self.classes_ = unique_labels(y)

        self.X_ = X
        self.y_ = y
        
        input_file = self.store_training_data(X, y)
        self.output ='/tmp/fast-text-model-%s' % os.getpid()
        self.model = fasttext.supervised(
            input_file, 
            self.output, 
            dim=self.dim, 
            lr=self.lr, 
            epoch=self.epoch, 
            min_count=self.min_count, 
            word_ngrams=self.word_ngrams, 
            bucket=self.bucket, 
            thread=self.thread, 
            silent=self.silent, 
            label_prefix=self.label_prefix
        )
        # Clean up the temporary training data file:
        os.remove(input_file)
        # Return the classifier
        return self
Code Example #20
def char_main():
    fileout_tn = 'data/fasttest_train.txt'
    fileout_val = 'data/fasttest_val.txt'
    fileout_ts = 'data/fasttest_ts.txt'
    # convert_data(fileout_tn, fileout_val, fileout_ts)
    # print('convert data done.')
    classifier = fasttext.supervised(fileout_tn,
                                     'fasttextmodel',
                                     epoch=50,
                                     min_count=10,
                                     word_ngrams=4,
                                     minn=0,
                                     maxn=0,
                                     dim=300,
                                     ws=5,
                                     bucket=2000000)
    """
    0.9817
    epoch=25,min_count= 10, word_ngrams=4, minn=0, maxn=0,
                                     dim =500, ws=5,
                                     """
    result = classifier.test(fileout_val)
    # print('acc:', result.accuracy)
    print('P@1:', result.precision)
    print('R@1:', result.recall)
    print('Number of examples:', result.nexamples)
Code Example #21
File: fast_text.py Project: Lukasz1928/NLP
def classify(data, labels, test, train, validation):
    train_data = [k for k in data.keys() if k in train]
    train_labels = [labels[k] for k in train_data]
    train_data = [data[k] for k in train_data]

    test_data = [k for k in data.keys() if k in test]
    test_labels = [labels[k] for k in test_data]
    test_data = [data[k] for k in test_data]

    validation_data = [k for k in data.keys() if k in validation]
    validation_labels = [labels[k] for k in validation_data]
    validation_data = [data[k] for k in validation_data]

    save_training_file(train_data, train_labels)
    cls = fasttext.supervised('training.txt',
                              'model',
                              lr_update_rate=200000,
                              epoch=10,
                              lr=0.3)
    predicted = [int(x[0]) for x in cls.predict(validation_data)]
    remove_training_file()
    precision, recall, f1, _ = precision_recall_fscore_support(
        validation_labels, predicted, average='binary')
    return {
        'precision': float("{:.3f}".format(round(precision, 3))),
        'recall': float("{:.3f}".format(round(recall, 3))),
        'f1': float("{:.3f}".format(round(f1, 3)))
    }
Code Example #22
def url_classification_fasttext():
    url_cat, url_content = text_utility.get_documents(current_path="data", pattern="train*.xlsx")
    with open("fasttext_train_nocutall.txt", mode="w") as fd:
        format_data(url_cat, url_content, fd)
        fd.flush()

    classifier = fasttext.supervised(input_file="fasttext_train_nocutall.txt", output="fasttext_nocutall.model", label_prefix="__label__")
    # classifier = fasttext.load_model("fasttext.model.bin", label_prefix="__label__")
    print(len(classifier.labels))
    for class_name in classifier.labels:
        print(class_name)

    texts = list()
    with open("test.txt", mode="r") as fd:
        for line in fd:
            line = line.strip()
            segs = line.split(',')
            if len(segs) != 6:
                continue
            url, title, keywords, desc, a_content, p_content = segs
            content = " ".join([title, keywords, desc, a_content, p_content])
            word_vec = [word for word in jieba.cut(content, cut_all=False)]
            if len(word_vec) == 0:
                continue
            test_content = " ".join(word_vec)
            print(url, test_content)
            texts.append(test_content)
            # predict() expects a list of strings as input

    label_list = classifier.predict_proba(texts, len(classifier.labels))
    for label in label_list:
        for value in label:
            print(value[0], value[1])
Code Example #23
    def train(self):
        """
        Training routine.
        :return: the trained classifier
        """
        start_time = time.time()
        training_file = self.config.train_file
        save_path = self.config.save_model
        print('Start training model, training file: %s, saved model path: %s.' %
              (training_file, save_path),
              file=sys.stderr)

        classifier = fasttext.supervised(training_file,
                                         save_path,
                                         label_prefix='__label__',
                                         dim=100,
                                         word_ngrams=2,
                                         bucket=2000000,
                                         loss='softmax')

        end_time = time.time()
        print('Training over. cost %.2fs' % (end_time - start_time),
              file=sys.stderr)
        return classifier
Code Example #24
File: main.py Project: himanshukgp/fasttext
def main():
    trainDataPath = "/home/singh/Desktop/emocontext/starterkitdata/train.txt"
    testDataPath = "/home/singh/Desktop/emocontext/starterkitdata/devwithoutlabels.txt"
    solutionPath = "/home/singh/Desktop/emocontext/fast_text/fast_text/test3.txt"

    print("Processing training data...")
    #data_train = preprocessDatatrain(trainDataPath, mode="train")
    print("Processing test data...")
    #data_test = preprocessDatatest(testDataPath, mode="test")
    list1 = preprocessDatalist(testDataPath, mode="test")

    classifier = fasttext.supervised(
        input_file="fasttext_dataset_training.txt",
        output='model/model3',
        dim=300,
        lr=0.01,
        epoch=30)
    labels = classifier.predict(list1)
    #print(list1[:5], labels[:5])

    with io.open(solutionPath, "w", encoding="utf8") as fout:
        fout.write('\t'.join(["id", "turn1", "turn2", "turn3", "label"]) +
                   '\n')
        with io.open(testDataPath, encoding="utf8") as fin:
            fin.readline()
            for lineNum, line in enumerate(fin):
                fout.write('\t'.join(line.strip().split('\t')[:4]) + '\t')
                fout.write(label2emotion[int(labels[lineNum][0])] + '\n')
Code Example #25
    def train(self, train, test, model, **kwargs):

        params = {
            'dim': [100, 150],
            'lr': [0.1, 0.5, 1],
            'loss': ['ns', 'hs'],
            'ws': [5, 10]
        }
        for k, v in kwargs.items():
            params[k] = v
        keys, values = list(params.keys()), list(params.values())
        best, best_score = '', 0
        for p in product(*values):
            ps = {keys[i]: p[i] for i in range(len(keys))}
            clf = fasttext.supervised(
                train, model + '%s_%s_%s_%s' % (p[0], p[1], p[2], p[3]), **ps)
            result = clf.test(test)
            print('%s_%s_%s_%s' % (p[0], p[1], p[2], p[3]))
            print('Precision: %.2f%%' % (result.precision * 100))
            print('Recall Rate: %.2f%%\n' % (result.recall * 100))
            f1 = 2.0 * result.precision * result.recall / (result.precision + result.recall)
            if best_score < f1:
                best, best_score = '%s_%s_%s_%s' % (p[0], p[1], p[2], p[3]), f1
        print('%s\n%.2f' % (best, best_score))
Code Example #26
def train():
    classifier = fasttext.supervised('train80ft_mecab.txt',
                                     'classify_title',
                                     dim=100,
                                     epoch=50,
                                     bucket=200000,
                                     word_ngrams=2)
Code Example #27
def train_model(train_data_path, test_data_path, model_save_path):
    t1 = time.time()
    classifier = fasttext.supervised(train_data_path,
                                     model_save_path,
                                     label_prefix="__label__")
    t2 = time.time()
    print('train model over. it took {0:.2f}s'.format(t2 - t1))

    result = classifier.test(test_data_path)
    print("P@1:", result.precision)  # precision
    print("R@1:", result.recall)  # recall
    print("Number of examples:", result.nexamples)  # number of test examples

    # Predict on the test data
    y_true, y_pred = [], []
    with open(test_data_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            line = line.strip()
            parts = line.split(' , ')
            if len(parts) != 2:
                continue
            cls, txt = parts[0], parts[1]
            prediction = classifier.predict([txt])
            y_pred.append(prediction[0][0])
            y_true.append(cls.replace('__label__', '').strip())

    # Print the per-class classification report
    print(y_true[:10], y_pred[:10])
    classify_report = metrics.classification_report(y_true, y_pred)
    print(classify_report)
Code Example #28
 def train(self):
     classifier = fasttext.supervised('data/train_fasttext.txt',
                                      'save_model/fast_text/fasttext_Model',
                                      label_prefix='__label__')
     result = classifier.test('data/train_fasttext.txt')
     print("pre:" + str(result.precision))
     print("recall:" + str(result.recall))
Code Example #29
def execute():
    # Verify that mandatory arguments are present
    if "-i" not in args:
        return "ERROR: No input file was given"

    if "-t" not in args:
        return "ERROR: No model type was given"

    # Extract arguments
    train_file = args[args.index("-i")+1]
    model_type = args[args.index("-t")+1]

    # Extract optional arguments
    epoch = get_optional_param('--epoch', 5)
    ngrams = get_optional_param('--ngrams', 1)
    label_prefix = get_optional_param('--label', '__label__')

    # Create temporary file
    tmp, modelname = tempfile.mkstemp()

    # Use specified classifier with parameters and output model to the name of the temporary file
    if model_type == "supervised":
        classifier = fasttext.supervised(train_file, modelname, epoch=epoch, word_ngrams=ngrams, label_prefix=label_prefix)

    elif model_type == "skipgram":
        classifier = fasttext.skipgram(train_file, modelname, epoch=epoch, word_ngrams=ngrams, label_prefix=label_prefix)

    elif model_type == "cbow":
        classifier = fasttext.cbow(train_file, modelname, epoch=epoch, word_ngrams=ngrams, label_prefix=label_prefix)

    # Return the temporary file name
    return modelname
Code Example #30
def fast_cv(df):
    X = df['Discuss'].values
    y = df['Score'].values
    fast_pred = []
    folds = list(KFold(n_splits=5, shuffle=True, random_state=2018).split(X, y))
    for train_index, test_index in folds:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        train_file = fasttext_data(X_train, y_train)
        classifier = fasttext.supervised(train_file, '../input/model', lr=0.01, dim=128, label_prefix="__label__", encoding='utf-8-sig')
        result = classifier.predict_proba(df.loc[test_index, 'Discuss'].tolist(), k=5)
        print(result[0:100])
        pred = [[int(sco) * proba for sco, proba in result_i] for result_i in result]
        pred = [sum(pred_i) for pred_i in pred]
        print(pred[0:100])
        print(rmsel(y_test, pred))

        test_result = classifier.predict_proba(test_df['Discuss'].tolist(), k=5)
        fast_predi = [[int(sco) * proba for sco, proba in result_i] for result_i in test_result]
        fast_predi = [sum(pred_i) for pred_i in fast_predi]
        fast_pred.append(fast_predi)

    fast_pred = np.array(fast_pred)
    fast_pred = np.mean(fast_pred, axis=0)
    return fast_pred
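
fasttext_data above is defined elsewhere; a plausible sketch, assuming it writes each fold's training examples in the __label__<score> <text> format and returns the file path (the name, path, and signature are assumptions):

def fasttext_data(X, y, path='../input/fasttext_train.txt'):
    # Hypothetical helper: dump one "__label__<score> <text>" line per example.
    with open(path, 'w', encoding='utf-8') as f:
        for text, score in zip(X, y):
            f.write('__label__{} {}\n'.format(score, text))
    return path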
Code Example #31
 def classify(**kvargs):
     classifier = ft.supervised("news_fasttext_train.txt",
                                "news_fasttext.model", label_prefix="__label__")
     # classifier = ft.load_model(
     #     'news_fasttext.model.bin', label_prefix='__label__')
     news = News.objects.all()[:20]
     for new in news:
         text = new.n_abs
         seg_text = jieba.cut(text.replace("\t", " ").replace("\n", " "))
         outline = " ".join(seg_text)
         texts = [outline.encode("utf8")]
         labels = classifier.predict(texts)
         print text + ":" + labels[0][0]
Code Example #32
File: model.py Project: lyssym/nlp_base
    def train(cls, input_file, output, **kwargs):
        """
        Train the model.

        * input_file             training file path (required)
        * output                 output file path (required)
        * label_prefix           label prefix ['__label__']
        * lr                     learning rate [0.1]
        * lr_update_rate         change the rate of updates for the learning rate [100]
        * dim                    size of word vectors [100]
        * ws                     size of the context window [5]
        * epoch                  number of epochs [5]
        * min_count              minimal number of word occurrences [1]
        * neg                    number of negatives sampled [5]
        * word_ngrams            max length of word ngram [1]
        * loss                   loss function {ns, hs, softmax} [softmax]
        * bucket                 number of buckets [0]
        * minn                   min length of char ngram [0]
        * maxn                   max length of char ngram [0]
        * thread                 number of threads [12]
        * t                      sampling threshold [0.0001]
        * silent                 disable the log output from the C++ extension [1]
        * encoding               specify input_file encoding [utf-8]
        * pretrained_vectors     pretrained word vectors (.vec file) for supervised learning []
        """
        config = get_config()
        kwargs.setdefault('lr', config.get('model', 'lr'))
        kwargs.setdefault('lr_update_rate', config.get('model', 'lr_update_rate'))
        kwargs.setdefault('dim', config.get('model', 'dim'))
        kwargs.setdefault('ws', config.get('model', 'ws'))
        kwargs.setdefault('epoch', config.get('model', 'epoch'))
        kwargs.setdefault('word_ngrams', config.get('model', 'word_ngrams'))
        kwargs.setdefault('loss', config.get('model', 'loss'))
        kwargs.setdefault('bucket', config.get('model', 'bucket'))
        kwargs.setdefault('thread', config.get('model', 'thread'))
        kwargs.setdefault('silent', config.get('model', 'silent'))
        cls.__model = ft.supervised(input_file, output, **kwargs)
        return cls.__model
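
A minimal usage sketch for the classmethod above, assuming the enclosing class is named Model and that a __label__-formatted train file already exists (both names are assumptions):

# Explicit kwargs override the config-file defaults applied via setdefault().
model = Model.train('data/train.txt', 'models/classifier',
                    epoch=10, word_ngrams=2)
print(model.labels)  # labels discovered in the training file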
Code Example #33
def main():
    data_path = '/Users/ruizhang/Documents/NLP_dataset/'


    #############
    #
    ############
    # Load train set
    train_file = data_path +'dbpedia_csv/train.csv'
    df = pd.read_csv(train_file, header=None, names=['class', 'name', 'description'])

    # Load test set
    test_file = data_path + 'dbpedia_csv/test.csv'
    df_test = pd.read_csv(test_file, header=None, names=['class', 'name', 'description'])

    # Mapping from class number to class name
    class_dict = {
        1: 'Company',
        2: 'EducationalInstitution',
        3: 'Artist',
        4: 'Athlete',
        5: 'OfficeHolder',
        6: 'MeanOfTransportation',
        7: 'Building',
        8: 'NaturalPlace',
        9: 'Village',
        10: 'Animal',
        11: 'Plant',
        12: 'Album',
        13: 'Film',
        14: 'WrittenWork'
    }
    df['class_name'] = df['class'].map(class_dict)
    df.head()

    #############
    #
    ############
    desc = df.groupby('class')
    desc.describe().transpose()

    # Transform datasets
    df_train_clean = clean_dataset(df, True, False)
    df_test_clean = clean_dataset(df_test, False, False)

    # Write files to disk
    train_file_clean = data_path + 'dbpedia.train'
    df_train_clean.to_csv(train_file_clean, header=None, index=False, columns=['class', 'name', 'description'])

    test_file_clean = data_path + 'dbpedia.test'
    df_test_clean.to_csv(test_file_clean, header=None, index=False, columns=['class', 'name', 'description'])

    # Train a classifier
    output_file = data_path + 'dp_model'
    classifier = fasttext.supervised(train_file_clean, output_file, label_prefix='__label__')

    result = classifier.test(test_file_clean)
    print('P@1:', result.precision)
    print('R@1:', result.recall)
    print('Number of examples:', result.nexamples)

    sentence1 = ['Picasso was a famous painter born in Malaga, Spain. He revolutionized the art in the 20th century.']
    labels1 = classifier.predict(sentence1)
    class1 = int(labels1[0][0])
    print("Sentence: ", sentence1[0])
    print("Label: %d; label name: %s" % (class1, class_dict[class1]))

    sentence2 = ['One of my favourite tennis players in the world is Rafa Nadal.']
    labels2 = classifier.predict_proba(sentence2)
    class2, prob2 = labels2[0][0]  # it returns class2 as string
    print("Sentence: ", sentence2[0])
    print("Label: %s; label name: %s; certainty: %f" % (class2, class_dict[int(class2)], prob2))

    sentence3 = ['Say what one more time, I dare you, I double-dare you m**********r!']
    number_responses = 3
    labels3 = classifier.predict_proba(sentence3, k=number_responses)
    print("Sentence: ", sentence3[0])
    for l in range(number_responses):
        class3, prob3 = labels3[0][l]
        print("Label: %s; label name: %s; certainty: %f" % (class3, class_dict[int(class3)], prob3))

    # Load train set
    train_file = data_path + 'amazon_review_polarity_train.csv'
    df_sentiment_train = pd.read_csv(train_file, header=None, names=['class', 'name', 'description'])

    # Load test set
    test_file = data_path + 'amazon_review_polarity_test.csv'
    df_sentiment_test = pd.read_csv(test_file, header=None, names=['class', 'name', 'description'])

    # Transform datasets
    df_train_clean = clean_dataset(df_sentiment_train, True, False)
    df_test_clean = clean_dataset(df_sentiment_test, False, False)

    # Write files to disk
    train_file_clean = data_path + 'amazon.train'
    df_train_clean.to_csv(train_file_clean, header=None, index=False, columns=['class', 'name', 'description'])

    test_file_clean = data_path + 'amazon.test'
    df_test_clean.to_csv(test_file_clean, header=None, index=False, columns=['class', 'name', 'description'])

    dim = 10
    lr = 0.1
    epoch = 5
    min_count = 1
    word_ngrams = 2
    bucket = 10000000
    thread = 12
    label_prefix = '__label__'

    # Train a classifier
    output_file = data_path + 'amazon_model'
    classifier = fasttext.supervised(train_file_clean, output_file, dim=dim, lr=lr, epoch=epoch,
                                     min_count=min_count, word_ngrams=word_ngrams, bucket=bucket,
                                     thread=thread, label_prefix=label_prefix)

    # Evaluate classifier
    result = classifier.test(test_file_clean)
    print('P@1:', result.precision)
    print('R@1:', result.recall)
    print('Number of examples:', result.nexamples)

    class_dict = {
        1: "Negative",
        2: "Positive"
    }

    sentence1 = ["The product design is nice but it's working as expected"]
    labels1 = classifier.predict_proba(sentence1)
    class1, prob1 = labels1[0][0]  # it returns class as string
    print("Sentence: ", sentence1[0])
    # print("Label: %s; label name: %s; certainty: %f" % (class1, class_dict[int(class1)], prob1))

    sentence2 = ["I bought the product a month ago and it was working correctly. But now is not working great"]
    labels2 = classifier.predict_proba(sentence2)
    class2, prob2 = labels2[0][0]  # it returns class as string
    print("Sentence: ", sentence2[0])
    # print("Label: %s; label name: %s; certainty: %f" % (class2, class_dict[int(class2)], prob2))

    url = "https://twitter.com/miguelgfierro/status/805827479139192832"
    response = urlopen(url).read()
    title = str(response).split('<title>')[1].split('</title>')[0]
    print(title)

    # # Format tweet
    # tweet = unescape(title)
    # print(tweet)
    #
    # # Classify tweet
    # label_tweet = classifier.predict_proba([tweet])
    # class_tweet, prob_tweet = label_tweet[0][0]
    # print("Label: %s; label name: %s; certainty: %f" % (class_tweet, class_dict[int(class_tweet)], prob_tweet))


    wiki_dataset_original = data_path + 'enwik9'
    wiki_dataset = data_path + 'text9'
    if not os.path.isfile(wiki_dataset):
        os.system("perl wikifil.pl " + wiki_dataset_original + " > " + wiki_dataset)

    output_skipgram = data_path + 'skipgram'
    if os.path.isfile(output_skipgram + '.bin'):
        skipgram = fasttext.load_model(output_skipgram + '.bin')
    else:
        skipgram = fasttext.skipgram(wiki_dataset, output_skipgram, lr=0.02, dim=50, ws=5,
                                     epoch=1, min_count=5, neg=5, loss='ns', bucket=2000000, minn=3, maxn=6,
                                     thread=4, t=1e-4, lr_update_rate=100)
    print(np.asarray(skipgram['king']))

    print("Number of words in the model: ", len(skipgram.words))

    # Get the vector of some word
    Droyals = np.sqrt(pow(np.asarray(skipgram['king']) - np.asarray(skipgram['queen']), 2)).sum()
    print(Droyals)
    Dpeople = np.sqrt(pow(np.asarray(skipgram['king']) - np.asarray(skipgram['woman']), 2)).sum()
    print(Dpeople)
    Dpeople2 = np.sqrt(pow(np.asarray(skipgram['man']) - np.asarray(skipgram['woman']), 2)).sum()
    print(Dpeople2)

    print(len(skipgram.words))
    targets = ['man', 'woman', 'king', 'queen', 'brother', 'sister', 'father', 'mother', 'grandfather', 'grandmother',
               'cat', 'dog', 'bird', 'squirrel', 'horse', 'pig', 'dove', 'wolf', 'kitten', 'puppy']
    classes = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
               2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
    X_target = []
    for w in targets:
        X_target.append(skipgram[w])
    X_target = np.asarray(X_target)
    word_list = list(skipgram.words)[:10000]
    X_subset = []
    for w in word_list:
        X_subset.append(skipgram[w])
    X_subset = np.asarray(X_subset)
    X_target = np.concatenate((X_subset, X_target))
    print(X_target.shape)
    X_tsne = TSNE(n_components=2, perplexity=40, init='pca', method='exact',
                  random_state=0, n_iter=200, verbose=2).fit_transform(X_target)
    print(X_tsne.shape)
    X_tsne_target = X_tsne[-20:, :]
    print(X_tsne_target.shape)
    plot_words(X_tsne_target, targets, classes=classes)
    plot_words(X_tsne_target, targets, xlimits=[0.5, 0.7], ylimits=[-3.7, -3.6])
Code Example #34
File: test.py Project: cyy0523xc/code
train_file = '/home/alex/data/hsbianma_page10_sogou_huaxue.csv'
train_file_output = '/home/alex/data/hsbianma_page10_sogou_huaxue.fasttext.txt'


def format_train_file(csv_file, train_file):
    with open(csv_file) as r, open(train_file, 'w') as w:
        csv_r = csv.DictReader(r, fieldnames=('kind', 'content'))
        for row in csv_r:
            row['content'] = row['content'].strip()
            if "\n" in row['content']:
                continue

            words = jieba.cut(row['content'])
            w.write('__label__' + row['kind'][:4] + ' ' + ' '.join(words) + "\n")


# Format the training file
format_train_file(train_file, train_file_output)

# Train
classifier = fasttext.supervised(train_file_output, 'classify_model')

# Test
result = classifier.test(train_file_output)
print('P@1:', result.precision)
print('R@1:', result.recall)
print('Number of examples:', result.nexamples)

# Predict (top-3 labels)
classifier.predict([' '.join(jieba.cut("锦纶天丝面料"))], k=3)
Code Example #35
File: myfasttext.py Project: lzyang/python
import fasttext

# Train the model
classifier = fasttext.supervised("train.txt", "model", label_prefix="__label__", dim=100, epoch=2, word_ngrams=1, min_count=1, lr=0.1, bucket=200000)
Code Example #36
File: train.py Project: cyy0523xc/code
# -*- coding: utf-8 -*-
#
#
# Author: alex
# Created Time: Sunday, 2017-09-10 13:18:07
import csv
import jieba
import fasttext

train_file = 'fasttext.train.txt'
test_file = 'fasttext.test.txt'

# Train
classifier = fasttext.supervised(train_file, 'classify_model',
                                 lr=1.0,
                                 epoch=30,
                                 # word_ngrams=2,
                                 loss='hs'
                                 )

# Test on the training set
result = classifier.test(train_file)
print('P@1:', result.precision)
print('R@1:', result.recall)
print('Number of examples:', result.nexamples)

# Test on the held-out test set
result = classifier.test(test_file)
print('P@1:', result.precision)
print('R@1:', result.recall)
print('Number of examples:', result.nexamples)