def train_model(ipt=None, opt=None, model='', dim=128, epoch=5, lr=0.1, loss='softmax'):
    np.set_printoptions(suppress=True)
    if os.path.isfile(model):
        classifier = fasttext.load_model(model)
    else:
        classifier = fasttext.train_supervised(ipt, label='__label__', dim=dim, epoch=epoch,
                                               lr=lr, wordNgrams=2, loss=loss)
        classifier.save_model(opt)
    return classifier
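# A minimal usage sketch (the paths here are hypothetical; the input file must
# hold one "__label__<tag> text" example per line in fastText format):
clf = train_model(ipt='train.txt', opt='clf.bin', model='clf.bin')
print(clf.predict('some preprocessed text', k=2))  # (labels, probabilities)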
Example #2
def get_embeddings_with_timedeltas_per_block(data: defaultdict,
                                             model: fasttext.FastText) -> List:
    embeddings = []
    for logs in data.values():
        numpy_block = np.zeros(shape=(len(logs), model.get_dimension() + 1),
                               dtype=np.float32)

        for i, log in enumerate(logs):
            numpy_block[i, 1:] = model.get_sentence_vector(log.rstrip())
        numpy_block[:, 0] = get_timedeltas(logs)

        embeddings.append(numpy_block)
    return embeddings
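# The `data` argument is assumed to map a block id to its raw log lines
# (e.g. a defaultdict(list)); each returned block then has shape
# (n_logs, dim + 1) with the timedeltas in column 0. A hypothetical check,
# assuming `model` is a loaded fastText model and get_timedeltas is in scope:
example_data = defaultdict(list)
example_data['blk_1'].extend(['log line one\n', 'log line two\n'])
assert get_embeddings_with_timedeltas_per_block(example_data, model)[0].shape \
    == (2, model.get_dimension() + 1)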
 def get_model(self):
     """获取模型"""
     model_path = config.fasttext_vectorizer_path
     if os.path.exists(model_path):
         model = FastText.load_model(model_path)
     else:
         recall_corpus = config.recall_corpus
         model = FastText.train_unsupervised(input=recall_corpus,
                                             minCount=1,
                                             wordNgrams=2,
                                             epoch=20)
         model.save_model(model_path)
     return model
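# A quick sanity check on the learned vectors (`vectorizer` is a hypothetical
# instance of the enclosing class; get_nearest_neighbors is part of the
# fasttext Python API):
model = vectorizer.get_model()
print(model.get_nearest_neighbors('phone', k=5))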
Example #4
def load_pretrained(name, word_to_ix, embed_size=128):
    import fasttext.FastText as fasttext
    model = fasttext.load_model(os.path.join(MODELPATH, '{}.bin'.format(name)))
    embedding_matrix = np.zeros((len(word_to_ix), embed_size))
    for w, ix in word_to_ix.items():
        embedding_matrix[ix] = model[w]
    return embedding_matrix
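# The returned matrix can be dropped into a PyTorch embedding layer. Note that
# embed_size must equal the .bin model's dimension, or the row assignment above
# raises. A minimal sketch; the model name and word_to_ix are hypothetical:
import torch
import torch.nn as nn

matrix = load_pretrained('wiki_zh', word_to_ix, embed_size=128)
embedding = nn.Embedding.from_pretrained(torch.from_numpy(matrix).float(),
                                         freeze=False)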
def train():
    try:
        if not os.path.isfile(prepared_data):
            prepera_data(training_data_path, prepared_data)
        if not os.path.isfile(prepared_test):
            prepera_data(test_data_path, prepared_test)

        hyper_params = {"lr": 0.01,
            "epoch": 20,
            "wordNgrams": 2,
            "dim": 20}     

        # Train the model.
        model = fastText.train_supervised(input=prepared_data, **hyper_params)
        print("Model trained with the hyperparameter \n {}".format(hyper_params))

        # CHECK PERFORMANCE
        result = model.test(prepared_test)
        # DISPLAY ACCURACY OF TRAINED MODEL
        text_line = str(hyper_params) + ",accuracy:" + str(result[1]) + '\n' 
        print(text_line)

        model.save_model(os.path.join(model_path,model_name + ".ftz"))
    except Exception as ex:
        print('Exception' + str(ex))
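# Note: the ".ftz" extension conventionally denotes a *quantized* model, while
# save_model above writes full-precision weights regardless of the name. To
# actually compress the model, fastText's quantize() can be called first, e.g.:
#
#     model.quantize(input=prepared_data, retrain=True)
#     model.save_model(os.path.join(model_path, model_name + ".ftz"))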
    def train(self):
        classifier = ff.train_supervised(
            input=self.train_data_path,
            label="__label__",
            epoch=self.epoch,
            pretrainedVectors=self.pretrainedVectors,
            lr=self.lr,
            dim=self.dim)
        classifier.save_model(self.save_model_path)
        train_result = classifier.test(self.train_data_path)

        print("### TRAIN RESULT ###")
        print("Train Samples: {}".format(train_result[0]))
        print("Train Precision: {}".format(train_result[1]))
        print("Train Recall: {}\n\n".format(train_result[2]))

        if self.test_data_path:
            test_result = classifier.test(self.test_data_path)

            print("### TEST RESULT ###")
            print("Test Samples: {}".format(test_result[0]))
            print("Test Precision: {}".format(test_result[1]))
            print("Test Recall: {}\n\n".format(test_result[2]))

        print("model save to {}".format(self.save_model_path))
Example #7
 def __init__(self,
              save_model_path=None,
              train_data_path=None,
              test_data_path=None,
              type="train",
              k=1,
              threshold=0.0):
     """
     fasttext 文本分类
     :param train_data_path: 训练文本路径
     :param save_model_path: 模型保存路径
     :param test_data_path: 测试文本路径, 默认为 None
     :param type: 运行模式,“train/prediict”
     """
     self.train_data_path = train_data_path
     self.test_data_path = test_data_path
     self.save_model_path = save_model_path
     self.type = type
     self.k = k
     self.threshold = threshold
     if self.type == "predict":
         if not os.path.exists(self.save_model_path):
             print("MODEL: {} is not EXIST ....")
             sys.exit()
         print("LOAD MODEL FROM: {}".format(self.save_model_path))
         self.classifier = ff.load_model(self.save_model_path)
     else:
         self.classifier = None
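 # `k` and `threshold` above feed straight into fastText's predict(); a minimal
 # sketch of the matching predict method (not shown in this listing):
 def predict(self, text):
     # Return the top-k labels whose probability exceeds self.threshold.
     return self.classifier.predict(text, k=self.k, threshold=self.threshold)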
Example #8
def get_embeddings_per_log(data: defaultdict,
                           model: fasttext.FastText) -> np.ndarray:
    # create embeddings per log but at first remove '\n' (newline character) from the end
    embeddings = [
        model.get_sentence_vector(log.rstrip()) for logs in data.values()
        for log in logs
    ]
    return np.asarray(embeddings)
def train_model(ipt=None,
                opt=None,
                model='',
                dim=100,
                epoch=5,
                lr=0.1,
                loss='softmax'):
    np.set_printoptions(suppress=True)
    if os.path.isfile(model):
        classifier = fasttext.load_model(model)
    else:
        classifier = fasttext.train_supervised(ipt,
                                               label='__label__',
                                               dim=dim,
                                               epoch=epoch,
                                               lr=lr,
                                               wordNgrams=2,
                                               loss=loss)
        """
          训练一个监督模型, 返回一个模型对象

          @param input:           训练数据文件路径
          @param lr:              学习率
          @param dim:             向量维度
          @param ws:              cbow模型时使用
          @param epoch:           次数
          @param minCount:        词频阈值, 小于该值在初始化时会过滤掉
          @param minCountLabel:   类别阈值,类别小于该值初始化时会过滤掉
          @param minn:            构造subword时最小char个数
          @param maxn:            构造subword时最大char个数
          @param neg:             负采样
          @param wordNgrams:      n-gram个数
          @param loss:            损失函数类型, softmax, ns: 负采样, hs: 分层softmax
          @param bucket:          词扩充大小, [A, B]: A语料中包含的词向量, B不在语料中的词向量
          @param thread:          线程个数, 每个线程处理输入数据的一段, 0号线程负责loss输出
          @param lrUpdateRate:    学习率更新
          @param t:               负采样阈值
          @param label:           类别前缀
          @param verbose:         ??
          @param pretrainedVectors: 预训练的词向量文件路径, 如果word出现在文件夹中初始化不再随机
          @return model object
        """
        classifier.save_model(opt)
    return classifier
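# Beyond the aggregate tuple from test(), the fastText API also exposes
# per-label metrics via test_label(); e.g. (with a hypothetical test file):
#
#     per_label = train_model(ipt='train.txt', opt='clf.bin').test_label('test.txt')
#     for lbl, m in per_label.items():
#         print(lbl, m['precision'], m['recall'], m['f1score'])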
 def load_model(self):
     """
     load fast-text model
     :return:
     """
     if os.path.exists(self.model_path):
         self.classifier = FastText.load_model(self.model_path)
     else:
         print("no such model, train now")
         self.train_model()
 def __init__(self):
     jieba.enable_paddle()
     f = open("data/stop_words_utf-8.txt", mode='r',
              encoding='utf-8')
     self.stop_words = [line.strip() for line in f.readlines()]
     f.close()
     print("停用词加载成功")
     self.classifier = fasttext.load_model(
         'model/data_dim100_lr00.5_iter5.model')
     print("模型加载成功")
Example #12
def get_embeddings_per_block(data: defaultdict, model: fasttext.FastText,
                             with_timedelta: bool) -> List:
    # create embeddings per block but at first remove '\n' (newline character) from the end
    if with_timedelta:
        embeddings = get_embeddings_with_timedeltas_per_block(data, model)
    else:
        embeddings = [
            np.asarray(
                [model.get_sentence_vector(log.rstrip()) for log in logs])
            for logs in data.values()
        ]
    return embeddings
def fasttext_word_norm(token: str,
                       model: fasttext.FastText) -> Optional[np.ndarray]:
    # Divide the word vector by its L2 norm; only vectors with a positive
    # norm are kept for the sentence average.
    raw_word_vec = model.get_word_vector(token)
    norm = np.linalg.norm(raw_word_vec)

    if norm > 0:
        return raw_word_vec / norm
    else:
        # These Nones are dropped in get_fasttext_sentence_vector
        # so that we don't average over vectors with zero norm.
        return None
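# The helper referenced above is not shown in this listing; a minimal sketch of
# what it plausibly does, averaging the surviving unit vectors:
def get_fasttext_sentence_vector(tokens: List[str],
                                 model: fasttext.FastText) -> np.ndarray:
    # Normalize each token vector and drop the zero-norm ones (the Nones).
    vecs = [v for v in (fasttext_word_norm(t, model) for t in tokens)
            if v is not None]
    if not vecs:
        return np.zeros(model.get_dimension(), dtype=np.float32)
    return np.mean(vecs, axis=0)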
 def train_model(self):
     if not Path(self.train_data_path).exists():
         self.preprocessor.preprocess()
     classifier = FastText.train_supervised(input=self.train_data_path,
                                            lr=0.25,
                                            ws=4)
     classifier.save_model(self.model_path)
     self.classifier = classifier
     print("test result in training data:")
     result = classifier.test(self.train_data_path)
     print(result)
     print("test result in testing data:")
     result = classifier.test(self.test_data_path)
     print(result)
    def __init__(self,
                 save_model_path=None,
                 train_data_path=None,
                 test_data_path=None,
                 type="train",
                 k=1,
                 threshold=0.0,
                 epoch=5,
                 pretrainedVectors="",
                 label="__label__",
                 lr=0.1,
                 dim=100):
        """
        fasttext
        :param save_model_path: 模型保存路径
        :param train_data_path: 训练样本路径
        :param test_data_path: 测试样本路径
        :param type: 模式:“train/predict”
        :param k: 返回结果个数
        :param threshold: 阈值
        :param epoch: 训练轮数
        :param pretrainedVectors: 预训练词向量路径
        :param label: 标签前缀
        :param lr: 学习率
        :param dim: 词向量维度
        """
        self.train_data_path = train_data_path
        self.test_data_path = test_data_path
        self.save_model_path = save_model_path
        self.type = type
        self.k = k
        self.threshold = threshold
        self.epoch = epoch
        self.pretrainedVectors = pretrainedVectors
        self.label = label
        self.lr = lr
        self.dim = dim

        if self.type == "predict":
            if not os.path.exists(self.save_model_path):
                print("MODEL: {} is not EXIST ....")
                sys.exit()
            print("LOAD MODEL FROM: {}".format(self.save_model_path))
            self.classifier = ff.load_model(self.save_model_path)
        else:
            self.classifier = None
Example #16
    def load_pretrained_embedding_from_fasttext_cc_model(self, path, freeze):
        self.model = FastText.load_model(path)

        self.embedding_dim = self.model.get_dimension()
        self.words = [*self.model.words, NUM, PUNCTUATION]
        self.word2index = {word: i for i, word in enumerate(self.words)}
        self.unk_index = len(self.words)
        self.padding_index = len(self.words) + 1
        self.embeddings = nn.Embedding.from_pretrained(
            torch.cat([
                torch.tensor(self.model.get_input_matrix()),
                torch.rand(3, self.embedding_dim, dtype=torch.float).uniform_(
                    -math.sqrt(3 / self.embedding_dim),
                    math.sqrt(3 / self.embedding_dim)),
                torch.zeros(1, self.embedding_dim, dtype=torch.float)
            ]),
            padding_idx=self.padding_index,
            freeze=freeze)
Example #17
 def __init__(self, fasttextmodel: fasttext.FastText):
     super(FastTextTorch, self).__init__()
     self.fasttextmodel = fasttextmodel
     weights = torch.from_numpy(fasttextmodel.get_input_matrix())
     # Note: `vocab_size` is the size of the actual fastText vocabulary. In practice, the
     # embeddings here have two more tokens in their vocabulary: one for padding (embedding fixed
     # at 0, since the padding embedding never receives a gradient in `nn.Embedding`) and one for
     # the special (root) token, whose values are sampled across the vocabulary.
     self.vocab_size, self.embedding_size = weights.shape
     root_embedding = weights[
         torch.randint(high=self.vocab_size, size=(self.embedding_size, )),
         torch.arange(self.embedding_size), ].unsqueeze(0)
     # Row layout after the cat below: [0, vocab_size) are the fastText rows,
     # vocab_size is the all-zero padding row, vocab_size + 1 is the root row.
     weights = torch.cat((weights, torch.zeros(
         (1, self.embedding_size)), root_embedding),
                         dim=0).to(torch.float)
     weights.requires_grad = True
     # padding_idx must point at the zero row; freeze=False keeps the table
     # trainable (from_pretrained freezes by default, which would silently
     # override requires_grad above).
     self.embeddings = nn.Embedding.from_pretrained(
         weights, freeze=False, padding_idx=self.vocab_size)
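# Token ids for this table come from fastText's own subword hashing:
# get_subwords() returns the subword strings plus their row indices in the
# input matrix, and averaging those rows reproduces get_word_vector() up to
# float error. A hedged sketch (`ft_torch` is a hypothetical instance):
#
#     subwords, ids = fasttextmodel.get_subwords("unwordlike")
#     vec = ft_torch.embeddings(torch.from_numpy(ids)).mean(dim=0)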
Example #18
    def train(self):
        classifier = ff.train_supervised(self.train_data_path,
                                         label_prefix="__label__")
        classifier.save_model(self.save_model_path)
        train_result = classifier.test(self.train_data_path)

        print("### TRAIN RESULT ###")
        print("Train Samples: {}".format(train_result[0]))
        print("Train Precision: {}".format(train_result[1]))
        print("Train Recall: {}\n\n".format(train_result[2]))

        if self.test_data_path:
            test_result = classifier.test(self.test_data_path)

            print("### TEST RESULT ###")
            print("Test Samples: {}".format(test_result[0]))
            print("Test Precision: {}".format(test_result[1]))
            print("Test Recall: {}\n\n".format(test_result[2]))

        print("model save to {}".format(self.save_model_path))
Example #19
def question_classifier_test():
    """
    问题分类测试
    :return:
    """
    # 加载停用词表
    stop_words = load_stop_word_list("stopwords.txt")
    label_to_name = load_label_name_map()[0]
    classifier = ff.load_model("Model/model_w" + str(wordNgrams) + "_e" +
                               str(epoch))
    while True:
        input_ = input("question:")
        seg_line = jieba.cut(input_)
        add_str = ""
        for word in seg_line:
            if word not in stop_words:
                add_str += word + " "
        predict = classifier.predict(add_str.strip(), 3)
        print(predict)
        for label in predict[0]:
            print(label_to_name[label])
    def train_model(self):
        """
        train fast-text model
        :return:
        """
        classifier = FastText.train_supervised(input=self.train_data_path,
                                               lr=1,
                                               ws=4,
                                               loss="hs",
                                               epoch=25)
        # classifier = FastText.train_supervised(input=self.train_data_path, lr=1, wordNgrams=2,
        #                                        verbose=2, minCount=1, epoch=25, loss="hs")

        classifier.save_model(self.model_path)
        self.classifier = classifier
        print("test result in training data:")
        result = classifier.test(self.train_data_path)
        print(result)
        print("test result in testing data:")
        result = classifier.test(self.test_data_path)
        print(result)
Example #21
def fasttext_model_train():
    """
    fasttext模型训练
    :return:
    """
    # for i in range(20, 30):
    #     for w in range(1, 3):
    #         start_time = time.time()
    #         classifier = ff.train_supervised("fasttext.train", epoch=i, lr=0.5, wordNgrams=w)
    #         print("ngram=%d,训练第%d轮,用时%s" % (w, i, time.time() - start_time))
    #         classifier.save_model("Model/model_w" + str(w) + "_e" + str(i))

    start_time = time.time()
    classifier = ff.train_supervised("fasttext.train",
                                     epoch=epoch,
                                     lr=0.5,
                                     wordNgrams=wordNgrams)
    print("ngram=%d,训练第%d轮,用时%s" %
          (wordNgrams, epoch, time.time() - start_time))
    classifier.save_model("Model/model_w" + str(wordNgrams) + "_e" +
                          str(epoch))
import tensorflow as tf

dir_path = os.getcwd()

cla_model = os.path.join('model', 'data_dim100_lr05_iter5.model')

cws_model_path = os.path.join(dir_path, 'model',
                              'cws.model')  # word-segmentation model path; the model is named `cws.model`

pos_model_path = os.path.join(dir_path, 'model',
                              'pos.model')  # POS-tagging model path; the model is named `pos.model`

ner_model_path = os.path.join(dir_path, 'model',
                              'ckpt')  # entity-recognition model path; the model is named `ner.model`
Ner = evaluate_line(ner_model_path)  # load the entity-recognition model
classifier = fasttext.load_model(cla_model)  # load the text-classification model

from pyltp import Segmentor, Postagger


class PyltpTool:
    def __init__(self, sentence):
        self.sentence = sentence

        self.segmentor = Segmentor()  # initialize the instance
        self.segmentor.load(cws_model_path)  # load the model
        self.postagger = Postagger()
        self.postagger.load(pos_model_path)
        self.seg_sentences = []
        self.pos_tags = []
        self.ner_sentence = []
from fasttext import FastText
from text_preprocess import TextProcess
import json

model_path = "model/fasttext.bin"
model = FastText.load_model(model_path)

with open("label_dict.json", 'r') as load_f:
    label_dict = json.load(load_f)


def predict(text):
    pre_label = model.predict(TextProcess().word_preprocess(text))
    # str.strip('__label__') strips the characters `_labe` from both ends,
    # which only works here because the label ids are purely numeric.
    label_index = pre_label[0][0].strip().strip('__label__')
    label_name = list(label_dict.keys())[list(label_dict.values()).index(
        int(label_index))]

    return label_name


if __name__ == '__main__':
    text = input("Please input sentence: ")
    print("result:", predict(text))
Example #24
# Augmenting x_train and x_test with n-gram features
x_train = add_ngram(x_train, token_indice, ngram_range)
x_test = add_ngram(x_test, token_indice, ngram_range)

max_len = max(len(x) for x in x_train)
print(max_len)

print('Pad sequences...')
x_train = sequence.pad_sequences(x_train, maxlen=max_len, value=0)
x_test = sequence.pad_sequences(x_test, maxlen=max_len, value=0)

print('Build model...')
model = FastText(max_len,
                 embedding_dim,
                 batch_size=batch_size,
                 class_num=2,
                 max_features=max_features,
                 epochs=epochs)

print('Train...')
model.fit(x_train, x_test, y_train, y_test)

print('Test...')
result = model.predict(x_test)
result = np.argmax(np.array(result), axis=1)
y_test = np.argmax(np.array(y_test), axis=1)

print('f1:', f1_score(y_test, result, average='macro'))
print('accuracy:', accuracy_score(y_test, result))
print('classification report:\n', classification_report(y_test, result))
print('confusion matrix:\n', confusion_matrix(y_test, result))
Example #25
if Path.cwd().name == 'app':
    base_path = Path('.')
else:
    base_path = Path('app')

files_path = base_path / 'files/'

gnaf_addresses = pd.read_csv(files_path / 'gnaf_addresses.csv',
                             low_memory=False)
concat_address = pd.read_csv(files_path / 'address_clean.txt', header=None)[0]

mdl = FastText.train_unsupervised(input=str(files_path / 'address_clean.txt'),
                                  minCount=0,
                                  minn=0,
                                  maxn=3,
                                  dim=300,
                                  epoch=10,
                                  bucket=200000)
mdl.save_model(str(files_path / 'address_resolver.mdl'))

address_vecs = [
    mdl.get_sentence_vector(addr) for addr in tqdm(concat_address.values)
]
np.save(files_path / 'address_vecs.npy', address_vecs)

# Two example raw queries; the second assignment overwrites the first.
raw_address = '8mayfrd avehopevally'
raw_address = '55 curry st adelaid'
raw_address = raw_address.upper()
raw_address_vec = mdl.get_sentence_vector(raw_address)
distances = pairwise_distances([raw_address_vec], address_vecs)
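# The snippet stops at the distance matrix; picking the closest cleaned
# address is one argmin away:
best = int(np.argmin(distances[0]))
print(concat_address.iloc[best])  # nearest known address to the raw query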
Example #26
def train_embedding(fn):
    model = FastText.train_unsupervised(fn, model='skipgram', dim=300, maxn=0)
    model.save_model(
        'data/pretrained_embedding/fasttext_pretrained_embeddings_300.bin')

def cal_precision_and_recall(file):
    # NOTE: the opening of this function is missing from the source; the header,
    # counter dicts and prediction loop below are a reconstruction.
    precision = defaultdict(int)  # correct predictions per label
    total = defaultdict(int)      # predictions made per label
    recall = defaultdict(int)     # gold occurrences per label
    with open(file, 'r') as f:
        for line in f:
            content, label = line.strip().rsplit(' ', 1)  # assumed "text ... __label__x" lines
            recall[label.strip().strip('__label__')] += 1
            pre_label = model.predict(content)[0][0]
            total[pre_label.strip().strip('__label__')] += 1
            if label.strip() == pre_label.strip():
                precision[label.strip().strip('__label__')] += 1

    for sub in precision.keys():
        pre = precision[sub] / total[sub]
        rec = precision[sub] / recall[sub]
        F1 = (2 * pre * rec) / (pre + rec)

        label_name = list(label_dict.keys())[list(label_dict.values()).index(
            int(sub))]
        print("{}, precision:{}, recall:{}, F1:{}".format(
            str(label_name.strip('/')), str(pre), str(rec), str(F1)))


if __name__ == '__main__':
    model = FastText.train_supervised(input=train_data,
                                      label_prefix="__label__",
                                      epoch=15,
                                      dim=32,
                                      lr=0.5,
                                      loss='softmax',
                                      verbose=2,
                                      minCount=3,
                                      word_ngrams=2,
                                      bucket=1000000)

    model.save_model('model/fasttext.bin')
    score = model.test(test_data)
    cal_precision_and_recall(test_data)
 def train(self, file_path):
     """
     Train the classifier on file_path using the parameters in CONFIG.
     :return:
     """
     self.model = FastText.train_supervised(input=file_path, **CONFIG)
Example #29
import const
"""
with open(os.path.join(const.DATAPATH, 'corpus.txt'), 'r') as f:
    lines = []
    for line in f:
        lines.append(' '.join(line.strip().split('_')) + '\n')

with open(os.path.join(const.DATAPATH, 'corp.txt'), 'w') as f:
    f.writelines(lines)
"""
#fasttext.train_unsupervised('data.txt', model='cbow')

model = fasttext.train_unsupervised(os.path.join(const.DATAPATH, 'corp.txt'),
                                    model='skipgram',
                                    dim=128,
                                    epoch=10,
                                    ws=5,
                                    minCount=1,
                                    loss='hs',
                                    wordNgrams=2)
model.save_model(os.path.join(const.MODELPATH, 'skipgram.bin'))

model = fasttext.train_unsupervised(os.path.join(const.DATAPATH, 'corp.txt'),
                                    model='cbow',
                                    dim=128,
                                    epoch=10,
                                    ws=5,
                                    minCount=1,
                                    loss='hs',
                                    wordNgrams=2)
model.save_model(os.path.join(const.MODELPATH, 'cbow.bin'))
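# Either model can then be probed for neighbours to compare the two objectives
# ('example_word' stands in for a token from the corpus):
sg = fasttext.load_model(os.path.join(const.MODELPATH, 'skipgram.bin'))
cb = fasttext.load_model(os.path.join(const.MODELPATH, 'cbow.bin'))
print(sg.get_nearest_neighbors('example_word', k=5))
print(cb.get_nearest_neighbors('example_word', k=5))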
# with open('../data/train_data.txt','r',encoding='utf-8') as file ,open('train.txt','w',encoding='utf-8') as trainfile,\
#     open('text.txt','w',encoding='utf-8') as testfile:
#     for line in file.readlines():
#         label,content=line.strip().split('\t')[1],line.strip().split('\t')[0]
#         content=jieba.cut(content)
#         content=' '.join(content)
#         label="__label__"+label
#         if random.random()>0.2:
#             trainfile.write(content.strip()+"\t"+label+'\n')
#         else:
#             testfile.write(content.strip()+"\t"+label+'\n')



# Train on the supervised text file; by default the model would be saved in the
# current directory as "fasttext_test.model.bin". thread=3 would train with
# three threads (one thread if not given).
classifier = ff.train_supervised('train.txt', label='__label__', epoch=10)

# Evaluate on the validation set
result = classifier.test('test.txt')
# Print precision and recall
print(result[1], result[2])
# Predict the class of a text (a whitespace-tokenized string); k=3 returns the
# three most likely classes (only the top one without the argument)
result = classifier.predict("我 明天 去 北京 出差 支持 客户", k=3)
print(result)

# with open("test.txt","r",encoding="utf-8") as testfile:
#     for line in testfile.readlines():
#         content,label=line.strip().split("\t")
#         predit_label=classifier.predict(content)
#         print("预测值为:",predit_label,"实际值为:",label)