Example #1
def generate_fasttext(corpus, text_filepath, emb_path, cbow=False, min_count=2, minn=3, maxn=5, dim=200, epochs=5, lr=.1, neg=5, ws=5):

    # Create the output directories if they do not already exist
    os.makedirs(emb_path, exist_ok=True)
    os.makedirs(text_filepath, exist_ok=True)

    # Accept either raw strings or pre-tokenised lists of tokens
    if isinstance(corpus[0], list):
        corpus = [" ".join(i) for i in corpus]

    df = pd.DataFrame()
    df['text'] = corpus
    df.to_csv(os.path.join(text_filepath, 'file.txt'), header=False, index=False)

    model = fasttext.train_unsupervised(os.path.join(text_filepath, 'file.txt'),
                                        model='cbow' if cbow else 'skipgram',
                                        minCount=min_count, minn=minn, maxn=maxn, dim=dim,
                                        epoch=epochs, lr=lr, ws=ws, neg=neg)
    model.save_model(os.path.join(emb_path, 'ft.bin'))
    return model
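Once the binary has been written by generate_fasttext, it can be loaded back and queried. A minimal follow-up sketch, assuming the ft.bin name used above and a placeholder emb_path of "embeddings":

import os
import fasttext

# Load the saved vectors and query them (the path is a placeholder).
model = fasttext.load_model(os.path.join("embeddings", "ft.bin"))
vector = model.get_word_vector("example")            # subword vectors also cover OOV words
neighbours = model.get_nearest_neighbors("example")  # cosine-similarity neighbours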
Example #2
def learn_embeddings(mode, sentences, dimensions, window_size, workers, iter,
                     ind):
    """
    Jointly learn word-level and fact-level embeddings by optimizing the language model.

    :param ind: the index for each fact
    :param mode: the chosen language model
    :param sentences: the sequence sampled by node2vec
    :param dimensions: the number of dimensions
    :param window_size: the size of window in language model
    :param workers: the number of parallel threads
    :param iter: the number of epochs in SGD.
    :return: the word-level (model_W) and the fact-level (model_S) model
    """
    np.savetxt(sen_file_path, np.array(sentences), fmt="%s", newline="\n")

    if mode == "skipgram":
        print("                    +++Learning Word-level Embeddings++++")
        wm = ft.train_unsupervised(sen_file_path, model=mode, dim=dimensions)
        wm.save_model(word_model_path + "_" + mode + ".bin")

        print("                    +++Learning Fact-level Embeddings++++")
        sent = list(list(map(str, s)) for s in sentences)
        fm = Word2Vec(sent,
                      size=dimensions,
                      window=window_size,
                      min_count=0,
                      sg=1,
                      workers=workers,
                      iter=iter)
        fm.wv.save_word2vec_format(fact_embedding_path, binary=False)
        fm.save(fact_model_path + "_" + mode + ".bin")

        # Turn fact into corresponding nodes
        semantic_to_fact(ind, dimensions)

        return wm

    if mode == "cbow":
        print("                    +++Learning Word-level Embeddings++++")
        wm = ft.train_unsupervised(sen_file_path, model=mode, dim=dimensions)
        wm.save_model(word_model_path + "_" + mode + ".bin")

        print("                    +++Learning Fact-level Embeddings++++")
        sent = list(list(map(str, s)) for s in sentences)
        fm = Word2Vec(sent,
                      size=dimensions,
                      window=window_size,
                      min_count=0,
                      sg=0,
                      workers=workers,
                      iter=iter)
        fm.wv.save_word2vec_format(fact_embedding_path, binary=False)
        fm.save(fact_model_path + "_" + mode + ".bin")

        # Turn fact into corresponding nodes
        semantic_to_fact(ind, dimensions)

        return wm
Example #3
def train_fasttext_model():
    model = ft.train_unsupervised('./twitter_corpora/corpora.txt',
                                  model='skipgram',
                                  dim=45)
    model.save_model('./fasttext/sk_fasttext.bin')

    model = ft.train_unsupervised('./twitter_corpora/corpora.txt',
                                  model='cbow',
                                  dim=45)
    model.save_model('./fasttext/cbow_fasttext.bin')
Example #4
 def finetune_model(self, model_type, overwrite=False):
     """
     Method that trains an unsupervised fastText model on our dataset for the given
     video metadata type and stores it so that it can be used during the training of
     the Pseudoscience Classifier for extracting the embeddings from the input features
     :param model_type: 'video_snippet', 'video_tags', 'video_transcript', or 'video_comments'
     :param overwrite: whether to retrain and overwrite existing saved fastText model (if exists)
     :return:
     """
     # Create fastText input data filename
     fasttext_model_filename = '{0}/fasttext_model_{1}.bin'.format(
         self.FEATURE_ENGINEERING_MODELS_DIR, model_type)
     if not os.path.isfile(fasttext_model_filename) or overwrite:
         # Train unsupervised fastText model
         model = fasttext.train_unsupervised(
             input='{0}/{1}_train_data.txt'.format(self.DATA_DIR,
                                                   model_type),
             pretrainedVectors='wiki-news-300d-1M.vec',
             dim=300,
             minn=2,
             maxn=5,
             thread=multiprocessing.cpu_count() - 1,  # run on multiple cores
             verbose=2)
         # Save trained model
         model.save_model(fasttext_model_filename)
     return
Example #5
def embeddings_from_docs(
    in_path,
    out_path,
    fasttext_path=None,
    word_vec_dim=300,
    min_count=5,
    n_epoch=20,
    minn=3,
    maxn=5,
    lr=0.05,
):
    # Read in docs
    with open(in_path, "rb") as f:
        docs = pickle.load(f)

    # Write docs to temporary *.txt file for fasttext to train on
    with open("tmp.txt", "w", encoding="utf-8") as f:
        for doc in docs:
            f.write("\n".join(
                [" ".join([word for word in sen]) for sen in doc.sentences]))
            f.write("\n")  # keep consecutive documents on separate lines

    # Train word embeddings
    model = fasttext.train_unsupervised(
        "tmp.txt",
        dim=word_vec_dim,
        minCount=min_count,
        epoch=n_epoch,
        minn=minn,
        maxn=maxn,
        lr=lr,
    )

    model.save_model(out_path)
Example #6
    def train(self):
        """
        update the language model
        """
        self.model = None  # remove the old model (for saving memory)

        current_time = datetime.datetime.now()
        file_name = "fasttext_{hash_code}_{year}_{month}_{day}".format(hash_code=abs(hash(current_time)),
                                                                       year=current_time.year,
                                                                       month=current_time.month,
                                                                       day=current_time.day)
        tmp_path = os.path.join(self.tmp_dir, file_name)

        # make corpus
        logger.info("Starting to build corpus for training, tmp file: {}".format(tmp_path))
        with open(tmp_path, "w", encoding="utf-8") as f:
            for doc in self.db[self.collection].find({self.abstract_entry: {"$exists": True, "$ne": None}}):
                tokens = PreTokenize.tokenize(doc.get(self.abstract_entry, ""), True)
                if tokens:
                    f.write(" ".join(tokens)+"\n")

        logger.info("Training the model -- Arguments: {}".format(self.training_args))
        model = fasttext.train_unsupervised(input=tmp_path, **self.training_args)
        model.save_model(self.model_path)
        self.model = model  # load new model

        # delete the tmp file
        os.remove(tmp_path)
        logger.info("Successfully save the new model and remove tmp file")
        self.db.metadata.update_one(
            {"data": "last_word_embedding_trained"}, {"$set": {"datetime": datetime.datetime.now()}}
        )
Example #7
def train_word_vectors(input: WordVectorTrainingInput) -> WordVectorTrainingOutput:
    """Trains word vectors via [FastText](https://fasttext.cc) based on a provided text."""

    with NamedTemporaryFile(suffix=".txt", mode="w", encoding="utf-8") as f:
        f.write(input.text)
        f.seek(0)

        model = fasttext.train_unsupervised(
            f.name,
            model=input.model.value,
            lr=input.learning_rate,
            dim=input.dimension,
            epoch=input.epoch,
            minCount=input.min_count,
            loss=input.loss_function,
            thread=1,  # only train with one thread to not block other demos
        )

        with NamedTemporaryFile(suffix=".vec", mode="w+b") as vec_file:
            words = model.get_words()
            for word in words:
                vec_file.write(
                    str.encode(
                        word
                        + "".join(" " + str(vi) for vi in model.get_word_vector(word))
                        + "\n"
                    )
                )
            vec_file.seek(0)
            return WordVectorTrainingOutput(vector_file=vec_file.read())
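The returned bytes use the plain word2vec text layout without a header line. A hedged sketch of parsing them back into per-word numpy vectors; the input object below is a placeholder, not part of the original:

import numpy as np

# Hypothetical post-processing of WordVectorTrainingOutput.vector_file.
output = train_word_vectors(some_input)  # `some_input` stands in for a real WordVectorTrainingInput
vectors = {}
for line in output.vector_file.decode("utf-8").splitlines():
    parts = line.split(" ")
    if len(parts) > 1:
        vectors[parts[0]] = np.array(parts[1:], dtype=np.float32)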
Example #8
def create_model(texts):
    temp_file = "temp.txt"
    with open(temp_file, "w") as f:
        f.write(texts.str.cat(sep='\n'))
    model = fasttext.train_unsupervised(temp_file, minn=2, maxn=5, dim=100)
    os.remove(temp_file)
    return model
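A hedged usage sketch for create_model; the toy Series repeats one sentence so every token clears fastText's default minCount of 5:

import pandas as pd

# Hypothetical usage with a toy corpus (repeated so training does not fail on an empty vocabulary).
texts = pd.Series(["the quick brown fox jumps over the lazy dog"] * 200)
model = create_model(texts)
print(model.get_word_vector("fox")[:5])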
Example #9
def create_fasttext_embedding_matrix(
        file_path: str, vocab: typing.Dict[str, int],
        embedding_dim: int) -> np.ndarray:
    """Train a fasttext model and return the embeddings."""

    model_path = os.path.join(SHARED_PATH, 'embedding_models',
                              f'fasttext_model_dim_{embedding_dim}.bin')

    if os.path.exists(model_path):
        logger.info('Loading fasttext embeddings...')
        model = fasttext.load_model(model_path)
    else:
        logger.info('Training fasttext embeddings...')
        model = fasttext.train_unsupervised(file_path,
                                            model='skipgram',
                                            dim=embedding_dim)
        model.save_model(model_path)

    embedding_matrix = np.zeros((len(vocab), model.get_dimension()))
    for word in vocab.keys():
        idx = vocab[word]
        if word in model.words:
            embedding_matrix[idx] = model[word]
        else:
            pass  # If word embedding is unknown, vector of zeros

    return embedding_matrix
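If the matrix is meant to seed a trainable lookup table, one option is to hand it to a PyTorch embedding layer. A sketch under assumed inputs: the corpus file and vocabulary below are placeholders, and the module-level SHARED_PATH and logger the function relies on are presumed configured:

import torch
import torch.nn as nn

# Hypothetical follow-up: initialise an nn.Embedding from the fastText matrix.
vocab = {"the": 0, "cat": 1, "sat": 2}  # placeholder vocabulary
matrix = create_fasttext_embedding_matrix("corpus.txt", vocab, embedding_dim=100)
embedding_layer = nn.Embedding.from_pretrained(torch.from_numpy(matrix).float(), freeze=False)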
Example #10
def train_model():
    model = fasttext.train_unsupervised(input="wiki_cut_word.txt",
                                        model="skipgram",
                                        ws=6,
                                        minn=2,
                                        thread=12)
    model.save_model("fasttext.wiki.model.bin")
Example #11
def train_model(input_filename):
    model = fasttext.train_unsupervised(input_filename,
                                        model='skipgram',
                                        maxn=0,
                                        dim=100,
                                        ws=5)
    return model
Example #12
def train_facebook_fasttext_embedding(data,
                                      emb_nm,
                                      minn=3,
                                      maxn=6,
                                      dim=100,
                                      epoch=5,
                                      lr=0.05,
                                      thread=4,
                                      max_vocab_size=200000):
    # unsupervised training with custom parameters
    emb = fasttext.train_unsupervised(data,
                                      minn=minn,
                                      maxn=maxn,
                                      dim=dim,
                                      epoch=epoch,
                                      lr=lr,
                                      thread=thread)

    # we only select the vocab_size most frequent terms
    # TODO this should probably be emb.words = [:max_vocab_size]. Use Gensim to change format and reduce size
    # TODO ref: https://medium.com/@vasnetsov93/shrinking-fasttext-embeddings-so-that-it-fits-google-colab-cd59ab75959e
    # del emb.words[max_vocab_size:]

    # saving trained model
    emb.save_model(emb_nm)
Example #13
def create_fasttext_model(labels):
    """Runs Fastettext unsupervised to create a good model based on training set"""
    create_text_file(labels)
    model = fasttext.train_unsupervised('data_raw.txt',
                                        model='skipgram',
                                        dim=15)
    model.save_model("model_text_raw.bin")
Example #14
def get_model(model_path: str, train_data_path: str):
    try:
        model = fasttext.load_model(model_path)
    except ValueError:
        model = fasttext.train_unsupervised(train_data_path, model='skipgram')
        model.save_model(model_path)
    return model
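A brief usage sketch of the cache-or-train pattern above; both paths are placeholders:

# Hypothetical usage: reuse the cached model if present, otherwise train and cache it.
model = get_model("models/ft_skipgram.bin", "corpus.txt")
print(model.get_dimension())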
Example #15
def fasttext_train_unsupervised(bpe_file,
                                nwords,
                                outf,
                                dim=300,
                                minCount=1,
                                minn=10,
                                maxn=10):
    words = set()
    with open(bpe_file, 'r') as f:
        for line in f:
            for w in line.split():
                if w not in words:
                    words.add(w)
    print('training word vecs by fasttext')
    model = fasttext.train_unsupervised(bpe_file,
                                        dim=dim,
                                        minCount=minCount,
                                        minn=minn,
                                        maxn=maxn)
    print('OK! training finished')

    words_vec = np.zeros((nwords, dim))
    for i in range(nwords):
        if str(i) in words:
            words_vec[i, :] = model.get_word_vector(str(i))

    np.savetxt(outf, words_vec, delimiter=',')
    print('OK! model saved to %s ' % (outf))
Example #16
 def fit(self, config):
     if self.pretrained:
         path = hydra.utils.to_absolute_path(config.word.embedding)
         self.model = fasttext.load_model(path)
     else:
         path = hydra.utils.to_absolute_path(config.data.train_path)
         self.model = fasttext.train_unsupervised(path, dim=self.dimensions)
Example #17
def train_fasttext(hf_dataset, output_dir):
    """

    Run with: $ ./data_cli.py train_fasttext paperswithcode_aspects ./output

    :return:
    """

    tokens_fp = os.path.join(output_dir, 'tokens.txt')
    fasttext_bin_fp = os.path.join(output_dir, 'fasttext.bin')
    fasttext_w2v_fp = os.path.join(output_dir, 'fasttext.w2v.txt')

    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir='./data/nlp_cache',
                           split='docs')

    logger.info(f'Documents loaded: {len(docs_ds):,}')

    # Tokenized text
    doc_delimiter = '\n'
    token_delimiter = ' '
    tokens_count = 0

    with open(tokens_fp, 'w') as f:
        for doc in docs_ds:
            # Extract plain text
            text = doc['title'] + ': ' + doc['abstract']

            for token in gensim.utils.simple_preprocess(text,
                                                        min_len=2,
                                                        max_len=15):
                f.write(token + token_delimiter)
                tokens_count += 1
            f.write(doc_delimiter)

    logger.info(f'Total tokens: {tokens_count:,}')

    # Train actual fasttext model
    logger.info('Training fastText model...')

    model = fasttext.train_unsupervised(
        tokens_fp,
        model='skipgram',
        lr=0.05,  # learning rate [0.05]
        dim=300,  # size of word vectors [100]
        ws=5,  # size of the context window [5]
        epoch=5  # number of epochs [5]
        # thread            # number of threads [number of cpus]
    )
    model.save_model(fasttext_bin_fp)

    del model

    ft_model = FastText.load_fasttext_format(fasttext_bin_fp)
    ft_model.wv.save_word2vec_format(fasttext_w2v_fp)

    logger.info(f'Output saved to: {fasttext_w2v_fp}')

    logger.info('Done')
Example #18
def train_w2v(sentences, model='skipgram', dim=200, min_count=20, lr=0.015, ws=7, minn=3, maxn=6, epoch=20):
    """train word2vec ( via ``fasttext.unsupervised`` ).

    Args:
        sentences (list-like): list of raw sentences.
        model (str): model name (options are: 'skipgram' and 'cbow').
        dim (int): embedding size. default is 200.
        min_count (int): filter words with less than ``min_count`` occurrences.
        lr (float): learning rate.
        ws (int): window-size.
        minn (int): subword min length (default: 3-char).
        maxn (int): subword max length (default: 6-char).
        epoch (int): num of training epochs.

    Returns:
        ``fasttext.FastText._FastText``
    """
    with tempfile.NamedTemporaryFile(mode='w', prefix='corpus-', suffix='.txt') as f:
        for raw_sentence in sentences:
            f.write(raw_sentence)
            f.write('\n')

        f.flush()  # make sure the corpus is on disk before fastText reads it
        return fasttext.train_unsupervised(input=f.name,
                                           model=model,
                                           dim=dim,
                                           minCount=min_count,
                                           lr=lr,
                                           epoch=epoch,
                                           ws=ws,
                                           minn=minn,
                                           maxn=maxn)
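A hedged usage sketch for train_w2v; the toy corpus repeats one sentence so the vocabulary survives min_count filtering:

# Hypothetical usage with an in-memory toy corpus.
sentences = ["the quick brown fox jumps over the lazy dog"] * 200
model = train_w2v(sentences, model='cbow', dim=50, min_count=1, epoch=2)
print(model.get_nearest_neighbors("fox"))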
Example #19
	def format_data_BRAND(self,blog_file,data_file,data_vec_file):
		
		data = load_data(data_file)
		n=data['y_h'].shape[0]
		# print n
		# return 
		# only_txt = {i:data[i]['txt'] for i in data['data'].keys()}
		# self.dict_to_txt(only_txt, blog_file)		
		model = fasttext.train_unsupervised(blog_file, model='skipgram')
		data_vec={'y':np.zeros(n),'c':np.zeros(n)}
		x=[]
		for tid,y_h in zip(data['data'].keys(),data['y_h']):
			blog = data['data'][tid]['txt'].replace('\n', ' ').decode('utf-8')
			print(blog)
			print('****************************************************')
			x.append(model.get_sentence_vector(blog).flatten() )
			# print y_h
			data_vec['y'][int(tid)] = np.mean(y_h)
			data_vec['c'][int(tid)] = np.mean((y_h-np.mean(y_h))**2)*0.01
		# return 
		plt.plot(data_vec['y'],label='y')
		plt.plot(data_vec['c'],label='c')
		plt.legend()
		plt.show()
		data_vec['x']=np.array(x)

		save(data_vec, data_vec_file)
Example #20
def get_parameter_value_with_results(i, param, param_values, params_wordembeddings, params_training, tune, X_test, y_test):
    print(str(i))
    model_name = "test_" + param + "_" + str(i)
    # bin_path = "word_vectors/fasttext/" + model_name + ".bin" 
    vec_path = "word_vectors/fasttext/" + model_name + ".vec" 
    if tune == "wordembeddings": ####### tuning parameter for fasttext WORD EMBEDDING
        params_wordembeddings[param] = param_values[i]
    embeddings = fasttext.train_unsupervised(input='data.txt', model='skipgram', **params_wordembeddings) 
    # embeddings.save_model(bin_path)
    # embeddings = load_model(bin_path)
    ### convert from fasttext embeddings (would be saved as .bin) to .vec,
    #   in order to use the embeddings .vec file as pretrainedVectors for fasttext text classification
    from_bin_to_vec(embeddings, vec_path)
    if tune == "training": ####### tuning parameter for fasttext TRAINING
        params_training[param] = param_values[i]
    # dimension of embeddings has to fit with dimension of look-up table (embeddings) in training model
    params_training["dim"] = embeddings.get_dimension()
    trained_model = fasttext.train_supervised(input=train_file, pretrainedVectors= vec_path, **params_training)
    ### find and apply optimal (threshold) cutoff point
    # get scores, i.e. list of probabilities for being labeled positive on set X_test
    y_scores = get_prediction_scores(trained_model,X_test)
    # find optimal probability threshold
    opt_threshold = find_optimal_cutoff(y_test, y_scores)
    # apply optimal threshold to the prediction probability and get label predictions
    y_pred = get_predictions(opt_threshold, y_scores) 
    ################## Evaluation
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    auc = metrics.roc_auc_score(y_test, y_pred)
    auprc = metrics.average_precision_score(y_test, y_pred)
    return [accuracy, precision, recall, auc, auprc]
Example #21
def train_unsupervised(args):

    # https://fasttext.cc/docs/en/unsupervised-tutorial.html
    model = fasttext.train_unsupervised(args.input,
                                        lr=args.lr,
                                        minCount=args.min_count,
                                        epoch=args.epoch,
                                        minn=args.minn,
                                        maxn=args.maxn,
                                        dim=args.dim,
                                        ws=args.ws)

    if not os.path.isdir(args.output_dir):
        print(f'Creating output directory: {args.output_dir}')
        os.makedirs(args.output_dir)

    model_fname = os.path.join(args.output_dir, 'model.bin')
    print(f'Saving model to: {model_fname}')
    model.save_model(model_fname)

    vec_fname = os.path.join(args.output_dir, f'word-vectors-{args.dim}d.txt')
    print(f'Saving word vectors to: {vec_fname}')
    bin_to_vec(model, vec_fname)

    count_fname = os.path.join(args.output_dir, f'word-counts.txt')
    print(f'Saving word count to: {count_fname}')
    bin_to_word_count(model, count_fname)
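train_unsupervised expects an argparse-style namespace; a minimal wiring sketch in which the flag names and defaults are assumptions chosen to mirror the attributes used above:

import argparse

# Hypothetical CLI wiring for train_unsupervised(args).
parser = argparse.ArgumentParser()
parser.add_argument('--input', required=True)
parser.add_argument('--output_dir', required=True)
parser.add_argument('--lr', type=float, default=0.05)
parser.add_argument('--min_count', type=int, default=5)
parser.add_argument('--epoch', type=int, default=5)
parser.add_argument('--minn', type=int, default=3)
parser.add_argument('--maxn', type=int, default=6)
parser.add_argument('--dim', type=int, default=100)
parser.add_argument('--ws', type=int, default=5)
train_unsupervised(parser.parse_args())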
Example #22
def train_fasttext(data_dir='./data',
                   dim=300,
                   epoch=5,
                   ft_model='skipgram',
                   ft_lr=0.05,
                   ft_window=5):

    data_dir = Path(data_dir)

    import fasttext

    model = fasttext.train_unsupervised(
        str(data_dir / 'ocb_and_wikisource.w2v_tokens.txt'),
        model=ft_model,
        lr=ft_lr,  # learning rate [0.05]
        dim=dim,  # size of word vectors [100]
        ws=ft_window,  # size of the context window [5]
        epoch=epoch  # number of epochs [5]
        # thread            # number of threads [number of cpus]
    )
    model.save_model(str(data_dir / 'ocb_and_wikisource.fasttext.bin'))

    from gensim.models.wrappers import FastText

    ft_model = FastText.load_fasttext_format(
        str(data_dir / 'ocb_and_wikisource.fasttext.bin'))

    ft_model.wv.save_word2vec_format(data_dir /
                                     'ocb_and_wikisource.fasttext.w2v.txt')

    logger.info('done')
Example #23
    def w2v_train(self, documents_input, w2v_model_output):  # pre-train word vectors and save the model
        print(
            time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) +
            ' : create word-segment without label txt')
        documents_cut = 'cache/msg_seg_without_label.txt'
        self.DP.file_cut_words(documents_input, documents_cut, mode='vec')

        print(
            time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) +
            ' : w2v train start')
        # Train skipgram word vectors and write the result to w2v_model_output: lr = learning rate, dim = dimensions, min_count = minimum word frequency
        model = fasttext.train_unsupervised(documents_cut,
                                            model='skipgram',
                                            lr=0.05,
                                            dim=self.dim,
                                            loss=self.loss,
                                            word_ngrams=self.word_ngrams,
                                            min_count=self.min_count)
        model.save_model(w2v_model_output)

        # os.remove(documents_cut)
        print(
            time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) +
            ' : w2v train done')

        return model
Example #24
    def test_as_array_produces_token_array() -> None:

        with tempfile.TemporaryDirectory() as tempdir:
            dataset_filename = f"{tempdir}/dataset.txt"
            pretrained_filename = f"{tempdir}/fasttext.model"

            with open(dataset_filename, "w") as fp:
                fp.write("\n".join(
                    ["this is a first sentence", "this is a second sentence"]))

            model = fasttext.train_unsupervised(
                dataset_filename,
                model="skipgram",
                dim=10,
                minCount=1,
            )
            model.save_model(pretrained_filename)

            indexer = FastTextTokenIndexer(
                pretrained_filename=pretrained_filename)
            tokens = [
                Token(word) for word in "this is a test sentence".split()
            ]
            field = TextField(tokens, token_indexers={"tokens": indexer})

            vocab = Vocabulary()
            field.index(vocab)

            array_dict = indexer.tokens_to_indices(tokens, vocab)
            assert len(array_dict["tokens"]) == 5
            assert len(array_dict["tokens"][0]) == 10
Example #25
 def generate_embedding(self):
     classifier = fasttext.train_unsupervised(input=self.train_file,
                                              dim=self.vec_dim,
                                              epoch=self.epoch,
                                              minCount=10,
                                              thread=10)
     return self.get_res(classifier)
Example #26
 def train_fasttext(self, data, model_name, epoch):
     if self.is_train:
         model = fasttext.train_unsupervised(data,
                                             model='skipgram',
                                             minCount=1,
                                             epoch=epoch)
         model.save_model(model_name)
Example #27
    def build(data, size, mincount, path):
        """
        Builds fastText vectors from a file.

        Args:
            data: path to input data file
            size: number of vector dimensions
            mincount: minimum number of occurrences required to register a token
            path: path to output file
        """

        # Train on data file using largest dimension size
        model = fasttext.train_unsupervised(data, dim=size, minCount=mincount)

        # Output file path
        print("Building %d dimension model" % size)

        # Output vectors in vec/txt format
        with open(path + ".txt", "w") as output:
            words = model.get_words()
            output.write("%d %d\n" % (len(words), model.get_dimension()))

            for word in words:
                # Skip end of line token
                if word != "</s>":
                    vector = model.get_word_vector(word)
                    data = ""
                    for v in vector:
                        data += " " + str(v)

                    output.write(word + data + "\n")

        # Build magnitude vectors database
        print("Converting vectors to magnitude format")
        converter.convert(path + ".txt", path + ".magnitude", subword=True)
Example #28
def generate_fasttext_skipgram(data_file, train_iter, emb_size, output_file):
    model = fasttext.train_unsupervised(input=data_file, model='skipgram', dim=emb_size, minCount=5, verbose=2,
                                        thread=8)
    model_output = output_file.replace(".txt", ".bin")
    text_output = output_file
    model.save_model(model_output)
    fasttext_to_text.export_to_file(model_output, text_output)
Example #29
 def __generate_embeddings(self, file_path):
     self.printer.print('generating fasttext term embeddings')
     tmp_file = os.path.join(self.args.local_dir, 'tmp')
     with open(tmp_file, 'w', encoding='utf8') as f_out:
         with open(os.path.join(self.args.local_dir,
                                self.args.file_in_qs_train),
                   'rt',
                   encoding='utf8') as f_in:
             reader = csv.reader(f_in, delimiter='\t')
             for [_, q] in reader:
                 f_out.write(q)
                 f_out.write('\n')
         with open(os.path.join(self.args.local_dir,
                                self.args.file_in_docs),
                   'rt',
                   encoding='utf8') as f_in:
             reader = csv.reader(f_in, delimiter='\t')
             for row in reader:
                 f_out.write('\n'.join(row[1:]))
                 f_out.write('\n')
     self.printer.print('training fasttext term embeddings')
     embeddings = fasttext.train_unsupervised(
         tmp_file,
         model='skipgram',
         dim=self.args.num_hidden_nodes // 2,
         bucket=10000,
         minCount=100,
         minn=1,
         maxn=0,
         ws=10,
         epoch=5)
     embeddings.save_model(file_path)
     os.remove(tmp_file)
Example #30
def train_fasttext(corpus, cut_func, vocabulary, embedding_dim=300):
    corpus = [' '.join(cut_func(sentence)) for sentence in corpus]
    corpus_file_path = 'fasttext_tmp_corpus.txt'
    with open(corpus_file_path, 'w', encoding='utf8') as writer:
        for sentence in corpus:
            writer.write(sentence + '\n')

    model = train_unsupervised(input=corpus_file_path,
                               model='skipgram',
                               epoch=10,
                               minCount=1,
                               wordNgrams=3,
                               dim=embedding_dim)

    model_vocab = model.get_words()

    emb = np.zeros(shape=(len(vocabulary) + 1, embedding_dim), dtype='float32')
    nb_unk = 0
    for w, i in vocabulary.items():
        if w not in model_vocab:
            nb_unk += 1
            emb[i, :] = np.random.normal(0, 0.05, embedding_dim)
        else:
            emb[i, :] = model.get_word_vector(w)
    print(
        'Logging Info - Fasttext Embedding matrix created: {}, unknown tokens: {}'
        .format(emb.shape, nb_unk))
    os.remove(corpus_file_path)
    return emb