Example #1
    def fit(self, X, y='ignored'):
        if isinstance(X, pd.DataFrame):
            X = X[self.token_column].values

        if self.model is None:
            self.model = FastText(sentences=X, size=self.size, window=self.window, min_count=self.min_count,
                                  iter=self.iter, min_n=self.min_n, max_n=self.max_n, word_ngrams=self.word_ngrams,
                                  workers=self.workers)
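
Note: the keyword names above (size, iter) follow the pre-4.0 gensim API. A minimal, hedged sketch of the equivalent call under gensim >= 4.0, using gensim's bundled common_texts toy corpus (corpus and parameter values here are illustrative, not from the original project):

# gensim >= 4.0 renamed size -> vector_size and iter -> epochs.
from gensim.models import FastText
from gensim.test.utils import common_texts  # tiny toy corpus shipped with gensim

model = FastText(sentences=common_texts, vector_size=32, window=5,
                 min_count=1, epochs=5, min_n=3, max_n=6, workers=2)
print(model.wv["human"][:5])  # subword model: vectors exist even for rare or unseen words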
Example #2
    def check_txt_embeddor(self, txt_embeddor):
        if txt_embeddor == "fasttext":
            txt_embeddor = FastText(size=self.size_embeddor, window=5, min_count=1, sample=1e-3, sg=1, workers=1)

        if isinstance(txt_embeddor, FastText):
            return txt_embeddor
        else:
            raise ValueError('Wrong txt_embeddor parameter: must be "fasttext" or an instance of gensim.models.FastText')
Example #3
def make_fasttext(target_dataset):

	corpus_path = os.path.join(CONFIG.DATASET_PATH, target_dataset, "corpus.txt")
	sentences = word2vec.LineSentence(corpus_path) 
	dimension_size = 300
	print("embedding started")
	embedding_model = FastText(size=dimension_size, window=6, min_count=5, workers=4, sg=1)  # skip-gram
	embedding_model.build_vocab(sentences=sentences)
	embedding_model.train(sentences=sentences, total_examples=embedding_model.corpus_count, epochs=10)
	model_name = "FASTTEXT_"+ target_dataset + ".model"
	#pad_value = np.finfo(np.float32).eps
	pad_value = 1.
	embedding_model.wv.add("<PAD>", np.full(embedding_model.vector_size, pad_value), replace=True)
	embedding_model.wv.init_sims(replace=True)
	embedding_model.wv.save(os.path.join(CONFIG.EMBEDDING_PATH, model_name))
	print("embedding completed")
Example #4
    def embedding(self, x, size=100, window=5, min_count=5):
        # Train a FastText model on the tokenized sentences in x; the embedding
        # dimension comes from self.embed_size, window and min_count from the arguments.
        model = FastText(sentences=x,
                         size=self.embed_size,
                         window=window,
                         min_count=min_count,
                         seed=self.seed)

        return model
Example #5
    def __init__(self,
                 use_type,
                 window=None,
                 size=None,
                 sample=None,
                 input_type=None,
                 alpha=None,
                 distance_type=None,
                 train_path=None,
                 output_path=None,
                 pretrained_model_path=None,
                 validation_path=None,
                 logging=logging.ERROR,
                 batch_size=500,
                 sim_threshold=0.9,
                 jec_threshold=0.5,
                 epoch_train=30,
                 workers=-1,
                 min_count=10,
                 split_validation=0):
        super().__init__(use_type=use_type,
                         train_path=train_path,
                         output_path=output_path,
                         validation_path=validation_path,
                         log_level=logging,
                         distance_type=distance_type,
                         batch_size=batch_size,
                         input_type=input_type,
                         alpha=alpha,
                         sample=sample,
                         size=size,
                         window=window,
                         sim_threshold=sim_threshold,
                         jec_threshold=jec_threshold,
                         epoch_train=epoch_train,
                         min_count=min_count,
                         split_validation=split_validation)

        if use_type != 'validation':
            if pretrained_model_path:
                self._model = FastText.load(pretrained_model_path)
                # Word Mover's distance
                if distance_type == 'wm':
                    self._model.init_sims(replace=True)
            else:
                if workers == -1:
                    workers = self._cores - 1
                self._model = FastText(
                    min_count=self._min_count,
                    window=self._window,
                    size=self._size,
                    sample=self._sample,
                    alpha=self._alpha,
                    min_alpha=self._min_alpha,
                    negative=self._negative,
                    sg=0,  # CBOW
                    min_n=2,  # minimum ngram
                    workers=workers)
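
The 'wm' branch above L2-normalizes the vectors in preparation for Word Mover's distance. A hedged, standalone sketch of that usage under the pre-4.0 gensim API (requires the pyemd package on gensim 3.x; corpus and query words are illustrative):

# Word Mover's distance between two token lists with a FastText model.
from gensim.models import FastText
from gensim.test.utils import common_texts

model = FastText(sentences=common_texts, size=32, min_count=1, iter=5)
model.init_sims(replace=True)  # normalize vectors in place, as in the 'wm' branch
print(model.wv.wmdistance(["human", "interface"], ["computer", "system"]))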
Example #6
    def train(self,
              data: List[List[str]],
              save_path: str,
              save_name: str,
              iterations: int = 5,
              window_size: int = 5,
              min_count: int = 3,
              hs: int = 0) -> None:
        """
        Train a new FastText model.

        :param data: The training data
        :type data: list(list(str))
        :param save_path: Directory path to save the model
        :type save_path: str
        :param save_name: Name of the model (Should be <name>.joblib)
        :type save_name: str
        :param iterations: Number of training epochs
        :type iterations: int
        :param window_size: Window size used by the model
        :type window_size: int
        :param min_count: Minimum frequency a word must have to be included
        :type min_count: int
        :param hs: Use hierarchical softmax if 1, negative sampling if 0
        :type hs: int

        :return: None
        """
        if os.path.exists(save_path):
            raise IOError(
                "Save path already exists. Please specify another one to avoid overwriting."
            )
        else:
            os.mkdir(save_path)

        self.config.update({
            "path": save_path,
            "name": save_name,
            "epochs": iterations,
            "window size": window_size,
            "min count": min_count,
            "hierarchical softmax": hs
        })

        model = FastText(data,
                         size=self.dimensions,
                         window=window_size,
                         min_count=min_count,
                         iter=iterations,
                         hs=hs,
                         workers=3,
                         sg=1)

        try:
            dump(model, os.path.join(save_path, save_name))
        except Exception:
            tf.logging.warning("model could not be saved!")
        return None
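
The trained model above is persisted with dump(), presumably joblib.dump given the <name>.joblib convention in the docstring. A hedged round-trip sketch under that assumption (file name and corpus are illustrative):

# Save and reload a gensim FastText model with joblib.
from joblib import dump, load
from gensim.models import FastText
from gensim.test.utils import common_texts

model = FastText(common_texts, size=16, min_count=1, iter=2)  # vector_size=/epochs= on gensim >= 4.0
dump(model, "fasttext_demo.joblib")                           # hypothetical file name
reloaded = load("fasttext_demo.joblib")
print(reloaded.wv.most_similar("computer", topn=3))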
Example #7
def building_word_vector_model(option, sentences, embed_dim, workers, window,
                               y_train):
    """
        Builds the word vector

        Args:
            type = {bool} 0 for Word2vec. 1 for gensim Fastext. 2 for Fasttext 2018.
            sentences = {list} list of tokenized words
            embed_dim = {int} embedding dimension of the word vectors
            workers = {int} no. of worker threads to train the model (faster training with multicore machines)
            window = {int} max distance between current and predicted word
            y_train = y_train

        Returns:
            model = Word2vec/Gensim fastText/ Fastext_2018 model trained on the training corpus


    """
    if option == 0:
        print("Training a word2vec model")
        model = Word2Vec(sentences=sentences,
                         size=embed_dim,
                         workers=workers,
                         window=window)
        print("Training complete")

    elif option == 1:
        print("Training a Gensim FastText model")
        model = FastText(sentences=sentences,
                         size=embed_dim,
                         workers=workers,
                         window=window)
        print("Training complete")

    elif option == 2:
        print("Training a Fasttext model from Facebook Research")
        y_train = [
            "__label__positive" if i == 1 else "__label__negative"
            for i in y_train
        ]

        with open("imdb_train.txt", "w") as text_file:
            for i in range(len(sentences)):
                print(sentences[i], y_train[i], file=text_file)

        model = fasttext.train_unsupervised("imdb_train.txt",
                                            model='skipgram',
                                            lr=0.05,
                                            dim=100,
                                            ws=5,
                                            epoch=15)
        # fasttext.skipgram("imdb_train.txt","model_ft_2018_imdb",dim = embed_dim)
        print("Training complete")

        # fasttext.train_unsupervised("train.txt", model='skipgram', lr=0.05, dim=100, ws=5, epoch=5)
        # model.save_model("model_file.bin")

    else:
        raise ValueError("option must be 0 (Word2Vec), 1 (gensim FastText) or 2 (Facebook fastText)")

    return model
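
For the option == 2 path the returned object is a Facebook fastText model rather than a gensim one. A hedged, self-contained sketch of that API (assumes the fasttext pip package; the tiny corpus file and its contents are purely illustrative):

# Train, save and query an unsupervised Facebook fastText model.
import fasttext

lines = ["this movie was great", "this movie was awful"] * 50
with open("tiny_corpus.txt", "w") as fh:                      # hypothetical file name
    fh.write("\n".join(lines))

model = fasttext.train_unsupervised("tiny_corpus.txt", model="skipgram",
                                    lr=0.05, dim=100, ws=5, epoch=5)
model.save_model("tiny_corpus.bin")
print(model.get_word_vector("movie").shape)                   # (100,)
print(model.get_nearest_neighbors("movie")[:3])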
Example #8
 def train(self):
     model = FastText(self.sentences,
                      size=200,
                      window=3,
                      min_count=1,
                      iter=70)
     currdir = os.getcwd()
     model.save(currdir + '/mymodel.bin')
     return model
Example #9
 def specific_setup(self):
     self.emb_size = 10
     self.attrs_number = len(self.ds.attr_to_idx)
     self.attr_language_model = {}
     raw_data = self.ds.get_raw_data()
     for attr in self.ds.attr_to_idx:
         attr_corpus = list(zip(raw_data[attr].tolist()))
         model = FastText(attr_corpus, min_count=1, size=self.emb_size)
         self.attr_language_model[attr] = model
Example #10
def bible_embeddings(processed_bible):
    #Parameters: processed bible file
    #Returns: writes bible representation to file
    model = FastText()
    model.build_vocab(sentences=processed_bible)
    model.train(sentences=processed_bible,
                total_examples=len(processed_bible),
                epochs=10)
    model.save("bible_ft.bin")
Example #11
 def train(self, sentences):
     self.model = FastText(sentences,
                           size=self.size,
                           window=self.window,
                           min_count=self.min_count,
                           sg=self.sg,
                           workers=self.workers)
     self.model.save(self.model_file)
     self.model.wv.save_word2vec_format(self.model_file + '.bin', binary=True)
Example #12
 def __init__(self, tokenizer, config):
     self.tokenizer = tokenizer
     self.pretrained_embed_dir = config.pretrained_embed_dir
     self.vocab_list = config.vocab_list
     self.vocab_size = config.vocab_size
     self.embed_dim = config.embed_dim
     self.idx2word = list()
     self.word2idx = dict()
     self.fasttext = FastText()
Example #13
def get_word_vector(sentences):
    return FastText(sentences,
                    size=100,
                    window=3,
                    min_count=1,
                    iter=10,
                    min_n=3,
                    max_n=6,
                    word_ngrams=0)
Example #14
def fasttext_model(text_data, **kwargs):
    model = FastText(text_data,
                     size=kwargs['size'],
                     window=kwargs['window'],
                     min_count=kwargs['min_count'],
                     workers=kwargs['workers'],
                     seed=kwargs['seed'],
                     sg=1)
    return model
Example #15
def test_fast():

    if gensim_version.major >= 4:
        fast = FastText(vector_size=4,
                        window=3,
                        min_count=1,
                        sentences=common_texts,
                        epochs=10)
    else:
        fast = FastText(size=4,
                        window=3,
                        min_count=1,
                        sentences=common_texts,
                        iter=10)
    fast_keyed_vectors = fast.wv
    wem = WordEmbeddingModel(fast_keyed_vectors, "w2v")

    assert fast.wv == wem.model
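
The gensim_version object used in the branch above is not defined in this snippet. A hedged sketch of one way to obtain a version object with a .major attribute (the exact helper the original test suite uses is an assumption):

# Derive a comparable version object for the gensim 3.x / 4.x branch above.
import gensim
from packaging.version import parse as parse_version

gensim_version = parse_version(gensim.__version__)
print(gensim_version.major)  # 3 or 4, selecting size=/iter= vs vector_size=/epochs=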
Example #16
def train_fasttext_model(sentences, embed_dim):
    print("Training a fasttext model ...")
    model = FastText(sentences,
                     size=embed_dim,
                     workers=2,
                     window=1,
                     min_count=1)
    print("Training complete")
    return model
Example #17
def spell(request):
    out = []
    inp = ''
    if request.method != "POST":
        report = {
            'word_count': len(content.split(' ')),
            'bigram_count': len(bigrams_list),
            'trigram_count': len(trigrams_list)
        }
    else:
        post_data = dict(request.POST.lists())
        post_data.pop('csrfmiddlewaretoken', None)
        inp = post_data['word'][0]
        print(post_data)
        global model_ted
        import re
        # remove parenthesis
        input_text_noparens = re.sub(r'\([^)]*\)', '', content)
        # store as list of sentences
        sentences_strings_ted = []
        for line in input_text_noparens.split('\n'):
            m = re.match(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$',
                         line)
            sentences_strings_ted.extend(
                sent for sent in m.groupdict()['postcolon'].split('.') if sent)
        # store as list of lists of words
        sentences_ted = []
        for sent_str in sentences_strings_ted:
            tokens = re.sub(r"[^a-z0-9]+", " ", sent_str.lower()).split()
            sentences_ted.append(tokens)
        import gensim
        from gensim.models import FastText
        model_ted = FastText(sentences_ted,
                             window=5,
                             min_count=5,
                             workers=4,
                             sg=1)

        similar = model_ted.wv.most_similar(inp)
        semantically_similar_words = {
            words:
            [item[0] for item in model_ted.wv.most_similar([words], topn=5)]
            for words in ['dhoni', 'singh']
        }
        out = []
        for i in similar:
            s = {}
            s['item'] = i[0]
            s['p'] = i[1]
            out.append(s)
        print(similar)
        print(model_ted)
    return render(request, 'custom2.html', {
        'bigram': bigrams_list,
        'similar': out,
        'inp': inp
    })
Example #18
def fasttext_model(sentences, size=100,
                   min_count=5, negative=5,
                   window=5, cbow=True, iterations=5,
                   seed=0, workers=1):
    """ gensim fasttext function """
    # sg=0 selects CBOW and sg=1 skip-gram; the constructor already trains for
    # `iterations` epochs, so no separate train() call is needed here.
    model = FastText(sentences, size=size, window=window, min_count=min_count,
                     negative=negative, sg=0 if cbow else 1, seed=seed,
                     iter=iterations, workers=workers)
    return model
Example #19
def _train_and_save_model_ft(sents, model_path):
    ft_model = FastText(sents,
                        size=128,
                        window=32,
                        min_count=5,
                        sample=1e-2,
                        sg=1,
                        iter=50)
    ft_model.save(model_path)
    return ft_model
Example #20
    def fit(self, X, is_prepro=False):
        if self.model is not None:
            raise RuntimeError("Failed to fit: a pretrained model has already been provided")

        if not is_prepro:
            X = self.prepro(X)
        self.model = FastText(X, **self.args)
        if self.save_file is not None:
            self.model.save(self.save_file)
            print("Model saved to {}".format(self.save_file))
Example #21
    def get_model_embedding(self, sentences):

        self.model = None
        if self.embedding_name == "fasttext":
            self.model = FastText(sentences, size=self.emb_dim, window=5, min_count=5, workers=4, sg=1)

        elif self.embedding_name == "glove":
            # Note: despite the "glove" label, this trains a gensim Word2Vec (CBOW) model.
            self.model = Word2Vec(sentences=sentences, size=self.emb_dim, window=5, min_count=5, workers=4, sg=0)

        return self.model
Example #22
def main(file_path, my_min_count, my_workers):

    # Train a FastText model on the tokens streamed from file_path.
    model = FastText([token for token in getline(file_path)],
                     size=500,
                     window=10,
                     min_count=my_min_count,
                     iter=10,
                     workers=my_workers)

    pass
Example #23
def train_fasttext_model(inp_data,
                         tokenizer_fn,
                         model_path,
                         char_tokens=False):
    tokens = tokenizer_fn(inp_data, char_tokens)
    model = FastText(size=128, window=5, min_count=2, workers=5, iter=50)

    model.build_vocab(tokens)
    model.train(tokens, total_examples=model.corpus_count, epochs=50)
    model.save(model_path)
Example #24
 def __init__(self,
              size: int = 5,
              window: int = 3,
              min_count: int = 1,
              skipgram: bool = True):
     model = FastText(size=size,
                      window=window,
                      min_count=min_count,
                      sg=skipgram)
     self.model = model
Example #25
def get_fastext(sentences_tok):
    print("Training FastText model ...\n")
    model = FastText(size=324, window=10, min_count=1)  # instantiate
    model.build_vocab(sentences_tok)
    model.train(sentences=sentences_tok,
                total_examples=len(sentences_tok),
                epochs=5)  # train
    se = Sentence2Vec(model)
    ft_embeddings = se.train(sentences_tok)
    return ft_embeddings
Example #26
def test_fast():
    fast = FastText(size=4,
                    window=3,
                    min_count=1,
                    sentences=common_texts,
                    iter=10)
    fast_keyed_vectors = fast.wv
    wem = WordEmbeddingModel(fast_keyed_vectors, "w2v")

    assert fast.wv == wem.model_
Example #27
def train(sentences):
    model = FastText(sentences,
                     size=100,
                     window=4,
                     min_count=1,
                     iter=5,
                     min_n=4,
                     max_n=7,
                     word_ngrams=1)
    model.save(MODEL_PATH)
Example #28
def fasttext_train(corpus_path, save_path):
    """输入分词完成的txt文件,一行为一个文本。"""
    model = FastText(window=5, size=200, min_count=1, workers=2)
    model.build_vocab(
        corpus_file=corpus_path)  # scan over corpus to build the vocabulary

    total_words = model.corpus_total_words  # number of words in the corpus
    model.train(corpus_file=corpus_path, total_words=total_words, epochs=5)

    model.save(save_path)
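
As the docstring notes, corpus_file must point to a tokenized text file with one text per line. A hedged, self-contained sketch of preparing such a file and training from it (file name, corpus and parameters are illustrative):

# Write a tiny tokenized corpus file and train via corpus_file, as above.
from gensim.models import FastText

corpus_path = "tiny_corpus.txt"                               # hypothetical path
with open(corpus_path, "w", encoding="utf-8") as fh:
    fh.write("human interface computer\n" * 10)
    fh.write("graph of trees\n" * 10)

model = FastText(window=5, size=50, min_count=1, workers=2)   # vector_size= on gensim >= 4.0
model.build_vocab(corpus_file=corpus_path)
model.train(corpus_file=corpus_path,
            total_words=model.corpus_total_words,
            epochs=5)
print(model.wv["graph"].shape)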
Example #29
def fasttext(model_path, sentences):
    '''
    https://radimrehurek.com/gensim/models/fasttext.html
    model_path should have a .model extension
    sentences: list of lists of strings (tokens)
    '''
    model = FastText(sentences, min_count=1)
    word_vectors = model.wv
    model.save(model_path)
    return model_path
Example #30
 def trainer(self, corpus):
     """
     :param corpus: [sentence: [ch, ch, . . ., ch]]
     :return:
     """
     train_instance = FastText(sentences=corpus,
                               size=self.size,
                               window=8,
                               min_count=1)
     self.model = train_instance