Exemple #1
0
    def create_new_model(cls,
                         corpus_path,
                         pmodel_name,
                         epochs=5,
                         pmin_count=10,
                         psize=150,
                         installdir=''):
        """ Creates, trains and saves a new model using gensim's implementation
        of the fastText algorithm.

        For CREATION/first time training only. To continue training an already existing
        model, use update_model().

        Parameters
        -----------
        corpus_path (str) - path to the corpus you wish to train the model with

        pmodel_name (str) - the name to be assigned to the model when saved; the
        '.bin' extension is appended when missing. Must be unique or an error is
        raised to avoid overwriting an existing model

        epochs (int, optional) - Number of times to iterate over training corpus during training

        pmin_count (int, optional) - Minimum frequency for a word to be used in training

        psize (int, optional) - Size of vectors for training

        installdir (str, optional) - Prefix prepended to the class-level model
        directory (IKFastTextModeling.__PATH_PREFIX__). Defaults to '' (no prefix)

        Returns:
        -----------
        True once the model has been trained and saved. Failures are reported by
        raising, never by returning False.

        Throws
        -----------
        FileExistsError - If a model file with the same name already exists
        FileNotFoundError - If corpus_path not found (raised by the parent
        implementation / underlying libraries, per the original documentation)
        RuntimeError - If training an already existing model that makes it past the existence check. This
        is because build_vocab raises RuntimeError if building existing vocab without update=True (see update_model)
        """
        # The model directory must be resolved for every call. Previously it was
        # only assigned when installdir was non-empty, so the default call failed
        # with UnboundLocalError on the first use of model_path.
        model_path = installdir + IKFastTextModeling.__PATH_PREFIX__

        if not pmodel_name.endswith('.bin'):
            pmodel_name += '.bin'

        target_path = os.path.join(model_path, pmodel_name)
        if os.path.exists(target_path):
            raise FileExistsError(
                "Model named {} already exists, model could not be created".
                format(pmodel_name[:-4]))

        # sg=1 is passed through to gensim's FastText constructor unchanged.
        model = ft.FastText(vector_size=psize, sg=1, min_count=pmin_count)

        # Vocabulary building and training are delegated to the parent class.
        super().create_new_model(corpus_path, model, epochs)

        ft.save_facebook_model(model, path=target_path)
        return True
Exemple #2
0
    def update_model(cls,
                     corpus_path,
                     pmodel_name,
                     use_iknow_entities=True,
                     tokenize_concepts=True,
                     installdir=''):
        """ Updates an already existing model by continuing its training
        on a new corpus, then writes the result back to the same file.

        Parameters
        -----------
        corpus_path (str) - path to the corpus being used to update the model

        pmodel_name (str) - The name of the model to be updated; the '.bin'
        extension is appended when missing

        use_iknow_entities (bool, optional) - forwarded unchanged to the parent
        class's update_model()

        tokenize_concepts (bool, optional) - forwarded unchanged to the parent
        class's update_model()

        installdir (str, optional) - Prefix prepended to the class-level model
        directory (IKFastTextModeling.__PATH_PREFIX__). Defaults to '' (no prefix)

        Return
        -----------
        True if model was updated. Failures are reported by raising.

        Throws
        -----------
        FileNotFoundError - if corpus or model not found
        """

        model_path = installdir + IKFastTextModeling.__PATH_PREFIX__

        try:
            if not pmodel_name.endswith('.bin'):
                pmodel_name += '.bin'
            path = os.path.join(model_path, pmodel_name)
            model = ft.load_facebook_model(path)

            super().update_model(corpus_path, model, use_iknow_entities,
                                 tokenize_concepts)

            # Save next to the existing file and swap it in afterwards, so a
            # failed save never leaves the caller without the previous model
            # (the old code deleted the file before writing the new one).
            tmp_path = path + '.tmp'
            try:
                ft.save_facebook_model(model, path=tmp_path)
                os.replace(tmp_path, path)
            finally:
                # Only present here if the save or the swap failed.
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)

        except FileNotFoundError as err:
            raise FileNotFoundError(
                "Model could not be updated, check specified corpus and model names"
            ) from err

        # Honour the documented contract; the method used to return None.
        return True
Exemple #3
0
 def save_model(self, path=""):
     """Write the wrapped model to ``path`` via ``fasttext.save_facebook_model``.

     ``self.check_no_model()`` is invoked first; what it does when no model
     is present is defined outside this excerpt.
     """
     self.check_no_model()
     write_model = fasttext.save_facebook_model
     write_model(self.model, path)
# NOTE(review): this fragment starts mid-script; `rem` and `hours` are
# computed by statements above the visible excerpt.
minutes, seconds = divmod(rem, 60)
# Report the elapsed time as HH:MM:SS.ss (zero-padded fields).
print("Time elapsed {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes),
                                                   seconds))

# Show the corpus_count attribute of the model before training.
print(model.corpus_count)
print('\n\ntrain model...')
# Train on a fresh MyIter over the input corpus; LossCallback is passed as a
# training callback (both are defined outside this excerpt).
model.train(MyIter(input_corpus),
            total_examples=total_examples,
            epochs=epoch,
            callbacks=[LossCallback()])

# Print the vectors looked up for a few sample keys
# (presumably a quick sanity check of the trained model).
print(model.wv[' '])
print(model.wv['use'])
print(model.wv['even'])

print('\n\nsaving model...')
# Time the save of the model to `model_bin`.
start = time.time()
save_facebook_model(model, model_bin)
end = time.time()
# Split the elapsed seconds into hours / minutes / seconds for display.
hours, rem = divmod(end - start, 3600)
minutes, seconds = divmod(rem, 60)
print("Time elapsed {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes),
                                                   seconds))

# Disabled: reload the saved model and print vectors for sample keys.
# print('loading saved model...')
# fb_model = load_facebook_model(model_bin)
# print(fb_model.wv[' '])
# print(fb_model.wv['use'])
# print(fb_model.wv['thisIsaReallyLongN'])
Exemple #5
0
 def save(self):
     """Write ``self.model`` to ``self.reference`` via ``save_facebook_model``."""
     model, destination = self.model, self.reference
     save_facebook_model(model, destination)