def create_new_model(cls, corpus_path, pmodel_name, epochs=5, pmin_count=10, psize=150, installdir=''):
    """
    Creates, trains and saves a new model using gensim's implementation of
    the fastText algorithm.

    For CREATION/first time training only. To continue training an already
    existing model, use update_model().

    Parameters
    -----------
    corpus_path (str) - path to the corpus you wish to train the model with

    pmodel_name (str) - the name to be assigned to the model when saved; a
    '.bin' suffix is appended when it is missing. Must be unique: an error is
    raised rather than overwriting an existing model file

    epochs (int, optional) - Number of times to iterate over training corpus
    during training (forwarded to the parent class's create_new_model)

    pmin_count (int, optional) - Minimum frequency for a word to be used in
    training

    psize (int, optional) - Size of vectors for training

    installdir (str, optional) - prefix prepended to
    IKFastTextModeling.__PATH_PREFIX__ to form the directory the model file
    is written to. Defaults to '' (the path prefix is used on its own)

    Returns:
    -----------
    True once the model has been trained and saved. Failures are reported by
    raising, not by returning False.

    Throws
    -----------
    FileExistsError - If a model file with this name already exists

    FileNotFoundError - If corpus_path not found (per the original
    documentation; raised by the training code this method delegates to)

    RuntimeError - Per the original documentation: build_vocab raises
    RuntimeError when building an existing vocab without update=True
    (see update_model)
    """
    # Always derive the target directory. Previously this assignment only ran
    # when installdir was non-empty, so the default installdir='' left
    # model_path unbound and the os.path.exists() call below failed.
    # update_model() computes the directory the same unconditional way.
    model_path = installdir + IKFastTextModeling.__PATH_PREFIX__

    if not pmodel_name.endswith('.bin'):
        pmodel_name = pmodel_name + '.bin'
    target = os.path.join(model_path, pmodel_name)

    # Refuse to overwrite: creation is only valid for a brand-new model name.
    if os.path.exists(target):
        raise FileExistsError(
            "Model named {} already exists, model could not be created".format(pmodel_name[:-4]))

    # sg=1 selects skip-gram training in gensim's FastText.
    model = ft.FastText(vector_size=psize, sg=1, min_count=pmin_count)
    super().create_new_model(corpus_path, model, epochs)
    ft.save_facebook_model(model, path=target)
    return True
def update_model(cls, corpus_path, pmodel_name, use_iknow_entities=True, tokenize_concepts=True, installdir=''):
    """
    Updates an already existing model by continuing its training on a new
    corpus, then writes the updated model back to the same file.

    Parameters
    -----------
    corpus_path (str) - path to the corpus being used to update the model

    pmodel_name (str) - The name of the model to be updated; a '.bin' suffix
    is appended when it is missing

    use_iknow_entities (bool, optional) - forwarded unchanged to the parent
    class's update_model

    tokenize_concepts (bool, optional) - forwarded unchanged to the parent
    class's update_model

    installdir (str, optional) - prefix prepended to
    IKFastTextModeling.__PATH_PREFIX__ to form the directory holding the
    model file

    Return
    -----------
    True if model was updated. Failures are reported by raising.

    Throws
    -----------
    FileNotFoundError - if corpus or model not found
    """
    model_path = installdir + IKFastTextModeling.__PATH_PREFIX__
    if not pmodel_name.endswith('.bin'):
        pmodel_name = pmodel_name + '.bin'
    path = os.path.join(model_path, pmodel_name)

    # Write the updated model next to the original and swap it in afterwards.
    # Deleting the original before saving (the previous behaviour) lost the
    # only copy of the model whenever the save step failed.
    staging_path = path + '.tmp'
    try:
        model = ft.load_facebook_model(path)
        super().update_model(corpus_path, model, use_iknow_entities, tokenize_concepts)
        ft.save_facebook_model(model, path=staging_path)
        os.replace(staging_path, path)
    except FileNotFoundError as err:
        raise FileNotFoundError(
            "Model could not be updated, check specified corpus and model names"
        ) from err
    finally:
        # Only present here if the save or the swap did not complete.
        if os.path.exists(staging_path):
            os.remove(staging_path)
    return True
def save_model(self, path=""):
    """Write this object's model to ``path`` in Facebook fastText format.

    ``path`` is handed to ``fasttext.save_facebook_model`` unchanged,
    including the default empty string.
    """
    # Precondition check runs before anything else.
    # NOTE(review): presumably raises when no model is loaded - confirm
    # against check_no_model's definition.
    self.check_no_model()

    writer = fasttext.save_facebook_model
    writer(self.model, path)
# Tail of a training script: finishes reporting the elapsed time of a step
# started above this excerpt, trains the model, prints a few sample vectors,
# then saves the model in Facebook fastText format and reports how long the
# save took.

# `rem` and `hours` come from a divmod(..., 3600) performed earlier (not
# visible here); this splits the remainder into minutes and seconds.
minutes, seconds = divmod(rem, 60)
print("Time elapsed {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
print(model.corpus_count)

print('\n\ntrain model...')
# A fresh MyIter is built over input_corpus for the training pass;
# LossCallback is passed through gensim's callbacks hook.
model.train(MyIter(input_corpus), total_examples=total_examples, epochs=epoch, callbacks=[LossCallback()])

# Spot-check the trained vectors for a few tokens (including a single space).
print(model.wv[' '])
print(model.wv['use'])
print(model.wv['even'])

print('\n\nsaving model...')
# Time only the save step.
start = time.time()
save_facebook_model(model, model_bin)
end = time.time()
# Break the elapsed seconds into hours / minutes / seconds for display.
hours, rem = divmod(end - start, 3600)
minutes, seconds = divmod(rem, 60)
print("Time elapsed {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))

# Disabled sanity check: reload the saved file and look up the same tokens
# plus one long token.
# load model?
# print('loading saved model...')
# fb_model = load_facebook_model(model_bin)
# print(fb_model.wv[' '])
# print(fb_model.wv['use'])
# print(fb_model.wv['thisIsaReallyLongN'])
def save(self):
    """Save ``self.model`` via ``save_facebook_model`` to ``self.reference``.

    NOTE(review): ``self.reference`` is presumably the destination file
    path - confirm where the attribute is assigned.
    """
    export = save_facebook_model
    export(self.model, self.reference)