def train(self, epochs=30, no_threads=None):
    """
    Train a GloVe model on the data attached to this instance and save it.

    Exactly one data source is used, picked in this order of precedence:
    ``self.sentences``, ``self.corpus_file``, ``self.corpus_path``,
    ``self.df``. The trained model is saved under ``self.model_name``.

    Parameters
    ----------
    epochs : int
        Total epochs for training.
    no_threads : int, optional
        Number of threads for training. Falls back to ``self.cpu_cores``
        when not given.

    Raises
    ------
    ValueError
        If none of ``sentences``, ``corpus_file``, ``corpus_path`` or
        ``df`` has been provided.

    Example
    --------
    >>> from ekushey.feature_extraction import BN_GloVe

    #Training Against Sentences
    >>> glv = BN_GloVe(sentences=[['আমার', 'প্রিয়', 'জন্মভূমি'], ['বাংলা', 'আমার', 'মাতৃভাষা'], ['বাংলা', 'আমার', 'মাতৃভাষা'], ['বাংলা', 'আমার', 'মাতৃভাষা'], ['বাংলা', 'আমার', 'মাতৃভাষা'] ])
    >>> glv.train()

    #Training Against one Text Corpus
    >>> glv = BN_GloVe(corpus_file="path_to_corpus.txt")
    >>> glv.train()

    #Training Against Multiple Corpuses
    path
      ->corpus
        ->1.txt
        ->2.txt
        ->3.txt
    >>> glv = BN_GloVe(corpus_path="path/corpus")
    >>> glv.train(epochs=25)

    #Training Against a Dataframe Column
    >>> glv = BN_GloVe(df= news_data['text_content'])
    >>> glv.train(epochs=25)
    """
    # Fail fast, before any heavy work, when there is nothing to train on.
    # ValueError is still caught by callers that catch Exception.
    if (
        not (self.sentences or self.corpus_file or self.corpus_path)
        and self.df is None
    ):
        raise ValueError('Data is not given')

    if self.sentences:
        data = self.sentences
        print("got sentence")
    elif self.corpus_file:
        print("got sentence")
        data = PathLineSentences(self.corpus_file)
    elif self.corpus_path:
        print("got sentence")
        data = PathLineSentences(self.corpus_path)
    else:
        # The guard above guarantees self.df is not None on this branch,
        # so `data` is always bound once the chain finishes.
        print("Dataframe got")
        # Entries may span several lines: flatten everything into single
        # lines first, then whitespace-tokenize each line.
        lines = '\n'.join(self.df).split('\n')
        data = [sent.split() for sent in lines]

    if no_threads is None:
        no_threads = self.cpu_cores

    t = time()
    corpus = Corpus()
    corpus.fit(data, window=self.window)
    print('Dict size: %s' % len(corpus.dictionary))

    glove = Glove(no_components=self.size, learning_rate=self.n)
    glove.fit(corpus.matrix, epochs=epochs, no_threads=no_threads, verbose=True)
    print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

    glove.add_dictionary(corpus.dictionary)
    print("Saving model to current directory")
    glove.save(self.model_name)
def train(self, epochs=30):
    """
    Train a FastText model on the data attached to this instance and save it.

    Exactly one data source is used, picked in this order of precedence:
    ``self.sentences``, ``self.corpus_file``, ``self.corpus_path``,
    ``self.df``. The trained model is saved under ``self.model_name``.

    Parameters
    ----------
    epochs : int
        Total epochs for training.

    Raises
    ------
    ValueError
        If none of ``sentences``, ``corpus_file``, ``corpus_path`` or
        ``df`` has been provided.

    Example
    --------
    >>> from ekushey.feature_extraction import BN_FastText

    #Training Against Sentences
    >>> ft = BN_FastText(sentences=[['আমার', 'প্রিয়', 'জন্মভূমি'], ['বাংলা', 'আমার', 'মাতৃভাষা'], ['বাংলা', 'আমার', 'মাতৃভাষা'], ['বাংলা', 'আমার', 'মাতৃভাষা'], ['বাংলা', 'আমার', 'মাতৃভাষা'] ])
    >>> ft.train()

    #Training Against one Text Corpus
    >>> ft = BN_FastText(corpus_file="path_to_corpus.txt")
    >>> ft.train()

    #Training Against Multiple Corpuses
    path
      ->corpus
        ->1.txt
        ->2.txt
        ->3.txt
    >>> ft = BN_FastText(corpus_path="path/corpus")
    >>> ft.train(epochs=25)

    #Training Against a Dataframe Column
    >>> ft = BN_FastText(df= news_data['text_content'])
    >>> ft.train(epochs=25)
    """
    # Fail fast, before any heavy work, when there is nothing to train on.
    # ValueError is still caught by callers that catch Exception.
    if (
        not (self.sentences or self.corpus_file or self.corpus_path)
        and self.df is None
    ):
        raise ValueError('Data is not given')

    if self.sentences:
        data = self.sentences
    elif self.corpus_file:
        data = PathLineSentences(self.corpus_file)
    elif self.corpus_path:
        data = PathLineSentences(self.corpus_path)
    else:
        # The guard above guarantees self.df is not None on this branch,
        # so `data` is always bound once the chain finishes.
        # Entries may span several lines: flatten everything into single
        # lines first, then whitespace-tokenize each line.
        lines = '\n'.join(self.df).split('\n')
        data = [sent.split() for sent in lines]

    ft_model = FastText(
        size=self.size,
        alpha=self.alpha,
        window=self.window,
        min_count=self.min_count,
        max_vocab_size=self.max_vocab_size,
        sample=self.sample,
        workers=self.workers,
        min_alpha=self.min_alpha,
        sg=self.sg,
        negative=self.negative,
    )
    print("Working with "+str(self.workers)+" worker threads")

    ft_model.build_vocab(data, progress_per=10000)
    print("Vocabulary build Successfully")

    t = time()
    ft_model.train(
        data,
        total_examples=ft_model.corpus_count,
        epochs=epochs,
        report_delay=1,
    )
    print('Training took : {} mins'.format(round((time() - t) / 60, 2)))

    ft_model.save(self.model_name)
    print(ft_model)