def word2vec_train_new_dataset(config_path: str, dataset_path: str, model_save_path: str,
                               pretrain_model_path: str = '', epochs: int = -1) -> None:
    """
    Train (or continue training) a word2vec model on one dataset file.

    Loads the configuration from ``config_path`` and delegates the actual work
    to ``train_word2vec``.

    :param config_path: configuration file path (JSON, read via ``Config.from_json``)
    :param dataset_path: path of the dataset to train on; must exist
    :param model_save_path: where the trained model is saved
    :param pretrain_model_path: optional path of an already trained model,
        forwarded unchanged to ``train_word2vec``
    :param epochs: number of epochs, forwarded to ``train_word2vec`` as ``epoch``
    :raises AssertionError: if ``dataset_path`` does not exist
    """
    # Explicit check instead of an ``assert`` statement: asserts are removed
    # when Python runs with -O, which would let an invalid path slip through.
    # AssertionError is kept so existing callers observe the same exception.
    if not os.path.exists(dataset_path):
        raise AssertionError('dataset path is invalid')

    cfg = Config.from_json(config_path)
    train_word2vec(
        cfg=cfg,
        input_file_name=dataset_path,
        pretrain_model_path=pretrain_model_path,
        model_save_path=model_save_path,
        epoch=epochs,
    )
def sentiment_rank(config_path: str, sentence: str) -> float:
    """
    Score the sentiment of one sentence with the saved sentiment model.

    The model weights are read from ``optimal.pth`` inside the configured
    checkpoint folder; the sentence is passed through a ``SentenceDistiller``
    (stop-word list under ``comments_dataset/stop_words``) and printed before
    it is split on whitespace and fed to the model.

    :param config_path: configuration file path
    :param sentence: sentence to deal with
    :return: sentiment score, documented as ranging in [0, 1]; the bigger the
        score is, the more positive the sentiment is
    """
    cfg = Config.from_json(config_path)

    # Build the model and restore its weights on the configured device.
    model = SentimentAnalysis(cfg=cfg)
    checkpoint_file = os.path.join(cfg.checkpoint_folder, 'optimal.pth')
    weights = torch.load(checkpoint_file, map_location=cfg.device)
    model.load_state_dict(weights)

    # Distill the raw sentence before scoring it.
    stop_words_file = os.path.join('comments_dataset', 'stop_words', 'all_stop_words.txt')
    distill = SentenceDistiller(stop_words_path=stop_words_file)
    cleaned = distill(sentence)
    print(cleaned)

    tokens = cleaned.split()
    score = model(tokens)
    return float(score)
import argparse

from config.Config import Config
from sentiment import word2vec_train_new_dataset, sentiment_rank


def _str_to_bool(value: str) -> bool:
    """
    Parse a boolean given on the command line.

    ``type=bool`` must not be used with argparse: ``bool('False')`` is ``True``
    because any non-empty string is truthy, so the flag could never be
    switched off explicitly.

    :param value: raw command-line text
    :return: the parsed boolean
    :raises argparse.ArgumentTypeError: if the text is not a recognised boolean
    """
    normalized = value.strip().lower()
    if normalized in ('true', 't', 'yes', 'y', '1'):
        return True
    if normalized in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')


def main() -> None:
    """Command-line entry point: optionally train word2vec, then score a sentence."""
    parser = argparse.ArgumentParser()
    parser.add_argument('config', type=str, help='configuration file path')
    # nargs='?' with const=True accepts both the bare flag (``-t``) and the
    # older explicit form (``-t True`` / ``-t False``).
    parser.add_argument('-t', "--train", type=_str_to_bool, nargs='?', const=True, default=False,
                        help='Train word2vec model or not')
    parser.add_argument('-s', "--sentence", type=str, default='', help='Sentence to deal with')
    args = parser.parse_args()

    config = Config.from_json(args.config)

    if args.train:
        # Every dataset is trained into the same model file: the file saved by
        # one iteration is handed to the next one as the pretrained model.
        for dataset_path in config.sentence_datasets:
            word2vec_train_new_dataset(
                config_path=args.config,
                dataset_path=dataset_path,
                model_save_path=config.word2vec_file_path,
                pretrain_model_path=config.word2vec_file_path
            )

    if args.sentence:
        sentiment_score = sentiment_rank(config_path=args.config, sentence=args.sentence)
        print(f'sentiment score: {sentiment_score: .3f}')


if __name__ == '__main__':
    main()
import gensim


def train_word2vec(cfg: Config, input_file_name: str = os.path.join('data', 'wiki.txt'),
                   pretrain_model_path: str = '', model_save_path: str = '', epoch: int = -1):
    """
    Train a word2vec model on a line-per-sentence text file and save it.

    If ``pretrain_model_path`` points to an existing file, that model is loaded
    and trained further on ``input_file_name``; otherwise a new model is built
    from scratch using the hyper-parameters in ``cfg``.

    :param cfg: configuration providing ``embedding_dim``, ``window_size``,
        ``min_count`` and ``word2vec_file_path``
    :param input_file_name: path of the training corpus, read with ``LineSentence``
    :param pretrain_model_path: path of a previously saved model to continue
        from; a non-existing path (including the empty default) means
        "train from scratch"
    :param model_save_path: where to save the model; empty means
        ``cfg.word2vec_file_path``
    :param epoch: number of epochs when continuing training; values <= 0 use the
        epoch count stored in the loaded model. Not used when a new model is built.
    """
    print(f'dataset path: {input_file_name}, train_word2vec starts...')

    if not model_save_path:
        model_save_path = cfg.word2vec_file_path

    if not os.path.exists(pretrain_model_path):
        # No usable pretrained model: the constructor builds the vocabulary
        # and trains in one go.
        model = Word2Vec(LineSentence(input_file_name),
                         size=cfg.embedding_dim,
                         window=cfg.window_size,
                         min_count=cfg.min_count,
                         workers=multiprocessing.cpu_count())
    else:
        # Load the file whose existence was just checked. The previous code
        # loaded cfg.word2vec_file_path here, ignoring pretrain_model_path.
        model = gensim.models.Word2Vec.load(pretrain_model_path)
        # NOTE(review): total_examples is the corpus count stored in the loaded
        # model, which may differ from the size of input_file_name, and the
        # vocabulary is not extended (no build_vocab(update=True)) - confirm
        # that this is intended.
        model.train(LineSentence(input_file_name),
                    epochs=model.iter if epoch <= 0 else epoch,
                    total_examples=model.corpus_count)

    model.save(model_save_path)


if __name__ == '__main__':
    config = Config.from_json(os.path.join('config', 'config.json'))
    train_word2vec(cfg=config)