コード例 #1
0
def word2vec_train_new_dataset(config_path: str,
                               dataset_path: str,
                               model_save_path: str,
                               pretrain_model_path: str = '',
                               epochs: int = -1):
    """
    Run ``train_word2vec`` on one dataset using a configuration loaded from JSON.

    :param config_path: path handed to ``Config.from_json``
    :param dataset_path: dataset path; must exist on disk
    :param model_save_path: forwarded to ``train_word2vec`` as ``model_save_path``
    :param pretrain_model_path: forwarded to ``train_word2vec`` as ``pretrain_model_path``
    :param epochs: forwarded to ``train_word2vec`` as ``epoch``
    :raises AssertionError: if ``dataset_path`` does not exist
    """
    # Check the dataset before the configuration is even loaded.
    dataset_found = os.path.exists(dataset_path)
    assert dataset_found, 'dataset path is invalid'

    settings = Config.from_json(config_path)
    train_kwargs = {
        'input_file_name': dataset_path,
        'pretrain_model_path': pretrain_model_path,
        'model_save_path': model_save_path,
        'epoch': epochs,
    }
    train_word2vec(cfg=settings, **train_kwargs)
コード例 #2
0
def sentiment_rank(config_path: str, sentence: str) -> float:
    """
    Score one sentence with the saved sentiment model.

    The model weights are read from ``optimal.pth`` inside the configured
    checkpoint folder, the sentence is passed through ``SentenceDistiller``,
    and the whitespace-split result is fed to the model.

    :param config_path: configuration file path
    :param sentence: sentence to deal with
    :return: sentiment score, ranging in [0, 1]. The bigger the score is, the more positive sentiment is
    """
    settings = Config.from_json(config_path)
    model = SentimentAnalysis(cfg=settings)

    # Restore the saved weights onto the device named in the configuration.
    # NOTE(review): the model is not switched to eval mode here - confirm
    # whether SentimentAnalysis needs that for inference.
    checkpoint_file = os.path.join(settings.checkpoint_folder, 'optimal.pth')
    weights = torch.load(checkpoint_file, map_location=settings.device)
    model.load_state_dict(weights)

    stop_words_file = os.path.join('comments_dataset', 'stop_words',
                                   'all_stop_words.txt')
    distill = SentenceDistiller(stop_words_path=stop_words_file)

    cleaned = distill(sentence)
    print(cleaned)

    tokens = cleaned.split()
    return float(model(tokens))
コード例 #3
0
import argparse
from config.Config import Config
from sentiment import word2vec_train_new_dataset, sentiment_rank

def str2bool(value: str) -> bool:
    """
    Convert a command-line string to a boolean.

    ``type=bool`` must not be used with argparse: ``bool('False')`` is ``True``
    because every non-empty string is truthy, so the flag could never be
    switched off explicitly.

    :param value: raw option value from the command line
    :return: ``True`` for 1/true/t/yes/y/on, ``False`` for 0/false/f/no/n/off
        or an empty string (case-insensitive, surrounding blanks ignored)
    :raises argparse.ArgumentTypeError: if the value is not recognised
    """
    normalized = value.strip().lower()
    if normalized in ('1', 'true', 't', 'yes', 'y', 'on'):
        return True
    if normalized in ('', '0', 'false', 'f', 'no', 'n', 'off'):
        return False
    raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')


if __name__ == '__main__':
    # Command-line entry point: optionally train word2vec on every configured
    # dataset, then optionally score a single sentence.
    parser = argparse.ArgumentParser()

    parser.add_argument('config', type=str, help='configuration file path')
    # The option still takes a value (e.g. "-t True"), but it is now parsed
    # properly, so "-t False" really disables training.
    parser.add_argument('-t', "--train", type=str2bool, default=False, help='Train word2vec model or not')

    parser.add_argument('-s', "--sentence", type=str, default='', help='Sentence to deal with')

    args = parser.parse_args()

    config = Config.from_json(args.config)
    if args.train:
        # The same file is used as both the starting model and the save
        # target, so each dataset continues from the model saved by the
        # previous iteration.
        for dataset_path in config.sentence_datasets:
            word2vec_train_new_dataset(
                config_path=args.config,
                dataset_path=dataset_path,
                model_save_path=config.word2vec_file_path,
                pretrain_model_path=config.word2vec_file_path
            )

    if args.sentence:
        sentiment_score = sentiment_rank(config_path=args.config,
                                         sentence=args.sentence)
        print(f'sentiment score: {sentiment_score: .3f}')
コード例 #4
0
import gensim


def train_word2vec(cfg: Config,
                   input_file_name: str = os.path.join('data', 'wiki.txt'),
                   pretrain_model_path: str = '',
                   model_save_path: str = '',
                   epoch: int = -1):
    """
    Train a word2vec model from scratch, or continue training a saved one, then save it.

    :param cfg: configuration; ``embedding_dim``, ``window_size``, ``min_count``
        and ``word2vec_file_path`` are read from it
    :param input_file_name: corpus path handed to ``LineSentence``
    :param pretrain_model_path: saved model to resume from; when this path does
        not exist (including the default ``''``) a new model is trained instead
    :param model_save_path: where the model is saved; falls back to
        ``cfg.word2vec_file_path`` when empty
    :param epoch: epochs for continued training; values <= 0 use the epoch
        count stored in the loaded model. Not used when training from scratch.
    """
    print(f'dataset path: {input_file_name}, train_word2vec starts...')

    if not model_save_path:
        model_save_path = cfg.word2vec_file_path

    sentences = LineSentence(input_file_name)

    if not os.path.exists(pretrain_model_path):
        model = Word2Vec(sentences,
                         size=cfg.embedding_dim,
                         window=cfg.window_size,
                         min_count=cfg.min_count,
                         workers=multiprocessing.cpu_count())
    else:
        # Load the file whose existence was just checked, not
        # cfg.word2vec_file_path, which may be a different path.
        model = gensim.models.Word2Vec.load(pretrain_model_path)
        # Merge the new corpus into the vocabulary first: this lets unseen
        # words be learned and refreshes model.corpus_count, so the
        # total_examples passed below describes the corpus being trained on.
        model.build_vocab(sentences, update=True)
        model.train(sentences,
                    total_examples=model.corpus_count,
                    epochs=model.iter if epoch <= 0 else epoch)

    model.save(model_save_path)


if __name__ == '__main__':
    # Script entry point: load config/config.json and train with the default arguments.
    default_config_path = os.path.join('config', 'config.json')
    config = Config.from_json(default_config_path)
    train_word2vec(cfg=config)