Example #1
    def create_model(self, fname, max_news=99, n_proc=1, window=5, splits=100):  # note: splits is currently unused
        name = clean_name(fname)
        model = word2vec.Word2Vec(window=window, workers=n_proc)
        if name == 'text8':
            sentences = word2vec.Text8Corpus(os.path.join('res', 'model', 'text8'))
            model.build_vocab(sentences)  # vocab must be built before train()
            model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
        elif name == 'brown':
            sentences = brown.sents()  # NLTK Brown corpus
            model.build_vocab(sentences)
            model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
        elif name.startswith('news'):
            target_fpath = os.path.join('res', 'model', name+'.txt')
            if not os.path.exists(target_fpath):
                build_news_corpus(name, max_news, n_proc, target_fpath)
            sentences = word2vec.LineSentence(target_fpath)
            model.build_vocab(sentences)
            model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
        elif name.startswith('spanishEtiquetado'):
            target_fpath = os.path.join('res', 'model', name+'.txt')
            if not os.path.exists(target_fpath):
                path = os.path.join('res', 'model', 'spanishEtiquetado')
                max_pos_len = re.search(r'\d+', name)
                if max_pos_len:
                    max_pos_len = int(max_pos_len.group(0))
                build_corpus(path, name.endswith('pos'), target_fpath, max_pos_len)
            sentences = word2vec.LineSentence(target_fpath)
            model.build_vocab(sentences)
            model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
        else:
            target_fpath = os.path.join('res', 'model', name+'.txt')
            file_to_lower(target_fpath)
            sentences = word2vec.LineSentence(target_fpath)
            model.build_vocab(sentences)
            model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)

        # gensim >= 1.0 exposes the word2vec-format writer on model.wv
        model.wv.save_word2vec_format(os.path.join('res', 'model', fname), binary=fname.endswith('.bin'))
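
For reference, a minimal standalone sketch of the build_vocab -> train -> save flow used above, against the current gensim API (gensim >= 4 is assumed; the toy sentences are illustrative):

from gensim.models import word2vec

# tiny illustrative corpus; real corpora come from Text8Corpus/LineSentence
sentences = [['drug', 'interaction', 'risk'], ['dose', 'adjustment', 'risk']]
model = word2vec.Word2Vec(window=5, workers=1, min_count=1)
model.build_vocab(sentences)  # vocab must exist before train()
model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
model.wv.save_word2vec_format('toy.bin', binary=True)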
Example #2
def main():
    parser = options.get_parser('Trainer')
    options.add_dataset_args(parser)
    options.add_preprocessing_args(parser)
    options.add_model_args(parser)
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    
    args = parser.parse_args()
    print(args)
    
    args.cuda = not args.disable_cuda and torch.cuda.is_available()
    
    # checkpoint
    checkpoint_dir = os.path.dirname(args.checkpoint)
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)
    
    # load dataset
    train_raw_corpus, val_raw_corpus, test_raw_corpus = utils.load_corpus(args.processed_dir)
    assert train_raw_corpus and val_raw_corpus and test_raw_corpus, 'Corpus not found, please run preprocess.py to obtain corpus!'
    train_corpus = [(line.sent, line.type, line.p1, line.p2) for line in train_raw_corpus]
    val_corpus = [(line.sent, line.type, line.p1, line.p2) for line in val_raw_corpus]
    test_corpus = [(line.sent, line.type, line.p1, line.p2) for line in test_raw_corpus]
    
    
    start_epoch = 0
    caseless = args.caseless
    batch_size = args.batch_size
    num_epoch = args.num_epoch
    
    # preprocessing
    sents = [tup[0] for tup in train_corpus + val_corpus]
    feature_map = utils.build_vocab(sents, min_count=args.min_count, caseless=caseless)
    target_map = ddi2013.target_map
    train_features, train_targets = utils.build_corpus(train_corpus, feature_map, target_map, caseless)
    val_features, val_targets = utils.build_corpus(val_corpus, feature_map, target_map, caseless)
    test_features, test_targets = utils.build_corpus(test_corpus, feature_map, target_map, caseless)
    
    class_weights = torch.Tensor(utils.get_class_weights(train_targets)) if args.class_weight else None
    train_loader = utils.construct_bucket_dataloader(train_features, train_targets, feature_map['PAD'], batch_size, args.position_bound, is_train=True)
    val_loader = utils.construct_bucket_dataloader(val_features, val_targets, feature_map['PAD'], batch_size, args.position_bound, is_train=False)
    test_loader = utils.construct_bucket_dataloader(test_features, test_targets, feature_map['PAD'], batch_size, args.position_bound, is_train=False)
    print('Preprocessing done! Vocab size: {}'.format(len(feature_map)))
    
    # build model
    vocab_size = len(feature_map)
    tagset_size = len(target_map)
    model = utils.build_model(args, vocab_size, tagset_size)
    
    # loss
    criterion = utils.build_loss(args, class_weights=class_weights)
    
    # load states
    if os.path.isfile(args.load_checkpoint):
        print('Loading checkpoint file from {}...'.format(args.load_checkpoint))
        checkpoint_file = torch.load(args.load_checkpoint)
        start_epoch = checkpoint_file['epoch'] + 1
        model.load_state_dict(checkpoint_file['state_dict'])
    #    optimizer.load_state_dict(checkpoint_file['optimizer'])
    else:
        print('No checkpoint file found: {}, training from scratch...'.format(args.load_checkpoint))
        if not args.rand_embedding:
            pretrained_word_embedding, in_doc_word_indices = utils.load_word_embedding(args.emb_file, feature_map, args.embedding_dim)
            print(pretrained_word_embedding.size())
            print(vocab_size)
            model.load_pretrained_embedding(pretrained_word_embedding)
            if args.disable_fine_tune:
                model.update_part_embedding(in_doc_word_indices) # update only non-pretrained words
        model.rand_init(init_embedding=args.rand_embedding)
    
    # trainer
    trainer = SeqTrainer(args, model, criterion)
    
    if os.path.isfile(args.load_checkpoint):
        dev_prec, dev_rec, dev_f1, _ = evaluate(trainer, val_loader, target_map, cuda=args.cuda)
        test_prec, test_rec, test_f1, _ = evaluate(trainer, test_loader, target_map, cuda=args.cuda)
        print('checkpoint dev_prec: {:.4f}, dev_rec: {:.4f}, dev_f1: {:.4f}, test_prec: {:.4f}, test_rec: {:.4f}, test_f1: {:.4f}'.format(
            dev_prec, dev_rec, dev_f1, test_prec, test_rec, test_f1))
    
    track_list = []
    best_f1 = float('-inf')
    patience_count = 0
    start_time = time.time()
    
    
    for epoch in range(start_epoch, num_epoch):
        epoch_loss = train(train_loader, trainer, epoch)
    
        # update lr
        trainer.lr_step()
           
        dev_prec, dev_rec, dev_f1, dev_loss = evaluate(trainer, val_loader, target_map, cuda=args.cuda)
        if dev_f1 >= best_f1:
            patience_count = 0
            best_f1 = dev_f1
    
            test_prec, test_rec, test_f1, _ = evaluate(trainer, test_loader, target_map, cuda=args.cuda)
    
            track_list.append({'epoch': epoch, 'loss': epoch_loss, 
                'dev_prec': dev_prec, 'dev_rec': dev_rec, 'dev_f1': dev_f1, 'dev_loss': dev_loss, 
                'test_prec': test_prec, 'test_rec': test_rec, 'test_f1': test_f1})
            print('epoch: {}, loss: {:.4f}, dev_f1: {:.4f}, dev_loss: {:.4f}, test_f1: {:.4f}\tsaving...'.format(epoch, epoch_loss, dev_f1, dev_loss, test_f1))
    
            try:
                utils.save_checkpoint({
                            'epoch': epoch,
                            'state_dict': model.state_dict(),
                            'optimizer': trainer.optimizer.state_dict(),
                            'f_map': feature_map,
                            't_map': target_map,
                        }, {'track_list': track_list,
                            'args': vars(args)
                            }, args.checkpoint + '_lstm')
            except Exception as inst:
                print(inst)
        else:
            patience_count += 1
            track_list.append({'epoch': epoch,'loss': epoch_loss, 'dev_prec': dev_prec, 'dev_rec': dev_rec, 'dev_f1': dev_f1, 'dev_loss': dev_loss})
            print('epoch: {}, loss: {:.4f}, dev_f1: {:.4f}, dev_loss: {:.4f}'.format(epoch, epoch_loss, dev_f1, dev_loss))
    
        print('epoch: {} of {}, elapsed: {:.2f} s'.format(epoch, args.num_epoch, time.time() - start_time))
        if patience_count >= args.patience:
            break
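
The loop above implements patience-based early stopping: reset the counter and checkpoint when dev F1 improves, otherwise count a stale epoch and stop once the counter reaches args.patience. A minimal self-contained sketch of the same pattern (the two function arguments are hypothetical stand-ins):

def fit_with_early_stopping(train_one_epoch, evaluate_dev_f1, num_epoch=50, patience=5):
    best_f1 = float('-inf')
    patience_count = 0
    for epoch in range(num_epoch):
        train_one_epoch(epoch)
        dev_f1 = evaluate_dev_f1()
        if dev_f1 >= best_f1:
            best_f1 = dev_f1      # improvement: checkpoint here
            patience_count = 0
        else:
            patience_count += 1   # stale epoch
            if patience_count >= patience:
                break
    return best_f1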
Example #3
def train_model_gensim_cross_validation(authors,
                                        label_type,
                                        pipeline,
                                        config="",
                                        token_level="word",
                                        verbose=1):
    '''
    Trains two doc2vec models on the specified corpus, then trains a
    classifier on the concatenated document vectors.
    Runs a cross-validation algorithm (K-fold) in order to evaluate the
    quality of the overall model.
    Returns the best doc2vec models and trained pipeline (in terms of macro
    f-score), together with the aggregated scores.
    '''
    labels = get_labels(lang=authors[0]["lang"], label_type=label_type)

    if not (labels):
        abort_clean("Could not extract labels")

    if verbose:
        print("Labels extraction succeded.")
        print("Available labels : " + " / ".join(labels) + "\n")

    if verbose:
        t0 = time()
        print("Starting model Cross Validation ... (this may take some time)")

    # load doc2vec conf
    conf = []
    if config:
        conf = load_config(config)["extractors"][0]  # legacy conf files
        if verbose:
            print("loading doc2vec config file from disk :")
            print("  - vector_size = " +
                  str(conf["configuration"]["vector_size"]))
            print("  - window      = " + str(conf["configuration"]["window"]))
            print("  - min_count   = " +
                  str(conf["configuration"]["min_count"]))

    # load the tokenizer
    tknzr = Tokenizer(token_level)
    if verbose:
        print("Selected token level : " + token_level + "\n")

    # Kfold parameters.
    confusion = array([[0 for x in range(len(labels))]
                       for y in range(len(labels))])
    scores = []
    best_f_score = 0
    best_pipeline = None
    best_model = None
    scores_micro = []
    scores_macro = []
    n_run = 1
    k_fold = KFold(n_splits=10, shuffle=True)
    authors = array(authors)

    # start Kfold cross validation.
    for train_indices, test_indices in k_fold.split(authors):

        # import gensim lib (heavy load)
        from gensim import models as gensim_models

        # get doc2vec model
        model_dm = get_doc2vec(conf, 1, verbose)
        model_pv = get_doc2vec(conf, 0, verbose)

        # build train corpus
        train_authors = authors[train_indices]
        train_corpus = build_corpus(authors=train_authors,
                                    label_type=label_type,
                                    verbosity=verbose)

        # build test corpus
        test_authors = authors[test_indices]

        # learn the vocabulary (tokenisation of each tweet)
        tweets = list(zip(train_corpus["labels"], train_corpus["tweets"]))
        processed_tweets = []
        idxs = [0 for l in labels]
        for t in tweets:
            prefix = t[0] + "_" + str(idxs[labels.index(t[0])])
            idxs[labels.index(t[0])] += 1
            processed_tweets.append(
                gensim_models.doc2vec.LabeledSentence(words=tknzr.tokenize(
                    t[1]),
                                                      tags=[prefix]))
        tweets = processed_tweets
        model_dm.build_vocab(tweets)
        model_pv.build_vocab(tweets)

        # train doc2vec model
        shuffle(tweets)
        model_dm.train(sentences=tweets,
                       total_examples=model_dm.corpus_count,
                       epochs=100,
                       start_alpha=0.025,
                       end_alpha=0.0025)
        model_dm.delete_temporary_training_data()
        model_pv.train(sentences=tweets,
                       total_examples=model_pv.corpus_count,
                       epochs=100,
                       start_alpha=0.025,
                       end_alpha=0.0025)
        model_pv.delete_temporary_training_data()

        # train dataset conversion (doc->vectors)
        train_vectors = zeros((sum(idxs), model_dm.vector_size * 2))
        train_labels = []
        for i, tag in enumerate(model_dm.docvecs.doctags):
            train_vectors[i] = concatenate(
                (model_dm.docvecs[tag], model_pv.docvecs[tag]), axis=0)
            train_labels.append(tag.split('_')[0])
        train_labels = array(train_labels)

        # train classifier
        pipeline.fit(train_vectors, train_labels)

        # test models
        truthes = []
        predictions = []
        for author in test_authors:
            # test dataset conversion (doc->vectors)
            tweet_vectors = [
                concatenate((model_dm.infer_vector(tknzr.tokenize(tweet)),
                             model_pv.infer_vector(tknzr.tokenize(tweet))),
                            axis=0) for tweet in author["tweets"]
            ]

            author_tmp = {"tweets": tweet_vectors}
            var_classes, var_predictions = predict_author_proba(
                author=author_tmp, model=pipeline)
            var_max_idx = var_predictions.index(max(var_predictions))
            label_predicted = var_classes[var_max_idx]
            predictions.append(label_predicted)
            truthes.append(author[label_type])

        # compute metrics
        confusion += confusion_matrix(truthes, predictions, labels=labels)
        score_micro = f1_score(truthes,
                               predictions,
                               labels=labels,
                               average="micro")
        score_macro = f1_score(truthes,
                               predictions,
                               labels=labels,
                               average="macro")

        if verbose:
            print("Fold " + str(n_run) + ": micro_f1=" + str(score_micro) +
                  " macro_f1=" + str(score_macro))

        # store for avg
        scores_micro.append(score_micro)
        scores_macro.append(score_macro)
        n_run += 1

        # save the pipeline if better than the current one
        if score_macro > best_f_score:
            best_model = [model_dm, model_pv]
            best_pipeline = clone(pipeline, True)
            best_f_score = score_macro

    if verbose:
        print("Model Cross Validation complete in %.3f seconds.\n" %
              (time() - t0))

    scores = {
        "mean_score_micro": sum(scores_micro) / len(scores_micro),
        "mean_score_macro": sum(scores_macro) / len(scores_macro),
        "confusion_matrix": confusion,
        "best_macro_score": best_f_score,
        "labels": labels
    }

    return best_model, best_pipeline, scores
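
LabeledSentence was removed from newer gensim releases in favor of TaggedDocument; a minimal runnable sketch of the same tag-then-train-then-infer flow on toy data (gensim >= 4 assumed):

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

docs = [TaggedDocument(words=['good', 'tweet'], tags=['female_0']),
        TaggedDocument(words=['other', 'tweet'], tags=['male_0'])]
model = Doc2Vec(vector_size=16, window=2, min_count=1, dm=1)
model.build_vocab(docs)
model.train(docs, total_examples=model.corpus_count, epochs=20)
vec = model.infer_vector(['unseen', 'tweet'])   # vector for a new document
trained = model.dv['female_0']                  # vector learned for a tag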
Example #4
def optimize(options):
    '''
    Optimizes the given classifier and/or features extractor over a specified
    list of parameters.
    Will proceed as follows:
        - loads the dataset
        - builds the corpus
        - loads the parameters for tuning
        - loads the classifiers
        - loads the features extractors
        - builds the execution pipelines
        - trains and compares the different classifiers on the corpus
        - outputs the best set of parameters found
    '''

    #--------------------------------------------------------------------------
    # Check basic requirements
    if not (options["label-type"]):
        abort_clean("Label type not specified", "expected 'v' or 'g'")

    if not (options["hyper-parameters"]):
        abort_clean("hyper parameters not specified")

    if not (options["aggregation"]):
        abort_clean("Aggregation strategy not specified")

    #--------------------------------------------------------------------------
    # Load the tweets in one language for variety or gender classification
    Authors = parse_tweets_from_dir(input_dir=options["input-dir"],
                                    output_dir=options["processed-tweets-dir"],
                                    label=True,
                                    aggregation=options["aggregation"],
                                    verbosity_level=options["verbosity"])

    if not (Authors):
        abort_clean("Tweets loading failed")

    #--------------------------------------------------------------------------
    # Load the optimize parameters

    try:
        params = load_config(options["hyper-parameters"])
    except Exception:
        abort_clean("Configuration couldn't be loaded",
                    "given path: " + options["hyper-parameters"])

    #--------------------------------------------------------------------------
    # Load the classifier

    t0 = time()
    classifier = get_classifier(classifier_str=params["classifier-call"],
                                config=None,
                                verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Load the features extractors

    features_extr = get_features_extr(
        features_str_list=params["features-extractr-call"],
        verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Build the execution pipeline

    pipeline = get_pipeline(features_extr=features_extr,
                            classifier=classifier,
                            verbose=options["verbosity"])

    # Set the classifier and the parameters to be tuned
    tuning_parameters = get_opt_parameters(params)
    scores = params["scores"]

    if options["verbosity"]:
        print("Starting the optimization process ...")

    # Launch the tuning of hyper parameters
    for score in scores:
        print("Tuning hyper-parameters for %s" % score)

        optimize_corpus = build_corpus(authors=Authors,
                                       label_type=options["label-type"],
                                       verbosity=options["verbosity"])

        clf_optimizer = GridSearchCV(estimator=pipeline,
                                     param_grid=tuning_parameters,
                                     scoring='%s_macro' % score,
                                     fit_params=None,
                                     n_jobs=-1,
                                     pre_dispatch='2*n_jobs',
                                     iid=True,
                                     cv=None,
                                     refit=True,
                                     verbose=options["verbosity"],
                                     error_score='raise',
                                     return_train_score=True)

        # Start optimisation
        clf_optimizer.fit(optimize_corpus["tweets"], optimize_corpus["labels"])

        if options["verbosity"]:
            print("Best parameters set found on development set:")
            best_parameters = clf_optimizer.best_params_
            for param_name in sorted(best_parameters.keys()):
                print("\t%s: %r" % (param_name, best_parameters[param_name]))
            print()

        if options["verbosity"] > 1:
            print("Grid scores on development set:")
            means = clf_optimizer.cv_results_['mean_test_score']
            stds = clf_optimizer.cv_results_['std_test_score']
            for mean, std, params in zip(means, stds,
                                         clf_optimizer.cv_results_['params']):
                print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

        # saving results
        save_optimisation_results(grid=clf_optimizer,
                                  output_dir=options["output-dir"],
                                  score=score,
                                  verbose=options["verbosity"])
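
Note that the fit_params and iid arguments passed to GridSearchCV above were removed in later scikit-learn releases; a minimal sketch of the equivalent call against the current API (toy data and an illustrative parameter grid):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

pipeline = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])
param_grid = {'tfidf__ngram_range': [(1, 1), (1, 2)], 'clf__C': [0.1, 1.0]}
search = GridSearchCV(pipeline, param_grid, scoring='f1_macro', n_jobs=-1, cv=2)
search.fit(['a tweet', 'b tweet', 'a post', 'b post'], ['g1', 'g2', 'g1', 'g2'])
print(search.best_params_)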
Example #5
def train_model_cross_validation(authors, label_type, pipeline, verbose=1):
    '''
    Takes a pipeline and trains it on the specified corpus.
    Runs a cross-validation algorithm (K-fold) in order to evaluate the
    quality of the model.
    Returns the best trained pipeline (in terms of macro f-score) together
    with the aggregated scores.
    '''

    labels = get_labels(lang=authors[0]["lang"], label_type=label_type)

    if not (labels):
        abort_clean("Could not extract labels")
    if verbose:
        print("Labels extraction succeded.")
        print("Available labels : " + " / ".join(labels) + "\n")

    if verbose:
        t0 = time()
        print("Starting model Cross Validation ... (this may take some time)")

    confusion = array([[0 for x in range(len(labels))]
                       for y in range(len(labels))])
    scores = []
    best_f_score = 0
    best_pipeline = None
    scores_micro = []
    scores_macro = []

    # start Kfold cross validation.
    n_run = 1
    k_fold = KFold(n_splits=10, shuffle=True)
    authors = array(authors)
    for train_indices, test_indices in k_fold.split(authors):

        # build train corpus
        train_authors = authors[train_indices]
        train_corpus = build_corpus(authors=train_authors,
                                    label_type=label_type,
                                    verbosity=verbose)

        # build test corpus
        test_authors = authors[test_indices]

        # train model
        pipeline = train_model(corpus=train_corpus,
                               pipeline=pipeline,
                               verbose=0)

        # test model
        truthes = []
        predictions = []
        for author in test_authors:
            var_classes, var_predictions = predict_author_proba(author=author,
                                                                model=pipeline)
            var_max_idx = var_predictions.index(max(var_predictions))
            label_predicted = var_classes[var_max_idx]
            predictions.append(label_predicted)
            truthes.append(author[label_type])

        # compute metrics
        confusion += confusion_matrix(truthes, predictions, labels=labels)
        score_micro = f1_score(truthes,
                               predictions,
                               labels=labels,
                               average="micro")
        score_macro = f1_score(truthes,
                               predictions,
                               labels=labels,
                               average="macro")

        if verbose:
            print("Fold " + str(n_run) + ": micro_f1=" + str(score_micro) +
                  " macro_f1=" + str(score_macro))

        # store for avg
        scores_micro.append(score_micro)
        scores_macro.append(score_macro)
        n_run += 1

        # save the pipeline if better than the current one
        if score_macro > best_f_score:
            best_pipeline = clone(pipeline, True)
            best_f_score = score_macro

    if verbose:
        print("Model Cross Validation complete in %.3f seconds.\n" %
              (time() - t0))

    scores = {
        "mean_score_micro": sum(scores_micro) / len(scores_micro),
        "mean_score_macro": sum(scores_macro) / len(scores_macro),
        "confusion_matrix": confusion,
        "best_macro_score": best_f_score,
        "labels": labels
    }

    return best_pipeline, scores
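
The fold-level bookkeeping above (summing confusion matrices and averaging macro F1 across splits) can be isolated into a small runnable sketch; the data and classifier here are toy stand-ins:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import KFold

X, y = np.random.rand(40, 3), np.array(['a', 'b'] * 20)
labels = ['a', 'b']
confusion = np.zeros((len(labels), len(labels)), dtype=int)
scores_macro = []
for train_idx, test_idx in KFold(n_splits=10, shuffle=True).split(X):
    clf = LogisticRegression().fit(X[train_idx], y[train_idx])
    pred = clf.predict(X[test_idx])
    confusion += confusion_matrix(y[test_idx], pred, labels=labels)
    scores_macro.append(f1_score(y[test_idx], pred, labels=labels, average='macro'))
print(confusion)
print(sum(scores_macro) / len(scores_macro))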
Example #6
def train(options):
    '''
    Trains a specified classifier on a specified dataset using specified
    feature extractors.
    Will proceed as follows:
        - loads the dataset
        - builds the corpus
        - loads the classifier
        - loads the features extractor
        - builds the execution pipeline
        - trains the classifier on the corpus
        - cross-validates the resulting model [optional]
        - saves the resulting model [optional]
    '''

    #--------------------------------------------------------------------------
    # Check basic requirements
    if not (options["label-type"]):
        abort_clean("Labels not specified", "expected 'l', 'g' or 'v'")

    if not (options["features"]) and not (options["gensim"]):
        abort_clean("Features not specified")

    if not (options["classifier"]):
        abort_clean("Classifier not specified")

    if not (options["aggregation"]):
        abort_clean("Aggregation strategy not specified")

    #--------------------------------------------------------------------------
    # Load the tweets in one language for variety or gender classification
    Authors = parse_tweets_from_dir(input_dir=options["input-dir"],
                                    output_dir=options["processed-tweets-dir"],
                                    label=True,
                                    aggregation=options["aggregation"],
                                    verbosity_level=options["verbosity"])

    if not (Authors):
        abort_clean("Tweets loading failed")

    #--------------------------------------------------------------------------
    # Load the classifier

    t0 = time()
    classifier = get_classifier(classifier_str=options["classifier"][0],
                                config=None,
                                verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Load the features extractors

    features_extr = None
    if not (options["gensim"]):
        features_extr = get_features_extr(
            features_str_list=options["features"][0],
            verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Build the execution pipeline

    pipeline = get_pipeline(features_extr=features_extr,
                            classifier=classifier,
                            verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Train the execution pipeline

    # train and cross validate results
    if (options["cross-validation"]):
        if (options["verbosity"]):
            print("Model Training with cross validation\n")

        if options["gensim"]:
            model, pipeline, scores = train_model_gensim_cross_validation(
                authors=Authors,
                label_type=options["label-type"],
                pipeline=pipeline,
                config=options["hyper-parameters"],
                token_level=options["token-level"],
                verbose=options["verbosity"])
        else:
            pipeline, scores = train_model_cross_validation(
                authors=Authors,
                label_type=options["label-type"],
                pipeline=pipeline,
                verbose=options["verbosity"])

        if options["verbosity"]:
            print_scores(scores)
        if options["output-dir"]:
            if options["gensim"]:
                filename = str("doc2vec" + "-siz_" +
                               str(model[0].vector_size) + "-win_" +
                               str(model[0].window) + "-cnt_" +
                               str(model[0].min_count) +
                               get_classifier_name(classifier))
            else:
                filename = str(
                    get_features_extr_name(features_extr) + "+" +
                    get_classifier_name(classifier))
                save_scores(scores=scores,
                            output_dir=options["output-dir"],
                            filename=filename,
                            verbose=options["verbosity"])

    # train without validation --> output-dir required
    else:
        if options["verbosity"]:
            print("Model Training without cross validation\n")
        if not (options["output-dir"]):
            abort_clean("No output directory specified.",
                        "Training without persisting is not allowed")

        train_corpus = build_corpus(authors=Authors,
                                    label_type=options["label-type"],
                                    verbosity=options["verbosity"])

        pipeline = train_model(corpus=train_corpus,
                               pipeline=pipeline,
                               verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Save the resulting model
    if options["gensim"]:
        filename = "doc2vec+" + get_classifier_name(classifier)
    else:
        filename = str(
            get_features_extr_name(features_extr) + "+" +
            get_classifier_name(classifier))

        save_model(pipeline=pipeline,
                   output_dir=options["output-dir"],
                   filename=filename,
                   verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # End Execution
    if options["verbosity"]:
        print("Training task complete in " + str(round(time() - t0)) + " s")
Example #7
def main():
    parser = options.get_parser('Generator')
    options.add_dataset_args(parser)
    options.add_preprocessing_args(parser)
    options.add_model_args(parser)
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_generation_args(parser)
    
    args = parser.parse_args()
    print(args)
    
    args.cuda = not args.disable_cuda and torch.cuda.is_available()
    
    
    caseless = args.caseless
    batch_size = args.batch_size
    
    
    if os.path.isfile(args.load_checkpoint):
        print('Loading checkpoint file from {}...'.format(args.load_checkpoint))
        checkpoint_file = torch.load(args.load_checkpoint)
    else:
        print('No checkpoint file found: {}'.format(args.load_checkpoint))
        raise FileNotFoundError(args.load_checkpoint)
        
    train_raw_corpus, val_raw_corpus, test_raw_corpus = utils.load_corpus(args.processed_dir, ddi=True)
    test_corpus = [(line.sent, line.type, line.p1, line.p2) for line in test_raw_corpus]
    
    # preprocessing
    feature_map = checkpoint_file['f_map']
    target_map = checkpoint_file['t_map']
    test_features, test_targets = utils.build_corpus(test_corpus, feature_map, target_map, caseless)
    
    # construct test dataloader
    test_loader = utils.construct_bucket_dataloader(test_features, test_targets, feature_map['PAD'], batch_size, args.position_bound, is_train=False)
    
    # build model
    vocab_size = len(feature_map)
    tagset_size = len(target_map)
    model = utils.build_model(args, vocab_size, tagset_size)
    # loss
    criterion = utils.build_loss(args)
    
    # load states
    model.load_state_dict(checkpoint_file['state_dict'])
    
    # trainer
    trainer = SeqTrainer(args, model, criterion)
    
    if args.cuda:
        model.cuda()
    
    y_true, y_pred, att_weights = predict(trainer, test_loader, target_map, cuda=args.cuda)
    assert len(y_pred) == len(test_corpus), 'length of prediction is inconsistent with that of data set'
    # prediction
    print('Predicting...')
    # write result: sent_id|e1|e2|ddi|type
    with open(args.predict_file, 'w') as f:
        for tup, pred in zip(test_raw_corpus, y_pred):
            ddi = 0 if pred == 'null' else 1
            f.write('|'.join([tup.sent_id, tup.e1, tup.e2, str(ddi), pred]))
            f.write('\n')

    # error analysis
    print('Analyzing...')
    with open(args.error_file, 'w') as f:
        f.write(' | '.join(['sent_id', 'e1', 'e2', 'target', 'pred']))
        f.write('\n')
        for tup, target, pred, att_weight in zip(test_raw_corpus, y_true, y_pred, att_weights):
            if target != pred:
                size = len(tup.sent)
                f.write('{}\n'.format(' '.join(tup.sent)))
                if args.model != 'InterAttentionLSTM':
                    att_weight = [att_weight]
                for i in range(len(att_weight)):
                    f.write('{}\n'.format(' '.join(map(lambda x: str(round(x, 4)), att_weight[i][:size]))))
                f.write('{}\n\n'.format(' | '.join([tup.sent_id, tup.e1, tup.e2, target, pred])))
            
    # attention
    print('Writing attention scores...')
    with open(args.att_file, 'w') as f:
        f.write(' | '.join(['target', 'sent', 'att_weight']))
        f.write('\n')
        for tup, target, pred, att_weight in zip(test_raw_corpus, y_true, y_pred, att_weights):
            if target == pred and target != 'null':
                size = len(tup.sent)
                f.write('{}\n'.format(target))
                f.write('{}\n'.format(' '.join(tup.sent)))
                if args.model != 'InterAttentionLSTM':
                    att_weight = [att_weight]
                for i in range(len(att_weight)):
                    f.write('{}\n'.format(' '.join(map(lambda x: str(round(x, 4)), att_weight[i][:size]))))
Example #8
DATABASE = utils.get_file_path(cfg.DATABASE_FILE)
content = help_content.HelpContent(DATABASE)

should_rebuild = False

# ### Dictionary ###
dictionary = utils.build_dictionary(content, should_rebuild, cfg.DICT_BACKUP)

# ### Corpus ###
corpus = utils.build_corpus(dictionary, content, should_rebuild,
                            cfg.CORPUS_BACKUP)

# ### LDA Model ###
# `query` is assumed to be defined earlier in the (truncated) source file
bow = dictionary.doc2bow(utils.get_cleaned_text(query.lower()).split())
model = utils.build_model(dictionary, corpus, should_rebuild)
q_vec = model[bow]  # "query vector"
topic_details = model.print_topic(max(q_vec, key=lambda item: item[1])[0])

print('Dictionary Size = {}'.format(len(dictionary)))
print('Corpus Size = {}'.format(len(corpus)))
print('Topic Details: ')
print(topic_details)
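
The utils.build_* helpers above are project-specific wrappers; a minimal raw-gensim equivalent of the dictionary/corpus/LDA query flow looks like this (toy documents and an illustrative query):

from gensim.corpora import Dictionary
from gensim.models import LdaModel

texts = [['help', 'with', 'login'], ['reset', 'password', 'help']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]
model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2)
q_vec = model[dictionary.doc2bow(['password', 'help'])]   # query vector
best_topic = max(q_vec, key=lambda item: item[1])[0]
print(model.print_topic(best_topic))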
Example #9
                features_array.append(features)

    inflated_feats = []
    for feat_indices in features_array:
        one_hot = np.zeros(len(features_all))   # dense one-hot row
        for i in feat_indices:
            one_hot[i] = 1
        inflated_feats.append(one_hot)
    A = np.array(inflated_feats)
    return scipy.sparse.csr_matrix(A), np.array(labels), features_all


def save_model(clf, features):
    joblib.dump(clf, 'model.pkl')
    pickle.dump(features, open('feature.pkl', 'wb'))


if __name__ == '__main__':
    corpus_file = sys.argv[1]
    annot_file = sys.argv[2]
    annotations = read_annotation_files(annot_file)
    print "Read annotation files " + str(passed_time(start_time))
    sentences = ut.build_corpus(open(corpus_file, "r").read().split("\n"))
    print "Read the corpus " + str(passed_time(start_time))
    features, tags, all_features = build_datas(annotations, sentences)
    print "Build the features " + str(passed_time(start_time))
    clf = svm.LinearSVC()
    clf.fit(features, tags)
    save_model(clf, all_features)
    print "Saved model " + str(passed_time(start_time))
Example #10
def main():
    parser = options.get_parser('Generator')
    options.add_dataset_args(parser)
    options.add_preprocessing_args(parser)
    options.add_model_args(parser)
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_generation_args(parser)

    args = parser.parse_args()

    model_path = args.load_checkpoint + '.model'
    args_path = args.load_checkpoint + '.json'
    with open(args_path, 'r') as f:
        _args = json.load(f)['args']
    for k, v in _args.items():  # restore the training-time arguments
        setattr(args, k, v)

    args.cuda = not args.disable_cuda and torch.cuda.is_available()

    print(args)

    if args.cuda:
        torch.backends.cudnn.benchmark = True

    # increase recursion depth
    sys.setrecursionlimit(10000)

    # load dataset
    train_raw_corpus, val_raw_corpus, test_raw_corpus = utils.load_corpus(
        args.processed_dir, ddi=False)
    assert train_raw_corpus and val_raw_corpus and test_raw_corpus, 'Corpus not found, please run preprocess.py to obtain corpus!'
    train_corpus = [(line.sent, line.type, line.p1, line.p2)
                    for line in train_raw_corpus]
    val_corpus = [(line.sent, line.type, line.p1, line.p2)
                  for line in val_raw_corpus]

    caseless = args.caseless
    batch_size = args.batch_size

    # build vocab
    sents = [tup[0] for tup in train_corpus + val_corpus]
    feature_map = utils.build_vocab(sents,
                                    min_count=args.min_count,
                                    caseless=caseless)
    target_map = ddi2013.target_map

    # get class weights
    _, train_targets = utils.build_corpus(train_corpus, feature_map,
                                          target_map, caseless)
    class_weights = torch.Tensor(
        utils.get_class_weights(train_targets)) if args.class_weight else None

    # load datasets
    _, _, test_loader = utils.load_datasets(args.processed_dir,
                                            args.train_size,
                                            args,
                                            feature_map,
                                            dataloader=True)

    # build model
    vocab_size = len(feature_map)
    tagset_size = len(target_map)
    model = RelationTreeModel(vocab_size, tagset_size, args)

    # loss
    criterion = utils.build_loss(args, class_weights=class_weights)

    # load states
    assert os.path.isfile(model_path), "Checkpoint not found!"
    print('Loading checkpoint file from {}...'.format(model_path))
    checkpoint_file = torch.load(model_path)
    model.load_state_dict(checkpoint_file['state_dict'])

    # trainer
    trainer = TreeTrainer(args, model, criterion)

    # predict
    y_true, y_pred, treelists, f1_by_len = predict(trainer,
                                                   test_loader,
                                                   target_map,
                                                   cuda=args.cuda)

    # assign words to roots
    for tup, treelist in zip(test_raw_corpus, treelists):
        for t in treelist:
            t.idx = tup.sent[t.idx] if t.idx < len(tup.sent) else None

    # prediction
    print('Predicting...')
    # write result: sent_id|e1|e2|ddi|type
    with open(args.predict_file, 'w') as f:
        for tup, pred in zip(test_raw_corpus, y_pred):
            ddi = 0 if pred == 'null' else 1
            f.write('|'.join([tup.sent_id, tup.e1, tup.e2, str(ddi), pred]))
            f.write('\n')

    def print_info(f, tup, target, pred, root):
        f.write('{}\n'.format(' '.join(tup.sent)))
        f.write('{}\n'.format(' | '.join(
            [tup.sent_id, tup.e1, tup.e2, target, pred])))
        f.write('{}\n\n'.format(root))

    # error analysis
    print('Analyzing...')
    with open(args.error_file, 'w') as f:
        f.write(' | '.join(['sent_id', 'e1', 'e2', 'target', 'pred']))
        f.write('\n')
        for tup, target, pred, treelist in zip(test_raw_corpus, y_true, y_pred,
                                               treelists):
            if target != pred:
                print_info(f, tup, target, pred, treelist[-1])

    # correctly classified examples
    print('Writing correctly classified examples...')
    with open(args.correct_file, 'w') as f:
        f.write(' | '.join(['sent_id', 'e1', 'e2', 'target', 'pred']))
        f.write('\n')
        for tup, target, pred, treelist in zip(test_raw_corpus, y_true, y_pred,
                                               treelists):
            if target == pred and target != 'null':
                print_info(f, tup, target, pred, treelist[-1])
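
The checkpoint-args pattern at the top of this main() (argparse values saved in a JSON sidecar and merged back into the namespace) as a self-contained sketch; the paths and keys here are illustrative:

import argparse, json, os, tempfile

ckpt = os.path.join(tempfile.mkdtemp(), 'best')
with open(ckpt + '.json', 'w') as f:                 # stand-in sidecar file
    json.dump({'args': {'batch_size': 32, 'caseless': True}}, f)

args = argparse.Namespace(load_checkpoint=ckpt, disable_cuda=False)
with open(args.load_checkpoint + '.json') as f:
    saved = json.load(f)['args']
for k, v in saved.items():                           # merge saved training args
    setattr(args, k, v)
print(args.batch_size)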
Example #11
def main():
    parser = options.get_parser('Trainer')
    options.add_dataset_args(parser)
    options.add_preprocessing_args(parser)
    options.add_model_args(parser)
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    
    args = parser.parse_args()
    print(args)
    
    args.cuda = not args.disable_cuda and torch.cuda.is_available()
    torch.manual_seed(5)
    
    if args.cuda:
        torch.backends.cudnn.benchmark = True
    
    # increase recursion depth
    sys.setrecursionlimit(10000)
    # checkpoint
    checkpoint_dir = os.path.dirname(args.checkpoint)
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)
    
    # load dataset
    train_raw_corpus, val_raw_corpus, test_raw_corpus = utils.load_corpus(args.processed_dir, ddi=False)
    assert train_raw_corpus and val_raw_corpus and test_raw_corpus, 'Corpus not found, please run preprocess.py to obtain corpus!'
    train_corpus = [(line.sent, line.type, line.p1, line.p2) for line in train_raw_corpus]
    val_corpus = [(line.sent, line.type, line.p1, line.p2) for line in val_raw_corpus]    
    
    start_epoch = 0
    caseless = args.caseless
    batch_size = args.batch_size
    num_epoch = args.num_epoch
    
    # build vocab
    sents = [tup[0] for tup in train_corpus + val_corpus]
    feature_map = utils.build_vocab(sents, min_count=args.min_count, caseless=caseless)
    target_map = ddi2013.target_map
    
    # get class weights
    _, train_targets = utils.build_corpus(train_corpus, feature_map, target_map, caseless)
    class_weights = torch.Tensor(utils.get_class_weights(train_targets)) if args.class_weight else None
        
    train_loader, val_loader, test_loader = utils.load_datasets(args.processed_dir, args.train_size, args, feature_map, dataloader=True)            
    
    # build model
    vocab_size = len(feature_map)
    tagset_size = len(target_map)
    model = RelationTreeModel(vocab_size, tagset_size, args)
    
    # loss
    criterion = utils.build_loss(args, class_weights=class_weights)
    
    # load states
    if os.path.isfile(args.load_checkpoint):
        print('Loading checkpoint file from {}...'.format(args.load_checkpoint))
        checkpoint_file = torch.load(args.load_checkpoint)
        start_epoch = checkpoint_file['epoch'] + 1
        model.load_state_dict(checkpoint_file['state_dict'])
    #    optimizer.load_state_dict(checkpoint_file['optimizer'])
    else:
        print('No checkpoint file found: {}, training from scratch...'.format(args.load_checkpoint))
        if not args.rand_embedding:
            pretrained_word_embedding, in_doc_word_indices = utils.load_word_embedding(args.emb_file, feature_map, args.embedding_dim)
            print(pretrained_word_embedding.size())
            print(vocab_size)
            model.load_pretrained_embedding(pretrained_word_embedding)
            if args.disable_fine_tune:
                model.update_part_embedding(in_doc_word_indices) # update only non-pretrained words
        model.rand_init(init_embedding=args.rand_embedding)
    
    # trainer
    trainer = TreeTrainer(args, model, criterion)
    
    best_f1 = float('-inf')
    
    if os.path.isfile(args.load_checkpoint):
        dev_prec, dev_rec, dev_f1, _ = evaluate(trainer, val_loader, target_map, cuda=args.cuda)
        test_prec, test_rec, test_f1, _ = evaluate(trainer, test_loader, target_map, cuda=args.cuda)
        best_f1 = dev_f1
        print('checkpoint dev_prec: {:.4f}, dev_rec: {:.4f}, dev_f1: {:.4f}, test_prec: {:.4f}, test_rec: {:.4f}, test_f1: {:.4f}'.format(
            dev_prec, dev_rec, dev_f1, test_prec, test_rec, test_f1))
        
    track_list = []
    
    patience_count = 0
    start_time = time.time()
    for epoch in range(start_epoch, num_epoch):
        epoch_loss = train(train_loader, trainer, epoch)

        # update lr
        trainer.lr_step(epoch_loss)
        
        dev_prec, dev_rec, dev_f1, dev_loss = evaluate(trainer, val_loader, target_map, cuda=args.cuda)
        test_prec, test_rec, test_f1, _ = evaluate(trainer, test_loader, target_map, cuda=args.cuda)
        if dev_f1 >= best_f1:
            patience_count = 0
            best_f1 = dev_f1
    
            track_list.append({'epoch': epoch, 'loss': epoch_loss, 
                'dev_prec': dev_prec, 'dev_rec': dev_rec, 'dev_f1': dev_f1, 'dev_loss': dev_loss, 
                'test_prec': test_prec, 'test_rec': test_rec, 'test_f1': test_f1})
            print('epoch: {}, loss: {:.4f}, dev_f1: {:.4f}, dev_loss: {:.4f}, test_f1: {:.4f}\tsaving...'.format(epoch, epoch_loss, dev_f1, dev_loss, test_f1))
    
            try:
                utils.save_checkpoint({
                            'epoch': epoch,
                            'state_dict': model.state_dict(),
                            'optimizer': trainer.optimizer.state_dict(),
                            'f_map': feature_map,
                            't_map': target_map,
                        }, {'track_list': track_list,
                            'args': vars(args)
                            }, args.checkpoint)
            except Exception as inst:
                print(inst)
        else:
            patience_count += 1
            track_list.append({'epoch': epoch,'loss': epoch_loss, 'dev_prec': dev_prec, 'dev_rec': dev_rec, 'dev_f1': dev_f1, 'dev_loss': dev_loss})
            print('epoch: {}, loss: {:.4f}, dev_f1: {:.4f}, dev_loss: {:.4f}, test_f1: {:.4f}'.format(epoch, epoch_loss, dev_f1, dev_loss, test_f1))
    
        print('epoch: {} of {}, elapsed: {:.2f} s'.format(epoch, args.num_epoch, time.time() - start_time))
        if patience_count >= args.patience:
            break