import sys
import time
import codecs

import nltk
import vocabulary
from datetime import datetime
from lda import lda_gibbs_sampling
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.datasets import dump_svmlight_file, load_svmlight_file
from sklearn.utils import shuffle
from functions import *

path2training = sys.argv[1]
training = codecs.open(path2training, 'r', encoding='utf8').read().splitlines()

topics = int(sys.argv[2])
alpha, beta = 0.5 / float(topics), 0.5 / float(topics)

voca_en = vocabulary.Vocabulary(set(nltk.corpus.stopwords.words('english')),
                                WordNetLemmatizer(),
                                excluds_stopwords=True)

ldaTrainingData = change_raw_2_lda_input(training, voca_en, True)
ldaTrainingData = voca_en.cut_low_freq(ldaTrainingData, 1)

classificationData, y = load_classification_data(sys.argv[3], sys.argv[4])
classificationData = change_raw_2_lda_input(classificationData, voca_en, False)
classificationData = voca_en.cut_low_freq(classificationData, 1)

iterations = 201
start = time.time()
final_acc, final_mif, final_perpl, final_ar, final_nmi, final_p, final_r, final_f = [], [], [], [], [], [], [], []
for j in range(5):
    perpl, cnt, acc, mif, ar, nmi, p, r, f = [], 0, [], [], [], [], [], [], []
    lda = lda_gibbs_sampling(K=topics,
Example #2
def run_train(args):
    if args.numpy_seed is not None:
        print("Setting numpy random seed to {}...".format(args.numpy_seed))
        np.random.seed(args.numpy_seed)
        torch.manual_seed(args.numpy_seed)

    print("Loading training trees from {}...".format(args.train_path))
    train_treebank = trees.load_trees(args.train_path)
    print("Loaded {:,} training examples.".format(len(train_treebank)))

    print("Loading development trees from {}...".format(args.dev_path))
    dev_treebank = trees.load_trees(args.dev_path)
    print("Loaded {:,} development examples.".format(len(dev_treebank)))

    print("Processing trees for training...")
    train_parse = [tree.convert() for tree in train_treebank]

    print("Constructing vocabularies...")

    tag_vocab = vocabulary.Vocabulary()
    tag_vocab.index(parse.START)
    tag_vocab.index(parse.STOP)

    word_vocab = vocabulary.Vocabulary()
    word_vocab.index(parse.START)
    word_vocab.index(parse.STOP)
    word_vocab.index(parse.UNK)

    label_vocab = vocabulary.Vocabulary()
    label_vocab.index(())

    for tree in train_parse:
        nodes = [tree]
        while nodes:
            node = nodes.pop()
            if isinstance(node, trees.InternalParseNode):
                label_vocab.index(node.label)
                nodes.extend(reversed(node.children))
            else:
                tag_vocab.index(node.tag)
                word_vocab.index(node.word)

    tag_vocab.freeze()
    word_vocab.freeze()
    label_vocab.freeze()

    def print_vocabulary(name, vocab):
        special = {parse.START, parse.STOP, parse.UNK}
        print("{} ({:,}): {}".format(
            name, vocab.size,
            sorted(value for value in vocab.values if value in special) +
            sorted(value for value in vocab.values if value not in special)))

    if args.print_vocabs:
        print_vocabulary("Tag", tag_vocab)
        print_vocabulary("Word", word_vocab)
        print_vocabulary("Label", label_vocab)

    print("Initializing model...")
    # model = dy.ParameterCollection()
    if args.parser_type == "top-down":
        parser = parse.TopDownParser(
            # model,
            tag_vocab,
            word_vocab,
            label_vocab,
            args.tag_embedding_dim,
            args.word_embedding_dim,
            args.lstm_layers,
            args.lstm_dim,
            args.label_hidden_dim,
            args.split_hidden_dim,
            args.dropout,
        )
    # else:
    #     parser = parse.ChartParser(
    #         model,
    #         tag_vocab,
    #         word_vocab,
    #         label_vocab,
    #         args.tag_embedding_dim,
    #         args.word_embedding_dim,
    #         args.lstm_layers,
    #         args.lstm_dim,
    #         args.label_hidden_dim,
    #         args.dropout,
    #     )
    # trainer = dy.AdamTrainer(model)
    optimizer = torch.optim.Adam(parser.parameters())

    total_processed = 0
    current_processed = 0
    check_every = len(train_parse) / args.checks_per_epoch
    best_dev_fscore = -np.inf
    best_dev_model_path = None

    start_time = time.time()

    def check_dev():
        nonlocal best_dev_fscore
        nonlocal best_dev_model_path

        dev_start_time = time.time()

        dev_predicted = []
        for tree in dev_treebank:
            # dy.renew_cg()
            parser.eval()
            sentence = [(leaf.tag, leaf.word) for leaf in tree.leaves()]
            predicted, _ = parser.parse(sentence)
            dev_predicted.append(predicted.convert())

        dev_fscore = evaluate.evalb(args.evalb_dir, dev_treebank,
                                    dev_predicted)

        print("dev-fscore {} "
              "dev-elapsed {} "
              "total-elapsed {}".format(
                  dev_fscore,
                  format_elapsed(dev_start_time),
                  format_elapsed(start_time),
              ))

        if dev_fscore.fscore > best_dev_fscore:
            if best_dev_model_path is not None:
                # for ext in [".data", ".meta"]:
                #     path = best_dev_model_path + ext
                #     if os.path.exists(path):
                #         print("Removing previous model file {}...".format(path))
                #         os.remove(path)
                path = best_dev_model_path
                if os.path.exists(path):
                    print("Removing previous model file {}...".format(path))
                    os.remove(path)

            best_dev_fscore = dev_fscore.fscore
            # best_dev_model_path = "{}_dev={:.2f}".format(
            best_dev_model_path = "{}_dev={:.2f}.pth".format(
                args.model_path_base, dev_fscore.fscore)
            print("Saving new best model to {}...".format(best_dev_model_path))
            # dy.save(best_dev_model_path, [parser])
            torch.save(parser, best_dev_model_path)

    for epoch in itertools.count(start=1):
        if args.epochs is not None and epoch > args.epochs:
            break

        np.random.shuffle(train_parse)
        epoch_start_time = time.time()

        for start_index in range(0, len(train_parse), args.batch_size):
            # dy.renew_cg()
            optimizer.zero_grad()
            parser.train()
            batch_losses = []
            for tree in train_parse[start_index:start_index + args.batch_size]:
                sentence = [(leaf.tag, leaf.word) for leaf in tree.leaves()]
                if args.parser_type == "top-down":
                    _, loss = parser.parse(sentence, tree, args.explore)
                # else:
                #     _, loss = parser.parse(sentence, tree)
                batch_losses.append(loss)
                total_processed += 1
                current_processed += 1

            # batch_loss = dy.average(batch_losses)
            # batch_loss_value = batch_loss.scalar_value()
            batch_loss = torch.stack(batch_losses).mean()
            assert batch_loss.numel() == 1
            batch_loss_value = batch_loss.item()
            batch_loss.backward()
            # trainer.update()
            optimizer.step()

            print("epoch {:,} "
                  "batch {:,}/{:,} "
                  "processed {:,} "
                  "batch-loss {:.4f} "
                  "epoch-elapsed {} "
                  "total-elapsed {}".format(
                      epoch,
                      start_index // args.batch_size + 1,
                      int(np.ceil(len(train_parse) / args.batch_size)),
                      total_processed,
                      batch_loss_value,
                      format_elapsed(epoch_start_time),
                      format_elapsed(start_time),
                  ))

            if current_processed >= check_every:
                current_processed -= check_every
                check_dev()
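Note: Examples #2, #7, and #11 all build vocabularies through the same `index`/`freeze`/`size`/`values` interface. The following is a minimal, self-contained sketch of that interface inferred purely from how it is used in these snippets; it is not the project's actual `vocabulary` module.

class VocabularySketch(object):
    """Minimal stand-in inferred from the index/freeze/size/values usage above."""

    def __init__(self):
        self.frozen = False
        self.values = []      # position -> value
        self._indices = {}    # value -> position

    def index(self, value):
        # Register the value (before freezing) and return its integer id.
        if value not in self._indices:
            assert not self.frozen, "cannot add to a frozen vocabulary"
            self._indices[value] = len(self.values)
            self.values.append(value)
        return self._indices[value]

    def freeze(self):
        self.frozen = True

    @property
    def size(self):
        return len(self.values)


# Toy usage mirroring the tag-vocabulary construction in Example #2.
tag_vocab = VocabularySketch()
tag_vocab.index("<START>")
tag_vocab.index("<STOP>")
tag_vocab.freeze()
print(tag_vocab.size, tag_vocab.values)   # 2 ['<START>', '<STOP>']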
Example #3
import vocabulary
import inference_wrapper
import configuration
import h5py
import tensorflow as tf
import caption_generator
import math
import json

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

conf = configuration.MyConfig()

vocab = vocabulary.Vocabulary("data/dic.txt")
file = h5py.File("data/feat.hdf5", 'r')
encoded_images = file['valid_set']
valid_list_file = "data/valid_list.txt"
train_step = conf.train_step
checkpoint_steps = conf.original_train_steps + (train_step -
                                                1) * conf.interval_train_steps

check_point_path = "train_log/{}.ckpt".format(checkpoint_steps)

model = inference_wrapper.InferenceWrapper()
restore_fn = model.build_graph_from_config(configuration.ModelConfig(),
                                           check_point_path)

sess = tf.InteractiveSession()
restore_fn(sess)
Example #4
def _read_training_classification_data(file_in):
    """Read the data for classification"""
    with open(file_in, 'r') as f:
        # (1) Class Number
        class_num = int(f.readline()[:-1])
        # (2) Vocabularies
        #     NP1
        np1_num = int(f.readline()[:-1])
        np1_voc = vocabulary.Vocabulary()
        for i in range(0, np1_num):
            np1_voc.add(f.readline()[:-1])
        f.readline()  # Blank line
        #     VP
        vp_num = int(f.readline()[:-1])
        vp_voc = vocabulary.Vocabulary()
        for i in range(0, vp_num):
            vp_voc.add(f.readline()[:-1])
        f.readline()  # Blank line
        #     NP2
        np2_num = int(f.readline()[:-1])
        np2_voc = vocabulary.Vocabulary()
        for i in range(0, np2_num):
            np2_voc.add(f.readline()[:-1])
        f.readline()  # Blank line
        # (3) Priors for classes
        if class_num == int(f.readline()[:-1]):
            class_prior_prob = probability.Probability(1, class_num)
            for i in range(0, class_num):
                class_prior_prob.set_value(0, i, float(f.readline()[:-1]))
        else:
            print('Class Number Does Not Match in File {0}'.format(file_in))
        f.readline()  # Blank line
        # (4) Vocabularies' Probability
        #     NP1
        if np1_num == int(f.readline()[:-1]):
            np1_prob = probability.Probability(np1_num, class_num)
            for i in range(0, np1_num):
                f.readline()  # class number
                for j in range(0, class_num):
                    np1_prob.set_value(i, j, float(f.readline()[:-1]))
                f.readline()  # Blank line
        else:
            print('NP1 Number Does Not Match in File {0}'.format(file_in))
        f.readline()  # Blank line
        #     VP
        if vp_num == int(f.readline()[:-1]):
            vp_prob = probability.Probability(vp_num, class_num)
            for i in range(0, vp_num):
                f.readline()  # class number
                for j in range(0, class_num):
                    vp_prob.set_value(i, j, float(f.readline()[:-1]))
                f.readline()  # Blank line
        else:
            print('VP Number Does Not Match in File {0}'.format(file_in))
        f.readline()  # Blank line
        #     NP2
        if np2_num == int(f.readline()[:-1]):
            np2_prob = probability.Probability(np2_num, class_num)
            for i in range(0, np2_num):
                f.readline()  # class number
                for j in range(0, class_num):
                    np2_prob.set_value(i, j, float(f.readline()[:-1]))
                f.readline()  # Blank line
        else:
            print('NP2 Number Does Not Match in File {0}'.format(file_in))
        f.readline()  # Blank line
        # (5) Classes' Transition Matrix
        if class_num == int(f.readline()[:-1]):
            transition_prob = probability.Probability(
                class_num, class_num)  # current class given previous class
            for i in range(0, class_num):
                f.readline()  # Class number
                for j in range(0, class_num):
                    transition_prob.set_value(i, j, float(f.readline()[:-1]))
                f.readline()  # Blank line

            f.readline()
        else:
            print('Class Number Does Not Match in File {0}'.format(file_in))
        # (6) Length Distribution
        #  Currently not calculated. To be added.
    return [
        class_num, np1_voc, vp_voc, np2_voc, np1_prob, vp_prob, np2_prob,
        class_prior_prob, transition_prob
    ]
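The reader above assumes a plain-text layout of repeated blocks: a count line, then that many entries, then a blank separator line. A small self-contained check of just that vocabulary-block idiom on toy data (the snippet's `vocabulary`/`probability` classes are not involved):

import io

# Toy NP1 block in the expected layout: count, entries, blank separator.
toy_block = "3\ncat\ndog\nbird\n\n"

f = io.StringIO(toy_block)
np1_num = int(f.readline()[:-1])                     # "[:-1]" strips the newline, as above
np1_words = [f.readline()[:-1] for _ in range(np1_num)]
f.readline()                                         # consume the blank separator line
print(np1_num, np1_words)                            # 3 ['cat', 'dog', 'bird']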
Example #5
def loadAndPreprocessData():
    '''
	Read all the words to create a vocabulary
	'''
    all_tokens = []
    indir = '../preprocess/subset/'

    for root, dirs, filenames in os.walk(indir):
        for filename in filenames:
            if filename.startswith('canonicalized_words_'):
                with open(indir + filename, 'r') as f:
                    for line in f.readlines():
                        w = line.rstrip()
                        if w != '':
                            all_tokens.append(w)
    print 'Processed all tokens: ', len(all_tokens)

    tokens_dict = Counter()
    for w in all_tokens:
        if w.startswith('DG') and w.endswith('DG'):
            w = 'DG'
        tokens_dict[w] += 1
    '''
	Remove noisy tokens - see notebook for exploratory analysis
	The first ~2500 tokens when sorted by key are noisy like "!!!!" or "* * * *" - for eg, the end of a chapter
	'''
    noisy_tokens = sorted(tokens_dict)[0:2507]
    print 'Identified noisy tokens - some examples: ', noisy_tokens[0:30]
    '''
	Clean up the tokens now that we know the noisy tokens and then generate the vocab
	'''
    noisy_tokens = set(noisy_tokens)
    words = [w for w in all_tokens if w not in noisy_tokens]
    # TODO: Should make V configurable
    V = 50000
    vocab = vocabulary.Vocabulary((word for word in words), size=V)
    print 'Vocabulary created with size: ', vocab.size
    '''
	Read in the sentences already parsed from the ~3000 books Gutenberg subset
	'''
    sents = []
    indir = '../preprocess/subset/'
    books = []
    for root, dirs, filenames in os.walk(indir):
        for filename in filenames:
            if filename.startswith('parsed_sents_'):
                with open(indir + filename, 'r') as f:
                    for line in f.readlines():
                        sents.append(line.rstrip())
    print 'Parsed sentences loaded into memory: ', len(sents)
    print 'The 10,000th sentence is: ', sents[10000]
    '''
	Prepare training and test sentences
	'''
    split = 0.8
    shuffle = True

    sentences = np.array(sents, dtype=object)
    fmt = (len(sentences), sum(map(len, sentences)))
    print "Loaded %d sentences (%g tokens)" % fmt

    if shuffle:
        rng = np.random.RandomState(shuffle)
        rng.shuffle(sentences)  # in-place
    train_frac = 0.8
    split_idx = int(train_frac * len(sentences))
    train_sentences = sentences[:split_idx]
    test_sentences = sentences[split_idx:]

    fmt = (len(train_sentences), sum(map(len, train_sentences)))
    print "Training set: %d sentences (%d tokens)" % fmt
    fmt = (len(test_sentences), sum(map(len, test_sentences)))
    print "Test set: %d sentences (%d tokens)" % fmt
    '''
	Apply the vocab to the train and test sentences and convert words to ids to start training
	'''
    ## Preprocess sentences
    ## convert words to ids based on the vocab wordset created above
    ## Do this in batches to avoid crashes due to insufficient memory
    batch_size = 50000
    num_of_batches = int(round(len(train_sentences) / batch_size))
    print 'Preprocessing train sentences - number of batches: ', num_of_batches
    train_id_batches = []
    start = 0
    end = start + batch_size
    for i in range(num_of_batches):
        if i % 15 == 0:
            print 'Completed Batches: ', i
        train_id_batches.append(
            utils.preprocess_sentences(train_sentences[start:end], vocab))
        start = end
        end += batch_size
    # flatten the lists for 1D tensor
    temp = utils.flatten(train_id_batches)
    train_ids = utils.flatten(temp)
    train_ids = np.array(train_ids)
    print 'Train sentences converted to their IDs including start, end token and unknown word token'

    # repeat the same with test data
    batch_size = 50000
    num_of_batches = int(round(len(test_sentences) / batch_size))
    if num_of_batches > 10:
        num_of_batches = 10
    print 'Preprocessing test sentences - number of batches: ', num_of_batches
    test_id_batches = []
    start = 0
    end = start + batch_size
    for i in range(num_of_batches):
        print 'Batch: ', i
        test_id_batches.append(
            utils.preprocess_sentences(test_sentences[start:end], vocab))
        start = end
        end += batch_size
    test_ids = utils.flatten(utils.flatten(test_id_batches))
    test_ids = np.array(test_ids)
    print 'Test sentences converted to their IDs including start, end token and unknown word token'
    max_time = 40
    batch_size = 64
    learning_rate = 0.01
    num_epochs = 3

    # Model parameters
    model_params = dict(V=vocab.size, H=100, softmax_ns=200, num_layers=1)

    TF_SAVEDIR = "tf_saved"
    checkpoint_filename = os.path.join(TF_SAVEDIR, "rnnlm")
    trained_filename = os.path.join(TF_SAVEDIR, "rnnlm_trained")
    # Will print status every this many seconds
    print_interval = 120

    # Clear old log directory
    shutil.rmtree("tf_summaries", ignore_errors=True)

    lm = rnnlm.RNNLM(**model_params)
    lm.BuildCoreGraph()
    lm.BuildTrainGraph()

    # Explicitly add global initializer and variable saver to LM graph
    with lm.graph.as_default():
        initializer = tf.global_variables_initializer()
        saver = tf.train.Saver()

    # Clear old log directory
    shutil.rmtree(TF_SAVEDIR, ignore_errors=True)
    if not os.path.isdir(TF_SAVEDIR):
        os.makedirs(TF_SAVEDIR)

    with tf.Session(graph=lm.graph) as session:
        # Seed RNG for repeatability
        tf.set_random_seed(42)
        session.run(initializer)
        bi = utils.batch_generator(train_ids, batch_size, max_time)
        for epoch in xrange(1, num_epochs + 1):
            t0_epoch = time.time()
            #bi = utils.batch_generator(train_ids, batch_size, max_time)
            print "[epoch %d] Starting epoch %d" % (epoch, epoch)
            #### YOUR CODE HERE ####
            # Run a training epoch.

            run_epoch(lm, session, bi, train=True, learning_rate=learning_rate)

            #### END(YOUR CODE) ####
            print "[epoch %d] Completed in %s" % (
                epoch, utils.pretty_timedelta(since=t0_epoch))

            # Save a checkpoint
            saver.save(session, checkpoint_filename, global_step=epoch)
            ##
            # score_dataset will run a forward pass over the entire dataset
            # and report perplexity scores. This can be slow (around 1/2 to
            # 1/4 as long as a full epoch), so you may want to comment it out
            # to speed up training on a slow machine. Be sure to run it at the
            # end to evaluate your score.
            print("[epoch %d]" % epoch), score_dataset(lm,
                                                       session,
                                                       train_ids,
                                                       name="Train set")
            print("[epoch %d]" % epoch), score_dataset(lm,
                                                       session,
                                                       test_ids,
                                                       name="Test set")
            print ""
        # Save final model
        saver.save(session, trained_filename)
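`utils.batch_generator` is not shown in this example. As an assumption about its behaviour (fixed-shape `[batch_size, max_time]` id windows with next-word targets, the usual RNN language-model batching), a minimal sketch might look like this; the repo's actual helper may differ:

import numpy as np

def batch_generator_sketch(ids, batch_size, max_time):
    """Yield (input, target) id windows shaped [batch_size, <=max_time]."""
    ids = np.asarray(ids)
    # Trim so the ids split evenly into batch_size parallel streams.
    n = (len(ids) - 1) // batch_size * batch_size
    inputs = ids[:n].reshape(batch_size, -1)
    targets = ids[1:n + 1].reshape(batch_size, -1)   # next-word targets
    for start in range(0, inputs.shape[1], max_time):
        yield (inputs[:, start:start + max_time],
               targets[:, start:start + max_time])

# Toy usage: 21 ids split into 4 streams, windows of 2 time steps.
for x, y in batch_generator_sketch(np.arange(21), batch_size=4, max_time=2):
    print(x.shape, y.shape)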
Example #6
def main(config):
    count_words(config, False)
    preprocess_names(config)
    count_words(config, True)
    vocabulary.Vocabulary(config, load=False).write()
    prep_data(config)
Example #7
def run_train(args):

    args.numpy_seed = seed
    if args.numpy_seed is not None:
        print("Setting numpy random seed to {}...".format(args.numpy_seed))
        np.random.seed(args.numpy_seed)

    if args.trial == 1:
        args.train_path = 'data/trial.txt'
        args.dev_path = 'data/trial.txt'
        args.test_path = 'data/trial.txt'

    # args.train_path = args.train_path.replace('[*]', args.treetype)
    # args.dev_path = args.dev_path.replace('[*]', args.treetype)
    # args.test_path = args.test_path.replace('[*]', args.treetype)

    print("Loading training trees from {}...".format(args.train_path))
    train_chunk_insts = util.read_chunks(args.train_path, args.normal)
    print("Loaded {:,} training examples.".format(len(train_chunk_insts)))

    print("Loading development trees from {}...".format(args.dev_path))
    dev_chunk_insts = util.read_chunks(args.dev_path, args.normal)
    print("Loaded {:,} development examples.".format(len(dev_chunk_insts)))

    print("Loading test trees from {}...".format(args.test_path))
    test_chunk_insts = util.read_chunks(args.test_path, args.normal)
    print("Loaded {:,} test examples.".format(len(test_chunk_insts)))

    # print("Processing trees for training...")
    # train_parse = [tree.convert() for tree in train_treebank]

    print("Constructing vocabularies...")

    tag_vocab = vocabulary.Vocabulary()
    tag_vocab.index(parse.START)
    tag_vocab.index(parse.STOP)
    tag_vocab.index(parse.XX)

    word_vocab = vocabulary.Vocabulary()
    word_vocab.index(parse.START)
    word_vocab.index(parse.STOP)
    word_vocab.index(parse.UNK)
    word_vocab.index(parse.NUM)

    for x, chunks in train_chunk_insts + dev_chunk_insts + test_chunk_insts:
        for ch in x:
            word_vocab.index(ch)

    label_vocab = vocabulary.Vocabulary()
    label_vocab.index(())

    label_list = util.load_label_list(args.labellist_path)  #'data/labels.txt')
    for item in label_list:
        label_vocab.index((item, ))

    if args.nontlabelstyle != 1:
        for item in label_list:
            label_vocab.index((item + "'", ))

    if args.nontlabelstyle == 1:
        label_vocab.index((parse.EMPTY, ))

    tag_vocab.freeze()
    word_vocab.freeze()
    label_vocab.freeze()

    latent_tree = latent.latent_tree_builder(label_vocab, args.RBTlabel,
                                             args.nontlabelstyle)

    def print_vocabulary(name, vocab):
        special = {parse.START, parse.STOP, parse.UNK}
        print("{} ({:,}): {}".format(
            name, vocab.size,
            sorted(value for value in vocab.values if value in special) +
            sorted(value for value in vocab.values if value not in special)))

    if args.print_vocabs:
        print_vocabulary("Tag", tag_vocab)
        print_vocabulary("Word", word_vocab)
        print_vocabulary("Label", label_vocab)

    print("Initializing model...")

    pretrain = {'giga': 'data/giga.vec100', 'none': 'none'}
    pretrainemb = util.load_pretrain(pretrain[args.pretrainemb],
                                     args.word_embedding_dim, word_vocab)

    model = dy.ParameterCollection()
    if args.parser_type == "chartdyRBTC":
        parser = parse.ChartDynamicRBTConstraintParser(
            model,
            tag_vocab,
            word_vocab,
            label_vocab,
            args.tag_embedding_dim,
            args.word_embedding_dim,
            args.lstm_layers,
            args.lstm_dim,
            args.label_hidden_dim,
            args.dropout,
            (args.pretrainemb, pretrainemb),
            args.chunkencoding,
            args.trainc == 1,
            True,
            (args.zerocostchunk == 1),
        )

    else:
        print('Model is not valid!')
        exit()

    if args.loadmodel != 'none':
        tmp = dy.load(args.loadmodel, model)
        parser = tmp[0]
        print('Model is loaded from ', args.loadmodel)

    trainer = dy.AdamTrainer(model)

    total_processed = 0
    current_processed = 0
    check_every = len(train_chunk_insts) / args.checks_per_epoch
    best_dev_fscore = -np.inf
    best_dev_model_path = None

    start_time = time.time()

    def check_dev():
        nonlocal best_dev_fscore
        nonlocal best_dev_model_path

        dev_start_time = time.time()

        dev_predicted = []
        #dev_gold = []

        #dev_gold = latent_tree.build_latent_trees(dev_chunk_insts)
        dev_gold = []
        for inst in dev_chunk_insts:
            chunks = util.inst2chunks(inst)
            dev_gold.append(chunks)

        for x, chunks in dev_chunk_insts:
            dy.renew_cg()
            #sentence = [(leaf.tag, leaf.word) for leaf in tree.leaves()]
            sentence = [(parse.XX, ch) for ch in x]
            predicted, _ = parser.parse(sentence)
            dev_predicted.append(predicted.convert().to_chunks())

        #dev_fscore = evaluate.evalb(args.evalb_dir, dev_gold, dev_predicted, args.expname + '.dev.') #evalb
        dev_fscore = evaluate.eval_chunks2(args.evalb_dir,
                                           dev_gold,
                                           dev_predicted,
                                           output_filename=args.expname +
                                           '.dev.txt')  # evalb

        print("dev-fscore {} "
              "dev-elapsed {} "
              "total-elapsed {}".format(
                  dev_fscore,
                  format_elapsed(dev_start_time),
                  format_elapsed(start_time),
              ))

        if dev_fscore.fscore > best_dev_fscore:
            if best_dev_model_path is not None:
                for ext in [".data", ".meta"]:
                    path = best_dev_model_path + ext
                    if os.path.exists(path):
                        print(
                            "Removing previous model file {}...".format(path))
                        os.remove(path)

            best_dev_fscore = dev_fscore.fscore
            best_dev_model_path = "{}_dev={:.2f}".format(
                args.model_path_base + "_" + args.expname, dev_fscore.fscore)
            print("Saving new best model to {}...".format(best_dev_model_path))
            dy.save(best_dev_model_path, [parser])

            test_start_time = time.time()
            test_predicted = []
            #test_gold = latent_tree.build_latent_trees(test_chunk_insts)
            test_gold = []
            for inst in test_chunk_insts:
                chunks = util.inst2chunks(inst)
                test_gold.append(chunks)

            ftreelog = open(args.expname + '.test.predtree.txt',
                            'w',
                            encoding='utf-8')

            for x, chunks in test_chunk_insts:
                dy.renew_cg()
                #sentence = [(leaf.tag, leaf.word) for leaf in tree.leaves()]
                sentence = [(parse.XX, ch) for ch in x]
                predicted, _ = parser.parse(sentence)
                pred_tree = predicted.convert()
                ftreelog.write(pred_tree.linearize() + '\n')
                test_predicted.append(pred_tree.to_chunks())

            ftreelog.close()

            #test_fscore = evaluate.evalb(args.evalb_dir, test_chunk_insts, test_predicted, args.expname + '.test.')
            test_fscore = evaluate.eval_chunks2(args.evalb_dir,
                                                test_gold,
                                                test_predicted,
                                                output_filename=args.expname +
                                                '.test.txt')  # evalb

            print("epoch {:,} "
                  "test-fscore {} "
                  "test-elapsed {} "
                  "total-elapsed {}".format(
                      epoch,
                      test_fscore,
                      format_elapsed(test_start_time),
                      format_elapsed(start_time),
                  ))

    train_trees = latent_tree.build_dynamicRBT_trees(train_chunk_insts)
    train_trees = [(x, tree.convert(), chunks, latentscope)
                   for x, tree, chunks, latentscope in train_trees]

    for epoch in itertools.count(start=1):
        if args.epochs is not None and epoch > args.epochs:
            break

        np.random.shuffle(train_chunk_insts)
        epoch_start_time = time.time()

        for start_index in range(0, len(train_chunk_insts), args.batch_size):
            dy.renew_cg()
            batch_losses = []

            for x, tree, chunks, latentscope in train_trees[
                    start_index:start_index + args.batch_size]:

                discard = False
                for chunk in chunks:
                    length = chunk[2] - chunk[1]
                    if length > args.maxllimit:
                        discard = True
                        break

                if discard:
                    # skip instances containing a chunk longer than args.maxllimit
                    continue

                sentence = [(parse.XX, ch) for ch in x]
                if args.parser_type == "top-down":
                    _, loss = parser.parse(sentence, tree, args.explore)
                else:
                    _, loss = parser.parse(sentence, tree, chunks, latentscope)
                batch_losses.append(loss)
                total_processed += 1
                current_processed += 1

            batch_loss = dy.average(batch_losses)
            batch_loss_value = batch_loss.scalar_value()
            batch_loss.backward()
            trainer.update()

            print("Epoch {:,} "
                  "batch {:,}/{:,} "
                  "processed {:,} "
                  "batch-loss {:.4f} "
                  "epoch-elapsed {} "
                  "total-elapsed {}".format(
                      epoch,
                      start_index // args.batch_size + 1,
                      int(np.ceil(len(train_chunk_insts) / args.batch_size)),
                      total_processed,
                      batch_loss_value,
                      format_elapsed(epoch_start_time),
                      format_elapsed(start_time),
                  ),
                  flush=True)

            if current_processed >= check_every:
                current_processed -= check_every
                if epoch > 7:
                    check_dev()
Example #8
def main():
    import optparse
    import vocabulary
    import lda
    import lda_cvb0
    parser = optparse.OptionParser()
    parser.add_option("-c",
                      dest="corpus",
                      help="using range of Brown corpus' files(start:end)",
                      default="0:100")
    parser.add_option("--alpha",
                      dest="alpha",
                      type="float",
                      help="parameter alpha",
                      default=0.5)
    parser.add_option("--beta",
                      dest="beta",
                      type="float",
                      help="parameter beta",
                      default=0.5)
    parser.add_option("-k",
                      dest="K",
                      type="int",
                      help="number of topics",
                      default=20)
    parser.add_option("-i",
                      dest="iteration",
                      type="int",
                      help="iteration count",
                      default=100)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--stopwords",
                      dest="stopwords",
                      help="exclude stop words",
                      action="store_true",
                      default=False)
    parser.add_option("--df",
                      dest="df",
                      type="int",
                      help="threshold of document freaquency to cut words",
                      default=10)
    (options, args) = parser.parse_args()

    corpus = vocabulary.load_corpus(options.corpus)
    voca = vocabulary.Vocabulary(options.stopwords)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0: docs = voca.cut_low_freq(docs, options.df)
    train_docs = [[x for i, x in enumerate(doc) if i % 10 != 0]
                  for doc in docs]
    test_docs = [[x for i, x in enumerate(doc) if i % 10 == 0] for doc in docs]
    test_docs_wf = conv_word_freq(test_docs)

    f = FileOutput("lda_test2")
    f.out("corpus=%d, words=%d, K=%d, a=%f, b=%f" %
          (len(docs), len(voca.vocas), options.K, options.alpha, options.beta))

    lda_learning(f, lda_cvb0.LDA_CVB0, False, options, train_docs,
                 test_docs_wf, voca)
    lda_learning(f, lda_cvb0.LDA_CVB0, True, options, train_docs, test_docs_wf,
                 voca)
    lda_learning(f, lda.LDA, False, options, train_docs, test_docs, voca, 2)
    lda_learning(f, lda.LDA, True, options, train_docs, test_docs, voca, 2)
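`conv_word_freq` is not defined in this snippet. Given that `test_docs` is a list of word-id lists and the result feeds a held-out evaluation, a plausible minimal version (an assumption, not the original helper) is a per-document id-to-count conversion:

from collections import Counter

def conv_word_freq_sketch(docs):
    """Convert each document (a list of word ids) into an {id: count} dict."""
    return [dict(Counter(doc)) for doc in docs]

# Toy usage with two tiny documents of word ids.
print(conv_word_freq_sketch([[0, 1, 1, 2], [2, 2, 3]]))
# -> [{0: 1, 1: 2, 2: 1}, {2: 2, 3: 1}]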
Example #9
    def perplexity(self):
        """パープレキシティを計算"""
        phi = self.worddist()
        log_per = 0
        Kalpha = self.K * self.alpha
        for m, doc in enumerate(self.docs):
            theta = self.n_m_z[m,:] / (len(doc) + Kalpha)
            for w in doc:
                log_per -= numpy.log(numpy.inner(phi[:,w], theta))
        return numpy.exp(log_per / self.N)

if __name__ == '__main__':


    voca = vocabulary.Vocabulary()
    #docs = [ voca.doc_to_ids(doc) for doc in vocabulary.read_from('corpus_1') ]
    corpus = vocabulary.read_from('corpus_1')

    lda = LDA(K = 10, alpha = 0.5, beta = 0.5)
    lda.set_corpus(corpus)

    for i in range(20):

        lda.inference()

        print(lda.perplexity())

    phi = lda.worddist()
    print(phi[0])
    print(lda.n_z)
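The `perplexity` method above computes exp(-(1/N) * sum over document words of log(phi[:, w] . theta_doc)). A tiny standalone numpy check of that formula on made-up values, independent of the LDA class in the snippet:

import numpy

# Toy topic-word distribution phi (K=2 topics, V=3 words) and one document.
phi = numpy.array([[0.7, 0.2, 0.1],
                   [0.1, 0.3, 0.6]])
theta = numpy.array([0.5, 0.5])       # the document's topic proportions
doc = [0, 2, 2, 1]                    # word ids occurring in the document

log_per = -sum(numpy.log(numpy.inner(phi[:, w], theta)) for w in doc)
print(numpy.exp(log_per / len(doc)))  # perplexity of this single toy document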
Example #10
def main(_):
    # convert jpg image(s) into image representations using AlexNet:
    filenames = [
        os.path.join(image_dir, f) for f in [
            'overly-attached-girlfriend.jpg',
            'high-expectations-asian-father.jpg', 'foul-bachelor-frog.jpg',
            'stoner-stanley.jpg', 'y-u-no.jpg', 'willy-wonka.jpg',
            'futurama-fry.jpg', 'success-kid.jpg', 'one-does-not-simply.jpg',
            'bad-luck-brian.jpg', 'first-world-problems.jpg',
            'philosoraptor.jpg', 'what-if-i-told-you.jpg', 'TutorPP.jpg'
        ]
    ]
    print(filenames)
    tf.logging.info("Running caption generation on %d files matching %s",
                    len(filenames), FLAGS.input_files)
    #mean of imagenet dataset in BGR
    imagenet_mean = np.array([104., 117., 124.], dtype=np.float32)

    #placeholder for input and dropout rate
    x_Alex = tf.placeholder(tf.float32, [1, 227, 227, 3])
    keep_prob_Alex = tf.placeholder(tf.float32)

    #create model with default config ( == no skip_layer and 1000 units in the last layer)
    modelAlex = AlexNet(x_Alex, keep_prob_Alex, 1000, [], ['fc7', 'fc8'],
                        512)  #maybe need to put fc8 in skip_layers

    #define activation of last layer as score
    score = modelAlex.fc6

    meme_embeddings = []
    with tf.Session() as sess:

        # Initialize all variables
        sess.run(tf.global_variables_initializer())

        # Load the pretrained weights into the model
        modelAlex.load_initial_weights(sess)

        for i, meme in enumerate(filenames):
            img = Image.open(meme)
            try:
                img.thumbnail((227, 227), Image.ANTIALIAS)
                #img = img.resize((227,227))
                #use img.thumbnail for square images, img.resize for non square
                assert np.shape(img) == (227, 227, 3)
            except AssertionError:
                img = img.resize((227, 227))
                print('sizing error')

            # Subtract the ImageNet mean
            img = img - imagenet_mean  #should probably change this

            # Reshape as needed to feed into model
            img = img.reshape((1, 227, 227, 3))

            meme_vector = sess.run(score,
                                   feed_dict={
                                       x_Alex: img,
                                       keep_prob_Alex: 1
                                   })  #[1,4096]
            meme_vector = np.reshape(meme_vector, [4096])
            assert np.shape(meme_vector) == (4096, )

            #now have np embeddings to feed for inference
            meme_embeddings.append(meme_vector)

    with open('Captions.txt', 'r') as f:
        data_captions = f.readlines()
    data_captions = [s.lower() for s in data_captions]

    # Build the inference graph.
    g = tf.Graph()
    with g.as_default():
        model = inference_wrapper.InferenceWrapper()
        restore_fn = model.build_graph_from_config(configuration.ModelConfig(),
                                                   FLAGS.checkpoint_path)
    g.finalize()

    # Create the vocabulary.
    vocab = vocabulary.Vocabulary(FLAGS.vocab_file)

    #filenames = []
    #for file_pattern in FLAGS.input_files.split(","):
    #filenames.extend(tf.gfile.Glob(file_pattern))
    #tf.logging.info("Running caption generation on %d files matching %s",
    #len(filenames), FLAGS.input_files)
    with tf.Session(graph=g) as sess:
        # Load the model from checkpoint.
        restore_fn(sess)

        # Prepare the caption generator. Here we are implicitly using the default
        # beam search parameters. See caption_generator.py for a description of the
        # available beam search parameters.
        generator = caption_generator.CaptionGenerator(model, vocab)
        num_in_data_total = 0
        num_captions = 0
        for i, meme in enumerate(meme_embeddings):
            #with tf.gfile.GFile(filename, "rb") as f:
            #image = f.read()
            captions = generator.beam_search(sess, meme)
            print("Captions for image %s:" % os.path.basename(filenames[i]))
            num_in_data = 0
            for i, caption in enumerate(captions):
                # Ignore begin and end words.
                sentence = [
                    vocab.id_to_word(w) for w in caption.sentence[1:-1]
                ]
                sentence = " ".join(sentence)
                in_data = 0
                if b_any(sentence in capt for capt in data_captions):
                    in_data = 1
                    num_in_data += 1
                    num_in_data_total += 1
                    num_captions += 1
                else:
                    num_captions += 1
                print("  %d) %s (p=%f) [in data = %d]" %
                      (i, sentence, math.exp(caption.logprob), in_data))
            print("number of captions in data = %d" % (num_in_data))
        print("(total number of captions in data = %d) percent in data = %f" %
              (num_in_data_total, (num_in_data_total / num_captions)))
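`b_any` in this example is presumably the built-in `any` under an alias (a common workaround when a star import such as `from numpy import *` shadows it); that reading is an assumption, shown here as a self-contained equivalent:

from builtins import any as b_any   # alias the built-in, as the snippet appears to do

data_captions = ["a cat sitting on a mat\n", "one does not simply walk\n"]
sentence = "one does not simply"
print(b_any(sentence in capt for capt in data_captions))   # True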
Example #11
def load_or_create_model(args, parses_for_vocab):
    components = args.model_path_base.split('/')
    directory = '/'.join(components[:-1])
    if os.path.isdir(directory):
        relevant_files = [f for f in os.listdir(directory) if f.startswith(components[-1])]
    else:
        relevant_files = []
    assert len(relevant_files) <= 2, "Multiple possibilities {}".format(relevant_files)
    if len(relevant_files) > 0:
        print("Loading model from {}...".format(args.model_path_base))

        model = dy.ParameterCollection()
        [parser] = dy.load(args.model_path_base, model)
    else:
        assert parses_for_vocab is not None
        print("Constructing vocabularies using train parses...")

        tag_vocab = vocabulary.Vocabulary()
        tag_vocab.index(parse.START)
        tag_vocab.index(parse.STOP)

        word_vocab = vocabulary.Vocabulary()
        word_vocab.index(parse.START)
        word_vocab.index(parse.STOP)
        word_vocab.index(parse.UNK)

        label_vocab = vocabulary.Vocabulary()
        label_vocab.index(())

        for tree in parses_for_vocab:
            nodes = [tree]
            while nodes:
                node = nodes.pop()
                if isinstance(node, trees.InternalParseNode):
                    label_vocab.index(node.label)
                    nodes.extend(reversed(node.children))
                else:
                    assert isinstance(node, trees.LeafParseNode)
                    tag_vocab.index(node.tag)
                    word_vocab.index(node.word)

        tag_vocab.freeze()
        word_vocab.freeze()
        label_vocab.freeze()

        print("Initializing model...")
        model = dy.ParameterCollection()
        parser = parse.Parser(
            model,
            tag_vocab,
            word_vocab,
            label_vocab,
            None,
            args.word_embedding_dim,
            args.lstm_layers,
            args.lstm_dim,
            args.label_hidden_dim,
            None,
            args.dropout,
            not args.no_elmo
        )
    return parser, model
Example #12
def main():
    t1 = time.time()
    import optparse
    import vocabulary
    global out_dir
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("-c",
                      dest="corpus",
                      help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha",
                      dest="alpha",
                      type="float",
                      help="parameter alpha",
                      default=0.1)
    parser.add_option("--eta",
                      dest="eta",
                      type="float",
                      help="parameter eta",
                      default=0.2)
    parser.add_option("-k",
                      dest="K",
                      type="int",
                      help="number of topics",
                      default=20)
    parser.add_option("-i",
                      dest="iteration",
                      type="int",
                      help="iteration count",
                      default=100)
    parser.add_option("-s",
                      dest="smartinit",
                      action="store_true",
                      help="smart initialize of parameters",
                      default=False)
    parser.add_option("--stopwords",
                      dest="stopwords",
                      help="exclude stop words",
                      action="store_true",
                      default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df",
                      dest="df",
                      type="int",
                      help="threshold of document freaquency to cut words",
                      default=0)
    #parser.add_option("--setup", dest="setup", help="setup details", default="uniform")
    parser.add_option("--dataset",
                      dest="did",
                      help="setup details : Dataset-1/Dataset-2/Dataset-3",
                      default="Dataset-1")
    (options, args) = parser.parse_args()
    if not (options.filename or options.corpus):
        parser.error("need corpus filename(-f) or corpus range(-c)")

    if options.filename:
        if options.did == 'Dataset-1':
            corpus, doc_ids, event_list, total_no_word = vocabulary.load_file(
                options.filename)
        else:
            corpus, doc_ids, event_list, total_no_word = vocabulary.load_file_reuter(
                options.filename)
    else:
        corpus = vocabulary.load_corpus(options.corpus)
        if not corpus: parser.error("corpus range(-c) forms 'start:end'")
    if options.seed != None:
        numpy.random.seed(options.seed)

    voca = vocabulary.Vocabulary(options.stopwords)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0: docs = voca.cut_low_freq(docs, options.df)

    if event_list is not None: options.K = options.K  #len(event_list)
    suffix = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    #out_dir = '%s/all_words/Topic_%d_alpha_%f_eta_%f_iter_%d/%s' %(out_dir,options.K,options.alpha, options.eta, options.iteration, suffix)
    #out_dir = '%s/Dataset-1/Topic_%d_alpha_%f_eta_%f_iter_%d/%s' %(out_dir,options.K,options.alpha, options.eta, options.iteration, suffix)
    out_dir = '%s/%s/Topic_%d_alpha_%f_eta_%f_iter_%d/%s' % (
        out_dir, options.did, options.K, options.alpha, options.eta,
        options.iteration, suffix)

    #out_dir = '%s/Reuters-21578/R-8-train-train_no-stop/Topic_%d_alpha_%f_eta_%f_iter_%d/%s' %(out_dir,options.K,options.alpha, options.eta, options.iteration, suffix)
    #out_dir = '%s/20-Newsgroup/20-Newsgroup_train-train_all_term/Topic_%d_alpha_%f_eta_%f_iter_%d/%s' %(out_dir,options.K,options.alpha, options.eta, options.iteration, suffix)
    print('out_dir: ', out_dir)
    try:
        os.makedirs(out_dir)
    except Exception as e:
        print(' %s Dir exist ' % (out_dir))
        print('E MSG : ', e)
    lda = LDA(options.K, options.alpha, options.eta, docs, doc_ids,
              voca.size(), options.smartinit)
    t_int = time.time()
    #print 'Intialization time : %f' %(t_int-t1)
    flog = '%s/log_file.txt' % (out_dir)
    f = open(flog, 'w')
    f.write(
        "corpus(# of doc)=%d, no of event = %d , Uniq words=%d, Toal # of word =%d, K=%d, a=%f, b=%f , iteration = %d \n"
        % (len(corpus), len(event_list), len(voca.vocas), total_no_word,
           options.K, options.alpha, options.eta, options.iteration))
    f.close()
    print("corpus=%d, no of event =%d , uniq words=%d, K=%d, a=%f, b=%f" %
          (len(corpus), len(event_list), len(
              voca.vocas), options.K, options.alpha, options.eta)),

    #import cProfile
    #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile')
    lda_learning(lda, options.iteration, voca)
    t2 = time.time()
    print(' Total time taken : %f ' % (t2 - t1))
    flog = '%s/log_file.txt' % (out_dir)
    f = open(flog, 'a')
    f.write(' Total time taken : %f ' % (t2 - t1))
    f.close()
Example #13
def training_model(main_path, type, config_file, from_date, to_date, customer):

    #logging.basicConfig(filename=logCustomer, level=logging.INFO)
    #lg.configureLogger(QIUserLogger, customer, "training")
    #
    QIUserLogger.info(
        "-----------------------------------------------------------------")
    QIUserLogger.info(
        "------------------------Training Start---------------------------")
    #
    QIUserLogger.info("** Initialization start... **")
    main_path = main_path
    type = type
    config_file = config_file
    from_date = from_date
    to_date = to_date

    QIUserLogger.info("	MainPath - " + str(main_path))
    QIUserLogger.info("	Type - " + str(type))
    QIUserLogger.info("	ConfigFile - " + str(config_file))
    QIUserLogger.info("	FromDate - " + str(from_date))
    QIUserLogger.info("	ToDate - " + str(to_date))
    #
    QIUserLogger.info("** Initialization End **")

    try:
        QIUserLogger.info("1 - Load Configurations")
        QIUserLogger.info("	** Config for Classification")
        # Load Config files
        configModel = cg.Config()
        configModel.configFromFile(config_file)
        configModel.main_path = main_path
        configModel.updateDataOfMainPath(config_file, main_path)
        dataL = dt.Data(configModel)
        #
        QIUserLogger.info("2 - Login In API")
        # Login to API
        configConnection = con.ConfigConnection()

        dir_path = os.path.dirname(os.path.realpath(__file__))
        configConnection.configFromFile(dir_path + "/config/" + customer +
                                        "/connector_config.json")
        connector = con.Connector(configConnection)
        # Create Persistent Session
        Reqsess = requests.session()
        # LogIN
        connector.login(Reqsess)
        QIUserLogger.info("3 - GET TICKETS FROM API")
        #
        params = "closedfrom=" + str(from_date) + "&closedto=" + str(
            to_date) + "&maxnum=" + str(configConnection.max_tickets_to_get)
        #params = {"closedfrom": from_date, "closedto": to_date, "maxnum" : configConnection.max_tickets_to_get}
        responseTicket = connector.getTickets(Reqsess, params)
        if len(responseTicket) > 0:
            rTicket = []
            for t in responseTicket:
                rTicket.append(t['description'])
            #
            id2lab = dict(
                zip(configModel.labels_map.values(),
                    configModel.labels_map.keys()))
            #
            gather_tickets, gather_targets = gatherData(
                type, responseTicket, configModel, id2lab)
            #
            QIUserLogger.info("4 - REMOVE STOP WORDS FROM NEW TICKETS")
            tok = tk.Tokenizer(gather_tickets)
            tok.tokenizeTickets()
            tickets_to_lower = tok.toLower()
            gather_tickets, gather_targets = tok.removeStopWordsToString(
                tickets_to_lower, gather_targets)

            QIUserLogger.info("5 - GET STORED DATA TICKETS")
            tickets_train = dataL.loadDataInArray(
                configModel.data_path + "/tickets.txt",
                configModel.csv_encoding)
            targets_train = dataL.loadDataInArray(configModel.data_path +
                                                  "/targets.txt")
            #
            # Count if we reached the threshold
            QIUserLogger.info("6 - MERGE THE DATA - STORED AND GATHERED")
            max_length = configModel.max_num_tickets
            len_gather_tickets = len(gather_tickets)
            len_tickets = len(tickets_train)
            # Retrain on the whole dataset rather than doing transfer
            # learning, so that an up-to-date vocabulary is always used.
            tickets = tickets_train + gather_tickets
            targets = targets_train + gather_targets
            reached_dim = len_gather_tickets + len_tickets
            if reached_dim > max_length:
                elem_to_cut = reached_dim - max_length
                #cut out the firsts elem_to_cut elements
                merged_tickets = tickets[elem_to_cut:]
                merged_targets = targets[elem_to_cut:]
                tickets = merged_tickets
                targets = merged_targets
                reached_dim = max_length

            QIUserLogger.info("7 - REMOVE IDENTICAL TICKETS")
            #tickets, targets = ut.removeIdenticalTickets(tickets, targets)
            tickets, targets = ut.removeIdenticalTicketsFromNew(
                tickets, targets, len_tickets, reached_dim)

            QIUserLogger.info("8 - SAVING MERGED DATA")
            dataL.writeArrayInFileCompleteDataPath(
                tickets, configModel.data_path + '/tickets.txt', "utf-8")
            dataL.writeArrayInFileCompleteDataPath(
                targets, configModel.data_path + '/targets.txt', "utf-8")
            #
            QIUserLogger.info("9 - EXTRACT WORDS FROM TICKETS")
            words = tok.extractWordsTicketString(tickets)
            #
            QIUserLogger.info("10 - BUILD NEW VOCABULARY")
            # Create Vocabulary
            voc = vc.Vocabulary(configModel)
            dictionary, reverse_dict = voc.build_dictionary(
                words, configModel.labels)
            voc.saveDictionary(dictionary, "vocabulary")
            QIUserLogger.info("*** Vocabulary saved")
            #
            QIUserLogger.info("11 -- SPLIT DATA IN TRAINING AND TEST DATASET")
            tickets_training, tickets_test, Target_training, Target_test = ut.get_train_and_test(
                tickets, targets)
            dataL.writeArrayInFileCompleteDataPath(
                tickets_training,
                configModel.data_path + '/tickets_training.txt', "utf-8")
            dataL.writeArrayInFileCompleteDataPath(
                Target_training,
                configModel.data_path + '/targets_training.txt', "utf-8")

            dataL.writeArrayInFileCompleteDataPath(
                tickets_test, configModel.data_path + '/tickets_test.txt',
                "utf-8")
            dataL.writeArrayInFileCompleteDataPath(
                Target_test, configModel.data_path + '/targets_test.txt',
                "utf-8")

            #
            QIUserLogger.info("12 - CREATE TICKETS AND TARGETS SEQUENCES")
            # Create Sequences and HotVectors for the Target
            tickets_training_sequences = dataL.createDataSequenceTicketsString(
                tickets_training, dictionary)
            oneHotVectorTarget_training = dataL.transformInOneHotVector(
                configModel.labels, Target_training)
            #
            QIUserLogger.info("13 - FILTER OUT  DATA - Removing Token OOV")
            filtdata = fd.FilterData(configModel, configModel.labels)
            tickets_training_sequences, oneHotVectorTarget_training, trash = filtdata.removeTokenOOV(
                tickets_training_sequences, oneHotVectorTarget_training,
                dictionary)
            QIUserLogger.info("	*** Classe Cestino in Training : " +
                              str(len(trash)))
            #
            #QIUserLogger.info("	-- Split Training | Test Dataset")
            #tickets_training_sequences, tickets_test_sequences, oneHotVectorTarget_training, oneHotVectorTarget_test = ut.get_train_and_test(tickets_training_sequences, oneHotVectorTarget_training)
            #
            QIUserLogger.info("14 - SAVING TRAINING SEQUENCES")
            dataL.writeArrayInFileCompleteDataPath(
                tickets_training_sequences,
                configModel.data_sequences_path + '/tickets_training.txt',
                "utf-8")
            dataL.writeArrayInFileCompleteDataPath(
                oneHotVectorTarget_training,
                configModel.data_sequences_path + '/target_training.txt',
                "utf-8")

            QIUserLogger.info("	*** Training Size : " +
                              str(len(tickets_training_sequences)) + "\n")
            if configModel.use_pretrained_embs:
                QIUserLogger.info("	*** Use pretrained Words Embedding")
                skip = sk.SkipgramModel(configModel)
                skipgramModel = skip.get_skipgram()
                skipgramEmbedding = skip.getCustomEmbeddingMatrix(
                    skipgramModel, reverse_dict)
                configModel.skipgramEmbedding = skipgramEmbedding
                # Start Training
                QIUserLogger.info("15 - START TRAINING")
            ml.runTraining(configModel, tickets_training_sequences,
                           oneHotVectorTarget_training, configModel.labels)
            QIUserLogger.info("============ End =============")
        else:
            QIUserLogger.info(
                "No New Tickets found. There is no need of a new training.")

        # LogIN
        connector.logout(Reqsess)
        #
    except Exception as e:
        print(str(e))
        QIUserLogger.error("Error in training_model " + str(e))
Example #14
def main():
    import optparse
    import vocabulary
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("--alpha",
                      dest="alpha",
                      type="float",
                      help="parameter alpha",
                      default=0.5)
    parser.add_option("--beta",
                      dest="beta",
                      type="float",
                      help="parameter beta",
                      default=0.5)
    parser.add_option("--lamda",
                      dest="lamda",
                      type="float",
                      help="parameter lamda",
                      default=0.5)
    parser.add_option("-k",
                      dest="K",
                      type="int",
                      help="number of topics",
                      default=20)
    parser.add_option("-i",
                      dest="iteration",
                      type="int",
                      help="iteration count",
                      default=100)
    parser.add_option("-s",
                      dest="smartinit",
                      action="store_true",
                      help="smart initialize of parameters",
                      default=False)
    parser.add_option("--stopwords",
                      dest="stopwords",
                      help="exclude stop words",
                      action="store_true",
                      default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df",
                      dest="df",
                      type="int",
                      help="threshold of document freaquency to cut words",
                      default=0)
    (options, args) = parser.parse_args()
    if not (options.filename):
        parser.error("need corpus filename(-f) or corpus range(-c)")
    if options.filename:
        (pids, tids) = vocabulary.load_file(options.filename)
    if options.seed != None:
        numpy.random.seed(options.seed)
    #voca is the object which stores the data structures needed by LDA
    voca = vocabulary.Vocabulary(options.stopwords)
    docs = voca.PT_to_idlist(pids, tids)
    #print docs
    size_of_vocab = max(tids) + 1
    lda = BLDA(options.K, options.alpha, options.beta, options.lamda, docs,
               size_of_vocab, options.smartinit)
    #print "corpus=%d, words=%d, K=%d, a=%f, b=%f" % (len(corpus), len(voca.vocas), options.K, options.alpha, options.beta)
    blda_learning(lda, options.iteration)
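# A minimal invocation sketch for the CLI above; the script name is an
# assumption, and the corpus file must be in whatever format
# vocabulary.load_file expects for the (pids, tids) pair it returns here.
#
#   python blda_main.py -f corpus.txt -k 30 -i 200 --alpha 0.1 --beta 0.01 --lamda 0.5 -s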
Exemple #15
0
def mainTrainModelOnApertureWithSequenceFeatures():
    print("============ Start =============\n")

    print("1 - Load Configuration\n")
    config = cg.Config()
    dataL = dt.Data(config)
    print("2 - Load Data and Targets\n")
    tickets_training, tickets_test, targets_training, targets_test = loadAndSplit(
    )
    map_labels = dataL.loadMapFromJson(config.data_path + "map_labels.json")
    labels = dataL.getfirstLevelTargets(map_labels['map'])
    print("3 - Preprocess Data\n")
    tickets_training_tl, targets_training, words = preprocessData(
        tickets_training, targets_training, labels)
    tickets_test_tl, targets_test, w_ = preprocessData(tickets_test,
                                                       targets_test, labels)
    print("4 - Build Vocabulary\n")
    # Create Vocabulary
    voc = vc.Vocabulary(config)
    dictionary, reverse_dict = voc.build_dictionary(words, labels)
    voc.saveDictionary(dictionary, "vocabulary")
    print("5 - Create Ticket Sequences and Targets Hot Vectors\n")
    #Create Sequences and HotVectors for the Target
    tickets_training_sequences = dataL.createDataSequence(
        tickets_training_tl, dictionary)
    oneHotVectorTarget_training = dataL.transformInOneHotVector(
        labels, targets_training)
    tickets_test_sequences = dataL.createDataSequence(tickets_test_tl,
                                                      dictionary)
    oneHotVectorTarget_test = dataL.transformInOneHotVector(
        labels, targets_test)
    print("6 - Create Ticket Feature Sequences")
    #Create Sequences Features
    tickets_feature_sequences = dataL.extractFeatures(tickets_training_tl,
                                                      dictionary)
    tickets_feature_test_sequences = dataL.createDataSequence(
        tickets_test_tl, dictionary)

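    # Drop out-of-vocabulary tokens from every sequence; `trash` presumably
    # collects the tickets that end up empty and fall into the 'Cestino'
    # (trash) class.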
    print("6 - Filter Data - Removeing Token OOV\n")
    filtdata = fd.FilterData(config, labels)
    tickets_training_sequences, oneHotVectorTarget_training, tickets_feature_sequences_training, trash = filtdata.removeTokenOOVwithSequenceFeatures(
        tickets_training_sequences, oneHotVectorTarget_training,
        tickets_feature_sequences, dictionary)
    print("*** Classe Cestino in Training : " + str(len(trash)))
    tickets_test_sequences, oneHotVectorTarget_test, tickets_feature_test_sequences, trash = filtdata.removeTokenOOVwithSequenceFeatures(
        tickets_test_sequences, oneHotVectorTarget_test,
        tickets_feature_test_sequences, dictionary)
    print("*** Classe Cestino in Test : " + str(len(trash)))
    print("7 - Generate Training and Testing Dataset\n")
    dataL.writeArrayStringInFile(tickets_training_sequences,
                                 'parsed_sequences/tickets_training.txt',
                                 "utf-8")
    dataL.writeArrayStringInFile(tickets_test_sequences,
                                 'parsed_sequences/tickets_test.txt', "utf-8")
    dataL.writeArrayStringInFile(oneHotVectorTarget_training,
                                 'parsed_sequences/target_training.txt',
                                 "utf-8")
    dataL.writeArrayStringInFile(oneHotVectorTarget_test,
                                 'parsed_sequences/target_test.txt', "utf-8")
    print("*** Training Size : " + str(len(tickets_training_sequences)) + "\n")
    if config.use_pretrained_embs:
        print("*** Uso pretrained Words Embedding\n")
        skip = sk.SkipgramModel(config)
        skipgramModel = skip.get_skipgram()
        skipgramEmbedding = skip.getCustomEmbeddingMatrix(
            skipgramModel, reverse_dict)
        config.skipgramEmbedding = skipgramEmbedding

    print("8 - Start Training\n")
    ml.runTrainingWithFeatureSequence(config, tickets_training_sequences,
                                      oneHotVectorTarget_training, labels,
                                      tickets_feature_sequences_training)
    print("============ End =============\n")
Exemple #16
0
def build_vocab(corpus, V=10000):
    token_feed = (canonicalize_word(w) for w in corpus.words())
    vocab = vocabulary.Vocabulary(token_feed, size=V)
    return vocab
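# A hedged usage sketch (assumes an NLTK corpus reader; any object exposing a
# .words() iterator works, and the size cap V below is arbitrary):
#
#   import nltk
#   nltk.download('brown')
#   brown_vocab = build_vocab(nltk.corpus.brown, V=5000)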
Exemple #17
0
def trainPriority():
    print("============ Start =============\n")

    print("1 - Load Configuration\n")
    config = cg.Config()
    config.configFromFile("config/priority_config.json")
    dataL = dt.Data(config)
    print("2 - Load Data and Targets\n")
    tickets_training = dataL.loadDataInArray(
        config.main_path + "onlyAperturaPriority/tickets_training.txt",
        config.csv_encoding)
    tickets_test = dataL.loadDataInArray(
        config.main_path + "onlyAperturaPriority/tickets_test.txt",
        config.csv_encoding)
    targets_training = dataL.loadDataInArray(
        config.main_path + "onlyAperturaPriority/targets_training.txt",
        config.csv_encoding)
    targets_test = dataL.loadDataInArray(
        config.main_path + "onlyAperturaPriority/targets_test.txt",
        config.csv_encoding)
    labels = ["1", "2", "3", "4", "5"]
    print("3 - Preprocess Data\n")
    tickets_training_tl, targets_training, words = preprocessData(
        tickets_training, targets_training, labels)
    tickets_test_tl, targets_test, w_ = preprocessData(tickets_test,
                                                       targets_test, labels)
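    # Either build a fresh dictionary from the training tokens or reuse a
    # previously saved one, depending on config.loadOrbuild_dictionary.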
    if config.loadOrbuild_dictionary == "build":
        print("4 - Build Vocabulary\n")
        # Create Vocabulary
        voc = vc.Vocabulary(config)
        dictionary, reverse_dict = voc.build_dictionary(words, labels)
        voc.saveDictionary(dictionary, "vocabulary")
        print("*** Vocabulary saved \n")
    else:
        print("4 - Load Vocabulary\n")
        # Load Existing Vocabulary
        voc = vc.Vocabulary(config)
        dictionary = voc.loadDictionary("vocabulary")
        reverse_dict = voc.getReverseDictionary(dictionary)

    print("5 - Create Ticket Sequences and Targets Hot Vectors\n")
    # Create Sequences and HotVectors for the Target
    tickets_training_sequences = dataL.createDataSequence(
        tickets_training_tl, dictionary)
    oneHotVectorTarget_training = dataL.transformInOneHotVector(
        labels, targets_training)
    tickets_test_sequences = dataL.createDataSequence(tickets_test_tl,
                                                      dictionary)
    oneHotVectorTarget_test = dataL.transformInOneHotVector(
        labels, targets_test)
    print("6 - Filter Data - Removeing Token OOV\n")
    filtdata = fd.FilterData(config, labels)
    tickets_training_sequences, oneHotVectorTarget_training, trash = filtdata.removeTokenOOV(
        tickets_training_sequences, oneHotVectorTarget_training, dictionary)
    print("	*** Classe Cestino in Training : " + str(len(trash)) + "\n")
    tickets_test_sequences, oneHotVectorTarget_test, trash = filtdata.removeTokenOOV(
        tickets_test_sequences, oneHotVectorTarget_test, dictionary)
    print("	*** Classe Cestino in Test : " + str(len(trash)) + "\n")
    print("7 - Generate Training and Testing Dataset\n")
    dataL.writeArrayInFileCompleteDataPath(
        tickets_training_sequences,
        config.data_sequences_path + '/tickets_training.txt', "utf-8")
    dataL.writeArrayInFileCompleteDataPath(
        tickets_test_sequences,
        config.data_sequences_path + '/tickets_test.txt', "utf-8")
    dataL.writeArrayInFileCompleteDataPath(
        oneHotVectorTarget_training,
        config.data_sequences_path + '/target_training.txt', "utf-8")
    dataL.writeArrayInFileCompleteDataPath(
        oneHotVectorTarget_test,
        config.data_sequences_path + '/target_test.txt', "utf-8")
    print("	*** Training Size : " + str(len(tickets_training_sequences)) +
          "\n")
    print("	*** Test Size : " + str(len(tickets_test_sequences)) + "\n")
    if config.use_pretrained_embs:
        print("	*** Use pretrained Words Embedding\n")
        skip = sk.SkipgramModel(config)
        skipgramModel = skip.get_skipgram()
        skipgramEmbedding = skip.getCustomEmbeddingMatrix(
            skipgramModel, reverse_dict)
        config.skipgramEmbedding = skipgramEmbedding

    print("8 - Start Training\n")
    ml.runTraining(config, tickets_training_sequences,
                   oneHotVectorTarget_training, labels)
    print("============ End =============\n")
Exemple #18
0
def run_train(args, hparams):
    # if args.numpy_seed is not None:
    #     print("Setting numpy random seed to {}...".format(args.numpy_seed))
    #     np.random.seed(args.numpy_seed)
    #
    # # Make sure that pytorch is actually being initialized randomly.
    # # On my cluster I was getting highly correlated results from multiple
    # # runs, but calling reset_parameters() changed that. A brief look at the
    # # pytorch source code revealed that pytorch initializes its RNG by
    # # calling std::random_device, which according to the C++ spec is allowed
    # # to be deterministic.
    # seed_from_numpy = np.random.randint(2147483648)
    # print("Manual seed for pytorch:", seed_from_numpy)
    # torch.manual_seed(seed_from_numpy)

    now_time = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    log_file_name = os.path.join(args.log_dir, 'log-' + now_time)
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        filename=log_file_name,
        filemode='w',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    console_handler = logging.StreamHandler()
    logger.addHandler(console_handler)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    hparams.set_from_args(args)
    logger.info("Hyperparameters:")
    logger.info(hparams.print())

    logger.info("Loading training trees from {}...".format(args.train_path))
    if hparams.predict_tags and args.train_path.endswith('10way.clean'):
        logger.info(
            "WARNING: The data distributed with this repository contains "
            "predicted part-of-speech tags only (not gold tags!) We do not "
            "recommend enabling predict_tags in this configuration.")
    train_treebank = trees.load_trees(args.train_path)
    if hparams.max_len_train > 0:
        train_treebank = [
            tree for tree in train_treebank
            if len(list(tree.leaves())) <= hparams.max_len_train
        ]
    logger.info("Loaded {:,} training examples.".format(len(train_treebank)))

    logger.info("Loading development trees from {}...".format(args.dev_path))
    dev_treebank = trees.load_trees(args.dev_path)
    if hparams.max_len_dev > 0:
        dev_treebank = [
            tree for tree in dev_treebank
            if len(list(tree.leaves())) <= hparams.max_len_dev
        ]
    logger.info("Loaded {:,} development examples.".format(len(dev_treebank)))

    logger.info("Loading test trees from {}...".format(args.test_path))
    test_treebank = trees.load_trees(args.test_path)
    if hparams.max_len_dev > 0:
        test_treebank = [
            tree for tree in test_treebank
            if len(list(tree.leaves())) <= hparams.max_len_dev
        ]
    logger.info("Loaded {:,} test examples.".format(len(test_treebank)))

    logger.info("Processing trees for training...")
    train_parse = [tree.convert() for tree in train_treebank]
    dev_parse = [tree.convert() for tree in dev_treebank]
    test_parse = [tree.convert() for tree in test_treebank]

    logger.info("Constructing vocabularies...")

    tag_vocab = vocabulary.Vocabulary()
    tag_vocab.index(tokens.START)
    tag_vocab.index(tokens.STOP)
    tag_vocab.index(tokens.TAG_UNK)

    word_vocab = vocabulary.Vocabulary()
    word_vocab.index(tokens.START)
    word_vocab.index(tokens.STOP)
    word_vocab.index(tokens.UNK)

    label_vocab = vocabulary.Vocabulary()
    label_vocab.index(())

    char_set = set()

    for tree in train_parse:
        nodes = [tree]
        while nodes:
            node = nodes.pop()
            if isinstance(node, trees.InternalParseNode):
                label_vocab.index(node.label)
                nodes.extend(reversed(node.children))
            else:
                tag_vocab.index(node.tag)
                word_vocab.index(node.word)
                char_set |= set(node.word)

    char_vocab = vocabulary.Vocabulary()

    # If codepoints are small (e.g. Latin alphabet), index by codepoint directly
    highest_codepoint = max(ord(char) for char in char_set)
    if highest_codepoint < 512:
        if highest_codepoint < 256:
            highest_codepoint = 256
        else:
            highest_codepoint = 512

        # This also takes care of constants like tokens.CHAR_PAD
        for codepoint in range(highest_codepoint):
            char_index = char_vocab.index(chr(codepoint))
            assert char_index == codepoint
    else:
        char_vocab.index(tokens.CHAR_UNK)
        char_vocab.index(tokens.CHAR_START_SENTENCE)
        char_vocab.index(tokens.CHAR_START_WORD)
        char_vocab.index(tokens.CHAR_STOP_WORD)
        char_vocab.index(tokens.CHAR_STOP_SENTENCE)
        for char in sorted(char_set):
            char_vocab.index(char)

    tag_vocab.freeze()
    word_vocab.freeze()
    label_vocab.freeze()
    char_vocab.freeze()

    # -------- ngram vocab ------------
    ngram_vocab = vocabulary.Vocabulary()
    ngram_vocab.index(())
    ngram_finder = FindNgrams(min_count=hparams.ngram_threshold)

    def get_sentence(parse):
        sentences = []
        for tree in parse:
            sentence = []
            for leaf in tree.leaves():
                sentence.append(leaf.word)
            sentences.append(sentence)
        return sentences

    sentence_list = get_sentence(train_parse)
    if not args.cross_domain:
        sentence_list.extend(get_sentence(dev_parse))
    # sentence_list.extend(get_sentence(test_parse))

    if hparams.ngram_type == 'freq':
        logger.info('ngram type: freq')
        ngram_finder.count_ngram(sentence_list, hparams.ngram)
    elif hparams.ngram_type == 'pmi':
        logger.info('ngram type: pmi')
        ngram_finder.find_ngrams_pmi(sentence_list, hparams.ngram,
                                     hparams.ngram_freq_threshold)
    else:
        raise ValueError()
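    # Register every mined ngram in the vocabulary; indexing an ngram once per
    # occurrence presumably lets Vocabulary track its frequency as well as its id.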
    ngram_type_count = [0 for _ in range(hparams.ngram)]
    for w, c in ngram_finder.ngrams.items():
        ngram_type_count[len(list(w)) - 1] += 1
        for _ in range(c):
            ngram_vocab.index(w)
    logger.info(str(ngram_type_count))
    ngram_vocab.freeze()

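    # Count, for each ngram length, how many ngram tokens occur in the corpus
    # (logging only; the counts are not used further here).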
    ngram_count = [0 for _ in range(hparams.ngram)]
    for sentence in sentence_list:
        for n in range(len(ngram_count)):
            length = n + 1
            for i in range(len(sentence)):
                gram = tuple(sentence[i:i + length])
                if gram in ngram_finder.ngrams:
                    ngram_count[n] += 1
    logger.info(str(ngram_count))

    # -------- ngram vocab ------------

    def print_vocabulary(name, vocab):
        special = {tokens.START, tokens.STOP, tokens.UNK}
        logger.info("{} ({:,}): {}".format(
            name, vocab.size,
            sorted(value for value in vocab.values if value in special) +
            sorted(value for value in vocab.values if value not in special)))

    if args.print_vocabs:
        print_vocabulary("Tag", tag_vocab)
        print_vocabulary("Word", word_vocab)
        print_vocabulary("Label", label_vocab)
        print_vocabulary("Ngram", ngram_vocab)

    logger.info("Initializing model...")

    load_path = None
    if load_path is not None:
        logger.info(f"Loading parameters from {load_path}")
        info = torch_load(load_path)
        parser = SAPar_model.SAChartParser.from_spec(info['spec'],
                                                     info['state_dict'])
    else:
        parser = SAPar_model.SAChartParser(
            tag_vocab,
            word_vocab,
            label_vocab,
            char_vocab,
            ngram_vocab,
            hparams,
        )

    print("Initializing optimizer...")
    trainable_parameters = [
        param for param in parser.parameters() if param.requires_grad
    ]
    trainer = torch.optim.Adam(trainable_parameters,
                               lr=1.,
                               betas=(0.9, 0.98),
                               eps=1e-9)
    if load_path is not None:
        trainer.load_state_dict(info['trainer'])
    pytorch_total_params = sum(p.numel() for p in parser.parameters()
                               if p.requires_grad)
    logger.info('# of trainable parameters: %d' % pytorch_total_params)

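    # Adam is created with lr=1.0 above; the effective learning rate is set
    # entirely by schedule_lr (warmup) and the ReduceLROnPlateau scheduler below.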
    def set_lr(new_lr):
        for param_group in trainer.param_groups:
            param_group['lr'] = new_lr

    assert hparams.step_decay, "Only step_decay schedule is supported"

    warmup_coeff = hparams.learning_rate / hparams.learning_rate_warmup_steps
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        trainer,
        'max',
        factor=hparams.step_decay_factor,
        patience=hparams.step_decay_patience,
        verbose=True,
    )

    def schedule_lr(iteration):
        iteration = iteration + 1
        if iteration <= hparams.learning_rate_warmup_steps:
            set_lr(iteration * warmup_coeff)

    clippable_parameters = trainable_parameters
    grad_clip_threshold = np.inf if hparams.clip_grad_norm == 0 else hparams.clip_grad_norm

    logger.info("Training...")
    total_processed = 0
    current_processed = 0
    check_every = len(train_parse) / args.checks_per_epoch
    best_eval_fscore = -np.inf
    test_fscore_on_dev = -np.inf
    best_eval_scores = None
    best_eval_model_path = None
    best_eval_processed = 0

    start_time = time.time()

    def check_eval(eval_treebank, ep, flag='dev'):
        # nonlocal best_eval_fscore
        # nonlocal best_eval_model_path
        # nonlocal best_eval_processed

        dev_start_time = time.time()

        eval_predicted = []
        for dev_start_index in range(0, len(eval_treebank),
                                     args.eval_batch_size):
            subbatch_trees = eval_treebank[dev_start_index:dev_start_index +
                                           args.eval_batch_size]
            subbatch_sentences = [[(leaf.tag, leaf.word)
                                   for leaf in tree.leaves()]
                                  for tree in subbatch_trees]
            predicted, _ = parser.parse_batch(subbatch_sentences)
            del _
            eval_predicted.extend([p.convert() for p in predicted])

        eval_fscore = evaluate.evalb(args.evalb_dir, eval_treebank,
                                     eval_predicted)

        logger.info(flag + ' eval '
                    'epoch {} '
                    "fscore {} "
                    "elapsed {} "
                    "total-elapsed {}".format(
                        ep,
                        eval_fscore,
                        format_elapsed(dev_start_time),
                        format_elapsed(start_time),
                    ))
        return eval_fscore

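    # Save a checkpoint whenever the dev fscore improves; the previous best
    # checkpoint is deleted first (when remove_model is True) to save disk space.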
    def save_model(eval_fscore, remove_model):
        nonlocal best_eval_fscore
        nonlocal best_eval_model_path
        nonlocal best_eval_processed
        nonlocal best_eval_scores

        if best_eval_model_path is not None:
            extensions = [".pt"]
            for ext in extensions:
                path = best_eval_model_path + ext
                if os.path.exists(path) and remove_model:
                    logger.info(
                        "Removing previous model file {}...".format(path))
                    os.remove(path)
        best_eval_fscore = eval_fscore.fscore
        best_eval_scores = eval_fscore
        best_eval_model_path = "{}_eval={:.2f}_{}".format(
            args.model_path_base, eval_fscore.fscore, now_time)
        best_eval_processed = total_processed
        logger.info(
            "Saving new best model to {}...".format(best_eval_model_path))
        torch.save(
            {
                'spec': parser.spec,
                'state_dict': parser.state_dict(),
                # 'trainer' : trainer.state_dict(),
            },
            best_eval_model_path + ".pt")

    for epoch in itertools.count(start=1):
        if args.epochs is not None and epoch > args.epochs:
            break

        np.random.shuffle(train_parse)
        epoch_start_time = time.time()

        for start_index in range(0, len(train_parse), args.batch_size):
            trainer.zero_grad()
            schedule_lr(total_processed // args.batch_size)

            batch_loss_value = 0.0
            batch_trees = train_parse[start_index:start_index +
                                      args.batch_size]
            batch_sentences = [[(leaf.tag, leaf.word)
                                for leaf in tree.leaves()]
                               for tree in batch_trees]
            batch_num_tokens = sum(
                len(sentence) for sentence in batch_sentences)

            for subbatch_sentences, subbatch_trees in parser.split_batch(
                    batch_sentences, batch_trees, args.subbatch_max_tokens):
                _, loss = parser.parse_batch(subbatch_sentences,
                                             subbatch_trees)

                if hparams.predict_tags:
                    loss = loss[0] / len(
                        batch_trees) + loss[1] / batch_num_tokens
                else:
                    loss = loss / len(batch_trees)
                loss_value = float(loss.data.cpu().numpy())
                batch_loss_value += loss_value
                if loss_value > 0:
                    loss.backward()
                del loss
                total_processed += len(subbatch_trees)
                current_processed += len(subbatch_trees)

            grad_norm = torch.nn.utils.clip_grad_norm_(clippable_parameters,
                                                       grad_clip_threshold)

            trainer.step()

            print("epoch {:,} "
                  "batch {:,}/{:,} "
                  "processed {:,} "
                  "batch-loss {:.4f} "
                  "grad-norm {:.4f} "
                  "epoch-elapsed {} "
                  "total-elapsed {}".format(
                      epoch,
                      start_index // args.batch_size + 1,
                      int(np.ceil(len(train_parse) / args.batch_size)),
                      total_processed,
                      batch_loss_value,
                      grad_norm,
                      format_elapsed(epoch_start_time),
                      format_elapsed(start_time),
                  ))

            if current_processed >= check_every:
                current_processed -= check_every
                dev_fscore = check_eval(dev_treebank, epoch, flag='dev')
                test_fscore = check_eval(test_treebank, epoch, flag='test')

                if dev_fscore.fscore > best_eval_fscore:
                    save_model(dev_fscore, remove_model=True)
                    test_fscore_on_dev = test_fscore

        # adjust learning rate at the end of an epoch
        if (total_processed // args.batch_size +
                1) > hparams.learning_rate_warmup_steps:
            scheduler.step(best_eval_fscore)
            if (total_processed - best_eval_processed) > args.patients \
                    + ((hparams.step_decay_patience + 1) * hparams.max_consecutive_decays * len(train_parse)):
                logger.info(
                    "Terminating due to lack of improvement in eval fscore.")
                logger.info("best dev {} test {}".format(
                    best_eval_scores,
                    test_fscore_on_dev,
                ))
                break
Exemple #19
0
def main():
    import optparse
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("-c",
                      dest="corpus",
                      help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha",
                      dest="alpha",
                      type="float",
                      help="parameter alpha",
                      default=numpy.random.gamma(1, 1))
    parser.add_option("--gamma",
                      dest="gamma",
                      type="float",
                      help="parameter gamma",
                      default=numpy.random.gamma(1, 1))
    parser.add_option("--beta",
                      dest="base",
                      type="float",
                      help="parameter of beta measure H",
                      default=0.5)
    parser.add_option("-k",
                      dest="K",
                      type="int",
                      help="initial number of topics",
                      default=1)
    parser.add_option("-i",
                      dest="iteration",
                      type="int",
                      help="iteration count",
                      default=10)
    parser.add_option("-s",
                      dest="stopwords",
                      type="int",
                      help="0=exclude stop words, 1=include stop words",
                      default=1)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df",
                      dest="df",
                      type="int",
                      help="threshold of document freaquency to cut words",
                      default=0)
    (options, args) = parser.parse_args()
    if not (options.filename or options.corpus):
        parser.error("need corpus filename(-f) or corpus range(-c)")
    if options.seed != None:
        numpy.random.seed(options.seed)

    import vocabulary
    if options.filename:
        corpus = vocabulary.load_file(options.filename)
    else:
        corpus = vocabulary.load_corpus(options.corpus)
        if not corpus: parser.error("corpus range(-c) forms 'start:end'")

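    # -s is an int flag here: 0 tells Vocabulary to exclude stop words, while
    # the default 1 keeps them.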
    voca = vocabulary.Vocabulary(options.stopwords == 0)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0: docs = voca.cut_low_freq(docs, options.df)

    hdplda = HDPLDA(options.alpha, options.gamma, options.base, docs,
                    voca.size())
    print "corpus=%d words=%d alpha=%.3f gamma=%.3f base=%.3f stopwords=%d" % (
        len(corpus), len(voca.vocas), options.alpha, options.gamma,
        options.base, options.stopwords)
    #hdplda.dump()

    #import cProfile
    #cProfile.runctx('hdplda_learning(hdplda, options.iteration)', globals(), locals(), 'hdplda.profile')
    hdplda_learning(hdplda, options.iteration)
    """
Exemple #20
0
def main():
    import os
    import pickle
    import optparse

    parser = optparse.OptionParser()
    parser.add_option("-m", dest="model", help="model filename")
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("-b",
                      dest="corpus",
                      help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha",
                      dest="alpha",
                      type="float",
                      help="parameter alpha",
                      default=0.1)
    parser.add_option("--beta",
                      dest="beta",
                      type="float",
                      help="parameter beta",
                      default=0.01)
    parser.add_option("--eta",
                      dest="eta",
                      type="float",
                      help="parameter eta",
                      default=100)
    parser.add_option("-k",
                      dest="K",
                      type="int",
                      help="number of topics",
                      default=20)
    parser.add_option("-i",
                      dest="iteration",
                      type="int",
                      help="iteration count",
                      default=10)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df",
                      dest="df",
                      type="int",
                      help="threshold of document freaquency to cut words",
                      default=0)
    parser.add_option(
        "-c",
        dest="constraint",
        help="add constraint (wordlist which should belong to the same topic)")
    parser.add_option("-u",
                      "--unassign",
                      dest="unassign",
                      help="unassign method (all/doc/term/none)",
                      default="none")
    (options, args) = parser.parse_args()

    numpy.random.seed(options.seed)

    if options.model and os.path.exists(options.model):
        with open(options.model, "rb") as f:
            lda, voca = pickle.load(f)
    elif not (options.filename or options.corpus):
        parser.error(
            "need corpus filename(-f) or corpus range(-b) or model(-m)")
    else:
        import vocabulary
        if options.filename:
            corpus = vocabulary.load_file(options.filename)
        else:
            corpus = vocabulary.load_corpus(options.corpus)
            if not corpus: parser.error("corpus range(-c) forms 'start:end'")
        voca = vocabulary.Vocabulary()
        docs = [voca.doc_to_ids(doc) for doc in corpus]
        if options.df > 0: docs = voca.cut_low_freq(docs, options.df)
        lda = ITM(options.K, options.alpha, options.beta, options.eta, docs,
                  voca.size())
    param = (len(lda.docs), len(voca.vocas), options.K, options.alpha,
             options.beta, options.eta)
    print "corpus=%d, words=%d, K=%d, a=%f, b=%f, eta=%f" % param

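    # Interactive topic constraint: the -c wordlist is mapped to vocabulary ids
    # and added as a must-link constraint; -u selects how existing topic
    # assignments are unassigned first (all/doc/term/none).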
    if options.constraint:
        if options.unassign == "all":
            add_constraint = lda.add_constraint_all
        elif options.unassign == "doc":
            add_constraint = lda.add_constraint_doc
        elif options.unassign == "term":
            add_constraint = lda.add_constraint_term
        elif options.unassign == "none":
            add_constraint = lda.add_constraint_none
        else:
            parser.error("unassign method(-u) must be all/doc/term/none")

        wordlist = options.constraint.split(',')
        idlist = [voca.vocas_id[w] for w in wordlist]

        print "\n== add constraint =="
        for w, v in zip(idlist, wordlist):
            print "%s [%s]" % (v, ",".join(str(x) for x in lda.n_k_w[:, w]))

        add_constraint(idlist)

        lda.verify_topic()

    #import cProfile
    #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile')
    lda_learning(lda, options.iteration, voca)

    with open(options.model, "wb") as f:
        pickle.dump((lda, voca), f)
Exemple #21
0
def run_train(args, hparams):
    if args.numpy_seed is not None:
        print("Setting numpy random seed to {}...".format(args.numpy_seed))
        np.random.seed(args.numpy_seed)

    # Make sure that pytorch is actually being initialized randomly.
    # On my cluster I was getting highly correlated results from multiple
    # runs, but calling reset_parameters() changed that. A brief look at the
    # pytorch source code revealed that pytorch initializes its RNG by
    # calling std::random_device, which according to the C++ spec is allowed
    # to be deterministic.
    seed_from_numpy = np.random.randint(2147483648)
    print("Manual seed for pytorch:", seed_from_numpy)
    torch.manual_seed(seed_from_numpy)

    hparams.set_from_args(args)
    print("Hyperparameters:")
    hparams.print()

    print("Loading training trees from {}...".format(args.train_path))
    if hparams.predict_tags and args.train_path.endswith('10way.clean'):
        print("WARNING: The data distributed with this repository contains "
              "predicted part-of-speech tags only (not gold tags!) We do not "
              "recommend enabling predict_tags in this configuration.")
    train_treebank = trees.load_trees(args.train_path)
    if hparams.max_len_train > 0:
        train_treebank = [
            tree for tree in train_treebank
            if len(list(tree.leaves())) <= hparams.max_len_train
        ]
    print("Loaded {:,} training examples.".format(len(train_treebank)))

    print("Loading development trees from {}...".format(args.dev_path))
    dev_treebank = trees.load_trees(args.dev_path)
    if hparams.max_len_dev > 0:
        dev_treebank = [
            tree for tree in dev_treebank
            if len(list(tree.leaves())) <= hparams.max_len_dev
        ]
    print("Loaded {:,} development examples.".format(len(dev_treebank)))

    print("Processing trees for training...")
    train_parse = [tree.convert() for tree in train_treebank]

    print("Constructing vocabularies...")

    tag_vocab = vocabulary.Vocabulary()
    tag_vocab.index(tokens.START)
    tag_vocab.index(tokens.STOP)
    tag_vocab.index(tokens.TAG_UNK)

    word_vocab = vocabulary.Vocabulary()
    word_vocab.index(tokens.START)
    word_vocab.index(tokens.STOP)
    word_vocab.index(tokens.UNK)

    label_vocab = vocabulary.Vocabulary()
    label_vocab.index(())

    char_set = set()

    for tree in train_parse:
        nodes = [tree]
        while nodes:
            node = nodes.pop()
            if isinstance(node, trees.InternalParseNode):
                label_vocab.index(node.label)
                nodes.extend(reversed(node.children))
            else:
                tag_vocab.index(node.tag)
                word_vocab.index(node.word)
                char_set |= set(node.word)

    char_vocab = vocabulary.Vocabulary()

    # If codepoints are small (e.g. Latin alphabet), index by codepoint directly
    highest_codepoint = max(ord(char) for char in char_set)
    if highest_codepoint < 512:
        if highest_codepoint < 256:
            highest_codepoint = 256
        else:
            highest_codepoint = 512

        # This also takes care of constants like tokens.CHAR_PAD
        for codepoint in range(highest_codepoint):
            char_index = char_vocab.index(chr(codepoint))
            assert char_index == codepoint
    else:
        char_vocab.index(tokens.CHAR_UNK)
        char_vocab.index(tokens.CHAR_START_SENTENCE)
        char_vocab.index(tokens.CHAR_START_WORD)
        char_vocab.index(tokens.CHAR_STOP_WORD)
        char_vocab.index(tokens.CHAR_STOP_SENTENCE)
        for char in sorted(char_set):
            char_vocab.index(char)

    tag_vocab.freeze()
    word_vocab.freeze()
    label_vocab.freeze()
    char_vocab.freeze()

    def print_vocabulary(name, vocab):
        special = {tokens.START, tokens.STOP, tokens.UNK}
        print("{} ({:,}): {}".format(
            name, vocab.size,
            sorted(value for value in vocab.values if value in special) +
            sorted(value for value in vocab.values if value not in special)))

    if args.print_vocabs:
        print_vocabulary("Tag", tag_vocab)
        print_vocabulary("Word", word_vocab)
        print_vocabulary("Label", label_vocab)

    print("Initializing model...")

    load_path = None
    if load_path is not None:
        print(f"Loading parameters from {load_path}")
        info = torch_load(load_path)
        parser = parse_nk.NKChartParser.from_spec(info['spec'],
                                                  info['state_dict'])
    else:
        parser = parse_nk.NKChartParser(
            tag_vocab,
            word_vocab,
            label_vocab,
            char_vocab,
            hparams,
        )

    print("Initializing optimizer...")
    trainable_parameters = [
        param for param in parser.parameters() if param.requires_grad
    ]
    trainer = torch.optim.Adam(trainable_parameters,
                               lr=1.,
                               betas=(0.9, 0.98),
                               eps=1e-9)
    if load_path is not None:
        trainer.load_state_dict(info['trainer'])

    def set_lr(new_lr):
        for param_group in trainer.param_groups:
            param_group['lr'] = new_lr

    assert hparams.step_decay, "Only step_decay schedule is supported"

    warmup_coeff = hparams.learning_rate / hparams.learning_rate_warmup_steps
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        trainer,
        'max',
        factor=hparams.step_decay_factor,
        patience=hparams.step_decay_patience,
        verbose=True,
    )

    def schedule_lr(iteration):
        iteration = iteration + 1
        if iteration <= hparams.learning_rate_warmup_steps:
            set_lr(iteration * warmup_coeff)

    clippable_parameters = trainable_parameters
    grad_clip_threshold = np.inf if hparams.clip_grad_norm == 0 else hparams.clip_grad_norm

    print("Training...")
    total_processed = 0
    current_processed = 0
    check_every = len(train_parse) / args.checks_per_epoch
    best_dev_fscore = -np.inf
    best_dev_model_path = None
    best_dev_processed = 0

    start_time = time.time()

    def check_dev():
        nonlocal best_dev_fscore
        nonlocal best_dev_model_path
        nonlocal best_dev_processed

        dev_start_time = time.time()

        dev_predicted = []
        for dev_start_index in range(0, len(dev_treebank),
                                     args.eval_batch_size):
            subbatch_trees = dev_treebank[dev_start_index:dev_start_index +
                                          args.eval_batch_size]
            subbatch_sentences = [[(leaf.tag, leaf.word)
                                   for leaf in tree.leaves()]
                                  for tree in subbatch_trees]
            predicted, _ = parser.parse_batch(subbatch_sentences)
            del _
            dev_predicted.extend([p.convert() for p in predicted])

        dev_fscore = evaluate.evalb(args.evalb_dir, dev_treebank,
                                    dev_predicted)

        print("dev-fscore {} "
              "dev-elapsed {} "
              "total-elapsed {}".format(
                  dev_fscore,
                  format_elapsed(dev_start_time),
                  format_elapsed(start_time),
              ))

        if dev_fscore.fscore > best_dev_fscore:
            if best_dev_model_path is not None:
                extensions = [".pt"]
                for ext in extensions:
                    path = best_dev_model_path + ext
                    if os.path.exists(path):
                        print(
                            "Removing previous model file {}...".format(path))
                        os.remove(path)

            best_dev_fscore = dev_fscore.fscore
            best_dev_model_path = "{}_dev={:.2f}".format(
                args.model_path_base, dev_fscore.fscore)
            best_dev_processed = total_processed
            print("Saving new best model to {}...".format(best_dev_model_path))
            torch.save(
                {
                    'spec': parser.spec,
                    'state_dict': parser.state_dict(),
                    'trainer': trainer.state_dict(),
                }, best_dev_model_path + ".pt")

    for epoch in itertools.count(start=1):
        if args.epochs is not None and epoch > args.epochs:
            break

        np.random.shuffle(train_parse)
        epoch_start_time = time.time()

        for start_index in range(0, len(train_parse), args.batch_size):
            trainer.zero_grad()
            schedule_lr(total_processed // args.batch_size)

            batch_loss_value = 0.0
            batch_trees = train_parse[start_index:start_index +
                                      args.batch_size]
            batch_sentences = [[(leaf.tag, leaf.word)
                                for leaf in tree.leaves()]
                               for tree in batch_trees]
            batch_num_tokens = sum(
                len(sentence) for sentence in batch_sentences)

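            # Split the batch into sub-batches capped at subbatch_max_tokens so
            # gradients can be accumulated without exhausting memory; the loss
            # is still normalized by the size of the full batch.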
            for subbatch_sentences, subbatch_trees in parser.split_batch(
                    batch_sentences, batch_trees, args.subbatch_max_tokens):
                _, loss = parser.parse_batch(subbatch_sentences,
                                             subbatch_trees)

                if hparams.predict_tags:
                    loss = loss[0] / len(
                        batch_trees) + loss[1] / batch_num_tokens
                else:
                    loss = loss / len(batch_trees)
                loss_value = float(loss.data.cpu().numpy())
                batch_loss_value += loss_value
                if loss_value > 0:
                    loss.backward()
                del loss
                total_processed += len(subbatch_trees)
                current_processed += len(subbatch_trees)

            grad_norm = torch.nn.utils.clip_grad_norm_(clippable_parameters,
                                                       grad_clip_threshold)

            trainer.step()

            print("epoch {:,} "
                  "batch {:,}/{:,} "
                  "processed {:,} "
                  "batch-loss {:.4f} "
                  "grad-norm {:.4f} "
                  "epoch-elapsed {} "
                  "total-elapsed {}".format(
                      epoch,
                      start_index // args.batch_size + 1,
                      int(np.ceil(len(train_parse) / args.batch_size)),
                      total_processed,
                      batch_loss_value,
                      grad_norm,
                      format_elapsed(epoch_start_time),
                      format_elapsed(start_time),
                  ))

            if current_processed >= check_every:
                current_processed -= check_every
                check_dev()

        # adjust learning rate at the end of an epoch
        if (total_processed // args.batch_size +
                1) > hparams.learning_rate_warmup_steps:
            scheduler.step(best_dev_fscore)
            if (total_processed - best_dev_processed) > (
                (hparams.step_decay_patience + 1) *
                    hparams.max_consecutive_decays * len(train_parse)):
                print("Terminating due to lack of improvement in dev fscore.")
                break
Exemple #22
0
    'images/9.JPG',
    'images/10.JPG',
]

nbr_images = len(imlist)

featlist = [imlist[i][:-3] + 'sift' for i in range(nbr_images)]

for i in range(nbr_images):
    # print featlist[i]
    # print imlist[i]
    sift.process_image(imlist[i], featlist[i])
"""

imagename = "/opt/cv/images/2603.JPG"
from PIL import Image
im = Image.open(imagename).convert('L')
im.save('/opt/cv/images/tmp.pgm')
sift /opt/cv/images/tmp.pgm  --output /opt/cv/images/2603.sift --edge-thresh 10 --peak-thresh 5
"""
# print "ok"
# exit()

voc = vocabulary.Vocabulary('ukbenchtest')
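# Train a visual bag-of-words vocabulary from the SIFT feature files; the
# arguments are presumably the number of visual words (1000) and a descriptor
# subsampling factor (10).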
voc.train(featlist, 1000, 10)

with open('vocabulary.pkl', 'wb') as f:
    pickle.dump(voc, f)

print('vocabulary is:', voc.name, voc.nbr_words)
positivos = []
for file in dir:
    w = open("learn2/positivos/" + file).read()
    positivos.append(w)

nn_corpus = [sentence.split(' ') for sentence in positivos]
nnn_corpus = [sentence.split(' ') for sentence in negativos]
nn_corpus += nnn_corpus

classes = []
for i in range(50):
    classes.append(1)
for i in range(50):
    classes.append(0)

voca = vocabulary.Vocabulary("stopwords.txt")
docs = [voca.doc_to_ids(doc) for doc in nn_corpus]

NB = bernoulliNB.BernoulliNB(voca, docs, classes)


def gg():
    print ":B"


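# Classify a raw comment: tokens are mapped to known vocabulary ids
# (doc_to_ids_no_add presumably skips unseen words instead of growing the
# vocabulary) and the Bernoulli Naive Bayes model is applied.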
def classify(comment):
    comment = comment.split(' ')
    tst_bow = voca.doc_to_ids_no_add(comment)
    print(tst_bow)
    return NB.apply(classes, voca, tst_bow)
    def shared_tree_constrained_inference(self, testing_hps):

        np.set_printoptions(precision=4, suppress=True)

        testing_sent_tokens_1 = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        testing_sent_tokens_1 = [str(t) for t in testing_sent_tokens_1]
        testing_sent_edu_ids_1 = [0, 0, 1, 1, 2, 2, 3, 3, 4, 4]
        testing_sent_parent_ids_1 = [1, 1, 2, 2, -1, -1, 2, 2, 3, 3]

        testing_sent_tokens_2 = [1, 2, 3, 4, 5, 6, 7, 8, 9]
        testing_sent_tokens_2 = [str(t) for t in testing_sent_tokens_2]
        testing_sent_edu_ids_2 = [5, 5, 5, 6, 7, 7, 7, 8, 8]
        testing_sent_parent_ids_2 = [6, 6, 6, -1, 6, 6, 6, 6, 6]

        dummy_abstract_sentence_1 = [
            "0", "0", "0", "0", "0", "0", "0", "0", "0", "0"
        ]

        dummy_abstract_sentence_2 = ["0", "0", "0"]

        extract_labels_1 = [0 for _ in testing_sent_tokens_1]
        extract_labels_2 = [0 for _ in testing_sent_tokens_2]

        vocab_word_indices = range(11)
        dummy_vocab_indices = dict([(str(i), i) for i in vocab_word_indices])
        stem_indices = dummy_vocab_indices
        word_stems = dict([(str(w), str(w)) for w in vocab_word_indices])
        is_stop = [False for _ in vocab_word_indices]
        vocab = vocabulary.Vocabulary(dummy_vocab_indices, stem_indices,
                                      word_stems, is_stop,
                                      testing_hps.vocab_size)

        if testing_hps.single_sentence_concat:
            combined_toks = [["0"] + testing_sent_tokens_1 +
                             testing_sent_tokens_2]
            combined_labels = [[1] + extract_labels_1 + extract_labels_2]
            combined_edu_ids = [
                -1
            ] + testing_sent_edu_ids_1 + testing_sent_edu_ids_2
            combined_parent_ids = [
                -2
            ] + testing_sent_parent_ids_1 + testing_sent_parent_ids_2
            combined_edu_ids = [[i + 1 for i in combined_edu_ids]]
            combined_parent_ids = [[i + 1 for i in combined_parent_ids]]
            nyt_ex_1 = data.SummaryExample(0, combined_toks, combined_edu_ids,
                                           combined_parent_ids,
                                           combined_labels,
                                           dummy_abstract_sentence_1)
            nyt_ex_2 = data.SummaryExample(1, combined_toks, combined_edu_ids,
                                           combined_parent_ids,
                                           combined_labels,
                                           dummy_abstract_sentence_2)
        else:
            nyt_ex_1 = data.SummaryExample(
                0, [testing_sent_tokens_1, testing_sent_tokens_2],
                [testing_sent_edu_ids_1, testing_sent_edu_ids_2],
                [testing_sent_parent_ids_1, testing_sent_parent_ids_2],
                [extract_labels_1, extract_labels_2],
                dummy_abstract_sentence_1)

            nyt_ex_2 = data.SummaryExample(
                1, [testing_sent_tokens_1, testing_sent_tokens_2],
                [testing_sent_edu_ids_1, testing_sent_edu_ids_2],
                [testing_sent_parent_ids_1, testing_sent_parent_ids_2],
                [extract_labels_1, extract_labels_2],
                dummy_abstract_sentence_2)

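        # Run tree-constrained inference on the two toy examples and check that
        # the BP marginals match sampled estimates, that d(logZ)/d(logits)
        # recovers the marginals, and that logZ equals log(3) for the k=3 case.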
        with self.test_session() as session:

            tf.set_random_seed(12)

            model_inp = ffttci.TreeInferenceInputs(testing_hps)

            ex_batch = ffttci.TreeInferenceBatch(testing_hps, model_inp,
                                                 [nyt_ex_1, nyt_ex_2], vocab)

            inferencer = ffttci.TreeConstrainedInferencer()

            logit_shape = [testing_hps.batch_size, testing_hps.num_art_steps]

            word_logits = tf.constant(np.full(logit_shape, 0.0),
                                      dtype=tf.float32,
                                      shape=logit_shape)

            margs, samples, logz = inferencer.do_tree_inference(
                testing_hps, model_inp, word_logits)

            margs = tf.reshape(margs, [testing_hps.batch_size, -1])
            grad_logz = tf.gradients(logz, word_logits)[0]

            margs_np, samples_np, logz_np, grad_logz_np = session.run(
                [margs, samples, logz, grad_logz], ex_batch.feeds)

            emp_marg = np.average(samples_np, axis=1)
            emp_marg = np.reshape(emp_marg, [testing_hps.batch_size, -1])

            # sampled marginals should be pretty close to marginals calculated from BP
            self.assertNDArrayNear(margs_np, emp_marg, 0.05)
            # gradient of logz should be _very_ close to marginals calculated from BP
            self.assertNDArrayNear(margs_np, grad_logz_np, 0.001)
            # for k=3 example, logz should equal log(3)
            self.assertNear(1.08961229, logz_np[1], 0.01)
Exemple #25
0
def run_train(args, hparams):
    if args.numpy_seed is not None:
        print("Setting numpy random seed to {}...".format(args.numpy_seed))
        np.random.seed(args.numpy_seed)

    # Make sure that pytorch is actually being initialized randomly.
    # On my cluster I was getting highly correlated results from multiple
    # runs, but calling reset_parameters() changed that. A brief look at the
    # pytorch source code revealed that pytorch initializes its RNG by
    # calling std::random_device, which according to the C++ spec is allowed
    # to be deterministic.
    seed_from_numpy = np.random.randint(2147483648)
    print("Manual seed for pytorch:", seed_from_numpy)
    torch.manual_seed(seed_from_numpy)

    hparams.set_from_args(args)
    print("Hyperparameters:")
    hparams.print()

    train_path = args.train_ptb_path
    dev_path = args.dev_ptb_path

    dep_train_path = args.dep_train_ptb_path
    dep_dev_path = args.dep_dev_ptb_path

    if hparams.dataset == 'ctb':
        train_path = args.train_ctb_path
        dev_path = args.dev_ctb_path

        dep_train_path = args.dep_train_ctb_path
        dep_dev_path = args.dep_dev_ctb_path

    dep_reader = CoNLLXReader(dep_train_path)
    print('Reading dependency parsing data from %s' % dep_train_path)

    dep_dev_reader = CoNLLXReader(dep_dev_path)
    print('Reading dependency parsing data from %s' % dep_dev_path)


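    # Read the CoNLL-X dependency training data into parallel lists of words,
    # POS tags, head indices and relation types, skipping sentences longer than
    # max_len_train.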
    counter = 0
    dep_sentences = []
    dep_data = []
    dep_heads = []
    dep_types = []
    inst = dep_reader.getNext()
    while inst is not None:

        inst_size = inst.length()
        if hparams.max_len_train > 0 and inst_size - 1 > hparams.max_len_train:
            inst = dep_reader.getNext()
            continue

        counter += 1
        if counter % 10000 == 0:
            print("reading data: %d" % counter)
        sent = inst.sentence
        dep_data.append((sent.words, inst.postags, inst.heads, inst.types))
        #dep_sentences.append([(tag, word) for i, (word, tag) in enumerate(zip(sent.words, sent.postags))])
        dep_sentences.append(sent.words)
        dep_heads.append(inst.heads)
        dep_types.append(inst.types)
        inst = dep_reader.getNext()
    dep_reader.close()
    print("Total number of data: %d" % counter)

    dep_dev_data = []
    dev_inst = dep_dev_reader.getNext()
    dep_dev_headid = np.zeros([3000, 300], dtype=int)
    dep_dev_type = []
    dep_dev_word = []
    dep_dev_pos = []
    dep_dev_lengs = np.zeros(3000, dtype=int)
    cun = 0
    while dev_inst is not None:
        inst_size = dev_inst.length()
        if hparams.max_len_dev > 0 and inst_size - 1 > hparams.max_len_dev:
            dev_inst = dep_dev_reader.getNext()
            continue
        dep_dev_lengs[cun] = inst_size
        sent = dev_inst.sentence
        dep_dev_data.append((sent.words, dev_inst.postags, dev_inst.heads, dev_inst.types))
        for i in range(inst_size):
            dep_dev_headid[cun][i] = dev_inst.heads[i]

        dep_dev_type.append(dev_inst.types)
        dep_dev_word.append(sent.words)
        dep_dev_pos.append(sent.postags)
        #dep_sentences.append([(tag, word) for i, (word, tag) in enumerate(zip(sent.words, sent.postags))])
        dev_inst = dep_dev_reader.getNext()
        cun = cun + 1
    dep_dev_reader.close()


    print("Loading training trees from {}...".format(train_path))
    train_treebank = trees.load_trees(train_path, dep_heads, dep_types, dep_sentences)
    if hparams.max_len_train > 0:
        train_treebank = [tree for tree in train_treebank if len(list(tree.leaves())) <= hparams.max_len_train]
    print("Loaded {:,} training examples.".format(len(train_treebank)))

    print("Loading development trees from {}...".format(dev_path))
    dev_treebank = trees.load_trees(dev_path, dep_dev_headid, dep_dev_type, dep_dev_word)
    if hparams.max_len_dev > 0:
        dev_treebank = [tree for tree in dev_treebank if len(list(tree.leaves())) <= hparams.max_len_dev]
    print("Loaded {:,} development examples.".format(len(dev_treebank)))


    print("Processing trees for training...")
    train_parse = [tree.convert() for tree in train_treebank]
    dev_parse = [tree.convert() for tree in dev_treebank]

    count_wh("train data:", train_parse, dep_heads, dep_types)
    count_wh("dev data:", dev_parse, dep_dev_headid, dep_dev_type)

    print("Constructing vocabularies...")

    tag_vocab = vocabulary.Vocabulary()
    tag_vocab.index(Zparser.START)
    tag_vocab.index(Zparser.STOP)
    tag_vocab.index(Zparser.TAG_UNK)

    word_vocab = vocabulary.Vocabulary()
    word_vocab.index(Zparser.START)
    word_vocab.index(Zparser.STOP)
    word_vocab.index(Zparser.UNK)

    label_vocab = vocabulary.Vocabulary()
    label_vocab.index(())
    sublabels = [Zparser.Sub_Head]
    label_vocab.index(tuple(sublabels))

    type_vocab = vocabulary.Vocabulary()

    char_set = set()

    for i, tree in enumerate(train_parse):

        const_sentences = [leaf.word for leaf in tree.leaves()]
        assert len(const_sentences) == len(dep_sentences[i])
        nodes = [tree]
        while nodes:
            node = nodes.pop()
            if isinstance(node, trees.InternalParseNode):
                label_vocab.index(node.label)
                if node.type is not Zparser.ROOT:  # do not include the root type
                    type_vocab.index(node.type)
                nodes.extend(reversed(node.children))
            else:
                tag_vocab.index(node.tag)
                word_vocab.index(node.word)
                type_vocab.index(node.type)
                char_set |= set(node.word)

    char_vocab = vocabulary.Vocabulary()

    #char_vocab.index(tokens.CHAR_PAD)

    # If codepoints are small (e.g. Latin alphabet), index by codepoint directly
    highest_codepoint = max(ord(char) for char in char_set)
    if highest_codepoint < 512:
        if highest_codepoint < 256:
            highest_codepoint = 256
        else:
            highest_codepoint = 512

        # This also takes care of constants like tokens.CHAR_PAD
        for codepoint in range(highest_codepoint):
            char_index = char_vocab.index(chr(codepoint))
            assert char_index == codepoint
    else:
        char_vocab.index(tokens.CHAR_UNK)
        char_vocab.index(tokens.CHAR_START_SENTENCE)
        char_vocab.index(tokens.CHAR_START_WORD)
        char_vocab.index(tokens.CHAR_STOP_WORD)
        char_vocab.index(tokens.CHAR_STOP_SENTENCE)
        for char in sorted(char_set):
            char_vocab.index(char)

    tag_vocab.freeze()
    word_vocab.freeze()
    label_vocab.freeze()
    char_vocab.freeze()
    type_vocab.freeze()

    punctuation = hparams.punctuation
    punct_set = punctuation

    def print_vocabulary(name, vocab):
        special = {tokens.START, tokens.STOP, tokens.UNK}
        print("{} ({:,}): {}".format(
            name, vocab.size,
            sorted(value for value in vocab.values if value in special) +
            sorted(value for value in vocab.values if value not in special)))

    if args.print_vocabs:
        print_vocabulary("Tag", tag_vocab)
        print_vocabulary("Word", word_vocab)
        print_vocabulary("Label", label_vocab)
        print_vocabulary("Char", char_vocab)
        print_vocabulary("Type", type_vocab)


    print("Initializing model...")

    load_path = None
    if load_path is not None:
        print(f"Loading parameters from {load_path}")
        info = torch_load(load_path)
        parser = Zparser.ChartParser.from_spec(info['spec'], info['state_dict'])
    else:
        parser = Zparser.ChartParser(
            tag_vocab,
            word_vocab,
            label_vocab,
            char_vocab,
            type_vocab,
            hparams,
        )

    print("Initializing optimizer...")
    trainable_parameters = [param for param in parser.parameters() if param.requires_grad]
    trainer = torch.optim.Adam(trainable_parameters, lr=1., betas=(0.9, 0.98), eps=1e-9)
    if load_path is not None:
        trainer.load_state_dict(info['trainer'])

    def set_lr(new_lr):
        for param_group in trainer.param_groups:
            param_group['lr'] = new_lr

    assert hparams.step_decay, "Only step_decay schedule is supported"

    warmup_coeff = hparams.learning_rate / hparams.learning_rate_warmup_steps
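    # Linear warmup: schedule_lr() below raises the learning rate from ~0 up to
    # hparams.learning_rate over learning_rate_warmup_steps optimizer steps;
    # after warmup, ReduceLROnPlateau shrinks it by step_decay_factor whenever
    # the tracked dev score stops improving for step_decay_patience checks.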
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        trainer, 'max',
        factor=hparams.step_decay_factor,
        patience=hparams.step_decay_patience,
        verbose=True,
    )
    def schedule_lr(iteration):
        iteration = iteration + 1
        if iteration <= hparams.learning_rate_warmup_steps:
            set_lr(iteration * warmup_coeff)

    clippable_parameters = trainable_parameters
    grad_clip_threshold = np.inf if hparams.clip_grad_norm == 0 else hparams.clip_grad_norm


    print("Training...")
    total_processed = 0
    current_processed = 0
    check_every = len(train_parse) / args.checks_per_epoch
    best_dev_score = -np.inf
    best_model_path = None
    model_name = hparams.model_name

    print("This is ", model_name)
    start_time = time.time()

    def check_dev(epoch_num):
        nonlocal best_dev_score
        nonlocal best_model_path

        dev_start_time = time.time()

        parser.eval()

        dev_predicted = []

        for dev_start_index in range(0, len(dev_treebank), args.eval_batch_size):
            subbatch_trees = dev_treebank[dev_start_index:dev_start_index+args.eval_batch_size]
            subbatch_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()] for tree in subbatch_trees]

            predicted, _ = parser.parse_batch(subbatch_sentences)
            del _

            dev_predicted.extend([p.convert() for p in predicted])

        dev_fscore = evaluate.evalb(args.evalb_dir, dev_treebank, dev_predicted)

        print(
            "dev-fscore {} "
            "dev-elapsed {} "
            "total-elapsed {}".format(
                dev_fscore,
                format_elapsed(dev_start_time),
                format_elapsed(start_time),
            )
        )

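        # Read the predicted dependency head and relation type off each leaf of the
        # predicted trees and score them against the gold dependency annotations
        # (with and without punctuation, plus root-attachment accuracy).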
        dev_pred_head = [[leaf.father for leaf in tree.leaves()] for tree in dev_predicted]
        dev_pred_type = [[leaf.type for leaf in tree.leaves()] for tree in dev_predicted]
        assert len(dev_pred_head) == len(dev_pred_type)
        assert len(dev_pred_type) == len(dep_dev_type)
        stats, stats_nopunc, stats_root, num_inst = dep_eval.eval(len(dev_pred_head), dep_dev_word, dep_dev_pos,
                                                                  dev_pred_head, dev_pred_type,
                                                                  dep_dev_headid, dep_dev_type,
                                                                  dep_dev_lengs, punct_set=punct_set,
                                                                  symbolic_root=False)
        dev_ucorr, dev_lcorr, dev_total, dev_ucomplete, dev_lcomplete = stats
        dev_ucorr_nopunc, dev_lcorr_nopunc, dev_total_nopunc, dev_ucomplete_nopunc, dev_lcomplete_nopunc = stats_nopunc
        dev_root_corr, dev_total_root = stats_root
        dev_total_inst = num_inst
        print(
            'W. Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%%' % (
                dev_ucorr, dev_lcorr, dev_total, dev_ucorr * 100 / dev_total, dev_lcorr * 100 / dev_total,
                dev_ucomplete * 100 / dev_total_inst, dev_lcomplete * 100 / dev_total_inst))
        print(
            'Wo Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%%' % (
                dev_ucorr_nopunc, dev_lcorr_nopunc, dev_total_nopunc,
                dev_ucorr_nopunc * 100 / dev_total_nopunc,
                dev_lcorr_nopunc * 100 / dev_total_nopunc,
                dev_ucomplete_nopunc * 100 / dev_total_inst, dev_lcomplete_nopunc * 100 / dev_total_inst))
        print('Root: corr: %d, total: %d, acc: %.2f%%' % (
            dev_root_corr, dev_total_root, dev_root_corr * 100 / dev_total_root))

        dev_uas = dev_ucorr_nopunc * 100 / dev_total_nopunc
        dev_las = dev_lcorr_nopunc * 100 / dev_total_nopunc

        if dev_fscore.fscore + dev_las > best_dev_score:
            if best_model_path is not None:
                extensions = [".pt"]
                for ext in extensions:
                    path = best_model_path + ext
                    if os.path.exists(path):
                        print("Removing previous model file {}...".format(path))
                        os.remove(path)

            best_dev_score = dev_fscore.fscore + dev_las
            best_model_path = "{}_best_dev={:.2f}_devuas={:.2f}_devlas={:.2f}".format(
                args.model_path_base, dev_fscore.fscore, dev_uas,dev_las)
            print("Saving new best model to {}...".format(best_model_path))
            torch.save({
                'spec': parser.spec,
                'state_dict': parser.state_dict(),
                'trainer': trainer.state_dict(),
                }, best_model_path + ".pt")


    for epoch in itertools.count(start=1):
        if args.epochs is not None and epoch > args.epochs:
            break
        #check_dev(epoch)
        np.random.shuffle(train_parse)
        epoch_start_time = time.time()

        for start_index in range(0, len(train_parse), args.batch_size):
            trainer.zero_grad()
            schedule_lr(total_processed // args.batch_size)

            parser.train()

            batch_loss_value = 0.0
            batch_trees = train_parse[start_index:start_index + args.batch_size]

            batch_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()] for tree in batch_trees]
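            # Gradient accumulation: the batch is split into sub-batches capped at
            # args.subbatch_max_tokens; each sub-batch contributes its share of the
            # per-batch loss to the gradients, and one optimizer step follows after
            # gradient clipping.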
            for subbatch_sentences, subbatch_trees in parser.split_batch(batch_sentences, batch_trees, args.subbatch_max_tokens):
                _, loss = parser.parse_batch(subbatch_sentences, subbatch_trees)

                loss = loss / len(batch_trees)
                loss_value = float(loss.data.cpu().numpy())
                batch_loss_value += loss_value
                if loss_value > 0:
                    loss.backward()
                del loss
                total_processed += len(subbatch_trees)
                current_processed += len(subbatch_trees)

            grad_norm = torch.nn.utils.clip_grad_norm_(clippable_parameters, grad_clip_threshold)

            trainer.step()

            print(
                "epoch {:,} "
                "batch {:,}/{:,} "
                "processed {:,} "
                "batch-loss {:.4f} "
                "grad-norm {:.4f} "
                "epoch-elapsed {} "
                "total-elapsed {}".format(
                    epoch,
                    start_index // args.batch_size + 1,
                    int(np.ceil(len(train_parse) / args.batch_size)),
                    total_processed,
                    batch_loss_value,
                    grad_norm,
                    format_elapsed(epoch_start_time),
                    format_elapsed(start_time),
                )
            )

            if current_processed >= check_every:
                current_processed -= check_every
                check_dev(epoch)

        # adjust learning rate at the end of an epoch
        if hparams.step_decay:
            if (total_processed // args.batch_size + 1) > hparams.learning_rate_warmup_steps:
                scheduler.step(best_dev_score)
def run_train(args, hparams):
    if args.numpy_seed is not None:
        print("Setting numpy random seed to {}...".format(args.numpy_seed))
        np.random.seed(args.numpy_seed)

    seed_from_numpy = np.random.randint(2147483648)
    print("Manual seed for pytorch:", seed_from_numpy)
    torch.manual_seed(seed_from_numpy)

    hparams.set_from_args(args)
    print("Hyperparameters:")
    hparams.print()

    train_path = args.train_ptb_path
    dev_path = args.dev_ptb_path

    if hparams.dataset == "ctb":
        train_path = args.train_ctb_path
        dev_path = args.dev_ctb_path

    print("Loading training trees from {}...".format(train_path))
    train_treebank = trees.load_trees(train_path)
    if hparams.max_len_train > 0:
        train_treebank = [
            tree for tree in train_treebank
            if len(list(tree.leaves())) <= hparams.max_len_train
        ]
    print("Loaded {:,} training examples.".format(len(train_treebank)))

    print("Loading development trees from {}...".format(dev_path))
    dev_treebank = trees.load_trees(dev_path)
    if hparams.max_len_dev > 0:
        dev_treebank = [
            tree for tree in dev_treebank
            if len(list(tree.leaves())) <= hparams.max_len_dev
        ]
    print("Loaded {:,} development examples.".format(len(dev_treebank)))

    print("Processing trees for training...")
    train_parse = [tree.convert() for tree in train_treebank]
    dev_parse = [tree.convert() for tree in dev_treebank]

    print("Constructing vocabularies...")

    tag_vocab = vocabulary.Vocabulary()
    tag_vocab.index(Lparser.START)
    tag_vocab.index(Lparser.STOP)
    tag_vocab.index(Lparser.TAG_UNK)

    word_vocab = vocabulary.Vocabulary()
    word_vocab.index(Lparser.START)
    word_vocab.index(Lparser.STOP)
    word_vocab.index(Lparser.UNK)

    label_vocab = vocabulary.Vocabulary()
    label_vocab.index(())

    char_set = set()

    for tree in train_parse:
        nodes = [tree]
        while nodes:
            node = nodes.pop()
            if isinstance(node, trees.InternalParseNode):
                label_vocab.index(node.label)
                nodes.extend(reversed(node.children))
            else:
                tag_vocab.index(node.tag)
                word_vocab.index(node.word)
                char_set |= set(node.word)

    char_vocab = vocabulary.Vocabulary()

    # If codepoints are small (e.g. Latin alphabet), index by codepoint directly
    highest_codepoint = max(ord(char) for char in char_set)
    if highest_codepoint < 512:
        if highest_codepoint < 256:
            highest_codepoint = 256
        else:
            highest_codepoint = 512

        # This also takes care of constants like tokens.CHAR_PAD
        for codepoint in range(highest_codepoint):
            char_index = char_vocab.index(chr(codepoint))
            assert char_index == codepoint
    else:
        char_vocab.index(tokens.CHAR_UNK)
        char_vocab.index(tokens.CHAR_START_SENTENCE)
        char_vocab.index(tokens.CHAR_START_WORD)
        char_vocab.index(tokens.CHAR_STOP_WORD)
        char_vocab.index(tokens.CHAR_STOP_SENTENCE)
        for char in sorted(char_set):
            char_vocab.index(char)

    tag_vocab.freeze()
    word_vocab.freeze()
    label_vocab.freeze()
    char_vocab.freeze()

    def print_vocabulary(name, vocab):
        special = {tokens.START, tokens.STOP, tokens.UNK}
        print("{} ({:,}): {}".format(
            name,
            vocab.size,
            sorted(value for value in vocab.values if value in special) +
            sorted(value for value in vocab.values if value not in special),
        ))

    if args.print_vocabs:
        print_vocabulary("Tag", tag_vocab)
        print_vocabulary("Word", word_vocab)
        print_vocabulary("Label", label_vocab)
        print_vocabulary("Char", char_vocab)

    print("Initializing model...")

    load_path = None
    if load_path is not None:
        print("Loading parameters from {}".format(load_path))
        info = torch_load(load_path)
        parser = Lparser.ChartParser.from_spec(info["spec"],
                                               info["state_dict"])
    else:
        parser = Lparser.ChartParser(
            tag_vocab,
            word_vocab,
            label_vocab,
            char_vocab,
            hparams,
        )

    print("Initializing optimizer...")
    trainable_parameters = [
        param for param in parser.parameters() if param.requires_grad
    ]
    trainer = torch.optim.Adam(trainable_parameters,
                               lr=1.0,
                               betas=(0.9, 0.98),
                               eps=1e-9)
    if load_path is not None:
        trainer.load_state_dict(info["trainer"])

    def set_lr(new_lr):
        for param_group in trainer.param_groups:
            param_group["lr"] = new_lr

    assert hparams.step_decay, "Only step_decay schedule is supported"

    warmup_coeff = hparams.learning_rate / hparams.learning_rate_warmup_steps
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        trainer,
        "max",
        factor=hparams.step_decay_factor,
        patience=hparams.step_decay_patience,
        verbose=True,
    )

    def schedule_lr(iteration):
        iteration = iteration + 1
        if iteration <= hparams.learning_rate_warmup_steps:
            set_lr(iteration * warmup_coeff)

    clippable_parameters = trainable_parameters
    grad_clip_threshold = (np.inf if hparams.clip_grad_norm == 0 else
                           hparams.clip_grad_norm)

    print("Training...")
    total_processed = 0
    current_processed = 0
    check_every = len(train_parse) / args.checks_per_epoch
    best_dev_fscore = -np.inf
    best_model_path = None
    model_name = hparams.model_name
    best_dev_processed = 0

    print("This is ", model_name)
    start_time = time.time()

    def check_dev(epoch_num):
        nonlocal best_dev_fscore
        nonlocal best_model_path
        nonlocal best_dev_processed

        dev_start_time = time.time()

        parser.eval()

        dev_predicted = []

        for dev_start_index in range(0, len(dev_treebank),
                                     args.eval_batch_size):
            subbatch_trees = dev_treebank[dev_start_index:dev_start_index +
                                          args.eval_batch_size]
            subbatch_sentences = [[(leaf.tag, leaf.word)
                                   for leaf in tree.leaves()]
                                  for tree in subbatch_trees]

            (
                predicted,
                _,
            ) = parser.parse_batch(subbatch_sentences)
            del _

            dev_predicted.extend([p.convert() for p in predicted])
        dev_fscore = evaluate.evalb(args.evalb_dir, dev_treebank,
                                    dev_predicted)

        print("\n"
              "dev-fscore {} "
              "dev-elapsed {} "
              "total-elapsed {}".format(dev_fscore,
                                        format_elapsed(dev_start_time),
                                        format_elapsed(start_time)))

        if dev_fscore.fscore > best_dev_fscore:
            if best_model_path is not None:
                extensions = [".pt"]
                for ext in extensions:
                    path = best_model_path + ext
                    if os.path.exists(path):
                        print(
                            "Removing previous model file {}...".format(path))
                        os.remove(path)

            best_dev_fscore = dev_fscore.fscore
            best_model_path = "{}_best_dev={:.2f}".format(
                args.model_path_base, dev_fscore.fscore)
            best_dev_processed = total_processed
            print("Saving new best model to {}...".format(best_model_path))
            torch.save(
                {
                    "spec": parser.spec,
                    "state_dict": parser.state_dict(),
                    "trainer": trainer.state_dict(),
                },
                best_model_path + ".pt",
            )

    for epoch in itertools.count(start=1):
        if args.epochs is not None and epoch > args.epochs:
            break

        np.random.shuffle(train_parse)
        epoch_start_time = time.time()

        for start_index in range(0, len(train_parse), args.batch_size):
            trainer.zero_grad()
            schedule_lr(total_processed // args.batch_size)

            parser.train()

            batch_loss_value = 0.0
            batch_trees = train_parse[start_index:start_index +
                                      args.batch_size]

            batch_sentences = [[(leaf.tag, leaf.word)
                                for leaf in tree.leaves()]
                               for tree in batch_trees]
            for subbatch_sentences, subbatch_trees in parser.split_batch(
                    batch_sentences, batch_trees, args.subbatch_max_tokens):
                _, loss = parser.parse_batch(subbatch_sentences,
                                             subbatch_trees)

                loss = loss / len(batch_trees)
                loss_value = float(loss.data.cpu().numpy())
                batch_loss_value += loss_value
                if loss_value > 0:
                    loss.backward()
                del loss
                total_processed += len(subbatch_trees)
                current_processed += len(subbatch_trees)

            grad_norm = torch.nn.utils.clip_grad_norm_(clippable_parameters,
                                                       grad_clip_threshold)

            trainer.step()

            print(
                "\r"
                "epoch {:,} "
                "batch {:,}/{:,} "
                "processed {:,} "
                "batch-loss {:.4f} "
                "grad-norm {:.4f} "
                "epoch-elapsed {} "
                "total-elapsed {}".format(
                    epoch,
                    start_index // args.batch_size + 1,
                    int(np.ceil(len(train_parse) / args.batch_size)),
                    total_processed,
                    batch_loss_value,
                    grad_norm,
                    format_elapsed(epoch_start_time),
                    format_elapsed(start_time),
                ),
                end="",
            )
            sys.stdout.flush()

            if current_processed >= check_every:
                current_processed -= check_every
                check_dev(epoch)

        # adjust learning rate at the end of an epoch
        if hparams.step_decay:
            if (total_processed // args.batch_size +
                    1) > hparams.learning_rate_warmup_steps:
                scheduler.step(best_dev_fscore)
Exemple #27
0
def main():
    import optparse
    import vocabulary
    parser = optparse.OptionParser()
    parser.add_option("-f",
                      dest="filename",
                      help="corpus filename",
                      default='ap.txt')  # use any sample .txt file for training
    parser.add_option("-c",
                      dest="corpus",
                      help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha",
                      dest="alpha",
                      type="float",
                      help="parameter alpha",
                      default=0.5)
    parser.add_option("--beta",
                      dest="beta",
                      type="float",
                      help="parameter beta",
                      default=0.5)
    parser.add_option("-k",
                      dest="K",
                      type="int",
                      help="number of topics",
                      default=2)
    parser.add_option("-i",
                      dest="iteration",
                      type="int",
                      help="iteration count",
                      default=10)
    parser.add_option("-s",
                      dest="smartinit",
                      action="store_true",
                      help="smart initialize of parameters",
                      default=False)
    parser.add_option("--stopwords",
                      dest="stopwords",
                      help="exclude stop words",
                      action="store_true",
                      default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df",
                      dest="df",
                      type="int",
                      help="threshold of document freaquency to cut words",
                      default=0)
    (options, args) = parser.parse_args()
    if not (options.filename or options.corpus):
        parser.error("need corpus filename(-f) or corpus range(-c)")

    if options.filename:
        corpus = vocabulary.load_file(options.filename)
    else:
        corpus = vocabulary.load_corpus(
            options.corpus)  # loads a range of the Brown corpus; the NLTK corpus data (e.g. 'corpora/wordnet') must be installed first
        if not corpus: parser.error("corpus range(-c) must take the form 'start:end'")
    if options.seed is not None:
        numpy.random.seed(options.seed)

    voca = vocabulary.Vocabulary(options.stopwords)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0: docs = voca.cut_low_freq(docs, options.df)

    lda = LDA(options.K, options.alpha, options.beta, docs, voca.size(),
              options.smartinit)
    print(
        "corpus=%d, words=%d, K=%d, a=%f, b=%f" %
        (len(corpus), len(voca.vocas), options.K, options.alpha, options.beta))

    #import cProfile
    #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile')
    lda_learning(lda, options.iteration, voca)
Exemple #28
0
"""
import os
import cPickle as pickle
import tic
import sift
import vocabulary
from imtools import get_imageList

imlist = get_imageList("../local/data/JianDa1")
imcount = len(imlist)
print imlist
print imcount
featlist = [imlist[i][:-3] + 'sift' for i in range(imcount)]

tic.k('Start')
for i in range(imcount):
    if not os.path.exists(featlist[i]):
        sift.process_image(imlist[i], featlist[i])
tic.k('sift loaded')

voc = vocabulary.Vocabulary('JianDa1')  # ukbenchtest
voc.train(featlist, k=imcount, subsampling=10)
tic.k('train loaded')

# Save the vocabulary
#imagepkl = r"pickle\vocabulary.pkl"
imagepkl = r"../static\pickle\jianda1.pkl"
with open(imagepkl, 'wb') as f:
    pickle.dump(voc, f)
print imagepkl, 'is:', voc.name, voc.word_count
Exemple #29
0
#
# The most important thing that you will likely wind up changing here is the `feature_processor` code.  You'll see the two lists of features being used (the word features and the list features).  As you add more features, you will need to add those features to these arguments so they actually get checked against tokens in the input.

# In[6]:

default_tokenizer = lambda i: tagged_contexts(tagtools.bies_tagged_tokens(i))
default_token_view = lambda i: i[0]
default_feature_processor = make_cxt_feature_processor(
    [all_digits, lonely_initial, identity_feature], [is_empty])
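
# A minimal sketch of extending the feature set, under the assumption (suggested
# by names such as all_digits and identity_feature) that each word feature is a
# function from a token string to a feature value. The helper below and its name
# are hypothetical illustrations, not part of the original pipeline.


def starts_with_capital(token):
    # Fires for capitalized tokens such as surnames or title words.
    return token[:1].isupper()


extended_feature_processor = make_cxt_feature_processor(
    [all_digits, lonely_initial, identity_feature, starts_with_capital],
    [is_empty])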


def default_features(vocab):
    return lambda data: vocab


bib_features = vocabulary.Vocabulary()

bib_data = tagtools.DataManager(reference_train_file, reference_test_file,
                                reference_dev_file, reference_xml_item_keyword,
                                default_tokenizer, default_token_view,
                                default_features(bib_features),
                                default_feature_processor)

# Load the data from the file system

# In[7]:

bib_data.initialize()

# Look at how we're analyzing a typical item.
#
print "Cleaned sample! Final rows ", len(sample_final)
####### SPLITTING TRAIN/TEST

rows = random.sample(sample_final.index, 2000)
sample_train = sample_final.drop(rows)
sample_test = sample_final.ix[rows]

print "Final test and train sizes:", len(sample_test), len(sample_train)
cleaned_reviews_train = sample_train['text']
cleaned_reviews_test = sample_test['text']
print("Sample cleaned!")

############ SETTING THE VARIABLES

#Text
voca = v.Vocabulary()
docs = voca.read_corpus(cleaned_reviews_train)
docs_test = voca.new_corpus(cleaned_reviews_test)

if options.model == 'slda':
    # Supervised
    Y_train = sample_train.stars
if options.model == 'dmr':
    feat_orig = np.reshape(sample_train.stars, (len(sample_train.stars), 1))
    #Features
    sample_train.columns = [s.encode('utf-8') for s in sample_train.columns]
    features_biz = sample_train.filter(regex='biz_')
    features_biz = features_biz.drop('biz_name', axis=1)
    #veeecs = features_biz
    #vecs = np.array([[v for v in vec] for vec in vecs], dtype=np.float32)
    feat_biz = np.array(features_biz)