Example #1
def main():
  # First load the model as before, i.e. with the training-time sizes and vocabulary
  training_data = DataReader(training_data_filepath)

  vocab = training_data.vocab

  # Build a list of trigrams
  words = training_data.get_words()

  # Get the pretrained word vectors
  word_to_index, embed_dict = get_pretrained_word_indexes(pretrained_filepath)

  # Update word_to_index and vocabulary
  word_to_index, vocab = update_word_indexes_vocab(word_to_index, vocab)

  # Get the numpy matrix containing the pretrained word vectors
  # with randomly initialized unknown words from the corpus
  word_embeddings = get_embeddings_matrix(word_to_index, embed_dict, WORD_EMBEDDINGS_DIMENSION)

  model = NGramLanguageModeler(len(vocab), 50, CONTEXT_SIZE, word_embeddings)
  model.load_state_dict(torch.load("AWS_model.pt"))

  test_data = DataReader(test_data_filepath, read_limit=READ_LIMIT)

  evaluate_model(model, test_data, word_to_index)
Example #2
def main():
    train_data_reader = DataReader(FLAGS, dtype='train')
    test_data_reader = DataReader(FLAGS, dtype='test')

    with tf.Graph().as_default():
        net = Net(FLAGS)
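        # allow soft device placement and let GPU memory grow on demand instead of grabbing it all up front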
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)

        saver = tf.train.Saver()

        if FLAGS.mode == 'train':
            do_train.run(FLAGS, sess, net, saver, train_data_reader,
                         test_data_reader)
        else:
            ckpt = tf.train.get_checkpoint_state(FLAGS.log_path)
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)
                print("Model restored...")
            if FLAGS.mode == 'test':
                do_validate.run(sess, net, test_data_reader)
            else:
                do_train.run(FLAGS, sess, net, saver, train_data_reader,
                             test_data_reader)
Example #3
def main(args):
    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    coord = tf.train.Coordinator()

    if args.mode == "train":
        with tf.compat.v1.name_scope('create_inputs'):
            data_reader = DataReader(
                data_dir=args.train_dir,
                data_list=args.train_list,
                mask_window=0.4,
                queue_size=args.batch_size * 3,
                coord=coord)
            if args.valid_list is not None:
                data_reader_valid = DataReader(
                    data_dir=args.valid_dir,
                    data_list=args.valid_list,
                    mask_window=0.4,
                    queue_size=args.batch_size * 2,
                    coord=coord)
                logging.info(
                    "Dataset size: train {}, valid {}".format(data_reader.num_data, data_reader_valid.num_data))
            else:
                data_reader_valid = None
                logging.info("Dataset size: train {}".format(data_reader.num_data))
        train_fn(args, data_reader, data_reader_valid)

    elif args.mode == "valid" or args.mode == "test":
        with tf.compat.v1.name_scope('create_inputs'):
            data_reader = DataReader_test(
                data_dir=args.data_dir,
                data_list=args.data_list,
                mask_window=0.4,
                queue_size=args.batch_size * 10,
                coord=coord)
        valid_fn(args, data_reader)

    elif args.mode == "pred":
        with tf.compat.v1.name_scope('create_inputs'):
            if args.input_mseed:
                data_reader = DataReader_mseed(
                    data_dir=args.data_dir,
                    data_list=args.data_list,
                    queue_size=args.batch_size * 10,
                    coord=coord,
                    input_length=args.input_length)
            else:
                data_reader = DataReader_pred(
                    data_dir=args.data_dir,
                    data_list=args.data_list,
                    queue_size=args.batch_size * 10,
                    coord=coord,
                    input_length=args.input_length)
        pred_fn(args, data_reader, log_dir=args.output_dir)

    else:
        print("mode should be: train, valid, test, pred or debug")

    return
Example #4
def train(model, config):
    """Trains the input model using specified configurations

    Args:
        model: tensorflow keras model
        config: instance of class configuration
    """

    train_data = DataReader(config.train_file_path, config)
    train_batch = train_data.read_batch(train=True, num_epochs=config.num_epochs, 
                                        shuffle=True)
    train_iterations = int(train_data.num_images//config.batch_size)

    if config.val_file_path:
        val_data = DataReader(config.val_file_path, config)

    learning_rate = LinearWarmUpCosineDecay(train_iterations*config.num_epochs,
                                            config.learning_rate)   
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate(0))


    epoch = 1
    epoch_loss_train = 0
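    # single pass over all batches; epoch boundaries are detected every train_iterations steps
    # and the learning rate follows the warm-up/cosine-decay schedule on each step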
    for iteration, (images, labels, weights) in enumerate(train_batch):
        loss, grads, preds = train_step(model, images, labels, weights)
        epoch_loss_train += loss

        optimizer.lr = learning_rate(optimizer.iterations)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        if iteration >  0 and iteration % train_iterations == 0:
            print("Epoch {} Train loss:  {}".format(epoch, epoch_loss_train/train_iterations))
            epoch_loss_train = 0

            if config.val_file_path:
                epoch_loss_val = []
                acc = []

                val_batch = val_data.read_batch(train=False, num_epochs=1)

                for images, labels, weights in val_batch:
                    loss, preds = val_step(model, images, labels)
                    epoch_loss_val.append(loss)

                    acc.append(accuracy(labels, preds, config.num_classes))

                print("Epoch {} Val loss:  {}".format(epoch, epoch_loss_val/len(epoch_loss_val)))

                for j in range(config.num_classes):
                    print("Epoch {} Class {} Accuracy: {}".format(epoch, j, sum([val[j] for val in acc])/len(acc)))

            model.save_weights(os.path.join(config.save_directory, 'model'), 
                               save_format='tf')
            epoch += 1
Example #5
def main():
	#Prepare dataset from csv to npz files
	#DatasetPreparation.prepare('train_preprocessed.csv','test_preprocessed.csv')
	
	#Read the dataset, create batches, and one hot encode the targets
	batch_size = 100
	train_data = DataReader('train.npz',batch_size)
	validation_data = DataReader('validation.npz')
	
	test_data = np.load('test.npz')

	m = Model(train_data,validation_data)
	m.train()
	
	m.test(test_data)	
Example #6
def data_training():
    """
    Train using only the sample set.
    """
    sentences = []
    reader = DataReader(TRAIN_DATA_TYPE)
    reader.set_pos()
    start_id = reader.get_next_pic_id()
    qa = reader.get_pic_qa(start_id)
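    # pad punctuation with spaces so '?', ',' and '.' become separate tokens when splitting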
    for q in qa:
        question = q['question']
        question = question.replace('?', ' ?')
        question = question.replace(',', ' ,')
        question = question.replace('.', ' .')
        sentence = question.split(' ')
        sentences.append(sentence)
    now_id = reader.get_next_pic_id()
    i = 0
    while now_id != start_id:
        qa = reader.get_pic_qa(now_id)
        for q in qa:
            question = q['question']
            question = question.replace('?', ' ?')
            question = question.replace(',', ' ,')
            question = question.replace('.', ' .')
            sentence = question.split(' ')
            sentences.append(sentence)
        now_id = reader.get_next_pic_id()
        i = i + 1
        if i % 1000 == 0:
            print('*', end='')
    print('load data over!')
    model = gensim.models.Word2Vec(sentences, size=300, min_count=1)
    model.save(GENSIM_DATA_PATH)
Example #7
    def __init__(self, input_file, vocabulary_file, img_data_file,
                 char2ix_file, output_dir, maxwordlength, emb_dimension,
                 line_batch_size, sample_batch_size, neg_num, window_size,
                 discard, epochs, initial_lr, seed):

        torch.manual_seed(seed)
        self.img_data = np.load(img_data_file)
        self.data = DataReader(input_file, vocabulary_file, char2ix_file,
                               maxwordlength, discard, seed)
        dataset = Word2vecDataset(self.data, window_size, sample_batch_size,
                                  neg_num)
        self.dataloader = DataLoader(dataset,
                                     batch_size=line_batch_size,
                                     shuffle=True,
                                     num_workers=0,
                                     collate_fn=dataset.collate)

        self.output_dir = output_dir
        self.emb_size = len(self.data.word2id)
        self.char_size = len(self.data.char2id) + 1  #5031
        self.emb_dimension = emb_dimension
        self.line_batch_size = line_batch_size
        self.epochs = epochs
        self.initial_lr = initial_lr
        self.VCWE_model = VCWEModel(self.emb_size, self.emb_dimension,
                                    self.data.wordid2charid, self.char_size)
        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        self.num_train_steps = int(len(self.dataloader) * self.epochs)
        if self.use_cuda:
            self.VCWE_model.cuda()
Example #8
    def __init__(self,
                 input_file,
                 output_file,
                 emb_dimension=300,
                 batch_size=64,
                 window_size=5,
                 iterations=5,
                 initial_lr=1.0,
                 min_count=5):

        self.data = DataReader(input_file, min_count)
        dataset = Word2vecDataset(self.data, window_size)
        self.dataloader = DataLoader(dataset,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     num_workers=0,
                                     collate_fn=dataset.collate)

        self.output_file_name = output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            print("USING CUDA")
            self.skip_gram_model.cuda()
        else:
            print("CUDA FAIL")
Example #9
def test_va():
    # reader = DataReader("direcnet_pid", "../data/tblADataCGMS.csv", 5)
    # data = reader.read()
    reader = DataReader("VA2", "../data/CGMdataCSMcomplete.xlsx", 5)
    data = reader.read()
    normal, diabetic = read_patient_info()
    pids = list(normal["Patient ID"])
    t = np.arange(len(data[pids[0]][0])) * 5
    res = np.vstack((t, data[pids[0]][0]))
    np.savetxt("{}.txt".format(pids[0]), res, fmt="%.4f")

    for p in pids:
        m = map(lambda x: len(x), data[p])
        print(list(m))
    exit()
    plt.figure()
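    # plot every glucose trace and count readings above 180 (hyperglycemia) and below 70 (hypoglycemia)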
    tot, hyper, hypo = 0, 0, 0
    for pid in data:
        #    if pid not in pids:
        #        continue
        for y in data[pid]:
            t = np.arange(len(y)) * 5
            tot += len(y)
            y = np.array(y)
            hyper += sum(y > 180)
            hypo += sum(y < 70)
            plt.plot(t, y)
            plt.hlines(70, 0, 10000)
            plt.hlines(80, 0, 10000)
            plt.hlines(180, 0, 10000)
    plt.show()
    print(hypo, hyper, tot)
Example #10
    def build_cnv_2_gene_training_data(self, data_dir, outcome_file,
                                       cnv_2_gene_file):
        '''
        Function:
            Build a sample set that uses gene symbols as features.
        Input:
            cnv_2_gene_file: prepared mapping from array_id to gene_symbol.
        '''
        excel_obj = ExcelReader()
        data_reader_obj = DataReader()

        outcome_dict = excel_obj.get_cyto_cnv_result(outcome_file)
        cnv_df = data_reader_obj.cnv_data_reader_pipeline(data_dir)

        #### probe mapping to gene
        (array_2_gene,
         gene_2_array) = data_reader_obj.get_cnv_to_gene_table(cnv_2_gene_file)

        # print(array_2_gene)
        gene_cnv = data_reader_obj.build_array_to_gene(cnv_df, array_2_gene,
                                                       gene_2_array)

        ## gene cnv
        data_df = data_reader_obj.combine_outcome_data(gene_cnv, outcome_dict)

        return data_df
Example #11
def train_one_epoch(model,
                    cfg,
                    optimizer,
                    lr_scheduler,
                    loss_func,
                    loss_metric,
                    cuda=True):
    ann_files, img_dirs = [], []
    data_info = cfg.dataset[cfg.train_mode[0]]
    for mode in cfg.train_mode:
        data_info = cfg.dataset[mode]
        ann_files.append(data_info['ann_file'])
        img_dirs.append(data_info['img_prefix'])
    data_reader = DataReader(
        ann_files=ann_files,
        img_dirs=img_dirs,
        transform=None,
        mode='train',
        img_scale=data_info['img_scale'],
        keep_ratio=data_info['keep_ratio'],
        label_transform=cfg.dataset['label_transform'],
    )

    data_loader = DataLoader(data_reader,
                             collate_fn=collate_fn,
                             **cfg.data_loader)
    loss_metric.update(total_iter=len(data_loader))
    model.train()
    for step, (data, target) in enumerate(data_loader):
        # inputs = torch.stack(data)
        # targets = torch.from_numpy(np.array(target)).type(torch.LongTensor)
        if data.shape[0] == 0: continue
        inputs = data
        targets = target
        if cuda:
            inputs = inputs.cuda()
            targets = targets.cuda()
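        # mixup: blend pairs of inputs with weight lam ~ Beta(alpha, alpha) and mix the two losses with the same weight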
        if cfg.mix['type'] == 'mixup':
            alpha = cfg.mix['alpha']
            lam = np.random.beta(alpha, alpha)
            index = torch.randperm(inputs.size(0)).cuda()
            inputs = lam * inputs + (1 - lam) * inputs[index, :]
            targets_a, targets_b = targets, targets[index]
            outputs = model(inputs)
            loss = lam * loss_func(outputs, targets_a) + (1 - lam) * loss_func(
                outputs, targets_b)
        else:
            outputs = model(inputs)
            loss = loss_func(outputs, targets)
        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_metric.update(iter=step, loss=loss)
        if step % cfg.freq_cfg['log_print'] == 0 or step == len(data_loader):
            line = loss_metric.str()
            logger.info(line)
            with open(os.path.join(cfg.work_dir, cfg.log['out_file']),
                      'a+') as fp:
                fp.write(line + '\n')
Example #12
    def __init__(self, model_load_path, artist_name, test, prime_text):
        self.sess = tf.Session()
        self.artist_name = artist_name

        print 'Process data...'
        self.data_reader = DataReader(self.artist_name)
        self.vocab = self.data_reader.get_vocab()

        print 'Init model...'
        self.model = LSTMModel(self.sess,
                               self.vocab,
                               c.BATCH_SIZE,
                               c.SEQ_LEN,
                               c.CELL_SIZE,
                               c.NUM_LAYERS,
                               test=test)

        print 'Init variables...'
        self.saver = tf.train.Saver(max_to_keep=None)
        self.sess.run(tf.global_variables_initializer())

        # if load path specified, load a saved model
        if model_load_path is not None:
            self.saver.restore(self.sess, model_load_path)
            print 'Model restored from ' + model_load_path

        if test:
            self.test(prime_text)
        else:
            self.train()
Example #13
    def test_parse_slabinfo(self):
        test_str = (
            "Name                   Objects Objsize    Space Slabs/Part/Cpu  O/S O %Fr %Ef Flg\n"
            "kmalloc-1024               288    1024   294.9K          9/0/9   16 2   0 100 *\n"
            "kmalloc-128               2822     128   372.7K       64/11/27   32 0  12  96 *\n"
            "kmalloc-192               2163     192   425.9K        82/0/22   21 0   0  97 *\n"
            "kmalloc-2048              2241    2048     4.7M       138/45/7   16 3  31  96 *\n"
            "kmalloc-256               4921     256     1.3M       313/67/5   16 0  21  96 *\n"
            "kmalloc-4096               584    4096     2.3M         70/0/3    8 3   0 100 *\n"
            "kmalloc-512               2674     512     1.3M        163/6/7   16 1   3  98 *\n"
            "kmalloc-64               12904      64   843.7K      174/23/32   64 0  11  97 *\n"
            "kmalloc-8192                32    8192   262.1K          3/0/5    4 3   0 100 \n"
        )

        dut = DataReader(None)
        exist, val = dut.parse_slabinfo(test_str, tag='kmalloc-1024')
        self.assertEqual(exist, True)
        self.assertEqual(val, 288)

        exist, val = dut.parse_slabinfo(test_str, tag='kmalloc-64')
        self.assertEqual(exist, True)
        self.assertEqual(val, 12904)

        exist, val = dut.parse_slabinfo(test_str, tag='will_not_found')
        self.assertEqual(exist, False)
        self.assertEqual(val, 0)
Example #14
def createQuestionsDict():
    """
    Build the question dictionary (including the answer dictionary).
    """
    reader = DataReader()
    reader.set_pos()
    dealer = DataDealer(ANSWERS_DICT_PATH)
    start_id = reader.get_next_pic_id()
    qa = reader.get_pic_qa(start_id)
    for q in qa:
        question = q['question']
        dealer.deal(question)
    now_id = reader.get_next_pic_id()
    i = 0
    while now_id != start_id:
        qa = reader.get_pic_qa(now_id)
        for q in qa:
            question = q['question']
            dealer.deal(question)
        now_id = reader.get_next_pic_id()
        i = i + 1
        if i % 1000 == 0:
            print('*', end='')
    dealer.saveData(QUESTIONS_DICT_PATH)
    print('over!')
Example #15
def run(sysargs):
    if len(sysargs) < 1:
        print("Insufficient input args.")
        print("Usage:")
        print("python lstm.py <input_file_path>")
    else:
        skip_train_flag = False
        testbed = TestBed()

        if (len(sysargs) == 2):
            skip_train_flag = literal_eval(sysargs[1])

        print("\nskip_train_flag:'" + str(skip_train_flag) + "'")

        input_file_path = sysargs[0]
        dr = DataReader()
        dr.read_pkl_data_at_file_path(input_file_path)
        sequences = dr.get_sequences()
        labels = dr.get_labels()

        if not skip_train_flag:
            # train
            testbed.init_model()
            testbed.train(sequences, labels)
            testbed.save_model()
        else:
            # skipping training part, load model
            testbed.load_model()

        metrics_names, score = testbed.test(sequences, labels)
        print("metrics_names:")
        print(metrics_names)
        print("score=" + str(score))
Example #16
    def __init__(self,
                 input_file,
                 antonym_file,
                 output_file,
                 emb_dimension=100,
                 batch_size=32,
                 window_size=5,
                 iterations=3,
                 initial_lr=0.001,
                 min_count=12):

        print("Reading input file...")
        self.data = DataReader(input_file, min_count)
        dataset = Word2vecDataset(self.data, window_size)
        print("Creating data batches")
        self.dataloader = DataLoader(dataset,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     num_workers=0,
                                     collate_fn=dataset.collate)
        self.antonym_file = open(antonym_file, 'r')

        self.output_file_name = output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()
Example #17
def determine_iterations_per_epoch(config):
    """Determine the number of iterations per training epoch
    
    Creates an instance of the DataReader class and iterates over one epoch to 
    determine number of iterations in an epoch. Required in order to 
    accurately decay the learning rate.
    
    Args:
        config: instance of config class
    Returns:
        count: number of iterations in each epoch
    """

    if config.train_file_path:
        data = DataReader(config, config.train_file_path)

    batch = data.read_batch(current_epoch=0, num_epochs=1)
    count = 0

    if config.task == 'pretrain':
        for image, epoch in batch:
            count += 1
    else:
        for image, label, epoch in batch:
            count += 1
    return count
Example #18
def eval(model, cfg, mode='val', cuda=True):
    data_info = cfg.dataset[mode]
    data_reader = DataReader(
        ann_files=[data_info['ann_file']],
        img_dirs=[data_info['img_prefix']],
        transform=None,
        mode='val',
        img_scale=data_info['img_scale'],
        keep_ratio=data_info['keep_ratio'],
    )
    data_loader = DataLoader(data_reader,
                             collate_fn=collate_fn,
                             **cfg.val_data_loader)
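    # run inference without gradients and collect labels/predictions for the classification report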
    y_true, y_pred = [], []
    model.eval()
    for step, (data, target) in tqdm(enumerate(data_loader)):
        # inputs = torch.stack(data)
        # target = torch.from_numpy(np.array(target)).type(torch.LongTensor)
        inputs = data
        targets = target
        if cuda:
            inputs = inputs.cuda()
            targets = targets.cuda()
        with torch.no_grad():
            outputs = model(inputs)
        outs = nn.functional.softmax(outputs, dim=1)
        pred = torch.argmax(outs, dim=1)
        y_true.extend(list(targets.cpu().detach().numpy()))
        y_pred.extend(list(pred.cpu().detach().numpy()))
    model.train()
    return classification_report(y_true, y_pred, output_dict=True), \
           classification_report(y_true, y_pred, output_dict=False)
Example #19
def main():
    # Extract arguments
    ap = argparse.ArgumentParser()
    ap.add_argument("data", help="Data file containing bugs")
    ap.add_argument("vocabulary", help="Vocabulary file")
    ap.add_argument("-s", "--suffix", help="Model and log-file suffix")
    args = ap.parse_args()

    data = DataReader(config["data"],
                      data_file=args.data,
                      vocab_path=args.vocabulary)
    model = TransformerPatchingModel(config["transformer"],
                                     data.vocabulary.vocab_dim,
                                     is_pointer=config["data"]["edits"])

    # Restore model after a simple init
    tracker = Tracker(model, suffix=args.suffix)
    model(tf.zeros((1, 2), 'int32'), tf.zeros((1, 2), 'int32'),
          tf.zeros((1, 2), 'int32'), tf.zeros((0, 0), 'int32'), True)
    tracker.restore(best_only=True)

    with open(
            "results" + ("" if args.suffix is None else "-" + args.suffix) +
            ".txt", "w") as f_out:
        for batch in data.batcher(mode="test", optimize_packing=False):
            pre, pre_locs = batch[:2]
            preds = model.predict(data.vocabulary, pre, pre_locs,
                                  config["data"]["beam_size"],
                                  config["data"]["max_bug_length"])
            write_completions(f_out, data.vocabulary, pre.numpy(),
                              pre_locs.numpy(), preds)
Example #20
    def __init__(self,
                 input_file,
                 output_file,
                 emb_dimension=500,
                 batch_size=32,
                 window_size=5,
                 iterations=5,
                 initial_lr=0.001,
                 min_count=12):

        self.data = DataReader(input_file, min_count)
        dataset = PennDataset(self.data, window_size)
        self.dataloader = DataLoader(dataset,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     num_workers=0,
                                     collate_fn=dataset.collate)

        self.output_file_name = output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr
        self.penn_skip_gram_model = PennSkipGramModel(self.emb_size,
                                                      self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.penn_skip_gram_model.cuda()
Example #21
def main():
    data_reader = DataReader()
    df = data_reader.get_all_data()

    # random split of data
    train_x_raw, train_y_raw, test_x_raw, test_y_raw = get_train_test_split(df)

    # set up train data
    train_tokens, train_y_raw = tokenize(train_x_raw, train_y_raw, save_missing_feature_as_string=False,
                                       remove_empty=True)
    train_x, train_y, feature_names = tokens_to_bagofwords(train_tokens, train_y_raw)

    # train model
    model  = _get_nn_model_bag_of_words_simple_v2(train_x, train_y, data_reader.get_region_labels()['Code'],
                                                      epochs=50, batch_size=64)

    # set up test data
    test_tokens, test_y_raw = tokenize(test_x_raw, test_y_raw, save_missing_feature_as_string=False, remove_empty=True)
    test_x, test_y, _ = tokens_to_bagofwords(test_tokens, test_y_raw, feature_names=feature_names)

    # evaluate model
    evaluate_model_nn(model, test_x, test_y, plot_roc=False)

    # ABOVE IS BASIC SUPERVISED LEARNING TO GENERATE MODEL
    #################################################
    # BELOW IS SEMI-SUPERVISED SELF-TRAINING TO FURTHER TRAIN THE MODEL

    # read unlabelled data and format it to be the same as labelled data
    unlabelled_df = data_reader.get_east_dir()
    unlabelled_df = normalize_east_dir_df(unlabelled_df)

    # set up unlabelled data as semi-supervised data
    tokens, _ = tokenize(unlabelled_df, _, save_missing_feature_as_string=False, remove_empty=True)
    semi_x_base, _, _ = tokens_to_bagofwords(tokens, _, feature_names=feature_names)

    # Confidence threshold to train on
    train_threshold = 0.8
    semi_train_amount = 30

    # SELF TRAIN MANY TIMES
    for i in range(semi_train_amount):
        # get predictions on unlabelled data
        pred = model.model.predict(semi_x_base)
        # convert probabilities to one-hot encoded output
        semi_y = np.zeros_like(pred)
        semi_y[np.arange(len(pred)), pred.argmax(1)] = 1
        # filter semi_x and semi_y to only include predictions above train_threshold
        semi_y = semi_y[pred.max(axis=1) > train_threshold]
        semi_x = semi_x_base[pred.max(axis=1) > train_threshold]

        # train on semi supervised data
        model.model.fit(semi_x, semi_y, batch_size=64, epochs=100)
        # retrain on original train data
        model.model.fit(train_x, model.encoder.transform(train_y), batch_size=32, epochs=10)

        # evaluate model
        evaluate_model_nn(model, test_x, test_y, plot_roc=False)

        # remove semi data used in this iteration from future iterations
        semi_x_base = semi_x_base[~(pred.max(axis=1) > train_threshold)]
Example #22
def save(artist, model_path, num_save):
    sample_save_dir = c.get_dir('../save/samples/')
    sess = tf.Session()

    print artist

    data_reader = DataReader(artist)
    vocab = data_reader.get_vocab()

    print 'Init model...'
    model = LSTMModel(sess,
                      vocab,
                      c.BATCH_SIZE,
                      c.SEQ_LEN,
                      c.CELL_SIZE,
                      c.NUM_LAYERS,
                      test=True)

    saver = tf.train.Saver()
    sess.run(tf.initialize_all_variables())

    saver.restore(sess, model_path)
    print 'Model restored from ' + model_path

    artist_save_dir = c.get_dir(join(sample_save_dir, artist))
    for i in xrange(num_save):
        print i

        path = join(artist_save_dir, str(i) + '.txt')
        sample = model.generate()
        processed_sample = process_sample(sample)

        with open(path, 'w') as f:
            f.write(processed_sample)
Example #23
    def get_img_ary(self, data_df):
        data_reader_obj = DataReader()
        # data_ary = data_reader_obj.tif_ary_reader(data_df, 'tif_path', 'cnv_outcome')
        data_ary = data_reader_obj.png_ary_reader(data_df, 'tif_path',
                                                  'cnv_outcome')

        return data_ary
Example #24
	def read_data(self):

		self._logger.info('Reading meta data...')

		self._reader = DataReader(self._logger)

		(self._vocab,
		self._vocab_size,
		self._dictionary,
		self._reverse_dictionary,
		self._unigrams,
		self._arts_srcs,
		self._srcs_ents,
		self._ents_srcs) = self._reader.read_meta_files(self._args.data)


		with open(self._args.output + '-labels-dict.pkl', 'wb') as f:
			cPickle.dump(self._reverse_dictionary, f,protocol=cPickle.HIGHEST_PROTOCOL)

		with open(self._args.output + '-vocab-dict.pkl', 'wb') as f:
			cPickle.dump(self._dictionary, f,protocol=cPickle.HIGHEST_PROTOCOL)

		self._number_of_srcs = len(set(self._srcs_ents.keys()))

		self._sample_dist()
Example #25
    def readData(self, path_to_data, path_to_energy):
        """
        Reads in weather data from a file and stores it
        """

        if path_to_data is None:
            weather_reader = RandomReader(365 * 24)
        else:
            weather_reader = DataReader(path_to_data, path_to_energy)

        while weather_reader.canGetForecast():
            # forecast = list of 24 tuples of (windSpeed, sunlight, energy_needed)
            forecast = weather_reader.getForecast()
            # store raw numbers
            self.raw_data.append(copy.deepcopy(forecast[0]))
            self.energy_needed.append(forecast[0].ERCOT)
            self.energy_gained.append(
                (self.calculate_wind_power(forecast[0].windSpeed),
                 self.calculate_solar_power(forecast[0].sunlight),
                 self.calculate_hydro_power()))
            # calculate features
            wind_power = 0.0
            solar_power = 0.0
            hydro_power = 0.0
            for weather_tuple in forecast:
                #convert weather to power
                wind_power += self.calculate_wind_power(
                    weather_tuple.windSpeed)
                solar_power += self.calculate_solar_power(
                    weather_tuple.sunlight)
                hydro_power += self.calculate_hydro_power()
            self.features.append((wind_power, solar_power, hydro_power))
            weather_reader.advanceTime()
Example #26
def solver():
    parser = argparse.ArgumentParser()
    parser.add_argument("integer",
                        type=int,
                        help="Please give arguments as 'Centroid','Min','Max'")
    args = parser.parse_args()
    clusters = args.integer

    reader = DataReader()
    data = reader.loadData()
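    # build a pairwise similarity matrix and run k-medoids with the requested number of clusters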
    simMatrix, indexes = genSimilarityMatrix(data)
    M, C = kmedoids.kMedoids(simMatrix, clusters)
    fileWriter = open('data/Kmedoids_output_{}.txt'.format(clusters), 'w')
    print('medoids', file=fileWriter)
    i = 1
    for point in M:
        print('medoid of cluster ', i, ' ', indexes[point], file=fileWriter)
        i = i + 1
    print(' ', file=fileWriter)
    print('clustering result:', file=fileWriter)
    i = 1
    for label in C:
        for point_idx in C[label]:
            print('Cluster ', i, ': ', indexes[point_idx], file=fileWriter)
        i = i + 1

    fileWriter.close()
    print("Clustering Done!!,No. of new clusters are {}".format(clusters))
    print("New clusters are stored in file-data/Kmedoids_output_{}.txt".format(
        clusters))
Example #27
def main(_):
    ''' Loads trained model and evaluates it on test split '''

    if FLAGS.load_model is None:
        print('Please specify checkpoint file to load model from')
        return -1

    if not os.path.exists(FLAGS.load_model + ".index"):
        print('Checkpoint file not found', FLAGS.load_model)
        return -1

    word_vocab, word_tensors, max_doc_length, label_tensors = \
        load_data(FLAGS.data_dir, FLAGS.max_doc_length, FLAGS.max_sen_length)

    test_reader = DataReader(word_tensors['test'], label_tensors['test'],
                             FLAGS.batch_size)

    print('initialized test dataset reader')

    with tf.Graph().as_default(), tf.Session() as session:

        # tensorflow seed must be inside graph
        tf.set_random_seed(FLAGS.seed)
        np.random.seed(seed=FLAGS.seed)
        ''' build inference graph '''
        with tf.variable_scope("Model"):
            m = build_model(word_vocab)
            global_step = tf.Variable(0, dtype=tf.int32, name='global_step')

        saver = tf.train.Saver()
        saver.restore(session, FLAGS.load_model)
        print('Loaded model from', FLAGS.load_model, 'saved at global step',
              global_step.eval())
        ''' evaluation starts here '''
        count = 0
        start_time = time.time()
        result_scores = None
        for x, y in test_reader.iter():
            count += 1
            logits = session.run(m.logits, {m.input: x, m.targets: y})

            total_scores = []
            for tid, tlogits in enumerate(logits):
                scores = softmax(tlogits)
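                # collapse the 3-class softmax into a single score using fixed class weights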
                weights = np.array([0, 1, 0.5])
                scores = np.dot(scores, weights)
                total_scores.append(scores)

            total_scores = np.transpose(np.asarray(total_scores))
            if result_scores is None:
                result_scores = total_scores
            else:
                result_scores = np.vstack((result_scores, total_scores))

        save_as = '%s/scores' % (FLAGS.train_dir)
        np.savetxt(save_as, result_scores, delimiter=' ')
        time_elapsed = time.time() - start_time

        print("test samples:", count * FLAGS.batch_size, "time elapsed:",
              time_elapsed, "time per one batch:", time_elapsed / count)
Example #28
def create_library():
    pkey_filename = '/Users/cole/.ssh/million-song-dataset.pem'
    pkey_password = keyring.get_password('SSH', pkey_filename)
    pkey = RSAKey.from_private_key_file(pkey_filename, password=pkey_password)
    ssh = SSHClient()
    ssh.set_missing_host_key_policy(AutoAddPolicy())
    ssh.connect('52.91.85.148', username='******', pkey=pkey)

    letters = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
               'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
    letters2 = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
                'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
    letters3 = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
                'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']

    dr = DataReader()
    dr.reset_lib()
    import shutil
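    # walk the three-letter directory tree on the remote host, copy each leaf over SCP, ingest it, then delete the local copy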
    for letter in letters:
        for letter2 in letters2:
            for letter3 in letters3:
                with SCPClient(ssh.get_transport()) as scp:
                    print letter + letter2 + letter3
                    scp.get('/mnt/snap/data/' + letter + '/' + letter2 + '/' + letter3,
                            '/Users/cole/eclipse-workspace/EC2 File Transfer/Data/', 1)
                    dr.append_files(letter3)
                    shutil.rmtree('/Users/cole/eclipse-workspace/EC2 File Transfer/Data/' + letter3)
                    scp.close()
    ssh.close()
Example #29
def main():
    test = False
    heuristic = 'Centroid'
    reader = DataReader()
    data = reader.loadData()
    dataArray = reader.getDataArray()
    if test == True:
        clusters = [
            Cluster(dataPoint, data[dataPoint])
            for dataPoint in list(data.keys())[:5]
        ]
    else:
        clusters = [
            Cluster(dataPoint, data[dataPoint])
            for dataPoint in list(data.keys())[:]
        ]
    Cluster.generateInitialDistanceMatrix(test)
    Uni = UnionTracker(len(clusters))
    # print('')
    iteration = 0
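    # agglomerative clustering: repeatedly merge the two closest clusters until one remains, recording each merge for the dendrogram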
    while (Cluster.currentClusterCount() > 1):
        clsA, clsB, dist = Cluster.findMinDistance()
        mergedRC = min(clsA, clsB)
        toDelete = max(clsA, clsB)
        newIDm, newIDd, pts, factor = Cluster.mergeSimilarClusters(
            mergedRC, toDelete, iteration, dist, heuristic=heuristic)
        Uni.union(newIDd, newIDm, dist, pts, iteration)
        iteration += 1

    labels = list(data.keys())
    drawDendrogram(Uni, labels, heuristic)
Example #30
def init_trainer(config, text_lines, slot_value_lines):

    hidden_dim = config.hidden_dim

    segment_begin = config.segment_begin
    segment_end = config.segment_end

    data = DataReader(text_lines, slot_value_lines, segment_begin, segment_end)

    # Create model nodes for the source and target inputs
    vocab_dim = data.vocab_dim
    sv_dim = data.sv_dim

    input_sequence, sv_pair, label_sequence, inputH, inputC = create_inputs(hidden_dim, sv_dim, vocab_dim)
    model = create_model(hidden_dim, sv_dim, vocab_dim)
    z = model(input_sequence, inputH, inputC, sv_pair)
    # cross_entropy: this is used training criterion
    ce, err = cross_entropy_with_full_softmax(z, label_sequence, sv_dim, vocab_dim)

    learning_rate = config.learning_rate
    momentum_as_time_constant = config.momentum_as_time_constant
    clipping_threshold_per_sample = config.clipping_threshold_per_sample
    lr_schedule = learning_rate_schedule(learning_rate, UnitType.sample)
    gradient_clipping_with_truncation = True
    momentum_schedule = momentum_as_time_constant_schedule(momentum_as_time_constant)
    # Instantiate the trainer object to drive the model training
    learner = momentum_sgd(z.parameters, lr_schedule, momentum_schedule,
                           gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,
                           gradient_clipping_with_truncation=gradient_clipping_with_truncation)
    trainer = Trainer(z, (ce, err), learner)
    inputs = [input_sequence, sv_pair, label_sequence, inputH, inputC]

    return data, z, trainer, inputs