Ejemplo n.º 1
0
    def __init__(self, flags):
        """Create the session, datasets, model, saver and input threads.

        Args:
            flags: Run configuration. This constructor reads ``flags.dataset``
                and passes the whole object on to ``gan_repository``.

        Raises:
            ValueError: If ``flags.dataset`` has no validation dataset mapped
                to it. Previously this case surfaced later as an
                AttributeError on ``self.val_dataset``.
        """
        # Training dataset name -> validation dataset name.
        val_dataset_names = {'brain01': 'brain05', 'spine04': 'spine_val'}
        if flags.dataset not in val_dataset_names:
            raise ValueError(
                'no validation dataset is defined for dataset {!r}; '
                'expected one of {}'.format(flags.dataset,
                                            sorted(val_dataset_names)))

        run_config = tf.ConfigProto()
        # Let the GPU allocation grow as needed rather than being reserved
        # up front.
        run_config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=run_config)

        self.flags = flags
        self.best_mae = float("inf")
        self.iter_time = 0

        self.train_dataset = dataset(self.flags.dataset)
        print('train dataset name: {}'.format(self.train_dataset.dataset_name))
        self.val_dataset = dataset(val_dataset_names[self.flags.dataset])
        print('val datset name: {}'.format(self.val_dataset.dataset_name))

        self.model = gan_repository(self.sess, self.flags, self.train_dataset)
        self._make_folders()
        self.evaluator = None

        self.saver = tf.train.Saver()
        self.sess.run(tf.global_variables_initializer())

        # Queue-runner threads (the original note says they serve the
        # tfrecord input).
        self.coord = tf.train.Coordinator()
        self.threads = tf.train.start_queue_runners(sess=self.sess,
                                                    coord=self.coord)
Ejemplo n.º 2
0
    def __init__(self, train_file, test_file):
        """Load both splits, build the vocabulary and create the weight matrix.

        Args:
            train_file: Source handed to ``dataset`` for the 'train' split.
            test_file: Source handed to ``dataset`` for the 'test' split.
        """
        self.train_data = dataset(train_file, 'train')
        self.test_data = dataset(test_file, 'test')
        # NOTE: an alternative that pooled both splits and re-split them with
        # train_test_split(test_size=0.2) used to sit here as a disabled
        # string block; it was never executed.

        self.encoder = Encoder()
        # The vocabulary covers the texts of both splits.
        all_texts = self.train_data.texts + self.test_data.texts
        self.encoder.build_vocab(all_texts)

        self.vocab_size = self.encoder.vocab_size
        self.feature_size = self.vocab_size * n_label
        print('feature count:', self.feature_size)

        # Local-time stamp taken at construction.
        now = time.localtime(time.time())
        self.timemark = time.strftime('%Y%m%d-%H%M%S', now)

        self.v = get_matrix(n_label, self.vocab_size)

        print('model built')
Ejemplo n.º 3
0
    def build_data(self):
        """Build the processed train/valid/test sets, or load cached copies.

        When ``self.opt['process_data']`` is truthy, the three raw pickles are
        wrapped in ``dataset`` objects, processed, and the results are cached
        under ``data/``. Otherwise the cached pickles are loaded.

        Raises:
            AssertionError: If the cached pickles cannot be read ("No
                processed data"). It is raised explicitly so the check
                survives ``python -O``.
        """
        splits = ('train', 'valid', 'test')
        if self.opt['process_data']:
            self.train_dataset = dataset(
                "../../data/data1030/output/train_cut.pkl", self.opt, 'train')
            self.valid_dataset = dataset(
                "../../data/data1030/output/valid_cut.pkl", self.opt, 'valid')
            self.test_dataset = dataset(
                "../../data/data1030/output/test_cut.pkl", self.opt, 'test')

            self.train_processed_set = self.train_dataset.data_process(True)
            self.valid_processed_set = self.valid_dataset.data_process(True)
            self.test_processed_set = self.test_dataset.data_process(True)

            processed_sets = (self.train_processed_set,
                              self.valid_processed_set,
                              self.test_processed_set)
            for split, processed in zip(splits, processed_sets):
                # Context manager so the handle is flushed and closed even if
                # pickling fails part-way.
                with open('data/{}_processed_set.pkl'.format(split),
                          'wb') as handle:
                    pickle.dump(processed, handle)
            logger.info("[Save processed data]")
        else:
            # NOTE: pickle.load can execute arbitrary code; only load cache
            # files written by this program.
            try:
                loaded = []
                for split in splits:
                    with open('data/{}_processed_set.pkl'.format(split),
                              'rb') as handle:
                        loaded.append(pickle.load(handle))
            except Exception as err:
                # Same exception type and message as the original assert, but
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                raise AssertionError("No processed data") from err
            (self.train_processed_set,
             self.valid_processed_set,
             self.test_processed_set) = loaded
            logger.info("[Load processed data]")
Ejemplo n.º 4
0
    def val(self,is_test=False):
        """Run the model over a validation or test file and report rec metrics.

        Args:
            is_test: Read 'data/test_data.jsonl' when true, otherwise
                'data/valid_data.jsonl'.

        Returns:
            dict: Every entry of ``self.metrics_rec`` divided by its 'count'.
        """
        self.metrics_gen = {
            "ppl": 0,
            "dist1": 0, "dist2": 0, "dist3": 0, "dist4": 0,
            "bleu1": 0, "bleu2": 0, "bleu3": 0, "bleu4": 0,
            "count": 0,
        }
        self.metrics_rec = {
            "recall@1": 0, "recall@10": 0, "recall@50": 0,
            "loss": 0, "gate": 0, "count": 0, 'gate_count': 0,
        }
        self.model.eval()

        data_path = 'data/test_data.jsonl' if is_test else 'data/valid_data.jsonl'
        val_dataset = dataset(data_path, self.opt)
        val_set = CRSdataset(val_dataset.data_process(),
                             self.opt['n_entity'],
                             self.opt['n_concept'])
        val_dataset_loader = torch.utils.data.DataLoader(
            dataset=val_set, batch_size=self.batch_size, shuffle=False)

        recs = []
        for batch in tqdm(val_dataset_loader):
            (context, c_lengths, response, r_length, mask_response,
             mask_r_length, entity, entity_vector, movie, concept_mask,
             dbpedia_mask, concept_vec, db_vec, rec) = batch
            with torch.no_grad():
                batch_size = context.shape[0]
                # Indices of the non-zero entries of each sample's entity row.
                seed_sets = [entity[b].nonzero().view(-1).tolist()
                             for b in range(batch_size)]
                (scores, preds, rec_scores, rec_loss, _, mask_loss,
                 info_db_loss, info_con_loss) = self.model(
                    context.cuda(), response.cuda(), mask_response.cuda(),
                    concept_mask, dbpedia_mask, seed_sets, movie, concept_vec,
                    db_vec, entity_vector.cuda(), rec,
                    test=True, maxlen=20, bsz=batch_size)

            recs.extend(rec.cpu())
            self.metrics_cal_rec(rec_loss, rec_scores, movie)

        # Normalise every accumulated metric (including 'count') by 'count'.
        output_dict_rec = {}
        for key in self.metrics_rec:
            output_dict_rec[key] = self.metrics_rec[key] / self.metrics_rec['count']
        print(output_dict_rec)

        return output_dict_rec
Ejemplo n.º 5
0
    def __init__(self, input_dim, hid_dim, class_num, d1, lrn_rate, momentum,
                 batch_size_train, epoch_max, reg_lambda, train_file_name,
                 val_file_name, test_file_name, log_file_name_head,
                 gaus_train_file_name, gaus_val_file_name, gaus_test_file_name,
                 attr_train_file_name, attr_val_file_name, attr_test_file_name,
                 write_model_log_period):
        """Record the hyper-parameters and build the two datasets.

        ``self.data`` is built from the train/val/test file names and
        ``self.gaus_sample`` from the ``gaus_*`` file names. The
        ``attr_*_file_name`` arguments are accepted but not used here.
        """
        # Dimensions.
        self.input_dim = input_dim
        self.hid_dim = hid_dim
        self.class_num = class_num
        self.d1 = d1

        # Optimisation settings.
        self.lrn_rate = lrn_rate
        self.momentum = momentum
        self.batch_size_train = batch_size_train
        self.epoch_max = epoch_max
        self.reg_lambda = reg_lambda

        # Logging settings.
        self.log_file_name_head = log_file_name_head
        self.write_model_log_period = write_model_log_period

        feature_files = (train_file_name, val_file_name, test_file_name)
        gaus_files = (gaus_train_file_name, gaus_val_file_name,
                      gaus_test_file_name)
        self.data = dataset.dataset(*feature_files + (class_num,
                                                      batch_size_train))
        self.gaus_sample = dataset.dataset(*gaus_files + (class_num,
                                                          batch_size_train))
Ejemplo n.º 6
0
    def __init__(self, train_file, test_file):
        """Pool both files, re-split them, and build a tf-idf vocabulary.

        Args:
            train_file: Source handed to ``dataset`` for the 'train' split.
            test_file: Source handed to ``dataset`` for the 'test' split.
        """
        train_data = dataset(train_file, 'train')
        test_data = dataset(test_file, 'test')

        # Merge the two splits, then let train_test_split redistribute them.
        texts = train_data.texts + test_data.texts
        targets = train_data.targets + test_data.targets
        resplit = train_test_split(texts, targets)
        (train_data.texts, test_data.texts,
         train_data.targets, test_data.targets) = resplit
        train_data.size = len(train_data.targets)
        test_data.size = len(test_data.targets)

        # The vectorizer is fitted on the pooled texts.
        tfidf = TfidfVectorizer(max_features=max_features,
                                stop_words=stopwords)
        tfidf.fit(texts)
        # Map each feature name to its position.
        vocab = {}
        for position, term in enumerate(tfidf.get_feature_names()):
            vocab[term] = position
        self.vocab = vocab

        self.train_texts = train_data.texts
        self.train_targets = train_data.targets
        self.test_texts = test_data.texts
        self.test_targets = test_data.targets
        self.train_size = train_data.size
        self.test_size = test_data.size

        now = time.localtime(time.time())
        self.timemark = time.strftime('%Y%m%d-%H%M%S', now)

        self.v = get_matrix(n_label, max_features)
def get_train_data(batch_size,
                   seq_len,
                   dir_type,
                   pixel,
                   pretrained=None,
                   shuffle=True,
                   num_workers=0,
                   train_folder='MOT17/train/'):
    """Build the training and validation data loaders.

    Args:
        batch_size: Batch size for both loaders.
        seq_len: Passed to ``datastorage.split`` and to both ``dataset``
            objects.
        dir_type: Passed to ``datastorage.prepare``.
        pixel: Passed to both ``dataset`` objects.
        pretrained: Passed to both ``dataset`` objects.
        shuffle: ``shuffle`` flag for both loaders (validation included, as
            before).
        num_workers: ``num_workers`` for both loaders.
        train_folder: Folder handed to ``datastorage``. The default is the
            path that used to be hard-coded.

    Returns:
        tuple: ``(train_loader, train_size, val_loader, val_size)``, where
        the sizes are the lengths of the respective dataset objects.
    """
    datastorageobject = datastorage(train_folder)
    datastorageobject.prepare(dir_type)
    datastorageobject.split(seq_len)

    traindatasetobject = dataset(datastorageobject, seq_len, pixel, pretrained,
                                 'train')
    traindatasetobject.create()

    valdatasetobject = dataset(datastorageobject, seq_len, pixel, pretrained,
                               'val')
    valdatasetobject.create()

    traindataloader = data.DataLoader(traindatasetobject,
                                      batch_size=batch_size,
                                      collate_fn=traindatasetobject.collate_fn,
                                      shuffle=shuffle,
                                      num_workers=num_workers)
    valdataloader = data.DataLoader(valdatasetobject,
                                    batch_size=batch_size,
                                    collate_fn=valdatasetobject.collate_fn,
                                    shuffle=shuffle,
                                    num_workers=num_workers)
    return (traindataloader, len(traindatasetobject),
            valdataloader, len(valdatasetobject))
Ejemplo n.º 8
0
Archivo: sae0.py Proyecto: ylytju/SAAE
    def __init__(self,
                 input_dim,
                 hid_dim,
                 class_num,
                 d1,
                 lrn_rate,
                 momentum,
                 batch_size_train,
                 epoch_max,
                 reg_lambda,
                 train_file_name,
                 val_file_name,
                 test_file_name,
                 log_file_name_head,
                 gaus_train_file_name,
                 gaus_val_file_name,
                 gaus_test_file_name,
                 attr_train_file_name,
                 attr_val_file_name,
                 attr_test_file_name,
                 write_model_log_period,
                 match_coef=1,
                 train_label_file_name=None,
                 val_label_file_name=None,
                 test_label_file_name=None,
                 load_model_file_directory=None):
        """Record the hyper-parameters and build the three data sources.

        ``self.data`` comes from the feature (and optional label) files,
        ``self.gaus_sample`` from the ``gaus_*`` files and ``self.attrdata``
        from the ``attr_*`` files. ``d2`` and ``batch_size_test`` are fixed
        constants set here.
        """
        # Dimensions (d2 is a fixed constant).
        self.input_dim = input_dim
        self.hid_dim = hid_dim
        self.class_num = class_num
        self.d1 = d1
        self.d2 = 100

        # Optimisation settings (the test batch size is a fixed constant).
        self.lrn_rate = lrn_rate
        self.momentum = momentum
        self.batch_size_train = batch_size_train
        self.batch_size_test = 24295
        self.epoch_max = epoch_max
        self.reg_lambda = reg_lambda
        self.match_coef = match_coef

        # Logging and checkpoint settings.
        self.log_file_name_head = log_file_name_head
        self.write_model_log_period = write_model_log_period
        self.load_model_file_directory = load_model_file_directory

        label_files = dict(train_label_file_name=train_label_file_name,
                           val_label_file_name=val_label_file_name,
                           test_label_file_name=test_label_file_name)
        self.data = dataset.dataset(train_file_name,
                                    val_file_name,
                                    test_file_name,
                                    class_num,
                                    batch_size_train=batch_size_train,
                                    **label_files)

        gaus_files = (gaus_train_file_name, gaus_val_file_name,
                      gaus_test_file_name)
        self.gaus_sample = dataset.dataset(*gaus_files + (class_num,),
                                           batch_size_train=batch_size_train)

        attr_files = (attr_train_file_name, attr_val_file_name,
                      attr_test_file_name)
        self.attrdata = attrdataset.attrdataset(*attr_files)
Ejemplo n.º 9
0
    def __init__(self, data_directory, data_test_directory, n):

        print data_directory, data_test_directory

        self.data_directory = data_directory
        self.n = n

        X, y, tags = dataset.dataset(self.data_directory, self.n)
        self.nb_classes = len(tags)

        if data_test_directory == None:
            sample_count = len(y)
            train_size = sample_count * 4 // 5
            X_train = X[:train_size]
            y_train = y[:train_size]

            X_test = X[train_size:]
            y_test = y[train_size:]

        else:
            X_train = X
            y_train = y

            X_test, y_test, test_tags = dataset.dataset(data_test_directory, n)
            nb_classes_test = len(test_tags)
            print test_tags
            print nb_classes_test, self.nb_classes
            assert nb_classes_test == self.nb_classes

        Y_train = np_utils.to_categorical(y_train, self.nb_classes)
        Y_test = np_utils.to_categorical(y_test, self.nb_classes)
        X_train = [x.reshape(n, n, 3) for x in X_train]
        X_test = [x.reshape(n, n, 3) for x in X_test]

        self.datagen = ImageDataGenerator(featurewise_center=False,
                                          samplewise_center=False,
                                          featurewise_std_normalization=False,
                                          samplewise_std_normalization=False,
                                          zca_whitening=False,
                                          rotation_range=45,
                                          width_shift_range=0.25,
                                          height_shift_range=0.25,
                                          horizontal_flip=True,
                                          vertical_flip=True,
                                          zoom_range=0.5,
                                          channel_shift_range=0.5,
                                          fill_mode='nearest')

        self.X = X
        self.y = y
        self.X_train = np.array(X_train)
        self.X_test = np.array(X_test)
        self.Y_train = Y_train
        self.Y_test = Y_test
        self.y_train = y_train
        self.y_test = y_test
        self.tags = tags
Ejemplo n.º 10
0
 def test_parse_dataset(self):
     """Parse the .emc and the .h5 photons file into fresh datasets and check each."""
     print('=== Testing parse_dataset() ===')
     det = self.create_det()
     for photons_file in (b'/data/photons.emc', b'/data/photons.h5'):
         # A new dataset object is created for every input file.
         dset = dataset.dataset(det)
         dset.parse_dataset(recon_folder + photons_file)
         self.photons_tests(dset)
Ejemplo n.º 11
0
    def __init__(self,
                 input_dim,
                 hid_dim,
                 d1,
                 lrn_rate,
                 train_batch_size,
                 epoch_max,
                 momentum=0.0,
                 coef_recon=1.0,
                 coef_gan=1.0,
                 unseen_class_file_name=None,
                 train_file_name=None,
                 val_file_name=None,
                 test_file_name=None,
                 train_label_file_name=None,
                 val_label_file_name=None,
                 test_label_file_name=None,
                 train_attr_file_name=None,
                 val_attr_file_name=None,
                 test_attr_file_name=None,
                 log_file_name_head=None,
                 save_model_period=1,
                 load_model_directory=None,
                 generalizedZSL=False):
        """Record the settings, load the unseen-class array, build the datasets.

        ``unseen_class_file_name`` is passed straight to ``np.load``.
        ``self.data`` is built from the feature and label files and
        ``self.attr_data`` from the ``*_attr_file_name`` files.
        """
        # Dimensions.
        self.input_dim = input_dim
        self.hid_dim = hid_dim
        self.d1 = d1

        # Optimisation settings.
        self.lrn_rate = lrn_rate
        self.train_batch_size = train_batch_size
        self.epoch_max = epoch_max
        self.momentum = momentum

        # Loss coefficients.
        self.coef_recon = coef_recon
        self.coef_gan = coef_gan

        self.unseen_class = np.load(unseen_class_file_name)

        # Logging and checkpoint settings.
        self.log_file_name_head = log_file_name_head
        self.save_model_period = save_model_period
        self.load_model_directory = load_model_directory

        self.generalizedZSL = generalizedZSL

        feature_files = dict(train_file_name=train_file_name,
                             val_file_name=val_file_name,
                             test_file_name=test_file_name,
                             train_label_file_name=train_label_file_name,
                             val_label_file_name=val_label_file_name,
                             test_label_file_name=test_label_file_name)
        self.data = dataset.dataset(**feature_files)

        # The attribute files go through the same dataset class, via its
        # *_file_name keywords.
        attr_files = dict(train_file_name=train_attr_file_name,
                          val_file_name=val_attr_file_name,
                          test_file_name=test_attr_file_name)
        self.attr_data = dataset.dataset(**attr_files)
Ejemplo n.º 12
0
    def val(self,is_test=False):
        """Evaluate generation on the validation or test file and dump the text.

        Every batch goes through the model twice: once with ``test=False`` to
        get the generation loss, once with ``test=True`` to get the predicted
        responses. Contexts are written to 'context_test.txt' and predictions
        to 'output_test.txt'.

        Args:
            is_test: Read 'data/test_data.jsonl' when true, otherwise
                'data/valid_data.jsonl'.

        Returns:
            dict: ``self.metrics_gen`` with every key containing 'bleu'
            divided by 'count'; the other entries are returned as they are.
        """
        self.metrics_gen={"ppl":0,"dist1":0,"dist2":0,"dist3":0,"dist4":0,"bleu1":0,"bleu2":0,"bleu3":0,"bleu4":0,"count":0}
        self.metrics_rec={"recall@1":0,"recall@10":0,"recall@50":0,"loss":0,"gate":0,"count":0,'gate_count':0}
        self.model.eval()
        if is_test:
            val_dataset = dataset('data/test_data.jsonl', self.opt)
        else:
            val_dataset = dataset('data/valid_data.jsonl', self.opt)
        val_set=CRSdataset(val_dataset.data_process(True),self.opt['n_entity'],self.opt['n_concept'])
        val_dataset_loader = torch.utils.data.DataLoader(dataset=val_set,
                                                           batch_size=self.batch_size,
                                                           shuffle=False)
        inference_sum=[]
        golden_sum=[]
        context_sum=[]
        losses=[]
        recs=[]
        for context, c_lengths, response, r_length, mask_response, mask_r_length, entity, entity_vector, movie, concept_mask, dbpedia_mask, concept_vec, db_vec, rec in tqdm(val_dataset_loader):
            with torch.no_grad():
                seed_sets = []
                batch_size = context.shape[0]
                for b in range(batch_size):
                    # Indices of the non-zero entries of this sample's entity row.
                    seed_set = entity[b].nonzero().view(-1).tolist()
                    seed_sets.append(seed_set)
                # Pass 1 (test=False): only gen_loss is kept.
                _, _, _, _, gen_loss, mask_loss, info_db_loss, info_con_loss = self.model(context.cuda(), response.cuda(), mask_response.cuda(), concept_mask, dbpedia_mask, seed_sets, movie, concept_vec, db_vec, entity_vector.cuda(), rec, test=False)
                # Pass 2 (test=True): preds are the generated responses.
                scores, preds, rec_scores, rec_loss, _, mask_loss, info_db_loss, info_con_loss = self.model(context.cuda(), response.cuda(), mask_response.cuda(), concept_mask, dbpedia_mask, seed_sets, movie, concept_vec, db_vec, entity_vector.cuda(), rec, test=True, maxlen=20, bsz=batch_size)

            golden_sum.extend(self.vector2sentence(response.cpu()))
            inference_sum.extend(self.vector2sentence(preds.cpu()))
            context_sum.extend(self.vector2sentence(context.cpu()))
            recs.extend(rec.cpu())
            losses.append(torch.mean(gen_loss))

        self.metrics_cal_gen(losses,inference_sum,golden_sum,recs)

        output_dict_gen={}
        for key in self.metrics_gen:
            if 'bleu' in key:
                output_dict_gen[key]=self.metrics_gen[key]/self.metrics_gen['count']
            else:
                output_dict_gen[key]=self.metrics_gen[key]
        print(output_dict_gen)

        # Context managers close the files even if a write fails.
        with open('context_test.txt','w',encoding='utf-8') as f:
            f.writelines([' '.join(sen)+'\n' for sen in context_sum])

        with open('output_test.txt','w',encoding='utf-8') as f:
            f.writelines([' '.join(sen)+'\n' for sen in inference_sum])
        return output_dict_gen
Ejemplo n.º 13
0
    def __init__(self,
                 input_dim,
                 attr_dim,
                 disp_dim,
                 lrn_rate,
                 train_batch_size,
                 epoch_max,
                 momentum=0.0,
                 coef_match=1.0,
                 coef_recon=1.0,
                 train_file_name=None,
                 val_file_name=None,
                 test_file_name=None,
                 train_label_file_name=None,
                 val_label_file_name=None,
                 test_label_file_name=None,
                 train_attr_file_name=None,
                 val_attr_file_name=None,
                 test_attr_file_name=None,
                 log_file_name_head=None,
                 save_model_period=1,
                 load_model_directory=None):
        """Record the settings and build the feature and attribute datasets.

        ``self.data`` is built from the feature and label files and
        ``self.attr_data`` from the ``*_attr_file_name`` files.
        """
        # Dimensions.
        self.input_dim = input_dim
        self.attr_dim = attr_dim
        self.disp_dim = disp_dim

        # Optimisation settings.
        self.lrn_rate = lrn_rate
        self.train_batch_size = train_batch_size
        self.epoch_max = epoch_max
        self.momentum = momentum

        # Loss coefficients.
        self.coef_match = coef_match
        self.coef_recon = coef_recon

        # Logging and checkpoint settings.
        self.log_file_name_head = log_file_name_head
        self.save_model_period = save_model_period
        self.load_model_directory = load_model_directory

        feature_files = dict(train_file_name=train_file_name,
                             val_file_name=val_file_name,
                             test_file_name=test_file_name,
                             train_label_file_name=train_label_file_name,
                             val_label_file_name=val_label_file_name,
                             test_label_file_name=test_label_file_name)
        self.data = dataset.dataset(**feature_files)

        # The attribute files go through the same dataset class, via its
        # *_file_name keywords.
        attr_files = dict(train_file_name=train_attr_file_name,
                          val_file_name=val_attr_file_name,
                          test_file_name=test_attr_file_name)
        self.attr_data = dataset.dataset(**attr_files)
Ejemplo n.º 14
0
 def test_parse_dataset(self):
     """Parse the .emc photons file, check it, then parse it again on the same object."""
     print('=== Testing parse_dataset() ===')
     dset = dataset.dataset(self.create_det())
     emc_path = recon_folder+b'/data/photons.emc'
     dset.parse_dataset(emc_path)
     self.photons_tests(dset)
     # Second parse on the already-populated dataset object.
     dset.parse_dataset(emc_path)
Ejemplo n.º 15
0
def train():
    """Build the RNN, compile it with binary cross-entropy, and fit it.

    All settings (``units``, ``total_words``, ``embedding_len``,
    ``input_len``, ``optimizer``, ``epochs``) are read from module scope.
    """
    model = MyRNN(units, total_words, embedding_len, input_len)
    train_loader, test_loader = dataset()
    compile_options = dict(optimizer=optimizer,
                           loss=tf.losses.BinaryCrossentropy(),
                           metrics=['accuracy'])
    model.compile(**compile_options)
    # The second loader is used as validation data during fitting.
    model.fit(train_loader, epochs=epochs, validation_data=test_loader)
Ejemplo n.º 16
0
def main():
    """
        Begin training the classifiers using gathered training data
    """

    good = 0
    bad = 0
    for i in range(0, 10):
        """ The frame classifier to be trained """
        svcf = svc_frame()
        """ Still handler """
        s = still()
        """ Load all the training data from know folder loaction """
        d = dataset()
        """ Create training set """
        features = []
        labels = []
        for i in range(0, d.len()):
            s.load(d.feature(i))
            features.append(s.compress_make_linear())
            labels.append(d.is_frame(i))
        """ Split for validiation """
        features_train, features_test, labels_train, labels_test = \
            cross_validation.train_test_split(features, labels, test_size=0.2, random_state=int(time.time()))
        """ Train """
        for i in range(0, len(features_train)):
            svcf.train(features_train[i], labels_train[i])
        """ Validate """
        for i in range(0, len(features_test)):
            if svcf.is_frame(features_test[i]) == labels_test[i]:
                good += 1
            else:
                bad += 1

        print good, bad
Ejemplo n.º 17
0
 def test_generate_blacklist(self):
     """Check generate_blacklist() with no options, a blacklist file, and odd_only."""
     print('=== Testing generate_blacklist() ===')
     det = self.create_det()
     dset = dataset.dataset(det)
     dset.parse_dataset(recon_folder+b'/data/photons.emc')

     def regenerate_and_check(expected_sum):
         # Rebuild the blacklist, then check its length and number of set entries.
         dset.generate_blacklist(config_fname)
         self.assertEqual(dset.blacklist.shape[0], 3000)
         self.assertEqual(dset.blacklist.sum(), expected_sum)

     # Default configuration: nothing is blacklisted.
     regenerate_and_check(0)
     # Call again on the already-populated object.
     dset.generate_blacklist(config_fname)

     # Blacklist file marking the first ten frames.
     blist_fname = recon_folder+b'/data/blacklist.dat'
     blist = np.zeros(dset.tot_num_data, dtype='u1')
     blist[:10] = 1
     np.savetxt(blist_fname.decode('utf-8'), blist, fmt='%d')
     config = DragonflyConfig(config_fname)
     config.modify_entry('emc', 'blacklist_file', 'data/blacklist.dat')
     regenerate_and_check(10)
     os.remove(blist_fname)

     # 'odd_only' selection in place of the blacklist file.
     config.remove_entry('emc', 'blacklist_file')
     config.modify_entry('emc', 'selection', 'odd_only')
     regenerate_and_check(1500)
     config.remove_entry('emc', 'selection')
Ejemplo n.º 18
0
def run(dataset_dir, output_dir):
    """Save every loaded sample as ``<output_dir>/frame-<index>.png``.

    Args:
        dataset_dir: Directory passed to ``dataset.dataset``.
        output_dir: Directory prefix for the PNG files.
    """
    # Only the first element of the loaded triple is used.
    frames, _labels, _tags = dataset.dataset(dataset_dir, 299)
    for index, frame in enumerate(frames):
        target_path = output_dir + "/frame-%d.png" % index
        scipy.misc.imsave(target_path, frame)
Ejemplo n.º 19
0
def get_dataset(type, positive_mfcc, negative_mfcc, positive_speakers, negative_speakers, positive_listeners,
                negative_listeners, max = None, min = None):
    """Build a dataset with speaker and/or listener paths chosen by ``type``.

    "both" forwards speaker and listener paths, "speaker" only the speaker
    paths, "listener" only the listener paths; any other value forwards
    neither. Paths that are not forwarded are passed as None.
    """
    with_speakers = type == "both" or type == "speaker"
    with_listeners = type == "both" or type == "listener"

    return dataset.dataset(
        positive_mfcc, negative_mfcc, max = max, min = min,
        p_listeners_path = positive_listeners if with_listeners else None,
        p_speakers_path = positive_speakers if with_speakers else None,
        n_listeners_path = negative_listeners if with_listeners else None,
        n_speakers_path = negative_speakers if with_speakers else None)
Ejemplo n.º 20
0
    def __init__(self, unlabeled_datasets = None, models = None, undersample_before_eval = False):
        '''
        unlabeled_datasets should be either (1) a string pointing to a single data file (e.g., "mydata.txt") or (2) a list of strings
        pointing to multiple data files that represent the same data with different feature spaces. For more on the data format,
        consult the doc or see the samples.        
        '''
        if isinstance(unlabeled_datasets, str):
            # then a string, presumably pointing to a single data file, was passed in
            unlabeled_datasets  = [unlabeled_datasets]
            
        self.unlabeled_datasets = unlabeled_datasets or []
        # initialize empty labeled datasets (i.e., all data is unlabeled to begin with)
        # note that we give the labeled dataset the same name as the corresponding
        # unlabeled dataset
        if unlabeled_datasets is not None:
            self.labeled_datasets = [dataset.dataset(name=d.name) for d in unlabeled_datasets]

        self.models = models
        self.undersample_before_eval = undersample_before_eval 
        self.undersample_function = self.undersample_labeled_datasets if undersample_before_eval else None
        
        self.query_function = self.base_q_function # throws exception if not overridden 
        self.name = "Base"
        self.description = ""
        self.longer_name = ""
        
        # default prediction function; only important if you're aggregating multiple feature spaces (see 
        # cautious_predict function documentation)
        self.predict_func = self.at_least
        print "using prediction function: %s" % self.predict_func.__name__
        
        # if this is false, the models will not be rebuilt after each round of active learning
        self.rebuild_models_at_each_iter = True 
    def test_pearson_recommendation2(self):
        """Check Pearson similarities and one predicted score on dataset no. 3."""
        # Select the dataset
        import dataset
        data = dataset.dataset(3)

        # Configure the model under test
        cf_model = cf(data.Us, data.Is, data.U2I2Rating, "pearson", "normalized")

        # Similarity checks against the target user
        target = "u1"
        expected_sims = (("u2", -0.8), ("u3", 1.0), ("u4", 0.0))
        for other, expected in expected_sims:
            self.assertEqual(cf_model.get_sim(target, other), expected)

        # Score check
        # NOTE(review): the user id literal below looks masked in this copy
        # of the file -- confirm the intended value.
        score = cf_model.calc_score("******", "i6")
        expected_score = 3.0 + (2-(-0.8)) / (abs(1) + abs(-0.8))  # 4.555...
        self.assertEqual(score, expected_score)
Ejemplo n.º 22
0
def main():
    """Load and normalise the vertebrate training CSV, then train DBSCAN on it."""
    points = dataset("vertebrate_train_nonoise.csv")
    points.normalize()

    # NOTE(review): 2 and 3 are presumably the DBSCAN neighbourhood
    # parameters -- confirm against the DBSCAN class.
    clusterer = DBSCAN(points, 2, 3)
    clusterer.train_DBSCAN()
Ejemplo n.º 23
0
    def test_calc_sum_fact(self):
        """Compare sum_fact of the first and last frame with a direct computation."""
        print('=== Testing calc_sum_fact() ===')
        det = self.create_det()
        dset = dataset.dataset(det)
        dset.parse_dataset(recon_folder + b'/data/photons.emc')
        dset.calc_sum_fact()

        for idx in (0, -1):
            # Rebuild the frame: 1 at the place_ones pixels, count_multi at
            # the place_multi pixels.
            frame = np.zeros(dset.num_pix, dtype='i4')

            ones_start = dset.ones_accum[idx]
            ones_span = slice(ones_start, ones_start + dset.ones[idx])
            frame[dset.place_ones[ones_span]] = 1

            multi_start = dset.multi_accum[idx]
            multi_span = slice(multi_start, multi_start + dset.multi[idx])
            frame[dset.place_multi[multi_span]] = dset.count_multi[multi_span]

            # Expected value: sum over pixels of log(count!).
            expected = np.log(scipy.special.factorial(frame)).sum()
            self.assertAlmostEqual(expected, dset.sum_fact[idx])
Ejemplo n.º 24
0
    def test_generate_blacklist(self):
        """Check generate_blacklist() with no options, a blacklist file, and odd_only."""
        print('=== Testing generate_blacklist() ===')
        dset = dataset.dataset(self.create_det())
        dset.parse_dataset(recon_folder + b'/data/photons.emc')

        def check_after_generate(n_blacklisted):
            # Rebuild the blacklist, then check its length and set-entry count.
            dset.generate_blacklist(config_fname)
            self.assertEqual(dset.blacklist.shape[0], 3000)
            self.assertEqual(dset.blacklist.sum(), n_blacklisted)

        # Default configuration: nothing is blacklisted.
        check_after_generate(0)
        # Call again on the already-populated object.
        dset.generate_blacklist(config_fname)

        # Blacklist file marking the first ten frames.
        blist_fname = recon_folder + b'/data/blacklist.dat'
        blist = np.zeros(dset.tot_num_data, dtype='u1')
        blist[:10] = 1
        np.savetxt(blist_fname.decode('utf-8'), blist, fmt='%d')
        config = DragonflyConfig(config_fname)
        config.modify_entry('emc', 'blacklist_file', 'data/blacklist.dat')
        check_after_generate(10)
        os.remove(blist_fname)

        # 'odd_only' selection in place of the blacklist file.
        config.remove_entry('emc', 'blacklist_file')
        config.modify_entry('emc', 'selection', 'odd_only')
        check_after_generate(1500)
        config.remove_entry('emc', 'selection')
Ejemplo n.º 25
0
def main():
    """Build one mosaic image from four random samples and save the results.

    Writes three files in the current directory:
      * ``output.jpg``     -- the mosaic image,
      * ``output_box.jpg`` -- the mosaic with the bounding boxes drawn,
      * ``output.txt``     -- one annotation per line, converted from
        corner form to centre/size form.
    """
    img_paths, annos = dataset(ANNO_DIR, IMG_DIR)

    # Four distinct sample indices are combined into one mosaic.
    idxs = random.sample(range(len(annos)), 4)

    new_image, new_annos = mosaic(img_paths, annos,
                                  idxs,
                                  OUTPUT_SIZE, SCALE_RANGE,
                                  filter_scale=FILTER_TINY_SCALE)

    cv2.imwrite('output.jpg', new_image)  # The mosaic image

    # Annotation fields 1..4 are relative corner coordinates: they are scaled
    # by the output width (OUTPUT_SIZE[1]) and height (OUTPUT_SIZE[0]).
    for anno in new_annos:
        start_point = (int(anno[1] * OUTPUT_SIZE[1]), int(anno[2] * OUTPUT_SIZE[0]))
        end_point = (int(anno[3] * OUTPUT_SIZE[1]), int(anno[4] * OUTPUT_SIZE[0]))
        cv2.rectangle(new_image, start_point, end_point, (0, 255, 0), 1, cv2.LINE_AA)
    cv2.imwrite('output_box.jpg', new_image)  # The mosaic image with the bounding boxes

    # Convert (first field, xmin, ymin, xmax, ymax) to
    # (first field, x_center, y_center, width, height). The first field is
    # passed through unchanged (presumably the class id).
    yolo_anno = [
        [
            anno[0],
            (anno[3] + anno[1]) / 2,
            (anno[4] + anno[2]) / 2,
            anno[3] - anno[1],
            anno[4] - anno[2],
        ]
        for anno in new_annos
    ]

    # The output annotation file will appear in the output.txt file
    with open('output.txt', 'w') as out_file:
        for line in yolo_anno:
            out_file.write(' '.join([str(x) for x in line]) + '\n')
Ejemplo n.º 26
0
def test(args):
    """
    Predict on a test dataset given a model.

    :param args: mapping of run options. Keys read here:
        ``'m'`` (passed to ``seq2seq.load``), ``'train_data'``,
        ``'test_data'``, ``'E'`` (path to binary word2vec embeddings, or
        ``None``), ``'b'`` (second argument of ``predict_batch``, presumably
        the batch size) and ``'O'`` (passed to ``predictions2json``,
        presumably the output location).
    """

    device = get_device(args)

    net = seq2seq.load(args['m'])

    # The training data is read again so that the test set can reuse its
    # word/slot/intent converters.
    dts = dataset(device)
    dts.read_training_dataset(args['train_data'])

    test_set = test_dataset(device,
                            words_converter=dts.words_converter,
                            slots_converter=dts.slots_converter,
                            intent_converter=dts.intent_converter)

    # With pretrained embeddings the test set is read unlocked; otherwise it
    # is read with lock=True.
    use_pretrained = args['E'] is not None
    test_set.read_test_dataset(args['test_data'], lock=not use_pretrained)
    if use_pretrained:
        embeddings = gensim.models.KeyedVectors.load_word2vec_format(
            args['E'], binary=True)
        net.pretrained_embeddings(test_set, embeddings)

    print(dts.intent_converter.no_entries())
    # predict!
    intent_pred, slots_pred = net.predict_batch(test_set, args['b'])

    predictions2json(test_set, intent_pred, slots_pred, args['O'])
Ejemplo n.º 27
0
 def undersample_labeled_datasets(self, k=None):
     '''
     Undersamples the current labeled datasets, i.e., makes the two classes of equal sizes.
     Note that this method returns a *copy* of the undersampled datasets. Thus it
     *does not mutate the labeled datasets*.

     k -- number of majority instances to remove. If falsy (None or 0), it defaults to
          the difference between the majority and minority class sizes of the first
          labeled dataset.

     Returns a list with one undersampled copy per labeled dataset.
     Raises Exception if there is no labeled data.
     '''
     if not (self.labeled_datasets and len(self.labeled_datasets[0].instances)):
         raise Exception("No labeled data has been provided!")

     primary = self.labeled_datasets[0]
     if not k:
         print("undersampling majority class to equal that of the minority examples")
         # we have to include 'false' minorities -- i.e., instances we've assumed are positives -- because otherwise we'd be cheating
         k = primary.number_of_majority_examples() - primary.number_of_minority_examples()

     # we copy the datasets rather than mutate the class members.
     copied_datasets = [dataset.dataset(list(d.instances)) for d in self.labeled_datasets]
     if k < primary.number_of_majority_examples() and k > 0:
         # make sure we have enough majority examples...
         print("removing %s majority instances. there are %s total majority examples in the dataset." % (k, primary.number_of_majority_examples()))
         removed_instances = copied_datasets[0].undersample(k)
         # get the removed instance numbers
         removed_instance_nums = [inst.id for inst in removed_instances]
         # if there is more than one feature-space, remove the same instances from the remaining spaces (sets)
         for labeled_dataset in copied_datasets[1:]:
             labeled_dataset.remove_instances(removed_instance_nums)
     return copied_datasets
Ejemplo n.º 28
0
def predict(model_path, dataset_dir):
    """Run a loaded TensorFlow graph over a dataset, one sample at a time.

    Each input is also logged as an image summary under ``/tmp/log/``.

    :param model_path: passed to ``load_graph``, which returns the graph and
        the names of its input and output tensors.
    :param dataset_dir: passed to ``dataset.dataset`` together with 299
        (presumably the image size -- confirm against the dataset module).
    :return: list with the first element of the output for every sample.
    """
    # load tf graph
    tf_model, tf_input, tf_output = load_graph(model_path)

    # Create tensors for model input and output
    x = tf_model.get_tensor_by_name(tf_input)
    y = tf_model.get_tensor_by_name(tf_output)

    # Get data (only the inputs are needed for prediction)
    data_X, _, _ = dataset.dataset(dataset_dir, 299)

    predictions = []
    with tf.Session(graph=tf_model) as sess:
        tf.summary.image('input', x, max_outputs=7)
        writer = tf.summary.FileWriter("/tmp/log/")
        try:
            writer.add_graph(sess.graph)
            merged_summary = tf.summary.merge_all()

            # Feed samples one by one as a batch of size 1.
            for i, d in enumerate(data_X):
                s, prediction = sess.run([merged_summary, y], feed_dict={x: [d]})
                writer.add_summary(s, i)
                predictions.append(prediction[0])
        finally:
            # Flush pending events and release the event file.
            writer.close()

    return predictions
    def test_pearson_recommendation1(self):
        """Check Pearson similarities and one score with the "normalized" model."""
        # Select the dataset
        import dataset
        data = dataset.dataset(2)

        # Configure the model under test
        cf_model = cf(data.Us, data.Is, data.U2I2Rating, "pearson", "normalized")

        # Similarity checks against the target user u2
        target = "u2"
        expected_sims = (("u1", 0.0), ("u3", 1.0), ("u4", -1.0))
        for other, expected in expected_sims:
            self.assertEqual(cf_model.get_sim(target, other), expected)

        # Score check
        self.assertEqual(cf_model.calc_score("******", "i1"), 2.75)
Ejemplo n.º 30
0
def main():
    """
    Get the dataset, model
    Set the callback
    Train and save the best weights based on validation accuracy

    Checkpoints are written under the ``training/`` directory, which is
    created if it does not exist yet.
    """
    train_images, train_labels, test_images, test_labels = dataset()

    model = get_model()

    model.summary()

    checkpoint_path = "training/cp-{epoch:04d}.ckpt"
    # Make sure the target directory exists before anything is saved into it.
    checkpoint_dir = os.path.dirname(checkpoint_path)
    os.makedirs(checkpoint_dir, exist_ok=True)

    # Keep only the weights of the epoch with the best validation accuracy.
    cp_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_path,
        verbose=1,
        monitor="val_accuracy",
        save_best_only=True,
        save_weights_only=True)

    # Save the weights using the `checkpoint_path` format
    model.save_weights(checkpoint_path.format(epoch=0))

    # Train the model with the new callback
    model.fit(train_images,
              train_labels,
              epochs=100,
              validation_data=(test_images, test_labels),
              callbacks=[cp_callback],
              verbose=2)
Ejemplo n.º 31
0
def extract_features(features_opts, dataset_opts, params):
    """Extract features for every (image, depth) file pair of the dataset.

    :param features_opts: forwarded to ``feature_extraction``.
    :param dataset_opts: forwarded to ``dataset``, which returns two file lists.
    :param params: forwarded to ``feature_extraction``.
    :return: ``(files1, features_first_half, files2, features_second_half)``.
    """
    print("# Extracting image features")
    files1, files2 = dataset(dataset_opts)
    features = [
        feature_extraction(img_file, depth_file, features_opts, params)
        for img_file, depth_file in print_progress(files1 + files2)
    ]
    # Integer division keeps the slice index an int on Python 3 as well.
    # NOTE(review): splitting at the midpoint assumes files1 and files2 have
    # the same length -- confirm against dataset().
    half = len(features) // 2
    return files1, features[:half], files2, features[half:]
Ejemplo n.º 32
0
def run(session):
    """Build the dataset and graph, then train with a validation pass per epoch.

    :param session: TensorFlow session used to initialise variables and
        handed to the per-batch and validation helpers through ``RunInfo``.
    """
    data = dataset.dataset(max_length=config.EXAMPLE_MAX_LEN,
                           num_examples=config.NUM_EXAMPLES)

    model_graph = graph.graph(
        batch_size=config.BATCH_SIZE,
        sequence_length=config.EXAMPLE_MAX_LEN,
        vocab_size=len(data.word_to_idx),
        num_embedding_dimensions=config.NUM_EMBEDDING_DIMENSIONS,
        num_lstm_layers=config.NUM_LSTM_LAYERS,
        num_lstm_units=config.NUM_LSTM_UNITS,
        start_word_idx=data.word_to_idx[dataset.start_word],
        stop_word_idx=data.word_to_idx[dataset.stop_word],
    )

    session.run(tf.global_variables_initializer())

    # Creating the writer with the graph attached records it under logs/.
    graph_writer = tf.summary.FileWriter('logs/', session.graph)

    run_info = RunInfo(dataset=data, graph=model_graph, session=session)
    # NOTE(review): both ranges start at 1 and exclude their upper bound, so
    # NUM_EPOCHS - 1 epochs of num_batches - 1 batches are run -- confirm
    # this is intended.
    for epoch_idx in range(1, config.NUM_EPOCHS):
        num_batches = len(data.training_idx_examples) // config.BATCH_SIZE
        for batch_idx in range(1, num_batches):
            run_batch(run_info,
                      BatchInfo(epoch_idx=epoch_idx, batch_idx=batch_idx))
        run_validation(run_info, epoch_idx)
Ejemplo n.º 33
0
def main():
    """Seed torch, build the data loaders and the network, then train and test."""
    # Use the configured seed when it is truthy, otherwise torch's current one.
    seed = RANDOM_SEED or torch.initial_seed()
    torch.manual_seed(seed)

    # Data loaders for training and evaluation.
    trainloader, testloader = dataset(BATCH_SIZE)

    def clamp_unit(x):
        # Assuming x is a torch.Tensor
        return x.clamp(0, 1)

    # Network built directly from the module-level hyper-parameters.
    eqprop_net = EqPropNet_NoGrad(
        batch_size=BATCH_SIZE,
        layers_sizes=LAYER_SIZES,
        learning_rates=LEARNING_RATES,
        n_iter_1=N_ITER_1,
        n_iter_2=N_ITER_2,
        rho=clamp_unit,
        beta=BETA,
        dt=DELTA,
    )

    # Train, then validate.
    train(eqprop_net, trainloader)
    test(eqprop_net, testloader)
Ejemplo n.º 34
0
def problem_generator_y(N,
                        dev,
                        mode: GenerationMode,
                        factory: ProblemFactory,
                        path=None):
    """
    Generator variant of problem_generator.

    It can be used as a callback while training a neural network so that
    fresh training data is produced at the beginning of each epoch: every
    iteration clears the previously generated right-hand sides, generates and
    solves N new problems, and yields them wrapped in a dataset. The
    generator never terminates on its own.

    For a more detailed description, see problem_generator.
    """
    extracted = extract(mode, factory, path=path)
    # Only the first five extracted components are used to build the problem.
    leading = [extracted[i] for i in range(5)]
    root_problem = lin_opt_pbs(*leading, mode=mode)
    root_problem.set_deviation(dev)

    while True:
        root_problem.clear_generated_RHS()
        solutions = root_problem.generate_and_solve(N)
        right_hand_sides = root_problem.extract_RHS()
        yield dataset(right_hand_sides, solutions)
Ejemplo n.º 35
0
    def test_parse_dataset_list(self):
        """Parse list files naming two photon files (emc+emc, then emc+h5)."""
        print('=== Testing parse_dataset_list() ===')
        det = self.create_det()
        photons = dataset.dataset(det)
        list_path = b'test_dset_flist.txt'
        emc_entry = (recon_folder + b'/data/photons.emc\n').decode('utf-8')
        h5_entry = (recon_folder + b'/data/photons.h5\n').decode('utf-8')

        def write_parse_and_check(entries):
            # Write the list file, parse it, then check both linked datasets.
            with open(list_path, 'w') as handle:
                handle.writelines(entries)
            count = photons.parse_dataset_list(list_path)
            self.photons_tests(photons, count)
            self.photons_tests(photons.next, count, False)

        write_parse_and_check([emc_entry, emc_entry])
        # Parse the same list once more without assertions.
        photons.parse_dataset_list(list_path)

        write_parse_and_check([emc_entry, h5_entry])

        os.remove(list_path)
    def test_cosine_recommendation_sim_multi_score(self):
        """Check the scores of user u1 with cosine similarity and "sim_multi_rating"."""
        # Select the dataset
        import dataset
        data = dataset.dataset(1)

        # Configure the model under test
        cf_model = cf(data.Us, data.Is, data.U2I2Rating, "cosine", "sim_multi_rating")

        # Score checks for user u1
        expected_scores = (
            ("i1", 2.652365080899908),
            ("i2", 0.3434277633975243),
            ("i3", 2.7852678291248507),
            ("i4", 2.2348318324104093),
        )
        for item, expected in expected_scores:
            self.assertEqual(cf_model.calc_score("u1", item), expected)

        # Reference values for the remaining users (not asserted):
        # U2 [('i4', 6.5480200273566895), ('i3', 5.685121675688175), ('i1', 3.2869185147104245), ('i2', 2.737414017877664)]
        # u3 [('i3', 7.04574847038176), ('i4', 5.517604872186145), ('i1', 2.5262421022799537), ('i2', 1.415727598843663)]
        # u4 [('i4', 7.945448566343385), ('i3', 7.371732349896483), ('i1', 3.142243637026426), ('i2', 1.6623943235017373)]
        # u5 [('i4', 7.690628217779786), ('i3', 6.50655550912908), ('i1', 3.5991851423622627), ('i2', 0.8617748202735571)]
Ejemplo n.º 37
0
    def test_generate_data(self):
        """Exercise generate_data() with a single photons file and with list files.

        The config file is modified while the test runs and the original
        'in_photons_file' entry is written back at the end.
        """
        print('=== Testing generate_data() ===')
        det = self.create_det()
        photons = dataset.dataset(det)

        # Single-file configuration, generated twice in a row.
        photons.generate_data(config_fname)
        self.photons_tests(photons)
        photons.generate_data(config_fname)

        list_path = recon_folder + b'/test_photons_list.txt'

        def write_list(entries):
            with open(list_path, 'w') as handle:
                handle.writelines(entries)

        def generate_and_check_pair():
            # Two datasets are expected: the head and the one linked via .next
            photons.generate_data(config_fname)
            self.photons_tests(photons, 2)
            self.photons_tests(photons.next, 2, False)

        # List of two emc files. Having both the file and the list entry set
        # must raise an AssertionError.
        write_list(['data/photons.emc\n', 'data/photons.emc\n'])
        cfg = DragonflyConfig(config_fname)
        cfg.modify_entry('emc', 'in_photons_list',
                         list_path.decode('utf-8'))
        self.assertRaises(AssertionError, photons.generate_data, config_fname)
        cfg.remove_entry('emc', 'in_photons_file')
        generate_and_check_pair()

        # Same check with a list of two h5 files.
        write_list(['data/photons.h5\n', 'data/photons.h5\n'])
        cfg.modify_entry('emc', 'in_photons_list',
                         list_path.decode('utf-8'))
        generate_and_check_pair()

        # Clean up the list file and restore the single-file entry.
        os.remove(list_path)
        cfg.remove_entry('emc', 'in_photons_list')
        cfg.modify_entry('emc', 'in_photons_file',
                         'make_data:::out_photons_file')
Ejemplo n.º 38
0
    def main(self):
        """Set up the chatbot and run it in the mode selected by the settings.

        After loading configuration and building the model, exactly one mode
        runs: Twitter (returns immediately), batch file testing, interactive
        CLI testing, or training. The session is closed afterwards, except in
        Twitter mode, which returns before that point.
        """

        print("SmartGator Intelligent chatbot")

        self.root_dir = os.getcwd()
        self.load_config()
        self.load_model_params()
        self.load_args()
        self.update_settings()
        self.text_data = dataset(self.args)

        # RNN Model Initialized #
        self.model = RNNModel(self.text_data, self.args)

        # Handlers to write and save learned models #
        self.writer = tf.summary.FileWriter(self._get_summary_name())
        self.saver = tf.train.Saver(max_to_keep=200,
                                    write_version=tf.train.SaverDef.V1)

        self.session = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=False))

        print("Initializing tf variables")
        self.session.run(tf.global_variables_initializer())

        # If a previous model exists load it and proceed from last run step #
        self.manage_previous_model(self.session)

        # If using word2vec model we need to load word vectors #
        if self.init_embeddings:
            self.load_embedding(self.session)

        # Twitter Interface up or not #
        if self.twitter:
            # NOTE(review): the session is left open here, presumably for the
            # Twitter interface to keep using it -- confirm.
            return

        # Batch Testing #
        elif self.file_:
            # Best effort: failures are reported on stdout rather than raised.
            # `except Exception` keeps that behavior but no longer swallows
            # KeyboardInterrupt / SystemExit.
            try:
                with open(self.TEST_IN_NAME, "r") as f:
                    try:
                        with open(self.TEST_OUT_SUFFIX, 'w') as output:
                            for line in f:
                                # Strip only the line terminator; slicing off
                                # the last character would corrupt a final
                                # line that has no trailing newline.
                                question = line.rstrip("\n")
                                output.write(
                                    self.predict_daemon(question) + "\n")
                    except Exception:
                        print("Writing in file is a problem")
            except Exception:
                print("Open file error")

        # Else if in CLI testing mode #
        elif self.test:
            self.interactive_main(self.session)

        # Else in training mode #
        else:
            self.train_model(self.session)

        self.session.close()
        print("Say Bye Bye to SmartGator! ;)")
Ejemplo n.º 39
0
 def test_free_data(self):
     """Free a parsed dataset twice and check that num_data ends up as None."""
     print('=== Testing free_data() ===')
     det = self.create_det()
     photons = dataset.dataset(det)
     photons_path = recon_folder + b'/data/photons.emc'
     photons.parse_dataset(photons_path)
     # Two consecutive calls: the second one acts on already freed data.
     for _ in range(2):
         photons.free_data()
     self.assertIsNone(photons.num_data)
Ejemplo n.º 40
0
 def allocate_iterate(self):
     """Create and populate the objects needed for an iterate test.

     Returns the tuple (iterate, detector, dataset, params, qmax), where qmax
     is the value returned by generate_detectors().
     """
     iterate_obj = iterate.iterate()
     detector_obj = detector.detector()
     dataset_obj = dataset.dataset(detector_obj)
     params_obj = params.params()

     # Populate each object from the shared config file.
     max_q = detector_obj.generate_detectors(config_fname)
     dataset_obj.generate_data(config_fname)
     params_obj.generate_params(config_fname)
     dataset_obj.generate_blacklist(config_fname)
     iterate_obj.generate_iterate(config_fname, max_q, params_obj,
                                  detector_obj, dataset_obj)

     return iterate_obj, detector_obj, dataset_obj, params_obj, max_q
Ejemplo n.º 41
0
 def test_parse_data(self):
     """Parse a list file that names the same emc photons file twice."""
     print('=== Testing parse_data() ===')
     det = self.create_det()
     photons = dataset.dataset(det)
     list_path = b'test_dset_flist.txt'
     emc_entry_bytes = recon_folder + b'/data/photons.emc\n'
     with open(list_path, 'w') as handle:
         handle.writelines([emc_entry_bytes.decode('utf-8'),
                            emc_entry_bytes.decode('utf-8')])

     count = photons.parse_data(list_path)
     self.photons_tests(photons, count)
     self.photons_tests(photons.next, count, False)

     # Parse the same list once more without assertions.
     photons.parse_data(list_path)
     os.remove(list_path)
Ejemplo n.º 42
0
 def test_calc_sum_fact(self):
     """Compare calc_sum_fact() with a direct computation for the first and last frame."""
     print('=== Testing calc_sum_fact() ===')
     det = self.create_det()
     dset = dataset.dataset(det)
     dset.parse_dataset(recon_folder+b'/data/photons.emc')
     dset.calc_sum_fact()

     def reference_sum_fact(idx):
         # Rebuild the dense frame from the sparse storage, then sum the
         # log-factorial over all pixels.
         dense = np.zeros(dset.num_pix, dtype='i4')
         ones_start = dset.ones_accum[idx]
         ones_span = slice(ones_start, ones_start + dset.ones[idx])
         dense[dset.place_ones[ones_span]] = 1
         multi_start = dset.multi_accum[idx]
         multi_span = slice(multi_start, multi_start + dset.multi[idx])
         dense[dset.place_multi[multi_span]] = dset.count_multi[multi_span]
         return np.log(scipy.special.factorial(dense)).sum()

     for idx in (0, -1):
         self.assertAlmostEqual(reference_sum_fact(idx), dset.sum_fact[idx])
Ejemplo n.º 43
0
 def test_make_blacklist(self):
     """Exercise make_blacklist() with and without a file and odd/even selection."""
     print('=== Testing make_blacklist() ===')
     det = self.create_det()
     dset = dataset.dataset(det)
     dset.parse_dataset(recon_folder+b'/data/photons.emc')

     def check_blacklist(expected_sum, window=None, expected_values=None):
         # The blacklist always covers all 3000 frames; optionally also
         # compare a short window of entries.
         self.assertEqual(dset.blacklist.shape[0], 3000)
         self.assertEqual(dset.blacklist.sum(), expected_sum)
         if window is not None:
             npt.assert_array_equal(dset.blacklist[window], expected_values)

     # No file, no selection
     dset.make_blacklist(b'')
     check_blacklist(0)

     # No file, odd/even selection
     dset.make_blacklist(b'', odd_flag=2)
     check_blacklist(1500, slice(None, 4), [0,1,0,1])

     dset.make_blacklist(b'', odd_flag=1)
     check_blacklist(1500, slice(None, 4), [1,0,1,0])

     # Blacklist file marking the first ten frames
     mask_path = recon_folder+b'/data/blacklist.dat'
     mask = np.zeros(dset.tot_num_data, dtype='u1')
     mask[:10] = 1
     np.savetxt(mask_path.decode('utf-8'), mask, fmt='%d')
     dset.make_blacklist(mask_path)
     check_blacklist(10, slice(8, 12), [1,1,0,0])

     # Behavior when both blacklist file and odd/even selection
     # Alternate frames which are not blacklisted by file are blacklisted
     dset.make_blacklist(mask_path, odd_flag=2)
     check_blacklist(1505, slice(8, 12), [1,1,0,1])

     dset.make_blacklist(mask_path, odd_flag=1)
     check_blacklist(1505, slice(8, 12), [1,1,1,0])
     os.remove(mask_path)
Ejemplo n.º 44
0
 def __init__(self, unlabeled_datasets = None, models = None, undersample_before_eval = False):
     '''
     unlabeled_datasets should be either (1) a string pointing to a single data file (e.g., "mydata.txt") or (2) a list of strings
     pointing to multiple data files that represent the same data with different feature spaces. For more on the data format,
     consult the doc or see the samples. If omitted, the learner starts with no datasets.

     models -- stored as-is on the instance.
     undersample_before_eval -- stored as self.undersample_first.
     '''
     if unlabeled_datasets is None:
         # A fresh list per instance; a shared default list would leak state between learners.
         unlabeled_datasets = []
     elif isinstance(unlabeled_datasets, str):
         # then a string, presumably pointing to a single data file, was passed in
         unlabeled_datasets = [unlabeled_datasets]

     self.unlabeled_datasets = unlabeled_datasets
     # initialize empty labeled datasets (i.e., all data is unlabeled to begin with)
     self.labeled_datasets = [dataset.dataset([]) for d in unlabeled_datasets]
     self.models = models
     self.undersample_first = undersample_before_eval
     self.query_function = self.base_q_function # throws exception if not overridden
     self.name = "Base"

     # default prediction function; only important if you're aggregating multiple feature spaces (see
     # cautious_predict function documentation)
     self.predict = self.majority_predict
Ejemplo n.º 45
0
 def test_generate_data(self):
     """Exercise generate_data() with a single photons file and with a list file.

     The config file is modified while the test runs and the original
     'in_photons_file' entry is written back at the end.
     """
     print('=== Testing generate_data() ===')
     det = self.create_det()
     photons = dataset.dataset(det)

     # Single-file configuration, generated twice in a row.
     photons.generate_data(config_fname)
     self.photons_tests(photons)
     photons.generate_data(config_fname)

     # List of two emc files.
     list_path = recon_folder+b'/test_photons_list.txt'
     entries = ['data/photons.emc\n', 'data/photons.emc\n']
     with open(list_path, 'w') as handle:
         handle.writelines(entries)
     cfg = DragonflyConfig(config_fname)
     cfg.modify_entry('emc', 'in_photons_list', list_path.decode('utf-8'))
     # Having both the file and the list entry set must raise.
     self.assertRaises(AssertionError, photons.generate_data, config_fname)
     cfg.remove_entry('emc', 'in_photons_file')
     photons.generate_data(config_fname)
     self.photons_tests(photons, 2)
     self.photons_tests(photons.next, 2, False)

     # Clean up the list file and restore the single-file entry.
     os.remove(list_path)
     cfg.remove_entry('emc', 'in_photons_list')
     cfg.modify_entry('emc', 'in_photons_file', 'make_data:::out_photons_file')
        RecIs = [item for item,score in I2Score[:N]]
        return RecIs




        
if __name__ == "__main__":
    import sys
    import dataset

    print('***** begin *****');sys.stdout.flush()

    print('=== get dataset ===');sys.stdout.flush()
    no = 1
    data = dataset.dataset(no, N=2000, M=400, K=20, R=5, seed=1)
    # print(data)
    
    print('=== create CF model ===');sys.stdout.flush()
    #sim_func_name = "pearson"
    sim_func_name = "cosine"
    #model_type_name = "normalized"
    model_type_name = "sim_multi_rating"
    cf_model = cf(data.Us, data.Is, data.U2I2Rating, sim_func_name, model_type_name)
    print(sim_func_name)
    print(model_type_name)
    # print("I2Us :", cf_model.I2Us)

    print('=== calc scores ===');sys.stdout.flush()
    U2I2Score = cf_model.calcU2I2Score(cf_model.Us)
    for user in cf_model.Us:
Ejemplo n.º 47
0
def run_experiments_hold_out(data_paths, outpath, hold_out_p = .25,  datasets_for_eval = None, upto = None, step_size = 25,
                                                  initial_size = 2, batch_size = 5,  pick_balanced_initial_set = True,
                                                  num_runs=10, report_results_after_runs=True):
    '''
    This method demonstrates how to use the active learning framework, and is also a functional routine for comparing learners. Basically,
    a number of runs will be performed, the active learning methods will be evaluated at each step, and results will be reported. The results
    for each run will be dumped to text files, which can then be combined (e.g., averaged) elsewhere, or you can use the results_reporter
    module to aggregate and plot the output.

    @parameters
    --
    data_paths -- this is either a list (pointing to multiple feature spaces for the same instances) or a string pointing to a single data file (this will be
                                the typical case). e.g., data_paths = "mydata.txt". curious_snake uses a sparse-formatted weka-like format, documented elsewhere.
    outpath -- this is a directory under which all of the results will be dumped.
    hold_out_p -- the hold out percentage, i.e., how much of your data will be used for evaluation. you can ignore this if you're providing your own
                                  dataset(s) for evaluation (i.e., datasets_for_eval is not None).
    datasets_for_eval -- use this if you have datasets you want to use for testing -- i.e., to specify your hold out set independent of the data
                                                in data_paths.
    upto -- active learning will stop when upto examples have been labeled. if this is None, upto will default to the total unlabeled pool available
    initial_size -- the size of 'bootstrap' set to use prior to starting active learning (for the initial models)
    batch_size -- the number of examples to be labeled at each iteration in active learning -- optimally, 1
    step_size -- results will be reported every time another step_size examples have been labeled
    pick_balanced_initial_set -- if True, the initial train dataset will be built over an equal number (initial_size/2) of both classes.
    num_runs -- this many runs will be performed
    report_results_after_runs -- if true, the results_reporter module will be used to generate output.
    '''
    # names of the learners used in the runs; stays empty if no run is performed
    learner_names = []

    for run in range(num_runs):
        print("\n********\non run %s" % run)

        print(data_paths)
        num_labels_so_far = initial_size # set to initial size for first iteration

        if not os.path.isdir(outpath):
            os.mkdir(outpath)

        # if a string (pointing to a single dataset) is passed in, box it in a list
        data_paths = box_if_string(data_paths)
        datasets = [dataset.build_dataset_from_file(f) for f in data_paths]
        total_num_examples = len(datasets[0].instances)

        test_datasets = []
        if datasets_for_eval is not None:
            # if a test set is specified, use it.
            datasets_for_eval = box_if_string(datasets_for_eval)
            test_datasets = [dataset.build_dataset_from_file(f) for f in datasets_for_eval]
            if upto is None:
                upto = total_num_examples
        else:
            # otherwise, we copy the first (even if there are multiple datasets, it won't matter, as we're just using
            # the labels) and pick random examples
            hold_out_size = int(hold_out_p * total_num_examples)
            test_instances = random.sample(datasets[0].instances, hold_out_size)
            test_instance_ids = [inst.id for inst in test_instances]
            # now remove them from the dataset(s)
            for d in datasets:
                cur_test_dataset = dataset.dataset(d.remove_instances(test_instance_ids))
                test_datasets.append(cur_test_dataset)

            # if no upper bound was passed in, use the whole pool U
            if upto is None:
                upto = total_num_examples - hold_out_size

            # hold_out_size only exists on this branch, so the message is reported here
            print("using %s out of %s instances for test set" % (hold_out_size, total_num_examples))

        print("U has cardinality: %s" % datasets[0].size())

        #
        # Here is where learners can be added for comparison
        #
        learners = [random_learner.RandomLearner([d.copy() for d in datasets]),
                    simple_learner.SimpleLearner([d.copy() for d in datasets]),
                    nb_learner.NBLearner([d.copy() for d in datasets])]
        learner_names = [learner.name for learner in learners]

        output_files = [open("%s//%s_%s.txt" % (outpath, learner.name, run), 'w') for learner in learners]
        try:
            # we arbitrarily pick the initial ids from the first learner; this doesn't matter, as we just use the instance ids
            initial_f = learners[0].get_random_unlabeled_ids
            init_size = num_labels_so_far
            if pick_balanced_initial_set:
                initial_f = learners[0].pick_balanced_initial_training_set
                init_size = int(num_labels_so_far/2.0) # equal number from both classes

            # Again, you could call *.initial_f on any learner -- it just returns the ids to label initially. these will
            # be the same for all learners.
            init_ids = initial_f(init_size)

            # label instances and build initial models
            for learner in learners:
                learner.label_instances_in_all_datasets(init_ids)
                learner.rebuild_models()

            # report initial results, to console and file.
            report_results(learners, test_datasets, num_labels_so_far, output_files)

            first_iter = True
            while num_labels_so_far <= upto - step_size:
                #
                # the main active learning loop
                #
                cur_step_size = step_size
                cur_batch_size = batch_size
                if first_iter:
                    # here we account for the initial labeled dataset size. for example, suppose
                    # the step_size is set to 25 (we want to report results every 25 labels),
                    # but the initial size was 2; then we want to label 23 on the first iteration
                    # so that we report results when 25 total labels have been provided
                    cur_step_size = step_size - num_labels_so_far if num_labels_so_far <= step_size \
                                    else step_size - (num_labels_so_far - step_size)
                    # in general, step_size is assumed to be a multiple of batch_size, for the first iteration,
                    # when we're catching up to the step_size (as outlined above), we set the
                    # batch_size to 1 to make sure this condition holds.
                    cur_batch_size = 1
                    first_iter = False

                for learner in learners:
                    learner.active_learn(cur_step_size, num_to_label_at_each_iteration = cur_batch_size)

                num_labels_so_far += cur_step_size
                print("\n***labeled %s examples out of %s so far***" % (num_labels_so_far, upto))

                report_results(learners, test_datasets, num_labels_so_far, output_files)
        finally:
            # close files, even if a learner failed part-way through the run
            for output_file in output_files:
                output_file.close()

    # post-experimental reporting (skipped when no run was performed)
    if report_results_after_runs and learner_names:
        results_reporter.post_runs_report(outpath, learner_names, num_runs)
Ejemplo n.º 48
0
            dset the dataset object
        output:
            P [1d ndarray] probability P(L|Ix)
        '''
        #print("test>>mnode:{}".format(self))
        if self.tau is None:#reaching terminal node
            return self.P
        else:
            #if (self.L is not None and goLeft) :
            if (dset.getI(self.theta,x)<self.tau) :
                return self.L.getP(x,dset)
            else:
                return self.R.getP(x,dset)
    
    def getL(self,x,dset):
        '''
        Predict the label of one sample as the index of the largest
        entry of the probabilities returned by getP.

        input:
            x sample index [int]
            dset the dataset object
        output:
            L [integer] label
        '''
        posterior = self.getP(x, dset)
        label = np.argmax(posterior)
        return label

if __name__ == '__main__':
    # Smoke test: build a dataset, print it, and construct a root node.
    from dataset import dataset
    dset=dataset()
    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print(dset)
    root=mnode()
    
    
Ejemplo n.º 49
0
import tensorflow as tf
import dataset

# Load the training data from train.json. 'cuisine' and 'ingredients' are
# passed to load_from_json (presumably the label field and the feature field --
# confirm against the dataset module).
data = dataset.dataset()
data.load_from_json("train.json", 'cuisine', 'ingredients')

# Network dimensions: one input per vocabulary entry, one output per label.
input_size = len(data.vocabulary)
categories =  len(data.labels)

def weight_variable(shape):
    """Return a tf.Variable of `shape` initialised from a truncated normal (stddev 0.1)."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def bias_variable(shape):
    """Return a tf.Variable of `shape` with every entry initialised to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))

def relu_activation(W, x, bias=None):
    """Return relu(x . W + bias).

    W -- weight tensor.
    x -- input tensor; it is the left operand of the matrix product.
    bias -- bias tensor. When omitted, the module-level name `b` is used
            (NameError if the module defines no such name).
    """
    if bias is None:
        bias = b
    # The ReLU op lives in tf.nn; there is no top-level tf.relu.
    return tf.nn.relu(tf.matmul(x, W) + bias)

def softmax(W, x):
    """Return the softmax of x * W for a layer with weights `W` and input `x`.

    The logits are the tf.matmul product of `x` and `W`; tf.nn.softmax
    requires them as its argument.
    """
    # NOTE(review): no bias term is added here -- confirm whether the output
    # layer is meant to have one.
    return tf.nn.softmax(tf.matmul(x, W))

# Placeholders for the network input and the expected output. The leading
# dimension is None so the number of rows fed in is not fixed.
tf_x = tf.placeholder(tf.float32, shape=[None, input_size])
y_expected = tf.placeholder(tf.float32, shape=[None, categories])

# First hidden layer parameters.
# The weight shape is [lower_layer_size, upper_layer_size]: input_size
# inputs feeding 1000 hidden units.
W_h1 = weight_variable([input_size, 1000])
Ejemplo n.º 50
0
import theano
import theano.tensor as T
import numpy as np
import time as ti
import dataset as d
import markhov as m
from theano.tensor.nnet.conv import conv2d
from theano.tensor.signal.downsample import max_pool_2d
from sklearn import metrics
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
srng = RandomStreams()
import pandas as pd
from PIL import Image

# Build the project's dataset and markhov helpers.
# NOTE(review): test_size=0.15 is presumably the held-out fraction -- confirm
# against dataset.dataset.
ds=d.dataset(test_size=0.15)
ma=m.markhov()
# Fetch one test batch of 128 up front and keep it as the reference
# ("gold") inputs and labels.
x_gold, labels_gold = ds.test_batch(size=128,emit=False)
# Define symbolic Theano variables: a 4-D tensor `x` and a matrix `t`.
x = T.tensor4()
t = T.matrix()

# define model: neural network
def floatX(x):
    """Return `x` as a NumPy array whose dtype is theano.config.floatX."""
    target_dtype = theano.config.floatX
    return np.asarray(x, dtype=target_dtype)

def init_weights(shape):
    """Return a Theano shared variable of the given shape.

    The initial values are standard-normal samples scaled by 0.1 and
    converted with floatX.
    """
    initial_values = np.random.randn(*shape) * 0.1
    return theano.shared(floatX(initial_values))

def meanfscore(y_pred,y_true):
    """Return sklearn's f1_score with average='samples'.

    Both inputs are rounded to the nearest integer and cast to int before
    scoring; `y_true` is passed as the ground truth and `y_pred` as the
    prediction.
    """
    truth = np.array(np.rint(y_true), dtype="int")
    predicted = np.array(np.rint(y_pred), dtype="int")
    return metrics.f1_score(truth, predicted, average='samples')
Ejemplo n.º 51
0
import uninet as ssb
import neuralnetwork_tensorflow as nntf
import cPickle
import activationFunction as af
import dataset

# Load the pickled MNIST training set. The file is opened in a `with` block
# so the handle is closed as soon as loading finishes, even if it fails.
# NOTE: unpickling can execute arbitrary code; only load pickles from a
# trusted source.
with open("/media/tassadar/Work/Google Drive/My/NeuralNet/data/mnist/MNISTTrainingSet_square", 'rb') as trainingSetFile:
    old_trainingSet = cPickle.load(trainingSetFile)
#cvSet = cPickle.load(open("/media/tassadar/Work/Google Drive/My/NeuralNet/data/mnist/MNISTTestSet_square", 'rb'))

# Re-wrap the loaded examples and labels in the current dataset class, then
# rearrange them with rearrangeToCubic().
trainingSet = dataset.dataset(examples=old_trainingSet.examples, labels=old_trainingSet.labels)
trainingSet.rearrangeToCubic()

# Layer stack: a 28x28x1 input, one convolutional layer (patch size 3,
# depth 16, ReLU activation) and a 10-neuron softmax output.
l1 = ssb.input(input_size=[28,28,1])
l2 = ssb.convolutional(patchSize=3, strides=[1,1,1,1], depth=16, activationFunction=af.relu)
l3 = ssb.softmax(neurons=10)

# Assemble the network with log loss as its error function.
net = nntf.neuralnetwork(layers=[l1,l2,l3], errorFunction=ssb.logLoss)

# Train for 10000 epochs with minibatches of 100 and a learning rate of 0.1.
# NOTE(review): errorCheckPeriod=100 presumably sets how often the error is
# evaluated -- confirm against neuralnetwork.train.
net.train(trainingSet=trainingSet, numEpochs=10000, minibatchSize=100, learningRate=0.1, errorCheckPeriod=100)
from dataset import dataset
import numpy as np
import matplotlib.pyplot as plt
import sys

# The dataset name is the first command-line argument; output files go
# under datasets/<name>.
ds_name = sys.argv[1]
dir_name = "datasets/%s"%(ds_name)

# Load "<name>_ts.txt" and take the 'binary' user->item and item->user
# mappings from it.
dataset_ts = dataset("%s_ts.txt"%(ds_name))
u2i = dataset_ts.user_item_matrix['binary']
i2u = dataset_ts.item_user_matrix['binary']

# Sorted key lists give the output files a stable order.
users = sorted(u2i.keys())
items = sorted(i2u.keys())

# Write one "<user> <number of entries in u2i[user]>" line per entry of
# `users`. The `with` block closes the file even if a write fails.
with open("%s/user_history_lengths.txt"%(dir_name), 'w') as f:
    for user in users:
        f.write("%d %d\n" % (user, len(u2i[user])))

# Same layout for items: "<item> <number of entries in i2u[item]>".
with open("%s/item_listening_lengths.txt"%(dir_name), 'w') as f:
    for item in items:
        f.write("%d %d\n" % (item, len(i2u[item])))
Ejemplo n.º 53
0
def main(_):
    # Unpack the four values returned by dataset.dataset() and hand them to
    # train() in the same order. The positional argument is ignored.
    training, testing, index_to_word, word_to_index = dataset.dataset()
    train(training, testing, index_to_word, word_to_index)
    data = {}

    for header, column in header_column.iteritems():
        data[header] = None if column > len(row) else row[column-1]

    return data

# Maps each row's 'ASR_ID' value to the dict built for that row by data().
asr_id_rows = {}

# Skip the first element of `values` and index every remaining row.
for idx, r in enumerate(values[1:]):
    # NOTE(review): idx + 2 is presumably the 1-based sheet row number after
    # a header row -- confirm; it is not used in the lines visible here.
    row_number = idx + 2
    row = data(r)
    # A later row with the same ASR_ID overwrites the earlier one.
    asr_id_rows[row['ASR_ID']] = row

# Creates a CSV file with all pledges in the neighborhood boundary.
with dataset() as ds:
    addressLayer = ds.layers['address']
    neighborhoodLayer = ds.layers['neighborhood']

    for neighborhood in neighborhoodLayer:

        neighborhoodName = neighborhood.GetField("name")

        print neighborhoodName

        # One output directory per neighborhood, created on first use.
        targetDir = os.path.join(r'C:\personal\BeeSafeBoulder\GoogleDrive-BeeSafe\BeeSafe', neighborhoodName)
        if not os.path.exists(targetDir):
            os.mkdir(targetDir)

        # Restrict the address layer to this neighborhood's geometry and to
        # records whose ADDR_FMT is "EXACT".
        addressLayer.SetSpatialFilter(neighborhood.GetGeometryRef())
        addressLayer.SetAttributeFilter('ADDR_FMT = "EXACT"')
Ejemplo n.º 55
0
			r.data[x] = float(r.data[x])
		else:
			r.data[x] = -1

# Build the root node and a set S, initialised to contain the root node.
# While the root node is non-empty, do the following:
# Take a node R out of S; if
# R holds records of a single attribute
# 	--> it becomes a leaf node
# otherwise
# 	split it by the best attribute,
# 	put the resulting child nodes into S and remove R from S.
# Build the corresponding tree structure.
	

# The root node wraps all of test_datas; unpure_list starts out holding
# only the root.
root_node = dataset(test_datas, [])
unpure_list = [root_node]

records = test_datas
# attr = attr_list[4]
# print attr
# print 'split_by_attr(test_datas, attr): ', split_by_attr(test_datas, attr)
# print 'gain_ratio(records, attr) : ', gain_ratio(records, attr)
# print 'split_by_attr(records, attr_list[0]): ', split_by_attr(records, attr_list[0])

print records[0]
[gains, attr_best] = [-1, None]
for attr in attr_list[0:-1]:
	if gains<gain_ratio(records, attr):
		[gains, attr_best] = [gain_ratio(records, attr), attr]
print gains, attr_best
Ejemplo n.º 56
0
def main():
    """Build a dataset, run its three modelling steps and save it as a GIF.

    The steps run in order: background state, fires (called with 30), then
    cloud cover.
    """
    scene = dataset()
    scene.model_background_state()
    scene.model_fires(30)
    scene.model_cloud_cover()
    scene._save_to_gif()
Ejemplo n.º 57
0
# https://github.com/tensorflow/tensorflow/issues/1541
import scipy.misc

from keras.utils import np_utils

import dataset
import net

# Seed NumPy's global random generator before any data is loaded.
np.random.seed(1337)

n = 224
batch_size = 128

# Exactly one command-line argument is expected; any other count makes the
# unpacking raise ValueError.
(data_directory,) = sys.argv[1:]

X, y, tags = dataset.dataset(data_directory, n)
nb_classes = len(tags)

# The first 4/5 of the samples (integer division) form the training split
# and the remainder the test split.
sample_count = len(y)
train_size = sample_count * 4 // 5
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Categorical versions of the labels for both splits.
Y_train = np_utils.to_categorical(y_train, nb_classes)
Y_test = np_utils.to_categorical(y_test, nb_classes)


def evaluate(model, vis_filename=None):
    # Predict on the module-level X_test in batches of batch_size, then take
    # the index of the largest value along axis 1 of the predictions.
    # NOTE(review): vis_filename is not used by the lines visible here.
    Y_pred = model.predict(X_test, batch_size=batch_size)
    y_pred = np.argmax(Y_pred, axis=1)
Ejemplo n.º 58
0
Archivo: main.py Proyecto: erraX/rescal
    parser.add_argument("--log", type=str, help="log file", default="rescal.log", required=False)
    parser.add_argument("--result", type=str, help="result file", default="result.txt",  required=False)
    return parser.parse_args()

if __name__ == '__main__':
    # Parsing arguments
    start_time = datetime.now()
    cliInputArgs = parseArguments()

    # Log file
    logFile = "./log/" + cliInputArgs.log
    # Result file
    # NOTE(review): the handle is not closed in the lines visible here.
    resultToFile = open("./result/" + cliInputArgs.result, 'w')

    # Build the runner from the train and test datasets, both constructed
    # with "UTF-8" as their second argument.
    logger = Logger()
    runRescal = RunRescal(dataset(cliInputArgs.train, "UTF-8"), dataset(cliInputArgs.test, "UTF-8"), logger)
    # Algorithm settings taken from the command line.
    config = {'numLatentComponents':cliInputArgs.latent, 'regularizationParam':cliInputArgs.lmbda, 'th':cliInputArgs.th}
    runRescal.rescal(config)

    # Start training
    result = rescal(runRescal.X, config['numLatentComponents'], lmbda=config['regularizationParam'])
    # result[2] is logged as the objective function value and result[3] as
    # the iteration count.
    logger.getLog().info('[Tensor] Objective function value: %.3f' % result[2])
    logger.getLog().info('[Tensor] Of iterations: %d' % result[3])

    # result[0] and result[1] are the factor matrices A and R; R is indexed,
    # so it holds one matrix per entry.
    A = result[0]
    R = result[1]
    logger.getLog().info("[Tensor] Matrix A's shape: %s" % str(A.shape))
    logger.getLog().info("[Tensor] Matrix R's shape: %s" % str(R[0].shape))
    # _log.info("## Execute time: %s" % str(sum(result[4])))
    # Evaluate algorithm performance
    resultToFile.write("-" * 20 + "\n")
Ejemplo n.º 59
0
            if sample[0] == sample[1]:
                count += 1
        self.logger.getLog().info("Çount: %d, Evaluation length: %d" % (count, len(evaluation)))
        # print "Count:", count, " Evaluation: ", len(evaluation)
        return totalScore * 1.0 / len(allRel), totalAccuracy * 1.0 / len(allRel)

if __name__ == '__main__':
    start_time = datetime.now()
    # parseArguments returns two mappings: data-related and algorithm-related
    # arguments.
    dataArgs, algoArgs = parseArguments()

    logFile = "./log/" + dataArgs['log']
    # NOTE(review): the handle is not closed in the lines visible here.
    resultToFile = open("./result/" + dataArgs['result'], 'w')

    # Initialise the logger and the algorithm instance.
    logger = Logger()
    runRescal = RunRescal(dataset(dataArgs['train'], "UTF-8"), dataset(dataArgs['test'], "UTF-8"), logger)

    # Run the RESCAL and Translating Embedding (TransE) algorithms.
    runRescal.rescal(algoArgs, False)
    runRescal.RunTransE("myData/FOAF/ent2id_c.txt", "myData/FOAF/rel2id_c.txt", "myData/FOAF/entity2vec.foaf.bern", "myData/FOAF/relation2vec.foaf.bern")
    # runRescal.tranE(loadPickle(dataArgs['embed']))
    # t1, t2 = runRescal.training(0.001)
    # for t in [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]:
    # for t in [0, ]:
    #     runRescal.calEveryRelationScore(t, 1-t)
    #     testCase = runRescal.pickPredictedResult()
    #     roc, acc = runRescal.roc([(i[-1], i[-3]) for i in testCase])
    #     # for t in testCase:
    #     #     print t[0], t[1], t[2], t[3]
    #     print "t1: %f, t2 %f, ROC: %f, ACC: %f" % (t, 1-t, roc, acc)
    # end_time = datetime.now()