def run_benchmark(self, output_file_path):
    """
    Run benchmark for all learners on every dataset provided.
    The result will be output to the provided file.
    """
    dataloader = DataLoader()

    f = open(output_file_path, 'w')

    # header row: dataset, learner_1_name, learner_2_name, ..., learner_n_name
    f.write("dataset")
    for learner in self.learners:
        f.write(", " + learner.name())

    # write scores for all data sets
    for dataset_id in self.dataset_ids:
        print "Benchmarking dataset: " + str(dataset_id)
        f.write("\n" + str(dataset_id))
        train_data = dataloader.load_sequences_from_file("../data/" + str(dataset_id) + ".pautomac" + ".train")
        test_data = dataloader.load_sequences_from_file("../data/" + str(dataset_id) + ".pautomac" + ".test")
        solution_data = dataloader.load_probabilities_from_file("../data/" + str(dataset_id) + ".pautomac_solution" + ".txt")
        for learner in self.learners:
            print "Training learner: " + learner.name()
            learner.train(train_data, test_data)
            print "Evaluating learner: " + learner.name()
            score = learner.evaluate(test_data, solution_data)
            print "Achieved score: " + str(score)
            str_score = " {0:.1f}".format(score)
            while len(str_score) < 8:
                str_score = " " + str_score
            f.write(", " + str_score)
    f.close()
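# A sketch of the CSV layout run_benchmark produces. The dataset ids, learner names
# and scores below are purely illustrative; the real names come from each learner's
# name() method and the scores are right-aligned to eight characters:
#
#   dataset, SpectralLearner, BaselineLearner
#   1,     67.3,     81.2
#   2,    154.9,    171.3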
def extractData():
    parser = OptionParser()
    parser.add_option("--inputDir", dest="inputDir", help="Input directory", metavar="DIRECTORY")
    parser.add_option("--mrc_number", dest="mrc_number", help="Number of mrc files to be trained.", metavar="VALUE", default=-1)
    parser.add_option("--coordinate_symbol", dest="coordinate_symbol", help="The symbol of the coordinate file, like '_manualPick'", metavar="STRING")
    parser.add_option("--particle_size", dest="particle_size", help="the size of the particle.", metavar="VALUE", default=-1)
    parser.add_option("--save_dir", dest="save_dir", help="save the training samples to this directory", metavar="DIRECTORY", default="../trained_model")
    parser.add_option("--save_file", dest="save_file", help="save the training samples to file", metavar="FILE")
    (opt, args) = parser.parse_args()

    inputDir = opt.inputDir
    particle_size = int(opt.particle_size)
    coordinate_symbol = opt.coordinate_symbol
    mrc_number = int(opt.mrc_number)
    output_dir = opt.save_dir
    output_filename = opt.save_file
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    if particle_size == -1:
        print("particle size should be a positive value!")
        return 

    output_filename = os.path.join(output_dir, output_filename)
    DataLoader.extractData(inputDir, particle_size, coordinate_symbol, mrc_number, output_filename)
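# A hypothetical command line for this extraction script. The script name and all of
# the values are made up for illustration; only the flag names come from the parser
# registered above:
#
#   python extractData.py --inputDir ../data/micrographs --coordinate_symbol _manualPick \
#       --particle_size 180 --mrc_number 100 \
#       --save_dir ../extracted_data --save_file training_samples.pickle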
Example #3
def load_data(filePath: str,
              label_txt_filePath: str,
              shuffle: bool = True,
              seq_length: int = 3000,
              batch_size: int = 64,
              training: bool = True):
    voc = Vocab()
    dataLoader = DataLoader()

    # all sequences
    dataLoader.sequences = dataLoader.read_fasta_file(fasta_file_path=filePath)
    # training set
    dataLoader.train_seq = dataLoader.sequences[:900]
    # test set
    dataLoader.test_seq = dataLoader.sequences[900:1000]
    # labels, 0/1
    dataLoader.labels = dataLoader.read_label_txt(
        label_file_path=label_txt_filePath)
    # vectorized representation of the training set
    dataLoader.train_vectorized_seq = voc.sequences_to_ids(
        dataLoader.train_seq)
    # vectorized representation of the test set
    dataLoader.test_vectorized_seq = voc.sequences_to_ids(dataLoader.test_seq)

    # print(dataLoader.train_vectorized_seq)
    # print(dataLoader.test_vectorized_seq)
    # x_batch, y_batch = dataLoader.get_batch(shuffle=shuffle, seq_length=seq_length, batch_size=batch_size, training=training)
    # print("x_batch.shape={}, y_batch.shape={}".format(x_batch.shape, y_batch.shape))
    # print("x_batch[0]:{}".format(x_batch[0]))
    # print("y_batch[0]:{}".format(y_batch[0]))

    return voc, dataLoader
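# A minimal usage sketch. The FASTA and label paths are placeholders, and the
# get_batch signature is taken from the commented-out call above, so it may differ
# in the actual DataLoader implementation:
#
#   voc, loader = load_data("sequences.fasta", "labels.txt",
#                           shuffle=True, seq_length=3000, batch_size=64, training=True)
#   x_batch, y_batch = loader.get_batch(shuffle=True, seq_length=3000,
#                                       batch_size=64, training=True)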
Example #4
def _reading_data():
    print(config.USER)

    # step 2: load the data
    # loading covers:
    #   - how the data is read
    #   - how the data is preprocessed
    #   - any special data-cleaning steps
    trainFilepath = os.path.join(os.getcwd(), "data", config.FILENAME)
    trainDataLoader = DataLoader(trainFilepath)
    train_data = trainDataLoader.load_data(useSpark=False, interactive=False)

    train_data.save_data(os.getcwd())
Example #5
def main():
    ranker = SVMRank()
    file_name = 'input/BioASQ-trainingDataset6b.json'
    data = DataLoader(file_name)
    data.load_ner_entities()
    questions = data.get_questions_of_type(C.FACTOID_TYPE)[:419]

    for i, question in enumerate(questions):
        ranked_sentences = question.ranked_sentences()
        X, y = get_features(question, ranked_sentences)
        ranker.feed(X, y, i)

    ranker.train_from_feed()
    ranker.save('weights_2')
Example #6
def main():
    file_name = 'input/BioASQ-task6bPhaseB-testset3.json'
    file_name = 'input/BioASQ-trainingDataset6b.json'
    file_name = 'input/BioASQ-trainingDataset5b.json'
    file_name = 'input/phaseB_5b_05.json'
    save_model_file_name = 'weights_2'
    ranker = SVMRank(save_model_file_name)
    data = DataLoader(file_name)
    data.load_ner_entities()
    ans_file = 'output/factoid_list_%s.json' % data.name

    questions = data.get_questions_of_type(C.FACTOID_TYPE)
    for i, question in enumerate(tqdm(questions)):
        ranked_sentences = question.ranked_sentences()
        X, candidates = get_only_features(question, ranked_sentences)
        top_answers = ranker.classify_from_feed(X, candidates, i)
        question.exact_answer = [[answer] for answer in top_answers[:5]]
        # question.exact_answer = [answer for answer in top_answers]
        # print question.exact_answer_ref
        # print '\n'
        # print top5
        # print '\n'
        # print '\n\n\n'
    questions = data.get_questions_of_type(C.LIST_TYPE)
    for i, question in enumerate(tqdm(questions)):
        ranked_sentences = question.ranked_sentences()
        X, candidates = get_only_features(question, ranked_sentences)
        top_answers = ranker.classify_from_feed(X, candidates, i)
        question.exact_answer = [[answer] for answer in top_answers[:10]]

    data.save_factoid_list_answers(ans_file)
def run_cv(fold_iterator, logger, params_dict, upsample=True):
    for traindirs, testdirs in fold_iterator:
        # TRAIN LOCAL PREDICTION MODEL
        # Generators
        logger.info('############ FOLD #############')
        logger.info('Training folders are {}'.format(traindirs))
        training_generator = DataLoader(data_dir,
                                        traindirs,
                                        32,
                                        width_template=params_dict['width'],
                                        upsample=upsample)
        validation_generator = DataLoader(data_dir,
                                          testdirs,
                                          32,
                                          width_template=params_dict['width'],
                                          type='val',
                                          upsample=upsample)

        # Design model
        model = create_model(params_dict['width'] + 1,
                             params_dict['h1'],
                             params_dict['h2'],
                             params_dict['h3'],
                             embed_size=params_dict['embed_size'],
                             drop_out_rate=params_dict['dropout_rate'],
                             use_batch_norm=params_dict['use_batchnorm'])
        # Train model on training dataset
        '''
        model.fit_generator(generator=training_generator,
                            validation_data=validation_generator,
                            use_multiprocessing=True,
                            epochs=params_dict['n_epochs'],
                            workers=6)
        '''
        try:
            model.load_weights(os.path.join(checkpoint_dir, 'model22.h5'))
        except OSError:
            print('No saved checkpoint found; training from scratch')
            model.fit_generator(generator=training_generator,
                                validation_data=validation_generator,
                                use_multiprocessing=True,
                                epochs=params_dict['n_epochs'],
                                workers=4,
                                max_queue_size=20)
            model.save_weights(os.path.join(checkpoint_dir, 'model.h5'))
        metrics = model.evaluate_generator(generator=validation_generator,
                                           workers=4,
                                           max_queue_size=20)
        logger.info(metrics)
Example #9
def get_data_loaders(data_root, speaker_id, test_shuffle=True):
    data_loaders = {}
    local_conditioning = hparams.cin_channels > 0
    for phase in ["train", "test"]:
        train = phase == "train"
        X = FileSourceDataset(
            RawAudioDataSource(data_root,
                               speaker_id=speaker_id,
                               train=train,
                               test_size=hparams.test_size,
                               test_num_samples=hparams.test_num_samples,
                               random_state=hparams.random_state))
        if local_conditioning:
            Mel = FileSourceDataset(
                MelSpecDataSource(data_root,
                                  speaker_id=speaker_id,
                                  train=train,
                                  test_size=hparams.test_size,
                                  test_num_samples=hparams.test_num_samples,
                                  random_state=hparams.random_state))
            assert len(X) == len(Mel)
            print("Local conditioning enabled. Shape of a sample: {}.".format(
                Mel[0].shape))
        else:
            Mel = None
        print("[{}]: length of the dataset is {}".format(phase, len(X)))

        if train:
            lengths = np.array(X.file_data_source.lengths)
            # Prepare sampler
            sampler = PartialyRandomizedSimilarTimeLengthSampler(
                lengths, batch_size=hparams.batch_size)
            shuffle = False
        else:
            sampler = None
            shuffle = test_shuffle

        dataset = PyTorchDataset(X, Mel)
        data_loader = DataLoader(dataset,
                                 batch_size=hparams.batch_size,
                                 num_workers=hparams.num_workers,
                                 sampler=sampler,
                                 shuffle=shuffle,
                                 collate_fn=collate_fn,
                                 pin_memory=hparams.pin_memory)

        speaker_ids = {}
        for idx, (x, c, g) in enumerate(dataset):
            if g is not None:
                try:
                    speaker_ids[g] += 1
                except KeyError:
                    speaker_ids[g] = 1
        if len(speaker_ids) > 0:
            print("Speaker stats:", speaker_ids)

        data_loaders[phase] = data_loader

    return data_loaders
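# A hedged usage sketch: collate_fn presumably packs each batch as (x, c, g), matching
# the (x, c, g) unpacking in the speaker-stats loop above; the data_root value here is
# a placeholder:
#
#   loaders = get_data_loaders("./data/processed", speaker_id=None, test_shuffle=True)
#   for x, c, g in loaders["train"]:
#       pass  # x: waveform batch, c: mel conditioning (or None), g: speaker id (or None)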
Example #10
    def _build_data(self, data_dir='train_dir', num_classes=10, mode='train'):
        loader = DataLoader(data_dir=data_dir, num_classes=num_classes,
                            mode=mode, height=self.height, width=self.width)

        dataset = tf.data.Dataset.from_generator(generator=loader.generator,
                                                 output_types=(tf.float32,
                                                               tf.int32),
                                                 output_shapes=(tf.TensorShape([self.height, self.width, 3]),
                                                                tf.TensorShape([self.num_classes])))
        return dataset
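    # A sketch of how the returned dataset might be consumed, following the TF1-style
    # batching and initializable-iterator handling used by the test() methods elsewhere
    # in these examples (the batch size and the self.it attribute are assumptions here):
    #
    #   dataset = self._build_data(data_dir='train_dir', num_classes=10, mode='train')
    #   dataset = dataset.shuffle(100).batch(32).prefetch(10)
    #   train_init = self.it.make_initializer(dataset)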
Example #11
def prepare_data_loader_train_10_splits(texture_train_data_set_path,
                                        texture_train_label_set_path,
                                        texture_val_data_set_path,
                                        texture_val_label_set_path,
                                        texture_batch_size, num_workers,
                                        device):
    data_loader_list = []
    for i in range(10):
        idx = i + 1
        print("Split: {0}".format(idx))
        # bind the formatted paths to new names so the "{0}" placeholder in the
        # template arguments is not overwritten on the first pass through the loop
        train_data_path = texture_train_data_set_path.format(idx)
        train_label_path = texture_train_label_set_path.format(idx)
        val_data_path = texture_val_data_set_path.format(idx)
        val_label_path = texture_val_label_set_path.format(idx)

        dL = DataLoader()
        texture_train_set, train_set_size = dL.get_tensor_set(
            train_data_path, train_label_path, device)
        texture_val_set, val_set_size = dL.get_tensor_set(
            val_data_path, val_label_path, device)
        print("Train set size: {0}".format(train_set_size))
        print("Val set size: {0}".format(val_set_size))

        texture_train_data_loader = torch.utils.data.DataLoader(
            texture_train_set,
            batch_size=texture_batch_size,
            shuffle=True,
            num_workers=num_workers)
        texture_val_data_loader = torch.utils.data.DataLoader(texture_val_set,
                                                              num_workers=1,
                                                              shuffle=False,
                                                              pin_memory=True)

        data_loader_dict = {
            "train": texture_train_data_loader,
            "val": texture_val_data_loader
        }
        data_loader_list.append(data_loader_dict)

    return data_loader_list
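# A hypothetical call: the "{0}" placeholder in each path template is filled with the
# split index (1..10) by the .format(idx) calls inside the function; all paths and
# values below are made up for illustration:
#
#   loaders = prepare_data_loader_train_10_splits(
#       "splits/train_data_{0}.pt", "splits/train_label_{0}.pt",
#       "splits/val_data_{0}.pt", "splits/val_label_{0}.pt",
#       texture_batch_size=32, num_workers=4, device="cuda")
#   for split in loaders:
#       train_loader, val_loader = split["train"], split["val"]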
def starcraft_sp_test():

    # Create DataLoader instance to load and format data
    dataLoader = DataLoader()

    logging.info("Program started")

    logging.info("Loading starcraft data")
    # Read skillcraft dataset, the class index is the second column
    dataLoader.read(filename="data/SkillCraft1_Dataset.csv",
                    classIndex=1,
                    numOfFeatures=15)

    # Normalize data values from 0 - 1
    #dataLoader.normalize()

    # Create new labels to fit into binary classification
    dataLoader.scaleToBinary(5)

    # Spectral Clustering

    # Binary
    clustering(dataLoader.x_train,
               dataLoader.y_train,
               writer_starcraft,
               'starcraft-binary',
               multiple=True,
               binary=True)

    # Multiclass
    #clustering(dataLoader.x_train, dataLoader.multi_y_train, writer_starcraft, 'starcraft-multiclass', multiple=True, binary=False)

    # Write all the results
    writer_starcraft.save()
Example #13
def prepare_data_loader_test_10_splits(texture_test_data_set_path,
                                       texture_test_label_set_path, device):
    data_loader_list = []
    for i in range(10):
        idx = i + 1
        print("Split: {0}".format(idx))
        # bind the formatted paths to new names so the "{0}" placeholder in the
        # template arguments survives later iterations
        test_data_path = texture_test_data_set_path.format(idx)
        test_label_path = texture_test_label_set_path.format(idx)

        dL = DataLoader()
        texture_test_set, test_set_size = dL.get_tensor_set(
            test_data_path, test_label_path, device)
        print("Test set size: {0}".format(test_set_size))

        test_data_loader = torch.utils.data.DataLoader(texture_test_set,
                                                       num_workers=1,
                                                       shuffle=False,
                                                       pin_memory=True)

        data_loader_list.append(test_data_loader)

    return data_loader_list
def test_real_dataset(create_obj_func,
                      src_name=None,
                      trg_name=None,
                      show=False,
                      block_figure_on_end=False):
    print('Running {} ...'.format(os.path.basename(__file__)))

    if src_name is None:
        if len(sys.argv) > 2:
            src_name = sys.argv[2]
        else:
            raise Exception('Source dataset not specified')
    if trg_name is None:
        if len(sys.argv) > 3:
            trg_name = sys.argv[3]
        else:
            raise Exception('Target dataset not specified')

    np.random.seed(random_seed())
    tf.set_random_seed(random_seed())
    tf.reset_default_graph()

    print("========== Test on real data ==========")
    users_params = dict()
    users_params = parse_arguments(users_params)
    data_format = 'mat'

    if 'format' in users_params:
        data_format, users_params = extract_param('format', data_format,
                                                  users_params)

    data_loader = DataLoader(src_domain=src_name,
                             trg_domain=trg_name,
                             data_path=data_dir(),
                             data_format=data_format,
                             cast_data=users_params['cast_data'])

    assert users_params['batch_size'] % data_loader.num_src_domain == 0
    print('users_params:', users_params)

    learner = create_obj_func(users_params)
    learner.dim_src = data_loader.data_shape
    learner.dim_trg = data_loader.data_shape

    learner.x_trg_test = data_loader.trg_test[0][0]
    learner.y_trg_test = data_loader.trg_test[0][1]
    learner._init(data_loader)
    learner._build_model()
    learner._fit_loop()
    def test(self,
             data_dir='test_b',
             model_dir=None,
             output_dir=None,
             threshold=0.5):
        print("testing starts.")

        loader = DataLoader(data_dir=data_dir,
                            mode='test',
                            height=self.height,
                            width=self.width,
                            label_value=self.label_values)
        testset = tf.data.Dataset.from_generator(
            generator=loader.generator,
            output_types=(tf.string, tf.int32, tf.float32),
            output_shapes=(tf.TensorShape([]), tf.TensorShape([2]),
                           tf.TensorShape([self.height, self.width, 3])))

        testset = testset.batch(1)
        testset = testset.prefetch(10)
        test_init = self.it.make_initializer(testset)

        saver = tf.train.Saver()
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        with tf.Session(config=config) as sess:
            saver.restore(sess, model_dir)
            sess.run(test_init)
            queue = multiprocessing.Queue(maxsize=30)
            writer_process = multiprocessing.Process(
                target=writer,
                args=[output_dir, self.label_values, queue, 'stop'])
            writer_process.start()
            print('writing predictions...')
            try:
                while True:
                    img, path, size, output_image = sess.run(
                        [self.img, self.path, self.size, self.logits])
                    queue.put(('continue', path, size, img, output_image))
            except tf.errors.OutOfRangeError:
                queue.put(('stop', None, None, None, None))

        print('testing finished.')
Example #16
    def test(self, data_dir='test', model_dir=None, output_dir='result', batch_size=10):
        print("testing starts.")

        if not os.path.exists(output_dir):
            os.mkdir(output_dir)
        # load test data
        loader = DataLoader(data_dir=data_dir, num_classes=self.num_classes,
                            mode='test', height=self.height, width=self.width)

        testset = tf.data.Dataset.from_generator(generator=loader.generator,
                                                 output_types=(tf.string,
                                                               tf.float32),
                                                 output_shapes=(tf.TensorShape([]),
                                                                tf.TensorShape([self.height, self.width, 3])))
        testset = testset.shuffle(100)
        testset = testset.batch(batch_size)
        testset = testset.prefetch(20)
        test_init = self.it.make_initializer(testset)

        saver = tf.train.Saver()
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        with tf.Session(config=config) as sess:
            saver.restore(sess, model_dir)
            sess.run(test_init)
            queue = multiprocessing.Queue(maxsize=30)
            writer_process = multiprocessing.Process(target=writer, args=[output_dir, batch_size, queue, 'stop'])
            writer_process.start()
            print('writing predictions...')
            try:
                while True:
                    img_name, pre_label = sess.run([self.img_name, self.prediction_value])
                    queue.put(('continue', img_name, pre_label))
            except tf.errors.OutOfRangeError:
                queue.put(('stop', None, None))

        print('testing finished.')
Example #17
    def pick(self, mrc_filename):
        """Do the picking job through tensorflow.

        This function reads the micrograph data based on the given filename,
        then does the automatic picking with the pre-trained CNN model.

        Args:
            mrc_filename: string, the filename of the target micrograph.

        Returns:
            list_coordinate: a list whose length equals the number of picked particles.
                Each element is itself a list of length 4: the y-coordinate, the
                x-coordinate, the predicted score, and the micrograph filename.
        """
        # read the micrograph image data
        print(mrc_filename)
        header, body = DataLoader.readMrcFile(mrc_filename)
        num_col = header[0]
        num_row = header[1]
        body_2d = np.array(body, dtype = np.float32).reshape(num_row, num_col)
        
        # do process to micrograph
        body_2d, bin_size = DataLoader.preprocess_micrograph(body_2d)
        
        # Edge detection to get the ice noise mask
        # a binary matrix, 1 stands for the ice noise site
        # mask = edge_detection_ice(body_2d)

        step_size = 4
        candidate_patches = None
        candidate_patches_exist = False
        num_total_patch = 0
        patch_size = int(self.particle_size/bin_size)
        # the size to do peak detection 
        local_window_size = int(0.6*patch_size/step_size)

        #print("image_col:", body_2d.shape[0])
        #print("particle_size:", patch_size)
        #print("step_size:", step_size)
        map_col = int((body_2d.shape[0]-patch_size)/step_size)
        map_row = int((body_2d.shape[1]-patch_size)/step_size)
         
        #prediction = np.zeros((map_col, map_row), dtype = float)
        time1 = time.time()
        particle_candidate_all = []
        map_index_col = 0
        for col in range(0, body_2d.shape[0]-patch_size+1, step_size):
            for row in range(0, body_2d.shape[1]-patch_size+1, step_size):
                # extract the particle patch
                patch = np.copy(body_2d[col:(col+patch_size), row:(row+patch_size)])
                # do preprocess to the particle
                patch = DataLoader.preprocess_particle(patch, self.model_input_size)
                particle_candidate_all.append(patch)
                num_total_patch = num_total_patch + 1
            map_index_col = map_index_col + 1

        map_index_row = map_index_col-map_col+map_row
        #print("map_col:",map_col)
        #print("map_row:",map_row)
        #print(len(particle_candidate_all))
        #print("map_index_col:",map_index_col)
        #print("map_index_row:",map_index_row)
        #print("col*row:",map_index_col*map_index_row)
        # reshape it to fit the input format of the model
        particle_candidate_all = np.array(particle_candidate_all).reshape(num_total_patch, self.model_input_size[1], self.model_input_size[2], 1)
        # predict
        predictions = self.deepModel.evaluation(particle_candidate_all, self.sess)
        predictions = predictions[:, 1:2]
        predictions = predictions.reshape(map_index_col, map_index_row)

        time_cost = time.time() - time1
        print("time cost: %d s"%time_cost)
        #display.save_image(prediction, "prediction.png")
        # the prediction value is the probability of being a positive sample, between 0 and 1
        # the following code not tested
        # do a connected component analysis
        # prediction = detete_large_component(prediction)

        # do a local peak detection to get the best coordinate
        # list_coordinate is a 2D list of shape (number_particle, 3)
        # element in list_coordinate is [x_coordinate, y_coordinate, prediction_value]
        list_coordinate = self.peak_detection(predictions, local_window_size)
        # add the mrc filename to the list of each coordinate
        for i in range(len(list_coordinate)):
            list_coordinate[i].append(mrc_filename)
            # transform the coordinates to the original size 
            list_coordinate[i][0] = (list_coordinate[i][0]*step_size+patch_size/2)*bin_size
            list_coordinate[i][1] = (list_coordinate[i][1]*step_size+patch_size/2)*bin_size
            
        return list_coordinate
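    # A hedged sketch of consuming the return value (picker stands for an instance of
    # the class this method belongs to; the threshold and output path are illustrative).
    # Each entry is a 4-element list of two coordinates, a prediction score and the
    # micrograph filename; see the docstring above for the exact axis order:
    #
    #   picks = picker.pick("some_micrograph.mrc")
    #   keep = [p for p in picks if p[2] > 0.5]
    #   with open("picked_coordinates.txt", "w") as out:
    #       for c1, c2, score, name in keep:
    #           out.write("%d\t%d\t%.3f\t%s\n" % (c1, c2, score, name))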
from dataLoader import DataLoader

loader = DataLoader()
loader.loadAll()

fileobj = open("csv/subjectAreaDump.csv", 'w')

for id, paper in loader.papers.iteritems():
    if paper.accepted:
        fileobj.write("%s|%d|%s" %
                      (paper.primarySpecificSubjectArea, id, paper.title))
        for subj in paper.specificSubjectAreas:
            fileobj.write("|" + subj)
        fileobj.write("\n")
fileobj.close()
Example #19
from pathlib import Path

from flask import Flask, render_template, make_response, jsonify, request, send_from_directory

import configurations
from analyzeResults import AnalyzeResults
from dataLoader import DataLoader
from hitCounter import HitCounter
import numpy as np

from vistDataset import VistDataset
import base64
import time

app = Flask(__name__)
data_loader = DataLoader(root_path=configurations.root_data)
hit_counter = HitCounter(root_path=configurations.root_data,
                         story_max_hits=configurations.max_story_submit)
vist_dataset = VistDataset(root_path=configurations.root_data,
                           hit_counter=hit_counter,
                           samples_num=configurations.samples)
analyze_results = AnalyzeResults(data_root=configurations.root_data,
                                 data_loader=data_loader,
                                 vist_dataset=vist_dataset)


@app.route('/api/images/<image_id>', methods=['GET'])
def serve_image(image_id):
    print("Requested image file: {}".format(image_id))
    image_path = data_loader._find_file(image_id)
    if image_path is None:
Example #20
import numpy as np
import tensorflow as tf

from myModel import MyModel
from dataLoader import DataLoader

if __name__ == '__main__':
    ENABLE_SAVE_MODEL = True
    MODEL_NAME = 'mini'
    # 4 mnist
    # H, W, C = 28, 28, 1
    # (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

    # 4 ukiyoe
    DATA_PATH = './data/'
    H, W, C = 224, 224, 3
    RH, RW = 224, 224
    x_train, y_train, x_test, y_test = DataLoader(0.2).load(DATA_PATH)
    if C == 1:
        x_train = np.sum(x_train, axis=-1) / 3
        x_test = np.sum(x_test, axis=-1) / 3

    x_train = x_train.astype('float32') / 255
    x_test = x_test.astype('float32') / 255
    x_train = x_train.reshape(x_train.shape[0], H, W, C)
    x_test = x_test.reshape(x_test.shape[0], H, W, C)

    model = MyModel()

    loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
    optimizer = tf.keras.optimizers.Adam()

    train_loss = tf.keras.metrics.Mean(name='train_loss')
from numpy import *
from decimal import *
from sys import *
from learner import Learner
from utilities import *
from dataLoader import DataLoader
import time

list1 = [[1, 2], [3, 4], [5, 6]]

list2 = [2, 3]

#for x in xrange(0, len(list1), 2):
#	print list1[x]

dataloader = DataLoader()

train_data = dataloader.load_sequences_from_file("../data/" + "1" + ".pautomac" + ".test")

#comps = collect_unique_symbol_compositions(train_data, 2)

MathiasLearner.train(train_data)

#print comps.index([1, 1])
Example #22
def train():
    parser = OptionParser()
    parser.add_option("--train_inputDir", dest="train_inputDir", help="Input directory", metavar="DIRECTORY")
    parser.add_option("--train_inputFile", dest="train_inputFile", help="Input file", metavar="FILE")
    parser.add_option("--train_type", dest="train_type", help="Training type, 1|2|3|4.", metavar="VALUE", default=2)
    parser.add_option("--particle_number", dest="train_number", help="Number of positive samples to train.", metavar="VALUE", default=-1)
    parser.add_option("--mrc_number", dest="mrc_number", help="Number of mrc files to be trained.", metavar="VALUE", default=-1)
    parser.add_option("--coordinate_symbol", dest="coordinate_symbol", help="The symbol of the coordinate file, like '_manualPick'", metavar="STRING")
    parser.add_option("--particle_size", dest="particle_size", help="the size of the particle.", metavar="VALUE", default=-1)
    parser.add_option("--validation_ratio", dest="validation_ratio", help="the ratio.", metavar="VALUE", default=0.1)
    parser.add_option("--model_retrain", action="store_true", dest="model_retrain", help="train the model using the pre-trained model as parameters initialization .", default=False)
    parser.add_option("--model_load_file", dest="model_load_file", help="pre-trained model", metavar="FILE")
    parser.add_option("--model_save_dir", dest="model_save_dir", help="save the model to this directory", metavar="DIRECTORY", default="../trained_model")
    parser.add_option("--model_save_file", dest="model_save_file", help="save the model to file", metavar="FILE")
    (opt, args) = parser.parse_args()
 
    # set the tensorflow seed
    tf.set_random_seed(1234)
    # set the numpy seed
    np.random.seed(1234)

    # define the input size of the model
    model_input_size = [100, 64, 64, 1]
    num_class = 2                   # the number of the class
    batch_size = model_input_size[0]

    # define input parameters
    train_type = int(opt.train_type)
    train_inputDir = opt.train_inputDir
    train_inputFile = opt.train_inputFile
    train_number = float(opt.train_number) 
    mrc_number = int(opt.mrc_number)
    coordinate_symbol = opt.coordinate_symbol
    debug_dir = '../train_output'   # output dir
    particle_size = int(opt.particle_size)
    validation_ratio = float(opt.validation_ratio)   

    # define the save model
    model_retrain = opt.model_retrain
    model_load_file = opt.model_load_file
    model_save_dir = opt.model_save_dir
    model_save_file = os.path.join(model_save_dir, opt.model_save_file)

    if not os.access(model_save_dir, os.F_OK):
        os.mkdir(model_save_dir)
    if not os.access(debug_dir, os.F_OK):
        os.mkdir(debug_dir)

    # define the learning rate decay parameters
    # more information about this, refer to function tf.train.exponential_decay()
    learning_rate = 0.01
    learning_rate_decay_factor = 0.95
    # the value will be changed based on train_size and batch_size
    learning_rate_decay_steps = 400
    learning_rate_staircase = True
    # momentum
    momentum = 0.9
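    # For reference (not part of the original script): tf.train.exponential_decay
    # computes decayed_lr = learning_rate * decay_factor ** (global_step / decay_steps);
    # with staircase=True the exponent is floored to an integer, so with the values above
    # the rate drops by 5% in discrete steps rather than continuously.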

    # load training dataset
    dataLoader = DataLoader()
    if train_type == 1:
        # load train data from mrc file dir
        train_number = int(train_number)
        train_data, train_label, eval_data, eval_label = dataLoader.load_trainData_From_mrcFileDir(train_inputDir, particle_size, model_input_size, validation_ratio, coordinate_symbol, mrc_number, train_number)
    elif train_type == 2:
        # load train data from numpy data struct
        train_number = int(train_number)
        train_data, train_label, eval_data, eval_label = dataLoader.load_trainData_From_ExtractedDataFile(train_inputDir, train_inputFile, model_input_size, validation_ratio, train_number)
    elif train_type == 3:
        # load train data from relion .star file 
        train_number = int(train_number)
        train_data, train_label, eval_data, eval_label = dataLoader.load_trainData_From_RelionStarFile(train_inputFile, particle_size, model_input_size, validation_ratio, train_number)
    elif train_type == 4:
        # load train data from prepicked results
        train_data, train_label, eval_data, eval_label = dataLoader.load_trainData_From_PrePickedResults(train_inputDir, train_inputFile, particle_size, model_input_size, validation_ratio, train_number)
    else:
        print("ERROR: invalid value of train_type:", train_type)    

    display.show_particle(train_data, os.path.join(debug_dir, 'positive.png'))
    # test whether train_data exist
    try: 
        train_data
    except NameError:
        print("ERROR: in function load.loadInputTrainData.")
        return None
    else:
        print("Load training data successfully!")
    # shuffle the training data
    train_data, train_label = shuffle_in_unison_inplace(train_data, train_label)
    eval_data, eval_label = shuffle_in_unison_inplace(eval_data, eval_label)

    train_size = train_data.shape[0]
    eval_size = eval_data.shape[0]    
    # initialize decay_steps based on train_size and batch_size:
    # decay the learning rate every 10 epochs (10 * steps per epoch)
    learning_rate_decay_steps = 10*(train_size // batch_size)
    # initialize the parameters of deepModel
    deepModel = DeepModel(particle_size, model_input_size, num_class)
    deepModel.init_learning_rate(learning_rate = learning_rate, learning_rate_decay_factor = learning_rate_decay_factor,
                                  decay_steps = learning_rate_decay_steps, staircase = learning_rate_staircase)
    deepModel.init_momentum(momentum = momentum)
    # initialize the model
    # define the computation procedure of optimizer, loss, lr, prediction, eval_prediction 
    deepModel.init_model_graph_train()
    saver = tf.train.Saver(tf.all_variables())
    
    start_time = time.time()
    init = tf.initialize_all_variables()
    with tf.Session(config=tf.ConfigProto(log_device_placement=False)) as sess:
        # initialize all the parameters
        sess.run(init)
        max_epochs = 200   # the max number of epoch to train the model
        best_eval_error_rate = 100
        toleration_patience = 10 
        toleration_patience_flag  = 0
        eval_frequency = train_size // batch_size   # the frequency to evaluate the evaluation dataset
        for step in xrange(int(max_epochs * train_size) // batch_size):
            # get the batch training data
            offset =  (step * batch_size) % (train_size - batch_size)
            batch_data = train_data[offset:(offset+batch_size), ...]
            batch_label = train_label[offset:(offset+batch_size)]
            # online augmentation
            #batch_data = DataLoader.preprocess_particle_online(batch_data)
            loss_value, lr, train_prediction = deepModel.train_batch(batch_data, batch_label,sess)

            # do the computation
            if step % eval_frequency == 0:
                stop_time = time.time() - start_time
                start_time = time.time()
                eval_prediction = deepModel.evaluation(eval_data, sess)
                eval_error_rate = error_rate(eval_prediction, eval_label)
                print('epoch: %.2f , %.2f ms' % (step * batch_size /train_size, 1000 * stop_time / eval_frequency)) 
                print('train loss: %.6f,\t learning rate: %.6f' % (loss_value, lr)) 
                print('train error: %.6f%%,\t valid error: %.6f%%' % (error_rate(train_prediction, batch_label), eval_error_rate))         
                if eval_error_rate < best_eval_error_rate:
                    best_eval_error_rate = eval_error_rate
                    toleration_patience = 10
                else:
                    toleration_patience = toleration_patience - 1
            if toleration_patience == 0:
                saver.save(sess, model_save_file)
                break
def test_my_own_png(self):
    # load the mnist test data CSV file into a list
    test_data_list = DataLoader.load_my_data()
    self.__test_png(self.n, test_data_list)
Example #24

args = get_args()
setup_seed(args.seed)
device = args.device
checkpoint_dir = args.checkpoint_dir
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

data = np.load(args.dataset_path)

model = net().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

dataLoader = DataLoader(data['X'],
                        data['Y'],
                        train_val_split=[0.7, 0.15, 0.15],
                        batch_size=args.batch_size,
                        device=device)
loader_train, loader_val, loader_test = dataLoader.get_loader()
trainer = Trainer(model, optimizer)
loss_fn = torch.nn.functional.cross_entropy
trainer.train_with_val(loss_fn,
                       loader_train=loader_train,
                       loader_val=loader_val,
                       epochs=args.epochs,
                       save_path=checkpoint_dir + 'model.pth',
                       save_best_only=True,
                       monitor_on='acc')
trainer.test(loader_test, loss_fn, info='Test ')
Example #25
def learn_test(expr):
    loader = DataLoader()
    dataset = loader.loadData(
        dataset=expr)  # dataset options: electricity, traffic, BLE
    pastObserve = pastObserves[expr]
    o_columns = dataset.columns
    predCol = o_columns[-1]
    lenAll = len(dataset)
    lenx = int(lenAll * .75)

    test_orig = []
    mean_errors = []
    error_stds = []
    all_errors = []

    all_predictions = []

    values = dataset.values
    origData = values

    # normalize
    parameters = dataset.values.shape[1]
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled = scaler.fit_transform(values)
    reframed = series_to_supervised(scaled, pastObserve, 1)

    # drop columns we don't want to predict
    droppings = []
    for i in range(1, pastObserve + 1):
        x = [a for a in range(parameters * (i - 1), parameters * i - 1)]
        droppings.extend(x)
    reframed.drop(reframed.columns[droppings], axis=1, inplace=True)
    valuesTrans = reframed.values
    test = valuesTrans

    # split into input and outputs
    train_X_all, train_y_all = valuesTrans[:, :-1], valuesTrans[:, -1]
    test_X, test_y = test[:, :-1], test[:, -1]

    trainingModels = []
    for i in range(modelsNo):
        deepModel = create_model(parameters, pastObserve)
        trainingModels.append(deepModel)

    dy = 0
    sparsity = 3
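    # each of the modelsNo models below is fit on a different 1/sparsity slice of the
    # training rows (dy selects the slice); the commented-out ixs.append(...) lines are
    # the original hooks for widening that slice, as their trailing comments note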
    for model in trainingModels:
        # fit network
        partsLen = int(len(train_X_all) / sparsity) * sparsity
        a = np.arange(partsLen)
        a = a.reshape(sparsity, int(partsLen / sparsity))
        ixs = []
        # just consider part of dataset not all of that
        for t in range(sparsity):
            if (t == dy):
                ixs.append(a[t])
        # ixs.append(a[t+1]) # for considering 40% sparsity
        # ixs.append(a[t+2]) # for considering 60% sparsity
        ixs = np.array(ixs)
        train_ixs = ixs.flatten()
        train_X, train_y = train_X_all[train_ixs], train_y_all[train_ixs]
        model.fit(train_X, train_y, epochs=20, batch_size=20, verbose=2)
        dy += 1
        # calculate predictions
        predictions = model.predict(test_X)
        predictions = predictions.reshape((len(predictions), 1))

        pads = np.zeros(len(test_y) * (parameters - 1))
        pads = pads.reshape(len(test_y), parameters - 1)

        inv_yhat = concatenate((pads, predictions), axis=1)
        inv_yhat = scaler.inverse_transform(inv_yhat)
        inv_yhat = inv_yhat[:, -1]
        inv_yhat = np.around(inv_yhat, decimals=2)

        # invert scaling for actual
        test_y = test_y.reshape((len(test_y), 1))
        inv_test = concatenate((test_X[:, pastObserve:], test_y), axis=1)
        test_orig = scaler.inverse_transform(inv_test)

        origY = test_orig[:, -1]
        meanErr, std, errors = report_errors(origY, inv_yhat, errorType[expr])

        mean_errors.append(meanErr)
        error_stds.append(std)

        all_errors.append(errors)
        all_predictions.append(inv_yhat)

        print(min(origY), max(origY))
        print(min(inv_yhat), max(inv_yhat))
        print('Test Mean Error: %.3f ' % meanErr)

    p_cols = []
    df = DataFrame(test_orig, columns=o_columns)
    for k in range(len(all_predictions)):
        colName = 'predict_' + str(k + 1)
        p_cols.append(colName)
        df[colName] = all_predictions[k]
    for k in range(len(all_predictions)):
        errName = 'error_' + str(k + 1)
        df[errName] = all_errors[k]

    print(errorType[expr])
    print(mean_errors)

    if not os.path.exists(models_output_folder):
        os.makedirs(models_output_folder)

    outDetails_filename = models_output_folder + 'predictions_details_%s.csv' % expr
    out_filename = models_output_folder + 'predictions_output_%s.csv' % expr

    df.to_csv(outDetails_filename, index=False)

    models_prediction_cols = p_cols
    models_prediction_cols.append(predCol)
    df_modelOutput = df[models_prediction_cols]
    df_modelOutput.to_csv(out_filename, index=False)
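# A hypothetical call, using one of the dataset names listed in the loadData comment
# above (pastObserves, errorType, modelsNo and models_output_folder are module-level
# globals assumed to be defined elsewhere in this project):
#
#   learn_test('electricity')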
def starcraft_svm_test():

    # Create DataLoader instance to load and format data
    dataLoader = DataLoader()

    logging.info("Program started")

    logging.info("Loading starcraft data")
    # Read skillcraft dataset, the class index is the second column
    dataLoader.read(filename="data/SkillCraft1_Dataset.csv",
                    classIndex=1,
                    numOfFeatures=15)
    multi_label_count = dataLoader.labelCount(8)

    # Creates plots for a few of the data features
    # dataLoader.visualize()

    # Normalize data values from 0 - 1
    #dataLoader.normalize()

    # Create new labels to fit into binary classification
    dataLoader.scaleToBinary(5)
    label_count = dataLoader.binaryLabelCount(5)
    logging.info("Number of examples per class")
    logging.info("Casual - (1):           " + str(label_count[0]))
    logging.info("Hardcore - (-1):           " + str(label_count[1]))

    label_count = dataLoader.labelCount(8)
    logDataCount(label_count)
    """
    # Create SVM
    svm = SVM()

    # Train and predict for binary svm
    logging.info("Running SVM for binary classification")
    # Train for binary single run with these objects
    logging.info("Single binary SVM")
    svm.train(dataLoader.x_train, dataLoader.y_train, dataLoader.x_test, dataLoader.y_test)

    # Train and test binary svm multiple times for all available binary variables
    logging.info("Multiple runs with different parameters - binary SVM")
    svm.train(dataLoader.x_train, dataLoader.y_train, dataLoader.x_test, dataLoader.y_test, iterate=True)

    # Save binary results to excel sheet
    logging.info("Saving binary SVM results")
    svm.results.to_excel(writer_starcraft, sheet_name='binary-svm')


    # MULTI CLASS SVM
    logging.info("Running SVM for multiclass classification")


    # Train and predict for multi-class data using the linear svm from liblinear implementation
    logging.info("Running SVM for multiclass classification with liblinear implementation")
    svm.train(dataLoader.x_train, dataLoader.multi_y_train, dataLoader.x_test, dataLoader.multi_y_test, binary=False)
    logging.info("Saving multiclass liblinear results")
    svm.results.to_excel(writer_starcraft, sheet_name='multiclass-liblinear')

    # Train for multi-class single run with these objects using the libsvm implementation
    logging.info("Running SVM for multiclass classification with libsvm implementation")
    svm.train(dataLoader.x_train, dataLoader.multi_y_train, dataLoader.x_test, dataLoader.multi_y_test, binary=False, linear=False)
    logging.info("Saving multiclass libsvm results")
    svm.results.to_excel(writer_starcraft, sheet_name='multiclass-libsvm')

    # Train and test multi-class svm multiple times for all available multi-class variables
    logging.info("Running SVM for multiclass classification for all available multi-class variables")
    svm.train(dataLoader.x_train, dataLoader.multi_y_train, dataLoader.x_test, dataLoader.multi_y_test, iterate=True, binary=False)
    logging.info("Saving multiclass multiple-runs results")
    svm.results.to_excel(writer_starcraft, sheet_name='multiclass-multiple-variables')

    # Train and test multi-class svm multiple times with KPCA-LDA
    logging.info("Running SVM for multiclass classification with KPCA-LDA")
    svm.train(dataLoader.x_train, dataLoader.multi_y_train, dataLoader.x_test, dataLoader.multi_y_test, iterate=True, binary=False, decomposition=True)
    logging.info("Saving multiclass multiple-runs results")
    svm.results.to_excel(writer_starcraft, sheet_name='multiclass-kpca-lda')

    # KNN and NC
    nearest(dataLoader.x_train, dataLoader.y_train, dataLoader.x_test, dataLoader.y_test, dataLoader.multi_y_train, dataLoader.multi_y_test, writer_starcraft)
    """

    clustering(dataLoader.x_train, dataLoader.y_train, dataLoader.x_test,
               dataLoader.y_test)

    # Write all the results
    writer_starcraft.save()
Example #27
            update_checkpoint_link([('epoch_%d.pt' % best_epoch, 'best.pt'),
                                    ('epoch_%d.pt' % epoch, 'last.pt')])

        epoch += 1

    cost_time = time.time() - since
    print('Training complete in {:.0f}h{:.0f}m{:.0f}s'.format(
        (cost_time // 60) // 60, (cost_time // 60) % 60, cost_time % 60))

    return model, cost_time, best_acc, best_train_acc


if __name__ == '__main__':

    loader = DataLoader(args.dataset,
                        batch_size=args.batch_size,
                        seed=args.seed)
    dataloaders, dataset_sizes = loader.load_data(args.img_size)

    num_classes = 10
    if args.dataset == 'cifar-10':
        num_classes = 10
    if args.dataset == 'cifar-100':
        num_classes = 100
    if args.dataset == 'VOCpart':
        num_classes = len(dataloaders['train'].dataset.classes)

    assert args.img_size == 128, 'only supports --img_size 128'
    model = resnet_std(depth=args.depth,
                       num_classes=num_classes,
                       ifmask=args.ifmask,
Example #28
    unique_name = 'stack_2406_2x_SumCorr_movie_DW'
    #unique_name = 'stack_3025_2x_SumCorr_movie_DW'
    coordinates = []
    class_number = []
    starfile = os.path.join(basepath, unique_name + new + '.star')
    with open(starfile) as fin:
        idx = 0
        for l in fin:
            idx += 1
            if idx <= 5 or l.strip() == '':
                continue
            t = map(float, l.strip().split())
            coordinates.append([int(t[0]), int(t[1])])
            class_number.append(int(t[2]))
    plot = 'test_plot_%d' % peek_cls + new + '.png'
    filename = os.path.join(mrcpath, unique_name + '.mrc')
    header, body = DataLoader.readMrcFile(filename)
    n_col = header[0]
    n_row = header[1]
    print n_col, n_row
    body_2d = np.array(body, dtype=np.float32).reshape(n_row, n_col, 1)
    body_2d, bin_size = DataLoader.preprocess_micrograph(body_2d)
    coordinates = np.array(coordinates)
    coordinates = coordinates / bin_size
    plot_circle_in_micrograph(body_2d,
                              coordinates,
                              class_number,
                              180 / bin_size,
                              plot,
                              color='white')
Example #29
    def analysis_pick_results(pick_results_file, reference_coordinate_dir, reference_coordinate_symbol, particle_size, minimum_distance_rate):
        """Load the picking results from a file of binary format and compare it with the reference coordinate.

        This function analysis the picking results with reference coordinate and calculate the recall, precision and the deviation from the center.

        Args:
            pick_results_file: string, the file name of the pre-picked results.
            reference_mrc_dir: string, the directory of the mrc file dir.
            reference_coordinate_symbol: the symbol of the coordinate, like '_manualpick'
            particle_size: int, the size of particle
            minimum_distance_rate: float, the default is 0.2, a picked coordinate is considered to be a true positive only when the distance between the picked coordinate and the reference coordinate is less than minimum_distance_rate mutiplicate particle_size.
        """
        with open(pick_results_file, 'rb') as f:
            coordinate = pickle.load(f)
            """
            coordinate: a list, the length of it stands for the number of picked micrograph file.
                        Each element is a list too, which contains all coordinates from the same micrograph. 
                        The length of the list stands for the number of the particles.
                        And each element in the list is a small list of length of 4.
                        The first element in the small list is the coordinate x-aixs. 
                        The second element in the small list is the coordinate y-aixs. 
                        The third element in the small list is the prediction score. 
                        The fourth element in the small list is the micrograh name. 
            """
        tp = 0
        total_pick = 0
        total_reference = 0
        coordinate_total = []
        for i in range(len(coordinate)):
            mrc_filename = os.path.basename(coordinate[i][0][3])
            #print(mrc_filename)
            reference_coordinate_file = mrc_filename.replace('.mrc', reference_coordinate_symbol+'.star')
            reference_coordinate_file = os.path.join(reference_coordinate_dir, reference_coordinate_file)
            #print(reference_coordinate_file)
            if os.path.isfile(reference_coordinate_file):
                reference_coordinate = DataLoader.read_coordinate_from_star(reference_coordinate_file)
                """
                reference_coordinate: a list, the length of it stands for the number of picked particles.
                            And each element in the list is a small list of length of 2.
                            The first element in the small list is the coordinate x-aixs. 
                            The second element in the small list is the coordinate y-aixs. 
                """    
                tp_sigle, average_distance = AutoPicker.calculate_tp(coordinate[i], reference_coordinate, particle_size*minimum_distance_rate)
                #print("tp:",tp_sigle)
                #print("average_distance:",average_distance)
                # calculate the number of true positive, when the threshold is set to 0.5
                tp_sigle = 0
                total_reference = total_reference + len(reference_coordinate)
                for j in range(len(coordinate[i])):
                    coordinate_total.append(coordinate[i][j])
                    if coordinate[i][j][2]>0.5:
                        total_pick = total_pick + 1
                        if coordinate[i][j][4] == 1:
                            tp = tp + 1
                            tp_sigle = tp_sigle + 1
                print(tp_sigle/len(reference_coordinate))
            else:
                print("Can not find the reference coordinate:"+reference_coordinate_file)
        precision = tp/total_pick
        recall = tp/total_reference
        print("(threshold 0.5)precision:%f recall:%f"%(precision, recall))
        # sort the coordinate based on prediction score in a descending order.
        coordinate_total = sorted(coordinate_total, key = itemgetter(2), reverse = True) 
        total_tp = []
        total_recall = []
        total_precision = []
        total_probability = []
        total_average_distance = []
        total_distance = 0
        tp_tem = 0
        for i in range(len(coordinate_total)):
            if coordinate_total[i][4] == 1:
                tp_tem = tp_tem + 1
                total_distance = total_distance + coordinate_total[i][5]
            precision = tp_tem/(i+1)
            recall = tp_tem/total_reference
            total_tp.append(tp_tem)
            total_recall.append(recall)
            total_precision.append(precision)
            total_probability.append(coordinate_total[i][2])
            if tp_tem==0:
                average_distance = 0
            else:
                average_distance = total_distance/tp_tem
            total_average_distance.append(average_distance)
        # write the list results in file
        directory_pick = os.path.dirname(pick_results_file)
        total_results_file = os.path.join(directory_pick, 'results.txt')
        f = open(total_results_file, 'w')
        # write total_tp
        f.write(','.join(map(str, total_tp))+'\n')
        f.write(','.join(map(str, total_recall))+'\n')
        f.write(','.join(map(str, total_precision))+'\n')
        f.write(','.join(map(str, total_probability))+'\n')
        f.write(','.join(map(str, total_average_distance))+'\n')
        f.write('#total autopick number:%d\n'%(len(coordinate_total))) 
        f.write('#total manual pick number:%d\n'%(total_reference))
        f.write('#the first row is number of true positive\n')
        f.write('#the second row is recall\n')
        f.write('#the third row is precision\n')
        f.write('#the fourth row is probability\n')
        f.write('#the fifth row is average distance\n')
        
        # show the recall and precision
        times_of_manual = len(coordinate_total)//total_reference + 1
        for i in range(times_of_manual):
            print('sorted autopick results: taking the top (total_manualpick * %d) picks' % (i+1))
            f.write('#sorted autopick results: taking the top (total_manualpick * %d) picks\n' % (i+1))
            if i==times_of_manual-1:
                print('precision:%f \trecall:%f'%(total_precision[-1], total_recall[-1]))
                f.write('precision:%f \trecall:%f \n'%(total_precision[-1], total_recall[-1]))
            else:
                print('precision:%f \trecall:%f'%(total_precision[(i+1)*total_reference-1], total_recall[(i+1)*total_reference-1]))
                f.write('precision:%f \trecall:%f \n'%(total_precision[(i+1)*total_reference-1], total_recall[(i+1)*total_reference-1]))
        f.close()
    def train(self):
        training_data = DataLoader.load_nmist_train_data()
        self.__train(self.n, training_data)
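The results.txt layout written above (five comma-separated rows followed by '#' comment lines) can be parsed back for plotting. A minimal sketch assuming exactly that layout; load_pick_results is an illustrative helper, not part of the original script:

def load_pick_results(results_file):
    """Parse results.txt into a dict of float lists, skipping '#' comment lines."""
    keys = ['tp', 'recall', 'precision', 'probability', 'average_distance']
    rows = []
    with open(results_file) as f:
        for line in f:
            if line.startswith('#') or not line.strip():
                continue
            rows.append([float(v) for v in line.strip().split(',')])
    return dict(zip(keys, rows))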
Example #31
0
def Run_SRNN_NormalCase(args, no_dataset):

    data_path, graph_path = Data_path(no_dataset)
    log_path = Log_path(no_dataset)

    # Construct the DataLoader object that loads data
    dataloader = DataLoader(args)
    dataloader.load_data(data_path)

    # Construct the ST-graph object that reads graph
    stgraph = ST_GRAPH(args)
    stgraph.readGraph(dataloader.num_sensor, graph_path)

    # Initialize net
    net = SRNN(args)
    net.setStgraph(stgraph)

    print('- Number of trainable parameters:',
          sum(p.numel() for p in net.parameters() if p.requires_grad))

    # optimizer = torch.optim.Adam(net.parameters(), lr=args.learning_rate)
    # optimizer = torch.optim.RMSprop(net.parameters(), lr=args.learning_rate, momentum=0.0001, centered=True)
    optimizer = torch.optim.Adagrad(net.parameters())

    best_eval_loss = 10000
    best_epoch = 0

    print('')
    print('---- Train and Evaluation ----')

    eval_loss_res = np.zeros((args.num_epochs + 1, 2))
    for e in range(args.num_epochs):
        epoch = e + 1

        ####  Training ####
        print('-- Training, epoch {}/{}'.format(epoch, args.num_epochs))
        loss_epoch = 0

        # For each batch
        for b in range(dataloader.num_batches_train):
            batch = b + 1
            start = time.time()

            # Get batch data
            x = dataloader.next_batch_train()

            # Loss for this batch
            loss_batch = 0

            # For each sequence in the batch
            for sequence in range(dataloader.batch_size):

                # put node and edge features
                stgraph.putSequenceData(x[sequence])

                # get data to feed
                data_nodes, data_temporalEdges, data_spatialEdges = stgraph.getSequenceData()

                # put a sequence to net
                loss_output, data_nodes, outputs = forward(
                    net, optimizer, args, stgraph, data_nodes,
                    data_temporalEdges, data_spatialEdges)
                loss_output.backward()
                loss_batch += loss_RMSE(data_nodes[-1], outputs[-1],
                                        dataloader.scaler)

                # Clip gradients
                torch.nn.utils.clip_grad_norm_(net.parameters(),
                                               args.grad_clip)

                # Update parameters
                optimizer.step()

            end = time.time()
            loss_batch = loss_batch / dataloader.batch_size
            loss_epoch += loss_batch

            print('Train: {}/{}, train_loss = {:.3f}, time/batch = {:.3f}'.
                  format(e * dataloader.num_batches_train + batch,
                         args.num_epochs * dataloader.num_batches_train,
                         loss_batch, end - start))
        # Compute loss for the entire epoch
        loss_epoch /= dataloader.num_batches_train
        print('(epoch {}), train_loss = {:.3f}'.format(epoch, loss_epoch))

        # Save the model after each epoch
        save_path = Save_path(no_dataset, epoch)
        print('Saving model to ' + save_path)
        torch.save(
            {
                'epoch': epoch,
                'state_dict': net.state_dict(),
                'optimizer_state_dict': optimizer.state_dict()
            }, save_path)

        #### Evaluation ####
        print('-- Evaluation, epoch {}/{}'.format(epoch, args.num_epochs))
        loss_epoch = 0
        for b in range(dataloader.num_batches_eval):
            batch = b + 1
            start = time.time()

            # Get batch data
            x = dataloader.next_batch_eval()

            # Loss for this batch
            loss_batch = 0

            for sequence in range(dataloader.batch_size):

                # put node and edge features
                stgraph.putSequenceData(x[sequence])

                # get data to feed
                data_nodes, data_temporalEdges, data_spatialEdges = stgraph.getSequenceData()

                # put a sequence to net
                _, data_nodes, outputs = forward(net, optimizer, args, stgraph,
                                                 data_nodes,
                                                 data_temporalEdges,
                                                 data_spatialEdges)

                loss_batch += loss_RMSE(data_nodes[-1], outputs[-1],
                                        dataloader.scaler)

            end = time.time()
            loss_batch = loss_batch / dataloader.batch_size
            loss_epoch += loss_batch

            print(
                'Eval: {}/{}, eval_loss = {:.3f}, time/batch = {:.3f}'.format(
                    e * dataloader.num_batches_eval + batch,
                    args.num_epochs * dataloader.num_batches_eval, loss_batch,
                    end - start))
        loss_epoch /= dataloader.num_batches_eval
        eval_loss_res[e] = (epoch, loss_epoch)

        # Update best validation loss until now
        if loss_epoch < best_eval_loss:
            best_eval_loss = loss_epoch
            best_epoch = epoch

        print('(epoch {}), eval_loss = {:.3f}'.format(epoch, loss_epoch))

    # Record the best epoch and best validation loss overall
    print('Best epoch: {}, Best evaluation loss {:.3f}'.format(
        best_epoch, best_eval_loss))
    eval_loss_res[-1] = (best_epoch, best_eval_loss)
    np.savetxt(log_path, eval_loss_res, fmt='%d, %.3f')
    print('- Eval result has been saved in ', log_path)
    print('')
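The checkpoints saved above store 'epoch', 'state_dict' and 'optimizer_state_dict'. A minimal restore sketch, assuming the same SRNN and Save_path helpers used in Run_SRNN_NormalCase; load_srnn_checkpoint is an illustrative name, not part of the original code:

def load_srnn_checkpoint(args, no_dataset, epoch):
    # Rebuild the network and restore the weights written by torch.save above
    net = SRNN(args)
    checkpoint = torch.load(Save_path(no_dataset, epoch))
    net.load_state_dict(checkpoint['state_dict'])
    net.eval()
    return net, checkpoint['epoch']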
Example #32
0
def integrated_benchmark(dataset_path):
    """
    Variables:
    Dataset size: number of columns
    Dataset distribution: column length distribution
    threshold
    query column
    """
    loader = DataLoader("")
    dataset = loader.load_dataset(dataset_path)

    bf_lists, lsh_list = init(dataset)
    print("""
Benchmark 1 
Goal: Measure scalability of different methods
Variable: 
    the size of datasets. size: 400, 600, 800, 1000
Fix:
    threshold = 0.6
    query column = median col
Output:
    Runtime
    precision, recall, f1
""")
    labels = ["bloom filter", "lsh", "lsh ensemble", "lsh + bloom filter"]
    time_for_each_size = np.empty((len(dataset), len(labels)), dtype=float)
    x_axis = np.empty(len(dataset), dtype=int)

    for i, cols in enumerate(dataset):
        candidate_index = len(cols) // 2  # median col
        brute_force_result = brute_force(candidate_index, cols, 0.6)
        print("brute_force finished\n")
        time = benchmark(cols, candidate_index, 0.6, bf_lists[i], lsh_list[i],
                         brute_force_result,
                         "Benchmark-1-cols-size-" + str(len(cols)))
        time_for_each_size[i] = time
        x_axis[i] = len(cols)

    fig, ax = plt.subplots()
    for i in range(len(labels)):
        ax.plot(x_axis, time_for_each_size[:, i], 'o-', label=labels[i])
    ax.legend()
    ax.set_title("Benchmark-1-cols-size")
    ax.set_xticks(x_axis)
    ax.set_xlabel("size")
    ax.set_ylabel("time(s)")
    fig.tight_layout()
    # plt.show()
    fig.savefig("./bench_results/Benchmark-1-cols-size")

    print("""
Benchmark 2
Goal: Measure the effect of threshold
Variable:
   threshold: 0.1 0.3 0.5 0.7 0.9
Fix:
    dataset = median size cols
Output
    Runtime
    precision, recall, f1
""")
    threshold_list = [0.1, 0.3, 0.5, 0.7, 0.9]
    time_for_each_threshold = np.empty((len(threshold_list), len(labels)),
                                       dtype=float)
    x_axis = np.empty(len(threshold_list), dtype=float)

    cols_index = len(dataset) // 2
    cols = dataset[cols_index]
    for i in range(len(threshold_list)):
        threshold = threshold_list[i]
        candidate_index = len(cols) // 2  # median col
        brute_force_result = brute_force(candidate_index, cols, threshold)
        print("brute_force finished\n")
        time = benchmark(
            cols, candidate_index, threshold, bf_lists[cols_index],
            lsh_list[cols_index], brute_force_result,
            "Benchmark-2-threshold-" + str(int(threshold * 100)) + "%")
        time_for_each_threshold[i] = time
        x_axis[i] = threshold

    fig, ax = plt.subplots()
    for i in range(len(labels)):
        ax.plot(x_axis, time_for_each_threshold[:, i], 'o-', label=labels[i])
    ax.legend()
    ax.set_title("Benchmark-2-threshold")
    ax.set_xticks(x_axis)
    ax.set_xlabel("threshold")
    ax.set_ylabel("time(s)")
    fig.tight_layout()
    # plt.show()
    fig.savefig("./bench_results/Benchmark-2-threshold")

    print("""
Benchmark 3
Goal: Measure the effect of query column
Variable:
    query column = small col, median col, large col
Fix:
    dataset size = median size cols
    threshold = 0.6
Output
    Runtime
    precision, recall, f1
""")
    cols_index = len(dataset) // 2
    cols = dataset[cols_index]
    label = ["small-col", "median-col", "large-col"]
    for i, candidate_index in enumerate([0, len(cols) // 2, len(cols) - 1]):
        brute_force_result = brute_force(candidate_index, cols, 0.6)
        benchmark(cols, candidate_index, 0.6, bf_lists[cols_index],
                  lsh_list[cols_index], brute_force_result,
                  "Benchmark-3-candidate-" + label[i])
Example #33
0
def main(pretrain_checkpoint_dir,
         train_summary_writer,
         vocab: Vocab,
         dataloader: DataLoader,
         batch_size: int = 64,
         embedding_dim: int = 256,
         seq_length: int = 3000,
         gen_seq_len: int = 3000,
         gen_rnn_units: int = 1024,
         disc_rnn_units: int = 1024,
         epochs: int = 40000,
         pretrain_epochs: int = 4000,
         learning_rate: float = 1e-4,
         rollout_num: int = 2,
         gen_pretrain: bool = False,
         disc_pretrain: bool = False,
         load_gen_weights: bool = False,
         load_disc_weights: bool = False,
         save_gen_weights: bool = True,
         save_disc_weights: bool = True,
         disc_steps: int = 3,
         shuffle: bool = True):
    gen = Generator(dataloader=dataloader,
                    vocab=vocab,
                    batch_size=batch_size,
                    embedding_dim=embedding_dim,
                    seq_length=seq_length,
                    checkpoint_dir=pretrain_checkpoint_dir,
                    rnn_units=gen_rnn_units,
                    start_token=0,
                    learning_rate=learning_rate)
    if load_gen_weights:
        gen.load_weights()
    if gen_pretrain:
        gen_pre_trainer = GenPretrainer(gen,
                                        dataloader=dataloader,
                                        vocab=vocab,
                                        pretrain_epochs=pretrain_epochs,
                                        tb_writer=train_summary_writer,
                                        learning_rate=learning_rate)
        print('Start pre-training generator...')
        gen_pre_trainer.pretrain(gen_seq_len=gen_seq_len,
                                 save_weights=save_gen_weights)

    disc = Discriminator(vocab_size=vocab.vocab_size,
                         embedding_dim=embedding_dim,
                         rnn_units=disc_rnn_units,
                         batch_size=batch_size,
                         checkpoint_dir=pretrain_checkpoint_dir,
                         learning_rate=learning_rate)
    if load_disc_weights:
        disc.load_weights()
    if disc_pretrain:
        disc_pre_trainer = DiscPretrainer(disc,
                                          gen,
                                          dataloader=dataloader,
                                          vocab=vocab,
                                          pretrain_epochs=pretrain_epochs,
                                          tb_writer=train_summary_writer,
                                          learning_rate=learning_rate)
        print('Start pre-training discriminator...')
        disc_pre_trainer.pretrain(save_disc_weights)
    rollout = Rollout(generator=gen,
                      discriminator=disc,
                      vocab=vocab,
                      batch_size=batch_size,
                      seq_length=seq_length,
                      rollout_num=rollout_num)

    with tqdm(desc='Epoch: ', total=epochs, dynamic_ncols=True) as pbar:
        for epoch in range(epochs):
            fake_samples = gen.generate()
            rewards = rollout.get_reward(samples=fake_samples)
            gen_loss = gen.train_step(fake_samples, rewards)
            real_samples, _ = dataloader.get_batch(shuffle=shuffle,
                                                   seq_length=seq_length,
                                                   batch_size=batch_size,
                                                   training=True)
            disc_loss = 0
            for i in range(disc_steps):
                disc_loss += disc.train_step(fake_samples,
                                             real_samples) / disc_steps

            with train_summary_writer.as_default():
                tf.summary.scalar('gen_train_loss', gen_loss, step=epoch)
                tf.summary.scalar('disc_train_loss', disc_loss, step=epoch)
                tf.summary.scalar('total_train_loss',
                                  disc_loss + gen_loss,
                                  step=epoch)

            pbar.set_postfix(gen_train_loss=tf.reduce_mean(gen_loss),
                             disc_train_loss=tf.reduce_mean(disc_loss),
                             total_train_loss=tf.reduce_mean(gen_loss +
                                                             disc_loss))

            if (epoch + 1) % 5 == 0 or (epoch + 1) == 1:
                print('Saving weights...')
                # save weights
                gen.model.save_weights(gen.checkpoint_prefix)
                disc.model.save_weights(disc.checkpoint_prefix)
                # gen.model.save('gen.h5')
                # disc.model.save('disc.h5')

                # evaluate the discriminator
                fake_samples = gen.generate(gen_seq_len)
                real_samples, _ = dataloader.get_batch(shuffle=shuffle,
                                                       seq_length=gen_seq_len,
                                                       batch_size=batch_size,
                                                       training=False)
                disc_loss = disc.test_step(fake_samples, real_samples)

                # evaluate the generator
                gen_loss = gen.test_step()

                # get bleu_score
                # bleu_score = get_bleu_score(true_seqs=real_samples, genned_seqs=fake_samples)
                genned_sentences = vocab.extract_seqs(fake_samples)
                # print(genned_sentences)
                # print(vocab.idx2char[fake_samples[0]])

                # record test losses
                with train_summary_writer.as_default():
                    tf.summary.scalar('disc_test_loss',
                                      tf.reduce_mean(disc_loss),
                                      step=epoch)
                    tf.summary.scalar('gen_test_loss',
                                      tf.reduce_mean(gen_loss),
                                      step=epoch)
                    # tf.summary.scalar('bleu_score', tf.reduce_mean(bleu_score), step=epoch + gen_pretrain * pretrain_epochs)

            pbar.update()
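A hedged invocation sketch for main(); it assumes Vocab() and DataLoader() can be built with their default constructors and that train_summary_writer is a TensorFlow 2 summary writer, which may differ from the real project setup:

if __name__ == '__main__':
    writer = tf.summary.create_file_writer('./logs/seqgan')
    main(pretrain_checkpoint_dir='./checkpoints',
         train_summary_writer=writer,
         vocab=Vocab(),
         dataloader=DataLoader(),
         gen_pretrain=True,
         disc_pretrain=True)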
Example #34
0
def train():
    parser = OptionParser()
    parser.add_option("--train_good",
                      dest="train_good",
                      help="Input good particles ",
                      metavar="FILE")
    parser.add_option("--train_bad",
                      dest="train_bad",
                      help="Input bad particles",
                      metavar="FILE")
    parser.add_option("--particle_number",
                      type="int",
                      dest="train_number",
                      help="Number of positive samples to train.",
                      metavar="VALUE",
                      default=-1)
    parser.add_option("--bin_size",
                      type="int",
                      dest="bin_size",
                      help="image size reduction",
                      metavar="VALUE",
                      default=3)

    parser.add_option(
        "--coordinate_symbol",
        dest="coordinate_symbol",
        help="The symbol of the coordinate file, like '_manualPick'",
        metavar="STRING")
    parser.add_option("--particle_size",
                      type="int",
                      dest="particle_size",
                      help="the size of the particle.",
                      metavar="VALUE",
                      default=-1)
    parser.add_option("--validation_ratio",
                      type="float",
                      dest="validation_ratio",
                      help="the ratio.",
                      metavar="VALUE",
                      default=0.1)
    parser.add_option(
        "--model_retrain",
        action="store_true",
        dest="model_retrain",
        help="train the model using the pre-trained model as parameter initialization.",
        default=False)
    parser.add_option("--model_load_file",
                      dest="model_load_file",
                      help="pre-trained model",
                      metavar="FILE")
    parser.add_option("--logdir",
                      dest="logdir",
                      help="directory of logfiles",
                      metavar="DIRECTORY",
                      default="Logfile")
    parser.add_option("--model_save_file",
                      dest="model_save_file",
                      help="save the model to file",
                      metavar="FILE")
    (opt, args) = parser.parse_args()

    np.random.seed(1234)

    # define the input size of the model
    model_input_size = [100, 64, 64, 1]
    num_classes = 2  # the number of output classes
    batch_size = model_input_size[0]

    if not os.access(opt.logdir, os.F_OK):
        os.mkdir(opt.logdir)

    # load training dataset
    dataLoader = DataLoader()
    train_data, train_label, eval_data, eval_label = dataLoader.load_trainData_From_RelionStarFile(
        opt.train_good, opt.particle_size, model_input_size,
        opt.validation_ratio, opt.train_number, opt.bin_size)

    # Check that the training data was actually loaded
    if train_data is None or len(train_data) == 0:
        print("ERROR: in function load.loadInputTrainData.")
        return None
    print("Load training data successfully!")
    # shuffle training data
    train_data, train_label = shuffle_in_unison_inplace(
        train_data, train_label)
    eval_data, eval_label = shuffle_in_unison_inplace(eval_data, eval_label)

    train_x = train_data.reshape(train_data.shape[0], 64, 64, 1)
    test_x = eval_data.reshape(eval_data.shape[0], 64, 64, 1)
    print("shape of training data: ", train_x.shape, test_x.shape)
    train_y = to_categorical(train_label, 2)
    test_y = to_categorical(eval_label, 2)
    print(train_y.shape, test_y.shape)
    datagen = ImageDataGenerator(featurewise_center=True,
                                 featurewise_std_normalization=True,
                                 rotation_range=20,
                                 width_shift_range=0.0,
                                 height_shift_range=0.0,
                                 horizontal_flip=True,
                                 vertical_flip=True)
    datagen.fit(train_x)

    model = Sequential()
    model.add(
        Conv2D(32,
               kernel_size=(8, 8),
               strides=(1, 1),
               activation='relu',
               input_shape=(64, 64, 1)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(64, kernel_size=(8, 8), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(1024, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    for layer in model.layers:
        print(layer.name, layer.output_shape)

    logdir = opt.logdir + '/' + datetime.now().strftime("%Y%m%d-%H%M%S")
    tensorboard_callback = TensorBoard(log_dir=logdir)
    checkpoint = ModelCheckpoint('best_model.h5',
                                 monitor='val_acc',
                                 verbose=1,
                                 save_best_only=True,
                                 period=1)
    reduce_lr_plateau = ReduceLROnPlateau(monitor='val_acc',
                                          patience=10,
                                          verbose=1)
    callbacks = [checkpoint, reduce_lr_plateau, tensorboard_callback]
    model.compile(optimizer=SGD(0.01),
                  loss="binary_crossentropy",
                  metrics=["accuracy"])

    model.fit_generator(datagen.flow(train_x, train_y, batch_size=batch_size),
                        steps_per_epoch=len(train_x) // batch_size,
                        epochs=30,
                        validation_data=(test_x, test_y),
                        callbacks=callbacks)
    model.save(opt.model_save_file)
    accuracy = model.evaluate(x=test_x, y=test_y, batch_size=batch_size)
    print("Accuracy:", accuracy[1])
Example #35
0
from dataLoader import DataLoader
from crfBrandDetector import CrfBrandDetector

if __name__ == '__main__':
    print('Preparing Data...')
    df = DataLoader().get_data()
    print('Building Model...')
    crf_model = CrfBrandDetector()
    print('Fitting...')
    x_train, x_test, y_train, y_test = crf_model.train_test_split(df)
    crf_model.fit(x_train, y_train)
    crf_model.report_classification(x_test, y_test)
    print('Accuracy: {}'.format(crf_model.evaluate(x_test, y_test)))
    pred = crf_model.predict(x_test)
    pred.to_csv('./pred.csv', index=False)
Example #36
0
                'loss': loss,
            }, os.path.join(args.exp_dir , 'unfinished_model.pt'))
        epoch += 1

    cost_time = time.time() - since
    print ('Training complete in {:.0f}m {:.0f}s'.format(cost_time//60,cost_time%60))
    print ('Best Train Acc is {:.4f}'.format(best_train_acc))
    print ('Best Val Acc is {:.4f}'.format(best_acc))
    model.load_state_dict(best_model)
    return model,cost_time,best_acc,best_train_acc


if __name__ == '__main__':
    print ('DataSets: '+args.dataset)
    print ('ResNet Depth: '+str(args.depth))
    loader = DataLoader(args.dataset,batch_size=args.batch_size)
    dataloaders,dataset_sizes = loader.load_data()
    num_classes = 10
    if args.dataset == 'cifar-10':
        num_classes = 10
    if args.dataset == 'cifar-100':
        num_classes = 100

    model = resnet_cifar(depth=args.depth, num_classes=num_classes)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1,
                                momentum=0.9, nesterov=True, weight_decay=1e-4)

    # define loss and optimizer
    criterion = nn.CrossEntropyLoss()
    scheduler = MultiStepLR(optimizer, milestones=[int(args.epoch * 0.4), int(args.epoch * 0.6), int(args.epoch * 0.8)], gamma=0.1)
Example #37
0
from dataLoader import DataLoader

loader = DataLoader()
loader.loadAll()

fileobj = open("csv/subjectAreaDump.csv", 'w')

for id, paper in loader.papers.iteritems():
    if paper.accepted:
        fileobj.write("%s|%d|%s" % (
            paper.primarySpecificSubjectArea, id, paper.title))
        for subj in paper.specificSubjectAreas:
            fileobj.write("|" + subj)
        fileobj.write("\n")
fileobj.close()