Exemple #1
0
    def build_cpg_model(self):
        """Load an existing CpG model or build a new one.

        Replicate names are read from the first training file and printed.
        If ``opts.cpg_model[0]`` is an existing path, the model is loaded,
        passed through ``remove_outputs`` and ``rename_layers(..., 'cpg')``,
        and its replicate dimension (``input_shape[0][1]``) is compared with
        the number of replicates found. On a mismatch, a new model is built
        with the builder registered under the loaded model's name and the
        weights are copied over. If the path does not exist,
        ``opts.cpg_model[0]`` is used as a builder name.

        Returns:
            The CpG model.

        Raises:
            ValueError: If no replicate names are found.
        """
        opts = self.opts
        log = self.log

        replicate_names = dat.get_replicate_names(
            opts.train_files[0],
            regex=opts.replicate_names,
            nb_key=opts.nb_replicate)
        if not replicate_names:
            raise ValueError('No replicates found!')
        print('Replicate names:')
        print(', '.join(replicate_names))
        print()

        cpg_wlen = dat.get_cpg_wlen(opts.train_files[0], opts.cpg_wlen)

        if os.path.exists(opts.cpg_model[0]):
            log.info('Loading existing CpG model ...')
            src_cpg_model = mod.load_model(opts.cpg_model, log=log.info)
            remove_outputs(src_cpg_model)
            rename_layers(src_cpg_model, 'cpg')
            nb_replicate = src_cpg_model.input_shape[0][1]
            if nb_replicate != len(replicate_names):
                # The message is built in one parenthesised expression: a
                # string literal on its own line is a separate statement and
                # would not be concatenated with the previous assignment.
                log.info('CpG model was trained with %d replicates but %d '
                         'replicates provided. Copying weights to new '
                         'model ...' % (nb_replicate, len(replicate_names)))
                cpg_model_builder = mod.cpg.get(src_cpg_model.name)(
                    l1_decay=opts.l1_decay,
                    l2_decay=opts.l2_decay,
                    dropout=opts.dropout)
                cpg_inputs = cpg_model_builder.inputs(cpg_wlen,
                                                      replicate_names)
                cpg_model = cpg_model_builder(cpg_inputs)
                mod.copy_weights(src_cpg_model, cpg_model)
            else:
                cpg_model = src_cpg_model
        else:
            log.info('Building CpG model ...')
            cpg_model_builder = mod.cpg.get(opts.cpg_model[0])(
                l1_decay=opts.l1_decay,
                l2_decay=opts.l2_decay,
                dropout=opts.dropout)
            cpg_inputs = cpg_model_builder.inputs(cpg_wlen, replicate_names)
            cpg_model = cpg_model_builder(cpg_inputs)

        return cpg_model
Exemple #2
0
    def build_cpg_model(self):
        """Load an existing CpG model or build a new one.

        Replicate names are read from the first training file and printed.
        If ``opts.cpg_model[0]`` is an existing path, the model is loaded,
        passed through ``remove_outputs`` and ``rename_layers(..., 'cpg')``,
        and its replicate dimension (``input_shape[0][1]``) is compared with
        the number of replicates found. On a mismatch, a new model is built
        with the builder registered under the loaded model's name and the
        weights are copied over. If the path does not exist,
        ``opts.cpg_model[0]`` is used as a builder name.

        Returns:
            The CpG model.

        Raises:
            ValueError: If no replicate names are found.
        """
        opts = self.opts
        log = self.log

        replicate_names = dat.get_replicate_names(opts.train_files[0],
                                                  regex=opts.replicate_names,
                                                  nb_key=opts.nb_replicate)
        if not replicate_names:
            raise ValueError('No replicates found!')
        print('Replicate names:')
        print(', '.join(replicate_names))
        print()

        cpg_wlen = dat.get_cpg_wlen(opts.train_files[0], opts.cpg_wlen)

        if os.path.exists(opts.cpg_model[0]):
            log.info('Loading existing CpG model ...')
            src_cpg_model = mod.load_model(opts.cpg_model, log=log.info)
            remove_outputs(src_cpg_model)
            rename_layers(src_cpg_model, 'cpg')
            nb_replicate = src_cpg_model.input_shape[0][1]
            if nb_replicate != len(replicate_names):
                # Build the whole message inside one call: adjacent string
                # literals only concatenate within a single expression, so
                # splitting them across statements loses the second half.
                log.info('CpG model was trained with %d replicates but %d '
                         'replicates provided. Copying weights to new '
                         'model ...' % (nb_replicate, len(replicate_names)))
                cpg_model_builder = mod.cpg.get(src_cpg_model.name)(
                    l1_decay=opts.l1_decay,
                    l2_decay=opts.l2_decay,
                    dropout=opts.dropout)
                cpg_inputs = cpg_model_builder.inputs(cpg_wlen,
                                                      replicate_names)
                cpg_model = cpg_model_builder(cpg_inputs)
                mod.copy_weights(src_cpg_model, cpg_model)
            else:
                cpg_model = src_cpg_model
        else:
            log.info('Building CpG model ...')
            cpg_model_builder = mod.cpg.get(opts.cpg_model[0])(
                l1_decay=opts.l1_decay,
                l2_decay=opts.l2_decay,
                dropout=opts.dropout)
            cpg_inputs = cpg_model_builder.inputs(cpg_wlen, replicate_names)
            cpg_model = cpg_model_builder(cpg_inputs)

        return cpg_model
Exemple #3
0
 def build_dna_model(self):
     """Return the DNA model, loaded from disk or freshly built.

     ``opts.dna_model[0]`` is first tried as a file-system path. If it
     exists, the model is loaded from ``opts.dna_model`` and passed through
     ``remove_outputs`` and ``rename_layers(..., 'dna')``. Otherwise the
     value is looked up as a builder name via ``mod.dna.get`` and a new
     model is built for the DNA window length of the first training file.
     """
     settings = self.opts
     logger = self.log
     model_spec = settings.dna_model[0]

     # Guard clause: an existing path means a stored model is reused.
     if os.path.exists(model_spec):
         logger.info('Loading existing DNA model ...')
         loaded = mod.load_model(settings.dna_model, log=logger.info)
         remove_outputs(loaded)
         rename_layers(loaded, 'dna')
         return loaded

     logger.info('Building DNA model ...')
     builder_cls = mod.dna.get(model_spec)
     builder = builder_cls(l1_decay=settings.l1_decay,
                           l2_decay=settings.l2_decay,
                           dropout=settings.dropout)
     wlen = dat.get_dna_wlen(settings.train_files[0], settings.dna_wlen)
     return builder(builder.inputs(wlen))
Exemple #4
0
 def build_dna_model(self):
     """Load the DNA model if its path exists, otherwise build it by name.

     When ``opts.dna_model[0]`` points to an existing path, the model is
     loaded from ``opts.dna_model``, then ``remove_outputs`` and
     ``rename_layers(..., 'dna')`` are applied to it. Otherwise
     ``opts.dna_model[0]`` selects a builder through ``mod.dna.get`` and
     the builder is called on inputs sized by ``dat.get_dna_wlen``.
     """
     cfg = self.opts
     log = self.log
     first_entry = cfg.dna_model[0]
     reuse_existing = os.path.exists(first_entry)

     if not reuse_existing:
         log.info('Building DNA model ...')
         regularization = dict(l1_decay=cfg.l1_decay,
                               l2_decay=cfg.l2_decay,
                               dropout=cfg.dropout)
         make_model = mod.dna.get(first_entry)(**regularization)
         window = dat.get_dna_wlen(cfg.train_files[0], cfg.dna_wlen)
         net = make_model(make_model.inputs(window))
     else:
         log.info('Loading existing DNA model ...')
         net = mod.load_model(cfg.dna_model, log=log.info)
         remove_outputs(net)
         rename_layers(net, 'dna')
     return net
Exemple #5
0
 def build_dna_model(self):
     """Return the DNA model, reusing a stored one when available.

     ``opts.dna_model[0]`` is either the path of an existing model or the
     name of a model builder (the original author cites ``CnnL2h128`` and
     ``CnnL2h256`` as examples). An existing path is loaded and passed
     through ``remove_outputs`` and ``rename_layers(..., 'dna')``; a name
     is resolved with ``mod.dna.get`` and used to build a new model.
     """
     options = self.opts
     logger = self.log
     path_or_name = options.dna_model[0]

     if os.path.exists(path_or_name):
         # A stored model exists at this path.
         logger.info('Loading existing DNA model ...')
         model = mod.load_model(options.dna_model, log=logger.info)
         remove_outputs(model)
         rename_layers(model, 'dna')
         return model

     # No stored model: resolve the builder by name and build one.
     logger.info('Building DNA model ...')
     builder_factory = mod.dna.get(path_or_name)
     builder = builder_factory(
         l1_decay=options.l1_decay,
         l2_decay=options.l2_decay,
         dropout=options.dropout)
     dna_wlen = dat.get_dna_wlen(options.train_files[0], options.dna_wlen)
     model_inputs = builder.inputs(dna_wlen)
     return builder(model_inputs)
Exemple #6
0
    def build_model(self):
        """Assemble the model from its DNA and/or CpG parts.

        Output names are read from the first training file. Depending on
        which of ``opts.dna_model`` and ``opts.cpg_model`` are set, the stem
        is the joint of both sub-models, one of them alone, or a model
        loaded from ``opts.model_files``. A loaded model whose output names
        already match (ignoring order) is returned unchanged; otherwise its
        outputs are removed. New output layers are then attached to the
        stem.

        Returns:
            The assembled model.

        Raises:
            ValueError: If no output names are found.
        """
        cfg = self.opts
        logger = self.log

        target_names = dat.get_output_names(
            cfg.train_files[0],
            regex=cfg.output_names,
            nb_key=cfg.nb_output)
        if not target_names:
            raise ValueError('No outputs found!')

        # DNA is built before CpG, as both may log and print.
        dna_part = self.build_dna_model() if cfg.dna_model else None
        cpg_part = self.build_cpg_model() if cfg.cpg_model else None

        if dna_part is None and cpg_part is None:
            logger.info('Loading existing model ...')
            stem = mod.load_model(cfg.model_files, log=logger.info)
            if sorted(target_names) == sorted(stem.output_names):
                return stem
            logger.info('Removing existing output layers ...')
            remove_outputs(stem)
        elif cpg_part is None:
            stem = dna_part
        elif dna_part is None:
            stem = cpg_part
        else:
            logger.info('Joining models ...')
            joiner = mod.joint.get(cfg.joint_model)(
                l1_decay=cfg.l1_decay,
                l2_decay=cfg.l2_decay,
                dropout=cfg.dropout)
            stem = joiner([dna_part, cpg_part])
            stem.name = '_'.join([stem.name, dna_part.name, cpg_part.name])

        new_outputs = mod.add_output_layers(stem.outputs, target_names)
        return Model(input=stem.inputs, output=new_outputs, name=stem.name)
Exemple #7
0
    def build_model(self):
        """Assemble the model from its DNA and/or CpG parts.

        Output names are read from the first training file. Depending on
        which of ``opts.dna_model`` and ``opts.cpg_model`` are set, the stem
        is the joint of both sub-models, one of them alone, or a model
        loaded from ``opts.model_files``. A loaded model whose output names
        already match (ignoring order) is returned unchanged; otherwise its
        outputs are removed. New output layers are then attached to the
        first output of the stem.

        Returns:
            The assembled model.

        Raises:
            ValueError: If no output names are found.
        """
        cfg = self.opts
        logger = self.log

        target_names = dat.get_output_names(
            cfg.train_files[0],
            regex=cfg.output_names,
            nb_key=cfg.nb_output)
        if not target_names:
            raise ValueError('No outputs found!')

        # DNA is built before CpG, as both may log and print.
        dna_part = self.build_dna_model() if cfg.dna_model else None
        cpg_part = self.build_cpg_model() if cfg.cpg_model else None

        if dna_part is None and cpg_part is None:
            logger.info('Loading existing model ...')
            stem = mod.load_model(cfg.model_files, log=logger.info)
            if sorted(target_names) == sorted(stem.output_names):
                return stem
            logger.info('Removing existing output layers ...')
            remove_outputs(stem)
        elif cpg_part is None:
            stem = dna_part
        elif dna_part is None:
            stem = cpg_part
        else:
            logger.info('Joining models ...')
            joiner = mod.joint.get(cfg.joint_model)(
                l1_decay=cfg.l1_decay,
                l2_decay=cfg.l2_decay,
                dropout=cfg.dropout)
            stem = joiner([dna_part, cpg_part])
            stem.name = '_'.join([stem.name, dna_part.name, cpg_part.name])

        new_outputs = mod.add_output_layers(stem.outputs[0], target_names)
        return Model(inputs=stem.inputs, outputs=new_outputs, name=stem.name)
    def main(self, name, opts):
        """Compute activations of the first convolutional layer and store them.

        Loads the model from ``opts.model_files`` and builds a backend
        function from the model's ``'dna'`` input to the output of the
        activation layer returned by ``mod.get_first_conv_layer`` (plus the
        model outputs when ``opts.store_preds`` is set). The function is
        evaluated batch-wise on ``opts.data_files`` and the layer weights,
        activations, optional inputs/outputs/predictions and the
        ``chromo``/``pos`` metadata are written to ``opts.out_file``.

        Args:
            name: Name passed to ``logging.getLogger``.
            opts: Options object whose attributes configure the run.

        Returns:
            0 on completion.

        Raises:
            ValueError: If no model files are given or ``opts.act_fun`` is
                not ``'mean'``, ``'wmean'`` or ``'max'``.
            IOError: If the model has no input named ``'dna'``.
        """
        logging.basicConfig(filename=opts.log_file,
                            format='%(levelname)s (%(asctime)s): %(message)s')
        log = logging.getLogger(name)
        if opts.verbose:
            log.setLevel(logging.DEBUG)
        else:
            log.setLevel(logging.INFO)
        # Logged after the level is configured: inside the INFO branch this
        # debug record would always be filtered out.
        log.debug(opts)

        if opts.seed is not None:
            np.random.seed(opts.seed)

        if not opts.model_files:
            raise ValueError('No model files provided!')

        log.info('Loading model ...')
        K.set_learning_phase(0)
        model = mod.load_model(opts.model_files, log=log.info)

        weight_layer, act_layer = mod.get_first_conv_layer(model.layers, True)
        log.info('Using activation layer "%s"' % act_layer.name)
        log.info('Using weight layer "%s"' % weight_layer.name)

        try:
            dna_idx = model.input_names.index('dna')
        except ValueError:
            # Only a missing 'dna' entry is translated; catching
            # BaseException would also swallow KeyboardInterrupt/SystemExit.
            raise IOError('Model is not a valid DNA model!')

        fun_outputs = to_list(act_layer.output)
        if opts.store_preds:
            fun_outputs += to_list(model.output)
        fun = K.function([to_list(model.input)[dna_idx]], fun_outputs)

        log.info('Reading data ...')
        if opts.store_outputs or opts.store_preds:
            output_names = model.output_names
        else:
            output_names = None
        data_reader = mod.DataReader(
            output_names=output_names,
            use_dna=True,
            dna_wlen=to_list(model.input_shape)[dna_idx][1]
        )
        nb_sample = dat.get_nb_sample(opts.data_files, opts.nb_sample)
        # NOTE(review): the data reader may shuffle while the meta reader
        # below never does; confirm that chromo/pos rows still line up with
        # the data rows when opts.shuffle is set.
        data_reader = data_reader(opts.data_files,
                                  nb_sample=nb_sample,
                                  batch_size=opts.batch_size,
                                  loop=False,
                                  shuffle=opts.shuffle)

        meta_reader = hdf.reader(opts.data_files, ['chromo', 'pos'],
                                 nb_sample=nb_sample,
                                 batch_size=opts.batch_size,
                                 loop=False,
                                 shuffle=False)

        out_file = h5.File(opts.out_file, 'w')
        # try/finally guarantees the HDF5 file is closed even if a batch
        # fails half-way through.
        try:
            out_group = out_file

            weights = weight_layer.get_weights()
            out_group['weights/weights'] = weights[0]
            out_group['weights/bias'] = weights[1]

            def h5_dump(path, data, idx, dtype=None, compression='gzip'):
                # Create the dataset lazily with room for all samples, then
                # write the current batch at its offset.
                if path not in out_group:
                    if dtype is None:
                        dtype = data.dtype
                    out_group.create_dataset(
                        name=path,
                        shape=[nb_sample] + list(data.shape[1:]),
                        dtype=dtype,
                        compression=compression
                    )
                out_group[path][idx:idx + len(data)] = data

            log.info('Computing activations')
            progbar = ProgressBar(nb_sample, log.info)
            idx = 0
            for data in data_reader:
                if isinstance(data, tuple):
                    inputs, outputs, _ = data
                else:
                    inputs = data
                if isinstance(inputs, dict):
                    inputs = list(inputs.values())
                batch_size = len(inputs[0])
                progbar.update(batch_size)

                if opts.store_inputs:
                    for i, input_name in enumerate(model.input_names):
                        h5_dump('inputs/%s' % input_name,
                                dna.onehot_to_int(inputs[i]), idx)

                if opts.store_outputs:
                    for output_name, output in six.iteritems(outputs):
                        h5_dump('outputs/%s' % output_name, output, idx)

                fun_eval = fun(inputs)
                act = fun_eval[0]

                if opts.act_wlen:
                    delta = opts.act_wlen // 2
                    ctr = act.shape[1] // 2
                    # Clamp the start: a window wider than the sequence must
                    # not wrap around through a negative index.
                    act = act[:, max(0, ctr - delta):(ctr + delta + 1)]

                if opts.act_fun:
                    if opts.act_fun == 'mean':
                        act = act.mean(axis=1)
                    elif opts.act_fun == 'wmean':
                        win_weights = linear_weights(act.shape[1])
                        act = np.average(act, axis=1, weights=win_weights)
                    elif opts.act_fun == 'max':
                        act = act.max(axis=1)
                    else:
                        raise ValueError('Invalid function "%s"!' %
                                         (opts.act_fun))

                h5_dump('act', act, idx)

                if opts.store_preds:
                    preds = fun_eval[1:]
                    for i, output_name in enumerate(model.output_names):
                        h5_dump('preds/%s' % output_name,
                                preds[i].squeeze(), idx)

                for meta_name, value in six.iteritems(next(meta_reader)):
                    h5_dump(meta_name, value, idx)

                idx += batch_size
            progbar.close()
        finally:
            out_file.close()
        log.info('Done!')

        return 0
Exemple #9
0
    def main(self, name, opts):
        """Evaluate model predictions on data files and report performance.

        Loads the model from ``opts.model_files``, predicts batch-wise on
        ``opts.data_files`` and evaluates predictions against the outputs
        with ``ev.evaluate_outputs``. Evaluation runs whenever all samples
        were seen or ``opts.eval_size`` samples have accumulated; the
        resulting reports are averaged per metric and output. Predictions
        are optionally written to ``opts.out_data`` and the report to
        ``opts.out_report``; the unstacked report is printed.

        Args:
            name: Name passed to ``logging.getLogger``.
            opts: Options object whose attributes configure the run.

        Returns:
            0 on completion.

        Raises:
            ValueError: If no model files are given.
        """
        logging.basicConfig(filename=opts.log_file,
                            format='%(levelname)s (%(asctime)s): %(message)s')
        log = logging.getLogger(name)
        if opts.verbose:
            log.setLevel(logging.DEBUG)
        else:
            log.setLevel(logging.INFO)

        if not opts.model_files:
            raise ValueError('No model files provided!')

        log.info('Loading model ...')
        model = mod.load_model(opts.model_files)

        log.info('Loading data ...')
        nb_sample = dat.get_nb_sample(opts.data_files, opts.nb_sample)
        replicate_names = dat.get_replicate_names(opts.data_files[0],
                                                  regex=opts.replicate_names,
                                                  nb_key=opts.nb_replicate)
        # Evaluation unpacks (inputs, outputs, weights) from every batch, so
        # outputs are requested explicitly. The replicate-name list used to
        # be passed positionally here as well as by keyword; an empty list
        # in that slot would act as a false flag.
        # NOTE(review): assumes the second parameter of
        # mod.data_reader_from_model is the `outputs` flag -- confirm.
        data_reader = mod.data_reader_from_model(
            model, outputs=True, replicate_names=replicate_names)

        # Seed used since unobserved input CpG states are randomly sampled
        if opts.seed is not None:
            np.random.seed(opts.seed)
            random.seed(opts.seed)

        data_reader = data_reader(opts.data_files,
                                  nb_sample=nb_sample,
                                  batch_size=opts.batch_size,
                                  loop=False,
                                  shuffle=False)

        meta_reader = hdf.reader(opts.data_files, ['chromo', 'pos'],
                                 nb_sample=nb_sample,
                                 batch_size=opts.batch_size,
                                 loop=False,
                                 shuffle=False)

        writer = None
        if opts.out_data:
            writer = H5Writer(opts.out_data, nb_sample)

        log.info('Predicting ...')
        nb_tot = 0
        nb_eval = 0
        data_eval = dict()
        perf_eval = []
        progbar = ProgressBar(nb_sample, log.info)
        # try/finally guarantees the writer is closed even if prediction or
        # evaluation fails for some batch.
        try:
            for inputs, outputs, weights in data_reader:
                batch_size = len(list(inputs.values())[0])
                nb_tot += batch_size
                progbar.update(batch_size)

                preds = to_list(model.predict(inputs))

                data_batch = dict()
                data_batch['preds'] = dict()
                data_batch['outputs'] = dict()
                for i, output_name in enumerate(model.output_names):
                    data_batch['preds'][output_name] = preds[i].squeeze()
                    data_batch['outputs'][output_name] = \
                        outputs[output_name].squeeze()

                for meta_name, value in six.iteritems(next(meta_reader)):
                    data_batch[meta_name] = value

                if writer:
                    writer.write_dict(data_batch)

                nb_eval += batch_size
                dat.add_to_dict(data_batch, data_eval)

                # Evaluate the accumulated chunk at the end of the data or
                # once eval_size samples were collected, then start over.
                if nb_tot >= nb_sample or \
                        (opts.eval_size and nb_eval >= opts.eval_size):
                    data_eval = dat.stack_dict(data_eval)
                    perf_eval.append(
                        ev.evaluate_outputs(data_eval['outputs'],
                                            data_eval['preds']))
                    data_eval = dict()
                    nb_eval = 0

            progbar.close()
        finally:
            if writer:
                writer.close()

        report = pd.concat(perf_eval)
        report = report.groupby(['metric', 'output']).mean().reset_index()

        if opts.out_report:
            report.to_csv(opts.out_report, sep='\t', index=False)

        report = ev.unstack_report(report)
        print(report.to_string())

        log.info('Done!')

        return 0
Exemple #10
0
    def main(self, name, opts):
        """Compute gradients of output statistics w.r.t. the DNA layer.

        Loads the model from ``opts.model_files``, concatenates all model
        outputs column-wise and, for every entry of ``opts.targets``
        (``'mean'`` or ``'var'`` across outputs), builds the gradient with
        respect to the output of the layer named ``'dna'``. Gradients are
        evaluated batch-wise on ``opts.data_files``, optionally sliced to a
        centre window and aggregated, and written to ``opts.out_file``
        together with optional inputs and the ``chromo``/``pos`` metadata.

        Args:
            name: Name passed to ``logging.getLogger``.
            opts: Options object whose attributes configure the run.

        Returns:
            0 on completion.

        Raises:
            ValueError: If no model files are given, the model has no layer
                named ``'dna'``, a target is not ``'mean'``/``'var'``, or
                ``opts.agg_effects`` is not ``'mean'``, ``'wmean'`` or
                ``'max'``.
        """
        logging.basicConfig(filename=opts.log_file,
                            format='%(levelname)s (%(asctime)s): %(message)s')
        log = logging.getLogger(name)
        if opts.verbose:
            log.setLevel(logging.DEBUG)
        else:
            log.setLevel(logging.INFO)
        # Logged after the level is configured: inside the INFO branch this
        # debug record would always be filtered out.
        log.debug(opts)

        if opts.seed is not None:
            np.random.seed(opts.seed)

        if not opts.model_files:
            raise ValueError('No model files provided!')

        log.info('Loading model ...')
        K.set_learning_phase(0)
        model = mod.load_model(opts.model_files)

        # Get DNA layer.
        dna_layer = None
        for layer in model.layers:
            if layer.name == 'dna':
                dna_layer = layer
                break
        # Identity check: do not depend on the truthiness of a layer object.
        if dna_layer is None:
            raise ValueError('The provided model is not a DNA model!')

        # Create output vector.
        outputs = [K.reshape(output, (-1, 1)) for output in model.outputs]
        outputs = K.concatenate(outputs, axis=1)

        # Compute gradient of outputs wrt. DNA layer.
        grads = []
        for target_name in opts.targets:
            if target_name == 'mean':
                target = K.mean(outputs, axis=1)
            elif target_name == 'var':
                target = K.var(outputs, axis=1)
            else:
                raise ValueError('Invalid effect size "%s"!' % target_name)
            grads.extend(K.gradients(target, dna_layer.output))
        grad_fun = K.function(model.inputs, grads)

        log.info('Reading data ...')
        nb_sample = dat.get_nb_sample(opts.data_files, opts.nb_sample)
        replicate_names = dat.get_replicate_names(opts.data_files[0],
                                                  regex=opts.replicate_names,
                                                  nb_key=opts.nb_replicate)
        data_reader = mod.data_reader_from_model(
            model, outputs=False, replicate_names=replicate_names)
        data_reader = data_reader(opts.data_files,
                                  nb_sample=nb_sample,
                                  batch_size=opts.batch_size,
                                  loop=False,
                                  shuffle=False)

        meta_reader = hdf.reader(opts.data_files, ['chromo', 'pos'],
                                 nb_sample=nb_sample,
                                 batch_size=opts.batch_size,
                                 loop=False,
                                 shuffle=False)

        out_file = h5.File(opts.out_file, 'w')
        # try/finally guarantees the HDF5 file is closed even if a batch
        # fails half-way through.
        try:
            out_group = out_file

            def h5_dump(path, data, idx, dtype=None, compression='gzip'):
                # Create the dataset lazily with room for all samples, then
                # write the current batch at its offset.
                if path not in out_group:
                    if dtype is None:
                        dtype = data.dtype
                    out_group.create_dataset(name=path,
                                             shape=[nb_sample] +
                                             list(data.shape[1:]),
                                             dtype=dtype,
                                             compression=compression)
                out_group[path][idx:idx + len(data)] = data

            log.info('Computing effects ...')
            progbar = ProgressBar(nb_sample, log.info)
            idx = 0
            for inputs in data_reader:
                if isinstance(inputs, dict):
                    inputs = list(inputs.values())
                batch_size = len(inputs[0])
                progbar.update(batch_size)

                # Compute gradients.
                grads = grad_fun(inputs)

                # Slice window at center.
                if opts.dna_wlen:
                    delta = opts.dna_wlen // 2
                    for i, grad in enumerate(grads):
                        ctr = grad.shape[1] // 2
                        # Clamp the start: a window wider than the sequence
                        # must not wrap around through a negative index.
                        start = max(0, ctr - delta)
                        grads[i] = grad[:, start:(ctr + delta + 1)]

                # Aggregate effects in window
                if opts.agg_effects:
                    for i, grad in enumerate(grads):
                        if opts.agg_effects == 'mean':
                            grad = grad.mean(axis=1)
                        elif opts.agg_effects == 'wmean':
                            win_weights = linear_weights(grad.shape[1])
                            grad = np.average(grad, axis=1,
                                              weights=win_weights)
                        elif opts.agg_effects == 'max':
                            grad = grad.max(axis=1)
                        else:
                            tmp = 'Invalid function "%s"!' % (
                                opts.agg_effects)
                            raise ValueError(tmp)
                        grads[i] = grad

                # Write computed effects
                for target_name, grad in zip(opts.targets, grads):
                    h5_dump(target_name, grad, idx)

                # Store inputs
                if opts.store_inputs:
                    for input_name, value in zip(model.input_names, inputs):
                        h5_dump(input_name, value, idx)

                # Store positions
                for meta_name, value in next(meta_reader).items():
                    h5_dump(meta_name, value, idx)

                idx += batch_size
            progbar.close()
        finally:
            out_file.close()
        log.info('Done!')

        return 0
Exemple #11
0
    def main(self, name, opts):
        """Compute activations of the first convolutional layer and store them.

        Loads the model from ``opts.model_files`` and builds a backend
        function from the model's ``'dna'`` input to the output of the
        activation layer returned by ``mod.get_first_conv_layer`` (plus the
        model outputs when ``opts.store_preds`` is set). The function is
        evaluated batch-wise on ``opts.data_files`` and the layer weights,
        activations, optional inputs/outputs/predictions and the
        ``chromo``/``pos`` metadata are written to ``opts.out_file``.

        Args:
            name: Name passed to ``logging.getLogger``.
            opts: Options object whose attributes configure the run.

        Returns:
            0 on completion.

        Raises:
            ValueError: If no model files are given or ``opts.act_fun`` is
                not ``'mean'``, ``'wmean'`` or ``'max'``.
            IOError: If the model has no input named ``'dna'``.
        """
        logging.basicConfig(filename=opts.log_file,
                            format='%(levelname)s (%(asctime)s): %(message)s')
        log = logging.getLogger(name)
        if opts.verbose:
            log.setLevel(logging.DEBUG)
        else:
            log.setLevel(logging.INFO)
        # Logged after the level is configured: inside the INFO branch this
        # debug record would always be filtered out.
        log.debug(opts)

        if opts.seed is not None:
            np.random.seed(opts.seed)

        if not opts.model_files:
            raise ValueError('No model files provided!')

        log.info('Loading model ...')
        K.set_learning_phase(0)
        model = mod.load_model(opts.model_files, log=log.info)

        weight_layer, act_layer = mod.get_first_conv_layer(model.layers, True)
        log.info('Using activation layer "%s"' % act_layer.name)
        log.info('Using weight layer "%s"' % weight_layer.name)

        try:
            dna_idx = model.input_names.index('dna')
        except ValueError:
            # Only a missing 'dna' entry is translated; catching
            # BaseException would also swallow KeyboardInterrupt/SystemExit.
            raise IOError('Model is not a valid DNA model!')

        fun_outputs = to_list(act_layer.output)
        if opts.store_preds:
            fun_outputs += to_list(model.output)
        fun = K.function([to_list(model.input)[dna_idx]], fun_outputs)

        log.info('Reading data ...')
        if opts.store_outputs or opts.store_preds:
            output_names = model.output_names
        else:
            output_names = None
        data_reader = mod.DataReader(output_names=output_names,
                                     use_dna=True,
                                     dna_wlen=to_list(
                                         model.input_shape)[dna_idx][1])
        nb_sample = dat.get_nb_sample(opts.data_files, opts.nb_sample)
        data_reader = data_reader(opts.data_files,
                                  nb_sample=nb_sample,
                                  batch_size=opts.batch_size,
                                  loop=False,
                                  shuffle=False)

        meta_reader = hdf.reader(opts.data_files, ['chromo', 'pos'],
                                 nb_sample=nb_sample,
                                 batch_size=opts.batch_size,
                                 loop=False,
                                 shuffle=False)

        out_file = h5.File(opts.out_file, 'w')
        # try/finally guarantees the HDF5 file is closed even if a batch
        # fails half-way through.
        try:
            out_group = out_file

            weights = weight_layer.get_weights()
            out_group['weights/weights'] = weights[0]
            out_group['weights/bias'] = weights[1]

            def h5_dump(path, data, idx, dtype=None, compression='gzip'):
                # Create the dataset lazily with room for all samples, then
                # write the current batch at its offset.
                if path not in out_group:
                    if dtype is None:
                        dtype = data.dtype
                    out_group.create_dataset(name=path,
                                             shape=[nb_sample] +
                                             list(data.shape[1:]),
                                             dtype=dtype,
                                             compression=compression)
                out_group[path][idx:idx + len(data)] = data

            log.info('Computing activations')
            progbar = ProgressBar(nb_sample, log.info)
            idx = 0
            for data in data_reader:
                if isinstance(data, tuple):
                    inputs, outputs, _ = data
                else:
                    inputs = data
                if isinstance(inputs, dict):
                    inputs = list(inputs.values())
                batch_size = len(inputs[0])
                progbar.update(batch_size)

                if opts.store_inputs:
                    for i, input_name in enumerate(model.input_names):
                        h5_dump('inputs/%s' % input_name,
                                dna.onehot_to_int(inputs[i]), idx)

                if opts.store_outputs:
                    for output_name, output in six.iteritems(outputs):
                        h5_dump('outputs/%s' % output_name, output, idx)

                fun_eval = fun(inputs)
                act = fun_eval[0]

                if opts.act_wlen:
                    delta = opts.act_wlen // 2
                    ctr = act.shape[1] // 2
                    # Clamp the start: a window wider than the sequence must
                    # not wrap around through a negative index.
                    act = act[:, max(0, ctr - delta):(ctr + delta + 1)]

                if opts.act_fun:
                    if opts.act_fun == 'mean':
                        act = act.mean(axis=1)
                    elif opts.act_fun == 'wmean':
                        win_weights = linear_weights(act.shape[1])
                        act = np.average(act, axis=1, weights=win_weights)
                    elif opts.act_fun == 'max':
                        act = act.max(axis=1)
                    else:
                        raise ValueError('Invalid function "%s"!' %
                                         (opts.act_fun))

                h5_dump('act', act, idx)

                if opts.store_preds:
                    preds = fun_eval[1:]
                    for i, output_name in enumerate(model.output_names):
                        h5_dump('preds/%s' % output_name,
                                preds[i].squeeze(), idx)

                for meta_name, value in six.iteritems(next(meta_reader)):
                    h5_dump(meta_name, value, idx)

                idx += batch_size
            progbar.close()
        finally:
            out_file.close()
        log.info('Done!')

        return 0
Exemple #12
0
    def main(self, name, opts):
        """Compute gradients of output statistics w.r.t. the DNA layer.

        Loads the model from ``opts.model_files``, concatenates all model
        outputs column-wise and, for every entry of ``opts.targets``
        (``'mean'`` or ``'var'`` across outputs), builds the gradient with
        respect to the output of the layer named ``'dna'``. Gradients are
        evaluated batch-wise on ``opts.data_files``, optionally sliced to a
        centre window and aggregated, and written to ``opts.out_file``
        together with optional inputs and the ``chromo``/``pos`` metadata.

        Args:
            name: Name passed to ``logging.getLogger``.
            opts: Options object whose attributes configure the run.

        Returns:
            0 on completion.

        Raises:
            ValueError: If no model files are given, the model has no layer
                named ``'dna'``, a target is not ``'mean'``/``'var'``, or
                ``opts.agg_effects`` is not ``'mean'``, ``'wmean'`` or
                ``'max'``.
        """
        logging.basicConfig(filename=opts.log_file,
                            format='%(levelname)s (%(asctime)s): %(message)s')
        log = logging.getLogger(name)
        if opts.verbose:
            log.setLevel(logging.DEBUG)
        else:
            log.setLevel(logging.INFO)
        # Logged after the level is configured: inside the INFO branch this
        # debug record would always be filtered out.
        log.debug(opts)

        if opts.seed is not None:
            np.random.seed(opts.seed)

        if not opts.model_files:
            raise ValueError('No model files provided!')

        log.info('Loading model ...')
        K.set_learning_phase(0)
        model = mod.load_model(opts.model_files)

        # Get DNA layer.
        dna_layer = None
        for layer in model.layers:
            if layer.name == 'dna':
                dna_layer = layer
                break
        # Identity check: do not depend on the truthiness of a layer object.
        if dna_layer is None:
            raise ValueError('The provided model is not a DNA model!')

        # Create output vector.
        outputs = [K.reshape(output, (-1, 1)) for output in model.outputs]
        outputs = K.concatenate(outputs, axis=1)

        # Compute gradient of outputs wrt. DNA layer.
        grads = []
        for target_name in opts.targets:
            if target_name == 'mean':
                target = K.mean(outputs, axis=1)
            elif target_name == 'var':
                target = K.var(outputs, axis=1)
            else:
                raise ValueError('Invalid effect size "%s"!' % target_name)
            grads.extend(K.gradients(target, dna_layer.output))
        grad_fun = K.function(model.inputs, grads)

        log.info('Reading data ...')
        nb_sample = dat.get_nb_sample(opts.data_files, opts.nb_sample)
        replicate_names = dat.get_replicate_names(
            opts.data_files[0],
            regex=opts.replicate_names,
            nb_key=opts.nb_replicate)
        data_reader = mod.data_reader_from_model(
            model, outputs=False, replicate_names=replicate_names)
        data_reader = data_reader(opts.data_files,
                                  nb_sample=nb_sample,
                                  batch_size=opts.batch_size,
                                  loop=False,
                                  shuffle=False)

        meta_reader = hdf.reader(opts.data_files, ['chromo', 'pos'],
                                 nb_sample=nb_sample,
                                 batch_size=opts.batch_size,
                                 loop=False,
                                 shuffle=False)

        out_file = h5.File(opts.out_file, 'w')
        # try/finally guarantees the HDF5 file is closed even if a batch
        # fails half-way through.
        try:
            out_group = out_file

            def h5_dump(path, data, idx, dtype=None, compression='gzip'):
                # Create the dataset lazily with room for all samples, then
                # write the current batch at its offset.
                if path not in out_group:
                    if dtype is None:
                        dtype = data.dtype
                    out_group.create_dataset(
                        name=path,
                        shape=[nb_sample] + list(data.shape[1:]),
                        dtype=dtype,
                        compression=compression
                    )
                out_group[path][idx:idx + len(data)] = data

            log.info('Computing effects ...')
            progbar = ProgressBar(nb_sample, log.info)
            idx = 0
            for inputs in data_reader:
                if isinstance(inputs, dict):
                    inputs = list(inputs.values())
                batch_size = len(inputs[0])
                progbar.update(batch_size)

                # Compute gradients.
                grads = grad_fun(inputs)

                # Slice window at center.
                if opts.dna_wlen:
                    delta = opts.dna_wlen // 2
                    for i, grad in enumerate(grads):
                        ctr = grad.shape[1] // 2
                        # Clamp the start: a window wider than the sequence
                        # must not wrap around through a negative index.
                        start = max(0, ctr - delta)
                        grads[i] = grad[:, start:(ctr + delta + 1)]

                # Aggregate effects in window
                if opts.agg_effects:
                    for i, grad in enumerate(grads):
                        if opts.agg_effects == 'mean':
                            grad = grad.mean(axis=1)
                        elif opts.agg_effects == 'wmean':
                            win_weights = linear_weights(grad.shape[1])
                            grad = np.average(grad, axis=1,
                                              weights=win_weights)
                        elif opts.agg_effects == 'max':
                            grad = grad.max(axis=1)
                        else:
                            tmp = 'Invalid function "%s"!' % (
                                opts.agg_effects)
                            raise ValueError(tmp)
                        grads[i] = grad

                # Write computed effects
                for target_name, grad in zip(opts.targets, grads):
                    h5_dump(target_name, grad, idx)

                # Store inputs
                if opts.store_inputs:
                    for input_name, value in zip(model.input_names, inputs):
                        h5_dump(input_name, value, idx)

                # Store positions
                for meta_name, value in next(meta_reader).items():
                    h5_dump(meta_name, value, idx)

                idx += batch_size
            progbar.close()
        finally:
            out_file.close()
        log.info('Done!')

        return 0
Exemple #13
0
    def main(self, name, opts):
        """Predict all samples with a stored model and report performance.

        Loads the model from `opts.model_files`, reads `opts.data_files`
        batch by batch, collects predictions, observed outputs and the
        `chromo`/`pos` meta data of every batch in memory, and passes the
        stacked outputs and predictions to `ev.evaluate_outputs`. The
        unstacked report is printed; the raw report and the collected data
        are written to `opts.out_report` and `opts.out_data` if those
        options are set.

        Parameters
        ----------
        name:
            Name of the logger to use.
        opts:
            Options object; the attributes read are visible in the body.

        Returns
        -------
        int
            Always 0.

        Raises
        ------
        ValueError
            If `opts.model_files` is empty.
        """
        logging.basicConfig(filename=opts.log_file,
                            format='%(levelname)s (%(asctime)s): %(message)s')
        log = logging.getLogger(name)
        log.setLevel(logging.DEBUG if opts.verbose else logging.INFO)

        if not opts.model_files:
            raise ValueError('No model files provided!')

        log.info('Loading model ...')
        model = mod.load_model(opts.model_files)

        log.info('Loading data ...')
        nb_sample = dat.get_nb_sample(opts.data_files, opts.nb_sample)
        replicate_names = dat.get_replicate_names(
            opts.data_files[0],
            regex=opts.replicate_names,
            nb_key=opts.nb_replicate)
        # NOTE(review): `replicate_names` is passed both positionally and by
        # keyword; the positional slot is presumably a different parameter --
        # confirm against the signature of `data_reader_from_model`.
        reader_factory = mod.data_reader_from_model(
            model, replicate_names, replicate_names=replicate_names)

        # Both readers are configured identically so that their batches can
        # be consumed in lockstep below.
        reader_kwargs = dict(nb_sample=nb_sample,
                             batch_size=opts.batch_size,
                             loop=False, shuffle=False)
        data_reader = reader_factory(opts.data_files, **reader_kwargs)
        meta_reader = hdf.reader(opts.data_files, ['chromo', 'pos'],
                                 **reader_kwargs)

        log.info('Predicting ...')
        data = dict()
        progbar = ProgressBar(nb_sample, log.info)
        for inputs, outputs, weights in data_reader:
            # The batch size is taken from the first input array.
            first_input = list(inputs.values())[0]
            progbar.update(len(first_input))

            preds = to_list(model.predict(inputs))

            batch = {'preds': dict(), 'outputs': dict()}
            for idx, output_name in enumerate(model.output_names):
                batch['preds'][output_name] = preds[idx].squeeze()
                batch['outputs'][output_name] = outputs[output_name].squeeze()

            # Attach the meta data of the matching batch.
            batch.update(next(meta_reader).items())
            dat.add_to_dict(batch, data)
        progbar.close()

        stacked = dat.stack_dict(data)
        report = ev.evaluate_outputs(stacked['outputs'], stacked['preds'])

        if opts.out_report:
            report.to_csv(opts.out_report, sep='\t', index=False)

        print(ev.unstack_report(report).to_string())

        if opts.out_data:
            hdf.write_data(stacked, opts.out_data)

        log.info('Done!')

        return 0
Exemple #14
0
    def main(self, name, opts):
        """Predict all samples with a stored model and report performance.

        Loads the model from `opts.model_files` and reads `opts.data_files`
        batch by batch. Every batch (predictions, observed outputs and the
        `chromo`/`pos` meta data) is written to `opts.out_data` if that
        option is set. Performance is evaluated with `ev.evaluate_outputs`
        on chunks of at least `opts.eval_size` samples (or on all samples at
        once if `opts.eval_size` is not set), and the per-chunk reports are
        averaged per metric and output. The averaged report is written to
        `opts.out_report` if set, and printed in unstacked form.

        Parameters
        ----------
        name:
            Name of the logger to use.
        opts:
            Options object; the attributes read are visible in the body.

        Returns
        -------
        int
            Always 0.

        Raises
        ------
        ValueError
            If `opts.model_files` is empty.
        """
        logging.basicConfig(filename=opts.log_file,
                            format='%(levelname)s (%(asctime)s): %(message)s')
        log = logging.getLogger(name)
        if opts.verbose:
            log.setLevel(logging.DEBUG)
        else:
            log.setLevel(logging.INFO)

        if not opts.model_files:
            raise ValueError('No model files provided!')

        log.info('Loading model ...')
        model = mod.load_model(opts.model_files)

        log.info('Loading data ...')
        nb_sample = dat.get_nb_sample(opts.data_files, opts.nb_sample)
        replicate_names = dat.get_replicate_names(
            opts.data_files[0],
            regex=opts.replicate_names,
            nb_key=opts.nb_replicate)
        # NOTE(review): `replicate_names` is passed both positionally and by
        # keyword; the positional slot is presumably a different parameter --
        # confirm against the signature of `data_reader_from_model`.
        data_reader = mod.data_reader_from_model(
            model, replicate_names, replicate_names=replicate_names)

        # Seed used since unobserved input CpG states are randomly sampled
        if opts.seed is not None:
            np.random.seed(opts.seed)
            random.seed(opts.seed)

        data_reader = data_reader(opts.data_files,
                                  nb_sample=nb_sample,
                                  batch_size=opts.batch_size,
                                  loop=False, shuffle=False)

        meta_reader = hdf.reader(opts.data_files, ['chromo', 'pos'],
                                 nb_sample=nb_sample,
                                 batch_size=opts.batch_size,
                                 loop=False, shuffle=False)

        writer = None
        if opts.out_data:
            writer = H5Writer(opts.out_data, nb_sample)

        def evaluate_chunk(chunk):
            # Stack the batches collected in `chunk` and evaluate them.
            chunk = dat.stack_dict(chunk)
            return ev.evaluate_outputs(chunk['outputs'], chunk['preds'])

        log.info('Predicting ...')
        nb_tot = 0
        nb_eval = 0
        data_eval = dict()
        perf_eval = []
        progbar = ProgressBar(nb_sample, log.info)
        # The writer is closed in `finally` so that the output file is not
        # left open if prediction or evaluation fails midway.
        try:
            for inputs, outputs, weights in data_reader:
                batch_size = len(list(inputs.values())[0])
                nb_tot += batch_size
                progbar.update(batch_size)

                preds = to_list(model.predict(inputs))

                data_batch = dict()
                data_batch['preds'] = dict()
                data_batch['outputs'] = dict()
                for i, name in enumerate(model.output_names):
                    data_batch['preds'][name] = preds[i].squeeze()
                    data_batch['outputs'][name] = outputs[name].squeeze()

                for name, value in six.iteritems(next(meta_reader)):
                    data_batch[name] = value

                if writer:
                    writer.write_dict(data_batch)

                nb_eval += batch_size
                dat.add_to_dict(data_batch, data_eval)

                # Evaluate once all samples were seen or the current chunk
                # has reached the requested evaluation size.
                if nb_tot >= nb_sample or \
                        (opts.eval_size and nb_eval >= opts.eval_size):
                    perf_eval.append(evaluate_chunk(data_eval))
                    data_eval = dict()
                    nb_eval = 0

            # Samples are left over only if the reader stopped before
            # `nb_sample` samples were seen; evaluate them as a final chunk
            # instead of silently dropping them from the report.
            if nb_eval:
                perf_eval.append(evaluate_chunk(data_eval))

            progbar.close()
        finally:
            if writer:
                writer.close()

        # Average the per-chunk performance for every metric and output.
        report = pd.concat(perf_eval)
        report = report.groupby(['metric', 'output']).mean().reset_index()

        if opts.out_report:
            report.to_csv(opts.out_report, sep='\t', index=False)

        report = ev.unstack_report(report)
        print(report.to_string())

        log.info('Done!')

        return 0