Esempio n. 1
0
    def main(self, name, opts):
        """Compute statistics for each output and print, save, or plot them.

        Configures logging, reads every output selected by
        ``opts.output_names`` from ``opts.data_files``, computes its
        statistics with ``get_output_stats`` and prints the resulting table.
        The table is also written as a tab-separated file if ``opts.out_tsv``
        is set, and plotted to ``opts.out_fig`` if that option is set.

        Args:
            name: Name of the logger to use.
            opts: Parsed options. Reads ``log_file``, ``verbose``,
                ``data_files``, ``output_names``, ``nb_sample``, ``out_tsv``
                and ``out_fig``.

        Returns:
            0 on success.

        Raises:
            ValueError: If no output matches ``opts.output_names``.
        """
        logging.basicConfig(filename=opts.log_file,
                            format='%(levelname)s (%(asctime)s): %(message)s')
        log = logging.getLogger(name)
        if opts.verbose:
            log.setLevel(logging.DEBUG)
        else:
            log.setLevel(logging.INFO)
        log.debug(opts)

        # Output names are looked up in the first data file only.
        output_names = dat.get_output_names(opts.data_files[0],
                                            regex=opts.output_names)
        if not output_names:
            # Without this guard, pd.concat() below fails on an empty list
            # with the far less helpful "No objects to concatenate".
            raise ValueError('No outputs found!')

        stats = OrderedDict()
        for output_name in output_names:
            output = hdf.read(opts.data_files,
                              'outputs/%s' % output_name,
                              nb_sample=opts.nb_sample)
            # A single dataset was requested, so only the first value of the
            # returned mapping is used.
            output = list(output.values())[0]
            stats[output_name] = get_output_stats(output)

        # One single-row frame per output, stacked into one table whose
        # index (the output name) becomes the regular column `output`.
        frames = [pd.DataFrame(value, index=[key])
                  for key, value in six.iteritems(stats)]
        stats = pd.concat(frames)
        stats.index.name = 'output'
        stats.reset_index(inplace=True)

        print(stats.to_string())
        if opts.out_tsv:
            stats.to_csv(opts.out_tsv, sep='\t', index=False)

        if opts.out_fig:
            plot_stats(stats).savefig(opts.out_fig)

        return 0
Esempio n. 2
0
    def main(self, name, opts):
        """Summarize the outputs of the data files in a statistics table.

        Sets up logging, computes ``get_output_stats`` for every output
        selected by ``opts.output_names`` and prints one row per output.
        Optionally saves the table to ``opts.out_tsv`` (tab-separated) and a
        plot of it to ``opts.out_fig``.

        Args:
            name: Name of the logger to use.
            opts: Parsed options. Reads ``log_file``, ``verbose``,
                ``data_files``, ``output_names``, ``nb_sample``, ``out_tsv``
                and ``out_fig``.

        Returns:
            0 on success.

        Raises:
            ValueError: If no output matches ``opts.output_names``.
        """
        logging.basicConfig(filename=opts.log_file,
                            format='%(levelname)s (%(asctime)s): %(message)s')
        log = logging.getLogger(name)
        log.setLevel(logging.DEBUG if opts.verbose else logging.INFO)
        log.debug(opts)

        # Only the first data file is inspected to discover output names.
        output_names = dat.get_output_names(opts.data_files[0],
                                            regex=opts.output_names)
        if not output_names:
            # pd.concat() rejects an empty list with the unhelpful message
            # "No objects to concatenate"; fail early with a clear one.
            raise ValueError('No outputs found!')

        stats = OrderedDict()
        for output_name in output_names:
            output = hdf.read(opts.data_files, 'outputs/%s' % output_name,
                              nb_sample=opts.nb_sample)
            # Exactly one dataset was requested; take the first value of the
            # returned mapping.
            output = list(output.values())[0]
            stats[output_name] = get_output_stats(output)

        # Stack one single-row frame per output and expose the output name
        # as an ordinary column called `output`.
        frames = [pd.DataFrame(value, index=[key])
                  for key, value in six.iteritems(stats)]
        stats = pd.concat(frames)
        stats.index.name = 'output'
        stats.reset_index(inplace=True)

        print(stats.to_string())
        if opts.out_tsv:
            stats.to_csv(opts.out_tsv, sep='\t', index=False)

        if opts.out_fig:
            plot_stats(stats).savefig(opts.out_fig)

        return 0
Esempio n. 3
0
    def build_model(self):
        """Assemble the model from the sub-models selected in ``self.opts``.

        The stem is chosen as follows:

        * DNA and CpG model both requested: they are combined with the joint
          model builder named by ``opts.joint_model``, and the stem is
          renamed to ``<joint>_<dna>_<cpg>``.
        * Only one of them requested: that model is the stem.
        * Neither requested: an existing model is loaded from
          ``opts.model_files``. It is returned as is when its output names
          equal the requested ones (order ignored); otherwise its output
          layers are removed first.

        New output layers for the requested outputs are then added on top of
        the stem.

        Returns:
            The assembled model.

        Raises:
            ValueError: If no output name is found in the first training
                file.
        """
        opts, log = self.opts, self.log

        output_names = dat.get_output_names(opts.train_files[0],
                                            regex=opts.output_names,
                                            nb_key=opts.nb_output)
        if not output_names:
            raise ValueError('No outputs found!')

        dna_model = self.build_dna_model() if opts.dna_model else None
        cpg_model = self.build_cpg_model() if opts.cpg_model else None
        has_dna = dna_model is not None
        has_cpg = cpg_model is not None

        if not (has_dna or has_cpg):
            # Nothing to build from scratch: start from a stored model.
            log.info('Loading existing model ...')
            stem = mod.load_model(opts.model_files, log=log.info)
            if sorted(output_names) == sorted(stem.output_names):
                return stem
            log.info('Removing existing output layers ...')
            remove_outputs(stem)
        elif has_dna and has_cpg:
            log.info('Joining models ...')
            builder_class = mod.joint.get(opts.joint_model)
            join_models = builder_class(l1_decay=opts.l1_decay,
                                        l2_decay=opts.l2_decay,
                                        dropout=opts.dropout)
            stem = join_models([dna_model, cpg_model])
            stem.name = '_'.join((stem.name, dna_model.name, cpg_model.name))
        else:
            # Exactly one sub-model was built.
            stem = dna_model if has_dna else cpg_model

        new_outputs = mod.add_output_layers(stem.outputs, output_names)
        return Model(input=stem.inputs, output=new_outputs, name=stem.name)
Esempio n. 4
0
    def build_model(self):
        """Build the model described by ``self.opts`` and return it.

        Depending on which of ``opts.dna_model`` and ``opts.cpg_model`` are
        set, the stem is the joint of both sub-models (renamed to
        ``<joint>_<dna>_<cpg>``), the single sub-model that was built, or an
        existing model loaded from ``opts.model_files``. A loaded model whose
        output names already equal the requested ones (order ignored) is
        returned unchanged; otherwise its output layers are removed.

        Output layers for the requested outputs are finally attached to the
        first output of the stem.

        Returns:
            The assembled model.

        Raises:
            ValueError: If no output name is found in the first training
                file.
        """
        opts = self.opts
        log = self.log

        output_names = dat.get_output_names(
            opts.train_files[0],
            regex=opts.output_names,
            nb_key=opts.nb_output)
        if not output_names:
            raise ValueError('No outputs found!')

        dna_model = cpg_model = None
        if opts.dna_model:
            dna_model = self.build_dna_model()
        if opts.cpg_model:
            cpg_model = self.build_cpg_model()

        # Sub-models that were actually built, in (dna, cpg) order.
        built = [sub_model for sub_model in (dna_model, cpg_model)
                 if sub_model is not None]

        if len(built) == 2:
            log.info('Joining models ...')
            make_builder = mod.joint.get(opts.joint_model)
            joiner = make_builder(
                l1_decay=opts.l1_decay,
                l2_decay=opts.l2_decay,
                dropout=opts.dropout)
            stem = joiner([dna_model, cpg_model])
            name_parts = [stem.name, dna_model.name, cpg_model.name]
            stem.name = '_'.join(name_parts)
        elif len(built) == 1:
            stem = built[0]
        else:
            log.info('Loading existing model ...')
            stem = mod.load_model(opts.model_files, log=log.info)
            if sorted(output_names) == sorted(stem.output_names):
                return stem
            log.info('Removing existing output layers ...')
            remove_outputs(stem)

        stem_output = stem.outputs[0]
        outputs = mod.add_output_layers(stem_output, output_names)
        return Model(inputs=stem.inputs, outputs=outputs, name=stem.name)
Esempio n. 5
0
    def main(self, name, opts):
        """Export one file per output, merging known labels and predictions.

        Reads ``chromo`` and ``pos`` plus, for every selected output, the
        ``outputs/<name>`` and ``preds/<name>`` datasets from the HDF5 file
        ``opts.data_file``. For each output a ``value`` array is built that
        holds the label where it is known (not ``dat.CPG_NAN``) and the
        prediction elsewhere, and the result is written to ``opts.out_dir``
        in the format given by ``opts.out_format``.

        Args:
            name: Name of the logger to use.
            opts: Parsed options. Reads ``log_file``, ``verbose``,
                ``data_file``, ``nb_sample``, ``chromos``, ``output_names``,
                ``out_dir`` and ``out_format``.

        Returns:
            0 on success.

        Raises:
            ValueError: If ``opts.out_format`` is neither ``'bedGraph'`` nor
                ``'hdf'`` (raised while handling the first output).
        """
        logging.basicConfig(filename=opts.log_file,
                            format='%(levelname)s (%(asctime)s): %(message)s')
        log = logging.getLogger(name)
        if opts.verbose:
            log.setLevel(logging.DEBUG)
        else:
            log.setLevel(logging.INFO)

        # The context manager closes the input file even if exporting fails.
        with h5.File(opts.data_file, 'r') as data_file:
            nb_sample = len(data_file['pos'])
            if opts.nb_sample:
                nb_sample = min(nb_sample, opts.nb_sample)

            data = dict()
            for key in ['chromo', 'pos']:
                data[key] = data_file[key][:nb_sample]

            # Optional boolean mask restricting the export to the requested
            # chromosomes; chromosome names are stored as bytes.
            idx = None
            if opts.chromos:
                idx = np.in1d(data['chromo'],
                              [chromo.encode() for chromo in opts.chromos])
                for key, value in six.iteritems(data):
                    data[key] = value[idx]

            output_names = dat.get_output_names(opts.data_file,
                                                regex=opts.output_names)

            make_dir(opts.out_dir)

            for output_name in output_names:
                log.info(output_name)
                data['output'] = data_file['outputs'][output_name][:nb_sample]
                data['pred'] = data_file['preds'][output_name][:nb_sample]
                if idx is not None:
                    for key in ['output', 'pred']:
                        data[key] = data[key][idx]

                # Use `output` label if known, otherwise prediction. Work on
                # a copy: assigning into an alias of data['pred'] would also
                # overwrite the raw predictions.
                data['value'] = data['pred'].copy()
                known = data['output'] != dat.CPG_NAN
                data['value'][known] = data['output'][known]

                # File name: last component for `cpg` outputs, otherwise all
                # components joined by underscores.
                parts = output_name.split(dat.OUTPUT_SEP)
                if parts[0] == 'cpg':
                    base_name = parts[-1]
                else:
                    base_name = '_'.join(parts)
                out_file = os.path.join(opts.out_dir, base_name)

                if opts.out_format == 'bedGraph':
                    write_to_bedGraph(data,
                                      out_file + '.bedGraph.gz',
                                      compression='gzip')
                elif opts.out_format == 'hdf':
                    write_to_hdf(data, out_file + '.h5')
                else:
                    # Pass the message on; it used to be built and dropped.
                    raise ValueError('Invalid output format "%s"!'
                                     % opts.out_format)

        log.info('Done!')

        return 0
Esempio n. 6
0
    def main(self, name, opts):
        """Evaluate predictions globally and within genomic annotations.

        Loads ``chromo``, ``pos``, ``outputs`` and ``preds`` from
        ``opts.data_file``, sorts all samples by chromosome and position, and
        computes performance metrics (and, if requested, performance curves)
        over all samples. The same is then done for each annotation file in
        ``opts.anno_files``, restricted to the samples inside the
        annotation. Results are saved through ``self.save_report`` as
        ``metrics`` and ``curves``.

        Args:
            name: Name of the logger to use.
            opts: Parsed options. Reads ``log_file``, ``verbose``,
                ``curves``, ``anno_curves``, ``data_file``, ``output_names``,
                ``nb_output``, ``nb_sample``, ``nb_curve_point``,
                ``anno_files``, ``anno_min_sites`` and ``out_dir``.

        Returns:
            0 on success.
        """
        logging.basicConfig(filename=opts.log_file,
                            format='%(levelname)s (%(asctime)s): %(message)s')
        log = logging.getLogger(name)
        if opts.verbose:
            log.setLevel(logging.DEBUG)
        else:
            log.setLevel(logging.INFO)
        log.debug(opts)

        self.opts = opts
        self.log = log

        # Get performance curve functions from names.
        curve_funs = dict()
        if opts.curves:
            for curve_name in opts.curves:
                curve_funs[curve_name] = get_curve_fun(curve_name)
        anno_curve_funs = dict()
        if opts.anno_curves:
            for curve_name in opts.anno_curves:
                anno_curve_funs[curve_name] = get_curve_fun(curve_name)

        log.info('Loading data ...')
        # Read and sort predictions and outputs.
        output_names = dat.get_output_names(opts.data_file,
                                            regex=opts.output_names,
                                            nb_key=opts.nb_output)
        names = {'chromo': None, 'pos': None,
                 'outputs': output_names,
                 'preds': output_names}
        data = hdf.read(opts.data_file, names, nb_sample=opts.nb_sample)
        # Chromosome names are stored as bytes; decode them to str.
        data['chromo'] = [chromo.decode() for chromo in data['chromo']]
        data['chromo'] = np.array(data['chromo'])
        data = fold_dict(data, nb_level=1)
        # lexsort uses the last key as primary: chromosome, then position.
        idx = np.lexsort((data['pos'], data['chromo']))
        data = slice_dict(data, idx)
        # Sanity check: positions must be sorted within each chromosome.
        for chromo in np.unique(data['chromo']):
            chromo_pos = data['pos'][data['chromo'] == chromo]
            tmp = np.sort(chromo_pos)
            assert np.all(chromo_pos == tmp)
        log.info('%d samples' % len(data['pos']))

        reports = []
        curves = []

        log.info('Evaluating globally ...')
        # Evaluate performances globally.
        report = ev.evaluate_outputs(data['outputs'], data['preds'])
        report['anno'] = ANNO_GLOBAL
        reports.append(report)
        pd.set_option('display.width', 1000)
        print(ev.unstack_report(report))

        # Global performance curves.
        for curve_name, fun in curve_funs.items():
            log.info('%s curve' % curve_name)
            curve = ev.evaluate_curve(data['outputs'], data['preds'],
                                      fun=fun, nb_point=opts.nb_curve_point)
            if curve is not None:
                curve['curve'] = curve_name
                curve['anno'] = ANNO_GLOBAL
                curves.append(curve)

        if opts.anno_files:
            log.info('Evaluating annotations ...')
            # Evaluate annotations.
            for anno_file in opts.anno_files:
                anno = read_anno_file(anno_file)
                anno_name = os.path.splitext(os.path.basename(anno_file))[0]
                anno_idx = annotate(data['chromo'], data['pos'], anno)
                nb_site = anno_idx.sum()
                log.info('%s: %d' % (anno_name, nb_site))
                if nb_site < opts.anno_min_sites:
                    log.info('Skipping due to insufficient annotated sites!')
                    continue
                # Select data at annotated sites.
                anno_data = slice_dict(data, anno_idx)
                report = ev.evaluate_outputs(anno_data['outputs'],
                                             anno_data['preds'])
                report['anno'] = anno_name
                reports.append(report)

                # Annotation-specific performance curves. They depend only
                # on `anno_curve_funs` (not on whether global curves were
                # requested) and are computed on the annotated sites, not
                # on the whole data set.
                for curve_name, fun in anno_curve_funs.items():
                    log.info('%s curve' % curve_name)
                    curve = ev.evaluate_curve(
                        anno_data['outputs'], anno_data['preds'],
                        fun=fun, nb_point=opts.nb_curve_point)
                    if curve is not None:
                        curve['curve'] = curve_name
                        curve['anno'] = anno_name
                        curves.append(curve)

        make_dir(opts.out_dir)
        if reports:
            report = pd.concat(reports)
            report = report[['anno', 'metric', 'output', 'value']]
            self.save_report(report, 'metrics')
        if curves:
            curves = pd.concat(curves)
            curves = curves[['anno', 'curve', 'output', 'x', 'y', 'thr']]
            self.save_report(curves, 'curves')

        log.info('Done!')

        return 0
Esempio n. 7
0
    def main(self, name, opts):
        """Write one export file per output with labels and predictions merged.

        For every output selected by ``opts.output_names``, the datasets
        ``outputs/<name>`` and ``preds/<name>`` are read from the HDF5 file
        ``opts.data_file`` together with ``chromo`` and ``pos``. A ``value``
        array is derived that contains the label where it is known (not
        ``dat.CPG_NAN``) and the prediction otherwise, and the data are
        written to ``opts.out_dir`` as gzipped bedGraph or HDF5.

        Args:
            name: Name of the logger to use.
            opts: Parsed options. Reads ``log_file``, ``verbose``,
                ``data_file``, ``nb_sample``, ``chromos``, ``output_names``,
                ``out_dir`` and ``out_format``.

        Returns:
            0 on success.

        Raises:
            ValueError: If ``opts.out_format`` is neither ``'bedGraph'`` nor
                ``'hdf'`` (raised while handling the first output).
        """
        logging.basicConfig(filename=opts.log_file,
                            format='%(levelname)s (%(asctime)s): %(message)s')
        log = logging.getLogger(name)
        if opts.verbose:
            log.setLevel(logging.DEBUG)
        else:
            log.setLevel(logging.INFO)

        # Open the input inside a `with` block so the handle is released on
        # both success and failure.
        with h5.File(opts.data_file, 'r') as data_file:
            nb_sample = len(data_file['pos'])
            if opts.nb_sample:
                nb_sample = min(nb_sample, opts.nb_sample)

            data = dict()
            for key in ['chromo', 'pos']:
                data[key] = data_file[key][:nb_sample]

            # Boolean mask of samples on the requested chromosomes, or None
            # when no filter was given. Chromosome names are bytes on disk.
            idx = None
            if opts.chromos:
                idx = np.in1d(data['chromo'],
                              [chromo.encode() for chromo in opts.chromos])
                for key, value in six.iteritems(data):
                    data[key] = value[idx]

            output_names = dat.get_output_names(opts.data_file,
                                                regex=opts.output_names)

            make_dir(opts.out_dir)

            for output_name in output_names:
                log.info(output_name)
                data['output'] = data_file['outputs'][output_name][:nb_sample]
                data['pred'] = data_file['preds'][output_name][:nb_sample]
                if idx is not None:
                    for key in ['output', 'pred']:
                        data[key] = data[key][idx]

                # Use `output` label if known, otherwise prediction. The copy
                # keeps data['pred'] intact; without it both keys would share
                # one array and the labels would overwrite the predictions.
                data['value'] = data['pred'].copy()
                known = data['output'] != dat.CPG_NAN
                data['value'][known] = data['output'][known]

                # `cpg` outputs are named after their last component; all
                # others use every component joined by underscores.
                parts = output_name.split(dat.OUTPUT_SEP)
                if parts[0] == 'cpg':
                    base_name = parts[-1]
                else:
                    base_name = '_'.join(parts)
                out_file = os.path.join(opts.out_dir, base_name)

                if opts.out_format == 'bedGraph':
                    write_to_bedGraph(data, out_file + '.bedGraph.gz',
                                      compression='gzip')
                elif opts.out_format == 'hdf':
                    write_to_hdf(data, out_file + '.h5')
                else:
                    # The message was previously assembled but never passed
                    # to the exception.
                    raise ValueError('Invalid output format "%s"!'
                                     % opts.out_format)

        log.info('Done!')

        return 0
Esempio n. 8
0
    def main(self, name, opts):
        """Compute performance metrics and curves, globally and per annotation.

        Reads ``chromo``, ``pos``, ``outputs`` and ``preds`` from
        ``opts.data_file`` and orders the samples by chromosome and position.
        Metrics, and the curves named in ``opts.curves``, are computed over
        all samples. For every file in ``opts.anno_files`` with at least
        ``opts.anno_min_sites`` annotated samples, metrics and the curves
        named in ``opts.anno_curves`` are computed on the annotated samples
        only. Everything is stored via ``self.save_report`` under the names
        ``metrics`` and ``curves``.

        Args:
            name: Name of the logger to use.
            opts: Parsed options. Reads ``log_file``, ``verbose``,
                ``curves``, ``anno_curves``, ``data_file``, ``output_names``,
                ``nb_output``, ``nb_sample``, ``nb_curve_point``,
                ``anno_files``, ``anno_min_sites`` and ``out_dir``.

        Returns:
            0 on success.
        """
        logging.basicConfig(filename=opts.log_file,
                            format='%(levelname)s (%(asctime)s): %(message)s')
        log = logging.getLogger(name)
        if opts.verbose:
            log.setLevel(logging.DEBUG)
        else:
            log.setLevel(logging.INFO)
        log.debug(opts)

        self.opts = opts
        self.log = log

        # Get performance curve functions from names.
        curve_funs = dict()
        if opts.curves:
            for curve_name in opts.curves:
                curve_funs[curve_name] = get_curve_fun(curve_name)
        anno_curve_funs = dict()
        if opts.anno_curves:
            for curve_name in opts.anno_curves:
                anno_curve_funs[curve_name] = get_curve_fun(curve_name)

        log.info('Loading data ...')
        # Read and sort predictions and outputs.
        output_names = dat.get_output_names(opts.data_file,
                                            regex=opts.output_names,
                                            nb_key=opts.nb_output)
        names = {
            'chromo': None,
            'pos': None,
            'outputs': output_names,
            'preds': output_names
        }
        data = hdf.read(opts.data_file, names, nb_sample=opts.nb_sample)
        # Chromosome names come back as bytes; convert them to str.
        data['chromo'] = [chromo.decode() for chromo in data['chromo']]
        data['chromo'] = np.array(data['chromo'])
        data = fold_dict(data, nb_level=1)
        # The last lexsort key is the primary one: chromosome, then position.
        idx = np.lexsort((data['pos'], data['chromo']))
        data = slice_dict(data, idx)
        # Sanity check: positions must be sorted within each chromosome.
        for chromo in np.unique(data['chromo']):
            chromo_pos = data['pos'][data['chromo'] == chromo]
            tmp = np.sort(chromo_pos)
            assert np.all(chromo_pos == tmp)
        log.info('%d samples' % len(data['pos']))

        reports = []
        curves = []

        log.info('Evaluating globally ...')
        # Evaluate performances globally.
        report = ev.evaluate_outputs(data['outputs'], data['preds'])
        report['anno'] = ANNO_GLOBAL
        reports.append(report)
        pd.set_option('display.width', 1000)
        print(ev.unstack_report(report))

        # Global performance curves.
        for curve_name, fun in curve_funs.items():
            log.info('%s curve' % curve_name)
            curve = ev.evaluate_curve(data['outputs'],
                                      data['preds'],
                                      fun=fun,
                                      nb_point=opts.nb_curve_point)
            if curve is not None:
                curve['curve'] = curve_name
                curve['anno'] = ANNO_GLOBAL
                curves.append(curve)

        if opts.anno_files:
            log.info('Evaluating annotations ...')
            # Evaluate annotations.
            for anno_file in opts.anno_files:
                anno = read_anno_file(anno_file)
                anno_name = os.path.splitext(os.path.basename(anno_file))[0]
                anno_idx = annotate(data['chromo'], data['pos'], anno)
                nb_site = anno_idx.sum()
                log.info('%s: %d' % (anno_name, nb_site))
                if nb_site < opts.anno_min_sites:
                    log.info('Skipping due to insufficient annotated sites!')
                    continue
                # Select data at annotated sites.
                anno_data = slice_dict(data, anno_idx)
                report = ev.evaluate_outputs(anno_data['outputs'],
                                             anno_data['preds'])
                report['anno'] = anno_name
                reports.append(report)

                # Curves for this annotation: evaluated on the annotated
                # sites (not the full data set) and driven solely by
                # `anno_curve_funs`, independent of the global curves.
                for curve_name, fun in anno_curve_funs.items():
                    log.info('%s curve' % curve_name)
                    curve = ev.evaluate_curve(anno_data['outputs'],
                                              anno_data['preds'],
                                              fun=fun,
                                              nb_point=opts.nb_curve_point)
                    if curve is not None:
                        curve['curve'] = curve_name
                        curve['anno'] = anno_name
                        curves.append(curve)

        make_dir(opts.out_dir)
        if reports:
            report = pd.concat(reports)
            report = report[['anno', 'metric', 'output', 'value']]
            self.save_report(report, 'metrics')
        if curves:
            curves = pd.concat(curves)
            curves = curves[['anno', 'curve', 'output', 'x', 'y', 'thr']]
            self.save_report(curves, 'curves')

        log.info('Done!')

        return 0