Beispiel #1
0
 def __init__(self, pipeline):
     """Declare the ``mock_table`` and ``mock_pileup`` options.

     Both options are strings whose values are passed through a file
     validator from ``cv`` with the pipeline's ``cfg_path`` pre-bound.

     Args:
         pipeline: object providing ``cfg_path``; also forwarded to the
             parent initializer.
     """
     cfg_path = pipeline.cfg_path
     table_conv = partial(cv.rel_file_rw_validator, cfg_path=cfg_path)
     pileup_conv = partial(cv.rel_file_r_validator, cfg_path=cfg_path)
     table_opt = ('mock_table', cv.Annot(str, converter=table_conv))
     pileup_opt = ('mock_pileup', cv.Annot(str, converter=pileup_conv))
     super().__init__(pipeline, cfg_req=[table_opt, pileup_opt])
Beispiel #2
0
 def __init__(self, pipeline):
     """Declare the module's configuration options.

     Options (name, type, default):
         * ``kmer_length`` -- int, 7, converted with ``cv.nonneg_integer``
         * ``extra_flags`` -- list, empty list
         * ``outdir_name`` -- str, ``'fastQC'``

     Args:
         pipeline: forwarded unchanged to the parent initializer.
     """
     cfg_fmt = [
         ('kmer_length',
          cv.Annot(int, default=7, converter=cv.nonneg_integer)),
         ('extra_flags', cv.Annot(list, default=[])),
         # The default is passed by keyword, like the other options in this
         # list, so it does not depend on cv.Annot's positional order.
         ('outdir_name', cv.Annot(str, default='fastQC')),
     ]
     super().__init__(pipeline, cfg_req=cfg_fmt)
Beispiel #3
0
 def __init__(self, pipeline):
     """Declare the ``clip_len`` and ``clipped_5prime_bc`` options.

     ``clip_len`` is an int defaulting to 10 (converter
     ``cv.nonneg_integer``); ``clipped_5prime_bc`` is a bool defaulting to
     False (converter ``cv.boolean``).

     Args:
         pipeline: forwarded unchanged to the parent initializer.
     """
     clip_len_annot = cv.Annot(int, default=10, converter=cv.nonneg_integer)
     barcode_annot = cv.Annot(bool, default=False, converter=cv.boolean)
     super().__init__(pipeline, cfg_req=[
         ('clip_len', clip_len_annot),
         ('clipped_5prime_bc', barcode_annot),
     ])
Beispiel #4
0
    def __init__(self, pipeline):
        """Declare the module's configuration options.

        Options (name, type, default):
            * ``output_prefix`` -- str, no default given here
            * ``min_cov`` -- int, 5, converted with ``cv.nonneg_integer``
            * ``y_axis_limit`` -- float, 0
            * ``remove_tmp_files`` -- bool, True

        Args:
            pipeline: forwarded unchanged to the parent initializer.
        """
        prefix_annot = cv.Annot(str)
        min_cov_annot = cv.Annot(int, default=5, converter=cv.nonneg_integer)
        y_limit_annot = cv.Annot(float, default=0)
        cleanup_annot = cv.Annot(bool, default=True)
        options = [
            ('output_prefix', prefix_annot),
            ('min_cov', min_cov_annot),
            ('y_axis_limit', y_limit_annot),
            ('remove_tmp_files', cleanup_annot),
        ]
        super().__init__(pipeline, cfg_req=options)
Beispiel #5
0
 def __init__(self, pipeline):
     """Declare the module's configuration options.

     Options (name, type, default):
         * ``file_postfix`` -- str, ``'fil'``
         * ``padding_bp`` -- int, 10, converted with ``cv.nonneg_integer``
         * ``features`` -- list, empty list
         * ``filter_gff`` -- str, no default given here, converted with
           ``cv.rel_file_r_validator`` bound to the pipeline's ``cfg_path``

     Args:
         pipeline: object providing ``cfg_path``; also forwarded to the
             parent initializer.
     """
     gff_conv = partial(cv.rel_file_r_validator, cfg_path=pipeline.cfg_path)
     postfix_annot = cv.Annot(str, default='fil')
     padding_annot = cv.Annot(int, default=10, converter=cv.nonneg_integer)
     features_annot = cv.Annot(list, default=[])
     gff_annot = cv.Annot(str, converter=gff_conv)
     super().__init__(pipeline, cfg_req=[
         ('file_postfix', postfix_annot),
         ('padding_bp', padding_annot),
         ('features', features_annot),
         ('filter_gff', gff_annot),
     ])
Beispiel #6
0
 def __init__(self, pipeline):
     """Declare the module's configuration options.

     Options (name, type, default):
         * ``sites_file`` -- str, no default given here
         * ``gff_file`` -- str, ``''``
         * ``fasta_file`` -- str, no default given here
         * ``transition_nucleotide`` -- str, ``'T'``, converted with
           ``cv.in_set_validator`` bound to ``item_set='ACGT'``

     The three file options use ``cv`` file validators with the pipeline's
     ``cfg_path`` pre-bound.

     Args:
         pipeline: object providing ``cfg_path``; also forwarded to the
             parent initializer.
     """
     cfg_path = pipeline.cfg_path
     rw_file_conv = partial(cv.rel_file_rw_validator, cfg_path=cfg_path)
     r_file_conv = partial(cv.rel_file_r_validator, cfg_path=cfg_path)
     nucleotide_conv = partial(cv.in_set_validator, item_set='ACGT')
     options = [
         ('sites_file', cv.Annot(str, converter=rw_file_conv)),
         ('gff_file', cv.Annot(str, default='', converter=r_file_conv)),
         ('fasta_file', cv.Annot(str, converter=r_file_conv)),
         (
             'transition_nucleotide',
             cv.Annot(str, default='T', converter=nucleotide_conv),
         ),
     ]
     super().__init__(pipeline, cfg_req=options)
Beispiel #7
0
 def __init__(self, pipeline):
     """Declare the module's configuration options.

     No option passes an explicit type to ``cv.Annot``; only converters
     (and one default) are given:
         * ``mock_model`` -- ``cv.rel_file_rw_validator`` with ``cfg_path``
         * ``mock_statistics`` -- ``cv.rel_file_r_validator`` with ``cfg_path``
         * ``n_mixture_components`` -- ``cv.nonneg_integer``
         * ``em_iterations`` -- ``cv.nonneg_integer``, default 250

     Args:
         pipeline: object providing ``cfg_path``; also forwarded to the
             parent initializer.
     """
     cfg_path = pipeline.cfg_path
     model_conv = partial(cv.rel_file_rw_validator, cfg_path=cfg_path)
     stats_conv = partial(cv.rel_file_r_validator, cfg_path=cfg_path)
     super().__init__(pipeline, cfg_req=[
         ('mock_model', cv.Annot(converter=model_conv)),
         ('mock_statistics', cv.Annot(converter=stats_conv)),
         ('n_mixture_components', cv.Annot(converter=cv.nonneg_integer)),
         ('em_iterations',
          cv.Annot(default=250, converter=cv.nonneg_integer)),
     ])
Beispiel #8
0
 def __init__(self, pipeline):
     """Declare the module's configuration options.

     Options (name, type, default):
         * ``genome_index`` -- str, no default given here, converted with
           ``cv.rel_mapindex_validator`` bound to the pipeline's ``cfg_path``
         * ``n_mismatch`` -- int, 1, converted with ``cv.nonneg_integer``
         * ``n_multimap`` -- int, 1, converted with ``cv.nonneg_integer``
         * ``extra_flags`` -- str, default ``[]``, converted with
           ``cv.comma_sep_args``

     Args:
         pipeline: object providing ``cfg_path``; also forwarded to the
             parent initializer.
     """
     index_conv = partial(cv.rel_mapindex_validator, cfg_path=pipeline.cfg_path)
     index_annot = cv.Annot(str, converter=index_conv)
     mismatch_annot = cv.Annot(int, default=1, converter=cv.nonneg_integer)
     multimap_annot = cv.Annot(int, default=1, converter=cv.nonneg_integer)
     flags_annot = cv.Annot(str, default=[], converter=cv.comma_sep_args)
     options = [
         ('genome_index', index_annot),
         ('n_mismatch', mismatch_annot),
         ('n_multimap', multimap_annot),
         ('extra_flags', flags_annot),
     ]
     super().__init__(pipeline, cfg_req=options)
Beispiel #9
0
 def __init__(self, pipeline):
     """Declare the module's configuration options.

     Integer options (``kmer_k``=3, ``first_index``=0, ``last_index``=1500,
     ``width``=50, ``gff_padding``=20) all use ``cv.nonneg_integer`` as
     converter. ``genome_fasta`` and ``gff_exclude_path`` use validators
     bound to the pipeline's ``cfg_path``; ``gff_exclude_path`` is declared
     with ``warn_if_missing=False``.

     Args:
         pipeline: object providing ``cfg_path``; also forwarded to the
             parent initializer.
     """
     def nonneg_int(default):
         # int option whose value is converted with cv.nonneg_integer
         return cv.Annot(int, default=default, converter=cv.nonneg_integer)

     cfg_path = pipeline.cfg_path
     exclude_conv = partial(opt_file_validator, cfg_path=cfg_path)
     fasta_conv = partial(cv.rel_genome_validator, cfg_path=cfg_path)
     options = [
         ('genome_fasta', cv.Annot(str, converter=fasta_conv)),
         ('output_prefix', cv.Annot(str)),
         ('kmer_k', nonneg_int(3)),
         ('first_index', nonneg_int(0)),
         ('last_index', nonneg_int(1500)),
         ('width', nonneg_int(50)),
         ('sort_key',
          cv.Annot(str, default='occupancy', converter=sort_key_validator)),
         ('gff_exclude_path',
          cv.Annot(str, default='', converter=exclude_conv,
                   warn_if_missing=False)),
         ('gff_padding', nonneg_int(20)),
         ('remove_tmp_files', cv.Annot(bool, default=True)),
     ]
     super().__init__(pipeline, cfg_req=options)
Beispiel #10
0
 def __init__(self, pipeline):
     """Declare the module's configuration options.

     ``genome_fasta``, ``gff_exclude_path`` and ``negative_set_gff`` use
     validators bound to the pipeline's ``cfg_path``; ``gff_exclude_path``
     is declared with ``warn_if_missing=False``. ``kmer_k`` (default 3) and
     ``n_negative_seqs`` (default 20000) use ``cv.nonneg_integer``.

     Args:
         pipeline: object providing ``cfg_path``; also forwarded to the
             parent initializer.
     """
     cfg_path = pipeline.cfg_path
     exclude_conv = partial(opt_file_validator, cfg_path=cfg_path)
     negative_conv = partial(cv.rel_file_r_validator, cfg_path=cfg_path)
     fasta_conv = partial(cv.rel_genome_validator, cfg_path=cfg_path)
     options = [
         ('genome_fasta', cv.Annot(str, converter=fasta_conv)),
         ('output_prefix', cv.Annot(str)),
         ('kmer_k',
          cv.Annot(int, default=3, converter=cv.nonneg_integer)),
         ('sort_key',
          cv.Annot(str, default='occ', converter=sort_key_validator)),
         ('gff_exclude_path',
          cv.Annot(str, default='', converter=exclude_conv,
                   warn_if_missing=False)),
         ('use_quantiles', cv.Annot(bool, default=True)),
         ('negative_set_gff', cv.Annot(str, converter=negative_conv)),
         ('n_negative_seqs',
          cv.Annot(int, default=20000, converter=cv.nonneg_integer)),
         ('remove_tmp_files', cv.Annot(bool, default=True)),
     ]
     super().__init__(pipeline, cfg_req=options)
Beispiel #11
0
    def __init__(self, pipeline, keep_all=False, cfg_req=None):
        """Initializes a new module with empty list of queued commands

        The options ``keep_all``, ``module_info`` and ``skip`` are appended
        after the module-specific options, so an entry with one of these
        names in ``cfg_req`` has its annotation replaced by the built-in one.

        Args:
            pipeline: the pipeline the module is queued in
            keep_all (:obj:`boolean`): do not remove any temporary files
            cfg_req (:obj:`list`): list of ``(name, cv.Annot)`` configuration
                options; ``None`` (the default) stands for no module-specific
                options. The passed list is left unmodified.
        """
        # Work on a private copy. With a ``[]`` default and an in-place
        # ``extend`` the shared default list grew on every call, and lists
        # passed in by callers were modified behind their back.
        options = [] if cfg_req is None else list(cfg_req)
        options.extend([
            ('keep_all',
             cv.Annot(cv.boolean, default=False, warn_if_missing=False)),
            ('module_info', cv.Annot(str, default='', warn_if_missing=False)),
            ('skip', cv.Annot(cv.boolean, default=False,
                              warn_if_missing=False)),
        ])
        # names of the options every module gets, independent of cfg_req
        self._default_parameters = {'keep_all', 'module_info', 'skip'}
        # option name -> annotation, in declaration order
        self._cfg_req = OrderedDict(options)
        self._keep_all = keep_all
        self._tmp_files = []
        self._intermed_files = []
        self._cmds = []
        self._pipeline = pipeline
Beispiel #12
0
 def __init__(self, pipeline):
     """Declare the module's configuration options.

     Options (name, type, default):
         * ``plot_dir`` -- str, ``'mockinbird_plots'``
         * ``max_k_mock`` -- int, 10, converted with ``cv.nonneg_integer``
         * ``min_k`` -- int, 2, converted with ``cv.nonneg_integer``
         * ``min_post`` -- float, 0.1
         * ``extra_args`` -- list, empty list
         * ``null_fraction`` -- float, 1

     Args:
         pipeline: forwarded unchanged to the parent initializer.
     """
     def nonneg_int(default):
         # int option whose value is converted with cv.nonneg_integer
         return cv.Annot(int, default=default, converter=cv.nonneg_integer)

     super().__init__(pipeline, cfg_req=[
         ('plot_dir', cv.Annot(str, default='mockinbird_plots')),
         ('max_k_mock', nonneg_int(10)),
         ('min_k', nonneg_int(2)),
         ('min_post', cv.Annot(float, default=0.1)),
         ('extra_args', cv.Annot(list, default=[])),
         ('null_fraction', cv.Annot(float, default=1)),
     ])
Beispiel #13
0
 def __init__(self, pipeline):
     """Declare the module's configuration options.

     Five int options use ``cv.nonneg_integer`` as converter:
     ``remove_n_edge_mut`` (0), ``max_mut_per_read`` (1),
     ``min_base_quality`` (0), ``min_avg_ali_quality`` (20) and
     ``min_mismatch_quality`` (20). ``dump_raw_data`` is declared with
     ``cv.boolean`` and defaults to False; ``outdir_name`` is a str
     defaulting to ``'bam_analysis'``.

     Args:
         pipeline: forwarded unchanged to the parent initializer.
     """
     def nonneg_int(default):
         # int option whose value is converted with cv.nonneg_integer
         return cv.Annot(int, default=default, converter=cv.nonneg_integer)

     options = [
         ('remove_n_edge_mut', nonneg_int(0)),
         ('max_mut_per_read', nonneg_int(1)),
         ('min_base_quality', nonneg_int(0)),
         ('min_avg_ali_quality', nonneg_int(20)),
         ('min_mismatch_quality', nonneg_int(20)),
         ('dump_raw_data', cv.Annot(cv.boolean, default=False)),
         ('outdir_name', cv.Annot(str, default='bam_analysis')),
     ]
     super().__init__(pipeline, cfg_req=options)
Beispiel #14
0
 def __init__(self, pipeline):
     """Declare the module's configuration options.

     Options (name, type, default):
         * ``genome_index`` -- str, no default given here, converted with
           ``cv.rel_mapindex_validator`` bound to the pipeline's ``cfg_path``
         * ``n_mismatch`` -- int, 1, converted with ``cv.nonneg_integer``
         * ``n_multimap`` -- int, 1, converted with ``cv.nonneg_integer``
         * ``extra_flags`` -- list, empty list
         * ``allow_soft_clipping`` -- declared with ``cv.boolean``, True
         * ``outdir_name`` -- str, ``'star_out'``

     Args:
         pipeline: object providing ``cfg_path``; also forwarded to the
             parent initializer.
     """
     index_conv = partial(cv.rel_mapindex_validator, cfg_path=pipeline.cfg_path)
     index_annot = cv.Annot(str, converter=index_conv)
     mismatch_annot = cv.Annot(int, default=1, converter=cv.nonneg_integer)
     multimap_annot = cv.Annot(int, default=1, converter=cv.nonneg_integer)
     options = [
         ('genome_index', index_annot),
         ('n_mismatch', mismatch_annot),
         ('n_multimap', multimap_annot),
         ('extra_flags', cv.Annot(list, default=[])),
         ('allow_soft_clipping', cv.Annot(cv.boolean, default=True)),
         ('outdir_name', cv.Annot(str, default='star_out')),
     ]
     super().__init__(pipeline, cfg_req=options)
Beispiel #15
0
 def __init__(self, pipeline):
     """Declare the single ``min_transitions`` option (default 2).

     The option is declared with ``cv.nonneg_integer`` as the first
     ``cv.Annot`` argument.

     Args:
         pipeline: forwarded unchanged to the parent initializer.
     """
     transitions_annot = cv.Annot(cv.nonneg_integer, default=2)
     super().__init__(pipeline, cfg_req=[('min_transitions', transitions_annot)])
Beispiel #16
0
 def __init__(self, pipeline):
     """Declare the single ``gff_file`` option (str, default ``''``).

     The value is converted with ``cv.rel_file_r_validator`` bound to the
     pipeline's ``cfg_path``.

     Args:
         pipeline: object providing ``cfg_path``; also forwarded to the
             parent initializer.
     """
     gff_conv = partial(cv.rel_file_r_validator, cfg_path=pipeline.cfg_path)
     gff_annot = cv.Annot(str, default='', converter=gff_conv)
     super().__init__(pipeline, cfg_req=[('gff_file', gff_annot)])
Beispiel #17
0
def run(args):
    """Run the preprocessing pipeline described by a config file.

    Sets up logging to stdout and to ``preprocess.log`` inside the output
    directory, validates the ``general`` and ``reads`` config sections,
    collects the initial input files, then queues and runs the pipeline.
    The process exits with status 1 if one of the mandatory sections
    (``pipeline``, ``general``, ``reads``) is missing or if ``general`` or
    ``reads`` fails validation.

    Args:
        args: parsed command line arguments; the attributes read here are
            ``parclip_fastq``, ``output_dir``, ``prefix``, ``log_level``
            and ``config_file``.
    """
    inputfile = args.parclip_fastq
    outputdir = args.output_dir
    prefix = args.prefix

    prepare_dir_or_die(outputdir)

    # activate logging: the root logger writes to stdout and to the log file
    logging_file = os.path.join(outputdir, 'preprocess.log')
    logger = logging.getLogger()
    logger.setLevel(LOG_LEVEL_MAP[args.log_level])
    formatter = logging.Formatter(LOG_DEFAULT_FORMAT)

    console_handler = logging.StreamHandler(stream=sys.stdout)
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    # mode='w': the log file is overwritten on every run
    file_handler = logging.FileHandler(logging_file, mode='w')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    logger.info('mockinbird version: %s', __version__)
    logger.info('working directory: %s', os.getcwd())
    logger.info('started preprocessing via %r', ' '.join(sys.argv))

    config = mu.parse_yaml(args.config_file)

    def relpath_conv(file_path):
        return cv.rel_file_r_validator(file_path, args.config_file)

    def genomefasta_validator(file_path):
        return cv.rel_genome_validator(file_path, args.config_file)

    general_fmt = OrderedDict([
        ('adapter5prime', cv.Annot(str, converter=cv.dnastr_validator)),
        ('adapter3prime', cv.Annot(str, converter=cv.dnastr_validator)),
        ('genomefasta', cv.Annot(str, converter=genomefasta_validator)),
        ('normalization_pileup', cv.Annot(str, converter=relpath_conv)),
        ('rmTemp', cv.Annot(cv.boolean, default=True)),
        ('n_threads', cv.Annot(int, default=2)),
    ])

    reads_fmt = OrderedDict([
        ('bc_5prime', cv.Annot(int, default=0, converter=cv.nonneg_integer)),
        ('bc_3prime', cv.Annot(int, default=0, converter=cv.nonneg_integer)),
        ('min_len', cv.Annot(int, default=20, converter=cv.nonneg_integer)),
        ('reference_nucleotide', cv.Annot(str, default='T', converter=cv.dnanuc_validator)),
        ('mutation_nucleotide', cv.Annot(str, default='C', converter=cv.dnanuc_validator)),
    ])

    mandatory_sections = 'pipeline', 'general', 'reads'
    for section in mandatory_sections:
        if section not in config:
            logger.error('the config file does not define the mandatory section %s', section)
            sys.exit(1)

    def validate_or_die(section, fmt):
        # Validate one config section; log and exit(1) on a ConfigError.
        try:
            return cv.validate_section(config[section], fmt)
        except cv.ConfigError:
            logger.error('Error while parsing section %r', section)
            sys.exit(1)

    general_cfg = validate_or_die('general', general_fmt)
    # command line values are stored next to the validated config values
    general_cfg['prefix'] = prefix
    general_cfg['output_dir'] = outputdir

    reads_cfg = validate_or_die('reads', reads_fmt)

    initial_files = {'fastq': inputfile}

    # A ``custom_files`` key without a value presumably comes back as None
    # from mu.parse_yaml (TODO confirm); treat that like an absent section
    # instead of failing on ``None.items()``.
    custom_files = config['custom_files'] if 'custom_files' in config else None
    if custom_files:
        for fmt, path in custom_files.items():
            # entries whose value is not a string are skipped
            if not isinstance(path, str):
                continue
            try:
                initial_files[fmt] = relpath_conv(path)
            except ValueError:
                logger.warning('key %r: invalid file path %r', fmt, path)

    gencfg = {
        'reads': reads_cfg,
        'general': general_cfg,
    }
    pipeline = pl.Pipeline(initial_files=initial_files, general_cfg=gencfg,
                           cfg_path=args.config_file)
    mu.queue_pipeline(config, pipeline, def_lookup_path='mockinbird.utils.preprocess_modules')
    mu.run_pipeline(pipeline)

    if general_cfg['rmTemp']:
        pipeline.cleanup()

    logger.info('all done. See you soon!')
Beispiel #18
0
 def __init__(self, pipeline):
     """Declare the single ``max_quantile`` option (float, default 0.95).

     Args:
         pipeline: forwarded unchanged to the parent initializer.
     """
     quantile_annot = cv.Annot(float, default=0.95)
     super().__init__(pipeline, cfg_req=[('max_quantile', quantile_annot)])
Beispiel #19
0
 def __init__(self, pipeline):
     """Declare the single ``outdir_name`` option (str, ``'bam_analysis'``).

     Args:
         pipeline: forwarded unchanged to the parent initializer.
     """
     cfg_fmt = [
         # The default is passed by keyword so it does not depend on
         # cv.Annot's positional parameter order.
         ('outdir_name', cv.Annot(str, default='bam_analysis')),
     ]
     super().__init__(pipeline, cfg_req=cfg_fmt)
Beispiel #20
0
 def __init__(self, pipeline):
     """Declare the single ``quality_cutoff`` option (int, default 30).

     The value is converted with ``cv.nonneg_integer``.

     Args:
         pipeline: forwarded unchanged to the parent initializer.
     """
     cutoff_annot = cv.Annot(int, default=30, converter=cv.nonneg_integer)
     super().__init__(pipeline, cfg_req=[('quality_cutoff', cutoff_annot)])
Beispiel #21
0
 def __init__(self, pipeline):
     """Declare the single ``extra_args`` option (list, default empty).

     Args:
         pipeline: forwarded unchanged to the parent initializer.
     """
     extra_args_annot = cv.Annot(list, default=[])
     super().__init__(pipeline, cfg_req=[('extra_args', extra_args_annot)])
Beispiel #22
0
 def __init__(self, pipeline):
     """Declare the module's configuration options.

     ``gff_file`` is converted with ``cv.rel_file_r_validator`` bound to the
     pipeline's ``cfg_path``. ``output_prefix``, ``labelCenterA``,
     ``labelCenterB`` and ``labelBody`` are plain str options without a
     default given here. All int options use ``cv.nonneg_integer``;
     ``remove_tmp_files`` is a bool defaulting to True.

     Args:
         pipeline: object providing ``cfg_path``; also forwarded to the
             parent initializer.
     """
     def nonneg_int(default):
         # int option whose value is converted with cv.nonneg_integer
         return cv.Annot(int, default=default, converter=cv.nonneg_integer)

     gff_conv = partial(cv.rel_file_r_validator, cfg_path=pipeline.cfg_path)
     options = [
         ('gff_file', cv.Annot(str, converter=gff_conv)),
         ('output_prefix', cv.Annot(str)),
         ('downstream_bp', nonneg_int(1000)),
         ('upstream_bp', nonneg_int(1000)),
         ('gene_bp', nonneg_int(750)),
         ('min_trscr_size_bp', nonneg_int(1500)),
         ('max_trscr_size_bp', nonneg_int(100000)),
         ('smoothing_window', nonneg_int(20)),
         ('labelCenterA', cv.Annot(str)),
         ('labelCenterB', cv.Annot(str)),
         ('labelBody', cv.Annot(str)),
         ('remove_tmp_files', cv.Annot(bool, default=True)),
         ('bootstrap_iter', nonneg_int(2500)),
         ('n_processes', nonneg_int(4)),
     ]
     super().__init__(pipeline, cfg_req=options)
Beispiel #23
0
 def __init__(self, pipeline):
     """Declare the ``pval_threshold`` and ``min_cov`` options.

     ``pval_threshold`` is a float defaulting to 0.005; ``min_cov`` is an
     int defaulting to 2, converted with ``cv.nonneg_integer``.

     Args:
         pipeline: forwarded unchanged to the parent initializer.
     """
     pval_annot = cv.Annot(float, default=0.005)
     min_cov_annot = cv.Annot(int, default=2, converter=cv.nonneg_integer)
     super().__init__(pipeline, cfg_req=[
         ('pval_threshold', pval_annot),
         ('min_cov', min_cov_annot),
     ])
Beispiel #24
0
 def __init__(self, pipeline):
     """Declare the single ``mut_snp_ratio`` option (float, default 0.75).

     Args:
         pipeline: forwarded unchanged to the parent initializer.
     """
     ratio_annot = cv.Annot(float, default=0.75)
     super().__init__(pipeline, cfg_req=[('mut_snp_ratio', ratio_annot)])
Beispiel #25
0
 def __init__(self, pipeline):
     """Declare the module's configuration options.

     ``gff_file`` is converted with ``cv.rel_file_r_validator`` bound to the
     pipeline's ``cfg_path``; ``output_prefix`` is a plain str option without
     a default given here. All int options use ``cv.nonneg_integer``;
     ``remove_tmp_files`` is a bool defaulting to True.

     Args:
         pipeline: object providing ``cfg_path``; also forwarded to the
             parent initializer.
     """
     def nonneg_int(default):
         # int option whose value is converted with cv.nonneg_integer
         return cv.Annot(int, default=default, converter=cv.nonneg_integer)

     gff_conv = partial(cv.rel_file_r_validator, cfg_path=pipeline.cfg_path)
     options = [
         ('gff_file', cv.Annot(str, converter=gff_conv)),
         ('output_prefix', cv.Annot(str)),
         ('downstream_bp', nonneg_int(500)),
         ('upstream_bp', nonneg_int(1000)),
         ('min_trscr_size_bp', nonneg_int(0)),
         ('max_trscr_size_bp', nonneg_int(5000)),
         ('xbins', nonneg_int(500)),
         ('ybins', nonneg_int(500)),
         ('x_pixels', nonneg_int(500)),
         ('y_pixels', nonneg_int(500)),
         ('remove_tmp_files', cv.Annot(bool, default=True)),
     ]
     super().__init__(pipeline, cfg_req=options)