Example 1
0
def compare_pairwise(run):
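    """ Computes pairwise genotype concordance for every pair of samples (including
        self-pairs) across all projects in the run: the fraction of matching alleles
        over the run's SNP locations, counting no-calls as mismatches. Returns a
        symmetric dict keyed by sample name.
    """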
    import itertools as it
    from collections import defaultdict
    all_samples = [s for p in run.projects for s in p.samples]
    pairwise_dict = defaultdict(dict)
    for s1, s2 in it.combinations_with_replacement(all_samples, 2):
        snps_a_by_rsid = s1.snps_from_run(run)
        snps_b_by_rsid = s2.snps_from_run(run)
        matches = 0
        total_locs = 0
        for i, l in enumerate(run.locations):
            snp_a = snps_a_by_rsid[l.rsid]
            snp_b = snps_b_by_rsid[l.rsid]
            seq_a, seq_b = snp_a.get_gt(), snp_b.get_gt()
            if seq_a == 'NN' or seq_b == 'NN':
                pass
            elif seq_a == seq_b:
                matches += 2
            elif seq_a[0] == seq_b[0] or seq_a[1] == seq_b[1]:
                matches += 1
            total_locs += 2
        dist = matches / total_locs
        log.info(f'   {s1.name} VS {s2.name}: {dist:.2f}')
        pairwise_dict[s1.name][s2.name] = dist
        pairwise_dict[s2.name][s1.name] = dist
    return pairwise_dict
Example 2
0
    def update_batches(samples, silent=False):
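        """ Groups samples into Batch objects by their batch names, assigning each
            sample as the batch's normal or tumor; a batch with only a normal sample
            gets that sample re-labelled as tumor. Links each tumor to its matched
            normal and returns a dict of batches by name.
        """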
        batch_by_name = {bn: Batch(bn) for bn in {b for s in samples for b in s.batch_names}}
        for sample in samples:
            for bn in sample.batch_names:
                batch_by_name[bn].name = bn
                sample.batch = batch_by_name[bn]
                if sample.phenotype == 'normal':
                    if batch_by_name[bn].normal:
                        critical('Multiple normal samples for batch ' + bn)
                    batch_by_name[bn].normal = sample
                else:
                    batch_by_name[bn].tumor = sample

        for batch in batch_by_name.values():
            if batch.normal and not batch.tumor:
                if not silent: info('Batch ' + batch.name + ' contains only normal, treating sample ' + batch.normal.name + ' as tumor')
                batch.normal.phenotype = 'tumor'
                batch.normal.batch = batch
                batch.tumor = batch.normal
                batch.normal = None

        # setting up batch properties
        for b in batch_by_name.values():
            b.tumor.normal_match = b.normal

        return batch_by_name
Example 3
0
def _add_to_ngb(work_dir, project_name, bam_by_sample, genome_build, bed_file,
                p_view):
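    """ Exposes the project's BAM files and the dbSNP track to NGB, if running in the
        US or UK environment and the `az.ngb` module is available. Failures are
        logged but do not interrupt the run.
    """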
    if is_us() or is_uk():
        try:
            from az.ngb import add_bcbio_project_to_ngb, add_data_to_ngb, add_file_to_ngb
        except ImportError:
            log.warn(
                'To expose projects to NGB, install NGS Reporting with `conda install -c vladsaveliev ngs_reporting`'
            )
        else:
            log.info('Exposing project to NGB...')
            try:
                dataset = project_name + '_Fingerprints'
                add_data_to_ngb(work_dir,
                                p_view,
                                bam_by_sample,
                                dict(),
                                dataset,
                                bed_file=bed_file,
                                genome=genome_build)
                add_file_to_ngb(work_dir,
                                get_dbsnp(genome_build),
                                genome_build,
                                dataset,
                                dataset,
                                skip_if_added=True)
            except Exception:
                traceback.print_exc()
                log.err('Error: cannot export to NGB')
            log.info('*' * 70)
Example 4
0
    def _set_date_dir(bcbio_cnf, final_dir, date_dir, create_dir=False, silent=False):
        if not date_dir:
            fc_date = bcbio_cnf.get('fc_date')
            fc_name = bcbio_cnf.get('fc_name') or 'project'
            if fc_date:
                # Date dirpath is from bcbio and named after fc_name, not our own project name
                date_dir = join(final_dir, fc_date + '_' + fc_name)
                if not create_dir and not verify_dir(date_dir, silent=True):
                    critical('Error: no project directory of format {fc_date}_{fc_name} or {fc_name}_{fc_date}')
            else:
                if isdir(join(final_dir, 'project')):  # bcbio-CWL?
                    date_dir = join(final_dir, 'project')
                    if not silent: info('Using the datestamp dir from bcbio-CWL: ' + date_dir)
                else:
                    regexs = [fr'^\d\d\d\d-[01][0-9]-[0-3][0-9]_{fc_name}']
                    date_dirs = [join(final_dir, dirpath)
                                 for dirpath in listdir(final_dir)
                                 if any(re.match(regex, dirpath) for regex in regexs)]
                    if len(date_dirs) == 0:
                        raise NoDateStampsException('Error: no datestamp directory!')
                    elif len(date_dirs) == 1:
                        date_dir = date_dirs[0]
                    else:
                        dates = [(tuple(map(int, basename(d).split('_')[0].split('-'))), d) for d in date_dirs]
                        newest_date, _ = sorted(dates, reverse=True)[0]
                        newest_dirs = [d for date, d in dates if date == newest_date]
                        if len(newest_dirs) > 1:
                            raise MultipleDateStampsException(f'Error: multiple datestamp directories found, '
                               f'and can\'t select the most recent one because there are multiple latest dirs: {newest_dirs}')
                        date_dir = newest_dirs[0]

                    if not silent: info('Using the datestamp dir: ' + date_dir)
        if create_dir:
            safe_mkdir(date_dir)
        return date_dir
Example 5
0
    def find_germline_vcf(self, silent=False, caller=None):
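        """ Looks for the germline VCF that bcbio produced for this batch's normal
            sample, checking both the current and the pre-1.1.6 file naming, and
            sets `self.germline_vcf` if found.
        """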
        caller = caller or self.germline_caller
        if not caller:
            if not silent:
                warn(f'Batch {self.name} has no variant caller info assigned, skipping germline VCF search')
            return
        assert caller

        # in the datestamp dir, starting from bcbio 1.1.6 (~ Dec 2019)
        vcf_fpath_gz = adjust_path(join(self.parent_project.date_dir,
                f'{self.normals[0].name}-germline-{caller}.vcf.gz'))
        # in datestamp. bcbio before 1.1.6
        vcf_old_fpath_gz = adjust_path(join(self.parent_project.date_dir,
                f'{self.normals[0].name}-germline-{caller}-annotated.vcf.gz'))

        if isfile(vcf_fpath_gz):
            verify_file(vcf_fpath_gz, is_critical=True)
            if not silent: info(f'Found germline VCF in <date-dir>/<normal-name>-germline-{caller}.vcf.gz: ' + vcf_fpath_gz)
            self.germline_vcf = vcf_fpath_gz

        elif isfile(vcf_old_fpath_gz):
            verify_file(vcf_old_fpath_gz, is_critical=True)
            if not silent: info(f'Found germline VCF in <date-dir>/<normal-name>-germline-{caller}-annotated.vcf.gz (bcbio < v1.1.6): ' + vcf_old_fpath_gz)
            self.germline_vcf = vcf_old_fpath_gz

        elif not silent:
            warn(f'Could not find germline variants files for batch {self.name}, caller {caller} neither as '
                 f'<date-dir>/<normal-name>-germline-{caller}.vcf.gz, nor as '
                 f'<date-dir>/<normal-name>-germline-{caller}-annotated.vcf.gz (bcbio < v1.1.6)')
Example 6
0
def get_ref_fasta(genome):
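    """ Returns the path to the reference FASTA for `genome`: the AZ reference data
        copy when available, otherwise a genomepy-managed genome (downloaded and
        installed if missing).
    """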
    if is_az():
        path = '/ngs/reference_data/genomes/Hsapiens/' + genome + '/seq/' + genome + '.fa'
        if isfile(path):
            logger.info('Found genome fasta at ' + path)
            return path

    if isdir(join(DATA_DIR, 'genomes', genome)):
        genome_dir = safe_mkdir(join(DATA_DIR, 'genomes'))
    else:
        genome_dir = safe_mkdir(join(DATA_DIR, '..', 'genomes'))
    if genome not in genomepy.list_installed_genomes(genome_dir):
        genome_rec = [
            rec for rec in genomepy.list_available_genomes()
            if rec[1] == genome
        ]
        if genome_rec:
            genome_rec = genome_rec[0]
        else:
            logger.critical('Error: genome ' + genome + ' is not available')
        logger.info('Downloading genome ' + genome + ' from UCSC' +
                    ' and installing into ' + genome_dir)
        genomepy.install_genome(genome, 'UCSC', genome_dir=genome_dir)
    genome_fasta_file = genomepy.Genome(genome, genome_dir=genome_dir).filename
    return genome_fasta_file
Example 7
0
def _split_reference_by_priority(cnf, features_bed_fpath):
    features = ['CDS', 'Exon', 'Transcript', 'Gene']
    info('Splitting the reference file into ' + ', '.join(features))
    features_and_beds = []
    for f in features:
        # bind the feature name at definition time: BedTool.filter is lazy, so a plain closure would capture the last value of f
        features_and_beds.append((f, BedTool(features_bed_fpath).filter(lambda x, feature=f: x[6] == feature)))
    return features_and_beds
Example 8
0
def remove_run(project_names_line_or_id):
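    """ Deletes a fingerprinting run from the database, identified either by its
        numeric ID or by a '--'-separated line of project names.
    """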
    try:
        run_id = int(project_names_line_or_id)
    except ValueError:
        project_names = project_names_line_or_id.split('--')
        projects = Project.query.filter(Project.name.in_(project_names))
        if projects.count() < len(project_names):
            raise RuntimeError(
                'Some projects in ' + str(project_names) +
                ' are not found in the database: ' +
                str(set(project_names) - set(p.name for p in projects)))
        run = Run.find_by_projects(projects)
        if not run:
            raise RuntimeError(
                'Cannot find run ' + str(project_names_line_or_id) +
                ' - some projects are not found in the database: ' +
                str(set(project_names) - set(p.name for p in projects)))
    else:
        run = Run.query.filter(Run.id == run_id).first()
    if run:
        log.info('Deleting run ' + str(run.id))
        run.delete()
        db.session.commit()
    else:
        log.info('Could not find run')
Example 9
0
def batch_callable_bed(bam_files, output_bed_file, work_dir, genome_fasta_file, min_depth,
                       parall_view=None):
    """ Picking random 3 samples and getting a callable for them.
        Trade off between looping through all samples in a huge batch,
        and hitting an sample with outstanding coverage.
    """
    if can_reuse(output_bed_file, bam_files):
        return output_bed_file

    work_dir = safe_mkdir(join(work_dir, 'callable_work'))
    # random.seed(1234)  # seeding random for reproducability
    # bam_files = random.sample(bam_files, min(len(bam_files), 3))

    if parall_view:
        callable_beds = parall_view.run(_calculate, [
            [bf, work_dir, genome_fasta_file, min_depth]
            for bf in bam_files])
    else:
        with parallel_view(len(bam_files), ParallelCfg(threads=len(bam_files)), work_dir) as parall_view:
            callable_beds = parall_view.run(_calculate, [
                [bf, work_dir, genome_fasta_file, min_depth]
                for bf in bam_files])

    good_overlap_sample_fraction = 0.8  # we want to pick those regions that have coverage at 80% of samples
    good_overlap_count = max(1, good_overlap_sample_fraction * len(callable_beds))
    info(f'Intersecting callable regions and picking good overlaps with >={good_overlap_count} '
         f'samples ({100 * good_overlap_sample_fraction}% of {len(callable_beds)})')
    with file_transaction(work_dir, output_bed_file) as tx:
        pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))
        intersection = pybedtools.BedTool() \
            .multi_intersect(i=callable_beds) \
            .filter(lambda r: len(r[4].split(',')) >= good_overlap_count)
        intersection.saveas(tx)
    info(f'Saved to {output_bed_file}')
    return output_bed_file
Example 10
0
def _do_run(cmd, checks, env=None, output_fpath=None, input_fpath=None):
    """Perform running and check results, raising errors for issues.
    """
    cmd, shell_arg, executable_arg = _normalize_cmd_args(cmd)
    s = subprocess.Popen(cmd, shell=shell_arg, executable=executable_arg,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT, close_fds=True, env=env)
    debug_stdout = collections.deque(maxlen=100)
    while 1:
        line = s.stdout.readline()
        if line:
            line = line.decode(errors='replace')
            debug_stdout.append(line)
            info('  ' + line.rstrip())
        exitcode = s.poll()
        if exitcode is not None:
            for line in s.stdout:
                line = line.decode(errors='replace')
                debug_stdout.append(line)
            if exitcode != 0:
                error_msg = " ".join(cmd) if not isinstance(cmd, str) else cmd
                error_msg += "\n"
                error_msg += "".join(debug_stdout)
                s.communicate()
                s.stdout.close()
                raise subprocess.CalledProcessError(exitcode, cmd=cmd, output=error_msg)
            else:
                break
    s.communicate()
    s.stdout.close()
    # Check for problems not identified by shell return codes
    if checks:
        for check in checks:
            if not check(output_fpath, input_fpath):
                raise IOError("External command failed")
Example 11
0
def send_file_for_igv(fpath):
    # handle igv.js Range header which it uses to request a subset of a BAM file:
    range_header = request.headers.get('Range', None)
    if not range_header:
        return send_file(fpath)

    m = re.search(r'(\d+)-(\d*)', range_header)
    if not m:
        error_msg = "ERROR: unexpected range header syntax: %s" % range_header
        log.err(error_msg)
        return error_msg

    size = os.path.getsize(fpath)
    offset = int(m.group(1))
    length = int(m.group(2) or size) - offset

    with open(fpath, 'rb') as f:
        f.seek(offset)
        data = f.read(length)

    rv = Response(data,
                  206,
                  mimetype="application/octet-stream",
                  direct_passthrough=True)
    rv.headers.add(
        'Content-Range', 'bytes {0}-{1}/{2}'.format(offset,
                                                    offset + length - 1, size))

    log.info("GET range request: %s-%s %s" % (m.group(1), m.group(2), fpath))
    return rv
Example 12
0
def cnv_to_bed(cnv_path, out_bed_path):
    with open(cnv_path) as fh:
        parse_fn = None
        header = next(fh).strip().split('\t')

        if header[0].startswith('##fileformat=VCF'):
            # Manta
            info(f'Detected {cnv_path} as caller "manta"')
            parse_fn = iter_manta

        else:
            for caller, hdr in header_by_caller.items():
                if header == hdr:
                    print(
                        f'Parsing {cnv_path} as caller "{caller}" with header {hdr}'
                    )
                    parse_fn = get_iter_cnv(header,
                                            parse_row_by_caller[caller])

        if not parse_fn:
            critical(f'Cannot detect CNV file format in {cnv_path}')

    with open(out_bed_path, 'w') as out:
        writer = csv.writer(out, delimiter='\t')
        for i, call in enumerate(parse_fn(cnv_path)):
            if call:
                bed_row = call.get_bed_raw()
                writer.writerow(bed_row)
                if i == 0:
                    print(bed_row)
                    print('')
Example 13
0
 def adjust_ncpus_per_job(ncpus, max_ncpus_per_job=10, msg='', silent=False):
     """ Adjusting the number of cpus to a number below <max_ncpus_per_job>.
         Say, if we have more than 20 cpus on a node and only 1 batch, we should adjust
         to use only half of that for a batch, so that 2 different jobs (say, AMBER and COBALT)
         can be run in parallel, because using 20 cpus per one job is a waste.
     """
     if ncpus > max_ncpus_per_job:
         # new_ncpus = ncpus
         factor = math.ceil(ncpus / max_ncpus_per_job)
         new_ncpus = ncpus // factor
         # while True:
         #     factor += 1
         #     new_ncpus = ncpus // factor
         #     print(f'ncpus: {ncpus}, factor: {factor}, new_ncpus: {new_ncpus}')
         #     if new_ncpus < max_ncpus_per_job:
         #         print(f'breaking')
         #         break
         if not silent:
             logger.info(
                 (msg if msg else 'The number of cpus per batch is ') + f'{ncpus} >{max_ncpus_per_job}. '
                 f'This is usually wasteful, so we are adjusting it '
                 f'to the number <={max_ncpus_per_job}: {new_ncpus} = {ncpus} // {factor}, so '
                 f'{factor} different rules can be run in parallel (say, AMBER and COBALT '
                 f'at the same time).')
         ncpus = new_ncpus
     return ncpus
Example 14
0
 def _load_bcbio_project(self, bcbio_project_path):
     proj = self._parsed_bcbio_projects_by_path.get(bcbio_project_path)
     if not proj:
         info(f'Loading project {bcbio_project_path}')
         proj = BcbioProject(bcbio_project_path, silent=True)
         self._parsed_bcbio_projects_by_path[bcbio_project_path] = proj
     return proj
Example 15
0
def get_or_create_run(projects, parall_view=None):
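    """ Returns the Run for the given projects: rebuilds its tree if a usercall
        requested a rerun, recreates it if its files are missing, and creates a new
        Run if none exists yet.
    """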
    genomes = set([p.genome for p in projects])
    if len(genomes) > 1:
        log.critical('Error: multiple genomes in projects: ' + str(genomes))
    run = Run.find_by_projects(projects)

    if run and run.rerun_on_usercall:
        log.info()
        log.info('Rebuilding tree on usercall')
        build_tree(run)
        run.rerun_on_usercall = False
        db.session.commit()
        return run

    if run and not Run.is_ready(run):
        log.debug('Tree files do not exist, recreating run for projects ' +
                  ', '.join(p.name for p in projects))
        db.session.delete(run)
        db.session.commit()
        run = None

    if run:
        log.debug('Found run for ' + ', '.join([p.name for p in projects]) +
                  ' with ID ' + str(run.id))
    else:
        log.debug('Creating new run for projects ' +
                  ', '.join(p.name for p in projects))
        run = Run.create(projects, parall_view)
        log.debug('Done creating new run with ID ' + str(run.id))
    return run
Example 16
0
def extract_features(output_file, genome, only_canonical, high_confidence, coding_only,
                     feature_types):
    """ For debug purposes
    """
    debug('Getting features from storage')
    features_bed = ba.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ba.SUPPORTED_GENOMES))

    if high_confidence:
        features_bed = features_bed.filter(ba.high_confidence_filter)
    if only_canonical:
        features_bed = features_bed.filter(ba.get_only_canonical_filter(genome))
    if coding_only:
        features_bed = features_bed.filter(ba.protein_coding_filter)
    # unique_tx_by_gene = find_best_tx_by_gene(features_bed)

    info('Extracting features from Ensembl GTF')
    feature_types = feature_types or ['exon', 'CDS', 'stop_codon', 'transcript']
    features_bed = features_bed.filter(lambda x: x[ba.BedCols.FEATURE] in feature_types)
        # x[ebl.BedCols.ENSEMBL_ID] == unique_tx_by_gene[x[ebl.BedCols.GENE]])

    info('Overlapping regions with Ensembl data')
    features_bed.saveas(output_file)
    debug(f'Saved features to {output_file}')
Example 17
0
    def _set_date_dir(bcbio_cnf, final_dir, date_dir, create_dir=False, silent=False):
        if not date_dir:
            fc_date = bcbio_cnf.get('fc_date')
            fc_name = bcbio_cnf.get('fc_name') or 'project'
            if fc_date:
                # Date dirpath is from bcbio and named after fc_name, not our own project name
                date_dir = join(final_dir, fc_date + '_' + fc_name)
                if not create_dir and not verify_dir(date_dir, silent=True):
                    critical('Error: no project directory of format {fc_date}_{fc_name} or {fc_name}_{fc_date}')
            else:
                if isdir(join(final_dir, 'project')):  # bcbio-CWL?
                    date_dir = join(final_dir, 'project')
                    if not silent: info('Using the datestamp dir from bcbio-CWL: ' + date_dir)
                else:
                    regexs = [fr'^\d\d\d\d-[01][0-9]-[0-3][0-9]_{fc_name}']
                    date_dirs = [join(final_dir, dirpath)
                                 for dirpath in listdir(final_dir)
                                 if any(re.match(regex, dirpath) for regex in regexs)]
                    if len(date_dirs) == 0:
                        raise NoDateStampsException('Error: no datestamp directory!')
                    elif len(date_dirs) == 1:
                        date_dir = date_dirs[0]
                    else:
                        dates = [(tuple(map(int, basename(d).split('_')[0].split('-'))), d) for d in date_dirs]
                        newest_date, _ = sorted(dates, reverse=True)[0]
                        newest_dirs = [d for date, d in dates if date == newest_date]
                        if len(newest_dirs) > 1:
                            raise MultipleDateStampsException(f'Error: multiple datestamp directories found, '
                               f'and can\'t select the most recent one because there are multiple latest dirs: {newest_dirs}')
                        date_dir = newest_dirs[0]

                    if not silent: info('Using the datestamp dir: ' + date_dir)
        if create_dir:
            safe_mkdir(date_dir)
        return date_dir
Example 18
0
def get_padded_bed_file(work_dir, bed, padding, fai_fpath):
    genome_fpath = fai_fpath
    info('Making bed file for padded regions...')
    bedtools = which('bedtools')
    cmdline = '{bedtools} slop -i {bed} -g {genome_fpath} -b {padding}'.format(**locals())
    output_fpath = intermediate_fname(work_dir, bed, 'padded')
    call_process.run(cmdline, output_fpath=output_fpath)
    return output_fpath
Example 19
0
def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
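    """ Infers the sample sex by comparing the average coverage of chrY key (non-PAR)
        regions against the sample's overall average depth. Returns 'F', 'M', or
        None if the sex cannot be determined.
    """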
    debug()
    debug('Determining sex')
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug('The male non-PAR region does not overlap with the capture target - cannot determine sex.')
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))
    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' + str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
             ') - cannot determine sex')
        return None

    if chry_mean_coverage == 0:
        debug('Y depth is 0 - it\'s female')
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage much higher than chrY coverage
            debug('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) + ' times higher than Y depth - it\'s female')
            sex = 'F'
        else:
            debug('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) + ' times higher than Y depth - it\'s male')
            sex = 'M'
    debug('Sex is ' + sex)
    debug()
    return sex
Example 20
0
def get_padded_bed_file(work_dir, bed, padding, fai_fpath):
    genome_fpath = fai_fpath
    info('Making bed file for padded regions...')
    bedtools = which('bedtools')
    cmdline = '{bedtools} slop -i {bed} -g {genome_fpath} -b {padding}'.format(
        **locals())
    output_fpath = intermediate_fname(work_dir, bed, 'padded')
    call_process.run(cmdline, output_fpath=output_fpath)
    return output_fpath
Example 21
0
    def find_qc_files(self, dst_dir, exclude_files=None, include_files=None):
        """
        Parses bcbio MultiQC file list and collects all QC files belonging to this batch

        :param dst_dir: destination directory where the QC files will be copied to
        :param exclude_files: skip files matching these patterns
        :param include_files: only include files matching these patterns
        :return: list of file paths copied into `dst_dir`
        """

        mq_dir = join(self.parent_project.date_dir, 'multiqc')
        mq_filelist = join(mq_dir, 'list_files_final.txt')
        verify_file(mq_filelist, is_critical=True)

        # Cromwell?
        cwl_targz = join(mq_dir, 'multiqc-inputs.tar.gz')
        tar_f_by_fp = dict()
        if isfile(cwl_targz):
            info(f'Found CWL MultiQC output {cwl_targz}, extracting required QC files from the archive')
            if cwl_targz:
                tar = tarfile.open(cwl_targz)
                for member in tar.getmembers():
                    rel_fp = member.name
                    if 'call-multiqc_summary/execution/qc/multiqc/' in rel_fp:
                        rel_fp = rel_fp.split('call-multiqc_summary/execution/qc/multiqc/')[1]
                    tar_f_by_fp[rel_fp] = tar.extractfile(member)

        qc_files_not_found = []
        qc_files_found = []
        with open(mq_filelist) as inp:
            for fp in [l.strip() for l in inp if l.strip()]:
                if fp == 'trimmed' or fp.endswith('/trimmed'):
                    continue  # back-compatibility with bcbio
                if exclude_files:
                    if isinstance(exclude_files, str):
                        exclude_files = [exclude_files]
                    if any(re.search(ptn, fp) for ptn in exclude_files):
                        continue
                if include_files:
                    if isinstance(include_files, str):
                        include_files = [include_files]
                    if not any(re.search(ptn, fp) for ptn in include_files):
                        continue

                new_fp = _extract_qc_file(fp, dst_dir, self.parent_project.final_dir, tar_f_by_fp)
                if not new_fp:
                    qc_files_not_found.append(fp)
                    continue
                else:
                    qc_files_found.append(new_fp)

        if qc_files_not_found:
            warn('-')
            warn(f'Some QC files from list {mq_filelist} were not found:' +
                ''.join('\n  ' + fpath for fpath in qc_files_not_found))
        return qc_files_found
Example 22
0
def run(cmd, output_fpath=None, input_fpaths=None, checks=None, stdout_to_outputfile=True,
        stdout_tx=True, reuse=False, env_vars=None):
    """Run the provided command, logging details and checking for errors.
    """
    if output_fpath and reuse:
        if verify_file(output_fpath, silent=True):
            info(output_fpath + ' exists, reusing')
            return output_fpath
        if not output_fpath.endswith('.gz') and verify_file(output_fpath + '.gz', silent=True):
            info(output_fpath + '.gz exists, reusing')
            return output_fpath

    if input_fpaths is not None:
        if isinstance(input_fpaths, str):
            input_fpaths = [input_fpaths]
        for fpath in input_fpaths:
            verify_file(fpath, is_critical=True)

    env = _get_env(env_vars)
    # info('env: ' + str(env))

    if checks is None:
        checks = [file_nonempty_check]

    def _try_run(_cmd, _output_fpath, _input_fpaths):
        try:
            info(' '.join(str(x) for x in _cmd) if not isinstance(_cmd, str) else _cmd)
            _do_run(_cmd, checks, env, _output_fpath, _input_fpaths)
        except:
            raise

    if output_fpath:
        if isfile(output_fpath):
            os.remove(output_fpath)
    if output_fpath:
        if stdout_tx:
            with file_transaction(None, output_fpath) as tx_out_file:
                if stdout_to_outputfile:
                    cmd += ' > ' + tx_out_file
                else:
                    cmd += '\n'
                    cmd = cmd.replace(' ' + output_fpath + ' ', ' ' + tx_out_file + ' ') \
                             .replace(' "' + output_fpath + '" ', ' ' + tx_out_file + '" ') \
                             .replace(' \'' + output_fpath + '\' ', ' ' + tx_out_file + '\' ') \
                             .replace(' ' + output_fpath + '\n', ' ' + tx_out_file) \
                             .replace(' "' + output_fpath + '"\n', ' ' + tx_out_file + '"') \
                             .replace(' \'' + output_fpath + '\'\n', ' ' + tx_out_file + '\'') \
                             .replace('\n', '')
                _try_run(cmd, tx_out_file, input_fpaths)
        else:
            _try_run(cmd, output_fpath, input_fpaths)

    else:
        _try_run(cmd, None, input_fpaths)
Example 23
0
def ungzip_if_needed(cnf, fpath, silent=False):
    if fpath.endswith('.gz'):
        fpath = fpath[:-3]
    if not file_exists(fpath) and file_exists(fpath + '.gz'):
        gz_fpath = fpath + '.gz'
        cmdline = 'gunzip -c {gz_fpath} > {fpath}'.format(**locals())
        res = run_simple(cmdline)
        if not silent: info()
        if not res:
            return None
    return fpath
Example 24
0
def genotype(samples, snp_bed, parall_view, work_dir, output_dir,
             genome_build):
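    """ Runs VarDict in pileup mode over the fingerprint SNP locations for every
        sample in parallel and returns an OrderedDict mapping sample name to the
        resulting VCF path.
    """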
    genome_fasta_file = get_ref_fasta(genome_build)
    info('** Running VarDict ** ')
    vcfs = parall_view.run(
        _vardict_pileup_sample,
        [[s, work_dir, output_dir, genome_fasta_file, snp_bed]
         for s in samples])
    vcf_by_sample = OrderedDict(zip([s.name for s in samples], vcfs))
    info('** Finished running VarDict **')
    return vcf_by_sample
Example 25
0
def ungzip_if_needed(cnf, fpath, silent=False):
    if fpath.endswith('.gz'):
        fpath = fpath[:-3]
    if not file_exists(fpath) and file_exists(fpath + '.gz'):
        gz_fpath = fpath + '.gz'
        cmdline = 'gunzip -c {gz_fpath}'.format(**locals())
        res = run(cmdline, output_fpath=fpath)
        if not silent: info()
        if not res:
            return None
    return fpath
Example 26
0
def set_up_dirs(proc_name, output_dir=None, work_dir=None, log_dir=None):
    """ Creates output_dir, work_dir, and sets up log
    """
    output_dir = safe_mkdir(adjust_path(output_dir or join(os.getcwd(), proc_name)), 'output_dir')
    debug('Saving results into ' + output_dir)

    work_dir = safe_mkdir(work_dir or join(output_dir, 'work'), 'working directory')
    info('Using work directory ' + work_dir)

    log_fpath = set_up_log(log_dir or safe_mkdir(join(work_dir, 'log')), proc_name + '.log')

    return output_dir, work_dir, log_fpath
Example 27
0
def plot_heatmap(pairwise, run_dir, title):
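    """ Plots the pairwise similarity matrix as a lower-triangle seaborn heatmap and
        saves it as a PNG in `run_dir`. Returns a URL to the image when the path can
        be exposed, otherwise the PNG path.
    """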
    df = pd.DataFrame(data=pairwise)
    log.info(df)

    # Generate a mask for the upper triangle above the main diagonal
    mask = np.zeros_like(df, dtype=bool)
    mask[np.triu_indices_from(mask, k=1)] = True

    # Set up the matplotlib figure
    n = len(pairwise)
    figsize = (n / 2, n * 7 / 20)
    log.info(f'Saving figure of size {figsize}')
    f, ax = plt.subplots(figsize=figsize)  # For 20 samples, take 10x7
    if title:
        ax.set_title(title)

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # Draw the heatmap with the mask and correct aspect ratio
    g = sns.heatmap(df,
                    vmin=0.5,
                    vmax=1,
                    center=0.75,
                    mask=mask,
                    cmap=cmap,
                    annot=True,
                    fmt='.2f',
                    ax=ax,
                    annot_kws={'size': 8})
    g.set_yticklabels(g.get_yticklabels(), rotation=0, fontsize=7)
    g.set_xticklabels(g.get_xticklabels(), rotation=90, fontsize=7)
    sns.set(font_scale=2)
    matplotlib.pyplot.subplots_adjust(left=0.2, right=1, top=0.93, bottom=0.29)

    png_file = join(run_dir, str_to_filename(title) + '.png')
    if isfile(png_file):
        os.remove(png_file)
    matplotlib.pyplot.savefig(png_file)
    if isfile(png_file):
        log.info('')
        log.info('Saved heatmap into ' + adjust_path(png_file))
        try:
            from az.webserver.exposing import convert_gpfs_path_to_url
        except ImportError:
            pass
        else:
            url = convert_gpfs_path_to_url(png_file)
            if url:
                log.info('    url: ' + url)
                return url
        return png_file
Example 28
0
def bgzip_and_tabix(fpath, reuse=False, tabix_parameters='', **kwargs):
    gzipped_fpath = join(fpath + '.gz')
    tbi_fpath = gzipped_fpath + '.tbi'

    if reuse and \
           file_exists(gzipped_fpath) and (getctime(gzipped_fpath) >= getctime(fpath) if file_exists(fpath) else True) and \
           file_exists(tbi_fpath) and getctime(tbi_fpath) >= getctime(gzipped_fpath):
        info('Actual compressed file and index exist, reusing')
        return gzipped_fpath

    info('Compressing and tabixing file, writing ' + gzipped_fpath + '(.tbi)')
    bgzip = which('bgzip')
    tabix = which('tabix')
    if not bgzip:
        err('Cannot index file because bgzip is not found')
    if not tabix:
        err('Cannot index file because tabix is not found')
    if not bgzip or not tabix:
        return fpath

    if isfile(gzipped_fpath):
        os.remove(gzipped_fpath)
    if isfile(tbi_fpath):
        os.remove(tbi_fpath)

    info('BGzipping ' + fpath)
    cmdline = '{bgzip} {fpath}'.format(**locals())
    call_process.run(cmdline)

    info('Tabixing ' + gzipped_fpath)
    cmdline = '{tabix} {tabix_parameters} {gzipped_fpath}'.format(**locals())
    call_process.run(cmdline)

    return gzipped_fpath
Example 29
0
def main(host, port):
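    """ Starts the ClearUp web server: sets the host and port, initializes logging,
        and serves the Flask app through a WSGIServer with WebSocket support.
    """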
    clearup.HOST_IP = host
    clearup.PORT = port
    log.init(True, join(DATA_DIR, 'log_server.txt'), save_previous=True)

    os.environ['FLASK_DEBUG'] = '1'
    # log_path = join(DATA_DIR, 'flask.log')
    # handler = RotatingFileHandler(log_path, maxBytes=10000, backupCount=10)
    # handler.setLevel(logging.INFO)
    # app.logger.addHandler(handler)

    http_server = WSGIServer((host, port), app, handler_class=WebSocketHandler)
    log.info('Starting a webserver at ' + host + ':' + str(port))
    http_server.serve_forever()
Example 30
0
def bgzip_and_tabix(fpath, reuse=False, tabix_parameters='', **kwargs):
    gzipped_fpath = join(fpath + '.gz')
    tbi_fpath = gzipped_fpath + '.tbi'

    if reuse and \
           file_exists(gzipped_fpath) and (getctime(gzipped_fpath) >= getctime(fpath) if file_exists(fpath) else True) and \
           file_exists(tbi_fpath) and getctime(tbi_fpath) >= getctime(gzipped_fpath):
        info('Actual compressed file and index exist, reusing')
        return gzipped_fpath

    info('Compressing and tabixing file, writing ' + gzipped_fpath + '(.tbi)')
    bgzip = which('bgzip')
    tabix = which('tabix')
    if not bgzip:
        err('Cannot index file because bgzip is not found')
    if not tabix:
        err('Cannot index file because tabix is not found')
    if not bgzip or not tabix:
        return fpath

    if isfile(gzipped_fpath):
        os.remove(gzipped_fpath)
    if isfile(tbi_fpath):
        os.remove(tbi_fpath)

    info('BGzipping ' + fpath)
    cmdline = '{bgzip} {fpath}'.format(**locals())
    call_process.run(cmdline)

    info('Tabixing ' + gzipped_fpath)
    cmdline = '{tabix} {tabix_parameters} {gzipped_fpath}'.format(**locals())
    call_process.run(cmdline)

    return gzipped_fpath
Example 31
0
    def find_sv_vcf(self, silent=False, caller=None):
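        """ Looks for the batch's structural variant VCF, checking the conventional
            bcbio locations under the tumor sample dir (prioritized and plain) as
            well as the CWL bcbio locations, and sets `self.sv_vcf` if found.
        """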
        caller = caller or self.sv_caller

        sv_prio   = join(self.tumors[0].dirpath, f'{self.name}-sv-prioritize-{caller}.vcf.gz')
        sv_unprio = join(self.tumors[0].dirpath, f'{self.name}-{caller}.vcf.gz')
        # CWL?
        sv_cwl_prio   = join(self.parent_project.date_dir,
                             f'{self.tumors[0].name}-{caller}-prioritized.vcf.gz')
        sv_cwl_unprio = join(self.parent_project.date_dir,
                             f'{self.tumors[0].name}-{caller}.vcf.gz')

        if isfile(sv_prio):
            verify_file(sv_prio, is_critical=True)
            if not silent: info(f'Found SV VCF in <tumor>/<batch>-sv-prioritize-{caller}.vcf.gz: ' + sv_prio)
            self.sv_vcf = sv_prio

        elif isfile(sv_unprio):
            verify_file(sv_unprio, is_critical=True)
            if not silent: info(f'Found SV VCF in <tumor>/<batch>-{caller}.vcf.gz: ' + sv_unprio)
            self.sv_vcf = sv_unprio

        elif isfile(sv_cwl_prio):
            verify_file(sv_cwl_prio, is_critical=True)
            if not silent: info(f'Found SV VCF in <date-dir>/<tumor-name>-{caller}-prioritized.vcf.gz: ' + sv_cwl_prio)
            self.sv_vcf = sv_cwl_prio

        elif isfile(sv_cwl_unprio):
            verify_file(sv_cwl_unprio, is_critical=True)
            if not silent: info(f'Found SV VCF in <date-dir>/<tumor-name>-{caller}.vcf.gz: ' + sv_cwl_unprio)
            self.sv_vcf = sv_cwl_unprio

        elif not silent:
            warn(f'Could not find SV VCF file for batch {self.name}, caller {caller} neither under sample folder as '
                 f'<tumor>/<batch>(-sv-prioritize)-{caller}.vcf.gz (conventional bcbio), '
                 f'nor in the project folder as project/<tumor>-{caller}(-prioritized).vcf.gz (CWL bcbio).')
Example 32
0
def load_bcbio_project(bcbio_dir, name=None, use_callable=False):
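    """ Loads a bcbio project from `bcbio_dir` and adds its samples' BAM files,
        target BED and coverage information into the fingerprints database.
    """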
    log.info('-' * 70)
    log.info('Loading project into the fingerprints database from ' +
             bcbio_dir)
    log.info('-' * 70)
    log.info()

    bcbio_proj = BcbioProject()
    bcbio_proj.load_from_bcbio_dir(bcbio_dir,
                                   project_name=name,
                                   proc_name='clearup')

    _add_project(
        bam_by_sample={s.name: s.bam
                       for s in bcbio_proj.samples},
        project_name=name or bcbio_proj.project_name,
        bed_file=bcbio_proj.coverage_bed,
        use_callable=use_callable,
        data_dir=bcbio_proj.final_dir,
        genome=bcbio_proj.genome_build,
        min_depth=DEPTH_CUTOFF,
        depth_by_sample={
            s.name: s.get_avg_depth()
            for s in bcbio_proj.samples
        },
    )
Example 33
0
def set_up_dirs(proc_name, output_dir=None, work_dir=None, log_dir=None):
    """ Creates output_dir, work_dir, and sets up log
    """
    output_dir = safe_mkdir(
        adjust_path(output_dir or join(os.getcwd(), proc_name)), 'output_dir')
    debug('Saving results into ' + output_dir)

    work_dir = safe_mkdir(work_dir or join(output_dir, 'work'),
                          'working directory')
    info('Using work directory ' + work_dir)

    log_fpath = set_up_log(log_dir or safe_mkdir(join(work_dir, 'log')),
                           proc_name + '.log')

    return output_dir, work_dir, log_fpath
Example 34
0
def annotate_target(work_dir, target_bed, genome_build):
    output_fpath = intermediate_fname(work_dir, target_bed, 'ann')
    if can_reuse(output_fpath, target_bed):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    bed_annotation = which('annotate_bed.py') or which('bed_annotation')
    if not bed_annotation:
        critical('Error: bed_annotation not found in PATH, please install it with `conda install -c vladsaveliev bed_annotation`.')

    cmdline = '{bed_annotation} {target_bed} -g {genome_build} -o {output_fpath}'.format(**locals())
    run(cmdline, output_fpath, stdout_to_outputfile=False)
    output_fpath = clean_bed(output_fpath, work_dir)
    return output_fpath
Example 35
0
def find_fastq_pairs(fpaths):
    info('Finding FastQ pairs...')
    fastqs_by_sample_name = dict()
    for fpath in fpaths:
        fn, ext = splitext_plus(basename(fpath))
        if ext in ['.fq', '.fq.gz', '.fastq', '.fastq.gz']:
            sname, l_fpath, r_fpath = None, None, None
            if fn.endswith('_1'):
                sname = fn[:-2]
                l_fpath = fpath
            if fn.endswith('_R1'):
                sname = fn[:-3]
                l_fpath = fpath
            if fn.endswith('_2'):
                sname = fn[:-2]
                r_fpath = fpath
            if fn.endswith('_R2'):
                sname = fn[:-3]
                r_fpath = fpath

            if sname:
                m = re.match(r'(.*)_S\d+', sname)
                if m:
                    sname = m.group(1)
                sname = sname.replace('-', '_')
            else:
                sname = fn
                info('Cannot detect read number (R1/R2) for file ' + fn)

            l, r = fastqs_by_sample_name.get(sname, (None, None))
            if l and l_fpath:
                critical('Duplicated left FastQ files for ' + sname + ': ' +
                         l + ' and ' + l_fpath)
            if r and r_fpath:
                critical('Duplicated right FastQ files for ' + sname + ': ' +
                         r + ' and ' + r_fpath)
            fastqs_by_sample_name[sname] = l or l_fpath, r or r_fpath

    fixed_fastqs_by_sample_name = dict()
    for sname, (l, r) in fastqs_by_sample_name.items():
        if not l:
            err('ERROR: for sample ' + sname + ', left reads not found')
        if not r:
            err('ERROR: for sample ' + sname + ', right reads not found')
        if l and r:
            fixed_fastqs_by_sample_name[sname] = l, r

    return fixed_fastqs_by_sample_name
Example 36
0
    def _get_approved_genes_by_kind(approved_genes, kind):
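        """ Picks the approved (HGNC) gene name from a list of candidates, using the
            ucsc_id and chromosome from the enclosing scope to disambiguate. Returns
            the approved name, 'AMBIGUOUS', or 'NOT FOUND'.
        """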
        if not approved_genes:
            return 'NOT FOUND'

        if len(approved_genes) > 1:
            approved_genes_same_ucsc = [g for g in approved_genes if g.db_id == db_id]

            if len(approved_genes_same_ucsc) > 1:
                err(' ERROR: multiple approved gene names for ' + gene_symbol + ' (as ' + kind + ') with ucsc_id ' +
                    db_id + ': ' + ', '.join(g.name for g in approved_genes_same_ucsc) + '', print_date=False)
                return 'AMBIGUOUS'

            if len(approved_genes_same_ucsc) == 1:
                if _check_gene_symbol(approved_genes_same_ucsc[0], gene_symbol, db_id, db_chrom):
                    err(' found approved gene for ' + gene_symbol + ' (as ' + kind + ') with ucsc_id ' + db_id,
                        print_date=False)
                    return approved_genes_same_ucsc[0].name

            # Ok, no genes with same ucsc id, or not the same chromosome for them.

            approved_genes_same_chrom = [g for g in approved_genes if g.chrom == db_chrom]

            if len(approved_genes_same_chrom) > 1:
                err(' ERROR: multiple approved gene names for ' + gene_symbol + ' (as ' + kind + ') with chrom ' +
                    db_chrom + ': ' + ', '.join(g.name for g in approved_genes_same_chrom) + '', print_date=False)
                return 'AMBIGUOUS'

            if len(approved_genes_same_chrom) == 1:
                g = approved_genes_same_chrom[0]
                info(' only ' + g.name + ' for ' + gene_symbol + ' (as ' + kind + ') has the same chrom '
                    + db_chrom + ', picking it', print_date=False)
                if _check_gene_symbol(g, gene_symbol, db_id, db_chrom):
                    return g.name
                else:
                    return 'NOT FOUND'

            if len(approved_genes_same_chrom) == 0:
                err(' ERROR: no approved gene names for ' + gene_symbol + ' (as ' + kind + ') with same chrom '
                    + db_chrom + '', print_date=False)
                return 'NOT FOUND'

        if len(approved_genes) == 1:
            if _check_gene_symbol(approved_genes[0], gene_symbol, db_id, db_chrom):
                info(' found approved gene symbol for ' + gene_symbol + ': ' + approved_genes[0].name + ' (as '
                    + kind + ')', print_date=False)
                return approved_genes[0].name

        return 'NOT FOUND'
Example 37
0
def parse_hgnc_chrom(chrom):
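    """ Converts an HGNC-style chromosome label into a 'chr'-prefixed chromosome name
        (e.g. '10q23.1' -> 'chr10', 'mitochondria' -> 'chrM'), or returns None if it
        cannot be parsed.
    """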
    if chrom in ['reserved', 'c10_B']:
        return None

    CHROMS = ['Y', 'X', 'mitochondria']
    for i in range(22, 0, -1):
        CHROMS.append(str(i))

    for c in CHROMS:
        if chrom.startswith(c):
            if c == 'mitochondria':
                return 'chrM'
            return 'chr' + c

    info('  Notice: cannot parse chromosome ' + chrom)
    return None
Example 38
0
def main(input_bed, output_file, output_features=False, genome=None,
         only_canonical=False, short=False, extended=False, high_confidence=False,
         ambiguities_method=False, coding_only=False, collapse_exons=False, work_dir=False, is_debug=False):
    """ Annotating BED file based on reference features annotations.
    """
    logger.init(is_debug_=is_debug)

    if not genome:
        raise click.BadParameter('Error: please, specify genome build name with -g (e.g. `-g hg19`)', param='genome')

    if short:
        if extended:        raise click.BadParameter('--short and --extended can\'t be set both', param='extended')
        if output_features: raise click.BadParameter('--short and --output-features can\'t be set both', param='output_features')
    elif output_features or extended:
        extended = True
        short    = False

    if not verify_file(input_bed):
        raise click.BadParameter(f'Usage: {__file__} Input_BED_file -g hg19 -o Annotated_BED_file [options]', param='input_bed')
    input_bed = verify_file(input_bed, is_critical=True, description=f'Input BED file for {__file__}')

    if work_dir:
        work_dir = join(adjust_path(work_dir), os.path.splitext(basename(input_bed))[0])
        safe_mkdir(work_dir)
        info(f'Created work directory {work_dir}')
        using_tmp_work_dir = False
    else:
        work_dir = mkdtemp('bed_annotate')
        debug(f'Created temporary work directory {work_dir}')
        using_tmp_work_dir = True

    input_bed = clean_bed(input_bed, work_dir)
    input_bed = verify_bed(input_bed, is_critical=True, description=f'Input BED file for {__file__} after cleaning')

    output_file = adjust_path(output_file)

    output_file = annotate(
        input_bed, output_file, work_dir, genome=genome,
        only_canonical=only_canonical, short=short, extended=extended,
        high_confidence=high_confidence, collapse_exons=collapse_exons,
        output_features=output_features,
        ambiguities_method=ambiguities_method, coding_only=coding_only,
        is_debug=is_debug)

    if using_tmp_work_dir:
        debug(f'Removing temporary work directory {work_dir}')
        shutil.rmtree(work_dir)

    info(f'Done, saved to {output_file}')
Example 39
0
def run(cmd, output_fpath=None, input_fpath=None, checks=None, stdout_to_outputfile=True,
        stdout_tx=True, reuse=False, env_vars=None):
    """Run the provided command, logging details and checking for errors.
    """
    if output_fpath and reuse:
        if verify_file(output_fpath, silent=True):
            info(output_fpath + ' exists, reusing')
            return output_fpath
        if not output_fpath.endswith('.gz') and verify_file(output_fpath + '.gz', silent=True):
            info(output_fpath + '.gz exists, reusing')
            return output_fpath

    env = _get_env(env_vars)

    if checks is None:
        checks = [file_nonempty_check]

    def _try_run(_cmd, _output_fpath, _input_fpath):
        try:
            info(' '.join(str(x) for x in _cmd) if not isinstance(_cmd, str) else _cmd)
            _do_run(_cmd, checks, env, _output_fpath, _input_fpath)
        except:
            raise

    if output_fpath:
        if isfile(output_fpath):
            os.remove(output_fpath)
    if output_fpath:
        if stdout_tx:
            with file_transaction(None, output_fpath) as tx_out_file:
                if stdout_to_outputfile:
                    cmd += ' > ' + tx_out_file
                else:
                    cmd += '\n'
                    cmd = cmd.replace(' ' + output_fpath + ' ', ' ' + tx_out_file + ' ') \
                             .replace(' "' + output_fpath + '" ', ' ' + tx_out_file + '" ') \
                             .replace(' \'' + output_fpath + '\' ', ' ' + tx_out_file + '\' ') \
                             .replace(' ' + output_fpath + '\n', ' ' + tx_out_file) \
                             .replace(' "' + output_fpath + '"\n', ' ' + tx_out_file + '"') \
                             .replace(' \'' + output_fpath + '\'\n', ' ' + tx_out_file + '\'') \
                             .replace('\n', '')
                _try_run(cmd, tx_out_file, input_fpath)
        else:
            _try_run(cmd, output_fpath, input_fpath)

    else:
        _try_run(cmd, None, input_fpath)
Example 40
0
def find_fastq_pairs(fpaths):
    info('Finding FastQ pairs...')
    fastqs_by_sample_name = dict()
    for fpath in fpaths:
        fn, ext = splitext_plus(basename(fpath))
        if ext in ['.fq', '.fq.gz', '.fastq', '.fastq.gz']:
            sname, l_fpath, r_fpath = None, None, None
            if fn.endswith('_1'):
                sname = fn[:-2]
                l_fpath = fpath
            if fn.endswith('_R1'):
                sname = fn[:-3]
                l_fpath = fpath
            if fn.endswith('_2'):
                sname = fn[:-2]
                r_fpath = fpath
            if fn.endswith('_R2'):
                sname = fn[:-3]
                r_fpath = fpath

            if sname:
                m = re.match(r'(.*)_S\d+', sname)
                if m:
                    sname = m.group(1)
                sname = sname.replace('-', '_')
            else:
                sname = fn
                info('Cannot detect read number (R1/R2) for file ' + fn)

            l, r = fastqs_by_sample_name.get(sname, (None, None))
            if l and l_fpath:
                critical('Duplicated left FastQ files for ' + sname + ': ' + l + ' and ' + l_fpath)
            if r and r_fpath:
                critical('Duplicated right FastQ files for ' + sname + ': ' + r + ' and ' + r_fpath)
            fastqs_by_sample_name[sname] = l or l_fpath, r or r_fpath

    fixed_fastqs_by_sample_name = dict()
    for sname, (l, r) in fastqs_by_sample_name.items():
        if not l:
            err('ERROR: for sample ' + sname + ', left reads not found')
        if not r:
            err('ERROR: for sample ' + sname + ', right reads not found')
        if l and r:
            fixed_fastqs_by_sample_name[sname] = l, r

    return fixed_fastqs_by_sample_name
Example 41
0
def _load_datasets(subdirs):
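    """ Builds Dataset objects from the given subdirectories: each subdir (optionally
        suffixed with ':<genome>') is either a plain directory of .vcf.gz files with
        an optional BED file, or a bcbio project directory.
    """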
    vcf_by_project_by_genome = defaultdict(dict)
    # vcf_by_label = dict()
    # all_bed_files = []
    # project_names = []
    datasets = []

    for subdir in subdirs:
        dataset = Dataset()

        if ':' in subdir:
            subdir, dataset.genome = subdir.split(':')
        else:
            dataset.genome = 'hg19'

        dir_path = subdir
        if glob(join(dir_path, '*.vcf.gz')):
            log.info(f'Found .vcf.gz files in directory {dir_path}')
            # Simple directory with VCF files and an optional BED file?
            dataset.name = subdir.replace('/', '__')
            if glob(join(dir_path, '*.bed')):
                dataset.bed_file = glob(join(dir_path, '*.bed'))[0]
            for vcf_fpath in glob(join(dir_path, '*.vcf.gz')):
                label = join(subdir,
                             basename(splitext_plus(vcf_fpath)[0])).replace(
                                 '/', '__')
                dataset.vcf_by_label[label] = vcf_fpath
        else:
            log.info(
                f'Not found any .vcf.gz files in directory {dir_path}. Checking if that\'s a bcbio folder.'
            )
            # Bcbio directory?
            bcbio_proj = BcbioProject()
            bcbio_proj.load_from_bcbio_dir(subdir, proc_name='clearup')
            dataset.name = bcbio_proj.project_name
            dataset.genome = bcbio_proj.genome_build
            for s in bcbio_proj.samples:
                vcf_file = s.find_raw_vcf()
                if vcf_file:
                    dataset.vcf_by_label[bcbio_proj.project_name + '__' +
                                         s.name] = vcf_file
            if bcbio_proj.coverage_bed:
                dataset.bed_file = bcbio_proj.coverage_bed

        datasets.append(dataset)
    return datasets
Example 42
0
def annotate_target(work_dir, target_bed, genome_build):
    output_fpath = intermediate_fname(work_dir, target_bed, 'ann')
    if can_reuse(output_fpath, target_bed):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    bed_annotation = which('annotate_bed.py') or which('bed_annotation')
    if not bed_annotation:
        critical(
            'Error: bed_annotation not found in PATH, please install it with `conda install -c vladsaveliev bed_annotation`.'
        )

    cmdline = '{bed_annotation} {target_bed} -g {genome_build} -o {output_fpath}'.format(
        **locals())
    run(cmdline, output_fpath, stdout_to_outputfile=False)
    output_fpath = clean_bed(output_fpath, work_dir)
    return output_fpath
Example 43
0
def detect_run_info_in_config_dir(config_dir):
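    """ Looks for a run_info*.yaml file in the bcbio config directory and returns its
        path, or None if there is none; errors out if more than one is found.
    """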
    run_info_fpaths_in_config = [
        abspath(join(config_dir, fname)) for fname in os.listdir(config_dir)
        if fname.startswith('run_info') and fname.endswith('.yaml')
    ]

    if len(run_info_fpaths_in_config) > 1:
        critical(
            'More than one YAML file containing run_info in name found in the config '
            'directory ' + config_dir + ': ' +
            ' '.join(run_info_fpaths_in_config))

    if len(run_info_fpaths_in_config) == 0:
        return None

    run_cnf = verify_file(run_info_fpaths_in_config[0], is_critical=True)
    info('Using run configuration from the config directory ' + run_cnf)
    return run_cnf
Example 44
0
def add_user_call(project_names_line, sample_id):
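    """ Records a user-provided genotype call for a SNP of the sample being edited,
        flags affected runs for tree rebuilding, sends a notification email, and
        re-renders the closest-comparison page.
    """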
    log.info('Adding user call for ' + str(sample_id))
    edit_sample_id = request.form['editSampleId']
    sample = Sample.query.get(edit_sample_id)
    if not sample:
        log.error('Sample with ID=' + str(edit_sample_id) + ' not found')
        return redirect(
            url_for('closest_comparison_page',
                    project_names_line=project_names_line,
                    sample_id=sample_id))

    # snp = sample.snps.join(Location).filter(Location.rsid==request.form['rsid']).first()
    snp = sample.snps.filter(SNP.rsid == request.form['rsid']).first()
    usercall = request.form['usercall']

    msg = 'ClearUp: usercall for sample ' + sample.name + ' of run ' + project_names_line + ' added:\n'
    msg += 'SNP {}:{} {} {}|{}'.format(str(snp.location.chrom),
                                       str(snp.location.pos),
                                       snp.location.rsid, snp.allele1,
                                       snp.allele2)
    if snp.usercall:
        msg += ', previous usercall ' + snp.usercall
    msg += ', setting usercall ' + usercall

    snp.usercall = usercall
    db.session.commit()

    # Forcing rebuilding the trees of affected runs
    for run in Run.query.all():
        if sample.project in run.projects:
            if any(l for l in run.locations if l.rsid == snp.rsid):
                # log.debug('Removing tree file ' + run.tree_file_path())
                # os.rename(run.fasta_file_path(), run.fasta_file_path() + '.bak')
                # os.rename(run.tree_file_path(), run.tree_file_path() + '.bak')
                run.rerun_on_usercall = True
                db.session.commit()

    log.send_email(msg, subj='ClearUp usercall', only_me=True)

    return render_closest_comparison_page(
        project_names_line,
        sample_id,
        selected_idx=request.form['snpIndex'],
        rerun_if_usercall=False)
def write_all_features(genes, output_fpath, canon_only, cds_only=False, seq2c_cds=False):
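    """ Writes gene, transcript and exon features for the given genes into a BED-like
        file (or only CDS rows when `cds_only` is set), optionally keeping canonical
        transcripts only.
    """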
    regions = []
    already_added_gene_features = set()
    transcripts = []
    for g in genes:
        _canon_tx = []
        for t in g.transcripts:
            if not canon_only or t.is_canonical:
                _canon_tx.append(t)
        if seq2c_cds and len(_canon_tx) > 1:  # Need to select single one for Seq2C CDS file
            transcripts.append(max(_canon_tx, key=Transcript.get_length_key))
        else:
            transcripts.extend(_canon_tx)

    for t in sorted(transcripts, key=lambda _tr: _tr.get_key()):
        to_add_gene = (not cds_only
                       and all(t2.coding for t2 in t.gene.transcripts if (t2.is_canonical or not canon_only))  # all other transcripts for this gene are coding - we don't report Gene features for ncRNA
                       and t.gene not in already_added_gene_features                                       # and gene is not already added
                       and (len(t.gene.canonical_transcripts) == 1 or len(t.gene.transcripts) == 1))       # and has one canonical or non-canonical transcript to report
        if to_add_gene:
            # skip gene feature for all ncRNA, because there can be multi-domain ncRNA located in different places with the same gene name
            regions.append(t.gene)
            already_added_gene_features.add(t.gene)
        if t.exons:
            if not cds_only:
                regions.append(t)
            for e in t.exons:
                if not cds_only or t.coding:
                    regions.append(e)

    regions = sorted(regions, key=lambda r: r.get_key())
    info('Writing ' + str(len(regions)) + ' regions')
    with open(adjust_path(output_fpath), 'w') as all_out:
        for r in regions:
            if cds_only:
                all_out.write('\t'.join([r.transcript.gene.chrom,
                                         '{}'.format(r.start) if r.start is not None else '.',
                                         '{}'.format(r.end) if r.end is not None else '.',
                                         r.transcript.gene.name or '.']) + '\n')
            else:
                all_out.write(str(r))
def read_approved_genes(synonyms_fpath):
    approved_gene_by_name = dict()
    approved_gnames_by_prev_gname = defaultdict(list)
    approved_gnames_by_synonym = defaultdict(list)

    info('Parsing HGNC database ' + synonyms_fpath + '...')
    with open(synonyms_fpath) as f:
        i = 0
        for l in f:
            if l and not l.startswith('#'):
                approved_gn, prev_names, synonyms, hgnc_chrom, ensembl_id, ucsc_id = l.replace('\n', '').split('\t')
                if hgnc_chrom:
                    hgnc_chrom = parse_hgnc_chrom(hgnc_chrom)

                approved_gene = ApprovedGene(approved_gn, prev_names, synonyms, hgnc_chrom, ucsc_id, ensembl_id)
                approved_gene_by_name[approved_gn] = approved_gene

                for gn in prev_names.split(', '):
                    if gn:
                        approved_gnames_by_prev_gname[gn].append(approved_gene)

                for gn in synonyms.split(', '):
                    if gn:
                        approved_gnames_by_synonym[gn].append(approved_gene)
            i += 1
        info('  Processed ' + str(i) + ' lines from ' + synonyms_fpath)
        info()

    return approved_gene_by_name, approved_gnames_by_prev_gname, approved_gnames_by_synonym
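The parser above expects six tab-separated columns per record. A made-up line, purely to illustrate that layout (approved symbol, previous names, synonyms, HGNC chromosome, Ensembl ID, UCSC ID; the IDs below are illustrative, not verified):

line = 'KMT2D\tMLL2, TNRC21\tCAGL114\t12q13.12\tENSG00000167548\tuc001rqf.4'
approved_gn, prev_names, synonyms, hgnc_chrom, ensembl_id, ucsc_id = line.split('\t')
print(approved_gn, '| previous:', prev_names.split(', '), '| synonyms:', synonyms.split(', '))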
def main():
    options = [
        (['-g', '--genome'], dict(
            dest='genome',
            help='Genome build. Accepted values: ' + ', '.join(ba.SUPPORTED_GENOMES),
        )),
    ]
    parser = OptionParser()
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()

    if not opts.genome:
        critical('Error: please specify the genome build name with -g (e.g. `-g hg19`)')
    genome = opts.genome

    debug('Getting features from storage')
    features_bed = ba.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ba.SUPPORTED_GENOMES))

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(lambda x: x[ba.BedCols.FEATURE] == 'CDS')
    features_bed = features_bed.filter(ba.get_only_canonical_filter(genome))

    info('Saving CDS regions...')
    output_fpath = adjust_path(join(dirname(__file__), pardir, genome, 'bed', 'CDS-canonical.bed'))
    with file_transaction(None, output_fpath) as tx:
        features_bed.cut(range(6)).saveas(tx)
    info('Done, saved to ' + output_fpath)
Example no. 48
0
def load_bcbio_cnf(config_dir, silent=False):
    all_yamls = [
        abspath(join(config_dir, fname))
        for fname in listdir(config_dir)
        if fname.endswith('.yaml')]
    if len(all_yamls) == 0:
        critical('No YAML file in the config directory.')

    bcbio_yamls = []
    for fpath in all_yamls:
        if not fpath.endswith('-template.yaml'):
            if 'details' in load_yaml_config(fpath):
                bcbio_yamls.append(fpath)
    if len(bcbio_yamls) == 0:
        critical('No bcbio YAMLs found in the config directory: ' + config_dir +
                 ' (only ' + ', '.join(map(basename, all_yamls)) +
                 ' which do not have the "details" section)')
    if len(bcbio_yamls) > 1:
        critical('More than one bcbio YAML file found in the config directory ' +
                 config_dir + ': ' + ' '.join(bcbio_yamls))
    yaml_fpath = bcbio_yamls[0]
    if not silent: info('Using bcbio YAML config: ' + yaml_fpath)
    return load_yaml_config(yaml_fpath), yaml_fpath
Example no. 49
0
def detect_bcbio_dir(input_dir, silent=False):
    """
    :param input_dir: `config` dir, or `final` dir, or datestamp dir, or the directory root to `final`
    :return: (config_dir, final_dir, date_dir)
    """
    config_dir, final_dir, date_dir = None, None, None

    input_dir = abspath(input_dir)

    # We are inside `*final*`
    if 'final' in basename(input_dir):  # allow prefixes and postfixes
        final_dir = input_dir
        root_dir = dirname(final_dir)
        config_dir = join(root_dir, 'config')
        if not isdir(config_dir):
            err(f'Are you running on a bcbio output?\n'
                f'The input folder appears to be `final` ({input_dir}), '
                f'but can\'t find a `config` directory at the same level ({config_dir})')
            raise NoConfigDirException('No config dir')

    # We are inside `config`
    elif basename(input_dir) == 'config':
        config_dir = input_dir

    # We are in a parent dir to `config` (and possibly `final`, called otherwise)
    elif isdir(join(input_dir, 'config')):
        config_dir = join(input_dir, 'config')

    # We are inside a date dir
    elif isdir(abspath(join(input_dir, pardir, pardir, 'config'))):
        final_dir = abspath(join(input_dir, pardir))
        root_dir = abspath(join(input_dir, pardir, pardir))
        config_dir = abspath(join(root_dir, 'config'))

        # if 'final' not in basename(final_dir):
        #     err(f'Are you running on a bcbio output?\n'
        #         f'Found config directory 2 level up at {config_dir}, assuming your input {input_dir} '
        #         f'is a datestamp directory. However, the parent directory is not called `*final*`')
        #     raise NoConfigDirException('No final dir')

    else:
        if not silent:
            err(f'Are you running on a bcbio output?\n'
                f'{input_dir} is not `config` or `*final*`, and '
                f'can\'t find a `config` directory at {join(input_dir, "config")} or {abspath(join(input_dir, pardir, "config"))}. '
                f'Make sure that you changed to a bcbio root or final directory, or provided it as the first argument.')
        raise NoConfigDirException('No config dir')

    if not silent:
        info('Bcbio config directory: ' + config_dir)
        if final_dir:
            info('"final" directory: ' + final_dir)
            if date_dir:
                info('"datestamp" directory: ' + date_dir)

    return config_dir, final_dir, date_dir
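A hypothetical call site for detect_bcbio_dir(); the path below is a placeholder, and NoConfigDirException comes from the surrounding project:

try:
    config_dir, final_dir, date_dir = detect_bcbio_dir('/path/to/bcbio_project/final')
except NoConfigDirException:
    config_dir = final_dir = date_dir = None  # not a bcbio layout; let the caller decide what to do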
Example no. 50
0
def bam_to_bed_nocnf(bam_fpath, bedtools='bedtools', gzip='gzip'):
    info('Converting the BAM to BED to save some memory.')  # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + '.bed.gz'
    cmdline = '{bedtools} bamtobed -i {bam_fpath} | {gzip} > {bam_bed_fpath}'.format(**locals())
    info(cmdline)
    os.system(cmdline)
    bam_bed_fpath = verify_file(bam_bed_fpath)
    if bam_bed_fpath:
        info('Done, saved to ' + bam_bed_fpath)
    else:
        err('Error, result is non-existent or empty')
    return bam_bed_fpath
Example no. 51
0
def read_samples(args):
    bam_by_sample = find_bams(args)
    if bam_by_sample:
        info('Found ' + str(len(bam_by_sample)) + ' BAM file' + ('s' if len(bam_by_sample) > 1 else ''))

    input_not_bam = [verify_file(fpath) for fpath in args if adjust_path(fpath) not in bam_by_sample]
    input_not_bam = [fpath for fpath in input_not_bam if fpath]
    fastqs_by_sample = dict()
    if not input_not_bam and not bam_by_sample:
        critical('No correct input files')
    if input_not_bam:
        info('Found ' + str(len(input_not_bam)) + ' correct non-BAM input files')
        fastqs_by_sample = find_fastq_pairs(input_not_bam)
        if fastqs_by_sample:
            info('Found ' + str(len(fastqs_by_sample)) + ' FastQ pairs')
        intersection = set(fastqs_by_sample.keys()) & set(bam_by_sample.keys())
        if intersection:
            critical('The following samples both had input BAMs and FastQ: ' + ', '.join(list(intersection)))

    return fastqs_by_sample, bam_by_sample
Example no. 52
0
def check_md5(work_dir, fpath, file_type, silent=False):
    md5_fpath = join(work_dir, file_type + '_md5.txt')
    new_md5 = md5(fpath)
    info('md5 of ' + fpath + ' is ' + str(new_md5))
    prev_md5 = None
    if isfile(md5_fpath):
        with open(md5_fpath) as f:
            prev_md5 = f.read()
    else:
        info('Previous md5 file ' + md5_fpath + ' does not exist')
    info('Checking previous md5 from ' + md5_fpath + ': ' + str(prev_md5))

    if prev_md5 == new_md5:
        if not silent:
            debug('Reusing previous ' + file_type.upper() + ' files.')
        return True
    else:
        if not silent:
            info('Pre-processing input ' + file_type.upper() + ' file')
        if prev_md5:
            if not silent:
                info('Prev ' + file_type.upper() + ' md5: ' + str(prev_md5))
                info('New ' + file_type.upper() + ' md5: ' + str(new_md5))

        with open(md5_fpath, 'w') as f:
            f.write(str(new_md5))
        return False
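check_md5 relies on a project-level md5() helper. A minimal stand-in, assuming it hashes the file contents, could look like this; it reads in chunks so large BAM/FastQ files are not loaded into memory at once:

import hashlib

def md5_sketch(fpath, chunk_size=1 << 20):
    # Hash the file contents one chunk at a time and return the hex digest.
    h = hashlib.md5()
    with open(fpath, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()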
def _approve(gene_by_name, synonyms_fpath):
    approved_gene_by_name, approved_gnames_by_prev_gname, approved_gnames_by_synonym = \
        read_approved_genes(synonyms_fpath)

    not_approved_gene_names = list()
    gene_after_approving_by_name = OrderedDict()
    total_approved = 0
    total_not_approved = 0
    j = 0
    for g in gene_by_name.values():
        if len(g.exons) == 0:
            continue

        gene_after_approving_by_name[g.name] = g
        if is_approved_symbol(g.name, approved_gene_by_name):
            total_approved += 1
        else:
            not_approved_gene_names.append(g.name)
            total_not_approved += 1

        j += 1
        if j % 1000 == 0:
            info('processed ' + str(j // 1000) + 'k genes...')

    info('-----')
    info('Total: ' + str(j))
    if approved_gene_by_name:
        info('Total approved: ' + str(total_approved))
        info('Total not approved: ' + str(total_not_approved))
    info()
    info('Saving genes...')

    gene_features = 0
    features_counter = defaultdict(int)
    biotypes_counter = defaultdict(int)
    no_exon_gene_num = 0

    filtered_gene_after_approving_by_name = OrderedDict()
    for g in gene_after_approving_by_name.values():
        if len(g.exons) == 0:
            no_exon_gene_num += 1
        else:
            filtered_gene_after_approving_by_name[g.name] = g

            gene_features += 1
            features_counter[g.feature] += 1
            biotypes_counter[g.biotype] += 1

            for e in g.exons:
                features_counter[e.feature] += 1

                if e.feature == 'exon': e.feature = 'Exon'
                elif e.feature == 'stop_codon': e.feature = 'CDS'
                else: e.feature = e.feature[0].upper() + e.feature[1:]

    info('Skipped {} genes with no sub-features.'.format(no_exon_gene_num))
    info('Approved {} genes, including:'.format(gene_features))
    info('    Gene: {}'.format(features_counter['Gene']))
    info('    Multi_Gene: {}'.format(features_counter['Multi_Gene']))
    info('')

    info('Out of total: {} protein coding genes, {} ncRNA genes, including:'.format(
        biotypes_counter['protein_coding'], sum(biotypes_counter.values()) - biotypes_counter['protein_coding']))
    for bt, cnt in biotypes_counter.items():
        if bt != 'protein_coding':
            err('    ' + bt + ': ' + str(cnt))

    info()
    if ALL_EXONS:
        info('Found {} exons.'.format(features_counter['exon']))
    else:
        info('Also found {} CDS, {} stop codons, and {} ncRNA exons.'.format(
            features_counter['CDS'], features_counter['stop_codon'], features_counter['exon']))

    return filtered_gene_after_approving_by_name, not_approved_gene_names
def get_approved_gene_symbol(approved_gene_by_name, approved_gnames_by_prev_gname, approved_gnames_by_synonym,
                             gene_symbol, db_id='', db_chrom='', indent=''):
    if gene_symbol in approved_gene_by_name:
        if _check_gene_symbol(approved_gene_by_name[gene_symbol], gene_symbol, db_id, db_chrom):
            return approved_gene_by_name[gene_symbol].name, None

    info(indent + 'Gene name ' + gene_symbol + ' is not approved, searching for an approved version... ',
        ending='', print_date=False)

    def _get_approved_genes_by_kind(approved_genes, kind):
        if not approved_genes:
            return 'NOT FOUND'

        if len(approved_genes) > 1:
            approved_genes_same_ucsc = [g for g in approved_genes if g.db_id == db_id]

            if len(approved_genes_same_ucsc) > 1:
                err(' ERROR: multiple approved gene names for ' + gene_symbol + ' (as ' + kind + ') with ucsc_id ' +
                    db_id + ': ' + ', '.join(g.name for g in approved_genes_same_ucsc) + '', print_date=False)
                return 'AMBIGUOUS'

            if len(approved_genes_same_ucsc) == 1:
                if _check_gene_symbol(approved_genes_same_ucsc[0], gene_symbol, db_id, db_chrom):
                    err(' found approved gene for ' + gene_symbol + ' (as ' + kind + ') with ucsc_id ' + db_id,
                        print_date=False)
                    return approved_genes_same_ucsc[0].name

            # Ok, no genes with same ucsc id, or not the same chromosome for them.

            approved_genes_same_chrom = [g for g in approved_genes if g.chrom == db_chrom]

            if len(approved_genes_same_chrom) > 1:
                err(' ERROR: multiple approved gene names for ' + gene_symbol + ' (as ' + kind + ') with chrom ' +
                    db_chrom + ': ' + ', '.join(g.name for g in approved_genes_same_chrom), print_date=False)
                return 'AMBIGUOUS'

            if len(approved_genes_same_chrom) == 1:
                g = approved_genes_same_chrom[0]
                info(' only ' + g.name + ' for ' + gene_symbol + ' (as ' + kind + ') has the same chrom '
                    + db_chrom + ', picking it', print_date=False)
                if _check_gene_symbol(g, gene_symbol, db_id, db_chrom):
                    return g.name
                else:
                    return 'NOT FOUND'

            if len(approved_genes_same_chrom) == 0:
                err(' ERROR: no approved gene names for ' + gene_symbol + ' (as ' + kind + ') with same chrom '
                    + db_chrom + '', print_date=False)
                return 'NOT FOUND'

        if len(approved_genes) == 1:
            if _check_gene_symbol(approved_genes[0], gene_symbol, db_id, db_chrom):
                info(' found approved gene symbol for ' + gene_symbol + ': ' + approved_genes[0].name + ' (as '
                    + kind + ')', print_date=False)
                return approved_genes[0].name

        return 'NOT FOUND'

    res = _get_approved_genes_by_kind(approved_gnames_by_prev_gname.get(gene_symbol), 'prev')
    if res == 'AMBIGUOUS':
        return None, 'AMBIGUOUS\tAS PREV'
    elif res == 'NOT FOUND':
        res = _get_approved_genes_by_kind(approved_gnames_by_synonym.get(gene_symbol), 'synonym')
        if res == 'AMBIGUOUS':
            return None, res + '\tAS SYNONYM'
        if res == 'NOT FOUND':
            err(' not found.', print_date=False)
            return None, res
        else:
            info(indent + 'Finally found approved gene for ' + gene_symbol + ' (as synonym): ' + res, print_date=False)
            return res, None
    else:
        info(indent + 'Finally found approved gene for ' + gene_symbol + ' (as prev): ' + res, print_date=False)
        return res, None
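Hypothetical usage tying read_approved_genes() and get_approved_gene_symbol() together; the synonyms file path and the gene symbol below are placeholders:

approved_by_name, by_prev_name, by_synonym = read_approved_genes('hgnc_synonyms.tsv')
name, status = get_approved_gene_symbol(approved_by_name, by_prev_name, by_synonym, 'MLL2')
if name is None:
    info('Could not resolve symbol: ' + str(status))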
def main():
    description = '''
The script writes all RefSeq features for requested genome build, and generates 3 files:
    all_features.{genome}.bed:
        Gene (protein_coding)
        Transcript (protein_coding and ncRNA)
        Exon (ncRNA)
        CDS (protein_coding)
    all_features.{genome}.canon.bed:
        The same, but taking canonical (or longest) transcripts only
    CDS.{genome}.bed
        CDS, canonical (or longest) transcripts only

Usage:
    ''' + __file__ + ''' hg19 [db.gtf]

     And db.gtf is either of the following:

     Ensembl GTF ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz
     1  pseudogene            gene        11869  14412  .  +  .  gene_id "ENSG00000223972"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene";
     1  processed_transcript  transcript  11869  14409  .  +  .  gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene"; transcript_name "DDX11L1-002"; transcript_source "havana";
     ...

     RefSeq GTF ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/H_sapiens/GFF/ref_GRCh38.p2_top_level.gff3.gz
     NC_000001.10    RefSeq          region       1       249250621       .       +       .       ID=id0;Name=1;Dbxref=taxon:9606;chromosome=1;gbkey=Src;genome=chromosome;mol_type=genomic DNA
     NC_000001.10    BestRefSeq      gene         11874   14409           .       +       .       ID=gene0;Name=DDX11L1;Dbxref=GeneID:100287102,HGNC:37102;description=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;gbkey=Gene;gene=DDX11L1;part=1%2F1;pseudo=true
     NC_000001.10    BestRefSeq      transcript   11874   14409           .       +       .       ID=rna0;Name=NR_046018.2;Parent=gene0;Dbxref=GeneID:100287102,Genbank:NR_046018.2,HGNC:37102;gbkey=misc_RNA;gene=DDX11L1;product=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;transcript_id=NR_046018.2
     NC_000001.10    BestRefSeq      exon         11874   12227           .       +       .       ID=id1;Parent=rna0;Dbxref=GeneID:100287102,Genbank:NR_046018.2,HGNC:37102;gbkey=misc_RNA;gene=DDX11L1;product=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;transcript_id=NR_046018.2
     ...

     RefSeq_knownGene.txt or UCSC_knownGene.txt (from http://genome.ucsc.edu/cgi-bin/hgTables)
     #hg19.knownGene.name  hg19.knownGene.chrom  hg19.knownGene.strand  hg19.knownGene.txStart  hg19.knownGene.txEnd  hg19.knownGene.exonCount  hg19.knownGene.exonStarts  hg19.knownGene.exonEnds  hg19.kgXref.geneSymbol
     uc001aaa.3	         chr1	               +	                  11873                   14409                 3                         11873,12612,13220,	      12227,12721,14409,	   DDX11L1
     ...

See more info in http://wiki.rd.astrazeneca.net/display/NG/SOP+-+Making+the+full+list+of+UCSC+exons+with+approved+HUGO+gene+symbols'''

    options = [
        # (['--bam'], dict(dest='bam', help='path to the BAM file to analyse',)),
    ]

    parser = OptionParser(description=description)
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()
    if len(args) == 0:
        parser.exit(1, 'Please provide genome name as the first argument')

    genome_name = args[0]
    chrom_order = ref.get_chrom_order(genome_name)
    canonical_transcripts_ids = ref.get_canonical_transcripts_ids(genome_name)
    if len(args) > 1:
        input_fpath = verify_file(args[1])
    else:
        input_fpath = ba.get_refseq_gene(genome_name)

    output_dirpath = ba.get_refseq_dirpath()
    synonyms_fpath = ba.get_hgnc_gene_synonyms()
    not_approved_fpath = join(output_dirpath, 'not_approved.txt')

    info('Reading the features...')
    with open_gzipsafe(input_fpath) as inp:
        if input_fpath.endswith('.gtf') or input_fpath.endswith('.gtf.gz'):
            gene_by_name_and_chrom = _proc_ensembl_gtf(inp, output_dirpath, chrom_order)
        elif input_fpath.endswith('.gff3') or input_fpath.endswith('.gff3.gz'):
            gene_by_name_and_chrom = _proc_refseq_gff3(inp, output_dirpath, chrom_order)
        else:
            gene_by_name_and_chrom = _proc_ucsc(inp, output_dirpath, chrom_order)

    if synonyms_fpath and DO_APPROVE:
        gene_by_name_and_chrom, not_approved_gene_names = _approve(gene_by_name_and_chrom, synonyms_fpath)

        info('')
        info('Not approved by HGNC - ' + str(len(not_approved_gene_names)) + ' genes.')
        if not_approved_fpath:
            with open(not_approved_fpath, 'w') as f:
                f.write('#Searched as\tStatus\n')
                f.writelines((l + '\n' for l in not_approved_gene_names))
            info('Saved not approved to ' + not_approved_fpath)

    info('Found:')
    info('  ' + str(len(gene_by_name_and_chrom)) + ' genes')

    genes = gene_by_name_and_chrom.values()

    coding_genes = [g for g in genes if any(t.coding for t in g.transcripts)]
    coding_transcripts = [t for g in coding_genes for t in g.transcripts if t.coding]
    rna_genes = [g for g in genes if all(not t.coding for t in g.transcripts)]
    rna_transcripts = [t for g in genes for t in g.transcripts if not t.coding]
    mixed_genes = [g for g in genes if any(not t.coding for t in g.transcripts) and any(t.coding for t in g.transcripts)]
    info('  ' + str(len(coding_genes)) + ' coding genes')
    info('  ' + str(len(coding_transcripts)) + ' coding transcripts')
    info('  ' + str(len(rna_genes)) + ' RNA genes')
    info('  ' + str(len(rna_transcripts)) + ' RNA transcripts')
    info('  ' + str(len(mixed_genes)) + ' genes with both coding and RNA transcripts')
    for g in coding_genes:
        g.coding = True
        g.biotype = 'protein_coding'
    for g in rna_genes:
        g.coding = False
        g.biotype = 'RNA'

    info()
    # info('Choosing genes with exons...')
    # genes = [g for g in genes if any(tx.exons for tx in g.transcripts)]

    info('Choosing canonical...')
    canon_genes = choose_canonical(genes, canonical_transcripts_ids)

    info()
    info('Sorting and printing all regions...')
    all_features_fpath = ba.get_all_features(genome_name)
    write_all_features(genes, all_features_fpath, canon_only=False)
    all_features_fpath = bgzip_and_tabix(all_features_fpath, tabix_parameters='-p bed')

    info()
    info('Sorting and printing canonical regions...')
    canon_output_fpath = ba.get_all_features_canonical(genome_name, gzip=False)
    write_all_features(canon_genes, canon_output_fpath, canon_only=True)
    canon_output_fpath = bgzip_and_tabix(canon_output_fpath, tabix_parameters='-p bed')

    info()
    info('Sorting and printing canonical CDS...')
    cds_output_fpath = ba.get_cds(genome_name)
    write_all_features(canon_genes, cds_output_fpath, canon_only=True, cds_only=True)

    # info()
    # info('Sorting and printing CDS for Seq2C (unique transcript per gene)...')
    # seq2c_output_fpath = ga.get_seq2c_cds(genome_name)
    # write_all_features(canon_genes, seq2c_output_fpath, canon_only=True, cds_only=True, seq2c_cds=True)

    info()
    info('Saved all regions to\n   ' + all_features_fpath + '\n   ' + canon_output_fpath + '\n   ' + cds_output_fpath)
def _proc_ensembl_gtf(inp, out, chr_order, additional_feature_list=None):
    if additional_feature_list is None:
        additional_feature_list = []

    info('additional_feature_list = ' + str(additional_feature_list))

    gene_by_name = OrderedDict()
    gene_by_id = OrderedDict()

    info('Parsing Ensembl input...')
    total_lines = 0
    total_non_coding_genes = 0

    for l in inp:
        if l and not l.startswith('#'):
            chrom, _, feature, start, end, _, strand, _, props_line = l.replace('\n', '').split('\t')

            # if is_local():
            #     if chrom != '21':
            #         continue

            total_lines += 1
            if total_lines % 1000 == 0:
                info(str(total_lines // 1000) + 'k lines, ' + str(len(gene_by_name)) + ' genes found')
                sys.stdout.flush()

            try:
                _prop_dict = dict((t.strip().split(' ')[0], ' '.join(t.strip().split(' ')[1:]))
                                  for t in props_line.split(';') if t.strip())
            except ValueError:
                sys.stderr.write(format_exc())
                sys.stderr.write(l)
                continue  # skip the malformed line: _prop_dict was not built for it

            gene_symbol = _rm_quotes(_prop_dict['gene_name'])
            gene_id = _rm_quotes(_prop_dict['gene_id'])
            gene_biotype = _rm_quotes(_prop_dict['gene_biotype'])
            gene_source = _rm_quotes(_prop_dict['gene_source'])

            # if gene_symbol == 'PTENP1':
            #     sys.stderr.write('PTENP1\n')

            if not ALL_EXONS and gene_biotype not in [
                'protein_coding',
                'nonsense_mediated_decay',
                'non_stop_decay',
                'processed_transcript',
                'polymorphic_pseudogene',
                'sense_intronic',
                'sense_overlapping',
                'antisense',

            ] and not any(b in gene_biotype for b in ['RNA', 'IG_', 'TR_']):
                total_non_coding_genes += 1
                continue

            full_feature_list = ['gene', 'CDS', 'stop_codon', 'exon'] + additional_feature_list
            if ALL_EXONS:
                full_feature_list = ['gene', 'exon']
            # sys.stderr.write('Full feature list: ' + str(full_feature_list) + '\n')
            if feature not in full_feature_list:
                continue

            start, end = int(start) - 1, int(end)

            if end <= start:
                info('Error: start >= end: ' + l)
                continue

            chrom = parse_ensembl_chrom(chrom)
            if not chrom:
                continue

            if feature == 'gene':
                # assert gene_biotype == biotype, 'Gene: gene_biotype "' + gene_biotype + '"
                # do not match biotype "' + biotype + '" for ' + gene_symbol

                gene = Gene(chrom, chr_order.get(chrom), start, end, gene_symbol, strand,
                            gene_biotype, gene_id, gene_source)

                if gene.name in gene_by_name:
                    prev_gene = gene_by_name[gene.name]

                    if gene.source != prev_gene.source:
                        err('    Duplicated gene in different databases:')
                        err('        This: ' + gene.__repr__())
                        err('        Prev: ' + prev_gene.__repr__())
                        # answer = raw_input('Which one to pick? This (1), prev (2), longest (Enter): ')
                        #
                        # if answer == '1' or answer == '' and gene.end - gene.start >
                        # prev_gene.end - prev_gene.start:
                        #     del gene_by_name[prev_gene.name]
                        #     del gene_by_id[prev_gene.db_id]
                        #
                        # else:
                        #     continue

                        if gene.source == 'ensembl' or prev_gene.source == 'havana':
                            del gene_by_name[prev_gene.name]
                            del gene_by_id[prev_gene.db_id]
                            err('        Picking up this one.')

                        if prev_gene.source == 'ensembl' or gene.source == 'havana':
                            err('        Picking up previous one.')
                            continue

                    else:
                        err('    Duplicated gene in ' + gene.source + ':')
                        err('        ' + gene.__repr__())
                        prev_gene.start = min(prev_gene.start, gene.start)
                        prev_gene.end = max(prev_gene.end, gene.end)
                        prev_gene.feature = 'Multi_Gene'
                        continue

                    err('')

                gene_by_name[gene_symbol] = gene
                gene_by_id[gene_id] = gene

            elif feature in ['CDS', 'stop_codon'] \
                    or feature == 'exon' and ('RNA' in gene_biotype or ALL_EXONS) \
                    or feature in additional_feature_list:
                assert gene_symbol in gene_by_name, 'Error: ' + feature + ' record before gene record ' + \
                        gene_symbol + ', ' + gene_id + '; gene_by_name: ' + str(gene_by_name.keys())
                gene = gene_by_name[gene_symbol]
                if gene.db_id == gene_id:  # db_id holds the GTF gene_id (gene_by_id is keyed and deleted by it above)
                    assert gene_biotype == gene.biotype, feature + ': gene_biotype "' + gene_biotype + \
                         '" do not match biotype "' + gene.biotype + '" for ' + gene_symbol
                    exon = Exon(gene, start, end, gene_biotype, feature)
                    gene.exons.append(exon)

    info()
    info(
        'Processed ' +
        str(total_lines) + ' lines, ' +
        str(total_non_coding_genes) + ' non-coding genes skipped, ' +
        str(len(gene_by_name)) + ' coding genes found')
    info()
    return gene_by_name
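A standalone illustration of the attribute-column parsing used above, run against the sample Ensembl GTF line quoted in main()'s description (assuming _rm_quotes() simply strips the surrounding double quotes):

props_line = ('gene_id "ENSG00000223972"; gene_name "DDX11L1"; '
              'gene_source "ensembl_havana"; gene_biotype "pseudogene";')
prop_dict = dict((t.strip().split(' ')[0], ' '.join(t.strip().split(' ')[1:]))
                 for t in props_line.split(';') if t.strip())
print(prop_dict['gene_name'].strip('"'))     # DDX11L1
print(prop_dict['gene_biotype'].strip('"'))  # pseudogene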
def choose_canonical(genes, canonical_transcripts_ids):
    not_found_in_canon_coding_num = 0
    not_found_in_canon_coding_num_one_transcript = 0
    not_found_in_canon_rna_num = 0
    not_found_in_canon_other_num = 0
    many_canon_coding_num = 0
    many_canon_rna_num = 0
    many_canon_other_num = 0

    canon_genes = []
    for g in genes:
        _canon_tx = []
        for t in g.transcripts:
            if t.transcript_id in canonical_transcripts_ids:
                t.is_canonical = True
                _canon_tx.append(t)

        if len(_canon_tx) > 1:
            if any(t.coding for t in g.transcripts):
                many_canon_coding_num += 1
                # Checking overlapping
                for i, t1 in enumerate(_canon_tx):
                    for j in range(i + 1, len(_canon_tx)):
                        t2 = _canon_tx[j]
                        if t1.start <= t2.start < t1.end or t1.start <= t2.end < t1.end:
                            err('Transcripts ' + t1.transcript_id + ' (' + str(t1.start) + ':' + str(t1.end) + ') and ' +
                                                 t2.transcript_id + ' (' + str(t2.start) + ':' + str(t2.end) + ') ' +
                                ' in gene ' + g.name + ' ' + g.chrom + ' overlap')
            elif any(not t.coding for t in g.transcripts):
                many_canon_rna_num += 1
            else:
                many_canon_other_num += 1

        if len(_canon_tx) == 0:
            if any(t.coding for t in g.transcripts):
                not_found_in_canon_coding_num += 1
                if len(g.transcripts) == 1:
                    not_found_in_canon_coding_num_one_transcript += 1
                # longest_t = max(g.transcripts, key=Transcript.length)
                # longest_t.is_canonical = True
            elif any(not t.coding for t in g.transcripts):
                not_found_in_canon_rna_num += 1
            else:
                not_found_in_canon_other_num += 1

        g.canonical_transcripts = [t for t in g.transcripts if t.is_canonical]
        if g.canonical_transcripts:
            canon_genes.append(g)

    info('Coding genes with canonical transcripts: ' +
         str(sum(1 for g in canon_genes if any(t.coding for t in g.canonical_transcripts))))
    info('Coding canonical transcripts: ' +
         str(sum(1 for g in canon_genes for t in g.canonical_transcripts if t.coding)))
    info('RNA genes with canonical transcripts: ' +
         str(sum(1 for g in canon_genes if any(not t.coding for t in g.canonical_transcripts))))
    info('RNA canonical transcripts: ' +
         str(sum(1 for g in canon_genes for t in g.canonical_transcripts if not t.coding)))

    info()
    info('Coding genes with no canonical transcripts (picking longest out of the rest): ' + str(not_found_in_canon_coding_num))
    info('RNA genes with no canonical transcripts (skipping all): ' + str(not_found_in_canon_rna_num))
    info('Other genes with no canonical transcripts (skipping all): ' + str(not_found_in_canon_other_num))
    info('Coding genes with many canonical transcripts (picking longest): ' + str(many_canon_coding_num))
    info('RNA genes with many canonical transcripts (keeping all): ' + str(many_canon_rna_num))
    info('Other genes with many canonical transcripts (keeping all): ' + str(many_canon_other_num))

    return canon_genes
Example no. 58
0
def run_snakemake(snakefile, conf, jobs=None, output_dir=None, forcerun=None,
                  unlock=False, dryrun=False, target_rules=None, cluster=None, cluster_cmd=None,
                  log_dir=None, dag=None, report=None, restart_times=None):

    conf['total_cores'] = jobs

    #########################
    #### Setting cluster ####
    #########################

    cluster_param = ''
    cluster_log_dir = ''
    if cluster or cluster_cmd:
        assert log_dir, 'For cluster run, must also specify log_dir'
        if cluster_cmd:
            cluster_param = f' --cluster "{cluster_cmd}"'
        else:
            cluster_log_dir = safe_mkdir(join(log_dir, 'cluster'))
            cluster_param = make_cluster_cmdl(cluster_log_dir, 'umccrise')

    ##########################
    #### Preparing config ####
    ##########################

    if log_dir:
        safe_mkdir(log_dir)
        conf_f = open(join(log_dir, '.conf.yaml'), 'w')
    else:
        conf_f = tempfile.NamedTemporaryFile(mode='wt', delete=False)
    yaml.dump(conf, conf_f)
    conf_f.close()

    ###############################
    #### Building command line ####
    ###############################
    if forcerun:
        forcerun = " ".join(forcerun.split(','))

    cmd = (
        f'snakemake '
        f'{" ".join(flatten([target_rules])) if target_rules else ""} ' +
        f'--snakefile {snakefile} '
        f'--printshellcmds '
        f'{"--dryrun " if dryrun else ""}'
        f'{"--dag " if dag else ""}'
        f'{f"--report {report} " if report else ""}'
        f'{f"--directory {output_dir} " if output_dir else ""}'
        f'{f"-j {jobs} " if jobs else ""}'
        f'--rerun-incomplete '
        f'{f"--restart-times {restart_times} " if restart_times else ""}'
        f'{cluster_param} '
        f'--configfile {conf_f.name} ' +
        f'{"--dag " if dag else ""}'
        f'{f"--forcerun {forcerun}" if forcerun else ""}'
    )

    #################
    #### Running ####
    #################

    if unlock:
        print('* Unlocking previous run... *')
        run_simple(cmd + ' --unlock')
        print('* Now rerunning *')

    try:
        run_simple(cmd)
    except subprocess.CalledProcessError:
        logger.error('--------')
        logger.error(f'Error: snakemake returned a non-zero status. Working directory: {output_dir}')
        if cluster_log_dir:
            run_simple(f'chmod -R a+r {cluster_log_dir}', silent=True)
            logger.error(f'Review cluster job logs in {cluster_log_dir}')
        sys.exit(1)
    except KeyboardInterrupt:
        logger.error('--------')
        logger.error(f'Interrupted. Fixing logs permissions. Working directory: {output_dir}')
        if cluster_log_dir:
            run_simple(f'chmod -R a+r {cluster_log_dir}', silent=True)
            logger.error(f'Review cluster job logs in {cluster_log_dir}')
        sys.exit(1)
    else:
        logger.info('--------')
        if cluster_log_dir:
            run_simple(f'chmod -R a+r {cluster_log_dir}', silent=True)
        logger.info(f'Finished. Output directory: {output_dir}')
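A hypothetical invocation of run_snakemake(); the Snakefile path, config dict and directories below are placeholders rather than values from this project:

run_snakemake(
    snakefile='Snakefile',
    conf={'genome': 'hg19'},
    jobs=8,
    output_dir='results',
    log_dir='results/log',
    dryrun=True,  # print the execution plan without running any rules
)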