Esempio n. 1
0
def main(cmdline=None):
    """Collect Picard MarkDuplicates metrics for a set of libraries.

    Reads one metrics file per library from its analysis directory and
    either writes the combined table to --output or prints it.

    :param list cmdline: argument strings to parse instead of sys.argv
    """
    parser = ArgumentParser()
    parser.add_argument('-l',
                        '--library',
                        required=True,
                        action='append',
                        help="library table to load")
    parser.add_argument('-o', '--output', help='filename to write report to')
    args = parser.parse_args(cmdline)

    libraries = load_library_tables(args.library)

    collected = []
    for library_id, library in libraries.iterrows():
        triplet = genome_name_from_library(library)
        metrics_filename = (library.analysis_name + '-' + triplet
                            + '_picard_markdup.metrics')
        metrics_path = Path(library.analysis_dir) / metrics_filename
        # Missing metrics files are reported but do not abort the run.
        if not metrics_path.exists():
            print('{} is missing. Skipping'.format(metrics_path))
            continue
        collected.append(
            parse_picard_metric(metrics_path, library_id=library_id))

    metrics = pandas.DataFrame(collected)
    metrics.set_index('LIBRARY', inplace=True)

    if args.output:
        metrics.to_csv(args.output, sep='\t')
    else:
        print(metrics)
Esempio n. 2
0
    def test_genome_name_from_library_dict(self):
        """A dict of genome, annotation and sex builds the triplet name."""
        library = {
            'genome': 'mm10',
            'annotation': 'M21_minimal',
            'sex': 'male',
        }
        expected = 'mm10-M21_minimal-male'

        self.assertEqual(models.genome_name_from_library(library), expected)
        # Anything that is not a mapping/Series must be rejected.
        self.assertRaises(ValueError, models.genome_name_from_library, 10)
Esempio n. 3
0
def build_hash_tree(library_filename):
    """Build a mapping from library id to its alignment hashes.

    :param str library_filename: path of a library table to load
    :returns: dict mapping each library id to the result of
        hash_alignments() on that library's genome BAM file
    """
    table = load_library_tables([library_filename])

    hashes = {}
    for library_id, row in table.iterrows():
        # BAM files follow the <analysis_name>-<genome triplet>_genome.bam
        # naming convention used throughout the pipeline.
        triplet = genome_name_from_library(row)
        bam_name = row.analysis_name + '-' + triplet + '_genome.bam'
        bam_pathname = os.path.join(row.analysis_dir, bam_name)
        hashes[library_id] = hash_alignments(bam_pathname)

    return hashes
def link_genome_bams(libraries, output_dir):
    """Symlink each library's genome BAM into a per-library directory.

    :param DataFrame libraries: library table indexed by library id
    :param str output_dir: directory under which one sanitized-id
        subdirectory per library is created
    """
    for library_id, library in libraries.iterrows():
        clean_library_id = sanitize_library_suffix(library_id)
        target_dir = os.path.join(output_dir, clean_library_id)
        if not os.path.exists(target_dir):
            os.mkdir(target_dir)

        name = make_bam_track_name(library, library.analysis_dir)
        source_pathname = os.path.join(library.analysis_dir, name)
        # Include the '-' separator between the library id and the genome
        # triplet, matching link_rsem and the
        # <name>-<triplet>_genome.bam convention used elsewhere.
        target_name = clean_library_id + '-' + genome_name_from_library(
            library) + '_genome.bam'
        cur_dir = os.getcwd()
        os.chdir(target_dir)
        # try/finally so an error while linking can't leave the process
        # stranded in target_dir.
        try:
            if os.path.exists(source_pathname) and not os.path.exists(target_name):
                print(source_pathname, '->', target_name)
                os.symlink(source_pathname, target_name)
        finally:
            os.chdir(cur_dir)
Esempio n. 5
0
def make_bigwig_track_name(library, signal_type, analysis_root):
    """Find the bigWig track for a library and return its relative path.

    :param Series library: row from a library table DataFrame
    :param str signal_type: either uniq or all to specify bigwig type.
    :param str analysis_root: root directory to be searching for track files
    :returns: path of the bigWig file relative to analysis_root, or None
        when no track file is found (a warning is logged in that case)
    """
    assert signal_type in ('uniq', 'all')

    genome_triplet = genome_name_from_library(library)
    track_name = library.analysis_name + '-' + genome_triplet + '_' + signal_type + '.bw'

    # The track may live in the library's own analysis directory or
    # directly under the analysis root; first match wins.
    candidates = (
        os.path.join(library.analysis_dir, track_name),
        os.path.join(analysis_root, track_name),
    )
    for candidate in candidates:
        if os.path.exists(candidate):
            return return_subpath(candidate, analysis_root)

    logger.warning("Couldn't find track file %s", track_name)
def link_rsem(libraries, output_dir):
    """Symlink each library's RSEM result files into per-library directories.

    :param DataFrame libraries: library table indexed by library id
    :param str output_dir: directory under which one sanitized-id
        subdirectory per library is created
    """
    rsem_extensions = (
        '_anno_rsem.genes.results',
        '_anno_rsem.isoforms.results',
    )
    for library_id, library in libraries.iterrows():
        clean_library_id = sanitize_library_suffix(library_id)
        target_dir = os.path.join(output_dir, clean_library_id)
        if not os.path.exists(target_dir):
            os.mkdir(target_dir)

        source_dir = library.analysis_dir
        previous_dir = os.getcwd()
        # Links are created while chdir'ed into target_dir, so existence
        # checks on target_name resolve against that directory.
        os.chdir(target_dir)
        for extension in rsem_extensions:
            suffix = '-' + genome_name_from_library(library) + extension
            source_name = library_id + suffix
            target_name = clean_library_id + suffix
            source_pathname = os.path.join(source_dir, source_name)
            link_needed = (os.path.exists(source_pathname)
                           and not os.path.exists(target_name))
            if link_needed:
                print(source_pathname, '->', target_name)
                os.symlink(source_pathname, target_name)
        os.chdir(previous_dir)
Esempio n. 7
0
def make_bam_track_name(library, analysis_root=None):
    """Generate the base path where the bam track is.

    :param Series library: row from a library table DataFrame
    :param str analysis_root: root directory to be searching for track files;
        when None, only library.analysis_dir is searched
    :returns: path of bam file relative to analysis_root, or None when no
        track file is found (a warning is logged in that case)
    """
    genome_triplet = genome_name_from_library(library)
    track_name = library.analysis_name + '-' + genome_triplet + '_genome.bam'
    old_name = 'Aligned.sortedByCoord.out.bam'
    to_check = [os.path.join(library.analysis_dir, track_name)]
    if analysis_root is not None:
        # os.path.join(None, ...) raises TypeError, so only include the
        # analysis_root candidate when a root was actually supplied --
        # otherwise the documented default of None was unusable.
        to_check.append(os.path.join(analysis_root, track_name))
    to_check.append(os.path.join(library.analysis_dir, old_name))
    for pathname in to_check:
        if os.path.exists(pathname):
            bai = pathname + '.bai'
            if not os.path.exists(bai):
                logger.warning('Missing index file for {}'.format(pathname))
            # NOTE(review): return_subpath may also require a non-None
            # analysis_root -- confirm against its definition.
            return return_subpath(pathname, analysis_root)

    logger.warning("Couldn't find track file %s", track_name)
Esempio n. 8
0
    def test_genome_name_from_library_series(self):
        """Rows loaded from a library table yield the genome triplet name."""
        mm10tsv = resource_filename(__name__, 'library-mm10-se.tsv')
        mm10 = models.load_library_tables([mm10tsv])

        expected_triplets = {
            '12304': 'mm10-M4-female',
            '12309': 'mm10-M4-male',
        }
        for library_id, triplet in expected_triplets.items():
            self.assertEqual(
                models.genome_name_from_library(mm10.loc[library_id]),
                triplet)