Example 1
0
def merge(in_files,
          out_file,
          attach_read_group_from_file_name=False,
          header_file=None,
          compression_level=9,
          num_compression_threads=0):
    """ Merge alignment files into a single BAM with `samtools merge`, then index it.

    :param in_files: dict or list of input alignment file paths, flattened via `flatten_input`.
    :param out_file: path where the merged BAM will be written (`-f` forces overwrite).
    :param attach_read_group_from_file_name: if True pass `-r` so a read group is attached per input file name.
    :param header_file: optional SAM header file supplied via `-h`.
    :param compression_level: output compression level passed to `-l`.
    :param num_compression_threads: additional compression threads passed to `-@`.
    """
    args = [
        'samtools', 'merge',
        '-c', '-p', '-f',
        '-l', compression_level,
        '-@', num_compression_threads,
    ]

    if attach_read_group_from_file_name:
        args.append('-r')

    if header_file is not None:
        args += ['-h', header_file]

    args.append(out_file)
    args.extend(flatten_input(in_files))

    pypeliner.commandline.execute(*args)

    # Build the companion BAM index alongside the merged output.
    pypeliner.commandline.execute(
        'samtools', 'index', out_file, _get_bam_index_filename(out_file))
Example 2
0
def concatenate_tables(in_files,
                       out_file,
                       ignore_empty_files=False,
                       use_gzip=True):
    """ Concatenate tab separated tables into a single table.

    :param in_files: dict or list of input table paths, flattened via `flatten_input`.
    :param out_file: path where the concatenated table will be written.
    :param ignore_empty_files: if True, inputs that pandas reports as having
        no data at all (EmptyDataError) are skipped instead of raising.
    :param use_gzip: if True the output is gzip compressed.
    """
    if use_gzip:
        # NOTE(review): GzipFile only writes bytes; on Python 3 pandas
        # `to_csv` emits str, so this path assumes Python 2 semantics —
        # TODO confirm target interpreter.
        open_func = gzip.GzipFile
    else:
        open_func = open

    # Only the first non-empty input contributes the header line.
    write_header = True

    with open_func(out_file, 'w') as out_fh:
        for file_name in flatten_input(in_files):
            try:
                df = pd.read_csv(file_name, sep='\t')
            except EmptyDataError:
                if ignore_empty_files:
                    continue
                # Bare raise preserves the original traceback (a plain
                # `raise e` would discard it on Python 2).
                raise

            # A file with a header but zero rows is skipped silently.
            if df.empty:
                continue

            df.to_csv(out_fh, header=write_header, index=False, sep='\t')
            write_header = False
Example 3
0
def write_header_file(in_files, out_file, seq_info):
    """ Write a header-only SAM file derived from the first input alignment file.

    :param in_files: dict or list of input alignment file paths; only the
        first (per `flatten_input` ordering) is read for its header.
    :param out_file: path where the header-only SAM file will be written.
    :param seq_info: dict of key/value pairs copied onto every SQ record.
    """
    bam = pysam.AlignmentFile(flatten_input(in_files)[0],
                              mode='r',
                              check_sq=False)

    header = bam.header.copy()

    bam.close()

    # Promote key:value pairs embedded in the first PG record's CL field to
    # proper PG fields. Split on the FIRST ':' only — values such as paths or
    # timestamps may themselves contain colons, and an unbounded split would
    # raise "too many values to unpack".
    for field in header['PG'][0]['CL'].split('\t'):
        if ':' in field:
            key, value = field.split(':', 1)

            header['PG'][0][key] = value

    # Replace the PG chain with a single synthetic bwa entry, keeping the
    # original version string.
    header['PG'] = [{
        'ID': 'bwa',
        'VN': header['PG'][0]['VN'],
        'CL': 'bwa aln; bwa sampe'
    }]

    # Annotate every reference sequence record with the supplied metadata.
    for entry in header['SQ']:
        for key in seq_info:
            entry[key] = seq_info[key]

    # mode='wh' writes the header only, producing a SAM header file.
    pysam.AlignmentFile(out_file, header=header, mode='wh').close()
Example 4
0
def merge_vcfs(in_files, out_file):
    """ Merge multiple VCF files into a single VCF.

    :param in_files: dict or list of input VCF paths, flattened via `flatten_input`.
    :param out_file: path where the merged VCF will be written.
    """
    in_files = flatten_input(in_files)

    with open(out_file, 'w') as out_fh:
        write_header(out_fh)

        writer = csv.DictWriter(
            out_fh,
            ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO'],
            delimiter='\t')

        reader = MultiVcfReader(in_files)

        # Ensure the reader is closed even if writing a row fails mid-stream.
        try:
            for row in reader:
                writer.writerow({
                    'CHROM': row.chrom,
                    'POS': row.coord,
                    'ID': '.',
                    'REF': row.ref,
                    'ALT': row.alt,
                    'QUAL': '.',
                    'FILTER': '.',
                    'INFO': '.'
                })
        finally:
            reader.close()
Example 5
0
def concatenate(in_files, out_file):
    """ Concatenate gzip-compressed FASTQ files.

    :param in_files: dict or list of gzip-compressed input paths, flattened
        via `flatten_input`.
    :param out_file: path of the gzip-compressed concatenated output.
    """
    chunk_size = 1024 * 1024

    with gzip.GzipFile(out_file, 'w') as out_fh:
        for in_file in flatten_input(in_files):
            with gzip.GzipFile(in_file, 'r') as in_fh:
                # Stream in fixed-size chunks so arbitrarily large inputs do
                # not have to fit in memory (the original read() slurped the
                # whole decompressed file at once).
                while True:
                    chunk = in_fh.read(chunk_size)
                    if not chunk:
                        break
                    out_fh.write(chunk)
Example 6
0
def concatenate_bcf(in_files, out_file):
    """ Fast concatenation of BCF file using `bcftools`.

    :param in_files: dict with values being files to be concatenated. Files will be concatenated based on sorted order of keys.

    :param out_file: path where output file will be written in VCF format.

    """
    # -a allows overlapping inputs; -O b selects compressed BCF output.
    args = ['bcftools', 'concat', '-a', '-O', 'b', '-o', out_file]
    args.extend(flatten_input(in_files))

    pypeliner.commandline.execute(*args)

    # Build both index flavours for the concatenated output.
    index_vcf(out_file)
    index_bcf(out_file)
Example 7
0
def mark_duplicates(in_files,
                    out_file,
                    compression_level=9,
                    hash_table_size=262144,
                    io_buffer_size=128,
                    num_threads=1,
                    overflow_list_size=200000,
                    tmp_dir=None):
    """ Mark duplicate reads with `sambamba markdup`, then index the output BAM.

    :param in_files: dict or list of input BAM paths, flattened via `flatten_input`.
    :param out_file: path where the duplicate-marked BAM will be written.
    :param compression_level: output compression level (`-l`).
    :param hash_table_size: sambamba hash table size.
    :param io_buffer_size: sambamba I/O buffer size.
    :param num_threads: worker threads (`-t`).
    :param overflow_list_size: sambamba overflow list size.
    :param tmp_dir: scratch directory; when None a temporary directory is
        created and removed afterwards.
    """
    # Initialise before the try block so the finally clause cannot hit a
    # NameError (masking the real error) if tempfile.mkdtemp() itself raises.
    clean_up = False

    try:
        if tmp_dir is None:
            tmp_dir = tempfile.mkdtemp()

            # Only remove directories we created ourselves.
            clean_up = True

        cmd = [
            'sambamba',
            'markdup',
            '-p',
            '-l',
            compression_level,
            '-t',
            num_threads,
            '--io-buffer-size={0}'.format(io_buffer_size),
            '--hash-table-size={0}'.format(hash_table_size),
            '--overflow-list-size={0}'.format(overflow_list_size),
            '--tmpdir',
            tmp_dir,
        ]

        cmd.extend(flatten_input(in_files))

        cmd.append(out_file)

        pypeliner.commandline.execute(*cmd)

    finally:
        if clean_up:
            shutil.rmtree(tmp_dir)

    pypeliner.commandline.execute('samtools', 'index', out_file,
                                  _get_bam_index_filename(out_file))
Example 8
0
def concatenate_vcf(in_files, out_file, allow_overlap=False, docker_config=None):
    """ Fast concatenation of VCF file using `bcftools`.

    :param in_files: dict with values being files to be concatenated. Files will be concatenated based on sorted order of keys.

    :param out_file: path where output file will be written in VCF format.

    :param allow_overlap: if True pass `-a` so overlapping inputs are allowed.

    :param docker_config: optional dict of keyword arguments forwarded to
        `pypeliner.commandline.execute` and the index helpers.
    """
    # Avoid the mutable-default-argument pitfall: a shared {} default could
    # be mutated by callees and leak state across calls.
    if docker_config is None:
        docker_config = {}

    if allow_overlap:
        cmd = ['bcftools', 'concat', '-a', '-O', 'z', '-o', out_file]
    else:
        cmd = ['bcftools', 'concat', '-O', 'z', '-o', out_file]

    cmd += flatten_input(in_files)

    pypeliner.commandline.execute(*cmd, **docker_config)

    index_vcf(out_file, docker_config=docker_config)
    index_bcf(out_file, docker_config=docker_config)
Example 9
0
def concatenate_tables(in_files,
                       out_file,
                       drop_duplicates=False,
                       in_memory=True,
                       non_numeric_as_category=True):
    """ Concatenate tables, dispatching to an in-memory or on-disk implementation.

    :param in_files: dict or list of input table paths, flattened via `flatten_input`.
    :param out_file: path where the concatenated table will be written.
    :param drop_duplicates: if True remove duplicate rows (forces the in-memory path).
    :param in_memory: if True concatenate in memory, otherwise on disk.
    :param non_numeric_as_category: forwarded to the underlying implementation.
    """
    file_list = flatten_input(in_files)

    # Dropping duplicates is only supported in memory, so that flag forces
    # the in-memory implementation regardless of `in_memory`.
    if drop_duplicates or in_memory:
        _concatenate_tables_in_memory(
            file_list,
            out_file,
            drop_duplicates=drop_duplicates,
            non_numeric_as_category=non_numeric_as_category)
    else:
        _concatenate_tables_on_disk(
            file_list,
            out_file,
            non_numeric_as_category=non_numeric_as_category)
Example 10
0
def merge(in_files, out_file, index_file=None):
    """ Merge BAM files with Picard MergeSamFiles, optionally indexing the result.

    :param in_files: dict or list of input BAM paths, flattened via `flatten_input`.
    :param out_file: path where the merged BAM will be written.
    :param index_file: optional path where a BAM index is written with
        `samtools index`; when None no index is produced.
    """
    # Limit glibc malloc arenas to keep the JVM's memory footprint down.
    os.environ['MALLOC_ARENA_MAX'] = '4'

    cmd = [
        'picard',
        '-XX:ParallelGCThreads=1',
        '-Xmx8g',
        'MergeSamFiles',
        'OUTPUT={0}'.format(out_file),
        'VALIDATION_STRINGENCY=LENIENT',
    ]

    for file_name in flatten_input(in_files):
        cmd.append('INPUT={0}'.format(file_name))

    pypeliner.commandline.execute(*cmd)

    # BUG FIX: the index command must only run when requested. Previously the
    # second execute ran unconditionally, so with index_file=None the whole
    # picard merge was executed a second time.
    if index_file is not None:
        pypeliner.commandline.execute('samtools', 'index', out_file, index_file)
Example 11
0
def merge(in_files,
          reference_genome_fasta_file,
          out_file,
          attach_read_group_from_file_name=False,
          header_file=None,
          compression_level=9,
          num_compression_threads=0):
    """ Merge alignment files into a single CRAM with `samtools merge`.

    :param in_files: dict or list of input alignment file paths, flattened via `flatten_input`.
    :param reference_genome_fasta_file: reference FASTA required for CRAM encoding (`--reference`).
    :param out_file: path where the merged CRAM will be written (`-f` forces overwrite).
    :param attach_read_group_from_file_name: if True pass `-r` so a read group is attached per input file name.
    :param header_file: optional SAM header file supplied via `-h`.
    :param compression_level: output compression level passed to `-l`.
    :param num_compression_threads: additional compression threads passed to `-@`.
    """
    args = [
        'samtools', 'merge',
        '-c', '-p', '-f',
        '-l', compression_level,
        '-@', num_compression_threads,
        '--output-fmt', 'cram',
        '--reference', reference_genome_fasta_file,
    ]

    if attach_read_group_from_file_name:
        args.append('-r')

    if header_file is not None:
        args += ['-h', header_file]

    args.append(out_file)
    args.extend(flatten_input(in_files))

    pypeliner.commandline.execute(*args)