def main():
    '''It merges the alignments of a working directory into one sorted bam.

    The working directory, the output path and the reference are taken from
    set_parameters(). The intermediate files are written to a temporary
    directory that is closed even when one of the steps fails.
    '''
    # set parameters
    work_dir, output, reference = set_parameters()

    # make a working temporary directory
    temp_dir = NamedTemporaryDir()
    try:
        # add the header and the readgroup tag to each alignment in the bams
        add_header_and_tags_bams(work_dir, temp_dir.name)

        # merge all the sams in one
        sams = get_opened_sams_from_dir(temp_dir.name)
        temp_sam = NamedTemporaryFile()
        merge_sam(sams, temp_sam, reference)
        # the next step reads the sam by name, so nothing should remain in
        # the python buffer. It is harmless if merge_sam already flushed it
        temp_sam.flush()

        # convert the sam into a (temporary) bam
        temp_bam = NamedTemporaryFile(suffix='.bam')
        sam2bam(temp_sam.name, temp_bam.name)

        # finally we need to sort the bam
        sort_bam_sam(temp_bam.name, output)

        # and make an index of the bam
        call(['samtools', 'index', output], raise_on_error=True)
    finally:
        temp_dir.close()
def main():
    '''It creates a samtools pileup from one or several bam files.

    The bam files, the open pileup output file and the reference path are
    taken from set_parameters(). When more than one bam is given they are
    merged first into a temporary bam.

    It raises a RuntimeError if any of the samtools steps fails.
    '''
    # set parameters
    bamfiles, pileup, reffile = set_parameters()

    temp_dir = NamedTemporaryDir()
    try:
        # merge bam files
        if len(bamfiles) == 1:
            merged_bam_file = bamfiles[0]
        else:
            merged_bam_file = os.path.join(temp_dir.name, 'merged.bam')
            # list.extend() returns None, so the command has to be built
            # by concatenation
            cmd = ['samtools', 'merge', merged_bam_file] + list(bamfiles)
            stdout, stderr, retcode = call(cmd)
            if retcode:
                raise RuntimeError('samtools pileup - step error: %s' % stderr)

        # multiple alignment
        cmd = ['samtools', 'pileup', '-f', reffile, merged_bam_file]
        stdout, stderr, retcode = call(cmd)
        if retcode:
            msg = 'Error:\nstep:%s\n error: %s' % (" ".join(cmd), stderr)
            raise RuntimeError(msg)
        pileup.write(stdout)
    finally:
        # the output and the temporary directory are released on error too
        try:
            pileup.close()
        finally:
            temp_dir.close()
Example #3
0
def create_sam_reference_index(reference_fpath):
    '''It makes sure that the reference has a samtools faidx index.

    The index is looked for in reference_fpath + '.fai'. If that file exists
    nothing is done, otherwise samtools faidx is run on the reference.
    '''
    fai_fpath = reference_fpath + '.fai'
    if not os.path.exists(fai_fpath):
        call(['samtools', 'faidx', reference_fpath], raise_on_error=True)
def map_reads_with_gmap(reference_fpath, reads_fpath, out_bam_fpath,
                        parameters):
    '''It maps the reads with gmap and writes the result to out_bam_fpath.

    parameters is a dict. 'threads' is required, 'tmp_dir' and
    'unmapped_fhand' are optional. The gmap database is created with
    create_gmap_reference when its .chromosome file is not found. When an
    unmapped_fhand is given the sam is passed through get_out_unmapped
    before being converted to bam.
    '''
    threads = parameters['threads']
    tmp_dir = parameters.get('tmp_dir')

    ref_dir, ref_fname = os.path.split(reference_fpath)
    ref_dir = ref_dir or '.'
    ref_name = ref_fname.split('.')[0]

    # the gmap database lives in a directory named after the reference
    chromosome_fpath = os.path.join(ref_dir, ref_name,
                                    ref_name + '.chromosome')
    if not os.path.exists(chromosome_fpath):
        create_gmap_reference(ref_dir, reference_fpath, ref_name, parameters)

    # this gmap option doesn't detect deletions close to introns
    cmd = ['gmap', '-d', ref_name, '-D', ref_dir, '-f', 'samse',
           '--canonical-mode=0']
    if threads:
        cmd += ['-t', str(threads)]
    cmd.append(reads_fpath)

    sam_fhand = NamedTemporaryFile(suffix='.sam', dir=tmp_dir)
    call(cmd, stdout=sam_fhand, raise_on_error=True)

    unmapped_fhand = parameters.get('unmapped_fhand')
    if unmapped_fhand is not None:
        filtered_fhand = NamedTemporaryFile(dir=tmp_dir, suffix='.sam')
        get_out_unmapped(sam_fhand, unmapped_fhand, filtered_fhand)
        sam_fhand = filtered_fhand

    sam2bam(sam_fhand.name, out_bam_fpath)
    sam_fhand.close()
Example #5
0
def bamsam_converter(input_fhand, output_fhand, java_conf=None):
    '''It converts between sam and bam using picard SamFormatConverter.

    Despite their names, input_fhand and output_fhand are used as paths:
    they are concatenated to the INPUT= and OUTPUT= picard arguments.
    '''
    # the jar directory is the one guessed for SortSam.jar
    jar_dir = guess_jar_dir('SortSam.jar', java_conf)
    converter_jar = os.path.join(jar_dir, 'SamFormatConverter.jar')

    cmd = java_cmd(java_conf)
    cmd += ['-jar', converter_jar, 'INPUT=' + input_fhand,
            'OUTPUT=' + output_fhand]
    call(cmd, raise_on_error=True, add_ext_dir=False)
Example #6
0
def create_bam_index(bam_fpath):
    '''It makes sure that the bam has a samtools index.

    The index is looked for in bam_fpath + '.bai'. If that file exists
    nothing is done, otherwise samtools index is run on the bam.
    '''
    bai_fpath = bam_fpath + '.bai'
    if not os.path.exists(bai_fpath):
        call(['samtools', 'index', bam_fpath], raise_on_error=True)
Example #7
0
def bam2sam(bam_path, sam_path, header=False):
    '''It writes the bam into sam_path using samtools view.

    If header is True the -h flag is added to the samtools command.
    '''
    header_flag = ['-h'] if header else []
    cmd = ['samtools', 'view'] + header_flag + [bam_path, '-o', sam_path]
    call(cmd, raise_on_error=True)
def main():
    '''It maps the reads with bwa and writes a sorted bam.

    The reads file, the reference file, the sequence type ('short' or
    'long'), the bwa database and the output bam are taken from
    set_parameters().

    It raises an OSError if the bwa index (refdb + '.bwt') does not exist,
    a ValueError if the sequence type is unknown and a RuntimeError, with
    the command and its stderr, if any of the external commands fails.
    '''
    seqfile, reffile, seq_type, refdb, bamfile = set_parameters()

    # we need that the index is created
    if not os.path.exists(refdb + '.bwt'):
        raise OSError('Reference database index not created: %s' % refdb)

    # fail before doing any work if we do not know how to map
    if seq_type not in ('short', 'long'):
        raise ValueError('Seq type: short or long')

    def run(cmd):
        'It runs the cmd and returns its stdout or fails with its stderr'
        stdout, stderr, retcode = call(cmd)
        if retcode:
            raise RuntimeError('%s\n%s' % (" ".join(cmd), stderr))
        return stdout

    temp_dir = NamedTemporaryDir()
    try:
        dir_name = temp_dir.name
        ali_fpath = os.path.join(dir_name, 'output.ali')

        if seq_type == 'short':
            sai_fpath = os.path.join(dir_name, 'output.sai')
            with open(sai_fpath, 'wb') as sai_fhand:
                sai_fhand.write(run(['bwa', 'aln', refdb, seqfile]))
            alignment = run(['bwa', 'samse', refdb, sai_fpath, seqfile])
        else:
            # align long (sanger) reads
            alignment = run(['bwa', 'dbwtsw', refdb, seqfile])

        with open(ali_fpath, 'w') as ali_fhand:
            ali_fhand.write(alignment)

        # from sam to bam
        unsorted_bam = os.path.join(dir_name, 'bam_file.bam')
        run(['samtools', 'view', '-bt', reffile, '-o', unsorted_bam,
             ali_fpath])

        # sort the bam
        run(['samtools', 'sort', unsorted_bam, bamfile])
    finally:
        # the intermediate files are not needed once the bam is sorted
        temp_dir.close()
def blast_runner_plus(seq_fpath, blast_db, blast_type, result_fpath,
                       threads=False):
    '''It runs a blast+ program with a query file against a database.

    blast_type is the name of the program to run. The result is written in
    xml (-outfmt 5) to result_fpath. If threads is given it is passed as
    -num_threads.
    '''
    options = [('-db', blast_db),
               ('-num_alignments', '25'),
               ('-num_descriptions', '25'),
               ('-evalue', '0.0001'),
               ('-outfmt', '5'),
               ('-query', seq_fpath),
               ('-out', result_fpath)]
    if threads:
        options.append(('-num_threads', str(threads)))

    cmd = [blast_type]
    for flag, value in options:
        cmd.extend([flag, value])
    call(cmd, raise_on_error=True, log=True)
Example #10
0
def create_bowtie_reference(reference_fpath, color=False):
    '''It creates the bowtie index used by bowtie and tophat.

    The index is named after the reference path without its extension. If
    color is True the -C flag is added to bowtie-build.
    '''
    index_prefix = os.path.splitext(reference_fpath)[0]
    color_flag = ['-C'] if color else []
    cmd = ['bowtie-build'] + color_flag + [reference_fpath, index_prefix]
    call(cmd, raise_on_error=True)
Example #11
0
def compress_and_index_vcf(vcf_fpath):
    '''It compresses the vcf file with bgzip and indexes it with tabix.

    Both commands are run with -f. tabix is run on vcf_fpath with a .gz
    extension added.
    '''
    call(['bgzip', '-f', vcf_fpath], raise_on_error=True)

    compressed_fpath = '{0:s}.gz'.format(vcf_fpath)
    call(['tabix', '-p', 'vcf', '-f', compressed_fpath], raise_on_error=True)
Example #12
0
def create_picard_dict(reference_fpath, java_conf=None):
    '''It creates the picard sequence dictionary if it does not exist.

    The dictionary path is the reference path with its extension replaced
    by .dict. java_conf is only used to guess the picard jar directory.
    '''
    dict_fpath = os.path.splitext(reference_fpath)[0] + '.dict'
    if not os.path.exists(dict_fpath):
        # the jar directory is the one guessed for SortSam.jar
        jar_dir = guess_jar_dir('SortSam.jar', java_conf)
        dict_jar = os.path.join(jar_dir, 'CreateSequenceDictionary.jar')
        cmd = ['java', '-jar', dict_jar, 'R=%s' % reference_fpath,
               'O=%s' % dict_fpath]
        call(cmd, raise_on_error=True, add_ext_dir=False)
Example #13
0
def create_gmap_reference(reference_dir, reference_path, reference_name,
                          parameters=None):
    '''It builds the gmap database for the reference with gmap_build.

    The database is named reference_name and it is created in
    reference_dir. If parameters has a 'kmer' it is passed as -k. The
    reference_name.coords file, if found in the current directory once the
    build is done, is removed.
    '''
    cmd = ['gmap_build', '-B', get_external_bin_dir(), '-D', reference_dir,
           '-d', reference_name]
    if parameters and 'kmer' in parameters:
        cmd += ['-k', str(parameters['kmer'])]
    cmd.append(reference_path)
    call(cmd, raise_on_error=True)

    coords_fpath = '%s.coords' % reference_name
    if os.path.exists(coords_fpath):
        os.remove(coords_fpath)
Example #14
0
    def run(self):
        '''It runs the analysis.

        samtools calmd is run on the last version of the 'bam' input using
        the last version of the 'reference' input. Its output is written to
        the next version of the bam, which is indexed afterwards. The start
        and the end of the analysis are logged.
        '''
        self._log({'analysis_started':True})
        inputs = self._get_input_fpaths()
        bam_path = inputs['bam']
        bam_fpath = bam_path.last_version
        reference_fpath = inputs['reference'].last_version
        cmd = ['samtools', 'calmd', '-Abr', bam_fpath, reference_fpath]

        # the with block closes the output even if samtools or the indexing
        # fail
        with open(bam_path.next_version, 'w') as out_fhand:
            call(cmd, raise_on_error=True, stdout=out_fhand)
            create_bam_index(out_fhand.name)

        self._log({'analysis_finished':True})
        self._log({'analysis_finished':True})
def main():
    '''It writes the orfs to a temporary fasta file and runs cusp on it.

    The input file and the output path are taken from set_parameters().
    Every orf given by get_orfs is written as a sequence named name_N. The
    cusp result is written to the output path.

    It raises an error if cusp fails.
    '''
    # set parameters
    infhand, outfpath = set_parameters()

    # get orfs
    orf_fhand = NamedTemporaryFile()
    try:
        for index, orf in enumerate(get_orfs(infhand)):
            orf_fhand.write('>name_%d\n%s\n' % (index, orf))
        # cusp reads the file by name, nothing should stay in the buffer
        orf_fhand.flush()

        # run cusp. The debugging pause that printed the temporary file name
        # and waited for the user has been removed
        cmd = ['cusp', '-sequence', orf_fhand.name, '-outfile', outfpath]
        call(cmd, raise_on_error=True)
    finally:
        orf_fhand.close()
Example #16
0
def create_bwa_reference(reference_fpath, color=False):
    '''It creates the bwa index for the given reference.

    The indexing algorithm depends on the number of sequences (lines that
    start with '>') in the reference: 'bwtsw' for more than 10000 and 'is'
    otherwise. If color is True the -c flag is added to bwa index.
    '''
    # how many sequences do we have? The with block closes the reference
    with open(reference_fpath) as reference_fhand:
        n_seqs = sum(1 for line in reference_fhand if line.startswith('>'))

    algorithm = 'bwtsw' if n_seqs > 10000 else 'is'

    # the options go before the reference, as in the bwa index usage, so
    # they do not depend on getopt reordering the arguments
    cmd = ['bwa', 'index', '-a', algorithm]
    if color:
        cmd.append('-c')
    cmd.append(reference_fpath)

    call(cmd, raise_on_error=True)
Example #17
0
def iprscan_run(in_fasta_fpath, out_fpath):
    """It runs iprscan on a fasta file and writes an xml result.

    The result is written to out_fpath by iprscan, nothing is returned. It
    assumes that the iprscan binary is correctly installed. It raises a
    RuntimeError with the iprscan stderr if the return code is not zero.
    """
    cmd = ["iprscan", "-cli",
           "-i", in_fasta_fpath,
           "-o", out_fpath,
           "-goterm", "-iprlookup",
           "-format", "xml"]
    stderr, retcode = call(cmd)[1:]
    if retcode:
        raise RuntimeError(stderr)
Example #18
0
def realign_bam(bam_fpath, reference_fpath, out_bam_fpath, java_conf=None,
                threads=False, tmp_dir=None):
    '''It realigns the bam using the GATK local realignment around indels.

    First the reference faidx index, the picard dict and the bam index are
    created if needed. Then GATK RealignerTargetCreator and IndelRealigner
    are run and the resulting bam is sorted into out_bam_fpath. tmp_dir is
    only handed to the sorting step.
    '''
    # the indexes required before running GATK
    create_sam_reference_index(reference_fpath)
    create_picard_dict(reference_fpath, java_conf=java_conf)
    create_bam_index(bam_fpath)

    gatk_dir = guess_jar_dir('GenomeAnalysisTK.jar', java_conf)
    gatk_jar = os.path.join(gatk_dir, 'GenomeAnalysisTK.jar')

    # According to GATK running in parallel is experimental and in version
    # 1.0.4498 the option was removed, so the -nt option is never added
    parallel = False
    use_threads = parallel and threads and threads > 1

    # the intervals to realign
    intervals_fhand = tempfile.NamedTemporaryFile(suffix='.intervals')
    intervals_cmd = java_cmd(java_conf=java_conf)
    intervals_cmd += ['-jar', gatk_jar, '-T', 'RealignerTargetCreator',
                      '-I', bam_fpath, '-R', reference_fpath,
                      '-o', intervals_fhand.name]
    if use_threads:
        intervals_cmd += ['-nt', str(get_num_threads(threads))]
    call(intervals_cmd, raise_on_error=True, add_ext_dir=False)

    # the realignment itself
    realigned_bam = NamedTemporaryFile(suffix='.bam')
    realign_cmd = java_cmd(java_conf=java_conf)
    realign_cmd += ['-Djava.io.tmpdir=%s' % tempfile.gettempdir(),
                    '-jar', gatk_jar, '-I', bam_fpath,
                    '-R', reference_fpath, '-T', 'IndelRealigner',
                    '-targetIntervals', intervals_fhand.name,
                    '-o', realigned_bam.name]
    if use_threads:
        realign_cmd += ['-nt', str(get_num_threads(threads))]
    call(realign_cmd, raise_on_error=True, add_ext_dir=False)

    # now we have to sort the realigned bam
    sort_bam_sam(realigned_bam.name, out_bam_fpath, java_conf=java_conf,
                 tmp_dir=tmp_dir)
Example #19
0
def infer_introns_for_cdna(sequence, genomic_db, genomic_seqs_index=None, similar_sequence=None):
    """It infers the intron locations in the cdna using est2genome.

    If no similar_sequence is given the first sequence returned by a blastn
    search against genomic_db is used, and an empty list is returned when
    that search finds nothing. The genomic sequence is taken from
    genomic_seqs_index by name, so an index that supports that lookup is
    needed even though the default is None.

    It raises a KeyError if the sequence is not in the index and a
    RuntimeError if est2genome fails.
    """
    hit = similar_sequence
    if not hit:
        # looking first for the most similar genomic sequence speeds things up
        hits = look_for_similar_sequences(sequence, database=genomic_db, blast_program="blastn")
        if not hits:
            return []
        hit = hits[0]

    start, end = hit["subject_start"], hit["subject_end"]
    try:
        genomic_seq = genomic_seqs_index[hit["name"]]
    except KeyError:
        raise KeyError("Sequence %s was not found" % hit["name"])

    # est2genome takes both sequences as fasta files
    cdna_file = temp_fasta_file(seqs=[sequence])
    genomic_file = temp_fasta_file(seqs=[genomic_seq])

    cmd = ["est2genome", cdna_file.name, genomic_file.name]
    cmd += ["-sbegin2", str(start), "-send2", str(end)]
    cmd += ["-stdout", "-auto"]
    stdout, stderr, retcode = call(cmd)
    if retcode:
        raise RuntimeError("There was an error running est2genome: " + stderr)

    # the introns are taken from the parsed est2genome output
    return est2genome_parser(stdout)["cdna"]["introns"]
    def test_call(self):
        '''It tests call with failing and working commands.

        Both cases are run with and without files given for stdout and
        stderr, and the failing one also with raise_on_error.
        '''
        cmd = ['ls', '/@#@#@#@']

        ## When it fails
        # without stdout file and without raise_on_error: the stderr is
        # returned
        stderr = call(cmd, add_ext_dir=False)[1]
        assert '/@#@#@#@' in stderr

        # with raise_on_error a RuntimeError that has the stderr is expected
        try:
            call(cmd, raise_on_error=True, add_ext_dir=False)
            self.fail()
        except RuntimeError as error:
            assert '/@#@#@#@' in str(error)

        # with stdout and stderr files: nothing is returned as strings
        stdout = tempfile.NamedTemporaryFile()
        stderr = tempfile.NamedTemporaryFile()
        stdout_str, stderr_str = call(cmd, stdout=stdout, stderr=stderr, add_ext_dir=False)[:2]
        assert not stdout_str
        assert not stderr_str

        # the error message should be in the stderr file
        assert '/@#@#@#@' in open(stderr.name).read()
        try:
            call(cmd, stdout=stdout, stderr=stderr, raise_on_error=True, add_ext_dir=False)
            self.fail()
        except RuntimeError as error:
            assert '/@#@#@#@' in stderr.read()

        ## When it works
        cmd = ['ls', '/']
        # without stdout file: the listing is returned as a string
        stdout, stderr = call(cmd, add_ext_dir=False)[:2]
        assert 'root'in stdout

        # with stdout file: the listing goes to the file
        stdout = tempfile.NamedTemporaryFile()
        stderr = tempfile.NamedTemporaryFile()
        stdout_str, stderr_str = call(cmd, stdout=stdout, stderr=stderr, add_ext_dir=False)[:2]
        assert  stdout_str is None

        assert 'root' in stdout.read()
Example #21
0
def map_reads_with_bwa(reference_fpath, reads_fpath, bam_fpath, parameters):
    '''It maps the reads to the reference using bwa and writes a sorted bam.

    parameters is a dict. 'colorspace', 'reads_length' ('short' or 'long'),
    'threads' and 'java_conf' are required, 'tmp_dir' and 'unmapped_fhand'
    are optional. The bwa index is created if reference_fpath + '.bwt' does
    not exist. The result is written to bam_fpath, nothing is returned.

    It raises a ValueError if reads_length is neither 'short' nor 'long'.
    '''
    colorspace = parameters['colorspace']
    reads_length = parameters['reads_length']
    threads = parameters['threads']
    java_conf = parameters['java_conf']
    tmp_dir = parameters.get('tmp_dir')

    threads = get_num_threads(threads)

    # the reference should have an index
    if not os.path.exists(reference_fpath + '.bwt'):
        create_bwa_reference(reference_fpath, color=colorspace)

    sam_suffix = 'output.ali'
    thread_args = ['-t', str(threads)]
    if reads_length == 'short':
        aln_cmd = ['bwa', 'aln', reference_fpath, reads_fpath] + thread_args
        if colorspace:
            aln_cmd.append('-c')
        sai_fhand = NamedTemporaryFile(dir=tmp_dir, suffix='output.sai',
                                       mode='wb')
        call(aln_cmd, stdout=sai_fhand, raise_on_error=True)

        samse_cmd = ['bwa', 'samse', reference_fpath, sai_fhand.name,
                     reads_fpath]
        sam_fhand = NamedTemporaryFile(dir=tmp_dir, suffix=sam_suffix,
                                       mode='w')
        call(samse_cmd, stdout=sam_fhand, raise_on_error=True)
    elif reads_length == 'long':
        long_cmd = ['bwa', 'dbwtsw', reference_fpath, reads_fpath]
        long_cmd += thread_args
        sam_fhand = NamedTemporaryFile(dir=tmp_dir, suffix=sam_suffix)
        call(long_cmd, stdout=sam_fhand, raise_on_error=True)
    else:
        raise ValueError('Reads length: short or long')

    unmapped_fhand = parameters.get('unmapped_fhand')
    if unmapped_fhand is not None:
        filtered_fhand = NamedTemporaryFile(dir=tmp_dir, suffix=sam_suffix)
        get_out_unmapped(sam_fhand, unmapped_fhand, filtered_fhand)
        sam_fhand = filtered_fhand

    # from sam to bam
    unsorted_bam = NamedTemporaryFile(dir=tmp_dir, suffix='bam_file.bam')
    sam2bam(sam_fhand.name, unsorted_bam.name)

    # sort the bam file
    sort_bam_sam(unsorted_bam.name, bam_fpath, sort_method='coordinate',
                 java_conf=java_conf, strict_validation=False, tmp_dir=tmp_dir)
Example #22
0
def infer_introns_for_cdna(sequence, genomic_db, genomic_seqs_index=None,
                           similar_sequence=None):
    '''It infers the intron locations in the cdna using est2genome.

    If no similar_sequence is given the first sequence returned by a blastn
    search against genomic_db is used, and an empty list is returned when
    that search finds nothing. The genomic sequence is taken from
    genomic_seqs_index by name, so an index that supports that lookup is
    needed even though the default is None.

    It raises a KeyError if the sequence is not in the index and a
    RuntimeError if est2genome fails.
    '''
    if similar_sequence:
        best_hit = similar_sequence
    else:
        # looking first for the most similar genomic sequence speeds
        # things up
        candidates = look_for_similar_sequences(sequence,
                                                database=genomic_db,
                                                blast_program='blastn')
        if not candidates:
            return []
        best_hit = candidates[0]

    start = best_hit['subject_start']
    end = best_hit['subject_end']
    try:
        genomic_seq = genomic_seqs_index[best_hit['name']]
    except KeyError:
        raise KeyError('Sequence %s was not found' % best_hit['name'])

    # est2genome takes both sequences as fasta files
    cdna_file = temp_fasta_file(seqs=[sequence])
    genomic_file = temp_fasta_file(seqs=[genomic_seq])

    region = ['-sbegin2', str(start), '-send2', str(end)]
    cmd = ['est2genome', cdna_file.name, genomic_file.name] + region
    cmd.extend(['-stdout', '-auto'])
    stdout, stderr, retcode = call(cmd)
    if retcode:
        raise RuntimeError('There was an error running est2genome: ' + stderr)

    # the introns are taken from the parsed est2genome output
    parsed = est2genome_parser(stdout)
    return parsed['cdna']['introns']
Example #23
0
def create_gmap_reference_old(reference_fpath):
    '''It creates the gmap database with gmap_setup and its makefile.

    gmap_setup writes a makefile whose coords, gmapdb and install targets
    are run in that order. The makefile and the coords.NAME and
    INSTALL.NAME files, if found in the current directory, are removed
    afterwards.

    It raises an OSError if running gmap_setup raises an OSError.
    '''
    dir_, name = os.path.split(reference_fpath)
    dir_ = dir_ or '.'
    makefile_fpath = os.path.join(dir_, 'Makefile.%s' % name)

    # To avoid thread conflicts the makefile is first written to a file
    # with a random name and then it is moved where it belongs
    scratch_makefile = NamedTemporaryFile(delete=False)
    setup_cmd = ['gmap_setup', '-d', name, '-D', dir_,
                 '-o', scratch_makefile.name, reference_fpath]
    try:
        call(setup_cmd, raise_on_error=True)
    except OSError:
        raise OSError('Gmap mapper is not installed or not in the path')
    shutil.move(scratch_makefile.name, makefile_fpath)

    _modify_gmap_makefile(makefile_fpath)

    for target in ('coords', 'gmapdb', 'install'):
        call(['make', '-f', makefile_fpath, target], raise_on_error=True,
             add_ext_dir=False)

    # we remove the makefile and the extra files left by the build
    os.remove(makefile_fpath)
    for leftover_fpath in ('coords.%s' % name, 'INSTALL.%s' % name):
        if os.path.exists(leftover_fpath):
            os.remove(leftover_fpath)
Example #24
0
def sort_bam_sam(in_fpath, out_fpath, sort_method='coordinate',
                 java_conf=None, tmp_dir=None, strict_validation=True):
    '''It sorts a bam or sam file using picard SortSam.

    sort_method is passed as SORT_ORDER. If strict_validation is False
    VALIDATION_STRINGENCY=LENIENT is added, and if tmp_dir is given it is
    passed as TMP_DIR.

    It raises a RuntimeError if picard fails, with a specific message when
    the output mentions that there is no space left on the device.
    '''
    jar_dir = guess_jar_dir('SortSam.jar', java_conf)
    sort_jar = os.path.join(jar_dir, 'SortSam.jar')

    cmd = java_cmd(java_conf)
    cmd += ['-jar', sort_jar, 'INPUT=' + in_fpath, 'OUTPUT=' + out_fpath,
            'SORT_ORDER=' + sort_method]
    if not strict_validation:
        cmd.append('VALIDATION_STRINGENCY=LENIENT')
    if tmp_dir:
        cmd.append('TMP_DIR=%s' % tmp_dir)

    stdout, stderr, retcode = call(cmd, raise_on_error=False,
                                   add_ext_dir=False)
    if not retcode:
        return

    no_space = 'No space left on device'
    if no_space in stdout or no_space in stderr:
        raise RuntimeError('Picard sort consumed all space in device.' + stderr)

    details = (' '.join(cmd), stderr, stdout)
    raise RuntimeError('Error running picard: %s\n stderr: %s\n stdout: %s' %
                       details)
Example #25
0
def sam2bam(sam_path, bam_path):
    '''It converts the sam into a bam using samtools view.

    The header is kept (-h) and the bam is written to bam_path.
    '''
    view_options = ['-bSh', '-o', bam_path]
    call(['samtools', 'view'] + view_options + [sam_path],
         raise_on_error=True)
def makeblastdb_plus(seq_fpath, dbtype, outputdb=None):
    '''It creates a blast database from a sequence file with makeblastdb.

    dbtype is passed as -dbtype. If outputdb is not None it is passed as
    -out.
    '''
    out_args = [] if outputdb is None else ['-out', outputdb]
    cmd = ['makeblastdb', '-in', seq_fpath, '-dbtype', dbtype] + out_args
    call(cmd, raise_on_error=True)