コード例 #1
0
def _get_some_qual_and_lengths(fhand, force_file_as_non_seek):
    '''It peeks at the first fastq records to gather quality information.

    It returns a (lengths, is_sanger, chunk) tuple:
      - lengths: array with the quality string length of every analyzed
        record, or None when a sanger quality character was found.
      - is_sanger: True when a quality character with an ordinal below 64
        was found, None when nothing could be decided.
      - chunk: the text read from the beginning of the file.

    It raises UndecidedFastqVersionError when parsing the records fails with
    a ValueError. fhand is always rewound with seek(0) before leaving.
    '''
    max_seqs = get_setting('SEQS_TO_GUESS_FASTQ_VERSION')
    n_chars = get_setting('CHUNK_TO_GUESS_FASTQ_VERSION')

    lengths = array('I')
    n_seqs_seen = 0
    read_directly = fhand_is_seekable(fhand) and not force_file_as_non_seek
    if read_directly:
        chunk = fhand.read(n_chars)
        fhand.seek(0)
        fmt_fhand = fhand
    else:
        # the records are parsed from an in-memory copy of the peeked text
        chunk = peek_chunk_from_file(fhand, n_chars)
        fmt_fhand = cStringIO.StringIO(chunk)

    try:
        for record in FastqGeneralIterator(fmt_fhand):
            qual_str = record[2]
            # a quality char with an ordinal below 64 flags the file as sanger
            if any(ord(char) < 64 for char in qual_str):
                fhand.seek(0)
                return None, True, chunk
            lengths.append(len(qual_str))
            n_seqs_seen += 1
            if n_seqs_seen > max_seqs:
                break
    except ValueError:
        msg = 'The file is Fastq, but the version is difficult to guess'
        raise UndecidedFastqVersionError(msg)
    finally:
        fhand.seek(0)
    # no sanger char was seen, so the version is still unknown
    return lengths, None, chunk
コード例 #2
0
ファイル: file_formats.py プロジェクト: djinnome/seq_crumbs
def _get_some_qual_and_lengths(fhand, force_file_as_non_seek):
    '''It peeks at the first fastq records to gather quality information.

    It returns a (lengths, is_sanger, chunk) tuple:
      - lengths: array with the quality string length of every analyzed
        record, or None when a sanger quality character was found.
      - is_sanger: True when a quality character with an ordinal below 64
        was found, None when nothing could be decided.
      - chunk: the text read from the beginning of the file.

    It raises UndecidedFastqVersionError when parsing the records fails with
    a ValueError. fhand is always rewound with seek(0) before leaving.
    '''
    max_seqs = get_setting('SEQS_TO_GUESS_FASTQ_VERSION')
    n_chars = get_setting('CHUNK_TO_GUESS_FASTQ_VERSION')

    lengths = array('I')
    n_seqs_seen = 0
    read_directly = fhand_is_seekable(fhand) and not force_file_as_non_seek
    if read_directly:
        chunk = fhand.read(n_chars)
        fhand.seek(0)
        fmt_fhand = fhand
    else:
        # the records are parsed from an in-memory copy of the peeked text
        chunk = peek_chunk_from_file(fhand, n_chars)
        fmt_fhand = cStringIO.StringIO(chunk)

    try:
        for record in FastqGeneralIterator(fmt_fhand):
            qual_str = record[2]
            # a quality char with an ordinal below 64 flags the file as sanger
            if any(ord(char) < 64 for char in qual_str):
                fhand.seek(0)
                return None, True, chunk
            lengths.append(len(qual_str))
            n_seqs_seen += 1
            if n_seqs_seen > max_seqs:
                break
    except ValueError:
        msg = 'The file is Fastq, but the version is difficult to guess'
        raise UndecidedFastqVersionError(msg)
    finally:
        fhand.seek(0)
    # no sanger char was seen, so the version is still unknown
    return lengths, None, chunk
コード例 #3
0
    def calculate_bin_edges(self, min_, max_, n_bins=None):
        '''It calculates the bin edges for the values between min_ and max_.

        When n_bins is None the number of bins is chosen here: 1 if min_ and
        max_ are equal, max_ - min_ if that span is lower than the MIN_BINS
        setting, otherwise self.count divided by the MEAN_VALUES_IN_BIN
        setting, kept between the MIN_BINS and MAX_BINS settings and never
        above max_ - min_.

        It returns a list with n_bins + 1 edges that starts at min_.
        '''
        lowest_n_bins = get_setting('MIN_BINS')
        highest_n_bins = get_setting('MAX_BINS')
        if n_bins is None:
            value_range = max_ - min_
            if value_range == 0:
                n_bins = 1
            elif value_range < lowest_n_bins:
                n_bins = value_range
            else:
                n_bins = int(self.count / get_setting('MEAN_VALUES_IN_BIN'))
                n_bins = max(n_bins, lowest_n_bins)
                n_bins = min(n_bins, highest_n_bins)
                n_bins = min(n_bins, value_range)

        # a distribution with just one value gets a span of 1
        if max_ == min_:
            span = 1
        else:
            span = max_ - min_

        # the span is enlarged until it is a multiple of the number of bins
        leftover = span % n_bins
        if leftover:
            span = span + n_bins - leftover
        bin_width = span // n_bins
        return [min_ + edge * bin_width for edge in range(n_bins + 1)]
コード例 #4
0
ファイル: annotation.py プロジェクト: JoseBlanca/seq_crumbs
    def __init__(self, min_len=get_setting('POLYA_ANNOTATOR_MIN_LEN'),
                max_cont_mismatches=get_setting('POLYA_ANNOTATOR_MISMATCHES')):
        '''It stores the parameters used to look for the tails.

        min_len - minimum number of consecutive As (or Ts) to extend the tail
        max_cont_mismatches - maximum number of consecutive no A (or Ts) to
                              break a tail.

        Both defaults are read from the settings when the method is defined.
        '''
        self._min_len, self._max_cont_mismatches = min_len, max_cont_mismatches
コード例 #5
0
ファイル: annotation.py プロジェクト: terrycojones/seq_crumbs
    def __init__(self, min_len=get_setting('POLYA_ANNOTATOR_MIN_LEN'),
                max_cont_mismatches=get_setting('POLYA_ANNOTATOR_MISMATCHES')):
        '''It stores the parameters used to look for the tails.

        min_len - minimum number of consecutive As (or Ts) to extend the tail
        max_cont_mismatches - maximum number of consecutive no A (or Ts) to
                              break a tail.

        Both defaults are read from the settings when the method is defined.
        '''
        self._min_len, self._max_cont_mismatches = min_len, max_cont_mismatches
コード例 #6
0
ファイル: mate_chimeras.py プロジェクト: pirega/ngs_crumbs
def classify_chimeras(in_fhand,
                      index_fpath,
                      mate_distance,
                      out_fhand,
                      chimeras_fhand=None,
                      unknown_fhand=None,
                      tempdir=None,
                      threads=None,
                      settings=get_setting('CHIMERAS_SETTINGS')):
    '''It maps the reads of in_fhand and writes them according to their kind.

    The reads are mapped with map_with_bwamem against index_fpath, sorted by
    query name into a temporary bam and classified by classify_mapped_reads.
    Non chimeric pairs are written to out_fhand, chimeric pairs to
    chimeras_fhand and unknown pairs to unknown_fhand. The last two kinds are
    dropped when their fhand is None.

    threads is accepted, but it is not used by this function.
    '''
    bam_fhand = NamedTemporaryFile(suffix='.bam')
    bwa = map_with_bwamem(index_fpath, interleave_fpath=in_fhand.name,
                          extra_params=['-a', '-M'])
    map_process_to_sortedbam(bwa, bam_fhand.name, key='queryname',
                             tempdir=tempdir)

    classified_pairs = classify_mapped_reads(bam_fhand, settings=settings,
                                             mate_distance=mate_distance)
    for pair, kind in classified_pairs:
        if kind is NON_CHIMERIC:
            write_seqs(pair, out_fhand)
            continue
        if kind is CHIMERA and chimeras_fhand is not None:
            write_seqs(pair, chimeras_fhand)
            continue
        if kind is UNKNOWN and unknown_fhand is not None:
            write_seqs(pair, unknown_fhand)
コード例 #7
0
ファイル: mate_chimeras.py プロジェクト: pirega/ngs_crumbs
def classify_mapped_reads(bam_fhand,
                          mate_distance,
                          settings=get_setting('CHIMERAS_SETTINGS')):
    '''It yields (pair, kind) tuples for the mates found in the bam file.

    kind is NON_CHIMERIC, CHIMERA or UNKNOWN, as decided by
    _mates_are_not_chimeric and _mates_are_chimeric. pair is a list with the
    primary alignment of every mate converted by alignedread_to_seqitem.
    Pairs in which any converted mate is None are not yielded.

    settings should have the MAX_CLIPPING, MAX_PE_LEN and
    MATE_DISTANCE_VARIATION keys.
    '''
    bamfile = AlignmentFile(bam_fhand.name)

    max_clipping = settings['MAX_CLIPPING']
    max_pe_len = settings['MAX_PE_LEN']
    variation = settings['MATE_DISTANCE_VARIATION']

    # accepted distances: mate_distance plus/minus the allowed variation
    shortest = mate_distance - variation
    longest = mate_distance + variation
    mate_length_range = [shortest, longest]
    ref_lengths = _get_ref_lengths(bamfile)

    for alignments in _group_alignments_reads_by_qname(bamfile):
        mates = _split_mates(alignments)
        not_chimeric = _mates_are_not_chimeric(mates, max_clipping,
                                               mate_length_range, bamfile,
                                               ref_lengths)
        if not_chimeric:
            kind = NON_CHIMERIC
        elif _mates_are_chimeric(mates, bamfile, max_clipping, max_pe_len,
                                 ref_lengths):
            kind = CHIMERA
        else:
            kind = UNKNOWN

        pair = []
        for mate in mates:
            pair.append(alignedread_to_seqitem(_get_primary_alignment(mate)))

        if None in pair:
            continue
        yield pair, kind
コード例 #8
0
ファイル: file_formats.py プロジェクト: djinnome/seq_crumbs
def _guess_fastq_version(fhand, force_file_as_non_seek):
    '''It guesses the format of fastq files.

    It ignores the solexa fastq version.

    It returns 'fastq' when _get_some_qual_and_lengths reports sanger
    qualities and 'fastq-illumina' otherwise, unless some of the analyzed
    reads are longer than the LONGEST_EXPECTED_ILLUMINA_READ setting. In that
    case it raises an UndecidedFastqVersionError.
    '''
    lengths, is_sanger, _ = _get_some_qual_and_lengths(fhand,
                                                       force_file_as_non_seek)
    if is_sanger:
        return 'fastq'
    if is_sanger is False:
        return 'fastq-illumina'

    longest_expected_illumina = get_setting('LONGEST_EXPECTED_ILLUMINA_READ')
    if any(length > longest_expected_illumina for length in lengths):
        msg = 'It was not possible to guess the format of '
        if hasattr(fhand, 'name'):
            msg += 'the file ' + fhand.name
        else:
            msg += 'a file '
        # this has to be appended, an assignment would drop the file name
        msg += '\n. The quality values could be Illumina, but there are '
        # only this fragment is formatted, the file name could have a % char
        msg += 'sequences longer than %i bp.' % longest_expected_illumina
        raise UndecidedFastqVersionError(msg)
    return 'fastq-illumina'
コード例 #9
0
    def test_many_reads(self):
        '''It splits lots of reads to check that blast finds everything.

        1000 random reads with a linker in the middle are created and every
        one of them should be split in two mates named seq_N\\1 and seq_N\\2.
        '''
        linker = TITANIUM_LINKER

        def random_dna(length):
            'It returns a random DNA string of the given length'
            return ''.join(choice('ACTG') for _ in range(length))

        def create_seq(index):
            'It creates a random seq with a linker'
            left = random_dna(100)
            right = random_dna(100)
            record = SeqRecord(id='seq_' + str(index),
                               seq=Seq(left + linker + right))
            return SeqWrapper(SEQRECORD, record, None)

        # We want to test that blast reports all reads, so a packet should
        # hold more reads than the default blast max target size
        default_blast_max_target_size = 500
        assert get_setting('PACKET_SIZE') > default_blast_max_target_size
        seqs = [create_seq(num) for num in range(1000)]
        split_seqs = MatePairSplitter()(seqs)

        for position, seq in enumerate(split_seqs):
            # two consecutive mates are expected for every input read
            seq_index, mate_offset = divmod(position, 2)
            expected_id = 'seq_' + str(seq_index) + '\\' + str(mate_offset + 1)
            assert get_name(seq) == expected_id
コード例 #10
0
 def __init__(self, linkers=None):
     '''It stores the linkers as a list.

     When linkers is None they are built from the LINKERS setting: a SeqItem
     is created for each of them, named after its position, and
     assing_kind_to_seqs is applied to the SeqItems.
     '''
     if linkers is None:
         items = []
         for num, linker_seq in enumerate(get_setting('LINKERS')):
             items.append(SeqItem(str(num), '>%d\n%s\n' % (num, linker_seq)))
         linkers = assing_kind_to_seqs(SEQITEM, items, 'fasta')
     self.linkers = list(linkers)
コード例 #11
0
ファイル: filters.py プロジェクト: milw/seq_crumbs
 def __init__(self, threshold=get_setting('DEFATULT_DUST_THRESHOLD'),
              reverse=False, failed_drags_pair=True):
     '''It stores the threshold and initializes the parent filter.

     The default threshold is read from the settings when the method is
     defined. reverse and failed_drags_pair are handed to the parent class.
     '''
     self._threshold = threshold
     parent = super(FilterDustComplexity, self)
     parent.__init__(reverse=reverse, failed_drags_pair=failed_drags_pair)
コード例 #12
0
ファイル: split_mates.py プロジェクト: radaniba/seq_crumbs
 def __init__(self, linkers=None):
     '''It stores the linkers as a list.

     When linkers is None they are built from the LINKERS setting: a SeqItem
     is created for each of them, named after its position, and
     assing_kind_to_seqs is applied to the SeqItems.
     '''
     if linkers is None:
         items = []
         for num, linker_seq in enumerate(get_setting('LINKERS')):
             items.append(SeqItem(str(num), '>%d\n%s\n' % (num, linker_seq)))
         linkers = assing_kind_to_seqs(SEQITEM, items, 'fasta')
     self.linkers = list(linkers)
コード例 #13
0
    def test_many_reads(self):
        '''It splits lots of reads to check that blast finds everything.

        1000 random reads with a linker in the middle are created and every
        one of them should be split in two mates named seq_N\\1 and seq_N\\2.
        '''
        linker = TITANIUM_LINKER

        def random_dna(length):
            'It returns a random DNA string of the given length'
            return ''.join(choice('ACTG') for _ in range(length))

        def create_seq(index):
            'It creates a random seq with a linker'
            left = random_dna(100)
            right = random_dna(100)
            record = SeqRecord(id='seq_' + str(index),
                               seq=Seq(left + linker + right))
            return SeqWrapper(SEQRECORD, record, None)

        # We want to test that blast reports all reads, so a packet should
        # hold more reads than the default blast max target size
        default_blast_max_target_size = 500
        assert get_setting('PACKET_SIZE') > default_blast_max_target_size
        seqs = [create_seq(num) for num in range(1000)]
        split_seqs = MatePairSplitter()(seqs)

        for position, seq in enumerate(split_seqs):
            # two consecutive mates are expected for every input read
            seq_index, mate_offset = divmod(position, 2)
            expected_id = 'seq_' + str(seq_index) + '\\' + str(mate_offset + 1)
            assert get_name(seq) == expected_id
コード例 #14
0
 def __init__(self, threshold=get_setting('DEFATULT_DUST_THRESHOLD'),
              reverse=False, failed_drags_pair=True):
     '''It stores the threshold and initializes the parent filter.

     The default threshold is read from the settings when the method is
     defined. reverse and failed_drags_pair are handed to the parent class.
     '''
     self._threshold = threshold
     parent = super(FilterDustComplexity, self)
     parent.__init__(reverse=reverse, failed_drags_pair=failed_drags_pair)
コード例 #15
0
ファイル: file_formats.py プロジェクト: fw1121/ngs_crumbs
def _guess_fastq_version(fhand, force_file_as_non_seek):
    """It guesses the format of fastq files.

    It ignores the solexa fastq version.

    It returns 'fastq' when _get_some_qual_and_lengths reports sanger
    qualities and 'fastq-illumina' otherwise, unless some of the analyzed
    reads are longer than the LONGEST_EXPECTED_ILLUMINA_READ setting. In that
    case it raises an UndecidedFastqVersionError.
    """
    lengths, is_sanger, _ = _get_some_qual_and_lengths(fhand, force_file_as_non_seek)
    if is_sanger:
        return "fastq"
    if is_sanger is False:
        return "fastq-illumina"

    longest_expected_illumina = get_setting("LONGEST_EXPECTED_ILLUMINA_READ")
    if any(length > longest_expected_illumina for length in lengths):
        msg = "It was not possible to guess the format of "
        if hasattr(fhand, "name"):
            msg += "the file " + fhand.name
        else:
            msg += "a file "
        # this has to be appended, an assignment would drop the file name
        msg += "\n. The quality values could be Illumina, but there are "
        # only this fragment is formatted, the file name could have a % char
        msg += "sequences longer than %i bp." % longest_expected_illumina
        raise UndecidedFastqVersionError(msg)
    return "fastq-illumina"
コード例 #16
0
ファイル: bam_tools.py プロジェクト: JoseBlanca/seq_crumbs
def _realign_bam(bam_fpath, reference_fpath, out_bam_fpath, threads=False):
    '''It realigns the bam using GATK Local realignment around indels.

    bam_fpath - path to the bam to realign, it is indexed by this function
    reference_fpath - path to the reference, its sam index and its picard
                      dict are created by the helpers called here
    out_bam_fpath - path given to GATK as the output of the realignment
    threads - the -nt option is passed to the IndelRealigner only when this
              is higher than 1

    subprocess.CalledProcessError is raised by check_call if any of the GATK
    commands fails.
    '''
    # reference sam index
    _create_sam_reference_index(reference_fpath)

    # reference picard dict
    _create_picard_dict(reference_fpath)

    # bam index
    index_bam(bam_fpath)

    gatk_jar = get_setting('GATK_JAR')
    intervals_fhand = NamedTemporaryFile(suffix='.intervals')
    stderr = NamedTemporaryFile(suffix='picard.stderr')
    stdout = NamedTemporaryFile(suffix='picard.stdout')
    try:
        # the intervals to realign
        cmd = ['java', '-jar', gatk_jar, '-T', 'RealignerTargetCreator',
               '-I', bam_fpath, '-R', reference_fpath,
               '-o', intervals_fhand.name]
        check_call(cmd, stderr=stderr, stdout=stdout)

        # the realignment itself
        cmd = ['java', '-jar', gatk_jar, '-I', bam_fpath,
               '-R', reference_fpath, '-T', 'IndelRealigner',
               '-targetIntervals', intervals_fhand.name, '-o', out_bam_fpath]

        if threads and threads > 1:
            cmd.extend(['-nt', str(get_num_threads(threads))])
        check_call(cmd, stderr=stderr, stdout=stdout)
    finally:
        # the temporary files are closed even when a GATK command fails
        for tmp_fhand in (intervals_fhand, stderr, stdout):
            tmp_fhand.close()
コード例 #17
0
def _realign_bam(bam_fpath, reference_fpath, out_bam_fpath, threads=False):
    '''It realigns the bam using GATK Local realignment around indels.

    bam_fpath - path to the bam to realign, it is indexed by this function
    reference_fpath - path to the reference, its sam index and its picard
                      dict are created by the helpers called here
    out_bam_fpath - path given to GATK as the output of the realignment
    threads - the -nt option is passed to the IndelRealigner only when this
              is higher than 1

    subprocess.CalledProcessError is raised by check_call if any of the GATK
    commands fails.
    '''
    # reference sam index
    _create_sam_reference_index(reference_fpath)

    # reference picard dict
    _create_picard_dict(reference_fpath)

    # bam index
    index_bam(bam_fpath)

    gatk_jar = get_setting('GATK_JAR')
    intervals_fhand = NamedTemporaryFile(suffix='.intervals')
    stderr = NamedTemporaryFile(suffix='picard.stderr')
    stdout = NamedTemporaryFile(suffix='picard.stdout')
    try:
        # the intervals to realign
        cmd = [
            'java', '-jar', gatk_jar, '-T', 'RealignerTargetCreator', '-I',
            bam_fpath, '-R', reference_fpath, '-o', intervals_fhand.name
        ]
        check_call(cmd, stderr=stderr, stdout=stdout)

        # the realignment itself
        cmd = [
            'java', '-jar', gatk_jar, '-I', bam_fpath, '-R', reference_fpath,
            '-T', 'IndelRealigner', '-targetIntervals', intervals_fhand.name,
            '-o', out_bam_fpath
        ]

        if threads and threads > 1:
            cmd.extend(['-nt', str(get_num_threads(threads))])
        check_call(cmd, stderr=stderr, stdout=stdout)
    finally:
        # the temporary files are closed even when a GATK command fails
        for tmp_fhand in (intervals_fhand, stderr, stdout):
            tmp_fhand.close()
コード例 #18
0
ファイル: mate_chimeras.py プロジェクト: djinnome/seq_crumbs
def classify_mapped_reads(bam_fhand, mate_distance,
                          settings=get_setting('CHIMERAS_SETTINGS')):
    '''It yields (pair, kind) tuples for the mates found in the bam file.

    kind is NON_CHIMERIC, CHIMERA or UNKNOWN, as decided by
    _mates_are_not_chimeric and _mates_are_chimeric. pair is a list with the
    primary alignment of every mate converted by alignedread_to_seqitem.
    Pairs in which any converted mate is None are not yielded.

    settings should have the MAX_CLIPPING, MAX_PE_LEN and
    MATE_DISTANCE_VARIATION keys.
    '''
    bamfile = Samfile(bam_fhand.name)

    max_clipping = settings['MAX_CLIPPING']
    max_pe_len = settings['MAX_PE_LEN']
    variation = settings['MATE_DISTANCE_VARIATION']

    # accepted distances: mate_distance plus/minus the allowed variation
    shortest = mate_distance - variation
    longest = mate_distance + variation
    mate_length_range = [shortest, longest]
    ref_lengths = _get_ref_lengths(bamfile)

    for alignments in _group_alignments_reads_by_qname(bamfile):
        mates = _split_mates(alignments)
        not_chimeric = _mates_are_not_chimeric(mates, max_clipping,
                                               mate_length_range, bamfile,
                                               ref_lengths)
        if not_chimeric:
            kind = NON_CHIMERIC
        elif _mates_are_chimeric(mates, bamfile, max_clipping, max_pe_len,
                                 ref_lengths):
            kind = CHIMERA
        else:
            kind = UNKNOWN

        pair = []
        for mate in mates:
            pair.append(alignedread_to_seqitem(_get_primary_alignment(mate)))

        if None in pair:
            continue
        yield pair, kind
コード例 #19
0
ファイル: filters.py プロジェクト: radaniba/seq_crumbs
def classify_mapped_reads_new(bamfile,
                              settings=get_setting('CHIMERAS_SETTINGS'),
                              file_format='fastq',
                              mate_length_range=[2000, 4000],
                              out_format=SEQITEM):
    '''It yields [pair, kind] lists for the mates found in the bam file.

    kind is NON_CHIMERIC, CHIMERA or UNKNOWN, as decided by
    _mates_are_not_chimeric and _mates_are_chimeric.

    out_format sets what pair is: with SEQITEM the first alignment of every
    mate is converted by alignedread_to_seqitem using file_format, with
    'aligned_read' the split mate alignments are returned untouched. Any
    other out_format raises a ValueError.

    settings should have the MAX_COINCIDENCES, MAX_MAPQ_DIFFERENCE,
    MAX_DISTANCE_TO_END, MAX_CLIPPING, MAX_PE_LEN and MIN_MP_LEN keys.
    '''
    # NOTE: the default mate_length_range list is shared between calls, so it
    # must not be modified
    max_coincidences = settings['MAX_COINCIDENCES']
    max_mapq_difference = settings['MAX_MAPQ_DIFFERENCE']
    limit = settings['MAX_DISTANCE_TO_END']
    max_clipping = settings['MAX_CLIPPING']
    max_pe_len = settings['MAX_PE_LEN']
    # MIN_MP_LEN is not used here, but it is still required in the settings
    settings['MIN_MP_LEN']

    # It tries to find out the kind of each pair of sequences
    for grouped_mates in _group_alignments_by_reads(bamfile):
        mates_alignments = _split_mates(grouped_mates)
        if _mates_are_not_chimeric(mates_alignments, max_clipping,
                                   mate_length_range, bamfile,
                                   max_coincidences, max_mapq_difference,
                                   limit):
            kind = NON_CHIMERIC
        elif _mates_are_chimeric(mates_alignments, bamfile, max_clipping,
                                 max_pe_len):
            kind = CHIMERA
        else:
            kind = UNKNOWN

        if out_format == SEQITEM:
            pair = [alignedread_to_seqitem(read[0], file_format)
                    for read in mates_alignments]
        elif out_format == 'aligned_read':
            pair = mates_alignments
        else:
            # without this branch pair would be undefined (or left over from
            # a previous iteration)
            raise ValueError('Unknown out_format: %r' % (out_format,))
        yield [pair, kind]
コード例 #20
0
def _guess_fastq_version(fhand, force_file_as_non_seek):
    '''It guesses the format of fastq files.

    It ignores the solexa fastq version.

    It returns 'fastq' when _get_some_qual_and_lengths reports sanger
    qualities and 'fastq-illumina' otherwise, unless some of the analyzed
    reads are longer than the LONGEST_EXPECTED_ILLUMINA_READ setting. In that
    case it raises an UndecidedFastqVersionError.
    '''
    lengths, is_sanger, _ = _get_some_qual_and_lengths(
        fhand, force_file_as_non_seek)
    if is_sanger:
        return 'fastq'
    if is_sanger is False:
        return 'fastq-illumina'

    longest_expected_illumina = get_setting('LONGEST_EXPECTED_ILLUMINA_READ')
    if any(length > longest_expected_illumina for length in lengths):
        msg = 'It was not possible to guess the format of '
        if hasattr(fhand, 'name'):
            msg += 'the file ' + fhand.name
        else:
            msg += 'a file '
        # this has to be appended, an assignment would drop the file name
        msg += '\n. The quality values could be Illumina, but there are '
        # only this fragment is formatted, the file name could have a % char
        msg += 'sequences longer than %i bp.' % longest_expected_illumina
        raise UndecidedFastqVersionError(msg)
    return 'fastq-illumina'
コード例 #21
0
ファイル: bin_utils.py プロジェクト: pziarsolo/seq_crumbs
    def _get_binary_path(binary_name):
        '''It returns the path to the proper binary.

        The binary is looked for, in this order:
          1. with the EXTERNAL_BIN_PREFIX setting prepended to its name, if
             the USE_EXTERNAL_BIN_PREFIX setting is on,
          2. in the current working directory, if the ADD_PATH_TO_EXT_BIN
             setting is off,
          3. in the third_party/bin directory for the running platform and
             architecture, falling back from 64bit to 32bit,
          4. in the path, using which.

        It raises a MissingBinaryError if the binary is not found.
        '''
        # Name for the prefix and working directory lookups. It has to be
        # defined even when the prefix is not used, the working directory
        # check needs it.
        ext_binary_name = binary_name
        if get_setting('USE_EXTERNAL_BIN_PREFIX'):
            ext_binary_name = get_setting('EXTERNAL_BIN_PREFIX') + binary_name
            if os.path.exists(ext_binary_name):
                return ext_binary_name

        if not get_setting('ADD_PATH_TO_EXT_BIN'):
            # I have to check if the binary is on my current directory.
            # If it is there use it, else keep looking
            cwd_binary_path = os.path.join(os.getcwd(), ext_binary_name)
            if os.path.exists(cwd_binary_path):
                return cwd_binary_path

        system = platform.system().lower()
        if system == 'windows':
            binary_name += '.exe'
        arch = platform.architecture()[0]

        join = os.path.join

        third_party_path = join(module_path, '..', 'third_party', 'bin')
        third_party_path = os.path.abspath(third_party_path)

        # a 32bit binary is accepted in a 64bit platform
        archs = [arch]
        if arch == '64bit':
            archs.append('32bit')
        for candidate_arch in archs:
            binary_path = os.path.abspath(join(third_party_path, system,
                                               candidate_arch, binary_name))
            if os.path.exists(binary_path):
                return binary_path

        # At this point there is not available binary for the working platform
        # Is the binary really in the path?
        if which(binary_name):
            return binary_name

        msg = '{} not found in the path. Please install it to use seq_crumbs'
        raise MissingBinaryError(msg.format(binary_name))
コード例 #22
0
    def _get_binary_path(binary_name):
        '''It returns the path to the proper binary.

        The binary is looked for, in this order:
          1. with the EXTERNAL_BIN_PREFIX setting prepended to its name, if
             the USE_EXTERNAL_BIN_PREFIX setting is on,
          2. in the current working directory, if the ADD_PATH_TO_EXT_BIN
             setting is off,
          3. in the third_party/bin directory for the running platform and
             architecture, falling back from 64bit to 32bit,
          4. in the path, using which.

        It raises a MissingBinaryError if the binary is not found.
        '''
        # Name for the prefix and working directory lookups. It has to be
        # defined even when the prefix is not used, the working directory
        # check needs it.
        ext_binary_name = binary_name
        if get_setting('USE_EXTERNAL_BIN_PREFIX'):
            ext_binary_name = get_setting('EXTERNAL_BIN_PREFIX') + binary_name
            if os.path.exists(ext_binary_name):
                return ext_binary_name

        if not get_setting('ADD_PATH_TO_EXT_BIN'):
            # I have to check if the binary is on my current directory.
            # If it is there use it, else keep looking
            cwd_binary_path = os.path.join(os.getcwd(), ext_binary_name)
            if os.path.exists(cwd_binary_path):
                return cwd_binary_path

        system = platform.system().lower()
        if system == 'windows':
            binary_name += '.exe'
        arch = platform.architecture()[0]

        join = os.path.join

        third_party_path = join(module_path, '..', 'third_party', 'bin')
        third_party_path = os.path.abspath(third_party_path)

        # a 32bit binary is accepted in a 64bit platform
        archs = [arch]
        if arch == '64bit':
            archs.append('32bit')
        for candidate_arch in archs:
            binary_path = os.path.abspath(
                join(third_party_path, system, candidate_arch, binary_name))
            if os.path.exists(binary_path):
                return binary_path

        # At this point there is not available binary for the working platform
        # Is the binary really in the path?
        if which(binary_name):
            return binary_name

        msg = '{} not found in the path. Please install it to use seq_crumbs'
        raise MissingBinaryError(msg.format(binary_name))
コード例 #23
0
def draw_histogram_ascii(bin_limits, counts):
    '''It draws an ASCII histogram.

    bin_limits - the edges of the bins, one more than the number of counts
    counts - the number of items found in every bin

    It returns a string with a line per bin. Every line has the limits of
    the bin, its count and a bar of asterisks proportional to the count. The
    longest line is fitted to the MAX_WIDTH_ASCII_PLOT setting. An empty
    string is returned when there are no counts.
    '''
    fill_char = '*'

    assert len(bin_limits) == len(counts) + 1
    if len(counts) == 0:
        # there is nothing to draw and max() would fail with no counts
        return ''

    def number_to_str(number):
        'It formats the ints as they are and the rest with two decimals'
        if isinstance(number, int):
            return '{:d}'.format(number)
        return '{:.2f}'.format(number)

    # the longest limit and the longest count set the padding of the columns
    max_ndigits = max(len(number_to_str(limit)) for limit in bin_limits)
    max_count_ndigits = max(len(number_to_str(cnt)) for cnt in counts)
    bins = list(zip(bin_limits[:-1], bin_limits[1:]))

    limit_fmt_int = '{:>' + str(max_ndigits) + 'd}'
    limit_fmt_float = '{:>' + str(max_ndigits) + '.5f}'

    def limit_to_padded_str(limit):
        'It right aligns a bin limit'
        if isinstance(limit, int):
            return limit_fmt_int.format(limit)
        return limit_fmt_float.format(limit)

    count_fmt = '{:>' + str(max_count_ndigits) + 'd}'

    headers = []
    for (bin_start, bin_end), cnt in zip(bins, counts):
        header = '[' + limit_to_padded_str(bin_start) + ' , '
        header += limit_to_padded_str(bin_end) + '[ ('
        header += count_fmt.format(cnt) + '): '
        headers.append(header)

    max_count = max(counts)
    max_header_len = max(len(header) for header in headers)
    max_hist_width = get_setting('MAX_WIDTH_ASCII_PLOT') - max_header_len
    if max_count:
        # float() keeps this a true division with or without the division
        # import from __future__
        counts_ratio = float(max_hist_width) / max_count
    else:
        # all the bins are empty, so no bar is drawn
        counts_ratio = 0

    lines = [header + fill_char * int(cnt * counts_ratio) + '\n'
             for header, cnt in zip(headers, counts)]
    return ''.join(lines)
コード例 #24
0
ファイル: statistics.py プロジェクト: JoseBlanca/seq_crumbs
def draw_histogram_ascii(bin_limits, counts):
    '''It draws an ASCII histogram.

    bin_limits - the edges of the bins, one more than the number of counts
    counts - the number of items found in every bin

    It returns a string with a line per bin. Every line has the limits of
    the bin, its count and a bar of asterisks proportional to the count. The
    longest line is fitted to the MAX_WIDTH_ASCII_PLOT setting. An empty
    string is returned when there are no counts.
    '''
    fill_char = '*'

    assert len(bin_limits) == len(counts) + 1
    if len(counts) == 0:
        # there is nothing to draw and max() would fail with no counts
        return ''

    def number_to_str(number):
        'It formats the ints as they are and the rest with two decimals'
        if isinstance(number, int):
            return '{:d}'.format(number)
        return '{:.2f}'.format(number)

    # the longest limit and the longest count set the padding of the columns
    max_ndigits = max(len(number_to_str(limit)) for limit in bin_limits)
    max_count_ndigits = max(len(number_to_str(cnt)) for cnt in counts)
    bins = list(zip(bin_limits[:-1], bin_limits[1:]))

    limit_fmt_int = '{:>' + str(max_ndigits) + 'd}'
    limit_fmt_float = '{:>' + str(max_ndigits) + '.5f}'

    def limit_to_padded_str(limit):
        'It right aligns a bin limit'
        if isinstance(limit, int):
            return limit_fmt_int.format(limit)
        return limit_fmt_float.format(limit)

    count_fmt = '{:>' + str(max_count_ndigits) + 'd}'

    headers = []
    for (bin_start, bin_end), cnt in zip(bins, counts):
        header = '[' + limit_to_padded_str(bin_start) + ' , '
        header += limit_to_padded_str(bin_end) + '[ ('
        header += count_fmt.format(cnt) + '): '
        headers.append(header)

    max_count = max(counts)
    max_header_len = max(len(header) for header in headers)
    max_hist_width = get_setting('MAX_WIDTH_ASCII_PLOT') - max_header_len
    if max_count:
        # float() keeps this a true division with or without the division
        # import from __future__
        counts_ratio = float(max_hist_width) / max_count
    else:
        # all the bins are empty, so no bar is drawn
        counts_ratio = 0

    lines = [header + fill_char * int(cnt * counts_ratio) + '\n'
             for header, cnt in zip(headers, counts)]
    return ''.join(lines)
コード例 #25
0
 def __init__(self, index_fpath, max_clipping=None, tempdir=None):
     '''It stores the index path, the temporary directory and the clipping.

     When max_clipping is None the MAX_CLIPPING value found in the
     CHIMERAS_SETTINGS setting is used.
     '''
     self._index_fpath = index_fpath
     self._tempdir = tempdir
     if max_clipping is None:
         max_clipping = get_setting('CHIMERAS_SETTINGS')['MAX_CLIPPING']
     self.max_clipping = max_clipping
コード例 #26
0
ファイル: trim.py プロジェクト: fw1121/ngs_crumbs
 def __init__(self, index_fpath, max_clipping=None, tempdir=None):
     '''It stores the index path, the temporary directory and the clipping.

     When max_clipping is None the MAX_CLIPPING value found in the
     CHIMERAS_SETTINGS setting is used.
     '''
     self._index_fpath = index_fpath
     self._tempdir = tempdir
     if max_clipping is None:
         max_clipping = get_setting('CHIMERAS_SETTINGS')['MAX_CLIPPING']
     self.max_clipping = max_clipping
コード例 #27
0
def copy_and_rename_ext_bin(app_dir, bin_dist_dir):
    '''It copies the platform specific external binaries to bin_dist_dir.

    The binaries are taken from the directory returned by
    get_platform_bin_dir for app_dir and the EXTERNAL_BIN_PREFIX setting is
    prepended to the name of every copy.
    '''
    src_dir = get_platform_bin_dir(app_dir)
    for fname in os.listdir(src_dir):
        src_path = join(src_dir, fname)
        prefixed_fname = settings.get_setting('EXTERNAL_BIN_PREFIX') + fname
        shutil.copy(src_path, join(bin_dist_dir, prefixed_fname))
コード例 #28
0
ファイル: pairs.py プロジェクト: fastq/seq_crumbs
def match_pairs(seqs, out_fhand, orphan_out_fhand, out_format,
                memory_limit=get_setting('DEFAULT_SEQS_IN_MEM_LIMIT')):
    '''It matches the seq pairs in an iterator and splits the orphan seqs.

    seqs - iterator with the seqs to pair
    out_fhand - the matched pairs are written here
    orphan_out_fhand - the seqs without a mate, or with a name from which
                       the pair direction can not be parsed, are written here
    out_format - format handed to write_seqs
    memory_limit - maximum number of seqs waiting for its mate, None for no
                   limit

    It raises MaxNumReadsInMem when the number of waiting seqs reaches the
    memory limit.
    '''
    buf_fwd = {'index': {}, 'items': []}
    buf_rev = {'index': {}, 'items': []}
    buf1, buf2 = buf_fwd, buf_rev   # for the all orphan case
    for seq in seqs:
        try:
            seq_name, direction = _parse_pair_direction_and_name(seq)
        except PairDirectionError:
            write_seqs([seq], orphan_out_fhand, out_format)
            continue

        # buf1 has the possible mates of this seq, buf2 the seqs with the
        # same direction than this one
        if direction == FWD:
            buf1, buf2 = buf_rev, buf_fwd
        else:
            buf1, buf2 = buf_fwd, buf_rev

        matching_seq_index = buf1['index'].get(seq_name)

        if matching_seq_index is None:
            # the mate has not been seen yet, so the seq has to wait
            buf2['items'].append(seq)
            buf2['index'][seq_name] = len(buf2['items']) - 1
            # check mem limit, without building a list just to count
            sum_items = len(buf1['items']) + len(buf2['items'])
            if memory_limit is not None and sum_items >= memory_limit:
                error_msg = 'There are too many consecutive non matching seqs'
                error_msg += ' in your input. We have reached the memory limit'
                raise MaxNumReadsInMem(error_msg)
            continue

        # the seqs that were waiting before the mate are orphans
        orphan_seqs = buf1['items'][:matching_seq_index]
        matching_seq = buf1['items'][matching_seq_index]
        write_seqs(orphan_seqs, orphan_out_fhand, out_format)
        write_seqs([matching_seq, seq], out_fhand, out_format)

        # The seqs found after the mate keep waiting. The index has to be
        # rebuilt with the seq names as keys, because the lookups are done
        # by name.
        remaining_seqs = buf1['items'][matching_seq_index + 1:]
        buf1['items'] = remaining_seqs
        buf1['index'] = {_parse_pair_direction_and_name(item)[0]: idx
                         for idx, item in enumerate(remaining_seqs)}

        # writes seqs from buffer 2 and fix buffer2
        write_seqs(buf2['items'], orphan_out_fhand, out_format)
        buf2['items'] = []
        buf2['index'] = {}

    # everything that is still waiting is orphan
    orphan_seqs = buf1['items'] + buf2['items']
    write_seqs(orphan_seqs, orphan_out_fhand, out_format)

    orphan_out_fhand.flush()
    flush_fhand(out_fhand)
コード例 #29
0
def copy_and_rename_ext_bin(app_dir, bin_dist_dir):
    '''It copies the platform specific external binaries to bin_dist_dir.

    The binaries are taken from the directory returned by
    get_platform_bin_dir for app_dir and the EXTERNAL_BIN_PREFIX setting is
    prepended to the name of every copy.
    '''
    src_dir = get_platform_bin_dir(app_dir)
    for fname in os.listdir(src_dir):
        src_path = join(src_dir, fname)
        prefixed_fname = settings.get_setting('EXTERNAL_BIN_PREFIX') + fname
        shutil.copy(src_path, join(bin_dist_dir, prefixed_fname))
コード例 #30
0
ファイル: seqio.py プロジェクト: terrycojones/seq_crumbs
def read_seq_packets(fhands, size=get_setting('PACKET_SIZE'), out_format=None,
                     file_format=GUESS_FORMAT, prefered_seq_classes=None):
    '''It reads the sequences and returns them grouped in packets.

    The sequences are read from fhands by read_seqs, using file_format,
    out_format and prefered_seq_classes, and they are handed to
    group_in_packets together with size. The default size is the PACKET_SIZE
    setting, that is looked up only once, when the function is defined.
    '''
    reader_options = {'out_format': out_format,
                      'prefered_seq_classes': prefered_seq_classes}
    return group_in_packets(read_seqs(fhands, file_format, **reader_options),
                            size)
コード例 #31
0
ファイル: bam_tools.py プロジェクト: JoseBlanca/seq_crumbs
def _create_picard_dict(fpath):
    '''It creates the picard dict of a file unless it already exists.

    The dict path is fpath with its extension replaced by .dict. When that
    path does not exist picard CreateSequenceDictionary is run with
    check_call, using the jar given by the PICARD_JAR setting.
    '''
    base_path = os.path.splitext(fpath)[0]
    dict_path = base_path + '.dict'
    if not os.path.exists(dict_path):
        cmd = ['java', '-jar', get_setting("PICARD_JAR"),
               'CreateSequenceDictionary']
        cmd.extend(['R=%s' % fpath, 'O=%s' % dict_path])
        # the stderr of picard is sent to a temporary file
        picard_stderr = NamedTemporaryFile(suffix='picard.stderr')
        check_call(cmd, stderr=picard_stderr)
コード例 #32
0
ファイル: mapping.py プロジェクト: fangly/seq_crumbs
def map_with_bwasw(index_fpath, bam_fpath, unpaired_fpath=None,
                    paired_fpaths=None, readgroup=None, threads=None,
                    log_fpath=None, extra_params=None):
    '''It maps the reads with the bwa bwasw algorithm and writes a bam.

    Either paired_fpaths or unpaired_fpath has to be given, but not both.
    The output of bwa is piped into picard AddOrReplaceReadGroups when a
    readgroup dict (with the keys ID, LB, PL, SM and PU) is given and into
    samtools view otherwise. In both cases the result is written to
    bam_fpath. The stderr of both processes goes to log_fpath or, if it is
    None, to a temporary file.

    It raises a RuntimeError if the input files are not properly given or
    if any of the processes ends with a non-zero return code. In the latter
    case the error message is the content of the stderr file.
    '''
    if paired_fpaths is None and unpaired_fpath is None:
        raise RuntimeError('At least one file to map is required')
    elif paired_fpaths is not None and unpaired_fpath is not None:
        msg = 'Bwa can not map paired and unpaired reads together'
        raise RuntimeError(msg)

    if readgroup is None:
        readgroup = {}

    if extra_params is None:
        extra_params = []

    binary = get_binary_path('bwa')
    cmd = [binary, 'bwasw', '-t', str(get_num_threads(threads)), index_fpath]
    cmd.extend(extra_params)

    if paired_fpaths is not None:
        cmd.extend(paired_fpaths)
    if unpaired_fpath is not None:
        cmd.append(unpaired_fpath)

    if log_fpath is None:
        stderr = NamedTemporaryFile(suffix='.stderr')
    else:
        stderr = open(log_fpath, 'w')
    bwa = popen(cmd, stderr=stderr, stdout=PIPE)

    # add readgroup using picard
    picard_tools = get_setting("PICARD_TOOLS_DIR")
    if readgroup:
        cmd = ['java', '-jar',
               os.path.join(picard_tools, 'AddOrReplaceReadGroups.jar'),
               'INPUT=/dev/stdin', 'OUTPUT={0}'.format(bam_fpath),
               'RGID={0}'.format(readgroup['ID']),
               'RGLB={0}'.format(readgroup['LB']),
               'RGPL={0}'.format(readgroup['PL']),
               'RGSM={0}'.format(readgroup['SM']),
               'RGPU={0}'.format(readgroup['PU']),
               'VALIDATION_STRINGENCY=LENIENT']
    else:
        cmd = [get_binary_path('samtools'), 'view', '-h', '-b', '-S', '-',
               '-o', bam_fpath]

    samtools = popen(cmd, stdin=bwa.stdout, stderr=stderr)
    bwa.stdout.close()  # Allow bwa to receive a SIGPIPE if samtools exits.
    samtools.communicate()
    # The returncode of a process is None until it has been waited for, so
    # without this wait a failure of bwa would never be noticed
    bwa.wait()
    if bwa.returncode or samtools.returncode:
        with open(stderr.name) as log_fhand:
            raise RuntimeError(log_fhand.read())
コード例 #33
0
    def ascii_plot(self):
        '''It plots columns with the nucleotide frequencies.

        It returns a string with one line per location, from the lowest to
        the highest location found in self.counts. Every line starts with a
        header that shows the location and the frequency of each nucleotide
        and it is followed by a bar in which every nucleotide fills a number
        of characters proportional to its frequency. The width of the bar
        is the MAX_WIDTH_ASCII_PLOT setting minus the length of the header.
        An empty string is returned when there are no counts.
        '''
        nucls = ('A', 'C', 'G', 'T', 'N')
        plot_nucls = ('a', 'C', 'g', 'T', 'n')
        counts = self.counts
        locs = counts.keys()
        if not locs:
            return ''
        loc_max = max(locs)
        loc_min = min(locs)
        loc_width = len(str(loc_max))
        loc_fmt = '{:>' + str(loc_width) + 'd}'

        def _header_for_nucl(loc):
            'It returns the header and the frequencies for the given position'
            header = loc_fmt.format(loc) + ' ('
            count = counts.get(loc, {})
            freqs = [count.get(n, 0) for n in nucls]
            tot_bases = sum(freqs)
            freqs = [f / tot_bases for f in freqs]
            freq_strs = [
                '{}: {:.2f}'.format(n, f) for n, f in zip(nucls, freqs)
            ]
            header += ', '.join(freq_strs) + ') | '
            return header, freqs

        # All the headers have the same length. It is measured in a location
        # that is known to be in the counts, the location 0 could be missing
        # and that would end in a division by zero.
        header_len = len(_header_for_nucl(loc_min)[0])
        plot_width = get_setting('MAX_WIDTH_ASCII_PLOT') - header_len
        val_per_pixel = 1 / plot_width
        lines = []
        for loc in range(loc_min, loc_max + 1):
            header, freqs = _header_for_nucl(loc)
            assert approx_equal(sum(freqs), 1)

            remainder_freqs = [
                float(re.sub(r'\d\.', '0.', str(f))) for f in freqs
            ]
            round_freqs = [int(round(f / val_per_pixel)) for f in freqs]

            # The rounding can leave the bar some characters short or long
            pixels_remaining = plot_width - sum(round_freqs)

            if pixels_remaining > 0:
                add_to_freq = remainder_freqs.index(max(remainder_freqs))
                round_freqs[add_to_freq] += pixels_remaining
            elif pixels_remaining < 0:
                # pixels_remaining is negative, so it has to be added to
                # shrink the bar. The excess is taken from the widest
                # column, so no column can end up with a negative width.
                widest = round_freqs.index(max(round_freqs))
                round_freqs[widest] += pixels_remaining
            assert approx_equal(sum(round_freqs), plot_width)
            bar = ''.join([n * f for f, n in zip(round_freqs, plot_nucls)])
            lines.append(header + bar + '\n')
        return ''.join(lines)
コード例 #34
0
ファイル: blast.py プロジェクト: bharatpatel/seq_crumbs
 def __init__(self, seqs_fpath, seqs, program, params=None, filters=None, elongate_for_global=False, seqs_type=None):
     """It inits the class.

     program, filters and elongate_for_global are stored as given (an empty
     list is used when filters is None). params is stored as a new dict in
     which max_target_seqs is always set to the PACKET_SIZE setting, as a
     str. The match parts returned by _look_for_blast_matches for
     seqs_fpath, seqs and seqs_type are kept in self._match_parts.
     """
     self.program = program
     # A copy is used to avoid modifying the dict given by the caller
     params = {} if params is None else dict(params)
     params["max_target_seqs"] = str(get_setting("PACKET_SIZE"))
     self.params = params
     self.filters = [] if filters is None else filters
     self.elongate_for_global = elongate_for_global
     self._match_parts = self._look_for_blast_matches(seqs_fpath, seqs, seqs_type)
コード例 #35
0
def calculate_dust_score(seq):
    '''It returns the dust score.

    From: "A Fast and Symmetric DUST Implementation to Mask Low-Complexity DNA
    Sequences"
    doi:10.1089/cmb.2006.13.1028

    and re-implemented from PRINSEQ

    The sequence is scored in windows of DUST_WINDOWSIZE residues that
    advance DUST_WINDOWSTEP residues each time. The stretch of sequence left
    after the last window start is scored too when it is longer than 5
    residues. The result is the mean of all these scores, scaled to a
    maximum of 100.

    It returns 0 for sequences of length 3 and None for any other sequence
    with 5 or fewer residues.
    '''
    seq = get_str_seq(seq)
    length = len(seq)
    if length == 3:
        return 0
    if length <= 5:
        return None

    windowsize = get_setting('DUST_WINDOWSIZE')
    windowstep = get_setting('DUST_WINDOWSTEP')

    dustscores = []
    if length > windowsize:
        windows = 0
        for seq_in_win in rolling_window(seq, windowsize, windowstep):
            score = _calculate_rawscore(seq_in_win)
            dustscores.append(score / (windowsize - 2))
            windows += 1
        remaining_seq = seq[windows * windowstep:]
    else:
        remaining_seq = seq

    # The length has to be compared, not the str itself. Comparing the str
    # with an int is always True in Python 2 (so a remaining sequence of 2
    # or 3 residues ended in a division by zero) and fails in Python 3.
    remaining_length = len(remaining_seq)
    if remaining_length > 5:
        score = _calculate_rawscore(remaining_seq)
        dustscore = (score / (remaining_length - 3) * (windowsize - 2) /
                     (remaining_length - 2))
        dustscores.append(dustscore)

    if not dustscores:
        # nothing long enough to be scored
        return None

    # max score should be 100 not 31
    dustscore = sum(dustscores) / len(dustscores) * 100 / 31
    return dustscore
コード例 #36
0
def _create_picard_dict(fpath):
    '''It creates the picard dict for the given file if it is missing.

    The dict is looked for in the path of fpath with the extension changed
    to .dict. If it is not there picard CreateSequenceDictionary (the jar
    comes from the PICARD_JAR setting) is run with check_call to build it.
    '''
    root, _ = os.path.splitext(fpath)
    dict_path = root + '.dict'
    if os.path.exists(dict_path):
        # nothing to do, the dict is already there
        return

    cmd = ['java', '-jar']
    cmd.append(get_setting("PICARD_JAR"))
    cmd.append('CreateSequenceDictionary')
    cmd.append('R=%s' % fpath)
    cmd.append('O=%s' % dict_path)
    stderr_fhand = NamedTemporaryFile(suffix='picard.stderr')
    check_call(cmd, stderr=stderr_fhand)
コード例 #37
0
ファイル: statistics.py プロジェクト: bharatpatel/seq_crumbs
    def ascii_plot(self):
        '''It plots columns with the nucleotide frequencies.

        It returns a string with one line per location, from the lowest to
        the highest location found in self.counts. Every line starts with a
        header that shows the location and the frequency of each nucleotide
        and it is followed by a bar in which every nucleotide fills a number
        of characters proportional to its frequency. The width of the bar
        is the MAX_WIDTH_ASCII_PLOT setting minus the length of the header.
        An empty string is returned when there are no counts.
        '''
        nucls = ('A', 'C', 'G', 'T', 'N')
        plot_nucls = ('a', 'C', 'g', 'T', 'n')
        counts = self.counts
        locs = counts.keys()
        if not locs:
            return ''
        loc_max = max(locs)
        loc_min = min(locs)
        loc_width = len(str(loc_max))
        loc_fmt = '{:>' + str(loc_width) + 'd}'

        def _header_for_nucl(loc):
            'It returns the header and the frequencies for the given position'
            header = loc_fmt.format(loc) + ' ('
            count = counts.get(loc, {})
            freqs = [count.get(n, 0) for n in nucls]
            tot_bases = sum(freqs)
            freqs = [f / tot_bases for f in freqs]
            freq_strs = ['{}: {:.2f}'.format(n, f)
                         for n, f in zip(nucls, freqs)]
            header += ', '.join(freq_strs) + ') | '
            return header, freqs

        # All the headers have the same length. It is measured in a location
        # that is known to be in the counts, the location 0 could be missing
        # and that would end in a division by zero.
        header_len = len(_header_for_nucl(loc_min)[0])
        plot_width = get_setting('MAX_WIDTH_ASCII_PLOT') - header_len
        val_per_pixel = 1 / plot_width
        lines = []
        for loc in range(loc_min, loc_max + 1):
            header, freqs = _header_for_nucl(loc)
            assert approx_equal(sum(freqs), 1)

            remainder_freqs = [float(re.sub(r'\d\.', '0.', str(f)))
                               for f in freqs]
            round_freqs = [int(round(f / val_per_pixel)) for f in freqs]

            # The rounding can leave the bar some characters short or long
            pixels_remaining = plot_width - sum(round_freqs)

            if pixels_remaining > 0:
                add_to_freq = remainder_freqs.index(max(remainder_freqs))
                round_freqs[add_to_freq] += pixels_remaining
            elif pixels_remaining < 0:
                # pixels_remaining is negative, so it has to be added to
                # shrink the bar. The excess is taken from the widest
                # column, so no column can end up with a negative width.
                widest = round_freqs.index(max(round_freqs))
                round_freqs[widest] += pixels_remaining
            assert approx_equal(sum(round_freqs), plot_width)
            bar = ''.join([n * f for f, n in zip(round_freqs, plot_nucls)])
            lines.append(header + bar + '\n')
        return ''.join(lines)
コード例 #38
0
ファイル: bam_tools.py プロジェクト: JoseBlanca/seq_crumbs
def merge_sams(in_fpaths, out_fpath):
    '''It merges the given sam files into out_fpath with picard.

    Picard MergeSamFiles (the jar comes from the PICARD_JAR setting) is run
    with check_call, its stderr and stdout are kept in temporary files. If
    the call fails with a CalledProcessError the content of those files is
    written to sys.stderr and sys.stdout and the error is not re-raised.
    '''
    cmd = ['java', '-jar', get_setting("PICARD_JAR"), 'MergeSamFiles',
           'O={}'.format(out_fpath)]
    cmd.extend('I={}'.format(in_fpath) for in_fpath in in_fpaths)

    picard_stderr = NamedTemporaryFile(suffix='picard.stderr')
    picard_stdout = NamedTemporaryFile(suffix='picard.stdout')
    try:
        check_call(cmd, stderr=picard_stderr, stdout=picard_stdout)
    except CalledProcessError:
        error_output = open(picard_stderr.name).read()
        sys.stderr.write(error_output)
        std_output = open(picard_stdout.name).read()
        sys.stdout.write(std_output)
コード例 #39
0
def merge_sams(in_fpaths, out_fpath):
    '''It runs picard MergeSamFiles to merge in_fpaths into out_fpath.

    The jar to run is taken from the PICARD_JAR setting. The output of
    picard is captured in temporary files and it is only shown, in
    sys.stderr and sys.stdout, when check_call raises a CalledProcessError.
    That error is handled here, it does not reach the caller.
    '''
    picard_jar = get_setting("PICARD_JAR")
    output_arg = 'O={}'.format(out_fpath)
    input_args = ['I={}'.format(in_fpath) for in_fpath in in_fpaths]
    cmd = ['java', '-jar', picard_jar, 'MergeSamFiles', output_arg]
    cmd += input_args

    err_fhand = NamedTemporaryFile(suffix='picard.stderr')
    out_fhand = NamedTemporaryFile(suffix='picard.stdout')
    try:
        check_call(cmd, stderr=err_fhand, stdout=out_fhand)
    except CalledProcessError:
        # show what picard wrote before failing
        sys.stderr.write(open(err_fhand.name).read())
        sys.stdout.write(open(out_fhand.name).read())
コード例 #40
0
ファイル: mapping.py プロジェクト: radaniba/seq_crumbs
def sort_mapped_reads(map_process, out_fpath, key='coordinate',
                      tempdir='/tmp'):
    '''It sorts the output of a mapping process with picard SortSam.

    The stdout of map_process is used as the stdin of picard, that writes
    the result to out_fpath sorted by the given key. SortSam.jar is looked
    for in the directory given by the PICARD_TOOLS_DIR setting and tempdir
    is given to picard as its TMP_DIR.

    It raises a RuntimeError with the stderr of picard if picard ends with a
    non-zero return code.
    '''
    picard_tools = get_setting("PICARD_TOOLS_DIR")
    fpath = os.path.join(picard_tools, 'SortSam.jar')
    cmd = ['java', '-jar', fpath, 'I=/dev/stdin',
           'O=' + out_fpath, 'SO=' + key, 'TMP_DIR=' + tempdir,
           'VALIDATION_STRINGENCY=LENIENT']
    sort = subprocess.Popen(cmd, stdin=map_process.stdout, stderr=PIPE)
    # communicate reads the stderr pipe while it waits for the process.
    # A plain wait could block forever if picard fills the pipe.
    stderr_output = sort.communicate()[1]
    if sort.returncode:
        msg = 'Something happened running picard sort:\n'
        msg += stderr_output
        raise RuntimeError(msg)
コード例 #41
0
ファイル: mapping.py プロジェクト: JoseBlanca/seq_crumbs
def map_process_to_sortedbam(map_process, out_fpath, key='coordinate',
                             log_fpath=None, tempdir=None):
    '''It sorts the output of a mapping process with picard SortSam.

    The stdout of map_process is piped into picard (the jar comes from the
    PICARD_JAR setting), that writes the result to out_fpath sorted by key.
    The stderr of picard goes to log_fpath or to a temporary file when no
    log_fpath is given. The directory returned by tempfile.gettempdir is
    used as TMP_DIR if tempdir is None. The return codes are not checked.
    '''
    if log_fpath is not None:
        stderr = open(log_fpath, 'w')
    else:
        stderr = NamedTemporaryFile(suffix='.stderr')

    tempdir = tempfile.gettempdir() if tempdir is None else tempdir

    cmd = ['java', '-jar', get_setting("PICARD_JAR"), 'SortSam']
    cmd += ['I=/dev/stdin', 'O=' + out_fpath, 'SO=' + key,
            'TMP_DIR=' + tempdir, 'VALIDATION_STRINGENCY=LENIENT']
    sort = popen(cmd, stdin=map_process.stdout, stderr=stderr)
    # the pipe is now owned by the sort process
    map_process.stdout.close()
    sort.communicate()
コード例 #42
0
def map_process_to_sortedbam(map_process, out_fpath, key='coordinate',
                             log_fpath=None, tempdir=None):
    '''It pipes a mapping process into picard SortSam.

    SortSam.jar is looked for in the directory given by the
    PICARD_TOOLS_DIR setting. It reads the stdout of map_process and writes
    the result, sorted by key, to out_fpath. The stderr of picard is written
    to log_fpath, or to a temporary file if log_fpath is None. When tempdir
    is None tempfile.gettempdir is used. The return codes are not checked.
    '''
    stderr = (NamedTemporaryFile(suffix='.stderr') if log_fpath is None
              else open(log_fpath, 'w'))

    if tempdir is None:
        tempdir = tempfile.gettempdir()

    sortsam_jar = os.path.join(get_setting("PICARD_TOOLS_DIR"), 'SortSam.jar')
    picard_args = ['I=/dev/stdin', 'O=' + out_fpath, 'SO=' + key,
                   'TMP_DIR=' + tempdir, 'VALIDATION_STRINGENCY=LENIENT']
    sort = popen(['java', '-jar', sortsam_jar] + picard_args,
                 stdin=map_process.stdout, stderr=stderr)
    map_process.stdout.close()
    sort.communicate()
コード例 #43
0
ファイル: blast.py プロジェクト: bharatpatel/seq_crumbs
    def get_matched_segments_for_read(self, read_name):
        """It returns the segments matched in the given read.

        None is returned when there are no match parts stored for the read.
        Otherwise it returns a tuple with the segments calculated by
        covered_segments_from_match_parts and a bool that is True if any
        match part has an elongation bigger than the
        DEFAULT_IGNORE_ELONGATION_SHORTER setting.
        """
        min_elongation = get_setting("DEFAULT_IGNORE_ELONGATION_SHORTER")

        try:
            parts = self._match_parts[read_name]
        except KeyError:
            # the blast found nothing for this read
            return None

        # every match part is checked for a long enough elongation
        elongated_flags = [ELONGATED in part and part[ELONGATED] > min_elongation
                           for part in parts]
        segments = covered_segments_from_match_parts(parts, in_query=False)
        return segments, any(elongated_flags)
コード例 #44
0
def _guess_fastq_version(fhand, force_file_as_non_seek):
    '''It guesses the format of fastq files.

    It ignores the solexa fastq version.

    It returns fastq or fastq-illumina, with the -multiline suffix added
    when the first lines of the file do not follow the four lines per
    sequence layout. When the qualities do not settle the version the file
    is considered illumina if no sequence is longer than the
    LONGEST_EXPECTED_ILLUMINA_READ setting, otherwise an
    UndecidedFastqVersionError is raised.
    '''
    lengths, is_sanger, chunk = _get_some_qual_and_lengths(fhand,
                                                        force_file_as_non_seek)
    if is_sanger:
        fmt = 'fastq'
    elif is_sanger is False:
        fmt = 'fastq-illumina'
    else:
        fmt = None

    # one line fastq? All seq in just one line?
    lines = chunk.splitlines()[:5]
    if len(lines) != 5:
        one_line = ''
    else:
        if (not lines[0].startswith('@') or
            not lines[2].startswith('+') or
            not lines[4].startswith('@') or
            len(lines[1]) != len(lines[3])):
            one_line = '-multiline'
        else:
            one_line = ''

    if fmt:
        return fmt + one_line

    longest_expected_illumina = get_setting('LONGEST_EXPECTED_ILLUMINA_READ')
    if any(l > longest_expected_illumina for l in lengths):
        msg = 'It was not possible to guess the format of '
        if hasattr(fhand, 'name'):
            msg += 'the file ' + fhand.name
        else:
            msg += 'a file'
        # The rest of the message is appended, it must not replace the
        # first part. Only the last piece is formatted, so a % in the file
        # name can not break the formatting.
        msg += '.\nThe quality values could be Illumina, but there are '
        msg += 'sequences longer than %i bp.' % longest_expected_illumina
        raise UndecidedFastqVersionError(msg)
    else:
        return 'fastq-illumina' + one_line
コード例 #45
0
    def get_matched_segments_for_read(self, read_name):
        '''It returns the matched segments and if there was an elongation.

        The match parts of the read are taken from self._match_parts, None
        is returned if the read is not found there. The result is a tuple
        with the segments given by covered_segments_from_match_parts and a
        bool that tells if the elongation of any match part is bigger than
        the DEFAULT_IGNORE_ELONGATION_SHORTER setting.
        '''
        threshold = get_setting('DEFAULT_IGNORE_ELONGATION_SHORTER')

        try:
            read_match_parts = self._match_parts[read_name]
        except KeyError:
            # There was no match in the blast
            return None

        # Has any of the match parts been elongated?
        n_elongated = 0
        for match_part in read_match_parts:
            if ELONGATED not in match_part:
                continue
            if match_part[ELONGATED] > threshold:
                n_elongated += 1
        elongated_match = n_elongated > 0

        segments = covered_segments_from_match_parts(read_match_parts,
                                                     in_query=False)
        return segments, elongated_match
コード例 #46
0
def _guess_fastq_version(fhand, force_file_as_non_seek):
    '''It guesses the format of fastq files.

    It ignores the solexa fastq version.

    It returns fastq or fastq-illumina, with the -multiline suffix added
    when the first lines of the file do not follow the four lines per
    sequence layout. When the qualities do not settle the version the file
    is considered illumina if no sequence is longer than the
    LONGEST_EXPECTED_ILLUMINA_READ setting, otherwise an
    UndecidedFastqVersionError is raised.
    '''
    lengths, is_sanger, chunk = _get_some_qual_and_lengths(
        fhand, force_file_as_non_seek)
    if is_sanger:
        fmt = 'fastq'
    elif is_sanger is False:
        fmt = 'fastq-illumina'
    else:
        fmt = None

    # one line fastq? All seq in just one line?
    lines = chunk.splitlines()[:5]
    if len(lines) != 5:
        one_line = ''
    else:
        if (not lines[0].startswith('@') or not lines[2].startswith('+')
                or not lines[4].startswith('@')
                or len(lines[1]) != len(lines[3])):
            one_line = '-multiline'
        else:
            one_line = ''

    if fmt:
        return fmt + one_line

    longest_expected_illumina = get_setting('LONGEST_EXPECTED_ILLUMINA_READ')
    if any(l > longest_expected_illumina for l in lengths):
        msg = 'It was not possible to guess the format of '
        if hasattr(fhand, 'name'):
            msg += 'the file ' + fhand.name
        else:
            msg += 'a file'
        # The rest of the message is appended, it must not replace the
        # first part. Only the last piece is formatted, so a % in the file
        # name can not break the formatting.
        msg += '.\nThe quality values could be Illumina, but there are '
        msg += 'sequences longer than %i bp.' % longest_expected_illumina
        raise UndecidedFastqVersionError(msg)
    else:
        return 'fastq-illumina' + one_line
コード例 #47
0
 def __init__(self,
              seqs_fpath,
              seqs,
              program,
              params=None,
              filters=None,
              elongate_for_global=False,
              seqs_type=None):
     '''It inits the class.

     program, filters and elongate_for_global are stored as given (an empty
     list is used when filters is None). params is stored as a new dict in
     which max_target_seqs is always set to the PACKET_SIZE setting, as a
     str. The match parts returned by _look_for_blast_matches for
     seqs_fpath, seqs and seqs_type are kept in self._match_parts.
     '''
     self.program = program
     # A copy is used to avoid modifying the dict given by the caller
     params = {} if params is None else dict(params)
     params['max_target_seqs'] = str(get_setting('PACKET_SIZE'))
     self.params = params
     self.filters = [] if filters is None else filters
     self.elongate_for_global = elongate_for_global
     self._match_parts = self._look_for_blast_matches(
         seqs_fpath, seqs, seqs_type)
コード例 #48
0
ファイル: bam_tools.py プロジェクト: JoseBlanca/seq_crumbs
def sort_bam(in_bam_fpath, out_bam_fpath=None):
    '''It sorts a bam file by coordinate with picard SortSam.

    The result is written to out_bam_fpath. If no output path is given, or
    if it is the same as the input one, the bam is sorted into a temporary
    file that is moved over the input file once picard has finished. The
    picard jar is taken from the PICARD_JAR setting and it is run with
    check_call.
    '''
    if out_bam_fpath is None:
        out_bam_fpath = in_bam_fpath

    in_place = out_bam_fpath == in_bam_fpath
    if in_place:
        # picard can not write over the file that it is reading
        sorted_fhand = NamedTemporaryFile(suffix='.sorted.bam', delete=False)
        temp_out_fpath = sorted_fhand.name
    else:
        temp_out_fpath = out_bam_fpath

    cmd = ['java', '-jar', get_setting("PICARD_JAR"), 'SortSam']
    cmd.append('INPUT={0}'.format(in_bam_fpath))
    cmd.append('OUTPUT={0}'.format(temp_out_fpath))
    cmd.extend(['SORT_ORDER=coordinate', 'VALIDATION_STRINGENCY=LENIENT'])
    picard_stderr = NamedTemporaryFile(suffix='picard.stderr')
    check_call(cmd, stderr=picard_stderr)

    if temp_out_fpath != out_bam_fpath:
        shutil.move(temp_out_fpath, out_bam_fpath)
コード例 #49
0
ファイル: mate_chimeras.py プロジェクト: djinnome/seq_crumbs
def classify_chimeras(in_fhand, index_fpath, mate_distance, out_fhand,
                      chimeras_fhand=None, unknown_fhand=None, tempdir=None,
                      threads=None, settings=get_setting('CHIMERAS_SETTINGS')):
    '''It maps the pairs and writes them according to their classification.

    The interleaved reads of in_fhand are mapped with map_with_bwamem
    against index_fpath and sorted by query name into a temporary bam. The
    pairs classified by classify_mapped_reads as non chimeric are written
    to out_fhand. The chimeric and the unknown ones are written to
    chimeras_fhand and unknown_fhand when those are given, otherwise they
    are dropped.
    '''
    sorted_bam_fhand = NamedTemporaryFile(suffix='.bam')
    bwa_params = ['-a', '-M']
    mapper = map_with_bwamem(index_fpath, interleave_fpath=in_fhand.name,
                             extra_params=bwa_params)
    map_process_to_sortedbam(mapper, sorted_bam_fhand.name, key='queryname',
                             tempdir=tempdir)

    classified_pairs = classify_mapped_reads(sorted_bam_fhand,
                                             settings=settings,
                                             mate_distance=mate_distance)
    for pair, kind in classified_pairs:
        if kind is NON_CHIMERIC:
            write_seqs(pair, out_fhand)
            continue
        if kind is CHIMERA and chimeras_fhand is not None:
            destination = chimeras_fhand
        elif kind is UNKNOWN and unknown_fhand is not None:
            destination = unknown_fhand
        else:
            # there is no file for this kind of pair
            continue
        write_seqs(pair, destination)
コード例 #50
0
def sort_bam(in_bam_fpath, out_bam_fpath=None):
    '''It runs picard SortSam to sort a bam by coordinate.

    When out_bam_fpath is None or equal to in_bam_fpath the input file is
    replaced by the sorted one: picard writes to a temporary file that is
    moved to the final path afterwards. In any other case picard writes
    directly to out_bam_fpath. The jar is taken from the PICARD_JAR setting.
    '''
    target_fpath = in_bam_fpath if out_bam_fpath is None else out_bam_fpath

    if target_fpath == in_bam_fpath:
        sorted_fhand = NamedTemporaryFile(suffix='.sorted.bam', delete=False)
        picard_out_fpath = sorted_fhand.name
    else:
        picard_out_fpath = target_fpath

    picard_jar = get_setting("PICARD_JAR")
    io_args = ['INPUT={0}'.format(in_bam_fpath),
               'OUTPUT={0}'.format(picard_out_fpath)]
    sort_args = ['SORT_ORDER=coordinate', 'VALIDATION_STRINGENCY=LENIENT']
    cmd = ['java', '-jar', picard_jar, 'SortSam'] + io_args + sort_args
    stderr_fhand = NamedTemporaryFile(suffix='picard.stderr')
    check_call(cmd, stderr=stderr_fhand)

    # the temporary result takes the place of the original file
    if picard_out_fpath != target_fpath:
        shutil.move(picard_out_fpath, target_fpath)
コード例 #51
0
def map_process_to_sortedbam(map_process, out_fpath, key='coordinate',
                             stderr_fhand=None, tempdir=None):
    '''It sorts the output of a mapping process with picard SortSam.

    The stdout of map_process is piped into picard (the jar comes from the
    PICARD_JAR setting), that writes the result to out_fpath sorted by key.
    The stderr of picard goes to stderr_fhand or to a temporary file when it
    is None. The directory returned by tempfile.gettempdir is used as
    TMP_DIR if tempdir is None.

    It raises a RuntimeError if the mapping process or picard end with a
    non-zero return code.
    '''
    if stderr_fhand is None:
        stderr = NamedTemporaryFile(suffix='.stderr')
    else:
        stderr = stderr_fhand

    if tempdir is None:
        tempdir = tempfile.gettempdir()
    picard_jar = get_setting("PICARD_JAR")
    cmd = ['java', '-jar', picard_jar, 'SortSam', 'I=/dev/stdin',
           'O=' + out_fpath, 'SO=' + key, 'TMP_DIR=' + tempdir,
           'VALIDATION_STRINGENCY=LENIENT']
    sort = popen(cmd, stdin=map_process.stdout, stderr=stderr)
    map_process.stdout.close()
    sort.communicate()
    # The returncode of a process is None until it has been waited for, so
    # without this wait the mapping error below would never be raised
    map_process.wait()

    if map_process.returncode:
        raise RuntimeError('Error in mapping process')

    if sort.returncode:
        raise RuntimeError('Error in Sort process')
コード例 #52
0
ファイル: filters.py プロジェクト: radaniba/seq_crumbs
def filter_chimeras(ref_fpath, out_fhand, chimeras_fhand, in_fhands,
                    unknown_fhand, unpaired=False, paired_result=True,
                    settings=get_setting('CHIMERAS_SETTINGS'),
                    min_seed_len=None, directory=None):
    file_format = get_format(in_fhands[0])
    if unpaired:
        unpaired_fpaths = [fhand.name for fhand in in_fhands]
        paired_fpaths = None
    else:
        f_fhand = NamedTemporaryFile()
        r_fhand = NamedTemporaryFile()
        seqs = read_seqs(in_fhands)
        deinterleave_pairs(seqs, f_fhand, r_fhand, file_format)
        paired_fpaths = [f_fhand.name, r_fhand.name]
        unpaired_fpaths = None
    bamfile = _sorted_mapped_reads(ref_fpath, paired_fpaths, unpaired_fpaths,
                                   directory, file_format, min_seed_len)

    total = 0
    chimeric = 0
    unknown = 0
    for pair, kind in classify_mapped_reads(bamfile, settings=settings,
                                           paired_result=paired_result,
                                           file_format=file_format):
        if kind is NON_CHIMERIC:
            write_seqs(pair, out_fhand)
        elif kind is CHIMERA and chimeras_fhand is not None:
            write_seqs(pair, chimeras_fhand)
            chimeric += 1
        elif kind is UNKNOWN and unknown_fhand is not None:
            write_seqs(pair, unknown_fhand)
            unknown += 1
        total += 1
    mapped = total - chimeric - unknown
    print 'Total pairs analyzed: ', total
    print 'Chimeric pairs filtered: ', chimeric, '\t', chimeric / float(total)
    print 'Unknown pairs found: ', unknown, '\t', unknown / float(total)
    print 'Non-chimeric pairs: ', mapped, '\t', mapped / float(total)
コード例 #53
0
def seqio(in_fhands, out_fhand, out_format, copy_if_same_format=True):
    '''It converts sequence files between formats.

    When there is only one input file and it is already in out_format no
    conversion is done: the content is copied to out_fhand or, if
    copy_if_same_format is False, a relative symlink to the input file is
    created in the path of out_fhand. In any other case the sequences are
    read from in_fhands and written to out_fhand in out_format.

    It raises an IncompatibleFormatError if out_format is not one of the
    SUPPORTED_OUTPUT_FORMATS or if the output format requires qualities that
    are not available, and a MalformedFile if error_quality_disagree
    recognizes the ValueError raised while writing.
    '''
    if out_format not in get_setting('SUPPORTED_OUTPUT_FORMATS'):
        raise IncompatibleFormatError("This output format is not supported")

    in_formats = [get_format(fhand) for fhand in in_fhands]

    if len(in_fhands) == 1 and in_formats[0] == out_format:
        if copy_if_same_format:
            copyfileobj(in_fhands[0], out_fhand)
        else:
            rel_symlink(in_fhands[0].name, out_fhand.name)
    else:
        seqs = _read_seqrecords(in_fhands)
        try:
            write_seqrecs(seqs, out_fhand, out_format)
        # the as syntax is valid from Python 2.6 on, Python 3 included
        except ValueError as error:
            if error_quality_disagree(error):
                raise MalformedFile(str(error))
            if 'No suitable quality scores' in str(error):
                msg = 'No qualities available to write output file'
                raise IncompatibleFormatError(msg)
            raise
コード例 #54
0
from subprocess import check_output, CalledProcessError
from tempfile import NamedTemporaryFile

from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

from crumbs.transcript_orientations import TranscriptOrientator
from crumbs.settings import get_setting
from crumbs.utils.test_utils import TEST_DATA_DIR
from crumbs.utils.bin_utils import BIN_DIR
from crumbs.utils.tags import SEQRECORD
from crumbs.seq import get_str_seq
from crumbs.seqio import read_seqs
from crumbs.seq import SeqWrapper

# The setting is read only once, when the module is imported
POLYA_ANNOTATOR_MISMATCHES = get_setting('POLYA_ANNOTATOR_MISMATCHES')

# pylint: disable=R0201
# pylint: disable=R0904


# It builds a SeqWrapper for the given seq, with SEQRECORD as the first
# argument and None as the third one
_wrap_seq = lambda seq: SeqWrapper(SEQRECORD, seq, None)


class TestTranscriptomeOrientator(unittest.TestCase):
    'It holds the tests of the transcriptome orientator'

    def test_transcriptome_orientator(self):
        '''It tests the orientator class'''
        # NOTE(review): only the path to the estscan matrix found in the
        # test data directory is built here, the rest of the test seems to
        # be missing from this excerpt
        estscan_matrix = os.path.join(TEST_DATA_DIR,
                                      'Arabidopsis_thaliana.smat')
コード例 #55
0
 def test_get_settings(self):
     'It checks that the DEFAULT_KMER_SIZE setting has a true value'
     assert get_setting('DEFAULT_KMER_SIZE')
コード例 #56
0
    def test_split_mates(self):
        '''It tests how MatePairSplitter splits the reads of a fasta file.

        Reads with a complete linker, without linker, with a partial
        linker, with the linker at the start, with two linkers and with the
        reverse linker are written to a temporary fasta file. They are
        split in packets and the fasta output is compared with the expected
        one. A second file checks a read with a short linker at its end.
        '''

        mate_fhand = NamedTemporaryFile(suffix='.fasta')
        linker = TITANIUM_LINKER

        # a complete linker
        seq5 = 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC'
        seq3 = 'ATCGATCATGTTGTATTGTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT'

        mate_fhand.write('>seq1\n' + seq5 + linker + seq3 + '\n')
        # no linker
        mate_fhand.write('>seq2\n' + seq5 + '\n')
        # a partial linker
        mate_fhand.write('>seq3\n' + seq5 + linker[2:25] + seq3 + '\n')
        # the linker is at the 5 prime end, without its first 10 bases
        mate_fhand.write('>seq4\n' + linker[10:] + seq3 + '\n')
        # two linkers
        mate_fhand.write('>seq5\n' + linker + seq3 + FLX_LINKER + seq5 + '\n')
        # reverse linker
        rev_linker = get_setting('TITANIUM_LINKER_REV')
        mate_fhand.write('>seq6\n' + seq5 + rev_linker + seq3 + '\n')
        mate_fhand.flush()

        # the reads are split in packets of two sequences
        splitter = MatePairSplitter()
        new_seqs = []
        for packet in read_seq_packets([mate_fhand], 2):
            new_seqs.append(splitter(packet))

        out_fhand = StringIO()
        write_seq_packets(out_fhand, new_seqs, file_format='fasta')

        result = out_fhand.getvalue()
        # raw strings are used for the names to keep the backslash of the
        # \1 and \2 suffixes
        xpect = r'>seq1\1'
        xpect += '\n'
        xpect += 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC\n'
        xpect += r'>seq1\2'
        xpect += '\n'
        xpect += 'ATCGATCATGTTGTATTGTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n'
        xpect += '>seq2\n'
        xpect += 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC\n'
        xpect += '>seq3_pl.part1\n'
        xpect += 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTG\n'
        xpect += '>seq3_pl.part2\n'
        xpect += 'GTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n'
        xpect += '>seq4\n'
        xpect += 'ATCGATCATGTTGTATTGTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n'
        xpect += '>seq5_mlc.part1\n'
        xpect += 'TCGTATAACTTCGTATAATGTATGCTATACGAAGTTATTACGATCGATCATGTTGTAT'
        xpect += 'TG'
        xpect += 'TGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n'
        xpect += '>seq5_mlc.part2\n'
        xpect += 'ACCTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC'
        xpect += '\n'
        xpect += r'>seq6\1'
        xpect += '\n'
        xpect += 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC\n'
        xpect += r'>seq6\2'
        xpect += '\n'
        xpect += 'ATCGATCATGTTGTATTGTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n'
        assert xpect == result

        # with a short linker at the 3 prime end
        mate_fhand = NamedTemporaryFile(suffix='.fasta')
        seq = ">seq1\nCATCAATGACATCACAAATGACATCAACAAACTCAAA"
        seq += "CTCACATACACTGCTGTACCGTAC"
        mate_fhand.write(seq)
        mate_fhand.flush()
        splitter = MatePairSplitter()
        new_seqs = []
        for packet in read_seq_packets([mate_fhand], 1):
            new_seqs.append(splitter(packet))
        out_fhand = StringIO()
        write_seq_packets(out_fhand, new_seqs, file_format='fasta')
        # the expected read has lost the last bases of the input one
        result = ">seq1\nCATCAATGACATCACAAATGACATCAACAAACTCAAACTCACATACA\n"
        assert result == out_fhand.getvalue()
コード例 #57
0
import os.path
from random import choice

from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

from crumbs.split_mates import MatePairSplitter
from crumbs.settings import get_setting
from crumbs.seqio import read_seq_packets, write_seq_packets, read_seqs
from crumbs.utils.bin_utils import BIN_DIR
from crumbs.utils.test_utils import TEST_DATA_DIR
from crumbs.utils.seq_utils import process_seq_packets
from crumbs.utils.tags import SEQRECORD
from crumbs.seq import get_name, SeqWrapper, get_str_seq

# The linkers are read from the settings when the module is imported
TITANIUM_LINKER = get_setting('TITANIUM_LINKER')
FLX_LINKER = get_setting('FLX_LINKER')

# pylint: disable=R0201
# pylint: disable=R0904


def create_a_matepair_file():
    'It creates a matepair fasta file'

    seq_5 = 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC'
    seq_3 = 'ATCGATCATGTTGTATTGTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT'
    mate_seq = seq_5 + TITANIUM_LINKER + seq_3
    mate_fhand = NamedTemporaryFile(suffix='.fasta')
    mate_fhand.write('>seq1\n' + mate_seq + '\n')
    mate_fhand.flush()