def _get_some_qual_and_lengths(fhand, force_file_as_non_seek):
    '''It peeks at the first fastq records to gather quality information.

    It returns a (lengths, is_sanger, chunk) tuple. As soon as a quality
    character with an ordinal below 64 is found it returns
    (None, True, chunk). Otherwise lengths is an array with the quality
    length of every analyzed record and is_sanger is None (undecided).
    chunk is the text that was read from the start of the file.

    It raises UndecidedFastqVersionError if the fastq parser fails with a
    ValueError. The file handle is always rewound before leaving.
    '''
    max_seqs = get_setting('SEQS_TO_GUESS_FASTQ_VERSION')
    bytes_to_read = get_setting('CHUNK_TO_GUESS_FASTQ_VERSION')

    read_directly = fhand_is_seekable(fhand) and not force_file_as_non_seek
    if read_directly:
        chunk = fhand.read(bytes_to_read)
        fhand.seek(0)
        fastq_source = fhand
    else:
        # the records are parsed from an in-memory copy of the peeked chunk
        chunk = peek_chunk_from_file(fhand, bytes_to_read)
        fastq_source = cStringIO.StringIO(chunk)

    lengths = array('I')
    try:
        records = FastqGeneralIterator(fastq_source)
        for n_seqs, record in enumerate(records, 1):
            quals = [ord(char) for char in record[2]]
            if any(qual < 64 for qual in quals):
                fhand.seek(0)
                # no quals, no lengths, is_sanger
                return None, True, chunk
            lengths.append(len(quals))
            if n_seqs > max_seqs:
                break
    except ValueError:
        msg = 'The file is Fastq, but the version is difficult to guess'
        raise UndecidedFastqVersionError(msg)
    finally:
        fhand.seek(0)
    # don't know if it's sanger
    return lengths, None, chunk
def calculate_bin_edges(self, min_, max_, n_bins=None):
    '''It returns the list of bin edges that cover the min_-max_ range.

    When n_bins is not given it is derived from self.count and the
    MEAN_VALUES_IN_BIN setting, kept between MIN_BINS and MAX_BINS and never
    bigger than the number of different values (max_ - min_).
    '''
    min_bins = get_setting('MIN_BINS')
    max_bins = get_setting('MAX_BINS')

    if n_bins is None:
        num_values = max_ - min_
        if num_values == 0:
            n_bins = 1
        elif num_values < min_bins:
            n_bins = num_values
        else:
            n_bins = int(self.count / get_setting('MEAN_VALUES_IN_BIN'))
            # clamp into [min_bins, max_bins] and then to the value range
            n_bins = max(n_bins, min_bins)
            n_bins = min(n_bins, max_bins)
            n_bins = min(n_bins, num_values)

    # the span is widened until it is a multiple of the number of bins
    span = 1 if max_ == min_ else max_ - min_
    remainder = span % n_bins
    if remainder:
        span += n_bins - remainder
    bin_width = span // n_bins

    edges = []
    for bin_index in range(n_bins + 1):
        edges.append(min_ + bin_index * bin_width)
    return edges
def __init__(self, min_len=get_setting('POLYA_ANNOTATOR_MIN_LEN'),
             max_cont_mismatches=get_setting('POLYA_ANNOTATOR_MISMATCHES')):
    '''It stores the parameters used to look for the tails.

    min_len - minimum number of consecutive As (or Ts) to extend the tail
    max_cont_mismatches - maximum number of consecutive no A (or Ts) to
                          break a tail.

    Both defaults are read from the settings when the method is defined.
    '''
    self._max_cont_mismatches = max_cont_mismatches
    self._min_len = min_len
def classify_chimeras(in_fhand, index_fpath, mate_distance, out_fhand,
                      chimeras_fhand=None, unknown_fhand=None, tempdir=None,
                      threads=None, settings=get_setting('CHIMERAS_SETTINGS')):
    '''It maps the interleaved reads and writes every pair to the file that
    corresponds to its classification.

    Non chimeric pairs are always written to out_fhand. Chimeric and unknown
    pairs are only written when chimeras_fhand or unknown_fhand are given.
    '''
    # NOTE: threads is accepted but it is not used in this function
    sorted_bam_fhand = NamedTemporaryFile(suffix='.bam')
    bwa_process = map_with_bwamem(index_fpath,
                                  interleave_fpath=in_fhand.name,
                                  extra_params=['-a', '-M'])
    map_process_to_sortedbam(bwa_process, sorted_bam_fhand.name,
                             key='queryname', tempdir=tempdir)

    classified_pairs = classify_mapped_reads(sorted_bam_fhand,
                                             settings=settings,
                                             mate_distance=mate_distance)
    for pair, kind in classified_pairs:
        if kind is NON_CHIMERIC:
            write_seqs(pair, out_fhand)
            continue
        if kind is CHIMERA and chimeras_fhand is not None:
            write_seqs(pair, chimeras_fhand)
            continue
        if kind is UNKNOWN and unknown_fhand is not None:
            write_seqs(pair, unknown_fhand)
def classify_mapped_reads(bam_fhand, mate_distance,
                          settings=get_setting('CHIMERAS_SETTINGS')):
    '''It yields (pair, kind) for every group of mates found in the bam.

    kind is NON_CHIMERIC, CHIMERA or UNKNOWN. The accepted mate distance is
    mate_distance plus/minus the MATE_DISTANCE_VARIATION setting. Pairs in
    which any read could not be converted (None) are not yielded.
    '''
    bamfile = AlignmentFile(bam_fhand.name)

    max_clipping = settings['MAX_CLIPPING']
    max_pe_len = settings['MAX_PE_LEN']
    variation = settings['MATE_DISTANCE_VARIATION']
    shortest, longest = mate_distance - variation, mate_distance + variation
    mate_length_range = [shortest, longest]
    reference_lengths = _get_ref_lengths(bamfile)

    for grouped_mates in _group_alignments_reads_by_qname(bamfile):
        mates_alignments = _split_mates(grouped_mates)
        if _mates_are_not_chimeric(mates_alignments, max_clipping,
                                   mate_length_range, bamfile,
                                   reference_lengths):
            kind = NON_CHIMERIC
        elif _mates_are_chimeric(mates_alignments, bamfile, max_clipping,
                                 max_pe_len, reference_lengths):
            kind = CHIMERA
        else:
            kind = UNKNOWN

        # only the primary alignment of every mate is reported
        pair = []
        for mates in mates_alignments:
            primary = _get_primary_alignment(mates)
            pair.append(alignedread_to_seqitem(primary))
        if None in pair:
            continue
        yield pair, kind
def _guess_fastq_version(fhand, force_file_as_non_seek):
    '''It guesses the format of fastq files.

    It ignores the solexa fastq version.

    It returns 'fastq' when sanger quality characters were found and
    'fastq-illumina' otherwise. When the version could not be decided from
    the quality characters and any of the analyzed reads is longer than the
    LONGEST_EXPECTED_ILLUMINA_READ setting it raises
    UndecidedFastqVersionError.
    '''
    lengths, is_sanger, _ = _get_some_qual_and_lengths(fhand,
                                                       force_file_as_non_seek)
    if is_sanger:
        return 'fastq'
    if is_sanger is False:
        return 'fastq-illumina'

    longest_expected_illumina = get_setting('LONGEST_EXPECTED_ILLUMINA_READ')
    if not any(length > longest_expected_illumina for length in lengths):
        return 'fastq-illumina'

    if hasattr(fhand, 'name'):
        source = 'the file ' + fhand.name
    else:
        source = 'a file'
    # The message is appended piece by piece (the second piece used to
    # overwrite the first one) and only the last piece is %-formatted, so a
    # '%' in the file name can not break the formatting.
    msg = 'It was not possible to guess the format of ' + source
    msg += '.\nThe quality values could be Illumina, but there are '
    msg += 'sequences longer than %i bp.' % longest_expected_illumina
    raise UndecidedFastqVersionError(msg)
def test_many_reads(self):
    'It checks that blast finds everything when lots of reads are split'
    linker = TITANIUM_LINKER

    def create_seq(index):
        'It builds a wrapped random read with the linker in the middle'
        left = ''.join(choice('ACTG') for _ in range(100))
        right = ''.join(choice('ACTG') for _ in range(100))
        record = SeqRecord(id='seq_' + str(index),
                           seq=Seq(left + linker + right))
        return SeqWrapper(SEQRECORD, record, None)

    # We want to test that blast reports all reads
    packet_size = get_setting('PACKET_SIZE')
    default_blast_max_target_size = 500
    assert packet_size > default_blast_max_target_size

    seqs = [create_seq(i) for i in range(1000)]
    splitter = MatePairSplitter()
    for index, seq in enumerate(splitter(seqs)):
        # every input read should be split into two consecutive reads
        seq_index, pair_offset = divmod(index, 2)
        expected_id = 'seq_' + str(seq_index) + '\\' + str(pair_offset + 1)
        assert get_name(seq) == expected_id
def __init__(self, linkers=None):
    '''The initiator.

    linkers - the linker sequences. If not given the LINKERS setting is
              used. They are stored as fasta seq items named after their
              position in the list.
    '''
    if linkers is None:
        linkers = get_setting('LINKERS')

    linker_items = []
    for index, linker in enumerate(linkers):
        fasta = '>%d\n%s\n' % (index, linker)
        linker_items.append(SeqItem(str(index), fasta))

    self.linkers = list(assing_kind_to_seqs(SEQITEM, linker_items, 'fasta'))
def __init__(self, threshold=get_setting('DEFATULT_DUST_THRESHOLD'),
             reverse=False, failed_drags_pair=True):
    '''The initiator.

    It stores the dust threshold and hands the reverse and
    failed_drags_pair options over to the parent class.
    '''
    self._threshold = threshold
    parent_options = {'reverse': reverse,
                      'failed_drags_pair': failed_drags_pair}
    super(FilterDustComplexity, self).__init__(**parent_options)
def _guess_fastq_version(fhand, force_file_as_non_seek):
    """It guesses the format of fastq files.

    It ignores the solexa fastq version.

    It returns "fastq" when sanger quality characters were found and
    "fastq-illumina" otherwise. When the version could not be decided from
    the quality characters and any of the analyzed reads is longer than the
    LONGEST_EXPECTED_ILLUMINA_READ setting it raises
    UndecidedFastqVersionError.
    """
    lengths, is_sanger, _ = _get_some_qual_and_lengths(fhand,
                                                       force_file_as_non_seek)
    if is_sanger:
        return "fastq"
    if is_sanger is False:
        return "fastq-illumina"

    longest_expected_illumina = get_setting("LONGEST_EXPECTED_ILLUMINA_READ")
    if not any(length > longest_expected_illumina for length in lengths):
        return "fastq-illumina"

    if hasattr(fhand, "name"):
        source = "the file " + fhand.name
    else:
        source = "a file"
    # The message is appended piece by piece (the second piece used to
    # overwrite the first one) and only the last piece is %-formatted, so a
    # "%" in the file name can not break the formatting.
    msg = "It was not possible to guess the format of " + source
    msg += ".\nThe quality values could be Illumina, but there are "
    msg += "sequences longer than %i bp." % longest_expected_illumina
    raise UndecidedFastqVersionError(msg)
def _realign_bam(bam_fpath, reference_fpath, out_bam_fpath, threads=False):
    '''It realigns the bam using GATK Local realignment around indels.

    It first makes sure that the reference indexes, the picard dict and the
    bam index are available, then it runs RealignerTargetCreator to get the
    intervals and finally IndelRealigner to write out_bam_fpath.
    '''
    # everything GATK requires before running
    _create_sam_reference_index(reference_fpath)
    _create_picard_dict(reference_fpath)
    index_bam(bam_fpath)

    gatk_jar = get_setting('GATK_JAR')
    gatk_cmd = ['java', '-jar', gatk_jar]
    intervals_fhand = NamedTemporaryFile(suffix='.intervals')
    stderr = NamedTemporaryFile(suffix='picard.stderr')
    stdout = NamedTemporaryFile(suffix='picard.stdout')

    # the intervals to realign
    target_cmd = gatk_cmd + ['-T', 'RealignerTargetCreator',
                             '-I', bam_fpath, '-R', reference_fpath,
                             '-o', intervals_fhand.name]
    check_call(target_cmd, stderr=stderr, stdout=stdout)

    # the realignment itself
    realign_cmd = gatk_cmd + ['-I', bam_fpath, '-R', reference_fpath,
                              '-T', 'IndelRealigner',
                              '-targetIntervals', intervals_fhand.name,
                              '-o', out_bam_fpath]
    if threads and threads > 1:
        realign_cmd += ['-nt', str(get_num_threads(threads))]
    check_call(realign_cmd, stderr=stderr, stdout=stdout)
    intervals_fhand.close()
def _realign_bam(bam_fpath, reference_fpath, out_bam_fpath, threads=False):
    '''It realigns the bam using GATK Local realignment around indels.

    It first makes sure that the reference indexes, the picard dict and the
    bam index are available, then it runs RealignerTargetCreator to get the
    intervals and finally IndelRealigner to write out_bam_fpath.
    '''
    # everything GATK requires before running
    _create_sam_reference_index(reference_fpath)
    _create_picard_dict(reference_fpath)
    index_bam(bam_fpath)

    gatk_jar = get_setting('GATK_JAR')
    gatk_cmd = ['java', '-jar', gatk_jar]
    intervals_fhand = NamedTemporaryFile(suffix='.intervals')
    stderr = NamedTemporaryFile(suffix='picard.stderr')
    stdout = NamedTemporaryFile(suffix='picard.stdout')

    # the intervals to realign
    target_cmd = gatk_cmd + ['-T', 'RealignerTargetCreator',
                             '-I', bam_fpath, '-R', reference_fpath,
                             '-o', intervals_fhand.name]
    check_call(target_cmd, stderr=stderr, stdout=stdout)

    # the realignment itself
    realign_cmd = gatk_cmd + ['-I', bam_fpath, '-R', reference_fpath,
                              '-T', 'IndelRealigner',
                              '-targetIntervals', intervals_fhand.name,
                              '-o', out_bam_fpath]
    if threads and threads > 1:
        realign_cmd += ['-nt', str(get_num_threads(threads))]
    check_call(realign_cmd, stderr=stderr, stdout=stdout)
    intervals_fhand.close()
def classify_mapped_reads(bam_fhand, mate_distance,
                          settings=get_setting('CHIMERAS_SETTINGS')):
    '''It yields (pair, kind) for every group of mates found in the bam.

    kind is NON_CHIMERIC, CHIMERA or UNKNOWN. The accepted mate distance is
    mate_distance plus/minus the MATE_DISTANCE_VARIATION setting. Pairs in
    which any read could not be converted (None) are not yielded.
    '''
    bamfile = Samfile(bam_fhand.name)

    max_clipping = settings['MAX_CLIPPING']
    max_pe_len = settings['MAX_PE_LEN']
    variation = settings['MATE_DISTANCE_VARIATION']
    shortest, longest = mate_distance - variation, mate_distance + variation
    mate_length_range = [shortest, longest]
    reference_lengths = _get_ref_lengths(bamfile)

    for grouped_mates in _group_alignments_reads_by_qname(bamfile):
        mates_alignments = _split_mates(grouped_mates)
        if _mates_are_not_chimeric(mates_alignments, max_clipping,
                                   mate_length_range, bamfile,
                                   reference_lengths):
            kind = NON_CHIMERIC
        elif _mates_are_chimeric(mates_alignments, bamfile, max_clipping,
                                 max_pe_len, reference_lengths):
            kind = CHIMERA
        else:
            kind = UNKNOWN

        # only the primary alignment of every mate is reported
        pair = []
        for mates in mates_alignments:
            primary = _get_primary_alignment(mates)
            pair.append(alignedread_to_seqitem(primary))
        if None in pair:
            continue
        yield pair, kind
def classify_mapped_reads_new(bamfile,
                              settings=get_setting('CHIMERAS_SETTINGS'),
                              file_format='fastq',
                              mate_length_range=[2000, 4000],
                              out_format=SEQITEM):
    '''It yields [pair, kind] for every group of mates found in the bam.

    kind is NON_CHIMERIC, CHIMERA or UNKNOWN.
    out_format - SEQITEM to get the first alignment of every mate converted
                 with alignedread_to_seqitem, or 'aligned_read' to get the
                 alignments as they are.

    It raises a ValueError if out_format is not one of those two values.
    '''
    # mate_length_range has a list as default, but it is never modified here,
    # it is just handed over to _mates_are_not_chimeric
    if out_format != SEQITEM and out_format != 'aligned_read':
        # without this check an unknown format ended up in an
        # UnboundLocalError when the first pair was about to be yielded
        msg = 'Unknown out_format: {}'.format(out_format)
        raise ValueError(msg)

    max_coincidences = settings['MAX_COINCIDENCES']
    max_mapq_difference = settings['MAX_MAPQ_DIFFERENCE']
    limit = settings['MAX_DISTANCE_TO_END']
    max_clipping = settings['MAX_CLIPPING']
    max_pe_len = settings['MAX_PE_LEN']

    # It tries to find out the kind of each pair of sequences
    for grouped_mates in _group_alignments_by_reads(bamfile):
        mates_alignments = _split_mates(grouped_mates)
        if _mates_are_not_chimeric(mates_alignments, max_clipping,
                                   mate_length_range, bamfile,
                                   max_coincidences, max_mapq_difference,
                                   limit):
            kind = NON_CHIMERIC
        elif _mates_are_chimeric(mates_alignments, bamfile, max_clipping,
                                 max_pe_len):
            kind = CHIMERA
        else:
            kind = UNKNOWN

        if out_format == SEQITEM:
            pair = [alignedread_to_seqitem(read[0], file_format)
                    for read in mates_alignments]
        else:
            pair = mates_alignments
        yield [pair, kind]
def _guess_fastq_version(fhand, force_file_as_non_seek):
    '''It guesses the format of fastq files.

    It ignores the solexa fastq version.

    It returns 'fastq' when sanger quality characters were found and
    'fastq-illumina' otherwise. When the version could not be decided from
    the quality characters and any of the analyzed reads is longer than the
    LONGEST_EXPECTED_ILLUMINA_READ setting it raises
    UndecidedFastqVersionError.
    '''
    lengths, is_sanger, _ = _get_some_qual_and_lengths(
        fhand, force_file_as_non_seek)
    if is_sanger:
        return 'fastq'
    if is_sanger is False:
        return 'fastq-illumina'

    longest_expected_illumina = get_setting('LONGEST_EXPECTED_ILLUMINA_READ')
    if not any(length > longest_expected_illumina for length in lengths):
        return 'fastq-illumina'

    if hasattr(fhand, 'name'):
        source = 'the file ' + fhand.name
    else:
        source = 'a file'
    # The message is appended piece by piece (the second piece used to
    # overwrite the first one) and only the last piece is %-formatted, so a
    # '%' in the file name can not break the formatting.
    msg = 'It was not possible to guess the format of ' + source
    msg += '.\nThe quality values could be Illumina, but there are '
    msg += 'sequences longer than %i bp.' % longest_expected_illumina
    raise UndecidedFastqVersionError(msg)
def _get_binary_path(binary_name):
    '''It returns the path to the proper binary.

    The binary is looked for, in this order:
      - with the external prefix (and then in the current directory) when
        the USE_EXTERNAL_BIN_PREFIX setting is on,
      - in the third_party directory for this platform and architecture,
        falling back to the 32bit one in the 64bit machines,
      - in the system path.

    It raises MissingBinaryError if the binary is nowhere to be found.
    '''
    if get_setting('USE_EXTERNAL_BIN_PREFIX'):
        ext_binary_name = get_setting('EXTERNAL_BIN_PREFIX') + binary_name
        if os.path.exists(ext_binary_name):
            return ext_binary_name
        # NOTE(review): the current directory lookup needs ext_binary_name,
        # so it is kept inside this branch. Outside of it the name would not
        # be defined when the external prefix is off.
        if not get_setting('ADD_PATH_TO_EXT_BIN'):
            # I have to check if the binary is on my current directory.
            # If it is there use it, else assumes that it is on the path
            cwd_binary_path = os.path.join(os.getcwd(), ext_binary_name)
            if os.path.exists(cwd_binary_path):
                return cwd_binary_path

    system = platform.system().lower()
    if system == 'windows':
        binary_name += '.exe'
    arch = platform.architecture()[0]

    third_party_path = os.path.join(module_path, '..', 'third_party', 'bin')
    third_party_path = os.path.abspath(third_party_path)

    # a 32bit binary is good enough for a 64bit machine
    archs_to_try = [arch]
    if arch == '64bit':
        archs_to_try.append('32bit')
    for arch_to_try in archs_to_try:
        binary_path = os.path.abspath(os.path.join(third_party_path, system,
                                                   arch_to_try, binary_name))
        if os.path.exists(binary_path):
            return binary_path

    # At this point there is not available binary for the working platform
    # Is the binary really in the path?
    if which(binary_name):
        return binary_name

    msg = '{} not found in the path. Please install it to use seq_crumbs'
    raise MissingBinaryError(msg.format(binary_name))
def _get_binary_path(binary_name):
    '''It returns the path to the proper binary.

    The binary is looked for, in this order:
      - with the external prefix (and then in the current directory) when
        the USE_EXTERNAL_BIN_PREFIX setting is on,
      - in the third_party directory for this platform and architecture,
        falling back to the 32bit one in the 64bit machines,
      - in the system path.

    It raises MissingBinaryError if the binary is nowhere to be found.
    '''
    if get_setting('USE_EXTERNAL_BIN_PREFIX'):
        ext_binary_name = get_setting('EXTERNAL_BIN_PREFIX') + binary_name
        if os.path.exists(ext_binary_name):
            return ext_binary_name
        # NOTE(review): the current directory lookup needs ext_binary_name,
        # so it is kept inside this branch. Outside of it the name would not
        # be defined when the external prefix is off.
        if not get_setting('ADD_PATH_TO_EXT_BIN'):
            # I have to check if the binary is on my current directory.
            # If it is there use it, else assumes that it is on the path
            cwd_binary_path = os.path.join(os.getcwd(), ext_binary_name)
            if os.path.exists(cwd_binary_path):
                return cwd_binary_path

    system = platform.system().lower()
    if system == 'windows':
        binary_name += '.exe'
    arch = platform.architecture()[0]

    third_party_path = os.path.join(module_path, '..', 'third_party', 'bin')
    third_party_path = os.path.abspath(third_party_path)

    # a 32bit binary is good enough for a 64bit machine
    archs_to_try = [arch]
    if arch == '64bit':
        archs_to_try.append('32bit')
    for arch_to_try in archs_to_try:
        binary_path = os.path.abspath(
            os.path.join(third_party_path, system, arch_to_try, binary_name))
        if os.path.exists(binary_path):
            return binary_path

    # At this point there is not available binary for the working platform
    # Is the binary really in the path?
    if which(binary_name):
        return binary_name

    msg = '{} not found in the path. Please install it to use seq_crumbs'
    raise MissingBinaryError(msg.format(binary_name))
def draw_histogram_ascii(bin_limits, counts):
    '''It draws an ASCII histogram.

    bin_limits - the bin edges, there should be one more than counts.
    counts - the number of items found in every bin.

    It returns a string with one line per bin: the bin range, the count and
    a bar of asterisks. The bars are scaled so the line of the tallest bin
    is MAX_WIDTH_ASCII_PLOT (setting) characters wide. An histogram without
    bins returns an empty string and an histogram in which every count is
    zero is drawn without bars.
    '''
    fill_char = '*'

    assert len(bin_limits) == len(counts) + 1
    if len(counts) == 0:
        # max() would fail with nothing to draw
        return ''

    def number_to_str(number):
        'The not padded number, used to calculate the column widths'
        if isinstance(number, int):
            return '{:d}'.format(number)
        return '{:.2f}'.format(number)

    # the widest bin limit and the widest count fix the column widths
    max_ndigits = max(len(number_to_str(limit)) for limit in bin_limits)
    max_count_ndigits = max(len(number_to_str(cnt)) for cnt in counts)

    limit_fmt_int = '{:>' + str(max_ndigits) + 'd}'
    limit_fmt_float = '{:>' + str(max_ndigits) + '.5f}'
    count_fmt = '{:>' + str(max_count_ndigits) + 'd}'

    def limit_to_padded_str(limit):
        'The bin limit right aligned in its column'
        if isinstance(limit, int):
            return limit_fmt_int.format(limit)
        return limit_fmt_float.format(limit)

    headers = []
    bins = zip(bin_limits[:-1], bin_limits[1:], counts)
    for bin_start, bin_end, cnt in bins:
        header = '[' + limit_to_padded_str(bin_start)
        header += ' , ' + limit_to_padded_str(bin_end)
        header += '[ (' + count_fmt.format(cnt) + '): '
        headers.append(header)

    max_count = max(counts)
    max_header_len = max(len(header) for header in headers)
    max_hist_width = get_setting('MAX_WIDTH_ASCII_PLOT') - max_header_len
    # with no items counted there is nothing to scale (and the division
    # would fail)
    counts_ratio = max_hist_width / max_count if max_count else 0

    lines = []
    for header, cnt in zip(headers, counts):
        lines.append(header + fill_char * int(cnt * counts_ratio) + '\n')
    return ''.join(lines)
def __init__(self, index_fpath, max_clipping=None, tempdir=None):
    '''The initiator.

    index_fpath - path to the index, stored as it is given.
    max_clipping - when it is None the MAX_CLIPPING value found in the
                   CHIMERAS_SETTINGS setting is used.
    tempdir - stored as it is given.
    '''
    self._tempdir = tempdir
    self._index_fpath = index_fpath
    if max_clipping is None:
        max_clipping = get_setting('CHIMERAS_SETTINGS')['MAX_CLIPPING']
    self.max_clipping = max_clipping
def copy_and_rename_ext_bin(app_dir, bin_dist_dir):
    '''It copies the platform specific external binaries to the binary dist
    directory.

    The name of every copy is the original file name preceded by the
    EXTERNAL_BIN_PREFIX setting.
    '''
    source_dir = get_platform_bin_dir(app_dir)
    for bin_name in os.listdir(source_dir):
        source_path = join(source_dir, bin_name)
        prefixed_name = settings.get_setting('EXTERNAL_BIN_PREFIX') + bin_name
        shutil.copy(source_path, join(bin_dist_dir, prefixed_name))
def match_pairs(seqs, out_fhand, orphan_out_fhand, out_format,
                memory_limit=get_setting('DEFAULT_SEQS_IN_MEM_LIMIT')):
    '''It matches the seq pairs in an iterator and splits the orphan seqs.

    The matched pairs are written to out_fhand and the reads without mate
    to orphan_out_fhand. The reads that are waiting for their mate are kept
    in two buffers, one per direction, indexed by the read name.

    It raises MaxNumReadsInMem when memory_limit is not None and the number
    of reads waiting in the buffers reaches it.
    '''
    buf_fwd = {'index': {}, 'items': []}
    buf_rev = {'index': {}, 'items': []}
    # mates_buf holds the reads of the other direction and own_buf the reads
    # with the direction of the current read
    mates_buf, own_buf = buf_fwd, buf_rev  # for the all orphan case
    for seq in seqs:
        try:
            seq_name, direction = _parse_pair_direction_and_name(seq)
        except PairDirectionError:
            write_seqs([seq], orphan_out_fhand, out_format)
            continue

        if direction == FWD:
            mates_buf, own_buf = buf_rev, buf_fwd
        else:
            mates_buf, own_buf = buf_fwd, buf_rev

        matching_seq_index = mates_buf['index'].get(seq_name)

        if matching_seq_index is None:
            # the mate has not been seen yet, so the read has to wait
            own_buf['items'].append(seq)
            own_buf['index'][seq_name] = len(own_buf['items']) - 1

            # check mem limit, without building a list just to count
            sum_items = len(mates_buf['items']) + len(own_buf['items'])
            if memory_limit is not None and sum_items >= memory_limit:
                error_msg = 'There are too many consecutive non matching seqs'
                error_msg += ' in your input. We have reached the memory limit'
                raise MaxNumReadsInMem(error_msg)
            continue

        # everything that was waiting before the mate is an orphan
        orphan_seqs = mates_buf['items'][:matching_seq_index]
        matching_seq = mates_buf['items'][matching_seq_index]
        write_seqs(orphan_seqs, orphan_out_fhand, out_format)
        write_seqs([matching_seq, seq], out_fhand, out_format)

        # The reads found after the mate keep waiting. Their index has to
        # remain keyed by the read name (it used to be rebuilt keyed by the
        # seq objects, so those reads could never be matched), only the
        # positions are shifted.
        n_removed = matching_seq_index + 1
        mates_buf['items'] = mates_buf['items'][n_removed:]
        mates_buf['index'] = {name: index - n_removed
                              for name, index in mates_buf['index'].items()
                              if index >= n_removed}

        # the reads waiting in the other buffer are orphans too
        write_seqs(own_buf['items'], orphan_out_fhand, out_format)
        own_buf['items'] = []
        own_buf['index'] = {}

    # whatever is still waiting has no mate
    orphan_seqs = mates_buf['items'] + own_buf['items']
    write_seqs(orphan_seqs, orphan_out_fhand, out_format)
    orphan_out_fhand.flush()
    flush_fhand(out_fhand)
def copy_and_rename_ext_bin(app_dir, bin_dist_dir):
    '''It copies the platform specific external binaries to the binary dist
    directory.

    The name of every copy is the original file name preceded by the
    EXTERNAL_BIN_PREFIX setting.
    '''
    source_dir = get_platform_bin_dir(app_dir)
    for bin_name in os.listdir(source_dir):
        source_path = join(source_dir, bin_name)
        prefixed_name = settings.get_setting('EXTERNAL_BIN_PREFIX') + bin_name
        shutil.copy(source_path, join(bin_dist_dir, prefixed_name))
def read_seq_packets(fhands, size=get_setting('PACKET_SIZE'), out_format=None,
                     file_format=GUESS_FORMAT, prefered_seq_classes=None):
    '''It reads the seqs with read_seqs and returns them grouped in packets
    of the given size.

    The default size is the PACKET_SIZE setting.
    '''
    reader_options = {'out_format': out_format,
                      'prefered_seq_classes': prefered_seq_classes}
    seqs = read_seqs(fhands, file_format, **reader_options)
    packets = group_in_packets(seqs, size)
    return packets
def _create_picard_dict(fpath):
    '''It creates the picard dict for the given file if it does not exist.

    The dict is written next to the file, replacing its extension by .dict,
    using the CreateSequenceDictionary tool found in the PICARD_JAR setting.
    '''
    dict_path = os.path.splitext(fpath)[0] + '.dict'
    if not os.path.exists(dict_path):
        picard_jar = get_setting("PICARD_JAR")
        cmd = ['java', '-jar', picard_jar, 'CreateSequenceDictionary']
        cmd.append('R=%s' % fpath)
        cmd.append('O=%s' % dict_path)
        stderr = NamedTemporaryFile(suffix='picard.stderr')
        check_call(cmd, stderr=stderr)
def map_with_bwasw(index_fpath, bam_fpath, unpaired_fpath=None,
                   paired_fpaths=None, readgroup=None, threads=None,
                   log_fpath=None, extra_params=None):
    '''It maps with the bwa sw algorithm and writes the result in a bam.

    Either unpaired_fpath or paired_fpaths has to be given, but not both.
    When a readgroup dict (ID, LB, PL, SM and PU keys) is given the bam is
    written by picard AddOrReplaceReadGroups, otherwise by samtools view.
    The stderr of the processes goes to log_fpath, or to a temporary file
    when log_fpath is None.

    It raises a RuntimeError if the input files are not properly given or
    if any of the processes finishes with a non zero return code.
    '''
    if paired_fpaths is None and unpaired_fpath is None:
        raise RuntimeError('At least one file to map is required')
    elif paired_fpaths is not None and unpaired_fpath is not None:
        msg = 'Bwa can not map paired and unpaired reads together'
        raise RuntimeError(msg)

    if readgroup is None:
        readgroup = {}
    if extra_params is None:
        extra_params = []

    binary = get_binary_path('bwa')
    cmd = [binary, 'bwasw', '-t', str(get_num_threads(threads)), index_fpath]
    cmd.extend(extra_params)
    if paired_fpaths is not None:
        cmd.extend(paired_fpaths)
    if unpaired_fpath is not None:
        cmd.append(unpaired_fpath)

    if log_fpath is None:
        stderr = NamedTemporaryFile(suffix='.stderr')
    else:
        stderr = open(log_fpath, 'w')
    bwa = popen(cmd, stderr=stderr, stdout=PIPE)

    if readgroup:
        # add readgroup using picard, the setting is only needed here
        picard_tools = get_setting("PICARD_TOOLS_DIR")
        cmd = ['java', '-jar',
               os.path.join(picard_tools, 'AddOrReplaceReadGroups.jar'),
               'INPUT=/dev/stdin', 'OUTPUT={0}'.format(bam_fpath),
               'RGID={0}'.format(readgroup['ID']),
               'RGLB={0}'.format(readgroup['LB']),
               'RGPL={0}'.format(readgroup['PL']),
               'RGSM={0}'.format(readgroup['SM']),
               'RGPU={0}'.format(readgroup['PU']),
               'VALIDATION_STRINGENCY=LENIENT']
    else:
        cmd = [get_binary_path('samtools'), 'view', '-h', '-b', '-S', '-',
               '-o', bam_fpath]

    bam_writer = popen(cmd, stdin=bwa.stdout, stderr=stderr)
    # Allow bwa to receive a SIGPIPE if the bam writer exits.
    bwa.stdout.close()
    bam_writer.communicate()
    # The return code of bwa is not available until the process is waited
    # for. Without this its failures went unnoticed.
    # NOTE(review): assumes popen returns a subprocess.Popen like object
    bwa.wait()

    if bwa.returncode or bam_writer.returncode:
        with open(stderr.name) as log_fhand:
            raise RuntimeError(log_fhand.read())
def ascii_plot(self):
    '''It plots columns with the nucleotide frequencies.

    It returns a string with one line for every location between the lowest
    and the highest one found in self.counts. Every line has a header with
    the location and the frequency of each nucleotide followed by a bar in
    which every nucleotide gets a number of characters proportional to its
    frequency. The whole line is MAX_WIDTH_ASCII_PLOT (setting) characters
    wide. Without counts an empty string is returned.
    '''
    nucls = ('A', 'C', 'G', 'T', 'N')
    plot_nucls = ('a', 'C', 'g', 'T', 'n')
    counts = self.counts
    locs = counts.keys()
    if not locs:
        return ''
    loc_max = max(locs)
    loc_min = min(locs)
    loc_fmt = '{:>' + str(len(str(loc_max))) + 'd}'

    def _header_for_nucl(loc):
        'It returns the header and the frequencies for the given position'
        count = counts.get(loc, {})
        freqs = [count.get(nucl, 0) for nucl in nucls]
        tot_bases = sum(freqs)
        freqs = [freq / tot_bases for freq in freqs]
        freq_strs = ['{}: {:.2f}'.format(nucl, freq)
                     for nucl, freq in zip(nucls, freqs)]
        header = loc_fmt.format(loc) + ' (' + ', '.join(freq_strs) + ') | '
        return header, freqs

    header_len = len(_header_for_nucl(0)[0])
    plot_width = get_setting('MAX_WIDTH_ASCII_PLOT') - header_len
    # NOTE(review): assumes that the true division is active in this module,
    # with the integer division val_per_pixel would be 0
    val_per_pixel = 1 / plot_width

    lines = []
    for loc in range(loc_min, loc_max + 1):
        header, freqs = _header_for_nucl(loc)
        assert approx_equal(sum(freqs), 1)

        remainder_freqs = [float(re.sub(r'\d\.', '0.', str(freq)))
                           for freq in freqs]
        round_freqs = [int(round(freq / val_per_pixel)) for freq in freqs]

        # the rounding can leave the bar some pixels short or long
        pixels_remaining = plot_width - sum(round_freqs)
        if pixels_remaining > 0:
            add_to_freq = remainder_freqs.index(max(remainder_freqs))
            round_freqs[add_to_freq] += pixels_remaining
        elif pixels_remaining < 0:
            # The bar is too long, so the pixels have to be removed. (They
            # used to be added because a negative number was subtracted.)
            # They are taken only from a column that has enough of them.
            excess = -pixels_remaining
            shrinkable = [index for index, pixels in enumerate(round_freqs)
                          if pixels >= excess]
            if not shrinkable:
                shrinkable = range(len(round_freqs))
            remove_from_freq = min(shrinkable,
                                   key=remainder_freqs.__getitem__)
            round_freqs[remove_from_freq] -= excess
        assert approx_equal(sum(round_freqs), plot_width)

        bar = ''.join([nucl * pixels
                       for pixels, nucl in zip(round_freqs, plot_nucls)])
        lines.append(header + bar + '\n')
    return ''.join(lines)
def __init__(self, seqs_fpath, seqs, program, params=None, filters=None,
             elongate_for_global=False, seqs_type=None):
    """It inits the class and looks for the blast matches.

    max_target_seqs is always set in params to the PACKET_SIZE setting. The
    matches are stored in self._match_parts.
    """
    self.program = program

    params = {} if params is None else params
    params["max_target_seqs"] = str(get_setting("PACKET_SIZE"))
    self.params = params

    self.filters = [] if filters is None else filters
    self.elongate_for_global = elongate_for_global

    matches = self._look_for_blast_matches(seqs_fpath, seqs, seqs_type)
    self._match_parts = matches
def calculate_dust_score(seq):
    '''It returns the dust score.

    From: "A Fast and Symmetric DUST Implementation to Mask Low-Complexity
    DNA Sequences" doi:10.1089/cmb.2006.13.1028

    and re-implemented from PRINSEQ

    The score is the mean of the scores of the windows (DUST_WINDOWSIZE and
    DUST_WINDOWSTEP settings) and of the sequence that remains after them,
    scaled to have a maximum of 100. It returns 0 for the sequences with
    3 residues and None for any other sequence with 5 or less residues.
    '''
    seq = get_str_seq(seq)
    length = len(seq)
    if length == 3:
        return 0
    if length <= 5:
        return None

    windowsize = get_setting('DUST_WINDOWSIZE')
    windowstep = get_setting('DUST_WINDOWSTEP')

    dustscores = []
    if length > windowsize:
        windows = 0
        for seq_in_win in rolling_window(seq, windowsize, windowstep):
            score = _calculate_rawscore(seq_in_win)
            dustscores.append(score / (windowsize - 2))
            windows += 1
        remaining_seq = seq[windows * windowstep:]
    else:
        remaining_seq = seq

    # It is the length what has to be compared with 5. The str itself was
    # compared: always True in python 2 and a TypeError in python 3.
    remaining_len = len(remaining_seq)
    if remaining_len > 5:
        score = _calculate_rawscore(remaining_seq)
        dustscore = (score / (remaining_len - 3) * (windowsize - 2) /
                     (remaining_len - 2))
        dustscores.append(dustscore)

    if not dustscores:
        # nothing could be scored, so there is no mean to calculate
        return None
    # max score should be 100 not 31
    dustscore = sum(dustscores) / len(dustscores) * 100 / 31
    return dustscore
def _create_picard_dict(fpath):
    '''It creates the picard dict for the given file if it does not exist.

    The dict is written next to the file, replacing its extension by .dict,
    using the CreateSequenceDictionary tool found in the PICARD_JAR setting.
    '''
    dict_path = os.path.splitext(fpath)[0] + '.dict'
    if not os.path.exists(dict_path):
        picard_jar = get_setting("PICARD_JAR")
        cmd = ['java', '-jar', picard_jar, 'CreateSequenceDictionary']
        cmd.append('R=%s' % fpath)
        cmd.append('O=%s' % dict_path)
        stderr = NamedTemporaryFile(suffix='picard.stderr')
        check_call(cmd, stderr=stderr)
def ascii_plot(self):
    '''It plots columns with the nucleotide frequencies.

    It returns a string with one line for every location between the lowest
    and the highest one found in self.counts. Every line has a header with
    the location and the frequency of each nucleotide followed by a bar in
    which every nucleotide gets a number of characters proportional to its
    frequency. The whole line is MAX_WIDTH_ASCII_PLOT (setting) characters
    wide. Without counts an empty string is returned.
    '''
    nucls = ('A', 'C', 'G', 'T', 'N')
    plot_nucls = ('a', 'C', 'g', 'T', 'n')
    counts = self.counts
    locs = counts.keys()
    if not locs:
        return ''
    loc_max = max(locs)
    loc_min = min(locs)
    loc_fmt = '{:>' + str(len(str(loc_max))) + 'd}'

    def _header_for_nucl(loc):
        'It returns the header and the frequencies for the given position'
        count = counts.get(loc, {})
        freqs = [count.get(nucl, 0) for nucl in nucls]
        tot_bases = sum(freqs)
        freqs = [freq / tot_bases for freq in freqs]
        freq_strs = ['{}: {:.2f}'.format(nucl, freq)
                     for nucl, freq in zip(nucls, freqs)]
        header = loc_fmt.format(loc) + ' (' + ', '.join(freq_strs) + ') | '
        return header, freqs

    header_len = len(_header_for_nucl(0)[0])
    plot_width = get_setting('MAX_WIDTH_ASCII_PLOT') - header_len
    # NOTE(review): assumes that the true division is active in this module,
    # with the integer division val_per_pixel would be 0
    val_per_pixel = 1 / plot_width

    lines = []
    for loc in range(loc_min, loc_max + 1):
        header, freqs = _header_for_nucl(loc)
        assert approx_equal(sum(freqs), 1)

        remainder_freqs = [float(re.sub(r'\d\.', '0.', str(freq)))
                           for freq in freqs]
        round_freqs = [int(round(freq / val_per_pixel)) for freq in freqs]

        # the rounding can leave the bar some pixels short or long
        pixels_remaining = plot_width - sum(round_freqs)
        if pixels_remaining > 0:
            add_to_freq = remainder_freqs.index(max(remainder_freqs))
            round_freqs[add_to_freq] += pixels_remaining
        elif pixels_remaining < 0:
            # The bar is too long, so the pixels have to be removed. (They
            # used to be added because a negative number was subtracted.)
            # They are taken only from a column that has enough of them.
            excess = -pixels_remaining
            shrinkable = [index for index, pixels in enumerate(round_freqs)
                          if pixels >= excess]
            if not shrinkable:
                shrinkable = range(len(round_freqs))
            remove_from_freq = min(shrinkable,
                                   key=remainder_freqs.__getitem__)
            round_freqs[remove_from_freq] -= excess
        assert approx_equal(sum(round_freqs), plot_width)

        bar = ''.join([nucl * pixels
                       for pixels, nucl in zip(round_freqs, plot_nucls)])
        lines.append(header + bar + '\n')
    return ''.join(lines)
def merge_sams(in_fpaths, out_fpath):
    '''It merges the given sam files into out_fpath with picard MergeSamFiles.

    If picard fails the error is not raised, its stderr and stdout are
    written to the stderr and stdout of this process.
    '''
    picard_jar = get_setting("PICARD_JAR")
    cmd = ['java', '-jar', picard_jar, 'MergeSamFiles']
    cmd.append('O={}'.format(out_fpath))
    cmd.extend(['I={}'.format(in_fpath) for in_fpath in in_fpaths])

    stderr = NamedTemporaryFile(suffix='picard.stderr')
    stdout = NamedTemporaryFile(suffix='picard.stdout')
    try:
        check_call(cmd, stderr=stderr, stdout=stdout)
    except CalledProcessError:
        picard_error = open(stderr.name).read()
        sys.stderr.write(picard_error)
        picard_output = open(stdout.name).read()
        sys.stdout.write(picard_output)
def merge_sams(in_fpaths, out_fpath):
    '''It merges the given sam files into out_fpath with picard MergeSamFiles.

    If picard fails the error is not raised, its stderr and stdout are
    written to the stderr and stdout of this process.
    '''
    picard_jar = get_setting("PICARD_JAR")
    cmd = ['java', '-jar', picard_jar, 'MergeSamFiles']
    cmd.append('O={}'.format(out_fpath))
    cmd.extend(['I={}'.format(in_fpath) for in_fpath in in_fpaths])

    stderr = NamedTemporaryFile(suffix='picard.stderr')
    stdout = NamedTemporaryFile(suffix='picard.stdout')
    try:
        check_call(cmd, stderr=stderr, stdout=stdout)
    except CalledProcessError:
        picard_error = open(stderr.name).read()
        sys.stderr.write(picard_error)
        picard_output = open(stdout.name).read()
        sys.stdout.write(picard_output)
def sort_mapped_reads(map_process, out_fpath, key='coordinate',
                      tempdir='/tmp'):
    '''It sorts the output of the mapping process with picard SortSam.

    map_process - the running mapper, its stdout is the input of picard.
    out_fpath - path of the sorted output file.
    key - the picard sort order (SO).
    tempdir - directory used by picard for its temporary files.

    It raises a RuntimeError with the picard stderr if picard finishes with
    a non zero return code.
    '''
    picard_tools = get_setting("PICARD_TOOLS_DIR")
    fpath = os.path.join(picard_tools, 'SortSam.jar')
    cmd = ['java', '-jar', fpath, 'I=/dev/stdin', 'O=' + out_fpath,
           'SO=' + key, 'TMP_DIR=' + tempdir,
           'VALIDATION_STRINGENCY=LENIENT']
    sort = subprocess.Popen(cmd, stdin=map_process.stdout, stderr=PIPE)
    # communicate reads the stderr while it waits. With wait the process
    # could block forever once the stderr pipe was full.
    stderr_output = sort.communicate()[1]
    if sort.returncode:
        if not isinstance(stderr_output, str):
            # bytes are returned by python 3
            stderr_output = stderr_output.decode('utf-8', 'replace')
        msg = 'Something happened running picard sort:\n'
        msg += stderr_output
        raise RuntimeError(msg)
def map_process_to_sortedbam(map_process, out_fpath, key='coordinate',
                             log_fpath=None, tempdir=None):
    '''It sorts the output of the mapping process with picard SortSam.

    The stderr of picard is written to log_fpath, or to a temporary file
    when it is None. tempdir defaults to the system temporary directory.
    The return code of picard is not checked.
    '''
    if log_fpath is not None:
        stderr = open(log_fpath, 'w')
    else:
        stderr = NamedTemporaryFile(suffix='.stderr')

    if tempdir is None:
        tempdir = tempfile.gettempdir()

    picard_jar = get_setting("PICARD_JAR")
    cmd = ['java', '-jar', picard_jar, 'SortSam']
    cmd += ['I=/dev/stdin', 'O=' + out_fpath, 'SO=' + key,
            'TMP_DIR=' + tempdir, 'VALIDATION_STRINGENCY=LENIENT']

    sort = popen(cmd, stdin=map_process.stdout, stderr=stderr)
    map_process.stdout.close()
    sort.communicate()
def map_process_to_sortedbam(map_process, out_fpath, key='coordinate',
                             log_fpath=None, tempdir=None):
    '''It sorts the output of the mapping process with picard SortSam.

    The stderr of picard is written to log_fpath, or to a temporary file
    when it is None. tempdir defaults to the system temporary directory.
    The return code of picard is not checked.
    '''
    if log_fpath is not None:
        stderr = open(log_fpath, 'w')
    else:
        stderr = NamedTemporaryFile(suffix='.stderr')

    if tempdir is None:
        tempdir = tempfile.gettempdir()

    picard_tools = get_setting("PICARD_TOOLS_DIR")
    sortsam_jar = os.path.join(picard_tools, 'SortSam.jar')
    cmd = ['java', '-jar', sortsam_jar]
    cmd += ['I=/dev/stdin', 'O=' + out_fpath, 'SO=' + key,
            'TMP_DIR=' + tempdir, 'VALIDATION_STRINGENCY=LENIENT']

    sort = popen(cmd, stdin=map_process.stdout, stderr=stderr)
    map_process.stdout.close()
    sort.communicate()
def get_matched_segments_for_read(self, read_name):
    """It returns the matched segments for any oligo.

    The result is a (segments, elongated_match) tuple, or None if the read
    has no match. elongated_match is True when any match part has been
    elongated more than the DEFAULT_IGNORE_ELONGATION_SHORTER setting.
    """
    min_elongation = get_setting("DEFAULT_IGNORE_ELONGATION_SHORTER")

    try:
        match_parts = self._match_parts[read_name]
    except KeyError:
        # There was no match in the blast
        return None

    # every match part is checked before deciding
    elongations = [ELONGATED in part and part[ELONGATED] > min_elongation
                   for part in match_parts]
    elongated_match = any(elongations)

    segments = covered_segments_from_match_parts(match_parts, in_query=False)
    return segments, elongated_match
def _guess_fastq_version(fhand, force_file_as_non_seek):
    '''It guesses the format of fastq files.

    It ignores the solexa fastq version.

    It returns 'fastq' when sanger quality characters were found and
    'fastq-illumina' otherwise. '-multiline' is added to the format when the
    first five lines of the file do not look like one-line fastq records.
    When the version could not be decided from the quality characters and
    any of the analyzed reads is longer than the
    LONGEST_EXPECTED_ILLUMINA_READ setting it raises
    UndecidedFastqVersionError.
    '''
    lengths, is_sanger, chunk = _get_some_qual_and_lengths(
        fhand, force_file_as_non_seek)
    if is_sanger:
        fmt = 'fastq'
    elif is_sanger is False:
        fmt = 'fastq-illumina'
    else:
        fmt = None

    # one line fastq? All seq in just one line?
    layout = ''
    lines = chunk.splitlines()[:5]
    if len(lines) == 5:
        looks_one_line = (lines[0].startswith('@') and
                          lines[2].startswith('+') and
                          lines[4].startswith('@') and
                          len(lines[1]) == len(lines[3]))
        if not looks_one_line:
            layout = '-multiline'

    if fmt:
        return fmt + layout

    longest_expected_illumina = get_setting('LONGEST_EXPECTED_ILLUMINA_READ')
    if not any(length > longest_expected_illumina for length in lengths):
        return 'fastq-illumina' + layout

    if hasattr(fhand, 'name'):
        source = 'the file ' + fhand.name
    else:
        source = 'a file'
    # The message is appended piece by piece (the second piece used to
    # overwrite the first one) and only the last piece is %-formatted, so a
    # '%' in the file name can not break the formatting.
    msg = 'It was not possible to guess the format of ' + source
    msg += '.\nThe quality values could be Illumina, but there are '
    msg += 'sequences longer than %i bp.' % longest_expected_illumina
    raise UndecidedFastqVersionError(msg)
def get_matched_segments_for_read(self, read_name):
    '''It returns the matched segments for any oligo.

    The result is a (segments, elongated_match) tuple, or None if the read
    has no match. elongated_match is True when any match part has been
    elongated more than the DEFAULT_IGNORE_ELONGATION_SHORTER setting.
    '''
    min_elongation = get_setting('DEFAULT_IGNORE_ELONGATION_SHORTER')

    try:
        match_parts = self._match_parts[read_name]
    except KeyError:
        # There was no match in the blast
        return None

    # every match part is checked before deciding
    elongations = [ELONGATED in part and part[ELONGATED] > min_elongation
                   for part in match_parts]
    elongated_match = any(elongations)

    segments = covered_segments_from_match_parts(match_parts, in_query=False)
    return segments, elongated_match
def _guess_fastq_version(fhand, force_file_as_non_seek):
    '''It guesses the format of fastq files.

    It ignores the solexa fastq version.

    fhand - the file handle to analyze.
    force_file_as_non_seek - handed over to _get_some_qual_and_lengths.

    It returns 'fastq' or 'fastq-illumina'. The '-multiline' suffix is added
    when the first five lines of the peeked chunk do not look like a four
    line record followed by the header of the next record.

    It raises UndecidedFastqVersionError when the qualities could be Illumina
    but there are reads longer than LONGEST_EXPECTED_ILLUMINA_READ.
    '''
    lengths, is_sanger, chunk = _get_some_qual_and_lengths(
        fhand, force_file_as_non_seek)
    if is_sanger:
        fmt = 'fastq'
    elif is_sanger is False:
        fmt = 'fastq-illumina'
    else:
        fmt = None

    # One line fastq? Is every sequence written in just one line?
    lines = chunk.splitlines()[:5]
    if len(lines) != 5:
        # Not enough lines to decide, we assume one line per sequence
        one_line = ''
    elif (not lines[0].startswith('@') or
          not lines[2].startswith('+') or
          not lines[4].startswith('@') or
          len(lines[1]) != len(lines[3])):
        one_line = '-multiline'
    else:
        one_line = ''

    if fmt:
        return fmt + one_line

    longest_expected_illumina = get_setting('LONGEST_EXPECTED_ILLUMINA_READ')
    if any(length > longest_expected_illumina for length in lengths):
        msg = 'It was not possible to guess the format of '
        if hasattr(fhand, 'name'):
            msg += 'the file ' + fhand.name
        else:
            msg += 'a file '
        # The message has to be extended here, an assignment would drop the
        # file name that has just been added.
        msg += '\n. The quality values could be Illumina, but there are '
        # Only the fixed tail is interpolated, the file name could have a '%'
        msg += 'sequences longer than %i bp.' % longest_expected_illumina
        raise UndecidedFastqVersionError(msg)
    return 'fastq-illumina' + one_line
def __init__(self, seqs_fpath, seqs, program, params=None, filters=None,
             elongate_for_global=False, seqs_type=None):
    '''It inits the class.

    seqs_fpath, seqs, seqs_type - handed over to _look_for_blast_matches,
                                  that is run at the end of the init.
    program - stored as self.program.
    params - optional dict of parameters. The 'max_target_seqs' entry is
             always set to the PACKET_SIZE setting.
    filters - optional list stored as self.filters, empty by default.
    elongate_for_global - stored as self.elongate_for_global.
    '''
    self.program = program

    # We work on a copy, otherwise the max_target_seqs entry would show up
    # in the dict owned by the caller.
    params = {} if params is None else dict(params)
    params['max_target_seqs'] = str(get_setting('PACKET_SIZE'))
    self.params = params

    if filters is None:
        filters = []
    self.filters = filters

    self.elongate_for_global = elongate_for_global
    # The attributes above are set before the search is run
    self._match_parts = self._look_for_blast_matches(seqs_fpath, seqs,
                                                     seqs_type)
def sort_bam(in_bam_fpath, out_bam_fpath=None):
    '''It sorts a bam file by coordinate using Picard's SortSam.

    in_bam_fpath - path to the bam file to sort.
    out_bam_fpath - path for the sorted bam. When it is None or equal to the
                    input path the input file is replaced by the sorted one.

    It raises whatever check_call raises when the Picard command fails.
    '''
    # Local import, we can not be sure that os is imported by the module
    import os

    if out_bam_fpath is None:
        out_bam_fpath = in_bam_fpath

    in_place = out_bam_fpath == in_bam_fpath
    if in_place:
        # Picard can not write over its own input, so it writes to a
        # temporary file that will replace the input afterwards.
        sorted_fhand = NamedTemporaryFile(suffix='.sorted.bam', delete=False)
        temp_out_fpath = sorted_fhand.name
        # Only the path is needed, we do not keep the descriptor open
        sorted_fhand.close()
    else:
        temp_out_fpath = out_bam_fpath

    picard_jar = get_setting("PICARD_JAR")
    cmd = ['java', '-jar', picard_jar, 'SortSam',
           'INPUT={0}'.format(in_bam_fpath),
           'OUTPUT={0}'.format(temp_out_fpath),
           'SORT_ORDER=coordinate', 'VALIDATION_STRINGENCY=LENIENT']

    stderr = NamedTemporaryFile(suffix='picard.stderr')
    try:
        check_call(cmd, stderr=stderr)
    except Exception:
        # We do not leave the temporary bam behind if the sort fails
        if in_place and os.path.exists(temp_out_fpath):
            os.remove(temp_out_fpath)
        raise
    finally:
        stderr.close()

    if in_place:
        shutil.move(temp_out_fpath, out_bam_fpath)
def sort_bam(in_bam_fpath, out_bam_fpath=None):
    '''It sorts a bam file by coordinate using Picard's SortSam.

    in_bam_fpath - path to the bam file to sort.
    out_bam_fpath - path for the sorted bam. When it is None or equal to the
                    input path the input file is replaced by the sorted one.

    It raises whatever check_call raises when the Picard command fails.
    '''
    # Local import, we can not be sure that os is imported by the module
    import os

    if out_bam_fpath is None:
        out_bam_fpath = in_bam_fpath

    in_place = out_bam_fpath == in_bam_fpath
    if in_place:
        # Picard can not write over its own input, so it writes to a
        # temporary file that will replace the input afterwards.
        sorted_fhand = NamedTemporaryFile(suffix='.sorted.bam', delete=False)
        temp_out_fpath = sorted_fhand.name
        # Only the path is needed, we do not keep the descriptor open
        sorted_fhand.close()
    else:
        temp_out_fpath = out_bam_fpath

    picard_jar = get_setting("PICARD_JAR")
    cmd = ['java', '-jar', picard_jar, 'SortSam',
           'INPUT={0}'.format(in_bam_fpath),
           'OUTPUT={0}'.format(temp_out_fpath),
           'SORT_ORDER=coordinate', 'VALIDATION_STRINGENCY=LENIENT']

    stderr = NamedTemporaryFile(suffix='picard.stderr')
    try:
        check_call(cmd, stderr=stderr)
    except Exception:
        # We do not leave the temporary bam behind if the sort fails
        if in_place and os.path.exists(temp_out_fpath):
            os.remove(temp_out_fpath)
        raise
    finally:
        stderr.close()

    if in_place:
        shutil.move(temp_out_fpath, out_bam_fpath)
def map_process_to_sortedbam(map_process, out_fpath, key='coordinate',
                             stderr_fhand=None, tempdir=None):
    '''It pipes the output of a mapping process into Picard's SortSam.

    map_process - the running mapper process, its stdout feeds the sort.
    out_fpath - path of the sorted output file.
    key - sort order given to SortSam.
    stderr_fhand - file where the stderr of the sort is written, by default
                   a temporary file is used.
    tempdir - directory for the SortSam temporary files, by default the
              system temporary directory.

    It raises RuntimeError when the mapping or the sort process fail.
    '''
    if stderr_fhand is None:
        stderr = NamedTemporaryFile(suffix='.stderr')
    else:
        stderr = stderr_fhand

    if tempdir is None:
        tempdir = tempfile.gettempdir()

    picard_jar = get_setting("PICARD_JAR")
    cmd = ['java', '-jar', picard_jar, 'SortSam', 'I=/dev/stdin',
           'O=' + out_fpath, 'SO=' + key, 'TMP_DIR=' + tempdir,
           'VALIDATION_STRINGENCY=LENIENT']
    try:
        sort = popen(cmd, stdin=map_process.stdout, stderr=stderr)
        # Our copy of the pipe is closed, the sort process keeps its own
        map_process.stdout.close()
        sort.communicate()
        # The returncode is not set until the process has been waited for,
        # without this a failed mapping would never be noticed.
        map_process.wait()
    finally:
        # We only close the stderr file if it was created in here
        if stderr_fhand is None:
            stderr.close()

    if map_process.returncode:
        raise RuntimeError('Error in mapping process')

    if sort.returncode:
        raise RuntimeError('Error in Sort process')
def filter_chimeras(ref_fpath, out_fhand, chimeras_fhand, in_fhands,
                    unknown_fhand, unpaired=False, paired_result=True,
                    settings=get_setting('CHIMERAS_SETTINGS'),
                    min_seed_len=None, directory=None):
    '''It maps the reads, classifies them and writes each class to its file.

    ref_fpath, directory, min_seed_len - handed over to _sorted_mapped_reads.
    out_fhand - file for the non chimeric reads.
    chimeras_fhand - file for the chimeric reads, they are not written when
                     it is None.
    in_fhands - files with the reads, the format is taken from the first one.
    unknown_fhand - file for the reads of unknown kind, they are not written
                    when it is None.
    unpaired - if True the input files are mapped as unpaired reads,
               otherwise the pairs are deinterleaved into two temporary
               files before the mapping.
    paired_result, settings - handed over to classify_mapped_reads.

    A summary with the counts and the fractions is printed at the end.
    '''
    file_format = get_format(in_fhands[0])
    if unpaired:
        unpaired_fpaths = [fhand.name for fhand in in_fhands]
        paired_fpaths = None
    else:
        # These temporary files have to be alive while the mapping is done
        f_fhand = NamedTemporaryFile()
        r_fhand = NamedTemporaryFile()
        seqs = read_seqs(in_fhands)
        deinterleave_pairs(seqs, f_fhand, r_fhand, file_format)
        paired_fpaths = [f_fhand.name, r_fhand.name]
        unpaired_fpaths = None

    bamfile = _sorted_mapped_reads(ref_fpath, paired_fpaths, unpaired_fpaths,
                                   directory, file_format, min_seed_len)

    total = 0
    chimeric = 0
    unknown = 0
    for pair, kind in classify_mapped_reads(bamfile, settings=settings,
                                            paired_result=paired_result,
                                            file_format=file_format):
        total += 1
        if kind is NON_CHIMERIC:
            write_seqs(pair, out_fhand)
        elif kind is CHIMERA:
            # The pair is counted even when it is not written, otherwise it
            # would be reported as non chimeric.
            chimeric += 1
            if chimeras_fhand is not None:
                write_seqs(pair, chimeras_fhand)
        elif kind is UNKNOWN:
            unknown += 1
            if unknown_fhand is not None:
                write_seqs(pair, unknown_fhand)

    mapped = total - chimeric - unknown
    # With no pairs the fractions are reported as 0.0, a division by zero
    # would crash once all the work is already done.
    divisor = float(total) if total else 1.0

    # One string per line gives the same output as the original
    # comma-separated print statements.
    print('Total pairs analyzed:  %s' % total)
    print('Chimeric pairs filtered:  %s \t%s' % (chimeric,
                                                 chimeric / divisor))
    print('Unknown pairs found:  %s \t%s' % (unknown, unknown / divisor))
    print('Non-chimeric pairs:  %s \t%s' % (mapped, mapped / divisor))
def seqio(in_fhands, out_fhand, out_format, copy_if_same_format=True):
    '''It converts sequence files between formats.

    When there is just one input file and it already has the output format
    no conversion is done: the file is copied or, if copy_if_same_format is
    False, a relative symlink is created.

    It raises IncompatibleFormatError when the output format is not
    supported or when there are no qualities to write, and MalformedFile
    when the qualities disagree with the sequence.
    '''
    if out_format not in get_setting('SUPPORTED_OUTPUT_FORMATS'):
        raise IncompatibleFormatError("This output format is not supported")

    in_formats = [get_format(fhand) for fhand in in_fhands]
    nothing_to_convert = (len(in_fhands) == 1 and
                          in_formats[0] == out_format)
    if nothing_to_convert:
        if copy_if_same_format:
            copyfileobj(in_fhands[0], out_fhand)
        else:
            rel_symlink(in_fhands[0].name, out_fhand.name)
        return

    seqs = _read_seqrecords(in_fhands)
    try:
        write_seqrecs(seqs, out_fhand, out_format)
    except ValueError as error:
        if error_quality_disagree(error):
            raise MalformedFile(str(error))
        if 'No suitable quality scores' in str(error):
            raise IncompatibleFormatError(
                'No qualities available to write output file')
        # Any other ValueError is not ours to translate
        raise
from subprocess import check_output, CalledProcessError from tempfile import NamedTemporaryFile from Bio.SeqRecord import SeqRecord from Bio.Seq import Seq from crumbs.transcript_orientations import TranscriptOrientator from crumbs.settings import get_setting from crumbs.utils.test_utils import TEST_DATA_DIR from crumbs.utils.bin_utils import BIN_DIR from crumbs.utils.tags import SEQRECORD from crumbs.seq import get_str_seq from crumbs.seqio import read_seqs from crumbs.seq import SeqWrapper POLYA_ANNOTATOR_MISMATCHES = get_setting('POLYA_ANNOTATOR_MISMATCHES') # pylint: disable=R0201 # pylint: disable=R0904 _wrap_seq = lambda seq: SeqWrapper(SEQRECORD, seq, None) class TestTranscriptomeOrientator(unittest.TestCase): 'the class' def test_transcriptome_orientator(self): '''tests the orientator class''' estscan_matrix = os.path.join(TEST_DATA_DIR, 'Arabidopsis_thaliana.smat')
def test_get_settings(self):
    'The DEFAULT_KMER_SIZE setting can be read and it is not empty'
    assert get_setting('DEFAULT_KMER_SIZE')
def test_split_mates(self):
    'It tests that the mate pairs are split where the linker is found'
    seq5 = 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC'
    seq3 = 'ATCGATCATGTTGTATTGTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT'
    linker = TITANIUM_LINKER
    rev_linker = get_setting('TITANIUM_LINKER_REV')

    # One read for every linker layout that we want to check
    reads = [
        ('seq1', seq5 + linker + seq3),               # a complete linker
        ('seq2', seq5),                               # no linker
        ('seq3', seq5 + linker[2:25] + seq3),         # a partial linker
        ('seq4', linker[10:] + seq3),                 # the linker is 5 prime
        ('seq5', linker + seq3 + FLX_LINKER + seq5),  # two linkers
        ('seq6', seq5 + rev_linker + seq3),           # reverse linker
    ]
    mate_fhand = NamedTemporaryFile(suffix='.fasta')
    for name, seq in reads:
        mate_fhand.write('>' + name + '\n' + seq + '\n')
    mate_fhand.flush()

    splitter = MatePairSplitter()
    new_seqs = [splitter(packet)
                for packet in read_seq_packets([mate_fhand], 2)]
    out_fhand = StringIO()
    write_seq_packets(out_fhand, new_seqs, file_format='fasta')
    result = out_fhand.getvalue()

    xpect = ''.join([
        r'>seq1\1',
        '\n',
        'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC\n',
        r'>seq1\2',
        '\n',
        'ATCGATCATGTTGTATTGTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n',
        '>seq2\n',
        'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC\n',
        '>seq3_pl.part1\n',
        'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTG\n',
        '>seq3_pl.part2\n',
        'GTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n',
        '>seq4\n',
        'ATCGATCATGTTGTATTGTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n',
        '>seq5_mlc.part1\n',
        'TCGTATAACTTCGTATAATGTATGCTATACGAAGTTATTACGATCGATCATGTTGTAT',
        'TG',
        'TGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n',
        '>seq5_mlc.part2\n',
        'ACCTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC',
        '\n',
        r'>seq6\1',
        '\n',
        'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC\n',
        r'>seq6\2',
        '\n',
        'ATCGATCATGTTGTATTGTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n',
    ])
    assert xpect == result

    # with a short linker in the 3 prime end
    mate_fhand = NamedTemporaryFile(suffix='.fasta')
    mate_fhand.write(">seq1\nCATCAATGACATCACAAATGACATCAACAAACTCAAA"
                     "CTCACATACACTGCTGTACCGTAC")
    mate_fhand.flush()

    splitter = MatePairSplitter()
    new_seqs = [splitter(packet)
                for packet in read_seq_packets([mate_fhand], 1)]
    out_fhand = StringIO()
    write_seq_packets(out_fhand, new_seqs, file_format='fasta')
    result = ">seq1\nCATCAATGACATCACAAATGACATCAACAAACTCAAACTCACATACA\n"
    assert result == out_fhand.getvalue()
import os.path from random import choice from Bio.SeqRecord import SeqRecord from Bio.Seq import Seq from crumbs.split_mates import MatePairSplitter from crumbs.settings import get_setting from crumbs.seqio import read_seq_packets, write_seq_packets, read_seqs from crumbs.utils.bin_utils import BIN_DIR from crumbs.utils.test_utils import TEST_DATA_DIR from crumbs.utils.seq_utils import process_seq_packets from crumbs.utils.tags import SEQRECORD from crumbs.seq import get_name, SeqWrapper, get_str_seq TITANIUM_LINKER = get_setting('TITANIUM_LINKER') FLX_LINKER = get_setting('FLX_LINKER') # pylint: disable=R0201 # pylint: disable=R0904 def create_a_matepair_file(): 'It creates a matepair fasta file' seq_5 = 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC' seq_3 = 'ATCGATCATGTTGTATTGTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT' mate_seq = seq_5 + TITANIUM_LINKER + seq_3 mate_fhand = NamedTemporaryFile(suffix='.fasta') mate_fhand.write('>seq1\n' + mate_seq + '\n') mate_fhand.flush()