def __init__(self, linkers=None):
    """Store the linker sequences used by this object.

    When no linkers are given, the 'LINKERS' setting is read and every
    entry is wrapped in a fasta SeqItem named after its position in the
    setting. The resulting iterable of linkers is materialised into a list
    and kept in self.linkers.
    """
    if linkers is None:
        default_linkers = get_setting('LINKERS')
        seq_items = []
        for index, linker in enumerate(default_linkers):
            # NOTE(review): the fasta record is passed as one string, not
            # as a list of lines -- confirm SeqItem consumers accept that.
            record = '>%d\n%s\n' % (index, linker)
            seq_items.append(SeqItem(str(index), record))
        linkers = assing_kind_to_seqs(SEQITEM, seq_items, 'fasta')
    self.linkers = list(linkers)
def _read_seqitems(fhands):
    """Return a single iterator over the seq items of all the given files.

    For every file handle the format is obtained with get_format, the file
    is itemized with _itemize_fastx and the items are tagged as SEQITEM
    with that format. The per-file streams are chained in input order.
    """
    def _tagged_items(fhand):
        # The format is looked up before the itemizer is created.
        fmt = get_format(fhand)
        return assing_kind_to_seqs(SEQITEM, _itemize_fastx(fhand), fmt)

    per_file_streams = [_tagged_items(fhand) for fhand in fhands]
    return chain.from_iterable(per_file_streams)
def test_case_change(self):
    """ChangeCase should upper-case, lower-case or swap the case of seqs."""
    # Each case: (action, extra SeqRecord keyword arguments, expected str).
    cases = (
        (UPPERCASE, {'letter_annotations': {'dummy': 'dddd'}}, 'ACCG'),
        (LOWERCASE, {}, 'accg'),
        (SWAPCASE, {}, 'AccG'),
    )
    for action, record_kwargs, expected in cases:
        records = [SeqRecord(Seq('aCCg'), **record_kwargs)]
        wrapped = assing_kind_to_seqs(SEQRECORD, records, None)
        converter = ChangeCase(action=action)
        result = [get_str_seq(seq) for seq in converter(wrapped)]
        assert result == [expected]
def test_matching_segments(self):
    """Check the detection of oligos (linkers) in sequence files.

    The module-level LINKERS are tagged as SEQRECORD and matched with
    blastn against the mate pair file; the first matched-segments entry of
    read 'seq1' must be exactly the expected linker region.
    """
    five_prime = 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC'
    # The expected match starts at len(five_prime) and ends at the last
    # position of five_prime + TITANIUM_LINKER (inclusive end) --
    # presumably the layout create_a_matepair_file gives to 'seq1'.
    region_start = len(five_prime)
    region_end = len(five_prime + TITANIUM_LINKER) - 1
    expected_region = (region_start, region_end)

    mate_fhand = create_a_matepair_file()
    linkers = assing_kind_to_seqs(SEQRECORD, LINKERS, None)
    matcher = BlasterForFewSubjects(
        mate_fhand.name,
        linkers,
        program='blastn',
        elongate_for_global=True,
    )
    segments = matcher.get_matched_segments_for_read('seq1')
    linker_region = segments[0]
    assert [expected_region] == linker_region
def test_matching_segments(self):
    """Check the detection of oligos (linkers) in sequence files.

    Two fasta SeqItems (the titanium and the flx linkers) are matched with
    blastn against the mate pair file; the first matched-segments entry of
    read 'seq1' must be exactly the expected titanium linker region.
    """
    five_prime = 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC'
    # The expected match starts at len(five_prime) and ends at the last
    # position of five_prime + TITANIUM_LINKER (inclusive end) --
    # presumably the layout create_a_matepair_file gives to 'seq1'.
    region_start = len(five_prime)
    region_end = len(five_prime + TITANIUM_LINKER) - 1
    expected_region = (region_start, region_end)

    mate_fhand = create_a_matepair_file()
    titanium = SeqItem('titan', ['>titan\n', TITANIUM_LINKER + '\n'])
    flx = SeqItem('flx', ['>flx\n', FLX_LINKER + '\n'])
    linkers = assing_kind_to_seqs(SEQITEM, [titanium, flx], 'fasta')
    matcher = BlasterForFewSubjects(
        mate_fhand.name,
        linkers,
        program='blastn',
        elongate_for_global=True,
    )
    segments = matcher.get_matched_segments_for_read('seq1')
    linker_region = segments[0]
    assert [expected_region] == linker_region
def test_matching_segments(self):
    """Check that the titanium linker is located inside read 'seq1'.

    The titanium and flx linkers are wrapped as fasta SeqItems and handed
    to BlasterForFewSubjects (blastn, elongate_for_global=True) together
    with the mate pair file. The first element returned by
    get_matched_segments_for_read('seq1') must hold only the expected
    region.
    """
    seq_5 = 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC'
    mate_fhand = create_a_matepair_file()

    raw_linkers = []
    raw_linkers.append(SeqItem('titan', ['>titan\n', TITANIUM_LINKER + '\n']))
    raw_linkers.append(SeqItem('flx', ['>flx\n', FLX_LINKER + '\n']))
    linkers = assing_kind_to_seqs(SEQITEM, raw_linkers, 'fasta')

    matcher = BlasterForFewSubjects(mate_fhand.name, linkers,
                                    program='blastn',
                                    elongate_for_global=True)
    linker_region = matcher.get_matched_segments_for_read('seq1')[0]

    # Region from len(seq_5) up to the last position of
    # seq_5 + TITANIUM_LINKER, end included.
    first_pos = len(seq_5)
    last_pos = len(seq_5 + TITANIUM_LINKER) - 1
    assert [(first_pos, last_pos)] == linker_region
def _read_seqitems(fhands, file_format):
    """Return an iterator over the seq items of all the given files.

    Parameters:
        fhands: iterable of file handles to read, in order.
        file_format: format shared by all the files. When it is
            GUESS_FORMAT or None the format of every file is guessed
            independently with guess_format.

    Returns:
        A chain over the per-file item iterators; the items of each file
        are tagged as SEQITEM with the format used for that file.

    Raises:
        NotImplementedError: when a file is neither 'fasta' nor a
            non-multiline fastq format.
    """
    guess_per_file = file_format == GUESS_FORMAT or file_format is None
    seq_iters = []
    for fhand in fhands:
        # Use a per-file variable: overwriting file_format here would make
        # the format guessed for the first file stick for all the others.
        if guess_per_file:
            fhand_format = guess_format(fhand)
        else:
            fhand_format = file_format

        if fhand_format == 'fasta':
            seq_iter = _itemize_fasta(fhand)
        elif 'multiline' not in fhand_format and 'fastq' in fhand_format:
            seq_iter = _itemize_fastq(fhand)
        else:
            msg = 'Format not supported by the itemizers: ' + fhand_format
            raise NotImplementedError(msg)
        seq_iter = assing_kind_to_seqs(SEQITEM, seq_iter, fhand_format)
        seq_iters.append(seq_iter)
    return chain.from_iterable(seq_iters)
def read_seqs(fhands, file_format=GUESS_FORMAT, out_format=None,
              prefered_seq_classes=None):
    """Return a stream of seqs in one of the preferred codings.

    Parameters:
        fhands: sequence of file handles; when file_format is GUESS_FORMAT
            the input format is guessed from the first one.
        file_format: input format, or GUESS_FORMAT to guess it.
        out_format: format the seqs will be written in, if known. When it
            is set (not None or GUESS_FORMAT) and differs from the input
            format, SEQITEM is not used.
        prefered_seq_classes: seq classes to try, in order of preference.
            Defaults to [SEQITEM, SEQRECORD]. The list is not modified.

    Returns:
        The seqs produced by the first seq class whose reader does not
        raise NotImplementedError.

    Raises:
        ValueError: when no seq class is left to try or a class is unknown.
        RuntimeError: when every seq class raised NotImplementedError.
    """
    if not prefered_seq_classes:
        seq_classes = [SEQITEM, SEQRECORD]
    else:
        # Work on a copy so the caller's list is never altered when
        # SEQITEM has to be discarded below.
        seq_classes = list(prefered_seq_classes)

    if file_format == GUESS_FORMAT:
        in_format = guess_format(fhands[0])
    else:
        in_format = file_format

    # seqitems is incompatible with different input and output formats
    converting = (out_format not in (None, GUESS_FORMAT) and
                  in_format != out_format)
    if converting and SEQITEM in seq_classes:
        seq_classes.remove(SEQITEM)

    if not seq_classes:
        msg = 'No valid seq class left or prefered'
        raise ValueError(msg)

    for seq_class in seq_classes:
        if seq_class == SEQITEM:
            try:
                return _read_seqitems(fhands, in_format)
            except NotImplementedError:
                continue
        elif seq_class == SEQRECORD:
            try:
                seqs = _read_seqrecords(fhands, in_format)
                return assing_kind_to_seqs(SEQRECORD, seqs, None)
            except NotImplementedError:
                continue
        else:
            raise ValueError('Unknown class for seq: ' + seq_class)
    raise RuntimeError('We should not be here, fixme')
def _read_seqitems(fhands):
    """Return a single iterator over the seq items of all the given files.

    The format of every file is obtained with get_format. 'fasta' files go
    through _itemize_fasta and non-multiline fastq files through
    _itemize_fastq; any other format raises NotImplementedError. A
    ValueError from _itemize_fastq that error_quality_disagree recognises
    is re-raised as MalformedFile. Items are tagged as SEQITEM with the
    file format and the per-file streams are chained in input order.
    """
    streams = []
    for fhand in fhands:
        fmt = get_format(fhand)

        # Guard clause: reject anything the itemizers cannot handle.
        if fmt != 'fasta' and ('multiline' in fmt or 'fastq' not in fmt):
            raise NotImplementedError(
                'Format not supported by the itemizers: ' + fmt)

        if fmt == 'fasta':
            items = _itemize_fasta(fhand)
        else:
            try:
                items = _itemize_fastq(fhand)
            except ValueError as error:
                if not error_quality_disagree(error):
                    raise
                raise MalformedFile(str(error))

        streams.append(assing_kind_to_seqs(SEQITEM, items, fmt))
    return chain.from_iterable(streams)
def read_seqs(fhands, out_format=None, prefered_seq_classes=None):
    """Return a stream of seqs in one of the preferred codings.

    Parameters:
        fhands: sequence of file handles; the input format is taken from
            the first one with get_format.
        out_format: format the seqs will be written in, if known.
        prefered_seq_classes: seq classes to try, in order of preference.
            Defaults to [SEQITEM, SEQRECORD]. The list is not modified.

    Returns:
        An empty list when get_format reports an empty first file,
        otherwise the seqs produced by the first seq class whose reader
        does not raise NotImplementedError.

    Raises:
        ValueError: when no seq class is left to try or a class is unknown.
        RuntimeError: when every seq class raised NotImplementedError.
    """
    if not prefered_seq_classes:
        seq_classes = [SEQITEM, SEQRECORD]
    else:
        # Work on a copy so the caller's list is never altered when
        # SEQITEM has to be discarded below.
        seq_classes = list(prefered_seq_classes)

    try:
        in_format = get_format(fhands[0])
    except FileIsEmptyError:
        return []

    # seqitems is incompatible with different input and output formats
    # or when in_format != a fasta or fastq
    converting = (out_format not in (None, GUESS_FORMAT) and
                  in_format != out_format)
    itemizable = (in_format in
                  ('fasta',) + SANGER_FASTQ_FORMATS + ILLUMINA_FASTQ_FORMATS)
    # Only discard SEQITEM when it is actually listed: looking up a missing
    # SEQITEM would raise an unrelated ValueError for non fasta/fastq
    # input read with e.g. only SEQRECORD preferred.
    if (converting or not itemizable) and SEQITEM in seq_classes:
        seq_classes.remove(SEQITEM)

    if not seq_classes:
        msg = 'No valid seq class left or prefered'
        raise ValueError(msg)

    for seq_class in seq_classes:
        if seq_class == SEQITEM:
            try:
                return _read_seqitems(fhands)
            except NotImplementedError:
                continue
        elif seq_class == SEQRECORD:
            try:
                seqs = _read_seqrecords(fhands)
                return assing_kind_to_seqs(SEQRECORD, seqs, None)
            except NotImplementedError:
                continue
        else:
            raise ValueError('Unknown class for seq: ' + seq_class)
    raise RuntimeError('We should not be here, fixme')
def _read_seqitems(fhands, file_format):
    """Return an iterator over the seq items of all the given files.

    Parameters:
        fhands: iterable of file handles to read, in order.
        file_format: format shared by all the files. When it is
            GUESS_FORMAT or None the format of every file is guessed
            independently with guess_format.

    Returns:
        A chain over the per-file item iterators; the items of each file
        are tagged as SEQITEM with the format used for that file.

    Raises:
        NotImplementedError: when a file is neither 'fasta' nor a
            non-multiline fastq format.
        MalformedFile: when _itemize_fastq raises a ValueError that
            error_quality_disagree recognises.
    """
    guess_per_file = file_format == GUESS_FORMAT or file_format is None
    seq_iters = []
    for fhand in fhands:
        # Use a per-file variable: overwriting file_format here would make
        # the format guessed for the first file stick for all the others.
        if guess_per_file:
            fhand_format = guess_format(fhand)
        else:
            fhand_format = file_format

        if fhand_format == 'fasta':
            seq_iter = _itemize_fasta(fhand)
        elif 'multiline' not in fhand_format and 'fastq' in fhand_format:
            try:
                seq_iter = _itemize_fastq(fhand)
            except ValueError as error:
                if error_quality_disagree(error):
                    raise MalformedFile(str(error))
                raise
        else:
            msg = 'Format not supported by the itemizers: ' + fhand_format
            raise NotImplementedError(msg)
        seq_iter = assing_kind_to_seqs(SEQITEM, seq_iter, fhand_format)
        seq_iters.append(seq_iter)
    return chain.from_iterable(seq_iters)