def _write_snv(self, sequence, snv):
    '''It writes the given snv feature as one line of the vcf.

    The columns are, in order: sequence name, start position plus one, snv
    id ('.' when the id is unknown), reference allele without gaps,
    alternative alleles, quality, filters, info and genotypes. They are
    joined with tabs, written to the temporary file handle and flushed.

    If building the info column fails with a KeyError the sequence name and
    the snv position are printed and the error is raised again.
    '''
    columns = [get_seq_name(sequence),
               str(int(snv.location.start.position) + 1)]

    snv_id = snv.id
    columns.append('.' if snv_id == "<unknown id>" else snv_id)

    qualifiers = snv.qualifiers
    #the gaps are not written in the reference allele
    columns.append(qualifiers['reference_allele'].replace('-', ''))

    alleles = qualifiers['alleles']
    alternative_column, alternative_alleles = \
                                   self._create_alternative_alleles(alleles)
    columns.append(alternative_column)
    columns.append(self._create_quality(alleles, alternative_alleles))
    columns.append(self._create_filters(qualifiers))

    try:
        info_column = self._create_info(qualifiers, alternative_alleles)
    except KeyError:
        #some context to locate the offending snv, the error is not hidden
        print('sequence %s' % (get_seq_name(sequence),))
        print('position %s' % (str(int(snv.location.start.position)),))
        raise
    columns.append(info_column)

    columns.append(self._create_genotypes(qualifiers, alternative_alleles))

    self._temp_fhand.write('%s\n' % '\t'.join(columns))
    self._temp_fhand.flush()
def test_get_seq_name():
    'It checks the name chosen by get_seq_name for different sequences'
    #a plain str has no name attribute -> a long generated name (uuid)
    generated = get_seq_name('AA')
    assert len(generated) > 10

    #when both are given the name is used, not the id
    with_name_and_id = SeqWithQuality(id='seqid', name='seqname',
                                      seq=Seq('ATGAT'))
    assert get_seq_name(with_name_and_id) == 'seqname'

    #with only an id, the id is the name
    with_id = SeqWithQuality(id='seqid', seq=Seq('ATGAT'))
    assert get_seq_name(with_id) == 'seqid'

    #with neither name nor id we get a long generated name again
    anonymous = SeqWithQuality(seq=Seq('ATGAT'))
    assert len(get_seq_name(anonymous)) > 10
def write(self, sequence):
    '''It writes one line for every snv feature found in the sequence.

    The line is "<name>,SNP,<left flank>[<alleles>]<right flank>", where
    name is the sequence name plus the snv start position plus one, the
    flanks are up to self._length residues long at each side of the snv
    and the alleles (first item of every allele key plus the reference
    allele) are separated by "/". self.num_features is increased once per
    snv and the file handle is flushed after every line.
    '''
    seq_name = get_seq_name(sequence)
    for snv in sequence.get_features(kind='snv'):
        self.num_features += 1

        position = snv.location.start.position
        ref_allele = snv.qualifiers['reference_allele']
        snv_name = "%s_%d" % (seq_name, position + 1)

        left_limit = position - self._length
        right_limit = position + self._length + 1
        if self._write_short:
            #flanks that run off the sequence are cut at its ends
            left_limit = max(left_limit, 0)
            right_limit = min(right_limit, len(sequence))
        #NOTE(review): without _write_short the limits are not clamped, so
        #a negative left limit reaches the slice as is -- confirm that the
        #callers never send snvs that close to the sequence start.
        left_flank = sequence[left_limit: position]
        right_flank = sequence[position + 1: right_limit]

        allele_names = set([key[0] for key in snv.qualifiers['alleles']])
        allele_names.add(ref_allele)
        variants = "[%s]" % "/".join(allele_names)

        line = '%s,SNP,%s\n' % (snv_name,
                                left_flank.seq + variants + right_flank.seq)
        self.fhand.write(line)
        self.fhand.flush()
def go_annotator(sequence):
    '''It stores the GO terms of the sequence in its 'GOs' annotation.

    The terms are looked up by sequence name in go_annotations; sequences
    without an entry are left untouched. It returns the sequence, or None
    when None is given.
    '''
    if sequence is not None:
        name = get_seq_name(sequence)
        if name in go_annotations:
            sequence.annotations['GOs'] = go_annotations[name]
    return sequence
def ortholog_annotator(sequence):
    '''It stores the orthologs of the sequence in its annotations.

    The orthologs are looked up by sequence name and saved under the
    '<species>-orthologs' key; a sequence with no orthologs is left
    untouched. It returns the sequence, or None when None is given.
    '''
    if sequence is None:
        return None

    seq_name = get_seq_name(sequence)
    try:
        found = orthologs[seq_name]
    except KeyError:
        #nothing known for this sequence
        return sequence
    sequence.annotations['%s-orthologs' % species] = found
    return sequence
def _change_names_in_files_by_seq(fhand_in, fhand_out, naming, file_format):
    '''It renames the sequences of a file reading them one by one.

    Every sequence read from fhand_in gets, as both name and id, the unique
    name that naming gives for its old name, and is then written to
    fhand_out in the same format.
    '''
    for record in seqs_in_file(fhand_in, format=file_format):
        unique_name = naming.get_uniquename(get_seq_name(record))
        record.name = record.id = unique_name
        write_seqs_in_file([record], fhand_out, format=file_format)
def descrition_annotator(sequence):
    '''It sets the description of the sequence from the descriptions found.

    The entry is looked up by sequence name; when there is one the
    description becomes 'Similar to <description> (<db_name>:<subj_name>)'.
    It returns the sequence, or None when None is given.
    '''
    if sequence is not None:
        seq_name = get_seq_name(sequence)
        if seq_name in descriptions:
            hit = descriptions[seq_name]
            sequence.description = 'Similar to %s (%s:%s)' % (
                                                        hit['description'],
                                                        hit['db_name'],
                                                        hit['subj_name'])
    return sequence
def write(self, sequence):
    '''It writes the ortholog annotations of the sequence.

    For every annotation whose key contains 'ortholog' a line with the
    species (the key text before the first '-'), the sequence name and the
    comma separated orthologs is written, tab separated, and flushed.
    self.num_features is increased once per written line.
    '''
    seq_name = get_seq_name(sequence)
    for key, values in sequence.annotations.items():
        if 'ortholog' not in key:
            continue
        self.num_features += 1
        species = key.split('-')[0]
        ortholog_names = ','.join(values)
        self.fhand.write('%s\t%s\t%s\n' % (species, seq_name,
                                           ortholog_names))
        self.fhand.flush()
def temp_qual_file(seqs):
    '''It writes the qualities of the sequences in a temporary qual file.

    Every sequence that is not None gets a '>name' line followed by its
    phred_quality letter annotation as space separated values. The
    returned temporary file (suffix .qual) is flushed and rewound to its
    start.
    '''
    qual_fhand = tempfile.NamedTemporaryFile(suffix='.qual')
    for seq in seqs:
        if seq is None:
            continue
        seq_name = get_seq_name(seq)
        qualities = ' '.join([str(value) for value in
                              seq.letter_annotations["phred_quality"]])
        qual_fhand.write('>%s\n%s\n' % (seq_name, qualities))
    qual_fhand.flush()
    qual_fhand.seek(0)
    return qual_fhand
def write(self, sequence):
    '''It writes the dna and the peptide of every orf of the sequence.

    For each 'orf' feature one fasta record with the orf dna is written to
    self.fhand and one with the orf peptide to self.pep_fhand. Both headers
    carry the sequence name, the orf start and end (the location values
    plus one) and the strand. self.num_features is increased once per orf
    and both file handles are flushed after every orf.
    '''
    #The name is resolved once per sequence and not once per orf: it does
    #not change inside the loop, and this way all the records of a sequence
    #share exactly the same name even if get_seq_name has to make one up
    #for a sequence that lacks it.
    name = get_seq_name(sequence)
    for orf in sequence.get_features(kind='orf'):
        self.num_features += 1
        start = int(str(orf.location.start)) + 1
        end = int(str(orf.location.end)) + 1
        strand = orf.qualifiers['strand']
        seq_content = '>%s_orf_seq start=%d end=%d strand=%s\n%s\n' % \
                  (name, start, end, strand, str(orf.qualifiers['dna']))
        pep_content = '>%s_orf_pep start=%d end=%d strand=%s\n%s\n' % \
                  (name, start, end, strand, str(orf.qualifiers['pep']))
        self.fhand.write(seq_content)
        self.pep_fhand.write(pep_content)
        self.fhand.flush()
        self.pep_fhand.flush()
def reference_in_list_filter(sequence):
    '''It tags the snvs of the sequence with the 'ref_not_in_list' result.

    The result is True when the sequence name is in seq_list and False
    otherwise, and it is the same for all the snvs of the sequence. The
    snvs that already have a result for this filter are not modified.
    It returns the sequence, or None when None is given.
    '''
    if sequence is None:
        return None

    in_list = get_seq_name(sequence) in seq_list
    for snv in sequence.get_features(kind='snv'):
        #a result calculated before is kept as it is
        if _get_filter_result(snv, 'ref_not_in_list') is None:
            _add_filter_result(snv, 'ref_not_in_list', in_list)
    return sequence
def write(self, sequence):
    '''It writes one line for every microsatellite of the sequence.

    The tab separated columns are: sequence name, start and end (the
    location values plus one), length, score, type, unit and number of
    repeats (the length divided by the unit length, without decimals).
    self.num_features is increased once per microsatellite and the file
    handle is flushed after every line.
    '''
    seq_name = get_seq_name(sequence)
    for ssr in sequence.get_features(kind='microsatellite'):
        self.num_features += 1

        qualifiers = ssr.qualifiers
        start = int(str(ssr.location.start)) + 1
        end = int(str(ssr.location.end)) + 1
        score = int(qualifiers['score'])
        kind = qualifiers['type']
        unit = qualifiers['unit']
        length = end - start + 1
        #whole repeats only
        num_repeats = length // len(unit)

        columns = (seq_name, start, end, length, score, kind, unit,
                   num_repeats)
        self.fhand.write('%s\t%d\t%d\t%d\t%d\t%s\t%s\t%d\n' % columns)
        self.fhand.flush()
def _write_fasta_file(seqs, fhand_seq, default_quality=None, fhand_qual=None): '''Given a Seq and its default name it returns a fasta file in a temporary file. If the seq is a SeqWithQuality you can ask a qual fasta file''' try: # Is seqs an seq or an iter?? #pylint:disable-msg=W0104 seqs.name seqs = [seqs] #pylint:disable-msg=W0704 except AttributeError: pass for seq in seqs: if seq is None: continue name = get_seq_name(seq) if fhand_qual is not None: try: quality = seq.letter_annotations["phred_quality"] except (AttributeError, KeyError): if default_quality: quality = [default_quality] * len(seq) else: msg = 'Sequence must have a phred_quality letter annotation' raise AttributeError(msg) if quality is not None: quality = [str(qual) for qual in quality] fhand_qual.write(fasta_str(' '.join(quality), name)) else: raise AttributeError('Quality can not be empty') try: desc = seq.description except AttributeError: desc = None if desc == "<unknown description>": desc = None fasta_seq = fasta_str(seq, name, desc) fhand_seq.write(fasta_seq) fhand_seq.flush() if fhand_qual is not None: fhand_qual.flush()
def _snvs_in_bam(bam, reference, min_quality, default_sanger_quality,
                 min_mapq, min_num_alleles, max_maf, min_num_reads_for_allele,
                 read_edge_conf=None, default_bam_platform=None):
    '''It yields the snv information for every snv in the given reference.

    The pileup columns of the reference are walked one by one. In every
    column the alleles of the reads are collected, skipping the reads with
    a mapping quality below min_mapq and, when read_edge_conf has an entry
    for the platform of the read, the positions that fall in the read
    edges. The collected alleles are then cleaned (Ns, default sanger
    qualities, bad qualities, maf check, minimum number of reads) and, if
    enough alleles remain, a dict is yielded with the keys 'ref_name',
    'ref_position', 'reference_allele', 'alleles' and 'read_groups'.

    When the bam has no read group information default_bam_platform is
    used for all the reads; if it is None a ValueError is raised.
    '''
    global SEGMENTS_CACHE

    min_num_alleles = int(min_num_alleles)

    read_groups_info = get_read_group_info(bam)
    if not read_groups_info:
        if default_bam_platform is None:
            msg = 'Platform is not present either in header or in '
            msg += 'configuration'
            raise ValueError(msg)
        read_groups_info = {UNKNOWN_RG: {'PL': default_bam_platform}}

    reference_id = get_seq_name(reference)
    reference_seq = reference.seq
    reference_len = len(reference_seq)

    #we're starting a new molecule, so the segments cached for the previous
    #one are dropped
    SEGMENTS_CACHE = {}

    for column in bam.pileup(reference=reference_id):
        alleles = {}
        ref_pos = column.pos
        if ref_pos >= reference_len:
            continue
        ref_id = bam.getrname(column.tid)
        ref_allele = reference_seq[ref_pos].upper()

        #every read of the column adds its alleles to the alleles dict
        for pileup_read in column.pileups:
            aligned_read = pileup_read.alignment

            #the reads that are likely to be misaligned are ignored
            mapping_quality = aligned_read.mapq
            if mapping_quality < min_mapq:
                continue

            try:
                read_group = aligned_read.opt('RG')
            except KeyError:
                read_group = UNKNOWN_RG
            read_name = aligned_read.qname

            if read_groups_info and read_group in read_groups_info:
                platform = read_groups_info[read_group]['PL']
            else:
                platform = default_bam_platform

            read_pos = pileup_read.qpos
            alleles_here, read_limits = _get_alleles_from_read(ref_allele,
                                                               ref_pos,
                                                               pileup_read)

            if read_edge_conf and platform in read_edge_conf:
                edge_left, edge_right = read_edge_conf[platform]
                #inside the edge regions to be ignored there's no allele
                #to add for this read, so we go for the next one
                if ((edge_left is not None and
                     read_limits[0] + edge_left > read_pos) or
                    (edge_right is not None and
                     read_pos > read_limits[1] - edge_right)):
                    continue

            for allele, kind, qual, is_reverse in alleles_here:
                _add_allele(alleles, allele, kind, read_name, read_group,
                            is_reverse, qual, mapping_quality,
                            read_groups_info)

        #remove N
        _remove_alleles_n(alleles)

        #the sanger reads with no quality get the default one
        _add_default_sanger_quality(alleles, default_sanger_quality,
                                    read_groups_info)

        #remove bad quality alleles
        _remove_bad_quality_alleles(alleles, min_quality)

        #check maf
        if not check_maf_ok(alleles, max_maf):
            continue

        #min_num_reads_for_allele
        _remove_alleles_by_read_number(alleles, min_num_reads_for_allele)

        if not alleles:
            continue

        #A variation is yielded when:
        # - there are more alleles than the minimum requested, OR
        # - only one allele is requested and the first one is not invariant,
        # - OR more than one is requested and there are at least that many.
        num_alleles = len(alleles)
        if num_alleles > min_num_alleles:
            is_variation = True
        elif min_num_alleles == 1:
            is_variation = next(iter(alleles))[1] != INVARIANT
        elif min_num_alleles > 1:
            is_variation = num_alleles >= min_num_alleles
        else:
            is_variation = False

        if is_variation:
            yield {'ref_name': ref_id,
                   'ref_position': ref_pos,
                   'reference_allele': ref_allele,
                   'alleles': alleles,
                   'read_groups': read_groups_info}