Ejemplo n.º 1
0
    def _write_snv(self, sequence, snv):
        'Given an snv feature it writes a line in the vcf'
        items = [] #items to write
        items.append(get_seq_name(sequence))
        items.append(str(int(snv.location.start.position) + 1))
        id_ = snv.id
        if id_ == "<unknown id>":
            id_ = '.'
        items.append(id_)
        qualifiers = snv.qualifiers
        ref_seq = qualifiers['reference_allele'].replace('-', '')
        items.append(ref_seq)
        toprint_af, alternative_alleles = self._create_alternative_alleles(
                                                          qualifiers['alleles'])
        items.append(toprint_af)
        items.append(self._create_quality(qualifiers['alleles'],
                                          alternative_alleles))
        filters = self._create_filters(qualifiers)
        items.append(filters)
        try:
            items.append(self._create_info(qualifiers, alternative_alleles))
        except KeyError:
            print 'sequence', get_seq_name(sequence)
            print 'position', str(int(snv.location.start.position))
            raise

        items.append(self._create_genotypes(qualifiers, alternative_alleles))

        self._temp_fhand.write('%s\n' % '\t'.join(items))
        self._temp_fhand.flush()
Ejemplo n.º 2
0
 def test_get_seq_name():
     '''It checks which name get_seq_name gives to different objects.'''
     # an object with neither name nor id gets a long generated name
     assert len(get_seq_name('AA')) > 10

     # the name is preferred over the id
     with_name = SeqWithQuality(id='seqid', name='seqname', seq=Seq('ATGAT'))
     assert get_seq_name(with_name) == 'seqname'

     # without a name the id is used
     only_id = SeqWithQuality(id='seqid', seq=Seq('ATGAT'))
     assert get_seq_name(only_id) == 'seqid'

     # with neither of them a long generated name is returned again
     anonymous = SeqWithQuality(seq=Seq('ATGAT'))
     assert len(get_seq_name(anonymous)) > 10
Ejemplo n.º 3
0
    def write(self, sequence):
        '''Write a line for every snv feature found in the sequence.

        Every line has the snv name (sequence name, underscore, snv start
        position plus one), the literal SNP and the sequence around the snv
        with the alleles between brackets in the place of the snv.
        self.num_features is increased for each snv and self.fhand is flushed
        once all the snvs have been written.
        '''
        seq_name = get_seq_name(sequence)

        for snv in sequence.get_features(kind='snv'):
            self.num_features += 1
            position = snv.location.start.position
            ref_allele = snv.qualifiers['reference_allele']
            snv_name = "%s_%d" % (seq_name, position + 1)

            # self._length residues are requested at each side of the snv
            flank_start = position - self._length
            flank_end = position + self._length + 1
            if self._write_short:
                # the flanks are trimmed to the limits of the sequence
                flank_start = max(flank_start, 0)
                flank_end = min(flank_end, len(sequence))
            # NOTE(review): when self._write_short is false a negative
            # flank_start is used as it is in the slice -- confirm that this
            # is the intended behaviour

            left_flank = sequence[flank_start: position]
            right_flank = sequence[position + 1: flank_end]

            # first item of every allele key plus the reference allele; the
            # order in the output is the iteration order of the set
            allele_set = set(key[0] for key in
                                           snv.qualifiers['alleles'].keys())
            allele_set.add(ref_allele)
            alleles_str = "[" + "/".join(allele_set) + "]"

            flanked_snv = left_flank.seq + alleles_str + right_flank.seq
            self.fhand.write('%s,SNP,%s\n' % (snv_name, flanked_snv))
        self.fhand.flush()
 def go_annotator(sequence):
     '''It adds to the sequence the GO annotations found for its name.

     When the name of the sequence is in go_annotations the stored value is
     set in sequence.annotations['GOs']. A None sequence returns None, any
     other sequence is returned, annotated or not.
     '''
     if sequence is not None:
         name = get_seq_name(sequence)
         if name in go_annotations:
             sequence.annotations['GOs'] = go_annotations[name]
         return sequence
     return None
 def ortholog_annotator(sequence):
     '''It adds to the sequence the orthologs found for its name.

     The orthologs are stored in sequence.annotations under the key
     "<species>-orthologs". A sequence that raises a KeyError in the lookup
     is returned without changes and a None sequence returns None.
     '''
     if sequence is not None:
         seq_name = get_seq_name(sequence)
         try:
             sequence.annotations['%s-orthologs' % species] = \
                                                        orthologs[seq_name]
         except KeyError:
             # nothing to annotate for this sequence
             pass
         return sequence
     return None
Ejemplo n.º 6
0
def _change_names_in_files_by_seq(fhand_in, fhand_out, naming, file_format):
    '''It writes the sequences of fhand_in into fhand_out with new names.

    The sequences are read one by one, the new name is asked to
    naming.get_uniquename using the old one, it is set as both name and id
    of the sequence and the sequence is written using the same file_format
    that was used for reading.
    '''
    for seq in seqs_in_file(fhand_in, format=file_format):
        unique_name = naming.get_uniquename(get_seq_name(seq))
        seq.name = unique_name
        seq.id = unique_name
        write_seqs_in_file([seq], fhand_out, format=file_format)
 def descrition_annotator(sequence):
     '''It sets the description of the sequences with a known description.

     When the name of the sequence is in descriptions the description of the
     sequence is built with the description, db_name and subj_name items
     stored for it. A None sequence returns None, any other sequence is
     returned.
     '''
     if sequence is None:
         return None
     seq_name = get_seq_name(sequence)
     if seq_name in descriptions:
         hit = descriptions[seq_name]
         sequence.description = 'Similar to %s (%s:%s)' % (
                     hit['description'], hit['db_name'], hit['subj_name'])
     return sequence
Ejemplo n.º 8
0
 def write(self, sequence):
     '''It writes a line for every ortholog annotation of the sequence.

     Only the annotations whose key contains "ortholog" are written. The
     tab separated line has the text found before the first dash of the
     key, the sequence name and the annotation values joined by commas.
     self.num_features is increased for every written line and self.fhand
     is flushed at the end.
     '''
     seq_name = get_seq_name(sequence)
     for key, values in sequence.annotations.items():
         if 'ortholog' not in key:
             continue
         self.num_features += 1
         species = key.split('-')[0]
         joined_orthologs = ','.join(values)
         self.fhand.write('%s\t%s\t%s\n' % (species, seq_name,
                                            joined_orthologs))
     self.fhand.flush()
Ejemplo n.º 9
0
def temp_qual_file(seqs):
    '''It returns a temporary qual fasta file with the given sequences.

    For every sequence that is not None its name and its phred_quality
    letter annotation, separated by spaces, are written. The returned file
    is a NamedTemporaryFile with the .qual suffix, flushed and rewound to
    its start.
    '''
    qual_fhand = tempfile.NamedTemporaryFile(suffix='.qual')
    for seq in seqs:
        if seq is not None:
            seq_name = get_seq_name(seq)
            phreds = seq.letter_annotations["phred_quality"]
            qual_line = ' '.join([str(phred) for phred in phreds])
            qual_fhand.write('>%s\n%s\n' % (seq_name, qual_line))

    qual_fhand.flush()
    # the file is left ready to be read from the beginning
    qual_fhand.seek(0)
    return qual_fhand
Ejemplo n.º 10
0
 def write(self, sequence):
     '''It writes the dna and the peptide of every orf of the sequence.

     For each orf feature a fasta record with the dna qualifier goes to
     self.fhand and another one with the pep qualifier goes to
     self.pep_fhand. Both records carry the start and the end of the orf,
     each of them plus one, and its strand. self.num_features is increased
     and both files are flushed for every orf.
     '''
     for orf in sequence.get_features(kind='orf'):
         self.num_features += 1
         # NOTE(review): the name is asked once per orf, not once per
         # sequence -- confirm that this is intended
         seq_name = get_seq_name(sequence)
         orf_start = int(str(orf.location.start)) + 1
         orf_end = int(str(orf.location.end)) + 1
         strand = orf.qualifiers['strand']

         # both records are built before anything is written
         dna_fields = (seq_name, orf_start, orf_end, strand,
                       str(orf.qualifiers['dna']))
         dna_record = '>%s_orf_seq start=%d end=%d strand=%s\n%s\n' % \
                                                                 dna_fields
         pep_fields = (seq_name, orf_start, orf_end, strand,
                       str(orf.qualifiers['pep']))
         pep_record = '>%s_orf_pep start=%d end=%d strand=%s\n%s\n' % \
                                                                 pep_fields

         self.fhand.write(dna_record)
         self.pep_fhand.write(pep_record)
         self.fhand.flush()
         self.pep_fhand.flush()
Ejemplo n.º 11
0
    def reference_in_list_filter(sequence):
        '''It stores in the snvs if the sequence name is in seq_list.

        The result, True when the name is in seq_list, is added to every snv
        feature under the ref_not_in_list filter name. The snvs that already
        have a result for that filter are left as they are. A None sequence
        returns None, any other sequence is returned.
        '''
        if sequence is None:
            return None

        # NOTE(review): the filter is named ref_not_in_list but the stored
        # value is True when the name IS in the list -- confirm the meaning
        in_list = get_seq_name(sequence) in seq_list

        for snv in sequence.get_features(kind='snv'):
            # the result is only added where it was not calculated before
            if _get_filter_result(snv, 'ref_not_in_list') is None:
                _add_filter_result(snv, 'ref_not_in_list', in_list)
        return sequence
Ejemplo n.º 12
0
 def write(self, sequence):
     '''It writes a line for every microsatellite of the sequence.

     The tab separated fields are: sequence name, start and end (each of
     them is the location value plus one), length, score, type, unit and
     number of repeats (length divided by the unit length).
     self.num_features is increased and self.fhand is flushed for every
     microsatellite.
     '''
     seq_name = get_seq_name(sequence)
     for ssr in sequence.get_features(kind='microsatellite'):
         self.num_features += 1
         ssr_start = int(str(ssr.location.start)) + 1
         ssr_end = int(str(ssr.location.end)) + 1
         score = int(ssr.qualifiers['score'])
         ssr_type = ssr.qualifiers['type']
         unit = ssr.qualifiers['unit']
         # both limits are counted in the length
         ssr_length = ssr_end - ssr_start + 1
         num_repeats = ssr_length / len(unit)
         fields = (seq_name, ssr_start, ssr_end, ssr_length, score,
                   ssr_type, unit, num_repeats)
         self.fhand.write('%s\t%d\t%d\t%d\t%d\t%s\t%s\t%d\n' % fields)
         self.fhand.flush()
Ejemplo n.º 13
0
def _write_fasta_file(seqs, fhand_seq, default_quality=None, fhand_qual=None):
    '''It writes the given sequences as fasta in the given file handles.

    seqs -- a single sequence (anything with a name attribute) or an
            iterable of sequences. The None items are skipped.
    fhand_seq -- file handle in which the fasta sequences are written.
    default_quality -- quality assigned to every position of the sequences
                       that have no phred_quality letter annotation. It is
                       only used when fhand_qual is given. None means that
                       there is no default; 0 is a valid default quality.
    fhand_qual -- optional file handle in which the qualities are written.

    An AttributeError is raised when the qualities are requested and a
    sequence has no quality and there is no default quality, or when its
    quality is None. Nothing is returned; the handles are flushed, not
    closed.
    '''
    try:
        # Is seqs an seq or an iter??
        #pylint:disable-msg=W0104
        seqs.name
        seqs = [seqs]
        #pylint:disable-msg=W0704
    except AttributeError:
        pass
    for seq in seqs:
        if seq is None:
            continue
        name = get_seq_name(seq)
        if fhand_qual is not None:
            try:
                quality = seq.letter_annotations["phred_quality"]
            except (AttributeError, KeyError):
                # the comparison is done with None because a default quality
                # of 0 is false but it is a perfectly valid quality
                if default_quality is not None:
                    quality = [default_quality] * len(seq)
                else:
                    msg = 'Sequence must have a phred_quality letter annotation'
                    raise AttributeError(msg)
            if quality is not None:
                quality = [str(qual) for qual in quality]
                fhand_qual.write(fasta_str(' '.join(quality), name))
            else:
                raise AttributeError('Quality can not be empty')
        try:
            desc = seq.description
        except AttributeError:
            desc = None
        # the placeholder description is not written
        if desc == "<unknown description>":
            desc = None
        fasta_seq = fasta_str(seq, name, desc)
        fhand_seq.write(fasta_seq)

    fhand_seq.flush()
    if fhand_qual is not None:
        fhand_qual.flush()
Ejemplo n.º 14
0
def _snvs_in_bam(bam, reference, min_quality, default_sanger_quality,
                 min_mapq, min_num_alleles, max_maf, min_num_reads_for_allele,
                 read_edge_conf=None, default_bam_platform=None):
    '''It yields the snv information for every snv in the given reference.

    The pileup columns that the bam has for the reference are walked. For
    each column the alleles found in the reads are collected and filtered,
    and a dict with the keys ref_name, ref_position, reference_allele,
    alleles and read_groups is yielded if the column passes the filters.

    bam -- alignment object; its pileup and getrname methods are used
           (presumably a pysam file, TODO confirm).
    reference -- sequence with a seq attribute; its name is obtained with
                 get_seq_name.
    min_quality -- passed to _remove_bad_quality_alleles.
    default_sanger_quality -- passed to _add_default_sanger_quality.
    min_mapq -- the reads with a lower mapping quality are ignored.
    min_num_alleles -- converted to int, it is used to decide if a column is
                       yielded.
    max_maf -- passed to check_maf_ok; the columns that fail are skipped.
    min_num_reads_for_allele -- passed to _remove_alleles_by_read_number.
    read_edge_conf -- optional mapping with the platforms as keys and
                      (edge_left, edge_right) as values; any of the edges
                      can be None.
    default_bam_platform -- platform used when the bam has no read group
                            information, it is required in that case.

    A ValueError is raised if there is no read group information and no
    default platform has been given. The module level SEGMENTS_CACHE is
    emptied when the generator starts to run.
    '''

    min_num_alleles = int(min_num_alleles)

    # without read groups in the bam one fake read group is created with the
    # default platform
    read_groups_info = get_read_group_info(bam)
    if not read_groups_info:
        if default_bam_platform is None:
            msg = 'Platform is not present either in header or in '
            msg += 'configuration'
            raise ValueError(msg)
        read_groups_info = {UNKNOWN_RG:{'PL':default_bam_platform}}

    reference_id = get_seq_name(reference)
    reference_seq = reference.seq
    reference_len = len(reference_seq)
    #we can clean the cache of segments because we're in a new molecule
    global SEGMENTS_CACHE
    SEGMENTS_CACHE = {}
    for column in bam.pileup(reference=reference_id):
        # alleles found in this column, filled by _add_allele
        alleles = {}
        ref_pos = column.pos
        # the columns located beyond the end of the reference are skipped
        if ref_pos >= reference_len:
            continue
        ref_id = bam.getrname(column.tid)
        ref_allele = reference_seq[ref_pos].upper()
        for pileup_read in column.pileups:
            #for each read in the column we add its allele to the alleles dict
            aligned_read = pileup_read.alignment

            read_mapping_qual = aligned_read.mapq
            #We ignore the reads that are likely to be misaligned
            if read_mapping_qual < min_mapq:
                continue

            # the reads without a RG tag go to the unknown read group
            try:
                read_group = aligned_read.opt('RG')
            except KeyError:
                read_group = UNKNOWN_RG

            read_name = aligned_read.qname
            # the platform comes from the read group or, if the read group is
            # not known, from the default
            if read_groups_info and read_group in read_groups_info:
                platform = read_groups_info[read_group]['PL']
            else:
                platform = default_bam_platform

            read_pos = pileup_read.qpos

            alleles_here, read_limits = _get_alleles_from_read(ref_allele,
                                                               ref_pos,
                                                               pileup_read)

            if read_edge_conf and platform in read_edge_conf:
                edge_left, edge_right = read_edge_conf[platform]

                #if we're in the edge region to be ignored we continue to
                #the next read, because there's no allele to add for this one.

                if (edge_left is not None and
                    read_limits[0] + edge_left > read_pos):
                    continue
                if (edge_right is not None and
                    read_pos > read_limits[1] - edge_right):
                    continue

            # every item is unpacked into allele, kind, quality and strand
            for allele in alleles_here:
                allele, kind, qual, is_reverse = allele
                _add_allele(alleles, allele, kind, read_name, read_group,
                    is_reverse, qual, read_mapping_qual,
                    read_groups_info)

        #remove N
        _remove_alleles_n(alleles)

        #add default sanger qualities to the sanger reads with no quality
        _add_default_sanger_quality(alleles, default_sanger_quality,
                                    read_groups_info)

        #remove bad quality alleles
        _remove_bad_quality_alleles(alleles, min_quality)

        #check maf
        if not check_maf_ok(alleles, max_maf):
            continue

        # min_num_reads_for_allele
        _remove_alleles_by_read_number(alleles, min_num_reads_for_allele)

        #if there are a min_num number of alleles requested and there are more
        #alleles than that
        #OR
        #there is some allele different than invariant
        #a variation is yield
        if not alleles:
            continue
        # alleles.keys()[0] requires keys() to return a list (Python 2); the
        # second item of the key is compared with INVARIANT, so the keys are
        # presumably (allele, kind) tuples -- TODO confirm
        if (len(alleles) > min_num_alleles or
            (min_num_alleles == 1 and alleles.keys()[0][1] != INVARIANT) or
            (min_num_alleles > 1 and len(alleles) >= min_num_alleles)):
            yield {'ref_name':ref_id,
                   'ref_position':ref_pos,
                   'reference_allele':ref_allele,
                   'alleles':alleles,
                   'read_groups':read_groups_info}