Example #1
def get_orthologs(blast1_fhand, blast2_fhand, sub1_def_as_acc=None, sub2_def_as_acc=None):
    """It return orthologs from two pools. It needs the xml output blast of the
    pools"""
    # First we have to get the hits from the first blast. We will put them in
    # a set
    blast1_hits = set()
    for hits in get_hit_pairs_from_blast(get_fhand(blast1_fhand), sub_def_as_acc=sub1_def_as_acc):
        blast1_hits.add(hits)

    # Now we check whether the hits in the second blast are also in the first
    for hits in get_hit_pairs_from_blast(get_fhand(blast2_fhand), sub_def_as_acc=sub2_def_as_acc):
        hits = (hits[1], hits[0])
        if hits in blast1_hits:
            yield hits
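
A minimal usage sketch, assuming two reciprocal blast XML outputs; the file names below are illustrative placeholders, not files from the original project:

# Hypothetical usage of get_orthologs with reciprocal blast XML outputs.
blast_a_vs_b = open('pool_a_vs_pool_b.blast.xml')
blast_b_vs_a = open('pool_b_vs_pool_a.blast.xml')
for seq_a, seq_b in get_orthologs(blast_a_vs_b, blast_b_vs_a):
    print('%s\t%s' % (seq_a, seq_b))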
Example #2
def create_cdna_intron_annotator(genomic_db, genomic_seqs_fhand):
    'It creates a function that annotates introns in cdnas matching the genomic db'
    genomic_seqs_fhand = get_fhand(genomic_seqs_fhand)
    genomic_seqs_index = SeqIO.index(genomic_seqs_fhand.name,
                                     guess_seq_file_format(genomic_seqs_fhand))
    def annotate_intron(sequence):
        'It adds the introns to the SeqFeatures'
        if sequence is None:
            return
        try:
            introns = infer_introns_for_cdna(sequence=sequence,
                                          genomic_db=genomic_db,
                                          genomic_seqs_index=genomic_seqs_index)
        except KeyError as error:
            error = str(error).lstrip('u').strip("'")
            if 'not found' in error:
                error += ' in seq file %s, but present in blast db %s' % \
                                           (genomic_seqs_fhand.name, genomic_db)
            raise RuntimeError(error)
        for intron_pos in introns:
            feature = SeqFeature(location=FeatureLocation(intron_pos,
                                                          intron_pos),
                                 type='intron',
                                 qualifiers={'genomic_db':genomic_db})
            sequence.features.append(feature)
        return sequence
    return annotate_intron
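
A sketch of how the returned closure might be applied; the blast database name, the fasta path and the cdnas iterable are assumptions made only for illustration:

# Hypothetical usage: annotate introns in SeqRecord-like cdna sequences.
# 'tomato_genome' (a blast db) and 'tomato_genome.fasta' are made-up names.
annotate_introns = create_cdna_intron_annotator('tomato_genome',
                                                open('tomato_genome.fasta'))
annotated_cdnas = [annotate_introns(cdna) for cdna in cdnas]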
Example #3
def create_vector_striper(vectors, vectors_are_blastdb=False):
    '''It returns a function capable of detecting vector sequences.

    The vectors can be a fhand to a fasta file or a blast database.
    The vectors will be detected by using blastn.
    '''
    if not vectors_are_blastdb:
        check_sequences_length(get_fhand(vectors), MAX_ADAPTOR_LENGTH)
    return _create_vector_striper(vectors, aligner='blastn',
                                  vectors_are_blastdb=vectors_are_blastdb,
                                  seqs_are_short=False,
                                  elongate_match_to_complete_adaptor=False)
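
A hedged usage sketch; the vector fasta path and the sequence variable are illustrative assumptions:

# Hypothetical usage: mark vector regions as trim segments in a sequence.
strip_vectors = create_vector_striper(open('vectors.fasta'))
sequence = strip_vectors(sequence)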
Example #4
def create_adaptor_striper(adaptors, elongate_match_to_complete_adaptor=True):
    '''It creates a function capable of detecting adaptor sequences.

    The adaptors should be a fhand to a fasta file with the adaptors in it.
    The adaptors will be detected by using blastn-short.
    '''
    fhand = get_fhand(adaptors)
    check_sequences_length(fhand, MIN_ADAPTOR_LENGTH, MAX_ADAPTOR_LENGTH)
    return _create_vector_striper(vectors=adaptors,
                                  aligner='blast_short',
                                  vectors_are_blastdb=False,
                                  seqs_are_short=True,
          elongate_match_to_complete_adaptor=elongate_match_to_complete_adaptor)
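
And the analogous sketch for adaptors; the file name is assumed, and the adaptor sequences must respect the MIN/MAX_ADAPTOR_LENGTH bounds checked above:

# Hypothetical usage: adaptors.fasta is a made-up path with short adaptor seqs.
strip_adaptors = create_adaptor_striper(open('adaptors.fasta'))
sequence = strip_adaptors(sequence)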
Example #5
def create_snv_annotator(bam_fhand, min_quality=45, default_sanger_quality=25,
                         min_mapq=15, min_num_alleles=1, max_maf=None,
                         read_edge_conf=None, default_bam_platform=None,
                         min_num_reads_for_allele=None, ploidy=2):
    'It creates an annotator capable of annotating the snvs in a SeqRecord'

    # the bam should have an index; create_bam_index builds it if it is missing
    bam_fhand = get_fhand(bam_fhand)
    create_bam_index(bam_fpath=bam_fhand.name)
    read_edge_conf = _normalize_read_edge_conf(read_edge_conf)

    bam = pysam.Samfile(bam_fhand.name, 'rb')

    # default min num_reads per allele and ploidy
    if min_num_reads_for_allele is None:
        min_num_reads_for_allele = DEFAUL_MIN_NUM_READS_PER_ALLELE
    if ploidy is None:
        ploidy = DEFAULT_PLOIDY

    def annotate_snps(sequence):
        'It annotates the snvs found in the sequence'
        for snv in _snvs_in_bam(bam, reference=sequence,
                                min_quality=min_quality,
                                default_sanger_quality=default_sanger_quality,
                                min_mapq=min_mapq,
                                min_num_alleles=min_num_alleles,
                                max_maf=max_maf,
                                read_edge_conf=read_edge_conf,
                                default_bam_platform=default_bam_platform,
                             min_num_reads_for_allele=min_num_reads_for_allele):
            snv = _summarize_snv(snv)
            location = snv['ref_position']
            type_ = 'snv'

            qualifiers = {'alleles':snv['alleles'],
                          'reference_allele':snv['reference_allele'],
                          'read_groups':snv['read_groups'],
                          'mapping_quality': snv['mapping_quality'],
                          'quality': snv['quality']}
            snv_feat = SeqFeature(location=FeatureLocation(location, location),
                              type=type_,
                              qualifiers=qualifiers)

            annotate_pic(snv_feat)
            annotate_heterozygosity(snv_feat, ploidy=ploidy)

            sequence.features.append(snv_feat)
        return sequence
    return annotate_snps
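
A usage sketch for the snv annotator; the BAM path and thresholds below are illustrative, and the BAM is assumed to be coordinate-sorted so that create_bam_index can build the index:

# Hypothetical usage: 'mapping.bam' is a made-up path to a sorted BAM file.
annotate_snvs = create_snv_annotator(open('mapping.bam', 'rb'),
                                     min_quality=45, min_mapq=15, ploidy=2)
for reference in references:      # assumes SeqRecord-like reference sequences
    reference = annotate_snvs(reference)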
Example #6
def num_seqs_in_file(seq_fhand, format=None):
    'It counts the seqs in a file.'
    seq_fhand = get_fhand(seq_fhand)
    if format is None:
        format = guess_seq_file_format(seq_fhand)

    if format == 'fasta':
        return count_str_in_file(seq_fhand, '^>')
    elif format == 'repr':
        class_name = SeqWithQuality.__name__.split('.')[-1]
        return count_str_in_file(seq_fhand, "^%s" % class_name)
    elif 'fastq' in format:
        return _num_seqs_in_fastq(seq_fhand)
    else:
        raise NotImplementedError('I can not count this format: %s' % format)
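
A usage sketch; the path is a placeholder and the format is guessed from the file when not given:

# Hypothetical usage: count the records in a (made-up) fastq file.
n_seqs = num_seqs_in_file(open('reads.fastq'))
print('%d seqs' % n_seqs)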
Example #7
def _get_descriptions_from_blasts(blasts):
    '''It gets a description from a list of blast outputs.
    The blast descriptions in the XML may need to be modified to remove junk.
    This depends on the blast XML, so each item of the list can be a blast or
    a dict with the blast and a function used to modify the description field.

    It tries to find the name in the first file, then in the second, etc.'''

    seq_annot = {}
    filters = [{'kind'           : 'best_scores',
                'score_key'      : 'expect',
                'max_score'      : 1e-20,
                'score_tolerance': 10}]
    for blast in blasts:
        blast_fhand = blast['blast']
        if 'modifier' in blast:
            modifier = blast['modifier']
        else:
            modifier = None
        blast_fhand = get_fhand(blast_fhand)
        blast = BlastParser(fhand=blast_fhand)
        filtered_results = filter_alignments(blast, config=filters)
        db_name = blast.db_name
        try:
            for match in filtered_results:
                try:
                    query = match['query'].id
                except AttributeError:
                    query = match['query'].name
                if query not in seq_annot:
                    match_hit = match['matches'][0]
                    description = match_hit['subject'].description
                    subject_name = match_hit['subject'].name
                    if modifier is not None:
                        description = modifier(description)
                    if description != "<unknown description>":
                        seq_annot[query] = {'description':description.strip(),
                                            'db_name':db_name,
                                            'subj_name': subject_name}
        except ExpatError as error:
            msg = str(error) + ':%s' % blast_fhand.name
            raise ExpatError(msg)
    return seq_annot
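
A sketch of the expected input structure; the XML paths and the modifier lambda are assumptions used only to illustrate the per-blast dict:

# Hypothetical usage: each dict holds a blast XML fhand and, optionally, a
# 'modifier' callable that cleans the description string.
blasts = [{'blast': open('against_swissprot.blast.xml'),
           'modifier': lambda descrip: descrip.split('|')[-1]},
          {'blast': open('against_arabidopsis.blast.xml')}]
descriptions = _get_descriptions_from_blasts(blasts)
for seq_name, annot in descriptions.items():
    print('%s\t%s\t%s' % (seq_name, annot['db_name'], annot['description']))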
Example #8
def _create_vector_striper(
    vectors, aligner, vectors_are_blastdb=False, seqs_are_short=False, elongate_match_to_complete_adaptor=False
):
    """It creates a function which will remove vectors from the given sequence.

    It looks for the vectors by comparing the sequence with a vector database.
    To do these alignments two programs can be used, exonerate and blast.
    Exonerate requires a fasta file with the vectors, while blast requires an
    indexed blast database.
    """
    # exonerate fails with sequences below 20 bp
    # blast_short starts to fail below 15 bases with 2% errors (although not
    # as badly as exonerate with 19 bp)

    # depending on the aligner program we need different parameters and filters
    # blast parameter value is taken from vecscreen parameters:
    # http://www.ncbi.nlm.nih.gov/VecScreen/VecScreen_docs.html
    vectors = get_fhand(vectors)
    parameters = {
        "blast_long": {
            "gapextend": "3",
            "gapopen": "3",
            "penalty": "-5",
            "expect": "700",
            "dust": "20 1 64",
            "searchsp": "1750000000000",
        },
        "blast_short": {"task": "blastn-short", "expect": "0.0001", "subject": vectors, "alig_format": 6},
    }

    # These filters apply to whole matches, not to match parts
    filters = {
        "blast_long": [
            {"kind": "score_threshold", "score_key": "identity", "min_score": 96},
            {"kind": "min_length", "min_num_residues": MIN_ADAPTOR_LENGTH, "length_in_query": False},
        ],
        "blast_short": [
            {"kind": "score_threshold", "score_key": "identity", "min_score": 89},
            {"kind": "min_length", "min_num_residues": 13, "length_in_query": False},
        ],
    }
    if vectors is None:
        aligner = None
    elif aligner == "blast_short" or aligner == "blastn":
        seq_type = "blast_short" if seqs_are_short else "blast_long"
        if vectors_are_blastdb:
            aligner = BlastAligner(database=vectors, parameters=parameters[seq_type], filters=filters[seq_type])
        else:
            aligner = BlastAligner(subject=vectors, parameters=parameters[seq_type], filters=filters[seq_type])

    def strip_vector_by_alignment(sequence):
        """It strips the vector from a sequence.

        It returns a stripped sequence with the longest segment without vector.
        """
        if sequence is None:
            return None
        if vectors is None:
            return sequence

        alignments = list(aligner.do_alignment(sequence))

        if elongate_match_to_complete_adaptor:
            _elongate_matches_to_complete_subject(alignments)

        alignment_matches = _get_non_matched_locations(alignments)

        _add_trim_segments(alignment_matches, sequence)
        return sequence

    return strip_vector_by_alignment
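
For reference, a sketch of how the public wrappers shown in Examples #3 and #4 call this private factory; the adaptor fasta path is a made-up placeholder:

# Hypothetical direct call, mirroring what create_adaptor_striper does.
strip_adaptors = _create_vector_striper(open('adaptors.fasta'),
                                        aligner='blast_short',
                                        vectors_are_blastdb=False,
                                        seqs_are_short=True,
                                        elongate_match_to_complete_adaptor=True)
sequence = strip_adaptors(sequence)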
Example #9
def _create_vector_striper(vectors, aligner, vectors_are_blastdb=False,
                           seqs_are_short=False,
                           elongate_match_to_complete_adaptor=False):
    '''It creates a function which will remove vectors from the given sequence.

    It looks for the vectors by comparing the sequence with a vector database.
    To do these alignments two programs can be used, exonerate and blast.
    Exonerate requires a fasta file with the vectors, while blast requires an
    indexed blast database.
    '''
    #exonerate fails with sequences below 20 bp
    #blast_short starts to fail below 15 bases with 2% errors (although not
    #as badly as exonerate with 19 bp)

    # depending on the aligner program we need different parameters and filters
    # blast parameter value is taken from vecscreen parameters:
    # http://www.ncbi.nlm.nih.gov/VecScreen/VecScreen_docs.html
    vectors = get_fhand(vectors)
    parameters = {'blast_long'    : {'gapextend': '3', 'gapopen':'3',
                                     'penalty':'-5', 'expect':'700',
                                     'dust':'20 1 64'},
                  'blast_short': {'task': 'blastn-short', 'expect': '0.0001',
                                  'subject': vectors, 'alig_format':6},
                 }

    #These filters apply to whole matches, not to match parts
    filters = {'blast_long':      [{'kind'     : 'score_threshold',
                                    'score_key': 'identity',
                                    'min_score': 96},
                                   {'kind'            : 'min_length',
                                    'min_num_residues': MIN_ADAPTOR_LENGTH,
                                    'length_in_query' : False}],
               'blast_short': [{'kind'    : 'score_threshold',
                                'score_key': 'identity',
                                'min_score': 89},
                               {'kind'            : 'min_length',
                                'min_num_residues': 13,
                                'length_in_query' : False}]
              }
    if vectors is None:
        aligner = None
    elif aligner == 'blast_short' or aligner == 'blastn':
        seq_type = 'blast_short' if seqs_are_short else 'blast_long'
        if vectors_are_blastdb:
            aligner = BlastAligner(database=vectors,
                                   parameters=parameters[seq_type],
                                   filters=filters[seq_type])
        else:
            aligner = BlastAligner(subject=vectors,
                                   parameters=parameters[seq_type],
                                   filters=filters[seq_type])

    def strip_vector_by_alignment(sequence):
        '''It strips the vector from a sequence.

        It returns a stripped sequence with the longest segment without vector.
        '''
        if sequence is None:
            return None
        if vectors is None:
            return sequence

        alignments = list(aligner.do_alignment(sequence))

        if elongate_match_to_complete_adaptor:
            _elongate_matches_to_complete_subject(alignments)

        alignment_matches = _get_non_matched_locations(alignments)

        segments = _get_longest_non_matched_seq_region_limits(sequence,
                                                              alignment_matches)

        if segments is None:
            return None

        segments = _get_non_matched_from_matched_locations([segments],
                                                            len(sequence))
        _add_trim_segments(segments, sequence)
        return sequence

    return strip_vector_by_alignment