Python SeqRecord._set_seq Examples

Programming Language: Python
Namespace/Package Name: Bio.SeqRecord
Class/Type: SeqRecord
Method/Function: _set_seq
Examples at hotexamples.com: 1
Python SeqRecord._set_seq - 1 example found. These are the top-rated real-world Python examples of Bio.SeqRecord.SeqRecord._set_seq extracted from open source projects. You can rate examples to help us improve the quality of examples.
Frequently Used Methods
Show Hide
SeqRecord(30)
annotations(30)
__init__(8)
_al_start(4)
_al_stop(4)
__add__(2)
COMMENT(1)
replace(1)
populate_attribs(1)
direction(1)
attributes(1)
_set_seq(1)
add_conservation_features(1)
accession(1)
__repr__(1)
__getitem__(1)
split(1)
Example #1
Show file
File: conservation_profiler_general.py Project: YairGatt/ConservationProfiler
def _is_protein_source(source):
    """Return True for protein input and False for nucleotide input.

    Raises ValueError for any other *source*, instead of letting a later
    statement fail on an unbound local variable.
    """
    kind = source.lower()
    if kind in ("protein", "proteome"):
        return True
    if kind in ("nucleotide", "genome"):
        return False
    raise ValueError(
        "Unsupported source %r: expected protein/proteome or "
        "nucleotide/genome." % (source,))


def _remove_if_exists(path):
    """Delete the temp file *path* unless it was never created."""
    if os.path.exists(path):
        os.remove(path)


def _hit_name(proteome_name, number, total_hits, per_proteome_sequences):
    """Return the record name for hit *number*.

    The index is appended only when several hits may be returned and more
    than one hit was actually found.
    """
    several_allowed = not per_proteome_sequences or per_proteome_sequences > 1
    if several_allowed and total_hits > 1:
        return proteome_name + "_" + str(number)
    return proteome_name


def _find_blast_db(proteome_file):
    """Return the BLAST database prefix next to *proteome_file*, or None.

    A file named XXXX.file_type or XXXX.file_type.gz maps to a database
    called XXXX; the database is recognised by its .nhr or .phr file.
    """
    parts = proteome_file.split(".")
    stem_parts = parts[:-2] if proteome_file.endswith(".gz") else parts[:-1]
    stem = os.path.basename(".".join(stem_parts))
    # A bare file name has an empty dirname, which os.listdir() rejects.
    directory = os.path.dirname(proteome_file) or os.curdir
    db_name = None
    # No early exit: when several files match, the last one listed wins.
    for entry in os.listdir(directory):
        if stem not in entry:
            continue
        for extension in (".nhr", ".phr"):
            if extension in entry:
                db_name = os.path.join(directory,
                                       entry.split(extension)[0])
                break
    return db_name


def _supermatcher_alignments(proteome_file, ref_seq_file, proteome_name,
                             workdir, source, strand,
                             per_proteome_sequences, alignment_file):
    """Run Supermatcher and return the parsed hits as result tuples."""
    records = open_proteome(proteome_file)
    if not records:
        return []
    #Unlike BLAST, Supermatcher only searches one strand, so the records are
    #written to a temp file (the Supermatcher bsequence) holding either the
    #given or the reverse-complemented strand depending on "strand"
    if not strand:
        records = [
            record.reverse_complement(id=True,
                                      name=True,
                                      description=True,
                                      features=True,
                                      annotations=True,
                                      letter_annotations=True,
                                      dbxrefs=True) for record in records
        ]
    #Validate the source before any temp file is created
    matrix = "EBLOSUM62" if _is_protein_source(source) else "EDNAFULL"
    proteome_fasta_file = os.path.join(workdir, "curr_proteome.fasta")
    try:
        SeqIO.write(records, proteome_fasta_file, "fasta")
        cmd = SuperMatcherCommandline(asequence=ref_seq_file,
                                      bsequence=proteome_fasta_file,
                                      gapopen=10,
                                      gapextend=0.5,
                                      datafile=matrix,
                                      outfile=alignment_file)
        #Execute the command
        cmd()
        alignments = []
        try:
            #List of MultipleSeqAlignment objects (may be empty)
            align_seq_list = list(
                AlignIO.parse(alignment_file, "amir_emboss"))
            #Only the first alignment is used if per_proteome_sequences==1,
            #all of them if per_proteome_sequences==None
            for number, alignment in enumerate(
                    align_seq_list[0:per_proteome_sequences]):
                #SeqRecord objects, [0] is query and [1] is sbjct
                align_seq = alignment[1]
                #Remove gaps through the public property rather than the
                #private SeqRecord._set_seq setter behind it
                align_seq.seq = align_seq.seq.ungap("-")
                usable_name = _hit_name(proteome_name, number,
                                        len(align_seq_list),
                                        per_proteome_sequences)
                align_seq.name = usable_name
                align_seq.id = usable_name
                score, identity_percentage = parse_supermatcher_result(
                    alignment_file, number)
                #0 stands in for the e-value and 1 is given arbitrarily as
                #gene_percentage
                alignments.append(
                    (score, align_seq, identity_percentage, 0, 1))
        except (ValueError, IndexError):
            #A tuple is required here: "ValueError or IndexError" evaluates
            #to ValueError alone and lets IndexError escape
            raise NoMatchForSeqException(proteome_fasta_file, ref_seq_file)
        return alignments
    finally:
        #Delete temp files on success and on failure alike
        _remove_if_exists(alignment_file)
        _remove_if_exists(proteome_fasta_file)


def _blast_alignments(proteome_file, ref_seq_file, proteome_name, source,
                      per_proteome_sequences, alignment_file):
    """Run BLAST and return the parsed hits as result tuples."""
    db_name = _find_blast_db(proteome_file)
    records = open_proteome(proteome_file)
    if not records:
        return []
    if db_name is None:
        raise IOError(
            "No BLAST database (.nhr/.phr) found for %s" % (proteome_file,))
    #Build matching command
    if _is_protein_source(source):
        cmd = NcbiblastpCommandline(query=ref_seq_file,
                                    db=db_name,
                                    out=alignment_file,
                                    outfmt=5)
    else:
        cmd = NcbiblastnCommandline(query=ref_seq_file,
                                    db=db_name,
                                    out=alignment_file,
                                    outfmt=5,
                                    task="blastn")
    try:
        #Execute command
        cmd()
        #Open result; "with" closes the handle even when parsing fails
        try:
            with open(alignment_file) as result_handle:
                #BLAST record object
                blast_record = list(NCBIXML.parse(result_handle))[0]
        except ValueError:
            raise NoMatchForSeqException(proteome_file, ref_seq_file)
        #Parse resulting alignments
        alignments = []
        hits = blast_record.alignments
        query_length = blast_record.query_letters
        try:
            #Only the first alignment is used if per_proteome_sequences==1,
            #all of them if per_proteome_sequences==None
            for number, alignment in enumerate(
                    hits[0:per_proteome_sequences]):
                #HSP contains all the details about the alignment
                hsp = alignment.hsps[0]
                align_length = hsp.align_length
                identity_percentage = float(hsp.identities) / align_length
                #length percentage
                percentage = float(align_length) / query_length
                name = _hit_name(proteome_name, number, len(hits),
                                 per_proteome_sequences)
                #NOTE(review): the protein alphabet is used for nucleotide
                #sources as well - confirm this is intended
                align_seq = SeqRecord(
                    Seq(hsp.sbjct, IUPAC.protein).ungap("-"),  #no gaps
                    id=name,
                    name=name,
                    description=name)
                alignments.append((hsp.score, align_seq,
                                   identity_percentage, hsp.expect,
                                   percentage))
        except IndexError:
            #Deliberate best effort: an IndexError (e.g. a hit without any
            #HSP) stops parsing and the hits collected so far are returned
            pass
        return alignments
    finally:
        #Remove temp file on success and on failure alike
        _remove_if_exists(alignment_file)


def directed_local_alignment(proteome_file, ref_seq_file, proteome_name,
                             workdir, mode, source, strand,
                             per_proteome_sequences):
    """
    This function does the main lifting, running Supermatcher or BLAST on your proteomes/genomes.

    Parameters:
        proteome_file: path of the proteome/genome file to search in.
        ref_seq_file: path of the reference (query) sequence file.
        proteome_name: base name given to the returned records.
        workdir: directory in which temporary files are written.
        mode: "supermatcher" or "blast" (case-insensitive).
        source: "protein"/"proteome" or "nucleotide"/"genome"
            (case-insensitive).
        strand: Supermatcher only; when falsy the records are
            reverse-complemented before the search.
        per_proteome_sequences: upper bound on the number of hits used,
            None for all of them.

    Returns:
        A list of tuples (score, align_seq, identity_percentage, evalue,
        length_percentage) where align_seq is an ungapped SeqRecord. In
        Supermatcher mode the last two items are the placeholders 0 and 1.
        The list is empty when the proteome holds no records.

    Raises:
        NoMatchForSeqException: the alignment output could not be parsed.
        ValueError: source is not one of the supported values.
        IOError: BLAST mode and no database was found for proteome_file.
        BaseException: mode is neither Supermatcher nor BLAST.
    """
    #result file shared by both modes
    alignment_file = os.path.join(workdir, "curr_alignment.aln")
    #If Supermatcher was chosen as the alignment algorithm
    if mode.lower() == "supermatcher":
        return _supermatcher_alignments(proteome_file, ref_seq_file,
                                        proteome_name, workdir, source,
                                        strand, per_proteome_sequences,
                                        alignment_file)
    #If BLAST was chosen as the alignment algorithm
    if mode.upper() == "BLAST":
        return _blast_alignments(proteome_file, ref_seq_file, proteome_name,
                                 source, per_proteome_sequences,
                                 alignment_file)
    #not BLAST or Supermatcher!
    #BaseException is kept so that existing callers see the same type
    raise BaseException(
        "Only Supermatcher and BLAST modes are currently supported.")