def directed_local_alignment(proteome_file, ref_seq_file, proteome_name,
                             workdir, mode, source, strand,
                             per_proteome_sequences):
    """Align a reference sequence against one proteome/genome file.

    Runs either EMBOSS Supermatcher or NCBI BLAST (selected by ``mode``) with
    ``ref_seq_file`` as the query and returns the matched subject sequences.

    Args:
        proteome_file: Path of the proteome/genome file to search. In BLAST
            mode a BLAST database (``.nhr`` or ``.phr`` index) whose name
            contains the file's base name must exist in the same directory.
        ref_seq_file: Path of the query (reference) sequence file.
        proteome_name: Name given to the returned records. ``_<index>`` is
            appended when several hits exist and more than one may be
            returned.
        workdir: Directory in which the temporary files
            ``curr_alignment.aln`` and ``curr_proteome.fasta`` are created.
        mode: ``"supermatcher"`` or ``"blast"`` (case-insensitive).
        source: ``"protein"``/``"proteome"`` or ``"nucleotide"``/``"genome"``
            (case-insensitive); selects the scoring matrix or BLAST program.
        strand: Supermatcher mode only. When falsy, every record is
            reverse-complemented before the search.
        per_proteome_sequences: Maximum number of hits to return; ``None``
            returns all of them.

    Returns:
        A list of tuples ``(score, align_seq, identity_percentage, evalue,
        length_percentage)`` where ``align_seq`` is the ungapped subject
        record. In Supermatcher mode ``evalue`` is always 0 and
        ``length_percentage`` is always 1 (placeholders). In BLAST mode the
        identity is ``identities / align_length`` and the length value is
        ``align_length / query_length``. An empty list is returned when
        ``open_proteome`` yields no records.

    Raises:
        NoMatchForSeqException: The alignment output could not be parsed.
        ValueError: ``source`` is not one of the supported values.
        IOError: BLAST mode and no database index was found for the proteome.
        BaseException: ``mode`` is neither Supermatcher nor BLAST.
    """
    alignment_file = os.path.join(workdir, "curr_alignment.aln")

    if mode.lower() == "supermatcher":
        return _align_with_supermatcher(
            proteome_file, ref_seq_file, proteome_name, workdir,
            alignment_file, source, strand, per_proteome_sequences)

    if mode.upper() == "BLAST":
        return _align_with_blast(
            proteome_file, ref_seq_file, proteome_name, alignment_file,
            source, per_proteome_sequences)

    # BaseException is kept (instead of a narrower type) so that existing
    # callers that catch exactly this keep working.
    raise BaseException(
        "Only Supermatcher and BLAST modes are currently supported.")


def _remove_if_present(path):
    """Best-effort delete of a temp file; any OSError is ignored."""
    try:
        os.remove(path)
    except OSError:
        # Typically the file was never created because an earlier step failed.
        pass


def _is_protein_source(source):
    """Return True for protein input, False for nucleotide input."""
    kind = source.lower()
    if kind in ("protein", "proteome"):
        return True
    if kind in ("nucleotide", "genome"):
        return False
    raise ValueError(
        "Unsupported source %r: expected 'protein', 'proteome', "
        "'nucleotide' or 'genome'." % (source,))


def _find_blast_db(proteome_file):
    """Return the BLAST database path for proteome_file, or None if absent."""
    # XXXX.file_type.gz and XXXX.file_type both map to a database called XXXX.
    dropped_parts = 2 if proteome_file.endswith(".gz") else 1
    stem = os.path.basename(
        ".".join(proteome_file.split(".")[:-dropped_parts]))

    directory = os.path.dirname(proteome_file)
    db_name = None
    # A bare file name has an empty dirname; look in the current directory.
    for entry in os.listdir(directory or os.curdir):
        if stem not in entry:
            continue
        # The nucleotide index (.nhr) is checked before the protein one (.phr).
        for index_ext in (".nhr", ".phr"):
            if index_ext in entry:
                # No early exit: the last matching entry wins.
                db_name = os.path.join(directory, entry.split(index_ext)[0])
                break
    return db_name


def _align_with_supermatcher(proteome_file, ref_seq_file, proteome_name,
                             workdir, alignment_file, source, strand,
                             per_proteome_sequences):
    """Run Supermatcher on one strand and return the parsed hit tuples."""
    records = open_proteome(proteome_file)
    if not records:
        return []

    matrix = "EBLOSUM62" if _is_protein_source(source) else "EDNAFULL"

    # Unlike BLAST, Supermatcher only searches one strand, so the requested
    # strand is written to a temp file that is used as the bsequence.
    if not strand:
        records = [
            record.reverse_complement(id=True, name=True, description=True,
                                      features=True, annotations=True,
                                      letter_annotations=True, dbxrefs=True)
            for record in records
        ]

    proteome_fasta_file = os.path.join(workdir, "curr_proteome.fasta")
    try:
        SeqIO.write(records, proteome_fasta_file, "fasta")
        cmd = SuperMatcherCommandline(asequence=ref_seq_file,
                                      bsequence=proteome_fasta_file,
                                      gapopen=10,
                                      gapextend=0.5,
                                      datafile=matrix,
                                      outfile=alignment_file)
        cmd()

        alignments = []
        try:
            # List of MultipleSeqAlignment objects (may be empty).
            align_seq_list = list(AlignIO.parse(alignment_file,
                                                "amir_emboss"))
            number_names = ((not per_proteome_sequences
                             or per_proteome_sequences > 1)
                            and len(align_seq_list) > 1)
            for number, alignment in enumerate(
                    align_seq_list[0:per_proteome_sequences]):
                # Row [0] is the query, row [1] is the subject.
                align_seq = alignment[1]
                align_seq._set_seq(align_seq.seq.ungap("-"))

                if number_names:
                    usable_name = proteome_name + "_" + str(number)
                else:
                    usable_name = proteome_name
                align_seq.name = usable_name
                align_seq.id = usable_name

                score, identity_percentage = parse_supermatcher_result(
                    alignment_file, number)
                # 0 (evalue) and 1 (gene percentage) are arbitrary
                # placeholders so the tuple has the same shape as BLAST's.
                alignments.append(
                    (score, align_seq, identity_percentage, 0, 1))
        except (ValueError, IndexError):
            raise NoMatchForSeqException(proteome_fasta_file, ref_seq_file)
        return alignments
    finally:
        # Always clean up so a later call can never parse stale output.
        _remove_if_present(alignment_file)
        _remove_if_present(proteome_fasta_file)


def _align_with_blast(proteome_file, ref_seq_file, proteome_name,
                      alignment_file, source, per_proteome_sequences):
    """Run blastp/blastn against the proteome's database and parse the hits."""
    db_name = _find_blast_db(proteome_file)

    records = open_proteome(proteome_file)
    if not records:
        return []

    if db_name is None:
        raise IOError(
            "No BLAST database (.nhr/.phr) found for %s" % (proteome_file,))

    if _is_protein_source(source):
        cmd = NcbiblastpCommandline(query=ref_seq_file, db=db_name,
                                    out=alignment_file, outfmt=5)
    else:
        cmd = NcbiblastnCommandline(query=ref_seq_file, db=db_name,
                                    out=alignment_file, outfmt=5,
                                    task="blastn")

    try:
        cmd()

        try:
            with open(alignment_file) as result_handle:
                blast_record = list(NCBIXML.parse(result_handle))[0]
        except (ValueError, IndexError):
            raise NoMatchForSeqException(proteome_file, ref_seq_file)

        hits = blast_record.alignments
        number_names = ((not per_proteome_sequences
                         or per_proteome_sequences > 1)
                        and len(hits) > 1)
        alignments = []
        try:
            for number, alignment in enumerate(
                    hits[0:per_proteome_sequences]):
                # Only the first HSP of each hit is used.
                hsp = alignment.hsps[0]
                align_length = hsp.align_length
                identity_percentage = float(hsp.identities) / align_length
                percentage = float(align_length) / blast_record.query_letters

                if number_names:
                    name = proteome_name + "_" + str(number)
                else:
                    name = proteome_name

                # NOTE(review): the protein alphabet is applied to blastn
                # hits as well; kept unchanged from the original code.
                align_seq = SeqRecord(Seq(hsp.sbjct, IUPAC.protein),
                                      id=name,
                                      name=name,
                                      description=name)
                align_seq._set_seq(align_seq.seq.ungap("-"))
                alignments.append((hsp.score, align_seq, identity_percentage,
                                   hsp.expect, percentage))
        except IndexError:
            # A hit without HSPs stops the scan; the hits collected so far
            # are still returned.
            pass
        return alignments
    finally:
        _remove_if_present(alignment_file)