# Expected to fail pass # Show the alignment for i, alignment in enumerate(alignments): if i < 3 or i + 1 == t_count: print(" Alignment %i, with %i sequences of length %i" \ % (i, len(alignment), alignment.get_alignment_length())) print(alignment_summary(alignment)) elif i == 3: print(" ...") # Check AlignInfo.SummaryInfo likes the alignment summary = AlignInfo.SummaryInfo(alignment) dumb_consensus = summary.dumb_consensus() #gap_consensus = summary.gap_consensus() if t_format != "nexus": # Hack for bug 2535 pssm = summary.pos_specific_score_matrix() rep_dict = summary.replacement_dictionary() try: info_content = summary.information_content() except ValueError as e: if str( e ) != "Error in alphabet: not Nucleotide or Protein, supply expected frequencies": raise e pass
def align_by_phylotype(input_fasta,input_cluster,input_metadata,output_folder):
    """Align the sequences of each phylotype with MAFFT and write the consensuses.

    Sequences from ``input_fasta`` are assigned to the first phylotype of
    ``input_cluster`` whose dot-separated name is a prefix of the sequence's
    lineage (taken from the "header"/"lineage" columns of ``input_metadata``).
    Every phylotype with more than two sequences is aligned with ``mafft`` and
    its consensus is collected into ``lineage_consensus.fasta``.

    Args:
        input_fasta: FASTA file holding the sequences.
        input_cluster: text file with one phylotype name per line.
        input_metadata: CSV file with "header" and "lineage" columns.
        output_folder: prefix that is string-concatenated with every output
            file name, so it has to end with a path separator.

    Side effects:
        Writes ``align_phylo.log``, one ``lineage_<phylotype>_alignment.fasta``
        per aligned phylotype and ``lineage_consensus.fasta``; prints the
        number of sequences per phylotype.

    Raises:
        OSError: if the ``mafft`` executable cannot be started.
    """
    import subprocess  # local import: only needed for the MAFFT call

    metadata_dic = {}
    phylotype_dic = {}
    seq_dic = {}
    consensus_dic = {}

    with open(input_metadata,"r") as f:
        for row in csv.DictReader(f):
            metadata_dic[row["header"]] = row["lineage"]

    for record in SeqIO.parse(input_fasta, 'fasta'):
        seq_dic[record.id] = record.seq

    with open(input_cluster,"r") as f:
        for line in f:
            phylotype_dic[line.rstrip()] = []

    for cluster, members in phylotype_dic.items():
        # The cluster name is loop-invariant, so split it once per cluster.
        cluster_type = cluster.split(".")
        cluster_length = len(cluster_type)
        for seq_id, phylotype in metadata_dic.items():
            # A lineage shorter than the cluster name can never match: the
            # slice is then shorter than cluster_type.
            if phylotype.split(".")[:cluster_length] != cluster_type:
                continue
            if seq_id in seq_dic:
                # pop() so that a sequence only ever lands in one phylotype.
                members.append([seq_id, seq_dic.pop(seq_id), phylotype])

    print("Clade","Number of Sequences")
    for key, members in phylotype_dic.items():
        print(key,len(members))

    with open(output_folder+"align_phylo.log","w") as log_file:
        for seq in seq_dic:
            # A FASTA record may have no metadata row at all.
            lineage = metadata_dic.get(seq, "unknown")
            log_file.write("Sequence " + seq + " with lineage " + lineage + " did not fall into any of the phylotypes stated in the cluster file.\n")

        for key, members in phylotype_dic.items():
            if len(members) > 2:
                outfile_name = output_folder + "lineage_" + key + ".fasta"
                with open(outfile_name,"w") as outfile:
                    for seq_id, seq, _ in members:
                        record = SeqRecord(seq,id=seq_id,description="")
                        SeqIO.write(record, outfile, "fasta")

                alignment_name = outfile_name[:-6] + "_alignment.fasta"
                # Argument list + explicit stdout redirect instead of a shell
                # string, so file names with spaces or shell metacharacters
                # are passed to mafft unchanged.
                with open(alignment_name,"w") as aligned:
                    subprocess.call(["mafft", outfile_name], stdout=aligned)
                os.remove(outfile_name)

                alignment = AlignIO.read(alignment_name, 'fasta')
                consensus_name = key + "_consensus"
                summary_align = AlignInfo.SummaryInfo(alignment)
                consensus_seq = summary_align.dumb_consensus(threshold=0.0,ambiguous='N')
                consensus_dic[consensus_name] = consensus_seq
            else:
                log_file.write("Phylotype " + key + " does not have more than 2 sequences for an alignment to work.\n")

    with open(output_folder+"lineage_consensus.fasta","w") as consensus_file:
        for key, value in consensus_dic.items():
            record = SeqRecord(value,id=key,description="")
            SeqIO.write(record, consensus_file, "fasta")
def buildGSSP( vgene ):
    """Build the mutability profile rows for one V gene.

    For each requested profile the germline sequence(s) of `vgene` are aligned
    with MUSCLE against either all sequences in masterList[vgene]
    (--profiles == 0) or a random subset of --numSequences of them. Columns
    that are gaps in a germline row are removed, a position specific score
    matrix is built, and the germline residues are zeroed out so that only
    substitutions remain.

    Relies on the module-level names masterList, germList, arguments,
    prj_tree, muscle, mask and aa_list.

    Returns:
        A list with one row per (profile, position):
        [vgene, profile number, 1-based position, germline residue(s),
        fraction of non-germline observations or "None", then one frequency
        string per entry of aa_list]. An empty list is returned when the gene
        is skipped.
    """
    results = []

    if len(masterList[vgene]) < arguments["--numSequences"]:
        print( "Skipping %s, not enough sequences (%d)..." % ( vgene, len(masterList[vgene]) ) )
        return []

    if vgene not in germList:
        print( "Skipping %s, it's not in the germline database..." %vgene )
        return []

    # Take random overlapping subsets to generate multiple profiles
    # need to add back a sanity check for capping the number of subsets if there's not enough raw data.
    numProfiles = arguments['--profiles']
    if arguments["--profiles"] == 0:
        numProfiles = 1

    # Loop invariants: germline ids (for finding a germline row in the
    # alignment) and the prefix of the temporary alignment files.
    germIDs = set( g.id for g in germList[vgene] )
    tempFile = "%s/work/mGSSP/%s_profileBuilder" % (prj_tree.home, vgene)

    success = 0

    for i in range(numProfiles):
        seqs = [] + germList[vgene] #force a copy rather than an alias
        if arguments["--profiles"] == 0:
            seqs += list(masterList[vgene])
        else:
            #get our sequence subset and add it to the germlines
            seqs += list(numpy.random.choice(masterList[vgene], size=arguments["--numSequences"], replace=False))

        #write everything to a temporary file for alignment
        with open("%s.fa"%tempFile, "w") as temp:
            SeqIO.write(seqs,temp,"fasta")

        muscle_cline = MuscleCommandline(cmd=muscle, input="%s.fa"%tempFile, out="%s.aln"%tempFile)

        #try to speed up the process a little bit for large datasets
        #still going to max out at ~50k seqs per profile (probably)
        muscle_cline.maxiters = 2
        muscle_cline.diags = True

        try:
            muscle_cline()
        except Exception:
            # Best effort: a failed alignment only skips this profile.
            # (Exception rather than a bare except, so Ctrl-C still aborts.)
            print( "Error in alignment #%d for %s (skipping)" % (i+1, vgene) )
            for f in glob.glob("%s.*"%tempFile):
                os.remove(f)
            continue

        alignment = AlignIO.read("%s.aln"%tempFile, "fasta")#"clustal")
        success += 1

        #Input order is not maintained, so we need a little
        # kludge to find a germline sequences. Use the
        # first one to remove any insertions from the alignment
        germRow = 0
        for n, rec in enumerate(alignment):
            if rec.id in germIDs:
                germRow = n
                break

        #look for gaps one at a time so we don't get tripped up by shifting indices
        gap = re.search( "-+", str(alignment[germRow].seq) )
        while (gap):
            alignment = alignment[:, 0:gap.start()] + alignment[:, gap.end():]
            gap = re.search( "-+", str(alignment[germRow].seq) )

        #Now we get BioPython to make a PSSM for us. To convert that into
        # a mutability profile, we will delete the germline residue[s]
        # at each position (but save what they were)
        germRes = defaultdict(Counter)
        summary_align = AlignInfo.SummaryInfo(alignment)
        pssm = summary_align.pos_specific_score_matrix(chars_to_ignore=['-','X'])

        #get number of datapoints at each position (might be different than the number of sequences in the profile if there are gaps or missing data
        # do this by using sum(pos.values()) after ignoring missing data (previous line) but before dumping germline residues.
        denominator = []
        for p,pos in enumerate(pssm):
            denominator.append( sum(pos.values()) - len(germList[vgene]) )

        for germ in germList[vgene]:
            for pos, residue in enumerate(germ):
                if residue == "X":
                    continue
                germRes[pos][residue] += 1
                pssm[pos][residue] = 0

        #normalize and save
        for p, pos in enumerate(pssm):
            germAA = ",".join([ x[0] for x in germRes[p].most_common() ])
            # Non-germline observations left at this position.
            total = sum(pos.values())
            if p < mask[vgene] or denominator[p] < arguments["--numSequences"]:
                mutability = "None"
            else:
                mutability = "%.5f"%(total/denominator[p])
            results.append( [ vgene, i+1, p+1, germAA, mutability ] +
                            [ "%.5f"%(pos.get(r,0)/total) if total > 0 else "0.00" for r in aa_list ] )

        #clean up
        for f in glob.glob("%s.*"%tempFile):
            os.remove(f)

    print( "Successfully built %d/%d profiles for %s using %d sequences!" % ( success, numProfiles, vgene, len(seqs)-len(germList[vgene]) ) )

    return results
def generateIntron2Consensus(alleleFullList, outputDirectory):
    """Write Intron 2 reference FASTA files and per-group ClustalW consensuses.

    Alleles of `alleleFullList` that list 'Intron 2' among their
    featuresInFullSequence are copied, written to
    <outputDirectory>/Intron2References/HLA_Intron2.fasta and then processed
    per allele group / gene: groups with more than one allele are aligned with
    ClustalW and their dumb consensus is written out; other groups are written
    unchanged as both group and consensus file.
    """
    # TODO: This method does not seem to work anymore. I am not assigning the in2Sequence anywhere.
    # Do I need this code anymore? Why would I want to simulate an Intron 2 consensus sequence?
    featureName = 'Intron 2'
    shortFeatureName = featureName.replace(' ', '')

    #Im deciding to quit here. Late enough. I want to fix this method tomorrow.
    referenceFolder = join(outputDirectory, shortFeatureName + 'References')
    print ('Creating a ' + featureName + ' Reference:' + join(referenceFolder, 'HLA_Intron2.fasta'))

    intron2Alleles = []
    for sourceAllele in alleleFullList:
        #TODO fix featuresInFullSequence. Might work this way.
        if 'Intron 2' not in sourceAllele.featuresInFullSequence:
            continue
        intronCopy = sourceAllele.copy()
        #TODO I don't know if i'm still gonna use in2Sequence.
        intronCopy.sequence = sourceAllele.in2Sequence
        intron2Alleles.append(intronCopy)

    # Intron 2 output file, for analyizing *just* the intron 2
    outputIn2FileName = join(join(outputDirectory, 'Intron2References'), 'HLA_Intron2.fasta')
    printFasta(intron2Alleles, outputIn2FileName, False, False, False)

    # Print outputfiles and info for each allele group.
    print ('Generating output files for each HLA Allele Group')
    combinedAlleleGroups = getAlleleGroups(intron2Alleles) + getAlleleGenes(intron2Alleles)
    groupCount = len(combinedAlleleGroups)

    for position, group in enumerate(combinedAlleleGroups, 1):
        print('(' + str(position) + '/' + str(groupCount) + '): ' + group.FileName)
        groupFile = join(outputDirectory, join('Intron2References', group.FileName))
        alignmentFile = groupFile.replace('.fasta', '.aln')
        consensusFile = groupFile.replace('.fasta', '.consensus.fasta')

        # There is only one allele in this group.
        if not (len(group.Alleles) > 1):
            print ('Only one allele found. Writing to file: ' + groupFile)
            #writing it out twice, that's kind of silly but whatever.
            printFasta([group.Alleles[0]], groupFile, True, False, False)
            printFasta([group.Alleles[0]], consensusFile, True, False, False)
            continue

        print (str(len(group.Alleles)) + ' Alleles Found. Writing to file: ' + groupFile)
        # Print allele group to a fasta file
        # So this should actually be a false, I don't want to use the APD sequence here.
        printFasta(group.Alleles, groupFile, False, False, False)

        # An existing alignment means nothing else is done for this group.
        if os.path.isfile(alignmentFile):
            print('This alignment file ' + alignmentFile + ' already exists. Moving on...')
            continue

        clustalwCommandline = ClustalwCommandline("clustalw", infile=groupFile, outfile=alignmentFile)
        print ('Performing ClustalW Alignment : \n' + str(clustalwCommandline))
        #Perform the alignment
        clustalwCommandline()

        # sanity check to make sure it exists.
        if not os.path.isfile(alignmentFile):
            print ('Cannot find alignment file after completing alignment:' + alignmentFile)
            continue

        if os.path.isfile(consensusFile):
            print ('Consensus file ' + consensusFile + ' already exists. Moving on...')
            continue

        #Find consensus
        align = AlignIO.read(alignmentFile, 'clustal')
        print ('Consensus FileName = ' + consensusFile)
        summary_align = AlignInfo.SummaryInfo(align)
        dumb_consensus = summary_align.dumb_consensus()
        # The gap consensus is still computed, although only the dumb
        # consensus is written out below.
        gap_consensus = summary_align.gap_consensus()

        # Print Consensus to fasta.
        # I can cheat and just create an HLA_Allele object, and print that.
        consensusAllele = HLA_Allele()
        consensusAllele.APDSequence = str(dumb_consensus)
        consensusAllele.alleleName = os.path.basename(consensusFile).replace('.fasta', '')
        consensusAllele.outputDirectory = outputDirectory
        printFasta([consensusAllele], consensusFile, True, False, False)
def closestToConsensus(linIt):
    """Pick one representative sequence per lineage.

    Each item of `linIt` is a mapping with a 'name' (base name of the
    lineage's FASTA file under prj_tree.lineage) and a 'desc' mapping from
    sequence id to description. Singleton lineages are read back as is. For
    the others vsearch clusters the file, the first cluster of its MSA output
    is extracted, and the member with the highest summed PSSM score (weighted
    by the vsearch size annotation) is returned, ungapped and with its
    original id and description restored.

    Returns:
        A list with one SeqRecord per lineage, in input order.
    """
    results = []
    print("Starting vsearch on a new chunk...")

    for lineage in linIt:
        lineageFasta = "%s/%s.fa" % (prj_tree.lineage, lineage['name'])

        #save time on singletons (if they weren't excluded by minSeq)
        if len(lineage['desc']) == 1:
            with open(lineageFasta, "r") as handle:
                results.append(SeqIO.read(handle, 'fasta'))
            continue

        msaFile = "%s/%s_msa.fa" % (prj_tree.lineage, lineage['name'])
        biggestFile = "%s/%s_msaBiggest.fa" % (prj_tree.lineage, lineage['name'])

        #cluster and rapid align with vsearch
        # The devnull handle is closed as soon as vsearch returns; it used to
        # be leaked once per lineage.
        with open(os.devnull, 'w') as FNULL: #don't clutter up output with tons of vsearch messages
            subprocess.call([vsearch, "-cluster_size", lineageFasta,
                             "-id", "0.97", "-sizein", "-sizeout",
                             "-msaout", msaFile, "-clusterout_sort"],
                            stdout=FNULL, stderr=subprocess.STDOUT)

        #extract biggest cluster
        with open(msaFile, "r") as allClusters, open(biggestFile, "w") as biggestOnly:
            next(allClusters) # the first line of the MSA output is discarded
            for line in allClusters:
                if "consensus" in line:
                    break
                biggestOnly.write(line)

        #open the msa
        with open(biggestFile, "r") as handle:
            aln = AlignIO.read(handle, "fasta")

        #add derep size to alignment as weighting
        for rec in aln:
            rec.annotations['weight'] = int(rec.id.split(";")[1].split("=")[1])

        pssm = AlignInfo.SummaryInfo(aln).pos_specific_score_matrix()

        #score each sequence and save the best one
        scores = dict()
        for record in aln:
            scores[record.id] = sum(pssm[i][l] for i, l in enumerate(record))

        # max() returns the first record with the top score, which is what the
        # stable descending sort used previously put in front.
        best = max(aln, key=lambda rec: scores[rec.id])
        best.seq = best.seq.ungap("-") #remove gaps
        best.id = best.id.split(";")[0] #remove vsearch size annotation
        best.id = re.sub(r"^\*", "", best.id) #get rid of possible annotation from vsearch
        best.description = lineage['desc'][best.id] #restore original info
        results.append(best)

    return results
def get_consensus(self):
    '''Return the dumb consensus sequence of this cluster's alignment (self.msa)'''
    return AlignInfo.SummaryInfo(self.msa).dumb_consensus()
def germline_from_imgt(sequence_file, germline_lib, species_name, output_file, option, report, error, fixed_mut):
    """
    Infer germline sequences for the records of an IMGT nucleotide file.

    Reads sequence_file (tab- or comma-delimited, keyed on "Sequence ID"),
    matches the V, D and J regions of every record that has a JUNCTION
    against the germline library, and writes the records selected by option
    to output_file in FASTA format.

    Arguments:
      sequence_file - path of the delimited input file
      germline_lib  - germline library file passed to Germlib
      species_name  - species name passed to Germlib
      output_file   - destination passed to SeqIO.write
      option        - string of option letters, each one of 'ciofvjx':
                        c - add consensus records for the f/v/j germlines
                        i - output the input sequences, trimmed to whole codons
                        o - output the inferred germline records
                        f - infer the 'full' germline
                        v - infer the germline with everything after the V gene gapped
                        j - infer the germline with only the V, D and J genes ungapped
                        x - report per-record detail
      report        - callable that receives progress and error messages
      error         - callable that receives the unrecognised-option message
      fixed_mut     - if > 0, report mutations, insertions and deletions common to
                      all sequences of each germline that has at least this many
                      sequences

    Returns None. Nothing is written if an option is unrecognised or if the
    germline library or the input file cannot be read.

    Test cases mainly for Mutation Analysis:

    Identical sequences with varying lengths at the 5' end
    >>> germline_from_imgt("Testfiles/alleles/varyinglengths.txt", "Testfiles/alleles/imgt_germlines.fasta", "Homo sapiens", "Testfiles/alleles/varyinglengths_out.fasta", "v", doctest_report, doctest_report, 3)
    Processing successfully completed.
    Mutation Analysis:
    IGHV4-34*01 (5 sequences):
     Common mutations: c67g, g88c, t96g, a97c, g103c, t158a, a165g, g166c, c179t, c180g, c181t, g182t, a189g, t207c, c208a, a209c, g210a, a212t, c221t, a229g, g230a, a241g, c248t, t249g, g273a, t274a, g275a, t278c, a280t
    germline: caggtgcagctacagcagtggggcgcaggactgttgaagccttcggagaccctgtccctcacctgcgctgtctatggtgggtccttcagtggttactactggagctggatccgccagcccccagggaaggggctggagtggattggggaaatcaatcatagtggaagcaccaactacaacccgtccctcaagagtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggctgtgtattactgtgcgagagg
    consensus: .........ctacagcagtggggcgcaggactgttgaagccttcggagaccctgtccctcacctgcgGtgtctatggtgggtccttcaCtggttacGCctggaCctggatccgccagcccccagggaaggggctggagtggattggggaaatcaatcaAagtggaGCcaccaactacaaTGTTtccctcGagagtcgagtcaccataCACAtTgacacgtcTaagaaccGAttctccctgaGgctgagTGctgtgaccgccgcggacacggctAAAtaCtTctgtgcgagagg

    Identical sequences with one common deletion and an adjacent deletion that is not present in all sequences
    >>> germline_from_imgt("Testfiles/alleles/somedeletions.txt", "Testfiles/alleles/imgt_germlines.fasta", "Homo sapiens", "Testfiles/alleles/somedeletions_out.fasta", "v", doctest_report, doctest_report, 3)
    Processing successfully completed.
    Mutation Analysis:
    IGHV4-34*01 (5 sequences):
     Common deletions: 156, 157, 158
     Common mutations: c67g, g88c, t96g, a97c, g103c, a165g, g166c, c179t, c180g, c181t, g182t, a189g, t207c, c208a, a209c, g210a, a212t, c221t, a229g, g230a, a241g, c248t, t249g, g273a, t274a, g275a, t278c, a280t
    germline: caggtgcagctacagcagtggggcgcaggactgttgaagccttcggagaccctgtccctcacctgcgctgtctatggtgggtccttcagtggttactactggagctggatccgccagcccccagggaaggggctggagtggattggggaaatcaatcatagtggaagcaccaactacaacccgtccctcaagagtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggctgtgtattactgtgcgagagg
    consensus: ...gtgcagctacagcagtggggcgcaggactgttgaagccttcggagaccctgtccctcacctgcgGtgtctatggtgggtccttcaCtggttacGCctggaCctggatccgccagcccccagggaaggggctggagtggattggggaaatcXXX---agtggaGCcaccaactacaaTGTTtccctcGagagtcgagtcaccataCACAtTgacacgtcTaagaaccGAttctccctgaGgctgagTGctgtgaccgccgcggacacggctAAAtaCtTctgtgcgagagg

    Identical sequences with a common deletion
    >>> germline_from_imgt("Testfiles/alleles/deletions.txt", "Testfiles/alleles/imgt_germlines.fasta", "Homo sapiens", "Testfiles/alleles/deletions_out.fasta", "v", doctest_report, doctest_report, 3)
    Processing successfully completed.
    Mutation Analysis:
    IGHV4-34*01 (5 sequences):
     Common deletions: 153, 154, 155, 156, 157, 158
     Common mutations: c67g, g88c, t96g, a97c, g103c, a165g, g166c, c179t, c180g, c181t, g182t, a189g, t207c, c208a, a209c, g210a, a212t, c221t, a229g, g230a, a241g, c248t, t249g, g273a, t274a, g275a, t278c, a280t
    germline: caggtgcagctacagcagtggggcgcaggactgttgaagccttcggagaccctgtccctcacctgcgctgtctatggtgggtccttcagtggttactactggagctggatccgccagcccccagggaaggggctggagtggattggggaaatcaatcatagtggaagcaccaactacaacccgtccctcaagagtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggctgtgtattactgtgcgagagg
    consensus: ...gtgcagctacagcagtggggcgcaggactgttgaagccttcggagaccctgtccctcacctgcgGtgtctatggtgggtccttcaCtggttacGCctggaCctggatccgccagcccccagggaaggggctggagtggattggggaaatc------agtggaGCcaccaactacaaTGTTtccctcGagagtcgagtcaccataCACAtTgacacgtcTaagaaccGAttctccctgaGgctgagTGctgtgaccgccgcggacacggctAAAtaCtTctgtgcgagagg

    Test with 5 identical sequences
    >>> germline_from_imgt("Testfiles/alleles/identical.txt", "Testfiles/alleles/imgt_germlines.fasta", "Homo sapiens", "Testfiles/alleles/identical_out.fasta", "v", doctest_report, doctest_report, 3)
    Processing successfully completed.
    Mutation Analysis:
    IGHV4-34*01 (5 sequences):
     Common mutations: c67g, g88c, t96g, a97c, g103c, t158a, a165g, g166c, c179t, c180g, c181t, g182t, a189g, t207c, c208a, a209c, g210a, a212t, c221t, a229g, g230a, a241g, c248t, t249g, g273a, t274a, g275a, t278c, a280t
    germline: caggtgcagctacagcagtggggcgcaggactgttgaagccttcggagaccctgtccctcacctgcgctgtctatggtgggtccttcagtggttactactggagctggatccgccagcccccagggaaggggctggagtggattggggaaatcaatcatagtggaagcaccaactacaacccgtccctcaagagtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggctgtgtattactgtgcgagagg
    consensus: ...gtgcagctacagcagtggggcgcaggactgttgaagccttcggagaccctgtccctcacctgcgGtgtctatggtgggtccttcaCtggttacGCctggaCctggatccgccagcccccagggaaggggctggagtggattggggaaatcaatcaAagtggaGCcaccaactacaaTGTTtccctcGagagtcgagtcaccataCACAtTgacacgtcTaagaaccGAttctccctgaGgctgagTGctgtgaccgccgcggacacggctAAAtaCtTctgtgcgagagg

    Test with two mutations removed from the first sequence
    >>> germline_from_imgt("Testfiles/alleles/two_dropped.txt", "Testfiles/alleles/imgt_germlines.fasta", "Homo sapiens", "Testfiles/alleles/two_dropped_out.fasta", "v", doctest_report, doctest_report, 3)
    Processing successfully completed.
    Mutation Analysis:
    IGHV4-34*01 (5 sequences):
     Common mutations: t96g, a97c, g103c, t158a, a165g, g166c, c179t, c180g, c181t, g182t, a189g, t207c, c208a, a209c, g210a, a212t, c221t, a229g, g230a, a241g, c248t, t249g, g273a, t274a, g275a, t278c, a280t
    germline: caggtgcagctacagcagtggggcgcaggactgttgaagccttcggagaccctgtccctcacctgcgctgtctatggtgggtccttcagtggttactactggagctggatccgccagcccccagggaaggggctggagtggattggggaaatcaatcatagtggaagcaccaactacaacccgtccctcaagagtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggctgtgtattactgtgcgagagg
    consensus: ...gtgcagctacagcagtggggcgcaggactgttgaagccttcggagaccctgtccctcacctgcgXtgtctatggtgggtccttcaXtggttacGCctggaCctggatccgccagcccccagggaaggggctggagtggattggggaaatcaatcaAagtggaGCcaccaactacaaTGTTtccctcGagagtcgagtcaccataCACAtTgacacgtcTaagaaccGAttctccctgaGgctgagTGctgtgaccgccgcggacacggctAAAtaCtTctgtgcgagagg
    """
    for char in option:
        if char not in 'ciofvjx':
            error('unrecognised option: %s.' % char)
            return

    # Defined up front: the consensus section below needs it even when no
    # input record reached the point where it used to be defined.
    def chunks(l, n):
        """ Yield successive n-sized chunks from l."""
        for i in range(0, len(l), n):
            yield l[i:i + n]

    def gap_partial_codons(s):
        """ Replace every 3-character chunk of string s that contains a gap by gaps."""
        return "".join(c if '-' not in c else '-' * len(c) for c in chunks(s, 3))

    try:
        gl = Germlib(species_name, germline_file=germline_lib)
    except Exception:
        report("Error parsing germline library file: " + str(sys.exc_info()[1]))
        return

    consensus_f = []
    consensus_v = []
    consensus_j = []
    mutated_germs = {}
    imgt_nt = {}

    try:
        with open(sequence_file, "r") as sequence_handle:
            # Sniff the delimiter from the first line: tab if present, else comma.
            ln = sequence_handle.readline()
            sep = ("\t" if "\t" in ln else ",")
            sequence_handle.seek(0)
            reader = csv.DictReader(sequence_handle, delimiter=sep)
            for row in reader:
                imgt_nt[row["Sequence ID"]] = row

        outrecs = []
        for id, nt_rec in imgt_nt.items():
            try:
                if "JUNCTION" in nt_rec and nt_rec["JUNCTION"] is not None and len(nt_rec["JUNCTION"]) > 0:
                    heavychain = len(nt_rec["V-D-J-REGION"]) > 0

                    if heavychain:
                        mAb = (nt_rec["V-REGION"], nt_rec.get("P3'V", ""), nt_rec.get("N-REGION", ""), nt_rec.get("N1-REGION", ""),
                               nt_rec.get("P5'D", ""), nt_rec.get("D-REGION", ""), nt_rec.get("P3'D", ""), nt_rec.get("N2-REGION", ""),
                               nt_rec.get("P5'J", ""), nt_rec["J-REGION"])
                    else:
                        mAb = (nt_rec["V-REGION"], nt_rec.get("P3'V", ""), nt_rec.get("N-REGION", ""),
                               nt_rec.get("P5'J", ""), nt_rec["J-REGION"])

                    if 'x' in option:
                        report("%s:" % id)
                        report(" | ".join(mAb))

                    # Revert the part of the V-gene that extends to the second Cysteine
                    vregion = nt_rec["V-REGION"]
                    vregion_3prime = nt_rec["3'V-REGION"]
                    vgene_name = Germlib.translate_imgt_name(nt_rec["V-GENE and allele"])

                    if len(vregion_3prime) > 0 and vregion[0 - len(vregion_3prime):] != vregion_3prime:
                        report("Error: 3'V-REGION sequence not found at 3' end of V-REGION in sequence %s" % id)
                        continue

                    # Remove stray nucleotides from the 5' end of the V-region to give us whole codons (we know the 3' end is aligned)
                    vregion_5prime = vregion[:0 - len(vregion_3prime)] if len(vregion_3prime) > 0 else vregion
                    vregion_5prime = (vregion_5prime if len(vregion_5prime) % 3 == 0 else vregion_5prime[(len(vregion_5prime) % 3):])

                    try:
                        vgene_frag1, matchstr_frag1 = gl.match_from_aa(vgene_name, vregion_5prime)

                        # For the remaining (3') part, use a global alignment. We use the entire V-region so that the 3prime
                        # region, which might be quite small, aligns against the right part of the sequence
                        vgene_frag2, matchstr_frag2 = gl.match(vgene_name, vregion)[0 - len(vregion_3prime):] if len(vregion_3prime) > 0 else ("", "")

                        if fixed_mut > 0:
                            # Merge the two matchstrings. Starting at the 3' end, we pull matchstring off frag2 until we get beyond vgene_frag2 and are
                            # about to pull the first nt of vgene_frag1. Then we pull the rest off vgene_frag1.
                            mlen = 0
                            matchstr = ""
                            for m in matchstr_frag2[::-1]:
                                if m != 'd':
                                    mlen += 1
                                if mlen > len(vregion_3prime):
                                    break
                                matchstr += m

                            skip = True
                            for m in matchstr_frag1[::-1]:
                                if skip and m != 'd':
                                    skip = False
                                if not skip:
                                    matchstr += m

                            matchstr = matchstr[::-1]

                            # Sanity check 1 - number of nucleotides in match string should match length of v-region
                            mlen = sum((n != 'd') for n in matchstr)
                            if len(vregion_5prime) + len(vregion_3prime) != mlen:
                                report("Error in match string length for sequence %s" % id)

                            # Sanity check 2 - check matchstring is consistent
                            vgene = str(gl.seq(vgene_name).seq)
                            mismatch = False
                            gt = iter(vgene)
                            vt = iter(vregion)
                            for m in matchstr:
                                if m == 'd':
                                    next(gt)
                                elif m == 'i':
                                    next(vt)
                                elif m == 'm':
                                    if next(gt) != next(vt):
                                        mismatch = True
                                else:
                                    if next(gt) == next(vt):
                                        mismatch = True

                            if mismatch:
                                report("Error in matchstring for sequence %s:\nvgene: %s\nseq : %s\nmatch: %s\n" % (id, vgene, vregion, matchstr))
                            else:
                                en = mutated_germs.get(vgene_name, [])
                                en.append((vregion, matchstr))
                                mutated_germs[vgene_name] = en

                        if nt_rec["J-GENE and allele"] != '':
                            jgene_name = Germlib.translate_imgt_name(nt_rec["J-GENE and allele"])
                            jgene_frag, _ = gl.match(jgene_name, nt_rec["J-REGION"])
                        else:
                            jgene_frag = ''

                        if heavychain and nt_rec["D-GENE and allele"] != '':
                            dgene_name = Germlib.translate_imgt_name(nt_rec["D-GENE and allele"])
                            dgene_frag, _ = gl.match(dgene_name, nt_rec["D-REGION"])
                        else:
                            dgene_frag = ''
                    except Exception:
                        # Best effort: report the record and carry on with the next one.
                        report("Error processing sequence " + id + ":")
                        exc_type, exc_value, exc_traceback = sys.exc_info()
                        report(traceback.format_exception(exc_type, exc_value, exc_traceback, 2))
                        continue

                    if heavychain:
                        germline = [
                            vgene_frag1 + vgene_frag2,
                            nt_rec.get("P3'V", ""),
                            nt_rec.get("N-REGION", ""),
                            nt_rec.get("N1-REGION", ""),
                            nt_rec.get("P5'D", ""),
                            dgene_frag,
                            nt_rec.get("P3'D", ""),
                            nt_rec.get("N2-REGION", ""),
                            nt_rec.get("P5'J", ""),
                            jgene_frag]
                    else:
                        germline = [
                            vgene_frag1 + vgene_frag2,
                            nt_rec.get("P3'V", ""),
                            nt_rec.get("N-REGION", ""),
                            nt_rec.get("P5'J", ""),
                            jgene_frag]

                    # Trim the 3' end of the J fragment so the germline is a whole number of codons.
                    jgene_frag = (jgene_frag if len("".join(germline)) % 3 == 0 else jgene_frag[:0-(len("".join(germline)) % 3)])
                    germline[-1] = jgene_frag

                    if 'i' in option:
                        trunc5 = len(vregion) - len(vregion_5prime + vregion_3prime)
                        region = nt_rec["V-D-J-REGION"] if heavychain else nt_rec["V-J-REGION"]
                        trunc3 = (len(region) - trunc5) % 3
                        trimmed = region[trunc5:0-trunc3] if trunc3 != 0 else region[trunc5:]
                        outrecs.append(SeqRecord(Seq(trimmed), id=id, name=id, description=""))

                    if 'f' in option:
                        if 'x' in option:
                            report("Inferred 'full' germline:")
                            report(" | ".join(germline))
                        sr = SeqRecord(Seq("".join(germline)), id=id + "_germ", name=id + "_germ", description="")
                        consensus_f.append(sr)
                        if 'o' in option:
                            outrecs.append(sr)

                    germline = "".join(germline)
                    v_ext = vgene_frag1 + vgene_frag2

                    if 'v' in option:
                        g = (v_ext) + '-' * (len(germline) - len(v_ext))
                        germline_v = gap_partial_codons(g)
                        if 'x' in option:
                            report("Inferred germline (v):")
                            report(germline_v)
                        sr = SeqRecord(Seq(germline_v), id=id + "_germ_v", name=id + "_germ_v", description="")
                        consensus_v.append(sr)
                        if 'o' in option:
                            outrecs.append(sr)

                    if 'j' in option:
                        if heavychain:
                            g = (v_ext
                                 + '-' * (len(nt_rec.get("P3'V", "")) + len(nt_rec.get("N-REGION", ""))
                                          + len(nt_rec.get("N1-REGION", "")) + len(nt_rec.get("P5'D", "")))
                                 + dgene_frag
                                 + '-' * (len(nt_rec.get("P3'D", "")) + len(nt_rec.get("N2-REGION", ""))
                                          + len(nt_rec.get("P5'J", "")))
                                 + jgene_frag)
                        else:
                            g = v_ext + '-' * (len(germline) - len(v_ext) - len(jgene_frag)) + jgene_frag

                        germline_vj = gap_partial_codons(g)
                        if 'x' in option:
                            report("Inferred germline_vdj:")
                            report(germline_vj)
                        sr = SeqRecord(Seq(germline_vj), id=id + "_germ_vdj", name=id + "_germ_vdj", description="")
                        consensus_j.append(sr)
                        if 'o' in option:
                            outrecs.append(sr)
                else:
                    report("%s: no junction." % id)
            except Exception:
                report("Error processing input record " + id + ":")
                exc_type, exc_value, exc_traceback = sys.exc_info()
                report(traceback.format_exception(exc_type, exc_value, exc_traceback, 2))

        report("Processing successfully completed.")
    except Exception:
        report("Error parsing input file: " + str(sys.exc_info()[1]))
        return

    if 'c' in option:
        try:
            def checklengths(srs):
                length = -1
                for sr in srs:
                    if length < 0:
                        length = len(sr.seq)
                    elif len(sr.seq) != length:
                        report("Length error in sequence %s" % sr.id)

            def add_consensus(records, heading, rec_id):
                # Build the consensus of records, gap out partial codons, report it
                # and put it at the front of the output records.
                checklengths(records)
                summary = AlignInfo.SummaryInfo(MultipleSeqAlignment(records))
                cd = summary.dumb_consensus(ambiguous="-")
                consensus = ""
                for c in chunks(cd, 3):
                    consensus += c if '-' not in c else '-'*len(c)
                report(heading)
                report(str(consensus))
                outrecs.insert(0, SeqRecord(consensus, id=rec_id, name=rec_id, description=""))

            if 'f' in option:
                add_consensus(consensus_f, "'Full' germline consensus:", "consensus_germ_full")
            if 'v' in option:
                add_consensus(consensus_v, "Germline (v) consensus:", "consensus_germ_v")
            if 'j' in option:
                add_consensus(consensus_j, "Germline vdj consensus:", "consensus_germ_vdj")
        except Exception:
            report("Error generating consensus: %s - %s" % (sys.exc_info()[0], sys.exc_info()[1]))

    if fixed_mut > 0:
        try:
            report("Mutation Analysis, showing mutations, insertions and deletions that are common to all sequences from a given germline.")
            report("This will be reported for all germlines for which there are at least %d sequences in the analysis:" % fixed_mut)

            def m_limits(m):
                # Find the upper and lower limits of the matchstr, ignoring leading and trailing deletions
                # limits are expressed as locations relative to the germline (insertions in the matchstr are ignored)
                for i in range(len(m)):
                    if m[i] != 'd':
                        mstart = i
                        break
                for i in range(len(m)-1, -1, -1):
                    if m[i] != 'd' and m[i] != 'i':
                        mend = i
                        break
                loc = 0
                for i in range(len(m)):
                    # Two independent tests: mstart and mend can be the same index.
                    if i == mstart:
                        start = loc
                    if i == mend:
                        end = loc
                    if m[i] != 'i':
                        loc += 1
                return (start, end)

            for germline, mg in mutated_germs.items():
                if len(mg) >= fixed_mut:
                    # given that the sequences may have different start and end points, compute
                    # the range over which we have coverage from a sufficient number of sequences
                    germseq = gl.seq(germline).seq
                    coverage = [0] * len(germseq)
                    for seq, matchstr in mg:
                        start, end = m_limits(matchstr)
                        for i in range(start, end+1):
                            coverage[i] += 1

                    range_start = 999
                    range_end = -1
                    for i, val in enumerate(coverage):
                        if val >= fixed_mut:
                            if range_start > i:
                                range_start = i
                            if range_end < i:
                                range_end = i

                    # matches[loc] holds:
                    # 'u' if this location has not as yet been observed in sequences processed
                    # 'm' if it has been observed to match the germline in sequences processed so far
                    # 'c,g,a,t' if it has been observed to be mutated to that value in sequences processed so far
                    # 'd' if it has been observed to be deleted in sequences processed so far
                    # 'x' if if the results at this location are not consistent between sequences
                    matches = ['u'] * len(germseq)
                    insertions = []
                    range_encountered_start = 999
                    range_encountered_end = -1

                    for seq, matchstr in mg:
                        ins = 0
                        loc = 0
                        inserts = []
                        (start, end) = m_limits(matchstr)
                        start = max(start, range_start)
                        end = min(end, range_end)
                        s = iter(seq)
                        for m in matchstr:
                            if m != 'i':
                                ins = 0
                            if m == 'n':
                                sub = next(s)
                                if loc >= start and loc <= end:
                                    if matches[loc] == 'u':
                                        matches[loc] = sub
                                    elif matches[loc] != sub:
                                        matches[loc] = 'x'
                                loc += 1
                            elif m == 'd':
                                if loc >= start and loc <= end:
                                    if matches[loc] == 'u':
                                        matches[loc] = 'd'
                                    elif matches[loc] != 'd':
                                        matches[loc] = 'x'
                                loc += 1
                            elif m == 'i':
                                if loc >= start and loc <= end:
                                    inserts.append((loc, ins))
                                ins += 1
                                next(s)
                            else:
                                if loc >= start and loc <= end:
                                    if matches[loc] == 'u':
                                        matches[loc] = 'm'
                                    elif matches[loc] != 'm':
                                        matches[loc] = 'x'
                                loc += 1
                                next(s)

                        # Add a new insertion to the consensus list if we see it in this sequence, and it is outside
                        # the range we've encountered so far.
                        for loc, ins in inserts:
                            if loc < range_encountered_start or loc > range_encountered_end:
                                insertions.append((loc, ins))

                        # Remove insertions from the consensus list if they are in range of this sequence and were not
                        # observed in it. A new list is built: removing entries from the list while looping
                        # over it skipped the entry that followed each removal.
                        insertions = [entry for entry in insertions
                                      if not (start <= entry[0] <= end) or entry in inserts]

                        range_encountered_start = min(range_encountered_start, start)
                        range_encountered_end = max(range_encountered_end, end)

                    report("%s (%d sequences):" % (germline, len(mg)))

                    deletions = []
                    for loc, m in enumerate(matches):
                        if m == 'd':
                            deletions.append(loc)
                    if len(deletions) > 0:
                        report(" Common deletions: %s" % ', '.join([str(n) for n in sorted(deletions)]))

                    if len(insertions) > 0:
                        # The message is formatted before it is handed to report(); the % operator
                        # used to be applied to report()'s return value instead.
                        report(" Common insertions: %s" % ', '.join(["%d.%d" % (loc, ins) for (loc, ins) in sorted(insertions)]))

                    mutations = []
                    for loc, m in enumerate(matches):
                        if m in ('c', 'a', 'g', 't'):
                            mutations.append("%s%d%s" % (germseq[loc], loc, m))
                    if len(mutations) > 0:
                        report(" Common mutations: %s" % ', '.join([str(n) for n in mutations]))

                    if len(insertions) + len(deletions) + len(mutations) > 0:
                        r_g = ""
                        gi = iter(germseq)
                        for m in matches:
                            r_g += next(gi) if m != 'i' else '-'
                        report("germline: %s" % r_g)

                        r_c = ""
                        gi = iter(germseq)
                        for m in matches:
                            if m == 'm':
                                r_c += next(gi)
                            elif m == 'd':
                                r_c += '-'
                                next(gi)
                            elif m == 'i':
                                r_c += 'i'
                            elif m == 'u':
                                r_c += '.'
                                next(gi)
                            else:
                                r_c += m.upper()
                                next(gi)
                        report("consensus: %s" % r_c)
                    else:
                        report(" No common insertions, deletions or mutations compared to germline")
                else:
                    report("%s (%d sequences) - number of sequences is below analysis threshold." % (germline, len(mg)))
        except Exception:
            report("Error creating mutation report:")
            exc_type, exc_value, exc_traceback = sys.exc_info()
            report(traceback.format_exception(exc_type, exc_value, exc_traceback, 2))

    SeqIO.write(outrecs, output_file, "fasta")
def generate_consensus(alignment):
    """Return the ``dumb_consensus`` of *alignment*.

    The alignment is wrapped in an ``AlignInfo.SummaryInfo`` and the
    consensus is requested with ``threshold=0.51`` and ``ambiguous='N'``.
    """
    return AlignInfo.SummaryInfo(alignment).dumb_consensus(
        threshold=0.51, ambiguous='N')
#go through seqs, find probe taxa, record start and end for rec in alignIN: ID = rec.id Seq = rec.seq SEQ = Seq.upper() if PROBEname in ID: #print(ID) start = min(SEQ.find("A"), SEQ.find("C"), SEQ.find("G"), SEQ.find("T")) end = max(SEQ.rfind("A"), SEQ.rfind("C"), SEQ.rfind("G"), SEQ.rfind("T")) break #print("Probe region- ID:", ID, "start:",start,"end:", end,"\n") #make a 50% consesus seq info = AlignInfo.SummaryInfo(alignIN) consensus = info.dumb_consensus(threshold=.50, consensus_alpha=ambiguous_dna) #print("Consensus:",consensus,"\n") #consensus for the head and tail region used to get number of gaps needed to insert below headcon = consensus[:start + 1] tailcon = consensus[end:] colsH = len(headcon) colsT = len(tailcon) #loop through each column by record, record scores for head and tail regions and add to dictionary for rec in alignIN: Seq = rec.seq ID = rec.id for colIDX in range(colsL): col = alignIN[count:count + 1, colIDX] if colIDX < start:
from Bio.SeqRecord import SeqRecord

# Work inside the directory given as the first command-line argument.
os.chdir(sys.argv[1])
listing = os.listdir(".")

consensus = {}
genConsensus = ''
pssmGen = ''
consensusThres = 0.7

# The general alignment (second command-line argument) is parsed as gapped
# extended-IUPAC protein.  Every alignment in the file overwrites the
# summary values, so the last one wins.
generalAlignment = AlignIO.parse(
    sys.argv[2], "fasta",
    alphabet=Gapped(IUPAC.ExtendedIUPACProtein(), "-"))
lengthGenAl = 0
for genAlignment in generalAlignment:
    sumGen = AlignInfo.SummaryInfo(genAlignment)
    genConsensus = sumGen.gap_consensus(consensusThres)
    pssmGen = sumGen.pos_specific_score_matrix(genConsensus)
    lengthGenAl = len(genAlignment)

# One gap consensus per ".fas" file in the working directory, keyed by the
# file name.
for item in listing:
    if not item.endswith(".fas"):
        continue
    alignments = AlignIO.parse(
        item, "fasta",
        alphabet=Gapped(IUPAC.ExtendedIUPACProtein(), "-"))
    for alignment in alignments:
        summ = AlignInfo.SummaryInfo(alignment)
        consensus[item] = summ.gap_consensus(consensusThres)
def Generate_Consensus_conserved_region(self):
    """Build the consensus sequence and record its conserved regions.

    Steps, all driven by attributes already set on ``self``:

    1. Read ``para.txt`` and take the text after the last ``:`` on the
       line containing ``'Minimum length of conserved regions'`` as the
       minimum region size.
    2. Read ``self.alignment_file`` (Clustal format) and compute a
       ``dumb_consensus`` using the threshold typed into ``self.entry1``.
       If ``self.consensus_seq_file`` is set, write the consensus there
       as FASTA, using the text of ``self.entry`` as the header.
    3. Treat every ``'X'`` in the consensus as a break point and append
       every stretch between two break points that is at least the
       minimum size to ``self.conserved_region_file`` and to the
       class-level ``position_pair`` list.  When the consensus has no
       ``'X'`` at all, the whole consensus is recorded as one region.
    4. Report in ``self.textbox`` whether the conserved-region file
       exists.

    Files are opened with ``with`` so the handles are closed even when a
    write fails.
    """
    # Read the minimum length of the amplicons from the para.txt file:
    with open('para.txt') as para:
        self.alllines = para.readlines()
    for self.para_index, self.para_line in enumerate(self.alllines):
        self.seperate = self.para_line.split(':')
        if 'Minimum length of conserved regions' in self.para_line:
            self.conserved_region_minimum_size = self.seperate[-1]
            # endswith() is safe on an empty value, unlike indexing [-1].
            if self.conserved_region_minimum_size.endswith('\n'):
                self.conserved_region_minimum_size = (
                    self.conserved_region_minimum_size[0:-1])

    # Read in the alignment file generated from the last step and create
    # the consensus sequence from it:
    self.alignment = AlignIO.read(self.alignment_file, "clustal")
    self.summary_align = AlignInfo.SummaryInfo(self.alignment)
    self.consensus = self.summary_align.dumb_consensus(
        float(self.get_from_keyboard(self.entry1)))
    if self.consensus_seq_file:
        with open(self.consensus_seq_file, "w+") as f:
            f.write('>' + self.get_from_keyboard(self.entry) + '\n' +
                    str(self.consensus))

    self.star_list = []
    # Collect the positions of all ambiguous ('X') consensus characters.
    # A TypeError raised anywhere in this section is deliberately ignored
    # (best effort); step 4 below then reports that no file was produced.
    try:
        for self.index in range(0, len(self.consensus)):
            if str(self.consensus[self.index]) == 'X':
                self.star_list.append(self.index)
        try:
            # Always include the first and one-past-the-last positions as
            # break points.  star_list[0] raises IndexError when no 'X'
            # was found; that case is handled below.
            if self.star_list[0] != 0:
                self.star_list = [0] + self.star_list
            if self.star_list[-1] != len(self.consensus):
                self.star_list.append(len(self.consensus))
            for self.region_start in range(0, len(self.star_list) - 1):
                self.region_end = self.region_start + 1
                self.region = self.consensus[
                    self.star_list[self.region_start] +
                    1:self.star_list[self.region_end]]
                if len(self.region) >= int(
                        self.conserved_region_minimum_size):
                    with open(self.conserved_region_file, 'a+') as f:
                        f.write('Length:' + str(len(self.region)) + '\n' +
                                str(self.region) + '\n'
                                'Startposition is: ' +
                                str(self.star_list[self.region_start]) +
                                '\n' + 'Endposition is: ' +
                                str(self.star_list[self.region_end]) + '\n')
                    self.__class__.position_pair.append([
                        self.star_list[self.region_start],
                        self.star_list[self.region_end]
                    ])
        # If no break points are found then the whole sequence is conserved:
        except IndexError:
            with open(self.conserved_region_file, 'a+') as f:
                f.write('Length:' + str(len(self.consensus)) + '\n' +
                        str(self.consensus) + '\n'
                        'Startposition is: 1' + '\n' + 'Endposition is ' +
                        str(len(self.consensus)) + '\n')
            self.__class__.position_pair.append([1, len(self.consensus)])
    except TypeError:
        pass

    # If the conserved-region file does not exist, the consensus threshold
    # was too high for any region to qualify; otherwise tell the user
    # where the file is.
    try:
        if not os.path.isfile(self.conserved_region_file):
            self.textbox.insert(
                INSERT,
                'No conserved regions found under this threshold for' +
                self.get_from_keyboard(self.entry) +
                ', please lower the threshold!!!' + '\n')
        else:
            self.textbox.insert(
                INSERT, 'You can get your detected conserved regions at' +
                self.conserved_region_file + '\n')
    except FileNotFoundError:
        pass
def create_alignment(filename, verbose=1, outfile='test.aligned.fasta'):
    """Return an alignment of the sequences in *filename*, running MAFFT if needed.

    The file is first read as an aligned FASTA.  If that fails (for
    example because the sequences differ in length), ``mafft`` is run on
    it, its standard output is saved to ``<filename>.mafft`` and that
    file is read instead.  All sequences are upper-cased.

    :param filename: path to a FASTA file; the process exits via
        ``sys.exit`` if it does not exist.
    :param verbose: when truthy, print progress messages.
    :param outfile: accepted for backward compatibility; not used.
    :return: ``(allseq, summary, l)`` -- the list of aligned records, an
        ``AlignInfo.SummaryInfo`` for the alignment, and the alignment
        length.
    """
    jfilename = filename
    if not exists(jfilename):
        sys.exit("No such file: %s" % jfilename)

    # let's see if the given file isn't already aligned
    try:
        alignment = AlignIO.read(filename, 'fasta')
        jfilename = filename.rsplit('.', 1)[0]
        if verbose:
            print("File '%s' is already aligned. Skipping alignment step." %
                  filename)
    except Exception:
        # Was a bare ``except:``, which also trapped KeyboardInterrupt and
        # SystemExit and then launched MAFFT anyway.
        #
        # History: this step used ClustalW (MultipleAlignCL, later
        # ClustalwCommandline with a custom DNA matrix).  Notes from that
        # era, translated from Danish:
        #   * the matrix in use then was good in all tested cases;
        #   * gap_open_pen = 0.001 together with multalinDNAmatrix.clustal
        #     gave better alignments for very similar sequences
        #     (see Version2/test.fasta);
        #   * dropping that setting and using mymatrix_identity5.clustal
        #     gave better alignments for less similar sequences
        #     (e.g. Paper/NAR/revieweralignment.fasta);
        #   * gap_ext_pen = 0.00 (0.01 seemed to give other regions);
        #     normally the gap penalties were not set;
        #   * max_div = 100.
        cline = [
            'mafft', '--localpair', '--maxiterate', '16', '--inputorder',
            '--preservecase', '--quiet', jfilename
        ]
        if verbose:
            print('-- running this command:', cline)
        stdout = subprocess.Popen(cline,
                                  stdout=subprocess.PIPE,
                                  stderr=None).communicate()[0]
        with open(jfilename + '.mafft', 'wb') as handle:
            handle.write(stdout)
        alignment = AlignIO.read(jfilename + '.mafft', 'fasta')

    # make sure sequences are upper case
    for rec in alignment:
        rec.seq = rec.seq.upper()

    # list(alignment) replaces the deprecated alignment.get_all_seqs()
    allseq = list(alignment)
    summary = AlignInfo.SummaryInfo(alignment)
    l = alignment.get_alignment_length()
    return (allseq, summary, l)
# Read the consensus file and keep a copy of it with a .fasta extension.
with open('consensus', "r") as myConsensus:
    print_consensus = myConsensus.readlines()
print(print_consensus)
shutil.copy2('consensus', 'consensus.fasta')

# Join the sequence lines into one string; a header line restarts it, so
# myCons ends up holding the sequence that follows the last header.
myCons = ""
for myLine in print_consensus:
    if myLine.startswith('>'):
        myCons = ""
    else:
        # Strip only the line terminator.  The previous ``myLine[:-1]``
        # dropped a real base when the last line had no trailing newline.
        myLine = myLine.rstrip('\n')
        myCons = "".join((myCons, myLine))

#5. Calculation of the abundance matrices - biopython module
myMatrixinput = AlignIO.read(myAlignmentOutput, "clustal")
mySummary_align = AlignInfo.SummaryInfo(myMatrixinput)
myFrequency_matrix = mySummary_align.pos_specific_score_matrix(myCons)
myFrequency_matrix_str = str(myFrequency_matrix)

#Abundance matrix in absolute values
print(myFrequency_matrix_str)
with open('abundance_matrix.txt', 'w') as myAbunout:
    myAbunout.write(myFrequency_matrix_str)

#Abundance matrix in percentage
with open('abundance_matrix.txt', 'r') as myAbunout2:
    myLines = myAbunout2.readlines()
# Left open on purpose: nothing in this section writes to or closes it,
# so it is presumably used by the code that follows.
myAbunperout = open('abundance_matrix_percentage.txt', 'w')
def createFirstGuessReferenceFromReads(self):
    """Write a rough first reference sequence and return its file name.

    All reads in ``self.readInput`` are loaded.  If there are more than
    ``msaReadCount`` (4) of them, that many are picked at random, aligned
    with Clustal Omega, and the ``dumb_consensus`` (threshold 0.5) of the
    alignment becomes the reference.  Otherwise an empty sequence is used
    as the reference.

    The reference is written to ``FirstGuessReference.fasta`` inside
    ``<self.outputRootDirectory>/Initial_Reference`` (created if
    missing); the path is stored in ``self.referenceSequenceFileName``
    and returned.

    Any exception is printed and re-raised.
    """
    #TODO: I should make this a commandline parameter. More = MSA takes longer. Less = worse reference
    msaReadCount = 4
    print ('I choose ' + str(msaReadCount) + ' random reads.'
        + '\nThese are aligned to form a rough initial consensus sequence. Here:'
        + '\n' + join(self.outputRootDirectory,'Initial_Reference')
        + '\nPerforming ClustalO Multiple Sequence Alignment Now...')

    try:
        # Load Reads from File
        parsedReads = list(parse(self.readInput, self.readInputFormat))
        referenceSequence = None

        # Reference Directory
        referenceDirectory = join(self.outputRootDirectory,'Initial_Reference')
        if not isdir(referenceDirectory):
            makedirs(referenceDirectory)

        if (len(parsedReads) > msaReadCount):
            # Select a random subset of reads for the multiple sequence
            # alignment.
            randomIndexes = list(range(0, len(parsedReads)))
            shuffle(randomIndexes)
            rawClustalReads = []
            for i in range(0,msaReadCount):
                rawClustalReads.append(parsedReads[randomIndexes[i]])

            rawClustalReadsFilename = join(referenceDirectory, 'MSARaw.fasta')
            rawClustalReadsFileWriter = createOutputFile(rawClustalReadsFilename)
            write(rawClustalReads, rawClustalReadsFileWriter, 'fasta')
            rawClustalReadsFileWriter.close()

            #Perform Clustal MSA
            clustalOAlignmentOutputFileName = join(referenceDirectory, 'clustalOAlignment.fasta')
            clustalOCommandLine = ClustalOmegaCommandline(
                infile=rawClustalReadsFilename,
                outfile=clustalOAlignmentOutputFileName,
                verbose=True, auto=True, force=True,
                threads=int(self.numberThreads))
            clustalOCommandLine()

            # Calculate consensus
            # A dumb consensus has lots of ambiguous nucleotides. We'll polish those out later.
            alignmentType = 'fasta'
            alignmentObject = read(clustalOAlignmentOutputFileName, alignmentType)
            alignmentSummaryInfo = AlignInfo.SummaryInfo(alignmentObject)
            dumbConsensus = alignmentSummaryInfo.dumb_consensus(threshold=.5)
            referenceSequence = SeqRecord(Seq(str(dumbConsensus) , IUPAC.IUPACUnambiguousDNA),
                id='Initial_Consensus',
                description='Initial_Consensus')
        else:
            # Not enough reads for an alignment: fall back to an empty
            # reference.  Alternatives considered earlier were using the
            # first read, or raising 'Not enough reads to continue.'
            referenceSequence = SeqRecord(Seq('' , IUPAC.IUPACUnambiguousDNA),
                id='Initial_Consensus',
                description='Initial_Consensus')

        #Write reference to file
        self.referenceSequenceFileName = join(referenceDirectory, 'FirstGuessReference.fasta')
        firstGuessRefFileWriter = createOutputFile(self.referenceSequenceFileName)
        write([referenceSequence], firstGuessRefFileWriter, 'fasta')
        firstGuessRefFileWriter.close()

        # This message used to sit after the return statement and was
        # never printed.
        print ('Done making initial consensus sequence.')
        return self.referenceSequenceFileName

    except Exception:
        print ('Exception encountered in createFirstGuessReferenceFromReads()')
        print (exc_info()[0])
        print (exc_info()[1])
        print (exc_info()[2])
        raise
def frequency():
    """Compute per-position residue abundance and entropy for the alignment.

    Reads the Clustal alignment ``file_ali_out`` and sets the module
    globals ``consensus``, ``frequency_matrix`` and ``aadict``; appends
    one row per consensus position to the existing globals
    ``procent_complete`` and ``entropy_position_list``.

    Files touched below ``project_dir``:

    * ``info.txt`` (append): the consensus and the residue list;
    * ``matrices/frequencies.txt`` (overwrite): the score matrix as text;
    * ``matrices/procent.csv`` (append): residue list, then one list per
      position holding ``(1 - count / number_of_sequences) * 100``;
    * ``matrices/entropy.csv`` (append): residue list, then the list of
      per-position entropies (sum of ``|p * log2 p|`` divided by
      ``log2(number of residues)``, times 100, formatted with two
      decimals).
    """
    global consensus
    global frequency_matrix
    global aadict
    global procent_complete
    global entropy_position_list

    def _write_text(path, mode, text):
        # open / write / close in one place
        handle = open(path, mode)
        handle.write(text)
        handle.close()

    # Read the alignment and report every record.
    msa = AlignIO.read(file_ali_out, "clustal")
    summary_align = AlignInfo.SummaryInfo(msa)
    amount_seq = len(msa)
    for record in msa:
        print("%s %s" % (record.seq, record.id))

    # Simple thresholded consensus; ambiguous positions are written as "-".
    consensus = summary_align.dumb_consensus(ambiguous="-",
                                             threshold=cons_tresh)
    print("\n%s" % (consensus,))
    info_path = project_dir + spacer + "info.txt"
    _write_text(info_path, "a", "\n consensus sequence: \n" + str(consensus))

    # Abundance of each residue at each consensus position.
    frequency_matrix = summary_align.pos_specific_score_matrix(consensus)
    frequency_matrix_str = str(frequency_matrix)
    print(frequency_matrix_str)

    print("writing file of frequencies")
    matrices_prefix = project_dir + spacer + "matrices" + spacer
    frequencies_path = matrices_prefix + "frequencies.txt"
    _write_text(frequencies_path, "w", frequency_matrix_str)

    # The first line of the written matrix lists the residues present in
    # the alignment; drop the blanks, then the trailing "\n" entry.
    frequency_file = open(frequencies_path, "r")
    header = frequency_file.readline()
    aadict = [char for char in header if char != " "]
    frequency_file.close()
    del aadict[-1]

    _write_text(info_path, "a",
                "\n Amino Acids found in sequences: \n \t" + str(aadict))
    print(aadict)

    number_aa = int(len(aadict))
    procent_path = matrices_prefix + "procent.csv"
    entropy_path = matrices_prefix + "entropy.csv"

    # Header row of both csv files.
    _write_text(procent_path, "a", str(aadict) + "\n")
    _write_text(entropy_path, "a", str(aadict) + "\n")

    for number in range(0, len(consensus), 1):
        procent_line = []
        entropy_list = []

        column_total = 0
        for AA in aadict:
            column_total += frequency_matrix[number][AA]

        for AA in aadict:
            # abundance
            procent_line.append(
                (1 - (frequency_matrix[number][AA] / amount_seq)) * 100)
            # entropy: p is the share of this residue in the column;
            # p == 0 is skipped because log2(0) is undefined.
            share = frequency_matrix[number][AA] / column_total
            if share != 0:
                entropy_list.append(math.fabs(share * math.log(share, 2)))
            else:
                entropy_list.append(0.000)

        entropy_position = 0
        for entropy_item in entropy_list:
            entropy_position = float(entropy_position) + float(entropy_item)

        # Normalise by log2 of the number of residues, scale to percent
        # and keep two decimals.
        entropy_position = float(entropy_position / math.log(number_aa, 2))
        entropy_position = "%.2f" % float(entropy_position * 100)
        entropy_position_list.append(entropy_position)

        # Write this position's abundance row to the csv file.
        print(procent_line)
        handle = open(procent_path, "a")
        handle.write(str(procent_line))
        handle.write("\n")
        handle.close()
        procent_complete.append(procent_line)

    _write_text(entropy_path, "a", str(entropy_position_list))
    print(entropy_position_list)
    print(procent_complete)
new_align = alignment[:, 1:] print "trimming first character" if bs > (len(back_str)/3): new_align = new_align[:, :-1] print "trimming last character" flag = 1 if bs > (len(back_str)/3) and flag < 1: new_align = alignment[:, :-1] print "trimming last character but not first" return new_align alignment = AlignIO.read(handle, "fasta") clean_align = trim_consensus(alignment) print clean_align summary_align = AlignInfo.SummaryInfo(clean_align) gap_consensus = summary_align.gap_consensus(threshold = 0, ambiguous = 'N', consensus_alpha=alphabet, require_multiple=1) dumb_consensus = summary_align.dumb_consensus(threshold = 0, ambiguous = 'N', consensus_alpha=alphabet, require_multiple = 1) outfile_gap = open(sys.argv[1]+"_gap_consensus.fasta", "w") outfile_dumb = open(sys.argv[1]+"_dumb_consensus.fasta", "w") outfile_ungap = open(sys.argv[1]+"_ungap_consensus.fasta", "w") ungap_consensus = gap_consensus.ungap("-") SeqIO.write(SeqRecord(gap_consensus, id="%s"%(sys.argv[1]+"_gap_consensus"), description=""), outfile_gap, "fasta") SeqIO.write(SeqRecord(dumb_consensus, id="%s"%(sys.argv[1]+"_dumb_consensus"), description=""), outfile_dumb, "fasta") SeqIO.write(SeqRecord(ungap_consensus, id="%s"%(sys.argv[1]+"_ungap_consensus"), description=""), outfile_ungap, "fasta")
def get_sumatyInfo(alignment):
    """Return an ``AlignInfo.SummaryInfo`` object built from *alignment*."""
    # object holding the summarised information
    return AlignInfo.SummaryInfo(alignment)
def clustalw2MSA(folderName, segList, startingIndex):
    """Align segments with ClustalW2 and list the edits to the template.

    The first (at most 11) entries of *segList* are written to
    ``<folderName>seq<startingIndex>.fasta`` and aligned with
    ``clustalw2``; the resulting ``.aln`` file is read back and its gap
    consensus (threshold 0) is compared, column by column, with the
    aligned record whose id is ``'Segkk0'`` (the template).

    Each difference where the consensus is not ``'X'`` becomes an entry
    ``[offset, position, code]`` where ``position`` counts template bases
    starting from *startingIndex*, ``offset = position - startingIndex``
    and ``code`` is

    * ``'d'``          -- consensus has a gap where the template has a base;
    * ``'i_<n>_<b>'``  -- template has a gap where the consensus has base
      ``b``; ``n`` numbers consecutive insertions at the same position
      from 0;
    * ``'s_<b>'``      -- both have a base and the consensus base is ``b``.

    Only entries with ``20 < offset < 80`` are returned, plus an entry
    just outside that window when its neighbour on the inside is less
    than ``endThres`` (2) away.

    Fixes over the previous version:

    * The insertion counter compared the *offset* of the previous entry
      with the absolute position and called ``.split`` on the integer
      position, so it either never triggered or raised
      ``AttributeError``.  It now compares positions and parses the
      previous entry's code string, and also applies when the previous
      entry is the only one so far.
    * A missing ``'Segkk0'`` record raises ``ValueError`` instead of
      failing with an unbound local.

    :return: list of ``[position, code]`` pairs.
    :raises ValueError: if the alignment has no ``'Segkk0'`` record.
    """
    filename = "seq" + str(startingIndex)
    lenMax = min(11, len(segList))

    IORobot.writeSegOut(segList[0:lenMax], folderName, filename + ".fasta")
    endThres = 2

    cline = ClustalwCommandline("clustalw2",
                                infile=folderName + filename + ".fasta",
                                PWDNAMATRIX="matrix.txt",
                                pwgapopen=0,
                                pwgapext=0,
                                TRANSWEIGHT=0,
                                GAPOPEN=0,
                                GAPEXT=0)
    stdout, stderr = cline()

    align = AlignIO.read(folderName + filename + ".aln", "clustal")
    summary_align = AlignInfo.SummaryInfo(align)
    consensus = summary_align.gap_consensus(threshold=0)

    # Locate the template row (the last one wins if the id repeats).
    myseq = None
    for eachalign in align:
        if eachalign.id == 'Segkk0':
            myseq = eachalign.seq
    if myseq is None:
        raise ValueError("alignment has no record with id 'Segkk0'")

    ctTemplate = startingIndex  # position of the current template base
    modiList = []
    for i in range(len(consensus)):
        if consensus[i] != myseq[i] and consensus[i] != 'X':
            if consensus[i] == '-':
                modiList.append([ctTemplate - startingIndex, ctTemplate, 'd'])
            elif myseq[i] == '-':
                # ctTemplate does not advance over template gaps, so a
                # run of insertions shares one position; number them so
                # their order survives the sort below.
                if (modiList and modiList[-1][1] == ctTemplate
                        and modiList[-1][2].startswith('i_')):
                    prevIndex = int(modiList[-1][2].split('_')[-2])
                    suffix = str(prevIndex + 1)
                else:
                    suffix = "0"
                modiList.append([
                    ctTemplate - startingIndex, ctTemplate,
                    'i_' + suffix + "_" + str(consensus[i])
                ])
            else:
                modiList.append([
                    ctTemplate - startingIndex, ctTemplate,
                    's_' + str(consensus[i])
                ])

        if myseq[i] != '-':
            ctTemplate += 1

    modiList.sort()

    newModiList = []
    for k in range(len(modiList)):
        eachitem = modiList[k]
        if 20 < eachitem[0] < 80:
            newModiList.append([eachitem[1], eachitem[2]])
        elif k < len(modiList) - 1 and modiList[k][0] <= 20 and modiList[
                k + 1][0] > 20 and abs(modiList[k + 1][0] -
                                       modiList[k][0]) < endThres:
            # just left of the window, adjacent to an entry inside it
            newModiList.append([eachitem[1], eachitem[2]])
        elif k > 0 and modiList[k][0] >= 80 and modiList[
                k - 1][0] < 80 and abs(modiList[k - 1][0] -
                                       modiList[k][0]) < endThres:
            # just right of the window, adjacent to an entry inside it
            newModiList.append([eachitem[1], eachitem[2]])

    return newModiList
def create_consensus(in_fasta, in_metadata, index_field, index_column,
                     lineage, out_fasta, log_file):
    """
    Collapses sequences into consensus sequences based on grouping by index column or index field

    :param in_fasta: Fasta file with the sequences that are to be split into groups, one consensus
                    being built per group according to the metadata file. (Required)
    :param in_metadata: Matching metadata file with the same naming convention as the fasta file.
                    Contains the sequence metadata the fasta file is to be split by.
                    Metadata file must be in .csv format (Required)
    :param index_field: The metadata column whose value the sequences are grouped by. (Required)
    :param index_column: The metadata column holding the sequence IDs that match the fasta file
                    (Default: header). (Optional)
    :param lineage: Specific lineages the user wants to split by, or "" to group by the exact
                    value of index_field. All sub-lineages will be collapsed to the closest
                    lineage (e.g. 1.1.2 to 1.1). (Optional)
    :param out_fasta: Output fasta file with consensus sequences of all groups based on trait
                    (Default: consensus.fasta). (Optional)
    :param log_file: Output log file (Default: stdout). (Optional)

    :return: None. Consensus records are written to out_fasta.

    Changes over the previous version:

    * metadata rows are looked up with the lower-cased column names (the header is lower-cased on
      load, so a mixed-case index_column used to raise KeyError);
    * temporary files are built with os.path.join (the folder and file name used to be
      concatenated without a separator);
    * mafft is run through subprocess with an argument list instead of a shell string, a failing
      mafft raises CalledProcessError, and the temporary files are removed either way;
    * when grouping without lineages, only sequences that matched no metadata row are logged as
      unmatched (previously every sequence was);
    * the empty-trait message names the column instead of printing the empty value, and the
      "Output folder" log line ends with a newline.
    """
    import subprocess  # local import: only this function needs it

    metadata_dic = {}
    phylotype_dic = {}
    seq_dic = {}
    consensus_dic = {}
    output_folder = os.path.dirname(out_fasta)

    log_handle = get_log_handle(log_file, out_fasta)
    log_handle.write("Output folder: %s\n" % output_folder)

    # Column names are compared case-insensitively: the header is
    # lower-cased below, so the lookup keys must be too.
    field_key = index_field.lower()
    column_key = index_column.lower()

    with open(in_metadata, "r") as f:
        reader = csv.DictReader(f)
        reader.fieldnames = [name.lower() for name in reader.fieldnames]
        metadata = [r for r in reader]

    if field_key not in reader.fieldnames or column_key not in reader.fieldnames:
        sys.exit(
            "Column name not in metadata file, please re-check metadata file and reinsert a column name."
        )

    # sequence id -> trait; the first row wins when an id is repeated.
    for items in metadata:
        if items[column_key] in metadata_dic:
            print("Duplicate sequences with name: " + items[column_key] +
                  " in metadata file.",
                  file=log_handle)
        else:
            metadata_dic[items[column_key]] = items[field_key]

    for record in SeqIO.parse(in_fasta, 'fasta'):
        seq_dic[record.id] = record.seq

    if set(metadata_dic).isdisjoint(seq_dic):
        sys.exit("No matching sequence name with metadata name. Program Exit")

    if lineage != "":
        for clades in lineage:
            phylotype_dic[clades] = []
        trait_order = list(phylotype_dic.keys())
        trait_order.sort(key=lambda x: re.sub("[^A-Z0-9]", "", x),
                         reverse=True)
        for cluster in trait_order:
            cluster_type = cluster.split(".")
            cluster_length = len(cluster_type)
            for seq_id, phylotype in metadata_dic.items():
                phylo_type = phylotype.split(".")
                if len(phylo_type) < cluster_length:
                    continue
                if phylo_type[:cluster_length] == cluster_type:
                    if seq_id in seq_dic:
                        # pop() keeps a sequence from being assigned to a
                        # second lineage later in trait_order.
                        phylotype_dic[cluster].append(
                            [seq_id, seq_dic.pop(seq_id), phylotype])
    else:
        for seq, trait in metadata_dic.items():
            if trait == "":
                print("Sequence " + seq + " have an empty " + index_field +
                      " value.",
                      file=log_handle)
            if seq not in seq_dic:
                print("Sequence " + seq +
                      " does not match metadata sequence name.",
                      file=log_handle)
                continue
            # pop() so that only sequences without metadata stay in seq_dic.
            phylotype_dic.setdefault(trait, []).append(
                [seq, seq_dic.pop(seq)])

    # Whatever is left in seq_dic was not placed in any group.
    for seq in seq_dic:
        log_handle.write("Sequence " + seq +
                         " did not find any matches to metadata file.\n")

    for key, members in phylotype_dic.items():
        # NOTE(review): the message in the else branch says "2 or more",
        # but this test requires at least 3 sequences.  Left unchanged;
        # confirm which one is intended.
        if len(members) > 2:
            print("Trait:" + key + "\t\tTotal Number:" + str(len(members)),
                  file=log_handle)
            outfile_name = os.path.join(output_folder, key + ".fasta")
            alignment_name = outfile_name[:-6] + "_alignment.fasta"
            try:
                with open(outfile_name, "w") as outfile:
                    for sequences in members:
                        record = SeqRecord(sequences[1],
                                           id=sequences[0],
                                           description="")
                        SeqIO.write(record, outfile, "fasta-2line")
                with open(alignment_name, "w") as aligned:
                    subprocess.run(["mafft", outfile_name],
                                   stdout=aligned,
                                   check=True)
                alignment = AlignIO.read(alignment_name, 'fasta')
            finally:
                # Both files are intermediates; remove them even on failure.
                for tmp_name in (outfile_name, alignment_name):
                    if os.path.exists(tmp_name):
                        os.remove(tmp_name)

            consensus_name = key + "_consensus"
            summary_align = AlignInfo.SummaryInfo(alignment)
            consensus_seq = summary_align.dumb_consensus(threshold=0.0,
                                                         ambiguous='N')
            consensus_dic[consensus_name] = consensus_seq
        else:
            log_handle.write(
                "Phylotype " + key +
                " does not have 2 or more sequences for an alignment to work.\n"
            )
    close_handle(log_handle)

    out_handle = get_out_handle(out_fasta)
    for key, value in consensus_dic.items():
        record = SeqRecord(value, id=key, description="")
        SeqIO.write(record, out_handle, "fasta-2line")
    close_handle(out_handle)
# Echo the current alignment in Clustal format.
print(alignment.format("clustal"))

# Reload the first test file as a gapped unambiguous-DNA alignment.
alignment = AlignIO.read(
    os.path.join(test_dir, test_names[0]),
    "clustal",
    alphabet=Alphabet.Gapped(IUPAC.unambiguous_dna),
)

# Basic per-record information.
print('all_seqs...')
for seq_record in alignment:
    print('description: %s' % seq_record.description)
    print('seq: %r' % seq_record.seq)
print('length: %i' % alignment.get_alignment_length())

# Summary information: consensus, replacement dictionary and PSSM.
print('Calculating summary information...')
align_info = AlignInfo.SummaryInfo(alignment)
consensus = align_info.dumb_consensus()
assert isinstance(consensus, Seq)
print('consensus: %r' % consensus)

print('Replacement dictionary')
ks = sorted(align_info.replacement_dictionary(['N']))
for key in ks:
    print("%s : %s" % (key, align_info.replacement_dictionary(['N'])[key]))

print('position specific score matrix.')
print('with a supplied consensus sequence...')
print(align_info.pos_specific_score_matrix(consensus, ['N']))

print('defaulting to a consensus sequence...')
print(align_info.pos_specific_score_matrix(chars_to_ignore=['N']))
def cluster(blastdb, taxdb):
    """Choose which GI(s) to keep for every entry of ``multiple_gene_choices.txt``.

    Each line of that file is ``<key>\\t<list of GIs>``.  Entries with
    more than two GIs and entries with exactly two GIs are handled
    differently:

    * More than two GIs: all sequences are fetched from Entrez and
      aligned with MUSCLE, a gap consensus (threshold 0.5, ambiguous
      ``'N'``) is built, and every sequence is then aligned against that
      consensus on its own.  The GI with the single highest identity is
      kept; on a tie, all tied GIs go to the "choose multiple" list.
    * Exactly two GIs: the pair is aligned as is, then via
      ``alignment_rev_comp`` and ``alignment_comp``; as soon as one of
      them reaches an identity of 95 the pair goes to the "choose
      multiple" list.  Otherwise ``tiling`` is consulted: if all its
      identities exceed 70, the pair is kept when the returned
      (start, stop) ranges do not overlap and is written to
      ``these_seqs_overlap_cluster.txt`` when they do; if not, ``blast``
      decides.

    Results are appended to ``final_GIs.txt`` (single winners) and
    ``choose_mult.txt`` (flattened "choose multiple" list).

    Fixes over the previous version:

    * the Entrez retry loop used a bare ``except:``, so Ctrl-C could not
      break out of it; it now catches ``Exception``;
    * the taxonomy branch indexed an undefined ``hit_levels``; it now
      indexes ``hits``, the list the minimum was taken from;
    * an alignment in which every column contains a gap no longer raises
      ``ZeroDivisionError`` (identity is 0.0);
    * blank lines in the input file are skipped;
    * the sqlite connection is closed at the end.

    :param blastdb: path of the sqlite database to open.
    :param taxdb: path of the sqlite database attached as ``tax``.
    """
    Entrez.email = "*****@*****.**"
    conn = sqlite3.connect(blastdb)
    c = conn.cursor()
    c.execute("ATTACH '" + taxdb + "' as 'tax'")
    muscle_cline = MuscleCommandline(clwstrict=True)

    input_dic = {}
    multiple_dic = {}
    two_dic = {}
    problem_dic = {}
    finalseqs = set()
    multfinalseqs = []
    unresolved = []

    with open("multiple_gene_choices.txt") as o:
        for line in o:
            if not line.strip():
                continue
            # Second column is the text form of a Python list; strip the
            # brackets and quotes to get "gi1, gi2, ...".
            input_dic[line.split("\t")[0]] = line.strip().split(
                "\t")[1].replace("[", "").replace("]", "").replace("'", "")

    for i in input_dic:
        GIs_list = input_dic[i].split(", ")
        if len(GIs_list) > 2:
            multiple_dic[i] = GIs_list
        if len(GIs_list) == 2:
            two_dic[i] = GIs_list

    for i in multiple_dic:
        identities = []
        joined_GIs = ",".join(multiple_dic[i])
        handle = Entrez.efetch(db="nucleotide",
                               rettype="fasta",
                               retmode="text",
                               id=joined_GIs)
        seqs = SeqIO.parse(handle, "fasta")
        handle_string = StringIO()
        SeqIO.write(seqs, handle_string, "fasta")
        data = handle_string.getvalue()
        stdout, stderr = muscle_cline(stdin=data)
        align = AlignIO.read(StringIO(stdout), "clustal")
        summary_align = AlignInfo.SummaryInfo(align)
        consensus = summary_align.gap_consensus(threshold=.5, ambiguous='N')
        consensus_record = SeqRecord(consensus, id="Consensus_all")

        for m in multiple_dic[i]:
            # Retry until Entrez answers.
            while True:
                try:
                    handle = Entrez.efetch(db="nucleotide",
                                           rettype="fasta",
                                           retmode="text",
                                           id=m)
                    break
                except Exception:
                    print('Error, trying again')
                    time.sleep(10)
            seqs = SeqIO.read(handle, "fasta")
            handle_string = StringIO()
            SeqIO.write(seqs, handle_string, "fasta")
            SeqIO.write(consensus_record, handle_string, "fasta")
            data = handle_string.getvalue()
            stdout, stderr = muscle_cline(stdin=data)
            align = AlignIO.read(StringIO(stdout), "clustal")

            # Identity = identical columns / columns without a gap.
            count = 0
            gaps = 0
            for col in range(0, len(align[0])):
                column = align[:, col]
                if "-" not in column:
                    if column[1:] == column[:-1]:
                        count = count + 1
                else:
                    gaps = gaps + 1
            ungapped = len(align[0]) - gaps
            iden = 100 * (count / float(ungapped)) if ungapped else 0.0
            identities.append(iden)

        if identities.count(max(identities)) == 1:
            finalseqs.add(multiple_dic[i][identities.index(max(identities))])
        else:
            problem_dic[i] = multiple_dic[i]
            GI_to_pick = [
                multiple_dic[i][m] for m, x in enumerate(identities)
                if x == max(identities)
            ]
            multfinalseqs.append(GI_to_pick)

    for i in two_dic:
        # align the two seqs
        list_of_GIs = two_dic[i]
        alignment = alignment_reg(list_of_GIs)
        iden = identity_calc(alignment)
        if iden < 95:
            # low identity as given: try alignment_rev_comp
            alignment = alignment_rev_comp(list_of_GIs)
            iden = identity_calc(alignment)
            if iden < 95:
                # still low: try alignment_comp
                alignment = alignment_comp(list_of_GIs)
                iden = identity_calc(alignment)
                if iden < 95:
                    # still low: fall back to tiling
                    gene_name = '_'.join(i.split('_')[1:])
                    idens, start_stop = tiling(list_of_GIs, gene_name)
                    current_start = -1
                    current_stop = -1
                    result = []
                    if all(m > 70 for m in idens):
                        # Merge overlapping (start, stop) ranges; if
                        # nothing merged, no two ranges overlap.
                        for start, stop in sorted(start_stop):
                            if start > current_stop:
                                result.append((start, stop))
                                current_start, current_stop = start, stop
                            else:
                                current_stop = max(current_stop, stop)
                                result[-1] = (current_start, current_stop)
                        if len(result) == len(start_stop):
                            # ranges are disjoint: keep both
                            multfinalseqs.append(list_of_GIs)
                        else:
                            # ranges overlap: write to file for hand checking
                            with open('these_seqs_overlap_cluster.txt',
                                      'a') as a:
                                unresolved.append(list_of_GIs)
                                a.write(str(list_of_GIs) + '\n')
                    else:
                        # get taxonomy for query (main species)
                        print("Parsing taxonomy for error sequences")
                        hits = blast(i, list_of_GIs, c)
                        # if there is only one lowest taxonomy hit, take it
                        if hits.count(min(hits)) == 1:
                            finalseqs.add(
                                str(two_dic[i][hits.index(min(hits))]))
                        else:
                            # there are multiple lowest taxonomy hits
                            multfinalseqs.append(two_dic[i])
                            problem_dic[i] = two_dic[i]
                else:
                    # identity from alignment_comp is high enough
                    multfinalseqs.append(two_dic[i])
            else:
                # identity from alignment_rev_comp is high enough
                multfinalseqs.append(two_dic[i])
        else:
            # identity of the plain alignment is high enough
            multfinalseqs.append(two_dic[i])

    print("length of resolved = " + str(len(finalseqs)))
    print("length of choose multiple = " + str(len(multfinalseqs)))
    print("length of unresolved = " + str(len(unresolved)))

    with open("final_GIs.txt", "a") as o:
        for m in finalseqs:
            o.write(str(m) + "\n")
    with open("choose_mult.txt", "a") as o:
        for m in [num for pair in multfinalseqs for num in pair]:
            o.write(str(m) + "\n")

    conn.close()
def ClusterFams(dirClust, dCLustID, strOutputFile, dThresh, strMUSCLE):
    """Cluster every family FASTA file produced by MakeFamilyFastaFiles.

    Each ``*.faa`` file in ``<dirClust>/fams`` is reduced to a single record
    in ``<dirClust>/fams/centroids``: a family with more than one sequence is
    aligned with MUSCLE and replaced by Biopython's "dumb consensus" of that
    alignment, while a family with at most one sequence is copied unchanged.
    Every centroid record is then written to ``strOutputFile`` with its id
    set to the centroid file's base name without extension.

    Args:
        dirClust: Working directory; it and its ``fams``, ``fams/centroids``
            and ``fams/uc`` subdirectories are created when missing.
        dCLustID: Not used by this function.
        strOutputFile: Path of the combined FASTA file to write.
        dThresh: ``threshold`` passed to ``dumb_consensus``; positions that
            miss it are written as ``X``.
        strMUSCLE: MUSCLE executable, invoked with ``-in``/``-out``.

    Raises:
        subprocess.CalledProcessError: If MUSCLE exits with a non-zero status.
    """
    strFamsDir = dirClust + os.sep + "fams"
    strCentroidsDir = strFamsDir + os.sep + "centroids"
    strUCDir = strFamsDir + os.sep + "uc"
    for strDir in (dirClust, strFamsDir, strCentroidsDir, strUCDir):
        if not os.path.exists(strDir):
            os.makedirs(strDir)

    for strFamilyPath in glob.glob(strFamsDir + os.sep + '*.faa'):
        strBase = os.path.basename(strFamilyPath)
        strCentroidPath = strCentroidsDir + os.sep + strBase
        strAlignPath = strFamsDir + os.sep + strBase + ".aln"
        strFamilyID = strBase.replace(".faa", "")

        # Only families with several members need an alignment.
        iMembers = sum(1 for _ in SeqIO.parse(strFamilyPath, "fasta"))
        if iMembers <= 1:
            shutil.copyfile(strFamilyPath, strCentroidPath)
            continue

        # Call muscle to produce an alignment.
        subprocess.check_call(
            [strMUSCLE, "-in", str(strFamilyPath), "-out", str(strAlignPath)])

        # BioPython's "dumb consensus" stands in for the family.
        # (EMBOSS cons/em_cons was used for this step previously.)
        algnFamily = AlignIO.read(str(strAlignPath), "fasta")
        seqDumb = AlignInfo.SummaryInfo(algnFamily).dumb_consensus(
            threshold=dThresh, ambiguous='X')
        recConsensus = SeqRecord(Seq(str(seqDumb)), id=strFamilyID)
        SeqIO.write(recConsensus, str(strCentroidPath), "fasta")

    # Collect every centroid, naming each record after its file.
    ageneAllGenes = []
    for strCentroidFile in glob.glob(strCentroidsDir + os.sep + '*.faa'):
        strGeneID = os.path.basename(os.path.splitext(strCentroidFile)[0])
        for gene in SeqIO.parse(strCentroidFile, "fasta"):
            gene.id = strGeneID
            ageneAllGenes.append(gene)

    SeqIO.write(ageneAllGenes, strOutputFile, "fasta")
def consensus(self):
    """Return the consensus of ``self.members`` as a record named 'consensus'.

    The members are wrapped in a multiple sequence alignment and reduced with
    ``dumb_consensus`` using ``self.consensus_threshold``; positions that do
    not reach the threshold are written as ``N``.
    """
    summary = AlignInfo.SummaryInfo(Align.MultipleSeqAlignment(self.members))
    consensus_seq = summary.dumb_consensus(
        threshold=self.consensus_threshold,
        ambiguous='N',
    )
    return SeqRecord.SeqRecord(consensus_seq, name='consensus', id='consensus')
def performClustalWAlignmentsForGroupwiseReference(outputDirectory, alleleFullList, runAlignments):
    """Align each HLA allele group with ClustalW and write its consensus.

    For every group returned by ``getAlleleGroups(alleleFullList)``:

    * if the group's alignment file already exists, the group is skipped;
    * a group with a single allele has that allele's sequence written as its
      consensus FASTA;
    * a larger group is aligned with ClustalW (only when ``runAlignments`` is
      true) and the dumb consensus of the alignment is written as FASTA,
      unless that consensus file already exists.

    Group FASTA files are taken from ``<outputDirectory>/AlleleGroupsAPD``;
    alignments go to ``<outputDirectory>/ClustalwAlignmentsAPD`` and consensus
    sequences to ``<outputDirectory>/ClustalwConsensusAPD``.

    Args:
        outputDirectory: Root directory containing ``AlleleGroupsAPD``.
        alleleFullList: Alleles handed to ``getAlleleGroups``.
        runAlignments: When false, the ClustalW command line is printed but
            not executed, and no consensus is derived for multi-allele groups.
    """
    print('Performing clustalW Alignments and finding consensus sequences')

    # Build every path from its components.  Deriving them by str.replace on a
    # full path breaks when outputDirectory itself contains 'Alignments' or
    # '.fasta', and when the platform separator is not '/'.
    alleleGroupDirectory = join(outputDirectory, 'AlleleGroupsAPD')
    clustalwOutputDirectory = join(outputDirectory, 'ClustalwAlignmentsAPD')
    clustalwConsensusOutputDirectory = join(outputDirectory, 'ClustalwConsensusAPD')
    for directory in (clustalwOutputDirectory, clustalwConsensusOutputDirectory):
        if not os.path.isdir(directory):
            os.mkdir(directory)

    alleleGroups = getAlleleGroups(alleleFullList)
    for index, alleleGroup in enumerate(alleleGroups):
        print('(' + str(index + 1) + '/' + str(len(alleleGroups)) + '): HLA-'
              + alleleGroup.Gene + '*' + alleleGroup.AlleleGroup)

        # No per-gene filter is applied here (the genesForAnalysis check is
        # disabled), so every group is processed.
        outputGroupFileName = join(alleleGroupDirectory, alleleGroup.FileName)
        clustalwAlignmentOutputFileName = join(
            clustalwOutputDirectory, alleleGroup.FileName.replace('.fasta', '.aln'))
        clustalwConsensusOutputFileName = join(
            clustalwConsensusOutputDirectory, alleleGroup.FileName)
        consensusAlleleName = os.path.basename(
            clustalwConsensusOutputFileName).replace('.fasta', '')

        if os.path.isfile(clustalwAlignmentOutputFileName):
            print('Alignment file ' + clustalwAlignmentOutputFileName
                  + ' already exists. Moving on...')
            continue

        if not (len(alleleGroup.Alleles) > 1):
            # There is only one allele in this group: it is its own consensus.
            print('Only one allele found')
            currentAllele = HLA_Allele()
            currentAllele.sequence = alleleGroup.Alleles[0].sequence
            currentAllele.alleleName = consensusAlleleName
            printFasta([currentAllele], clustalwConsensusOutputFileName, False, False, False)
            continue

        print(str(len(alleleGroup.Alleles)) + ' Alleles Found.')
        clustalwCommandline = ClustalwCommandline(
            "clustalw",
            infile=outputGroupFileName,
            outfile=clustalwAlignmentOutputFileName)
        print('ClustalW Alignment Commandline:\n' + str(clustalwCommandline))

        if not runAlignments:
            print('Not running Alignments because you told me not to.')
            continue

        # Perform the alignment.
        clustalwCommandline()
        if not os.path.isfile(clustalwAlignmentOutputFileName):
            print('Cannot find alignment file after completing alignment:'
                  + clustalwAlignmentOutputFileName)
            continue

        if os.path.isfile(clustalwConsensusOutputFileName):
            print('Consensus file ' + clustalwConsensusOutputFileName
                  + ' already exists. Moving on...')
            continue

        # Perform the consensus.  The dumb consensus is used; the gap
        # consensus differs only in allowing gaps and is not needed here.
        align = AlignIO.read(clustalwAlignmentOutputFileName, 'clustal')
        dumb_consensus = AlignInfo.SummaryInfo(align).dumb_consensus()

        # An HLA_Allele object is created only as a carrier so the consensus
        # can be written with printFasta.
        currentAllele = HLA_Allele()
        currentAllele.APDSequence = str(dumb_consensus)
        currentAllele.alleleName = consensusAlleleName
        currentAllele.outputDirectory = outputDirectory
        printFasta([currentAllele], clustalwConsensusOutputFileName, True, False, False)
def processing(raw_fasta_path, out_dir_path):
    """Deduplicate, align, cluster and summarise the sequences of a FASTA file.

    Everything is written into ``out_dir_path`` (created when missing):

    1. Duplicate records are removed and the result is saved under the
       input's base name.
    2. The file is aligned with ``clustalw2`` and the aligned sequences are
       dumped to ``alignment.txt``.
    3. The ClustalW guide tree is drawn to ``figure_with_labels.pdf``.
    4. A pairwise distance matrix is clustered hierarchically (centroid
       linkage, cut at half of the largest merge distance); the dendrogram
       and the reordered matrix are saved to ``dendogram.png``.
    5. For each cluster, ``clusters/cluster_<id>.fasta`` is written, its
       consensus, PSSM and consensus letter frequencies are appended to
       ``clusters/clusters_meta.txt`` and a bar chart is saved as
       ``clusters/frequencies_<id>.png``.

    Args:
        raw_fasta_path: Path of the input FASTA file.
        out_dir_path: Directory receiving all outputs.
    """
    if not os.path.exists(out_dir_path):
        logging.info("Making directory {0}".format(out_dir_path))
        os.makedirs(out_dir_path)

    deduplicated_fasta = remove_duplicates(SeqIO.parse(raw_fasta_path, "fasta"))
    base = os.path.basename(raw_fasta_path)
    fasta_path = os.path.join(out_dir_path, base)
    logging.info("Writing FASTA in {0}".format(fasta_path))
    SeqIO.write(deduplicated_fasta, fasta_path, "fasta")

    # Multiple sequence alignment
    cline = ClustalwCommandline("clustalw2", infile=fasta_path)
    stdout, stderr = cline()
    logging.info(cline)

    # ClustalW names its outputs by swapping the input's extension, so derive
    # them from the extension-less path rather than assuming ".fasta".
    fasta_root = os.path.splitext(fasta_path)[0]
    clustalw_result_path = fasta_root + ".aln"
    alignment_dict = SeqIO.to_dict(
        AlignIO.read(clustalw_result_path, "clustal"))

    # writing alignment table in .txt
    with open(os.path.join(out_dir_path, "alignment.txt"), "w") as fout:
        fout.write("\n".join(
            str(record.seq) for record in alignment_dict.values()))

    # alignment tree drawing
    tree_path = fasta_root + ".dnd"
    tree = Phylo.read(tree_path, "newick")
    tree.ladderize()
    # with labels
    Phylo.draw_graphviz(tree, label_func=lambda x: x.name.replace("ID=", ""))
    plt.savefig(os.path.join(
        out_dir_path, "figure_with_labels.pdf"))  # need pygraphviz, pylab

    # Clustering
    ids = dict(enumerate(alignment_dict.keys()))
    distance_matrix = np.zeros([len(ids)] * 2)
    for i, j in itertools.combinations(range(len(ids)), r=2):
        distance_matrix[i][j] = distance_matrix[j][i] = \
            distance(alignment_dict[ids[i]], alignment_dict[ids[j]])

    # Compute and plot dendrogram
    fig = plt.figure()
    axdendro = fig.add_axes([0.09, 0.1, 0.2, 0.8])
    Y = linkage(distance_matrix, method="centroid")
    cutoff = 0.5 * max(Y[:, 2])
    clusters = fcluster(Y, cutoff, "distance")
    Z = dendrogram(Y, orientation="right", color_threshold=cutoff)
    axdendro.set_yticks([])

    # Plot distance matrix, rows and columns reordered like the dendrogram
    axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.8])
    index = Z["leaves"]
    distance_matrix = distance_matrix[index, :]
    distance_matrix = distance_matrix[:, index]
    im = axmatrix.matshow(distance_matrix, aspect="auto", origin="lower")
    axmatrix.set_xticks([])
    axmatrix.set_yticks([])

    # Plot colorbar
    axcolor = fig.add_axes([0.91, 0.1, 0.02, 0.8])
    plt.colorbar(im, cax=axcolor)

    # Display and save figure
    dendogram_path = os.path.join(out_dir_path, "dendogram.png")
    fig.savefig(dendogram_path)

    fasta_clusters = defaultdict(list)
    for i, cluster in enumerate(clusters):
        fasta_id = ids[i]
        fasta_clusters[cluster].append(alignment_dict[fasta_id])

    # Saving information about clusters
    clusters_dir_path = os.path.join(out_dir_path, "clusters")
    if not os.path.exists(clusters_dir_path):
        os.makedirs(clusters_dir_path)
    clusters_meta_path = os.path.join(clusters_dir_path, "clusters_meta.txt")

    # The context manager guarantees the metadata file is flushed and closed
    # even if one of the clusters fails.
    with open(clusters_meta_path, "w") as meta_file:
        for cluster_id, cluster in fasta_clusters.items():
            cluster_path = os.path.join(clusters_dir_path,
                                        "cluster_{0}.fasta".format(cluster_id))
            SeqIO.write(cluster, cluster_path, "fasta")

            summary_align = AlignInfo.SummaryInfo(MultipleSeqAlignment(cluster))
            consensus = summary_align.dumb_consensus()
            pssm = summary_align.pos_specific_score_matrix(
                consensus, chars_to_ignore=['-'])

            frequencies = dict.fromkeys(IUPAC.protein.letters, 0)
            frequencies.update(
                (key, len(list(group)))
                for key, group in itertools.groupby(sorted(consensus)))
            # "X" is only present when the consensus has an ambiguous
            # position, so it must not be required here.
            frequencies.pop("X", None)

            # NOTE(review): template reproduced exactly as it appears in this
            # file; confirm the intended line breaks.
            meta_file.write("""Cluster ID: {0} Cluster size: {1} Consensus: {2} PSSM: {3} Frequencies in consensus: {4} """.format(cluster_id, len(cluster), textwrap.fill(str(consensus)), pssm, pprint.pformat(frequencies)))

            fig = plt.figure()
            pos = np.arange(len(IUPAC.protein.letters))
            width = .5  # gives histogram aspect to the bar diagram
            ax = plt.axes()
            ax.set_xticks(pos + (width / 2))
            ax.set_xticklabels(IUPAC.protein.letters)
            plt.bar(pos,
                    [frequencies[letter] for letter in IUPAC.protein.letters],
                    width, color='r')
            frequencies_path = os.path.join(
                clusters_dir_path, "frequencies_{0}.png".format(cluster_id))
            fig.savefig(frequencies_path)
            # One figure per cluster would otherwise stay alive until exit.
            plt.close(fig)
def test_read_write_clustal(self): """Read Clustalw/opuntia.aln as a Clustal alignment and check its seven records, the dumb consensus, the replacement dictionary, three position specific score matrices and the information content values.""" path = os.path.join(os.getcwd(), "Clustalw", "opuntia.aln") alignment = AlignIO.read(path, "clustal", alphabet=Alphabet.generic_dna) self.assertEqual(len(alignment), 7) seq_record = alignment[0] self.assertEqual(seq_record.description, "gi|6273285|gb|AF191659.1|AF191") self.assertEqual( seq_record.seq, Seq("TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATA----------ATATATTTCAAATTTCCTTATATACCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCCATTGATTTAGTGTACCAGA" )) seq_record = alignment[1] self.assertEqual(seq_record.description, "gi|6273284|gb|AF191658.1|AF191") self.assertEqual( seq_record.seq, "TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATA--------ATATATTTCAAATTTCCTTATATACCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA" ) seq_record = alignment[2] self.assertEqual(seq_record.description, "gi|6273287|gb|AF191661.1|AF191") self.assertEqual( seq_record.seq, "TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATA----------ATATATTTCAAATTTCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA" ) seq_record = alignment[3] self.assertEqual(seq_record.description, "gi|6273286|gb|AF191660.1|AF191") self.assertEqual( seq_record.seq, "TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATA----------ATATATTTATAATTTCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA" ) seq_record = alignment[4] self.assertEqual(seq_record.description, "gi|6273290|gb|AF191664.1|AF191") self.assertEqual( seq_record.seq, "TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATA------ATATATTTCAAATTCCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA" ) seq_record = alignment[5] self.assertEqual(seq_record.description, "gi|6273289|gb|AF191663.1|AF191") self.assertEqual( seq_record.seq,
"TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATA------ATATATTTCAAATTCCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTATACCAGA" ) seq_record = alignment[6] self.assertEqual(seq_record.description, "gi|6273291|gb|AF191665.1|AF191") self.assertEqual( seq_record.seq, "TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATATATATAATATATTTCAAATTCCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA" ) self.assertEqual(alignment.get_alignment_length(), 156) align_info = AlignInfo.SummaryInfo(alignment) consensus = align_info.dumb_consensus() self.assertIsInstance(consensus, Seq) self.assertEqual( consensus, "TATACATTAAAGXAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAATATATATATATATATAATATATTTCAAATTXCCTTATATATCCAAATATAAAAATATCTAATAAATTAGATGAATATCAAAGAATCTATTGATTTAGTGTACCAGA" ) dictionary = align_info.replacement_dictionary(["N", "-"]) self.assertEqual(len(dictionary), 16) self.assertAlmostEqual(dictionary[("A", "A")], 1395.0, places=1) self.assertAlmostEqual(dictionary[("A", "C")], 3.0, places=1) self.assertAlmostEqual(dictionary[("A", "G")], 13.0, places=1) self.assertAlmostEqual(dictionary[("A", "T")], 6.0, places=1) self.assertAlmostEqual(dictionary[("C", "A")], 3.0, places=1) self.assertAlmostEqual(dictionary[("C", "C")], 271.0, places=1) self.assertAlmostEqual(dictionary[("C", "G")], 0, places=1) self.assertAlmostEqual(dictionary[("C", "T")], 16.0, places=1) self.assertAlmostEqual(dictionary[("G", "A")], 5.0, places=1) self.assertAlmostEqual(dictionary[("G", "C")], 0, places=1) self.assertAlmostEqual(dictionary[("G", "G")], 480.0, places=1) self.assertAlmostEqual(dictionary[("G", "T")], 0, places=1) self.assertAlmostEqual(dictionary[("T", "A")], 6.0, places=1) self.assertAlmostEqual(dictionary[("T", "C")], 12.0, places=1) self.assertAlmostEqual(dictionary[("T", "G")], 0, places=1) self.assertAlmostEqual(dictionary[("T", "T")], 874.0, places=1) matrix = align_info.pos_specific_score_matrix(consensus, ["N", "-"])
self.assertEqual( str(matrix), """\ A C G T T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 1.0 0.0 0.0 6.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 X 4.0 0.0 3.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 C 0.0 7.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 C 0.0 7.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 4.0 A 4.0 0.0 0.0 0.0 T 0.0 0.0 0.0 3.0 A 3.0 0.0 0.0 0.0 T 0.0 0.0 0.0 1.0 A 1.0 0.0 0.0 0.0 T 0.0 0.0 0.0 1.0 A 1.0 0.0 0.0 0.0 T 0.0 0.0 0.0 1.0 A 1.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 C 1.0 6.0 0.0 0.0 A 6.0 0.0 0.0 1.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 X 0.0 3.0 0.0 4.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 2.0 0.0 5.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 T 0.0 0.0
0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 T 0.0 1.0 0.0 6.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 T 0.0 0.0 0.0 7.0 G 1.0 0.0 6.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 """) matrix = align_info.pos_specific_score_matrix( chars_to_ignore=["N", "-"]) self.assertEqual( str(matrix), """\ A C G T T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 1.0 0.0 0.0 6.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 X 4.0 0.0 3.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 C 0.0 7.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 C 0.0 7.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0
4.0 A 4.0 0.0 0.0 0.0 T 0.0 0.0 0.0 3.0 A 3.0 0.0 0.0 0.0 T 0.0 0.0 0.0 1.0 A 1.0 0.0 0.0 0.0 T 0.0 0.0 0.0 1.0 A 1.0 0.0 0.0 0.0 T 0.0 0.0 0.0 1.0 A 1.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 C 1.0 6.0 0.0 0.0 A 6.0 0.0 0.0 1.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 X 0.0 3.0 0.0 4.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 2.0 0.0 5.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 T 0.0 1.0 0.0 6.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 T 0.0 0.0 0.0 7.0 G 1.0 0.0 6.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 """) second_seq = alignment[1].seq matrix = align_info.pos_specific_score_matrix(second_seq, ["N", "-"]) self.assertEqual( str(matrix), """\ A C G T T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0
0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 1.0 0.0 0.0 6.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 4.0 0.0 3.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 C 0.0 7.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 G 0.0 0.0 7.0 0.0 C 0.0 7.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 4.0 A 4.0 0.0 0.0 0.0 - 0.0 0.0 0.0 3.0 - 3.0 0.0 0.0 0.0 - 0.0 0.0 0.0 1.0 - 1.0 0.0 0.0 0.0 - 0.0 0.0 0.0 1.0 - 1.0 0.0 0.0 0.0 - 0.0 0.0 0.0 1.0 - 1.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 C 1.0 6.0 0.0 0.0 A 6.0 0.0 0.0 1.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 T 0.0 3.0 0.0 4.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 C 0.0 2.0 0.0 5.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0
0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 C 0.0 7.0 0.0 0.0 T 0.0 1.0 0.0 6.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 T 0.0 0.0 0.0 7.0 G 1.0 0.0 6.0 0.0 T 0.0 0.0 0.0 7.0 A 7.0 0.0 0.0 0.0 C 0.0 7.0 0.0 0.0 C 0.0 7.0 0.0 0.0 A 7.0 0.0 0.0 0.0 G 0.0 0.0 7.0 0.0 A 7.0 0.0 0.0 0.0 """) value = align_info.information_content(5, 50, chars_to_ignore=["N"]) self.assertAlmostEqual(value, 88.42, places=2) value = align_info.information_content(chars_to_ignore=["N"]) self.assertAlmostEqual(value, 287.55, places=2) e_freq_table = {"G": 0.25, "C": 0.25, "A": 0.25, "T": 0.25} value = align_info.information_content(e_freq_table=e_freq_table, chars_to_ignore=["N"]) self.assertAlmostEqual(value, 287.55, places=2) self.assertEqual(align_info.get_column(1), "AAAAAAA") self.assertAlmostEqual(align_info.ic_vector[1], 2.00, places=2) self.assertEqual(align_info.get_column(7), "TTTATTT") self.assertAlmostEqual(align_info.ic_vector[7], 1.41, places=2) handle = StringIO() AlignInfo.print_info_content(align_info, fout=handle) self.assertEqual( handle.getvalue(), """\ 0 T 2.000 1 A 2.000 2 T 2.000 3 A 2.000 4 C 2.000 5 A 2.000 6 T 2.000 7 T 1.408 8 A 2.000 9 A 2.000 10 A 2.000 11 G 2.000 12 A 1.015 13 A 2.000 14 G 2.000 15 G 2.000 16 G 2.000 17 G 2.000 18 G 2.000 19 A 2.000 20 T 2.000 21 G 2.000 22 C 2.000 23 G 2.000 24 G 2.000 25 A 2.000 26 T 2.000 27 A 2.000 28 A 2.000 29 A 2.000 30 T 2.000 31 G 2.000 32 G 2.000 33 A 2.000 34 A 2.000 35 A 2.000 36 G 2.000 37 G 2.000 38 C 2.000 39 G 2.000 40 A 2.000 41 A 2.000 42 A 2.000 43 G 2.000
44 A 2.000 45 A 2.000 46 A 2.000 47 G 2.000 48 A 2.000 49 A 2.000 50 T 2.000 51 A 2.000 52 T 2.000 53 A 2.000 54 T 2.000 55 A 2.000 56 - 0.682 57 - 0.682 58 - 0.333 59 - 0.333 60 - -0.115 61 - -0.115 62 - -0.115 63 - -0.115 64 - -0.115 65 - -0.115 66 A 2.000 67 T 2.000 68 A 2.000 69 T 2.000 70 A 2.000 71 T 2.000 72 T 2.000 73 T 2.000 74 C 1.408 75 A 1.408 76 A 2.000 77 A 2.000 78 T 2.000 79 T 2.000 80 T 1.015 81 C 2.000 82 C 2.000 83 T 2.000 84 T 2.000 85 A 2.000 86 T 2.000 87 A 2.000 88 T 2.000 89 A 2.000 90 C 1.137 91 C 2.000 92 C 2.000 93 A 2.000 94 A 2.000 95 A 2.000 96 T 2.000 97 A 2.000 98 T 2.000 99 A 2.000 100 A 2.000 101 A 2.000 102 A 2.000 103 A 2.000 104 T 2.000 105 A 2.000 106 T 2.000 107 C 2.000 108 T 2.000 109 A 2.000 110 A 2.000 111 T 2.000 112 A 2.000 113 A 2.000 114 A 2.000 115 T 2.000 116 T 2.000 117 A 2.000 118 G 2.000 119 A 2.000 120 T 2.000 121 G 2.000 122 A 2.000 123 A 2.000 124 T 2.000 125 A 2.000 126 T 2.000 127 C 2.000 128 A 2.000 129 A 2.000 130 A 2.000 131 G 2.000 132 A 2.000 133 A 2.000 134 T 2.000 135 C 2.000 136 C 1.408 137 A 2.000 138 T 2.000 139 T 2.000 140 G 2.000 141 A 2.000 142 T 2.000 143 T 2.000 144 T 2.000 145 A 2.000 146 G 2.000 147 T 2.000 148 G 1.408 149 T 2.000 150 A 2.000 151 C 2.000 152 C 2.000 153 A 2.000 154 G 2.000 155 A 2.000 """)
def divergence(fastain, patient_id, cutoff): """Compute per-timepoint divergence from the first-timepoint consensus and write summary rows to CSV. `split(fastain, 1)` supplies the sequences grouped by timepoint; the reference is the upper-cased dumb consensus (threshold 0.01) of the first timepoint after sorting the timepoints with `natural_keys`, with 'X' replaced by 'N'. For each timepoint the code tallies ancestral and derived frequencies per site and, for sites whose derived frequency exceeds `cutoff * total_seq`, accumulates total, nonsynonymous and synonymous differences from the consensus (codon translations are compared), weighting each sequence by the integer in the third '_'-separated field of its name. Means, percentiles and standard deviations are appended to local lists, and a row is written to `csvfile_gag_b` or `csvfile_gp41_b` (objects defined outside this function) depending on whether "gag" or "gp41" occurs in `fastain`. No return statement is present, so the function returns None. NOTE(review): `total_seq` is reassigned while looping over sites and then reused in the cutoff test, so the threshold depends on the last value assigned -- confirm this is intended.""" # fasta = open('%s' % filename, 'r') split_fasta = split(fastain, 1) seqs_by_timepoint = split_fasta[0] total_seq = split_fasta[1] # conseq = consensus.seq[(sites_pos[0]-1):(sites_pos[1]-1)] # conseq = Seq(str(consensus).replace('-','N')) # consensus = Seq(conseq.seq.tostring().replace('-','N')) # seq_length = len(consensus) mean_divergence = [] median_divergence = [] lower_divergence_25 = [] upper_divergence_75 = [] lower_divergence_5 = [] upper_divergence_95 = [] divergence_std = [] mean_N_divergence = [] median_N_divergence = [] lower_N_divergence_25 = [] upper_N_divergence_75 = [] lower_N_divergence_5 = [] upper_N_divergence_95 = [] N_divergence_std = [] mean_S_divergence = [] median_S_divergence = [] lower_S_divergence_25 = [] upper_S_divergence_75 = [] lower_S_divergence_5 = [] upper_S_divergence_95 = [] S_divergence_std = [] dN = [] dN_med = [] dN_lower_25 = [] dN_upper_75 = [] dN_lower_5 = [] dN_upper_95 = [] dN_std = [] dS = [] dS_med = [] dS_lower_25 = [] dS_upper_75 = [] dS_lower_5 = [] dS_upper_95 = [] dS_std = [] patient = [] # parts = str.split(fastain, "/") # parts2 = str.split(parts[len(parts)-1], "_") patient.append(patient_id) nonsyn_sites, syn_sites = number_of_N_and_S_sites(fastain, None) sorted_timepoints = seqs_by_timepoint.keys() sorted_timepoints.sort(key=natural_keys) print sorted_timepoints first_timepoint = AlignIO.MultipleSeqAlignment( seqs_by_timepoint[sorted_timepoints[0]]) consensus = AlignInfo.SummaryInfo(first_timepoint).dumb_consensus( threshold=0.01).upper() conseq = Seq(str(consensus).replace('X', 'N')) prot = "" if "gag" in fastain: prot = "gag" else: prot = "gp41" sampleTimes = [] for t in sorted_timepoints: sampleTimes.append(float(t)) # for f in filelist: for t in range(0, len(sorted_timepoints)): divergence = [] divergence_N = [] divergence_S = [] divergence_dN = [] divergence_dS = [] # diff = 0 seqs_at_t = seqs_by_timepoint[sorted_timepoints[t]] seq_length = len(seqs_at_t[0].seq)
seq_freq = get_seq_freq(seqs_at_t) seqs_at_t_array = np.asarray(seqs_at_t) # i want to calculate derived freq wrt to consequence not minor freq per site #for c in xrange(0,len(consensus_seqs)): full_der_freq = [] total_site_freq = [] for i in range(seq_length): site_a = seqs_at_t_array[:, i] anc_freq = 0 der_freq = 0 #gap_count = "".join(site_a).count('-') for j in range(0, len(seq_freq)): if site_a[j] != '-': if conseq[i].lower() == site_a[j]: anc_freq += seq_freq[j] else: der_freq += seq_freq[j] # if (site_a[j] == 'a'): # A += seq_freq[j] # elif (site_a[j] == 'c'): # C += seq_freq[j] # elif (site_a[j] == 't'): # T += seq_freq[j] # elif (site_a[j] == 'g'): # G += seq_freq[j] total_seq = sum([der_freq, anc_freq]) full_der_freq.append(der_freq) total_site_freq.append(total_seq) #print [der_freq, anc_freq], total_seq #total_site_freq_per_consensus.append(total_site_freq) #full_der_freq_per_consensus.append(full_der_freq) #for c in xrange(0, len(consensus_seqs)): for i in range(seq_length): # print i, full_der_freq[i], patient_id, sorted_timepoints[t], total_seq, float( # full_der_freq[i]) / float(total_seq) diff = 0 diff_N = 0 diff_S = 0 count = total_site_freq[i] count1 = 0 if full_der_freq[i] > cutoff * total_seq: for each in seqs_at_t: parts = str.split(each.name, "_") freq = int(parts[2].strip()) seq = Seq(str(each.seq).upper().replace('-', 'N')) if (str(conseq[i]) != "N"): if (str(seq[i]) != "N"): count1 += freq if (conseq[i] != seq[i]): codon = [] if (i % 3 == 0): cp = i cp_a = i + 1 cp_b = i + 2 codon = [cp, cp_a, cp_b] elif (i % 3 == 1): cp_a = i - 1 cp = i cp_b = i + 1 codon = [cp_a, cp, cp_b] else: cp_a = i - 2 cp_b = i - 1 cp = i codon = [cp_a, cp_b, cp] consensus_aa = conseq[codon[0]:( codon[2] + 1)].translate() current_aa = seq[codon[0]:(codon[2] + 1)].translate() # print(str(consensus_aa), str(current_aa)) if 'X' in conseq[codon[0]:(codon[2] + 1)]: break if (str(consensus_aa) != str(current_aa)): diff_N += freq else: diff_S += freq #print i, current_aa,
consensus_aa, diff_N, diff_S, each.name, freq diff += freq #print each.name, sorted_timepoints[t], "d", float(diff), i, seq_length, count print(count, count1, i, diff, diff_N, diff_S) # # if((count-count1) != 0): # print(count, count1, i, diff, diff_N, diff_S) if count > 0: #print i, patient_id, diff, count divergence.extend([float(diff) / float(count)]) divergence_N.extend([float(diff_N) / float(count)]) divergence_S.extend([float(diff_S) / float(count)]) divergence_dN.extend( [float(diff_N) / float(nonsyn_sites) / float(count)]) divergence_dS.extend( [float(diff_S) / float(syn_sites) / float(count)]) if len(divergence) > 1: mean_divergence.append(np.mean(divergence)) median_divergence.append(np.percentile(divergence, 50)) lower_divergence_25.append(np.percentile(divergence, 25)) upper_divergence_75.append(np.percentile(divergence, 75)) lower_divergence_5.append(np.percentile(divergence, 5)) upper_divergence_95.append(np.percentile(divergence, 95)) divergence_std.append(np.std(divergence)) mean_N_divergence.append(np.mean(divergence_N)) median_N_divergence.append(np.percentile(divergence_N, 50)) lower_N_divergence_25.append(np.percentile(divergence_N, 25)) upper_N_divergence_75.append(np.percentile(divergence_N, 75)) lower_N_divergence_5.append(np.percentile(divergence_N, 5)) upper_N_divergence_95.append(np.percentile(divergence_N, 95)) N_divergence_std.append(np.std(divergence_N)) mean_S_divergence.append(np.mean(divergence_S)) median_S_divergence.append(np.percentile(divergence_S, 50)) lower_S_divergence_25.append(np.percentile(divergence_S, 25)) upper_S_divergence_75.append(np.percentile(divergence_S, 75)) lower_S_divergence_5.append(np.percentile(divergence_S, 5)) upper_S_divergence_95.append(np.percentile(divergence_S, 95)) S_divergence_std.append(np.std(divergence_S)) dN.append(np.mean(divergence_dN)) dN_med.append(np.percentile(divergence_dN, 50)) dN_lower_25.append(np.percentile(divergence_dN, 25)) dN_upper_75.append(np.percentile(divergence_dN, 75))
dN_lower_5.append(np.percentile(divergence_dN, 5)) dN_upper_95.append(np.percentile(divergence_dN, 95)) dN_std.append(np.std(divergence_dN)) dS.append(np.mean(divergence_dS)) dS_med.append(np.percentile(divergence_dS, 50)) dS_lower_25.append(np.percentile(divergence_dS, 25)) dS_upper_75.append(np.percentile(divergence_dS, 75)) dS_lower_5.append(np.percentile(divergence_dS, 5)) dS_upper_95.append(np.percentile(divergence_dS, 95)) dS_std.append(np.std(divergence_dS)) if ("gag" in fastain): csvfile_gag_b.write(patient_id + "," + str(sorted_timepoints[t]) + "," + str(np.mean(divergence)) + "," + str(np.percentile(divergence, 50)) + "," + str(np.percentile(divergence, 5)) + "," + str(np.percentile(divergence, 95)) + "," + str(np.mean(divergence_N)) + "," + str(np.percentile(divergence_N, 50)) + "," + str(np.percentile(divergence_N, 5)) + "," + str(np.percentile(divergence_N, 95)) + "," + str(np.mean(divergence_S)) + "," + str(np.percentile(divergence_S, 50)) + "," + str(np.percentile(divergence_S, 5)) + "," + str(np.percentile(divergence_S, 95)) + "\n") csvfile_gag_b.flush() elif ("gp41" in fastain): csvfile_gp41_b.write( patient_id + "," + str(sorted_timepoints[t]) + "," + str(np.mean(divergence)) + "," + str(np.percentile(divergence, 50)) + "," + str(np.percentile(divergence, 5)) + "," + str(np.percentile(divergence, 95)) + "," + str(np.mean(divergence_N)) + "," + str(np.percentile(divergence_N, 50)) + "," + str(np.percentile(divergence_N, 5)) + "," + str(np.percentile(divergence_N, 95)) + "," + str(np.mean(divergence_S)) + "," + str(np.percentile(divergence_S, 50)) + "," + str(np.percentile(divergence_S, 5)) + "," + str(np.percentile(divergence_S, 95)) + "\n") else: print "xxx", patient_id, sorted_timepoints[t] print patient_id, sorted_timepoints[t], len(divergence)
import glob

# Build a majority-rule consensus for every per-UMI alignment and append it to
# a single FASTA file.
#
# Command-line arguments, as used below:
#   sys.argv[1] -- base directory; holds the ".fasta.split/" input directory
#                  and receives the consensus output file
#   sys.argv[2], sys.argv[3] -- name components of the input directory;
#                  sys.argv[2] is also part of the output file name
directory = (sys.argv[1] + '/grouped_reads_passing_cutoff_' + sys.argv[2] +
             '_' + sys.argv[3] + '.fasta.split/')
files_in_direct = glob.glob(directory + "*.fasta")

# The output path does not depend on the input file, so compute it once.
consensus_path = sys.argv[1] + '/consensus_' + sys.argv[2] + '_ZIKV_UMI.fasta'

print('Sequences grouped...')
print("Script has detected " + str(len(files_in_direct)) +
      " files in the directory " + directory)
print('Generating consensus sequences for each UMI group...')

for fasta_path in files_in_direct:
    alignment = AlignIO.read(fasta_path, "fasta")
    summary_align = AlignInfo.SummaryInfo(alignment)
    # Columns where no residue reaches the 0.5 threshold are reported as 'N'.
    consensus = summary_align.dumb_consensus(threshold=0.5, ambiguous='N')

    # Record ID: the part of the file name after its last underscore, up to
    # the first dot (e.g. ".../group_ACGT.fasta" -> "ACGT").
    file_name = fasta_path.split("/")[-1]
    record_id = file_name.split("_")[-1].split(".")[0]

    # Open in append mode per record (so nothing is created when there are no
    # inputs) and let the context manager close the handle even if the write
    # fails.
    with open(consensus_path, "a") as output_con:
        output_con.write(">" + record_id + "\n" + str(consensus) + "\n")
def number_of_N_and_S_sites(fastain, sites):
    """Count nonsynonymous and synonymous sites in an alignment's consensus.

    The FASTA alignment at ``fastain`` is reduced to its upper-cased
    ``dumb_consensus``. For every nucleotide position in the requested range,
    each of A, T, G and C is substituted at that position of its codon and
    the translated amino acid is compared with the translation of the
    unmodified codon. A position contributes ``changed / 4`` to the
    nonsynonymous total and ``unchanged / 4`` to the synonymous total (the
    substitution that reproduces the existing base is one of the four and
    counts as unchanged). Codons containing ``X`` (the consensus' ambiguity
    character) contribute nothing.

    Codons are always taken in the reading frame that starts at index 0 of
    the consensus, irrespective of where the requested range starts.

    Args:
        fastain: Path to a FASTA alignment readable by ``AlignIO.read``.
        sites: ``None`` to scan the whole consensus, or a pair
            ``(first, last)`` of 1-based positions. As written, the scan
            covers 0-based indices ``first - 1`` up to but not including
            ``last - 1``.
            NOTE(review): that excludes position ``last`` itself -- confirm
            this is intended.

    Returns:
        Tuple ``(total_nonsyn, total_syn)``.
    """
    fastaseq = AlignIO.read('%s' % fastain, 'fasta')
    sequence = AlignInfo.SummaryInfo(fastaseq).dumb_consensus().upper()
    print(sequence)

    bases = ['A', 'T', 'G', 'C']
    if sites is None:
        site1 = 0
        site2 = len(sequence)
    else:
        site1 = sites[0] - 1
        site2 = sites[1] - 1

    total_nonsyn = 0
    total_syn = 0
    for i in range(site1, site2):
        # Offset of this site inside its codon (0, 1 or 2). The previous code
        # used ``codon_pos == 3`` for the third position -- a no-op comparison
        # -- so those sites were wrongly mutated at codon index 0.
        codon_pos = i % 3
        codon_start = i - codon_pos
        codon = sequence[codon_start:codon_start + 3]

        non_syn = 0
        syn = 0
        if "X" not in str(codon):
            # The reference amino acid is the same for all four substitutions.
            aa = str(codon.translate())
            for b in bases:
                codon_string = list(codon)
                codon_string[codon_pos] = b
                new_aa = Seq("".join(codon_string)).translate()
                if str(new_aa) == aa:
                    syn += 1
                else:
                    non_syn += 1

        total_nonsyn += non_syn / 4.0
        total_syn += syn / 4.0
    return total_nonsyn, total_syn
'A' : 0.25, 'T' : 0.25} e_freq_table = FreqTable.FreqTable(e_freq, FreqTable.FREQ, IUPAC.unambiguous_dna) print 'relative information:', align_info.information_content( e_freq_table = e_freq_table, chars_to_ignore = ['N']) print 'Column 1:', align_info.get_column(1) print 'IC for column 1:', align_info.ic_vector[1] print 'Column 7:', align_info.get_column(7) print 'IC for column 7:', align_info.ic_vector[7] print 'test print_info_content' AlignInfo.print_info_content(align_info) print "testing reading and writing fasta format..." to_parse = os.path.join(os.curdir, 'Quality', 'example.fasta') alignment = AlignIO.read(open(to_parse), "fasta", alphabet = Alphabet.Gapped(IUPAC.ambiguous_dna)) # test the base alignment stuff print 'all_seqs...' for seq_record in alignment: print 'description:', seq_record.description print 'seq:', repr(seq_record.seq) print 'length:', alignment.get_alignment_length() align_info = AlignInfo.SummaryInfo(alignment)
os.chdir(sys.argv[1]) listing = os.listdir(".") consensus = {} genConsensus = '' pssmGen = '' # this value should be read from the arguments or else use a default consensusThres = 0.7 # sys.argv[2] holds the path to the general alignment generalAlignment = AlignIO.parse(sys.argv[2], "fasta", alphabet=Gapped( IUPAC.ExtendedIUPACProtein(), "-")) lengthGenAl = 0 positionsToMask = [] for genAlignment in generalAlignment: sumGen = AlignInfo.SummaryInfo(genAlignment) genConsensus = sumGen.gap_consensus(consensusThres) for index, residue in enumerate(genConsensus): if genConsensus[index] == '-': continue if genConsensus[index] == 'X': continue positionsToMask.append(index) #pssmGen = sumGen.pos_specific_score_matrix(genConsensus,chars_to_ignore = ['-']) pssmGen = sumGen.pos_specific_score_matrix(genConsensus) lengthGenAl = len(genAlignment) print positionsToMask for item in listing: if item.endswith(".fas"):