genome = Fasta(genome_file) if sys.argv[3] == 'gene': with open(annotation_file, 'r') as f: for line in f: if "##" in line: continue split_line = line.strip().split('\t') if split_line[2] == 'gene': chrom = split_line[0] start = int(split_line[3]) - 1 end = int(split_line[4]) gene_id = split_line[8].split(';')[0].replace('gene_id ', '').replace( '"', '') gcc = GC(genome[chrom][start:end].seq) sys.stdout.write("%s\t%s\n" % (gene_id, gcc)) elif sys.argv[3] == 'exon': with open(annotation_file, 'r') as f: for line in f: if "##" in line: continue split_line = line.strip().split('\t') if split_line[2] == 'gene': gene_id = split_line[8].split(';')[0].replace('gene_id ', '').replace( '"', '') try: gcc = GC(seq) sys.stdout.write("%s\t%s\n" % (gene_id_bak, gcc))
plotdata = {} else: plotcreate = False threek_windows = [ sequence[start:start + 3000] for start in range(0, len(sequence), 3000) ] windowcount = 0 for threekindex, i in enumerate(threek_windows): lowgc_windows_num = 0 hundred_windows = [ i[start:start + 100] for start in range(0, len(i), 100) ] for hunindex, window in enumerate(hundred_windows): window = str(window).replace("N", "") GCperc = GC(window) if plotcreate: windowID = (threekindex) * 30 + hunindex + 1 plotdata[windowID] = GCperc if len( window ) > 20 and GCperc < 32: #can be 32% as in the Diner et al 2017 paper lowgc_windows_num += 1 #print("{} {}".format(windowID,window)) if lowgc_windows_num > 10: result.write(">{}_candidate centromere {}-{}\n{}\n".format( rec.name, (threekindex) * 3000, (threekindex + 1) * 3000, rec.seq)) # if plotcreate: #only runs in jupyter - graphical output # plotvalues = [int(plotdata[i]) for i in range(1, len(plotdata.keys()))] # plt.plot(range(1, len(plotdata.keys())), plotvalues)
def automated_intergenic_gc_fixer(
        genome_record,
        interval_list,
        gc_content_constraint_obj=GCContentConstraints(),
        changes_ok_in_feature_types=None,
        increase_GC=True):
    """Fixes GC content in the given intervals.

    Avoids making changes within any feature annotations and 20 bp upstream
    of CDS, unless feature type is specified in changes_ok_in_feature_types.

    Args:
        genome_record: Mutable SeqRecord. Its ``seq`` is replaced in place.
        interval_list: List of (start, end) tuples in which we want to fix
            GC. Note that bases just outside each interval may be changed,
            as they contribute to the local GC measure.
        gc_content_constraint_obj: Provides ``local_window_size`` and
            ``local_window_lower_bound`` (compared against GC as a fraction).
        changes_ok_in_feature_types: List of feature types that we want to
            allow changes inside of. ``None`` (the default) means no types.
        increase_GC: If True, increase GC. If False, decrease GC.

    Raises:
        AssertionError: If ``increase_GC`` is False, or if a length
            invariant is violated while rebuilding the sequence.

    Limitations:
        * Only changes bases that are not inside of any feature annotation
          or before CDS gene.
        * Only increases or decreases GC in all given intervals.
    """
    # Strategy: Sample positions in the interval and change them to
    # corresponding purine or pyrimidine. Recalculate interval GC until
    # above threshold.
    assert increase_GC, "Implementation only supports increasing GC right now."

    # Avoid a shared mutable default argument.
    if changes_ok_in_feature_types is None:
        changes_ok_in_feature_types = []

    # Record the original length for a final assertion.
    len_genome_before_fix = len(genome_record)

    # First calculate black-listed positions that cannot be changed. These
    # are positions that are inside of feature annotations, or just upstream
    # of CDS.
    pos_blacklist_set = _calculate_feature_annotation_shadow(
        genome_record,
        changes_ok_in_feature_types=changes_ok_in_feature_types)

    # We extend each target interval by half of the GC measurement window
    # size in each direction, since we measure the GC centered at each
    # position in the interval. Floor division keeps this an integer on
    # Python 3 so it can be used for slicing and range().
    half_window = int(gc_content_constraint_obj.local_window_size // 2)
    lower_bound = gc_content_constraint_obj.local_window_lower_bound

    # R -> R, Y -> Y
    GC_TRANSITION_TABLE = {
        'A': 'G',
        'T': 'C',
        'G': 'A',
        'C': 'T',
    }
    GC_bases = 'GC'

    for interval in interval_list:
        # Extend the interval to include the epsilon on each side. We change
        # positions in this extended_interval, although we still only target
        # raising the GC of windows centered about positions in the
        # unextended interval.
        extended_interval = (interval[0] - half_window,
                             interval[1] + half_window)
        interval_size = interval[1] - interval[0]
        extended_interval_size = extended_interval[1] - extended_interval[0]
        # Previously hard-coded as "+ 100", which only held for a 100 bp
        # measurement window.
        assert interval_size + 2 * half_window == extended_interval_size

        # Extract the sequence in the interval. We'll modify this extracted
        # sequence only, and then put all the parts back together again
        # once we're done.
        before_interval_seq = genome_record.seq[:extended_interval[0]]
        orig_interval_seq = genome_record.seq[
            extended_interval[0]:extended_interval[1]]
        after_interval_seq = genome_record.seq[extended_interval[1]:]

        # Make a copy to manipulate so that we can make a comparison
        # afterward.
        interval_seq = orig_interval_seq[:]
        assert extended_interval_size == len(interval_seq), (
            "Evaluating interval %s: \n %d != %d" %
            (str(interval), extended_interval_size, len(interval_seq)))

        # Perform the fix. March through each position whose window is
        # centered inside the original (unextended) interval, updating the
        # interval seq as necessary until that window is above the threshold.
        for interval_pos in range(half_window,
                                  extended_interval_size - half_window):
            # Calculate window coordinates in frame of interval_seq.
            window_start = interval_pos - half_window
            window_end = interval_pos + half_window
            window_seq = interval_seq[window_start:window_end]

            # Figure out which positions we can modify.
            pos_to_modify_list = [
                p for p in range(window_start, window_end)
                if _interval_pos_to_global_pos(p, extended_interval)
                not in pos_blacklist_set
            ]

            # Repeat while we have positions to modify, or until we achieve
            # the desired GC content.
            gc_content = GC(window_seq) / 100.0
            while pos_to_modify_list and gc_content < lower_bound:
                # Choose the next position randomly to avoid bias.
                pos_to_modify_idx = random.randint(
                    0, len(pos_to_modify_list) - 1)
                pos_to_modify = pos_to_modify_list.pop(pos_to_modify_idx)

                current = interval_seq[pos_to_modify]
                if current.upper() in GC_bases:
                    # Already G/C; changing it cannot raise GC.
                    continue
                new = GC_TRANSITION_TABLE[current]
                interval_seq = (interval_seq[:pos_to_modify] + new +
                                interval_seq[pos_to_modify + 1:])
                window_seq = interval_seq[window_start:window_end]
                gc_content = GC(window_seq) / 100.0

        # Put it back together.
        assert len(orig_interval_seq) == len(interval_seq), (
            "len before: %d | len after: %d" %
            (len(orig_interval_seq), len(interval_seq)))
        genome_record.seq = (before_interval_seq + interval_seq +
                             after_interval_seq)

    # Post-completion checks.
    assert len_genome_before_fix == len(genome_record), (
        "len before: %d | len after: %d" %
        (len_genome_before_fix, len(genome_record)))
def gc(self, seq):
    """Return the GC content of ``seq`` in percent (0-100), as given by GC()."""
    percent = GC(seq)
    return percent
max_len = len(seq_records[i].seq) longest_seq = seq_records[i].id elif len(seq_records[i].seq) < min_len: # update min_len and shortest_seq min_len = len(seq_records[i].seq) shortest_seq = seq_records[i].id print('Longest sequence is', longest_seq, 'with length', max_len, 'bp') print('Shortest sequence is', shortest_seq, 'with length', min_len, 'bp') # Creating a new sequence list containing sequences longer than 500bp # Calculate the average length of these sequences # calculate and print the percentage of GC contents long_seq_records = list() # empty list for sequences total_seq_length = 0 for sequence in seq_records: if len(sequence) > 500: long_seq_records.append(sequence) total_seq_length += len(sequence) gc = GC(sequence.seq) print('%GC in', sequence.id, 'is {:.2f}'.format(gc)) avg_seq_length = total_seq_length / len(long_seq_records) print('Average length for sequences longer than 500bp is', avg_seq_length) # Write sequences in the long_seq_records in a file with 'GenBank' format SeqIO.write(long_seq_records, 'long_sequences.fa', 'fasta')
def divergence():
    """Print divergence statistics for each pair of sequences of two species.

    Reads its inputs from ``sys.argv``:
        1: FASTA file with the DNA sequences of species 1.
        2: FASTA file with the DNA sequences of species 2.
        3: FASTA file with the protein sequences of species 1.
        4: FASTA file with the protein sequences of species 2.
        5: Output table path. The file is created (truncated) and closed,
           but rows are printed to standard output, as before.
        6: Method name forwarded to ``cal_dn_ds``.
        7: Path to the MUSCLE executable.

    For every index ``u`` the two proteins are written to
    ``outfile_unaligned.fa``, aligned with MUSCLE into
    ``outfile_aligned.aln``, turned into a codon alignment, and one
    ";"-separated row is printed: id, dN, dS, third-position distance, raw
    distance, both lengths, both GC contents, mean GC and the shorter
    length. When ``cal_dn_ds`` raises ValueError, ZeroDivisionError or
    KeyError, dN and dS are reported as 9.999. Any other error for a pair is
    reported with a traceback and the loop moves on to the next pair.
    """
    # Input arguments.
    fic1dna = sys.argv[1]  # DNA sequences of species 1
    fic2dna = sys.argv[2]  # DNA sequences of species 2
    fic1prot = sys.argv[3]  # protein sequences of species 1
    fic2prot = sys.argv[4]  # protein sequences of species 2
    outfile_path = sys.argv[5]  # output table, sep = ";"

    # The context manager guarantees the handle is closed even if parsing
    # the inputs fails before the loop starts.
    with open(outfile_path, "w", encoding='utf-8') as outfile_dn_ds:  # noqa: F841
        method = sys.argv[6]  # method used by cal_dn_ds
        muscle_exe = sys.argv[7]  # path to the MUSCLE executable

        # Load every input as a list of SeqRecord objects.
        seq1dna = list(
            SeqIO.parse(fic1dna, "fasta",
                        alphabet=IUPAC.IUPACUnambiguousDNA()))
        seq2dna = list(
            SeqIO.parse(fic2dna, "fasta",
                        alphabet=IUPAC.IUPACUnambiguousDNA()))
        seq1prot = list(SeqIO.parse(fic1prot, "fasta", alphabet=IUPAC.protein))
        seq2prot = list(SeqIO.parse(fic2prot, "fasta", alphabet=IUPAC.protein))

        # Header row of the table.
        print("Nombre de paires de sequences a analyser: ", len(seq1dna))
        print("seq.id", ";", "dN", ";", "dS", ";", "Dist_third_pos", ";",
              "Dist_brute", ";", "Length_seq_1", ";", "Length_seq2", ";",
              "GC_content_seq1", ";", "GC_content_seq2", ";", "GC", ";",
              "Mean_length")

        # Loop over each pair of sequences.
        for u in range(len(seq1dna)):
            try:
                # --- Alignment of the pair -------------------------------
                nuc1 = str(seq1dna[u].seq)
                nuc2 = str(seq2dna[u].seq)
                prot1 = str(seq1prot[u].seq)
                prot2 = str(seq2prot[u].seq)

                protein2 = SeqRecord(Seq(prot2, alphabet=IUPAC.protein),
                                     id='protein2')
                protein1 = SeqRecord(Seq(prot1, alphabet=IUPAC.protein),
                                     id='protein1')

                # Two-sequence unaligned FASTA file used as MUSCLE input.
                with open("outfile_unaligned.fa", "w",
                          encoding='utf-8') as output_handle:
                    SeqIO.write(protein1, output_handle, "fasta")
                    SeqIO.write(protein2, output_handle, "fasta")

                muscle_cline = MuscleCommandline(muscle_exe,
                                                 input="outfile_unaligned.fa",
                                                 out="outfile_aligned.aln")
                muscle_cline()

                alns = AlignIO.read("outfile_aligned.aln", "fasta")
                prot1 = str(alns[0].seq)  # aligned protein 1
                prot2 = str(alns[1].seq)  # aligned protein 2

                nuc2 = SeqRecord(Seq(nuc2,
                                     alphabet=IUPAC.IUPACUnambiguousDNA()),
                                 id='nuc2')
                nuc1 = SeqRecord(Seq(nuc1,
                                     alphabet=IUPAC.IUPACUnambiguousDNA()),
                                 id='nuc1')
                prot1 = SeqRecord(Seq(prot1, alphabet=IUPAC.protein),
                                  id='pro1')
                prot2 = SeqRecord(Seq(prot2, alphabet=IUPAC.protein),
                                  id='pro2')

                # Alignment object of the two aligned proteins, then the
                # corresponding codon alignment.
                aln = MultipleSeqAlignment([prot1, prot2])
                codon_aln = codonalign.build(aln, [nuc1, nuc2])

                lengthseq1 = len(nuc1.seq)
                lengthseq2 = len(nuc2.seq)
                GCcontentseq1 = GC(nuc1.seq)
                GCcontentseq2 = GC(nuc2.seq)
                GC_mean = (GCcontentseq1 + GCcontentseq2) / 2
                Min_length = min(lengthseq1, lengthseq2)

                # --- Divergence indices ----------------------------------
                # Drop every column where either sequence has a gap.
                ungapped = [(x, z)
                            for x, z in zip(codon_aln[0], codon_aln[1])
                            if x != "-" and z != "-"]
                seq1 = "".join(x for x, _ in ungapped)
                seq2 = "".join(z for _, z in ungapped)

                # Raw proportion of differing sites.
                compteur0 = sum(1 for a, b in zip(seq1, seq2) if a != b)
                distance_brute = round(compteur0 / len(seq1), 3)

                # Proportion of differing sites at third codon positions,
                # normalised by the number of third positions in sequence 2.
                seq1_third_pos = "".join(
                    c for c in seq1[2::3] if c.isalpha())
                seq2_third_pos = "".join(
                    c for c in seq2[2::3] if c.isalpha())
                compteur2 = len(seq2_third_pos)
                compteur3 = sum(
                    1 for a, b in zip(seq1_third_pos, seq2_third_pos)
                    if a != b)
                distance_third_pos = round(compteur3 / compteur2, 3)

                # dN and dS with the requested method; 9.999 marks pairs
                # for which the indices cannot be computed (e.g. saturation
                # too high).
                try:
                    dN, dS = cal_dn_ds(codon_aln[0], codon_aln[1],
                                       method=method)
                except (ValueError, ZeroDivisionError, KeyError):
                    dN = dS = 9.999

                print(seq1dna[u].id, ";", dN, ";", dS, ";",
                      distance_third_pos, ";", distance_brute, ";",
                      lengthseq1, ";", lengthseq2, ";", GCcontentseq1, ";",
                      GCcontentseq2, ";", GC_mean, ";", Min_length)
            except Exception:
                # Best effort: report the failing pair and keep going.
                # Unlike a bare "except:", this lets Ctrl-C / SystemExit
                # stop the run.
                traceback.print_exc()
                print("Une erreur est survenue pour la sequence: ",
                      seq1dna[u].id, "vs", seq2dna[u].id)
from Bio import SeqIO
from Bio.SeqUtils import GC

# Problem statement banner. Written as a function call so the script parses
# on Python 3 as well as Python 2 (single-argument form prints identically).
print("Given: At most 10 DNA strings in FASTA format (of length at most 1 kbp each). Return: The ID of the string having the highest GC-content, followed by the GC-content of that string. Rosalind allows for a default error of 0.001 in all decimal answers unless otherwise stated; please see the note on absolute error below.\n")

# Map each record id to its GC percentage.
id_gc = {}
for f in SeqIO.parse('input.fasta', 'fasta'):
    id_gc[f.id] = GC(f.seq)

# Look up the best id once and reuse it for both output lines.
best_id = max(id_gc, key=id_gc.get)
print(best_id)
print(id_gc[best_id])
def findBiasTrend(pairedEndMode, countFileLines, segmentFile):
    """Scatter-plot log(read count) against GC content for one gene's segments.

    Args:
        pairedEndMode: Truthy when the count lines are paired-end rows
            (segment pair in columns 0/1, count in 2, gene in 4, length in
            5, isoforms in 9); otherwise single-end rows are expected
            (segment in column 0, count in 1, gene in 3, length in 4,
            isoforms in 6).
        countFileLines: Sequence of tab-separated count lines; processing
            starts at index 0 and stops when the gene column changes.
        segmentFile: FASTA file (path or handle) of segment sequences.

    Side effects:
        Draws the scatter plot with ``plt.scatter`` and calls ``plt.show``.
    """
    segmentGCDict = dict()
    segmentHexamerDict = dict()
    segmentStrandDict = dict()
    for record in SeqIO.parse(segmentFile, "fasta"):
        segmentGCDict[record.id] = GC(record.seq)
        hexamerCounts = dict()
        segmentHexamerDict[record.id] = hexamerCounts
        segmentStrandDict[record.id] = record.description.split(
            " ")[1].split(":")[5]
        # A sequence of length L has L - 5 hexamers; the previous bound of
        # L - 6 dropped the last one. Keys are plain strings so counting
        # does not depend on how Seq objects hash.
        for i in range(len(record.seq) - 5):
            hexamer = str(record.seq[i:i + 6])
            hexamerCounts[hexamer] = hexamerCounts.get(hexamer, 0) + 1

    segmentIDs = []
    segmentCountsDict = dict()
    segmentCounts = []
    segmentLengths = []
    segmentIsoforms = dict()
    geneIsoforms = []

    countFileLineIndex = 0
    countFileLine = countFileLines[countFileLineIndex]
    splitLine = countFileLine.strip().split("\t")

    # NOTE(review): countFileLineEnd is not defined in this function;
    # presumably a module-level value holding the exclusive end index into
    # countFileLines -- TODO confirm.
    if pairedEndMode:
        currGene = splitLine[4]  # Get Current Gene
        while (countFileLineIndex < countFileLineEnd
               and splitLine[4] == currGene):
            segmentID1 = splitLine[0]
            segmentID2 = splitLine[1]
            pairCount = int(splitLine[2])
            if segmentID1 == segmentID2:
                segmentIDs.append(segmentID1)
                segmentLengths.append(int(splitLine[5]))
                segmentIsoforms[segmentID1] = splitLine[9].split(",")
                for isoform in segmentIsoforms[segmentID1]:
                    if isoform not in geneIsoforms:
                        geneIsoforms.append(isoform)
            # The pair's count is credited to both of its segments.
            segmentCountsDict[segmentID1] = (
                segmentCountsDict.get(segmentID1, 0) + pairCount)
            segmentCountsDict[segmentID2] = (
                segmentCountsDict.get(segmentID2, 0) + pairCount)
            countFileLineIndex += 1
            if countFileLineIndex < countFileLineEnd:
                countFileLine = countFileLines[countFileLineIndex]
                splitLine = countFileLine.strip().split("\t")
    else:
        currGene = splitLine[3]  # Get Current Gene
        while (countFileLineIndex < countFileLineEnd
               and splitLine[3] == currGene):
            segmentID = splitLine[0]
            segmentIDs.append(segmentID)
            segmentCountsDict[segmentID] = int(splitLine[1])
            segmentLengths.append(int(splitLine[4]))
            segmentIsoforms[segmentID] = splitLine[6].split(",")
            for isoform in segmentIsoforms[segmentID]:
                if isoform not in geneIsoforms:
                    geneIsoforms.append(isoform)
            countFileLineIndex += 1
            if countFileLineIndex < countFileLineEnd:
                countFileLine = countFileLines[countFileLineIndex]
                splitLine = countFileLine.strip().split("\t")

    # Only get segment counts of the segments collected above.
    for segmentID in segmentIDs:
        segmentCounts.append(segmentCountsDict[segmentID])

    segmentGC = []
    segmentCount = []
    for segmentID in segmentIDs:
        segmentGC.append(segmentGCDict[segmentID])
        segmentCount.append(math.log(segmentCountsDict[segmentID]))

    plt.scatter(segmentGC, segmentCount)
    plt.show()
def test_GC(self):
    """GC() reports 56.25 for a 32-base sequence containing 18 G/C bases."""
    observed_percent = GC("ACGGGCTACCGTATAGGCAAGAGATGATGCCC")
    expected_percent = 56.25
    self.assertEqual(observed_percent, expected_percent)
# 46.875 # 위의 예제에서는 서열의 GC contents를 구할 수 있다. # 이는 전통적인 형태로써 각각 G, C의 개수를 구하여 전체 서열에서 비율을 구하면 된다. # ================================================================================ # 이는 다양한 함수 및 수식을 정리하여 사용해야 하지만 아래와 같이 from Bio.SeqUtils import GC를 사용하면 복잡한 수식을 사용하지 않아도 간편하게 값을 확인할 수 있다. from Bio.Seq import Seq from Bio.Alphabet import IUPAC from Bio.SeqUtils import GC my_seq = Seq('GATCGATGGGCCTATATAGGATCGAAAATCGC', IUPAC.unambiguous_dna) GC(my_seq) # 46.875 # ================================================================================ # @@ 아래의 예제는 문자열의 슬라이싱 기능을 이용하여 서열을 자르거나 추출하는 것을 보여준다. # @@ 서열[x:y:z]처럼 서열을 잘라낼 수 있다. # @@ x는 시작위치, y는 마지막 위치, z는 범위를 나타낸다. # 아래의 예는 4번 bp 위치에서부터 12번 bp까지 길이 8bp의 서열을 추출하는 것이다. from Bio.Seq import Seq from Bio.Alphabet import IUPAC my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC", IUPAC.unambiguous_dna)
from Bio.SeqUtils import GC, GC123

# Read the DNA string typed by the user.
seq = input("DNA manual:")

# Each heading is printed before its measure is computed and printed.
for heading, measure in (("Total GC content:", GC),
                         ("GC by parts:", GC123)):
    print(heading)
    print(measure(seq))

# Wait for a key press before the script ends.
input("enter")
# Start offset of every non-overlapping "AA" match in seq.
index = [m.start() for m in re.finditer(r'AA', seq)]
# Keep only offsets with at least 20 characters left before seq_len.
# NOTE(review): the windows cut below are 23 long, so offsets within the
# last 22 characters give shorter windows -- confirm this is intended.
# seq_len is defined outside this view; presumably len(seq).
index_list = []
for i in index:
    if i + 20 > seq_len:
        continue
    else:
        index_list.append(i)
# Candidate windows: up to 23 characters starting at each kept offset.
# Note that this loop rebinds the name `index` from the list to an int.
pre_sirna = []
for index in index_list:
    pre_sirna.append(seq[index:index + 23])
# Keep candidates whose GC() value (rounded to 2 decimals) lies strictly
# between 30 and 58 and whose characters 2..20 contain no AAAA / TTTT run;
# index_gc_sirna records the matching start offsets.
pre_gc_sirna = []
index_gc_sirna = []
for i, seq1 in enumerate(pre_sirna):
    if (30 < round(GC(seq1), 2) < 58) and ('AAAA' not in seq1[2:21]) and ('TTTT' not in seq1[2:21]):
        pre_gc_sirna.append(seq1)
        index_gc_sirna.append(index_list[i])
# One score per surviving candidate, starting at 100.
num = len(pre_gc_sirna)
num_mat = [100] * num
# Candidates without their first two characters (the leading "AA").
SS = []
for seq2 in pre_gc_sirna:
    SS.append(seq2[2:])
# Add 40 to the score of every candidate that ends in "TT".
for i, seq2 in enumerate(SS):
    if seq2.endswith('TT'):
        num_mat[i] += 40
def result_tools3(request):
    """Render the result page with the GC value of the posted sequence.

    The sequence is read from the ``tool1`` POST field (falling back to the
    literal ``'default'``) and its GC() value is passed to the template
    under the ``res`` key.
    """
    posted_sequence = request.POST.get('tool1', 'default')
    context = {'res': GC(Seq(posted_sequence))}
    return render(request, 'mysite/result_tools.html', context)
from Bio.Seq import Seq
from Bio.SeqUtils import GC

# Wrap a literal 9-base string in a Seq object.
exon_seq = Seq("ATGCAGTAG")
# GC() value of that sequence, then print it.
gc_contents = GC(exon_seq)
print(gc_contents)
def return_genbank_dict(gb_file, key='annotation', seq_type='amino_acid'):
    """Build a dictionary of gene features from the first GenBank record.

    Only CDS, ncRNA, tRNA, mRNA and rRNA features are considered.

    Args:
        gb_file: Path to the GenBank file; only its first record is read.
        key: How entries are indexed. ``'annotation'`` (the default) keys by
            gene name (``unknown_<n>`` for unnamed genes) and, for named
            genes, also by each gene synonym; values are
            ``[header, sequence]`` lists. ``'locus'`` keys by locus tag,
            which is generally unique within a GenBank file; values are
            ``(locus, gene, seq, seq_type, synonyms)`` tuples. Any other
            value adds no feature entries.
        seq_type: Overwritten for every feature; kept for compatibility.

    Returns:
        A dict that always holds ``'accession'`` and ``'common_name'`` plus
        one entry per selected feature as described above.
    """
    result = {}
    # Close the file once the first record has been read.
    with open(gb_file) as handle:
        seq_record = next(SeqIO.parse(handle, "genbank"))

    accession = seq_record.annotations['accessions'][0].split('.')[0]
    common_name = seq_record.annotations['organism'].replace(' ', '_')
    result['accession'] = accession
    result['common_name'] = common_name

    unk_cnt = 1
    wanted_types = ('CDS', 'ncRNA', 'tRNA', 'mRNA', 'rRNA')
    for feature in seq_record.features:
        # Only gene-like feature types are recorded.
        if feature.type not in wanted_types:
            continue

        qualifiers = feature.qualifiers
        start = feature.location.start
        stop = feature.location.end
        strand = feature.strand

        gene = qualifiers['gene'][0] if 'gene' in qualifiers else 'unknown'

        # Reset per feature: previously a synonym list left over from an
        # earlier feature was re-used for genes without synonyms.
        synonym_list = []
        synonyms = 'NONE'
        if 'gene_synonym' in qualifiers:
            synonym_list = qualifiers['gene_synonym'][0].replace(
                ' ', '').split(';')
            synonyms = ':'.join(synonym_list)

        # Prefer the locus tag, fall back to the gene name.
        try:
            locus = qualifiers['locus_tag'][0]
        except (KeyError, IndexError):
            try:
                locus = qualifiers['gene'][0]
            except (KeyError, IndexError):
                locus = ''
                print('No locus associated. This should never be invoked meaning you are proper fracked. (The gbk file has an error).')

        try:
            seq = qualifiers['translation']
            seq_type = 'Protein'
        except KeyError:
            # No translation qualifier: fall back to the genomic sequence.
            seq = seq_record.seq[start:stop]
            seq_type = feature.type
            if feature.type == 'CDS':
                seq_type = 'Pseudo_Gene'
                # attempt to fix an error
                if strand == 1:
                    seq = seq.translate()
                else:
                    seq = seq.reverse_complement().translate()

        gc = "%2.1f" % GC(seq_record.seq[start:stop])

        if key == 'locus':
            result[locus] = (locus, gene, seq, seq_type, synonyms)
        elif key == 'annotation':
            header = '|'.join([
                accession, common_name, locus, gene,
                str(start), str(stop), str(strand), seq_type, synonyms, gc
            ])
            sequence = ''.join(seq)
            if gene == 'unknown':
                result['unknown_' + str(unk_cnt)] = [header, sequence]
                unk_cnt += 1
            else:
                result[gene] = [header, sequence]
                # Each synonym gets its own list, as before.
                for syn in synonym_list:
                    result[syn] = [header, sequence]
    return result
def main():
    """Print id, sequence length and GC() value of each FASTA record in argv[1]."""
    fasta_path = sys.argv[1]
    for entry in SeqIO.parse(fasta_path, "fasta"):
        print(entry.id)
        sequence = entry.seq
        print(len(sequence))
        print(GC(sequence))
import Bio
from Bio import SeqIO
from Bio.SeqUtils import GC

# Load every record of the sample FASTA file up front.
records = list(SeqIO.parse("sample_data/ls_orchid.fasta", "fasta"))

# Report the GC() value of the first 100 bases of the first ten records.
for dna_rec in records[:10]:
    print(dna_rec.description)
    dna = dna_rec.seq[:100]
    print("GC Content: {}%".format(GC(dna)))
    print("")
def circos_gc_var(record, windows=1000, shift=0):
    '''
    Build a circos data string of per-window GC deviation from the average.

    Each output line is "<contig> <start> <end> <gc_diff>" where gc_diff is
    the window's GC minus the GC of the whole record, e.g.
        average = 32
        GC(seq[3000:4000]) = 44
        diff = 44 - 32 = 12%

    :param record: SeqRecord; "assembly_gap" features split it into contigs
        named "<record.name>_<n>". Without gaps a single contig named after
        record.id (up to the first ".") is reported.
    :param windows: window size in bases.
    :param shift: offset added to the coordinates in the gap-free case only.
    :return: circos string with difference as compared to the average GC
    '''
    from Bio.SeqFeature import FeatureLocation

    circos_lines = []
    average_gc = GC(record.seq)

    gap_locations = [
        feature.location for feature in record.features
        if feature.type == "assembly_gap"
    ]
    if len(gap_locations) == 0:
        gap_locations.append(FeatureLocation(0, len(record.seq)))
    else:
        # NOTE(review): this sentinel starts one base after the last gap, so
        # the loop below only sees a 1-base slice for the final contig --
        # confirm whether the trailing contig should be covered in full.
        gap_locations.append(
            FeatureLocation(gap_locations[-1].end + 1, len(record.seq)))

    if len(gap_locations) > 1:
        for contig_index in range(len(gap_locations)):
            if contig_index == 0:
                seq = record.seq[0:gap_locations[contig_index].start]
                chr_start = 0
            else:
                previous_end = gap_locations[contig_index - 1].end
                seq = record.seq[
                    previous_end:gap_locations[contig_index].start]
                chr_start = previous_end
            contig_name = record.name + "_%s" % (contig_index + 1)

            if len(seq) == 0:
                continue
            # Contigs shorter than one window are covered by a single step.
            windows_range = min(len(seq), windows)

            for start in range(0, len(seq), windows_range):
                stop = start + windows
                # start/stop are relative to this contig, so slice the
                # contig itself; slicing record.seq here measured the wrong
                # region for every contig after the first.
                window_seq = seq[start:stop]
                if 'n' in window_seq:
                    continue
                if 'N' in window_seq:
                    continue
                gc = GC(window_seq) - average_gc
                if stop > len(seq):
                    stop = len(seq)
                section_start = chr_start + start
                section_end = chr_start + stop
                circos_lines.append(
                    "%s %s %s %s\n" %
                    (contig_name, section_start, section_end, gc))
    else:
        seq = record.seq
        contig_name = record.id.split('.')[0]
        for start in range(0, len(seq), windows):
            stop = start + windows
            gc = GC(record.seq[start:stop]) - average_gc
            if stop > len(seq):
                stop = len(seq)
            # Drop a short trailing window.
            if stop - start < 500:
                break
            circos_lines.append(
                "%s %s %s %s\n" %
                (contig_name, start + shift, stop + shift, gc))

    return ''.join(circos_lines)
' or it is unreadable') quit() #input_fasta_path = sys.argv[1] #output_table_path = sys.argv[2] output = open(output_table_path, 'w') if no_coverage_mode: output.write('contig\tlength\tgc\n') else: output.write('contig\tlength\tgc\tcov\n') for seq_record in SeqIO.parse(fasta_assembly_path, 'fasta'): length = str(len(seq_record)) gc = str(GC(str(seq_record.seq))) contig = str(seq_record.id) if use_coverage_table: if contig in coverages: cov = coverages[contig] else: print('Error, ' + contig + ' not found in the coverage table') quit() output.write(contig + '\t' + length + '\t' + gc + '\t' + str(cov) + '\n') elif no_coverage_mode: output.write(contig + '\t' + length + '\t' + gc + '\n') else: # Do a format check to make sure the contig name is right contigList = contig.split('_') if contigList[0] == 'NODE' and contigList[
def gc_values(handle):
    """Print one tab-separated "id, GC value" line per FASTA record in handle.

    All records are parsed and all GC values computed before anything is
    printed.
    """
    records = list(SeqIO.parse(handle, "fasta"))
    percents = [GC(record.seq) for record in records]
    for record, percent in zip(records, percents):
        print(record.id + "\t" + str(percent))
def dna_features(dna_sequences):
    """
    Compute per-sequence nucleotide, GC and codon features.

    Input: a list of DNA sequences (a list of length 1 is fine)
    Output: a dataframe with one row per sequence holding, in order, the 64
    codon frequencies, A/T/C/G frequencies, the GC() value, and the 64
    synonymous-codon bias columns (suffix "_b").
    """
    import numpy as np
    import pandas as pd
    from Bio.SeqUtils import GC, CodonUsage

    # Column order of the codon features; it fixes the dataframe layout.
    codon_order = (
        'ATA ATC ATT ATG ACA ACC ACG ACT AAC AAT AAA AAG AGC AGT AGA AGG '
        'CTA CTC CTG CTT CCA CCC CCG CCT CAC CAT CAA CAG CGA CGC CGG CGT '
        'GTA GTC GTG GTT GCA GCC GCG GCT GAC GAT GAA GAG GGA GGC GGG GGT '
        'TCA TCC TCG TCT TTC TTT TTA TTG TAC TAT TAA TAG TGC TGT TGA TGG'
    ).split()

    base_freqs = {base: [] for base in 'ATCG'}
    gc_percents = []
    codontable = {codon: [] for codon in codon_order}

    for sequence in dna_sequences:
        # nucleotide frequencies
        for base in 'ATCG':
            base_freqs[base].append(sequence.count(base) / len(sequence))

        # GC content
        gc_percents.append(GC(sequence))

        # codon frequency: count in-frame codons, normalise by the number of
        # recognised codons
        triplets = [
            sequence[pos:pos + 3] for pos in range(0, len(sequence), 3)
        ]
        raw_counts = [triplets.count(codon) for codon in codon_order]
        recognised = sum(raw_counts)
        for codon, count in zip(codon_order, raw_counts):
            codontable[codon].append(float(count) / recognised)

    # codon usage bias (_b): each codon count divided by the total count of
    # its synonymous family (left as the raw count when the family is absent)
    synonym_codons = CodonUsage.SynonymousCodons
    codontable2 = {codon + '_b': [] for codon in codon_order}

    for sequence in dna_sequences:
        triplets = [
            sequence[pos:pos + 3] for pos in range(0, len(sequence), 3)
        ]
        codon_counts = [triplets.count(codon) for codon in codon_order]

        for family in synonym_codons.values():
            total = 0
            for member in family:
                total += triplets.count(member)
            if total == 0:
                continue
            for position, codon in enumerate(codon_order):
                if codon in family:
                    codon_counts[position] /= total

        for column, value in zip(codontable2, codon_counts):
            codontable2[column].append(value)

    # assemble the two dataframes and add the scalar features
    features_codonbias = pd.DataFrame.from_dict(codontable2)
    features_dna = pd.DataFrame.from_dict(codontable)
    for base in 'ATCG':
        features_dna[base + '_freq'] = np.asarray(base_freqs[base])
    features_dna['GC'] = np.asarray(gc_percents)

    # concatenate dataframes & return
    return pd.concat([features_dna, features_codonbias], axis=1)
def whole_gc(records):
    """Return GC() of all record sequences concatenated in iteration order.

    Args:
        records: iterable of objects exposing a ``seq`` attribute.

    The sequences are joined in one pass; the previous ``seq += record.seq``
    loop copied the growing sequence for every record.
    """
    combined = "".join(str(record.seq) for record in records)
    return GC(combined)
# Solution 1: Biopython -- report the record with the highest GC content.
from Bio import SeqIO
from Bio.SeqUtils import GC

GCcont = 0
ID = ""
with open("rosalind_GC.txt", "r") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        record_gc = GC(record.seq)  # computed once per record
        if GCcont < record_gc:
            GCcont = record_gc
            ID = record.id
print(ID)
print(str(round(GCcont, 2)) + "%")

# Solution 2: plain dictionary mapping GC percentage -> sequence id.
# NOTE(review): this path uses a lower-case file name while the Biopython
# solution reads "rosalind_GC.txt"; confirm which spelling is intended.
with open('rosalind_gc.txt', 'r') as handle:
    raw = handle.read()
d = {}
for seqblock in raw.split(">")[1:]:
    parts = seqblock.split("\n")
    seq_id = parts[0]
    seq = ''.join(parts[1:])
    if not seq:
        # A header without sequence has no GC content to compute.
        continue
    gc = 100 * (seq.count("G") + seq.count("C")) / float(len(seq))
    d[gc] = seq_id
def circos_gc_content(record, windows=1000, shift=0):
    '''
    Build a circos data track with the GC content of consecutive windows.

    :param record: sequence record exposing ``seq``, ``features``, ``name``
        and ``id``; features of type "assembly_gap" separate the contigs
    :param windows: window size in bp
    :param shift: offset added to the coordinates of a gap-free record
    :return: circos string, one "<name> <start> <end> <GC>" line per window,
        where <GC> is the value returned by GC() for that window

    Records with assembly gaps are split into contigs named
    "<record.name>_<n>" (n counts from 1); coordinates stay relative to the
    concatenated sequence and the last window of each contig is clipped to
    the contig end. The contig after the last gap extends to the end of the
    record. Zero-length contigs produce no lines but keep their number.

    Records without gaps are reported under ``record.id``; the scan stops at
    the first window shorter than 500 bp.
    '''
    gap_locations = [
        feature.location for feature in record.features
        if feature.type == "assembly_gap"
    ]
    lines = []

    if gap_locations:
        # Contigs are the stretches between consecutive gaps, plus the
        # stretch from the last gap to the end of the record.
        contig_bounds = []
        contig_start = 0
        for gap in gap_locations:
            contig_bounds.append((contig_start, int(gap.start)))
            contig_start = int(gap.end)
        contig_bounds.append((contig_start, len(record.seq)))

        for index, (chr_start, chr_end) in enumerate(contig_bounds):
            seq = record.seq[chr_start:chr_end]
            contig_name = record.name + "_%s" % (index + 1)
            for start in range(0, len(seq), windows):
                stop = min(start + windows, len(seq))
                # Slice the contig itself: start/stop are contig-relative.
                gc = GC(seq[start:stop])
                lines.append("%s %s %s %s\n" % (contig_name,
                                                chr_start + start,
                                                chr_start + stop, gc))
    else:
        seq = record.seq
        contig_name = record.id
        for start in range(0, len(seq), windows):
            stop = min(start + windows, len(seq))
            if stop - start < 500:
                break
            gc = GC(seq[start:stop])
            lines.append("%s %s %s %s\n" % (contig_name, start + shift,
                                            stop + shift, gc))

    return "".join(lines)
from Bio import SeqIO
from Bio.SeqUtils import GC
import pylab

# Draw one GC/AT pie chart per record of the GenBank file.
for record in SeqIO.parse("NC_017108.gbk", "genbank"):
    gc = GC(record.seq)
    at = 100 - gc
    shares = [gc, at]
    caption = "GC: %0.1f porcento\nAT: %0.1f porcento" % (gc, at)
    pylab.pie(shares)
    pylab.title("Conteudo GC:")
    pylab.xlabel(caption)
    pylab.show()
def adapter_find(reference_database, reads, threads, max_intron_length,
                 working_dir, verbose):
    """Guess an adapter sequence from soft-clipped read ends.

    Steps:
      1. Copy the reads whose (numeric) id is below 10000 to a subset FASTA.
      2. Map the subset with mapping.minimap and extract the soft-clipped
         sequences with ``extractSoftclipped``.
      3. For reads with two clipped sequences, sort them into a "long" and a
         "short" FASTA file.
      4. For each file and each k in 21, 26, ..., 116, count k-mers with
         jellyfish and keep the last line of the count-sorted dump.
      5. Score each k-mer as (percentage of its most common base among A/T)
         minus GC(k-mer) and keep the best positive score. The "short" file
         is evaluated last and takes precedence; the "long" result is only
         used when no "short" k-mer has a positive score.

    Args:
        reference_database, threads, max_intron_length: passed to
            mapping.minimap.
        reads: path of the FASTA file with the reads.
        working_dir: directory in which the external tools run and where
            "adapter.fasta" is written.
        verbose: when true, echo every external command to stderr.

    Returns:
        Path of the FASTA file holding the selected adapter.

    Raises:
        ValueError: if no k-mer reaches a positive score.
    """
    subset_fasta = reads + "subset.10000.fasta"
    with open(subset_fasta, "w") as fh:
        for rec in SeqIO.parse(reads, "fasta"):
            if int(rec.id) < 10000:
                SeqIO.write(rec, fh, "fasta")

    bam = mapping.minimap(reference_database, subset_fasta, threads,
                          max_intron_length, working_dir, verbose)
    fasta_gz = bam + ".fasta.gz"
    cmd = "extractSoftclipped %s > %s" % (bam, fasta_gz)
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % cmd)
    extract_clip = subprocess.Popen(cmd, cwd=working_dir, shell=True)
    extract_clip.communicate()

    # Pair up the clipped sequences of the same read (the read name is the
    # part of the id before the first "_") and sort them by length.
    list_short = []
    list_long = []
    dict_uniq = {}
    with gzip.open(fasta_gz, "rt") as handle:
        for rec in SeqIO.parse(handle, "fasta"):
            name = str(rec.id).split("_")[0]
            if name not in dict_uniq:
                dict_uniq[name] = rec
            elif len(dict_uniq[name].seq) > len(rec.seq):
                list_long.append(dict_uniq[name])
                list_short.append(rec)
            else:
                list_short.append(dict_uniq[name])
                list_long.append(rec)

    long_file = fasta_gz + ".long.fasta"
    with open(long_file, "w") as fh:
        SeqIO.write(list_long, fh, "fasta")
    short_file = fasta_gz + ".short.fasta"
    with open(short_file, "w") as fh:
        SeqIO.write(list_short, fh, "fasta")

    kmer_done = None
    for clip_path, clip_label in [(long_file, "long"), (short_file, "short")]:
        list_kmer = []
        for kmer_size in range(21, 120, 5):
            cmd = "jellyfish count -s 10000000 -m %s -o %s.%s.kmer %s" % (
                kmer_size, kmer_size, clip_label, clip_path)
            if verbose:
                sys.stderr.write('Executing: %s\n\n' % cmd)
            jelly_count = subprocess.Popen(cmd, cwd=working_dir, shell=True)
            jelly_count.communicate()

            cmd = ("jellyfish dump -L 2 -ct %s.%s.kmer | sort -k2n | "
                   "tail -n 1" % (kmer_size, clip_label))
            if verbose:
                sys.stderr.write('Executing: %s\n\n' % cmd)
            jelly_dump = subprocess.Popen(cmd, cwd=working_dir,
                                          stdout=subprocess.PIPE, shell=True)
            out_dump = jelly_dump.communicate()[0].decode('utf-8')
            mer = out_dump.split("\t")[0]

            bias_count = max(mer.count("A"), mer.count("T"))
            bias_percent = (bias_count / kmer_size) * 100
            list_kmer.append((mer, bias_percent - GC(mer)))

        # Keep the highest positive score. Compare the raw float: truncating
        # the running best with int() let a lower-scoring k-mer replace it.
        best_score = 0
        for mer, score in list_kmer:
            if score > best_score:
                best_score = score
                kmer_done = mer

    if kmer_done is None:
        raise ValueError(
            "No candidate adapter k-mer with a positive A/T-bias score found")

    adapter_file = os.path.join(working_dir, "adapter.fasta")
    with open(adapter_file, "w") as fh:
        record = SeqRecord(Seq(str(kmer_done)), id="adapter")
        SeqIO.write(record, fh, "fasta")
    return adapter_file
def _summarize_gc_report_interval(running_interval, running_gc_total):
    """Build the report row for one run of consecutive out-of-bounds windows."""
    interval_size = running_interval[1] - running_interval[0] + 1
    return {
        'interval': str(running_interval),
        'interval_size': interval_size,
        'avg_gc': running_gc_total / interval_size,
    }


def fix_gc_content(refactor_context,
                   gc_content_constraint_obj,
                   start_bound=None,
                   end_bound=None,
                   debug=False,
                   report_file=None):
    """Fixes the GC content according to desired constraints.

    Strategy: Slide a window across the genome and bump any regions that
    fall outside of the constraint.

    Now, there is some subtlety here in that we have a good idea of how
    to fix coding regions (i.e. synonymous codon swaps), but want to avoid
    messing with stuff outside of coding regions. And so when we identify a
    bad window, for now, we limit fixes to any coding portions of that window
    only. A window is only fixed when it overlaps exactly one CDS feature.

    TODOs:
        * We are only dealing with local window for now. Figure out how we
          want to deal with global window.

    Args:
        refactor_context: The RefactorContext.
        gc_content_constraint_obj: A GCContentConstraints object that
            allows the client to configure the fixes.
        start_bound: Optionally bound fixes to start at this position.
        end_bound: Optionally bound fixes to end at this position.
        debug: Debug flag. Prints helpful output. For now, runs analysis only,
            and doesn't actually make changes.
        report_file: Optional path of a CSV report. Its rows are the runs of
            out-of-bounds windows, which are only collected in debug mode.

    Returns:
        A copy of the genome_record contained within refactor_context
        with the GC content made to satisfy constraints.
    """
    print('Fixing GC content...')

    updated_genome_record = copy.deepcopy(refactor_context.get_genome_record())

    # Figure out effective bounds.
    effective_start_bound = start_bound if start_bound else 0
    effective_end_bound = end_bound if end_bound else len(
        updated_genome_record)

    # Features that we can do synonymous swaps in
    swappable_features = [
        feature for feature in updated_genome_record.features
        if feature.type == 'CDS'
    ]

    # Floor division keeps the positions integral.
    window_size = gc_content_constraint_obj.local_window_size
    half_window = window_size // 2

    # Slide the window looking for violations of GC content restrictions.
    window_center_range = range(effective_start_bound + half_window,
                                effective_end_bound - half_window)

    report_intervals = []

    # Only used in debug mode: the current run of out-of-bounds windows.
    running_interval = None
    running_gc_total = 0

    for window_center_pos in window_center_range:
        window_start_pos = window_center_pos - half_window
        window_end_pos = window_start_pos + window_size
        window_seq = updated_genome_record.seq[window_start_pos:window_end_pos]
        gc_content = GC(window_seq) / 100

        if (gc_content_constraint_obj.local_window_lower_bound <= gc_content
                <= gc_content_constraint_obj.local_window_upper_bound):
            # GC is all good.
            if debug:
                # Close the running interval and print it out.
                if running_interval:
                    row = _summarize_gc_report_interval(
                        running_interval, running_gc_total)
                    report_intervals.append(row)
                    print('%s, size: %d, average_gc: %f' %
                          (row['interval'], row['interval_size'],
                           row['avg_gc']))
                running_interval = None
                running_gc_total = 0
            continue

        if debug:
            # Analysis only: extend the running interval, change nothing.
            if not running_interval:
                running_interval = (window_center_pos, window_center_pos)
            else:
                running_interval = (running_interval[0], window_center_pos)
            running_gc_total += gc_content
            continue

        # As a first stab, only attempt fixes in the simplest of cases.
        # That is, only do synonymous codon swaps within parts of features
        # that are not overlapping.

        # First identify all features overlaped by the interval.
        interval = (window_start_pos, window_end_pos)
        overlapped_features = calc_interval_list_to_features_overlapped(
            [interval], swappable_features)[0]
        if len(overlapped_features) != 1:
            # TODO: Eventually handle more complex cases.
            continue

        # Otherwise attempt to fix.
        feature = overlapped_features[0]
        feature_seq = str(feature.extract(updated_genome_record.seq))

        # Figure out the specific codons that need to be changed.
        affected_codon_indeces = get_region_codon_indeces_in_feature(
            feature, interval)
        avoid_codons_in_positions = {}
        for codon_index in affected_codon_indeces:
            codon = feature_seq[codon_index * 3:codon_index * 3 + 3]
            # NOTE(review): the window check above divides GC() by 100, which
            # suggests GC() returns a percentage; if so, this only matches
            # codons without any G/C. Confirm the intended threshold.
            if GC(codon) < 1.0:
                avoid_codons_in_positions[codon_index] = codon

        # Perform replace.
        first_codon_to_modify = affected_codon_indeces[0]
        last_codon_to_modify = affected_codon_indeces[-1]
        assert first_codon_to_modify <= last_codon_to_modify
        result = replace_codons_in_single_feature(
            refactor_context,
            feature.id,
            explicit_genome_record=updated_genome_record,
            start_codon_index=first_codon_to_modify,
            last_codon_index=last_codon_to_modify,
            avoid_codons_in_positions=avoid_codons_in_positions)
        if not result['is_success']:
            # TODO: Do something better for debugging here, although
            # we don't necessarily need each replace to succeed.
            continue

        update_seq_record_feature(updated_genome_record, feature.id, result)

    # A run that reaches the end of the scanned range is never followed by an
    # in-bounds window, so it has to be closed here or it would be lost.
    if debug and running_interval:
        row = _summarize_gc_report_interval(running_interval,
                                            running_gc_total)
        report_intervals.append(row)
        print('%s, size: %d, average_gc: %f' %
              (row['interval'], row['interval_size'], row['avg_gc']))

    print('...Done.')

    if report_file:
        print('Writing report.')
        REPORT_FIELDNAMES = [
            'interval',
            'interval_size',
            'avg_gc',
        ]
        with open(report_file, 'w') as report_fh:
            writer = csv.DictWriter(report_fh, REPORT_FIELDNAMES)
            writer.writeheader()
            for interval in report_intervals:
                writer.writerow(interval)

    return updated_genome_record
else: printlog("\t\tUnable to determine a likely cluster.") printlog("\tBase One Guess") if base_ones: for base_one in base_ones: out = "\t\tIn the blast hit to %s, query position %s matches subject position %s." % (base_one[0], str(base_one[1]), str(base_one[2])) out2 = "\t\tLikely Base 1 position: %s in %s" % (base_one[1], base_one[4]) if base_one[3]: out += " (After contig was reverse-complemented.)" printlog(out) printlog(out2) else: printlog("\t\tUnable to find Base 1.") printlog("\tGC Info") i=0 all_contig_objects=SeqIO.parse(open('%s/454AllContigs.fna' % project_dir,'r'),'fasta') for contig in all_contig_objects: if i==10: break printlog("\t\t%s\t%s %%" % (contig.id, round(GC(contig.seq),1))) i += 1 printlog("\tCoverage Info") for contig in contig_list: printlog("\t\t%s\t%s (assembled)\t%s (estimated for entire fastq)" % (contig[0],contig[4],contig[5])) log_file.close()
def _summarize_extreme_gc_interval(running_interval, running_gc_total):
    """Build the result entry for one run of out-of-bounds window centers."""
    interval_size = running_interval[1] - running_interval[0] + 1
    return {
        'interval': running_interval,
        'avg_gc': running_gc_total / interval_size
    }


def find_gc_content_extremes(genome_record,
                             gc_content_constraint_obj=GCContentConstraints(),
                             start_bound=None,
                             end_bound=None,
                             debug=False):
    """Finds runs of extreme GC content.

    A window of gc_content_constraint_obj.local_window_size is centered on
    every position of the scanned range; consecutive centers whose window GC
    falls outside the local bounds are merged into one interval. A run that
    reaches the end of the scanned range is reported as well.

    Args:
        genome_record: The SeqRecord object with the sequence.
        gc_content_constraint_obj: A GCContentConstraints object that
            allows the client to configure the fixes.
        start_bound: Optionally bound fixes to start at this position.
        end_bound: Optionally bound fixes to end at this position.
        debug: Currently unused.

    Returns:
        List of objects with keys:
            * interval: (first, last) window-center positions of the run,
                both inclusive.
            * avg_gc: Average GC content over this interval.
    """
    extreme_gc_intervals = []

    effective_start_bound = start_bound if start_bound else 0
    effective_end_bound = end_bound if end_bound else len(genome_record.seq)

    running_interval = None
    running_gc_total = 0

    # Floor division keeps the positions integral.
    window_size = gc_content_constraint_obj.local_window_size
    half_window = window_size // 2

    # Slide the window looking for violations of GC content restrictions.
    window_center_range = xrange(effective_start_bound + half_window,
                                 effective_end_bound - half_window)

    # Necessary initialization for our get_GC_optimized() method.
    gc_content = None

    for window_center_pos in window_center_range:
        window_start_pos = window_center_pos - half_window
        window_end_pos = window_start_pos + window_size
        # The previous value is handed back to GC() on every call.
        gc_content = GC(genome_record.seq, window_start_pos, window_end_pos,
                        gc_content)

        if (gc_content_constraint_obj.local_window_lower_bound <= gc_content
                <= gc_content_constraint_obj.local_window_upper_bound):
            # End of extreme interval. Record the current interval and reset.
            if running_interval:
                extreme_gc_intervals.append(
                    _summarize_extreme_gc_interval(running_interval,
                                                   running_gc_total))
            running_interval = None
            running_gc_total = 0
        else:
            # Create or update the running interval.
            if not running_interval:
                running_interval = (window_center_pos, window_center_pos)
            else:
                running_interval = (running_interval[0], window_center_pos)
            running_gc_total += gc_content

    # A run that is still open after the last window is never followed by an
    # in-bounds window, so record it here instead of dropping it.
    if running_interval:
        extreme_gc_intervals.append(
            _summarize_extreme_gc_interval(running_interval,
                                           running_gc_total))

    return extreme_gc_intervals
def main(argv): #default parameters mg_lst = [] ref_lst = [] e_val = 1e-5 alen = 50.0 alen_percent = True alen_bp = False iden = 95.0 name= "output" fmt_lst = ["fasta"] supported_formats =["fasta", "csv"] iterations = 1 alen_increment = 5.0 iden_increment = 0.0 blast_db_Dir = "" results_Dir = "" input_files_Dir = "" ref_out_0 = "" blasted_lst = [] continue_from_previous = False #poorly supported, just keeping the directories skip_blasting = False debugging = False sheared = False shear_val = None logfile = "" try: opts, args = getopt.getopt(argv, "r:m:n:e:a:i:s:f:h", ["reference=", "metagenome=", "name=", "e_value=", "alignment_length=", "identity=","shear=","format=", "iterations=", "alen_increment=", "iden_increment=","continue_from_previous","skip_blasting","debugging", "help"]) except getopt.GetoptError: usage() sys.exit(2) for opt, arg in opts: if opt in ("-h", "--help"): usage() sys.exit() # elif opt in ("--recover_after_failure"): # recover_after_failure = True # print "Recover after failure:", recover_after_failure elif opt in ("--continue_from_previous"): continue_from_previous = True if debugging: print "Continue after failure:", continue_from_previous elif opt in ("--debugging"): debugging = True if debugging: print "Debugging messages:", debugging elif opt in ("-r", "--reference"): if arg: ref_lst=arg.split(',') #infiles = arg if debugging: print "Reference file(s)", ref_lst elif opt in ("-m", "--metagenome"): if arg: mg_lst=arg.split(',') #infiles = arg if debugging: print "Metagenome file(s)", mg_lst elif opt in ("-f", "--format"): if arg: fmt_lst=arg.split(',') #infiles = arg if debugging: print "Output format(s)", fmt_lst elif opt in ("-n", "--name"): if arg.strip(): name = arg if debugging: print "Project name", name elif opt in ("-e", "--e_value"): try: e_val = float(arg) except: print "\nERROR: Please enter numerical value as -e parameter (default: 1e-5)" usage() sys.exit(1) if debugging: print "E value", e_val elif opt in ("-a", 
"--alignment_length"): if arg.strip()[-1]=="%": alen_bp = False alen_percent = True else: alen_bp = True alen_percent = False try: alen = float(arg.split("%")[0]) except: print "\nERROR: Please enter a numerical value as -a parameter (default: 50.0)" usage() sys.exit(1) if debugging: print "Alignment length", alen elif opt in ("-i", "--identity"): try: iden = float(arg) except: print "\nERROR: Please enter a numerical value as -i parameter (default: 95.0)" usage() sys.exit(1) if debugging: print "Alignment length", iden elif opt in ("-s", "--shear"): sheared = True try: shear_val = int(arg) except: print "\nERROR: Please enter an integer value as -s parameter" usage() sys.exit(1) if debugging: print "Alignment length", iden elif opt in ("--iterations"): try: iterations = int(arg) except: print "\nWARNING: Please enter integer value as --iterations parameter (using default: 1)" if debugging: print "Iterations: ", iterations elif opt in ("--alen_increment"): try: alen_increment = float(arg) except: print "\nWARNING: Please enter numerical value as --alen_increment parameter (using default: )", alen_increment if debugging: print "Alignment length increment: ", alen_increment elif opt in ("--iden_increment"): try: iden_increment = float(arg) except: print "\nWARNING: Please enter numerical value as --iden_increment parameter (using default: )", iden_increment if debugging: print "Alignment length increment: ", iden_increment elif opt in ("--skip_blasting"): skip_blasting = True if debugging: print "Blasting step omitted; Using previous blast output." 
for ref_file in [x for x in ref_lst if x]: try: # with open(ref_file, "rU") as hand_ref: pass except: print "\nERROR: Reference File(s) ["+ref_file+"] doesn't exist" usage() sys.exit(1) for mg_file in [x for x in mg_lst if x]: try: # with open(mg_file, "rU") as hand_mg: pass except: print "\nERROR: Metagenome File(s) ["+mg_file+"] doesn't exist" usage() sys.exit(1) for fmt in [x for x in fmt_lst if x]: if fmt not in supported_formats: print "\nWARNING: Output format [",fmt,"] is not supported" print "\tUse -h(--help) option for the list of supported formats" fmt_lst=["fasta"] print "\tUsing default output format: ", fmt_lst[0] project_dir = name if not continue_from_previous: if os.path.exists(project_dir): shutil.rmtree(project_dir) try: os.mkdir(project_dir) except OSError: print "ERROR: Cannot create project directory: " + name raise print "\n\t Initial Parameters:" print "\nProject Name: ", name,'\n' print "Project Directory: ", os.path.abspath(name),'\n' print "Reference File(s): ", ref_lst,'\n' if sheared: print "Shear Reference File(s):", str(shear_val)+"bp",'\n' print "Metagenome File(s): ", mg_lst,'\n' print "E Value: ", e_val, "\n" if alen_percent: print "Alignment Length: "+str(alen)+'%\n' if alen_bp: print "Alignment Length: "+str(alen)+'bp\n' print "Sequence Identity: "+str(iden)+'%\n' print "Output Format(s):", fmt_lst,'\n' if iterations > 1: print "Iterations: ", iterations, '\n' print "Alignment Length Increment: ", alen_increment, '\n' print "Sequence identity Increment: ", iden_increment, '\n' #Initializing directories blast_db_Dir = name+"/blast_db" if not continue_from_previous: if os.path.exists(blast_db_Dir): shutil.rmtree(blast_db_Dir) try: os.mkdir(blast_db_Dir) except OSError: print "ERROR: Cannot create project directory: " + blast_db_Dir raise results_Dir = name+"/results" if not continue_from_previous: if os.path.exists(results_Dir): shutil.rmtree(results_Dir) try: os.mkdir(results_Dir) except OSError: print "ERROR: Cannot create project 
directory: " + results_Dir raise input_files_Dir = name+"/input_files" if not continue_from_previous: if os.path.exists(input_files_Dir): shutil.rmtree(input_files_Dir) try: os.mkdir(input_files_Dir) except OSError: print "ERROR: Cannot create project directory: " + input_files_Dir raise # Writing raw reference files into a specific input filename input_ref_records = {} for reference in ref_lst: ref_records_ind = parse_contigs_ind(reference) #ref_records = dict(ref_records_ind) input_ref_records.update(ref_records_ind) ref_records_ind.close() #input_ref_records.update(ref_records) ref_out_0 = input_files_Dir+"/reference0.fna" if (sheared & bool(shear_val)): with open(ref_out_0, "w") as handle: SeqIO.write(genome_shredder(input_ref_records, shear_val).values(), handle, "fasta") #NO NEED TO CLOSE with statement will automatically close the file else: with open(ref_out_0, "w") as handle: SeqIO.write(input_ref_records.values(), handle, "fasta") # Making BLAST databases #output fname from before used as input for blast database creation input_ref_0 = ref_out_0 title_db = name+"_db"#add iteration functionality #diamond os.mkdir(blast_db_Dir+"/iteration"+str(iterations)) outfile_db = blast_db_Dir+"/iteration"+str(iterations)+"/db" #change into for loop #os.system("makeblastdb -in "+input_ref_0+" -dbtype prot -title "+title_db+" -out "+outfile_db+" -parse_seqids") os.system("diamond makedb --in "+input_ref_0+" --db "+outfile_db) # BLASTing query contigs if not skip_blasting: print "\nBLASTing query file(s):" for i in range(len(mg_lst)): database = outfile_db # adjust for iterations blasted_lst.append(results_Dir+"/recruited_mg_"+str(i)+".tab") start = time.time() #os_string = 'blastp -db '+database+' -query \"'+mg_lst[i]+'\" -out '+blasted_lst[i]+" -evalue "+str(e_val)+" -outfmt 6 -num_threads 8" os_string = 'diamond blastp -p 8 -f 6 -d '+database+'.dmnd -q \"'+mg_lst[i]+'\" --out '+blasted_lst[i]+" --evalue "+str(e_val) #print os_string os.system(os_string) print 
"\t"+mg_lst[i]+"; Time elapsed: "+str(time.time()-start)+" seconds." else: for i in range(len(mg_lst)): blasted_lst.append(results_Dir+"/recruited_mg_"+str(i)+".tab") # Parsing BLAST outputs blast_cols = ['quid', 'suid', 'iden', 'alen', 'mism', 'gapo', 'qsta', 'qend', 'ssta', 'send', 'eval', 'bits'] recruited_mg=[] for i in range(len(mg_lst)): try: df = pandas.read_csv(blasted_lst[i] ,sep="\t", header=None) except: df = pandas.DataFrame(columns=blast_cols) df.columns=blast_cols recruited_mg.append(df) # print len(recruited_mg[0]) # print len(recruited_mg[1]) #creating all_records entry #! Remember to close index objects after they are no longer needed #! Use helper function close_ind_lst() all_records = [] all_input_recs = parse_contigs_ind(ref_out_0) ##calculating GC of the reference # if (len(all_input_recs)>1): #TODO: make a better adaptation if False: # I'm adapting the script for blastn pass # ref_gc_lst = np.array([GC(x.seq) for x in all_input_recs.values()]) # ref_cnt = ref_gc_lst.size # ref_gc_avg = np.mean(ref_gc_lst) # ref_gc_avg_std = np.std(ref_gc_lst) # if(len(ref_gc_lst) > 0): # ref_gc_avg_sem = stats.sem(ref_gc_lst, axis=0) # else: # ref_gc_avg_sem=0 else: if (debugging): print "Only one reference" ref_gc_lst = np.array([GC(x.seq) for x in all_input_recs.values()]) ref_cnt = ref_gc_lst.size ref_gc_avg = np.mean(ref_gc_lst) ref_gc_avg_std=0 ref_gc_avg_sem=0 #ref_gc_avg_sem = stats.sem(ref_gc_lst, axis=0) # _ = 0 # for key, value in all_input_recs.items(): # _ +=1 # if _ < 20: # print key, len(value) print "\nIndexing metagenome file(s):" for i in range(len(mg_lst)): start = time.time() all_records.append(parse_contigs_ind(mg_lst[i])) print "\t"+mg_lst[i]+" Indexed in : "+str(time.time()-start)+" seconds." 
# Transforming data print "\nParsing recruited contigs:" for i in range(len(mg_lst)): start = time.time() #cutoff_contigs[dataframe]=evalue_filter(cutoff_contigs[dataframe]) recruited_mg[i]=unique_scaffold_topBits(recruited_mg[i]) contig_list = recruited_mg[i]['quid'].tolist() #this should solve string/int fastaID problem, until now fixed with renaming contig_list = list(map(str, contig_list)) recruited_mg[i]['Contig_nt']=retrive_sequence(contig_list, all_records[i]) recruited_mg[i]['Contig_size']=recruited_mg[i]['Contig_nt'].apply(lambda x: len(x)) #recruited_mg[i]['Ref_nt']=recruited_mg[i]['suid'].apply(lambda x: all_input_recs[str(x)].seq) recruited_mg[i]['Ref_size']=recruited_mg[i]['suid'].apply(lambda x: len(all_input_recs[str(x)])) #TODO: make a better adaptation recruited_mg[i]['Ref_GC']=0.0 #recruited_mg[i]['Ref_GC']=recruited_mg[i]['suid'].apply(lambda x: GC(all_input_recs[str(x)].seq)) #recruited_mg[i]['Coverage']=recruited_mg[i]['alen'].apply(lambda x: 100.0*float(x))/min(recruited_mg[i]['Contig_size'].apply(lambda y: y),recruited_mg[i]['Ref_size'].apply(lambda z: z)) #df.loc[:, ['B0', 'B1', 'B2']].min(axis=1) recruited_mg[i]['Coverage']=recruited_mg[i]['alen'].apply(lambda x: 100.0*float(x))/recruited_mg[i].loc[:,["Contig_size", "Ref_size"]].min(axis=1) recruited_mg[i]['Metric']=recruited_mg[i]['Coverage']*recruited_mg[i]['iden']/100.0 try: recruited_mg[i]['Contig_GC']=recruited_mg[i]['Contig_nt'].apply(lambda x: GC(x)) except: recruited_mg[i]['Contig_GC']=recruited_mg[i]['Contig_nt'].apply(lambda x: None) try: recruited_mg[i]['Read_RPKM']=1.0/((recruited_mg[i]['Ref_size']/1000.0)*(len(all_records[i])/1000000.0)) except: recruited_mg[i]['Read_RPKM']=np.nan #recruited_mg[i] = recruited_mg[i][['quid', 'suid', 'iden', 'alen','Coverage','Metric', 'mism', 'gapo', 'qsta', 'qend', 'ssta', 'send', 'eval', 'bits','Ref_size','Ref_GC','Ref_nt','Contig_size','Contig_GC','Contig_nt']] recruited_mg[i] = recruited_mg[i][['quid', 'suid', 'iden', 
'alen','Coverage','Metric', 'mism', 'gapo', 'qsta', 'qend', 'ssta', 'send', 'eval', 'bits','Ref_size','Ref_GC','Contig_size','Contig_GC','Read_RPKM','Contig_nt']] print "\tContigs from "+mg_lst[i]+" parsed in : "+str(time.time()-start)+" seconds." # Here would go statistics functions and producing plots # # # # # # Quality filtering before outputting if alen_percent: for i in range(len(recruited_mg)): recruited_mg[i]=recruited_mg[i][(recruited_mg[i]['iden']>=iden)&(recruited_mg[i]['Coverage']>=alen)&(recruited_mg[i]['eval']<=e_val)] if alen_bp: for i in range(len(recruited_mg)): recruited_mg[i]=recruited_mg[i][(recruited_mg[i]['iden']>=iden)&(recruited_mg[i]['alen']>=alen)&(recruited_mg[i]['eval']<=e_val)] # print len(recruited_mg[0]) # print len(recruited_mg[1]) # Batch export to outfmt (csv and/or multiple FASTA) alen_str = "" iden_str = "_iden_"+str(iden)+"%" if alen_percent: alen_str = "_alen_"+str(alen)+"%" if alen_bp: alen_str = "_alen_"+str(alen)+"bp" if iterations > 1: prefix=name+"/results/"+name.split("/")[0]+"_iter_e_"+str(e_val)+iden_str+alen_str else: prefix=name+"/results/"+name.split("/")[0]+"_e_"+str(e_val)+iden_str+alen_str if sheared: prefix = prefix+'_sheared_'+str(shear_val)+"bp" prefix = prefix + "_recruited_mg_" #initializing log file data logfile=name.split("/")[0]+"/results_log.csv" try: run = int(name.split("/")[-1].split("_")[-1])# using "_" less depends on the wrapper script except: if name.split("/")[-1].split("_")[-1]==name: run = 0 else: print "Warning: Run identifier could not be written in: "+logfile #sys.exit(1) run = None alen_header = "Min alen" if alen_bp: alen_header = alen_header+" (bp)" if alen_percent: alen_header = alen_header+" (%)" shear_header = "Reference Shear (bp)" shear_log_value = 0 if sheared: shear_log_value = str(shear_val) print "\nWriting files:" for i in range(len(mg_lst)): records= [] if "csv" in fmt_lst: outfile1 = prefix+str(i)+".csv" recruited_mg[i].to_csv(outfile1, sep='\t') print 
str(len(recruited_mg[i]))+" sequences written to "+outfile1 if "fasta" in fmt_lst: ids = recruited_mg[i]['quid'].tolist() # fixing the renaming error, converting to list of string ids = list(map(str, ids)) #if len(ids)==len(sequences): for j in range(len(ids)): records.append(all_records[i][ids[j]]) outfile2 = prefix+str(i)+".fasta" with open(outfile2, "w") as output_handle: #SeqIO.write(records, output_handle, "fasta") #this should not have line wrappings SeqIO.write(records, output_handle, "fasta-2line") print str(len(ids))+" sequences written to "+outfile2 #Writing logfile try: time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) except: print "Warning: Time identifier could not be written in: "+logfile metagenome = mg_lst[i] #contig info rpkm_lst = np.array(recruited_mg[i]['Read_RPKM'].tolist()) if(len(rpkm_lst) > 0): rpkm = np.sum(rpkm_lst) rpkm_std= np.std(rpkm_lst) rpkm_sem = np.std(rpkm_lst)*np.sqrt(len(rpkm_lst)) else: rpkm = 0 rpkm_std= 0 rpkm_sem=0 sizes_lst = np.array(recruited_mg[i]['Contig_size'].tolist()) if(len(sizes_lst) > 0): sizes_avg = np.mean(sizes_lst) sizes_avg_std= np.std(sizes_lst) if(len(sizes_lst) > 1): sizes_avg_sem = stats.sem(sizes_lst, axis=0) else: sizes_avg_sem = 0 else: sizes_avg = 0 sizes_avg_std= 0 sizes_avg_sem=0 #sizes_avg_sem = stats.sem(sizes_lst, axis=0) alen_lst = np.array(recruited_mg[i]['alen'].tolist()) if(len(alen_lst) > 0): alen_avg = np.mean(alen_lst) alen_avg_std = np.std(alen_lst) if(len(alen_lst) > 1): alen_avg_sem = stats.sem(alen_lst, axis=0) else: alen_avg_sem = 0 else: alen_avg = 0 alen_avg_std = 0 alen_avg_sem=0 #alen_avg_sem = stats.sem(alen_lst, axis=0) iden_lst = np.array(recruited_mg[i]['iden'].tolist()) if(len(iden_lst) > 0): iden_avg = np.mean(iden_lst) iden_avg_std = np.std(iden_lst) if(len(iden_lst) > 1): iden_avg_sem = stats.sem(iden_lst, axis=0) else: iden_avg_sem = 0 else: iden_avg = 0 iden_avg_std = 0 iden_avg_sem=0 #iden_avg_sem = stats.sem(iden_lst, axis=0) gc_lst = 
np.array(recruited_mg[i]['Contig_GC'].tolist()) if(len(gc_lst) > 0): gc_avg = np.mean(gc_lst) gc_avg_std = np.std(gc_lst) if(len(gc_lst) > 1): gc_avg_sem = stats.sem(gc_lst, axis=0) else: gc_avg_sem = 0 else: gc_avg = 0 gc_avg_std = 0 gc_avg_sem=0 if ref_cnt > 0: recr_percent = float(len(ids))/float(len(all_records[i]))*100 else: recr_percent = 0.0 #log_header = ['Run','Project Name','Created', 'Reference(s)','Metagenome', 'No. Contigs','No. References', alen_header, "Min iden (%)", shear_header, "Mean Contig Size (bp)","STD Contig Size", "SEM Contig Size", "Mean Contig alen (bp)","STD Contig alen", "SEM Contig alen", "Mean Contig iden (bp)","STD Contig iden", "SEM Contig iden", "Mean Contig GC (%)","STD Contig GC","SEM Contig GC","Mean Reference GC (%)","STD Reference GC","SEM Reference GC"] log_header = ['Run','Project Name','Created', 'Reference(s)', shear_header,'No. Ref. Sequences','Metagenome','No. Metagenome Contigs' , alen_header, "Min iden (%)",'No. Recruited Contigs','% Recruited Contigs', 'Total RPKM', 'RPKM STD', 'RPKM SEM', "Mean Rec. Contig Size (bp)","STD Rec. Contig Size", "SEM Rec. Contig Size", "Mean alen (bp)","STD alen", "SEM alen", "Mean Rec. Contig iden (bp)","STD Rec. Contig iden", "SEM Rec. Contig iden", "Mean Rec. Contigs GC (%)","STD Rec. Contig GC","SEM Rec. 
Contig GC","Mean Total Reference(s) GC (%)","STD Total Reference(s) GC","SEM Total Reference(s) GC"] #log_row = [run,name.split("/")[0],time_str, ";".join(ref_lst), metagenome, len(ids),ref_cnt, alen, iden, shear_log_value, sizes_avg,sizes_avg_std, sizes_avg_sem, alen_avg,alen_avg_std, alen_avg_sem, iden_avg,iden_avg_std, iden_avg_sem, gc_avg,gc_avg_std, gc_avg_sem,ref_gc_avg,ref_gc_avg_std, ref_gc_avg_sem] log_row = [run,name.split("/")[0],time_str, ";".join(ref_lst), shear_log_value,ref_cnt, metagenome,len(all_records[i]) , alen, iden,len(ids),recr_percent,rpkm, rpkm_std, rpkm_sem, sizes_avg,sizes_avg_std, sizes_avg_sem, alen_avg,alen_avg_std, alen_avg_sem, iden_avg,iden_avg_std, iden_avg_sem, gc_avg,gc_avg_std, gc_avg_sem,ref_gc_avg,ref_gc_avg_std, ref_gc_avg_sem] if os.path.isfile(logfile):#file exists - appending with open(logfile, "a") as log_handle: log_writer = csv.writer(log_handle, delimiter='\t') log_writer.writerow(log_row) else:#no file exists - writing with open(logfile,"w") as log_handle: log_writer = csv.writer(log_handle, delimiter='\t') log_writer.writerow(log_header) log_writer.writerow(log_row) close_ind_lst(all_records) close_ind_lst([all_input_recs])