def funLocalBlast(sFastaFileName, sGBKFileName, dbName,
                  sBlastDbDir="/Users/yi.yan/Documents/db/",
                  sBlastBinDir="/usr/local/ncbi/blast/bin/"):
    """BLAST an assembly locally, compute ANI and write an XLSX report.

    The assembly is searched with megablast twice: first against ``dbName``
    and then against ``nt``.  Each result table is passed through
    ``funReadBlast`` and ``funANICalc``, sorted by contig length and score
    (descending) and written by ``funBlastANI2XLS`` into one workbook.

    Args:
        sFastaFileName: Path of the assembly FASTA file.  The last six
            characters (expected to be ``.fasta``) are stripped to derive
            the sample ID and the ``.xlsx`` output name.
        sGBKFileName: Path of a GenBank annotation whose ``ORGANISM`` lines
            are collected, or the literal ``"N/A"`` when there is none.
        dbName: Name of the BLAST database used for the first search.
        sBlastDbDir: Directory prepended to ``$BLASTDB`` for the searches.
        sBlastBinDir: Directory (with trailing slash) holding ``blastn``.

    Raises:
        ValueError: If the FASTA file contains no sequences.
    """
    from Bio.Blast.Applications import NcbiblastnCommandline
    from Bio import SeqIO
    from Bio.SeqUtils import GC
    import subprocess
    import xlsxwriter
    from funReadBlast import funReadBlast
    from funANICalc import funANICalc
    from funBlastANI2XLS import funBlastANI2XLS

    columnTitleRow = [
        "FDAARGOS_ID",             # 0
        "Num_Contig",              # 1
        "Assembly_Size",           # 2
        "N_50",                    # 3
        "Largest_Contig_Size",     # 4
        "Contig_ID",               # 5
        "Contig_Length",           # 6
        "Contig_GC",               # 7
        "Proposed Organism",       # 8
        "Blast_Hit",               # 9
        "ACCESSION",               # 10
        "Score",                   # 11
        "Percent_Query_Identity",  # 12
        "Percent_Query_Coverage",  # 13
        "Scientific_Name",         # 14
        "Query_ANI_Coverage",      # 15
        "Subject_ANI_Coverage",    # 16
        "Query_ANI_Length",        # 17
        "Subject_ANI_Length",      # 18
        "Query_ANI_HD",            # 19
        "Subject_ANI_HD",          # 20
        "Query_ANI_Identity",      # 21
        "Subject_ANI_Identity",    # 22
        "Query_ANI_SE",            # 23
        "Subject_ANI_SE",          # 24
    ]

    sFileName = sFastaFileName[0:-6] + '.xlsx'
    sARGOSID = sFastaFileName.split("/")[-1][0:-6]
    sOutFileName = sARGOSID + ".txt"
    # The embedded double quotes are part of the value: the command line is
    # later rendered to a string and executed through a shell.
    sOutFmt = ("\"6 qseqid qlen sscinames sacc stitle "
               "length score pident qcovs\"")

    # Import FASTA sequences from the assembly file.
    lSeqRecord = list(SeqIO.parse(sFastaFileName, "fasta"))
    if not lSeqRecord:
        raise ValueError("No sequences found in " + sFastaFileName)

    # Import annotation: one placeholder per contig, or every ORGANISM line.
    all_species = []
    if sGBKFileName == "N/A":
        for seq_record in lSeqRecord:
            all_species.append('N/A-N/A')
    else:
        # The context manager guarantees the handle is closed (the previous
        # code referenced ``f.close`` without calling it).
        with open(sGBKFileName, 'r', errors='ignore') as f:
            for line in f:
                if "ORGANISM" in line:
                    print(line)
                    all_species.append(line)

    # Contig statistics.
    lSize = [len(seq_record.seq) for seq_record in lSeqRecord]
    lGC = [GC(seq_record.seq) for seq_record in lSeqRecord]
    nTotalAssemblySize = sum(lSize)
    nNumContig = len(lSize)
    nLargestContig = max(lSize)

    # N50: size of the contig at which the running total of the
    # descending-sorted sizes first exceeds half of the assembly size.
    nThreshold = 0.5 * nTotalAssemblySize
    nN50 = 0  # only kept when every contig is empty
    nRunning = 0
    for nContigSize in sorted(lSize, reverse=True):
        nRunning += nContigSize
        if nRunning > nThreshold:
            nN50 = nContigSize
            break

    def _blast_and_ani(sDatabase):
        """Search *sDatabase* and return the ANI table sorted for export."""
        blastn_cline = NcbiblastnCommandline(task="megablast",
                                             query=sFastaFileName,
                                             db=sDatabase,
                                             evalue=0.001,
                                             max_target_seqs=5,
                                             outfmt=sOutFmt,
                                             out=sOutFileName)
        process = subprocess.Popen(
            "export BLASTDB=" + sBlastDbDir + ":$BLASTDB"
            + "&&" + sBlastBinDir + str(blastn_cline),
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        proc_out, proc_err = process.communicate()
        if process.returncode != 0:
            # Report the failure instead of discarding it; processing
            # continues as before so existing callers are not broken.
            print('WARNING: blastn exited with status %d'
                  % process.returncode)
            print(proc_err.decode(errors='replace'))
        tblComplete = funReadBlast(sOutFileName, all_species, sARGOSID,
                                   nNumContig, nTotalAssemblySize, nN50,
                                   nLargestContig, lGC, lSize)
        print('Calculating ANI')
        FinalTbl = funANICalc(tblComplete, lSeqRecord, sDatabase)
        # Columns 6 and 11 are Contig_Length and Score.
        return sorted(FinalTbl, key=lambda x: (x[6], x[11]), reverse=True)

    # First pass: the caller-supplied database.
    print('run Refseq Blast')
    s = _blast_and_ani(dbName)
    workbook = xlsxwriter.Workbook(sFileName)
    funBlastANI2XLS(workbook, s, dbName, columnTitleRow)

    # Second pass: NCBI nt (reuses and overwrites the same BLAST output file).
    print('Run BLAST NT')
    s = _blast_and_ani("nt")
    funBlastANI2XLS(workbook, s, 'NT', columnTitleRow)
    workbook.close()
required=True, help="Digite o nome do output em formato fasta", type=str) parser.add_argument('-G', '--outputGc', dest="gc_file", required=True, help="Digite a saide do gc", type=str) args = parser.parse_args() with open(args.fastq_file, 'r') as fastqhandle, open(args.gc_file, 'w') as gchandle: for record in SeqIO.parse(fastqhandle, "fastq"): sequences[record.id] = GC(record.seq) gchandle.write(f'{record.id}\toriginal\t{sequences[record.id]}\n') with open(args.fastq_file2, 'r') as fastqhandle2, open(args.gc_file, 'a') as gchandle2: for record in SeqIO.parse(fastqhandle2, "fastq"): sequences_mutado[record.id] = GC(record.seq) gchandle2.write( f'{record.id}\tmutado\t{sequences_mutado[record.id]}\n') figure, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6)) ax1.hist( sequences.values(), bins=1000, label='original', color='green', )
# 3. 统计基因和基因间区不同信息的分布情况 # -------------------------------------------- ''' print(step3) levelTwoType = [] geneDist = os.path.join(outdir, "gene.dist.tsv") with open(geneDist, 'w') as f: f.write("id\ttranscript_num\tgc\tgc1\tgc2\tgc3\tlength\n") for gene in db.features_of_type("gene"): transcriptCounts = str(len(list(db.children(gene)))) transcriptType = [t.featuretype for t in db.children(gene, level=1)] levelTwoType += transcriptType geneFa = gene.sequence(fasta) gc = GC(geneFa) gc123 = GC123(geneFa) geneLen = gene.end - gene.start + 1 items = [ gene.id, transcriptCounts, str(gc), str(gc123[1]), str(gc123[2]), str(gc123[3]), str(geneLen) ] linestr = '\t'.join(items) f.write(linestr + '\n') print(set(levelTwoType))
for codon in codonList: if codon in codonCntDict: codonCntDict[codon]+=1 else: codonCntDict[codon]=1 featureList=[] # for codon in codonOrder: # featureList.append(random.randint(0,100)) for codon in codonOrder: if codon in codonCntDict: featureList.append(float(codonCntDict[codon])) else: featureList.append(0) gcList.append(GC(sequence)) # featureList.append(GC(sequence)) lengthList.append(len(sequence)) # featureList.append(len(sequence)) phi=methods.calPhiForGene(sequence) # featureList.append(phi) # featureList.append() # featureList.append(methods.calMForGene(sequence)) # if phi<0.1: # continue phiList.append(phi) tAIList.append(calculateTAI.calculateOneGene(sequence,species)) # CAIList.append(calculateCAI.calculateCAIForAGene(sequence)) featureList=minmax_scale(featureList) # Here does the scale
def contig2gc(records):
    """Build a ``{record.name: GC(record.seq)}`` mapping for *records*.

    Args:
        records: Iterable of objects exposing ``name`` and ``seq``.

    Returns:
        dict keyed by record name; when a name occurs more than once the
        value of the last such record is kept.
    """
    from Bio.SeqUtils import GC

    return {entry.name: GC(entry.seq) for entry in records}
sum_len = tot_dup_len + tot_non_dup_len print("Total duplicated region length: %d " % tot_dup_len) print("Maximum length of duplicated region: %d" % (max(dup_len_l))) print("Minimum length of duplicated region: %d" % (min(dup_len_l))) print("Median length of duplicated region: %d" % (numpy.median(dup_len_l))) print("Total non-duplicated region length: %d " % tot_non_dup_len) print("Sum of duplicated + non-duplicated region length: %d " % sum_len) print("Total genome chr length: %d" % tot_chr_len) print("Percent of duplicated regions: %.2f%% " % (tot_dup_len / float(tot_chr_len) * 100)) print("%d within chromosome matches; %d across chromosome matches" % (nr_within_chr, nr_across_chr)) mean_GC = GC("".join(map(str, seq_d.values()))) print("Whole genome GC %.2f; Avg GC of duplicated genes %.2f; Avg GC of non-duplicated genes %.2f; \ Avg GC3 of duplicated genes %.2f; Avg GC3 of non-duplicated genes %.2f " \ %(mean_GC, numpy.mean(dup_gene_gc_l), numpy.mean(non_dup_gene_gc_l), \ numpy.mean(dup_gene_gc3_l), numpy.mean(non_dup_gene_gc3_l))) print("Median GC of duplicated genes %.2f; Median GC of non-duplicated genes %.2f; \ Median GC3 of duplicated genes %.2f; Median GC3 of non-duplicated genes %.2f " \ %(numpy.median(dup_gene_gc_l), numpy.median(non_dup_gene_gc_l), \ numpy.median(dup_gene_gc3_l), numpy.median(non_dup_gene_gc3_l))) mean_dup_GC = GC("".join(map(str, dup_seq_l))) mean_non_dup_GC = GC("".join(map(str, non_dup_seq_l))) print("Avg GC of duplicated regions %.2f; Avg GC of non-duplicated regions %.2f " \ %(mean_dup_GC, mean_non_dup_GC)) non_dup_gc_l = FloatVector(non_dup_gc_l).r_repr()
def main(argv): #default parameters mg_lst = [] ref_lst = [] e_val = 1e-5 alen = 50.0 alen_percent = True alen_bp = False iden = 95.0 name = "output" fmt_lst = ["fasta"] supported_formats = ["fasta", "csv"] iterations = 1 alen_increment = 5.0 iden_increment = 0.0 blast_db_Dir = "" results_Dir = "" input_files_Dir = "" ref_out_0 = "" blasted_lst = [] continue_from_previous = False #poorly supported, just keeping the directories skip_blasting = False debugging = False sheared = False shear_val = None logfile = "" try: opts, args = getopt.getopt(argv, "r:m:n:e:a:i:s:f:h", [ "reference=", "metagenome=", "name=", "e_value=", "alignment_length=", "identity=", "shear=", "format=", "iterations=", "alen_increment=", "iden_increment=", "continue_from_previous", "skip_blasting", "debugging", "help" ]) except getopt.GetoptError: usage() sys.exit(2) for opt, arg in opts: if opt in ("-h", "--help"): usage() sys.exit() # elif opt in ("--recover_after_failure"): # recover_after_failure = True # print "Recover after failure:", recover_after_failure elif opt in ("--continue_from_previous"): continue_from_previous = True if debugging: print "Continue after failure:", continue_from_previous elif opt in ("--debugging"): debugging = True if debugging: print "Debugging messages:", debugging elif opt in ("-r", "--reference"): if arg: ref_lst = arg.split(',') #infiles = arg if debugging: print "Reference file(s)", ref_lst elif opt in ("-m", "--metagenome"): if arg: mg_lst = arg.split(',') #infiles = arg if debugging: print "Metagenome file(s)", mg_lst elif opt in ("-f", "--format"): if arg: fmt_lst = arg.split(',') #infiles = arg if debugging: print "Output format(s)", fmt_lst elif opt in ("-n", "--name"): if arg.strip(): name = arg if debugging: print "Project name", name elif opt in ("-e", "--e_value"): try: e_val = float(arg) except: print "\nERROR: Please enter numerical value as -e parameter (default: 1e-5)" usage() sys.exit(1) if debugging: print "E value", e_val elif opt in ("-a", 
"--alignment_length"): if arg.strip()[-1] == "%": alen_bp = False alen_percent = True else: alen_bp = True alen_percent = False try: alen = float(arg.split("%")[0]) except: print "\nERROR: Please enter a numerical value as -a parameter (default: 50.0)" usage() sys.exit(1) if debugging: print "Alignment length", alen elif opt in ("-i", "--identity"): try: iden = float(arg) except: print "\nERROR: Please enter a numerical value as -i parameter (default: 95.0)" usage() sys.exit(1) if debugging: print "Alignment length", iden elif opt in ("-s", "--shear"): sheared = True try: shear_val = int(arg) except: print "\nERROR: Please enter an integer value as -s parameter" usage() sys.exit(1) if debugging: print "Alignment length", iden elif opt in ("--iterations"): try: iterations = int(arg) except: print "\nWARNING: Please enter integer value as --iterations parameter (using default: 1)" if debugging: print "Iterations: ", iterations elif opt in ("--alen_increment"): try: alen_increment = float(arg) except: print "\nWARNING: Please enter numerical value as --alen_increment parameter (using default: )", alen_increment if debugging: print "Alignment length increment: ", alen_increment elif opt in ("--iden_increment"): try: iden_increment = float(arg) except: print "\nWARNING: Please enter numerical value as --iden_increment parameter (using default: )", iden_increment if debugging: print "Alignment length increment: ", iden_increment elif opt in ("--skip_blasting"): skip_blasting = True if debugging: print "Blasting step omitted; Using previous blast output." 
for ref_file in [x for x in ref_lst if x]: try: # with open(ref_file, "rU") as hand_ref: pass except: print "\nERROR: Reference File(s) [" + ref_file + "] doesn't exist" usage() sys.exit(1) for mg_file in [x for x in mg_lst if x]: try: # with open(mg_file, "rU") as hand_mg: pass except: print "\nERROR: Metagenome File(s) [" + mg_file + "] doesn't exist" usage() sys.exit(1) for fmt in [x for x in fmt_lst if x]: if fmt not in supported_formats: print "\nWARNING: Output format [", fmt, "] is not supported" print "\tUse -h(--help) option for the list of supported formats" fmt_lst = ["fasta"] print "\tUsing default output format: ", fmt_lst[0] project_dir = name if not continue_from_previous: if os.path.exists(project_dir): shutil.rmtree(project_dir) try: os.mkdir(project_dir) except OSError: print "ERROR: Cannot create project directory: " + name raise print "\n\t Initial Parameters:" print "\nProject Name: ", name, '\n' print "Project Directory: ", os.path.abspath(name), '\n' print "Reference File(s): ", ref_lst, '\n' if sheared: print "Shear Reference File(s):", str(shear_val) + "bp", '\n' print "Metagenome File(s): ", mg_lst, '\n' print "E Value: ", e_val, "\n" if alen_percent: print "Alignment Length: " + str(alen) + '%\n' if alen_bp: print "Alignment Length: " + str(alen) + 'bp\n' print "Sequence Identity: " + str(iden) + '%\n' print "Output Format(s):", fmt_lst, '\n' if iterations > 1: print "Iterations: ", iterations, '\n' print "Alignment Length Increment: ", alen_increment, '\n' print "Sequence identity Increment: ", iden_increment, '\n' #Initializing directories blast_db_Dir = name + "/blast_db" if not continue_from_previous: if os.path.exists(blast_db_Dir): shutil.rmtree(blast_db_Dir) try: os.mkdir(blast_db_Dir) except OSError: print "ERROR: Cannot create project directory: " + blast_db_Dir raise results_Dir = name + "/results" if not continue_from_previous: if os.path.exists(results_Dir): shutil.rmtree(results_Dir) try: os.mkdir(results_Dir) except OSError: 
print "ERROR: Cannot create project directory: " + results_Dir raise input_files_Dir = name + "/input_files" if not continue_from_previous: if os.path.exists(input_files_Dir): shutil.rmtree(input_files_Dir) try: os.mkdir(input_files_Dir) except OSError: print "ERROR: Cannot create project directory: " + input_files_Dir raise # Writing raw reference files into a specific input filename input_ref_records = {} for reference in ref_lst: ref_records_ind = parse_contigs_ind(reference) #ref_records = dict(ref_records_ind) input_ref_records.update(ref_records_ind) ref_records_ind.close() #input_ref_records.update(ref_records) ref_out_0 = input_files_Dir + "/reference0.fna" if (sheared & bool(shear_val)): with open(ref_out_0, "w") as handle: SeqIO.write( genome_shredder(input_ref_records, shear_val).values(), handle, "fasta") #NO NEED TO CLOSE with statement will automatically close the file else: with open(ref_out_0, "w") as handle: SeqIO.write(input_ref_records.values(), handle, "fasta") # Making BLAST databases #output fname from before used as input for blast database creation input_ref_0 = ref_out_0 title_db = name + "_db" #add iteration functionality outfile_db = blast_db_Dir + "/iteration" + str( iterations) + "/" + name + "_db" #change into for loop os.system("makeblastdb -in " + input_ref_0 + " -dbtype prot -title " + title_db + " -out " + outfile_db + " -parse_seqids") # BLASTing query contigs if not skip_blasting: print "\nBLASTing query file(s):" for i in range(len(mg_lst)): database = outfile_db # adjust for iterations blasted_lst.append(results_Dir + "/recruited_mg_" + str(i) + ".tab") start = time.time() os_string = 'blastp -db ' + database + ' -query \"' + mg_lst[ i] + '\" -out ' + blasted_lst[i] + " -evalue " + str( e_val) + " -outfmt 6 -num_threads 8" #print os_string os.system(os_string) print "\t" + mg_lst[i] + "; Time elapsed: " + str( time.time() - start) + " seconds." 
else: for i in range(len(mg_lst)): blasted_lst.append(results_Dir + "/recruited_mg_" + str(i) + ".tab") # Parsing BLAST outputs blast_cols = [ 'quid', 'suid', 'iden', 'alen', 'mism', 'gapo', 'qsta', 'qend', 'ssta', 'send', 'eval', 'bits' ] recruited_mg = [] for i in range(len(mg_lst)): try: df = pandas.read_csv(blasted_lst[i], sep="\t", header=None) except: df = pandas.DataFrame(columns=blast_cols) df.columns = blast_cols recruited_mg.append(df) # print len(recruited_mg[0]) # print len(recruited_mg[1]) #creating all_records entry #! Remember to close index objects after they are no longer needed #! Use helper function close_ind_lst() all_records = [] all_input_recs = parse_contigs_ind(ref_out_0) ##calculating GC of the reference # if (len(all_input_recs)>1): #TODO: make a better adaptation if False: # I'm adapting the script for blastn pass # ref_gc_lst = np.array([GC(x.seq) for x in all_input_recs.values()]) # ref_cnt = ref_gc_lst.size # ref_gc_avg = np.mean(ref_gc_lst) # ref_gc_avg_std = np.std(ref_gc_lst) # if(len(ref_gc_lst) > 0): # ref_gc_avg_sem = stats.sem(ref_gc_lst, axis=0) # else: # ref_gc_avg_sem=0 else: if (debugging): print "Only one reference" ref_gc_lst = np.array([GC(x.seq) for x in all_input_recs.values()]) ref_cnt = ref_gc_lst.size ref_gc_avg = np.mean(ref_gc_lst) ref_gc_avg_std = 0 ref_gc_avg_sem = 0 #ref_gc_avg_sem = stats.sem(ref_gc_lst, axis=0) # _ = 0 # for key, value in all_input_recs.items(): # _ +=1 # if _ < 20: # print key, len(value) print "\nIndexing metagenome file(s):" for i in range(len(mg_lst)): start = time.time() all_records.append(parse_contigs_ind(mg_lst[i])) print "\t" + mg_lst[i] + " Indexed in : " + str(time.time() - start) + " seconds." 
# Transforming data print "\nParsing recruited contigs:" for i in range(len(mg_lst)): start = time.time() #cutoff_contigs[dataframe]=evalue_filter(cutoff_contigs[dataframe]) recruited_mg[i] = unique_scaffold_topBits(recruited_mg[i]) contig_list = recruited_mg[i]['quid'].tolist() #this should solve string/int fastaID problem, until now fixed with renaming contig_list = list(map(str, contig_list)) recruited_mg[i]['Contig_nt'] = retrive_sequence( contig_list, all_records[i]) recruited_mg[i]['Contig_size'] = recruited_mg[i]['Contig_nt'].apply( lambda x: len(x)) #recruited_mg[i]['Ref_nt']=recruited_mg[i]['suid'].apply(lambda x: all_input_recs[str(x)].seq) recruited_mg[i]['Ref_size'] = recruited_mg[i]['suid'].apply( lambda x: len(all_input_recs[str(x)])) #TODO: make a better adaptation recruited_mg[i]['Ref_GC'] = 0.0 #recruited_mg[i]['Ref_GC']=recruited_mg[i]['suid'].apply(lambda x: GC(all_input_recs[str(x)].seq)) #recruited_mg[i]['Coverage']=recruited_mg[i]['alen'].apply(lambda x: 100.0*float(x))/min(recruited_mg[i]['Contig_size'].apply(lambda y: y),recruited_mg[i]['Ref_size'].apply(lambda z: z)) #df.loc[:, ['B0', 'B1', 'B2']].min(axis=1) recruited_mg[i]['Coverage'] = recruited_mg[i]['alen'].apply( lambda x: 100.0 * float(x) ) / recruited_mg[i].loc[:, ["Contig_size", "Ref_size"]].min(axis=1) recruited_mg[i]['Metric'] = recruited_mg[i]['Coverage'] * recruited_mg[ i]['iden'] / 100.0 try: recruited_mg[i]['Contig_GC'] = recruited_mg[i]['Contig_nt'].apply( lambda x: GC(x)) except: recruited_mg[i]['Contig_GC'] = recruited_mg[i]['Contig_nt'].apply( lambda x: None) try: recruited_mg[i]['Read_RPKM'] = 1.0 / ( (recruited_mg[i]['Ref_size'] / 1000.0) * (len(all_records[i]) / 1000000.0)) except: recruited_mg[i]['Read_RPKM'] = np.nan #recruited_mg[i] = recruited_mg[i][['quid', 'suid', 'iden', 'alen','Coverage','Metric', 'mism', 'gapo', 'qsta', 'qend', 'ssta', 'send', 'eval', 'bits','Ref_size','Ref_GC','Ref_nt','Contig_size','Contig_GC','Contig_nt']] recruited_mg[i] = 
recruited_mg[i][[ 'quid', 'suid', 'iden', 'alen', 'Coverage', 'Metric', 'mism', 'gapo', 'qsta', 'qend', 'ssta', 'send', 'eval', 'bits', 'Ref_size', 'Ref_GC', 'Contig_size', 'Contig_GC', 'Read_RPKM', 'Contig_nt' ]] print "\tContigs from " + mg_lst[i] + " parsed in : " + str( time.time() - start) + " seconds." # Here would go statistics functions and producing plots # # # # # # Quality filtering before outputting if alen_percent: for i in range(len(recruited_mg)): recruited_mg[i] = recruited_mg[i][ (recruited_mg[i]['iden'] >= iden) & (recruited_mg[i]['Coverage'] >= alen) & (recruited_mg[i]['eval'] <= e_val)] if alen_bp: for i in range(len(recruited_mg)): recruited_mg[i] = recruited_mg[i][ (recruited_mg[i]['iden'] >= iden) & (recruited_mg[i]['alen'] >= alen) & (recruited_mg[i]['eval'] <= e_val)] # print len(recruited_mg[0]) # print len(recruited_mg[1]) # Batch export to outfmt (csv and/or multiple FASTA) alen_str = "" iden_str = "_iden_" + str(iden) + "%" if alen_percent: alen_str = "_alen_" + str(alen) + "%" if alen_bp: alen_str = "_alen_" + str(alen) + "bp" if iterations > 1: prefix = name + "/results/" + name.split("/")[0] + "_iter_e_" + str( e_val) + iden_str + alen_str else: prefix = name + "/results/" + name.split("/")[0] + "_e_" + str( e_val) + iden_str + alen_str if sheared: prefix = prefix + '_sheared_' + str(shear_val) + "bp" prefix = prefix + "_recruited_mg_" #initializing log file data logfile = name.split("/")[0] + "/results_log.csv" try: run = int(name.split("/")[-1].split("_") [-1]) # using "_" less depends on the wrapper script except: if name.split("/")[-1].split("_")[-1] == name: run = 0 else: print "Warning: Run identifier could not be written in: " + logfile #sys.exit(1) run = None alen_header = "Min alen" if alen_bp: alen_header = alen_header + " (bp)" if alen_percent: alen_header = alen_header + " (%)" shear_header = "Reference Shear (bp)" shear_log_value = 0 if sheared: shear_log_value = str(shear_val) print "\nWriting files:" for i in 
range(len(mg_lst)): records = [] if "csv" in fmt_lst: outfile1 = prefix + str(i) + ".csv" recruited_mg[i].to_csv(outfile1, sep='\t') print str(len( recruited_mg[i])) + " sequences written to " + outfile1 if "fasta" in fmt_lst: ids = recruited_mg[i]['quid'].tolist() # fixing the renaming error, converting to list of string ids = list(map(str, ids)) #if len(ids)==len(sequences): for j in range(len(ids)): records.append(all_records[i][ids[j]]) outfile2 = prefix + str(i) + ".fasta" with open(outfile2, "w") as output_handle: #SeqIO.write(records, output_handle, "fasta") #this should not have line wrappings SeqIO.write(records, output_handle, "fasta-2line") print str(len(ids)) + " sequences written to " + outfile2 #Writing logfile try: time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) except: print "Warning: Time identifier could not be written in: " + logfile metagenome = mg_lst[i] #contig info rpkm_lst = np.array(recruited_mg[i]['Read_RPKM'].tolist()) if (len(rpkm_lst) > 0): rpkm = np.sum(rpkm_lst) rpkm_std = np.std(rpkm_lst) rpkm_sem = np.std(rpkm_lst) * np.sqrt(len(rpkm_lst)) else: rpkm = 0 rpkm_std = 0 rpkm_sem = 0 sizes_lst = np.array(recruited_mg[i]['Contig_size'].tolist()) if (len(sizes_lst) > 0): sizes_avg = np.mean(sizes_lst) sizes_avg_std = np.std(sizes_lst) if (len(sizes_lst) > 1): sizes_avg_sem = stats.sem(sizes_lst, axis=0) else: sizes_avg_sem = 0 else: sizes_avg = 0 sizes_avg_std = 0 sizes_avg_sem = 0 #sizes_avg_sem = stats.sem(sizes_lst, axis=0) alen_lst = np.array(recruited_mg[i]['alen'].tolist()) if (len(alen_lst) > 0): alen_avg = np.mean(alen_lst) alen_avg_std = np.std(alen_lst) if (len(alen_lst) > 1): alen_avg_sem = stats.sem(alen_lst, axis=0) else: alen_avg_sem = 0 else: alen_avg = 0 alen_avg_std = 0 alen_avg_sem = 0 #alen_avg_sem = stats.sem(alen_lst, axis=0) iden_lst = np.array(recruited_mg[i]['iden'].tolist()) if (len(iden_lst) > 0): iden_avg = np.mean(iden_lst) iden_avg_std = np.std(iden_lst) if (len(iden_lst) > 1): iden_avg_sem = 
stats.sem(iden_lst, axis=0) else: iden_avg_sem = 0 else: iden_avg = 0 iden_avg_std = 0 iden_avg_sem = 0 #iden_avg_sem = stats.sem(iden_lst, axis=0) gc_lst = np.array(recruited_mg[i]['Contig_GC'].tolist()) if (len(gc_lst) > 0): gc_avg = np.mean(gc_lst) gc_avg_std = np.std(gc_lst) if (len(gc_lst) > 1): gc_avg_sem = stats.sem(gc_lst, axis=0) else: gc_avg_sem = 0 else: gc_avg = 0 gc_avg_std = 0 gc_avg_sem = 0 if ref_cnt > 0: recr_percent = float(len(ids)) / float(len(all_records[i])) * 100 else: recr_percent = 0.0 #log_header = ['Run','Project Name','Created', 'Reference(s)','Metagenome', 'No. Contigs','No. References', alen_header, "Min iden (%)", shear_header, "Mean Contig Size (bp)","STD Contig Size", "SEM Contig Size", "Mean Contig alen (bp)","STD Contig alen", "SEM Contig alen", "Mean Contig iden (bp)","STD Contig iden", "SEM Contig iden", "Mean Contig GC (%)","STD Contig GC","SEM Contig GC","Mean Reference GC (%)","STD Reference GC","SEM Reference GC"] log_header = [ 'Run', 'Project Name', 'Created', 'Reference(s)', shear_header, 'No. Ref. Sequences', 'Metagenome', 'No. Metagenome Contigs', alen_header, "Min iden (%)", 'No. Recruited Contigs', '% Recruited Contigs', 'Total RPKM', 'RPKM STD', 'RPKM SEM', "Mean Rec. Contig Size (bp)", "STD Rec. Contig Size", "SEM Rec. Contig Size", "Mean alen (bp)", "STD alen", "SEM alen", "Mean Rec. Contig iden (bp)", "STD Rec. Contig iden", "SEM Rec. Contig iden", "Mean Rec. Contigs GC (%)", "STD Rec. Contig GC", "SEM Rec. 
Contig GC", "Mean Total Reference(s) GC (%)", "STD Total Reference(s) GC", "SEM Total Reference(s) GC" ] #log_row = [run,name.split("/")[0],time_str, ";".join(ref_lst), metagenome, len(ids),ref_cnt, alen, iden, shear_log_value, sizes_avg,sizes_avg_std, sizes_avg_sem, alen_avg,alen_avg_std, alen_avg_sem, iden_avg,iden_avg_std, iden_avg_sem, gc_avg,gc_avg_std, gc_avg_sem,ref_gc_avg,ref_gc_avg_std, ref_gc_avg_sem] log_row = [ run, name.split("/")[0], time_str, ";".join(ref_lst), shear_log_value, ref_cnt, metagenome, len(all_records[i]), alen, iden, len(ids), recr_percent, rpkm, rpkm_std, rpkm_sem, sizes_avg, sizes_avg_std, sizes_avg_sem, alen_avg, alen_avg_std, alen_avg_sem, iden_avg, iden_avg_std, iden_avg_sem, gc_avg, gc_avg_std, gc_avg_sem, ref_gc_avg, ref_gc_avg_std, ref_gc_avg_sem ] if os.path.isfile(logfile): #file exists - appending with open(logfile, "a") as log_handle: log_writer = csv.writer(log_handle, delimiter='\t') log_writer.writerow(log_row) else: #no file exists - writing with open(logfile, "w") as log_handle: log_writer = csv.writer(log_handle, delimiter='\t') log_writer.writerow(log_header) log_writer.writerow(log_row) close_ind_lst(all_records) close_ind_lst([all_input_recs])
from Bio.SeqUtils import GC import seaborn as sns import matplotlib as mpl import matplotlib.pyplot as plt usage = """ Usage: fasta_seq_gc_content_plot.py fastafile [fastafile...] """ if len(sys.argv) <= 1: print(usage) sys.exit(0) gc = [] for file in sys.argv[1:]: if not os.path.exists(file): print("file not exists: %s" % file) sys.exit(0) with open(file + ".gc", 'w') as fh: for seq in SeqIO.parse(file, "fasta"): gccontent = GC(seq.seq) gc.append(gccontent) fh.write("%s\t%d\n" % (seq.id, gccontent)) mpl.rc("figure", figsize=(8, 4)) sns.distplot(gc) plt.savefig(file + ".gc.png")
def calcularte_CG_content(self, peaks_fa_file):
    """Set ``self.gc_total`` from the FASTA records in *peaks_fa_file*.

    ``GC`` is evaluated for every record and the mean of those values is
    stored on ``self.gc_total``; the attribute is then overwritten with the
    constant 31, which is the value left in place when this returns.
    """
    per_record_gc = [
        GC(record.seq) for record in SeqIO.parse(peaks_fa_file, "fasta")
    ]
    per_record_gc.sort()
    self.gc_total = statistics.mean(per_record_gc)
    # NOTE(review): the mean computed above is discarded by this hard-coded
    # value -- looks like a debugging leftover; confirm before relying on it.
    self.gc_total = 31
def get_GC_contents(seq):
    """Return the result of ``GC`` applied to *seq*."""
    gc_value = GC(seq)
    return gc_value
"""Print ID, sequence, length, GC content and base composition per record.

Usage: python gc.py fasta_file
"""
from Bio import SeqIO
from Bio.SeqUtils import GC
import sys

if len(sys.argv) < 2:
    print("Usage python gc.py fasta_file ")
else:
    for seq_record in SeqIO.parse(sys.argv[1], "fasta"):
        seq_length = len(seq_record)
        print("Sequence ID: " + seq_record.id)
        print("Sequence :\n" + str(seq_record.seq))
        print("Sequence length " + str(seq_length) + "\n")
        print("GC content: " + str(GC(seq_record.seq)) + "\n")
        for base in ("A", "T", "G", "C"):
            # An empty record used to raise ZeroDivisionError here; report
            # 0.0% for every base instead.
            if seq_length:
                percent = 100.00 * seq_record.seq.count(base) / seq_length
            else:
                percent = 0.0
            print(base + " :" + str(percent) + "%")
        print("\n")
"""Print the ID and GC content of the record with the highest GC content."""
from Bio import SeqIO
from Bio.SeqUtils import GC

records = list(SeqIO.parse("rosalind_tree.txt", "fasta"))

# Named max_gc rather than ``max`` so the builtin is not shadowed.  The
# initial values are what gets printed when no record has a GC above 0.
max_gc = 0
max_id = ''
for item in records:
    item_gc = GC(item.seq)  # computed once per record
    if item_gc > max_gc:
        max_gc = item_gc
        max_id = item.id

print(max_id)
print(max_gc)
yield batch if Trinity == True: outputfile1 = (prefix + '_ContigDescrp_with_GC.txt') outputfile2 = (prefix + '_Contig_Coumpound_List.txt') outputfile3 = (prefix + '_ContigDescrp.txt') with open(inputfile) as fasta_file: # Will close handle cleanly identifier = [] length = [] description = [] gccontent = [] for title, sequence in SimpleFastaParser(fasta_file): identifier.append(title.split(None, 1)[0]) # First word is ID length.append(len(sequence)) gccontent.append(GC(sequence)) description.append( "No Description") # Description is "No Description" #ContigDescrp = DataFrame(dict(subjectid = Series(identifier, name = 'subjectid'), subjectlength = Series(length, name = 'subjectlength'))).set_index(['subjectid']) ContigDescrp = DataFrame( dict(Contigid=Series(identifier, name='Contigid'), ContigLength=Series(length, name='ContigLength'), GCContent=Series(gccontent, name='GCContent'), Description=Series(description, name='Description'))).set_index(['Contigid']) #print ContigDescrp ContigDescrp = ContigDescrp[["ContigLength", "Description", "GCContent"]] ContigDescrp.to_csv(outputfile1, sep='\t', index=True) ContigDescrp = ContigDescrp.drop('GCContent', 1) ContigDescrp.to_csv(outputfile3, sep='\t', index=True) #Getting another column : gene id
search_and_retrieve_fasta("nucleotide", "Blossfeldia[orgn] and rpl16", "blossfeldia_rpl16.fasta") # Look at output file ### PARSING from Bio import SeqIO blossfeldia_rpl16_sequences = list( SeqIO.parse("blossfeldia_rpl16.fasta", "fasta")) # Look at sequence list and blossfeldia_rpl16_sequences[0] ### SEQUENCE OBJECTS _first_blossfeldia_rpl16_sequence = blossfeldia_rpl16_sequences[0].seq first_blossfeldia_rpl16_sequence = blossfeldia_rpl16_sequences[0].seq # GC % from Bio.SeqUtils import GC GC(first_blossfeldia_rpl16_sequence) # DNA --> RNA --> DNA first_blossfeldia_rpl16_sequence.transcribe() first_blossfeldia_rpl16_sequence.back_transcribe() # DNA Coding Strand --> Protein first_blossfeldia_rpl16_sequence.translate() ### BLASTING from Bio.Blast import NCBIWWW from Bio import SeqIO result_handle = NCBIWWW.qblast("blastn", "nt", _first_blossfeldia_rpl16_sequence) save_file = open("blast_search_on_first_blossfeldia_rpl16_sequence.xml", "w") save_file.write(result_handle.read())
my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC") for index, letter in enumerate(my_seq): print("%i %s" % (index, letter)) # length print(len(my_seq)) # first element print(my_seq[22]) #last element print(my_seq[-1]) print(my_seq.count("GC")) from Bio.SeqUtils import GC print(GC(my_seq)) #slicing print(my_seq[1:4]) # starting from 0 with step 3 print(my_seq[1::2]) print(my_seq[1:6:2]) #reverse print(my_seq[::-1]) print(my_seq[22:35]) my_seq2 = Seq("EVRNAK") print(my_seq + my_seq2) print(my_seq2 + my_seq) list_of_seqs = [Seq("ACGT"), Seq("AACC"), Seq("GGTT")]
def get_stats(D_fasta, D_gff3):
    """Compute summary statistics of a gene annotation.

    Args:
        D_fasta: mapping of scaffold name to a record; records are sliced
            (``D_fasta[scaffold][start - 1:end]``) and expose ``.seq``.
        D_gff3: mapping of protein/gene id to a list of
            ``(scaffold, start, end, strand, phase)`` tuples. Each id must
            contain at least one run of digits (used for sorting).
            Presumably one tuple per CDS segment in ascending coordinate
            order -- TODO confirm against the caller.

    Returns:
        A 3-tuple ``(D_cds_coords, protein_lengths, D_stat)``:
        ``D_cds_coords`` maps scaffold -> list of ``(start, end)`` segments,
        ``protein_lengths`` holds one value per gene (CDS length / 3), and
        ``D_stat`` maps a human-readable label to a number or a tuple of
        numbers (see the assignments at the end of the function).
    """
    # Get stats
    D_stat = {}
    cds_lengths = []
    protein_lengths = []
    exon_lengths = []
    transcript_lengths = []
    intron_lengths = []
    num_introns = []
    num_exons = []
    num_spliced = 0
    single_exon_genes = 0
    total_genes = 0
    D_cds_seq = {}
    D_cds_coords = defaultdict(list)
    # Order genes by the first integer found in their id, then by the start
    # coordinate of their first segment.
    sorted_genes = sorted(
        D_gff3.items(),
        key=lambda x: (
            int(re.findall(r'\d+', x[0])[0]), x[1][0][1]
        )
    )
    for prot_id, tuples in sorted_genes:
        total_genes += 1
        tmp_prot_len = 0
        if len(tuples) > 1:
            num_spliced += 1
        cds_seq = ''
        for tup in tuples:
            scaffold, start, end, strand, phase = tup
            # Trim the phase offset from the first segment ('+' strand) or
            # from the last segment ('-' strand).
            if strand == '+' and tup == tuples[0]:
                start = start + phase
            elif strand == '-' and tup == tuples[-1]:
                end = end - phase
            tmp_prot_len += end - start + 1
            exon_lengths.append(end - start + 1)
            # Get sequence (start is 1-based inclusive, hence start - 1)
            cds_seq += str(D_fasta[scaffold][start - 1:end].seq)
            # Store in dictionary
            D_cds_coords[scaffold].append((start, end))
        # `strand` still holds the value of the last segment of this gene.
        if strand == '-':
            cds_seq = get_reverse_complement(cds_seq)
        D_cds_seq[prot_id] = cds_seq
        cds_length = tmp_prot_len
        cds_lengths.append(cds_length)
        protein_length = tmp_prot_len / 3
        protein_lengths.append(protein_length)
        # Span from the start of the first segment to the end of the last.
        transcript_length = int(tuples[-1][2]) - int(tuples[0][1]) + 1
        transcript_lengths.append(transcript_length)
        num_intron = len(tuples) - 1
        if num_intron > 0:
            # An intron is the gap between the end of one segment and the
            # start of the next one.
            intron_start = [x[2] for x in tuples[:-1]]
            intron_end = [x[1] for x in tuples[1:]]
            intron_length = [
                y - x - 1 for x, y in zip(intron_start, intron_end)
            ]
            intron_lengths += intron_length
            num_introns.append(len(tuples) - 1)
        else:
            # NOTE(review): both values are reassigned unconditionally after
            # the loop, so these stores have no effect as written.
            intron_median = 0
            num_introns_median = 0
        num_exons.append(len(tuples))
        if len(tuples) == 1:
            single_exon_genes += 1
    # NOTE(review): when no gene has an intron, intron_lengths and
    # num_introns are empty here -- confirm the numpy results are acceptable.
    intron_median = np.median(np.array(intron_lengths))
    intron_len_average = np.average(np.array(intron_lengths))
    num_introns_median = np.median(np.array(num_introns))
    exon_median = np.median(np.array(exon_lengths))
    exon_len_average = np.average(np.array(exon_lengths))
    cds_average = np.average(cds_lengths)
    cds_median = np.median(cds_lengths)
    protein_average = np.average(np.array(protein_lengths))
    protein_median = np.median(np.array(protein_lengths))
    transcript_median = np.median(np.array(transcript_lengths))
    transcript_average = np.average(np.array(transcript_lengths))
    num_exons_median = np.median(np.array(num_exons))
    # Percentage of genes that have more than one segment.
    percent_splice = round(float(num_spliced) / total_genes * 100, 2)
    total_bases_lst = [len(str(x.seq)) for x in D_fasta.values()]
    total_bases = sum(total_bases_lst)
    # Genes per 1,000,000 bases of assembly.
    gene_density = float(total_genes) / total_bases
    gene_density = gene_density * 1000000
    gene_density = round(gene_density, 2)
    # Get GC content of CDS seq (all CDS sequences concatenated)
    full_cds_seq = ''.join(D_cds_seq.values())
    my_seq = Seq(full_cds_seq, IUPAC.unambiguous_dna)
    cds_gc_percent = GC(my_seq)
    # Percent coding
    coding_percent = float(len(full_cds_seq)) / total_bases
    coding_percent = coding_percent * 100
    coding_percent = round(coding_percent, 2)
    # Two-element tuples below are (average, median) unless noted otherwise.
    D_stat['Total genes'] = total_genes
    D_stat['Transcript length'] = (
        round(transcript_average, 1), transcript_median
    )
    D_stat['CDS length'] = (round(cds_average, 1), cds_median)
    D_stat['Protein length'] = (round(protein_average, 1), protein_median)
    D_stat['Exon length'] = (round(exon_len_average, 1), exon_median)
    D_stat['Intron length'] = (round(intron_len_average, 1), intron_median)
    D_stat['Spliced'] = (num_spliced, percent_splice)  # (count, percent)
    D_stat['Gene density'] = gene_density
    D_stat['Num introns'] = sum(num_introns)
    D_stat['Num introns per gene'] = num_introns_median
    D_stat['Num exons'] = sum(num_exons)
    D_stat['Num exons per gene'] = num_exons_median
    D_stat['Num single exon genes'] = single_exon_genes
    # (coding bases, percent of all bases)
    D_stat['Percent coding region'] = (len(full_cds_seq), coding_percent)
    D_stat['Coding region GC'] = round(cds_gc_percent, 2)
    return D_cds_coords, protein_lengths, D_stat
def complete_tasks(full_seq, des, unique_key):
    """Render the sequence-analysis widgets for one sequence in Streamlit.

    Args:
        full_seq: the sequence to analyse. It is counted with ``Counter`` and
            must provide ``complement()``, ``reverse_complement()``,
            ``transcribe()`` and ``translate()``.
        des: description shown when "Description" is selected.
        unique_key: value passed as ``key=`` to every widget created here.

    Returns:
        None. All output goes through ``st`` / ``plt`` calls.
    """
    # NOTE(review): every widget below receives the same key; confirm the
    # Streamlit version in use accepts a shared key across widgets.
    file_details = st.radio("Details", ("Description", "Sequence"),
                            key=unique_key)
    #Show description and sequence in DNA Analysis section
    if file_details == "Description":
        st.write(des)
    elif file_details == "Sequence":
        st.write(full_seq)
    #Nucleotide occurrences plot and colour selector for the bars
    st.subheader("Plot Nucleotide Frequency")
    full_seq_freq = OrderedDict(Counter(full_seq))
    bar1_colour = st.beta_color_picker("Pick Colour for Bar 1", key=unique_key)
    bar2_colour = st.beta_color_picker("Pick Colour for Bar 2", key=unique_key)
    bar3_colour = st.beta_color_picker("Pick Colour for Bar 3", key=unique_key)
    bar4_colour = st.beta_color_picker("Pick Colour for Bar 4", key=unique_key)
    if st.button("Plot Frequency", key=unique_key):
        barlist = plt.bar(full_seq_freq.keys(), full_seq_freq.values())
        # Indexes 0-3 are used unconditionally, so the sequence needs at
        # least four distinct letters for this to succeed.
        barlist[0].set_color(bar1_colour)
        barlist[1].set_color(bar2_colour)
        barlist[2].set_color(bar3_colour)
        barlist[3].set_color(bar4_colour)
        st.pyplot()
    st.subheader("Properties")
    #GC Content, GC Melting temp, GC_skew, Complement and reverse complement
    gc_count = GC(full_seq)
    st.write("GC Content: {}".format(gc_count))
    mt = MeltingTemp.Tm_GC(full_seq, strict=False)
    st.write("Melting Temperature based on GC Content: {}".format(mt))
    gc_skew_bases = st.number_input("Enter number of bases", key=unique_key)
    try:
        gc_skew = GC_skew(full_seq, int(gc_skew_bases))
        st.write("GC Skew for {} bases: {}".format(gc_skew_bases, gc_skew))
    except ValueError:
        st.write("Enter a Valid Number for bases")
    # elif chain: the "Reverse Complement" checkbox is only created while
    # "Complement" is unchecked.
    if st.checkbox("Complement", key=unique_key):
        st.write(full_seq.complement())
    elif st.checkbox("Reverse Complement", key=unique_key):
        st.write(full_seq.reverse_complement())
    #Protein Synthesis
    st.subheader("Protein Synthesis")
    p1 = full_seq.translate()
    # Same pattern: each checkbox is only created while all the previous
    # ones in the chain are unchecked.
    if st.checkbox("Transcription: DNA to mRNA", key=unique_key):
        st.write(full_seq.transcribe())
    elif st.checkbox("Translation: DNA to 1 letter Amino Acid Sequence",
                     key=unique_key):
        st.write(p1)
    elif st.checkbox("Translation: DNA to 3 letter Amino Acid Sequence",
                     key=unique_key):
        # Stop-codon asterisks are removed before the 3-letter conversion.
        full_aa_name = str(p1).replace("*", "")
        st.write(seq3(full_aa_name))
    elif st.checkbox("Plot Amino Acid Frequency", key=unique_key):
        aa_freq = OrderedDict(Counter(str(p1)))
        bar_colour = st.beta_color_picker("Pick Colour for all Bars",
                                          key=unique_key)
        plt.bar(aa_freq.keys(), aa_freq.values(), color=bar_colour)
        st.pyplot()
        st.write("Asterisk (*) - Denotes Stop Codons.")
import pylab

# Part 1 - Open the GBK file. If the file holds several records, only the
# sequence of the last one is kept.
for registro in SeqIO.parse("NC_017108.gbk", "genbank"):
    seq = str(registro.seq)

# Part 2 - Important variables
janela = 10000                    # window size in bp
tamanho = len(seq)
fragmentos = tamanho // janela    # number of complete windows
gc = []

# Parts 3 and 4 - Store the GC content of every window, including the final
# shorter one when the length is not a multiple of the window size.
# The slice end is exclusive, so each full window covers exactly `janela`
# bases (the previous j:j+9999 slice silently dropped one base per window),
# and no empty trailing window is produced when the length divides evenly.
for i, j in enumerate(range(0, tamanho, janela)):
    fim = min(j + janela, tamanho)
    k = fim - 1                   # last index included in this window
    gc_atual = GC(seq[j:fim])
    gc.append(gc_atual)
    print(i, ": ", j, "-", k, "- GC =", gc_atual, "%")

# Part 5 - Draw the plot
pylab.plot(gc)
pylab.title("Conteudo GC\n%i fragmentos de 10000 pb variando de %0.1f%% "
            "a %0.1f%%" % (len(gc), min(gc), max(gc)))
def _bin_ids_by_gc(fasta_path):
    """Group the record IDs of a FASTA file into 2%-wide GC bins.

    Bins run from 20-22 up to 68-70 and are keyed like '48_to_50'. Records
    whose GC content is below 20 or at/above 70 are counted but not binned.

    Returns:
        (bins, record_count) where bins maps bin name -> list of record IDs.
    """
    bins = {}
    record_count = 0
    for record in SeqIO.parse(fasta_path, 'fasta'):
        record_count += 1
        gc_content = float(GC(record.seq))
        for lowerbound in range(20, 70, 2):
            upperbound = lowerbound + 2
            if lowerbound <= gc_content < upperbound:
                bin_name = '%s_to_%s' % (lowerbound, upperbound)
                bins.setdefault(bin_name, []).append(record.id)
                break
    return bins, record_count


def subsampleGC(modelfasta, subsamplefasta):
    """Subsample a FASTA file so its GC distribution follows a model file.

    For every GC bin present in ``modelfasta`` the share of model records in
    that bin is computed, and that share of the total number of records in
    ``subsamplefasta`` is drawn at random from the same bin of
    ``subsamplefasta`` (all records of the bin if it holds no more than
    requested).

    Args:
        modelfasta: path/handle of the FASTA file defining the target
            distribution.
        subsamplefasta: path/handle of the FASTA file to draw records from.
            It is parsed twice, so it must be re-readable.

    Returns:
        A list of ``['>' + id, sequence]`` pairs for the chosen records, in
        the order they appear in ``subsamplefasta``.
    """
    modelbins, modelfastarecords = _bin_ids_by_gc(modelfasta)
    subsamplebins, subsamplefastarecords = _bin_ids_by_gc(subsamplefasta)

    #Number of records in each fasta file...used for calculating density
    modelfastarecords = float(modelfastarecords)
    subsamplefastarecords = float(subsamplefastarecords)

    subsampledIDs = []
    for modelbin, model_ids in modelbins.items():
        modelbindens = len(model_ids) / modelfastarecords
        #Number of records to pick is density of that bin in modelfasta
        #* number of records in subsamplefasta
        subsample_records_to_pick = int(
            round(modelbindens * subsamplefastarecords))
        candidates = subsamplebins.get(modelbin)
        if candidates is None:
            continue
        if len(candidates) > subsample_records_to_pick:
            #pick random records
            subsampledIDs += random.sample(candidates,
                                           subsample_records_to_pick)
        else:
            #pick all records
            subsampledIDs += candidates

    #Reassemble fasta from chosen IDs (set gives O(1) membership tests)
    chosen_ids = set(subsampledIDs)
    subsampledfasta = [['>' + str(record.id), record.seq]
                       for record in SeqIO.parse(subsamplefasta, 'fasta')
                       if record.id in chosen_ids]

    # Single parenthesised argument: valid under both Python 2 and Python 3.
    print('There were %i records in the model fasta and %i in the fasta to be subsampled. %i records were chosen in the sampling.' % (modelfastarecords, subsamplefastarecords, len(subsampledIDs)))

    #Return a list of fasta records. Each record is itself a list where the
    #first item is the ID and the second is the sequence
    return subsampledfasta
def _load_ffn_genes(ffn_dir, gene_range):
    """Read every gene of every ``.ffn`` file in *ffn_dir* once.

    Returns a list of ``(start, end, on_complement, gc_percent, length)``
    tuples. ``start``/``end`` are the two integers of the first
    ``<digits>-<digits> `` match in the record description, in the order
    they appear there.
    """
    genes = []
    for file_name in os.listdir(ffn_dir):
        if not file_name.endswith(".ffn"):
            continue
        with open(ffn_dir + '\\' + file_name, 'r') as ffn_handle:
            for gene in SeqIO.parse(ffn_handle, "fasta"):
                description = gene.description
                first, second = gene_range.findall(description)[0].split('-')
                gene_seq = str(gene.seq)
                genes.append((int(first), int(second),
                              '|:c' in description,
                              GC(gene_seq), len(gene_seq)))
    return genes


def gc_extract():
    """Collect per-read gene statistics for the positive and negative sets.

    For each directory named in the two list files, every read of
    ``grinder-reads.fa`` is compared with the genes of the directory's
    ``.ffn`` files. A gene counts for a read when both are on the same
    strand (complement marker present in both or in neither) and the read's
    coordinate range encloses the gene's range. Reads with at least one gene
    contribute one entry to each output list.

    Returns:
        dict with the keys 'Name', 'GCcontent', 'num_of_gene' and
        'length_of_gene', each mapping to a list with one item per matched
        read. Names are 'pos<N>' / 'neg<N>' with N counting matched reads
        per list file.
    """
    pos_neg_files = ['phycodnaviridae_virus_name.txt', 'phage_name.txt']
    data_path = 'C:\\Users\\Reema\\Documents\\SDSU_Education\\Thesis_Phyco\\'
    path = 'C:\\Users\\Reema\\Documents\\SDSU_Education\\Thesis_Phyco\\all_fna\\all.fna\\'
    path1 = 'C:\\Users\\Reema\\Documents\\SDSU_Education\\Thesis_Phyco\\all_ffn\\all.ffn\\'
    eg_list = ['pos', 'neg']

    # Compiled once instead of once per record.
    read_range = re.compile(r'[0-9]+\.\.[0-9]+')
    gene_range = re.compile(r'[0-9]+\-[0-9]+ ')

    gc_list = []
    num_gene_list = []
    gene_len_list = []
    name_list = []

    for prepend_str, example in zip(eg_list, pos_neg_files):
        with open(data_path + example, 'r') as f:
            dir_names = [line.strip('\n') for line in f.readlines()]

        contig_num = 0
        for dir_name in dir_names:
            dirpath = path + dir_name
            dirpath1 = path1 + dir_name
            # Parse the gene files once per directory rather than once per
            # read; the gene list does not depend on the read.
            genes = _load_ffn_genes(dirpath1, gene_range)

            with open(dirpath + '\\' + 'grinder-reads.fa', 'r') as reads:
                for read in SeqIO.parse(reads, "fasta"):
                    inpstr = read.description
                    complement_in_grinder = 'position=complement' in inpstr
                    bounds = read_range.findall(inpstr)[0].split('..')
                    # Compare coordinates numerically: as strings, "100"
                    # would sort before "99".
                    gmin = int(bounds[0])
                    gmax = int(bounds[1])

                    GC_content = 0
                    num_gene = 0
                    length = 0
                    for smin, smax, complement_in_seq, gene_gc, gene_len in genes:
                        if complement_in_grinder != complement_in_seq:
                            continue
                        if gmin <= smin and gmax >= smax:
                            GC_content = GC_content + gene_gc
                            num_gene = num_gene + 1
                            length = length + gene_len

                    if num_gene:
                        gc_list.append(round(GC_content / num_gene, 2))
                        gene_len_list.append(length / num_gene)
                        num_gene_list.append(num_gene)
                        name_list.append(prepend_str + str(contig_num))
                        contig_num = contig_num + 1

    ret_dict = {
        'Name': name_list,
        'GCcontent': gc_list,
        'num_of_gene': num_gene_list,
        'length_of_gene': gene_len_list
    }
    return ret_dict
# Register every record of the input FASTA file: sequence by id in seq_dict,
# ids in reading order in headers_list.
headers_list = []
for seq in SeqIO.parse(inputFasta, "fasta"):
    seq_dict[seq.id] = seq.seq
    headers_list.append(seq.id)

#---------/compute gc and length/-------
# One value per entry of seq_dict, in the dict's iteration order.
gc_cont = [GC(sequence) for sequence in seq_dict.values()]
lengths = [len(sequence) for sequence in seq_dict.values()]
#-------------------------------------

num_seqs = len(headers_list)

# blastn of the file given as first command-line argument against itself;
# tabular output (query id, subject id, percent identity) in blast_out.tmp.
command = [
    "blastn",
    "-query", sys.argv[1],
    "-subject", sys.argv[1],
    "-outfmt", "6 qseqid sseqid pident",
    "-out", "blast_out.tmp",
]
subprocess.call(command)

pident_list = []
def get_feature(FA,heDAT,outFILE):
    """Write one tab-separated feature row per transcript of a FASTA file.

    Args:
        FA: path of the transcript FASTA file. It is passed to the external
            ``txCdsPredict`` command and also parsed with SeqIO.
        heDAT: path of a whitespace-separated hexamer table; column 0 is the
            hexamer, column 1 and 2 are stored as the coding and noncoding
            values. A row whose first field is 'hexamer' is skipped.
        outFILE: output prefix; rows are written to ``outFILE + ".feature"``.

    Output columns, in order: CDS length / transcript length, Fickett score,
    hexamer score, txCdsPredict score, CGG, TAG, GC content, standard
    deviation of stop-codon counts over the three frames, isoelectric point,
    ACG, GGC, transcript length, CGT, AGC, GAC, GGG, TCA, CAT, ORF fullness,
    CAG. The three-letter columns are ``count / (length - 2) * 100``.

    Side effects: runs ``txCdsPredict`` and ``rm`` through ``os.system`` and
    creates/deletes ``tmp.cds`` in the current working directory.
    """
    f_out = open(outFILE+".feature", "w")
    # test_count and transcript_len are not used again in this function.
    test_count = 0
    transcript_len = dict()
    ### get feature from lnc_fasta
    #### run txCdsPredict on the input FASTA file ####
    # NOTE(review): FA is concatenated into a shell command unquoted.
    cmd = "txCdsPredict " + FA + " -anyStart tmp.cds"
    os.system(cmd)
    cds_len = dict()
    cds_score = dict()
    temp=open("tmp.cds")
    for line in temp:
        line_array = line.split()
        # The transcript id is the part of the first field before any '|'.
        id_array = line_array[0].split('|')
        start=0
        end=0
        trans_id = id_array[0]
        pred_start = int(line_array[1])
        pred_end = int(line_array[2])
        pred_len = pred_end-pred_start
        cds_len[trans_id] = pred_len
        cds_score[trans_id] = float(line_array[5])
    temp.close()
    os.system("rm tmp.cds")
    ############ end of running txCdsPredict ################
    #### extract hexamer (CPAT)####
    #build hexamer table from hexamer frequency file
    coding={}
    noncoding={}
    start_codons='ATG'
    stop_codons='TAG,TAA,TGA'
    # NOTE(review): this file handle is never closed explicitly.
    for line in open(heDAT):
        line = line.strip()
        fields = line.split()
        if fields[0] == 'hexamer':continue
        coding[fields[0]] = float(fields[1])
        noncoding[fields[0]] = float(fields[2])
    ####end extract hexamer ####
    #### extract peptide_length,Fickett_score,ORF_integrity (CPC2) ####
    strand = "+"
    strinfoAmbiguous = re.compile("X|B|Z|J|U",re.I)
    ptU = re.compile("U",re.I)
    fickett_obj = Fickett()
    ####end extract peptide_length,Fickett_score,ORF_integrity (CPC2) ####
    ## read the FASTA file of transcripts
    f_in = open(FA)
    for record in SeqIO.parse(f_in, "fasta"):
        ID = record.id
        first = ID.split("|")
        seq = record.seq
        seq_len = len(seq)
        # 3-mer frequencies as a percentage of (seq_len - 2).
        # NOTE(review): a sequence of length 2 makes the divisor zero.
        ACG_mer = seq.count("ACG")*100/float(seq_len-2)
        AGC_mer = seq.count("AGC")*100/float(seq_len-2)
        CAG_mer = seq.count("CAG")*100/float(seq_len-2)
        CAT_mer = seq.count("CAT")*100/float(seq_len-2)
        CCA_mer = seq.count("CCA")*100/float(seq_len-2)
        CGG_mer = seq.count("CGG")*100/float(seq_len-2)
        CGT_mer = seq.count("CGT")*100/float(seq_len-2)
        GAC_mer = seq.count("GAC")*100/float(seq_len-2)
        GAG_mer = seq.count("GAG")*100/float(seq_len-2)
        GAT_mer = seq.count("GAT")*100/float(seq_len-2)
        GGC_mer = seq.count("GGC")*100/float(seq_len-2)
        GGG_mer = seq.count("GGG")*100/float(seq_len-2)
        TAC_mer = seq.count("TAC")*100/float(seq_len-2)
        TAG_mer = seq.count("TAG")*100/float(seq_len-2)
        TCA_mer = seq.count("TCA")*100/float(seq_len-2)
        #Translating nucleotides to peptide sequences according to frame shift
        frame_0 = frame_translation(seq, 0, genetic_code=1)
        stop_count_0 = frame_0.count("*")
        frame_1 = frame_translation(seq, 1, genetic_code=1)
        stop_count_1 = frame_1.count("*")
        frame_2 = frame_translation(seq, 2, genetic_code=1)
        stop_count_2 = frame_2.count("*")
        stop = (stop_count_0, stop_count_1, stop_count_2)
        std_stop = numpy.std(stop)
        # Despite the name, seqRNA has every U/u replaced by T and is
        # upper-cased.
        seqRNA = ptU.sub("T",str(seq).strip())
        seqRNA = seqRNA.upper()
        seqCDS,start_pos,orf_strand,orf_fullness = FindCDS(seqRNA).longest_orf(strand)
        '''seqCDS:longest ORF'''
        seqprot = mRNA_translate(seqCDS)
        pep_len = len(seqprot)
        #pep_len = len(seqprot.strip("*"))
        # The pattern used here removes X, B, Z, J and U (case-insensitive).
        newseqprot = strinfoAmbiguous.sub("",str(seqprot))
        '''exclude ambiguous amio acid X, B, Z, J, Y in peptide sequence'''
        fickett_score = fickett_obj.fickett_value(seqRNA)
        protparam_obj = ProtParam.ProteinAnalysis(str(newseqprot.strip("*")))
        if pep_len > 0:
            isoelectric_point = protein_param(protparam_obj)
        else:
            # No peptide: mark the ORF as absent and use a neutral pI value.
            orf_fullness = -1
            isoelectric_point = 0.0
        hexamer = extract_feature_from_seq(seq = seq, stt = start_codons,stp = stop_codons,c_tab=coding,g_tab=noncoding)
        # print features
        #print("%s"%first[0], end='\t', file=f_out)
        # Raises KeyError if txCdsPredict produced no line for this id.
        cds = cds_len[first[0]]
        len_perc = float(cds)/seq_len
        print("%0.2f"%len_perc, end='\t', file=f_out)
        print("%s"%str(fickett_score), end='\t', file=f_out)
        print("%s"%str(hexamer), end='\t', file=f_out)
        print("%d"%cds_score[first[0]], end='\t', file=f_out)
        print("%0.4f"%CGG_mer, end = "\t", file=f_out)
        print("%0.4f"%TAG_mer, end = "\t", file=f_out)
        print("%0.2f"%GC(seq), end='\t', file=f_out)
        print("%.6f"%std_stop, end='\t', file=f_out)
        print("%s"%str(isoelectric_point), end='\t', file=f_out)
        print("%0.4f"%ACG_mer, end = "\t", file=f_out)
        print("%0.4f"%GGC_mer, end = "\t", file=f_out)
        print("%d"%seq_len, end='\t', file=f_out)
        print("%0.4f"%CGT_mer, end = "\t", file=f_out)
        print("%0.4f"%AGC_mer, end = "\t", file=f_out)
        print("%0.4f"%GAC_mer, end = "\t", file=f_out)
        print("%0.4f"%GGG_mer, end = "\t", file=f_out)
        print("%0.4f"%TCA_mer, end = "\t", file=f_out)
        print("%0.4f"%CAT_mer, end = "\t", file=f_out)
        print("%s"%str(orf_fullness), end='\t', file=f_out)
        # Last column: no end= argument, so this print terminates the row.
        print("%0.4f"%CAG_mer,file=f_out)
    f_out.close()
    f_in.close()
def meanGC(lst):
    """Return the geometric mean of the GC content of the sequences in *lst*.

    *lst* is an iterable of sequences accepted by ``GC``; the per-sequence
    values are combined with ``ss.mstats.gmean``.
    """
    gc_values = [GC(sequence) for sequence in lst]
    return ss.mstats.gmean(gc_values)
# Write the GC content of consecutive, non-overlapping windows of a FASTA
# file (intended for a file holding a single sequence), one value per line.
#
# Command-line arguments:
#   1. input FASTA file name
#   2. output file name
#   3. window size in bases
import sys, os
from Bio import SeqIO
from Bio.SeqUtils import GC

window_size = int(sys.argv[3])

# Both files are closed on exit from the with-block, including on error.
with open(sys.argv[1], "r") as raw_file, \
        open(sys.argv[2], "w") as window_GC:
    for rec in SeqIO.parse(raw_file, "fasta"):
        total_size = len(rec.seq)
        # Each window is written as soon as it is computed instead of being
        # collected in a list first; the last window may be shorter.
        for pos in range(0, total_size, window_size):
            small_chunk = str(rec.seq[pos:pos + window_size])
            gc_content = GC(small_chunk)
            window_GC.write(str(gc_content) + "\n")
oup_GC.write("group\tspecies\tGC\tGC1\tGC2\tGC3\n") for inl in folder: print inl group = inl.split(".aln")[0] ortho_groups.append(group) os.system("trimal -in %s/%s -out trimmed/%s.aln -gt 0.9" % (align_folder, inl, inl)) #generate a trimmed alignment # to calculate GC content on conserved sections align = AlignIO.read("trimmed/%s.aln" % inl, "clustal") for seq in align: species = seq.id seqstring = str(seq.seq).replace("-", "") GCcont = GC(seqstring) GC1 = GC(seqstring[0::3]) GC2 = GC(seqstring[1::3]) GC3 = GC(seqstring[2::3]) main_dict[species][group] = (GCcont, GC1, GC2, GC3) print species, group, GCcont ortho_groups = set(ortho_groups) for i in ortho_groups: for spec in species_list: oup_GC.write("%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\n" % (i, spec, main_dict[spec][i][0], main_dict[spec][i][1], main_dict[spec][i][2], main_dict[spec][i][3]))
def result_tools3(request):
    """Render the result page with the GC content of the posted sequence.

    The sequence is read from the POST field 'tool1' (the literal string
    'default' is used when the field is missing) and the GC value is passed
    to the template under the key 'res'.
    """
    submitted_sequence = request.POST.get('tool1', 'default')
    gc_percent = GC(Seq(submitted_sequence))
    return render(request, 'mysite/result_tools.html', {'res': gc_percent})
def return_genbank_dict(gb_file, key='annotation', seq_type='amino_acid'):
    """Build a dictionary from the first record of a GenBank file.

    Args:
        gb_file: path of the GenBank file; only its first record is read.
        key: 'locus' or 'annotation' (the default in the signature is
            'annotation'). Any other value adds no feature entries.
        seq_type: overwritten for every processed feature, so the value
            passed by the caller is not used.

    Returns:
        A dict that always holds 'accession' and 'common_name', plus one
        entry per CDS/ncRNA/tRNA/mRNA/rRNA feature:
        * key == 'locus': ``locus -> (locus, gene, seq, seq_type, synonyms)``
        * key == 'annotation': ``gene -> [header, sequence]`` where header is
          the '|'-joined accession, common name, locus, gene, start, stop,
          strand, sequence type, synonyms and GC content. Features without a
          gene name are stored as 'unknown_<n>'; each synonym is stored as an
          additional key.

    Note: only the locus is expected to be unique within a GenBank file;
    with key == 'annotation', later features sharing a gene name or synonym
    replace earlier ones.
    """
    result = {}
    # NOTE(review): the file object opened here is never closed explicitly.
    seq_record = SeqIO.parse(open(gb_file), "genbank").next()
    accession = seq_record.annotations['accessions'][0].split('.')[0]
    common_name = seq_record.annotations['organism'].replace(' ', '_')
    result.update({'accession': accession})
    result.update({'common_name': common_name})
    cnt = 0  # number of features without a 'translation' qualifier
    # loop over the genbank file
    unk_cnt = 1  # suffix for the next 'unknown_<n>' key
    for fnum, feature in enumerate(seq_record.features):
        # here i simply check the gene coding type, and identify them in a way that can be used later.
        if feature.type == 'CDS' or feature.type == 'ncRNA' or feature.type == 'tRNA' or feature.type == 'mRNA' or feature.type == 'rRNA':
            start = feature.location.start
            stop = feature.location.end
            strand = feature.strand
            synonyms = 'NONE'
            if 'gene_synonym' in feature.qualifiers:
                # "a; b; c" -> "a:b:c"
                synonyms = ':'.join(
                    feature.qualifiers['gene_synonym'][0].replace(
                        ' ', '').split(';'))
            # Locus: prefer locus_tag, fall back to the gene name, else ''.
            try:
                locus = feature.qualifiers['locus_tag'][0]
            except:
                try:
                    locus = feature.qualifiers['gene'][0]
                except:
                    locus = ''
                    print 'No locus associated. This should never be invoked meaning you are proper fracked. (The gbk file has an error).'
            try:
                gene = feature.qualifiers['gene'][0]
            except:
                gene = 'unknown'
            # Use the protein translation when present; otherwise fall back
            # to the nucleotide slice of the record.
            try:
                seq = feature.qualifiers['translation']
                seq_type = 'Protein'
            except:
                cnt = cnt + 1
                seq = seq_record.seq[start:stop]
                seq_type = feature.type
                if feature.type == 'CDS':
                    # A CDS without a translation is labelled a pseudogene.
                    seq_type = 'Pseudo_Gene'
            # GC is always computed on the nucleotide slice of the record.
            gc = "%2.1f" % GC(seq_record.seq[start:stop])
            #print synonyms
            #method = "exact"
            if key == 'locus':
                result.update({locus: (locus, gene, seq, seq_type, synonyms)})
            elif key == 'annotation':
                if gene == 'unknown':
                    new_gene = 'unknown_' + str(unk_cnt)
                    header = '|'.join([
                        accession, common_name, locus, gene, str(start),
                        str(stop), str(strand), seq_type, synonyms, gc
                    ])
                    result.update({new_gene: [header, ''.join(seq)]})
                    unk_cnt += 1
                else:
                    header = '|'.join([
                        accession, common_name, locus, gene, str(start),
                        str(stop), str(strand), seq_type, synonyms, gc
                    ])
                    result.update({gene: [header, ''.join(seq)]})
                #################################################
                # New code to overcome some issues with         #
                # RegulonDB. we will store synonym data as well #
                # which improves operon recovery slightly.      #
                #################################################
                # NOTE(review): nesting level of this synonym section should
                # be confirmed against the original file layout.
                if synonyms != 'NONE':
                    for syn in synonyms.split(':'):
                        header = '|'.join([
                            accession, common_name, locus, gene, str(start),
                            str(stop), str(strand), seq_type, synonyms, gc
                        ])
                        result.update({syn: [header, ''.join(seq)]})
    #print 'The number of non-protein regions in %s is: %i.' % (common_name, cnt)
    return result
def __call__(self, best_solution: PASSolution,
             mutations: [PASMutationSite],
             sequences: PASSequences) -> [PASResult]:
    """Build one PASResult per fragment of the best solution.

    Args:
        best_solution: solution providing the fragments, the gene sequence
            and the temperature calculator used for overlaps.
        mutations: mutation sites; their ``position`` is converted to a
            nucleotide offset as ``goi_offset + (position - 1) * 3``.
        sequences: provides the gene-of-interest offset and the full
            sequence used to extract overlaps.

    Returns:
        List of PASResult in fragment order. The overlap fields describe the
        overlap with the following fragment and are all None for the last
        fragment. Fragments at odd indexes (i.e. every second fragment) are
        returned reverse-complemented, together with their oligos. An empty
        fragment list yields an empty result list.
    """
    # two shifted iterators to iterate over fragment and next fragment in the same time
    # in purpose to calculate overlaps
    frag_current_it = iter(best_solution.get_fragments())
    frag_lagged_it = iter(best_solution.get_fragments())
    # Advance the lagged iterator by one; the default keeps an empty
    # fragment list from raising StopIteration here.
    next(frag_lagged_it, None)

    results = []
    goi_offset = sequences.get_goi_offset()

    # sorted list of all mutations sites
    mutation_sites = sorted({mut.position for mut in mutations})

    # creating the output values for every fragment
    for i, frag_current in enumerate(frag_current_it):
        # getting oligos for a fragment, and fragment parameters
        generator = OligoGenerator(self.config, self.is_mutations_as_codons,
                                   self.config.organism)
        oligos_group = generator(
            frag_current.get_sequence(best_solution.gene.sequence),
            mutations, frag_current, goi_offset, 250)
        fragment_sequence = frag_current.get_sequence(
            best_solution.gene.sequence)

        # getting list of mutations on a fragment a prepare it in a desired json format
        # (a site is on the fragment when its whole codon lies inside it)
        mutation_sites_on_fragment = [
            site for site in mutation_sites
            if ((goi_offset + (site - 1) * 3) >= frag_current.get_start() and
                (goi_offset + (site - 1) * 3 + 2) <= frag_current.get_end())
        ]
        mutations_on_fragment = [
            mut for mut in mutations
            if mut.position in mutation_sites_on_fragment
        ]
        mutations_on_fragment_formatted = self.combine_mutations_list(
            frag_current, oligos_group, mutation_sites_on_fragment,
            mutations_on_fragment, fragment_sequence, goi_offset)
        list_oligos = combine_oligos_list(oligos_group,
                                          mutations_on_fragment_formatted,
                                          mutation_sites_on_fragment,
                                          goi_offset, frag_current)

        # getting overlap and its parameters
        # Only exhaustion of the lagged iterator (last fragment) leads to
        # "no overlap"; genuine errors while computing the overlap are no
        # longer silently swallowed by a bare except.
        try:
            frag_next = next(frag_lagged_it)
        except StopIteration:
            overlap = overlap_Tm = overlap_GC = overlap_length = None
        else:
            overlap = frag_current.get_overlap_seq(
                frag_next, sequences.get_full_sequence())
            overlap_Tm = best_solution.temp_calculator(overlap)
            overlap_GC = GC(overlap)
            overlap_length = len(overlap)

        # every fragment at even position should be reverse complement of the original sub-sequence
        # doing it here because previous code requires fragment in original forward direction
        if i % 2 == 1:
            for oligo in list_oligos:
                oligo.make_reverse_complement()
            fragment_sequence = reverse_complement(fragment_sequence)

        # combining the results together
        result_oligo = PASResult(fragment=fragment_sequence,
                                 start=frag_current.get_start(),
                                 end=frag_current.get_end(),
                                 length=frag_current.get_length(),
                                 overlap=overlap,
                                 overlap_Tm=overlap_Tm,
                                 overlap_GC=overlap_GC,
                                 overlap_length=overlap_length,
                                 mutations=mutations_on_fragment_formatted,
                                 oligos=list_oligos)
        results.append(result_oligo)

    return results
def gc(self, seq):
    """Return the GC content of *seq* as a percentage between 0 and 100."""
    gc_percent = GC(seq)
    return gc_percent
def draw_gc(sumgene, dwg, gc_color):
    """Add GC-content and GC-skew polygons for 300 windows to a drawing.

    Args:
        sumgene: object exposing ``len`` (sequence length) and ``seq``.
        dwg: drawing object; ``dwg.polygon(...)`` results are passed to
            ``dwg.add(...)`` (presumably an svgwrite Drawing -- TODO confirm).
        gc_color: indexable with 0-3: fill for GC content at/above and below
            the base radius, then for GC skew at/above and below its base
            radius.

    Returns:
        The same ``dwg`` object, with two polygons added per window.
    """
    #gc kmer number
    unit_num = 300
    unit_len = int(sumgene.len / unit_num)
    #first coordinates
    start_unit = 0
    end_unit = unit_len - 1
    mid_unit = (end_unit - start_unit) / 2
    #radius
    gcc_r = 430
    gcs_r = 230
    # (min, max) radius that the scaled values are mapped onto
    gcc_range = (330, 530)
    gcs_range = (130, 330)
    gcc_mean = GC(sumgene.seq)
    gc_contents = []
    gc_skews = []
    #compute gc
    # First pass: per-window GC deviation from the whole-sequence value and
    # first GC-skew value of the window.
    # NOTE(review): the slice end is exclusive, so with end_unit starting at
    # unit_len - 1 each window covers unit_len - 1 bases -- confirm intended.
    while (unit_num > 0):
        gc_content = GC(sumgene.seq[start_unit:end_unit])
        gcc_variance = gc_content - gcc_mean
        gc_contents.append(gcc_variance)
        gc_skew = GC_skew(sumgene.seq[start_unit:end_unit], window=unit_len)[0]
        gc_skews.append(gc_skew)
        start_unit += unit_len
        # Clamp the window end to the sequence length.
        if end_unit + unit_len <= sumgene.len:
            end_unit += unit_len
        else:
            end_unit = sumgene.len
        unit_num -= 1
    gcc_max = max(gc_contents)
    gcc_min = min(gc_contents)
    gcs_max = max(gc_skews)
    gcs_min = min(gc_skews)
    # Linear scale factors from value range to radius range.
    # NOTE(review): division by zero if all windows share the same value.
    gcc_scal = (gcc_range[1] - gcc_range[0]) / (gcc_max - gcc_min)
    gcs_scal = (gcs_range[1] - gcs_range[0]) / (gcs_max - gcs_min)
    # Second pass: reset the window cursor and draw.
    gc_count = 0
    unit_num = 300
    start_unit = 0
    end_unit = unit_len - 1
    mid_unit = (end_unit - start_unit) / 2
    while (unit_num > 0):
        # draw gc_content
        # Polygon of three points: the two window ends at the base radius
        # and the window midpoint at the scaled radius. 1500 is added to x
        # and y is subtracted from 1500 for every point.
        c_pos = position_mapping(sumgene, [start_unit, end_unit], gcc_r)
        gcc_nr = (gc_contents[gc_count] - gcc_min) * gcc_scal + gcc_range[0]
        cm_pos = position_mapping(sumgene, [mid_unit, 0], gcc_nr)
        points = [(1500 + c_pos[0], 1500 - c_pos[1]),
                  (1500 + c_pos[2], 1500 - c_pos[3]),
                  (1500 + cm_pos[0], 1500 - cm_pos[1])]
        if gcc_nr >= gcc_r:
            dwg.add(dwg.polygon(points, fill=gc_color[0], stroke_width=0))
        else:
            dwg.add(dwg.polygon(points, fill=gc_color[1], stroke_width=0))
        # draw gc_skew
        s_pos = position_mapping(sumgene, [start_unit, end_unit], gcs_r)
        gcs_nr = (gc_skews[gc_count] - gcs_min) * gcs_scal + gcs_range[0]
        sm_pos = position_mapping(sumgene, [mid_unit, 0], gcs_nr)
        points = [(1500 + s_pos[0], 1500 - s_pos[1]),
                  (1500 + s_pos[2], 1500 - s_pos[3]),
                  (1500 + sm_pos[0], 1500 - sm_pos[1])]
        if gcs_nr >= gcs_r:
            dwg.add(dwg.polygon(points, fill=gc_color[2], stroke_width=0))
        else:
            dwg.add(dwg.polygon(points, fill=gc_color[3], stroke_width=0))
        # Advance to the next window, clamping the end as in the first pass.
        start_unit += unit_len
        if end_unit + unit_len <= sumgene.len:
            end_unit += unit_len
        else:
            end_unit = sumgene.len
        mid_unit += unit_len
        unit_num -= 1
        gc_count += 1
    return dwg