def main():
    """Translate each RNA sequence of a user-chosen file and scan it for TM helices.

    Prompts for an input file name and an output file name. Every record
    returned by ``readfasta`` is processed in order: the RNA sequence (element
    2 of the record) is written to the output file, followed by its
    translation from ``translate``; the translated sequence and the open
    output handle are then passed to ``findTMD``.

    The output file is closed when the function finishes, including when one
    of the helpers raises.
    """
    # Read in the RNA sequences from a file specified by user input
    filename = input("Please enter the input file name: ")
    rnainfo = readfasta(filename)

    # Prepare to re-write the RNA sequences to an output file specified by user input
    outfilename = input("Please enter the output file name: ")

    # The context manager closes the file even if translate()/findTMD() raises
    with open(outfilename, mode="w") as handle:
        # Iterate through each RNA sequence in the input file (genes are numbered from 1)
        for number, record in enumerate(rnainfo, start=1):
            rnaseq = record[2]

            # Specify gene that is being evaluated
            handle.write("Gene " + str(number) + ": " + rnaseq + "\n\n")

            # Translate the RNA sequence to its corresponding single-letter
            # amino acid sequence and write it to the output file
            translatedseq = translate(rnaseq)
            handle.write("Protein Sequence " + str(number) + ": " + translatedseq + "\n\n")

            # Scan the single-letter amino acid sequence for transmembrane
            # helices; findTMD receives the handle to write its results
            findTMD(translatedseq, handle)
#!/usr/bin/python
# Print a fixed window (positions 15720-15724, 0-based) of each sequence in a
# fasta file, followed by the number of unambiguous bases and the number of
# IUPAC-coded SNP characters found in that window.
# Usage: script.py sequences.fasta
import sys
from fasta import readfasta

# Characters counted as called bases and as IUPAC ambiguity (SNP) codes
homozygote = 'ATCG'
SNPs = 'RYSWKMBDHV'

# Close the input as soon as it has been parsed
# NOTE(review): assumes readfasta() consumes the whole handle before returning - confirm
with open(sys.argv[1], "r") as f:
    fd = readfasta(f)

# NOTE(review): the original line structure was lost; the counts are assumed to
# be computed and printed once per sequence - confirm against the source script
for key in fd:
    sequence = fd[key][15720:15725]
    # Single-argument print(...) behaves the same under Python 2 and Python 3
    print(sequence)
    homozygote_count = sum(1 for base in sequence if base.upper() in homozygote)
    SNPs_count = sum(1 for base in sequence if base.upper() in SNPs)
    print(homozygote_count)
    print(SNPs_count)
GCs = 'GC' homozygote_count = len( [base.upper() for base in sequence if base.upper() in homozygote]) GC_count = len([base.upper() for base in sequence if base.upper() in GCs]) return homozygote_count, GC_count #************************************************ # Read bed and fasta input files #************************************************ overlapping = open(sys.argv[1], "r") fasta_seq = open(sys.argv[2], "r") out = open(sys.argv[3], "w") #************************************************ # Read fasta file in dictionary fasta_dict = readfasta(fasta_seq) #************************************************ #************************************************ # Read overlap file in dictionary #************************************************ overlapping_list = [] # Create a list to keep order when printing overlapping_dict = {} multi_window_intervals = {} for line in overlapping: line = line.strip("\n").split("\t") key = line[0] + ":" + line[1] + ":" + line[2] value = [int(line[4]), int(line[5]), int(line[6])] if key in overlapping_dict.keys(): overlapping_dict[key].append(value) else: overlapping_dict[key] = [value]
# use them for resampling.
# Usage: ./compile_sequences.py seq.fasta seq.coordinates seq.scaf.chro

# Input handles, in command-line order
fastaseq = open(sys.argv[1], "r")    # File one is the fasta sequence
intergenic = open(sys.argv[2], "r")  # File two is the intergenic file
chrfile = open(sys.argv[3], "r")     # File three is the chromosome file that
                                     # contains all the scaffolds for that chromosome.


def chunks(s, n):
    """Yield consecutive slices of ``s`` that are ``n`` items long.

    The final slice is shorter when ``len(s)`` is not a multiple of ``n``.
    """
    for offset in range(0, len(s), n):
        stop = offset + n
        yield s[offset:stop]


# Read the fasta file into a dictionary
fastaDict = readfasta(fastaseq)
#print fastaDict


# Read the intergenic coordinates into a dictionary
def intergenicCoord(intergenic):
    """Group tab-separated rows by their first column.

    Every line of ``intergenic`` is stripped of surrounding whitespace and
    split on tabs. The first field is the key; the list of remaining fields is
    appended to that key's list.

    Returns a dict mapping each first field to a list of field lists, in the
    order the rows were read.
    """
    intergenicDict = {}
    for row in intergenic:
        fields = row.strip().split("\t")
        # setdefault creates the list on first sight of a key, then reuses it
        intergenicDict.setdefault(fields[0], []).append(fields[1:])
    return intergenicDict
#!/usr/bin/python from __future__ import division import sys from fasta import readfasta from het import heterozygosity fasta = sys.argv[1] # Read fasta into a dictionary with open(fasta, 'r') as f: fasta_dict = readfasta(f) # Calculate heterozygosity for every scaffold for key in fasta_dict.keys(): het = heterozygosity(fasta_dict[key]) print(key + "\t" + str(het[0]) + "\t" + str(het[1]) + "\t" + str(het[2]))
# It reads the intron coordinates generated by extract_from_gff.py into a dictionary,
# extracts the corresponding sequences from a fasta file and then runs the heterozygosity
# function to count the number of SNPs and bases.
# Usage: ./intron_heterozygosity.py intron_coordinates.txt fasta.fa > output
##########################################################################################

#***************************************************
# Read the intron coordinates and the fasta sequence
#***************************************************
intron_coord = open(sys.argv[1], 'r')
fasta = open(sys.argv[2], 'r')

#******************************************
# Read the fasta sequence into a dictionary
#******************************************
fastaseq = readfasta(fasta)

#**********************************************
# Read the intron coordinates into a dictionary
#**********************************************
# Key: first tab-separated column of a row.
# Value: list holding the remaining columns of every row that shares the key.
intron_dict = {}
for line in intron_coord:
    fields = line.strip('\n').split('\t')
    intron_dict.setdefault(fields[0], []).append(fields[1:])

#**************************************************************************
# Extract the intronic sequences from fasta and read them into a dictionary
if not isinstance(sequence, str): raise Exception("Sequence is not a string") R = len([base.upper() for base in sequence if base.upper() == "R"]) Y = len([base.upper() for base in sequence if base.upper() == "Y"]) S = len([base.upper() for base in sequence if base.upper() == "S"]) W = len([base.upper() for base in sequence if base.upper() == "W"]) K = len([base.upper() for base in sequence if base.upper() == "K"]) M = len([base.upper() for base in sequence if base.upper() == "M"]) return (R, Y, S, W, K, M) ################################################################################ fastafile = open(sys.argv[1], "r") fastadict = readfasta(fastafile) R_l = [] Y_l = [] S_l = [] W_l = [] K_l = [] M_l = [] for key in fastadict.keys(): R = trans_tranv_count(fastadict[key])[0] Y = trans_tranv_count(fastadict[key])[1] S = trans_tranv_count(fastadict[key])[2] W = trans_tranv_count(fastadict[key])[3] K = trans_tranv_count(fastadict[key])[4] M = trans_tranv_count(fastadict[key])[5] R_l.append(R)
# If run: ./Open_reading_frame.py CDS.fa SW > output
# The script counts only S and W sites.
# CDS.fa is produced by extract_CDS_from_fasta.py and contains a fasta sequence with IUPAC
# coded SNPs.
##########################################################################################

#*******************
# Specify the inputs
#*******************
inFile = open(sys.argv[1], 'r')
argument = sys.argv[2]

#************************************************************
# Read the fasta file into a dictionary
#************************************************************
fastaseq = readfasta(inFile)

#************************************************************
# Specify four nucleotides
#************************************************************
nucs = list("ATCG")

#************************************************************
# Specify IUPAC codes as a dictionary
#************************************************************
# Each two-base ambiguity code maps to the pair of nucleotides it stands for.
IUPAC_code = {
    'R': ['A', 'G'],
    'Y': ['C', 'T'],
    'S': ['G', 'C'],
    'W': ['A', 'T'],
    'K': ['G', 'T'],
    'M': ['A', 'C'],
}
# Three-base codes are left disabled:
#, 'B':['C', 'G', 'T'], 'D':['A', 'G', 'T'], 'H':['A', 'C', 'T'], 'V':['A', 'C', 'G']}

# Four fold degenerate sites
# The script counts only S and W sites.
# CDS.fa is produced by extract_CDS_from_fasta.py and contains a fasta sequence with IUPAC
# coded SNPs.
##########################################################################################

#*******************
# Specify the inputs
#*******************
inFile = open(sys.argv[1], 'r')
argument = sys.argv[2]
#strand = open(sys.argv[3], "r")

#************************************************************
# Read the fasta file into a dictionary
#************************************************************
dna_orf = readfasta(inFile)

#************************************************************
# Specify four nucleotides
#************************************************************
nucs = list("ATCG")

#************************************************************
# Specify IUPAC codes as a dictionary
#************************************************************
# Each two-base ambiguity code maps to the pair of nucleotides it stands for.
IUPAC_code = {
    'R': ['A', 'G'],
    'Y': ['C', 'T'],
    'S': ['G', 'C'],
    'W': ['A', 'T'],
    'K': ['G', 'T'],
    'M': ['A', 'C'],
}
# Three-base codes are left disabled:
#, 'B':['C', 'G', 'T'], 'D':['A', 'G', 'T'], 'H':['A', 'C', 'T'], 'V':['A', 'C', 'G']}

# Four fold degenerate sites
# dictionary, it then extracts the CDS coordinates from the fasta file and finally   #
# concatenates each sequence to the other to create a CDS with SNPs marked           #
# as IUPAC code.                                                                     #
# Usage: ./extract_CDS_from_Fasta.py cds.txt fasta.fa > output                       #
######################################################################################

#*******************
# Specify the inputs
#*******************
cds_file = open(sys.argv[1], 'r')
fasta = open(sys.argv[2], 'r')

#************************************************************
# Read the fasta file into a dictionary
#************************************************************
fastaDict = readfasta(fasta)
# With the current readfasta() function, it's much faster to
# use the single line fasta sequence

#*******************************************
# Read the CDS coordinates into a dictionary
#*******************************************
# Key: first tab-separated column of a row.
# Value: list of [second, third] column pairs, one per row sharing the key.
CDS_dict = {}
for line in cds_file:
    fields = line.strip('\n').split('\t')
    CDS_dict.setdefault(fields[0], []).append(fields[1:3])
#print CDS_dict
# synonymous, missense and nonsense according to the following rule:
#*******************************************************************
# Inputs: sequence.fa annotation.gff
#*******************************************************************
# Run: annotate_vcf.py sequence.fa annotation.gff > output.txt
#*******************************************************************

#*******************************************************************
# Open inputs
#*******************************************************************
fastafile = open(sys.argv[1], "r")
gff_file = open(sys.argv[2], "r")

#*******************************************************************
# Read fasta file into dictionary
#*******************************************************************
g_fasta = readfasta(fastafile)

#*******************************************************************
# Read gff file into dictionary
#*******************************************************************
gff_dict = gff_to_dict(gff_file)

#*******************************************************************
# Build the degeneracy tables
#*******************************************************************
# count_sites() is evaluated a single time; both tables are read from that
# one result instead of recomputing it for every element.
# NOTE(review): assumes count_sites() returns the same tables on every call - confirm
_site_tables = count_sites()

#*******************************************************************
# Create the degeneracy count table
#*******************************************************************
degeneracy_table_counts = _site_tables[0]

#*******************************************************************
# Create the degeneracy base table
#*******************************************************************
degeneracy_table_bases = _site_tables[1]
#*******************************************************************
#*******************************************************************
#******************************************************************************* # Written by Homa Papoli - October 2017 #******************************************************************************* # Script contains functions to: # 1. Generate a fasta sequence without N # 2. Perform resampling from the fasta sequence to generate new sequence # 3. Calculate heterozygosity from the new sequence #******************************************************************************* f1 = open(sys.argv[1], "r") seq = sys.argv[2] # indicate which chromosome to resample replicates = int(sys.argv[3]) # number of resampling num = int(sys.argv[4]) # indicate the length from which to sample # Read the fasta file into a dictionary. fastadict = readfasta(f1) #def resampling_f(fastadict, seq, num): # fastadict[seq] = fastadict[seq].replace("N","").replace("n","") # l = [] # # If sampling the sequence as long as the original one # # new_seq = ''.join([random.choice(fastadict[seq]) for nuc in fastadict[seq]]) # # If sampling the sequence for a specific set of number # new_seq = ''.join([random.choice(fastadict[seq]) for i in range(num)]) # New sequences # new_seq_het = list(heterozygosity(new_seq))[2] # Het of the new sequence # l.append(new_seq_het) # return l def resampling_f(fastadict, seq, n, k): fastadict[seq] = fastadict[seq].replace("N", "").replace("n", "") seq_list = np.random.choice(tuple(fastadict[seq]),