def alignSequences(seq_list, max_iters=16, exepath=const_default_muscle_exepath):
    """Align a list of sequences by running the external MUSCLE executable.

    The sequences are written to a temporary FASTA file under the headers
    ``seq0``, ``seq1``, ...; MUSCLE is run on that file, and the aligned
    sequences are read back with ``biofile.readFASTADict``.

    Args:
        seq_list: Sequence of sequence strings to align.
        max_iters: Value passed to MUSCLE's ``-maxiters`` option (integer).
        exepath: Path to the MUSCLE executable; a leading ``~`` is expanded.

    Returns:
        A list of aligned sequences in the same order as ``seq_list``.
        An empty ``seq_list`` yields an empty list without running MUSCLE.

    Raises:
        MuscleError: If the executable cannot be found or started, or if
            MUSCLE exits with a non-zero return code.
    """
    # Local import: only this function needs it, and the file's import block
    # is not visible from here.
    import tempfile

    # Nothing to align; don't hand MUSCLE an empty input file.
    if len(seq_list) == 0:
        return []

    # Expand "~" once, so the path that is executed is the same path that is
    # reported in error messages.
    muscle_path = os.path.expanduser(exepath)

    # Both scratch files live in a private directory that is removed on every
    # exit path (success or exception), so failures no longer leak temp files.
    with tempfile.TemporaryDirectory(prefix="tmp-muscle-", dir=os.getcwd()) as workdir:
        in_fname = os.path.join(workdir, "muscle-in.txt")
        out_fname = os.path.join(workdir, "muscle-out.txt")

        # Write out the sequences
        with open(in_fname, 'w') as fasta_out:
            for si, seq in enumerate(seq_list):
                fasta_out.write('>seq{:d}\n{}\n'.format(si, seq))

        # Build the argument list directly rather than splitting a formatted
        # string, so paths containing whitespace stay a single argument.
        runcmd = [
            muscle_path,
            "-in", in_fname,
            "-out", out_fname,
            "-quiet",
            "-maxiters", "{:d}".format(max_iters),
        ]
        try:
            result = subprocess.run(runcmd)
        except FileNotFoundError as err:
            raise MuscleError(
                "Couldn't find muscle executable at {}".format(muscle_path)) from err
        except OSError as err:
            raise MuscleError(
                "Couldn't run muscle executable at {}: {}".format(muscle_path, err)) from err

        if result.returncode != 0:
            if not os.path.isfile(muscle_path):
                raise MuscleError(
                    "Couldn't find muscle executable at {}".format(muscle_path))
            raise MuscleError("Muscle error code {:d}".format(result.returncode))

        seq_dict = biofile.readFASTADict(out_fname)
        # Return the alignment in input order, whatever order MUSCLE wrote it in.
        return [seq_dict["seq{:d}".format(i)] for i in range(len(seq_list))]
def readGenomesFromFile(multi_files_fname, genome_dir, genome_dicts, column_index=1, load_fxn=biofile.firstField, species=None, outstream=None):
    """Load one FASTA genome per species, as listed in a mapping file.

    The mapping file is whitespace-delimited. Lines whose first character is
    ``#`` and blank lines are skipped. On every other line, field 0 is the
    species identifier and field ``column_index`` is the genome filename,
    which is resolved relative to ``genome_dir``.

    Args:
        multi_files_fname: Path to the species-to-filename mapping file.
        genome_dir: Directory containing the genome files (``~`` is expanded).
        genome_dicts: Dict that is filled in place; ``genome_dicts[spec]`` is
            set to the result of ``biofile.readFASTADict`` for each species.
        column_index: Index of the field holding the genome filename.
        load_fxn: Passed through as the second argument of
            ``biofile.readFASTADict``.
        species: Optional iterable of species to load; defaults to every
            species in the mapping file.
        outstream: Object with a ``write`` method for progress messages;
            defaults to a new ``util.OutStreams()``.

    Returns:
        Dict mapping each species in the mapping file to its genome filename.

    Raises:
        AssertionError: If ``species`` names a species absent from the
            mapping file.
    """
    if outstream is None:
        outstream = util.OutStreams()

    species_map = {}
    # open() works on Python 2 and 3 (the file() builtin is Python-2-only),
    # and the with-block closes the handle.
    with open(multi_files_fname, 'r') as mapping_file:
        for line in mapping_file:
            stripped = line.strip()
            # skip comments and blank lines
            if line.startswith('#') or stripped == '':
                continue
            flds = stripped.split()
            species_map[flds[0]] = flds[column_index]

    if species is None:
        species = list(species_map)
    else:
        assert set(species) <= set(species_map), "Not all specified species found in mapping file"

    for spec in species:
        genome_file = os.path.join(os.path.expanduser(genome_dir), species_map[spec])
        if not os.path.isfile(genome_file):
            outstream.write("# Cannot find file %s\n" % genome_file)
            # Skip the missing genome; reading a nonexistent file after
            # reporting it could only fail.
            continue
        genome = biofile.readFASTADict(genome_file, load_fxn)
        genome_dicts[spec] = genome
        # next(iter(...)) works for Python 3 dict views and for an empty
        # genome, where keys()[0] would raise.
        example_id = next(iter(genome.keys()), None)
        outstream.write("# species=%s, genome file=%s has %d entries, example ID=%s\n" % (spec, genome_file, len(genome.keys()), example_id))
    return species_map
parser.add_argument("-g", "--debug", dest="debugging", action="store_true", default=False, help="debug mode?") parser.add_argument("-m", "--merge", dest="merge", action="store_true", default=False, help="merge the indicated experiments?") parser.add_argument("-t", "--tag", dest="tags", action="append", default=[], help="tags to restrict the analysis to specific tagged experiments") parser.add_argument("-u", "--unique", dest="unique_matches", action="store_true", default=False, help="use unique peptides only?") parser.add_argument("--normalize-intensity", dest="normalize_intensity", action="store_true", help="normalize intensity when merging?") parser.add_argument("--normalize-ratio-by", dest="normalize_ratio_by_orf", default=None, help="ORF to use for normalization across runs") parser.add_argument("--ratio-sig", dest="ratio_significance_field", default="ratio_hl_normalized", help="field to use for ratio significance calculations") parser.add_argument("--abundance", dest="abundance_field", default="intensity", help="field to use for abundance calculations") options = parser.parse_args() # Set up some output info_outs = util.OutStreams(sys.stdout) orf_dict = None if not options.database_fname is None: orf_dict = biofile.readFASTADict(options.database_fname) evidences = [] if not options.in_fname is None: #print "# Loading..." # Read more experiments from master file inf = file(os.path.expanduser(options.in_fname), 'r') dlr = util.DelimitedLineReader(inf, header=True) while not dlr.atEnd(): flds = dlr.nextDict() if os.path.isfile(os.path.expanduser(flds['filename'])): ed = mq.EvidenceDescriptor() ed.filename = os.path.expanduser(flds['filename']) ed.invert = flds['invert'][0].lower() in ['1','y','t'] ed.tags = [x.strip() for x in flds['tags'].split(',')] ed.experiment = flds['experiment']
if __name__=='__main__': parser = argparse.ArgumentParser(description="Calculate basic features of coding sequences") parser.add_argument(dest="cds_in_fname", type=str, help="FASTA file containing coding sequences") parser.add_argument(dest="prot_in_fname", type=str, help="FASTA file containing protein sequences") parser.add_argument(dest="feature_fname", type=str, help="SGD file containing sequence features") parser.add_argument(dest="paralog_fname", type=str, help="Yeast Gene Order Browser formatted file of paralog identifications") parser.add_argument("--aa", dest="do_aa", default=False, action="store_true", help="compute amino-acid frequencies?") parser.add_argument("--gc", dest="do_gc", default=False, action="store_true", help="compute GC frequencies?") parser.add_argument("--mw", dest="do_mw", default=False, action="store_true", help="compute molecular weights?") parser.add_argument("--target-aas", dest="target_aas", type=str, default=translate.AAs(), help="amino acids (e.g. ACDEF) for frequency analysis") parser.add_argument("-p", "--pseudo", dest="pseudocount", type=float, default=0.0, help="pseudocount to add to all frequencies") parser.add_argument("-o", "--out", dest="out_fname", type=str, default=None, help="output filename") options = parser.parse_args() cdna_dict = biofile.readFASTADict(os.path.expanduser(options.cds_in_fname)) prot_dict = biofile.readFASTADict(os.path.expanduser(options.prot_in_fname)) # Read paralog data from Yeast Gene Order Browser file ygob_data = util.readTable(file(os.path.expanduser(options.paralog_fname),'r')) paralog_dict = {} for flds in ygob_data.dictrows: scer1 = flds['scer1'].strip() scer2 = flds['scer2'].strip() if not (na.isNA(scer1) or na.isNA(scer2)): paralog_dict[scer1] = scer2 paralog_dict[scer2] = scer1 # Read SGD data sgd_features = util.readTable(file(os.path.expanduser(options.feature_fname),'r'), header=False) '''
if not options.out_fname is None: outf = file(os.path.expanduser(options.out_fname), 'w') outs.addStream(outf) else: outs.addStream(sys.stdout) pp = protprop.ProteinProperties() if not options.sequence is None: if options.translate: seq = translate.translateRaw(options.sequence) else: seq = options.sequence seq_dict = {"input": seq} else: # Load from FASTA seq_dict = biofile.readFASTADict(options.in_fname) if options.translate: for k in seq_dict.keys(): seq_dict[k] = translate.translate(seq_dict[k]) outs.write("# {}\n".format(options)) outs.write("pos\taa\tcharge\n") n_seqs = len(seq_dict.keys()) for (seqid, seq) in seq_dict.items(): if n_seqs > 1: outs.write("# {}\n".format(seqid)) outs.write("# Total protein charge at pH {} = {}\n".format( options.pH, pp.getCharge(seq, options.pH))) window_width = (options.window - 1) / 2 # Run over start_pos = 0
orfs.append(x.strip().split(".")[0]) print "# Found %d alignments" % len(orfs) id_map = readFlybaseMapping(id_map_fname) ortho_dict = readOneToOneOrthologs(ortholog_fname, master_spec, tree_species) print "# Found %d 1:1 ortholog sets" % len(ortho_dict.keys()) n_to_align = min(len(orfs), options.num_to_align) alignment_dict = {} ortholog_dict = {} n_written = 0 n_failed = 0 n_duplicates = 0 for trans_id in orfs[0:n_to_align]: fname = os.path.join(in_dir,'%s.fasta' % trans_id) orf_alignment_dict = biofile.readFASTADict(fname) try: # Alignments are FBtr transcript IDs # Orthologs are FBgn gene IDs # ID map turns FBtr into FBgn gene_id = id_map[trans_id] #print trans_id, gene_id spec_orf_list = ortho_dict[gene_id] #print trans_id, spec_orf_list spec_orf_dict = dict(spec_orf_list) del spec_orf_dict[master_spec] spec_orf_dict[master_spec] = trans_id new_spec_orf_list = spec_orf_dict.items() #print trans_id, gene_id, new_spec_orf_list #print trans_id, orf_alignment_dict.keys() #print trans_id, n_written, orf_alignment_dict[master_spec][0:10]
if not options.out_fname is None: outf = file(os.path.expanduser(options.out_fname), "w") outs.addStream(outf) else: outs.addStream(sys.stdout) pp = protprop.ProteinProperties() if not options.sequence is None: if options.translate: seq = translate.translateRaw(options.sequence) else: seq = options.sequence seq_dict = {"input": seq} else: # Load from FASTA seq_dict = biofile.readFASTADict(options.in_fname) if options.translate: for k in seq_dict.keys(): seq_dict[k] = translate.translate(seq_dict[k]) outs.write("# {}\n".format(options)) outs.write("pos\taa\tcharge\n") n_seqs = len(seq_dict.keys()) for (seqid, seq) in seq_dict.items(): if n_seqs > 1: outs.write("# {}\n".format(seqid)) outs.write("# Total protein charge at pH {} = {}\n".format(options.pH, pp.getCharge(seq, options.pH))) window_width = (options.window - 1) / 2 # Run over start_pos = 0 focal_pos = 0
import math
import os
import string
import sys

import biofile

# Smoke test for the biofile FASTA readers: the list-based and dict-based
# readers must agree on how many entries the fixture file contains.
if __name__ == '__main__':
    fasta_path = 'test-biofile/test-biofile-001.fa'

    # readFASTA returns a (headers, sequences) pair; the fixture has 143 entries.
    headers, sequences = biofile.readFASTA(fasta_path)
    assert len(headers) == 143

    # The dict reader should produce one key per header.
    seq_by_id = biofile.readFASTADict(os.path.expanduser(fasta_path))
    assert len(seq_by_id.keys()) == len(headers)
dest="output_type", default="fasta", help="type of output [fasta=alignment, ratio=profiles]") parser.add_argument("--IL", dest="I_equals_L", action="store_true", help="I equivalent to L?") options = parser.parse_args() # Set up some output info_outs = util.OutStreams(sys.stdout) orf_dict = None if not options.database_fname is None: orf_dict = biofile.readFASTADict( os.path.expanduser(options.database_fname)) # Pull out target protein target_prot = orf_dict[options.target_orf] if target_prot[-1] == '*': target_prot = target_prot[0:-1] evidences = [] for fi in range(len(options.evidence_fnames)): fname = options.evidence_fnames[fi] if options.experiments is None: # If no experiments are specified, we assume invert refers to whole evidence files. ed = mq.EvidenceDescriptor() ed.filename = os.path.expanduser(fname) ed.tags = options.tags evidences.append(ed)
random.seed(options.random_seed) # Read input if not os.path.isfile(options.in_fname): raise IOError("# Error: file {} does not exist".format(options.in_fname)) with open(options.in_fname,'r') as inf: # Read a FASTA file? (headers, seqs) = biofile.readFASTA(inf) sug_dict = {} if not options.suggest_sequences is None: if not os.path.isfile(options.suggest_sequences): raise IOError("# Error: file {} does not exist".format(options.in_fname)) with open(options.suggest_sequences,'r') as inf: # Read a FASTA file? sug_dict = biofile.readFASTADict(inf) # Write output dout = util.DelimitedOutput() dout.addHeader('name','name of construct') dout.addHeader('sequence','sequence') dout.addHeader('notes','notes') dout.describeHeader(data_outs) dout.writeHeader(data_outs) n_written = 0 mutant_seqs = {} def parseHeader(x): name = biofile.firstField(x) property_entries = [tuple(y.split('=')) for y in x.split() if '=' in y]
"--ratio-sig", dest="ratio_significance_field", default="ratio_hl_normalized", help="field to use for ratio significance calculations") parser.add_argument("--abundance", dest="abundance_field", default="intensity", help="field to use for abundance calculations") options = parser.parse_args() # Set up some output info_outs = util.OutStreams(sys.stdout) orf_dict = None if not options.database_fname is None: orf_dict = biofile.readFASTADict(options.database_fname) evidences = [] if not options.in_fname is None: #print "# Loading..." # Read more experiments from master file inf = file(os.path.expanduser(options.in_fname), 'r') dlr = util.DelimitedLineReader(inf, header=True) while not dlr.atEnd(): flds = dlr.nextDict() if os.path.isfile(os.path.expanduser(flds['filename'])): ed = mq.EvidenceDescriptor() ed.filename = os.path.expanduser(flds['filename']) ed.invert = flds['invert'][0].lower() in ['1', 'y', 't'] ed.tags = [x.strip() for x in flds['tags'].split(',')] ed.experiment = flds['experiment']
parser.add_option("-s", "--scores-out", dest="score_fname", type="string", default="vanilla", help="format of ID in FASTA entry") parser.add_option("-p", "--pseudocount", dest="pseudocount", type="float", default=0.0, help="pseudocount to be added to all frequencies") (options, args) = parser.parse_args() in_fname = args[0] info_outs = util.OutStreams(sys.stdout) data_outs = util.OutStreams() # Start up output if not options.out_fname is None: outf = file(options.out_fname, 'w') data_outs.addStream(outf) else: data_outs.addStream(sys.stdout) formatFxn = biofile.getIDFunction(options.format) cdna_dict = biofile.readFASTADict(in_fname, formatFxn) calc = Calculator() calc.initializeFromSequences(cdna_dict.values(), options.pseudocount) syn_dict = calc.getCodonSYNScores() syn_opt_codons = [] for aa in translate.degenerateAAs(): codons = translate.getCodonsForAA(aa, rna=False) best_syn_codon = sorted([(syn_dict[c],c) for c in codons])[-1][1] syn_opt_codons.append(best_syn_codon) data_outs.write("# Read {0}\n#{1:d} sequences, {2:d} codons, {3:d} nucleotides\n".format(in_fname, len(cdna_dict.keys()), int(sum(calc.codon_freq.values())), int(sum(calc.nucleotide_freq.values())))) data_outs.write("# syn_scores = {0!s}\n".format(syn_dict)) data_outs.write("# SYN opt codons = {0!s}\n".format(sorted(syn_opt_codons))) data_outs.write("{0!s}".format(calc)) if not options.score_dict_fname is None: pickle.dump(syn_dict, file(options.score_dict_fname,'w'))
info_outs = util.OutStreams(sys.stdout) data_outs = util.OutStreams() # Check assert options.window_size > 0 if not options.upper_window_size is None: assert options.upper_window_size >= options.window_size else: options.upper_window_size = options.window_size assert options.window_size > 0 prot_dict = {} # Read input if not os.path.isfile(options.in_fname): raise IOError("# Error: file {} does not exist".format(options.in_fname)) prot_dict = biofile.readFASTADict(file(options.in_fname, 'r')) # Generate sequence windows and quantify them seq_weights = [(s,1.0/len(s)) for s in prot_dict.values()] window_sizes = range(options.window_size, options.upper_window_size+1, 1) for window_size in window_sizes: # Start up output if not options.out_fname is None: if len(window_sizes)>1: # Use formatted filename for each window size fname = "{}-{:d}mers.txt".format(options.out_fname, window_size) else: # Use filename as given, for single file fname = options.out_fname outf = file(fname,'w')
data_outs = util.OutStreams() # Check assert options.window_size > 0 if not options.upper_window_size is None: assert options.upper_window_size >= options.window_size else: options.upper_window_size = options.window_size assert options.window_size > 0 prot_dict = {} # Read input if not os.path.isfile(options.in_fname): raise IOError("# Error: file {} does not exist".format( options.in_fname)) prot_dict = biofile.readFASTADict(file(options.in_fname, 'r')) # Generate sequence windows and quantify them seq_weights = [(s, 1.0 / len(s)) for s in prot_dict.values()] window_sizes = range(options.window_size, options.upper_window_size + 1, 1) for window_size in window_sizes: # Start up output if not options.out_fname is None: if len(window_sizes) > 1: # Use formatted filename for each window size fname = "{}-{:d}mers.txt".format(options.out_fname, window_size) else: # Use filename as given, for single file fname = options.out_fname
import math
import os
import string
import sys

import biofile

# Consistency check between the two biofile FASTA readers on a fixture file.
if __name__ == '__main__':
    fixture = 'test-biofile/test-biofile-001.fa'

    # List-based reader: a pair of (headers, sequences); 143 entries expected.
    parsed = biofile.readFASTA(fixture)
    (header_list, seq_list) = parsed
    assert len(header_list) == 143

    # Dict-based reader: must yield as many keys as there are headers.
    fixture_path = os.path.expanduser(fixture)
    fasta_dict = biofile.readFASTADict(fixture_path)
    n_keys = len(fasta_dict.keys())
    assert n_keys == len(header_list)
parser.add_argument("-x", "--experiment", dest="experiments", action="append", default=None, help="experiments to assay") parser.add_argument("-g", "--debug", dest="debugging", action="store_true", default=False, help="debug mode?") parser.add_argument("-m", "--merge", dest="merge", action="store_true", default=False, help="merge the indicated experiments?") parser.add_argument("-t", "--tag", dest="tags", action="append", default=[], help="tags to restrict the analysis to specific tagged experiments") parser.add_argument("-u", "--unique", dest="unique_matches", action="store_true", default=False, help="use unique peptides only?") parser.add_argument("-y", "--type", dest="output_type", default="fasta", help="type of output [fasta=alignment, ratio=profiles]") parser.add_argument("--IL", dest="I_equals_L", action="store_true", help="I equivalent to L?") options = parser.parse_args() # Set up some output info_outs = util.OutStreams(sys.stdout) orf_dict = None if not options.database_fname is None: orf_dict = biofile.readFASTADict(os.path.expanduser(options.database_fname)) # Pull out target protein target_prot = orf_dict[options.target_orf] if target_prot[-1] == '*': target_prot = target_prot[0:-1] evidences = [] for fi in range(len(options.evidence_fnames)): fname = options.evidence_fnames[fi] if options.experiments is None: # If no experiments are specified, we assume invert refers to whole evidence files. ed = mq.EvidenceDescriptor() ed.filename = os.path.expanduser(fname) ed.tags = options.tags evidences.append(ed)
help="amino acids (e.g. ACDEF) for frequency analysis") parser.add_argument("-p", "--pseudo", dest="pseudocount", type=float, default=0.0, help="pseudocount to add to all frequencies") parser.add_argument("-o", "--out", dest="out_fname", type=str, default=None, help="output filename") options = parser.parse_args() cdna_dict = biofile.readFASTADict(os.path.expanduser(options.cds_in_fname)) prot_dict = biofile.readFASTADict(os.path.expanduser( options.prot_in_fname)) # Read paralog data from Yeast Gene Order Browser file ygob_data = util.readTable( file(os.path.expanduser(options.paralog_fname), 'r')) paralog_dict = {} for flds in ygob_data.dictrows: scer1 = flds['scer1'].strip() scer2 = flds['scer2'].strip() if not (na.isNA(scer1) or na.isNA(scer2)): paralog_dict[scer1] = scer2 paralog_dict[scer2] = scer1 # Read SGD data