コード例 #1
0
def alignSequences(seq_list, max_iters=16, exepath=const_default_muscle_exepath):
	"""Align sequences by running the external MUSCLE program.

	The sequences are written to a temporary FASTA file in the current working
	directory under the IDs seq0, seq1, ...; MUSCLE is run on that file, and the
	aligned sequences are read back with biofile.readFASTADict. Both temporary
	files are removed before returning, whether or not the alignment succeeded.

	Args:
		seq_list: sequences to align, in the order they should be returned.
		max_iters: integer passed to MUSCLE's -maxiters option.
		exepath: path to the MUSCLE executable; a leading ~ is expanded.

	Returns:
		List of aligned sequences, in the same order as seq_list.

	Raises:
		MuscleError: if MUSCLE exits with a nonzero return code.
	"""
	# Draw the two random suffixes in the same order as before (input, then output).
	tmp_fasta_file = "tmp-muscle-in-{}.txt".format(''.join(random.sample(string.ascii_letters, 20)))
	outfile_name = os.path.join(os.getcwd(), "tmp-muscle-out-{}.txt".format(''.join(random.sample(string.ascii_letters, 20))))
	# Expand ~ once so the path that is executed is the same path that is
	# checked and reported in the error branch below.
	exe = os.path.expanduser(exepath)
	try:
		# Write out the sequences
		with open(tmp_fasta_file, 'w') as tmpfile:
			for si, seq in enumerate(seq_list):
				tmpfile.write('>seq{:d}\n{}\n'.format(si, seq))

		# Build the argument list directly instead of splitting a command string,
		# so paths containing whitespace stay intact as single arguments.
		runcmd = [exe, "-in", tmp_fasta_file, "-out", outfile_name, "-quiet", "-maxiters", "{:d}".format(max_iters)]
		error = subprocess.run(runcmd)
		if error.returncode != 0:
			if not os.path.isfile(exe):
				raise MuscleError("Couldn't find muscle executable at {}".format(exe))
			raise MuscleError("Muscle error code {:d}".format(error.returncode))

		seq_dict = biofile.readFASTADict(outfile_name)
		# Restore the caller's ordering using the seq<index> IDs written above.
		return [seq_dict["seq{:d}".format(i)] for i in range(len(seq_list))]
	finally:
		# Clean up on every exit path; either file may not exist if we failed early.
		for tmp_name in (outfile_name, tmp_fasta_file):
			if os.path.isfile(tmp_name):
				os.remove(tmp_name)
コード例 #2
0
def readGenomesFromFile(multi_files_fname, genome_dir, genome_dicts, column_index=1, load_fxn=biofile.firstField, species=None, outstream=None):
	"""Load one FASTA genome per species, as listed in a mapping file.

	The mapping file has whitespace-delimited fields: the first field is the
	species key and the field at column_index is the genome filename. Lines
	starting with '#' and blank lines are ignored.

	Args:
		multi_files_fname: path to the species-to-filename mapping file.
		genome_dir: directory containing the genome files; ~ is expanded.
		genome_dicts: dictionary filled in place with species -> genome.
		column_index: index of the mapping-file field holding the filename.
		load_fxn: passed through to biofile.readFASTADict for each genome.
		species: species keys to load; defaults to every key in the mapping file.
		outstream: object with a write() method for progress messages;
			defaults to a new util.OutStreams().

	Returns:
		Dictionary mapping every species key in the mapping file to its filename.

	Raises:
		AssertionError: if a requested species is absent from the mapping file.
	"""
	if outstream is None:
		outstream = util.OutStreams()
	species_map = {}
	# open() works on both Python 2 and 3, and the with-block closes the handle.
	with open(multi_files_fname, 'r') as mapping_file:
		for line in mapping_file:
			stripped = line.strip()
			if line[0] != '#' and stripped != '':  # skip comments and blank lines
				flds = stripped.split()
				species_map[flds[0]] = flds[column_index]
	if species is None:
		species = list(species_map.keys())
	else:
		assert set(species).intersection(set(species_map.keys())) == set(species), "Not all specified species found in mapping file"

	for spec in species:
		genome_file = os.path.join(os.path.expanduser(genome_dir), species_map[spec])
		if not os.path.isfile(genome_file):
			# NOTE(review): the read below is still attempted for a missing file,
			# as in the original; confirm whether such species should be skipped.
			outstream.write("# Cannot find file %s\n" % genome_file)
		genome = biofile.readFASTADict(genome_file, load_fxn)
		genome_dicts[spec] = genome
		# keys() is not indexable on Python 3 and an empty genome has no first
		# key, so take the first key via an iterator with a None fallback.
		example_id = next(iter(genome.keys()), None)
		outstream.write("# species=%s, genome file=%s has %d entries, example ID=%s\n" % (spec, genome_file, len(genome.keys()), example_id))
	return species_map
コード例 #3
0
ファイル: extract-triple-evidence.py プロジェクト: dad/base
	parser.add_argument("-g", "--debug", dest="debugging", action="store_true", default=False, help="debug mode?")
	parser.add_argument("-m", "--merge", dest="merge", action="store_true", default=False, help="merge the indicated experiments?")
	parser.add_argument("-t", "--tag", dest="tags", action="append", default=[], help="tags to restrict the analysis to specific tagged experiments")
	parser.add_argument("-u", "--unique", dest="unique_matches", action="store_true", default=False, help="use unique peptides only?")
	parser.add_argument("--normalize-intensity", dest="normalize_intensity", action="store_true", help="normalize intensity when merging?")
	parser.add_argument("--normalize-ratio-by", dest="normalize_ratio_by_orf", default=None, help="ORF to use for normalization across runs")
	parser.add_argument("--ratio-sig", dest="ratio_significance_field", default="ratio_hl_normalized", help="field to use for ratio significance calculations")
	parser.add_argument("--abundance", dest="abundance_field", default="intensity", help="field to use for abundance calculations")
	options = parser.parse_args()

	# Set up some output
	info_outs = util.OutStreams(sys.stdout)

	orf_dict = None
	if not options.database_fname is None:
		orf_dict = biofile.readFASTADict(options.database_fname)

	evidences = []
	if not options.in_fname is None:
		#print "# Loading..."
		# Read more experiments from master file
		inf = file(os.path.expanduser(options.in_fname), 'r')
		dlr = util.DelimitedLineReader(inf, header=True)
		while not dlr.atEnd():
			flds = dlr.nextDict()
			if os.path.isfile(os.path.expanduser(flds['filename'])):
				ed = mq.EvidenceDescriptor()
				ed.filename = os.path.expanduser(flds['filename'])
				ed.invert = flds['invert'][0].lower() in ['1','y','t']
				ed.tags = [x.strip() for x in flds['tags'].split(',')]
				ed.experiment = flds['experiment']
コード例 #4
0
if __name__=='__main__':
	parser = argparse.ArgumentParser(description="Calculate basic features of coding sequences")
	parser.add_argument(dest="cds_in_fname", type=str, help="FASTA file containing coding sequences")
	parser.add_argument(dest="prot_in_fname", type=str, help="FASTA file containing protein sequences")
	parser.add_argument(dest="feature_fname", type=str, help="SGD file containing sequence features")
	parser.add_argument(dest="paralog_fname", type=str, help="Yeast Gene Order Browser formatted file of paralog identifications")
	parser.add_argument("--aa", dest="do_aa", default=False, action="store_true", help="compute amino-acid frequencies?")
	parser.add_argument("--gc", dest="do_gc", default=False, action="store_true", help="compute GC frequencies?")
	parser.add_argument("--mw", dest="do_mw", default=False, action="store_true", help="compute molecular weights?")
	parser.add_argument("--target-aas", dest="target_aas", type=str, default=translate.AAs(), help="amino acids (e.g. ACDEF) for frequency analysis")
	parser.add_argument("-p", "--pseudo", dest="pseudocount", type=float, default=0.0, help="pseudocount to add to all frequencies")
	parser.add_argument("-o", "--out", dest="out_fname", type=str, default=None, help="output filename")
	options = parser.parse_args()

	cdna_dict = biofile.readFASTADict(os.path.expanduser(options.cds_in_fname))
	prot_dict = biofile.readFASTADict(os.path.expanduser(options.prot_in_fname))

	# Read paralog data from Yeast Gene Order Browser file
	ygob_data = util.readTable(file(os.path.expanduser(options.paralog_fname),'r'))
	paralog_dict = {}
	for flds in ygob_data.dictrows:
		scer1 = flds['scer1'].strip()
		scer2 = flds['scer2'].strip()
		if not (na.isNA(scer1) or na.isNA(scer2)):
			paralog_dict[scer1] = scer2
			paralog_dict[scer2] = scer1

	# Read SGD data
	sgd_features = util.readTable(file(os.path.expanduser(options.feature_fname),'r'), header=False)
	'''
コード例 #5
0
    if not options.out_fname is None:
        outf = file(os.path.expanduser(options.out_fname), 'w')
        outs.addStream(outf)
    else:
        outs.addStream(sys.stdout)

    pp = protprop.ProteinProperties()
    if not options.sequence is None:
        if options.translate:
            seq = translate.translateRaw(options.sequence)
        else:
            seq = options.sequence
        seq_dict = {"input": seq}
    else:
        # Load from FASTA
        seq_dict = biofile.readFASTADict(options.in_fname)
        if options.translate:
            for k in seq_dict.keys():
                seq_dict[k] = translate.translate(seq_dict[k])

    outs.write("# {}\n".format(options))
    outs.write("pos\taa\tcharge\n")
    n_seqs = len(seq_dict.keys())
    for (seqid, seq) in seq_dict.items():
        if n_seqs > 1:
            outs.write("# {}\n".format(seqid))
            outs.write("# Total protein charge at pH {} = {}\n".format(
                options.pH, pp.getCharge(seq, options.pH)))
        window_width = (options.window - 1) / 2
        # Run over
        start_pos = 0
コード例 #6
0
			orfs.append(x.strip().split(".")[0])

	print "# Found %d alignments" % len(orfs)
	id_map = readFlybaseMapping(id_map_fname)
	ortho_dict = readOneToOneOrthologs(ortholog_fname, master_spec, tree_species)
	print "# Found %d 1:1 ortholog sets" % len(ortho_dict.keys())

	n_to_align = min(len(orfs), options.num_to_align)
	alignment_dict = {}
	ortholog_dict = {}
	n_written = 0
	n_failed = 0
	n_duplicates = 0
	for trans_id in orfs[0:n_to_align]:
		fname = os.path.join(in_dir,'%s.fasta' % trans_id)
		orf_alignment_dict = biofile.readFASTADict(fname)
		try:
			# Alignments are FBtr transcript IDs
			# Orthologs are FBgn gene IDs
			# ID map turns FBtr into FBgn
			gene_id = id_map[trans_id]
			#print trans_id, gene_id
			spec_orf_list = ortho_dict[gene_id]
			#print trans_id, spec_orf_list
			spec_orf_dict = dict(spec_orf_list)
			del spec_orf_dict[master_spec]
			spec_orf_dict[master_spec] = trans_id
			new_spec_orf_list = spec_orf_dict.items()
			#print trans_id, gene_id, new_spec_orf_list
			#print trans_id, orf_alignment_dict.keys()
			#print trans_id, n_written, orf_alignment_dict[master_spec][0:10]
コード例 #7
0
ファイル: chargeprof.py プロジェクト: dad/base
    if not options.out_fname is None:
        outf = file(os.path.expanduser(options.out_fname), "w")
        outs.addStream(outf)
    else:
        outs.addStream(sys.stdout)

    pp = protprop.ProteinProperties()
    if not options.sequence is None:
        if options.translate:
            seq = translate.translateRaw(options.sequence)
        else:
            seq = options.sequence
        seq_dict = {"input": seq}
    else:
        # Load from FASTA
        seq_dict = biofile.readFASTADict(options.in_fname)
        if options.translate:
            for k in seq_dict.keys():
                seq_dict[k] = translate.translate(seq_dict[k])

    outs.write("# {}\n".format(options))
    outs.write("pos\taa\tcharge\n")
    n_seqs = len(seq_dict.keys())
    for (seqid, seq) in seq_dict.items():
        if n_seqs > 1:
            outs.write("# {}\n".format(seqid))
            outs.write("# Total protein charge at pH {} = {}\n".format(options.pH, pp.getCharge(seq, options.pH)))
        window_width = (options.window - 1) / 2
        # Run over
        start_pos = 0
        focal_pos = 0
コード例 #8
0
ファイル: test-biofile.py プロジェクト: dad/base
import sys, os, math, string
import biofile

if __name__=='__main__':
	# Parse the same FASTA fixture through both readers and check that they
	# agree on the number of entries.
	fixture_path = 'test-biofile/test-biofile-001.fa'
	(headers, sequences) = biofile.readFASTA(fixture_path)
	assert len(headers) == 143
	seqs_by_id = biofile.readFASTADict(os.path.expanduser(fixture_path))
	n_dict_entries = len(seqs_by_id.keys())
	assert n_dict_entries == len(headers)
コード例 #9
0
        dest="output_type",
        default="fasta",
        help="type of output [fasta=alignment, ratio=profiles]")
    parser.add_argument("--IL",
                        dest="I_equals_L",
                        action="store_true",
                        help="I equivalent to L?")

    options = parser.parse_args()

    # Set up some output
    info_outs = util.OutStreams(sys.stdout)

    orf_dict = None
    if not options.database_fname is None:
        orf_dict = biofile.readFASTADict(
            os.path.expanduser(options.database_fname))

    # Pull out target protein
    target_prot = orf_dict[options.target_orf]
    if target_prot[-1] == '*':
        target_prot = target_prot[0:-1]

    evidences = []
    for fi in range(len(options.evidence_fnames)):
        fname = options.evidence_fnames[fi]
        if options.experiments is None:
            # If no experiments are specified, we assume invert refers to whole evidence files.
            ed = mq.EvidenceDescriptor()
            ed.filename = os.path.expanduser(fname)
            ed.tags = options.tags
            evidences.append(ed)
コード例 #10
0
ファイル: proteins-to-genes.py プロジェクト: dad/lcscore
	random.seed(options.random_seed)

	# Read input
	if not os.path.isfile(options.in_fname):
	 	raise IOError("# Error: file {} does not exist".format(options.in_fname))
	with open(options.in_fname,'r') as inf:
	 	# Read a FASTA file?
	 	(headers, seqs) = biofile.readFASTA(inf)
	
	# Optionally load suggested sequences from a FASTA file.
	sug_dict = {}
	if options.suggest_sequences is not None:
		if not os.path.isfile(options.suggest_sequences):
			# Report the file that is actually missing, not the main input file.
			raise IOError("# Error: file {} does not exist".format(options.suggest_sequences))
		with open(options.suggest_sequences,'r') as inf:
			sug_dict = biofile.readFASTADict(inf)
	
	# Write output
	dout = util.DelimitedOutput()
	dout.addHeader('name','name of construct')
	dout.addHeader('sequence','sequence')
	dout.addHeader('notes','notes')
	dout.describeHeader(data_outs)

	dout.writeHeader(data_outs)
	n_written = 0
	mutant_seqs = {}

	def parseHeader(x):
		name = biofile.firstField(x)
		property_entries = [tuple(y.split('=')) for y in x.split() if '=' in y]
コード例 #11
0
        "--ratio-sig",
        dest="ratio_significance_field",
        default="ratio_hl_normalized",
        help="field to use for ratio significance calculations")
    parser.add_argument("--abundance",
                        dest="abundance_field",
                        default="intensity",
                        help="field to use for abundance calculations")
    options = parser.parse_args()

    # Set up some output
    info_outs = util.OutStreams(sys.stdout)

    orf_dict = None
    if not options.database_fname is None:
        orf_dict = biofile.readFASTADict(options.database_fname)

    evidences = []
    if not options.in_fname is None:
        #print "# Loading..."
        # Read more experiments from master file
        inf = file(os.path.expanduser(options.in_fname), 'r')
        dlr = util.DelimitedLineReader(inf, header=True)
        while not dlr.atEnd():
            flds = dlr.nextDict()
            if os.path.isfile(os.path.expanduser(flds['filename'])):
                ed = mq.EvidenceDescriptor()
                ed.filename = os.path.expanduser(flds['filename'])
                ed.invert = flds['invert'][0].lower() in ['1', 'y', 't']
                ed.tags = [x.strip() for x in flds['tags'].split(',')]
                ed.experiment = flds['experiment']
コード例 #12
0
	parser.add_option("-s", "--scores-out", dest="score_fname", type="string", default="vanilla", help="format of ID in FASTA entry")
	parser.add_option("-p", "--pseudocount", dest="pseudocount", type="float", default=0.0, help="pseudocount to be added to all frequencies")
	(options, args) = parser.parse_args()
	in_fname = args[0]

	info_outs = util.OutStreams(sys.stdout)
	data_outs = util.OutStreams()

	# Start up output
	if not options.out_fname is None:
		outf = file(options.out_fname, 'w')
		data_outs.addStream(outf)
	else:
		data_outs.addStream(sys.stdout)
	formatFxn = biofile.getIDFunction(options.format)
	cdna_dict = biofile.readFASTADict(in_fname, formatFxn)
	calc = Calculator()
	calc.initializeFromSequences(cdna_dict.values(), options.pseudocount)
	syn_dict = calc.getCodonSYNScores()
	syn_opt_codons = []
	for aa in translate.degenerateAAs():
		codons = translate.getCodonsForAA(aa, rna=False)
		best_syn_codon = sorted([(syn_dict[c],c) for c in codons])[-1][1]
		syn_opt_codons.append(best_syn_codon)
	data_outs.write("# Read {0}\n#{1:d} sequences, {2:d} codons, {3:d} nucleotides\n".format(in_fname, len(cdna_dict.keys()), int(sum(calc.codon_freq.values())), int(sum(calc.nucleotide_freq.values()))))
	data_outs.write("# syn_scores = {0!s}\n".format(syn_dict))
	data_outs.write("# SYN opt codons = {0!s}\n".format(sorted(syn_opt_codons)))
	data_outs.write("{0!s}".format(calc))

	# Optionally save the score dictionary as a pickle.
	if options.score_dict_fname is not None:
		# Pickle data should go to a binary-mode file, and the with-block
		# guarantees the handle is flushed and closed.
		with open(options.score_dict_fname, 'wb') as score_dict_outf:
			pickle.dump(syn_dict, score_dict_outf)
コード例 #13
0
	info_outs = util.OutStreams(sys.stdout)
	data_outs = util.OutStreams()

	# Check
	assert options.window_size > 0
	if not options.upper_window_size is None:
		assert options.upper_window_size >= options.window_size
	else:
		options.upper_window_size = options.window_size
	assert options.window_size > 0

	prot_dict = {}
	# Read input
	if not os.path.isfile(options.in_fname):
		raise IOError("# Error: file {} does not exist".format(options.in_fname))
	prot_dict = biofile.readFASTADict(file(options.in_fname, 'r'))
	
	# Generate sequence windows and quantify them
	seq_weights = [(s,1.0/len(s)) for s in prot_dict.values()]
	
	window_sizes = range(options.window_size, options.upper_window_size+1, 1)
	for window_size in window_sizes:
		# Start up output
		if not options.out_fname is None:
			if len(window_sizes)>1:
				# Use formatted filename for each window size
				fname = "{}-{:d}mers.txt".format(options.out_fname, window_size)
			else:
				# Use filename as given, for single file
				fname = options.out_fname
			outf = file(fname,'w')
コード例 #14
0
    data_outs = util.OutStreams()

    # Check
    assert options.window_size > 0
    if not options.upper_window_size is None:
        assert options.upper_window_size >= options.window_size
    else:
        options.upper_window_size = options.window_size
    assert options.window_size > 0

    prot_dict = {}
    # Read input
    if not os.path.isfile(options.in_fname):
        raise IOError("# Error: file {} does not exist".format(
            options.in_fname))
    prot_dict = biofile.readFASTADict(file(options.in_fname, 'r'))

    # Generate sequence windows and quantify them
    seq_weights = [(s, 1.0 / len(s)) for s in prot_dict.values()]

    window_sizes = range(options.window_size, options.upper_window_size + 1, 1)
    for window_size in window_sizes:
        # Start up output
        if not options.out_fname is None:
            if len(window_sizes) > 1:
                # Use formatted filename for each window size
                fname = "{}-{:d}mers.txt".format(options.out_fname,
                                                 window_size)
            else:
                # Use filename as given, for single file
                fname = options.out_fname
コード例 #15
0
import sys, os, math, string
import biofile

if __name__ == '__main__':
    # Read one FASTA fixture as parallel lists and as a dictionary; the two
    # views must contain the same number of entries.
    fasta_fixture = 'test-biofile/test-biofile-001.fa'
    header_list, seq_list = biofile.readFASTA(fasta_fixture)
    assert len(header_list) == 143
    expanded_fixture = os.path.expanduser(fasta_fixture)
    fasta_dict = biofile.readFASTADict(expanded_fixture)
    assert len(fasta_dict.keys()) == len(header_list)
コード例 #16
0
ファイル: mspep.py プロジェクト: dad/base
	parser.add_argument("-x", "--experiment", dest="experiments", action="append", default=None, help="experiments to assay")
	parser.add_argument("-g", "--debug", dest="debugging", action="store_true", default=False, help="debug mode?")
	parser.add_argument("-m", "--merge", dest="merge", action="store_true", default=False, help="merge the indicated experiments?")
	parser.add_argument("-t", "--tag", dest="tags", action="append", default=[], help="tags to restrict the analysis to specific tagged experiments")
	parser.add_argument("-u", "--unique", dest="unique_matches", action="store_true", default=False, help="use unique peptides only?")
	parser.add_argument("-y", "--type", dest="output_type", default="fasta", help="type of output [fasta=alignment, ratio=profiles]")
	parser.add_argument("--IL", dest="I_equals_L", action="store_true", help="I equivalent to L?")

	options = parser.parse_args()

	# Set up some output
	info_outs = util.OutStreams(sys.stdout)

	orf_dict = None
	if not options.database_fname is None:
		orf_dict = biofile.readFASTADict(os.path.expanduser(options.database_fname))

	# Pull out target protein
	target_prot = orf_dict[options.target_orf]
	if target_prot[-1] == '*':
		target_prot = target_prot[0:-1]

	evidences = []
	for fi in range(len(options.evidence_fnames)):
		fname = options.evidence_fnames[fi]
		if options.experiments is None:
			# If no experiments are specified, we assume invert refers to whole evidence files.
			ed = mq.EvidenceDescriptor()
			ed.filename = os.path.expanduser(fname)
			ed.tags = options.tags
			evidences.append(ed)
コード例 #17
0
                        help="amino acids (e.g. ACDEF) for frequency analysis")
    parser.add_argument("-p",
                        "--pseudo",
                        dest="pseudocount",
                        type=float,
                        default=0.0,
                        help="pseudocount to add to all frequencies")
    parser.add_argument("-o",
                        "--out",
                        dest="out_fname",
                        type=str,
                        default=None,
                        help="output filename")
    options = parser.parse_args()

    cdna_dict = biofile.readFASTADict(os.path.expanduser(options.cds_in_fname))
    prot_dict = biofile.readFASTADict(os.path.expanduser(
        options.prot_in_fname))

    # Read paralog data from Yeast Gene Order Browser file
    ygob_data = util.readTable(
        file(os.path.expanduser(options.paralog_fname), 'r'))
    paralog_dict = {}
    for flds in ygob_data.dictrows:
        scer1 = flds['scer1'].strip()
        scer2 = flds['scer2'].strip()
        if not (na.isNA(scer1) or na.isNA(scer2)):
            paralog_dict[scer1] = scer2
            paralog_dict[scer2] = scer1

    # Read SGD data