Esempio n. 1
0
	def test_reading_from_guide_table(self):
		"""Read table

		Smoke test: parses the classification table for one species with
		util.readTable, builds a tree from it with
		phyloutil.treeFromClassificationTable, and prints that tree to stdout.
		There is no assertion; the test fails only if one of those calls raises.
		"""
		# The context manager guarantees the file is closed even when parsing
		# or tree construction raises.
		with open("./test-phyloutil/test1/Pseudozyma-antarctica-1.txt", 'r') as inf:
			table = util.readTable(inf, header=True)
			# Built while the file is still open, matching the original ordering.
			tree = phyloutil.treeFromClassificationTable(table)
		phyloutil.printTree(tree, sys.stdout)
Esempio n. 2
0
	def test_reading_from_class_table(self):
		"""Read table

		Parses the classification table for one species, builds a tree from it
		with phyloutil.treeFromClassificationTable, and checks the name of the
		first terminal node returned by tree.get_terminals().
		"""
		# The context manager guarantees the file is closed even when parsing
		# or tree construction raises.
		with open("./test-phyloutil/test1/Pseudozyma-antarctica-1.txt", 'r') as inf:
			table = util.readTable(inf, header=True)
			# Built while the file is still open, matching the original ordering.
			tree = phyloutil.treeFromClassificationTable(table)
		termlist = list(tree.get_terminals())
		# assertEqual reports both values on failure, unlike assertTrue(a == b).
		self.assertEqual(termlist[0].name, 'Moesziomyces antarcticus T-34')
Esempio n. 3
0
 def test_run(self):
     """readTable header

     Writes a three-column, tab-separated scratch file, reads it back with
     util.readTable(header=True), and checks that the second entry of the
     resulting table's ``header`` is "two".
     """
     fname = "tmp_lightdataframe.txt"
     try:
         # Context managers close both handles even if a write or parse fails.
         with open(fname, "w") as outf:
             outf.write("one\ttwo\tthree\n")
             outf.write("a\tb\t3\n")
             outf.write("a\tb\t33\n")
         with open(fname, "r") as inf:
             ldf = util.readTable(inf, header=True)
             h = ldf.header
             # assertEqual reports both values on failure.
             self.assertEqual(h[1], "two")
     finally:
         # Always clean up the scratch file, including when the assertion
         # fails; guard against the file never having been created.
         if os.path.exists(fname):
             os.remove(fname)
Esempio n. 4
0
 def test_run(self):
     """readTable header

     Writes a three-column, tab-separated scratch file, reads it back with
     util.readTable(header=True), and checks that the second entry of the
     resulting table's ``header`` is 'two'.
     """
     fname = "tmp_lightdataframe.txt"
     try:
         # Context managers close both handles even if a write or parse fails.
         with open(fname, 'w') as outf:
             outf.write("one\ttwo\tthree\n")
             outf.write("a\tb\t3\n")
             outf.write("a\tb\t33\n")
         with open(fname, 'r') as inf:
             ldf = util.readTable(inf, header=True)
             h = ldf.header
             # assertEqual reports both values on failure.
             self.assertEqual(h[1], 'two')
     finally:
         # Always clean up the scratch file, including when the assertion
         # fails; guard against the file never having been created.
         if os.path.exists(fname):
             os.remove(fname)
Esempio n. 5
0
 def test_run(self):
     """readTable basic

     Writes a three-column, tab-separated scratch file with two data rows,
     reads it back with util.readTable(header=True), and checks that the
     'three' column holds the values 3 and 33, compared against ints.
     """
     fname = "tmp_lightdataframe.txt"
     try:
         # Context managers close both handles even if a write or parse fails.
         with open(fname, 'w') as outf:
             outf.write("one\ttwo\tthree\n")
             outf.write("a\tb\t3\n")
             outf.write("a\tb\t33\n")
         with open(fname, 'r') as inf:
             ldf = util.readTable(inf, header=True)
             # assertEqual reports both values on failure.
             self.assertEqual(ldf['three'][0], 3)
             self.assertEqual(ldf['three'][1], 33)
     finally:
         # Always clean up the scratch file, including when an assertion
         # fails; guard against the file never having been created.
         if os.path.exists(fname):
             os.remove(fname)
Esempio n. 6
0
 def test_run(self):
     """readTable basic

     Writes a three-column, tab-separated scratch file with two data rows,
     reads it back with util.readTable(header=True), and checks that the
     "three" column holds the values 3 and 33, compared against ints.
     """
     fname = "tmp_lightdataframe.txt"
     try:
         # Context managers close both handles even if a write or parse fails.
         with open(fname, "w") as outf:
             outf.write("one\ttwo\tthree\n")
             outf.write("a\tb\t3\n")
             outf.write("a\tb\t33\n")
         with open(fname, "r") as inf:
             ldf = util.readTable(inf, header=True)
             # assertEqual reports both values on failure.
             self.assertEqual(ldf["three"][0], 3)
             self.assertEqual(ldf["three"][1], 33)
     finally:
         # Always clean up the scratch file, including when an assertion
         # fails; guard against the file never having been created.
         if os.path.exists(fname):
             os.remove(fname)
Esempio n. 7
0
 def test_run(self):
     """readTable dictrows

     Writes a three-column, tab-separated scratch file with two data rows,
     reads it back with util.readTable(header=True), and checks the 'three'
     field of every row yielded by the table's ``dictrows``.
     """
     fname = "tmp_lightdataframe.txt"
     try:
         # Context managers close both handles even if a write or parse fails.
         with open(fname, 'w') as outf:
             outf.write("one\ttwo\tthree\n")
             outf.write("a\tb\t3\n")
             outf.write("a\tb\t33\n")
         with open(fname, 'r') as inf:
             ldf = util.readTable(inf, header=True)
             # Collect the values while the file is still open, as before.
             observed = [flds['three'] for flds in ldf.dictrows]
         # Comparing the whole list also fails when dictrows yields too few
         # rows; the per-index checks used previously passed vacuously then.
         self.assertEqual(observed, [3, 33])
     finally:
         # Always clean up the scratch file, including when the assertion
         # fails; guard against the file never having been created.
         if os.path.exists(fname):
             os.remove(fname)
Esempio n. 8
0
 def test_run(self):
     """readTable dictrows

     Writes a three-column, tab-separated scratch file with two data rows,
     reads it back with util.readTable(header=True), and checks the "three"
     field of every row yielded by the table's ``dictrows``.
     """
     fname = "tmp_lightdataframe.txt"
     try:
         # Context managers close both handles even if a write or parse fails.
         with open(fname, "w") as outf:
             outf.write("one\ttwo\tthree\n")
             outf.write("a\tb\t3\n")
             outf.write("a\tb\t33\n")
         with open(fname, "r") as inf:
             ldf = util.readTable(inf, header=True)
             # Collect the values while the file is still open, as before.
             observed = [flds["three"] for flds in ldf.dictrows]
         # Comparing the whole list also fails when dictrows yields too few
         # rows; the per-index checks used previously passed vacuously then.
         self.assertEqual(observed, [3, 33])
     finally:
         # Always clean up the scratch file, including when the assertion
         # fails; guard against the file never having been created.
         if os.path.exists(fname):
             os.remove(fname)
Esempio n. 9
0
File: protprop.py Progetto: dad/base
	def __init__(self):
		"""Populate the per-residue lookup tables used by this object.

		Sets four attributes:
		  pKa                   -- pKa value keyed by residue letter, plus 'N-term' and 'C-term'
		  charges               -- integer charge (+1 or -1) for the same keys as pKa
		  hydrophobicity_scales -- one dict per scale, loaded from
		                           ../data/hydrophobicity-scales.txt relative to this module
		  mw                    -- molecular weight in Da keyed by amino-acid letter

		Raises whatever open() or util.readTable raise if the scales file is
		missing or cannot be parsed.
		"""
		self.pKa     = {'D':3.9, 'E':4.3, 'H':6.1, 'C':8.3, 'Y':10.1, 'K':10.67, 'R':12, 'N-term':8, 'C-term':3.1}
		self.charges = {'D':-1,  'E':-1,  'H':1,  'C':-1,  'Y':-1,   'K':1,    'R':1,  'N-term':1, 'C-term':-1}
		self.hydrophobicity_scales = {}
		# Hack so that we can store scale information in a file -- need better way to store.
		dir_path = os.path.dirname(os.path.realpath(__file__))
		scales_fname = os.path.join(dir_path, "..", "data", "hydrophobicity-scales.txt")
		# The context manager closes the file even if parsing or a column lookup raises.
		with open(scales_fname, 'r') as inf:
			tab = util.readTable(inf)
			# Every column after the first is treated as a hydrophobicity scale.
			for scale in tab.header[1:]:
				# Scale names are stored with '.' replaced by '-'; each scale maps
				# the value in the 'aa' column to the value in that scale's column.
				self.hydrophobicity_scales[scale.replace('.','-')] = dict(zip(tab.col('aa'), tab.col(scale)))
		# Molecular weights of the amino acids in Da, not residues; subtract 18 for residue weight
		self.mw = {'A': 89.09, 'C': 121.16, 'E': 147.13, 'D': 133.10, 'G': 75.07, 'F': 165.19,
			'I': 131.18, 'H': 155.16, 'K': 146.19, 'M': 149.21, 'L': 131.18, 'N': 132.12, 'Q': 146.15,
			'P': 115.13, 'S': 105.09, 'R': 174.20, 'T': 119.12, 'W': 204.23, 'V': 117.15, 'Y': 181.19,
			'B': 132.61, 'Z': 146.64}
Esempio n. 10
0
File: protprop.py Progetto: dad/base
	def read(self, stream, header=True):
		"""Load composition entries from a tabular stream.

		Parses ``stream`` with util.readTable and, for each row of the result,
		stores the row's 'proportion' value in ``self._comp_dict`` under the
		row's 'aa' value. An existing entry with the same key is overwritten.

		stream -- object handed to util.readTable (presumably an open text file; confirm)
		header -- forwarded unchanged as util.readTable's ``header`` keyword
		"""
		table = util.readTable(stream, header=header)
		for row in table.dictrows:
			# Look up the value before the key, matching the evaluation order
			# of a plain subscript assignment.
			proportion = row['proportion']
			self._comp_dict[row['aa']] = proportion
Esempio n. 11
0
		data_outs.addStream(outf)
	else:
		# By default, write to stdout
		data_outs.addStream(sys.stdout)

	# Write out parameters
	data_outs.write("# Run started {}\n".format(util.timestamp()))
	data_outs.write("# Command: {}\n".format(' '.join(sys.argv)))
	data_outs.write("# Parameters:\n")
	optdict = vars(options)
	for (k,v) in optdict.items():
		data_outs.write("#\t{k}: {v}\n".format(k=k, v=v))

	# Read background information
	aa_volumes = {}
	vols = util.readTable(open(os.path.expanduser('~/research/lib/data/harpaz-aa-volumes.txt')))
	#print vols['volume.a3']
	aa_volumes = dict(zip(vols['aa'], [x/1000.0 for x in vols['mean.volume.a3']]))
	#print aa_volumes

	# Read input
	if not os.path.isfile(options.in_fname):
		raise IOError("# Error: file {} does not exist".format(options.in_fname))
	(headers, seqs) = biofile.readFASTA(file(options.in_fname, 'r')) #, key_fxn=biofile.secondField)
	if options.translate_sequences:
		seqs = [translate.translate(s) for s in seqs]
	zhs = [(h,s) for (h,s) in zip(headers,seqs) if not s is None]
	all_keys = [biofile.firstField(h) for (h,s) in zhs]
	(headers, seqs) = zip(*zhs)
	prot_dict = dict([(biofile.firstField(h), s) for (h,s) in zhs])
	gene_orf_dict = dict([(biofile.secondOrFirstField(h), biofile.firstField(h)) for h in headers])
Esempio n. 12
0
    # Read input
    if not os.path.isfile(options.in_fname):
        raise IOError("# Error: file {} does not exist".format(
            options.in_fname))
    with open(options.in_fname, 'r') as inf:
        # Read a FASTA file?
        (headers, seqs) = biofile.readFASTA(inf)
        info_outs.write("# Read {:d} sequences\n".format(len(headers)))

    if not options.in_names_fname is None:
        if not os.path.isfile(options.in_names_fname):
            raise IOError("# Error: file {} does not exist".format(
                options.in_names_fname))
        with open(options.in_names_fname, 'r') as inf:
            species = util.readTable(inf, header=True)

        species_name_lookup = dict(
            zip(species['ODB_code'],
                [shorten(x) for x in species['Organism']]))
        new_headers = []
        new_seqs = []
        for (hdr, seq) in zip(headers, seqs):
            odb_code = hdr.strip().split()[-1]
            if len(odb_code) == 5:
                try:
                    # Rename
                    species_name = species_name_lookup[odb_code]
                    new_headers.append('{} {}'.format(species_name, hdr))
                except KeyError:
                    new_headers.append(hdr)
Esempio n. 13
0
    if not os.path.isfile(tree_fname):
        raise IOError("# Error: file {} does not exist".format(tree_fname))
    tree_string = ""
    with open(tree_fname, 'r') as inf:
        lines = inf.readlines()
        for line in lines:
            if not line.strip()[0] == '#':
                tree_string += line.strip()
    trees = NewickIO.parse(StringIO(tree_string))
    tree = next(trees)
    # Read mapping file
    map_fname = os.path.expanduser(options.mapping_in_fname)
    if not os.path.isfile(map_fname):
        raise IOError("# Error: file {} does not exist".format(map_fname))
    with open(map_fname, 'r') as inf:
        map_table = util.readTable(inf, header=True)

    # Create mapping
    mapping_dict = dict(zip(map_table['species'],
                            map_table['updated.species']))

    # Update the FASTA headers
    #new_headers = []
    #new_seqs = []
    seq_dict = {}
    header_dict = {}
    short_species_names = {}
    for (i, h) in enumerate(headers):
        species_name = extractSpeciesName(h)
        short_name = makeShortSpeciesName(species_name)
        try:
Esempio n. 14
0
 def read(self, stream, header=True):
     """Load composition entries from a tabular stream.

     Parses ``stream`` with util.readTable and, for each row of the result,
     stores the row's 'proportion' value in ``self._comp_dict`` under the
     row's 'aa' value. An existing entry with the same key is overwritten.

     stream -- object handed to util.readTable (presumably an open text file; confirm)
     header -- forwarded unchanged as util.readTable's ``header`` keyword
     """
     table = util.readTable(stream, header=header)
     for row in table.dictrows:
         # Look up the value before the key, matching the evaluation order
         # of a plain subscript assignment.
         proportion = row['proportion']
         self._comp_dict[row['aa']] = proportion
Esempio n. 15
0
 def __init__(self):
     """Populate the per-residue lookup tables used by this object.

     Sets four attributes:
       pKa                   -- pKa value keyed by residue letter, plus 'N-term' and 'C-term'
       charges               -- integer charge (+1 or -1) for the same keys as pKa
       hydrophobicity_scales -- one dict per scale, loaded from
                                ../data/hydrophobicity-scales.txt relative to this module
       mw                    -- molecular weight in Da keyed by amino-acid letter

     Raises whatever open() or util.readTable raise if the scales file is
     missing or cannot be parsed.
     """
     self.pKa = {
         'D': 3.9,
         'E': 4.3,
         'H': 6.1,
         'C': 8.3,
         'Y': 10.1,
         'K': 10.67,
         'R': 12,
         'N-term': 8,
         'C-term': 3.1
     }
     self.charges = {
         'D': -1,
         'E': -1,
         'H': 1,
         'C': -1,
         'Y': -1,
         'K': 1,
         'R': 1,
         'N-term': 1,
         'C-term': -1
     }
     self.hydrophobicity_scales = {}
     # Hack so that we can store scale information in a file -- need better way to store.
     dir_path = os.path.dirname(os.path.realpath(__file__))
     scales_fname = os.path.join(dir_path, "..", "data",
                                 "hydrophobicity-scales.txt")
     # The context manager closes the file even if parsing or a column lookup raises.
     with open(scales_fname, 'r') as inf:
         tab = util.readTable(inf)
         # Every column after the first is treated as a hydrophobicity scale.
         for scale in tab.header[1:]:
             # Scale names are stored with '.' replaced by '-'; each scale maps
             # the value in the 'aa' column to the value in that scale's column.
             self.hydrophobicity_scales[scale.replace('.', '-')] = dict(
                 zip(tab.col('aa'), tab.col(scale)))
     # Molecular weights of the amino acids in Da, not residues; subtract 18 for residue weight
     self.mw = {
         'A': 89.09,
         'C': 121.16,
         'E': 147.13,
         'D': 133.10,
         'G': 75.07,
         'F': 165.19,
         'I': 131.18,
         'H': 155.16,
         'K': 146.19,
         'M': 149.21,
         'L': 131.18,
         'N': 132.12,
         'Q': 146.15,
         'P': 115.13,
         'S': 105.09,
         'R': 174.20,
         'T': 119.12,
         'W': 204.23,
         'V': 117.15,
         'Y': 181.19,
         'B': 132.61,
         'Z': 146.64
     }
Esempio n. 16
0
	for (k,v) in optdict.items():
		data_outs.write("#\t{k}: {v}\n".format(k=k, v=v))

	# Read input
	if not os.path.isfile(options.in_fname):
		raise IOError("# Error: file {} does not exist".format(options.in_fname))
	with open(options.in_fname,'r') as inf:
		# Read a FASTA file?
		(headers, seqs) = biofile.readFASTA(inf)
		info_outs.write("# Read {:d} sequences\n".format(len(headers)))

	if not options.in_names_fname is None:
		if not os.path.isfile(options.in_names_fname):
		 	raise IOError("# Error: file {} does not exist".format(options.in_names_fname))
		with open(options.in_names_fname,'r') as inf:
			species = util.readTable(inf, header=True)
		
		species_name_lookup = dict(zip(species['ODB_code'], [shorten(x) for x in species['Organism']]))
		new_headers = []
		new_seqs = []
		for (hdr, seq) in zip(headers,seqs):
			odb_code = hdr.strip().split()[-1]
			if len(odb_code) == 5:
				try:
					# Rename
					species_name = species_name_lookup[odb_code]
					new_headers.append('{} {}'.format(species_name, hdr))
				except KeyError:
					new_headers.append(hdr)
			else:
				new_headers.append(hdr)
Esempio n. 17
0
	optdict = vars(options)
	for (k,v) in optdict.items():
		data_outs.write("#\t{k}: {v}\n".format(k=k, v=v))

	# Read input
	if not os.path.isfile(options.in_fname):
		raise IOError("# Error: file {} does not exist".format(options.in_fname))
	with open(options.in_fname,'r') as inf:
		# Read a FASTA file?
		(headers, seqs) = biofile.readFASTA(inf)

	pref_ids = {}
	if not os.path.isfile(options.in_filter_fname):
		raise IOError("# Error: file {} does not exist".format(options.in_filter_fname))
	with open(options.in_filter_fname,'r') as inf:
		tab = util.readTable(inf, header=True)
		pref_ids = dict(zip(tab['species'],tab['orthodb.name']))

	# Now go through headers, find multiples, and select one from each.
	selected_indices = [] # index into headers and sequences
	new_headers = []
	new_seqs = []

	orthodb_ids = [h.split()[-1] for h in headers]
	species_names = [h.split()[0].split('_')[0] for h in headers]
	for species_name in list(set(species_names)):
		dupe_indices = [xi for (xi,spec) in enumerate(species_names) if spec==species_name]
		if len(dupe_indices)==1:
			# No problem, no duplicate
			selected_indices.append(dupe_indices[0])
			continue
Esempio n. 18
0
        data_outs.write("#\t{k}: {v}\n".format(k=k, v=v))

    # Read input
    if not os.path.isfile(options.in_fname):
        raise IOError("# Error: file {} does not exist".format(
            options.in_fname))
    with open(options.in_fname, 'r') as inf:
        # Read a FASTA file?
        (headers, seqs) = biofile.readFASTA(inf)

    pref_ids = {}
    if not os.path.isfile(options.in_filter_fname):
        raise IOError("# Error: file {} does not exist".format(
            options.in_filter_fname))
    with open(options.in_filter_fname, 'r') as inf:
        tab = util.readTable(inf, header=True)
        pref_ids = dict(zip(tab['species'], tab['orthodb.name']))

    # Now go through headers, find multiples, and select one from each.
    selected_indices = []  # index into headers and sequences
    new_headers = []
    new_seqs = []

    orthodb_ids = [h.split()[-1] for h in headers]
    species_names = [h.split()[0].split('_')[0] for h in headers]
    for species_name in list(set(species_names)):
        dupe_indices = [
            xi for (xi, spec) in enumerate(species_names)
            if spec == species_name
        ]
        if len(dupe_indices) == 1:
Esempio n. 19
0
	parser.add_argument(dest="prot_in_fname", type=str, help="FASTA file containing protein sequences")
	parser.add_argument(dest="feature_fname", type=str, help="SGD file containing sequence features")
	parser.add_argument(dest="paralog_fname", type=str, help="Yeast Gene Order Browser formatted file of paralog identifications")
	parser.add_argument("--aa", dest="do_aa", default=False, action="store_true", help="compute amino-acid frequencies?")
	parser.add_argument("--gc", dest="do_gc", default=False, action="store_true", help="compute GC frequencies?")
	parser.add_argument("--mw", dest="do_mw", default=False, action="store_true", help="compute molecular weights?")
	parser.add_argument("--target-aas", dest="target_aas", type=str, default=translate.AAs(), help="amino acids (e.g. ACDEF) for frequency analysis")
	parser.add_argument("-p", "--pseudo", dest="pseudocount", type=float, default=0.0, help="pseudocount to add to all frequencies")
	parser.add_argument("-o", "--out", dest="out_fname", type=str, default=None, help="output filename")
	options = parser.parse_args()

	cdna_dict = biofile.readFASTADict(os.path.expanduser(options.cds_in_fname))
	prot_dict = biofile.readFASTADict(os.path.expanduser(options.prot_in_fname))

	# Read paralog data from Yeast Gene Order Browser file
	ygob_data = util.readTable(file(os.path.expanduser(options.paralog_fname),'r'))
	paralog_dict = {}
	for flds in ygob_data.dictrows:
		scer1 = flds['scer1'].strip()
		scer2 = flds['scer2'].strip()
		if not (na.isNA(scer1) or na.isNA(scer2)):
			paralog_dict[scer1] = scer2
			paralog_dict[scer2] = scer1

	# Read SGD data
	sgd_features = util.readTable(file(os.path.expanduser(options.feature_fname),'r'), header=False)
	'''
	http://downloads.yeastgenome.org/curation/chromosomal_feature/SGD_features.README
	1.   Primary SGDID (mandatory)
	2.   Feature type (mandatory)
	3.   Feature qualifier (optional)
Esempio n. 20
0
	if not os.path.isfile(fname):
	 	raise IOError("# Error: file {} does not exist".format(fname))

	tree_root = Newick.Clade()
	tree_root.parent = None
	tree_root.name = "cellular organisms"


	# Get directory of guide file
	path = os.path.dirname(fname)
	curwd = os.getcwd()

	species_names = []
	with open(fname,'r') as inf:
		os.chdir(path)
		tab = util.readTable(inf, header=True)
		rows = tab.dictrows
		if options.debug:
			rows = [x for x in tab.dictrows][:2]
		just_started = True
		for row in rows:
			spec_fname = row['filename']
			#print(spec_fname)
			if not na.isNA(spec_fname):
				spec_inf = util.readTable(open(spec_fname,'r'), header=True)
				twig = phyloutil.treeFromClassificationTable(spec_inf)
				added = phyloutil.mergeTrees(tree_root, twig, add_to_leaf=just_started)
				if added:
					just_started = False
					species_names.append(row['updated.species'])
					#print(spec_fname)
Esempio n. 21
0
                        default=0.0,
                        help="pseudocount to add to all frequencies")
    parser.add_argument("-o",
                        "--out",
                        dest="out_fname",
                        type=str,
                        default=None,
                        help="output filename")
    options = parser.parse_args()

    cdna_dict = biofile.readFASTADict(os.path.expanduser(options.cds_in_fname))
    prot_dict = biofile.readFASTADict(os.path.expanduser(
        options.prot_in_fname))

    # Read paralog data from Yeast Gene Order Browser file
    ygob_data = util.readTable(
        file(os.path.expanduser(options.paralog_fname), 'r'))
    paralog_dict = {}
    for flds in ygob_data.dictrows:
        scer1 = flds['scer1'].strip()
        scer2 = flds['scer2'].strip()
        if not (na.isNA(scer1) or na.isNA(scer2)):
            paralog_dict[scer1] = scer2
            paralog_dict[scer2] = scer1

    # Read SGD data
    sgd_features = util.readTable(file(
        os.path.expanduser(options.feature_fname), 'r'),
                                  header=False)
    '''
	http://downloads.yeastgenome.org/curation/chromosomal_feature/SGD_features.README
	1.   Primary SGDID (mandatory)