コード例 #1
0
def get_physical_ppi(partner_bool=True):
    """Query the 'intact' PSICQUIC database for every reference protein.

    For each UniProt accession in the reference table the service is asked
    for ``<accession> AND taxid:<taxonomy> AND affinity``.  An accession
    whose query raises is reported, recorded in the error list and skipped.

    The taxonomy id is looked up with the module-level ``organism`` name.

    Args:
        partner_bool: when true, the reference tables are uniprot->oln and
            uniprot->pdb and the result is accumulated from
            ``get_ppi_partner``; otherwise the reference table comes from
            the 'proteome' file and the result maps each ordered locus
            name to ``get_ppi_degree(ppis)``.

    Returns:
        A ``(d, error_list, db)`` tuple: the collected results, the list of
        accessions whose query failed, and the database label (suffixed
        with ``_<score_crit>`` when a score criterion is set).
    """
    if partner_bool:
        d_ref = read_in('uniprot', 'oln')
        d_ref2 = read_in('uniprot', 'pdb')
    else:
        d_ref = read_in('Entry', 'Gene names  (ordered locus )', 'proteome')
    taxonomy = taxid()[organism]
    db = 'intact'  # oln not supported for ecoli for mint and biogrid
    score_crit = None

    d = {}
    error_list = []
    s = PSICQUIC(verbose=False)
    for uniprot, oln in d_ref.iteritems():
        try:
            ppis = s.query(
                db, "{0} AND taxid:{1} AND affinity".format(uniprot, taxonomy))
        except Exception:
            # Deliberately best-effort: one failed query must not abort the
            # whole run.  `Exception` (rather than a bare except) keeps
            # Ctrl-C and SystemExit working during this long loop.
            error_list.append(uniprot)
            print("error! can't find ppis for {0}".format(uniprot))
            continue

        if score_crit is not None:
            ppis = get_score(ppis, score_crit)

        if partner_bool:
            d.update(get_ppi_partner(uniprot, d_ref2[uniprot], ppis, d_ref2))
        else:
            d[oln] = get_ppi_degree(ppis)

    # Same test as the filtering step above, so the label is tagged exactly
    # when the results were filtered.
    if score_crit is not None:
        db += '_{0}'.format(score_crit)
    return d, error_list, db
コード例 #2
0
	def __init__(self, verbose):
		"""Keep the verbosity flag and load the tables this object works on.

		Two lookups keyed on oln are read from 'pre_seq2struc' (oln->pdb and
		oln->uniprot).  The input table is whatever ``database`` designates
		for the 'length' property of the module-level ``organism``.
		"""
		self.verbose = verbose
		self.d_ref = read_in('oln', 'pdb', 'pre_seq2struc')
		self.d_ref2 = read_in('oln', 'uniprot', 'pre_seq2struc')

		length_args = database(organism, 'length')
		self.d_input = read_in(*length_args)
		# filled in later by the methods of this class
		self.d_output = dict()
コード例 #3
0
def parse_input(user_input, organism):
    """Turn a user-supplied property expression into data plus a label.

    ``get_operation`` splits the expression into one or two ``read_in``
    argument tuples, a sign and an operation.  With two inputs both tables
    are reconciled using the operation; with a single input the same table
    is read twice and reconciled with itself.

    Returns:
        A tuple ``(d, label, flag)`` where ``d`` is the reconciled data,
        ``label`` comes from ``get_label`` and ``flag`` is true when the
        third element of the first input does not contain 'PDB'.
    """
    inputs, sign, operation = get_operation(user_input, organism)
    primary = inputs[0]
    first_table = read_in(*primary)
    if len(inputs) > 1:
        second_table = read_in(*inputs[1])
        d = reconcile(first_table, second_table, sign, operation)
    else:
        # single property: reconcile the table against a second read of itself
        d = reconcile(first_table, read_in(*primary), sign)

    source_name = primary[2]
    label = get_label(user_input, source_name)
    return d, label, 'PDB' not in source_name
コード例 #4
0
def reference(proteome_subset_bool, organism):
    """Load the reference table for *organism* and pass it to parse_condition.

    Which columns are read depends on the arguments:
      * organism 'protherm'   -> uniprot / pdb
      * proteome_subset_bool  -> ordered locus name / Entry from 'proteome'
      * otherwise             -> oln / pdb
    """
    if organism == 'protherm':
        read_args = ('uniprot', 'pdb')
    elif proteome_subset_bool:
        # based on oln because proteome data usually have oln only
        read_args = ('Gene names  (ordered locus )', 'Entry', 'proteome')
    else:
        read_args = ('oln', 'pdb')

    d_ref = read_in(*read_args, organism=organism)
    return parse_condition(d_ref, organism)
コード例 #5
0
def get_info(organism):
    """Collect gene, location and function annotations for known proteins.

    The UniProt response for *organism* is read line by line as
    tab-separated text.  Only entries whose accession is present in the
    uniprot->pdb reference table are kept.

    Returns:
        dict keyed on the reference table's value for the accession, each
        value being ``[genes, location, function, go_function]``.  The
        location and function texts are cut by the length of their
        'SUBCELLULAR LOCATION: ' / 'FUNCTION: ' prefixes.
    """
    d_ref = read_in('uniprot', 'pdb', organism=organism)

    columns = [
        'id', 'genes', 'comment(function)', 'go(molecular function)',
        'comment(SUBCELLULAR LOCATION)'
    ]
    label_list, response = UniProtAPI(columns).organism_info(
        organism=organism)

    def pick(words, label):
        # value of the column whose header is `label`
        return words[label_list.index(label)]

    location_prefix = len('SUBCELLULAR LOCATION: ')
    function_prefix = len("FUNCTION: ")

    d = {}
    for line in response:
        words = [word.strip() for word in line.split('\t')]
        uniprot = pick(words, 'Entry')  # maybe iterate through a list
        if uniprot not in d_ref:
            continue

        genes = pick(words, 'Gene names')
        function = pick(words, 'Function [CC]')
        function2 = pick(words, 'Gene ontology (molecular function)')
        location = pick(words, 'Subcellular location [CC]')
        # sqlite3 cannot handle double-quote marks
        function = function.replace('"', '')

        d[d_ref[uniprot]] = [
            genes, location[location_prefix:], function[function_prefix:],
            function2
        ]
    return d
コード例 #6
0
    def __init__(self, verbose, d_ref):
        """Store the verbosity flag and load the data this object works on.

        Args:
            verbose: stored unchanged on ``self.verbose``.
            d_ref: accepted for interface compatibility but not used; the
                reference table is always read from the 'proteome' file.

        NOTE(review): the original code stored ``d_ref`` on ``self.d_ref``
        and overwrote it on the very next statement.  That dead store is
        removed here; confirm whether callers expect their ``d_ref`` to be
        honoured instead.
        """
        self.verbose = verbose
        self.d_ref = read_in('Entry',
                             'Gene names  (ordered locus )',
                             filename='proteome')

        uniprotapi = UniProtAPI(['id', 'feature(CHAIN)'])
        self.labels, self.raw_data = uniprotapi.organism_info()
        self.d_output = {}
コード例 #7
0
def read_in_index():
    """Build, per organism, a mapping from running index to pdb id.

    For every organism the pdb/uniprot table in that organism's
    ``0-identify_pdb`` output file is read, its entries are sorted, and the
    keys are numbered from 0 in that order.

    Returns:
        The dict from ``initialize_dict('dict')`` with one ``{index: pdb}``
        dict stored under each organism.
    """
    d = initialize_dict('dict')
    path_template = '../0-identify_structure/0-identify_pdb/{0}/output.txt'
    for organism in organism_list:
        table = read_in('pdb', 'uniprot',
                        filename=path_template.format(organism))
        ordered = collections.OrderedDict(sorted(table.items()))
        d[organism] = dict(enumerate(ordered))
    return d
コード例 #8
0
if __name__ == "__main__":
    help_message(help_msg, bool_org_dir=False)
    d_org = int2organism()
    d_index = initialize_dict('dict')
    d_val = initialize_dict('list')

    protein_property_list = [
        'length', 'abundance', 'evolutionary_rate', 'contact_density',
        'PPI_degree', 'dosage_tolerance'
    ]
    log_zero_list = [
        -1, -1, -4, 1, -1
    ]  #make into dict	#dont log dosage tolerance, already logged for yeast, ecoli is discrete
    for organism in organism_list:
        pre_d_i = read_in('pdb', 'uniprot', organism=organism)
        pre_d_i = collections.OrderedDict(sorted(pre_d_i.items()))
        d_index[organism] = {i: pdb for i, pdb in enumerate(pre_d_i)}

        d_ref = read_in('oln', 'pdb', organism=organism)
        for protein_property in protein_property_list:
            x_input = database(organism, protein_property)
            d = read_in(*x_input)

            d_subset = {
                pdb: d[oln]
                for oln, pdb in d_ref.iteritems() if oln in d
            }
            d_val[organism].append(d_subset)

    line_list = prepare_sql(d_org, d_index, d_val, protein_property_list,
コード例 #9
0
import sys, os

CWD = os.getcwd()
UTLTS_DIR = CWD[:CWD.index('proteomevis_scripts'
                           )] + '/proteomevis_scripts/utlts'
sys.path.append(UTLTS_DIR)
from parse_user_input import help_message
from read_in_file import read_in
from parse_data import organism
from output import writeout

if __name__ == "__main__":
    help_message(
        help_msg)  #need to adjust help message to allow yeast_ecoli case
    if organism == 'yeast_ecoli':  #dependent on yeast/extra.txt being present
        d = read_in('uniprot', 'pdb', organism='ecoli')
        d_old = read_in('uniprot', 'pdb', filename='../ecoli/extra.txt')
        flag = 'EXTRA'
    else:
        d = read_in('uniprot', 'pdb')
        d_old = read_in(
            'uniprot',
            'pdb',
            filename='../../../0-identify_structure/3-length_check/{0}/{1}'.
            format(organism, 'old_seq2struc.txt'))
        flag = 'extra'
    pdb_list = set(d.items()) - set(d_old.items())
    d_output = dict(x for x in pdb_list)
    writeout(['uniprot', 'pdb'], d_output, filename='extra')

    if organism == 'yeast_ecoli':
コード例 #10
0
def get_file():
	"""Return the names of all files found below ``pdb_image/``.

	The directory tree is walked recursively and bare file names (without
	their directory) are collected.  ``.gitkeep`` placeholders are left out
	wherever they occur.  A missing placeholder, or a missing ``pdb_image/``
	directory, yields a shorter or empty list and no longer raises
	ValueError.
	"""
	names = []
	for _dirpath, _dirnames, filenames in os.walk("pdb_image/"):
		names.extend(name for name in filenames if name != '.gitkeep')
	return names

def update_file_list(file_list, d):
	"""Return the entries of *file_list* that match no known pdb id.

	Args:
		file_list: image file names; not modified.
		d: mapping of organism to a container of pdb ids (iterating the
			container must yield the ids).

	Returns:
		A new list, in the original order, of the names in *file_list* that
		are not ``<pdb>.png`` for any pdb id in any organism.

	The original removed matches one at a time from a copy, which raised
	ValueError when the same pdb id was listed under two organisms.
	Filtering against a set avoids that and drops every copy of a matched
	name.
	"""
	wanted = set()
	for d_pdb in d.values():
		wanted.update(pdb + '.png' for pdb in d_pdb)
	return [name for name in file_list if name not in wanted]

def remove_image(update):
	"""Delete every image file named in *update*, printing each path first.

	The directory of a file is obtained from ``get_path``.
	"""
	for image_name in update:
		target = '{0}/{1}'.format(get_path(image_name), image_name)
		print(target)
		os.remove(target)

# Script entry point: delete the images under pdb_image/ that no longer
# correspond to a pdb id of any organism.
if __name__ == "__main__":
	help_message(help_msg, bool_org_dir=False)
	# names of the image files currently on disk
	file_list = get_file()
	# per organism: the table read with 'pdb' as first and 'uniprot' as second column
	d = initialize_dict('dict')
	for organism in organism_list:
		d[organism] = read_in('pdb', 'uniprot', organism=organism)
	# whatever is left after discarding files with a matching pdb id gets removed
	update = update_file_list(file_list, d)
	remove_image(update)
コード例 #11
0
            else:
                want_i = chain_list.index(chain)
                self.io.set_structure(pre_chain_list[want_i])
                self.io.save(self.pdb_file.get_id() + "." + chain + ".pdb")

    def run(self):
        """Write the per-chain files for every pdb entry in ``self.d_input``.

        If ``<DIR>/<pdb>.pdb`` exists it is parsed and its chains are saved
        without a chain translation.  Otherwise every file matching the
        ``<pdb>-pdb-bundle*pdb`` pattern is parsed and saved with the
        mapping returned by ``read_in_mapping``.

        ``self.chain_list`` and ``self.pdb_file`` are set as a side effect
        for ``save_pdb_chain_file`` to use.
        """
        self.get_pdb_chain()
        for pdb, self.chain_list in self.d_input.iteritems():
            single_file = "{0}/{1}.pdb".format(DIR, pdb)
            if os.path.exists(single_file):
                self.pdb_file = PDBParser().get_structure(pdb, single_file)
                self.save_pdb_chain_file(None)
                continue

            # no plain .pdb file: fall back to the pdb-bundle sub-files
            bundle_pattern = "{0}/{1}-pdb-bundle*pdb".format(DIR, pdb)
            for sub_file in glob.glob(bundle_pattern):
                translate_chain = read_in_mapping(pdb)
                self.pdb_file = PDBParser().get_structure(pdb, sub_file)
                self.save_pdb_chain_file(translate_chain)

# Script entry point: unpack the downloaded archives, then split every
# structure listed in 'pre_seq2struc' into per-chain files.
if __name__ == "__main__":
    help_message(help_msg)
    untar()

    # only the keys (first column, 'pdb') of the table are handed to PDBChain
    d_input = read_in('pdb', 'uniprot', filename='pre_seq2struc')
    pdbchain = PDBChain(d_input.keys())
    pdbchain.run()

    print_next_step()
コード例 #12
0
    def run(self, verbose=''):
        """Gather all information, pick the best pdb chain, return the result.

        Args:
            verbose: any truthy value additionally triggers
                ``print_verbose`` before returning.

        Returns:
            ``self.d_output``.
        """
        self.get_all_info()
        self.get_best_pdb_chain()
        if not verbose:
            return self.d_output
        self.print_verbose()
        return self.d_output


def prepare_writeout(d_uniprot_pdb, d_proteome):
    """Pair each uniprot's pdb with that uniprot's entry in *d_proteome*.

    Args:
        d_uniprot_pdb: mapping uniprot -> pdb.
        d_proteome: mapping that must contain every key of *d_uniprot_pdb*
            (a missing key raises KeyError).

    Returns:
        dict mapping uniprot -> ``[pdb, d_proteome[uniprot]]``.
    """
    return {
        uniprot: [pdb, d_proteome[uniprot]]
        for uniprot, pdb in d_uniprot_pdb.iteritems()
    }


# Script entry point: map every proteome entry to a pdb chain and write the
# result next to the existing 'pre_seq2struc' file.
if __name__ == '__main__':
    args = help_message(help_msg, bool_add_verbose=True)
    # table read from 'proteome': 'Entry' column -> ordered locus name column
    d_proteome = read_in('Entry',
                         'Gene names  (ordered locus )',
                         filename='proteome')
    uniprot2pdb = UniProt2PDB(d_proteome.keys())
    d_uniprot_pdb = uniprot2pdb.run(args.verbose)

    d_output = prepare_writeout(d_uniprot_pdb, d_proteome)
    filename = 'pre_seq2struc'
    # the fresh result is written under a "new_" prefix
    writeout(['uniprot', 'pdb', 'oln'],
             d_output,
             filename="new_{0}".format(filename))
    database_update_needed(filename)
コード例 #13
0
			res1 = contact[0].id[1]
			res2 = contact[1].id[1]
			if not abs(res1 - res2) in [1, 0]:	#no nearest neighbors
				M[res1][res2] = 1
		return M + M.T
	

if __name__ == "__main__":
	help_message(help_msg)
	extra = ''
	method = false_or_true("Calculate contact density like Shakh2006 [default Zhou2008]?")
	if false_or_true("Relax selection criterion 2"):
		extra += 'pre_output'

	contact_defn = ['Bloom', 'Shakh'][method]
	d_input = read_in('pdb', 'oln', filename = extra)
	d_input1 = read_in('pdb', 'uniprot', filename = extra)
	d_output = {}
	for pdb, oln in d_input.iteritems():
		protein_contact = ProteinContact(pdb, contact_defn)
		residues = protein_contact.get_residues()
		contact_density = protein_contact.contact_matrix().sum() / float(len(residues))
		if organism=='protherm':
			d_output[d_input1[pdb]] = contact_density
			x_name = 'uniprot'
		else:
			d_output[oln] = contact_density 
			x_name = 'oln'

	filename = 'PDB'
	if method:
コード例 #14
0
    fig = plt.figure()
    ax = fig.add_axes([0, 0, 1, 1])
    ax.set_axis_off()
    ax.scatter(data_list[1], data_list[0], c=color, s=1)
    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1])
    ax.set_aspect('auto')
    plt.savefig('species.{0}.png'.format(int(d_label[organism])),
                transparent="True",
                dpi=350)


#	plt.show()

# Script entry point: for every organism, read the 'sid' and 'tm' tables,
# merge them with the PPI partner data and plot the result.
if __name__ == "__main__":
    help_message(help_msg, bool_org_dir=False)
    d_org = int2organism()
    # inverse of d_org: organism name -> key of d_org
    d_label = {org: i for i, org in d_org.iteritems()}
    protein_property_list = ['sid', 'tm']
    # NOTE(review): `organism` is a module-level name here; the plotting code
    # above reads `organism` without receiving it as an argument, so this
    # loop variable presumably must keep its name - confirm before renaming.
    for organism in organism_list:
        predata_list = []
        for protein_property in protein_property_list:
            x_input = database(organism, protein_property)
            d = read_in(*x_input)
            predata_list.append(d)
        data_ppi = read_in_ppi_partners()
        # predata_list[0] is the 'sid' table, predata_list[1] the 'tm' table
        data_tup, color = merge(predata_list[0], predata_list[1], data_ppi)
        # transpose the tuples returned by merge into per-field sequences
        data_list = zip(*data_tup)
        plotout(data_list, color, d_label)
    print_next_step('../')
コード例 #15
0
             data[1],
             'bo',
             c=color[organism],
             label="$\\rho=${0:.2f} ({1:.2E})\n$n=${2} ({3:.0f}%)".format(
                 r, pvalue, num_list[0], 100 * num_list[0] / num_list[1]))

    plt.title(title)
    plt.xlabel(label_list[0]), plt.ylabel(label_list[1]), plt.legend()
    plt.show()


# Script entry point: interactively ask for two properties, merge them and
# plot one against the other.
if __name__ == "__main__":
    organism = which_organism()
    # each prompt yields (data, axis label, flag); see parse_input
    d_x, xlabel, proteome_subset_bool_x = parse_input(
        str(raw_input("Property x: ")), organism)
    d_y, ylabel, proteome_subset_bool_y = parse_input(
        str(raw_input("Property y: ")), organism)

    # the question below is only asked when the flag is set for both properties
    proteome_subset_bool = proteome_subset_bool_x and proteome_subset_bool_y
    if proteome_subset_bool:
        proteome_subset_bool = false_or_true(
            "Include proteins not in ProteomeVis")

    data, labels = merge(d_x, d_y, proteome_subset_bool, organism)
    # number of merged data points vs. number of entries in the proteome file
    num = len(data[0])
    total = len(read_in('Entry', 'Entry', 'proteome', organism=organism))

    plotout(organism, data, [num, total],
            get_title(organism, proteome_subset_bool), [xlabel, ylabel],
            labels)
コード例 #16
0
            line.extend(d_info[organism][pdb])
            line.append(int(o))
            line_list.append(line)
    return line_list


# Script entry point: gather per-organism chain annotations and write them
# to the 'proteomevis_inspect' SQLite table.
if __name__ == "__main__":
    args = help_message(help_msg, bool_add_verbose=True,
                        bool_org_dir=False)  # add verbose option
    d_org = int2organism()
    d_translate = initialize_dict('dict')
    d_index = initialize_dict('dict')
    d_info = initialize_dict('dict')

    for organism in organism_list:
        pre_d_i = read_in('pdb', 'uniprot', organism=organism)
        # sort the entries so the running index below is reproducible
        pre_d_i = collections.OrderedDict(sorted(pre_d_i.items()))
        d_translate[organism] = pre_d_i
        # running index -> pdb, in sorted order
        d_index[organism] = {i: pdb for i, pdb in enumerate(pre_d_i)}
        d_info[organism] = get_info(organism)

    line_list = prepare_sql(d_org, d_translate, d_index, d_info, args)

    columns = [
        'chain_id', 'pdb', 'uniprot', 'genes', 'location', 'function1',
        'function2', 'species'
    ]
    write_sqlite = SQLite3('proteomevis_inspect', columns, line_list)
    write_sqlite.run()

    print_next_step('../')
コード例 #17
0
	url = "http://www.rcsb.org/pdb/download/downloadFile.do?fileFormat=pdb&compression=NO&structureId="
	not_available_list = []
	for pdb in pdb_list:
		pdb_name = "{0}.pdb".format(pdb)
		if not os.path.exists(pdb_name):
			pdbid = url+str(pdb)
			content = urllib.urlopen(pdbid).read()
			if '404 Not Found' in content: not_available_list.append(pdb)
			else:
				open(pdb_name, "w" ).write(content)
				print pdb_name 
	return not_available_list
 
def check(not_available_list):
	"""Report the structures that still have to be fetched by hand.

	An entry of *not_available_list* counts as present when a
	``<pdb>-pdb-bundle.tar.gz`` or ``<pdb>-pdb-bundle.tar`` file exists in
	the current directory.  If any entries remain, an instruction line and
	the comma-joined ids are printed.  Nothing is returned.
	"""
	new_list = [
		pdb for pdb in not_available_list
		if not (os.path.exists('{0}-pdb-bundle.tar.gz'.format(pdb))
			or os.path.exists('{0}-pdb-bundle.tar'.format(pdb)))
	]
	if not new_list:
		return
	# obtain bundle case
	print("copy and paste the {0} structures below in the rcsb.org download feature (could not be downloaded programatically)".format(len(new_list)))
	print(",".join(new_list))


# Script entry point: download the structures listed in 'pre_seq2struc' and
# report the ones that could not be fetched.
if __name__ == "__main__":
	help_message(help_msg)
	d = read_in('pdb', 'uniprot', 'pre_seq2struc')
	# keep only the first four characters of every key
	# (presumably the pdb id without its chain suffix - TODO confirm)
	pdb_list = [x[:4] for x in d]
	# set() drops the duplicates produced by the truncation above
	not_available_list = save_pdb_file(set(pdb_list))
	check(not_available_list)
	print_next_step()
コード例 #18
0
                    float(d_val[organism][1][pdb_pair]),
                    int(d_val[organism][2][pdb_pair]), ppi_bool
                ]
                line_list.append(line)
                count += 1
    return line_list


# Script entry point: collect the pairwise values of every organism and
# prepare the rows of the 'proteomevis_edge' SQLite table.
if __name__ == "__main__":
    help_message(help_msg, bool_org_dir=False)
    d_org = int2organism()
    d_index = initialize_dict('dict')
    d_val = initialize_dict('list')

    for organism in organism_list:
        pre_d_i = read_in('pdb', 'uniprot', organism=organism)
        # sort the entries so the running index below is reproducible
        pre_d_i = collections.OrderedDict(sorted(pre_d_i.items()))
        # running index -> pdb, in sorted order
        d_index[organism] = {i: pdb for i, pdb in enumerate(pre_d_i)}

        # d_val[organism] ends up as [TM table, SID table, nal table]
        for x in [
                'TM', 'SID', 'nal'
        ]:  # 'align1' and 'align2' are left out on purpose: the sequence alignments take up 700MB, which makes downloading edges impossible
            d_val[organism].append(read_in(*database(organism, x)))
    d_ppi = read_in_ppi_partners()
    line_list = prepare_sql(d_org, d_index, d_ppi, d_val)

    columns = [
        'id', 'species', 'sourceID', 'targetID', 'tm', 'sid', 'align_length',
        'ppi'
    ]
    write_sqlite = SQLite3('proteomevis_edge', columns, line_list)