コード例 #1
0
def collect_pfam2go_info(pfam2go_file):  # Pfam2GO.txt
    """Group formatted GO annotations by Pfam accession.

    Blank lines and lines starting with ``#`` are ignored. Every other line
    is split on tabs and its first four columns are read as Pfam accession,
    GO identifier, annotation text and category. The categories
    ``function``, ``process`` and ``component`` are abbreviated to ``MF``,
    ``BP`` and ``CC``; any other category is kept verbatim.

    Returns:
        dict: ``{pfam: {"<annotation>(<category>:<go>)": ""}}``. The inner
        dict acts as a set; its values are always empty strings.
    """
    abbreviations = {"function": "MF", "process": "BP", "component": "CC"}
    collected = {}
    for raw in utils.gzip_bzip2_biom_open_readlines(pfam2go_file):
        record = raw.strip()
        if record == "" or record.startswith("#"):
            continue
        columns = record.split("\t")
        accession, go_id, description, kind = (
            columns[0], columns[1], columns[2], columns[3])
        kind = abbreviations.get(kind, kind)
        label = description + "(" + kind + ":" + go_id + ")"
        collected.setdefault(accession, {})[label] = ""
    return collected
コード例 #2
0
def collect_taxonomy_info(map_file):
    """Index taxonomy rows by the clade name at the end of their lineage.

    The header row (any line starting with ``Taxon``) defines the column
    positions; the columns ``Taxon``, ``Scientific_name``, ``Rank`` and
    ``Lineage`` are required on every data row.

    The key of each entry is the text following ``__`` in the last
    ``|``-separated lineage element; when that element carries no such
    name, the scientific name is used instead.

    Args:
        map_file: path handed to ``utils.gzip_bzip2_biom_open_readlines``.

    Returns:
        dict: ``{key: "<Taxon>\\t<Scientific_name>\\t<Rank>\\t<Lineage>"}``.

    Raises:
        KeyError: if a data row is met before a header providing the
            required columns.
    """
    taxa_map = {}
    titles = {}
    # The original also opened map_file directly and never used or closed
    # that handle; all reading goes through the utils helper.
    for line in utils.gzip_bzip2_biom_open_readlines(map_file):
        line = line.strip()
        if not line:
            continue
        info = line.split("\t")
        if line.startswith("Taxon"):
            # Walk right-to-left so that, for a repeated column name, the
            # left-most position wins (what list.index() used to return).
            for position in range(len(info) - 1, -1, -1):
                titles[info[position]] = position
            continue
        taxa_id = info[titles["Taxon"]]
        taxa_name = info[titles["Scientific_name"]]
        taxa_rank = info[titles["Rank"]]
        taxa_lineage = info[titles["Lineage"]]
        myid = taxa_name
        # A bare "__" with nothing after it no longer crashes; the
        # scientific name is kept as the key in that case.
        match = re.search(r"__(\S+)", taxa_lineage.split("|")[-1])
        if match:
            myid = match.group(1)
        taxa_map[myid] = "\t".join(
            (taxa_id, taxa_name, taxa_rank, taxa_lineage))
    return taxa_map
コード例 #3
0
def collect_interaction_info(int_file, filter_flag, human_pfam):  # INTERACTION.txt
    """Build a symmetric interaction map from a ``|``-separated file.

    For every non-blank line the first two fields are the interacting
    identifiers and the second-to-last field is the interaction level.
    Self-interactions are dropped. When ``filter_flag`` equals ``"yes"``,
    pairs in which neither identifier is contained in ``human_pfam`` are
    dropped as well. Rows are kept whatever their level value is.

    Returns:
        dict: ``{id_a: {id_b: level}}`` with each pair stored in both
        directions.
    """
    network = {}
    for raw in utils.gzip_bzip2_biom_open_readlines(int_file):
        record = raw.strip()
        if record == "":
            continue
        parts = record.split("|")
        left, right, level = parts[0], parts[1], parts[-2]
        if left == right:
            # self-interaction
            continue
        if filter_flag == "yes" and not (left in human_pfam
                                         or right in human_pfam):
            # neither partner is a human Pfam
            continue
        network.setdefault(left, {})[right] = level
        network.setdefault(right, {})[left] = level
    return network
コード例 #4
0
def collect_pfam_info(map_file):  # uniprot_human_pfam.tsv
    """Return the gene names of every Pfam row whose taxon id is 9606.

    Blank lines and ``#`` comments are skipped. A line starting with
    ``Pfam<TAB>`` is the header and fixes the column positions. Data rows
    whose column count differs from the number of header names are
    ignored, as are rows whose ``NCBI_TaxID`` is not ``"9606"``.

    Returns:
        dict: ``{Pfam: Gene_names}``; a later row for the same Pfam
        replaces an earlier one.
    """
    column_of = {}
    human = {}
    for raw in utils.gzip_bzip2_biom_open_readlines(map_file):
        record = raw.strip()
        if record == "" or record.startswith("#"):
            continue
        cells = record.split("\t")
        if record.startswith("Pfam\t"):
            for position, name in enumerate(cells):
                column_of[name] = position
            continue
        if len(cells) != len(column_of):
            continue
        accession = cells[column_of["Pfam"]]
        genes = cells[column_of["Gene_names"]]
        taxon = cells[column_of["NCBI_TaxID"]]
        if taxon == "9606":  # keep human entries only
            human[accession] = genes
    return human
コード例 #5
0
def collect_mapping_info(map_file):  # uniprot_human_pfam.tsv
    """Return the Pfams seen with taxon 9606, with all their taxon ids.

    Blank lines and ``#`` comments are skipped. A line starting with
    ``Pfam<TAB>`` is the header and fixes the column positions. Every data
    row contributes its ``NCBI_TaxID`` to the set of its ``Pfam``.

    Returns:
        dict: ``{Pfam: "<taxid>;<taxid>;..."}`` (taxon ids sorted) for
        each Pfam whose taxon set contains ``"9606"``.
    """
    column_of = {}
    taxa_by_pfam = {}
    for raw in utils.gzip_bzip2_biom_open_readlines(map_file):
        record = raw.strip()
        if record == "" or record.startswith("#"):
            continue
        cells = record.split("\t")
        if record.startswith("Pfam\t"):
            column_of.update(
                (name, position) for position, name in enumerate(cells))
            continue
        accession = cells[column_of["Pfam"]]
        # The organism value is not used, but the lookup is kept so a
        # missing "Organism" column still fails exactly as before.
        _organism = cells[column_of["Organism"]]
        taxon = cells[column_of["NCBI_TaxID"]]
        taxa_by_pfam.setdefault(accession, {})[taxon] = ""

    # human pfams
    return {
        accession: ";".join(sorted(taxa))
        for accession, taxa in taxa_by_pfam.items()
        if "9606" in taxa
    }
コード例 #6
0
def extract_annotation_info(datfile, output_path):  # Pfam-A.hmm.dat
    """Write a gzip-compressed table of Pfam accessions and descriptions.

    ``#=GF AC`` lines set the current accession (the text up to the first
    ``.``); ``#=GF DE`` lines attach a description to it, multiple
    descriptions for one accession being joined with ``;``. The table is
    written to ``<output_path>/pfam_descriptions.txt`` with the header
    ``Pfam<TAB>description`` and then compressed by the external ``gzip``
    command.

    Args:
        datfile: path of the Pfam ``.dat`` file.
        output_path: directory that receives the output file.

    Returns:
        None. Exits the interpreter via ``sys.exit`` when ``datfile`` is
        not an existing file.
    """
    import shlex

    if not os.path.isfile(datfile):
        sys.exit("Error: pfam file doesn't exit! " + datfile)
    outfile = os.path.join(output_path, "pfam_descriptions.txt")

    anns = {}
    myid = ""
    for line in utils.gzip_bzip2_biom_open_readlines(datfile):
        line = line.strip()
        if not line:
            continue
        # One search per line type: a tag line without a payload is
        # ignored instead of raising AttributeError on a None match.
        accession = re.search(r"#=GF\s+AC\s+([^.]+)", line)
        if accession:
            myid = accession.group(1)
            continue
        description = re.search(r"#=GF\s+DE\s+([\S\s]+)", line)
        if description:
            myann = description.group(1)
            if myid in anns:
                anns[myid] = anns[myid] + ";" + myann
            else:
                anns[myid] = myann

    with open(outfile, "w") as open_file:
        open_file.write("Pfam\tdescription\n")
        for mypfam in sorted(anns):
            open_file.write(mypfam + "\t" + anns[mypfam] + "\n")
    # Quote the path so directories containing spaces or shell
    # metacharacters do not break (or alter) the command.
    os.system("gzip " + shlex.quote(outfile))
コード例 #7
0
ファイル: pfam2go.py プロジェクト: biobakery/metawibele
def collect_pfam2go_info(annfile):  # Pfam2GO.txt
    """Map each Pfam accession to its GO terms and their descriptions.

    Blank lines and lines starting with ``#`` are ignored. The first four
    tab-separated columns of every other line are read as Pfam accession,
    GO identifier, description and type. The types ``process``,
    ``function`` and ``component`` are abbreviated to ``BP``, ``MF`` and
    ``CC``; any other type is kept verbatim.

    Returns:
        dict: ``{pfam: {go: "<description>\\t<type>"}}``.
    """
    abbreviations = {"process": "BP", "function": "MF", "component": "CC"}
    collected = {}
    for raw in utils.gzip_bzip2_biom_open_readlines(annfile):
        record = raw.strip()
        if record == "" or record.startswith("#"):
            continue
        columns = record.split("\t")
        accession, go_id, description, kind = (
            columns[0], columns[1], columns[2], columns[3])
        kind = abbreviations.get(kind, kind)
        collected.setdefault(accession, {})[go_id] = description + "\t" + kind
    return collected
コード例 #8
0
def collect_expression_info(exp_list):  #  expression.list
    """Collect the gene names listed in a set of expression tables.

    ``exp_list`` is iterated as a sequence of file paths; blank entries
    and entries starting with ``#`` are skipped. Inside each file, blank
    lines and lines starting with ``#`` or ``Gene ID`` are skipped, and
    the second tab-separated column of every remaining line is recorded.

    Returns:
        dict: ``{gene: ""}``, used as a set of gene names.
    """
    expressed = {}
    for entry in exp_list:
        path = entry.strip()
        if path == "" or path.startswith("#"):
            continue
        for raw in utils.gzip_bzip2_biom_open_readlines(path):
            record = raw.strip()
            if record == "" or record.startswith(("#", "Gene ID")):
                continue
            expressed[record.split("\t")[1]] = ""
    return expressed
コード例 #9
0
def collect_taxa_list(taxa_file):  # uniprot_taxaID_bac-arc-vir.tsv
    """Return every non-blank line of ``taxa_file`` as a dict key.

    Lines are stripped of surrounding whitespace first; the values of the
    returned dict are always empty strings (the dict is used as a set).
    """
    stripped = (
        raw.strip()
        for raw in utils.gzip_bzip2_biom_open_readlines(taxa_file))
    return {entry: "" for entry in stripped if len(entry) > 0}
コード例 #10
0
def collect_pfam_info(pfamfile):  # Pfam_ann.tsv
    """Map the first column of each data row to its second column.

    Blank lines and lines starting with ``Pfam`` (the header) are skipped.
    A later row with the same first column replaces the earlier value.

    Returns:
        dict: ``{column_1: column_2}``.
    """
    descriptions = {}
    for raw in utils.gzip_bzip2_biom_open_readlines(pfamfile):
        record = raw.strip()
        if record == "" or record.startswith("Pfam"):
            continue
        columns = record.split("\t")
        accession, text = columns[0], columns[1]
        descriptions[accession] = text
    return descriptions
コード例 #11
0
def collect_taxonomy_info(taxa_file, taxa_hits):  # uniprot_taxonomy.map.tsv
    """Collect taxonomy details for the taxa listed in ``taxa_hits``.

    The header row (any line starting with ``Taxon``) defines the column
    positions; ``Taxon``, ``Scientific_name``, ``Lineage`` and ``Rank``
    are required. Rows whose taxon is not in ``taxa_hits`` and rows whose
    first lineage element contains ``unclassified_sequences`` are skipped.

    When the last ``|``-separated lineage element looks like
    ``<prefix>__<name>``, ``<name>`` becomes the key in ``taxa_map`` and
    ``<prefix>`` becomes the rank, with the one-letter prefixes k/p/c/o/
    f/g/s/t spelled out (Kingdom ... Terminal). Otherwise the scientific
    name and the ``Rank`` column are used.

    Args:
        taxa_file: path handed to ``utils.gzip_bzip2_biom_open_readlines``.
        taxa_hits: container supporting ``in`` with the taxon ids to keep.

    Returns:
        tuple: ``(taxa, taxa_map)`` where
        ``taxa[taxon] = "<name>\\t<rank>\\t<lineage>"`` and
        ``taxa_map[key] = "<taxon>\\t<name>\\t<rank>\\t<lineage>"``.

    Raises:
        KeyError: if a data row is met before a header providing the
            required columns.
    """
    rank_names = {
        "k": "Kingdom",
        "p": "Phylum",
        "c": "Class",
        "o": "Order",
        "f": "Family",
        "g": "Genus",
        "s": "Species",
        "t": "Terminal",
    }
    taxa = {}
    taxa_map = {}
    titles = {}
    for line in utils.gzip_bzip2_biom_open_readlines(taxa_file):
        line = line.strip()
        if not line:
            continue
        info = line.split("\t")
        if line.startswith("Taxon"):
            # Right-to-left so the left-most position of a repeated
            # column name wins, as list.index() used to give.
            for position in range(len(info) - 1, -1, -1):
                titles[info[position]] = position
            continue
        mytaxa = info[titles["Taxon"]]
        if mytaxa not in taxa_hits:
            continue
        myname = info[titles["Scientific_name"]]
        myline = info[titles["Lineage"]]
        myrank = info[titles["Rank"]]
        clades = myline.split("|")
        if "unclassified_sequences" in clades[0]:
            continue
        myid = myname
        # A tail such as "__name" (no prefix) does not match; the row then
        # falls back to the scientific name and Rank column rather than
        # raising AttributeError on a None match.
        match = re.search(r"([^_]+)__(\S+)", clades[-1])
        if match:
            myrank = rank_names.get(match.group(1), match.group(1))
            myid = match.group(2)
        taxa_map[myid] = "\t".join((mytaxa, myname, myrank, myline))
        taxa[mytaxa] = "\t".join((myname, myrank, myline))
    return taxa, taxa_map
コード例 #12
0
def report_each_map(uniref_ann, map_type, uniref_type, output_path):
    """Write, for one annotation column, which UniRef ids carry each value.

    The header row of ``uniref_ann`` (a line starting with ``ID``,
    ``UniRef<TAB>`` or ``UniRefID<TAB>``) names the columns. Column names
    are compared to ``map_type`` after replacing ``(`` with ``_`` and
    removing ``)``. For every data row, each value found in a matching
    column is associated with the row's identifier, taken from the
    ``UniRef`` column, else ``ID``, else ``UniRefID``. The values ``NA``,
    ``na`` and ``NaN`` are ignored.

    The result is written to
    ``<output_path>/map_<map_type>_<uniref_type>.txt`` as one line per
    value (``value<TAB>id<TAB>id...``, both sorted) and then compressed by
    the external ``gzip`` command.

    Raises:
        ValueError: if a value has to be recorded but the header provided
            none of the identifier columns.
    """
    import shlex

    outfile = os.path.join(output_path,
                           "map_" + map_type + "_" + uniref_type + ".txt")
    ignored = ("NA", "na", "NaN")
    outs = {}
    id_column = None
    map_columns = []
    for line in utils.gzip_bzip2_biom_open_readlines(uniref_ann):
        line = line.strip()
        if not line:
            continue
        info = line.split("\t")
        if line.startswith(("ID", "UniRef\t", "UniRefID\t")):
            # "UniRefID" headers were accepted before but never looked up,
            # which left the identifier unbound on the first data row.
            id_column = None
            for candidate in ("UniRef", "ID", "UniRefID"):
                if candidate in info:
                    id_column = info.index(candidate)
                    break
            # Resolve the wanted columns once per header instead of
            # normalising every cell's column name on every row.
            map_columns = [
                position for position, name in enumerate(info)
                if name.replace("(", "_").replace(")", "") == map_type
            ]
            continue
        if not map_columns:
            continue
        if id_column is None:
            raise ValueError(
                "no UniRef/ID/UniRefID column in header of " + uniref_ann)
        myuniref = info[id_column]
        for position in map_columns:
            if position >= len(info):
                # row is shorter than the header; nothing to record
                continue
            myvalue = info[position]
            if myvalue in ignored:
                continue
            outs.setdefault(myvalue, {})[myuniref] = ""

    with open(outfile, "w") as open_out:
        for myid in sorted(outs):
            open_out.write(myid + "\t" + "\t".join(sorted(outs[myid])) + "\n")

    # Quote the path so spaces or shell metacharacters cannot break the
    # command.
    os.system("gzip " + shlex.quote(outfile))
0
def collect_pfam_ann(pfamfile):  # Pfam_ann.tsv
    """Map the first column of each row to its second column.

    Blank lines and lines starting with ``#`` are skipped. When the same
    first column occurs more than once, the value of its first occurrence
    is kept.

    Returns:
        dict: ``{column_1: column_2}``.
    """
    annotations = {}
    for raw in utils.gzip_bzip2_biom_open_readlines(pfamfile):
        record = raw.strip()
        if record == "" or record.startswith("#"):
            continue
        columns = record.split("\t")
        # setdefault leaves an already stored annotation untouched
        annotations.setdefault(columns[0], columns[1])
    return annotations
コード例 #14
0
def collect_taxonomy_info(map_file):
    """Index taxonomy rows by the clade name at the end of their lineage.

    The header row (any line starting with ``Taxon``) defines the column
    positions; ``Taxon``, ``Scientific_name``, ``Rank`` and ``Lineage``
    are required on every data row.

    When the last ``|``-separated lineage element looks like
    ``<prefix>__<name>``, ``<name>`` becomes the key and ``<prefix>`` the
    rank, with the one-letter prefixes k/p/c/o/f/g/s/t spelled out
    (Kingdom ... Terminal). Otherwise the scientific name and the ``Rank``
    column are used.

    Args:
        map_file: path handed to ``utils.gzip_bzip2_biom_open_readlines``.

    Returns:
        dict: ``{key: "<Taxon>\\t<Scientific_name>\\t<rank>\\t<Lineage>"}``.

    Raises:
        KeyError: if a data row is met before a header providing the
            required columns.
    """
    rank_names = {
        "k": "Kingdom",
        "p": "Phylum",
        "c": "Class",
        "o": "Order",
        "f": "Family",
        "g": "Genus",
        "s": "Species",
        "t": "Terminal",
    }
    taxa_map = {}
    titles = {}
    for line in utils.gzip_bzip2_biom_open_readlines(map_file):
        line = line.strip()
        if not line:
            continue
        info = line.split("\t")
        if line.startswith("Taxon"):
            # Right-to-left so the left-most position of a repeated
            # column name wins, as list.index() used to give.
            for position in range(len(info) - 1, -1, -1):
                titles[info[position]] = position
            continue
        taxa_id = info[titles["Taxon"]]
        taxa_name = info[titles["Scientific_name"]]
        taxa_rank = info[titles["Rank"]]
        taxa_lineage = info[titles["Lineage"]]
        myid = taxa_name
        # A tail such as "__name" (no prefix) does not match; the row then
        # keeps the scientific name and Rank column instead of raising
        # AttributeError on a None match.
        match = re.search(r"([^_]+)__(\S+)", taxa_lineage.split("|")[-1])
        if match:
            taxa_rank = rank_names.get(match.group(1), match.group(1))
            myid = match.group(2)
        taxa_map[myid] = "\t".join(
            (taxa_id, taxa_name, taxa_rank, taxa_lineage))
    return taxa_map
コード例 #15
0
def extract_mapping_info(mapfile):  # uniprot_taxonomy.map.tsv
    """Map each taxon id to its scientific name.

    The header row (any line starting with ``Taxon``) defines the column
    positions; the ``Taxon`` and ``Scientific_name`` columns are read from
    every other non-blank line.

    Returns:
        dict: ``{Taxon: Scientific_name}``.
    """
    column_of = {}
    names_by_id = {}
    for raw in utils.gzip_bzip2_biom_open_readlines(mapfile):
        record = raw.strip()
        if record == "":
            continue
        cells = record.split("\t")
        if record.startswith("Taxon"):
            # index() keeps the left-most position of a repeated name
            column_of.update({name: cells.index(name) for name in cells})
            continue
        taxon = cells[column_of["Taxon"]]
        names_by_id[taxon] = cells[column_of["Scientific_name"]]  # TaxID <-> Tax
    return names_by_id
コード例 #16
0
def collect_pfam_info(pfam_file):  # pdb_chain_pfam.tsv
    """Group Pfam identifiers by PDB entry and chain.

    Blank lines and lines starting with ``#`` or ``PDB`` are skipped. On
    every other line the first tab-separated column is the PDB id, the
    second the chain and the second-to-last the Pfam identifier.

    Returns:
        dict: ``{pdb: {chain: {pfam: ""}}}``; the innermost dict is used
        as a set.
    """
    by_structure = {}
    for raw in utils.gzip_bzip2_biom_open_readlines(pfam_file):
        record = raw.strip()
        if record == "" or record.startswith(("#", "PDB")):
            continue
        cells = record.split("\t")
        structure, chain, family = cells[0], cells[1], cells[-2]
        chains = by_structure.setdefault(structure, {})
        chains.setdefault(chain, {})[family] = ""
    return by_structure
コード例 #17
0
def collect_taxanomy_info(map_file):  # pdb_chain_taxonomy.tsv
    """Group taxon identifiers by PDB entry and chain.

    Blank lines and lines starting with ``#`` or ``PDB`` are skipped. On
    every other line the first three tab-separated columns are read as
    PDB id, chain and taxon id.

    The per-PDB ``taxa_hit`` index the previous version also filled was
    never read (its only consumer was commented out), so it is no longer
    built.

    Returns:
        dict: ``{pdb: {chain: {taxon: ""}}}``; the innermost dict is used
        as a set.
    """
    taxa = {}
    for line in utils.gzip_bzip2_biom_open_readlines(map_file):
        line = line.strip()
        if not line or line.startswith(("#", "PDB")):
            continue
        info = line.split("\t")
        mypdb, mychain, tax = info[0], info[1], info[2]
        taxa.setdefault(mypdb, {}).setdefault(mychain, {})[tax] = ""
    return taxa
コード例 #18
0
def read_vignettes_file(vignettes_file, specific_annotation):
    """
    Collect the annotations of the requested vignette types.

    Blank lines and ``#`` comments are skipped. A line starting with
    ``type`` is the header and fixes the column positions. A data row is
    kept when its ``type`` value, or the lower-cased value, is contained
    in ``specific_annotation``. If the header has no ``annotation``
    column, a warning is logged for the row and it is skipped.

    Input: vignettes filename, container of wanted types
    Output: vignettes = {annotation: ""} (a dict used as a set)
    """
    config.logger.info("Start read_vignettes_file")

    column_of = {}
    selected = {}
    for raw in utils.gzip_bzip2_biom_open_readlines(vignettes_file):
        record = raw.strip()
        if record == "" or record.startswith("#"):
            continue
        cells = record.split("\t")
        if record.startswith("type"):
            # index() keeps the left-most position of a repeated name
            column_of.update({name: cells.index(name) for name in cells})
            continue
        kind = cells[column_of["type"]]
        if not (kind.lower() in specific_annotation
                or kind in specific_annotation):
            continue
        if "annotation" not in column_of:
            # debug
            config.logger.info("WARNING! No annotation info!\t" + record)
            continue
        selected[cells[column_of["annotation"]]] = ""

    config.logger.info("Finish read_vignettes_file")

    return selected
コード例 #19
0
def collect_basic_info(uniref_list, hits):
    """Gather per-UniRef annotations from a list of ``map_*`` files.

    Only existing files whose base name starts with ``map_`` and contains
    ``uniref90`` or ``uniref50`` are read. The annotation name of a file
    is derived from its base name: ``map_`` is removed, a ``_uniref...``
    suffix is cut off, the names go/ko/eggnog/pfam/level4ec are
    re-capitalised, and ``uniref<digits>_name...`` becomes
    ``Protein_names``. Only the first file for each annotation name is
    used.

    In each file, the first tab-separated column is the annotation value
    and the remaining columns are member ids; the value is recorded for
    every member that is a key of ``hits``. Several values for one member
    and annotation are joined with ``;``.

    Args:
        uniref_list: iterable of file paths (blank entries and entries
            starting with ``#`` are skipped).
        hits: mapping whose keys are the ids of interest.

    Returns:
        tuple: ``(uniref_info, names)`` where ``names`` lists the
        annotation names in the order first met and
        ``uniref_info[id]`` is ``id`` followed by one tab-separated field
        per name (``NA`` where nothing was recorded).
    """
    canonical = {
        "go": "GO",
        "ko": "KO",
        "eggnog": "eggNOG",
        "pfam": "Pfam",
        "level4ec": "Level4EC",
    }
    # Sets are unordered, so sorting before building them was wasted work.
    hit_ids = set(hits)
    names = []
    seen = set()
    collected = {}  # {member id: {annotation name: [values]}}
    for myfile in uniref_list:
        myfile = myfile.strip()
        if not myfile or myfile.startswith("#"):
            continue
        if not os.path.isfile(myfile):
            config.logger.info("ERROR! File does not exist: " + myfile)
            continue
        myname = os.path.basename(myfile)
        if not myname.startswith("map_"):
            continue
        if "uniref90" not in myname and "uniref50" not in myname:
            continue
        myname = myname.replace("map_", "")
        if re.search(r"^\S+_uniref", myname):
            myname = re.sub(r"_uniref\S+$", "", myname)
            myname = canonical.get(myname, myname)
        if re.search(r"^uniref\d+_name", myname):
            myname = "Protein_names"
        if myname in seen:
            # a file for this annotation was already processed
            continue
        seen.add(myname)
        names.append(myname)
        for line in utils.gzip_bzip2_biom_open_readlines(myfile):
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            info = line.split("\t")
            myvalue = info[0]
            for mykey in hit_ids.intersection(info[1:]):
                per_name = collected.setdefault(mykey, {})
                per_name.setdefault(myname, []).append(myvalue)

    # collecting all info
    uniref_info = {}
    for myid, per_name in collected.items():
        fields = [myid]
        fields.extend(
            ";".join(per_name[name]) if name in per_name else "NA"
            for name in names)
        uniref_info[myid] = "\t".join(fields)

    return uniref_info, names
コード例 #20
0
def extract_annotation_info(output_path, maps):
    """Load the UniProt annotation table and export the human Pfam table.

    Reads ``<output_path>/uniprot_annotation.tsv.gz``. The header row (a
    line starting with ``Entry``) is translated to internal column names;
    only the columns listed in ``items`` plus ``NCBI_TaxID`` and
    ``Protein_names`` are tracked. Empty cells are read as ``NA``.

    For every data row an annotation string is stored under the row's
    second column. For rows whose taxon id is ``9606`` the Pfam column is
    split on ``;`` and the row's accession, protein name and gene names
    are attached to each Pfam id. Those are written to
    ``<output_path>/uniprot_human_pfam.tsv``, which is then compressed by
    the external ``gzip`` command.

    Args:
        output_path: directory holding the input and receiving the output.
        maps: mapping from taxon id to taxon name.

    Returns:
        tuple: ``(anns, header)`` where ``anns[key]`` is
        ``"<tax>\\t<taxID>\\t<one field per item>"`` and ``header`` is the
        matching tab-separated title line starting with
        ``Rep_Tax\\tRep_TaxID``.

    Raises:
        IndexError: if a data row has fewer than two columns.
    """
    import shlex

    datfile = os.path.join(output_path, "uniprot_annotation.tsv.gz")
    if not os.path.isfile(datfile):
        sys.exit(
            "Error: uniprot annotation file (uniprot_annotation.tsv.gz) doesn't exit, please prepare for it"
        )

    items = [
        "UniProtKB", "Entry_name", "Organism", "Gene_names", "Length", "GO_BP",
        "GO_MF", "GO_CC", "KEGG", "KEGG-KO", "eggNOG", "Interpro",
        "Taxonomic_lineage", "Subcellular_location", "Transmembrane",
        "Signal_peptide", "Pfam"
    ]
    # Header names that are renamed only on an exact match.
    exact_names = {
        "Entry": "UniProtKB",
        "Entry name": "Entry_name",
        "Gene names": "Gene_names",
        "Gene ontology (biological process)": "GO_BP",
        "Gene ontology (molecular function)": "GO_MF",
        "Gene ontology (cellular component)": "GO_CC",
        "Protein names": "Protein_names",
    }
    # Header names that are renamed when they contain a keyword; applied
    # in this order, each one seeing the result of the previous.
    keyword_names = (
        ("KEGG", "KEGG"),
        ("(KO)", "KEGG-KO"),
        ("COG", "eggNOG"),
        ("Taxonomic", "Taxonomic_lineage"),
        ("Subcellular location", "Subcellular_location"),
        ("Signal peptide", "Signal_peptide"),
        ("Pfam", "Pfam"),
    )
    tracked = set(items) | {"NCBI_TaxID", "Protein_names"}

    anns = {}
    human_pfams = {}  # {pfam: {accession: (protein, gene)}}
    titles = {}  # column position -> internal column name
    for line in utils.gzip_bzip2_biom_open_readlines(datfile):
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        info = line.split("\t")
        if line.startswith("Entry"):
            for myindex, item in enumerate(info):
                item = exact_names.get(item, item)
                for keyword, name in keyword_names:
                    if keyword in item:
                        item = name
                if item in tracked:
                    titles[myindex] = item
            continue

        entry_name = info[1]
        tax = "NA"
        taxID = "NA"
        uniprot_id = "NA"
        gene = "NA"
        protein = "NA"
        pfam_item = "NA"
        values = {}
        for myindex, item in enumerate(info):
            myid = titles.get(myindex)
            if myid is None:
                continue
            if item == "":
                item = "NA"
            if myid == "NCBI_TaxID":
                taxID = item
                if taxID in maps:
                    tax = maps[taxID]
                continue
            if item != "NA":
                if myid == "UniProtKB":
                    uniprot_id = item
                elif myid == "Gene_names":
                    gene = re.sub(r";$", "", item)
                    gene = re.sub(r"\{[^\{]+\}", "", gene)
                    gene = re.sub(r"\s+", ";", gene)
                elif myid == "Protein_names":
                    protein = re.sub(r";$", "", item)
                    protein = re.sub(r"\{[^\{]+\}", "", protein)
                    protein = re.sub(r"\s+$", "", protein)
                elif myid == "Pfam":
                    pfam_item = item
            values[myid] = item

        mystr_out = "\t".join(values.get(name, "NA") for name in items)
        anns[entry_name] = tax + "\t" + taxID + "\t" + mystr_out

        if taxID == "9606" and pfam_item != "NA":  # human proteins
            for pfam_id in re.sub(r"\s+", "", pfam_item).split(";"):
                if not pfam_id:
                    # A trailing ";" used to create an empty Pfam id whose
                    # output row began with a tab.
                    continue
                members = human_pfams.setdefault(pfam_id, {})
                if uniprot_id != "NA":
                    members[uniprot_id] = (protein, gene)

    # report human pfams
    outfile = os.path.join(output_path, "uniprot_human_pfam.tsv")
    with open(outfile, "w") as open_out:
        open_out.write(
            "Pfam\tOrganism\tNCBI_TaxID\tUniProtKB\tProtein_names\tGene_names\n")
        for pfam_id in sorted(human_pfams):
            members = human_pfams[pfam_id]
            accessions = sorted(members)
            proteins = [
                members[acc][0] for acc in accessions
                if members[acc][0] not in ("NA", "")
            ]
            genes = [
                members[acc][1] for acc in accessions
                if members[acc][1] not in ("NA", "")
            ]
            open_out.write("\t".join((
                pfam_id,
                "Homo sapiens (Human)",
                "9606",
                ";".join(accessions) or "NA",
                ";".join(proteins) or "NA",
                ";".join(genes) or "NA",
            )) + "\n")
    # Quote the path so spaces or shell metacharacters cannot break the
    # command.
    os.system("gzip " + shlex.quote(outfile))

    header = "Rep_Tax\tRep_TaxID\t" + "\t".join(items)

    return anns, header
コード例 #21
0
def extract_annotation_info(datfile, output_path):
    """Convert a UniProt flat file into a gzip-compressed annotation table.

    The table is written to ``<output_path>/uniprot_annotation.tsv`` and
    compressed with the external ``gzip`` command. If the compressed file
    already exists nothing is done; if only the uncompressed file exists
    it is compressed and reused.

    Per entry (terminated by a ``//`` line) the following line types are
    read: ID (entry name, length), AC (first accession on the line), GN,
    DE (``Full=`` names), OS, OX (``NCBI_TaxID``), OC, DR lines for KEGG,
    KO, eggNOG, InterPro, GO and Pfam, CC ``SUBCELLULAR LOCATION`` topics
    with their continuation lines, and FT ``TRANSMEM`` / ``SIGNAL``
    features. Repeated values are joined with ``;``, missing values are
    written as ``NA``, and the organism is appended to the lineage column.

    Changes from the previous version: a record still pending at the end
    of the input is written with the same column layout as all others
    (it used to lose the gene-name column and swap COG and InterPro), no
    ``NA``-only row is written after the final ``//``, the subcellular
    continuation flag is initialised and reset per entry, and tag lines
    that do not have the expected shape are skipped instead of raising.

    NOTE(review): when an entry has several AC lines, the first accession
    of the last AC line is kept, as before -- confirm this is intended.

    Args:
        datfile: path of the UniProt ``.dat`` file.
        output_path: directory that receives the table.

    Returns:
        str: path of the compressed table. (The normal path used to return
        ``None`` while the two early-exit paths returned this path.)
        Exits via ``sys.exit`` when ``datfile`` is not an existing file.
    """
    import shlex

    title = "\t".join((
        "Entry", "Entry name", "Gene names", "Protein names", "Organism",
        "NCBI_TaxID", "Length", "Gene ontology (biological process)",
        "Gene ontology (molecular function)",
        "Gene ontology (cellular component)", "Cross-reference (KEGG)",
        "Cross-reference (KO)", "COG", "Interpro", "Taxonomic lineage (ALL)",
        "Subcellular location [CC]", "Transmembrane", "Signal peptide",
        "Cross-reference (Pfam)",
    ))
    outfile = os.path.join(output_path, "uniprot_annotation.tsv")
    outfile1 = os.path.join(output_path, "uniprot_annotation.tsv.gz")
    if not os.path.isfile(datfile):
        sys.exit("Error: please download the uniprot dat file!")
    if os.path.isfile(outfile) and not os.path.isfile(outfile1):
        os.system("gzip " + shlex.quote(outfile))
        config.logger.info("WARNING! Already exist file and skip this step: " +
                           outfile1)
        return outfile1
    if os.path.isfile(outfile1):
        config.logger.info("WARNING! Already exist file and skip this step: " +
                           outfile1)
        return outfile1

    # DR database name -> record field receiving the second DR field.
    dr_targets = {
        "KEGG": "kegg",
        "KO": "ko",
        "eggNOG": "cog",
        "InterPro": "interpro",
        "Pfam": "pfam",
    }
    # GO aspect prefix -> record field.
    go_targets = {"P:": "go_bp", "F:": "go_mf", "C:": "go_cc"}

    record = _new_uniprot_record()
    in_subcellular = False  # inside a CC SUBCELLULAR LOCATION topic
    with open(outfile, "w") as open_out:
        open_out.write(title + "\n")
        for line in utils.gzip_bzip2_biom_open_readlines(datfile):
            line = line.strip()
            if not line:
                continue
            if line == "//":  # end of one entry
                _write_uniprot_record(open_out, record)
                record = _new_uniprot_record()
                in_subcellular = False
                continue

            match = re.match(r"ID\s+(\S+)\s+[\s\S]+\s+(\d+)\s+AA", line)
            if match:  # entry name and sequence length
                record["entry_name"] = match.group(1)
                record["length"] = match.group(2)
                continue

            match = re.match(r"AC\s+(\S+)", line)
            if match:  # accession
                record["entry"] = re.sub(r";$", "", match.group(1))
                continue

            match = re.match(r"GN\s+(\S[\s\S]*)", line)
            if match:  # gene names: every "key=value" token contributes
                compact = re.sub(r"\s+", "", match.group(1))
                found = []
                for token in compact.split(";"):
                    hit = re.search(r"=([^=]+)", token)
                    if hit:
                        found.append(hit.group(1))
                _append_value(record, "gene_name", ";".join(found))
                continue

            match = re.match(r"DE\s+[\s\S]+Full=([\S\s]+)", line)
            if match:  # protein name
                _append_value(record, "prot_name",
                              re.sub(r";$", "", match.group(1)))
                continue

            match = re.match(r"OS\s+([\S\s]+)", line)
            if match:  # organism
                _append_value(record, "organism",
                              re.sub(r"\.$", "", match.group(1)))
                continue

            match = re.match(r"OX\s+NCBI_TaxID=(\d+)", line)
            if match:  # NCBI taxonomy ID
                _append_value(record, "taxa", match.group(1))
                continue

            match = re.match(r"OC\s+([\S\s]+)", line)
            if match:  # taxonomy lineage
                lineage = re.sub(r"\.$", "", match.group(1))
                _append_value(record, "tax", re.sub(r";$", "", lineage))
                continue

            match = re.match(r"DR\s+(KEGG|KO|eggNOG|InterPro|GO|Pfam);", line)
            if match:  # cross-references
                # The "DR   <db>" prefix holds no "; ", so fields[1:] are
                # the same as when splitting only the part after "DR".
                fields = line.split("; ")
                database = match.group(1)
                if database == "GO":
                    if len(fields) > 2:
                        target = go_targets.get(fields[2][:2])
                        if target:
                            _append_value(
                                record, target,
                                fields[2][2:] + " [" + fields[1] + "]")
                elif len(fields) > 1:
                    _append_value(record, dr_targets[database], fields[1])
                continue

            match = re.match(r"CC\s+", line)
            if match:  # comments
                body = line[match.end():]
                topic = re.match(r"-!-\s+SUBCELLULAR\s+LOCATION\s*", body)
                if topic:
                    in_subcellular = True
                    _append_value(
                        record, "sub",
                        "SUBCELLULAR LOCATION" + body[topic.end():])
                elif body.startswith("-!-"):  # some other topic
                    in_subcellular = False
                elif in_subcellular:  # continuation of the topic
                    record["sub"] = (record["sub"] + " " +
                                     re.sub(r"\.$", "", body))
                continue

            if re.match(r"FT\s+", line):  # features
                if re.match(r"FT\s+TRANSMEM\s+", line):
                    _append_value(record, "trans", "TRANSMEM")
                if re.match(r"FT\s+SIGNAL\s+", line):
                    _append_value(record, "signal", "SIGNAL")
                continue

        # An entry that was not closed by "//" is still pending here.
        _write_uniprot_record(open_out, record)

    # Quote the path so spaces or shell metacharacters cannot break the
    # command.
    os.system("gzip " + shlex.quote(outfile))
    return outfile1


def _new_uniprot_record():
    """Return a record with every output field set to the "NA" placeholder."""
    return dict.fromkeys((
        "entry", "entry_name", "gene_name", "prot_name", "organism", "taxa",
        "length", "go_bp", "go_mf", "go_cc", "kegg", "ko", "cog", "interpro",
        "tax", "sub", "trans", "signal", "pfam",
    ), "NA")


def _append_value(record, key, value):
    """Store value under key, joining with ";" once the field holds data."""
    if record[key] == "NA":
        record[key] = value
    else:
        record[key] = record[key] + ";" + value


def _write_uniprot_record(handle, record):
    """Write one table row in title order; records without accession are skipped."""
    if record["entry"] == "NA":
        return
    lineage = record["tax"] + ";" + record["organism"]
    handle.write("\t".join((
        record["entry"], record["entry_name"], record["gene_name"],
        record["prot_name"], record["organism"], record["taxa"],
        record["length"], record["go_bp"], record["go_mf"], record["go_cc"],
        record["kegg"], record["ko"], record["cog"], record["interpro"],
        lineage, record["sub"], record["trans"], record["signal"],
        record["pfam"],
    )) + "\n")
コード例 #22
0
def _gzip_in_place(path):
	"""Compress *path* by running the external ``gzip`` program on it.

	The file name is passed as a separate argument (no shell involved), so
	paths containing spaces or shell metacharacters are handled literally.
	The exit status of ``gzip`` is ignored.
	"""
	import subprocess

	subprocess.call(["gzip", path])


def format_taxonomy_info(taxafile, output_path):	# uniprot_taxonomy.tsv
	"""Reformat a taxonomy table and write prefixed-lineage output files.

	The input is read through ``utils.gzip_bzip2_biom_open_readlines``.
	Blank lines and lines starting with ``#`` are ignored.  A line starting
	with ``Taxon`` is treated as the tab-separated header; the columns
	``Taxon``, ``Scientific name``, ``Rank``, ``Lineage`` and ``Parent`` are
	then looked up by name.  Records with an empty scientific name are
	skipped.

	Three files are written into ``output_path`` and each one is then
	passed to the external ``gzip`` program:

	* ``uniprot_taxonomy.tsv`` -- one row per taxon (sorted by taxon id as a
	  string) with the lineage rewritten as ``|``-joined levels carrying
	  ``k__``/``p__``/``c__``/``o__``/``f__``/``g__``/``s__`` prefixes
	  (``t__`` for a taxon whose rank is empty).
	* ``uniprot_taxaID_microbiome.txt`` -- ids whose formatted lineage
	  contains ``k__Viruses``, ``k__Bacteria``, ``k__Archaea`` or
	  ``k__Fungi``.
	* ``uniprot_taxaID_mammalia.txt`` -- ids whose formatted lineage
	  contains ``c__Mammalia``.

	Args:
		taxafile: path of the taxonomy table to read.
		output_path: directory in which the output files are created.

	Raises:
		KeyError: if a data row is met before a header line, or the header
			lacks one of the required column names.
	"""
	# Ordered (prefix, rank names) table.  The rank groups are disjoint, so
	# a rank maps to at most one prefix.  "Species subsgroup" is kept as it
	# appeared in the original rank list.
	rank_groups = [
		("k__", ("Superkingdom", "Kingdom", "Subkingdom")),
		("p__", ("Superphylum", "Phylum", "Subphylum")),
		("c__", ("Superclass", "Class", "Subclass", "Infraclass", "Cohort")),
		("o__", ("Superorder", "Order", "Suborder", "Infraorder", "Parvorder")),
		("f__", ("Superfamily", "Family", "Subfamily", "Tribe", "Subtribe")),
		("g__", ("Genus", "Subgenus")),
		("s__", ("Species", "Species group", "Species subgroup",
			"Species subsgroup", "Subspecies", "Forma", "Varietas")),
	]
	prefix_of_rank = {}
	for prefix, ranks in rank_groups:
		for rank in ranks:
			prefix_of_rank[rank] = prefix
	# scientific names seen for each prefix; used to tag lineage items
	names_with_prefix = dict((prefix, set()) for prefix, _ranks in rank_groups)
	whitespace = re.compile(r"\s+")

	# collect info
	titles = {}
	taxa_info = {}	# taxon id -> (name, rank, lineage, level, parent)
	title_num = 0
	for line in utils.gzip_bzip2_biom_open_readlines(taxafile):
		line = line.strip()
		if not line or line.startswith("#"):
			continue
		info = line.split("\t")
		if line.startswith("Taxon"):
			# title line
			for myindex, item in enumerate(info):
				titles[item] = myindex
			title_num = len(info)
			continue

		# strip() above also drops trailing empty columns; pad them back
		info.extend([""] * (title_num - len(info)))

		mytaxa = info[titles["Taxon"]]
		myname = info[titles["Scientific name"]]
		if myname == "":
			continue
		myrank = info[titles["Rank"]]
		myline = info[titles["Lineage"]]
		mypar = info[titles["Parent"]]
		mylevel = whitespace.sub("_", myname)
		prefix = prefix_of_rank.get(myrank)
		if prefix is not None:
			mylevel = prefix + mylevel
			names_with_prefix[prefix].add(myname)
		taxa_info[mytaxa] = (myname, myrank, myline, mylevel, mypar)
	# foreach line

	## format taxonomy info
	mic_types = ["k__Viruses", "k__Bacteria", "k__Archaea", "k__Fungi"]
	mammalia_types = ["c__Mammalia"]
	# Checked in this order for taxa without a rank; a later match overrides
	# an earlier one.
	rank_for_marker = [
		("t__", "Terminal"),
		("s__", "Species"),
		("g__", "Genus"),
		("f__", "Family"),
		("o__", "Order"),
		("c__", "Class"),
		("p__", "Phylum"),
		("k__", "Kingdom"),
	]
	mic_ids = set()
	mammalia_ids = set()
	outfile = os.path.join(output_path, "uniprot_taxonomy.tsv")
	with open(outfile, "w") as open_out:
		open_out.write("Taxon\tScientific_name\tRank\tLineage\tParent\n")
		for mytaxa in sorted(taxa_info):
			myname, myrank, myline, mylevel, mypar = taxa_info[mytaxa]
			if myrank == "":
				mylevel = "t__" + mylevel

			mystr = ""
			for item in myline.split("; "):
				# Each test sees the item as left by the previous one, so
				# a name known at several ranks is normally tagged once.
				for prefix, _ranks in rank_groups:
					if item in names_with_prefix[prefix]:
						item = prefix + item
				if item != "":
					mystr = mystr + whitespace.sub("_", item) + "|"
			# foreach item

			# check parent: add it when the lineage skipped the nearest ancestor
			if mypar in taxa_info:
				mypar_level = taxa_info[mypar][3]
				if mypar_level not in mystr.split("|"):
					mystr = mystr + mypar_level + "|"
			mystr = mystr + mylevel

			# rename empty myrank
			if myrank == "":
				for marker, rank_name in rank_for_marker:
					if marker in mylevel:
						myrank = rank_name
			open_out.write(mytaxa + "\t" + myname + "\t" + myrank + "\t" + mystr + "\t" + mypar + "\n")

			if any(i in mystr for i in mic_types):
				mic_ids.add(mytaxa)
			if any(i in mystr for i in mammalia_types):
				mammalia_ids.add(mytaxa)
		# foreach taxon
	_gzip_in_place(outfile)

	# output microbiome and mammalia taxa ids
	id_lists = (
		("uniprot_taxaID_microbiome.txt", mic_ids),
		("uniprot_taxaID_mammalia.txt", mammalia_ids),
	)
	for filename, ids in id_lists:
		outfile = os.path.join(output_path, filename)
		with open(outfile, "w") as open_out:
			for myid in sorted(ids):
				open_out.write(myid + "\n")
		_gzip_in_place(outfile)