def extract_citations_unicarbkb_ds(species):
    """Print a CSV of UniCarbKB citations: canonical accession plus PubMed metadata.

    Reads the per-species glycosylation-sites sheet, resolves each evidence
    PMID through get_citation(), and emits each (accession, PMID) pair once.
    """
    black_list = get_blacklisted_pmids(species)
    sheet = {}
    in_file = path_obj["unreviewed"] + "%s_proteoform_glycosylation_sites_unicarbkb.csv" % (species)
    libgly.load_sheet(sheet, in_file, ",")
    field_list = sheet["fields"]
    header = ["uniprotkb_canonical_ac", "pmid", "title", "journal_name", "publication_date", "authors"]
    print("\"%s\"" % ("\",\"".join(header)))
    emitted = {}
    for row in sheet["data"]:
        canon = row[field_list.index("uniprotkb_canonical_ac")]
        pmid = row[field_list.index("evidence")]
        if pmid in black_list:
            continue
        combo_id = "%s %s" % (canon, pmid)
        citation_row = get_citation(pmid)
        # Emit only when citation metadata resolved, and only once per pair
        if citation_row != [] and combo_id not in emitted:
            print("\"%s\"" % ("\",\"".join([canon] + citation_row)))
            emitted[combo_id] = True
    return
def make_taxid2name_ds():
    """Print tax_id -> scientific-name rows for taxa present in the GlyTouCan export."""
    wanted_taxids = {}
    sheet = {}
    in_file = path_obj["downloads"] + "glytoucan/current/export/taxa.tsv"
    libgly.load_sheet(sheet, in_file, "\t")
    field_list = sheet["fields"]
    for row in sheet["data"]:
        wanted_taxids[row[field_list.index("TaxID")]] = True
    print("\"%s\"" % ("\",\"".join(["tax_id", "tax_name"])))
    # names.dmp is pipe-delimited: tax_id | name | unique name | name class |
    in_file = path_obj["downloads"] + "/ncbi/taxonomy/names.dmp"
    with open(in_file, "r") as FR:
        for line in FR:
            parts = line.strip().split("|")
            if parts[3].strip() != "scientific name":
                continue
            tax_id = parts[0].strip()
            tax_name = parts[1].strip()
            if tax_id in wanted_taxids:
                print("\"%s\"" % ("\",\"".join([tax_id, tax_name])))
    return
def extract_sequences_smiles_isomeric_ds():
    """Print glytoucan_ac, pubchem_id, isomeric-SMILES rows as CSV."""
    cid2glytoucan = {}
    sheet = {}
    in_file = path_obj["unreviewed"] + "glycan_xref_pubchem.csv"
    libgly.load_sheet(sheet, in_file, ",")
    field_list = sheet["fields"]
    for row in sheet["data"]:
        ac = row[field_list.index("glytoucan_ac")]
        database_id = row[field_list.index("database_id")]
        # Only PubChem compound IDs (prefixed "CID") are mapped
        if database_id[0:3] == "CID":
            cid2glytoucan[database_id[3:]] = ac
    header = ["glytoucan_ac", "pubchem_id", "sequence_smiles_isomeric"]
    print("\"%s\"" % ("\",\"".join(header)))
    in_file = path_obj["downloads"] + "pubchem/compound/cid2smiles.tsv"
    with open(in_file, "r") as FR:
        for line in FR:
            cid, smiles = line.strip().split("\t")
            if cid in cid2glytoucan:
                print("\"%s\"" % ("\",\"".join([cid2glytoucan[cid], cid, str(smiles)])))
    return
def extract_xref_chebi_from_kegg_ds():
    """Print glytoucan_ac -> ChEBI xref rows derived via KEGG GLYCAN accessions."""
    kegg2chebi = {}
    data_frame = {}
    in_file = path_obj["downloads"] + "chebi/database_accession_current.tsv"
    libgly.load_sheet(data_frame, in_file, "\t")
    f_list = data_frame["fields"]
    for row in data_frame["data"]:
        if row[f_list.index("SOURCE")] != "KEGG GLYCAN":
            continue
        # KEGG accession -> ChEBI compound id
        kegg2chebi[row[f_list.index("ACCESSION_NUMBER")]] = row[f_list.index("COMPOUND_ID")]
    print("\"%s\"" % ("\",\"".join(["glytoucan_ac", "database_id", "database_label"])))
    in_file = path_obj["unreviewed"] + "glycan_xref_kegg.csv"
    libgly.load_sheet(data_frame, in_file, ",")
    f_list = data_frame["fields"]
    for row in data_frame["data"]:
        ac = row[f_list.index("glytoucan_ac")]
        kegg_id = row[f_list.index("database_id")]
        if kegg_id in kegg2chebi:
            print("\"%s\"" % ("\",\"".join([ac, kegg2chebi[kegg_id], "ChEBI"])))
    return
def main():
    """Report dataset fields not declared in generated/misc/field_names.csv.

    Walks every species/category dataset named in the masterlist JSON and
    prints "field,file_name" for any header field missing from the registry.
    """
    in_file = "generated/misc/field_names.csv"
    data_frame = {}
    libgly.load_sheet(data_frame, in_file, ",")
    known_fields = {}
    for row in data_frame["data"]:
        known_fields[row[0]] = True
    config_obj = json.loads(open("/data/projects/glygen/generated/misc/dataset-masterlist.json", "r").read())
    for species in ["human", "mouse", "rat"]:
        for cat in ["protein", "glycan", "proteoform"]:
            # BUG FIX: copy before extending. The original "ds_list += ..."
            # extended config_obj[cat]["common"] in place, so one species'
            # datasets leaked into every later species' iteration.
            ds_list = list(config_obj[cat]["common"])
            ds_list += config_obj[cat][species] if species in config_obj[cat] else []
            for ds in ds_list:
                if ds in ["allsequences", "canonicalsequences"]:
                    continue
                ext = "csv"
                file_name = "%s_%s_%s.%s" % (species, cat, ds, ext)
                in_file = "unreviewed/%s" % (file_name)
                if os.path.isfile(in_file) == False:
                    continue
                data_frame = {}
                libgly.load_sheet(data_frame, in_file, ",")
                for f in data_frame["fields"]:
                    if f not in known_fields:
                        print("%s,%s" % (f, file_name))
def extract_motif_ds():
    """Print motif annotation rows (glycan ac, motif ac, label, reducing-end flag)."""
    glycan_list = load_glycan_masterlist()
    header = ["glytoucan_ac", "glytoucan_ac_motif", "motif_name", "is_reducing_end"]
    print("\"%s\"" % ("\",\"".join(header)))
    sheet = {}
    in_file = path_obj["downloads"] + "glytoucan/current/export/allmotif.tsv"
    libgly.load_sheet(sheet, in_file, "\t")
    field_list = sheet["fields"]
    emitted = {}
    for row in sheet["data"]:
        ac = row[field_list.index("GlyTouCanAccession")]
        motif_ac = row[field_list.index("MotifAccession")]
        motif_label = row[field_list.index("Label")]
        is_reducing_end = row[field_list.index("IsReducingEnd")]
        combo_id = "%s %s" % (ac, motif_ac)
        # One row per (glycan, motif) pair, masterlist glycans only
        if ac in glycan_list and combo_id not in emitted:
            emitted[combo_id] = True
            print("\"%s\"" % ("\",\"".join([ac, motif_ac, motif_label, is_reducing_end])))
    return
def extract_images_ds():
    """Stage PNG images for masterlist glycans and pack them into glycan_images.tar.gz.

    Shells out via commands.getoutput for the copy/tar/gzip steps (Python 2).
    """
    images_dir = path_obj["downloads"] + "glytoucan/current/export/images-cfg-extended/cfg/extended/"
    # Remove intermediate/glycanimages dir and create it again
    cmd = "rm -rf %s/glycanimages" % (path_obj["intermediate"])
    x = commands.getoutput(cmd)
    cmd = "mkdir %s/glycanimages" % (path_obj["intermediate"])
    x = commands.getoutput(cmd)
    # PERF FIX: use a set for membership — the original list made the loop O(n^2)
    seen = set()
    data_frame = {}
    in_file = path_obj["unreviewed"] + "glycan_masterlist.csv"
    libgly.load_sheet(data_frame, in_file, ",")
    f_list = data_frame["fields"]
    for row in data_frame["data"]:
        ac = row[f_list.index("glytoucan_ac")]
        if ac not in seen:
            seen.add(ac)
            cmd = "cp %s/%s.png %s/glycanimages/" % (images_dir, ac, path_obj["intermediate"])
            x = commands.getoutput(cmd)
    cmd = "/usr/bin/tar -C %s/glycanimages/ -cvf %s/glycan_images.tar ./" % (path_obj["intermediate"], path_obj["unreviewed"])
    x = commands.getoutput(cmd)
    cmd = "/usr/bin/gzip %s/glycan_images.tar" % (path_obj["unreviewed"])
    x = commands.getoutput(cmd)
    return
def main():
    """Rewrite unreviewed-backup CSVs into temp/ with headers renamed per field_names.csv."""
    config_obj = json.loads(open("conf/config.json", "r").read())
    db_obj = config_obj[config_obj["server"]]["dbinfo"]
    data_dir = "unreviewed-backup/"
    field_dict = {}
    in_file = data_dir + "/field_names.csv"
    libgly.load_sheet_as_dict(field_dict, in_file, ",", "field_name_current")
    for in_file in glob.glob(data_dir + "/*.csv"):
        file_name = in_file.split("/")[-1]
        sheet = {}
        libgly.load_sheet(sheet, in_file, ",")
        # Substitute each header with its mapped name when a non-empty mapping exists
        for j, field in enumerate(sheet["fields"]):
            if field in field_dict["data"]:
                if field_dict["data"][field][0][0] != "":
                    sheet["fields"][j] = field_dict["data"][field][0][0]
        out_file = "temp/%s" % (file_name)
        FW = open(out_file, "w")
        FW.write("\"%s\"\n" % ("\",\"".join(sheet["fields"])))
        for row in sheet["data"]:
            FW.write("\"%s\"\n" % ("\",\"".join(row)))
        FW.close()
def extract_sequences_inchi_ds():
    """Print glytoucan_ac, InChI, and InChI-key rows for masterlist glycans."""
    glycan_list = load_glycan_masterlist()
    cid2glytoucan = {}
    sheet = {}
    in_file = path_obj["unreviewed"] + "glycan_xref_pubchem.csv"
    libgly.load_sheet(sheet, in_file, ",")
    field_list = sheet["fields"]
    for row in sheet["data"]:
        database_id = row[field_list.index("database_id")]
        # Only PubChem compound IDs (prefixed "CID") participate
        if database_id[0:3] == "CID":
            cid2glytoucan[database_id[3:]] = row[field_list.index("glytoucan_ac")]
    print("\"%s\"" % ("\",\"".join(["glytoucan_ac", "sequence_inchi", "inchi_key"])))
    sheet = {}
    in_file = "generated/pubchem/compound/cid2inchi.csv"
    libgly.load_sheet(sheet, in_file, ",")
    field_list = sheet["fields"]
    for row in sheet["data"]:
        cid = row[field_list.index("pubchem_cid")]
        if cid not in cid2glytoucan:
            continue
        glytoucan_ac = cid2glytoucan[cid]
        if glytoucan_ac in glycan_list:
            inchi = row[field_list.index("inchi")]
            inchikey = row[field_list.index("inchikey")]
            print("\"%s\"" % ("\",\"".join([glytoucan_ac, inchi, inchikey])))
    return
def main():
    """Download Genomics England PanelApp gene JSON for each HGNC gene symbol.

    Skips genes whose JSON is already on disk or whose query yields no results.
    """
    config_obj = json.loads(open("conf/config.json", "r").read())
    species_obj = config_obj["speciesinfo"]
    global path_obj
    path_obj = config_obj["pathinfo"]
    url_tmplt = "https://panelapp.genomicsengland.co.uk/api/v1/genes/?entity_name=%s"
    sheet = {}
    in_file = "unreviewed/human_protein_xref_hgnc.csv"
    libgly.load_sheet(sheet, in_file, ",")
    field_list = sheet["fields"]
    for row in sheet["data"]:
        canon = row[field_list.index("uniprotkb_canonical_ac")]
        gene_name = row[field_list.index("database_label")]
        out_file = path_obj["downloads"] + "genomics_england/panels/%s.json" % (gene_name)
        # Already downloaded — skip
        if os.path.isfile(out_file) == True:
            continue
        # NOTE(review): verify=False disables TLS certificate validation —
        # confirm this is intentional for this endpoint.
        res = requests.get(url_tmplt % (gene_name), verify=False)
        if res.content.strip() == "":
            continue
        res_obj = json.loads(res.content)
        if res_obj["results"] == []:
            continue
        with open(out_file, 'w') as FW:
            FW.write("%s\n" % (res.content))
        print("downloaded json for %s " % (gene_name))
def extract_citations_glytoucan_ds():
    """Print citation rows (glycan ac + PubMed metadata) from the GlyTouCan pubs export.

    PubMed metadata is parsed from pre-downloaded MEDLINE files
    (downloads/ncbi/pubmed/medline/pmid.<pmid>.txt). Rows whose MEDLINE file
    is absent, or lacks a title (TI) / journal (JT), are skipped.
    """
    # NOTE(review): elsewhere get_blacklisted_pmids takes a species argument —
    # confirm the zero-argument call matches the definition used by this script.
    black_list = get_blacklisted_pmids()
    glycan_list = load_glycan_masterlist()
    in_file = path_obj["downloads"] + "glytoucan/current/export/pubs.tsv"
    data_frame = {}
    libgly.load_sheet(data_frame, in_file, "\t")
    f_list = data_frame["fields"]
    newrow = ["glytoucan_ac", "pmid", "title", "journal_name", "publication_date",
              "authors", "source", "source_id"]
    print("\"%s\"" % ("\",\"".join(newrow)))
    seen = {}
    for row in data_frame["data"]:
        glytoucan_ac = row[f_list.index("GlyTouCanAccession")]
        pmid = row[f_list.index("PubMedID")]
        source = row[f_list.index("Source")]
        source_id = row[f_list.index("SourceID")]
        # Composition-style source IDs are excluded
        if source_id.find("comp_") != -1:
            continue
        if pmid in black_list:
            continue
        combo_id = "%s %s" % (glytoucan_ac, pmid)
        # Skip: non-masterlist glycans, PMID "0", and pairs already emitted
        cond_list = [glytoucan_ac not in glycan_list and glytoucan_ac != ""]
        cond_list.append(pmid in ["0"])
        cond_list.append(combo_id in seen)
        if True in cond_list:
            continue
        medline_file = path_obj["downloads"] + "ncbi/pubmed/medline/pmid.%s.txt" % (pmid)
        if os.path.isfile(medline_file) == False:
            continue
        obj = {}
        with open(medline_file, "r") as FR:
            lcount = 0
            prev_key = ""
            for line in FR:
                lcount += 1
                if lcount > 3:
                    # MEDLINE layout: 4-char tag, separator, value;
                    # a blank tag continues the previous field.
                    key = line[0:4].strip()
                    val = line[5:].strip()
                    if key not in obj:
                        obj[key] = []
                    if key == "":
                        obj[prev_key].append(val)
                    else:
                        obj[key].append(val)
                        # Only a real tag updates prev_key, so runs of
                        # continuation lines keep appending to the right field
                        prev_key = key
        if "TI" not in obj or "JT" not in obj:
            continue
        title = " ".join(obj["TI"])
        journal = " ".join(obj["JT"])
        # BUG FIX: DP (date) and AU (authors) are optional in MEDLINE records;
        # unguarded obj["DP"]/obj["AU"] raised KeyError and aborted the run.
        pubdate = " ".join(obj["DP"]) if "DP" in obj else ""
        authors = ", ".join(obj["AU"]) if "AU" in obj else ""
        newrow = [glytoucan_ac, pmid, title, journal, pubdate, authors, source, source_id]
        print("\"%s\"" % ("\",\"".join(newrow)))
        seen[combo_id] = True
    return
def extract_taxonomy_ds():
    """Print glycan taxonomy rows (glytoucan_ac, tax_id, source, source_id)
    merged from the GlyTouCan taxa.tsv and species.tsv exports, restricted to
    masterlist glycans and de-duplicated across both passes."""
    glycan_list = load_glycan_masterlist()
    newrow = ["glytoucan_ac","tax_id", "source", "source_id"]
    print "\"%s\"" % ("\",\"".join(newrow))
    # De-dup on the full comma-joined row; shared by both passes below
    seen = {}
    # Pass 1: direct taxon assignments from taxa.tsv
    data_frame = {}
    in_file = path_obj["downloads"] + "glytoucan/current/export/taxa.tsv"
    libgly.load_sheet(data_frame, in_file, "\t")
    f_list = data_frame["fields"]
    for row in data_frame["data"]:
        ac = row[f_list.index("GlyTouCanAccession")]
        tax_id = row[f_list.index("TaxID")]
        source = row[f_list.index("Source")]
        source_id = row[f_list.index("SourceID")]
        # Composition-style source IDs are excluded
        if source_id.find("comp_") != -1:
            continue
        if ac in glycan_list:
            newrow = [ac, tax_id, source, source_id]
            newrow_str = ",".join(newrow)
            if newrow_str not in seen:
                print "\"%s\"" % ("\",\"".join(newrow))
                seen[newrow_str] = True
    # Pass 2: taxon evidence parsed out of the free-text Value column of
    # species.tsv (only values mentioning "TaxID" are usable)
    data_frame = {}
    in_file = path_obj["downloads"] + "glytoucan/current/export/species.tsv"
    libgly.load_sheet(data_frame, in_file, "\t")
    f_list = data_frame["fields"]
    for row in data_frame["data"]:
        ac = row[f_list.index("GlyTouCanAccession")]
        # evidence_type is read but not used below
        evidence_type = row[f_list.index("Species")]
        # Strip list punctuation from the Value text before parsing
        evidence_desc = row[f_list.index("Value")].replace("[", "").replace("]", "").replace("'", "")
        if evidence_desc.find("TaxID") == -1:
            continue
        newrow = []
        if evidence_desc.find("GlyTouCan") != -1:
            # Value appears to end with "... TaxID <n>" — take the last token
            tax_id = evidence_desc.split(" ")[-1]
            source = "GlyTouCan"
            source_id = ac
            newrow = [ac,tax_id, source, source_id]
        elif evidence_desc.find("UniCarbKB") != -1:
            tax_id = evidence_desc.split(" ")[-1]
            source = "UniCarbKB"
            # Third-from-last token looks like "<label>:<id>" — keep the id part
            source_id = evidence_desc.split(" ")[-3].split(":")[1]
            if source_id.find("comp_") == -1:
                newrow = [ac,tax_id, source, source_id]
        if ac in glycan_list and newrow != []:
            newrow_str = ",".join(newrow)
            if newrow_str not in seen:
                print "\"%s\"" % ("\",\"".join(newrow))
                seen[newrow_str] = True
    return
def main():
    """Compare dataset CSV headers between the working unreviewed/ tree and a
    released version; print files absent from either side and, for files present
    in both, any difference between the two header field sets."""
    usage = "\n%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-s", "--species", action="store", dest="species", help="human/mouse/rat")
    parser.add_option("-r", "--release", action="store", dest="release", help="1.0.13")
    (options, args) = parser.parse_args()
    # Both -s and -r are required; bail out with usage otherwise
    for file in ([options.species, options.release]):
        if not (file):
            parser.print_help()
            sys.exit(0)
    species = options.species
    release = options.release
    config_obj = json.loads(open("/data/projects/glygen/generated/misc/dataset-masterlist.json", "r").read())
    for cat in ["protein", "glycan", "proteoform"]:
        # NOTE(review): "+=" extends config_obj[cat]["common"] in place; harmless
        # here since each cat is visited once, but copy first if this loop is
        # ever nested inside a species loop.
        ds_list = config_obj[cat]["common"]
        ds_list += config_obj[cat][species] if species in config_obj[cat] else []
        for ds in ds_list:
            if ds in ["allsequences", "canonicalsequences"]:
                continue
            ext = "csv"
            file_name = "%s_%s_%s.%s" % (species, cat, ds, ext)
            in_file_one = "unreviewed/%s" % (file_name)
            in_file_two = "/data/projects/glygen/releases/data/v-%s/reviewed/%s" % (release, file_name)
            if os.path.isfile(in_file_one) == False or os.path.isfile(in_file_two) == False:
                print "%s was not found in both versions" % (file_name)
                continue
            data_frame_one, data_frame_two = {}, {}
            libgly.load_sheet(data_frame_one, in_file_one, ",")
            libgly.load_sheet(data_frame_two, in_file_two, ",")
            f_list_one = data_frame_one["fields"]
            f_list_two = data_frame_two["fields"]
            if f_list_one != f_list_two:
                # Report fields unique to each side, pipe-separated
                set_one = set(f_list_one)
                set_two = set(f_list_two)
                diff_one = "|".join(list(set_one - set_two))
                diff_two = "|".join(list(set_two - set_one))
                print "%s,%s,%s" % (file_name, diff_one, diff_two)
def get_blacklisted_pmids(species):
    """Return the sorted, de-duplicated list of blacklisted PMIDs for *species*.

    A missing blacklist file simply means no PMIDs are blacklisted.
    """
    pmid_list = []
    in_file = "compiled/%s_protein_blacklisted_pmids.csv" % (species)
    if os.path.isfile(in_file) == True:
        sheet = {}
        libgly.load_sheet(sheet, in_file, ",")
        for row in sheet["data"]:
            pmid_list.append(row[0])
    pmid_list = sorted(set(pmid_list))
    return pmid_list
def load_canon2xref(in_file, map_dict_one, map_dict_two):
    """Populate two xref maps from a canon-xref CSV.

    map_dict_one: database_id -> uniprotkb_canonical_ac
    map_dict_two: uniprotkb_canonical_ac -> {"id": database_id, "name": database_label}
    """
    sheet_obj = {}
    libgly.load_sheet(sheet_obj, in_file, ",")
    f_list = sheet_obj["fields"]
    # Hoist the column lookups out of the row loop
    idx_id = f_list.index("database_id")
    idx_canon = f_list.index("uniprotkb_canonical_ac")
    idx_label = f_list.index("database_label")
    for row in sheet_obj["data"]:
        map_dict_one[row[idx_id]] = row[idx_canon]
        map_dict_two[row[idx_canon]] = {"id": row[idx_id], "name": row[idx_label]}
    return
def main():
    """Print, per canonical accession, the count of glycosylation-motif sites,
    the count of mutation sites, and the sorted set of motif types observed."""
    # CONSISTENCY FIX: "glycosylation" was misspelled "glcoyslation"; the keys
    # are internal only, so renaming them does not change the printed output.
    site_dict = {"glycosylation": {}, "mutation": {}}
    site_type_dict = {"glycosylation": {}, "mutation": {}}
    for in_file in glob.glob("unreviewed/*_protein_glycosylation_motifs.csv"):
        data_frame = {}
        libgly.load_sheet(data_frame, in_file, ",")
        f_list = data_frame["fields"]
        for row in data_frame["data"]:
            canon = row[f_list.index("uniprotkb_canonical_ac")]
            start_pos = row[f_list.index("start_pos")]
            motif = row[f_list.index("motif")]
            motif_type = motif[0]  # motif type = first character of the motif string
            if canon not in site_dict["glycosylation"]:
                site_dict["glycosylation"][canon] = []
                site_type_dict["glycosylation"][canon] = []
            if start_pos not in site_dict["glycosylation"][canon]:
                site_dict["glycosylation"][canon].append(start_pos)
                site_type_dict["glycosylation"][canon].append(motif_type)
    for in_file in glob.glob("unreviewed/*_protein_mutation.csv"):
        data_frame = {}
        libgly.load_sheet(data_frame, in_file, ",")
        f_list = data_frame["fields"]
        for row in data_frame["data"]:
            canon = row[f_list.index("uniprotkb_canonical_ac")]
            start_pos = row[f_list.index("aa_pos")]
            if canon not in site_dict["mutation"]:
                site_dict["mutation"][canon] = []
            if start_pos not in site_dict["mutation"][canon]:
                site_dict["mutation"][canon].append(start_pos)
    newrow = ["uniprotkb_canonical_ac", "glycosylation_site_count",
              "mutation_site_count", "motif_types"]
    print("\"%s\"" % ("\",\"".join(newrow)))
    # list(...) + list(...) instead of keys()+keys(): works on both py2 lists
    # and py3 dict views
    canon_list = sorted(list(set(
        list(site_dict["glycosylation"].keys()) + list(site_dict["mutation"].keys()))))
    for canon in canon_list:
        g1, g2, m1 = "", "", ""
        if canon in site_dict["glycosylation"]:
            g1 = str(len(site_dict["glycosylation"][canon]))
            g2 = ";".join(sorted(list(set(site_type_dict["glycosylation"][canon]))))
        if canon in site_dict["mutation"]:
            m1 = str(len(site_dict["mutation"][canon]))
        newrow = [canon, g1, m1, g2]
        print("\"%s\"" % ("\",\"".join(newrow)))
def load_glycan_masterlist():
    """Return the list of masterlist GlyTouCan accessions, first-seen order,
    without duplicates."""
    glycan_list = []
    # PERF FIX: set for membership tests; the original "ac not in glycan_list"
    # on a list made loading O(n^2).
    seen = set()
    data_frame = {}
    in_file = path_obj["unreviewed"] + "glycan_masterlist.csv"
    libgly.load_sheet(data_frame, in_file, ",")
    f_list = data_frame["fields"]
    for row in data_frame["data"]:
        ac = row[f_list.index("glytoucan_ac")]
        if ac not in seen:
            seen.add(ac)
            glycan_list.append(ac)
    return glycan_list
def main():
    """Filter gzipped PubChem SDF archives down to the compounds that map to a
    GlyTouCan accession; write one filtered .sdf per input chunk and delete
    outputs that end up with no matching records."""
    config_obj = json.loads(open("conf/config.json", "r").read())
    species_obj = config_obj["speciesinfo"]
    # CID -> GlyTouCan accession, collected from every species' xref sheet
    cid2glytoucan = {}
    data_frame = {}
    for in_file in glob.glob("unreviewed/*_glycan_xref_pubchem.csv"):
        data_frame = {}
        libgly.load_sheet(data_frame, in_file, ",")
        f_list = data_frame["fields"]
        for row in data_frame["data"]:
            ac = row[f_list.index("glytoucan_ac")]
            database_id = row[f_list.index("database_id")]
            if database_id[0:3] == "CID":
                cid = database_id[3:]
                cid2glytoucan[cid] = ac
    seen = {}
    start = 1
    end = 25000
    n_failed = 0
    # PubChem ships SDF in 25000-compound chunks named with zero-padded
    # 9-digit CID ranges, e.g. Compound_000000001_000025000.sdf.gz
    for i in xrange(0, 10000):
        start = i * 25000 + 1
        end = start + 25000 - 1
        # Left-pad to 9 digits
        start = "000000000"[0:-len(str(start))] + str(start)
        end = "000000000"[0:-len(str(end))] + str(end)
        in_file = "downloads/pubchem/compound/sdf/Compound_%s_%s.sdf.gz" % (start, end)
        out_file = "downloads/pubchem/compound/sdf4glygen/Compound_%s_%s.sdf" % (start, end)
        if os.path.isfile(in_file) == True:
            FW = open(out_file, "w")
            # flag: did any record in this chunk match a GlyTouCan CID?
            flag = False
            with gzip.open(in_file, 'rb') as FR:
                prev_line, cid, buf = "", "", ""
                for line in FR:
                    # buf accumulates the raw record; "$$$$" terminates an
                    # SDF record, and the CID value is the line following
                    # the "> <PUBCHEM_COMPOUND_CID>" tag line
                    buf += line
                    line = line.strip()
                    if line == "$$$$":
                        if cid != "" and cid in cid2glytoucan:
                            flag = True
                            FW.write("%s" % (buf))
                        cid, buf = "", ""
                    if prev_line == "> <PUBCHEM_COMPOUND_CID>":
                        cid = line
                    prev_line = line
            FW.close()
            # Drop empty outputs so downstream globbing only sees useful chunks
            if flag == False:
                cmd = "rm -f %s" % (out_file)
                x = commands.getoutput(cmd)
def extract_xref_chebi_ds():
    """Print glytoucan_ac -> ChEBI xref rows derived from PubChem synonyms.

    A row is emitted only when both the ChEBI ID and the PubChem CID have a
    known InChI (used purely as a sanity filter on the mapping).
    """
    chebi2inchi = {}
    in_file = path_obj["downloads"] + "chebi/chebiId_inchi.tsv"
    with open(in_file, "r") as FR:
        for line in FR:
            parts = line.strip().split("\t")
            chebi2inchi[parts[0]] = parts[1]
    pubchem2inchi = {}
    in_file = "generated/pubchem/compound/cid2inchi.csv"
    data_frame = {}
    libgly.load_sheet(data_frame, in_file, ",")
    f_list = data_frame["fields"]
    for row in data_frame["data"]:
        pubchem2inchi[row[f_list.index("pubchem_cid")]] = row[f_list.index("inchi")]
    cid2glytoucan = {}
    data_frame = {}
    in_file = path_obj["unreviewed"] + "glycan_xref_pubchem.csv"
    libgly.load_sheet(data_frame, in_file, ",")
    f_list = data_frame["fields"]
    for row in data_frame["data"]:
        ac = row[f_list.index("glytoucan_ac")]
        database_id = row[f_list.index("database_id")]
        if database_id[0:3] == "CID":
            cid2glytoucan[database_id[3:]] = ac
    newrow = ["glytoucan_ac", "database_id", "database_label"]
    print("\"%s\"" % ("\",\"".join(newrow)))
    in_file = path_obj["downloads"] + "pubchem/compound/cid2synonym.tsv"
    with open(in_file, "r") as FR:
        for line in FR:
            cid, chebi_id = line.strip().split("\t")
            if cid in cid2glytoucan and chebi_id[0:6] == "CHEBI:":
                # CLEANUP: the original also assigned chebi_inchi/pubchem_inchi
                # locals that were never used; only the membership test matters.
                if chebi_id[6:] in chebi2inchi and cid in pubchem2inchi:
                    newrow = [cid2glytoucan[cid], chebi_id.split(":")[1], "ChEBI"]
                    print("\"%s\"" % ("\",\"".join(newrow)))
    return
def extract_fully_determined_ds():
    """Print the masterlist glycans flagged as fully determined by GlyTouCan."""
    glycan_list = load_glycan_masterlist()
    print("\"%s\"" % ("\",\"".join(["glytoucan_ac"])))
    sheet = {}
    in_file = path_obj["downloads"] + "glytoucan/current/export/fully_determined.tsv"
    libgly.load_sheet(sheet, in_file, "\t")
    field_list = sheet["fields"]
    for row in sheet["data"]:
        ac = row[field_list.index("GlyTouCanAccession")]
        if ac in glycan_list:
            print("\"%s\"" % ("\",\"".join([ac])))
    return
def extract_synthesized_ds():
    """Print compiled glycan_synthesized rows restricted to masterlist glycans.

    The input sheet's own header is reused verbatim as the output header.
    """
    glycan_list = load_glycan_masterlist()
    sheet = {}
    in_file = "compiled/glycan_synthesized.csv"
    libgly.load_sheet(sheet, in_file, ",")
    field_list = sheet["fields"]
    print("\"%s\"" % ("\",\"".join(field_list)))
    ac_idx = field_list.index("glytoucan_ac")
    for row in sheet["data"]:
        if row[ac_idx] in glycan_list:
            print("\"%s\"" % ("\",\"".join(row)))
    return
def extract_monosaccharide_composition_ds():
    """Print the GlyTouCan monosaccharide-composition export restricted to
    masterlist glycans; the first header column is renamed to glytoucan_ac."""
    # PERF: set for O(1) membership instead of scanning the list per row
    glycan_set = set(load_glycan_masterlist())
    data_frame = {}
    in_file = path_obj["downloads"] + "glytoucan/current/export/monocomp.tsv"
    libgly.load_sheet(data_frame, in_file, "\t")
    # CLEANUP: copy the header instead of mutating the sheet's field list
    # through an alias; also dropped the unused "seen" dict.
    header = list(data_frame["fields"])
    header[0] = "glytoucan_ac"
    print("\"%s\"" % ("\",\"".join(header)))
    for row in data_frame["data"]:
        if row[0] in glycan_set:
            print("\"%s\"" % ("\",\"".join(row)))
    return
def main():
    """Sanity-check the UniCarbKB human CSV: report the first row whose column
    count does not match the header width."""
    in_file = "downloads/unicarbkb/human29112019.csv"
    # Nothing to check when the download is absent
    if os.path.isfile(in_file) == False:
        return
    sheet = {}
    libgly.load_sheet(sheet, in_file, ",")
    field_list = sheet["fields"]
    n_fields = len(field_list)
    row_count = 0
    for row in sheet["data"]:
        row_count += 1
        if len(row) != n_fields:
            print("Bad row, row number=%s" % (row_count))
            print(in_file)
            print(field_list)
            print(row)
            break
def load_glycosylation_type_two():
    """Return a map of GlyTouCan accession -> list of glycosylation types
    ("n-linked" / "o-linked") from the GlyTouCan classification export."""
    data_frame = {}
    in_file = path_obj["downloads"] + "glytoucan/current/export/classification.tsv"
    libgly.load_sheet(data_frame, in_file, "\t")
    f_list = data_frame["fields"]
    glytoucanac2glycosylationtype = {}
    for row in data_frame["data"]:
        glytoucan_ac = row[f_list.index("GlyTouCanAccession")].strip()
        gly_type = row[f_list.index("Type")].strip().lower()
        # CLEANUP: the Subtype column was read into an unused local; dropped.
        if glytoucan_ac not in glytoucanac2glycosylationtype:
            glytoucanac2glycosylationtype[glytoucan_ac] = []
        # Only n-/o-linked types are recorded, once per accession
        if gly_type in ["n-linked", "o-linked"]:
            if gly_type not in glytoucanac2glycosylationtype[glytoucan_ac]:
                glytoucanac2glycosylationtype[glytoucan_ac].append(gly_type)
    return glytoucanac2glycosylationtype
def main():
    """Validate every unreviewed CSV dataset: each row must match the header
    width. Prints the first bad row per file and a pass/fail summary."""
    in_file = "/data/projects/glygen/generated/misc/dataset-masterlist.json"
    ds_obj_list = json.loads(open(in_file, "r").read())
    ntested, npassed, nfailed = 0, 0, 0
    for obj in ds_obj_list:
        if obj["format"] != "csv":
            continue
        ds_name = obj["name"]
        mol = obj["categories"]["molecule"]
        # Species-qualified file names when species are listed; otherwise a
        # single species-less file
        file_list = []
        for species in obj["categories"]["species"]:
            file_list.append("unreviewed/%s_%s_%s.%s" % (species, mol, ds_name, obj["format"]))
        if file_list == []:
            file_list.append("unreviewed/%s_%s.%s" % (mol, ds_name, obj["format"]))
        for in_file in file_list:
            if os.path.isfile(in_file) == False:
                continue
            sheet = {}
            libgly.load_sheet(sheet, in_file, ",")
            n_fields = len(sheet["fields"])
            flag = True
            for row in sheet["data"]:
                if len(row) != n_fields:
                    flag = False
                    print("Bad row")
                    print(in_file)
                    print(sheet["fields"])
                    print(row)
                    break
            ntested += 1
            if flag == True:
                npassed += 1
            else:
                nfailed += 1
    print("%s tested, %s passed, %s failed" % (ntested, npassed, nfailed))
def load_glycosylation_type_one(species):
    """Return a map of GlyTouCan motif accession -> list of glycosylation types
    ("n-linked" / "o-linked") from the UniCarbKB motif sheet for *species*."""
    data_frame = {}
    in_file = path_obj["downloads"] + "unicarbkb/%s_motif_current.txt" % (species)
    libgly.load_sheet(data_frame, in_file, ",")
    f_list = data_frame["fields"]
    glytoucanac2glycosylationtype = {}
    for row in data_frame["data"]:
        # CLEANUP: the uckb_id column was read into an unused local; dropped.
        glytoucan_ac = row[f_list.index("motif_ac")].strip()
        # Lowercase once; both substring checks below use the same form
        motif_label = row[f_list.index("motif_label")].strip().lower()
        if glytoucan_ac not in glytoucanac2glycosylationtype:
            glytoucanac2glycosylationtype[glytoucan_ac] = []
        type_list = glytoucanac2glycosylationtype[glytoucan_ac]
        if motif_label.find("n-glycan") != -1 and "n-linked" not in type_list:
            type_list.append("n-linked")
        if motif_label.find("o-glycan") != -1 and "o-linked" not in type_list:
            type_list.append("o-linked")
    return glytoucanac2glycosylationtype
def extract_classification_ds():
    """Print glycan type/subtype classification rows for masterlist glycans."""
    glycan_list = load_glycan_masterlist()
    print("\"%s\"" % ("\",\"".join(["glytoucan_ac", "glycan_type", "glycan_subtype"])))
    sheet = {}
    in_file = path_obj["downloads"] + "glytoucan/current/export/classification.tsv"
    libgly.load_sheet(sheet, in_file, "\t")
    field_list = sheet["fields"]
    # Normalize GlyTouCan's "N-linked"/"O-linked" labels to "N-glycan"/"O-glycan"
    rename = {"N-linked": "N-glycan", "O-linked": "O-glycan"}
    for row in sheet["data"]:
        glytoucan_ac = row[field_list.index("GlyTouCanAccession")]
        if glytoucan_ac not in glycan_list:
            continue
        glycan_type = row[field_list.index("Type")]
        glycan_type = rename.get(glycan_type, glycan_type)
        glycan_subtype = row[field_list.index("Subtype")]
        print("\"%s\"" % ("\",\"".join([glytoucan_ac, glycan_type, glycan_subtype])))
    return
def extract_masterlist_ds():
    """Print the glycan masterlist: property rows for every accession that
    appears in the GlyTouCan allglycan export."""
    known_acs = {}
    sheet = {}
    in_file = path_obj["downloads"] + "glytoucan/current/export/allglycan.tsv"
    libgly.load_sheet(sheet, in_file, "\t")
    field_list = sheet["fields"]
    for row in sheet["data"]:
        known_acs[row[field_list.index("GlyTouCanAccession")]] = True
    header = ["glytoucan_ac", "glytoucan_type", "glycan_mass", "glycan_permass",
              "base_composition", "composition", "topology", "monosaccharides"]
    print("\"%s\"" % ("\",\"".join(header)))
    sheet = {}
    in_file = path_obj["downloads"] + "glytoucan/current/export/glycan_properties.tsv"
    libgly.load_sheet(sheet, in_file, "\t")
    field_list = sheet["fields"]
    # Source columns, in output order (first one is the accession)
    prop_cols = ["glytoucan_acc", "glytoucan_type", "glycan_mass", "glycan_permass",
                 "base_composition", "composition", "topology", "monosaccharides"]
    for row in sheet["data"]:
        newrow = [row[field_list.index(c)] for c in prop_cols]
        if newrow[0] in known_acs:
            print("\"%s\"" % ("\",\"".join(newrow)))
    return
def extract_sequences_ds(seq_format):
    """Print glytoucan_ac plus sequence rows; seq_format names a GlyTouCan
    export subdirectory holding one <accession>.txt per glycan."""
    print("\"%s\"" % ("\",\"".join(["glytoucan_ac", "sequence_%s" % (seq_format)])))
    done = []
    sheet = {}
    in_file = path_obj["unreviewed"] + "glycan_masterlist.csv"
    libgly.load_sheet(sheet, in_file, ",")
    field_list = sheet["fields"]
    for row in sheet["data"]:
        ac = row[field_list.index("glytoucan_ac")]
        if ac in done:
            continue
        done.append(ac)
        seq_file = path_obj["downloads"] + "glytoucan/current/export/%s/%s.txt" % (seq_format, ac)
        if os.path.isfile(seq_file) == False:
            continue
        # Join multi-line sequence files into one space-separated string
        with open(seq_file, "r") as FR:
            seq = ""
            for line in FR:
                seq += " " + line.strip()
        print("\"%s\"" % ("\",\"".join([ac, seq.strip()])))
    return
def main():
    """Report header fields in temp/*.csv that are not declared in
    unreviewed/field_names.csv."""
    config_obj = json.loads(open("conf/config.json", "r").read())
    db_obj = config_obj[config_obj["server"]]["dbinfo"]
    field_dict = {}
    in_file = "unreviewed/field_names.csv"
    libgly.load_sheet(field_dict, in_file, ",")
    # Second column (the renamed field) wins when non-empty, else the first
    field_list = []
    for row in field_dict["data"]:
        field_list.append(row[1] if row[1] != "" else row[0])
    for in_file in glob.glob("temp/*.csv"):
        file_name = in_file.split("/")[-1]
        if file_name == "field_names.csv":
            continue
        sheet = {}
        libgly.load_sheet(sheet, in_file, ",")
        for field in sheet["fields"]:
            if field not in field_list:
                print("%s %s %s" % ("undefined", field, file_name))