from pyproto.utils import parse_iter_sdf, rlen

# Walk the ChEBI SDF dump and record every entry whose
# 'PubChem Database Links' field holds more than one 'CID:' link,
# then print what was collected.
path_fn = '../../tmp/ChEBI_complete.sdf'

i = 0        # entries seen so far; stays 0 when the file yields nothing
v_card = []  # [ChEBI ID, [CID links...]] for entries with several CIDs
pc = set()   # never filled in this section; printed unchanged below

# NOTE(review): the original statement was collapsed onto one line, so the
# nesting is reconstructed: the progress counter is assumed to advance once
# per entry (loop level), not only for entries that have PubChem links.
for i, me in enumerate(parse_iter_sdf(path_fn), 1):
    attr = 'PubChem Database Links'
    if attr in me:
        # Keep only the compound-ID links.
        pubchem_id = [link for link in me[attr] if link.startswith('CID:')]
        if rlen(pubchem_id) > 1:
            v_card.append([me['ChEBI ID'], pubchem_id])

    # Progress marker every 5000 entries.
    if not i % 5000:
        print(i)

print("CHEBI SDF")
print(v_card)
print(pc)
# KEGG statistics pass (fragment: the enclosing loop header and the setup of
# `r`, `kegg_id`, `card`, `count`, `nchar` and `parse_KEGG` are not visible here).
#
# Per iteration, as written below:
#   * bump the progress counter `i` and print it every 100 iterations;
#   * skip the iteration when `r.content` is None;
#   * decode `r.content` as UTF-8 and pass it with `kegg_id` to parse_KEGG,
#     which yields `data` and `refs`; skip when either is None;
#   * for every (attr, val) pair in `data` followed by `refs`:
#       - c = rlen(val); card[attr] keeps the largest c seen,
#         and count[attr] is incremented whenever c > 1;
#       - nc = len(val) for a str, otherwise the longest len() among the
#         elements of val; nchar[attr] keeps the largest nc seen.
# After the loop, "kegg:" and the three tables are printed as plain dicts.
#
# NOTE(review): card/count/nchar are indexed without being initialised in this
# fragment -- presumably defaultdict-like objects created earlier; confirm.
# NOTE(review): max([...]) raises ValueError if a non-str `val` is empty --
# verify parse_KEGG never returns empty collections.
# NOTE(review): the statements are collapsed onto one line here; the nesting
# described above is inferred from the syntax and should be checked against
# the original layout.
i += 1 if i % 100 ==0: print(i) if r.content is None: continue data, refs = parse_KEGG(kegg_id, r.content.decode('utf-8')) if data is None or refs is None: continue for attr, val in list(data.items())+list(refs.items()): c = rlen(val) if c > card[attr]: card[attr] = c if c > 1: count[attr] += 1 if isinstance(val, str): nc = len(val) else: nc = max([len(f) for f in val]) if nc > nchar[attr]: nchar[attr] = nc print("kegg:") print(dict(card)) print(dict(count)) print(dict(nchar))