def match_inchi():
    '''
    Match the chemicals tagged in each training sentence against the
    reactions of the sentence's abstract, comparing by InChI.

    input: train_sentences.json, train_tag_sentences.json,
           train_abstracts.json, chemid_inchi_map.json
    output: train_match_inchi.json

    The sentence id is expected to start with the abstract id followed by
    '-' (the part before the first '-' is used to look up the abstract).
    For every reaction of that abstract, the InChIs of its substrates and
    products are intersected with the InChIs obtained from the sentence's
    tagged chemicals; reactions accepted by match_criteria are recorded.

    The output maps each sentence id with at least one accepted reaction to
    {'sentence': <sentence>, 'reactants': {<key>: [<val>, ...]}} where
    <key> is serialize_rxn applied to the matched entries of the sentence
    and <val> is serialize_rxn applied to the reaction's own substrates and
    products.

    Raises KeyError if a sentence's abstract id is missing from
    train_abstracts.json or a reaction chemical id is missing from
    chemid_inchi_map.json.
    '''
    def load_json(path):
        # Close every input file deterministically instead of leaving the
        # handle to the garbage collector.
        with open(path) as handle:
            return json.load(handle)

    chemical_inchi_map = load_json('../data/chemid_inchi_map.json')
    sentences = load_json('../data/train_sentences.json')
    chemicals = load_json('../data/train_tag_sentences.json')
    abstracts_rxns = load_json('../data/train_abstracts.json')

    bar = pbar(len(sentences))
    print('Getting matches by inchi')
    bar.start()
    matches = {}
    for i, (sid, sent) in enumerate(sentences.iteritems(), 1):
        bar.update(i)
        pmid = sid.split('-')[0]
        chems = chemicals.get(sid, [])
        # NOTE(review): names_to_inchi is used here as a mapping keyed by
        # InChI; the values are presumably the matching chemical names --
        # confirm against chem_canonicalizer.
        inchis = chem_canonicalizer.names_to_inchi(chems)
        inchi_set = set(inchis.keys())
        rxns = abstracts_rxns[pmid]['reactions']
        # Several reactions can yield the same sentence-level key, so the
        # reaction serializations are collected in a set per key.
        sentence_reactants = defaultdict(set)
        for reaction in rxns.itervalues():
            substrate_set = set(
                chemical_inchi_map[str(x)] for x in reaction['substrates'])
            product_set = set(
                chemical_inchi_map[str(x)] for x in reaction['products'])
            s_intersect = inchi_set.intersection(substrate_set)
            p_intersect = inchi_set.intersection(product_set)
            if match_criteria(s_intersect, p_intersect):
                key = serialize_rxn([inchis[x] for x in s_intersect],
                                    [inchis[x] for x in p_intersect])
                val = serialize_rxn(reaction['substrates'],
                                    reaction['products'])
                sentence_reactants[key].add(val)
        if sentence_reactants:
            # Sets are not JSON serializable; store them as lists.
            reactants = dict(
                (key, list(vals))
                for key, vals in sentence_reactants.iteritems())
            matches[sid] = {
                'sentence': sent,
                'reactants': reactants,
            }
    bar.finish()

    # The with-block guarantees the file is flushed and closed before the
    # completion message is printed.
    with open('../data/train_match_inchi.json', 'wb') as out:
        json.dump(matches, out, indent=2, sort_keys=True)
    print('Results dumped to ../data/train_match_inchi.json')
def abstract_stats():
    '''
    Print a summary of how many abstracts have at least one matching
    reaction, once matching by InChI and once matching by chemical name.

    Reads chemid_inchi_map.json, train_tag_sentences.json,
    train_clean_chemicals.json and train_abstracts.json from ../data.

    Matching by InChI: the chemicals tagged in all sentences of an abstract
    are converted with chem_canonicalizer.names_to_inchi and intersected
    with the InChIs of each reaction's substrates and products; the
    abstract counts as matched as soon as match_criteria accepts one
    reaction.

    Matching by name: the names listed in train_clean_chemicals.json for a
    reaction's substrates and products are handed to find_reactants
    together with the abstract text; the abstract counts as matched as soon
    as one reaction yields a truthy result.

    Prints the number of abstracts, the size of each matched set, and the
    sizes of their intersection and union. Returns nothing.
    '''
    def load_json(path):
        # Close every input file deterministically instead of leaving the
        # handle to the garbage collector.
        with open(path) as handle:
            return json.load(handle)

    chemical_inchi_map = load_json('../data/chemid_inchi_map.json')
    chemicals = load_json('../data/train_tag_sentences.json')
    clean_chemicals = load_json('../data/train_clean_chemicals.json')
    abstracts_rxns = load_json('../data/train_abstracts.json')

    # Pool the tagged chemicals of all sentences by abstract id (the part
    # of the sentence id before the first '-').
    chemicals_abstract = defaultdict(list)
    for sid, chems in chemicals.iteritems():
        chemicals_abstract[sid.split('-')[0]].extend(chems)

    print('Matching abstracts by inchi')
    # Named so that it does not shadow the module-level match_inchi().
    inchi_matched = set()
    bar = pbar(len(abstracts_rxns))
    bar.start()
    for i, (pmid, data) in enumerate(abstracts_rxns.iteritems(), 1):
        bar.update(i)
        # .get() avoids inserting empty entries into the defaultdict.
        chems = chemicals_abstract.get(pmid, [])
        inchi_set = set(chem_canonicalizer.names_to_inchi(chems))
        for reaction in data['reactions'].itervalues():
            substrate_set = set(
                chemical_inchi_map[str(x)] for x in reaction['substrates'])
            product_set = set(
                chemical_inchi_map[str(x)] for x in reaction['products'])
            s_intersect = inchi_set.intersection(substrate_set)
            p_intersect = inchi_set.intersection(product_set)
            if match_criteria(s_intersect, p_intersect):
                # One accepted reaction is enough for the abstract.
                inchi_matched.add(pmid)
                break
    bar.finish()

    print('Matching abstracts by name')
    name_matched = set()
    bar = pbar(len(abstracts_rxns))
    bar.start()
    for i, (pmid, data) in enumerate(abstracts_rxns.iteritems(), 1):
        bar.update(i)
        abstract = data['abstract']
        for reaction in data['reactions'].itervalues():
            sub_ids = set(reaction['substrates'])
            prod_ids = set(reaction['products'])
            # Flatten the per-chemical name lists into one set per side.
            sub_set = set(
                y for x in sub_ids for y in clean_chemicals[str(x)])
            prod_set = set(
                y for x in prod_ids for y in clean_chemicals[str(x)])
            if find_reactants(abstract, sub_set, prod_set):
                # One reaction found in the text is enough for the abstract.
                name_matched.add(pmid)
                break
    bar.finish()

    print('Abstracts: %s' % len(abstracts_rxns))
    print('Match by name: %s' % len(name_matched))
    print('Match by inchi: %s' % len(inchi_matched))
    print('Intersection: %s' % len(name_matched.intersection(inchi_matched)))
    print('Union: %s' % len(name_matched.union(inchi_matched)))