Example #1
0
def match_inchi():
    '''
    For every training sentence, compare the InChIs of its tagged chemicals
    with the substrates and products of each reaction listed for the
    sentence's abstract, and dump the sentences for which match_criteria
    accepts the overlap.

    input: train_sentences.json, train_tag_sentences.json,
           train_abstracts.json, chemid_inchi_map.json
    output: train_match_inchi.json

    The output maps each matching sentence id to
    {'sentence': <sentence>, 'reactants': {key: [val, ...]}}, where key is
    serialize_rxn applied to the sentence-side entries that overlapped and
    each val is serialize_rxn applied to the full substrate and product
    lists of a matching reaction.

    Raises KeyError if the abstract of a sentence is missing from
    train_abstracts.json, or if a reaction chemical id is missing from
    chemid_inchi_map.json.
    '''
    def _load(path):
        # Close each input file as soon as it has been parsed instead of
        # leaving the handle open until garbage collection.
        with open(path) as handle:
            return json.load(handle)

    chemical_inchi_map = _load('../data/chemid_inchi_map.json')
    sentences = _load('../data/train_sentences.json')
    chemicals = _load('../data/train_tag_sentences.json')
    abstracts_rxns = _load('../data/train_abstracts.json')

    bar = pbar(len(sentences))
    print('Getting matches by inchi')
    bar.start()
    matches = {}
    for i, (sid, sent) in enumerate(sentences.iteritems(), 1):
        bar.update(i)
        # The part of the sentence id before the first '-' is the key of
        # the abstract in train_abstracts.json.
        pmid = sid.split('-')[0]
        chems = chemicals.get(sid, [])
        # NOTE(review): names_to_inchi is used as a mapping whose keys are
        # InChIs; the values are presumably the matching names - confirm.
        inchis = chem_canonicalizer.names_to_inchi(chems)
        inchi_set = set(inchis)
        sentence_reactants = defaultdict(set)
        for reaction in abstracts_rxns[pmid]['reactions'].itervalues():
            substrate_set = set(chemical_inchi_map[str(x)]
                                for x in reaction['substrates'])
            product_set = set(chemical_inchi_map[str(x)]
                              for x in reaction['products'])
            s_intersect = inchi_set.intersection(substrate_set)
            p_intersect = inchi_set.intersection(product_set)
            if match_criteria(s_intersect, p_intersect):
                key = serialize_rxn([inchis[x] for x in s_intersect],
                                    [inchis[x] for x in p_intersect])
                val = serialize_rxn(reaction['substrates'],
                                    reaction['products'])
                # Several reactions can produce the same key; collect all
                # of them under it without duplicates.
                sentence_reactants[key].add(val)
        if sentence_reactants:
            # Sets are not JSON serializable, so convert them to lists.
            reactants = dict((x, list(y))
                             for x, y in sentence_reactants.items())
            matches[sid] = {'sentence': sent,
                            'reactants': reactants,
                            }
    bar.finish()
    # The with block guarantees the output is flushed and closed before
    # the completion message is printed.
    with open('../data/train_match_inchi.json', 'wb') as out:
        json.dump(matches, out, indent=2, sort_keys=True)
    print('Results dumped to ../data/train_match_inchi.json')
Example #2
0
def abstract_stats():
    '''
    Print a summary of the number of abstracts that have at least one
    matching reaction, once by matching inchis and once by matching names.

    An abstract counts as matched by inchi when match_criteria accepts the
    overlap between the InChIs of the chemicals tagged in its sentences and
    the substrate / product InChIs of one of its reactions. It counts as
    matched by name when find_reactants returns a true value for the
    abstract text and the cleaned names of a reaction's substrates and
    products.

    input: chemid_inchi_map.json, train_tag_sentences.json,
           train_clean_chemicals.json, train_abstracts.json
    output: counts printed to stdout (total abstracts, matches by name,
            matches by inchi, their intersection and their union)

    Raises KeyError if a reaction chemical id is missing from
    chemid_inchi_map.json or train_clean_chemicals.json.
    '''
    def _load(path):
        # Close each input file as soon as it has been parsed instead of
        # leaving the handle open until garbage collection.
        with open(path) as handle:
            return json.load(handle)

    chemical_inchi_map = _load('../data/chemid_inchi_map.json')
    chemicals = _load('../data/train_tag_sentences.json')
    clean_chemicals = _load('../data/train_clean_chemicals.json')
    abstracts_rxns = _load('../data/train_abstracts.json')

    # Group the tagged chemicals by abstract: the part of a sentence id
    # before the first '-' is used as the abstract key.
    chemicals_abstract = defaultdict(list)
    for sid, chems in chemicals.iteritems():
        chemicals_abstract[sid.split('-')[0]].extend(chems)

    print('Matching abstracts by inchi')
    # Named inchi_matches so the local does not shadow match_inchi().
    inchi_matches = set()
    bar = pbar(len(abstracts_rxns))
    bar.start()
    for i, (pmid, data) in enumerate(abstracts_rxns.iteritems(), 1):
        bar.update(i)
        chems = chemicals_abstract.get(pmid, [])
        inchi_set = set(chem_canonicalizer.names_to_inchi(chems))
        for reaction in data['reactions'].itervalues():
            substrate_set = set(chemical_inchi_map[str(x)]
                                for x in reaction['substrates'])
            product_set = set(chemical_inchi_map[str(x)]
                              for x in reaction['products'])
            s_intersect = inchi_set.intersection(substrate_set)
            p_intersect = inchi_set.intersection(product_set)
            if match_criteria(s_intersect, p_intersect):
                # One matching reaction is enough for the abstract.
                inchi_matches.add(pmid)
                break
    bar.finish()

    print('Matching abstracts by name')
    name_matches = set()
    bar = pbar(len(abstracts_rxns))
    bar.start()
    for i, (pmid, data) in enumerate(abstracts_rxns.iteritems(), 1):
        bar.update(i)
        abstract = data['abstract']
        for reaction in data['reactions'].itervalues():
            # Pool every cleaned name of every substrate / product id.
            sub_set = set(name
                          for x in reaction['substrates']
                          for name in clean_chemicals[str(x)])
            prod_set = set(name
                           for x in reaction['products']
                           for name in clean_chemicals[str(x)])
            if find_reactants(abstract, sub_set, prod_set):
                # One matching reaction is enough for the abstract.
                name_matches.add(pmid)
                break
    bar.finish()

    print('Abstracts: %s' % len(abstracts_rxns))
    print('Match by name: %s' % len(name_matches))
    print('Match by inchi: %s' % len(inchi_matches))
    print('Intersection: %s' % len(name_matches.intersection(inchi_matches)))
    print('Union: %s' % len(name_matches.union(inchi_matches)))