def tag_sentences():
    '''
    input: train_sentences.json
    output: train_tag_sentences.json

    Tags the chemicals in each sentence using ChemicalTagger
    (http://chemicaltagger.ch.cam.ac.uk/). This method communicates with
    ChemicalTagger through a custom REST API running on pathway.berkeley.edu

    Reads a mapping of sentence id -> sentence text, asks chemtagger for the
    compounds of every sentence and writes a mapping of
    sentence id -> compounds. Sentences for which chemtagger returns nothing
    (or an empty result) are left out of the output.
    '''
    # Context managers guarantee both files are closed (and the output is
    # flushed) even if tagging or serialisation raises.
    with open('../data/train_sentences.json') as infile:
        sentences = json.load(infile)

    bar = pbar(len(sentences))
    print('Tagging chemicals in sentences')
    bar.start()

    chemicals = {}
    # Count from 1 so the progress bar reaches len(sentences) on the last item.
    for i, (sid, sentence) in enumerate(sentences.iteritems(), 1):
        chems = chemtagger.get_compounds(sid, sentence)
        if chems:
            chemicals[sid] = chems
        bar.update(i)
    bar.finish()

    with open('../data/train_tag_sentences.json', 'wb') as outfile:
        json.dump(chemicals, outfile, indent=2, sort_keys=True)
    print('Result dumped to ../data/train_tag_sentences.json')
def extract(sid, sentence):
    '''
    Find pattern matches between the chemicals mentioned in one sentence.

    sid      -- sentence id, forwarded to chemtagger.get_compounds
    sentence -- raw sentence text; it is tokenised on whitespace

    Returns a list of (pattern_id, match) tuples, one for every expanded
    match that is truthy and has more than one element. The list is empty
    when chemtagger returns None for the sentence.
    '''
    reactants = []
    chemicals = chemtagger.get_compounds(sid, sentence)
    if chemicals is None:
        return reactants
    chemicals = sanitize_chemicals(chemicals)

    # Words that are part of a chemical name are kept verbatim; every other
    # word is stemmed. A set gives O(1) membership tests per word.
    chem_tokens = set(tok for chem in chemicals for tok in chem.split())
    stemmed_sentence = ' '.join(
        word if word in chem_tokens else STEMMER.stem(word)
        for word in sentence.split())

    # Pad with spaces so names at the very start/end of the sentence are also
    # surrounded by the ' ' delimiters the replacement below looks for.
    tagged_sentence = ' %s ' % stemmed_sentence
    # Longest names first -- presumably so that a multi-word name is tagged
    # before any shorter name it contains.
    for chem in sorted(chemicals, key=len, reverse=True):
        tagged_sentence = tagged_sentence.replace(
            ' %s ' % chem, ' $%s$chem ' % chem)
    tagged_sentence = tagged_sentence.strip()
    grouped_sentence = group_list(tagged_sentence)

    # Expand the patterns once instead of on every outer-loop iteration.
    # NOTE(review): assumes expand_patterns() returns an equivalent mapping
    # on every call -- confirm against its definition.
    patterns = expand_patterns()
    for pattern_id in patterns:
        for pattern in patterns[pattern_id]:
            groups = pattern.findall(grouped_sentence, overlapped=True)
            for match in expand_chems(groups):
                if match and len(match) > 1:
                    reactants.append((pattern_id, match))
    return reactants