import logging
import os
import shutil
import subprocess
import tempfile

# LTFDocument, LAFDocument, load_doc, write_crfsuite_file and the module-level
# `logger` are provided by the surrounding ne-tagger package.


def updateNEdirs(prev_laf_dir, temp_laf_dir, new_laf_dir):
    """Merge the LAF files from the previous and current iterations into
    new_laf_dir, keeping every previous mention and adding any newly
    discovered mention whose (start_char, end_char) span is not already
    present. Returns the output directory and whether any mentions changed.
    """
    changeinNEs = False
    for fn in os.listdir(prev_laf_dir):
        if fn.endswith('laf.xml'):
            temp_laf = os.path.join(temp_laf_dir, fn)
            if not os.path.exists(temp_laf):
                logging.warn("{} processed last iteration but not this one".format(fn))
    for fn in os.listdir(temp_laf_dir):
        if fn.endswith('laf.xml'):
            prev_laf = os.path.join(prev_laf_dir, fn)
            temp_laf = os.path.join(temp_laf_dir, fn)
            if not os.path.exists(prev_laf):
                logging.warn("{} processed this iteration but not the last. Skipping...".format(fn))
                continue
            prev_laf_doc = load_doc(prev_laf, LAFDocument, logger)
            temp_laf_doc = load_doc(temp_laf, LAFDocument, logger)
            doc_id = prev_laf_doc.doc_id
            prev_mentions = [[tag, extent, start_char, end_char]
                             for [entity_id, tag, extent, start_char, end_char]
                             in prev_laf_doc.mentions()]
            prev_spans = [(start_char, end_char)
                          for [tag, extent, start_char, end_char] in prev_mentions]
            temp_mentions = [[tag, extent, start_char, end_char]
                             for [entity_id, tag, extent, start_char, end_char]
                             in temp_laf_doc.mentions()]
            # Keep all previous mentions; add new ones with unseen spans.
            mentions = list(prev_mentions)
            for m in temp_mentions:
                if (m[2], m[3]) not in prev_spans:
                    mentions.append(m)
                    changeinNEs = True
            # Sort merged mentions by start_char, then end_char.
            mentions = sorted(mentions, key=lambda x: (int(x[2]), int(x[3])))
            # Re-number entity ids over the merged, sorted list.
            numbered_mentions = []
            n = 1
            for tag, extent, start_char, end_char in mentions:
                entity_id = '{}-NE{}'.format(doc_id, n)
                numbered_mentions.append([entity_id, tag, extent, start_char, end_char])
                n += 1
            laf = os.path.join(new_laf_dir, fn)
            laf_doc = LAFDocument(mentions=numbered_mentions,
                                  lang=prev_laf_doc.lang, doc_id=doc_id)
            laf_doc.write_to_file(laf)
    return new_laf_dir, changeinNEs
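# Illustrative sketch (not part of the original tagger): updateNEdirs above
# deduplicates purely by exact character span, i.e. a mention from the current
# iteration is added only when its (start_char, end_char) pair was absent from
# the previous iteration. The hypothetical helper below restates that rule on
# made-up mention tuples of the form [tag, extent, start_char, end_char].
def _demo_span_merge():
    prev = [['PER', 'Obama', 10, 14], ['GPE', 'Kenya', 30, 34]]
    temp = [['PER', 'Obama', 10, 14], ['ORG', 'UN', 50, 51]]
    prev_spans = [(m[2], m[3]) for m in prev]
    merged = list(prev) + [m for m in temp if (m[2], m[3]) not in prev_spans]
    # The duplicate Obama span (10, 14) is dropped; only the new UN span is added.
    return sorted(merged, key=lambda m: (int(m[2]), int(m[3])))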
def tag_file(ltf, aligner, enc, chunker, modelf, tagged_dir, tagged_ext):
    """Extract features for tokenization in LTF file and tag named entities.

    Inputs
    ------
    ltf : str
        LTF file.

    aligner : align.Aligner
        Aligner instance used to obtain character onsets/offsets of
        discovered mentions.

    enc : features.Encoder
        Encoder instance for feature extraction.

    chunker : chunk.ChunkEncoder
        ChunkEncoder instance for obtaining token onsets/offsets of
        discovered mentions from tag sequences.

    modelf : str
        CRFSuite model file.

    tagged_dir : str
        Directory to which to output LAF files.

    tagged_ext : str
        Extension to use for output LAF files.
    """
    # Create working directory.
    temp_dir = tempfile.mkdtemp()

    # Load LTF.
    ltf_doc = load_doc(ltf, LTFDocument, logger)
    if ltf_doc is None:
        shutil.rmtree(temp_dir)
        return

    # Attempt tagging.
    try:
        # Extract tokens.
        tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized()
        txt = ltf_doc.text()
        spans = aligner.align(txt, tokens)

        # Extract features.
        featsf = os.path.join(temp_dir, 'feats.txt')
        feats = enc.get_feats(tokens)
        write_crfsuite_file(featsf, feats)

        # Tag with CRFSuite (note the hardcoded binary location).
        tagsf = os.path.join(temp_dir, 'tags.txt')
        cmd = ['/home/wangtianlu/local/bin/crfsuite', 'tag', '-m', modelf, featsf]
        with open(tagsf, 'w') as f:
            subprocess.call(cmd, stdout=f)

        # Also write per-token marginal probabilities, deriving the output
        # path from the LTF path.
        probf = ltf.replace('ltf', 'probs').replace('test', 'probs')
        cmd_ = ['/home/wangtianlu/local/bin/crfsuite', 'tag', '-m', modelf, '-i', featsf]
        with open(probf, 'w') as f:
            subprocess.call(cmd_, stdout=f)

        # Load tagged output.
        with open(tagsf, 'r') as f:
            tags = [line.strip() for line in f]
        tags = tags[:len(tokens)]

        # Chunk tags.
        chunks = chunker.tags_to_chunks(tags)

        # Construct mentions.
        doc_id = ltf_doc.doc_id
        mentions = []
        n = 1
        for token_bi, token_ei, tag in chunks:
            if tag == 'O':
                continue

            # Assign entity id.
            entity_id = '%s-NE%d' % (doc_id, n)

            # Determine char onset/offset for mention extent.
            start_char = token_onsets[token_bi]
            end_char = token_offsets[token_ei]

            # Finally, determine text of extent and append.
            extent_bi = spans[token_bi][0]
            extent_ei = spans[token_ei][1]
            extent = txt[extent_bi:extent_ei + 1]
            mentions.append([entity_id,   # entity id
                             tag,         # NE type
                             extent,      # extent text
                             start_char,  # extent char onset
                             end_char,    # extent char offset
                             ])
            n += 1

        # Write detected mentions to LAF file.
        bn = os.path.basename(ltf)
        laf = os.path.join(tagged_dir, bn.replace('.ltf.xml', tagged_ext))
        laf_doc = LAFDocument(mentions=mentions, lang=ltf_doc.lang, doc_id=doc_id)
        laf_doc.write_to_file(laf)
    except KeyError:
        logger.warn('Problem with %s. Skipping.' % ltf)

    # Clean up.
    shutil.rmtree(temp_dir)
def tag_file(ltf, aligner, enc, chunker, modelf, tagged_dir, tagged_ext):
    """Extract features for tokenization in LTF file and tag named entities.

    Inputs
    ------
    ltf : str
        LTF file.

    aligner : align.Aligner
        Aligner instance used to obtain character onsets/offsets of
        discovered mentions.

    enc : features.Encoder
        Encoder instance for feature extraction.

    chunker : chunk.ChunkEncoder
        ChunkEncoder instance for obtaining token onsets/offsets of
        discovered mentions from tag sequences.

    modelf : str
        CRFSuite model file.

    tagged_dir : str
        Directory to which to output LAF files.

    tagged_ext : str
        Extension to use for output LAF files.
    """
    # Create working directory.
    temp_dir = tempfile.mkdtemp()

    # Load LTF.
    ltf_doc = load_doc(ltf, LTFDocument, logger)
    if ltf_doc is None:
        shutil.rmtree(temp_dir)
        return

    # Attempt tagging.
    try:
        # Extract tokens.
        tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized()
        txt = ltf_doc.text()
        spans = aligner.align(txt, tokens)

        # Extract features.
        featsf = os.path.join(temp_dir, 'feats.txt')
        feats = enc.get_feats(tokens)
        write_crfsuite_file(featsf, feats)

        # Tag. The tagger is invoked twice (the second time with a CRFSUITE
        # env var); only the first run's output survives the truncation to
        # len(tokens) below.
        tagsf = os.path.join(temp_dir, 'tags.txt')
        cmd = ['crfsuite', 'tag', '-m', modelf, featsf]
        with open(tagsf, 'w') as f:
            subprocess.call(' '.join(cmd), shell=True, stdout=f)
            subprocess.call(' '.join(cmd), shell=True, stdout=f,
                            env={'CRFSUITE': '/usr/local/bin'})

        # Load tagged output.
        with open(tagsf, 'r') as f:
            tags = [line.strip() for line in f]
        tags = tags[:len(tokens)]

        # Chunk tags.
        chunks = chunker.tags_to_chunks(tags)

        # Construct mentions.
        doc_id = ltf_doc.doc_id
        mentions = []
        n = 1
        for token_bi, token_ei, tag in chunks:
            if tag == 'O':
                continue

            # Assign entity id.
            entity_id = '%s-NE%d' % (doc_id, n)

            # Determine char onset/offset for mention extent.
            start_char = token_onsets[token_bi]
            end_char = token_offsets[token_ei]

            # Finally, determine text of extent and append.
            extent_bi = spans[token_bi][0]
            extent_ei = spans[token_ei][1]
            extent = txt[extent_bi:extent_ei + 1]
            mentions.append([entity_id,   # entity id
                             tag,         # NE type
                             extent,      # extent text
                             start_char,  # extent char onset
                             end_char,    # extent char offset
                             ])
            n += 1

        # Write detected mentions to LAF file.
        bn = os.path.basename(ltf)
        laf = os.path.join(tagged_dir, bn.replace('.ltf.xml', tagged_ext))
        laf_doc = LAFDocument(mentions=mentions, lang=ltf_doc.lang, doc_id=doc_id)
        laf_doc.write_to_file(laf)
    except (KeyError, ValueError):
        logger.warn('Problem with %s. Skipping.' % ltf)

    # Clean up.
    shutil.rmtree(temp_dir)
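# Illustrative sketch (not part of the original tagger): the exact layout that
# write_crfsuite_file produces is not shown in this module; the hypothetical
# helper below assumes the standard CRFsuite tagging data format (label first,
# then tab-separated attribute strings, with a blank line ending each sequence),
# a `crfsuite` binary on PATH, and 'model.crfsuite' as a placeholder model path.
def _demo_crfsuite_tagging():
    feats = ("O\tw[0]=the\tw[+1]=united\n"
             "B-GPE\tw[0]=united\tw[-1]=the\tw[+1]=nations\n"
             "L-GPE\tw[0]=nations\tw[-1]=united\n"
             "\n")
    tmp = tempfile.mkdtemp()
    featsf = os.path.join(tmp, 'feats.txt')
    with open(featsf, 'w') as f:
        f.write(feats)
    # '-i' requests per-item marginals, producing 'TAG:prob' lines of the kind
    # parsed by the threshold-based variant of tag_file below.
    tagsf = os.path.join(tmp, 'tags.txt')
    with open(tagsf, 'w') as f:
        subprocess.call(['crfsuite', 'tag', '-m', 'model.crfsuite', '-i', featsf],
                        stdout=f)
    return tagsf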
def tag_file(ltf, aligner, enc, chunker, modelf, tagged_dir, tagged_ext,
             threshold, A_vals, B_vals, G_vals):
    """Extract features for tokenization in LTF file and tag named entities.

    Inputs
    ------
    ltf : str
        LTF file.

    aligner : align.Aligner
        Aligner instance used to obtain character onsets/offsets of
        discovered mentions.

    enc : features.Encoder
        Encoder instance for feature extraction.

    chunker : chunk.ChunkEncoder
        ChunkEncoder instance for obtaining token onsets/offsets of
        discovered mentions from tag sequences.

    modelf : str
        CRFSuite model file.

    tagged_dir : str
        Directory to which to output LAF files.

    tagged_ext : str
        Extension to use for output LAF files.

    threshold : float
        Minimum marginal probability required to retain a hypothesized NE tag.

    A_vals, B_vals, G_vals
        Feature value inventories passed through to enc.get_feats.
    """
    # Create working directory.
    temp_dir = tempfile.mkdtemp()

    # Load LTF.
    ltf_doc = load_doc(ltf, LTFDocument, logger)
    if ltf_doc is None:
        shutil.rmtree(temp_dir)
        return

    # Attempt tagging.
    try:
        # Extract tokens, falling back to plain tokenization if the document
        # carries no A/B/G annotations.
        try:
            (tokens, token_ids, token_onsets, token_offsets, token_nums,
             token_As, token_Bs, token_Gs, token_Fs, token_Js) = ltf_doc.tokenizedWithABG()
        except:
            tokens, token_ids, token_onsets, token_offsets, token_nums = ltf_doc.tokenized()
            token_As = token_Bs = token_Gs = token_Fs = token_Js = None
        txt = ltf_doc.text()
        spans = aligner.align(txt, tokens)

        # Extract features.
        featsf = os.path.join(temp_dir, 'feats.txt')
        feats = enc.get_feats(tokens, token_nums, token_As, token_Bs, token_Gs,
                              token_Fs, token_Js, A_vals, B_vals, G_vals)
        write_crfsuite_file(featsf, feats)
        shutil.copy(featsf, "featuresfile")  # DEBUG

        # Tag.
        tagsf = os.path.join(temp_dir, 'tags.txt')
        cmd = ['crfsuite', 'tag',
               '--marginal',  # outputs probability of each tag as extra field in tags file
               # '--probability',  # outputs probability of tag sequence at top of tags file
               '-m', modelf, featsf]
        with open(tagsf, 'w') as f:
            subprocess.call(cmd, stdout=f)
        shutil.copy(tagsf, "taggingprobs")  # DEBUG

        # Look for NEs in the tag file with marginal probs. If the tag is 'O',
        # keep it; otherwise keep it only if its marginal prob is above threshold.
        tagsf2 = os.path.join(temp_dir, 'tags2.txt')

        # Helper for checking the tag sequence output in the section below.
        # Checks for a full BI*L sequence, returning that sequence if enough
        # of its tags meet the probability threshold, and a sequence of O's of
        # equal length otherwise. If the sequence contains only one tag, that
        # tag is returned as a U tag.
""" def _check_BIL_sequence(tags, probs, threshold): nextpart = '' if len(tags) < 1: logging.warn("Empty tag sequence submitted as BI*L sequence.") elif len(tags) == 1: logging.warn( "Tag sequence of length 1 submitted as BI*L sequence.") if probs[0] >= threshold: # compare probs, not abs vals of logprobs, hence >= and not <= nextpart = 'U{}'.format(tags[0][1:]) else: nextpart = 'O\n' else: try: assert tags[0][0] == 'B' and tags[-1][0] == 'L' except AssertionError: logging.warn('Incomplete BI*L sequence submitted.') tags[0] = 'B{}'.format(tags[0][1:]) tags[-1] = 'L{}'.format(tags[-1][1:]) # NElogProb = reduce(lambda x, y: (log(x) * -1) + (log(y) * -1), probs)/len(probs) # if NElogProb <= (log(threshold) * -1): # compare abs vals of logprobs, hence <= and not >= count = 0 for prob in probs: if prob >= threshold: count += 1 if count >= len(probs) / 2.0: nextpart = ''.join(tags) else: nextpart = 'O\n' * len(NEtags) return nextpart """ Retain or reject NE hypotheses based on probs and write new tags file """ with open(tagsf2, 'w') as f_out: with open(tagsf, 'r') as f_in: NEtags = None NEprobs = None for line in f_in.read().split('\n'): try: assert ':' in line tag, prob = line.strip().split(':') if tag[0] == 'O': # if seq in play, check seq # write tag if NEtags: f_out.write( _check_BIL_sequence( NEtags, NEprobs, threshold)) NEtags = None NEprobs = None f_out.write(tag + '\n') elif tag[0] == 'U': # if seq in play, check seq # if prob >= threshold, write tag # else, write tag = O if NEtags: f_out.write( _check_BIL_sequence( NEtags, NEprobs, threshold)) NEtags = None NEprobs = None if float( prob ) >= threshold: # compare probs, not abs vals of logprobs, hence >= and not <= f_out.write(tag + '\n') else: f_out.write('O\n') elif tag[0] == 'B': # if seq in play, check seq # start new seq with tag if NEtags: f_out.write( _check_BIL_sequence( NEtags, NEprobs, threshold)) NEtags = [tag + '\n'] NEprobs = [float(prob)] elif tag[0] == 'I': # if seq in play, add tag to seq # else, start new seq with tag = B if NEtags: NEtags.append(tag + '\n') NEprobs.append(float(prob)) else: logging.warn("Found an out of sequence I tag.") tag = 'B{}'.format(tag[1:]) NEtags = [tag + '\n'] NEprobs = [float(prob)] elif tag[0] == 'L': # if seq in play, add tag to seq and check seq # else, start new seq with tag = B if NEtags: NEtags.append(tag + '\n') NEprobs.append(float(prob)) f_out.write( _check_BIL_sequence( NEtags, NEprobs, threshold)) NEtags = None NEprobs = None else: logging.warn("Found an out of sequence L tag.") tag = 'B{}'.format(tag[1:]) NEtags = [tag + '\n'] NEprobs = [float(prob)] except AssertionError: pass # logging.warn('No ":" in line {}'.format(line)) #DEBUG if NEtags: # Necessary if tagsf ends with an incomplete BI*L sequence f_out.write(_check_BIL_sequence(NEtags, NEprobs, threshold)) NEtags = None NEprobs = None tagsf = tagsf2 # Set the checked tag file as the new tag file # Continue shutil.copy(tagsf, "tagsfile") #DEBUG # Load tagged output. with open(tagsf, 'r') as f: tags = [line.strip() for line in f] tags = tags[:len(tokens)] # Chunk tags. chunks = chunker.tags_to_chunks(tags) # Construct mentions. doc_id = ltf_doc.doc_id mentions = [] n = 1 for token_bi, token_ei, tag in chunks: if tag == 'O': continue # Assign entity id. entity_id = '%s-NE%d' % (doc_id, n) # Determine char onsets/offset for mention extent. start_char = token_onsets[token_bi] end_char = token_offsets[token_ei] # Finally, determine text of extent and append. 
            extent_bi = spans[token_bi][0]
            extent_ei = spans[token_ei][1]
            extent = txt[extent_bi:extent_ei + 1]
            mentions.append([entity_id,   # entity id
                             tag,         # NE type
                             extent,      # extent text
                             start_char,  # extent char onset
                             end_char,    # extent char offset
                             ])
            n += 1

        # Write detected mentions to LAF file.
        bn = os.path.basename(ltf)
        laf = os.path.join(tagged_dir, bn.replace('.ltf.xml', tagged_ext))
        laf_doc = LAFDocument(mentions=mentions, lang=ltf_doc.lang, doc_id=doc_id)
        laf_doc.write_to_file(laf)
    except Exception:
        logger.warn('Problem with %s. Skipping.' % ltf)

    # Clean up.
    shutil.rmtree(temp_dir)
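# Illustrative sketch (not part of the original tagger): the retain-or-reject
# rule in _check_BIL_sequence above keeps a hypothesized B...I...L mention only
# when at least half of its per-tag marginal probabilities meet the threshold.
# The hypothetical helper below restates just that majority check.
def _demo_keep_sequence(probs, threshold):
    count = sum(1 for p in probs if p >= threshold)
    return count >= len(probs) / 2.0

# _demo_keep_sequence([0.91, 0.42, 0.88], 0.7) -> True  (2 of 3 tags clear 0.7)
# _demo_keep_sequence([0.91, 0.42, 0.31], 0.7) -> False (1 of 3 tags clears 0.7)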