# Stdlib imports used below (harmless if already imported at module top);
# load_doc, write_crfsuite_file, convert_extents, sort_mentions,
# get_ABG_value_sets, LTFDocument, LAFDocument, and logger are assumed to
# be provided by this package.
import os
import shutil
import subprocess
import tempfile


def tag_file(ltf, aligner, enc, chunker, modelf, tagged_dir, tagged_ext,
             threshold, A_vals, B_vals, G_vals):
    """Extract features for tokenization in LTF file and tag named entities.

    Inputs
    ------
    ltf : str
        LTF file.

    aligner : align.Aligner
        Aligner instance used to obtain character onsets/offsets of
        discovered mentions.

    enc : features.Encoder
        Encoder instance for feature extraction.

    chunker : chunk.ChunkEncoder
        ChunkEncoder instance for obtaining token onsets/offsets of
        discovered mentions from tag sequences.

    modelf : str
        CRFSuite model file.

    tagged_dir : str
        Directory to which to output LAF files.

    tagged_ext : str
        Extension to use for output LAF files.

    threshold : float
        Marginal-probability threshold below which NE tag hypotheses are
        rejected and replaced with 'O'.

    A_vals, B_vals, G_vals : set
        Sets of observed values for the A/B/G token attributes, passed
        through to the encoder for feature extraction.
    """
    # Create working directory.
    temp_dir = tempfile.mkdtemp()

    # Load LTF.
    ltf_doc = load_doc(ltf, LTFDocument, logger)
    if ltf_doc is None:
        shutil.rmtree(temp_dir)
        return

    # Attempt tagging.
    try:
        # Extract tokens, falling back to the plain tokenization if the
        # document carries no A/B/G attributes.
        try:
            (tokens, token_ids, token_onsets, token_offsets, token_nums,
             token_As, token_Bs, token_Gs, token_Fs,
             token_Js) = ltf_doc.tokenizedWithABG()
        except Exception:
            (tokens, token_ids, token_onsets, token_offsets,
             token_nums) = ltf_doc.tokenized()
            token_As = token_Bs = token_Gs = token_Fs = token_Js = None
        txt = ltf_doc.text()
        spans = aligner.align(txt, tokens)

        # Extract features.
        featsf = os.path.join(temp_dir, 'feats.txt')
        feats = enc.get_feats(tokens, token_nums, token_As, token_Bs,
                              token_Gs, token_Fs, token_Js,
                              A_vals, B_vals, G_vals)
        write_crfsuite_file(featsf, feats)
        shutil.copy(featsf, 'featuresfile')  # DEBUG

        # Tag.
        tagsf = os.path.join(temp_dir, 'tags.txt')
        cmd = ['crfsuite', 'tag',
               '--marginal',  # output probability of each tag as an extra field
               # '--probability',  # would output probability of the whole tag sequence
               '-m', modelf,
               featsf]
        with open(tagsf, 'w') as f:
            subprocess.call(cmd, stdout=f)
        shutil.copy(tagsf, 'taggingprobs')  # DEBUG

        # Look for NEs in the tag file with marginal probs. 'O' tags are
        # kept as-is; any other tag is kept only if its marginal
        # probability clears the threshold.
        tagsf2 = os.path.join(temp_dir, 'tags2.txt')

        def _check_BIL_sequence(tags, probs, threshold):
            """Check a full B I* L sequence, returning that sequence if
            enough of its tags clear the threshold and a sequence of O's
            of equal length otherwise. If the sequence contains only one
            tag, that tag is returned as a U tag."""
            nextpart = ''
            if len(tags) < 1:
                logger.warn('Empty tag sequence submitted as BI*L sequence.')
            elif len(tags) == 1:
                logger.warn('Tag sequence of length 1 submitted as BI*L sequence.')
                # Compare probs, not absolute values of log probs, hence
                # >= and not <=.
                if probs[0] >= threshold:
                    nextpart = 'U{}'.format(tags[0][1:])
                else:
                    nextpart = 'O\n'
            else:
                try:
                    assert tags[0][0] == 'B' and tags[-1][0] == 'L'
                except AssertionError:
                    logger.warn('Incomplete BI*L sequence submitted.')
                    tags[0] = 'B{}'.format(tags[0][1:])
                    tags[-1] = 'L{}'.format(tags[-1][1:])
                # Keep the sequence if at least half of its tags clear the
                # threshold; otherwise emit O's of equal length.
                count = 0
                for prob in probs:
                    if prob >= threshold:
                        count += 1
                if count >= len(probs) / 2.0:
                    nextpart = ''.join(tags)
                else:
                    nextpart = 'O\n' * len(tags)
            return nextpart

        # Retain or reject NE hypotheses based on probs and write a new
        # tags file.
        with open(tagsf2, 'w') as f_out:
            with open(tagsf, 'r') as f_in:
                NEtags = None
                NEprobs = None
                for line in f_in.read().split('\n'):
                    # Skip lines without a TAG:prob pair (e.g. sequence
                    # separators).
                    if ':' not in line:
                        continue
                    tag, prob = line.strip().split(':')
                    if tag[0] == 'O':
                        # If a sequence is in play, check it, then write
                        # the tag.
                        if NEtags:
                            f_out.write(_check_BIL_sequence(NEtags, NEprobs, threshold))
                            NEtags = None
                            NEprobs = None
                        f_out.write(tag + '\n')
                    elif tag[0] == 'U':
                        # If a sequence is in play, check it. Keep the U
                        # tag only if its probability clears the threshold;
                        # otherwise write O.
                        if NEtags:
                            f_out.write(_check_BIL_sequence(NEtags, NEprobs, threshold))
                            NEtags = None
                            NEprobs = None
                        if float(prob) >= threshold:
                            f_out.write(tag + '\n')
                        else:
                            f_out.write('O\n')
                    elif tag[0] == 'B':
                        # If a sequence is in play, check it, then start a
                        # new sequence with this tag.
                        if NEtags:
                            f_out.write(_check_BIL_sequence(NEtags, NEprobs, threshold))
                        NEtags = [tag + '\n']
                        NEprobs = [float(prob)]
                    elif tag[0] == 'I':
                        # If a sequence is in play, append the tag;
                        # otherwise start a new sequence with tag = B.
                        if NEtags:
                            NEtags.append(tag + '\n')
                            NEprobs.append(float(prob))
                        else:
                            logger.warn('Found an out of sequence I tag.')
                            tag = 'B{}'.format(tag[1:])
                            NEtags = [tag + '\n']
                            NEprobs = [float(prob)]
                    elif tag[0] == 'L':
                        # If a sequence is in play, append the tag and
                        # check the sequence; otherwise start a new
                        # sequence with tag = B.
                        if NEtags:
                            NEtags.append(tag + '\n')
                            NEprobs.append(float(prob))
                            f_out.write(_check_BIL_sequence(NEtags, NEprobs, threshold))
                            NEtags = None
                            NEprobs = None
                        else:
                            logger.warn('Found an out of sequence L tag.')
                            tag = 'B{}'.format(tag[1:])
                            NEtags = [tag + '\n']
                            NEprobs = [float(prob)]
                # Necessary if tagsf ends with an incomplete BI*L sequence.
                if NEtags:
                    f_out.write(_check_BIL_sequence(NEtags, NEprobs, threshold))
                    NEtags = None
                    NEprobs = None
        tagsf = tagsf2  # Set the checked tag file as the new tag file.
        shutil.copy(tagsf, 'tagsfile')  # DEBUG

        # Load tagged output.
        with open(tagsf, 'r') as f:
            tags = [line.strip() for line in f]
        tags = tags[:len(tokens)]

        # Chunk tags.
        chunks = chunker.tags_to_chunks(tags)

        # Construct mentions.
        doc_id = ltf_doc.doc_id
        mentions = []
        n = 1
        for token_bi, token_ei, tag in chunks:
            if tag == 'O':
                continue
            # Assign entity id.
            entity_id = '%s-NE%d' % (doc_id, n)
            # Determine char onset/offset for the mention extent.
            start_char = token_onsets[token_bi]
            end_char = token_offsets[token_ei]
            # Finally, determine the text of the extent and append.
            extent_bi = spans[token_bi][0]
            extent_ei = spans[token_ei][1]
            extent = txt[extent_bi:extent_ei + 1]
            mentions.append([entity_id,   # entity id
                             tag,         # NE type
                             extent,      # extent text
                             start_char,  # extent char onset
                             end_char,    # extent char offset
                             ])
            n += 1

        # Write detected mentions to LAF file.
        bn = os.path.basename(ltf)
        laf = os.path.join(tagged_dir, bn.replace('.ltf.xml', tagged_ext))
        laf_doc = LAFDocument(mentions=mentions, lang=ltf_doc.lang, doc_id=doc_id)
        laf_doc.write_to_file(laf)
    except Exception:
        logger.warn('Problem with %s. Skipping.' % ltf)

    # Clean up.
    shutil.rmtree(temp_dir)
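
# A minimal usage sketch for the threshold-based tag_file() variant above.
# Illustrative only: the Aligner/Encoder/ChunkEncoder constructor calls, the
# model path, the threshold value, and the directory layout are assumptions,
# and get_ABG_value_sets is the helper used by write_train_data() below.
def _example_tag_run():
    from glob import glob
    ltfs = glob('ltf/*.ltf.xml')                 # hypothetical layout
    aligner = align.Aligner()                    # hypothetical constructor
    enc = features.Encoder()                     # hypothetical constructor
    chunker = chunk.ChunkEncoder()               # hypothetical constructor
    A_vals, B_vals, G_vals = get_ABG_value_sets(ltfs, logger)
    for ltf in ltfs:
        tag_file(ltf, aligner, enc, chunker, 'ne.model', 'laf/', '.laf.xml',
                 0.5,                            # marginal-prob threshold
                 A_vals, B_vals, G_vals)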
def tag_file(ltf, aligner, enc, chunker, modelf, tagged_dir, tagged_ext):
    """Extract features for tokenization in LTF file and tag named entities.

    Inputs
    ------
    ltf : str
        LTF file.

    aligner : align.Aligner
        Aligner instance used to obtain character onsets/offsets of
        discovered mentions.

    enc : features.Encoder
        Encoder instance for feature extraction.

    chunker : chunk.ChunkEncoder
        ChunkEncoder instance for obtaining token onsets/offsets of
        discovered mentions from tag sequences.

    modelf : str
        CRFSuite model file.

    tagged_dir : str
        Directory to which to output LAF files.

    tagged_ext : str
        Extension to use for output LAF files.
    """
    # Create working directory.
    temp_dir = tempfile.mkdtemp()

    # Load LTF.
    ltf_doc = load_doc(ltf, LTFDocument, logger)
    if ltf_doc is None:
        shutil.rmtree(temp_dir)
        return

    # Attempt tagging.
    try:
        # Extract tokens.
        tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized()
        txt = ltf_doc.text()
        spans = aligner.align(txt, tokens)

        # Extract features.
        featsf = os.path.join(temp_dir, 'feats.txt')
        feats = enc.get_feats(tokens)
        write_crfsuite_file(featsf, feats)

        # Tag.
        tagsf = os.path.join(temp_dir, 'tags.txt')
        cmd = ['/home/wangtianlu/local/bin/crfsuite', 'tag',
               '-m', modelf, featsf]
        with open(tagsf, 'w') as f:
            subprocess.call(cmd, stdout=f)

        # Also write per-token marginal probabilities ('-i') to a parallel
        # 'probs' path derived from the input LTF path.
        probf = ltf.replace('ltf', 'probs').replace('test', 'probs')
        cmd_ = ['/home/wangtianlu/local/bin/crfsuite', 'tag',
                '-m', modelf, '-i', featsf]
        with open(probf, 'w') as f:
            subprocess.call(cmd_, stdout=f)

        # Load tagged output.
        with open(tagsf, 'r') as f:
            tags = [line.strip() for line in f]
        tags = tags[:len(tokens)]

        # Chunk tags.
        chunks = chunker.tags_to_chunks(tags)

        # Construct mentions.
        doc_id = ltf_doc.doc_id
        mentions = []
        n = 1
        for token_bi, token_ei, tag in chunks:
            if tag == 'O':
                continue
            # Assign entity id.
            entity_id = '%s-NE%d' % (doc_id, n)
            # Determine char onset/offset for the mention extent.
            start_char = token_onsets[token_bi]
            end_char = token_offsets[token_ei]
            # Finally, determine the text of the extent and append.
            extent_bi = spans[token_bi][0]
            extent_ei = spans[token_ei][1]
            extent = txt[extent_bi:extent_ei + 1]
            mentions.append([entity_id,   # entity id
                             tag,         # NE type
                             extent,      # extent text
                             start_char,  # extent char onset
                             end_char,    # extent char offset
                             ])
            n += 1

        # Write detected mentions to LAF file.
        bn = os.path.basename(ltf)
        laf = os.path.join(tagged_dir, bn.replace('.ltf.xml', tagged_ext))
        laf_doc = LAFDocument(mentions=mentions, lang=ltf_doc.lang, doc_id=doc_id)
        laf_doc.write_to_file(laf)
    except KeyError:
        logger.warn('Problem with %s. Skipping.' % ltf)

    # Clean up.
    shutil.rmtree(temp_dir)
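
# For reference, the marginal-probability file written above via '-i'
# (long form '--marginal') holds one TAG:prob pair per token, with a blank
# line between sequences; the threshold filter in the first tag_file()
# variant parses exactly this shape. Values below are illustrative:
#
#   B-PER:0.982131
#   L-PER:0.978012
#   O:0.999467
#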
def tag_file(ltf, aligner, enc, chunker, modelf, tagged_dir, tagged_ext):
    """Extract features for tokenization in LTF file and tag named entities.

    Inputs
    ------
    ltf : str
        LTF file.

    aligner : align.Aligner
        Aligner instance used to obtain character onsets/offsets of
        discovered mentions.

    enc : features.Encoder
        Encoder instance for feature extraction.

    chunker : chunk.ChunkEncoder
        ChunkEncoder instance for obtaining token onsets/offsets of
        discovered mentions from tag sequences.

    modelf : str
        CRFSuite model file.

    tagged_dir : str
        Directory to which to output LAF files.

    tagged_ext : str
        Extension to use for output LAF files.
    """
    # Create working directory.
    temp_dir = tempfile.mkdtemp()

    # Load LTF.
    ltf_doc = load_doc(ltf, LTFDocument, logger)
    if ltf_doc is None:
        shutil.rmtree(temp_dir)
        return

    # Attempt tagging.
    try:
        # Extract tokens.
        tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized()
        txt = ltf_doc.text()
        spans = aligner.align(txt, tokens)

        # Extract features.
        featsf = os.path.join(temp_dir, 'feats.txt')
        feats = enc.get_feats(tokens)
        write_crfsuite_file(featsf, feats)

        # Tag. The command is joined into a single string and the crfsuite
        # binary is resolved through the shell's PATH.
        tagsf = os.path.join(temp_dir, 'tags.txt')
        cmd = ['crfsuite', 'tag', '-m', modelf, featsf]
        with open(tagsf, 'w') as f:
            subprocess.call(' '.join(cmd), shell=True, stdout=f)

        # Load tagged output.
        with open(tagsf, 'r') as f:
            tags = [line.strip() for line in f]
        tags = tags[:len(tokens)]

        # Chunk tags.
        chunks = chunker.tags_to_chunks(tags)

        # Construct mentions.
        doc_id = ltf_doc.doc_id
        mentions = []
        n = 1
        for token_bi, token_ei, tag in chunks:
            if tag == 'O':
                continue
            # Assign entity id.
            entity_id = '%s-NE%d' % (doc_id, n)
            # Determine char onset/offset for the mention extent.
            start_char = token_onsets[token_bi]
            end_char = token_offsets[token_ei]
            # Finally, determine the text of the extent and append.
            extent_bi = spans[token_bi][0]
            extent_ei = spans[token_ei][1]
            extent = txt[extent_bi:extent_ei + 1]
            mentions.append([entity_id,   # entity id
                             tag,         # NE type
                             extent,      # extent text
                             start_char,  # extent char onset
                             end_char,    # extent char offset
                             ])
            n += 1

        # Write detected mentions to LAF file.
        bn = os.path.basename(ltf)
        laf = os.path.join(tagged_dir, bn.replace('.ltf.xml', tagged_ext))
        laf_doc = LAFDocument(mentions=mentions, lang=ltf_doc.lang, doc_id=doc_id)
        laf_doc.write_to_file(laf)
    except (KeyError, ValueError):
        logger.warn('Problem with %s. Skipping.' % ltf)

    # Clean up.
    shutil.rmtree(temp_dir)
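
# Design note: the variant above runs the command through the shell, which
# breaks if modelf or featsf ever contain spaces or shell metacharacters.
# Passing the argv list directly, as the other variants do, sidesteps the
# quoting problem; a minimal equivalent, assuming crfsuite is on PATH:
#
#   with open(tagsf, 'w') as f:
#       subprocess.call(['crfsuite', 'tag', '-m', modelf, featsf], stdout=f)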
def write_train_data(lafs, ltf_dir, enc, trainf):
    """Extract features and target labels for each LTF/LAF pair and write
    to disk in CRFSuite data format. For details regarding this format,
    consult

        http://www.chokkan.org/software/crfsuite/manual.html

    Inputs
    ------
    lafs : list of str
        Paths to LAF files.

    ltf_dir : str
        Directory to search for LTF files.

    enc : features.Encoder
        Feature encoder.

    trainf : str
        CRFSuite training file.
    """
    with open(trainf, 'w') as f:
        for laf in lafs:
            # Check that the LTF and LAF are valid.
            bn = os.path.basename(laf)
            ltf = os.path.join(ltf_dir, bn.replace('.laf.xml', '.ltf.xml'))
            laf_doc = load_doc(laf, LAFDocument, logger)
            ltf_doc = load_doc(ltf, LTFDocument, logger)
            if laf_doc is None or ltf_doc is None:
                continue

            # Extract features/targets.
            try:
                # Extract tokens.
                tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized()

                # Convert mentions to the format expected by the encoder;
                # that is, (tag, token_onset, token_offset).
                mentions = laf_doc.mentions()
                if len(mentions) == 0:
                    mentions_ = []
                else:
                    # Map to the minimal enclosing span of tokens in the
                    # supplied LTF.
                    entity_ids, tags, extents, char_onsets, char_offsets = zip(*mentions)
                    mention_onsets, mention_offsets = convert_extents(
                        char_onsets, char_offsets, token_onsets, token_offsets)
                    mentions_ = list(zip(tags, mention_onsets, mention_offsets))

                    # Eliminate overlapping mentions, retaining whichever is
                    # first when sorted in ascending order by (onset, offset).
                    sort_mentions(mentions_)
                    prev_mention_offset = -1
                    temp_mentions_ = []
                    for tag, mention_onset, mention_offset in mentions_:
                        if mention_onset > prev_mention_offset:
                            temp_mentions_.append([tag, mention_onset, mention_offset])
                            prev_mention_offset = mention_offset
                    mentions_ = temp_mentions_

                # Extract features/targets in CRFSuite format.
                feats, targets = enc.get_feats_targets(tokens, mentions_)
            except KeyError:
                logger.warn('Feature extraction failed for %s. Skipping.' % laf)
                continue

            # Write to file.
            write_crfsuite_file(f, feats, targets)
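
# For reference, the CRFSuite data format that write_crfsuite_file() emits
# (per the manual linked in the docstring above): one token per line, the
# target label first and tab-separated attributes after it, with a blank
# line closing each sequence. Attribute names below are illustrative:
#
#   B-PER   w[0]=John     w[1]=Smith
#   L-PER   w[0]=Smith    w[-1]=John
#   O       w[0]=visited  w[-1]=Smith
#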
def write_train_data(lafs, ltf_dir, enc, trainf):
    """Extract features and target labels for each LTF/LAF pair and write
    to disk in CRFSuite data format. For details regarding this format,
    consult

        http://www.chokkan.org/software/crfsuite/manual.html

    Inputs
    ------
    lafs : list of str
        Paths to LAF files.

    ltf_dir : str
        Directory to search for LTF files.

    enc : features.Encoder
        Feature encoder.

    trainf : str
        CRFSuite training file.
    """
    with open(trainf, 'w') as f:
        # Collect the sets of A/B/G attribute values observed across all
        # LTF files; the encoder needs them to build features.
        ltfs = []
        for laf in lafs:
            bn = os.path.basename(laf)
            ltfs.append(os.path.join(ltf_dir, bn.replace('.laf.xml', '.ltf.xml')))
        A_vals, B_vals, G_vals = get_ABG_value_sets(ltfs, logger)
        print('Found the following number of values for ABG:\n'
              'A: {}\nB: {}\nG: {}\n'.format(len(A_vals), len(B_vals), len(G_vals)))

        for laf in lafs:
            # Check that the LTF and LAF are valid.
            bn = os.path.basename(laf)
            ltf = os.path.join(ltf_dir, bn.replace('.laf.xml', '.ltf.xml'))
            laf_doc = load_doc(laf, LAFDocument, logger)
            ltf_doc = load_doc(ltf, LTFDocument, logger)
            if laf_doc is None or ltf_doc is None:
                continue

            # Extract features/targets.
            try:
                # Extract tokens, falling back to the plain tokenization if
                # the document carries no A/B/G attributes.
                try:
                    (tokens, token_ids, token_onsets, token_offsets,
                     token_nums, token_As, token_Bs, token_Gs, token_Fs,
                     token_Js) = ltf_doc.tokenizedWithABG()
                except Exception:
                    (tokens, token_ids, token_onsets, token_offsets,
                     token_nums) = ltf_doc.tokenized()
                    token_As = token_Bs = token_Gs = token_Fs = token_Js = None

                # Convert mentions to the format expected by the encoder;
                # that is, (tag, token_onset, token_offset).
                mentions = laf_doc.mentions()
                if len(mentions) == 0:
                    mentions_ = []
                else:
                    # Map to the minimal enclosing span of tokens in the
                    # supplied LTF.
                    entity_ids, tags, extents, char_onsets, char_offsets = zip(*mentions)
                    mention_onsets, mention_offsets = convert_extents(
                        char_onsets, char_offsets, token_onsets, token_offsets)
                    mentions_ = list(zip(tags, mention_onsets, mention_offsets))

                    # Eliminate overlapping mentions, retaining whichever is
                    # first when sorted in ascending order by (onset, offset).
                    sort_mentions(mentions_)
                    prev_mention_offset = -1
                    temp_mentions_ = []
                    for tag, mention_onset, mention_offset in mentions_:
                        if mention_onset > prev_mention_offset:
                            temp_mentions_.append([tag, mention_onset, mention_offset])
                            prev_mention_offset = mention_offset
                    mentions_ = temp_mentions_

                # Extract features/targets in CRFSuite format.
                feats, targets = enc.get_feats_targets(
                    tokens, mentions_, token_nums, token_As, token_Bs,
                    token_Gs, token_Fs, token_Js, A_vals, B_vals, G_vals)
            except Exception:
                logger.warn('Feature extraction failed for %s. Skipping.' % laf)
                continue

            # Write to file.
            write_crfsuite_file(f, feats, targets)
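
# A minimal sketch of what get_ABG_value_sets() is assumed to do, following
# the accumulation logic this ABG-aware write_train_data() variant relies
# on: pool the observed A/B/G token attribute values over all LTF documents
# so the encoder can build categorical features from a fixed vocabulary.
# (Sketch only; the real helper lives elsewhere in this package.)
def _abg_value_sets_sketch(ltfs, logger):
    A_vals, B_vals, G_vals = set(), set(), set()
    for ltf in ltfs:
        ltf_doc = load_doc(ltf, LTFDocument, logger)
        if ltf_doc is None:
            continue
        try:
            (tokens, token_ids, token_onsets, token_offsets, token_nums,
             token_As, token_Bs, token_Gs, token_Fs,
             token_Js) = ltf_doc.tokenizedWithABG()
        except Exception:
            continue  # document carries no A/B/G attributes
        if token_As is not None:
            A_vals.update(token_As)
        if token_Bs is not None:
            B_vals.update(token_Bs)
        if token_Gs is not None:
            G_vals.update(token_Gs)
    return A_vals, B_vals, G_vals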