def calc_stats(sys_laf, ref_dir, ltf_dir):
    """Return hits, false alarms, and misses for system output LAF
    relative to reference LAF located in ref_dir.

    Inputs
    ------
    sys_laf : str
        LAF file containing system output.

    ref_dir : str
        Directory containing reference LAF files.

    ltf_dir : str
        Directory containing LTF files.
    """
    # Check that LTF and system and reference LAF are valid.
    sys_doc = load_doc(sys_laf, LAFDocument, logger)
    bn = os.path.basename(sys_laf)
    ref_laf = os.path.join(ref_dir, bn)
    ref_doc = load_doc(ref_laf, LAFDocument, logger)
    ltf = os.path.join(ltf_dir, bn.replace('.laf.xml', '.ltf.xml'))
    ltf_doc = load_doc(ltf, LTFDocument, logger)
    if sys_doc is None or ref_doc is None or ltf_doc is None:
        return 0.0, 0.0, 0.0

    # Calculate hits, misses, and false alarms.
    try:
        tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized()

        # Convert mentions to (tag, token_onset, token_offset) format.
        sys_mentions = sys_doc.mentions()
        if len(sys_mentions) > 0:
            sys_ids, sys_tags, sys_extents, sys_char_onsets, sys_char_offsets = zip(*sys_mentions)
            sys_mention_onsets, sys_mention_offsets = convert_extents(
                sys_char_onsets, sys_char_offsets, token_onsets, token_offsets)
            sys_mentions = set(zip(sys_tags, sys_mention_onsets, sys_mention_offsets))
        else:
            sys_mentions = set()
        ref_mentions = ref_doc.mentions()
        if len(ref_mentions) > 0:
            ref_ids, ref_tags, ref_extents, ref_char_onsets, ref_char_offsets = zip(*ref_mentions)
            ref_mention_onsets, ref_mention_offsets = convert_extents(
                ref_char_onsets, ref_char_offsets, token_onsets, token_offsets)
            ref_mentions = set(zip(ref_tags, ref_mention_onsets, ref_mention_offsets))
        else:
            ref_mentions = set()

        # Calculate.
        n_hit = len(sys_mentions & ref_mentions)
        n_fa = len(sys_mentions - ref_mentions)
        n_miss = len(ref_mentions - sys_mentions)
    except:
        logger.warn('Scoring failed for %s. Skipping.' % ref_laf)
        n_hit = n_fa = n_miss = 0

    return n_hit, n_fa, n_miss
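

# A minimal sketch (not part of the original module) showing how the per-file
# counts from calc_stats might be aggregated into corpus-level precision,
# recall, and F1. The directory layout and glob pattern are assumptions.
def score_corpus(sys_dir, ref_dir, ltf_dir):
    import glob
    n_hit = n_fa = n_miss = 0
    for sys_laf in glob.glob(os.path.join(sys_dir, '*.laf.xml')):
        hit, fa, miss = calc_stats(sys_laf, ref_dir, ltf_dir)
        n_hit += hit
        n_fa += fa
        n_miss += miss
    precision = n_hit / float(n_hit + n_fa) if n_hit + n_fa else 0.0
    recall = n_hit / float(n_hit + n_miss) if n_hit + n_miss else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1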
def updateNEdirs(prev_laf_dir, temp_laf_dir, new_laf_dir):
    """Merge the LAF files from the previous iteration with those from the
    current iteration, write the merged files to new_laf_dir, and report
    whether any new named entities were added."""
    changeinNEs = False
    for fn in os.listdir(prev_laf_dir):
        if fn.endswith('laf.xml'):
            prev_laf = os.path.join(prev_laf_dir, fn)
            temp_laf = os.path.join(temp_laf_dir, fn)
            if not os.path.exists(temp_laf):
                logging.warn("{} processed last iteration but not this one".format(fn))
    for fn in os.listdir(temp_laf_dir):
        if fn.endswith('laf.xml'):
            prev_laf = os.path.join(prev_laf_dir, fn)
            temp_laf = os.path.join(temp_laf_dir, fn)
            if not os.path.exists(prev_laf):
                logging.warn("{} processed this iteration but not the last. Skipping...".format(fn))
                continue
            prev_laf_doc = load_doc(prev_laf, LAFDocument, logger)
            temp_laf_doc = load_doc(temp_laf, LAFDocument, logger)
            doc_id = prev_laf_doc.doc_id

            prev_mentions = [[tag, extent, start_char, end_char]
                             for entity_id, tag, extent, start_char, end_char
                             in prev_laf_doc.mentions()]
            prev_spans = [(start_char, end_char)
                          for tag, extent, start_char, end_char in prev_mentions]
            temp_mentions = [[tag, extent, start_char, end_char]
                             for entity_id, tag, extent, start_char, end_char
                             in temp_laf_doc.mentions()]

            # Keep all previous mentions and add any newly discovered mentions
            # whose spans were not already present.
            mentions = list(prev_mentions)
            for m in temp_mentions:
                if (m[2], m[3]) not in prev_spans:
                    mentions.append(m)
                    changeinNEs = True

            # Sort the merged mentions by start_char then end_char and
            # renumber the entity ids.
            mentions = sorted(mentions, key=lambda x: (int(x[2]), int(x[3])))
            numbered_mentions = []
            n = 1
            for tag, extent, start_char, end_char in mentions:
                entity_id = '{}-NE{}'.format(doc_id, n)
                numbered_mentions.append([entity_id, tag, extent, start_char, end_char])
                n += 1

            laf = os.path.join(new_laf_dir, fn)
            # lang is taken from the previous-iteration LAF; no LTF document
            # is in scope here.
            laf_doc = LAFDocument(mentions=numbered_mentions,
                                  lang=prev_laf_doc.lang, doc_id=doc_id)
            laf_doc.write_to_file(laf)

    return new_laf_dir, changeinNEs
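

# A hypothetical sketch (not from the original code) of the iterative loop
# updateNEdirs appears designed for: each pass tags into a fresh directory,
# merges with the previous pass, and stops once no new NEs appear. Directory
# names and the tagging step are assumptions.
def example_iterate(max_iters=10):
    prev_dir = 'laf_iter0'
    for i in range(1, max_iters + 1):
        temp_dir = 'laf_iter{}_raw'.format(i)
        merged_dir = 'laf_iter{}'.format(i)
        # ... run the tagger here, writing LAF files into temp_dir ...
        prev_dir, changed = updateNEdirs(prev_dir, temp_dir, merged_dir)
        if not changed:
            break  # no new named entities were added this iteration
    return prev_dir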
def get_ABG_value_sets(ltfs, logger):
    """Scan through all of the given LTF files and return the sets of values
    found for each of A, B, and G.

    Since uhhmm determines the number of values for each of these categories
    at runtime, they cannot be known before retrieving the output of the
    system.
    """
    A_vals = set()
    B_vals = set()
    G_vals = set()
    for ltf in ltfs:
        # Check that the LTF is valid.
        ltf_doc = load_doc(ltf, LTFDocument, logger)
        if ltf_doc is None:
            continue
        # Extract the A/B/G values, falling back to plain tokenization for
        # LTFs that do not carry them.
        try:
            try:
                tokens, token_ids, token_onsets, token_offsets, token_nums, \
                    token_As, token_Bs, token_Gs, token_Fs, token_Js = ltf_doc.tokenizedWithABG()
            except:
                tokens, token_ids, token_onsets, token_offsets, token_nums = ltf_doc.tokenized()
                token_As = token_Bs = token_Gs = None
            if token_As is not None:
                A_vals.update(token_As)
            if token_Bs is not None:
                B_vals.update(token_Bs)
            if token_Gs is not None:
                G_vals.update(token_Gs)
        except:
            logger.warn('ABG values not found for %s. Skipping.' % ltf)
            continue

    return A_vals, B_vals, G_vals
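

# A minimal usage sketch (not part of the original module): the value sets
# returned here are what write_train_data and tag_file below pass on to the
# feature encoder. The directory name and glob pattern are assumptions.
def example_collect_ABG(ltf_dir='ltf'):
    import glob
    ltfs = glob.glob(os.path.join(ltf_dir, '*.ltf.xml'))
    A_vals, B_vals, G_vals = get_ABG_value_sets(ltfs, logger)
    print('Distinct values found - A: {}, B: {}, G: {}'.format(
        len(A_vals), len(B_vals), len(G_vals)))
    return A_vals, B_vals, G_vals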
def tag_file(ltf, aligner, enc, chunker, modelf, tagged_dir, tagged_ext):
    """Extract features for tokenization in LTF file and tag named entities.

    Inputs
    ------
    ltf : str
        LTF file.

    aligner : align.Aligner
        Aligner instance used to obtain character onsets/offsets of
        discovered mentions.

    enc : features.Encoder
        Encoder instance for feature extraction.

    chunker : chunk.ChunkEncoder
        ChunkEncoder instance for obtaining token onsets/offsets of
        discovered mentions from tag sequences.

    modelf : str
        CRFSuite model file.

    tagged_dir : str
        Directory to which to output LAF files.

    tagged_ext : str
        Extension to use for output LAF files.
    """
    # Create working directory.
    temp_dir = tempfile.mkdtemp()

    # Load LTF.
    ltf_doc = load_doc(ltf, LTFDocument, logger)
    if ltf_doc is None:
        shutil.rmtree(temp_dir)
        return

    # Attempt tagging.
    try:
        # Extract tokens.
        tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized()
        txt = ltf_doc.text()
        spans = aligner.align(txt, tokens)

        # Extract features.
        featsf = os.path.join(temp_dir, 'feats.txt')
        feats = enc.get_feats(tokens)
        write_crfsuite_file(featsf, feats)

        # Tag.
        tagsf = os.path.join(temp_dir, 'tags.txt')
        cmd = ['/home/wangtianlu/local/bin/crfsuite', 'tag',
               '-m', modelf,
               featsf]
        with open(tagsf, 'w') as f:
            subprocess.call(cmd, stdout=f)

        # Write marginal probabilities alongside the input LTF.
        probf1 = ltf.replace('ltf', 'probs')
        probf = probf1.replace('test', 'probs')
        cmd_ = ['/home/wangtianlu/local/bin/crfsuite', 'tag',
                '-m', modelf,
                '-i',
                featsf]
        with open(probf, 'w') as f:
            subprocess.call(cmd_, stdout=f)

        # Load tagged output.
        with open(tagsf, 'r') as f:
            tags = [line.strip() for line in f]
        tags = tags[:len(tokens)]

        # Chunk tags.
        chunks = chunker.tags_to_chunks(tags)

        # Construct mentions.
        doc_id = ltf_doc.doc_id
        mentions = []
        n = 1
        for token_bi, token_ei, tag in chunks:
            if tag == 'O':
                continue

            # Assign entity id.
            entity_id = '%s-NE%d' % (doc_id, n)

            # Determine char onsets/offset for mention extent.
            start_char = token_onsets[token_bi]
            end_char = token_offsets[token_ei]

            # Finally, determine text of extent and append.
            extent_bi = spans[token_bi][0]
            extent_ei = spans[token_ei][1]
            extent = txt[extent_bi:extent_ei + 1]

            mentions.append([entity_id,   # entity id
                             tag,         # NE type
                             extent,      # extent text
                             start_char,  # extent char onset
                             end_char,    # extent char offset
                             ])
            n += 1

        # Write detected mentions to LAF file.
        bn = os.path.basename(ltf)
        laf = os.path.join(tagged_dir, bn.replace('.ltf.xml', tagged_ext))
        laf_doc = LAFDocument(mentions=mentions, lang=ltf_doc.lang, doc_id=doc_id)
        laf_doc.write_to_file(laf)
    except KeyError:
        logger.warn('Problem with %s. Skipping.' % ltf)

    # Clean up.
    shutil.rmtree(temp_dir)
def tag_file(ltf, aligner, enc, chunker, modelf, tagged_dir, tagged_ext,
             threshold, A_vals, B_vals, G_vals):
    """Extract features for tokenization in LTF file and tag named entities.

    Inputs
    ------
    ltf : str
        LTF file.

    aligner : align.Aligner
        Aligner instance used to obtain character onsets/offsets of
        discovered mentions.

    enc : features.Encoder
        Encoder instance for feature extraction.

    chunker : chunk.ChunkEncoder
        ChunkEncoder instance for obtaining token onsets/offsets of
        discovered mentions from tag sequences.

    modelf : str
        CRFSuite model file.

    tagged_dir : str
        Directory to which to output LAF files.

    tagged_ext : str
        Extension to use for output LAF files.
    """
    # Create working directory.
    temp_dir = tempfile.mkdtemp()

    # Load LTF.
    ltf_doc = load_doc(ltf, LTFDocument, logger)
    if ltf_doc is None:
        shutil.rmtree(temp_dir)
        return

    # Attempt tagging.
    try:
        # Extract tokens, falling back to plain tokenization for LTFs
        # without A/B/G annotations.
        try:
            tokens, token_ids, token_onsets, token_offsets, token_nums, \
                token_As, token_Bs, token_Gs, token_Fs, token_Js = ltf_doc.tokenizedWithABG()
        except:
            tokens, token_ids, token_onsets, token_offsets, token_nums = ltf_doc.tokenized()
            token_As = token_Bs = token_Gs = token_Fs = token_Js = None
        txt = ltf_doc.text()
        spans = aligner.align(txt, tokens)

        # Extract features.
        featsf = os.path.join(temp_dir, 'feats.txt')
        feats = enc.get_feats(tokens, token_nums, token_As, token_Bs,
                              token_Gs, token_Fs, token_Js, A_vals, B_vals,
                              G_vals)
        write_crfsuite_file(featsf, feats)
        shutil.copy(featsf, "featuresfile")  # DEBUG

        # Tag.
        tagsf = os.path.join(temp_dir, 'tags.txt')
        cmd = ['crfsuite', 'tag',
               '--marginal',      # outputs probability of each tag as an extra field in the tags file
               # '--probability', # outputs probability of the tag sequence at the top of the tags file
               '-m', modelf,
               featsf]
        with open(tagsf, 'w') as f:
            subprocess.call(cmd, stdout=f)
        shutil.copy(tagsf, "taggingprobs")  # DEBUG

        # Look for NEs in the tag file with marginal probs.
        # If the tag is 'O', keep it.
        # If the tag is anything else, keep it only if its marginal prob is
        # above threshold.
        tagsf2 = os.path.join(temp_dir, 'tags2.txt')

        def _check_BIL_sequence(tags, probs, threshold):
            """Helper for checking the tag sequence output in the section
            below. Checks for a full BI*L sequence, returning that sequence
            if at least half of its tags have a marginal probability at or
            above the threshold; otherwise returns a sequence of O's of equal
            length. If the sequence contains only one tag, that tag is
            returned as a U tag.
            """
            nextpart = ''
            if len(tags) < 1:
                logging.warn("Empty tag sequence submitted as BI*L sequence.")
            elif len(tags) == 1:
                logging.warn("Tag sequence of length 1 submitted as BI*L sequence.")
                # Compare probs, not abs vals of logprobs, hence >= and not <=.
                if probs[0] >= threshold:
                    nextpart = 'U{}'.format(tags[0][1:])
                else:
                    nextpart = 'O\n'
            else:
                if not (tags[0][0] == 'B' and tags[-1][0] == 'L'):
                    logging.warn('Incomplete BI*L sequence submitted.')
                    tags[0] = 'B{}'.format(tags[0][1:])
                    tags[-1] = 'L{}'.format(tags[-1][1:])
                count = 0
                for prob in probs:
                    if prob >= threshold:
                        count += 1
                if count >= len(probs) / 2.0:
                    nextpart = ''.join(tags)
                else:
                    nextpart = 'O\n' * len(tags)
            return nextpart

        # Retain or reject NE hypotheses based on probs and write new tags file.
        with open(tagsf2, 'w') as f_out:
            with open(tagsf, 'r') as f_in:
                NEtags = None
                NEprobs = None
                for line in f_in.read().split('\n'):
                    if ':' not in line:
                        continue
                    tag, prob = line.strip().split(':')
                    if tag[0] == 'O':
                        # If a sequence is in play, check it, then write the tag.
                        if NEtags:
                            f_out.write(_check_BIL_sequence(NEtags, NEprobs, threshold))
                            NEtags = None
                            NEprobs = None
                        f_out.write(tag + '\n')
                    elif tag[0] == 'U':
                        # If a sequence is in play, check it. Write the tag if
                        # prob >= threshold, else write O.
                        if NEtags:
                            f_out.write(_check_BIL_sequence(NEtags, NEprobs, threshold))
                            NEtags = None
                            NEprobs = None
                        # Compare probs, not abs vals of logprobs, hence >= and not <=.
                        if float(prob) >= threshold:
                            f_out.write(tag + '\n')
                        else:
                            f_out.write('O\n')
                    elif tag[0] == 'B':
                        # If a sequence is in play, check it, then start a new
                        # sequence with this tag.
                        if NEtags:
                            f_out.write(_check_BIL_sequence(NEtags, NEprobs, threshold))
                        NEtags = [tag + '\n']
                        NEprobs = [float(prob)]
                    elif tag[0] == 'I':
                        # If a sequence is in play, add the tag to it;
                        # otherwise start a new sequence with the tag as B.
                        if NEtags:
                            NEtags.append(tag + '\n')
                            NEprobs.append(float(prob))
                        else:
                            logging.warn("Found an out of sequence I tag.")
                            tag = 'B{}'.format(tag[1:])
                            NEtags = [tag + '\n']
                            NEprobs = [float(prob)]
                    elif tag[0] == 'L':
                        # If a sequence is in play, add the tag to it and check
                        # it; otherwise start a new sequence with the tag as B.
                        if NEtags:
                            NEtags.append(tag + '\n')
                            NEprobs.append(float(prob))
                            f_out.write(_check_BIL_sequence(NEtags, NEprobs, threshold))
                            NEtags = None
                            NEprobs = None
                        else:
                            logging.warn("Found an out of sequence L tag.")
                            tag = 'B{}'.format(tag[1:])
                            NEtags = [tag + '\n']
                            NEprobs = [float(prob)]
                if NEtags:
                    # Necessary if tagsf ends with an incomplete BI*L sequence.
                    f_out.write(_check_BIL_sequence(NEtags, NEprobs, threshold))
                    NEtags = None
                    NEprobs = None

        tagsf = tagsf2  # Set the checked tag file as the new tag file.
        shutil.copy(tagsf, "tagsfile")  # DEBUG

        # Load tagged output.
        with open(tagsf, 'r') as f:
            tags = [line.strip() for line in f]
        tags = tags[:len(tokens)]

        # Chunk tags.
        chunks = chunker.tags_to_chunks(tags)

        # Construct mentions.
        doc_id = ltf_doc.doc_id
        mentions = []
        n = 1
        for token_bi, token_ei, tag in chunks:
            if tag == 'O':
                continue

            # Assign entity id.
            entity_id = '%s-NE%d' % (doc_id, n)

            # Determine char onsets/offset for mention extent.
            start_char = token_onsets[token_bi]
            end_char = token_offsets[token_ei]

            # Finally, determine text of extent and append.
            extent_bi = spans[token_bi][0]
            extent_ei = spans[token_ei][1]
            extent = txt[extent_bi:extent_ei + 1]

            mentions.append([entity_id,   # entity id
                             tag,         # NE type
                             extent,      # extent text
                             start_char,  # extent char onset
                             end_char,    # extent char offset
                             ])
            n += 1

        # Write detected mentions to LAF file.
        bn = os.path.basename(ltf)
        laf = os.path.join(tagged_dir, bn.replace('.ltf.xml', tagged_ext))
        laf_doc = LAFDocument(mentions=mentions, lang=ltf_doc.lang, doc_id=doc_id)
        laf_doc.write_to_file(laf)
    except:
        logger.warn('Problem with %s. Skipping.' % ltf)

    # Clean up.
    shutil.rmtree(temp_dir)
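

# The probability filter above is interleaved with file I/O; the rule it
# implements can be seen in isolation in the following self-contained sketch
# (not part of the original module). A BI*L hypothesis is retained only when
# at least half of its tags have a marginal probability at or above the
# threshold, and a U hypothesis is retained only when its own probability
# clears the threshold. The toy input imitates `crfsuite tag --marginal`
# output ("TAG:prob"); out-of-sequence I/L handling is omitted here.
def filter_hypotheses(lines, threshold):
    out, span = [], []
    for line in lines:
        tag, prob = line.split(':')
        prob = float(prob)
        if tag[0] in ('B', 'I'):
            span.append((tag, prob))
        elif tag[0] == 'L':
            span.append((tag, prob))
            keep = sum(p >= threshold for _, p in span) >= len(span) / 2.0
            out.extend(t if keep else 'O' for t, _ in span)
            span = []
        elif tag[0] == 'U':
            out.append(tag if prob >= threshold else 'O')
        else:  # 'O'
            out.append(tag)
    return out

# Example:
#   filter_hypotheses(['B-PER:0.9', 'I-PER:0.4', 'L-PER:0.8', 'O:0.99',
#                      'U-LOC:0.3'], 0.5)
#   -> ['B-PER', 'I-PER', 'L-PER', 'O', 'O']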
def tag_file(ltf, aligner, enc, chunker, modelf, tagged_dir, tagged_ext):
    """Extract features for tokenization in LTF file and tag named entities.

    Inputs
    ------
    ltf : str
        LTF file.

    aligner : align.Aligner
        Aligner instance used to obtain character onsets/offsets of
        discovered mentions.

    enc : features.Encoder
        Encoder instance for feature extraction.

    chunker : chunk.ChunkEncoder
        ChunkEncoder instance for obtaining token onsets/offsets of
        discovered mentions from tag sequences.

    modelf : str
        CRFSuite model file.

    tagged_dir : str
        Directory to which to output LAF files.

    tagged_ext : str
        Extension to use for output LAF files.
    """
    # Create working directory.
    temp_dir = tempfile.mkdtemp()

    # Load LTF.
    ltf_doc = load_doc(ltf, LTFDocument, logger)
    if ltf_doc is None:
        shutil.rmtree(temp_dir)
        return

    # Attempt tagging.
    try:
        # Extract tokens.
        tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized()
        txt = ltf_doc.text()
        spans = aligner.align(txt, tokens)

        # Extract features.
        featsf = os.path.join(temp_dir, 'feats.txt')
        feats = enc.get_feats(tokens)
        write_crfsuite_file(featsf, feats)

        # Tag.
        tagsf = os.path.join(temp_dir, 'tags.txt')
        cmd = ['crfsuite', 'tag', '-m', modelf, featsf]
        with open(tagsf, 'w') as f:
            subprocess.call(' '.join(cmd), shell=True, stdout=f)

        # Load tagged output.
        with open(tagsf, 'r') as f:
            tags = [line.strip() for line in f]
        tags = tags[:len(tokens)]

        # Chunk tags.
        chunks = chunker.tags_to_chunks(tags)

        # Construct mentions.
        doc_id = ltf_doc.doc_id
        mentions = []
        n = 1
        for token_bi, token_ei, tag in chunks:
            if tag == 'O':
                continue

            # Assign entity id.
            entity_id = '%s-NE%d' % (doc_id, n)

            # Determine char onsets/offset for mention extent.
            start_char = token_onsets[token_bi]
            end_char = token_offsets[token_ei]

            # Finally, determine text of extent and append.
            extent_bi = spans[token_bi][0]
            extent_ei = spans[token_ei][1]
            extent = txt[extent_bi:extent_ei + 1]

            mentions.append([entity_id,   # entity id
                             tag,         # NE type
                             extent,      # extent text
                             start_char,  # extent char onset
                             end_char,    # extent char offset
                             ])
            n += 1

        # Write detected mentions to LAF file.
        bn = os.path.basename(ltf)
        laf = os.path.join(tagged_dir, bn.replace('.ltf.xml', tagged_ext))
        laf_doc = LAFDocument(mentions=mentions, lang=ltf_doc.lang, doc_id=doc_id)
        laf_doc.write_to_file(laf)
    except (KeyError, ValueError):
        logger.warn('Problem with %s. Skipping.' % ltf)

    # Clean up.
    shutil.rmtree(temp_dir)
def write_train_data(lafs, ltf_dir, enc, trainf):
    """Extract features and target labels for each LTF/LAF pair and write
    to disk in CRFSuite data format.

    For details regarding this format, consult

        http://www.chokkan.org/software/crfsuite/manual.html

    Inputs
    ------
    lafs : list of str
        Paths to LAF files.

    ltf_dir : str
        Directory to search for LTF files.

    enc : features.Encoder
        Feature encoder.

    trainf : str
        CRFSuite training file.
    """
    with open(trainf, 'w') as f:
        for laf in lafs:
            # Check that the LTF and LAF are valid.
            bn = os.path.basename(laf)
            ltf = os.path.join(ltf_dir, bn.replace('.laf.xml', '.ltf.xml'))
            laf_doc = load_doc(laf, LAFDocument, logger)
            ltf_doc = load_doc(ltf, LTFDocument, logger)
            if laf_doc is None or ltf_doc is None:
                continue

            # Extract features/targets.
            try:
                # Extract tokens.
                tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized()

                # Convert mentions to the format expected by the encoder;
                # that is, (tag, token_onset, token_offset).
                mentions = laf_doc.mentions()
                if len(mentions) == 0:
                    mentions_ = []
                else:
                    # Map to the minimal enclosing span of tokens in the
                    # supplied LTF.
                    entity_ids, tags, extents, char_onsets, char_offsets = zip(*mentions)
                    mention_onsets, mention_offsets = convert_extents(
                        char_onsets, char_offsets, token_onsets, token_offsets)
                    mentions_ = list(zip(tags, mention_onsets, mention_offsets))

                    # Eliminate overlapping mentions, retaining whichever
                    # is first when sorted in ascending order by (onset, offset).
                    sort_mentions(mentions_)
                    prev_mention_offset = -1
                    temp_mentions_ = []
                    for tag, mention_onset, mention_offset in mentions_:
                        if mention_onset > prev_mention_offset:
                            temp_mentions_.append([tag, mention_onset, mention_offset])
                            prev_mention_offset = mention_offset
                    mentions_ = temp_mentions_

                # Extract features/targets and write to file in CRFSuite
                # format.
                feats, targets = enc.get_feats_targets(tokens, mentions_)
            except KeyError:
                logger.warn('Feature extraction failed for %s. Skipping.' % laf)
                continue

            # Write to file.
            write_crfsuite_file(f, feats, targets)
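

# A minimal end-to-end sketch (not part of the original module) of how these
# pieces are assumed to fit together: build the training file, train a model
# with the crfsuite CLI, then tag each LTF. Directory names and the crfsuite
# hyperparameters are illustrative assumptions; aligner, enc, and chunker are
# assumed to have been constructed as described in the docstrings above.
def example_train_and_tag(aligner, enc, chunker,
                          laf_dir='train_laf', ltf_dir='ltf',
                          tagged_dir='tagged',
                          trainf='train.crfsuite.txt', modelf='ne.model'):
    import glob
    lafs = glob.glob(os.path.join(laf_dir, '*.laf.xml'))
    write_train_data(lafs, ltf_dir, enc, trainf)
    # Train a CRF; the lbfgs/c2 settings are illustrative, not prescribed.
    subprocess.call(['crfsuite', 'learn', '-m', modelf,
                     '-a', 'lbfgs', '-p', 'c2=1.0', trainf])
    # Tag every LTF with the trained model.
    for ltf in glob.glob(os.path.join(ltf_dir, '*.ltf.xml')):
        tag_file(ltf, aligner, enc, chunker, modelf, tagged_dir, '.laf.xml')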
def write_train_data(lafs, ltf_dir, enc, trainf):
    """Extract features and target labels for each LTF/LAF pair and write
    to disk in CRFSuite data format.

    For details regarding this format, consult

        http://www.chokkan.org/software/crfsuite/manual.html

    Inputs
    ------
    lafs : list of str
        Paths to LAF files.

    ltf_dir : str
        Directory to search for LTF files.

    enc : features.Encoder
        Feature encoder.

    trainf : str
        CRFSuite training file.
    """
    with open(trainf, 'w') as f:
        # Collect the A/B/G value sets over all LTFs once, up front.
        ltfs = []
        for laf in lafs:
            bn = os.path.basename(laf)
            ltf = os.path.join(ltf_dir, bn.replace('.laf.xml', '.ltf.xml'))
            ltfs.append(ltf)
        A_vals, B_vals, G_vals = get_ABG_value_sets(ltfs, logger)

        print(
            "Found the following number of values for ABG:\nA: {}\nB: {}\nG: {}\n"
            .format(len(A_vals), len(B_vals), len(G_vals)))

        for laf in lafs:
            # Check that the LTF and LAF are valid.
            bn = os.path.basename(laf)
            ltf = os.path.join(ltf_dir, bn.replace('.laf.xml', '.ltf.xml'))
            laf_doc = load_doc(laf, LAFDocument, logger)
            ltf_doc = load_doc(ltf, LTFDocument, logger)
            if laf_doc is None or ltf_doc is None:
                continue

            # Extract features/targets.
            try:
                # Extract tokens, falling back to plain tokenization for LTFs
                # without A/B/G annotations.
                try:
                    tokens, token_ids, token_onsets, token_offsets, token_nums, \
                        token_As, token_Bs, token_Gs, token_Fs, token_Js = ltf_doc.tokenizedWithABG()
                except:
                    tokens, token_ids, token_onsets, token_offsets, token_nums = ltf_doc.tokenized()
                    token_As = token_Bs = token_Gs = token_Fs = token_Js = None

                # Convert mentions to the format expected by the encoder;
                # that is, (tag, token_onset, token_offset).
                mentions = laf_doc.mentions()
                if len(mentions) == 0:
                    mentions_ = []
                else:
                    # Map to the minimal enclosing span of tokens in the
                    # supplied LTF.
                    entity_ids, tags, extents, char_onsets, char_offsets = zip(*mentions)
                    mention_onsets, mention_offsets = convert_extents(
                        char_onsets, char_offsets, token_onsets, token_offsets)
                    mentions_ = list(zip(tags, mention_onsets, mention_offsets))

                    # Eliminate overlapping mentions, retaining whichever
                    # is first when sorted in ascending order by (onset, offset).
                    sort_mentions(mentions_)
                    prev_mention_offset = -1
                    temp_mentions_ = []
                    for tag, mention_onset, mention_offset in mentions_:
                        if mention_onset > prev_mention_offset:
                            temp_mentions_.append([tag, mention_onset, mention_offset])
                            prev_mention_offset = mention_offset
                    mentions_ = temp_mentions_

                # Extract features/targets and write to file in CRFSuite
                # format.
                feats, targets = enc.get_feats_targets(tokens, mentions_, token_nums,
                                                       token_As, token_Bs, token_Gs,
                                                       token_Fs, token_Js,
                                                       A_vals, B_vals, G_vals)
            except:
                logger.warn('Feature extraction failed for %s. Skipping.' % laf)
                continue

            # Write to file.
            write_crfsuite_file(f, feats, targets)
def tag_file(ltf, aligner, enc, chunker, modelf, tagged_dir, tagged_ext):
    """Extract features for tokenization in LTF file and tag named entities.

    Inputs
    ------
    ltf : str
        LTF file.

    aligner : align.Aligner
        Aligner instance used to obtain character onsets/offsets of
        discovered mentions.

    enc : features.Encoder
        Encoder instance for feature extraction.

    chunker : chunk.ChunkEncoder
        ChunkEncoder instance for obtaining token onsets/offsets of
        discovered mentions from tag sequences.

    modelf : str
        CRFSuite model file.

    tagged_dir : str
        Directory to which to output LAF files.

    tagged_ext : str
        Extension to use for output LAF files.
    """
    # Create working directory.
    temp_dir = tempfile.mkdtemp()

    # Load LTF.
    ltf_doc = load_doc(ltf, LTFDocument, logger)
    if ltf_doc is None:
        shutil.rmtree(temp_dir)
        return

    # Attempt tagging.
    try:
        # Extract tokens.
        tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized()
        txt = ltf_doc.text()
        spans = aligner.align(txt, tokens)

        # Extract features.
        featsf = os.path.join(temp_dir, 'feats.txt')
        feats = enc.get_feats(tokens)
        write_crfsuite_file(featsf, feats)

        # Tag.
        tagsf = os.path.join(temp_dir, 'tags.txt')
        cmd = ['/home/wangtianlu/local/bin/crfsuite', 'tag', '-m', modelf, featsf]
        with open(tagsf, 'w') as f:
            subprocess.call(cmd, stdout=f)

        # Write marginal probabilities alongside the input LTF.
        probf1 = ltf.replace('ltf', 'probs')
        probf = probf1.replace('test', 'probs')
        cmd_ = ['/home/wangtianlu/local/bin/crfsuite', 'tag', '-m', modelf, '-i', featsf]
        with open(probf, 'w') as f:
            subprocess.call(cmd_, stdout=f)

        # Load tagged output.
        with open(tagsf, 'r') as f:
            tags = [line.strip() for line in f]
        tags = tags[:len(tokens)]

        # Chunk tags.
        chunks = chunker.tags_to_chunks(tags)

        # Construct mentions.
        doc_id = ltf_doc.doc_id
        mentions = []
        n = 1
        for token_bi, token_ei, tag in chunks:
            if tag == 'O':
                continue

            # Assign entity id.
            entity_id = '%s-NE%d' % (doc_id, n)

            # Determine char onsets/offset for mention extent.
            start_char = token_onsets[token_bi]
            end_char = token_offsets[token_ei]

            # Finally, determine text of extent and append.
            extent_bi = spans[token_bi][0]
            extent_ei = spans[token_ei][1]
            extent = txt[extent_bi:extent_ei + 1]

            mentions.append([entity_id,   # entity id
                             tag,         # NE type
                             extent,      # extent text
                             start_char,  # extent char onset
                             end_char,    # extent char offset
                             ])
            n += 1

        # Write detected mentions to LAF file.
        bn = os.path.basename(ltf)
        laf = os.path.join(tagged_dir, bn.replace('.ltf.xml', tagged_ext))
        laf_doc = LAFDocument(mentions=mentions, lang=ltf_doc.lang, doc_id=doc_id)
        laf_doc.write_to_file(laf)
    except KeyError:
        logger.warn('Problem with %s. Skipping.' % ltf)

    # Clean up.
    shutil.rmtree(temp_dir)
def write_train_data(lafs, ltf_dir, enc, trainf):
    """Extract features and target labels for each LTF/LAF pair and write
    to disk in CRFSuite data format.

    For details regarding this format, consult

        http://www.chokkan.org/software/crfsuite/manual.html

    Inputs
    ------
    lafs : list of str
        Paths to LAF files.

    ltf_dir : str
        Directory to search for LTF files.

    enc : features.Encoder
        Feature encoder.

    trainf : str
        CRFSuite training file.
    """
    with open(trainf, 'w') as f:
        for laf in lafs:
            # Check that the LTF and LAF are valid.
            bn = os.path.basename(laf)
            ltf = os.path.join(ltf_dir, bn.replace('.laf.xml', '.ltf.xml'))
            laf_doc = load_doc(laf, LAFDocument, logger)
            ltf_doc = load_doc(ltf, LTFDocument, logger)
            if laf_doc is None or ltf_doc is None:
                continue

            # Extract features/targets.
            try:
                # Extract tokens.
                tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized()

                # Convert mentions to the format expected by the encoder;
                # that is, (tag, token_onset, token_offset).
                mentions = laf_doc.mentions()
                if len(mentions) == 0:
                    mentions_ = []
                else:
                    # Map to the minimal enclosing span of tokens in the
                    # supplied LTF.
                    entity_ids, tags, extents, char_onsets, char_offsets = zip(*mentions)
                    mention_onsets, mention_offsets = convert_extents(
                        char_onsets, char_offsets, token_onsets, token_offsets)
                    mentions_ = list(zip(tags, mention_onsets, mention_offsets))

                    # Eliminate overlapping mentions, retaining whichever
                    # is first when sorted in ascending order by (onset, offset).
                    sort_mentions(mentions_)
                    prev_mention_offset = -1
                    temp_mentions_ = []
                    for tag, mention_onset, mention_offset in mentions_:
                        if mention_onset > prev_mention_offset:
                            temp_mentions_.append([tag, mention_onset, mention_offset])
                            prev_mention_offset = mention_offset
                    mentions_ = temp_mentions_

                # Extract features/targets and write to file in CRFSuite
                # format.
                feats, targets = enc.get_feats_targets(tokens, mentions_)
            except KeyError:
                logger.warn('Feature extraction failed for %s. Skipping.' % laf)
                continue

            # Write to file.
            write_crfsuite_file(f, feats, targets)