Example #1
def calc_stats(sys_laf, ref_dir, ltf_dir):
    """Return hits, false alarms, and misses for system output2 LAF relative
    to reference LAF located in ref_dir.
    
    Inputs
    ------
    sys_laf : str
        LAF file containing system output.

    ref_dir : str
        Directory containing reference LAF files.

    ltf_dir : str
        Directory containing LTF files.
    """
    # Check that LTF and system and reference LAF are valid.
    sys_doc = load_doc(sys_laf, LAFDocument, logger)
    bn = os.path.basename(sys_laf)
    ref_laf = os.path.join(ref_dir, bn)
    ref_doc = load_doc(ref_laf, LAFDocument, logger)
    ltf = os.path.join(ltf_dir, bn.replace('.laf.xml', '.ltf.xml'))
    ltf_doc = load_doc(ltf, LTFDocument, logger)
    if not all([sys_doc, ref_doc, ltf_doc]):
        return 0.0, 0.0, 0.0

    # Calculate hits, misses, and false alarms.
    try:
        tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized()

        # Convert mentions to (token_onset, token_offset, tag) format.
        sys_mentions = sys_doc.mentions()
        if len(sys_mentions) > 0:
            sys_ids, sys_tags, sys_extents, sys_char_onsets, sys_char_offsets = zip(*sys_mentions)
            sys_mention_onsets, sys_mention_offsets = convert_extents(sys_char_onsets, sys_char_offsets,
                                                                      token_onsets, token_offsets)
            sys_mentions = zip(sys_tags, sys_mention_onsets, sys_mention_offsets)
            sys_mentions = set(map(tuple, sys_mentions))
        else:
            sys_mentions = set()

        ref_mentions = ref_doc.mentions()
        if len(ref_mentions) > 0:
            ref_ids, ref_tags, ref_extents, ref_char_onsets, ref_char_offsets = zip(*ref_mentions)
            ref_mention_onsets, ref_mention_offsets = convert_extents(ref_char_onsets, ref_char_offsets,
                                                                      token_onsets, token_offsets)
            ref_mentions = zip(ref_tags, ref_mention_onsets, ref_mention_offsets)
            ref_mentions = set(map(tuple, ref_mentions))
        else:
            ref_mentions = set()

        # Calculate.
        n_hit = len(sys_mentions & ref_mentions)
        n_fa = len(sys_mentions - ref_mentions)
        n_miss = len(ref_mentions - sys_mentions)
    except:
        logger.warn('Scoring failed for %s. Skipping.' % ref_laf)
        n_hit = n_fa = n_miss = 0

    return n_hit, n_fa, n_miss
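
The hit, false-alarm, and miss counts returned by calc_stats map onto precision, recall, and F1 in the usual way. A minimal sketch (the prf1 helper and the aggregation loop are illustrative, not part of the original code):

def prf1(n_hit, n_fa, n_miss):
    """Compute precision, recall, and F1 from hit/false-alarm/miss counts."""
    precision = float(n_hit) / (n_hit + n_fa) if (n_hit + n_fa) > 0 else 0.0
    recall = float(n_hit) / (n_hit + n_miss) if (n_hit + n_miss) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
    return precision, recall, f1

# Aggregating over a collection of documents scored with calc_stats:
# totals = [calc_stats(laf, ref_dir, ltf_dir) for laf in sys_lafs]
# n_hit, n_fa, n_miss = map(sum, zip(*totals))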
Example #2
def updateNEdirs(prev_laf_dir, temp_laf_dir, new_laf_dir):
    """Merge NE mentions from the previous iteration's LAF files with those
    found this iteration, write merged LAF files to new_laf_dir, and report
    whether any new mentions were added.
    """

    changeinNEs = False

    for fn in os.listdir(prev_laf_dir):
        if fn.endswith('laf.xml'):
            prev_laf = os.path.join(prev_laf_dir, fn)
            temp_laf = os.path.join(temp_laf_dir, fn)
            try:
                assert os.path.exists(temp_laf)
            except AssertionError:
                logging.warn("{} processed last iteration but not this one".format(fn))
    for fn in os.listdir(temp_laf_dir):
        if fn.endswith('laf.xml'):
            prev_laf = os.path.join(prev_laf_dir, fn)
            temp_laf = os.path.join(temp_laf_dir, fn)
            try:
                assert os.path.exists(prev_laf)
            except AssertionError:
                logging.warn("{} processed this iteration but not the last.  Skipping...".format(fn))
                continue
            
            prev_laf_doc = load_doc(prev_laf, LAFDocument, logger)
            temp_laf_doc = load_doc(temp_laf, LAFDocument, logger)
            doc_id = prev_laf_doc.doc_id

            prev_mentions = [[tag, extent, start_char, end_char] for [entity_id, tag, extent, start_char, end_char] in prev_laf_doc.mentions()]
            prev_spans = [(start_char, end_char) for [tag, extent, start_char, end_char] in prev_mentions]
            temp_mentions = [[tag, extent, start_char, end_char] for [entity_id, tag, extent, start_char, end_char] in temp_laf_doc.mentions()]
            mentions = []
            for m in prev_mentions:
                mentions.append(m)
            for m in temp_mentions:
                if (m[2], m[3]) not in prev_spans:
                    mentions.append(m)
                    changeinNEs = True

            # Sort new mentions list by start_char then end_char
            mentions = sorted(mentions, key = lambda x: (int(x[2]), int(x[3])))

            # Assign fresh entity ids to the merged, sorted mentions.
            new_mentions = []
            n = 1
            for tag, extent, start_char, end_char in mentions:
                entity_id = '{}-NE{}'.format(doc_id, n)
                new_mentions.append([entity_id, tag, extent, start_char, end_char])
                n += 1

            laf = os.path.join(new_laf_dir, fn)
            laf_doc = LAFDocument(mentions=new_mentions, lang=temp_laf_doc.lang,
                                  doc_id=doc_id)
            laf_doc.write_to_file(laf)

    return new_laf_dir, changeinNEs
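
A sketch of how updateNEdirs might drive an iterative retagging loop until the mention set stops changing (the directory naming, iteration cap, and the retagging step are illustrative assumptions, not part of the original code):

max_iterations = 10  # illustrative cap on self-training rounds
prev_laf_dir = 'laf_iter0'
for iteration in range(1, max_iterations + 1):
    temp_laf_dir = 'laf_iter{}_raw'.format(iteration)
    new_laf_dir = 'laf_iter{}'.format(iteration)
    # ... retag the collection into temp_laf_dir here ...
    prev_laf_dir, changed = updateNEdirs(prev_laf_dir, temp_laf_dir, new_laf_dir)
    if not changed:
        break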
Example #3
def get_ABG_value_sets(ltfs, logger):
    """
    Scan through all LTF files in a directory and return the lists
    of values found for each of A, B, and G.
    Since uhhmm determines the number of values for each of this categories
    at runtime, it is not possible to know before retrieving the output of the system.
    
    """

    A_vals = set()
    B_vals = set()
    G_vals = set()

    for ltf in ltfs:
        # Check that the LTF is valid.
        ltf_doc = load_doc(ltf, LTFDocument, logger)
        if ltf_doc is None:
            continue

        # Extract features/targets.
        try:
            # Extract tokens.
            try:
                (tokens, token_ids, token_onsets, token_offsets, token_nums,
                 token_As, token_Bs, token_Gs, token_Fs,
                 token_Js) = ltf_doc.tokenizedWithABG()
            except:
                (tokens, token_ids, token_onsets, token_offsets,
                 token_nums) = ltf_doc.tokenized()
                token_As = token_Bs = token_Gs = None
            if token_As is not None:
                A_vals.update(token_As)
            if token_Bs is not None:
                B_vals.update(token_Bs)
            if token_Gs is not None:
                G_vals.update(token_Gs)
        except:
            logger.warn('ABG values not found for %s. Skipping.' % ltf)
            continue

    return A_vals, B_vals, G_vals
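
The point of collecting these value sets up front is that a fixed inventory is needed before categorical A/B/G values can be turned into fixed-length indicator features. A minimal sketch of that idea (the one_hot helper is illustrative, not part of features.Encoder):

def one_hot(value, value_set):
    """Map a categorical value onto a fixed-length 0/1 vector over value_set."""
    return [1 if value == v else 0 for v in sorted(value_set)]

# Example: with A_vals == {'A0', 'A1', 'A2'}, one_hot('A1', A_vals) == [0, 1, 0]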
Example #4
def tag_file(ltf, aligner, enc, chunker, modelf, tagged_dir, tagged_ext):
    """Extract features for tokenization in LTF file and tag named entities.

    Inputs
    ------
    ltf : str
        LTF file.

    aligner : align.Aligner
        Aligner instance used to obtain character onsets/offsets of discovered
        mentions.

    enc : features.Encoder
        Encoder instance for feature extraction.

    chunker : chunk.ChunkEncoder
        ChunkEncoder instance for obtaining token onsets/offsets of discovered
        mentions from tag sequences.

    modelf : str
        CRFSuite model file.

    tagged_dir : str
        Directory to which to output LAF files.

    tagged_ext : str
        Extension to use for output LAF files.
    """
    # Create working directory.
    temp_dir = tempfile.mkdtemp()
    # Load LTF.
    #print ltf  # todo
    ltf_doc = load_doc(ltf, LTFDocument, logger)
    if ltf_doc is None:
        shutil.rmtree(temp_dir)
        return
    # Attempt tagging.
    try:
        # Extract tokens.
        tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized()
        txt = ltf_doc.text()
        spans = aligner.align(txt, tokens)
        # Extract features
        featsf = os.path.join(temp_dir, 'feats.txt')
        feats = enc.get_feats(tokens)
        write_crfsuite_file(featsf, feats)
        # Tag.
        # print "tmep_dir"+temp_dir
        tagsf = os.path.join(temp_dir, 'tags.txt')
        #probf = os.path.join(temp_dir, 'probs.txt')
        cmd = [
            '/home/wangtianlu/local/bin/crfsuite', 'tag', '-m', modelf, featsf
        ]
        with open(tagsf, 'w') as f:
            subprocess.call(cmd, stdout=f)
        # Load tagged output.

        probf1 = ltf.replace('ltf', 'probs')
        probf = probf1.replace('test', 'probs')

        # print probf
        cmd_ = [
            '/home/wangtianlu/local/bin/crfsuite', 'tag', '-m', modelf, '-i',
            featsf
        ]
        with open(probf, 'w') as f:
            subprocess.call(cmd_, stdout=f)

        # maxprobf = ltf.replace('ltf', 'maxprobs')
        #
        # cmd_ = ['/Users/koala/Documents/lab/Blender/LORELEI/active_learning/ne-tagger/lib/crf/bin/crfsuite','tag',
        #        '-m', modelf, '-i',
        #        featsf]
        # with open(maxprobf, 'w') as f:
        #     subprocess.call(cmd_, stdout=f)

        with open(tagsf, 'r') as f:
            tags = [line.strip() for line in f]
            # print len(tags)  # todo
            tags = tags[:len(tokens)]
            # print len(tags)  # todo
            # print 'this is tags'
            # print tags # todo
        # Chunk tags.
        chunks = chunker.tags_to_chunks(tags)  # todo:bughere
        # Construct mentions.
        doc_id = ltf_doc.doc_id
        mentions = []
        n = 1
        for token_bi, token_ei, tag in chunks:
            if tag == 'O':
                continue

            # Assign entity id.
            entity_id = '%s-NE%d' % (doc_id, n)

            # Determine char onsets/offset for mention extent.
            start_char = token_onsets[token_bi]
            end_char = token_offsets[token_ei]

            # Finally, determine text of extent and append.
            extent_bi = spans[token_bi][0]
            extent_ei = spans[token_ei][1]
            extent = txt[extent_bi:extent_ei + 1]
            mentions.append([
                entity_id,  # entity id
                tag,  # NE type
                extent,  # extent text
                start_char,  # extent char onset
                end_char,  # extent char offset
            ])

            n += 1

        # Write detected mentions to LAF file.
        bn = os.path.basename(ltf)
        laf = os.path.join(tagged_dir, bn.replace('.ltf.xml', tagged_ext))
        laf_doc = LAFDocument(mentions=mentions,
                              lang=ltf_doc.lang,
                              doc_id=doc_id)
        laf_doc.write_to_file(laf)
    except KeyError:
        logger.warn('Problem with %s. Skipping.' % ltf)

    # Clean up.
    shutil.rmtree(temp_dir)
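
A sketch of how tag_file might be driven over a directory of LTF files (the glob pattern and the already-constructed ltf_dir, aligner, enc, chunker, modelf, and tagged_dir are assumptions about the surrounding script):

import glob
import os

for ltf in sorted(glob.glob(os.path.join(ltf_dir, '*.ltf.xml'))):
    tag_file(ltf, aligner, enc, chunker, modelf, tagged_dir, '.laf.xml')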
Example #5
def tag_file(ltf, aligner, enc, chunker, modelf, tagged_dir, tagged_ext,
             threshold, A_vals, B_vals, G_vals):
    """Extract features for tokenization in LTF file and tag named entities.

    Inputs
    ------
    ltf : str
        LTF file.

    aligner : align.Aligner
        Aligner instance used to obtain character onsets/offsets of discovered
        mentions.

    enc : features.Encoder
        Encoder instance for feature extraction.

    chunker : chunk.ChunkEncoder
        ChunkEncoder instance for obtaining token onsets/offsets of discovered
        mentions from tag sequences.

    modelf : str
        CRFSuite model file.

    tagged_dir : str
        Directory to which to output LAF files.

    tagged_ext : str
        Extension to use for output LAF files.
    """

    # Create working directory.
    temp_dir = tempfile.mkdtemp()

    # Load LTF.
    ltf_doc = load_doc(ltf, LTFDocument, logger)
    if ltf_doc is None:
        shutil.rmtree(temp_dir)
        return

    # Attempt tagging.
    try:
        # Extract tokens.
        try:
            (tokens, token_ids, token_onsets, token_offsets, token_nums,
             token_As, token_Bs, token_Gs, token_Fs,
             token_Js) = ltf_doc.tokenizedWithABG()
        except:
            (tokens, token_ids, token_onsets, token_offsets,
             token_nums) = ltf_doc.tokenized()
            token_As = token_Bs = token_Gs = token_Fs = token_Js = None
        txt = ltf_doc.text()
        spans = aligner.align(txt, tokens)

        # Extract features
        featsf = os.path.join(temp_dir, 'feats.txt')
        #        feats = enc.get_feats(tokens, token_As, token_Bs, token_Gs);
        feats = enc.get_feats(tokens, token_nums, token_As, token_Bs, token_Gs,
                              token_Fs, token_Js, A_vals, B_vals, G_vals)
        write_crfsuite_file(featsf, feats)

        shutil.copy(featsf, "featuresfile")  #DEBUG

        # Tag.
        tagsf = os.path.join(temp_dir, 'tags.txt')
        cmd = [
            'crfsuite',
            'tag',
            '--marginal',  # outputs probability of each tag as extra field in tagsfile
            #               '--probability',        # outputs probability of tag sequence at top of tagsfile
            '-m',
            modelf,
            featsf
        ]
        with open(tagsf, 'w') as f:
            subprocess.call(cmd, stdout=f)

        shutil.copy(tagsf, "taggingprobs")  #DEBUG

        # Look for NEs in the tagfile with marginal probs.
        # If the tag is 'O', keep it.
        # If the tag is anything else, keep if marginal prob is above threshold.

        tagsf2 = os.path.join(temp_dir, 'tags2.txt')
        """
        Helper method for checking the tag sequence output in the section below. 
        Checks for full BI*L sequence, returning that seqeunce if mean logprob exceeds 
        threshold logprob - returns sequence of O's of equal length otherwise.
        If the seqeuence contains only one tag, that tag is returned as a U tag.
        
        """
        def _check_BIL_sequence(tags, probs, threshold):

            nextpart = ''

            if len(tags) < 1:

                logging.warn("Empty tag sequence submitted as BI*L sequence.")

            elif len(tags) == 1:

                logging.warn(
                    "Tag sequence of length 1 submitted as BI*L sequence.")

                if probs[0] >= threshold:  # compare probs, not abs vals of logprobs, hence >= and not <=

                    nextpart = 'U{}'.format(tags[0][1:])

                else:

                    nextpart = 'O\n'

            else:

                try:

                    assert tags[0][0] == 'B' and tags[-1][0] == 'L'

                except AssertionError:

                    logging.warn('Incomplete BI*L sequence submitted.')
                    tags[0] = 'B{}'.format(tags[0][1:])
                    tags[-1] = 'L{}'.format(tags[-1][1:])

#                NElogProb = reduce(lambda x, y: (log(x) * -1) + (log(y) * -1), probs)/len(probs)
#                if NElogProb <= (log(threshold) * -1): # compare abs vals of logprobs, hence <= and not >=
                count = 0
                for prob in probs:
                    if prob >= threshold:
                        count += 1

                if count >= len(probs) / 2.0:

                    nextpart = ''.join(tags)

                else:

                    nextpart = 'O\n' * len(tags)

            return nextpart

        """ Retain or reject NE hypotheses based on probs and write new tags file """
        with open(tagsf2, 'w') as f_out:
            with open(tagsf, 'r') as f_in:
                NEtags = None
                NEprobs = None
                for line in f_in.read().split('\n'):

                    try:

                        assert ':' in line

                        tag, prob = line.strip().split(':')

                        if tag[0] == 'O':
                            # if seq in play, check seq
                            # write tag

                            if NEtags:

                                f_out.write(
                                    _check_BIL_sequence(
                                        NEtags, NEprobs, threshold))
                                NEtags = None
                                NEprobs = None

                            f_out.write(tag + '\n')

                        elif tag[0] == 'U':
                            # if seq in play, check seq
                            # if prob >= threshold, write tag
                            # else, write tag = O

                            if NEtags:

                                f_out.write(
                                    _check_BIL_sequence(
                                        NEtags, NEprobs, threshold))
                                NEtags = None
                                NEprobs = None

                            if float(
                                    prob
                            ) >= threshold:  # compare probs, not abs vals of logprobs, hence >= and not <=

                                f_out.write(tag + '\n')

                            else:

                                f_out.write('O\n')

                        elif tag[0] == 'B':
                            # if seq in play, check seq
                            # start new seq with tag

                            if NEtags:

                                f_out.write(
                                    _check_BIL_sequence(
                                        NEtags, NEprobs, threshold))

                            NEtags = [tag + '\n']
                            NEprobs = [float(prob)]

                        elif tag[0] == 'I':
                            # if seq in play, add tag to seq
                            # else, start new seq with tag = B

                            if NEtags:

                                NEtags.append(tag + '\n')
                                NEprobs.append(float(prob))

                            else:

                                logging.warn("Found an out of sequence I tag.")
                                tag = 'B{}'.format(tag[1:])
                                NEtags = [tag + '\n']
                                NEprobs = [float(prob)]

                        elif tag[0] == 'L':
                            # if seq in play, add tag to seq and check seq
                            # else, start new seq with tag = B

                            if NEtags:

                                NEtags.append(tag + '\n')
                                NEprobs.append(float(prob))
                                f_out.write(
                                    _check_BIL_sequence(
                                        NEtags, NEprobs, threshold))
                                NEtags = None
                                NEprobs = None

                            else:

                                logging.warn("Found an out of sequence L tag.")
                                tag = 'B{}'.format(tag[1:])
                                NEtags = [tag + '\n']
                                NEprobs = [float(prob)]

                    except AssertionError:

                        pass
#                        logging.warn('No ":" in line {}'.format(line))  #DEBUG

                if NEtags:  # Necessary if tagsf ends with an incomplete BI*L sequence

                    f_out.write(_check_BIL_sequence(NEtags, NEprobs,
                                                    threshold))
                    NEtags = None
                    NEprobs = None

        tagsf = tagsf2  # Set the checked tag file as the new tag file
        # Continue

        shutil.copy(tagsf, "tagsfile")  #DEBUG

        # Load tagged output.
        with open(tagsf, 'r') as f:
            tags = [line.strip() for line in f]
            tags = tags[:len(tokens)]

        # Chunk tags.
        chunks = chunker.tags_to_chunks(tags)

        # Construct mentions.
        doc_id = ltf_doc.doc_id
        mentions = []
        n = 1
        for token_bi, token_ei, tag in chunks:
            if tag == 'O':
                continue

            # Assign entity id.
            entity_id = '%s-NE%d' % (doc_id, n)

            # Determine char onsets/offset for mention extent.
            start_char = token_onsets[token_bi]
            end_char = token_offsets[token_ei]

            # Finally, determine text of extent and append.
            extent_bi = spans[token_bi][0]
            extent_ei = spans[token_ei][1]
            extent = txt[extent_bi:extent_ei + 1]
            mentions.append([
                entity_id,  # entity id
                tag,  # NE type
                extent,  # extent text
                start_char,  # extent char onset
                end_char,  # extent char offset
            ])

            n += 1

        # Write detected mentions to LAF file.
        bn = os.path.basename(ltf)
        laf = os.path.join(tagged_dir, bn.replace('.ltf.xml', tagged_ext))
        laf_doc = LAFDocument(mentions=mentions,
                              lang=ltf_doc.lang,
                              doc_id=doc_id)
        laf_doc.write_to_file(laf)
    except:
        logger.warn('Problem with %s. Skipping.' % ltf)

    # Clean up.
    shutil.rmtree(temp_dir)
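
For intuition, each line of the marginal-probability tag file consumed above is a 'TAG:prob' pair. A simplified, standalone sketch of the per-tag threshold test applied to O and U tags (the full BI*L handling lives in _check_BIL_sequence above; keep_tag is illustrative only):

def keep_tag(line, threshold):
    """Parse a 'TAG:prob' line and keep the NE tag only if its marginal probability meets the threshold."""
    tag, prob = line.strip().split(':')
    if tag == 'O':
        return tag
    return tag if float(prob) >= threshold else 'O'

# keep_tag('U-PER:0.91', 0.5) -> 'U-PER'
# keep_tag('U-LOC:0.32', 0.5) -> 'O'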
Example #6
def tag_file(ltf, aligner, enc, chunker, modelf, tagged_dir, tagged_ext):
    """Extract features for tokenization in LTF file and tag named entities.

    Inputs
    ------
    ltf : str
        LTF file.

    aligner : align.Aligner
        Aligner instance used to obtain character onsets/offsets of discovered
        mentions.

    enc : features.Encoder
        Encoder instance for feature extraction.

    chunker : chunk.ChunkEncoder
        ChunkEncoder instance for obtaining token onsets/offsets of discovered
        mentions from tag sequences.

    modelf : str
        CRFSuite model file.

    tagged_dir : str
        Directory to which to output LAF files.

    tagged_ext : str
        Extension to use for output LAF files.
    """
    # Create working directory.
    temp_dir = tempfile.mkdtemp()

    # Load LTF.
    ltf_doc = load_doc(ltf, LTFDocument, logger)
    if ltf_doc is None:
        shutil.rmtree(temp_dir)
        return

    # Attempt tagging.
    try:
        # Extract tokens.
        tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized()
        txt = ltf_doc.text()
        spans = aligner.align(txt, tokens)

        # Extract features
        featsf = os.path.join(temp_dir, 'feats.txt')
        feats = enc.get_feats(tokens)
        write_crfsuite_file(featsf, feats)

        # Tag.
        tagsf = os.path.join(temp_dir, 'tags.txt')
        cmd = ['crfsuite', 'tag',
               '-m', modelf,
               featsf]

        with open(tagsf, 'w') as f:
            subprocess.call(cmd, stdout=f)

        # Load tagged output.
        with open(tagsf, 'r') as f:
            tags = [line.strip() for line in f]
            tags = tags[:len(tokens)]

        # Chunk tags.
        chunks = chunker.tags_to_chunks(tags)

        # Construct mentions.
        doc_id = ltf_doc.doc_id
        mentions = []
        n = 1
        for token_bi, token_ei, tag in chunks:
            if tag == 'O':
                continue

            # Assign entity id.
            entity_id = '%s-NE%d' % (doc_id, n)

            # Determine char onsets/offset for mention extent.
            start_char = token_onsets[token_bi]
            end_char = token_offsets[token_ei]

            # Finally, determine text of extent and append.
            extent_bi = spans[token_bi][0]
            extent_ei = spans[token_ei][1]
            extent = txt[extent_bi:extent_ei + 1]
            mentions.append([entity_id,           # entity id
                             tag,                 # NE type
                             extent,              # extent text
                             start_char,          # extent char onset
                             end_char,            # extent char offset
                             ])

            n += 1

        # Write detected mentions to LAF file.
        bn = os.path.basename(ltf)
        laf = os.path.join(tagged_dir, bn.replace('.ltf.xml', tagged_ext))
        laf_doc = LAFDocument(mentions=mentions, lang=ltf_doc.lang, doc_id=doc_id)
        laf_doc.write_to_file(laf)
    except (KeyError, ValueError):
        logger.warn('Problem with %s. Skipping.' % ltf)

    # Clean up.
    shutil.rmtree(temp_dir)
Example #7
def write_train_data(lafs, ltf_dir, enc, trainf):
    """Extract features and target labels for each LTF/LAF pair and write to
    disk in CRFSuite data format.

    For details regarding this format, consult

    http://www.chokkan.org/software/crfsuite/manual.html

    Inputs
    ------
    lafs : list of str
        Paths to LAF files.

    ltf_dir : str
        Directory to search for LTF files.

    enc : features.Encoder
        Feature encoder.

    trainf : str
        CRFsuite training file.
    """
    with open(trainf, 'w') as f:
        for laf in lafs:
            # Check that the LTF and LAF are valid.
            bn = os.path.basename(laf)
            ltf = os.path.join(ltf_dir, bn.replace('.laf.xml', '.ltf.xml'))
            laf_doc = load_doc(laf, LAFDocument, logger)
            ltf_doc = load_doc(ltf, LTFDocument, logger)
            if laf_doc is None or ltf_doc is None:
                continue

            # Extract features/targets.
            try:
                # Extract tokens.
                tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized()

                # Convert mentions to format expected by the encoder; that is,
                # (tag, token_onset, token_offset).
                mentions = laf_doc.mentions()
                if len(mentions) == 0:
                    mentions_ = []
                else:
                    # Map to the minimal enclosing span of tokens in the
                    # supplied LTF.
                    entity_ids, tags, extents, char_onsets, char_offsets = zip(*mentions)
                    mention_onsets, mention_offsets = convert_extents(char_onsets, char_offsets,
                                                                      token_onsets, token_offsets)
                    mentions_ = list(zip(tags, mention_onsets, mention_offsets))

                # Eliminate overlapping mentions, retaining whichever
                # is first when sorted in ascending order by (onset, offset).
                sort_mentions(mentions_)
                prev_mention_offset = -1
                temp_mentions_ = []
                for tag, mention_onset, mention_offset in mentions_:
                    if mention_onset > prev_mention_offset:
                        temp_mentions_.append([tag, mention_onset, mention_offset])
                    prev_mention_offset = mention_offset
                mentions_ = temp_mentions_

                # Extract features/targets and write to file in CRFSuite
                # format.
                feats, targets = enc.get_feats_targets(tokens, mentions_)
            except KeyError:
                logger.warn('Feature extraction failed for %s. Skipping.' % laf)
                continue

            # Write to file.
            write_crfsuite_file(f, feats, targets)
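
The overlap-elimination loop above keeps a mention only if its onset lies beyond the offset of the mention seen immediately before it in (onset, offset) order. A small worked example (illustrative data):

mentions_ = [('PER', 3, 5), ('ORG', 4, 6), ('LOC', 7, 9)]
# ('PER', 3, 5): onset 3 > -1, kept; prev_mention_offset becomes 5.
# ('ORG', 4, 6): onset 4 is not > 5, dropped; prev_mention_offset becomes 6.
# ('LOC', 7, 9): onset 7 > 6, kept.
# Result: [['PER', 3, 5], ['LOC', 7, 9]]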
Example #8
def write_train_data(lafs, ltf_dir, enc, trainf):
    """Extract features and target labels for each LTF/LAF pair and write to
    disk in CRFSuite data format.

    For details regarding this format, consult

    http://www.chokkan.org/software/crfsuite/manual.html

    Inputs
    ------
    lafs : list of str
        Paths to LAF files.

    ltf_dir : str
        Directory to search for LTF files.

    enc : features.Encoder
        Feature encoder.

    trainf : str
        CRFsuite training file.
    """
    with open(trainf, 'w') as f:

        A_vals = set()
        B_vals = set()
        G_vals = set()
        ltfs = []

        for laf in lafs:
            # Check that the LTF and LAF are valid.
            bn = os.path.basename(laf)
            ltf = os.path.join(ltf_dir, bn.replace('.laf.xml', '.ltf.xml'))
            ltfs.append(ltf)

        A_vals, B_vals, G_vals = get_ABG_value_sets(ltfs, logger)

        #            laf_doc = load_doc(laf, LAFDocument, logger);
        #            ltf_doc = load_doc(ltf, LTFDocument, logger);
        #            if laf_doc is None or ltf_doc is None:
        #                continue;

        # Extract features/targets.
        #            try:
        # Extract tokens.
        #                try:
        #                    tokens, token_ids, token_onsets, token_offsets, token_As, token_Bs, token_Gs = ltf_doc.tokenizedWithABG();
        #                except:
        #                    tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized();
        #                    token_As = token_Bs = token_Gs = None;
        #                if token_As != None:
        #                    A_vals.update(token_As)
        #                if token_Bs != None:
        #                    B_vals.update(token_Bs)
        #                if token_Gs != None:
        #                    G_vals.update(token_Gs)
        #            except:
        #                logger.warn('ABG values not found for %s. Skipping.' % laf);
        #                continue;

        print(
            "Found the following number of values for ABG:\nA: {}\nB: {}\nG: {}\n"
            .format(len(A_vals), len(B_vals), len(G_vals)))

        for laf in lafs:
            # Check that the LTF and LAF are valid.
            bn = os.path.basename(laf)
            ltf = os.path.join(ltf_dir, bn.replace('.laf.xml', '.ltf.xml'))
            laf_doc = load_doc(laf, LAFDocument, logger)
            ltf_doc = load_doc(ltf, LTFDocument, logger)
            if laf_doc is None or ltf_doc is None:
                continue

            # Extract features/targets.
            try:
                # Extract tokens.
                try:
                    (tokens, token_ids, token_onsets, token_offsets, token_nums,
                     token_As, token_Bs, token_Gs, token_Fs,
                     token_Js) = ltf_doc.tokenizedWithABG()
                except:
                    (tokens, token_ids, token_onsets, token_offsets,
                     token_nums) = ltf_doc.tokenized()
                    token_As = token_Bs = token_Gs = token_Fs = token_Js = None

                # Convert mentions to format expected by the encoder; that is,
                # (tag, token_onset, token_offset).
                mentions = laf_doc.mentions()
                if len(mentions) == 0:
                    mentions_ = []
                else:
                    # Map to the minimal enclosing span of tokens in the
                    # supplied LTF.
                    entity_ids, tags, extents, char_onsets, char_offsets = zip(
                        *mentions)
                    mention_onsets, mention_offsets = convert_extents(
                        char_onsets, char_offsets, token_onsets, token_offsets)
                    mentions_ = list(zip(tags, mention_onsets,
                                         mention_offsets))

                # Eliminate overlapping mentions, retaining whichever
                # is first when sorted in ascending order by (onset, offset).
                sort_mentions(mentions_)
                prev_mention_offset = -1
                temp_mentions_ = []
                for tag, mention_onset, mention_offset in mentions_:
                    if mention_onset > prev_mention_offset:
                        temp_mentions_.append(
                            [tag, mention_onset, mention_offset])
                    prev_mention_offset = mention_offset
                mentions_ = temp_mentions_

                feats, targets = enc.get_feats_targets(tokens, mentions_,
                                                       token_nums, token_As,
                                                       token_Bs, token_Gs,
                                                       token_Fs, token_Js,
                                                       A_vals, B_vals, G_vals)

            except:
                logger.warn('Feature extraction failed for %s. Skipping.' %
                            laf)
                continue

            # Write to file.
            write_crfsuite_file(f, feats, targets)
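
write_crfsuite_file itself is not shown here, but whatever it writes must follow the CRFSuite data format referenced in the docstring: one token per line, the label first, then tab-separated attributes, with a blank line terminating each item sequence. A minimal illustrative stand-in for the file-object case (an assumption about the helper's role, not its actual implementation):

def write_crfsuite_sequence(f, feats, targets):
    """Write one labeled token sequence in CRFSuite data format: LABEL<TAB>attr1<TAB>attr2..."""
    for token_feats, label in zip(feats, targets):
        f.write('\t'.join([label] + list(token_feats)) + '\n')
    f.write('\n')  # blank line separates item sequences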
Example #9
def tag_file(ltf, aligner, enc, chunker, modelf, tagged_dir, tagged_ext):
    """Extract features for tokenization in LTF file and tag named entities.

    Inputs
    ------
    ltf : str
        LTF file.

    aligner : align.Aligner
        Aligner instance used to obtain character onsets/offsets of discovered
        mentions.

    enc : features.Encoder
        Encoder instance for feature extraction.

    chunker : chunk.ChunkEncoder
        ChunkEncoder instance for obtaining token onsets/offsets of discovered
        mentions from tag sequences.

    modelf : str
        CRFSuite model file.

    tagged_dir : str
        Directory to which to output LAF files.

    tagged_ext : str
        Extension to use for output LAF files.
    """
    # Create working directory.                                              
    temp_dir = tempfile.mkdtemp()
    # Load LTF.
    #print ltf  # todo
    ltf_doc = load_doc(ltf, LTFDocument, logger) 
    if ltf_doc is None:
        shutil.rmtree(temp_dir) 
        return
    # Attempt tagging.
    try:
        # Extract tokens.
        tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized() 
        txt = ltf_doc.text() 
        spans = aligner.align(txt, tokens)
        # Extract features
        featsf = os.path.join(temp_dir, 'feats.txt')
        feats = enc.get_feats(tokens) 
        write_crfsuite_file(featsf, feats)
        # Tag.
        # print "tmep_dir"+temp_dir
        tagsf = os.path.join(temp_dir, 'tags.txt')
        #probf = os.path.join(temp_dir, 'probs.txt')
        cmd = ['/home/wangtianlu/local/bin/crfsuite', 'tag',
               '-m', modelf,
               featsf]
        with open(tagsf, 'w') as f:
            subprocess.call(cmd, stdout=f)
        # Load tagged output.

        probf1 = ltf.replace('ltf', 'probs')
        probf = probf1.replace('test', 'probs')

        # print probf
        cmd_ = ['/home/wangtianlu/local/bin/crfsuite', 'tag',
               '-m', modelf, '-i',
               featsf]
        with open(probf, 'w') as f:
            subprocess.call(cmd_, stdout=f)


        # maxprobf = ltf.replace('ltf', 'maxprobs')
        #
        # cmd_ = ['/Users/koala/Documents/lab/Blender/LORELEI/active_learning/ne-tagger/lib/crf/bin/crfsuite','tag',
        #        '-m', modelf, '-i',
        #        featsf]
        # with open(maxprobf, 'w') as f:
        #     subprocess.call(cmd_, stdout=f)

        with open(tagsf, 'r') as f:
            tags = [line.strip() for line in f]
            # print len(tags)  # todo
            tags = tags[:len(tokens)]
            # print len(tags)  # todo
            # print 'this is tags'
            # print tags # todo
        # Chunk tags.
        chunks = chunker.tags_to_chunks(tags)  # todo:bughere
        # Construct mentions.
        doc_id = ltf_doc.doc_id
        mentions = []
        n = 1 
        for token_bi, token_ei, tag in chunks:
            if tag == 'O':
                continue 

            # Assign entity id.
            entity_id = '%s-NE%d' % (doc_id, n) 

            # Determine char onsets/offset for mention extent.
            start_char = token_onsets[token_bi] 
            end_char = token_offsets[token_ei] 

            # Finally, determine text of extent and append.
            extent_bi = spans[token_bi][0] 
            extent_ei = spans[token_ei][1] 
            extent = txt[extent_bi:extent_ei+1] 
            mentions.append([entity_id,           # entity id
                             tag,                 # NE type
                             extent,              # extent text
                             start_char,          # extent char onset
                             end_char,            # extent char offset
                            ]) 

            n += 1 

        # Write detected mentions to LAF file.
        bn = os.path.basename(ltf)
        laf = os.path.join(tagged_dir, bn.replace('.ltf.xml', tagged_ext)) 
        laf_doc = LAFDocument(mentions=mentions, lang=ltf_doc.lang, doc_id=doc_id) 
        laf_doc.write_to_file(laf) 
    except KeyError:
        logger.warn('Problem with %s. Skipping.' % ltf) 

    # Clean up.
    shutil.rmtree(temp_dir)
Example #10
def write_train_data(lafs, ltf_dir, enc, trainf):
    """Extract features and target labels for each LTF/LAF pair and write to
    disk in CRFSuite data format.

    For details regarding this format, consult

    http://www.chokkan.org/software/crfsuite/manual.html

    Inputs
    ------
    lafs : list of str
        Paths to LAF files.

    ltf_dir : str
        Directory to search for LTF files.

    enc : features.Encoder
        Feature encoder.

    trainf : str
        CRFsuite training file.
    """
    with open(trainf, 'w') as f:
        for laf in lafs:
            # Check that the LTF and LAF are valid.
            bn = os.path.basename(laf) 
            ltf = os.path.join(ltf_dir, bn.replace('.laf.xml', '.ltf.xml')) 
            laf_doc = load_doc(laf, LAFDocument, logger) 
            ltf_doc = load_doc(ltf, LTFDocument, logger) 
            if laf_doc is None or ltf_doc is None:
                continue 
            
            # Extract features/targets.
            try:
                # Extract tokens.
                tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized() 
                #print len(tokens)
                # Convert mentions to format expected by the encoder; that is,
                # (tag, token_onset, token_offset).
                mentions = laf_doc.mentions()
                #print mentions
                if len(mentions) == 0:
                    mentions_ = [] 
                else:
                    # Map to the minimal enclosing span of tokens in the
                    # supplied LTF.
                    entity_ids, tags, extents, char_onsets, char_offsets = zip(*mentions)
                    # print token_onsets
                    # print char_onsets
                    # print char_onsets
                    mention_onsets, mention_offsets = convert_extents(char_onsets, char_offsets,
                                                                      token_onsets, token_offsets)
                    #print mention_onsets
                    mentions_ = list(zip(tags, mention_onsets, mention_offsets)) 

                # Eliminate overlapping mentions, retaining whichever
                # is first when sorted in ascending order by (onset, offset).
                #print mentions_
                sort_mentions(mentions_) 
                prev_mention_offset = -1 
                temp_mentions_ = [] 
                for tag, mention_onset, mention_offset in mentions_:
                    if mention_onset > prev_mention_offset:
                        temp_mentions_.append([tag, mention_onset, mention_offset]) 
                    prev_mention_offset = mention_offset 
                mentions_ = temp_mentions_
                # print 'mentions:'
                #print mentions_
                #print tokens

                # Extract features/targets and write to file in CRFSuite
                # format.
                feats, targets = enc.get_feats_targets(tokens, mentions_)
                #print 'feats: \n'
                #print feats
                #print 'targets:'
                #print targets
            except KeyError:
                logger.warn('Feature extraction failed for %s. Skipping.' % laf) 
                continue 

            # Write to file.
            write_crfsuite_file(f, feats, targets)
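
Putting the pieces together, a typical run writes the training data, fits a CRF model with the crfsuite command-line tool, and then tags new documents. A hedged sketch of that pipeline (the paths, directory names, and the already-constructed enc, aligner, and chunker are illustrative assumptions):

import glob
import os
import subprocess

# 1. Write features/targets for the annotated LTF/LAF pairs.
lafs = sorted(glob.glob(os.path.join('ref_laf', '*.laf.xml')))
write_train_data(lafs, 'ltf', enc, 'train.crfsuite.txt')

# 2. Train a CRF model.
subprocess.call(['crfsuite', 'learn', '-m', 'model.crf', 'train.crfsuite.txt'])

# 3. Tag unannotated LTF files and write LAF output.
for ltf in sorted(glob.glob(os.path.join('ltf', '*.ltf.xml'))):
    tag_file(ltf, aligner, enc, chunker, 'model.crf', 'sys_laf', '.laf.xml')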