Example #1
def tag_file(ltf, aligner, enc, chunker, modelf, tagged_dir, tagged_ext,
             threshold, A_vals, B_vals, G_vals):
    """Extract features for tokenization in LTF file and tag named entities.

    Inputs
    ------
    ltf : str
        LTF file.

    aligner : align.Aligner
        Aligner instance used to obtain character onsets/offsets of discovered
        mentions.

    enc : features.Encoder
        Encoder instance for feature extraction.

    chunker : chunk.ChunkEncoder
        ChunkEncoder instance for obtaining token onsets/offsets of discovered
        mentions from tag sequences.

    modelf : str
        CRFSuite model file.

    tagged_dir : str
        Directory to which to output LAF files.

    tagged_ext : str
        Extension to use for output LAF files.

    threshold : float
        Minimum marginal probability required to keep a non-'O' tag.

    A_vals, B_vals, G_vals : set
        Sets of observed A/B/G token attribute values passed through to the
        feature encoder.
    """

    # Create working directory.
    temp_dir = tempfile.mkdtemp()

    # Load LTF.
    ltf_doc = load_doc(ltf, LTFDocument, logger)
    if ltf_doc is None:
        shutil.rmtree(temp_dir)
        return

    # Attempt tagging.
    try:
        # Extract tokens.
        try:
            (tokens, token_ids, token_onsets, token_offsets, token_nums,
             token_As, token_Bs, token_Gs, token_Fs,
             token_Js) = ltf_doc.tokenizedWithABG()
        except Exception:
            # Fall back to plain tokenization when ABG annotations are unavailable.
            tokens, token_ids, token_onsets, token_offsets, token_nums = ltf_doc.tokenized()
            token_As = token_Bs = token_Gs = token_Fs = token_Js = None
        txt = ltf_doc.text()
        spans = aligner.align(txt, tokens)

        # Extract features
        featsf = os.path.join(temp_dir, 'feats.txt')
        #        feats = enc.get_feats(tokens, token_As, token_Bs, token_Gs);
        feats = enc.get_feats(tokens, token_nums, token_As, token_Bs, token_Gs,
                              token_Fs, token_Js, A_vals, B_vals, G_vals)
        write_crfsuite_file(featsf, feats)

        shutil.copy(featsf, "featuresfile")  #DEBUG

        # Tag.
        tagsf = os.path.join(temp_dir, 'tags.txt')
        cmd = [
            'crfsuite',
            'tag',
            '--marginal',  # outputs probability of each tag as extra field in tagsfile
            #               '--probability',        # outputs probability of tag sequence at top of tagsfile
            '-m',
            modelf,
            featsf
        ]
        with open(tagsf, 'w') as f:
            subprocess.call(cmd, stdout=f)

        shutil.copy(tagsf, "taggingprobs")  #DEBUG

        # Look for NEs in the tagfile with marginal probs.
        # If the tag is 'O', keep it.
        # If the tag is anything else, keep if marginal prob is above threshold.
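        # Each line of the marginal-probability output is assumed to look like
        # "TAG:PROB" (e.g. "B-PER:0.8731"), which is what the split on ':'
        # below relies on.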

        tagsf2 = os.path.join(temp_dir, 'tags2.txt')
        """
        Helper method for checking the tag sequence output in the section below. 
        Checks for full BI*L sequence, returning that seqeunce if mean logprob exceeds 
        threshold logprob - returns sequence of O's of equal length otherwise.
        If the seqeuence contains only one tag, that tag is returned as a U tag.
        
        """
        def _check_BIL_sequence(tags, probs, threshold):

            nextpart = ''

            if len(tags) < 1:

                logging.warn("Empty tag sequence submitted as BI*L sequence.")

            elif len(tags) == 1:

                logging.warn(
                    "Tag sequence of length 1 submitted as BI*L sequence.")

                if probs[0] >= threshold:  # compare probs, not abs vals of logprobs, hence >= and not <=

                    nextpart = 'U{}'.format(tags[0][1:])

                else:

                    nextpart = 'O\n'

            else:

                try:

                    assert tags[0][0] == 'B' and tags[-1][0] == 'L'

                except AssertionError:

                    logging.warn('Incomplete BI*L sequence submitted.')
                    tags[0] = 'B{}'.format(tags[0][1:])
                    tags[-1] = 'L{}'.format(tags[-1][1:])

#                NElogProb = reduce(lambda x, y: (log(x) * -1) + (log(y) * -1), probs)/len(probs)
#                if NElogProb <= (log(threshold) * -1): # compare abs vals of logprobs, hence <= and not >=
                count = 0
                for prob in probs:
                    if prob >= threshold:
                        count += 1

                if count >= len(probs) / 2.0:

                    nextpart = ''.join(tags)

                else:

                    nextpart = 'O\n' * len(tags)

            return nextpart

        """ Retain or reject NE hypotheses based on probs and write new tags file """
        with open(tagsf2, 'w') as f_out:
            with open(tagsf, 'r') as f_in:
                NEtags = None
                NEprobs = None
                for line in f_in.read().split('\n'):

                    try:

                        assert ':' in line

                        tag, prob = line.strip().split(':')

                        if tag[0] == 'O':
                            # if seq in play, check seq
                            # write tag

                            if NEtags:

                                f_out.write(
                                    _check_BIL_sequence(
                                        NEtags, NEprobs, threshold))
                                NEtags = None
                                NEprobs = None

                            f_out.write(tag + '\n')

                        elif tag[0] == 'U':
                            # if seq in play, check seq
                            # if prob >= threshold, write tag
                            # else, write tag = O

                            if NEtags:

                                f_out.write(
                                    _check_BIL_sequence(
                                        NEtags, NEprobs, threshold))
                                NEtags = None
                                NEprobs = None

                            # Compare probs, not abs vals of logprobs, hence >= and not <=.
                            if float(prob) >= threshold:

                                f_out.write(tag + '\n')

                            else:

                                f_out.write('O\n')

                        elif tag[0] == 'B':
                            # if seq in play, check seq
                            # start new seq with tag

                            if NEtags:

                                f_out.write(
                                    _check_BIL_sequence(
                                        NEtags, NEprobs, threshold))

                            NEtags = [tag + '\n']
                            NEprobs = [float(prob)]

                        elif tag[0] == 'I':
                            # if seq in play, add tag to seq
                            # else, start new seq with tag = B

                            if NEtags:

                                NEtags.append(tag + '\n')
                                NEprobs.append(float(prob))

                            else:

                                logging.warn("Found an out of sequence I tag.")
                                tag = 'B{}'.format(tag[1:])
                                NEtags = [tag + '\n']
                                NEprobs = [float(prob)]

                        elif tag[0] == 'L':
                            # if seq in play, add tag to seq and check seq
                            # else, start new seq with tag = B

                            if NEtags:

                                NEtags.append(tag + '\n')
                                NEprobs.append(float(prob))
                                f_out.write(
                                    _check_BIL_sequence(
                                        NEtags, NEprobs, threshold))
                                NEtags = None
                                NEprobs = None

                            else:

                                logging.warn("Found an out of sequence L tag.")
                                tag = 'B{}'.format(tag[1:])
                                NEtags = [tag + '\n']
                                NEprobs = [float(prob)]

                    except AssertionError:

                        pass
#                        logging.warn('No ":" in line {}'.format(line))  #DEBUG

                if NEtags:  # Necessary if tagsf ends with an incomplete BI*L sequence

                    f_out.write(_check_BIL_sequence(NEtags, NEprobs,
                                                    threshold))
                    NEtags = None
                    NEprobs = None

        tagsf = tagsf2  # Set the checked tag file as the new tag file
        # Continue

        shutil.copy(tagsf, "tagsfile")  #DEBUG

        # Load tagged output.
        with open(tagsf, 'r') as f:
            tags = [line.strip() for line in f]
            tags = tags[:len(tokens)]

        # Chunk tags.
        chunks = chunker.tags_to_chunks(tags)
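        # Each chunk is assumed to be a (first_token_index, last_token_index, tag)
        # triple, as unpacked in the loop below; 'O' chunks are skipped.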

        # Construct mentions.
        doc_id = ltf_doc.doc_id
        mentions = []
        n = 1
        for token_bi, token_ei, tag in chunks:
            if tag == 'O':
                continue

            # Assign entity id.
            entity_id = '%s-NE%d' % (doc_id, n)

            # Determine char onsets/offset for mention extent.
            start_char = token_onsets[token_bi]
            end_char = token_offsets[token_ei]

            # Finally, determine text of extent and append.
            extent_bi = spans[token_bi][0]
            extent_ei = spans[token_ei][1]
            extent = txt[extent_bi:extent_ei + 1]
            mentions.append([
                entity_id,  # entity id
                tag,  # NE type
                extent,  # extent text
                start_char,  # extent char onset
                end_char,  # extent char offset
            ])

            n += 1

        # Write detected mentions to LAF file.
        bn = os.path.basename(ltf)
        laf = os.path.join(tagged_dir, bn.replace('.ltf.xml', tagged_ext))
        laf_doc = LAFDocument(mentions=mentions,
                              lang=ltf_doc.lang,
                              doc_id=doc_id)
        laf_doc.write_to_file(laf)
    except Exception:
        logger.warn('Problem with %s. Skipping.' % ltf)

    # Clean up.
    shutil.rmtree(temp_dir)
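
A minimal driver sketch for the function above, assuming the surrounding module
exposes align.Aligner, features.Encoder, chunk.ChunkEncoder, and the
get_ABG_value_sets helper used in the training code further down. The
constructor signatures, file paths, and the 0.5 threshold are illustrative
assumptions, not part of the original code.

import glob

from align import Aligner            # assumed module layout
from features import Encoder
from chunk import ChunkEncoder

ltfs = glob.glob('ltf/*.ltf.xml')    # hypothetical input directory
A_vals, B_vals, G_vals = get_ABG_value_sets(ltfs, logger)

aligner = Aligner()                  # assumed no-arg constructors
enc = Encoder()
chunker = ChunkEncoder()

for ltf in ltfs:
    tag_file(ltf, aligner, enc, chunker,
             modelf='model.crf',                 # hypothetical model file
             tagged_dir='laf_out', tagged_ext='.laf.xml',
             threshold=0.5,
             A_vals=A_vals, B_vals=B_vals, G_vals=G_vals)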
Example #2
def tag_file(ltf, aligner, enc, chunker, modelf, tagged_dir, tagged_ext):
    """Extract features for tokenization in LTF file and tag named entities.

    Inputs
    ------
    ltf : str
        LTF file.

    aligner : align.Aligner
        Aligner instance used to obtain character onsets/offsets of discovered
        mentions.

    enc : features.Encoder
        Encoder instance for feature extraction.

    chunker : chunk.ChunkEncoder
        ChunkEncoder instance for obtaining token onsets/offsets of discovered
        mentions from tag sequences.

    modelf : str
        CRFSuite model file.

    tagged_dir : str
        Directory to which to output LAF files.

    tagged_ext : str
        Extension to use for output LAF files.
    """
    # Create working directory.
    temp_dir = tempfile.mkdtemp()
    # Load LTF.
    #print ltf  # todo
    ltf_doc = load_doc(ltf, LTFDocument, logger)
    if ltf_doc is None:
        shutil.rmtree(temp_dir)
        return
    # Attempt tagging.
    try:
        # Extract tokens.
        tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized()
        txt = ltf_doc.text()
        spans = aligner.align(txt, tokens)
        # Extract features
        featsf = os.path.join(temp_dir, 'feats.txt')
        feats = enc.get_feats(tokens)
        write_crfsuite_file(featsf, feats)
        # Tag.
        # print "tmep_dir"+temp_dir
        tagsf = os.path.join(temp_dir, 'tags.txt')
        #probf = os.path.join(temp_dir, 'probs.txt')
        cmd = [
            '/home/wangtianlu/local/bin/crfsuite', 'tag', '-m', modelf, featsf
        ]
        with open(tagsf, 'w') as f:
            subprocess.call(cmd, stdout=f)
        # Derive a probs file path from the LTF path and write per-token marginal probabilities to it.

        probf1 = ltf.replace('ltf', 'probs')
        probf = probf1.replace('test', 'probs')

        # print probf
        cmd_ = [
            '/home/wangtianlu/local/bin/crfsuite', 'tag', '-m', modelf, '-i',
            featsf
        ]
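        # -i is crfsuite's short option for --marginal, which appends each
        # tag's marginal probability to the output (cf. Example #1 above).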
        with open(probf, 'w') as f:
            subprocess.call(cmd_, stdout=f)

        # maxprobf = ltf.replace('ltf', 'maxprobs')
        #
        # cmd_ = ['/Users/koala/Documents/lab/Blender/LORELEI/active_learning/ne-tagger/lib/crf/bin/crfsuite','tag',
        #        '-m', modelf, '-i',
        #        featsf]
        # with open(maxprobf, 'w') as f:
        #     subprocess.call(cmd_, stdout=f)

        with open(tagsf, 'r') as f:
            tags = [line.strip() for line in f]
            # print len(tags)  # todo
            tags = tags[:len(tokens)]
            # print len(tags)  # todo
            # print 'this is tags'
            # print tags # todo
        # Chunk tags.
        chunks = chunker.tags_to_chunks(tags)  # todo:bughere
        # Construct mentions.
        doc_id = ltf_doc.doc_id
        mentions = []
        n = 1
        for token_bi, token_ei, tag in chunks:
            if tag == 'O':
                continue

            # Assign entity id.
            entity_id = '%s-NE%d' % (doc_id, n)

            # Determine char onsets/offset for mention extent.
            start_char = token_onsets[token_bi]
            end_char = token_offsets[token_ei]

            # Finally, determine text of extent and append.
            extent_bi = spans[token_bi][0]
            extent_ei = spans[token_ei][1]
            extent = txt[extent_bi:extent_ei + 1]
            mentions.append([
                entity_id,  # entity id
                tag,  # NE type
                extent,  # extent text
                start_char,  # extent char onset
                end_char,  # extent char offset
            ])

            n += 1

        # Write detected mentions to LAF file.
        bn = os.path.basename(ltf)
        laf = os.path.join(tagged_dir, bn.replace('.ltf.xml', tagged_ext))
        laf_doc = LAFDocument(mentions=mentions,
                              lang=ltf_doc.lang,
                              doc_id=doc_id)
        laf_doc.write_to_file(laf)
    except KeyError:
        logger.warn('Problem with %s. Skipping.' % ltf)

    # Clean up.
    shutil.rmtree(temp_dir)
Example #3
def tag_file(ltf, aligner, enc, chunker, modelf, tagged_dir, tagged_ext):
    """Extract features for tokenization in LTF file and tag named entities.

    Inputs
    ------
    ltf : str
        LTF file.

    aligner : align.Aligner
        Aligner instance used to obtain character onsets/offsets of discovered
        mentions.

    enc : features.Encoder
        Encoder instance for feature extraction.

    chunker : chunk.ChunkEncoder
        ChunkEncoder instance for obtaining token onsets/offsets of discovered
        mentions from tag sequences.

    modelf : str
        CRFSuite model file.

    tagged_dir : str
        Directory to which to output LAF files.

    tagged_ext : str
        Extension to use for output LAF files.
    """
    # Create working directory.
    temp_dir = tempfile.mkdtemp()

    # Load LTF.
    ltf_doc = load_doc(ltf, LTFDocument, logger)
    if ltf_doc is None:
        shutil.rmtree(temp_dir)
        return

    # Attempt tagging.
    try:
        # Extract tokens.
        tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized()
        txt = ltf_doc.text()
        spans = aligner.align(txt, tokens)

        # Extract features
        featsf = os.path.join(temp_dir, 'feats.txt')
        feats = enc.get_feats(tokens)
        write_crfsuite_file(featsf, feats)

        # Tag.
        tagsf = os.path.join(temp_dir, 'tags.txt')
        cmd = ['crfsuite', 'tag',
               '-m', modelf,
               featsf]

        with open(tagsf, 'w') as f:
            subprocess.call(' '.join(cmd), shell=True, stdout=f)

        # Load tagged output.
        with open(tagsf, 'r') as f:
            tags = [line.strip() for line in f]
            tags = tags[:len(tokens)]

        # Chunk tags.
        chunks = chunker.tags_to_chunks(tags)

        # Construct mentions.
        doc_id = ltf_doc.doc_id
        mentions = []
        n = 1
        for token_bi, token_ei, tag in chunks:
            if tag == 'O':
                continue

            # Assign entity id.
            entity_id = '%s-NE%d' % (doc_id, n)

            # Determine char onsets/offset for mention extent.
            start_char = token_onsets[token_bi]
            end_char = token_offsets[token_ei]

            # Finally, determine text of extent and append.
            extent_bi = spans[token_bi][0]
            extent_ei = spans[token_ei][1]
            extent = txt[extent_bi:extent_ei+1]
            mentions.append([entity_id,           # entity id
                             tag,                 # NE type
                             extent,              # extent text
                             start_char,          # extent char onset
                             end_char,            # extent char offset
                            ])

            n += 1

        # Write detected mentions to LAF file.
        bn = os.path.basename(ltf)
        laf = os.path.join(tagged_dir, bn.replace('.ltf.xml', tagged_ext))
        laf_doc = LAFDocument(mentions=mentions, lang=ltf_doc.lang, doc_id=doc_id)
        laf_doc.write_to_file(laf)
    except (KeyError, ValueError):
        logger.warn('Problem with %s. Skipping.' % ltf)

    # Clean up.
    shutil.rmtree(temp_dir)
Example #4
def write_train_data(lafs, ltf_dir, enc, trainf):
    """Extract features and target labels for each LTF/LAF pair and write to
    disk in CRFSuite data format.

    For details regarding this format, consult

    http://www.chokkan.org/software/crfsuite/manual.html

    Inputs
    ------
    lafs: list of str
        Paths to LAF files.

    ltf_dir : str
        Directory to search for LTF files.

    enc : features.Encoder
        Feature encoder.

    trainf : str
        CRFsuite training file.
    """
    with open(trainf, 'w') as f:
        for laf in lafs:
            # Check that the LTF and LAF are valid.
            bn = os.path.basename(laf)
            ltf = os.path.join(ltf_dir, bn.replace('.laf.xml', '.ltf.xml'))
            laf_doc = load_doc(laf, LAFDocument, logger)
            ltf_doc = load_doc(ltf, LTFDocument, logger)
            if laf_doc is None or ltf_doc is None:
                continue
            
            # Extract features/targets.
            try:
                # Extract tokens.
                tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized()

                # Convert mentions to format expected by the encoder; that is,
                # (tag, token_onset, token_offset).
                mentions = laf_doc.mentions()
                if len(mentions) == 0:
                    mentions_ = []
                else:
                    # Map to the minimal enclosing span of tokens in the
                    # supplied LTF.
                    entity_ids, tags, extents, char_onsets, char_offsets = zip(*mentions)
                    mention_onsets, mention_offsets = convert_extents(char_onsets, char_offsets,
                                                                      token_onsets, token_offsets)
                    mentions_ = list(zip(tags, mention_onsets, mention_offsets))

                # Eliminate overlapping mentions, retaining whichever
                # is first when sorted in ascending order by (onset, offset).
                sort_mentions(mentions_)
                prev_mention_offset = -1
                temp_mentions_ = []
                for tag, mention_onset, mention_offset in mentions_:
                    if mention_onset > prev_mention_offset:
                        temp_mentions_.append([tag, mention_onset, mention_offset])
                    prev_mention_offset = mention_offset
                mentions_ = temp_mentions_

                # Extract features/targets and write to file in CRFSuite
                # format.
                feats, targets = enc.get_feats_targets(tokens, mentions_)
            except KeyError:
                logger.warn('Feature extraction failed for %s. Skipping.' % laf)
                continue

            # Write to file.
            write_crfsuite_file(f, feats, targets)
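
For reference, the CRFSuite data format targeted by write_crfsuite_file puts
one token per line, label first followed by tab-separated attribute strings,
with a blank line terminating each sequence. The sketch below is a hypothetical
stand-in for that writer, shown only to illustrate the format; the actual
attribute strings produced by features.Encoder are not shown in the source, so
the example line is invented.

def write_crfsuite_lines(f, feats, targets):
    # Hypothetical illustration of the CRFSuite data format (not the project's
    # write_crfsuite_file): one "LABEL<TAB>attr<TAB>attr..." line per token,
    # with a blank line closing the sequence. Assumes feats is a list of
    # attribute-string lists aligned with targets.
    for attrs, label in zip(feats, targets):
        f.write('%s\t%s\n' % (label, '\t'.join(attrs)))
    f.write('\n')

# A written line might look like (invented feature names):
# B-PER	w[0]=John	w[1]=Smith	caps=yes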
Example #5
def write_train_data(lafs, ltf_dir, enc, trainf):
    """Extract features and target labels for each LTF/LAF pair and write to
    disk in CRFSuite data format.

    For details regarding this format, consult

    http://www.chokkan.org/software/crfsuite/manual.html

    Inputs
    ------
    lafs: list of str
        Paths to LAF files.

    ltf_dir : str
        Directory to search for LTF files.

    enc : features.Encoder
        Feature encoder.

    trainf : str
        CRFsuite training file.
    """
    with open(trainf, 'w') as f:

        A_vals = set()
        B_vals = set()
        G_vals = set()
        ltfs = []

        for laf in lafs:
            # Check that the LTF and LAF are valid.
            bn = os.path.basename(laf)
            ltf = os.path.join(ltf_dir, bn.replace('.laf.xml', '.ltf.xml'))
            ltfs.append(ltf)

        A_vals, B_vals, G_vals = get_ABG_value_sets(ltfs, logger)
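        # get_ABG_value_sets is assumed to aggregate the distinct A/B/G token
        # attribute values across all LTF files, as the commented-out per-file
        # loop below used to do.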

        #            laf_doc = load_doc(laf, LAFDocument, logger);
        #            ltf_doc = load_doc(ltf, LTFDocument, logger);
        #            if laf_doc is None or ltf_doc is None:
        #                continue;

        # Extract features/targets.
        #            try:
        # Extract tokens.
        #                try:
        #                    tokens, token_ids, token_onsets, token_offsets, token_As, token_Bs, token_Gs = ltf_doc.tokenizedWithABG();
        #                except:
        #                    tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized();
        #                    token_As = token_Bs = token_Gs = None;
        #                if token_As != None:
        #                    A_vals.update(token_As)
        #                if token_Bs != None:
        #                    B_vals.update(token_Bs)
        #                if token_Gs != None:
        #                    G_vals.update(token_Gs)
        #            except:
        #                logger.warn('ABG values not found for %s. Skipping.' % laf);
        #                continue;

        print(
            "Found the following number of values for ABG:\nA: {}\nB: {}\nG: {}\n"
            .format(len(A_vals), len(B_vals), len(G_vals)))

        for laf in lafs:
            # Check that the LTF and LAF are valid.
            bn = os.path.basename(laf)
            ltf = os.path.join(ltf_dir, bn.replace('.laf.xml', '.ltf.xml'))
            laf_doc = load_doc(laf, LAFDocument, logger)
            ltf_doc = load_doc(ltf, LTFDocument, logger)
            if laf_doc is None or ltf_doc is None:
                continue

            # Extract features/targets.
            try:
                # Extract tokens.
                try:
                    (tokens, token_ids, token_onsets, token_offsets, token_nums,
                     token_As, token_Bs, token_Gs, token_Fs,
                     token_Js) = ltf_doc.tokenizedWithABG()
                except Exception:
                    # Fall back to plain tokenization when ABG annotations are unavailable.
                    tokens, token_ids, token_onsets, token_offsets, token_nums = ltf_doc.tokenized()
                    token_As = token_Bs = token_Gs = token_Fs = token_Js = None

                # Convert mentions to format expected by the encoder; that is,
                # (tag, token_onset, token_offset).
                mentions = laf_doc.mentions()
                if len(mentions) == 0:
                    mentions_ = []
                else:
                    # Map to the minimal enclosing span of tokens in the
                    # supplied LTF.
                    entity_ids, tags, extents, char_onsets, char_offsets = zip(
                        *mentions)
                    mention_onsets, mention_offsets = convert_extents(
                        char_onsets, char_offsets, token_onsets, token_offsets)
                    mentions_ = list(zip(tags, mention_onsets,
                                         mention_offsets))

                # Eliminate overlapping mentions, retaining whichever
                # is first when sorted in ascending order by (onset, offset).
                sort_mentions(mentions_)
                prev_mention_offset = -1
                temp_mentions_ = []
                for tag, mention_onset, mention_offset in mentions_:
                    if mention_onset > prev_mention_offset:
                        temp_mentions_.append(
                            [tag, mention_onset, mention_offset])
                    prev_mention_offset = mention_offset
                mentions_ = temp_mentions_

                feats, targets = enc.get_feats_targets(tokens, mentions_,
                                                       token_nums, token_As,
                                                       token_Bs, token_Gs,
                                                       token_Fs, token_Js,
                                                       A_vals, B_vals, G_vals)

            except Exception:
                logger.warn('Feature extraction failed for %s. Skipping.' % laf)
                continue

            # Write to file.
            write_crfsuite_file(f, feats, targets)
Example #6
def tag_file(ltf, aligner, enc, chunker, modelf, tagged_dir, tagged_ext):
    """Extract features for tokenization in LTF file and tag named entities.

    Inputs
    ------
    ltf : str
        LTF file.

    aligner : align.Aligner
        Aligner instance used to obtain character onsets/offsets of discovered
        mentions.

    enc : features.Encoder
        Encoder instance for feature extraction.

    chunker : chunk.ChunkEncoder
        ChunkEncoder instance for obtaining token onsets/offsets of discovered
        mentions from tag sequences.

    modelf : str
        CRFSuite model file.

    tagged_dir : str
        Directory to which to output LAF files.

    tagged_ext : str
        Extension to use for output LAF files.
    """
    # Create working directory.                                              
    temp_dir = tempfile.mkdtemp()
    # Load LTF.
    #print ltf  # todo
    ltf_doc = load_doc(ltf, LTFDocument, logger) 
    if ltf_doc is None:
        shutil.rmtree(temp_dir) 
        return
    # Attempt tagging.
    try:
        # Extract tokens.
        tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized() 
        txt = ltf_doc.text() 
        spans = aligner.align(txt, tokens)
        # Extract features
        featsf = os.path.join(temp_dir, 'feats.txt')
        feats = enc.get_feats(tokens) 
        write_crfsuite_file(featsf, feats)
        # Tag.
        # print "tmep_dir"+temp_dir
        tagsf = os.path.join(temp_dir, 'tags.txt')
        #probf = os.path.join(temp_dir, 'probs.txt')
        cmd = ['/home/wangtianlu/local/bin/crfsuite', 'tag',
               '-m', modelf,
               featsf]
        with open(tagsf, 'w') as f:
            subprocess.call(cmd, stdout=f)
        # Derive a probs file path from the LTF path and write per-token marginal probabilities to it.

        probf1 = ltf.replace('ltf', 'probs')
        probf = probf1.replace('test', 'probs')

        # print probf
        cmd_ = ['/home/wangtianlu/local/bin/crfsuite', 'tag',
               '-m', modelf, '-i',
               featsf]
        with open(probf, 'w') as f:
            subprocess.call(cmd_, stdout=f)


        # maxprobf = ltf.replace('ltf', 'maxprobs')
        #
        # cmd_ = ['/Users/koala/Documents/lab/Blender/LORELEI/active_learning/ne-tagger/lib/crf/bin/crfsuite','tag',
        #        '-m', modelf, '-i',
        #        featsf]
        # with open(maxprobf, 'w') as f:
        #     subprocess.call(cmd_, stdout=f)

        with open(tagsf, 'r') as f:
            tags = [line.strip() for line in f]
            # print len(tags)  # todo
            tags = tags[:len(tokens)]
            # print len(tags)  # todo
            # print 'this is tags'
            # print tags # todo
        # Chunk tags.
        chunks = chunker.tags_to_chunks(tags)  # todo:bughere
        # Construct mentions.
        doc_id = ltf_doc.doc_id
        mentions = []
        n = 1 
        for token_bi, token_ei, tag in chunks:
            if tag == 'O':
                continue 

            # Assign entity id.
            entity_id = '%s-NE%d' % (doc_id, n) 

            # Determine char onsets/offset for mention extent.
            start_char = token_onsets[token_bi] 
            end_char = token_offsets[token_ei] 

            # Finally, determine text of extent and append.
            extent_bi = spans[token_bi][0] 
            extent_ei = spans[token_ei][1] 
            extent = txt[extent_bi:extent_ei+1] 
            mentions.append([entity_id,           # entity id
                             tag,                 # NE type
                             extent,              # extent text
                             start_char,          # extent char onset
                             end_char,            # extent char offset
                            ]) 

            n += 1 

        # Write detected mentions to LAF file.
        bn = os.path.basename(ltf)
        laf = os.path.join(tagged_dir, bn.replace('.ltf.xml', tagged_ext)) 
        laf_doc = LAFDocument(mentions=mentions, lang=ltf_doc.lang, doc_id=doc_id) 
        laf_doc.write_to_file(laf) 
    except KeyError:
        logger.warn('Problem with %s. Skipping.' % ltf) 

    # Clean up.
    shutil.rmtree(temp_dir)
Example #7
def write_train_data(lafs, ltf_dir, enc, trainf):
    """Extract features and target labels for each LTF/LAF pair and write to
    disk in CRFSuite data format.

    For details regarding this format, consult

    http://www.chokkan.org/software/crfsuite/manual.html

    Inputs
    ------
    lafs: list of str
        Paths to LAF files.

    ltf_dir : str
        Directory to search for LTF files.

    enc : features.Encoder
        Feature encoder.

    trainf : str
        CRFsuite training file.
    """
    with open(trainf, 'w') as f:
        for laf in lafs:
            # Check that the LTF and LAF are valid.
            bn = os.path.basename(laf) 
            ltf = os.path.join(ltf_dir, bn.replace('.laf.xml', '.ltf.xml')) 
            laf_doc = load_doc(laf, LAFDocument, logger) 
            ltf_doc = load_doc(ltf, LTFDocument, logger) 
            if laf_doc is None or ltf_doc is None:
                continue 
            
            # Extract features/targets.
            try:
                # Extract tokens.
                tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized() 
                #print len(tokens)
                # Convert mentions to format expected by the encoder; that is,
                # (tag, token_onset, token_offset).
                mentions = laf_doc.mentions()
                #print mentions
                if len(mentions) == 0:
                    mentions_ = [] 
                else:
                    # Map to the minimal enclosing span of tokens in the
                    # supplied LTF.
                    entity_ids, tags, extents, char_onsets, char_offsets = zip(*mentions)
                    # print token_onsets
                    # print char_onsets
                    # print char_onsets
                    mention_onsets, mention_offsets = convert_extents(char_onsets, char_offsets,
                                                                      token_onsets, token_offsets)
                    #print mention_onsets
                    mentions_ = list(zip(tags, mention_onsets, mention_offsets)) 

                # Eliminate overlapping mentions, retaining whichever
                # is first when sorted in ascending order by (onset, offset).
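                # Illustrative example (hypothetical spans): given
                # [('PER', 3, 5), ('ORG', 4, 6), ('GPE', 7, 7)], the ORG span
                # overlaps the PER span and is dropped, leaving PER and GPE.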
                #print mentions_
                sort_mentions(mentions_) 
                prev_mention_offset = -1 
                temp_mentions_ = [] 
                for tag, mention_onset, mention_offset in mentions_:
                    if mention_onset > prev_mention_offset:
                        temp_mentions_.append([tag, mention_onset, mention_offset]) 
                    prev_mention_offset = mention_offset 
                mentions_ = temp_mentions_
                # print 'mentions:'
                #print mentions_
                #print tokens

                # Extract features/targets and write to file in CRFSuite
                # format.
                feats, targets = enc.get_feats_targets(tokens, mentions_)
                #print 'feats: \n'
                #print feats
                #print 'targets:'
                #print targets
            except KeyError:
                logger.warn('Feature extraction failed for %s. Skipping.' % laf) 
                continue 

            # Write to file.
            write_crfsuite_file(f, feats, targets)
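
A hedged end-to-end sketch of how write_train_data is typically used: build the
training file from LAF/LTF pairs, then train a CRFSuite model from it. The
paths, Encoder construction, and model file name are illustrative assumptions;
the crfsuite learn invocation follows the standard CLI documented in the manual
linked above.

import glob
import subprocess

from features import Encoder        # assumed module layout

lafs = sorted(glob.glob('laf/*.laf.xml'))   # hypothetical annotation directory
enc = Encoder()                             # assumed no-arg constructor
trainf = 'train.crfsuite.txt'

write_train_data(lafs, 'ltf', enc, trainf)

# Train a model from the generated file (standard crfsuite CLI usage).
subprocess.call(['crfsuite', 'learn', '-m', 'model.crf', trainf])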