import logging
import os

# Assumes the project's load_doc helper and LAFDocument class are in scope
# (imported from the repo's own modules).


def updateNEdirs(prev_laf_dir, temp_laf_dir, new_laf_dir):

    changeinNEs = False

    for fn in os.listdir(prev_laf_dir):
        if fn.endswith('laf.xml'):
            temp_laf = os.path.join(temp_laf_dir, fn)
            if not os.path.exists(temp_laf):
                logging.warning("{} processed last iteration but not this one".format(fn))
    for fn in os.listdir(temp_laf_dir):
        if fn.endswith('laf.xml'):
            prev_laf = os.path.join(prev_laf_dir, fn)
            temp_laf = os.path.join(temp_laf_dir, fn)
            if not os.path.exists(prev_laf):
                logging.warning("{} processed this iteration but not the last. Skipping...".format(fn))
                continue
            
            prev_laf_doc = load_doc(prev_laf, LAFDocument, logger)
            temp_laf_doc = load_doc(temp_laf, LAFDocument, logger)
            doc_id = prev_laf_doc.doc_id

            prev_mentions = [[tag, extent, start_char, end_char]
                             for entity_id, tag, extent, start_char, end_char
                             in prev_laf_doc.mentions()]
            prev_spans = [(start_char, end_char)
                          for tag, extent, start_char, end_char in prev_mentions]
            temp_mentions = [[tag, extent, start_char, end_char]
                             for entity_id, tag, extent, start_char, end_char
                             in temp_laf_doc.mentions()]
            mentions = list(prev_mentions)
            for m in temp_mentions:
                if (m[2], m[3]) not in prev_spans:
                    mentions.append(m)
                    changeinNEs = True

            # Sort new mentions list by start_char then end_char
            mentions = sorted(mentions, key=lambda x: (int(x[2]), int(x[3])))

            # Assign entity ids in span order; build a new list rather than
            # appending to the list being iterated.
            numbered_mentions = []
            n = 1
            for tag, extent, start_char, end_char in mentions:
                entity_id = '{}-NE{}'.format(doc_id, n)
                numbered_mentions.append([entity_id, tag, extent,
                                          start_char, end_char])
                n += 1

            laf = os.path.join(new_laf_dir, fn)
            laf_doc = LAFDocument(mentions=numbered_mentions,
                                  lang=prev_laf_doc.lang, doc_id=doc_id)
            laf_doc.write_to_file(laf)

    return new_laf_dir, changeinNEs
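
A minimal usage sketch (not from the source; the directory names are hypothetical), merging the previous iteration's LAF files with the current ones and stopping once no new mentions appear:

prev_dir = 'laf_iter0'
temp_dir = 'laf_iter1'
merged_dir = 'laf_merged'

merged_dir, changed = updateNEdirs(prev_dir, temp_dir, merged_dir)
if not changed:
    print('No new named entities this iteration; tagging has converged.')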
Example #2
import os
import shutil
import subprocess
import tempfile

# Assumes project-local helpers: load_doc, LTFDocument, LAFDocument,
# write_crfsuite_file, and a module-level logger.


def tag_file(ltf, aligner, enc, chunker, modelf, tagged_dir, tagged_ext):
    """Extract features for tokenization in LTF file and tag named entities.

    Inputs
    ------
    ltf : str
        LTF file.

    aligner : align.Aligner
        Aligner instance used to obtain character onsets/offsets of discovered
        mentions.

    enc : features.Encoder
        Encoder instance for feature extraction.

    chunker : chunk.ChunkEncoder
        ChunkEncoder instance for obtaining token onsets/offsets of discovered
        mentions from tag sequences.

    modelf : str
        CRFSuite model file.

    tagged_dir : str
        Directory to which to output LAF files.

    tagged_ext : str
        Extension to use for output LAF files.
    """
    # Create working directory.
    temp_dir = tempfile.mkdtemp()
    # Load LTF.
    ltf_doc = load_doc(ltf, LTFDocument, logger)
    if ltf_doc is None:
        shutil.rmtree(temp_dir)
        return
    # Attempt tagging.
    try:
        # Extract tokens.
        tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized()
        txt = ltf_doc.text()
        spans = aligner.align(txt, tokens)
        # Extract features
        featsf = os.path.join(temp_dir, 'feats.txt')
        feats = enc.get_feats(tokens)
        write_crfsuite_file(featsf, feats)
        # Tag.
        tagsf = os.path.join(temp_dir, 'tags.txt')
        cmd = [
            '/home/wangtianlu/local/bin/crfsuite', 'tag', '-m', modelf, featsf
        ]
        with open(tagsf, 'w') as f:
            subprocess.call(cmd, stdout=f)
        # Write per-token tagging probabilities next to the input, rewriting
        # the 'ltf'/'test' path components to 'probs'.
        probf1 = ltf.replace('ltf', 'probs')
        probf = probf1.replace('test', 'probs')
        cmd_ = [
            '/home/wangtianlu/local/bin/crfsuite', 'tag', '-m', modelf, '-i',
            featsf
        ]
        with open(probf, 'w') as f:
            subprocess.call(cmd_, stdout=f)

        # Load tagged output.
        with open(tagsf, 'r') as f:
            tags = [line.strip() for line in f]
            tags = tags[:len(tokens)]
        # Chunk tags.
        chunks = chunker.tags_to_chunks(tags)
        # Construct mentions.
        doc_id = ltf_doc.doc_id
        mentions = []
        n = 1
        for token_bi, token_ei, tag in chunks:
            if tag == 'O':
                continue

            # Assign entity id.
            entity_id = '%s-NE%d' % (doc_id, n)

            # Determine char onsets/offset for mention extent.
            start_char = token_onsets[token_bi]
            end_char = token_offsets[token_ei]

            # Finally, determine text of extent and append.
            extent_bi = spans[token_bi][0]
            extent_ei = spans[token_ei][1]
            extent = txt[extent_bi:extent_ei + 1]
            mentions.append([
                entity_id,  # entity id
                tag,  # NE type
                extent,  # extent text
                start_char,  # extent char onset
                end_char,  # extent char offset
            ])

            n += 1

        # Write detected mentions to LAF file.
        bn = os.path.basename(ltf)
        laf = os.path.join(tagged_dir, bn.replace('.ltf.xml', tagged_ext))
        laf_doc = LAFDocument(mentions=mentions,
                              lang=ltf_doc.lang,
                              doc_id=doc_id)
        laf_doc.write_to_file(laf)
    except KeyError:
        logger.warning('Problem with %s. Skipping.' % ltf)

    # Clean up.
    shutil.rmtree(temp_dir)
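
A minimal driver sketch for tag_file, assuming the align.Aligner, features.Encoder, and chunk.ChunkEncoder classes named in the docstring take no constructor arguments (their real signatures are project-specific); the model and directory paths are hypothetical:

import glob

from align import Aligner
from chunk import ChunkEncoder
from features import Encoder

aligner = Aligner()      # assumed no-arg constructors
enc = Encoder()
chunker = ChunkEncoder()

for ltf in glob.glob('ltf/*.ltf.xml'):
    tag_file(ltf, aligner, enc, chunker,
             modelf='ne_model.crf', tagged_dir='laf_out',
             tagged_ext='.laf.xml')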
Example #3
import os
import shutil
import subprocess
import tempfile

# Assumes project-local helpers: load_doc, LTFDocument, LAFDocument,
# write_crfsuite_file, and a module-level logger.


def tag_file(ltf, aligner, enc, chunker, modelf, tagged_dir, tagged_ext):
    """Extract features for tokenization in LTF file and tag named entities.

    Inputs
    ------
    ltf : str
        LTF file.

    aligner : align.Aligner
        Aligner instance used to obtain character onsets/offsets of discovered
        mentions.

    enc : features.Encoder
        Encoder instance for feature extraction.

    chunker : chunk.ChunkEncoder
        ChunkEncoder instance for obtaining token onsets/offsets of discovered
        mentions from tag sequences.

    modelf : str
        CRFSuite model file.

    tagged_dir : str
        Directory to which to output LAF files.

    tagged_ext : str
        Extension to use for output LAF files.
    """
    # Create working directory.
    temp_dir = tempfile.mkdtemp()

    # Load LTF.
    ltf_doc = load_doc(ltf, LTFDocument, logger)
    if ltf_doc is None:
        shutil.rmtree(temp_dir)
        return

    # Attempt tagging.
    try:
        # Extract tokens.
        tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized()
        txt = ltf_doc.text()
        spans = aligner.align(txt, tokens)

        # Extract features.
        featsf = os.path.join(temp_dir, 'feats.txt')
        feats = enc.get_feats(tokens)
        write_crfsuite_file(featsf, feats)

        # Tag.
        tagsf = os.path.join(temp_dir, 'tags.txt')
        cmd = ['crfsuite', 'tag',
               '-m', modelf,
               featsf]

        with open(tagsf, 'w') as f:
            subprocess.call(cmd, stdout=f)

        # Load tagged output.
        with open(tagsf, 'r') as f:
            tags = [line.strip() for line in f]
            tags = tags[:len(tokens)]

        # Chunk tags.
        chunks = chunker.tags_to_chunks(tags)

        # Construct mentions.
        doc_id = ltf_doc.doc_id
        mentions = []
        n = 1
        for token_bi, token_ei, tag in chunks:
            if tag == 'O':
                continue

            # Assign entity id.
            entity_id = '%s-NE%d' % (doc_id, n)

            # Determine char onsets/offset for mention extent.
            start_char = token_onsets[token_bi]
            end_char = token_offsets[token_ei]

            # Finally, determine text of extent and append.
            extent_bi = spans[token_bi][0]
            extent_ei = spans[token_ei][1]
            extent = txt[extent_bi:extent_ei+1]
            mentions.append([entity_id,           # entity id
                             tag,                 # NE type
                             extent,              # extent text
                             start_char,          # extent char onset
                             end_char,            # extent char offset
                            ])

            n += 1

        # Write detected mentions to LAF file.
        bn = os.path.basename(ltf)
        laf = os.path.join(tagged_dir, bn.replace('.ltf.xml', tagged_ext))
        laf_doc = LAFDocument(mentions=mentions, lang=ltf_doc.lang, doc_id=doc_id)
        laf_doc.write_to_file(laf)
    except (KeyError, ValueError):
        logger.warning('Problem with %s. Skipping.' % ltf)

    # Clean up.
    shutil.rmtree(temp_dir)
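
Example #2 invokes the crfsuite binary by absolute path; a more portable sketch (an aside, not part of the original code) resolves the binary from PATH at runtime:

import shutil
import subprocess

crfsuite_bin = shutil.which('crfsuite')  # None when not on PATH
if crfsuite_bin is None:
    raise RuntimeError('crfsuite executable not found on PATH')

# Hypothetical file names, mirroring the examples above.
with open('tags.txt', 'w') as f:
    subprocess.call([crfsuite_bin, 'tag', '-m', 'ne_model.crf', 'feats.txt'],
                    stdout=f)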
Example #4
import logging
import os
import shutil
import subprocess
import tempfile

# Assumes project-local helpers: load_doc, LTFDocument, LAFDocument,
# write_crfsuite_file, and a module-level logger.


def tag_file(ltf, aligner, enc, chunker, modelf, tagged_dir, tagged_ext,
             threshold, A_vals, B_vals, G_vals):
    """Extract features for tokenization in LTF file and tag named entities.

    Inputs
    ------
    ltf : str
        LTF file.

    aligner : align.Aligner
        Aligner instance used to obtain character onsets/offsets of discovered
        mentions.

    enc : features.Encoder
        Encoder instance for feature extraction.

    chunker : chunk.ChunkEncoder
        ChunkEncoder instance for obtaining token onsets/offsets of discovered
        mentions from tag sequences.

    modelf : str
        CRFSuite model file.

    tagged_dir : str
        Directory to which to output LAF files.

    tagged_ext : str
        Extension to use for output LAF files.

    threshold : float
        Minimum marginal probability for a hypothesized NE tag to be kept.

    A_vals, B_vals, G_vals
        Feature value inventories forwarded to the encoder.
    """

    # Create working directory.
    temp_dir = tempfile.mkdtemp()

    # Load LTF.
    ltf_doc = load_doc(ltf, LTFDocument, logger)
    if ltf_doc is None:
        shutil.rmtree(temp_dir)
        return

    # Attempt tagging.
    try:
        # Extract tokens.
        try:
            tokens, token_ids, token_onsets, token_offsets, token_nums, \
                token_As, token_Bs, token_Gs, token_Fs, token_Js = \
                ltf_doc.tokenizedWithABG()
        except Exception:
            # Fall back to plain tokenization when ABG annotations are absent.
            tokens, token_ids, token_onsets, token_offsets, token_nums = \
                ltf_doc.tokenized()
            token_As = token_Bs = token_Gs = token_Fs = token_Js = None
        txt = ltf_doc.text()
        spans = aligner.align(txt, tokens)

        # Extract features.
        featsf = os.path.join(temp_dir, 'feats.txt')
        feats = enc.get_feats(tokens, token_nums, token_As, token_Bs, token_Gs,
                              token_Fs, token_Js, A_vals, B_vals, G_vals)
        write_crfsuite_file(featsf, feats)

        # Tag, emitting the marginal probability of each tag as an extra
        # field in the tags file ('--probability' would instead write the
        # probability of the whole tag sequence at the top of the file).
        tagsf = os.path.join(temp_dir, 'tags.txt')
        cmd = ['crfsuite', 'tag',
               '--marginal',
               '-m', modelf,
               featsf]
        with open(tagsf, 'w') as f:
            subprocess.call(cmd, stdout=f)

        # Look for NEs in the tagfile with marginal probs.
        # If the tag is 'O', keep it.
        # If the tag is anything else, keep if marginal prob is above threshold.

        tagsf2 = os.path.join(temp_dir, 'tags2.txt')

        def _check_BIL_sequence(tags, probs, threshold):
            """Check a candidate BI*L tag sequence.

            Returns the sequence if at least half of its tags meet the
            threshold probability, and a sequence of O's of equal length
            otherwise. If the sequence contains only one tag, that tag is
            returned as a U tag.
            """
            nextpart = ''

            if len(tags) < 1:
                logging.warning("Empty tag sequence submitted as BI*L sequence.")
            elif len(tags) == 1:
                logging.warning("Tag sequence of length 1 submitted as BI*L sequence.")
                # Compare probs, not abs vals of logprobs, hence >= and not <=.
                if probs[0] >= threshold:
                    nextpart = 'U{}'.format(tags[0][1:])
                else:
                    nextpart = 'O\n'
            else:
                if not (tags[0][0] == 'B' and tags[-1][0] == 'L'):
                    logging.warning('Incomplete BI*L sequence submitted.')
                    tags[0] = 'B{}'.format(tags[0][1:])
                    tags[-1] = 'L{}'.format(tags[-1][1:])

                count = 0
                for prob in probs:
                    if prob >= threshold:
                        count += 1

                if count >= len(probs) / 2.0:
                    nextpart = ''.join(tags)
                else:
                    nextpart = 'O\n' * len(tags)

            return nextpart

        """ Retain or reject NE hypotheses based on probs and write new tags file """
        with open(tagsf2, 'w') as f_out:
            with open(tagsf, 'r') as f_in:
                NEtags = None
                NEprobs = None
                for line in f_in.read().split('\n'):
                    if ':' not in line:
                        continue

                    tag, prob = line.strip().split(':')

                    if tag[0] == 'O':
                        # If a sequence is in play, check it; then write the
                        # tag.
                        if NEtags:
                            f_out.write(_check_BIL_sequence(
                                NEtags, NEprobs, threshold))
                            NEtags = None
                            NEprobs = None
                        f_out.write(tag + '\n')

                    elif tag[0] == 'U':
                        # If a sequence is in play, check it. Write the tag if
                        # its prob meets the threshold; otherwise write O.
                        if NEtags:
                            f_out.write(_check_BIL_sequence(
                                NEtags, NEprobs, threshold))
                            NEtags = None
                            NEprobs = None
                        # Compare probs, not abs vals of logprobs, hence >=.
                        if float(prob) >= threshold:
                            f_out.write(tag + '\n')
                        else:
                            f_out.write('O\n')

                    elif tag[0] == 'B':
                        # If a sequence is in play, check it; then start a new
                        # sequence with this tag.
                        if NEtags:
                            f_out.write(_check_BIL_sequence(
                                NEtags, NEprobs, threshold))
                        NEtags = [tag + '\n']
                        NEprobs = [float(prob)]

                    elif tag[0] == 'I':
                        # If a sequence is in play, add the tag to it;
                        # otherwise start a new sequence with tag = B.
                        if NEtags:
                            NEtags.append(tag + '\n')
                            NEprobs.append(float(prob))
                        else:
                            logging.warning("Found an out of sequence I tag.")
                            tag = 'B{}'.format(tag[1:])
                            NEtags = [tag + '\n']
                            NEprobs = [float(prob)]

                    elif tag[0] == 'L':
                        # If a sequence is in play, add the tag and check the
                        # sequence; otherwise start a new sequence with
                        # tag = B.
                        if NEtags:
                            NEtags.append(tag + '\n')
                            NEprobs.append(float(prob))
                            f_out.write(_check_BIL_sequence(
                                NEtags, NEprobs, threshold))
                            NEtags = None
                            NEprobs = None
                        else:
                            logging.warning("Found an out of sequence L tag.")
                            tag = 'B{}'.format(tag[1:])
                            NEtags = [tag + '\n']
                            NEprobs = [float(prob)]

                # Flush any incomplete BI*L sequence at the end of the file.
                if NEtags:
                    f_out.write(_check_BIL_sequence(NEtags, NEprobs, threshold))
                    NEtags = None
                    NEprobs = None

        tagsf = tagsf2  # Set the checked tag file as the new tag file

        # Load tagged output.
        with open(tagsf, 'r') as f:
            tags = [line.strip() for line in f]
            tags = tags[:len(tokens)]

        # Chunk tags.
        chunks = chunker.tags_to_chunks(tags)

        # Construct mentions.
        doc_id = ltf_doc.doc_id
        mentions = []
        n = 1
        for token_bi, token_ei, tag in chunks:
            if tag == 'O':
                continue

            # Assign entity id.
            entity_id = '%s-NE%d' % (doc_id, n)

            # Determine char onsets/offset for mention extent.
            start_char = token_onsets[token_bi]
            end_char = token_offsets[token_ei]

            # Finally, determine text of extent and append.
            extent_bi = spans[token_bi][0]
            extent_ei = spans[token_ei][1]
            extent = txt[extent_bi:extent_ei + 1]
            mentions.append([
                entity_id,  # entity id
                tag,  # NE type
                extent,  # extent text
                start_char,  # extent char onset
                end_char,  # extent char offset
            ])

            n += 1

        # Write detected mentions to LAF file.
        bn = os.path.basename(ltf)
        laf = os.path.join(tagged_dir, bn.replace('.ltf.xml', tagged_ext))
        laf_doc = LAFDocument(mentions=mentions,
                              lang=ltf_doc.lang,
                              doc_id=doc_id)
        laf_doc.write_to_file(laf)
    except Exception:
        logger.warning('Problem with %s. Skipping.' % ltf)

    # Clean up.
    shutil.rmtree(temp_dir)
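
A toy, standalone illustration (not the project's API) of the per-tag thresholding idea used above: crfsuite's marginal output pairs each tag with a probability, and low-confidence NE tags are demoted to O. The full code above additionally validates whole BI*L sequences rather than single tags.

threshold = 0.5
lines = ['B-PER:0.91', 'L-PER:0.88', 'O:0.99', 'U-LOC:0.40']  # made-up output
kept = []
for line in lines:
    tag, prob = line.split(':')
    if tag == 'O' or float(prob) >= threshold:
        kept.append(tag)
    else:
        kept.append('O')
print(kept)  # ['B-PER', 'L-PER', 'O', 'O']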