Example #1
class ProbeStep(MAT.PluginMgr.PluginStep):
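    # Side-effect callback for --probe_g: if the value names a known input
    # document type, splice that type's own options into the running
    # parser; unknown values are silently ignored.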
    def addFileType(option, optstring, value, parser):
        try:
            ftype = MAT.DocumentIO.getInputDocumentIOClass(value)
            ftype.addOptions(parser.aggregator, values=parser.values)
        except KeyError:
            pass

    argList = [
        OpArgument("probe_a", hasArg=True),
        OpArgument("probe_b",
                   hasArg=True,
                   action="append",
                   dest="probe_b_list"),
        OpArgument("probe_c"),
        OpArgument("probe_d", hasArg=True),
        OpArgument("probe_e",
                   hasArg=True,
                   action="append",
                   dest="probe_e_list"),
        OpArgument("probe_f"),
        OpArgument("probe_g", hasArg=True, side_effect_callback=addFileType)
    ]

    def undo(self):
        pass

    def do(self,
           annotSet,
           probe_result=None,
           probe_a=None,
           probe_c=False,
           probe_b_list=None,
           probe_d=None,
           probe_e_list=None,
           probe_f=False,
           probe_g=None,
           **kw):
        if probe_result is not None:
            probe_result["probe_a"] = probe_a
            probe_result["probe_b_list"] = probe_b_list
            probe_result["probe_c"] = probe_c
            probe_result["probe_d"] = probe_d
            probe_result["probe_e_list"] = probe_e_list
            probe_result["probe_f"] = probe_f
            probe_result["probe_g"] = probe_g
        return annotSet
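
A minimal driver sketch for the step above. This is hypothetical: in real use, MAT's pipeline parses the OpArguments and forwards the resulting values to do() as keyword arguments, and PluginStep instances are constructed by the plugin manager rather than by hand.

# Hypothetical invocation; assumes a ProbeStep instance ("step") and an
# annotation set obtained from the MAT pipeline.
probeResult = {}
annotSet = step.do(annotSet,
                   probe_result=probeResult,
                   probe_a="value-a",
                   probe_b_list=["first", "second"],
                   probe_c=True)
# probeResult now records exactly which option values reached the step.
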
Example #2
File: XMLIO.py  Project: govtmirror/DmD
class XMLDocumentIO(DocumentFileIO):
    def __init__(self,
                 xml_input_is_overlay=False,
                 xml_translate_all=False,
                 signal_is_xml=False,
                 xml_output_tag_exclusions=None,
                 xml_output_exclude_metadata=False,
                 encoding=None,
                 **kw):
        # Changing the default encoding.
        if encoding is None:
            encoding = "utf-8"
        DocumentFileIO.__init__(self, encoding=encoding, **kw)
        if xml_input_is_overlay:
            signal_is_xml = True
        self.excludeMetadata = xml_output_exclude_metadata
        self.xmlInputIsOverlay = xml_input_is_overlay
        self.tagExclusions = None
        if xml_output_tag_exclusions is not None:
            if type(xml_output_tag_exclusions) is str:
                xml_output_tag_exclusions = xml_output_tag_exclusions.split(
                    ",")
                if xml_output_tag_exclusions == [""]:
                    xml_output_tag_exclusions = []
            self.tagExclusions = dict([(t, True)
                                       for t in xml_output_tag_exclusions])
        self.signalIsXML = signal_is_xml
        self.xmlTranslateAll = xml_translate_all

    # deserialize() fulfills the contract DocumentFileIO requires of its
    # children. s is a Unicode string; annotDoc is an annotated document.

    def deserialize(self, s, annotDoc):
        # If there's no global annotation type repository, we want xmlTranslateAll to be True.
        state = _ParserState(
            annotDoc, self.xmlInputIsOverlay, self.xmlTranslateAll
            or (not annotDoc.atypeRepository.globalTypeRepository))
        state.parse(s)

    def writeToUnicodeString(self, annotDoc):
        signalIsXML = self.signalIsXML or \
                      (annotDoc.metadata.has_key("signal_type") and
                       annotDoc.metadata["signal_type"] == "xml")
        # Get all the annotations. We don't need to care about overlap here,
        # since overlap is expected when writing everything out: the
        # annotations will be nested. So just get the annotations and sort
        # them; if we ever hit a crossing dependency, we raise an error below.
        # Split the atypes into spanned and spanless.
        spanned = []
        spanless = []
        # I used to remove the tag exclusions when I collect the
        # indices, but I need to do it earlier in order to figure out
        # if I need a top element or not.
        annots = []
        spanlessAnnots = []
        for atype in annotDoc.atypeDict.keys():
            if self.tagExclusions and self.tagExclusions.has_key(atype.lab):
                continue
            if atype.hasSpan:
                spanned.append(atype.lab)
            else:
                spanless.append(atype.lab)
        if spanned:
            annots = annotDoc.getAnnotations(atypes=spanned)
        if spanless:
            spanlessAnnots = annotDoc.getAnnotations(atypes=spanless)
        # We now know they can nest. So let's sort them.
        # Sort them first by earliest start, latest end.
        annots.sort(self._cmpAnnot)
        # Now, I can loop through the annots, and keep a stack,
        # and we know when to pop the stack because of
        # how the indexes work.
        indices = {}
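        # indices: char offset -> [annots starting here, annots ending here]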
        lastAnnot = None
        for annot in annots:
            if lastAnnot and \
               (((lastAnnot.start < annot.start) and (lastAnnot.end < annot.end)) or \
                ((lastAnnot.start > annot.start) and (lastAnnot.end > annot.end))):
                raise SaveError, "crossing dependencies"
            try:
                indices[annot.start][0].append(annot)
            except KeyError:
                indices[annot.start] = [[annot], []]
            try:
                indices[annot.end][1].append(annot)
            except KeyError:
                indices[annot.end] = [[], [annot]]
        indexList = indices.keys()
        indexList.sort()
        segs = []
        # We need to add a toplevel XML tag if we don't already have one,
        # i.e., when the signal is not XML and any of the following holds:
        # there are no spanned annotations; the maximal annotation starts
        # after the beginning of the signal, or ends before the end of it;
        # there are spanless annotations, which will be inserted as
        # zero-length elements before the first spanned annotation; or we're
        # writing metadata, which inserts the annotation types in a similar
        # position.
        addTop = (not signalIsXML) and \
                 ((not annots) or \
                  (annots[0].start > 0) or \
                  (annots[0].end < len(annotDoc.signal)) or \
                  spanlessAnnots or \
                  (not self.excludeMetadata))
        if addTop:
            segs.append("<__top>")
        pos = 0
        atypesInserted = False
        for i in indexList:
            if pos < i:
                seg = annotDoc.signal[pos:i]
                if not signalIsXML:
                    seg = xml.sax.saxutils.escape(seg)
                segs.append(seg)
                pos = i
            [starts, ends] = indices[i]
            # Reverse the ends.
            ends.reverse()
            for endAnnot in ends:
                segs.append("</" + endAnnot.atype.lab + ">")
            for startAnnot in starts:
                if not atypesInserted:
                    if not self.excludeMetadata:
                        segs.append(self._formatAtypes(annotDoc))
                    atypesInserted = True
                    if spanlessAnnots:
                        for sAnnot in spanlessAnnots:
                            segs.append(
                                self._formatAnnot(sAnnot, spanless=True))
                segs.append(self._formatAnnot(startAnnot))
        if pos < len(annotDoc.signal):
            seg = annotDoc.signal[pos:]
            if not signalIsXML:
                seg = xml.sax.saxutils.escape(seg)
            segs.append(seg)
        if addTop:
            segs.append("</__top>")
        if not self.excludeMetadata:
            segs.append("<!-- _mat_metadata_ " +
                        base64.b64encode(json.dumps(annotDoc.metadata)) +
                        " -->")
        return "".join(segs)

    def _cmpAnnot(self, ann1, ann2):
        return cmp(ann1.start, ann2.start) or -cmp(ann1.end, ann2.end)

    def _formatAtypes(self, annotDoc):
        segs = ["<_mat:atypes>"]
        for atype in annotDoc.atypeDict.keys():
            segs.append("<_mat:atype name='%s' hasSpan='%s'>" %
                        (atype.lab, (atype.hasSpan and "yes") or "no"))
            for attr in atype.attr_list:
                segs.append(
                    "<_mat:attr name='%s' type='%s' aggregation='%s'/>" %
                    (attr.name, attr._typename_, attr.aggregation or "none"))
            segs.append("</_mat:atype>")
        segs.append("</_mat:atypes>")
        return "".join(segs)

    def _formatAnnot(self, annot, spanless=False):
        elts = ["<", annot.atype.lab]
        if annot.attrs:
            for attr, val in zip(annot.atype.attr_list, annot.attrs):
                if val is not None:
                    # Handle annotations specially.
                    if attr._typename_ == "annotation":
                        if attr.aggregation:
                            v = ",".join([str(a.id) for a in val])
                        else:
                            v = str(val.id)
                    else:
                        v = attr.toStringNonNull(val)
                    elts.append(" " + attr.name + "=" +
                                xml.sax.saxutils.quoteattr(v))
        if annot.id is not None:
            elts.append(" _mat:id=" + xml.sax.saxutils.quoteattr(annot.id))
        if spanless:
            elts.append("/>")
        else:
            elts.append(">")
        return "".join(elts)

    # Best to use the OpArgument infrastructure, so we can extract
    # arguments cleanly from CGI and cmdline.

    inputArgs = OptionTemplate(
        [OpArgument("xml_input_is_overlay",
                    help="If specified, the input XML will be treated as a mix of task-relevant annotations and underlying XML, and the extracted signal will be a well-formed XML file"),
         OpArgument("xml_translate_all",
                    help="If specified, all tags will be converted, whether or not they're found in whatever task is specified")],
        heading="Options for XML input")

    outputArgs = OptionTemplate(
        [OpArgument("signal_is_xml",
                    help="If specified, the underlying signal will be treated as a well-formed XML file when the output file is rendered. If the input file type is also 'xml-inline', use the --xml_input_is_overlay flag to control this setting instead."),
         OpArgument("xml_output_tag_exclusions",
                    hasArg=True,
                    help="A comma-delimited list of annotation labels to exclude from the XML output"),
         OpArgument("xml_output_exclude_metadata",
                    help="Normally, the XML writer saves the document metadata inside an XML comment, so it can be read back in by the XML reader. This flag causes the metadata not to be written.")],
        heading="Options for XML output")
Example #3
class ModelBuilder(OptionBearer):

    argList = [
        OpArgument(
            "partial_training_on_gold_only",
            help=
            "When the trainer is presented with partially tagged documents, by default MAT will ask it to train on all annotated segments, completed or not. If this flag is specified, only completed segments should be used for training."
        )
    ]

    def __init__(self,
                 task,
                 buildInfo,
                 file_type='mat-json',
                 partial_training_on_gold_only=False,
                 **kw):
        OptionBearer.__init__(self)
        # Task is required. All the elements will override whatever happens
        # to be in the task.
        if task is None:
            raise ModelBuilderError, "task is required"
        self.task = task
        self.buildInfo = buildInfo
        self.partialTrainingOnGoldOnly = partial_training_on_gold_only
        self.reader = MAT.DocumentIO.getDocumentIO(file_type,
                                                   task=self.task,
                                                   **kw)

    # docTmpDir is a temporary directory to put the documents in, in case
    # the caller wants to inspect them for some reason. tmpDir is the
    # tmp directory to use for everything, except for the docTmpDir if it's
    # provided.

    def _run(self, modelOutputFile, fileList, docTmpDir, tmpDir, oStream):
        raise ModelBuilderError, "not implemented"

    def _clearDir(self, d):
        # For everything in the directory, remove it. I could try to
        # remove the directory tree and then recreate it, but it's possible
        # that this directory is writeable by me and not its parent.
        for p in os.listdir(d):
            p = os.path.join(d, p)
            if os.path.isdir(p):
                shutil.rmtree(p)
            else:
                os.remove(p)

    def run(self,
            modelOutputFile,
            fileList,
            docTmpDir=None,
            tmpDir=None,
            oStream=None,
            collectCorpusStatistics=False):

        if tmpDir:
            # Clear the temp directory.
            if oStream:
                print >> oStream, "Clearing temp directory..."
            self._clearDir(tmpDir)
            self._runInTmpdirScope(modelOutputFile, fileList, docTmpDir,
                                   tmpDir, oStream, collectCorpusStatistics)
        else:
            with MAT.ExecutionContext.Tmpdir() as tmpDir:
                self._runInTmpdirScope(modelOutputFile, fileList, docTmpDir,
                                       tmpDir, oStream,
                                       collectCorpusStatistics)

    def _runInTmpdirScope(self, modelOutputFile, fileList, docTmpDir, tmpDir,
                          oStream, collectCorpusStatistics):

        if docTmpDir:
            # Clearing the temp directory.
            if oStream:
                print >> oStream, "Clearing document temp directory..."
            self._clearDir(docTmpDir)
        else:
            docTmpDir = os.path.join(tmpDir, "docs")
            os.mkdir(docTmpDir)

        self._run(modelOutputFile, fileList, docTmpDir, tmpDir, oStream)
        if collectCorpusStatistics:
            self.collectCorpusStatistics(fileList, docTmpDir)

    # More things you can override. Originally, this code lived in the
    # experiment engine, because that's where it's used, but in some rare
    # situations the corpus that's passed in to the model builder is not
    # exactly the corpus that's used to build the model. This can happen
    # with a special experimental model builder which incrementally builds
    # each model on top of the previous one, or which has to do extensive
    # computation to prepare the corpus, changing the corpus statistics. So
    # the code has been moved here, where those special model builders can
    # override it. Most people will never, ever have to touch it.

    def reportCorpusStatistics(self):
        if hasattr(self, "corpusStatistics"):
            return self.corpusStatistics
        else:
            return {
                "totalDocuments": 0,
                "totalItems": 0,
                "totalTokens": 0,
                "totalItemTokens": 0,
                "totalItemsByTag": {},
                "totalItemTokensByTag": {}
            }

    def collectCorpusStatistics(self, fileList, docTmpDir):
        totalDocuments = len(fileList)
        totalItems = 0
        totalTokens = 0
        totalItemTokens = 0
        totalItemsByTag = {}
        totalItemTokensByTag = {}
        for f in fileList:
            (thisTotalItems, thisTotalItemTokens, thisTotalItemsByTag,
             thisTotalTokens, thisTotalItemTokensByTag) = \
                 self.collectFileStatistics(f, docTmpDir)
            totalItems += thisTotalItems
            totalTokens += thisTotalTokens
            totalItemTokens += thisTotalItemTokens
            for k, v in thisTotalItemsByTag.items():
                try:
                    totalItemsByTag[k] += v
                except KeyError:
                    totalItemsByTag[k] = v
            for k, v in thisTotalItemTokensByTag.items():
                try:
                    totalItemTokensByTag[k] += v
                except KeyError:
                    totalItemTokensByTag[k] = v
        self.corpusStatistics = {
            "totalDocuments": totalDocuments,
            "totalItems": totalItems,
            "totalTokens": totalTokens,
            "totalItemTokens": totalItemTokens,
            "totalItemsByTag": totalItemsByTag,
            "totalItemTokensByTag": totalItemTokensByTag
        }

    # This is the important one, I think. For each document, we open it up
    # and loop through the content and lex tags. But we need to do it within
    # a particular region, say, zones.

    def collectFileStatistics(self, trainingF, docTmpDir):

        f = os.path.join(docTmpDir, os.path.basename(trainingF))
        _jsonIO = MAT.DocumentIO.getDocumentIO('mat-json', task=self.task)
        doc = _jsonIO.readFromSource(f)

        return self.collectDocumentStatistics(
            doc, [(zone.start, zone.end) for zone in doc.orderAnnotations(
                self.task.getAnnotationTypesByCategory("zone"))])

    # collectDocumentStatistics needs to do something sensible for
    # spanless annotations. So what I'm going to do is compute the
    # implied span of the spanless content annotations, and those
    # which have no span will be assigned the start index of the
    # first zone, and the end index of the last. Then, we
    # filter by regions.

    def collectDocumentStatistics(self, doc, orderedRegionList):

        contentTags = {}
        totalItems = 0
        totalByTag = {}
        totalToksByTag = {}
        totalToks = 0
        totalItemToks = 0
        task = self.task

        localToks = []
        localSpannedContent = []
        localSpanlessContent = []
        for atype, annots in doc.atypeDict.items():
            if MAT.Score.checkLexTag(contentTags, atype, task):
                localToks += annots
            elif MAT.Score.checkContentTag(contentTags, atype, task):
                if atype.hasSpan:
                    localSpannedContent += annots
                else:
                    localSpanlessContent += annots

        maxRegionHash = {}
        if localSpanlessContent:
            regionList = MAT.Document.AnnotatedDoc.processableRegions(
                [doc], task=task)[0]
            maxRegionHash[doc] = (regionList[0][0], regionList[-1][1])

        # Now we have all the relevant objects. We need to use the same
        # algorithm from Score.py to filter by regions.

        (localSpannedContent, localToks), (localSpanlessContent,) = \
            MAT.Pair.PairState.filterByRegions(
                orderedRegionList,
                spannedLists=[localSpannedContent, localToks],
                spanlessLists=[localSpanlessContent],
                maxRegionHash=maxRegionHash)

        localContent = localSpannedContent + localSpanlessContent

        localContent = [(task.getEffectiveAnnotationLabel(annot), annot)
                        for annot in localContent]

        # Now, they're filtered and labeled. I can finally collect the statistics.

        totalToks = len(localToks)
        totalItems = len(localContent)

        for lab, annot in localContent:
            if totalByTag.has_key(lab):
                totalByTag[lab] += 1
            else:
                totalByTag[lab] = 1

        # Sort the tokens, and label them using the same algorithm
        # from Score.py. Be aware that there may be no tokens.

        if localToks:
            tStartMap = {}
            tEndMap = {}

            j = 0
            for t in localToks:
                tStartMap[t.start] = j
                tEndMap[t.end] = j
                j += 1
            for label, ann in localContent:
                # What should we do if the indices don't line up?
                # Keep going backward and/or forward until you
                # find something that works? What a mess. Either
                # that, or we'd have to abort collecting the statistics.
                s = ann.start
                noTokens = False
                while True:
                    try:
                        startI = tStartMap[s]
                        break
                    except KeyError:
                        # Let's say the start doesn't line up.
                        # What to do? Start shrinking it.
                        # If you can't find a token that starts
                        # in the annotation, there are no
                        # tokens.
                        s += 1
                    if s >= ann.end:
                        noTokens = True
                        break
                if not noTokens:
                    e = ann.end
                    while True:
                        try:
                            endI = tEndMap[e]
                            break
                        except KeyError:
                            # Shrink the end.
                            e -= 1
                        if e <= ann.start:
                            noTokens = True
                            break
                if noTokens:
                    tokIncrement = 0
                else:
                    tokIncrement = 1 + (endI - startI)
                try:
                    totalToksByTag[label] += tokIncrement
                except KeyError:
                    totalToksByTag[label] = tokIncrement
                totalItemToks += tokIncrement

        return totalItems, totalItemToks, totalByTag, totalToks, totalToksByTag
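
A sketch of the statistics workflow above. MyModelBuilder is a hypothetical concrete subclass that implements _run(); ModelBuilder itself raises an error there.

# Build a model, collecting corpus statistics along the way; task,
# buildInfo and fileList are assumed to come from the MAT pipeline.
builder = MyModelBuilder(task, buildInfo, partial_training_on_gold_only=True)
builder.run("/tmp/model.out", fileList, collectCorpusStatistics=True)
stats = builder.reportCorpusStatistics()
print stats["totalDocuments"], stats["totalItems"], stats["totalItemsByTag"]
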
Example #4
class CarafeModelBuilder(MAT.ModelBuilder.ModelBuilder):

    argList = MAT.ModelBuilder.ModelBuilder.argList + \
              [OpArgument("feature_spec", hasArg = True,
                          help = "path to the Carafe feature spec file to use. Optional if feature_spec is set in the <build_settings> for the relevant model config in the task.xml file for the task."),
               OpArgument("training_method", hasArg = True,
                          help = "If present, specify a training method other than the standard method. Currently, the only recognized value is psa. The psa method is noticeably faster, but may result in somewhat poorer results. You can use a value of '' to override a previously specified training method (e.g., a default method in your task)."),
               OpArgument("max_iterations", hasArg = True,
                          help = "number of iterations for the optimized PSA training mechanism to use. A value between 6 and 10 is appropriate. Overrides any possible default in <build_settings> for the relevant model config in the task.xml file for the task."),
               OpArgument("lexicon_dir", hasArg = True,
                          help = "If present, the name of a directory which contains a Carafe training lexicon. This pathname should be an absolute pathname, and should have a trailing slash. The content of the directory should be a set of files, each of which contains a sequence of tokens, one per line. The name of the file will be used as a training feature for the token. Overrides any possible default in <build_settings> for the relevant model config in the task.xml file for the task."),
               OpArgument("parallel", help = "If present, parallelizes the feature expectation computation, which reduces the clock time of model building when multiple CPUs are available"),
               OpArgument("nthreads", hasArg = True, help = "If --parallel is used, controls the number of threads used for training."),
               OpArgument("gaussian_prior", hasArg = True,
                          help = "A positive float, default is 10.0. See the jCarafe docs for details."),
               OpArgument("no_begin", help = "Don't introduce begin states during training. Useful if you're certain that you won't have any adjacent spans with the same label. See the jCarafe documentation for more details."),
               OpArgument("l1", help = "Use L1 regularization for PSA training. See the jCarafe docs for details."),
               OpArgument("l1_c", hasArg = True,
                          help = "Change the penalty factor for the L1 regularizer. See the jCarafe docs for details."),
               OpArgument("heap_size", hasArg = True,
                          help = "If present, specifies the -Xmx argument for the Java JVM"),
               OpArgument("stack_size", hasArg = True,
                          help = "If present, specifies the -Xss argument for the Java JVM"),
               OpArgument("tags", hasArg = True,
                          help = "if present, a comma-separated list of tags to pass to the training engine instead of the full tag set for the task (used to create per-tag pre-tagging models for multi-stage training and tagging)"),
               OpArgument("pre_models", hasArg = True,
                          help = "if present, a comma-separated list of glob-style patterns specifying the models to include as pre-taggers."),
               OpArgument("add_tokens_internally", help = "If present, Carafe will use its internal tokenizer to tokenize the document before training. If your workflow doesn't tokenize the document, you must provide this flag, or Carafe will have no tokens to base its training on. We recommend strongly that you tokenize your documents separately; you should not use this flag."),
               OpArgument("word_properties", hasArg = True, help = "See the jCarafe docs for --word-properties."),
               OpArgument("word_scores", hasArg = True, help = "See the jCarafe docs for --word-scores."),
               OpArgument("learning_rate", hasArg = True, help = "See the jCarafe docs for --learning-rate."),
               OpArgument("disk_cache", hasArg = True, help = "See the jCarafe docs for --disk_cache.")]

    def __init__(self,
                 task,
                 buildInfo,
                 feature_spec=None,
                 max_iterations=None,
                 parallel=False,
                 nthreads=None,
                 training_method=None,
                 gaussian_prior=None,
                 no_begin=False,
                 l1=False,
                 l1_c=None,
                 lexicon_dir=None,
                 heap_size=None,
                 stack_size=None,
                 tags=None,
                 pre_models=None,
                 add_tokens_internally=False,
                 word_properties=None,
                 word_scores=None,
                 learning_rate=None,
                 disk_cache=None,
                 **kw):
        MAT.ModelBuilder.ModelBuilder.__init__(self, task, buildInfo, **kw)
        self.addTokensInternally = add_tokens_internally
        self.featureSpec = feature_spec
        if self.featureSpec is None:
            # Use the default one.
            self.featureSpec = os.path.join(
                os.path.dirname(MAT.Config.MATConfig["JCARAFE_JAR"]),
                "resources", "default.fspec")
        if not os.path.isabs(self.featureSpec):
            self.featureSpec = os.path.join(self.task.taskRoot,
                                            self.featureSpec)
        # So you can shut off lexicon_dir.
        self.lexiconDir = lexicon_dir
        if (self.lexiconDir
                is not None) and (not os.path.isabs(self.lexiconDir)):
            self.lexiconDir = os.path.join(self.task.taskRoot, self.lexiconDir)
        self.maxIterations = max_iterations
        self.parallel = parallel
        self.nthreads = nthreads
        if self.nthreads is not None:
            self.nthreads = int(self.nthreads)
        self.passThroughArgs = {
            "--word-properties": word_properties,
            "--word-scores": word_scores,
            "--learning-rate": learning_rate,
            "--disk-cache": disk_cache
        }
        # Make sure you can use the empty string to override.
        self.trainingMethod = training_method
        if self.trainingMethod == "":
            self.trainingMethod = None
        if (self.trainingMethod is not None) and (self.trainingMethod
                                                  not in ["psa"]):
            raise ModelBuilderError, ("unknown training method '%s'" %
                                      self.trainingMethod)
        # It can be passed in as an int or a string.
        if self.maxIterations is not None:
            self.maxIterations = int(self.maxIterations)
        self.heapSize = heap_size
        self.stackSize = stack_size
        self.tags = None
        if tags is not None:
            self.tags = tags.split(",")
        self.preModels = None
        if pre_models is not None:
            modelPats = pre_models.split(",")
            import glob
            self.preModels = []
            for pat in modelPats:
                self.preModels += glob.glob(pat)
        self.gaussianPrior = gaussian_prior
        self.noBegin = no_begin
        self.dol1Regularization = l1
        self.l1Penalty = l1_c

    # So. We may need two different temp directories. The first (maybe) is for the files, the second
    # is for the tag_set (maybe).

    # docTmpDir is a temporary directory to put the documents in, in case
    # the caller wants to inspect them for some reason. tmpDir is the
    # tmp directory to use for everything, except for the docTmpDir if it's
    # provided.

    def _run(self, modelOutputFile, fileList, docTmpDir, tmpDir, oStream):

        if not self.tags:
            tagSetFile = self._createTagSetFile(tmpDir)
        else:
            tagSetFile = None

        self._prepareDocuments(fileList, docTmpDir, oStream)

        cmdContainer = self._prepareJavaCmd(modelOutputFile, docTmpDir,
                                            tagSetFile)

        if oStream:
            print >> oStream, "Cmdline is", cmdContainer.createCmdline()

        # For the moment, until the modeler writes to stdout only, we'll
        # interleave error in the case where we're subprocess monitoring.
        cmd = SimpleLocalProcess(cmdContainer)
        # stdout can't be None.
        exitStatus, errMsg = cmd.RunSynchronous(stdout=oStream or sys.stdout)

        if exitStatus != _SUB_SUCCESS:
            raise ModelBuilderError, (
                "Command failure during model training (%s); cmdline was %s" %
                (errMsg, cmdContainer.createCmdline()))

    # Support methods. Might be overridden below.

    def _createTagSetFile(self, tmpDir):

        # Build the tag set file. Use a temporary directory,
        # because access to temp files across platforms isn't
        # guaranteed, according to the Python docs.
        tagSetFile = os.path.join(tmpDir, "tag_set")
        # A tag set file should contain only the names of
        # the tags, one per line, no newlines. If the tags
        # have attr sets, make sure you create a special
        # tag set file which has the proper attr syntax.
        # If there's more than one attr in the attr set,
        # it can't be done.
        # Opening "wb" because it appears that \n\r on Windows
        # hoses Java Carafe. Well, now it doesn't; the OTHER one hoses it on
        # Windows.
        fp = open(tagSetFile, "w")
        # First, do all the effective info. If the true label isn't in
        # trueL, it's an error unless the label category is "token".
        # Then, for all labels which aren't already mentioned, add the labels.
        # If there are attributes which aren't effective labels,
        # ignore them. If any of the labels are spanless, then barf.
        trueL, attrMap, effectiveInfo = self.task.getLabelsAndAttributesForCategory(
            "content")
        r = self.task.getAnnotationTypeRepository()
        spannedTrueL = set([t for t in trueL if r[t].hasSpan])
        trueLRemainder = set(trueL)
        okToks = set()
        for eName, (trueLabel, attr, val) in effectiveInfo.items():
            if trueLabel not in spannedTrueL:
                continue
            if (trueLabel not in trueL) and (trueLabel not in okToks):
                if self.task.getCategoryForLabel(trueLabel) != "token":
                    raise ModelBuilderError, (
                        "can't build Carafe model for effective labels (%s) whose true label isn't in the same category, unless it's a token"
                        % eName)
                else:
                    okToks.add(trueLabel)
            fp.write("%s:%s=%s\n" % (trueLabel, attr, val))
            trueLRemainder.discard(trueLabel)
        for name in (trueLRemainder & spannedTrueL):
            fp.write(name + "\n")
        fp.close()
        return tagSetFile

    # Ensure that the documents, in case they have the same names, are distinguished.
    # But don't change them unless you have to (test suite barfs, etc.).

    # Actually, things have gotten a bit more complicated. Now that I'm using SEGMENT
    # annotations, the system can train on any segment that isn't owned by MACHINE
    # and has been annotated. We might restrict it further to gold or reconciled.
    # So we never use copy anymore.

    # We're also coding around two bugs of Ben's: first, that Carafe will train
    # against everything if no regions are found, and second, that Carafe won't
    # match a region if it has more than one attribute-value pair.

    # Note that we need CARAFE_INSTRUCTION for the latter reason.

    def _prepareDocuments(self, fileList, docTmpDir, oStream):

        basenameMap = {}
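        # basenameMap counts occurrences of each basename so that files
        # with clashing names get a distinguishing numeric prefix.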

        if self.reader.__class__ is not MAT.DocumentIO.JSONDocumentIO:
            if oStream:
                print >> oStream, "Converting files to MAT JSON format..."
        else:
            if oStream:
                print >> oStream, "Copying files..."

        foundDocs = False
        docEntries = []
        for f in fileList:
            bname = os.path.basename(f)
            try:
                basenamePrefix = str(basenameMap[bname]) + "_"
                basenameMap[bname] += 1
            except KeyError:
                basenamePrefix = ""
                basenameMap[bname] = 1
            docEntries.append(
                [f, bname, basenamePrefix,
                 self.reader.readFromSource(f)])

        # It used to be the case that we could just copy over documents which
        # were already in the right format. But now we have to find those segments
        # which are useable, and mark them.

        def usableSeg(seg):
            if (seg.get("annotator") in [None, "MACHINE"]) or \
               (self.partialTrainingOnGoldOnly and (seg.get("status") == "non-gold")):
                return False
            else:
                return True

        regionLists = MAT.Document.AnnotatedDoc.processableRegions(
            [d for [f, bname, basenamePrefix, d] in docEntries],
            task=self.task,
            segmentFilterFn=usableSeg)

        for regionList, [f, bname, basenamePrefix,
                         d] in zip(regionLists, docEntries):
            if regionList:
                # At the moment, Carafe ignores region markings if there are no regions at all. So
                # we can only use docs where segs were found.
                foundDocs = True
                t = d.findAnnotationType("CARAFE_INSTRUCTION")
                for [start, end, ignore] in regionList:
                    d.createAnnotation(start, end, t)
                _jsonIO.writeToTarget(
                    d, os.path.join(docTmpDir, basenamePrefix + bname))

        if not foundDocs:
            raise ModelBuilderError, "No appropriate segments found for model building"

    def _prepareJavaCmd(self, modelOutputFile, docTmpDir, tagSetFile):

        # regionStr = ":".join(regions)
        # Adding double-quotes for those situations where there are spaces in pathnames.
        cmdContainer = _jCarafeJarInvocation(
            heap_size=self.heapSize,
            stack_size=self.stackSize,
            cls="org.mitre.jcarafe.tagger.GenericTagger",
            task=self.task)
        cmdContainer.extend(
            [
                '--mode', 'json', '--train', '--input-dir', "%(input)s",
                '--model', "%(model)s", '--fspec', "%(fspec)s", "--region",
                "CARAFE_INSTRUCTION"
            ], {
                'input': docTmpDir,
                'model': modelOutputFile,
                'fspec': self.featureSpec
            })

        if not self.addTokensInternally:
            cmdContainer.extend(['--no-pre-proc'])

        # No need. We're standardizing on CARAFE_INSTRUCTION above.
        #zType, rAttr, regions = self.task.getTrueZoneInfo()
        #if rAttr:
        #    for region in regions:
        #        cmdContainer.extend(["--region", "%s:%s=%s" % (zType, rAttr, region)])
        #else:
        #    cmdContainer.extend(["--region", zType])

        # Either the tag set, or a set of tags.
        if self.tags is not None:
            for t in self.tags:
                cmdContainer.extend(['--tag', t])
        else:
            cmdContainer.extend(['--tagset', "%(tagset)s"],
                                {'tagset': tagSetFile})
        if self.gaussianPrior is not None:
            cmdContainer.extend(['--gaussian-prior', self.gaussianPrior])
        if self.lexiconDir is not None:
            cmdContainer.extend(['--lexicon-dir', "%(lexicon)s"],
                                {'lexicon': self.lexiconDir})
        if self.preModels is not None:
            # We need a separate binding for each model, because
            # the dictionary requires separate values.
            i = 0
            for m in self.preModels:
                cmdContainer.extend(
                    ['--pre-model', "%%(preModel%d)s" % i],
                    {"preModel" + str(i): m})
                i += 1
        if self.parallel:
            cmdContainer.extend(['--parallel'])
            if self.nthreads is not None:
                cmdContainer.extend(['--nthreads', str(self.nthreads)])
        if self.noBegin:
            cmdContainer.extend(['--no-begin'])
        if self.maxIterations is not None:
            cmdContainer.extend(['--max-iters', "%d" % self.maxIterations])
        if self.trainingMethod is not None:
            cmdContainer.extend(["--" + self.trainingMethod])
            if self.dol1Regularization:
                cmdContainer.extend(['--l1'])
                if self.l1Penalty is not None:
                    cmdContainer.extend(['--l1-C', str(self.l1Penalty)])
        for key, val in self.passThroughArgs.items():
            if val is not None:
                cmdContainer.extend([key, val])
        return cmdContainer

    # Corpus statistics. We want to include just those documents which have at least
    # one CARAFE_INSTRUCTION segment, and just those segments. This should ultimately
    # be promoted to ModelBuilder, but the world doesn't necessarily know about
    # this yet.

    def collectCorpusStatistics(self, fileList, docTmpDir):
        # We're relying here on the fact that all collectCorpusStatistics does
        # is loop through the file list and call collectFileStatistics. So I can
        # open the docs here. And, by the way, some of the files won't be there,
        # because no segments were found. So we don't have to test the segments
        # here, because we know that any document that's present will have
        # segments.
        trueFileList = []
        for trainingF in fileList:
            f = os.path.join(docTmpDir, os.path.basename(trainingF))
            if os.path.exists(f):
                doc = _jsonIO.readFromSource(f)
                trueFileList.append(doc)
        return MAT.ModelBuilder.ModelBuilder.collectCorpusStatistics(
            self, trueFileList, docTmpDir)

    def collectFileStatistics(self, doc, docTmpDir):
        # doc is an actual document object - see above.
        return self.collectDocumentStatistics(
            doc, [(zone.start, zone.end)
                  for zone in doc.orderAnnotations(["CARAFE_INSTRUCTION"])])
Example #5
class CarafeTagStep(CmdlineTagStep):

    argList = CmdlineTagStep.argList + [
        OpArgument(
            "prior_adjust",
            help=
            "Bias the Carafe tagger to favor precision (positive values) or recall (negative values). Default is -1.0 (slight recall bias). Practical range of values is usually +-6.0.",
            hasArg=True),
        OpArgument(
            "heap_size",
            hasArg=True,
            help="If present, specifies the -Xmx argument for the Java JVM"),
        OpArgument(
            "stack_size",
            hasArg=True,
            help="If present, specifies the -Xss argument for the Java JVM"),
        OpArgument(
            "tagging_pre_models",
            hasArg=True,
            help=
            "if present, a comma-separated list of glob-style patterns specifying the models to include as pre-taggers."
        ),
        OpArgument("parallel", help="Parallelizes the decoding"),
        OpArgument(
            "nthreads",
            hasArg=True,
            help=
            "If --parallel is used, controls the number of threads used for decoding."
        ),
        OpArgument(
            "add_tokens_internally",
            help=
            "If present, Carafe will use its internal tokenizer to tokenize the document before tagging. If your workflow doesn't tokenize the document, you must provide this flag, or Carafe will have no tokens to base its tagging on. We recommend strongly that you tokenize your documents separately; you should not use this flag."
        ),
        OpArgument(
            "capture_token_confidences",
            help=
            "If present, Carafe will capture token confidence metrics for later exploitation."
        ),
        OpArgument(
            "capture_sequence_confidences",
            help=
            "If present, Carafe will capture sequence confidence metrics for later exploitation."
        )
    ]

    #
    # Carafe support.
    #

    # This is all taken from the AMIA tagger. There ought to be generic
    # support for this; the AMIA tagger is kind of complex, and I've tried
    # to generalize it here.

    # I'm not ruling out the possibility that heap_size, etc., can be
    # provided as createSettings in step_implementation.

    def tagWithCarafe(self,
                      annotSets,
                      heap_size=None,
                      stack_size=None,
                      prior_adjust=None,
                      tagging_pre_models=None,
                      add_tokens_internally=False,
                      capture_token_confidences=False,
                      capture_sequence_confidences=False,
                      parallel=False,
                      nthreads=None,
                      **kw):
        if heap_size is None and self.initSettings.has_key("heap_size"):
            heap_size = self.initSettings["heap_size"]
        if stack_size is None and self.initSettings.has_key("stack_size"):
            stack_size = self.initSettings["stack_size"]
        if prior_adjust is None and self.initSettings.has_key("prior_adjust"):
            prior_adjust = self.initSettings["prior_adjust"]
        if tagging_pre_models is None and self.initSettings.has_key(
                "tagging_pre_models"):
            tagging_pre_models = self.initSettings["tagging_pre_models"]
        if nthreads is None and self.initSettings.has_key("nthreads"):
            nthreads = self.initSettings["nthreads"]
        if nthreads is not None:
            nthreads = int(nthreads)
        parallel = parallel or self.initSettings.get("parallel", False)
        add_tokens_internally = add_tokens_internally or self.initSettings.get(
            "add_tokens_internally", False)
        capture_token_confidences = capture_token_confidences or self.initSettings.get(
            "capture_token_confidences", False)
        capture_sequence_confidences = capture_sequence_confidences or self.initSettings.get(
            "capture_sequence_confidences", False)
        try:
            s = CarafeTagger(
                self.descriptor,
                self.stepName,
                heap_size=heap_size,
                stack_size=stack_size,
                prior_adjust=prior_adjust,
                tagging_pre_models=tagging_pre_models,
                add_tokens_internally=add_tokens_internally,
                capture_token_confidences=capture_token_confidences,
                capture_sequence_confidences=capture_sequence_confidences,
                parallel=parallel,
                nthreads=nthreads,
                **kw)
            s.Process(annotSets)
        except TaggerConfigurationError, e:
            raise PluginError, ("Carafe not configured properly for this "
                                "task and workflow: " + str(e))
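
An invocation sketch for the step above. step is assumed to be a CarafeTagStep instantiated by the MAT pipeline, and annotSets a list of annotated documents; any option left as None falls back to the step's initSettings, as the code shows.

step.tagWithCarafe(annotSets,
                   prior_adjust="-2.0",  # negative: recall bias
                   heap_size="2g",
                   parallel=True,
                   nthreads=4)
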
Example #6
class CarafeTokenizationStep(CmdlineTokenizationStep):

    argList = [
        OpArgument(
            "heap_size",
            hasArg=True,
            help="If present, specifies the -Xmx argument for the Java JVM"),
        OpArgument(
            "stack_size",
            hasArg=True,
            help="If present, specifies the -Xss argument for the Java JVM"),
        OpArgument(
            "handle_tags",
            help=
            "If present, treat the signal as XML and tokenize XML elements as single tokens"
        ),
        OpArgument("tokenizer_patterns",
                   hasArg=True,
                   help="See the jCarafe docs on --tokenizer-patterns.")
    ]

    def doBatch(self,
                iDataPairs,
                handle_tags=False,
                tokenizer_patterns=None,
                **kw):
        if iDataPairs == []:
            return iDataPairs
        # We're only going to deal with the true zone type.
        zType, rAttr, regions = self.descriptor.getTrueZoneInfo()
        # For almost all applications, this will be called from
        # the zoner. But it could be a standalone step, no problem.
        import MAT.Config, MAT.Document
        from MAT.Command import FileSystemCmdlineLocalProcess
        # regionStr = ":".join(regions)
        # Adding double-quotes for those situations where there are spaces in pathnames.
        cmdContainer = _jCarafeJarInvocation(
            cls="org.mitre.jcarafe.tokenizer.FastTokenizer",
            task=self.descriptor,
            **kw)
        cmdContainer.extend([
            '--json', '--input-dir', "%(tok_in)s", '--output-dir',
            "%(tok_out)s"
        ], {
            "tok_in": None,
            "tok_out": None
        })

        if rAttr:
            for region in regions:
                cmdContainer.extend(
                    ["--region",
                     "%s:%s=%s" % (zType, rAttr, region)])
        else:
            cmdContainer.extend(["--region", zType])

        if handle_tags:
            cmdContainer.extend(["--handle-tags"])

        if tokenizer_patterns:
            cmdContainer.extend(["--tokenizer-patterns", tokenizer_patterns])

        cmd = FileSystemCmdlineLocalProcess(cmdContainer,
                                            inVar="tok_in",
                                            outVar="tok_out",
                                            fileDumper=_jsonIO,
                                            fileLoader=_jsonIO,
                                            argsAreDirectories=True)
        # SAM 12/27/10: exploiting the filterer temporarily to free me from having to worry
        # about Ben updating Carafe. See _jsonIO above too.
        # If the document HAPPENS to be tokenized, bad things will
        # happen: if we use mergeOnOutput alone, the tokens will be copied
        # into the document sent to Carafe, and then the document
        # will come back with two sets of tokens: one from the original
        # tokenization, and one from the new one. Then BOTH will
        # be merged, which will give you three sets of tokens. So
        # we have to make sure we remove them in case they're already
        # there (this is a perverse case, in which something was
        # tokenized but there's nothing in the phases to indicate
        # it, and this bug will vanish when we start inferring the "done"
        # state properly, but right now, this is probably something
        # we should be doing, just to be safe).
        lexTags = self.descriptor.getAnnotationTypesByCategory("token")
        cmd.processAnnotSets([x[1] for x in iDataPairs],
                             removeOnInput=lexTags,
                             truncateAndMergeOnOutput=lexTags
                             # mergeOnOutput = lexTags
                             )
        return iDataPairs

    def do(self, annotSet, **kw):
        return self.doBatch([(None, annotSet)], **kw)[0][1]
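
A single-document sketch: do() wraps doBatch(), so a standalone tokenization call (step and annotSet again assumed to come from the pipeline; heap_size is forwarded to the jCarafe invocation via **kw) looks like this:

annotSet = step.do(annotSet, handle_tags=True, heap_size="1g")
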
Example #7
    def _input_file_type_enhancer(option, optstring, value, parser):
        inCls = None
        try:
            inCls = DocumentIO.getInputDocumentIOClass(value)
        except KeyError:
            if getattr(parser, "failOnFileTypes", False):
                raise ConfigurationError, ("input_file_type must be one of " + ", ".join(["'"+x+"'" for x in DocumentIO.allInputDocumentIO()]))
        if inCls is not None:
            inCls.addOptions(parser.aggregator, values = parser.values)

    def _output_file_type_enhancer(option, optstring, value, parser):
        outCls = None
        try:
            outCls = DocumentIO.getOutputDocumentIOClass(value)
        except KeyError:
            if getattr(parser, "failOnFileTypes", False):
                raise ConfigurationError, ("output_file_type must be one of " + ", ".join(["'"+x+"'" for x in DocumentIO.allOutputDocumentIO()]))
        if outCls is not None:
            outCls.addOptions(parser.aggregator, values = parser.values)

    INTERNAL_ARGS = [OpArgument("input_file", hasArg = True),
                     OpArgument("input_dir", hasArg = True),
                     OpArgument("input_file_re", hasArg = True),
                     OpArgument("input_encoding", hasArg = True),
                     OpArgument("input_file_type", hasArg = True,
                                side_effect_callback = _input_file_type_enhancer),
                     OpArgument("output_file", hasArg = True),
                     OpArgument("output_dir", hasArg = True),
                     OpArgument("output_fsuff", hasArg = True),
                     OpArgument("output_file_type", hasArg = True,
                                side_effect_callback = _output_file_type_enhancer),
                     OpArgument("output_encoding", hasArg = True),
                     OpArgument("workflow", hasArg = True),
                     OpArgument("steps", hasArg = True),
                     OpArgument("print_steps", hasArg = True),
                     OpArgument("undo_through", hasArg = True)]