Example #1
def convertCorpus(corpus, outDir=None, downloadDir=None, redownload=False, makeIntermediateFiles=True, evaluate=False, processEquiv=True, analysisMode="INSERT", debug=False, preprocessorSteps=None, preprocessorParameters=None, logPath=None):
    global bioNLP13AnalysesTempDir
    
    print >> sys.stderr, "==========", "Converting BioNLP Shared Task", corpus, "corpus", "=========="
    assert analysisMode in ("AUTO", "INSERT", "BUILD", "SKIP")
    if logPath == "AUTO":
        if outDir != None:
            logPath = outDir + "/conversion/" + corpus + "-conversion-log.txt"
        else:
            logPath = None
    if logPath:
        Stream.openLog(logPath)
    downloaded = downloadCorpus(corpus, outDir, downloadDir, redownload)
    packageSubPath = None
    if corpus == "BB13T2":
        packageSubPath = "task_2"
    elif corpus == "BB13T3":
        packageSubPath = "task_3"
    xml = convertDownloaded(outDir, corpus, downloaded, makeIntermediateFiles, evaluate, processEquiv=processEquiv, analysisMode=analysisMode, packageSubPath=packageSubPath, debug=debug, preprocessorSteps=preprocessorSteps, preprocessorParameters=preprocessorParameters)
    if logPath != None:
        Stream.closeLog(logPath)
    
    if bioNLP13AnalysesTempDir != None:
        shutil.rmtree(bioNLP13AnalysesTempDir)
        bioNLP13AnalysesTempDir = None
    
    return xml
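The call below is an illustrative sketch, not part of the example above: it assumes TEES is installed and that this convertCorpus is importable, and the corpus name and directories are placeholders. With logPath="AUTO" the log is written to outDir + "/conversion/" + corpus + "-conversion-log.txt", and analysisMode="INSERT" inserts pre-built analyses instead of re-parsing.

# Hypothetical usage; "GE11" and the paths are placeholders.
xml = convertCorpus("GE11",
                    outDir="/data/corpora",        # converted corpus and log go under this directory
                    downloadDir="/data/downloads", # source packages are cached here
                    analysisMode="INSERT",         # insert pre-built parses rather than running a parser
                    logPath="AUTO")                # resolves to /data/corpora/conversion/GE11-conversion-log.txt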
Example #2
File: classify.py Project: ninjin/TEES
def classify(input, model, output, workDir=None, step=None, omitSteps=None, 
             goldInput=None, detector=None, debug=False, clear=False, 
             preprocessorTag="-preprocessed.xml.gz", preprocessorParams=None, bioNLPSTParams=None):
    """
    Detect events or relations from text.
    
    @param input: The input file in either interaction XML or BioNLP ST format. Can also be a PMID or TEES default corpus name.
    @param model: A path to a model file or the name of a TEES default model.
    @param output: The output file stem. Output files will be of the form output-*
    @param workDir: If intermediate files need to be saved, they will go here.
    @param step: A step=substep pair, where the steps are PREPROCESS and CLASSIFY
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param goldInput: a version of the corpus file with gold annotation. Enables measuring of performance
    @param detector: a Detector object, or a string defining one to be imported. If None, will be read from model.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param clear: Remove existing workDir
    @param preprocessorTag: preprocessor output file will be output + preprocessorTag
    @param preprocessorParams: Optional parameters controlling preprocessing. If None, will be read from model.
    @param bioNLPSTParams: Optional parameters controlling BioNLP ST format output. If None, will be read from model.
    """
    input = os.path.abspath(input)
    if goldInput != None: goldInput = os.path.abspath(goldInput)
    if model != None: model = os.path.abspath(model)
    # Initialize working directory
    if workDir != None: # use a permanent work directory
        workdir(workDir, clear)
    Stream.openLog(output + "-log.txt") # log in the output directory
    # Get input files
    input, preprocess = getInput(input)
    model = getModel(model)
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, ["PREPROCESS", "CLASSIFY"])
    if not preprocess:
        selector.markOmitSteps("PREPROCESS")
    
    classifyInput = input
    if selector.check("PREPROCESS"):
        preprocessor = Preprocessor()
        preprocessorOutput = output + preprocessorTag
        #preprocessor.debug = debug
        #preprocessor.source = input # This has to be defined already here, needs to be fixed later
        #preprocessor.requireEntitiesForParsing = True # parse only sentences which contain named entities
        if os.path.exists(preprocessorOutput) and not clear: #os.path.exists(preprocessor.getOutputPath("FIND-HEADS")):
            #print >> sys.stderr, "Preprocessor output", preprocessor.getOutputPath("FIND-HEADS"), "exists, skipping preprocessing."
            print >> sys.stderr, "Preprocessor output", preprocessorOutput, "exists, skipping preprocessing."
            classifyInput = preprocessorOutput # preprocessor.getOutputPath("FIND-HEADS")
        else:
            #print >> sys.stderr, "Preprocessor output", preprocessor.getOutputPath("FIND-HEADS"), "does not exist"
            print >> sys.stderr, "Preprocessor output", preprocessorOutput, "does not exist"
            print >> sys.stderr, "------------ Preprocessing ------------"
            # Remove some of the unnecessary intermediate files
            #preprocessor.setIntermediateFiles({"Convert":None, "SPLIT-SENTENCES":None, "PARSE":None, "CONVERT-PARSE":None, "SPLIT-NAMES":None})
            # Process input into interaction XML
            classifyInput = preprocessor.process(input, preprocessorOutput, preprocessorParams, model, [], fromStep=detectorSteps["PREPROCESS"], toStep=None, omitSteps=omitDetectorSteps["PREPROCESS"])
    
    if selector.check("CLASSIFY"):
        detector = getDetector(detector, model)[0]() # initialize detector object
        detector.debug = debug
        detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams, model)
        detector.classify(classifyInput, model, output, goldData=goldInput, fromStep=detectorSteps["CLASSIFY"], omitSteps=omitDetectorSteps["CLASSIFY"], workDir=workDir)
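A minimal usage sketch for classify() as documented above, assuming the TEES package is importable; the model name and the input and output paths are placeholders, not values taken from the example.

# Hypothetical call: input can be an interaction XML file, a BioNLP ST file, a PMID or a default corpus name.
classify("input.xml",
         "GE11-devel",          # path to a model file or a TEES default model name
         "output/run1",         # output stem, files are written as output/run1-*
         workDir="work",        # intermediate files are kept here
         goldInput=None,        # pass a gold-annotated corpus to get performance figures
         debug=False)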
Example #3
File: train.py Project: ninjin/TEES
def workdir(path, deleteIfExists=True, copyFrom=None, log="log.txt"):
    # When using a template, always remove existing work directory
    if copyFrom != None:
        deleteIfExists = True
    # Remove existing work directory, if requested to do so
    if os.path.exists(path) and deleteIfExists:
        print >> sys.stderr, "Output directory exists, removing", path
        shutil.rmtree(path)
    # Create work directory if needed
    if not os.path.exists(path):
        if copyFrom == None:
            print >> sys.stderr, "Making output directory", path
            os.makedirs(path)
        else:
            print >> sys.stderr, "Copying template from", options.copyFrom, "to", path
            shutil.copytree(options.copyFrom, path)
    else:
        print >> sys.stderr, "Using existing output directory", path
    # Remember current directory and switch to workdir
    atexit.register(os.chdir, os.getcwd())
    os.chdir(path)
    # Open log (if a relative path, it goes under workdir)
    if log != None:
        Stream.openLog(log)
    else:
        print >> sys.stderr, "No logging"
    return path
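A sketch of how the helper above is typically used (the path is illustrative): it creates or reuses the directory, switches into it, opens log.txt there, and registers an atexit hook that restores the previous working directory when the interpreter exits.

workdir("/tmp/tees-run", deleteIfExists=False)  # reuse the directory if it already exists
# ... run pipeline steps; relative paths now resolve inside /tmp/tees-run ...
# on exit, the atexit hook calls os.chdir back to the original directory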
Example #4
def convert(corpora, outDir=None, downloadDir=None, redownload=False, makeIntermediateFiles=True, evaluate=False, processEquiv=True, addAnalyses=True):
    global bioNLP13AnalysesTempDir
    
    if outDir == None:
        outDir = os.path.normpath(Settings.DATAPATH + "/corpora")
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    else:
        assert os.path.isdir(outDir)
    count = 1
    for corpus in corpora:
        print >> sys.stderr, "=======================", "Converting BioNLP Shared Task", corpus, "corpus ("+str(count)+"/"+str(len(corpora))+")", "======================="
        logFileName = outDir + "/conversion/" + corpus + "-conversion-log.txt"
        Stream.openLog(logFileName)
        downloaded = downloadCorpus(corpus, outDir, downloadDir, redownload)
        packageSubPath = None
        if corpus == "BB13T2":
            packageSubPath = "task_2"
        elif corpus == "BB13T3":
            packageSubPath = "task_3"
        convertDownloaded(outDir, corpus, downloaded, makeIntermediateFiles, evaluate, processEquiv=processEquiv, addAnalyses=addAnalyses, packageSubPath=packageSubPath)
        Stream.closeLog(logFileName)
        count += 1
    
    if bioNLP13AnalysesTempDir != None:
        shutil.rmtree(bioNLP13AnalysesTempDir)
        bioNLP13AnalysesTempDir = None
Example #5
def convertCorpus(corpus, outDir=None, downloadDir=None, redownload=False, removeAnalyses=True, develFraction=0.3, logPath=None):
    assert corpus in PPI_CORPORA
    if logPath == "AUTO":
        logPath = outDir + "/conversion/" + corpus + "-conversion-log.txt" if outDir != None else None
    if logPath:
        Stream.openLog(logPath)
    print >> sys.stderr, "==========", "Converting PPI corpus", corpus, "=========="
    downloaded = downloadCorpus(corpus, outDir, downloadDir, redownload)
    print >> sys.stderr, "---------------", "Updating Interaction XML format", "---------------"
    print >> sys.stderr, "Loading", downloaded[corpus + "_LEARNING_FORMAT"]
    xml = ETUtils.ETFromObj(downloaded[corpus + "_LEARNING_FORMAT"])
    root = xml.getroot()
    updateXML(root, removeAnalyses)
    print >> sys.stderr, "---------------", "Adding sets from the PPI evaluation standard", "---------------"
    addSets(corpus, root, downloaded["PPI_EVALUATION_STANDARD"])
    if develFraction > 0.0:
        print >> sys.stderr, "---------------", "Generating devel set", "---------------"
        MakeSets.processCorpus(xml, None, "train", [("devel", develFraction), ("train", 1.0)], 1)
    if outDir != None:
        print >> sys.stderr, "---------------", "Writing corpus", "---------------"
        #if intermediateFiles:
        #print >> sys.stderr, "Writing combined corpus"
        #ETUtils.write(xml, os.path.join(outDir, corpus + ".xml"))
        print >> sys.stderr, "Dividing into sets"
        Utils.InteractionXML.DivideSets.processCorpus(xml, outDir, corpus, ".xml")
    
    if logPath != None:
        Stream.closeLog(logPath)
    return xml  
Example #6
def convertDDI13(outDir, downloadDir=None, datasets=["DDI13_TRAIN", "DDI13_TEST_TASK_9.1", "DDI13_TEST_TASK_9.2"], redownload=False, insertParses=True, parse=False, makeIntermediateFiles=True, debug=False):
    cwd = os.getcwd()
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI13-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'13 corpus", "======================="
    
    tempdir = tempfile.mkdtemp()
    downloaded = downloadFiles(downloadDir, tempdir, redownload)
    
    for dataset in datasets:       
        corpusTree = getCorpusXML()
        xml = corpusTree.getroot()
        print >> sys.stderr, "Merging input XMLs"
        assert downloaded[dataset] != None
        combineXML(xml, "train", downloaded[dataset], subDirs=["DrugBank", "MedLine", "NER"])
        print >> sys.stderr, "Processing elements"
        processElements(xml)
        
        if dataset == "DDI13_TRAIN":
            print >> sys.stderr, "Dividing training set into folds"
            divideSets(xml, "train", 10)
        else:
            for doc in xml.getiterator("document"):
                doc.set("set", "test")

        if parse:
            print >> sys.stderr, "Parsing"
            parseXML(corpusTree, os.path.join(tempdir, "parsing"), debug)
        elif insertParses:
            assert parse == False
            print >> sys.stderr, "Inserting McCC parses"
            Tools.BLLIPParser.insertParses(corpusTree, downloaded[dataset + "_TEES_PARSES"], None, extraAttributes={"source":"TEES"})
            print >> sys.stderr, "Inserting Stanford conversions"
            Tools.StanfordParser.insertParses(corpusTree, downloaded[dataset + "_TEES_PARSES"], None, extraAttributes={"stanfordSource":"TEES"})
        # Check what was produced by the conversion
        print >> sys.stderr, "---------------", "Corpus Structure Analysis", "---------------"
        analyzer = StructureAnalyzer()
        analyzer.analyze([xml])
        print >> sys.stderr, analyzer.toString()
        if "9.1" in dataset:
            outFileName = os.path.join(outDir, "DDI13-test-task9.1.xml")
        elif "9.2" in dataset:
            outFileName = os.path.join(outDir, "DDI13-test-task9.2.xml")
        else:
            outFileName = os.path.join(outDir, "DDI13-train.xml")
        print >> sys.stderr, "Writing output to", outFileName
        ETUtils.write(xml, outFileName)
    
    Stream.closeLog(logFileName)
    if not debug and tempdir != None:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
Example #7
File: EvaluateEPE.py Project: jbjorne/TEES
def beginLog(outDir, logPath="AUTO"):
    if logPath == "AUTO":
        logPath = os.path.join(outDir, "log.txt")
    elif logPath == "None":
        logPath = None
    if logPath != None:
        if not os.path.exists(os.path.dirname(logPath)):
            os.makedirs(os.path.dirname(logPath))
        Stream.openLog(logPath)
    return logPath
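A sketch of the "AUTO" behaviour (outDir is a placeholder): the log path defaults to log.txt inside outDir, the directory is created if missing, and the returned path can later be passed to Stream.closeLog.

logPath = beginLog("/data/eval")   # "AUTO" resolves to /data/eval/log.txt
# ... run the evaluation ...
if logPath != None:
    Stream.closeLog(logPath)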
Example #8
 def process(self,
             source,
             output=None,
             model=None,
             fromStep=None,
             toStep=None,
             omitSteps=None,
             logPath=None):
     if logPath == "AUTO":
         if output != None:
             logPath = output
             if "*" in logPath:
                 logPath = logPath.split("*")[0].rstrip("-")
             logPath = os.path.join(
                 logPath.rstrip("/").rstrip("\\") + "-log.txt")
         else:
             logPath = None
     elif logPath == "None":
         logPath = None
     if logPath != None:
         if not os.path.exists(os.path.dirname(logPath)):
             os.makedirs(os.path.dirname(logPath))
         Stream.openLog(logPath)
     print >> sys.stderr, "Preprocessor steps:", [
         x.name for x in self.steps
     ]
     if len(self.steps) == 0:
         raise Exception("No preprocessing steps defined")
     #if omitSteps != None and((type(omitSteps) in types.StringTypes and omitSteps == "CONVERT") or "CONVERT" in omitSteps):
     #    raise Exception("Preprocessor step 'CONVERT' may not be omitted")
     #if isinstance(source, basestring) and os.path.basename(source).isdigit(): # PMID
     #    print >> sys.stderr, "Preprocessing PubMed abstract", os.path.basename(source)
     #    source = Utils.Download.getPubMed(int(source))
     # Initialize variables and save existing default values
     #self.intermediateFileTag = corpusName
     #parameters = self.getParameters(parameters, model)
     #parameters["CONVERT.dataSetNames"] = sourceDataSetNames
     #parameters["CONVERT.corpusName"] = corpusName
     #convertSetNames = self.stepArgs("CONVERT")["dataSetNames"]
     #convertCorpusName = self.stepArgs("CONVERT")["corpusName"]
     #self.stepArgs("CONVERT")["dataSetNames"] = sourceDataSetNames
     #self.stepArgs("CONVERT")["corpusName"] = corpusName
     # Run the tool chain
     xml = ToolChain.process(self, source, output, model, fromStep, toStep,
                             omitSteps)
     # Reset variables to saved default values
     #self.stepArgs("CONVERT")["dataSetNames"] = convertSetNames
     #self.stepArgs("CONVERT")["corpusName"] = convertCorpusName
     if logPath != None:
         Stream.closeLog(logPath)
     return xml
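When logPath is "AUTO", the log file name is derived from the output argument: a wildcard pattern is cut at the first "*", trailing dashes and path separators are stripped, and "-log.txt" is appended. A small sketch of that derivation with an illustrative output value:

output = "out/GE11-*.xml"                                # illustrative wildcard output pattern
logPath = output.split("*")[0].rstrip("-")               # -> "out/GE11"
logPath = logPath.rstrip("/").rstrip("\\") + "-log.txt"  # -> "out/GE11-log.txt"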
Example #9
def convert(corpora, outDir, downloadDir=None, redownload=False, makeIntermediateFiles=True, evaluate=False):
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    else:
        assert os.path.isdir(outDir)
    count = 1
    for corpus in corpora:
        print >> sys.stderr, "=======================", "Converting BioNLP'11", corpus, "corpus ("+str(count)+"/"+str(len(corpora))+")", "======================="
        logFileName = outDir + "/conversion/" + corpus + "-conversion-log.txt"
        Stream.openLog(logFileName)
        downloaded = downloadCorpus(corpus, downloadDir, None, redownload)
        convertDownloaded(outDir, corpus, downloaded, makeIntermediateFiles, evaluate)
        Stream.closeLog(logFileName)
        count += 1
Example #10
def convert(inPath, outDir, corpusId, directed, negatives, preprocess, preprocessorParameters=None, debug=False, clear=False, constParser="BLLIP-BIO", depParser="STANFORD-CONVERT", logging=True):
    assert negatives in ("INCLUDE", "SKIP", "REVERSE_POS")
    # Download the corpus if needed
    if inPath == None:
        if not hasattr(Settings, "SE10T8_CORPUS"):
            SemEval2010Task8Tools.install()
        inPath = Settings.SE10T8_CORPUS
    assert os.path.exists(inPath)
    # Prepare the output directory
    if not os.path.exists(outDir):
        print "Making output directory", outDir
        os.makedirs(outDir)
    elif clear:
        print "Removing output directory", outDir
        shutil.rmtree(outDir)
    # Start logging
    if logging:
        Stream.openLog(os.path.join(outDir, "log.txt"), clear=clear)
    # Read and process the corpus files
    archive = zipfile.ZipFile(inPath, 'r')
    usedIds = set()
    tree = None
    for fileName, setName in [("SemEval2010_task8_all_data/SemEval2010_task8_training/TRAIN_FILE.TXT", "train"),\
                              ("SemEval2010_task8_all_data/SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT", "test")]:
        print "Processing file", fileName, "as set", setName
        f = archive.open(fileName)
        tree = processLines(f.readlines(), setName, directed=directed, negatives=negatives, usedIds=usedIds, tree=tree, corpusId=corpusId)
        f.close()
    # Divide the training set into training and development sets
    MakeSets.processCorpus(tree, None, "train", [("train", 0.7), ("devel", 1.0)], 1)
    # Write out the converted corpus
    convertedPath = os.path.join(outDir, corpusId + "-converted.xml")
    ETUtils.write(tree.getroot(), convertedPath)
    # Preprocess the converted corpus
    if preprocess:
        outPath = os.path.join(outDir, corpusId + ".xml")
        preprocessor = Preprocessor(constParser, depParser)
        preprocessor.setArgForAllSteps("debug", debug)
        preprocessor.stepArgs("CONVERT")["corpusName"] = corpusId
        preprocessor.process(convertedPath, outPath, preprocessorParameters, omitSteps=["SPLIT-SENTENCES", "NER", "SPLIT-NAMES"])
    # Stop logging
    if logging:
        Stream.closeLog(os.path.join(outDir, "log.txt"))
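An illustrative call to the converter above (paths and corpus id are placeholders); with inPath=None the SemEval 2010 Task 8 corpus is installed via SemEval2010Task8Tools if it is not already configured, and negatives must be "INCLUDE", "SKIP" or "REVERSE_POS".

# Hypothetical usage
convert(None,                  # download/install the corpus if Settings.SE10T8_CORPUS is not set
        "/data/SE10T8",        # output directory, also receives log.txt
        "SE10T8",              # corpus id used in the output file names
        directed=True,
        negatives="INCLUDE",
        preprocess=True)       # parse the converted corpus with BLLIP-BIO + Stanford conversion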
Example #11
def convert(corpora,
            outDir,
            downloadDir=None,
            redownload=False,
            makeIntermediateFiles=True,
            evaluate=False):
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    else:
        assert os.path.isdir(outDir)
    count = 1
    for corpus in corpora:
        print >> sys.stderr, "=======================", "Converting BioNLP'11", corpus, "corpus (" + str(
            count) + "/" + str(len(corpora)) + ")", "======================="
        logFileName = outDir + "/conversion/" + corpus + "-conversion-log.txt"
        Stream.openLog(logFileName)
        downloaded = downloadCorpus(corpus, downloadDir, None, redownload)
        convertDownloaded(outDir, corpus, downloaded, makeIntermediateFiles,
                          evaluate)
        Stream.closeLog(logFileName)
        count += 1
Example #12
 def process(self, source, output=None, model=None, fromStep=None, toStep=None, omitSteps=None, logPath=None):
     if logPath == "AUTO":
         if output != None:
             logPath = output
             if "*" in logPath:
                 logPath = logPath.split("*")[0].rstrip("-")
             logPath = os.path.join(logPath.rstrip("/").rstrip("\\") + "-log.txt")
         else:
             logPath = None
     elif logPath == "None":
         logPath = None
     if logPath != None:
         if not os.path.exists(os.path.dirname(logPath)):
             os.makedirs(os.path.dirname(logPath))
         Stream.openLog(logPath)
     print >> sys.stderr, "Preprocessor steps:", [x.name for x in self.steps]
     if len(self.steps) == 0:
         raise Exception("No preprocessing steps defined")
     #if omitSteps != None and((type(omitSteps) in types.StringTypes and omitSteps == "CONVERT") or "CONVERT" in omitSteps):
     #    raise Exception("Preprocessor step 'CONVERT' may not be omitted")
     #if isinstance(source, basestring) and os.path.basename(source).isdigit(): # PMID
     #    print >> sys.stderr, "Preprocessing PubMed abstract", os.path.basename(source)
     #    source = Utils.Download.getPubMed(int(source))   
     # Initialize variables and save existing default values
     #self.intermediateFileTag = corpusName
     #parameters = self.getParameters(parameters, model)
     #parameters["CONVERT.dataSetNames"] = sourceDataSetNames
     #parameters["CONVERT.corpusName"] = corpusName
     #convertSetNames = self.stepArgs("CONVERT")["dataSetNames"]
     #convertCorpusName = self.stepArgs("CONVERT")["corpusName"]
     #self.stepArgs("CONVERT")["dataSetNames"] = sourceDataSetNames
     #self.stepArgs("CONVERT")["corpusName"] = corpusName
     # Run the tool chain
     xml = ToolChain.process(self, source, output, model, fromStep, toStep, omitSteps)
     # Reset variables to saved default values
     #self.stepArgs("CONVERT")["dataSetNames"] = convertSetNames
     #self.stepArgs("CONVERT")["corpusName"] = convertCorpusName
     if logPath != None:
         Stream.closeLog(logPath)
     return xml
Example #13
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"
    from optparse import OptionParser
    optparser = OptionParser(description="A tool chain for making interaction XML, sentence splitting, NER and parsing")
    optparser.add_option("-i", "--input", default=None, dest="input", help="")
    optparser.add_option("-n", "--inputNames", default=None, dest="inputNames", help="")
    optparser.add_option("-c", "--corpus", default=None, dest="corpus", help="corpus name")
    optparser.add_option("-o", "--output", default=None, dest="output", help="output directory")
    optparser.add_option("-p", "--parameters", default=None, dest="parameters", help="preprocessing parameters")
    optparser.add_option("-s", "--step", default=None, dest="step", help="")
    optparser.add_option("-t", "--toStep", default=None, dest="toStep", help="")
    optparser.add_option("--omitSteps", default=None, dest="omitSteps", help="")
    optparser.add_option("--noLog", default=False, action="store_true", dest="noLog", help="")
    optparser.add_option("--debug", default=False, action="store_true", dest="debug", help="")
    optparser.add_option("--requireEntities", default=False, action="store_true", dest="requireEntities", help="")
    (options, args) = optparser.parse_args()
    if options.omitSteps != None:
        options.omitSteps = options.omitSteps.split(",")
    
    if not options.noLog:
        Stream.openLog(os.path.join(options.output + "-log.txt"))
        #log(False, True, os.path.join(options.output, options.corpus + "-log.txt"))
    preprocessor = Preprocessor()
    preprocessor.setArgForAllSteps("debug", options.debug)
    preprocessor.stepArgs("CONVERT")["corpusName"] = options.corpus
    preprocessor.stepArgs("PARSE")["requireEntities"] = options.requireEntities
    preprocessor.process(options.input, options.output, options.parameters, None, options.inputNames, fromStep=options.step, toStep=options.toStep, omitSteps=options.omitSteps)
Example #14
def combine(inputA,
            inputB,
            inputGold,
            outPath=None,
            mode="OR",
            skip=None,
            logPath="AUTO"):
    assert mode in ("AND", "OR")
    if skip != None and isinstance(skip, basestring):
        skip = set(skip.split(","))
    if skip != None:
        print "Skipping interaction types:", skip
    if logPath == "AUTO":
        if outPath != None:
            logPath = os.path.join(
                outPath.rstrip("/").rstrip("\\") + "-log.txt")
        else:
            logPath = None
    if logPath != None:
        if not os.path.exists(os.path.dirname(logPath)):
            os.makedirs(os.path.dirname(logPath))
        Stream.openLog(logPath)
    print "Loading the Interaction XML files"
    print "Loading A from", inputA
    a = ETUtils.ETFromObj(inputA)
    print "Loading B from", inputB
    b = ETUtils.ETFromObj(inputB)
    gold = None
    if inputGold:
        print "Loading gold from", inputGold
        gold = ETUtils.ETFromObj(inputGold) if inputGold else None
    print "Copying a as template"
    template = copy.deepcopy(a)
    print "Calculating confidence score ranges"
    scoreRanges = {}
    scoreRanges["a"] = getScoreRange(a, skip)
    scoreRanges["b"] = getScoreRange(b, skip)
    print scoreRanges
    print "Combining"
    counts = defaultdict(int)
    counts["skipped"] = defaultdict(int)
    counter = ProgressCounter(len([x for x in a.findall("document")]),
                              "Combine")
    for docA, docB, docGold, docTemplate in itertools.izip_longest(
            *[x.findall("document") for x in (a, b, gold, template)]):
        counter.update()
        assert len(
            set([x.get("id")
                 for x in (docA, docB, docGold, docTemplate)])) == 1
        for sentA, sentB, sentGold, sentTemplate in itertools.izip_longest(*[
                x.findall("sentence")
                for x in (docA, docB, docGold, docTemplate)
        ]):
            assert len(
                set([
                    x.get("id") for x in (sentA, sentB, sentGold, sentTemplate)
                ])) == 1
            interactions = getInteractions(sentA, sentB, sentGold, skip,
                                           counts["skipped"])
            for interaction in sentTemplate.findall("interaction"):
                sentTemplate.remove(interaction)
            analyses = sentTemplate.find("analyses")
            if analyses:
                sentTemplate.remove(analyses)
            for key in interactions:
                interaction = getCombinedInteraction(interactions[key], mode,
                                                     counts, scoreRanges)
                if interaction != None:
                    sentTemplate.append(copy.deepcopy(interaction))
            if analyses:
                sentTemplate.append(analyses)
    counts["skipped"] = dict(counts["skipped"])
    print "Counts:", dict(counts)
    if gold != None:
        print "****** Evaluating A ******"
        evaluateChemProt(
            a, gold
        )  #EvaluateIXML.run(AveragingMultiClassEvaluator, a, gold, "McCC")
        print "****** Evaluating B ******"
        evaluateChemProt(
            b, gold
        )  #EvaluateIXML.run(AveragingMultiClassEvaluator, b, gold, "McCC")
        print "****** Evaluating Combined ******"
        evaluateChemProt(
            template, gold
        )  #EvaluateIXML.run(AveragingMultiClassEvaluator, template, gold, "McCC")
    if outPath != None:
        print "Writing output to", outPath
        if outPath.endswith(".tsv"):
            Preprocessor(steps=["EXPORT_CHEMPROT"]).process(template, outPath)
        else:
            ETUtils.write(template, outPath)
    if logPath != None:
        Stream.closeLog(logPath)
Example #15
def convertDDI(outDir,
               downloadDir=None,
               redownload=False,
               makeIntermediateFiles=True,
               debug=False):
    cwd = os.getcwd()
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI11-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'11 corpus", "======================="
    corpusDir = outDir + "/DDI11-original"
    Utils.Download.downloadAndExtract(Settings.URL["DDI11_CORPUS"], corpusDir,
                                      downloadDir)

    bigfileName = os.path.join(outDir, "DDI11")
    #oldXML = ETUtils.ETFromObj(bigfileName+".xml")
    trainUnified = corpusDir + "/train"
    trainMTMX = corpusDir + "/train_MTMX"
    testUnified = corpusDir + "/test"
    testMTMX = corpusDir + "/test_MTMX"

    # Load main documents
    tempdir = tempfile.mkdtemp()
    print >> sys.stderr, "Temporary files directory at", tempdir
    documents, docById, docCounts = loadDocs(trainUnified)
    # Divide training data into a train and devel set
    sortedDocCounts = sorted(docCounts.iteritems(),
                             key=lambda (k, v): (v, k),
                             reverse=True)
    datasetCounts = {"train": [0, 0], "devel": [0, 0], "test": [0, 0]}
    for i in range(0, len(sortedDocCounts) - 3, 4):
        for j in [0, 1]:
            docById[sortedDocCounts[i + j][0]].set("set", "train")
            datasetCounts["train"][0] += sortedDocCounts[i + j][1][0]
            datasetCounts["train"][1] += sortedDocCounts[i + j][1][1]
        docById[sortedDocCounts[i + 2][0]].set(
            "set",
            "train")  #docById[sortedDocCounts[i+2][0]].set("set", "devel")
        docById[sortedDocCounts[i + 3][0]].set(
            "set",
            "devel")  #docById[sortedDocCounts[i+3][0]].set("set", "test")
        datasetCounts["train"][0] += sortedDocCounts[i + 2][1][
            0]  #datasetCounts["devel"][0] += sortedDocCounts[i+2][1][0]
        datasetCounts["train"][1] += sortedDocCounts[i + 2][1][
            1]  #datasetCounts["devel"][1] += sortedDocCounts[i+2][1][1]
        datasetCounts["devel"][0] += sortedDocCounts[i + 3][1][
            0]  #datasetCounts["test"][0] += sortedDocCounts[i+3][1][0]
        datasetCounts["devel"][1] += sortedDocCounts[i + 3][1][
            1]  #datasetCounts["test"][1] += sortedDocCounts[i+3][1][1]
    for document in documents:  # documents left over from the even division above
        if document.get("set") == None:
            document.set("set", "train")
    # Print division results
    print >> sys.stderr, datasetCounts
    for key in datasetCounts.keys():
        if datasetCounts[key][1] != 0:
            print key, datasetCounts[key][0] / float(datasetCounts[key][1])
        else:
            print key, datasetCounts[key][0], "/", float(datasetCounts[key][1])
    # Some of the train and test ids overlap. Let's change the train set ids, because test set ones are needed
    # for the final evaluation.
    changeIdCount = 1000
    for trainId in [
            'DrugDDI.d312', 'DrugDDI.d316', 'DrugDDI.d332', 'DrugDDI.d334',
            'DrugDDI.d337', 'DrugDDI.d342', 'DrugDDI.d349', 'DrugDDI.d354',
            'DrugDDI.d373', 'DrugDDI.d379', 'DrugDDI.d383', 'DrugDDI.d388',
            'DrugDDI.d392', 'DrugDDI.d396', 'DrugDDI.d398', 'DrugDDI.d409',
            'DrugDDI.d411', 'DrugDDI.d415', 'DrugDDI.d425', 'DrugDDI.d430',
            'DrugDDI.d433', 'DrugDDI.d448', 'DrugDDI.d450', 'DrugDDI.d452',
            'DrugDDI.d462', 'DrugDDI.d467', 'DrugDDI.d470', 'DrugDDI.d474',
            'DrugDDI.d480', 'DrugDDI.d482', 'DrugDDI.d485', 'DrugDDI.d492',
            'DrugDDI.d494', 'DrugDDI.d496', 'DrugDDI.d498', 'DrugDDI.d500',
            'DrugDDI.d503', 'DrugDDI.d506', 'DrugDDI.d518', 'DrugDDI.d523',
            'DrugDDI.d528', 'DrugDDI.d535', 'DrugDDI.d539', 'DrugDDI.d552',
            'DrugDDI.d554', 'DrugDDI.d558', 'DrugDDI.d561', 'DrugDDI.d570',
            'DrugDDI.d578'
    ]:
        newId = "DrugDDI.d" + str(changeIdCount)
        print >> sys.stderr, "Changing train/devel id", trainId, "to", newId
        for element in docById[trainId].getiterator():
            for attrName, attrValue in element.attrib.iteritems():
                if trainId in attrValue:
                    element.set(attrName, attrValue.replace(trainId, newId))
        docById[newId] = docById[trainId]
        del docById[trainId]
        changeIdCount += 1
    # If test set exists, load it, too
    if testUnified != None:
        testDocuments, testDocById, testDocCounts = loadDocs(testUnified)
        for document in testDocuments:
            document.set("set", "test")
        documents = documents + testDocuments
        overlappingIds = []
        for key in docById:
            if key in testDocById:
                overlappingIds.append(key)
        for key in docById:
            assert key not in testDocById, (key, docById[key].get("origId"),
                                            testDocById[key].get("origId"),
                                            sorted(docById.keys()),
                                            sorted(testDocById.keys()),
                                            sorted(overlappingIds))
        docById.update(testDocById)

    # Add all documents into one XML
    xmlTree = ET.ElementTree(ET.Element("corpus"))
    root = xmlTree.getroot()
    root.set("source", "DDI11")
    for document in documents:
        root.append(document)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents-notfixed.xml")
    xml = xmlTree
    print >> sys.stderr, "Fixing DDI XML"
    fixEntities(xml)
    convertToInteractions(xml)
    # Add MTMX
    if trainMTMX != None:
        inDir = Utils.Download.getTopDir(
            tempdir,
            Utils.Download.downloadAndExtract(trainMTMX, tempdir,
                                              outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if testMTMX != None:
        inDir = Utils.Download.getTopDir(
            tempdir,
            Utils.Download.downloadAndExtract(testMTMX, tempdir,
                                              outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents.xml")

    print >> sys.stderr, "---------------", "Inserting TEES-generated analyses", "---------------"
    Utils.Download.downloadAndExtract(Settings.URL["TEES_PARSES"],
                                      os.path.join(Settings.DATAPATH,
                                                   "TEES-parses"),
                                      downloadDir,
                                      redownload=redownload)
    extractedFilename = os.path.join(Settings.DATAPATH,
                                     "TEES-parses") + "/DDI11"
    print >> sys.stderr, "Making sentences"
    Tools.SentenceSplitter.makeSentences(xml, extractedFilename, None)
    print >> sys.stderr, "Inserting McCC parses"
    Tools.BLLIPParser.insertParses(
        xml,
        extractedFilename,
        None,
        extraAttributes={"source": "TEES-preparsed"})
    print >> sys.stderr, "Inserting Stanford conversions"
    Tools.StanfordParser.insertParses(
        xml,
        extractedFilename,
        None,
        extraAttributes={"stanfordSource": "TEES-preparsed"})
    print >> sys.stderr, "Protein Name Splitting"
    splitTarget = "McCC"
    #ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget)
    ProteinNameSplitter.mainFunc(xml, None, splitTarget, removeOld=True)
    print >> sys.stderr, "Head Detection"
    #xml = FindHeads.findHeads(xml, "split-"+splitTarget, tokenization=None, output=None, removeExisting=True)
    xml = FindHeads.findHeads(xml,
                              splitTarget,
                              tokenization=None,
                              output=None,
                              removeExisting=True)

    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outDir, "DDI11", ".xml")

    Stream.closeLog(logFileName)
    if not debug:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
Example #16
        options.triggerExampleBuilder = "PhraseTriggerExampleBuilder"
        options.edgeParams = "10,100,1000,5000,7500,10000,20000,25000,28000,50000,60000,65000,100000,500000,1000000"
        options.recallAdjustParams = "0.8,0.9,0.95,1.0"

# These commands will be in the beginning of most pipelines
WORKDIR=options.output
if options.copyFrom != None:
    if os.path.exists(WORKDIR):
        shutil.rmtree(WORKDIR)
    print >> sys.stderr, "Copying template from", options.copyFrom
    shutil.copytree(options.copyFrom, WORKDIR)
    workdir(WORKDIR, False)
else:
    workdir(WORKDIR, options.clearAll) # Select a working directory, optionally remove existing files
if not options.noLog:
    Stream.openLog("log.txt")
    #log() # Start logging into a file in working directory

## Make downsampling for learning curve
#downSampleTag = "-r" + str(options.downSampleTrain) + "_s" + str(options.downSampleSeed)
#newTrainFile = makeSubset(TRAIN_FILE, options.task + "-train-nodup" + options.extraTag + downSampleTag + ".xml", options.downSampleTrain, options.downSampleSeed)
#makeSubset(TRAIN_FILE.replace("-nodup", ""), options.task + "-train" + options.extraTag + downSampleTag + ".xml", options.downSampleTrain, options.downSampleSeed)
#TRAIN_FILE = newTrainFile

if subTask != None:
    print >> sys.stderr, "Task:", options.task + "." + str(subTask)
else:
    print >> sys.stderr, "Task:", options.task

eventDetector = EventDetector()
eventDetector.debug = options.debug
Example #17
def classify(input,
             model,
             output,
             workDir=None,
             step=None,
             omitSteps=None,
             goldInput=None,
             detector=None,
             debug=False,
             clear=False,
             preprocessorTag="-preprocessed.xml.gz",
             preprocessorParams=None,
             bioNLPSTParams=None):
    """
    Detect events or relations from text.
    
    @param input: The input file in either interaction XML or BioNLP ST format. Can also be a PMID or TEES default corpus name.
    @param model: A path to a model file or the name of a TEES default model.
    @param output: The output file stem. Output files will be of the form output-*
    @param workDir: If intermediate files need to be saved, they will go here.
    @param step: A step=substep pair, where the steps are PREPROCESS and CLASSIFY
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param goldInput: a version of the corpus file with gold annotation. Enables measuring of performance
    @param detector: a Detector object, or a string defining one to be imported. If None, will be read from model.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param clear: Remove existing workDir
    @param preprocessorTag: preprocessor output file will be output + preprocessorTag
    @param preprocessorParams: Optional parameters controlling preprocessing. If None, will be read from model.
    @param bioNLPSTParams: Optional parameters controlling BioNLP ST format output. If None, will be read from model.
    """
    input = os.path.abspath(input)
    if goldInput != None: goldInput = os.path.abspath(goldInput)
    if model != None: model = os.path.abspath(model)
    # Initialize working directory
    if workDir != None:  # use a permanent work directory
        workdir(workDir, clear)
    Stream.openLog(output + "-log.txt")  # log in the output directory
    # Get input files
    input, preprocess = getInput(input)
    model = getModel(model)
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(
        step, omitSteps, ["PREPROCESS", "CLASSIFY"])
    if not preprocess:
        selector.markOmitSteps("PREPROCESS")

    classifyInput = input
    if selector.check("PREPROCESS"):
        if preprocessorParams == None:
            preprocessorParams = [
                "LOAD", "GENIA_SPLITTER", "BANNER", "BLLIP_BIO",
                "STANFORD_CONVERT", "SPLIT_NAMES", "FIND_HEADS", "SAVE"
            ]
        preprocessor = Preprocessor(preprocessorParams)
        if debug:
            preprocessor.setArgForAllSteps("debug", True)
        preprocessorOutput = output + preprocessorTag
        #preprocessor.debug = debug
        #preprocessor.source = input # This has to be defined already here, needs to be fixed later
        #preprocessor.requireEntitiesForParsing = True # parse only sentences which contain named entities
        if os.path.exists(
                preprocessorOutput
        ) and not clear:  #os.path.exists(preprocessor.getOutputPath("FIND-HEADS")):
            #print >> sys.stderr, "Preprocessor output", preprocessor.getOutputPath("FIND-HEADS"), "exists, skipping preprocessing."
            print >> sys.stderr, "Preprocessor output", preprocessorOutput, "exists, skipping preprocessing."
            classifyInput = preprocessorOutput  # preprocessor.getOutputPath("FIND-HEADS")
        else:
            #print >> sys.stderr, "Preprocessor output", preprocessor.getOutputPath("FIND-HEADS"), "does not exist"
            print >> sys.stderr, "Preprocessor output", preprocessorOutput, "does not exist"
            print >> sys.stderr, "------------ Preprocessing ------------"
            # Remove some of the unnecessary intermediate files
            #preprocessor.setIntermediateFiles({"Convert":None, "SPLIT-SENTENCES":None, "PARSE":None, "CONVERT-PARSE":None, "SPLIT-NAMES":None})
            # Process input into interaction XML
            classifyInput = preprocessor.process(input, preprocessorOutput,
                                                 model)

    if selector.check("CLASSIFY"):
        detector = getDetector(detector,
                               model)[0]()  # initialize detector object
        detector.debug = debug
        detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(
            bioNLPSTParams, model)
        detector.classify(classifyInput,
                          model,
                          output,
                          goldData=goldInput,
                          fromStep=detectorSteps["CLASSIFY"],
                          omitSteps=omitDetectorSteps["CLASSIFY"],
                          workDir=workDir)
Example #18
        "-c",
        "--corpora",
        default="GE",
        dest="corpora",
        help="corpus names in a comma-separated list, e.g. \"GE,EPI,ID\"")
    optparser.add_option("-o",
                         "--outdir",
                         default=os.path.normpath(Settings.DATAPATH +
                                                  "/corpora"),
                         dest="outdir",
                         help="directory for output files")
    optparser.add_option("-d",
                         "--downloaddir",
                         default=None,
                         dest="downloaddir",
                         help="directory to download corpus files to")
    optparser.add_option("--intermediateFiles",
                         default=False,
                         action="store_true",
                         dest="intermediateFiles",
                         help="save intermediate corpus files")
    optparser.add_option("--forceDownload",
                         default=False,
                         action="store_true",
                         dest="forceDownload",
                         help="re-download all source files")
    (options, args) = optparser.parse_args()

    Stream.openLog(os.path.join(options.outdir, "conversion-log.txt"))
    convert(options.corpora.split(","), options.outdir, options.downloaddir,
            options.forceDownload, options.intermediateFiles)
Example #19
File: Combine.py Project: jbjorne/TEES
def combine(inputA, inputB, inputGold, outPath=None, mode="OR", skip=None, logPath="AUTO"):
    assert mode in ("AND", "OR")
    if skip != None and isinstance(skip, basestring):
        skip = set(skip.split(","))
    if skip != None:
        print "Skipping interaction types:", skip
    if logPath == "AUTO":
        if outPath != None:
            logPath = os.path.join(outPath.rstrip("/").rstrip("\\") + "-log.txt")
        else:
            logPath = None
    if logPath != None:
        if not os.path.exists(os.path.dirname(logPath)):
            os.makedirs(os.path.dirname(logPath))
        Stream.openLog(logPath)
    print "Loading the Interaction XML files"
    print "Loading A from", inputA
    a = ETUtils.ETFromObj(inputA)
    print "Loading B from", inputB
    b = ETUtils.ETFromObj(inputB)
    gold = None
    if inputGold:
        print "Loading gold from", inputGold
        gold = ETUtils.ETFromObj(inputGold) if inputGold else None
    print "Copying a as template"
    template = copy.deepcopy(a)
    print "Calculating confidence score ranges"
    scoreRanges = {}
    scoreRanges["a"] = getScoreRange(a, skip)
    scoreRanges["b"] = getScoreRange(b, skip)
    print scoreRanges
    print "Combining"
    counts = defaultdict(int)
    counts["skipped"] = defaultdict(int)
    counter = ProgressCounter(len([x for x in a.findall("document")]), "Combine")
    for docA, docB, docGold, docTemplate in itertools.izip_longest(*[x.findall("document") for x in (a, b, gold, template)]):
        counter.update()
        assert len(set([x.get("id") for x in (docA, docB, docGold, docTemplate)])) == 1
        for sentA, sentB, sentGold, sentTemplate in itertools.izip_longest(*[x.findall("sentence") for x in (docA, docB, docGold, docTemplate)]):
            assert len(set([x.get("id") for x in (sentA, sentB, sentGold, sentTemplate)])) == 1
            interactions = getInteractions(sentA, sentB, sentGold, skip, counts["skipped"])
            for interaction in sentTemplate.findall("interaction"):
                sentTemplate.remove(interaction)
            analyses = sentTemplate.find("analyses") 
            if analyses:
                sentTemplate.remove(analyses)
            for key in interactions:
                interaction = getCombinedInteraction(interactions[key], mode, counts, scoreRanges)
                if interaction != None:
                    sentTemplate.append(copy.deepcopy(interaction))
            if analyses:
                sentTemplate.append(analyses)
    counts["skipped"] = dict(counts["skipped"])
    print "Counts:", dict(counts)
    if gold != None:
        print "****** Evaluating A ******"
        evaluateChemProt(a, gold) #EvaluateIXML.run(AveragingMultiClassEvaluator, a, gold, "McCC")
        print "****** Evaluating B ******"
        evaluateChemProt(b, gold) #EvaluateIXML.run(AveragingMultiClassEvaluator, b, gold, "McCC")
        print "****** Evaluating Combined ******"
        evaluateChemProt(template, gold) #EvaluateIXML.run(AveragingMultiClassEvaluator, template, gold, "McCC")
    if outPath != None:
        print "Writing output to", outPath
        if outPath.endswith(".tsv"):
            Preprocessor(steps=["EXPORT_CHEMPROT"]).process(template, outPath)
        else:
            ETUtils.write(template, outPath)
    if logPath != None:
        Stream.closeLog(logPath)
Example #20
def convertDDI13(
        outDir,
        downloadDir=None,
        datasets=["DDI13_TRAIN", "DDI13_TEST_TASK_9.1", "DDI13_TEST_TASK_9.2"],
        redownload=False,
        insertParses=True,
        parse=False,
        makeIntermediateFiles=True,
        debug=False):
    cwd = os.getcwd()
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI13-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'13 corpus", "======================="

    tempdir = tempfile.mkdtemp()
    downloaded = downloadFiles(downloadDir, tempdir, redownload)

    for dataset in datasets:
        corpusTree = getCorpusXML()
        xml = corpusTree.getroot()
        print >> sys.stderr, "Merging input XMLs"
        assert downloaded[dataset] != None
        combineXML(xml,
                   "train",
                   downloaded[dataset],
                   subDirs=["DrugBank", "MedLine", "NER"])
        print >> sys.stderr, "Processing elements"
        processElements(xml)

        if dataset == "DDI13_TRAIN":
            print >> sys.stderr, "Dividing training set into folds"
            divideSets(xml, "train", 10)
        else:
            for doc in xml.getiterator("document"):
                doc.set("set", "test")

        if parse:
            print >> sys.stderr, "Parsing"
            parseXML(corpusTree, os.path.join(tempdir, "parsing"), debug)
        elif insertParses:
            assert parse == False
            print >> sys.stderr, "Inserting McCC parses"
            Tools.BLLIPParser.insertParses(corpusTree,
                                           downloaded[dataset +
                                                      "_TEES_PARSES"],
                                           None,
                                           extraAttributes={"source": "TEES"})
            print >> sys.stderr, "Inserting Stanford conversions"
            Tools.StanfordParser.insertParses(
                corpusTree,
                downloaded[dataset + "_TEES_PARSES"],
                None,
                extraAttributes={"stanfordSource": "TEES"})
        # Check what was produced by the conversion
        print >> sys.stderr, "---------------", "Corpus Structure Analysis", "---------------"
        analyzer = StructureAnalyzer()
        analyzer.analyze([xml])
        print >> sys.stderr, analyzer.toString()
        if "9.1" in dataset:
            outFileName = os.path.join(outDir, "DDI13-test-task9.1.xml")
        elif "9.2" in dataset:
            outFileName = os.path.join(outDir, "DDI13-test-task9.2.xml")
        else:
            outFileName = os.path.join(outDir, "DDI13-train.xml")
        print >> sys.stderr, "Writing output to", outFileName
        ETUtils.write(xml, outFileName)

    Stream.closeLog(logFileName)
    if not debug and tempdir != None:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
Example #21
         options.triggerExampleBuilder = "PhraseTriggerExampleBuilder"
         options.edgeParams = "10,100,1000,5000,7500,10000,20000,25000,28000,50000,60000,65000,100000,500000,1000000"
         options.recallAdjustParams = "0.8,0.9,0.95,1.0"
 
 # These commands will be in the beginning of most pipelines
 WORKDIR=options.output
 if options.copyFrom != None:
     if os.path.exists(WORKDIR):
         shutil.rmtree(WORKDIR)
     print >> sys.stderr, "Copying template from", options.copyFrom
     shutil.copytree(options.copyFrom, WORKDIR)
     workdir(WORKDIR, False)
 else:
     workdir(WORKDIR, options.clearAll) # Select a working directory, optionally remove existing files
 if not options.noLog:
     Stream.openLog("log.txt")
     #log() # Start logging into a file in working directory
 
 print >> sys.stderr, "Importing detector", options.detector
 Detector = eval("from " + options.detector + " import " + options.detector.split(".")[-1])
 detector = Detector()
 detector.debug = options.debug
 detector.stWriteScores = True # write confidence scores into additional st-format files
 detector.setConnection(getConnection(options.connection)).debug = options.debug
 # Pre-calculate all the required SVM models
 if selector.check("TRAIN"):
     print >> sys.stderr, "----------------------------------------------------"
     print >> sys.stderr, "------------------ Train Detector ------------------"
     print >> sys.stderr, "----------------------------------------------------"
     if options.singleStage:
         detector.train(trainFile, develFile, options.develModel, options.testModel,
Example #22
def convertDDI(outDir, downloadDir=None, redownload=False, makeIntermediateFiles=True, debug=False):
    cwd = os.getcwd()
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI11-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'11 corpus", "======================="
    corpusDir = outDir + "/DDI11-original"
    Utils.Download.downloadAndExtract(Settings.URL["DDI11_CORPUS"], corpusDir, downloadDir)
    
    bigfileName = os.path.join(outDir, "DDI11")
    #oldXML = ETUtils.ETFromObj(bigfileName+".xml")
    trainUnified = corpusDir + "/train"
    trainMTMX = corpusDir + "/train_MTMX"
    testUnified = corpusDir + "/test"
    testMTMX = corpusDir + "/test_MTMX"
    
    # Load main documents
    tempdir = tempfile.mkdtemp()
    print >> sys.stderr, "Temporary files directory at", tempdir
    documents, docById, docCounts = loadDocs(trainUnified)
    # Divide training data into a train and devel set
    sortedDocCounts = sorted(docCounts.iteritems(), key=lambda (k,v): (v,k), reverse=True)
    datasetCounts = {"train":[0,0], "devel":[0,0], "test":[0,0]}
    for i in range(0, len(sortedDocCounts)-3, 4):
        for j in [0,1]:
            docById[sortedDocCounts[i+j][0]].set("set", "train")
            datasetCounts["train"][0] += sortedDocCounts[i+j][1][0]
            datasetCounts["train"][1] += sortedDocCounts[i+j][1][1]
        docById[sortedDocCounts[i+2][0]].set("set", "train") #docById[sortedDocCounts[i+2][0]].set("set", "devel")
        docById[sortedDocCounts[i+3][0]].set("set", "devel") #docById[sortedDocCounts[i+3][0]].set("set", "test")
        datasetCounts["train"][0] += sortedDocCounts[i+2][1][0] #datasetCounts["devel"][0] += sortedDocCounts[i+2][1][0]
        datasetCounts["train"][1] += sortedDocCounts[i+2][1][1] #datasetCounts["devel"][1] += sortedDocCounts[i+2][1][1]
        datasetCounts["devel"][0] += sortedDocCounts[i+3][1][0] #datasetCounts["test"][0] += sortedDocCounts[i+3][1][0]
        datasetCounts["devel"][1] += sortedDocCounts[i+3][1][1] #datasetCounts["test"][1] += sortedDocCounts[i+3][1][1]
    for document in documents: # documents left over from the even division above
        if document.get("set") == None:
            document.set("set", "train")
    # Print division results
    print >> sys.stderr, datasetCounts
    for key in datasetCounts.keys():
        if datasetCounts[key][1] != 0:
            print key, datasetCounts[key][0] / float(datasetCounts[key][1])
        else:
            print key, datasetCounts[key][0], "/", float(datasetCounts[key][1])
    # Some of the train and test ids overlap. Let's change the train set ids, because test set ones are needed
    # for the final evaluation.
    changeIdCount = 1000
    for trainId in ['DrugDDI.d312', 'DrugDDI.d316', 'DrugDDI.d332', 'DrugDDI.d334', 'DrugDDI.d337', 
                    'DrugDDI.d342', 'DrugDDI.d349', 'DrugDDI.d354', 'DrugDDI.d373', 'DrugDDI.d379', 
                    'DrugDDI.d383', 'DrugDDI.d388', 'DrugDDI.d392', 'DrugDDI.d396', 'DrugDDI.d398', 
                    'DrugDDI.d409', 'DrugDDI.d411', 'DrugDDI.d415', 'DrugDDI.d425', 'DrugDDI.d430', 
                    'DrugDDI.d433', 'DrugDDI.d448', 'DrugDDI.d450', 'DrugDDI.d452', 'DrugDDI.d462', 
                    'DrugDDI.d467', 'DrugDDI.d470', 'DrugDDI.d474', 'DrugDDI.d480', 'DrugDDI.d482', 
                    'DrugDDI.d485', 'DrugDDI.d492', 'DrugDDI.d494', 'DrugDDI.d496', 'DrugDDI.d498', 
                    'DrugDDI.d500', 'DrugDDI.d503', 'DrugDDI.d506', 'DrugDDI.d518', 'DrugDDI.d523', 
                    'DrugDDI.d528', 'DrugDDI.d535', 'DrugDDI.d539', 'DrugDDI.d552', 'DrugDDI.d554', 
                    'DrugDDI.d558', 'DrugDDI.d561', 'DrugDDI.d570', 'DrugDDI.d578']:
        newId = "DrugDDI.d" + str(changeIdCount)
        print >> sys.stderr, "Changing train/devel id", trainId, "to", newId
        for element in docById[trainId].getiterator():
            for attrName, attrValue in element.attrib.iteritems():
                if trainId in attrValue:
                    element.set(attrName, attrValue.replace(trainId, newId))
        docById[newId] = docById[trainId]
        del docById[trainId]
        changeIdCount += 1
    # If test set exists, load it, too
    if testUnified != None:
        testDocuments, testDocById, testDocCounts = loadDocs(testUnified)
        for document in testDocuments:
            document.set("set", "test")
        documents = documents + testDocuments
        overlappingIds = []
        for key in docById:
            if key in testDocById:
                overlappingIds.append(key)
        for key in docById:
            assert key not in testDocById, (key, docById[key].get("origId"), testDocById[key].get("origId"), sorted(docById.keys()), sorted(testDocById.keys()), sorted(overlappingIds))
        docById.update(testDocById)
    
    # Add all documents into one XML
    xmlTree = ET.ElementTree(ET.Element("corpus"))
    root = xmlTree.getroot()
    root.set("source", "DDI11")
    for document in documents:
        root.append(document)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents-notfixed.xml")
    xml = xmlTree
    print >> sys.stderr, "Fixing DDI XML"
    fixEntities(xml)
    convertToInteractions(xml)
    # Add MTMX
    if trainMTMX != None:
        inDir = Utils.Download.getTopDir(tempdir, Utils.Download.downloadAndExtract(trainMTMX, tempdir, outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if testMTMX != None:
        inDir = Utils.Download.getTopDir(tempdir, Utils.Download.downloadAndExtract(testMTMX, tempdir, outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents.xml")

    print >> sys.stderr, "---------------", "Inserting TEES-generated analyses", "---------------"
    Utils.Download.downloadAndExtract(Settings.URL["TEES_PARSES"], os.path.join(Settings.DATAPATH, "TEES-parses"), downloadDir, redownload=redownload)
    extractedFilename = os.path.join(Settings.DATAPATH, "TEES-parses") + "/DDI11"
    print >> sys.stderr, "Making sentences"
    Tools.SentenceSplitter.makeSentences(xml, extractedFilename, None)
    print >> sys.stderr, "Inserting McCC parses"
    Tools.BLLIPParser.insertParses(xml, extractedFilename, None, extraAttributes={"source":"TEES-preparsed"})
    print >> sys.stderr, "Inserting Stanford conversions"
    Tools.StanfordParser.insertParses(xml, extractedFilename, None, extraAttributes={"stanfordSource":"TEES-preparsed"})
    print >> sys.stderr, "Protein Name Splitting"
    splitTarget = "McCC"
    #ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget)
    ProteinNameSplitter.mainFunc(xml, None, splitTarget, removeOld=True)
    print >> sys.stderr, "Head Detection"
    #xml = FindHeads.findHeads(xml, "split-"+splitTarget, tokenization=None, output=None, removeExisting=True)
    xml = FindHeads.findHeads(xml, splitTarget, tokenization=None, output=None, removeExisting=True)    
    
    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outDir, "DDI11", ".xml")
    
    Stream.closeLog(logFileName)
    if not debug:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
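The loop near the top of this example walks the documents in sorted order four at a time, sending three of each group to the train set and the fourth to the devel set, with any leftovers defaulting to train. Below is a minimal, self-contained sketch of that division logic; the toy counts stand in for whatever loadDocs actually returns, and their exact meaning is assumed here:

# Toy illustration of the 3:1 train/devel division used above; the counts are made up.
docCounts = {
    "DrugDDI.d001": (12, 40),
    "DrugDDI.d002": (9, 35),
    "DrugDDI.d003": (7, 30),
    "DrugDDI.d004": (5, 28),
    "DrugDDI.d005": (2, 10),
}
sortedDocCounts = sorted(docCounts.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
assignment = {}
for i in range(0, len(sortedDocCounts) - 3, 4):
    group = [docId for docId, counts in sortedDocCounts[i:i + 4]]
    for docId in group[:3]:          # three documents per group go to train
        assignment[docId] = "train"
    assignment[group[3]] = "devel"   # the fourth goes to devel
for docId, counts in sortedDocCounts:  # leftovers default to train
    assignment.setdefault(docId, "train")
print(assignment)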
Example #25
0
                         help="")
    optparser.add_option("--debug",
                         default=False,
                         action="store_true",
                         dest="debug",
                         help="")
    optparser.add_option("--requireEntities",
                         default=False,
                         action="store_true",
                         dest="requireEntities",
                         help="")
    (options, args) = optparser.parse_args()
    if options.omitSteps != None:
        options.omitSteps = options.omitSteps.split(",")

    if not options.noLog:
        Stream.openLog(os.path.join(options.output + "-log.txt"))
        #log(False, True, os.path.join(options.output, options.corpus + "-log.txt"))
    preprocessor = Preprocessor()
    preprocessor.setArgForAllSteps("debug", options.debug)
    preprocessor.stepArgs("CONVERT")["corpusName"] = options.corpus
    preprocessor.stepArgs("PARSE")["requireEntities"] = options.requireEntities
    preprocessor.process(options.input,
                         options.output,
                         options.parameters,
                         None,
                         options.inputNames,
                         fromStep=options.step,
                         toStep=options.toStep,
                         omitSteps=options.omitSteps)
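The option handling above turns the comma-separated --omitSteps value into a list before the preprocessor runs. A minimal, self-contained sketch of that pattern; the option name matches the snippet, and the step names are borrowed from the other examples:

from optparse import OptionParser

optparser = OptionParser()
optparser.add_option("--omitSteps", default=None, dest="omitSteps",
                     help="comma-separated list of steps to skip")
(options, args) = optparser.parse_args(["--omitSteps", "NER,SPLIT-NAMES"])
if options.omitSteps != None:
    options.omitSteps = options.omitSteps.split(",")
print(options.omitSteps)  # ['NER', 'SPLIT-NAMES']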
Example #26
0
def convert(inPath,
            outDir,
            corpusId,
            directed,
            negatives,
            preprocess,
            preprocessorParameters=None,
            debug=False,
            clear=False,
            constParser="BLLIP-BIO",
            depParser="STANFORD-CONVERT",
            logging=True):
    assert negatives in ("INCLUDE", "SKIP", "REVERSE_POS")
    # Download the corpus if needed
    if inPath == None:
        if not hasattr(Settings, "SE10T8_CORPUS"):
            SemEval2010Task8Tools.install()
        inPath = Settings.SE10T8_CORPUS
    assert os.path.exists(inPath)
    # Prepare the output directory
    if not os.path.exists(outDir):
        print "Making output directory", outDir
        os.makedirs(outDir)
    elif clear:
        print "Removing output directory", outDir
        shutil.rmtree(outDir)
    # Start logging
    if logging:
        Stream.openLog(os.path.join(outDir, "log.txt"), clear=clear)
    # Read and process the corpus files
    archive = zipfile.ZipFile(inPath, 'r')
    usedIds = set()
    tree = None
    for fileName, setName in [("SemEval2010_task8_all_data/SemEval2010_task8_training/TRAIN_FILE.TXT", "train"),\
                              ("SemEval2010_task8_all_data/SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT", "test")]:
        print "Processing file", fileName, "as set", setName
        f = archive.open(fileName)
        tree = processLines(f.readlines(),
                            setName,
                            directed=directed,
                            negatives=negatives,
                            usedIds=usedIds,
                            tree=tree,
                            corpusId=corpusId)
        f.close()
    # Divide the training set into training and development sets
    MakeSets.processCorpus(tree, None, "train", [("train", 0.7),
                                                 ("devel", 1.0)], 1)
    # Write out the converted corpus
    convertedPath = os.path.join(outDir, corpusId + "-converted.xml")
    ETUtils.write(tree.getroot(), convertedPath)
    # Preprocess the converted corpus
    if preprocess:
        outPath = os.path.join(outDir, corpusId + ".xml")
        preprocessor = Preprocessor(constParser, depParser)
        preprocessor.setArgForAllSteps("debug", debug)
        preprocessor.stepArgs("CONVERT")["corpusName"] = corpusId
        preprocessor.process(
            convertedPath,
            outPath,
            preprocessorParameters,
            omitSteps=["SPLIT-SENTENCES", "NER", "SPLIT-NAMES"])
    # Stop logging
    if logging:
        Stream.closeLog(os.path.join(outDir, "log.txt"))
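The MakeSets.processCorpus call above redistributes the documents of the original training set so that roughly 70% stay in train and the rest become devel (the second number in each pair reads as a cumulative cutoff). The following is a self-contained sketch of that kind of fraction-based division; it illustrates the idea only, and the real MakeSets implementation and its shuffling details may differ:

import random

def divideByFractions(docIds, fractions, seed=1):
    # fractions are cumulative cutoffs, e.g. [("train", 0.7), ("devel", 1.0)]
    rng = random.Random(seed)
    shuffled = list(docIds)
    rng.shuffle(shuffled)
    assignment = {}
    for index, docId in enumerate(shuffled):
        position = (index + 1) / float(len(shuffled))
        for setName, cutoff in fractions:
            if position <= cutoff:
                assignment[docId] = setName
                break
    return assignment

print(divideByFractions(["doc%03d" % i for i in range(10)],
                        [("train", 0.7), ("devel", 1.0)]))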
Example #27
0
def convertDDI(outDir, trainUnified=None, trainMTMX=None, testUnified=None, testMTMX=None, downloadDir=None, redownload=False, makeIntermediateFiles=True, debug=False):
    cwd = os.getcwd()
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'11 corpus", "======================="
    
    bigfileName = os.path.join(outDir, "DDI")
    #oldXML = ETUtils.ETFromObj(bigfileName+".xml")
    if trainUnified == None:
        trainUnified = Settings.URL["DDI_TRAIN_UNIFIED"]
    if trainMTMX == None:
        trainMTMX = Settings.URL["DDI_TRAIN_MTMX"]
    if testUnified == None:
        testUnified = Settings.URL["DDI_TEST_UNIFIED"]
    if testMTMX == None:
        testMTMX = Settings.URL["DDI_TEST_MTMX"]
    
    tempdir = tempfile.mkdtemp()
    print >> sys.stderr, "Temporary files directory at", tempdir
    if True:
        documents, docById, docCounts = loadDocs(trainUnified, outDir, tempdir)
        
        sortedDocCounts = sorted(docCounts.iteritems(), key=lambda (k,v): (v,k), reverse=True)
        datasetCounts = {"train":[0,0], "devel":[0,0], "test":[0,0]}
        for i in range(0, len(sortedDocCounts)-3, 4):
            for j in [0,1]:
                docById[sortedDocCounts[i+j][0]].set("set", "train")
                datasetCounts["train"][0] += sortedDocCounts[i+j][1][0]
                datasetCounts["train"][1] += sortedDocCounts[i+j][1][1]
            docById[sortedDocCounts[i+2][0]].set("set", "train") #docById[sortedDocCounts[i+2][0]].set("set", "devel")
            docById[sortedDocCounts[i+3][0]].set("set", "devel") #docById[sortedDocCounts[i+3][0]].set("set", "test")
            datasetCounts["train"][0] += sortedDocCounts[i+2][1][0] #datasetCounts["devel"][0] += sortedDocCounts[i+2][1][0]
            datasetCounts["train"][1] += sortedDocCounts[i+2][1][1] #datasetCounts["devel"][1] += sortedDocCounts[i+2][1][1]
            datasetCounts["devel"][0] += sortedDocCounts[i+3][1][0] #datasetCounts["test"][0] += sortedDocCounts[i+3][1][0]
            datasetCounts["devel"][1] += sortedDocCounts[i+3][1][1] #datasetCounts["test"][1] += sortedDocCounts[i+3][1][1]
        for document in documents: # documents left over from the groups of four default to the train set
            if document.get("set") == None:
                document.set("set", "train")
        
        print datasetCounts
        for key in datasetCounts.keys():
            if datasetCounts[key][1] != 0:
                print key, datasetCounts[key][0] / float(datasetCounts[key][1])
            else:
                print key, datasetCounts[key][0], "/", float(datasetCounts[key][1])
        
        if testUnified != None:
            testDocuments, testDocById, testDocCounts = loadDocs(testUnified, tempdir)
            for document in testDocuments:
                document.set("set", "test")
            documents = documents + testDocuments
        
    xmlTree = ET.ElementTree(ET.Element("corpus"))
    root = xmlTree.getroot()
    root.set("source", "DrugDDI")
    for document in documents:
        root.append(document)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents-notfixed.xml")
    xml = xmlTree
    print >> sys.stderr, "Fixing DDI XML"
    fixEntities(xml)
    convertToInteractions(xml)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents.xml")
    #sys.exit()
        
    if False:
        print >> sys.stderr, "Parsing"
        Tools.CharniakJohnsonParser.parse(xml, bigfileName+"-parsed.xml", tokenizationName=None, parseName="McClosky", requireEntities=True, timeout=10)
        print >> sys.stderr, "Stanford Conversion"
        Tools.StanfordParser.convertXML("McClosky", xml, bigfileName+"-stanford.xml")
    
        #if True:
        #xml = bigfileName + "-stanford.xml"        
        print >> sys.stderr, "Protein Name Splitting"
        splitTarget = "McClosky"
        xml = ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget)
        print >> sys.stderr, "Head Detection"
        xml = FindHeads.findHeads(xml, "split-McClosky", tokenization=None, output=bigfileName+".xml", removeExisting=True)
        print >> sys.stderr, "Dividing into sets"
        InteractionXML.DivideSets.processCorpus(xml, outDir, "DrugDDI-", ".xml", [("devel", "train", "test"), ("devel", "train")])
        #InteractionXML.DivideSets.processCorpus(oldXML, outDir, "DrugDDI-", ".xml", [("devel", "train", "test"), ("devel", "train")])
    #InteractionXML.DivideSets.processCorpus(bigfileName+".xml", outDir, "DrugDDI-", ".xml", [("devel", "train", "test"), ("devel", "train")])
    #if "devel" in [x[0] for x in datasets]:
    #    print >> sys.stderr, "Creating empty devel set"
    #    deletionRules = {"interaction":{},"entity":{"isName":"False"}}
    #    InteractionXML.DeleteElements.processCorpus(corpusName + "-devel.xml", corpusName + "-devel-empty.xml", deletionRules)
    #return xml
    Stream.closeLog(logFileName)
    if not debug:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
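Both DDI conversion functions assemble the corpus the same way: create a <corpus> root element, append one element per document, and write the tree to disk. A self-contained sketch of that assembly pattern using xml.etree.ElementTree directly instead of the project's ETUtils helper; the document and sentence attributes shown are illustrative, and the real interaction XML carries more:

import xml.etree.ElementTree as ET

root = ET.Element("corpus", {"source": "DrugDDI"})
for docId, setName in [("DrugDDI.d001", "train"), ("DrugDDI.d002", "devel")]:
    document = ET.SubElement(root, "document", {"id": docId, "set": setName})
    ET.SubElement(document, "sentence", {"id": docId + ".s0", "text": "Example sentence."})
ET.ElementTree(root).write("DrugDDI-documents.xml")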
Example #28
0
            Tools.StanfordParser.insertParses(xml, tempdir + "/" + os.path.basename(files[corpus + "_" + setName.upper() + "_McCC"])[:-len(".tar.gz")].split("-", 2)[-1] + "/mccc/sd_ccproc", None, extraAttributes={"stanfordSource":"BioNLP'11"})
            print >> sys.stderr, "Removing temporary directory", tempdir
            shutil.rmtree(tempdir)

def processParses(xml, splitTarget="McCC"):
    print >> sys.stderr, "Protein Name Splitting"
    ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget)
    print >> sys.stderr, "Head Detection"
    xml = FindHeads.findHeads(xml, "split-"+splitTarget, tokenization=None, output=None, removeExisting=True)

if __name__=="__main__":
    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    from optparse import OptionParser
    from Utils.Parameters import *
    optparser = OptionParser(usage="%prog [options]\nBioNLP'11 Shared Task corpus conversion")
    optparser.add_option("-c", "--corpora", default="GE", dest="corpora", help="corpus names in a comma-separated list, e.g. \"GE,EPI,ID\"")
    optparser.add_option("-o", "--outdir", default=os.path.normpath(Settings.DATAPATH + "/corpora"), dest="outdir", help="directory for output files")
    optparser.add_option("-d", "--downloaddir", default=None, dest="downloaddir", help="directory to download corpus files to")
    optparser.add_option("--intermediateFiles", default=False, action="store_true", dest="intermediateFiles", help="save intermediate corpus files")
    optparser.add_option("--forceDownload", default=False, action="store_true", dest="forceDownload", help="re-download all source files")
    (options, args) = optparser.parse_args()
    
    Stream.openLog(os.path.join(options.outdir, "conversion-log.txt"))
    convert(options.corpora.split(","), options.outdir, options.downloaddir, options.forceDownload, options.intermediateFiles)