Ejemplo n.º 1
0
def getFolds(path, folds, seed=0):
    """Assign every numbered document found in a directory to a fold.

    Each entry name in ``path`` is split at its first "."; when the leading
    part consists only of digits it is taken as a document number. Entries
    whose leading part is not numeric are ignored, and several files that
    share a number (e.g. "12.txt" and "12.a1") count as one document.

    Args:
        path: Directory to scan for document files.
        folds: Forwarded unchanged to ``Split.getFolds`` (presumably the
            number of folds -- confirm against that helper).
        seed: Forwarded unchanged to ``Split.getFolds``.

    Returns:
        A dict mapping each document number (int) to the element found at
        the same index in the sequence returned by ``Split.getFolds``.
    """
    docNumbers = set()
    for fileName in os.listdir(path):
        numPart = fileName.split(".", 1)[0]
        if numPart.isdigit():
            docNumbers.add(int(numPart))
    # os.listdir() returns entries in arbitrary order and set iteration order
    # is an implementation detail. Sorting pins the pairing below, so the same
    # directory contents and seed always give the same document -> fold map.
    docNumbers = sorted(docNumbers)
    foldIds = Split.getFolds(len(docNumbers), folds, seed)
    foldByDocNumber = {}
    for index, docNumber in enumerate(docNumbers):
        foldByDocNumber[docNumber] = foldIds[index]
    return foldByDocNumber
Ejemplo n.º 2
0
    # Build the root element of the output corpus; selected documents are
    # appended to it further down and the tree is then written out.
    # NOTE(review): ET is presumably an ElementTree implementation -- confirm
    # against the file's imports.
    corpusElement = ET.Element("corpus")
    corpusElement.attrib["source"] = "GENIA"
    totalSentences = 0
    documentsWithSentences = []
    for documentElement in documentElements:
        # The parse graphs travel in the element's attrib dict under the
        # "parseGraphs" key. Take them out and drop the entry before the
        # element is written (presumably because they are not plain string
        # attributes -- TODO confirm).
        parseGraphs = documentElement.attrib["parseGraphs"]
        del documentElement.attrib["parseGraphs"]
        sentenceCount = 0
        for parseGraph in parseGraphs:
            # The second argument is the running index of this graph within
            # the current document.
            parseGraph.writeToInteractionXML(documentElement, sentenceCount)
            sentenceCount += 1
        # Documents without any parse graph are counted in the totals but are
        # never candidates for the output corpus.
        if sentenceCount > 0:
            documentsWithSentences.append(documentElement)
        totalSentences += sentenceCount
    
    # One entry per document with sentences; only documents whose entry equals
    # 0 are kept below.
    # NOTE(review): the meaning of options.visibleSet and of the constant 0
    # (presumably a sample fraction and a random seed) is defined by
    # Split.getSample -- confirm there.
    visibleSet = Split.getSample(len(documentsWithSentences), options.visibleSet, 0)
    visibleSetDocuments = 0
    visibleSetSentences = 0
    for i in range(len(documentsWithSentences)):
        if visibleSet[i] == 0:
            documentElement = documentsWithSentences[i]
            corpusElement.append(documentElement)
            visibleSetDocuments += 1
            # Count the direct "sentence" children of the document element.
            visibleSetSentences += len(documentElement.findall("sentence"))
    ETUtils.write(corpusElement, options.output)
    # Summary statistics go to stderr (Python 2 print-chevron syntax).
    print >> sys.stderr, "Total:", str(len(documentElements)) + " documents"
    print >> sys.stderr, "Total:", str(len(documentsWithSentences)) + " documents with sentences"
    print >> sys.stderr, "Total:", str(totalSentences) + " sentences"
    print >> sys.stderr, "Visible Set:", str(visibleSetDocuments) + " documents"
    print >> sys.stderr, "Visible Set:", str(visibleSetSentences) + " sentences"
Ejemplo n.º 3
0
    # Root element of the output corpus; the documents selected below are
    # appended to it before it is written to options.output.
    # NOTE(review): ET is presumably an ElementTree implementation -- confirm
    # against the file's imports.
    corpusElement = ET.Element("corpus")
    corpusElement.attrib["source"] = "GENIA"
    totalSentences = 0
    documentsWithSentences = []
    for documentElement in documentElements:
        # Parse graphs are stored in the attrib dict under "parseGraphs";
        # fetch them and remove the entry before the element is written
        # (presumably because they are not plain string attributes -- TODO
        # confirm).
        parseGraphs = documentElement.attrib["parseGraphs"]
        del documentElement.attrib["parseGraphs"]
        sentenceCount = 0
        for parseGraph in parseGraphs:
            # sentenceCount doubles as the index of this graph within the
            # current document.
            parseGraph.writeToInteractionXML(documentElement, sentenceCount)
            sentenceCount += 1
        # Only documents that produced at least one sentence can be selected.
        if sentenceCount > 0:
            documentsWithSentences.append(documentElement)
        totalSentences += sentenceCount

    # One entry per document with sentences; documents whose entry equals 0
    # are the ones appended to the corpus.
    # NOTE(review): what options.visibleSet and the constant 0 mean
    # (presumably a sample fraction and a random seed) is defined by
    # Split.getSample -- confirm there.
    visibleSet = Split.getSample(len(documentsWithSentences),
                                 options.visibleSet, 0)
    visibleSetDocuments = 0
    visibleSetSentences = 0
    for i in range(len(documentsWithSentences)):
        if visibleSet[i] == 0:
            documentElement = documentsWithSentences[i]
            corpusElement.append(documentElement)
            visibleSetDocuments += 1
            # Count the direct "sentence" children of the document element.
            visibleSetSentences += len(documentElement.findall("sentence"))
    ETUtils.write(corpusElement, options.output)
    # Summary statistics go to stderr (Python 2 print-chevron syntax).
    print >> sys.stderr, "Total:", str(len(documentElements)) + " documents"
    print >> sys.stderr, "Total:", str(
        len(documentsWithSentences)) + " documents with sentences"
    print >> sys.stderr, "Total:", str(totalSentences) + " sentences"
    print >> sys.stderr, "Visible Set:", str(
        visibleSetDocuments) + " documents"