def getFolds(path, folds, seed=0):
    """Map every numbered document found in a directory to a fold.

    A document number is the part of a file name before its first ".",
    provided that part consists only of digits. Several files sharing the
    same number count as a single document.

    Args:
        path: Directory whose file names are scanned.
        folds: Passed unchanged as the second argument of Split.getFolds
            (presumably the number of folds -- confirm against Split).
        seed: Passed unchanged as the third argument of Split.getFolds.

    Returns:
        Dict whose keys are the document numbers (int) and whose values are
        the entries of the Split.getFolds result at the position of each
        number in ascending order.
    """
    docNumbers = set()
    for fileName in os.listdir(path):
        numPart = fileName.split(".", 1)[0]
        if numPart.isdigit():
            docNumbers.add(int(numPart))

    # os.listdir yields names in arbitrary order and set iteration order is
    # an implementation detail, so sort: the number-to-fold pairing then
    # depends only on the directory contents and the arguments.
    docNumbers = sorted(docNumbers)

    # Kept in a separate name so the "folds" argument is not overwritten.
    foldAssignments = Split.getFolds(len(docNumbers), folds, seed)

    foldByDocNumber = {}
    for index, docNumber in enumerate(docNumbers):
        foldByDocNumber[docNumber] = foldAssignments[index]
    return foldByDocNumber
# Build the output corpus: write each document's parse graphs into its
# element, remember the documents that received at least one sentence, and
# add to the corpus only those the sample marks with 0.
corpusElement = ET.Element("corpus")
corpusElement.attrib["source"] = "GENIA"

totalSentences = 0
documentsWithSentences = []
for documentElement in documentElements:
    # The parse graphs are read from the attribute dict and removed from it
    # in one step.
    parseGraphs = documentElement.attrib.pop("parseGraphs")
    sentenceCount = 0
    for sentenceIndex, parseGraph in enumerate(parseGraphs):
        parseGraph.writeToInteractionXML(documentElement, sentenceIndex)
        sentenceCount = sentenceIndex + 1
    if sentenceCount > 0:
        documentsWithSentences.append(documentElement)
    totalSentences += sentenceCount

visibleSet = Split.getSample(len(documentsWithSentences), options.visibleSet, 0)
visibleSetDocuments = 0
visibleSetSentences = 0
for i, documentElement in enumerate(documentsWithSentences):
    if visibleSet[i] != 0:
        continue  # only entries marked 0 go into the written corpus
    corpusElement.append(documentElement)
    visibleSetDocuments += 1
    visibleSetSentences += len(documentElement.findall("sentence"))

ETUtils.write(corpusElement, options.output)

# Summary statistics go to stderr.
print >> sys.stderr, "Total:", str(len(documentElements)) + " documents"
print >> sys.stderr, "Total:", str(len(documentsWithSentences)) + " documents with sentences"
print >> sys.stderr, "Total:", str(totalSentences) + " sentences"
print >> sys.stderr, "Visible Set:", str(visibleSetDocuments) + " documents"
print >> sys.stderr, "Visible Set:", str(visibleSetSentences) + " sentences"
# Assemble a "GENIA" corpus element from the parsed documents, write the
# visible subset to options.output and report counts on stderr.
corpusElement = ET.Element("corpus")
corpusElement.attrib["source"] = "GENIA"

totalSentences = 0
documentsWithSentences = []
for documentElement in documentElements:
    # Take the parse graphs out of the attribute dict (read and delete).
    parseGraphs = documentElement.attrib.pop("parseGraphs")
    sentenceCount = 0
    for parseGraph in parseGraphs:
        # The running count doubles as the index handed to the writer.
        parseGraph.writeToInteractionXML(documentElement, sentenceCount)
        sentenceCount += 1
    totalSentences += sentenceCount
    if sentenceCount > 0:
        documentsWithSentences.append(documentElement)

visibleSet = Split.getSample(len(documentsWithSentences), options.visibleSet, 0)
visibleSetDocuments = 0
visibleSetSentences = 0
for i, documentElement in enumerate(documentsWithSentences):
    # Documents whose sample entry is not 0 are left out of the corpus.
    if visibleSet[i] != 0:
        continue
    corpusElement.append(documentElement)
    visibleSetDocuments += 1
    visibleSetSentences += len(documentElement.findall("sentence"))

ETUtils.write(corpusElement, options.output)

# NOTE(review): visibleSetSentences is accumulated above but not printed in
# this block -- confirm whether a report line for it is expected here.
print >> sys.stderr, "Total:", str(len(documentElements)) + " documents"
print >> sys.stderr, "Total:", str(len(documentsWithSentences)) + " documents with sentences"
print >> sys.stderr, "Total:", str(totalSentences) + " sentences"
print >> sys.stderr, "Visible Set:", str(visibleSetDocuments) + " documents"