def __init__(self, language, partition="together", storeMorph=False, splitLemmas=False, shuffleData=True, shuffleDataSeed=None, splitWords=False, ignoreCorporaWithoutWords=True, size=1000000000):
    """Load a corpus, shuffle it with a fixed seed and keep at most ``size`` entries.

    Args:
        language: Corpus identifier. Names starting with "ISWOC_" or
            "TOROT_" are read by the corresponding reader with that text
            removed; "BKTreebank_Vietnamese", "TuebaJS" and "LDC2012T05"
            have dedicated readers; anything else goes to ``readUDCorpus``.
        partition: Passed to the reader. For "TuebaJS" it must be
            "together"; the "train" and "dev" portions are concatenated.
        storeMorph: Stored unchanged as ``self.storeMorph``.
        splitLemmas: Only accepted when ``language == "Korean"``.
        shuffleData: Must be truthy; this constructor refuses to return
            unshuffled data.
        shuffleDataSeed: Seed for the shuffle. Required (must not be None).
        splitWords: Must be True exactly when ``language`` is
            "BKTreebank_Vietnamese".
        ignoreCorporaWithoutWords: Forwarded to ``readUDCorpus``.
        size: Upper bound applied with ``data[:size]`` after shuffling.

    Raises:
        AssertionError: On an invalid argument combination, a missing
            shuffle seed, or an empty corpus.
    """
    # Same output as the Python 2 statement `print >> sys.stderr, (...)`,
    # but also valid on Python 3.
    sys.stderr.write("%s\n" % (("LANGUAGE", language),))

    if splitLemmas:
        assert language == "Korean"
    self.splitLemmas = splitLemmas
    self.splitWords = splitWords
    assert self.splitWords == (language == "BKTreebank_Vietnamese")
    self.storeMorph = storeMorph

    # Reject unusable shuffle settings before the (potentially expensive)
    # corpus load. Raised explicitly rather than via `assert False` so the
    # check is not stripped under `python -O`.
    if not shuffleData:
        raise AssertionError("shuffleData must be True")
    if shuffleDataSeed is None:
        raise AssertionError("shuffleDataSeed is required")

    if language.startswith("ISWOC_"):
        data = accessISWOCData.readISWOCCorpus(
            language.replace("ISWOC_", ""), partition)
    elif language.startswith("TOROT_"):
        data = accessTOROTData.readTOROTCorpus(
            language.replace("TOROT_", ""), partition)
    elif language == "BKTreebank_Vietnamese":
        import accessBKTreebank
        data = accessBKTreebank.readBKTreebank(partition)
    elif language == "TuebaJS":
        import accessTuebaJS
        assert partition == "together", partition
        data_valid = accessTuebaJS.readTuebaJSTreebank("dev")
        data_train = accessTuebaJS.readTuebaJSTreebank("train")
        data = data_train + data_valid
        assert len(data) > 0, (language, partition)
    elif language == "LDC2012T05":
        import accessChineseDependencyTreebank
        data = accessChineseDependencyTreebank.readChineseDependencyTreebank(
            partition)
        assert len(data) > 0, (language, partition)
    else:
        data = readUDCorpus(
            language, partition,
            ignoreCorporaWithoutWords=ignoreCorporaWithoutWords)

    # A private Random instance keeps the order reproducible without
    # touching the global random state.
    random.Random(shuffleDataSeed).shuffle(data)

    data = data[:size]
    self.data = data
    self.partition = partition
    self.language = language
    assert len(data) > 0, (language, partition)
def __init__(self, language, partition, storeMorph=False, splitLemmas=False, shuffleData=True, shuffleDataSeed=None, splitWords=False, ignoreCorporaWithoutWords=True, errorWhenEmpty=True):
    """Load a corpus partition and optionally shuffle it.

    Args:
        language: Corpus identifier. Names starting with "ISWOC_" or
            "TOROT_" are read by the corresponding reader with that text
            removed; "BKTreebank_Vietnamese", "TuebaJS" and "LDC2012T05"
            have dedicated readers; anything else goes to ``readUDCorpus``.
        partition: Passed unchanged to the selected reader.
        storeMorph: Stored unchanged as ``self.storeMorph``.
        splitLemmas: Only accepted when ``language == "Korean"``.
        shuffleData: When truthy, the loaded data is shuffled in place.
        shuffleDataSeed: Seed for the shuffle; None uses the global
            ``random`` state.
        splitWords: Must be True exactly when ``language`` is
            "BKTreebank_Vietnamese".
        ignoreCorporaWithoutWords: Forwarded to ``readUDCorpus``.
        errorWhenEmpty: When truthy, an empty corpus raises; also
            forwarded to ``readUDCorpus``.

    Raises:
        AssertionError: On an invalid argument combination, or when the
            corpus is empty and ``errorWhenEmpty`` is truthy.
    """
    # Same output as the Python 2 statement `print >> sys.stderr, (...)`,
    # but also valid on Python 3.
    sys.stderr.write("%s\n" % (("LANGUAGE", language),))

    if splitLemmas:
        assert language == "Korean"
    self.splitLemmas = splitLemmas
    self.splitWords = splitWords
    assert self.splitWords == (language == "BKTreebank_Vietnamese")
    self.storeMorph = storeMorph

    # No per-reader emptiness checks here: the single check at the end
    # covers every branch and is the only one that honours errorWhenEmpty.
    if language.startswith("ISWOC_"):
        data = accessISWOCData.readISWOCCorpus(
            language.replace("ISWOC_", ""), partition)
    elif language.startswith("TOROT_"):
        data = accessTOROTData.readTOROTCorpus(
            language.replace("TOROT_", ""), partition)
    elif language == "BKTreebank_Vietnamese":
        import accessBKTreebank
        data = accessBKTreebank.readBKTreebank(partition)
    elif language == "TuebaJS":
        import accessTuebaJS
        data = accessTuebaJS.readTuebaJSTreebank(partition)
    elif language == "LDC2012T05":
        import accessChineseDependencyTreebank
        data = accessChineseDependencyTreebank.readChineseDependencyTreebank(
            partition)
    else:
        data = readUDCorpus(
            language, partition,
            ignoreCorporaWithoutWords=ignoreCorporaWithoutWords,
            errorWhenEmpty=errorWhenEmpty)

    if shuffleData:
        if shuffleDataSeed is None:
            random.shuffle(data)
        else:
            random.Random(shuffleDataSeed).shuffle(data)

    self.data = data
    self.partition = partition
    self.language = language
    if errorWhenEmpty:
        assert len(data) > 0, (language, partition)
def __init__(self, language, partition="train", storeMorph=False, splitLemmas=False, shuffleData=True, shuffleDataSeed=None, splitWords=False):
    """Read the requested corpus partition and optionally shuffle it.

    The reader is picked from ``language``: an "ISWOC_" or "TOROT_" prefix,
    one of the dedicated names ("BKTreebank_Vietnamese", "TuebaJS",
    "LDC2012T05", "PTB"), or ``readUDCorpus`` for everything else.

    ``splitLemmas`` is only accepted for "Korean" and ``splitWords`` must be
    True exactly for "BKTreebank_Vietnamese". With ``shuffleData`` truthy the
    corpus is shuffled in place, using ``shuffleDataSeed`` when it is not
    None and the global ``random`` state otherwise. An empty corpus raises
    AssertionError.
    """
    assert (not splitLemmas) or language == "Korean"
    self.splitLemmas = splitLemmas
    self.splitWords = splitWords
    isBKTreebank = (language == "BKTreebank_Vietnamese")
    assert self.splitWords == isBKTreebank
    self.storeMorph = storeMorph

    # Reader modules are imported lazily, only for the corpus requested.
    if language.startswith("ISWOC_"):
        import accessISWOCData
        corpusName = language.replace("ISWOC_", "")
        corpus = accessISWOCData.readISWOCCorpus(corpusName, partition)
    elif language.startswith("TOROT_"):
        import accessTOROTData
        corpusName = language.replace("TOROT_", "")
        corpus = accessTOROTData.readTOROTCorpus(corpusName, partition)
    elif isBKTreebank:
        import accessBKTreebank
        corpus = accessBKTreebank.readBKTreebank(partition)
    elif language == "TuebaJS":
        import accessTuebaJS
        corpus = accessTuebaJS.readTuebaJSTreebank(partition)
        assert len(corpus) > 0, (language, partition)
    elif language == "LDC2012T05":
        import accessChineseDependencyTreebank
        corpus = accessChineseDependencyTreebank.readChineseDependencyTreebank(partition)
        assert len(corpus) > 0, (language, partition)
    elif language == "PTB":
        import accessPTB
        corpus = accessPTB.readDependencyPTB(partition)
        assert len(corpus) > 0, (language, partition)
    else:
        corpus = readUDCorpus(language, partition)

    if shuffleData:
        # Without a seed, fall back to the module-level generator.
        if shuffleDataSeed is None:
            shuffler = random
        else:
            shuffler = random.Random(shuffleDataSeed)
        shuffler.shuffle(corpus)

    self.data = corpus
    self.partition = partition
    self.language = language
    assert len(corpus) > 0, (language, partition)
def __init__(self, language, partition="train", storeMorph=False, splitLemmas=False, shuffleData=True, shuffleDataSeed=5, splitWords=False, trainSize=None, devSize=None):
    """Load a "train" or "dev" partition, shuffle it with a seed and truncate it.

    Args:
        language: Corpus identifier. "BKTreebank_Vietnamese", "TuebaJS" and
            "LDC2012T05" have dedicated readers; anything else goes to
            ``readUDCorpus``.
        partition: Either "train" or "dev"; passed to the reader and used
            to pick the size limit.
        storeMorph: Stored unchanged as ``self.storeMorph``.
        splitLemmas: Only accepted when ``language == "Korean"``.
        shuffleData: Must be truthy; this constructor refuses to return
            unshuffled data.
        shuffleDataSeed: Seed for the shuffle. Must not be None.
        splitWords: Must be True exactly when ``language`` is
            "BKTreebank_Vietnamese".
        trainSize: Limit applied as ``data[:trainSize]`` for "train";
            None keeps everything.
        devSize: Limit applied as ``data[:devSize]`` for "dev"; None keeps
            everything.

    Raises:
        AssertionError: On an invalid argument combination, a missing
            shuffle seed, an unsupported partition, or an empty corpus.
    """
    if splitLemmas:
        assert language == "Korean"
    self.splitLemmas = splitLemmas
    self.splitWords = splitWords
    assert self.splitWords == (language == "BKTreebank_Vietnamese")
    self.storeMorph = storeMorph

    # Reject unusable settings before the (potentially expensive) corpus
    # load. Raised explicitly rather than via `assert False` so the checks
    # are not stripped under `python -O`.
    if not shuffleData:
        raise AssertionError("shuffleData must be True")
    if shuffleDataSeed is None:
        raise AssertionError("shuffleDataSeed is required")
    if partition not in ("dev", "train"):
        raise AssertionError(("unsupported partition", partition))

    if language == "BKTreebank_Vietnamese":
        import accessBKTreebank
        data = accessBKTreebank.readBKTreebank(partition)
    elif language == "TuebaJS":
        import accessTuebaJS
        data = accessTuebaJS.readTuebaJSTreebank(partition)
        assert len(data) > 0, (language, partition)
    elif language == "LDC2012T05":
        import accessChineseDependencyTreebank
        data = accessChineseDependencyTreebank.readChineseDependencyTreebank(partition)
        assert len(data) > 0, (language, partition)
    else:
        data = readUDCorpus(language, partition)

    # A private Random instance keeps the order reproducible without
    # touching the global random state.
    random.Random(shuffleDataSeed).shuffle(data)

    # Truncate after shuffling; a None limit leaves the data untouched.
    limit = devSize if partition == "dev" else trainSize
    data = data[:limit]

    self.data = data
    self.partition = partition
    self.language = language
    assert len(data) > 0, (language, partition)