def __init__(self,
                 language,
                 partition="together",
                 storeMorph=False,
                 splitLemmas=False,
                 shuffleData=True,
                 shuffleDataSeed=None,
                 splitWords=False,
                 ignoreCorporaWithoutWords=True,
                 size=1000000000):
        """Load the corpus selected by `language`, shuffle it with a seed, truncate it.

        The reader is chosen from `language`:
          * names starting with "ISWOC_" or "TOROT_" go to the ISWOC / TOROT
            readers, with that prefix text removed from the name;
          * "BKTreebank_Vietnamese", "TuebaJS" and "LDC2012T05" use their
            dedicated reader modules (imported on demand);
          * every other name is passed to `readUDCorpus`.

        Parameters:
          language: corpus identifier (see above). It is also written to
            stderr as a ("LANGUAGE", language) tuple.
          partition: partition name handed to the reader. For "TuebaJS" it
            must be "together"; the "train" and "dev" partitions are then
            read separately and concatenated in that order.
          storeMorph: stored on the instance as `self.storeMorph`.
          splitLemmas: stored on the instance; may only be true when
            `language` is "Korean".
          shuffleData: must be true; a false value is rejected.
          shuffleDataSeed: seed for `random.Random`; must not be None.
          splitWords: stored on the instance; must be true exactly when
            `language` is "BKTreebank_Vietnamese".
          ignoreCorporaWithoutWords: forwarded to `readUDCorpus` only.
          size: upper bound on the number of items kept after shuffling
            (the data is sliced as `data[:size]`).

        Raises:
          AssertionError: if one of the argument constraints above is
            violated, if shuffling is disabled or unseeded, or if the
            resulting data is empty.
        """
        # Python 2 print-chevron form: writes the tuple to stderr.
        print >> sys.stderr, ("LANGUAGE", language)
        if splitLemmas:
            assert language == "Korean"
        self.splitLemmas = splitLemmas
        self.splitWords = splitWords
        assert self.splitWords == (language == "BKTreebank_Vietnamese")

        self.storeMorph = storeMorph
        if language.startswith("ISWOC_"):
            data = accessISWOCData.readISWOCCorpus(
                language.replace("ISWOC_", ""), partition)
        elif language.startswith("TOROT_"):
            data = accessTOROTData.readTOROTCorpus(
                language.replace("TOROT_", ""), partition)
        elif language == "BKTreebank_Vietnamese":
            import accessBKTreebank
            data = accessBKTreebank.readBKTreebank(partition)
        elif language == "TuebaJS":
            import accessTuebaJS
            # Only the combined view is supported here: train followed by dev.
            assert partition == "together", partition
            data_valid = accessTuebaJS.readTuebaJSTreebank("dev")
            data_train = accessTuebaJS.readTuebaJSTreebank("train")
            data = data_train + data_valid
            assert len(data) > 0, (language, partition)
        elif language == "LDC2012T05":
            import accessChineseDependencyTreebank
            data = accessChineseDependencyTreebank.readChineseDependencyTreebank(
                partition)
            assert len(data) > 0, (language, partition)

        else:
            data = readUDCorpus(
                language,
                partition,
                ignoreCorporaWithoutWords=ignoreCorporaWithoutWords)

        # This variant only supports a seeded shuffle (presumably so that the
        # subset kept by `size` is reproducible). The checks are explicit
        # raises rather than `assert False` so that they are still enforced
        # when Python runs with -O, which strips assert statements.
        if not shuffleData:
            raise AssertionError(
                "shuffleData=False is not supported; a seeded shuffle is required")
        if shuffleDataSeed is None:
            raise AssertionError(
                "shuffleDataSeed must be given; unseeded shuffling is not supported")
        random.Random(shuffleDataSeed).shuffle(data)
        data = data[:size]

        self.data = data
        self.partition = partition
        self.language = language
        assert len(data) > 0, (language, partition)
    def __init__(self,
                 language,
                 partition,
                 storeMorph=False,
                 splitLemmas=False,
                 shuffleData=True,
                 shuffleDataSeed=None,
                 splitWords=False,
                 ignoreCorporaWithoutWords=True,
                 errorWhenEmpty=True):
        """Read one partition of the corpus named `language` and keep it on self.

        Reader selection: an "ISWOC_" or "TOROT_" prefix routes to the
        ISWOC / TOROT readers (the prefix text is removed from the name);
        "BKTreebank_Vietnamese", "TuebaJS" and "LDC2012T05" use their own
        reader modules; any other name is handed to `readUDCorpus`.

        Parameters:
          language: corpus identifier; also written to stderr.
          partition: partition name forwarded to the selected reader.
          storeMorph: stored as `self.storeMorph`.
          splitLemmas: stored on the instance; only allowed for "Korean".
          shuffleData: when true, the loaded data is shuffled in place.
          shuffleDataSeed: seed for a private `random.Random`; with None the
            module-level `random.shuffle` is used instead.
          splitWords: stored on the instance; must be true exactly when
            `language` is "BKTreebank_Vietnamese".
          ignoreCorporaWithoutWords: forwarded to `readUDCorpus` only.
          errorWhenEmpty: forwarded to `readUDCorpus`, and when true the
            final data must be non-empty. Note that the "TuebaJS" and
            "LDC2012T05" branches require non-empty data regardless.

        Raises:
          AssertionError: when an argument constraint or a non-empty check
            fails.
        """
        # Python 2 print-chevron form: writes the tuple to stderr.
        print >> sys.stderr, ("LANGUAGE", language)

        # Lemma splitting is only permitted for Korean.
        assert (not splitLemmas) or language == "Korean"
        self.splitLemmas = splitLemmas
        self.splitWords = splitWords
        isBKTreebank = language == "BKTreebank_Vietnamese"
        assert self.splitWords == isBKTreebank
        self.storeMorph = storeMorph

        if language.startswith("ISWOC_"):
            subcorpus = language.replace("ISWOC_", "")
            corpus = accessISWOCData.readISWOCCorpus(subcorpus, partition)
        elif language.startswith("TOROT_"):
            subcorpus = language.replace("TOROT_", "")
            corpus = accessTOROTData.readTOROTCorpus(subcorpus, partition)
        elif isBKTreebank:
            import accessBKTreebank
            corpus = accessBKTreebank.readBKTreebank(partition)
        elif language == "TuebaJS":
            import accessTuebaJS
            corpus = accessTuebaJS.readTuebaJSTreebank(partition)
            assert len(corpus) > 0, (language, partition)
        elif language == "LDC2012T05":
            import accessChineseDependencyTreebank
            readChinese = accessChineseDependencyTreebank.readChineseDependencyTreebank
            corpus = readChinese(partition)
            assert len(corpus) > 0, (language, partition)
        else:
            corpus = readUDCorpus(
                language,
                partition,
                ignoreCorporaWithoutWords=ignoreCorporaWithoutWords,
                errorWhenEmpty=errorWhenEmpty)

        if shuffleData:
            # The `random` module itself stands in for the unseeded generator.
            shuffler = (random if shuffleDataSeed is None
                        else random.Random(shuffleDataSeed))
            shuffler.shuffle(corpus)

        self.data = corpus
        self.partition = partition
        self.language = language
        assert (not errorWhenEmpty) or len(corpus) > 0, (language, partition)
Example #3
0
    def __init__(self,
                 language,
                 partition="train",
                 storeMorph=False,
                 splitLemmas=False,
                 shuffleData=True,
                 shuffleDataSeed=None,
                 splitWords=False):
        """Read one partition of the corpus named `language` and keep it on self.

        Reader selection: an "ISWOC_" or "TOROT_" prefix routes to the
        ISWOC / TOROT readers (the prefix text is removed from the name);
        "BKTreebank_Vietnamese", "TuebaJS", "LDC2012T05" and "PTB" use their
        own reader modules; any other name is handed to `readUDCorpus`.
        All reader modules are imported on demand.

        Parameters:
          language: corpus identifier.
          partition: partition name forwarded to the selected reader.
          storeMorph: stored as `self.storeMorph`.
          splitLemmas: stored on the instance; only allowed for "Korean".
          shuffleData: when true, the loaded data is shuffled in place.
          shuffleDataSeed: seed for a private `random.Random`; with None the
            module-level `random.shuffle` is used instead.
          splitWords: stored on the instance; must be true exactly when
            `language` is "BKTreebank_Vietnamese".

        Raises:
          AssertionError: when an argument constraint fails or the loaded
            data is empty.
        """
        # Lemma splitting is only permitted for Korean.
        assert (not splitLemmas) or language == "Korean"
        self.splitLemmas = splitLemmas
        self.splitWords = splitWords
        isBKTreebank = language == "BKTreebank_Vietnamese"
        assert self.splitWords == isBKTreebank
        self.storeMorph = storeMorph

        if language.startswith("ISWOC_"):
            import accessISWOCData
            subcorpus = language.replace("ISWOC_", "")
            corpus = accessISWOCData.readISWOCCorpus(subcorpus, partition)
        elif language.startswith("TOROT_"):
            import accessTOROTData
            subcorpus = language.replace("TOROT_", "")
            corpus = accessTOROTData.readTOROTCorpus(subcorpus, partition)
        elif isBKTreebank:
            import accessBKTreebank
            corpus = accessBKTreebank.readBKTreebank(partition)
        elif language == "TuebaJS":
            import accessTuebaJS
            corpus = accessTuebaJS.readTuebaJSTreebank(partition)
            assert len(corpus) > 0, (language, partition)
        elif language == "LDC2012T05":
            import accessChineseDependencyTreebank
            readChinese = accessChineseDependencyTreebank.readChineseDependencyTreebank
            corpus = readChinese(partition)
            assert len(corpus) > 0, (language, partition)
        elif language == "PTB":
            import accessPTB
            corpus = accessPTB.readDependencyPTB(partition)
            assert len(corpus) > 0, (language, partition)
        else:
            corpus = readUDCorpus(language, partition)

        if shuffleData:
            # The `random` module itself stands in for the unseeded generator.
            shuffler = (random if shuffleDataSeed is None
                        else random.Random(shuffleDataSeed))
            shuffler.shuffle(corpus)

        self.data = corpus
        self.partition = partition
        self.language = language
        assert len(corpus) > 0, (language, partition)
   def __init__(self, language, partition="train", storeMorph=False,
                splitLemmas=False, shuffleData=True, shuffleDataSeed=5,
                splitWords=False, trainSize=None, devSize=None):
      """Load the "train" or "dev" partition of a corpus, shuffle it, truncate it.

      Reader selection: "BKTreebank_Vietnamese", "TuebaJS" and "LDC2012T05"
      use their own reader modules (imported on demand); any other name is
      handed to `readUDCorpus`.

      Parameters:
        language: corpus identifier.
        partition: partition name forwarded to the reader. After loading,
          only "train" and "dev" are accepted.
        storeMorph: stored as `self.storeMorph`.
        splitLemmas: stored on the instance; only allowed for "Korean".
        shuffleData: must be true; a false value is rejected.
        shuffleDataSeed: seed for `random.Random`; must not be None.
        splitWords: stored on the instance; must be true exactly when
          `language` is "BKTreebank_Vietnamese".
        trainSize: slice bound applied when partition is "train"
          (`data[:trainSize]`; None keeps everything for a list).
        devSize: slice bound applied when partition is "dev"
          (`data[:devSize]`; None keeps everything for a list).

      Raises:
        AssertionError: if an argument constraint is violated, if shuffling
          is disabled or unseeded, if the partition is neither "train" nor
          "dev", or if the resulting data is empty.
      """
      if splitLemmas:
         assert language == "Korean"
      self.splitLemmas = splitLemmas
      self.splitWords = splitWords
      assert self.splitWords == (language == "BKTreebank_Vietnamese")

      self.storeMorph = storeMorph
      if language == "BKTreebank_Vietnamese":
         import accessBKTreebank
         data = accessBKTreebank.readBKTreebank(partition)
      elif language == "TuebaJS":
         import accessTuebaJS
         data = accessTuebaJS.readTuebaJSTreebank(partition)
         assert len(data) > 0, (language, partition)
      elif language == "LDC2012T05":
         import accessChineseDependencyTreebank
         data = accessChineseDependencyTreebank.readChineseDependencyTreebank(partition)
         assert len(data) > 0, (language, partition)
      else:
         data = readUDCorpus(language, partition)

      # This variant only supports a seeded shuffle (presumably so that the
      # subset kept by trainSize / devSize is reproducible). The checks are
      # explicit raises rather than `assert False` so that they are still
      # enforced when Python runs with -O, which strips assert statements.
      if not shuffleData:
         raise AssertionError(
            "shuffleData=False is not supported; a seeded shuffle is required")
      if shuffleDataSeed is None:
         raise AssertionError(
            "shuffleDataSeed must be given; unseeded shuffling is not supported")
      random.Random(shuffleDataSeed).shuffle(data)

      # Truncate after shuffling; the bound depends on the partition.
      if partition == "dev":
         data = data[:devSize]
      elif partition == "train":
         data = data[:trainSize]
      else:
         raise AssertionError(
            "partition must be 'dev' or 'train', got %r" % (partition,))

      self.data = data
      self.partition = partition
      self.language = language
      assert len(data) > 0, (language, partition)