def __init__(self, language, partition="train", storeMorph=False,
              splitLemmas=False, shuffleDataSeed=None):
     """Build the underlying ``CorpusIterator_V`` and keep it as ``self.basis``.

     Every argument is forwarded unchanged to ``CorpusIterator_V``:
     ``language`` positionally, the rest as keyword arguments of the
     same name. Their meaning is defined by ``CorpusIterator_V`` itself.
     """
     forwarded = {
         "partition": partition,
         "storeMorph": storeMorph,
         "splitLemmas": splitLemmas,
         "shuffleDataSeed": shuffleDataSeed,
     }
     self.basis = CorpusIterator_V(language, **forwarded)
 def __init__(self, language, partition="train", fraction=1.0,
              storeMorph=False, splitLemmas=False):
     """Wrap a ``CorpusIterator_V`` restricted to a leading share of its data.

     The wrapped iterator is always created with ``shuffleDataSeed=4``.
     Its ``data`` sequence is then cut down to the first
     ``int(fraction * len(data))`` entries, ``self.permute()`` is called,
     and finally ``fraction`` is recorded on the instance.

     NOTE(review): ``permute`` is defined outside this block and runs
     before ``self.fraction`` exists -- confirm it does not read it.
     """
     self.basis = CorpusIterator_V(language,
                                   partition=partition,
                                   storeMorph=storeMorph,
                                   splitLemmas=splitLemmas,
                                   shuffleDataSeed=4)

     # Keep only the leading portion of the loaded data.
     fullData = self.basis.data
     cutoff = int(fraction * len(fullData))
     self.basis.data = fullData[:cutoff]

     self.permute()
     self.fraction = fraction
Exemple #3
0
def runOnCorpus():
    """Run ``computeSurprisals`` over the "dev" partition, batch by batch.

    Side effects:
      * (Re)binds the module-level ``chart`` to a
        ``MAX_BOUNDARY x MAX_BOUNDARY`` nested list of CUDA float tensors,
        each of shape ``(BATCHSIZE, len(itos_setOfNonterminals))`` and
        filled with ``-inf``.
      * If the final batch is shorter than ``args.BATCHSIZE``,
        ``args.BATCHSIZE`` is overwritten with the short batch's size
        (and stays that way after returning) and ``chart`` is rebuilt to
        match.
      * Prints ``sentCount`` together with the successive differences of
        the current surprisal estimates before and after every batch.

    Returns:
      A list of ``args.MAX_BOUNDARY`` values,
      ``surprisalTableSums[i] / (surprisalTableCounts[i] + 1e-9)``, as of
      the last processed batch (all zeros if the corpus yielded nothing).
    """
    global chart

    def allocateChart():
        # One -inf tensor per cell of the MAX_BOUNDARY x MAX_BOUNDARY grid,
        # sized for the *current* value of args.BATCHSIZE.
        return [[
            torch.cuda.FloatTensor(
                [[float("-Inf") for _ in itos_setOfNonterminals]
                 for _ in range(args.BATCHSIZE)])
            for _ in range(args.MAX_BOUNDARY)
        ] for _ in range(args.MAX_BOUNDARY)]

    def reportDeltas(values):
        # Differences between neighbouring entries of the estimate list.
        deltas = [later - earlier
                  for earlier, later in zip(values, values[1:])]
        print(sentCount, deltas)

    chart = allocateChart()

    devCorpus = CorpusIterator_V(args.language, "dev", shuffleDataSeed=4)
    sentenceStream = iterator_dense(devCorpus.iterator())
    surprisals = [0] * args.MAX_BOUNDARY

    while True:
        batch = []
        try:
            for _ in range(args.BATCHSIZE):
                batch.append(next(sentenceStream))
        except StopIteration:
            if not batch:
                # Corpus exhausted exactly at a batch boundary.
                break
            # Short final batch: shrink the batch size and the chart with it.
            args.BATCHSIZE = len(batch)
            chart = allocateChart()

        reportDeltas(surprisals)

        computeSurprisals(batch)
        # The small epsilon guards against division by a zero count.
        surprisals = [
            surprisalTableSums[i] / (surprisalTableCounts[i] + 1e-9)
            for i in range(args.MAX_BOUNDARY)
        ]
        reportDeltas(surprisals)

    return surprisals
     assert tree["category"] in leftCornerCounts
     return leftCorner


def linearizeTree2String(tree, sent):
    """Append the words of ``tree``'s leaves to ``sent``, in child order.

    A node whose ``"children"`` entry is ``None`` is a leaf and contributes
    its ``"word"``; any other node contributes whatever its children
    contribute, visited in the order they are stored. ``sent`` is modified
    in place and nothing is returned.
    """
    offspring = tree["children"]
    if offspring is None:
        sent.append(tree["word"])
        return
    for subtree in offspring:
        linearizeTree2String(subtree, sent)


# Number of training sentences processed so far; drives the periodic
# output below and is also printed by runOnCorpus.
sentCount = 0

print("Collecting counts from training corpus")
for sentence in CorpusIterator_V(args.language,"train", ignoreCorporaWithoutWords=True).iterator():
   sentCount += 1
   # The second argument is True on every 400th sentence
   # (presumably a verbosity/debug switch -- confirm against orderSentence).
   ordered = orderSentence(sentence,  sentCount % 400 == 0)


   # Flatten the ordered tree into the sequence of its leaf words.
   linearized = []
   linearizeTree2String(ordered, linearized)
#   if len(linearized) > 10:
 #     continue

#   print(ordered)
   # Count how often each category occurs at the top node of a tree,
   # together with the total number of trees counted.
   roots[ordered["category"]] = roots.get(ordered["category"], 0) + 1
   rootsTotal = rootsTotal + 1

   # Progress output every 100 sentences.
   if sentCount % 100 == 0:
      print(sentCount, ordered["category"])