class CorpusIteratorFuncHead_V():
    """Corpus wrapper that applies reverse_content_head to every sentence.

    Delegates all storage and iteration to a CorpusIterator_V instance and
    rewrites each sentence (function words promoted to heads) on the way out.
    """

    def __init__(self,
                 language,
                 partition="train",
                 storeMorph=False,
                 splitLemmas=False,
                 shuffleDataSeed=None):
        # All options are forwarded unchanged to the underlying iterator.
        self.basis = CorpusIterator_V(language,
                                      partition=partition,
                                      storeMorph=storeMorph,
                                      splitLemmas=splitLemmas,
                                      shuffleDataSeed=shuffleDataSeed)

    def permute(self):
        """Shuffle the underlying corpus in place."""
        self.basis.permute()

    def length(self):
        """Number of sentences in the underlying corpus."""
        return self.basis.length()

    def iterator(self, rejectShortSentences=False):
        """Yield each sentence after transforming it with reverse_content_head."""
        base = self.basis.iterator(rejectShortSentences=rejectShortSentences)
        for sentence in base:
            reverse_content_head(sentence)
            yield sentence

    def getSentence(self, index):
        """Return the sentence at *index*, transformed by reverse_content_head."""
        return reverse_content_head(self.basis.getSentence(index))
 def __init__(self,
              language,
              partition="train",
              storeMorph=False,
              splitLemmas=False,
              shuffleDataSeed=None):
     """Wrap a CorpusIterator_V, forwarding every option unchanged."""
     # NOTE(review): orphaned method fragment from a scraped snippet; the
     # enclosing class header is not visible in this excerpt.
     self.basis = CorpusIterator_V(language,
                                   partition=partition,
                                   storeMorph=storeMorph,
                                   splitLemmas=splitLemmas,
                                   shuffleDataSeed=shuffleDataSeed)
 def __init__(self,
              language,
              partition="train",
              fraction=1.0,
              storeMorph=False,
              splitLemmas=False):
     """Keep only the first `fraction` of the (seed-4 shuffled) corpus.

     The shuffle seed is fixed to 4 so the retained subset is reproducible
     across runs; the subset is then re-permuted for iteration order.
     """
     self.basis = CorpusIterator_V(language,
                                   partition=partition,
                                   storeMorph=storeMorph,
                                   splitLemmas=splitLemmas,
                                   shuffleDataSeed=4)
     # Truncate to the requested fraction of sentences.
     self.basis.data = self.basis.data[:int(fraction *
                                            len(self.basis.data))]
     self.permute()
     self.fraction = fraction
Esempio n. 4
0
def initializeOrderTable():
    """Collect word/lemma/POS/dependency statistics from train and dev.

    For every (head POS, dependency, dependent POS) triple, counts how often
    the dependent precedes ("DH") or follows ("HD") its head and accumulates
    head--dependent distances.

    Returns:
        (dhLogits, vocab, keys, depsVocab) where dhLogits[key] is the
        add-one-smoothed log-odds of dependent-before-head order.

    Side effects: mutates the module-level containers countsByPartition,
    vocab_lemmas, posFine, posUni, morphKeyValuePairs and
    originalDistanceWeights (all defined elsewhere in this file).
    """
    orderTable = {}
    keys = set()
    vocab = {}
    distanceSum = {}
    distanceCounts = {}
    depsVocab = set()
    for partition in ["train", "dev"]:
        countsByPartition[partition] = {}
        sentCount = 0
        for sentence in CorpusIterator_V(args.language,
                                         partition,
                                         storeMorph=True,
                                         shuffleDataSeed=4).iterator():
            sentCount += 1
            # Cap the dev scan at 100 sentences to keep initialization fast.
            if sentCount > 100 and partition == "dev":
                break
            for line in sentence:
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                vocab_lemmas[line["lemma"]] = vocab_lemmas.get(
                    line["lemma"], 0) + 1

                countsByPartition[partition][
                    line["word"]] = countsByPartition[partition].get(
                        line["word"], 0) + 1

                depsVocab.add(line["dep"])
                posFine.add(line["posFine"])
                posUni.add(line["posUni"])

                for morph in line["morph"]:
                    morphKeyValuePairs.add(morph)
                # The root has no head, so it contributes no ordering stats.
                if line["dep"] == "root":
                    continue

                posHere = line["posUni"]
                posHead = sentence[line["head"] - 1]["posUni"]
                dep = line["dep"]
                # "DH": dependent before head; "HD": head before dependent.
                direction = "HD" if line["head"] < line["index"] else "DH"
                key = (posHead, dep, posHere)
                keyWithDir = (posHead, dep, posHere, direction)
                orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
                keys.add(key)
                distanceCounts[key] = distanceCounts.get(key, 0.0) + 1.0
                distanceSum[key] = distanceSum.get(
                    key, 0.0) + abs(line["index"] - line["head"])
    #print orderTable
    dhLogits = {}
    for key in keys:
        # Add-one smoothing keeps log() finite for unseen directions.
        hd = orderTable.get((key[0], key[1], key[2], "HD"), 0) + 1.0
        dh = orderTable.get((key[0], key[1], key[2], "DH"), 0) + 1.0
        dhLogit = log(dh) - log(hd)
        dhLogits[key] = dhLogit
        originalDistanceWeights[key] = (distanceSum[key] / distanceCounts[key])
    return dhLogits, vocab, keys, depsVocab
class CorpusIteratorFuncHeadFraction_V():
    """Function-head corpus wrapper restricted to the first `fraction` of the
    data. The shuffle seed is fixed at 4 so the subset is reproducible."""

    def __init__(self, language, partition="train", fraction=1.0, storeMorph=False, splitLemmas=False):
        self.basis = CorpusIterator_V(language, partition=partition, storeMorph=storeMorph, splitLemmas=splitLemmas, shuffleDataSeed=4)
        # Keep only the leading fraction of sentences, then reshuffle.
        self.basis.data = self.basis.data[:int(fraction * len(self.basis.data))]
        self.permute()
        self.fraction = fraction

    def permute(self):
        """Shuffle the retained subset in place."""
        self.basis.permute()

    def length(self):
        """Number of sentences in the retained subset."""
        return self.basis.length()

    def iterator(self, rejectShortSentences=False):
        """Yield sentences transformed in place by reverse_content_head."""
        base = self.basis.iterator(rejectShortSentences=rejectShortSentences)
        counter = 0
        print("Actual length", self.length())
        for sentence in base:
            reverse_content_head(sentence)
            yield sentence

    def getSentence(self, index):
        """Return the sentence at *index*, transformed by reverse_content_head."""
        return reverse_content_head(self.basis.getSentence(index))
Esempio n. 6
0
def computeDevLoss():
    """Run one full pass over the dev corpus and return
    (average per-word dev loss, per-position surprisal table).

    Python-2 code (print statement). Reads the globals `horizon` and
    `language`, and calls `createStreamContinuous` / `doForwardPass`
    (defined elsewhere in this file); sets the global `printHere` to
    drive periodic progress logging.
    """
    devBatchSize = 512
    global printHere
    #   global counter
    #   global devSurprisalTable
    global horizon
    devLoss = 0.0
    devWords = 0
    #   corpusDev = getNextSentence("dev")
    corpusDev = CorpusIterator_V(
        language, "dev", storeMorph=True).iterator(rejectShortSentences=False)
    stream = createStreamContinuous(corpusDev)

    surprisalTable = [0 for _ in range(horizon)]
    devCounter = 0
    devCounterTimesBatchSize = 0
    while True:
        #     try:
        #        input_indices, wordStartIndices = next(stream)
        try:
            # Gather up to devBatchSize examples from the stream.
            input_indices_list = []
            wordStartIndices_list = []
            for _ in range(devBatchSize):
                input_indices, wordStartIndices = next(stream)
                input_indices_list.append(input_indices)
                wordStartIndices_list.append(wordStartIndices)
        except StopIteration:
            # Final (possibly partial) batch. On the next iteration the
            # stream raises immediately, the lists stay empty, devBatchSize
            # becomes 0 and the loop exits below.
            devBatchSize = len(input_indices_list)


#        break
        if devBatchSize == 0:
            break
        devCounter += 1
        #     counter += 1
        printHere = (devCounter % 100 == 0)
        _, _, _, newLoss, newWords = doForwardPass(
            input_indices_list,
            wordStartIndices_list,
            surprisalTable=surprisalTable,
            doDropout=False,
            batchSizeHere=devBatchSize)
        devLoss += newLoss
        devWords += newWords
        if printHere:
            print "Dev examples " + str(devCounter)
        devCounterTimesBatchSize += devBatchSize
    # Normalize accumulated surprisals by the number of processed examples.
    devSurprisalTableHere = [
        surp / (devCounterTimesBatchSize) for surp in surprisalTable
    ]
    return devLoss / devWords, devSurprisalTableHere
Esempio n. 7
0
def runOnCorpus():
    """Estimate per-boundary surprisals over the dev corpus with a CKY-style
    chart parser and return the final surprisal table.

    Rebinds the global `chart` -- a MAX_BOUNDARY x MAX_BOUNDARY table of CUDA
    tensors of shape (BATCHSIZE, |nonterminals|), initialised to -inf -- and
    shrinks args.BATCHSIZE in place for the last partial batch. Relies on
    module-level names defined elsewhere: iterator_dense, computeSurprisals,
    itos_setOfNonterminals, sentCount, surprisalTableSums,
    surprisalTableCounts.
    """
    global chart
    # -inf marks chart cells with no derivation found yet.
    chart = [[
        torch.cuda.FloatTensor(
            [[float("-Inf") for _ in itos_setOfNonterminals]
             for _ in range(args.BATCHSIZE)]) for _ in range(args.MAX_BOUNDARY)
    ] for _ in range(args.MAX_BOUNDARY)]

    iterator = iterator_dense(
        CorpusIterator_V(args.language, "dev", shuffleDataSeed=4).iterator())
    chunk = []
    surprisals = [0 for _ in range(args.MAX_BOUNDARY)]
    while True:
        linearized = []
        try:
            for _ in range(args.BATCHSIZE):
                linearized.append(next(iterator))
        except StopIteration:
            if len(linearized) == 0:
                break
            # Shrink the batch (and the chart) to fit the final partial batch.
            args.BATCHSIZE = len(linearized)
            chart = [[
                torch.cuda.FloatTensor(
                    [[float("-Inf") for _ in itos_setOfNonterminals]
                     for _ in range(args.BATCHSIZE)])
                for _ in range(args.MAX_BOUNDARY)
            ] for _ in range(args.MAX_BOUNDARY)]

        # NOTE(review): this first print shows the PREVIOUS iteration's
        # estimates (zeros on the first pass) -- presumably progress logging;
        # confirm it is intentional.
        print(
            sentCount, [
                surprisals[i + 1] - surprisals[i]
                for i in range(args.MAX_BOUNDARY - 1)
            ]
        )  # [surprisalTableSums[0]/surprisalTableCounts[-1]] + [(surprisalTableSums[i+1]-surprisalTableSums[i])/surprisalTableCounts[-1] for i in range(MAX_BOUNDARY-1)])

        computeSurprisals(linearized)
        # Small epsilon avoids division by zero for empty positions.
        surprisals = [
            surprisalTableSums[i] / (surprisalTableCounts[i] + 1e-9)
            for i in range(args.MAX_BOUNDARY)
        ]
        print(
            sentCount, [
                surprisals[i + 1] - surprisals[i]
                for i in range(args.MAX_BOUNDARY - 1)
            ]
        )  # [surprisalTableSums[0]/surprisalTableCounts[-1]] + [(surprisalTableSums[i+1]-surprisalTableSums[i])/surprisalTableCounts[-1] for i in range(MAX_BOUNDARY-1)])
    return surprisals
Esempio n. 8
0
def initializeOrderTable():
    """Collect per-coarse-dependency ordering and distance statistics from
    the train and dev partitions.

    For every coarse dependency label, counts how often the dependent
    precedes ("DH") or follows ("HD") its head and accumulates
    head--dependent distances.

    Returns:
        (dhLogits, vocab, keys, depsVocab) where dhLogits[dep] is the
        add-one-smoothed log-odds of dependent-before-head order.

    Side effects: annotates each token with line["coarse_dep"] and mutates
    the module-level containers posFine, posUni and originalDistanceWeights
    (defined elsewhere in this file).

    Fixes vs. previous revision: removed unreachable dead code after the
    return statement and the unused posHere/posHead locals.
    """
    orderTable = {}
    keys = set()
    vocab = {}
    distanceSum = {}
    distanceCounts = {}
    depsVocab = set()
    for partition in ["train", "dev"]:
        for sentence in CorpusIterator_V(language,
                                         partition,
                                         shuffleDataSeed=40).iterator():
            for line in sentence:
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                line["coarse_dep"] = makeCoarse(line["dep"])
                depsVocab.add(line["coarse_dep"])
                posFine.add(line["posFine"])
                posUni.add(line["posUni"])

                # The root has no head, so it contributes no ordering stats.
                if line["coarse_dep"] == "root":
                    continue
                dep = line["coarse_dep"]
                # "DH": dependent before head; "HD": head before dependent.
                direction = "HD" if line["head"] < line["index"] else "DH"
                key = dep
                keyWithDir = (dep, direction)
                orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
                keys.add(key)
                distanceCounts[key] = distanceCounts.get(key, 0.0) + 1.0
                distanceSum[key] = distanceSum.get(
                    key, 0.0) + abs(line["index"] - line["head"])
    dhLogits = {}
    for key in keys:
        # Add-one smoothing keeps log() finite for unseen directions.
        hd = orderTable.get((key, "HD"), 0) + 1.0
        dh = orderTable.get((key, "DH"), 0) + 1.0
        dhLogit = log(dh) - log(hd)
        dhLogits[key] = dhLogit
        originalDistanceWeights[key] = (distanceSum[key] / distanceCounts[key])
    return dhLogits, vocab, keys, depsVocab


# Standard-normal priors over the per-dependency direction (DH) and distance
# weights; one component per entry of itos_deps (defined elsewhere).
dhWeights_Prior = Normal(Variable(torch.FloatTensor([0.0] * len(itos_deps))),
                         Variable(torch.FloatTensor([1.0] * len(itos_deps))))
distanceWeights_Prior = Normal(
    Variable(torch.FloatTensor([0.0] * len(itos_deps))),
    Variable(torch.FloatTensor([1.0] * len(itos_deps))))

# Global step counter, incremented by the training loop elsewhere.
counter = 0
corpus = CorpusIterator_V(language, "train")


def guide(corpus):
    mu_DH = pyro.param(
        "mu_DH",
        Variable(torch.FloatTensor([0.0] * len(itos_deps)),
                 requires_grad=True))
    mu_Dist = pyro.param(
        "mu_Dist",
        Variable(torch.FloatTensor([0.0] * len(itos_deps)),
                 requires_grad=True))

    sigma_DH = pyro.param(
        "sigma_DH",
        Variable(torch.FloatTensor([1.0] * len(itos_deps)),
Esempio n. 10
0
# NOTE(review): LogSoftmax() without dim= relies on torch's old implicit
# default and is deprecated in modern releases -- confirm the intended axis.
logsoftmax = torch.nn.LogSoftmax()

def deepCopy(sentence):
    """Return a one-level copy of *sentence*: a new list of new dicts.

    The dicts are fresh objects but their values are shared with the
    originals (this is not a recursive deep copy).
    """
    return [{key: value for key, value in token.items()} for token in sentence]
# Standard-normal priors over the per-dependency direction (DH) and distance
# weights; one component per entry of itos_deps (defined elsewhere).
dhWeights_Prior = Normal(Variable(torch.FloatTensor([0.0] * len(itos_deps))), Variable(torch.FloatTensor([1.0]* len(itos_deps))))
distanceWeights_Prior = Normal(Variable(torch.FloatTensor([0.0] * len(itos_deps))), Variable(torch.FloatTensor([1.0]* len(itos_deps))))

# Global step counter, incremented by the training loop elsewhere.
counter = 0
corpus = CorpusIterator_V(args.language,"train")

def guide(corpus):
  """Pyro variational guide: a mean-field Gaussian posterior over the
  per-dependency direction and distance weights sampled in model().

  The pyro.param / pyro.sample names must match those in model(); the call
  order matters to Pyro's trace, so the statements below are sequential on
  purpose.
  """
  mu_DH = pyro.param("mu_DH", Variable(torch.FloatTensor([0.0]*len(itos_deps)), requires_grad=True))
  mu_Dist = pyro.param("mu_Dist", Variable(torch.FloatTensor([0.0]*len(itos_deps)), requires_grad=True))

  sigma_DH = pyro.param("sigma_DH", Variable(torch.FloatTensor([1.0]*len(itos_deps)), requires_grad=True))
  sigma_Dist = pyro.param("sigma_Dist", Variable(torch.FloatTensor([1.0]*len(itos_deps)), requires_grad=True))

  dhWeights = pyro.sample("dhWeights", dist.Normal(mu_DH, sigma_DH)) #Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True)
  distanceWeights = pyro.sample("distanceWeights", dist.Normal(mu_Dist, sigma_Dist)) #Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True)

def model(corpus):
  """Pyro model: draw direction and distance weights from the fixed
  standard-normal priors defined above.

  NOTE(review): in the full file this function presumably continues with the
  observed-data likelihood; only the prior sampling is visible in this
  excerpt.
  """
  global counter
  dhWeights = pyro.sample("dhWeights", dhWeights_Prior) #Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True)
  distanceWeights = pyro.sample("distanceWeights", distanceWeights_Prior) #Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True)
Esempio n. 11
0
          assert totalDepLength == 0
       numberOfWords = wordNum
       return (totalDepLength, numberOfWords, byType)



assert batchSize == 1

depLengths = []
#while True:
outpath = "/u/scr/mhahn/japanese/"+str(myID)
with open(outpath, "w") as outFile:
 print >> outFile, "\t".join(["Sent", "Length"])
 counter = 0
 if True:
   corpus = CorpusIterator_V(language,"train", shuffleDataSeed=40)
   corpusIterator = corpus.iterator()
   if corpus.length() == 0:
      quit()
   while True:
     try:
        batch = map(lambda x:next(corpusIterator), 10*range(batchSize))
     except StopIteration:
        break
     batch = sorted(batch, key=len)
     partitions = range(10)
     
     for partition in partitions:
        counter += 1
        printHere = (counter % 100 == 0)
        current = batch[partition*batchSize:(partition+1)*batchSize]
Esempio n. 12
0
     assert tree["category"] in leftCornerCounts
     return leftCorner


def linearizeTree2String(tree, sent):
    """Append the words of *tree*'s leaves to *sent*, left to right.

    A node is a leaf iff tree["children"] is None, in which case its
    "word" entry is emitted; otherwise its children are visited in order.
    """
    children = tree["children"]
    if children is None:
        sent.append(tree["word"])
        return
    for child in children:
        linearizeTree2String(child, sent)


sentCount = 0

# Count root categories over ordered (linearized) training sentences,
# accumulating into the module-level dict `roots` and counter `rootsTotal`.
print("Collecting counts from training corpus")
for sentence in CorpusIterator_V(args.language,"train", shuffleDataSeed=4).iterator():
   sentCount += 1
   # Second argument enables verbose logging on every 400th sentence.
   ordered = orderSentence(sentence,  sentCount % 400 == 0)


   linearized = []
   linearizeTree2String(ordered, linearized)
#   if len(linearized) > 10:
 #     continue

#   print(ordered)
   roots[ordered["category"]] = roots.get(ordered["category"], 0) + 1
   rootsTotal = rootsTotal + 1

   if sentCount % 100 == 0:
      print(sentCount, ordered["category"])
Esempio n. 13
0
    else:
        assert totalDepLength == 0
    numberOfWords = wordNum
    return (totalDepLength, numberOfWords, byType)


assert batchSize == 1

depLengths = []
#while True:
outpath = "/u/scr/mhahn/japanese/" + str(myID)
with open(outpath, "w") as outFile:
    print >> outFile, "\t".join(["Sent", "Length"])
    counter = 0
    if True:
        corpus = CorpusIterator_V(language, "train", shuffleDataSeed=40)
        corpusIterator = corpus.iterator()
        if corpus.length() == 0:
            quit()
        while True:
            try:
                batch = map(lambda x: next(corpusIterator),
                            10 * range(batchSize))
            except StopIteration:
                break
            batch = sorted(batch, key=len)
            partitions = range(10)

            for partition in partitions:
                counter += 1
                printHere = (counter % 100 == 0)
     assert tree["category"] in leftCornerCounts
     return leftCorner


def linearizeTree2String(tree, sent):
    """Collect the leaf words of *tree* into *sent* in left-to-right order."""
    if tree["children"] is not None:
        for subtree in tree["children"]:
            linearizeTree2String(subtree, sent)
    else:
        # Leaf node: contribute its word.
        sent.append(tree["word"])


sentCount = 0

# Count root categories over ordered (linearized) training sentences,
# accumulating into the module-level dict `roots` and counter `rootsTotal`.
print("Collecting counts from training corpus")
for sentence in CorpusIterator_V(args.language,"train", ignoreCorporaWithoutWords=True).iterator():
   sentCount += 1
   # Second argument enables verbose logging on every 400th sentence.
   ordered = orderSentence(sentence,  sentCount % 400 == 0)


   linearized = []
   linearizeTree2String(ordered, linearized)
#   if len(linearized) > 10:
 #     continue

#   print(ordered)
   roots[ordered["category"]] = roots.get(ordered["category"], 0) + 1
   rootsTotal = rootsTotal + 1

   if sentCount % 100 == 0:
      print(sentCount, ordered["category"])
# NOTE(review): LogSoftmax() without dim= relies on torch's old implicit
# default and is deprecated in modern releases -- confirm the intended axis.
logsoftmax = torch.nn.LogSoftmax()

def deepCopy(sentence):
    """Return a one-level copy of *sentence*: a new list of new dicts.

    Dict values themselves are shared with the originals (not recursive).
    """
    copied = []
    for token in sentence:
        copied.append(dict(token.items()))
    return copied

# Disabled Pyro prior/guide machinery from an earlier variational version of
# this script; kept for reference.
#dhWeights_Prior = Normal(, Variable(torch.FloatTensor([1.0]* len(itos_deps))))
#distanceWeights_Prior = Normal(Variable(torch.FloatTensor([0.0] * len(itos_deps))), Variable(torch.FloatTensor([1.0]* len(itos_deps))))

# Global step counter, incremented by the training loop elsewhere.
counter = 0
corpus = CorpusIterator_V(args.language,"train")

#def guide(corpus):
#  mu_DH = pyro.param("mu_DH", Variable(torch.FloatTensor([0.0]*len(itos_deps)), requires_grad=True))
#  mu_Dist = pyro.param("mu_Dist", Variable(torch.FloatTensor([0.0]*len(itos_deps)), requires_grad=True))
#
#  sigma_DH = pyro.param("sigma_DH", Variable(torch.FloatTensor([1.0]*len(itos_deps)), requires_grad=True))
#  sigma_Dist = pyro.param("sigma_Dist", Variable(torch.FloatTensor([1.0]*len(itos_deps)), requires_grad=True))
#
#  dhWeights = pyro.sample("dhWeights", dist.Normal(mu_DH, sigma_DH)) #Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True)
#  distanceWeights = pyro.sample("distanceWeights", dist.Normal(mu_Dist, sigma_Dist)) #Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True)
#

#dhWeights = pyro.sample("dhWeights", dhWeights_Prior) #Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True)
#distanceWeights = pyro.sample("distanceWeights", distanceWeights_Prior) #Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True)
# Non-variational replacement: one fixed zero weight vector per document.
dhWeights = Variable(torch.FloatTensor([0.0] * len(itos_deps) * len(docs)).view(len(docs), len(itos_deps)))
Esempio n. 16
0
# CoNLL-U-style column names for one token line.
header = [
    "index", "word", "lemma", "posUni", "posFine", "morph", "head", "dep", "_",
    "_"
]

from corpusIterator_V import CorpusIterator_V

# Mean head--dependent distance per key; filled during initialization.
originalDistanceWeights = {}

# All morphological key=value pairs observed in the corpus.
morphKeyValuePairs = set()

# Lemma frequency counts.
vocab_lemmas = {}

corpusTrain = CorpusIterator_V(
    args.language, "train",
    storeMorph=True).iterator(rejectShortSentences=False)
pairs = set()
counter = 0
data = []
for sentence in corpusTrain:
    #    print(len(sentence))
    verb = []
    for line in sentence[::-1]:
        #       print(line)
        if line["posUni"] == "PUNCT":
            continue
        verb.append(line)
        if line["posUni"] == "VERB":
            verb = verb[::-1]
            #          print(verb)
        assert tree["category"] in leftCornerCounts
        return leftCorner


def linearizeTree2String(tree, sent):
    """Append the words of *tree*'s leaves to *sent*, left to right."""
    kids = tree["children"]
    if kids is None:
        # Leaf: emit its word.
        sent.append(tree["word"])
    else:
        for kid in kids:
            linearizeTree2String(kid, sent)


sentCount = 0

# Count root categories over ordered (linearized) training sentences,
# accumulating into the module-level dict `roots` (including a running
# "__TOTAL__" entry).
print("Collecting counts from training corpus")
for sentence in CorpusIterator_V(language, "train").iterator():
    sentCount += 1
    # Second argument enables verbose logging on every 50th sentence.
    ordered = orderSentence(sentence, sentCount % 50 == 0)

    linearized = []
    linearizeTree2String(ordered, linearized)
    #   if len(linearized) > 10:
    #     continue

    #   print(ordered)
    roots[ordered["category"]] = roots.get(ordered["category"], 0) + 1
    roots["__TOTAL__"] = roots.get("__TOTAL__", 0) + 1

    if sentCount % 100 == 0:
        print(sentCount, ordered["category"])
    # update inStackDistribution
Esempio n. 18
0
            surprisalTable=surprisalTable,
            doDropout=False,
            batchSizeHere=devBatchSize)
        devLoss += newLoss
        devWords += newWords
        if printHere:
            print "Dev examples " + str(devCounter)
        devCounterTimesBatchSize += devBatchSize
    devSurprisalTableHere = [
        surp / (devCounterTimesBatchSize) for surp in surprisalTable
    ]
    return devLoss / devWords, devSurprisalTableHere

DEV_PERIOD = 5000
epochCount = 0
corpusBase = CorpusIterator_V(language, storeMorph=True)
while failedDevRuns == 0:
    epochCount += 1
    print "Starting new epoch, permuting corpus"
    corpusBase.permute()
    #  corpus = getNextSentence("train")
    corpus = corpusBase.iterator(rejectShortSentences=False)
    stream = createStream(corpus)

    if counter > 5:
        #       if counter % DEV_PERIOD == 0:
        newDevLoss, devSurprisalTableHere = computeDevLoss()
        #             devLosses.append(
        devLosses.append(newDevLoss)
        print "New dev loss " + str(newDevLoss) + ". previous was: " + str(
            lastDevLoss)