def corrupt(self):
        """
        Return (notw2, weight): a corrupt target word and its example weight.

        notw2 is drawn uniformly at random from the candidate targets
        targetmap()[self.w1][self.l2], redrawing until it differs from
        self.w2.  weight is 1. when HYPERPARAMETERS["UNIFORM EXAMPLE WEIGHTS"]
        is set, otherwise the inverse of the per-draw probability
        1/len(candidates).

        Note: This will return a different random value every call.

        Raises AssertionError if NGRAM_FOR_TRAINING_NOISE is anything other
        than 0; only 0-gram (uniform) noise is implemented.
        """
        from hyperparameters import HYPERPARAMETERS
        import random
        possible_targets = targetmap()[self.w1][self.l2]
        # A corrupt word only exists if w2 has at least one alternative.
        assert len(possible_targets) > 1
        assert self.w2 in possible_targets

        # Reject unsupported settings with an explicit raise instead of
        # `assert 0`: asserts are stripped under `python -O`, which would let
        # the loop fall through to the backoff below and leave `pr` unbound.
        # Unigram noise (weighted sampling from noise.indexed_weights()) was
        # sketched here at some point but never enabled.
        ngram = HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"]
        if ngram != 0:
            raise AssertionError(
                "NGRAM_FOR_TRAINING_NOISE=%r is not supported, only 0 is"
                % (ngram,))

        # Probability of any single uniform draw; constant across retries.
        pr = 1. / len(possible_targets)

        notw2 = self.w2
        cnt = 0
        while self.w2 == notw2:
            notw2 = random.choice(possible_targets)
            cnt += 1
            # Backoff to 0gram smoothing if we fail 10 times to get noise.
            # With only 0-gram noise available this is one more uniform draw.
            if cnt > 10:
                notw2 = random.choice(possible_targets)

        if HYPERPARAMETERS["UNIFORM EXAMPLE WEIGHTS"]:
            weight = 1.
        else:
            weight = 1. / pr
        return notw2, weight
    def corrupt(self):
        """
        Return (notw2, weight): a corrupt target word and its example weight.

        notw2 is drawn uniformly at random from the candidate targets
        targetmap()[self.w1][self.l2], redrawing until it differs from
        self.w2.  weight is 1. when HYPERPARAMETERS["UNIFORM EXAMPLE WEIGHTS"]
        is set, otherwise the inverse of the per-draw probability
        1/len(candidates).

        Note: This will return a different random value every call.

        Raises AssertionError if NGRAM_FOR_TRAINING_NOISE is anything other
        than 0; only 0-gram (uniform) noise is implemented.
        """
        from hyperparameters import HYPERPARAMETERS
        import random
        possible_targets = targetmap()[self.w1][self.l2]
        # A corrupt word only exists if w2 has at least one alternative.
        assert len(possible_targets) > 1
        assert self.w2 in possible_targets

        # Reject unsupported settings with an explicit raise instead of
        # `assert 0`: asserts are stripped under `python -O`, which would let
        # the loop fall through to the backoff below and leave `pr` unbound.
        # Unigram noise (weighted sampling from noise.indexed_weights()) was
        # sketched here at some point but never enabled.
        ngram = HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"]
        if ngram != 0:
            raise AssertionError(
                "NGRAM_FOR_TRAINING_NOISE=%r is not supported, only 0 is"
                % (ngram,))

        # Probability of any single uniform draw; constant across retries.
        pr = 1. / len(possible_targets)

        notw2 = self.w2
        cnt = 0
        while self.w2 == notw2:
            notw2 = random.choice(possible_targets)
            cnt += 1
            # Backoff to 0gram smoothing if we fail 10 times to get noise.
            # With only 0-gram noise available this is one more uniform draw.
            if cnt > 10:
                notw2 = random.choice(possible_targets)

        if HYPERPARAMETERS["UNIFORM EXAMPLE WEIGHTS"]:
            weight = 1.
        else:
            weight = 1. / pr
        return notw2, weight
def get_training_biexample(l1, l2, f1, f2, falign):
    """
    Generator of bilingual training examples from this bicorpus.

    Walks every aligned word pair (link) of every sentence pair produced by
    bicorpus_sentences_and_alignments(l1, l2, f1, f2, falign) and yields
    BilingualExample(l1, seq, w1, w2), where seq is the WINDOW_SIZE-token
    window of the source sentence centred on w1, padded with *LBOUNDARY* /
    *RBOUNDARY* ids where the window overruns the sentence.

    A link is skipped when:
      - either word has the wordform *UNKNOWN*;
      - "W2W FOCUS LEMMAS" is non-empty and wordform(w1) is not in it;
      - targetmap() has no entry for w1, no entry for w1 in the language of
        w2, or does not list w2 among the translations of w1;
      - w1 has exactly one translation in the language of w2.

    The arguments are forwarded unchanged to
    bicorpus_sentences_and_alignments.
    NOTE(review): presumably l1/l2 are language codes and f1/f2/falign the
    two corpus sides and their alignment -- confirm against that helper.

    WINDOW_SIZE must be odd; an even value fails the window-size assertion.
    """
    import common.hyperparameters
    HYPERPARAMETERS = common.hyperparameters.read("language-model")
    WINDOW = HYPERPARAMETERS["WINDOW_SIZE"]
    # Tokens kept on each side of the focus word.  Floor division keeps this
    # usable as an index even where `/` means true division.
    half = (WINDOW - 1) // 2

    for ws1, ws2, links in bicorpus_sentences_and_alignments(l1, l2, f1, f2, falign):
        for i1, i2 in links:
            w1 = ws1[i1]
            w2 = ws2[i2]

            l2new = language(w2)
            assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"]
            # Skip translations to unknown words
            if wordform(w2) == "*UNKNOWN*":
                continue
            assert l2new == l2

            # Skip translations from unknown words
            if wordform(w1) == "*UNKNOWN*":
                continue

            # If we are filtering examples by lemma
            focus_lemmas = HYPERPARAMETERS["W2W FOCUS LEMMAS"]
            if focus_lemmas is not None and len(focus_lemmas) != 0:
                assert language(w1) == "en"
                # The raw wordform is matched; no lemmatizer is applied.
                if wordform(w1) not in focus_lemmas:
                    logging.debug("Focus word %s not in our list of focus lemmas",
                                  repr(wordmap().str(w1)))
                    continue

            # Fetch the target map once per link rather than once per check.
            translations = targetmap()
            if w1 not in translations:
                logging.warning("No translations for word %s, skipping",
                                repr(wordmap().str(w1)))
                continue

            by_language = translations[w1]
            if l2new not in by_language:
                logging.warning("Word %s has no translations for language %s, skipping",
                                repr(wordmap().str(w1)), l2new)
                continue

            candidates = by_language[l2new]
            if w2 not in candidates:
                logging.error("Word %s cannot translate to word %s, skipping",
                              repr(wordmap().str(w1)), repr(wordmap().str(w2)))
                continue

            if len(candidates) == 1:
                logging.debug("Word %s has only one translation in language %s, skipping",
                              repr(wordmap().str(w1)), l2new)
                continue

            # Extract the window of tokens around index i1. Pad with *LBOUNDARY* and *RBOUNDARY* as necessary.
            # (lo/hi rather than min/max, so the builtins are not shadowed.)
            lo = i1 - half
            hi = i1 + half
            lpad = 0
            rpad = 0
            if lo < 0:
                lpad = -lo
                lo = 0
            if hi >= len(ws1):
                rpad = hi - (len(ws1) - 1)
                hi = len(ws1) - 1
            assert lpad + (hi - lo + 1) + rpad == WINDOW

            lboundary = wordmap().id((None, "*LBOUNDARY*"))
            rboundary = wordmap().id((None, "*RBOUNDARY*"))
            seq = [lboundary] * lpad + ws1[lo:hi + 1] + [rboundary] * rpad
            assert len(seq) == WINDOW

            # The focus word must sit exactly in the middle of the window.
            assert seq[half] == w1
            yield BilingualExample(l1, seq, w1, w2)
                print >> sys.stderr, "WEIRD WORD: %s" % word
            word = string.lower(word)
        assert len(vals[1:]) == HYPERPARAMETERS["EMBEDDING_SIZE"]
        tot += 1
        if tot % 10000 == 0:
            print >> sys.stderr, "\tRead %d lines from %s" % (tot, HYPERPARAMETERS["W2W INITIAL EMBEDDINGS"])
        if word in original_embeddings:
#            print >> sys.stderr, "Skipping word %s (originally %s), we already have an embedding for it" % (word, vals[0])
            continue
        else:
            original_embeddings[word] = numpy.array([float(v) for v in vals[1:]])
    print >> sys.stderr, "...done reading embeddings from %s" % HYPERPARAMETERS["W2W INITIAL EMBEDDINGS"]
    print >> sys.stderr, "Skipped %s words for which we had duplicate embeddings" % percent(tot-len(original_embeddings), tot)
    print >> sys.stderr, stats()

    reversemap = targetmap(name="reverse")

    embeddings = numpy.zeros((wordmap().len, HYPERPARAMETERS["EMBEDDING_SIZE"]))
    assert embeddings.shape == (wordmap().len, HYPERPARAMETERS["EMBEDDING_SIZE"])

    ELANG = HYPERPARAMETERS["W2W INITIAL EMBEDDINGS LANGUAGE"]
    for w in range(wordmap().len):
        embedding = None
        # If this word is in a different language than the embeddings.
        if language(w) != HYPERPARAMETERS["W2W INITIAL EMBEDDINGS LANGUAGE"]:
            if w not in reversemap:
                print >> sys.stderr, "Word %s is not even in target map! Using *UNKNOWN*" % `wordmap().str(w)`
                embedding = original_embeddings["*UNKNOWN*"]
            elif ELANG not in reversemap[w]:
                print >> sys.stderr, "Have no %s translations for word %s, only have %s, using *UNKNOWN*" % (ELANG, wordmap().str(w), reversemap[w].keys())
                embedding = original_embeddings["*UNKNOWN*"]
Beispiel #5
0
        if tot % 10000 == 0:
            print >> sys.stderr, "\tRead %d lines from %s" % (
                tot, HYPERPARAMETERS["W2W INITIAL EMBEDDINGS"])
        if word in original_embeddings:
            #            print >> sys.stderr, "Skipping word %s (originally %s), we already have an embedding for it" % (word, vals[0])
            continue
        else:
            original_embeddings[word] = numpy.array(
                [float(v) for v in vals[1:]])
    print >> sys.stderr, "...done reading embeddings from %s" % HYPERPARAMETERS[
        "W2W INITIAL EMBEDDINGS"]
    print >> sys.stderr, "Skipped %s words for which we had duplicate embeddings" % percent(
        tot - len(original_embeddings), tot)
    print >> sys.stderr, stats()

    reversemap = targetmap(name="reverse")

    embeddings = numpy.zeros(
        (wordmap().len, HYPERPARAMETERS["EMBEDDING_SIZE"]))
    assert embeddings.shape == (wordmap().len,
                                HYPERPARAMETERS["EMBEDDING_SIZE"])

    ELANG = HYPERPARAMETERS["W2W INITIAL EMBEDDINGS LANGUAGE"]
    for w in range(wordmap().len):
        embedding = None
        # If this word is in a different language than the embeddings.
        if language(w) != HYPERPARAMETERS["W2W INITIAL EMBEDDINGS LANGUAGE"]:
            if w not in reversemap:
                print >> sys.stderr, "Word %s is not even in target map! Using *UNKNOWN*" % ` wordmap(
                ).str(w) `
                embedding = original_embeddings["*UNKNOWN*"]
def get_training_biexample(l1, l2, f1, f2, falign):
    """
    Generator of bilingual training examples from this bicorpus.

    Walks every aligned word pair (link) of every sentence pair produced by
    bicorpus_sentences_and_alignments(l1, l2, f1, f2, falign) and yields
    BilingualExample(l1, seq, w1, w2), where seq is the WINDOW_SIZE-token
    window of the source sentence centred on w1, padded with *LBOUNDARY* /
    *RBOUNDARY* ids where the window overruns the sentence.

    A link is skipped when:
      - either word has the wordform *UNKNOWN*;
      - "W2W FOCUS LEMMAS" is non-empty and wordform(w1) is not in it;
      - targetmap() has no entry for w1, no entry for w1 in the language of
        w2, or does not list w2 among the translations of w1;
      - w1 has exactly one translation in the language of w2.

    The arguments are forwarded unchanged to
    bicorpus_sentences_and_alignments.
    NOTE(review): presumably l1/l2 are language codes and f1/f2/falign the
    two corpus sides and their alignment -- confirm against that helper.

    WINDOW_SIZE must be odd; an even value fails the window-size assertion.
    """
    import common.hyperparameters
    HYPERPARAMETERS = common.hyperparameters.read("language-model")
    WINDOW = HYPERPARAMETERS["WINDOW_SIZE"]
    # Tokens kept on each side of the focus word.  Floor division keeps this
    # usable as an index even where `/` means true division.
    half = (WINDOW - 1) // 2

    for ws1, ws2, links in bicorpus_sentences_and_alignments(
            l1, l2, f1, f2, falign):
        for i1, i2 in links:
            w1 = ws1[i1]
            w2 = ws2[i2]

            l2new = language(w2)
            assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"]
            # Skip translations to unknown words
            if wordform(w2) == "*UNKNOWN*":
                continue
            assert l2new == l2

            # Skip translations from unknown words
            if wordform(w1) == "*UNKNOWN*":
                continue

            # If we are filtering examples by lemma
            focus_lemmas = HYPERPARAMETERS["W2W FOCUS LEMMAS"]
            if focus_lemmas is not None and len(focus_lemmas) != 0:
                assert language(w1) == "en"
                # The raw wordform is matched; no lemmatizer is applied.
                if wordform(w1) not in focus_lemmas:
                    logging.debug(
                        "Focus word %s not in our list of focus lemmas",
                        repr(wordmap().str(w1)))
                    continue

            # Fetch the target map once per link rather than once per check.
            translations = targetmap()
            if w1 not in translations:
                logging.warning("No translations for word %s, skipping",
                                repr(wordmap().str(w1)))
                continue

            by_language = translations[w1]
            if l2new not in by_language:
                logging.warning(
                    "Word %s has no translations for language %s, skipping",
                    repr(wordmap().str(w1)), l2new)
                continue

            candidates = by_language[l2new]
            if w2 not in candidates:
                logging.error("Word %s cannot translate to word %s, skipping",
                              repr(wordmap().str(w1)),
                              repr(wordmap().str(w2)))
                continue

            if len(candidates) == 1:
                logging.debug(
                    "Word %s has only one translation in language %s, skipping",
                    repr(wordmap().str(w1)), l2new)
                continue

            # Extract the window of tokens around index i1. Pad with *LBOUNDARY* and *RBOUNDARY* as necessary.
            # (lo/hi rather than min/max, so the builtins are not shadowed.)
            lo = i1 - half
            hi = i1 + half
            lpad = 0
            rpad = 0
            if lo < 0:
                lpad = -lo
                lo = 0
            if hi >= len(ws1):
                rpad = hi - (len(ws1) - 1)
                hi = len(ws1) - 1
            assert lpad + (hi - lo + 1) + rpad == WINDOW

            lboundary = wordmap().id((None, "*LBOUNDARY*"))
            rboundary = wordmap().id((None, "*RBOUNDARY*"))
            seq = [lboundary] * lpad + ws1[lo:hi + 1] + [rboundary] * rpad
            assert len(seq) == WINDOW

            # The focus word must sit exactly in the middle of the window.
            assert seq[half] == w1
            yield BilingualExample(l1, seq, w1, w2)