def bicorpus_sentences_and_alignments(l1, l2, f1, f2, falign):
    """
    Given languages l1 and l2 and their bicorpus filenames f1, f2, and falign,
    yield tuples (ws1, ws2, links),
    where ws1 are the word ids in the sentence from f1,
    where ws2 are the word ids in the sentence from f2,
    and links is a list of (i1, i2) word indexes that are linked.

    Sentence pairs in which either side is empty are skipped (and counted).
    Raises AssertionError at the end if the three files do not contain the
    same number of lines.
    """
    from w2w.vocabulary import wordmap

    i = 0
    emptycnt = 0
    logging.info("Reading %s,%s sentences and alignments from %s, %s, %s" %
                 (l1, l2, f1, f2, falign))
    fil1, fil2, filalign = open(f1), open(f2), open(falign)
    try:
        for (s1, s2, salign) in itertools.izip(fil1, fil2, filalign):
            i += 1
            if i % 100000 == 0:
                # Periodic progress logging for large corpora.
                logging.info("\tRead line %d of %s, %s, %s..." %
                             (i, f1, f2, falign))
                logging.info("\tEmpty sentences are %s..." %
                             (percent(emptycnt, i)))
                logging.info("\t%s" % stats())

            # Map each token to a (language, wordform) pair, then to its id.
            ws1 = [wordmap().id((l1, w1)) for w1 in s1.split()]
            ws2 = [wordmap().id((l2, w2)) for w2 in s2.split()]

            if len(ws1) == 0 or len(ws2) == 0:
                emptycnt += 1
                continue

            # Alignment lines look like "0-1 1-0 ...": pairs of word indexes.
            links = [(int(i1), int(i2)) for i1, i2 in
                     (link.split("-") for link in salign.split())]

            yield ws1, ws2, links

        # Make sure all iterators are exhausted: if any file still has lines
        # left, the corpora are not parallel.
        alldone = 0
        for fil in (fil1, fil2, filalign):
            try:
                next(fil)
            except StopIteration:
                alldone += 1
        assert alldone == 3
    finally:
        # Always release the file handles, even if the consumer of this
        # generator aborts early or an exception is raised mid-iteration.
        fil1.close()
        fil2.close()
        filalign.close()

    logging.info("DONE. Read line %d of %s, %s, %s..." % (i, f1, f2, falign))
    logging.info("Empty sentences are %s..." % (percent(emptycnt, i)))
    logging.info(stats())
def get_training_biexample(l1, l2, f1, f2, falign):
    """
    Generator of bilingual training examples from this bicorpus.

    For every aligned word pair (w1, w2), yields a BilingualExample holding
    the WINDOW-sized context of w1 in its sentence, unless the pair is
    filtered out (unknown words, the focus-lemma filter, or translation-map
    constraints).
    """
    import common.hyperparameters
    HYPERPARAMETERS = common.hyperparameters.read("language-model")
    WINDOW = HYPERPARAMETERS["WINDOW_SIZE"]

    for ws1, ws2, links in bicorpus_sentences_and_alignments(l1, l2, f1, f2, falign):
        for i1, i2 in links:
            w1 = ws1[i1]
            w2 = ws2[i2]

            l2new = language(w2)
            assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"]
            # Skip translations to unknown words
            if wordform(w2) == "*UNKNOWN*": continue
            assert l2new == l2

            # Skip translations from unknown words
            if wordform(w1) == "*UNKNOWN*": continue

            # If we are filtering examples by lemma
            if not (HYPERPARAMETERS["W2W FOCUS LEMMAS"] is None or len(HYPERPARAMETERS["W2W FOCUS LEMMAS"]) == 0):
                assert language(w1) == "en"
                # NOTE(review): this filters on the raw wordform, not the
                # lemma, despite the hyperparameter's name -- the lemmatizer
                # call was previously disabled here.
                if wordform(w1) not in HYPERPARAMETERS["W2W FOCUS LEMMAS"]:
                    logging.debug("Focus word %s not in our list of focus lemmas" % repr(wordmap().str(w1)))
                    continue

            if w1 not in targetmap():
                logging.warning("No translations for word %s, skipping" % repr(wordmap().str(w1)))
                continue

            if l2new not in targetmap()[w1]:
                logging.warning("Word %s has no translations for language %s, skipping" % (repr(wordmap().str(w1)), l2new))
                continue

            if w2 not in targetmap()[w1][l2new]:
                logging.error("Word %s cannot translate to word %s, skipping" % (repr(wordmap().str(w1)), repr(wordmap().str(w2))))
                continue

            if len(targetmap()[w1][l2new]) == 1:
                # With only one possible translation there is nothing to
                # disambiguate, so this example is useless for training.
                logging.debug("Word %s has only one translation in language %s, skipping" % (repr(wordmap().str(w1)), l2new))
                continue

            # Extract the window of tokens around index i1. Pad with
            # *LBOUNDARY* and *RBOUNDARY* as necessary. // keeps the
            # arithmetic integral under both Python 2 and Python 3, and
            # lo/hi avoid shadowing the min/max builtins.
            lo = i1 - (WINDOW-1)//2
            hi = i1 + (WINDOW-1)//2
            lpad = 0
            rpad = 0
            if lo < 0:
                lpad = -lo
                lo = 0
            if hi >= len(ws1):
                rpad = hi - (len(ws1)-1)
                hi = len(ws1)-1
            assert lpad + (hi - lo + 1) + rpad == WINDOW

            seq = [wordmap().id((None, "*LBOUNDARY*"))]*lpad + ws1[lo:hi+1] + [wordmap().id((None, "*RBOUNDARY*"))]*rpad
            assert len(seq) == WINDOW

            # The focus word must sit exactly in the middle of the window.
            assert seq[(WINDOW-1)//2] == w1
            yield BilingualExample(l1, seq, w1, w2)
 def __str__(self):
     # Debug representation: the target word entry, the source language,
     # the source wordform, and the wordforms of the context window.
     # NOTE(review): Python 2 backtick-repr syntax; invalid under Python 3.
     # NOTE(review): this method fragment is mis-indented (1 space) and its
     # enclosing class header is not visible in this chunk -- verify against
     # the original file.
     return "%s" % `(wordmap().str(self.w2), self.l1, wordform(self.w1), [wordmap().str(w)[1] for w in self.l1seq])`
        assert len(vals[1:]) == HYPERPARAMETERS["EMBEDDING_SIZE"]
        tot += 1
        if tot % 10000 == 0:
            print >> sys.stderr, "\tRead %d lines from %s" % (tot, HYPERPARAMETERS["W2W INITIAL EMBEDDINGS"])
        if word in original_embeddings:
#            print >> sys.stderr, "Skipping word %s (originally %s), we already have an embedding for it" % (word, vals[0])
            continue
        else:
            original_embeddings[word] = numpy.array([float(v) for v in vals[1:]])
    print >> sys.stderr, "...done reading embeddings from %s" % HYPERPARAMETERS["W2W INITIAL EMBEDDINGS"]
    print >> sys.stderr, "Skipped %s words for which we had duplicate embeddings" % percent(tot-len(original_embeddings), tot)
    print >> sys.stderr, stats()

    reversemap = targetmap(name="reverse")

    embeddings = numpy.zeros((wordmap().len, HYPERPARAMETERS["EMBEDDING_SIZE"]))
    assert embeddings.shape == (wordmap().len, HYPERPARAMETERS["EMBEDDING_SIZE"])

    ELANG = HYPERPARAMETERS["W2W INITIAL EMBEDDINGS LANGUAGE"]
    for w in range(wordmap().len):
        embedding = None
        # If this word is in a different language than the embeddings.
        if language(w) != HYPERPARAMETERS["W2W INITIAL EMBEDDINGS LANGUAGE"]:
            if w not in reversemap:
                print >> sys.stderr, "Word %s is not even in target map! Using *UNKNOWN*" % `wordmap().str(w)`
                embedding = original_embeddings["*UNKNOWN*"]
            elif ELANG not in reversemap[w]:
                print >> sys.stderr, "Have no %s translations for word %s, only have %s, using *UNKNOWN*" % (ELANG, wordmap().str(w), reversemap[w].keys())
                embedding = original_embeddings["*UNKNOWN*"]
            else:
                # Mix the target word embedding over the weighted translation into the source language
Exemple #5
0
        if word in original_embeddings:
            #            print >> sys.stderr, "Skipping word %s (originally %s), we already have an embedding for it" % (word, vals[0])
            continue
        else:
            original_embeddings[word] = numpy.array(
                [float(v) for v in vals[1:]])
    print >> sys.stderr, "...done reading embeddings from %s" % HYPERPARAMETERS[
        "W2W INITIAL EMBEDDINGS"]
    print >> sys.stderr, "Skipped %s words for which we had duplicate embeddings" % percent(
        tot - len(original_embeddings), tot)
    print >> sys.stderr, stats()

    reversemap = targetmap(name="reverse")

    embeddings = numpy.zeros(
        (wordmap().len, HYPERPARAMETERS["EMBEDDING_SIZE"]))
    assert embeddings.shape == (wordmap().len,
                                HYPERPARAMETERS["EMBEDDING_SIZE"])

    ELANG = HYPERPARAMETERS["W2W INITIAL EMBEDDINGS LANGUAGE"]
    for w in range(wordmap().len):
        embedding = None
        # If this word is in a different language than the embeddings.
        if language(w) != HYPERPARAMETERS["W2W INITIAL EMBEDDINGS LANGUAGE"]:
            if w not in reversemap:
                print >> sys.stderr, "Word %s is not even in target map! Using *UNKNOWN*" % ` wordmap(
                ).str(w) `
                embedding = original_embeddings["*UNKNOWN*"]
            elif ELANG not in reversemap[w]:
                print >> sys.stderr, "Have no %s translations for word %s, only have %s, using *UNKNOWN*" % (
                    ELANG, wordmap().str(w), reversemap[w].keys())
def get_training_biexample(l1, l2, f1, f2, falign):
    """
    Generator of bilingual training examples from this bicorpus.

    For every aligned word pair (w1, w2), yields a BilingualExample holding
    the WINDOW-sized context of w1 in its sentence, unless the pair is
    filtered out (unknown words, the focus-lemma filter, or translation-map
    constraints).
    """
    import common.hyperparameters
    HYPERPARAMETERS = common.hyperparameters.read("language-model")
    WINDOW = HYPERPARAMETERS["WINDOW_SIZE"]

    for ws1, ws2, links in bicorpus_sentences_and_alignments(
            l1, l2, f1, f2, falign):
        for i1, i2 in links:
            w1 = ws1[i1]
            w2 = ws2[i2]

            l2new = language(w2)
            assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"]
            # Skip translations to unknown words
            if wordform(w2) == "*UNKNOWN*": continue
            assert l2new == l2

            # Skip translations from unknown words
            if wordform(w1) == "*UNKNOWN*": continue

            # If we are filtering examples by lemma
            if not (HYPERPARAMETERS["W2W FOCUS LEMMAS"] is None
                    or len(HYPERPARAMETERS["W2W FOCUS LEMMAS"]) == 0):
                assert language(w1) == "en"
                # NOTE(review): this filters on the raw wordform, not the
                # lemma, despite the hyperparameter's name -- the lemmatizer
                # call was previously disabled here.
                if wordform(w1) not in HYPERPARAMETERS["W2W FOCUS LEMMAS"]:
                    logging.debug(
                        "Focus word %s not in our list of focus lemmas" %
                        repr(wordmap().str(w1)))
                    continue

            if w1 not in targetmap():
                logging.warning("No translations for word %s, skipping" %
                                repr(wordmap().str(w1)))
                continue

            if l2new not in targetmap()[w1]:
                logging.warning(
                    "Word %s has no translations for language %s, skipping" %
                    (repr(wordmap().str(w1)), l2new))
                continue

            if w2 not in targetmap()[w1][l2new]:
                logging.error("Word %s cannot translate to word %s, skipping" %
                              (repr(wordmap().str(w1)), repr(wordmap().str(w2))))
                continue

            if len(targetmap()[w1][l2new]) == 1:
                # With only one possible translation there is nothing to
                # disambiguate, so this example is useless for training.
                logging.debug(
                    "Word %s has only one translation in language %s, skipping"
                    % (repr(wordmap().str(w1)), l2new))
                continue

            # Extract the window of tokens around index i1. Pad with
            # *LBOUNDARY* and *RBOUNDARY* as necessary. // keeps the
            # arithmetic integral under both Python 2 and Python 3, and
            # lo/hi avoid shadowing the min/max builtins.
            lo = i1 - (WINDOW - 1) // 2
            hi = i1 + (WINDOW - 1) // 2
            lpad = 0
            rpad = 0
            if lo < 0:
                lpad = -lo
                lo = 0
            if hi >= len(ws1):
                rpad = hi - (len(ws1) - 1)
                hi = len(ws1) - 1
            assert lpad + (hi - lo + 1) + rpad == WINDOW

            seq = ([wordmap().id((None, "*LBOUNDARY*"))] * lpad
                   + ws1[lo:hi + 1]
                   + [wordmap().id((None, "*RBOUNDARY*"))] * rpad)
            assert len(seq) == WINDOW

            # The focus word must sit exactly in the middle of the window.
            assert seq[(WINDOW - 1) // 2] == w1
            yield BilingualExample(l1, seq, w1, w2)
 def __str__(self):
     # Debug representation: the target word entry, the source language,
     # the source wordform, and the wordforms of the context window.
     # NOTE(review): Python 2 backtick-repr syntax; invalid under Python 3.
     # NOTE(review): this method fragment is mis-indented (1 space) and its
     # enclosing class header is not visible in this chunk -- verify against
     # the original file.
     return "%s" % ` (wordmap().str(self.w2), self.l1, wordform(
         self.w1), [wordmap().str(w)[1] for w in self.l1seq]) `