def __init__(self, l1, l1seq, w1):
        """
        Record the source-side context of an example.

        l1    -- source language
        l1seq -- sequence of word IDs in the source language
        w1    -- focus word ID in the source language

        Unless the focus word's wordform is "*UNKNOWN*", asserts that the
        language of w1 equals l1.
        """
        self.l1, self.l1seq, self.w1 = l1, l1seq, w1

        # The "*UNKNOWN*" placeholder is exempt from the language check.
        if wordform(self.w1) == "*UNKNOWN*":
            return
        assert self.l1 == language(self.w1)
    def __init__(self, l1, l1seq, w1):
        """
        Record the source-side context of an example.

        l1    -- source language
        l1seq -- sequence of word IDs in the source language
        w1    -- focus word ID in the source language

        Unless the focus word's wordform is "*UNKNOWN*", asserts that the
        language of w1 equals l1.
        """
        self.l1, self.l1seq, self.w1 = l1, l1seq, w1

        # The "*UNKNOWN*" placeholder is exempt from the language check.
        if wordform(self.w1) == "*UNKNOWN*":
            return
        assert self.l1 == language(self.w1)
def visualize(embeddings, idxs, name, PERPLEXITY=30):
    """
    Plot a t-SNE projection of selected embedding rows to a PNG file.

    embeddings -- array indexed by row; must expose .shape and accept a
                  list of row indices (embeddings[idxs])
    idxs       -- word IDs to plot; each is reduced modulo
                  embeddings.shape[0] before use
    name       -- tag inserted into the output filename
    PERPLEXITY -- forwarded to tsne() as its `perplexity` keyword

    The image is written to
    HYPERPARAMETERS["INITIAL_EMBEDDINGS"] + ".visualize-<name>.png".
    Each point is labelled with wordform() of its (reduced) index.

    An IOError raised anywhere in the t-SNE / render step is logged and
    swallowed so that the caller can continue. Returns None.
    """
    idxs = [w % embeddings.shape[0] for w in idxs]
    titles = [wordform(w) for w in idxs]
    filename = HYPERPARAMETERS["INITIAL_EMBEDDINGS"] + ".visualize-%s.png" % name
    try:
        # Imported lazily so textSNE is only required when plotting.
        from textSNE.calc_tsne import tsne
#       from textSNE.tsne import tsne
        out = tsne(embeddings[idxs], perplexity=PERPLEXITY)
        from textSNE.render import render
        render([(title, point[0], point[1]) for title, point in zip(titles, out)], filename)
    except IOError:
        # The filename must go through a %s placeholder: extra positional
        # arguments without one make the logging module fail to format
        # the record instead of emitting it.
        logging.info("ERROR visualizing %s. Continuing...", filename)
Example #4
0
def visualize(embeddings, idxs, name, PERPLEXITY=30):
    """
    Plot a t-SNE projection of selected embedding rows to a PNG file.

    embeddings -- array indexed by row; must expose .shape and accept a
                  list of row indices (embeddings[idxs])
    idxs       -- word IDs to plot; each is reduced modulo
                  embeddings.shape[0] before use
    name       -- tag inserted into the output filename
    PERPLEXITY -- forwarded to tsne() as its `perplexity` keyword

    The image is written to
    HYPERPARAMETERS["INITIAL_EMBEDDINGS"] + ".visualize-<name>.png".
    Each point is labelled with wordform() of its (reduced) index.

    An IOError raised anywhere in the t-SNE / render step is logged and
    swallowed so that the caller can continue. Returns None.
    """
    idxs = [w % embeddings.shape[0] for w in idxs]
    titles = [wordform(w) for w in idxs]
    filename = HYPERPARAMETERS[
        "INITIAL_EMBEDDINGS"] + ".visualize-%s.png" % name
    try:
        # Imported lazily so textSNE is only required when plotting.
        from textSNE.calc_tsne import tsne
        #       from textSNE.tsne import tsne
        out = tsne(embeddings[idxs], perplexity=PERPLEXITY)
        from textSNE.render import render
        render([(title, point[0], point[1])
                for title, point in zip(titles, out)], filename)
    except IOError:
        # The filename must go through a %s placeholder: extra positional
        # arguments without one make the logging module fail to format
        # the record instead of emitting it.
        logging.info("ERROR visualizing %s. Continuing...", filename)
def get_training_biexample(l1, l2, f1, f2, falign):
    """
    Generator of bilingual training examples from this bicorpus.

    l1, l2, f1, f2 and falign are forwarded unchanged to
    bicorpus_sentences_and_alignments(), which is iterated as
    (ws1, ws2, links) triples; links is iterated as (i1, i2) index pairs
    into ws1 and ws2.

    For every link that survives the filters below, yields
    BilingualExample(l1, seq, w1, w2), where seq is the WINDOW_SIZE-long
    slice of ws1 centred on the focus word w1, padded with the
    *LBOUNDARY* / *RBOUNDARY* word IDs where the window overhangs the
    sentence.

    A link is skipped when:
      - either word has wordform "*UNKNOWN*";
      - "W2W FOCUS LEMMAS" is non-empty and wordform(w1) is not in it;
      - targetmap() has no entry for w1, no entry for the target
        language under w1, or does not list w2 for that language;
      - w1 has exactly one translation in the target language.

    NOTE: the window asserts can only hold for an odd WINDOW_SIZE, since
    the window spans 2 * ((WINDOW_SIZE - 1) // 2) + 1 positions.
    """
    import common.hyperparameters
    HYPERPARAMETERS = common.hyperparameters.read("language-model")
    WINDOW = HYPERPARAMETERS["WINDOW_SIZE"]
    # Floor division keeps the offset an integer on Python 2 and 3 alike,
    # so it stays usable as a slice bound and list index.
    half = (WINDOW - 1) // 2
    focus_lemmas = HYPERPARAMETERS["W2W FOCUS LEMMAS"]

    for ws1, ws2, links in bicorpus_sentences_and_alignments(l1, l2, f1, f2, falign):
        for i1, i2 in links:
            w1 = ws1[i1]
            w2 = ws2[i2]

            l2new = language(w2)
            assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"]
            # Skip translations to unknown words
            if wordform(w2) == "*UNKNOWN*": continue
            assert l2new == l2

            # Skip translations from unknown words
            if wordform(w1) == "*UNKNOWN*": continue

            # If we are filtering examples by lemma
            if focus_lemmas is not None and len(focus_lemmas) != 0:
                assert language(w1) == "en"
                # NOTE: the match is on the raw wordform; a lemmatizer-based
                # variant of this check existed here only as disabled code.
                if wordform(w1) not in focus_lemmas:
                    logging.debug("Focus word %s not in our list of focus lemmas", repr(wordmap().str(w1)))
                    continue

            translations = targetmap()

            if w1 not in translations:
                logging.warning("No translations for word %s, skipping", repr(wordmap().str(w1)))
                continue

            if l2new not in translations[w1]:
                logging.warning("Word %s has no translations for language %s, skipping", repr(wordmap().str(w1)), l2new)
                continue

            if w2 not in translations[w1][l2new]:
                logging.error("Word %s cannot translate to word %s, skipping", repr(wordmap().str(w1)), repr(wordmap().str(w2)))
                continue

            if len(translations[w1][l2new]) == 1:
                logging.debug("Word %s has only one translation in language %s, skipping", repr(wordmap().str(w1)), l2new)
                continue

            # Extract the window of tokens around index i1. Pad with *LBOUNDARY* and *RBOUNDARY* as necessary.
            # (lo / hi rather than min / max, to avoid shadowing the builtins.)
            lo = i1 - half
            hi = i1 + half
            lpad = 0
            rpad = 0
            if lo < 0:
                lpad = -lo
                lo = 0
            if hi >= len(ws1):
                rpad = hi - (len(ws1) - 1)
                hi = len(ws1) - 1
            assert lpad + (hi - lo + 1) + rpad == WINDOW

            seq = [wordmap().id((None, "*LBOUNDARY*"))] * lpad + ws1[lo:hi + 1] + [wordmap().id((None, "*RBOUNDARY*"))] * rpad
            assert len(seq) == WINDOW

            # The focus word must sit exactly in the middle of the window.
            assert seq[half] == w1
            yield BilingualExample(l1, seq, w1, w2)
 def __str__(self):
     """
     Return the repr of the tuple
     (wordmap string of w2, l1, wordform of w1, context list),
     where the context list holds element [1] of the wordmap string of
     every word ID in l1seq.
     """
     # NOTE(review): element [1] is presumably the wordform half of a
     # (language, wordform) pair -- confirm against wordmap().str().
     # repr() replaces the backtick syntax, which Python 3 removed.
     return repr((wordmap().str(self.w2), self.l1, wordform(self.w1), [wordmap().str(w)[1] for w in self.l1seq]))
Example #7
0
    import w2w.corpora
    from w2w.vocabulary import wordmap, language, wordform
    from collections import defaultdict
    from common.mydict import sort as dictsort

    cnt = {}
    reversecnt = {}
    for l1, l2, f1, f2, falign in w2w.corpora.bicorpora_filenames():
        for ws1, ws2, links in w2w.corpora.bicorpus_sentences_and_alignments(
                l1, l2, f1, f2, falign):
            for i1, i2 in links:
                if len(ws1) <= i1 or len(ws2) <= i2:
                    print >> sys.stderr, "This is going to break on link (%d, %d) because lens = (%d, %d)" % (
                        i1, i2, len(ws1), len(ws2))
                    print >> sys.stderr, [wordform(w) for w in ws1]
                    print >> sys.stderr, [wordform(w) for w in ws2]
                    print >> sys.stderr, links
                w1 = ws1[i1]
                w2 = ws2[i2]
                #                print wordmap.str(w1)[1], wordmap.str(w2)[1]

                l2new = language(w2)

                assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"]
                # Skip translations to unknown words
                if wordform(w2) == "*UNKNOWN*": continue

                assert l2new == l2

                # We don't filter here, otherwise we will get a reversemap that only maps to focus lemmas.
            if w not in reversemap:
                print >> sys.stderr, "Word %s is not even in target map! Using *UNKNOWN*" % `wordmap().str(w)`
                embedding = original_embeddings["*UNKNOWN*"]
            elif ELANG not in reversemap[w]:
                print >> sys.stderr, "Have no %s translations for word %s, only have %s, using *UNKNOWN*" % (ELANG, wordmap().str(w), reversemap[w].keys())
                embedding = original_embeddings["*UNKNOWN*"]
            else:
                # Mix the target word embedding over the weighted translation into the source language

                mixcnt = {}
                for w2 in reversemap[w][ELANG]:
                    if language(w2) is None:
                        assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"]
                        continue
                    assert language(w2) == ELANG
                    if wordform(w2) not in original_embeddings:
                        print >> sys.stderr, "%s is NOT mixed by %s %d (no embedding)" % (wordmap().str(w), wordmap().str(w2), reversemap[w][ELANG][w2])
                        continue
                    mixcnt[w2] = reversemap[w][ELANG][w2]

                tot = 0
                for w2 in mixcnt: tot += mixcnt[w2]

                if tot == 0:
                    print >> sys.stderr, "Unable to mix ANY translations for %s, using *UNKNOWN*" % `wordmap().str(w)`
                    embedding = original_embeddings["*UNKNOWN*"]
                else:
                    embedding = numpy.zeros((HYPERPARAMETERS["EMBEDDING_SIZE"]))
                    for w2 in mixcnt:
                        embedding += 1. * mixcnt[w2] / tot * (original_embeddings[wordform(w2)])
#                       print >> sys.stderr, "%s is mixed %s by %s" % (wordmap().str(w), percent(mixcnt[w2], tot), wordmap().str(w2))
Example #9
0
                embedding = original_embeddings["*UNKNOWN*"]
            elif ELANG not in reversemap[w]:
                print >> sys.stderr, "Have no %s translations for word %s, only have %s, using *UNKNOWN*" % (
                    ELANG, wordmap().str(w), reversemap[w].keys())
                embedding = original_embeddings["*UNKNOWN*"]
            else:
                # Mix the target word embedding over the weighted translation into the source language

                mixcnt = {}
                for w2 in reversemap[w][ELANG]:
                    if language(w2) is None:
                        assert HYPERPARAMETERS[
                            "W2W SKIP TRANSLATIONS TO UNKNOWN WORD"]
                        continue
                    assert language(w2) == ELANG
                    if wordform(w2) not in original_embeddings:
                        print >> sys.stderr, "%s is NOT mixed by %s %d (no embedding)" % (
                            wordmap().str(w), wordmap().str(w2),
                            reversemap[w][ELANG][w2])
                        continue
                    mixcnt[w2] = reversemap[w][ELANG][w2]

                tot = 0
                for w2 in mixcnt:
                    tot += mixcnt[w2]

                if tot == 0:
                    print >> sys.stderr, "Unable to mix ANY translations for %s, using *UNKNOWN*" % ` wordmap(
                    ).str(w) `
                    embedding = original_embeddings["*UNKNOWN*"]
                else:
    import logging
    logging.basicConfig(level=logging.DEBUG)

    import w2w.corpora
    from w2w.vocabulary import wordmap, language, wordform
    from collections import defaultdict
    from common.mydict import sort as dictsort

    cnt = {}
    reversecnt = {}
    for l1, l2, f1, f2, falign in w2w.corpora.bicorpora_filenames():
        for ws1, ws2, links in w2w.corpora.bicorpus_sentences_and_alignments(l1, l2, f1, f2, falign):
            for i1, i2 in links:
                if len(ws1) <= i1 or len(ws2) <= i2:
                    print >> sys.stderr, "This is going to break on link (%d, %d) because lens = (%d, %d)" % (i1,i2, len(ws1), len(ws2))
                    print >> sys.stderr, [wordform(w) for w in ws1]
                    print >> sys.stderr, [wordform(w) for w in ws2]
                    print >> sys.stderr, links
                w1 = ws1[i1]
                w2 = ws2[i2]
#                print wordmap.str(w1)[1], wordmap.str(w2)[1]

                l2new = language(w2)

                assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"]
                # Skip translations to unknown words
                if wordform(w2) == "*UNKNOWN*": continue

                assert l2new == l2

Example #11
0
def get_training_biexample(l1, l2, f1, f2, falign):
    """
    Generator of bilingual training examples from this bicorpus.

    l1, l2, f1, f2 and falign are forwarded unchanged to
    bicorpus_sentences_and_alignments(), which is iterated as
    (ws1, ws2, links) triples; links is iterated as (i1, i2) index pairs
    into ws1 and ws2.

    For every link that survives the filters below, yields
    BilingualExample(l1, seq, w1, w2), where seq is the WINDOW_SIZE-long
    slice of ws1 centred on the focus word w1, padded with the
    *LBOUNDARY* / *RBOUNDARY* word IDs where the window overhangs the
    sentence.

    A link is skipped when:
      - either word has wordform "*UNKNOWN*";
      - "W2W FOCUS LEMMAS" is non-empty and wordform(w1) is not in it;
      - targetmap() has no entry for w1, no entry for the target
        language under w1, or does not list w2 for that language;
      - w1 has exactly one translation in the target language.

    NOTE: the window asserts can only hold for an odd WINDOW_SIZE, since
    the window spans 2 * ((WINDOW_SIZE - 1) // 2) + 1 positions.
    """
    import common.hyperparameters
    HYPERPARAMETERS = common.hyperparameters.read("language-model")
    WINDOW = HYPERPARAMETERS["WINDOW_SIZE"]
    # Floor division keeps the offset an integer on Python 2 and 3 alike,
    # so it stays usable as a slice bound and list index.
    half = (WINDOW - 1) // 2
    focus_lemmas = HYPERPARAMETERS["W2W FOCUS LEMMAS"]

    for ws1, ws2, links in bicorpus_sentences_and_alignments(
            l1, l2, f1, f2, falign):
        for i1, i2 in links:
            w1 = ws1[i1]
            w2 = ws2[i2]

            l2new = language(w2)
            assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"]
            # Skip translations to unknown words
            if wordform(w2) == "*UNKNOWN*": continue
            assert l2new == l2

            # Skip translations from unknown words
            if wordform(w1) == "*UNKNOWN*": continue

            # If we are filtering examples by lemma
            if focus_lemmas is not None and len(focus_lemmas) != 0:
                assert language(w1) == "en"
                # NOTE: the match is on the raw wordform; a lemmatizer-based
                # variant of this check existed here only as disabled code.
                if wordform(w1) not in focus_lemmas:
                    logging.debug(
                        "Focus word %s not in our list of focus lemmas",
                        repr(wordmap().str(w1)))
                    continue

            translations = targetmap()

            if w1 not in translations:
                logging.warning("No translations for word %s, skipping",
                                repr(wordmap().str(w1)))
                continue

            if l2new not in translations[w1]:
                logging.warning(
                    "Word %s has no translations for language %s, skipping",
                    repr(wordmap().str(w1)), l2new)
                continue

            if w2 not in translations[w1][l2new]:
                logging.error("Word %s cannot translate to word %s, skipping",
                              repr(wordmap().str(w1)), repr(wordmap().str(w2)))
                continue

            if len(translations[w1][l2new]) == 1:
                logging.debug(
                    "Word %s has only one translation in language %s, skipping",
                    repr(wordmap().str(w1)), l2new)
                continue

            # Extract the window of tokens around index i1. Pad with *LBOUNDARY* and *RBOUNDARY* as necessary.
            # (lo / hi rather than min / max, to avoid shadowing the builtins.)
            lo = i1 - half
            hi = i1 + half
            lpad = 0
            rpad = 0
            if lo < 0:
                lpad = -lo
                lo = 0
            if hi >= len(ws1):
                rpad = hi - (len(ws1) - 1)
                hi = len(ws1) - 1
            assert lpad + (hi - lo + 1) + rpad == WINDOW

            left_pad = [wordmap().id((None, "*LBOUNDARY*"))] * lpad
            right_pad = [wordmap().id((None, "*RBOUNDARY*"))] * rpad
            seq = left_pad + ws1[lo:hi + 1] + right_pad
            assert len(seq) == WINDOW

            # The focus word must sit exactly in the middle of the window.
            assert seq[half] == w1
            yield BilingualExample(l1, seq, w1, w2)
Example #12
0
 def __str__(self):
     """
     Return the repr of the tuple
     (wordmap string of w2, l1, wordform of w1, context list),
     where the context list holds element [1] of the wordmap string of
     every word ID in l1seq.
     """
     # NOTE(review): element [1] is presumably the wordform half of a
     # (language, wordform) pair -- confirm against wordmap().str().
     # repr() replaces the backtick syntax, which Python 3 removed.
     context = [wordmap().str(w)[1] for w in self.l1seq]
     return repr((wordmap().str(self.w2), self.l1, wordform(self.w1),
                  context))