def corrupt(self):
    """
    Return (notw2, weight): a corrupt target word and its weight.
    Note: This will return a different random value every call.
    """
    from hyperparameters import HYPERPARAMETERS
    import random
    possible_targets = targetmap()[self.w1][self.l2]
    assert len(possible_targets) > 1
    assert self.w2 in possible_targets
    notw2 = self.w2
    cnt = 0
    # Keep drawing until the noise word differs from the true target.
    while self.w2 == notw2:
        if HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"] == 0:
            notw2 = random.choice(possible_targets)
            pr = 1. / len(possible_targets)
        elif HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"] == 1:
            # Unigram noise is not implemented yet.
            assert 0
#            import noise
#            from common.myrandom import weighted_sample
#            e[-1], pr = weighted_sample(noise.indexed_weights())
##            from vocabulary import wordmap
##            print wordmap.str(e[-1]), pr
        else:
            assert 0
        cnt += 1
        # Back off to 0-gram smoothing if we fail 10 times to get noise.
        if cnt > 10:
            notw2 = random.choice(possible_targets)
    if HYPERPARAMETERS["UNIFORM EXAMPLE WEIGHTS"]:
        weight = 1.
    else:
        weight = 1. / pr
    return notw2, weight
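
# A minimal standalone sketch of the 0-gram corruption step above, for
# clarity. The name sample_noise and the plain-string candidates are
# illustrative assumptions, not part of this module. Noise targets are
# drawn uniformly from the K candidate translations, so each draw has
# probability pr = 1/K; returning weight = 1/pr importance-weights the
# example by the inverse noise probability.
import random

def sample_noise(w2, candidates):
    assert len(candidates) > 1
    notw2 = w2
    while notw2 == w2:
        notw2 = random.choice(candidates)
    pr = 1. / len(candidates)   # uniform sampling probability
    return notw2, 1. / pr       # inverse-probability weight

# e.g. sample_noise("chat", ["chat", "conversation", "discussion"])
# returns a word != "chat" with weight 3.0.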
def get_training_biexample(l1, l2, f1, f2, falign):
    """
    Generator of bilingual training examples from this bicorpus.
    """
    import common.hyperparameters
    HYPERPARAMETERS = common.hyperparameters.read("language-model")
    WINDOW = HYPERPARAMETERS["WINDOW_SIZE"]

    for ws1, ws2, links in bicorpus_sentences_and_alignments(l1, l2, f1, f2, falign):
        for i1, i2 in links:
            w1 = ws1[i1]
            w2 = ws2[i2]
            l2new = language(w2)

            assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"]
            # Skip translations to unknown words
            if wordform(w2) == "*UNKNOWN*":
                continue
            assert l2new == l2

            # Skip translations from unknown words
            if wordform(w1) == "*UNKNOWN*":
                continue

            # If we are filtering examples by lemma
            if not (HYPERPARAMETERS["W2W FOCUS LEMMAS"] is None or len(HYPERPARAMETERS["W2W FOCUS LEMMAS"]) == 0):
                assert language(w1) == "en"
#                from lemmatizer import lemmatize
#                if lemmatize(language(w1), wordform(w1)) not in HYPERPARAMETERS["W2W FOCUS LEMMAS"]:
#                    logging.debug("Focus word %s (lemma %s) not in our list of focus lemmas" % (`wordmap().str(w1)`, lemmatize(language(w1), wordform(w1))))
                if wordform(w1) not in HYPERPARAMETERS["W2W FOCUS LEMMAS"]:
                    logging.debug("Focus word %s not in our list of focus lemmas" % `wordmap().str(w1)`)
                    continue

            if w1 not in targetmap():
                logging.warning("No translations for word %s, skipping" % `wordmap().str(w1)`)
                continue
            if l2new not in targetmap()[w1]:
                logging.warning("Word %s has no translations for language %s, skipping" % (`wordmap().str(w1)`, l2new))
                continue
            if w2 not in targetmap()[w1][l2new]:
                logging.error("Word %s cannot translate to word %s, skipping" % (`wordmap().str(w1)`, `wordmap().str(w2)`))
                continue
            if len(targetmap()[w1][l2new]) == 1:
                logging.debug("Word %s has only one translation in language %s, skipping" % (`wordmap().str(w1)`, l2new))
                continue

            # Extract the window of tokens around index i1.
            # Pad with *LBOUNDARY* and *RBOUNDARY* as necessary.
            # (lo/hi rather than min/max, so we don't shadow the builtins.)
            lo = i1 - (WINDOW-1)//2
            hi = i1 + (WINDOW-1)//2
            lpad = 0
            rpad = 0
            if lo < 0:
                lpad = -lo
                lo = 0
            if hi >= len(ws1):
                rpad = hi - (len(ws1)-1)
                hi = len(ws1)-1
            assert lpad + (hi - lo + 1) + rpad == WINDOW

            seq = [wordmap().id((None, "*LBOUNDARY*"))]*lpad + ws1[lo:hi+1] + [wordmap().id((None, "*RBOUNDARY*"))]*rpad
            assert len(seq) == WINDOW
            assert seq[(WINDOW-1)//2] == w1
            yield BilingualExample(l1, seq, w1, w2)
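
# The boundary-padding arithmetic above is easy to get wrong, so here is a
# self-contained sketch of the same windowing logic. context_window and the
# plain string tokens are illustrative assumptions; get_training_biexample
# itself works on word ids via wordmap().
def context_window(tokens, i, window,
                   lbound="*LBOUNDARY*", rbound="*RBOUNDARY*"):
    assert window % 2 == 1
    half = (window - 1) // 2
    lo, hi = i - half, i + half
    lpad = max(0, -lo)                      # slots missing on the left
    rpad = max(0, hi - (len(tokens) - 1))   # slots missing on the right
    seq = [lbound] * lpad + tokens[max(lo, 0):hi + 1] + [rbound] * rpad
    assert len(seq) == window and seq[half] == tokens[i]
    return seq

# context_window("le chat dort".split(), 0, 3)
# => ['*LBOUNDARY*', 'le', 'chat']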
        print >> sys.stderr, "WEIRD WORD: %s" % word
        word = string.lower(word)
        assert len(vals[1:]) == HYPERPARAMETERS["EMBEDDING_SIZE"]
        tot += 1
        if tot % 10000 == 0:
            print >> sys.stderr, "\tRead %d lines from %s" % (tot, HYPERPARAMETERS["W2W INITIAL EMBEDDINGS"])
        if word in original_embeddings:
#            print >> sys.stderr, "Skipping word %s (originally %s), we already have an embedding for it" % (word, vals[0])
            continue
        else:
            original_embeddings[word] = numpy.array([float(v) for v in vals[1:]])
    print >> sys.stderr, "...done reading embeddings from %s" % HYPERPARAMETERS["W2W INITIAL EMBEDDINGS"]
    print >> sys.stderr, "Skipped %s words for which we had duplicate embeddings" % percent(tot - len(original_embeddings), tot)
    print >> sys.stderr, stats()

    reversemap = targetmap(name="reverse")
    embeddings = numpy.zeros((wordmap().len, HYPERPARAMETERS["EMBEDDING_SIZE"]))
    assert embeddings.shape == (wordmap().len, HYPERPARAMETERS["EMBEDDING_SIZE"])
    ELANG = HYPERPARAMETERS["W2W INITIAL EMBEDDINGS LANGUAGE"]
    for w in range(wordmap().len):
        embedding = None
        # If this word is in a different language than the embeddings.
        if language(w) != ELANG:
            if w not in reversemap:
                print >> sys.stderr, "Word %s is not even in target map! Using *UNKNOWN*" % `wordmap().str(w)`
                embedding = original_embeddings["*UNKNOWN*"]
            elif ELANG not in reversemap[w]:
                print >> sys.stderr, "Have no %s translations for word %s, only have %s, using *UNKNOWN*" % (ELANG, wordmap().str(w), reversemap[w].keys())
                embedding = original_embeddings["*UNKNOWN*"]
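
# The loop above is truncated before the branch handling words that DO have
# ELANG translations in the reverse map. One plausible completion, purely an
# assumption (translated_embedding is a hypothetical helper, not the original
# code), is to average the embeddings of the word's translations:
import numpy

def translated_embedding(translations, original_embeddings, unknown="*UNKNOWN*"):
    # translations: word forms of this word's ELANG translations.
    vecs = [original_embeddings[t] for t in translations
            if t in original_embeddings]
    if not vecs:
        return original_embeddings[unknown]
    return numpy.mean(vecs, axis=0)

# e.g. embedding = translated_embedding(
#     [wordform(t) for t in reversemap[w][ELANG]], original_embeddings)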