def get_validation_example():
    """Yield validation examples as lists of WINDOW_SIZE word IDs.

    Reads HYPERPARAMETERS["VALIDATION_SENTENCES"] line-by-line, maps each
    in-vocabulary word to its ID, and yields every contiguous window of
    WINDOW_SIZE IDs. An out-of-vocabulary word resets the context, so no
    yielded window ever spans an unknown word.
    """
    from vocabulary import wordmap
    for l in myopen(HYPERPARAMETERS["VALIDATION_SENTENCES"]):
        prevwords = []
        # str methods replace the deprecated `string.split`/`string.strip`
        # module functions (removed in Python 3); behavior is identical.
        for w in l.split():
            w = w.strip()
            if wordmap.exists(w):
                prevwords.append(wordmap.id(w))
                if len(prevwords) >= HYPERPARAMETERS["WINDOW_SIZE"]:
                    yield prevwords[-HYPERPARAMETERS["WINDOW_SIZE"]:]
            else:
                # Unknown word: discard accumulated context rather than
                # emit a window that crosses an out-of-vocabulary token.
                prevwords = []
def __iter__(self):
    """Yield training examples as lists of WINDOW_SIZE word IDs.

    Reads HYPERPARAMETERS["TRAIN_SENTENCES"] line-by-line, maps each
    in-vocabulary word to its ID, and yields every contiguous window of
    WINDOW_SIZE IDs. An out-of-vocabulary word resets the context.
    Side effects: sets self.filename and counts yielded windows in
    self.count.
    """
    from vocabulary import wordmap
    self.filename = HYPERPARAMETERS["TRAIN_SENTENCES"]
    self.count = 0
    for l in myopen(self.filename):
        prevwords = []
        # str methods replace the deprecated `string.split`/`string.strip`
        # module functions; the original's unused `id = None` local (which
        # also shadowed the builtin `id`) is removed.
        for w in l.split():
            w = w.strip()
            if wordmap.exists(w):
                prevwords.append(wordmap.id(w))
                if len(prevwords) >= HYPERPARAMETERS["WINDOW_SIZE"]:
                    self.count += 1
                    yield prevwords[-HYPERPARAMETERS["WINDOW_SIZE"]:]
            else:
                # Unknown word: reset the context window.
                prevwords = []
def get_validation_example():
    """Yield validation examples as lists of WINDOW_SIZE word IDs.

    Reads the "language-model" hyperparameters, then scans
    VALIDATION_SENTENCES line-by-line, mapping in-vocabulary words to IDs
    and yielding every contiguous window of WINDOW_SIZE IDs. An
    out-of-vocabulary word resets the context.

    Raises:
        NotImplementedError: if INCLUDE_UNKNOWN_WORD is set — delexicalizing
            unknown words is not implemented here yet.
    """
    HYPERPARAMETERS = common.hyperparameters.read("language-model")
    from vocabulary import wordmap
    for l in myopen(HYPERPARAMETERS["VALIDATION_SENTENCES"]):
        prevwords = []
        # str methods replace the deprecated `string.split`/`string.strip`.
        for w in l.split():
            w = w.strip()
            if wordmap.exists(w):
                prevwords.append(wordmap.id(w))
                if len(prevwords) >= HYPERPARAMETERS["WINDOW_SIZE"]:
                    yield prevwords[-HYPERPARAMETERS["WINDOW_SIZE"]:]
            else:
                # If we can learn an unknown word token, we should
                # delexicalize the word, not discard the example!
                # An explicit raise replaces the original `assert 0`,
                # which would be silently stripped under `python -O`.
                if HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"]:
                    raise NotImplementedError("INCLUDE_UNKNOWN_WORD: unknown-word delexicalization is not implemented")
                prevwords = []
def __iter__(self):
    """Yield training examples as lists of WINDOW_SIZE word IDs.

    Reads the "language-model" hyperparameters, then scans TRAIN_SENTENCES
    line-by-line, mapping in-vocabulary words to IDs and yielding every
    contiguous window of WINDOW_SIZE IDs. An out-of-vocabulary word resets
    the context. Side effects: sets self.filename and counts yielded
    windows in self.count.

    Raises:
        NotImplementedError: if INCLUDE_UNKNOWN_WORD is set — delexicalizing
            unknown words is not implemented here yet.
    """
    HYPERPARAMETERS = common.hyperparameters.read("language-model")
    from vocabulary import wordmap
    self.filename = HYPERPARAMETERS["TRAIN_SENTENCES"]
    self.count = 0
    for l in myopen(self.filename):
        prevwords = []
        # str methods replace the deprecated `string.split`/`string.strip`;
        # the original's unused `id = None` local (shadowing the builtin
        # `id`) is removed.
        for w in l.split():
            w = w.strip()
            if wordmap.exists(w):
                prevwords.append(wordmap.id(w))
                if len(prevwords) >= HYPERPARAMETERS["WINDOW_SIZE"]:
                    self.count += 1
                    yield prevwords[-HYPERPARAMETERS["WINDOW_SIZE"]:]
            else:
                # If we can learn an unknown word token, we should
                # delexicalize the word, not discard the example!
                # An explicit raise replaces the original `assert 0`,
                # which would be silently stripped under `python -O`.
                if HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"]:
                    raise NotImplementedError("INCLUDE_UNKNOWN_WORD: unknown-word delexicalization is not implemented")
                prevwords = []
def trainingsentences():
    """For each line (sentence) in the training data, yield it as a list
    of token IDs.

    Reads the "random-indexing" hyperparameters and scans TRAIN_SENTENCES
    line-by-line. Every word must already be in the vocabulary (asserted);
    progress is logged every 1000 lines.

    Raises:
        AssertionError: if a word in the training file is not in the
            vocabulary (see in-line comment — open design question).
    """
    HYPERPARAMETERS = common.hyperparameters.read("random-indexing")
    from vocabulary import wordmap
    filename = HYPERPARAMETERS["TRAIN_SENTENCES"]
    count = 0
    for l in myopen(filename):
        tokens = []
        # str methods replace the deprecated `string.split`/`string.strip`.
        for w in l.split():
            w = w.strip()
            # Not exactly clear what to do if the word isn't in the vocab.
            assert wordmap.exists(w)
            tokens.append(wordmap.id(w))
        yield tokens
        count += 1
        if count % 1000 == 0:
            # Lazy %-args: the message is only formatted if the record is
            # actually emitted (stdlib logging convention).
            logging.info("Read %d lines from training file %s...", count, filename)
            logging.info(stats())