Esempio n. 1
0
def get_validation_example():
    """Yield validation examples: WINDOW_SIZE-length lists of token IDs.

    Reads HYPERPARAMETERS["VALIDATION_SENTENCES"] line by line.  Each
    in-vocabulary word extends the current run of token IDs; once the run
    reaches WINDOW_SIZE, the trailing window is yielded.  An out-of-
    vocabulary word resets the run, so no window ever spans it.
    """
    from vocabulary import wordmap
    for line in myopen(HYPERPARAMETERS["VALIDATION_SENTENCES"]):
        prevwords = []
        # str.split() replaces the deprecated string.split/string.strip
        # module functions (removed in Python 3); whitespace splitting
        # already trims each token, so the per-token strip is dropped.
        for w in line.split():
            if wordmap.exists(w):
                prevwords.append(wordmap.id(w))
                if len(prevwords) >= HYPERPARAMETERS["WINDOW_SIZE"]:
                    yield prevwords[-HYPERPARAMETERS["WINDOW_SIZE"]:]
            else:
                # Unknown word: discard the partial window entirely.
                prevwords = []
Esempio n. 2
0
def get_validation_example():
    """Yield validation examples as sliding windows of token IDs.

    Iterates the file named by HYPERPARAMETERS["VALIDATION_SENTENCES"].
    Known words accumulate into a run of IDs; whenever the run holds at
    least WINDOW_SIZE IDs, the last WINDOW_SIZE of them are yielded.
    A word missing from the vocabulary clears the run.
    """
    from vocabulary import wordmap
    # Hoist the window size out of the inner loop; it is a fixed
    # hyperparameter for the whole pass.
    window = HYPERPARAMETERS["WINDOW_SIZE"]
    for line in myopen(HYPERPARAMETERS["VALIDATION_SENTENCES"]):
        ids = []
        # line.split() supersedes the deprecated string.split/string.strip
        # functions (the `string` module helpers were removed in Python 3)
        # and already yields whitespace-free tokens.
        for token in line.split():
            if wordmap.exists(token):
                ids.append(wordmap.id(token))
                if len(ids) >= window:
                    yield ids[-window:]
            else:
                # Out-of-vocabulary token: restart the window.
                ids = []
Esempio n. 3
0
 def __iter__(self):
     """Yield training examples: WINDOW_SIZE-length lists of token IDs.

     Reads HYPERPARAMETERS["TRAIN_SENTENCES"], tracking the number of
     examples yielded in self.count.  In-vocabulary words extend the
     current run of IDs; each time the run reaches WINDOW_SIZE the
     trailing window is yielded.  Unknown words reset the run.
     """
     from vocabulary import wordmap
     self.filename = HYPERPARAMETERS["TRAIN_SENTENCES"]
     self.count = 0
     for line in myopen(self.filename):
         prevwords = []
         # str methods replace the deprecated string.split/string.strip
         # module functions (removed in Python 3); the dead `id = None`
         # local (which shadowed the builtin `id`) is removed.
         for w in line.split():
             if wordmap.exists(w):
                 prevwords.append(wordmap.id(w))
                 if len(prevwords) >= HYPERPARAMETERS["WINDOW_SIZE"]:
                     self.count += 1
                     yield prevwords[-HYPERPARAMETERS["WINDOW_SIZE"]:]
             else:
                 # Unknown word: discard the partial window.
                 prevwords = []
Esempio n. 4
0
 def __iter__(self):
     """Iterate over training n-gram windows of vocabulary token IDs.

     Opens HYPERPARAMETERS["TRAIN_SENTENCES"] and walks it line by line,
     counting yielded examples in self.count.  Each known word appends
     its ID to a running buffer; once the buffer reaches WINDOW_SIZE,
     the last WINDOW_SIZE IDs are yielded.  An out-of-vocabulary word
     empties the buffer so windows never cross it.
     """
     from vocabulary import wordmap
     self.filename = HYPERPARAMETERS["TRAIN_SENTENCES"]
     self.count = 0
     window = HYPERPARAMETERS["WINDOW_SIZE"]
     for line in myopen(self.filename):
         buffer = []
         # line.split() replaces deprecated string.split/string.strip
         # (removed in Python 3) and already strips each token; the
         # unused `id = None` (shadowing the builtin) is removed.
         for token in line.split():
             if wordmap.exists(token):
                 buffer.append(wordmap.id(token))
                 if len(buffer) >= window:
                     self.count += 1
                     yield buffer[-window:]
             else:
                 buffer = []
Esempio n. 5
0
def get_validation_example():
    """Yield validation examples: WINDOW_SIZE-length lists of token IDs.

    Reads the "language-model" hyperparameters, then walks
    HYPERPARAMETERS["VALIDATION_SENTENCES"] line by line.  In-vocabulary
    words extend a run of token IDs; every time the run reaches
    WINDOW_SIZE the trailing window is yielded.  An out-of-vocabulary
    word resets the run.

    Raises:
        NotImplementedError: if INCLUDE_UNKNOWN_WORD is enabled —
            delexicalizing unknown words is not implemented here.
    """
    HYPERPARAMETERS = common.hyperparameters.read("language-model")

    from vocabulary import wordmap
    for line in myopen(HYPERPARAMETERS["VALIDATION_SENTENCES"]):
        prevwords = []
        # str.split() replaces the deprecated string.split/string.strip
        # module functions (removed in Python 3) and already trims
        # whitespace from each token.
        for w in line.split():
            if wordmap.exists(w):
                prevwords.append(wordmap.id(w))
                if len(prevwords) >= HYPERPARAMETERS["WINDOW_SIZE"]:
                    yield prevwords[-HYPERPARAMETERS["WINDOW_SIZE"]:]
            else:
                # If we can learn an unknown word token, we should
                # delexicalize the word, not discard the example!
                # Raise instead of `assert 0`: asserts are stripped
                # when Python runs with -O, silently skipping this check.
                if HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"]:
                    raise NotImplementedError(
                        "delexicalizing unknown words is not implemented")
                prevwords = []
def get_validation_example():
    """Yield validation examples as sliding windows of token IDs.

    Loads the "language-model" hyperparameters, then iterates the
    validation-sentence file.  Known words accumulate IDs into a run;
    whenever the run holds at least WINDOW_SIZE IDs, the last
    WINDOW_SIZE of them are yielded.  Unknown words clear the run.

    Raises:
        NotImplementedError: when INCLUDE_UNKNOWN_WORD is set, since
            the unknown-word (delexicalization) path is unimplemented.
    """
    HYPERPARAMETERS = common.hyperparameters.read("language-model")

    from vocabulary import wordmap
    window = HYPERPARAMETERS["WINDOW_SIZE"]
    for line in myopen(HYPERPARAMETERS["VALIDATION_SENTENCES"]):
        ids = []
        # line.split() supersedes deprecated string.split/string.strip
        # (removed in Python 3); tokens come back already stripped.
        for token in line.split():
            if wordmap.exists(token):
                ids.append(wordmap.id(token))
                if len(ids) >= window:
                    yield ids[-window:]
            else:
                # If we can learn an unknown word token, we should
                # delexicalize the word, not discard the example!
                # `raise` survives -O, unlike the original `assert 0`.
                if HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"]:
                    raise NotImplementedError(
                        "delexicalizing unknown words is not implemented")
                ids = []
Esempio n. 7
0
 def __iter__(self):
     """Yield training examples: WINDOW_SIZE-length lists of token IDs.

     Reads the "language-model" hyperparameters, opens
     HYPERPARAMETERS["TRAIN_SENTENCES"], and counts yielded examples in
     self.count.  In-vocabulary words extend a run of IDs; each time the
     run reaches WINDOW_SIZE the trailing window is yielded.  Unknown
     words reset the run.

     Raises:
         NotImplementedError: if INCLUDE_UNKNOWN_WORD is enabled —
             delexicalizing unknown words is not implemented here.
     """
     HYPERPARAMETERS = common.hyperparameters.read("language-model")
     from vocabulary import wordmap
     self.filename = HYPERPARAMETERS["TRAIN_SENTENCES"]
     self.count = 0
     for line in myopen(self.filename):
         prevwords = []
         # str methods replace deprecated string.split/string.strip
         # (removed in Python 3); the dead `id = None` local, which
         # shadowed the builtin `id`, is removed.
         for w in line.split():
             if wordmap.exists(w):
                 prevwords.append(wordmap.id(w))
                 if len(prevwords) >= HYPERPARAMETERS["WINDOW_SIZE"]:
                     self.count += 1
                     yield prevwords[-HYPERPARAMETERS["WINDOW_SIZE"]:]
             else:
                 # If we can learn an unknown word token, we should
                 # delexicalize the word, not discard the example!
                 # Raise instead of `assert 0`: asserts vanish under -O.
                 if HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"]:
                     raise NotImplementedError(
                         "delexicalizing unknown words is not implemented")
                 prevwords = []
 def __iter__(self):
     """Iterate over training n-gram windows of vocabulary token IDs.

     Loads the "language-model" hyperparameters, then streams
     TRAIN_SENTENCES line by line, tracking yielded examples in
     self.count.  Known words append their IDs to a buffer; once the
     buffer reaches WINDOW_SIZE, its last WINDOW_SIZE IDs are yielded.
     An out-of-vocabulary word empties the buffer.

     Raises:
         NotImplementedError: when INCLUDE_UNKNOWN_WORD is set, since
             the unknown-word (delexicalization) path is unimplemented.
     """
     HYPERPARAMETERS = common.hyperparameters.read("language-model")
     from vocabulary import wordmap
     self.filename = HYPERPARAMETERS["TRAIN_SENTENCES"]
     self.count = 0
     window = HYPERPARAMETERS["WINDOW_SIZE"]
     for line in myopen(self.filename):
         buffer = []
         # line.split() replaces deprecated string.split/string.strip
         # (removed in Python 3) and strips each token; the unused
         # `id = None` (shadowing the builtin) is removed.
         for token in line.split():
             if wordmap.exists(token):
                 buffer.append(wordmap.id(token))
                 if len(buffer) >= window:
                     self.count += 1
                     yield buffer[-window:]
             else:
                 # If we can learn an unknown word token, we should
                 # delexicalize the word, not discard the example!
                 # `raise` survives -O, unlike the original `assert 0`.
                 if HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"]:
                     raise NotImplementedError(
                         "delexicalizing unknown words is not implemented")
                 buffer = []
def trainingsentences():
    """
    For each line (sentence) in the training data, yield it as a list of
    token IDs, logging progress every 1000 lines.
    """

    HYPERPARAMETERS = common.hyperparameters.read("random-indexing")
    from vocabulary import wordmap
    filename = HYPERPARAMETERS["TRAIN_SENTENCES"]
    count = 0
    for line in myopen(filename):
        # str.split() replaces the deprecated string.split/string.strip
        # module functions (removed in Python 3); whitespace splitting
        # already trims each token.
        tokens = []
        for w in line.split():
            assert wordmap.exists(w)     # Not exactly clear what to do
                                         # if the word isn't in the vocab.
            tokens.append(wordmap.id(w))
        yield tokens
        count += 1
        if count % 1000 == 0:
            # Lazy %-args: logging formats only if INFO is enabled.
            logging.info("Read %d lines from training file %s...",
                         count, filename)
            logging.info(stats())
Esempio n. 10
0
def trainingsentences():
    """
    For each line (sentence) in the training data, transform it into a
    list of token IDs and yield it, reporting progress every 1000 lines.
    """

    HYPERPARAMETERS = common.hyperparameters.read("random-indexing")
    from vocabulary import wordmap
    filename = HYPERPARAMETERS["TRAIN_SENTENCES"]
    count = 0
    for line in myopen(filename):
        tokens = []
        # line.split() supersedes deprecated string.split/string.strip
        # (removed in Python 3); tokens come back already stripped.
        for w in line.split():
            assert wordmap.exists(w)  # Not exactly clear what to do
            # if the word isn't in the vocab.
            tokens.append(wordmap.id(w))
        yield tokens
        count += 1
        if count % 1000 == 0:
            # Lazy %-style args defer formatting until the record is
            # actually emitted.
            logging.info("Read %d lines from training file %s...",
                         count, filename)
            logging.info(stats())