Esempio n. 1
0
def fit_scaler(data_dir, word2vec_model=WORD2VEC_MODELPATH, batch_size=1024,
               persist_to_path=None):
    """ Fit a StandardScaler incrementally on the word2vec vectors of a corpus.

    Documents are pulled from ``get_documents(data_dir)`` in batches of at
    most ``batch_size``. For every batch, the vectors of all words known to
    the word2vec model are stacked into a matrix and passed to
    ``scaler.partial_fit``. The scaler can be used afterwards for normalizing
    feature matrices.

    :param data_dir: forwarded to ``get_documents``
    :param word2vec_model: a loaded word2vec model, or a string path that is
        loaded with ``Word2Vec.load``
    :param batch_size: maximum number of documents per partial fit (>= 1)
    :param persist_to_path: if truthy, the scaler is written there with
        ``save_to_disk``

    :return: the StandardScaler instance
    :raises ValueError: if ``batch_size`` is smaller than 1
    """
    from itertools import islice

    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # six.string_types also covers unicode paths on Python 2.
    if isinstance(word2vec_model, six.string_types):
        word2vec_model = Word2Vec.load(word2vec_model)

    doc_iterator = iter(get_documents(data_dir))
    scaler = StandardScaler(copy=False)

    while True:
        batch = list(islice(doc_iterator, batch_size))
        if not batch:
            # Source exhausted (also covers an empty corpus).
            break

        vectors = []
        for doc in batch:
            vectors.extend(
                word2vec_model[word]
                for word in doc.get_all_words()
                if word in word2vec_model
            )

        if not vectors:
            # partial_fit rejects an empty (1-D) array, so skip batches
            # without any in-vocabulary word.
            continue

        matrix = np.array(vectors)
        print("Fitted to {} vectors".format(matrix.shape[0]))

        scaler.partial_fit(matrix)

    if persist_to_path:
        save_to_disk(persist_to_path, scaler)

    return scaler
Esempio n. 2
0
def fit_scaler(data_dir, word2vec_model, batch_size=1024, persist_to_path=None):
    """ Fit a StandardScaler incrementally on the word2vec vectors of a corpus.

    Documents are read from ``get_documents(data_dir)`` in batches of at most
    ``batch_size``. The vectors of all in-vocabulary words of a batch are
    stacked into a matrix and passed to ``scaler.partial_fit``. The scaler can
    be used afterwards for normalizing feature matrices.

    :param data_dir: forwarded to ``get_documents``
    :param word2vec_model: a loaded word2vec model, or a string path that is
        loaded with ``Word2Vec.load``
    :param batch_size: maximum number of documents per partial fit (>= 1)
    :param persist_to_path: if truthy, the scaler is written there with
        ``save_to_disk``

    :return: the StandardScaler instance
    :raises ValueError: if ``batch_size`` is smaller than 1
    """
    if batch_size < 1:
        # A non-positive size would never consume the generator.
        raise ValueError("batch_size must be a positive integer")

    # six.string_types also covers unicode paths on Python 2.
    if isinstance(word2vec_model, six.string_types):
        word2vec_model = Word2Vec.load(word2vec_model)

    doc_generator = get_documents(data_dir)
    scaler = StandardScaler(copy=False)

    no_more_samples = False
    while not no_more_samples:
        batch = []
        for _ in range(batch_size):
            try:
                batch.append(six.next(doc_generator))
            except StopIteration:
                no_more_samples = True
                break

        vectors = []
        for doc in batch:
            for word in doc.get_all_words():
                if word in word2vec_model:
                    vectors.append(word2vec_model[word])

        if not vectors:
            # Happens for the trailing empty batch (corpus size a multiple of
            # batch_size, or empty corpus) and for batches without known
            # words; partial_fit would raise on the resulting 1-D array.
            continue

        matrix = np.array(vectors)
        print("Fitted to {} vectors".format(matrix.shape[0]))

        scaler.partial_fit(matrix)

    if persist_to_path:
        save_to_disk(persist_to_path, scaler)

    return scaler
Esempio n. 3
0
def train(
    trainset_dir,
    word2vec_path=WORD2VEC_MODELPATH,
    ontology_path=ONTOLOGY_PATH,
    model_path=MODEL_PATH,
    recreate_ontology=False,
    verbose=True,
):
    """
    Train and save the model on a given dataset
    :param trainset_dir: path to the directory with the training set
    :param word2vec_path: path to the gensim word2vec model
    :param ontology_path: path to the ontology file
    :param model_path: path where the model should be pickled
    :param recreate_ontology: boolean flag whether to recreate the ontology
    :param verbose: whether to print computation times

    :return: None; exceptions raised by the helpers propagate to the caller
    """
    # time.clock() was removed in Python 3.8. timeit.default_timer exists on
    # both Python 2 and 3 and is meant for measuring elapsed time.
    from timeit import default_timer

    ontology = get_ontology(path=ontology_path, recreate=recreate_ontology)
    docs = get_documents(trainset_dir)

    global_index = build_global_frequency_index(trainset_dir, verbose=verbose)
    word2vec_model = Word2Vec.load(word2vec_path)
    model = LearningModel(global_index, word2vec_model)

    matrices_start = default_timer()

    x, y = build_train_matrices(docs, model, trainset_dir, ontology)

    if verbose:
        print("Matrices built in: {0:.2f}s".format(
            default_timer() - matrices_start))
        print("X size: {}".format(x.shape))

    # The fitting timer covers both feature scaling and classifier training.
    fitting_start = default_timer()

    # Normalize features
    x = model.maybe_fit_and_scale(x)

    # Train the model
    model.fit_classifier(x, y)

    if verbose:
        print("Fitting the model: {0:.2f}s".format(
            default_timer() - fitting_start))

    # Pickle the model
    save_to_disk(model_path, model, overwrite=True)
Esempio n. 4
0
def fit_scaler(data: DataList, word2vec_model,
               batch_size=1024, persist_to_path=None):
  """ Fit a StandardScaler incrementally on the word2vec vectors of a dataset.

  Each example's ``'text'`` entry is wrapped in a ``Document``; documents are
  consumed in batches of at most ``batch_size``. The vectors of all words
  present in ``word2vec_model.wv`` are stacked into a matrix and passed to
  ``scaler.partial_fit``. The scaler can be used afterwards for normalizing
  feature matrices.

  :param data: iterable of examples, each indexable by the key ``'text'``
  :param word2vec_model: a loaded word2vec model, or a string path that is
      loaded with ``Word2Vec.load``
  :param batch_size: maximum number of documents per partial fit (>= 1)
  :param persist_to_path: if truthy, the scaler is written there with
      ``save_to_disk``

  :return: the StandardScaler instance
  :raises ValueError: if ``batch_size`` is smaller than 1
  """
  from itertools import islice

  if batch_size < 1:
    raise ValueError("batch_size must be a positive integer")

  if isinstance(word2vec_model, str):
    word2vec_model = Word2Vec.load(word2vec_model)
  word_vectors = word2vec_model.wv

  # TODO add other non-text features here
  # Documents are built lazily, one batch at a time.
  doc_iterator = (Document(example['text']) for example in data)
  scaler = StandardScaler(copy=False)

  while True:
    batch = list(islice(doc_iterator, batch_size))
    if not batch:
      # Input exhausted (also covers empty ``data``).
      break

    vectors = []
    for doc in batch:
      vectors.extend(
        word_vectors[word]
        for word in doc.get_all_words()
        if word in word_vectors
      )

    if not vectors:
      # partial_fit rejects an empty (1-D) array, so skip batches without
      # any in-vocabulary word.
      continue

    matrix = np.array(vectors)
    print("Fitted to {} vectors".format(matrix.shape[0]))

    scaler.partial_fit(matrix)

  if persist_to_path:
    save_to_disk(persist_to_path, scaler)

  return scaler
Esempio n. 5
0
 def save_word2vec_model(self, filepath, overwrite=False):
     """ Persist the word2vec model to ``filepath``.

     Raises ValueError when ``self.word2vec_model`` is falsy (not trained
     yet); otherwise hands the model to ``save_to_disk``, forwarding
     ``overwrite``. """
     model = self.word2vec_model
     if model:
         save_to_disk(filepath, model, overwrite=overwrite)
         return
     raise ValueError(
         "Can't save the word2vec model, it has not been trained yet"
     )
Esempio n. 6
0
 def save_scaler(self, filepath, overwrite=False):
     """ Persist the scaler object to ``filepath``.

     Raises ValueError when ``self.scaler`` is falsy (not trained yet);
     otherwise hands the scaler to ``save_to_disk``, forwarding
     ``overwrite``. """
     scaler = self.scaler
     if scaler:
         save_to_disk(filepath, scaler, overwrite=overwrite)
         return
     raise ValueError(
         "Can't save the scaler, it has not been trained yet"
     )
Esempio n. 7
0
 def save_word2vec_model(self, filepath, overwrite=False):
     """ Write the word2vec model to ``filepath`` through ``save_to_disk``.

     A falsy ``self.word2vec_model`` (not trained yet) is rejected with a
     ValueError before anything is written. """
     if not self.word2vec_model:
         message = ("Can't save the word2vec model, "
                    "it has not been trained yet")
         raise ValueError(message)

     save_to_disk(filepath, self.word2vec_model, overwrite=overwrite)
Esempio n. 8
0
 def save_scaler(self, filepath, overwrite=False):
     """ Write the scaler object to ``filepath`` through ``save_to_disk``.

     A falsy ``self.scaler`` (not trained yet) is rejected with a ValueError
     before anything is written. """
     if not self.scaler:
         message = ("Can't save the scaler, "
                    "it has not been trained yet")
         raise ValueError(message)

     save_to_disk(filepath, self.scaler, overwrite=overwrite)
Esempio n. 9
0
 def save_word2vec_model(self, filepath, overwrite=False):
     """ Write ``self.word2vec_model`` to ``filepath`` via ``save_to_disk``,
     forwarding the ``overwrite`` flag unchanged. """
     model = self.word2vec_model
     save_to_disk(filepath, model, overwrite=overwrite)
Esempio n. 10
0
 def save_scaler(self, filepath, overwrite=False):
     """ Write ``self.scaler`` to ``filepath`` via ``save_to_disk``,
     forwarding the ``overwrite`` flag unchanged. """
     scaler = self.scaler
     save_to_disk(filepath, scaler, overwrite=overwrite)
Esempio n. 11
0
def batch_train(
    trainset_dir,
    testset_dir,
    nb_epochs=NB_EPOCHS,
    batch_size=BATCH_SIZE,
    ontology_path=ONTOLOGY_PATH,
    model_path=MODEL_PATH,
    recreate_ontology=False,
    word2vec_path=WORD2VEC_MODELPATH,
    verbose=True,
):
    """
    Train the model in mini-batches and save the best-scoring version.

    After every epoch the model is evaluated with ``test`` on the test set,
    all returned metrics are printed, and the model is written to
    ``model_path`` whenever its ``'map'`` metric beats the best value so far.

    :param trainset_dir: path to the directory with the training set
    :param testset_dir: path to the directory with the test set
    :param nb_epochs: number of passes over the training set
    :param batch_size: the size of a single batch
    :param ontology_path: path to the ontology file
    :param model_path: path to the pickled LearningModel object
    :param word2vec_path: path to the gensim word2vec model
    :param recreate_ontology: boolean flag whether to recreate the ontology
    :param verbose: whether to print progress and computation times

    :return: None; exceptions raised by the helpers propagate to the caller
    """
    # Local imports: islice for batching, default_timer as the replacement
    # for time.clock(), which was removed in Python 3.8.
    from itertools import islice
    from timeit import default_timer

    ontology = get_ontology(path=ontology_path, recreate=recreate_ontology, verbose=False)

    global_index = build_global_frequency_index(trainset_dir, verbose=False)
    word2vec_model = Word2Vec.load(word2vec_path)
    model = LearningModel(global_index, word2vec_model)
    previous_best = -1

    # range / islice / items() / str writes work on both Python 2 and 3,
    # unlike xrange / generator.next() / iteritems() / writing bytes.
    for epoch in range(nb_epochs):
        doc_generator = get_documents(
            trainset_dir,
            as_generator=True,
            shuffle=True,
        )
        epoch_start = default_timer()

        if verbose:
            print("Epoch {}".format(epoch + 1), end=' ')

        while True:
            batch = list(islice(doc_generator, batch_size))
            if not batch:
                # Generator exhausted: the epoch is complete.
                break

            x, y = build_train_matrices(batch, model, trainset_dir, ontology)

            # Normalize features
            x = model.maybe_fit_and_scale(x)

            # Train the model
            model.partial_fit_classifier(x, y)

            if verbose:
                # One dot per processed batch as a progress indicator.
                sys.stdout.write('.')
                sys.stdout.flush()

            if len(batch) < batch_size:
                # A short batch means the generator ran dry mid-batch.
                break

        if verbose:
            print(" {0:.2f}s".format(default_timer() - epoch_start))

        metrics = test(
            testset_dir,
            model=model,
            ontology=ontology,
            verbose=False
        )

        for k, v in metrics.items():
            print("{0}: {1}".format(k, v))

        # Keep only the best model according to the 'map' metric.
        if metrics['map'] > previous_best:
            previous_best = metrics['map']
            save_to_disk(model_path, model, overwrite=True)