Example 1
import gc
import os

# LDA and Database are project-local classes; the module names below are
# assumptions, adjust them to match the actual package layout.
from lda import LDA
from database import Database


def test_lda(model_file, dict_file, dbs_dir):
    """Test a saved LDA model on every database stored in dbs_dir and
    print the per-database results.

    Args:
        model_file (str): path of the saved model file to test
        dict_file (str): path to load the dictionary from
        dbs_dir (str): directory containing pickled Database files
    """

    assert os.path.isdir(dbs_dir), "Invalid data directory path"
    lda = LDA()
    print('Loading existing dictionary...')
    lda.load_dict_from_disk(dict_file)
    test_results = list()
    # Iterate over all saved databases and test the model on each
    for root, dirs, files in os.walk(dbs_dir):
        # Iterate over the database files in this directory
        for d in files:
            db = Database()
            # Load the database object from its saved file
            db.load_from_disk(os.path.join(root, d))

            # Add database to the model
            lda.add_database(db)
            # Test the model on this database
            test_results.append(lda.test(model_file, db_name=db.get_name()))
            lda.remove_database(db.get_name())

            del db
            gc.collect()

    # Print test results
    for idx, result in enumerate(test_results):
        print('Test results for database {}'.format(idx))
        for topic in result[0]:
            print('Topic: {} has probability: {}'.format(topic[0], topic[1]))
        for topic_idx, coherence in enumerate(result[1]):
            print('Topic {} has topic-coherence score: {}'.format(
                topic_idx, coherence[1]))

    # Show the topics of the final model
    print(lda.model.show_topics())
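A minimal invocation sketch for this example (not part of the original code; every path is a hypothetical placeholder for artifacts saved by a training run such as Example 2):

# Hypothetical invocation; adjust the placeholder paths to your own artifacts.
if __name__ == '__main__':
    test_lda(
        model_file='./models/final20',   # checkpoint saved by run_lda (assumed)
        dict_file='./dict/dictionary',   # dictionary saved by run_lda
        dbs_dir='./databases',           # directory of pickled Database files
    )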
Example 2
import gc
import os

# LDA and Database are project-local classes; the module names below are
# assumptions, adjust them to match the actual package layout.
from lda import LDA
from database import Database


def run_lda(data_dir, num_topics, use_mini_batches, batch_size, epochs,
            model_file, create_dict, dict_file, load_dbs):
    """Train an LDA model on every database found under data_dir and save
    intermediate and final model checkpoints.

    Args:
        data_dir (str): directory containing the data directory/directories
        num_topics (int): number of topics to train the model on
        use_mini_batches (bool): if true, train on mini-batches of documents
        batch_size (int): size of the mini-batches used to train the model
        epochs (int): number of epochs to train for on the training set
        model_file (str): saved model file to continue training from
        create_dict (bool): create the dictionary from the data instead of
            loading it from dict_file
        dict_file (str): path to load the dictionary from
        load_dbs (bool): if true, load databases from saved pickle files
    """

    assert os.path.isdir(data_dir), "Invalid data directory path"

    # Only load from the model file once, on the first training call
    use_model_file = bool(model_file)

    # Create model
    lda = LDA(num_topics=num_topics)
    if create_dict:
        print('Creating dictionary from data')
        # Create word-to-id mapping for all texts
        lda.create_dict(data_dir)
        lda.store_dict_to_disk('./dict/dictionary')
    else:
        print('Loading existing dictionary...')
        lda.load_dict_from_disk(dict_file)

    # Iterate over all data and train the model
    for root, dirs, files in os.walk(data_dir):
        if load_dbs:
            print('Training will be done on existing databases')
            entries = files
        else:
            print('Training will be done after creating databases from text files')
            entries = dirs
        # Iterate over the entries in this directory
        for d in entries:
            if not load_dbs:
                # Create a database object from the raw text directory
                db = Database(d, os.path.abspath(os.path.join(root, d)))
            else:
                db = Database()
                # Load the database object from its saved file
                db.load_from_disk(os.path.join(root, d))

            # Add database to the model
            lda.add_database(db)

            if use_model_file:
                # Load model parameters from the model file and resume training
                lda.train(model_file,
                          db_name=db.get_name(),
                          use_mini_batches=use_mini_batches,
                          use_internal_dict=True,
                          batch_size=batch_size,
                          num_epochs=epochs)
                # Load the saved model only once; later databases keep
                # training the same in-memory model
                use_model_file = False
            else:
                # Continue training the in-memory model
                lda.train(db_name=db.get_name(),
                          use_mini_batches=use_mini_batches,
                          use_internal_dict=True,
                          batch_size=batch_size,
                          num_epochs=epochs)
            if not load_dbs:
                # Persist the newly built database so it can be reloaded later
                db.store_to_disk('./databases/' + d)

            # Drop the database from the model and free its memory
            lda.remove_database(db.get_name())
            del db
            gc.collect()
            # Checkpoint the model after each database
            tmp_file = './models/' + d + str(num_topics)
            lda.save_model(tmp_file)

    # Save the final model
    file_name = './models/final' + str(num_topics)
    lda.save_model(file_name)
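A minimal invocation sketch for this example (not part of the original code; every argument value is an illustrative placeholder):

# Hypothetical invocation; all values below are placeholders.
if __name__ == '__main__':
    run_lda(
        data_dir='./data',       # one sub-directory of text files per database
        num_topics=20,
        use_mini_batches=True,
        batch_size=256,
        epochs=5,
        model_file=None,         # no saved model: train from scratch
        create_dict=True,        # build and save the dictionary from the data
        dict_file=None,          # unused when create_dict is True
        load_dbs=False,          # build databases from the text directories
    )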