Example #1
def main(checkpoint, data_count, data_cols, should_train, nb_epoch, null_pct, try_reuse_data, batch_size, execution_config):
    maxlen = 20
    max_cells = 500

    # set this boolean to True to print debug messages (also terminate on *any* exception)
    DEBUG = True

    checkpoint_dir = "checkpoints/"
    
    # basic preliminaries for database calls
    store_name = 'nktraining'
    adl = client.get_adl_client(store_name)
    files = adl.ls('training-data/CKAN')
    random.shuffle(files)
    cnxn = gc.getConnection()
    cursor = cnxn.cursor()
    
    # make required database calls
    out, out_array_header = FetchLabeledDataFromDatabase(max_cells, cursor, adl, False)
    cnxn.close()
    
    with open('Categories_base_stat_geo.txt','r') as f:
        Categories = f.read().splitlines()
    Categories = sorted(Categories)
    category_count = len(Categories)
    nx,ny = out.shape
    if(DEBUG):
        # orient the user a bit
        print("read data is: ")
        print(out)
        print("fixed categories are: ")
        print(Categories)
        print("The size of the read raw data is %d rows by %d columns"%(nx,ny))
        print("corresponding header length is: %d"%len(out_array_header))
        #print(out_array_header)
    
    # specify data for the transfer-learning experiment
    raw_data = out[0:max_cells,:] #truncate the rows (max_cells)
    header = out_array_header
    
    ## ADD GEODATA
    import logging
    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger(__name__)

    # read-in relevant data
    in_file_postal = 'data/geodata/allCountries_postal.txt'
    reader_postal = GeonamesCountriesTxtFileReader(in_file_postal)
    df_postal = reader_postal.read_csv_postal()
    
    # read-in country codes
    in_file_cc = 'data/geodata/country_codes_csv.csv'
    df_countries = pd.read_csv(in_file_cc,dtype=str)

    # now, build up geo-data training data set
    N_SAMPLES = 500 # number of samples for each additional category
    
    start = time.time()
    data = np.asarray(df_countries['Name'].sample(n=max_cells,replace=True))[np.newaxis].T # draw first random column
    data_header = list([['country','text'],]) # start with the 'country' category
    for i in np.arange(1,N_SAMPLES): # draw remaining random columns
        #print(data)
        data = np.concatenate((data,np.asarray(df_countries['Name'].sample(n=max_cells,replace=True))[np.newaxis].T),axis=1)
        data_header.append(list(['country','text']))
    # now, handle the remaining new geo categories
    NewCategories = list(['state','city','postal_code','latitude','longitude','country_code'])
    for category in NewCategories:
        for i in np.arange(N_SAMPLES):
            data = np.concatenate((data,np.asarray(df_postal[category].sample(n=max_cells,replace=True))[np.newaxis].T),axis=1)
            if((category=='latitude') or (category=='longitude')):
                data_header.append(list([category,'float']))
            elif(category=='postal_code'):
                # label as 'int' here (in some cases 'text' may be more appropriate, but 'int' is used for now)
                data_header.append(list([category,'int']))
            else :
                data_header.append(list([category,'text']))
    
    if(DEBUG):
        print("DEBUG::the shape of geo data is:")
        print(data.shape)
        print("DEBUG::the length of the corresponding geo data header is:")
        print(len(data_header))
        print("DEBUG::the time elapsed to build the geo data set is (sec):")
        print(time.time()-start)
        print("DEBUG::merging geo data with datalake data... ")
    
    raw_data = np.column_stack((raw_data,data))
    header.extend(data_header)
    
    if(DEBUG):
        print("DEBUG::done!!")
        print("DEBUG::the shape of final merged data is:")
        print(raw_data.shape)
        print("DEBUG::the length of final merged data header is:")
        print(len(header))
    
    ## FINISHED ADDING GEODATA
    
    ## LABEL COMBINED DATA AS CATEGORICAL/ORDINAL
    start_time_guess = time.time()
    guesses = []
    print("Beginning Guessing categorical/ordinal for geo+datalake data...")
    category_count = 0
    ordinal_count = 0
    for i in np.arange(raw_data.shape[1]):
        tmp = guess(raw_data[:,i], for_types ='category')
        if tmp[0]=='category':
            category_count += 1
            header[i].append('categorical')
            if ('int' in header[i]) or ('float' in header[i]) \
                or ('datetime' in header[i]):
                    ordinal_count += 1
                    header[i].append('ordinal')
        guesses.append(tmp)
    
    if(DEBUG):
        #print(guesses)
        #print(len(guesses))
        print("DEBUG::The number of categorical columns is %d"%category_count)
        print("DEBUG::The number of ordinal columns is %d"%ordinal_count)
        #print(header)
        
    elapsed_time = time.time()-start_time_guess
    print("Total guessing time is : %.2f sec" % elapsed_time)
    ## FINISHED LABELING COMBINED DATA AS CATEGORICAL/ORDINAL
    
    # transpose the data
    raw_data = np.char.lower(np.transpose(raw_data).astype('U'))
    
    # do other processing and encode the data
    if execution_config is None:
        raise TypeError
        
    Classifier = Simon(encoder={}) # dummy text classifier
    config = Classifier.load_config(execution_config, checkpoint_dir)
    encoder = config['encoder']
    checkpoint = config['checkpoint']
    
    encoder.categories=Categories
    category_count = len(Categories) # reset this variable, since it was reused for the categorical-column count above
    
    # build classifier model    
    Classifier = Simon(encoder=encoder) # text classifier for unit test    
    model = Classifier.generate_transfer_model(maxlen, max_cells, category_count-9, category_count, checkpoint, checkpoint_dir)
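    # (the "category_count-9" argument presumably matches the output size of the pretrained
    # checkpoint, i.e. the label count before the new geo categories were added; this is
    # inferred from context rather than stated in the snippet)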
    
    model_compile = lambda m: m.compile(loss='binary_crossentropy',
                  optimizer='adam', metrics=['binary_accuracy'])
    model_compile(model)
    
    # encode the data and evaluate model
    X, y = encoder.encode_data(raw_data, header, maxlen)
    if(DEBUG):
        print("DEBUG::y is:")
        print(y)
        print("DEBUG::The encoded labels (first row) are:")
        print(y[0,:])
        
    data = Classifier.setup_test_sets(X, y)
    
    ## Build p_threshold per class
    # p_threshold = np.sum(data.y_train,axis=0)*1/(data.y_train.shape[0])
    p_threshold = 0.5 # you could also use a fixed scalar
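    # (the commented-out line above would instead derive one threshold per class from the
    # label frequencies in y_train, rather than using a single fixed scalar)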
    
    if(DEBUG):
        print("DEBUG::per class p_threshold is:")
        print(p_threshold)

    max_cells = encoder.cur_max_cells
    
    start = time.time()
    history = Classifier.train_model(batch_size, checkpoint_dir, model, nb_epoch, data)
    end = time.time()
    print("Time for training is %f sec"%(end-start))
    
    config = { 'encoder' :  encoder,
               'checkpoint' : Classifier.get_best_checkpoint(checkpoint_dir) }
    Classifier.save_config(config, checkpoint_dir)
    Classifier.plot_loss(history) #comment out on docker images...
    
    Classifier.evaluate_model(max_cells, model, data, encoder, p_threshold)
Example #2
def main(checkpoint, data_count, data_cols, should_train, nb_epoch, null_pct, try_reuse_data, batch_size, execution_config):
    maxlen = 20
    max_cells = 500
    p_threshold = 0.5

    # set this boolean to True to print debug messages (also terminate on *any* exception)
    DEBUG = False

    checkpoint_dir = "checkpoints/"
    
    # basic preliminaries for database calls
    store_name = 'nktraining'
    adl = client.get_adl_client(store_name)
    files = adl.ls('training-data/CKAN')
    random.shuffle(files)
    cnxn = gc.getConnection()
    cursor = cnxn.cursor()
    
    # make required database calls
    out, out_array_header = FetchLabeledDataFromDatabase(max_cells, cursor, adl, DEBUG)
    cnxn.close()
    
    with open('Categories_base.txt','r') as f:
        Categories = f.read().splitlines()
    Categories = sorted(Categories)
    category_count = len(Categories)
    nx,ny = out.shape
    if(DEBUG):
        # orient the user a bit
        print("read data is: ")
        print(out)
        print("fixed categories are: ")
        print(Categories)
        print("The size of the read raw data is %d rows by %d columns"%(nx,ny))
        print("corresponding header length is: %d"%len(out_array_header))
        #print(out_array_header)
    
    # specify data for the transfer-learning experiment
    raw_data = out[0:max_cells,:] #truncate the rows (max_cells)
    header = out_array_header
    
    # transpose the data
    raw_data = np.char.lower(np.transpose(raw_data).astype('U'))
    
    # do other processing and encode the data
    if execution_config is None:
        raise TypeError
        
    Classifier = Simon(encoder={}) # dummy text classifier
    config = Classifier.load_config(execution_config, checkpoint_dir)
    encoder = config['encoder']
    checkpoint = config['checkpoint']
    
    encoder.categories=Categories
    
    # build classifier model    
    Classifier = Simon(encoder=encoder) # text classifier for unit test    
    model = Classifier.generate_transfer_model(maxlen, max_cells, category_count, category_count, checkpoint, checkpoint_dir)
    
    model_compile = lambda m: m.compile(loss='binary_crossentropy',
                  optimizer='adam', metrics=['binary_accuracy'])
    model_compile(model)
    
    # encode the data and evaluate model
    X, y = encoder.encode_data(raw_data, header, maxlen)
    if(DEBUG):
        print("DEBUG::y is:")
        print(y)
        print("DEBUG::The encoded labels (first row) are:")
        print(y[0,:])
        
    data = Classifier.setup_test_sets(X, y)

    max_cells = encoder.cur_max_cells
    
    start = time.time()
    history = Classifier.train_model(batch_size, checkpoint_dir, model, nb_epoch, data)
    end = time.time()
    print("Time for training is %f sec"%(end-start))
    
    config = { 'encoder' :  encoder,
               'checkpoint' : Classifier.get_best_checkpoint(checkpoint_dir) }
    Classifier.save_config(config, checkpoint_dir)
    Classifier.plot_loss(history) #comment out on docker images...
    
    Classifier.evaluate_model(max_cells, model, data, encoder, p_threshold)
Example #3
def main(checkpoint, data_count, data_cols, should_train, nb_epoch, null_pct,
         try_reuse_data, batch_size, execution_config):
    maxlen = 20
    max_cells = 500
    p_threshold = 0.5  # threshold for positive probability of a label

    # set this boolean to True to print debug messages (also terminate on *any* exception)
    DEBUG = False

    checkpoint_dir = "checkpoints/"
    if not os.path.isdir(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    # basic preliminaries for database calls
    store_name = 'nktraining'
    adl = client.get_adl_client(store_name)
    files = adl.ls('training-data/CKAN')
    random.shuffle(files)
    cnxn = gc.getConnection()
    cursor = cnxn.cursor()

    # make required database calls
    out, out_array_header = FetchLabeledDataFromDatabase(
        max_cells, cursor, adl, DEBUG)
    cnxn.close()

    # orient the user a bit
    print("read clustered data is: ")
    print(out)
    print("fixed categories are: ")
    with open('Categories.txt', 'r') as f:
        Categories = f.read().splitlines()
    Categories = sorted(Categories)
    print(Categories)

    nx, ny = out.shape
    print("The size of the read raw data is %d rows by %d columns" % (nx, ny))

    # specify datalake data for the transfer-learning experiment
    raw_data = out[0:max_cells, :]  #truncate the rows (max_cells)
    header = out_array_header

    #read "post-processed" header from file!
    #    with open('datalake_labels','r',newline='\n') as myfile:
    #        reader = csv.reader(myfile, delimiter=',')
    #        header = []
    #        for row in reader:
    #            header.append(row)
    # OR header is out_array_header

    print("corresponding header length is: %d" % len(out_array_header))
    #print(header)

    ## ADD GEODATA
    import logging
    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger(__name__)

    # read-in relevant data
    in_file_postal = 'data/geodata/allCountries_postal.txt'
    reader_postal = GeonamesCountriesTxtFileReader(in_file_postal)
    df_postal = reader_postal.read_csv_postal()

    # read-in country codes
    in_file_cc = 'data/geodata/country_codes_csv.csv'
    df_countries = pd.read_csv(in_file_cc, dtype=str)

    # now, build up geo-data training data set
    N_SAMPLES = 1000  # number of samples for each additional category

    start = time.time()
    data = np.asarray(df_countries['Name'].sample(
        n=max_cells, replace=True))[np.newaxis].T  # draw first random column
    data_header = list([
        ['country', 'text'],
    ])  # start with the 'country' category
    for i in np.arange(1, N_SAMPLES):  # draw remaining random columns
        #print(data)
        data = np.concatenate(
            (data,
             np.asarray(df_countries['Name'].sample(
                 n=max_cells, replace=True))[np.newaxis].T),
            axis=1)
        data_header.append(list(['country', 'text']))
    # now, handle the remaining new geo categories
    NewCategories = list([
        'state', 'city', 'postal_code', 'latitude', 'longitude', 'country_code'
    ])
    for category in NewCategories:
        for i in np.arange(N_SAMPLES):
            data = np.concatenate(
                (data,
                 np.asarray(df_postal[category].sample(
                     n=max_cells, replace=True))[np.newaxis].T),
                axis=1)
            if ((category == 'latitude') or (category == 'longitude')):
                data_header.append(list([category, 'float']))
            elif (category == 'postal_code'):
                # label as 'int' here (in some cases 'text' may be more appropriate, but 'int' is used for now)
                data_header.append(list([category, 'int']))
            else:
                data_header.append(list([category, 'text']))

    print("DEBUG::the shape of geo data is:")
    print(data.shape)
    print("DEBUG::the length of the corresponding geo data header is:")
    print(len(data_header))
    print("DEBUG::the time elapsed to build the geo data set is (sec):")
    print(time.time() - start)

    print("DEBUG::merging geo data with datalake data... ")
    raw_data = np.column_stack((raw_data, data))
    header.extend(data_header)
    print("DEBUG::done!!")

    print("DEBUG::the shape of final merged data is:")
    print(raw_data.shape)
    print("DEBUG::the length of final merged data header is:")
    print(len(header))
    ## FINISHED ADDING GEODATA

    ## LABEL COMBINED DATA AS CATEGORICAL/ORDINAL
    start_time_guess = time.time()
    guesses = []
    print("Beginning Guessing categorical/ordinal for geo+datalake data...")
    category_count = 0
    ordinal_count = 0
    for i in np.arange(raw_data.shape[1]):
        tmp = guess(raw_data[:, i], for_types='category')
        if tmp[0] == 'category':
            category_count += 1
            header[i].append('categorical')
            if ('int' in header[i]) or ('float' in header[i]) \
                or ('datetime' in header[i]):
                ordinal_count += 1
                header[i].append('ordinal')
        guesses.append(tmp)

    print(guesses)
    print(len(guesses))
    print("DEBUG::The number of categorical columns is %d" % category_count)
    print("DEBUG::The number of ordinal columns is %d" % ordinal_count)
    print(header)
    elapsed_time = time.time() - start_time_guess
    print("Total guessing time is : %.2f sec" % elapsed_time)
    ## FINISHED LABELING COMBINED DATA AS CATEGORICAL/ORDINAL

    # load checkpoint from faker data
    config = {}
    if execution_config is None:
        raise TypeError
    config = load_config(execution_config, checkpoint_dir)
    encoder = config['encoder']
    if checkpoint is None:
        checkpoint = config['checkpoint']

    # enable training mode for transfer-learning experiment
    should_train = True

    # transpose the data
    raw_data = np.char.lower(np.transpose(raw_data).astype('U'))

    # encode the data
    X, y = encoder.encode_data(raw_data, header, maxlen)

    print("DEBUG::The encoded labels (one row) are:")
    #print(y[0,:])
    #print(y)

    max_cells = encoder.cur_max_cells
    data = None
    if should_train:
        data = setup_test_sets(X, y)
    else:
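        # when training is skipped, wrap X/y in a minimal anonymous object exposing only the
        # X_test/y_test attributes expected by the evaluation code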
        data = type('data_type', (object, ), {'X_test': X, 'y_test': y})

    print('Sample chars in X:{}'.format(X[2, 0:10]))
    print('y:{}'.format(y[2]))

    # need to know number of fixed categories to create model
    category_count = y.shape[1]
    print('Number of fixed categories is :')
    print(category_count)

    #model = generate_model(maxlen, max_cells, category_count-2)
    max_len = maxlen
    filter_length = [1, 3, 3]
    nb_filter = [40, 200, 1000]
    pool_length = 2
    # document input
    document = Input(shape=(max_cells, max_len), dtype='int64')
    # sentence input
    in_sentence = Input(shape=(max_len, ), dtype='int64')
    # char indices to one hot matrix, 1D sequence to 2D
    embedded = Lambda(binarize, output_shape=binarize_outshape)(in_sentence)
    # embedded: encodes sentence
    for i in range(len(nb_filter)):
        embedded = Convolution1D(nb_filter=nb_filter[i],
                                 filter_length=filter_length[i],
                                 border_mode='valid',
                                 activation='relu',
                                 init='glorot_normal',
                                 subsample_length=1)(embedded)

        embedded = Dropout(0.1)(embedded)
        embedded = MaxPooling1D(pool_length=pool_length)(embedded)

    forward_sent = LSTM(256,
                        return_sequences=False,
                        dropout_W=0.2,
                        dropout_U=0.2,
                        consume_less='gpu')(embedded)
    backward_sent = LSTM(256,
                         return_sequences=False,
                         dropout_W=0.2,
                         dropout_U=0.2,
                         consume_less='gpu',
                         go_backwards=True)(embedded)

    sent_encode = merge([forward_sent, backward_sent],
                        mode='concat',
                        concat_axis=-1)
    sent_encode = Dropout(0.3)(sent_encode)
    # sentence encoder

    sentence_encoder = Model(input=in_sentence, output=sent_encode)

    print(sentence_encoder.summary())
    encoded = TimeDistributed(sentence_encoder)(document)

    # encoded: sentences to bi-lstm for document encoding
    forwards = LSTM(128,
                    return_sequences=False,
                    dropout_W=0.2,
                    dropout_U=0.2,
                    consume_less='gpu')(encoded)
    backwards = LSTM(128,
                     return_sequences=False,
                     dropout_W=0.2,
                     dropout_U=0.2,
                     consume_less='gpu',
                     go_backwards=True)(encoded)

    merged = merge([forwards, backwards], mode='concat', concat_axis=-1)
    output_pre = Dropout(0.3)(merged)
    output_pre = Dense(128, activation='relu')(output_pre)
    output_pre = Dropout(0.3)(output_pre)
    output = Dense(category_count - 2 - 6 - 1,
                   activation='sigmoid')(output_pre)
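    # (the output size "category_count - 2 - 6 - 1" appears to be chosen to match the final
    # layer of the checkpoint loaded below; the exact offset is specific to that pretrained model)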
    # output = Activation('softmax')(output)
    model = Model(input=document, output=output)

    # having built model of previous size, load weights
    load_weights(checkpoint, config, model, checkpoint_dir)

    #having loaded weights,rebuild model using new category_count in last layer
    output = Dense(category_count, activation='sigmoid')(output_pre)
    model = Model(input=document, output=output)
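    # the shared layers keep the weights loaded above; only the new, wider output layer starts
    # from freshly initialized weights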

    # retrain the last layer
    for layer in model.layers[:7]:
        layer.trainable = False
    model.layers[8].trainable = True
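    # (note that layer index 7 is left at its default trainable setting; only indices 0-6 are
    # explicitly frozen here)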

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['binary_accuracy'])
    if (should_train):
        start = time.time()
        train_model(batch_size, checkpoint_dir, model, nb_epoch, data)
        end = time.time()
        print("Time for training is %f sec" % (end - start))

        evaluate_model(max_cells, model, data, encoder, p_threshold)

        config = {
            'encoder': encoder,
            'checkpoint': get_best_checkpoint(checkpoint_dir)
        }
        save_config(config, checkpoint_dir)
Example #4
import random
import pyodbc


def graphDoesntContainFile(filename, cnxn):
    cursor = cnxn.cursor()

    cursor.execute("SELECT top(1) * FROM datasets where name=?", filename)

    name = cursor.fetchone()

    return name is None


store_name = 'nktraining'
adl = client.get_adl_client(store_name)

files = adl.ls('training-data/CKAN')

random.shuffle(files)

cnxn = gc.getConnection()

i = 0
for file in files:
    if (i > 1000):
        break

    if graphDoesntContainFile(file, cnxn):
        try:
            with adl.open(file, blocksize=2**20) as f:
Example #5
def main(checkpoint, data_count, data_cols, should_train, nb_epoch, null_pct,
         try_reuse_data, batch_size, execution_config):
    maxlen = 20
    max_cells = 500
    p_threshold = 0.5

    # set this boolean to True to print debug messages (also terminate on *any* exception)
    DEBUG = False

    checkpoint_dir = "checkpoints/"

    # basic preliminaries for database calls
    store_name = 'nktraining'
    adl = client.get_adl_client(store_name)
    files = adl.ls('training-data/CKAN')
    random.shuffle(files)
    cnxn = gc.getConnection()
    cursor = cnxn.cursor()

    # make required database calls
    out, out_array_header = FetchLabeledDataFromDatabase(
        max_cells, cursor, adl, DEBUG)
    cnxn.close()

    with open('Categories_base_stat.txt', 'r') as f:
        Categories = f.read().splitlines()
    Categories = sorted(Categories)
    category_count = len(Categories)
    nx, ny = out.shape
    if (DEBUG):
        # orient the user a bit
        print("read data is: ")
        print(out)
        print("fixed categories are: ")
        print(Categories)
        print("The size of the read raw data is %d rows by %d columns" %
              (nx, ny))
        print("corresponding header length is: %d" % len(out_array_header))
        #print(out_array_header)

    # specify data for the transfer-learning experiment
    raw_data = out[0:max_cells, :]  #truncate the rows (max_cells)

    #read "post-processed" header from file, this includes categorical and ordinal classifications...
    #    with open('datalake_labels','r',newline='\n') as myfile:
    #        reader = csv.reader(myfile, delimiter=',')
    #        header = []
    #        for row in reader:
    #            header.append(row)
    # OR
    header = out_array_header

    ## LABEL COMBINED DATA AS CATEGORICAL/ORDINAL
    start_time_guess = time.time()
    guesses = []
    print("Beginning Guessing categorical/ordinal for datalake data...")
    category_count = 0
    ordinal_count = 0
    for i in np.arange(raw_data.shape[1]):
        tmp = guess(raw_data[:, i], for_types='category')
        if tmp[0] == 'category':
            category_count += 1
            header[i].append('categorical')
            if ('int' in header[i]) or ('float' in header[i]) \
                or ('datetime' in header[i]):
                ordinal_count += 1
                header[i].append('ordinal')
        guesses.append(tmp)

    if (DEBUG):
        #print(guesses)
        #print(len(guesses))
        print("DEBUG::The number of categorical columns is %d" %
              category_count)
        print("DEBUG::The number of ordinal columns is %d" % ordinal_count)
        #print(header)

    elapsed_time = time.time() - start_time_guess
    print("Total guessing time is : %.2f sec" % elapsed_time)
    ## FINISHED LABELING COMBINED DATA AS CATEGORICAL/ORDINAL

    # transpose the data
    raw_data = np.char.lower(np.transpose(raw_data).astype('U'))

    # do other processing and encode the data
    if execution_config is None:
        raise TypeError

    Classifier = Simon(encoder={})  # dummy text classifier
    config = Classifier.load_config(execution_config, checkpoint_dir)
    encoder = config['encoder']
    checkpoint = config['checkpoint']

    encoder.categories = Categories

    # build classifier model
    Classifier = Simon(encoder=encoder)  # text classifier for unit test
    model = Classifier.generate_transfer_model(maxlen, max_cells,
                                               category_count - 2,
                                               category_count, checkpoint,
                                               checkpoint_dir)

    model_compile = lambda m: m.compile(loss='categorical_crossentropy',
                                        optimizer='adam',
                                        metrics=['binary_accuracy'])
    model_compile(model)

    # encode the data and evaluate model
    X, y = encoder.encode_data(raw_data, header, maxlen)
    if (DEBUG):
        print("DEBUG::y is:")
        print(y)
        print("DEBUG::The encoded labels (first row) are:")
        print(y[0, :])

    data = Classifier.setup_test_sets(X, y)

    max_cells = encoder.cur_max_cells

    start = time.time()
    history = Classifier.train_model(batch_size, checkpoint_dir, model,
                                     nb_epoch, data)
    end = time.time()
    print("Time for training is %f sec" % (end - start))

    config = {
        'encoder': encoder,
        'checkpoint': Classifier.get_best_checkpoint(checkpoint_dir)
    }
    Classifier.save_config(config, checkpoint_dir)
    Classifier.plot_loss(history)  #comment out on docker images...

    Classifier.evaluate_model(max_cells, model, data, encoder, p_threshold)
def main(checkpoint, data_count, data_cols, should_train, nb_epoch, null_pct, try_reuse_data, batch_size, execution_config):
    maxlen = 20
    max_cells = 500

    # set this boolean to True to print debug messages (also terminate on *any* exception)
    DEBUG = False

    checkpoint_dir = "checkpoints/"
    if not os.path.isdir(checkpoint_dir):
        os.makedirs(checkpoint_dir)

        
    # basic preliminaries for database calls
    store_name = 'nktraining'
    adl = client.get_adl_client(store_name)
    files = adl.ls('training-data/CKAN')
    random.shuffle(files)
    cnxn = gc.getConnection()
    cursor = cnxn.cursor()
    
    # make required database calls
    out, out_array_header = FetchLabeledDataFromDatabase(max_cells, cursor, adl, DEBUG)
    cnxn.close()
    
    
    # orient the user a bit
    print("read clustered data is: ")
    print(out)
    print("fixed categories are: ")
    with open('Categories.txt','r') as f:
        Categories = f.read().splitlines()
    print(Categories)
    
    #label = input("Your best bet for the column label is? Select one of the fixed categories... : ")
    
    nx,ny = out.shape
    print("The size of the read raw data is %d rows by %d columns"%(nx,ny))
    print("corresponding header length is: %d"%len(out_array_header))
    #print(out_array_header)

    # specify data for the transfer-learning experiment
    raw_data = out[0:max_cells,:] #truncate the rows (max_cells)
    header = out_array_header

    # do other processing and encode the data
    if null_pct > 0:
        DataGenerator.add_nulls_uniform(raw_data, null_pct)
    config = {}
    if not should_train:
        if execution_config is None:
            raise TypeError
        config = load_config(execution_config, checkpoint_dir)
        encoder = config['encoder']
        if checkpoint is None:
            checkpoint = config['checkpoint']
    else:
        encoder = Encoder()
        encoder.process(raw_data, max_cells)
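        # when training from scratch, a fresh encoder is built from the raw data instead of
        # reusing one from a saved execution config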
        
    # enable training mode for transfer-learning experiment
    should_train = True
    
    # transpose the data
    raw_data = np.char.lower(np.transpose(raw_data).astype('U'))
    
    # encode the data 
    X, y = encoder.encode_data(raw_data, header, maxlen)
    
    print("DEBUG::The encoded labels (one row) are:")
    print(y[0,:])

    max_cells = encoder.cur_max_cells
    data = None
    if should_train:
        data = setup_test_sets(X, y)
    else:
        data = type('data_type', (object,), {'X_test': X, 'y_test':y})

    print('Sample chars in X:{}'.format(X[2, 0:10]))
    print('y:{}'.format(y[2]))
    
    # need to know number of fixed categories to create model
    category_count = y.shape[1] 
    print('Number of fixed categories is :')
    print(category_count)

    model = generate_model(maxlen, max_cells, category_count)


    load_weights(checkpoint, config, model, checkpoint_dir)
    
    # retrain the last layer
    for layer in model.layers[:7]:
        layer.trainable = False
    model.layers[8].trainable = True


    model.compile(loss='categorical_crossentropy',
                  optimizer='adam', metrics=['binary_accuracy'])
    if(should_train):
        start = time.time()
        train_model(batch_size, checkpoint_dir, model, nb_epoch, data)
        end = time.time()
        print("Time for training is %f sec"%(end-start))
        config = { 'encoder' :  encoder,
                   'checkpoint' : get_best_checkpoint(checkpoint_dir) }
        save_config(config, checkpoint_dir)
        
    evaluate_model(max_cells, model, data, encoder)