コード例 #1
0
def main(checkpoint, data_count, data_cols, should_train, nb_epoch, null_pct, try_reuse_data, batch_size, execution_config):
    """Transfer-learn a Simon classifier on datalake data augmented with geo columns.

    The function fetches labeled columns from the database, appends randomly
    sampled geo columns (country, state, city, postal_code, latitude,
    longitude, country_code), tags columns guessed to be categorical/ordinal,
    then builds a transfer model from a saved configuration, trains it, saves
    the new configuration and evaluates it.

    Parameters
    ----------
    checkpoint : ignored; overwritten by the checkpoint stored in the loaded
        configuration.
    data_count, data_cols, should_train, null_pct, try_reuse_data : not used
        by this function (kept for signature compatibility).
    nb_epoch : number of training epochs forwarded to ``train_model``.
    batch_size : batch size forwarded to ``train_model``.
    execution_config : name of the saved configuration handed to
        ``Simon.load_config``; must not be ``None``.

    Raises
    ------
    TypeError
        If ``execution_config`` is ``None``. Checked up front so that no
        database or sampling work is done for an unusable configuration.
    """
    # Fail fast: the original check only ran after all data had been fetched.
    if execution_config is None:
        raise TypeError("execution_config must not be None")

    maxlen = 20
    max_cells = 500

    # set this boolean to True to print debug messages (also terminate on *any* exception)
    DEBUG = True

    checkpoint_dir = "checkpoints/"

    # basic preliminaries for database calls
    store_name = 'nktraining'
    adl = client.get_adl_client(store_name)
    # NOTE(review): the shuffled listing is not used below; the calls are kept
    # because they touch the data lake and (presumably) the global RNG state.
    files = adl.ls('training-data/CKAN')
    random.shuffle(files)
    cnxn = gc.getConnection()
    try:
        cursor = cnxn.cursor()
        # make required database calls
        out, out_array_header = FetchLabeledDataFromDatabase(max_cells, cursor, adl, False)
    finally:
        # always release the connection, even when the fetch raises
        cnxn.close()

    with open('Categories_base_stat_geo.txt', 'r') as f:
        Categories = sorted(f.read().splitlines())
    category_count = len(Categories)
    nx, ny = out.shape
    if DEBUG:
        # orient the user a bit
        print("read data is: ")
        print(out)
        print("fixed categories are: ")
        print(Categories)
        print("The size of the read raw data is %d rows by %d columns"%(nx,ny))
        print("corresponding header length is: %d"%len(out_array_header))

    # specify data for the transfer-learning experiment
    raw_data = out[0:max_cells, :]  # truncate the rows (max_cells)
    header = out_array_header  # same list object: extended in place below

    ## ADD GEODATA
    import logging
    logging.basicConfig(level=logging.DEBUG)

    # read-in relevant data
    in_file_postal = 'data/geodata/allCountries_postal.txt'
    reader_postal = GeonamesCountriesTxtFileReader(in_file_postal)
    df_postal = reader_postal.read_csv_postal()

    # read-in country codes
    in_file_cc = 'data/geodata/country_codes_csv.csv'
    df_countries = pd.read_csv(in_file_cc, dtype=str)

    # now, build up geo-data training data set
    N_SAMPLES = 500  # number of sampled columns for each additional category

    start = time.time()
    # Collect the sampled columns in a list and stack them once at the end;
    # growing the matrix with np.concatenate per column copies it every time.
    geo_columns = []
    data_header = []
    for _ in range(N_SAMPLES):  # 'country' columns come first
        geo_columns.append(np.asarray(df_countries['Name'].sample(n=max_cells, replace=True)))
        data_header.append(['country', 'text'])

    # now, handle the remaining new geo categories
    NewCategories = ['state', 'city', 'postal_code', 'latitude', 'longitude', 'country_code']
    for category in NewCategories:
        if category in ('latitude', 'longitude'):
            type_label = 'float'
        elif category == 'postal_code':
            # label as 'int' where appropriate (one may also need to label as
            # 'text' sometimes as well, but going with this for now)
            type_label = 'int'
        else:
            type_label = 'text'
        for _ in range(N_SAMPLES):
            geo_columns.append(np.asarray(df_postal[category].sample(n=max_cells, replace=True)))
            data_header.append([category, type_label])
    data = np.column_stack(geo_columns)

    if DEBUG:
        print("DEBUG::the shape of geo data is:")
        print(data.shape)
        print("DEBUG::the length of the corresponding geo data header is:")
        print(len(data_header))
        print("DEBUG::the time elapsed to build the geo data set is (sec):")
        print(time.time()-start)
        print("DEBUG::merging geo data with datalake data... ")

    raw_data = np.column_stack((raw_data, data))
    header.extend(data_header)

    if DEBUG:
        print("DEBUG::done!!")
        print("DEBUG::the shape of final merged data is:")
        print(raw_data.shape)
        print("DEBUG::the length of final merged data header is:")
        print(len(header))
    ## FINISHED ADDING GEODATA

    ## LABEL COMBINED DATA AS CATEGORICAL/ORDINAL
    start_time_guess = time.time()
    print("Beginning Guessing categorical/ordinal for geo+datalake data...")
    # Dedicated counters, so category_count keeps meaning len(Categories).
    categorical_columns = 0
    ordinal_columns = 0
    for i in range(raw_data.shape[1]):
        tmp = guess(raw_data[:, i], for_types='category')
        if tmp[0] == 'category':
            categorical_columns += 1
            header[i].append('categorical')
            # a categorical column with a numeric/datetime label is also ordinal
            if ('int' in header[i]) or ('float' in header[i]) \
                    or ('datetime' in header[i]):
                ordinal_columns += 1
                header[i].append('ordinal')

    if DEBUG:
        print("DEBUG::The number of categorical columns is %d"%categorical_columns)
        print("DEBUG::The number of ordinal columns is %d"%ordinal_columns)

    elapsed_time = time.time()-start_time_guess
    print("Total guessing time is : %.2f sec" % elapsed_time)
    ## FINISHED LABELING COMBINED DATA AS CATEGORICAL/ORDINAL

    # transpose the data (one row per column of the source tables), lower-cased
    raw_data = np.char.lower(np.transpose(raw_data).astype('U'))

    # load the saved configuration; its checkpoint replaces the argument
    Classifier = Simon(encoder={})  # dummy text classifier
    config = Classifier.load_config(execution_config, checkpoint_dir)
    encoder = config['encoder']
    checkpoint = config['checkpoint']

    encoder.categories = Categories

    # build classifier model
    Classifier = Simon(encoder=encoder)  # text classifier for unit test
    # NOTE(review): category_count-9 is presumably the class count of the
    # checkpointed model before the new categories were added -- confirm.
    model = Classifier.generate_transfer_model(maxlen, max_cells, category_count-9, category_count, checkpoint, checkpoint_dir)

    model.compile(loss='binary_crossentropy',
                  optimizer='adam', metrics=['binary_accuracy'])

    # encode the data and evaluate model
    X, y = encoder.encode_data(raw_data, header, maxlen)
    if DEBUG:
        print("DEBUG::y is:")
        print(y)
        print("DEBUG::The encoded labels (first row) are:")
        print(y[0,:])

    data = Classifier.setup_test_sets(X, y)

    ## Build p_threshold per class
    # p_threshold = np.sum(data.y_train,axis=0)*1/(data.y_train.shape[0])
    p_threshold = 0.5  # you could also use a fixed scalar

    if DEBUG:
        print("DEBUG::per class p_threshold is:")
        print(p_threshold)

    max_cells = encoder.cur_max_cells

    start = time.time()
    history = Classifier.train_model(batch_size, checkpoint_dir, model, nb_epoch, data)
    end = time.time()
    print("Time for training is %f sec"%(end-start))

    config = { 'encoder' :  encoder,
               'checkpoint' : Classifier.get_best_checkpoint(checkpoint_dir) }
    Classifier.save_config(config, checkpoint_dir)
    Classifier.plot_loss(history)  # comment out on docker images...

    Classifier.evaluate_model(max_cells, model, data, encoder, p_threshold)
コード例 #2
0
def main(execution_config, DEBUG):
    """Run a pretrained Simon model on a unit-test CSV and print its labels.

    Loads the saved configuration, encodes the CSV columns, predicts the
    semantic-type labels, then appends 'categorical' / 'ordinal' labels (each
    with probability 1) to columns that ``guess`` identifies as categorical.

    Parameters
    ----------
    execution_config : name of the saved configuration handed to
        ``Simon.load_config``; must not be ``None``.
    DEBUG : when truthy, print extra progress information.

    Raises
    ------
    TypeError
        If ``execution_config`` is ``None`` (checked before any file I/O).
    """
    if execution_config is None:
        raise TypeError("execution_config must not be None")

    maxlen = 20
    max_cells = 500
    p_threshold = 0.5

    checkpoint_dir = "pretrained_models/"

    with open('Categories.txt', 'r') as f:
        Categories = sorted(f.read().splitlines())

    # orient the user a bit
    print("fixed categories are: ")
    print(Categories)
    category_count = len(Categories)

    # load specified execution configuration
    Classifier = Simon(encoder={})  # dummy text classifier
    config = Classifier.load_config(execution_config, checkpoint_dir)
    encoder = config['encoder']
    checkpoint = config['checkpoint']

    # read unit test data
    dataset_name = "replicate_eval_error"  # alternatively, o_38 or o_185
    if DEBUG:
        print("DEBUG::BEGINNING UNIT TEST...")
    frame = pd.read_csv('unit_test_data/' + dataset_name + '.csv',
                        dtype='str',
                        header=None)

    X = encoder.encodeDataFrame(frame)

    # build classifier model
    model = Classifier.generate_model(maxlen, max_cells, category_count)
    Classifier.load_weights(checkpoint, None, model, checkpoint_dir)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['binary_accuracy'])
    y = model.predict(X)
    # discard empty column edge case
    y[np.all(frame.isnull(), axis=0)] = 0

    # result[0][i] holds the labels of column i, result[1][i] their probabilities
    result = encoder.reverse_label_encode(y, p_threshold)

    ## LABEL COMBINED DATA AS CATEGORICAL/ORDINAL
    print("Beginning Guessing categorical/ordinal classifications...")
    start_time_guess = time.time()
    # DataFrame.as_matrix() no longer exists in pandas >= 1.0
    raw_data = frame.values
    for i in range(raw_data.shape[1]):
        tmp = guess(raw_data[:, i], for_types='category')
        if tmp[0] != 'category':
            continue
        labels = list(result[0][i])
        labels.append('categorical')
        result[1][i].append(1)
        # The type names live in the label list; the original looked for them
        # among the probabilities, so 'ordinal' could never be added.
        if ('int' in labels) or ('float' in labels) or ('datetime' in labels):
            labels.append('ordinal')
            result[1][i].append(1)
        result[0][i] = tuple(labels)
    elapsed_time = time.time() - start_time_guess
    print("Total statistical variable guessing time is : %.2f sec" %
          elapsed_time)
    ## FINISHED LABELING COMBINED DATA AS CATEGORICAL/ORDINAL
    print("The predicted classes and probabilities are respectively:")
    print(result)
コード例 #3
0
def main(checkpoint, data_count, data_cols, should_train, nb_epoch, null_pct,
         try_reuse_data, batch_size, execution_config):
    """Transfer-learn a Simon classifier on labeled datalake data.

    Fetches labeled columns from the database, tags columns guessed to be
    categorical/ordinal, builds a transfer model from a saved configuration,
    trains it, saves the new configuration and evaluates it.

    Parameters
    ----------
    checkpoint : ignored; overwritten by the checkpoint stored in the loaded
        configuration.
    data_count, data_cols, should_train, null_pct, try_reuse_data : not used
        by this function (kept for signature compatibility).
    nb_epoch : number of training epochs forwarded to ``train_model``.
    batch_size : batch size forwarded to ``train_model``.
    execution_config : name of the saved configuration handed to
        ``Simon.load_config``; must not be ``None``.

    Raises
    ------
    TypeError
        If ``execution_config`` is ``None``. Checked up front so that no
        database work is done for an unusable configuration.
    """
    if execution_config is None:
        raise TypeError("execution_config must not be None")

    maxlen = 20
    max_cells = 500
    p_threshold = 0.5

    # set this boolean to True to print debug messages (also terminate on *any* exception)
    DEBUG = False

    checkpoint_dir = "checkpoints/"

    # basic preliminaries for database calls
    store_name = 'nktraining'
    adl = client.get_adl_client(store_name)
    # NOTE(review): the shuffled listing is not used below; the calls are kept
    # because they touch the data lake and (presumably) the global RNG state.
    files = adl.ls('training-data/CKAN')
    random.shuffle(files)
    cnxn = gc.getConnection()
    try:
        cursor = cnxn.cursor()
        # make required database calls
        out, out_array_header = FetchLabeledDataFromDatabase(
            max_cells, cursor, adl, DEBUG)
    finally:
        # always release the connection, even when the fetch raises
        cnxn.close()

    with open('Categories_base_stat.txt', 'r') as f:
        Categories = sorted(f.read().splitlines())
    category_count = len(Categories)
    nx, ny = out.shape
    if DEBUG:
        # orient the user a bit
        print("read data is: ")
        print(out)
        print("fixed categories are: ")
        print(Categories)
        print("The size of the read raw data is %d rows by %d columns" %
              (nx, ny))
        print("corresponding header length is: %d" % len(out_array_header))

    # specify data for the transfer-learning experiment
    raw_data = out[0:max_cells, :]  # truncate the rows (max_cells)

    # The header comes straight from the database call; the loop below appends
    # the categorical/ordinal tags to its per-column label lists in place.
    header = out_array_header

    ## LABEL COMBINED DATA AS CATEGORICAL/ORDINAL
    start_time_guess = time.time()
    print("Beginning Guessing categorical/ordinal for datalake data...")
    # Dedicated counters: reusing category_count here used to clobber the
    # number of categories that sizes the model further down.
    categorical_columns = 0
    ordinal_columns = 0
    for i in range(raw_data.shape[1]):
        tmp = guess(raw_data[:, i], for_types='category')
        if tmp[0] == 'category':
            categorical_columns += 1
            header[i].append('categorical')
            # a categorical column with a numeric/datetime label is also ordinal
            if ('int' in header[i]) or ('float' in header[i]) \
                    or ('datetime' in header[i]):
                ordinal_columns += 1
                header[i].append('ordinal')

    if DEBUG:
        print("DEBUG::The number of categorical columns is %d" %
              categorical_columns)
        print("DEBUG::The number of ordinal columns is %d" % ordinal_columns)

    elapsed_time = time.time() - start_time_guess
    print("Total guessing time is : %.2f sec" % elapsed_time)
    ## FINISHED LABELING COMBINED DATA AS CATEGORICAL/ORDINAL

    # transpose the data (one row per column of the source tables), lower-cased
    raw_data = np.char.lower(np.transpose(raw_data).astype('U'))

    # load the saved configuration; its checkpoint replaces the argument
    Classifier = Simon(encoder={})  # dummy text classifier
    config = Classifier.load_config(execution_config, checkpoint_dir)
    encoder = config['encoder']
    checkpoint = config['checkpoint']

    encoder.categories = Categories

    # build classifier model
    Classifier = Simon(encoder=encoder)  # text classifier for unit test
    # NOTE(review): category_count - 2 is presumably the class count of the
    # checkpointed model before 'categorical'/'ordinal' were added -- confirm.
    model = Classifier.generate_transfer_model(maxlen, max_cells,
                                               category_count - 2,
                                               category_count, checkpoint,
                                               checkpoint_dir)

    # NOTE(review): sibling scripts compile with 'binary_crossentropy';
    # confirm that 'categorical_crossentropy' is intended here.
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['binary_accuracy'])

    # encode the data and evaluate model
    X, y = encoder.encode_data(raw_data, header, maxlen)
    if DEBUG:
        print("DEBUG::y is:")
        print(y)
        print("DEBUG::The encoded labels (first row) are:")
        print(y[0, :])

    data = Classifier.setup_test_sets(X, y)

    max_cells = encoder.cur_max_cells

    start = time.time()
    history = Classifier.train_model(batch_size, checkpoint_dir, model,
                                     nb_epoch, data)
    end = time.time()
    print("Time for training is %f sec" % (end - start))

    config = {
        'encoder': encoder,
        'checkpoint': Classifier.get_best_checkpoint(checkpoint_dir)
    }
    Classifier.save_config(config, checkpoint_dir)
    Classifier.plot_loss(history)  # comment out on docker images...

    Classifier.evaluate_model(max_cells, model, data, encoder, p_threshold)
コード例 #4
0
    def _produce_annotations(self, *, inputs: Inputs) -> Outputs:
        """
        Classify every column of the input frame with a pretrained Simon model.

        Parameters
        ----------
        inputs: Input pandas frame

        Returns
        -------
        Outputs
            A data frame with one row per input column and two columns:
            'semantic types' (list of predicted labels for that column) and
            'probabilities' (list of the matching prediction probabilities).
            When the 'statistical_classification' hyperparameter is set,
            columns guessed to be categorical additionally get 'categorical'
            (and 'ordinal' if they also carry an 'int', 'float' or 'datetime'
            label), each with probability 1.
        """
        frame = inputs

        # setup model as you typically would in a Simon main file
        maxlen = 20
        max_cells = 500
        p_threshold = 0.5

        checkpoint_dir = self.volumes["simon_models_1"]+"/pretrained_models/"

        # evaluated once; the original re-tested this for every column
        use_statistical = bool('statistical_classification' in self.hyperparams.keys()
                               and self.hyperparams['statistical_classification'])
        if use_statistical:
            execution_config = "Base.pkl"
            category_list = "/Categories.txt"
        else:
            execution_config = "Base_stat_geo.pkl"
            category_list = "/Categories_base_stat_geo.txt"
        with open(self.volumes["simon_models_1"]+ category_list,'r') as f:
            Categories = sorted(f.read().splitlines())

        # orient the user a bit
        print("fixed categories are: ")
        print(Categories)
        category_count = len(Categories)

        # load specified execution configuration
        Classifier = Simon(encoder={}) # dummy text classifier
        config = Classifier.load_config(execution_config, checkpoint_dir)
        encoder = config['encoder']
        checkpoint = config['checkpoint']

        X = encoder.encodeDataFrame(frame)

        # build classifier model
        model = Classifier.generate_model(maxlen, max_cells, category_count)
        Classifier.load_weights(checkpoint, None, model, checkpoint_dir)

        model.compile(loss='binary_crossentropy',
                      optimizer='adam', metrics=['binary_accuracy'])
        y = model.predict(X)
        # discard empty column edge case
        y[np.all(frame.isnull(),axis=0)]=0

        # result[0][i] holds the labels of column i, result[1][i] their probabilities
        result = encoder.reverse_label_encode(y,p_threshold)

        ## LABEL COMBINED DATA AS CATEGORICAL/ORDINAL
        # DataFrame.as_matrix() no longer exists in pandas >= 1.0
        raw_data = frame.values
        if use_statistical:
            print("Beginning Guessing categorical/ordinal classifications...")
        for i in range(raw_data.shape[1]):
            if use_statistical:
                tmp = guess(raw_data[:,i], for_types ='category')
                if tmp[0]=='category':
                    labels = list(result[0][i])
                    labels.append('categorical')
                    result[1][i].append(1)
                    # The type names live in the label list; the original
                    # searched the probabilities, so 'ordinal' was never added.
                    if ('int' in labels) or ('float' in labels) \
                            or ('datetime' in labels):
                        labels.append('ordinal')
                        result[1][i].append(1)
                    result[0][i] = labels
            result[0][i] = d3m_List(result[0][i])
            result[1][i] = d3m_List(result[1][i])
        if use_statistical:
            print("Done with statistical variable guessing")
        ## FINISHED LABELING COMBINED DATA AS CATEGORICAL/ORDINAL
        Classifier.clear_session()

        out_df = pandas.DataFrame.from_records(list(result)).T
        out_df.columns = ['semantic types','probabilities']
        return out_df
コード例 #5
0
    def _produce_annotations(self, *, inputs: Inputs) -> Outputs:
        """ generates dataframe with semantic type classifications and classification probabilities 
            for each column of original dataframe
        
        Arguments:
            inputs {Inputs} -- D3M dataframe
        
        Returns:
            Outputs -- dataframe with two columns: "semantic types" and "probabilities"
                       Each row represents a column in the original dataframe. The column 
                       "semantic types" contains a list of all semantic type labels and the
                       column "probabilities" contains a list of the model's confidence in
                       assigning each respective semantic type label. With the
                       "statistical_classification" hyperparameter set, columns guessed to be
                       categorical also receive "categorical" (and "ordinal" when they carry
                       an "int", "float" or "datetime" label), each with probability 1.
        """

        # load model checkpoint
        checkpoint_dir = (self.volumes["simon_models_1"] +
                          "/simon_models_1/pretrained_models/")
        if self.hyperparams["statistical_classification"]:
            execution_config = "Base.pkl"
            category_list = "/Categories.txt"
        else:
            execution_config = "Base_stat_geo.pkl"
            category_list = "/Categories_base_stat_geo.txt"
        with open(
                self.volumes["simon_models_1"] + "/simon_models_1" +
                category_list, "r") as f:
            Categories = f.read().splitlines()

        # create model object
        Classifier = Simon(encoder={})
        config = Classifier.load_config(execution_config, checkpoint_dir)
        encoder = config["encoder"]
        checkpoint = config["checkpoint"]
        model = Classifier.generate_model(self.hyperparams["max_chars"],
                                          self.hyperparams["max_rows"],
                                          len(Categories))
        Classifier.load_weights(checkpoint, None, model, checkpoint_dir)
        model.compile(loss="binary_crossentropy",
                      optimizer="adam",
                      metrics=["binary_accuracy"])

        # prepare data and make predictions (work on a copy of the input)
        frame = inputs.copy()
        prepped_data = encoder.encodeDataFrame(frame)
        preds = model.predict_on_batch(tf.constant(prepped_data))
        # decoded_preds[0][i]: labels of column i; decoded_preds[1][i]: probabilities
        decoded_preds = encoder.reverse_label_encode(
            preds, self.hyperparams["p_threshold"])

        # apply statistical / ordinal classification if desired
        if self.hyperparams["statistical_classification"]:
            logger.debug(
                "Beginning Guessing categorical/ordinal classifications...")
            raw_data = frame.values
            guesses = [
                guess(raw_data[:, i], for_types="category")
                for i in range(raw_data.shape[1])
            ]
            for i, g in enumerate(guesses):
                if g[0] == "category":
                    decoded_preds[0][i] += ("categorical", )
                    decoded_preds[1][i].append(1)
                    # The type names live in the label tuple; the original
                    # searched the probabilities, so "ordinal" was never added.
                    col_labels = decoded_preds[0][i]
                    if (("int" in col_labels) or ("float" in col_labels)
                            or ("datetime" in col_labels)):
                        decoded_preds[0][i] += ("ordinal", )
                        decoded_preds[1][i].append(1)
            logger.debug("Done with statistical variable guessing")

        # clear tf session
        Classifier.clear_session()

        out_df = pd.DataFrame.from_records(list(decoded_preds)).T
        out_df.columns = ["semantic types", "probabilities"]
        return out_df
コード例 #6
0
    def runModel(self, frame, p_threshold):
        """Classify the columns of ``frame`` with a pretrained Simon model.

        Parameters
        ----------
        frame : pandas data frame whose columns are to be classified.
        p_threshold : probability threshold passed to
            ``reverse_label_encode``; ``None`` falls back to 0.5. (The
            original overwrote this argument with 0.5 unconditionally.)

        Returns
        -------
        The value of ``self.encoder.encode(result)``, where ``result`` is the
        (labels, probabilities) pair from the model, extended with
        'categorical' / 'ordinal' labels (probability 1) for columns that
        ``guess`` identifies as categorical.

        Raises
        ------
        TypeError
            If the configured model name is ``None``.
        """
        # setup model as you typically would in a Simon main file
        maxlen = 20
        max_cells = 500
        if p_threshold is None:
            p_threshold = 0.5

        checkpoint_dir = "/clusterfiles/scripts/pretrained_models/"

        with open('/clusterfiles/scripts/Categories.txt', 'r') as f:
            Categories = sorted(f.read().splitlines())

        # orient the user a bit
        print("fixed categories are: ")
        print(Categories)
        category_count = len(Categories)

        # NOTE(review): modelName is not defined in this method; it is
        # presumably a module-level global -- confirm it is in scope.
        execution_config = modelName

        # load specified execution configuration
        if execution_config is None:
            raise TypeError
        Classifier = Simon(encoder={})  # dummy text classifier

        config = Classifier.load_config(execution_config, checkpoint_dir)
        encoder = config['encoder']
        checkpoint = config['checkpoint']

        X = encoder.encodeDataFrame(frame)

        # build classifier model
        model = Classifier.generate_model(maxlen, max_cells, category_count)
        Classifier.load_weights(checkpoint, None, model, checkpoint_dir)
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['binary_accuracy'])
        y = model.predict(X)
        # discard empty column edge case
        y[np.all(frame.isnull(), axis=0)] = 0

        # result[0][i] holds the labels of column i, result[1][i] their probabilities
        result = encoder.reverse_label_encode(y, p_threshold)

        ## LABEL COMBINED DATA AS CATEGORICAL/ORDINAL
        print("Beginning Guessing categorical/ordinal classifications...")
        start_time_guess = time.time()
        # DataFrame.as_matrix() no longer exists in pandas >= 1.0
        raw_data = frame.values
        for i in range(raw_data.shape[1]):
            tmp = guess(raw_data[:, i], for_types='category')
            if tmp[0] != 'category':
                continue
            labels = list(result[0][i])
            labels.append('categorical')
            result[1][i].append(1)
            # The type names live in the label list; the original searched the
            # probabilities, so 'ordinal' could never be added.
            if ('int' in labels) or ('float' in labels) \
                    or ('datetime' in labels):
                labels.append('ordinal')
                result[1][i].append(1)
            result[0][i] = tuple(labels)
        elapsed_time = time.time() - start_time_guess
        print("Total statistical variable guessing time is : %.2f sec" %
              elapsed_time)
        ## FINISHED LABELING COMBINED DATA AS CATEGORICAL/ORDINAL

        Classifier.clear_session()

        return self.encoder.encode(result)