def consolidate_results(eval_result):  
    '''
    Save the metrics from all the trained models as a csv file
    and place it in the processed-data folder.
    '''

    # create data frame to hold results
    multiclass_modelperf_df = pd.DataFrame(eval_result, columns=['Model', 'Vectorizer', 'Max features', 'Accuracy', 'Precision', 'Recall', 'F1'])
    print(multiclass_modelperf_df)
        
    # Get the configuration settings
    config = util.get_config() 
    # fetch the path for the result csv
    savecsvto = config.get('Processed_datafolder', 'processeddatadir') 
    print('Save evaluation report to : ', savecsvto)
    # fetch the choice of vectorizer to create a proper file name
    vect = config.get('Vectorizer', 'Vect') 
    print('Selected Vectorizer is : ', vect)    
    filename = savecsvto+'eval_report_'+vect+'.csv'
    # export evaluation report to the csv
    multiclass_modelperf_df.to_csv(filename)
    print('Evaluation results exported')

    return True
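# A minimal usage sketch for consolidate_results(). The rows below are hypothetical
# placeholder metrics; in the real pipeline they come from evaluate_model(), and each
# tuple has to match the columns (Model, Vectorizer, Max features, Accuracy,
# Precision, Recall, F1).
sample_eval_result = [
    ('LRL1', 'TFIDF', 10000, 0.81, 0.80, 0.79, 0.79),
    ('NB', 'TFIDF', 10000, 0.77, 0.76, 0.75, 0.75),
]
consolidate_results(sample_eval_result)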
def setup_training(model, batcher):
    """Does setup before starting training (run_training)"""
    train_dir = os.path.join(FLAGS.log_root, "train")
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)

    default_device = tf.device('/gpu:0')
    with default_device:
        model.build_graph()  # build the graph
        if FLAGS.convert_to_coverage_model:
            assert FLAGS.coverage, "To convert your non-coverage model to a coverage model, run with convert_to_coverage_model=True and coverage=True"
            convert_to_coverage_model()

        saver = tf.train.Saver(max_to_keep=FLAGS.model_max_to_keep)  # keep up to model_max_to_keep checkpoints at a time

    sv = tf.train.Supervisor(logdir=train_dir,
                             is_chief=True,
                             saver=saver,
                             summary_op=None,
                             save_summaries_secs=60,  # save summaries for tensorboard every 60 secs
                             save_model_secs=0,  # do not save checkpoints automatically
                             global_step=model.global_step)
    summary_writer = sv.summary_writer
    tf.logging.info("Preparing or waiting for session...")
    sess_context_manager = sv.prepare_or_wait_for_session(config=util.get_config())
    tf.logging.info("Created session.")

    try:
        run_training(model, batcher, sess_context_manager, sv,
                     summary_writer)  # this is an infinite loop until interrupted
    except KeyboardInterrupt:
        tf.logging.info("Caught keyboard interrupt on worker. Stopping supervisor...")
        sv.stop()
    def __init__(self):
        super().__init__(self.get_prefix, case_insensitive=True)
        logging.init_logging()

        self.db_client = init_client(self.loop)
        if self.db_client: logger.info("Connected to Database.")
        self.config = get_config()
        if self.config["trello"]["enabled"]:
            self.trello_client, self.trello_board = trelloinit(self.config)
        self.admin_db = self.db_client["management"]
        self.users_db = self.db_client["users"]
        if self.config["slothpixel_key"]:
            self.slothpixel_key_string = f'?key={self.config["slothpixel_key"]}'
        else:
            self.slothpixel_key_string = ''
        self.guilds_db = self.db_client["guilds"]
        self.scammer_db = self.db_client["scammer"]
        self.status_list = cycle(self.config["status_list"])
        self.remove_command("help")

        self.api_keys = self.config["api_keys"]

        if not self.api_keys:
            logger.warning(
                "PLEASE SET AT LEAST ONE API KEY, OTHERWISE THE BOT WON'T WORK.")

        self.events = []

        self.load_cogs()
def vectorizer(data, vectorizer, saveto):
    '''
    Vectorize the data based on the parameters given in the config file.
    arguments: cleaned data, choice of vectorizer and path to save the vectorized corpus
    returns: vectorized training and testing data sets (including features and targets)
    '''
    # reading configuration files
    config = util.get_config()
    m_features = int(config.get('MaxFeatures', 'm_features'))
    print('Max number of features selected : ', m_features)

    # calling split function and access the training and test data
    X_train, X_test, y_train, y_test = split(data)
    
    ext = '.pkl'  
    # applying the selected vectorizer to the split data    
    if vectorizer == "CV":
        # Creating count vectorizer objects
        catcv_vect = CountVectorizer(analyzer='word',
                                     #stop_words='english', lowercase=True,
                                     ngram_range=(2, 3),       # ngrams - 2,3
                                     max_features=m_features,  # restrict features, otherwise it runs for too long
                                     max_df=0.5,
                                     min_df=3)
        
        features_train = catcv_vect.fit_transform(X_train)
        print(len(catcv_vect.get_feature_names()))
        print(features_train.shape)
        features_test = catcv_vect.transform(X_test)
    
    elif vectorizer == "TFIDF":  
        # Creating TF-IDF vectorizer objects
        tfidf_vect = TfidfVectorizer(analyzer='word', 
                                     #stop_words='english',lowercase = True, 
                                     ngram_range=(2,3),
                                     max_features=m_features,
                                     max_df = 0.5,
                                     smooth_idf=True, 
                                     min_df=3)
        
        features_train = tfidf_vect.fit_transform(X_train)
        print(len(tfidf_vect.get_feature_names()))
        print(features_train.shape)
        features_test = tfidf_vect.transform(X_test)
        
        
    # Confirming shapes of test and train transformations
    print('selected vectorizer is : ',vectorizer )
    print(features_train.shape)
    print(features_test.shape)
    
    filename = saveto+vectorizer+ext    
    print(filename)
    print('Vectorized training features saved to location : ', filename)

    # saving the vectorized training features to disk as specified in saveto
    joblib.dump(features_train, filename)
        
    return features_train, features_test, y_train, y_test
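# A minimal sketch of calling vectorizer() directly, assuming util.get_config()
# can resolve the MaxFeatures setting. The paths below are hypothetical placeholders;
# in the pipeline they come from the 'Interim' and 'Modelpath' sections.
sample_data_path = 'data/interim/interimdata.csv'   # hypothetical cleaned-data csv
sample_save_path = 'models/'                        # hypothetical folder for saved artefacts
Xtr, Xte, ytr, yte = vectorizer(sample_data_path, 'TFIDF', sample_save_path)
print(Xtr.shape, Xte.shape)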
def loadvoacb(vectusing, modelusing, pickfrom, sample_text):
    '''
    Load the trained vocabulary and apply a pickled model to new text.
    arguments:
      vectusing: choice of vectorizer
      modelusing: choice of model to apply
      pickfrom: path to the pickled model to apply to the new text for topic modelling
      sample_text: text for topic modelling
    returns: sample text given for topic modelling and the predicted label
    '''

    # assign models pickled filename to filename for accessing based on model parameter
    if modelusing == 'LRL1':
        modelfile = 'LogisticRegressionL1.pkl'
    elif modelusing == 'LRL2':
        modelfile = 'LogisticRegressionL2.pkl'
    elif modelusing == 'NB':
        modelfile = 'NaiveBayes.pkl'
    elif modelusing == 'RF':
        modelfile = 'RandomForest.pkl'

    #create filename using modelpath and modelfile
    filename = pickfrom + modelfile
    print(filename)
    # Load the model from the file
    model_from_joblib = joblib.load(filename)

    # Get the configuration settings to read paths and parameters
    config = util.get_config()

    data = config.get('Interim', 'Interim1')
    print('Pick cleaned data from : ', data)

    saveto = config.get('Modelpath', 'saveto')
    print('Save model to : ', saveto)

    # creates and pickles trained vocabulary
    new_vectorizer(data, vectusing)
    if vectusing == 'CV':
        filename = saveto + vectusing
        trainedvectvoacb = pickle.load(open(filename, 'rb'))
        #reloading trained vocabulary
        loadedvect = CountVectorizer(vocabulary=trainedvectvoacb)
    elif vectusing == 'TFIDF':
        filename = saveto + vectusing
        trainedvectvoacb = pickle.load(open(filename, 'rb'))
        #reloading trained vocabulary
        loadedvect = TfidfVectorizer(vocabulary=trainedvectvoacb)

    # validating the pickled vocabulary
    loadedvect._validate_vocabulary()
    # transform the new text using the loaded vocabulary (for applying the trained model)
    newtestvect = loadedvect.transform(sample_text)

    # Use the loaded model to make predictions for sample test
    label_predictions = model_from_joblib.predict(newtestvect)
    print(label_predictions, sample_text)
    return (label_predictions, sample_text)
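# A minimal sketch of loadvoacb() usage, assuming the models have already been
# trained and pickled by modeltrain(). The folder and sample text below are
# hypothetical placeholders.
sample_pickfrom = 'models/'        # hypothetical folder holding the pickled models
sample_text = ['team traded their starting quarterback before the draft']
labels, text = loadvoacb('TFIDF', 'LRL2', sample_pickfrom, sample_text)
print(labels, text)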
def modelling():
    '''
    Perform topic modelling (classification) of the new test data
    and create a data frame from the modelled list.
    '''

    # Get the configuration settings to read paths and parameters
    config = util.get_config()

    data = config.get('Interim', 'Interim1')
    print('Pick cleaned data from : ', data)

    vect = config.get('Vectorizer', 'Vect')
    print('Selected Vectorizer is : ', vect)

    saveto = config.get('Modelpath', 'saveto')
    print('Save model to : ', saveto)

    # read the model path location from config file
    pickfrom = config.get('Modelpath', 'pickfrom')
    print('Pick fitted model from : ', pickfrom)

    # read the model from config file
    modelusing = config.get('TopicModelling', 'Model')
    print('Use model  : ', modelusing)

    # read the required vectorization from config file
    vectusing = config.get('TopicModelling', 'Vect')
    print('Use vectorizer : ', vectusing)

    savemodelledto = config.get('TopicModelling', 'savemodelledto')
    print('Save evaluation report to : ', savemodelledto)

    # read the required modelling text from config file
    sample = config.get('TopicModelling', 'sample_text')
    print('Sample Test is  : ', sample)

    # cleaning the sample text using a function in the util module
    sample = util.textcleaning(sample)
    print(type(sample))
    # converting the string to a list so it is iterable
    sample_text = [sample]
    print(type(sample_text))
    print(sample_text)

    # creating a list of modelled topics; a dataframe is built from it for reporting
    modelleddata = []
    #sample_text = ["The Browns have a better QB situation than the 49ers.\n\nBringing up the Browns to absolve the 49ers of their poor decision making is incredibly lazy. Both teams can completely suck at QB evaluation"]
    modelleddata.append(loadvoacb(vectusing, modelusing, pickfrom,
                                  sample_text))
    # creates dataframe from the modelled data for generating report
    modelled_df = pd.DataFrame(modelleddata,
                               columns=['PredictedLabel', 'GivenText'])
    filename = savemodelledto + 'topicmodelling.csv'
    # export modelled topic details to the csv
    modelled_df.to_csv(filename)
    return True
def main():
    
    # Get the configuration settings to read paths and parameters
    config = util.get_config()        
    Rptfolder = config.get('Reportfolder', 'Rptfolder')
    endtoend_pipeline()
    print('Entire end-to-end pipeline executed.')
    print('Check the below directory for the reports generated\n')
    print('    - ', Rptfolder)    
    return True
def init_client(loop):
    logger.info("Connecting to Database...")
    config = get_config()["database"]
    if config["local"]:
        string = ''
    else:
        string = '+srv'
    return AsyncIOMotorClient(
        f"mongodb{string}://{config['username']}:{config['password']}@{config['address']}/{config['default_db']}?retryWrites=true&w=majority",
        io_loop=loop)
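# A minimal sketch of init_client() usage inside an asyncio program, assuming the
# config file has a "database" section with username/password/address/default_db.
# The collection name used here is a hypothetical placeholder.
import asyncio

async def ping_db():
    client = init_client(asyncio.get_running_loop())
    doc = await client["management"]["settings"].find_one()
    print(doc)

# asyncio.run(ping_db())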
    def __init__(self):
        super().__init__(self.get_prefix,
                         case_insensitive=True,
                         intents=intents)

        logging.init_logging()

        logger.info([z for z in self.intents])
        self.db_client = init_client(self.loop)
        if self.db_client: logger.info("Connected to Database.")
        self.config = get_config()
        self.custom_emojis = get_config("emojis")
        if self.config["trello"]["enabled"]:
            self.trello_client, self.trello_board = trelloinit(self.config)
        self.admin_db = self.db_client["management"]
        self.users_db = self.db_client["users"]
        if self.config["slothpixel_key"]:
            self.slothpixel_key_string = f'?key={self.config["slothpixel_key"]}'
        else:
            self.slothpixel_key_string = ''
        if self.config["stats_api"] == "default":
            self.stats_api = "http://hypixel-skybot.ddns.net:3000/stats"
        else:
            self.stats_api = self.config["stats_api"]
        self.guilds_db = self.db_client["guilds"]
        self.scammer_db = self.db_client["scammer"]
        self.status_list = cycle(self.config["status_list"])
        self.remove_command("help")

        self.api_keys = self.config["api_keys"]

        if not self.api_keys:
            logger.warning(
                "PLEASE SET AT LEAST ONE API KEY, OTHERWISE THE BOT WON'T WORK.")

        self.events = []

        self.slash = SlashCommand(self, sync_commands=True)

        self.load_cogs()
        self.start_time = time()
def modeltrain(saveto, model = None):
    '''
    Train the data with the selected model.
    arguments: folder to save the model; the second parameter, model, is optional
    '''
    
    # Get the configuration settings to read paths and parameters
    config = util.get_config()   
    data = config.get('Interim', 'Interim1')
    vect = config.get('Vectorizer', 'Vect') 

    # training and test data - features & targets
    X_train, X_test, y_train, y_test = vectorizer(data, vect, saveto) 
    if model == 'LRL1':
        # Creating instance for logistic regression with penalty L1
        model_clf = LogisticRegression(n_jobs=-1, 
                                   penalty='l1',
                                   multi_class='multinomial', 
                                   solver = 'saga',
                                   random_state=1) 
        modelnm = 'LogisticRegressionL1'        
    elif model == 'LRL2':
        # Creating instance for logistic regression with penalty L2
        model_clf = LogisticRegression(n_jobs=-1, 
                                   penalty='l2',
                                   multi_class='multinomial', 
                                   solver = 'saga',
                                   random_state=1) 
        modelnm = 'LogisticRegressionL2'    
    elif model == 'NB':
        # Create Multinomial NB classifier
        # added alpha = 1 for laplace smoothing for multi-classification
        model_clf = MultinomialNB(alpha = 1, class_prior=None, fit_prior=True) 
        modelnm = 'NaiveBayes'    
    elif model == 'RF':
        # Create random forest classifier object
        model_clf = RandomForestClassifier(n_jobs=-1, random_state=1)
        modelnm = 'RandomForest'
        
    # Applying classification model to train data set
    model_clf.fit(X_train, y_train)
    ext = '.pkl'
    # get the date to optionally append to the filename
    timestamp = datetime.now().strftime("%Y-%m-%d")
    #filename = saveto+modelnm+'_'+timestamp+ext
    filename = saveto+modelnm+ext
    print('Model saved to location : ', filename)
    
    # saving model to disk as specified in saveto
    joblib.dump(model_clf, filename)    
    # predict the labels on the test dataset
    y_pred = model_clf.predict(X_test) 
    return y_test, y_pred
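# A minimal sketch of training a single model with modeltrain(), assuming the
# config file points at the cleaned interim data. The save path is a hypothetical
# placeholder; the returned predictions are scored with scikit-learn.
from sklearn.metrics import accuracy_score

sample_saveto = 'models/'                  # hypothetical folder for the pickled model
y_true, y_hat = modeltrain(sample_saveto, model='NB')
print('Accuracy on the held-out split:', accuracy_score(y_true, y_hat))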
def run_eval(model, batcher):
    """Repeatedly runs eval iterations, logging to screen and writing summaries. Saves the model with the best loss seen so far."""
    model.build_graph()  # build the graph
    saver = tf.train.Saver(max_to_keep=3)  # we will keep 3 best checkpoints at a time
    sess = tf.Session(config=util.get_config())
    eval_dir = os.path.join(FLAGS.log_root, "eval_loss")  # make a subdir of the root dir for eval data
    bestmodel_save_path = os.path.join(eval_dir, 'bestmodel')  # this is where checkpoints of best models are saved
    summary_writer = tf.summary.FileWriter(eval_dir)
    running_avg_loss = 0  # the eval job keeps a smoother, running average loss to tell it when to implement early stopping
    best_loss = None  # will hold the best loss achieved so far
    train_dir = os.path.join(FLAGS.log_root, "train")

    while True:
        ckpt_state = tf.train.get_checkpoint_state(train_dir)
        tf.logging.info('max_enc_steps: %d, max_dec_steps: %d', FLAGS.max_enc_steps, FLAGS.max_dec_steps)
        _ = util.load_ckpt(saver, sess)  # load a new checkpoint
        batch = batcher.next_batch()  # get the next batch

        # run eval on the batch
        t0 = time.time()
        results = model.run_eval_step(sess, batch)
        t1 = time.time()
        tf.logging.info('seconds for batch: %.2f', t1 - t0)

        # print the loss and coverage loss to screen
        loss = results['loss']
        tf.logging.info('loss: %f', loss)
        train_step = results['global_step']

        tf.logging.info("pgen_avg: %f", results['p_gen_avg'])

        if FLAGS.coverage:
            tf.logging.info("coverage_loss: %f", results['coverage_loss'])

        # add summaries
        summaries = results['summaries']
        summary_writer.add_summary(summaries, train_step)

        # calculate running avg loss
        running_avg_loss = util.calc_running_avg_loss(np.asscalar(loss), running_avg_loss, summary_writer, train_step,
                                                      'running_avg_loss')

        # If running_avg_loss is best so far, save this checkpoint (early stopping).
        # These checkpoints will appear as bestmodel-<iteration_number> in the eval dir
        if best_loss is None or running_avg_loss < best_loss:
            tf.logging.info('Found new best model with %.3f running_avg_loss. Saving to %s', running_avg_loss,
                            bestmodel_save_path)
            saver.save(sess, bestmodel_save_path, global_step=train_step, latest_filename='checkpoint_best')
            best_loss = running_avg_loss

        # flush the summary writer every so often
        if train_step % 100 == 0:
            summary_writer.flush()
def new_vectorizer(data, vectorizer):
    '''
    function to create the vocabulary using the choice of vectorizer
    arguments: source data and choice of vectorizer
    '''
    # split the data
    X_train, X_test, y_train, y_test = split(data)

    # Get the configuration settings to read paths and parameters
    config = util.get_config()

    # read the save path from the config file
    saveto = config.get('Modelpath', 'saveto')
    print('Save model to : ', saveto)

    m_features = int(config.get('MaxFeatures', 'm_features'))
    print('Max number of features selected : ', m_features)

    filename = saveto + vectorizer

    # saving model to disk as specified in saveto
    if vectorizer == "CV":
        # Creating count vectorizer objects
        catcv_vect = CountVectorizer(
            analyzer='word',
            stop_words='english',     # removes English stop words
            ngram_range=(2, 3),       # ngrams - 2,3
            max_features=m_features,  # restrict features, otherwise it runs for too long
            lowercase=True,
            max_df=0.5,
            min_df=3)

        catcv_vect.fit(X_train)
        pickle.dump(catcv_vect.vocabulary_, open(filename, 'wb'))
        print('Vectorizer saved to location as : ', filename)
        return

    elif vectorizer == "TFIDF":
        # Creating TF-IDF vectorizer objects
        tfidf_vect = TfidfVectorizer(analyzer='word',
                                     stop_words='english',
                                     ngram_range=(2, 3),
                                     max_features=m_features,
                                     lowercase=True,
                                     max_df=0.5,
                                     smooth_idf=True,
                                     min_df=3)

        tfidf_vect.fit(X_train)
        pickle.dump(tfidf_vect.vocabulary_, open(filename, 'wb'))
        print('Vectorizer saved to location as : ', filename)
        return
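# A minimal sketch showing how the vocabulary pickled by new_vectorizer() can be
# reloaded, mirroring what loadvoacb() does. The path is a hypothetical placeholder
# for what the 'Modelpath' section resolves to in the real pipeline.
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

sample_vocab_path = 'models/TFIDF'         # hypothetical pickled-vocabulary file
with open(sample_vocab_path, 'rb') as fh:
    trained_vocab = pickle.load(fh)
reloaded_vect = TfidfVectorizer(vocabulary=trained_vocab)
reloaded_vect._validate_vocabulary()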
def evaluate_model():
    '''
    Evaluate the models and collect the results in a list.
    arguments: none
    returns: the result set as a list
    '''

    # Get the configuration settings to read paths and parameters
    config = util.get_config()

    # List to store model evaluation results
    allmodels_evalresults = []

    pickfrom = config.get('Modelpath', 'pickfrom')
    print('Pick fitted model from : ', pickfrom)

    data = config.get('Interim', 'Interim1')
    print('Pick cleaned data from : ', data)

    vect = config.get('Vectorizer', 'Vect')
    print('Selected Vectorizer is : ', vect)

    saveto = config.get('Modelpath', 'saveto')
    print('Save model to : ', saveto)

    # getting max features parameter from the config file
    m_features = int(config.get('MaxFeatures', 'm_features'))

    X_train, X_test, y_train, y_test = vectorizer(data, vect, saveto)

    # Get list of models
    model1 = config.get('Models', 'Model1')
    model2 = config.get('Models', 'Model2')
    model3 = config.get('Models', 'Model3')
    model4 = config.get('Models', 'Model4')
    # creating a list of the parameters
    Models_list = [model1, model2, model3, model4]
    print('Models being evaluated are : ', Models_list)
    # looping through the models list for training and printing the evaluation metrics
    for model in Models_list:
        print('Selected model is : ', model)
        y_test, y_pred = modeltrain(saveto, model=model)
        accuracy, precision, recall, f1 = evaluate.print_metrics(
            y_test, y_pred)
        #create a tuple of all parameters
        row = (model, vect, m_features, accuracy, precision, recall, f1)
        allmodels_evalresults.append(row)

    return allmodels_evalresults
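# A minimal sketch of the evaluation step feeding the reporting step, assuming a
# config file that lists four models under the 'Models' section as above.
eval_rows = evaluate_model()        # list of (model, vectorizer, max features, accuracy, precision, recall, f1) tuples
consolidate_results(eval_rows)      # writes eval_report_<vectorizer>.csv to the processed-data folder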
def gen_final_modellingreport():
    '''
    Create the topic modelling report.
    '''
    # Get the configuration settings to read paths and parameters
    config = util.get_config()
    evalfileloc = config.get('Processed_datafolder', 'processeddatadir')
    print('Evaluated results are saved at : ', evalfileloc)

    #get the folder where the evaluation report needs to be saved
    reportpath = config.get('Reportfolder', 'Rptfolder')
    print('Save the report to : ', reportpath)

    #calling function to generate markdown report
    to_markdown(evalfileloc, reportpath)
    return
    def __init__(self, model, batcher, vocab):
        """Initialize decoder.

        Args:
          model: a Seq2SeqAttentionModel object.
          batcher: a Batcher object.
          vocab: Vocabulary object
        """
        self._model = model
        self._model.build_graph()
        self._batcher = batcher
        self._vocab = vocab
        self._saver = tf.train.Saver(
            max_to_keep=3)  # we use this to load checkpoints for decoding
        self._sess = tf.Session(config=util.get_config())
        if FLAGS.mode == 'evalall':
            self.prepare_evaluate()
def initiatemodel_wf(arg1, arg2, arg3):
    '''
    Preprocess and vectorize as part of the end-to-end pipeline.
    Takes 3 arguments (read from the config file) when called from the end-to-end pipeline.
    '''

    # Get the configuration settings to read paths and parameters
    config = util.get_config()
    # read source folder from config file
    source = config.get('Sourcefolder', 'source')
    preprocessing(source)
    print('Task1 done')

    vectorizer(arg1, arg2, arg3)
    print('Task2 done')

    return True
def gen_report(eval_results):
    '''
    function to generate reports based on the evaluation metrics
    '''
    # calling the function to consolidate results
    consolidate_results(eval_results)

    # Get the configuration settings to read paths and parameters
    config = util.get_config()
    evalfileloc = config.get('Processed_datafolder', 'processeddatadir')
    print('Evaluated results are saved at : ', evalfileloc)

    #get the folder where the evaluation report needs to be saved
    reportpath = config.get('Reportfolder', 'Rptfolder')
    print('Save the report to : ', reportpath)

    #calling function to generate markdown report
    to_markdown(evalfileloc, reportpath)

    return True
def endtoend_pipeline():
    '''
    Perform the end-to-end pipeline:
        preprocessing
        vectorizing
        training models
        evaluating models
        generating evaluation metrics and a report
        topic modelling and reporting on it
    '''

    # Get the configuration settings to read paths and parameters
    config = util.get_config()    
        
    data = config.get('Interim', 'Interim1')
    print('Pick cleaned data from : ', data)
    
    vect = config.get('Vectorizer', 'Vect') 
    print('Selected Vectorizer is : ', vect)
    
    # read modelpath where the vectorized and fitted vector needs to be saved from config file
    saveto = config.get('Modelpath', 'saveto') 
    print('Save model to : ', saveto)
    
    workflow1_prepdata.initiatemodel_wf(data, vect, saveto)
    print('Workflow 1 done')
    
    results = workflow2_buildevaluate_models.train_wf()
    print('Workflow 2 done')
    
    workflow3_generatereport.gen_report(results)
    print('Workflow 3 done and report generated')
    
    workflow4_Modelling.topicmodelling_wf()
    print('Workflow 4 executed to generate topic modelling report')
    
    return True
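# The section/option pairs read above suggest util.get_config() returns a
# configparser-style object. A hypothetical config layout consistent with the
# calls in this pipeline might look like this (all paths are placeholders):
#
#   [Sourcefolder]
#   source = data/raw/categories.json
#   [Interim]
#   Interim1 = data/interim/interimdata.csv
#   [Vectorizer]
#   Vect = TFIDF
#   [MaxFeatures]
#   m_features = 10000
#   [Modelpath]
#   saveto = models/
#   pickfrom = models/
#   [Models]
#   Model1 = LRL1
#   Model2 = LRL2
#   Model3 = NB
#   Model4 = RF
#   [Processed_datafolder]
#   processeddatadir = data/processed/
#   [Reportfolder]
#   Rptfolder = reports/
#   [Temp_datafolder]
#   tempdatadir = data/interim/
#   [TopicModelling]
#   Model = LRL2
#   Vect = TFIDF
#   savemodelledto = data/processed/
#   sample_text = some raw text to classify
#
# A minimal read of such a file with the standard library, assuming config.ini is the path:
import configparser
cfg = configparser.ConfigParser()
cfg.read('config.ini')
print(cfg.get('Vectorizer', 'Vect'))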
def preprocessing(file):
    '''
    Uses utility functions to clean the text and apply lemmatization and stemming.
    Saves the data as a csv file in the interim directory (specified in the config file).
    arguments: the source file
    '''

    # reading category data set
    catdf = util.json_pd(file)
    # encode labels
    labelencoder(catdf)
    # Sample the huge data set while keeping the category proportions by using the same sampling fraction for each category
    catdf0 = catdf[catdf.cat == 0].sample(frac=0.06, random_state=1)
    catdf1 = catdf[catdf.cat == 1].sample(frac=0.06, random_state=1)
    catdf2 = catdf[catdf.cat == 2].sample(frac=0.06, random_state=1)
    catdf3 = catdf[catdf.cat == 3].sample(frac=0.06, random_state=1)
    sampledata = pd.concat([catdf0, catdf1, catdf2, catdf3])

    # Applying text cleaning on the text field
    sampledata['clndtxt'] = sampledata['txt'].apply(util.textcleaning)
    # Apply lemmatization using util function
    sampledata['lemmedtxt'] = sampledata['clndtxt'].apply(
        util.lemmatize_text).apply(lambda x: " ".join(x))
    # Apply Stemming using util function
    sampledata['stemmedtxt'] = sampledata['lemmedtxt'].apply(
        util.stemmed_words)  #.apply(lambda x : " ".join(x))
    # filtering rows that have null text after cleaning, lemmatization and stemming
    data = sampledata[pd.notnull(sampledata['stemmedtxt'])]

    #Get configuration file to read interim folder and file
    config = util.get_config()
    tempdatadir = config.get('Temp_datafolder', 'tempdatadir')
    interim_data = tempdatadir + 'interimdata.csv'
    data.to_csv(interim_data, sep=',')
    print('Interim csv loaded')
    return
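# A minimal sketch of running the preprocessing step on its own. The source path
# is a hypothetical placeholder; in the pipeline it comes from the 'Sourcefolder'
# section of the config file.
sample_source = 'data/raw/categories.json'
preprocessing(sample_source)        # writes interimdata.csv to the temp data folder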
def convert_to_coverage_model():
    """Load non-coverage checkpoint, add initialized extra variables for coverage, and save as new checkpoint"""
    tf.logging.info("converting non-coverage model to coverage model..")

    # initialize an entire coverage model from scratch
    sess = tf.Session(config=util.get_config())
    print "initializing everything..."
    sess.run(tf.global_variables_initializer())

    # load all non-coverage weights from checkpoint
    saver = tf.train.Saver([v for v in tf.global_variables() if "coverage" not in v.name and "Adagrad" not in v.name])
    print "restoring non-coverage variables..."
    curr_ckpt = util.load_ckpt(saver, sess)
    print "restored."

    # save this model and quit
    ckpt_path = os.path.join(FLAGS.log_root, "train", "model.ckpt_cov")
    step = curr_ckpt.split('-')[1]
    new_fname = ckpt_path + '-' + step + '-init'
    print "saving model to %s..." % (new_fname)
    new_saver = tf.train.Saver()  # this one will save all variables that now exist
    new_saver.save(sess, new_fname)
    print "saved."
    exit()
    def __init__(self, model, batcher, vocab):
        """Initialize decoder.

        Args:
          model: a SentSelector object.
          batcher: a Batcher object.
          vocab: Vocabulary object
        """
        # get the data split set
        if "train" in FLAGS.data_path:
            self._dataset = "train"
        elif "val" in FLAGS.data_path:
            self._dataset = "val"
        elif "test" in FLAGS.data_path:
            self._dataset = "test"
        else:
            raise ValueError(
                "FLAGS.data_path %s should contain one of train, val or test" %
                (FLAGS.data_path))

        # create the data loader
        self._batcher = batcher

        if FLAGS.eval_gt_rouge:  # no need to load model
            # Make a descriptive decode directory name
            self._decode_dir = os.path.join(FLAGS.log_root,
                                            'select_gt' + self._dataset)
            tf.logging.info('Save evaluation results to ' + self._decode_dir)
            if os.path.exists(self._decode_dir):
                raise Exception(
                    "single_pass decode directory %s should not already exist"
                    % self._decode_dir)

            # Make the decode dir
            os.makedirs(self._decode_dir)

            # Make the dirs to contain output written in the correct format for pyrouge
            self._rouge_ref_dir = os.path.join(self._decode_dir, "reference")
            if not os.path.exists(self._rouge_ref_dir):
                os.mkdir(self._rouge_ref_dir)
            self._rouge_gt_dir = os.path.join(self._decode_dir, "gt_selected")
            if not os.path.exists(self._rouge_gt_dir):
                os.mkdir(self._rouge_gt_dir)
        else:
            self._model = model
            self._model.build_graph()
            self._vocab = vocab
            self._saver = tf.train.Saver(
            )  # we use this to load checkpoints for decoding
            self._sess = tf.Session(config=util.get_config())

            # Load an initial checkpoint to use for decoding
            if FLAGS.load_best_eval_model:
                tf.logging.info('Loading best eval checkpoint')
                ckpt_path = util.load_ckpt(self._saver,
                                           self._sess,
                                           ckpt_dir='eval')
            elif FLAGS.eval_ckpt_path:
                ckpt_path = util.load_ckpt(self._saver,
                                           self._sess,
                                           ckpt_path=FLAGS.eval_ckpt_path)
            else:
                tf.logging.info('Loading best train checkpoint')
                ckpt_path = util.load_ckpt(self._saver, self._sess)

            if FLAGS.single_pass:
                # Make a descriptive decode directory name
                ckpt_name = "ckpt-" + ckpt_path.split('-')[
                    -1]  # this is something of the form "ckpt-123456"
                decode_root_dir, decode_dir = get_decode_dir_name(
                    ckpt_name, self._dataset)
                self._decode_root_dir = os.path.join(FLAGS.log_root,
                                                     decode_root_dir)
                self._decode_dir = os.path.join(FLAGS.log_root,
                                                decode_root_dir, decode_dir)
                tf.logging.info('Save evaluation results to ' +
                                self._decode_dir)
                if os.path.exists(self._decode_dir):
                    raise Exception(
                        "single_pass decode directory %s should not already exist"
                        % self._decode_dir)
            else:  # Generic decode dir name
                self._decode_dir = os.path.join(FLAGS.log_root, "select")

            # Make the decode dir if necessary
            if not os.path.exists(self._decode_dir):
                os.makedirs(self._decode_dir)

            if FLAGS.single_pass:
                # Make the dirs to contain output written in the correct format for pyrouge
                self._rouge_ref_dir = os.path.join(self._decode_dir,
                                                   "reference")
                if not os.path.exists(self._rouge_ref_dir):
                    os.mkdir(self._rouge_ref_dir)
                self._rouge_dec_dir = os.path.join(self._decode_dir,
                                                   "selected")
                if not os.path.exists(self._rouge_dec_dir):
                    os.mkdir(self._rouge_dec_dir)
                if FLAGS.save_pkl:
                    self._result_dir = os.path.join(self._decode_dir,
                                                    "select_result")
                    if not os.path.exists(self._result_dir):
                        os.mkdir(self._result_dir)

                self._probs_pkl_path = os.path.join(self._decode_root_dir,
                                                    "probs.pkl")
                if not os.path.exists(self._probs_pkl_path):
                    self._make_probs_pkl = True
                else:
                    self._make_probs_pkl = False
                self._precision = []
                self._recall = []
                self._accuracy = []
                self._ratio = []
                self._select_sent_num = []
    async def reload_config(self, ctx):
        self.bot.config = get_config()
        return await ctx.send(f"`config reloaded by` {ctx.author.mention}")
def setup_training(model, batcher):
    """Does setup before starting training (run_training)"""
    train_dir = os.path.join(FLAGS.log_root, "train")
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)

    default_device = tf.device('/gpu:0')
    with default_device:
        model.build_graph()  # build the graph
        if FLAGS.pretrained_selector_path:  # cross entropy loss eval best model or train model
            params = tf.global_variables()
            # do not load the global step or Adagrad states (the pretrained model may come from the best eval model, and the eval model does not have Adagrad states)
            selector_vars = [
                param for param in params
                if "SentSelector" in param.name and 'Adagrad' not in param.name
            ]
            uninitialized_vars = [
                param for param in params if param not in selector_vars
            ]
            pretrained_saver = tf.train.Saver(selector_vars)
            local_init_op = tf.variables_initializer(uninitialized_vars)
            saver = tf.train.Saver(max_to_keep=FLAGS.model_max_to_keep)
        else:
            saver = tf.train.Saver(max_to_keep=FLAGS.model_max_to_keep)

    if FLAGS.pretrained_selector_path:
        sv = tf.train.Supervisor(
            logdir=train_dir,
            is_chief=True,
            saver=None,
            local_init_op=local_init_op,
            summary_op=None,
            save_summaries_secs=
            60,  # save summaries for tensorboard every 60 secs
            save_model_secs=0,  # do not save checkpoint
            global_step=model.global_step)
    else:
        sv = tf.train.Supervisor(
            logdir=train_dir,
            is_chief=True,
            saver=saver,
            summary_op=None,
            save_summaries_secs=
            60,  # save summaries for tensorboard every 60 secs
            save_model_secs=0,  # do not save checkpoint
            global_step=model.global_step)

    summary_writer = sv.summary_writer
    tf.logging.info("Preparing or waiting for session...")
    sess_context_manager = sv.prepare_or_wait_for_session(
        config=util.get_config())
    tf.logging.info("Created session.")

    try:
        if FLAGS.pretrained_selector_path:
            run_training(model, batcher, sess_context_manager, sv, summary_writer, \
                         pretrained_saver, saver)  # this is an infinite loop until interrupted
        else:
            run_training(
                model, batcher, sess_context_manager, sv,
                summary_writer)  # this is an infinite loop until interrupted
    except KeyboardInterrupt:
        tf.logging.info(
            "Caught keyboard interrupt on worker. Stopping supervisor...")
        sv.stop()
    # SlackMessage(MY_SLACK_TOKEN,monitors.__name__)
]

if args().restore:
    weight = last_cheackpoint(RESULT_PATH)
    init_epoch = int(weight.split("-")[-1].split(".")[0])
    model.load_weights(weight)
    print(
        f"*******************\ncheckpoint restored : {weight}\n*******************"
    )
else:
    init_epoch = 0
    print("*******************\nrestart training\n*******************")

train_options = {
    "optimizer": get_config(optim),
    "batchsize": BATCH_SIZE,
    "loss_function": loss_func,
    "input_shape": INPUT_IMAGE_SHAPE,
    "augmemtation": copy["augmemtation"]
}

print(json.dumps(train_options, indent=4, sort_keys=False))

with open(os.path.join(RESULT_PATH, 'train_options.json'), 'w') as f:
    f.write(json.dumps(train_options))

hist = model.fit_generator(
    generator=train_iterator,
    steps_per_epoch=None,
    epochs=100,
def setup_training(model, batcher):
    """Does setup before starting training (run_training)"""
    train_dir = os.path.join(FLAGS.log_root, "train")
    if not os.path.exists(train_dir): os.makedirs(train_dir)

    default_device = tf.device('/gpu:0')
    with default_device:
        assert FLAGS.coverage, "Please run the end2end model with coverage mechanism."
        model.build_graph()  # build the graph

        if FLAGS.pretrained_selector_path and FLAGS.pretrained_rewriter_path:
            params = tf.global_variables()
            selector_vars = [
                param for param in params
                if "SentSelector" in param.name and 'Adagrad' not in param.name
            ]
            rewriter_vars = [
                param for param in params
                if "seq2seq" in param.name and 'Adagrad' not in param.name
            ]
            uninitialized_vars = [
                param for param in params
                if param not in selector_vars and param not in rewriter_vars
            ]
            selector_saver = tf.train.Saver(selector_vars)
            rewriter_saver = tf.train.Saver(rewriter_vars)
            local_init_op = tf.variables_initializer(uninitialized_vars)
            all_saver = tf.train.Saver(max_to_keep=FLAGS.model_max_to_keep)
        else:
            saver = tf.train.Saver(max_to_keep=FLAGS.model_max_to_keep)

    if FLAGS.pretrained_selector_path and FLAGS.pretrained_rewriter_path:
        sv = tf.train.Supervisor(
            logdir=train_dir,
            is_chief=True,
            saver=None,
            local_init_op=local_init_op,
            summary_op=None,
            save_summaries_secs=
            60,  # save summaries for tensorboard every 60 secs
            save_model_secs=0,  # do not save checkpoints automatically
            global_step=model.global_step)
    else:
        sv = tf.train.Supervisor(
            logdir=train_dir,
            is_chief=True,
            saver=saver,
            summary_op=None,
            save_summaries_secs=
            60,  # save summaries for tensorboard every 60 secs
            save_model_secs=0,  # do not save checkpoints automatically
            global_step=model.global_step)

    summary_writer = sv.summary_writer
    tf.logging.info("Preparing or waiting for session...")
    sess_context_manager = sv.prepare_or_wait_for_session(
        config=util.get_config())
    tf.logging.info("Created session.")

    try:
        if FLAGS.pretrained_selector_path and FLAGS.pretrained_rewriter_path:
            run_training(model, batcher, sess_context_manager, sv, summary_writer, \
                         selector_saver, rewriter_saver, all_saver) # this is an infinite loop until interrupted
        else:
            run_training(
                model, batcher, sess_context_manager, sv,
                summary_writer)  # this is an infinite loop until interrupted
    except KeyboardInterrupt:
        tf.logging.info(
            "Caught keyboard interrupt on worker. Stopping supervisor...")
        sv.stop()