def select_relevant(self):
        """Score every candidate keyterm with the stored classifier and pick the top ones.

        Side effects: adds (or overwrites) the 'relevant_pred' column of
        ``self.keyterm_feature_df`` and re-orders that frame in place.

        Returns:
            Whatever ``self._top_selection()`` returns for the re-ordered frame
            (presumably the selected terms -- confirm against that helper).
        """
        from statsmodels.discrete.discrete_model import LogitResults

        # Restore the previously fitted logit model from its storage file.
        classifier = LogitResults.load(KeytermClassifier.MODEL_STORE_FILE)

        # Work on a copy so the identifier columns stay available on the
        # original frame; the model is fed every remaining column plus a
        # constant 'intercept' column.
        # NOTE(review): assumes this column layout matches the one used when
        # the model was fitted -- confirm against the training code.
        features = self.keyterm_feature_df.copy().drop(
            ['doc_url', "is_url", 'term'], axis=1)
        features['intercept'] = 1

        self.keyterm_feature_df['relevant_pred'] = classifier.predict(features)

        # Highest predicted relevance first; ties are broken by higher cvalue.
        self.keyterm_feature_df.sort_values(
            ["relevant_pred", "cvalue"],
            ascending=[False, False],
            inplace=True)

        return self._top_selection()
def extract_test_keywords(train_dataset_file, test_dataset_file, retrain=False):
    """Obtain a keyterm classifier model and extract keywords for the test set.

    The model is fitted, evaluated and saved to
    ``KeytermClassifier.MODEL_STORE_FILE`` when ``retrain`` is true or when no
    stored model file exists yet; otherwise the stored model is loaded.

    Args:
        train_dataset_file: Passed to ``KeytermClassifier`` as its first argument.
        test_dataset_file: Passed to ``KeytermClassifier`` as its second argument.
        retrain: Force a new fit even if a stored model file is present.

    Returns:
        The result of ``KeytermClassifier.extract_test_keywords()``.
    """
    import os
    from statsmodels.discrete.discrete_model import LogitResults

    cl = KeytermClassifier(train_dataset_file, test_dataset_file)
    model_file = KeytermClassifier.MODEL_STORE_FILE

    # One training path covers both "retrain requested" and "nothing stored yet".
    # Single-argument print(...) emits the same text under Python 2 and 3.
    if retrain or not os.path.exists(model_file):
        print("Preparing training set ...")
        cl.prepare_training_set()

        print("Training model ...")
        cl.fit()

        print("Evaluating model ...")
        cl.prepare_test_set()
        cl.test()

        print("Saving model ...")
        cl.model.save(model_file)
    else:
        print("Loading model ...")
        cl.model = LogitResults.load(model_file)

    # The test set is prepared again even after an evaluation run, so that
    # extraction always starts from a freshly prepared test set.
    print("Preparing test set ...")
    cl.prepare_test_set()

    print("Extracting keywords ...")
    return cl.extract_test_keywords()
    def __init__(self, saved_classifier_file, topk=10):
        """Remember the configuration and load the stored classifier model.

        Args:
            saved_classifier_file: Location handed to ``LogitResults.load``;
                also kept on the instance as ``_classifier_file``.
            topk: Stored as ``self.topk`` (default 10).
        """
        self._classifier_file = saved_classifier_file
        self.topk = topk

        # Restore the stored model once, at construction time.
        restored_model = LogitResults.load(saved_classifier_file)
        self.model = restored_model