def perform_classification(classifier, algo, cv=False, roc=True):
    """Train *classifier* on the LSI-transformed dataset and report statistics.

    :param classifier: an sklearn-style estimator with fit/predict.
    :param algo: human-readable classifier name used in log messages.
    :param cv: if True, run 5-fold cross-validation over C = 10**k for
               k in [-3..3] and replace *classifier* with the best linear SVM.
    :param roc: if True, plot the ROC curve of the predictions.
    """
    if os.path.isfile("../Dataset/Train_LSI.pkl") and os.path.isfile("../Dataset/Test_LSI.pkl"):  # load pickle file if it exists
        train_lsi, test_lsi = utility.apply_lsi([], [])
    else:
        train_idf, test_idf = utility.model_text_data(processed_train_dataset, processed_test_dataset)
        train_lsi, test_lsi = utility.apply_lsi(train_idf, test_idf)

    if cv:  # SVM Cross Validation Testing
        logger.info("Calculating Best Parameter Value")
        exponents = [-3, -2, -1, 0, 1, 2, 3]
        best_scores = []

        for exp in exponents:
            logger.info("Parameter Value: {}".format(exp))
            # BUG FIX: original did 10**C[i], indexing the exponent list by
            # its own values (e.g. i=-3 picked C[-3]=1), so the sweep tested
            # the wrong penalty values.
            clf = svm.SVC(kernel='linear', C=10 ** exp)
            scores = cross_validation.cross_val_score(clf, train_lsi, processed_train_dataset.target, cv=5)
            best_scores.append(np.mean(scores))

        # BUG FIX: map the best score's position back to its exponent; the
        # original used the list index itself (0..6) as the exponent.
        best_exp = exponents[best_scores.index(max(best_scores))]
        logger.info(best_scores)
        logger.info("Best Parameter Value: {}".format(best_exp))
        classifier = svm.SVC(kernel='linear', C=10 ** best_exp)

    logger.info("Training {0} Classifier ".format(algo))
    classifier.fit(train_lsi, processed_train_dataset.target)  # fit the training data
    logger.info("Testing {0} Classifier".format(algo))  # predict the testing data
    test_predicted = classifier.predict(test_lsi)

    utility.calculate_statistics(processed_test_dataset.target, test_predicted)  # calculate classifier statistics

    if roc:  # plot ROC curve
        utility.plot_ROC(processed_test_dataset.target, test_predicted, algo)
def question_i():
    """Multi-class classification (Question I) on four newsgroup categories.

    Pipeline: preprocess each document -> TFxIDF (CountVectorizer +
    TfidfTransformer) -> LSI via TruncatedSVD (50 components) -> train and
    evaluate OneVsOne/OneVsRest Naive Bayes and linear-SVM classifiers.
    """
    logger.info("EXECUTING: QUESTION I")
    logger.info("Multi-Class Classification")

    category = ['comp.sys.ibm.pc.hardware','comp.sys.mac.hardware','misc.forsale','soc.religion.christian']
    train, test = utility.load_dataset(category)

    logger.info("Processing Training Dataset")
    # enumerate() replaces the zip(data, range(len(data))) anti-pattern
    for pos, data in enumerate(train.data):
        train.data[pos] = ' '.join(utility.preprocess_data(data))

    logger.info("Processing Testing Dataset")
    for pos, data in enumerate(test.data):
        test.data[pos] = ' '.join(utility.preprocess_data(data))

    logger.info("Creating TFxIDF Vector Representations")

    stop_words = text.ENGLISH_STOP_WORDS  # omit stop words

    # using CountVectorizer and TFxIDF Transformer; vectorizer is fit on the
    # training corpus only, then applied to the test corpus
    count_vect = CountVectorizer(stop_words=stop_words, lowercase=True)
    train_counts = count_vect.fit_transform(train.data)
    test_counts = count_vect.transform(test.data)
    tfidf_transformer = TfidfTransformer(norm='l2', sublinear_tf=True)
    train_idf = tfidf_transformer.fit_transform(train_counts)
    test_idf = tfidf_transformer.transform(test_counts)

    logger.info("Performing LSI on TFxIDF Matrices")
    # apply LSI to TFxIDF matrices (dimensionality reduction to 50 components)
    svd = TruncatedSVD(n_components=50)
    train_lsi = svd.fit_transform(train_idf)
    test_lsi = svd.transform(test_idf)

    logger.info("TFxIDF Matrices Transformed")

    logger.info("Size of Transformed Training Dataset: {0}".format(train_lsi.shape))
    logger.info("Size of Transformed Testing Dataset: {0}".format(test_lsi.shape))

    clf_list = [OneVsOneClassifier(GaussianNB()), OneVsOneClassifier(svm.SVC(kernel='linear')), OneVsRestClassifier(GaussianNB()), OneVsRestClassifier(svm.SVC(kernel='linear'))]
    clf_name = ['OneVsOneClassifier Naive Bayes', 'OneVsOneClassifier SVM','OneVsRestClassifier Naive Bayes', 'OneVsRestClassifier SVM']

    # perform classification with each multi-class strategy
    for clf, clf_n in zip(clf_list, clf_name):
        logger.info("Training {0} Classifier ".format(clf_n))
        clf.fit(train_lsi, train.target)
        logger.info("Testing {0} Classifier".format(clf_n))
        test_predicted = clf.predict(test_lsi)
        utility.calculate_statistics(test.target, test_predicted)
Ejemplo n.º 3
0
def perform_classification(classifier, algo, cv=False, roc=True):
    """Train *classifier* on the LSI-transformed dataset and report statistics.

    :param classifier: an sklearn-style estimator with fit/predict.
    :param algo: human-readable classifier name used in log messages.
    :param cv: if True, run 5-fold cross-validation over C = 10**k for
               k in [-3..3] and replace *classifier* with the best linear SVM.
    :param roc: if True, plot the ROC curve of the predictions.
    """
    if os.path.isfile("../Dataset/Train_LSI.pkl") and os.path.isfile(
            "../Dataset/Test_LSI.pkl"):  # load pickle file if it exists
        train_lsi, test_lsi = utility.apply_lsi([], [])
    else:
        train_idf, test_idf = utility.model_text_data(processed_train_dataset,
                                                      processed_test_dataset)
        train_lsi, test_lsi = utility.apply_lsi(train_idf, test_idf)

    if cv:  # SVM Cross Validation Testing
        logger.info("Calculating Best Parameter Value")
        exponents = [-3, -2, -1, 0, 1, 2, 3]
        best_scores = []

        for exp in exponents:
            logger.info("Parameter Value: {}".format(exp))
            # BUG FIX: original did 10**C[i], indexing the exponent list by
            # its own values (e.g. i=-3 picked C[-3]=1), so the sweep tested
            # the wrong penalty values.
            clf = svm.SVC(kernel='linear', C=10**exp)
            scores = cross_validation.cross_val_score(
                clf, train_lsi, processed_train_dataset.target, cv=5)
            best_scores.append(np.mean(scores))

        # BUG FIX: map the best score's position back to its exponent; the
        # original used the list index itself (0..6) as the exponent.
        best_exp = exponents[best_scores.index(max(best_scores))]
        logger.info(best_scores)
        logger.info("Best Parameter Value: {}".format(best_exp))
        classifier = svm.SVC(kernel='linear', C=10**best_exp)

    logger.info("Training {0} Classifier ".format(algo))
    classifier.fit(train_lsi,
                   processed_train_dataset.target)  # fit the training data
    logger.info(
        "Testing {0} Classifier".format(algo))  # predict the testing data
    test_predicted = classifier.predict(test_lsi)

    utility.calculate_statistics(
        processed_test_dataset.target,
        test_predicted)  # calculate classifier statistics

    if roc:  # plot ROC curve
        utility.plot_ROC(processed_test_dataset.target, test_predicted, algo)
Ejemplo n.º 4
0
 def CalcTimeseriesStatistics(
         sim_obs_dict,  # type: Dict[AnyStr, Dict[AnyStr, Union[float, List[Union[datetime, float]]]]]
         stime=None,  # type: Optional[datetime]
         etime=None  # type: Optional[datetime]
 ):
     # type: (...) -> (List[AnyStr], List[float])
     """Flatten per-variable objective statistics into parallel name/value lists.

     Delegates computation to calculate_statistics(), then emits one
     '<variable>-<objective>' label per (variable, objective) pair; PBIAS
     values are reported as absolute values.
     """
     objnames = calculate_statistics(sim_obs_dict, stime, etime)
     if objnames is None:
         return None, None
     labels = list()
     values = list()
     for varname, stats in sim_obs_dict.items():
         for objname in objnames:
             labels.append('%s-%s' % (varname, objname))
             value = stats[objname]
             # report the magnitude of percent bias, sign is irrelevant here
             values.append(math.fabs(value) if objname.upper() == 'PBIAS' else value)
     return labels, values
Ejemplo n.º 5
0
def question_i():
    """Multi-class classification (Question I) on four newsgroup categories.

    Pipeline: preprocess each document -> TFxIDF (CountVectorizer +
    TfidfTransformer) -> LSI via TruncatedSVD (50 components) -> train and
    evaluate OneVsOne/OneVsRest Naive Bayes and linear-SVM classifiers.
    """
    logger.info("EXECUTING: QUESTION I")
    logger.info("Multi-Class Classification")

    category = [
        'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'misc.forsale',
        'soc.religion.christian'
    ]
    train, test = utility.load_dataset(category)

    logger.info("Processing Training Dataset")
    # enumerate() replaces the zip(data, range(len(data))) anti-pattern
    for pos, data in enumerate(train.data):
        train.data[pos] = ' '.join(utility.preprocess_data(data))

    logger.info("Processing Testing Dataset")
    for pos, data in enumerate(test.data):
        test.data[pos] = ' '.join(utility.preprocess_data(data))

    logger.info("Creating TFxIDF Vector Representations")

    stop_words = text.ENGLISH_STOP_WORDS  # omit stop words

    # using CountVectorizer and TFxIDF Transformer; vectorizer is fit on the
    # training corpus only, then applied to the test corpus
    count_vect = CountVectorizer(stop_words=stop_words, lowercase=True)
    train_counts = count_vect.fit_transform(train.data)
    test_counts = count_vect.transform(test.data)
    tfidf_transformer = TfidfTransformer(norm='l2', sublinear_tf=True)
    train_idf = tfidf_transformer.fit_transform(train_counts)
    test_idf = tfidf_transformer.transform(test_counts)

    logger.info("Performing LSI on TFxIDF Matrices")
    # apply LSI to TFxIDF matrices (dimensionality reduction to 50 components)
    svd = TruncatedSVD(n_components=50)
    train_lsi = svd.fit_transform(train_idf)
    test_lsi = svd.transform(test_idf)

    logger.info("TFxIDF Matrices Transformed")

    logger.info("Size of Transformed Training Dataset: {0}".format(
        train_lsi.shape))
    logger.info("Size of Transformed Testing Dataset: {0}".format(
        test_lsi.shape))

    clf_list = [
        OneVsOneClassifier(GaussianNB()),
        OneVsOneClassifier(svm.SVC(kernel='linear')),
        OneVsRestClassifier(GaussianNB()),
        OneVsRestClassifier(svm.SVC(kernel='linear'))
    ]
    clf_name = [
        'OneVsOneClassifier Naive Bayes', 'OneVsOneClassifier SVM',
        'OneVsRestClassifier Naive Bayes', 'OneVsRestClassifier SVM'
    ]

    # perform classification with each multi-class strategy
    for clf, clf_n in zip(clf_list, clf_name):
        logger.info("Training {0} Classifier ".format(clf_n))
        clf.fit(train_lsi, train.target)
        logger.info("Testing {0} Classifier".format(clf_n))
        test_predicted = clf.predict(test_lsi)
        utility.calculate_statistics(test.target, test_predicted)
Ejemplo n.º 6
0
    def __init__(self, cfg):
        # type: (PostConfig) -> None
        """Initialize post-processing state from the configuration.

        Loads the SEIMS model, reads simulation metadata and observation
        data from MongoDB and simulated outputs from text files, clamps the
        calibration/validation periods to the simulated period, and computes
        statistics for both periods.

        Raises:
            ValueError: if the model output directory does not exist.
            RuntimeError: if no simulated data is available in the period.
        """
        self.model = MainSEIMS(args_dict=cfg.model_cfg.ConfigDict)
        self.ws = self.model.OutputDirectory
        if not FileClass.is_dir_exists(self.ws):
            raise ValueError('The output directory %s is not existed!' %
                             self.ws)
        self.plot_vars = cfg.plot_vars
        self.plot_cfg = cfg.plot_cfg  # type: PlotConfig
        # UTCTIME, calibration period
        self.stime = cfg.cali_stime
        self.etime = cfg.cali_etime
        self.subbsnID = cfg.plt_subbsnid
        # validation period
        self.vali_stime = cfg.vali_stime
        self.vali_etime = cfg.vali_etime

        # Read model data from MongoDB, the time period of simulation is read from FILE_IN.
        mongoclient = ConnectMongoDB(self.model.host,
                                     self.model.port).get_conn()
        self.readData = ReadModelData(mongoclient, self.model.db_name)
        self.mode = self.readData.Mode
        self.interval = self.readData.Interval
        # check start and end time of calibration:
        # clamp the calibration window to the simulated period [st, et]
        st, et = self.readData.SimulationPeriod
        self.plot_validation = True
        if st > self.stime:
            self.stime = st
        if et < self.etime:
            self.etime = et
        # NOTE(review): after the two clamps above, self.stime >= st always
        # holds, so `st > self.etime > self.stime` appears unreachable here —
        # confirm whether this check was intended to run before the clamping.
        if st > self.etime > self.stime:
            self.stime = st
            self.etime = et
            # in this circumstance, no validation should be calculated.
            self.vali_stime = None
            self.vali_etime = None
            self.plot_validation = False
        # check validation time period: drop it when empty or outside the
        # simulated period, otherwise clamp it to [st, et]
        if self.vali_stime and self.vali_etime:
            if self.vali_stime >= self.vali_etime or st > self.vali_etime > self.vali_stime \
                or self.vali_stime >= et:
                self.vali_stime = None
                self.vali_etime = None
                self.plot_validation = False
            elif st > self.vali_stime:
                self.vali_stime = st
            elif et < self.vali_etime:
                self.vali_etime = et
        else:
            self.plot_validation = False
        # Set start time and end time of both calibration and validation periods
        # (the union of the two windows, used for all subsequent data reads)
        start = self.stime
        end = self.etime
        if self.plot_validation:
            start = self.stime if self.stime < self.vali_stime else self.vali_stime
            end = self.etime if self.etime > self.vali_etime else self.vali_etime
        self.outletid = self.readData.OutletID
        # read precipitation
        self.pcp_date_value = self.readData.Precipitation(
            self.subbsnID, start, end)
        # read simulated data and update the available variables
        self.plot_vars, self.sim_data_dict = read_simulation_from_txt(
            self.ws, self.plot_vars, self.outletid, start, end)
        # flatten {date: [values]} into [[date, v1, v2, ...], ...] rows
        self.sim_data_value = list(
        )  # type: List[List[Union[datetime, float]]]
        for d, vs in self.sim_data_dict.items():
            self.sim_data_value.append([d] + vs[:])
        # reset start time and end time
        if len(self.sim_data_value) == 0:
            raise RuntimeError(
                'No available simulate data, please check the start and end time!'
            )
        # read observation data from MongoDB
        self.obs_vars, self.obs_data_dict = self.readData.Observation(
            self.subbsnID, self.plot_vars, start, end)

        # Calibration period
        self.sim_obs_dict = match_simulation_observation(self.plot_vars,
                                                         self.sim_data_dict,
                                                         self.obs_vars,
                                                         self.obs_data_dict,
                                                         start_time=self.stime,
                                                         end_time=self.etime)
        # calculate_statistics mutates sim_obs_dict in place (adds objective values)
        calculate_statistics(self.sim_obs_dict)
        # Validation period if existed
        self.vali_sim_obs_dict = dict()
        if self.plot_validation:
            self.vali_sim_obs_dict = match_simulation_observation(
                self.plot_vars,
                self.sim_data_dict,
                self.obs_vars,
                self.obs_data_dict,
                start_time=self.vali_stime,
                end_time=self.vali_etime)
            calculate_statistics(self.vali_sim_obs_dict)