import logging

# configure_log, get_soup, and get_files are project helpers defined elsewhere in CityScrape.

def main():
    """Entry point: gather new raw files for ingest into CityScrapeDB."""
    configure_log()
    logger = logging.getLogger(__name__)

    logger.info('Executing initial page scrape to look for new files...')
    soup = get_soup()

    logger.info('Fetching files now!')
    get_files(soup)

    logger.info('CityScrape download complete!')
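configure_log, get_soup, and get_files are not shown in this example. As a minimal sketch of what get_soup might look like, assuming the listing page is fetched with requests and parsed with BeautifulSoup (the URL and parser choice are placeholders, not the project's actual values):

import requests
from bs4 import BeautifulSoup

SCRAPE_URL = "https://example.gov/city-records"  # placeholder, not the real CityScrape URL

def get_soup():
    # Fetch the listing page and parse it so main() can scan it for new files.
    response = requests.get(SCRAPE_URL, timeout=30)
    response.raise_for_status()  # fail loudly on HTTP errors
    return BeautifulSoup(response.text, "html.parser")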
Example #2
    @classmethod
    def run_naive_bayes(cls, train, test, binarizer, labels, alpha):
        # Needs: import warnings; from sklearn.multiclass import OneVsRestClassifier;
        # from sklearn.naive_bayes import MultinomialNB; from sklearn.metrics import precision_score
        logger = configure_log(__file__)

        logger.info("alpha = %s", alpha)
        logger.info("Fitting Naive Bayes...")
        train_data, train_labels = train
        test_data, test_labels = test

        classifier = OneVsRestClassifier(MultinomialNB(alpha=alpha, fit_prior=True, class_prior=None))
        with warnings.catch_warnings():  # FIXME: split the data set in a way that the train set has every label
            warnings.simplefilter("ignore")
            classifier.fit(train_data, train_labels)

        # Every label index that appears anywhere in the full label matrix.
        possible_labels = {idx for label in labels for idx in label.nonzero()[0]}

        logger.info("Predicting test set...")
        test_predictions = cls.predict(
            classifier=classifier,
            data=test_data,
            labels=test_labels,
            possible_labels=possible_labels,
            binarizer=binarizer,
        )

        # logger.info('Predicting train set...')
        # train_predictions = cls.predict(classifier=classifier, data=train_data, labels=train_labels,
        #                                 possible_labels=possible_labels, binarizer=binarizer)

        test_precision = precision_score(y_true=test_labels, y_pred=test_predictions, average="samples")
        # train_precision = precision_score(y_true=train_labels, y_pred=train_predictions, average='samples')

        # return train_precision, test_precision
        return test_precision
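A pitfall worth noting in the label-collection step: the source's original idiom, [map(possible_labels.add, row) for row in ...], silently does nothing on Python 3 because map() is lazy, which is why a set comprehension is used above. A self-contained check of that idiom on a small stand-in label matrix:

import numpy as np

# Three samples, four possible labels, in the binarized row format the method expects.
labels = np.array([
    [1, 0, 1, 0],
    [0, 1, 0, 0],
    [1, 0, 0, 0],
])

# Gather the nonzero column indices (label ids) across all rows.
possible_labels = {idx for label in labels for idx in label.nonzero()[0]}
assert possible_labels == {0, 1, 2}  # label 3 never appears in this matrix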
Example #3
    @classmethod
    def run_svm(cls, train, test, C, binarizer, labels, intercept):
        """
        Fit a one-vs-rest linear SVM and return samples-averaged precision on the test set.

        :param train: (train_data, train_labels) tuple
        :param test: (test_data, test_labels) tuple
        :param C: regularization strength passed to LinearSVC (C -> inf approaches hard-margin)
        :param binarizer: fitted label binarizer used to decode predictions
        :param labels: full binarized label matrix, used to enumerate possible label indices
        :param intercept: intercept_scaling value passed to LinearSVC
        """
        # Needs: import warnings; from sklearn.multiclass import OneVsRestClassifier;
        # from sklearn.svm import LinearSVC; from sklearn.metrics import precision_score
        logger = configure_log(__file__)

        logger.info("C = %s", C)
        logger.info("Fitting Linear SVM...")
        train_data, train_labels = train
        test_data, test_labels = test

        # liblinear: prefer the primal formulation when samples outnumber features.
        dual = train_data.shape[0] <= train_data.shape[1]
        classifier = OneVsRestClassifier(
            LinearSVC(dual=dual, class_weight=None, C=C, intercept_scaling=intercept)
        )  # C -> inf = hard-margin
        with warnings.catch_warnings():  # FIXME: split the data set in a way that the train set has every label
            warnings.simplefilter("ignore")
            classifier.fit(train_data, train_labels)

        # Every label index that appears anywhere in the full label matrix.
        possible_labels = {idx for label in labels for idx in label.nonzero()[0]}

        # Label indices present in the train split (currently unused below).
        seen_labels = {idx for label in train_labels for idx in label.nonzero()[0]}

        logger.info("Predicting test set...")
        test_predictions = cls.predict(
            classifier=classifier,
            data=test_data,
            labels=test_labels,
            possible_labels=possible_labels,
            binarizer=binarizer,
        )
        # test_predictions = cls.predict_k(classifier=classifier, data=test_data, k=100, binarizer=binarizer)

        # logger.info('Predicting train set...')
        # train_predictions = cls.predict(classifier=classifier, data=train_data, labels=train_labels,
        #                                 possible_labels=possible_labels, binarizer=binarizer)

        precision = precision_score(y_true=test_labels, y_pred=test_predictions, average="samples")
        # (precision, recall, f, support) = precision_recall_fscore_support(y_true=test_labels, y_pred=test_predictions,
        #                                                                   average='samples')
        # train_precision = precision_score(y_true=train_labels, y_pred=train_predictions, average='samples')

        # return test_precision
        return precision  # , recall, f, support
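For context, a self-contained toy run of the pipeline this method wraps (OneVsRestClassifier over LinearSVC on MultiLabelBinarizer output); the data and hyperparameters here are made up for illustration:

import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import precision_score

# Toy multi-label problem: 6 samples, 3 features, labels drawn from {0, 1, 2}.
X = np.array([[2., 0., 1.], [0., 3., 0.], [1., 1., 0.],
              [0., 0., 2.], [2., 1., 0.], [0., 2., 1.]])
binarizer = MultiLabelBinarizer()
y = binarizer.fit_transform([[0, 2], [1], [0, 1], [2], [0], [1, 2]])

# Same primal/dual heuristic as run_svm: primal form when samples outnumber features.
dual = X.shape[0] <= X.shape[1]
clf = OneVsRestClassifier(LinearSVC(dual=dual, C=1.0, intercept_scaling=1.0))
clf.fit(X[:4], y[:4])

pred = clf.predict(X[4:])
# Samples-averaged precision, as in run_svm (warns if a sample gets no predicted labels).
print(precision_score(y_true=y[4:], y_pred=pred, average="samples"))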