Python ETLUtils.drop_fieldsの例

プログラミング言語: Python

名前空間/パッケージ名: etl

クラス/型: ETLUtils

メソッド/関数: drop_fields

hotexamples.comのコード掲載数: 12

Python ETLUtils.drop_fields - 12件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのetl.ETLUtils.drop_fieldsの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示 / 非表示

load_json_file(30)

filter_records(23)

save_json_file(13)

save_csv_file(12)

split_train_test(7)

drop_fields(6)

load_csv_file(6)

filter_out_records(5)

select_fields(4)

split_train_test_copy(4)

add_transpose_list_column(2)

write_row_to_csv(2)

write_row_to_json(2)

count_frequency(1)

keep_fields(1)

search_sentences(1)

コード例 #1

ファイルを表示

ファイル: review_analysis.py プロジェクト: antoine-tran/yelp

    def multiple_lineal_regression(file_path):
        """
        Fits a linear regression that predicts the 'stars' value of each
        record from its remaining fields, then prints the RMSE measured on
        the training data and the RMSE estimated with 10-fold
        cross-validation.

        :rtype : void
        :param file_path: the path handed to ReviewETL.load_file to obtain the
        records
        """
        records = ReviewETL.load_file(file_path)
        ratings = np.array([record['stars'] for record in records])
        # The target is read before this call; drop_fields is expected to
        # remove 'stars' from every record in place (its result is not used)
        ETLUtils.drop_fields(['stars'], records)
        # list() turns the Python 3 dict view into a real row; without it
        # numpy builds an array of view objects instead of a numeric matrix.
        # NOTE(review): column order follows each record's key order --
        # presumably all records share the same keys and order; confirm
        data = np.array([list(record.values()) for record in records])

        model = linear_model.LinearRegression(fit_intercept=True)
        model.fit(data, ratings)
        # Predict the whole matrix in one call: per-row calls pass 1-D
        # samples, which current scikit-learn rejects and which can produce
        # an (n, 1) array that broadcasts against the 1-D ratings
        residuals = model.predict(data) - ratings
        rmse_train = np.sqrt(np.dot(residuals, residuals) / len(data))

        # NOTE(review): KFold(n, n_folds=...) is the call form of the older
        # sklearn.cross_validation module -- confirm it matches the
        # scikit-learn version this file imports
        kf = KFold(len(data), n_folds=10)
        err = 0
        for train, test in kf:
            model.fit(data[train], ratings[train])
            fold_residuals = model.predict(data[test]) - ratings[test]
            # Accumulate the squared error over every held-out fold
            err += np.dot(fold_residuals, fold_residuals)

        rmse_10cv = np.sqrt(err / len(data))
        print('RMSE on training: {}'.format(rmse_train))
        print('RMSE on 10-fold CV: {}'.format(rmse_10cv))

コード例 #2

ファイルを表示

    def multiple_lineal_regression(file_path):
        """
        Trains a linear regression on the loaded records, using every field
        except 'stars' as input and 'stars' as the value to predict, and
        prints two error figures: the RMSE on the training data and the RMSE
        obtained through 10-fold cross-validation.

        :rtype : void
        :param file_path: the path given to ReviewETL.load_file
        """
        records = ReviewETL.load_file(file_path)
        ratings = np.array([record['stars'] for record in records])
        # 'stars' has already been captured above; drop_fields is expected to
        # strip it from the records in place (the return value is ignored)
        ETLUtils.drop_fields(['stars'], records)
        # Materialise each record's values: under Python 3 a bare
        # record.values() is a view object, not a numeric row.
        # NOTE(review): assumes all records expose their keys in the same
        # order so the columns line up -- confirm against the loader
        data = np.array([list(record.values()) for record in records])

        model = linear_model.LinearRegression(fit_intercept=True)
        model.fit(data, ratings)
        # One vectorised call replaces the per-sample loop, whose 1-D inputs
        # are not accepted by current scikit-learn and whose stacked results
        # may not be 1-D like ratings
        train_error = model.predict(data) - ratings
        rmse_train = np.sqrt(np.dot(train_error, train_error) / len(data))

        # NOTE(review): legacy sklearn.cross_validation style signature --
        # verify against the installed scikit-learn
        kf = KFold(len(data), n_folds=10)
        err = 0
        for train, test in kf:
            model.fit(data[train], ratings[train])
            test_error = model.predict(data[test]) - ratings[test]
            # Sum of squared errors across all held-out samples
            err += np.dot(test_error, test_error)

        rmse_10cv = np.sqrt(err / len(data))
        print('RMSE on training: {}'.format(rmse_train))
        print('RMSE on 10-fold CV: {}'.format(rmse_10cv))

コード例 #3

ファイルを表示

ファイル: business_etl.py プロジェクト: antoine-tran/yelp

    def drop_unwanted_fields(dictionary_list):
        """
        Passes a fixed list of business field names, together with the given
        records, to ETLUtils.drop_fields.

        :rtype : void
        :param dictionary_list: the list of dictionaries handed to
        ETLUtils.drop_fields
        """
        fields_to_drop = [
            'attributes', 'business_id', 'categories', 'city', 'full_address',
            'latitude', 'longitude', 'hours', 'name', 'neighborhoods', 'open',
            'review_count', 'stars', 'state', 'type'
        ]
        # Nothing is returned; the result of drop_fields is not used
        ETLUtils.drop_fields(fields_to_drop, dictionary_list)

コード例 #4

ファイルを表示

ファイル: yelp_reviews_preprocessor.py プロジェクト: neostoic/yelp-1

    def drop_unnecessary_fields(self):
        """
        Prints a timestamped progress message and then asks
        ETLUtils.drop_fields to drop the text, POS tags, votes and
        bag-of-words fields from self.records.
        """
        timestamp = time.strftime("%Y/%m/%d-%H:%M:%S")
        print('%s: drop unnecessary fields' % timestamp)

        # Field names come from the project's Constants module
        fields_to_drop = [
            Constants.TEXT_FIELD, Constants.POS_TAGS_FIELD,
            Constants.VOTES_FIELD, Constants.BOW_FIELD
        ]
        ETLUtils.drop_fields(fields_to_drop, self.records)

コード例 #5

ファイルを表示

    def drop_unnecessary_fields(self):
        """
        Prints a timestamped progress message and then asks
        ETLUtils.drop_fields to drop the text and POS tags fields from
        self.records.
        """
        timestamp = time.strftime("%Y/%m/%d-%H:%M:%S")
        print('%s: drop unnecessary fields' % timestamp)

        # NOTE(review): Constants.BOW_FIELD was commented out of this list, so
        # that field is kept -- confirm this is intended
        fields_to_drop = [Constants.TEXT_FIELD, Constants.POS_TAGS_FIELD]
        ETLUtils.drop_fields(fields_to_drop, self.records)

コード例 #6

ファイルを表示

    def test_drop_fields(self):
        """
        Checks that, after ETLUtils.drop_fields is asked to drop the five
        rating fields from a copy of reviews_matrix_5, the copy is equal to
        reviews_matrix_5_short.
        """
        import copy

        drop_fields = [
            'cleanliness_rating', 'location_rating', 'rooms_rating',
            'service_rating', 'value_rating'
        ]

        # list() alone copies only the outer list, so the records inside
        # would still be the fixture's own objects. drop_fields is expected
        # to work in place (its return value is not used), hence the deep
        # copy keeps the shared fixture intact for other tests.
        test_list = copy.deepcopy(list(reviews_matrix_5))

        ETLUtils.drop_fields(drop_fields, test_list)
        self.assertEqual(reviews_matrix_5_short, test_list)

        # NOTE(review): this check compares a copy of the fixture with the
        # fixture itself and never calls drop_fields, so it cannot fail --
        # confirm what it was meant to cover
        test_list = list(reviews_matrix_5_short)
        self.assertEqual(reviews_matrix_5_short, test_list)

コード例 #7

ファイルを表示

ファイル: business_etl.py プロジェクト: neostoic/yelp-1

    def drop_unwanted_fields(dictionary_list):
        """
        Hands the given records to ETLUtils.drop_fields along with the fixed
        set of business field names listed below.

        :rtype : void
        :param dictionary_list: the list of dictionaries forwarded to
        ETLUtils.drop_fields
        """
        excluded_fields = [
            'attributes',
            'business_id',
            'categories',
            'city',
            'full_address',
            'latitude',
            'longitude',
            'hours',
            'name',
            'neighborhoods',
            'open',
            'review_count',
            'stars',
            'state',
            'type'
        ]

        # The call's result is discarded and nothing is returned
        ETLUtils.drop_fields(excluded_fields, dictionary_list)

コード例 #8

ファイルを表示

ファイル: test_etl_utils.py プロジェクト: antoine-tran/yelp

    def test_drop_fields(self):
        """
        Verifies that dropping the five rating fields from a copy of
        reviews_matrix_5 through ETLUtils.drop_fields yields a list equal to
        reviews_matrix_5_short.
        """
        import copy

        drop_fields = [
            'cleanliness_rating',
            'location_rating',
            'rooms_rating',
            'service_rating',
            'value_rating'
        ]

        # A deep copy is needed because list() would share the record objects
        # with the module-level fixture, and drop_fields is expected to
        # modify its input in place (its return value is not used)
        test_list = copy.deepcopy(list(reviews_matrix_5))

        ETLUtils.drop_fields(drop_fields, test_list)
        self.assertEqual(reviews_matrix_5_short, test_list)

        # NOTE(review): no drop_fields call happens before this assertion; it
        # only compares the fixture with a copy of itself -- confirm intent
        test_list = list(reviews_matrix_5_short)
        self.assertEqual(reviews_matrix_5_short, test_list)

コード例 #9

ファイルを表示

ファイル: yelp_phoenix_extractor.py プロジェクト: antoine-tran/yelp

def pre_process_reviews(reviews_file=None):
    """
    Loads the reviews JSON file, keeps only the 'user_id', 'business_id' and
    'stars' fields, runs extract_fields over the records, drops 'business_id'
    and 'stars', and finally passes the records through clean_reviews.

    :param reviews_file: path of the JSON file to load; when None, the
    original hard-coded Yelp Phoenix dataset location is used
    :return: the list of reviews returned by clean_reviews
    """
    if reviews_file is None:
        # Historical default, kept so that existing no-argument calls behave
        # exactly as before
        reviews_file = '/Users/fpena/UCC/Thesis/datasets/yelp_phoenix_academic_dataset/yelp_academic_dataset_review.json'
    reviews = ETLUtils.load_json_file(reviews_file)

    select_fields = ['user_id', 'business_id', 'stars']
    reviews = ETLUtils.select_fields(select_fields, reviews)
    # NOTE(review): the results of extract_fields and drop_fields are not
    # used, so both presumably modify the records in place -- confirm
    extract_fields(reviews)
    ETLUtils.drop_fields(['business_id', 'stars'], reviews)
    reviews = clean_reviews(reviews)

    return reviews

コード例 #10

ファイルを表示

def pre_process_reviews(reviews_file=None):
    """
    Builds the preprocessed review list: the JSON file is loaded, reduced to
    the 'user_id', 'business_id' and 'stars' fields, passed to
    extract_fields, stripped of 'business_id' and 'stars', and then handed
    to clean_reviews.

    :param reviews_file: path of the JSON reviews file; None selects the
    hard-coded Yelp Phoenix dataset path this function always used before
    :return: the list of reviews produced by clean_reviews
    """
    if reviews_file is None:
        # Preserve the previous behaviour for callers that pass nothing
        reviews_file = '/Users/fpena/UCC/Thesis/datasets/yelp_phoenix_academic_dataset/yelp_academic_dataset_review.json'
    reviews = ETLUtils.load_json_file(reviews_file)

    select_fields = ['user_id', 'business_id', 'stars']
    reviews = ETLUtils.select_fields(select_fields, reviews)
    # NOTE(review): extract_fields and drop_fields are called only for their
    # effect on `reviews` (return values ignored) -- presumably in place
    extract_fields(reviews)
    ETLUtils.drop_fields(['business_id', 'stars'], reviews)
    reviews = clean_reviews(reviews)

    return reviews

コード例 #11

ファイルを表示

def pre_process_reviews(review_file_path=None):
    """
    Loads the reviews JSON file, keeps only the 'ratings', 'author' and
    'offering_id' fields, runs extract_fields over the records, drops
    'author' and 'ratings', and finally passes the records through
    clean_reviews.

    :param review_file_path: path of the JSON file to load; when None, the
    original hard-coded TripAdvisor Four-City 'review.txt' location is used
    :return: the list of reviews returned by clean_reviews
    """
    if review_file_path is None:
        # Historical default, kept so that existing no-argument calls behave
        # exactly as before
        data_folder = '/Users/fpena/UCC/Thesis/datasets/TripAdvisor/Four-City/'
        review_file_path = data_folder + 'review.txt'
    reviews = ETLUtils.load_json_file(review_file_path)

    select_fields = ['ratings', 'author', 'offering_id']
    reviews = ETLUtils.select_fields(select_fields, reviews)
    # NOTE(review): the results of extract_fields and drop_fields are not
    # used, so both presumably modify the records in place -- confirm
    extract_fields(reviews)
    ETLUtils.drop_fields(['author', 'ratings'], reviews)
    reviews = clean_reviews(reviews)

    return reviews

コード例 #12

ファイルを表示

ファイル: extractor.py プロジェクト: antoine-tran/yelp

def pre_process_reviews(review_file_path=None):
    """
    Builds the preprocessed review list: the JSON file is loaded, reduced to
    the 'ratings', 'author' and 'offering_id' fields, passed to
    extract_fields, stripped of 'author' and 'ratings', and then handed to
    clean_reviews.

    :param review_file_path: path of the JSON reviews file; None selects the
    hard-coded TripAdvisor Four-City 'review.txt' path used previously
    :return: the list of reviews produced by clean_reviews
    """
    if review_file_path is None:
        # Preserve the previous behaviour for callers that pass nothing
        data_folder = '/Users/fpena/UCC/Thesis/datasets/TripAdvisor/Four-City/'
        review_file_path = data_folder + 'review.txt'
    reviews = ETLUtils.load_json_file(review_file_path)

    select_fields = ['ratings', 'author', 'offering_id']
    reviews = ETLUtils.select_fields(select_fields, reviews)
    # NOTE(review): extract_fields and drop_fields are called only for their
    # effect on `reviews` (return values ignored) -- presumably in place
    extract_fields(reviews)
    ETLUtils.drop_fields(['author', 'ratings'], reviews)
    reviews = clean_reviews(reviews)

    return reviews