Example #1
    list_id = [int(x) for x in list_id]

    return list_id


if __name__ == '__main__':
    parser = build_argparser()
    args = parser.parse_args()
    option = args.option
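    # start_from is given as a 1-based position, so shift it to a 0-based index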
    if (start_index := args.start_from):
        start_index = start_index - 1

    data_dir = os.path.join(BASE_DIR, 'data', 'taipei_shop_rent_price')
    data_info_path = os.path.join(data_dir, 'data_info.csv')
    data_info = DataInfo(data_info_path)

    download_dirpath = data_info.get_download_dirpath()
    main_xhr_response_filepath = os.path.join(download_dirpath, '591_xhr_responses.json')
    output_filename = '591_lat_long_lookup.json'
    output_filepath = os.path.join(download_dirpath, output_filename)

    list_post_id = get_listing_list_id(main_xhr_response_filepath)
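    # list_post_id now holds the post id of every listing found in the XHR dump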

    # set webdriver, request interceptor scope, and wait object
    print("note: this scrapping will take hours (there are some brakes "
        "to respect the website). The program heavily depend on your internet connection")
    print("INFO: setup crawler, use Firefox driver")
    webdriver_options = webdriver.FirefoxOptions()
    if option == 'hide':
        webdriver_options.headless = True
Example #2
                # strip digits and ASCII letters so only the bare station
                # name remains as the dictionary key
                lambda x: station_dict.get(re.sub('[0-9]|[a-z]|[A-Z]', '', x))
            )

        data_df['station_out_village_code'] = data_df['station_out_village_code'].apply(int)

    return data_df


if __name__ == '__main__':
    data_name = 'taipei_mrt_info'
    data_dir = os.path.join(BASE_DIR, 'data', data_name)
    data_info_path = os.path.join(data_dir, 'data_info.csv')
    helper_dict_filepath = os.path.join(CURRENT_DIR, 'data', 'chinese_english_column_helper_dict.pkl')
    station_dimension_filepath = os.path.join(CURRENT_DIR, 'data', 'taipei_mrt_map_coordinate.csv')
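    # the helper dict maps Chinese column names to English ones; the
    # coordinate CSV holds one row per MRT station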

    data_info = DataInfo(data_info_path)

    input_filepath_list = data_info.get_structured_filepath_list()
    output_dirpath = data_info.get_normalized_dirpath()
    filename_prefix = 'taipei_mrt_passenger_data'

    used_keys = UsedKeys()
    helper_dict = load_dictionary(helper_dict_filepath)
    station_dimension_data = pd.read_csv(station_dimension_filepath)
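    # index the station dimension table by station name for direct lookups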
    station_dimension_data.set_index('station_name', inplace=True)

    # start from a clean normalized output directory
    if os.path.isdir(output_dirpath):
        shutil.rmtree(output_dirpath)
    os.mkdir(output_dirpath)
Example #3
                        metavar='option',
                        default='hide',
                        help="available choices: [ " +
                        ' | '.join(available_option) + ' ]')

    return parser


if __name__ == '__main__':
    parser = build_argparser()
    args = parser.parse_args()
    option = args.option

    data_dir = os.path.join(BASE_DIR, 'data', 'taipei_shop_rent_price')
    data_info_path = os.path.join(data_dir, 'data_info.csv')
    data_info = DataInfo(data_info_path)

    output_filename = '591_xhr_responses.json'
    output_filepath = os.path.join(data_info.get_download_dirpath(),
                                   output_filename)

    # set webdriver, request interceptor scope, and wait object
    print("note: the program heavily depend on your internet connection")
    print("INFO: setup crawler, use Firefox driver")
    webdriver_options = webdriver.FirefoxOptions()
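    # 'hide' runs Firefox headless; 'show' keeps the browser window visible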
    if option == 'hide':
        webdriver_options.headless = True
    elif option == 'show':
        webdriver_options.headless = False

    driver = webdriver.Firefox(executable_path=GeckoDriverManager().install(),
                               options=webdriver_options)
Example #4
import os

import pandas as pd

from lib import shared_lib
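# the import above presumably registers shared_lib on the import path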
from shared_lib.data_info import DataInfo

BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

if __name__ == '__main__':
    data_dir = os.path.join(BASE_DIR, 'data', 'taipei_income_by_village')
    data_info_path = os.path.join(data_dir, 'data_info.csv')
    data_info = DataInfo(data_info_path)

    input_filepath_list = data_info.get_download_filepath_list()
    output_filepath = data_info.get_structured_filepath()

    # gather every county's CSV, tagging each row with its county id
    frame_list = []

    for path in input_filepath_list:
        print('INFO: processing data from {}'.format(path))
        # filenames look like '<prefix>-<county_id>.<ext>'
        _county_id = os.path.basename(path).split('.')[0].split('-')[1]
        _read_csv = pd.read_csv(path)
        _read_csv['county_id'] = _county_id
        frame_list.append(_read_csv)

    df_data = pd.concat(frame_list, ignore_index=True)

    print('INFO: saving data to {}'.format(output_filepath))
    df_data.to_csv(output_filepath, index=False)
Example #5
        print('ERROR: generating links might not be supported')
        sys.exit(45)


if __name__ == "__main__":
    # setting arguments parser
    parser = build_argparser()
    args = parser.parse_args()

    data_name = args.data_name
    print("INFO: start getting links for data {}".format(data_name))

    data_dir = os.path.join(BASE_DIR, 'data', data_name)
    data_info_path = os.path.join(data_dir, 'data_info.csv')

    data_info = DataInfo(data_info_path)

    url = data_info.get_info_force('main_source')

    download_filetype = data_info.get_info('get_datatype')
    output_filepath = data_info.get_download_links_filepath()

    if isinstance(url, list):
        download_list = []
        for x in url:
            download_list.append(try_to_get_link(x, download_filetype))
        # materialize the flattened links: a bare chain iterator is always
        # truthy, which would defeat the emptiness check below
        download_list = list(itertools.chain.from_iterable(download_list))
    else:
        download_list = try_to_get_link(url, download_filetype)

    if download_list:
Example #6
                        metavar='option',
                        default='hide',
                        help="available choices: [ " +
                        ' | '.join(available_option) + ' ]')

    return parser


if __name__ == '__main__':
    parser = build_argparser()
    args = parser.parse_args()
    option = args.option

    data_dir = os.path.join(BASE_DIR, 'data', 'taipei_mrt_map_coordinate')
    data_info_path = os.path.join(data_dir, 'data_info.csv')
    data_info = DataInfo(data_info_path)

    output_filename = 'taipei_mrt_map_coordinate.csv'
    output_filepath = os.path.join(data_info.get_download_dirpath(),
                                   output_filename)

    # set webdriver, request interceptor scope, and wait object
    print("note: the program heavily depend on your internet connection")
    print("INFO: setup crawler, use Firefox driver")
    webdriver_options = webdriver.FirefoxOptions()
    if option == 'hide':
        webdriver_options.headless = True
    elif option == 'show':
        webdriver_options.headless = False

    driver = webdriver.Firefox(executable_path=GeckoDriverManager().install(),
                               options=webdriver_options)