Example #1
import json

from tqdm import tqdm

# find_all_routes_in_area, get_route_data, and make_absolute are project helpers
# defined elsewhere in this module

def get_raw_data(data_params):
    """
    This function collates all scraping logic

    :param:     data_params     A dictionary containing all data parameters. The only ones used
                                are the location at which to save raw data and the states to
                                scrape
    """
    # store raw data here
    raw_data = []

    # iterate over every state
    # ("states" is assumed to map each state name to its area URL)
    for state, url in data_params["states"].items():
        all_routes = find_all_routes_in_area(url)

        # for every route in the state, get the route data
        for route_url in tqdm(all_routes):
            route_data = get_route_data(route_url)
            if route_data:
                raw_data.append(route_data)

        # save the raw data
        raw_data_path = make_absolute(data_params["raw_data_folder"] + state + ".json")
        with open(raw_data_path, "w") as f:
            json.dump(raw_data, f)

        # after saving the raw data, clear the raw data list
        raw_data = []
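
For reference, a minimal usage sketch follows. The dictionary is hypothetical and only illustrates the shape get_raw_data expects: "states" is assumed to map each state name to the URL of its route-listing page, and "raw_data_folder" is assumed to end in a path separator.

# hypothetical usage sketch; these values are placeholders, not real project settings
data_params = {
    "states": {
        "colorado": "https://www.example.com/area/colorado",
    },
    "raw_data_folder": "data/raw/",
}
get_raw_data(data_params)
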
Example #2
import os
import shutil

# make_absolute, download_data, and process_data are project helpers defined elsewhere

def run_data(config):
    """
    This function downloads and saves raw and processed data, by default to the folders
    "data/raw/" and "data/processed/". These save locations can be changed in the config file

    :param:     config      The config file. Default settings can be found in "config/default.json"
    """
    # the folders in which to save data
    data_path = make_absolute(config["data_path"])
    raw_path = os.path.join(data_path, config["raw_folder"])
    adj_close_path = os.path.join(raw_path, config["adj_close_folder"])
    iv_path = os.path.join(raw_path, config["iv_folder"])
    processed_path = os.path.join(data_path, config["processed_folder"])

    # delete then recreate the data folders
    # this is to completely overwrite all data if it exists
    shutil.rmtree(data_path, ignore_errors=True)
    os.mkdir(data_path)
    os.mkdir(raw_path)
    os.mkdir(adj_close_path)
    os.mkdir(iv_path)
    os.mkdir(processed_path)

    # download raw data
    download_data(config)

    # process the data
    process_data(config)
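
A minimal sketch of driving run_data from a config file, assuming the default config mentioned in the docstring:

import json

# load the default settings and run the full download-and-process pipeline
with open("config/default.json", "r") as f:
    config = json.load(f)
run_data(config)
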
Example #3
import os

import pandas as pd
import quandl
from tqdm import tqdm

# get_ticker_adj_close, get_ticker_iv, log, and make_absolute are project helpers

def download_data(config):
    """
    Download the raw data needed for every ticker

    :param:     config      The config file
    """
    # the folders in which to save data
    data_path = make_absolute(config["data_path"])
    raw_path = os.path.join(data_path, config["raw_folder"])
    adj_close_path = os.path.join(raw_path, config["adj_close_folder"])
    iv_path = os.path.join(raw_path, config["iv_folder"])

    # initialize quandl with the api key
    quandl.ApiConfig.api_key = os.environ["QUANDL_API_KEY"]

    print("Downloading data...")

    # store iv metadata here
    iv_metadata = []

    # iterate over each ticker
    # (a copy of the list, since missing tickers are removed from it in the except block)
    for ticker in tqdm(list(config["tickers"])):
        # this try except should catch tickers that do not exist in Quandl's EOD database
        try:
            # for each ticker download and save adj_close data
            data = get_ticker_adj_close(ticker)
            data.to_csv(os.path.join(adj_close_path, ticker + ".csv"),
                        index=False)

            # for each ticker get iv data/metadata
            # save data and store metadata
            data, metadata = get_ticker_iv(ticker)
            data.to_csv(os.path.join(iv_path, ticker + ".csv"), index=False)
            iv_metadata.append(metadata)
        except quandl.errors.quandl_error.NotFoundError:
            error_str = f"Ticker {ticker} does not exist in Quandl's EOD database. It " + \
                "will be removed for the rest of the current run."

            # log the error
            if config["log"]:
                log(error_str)

            # print out an error statement
            print()
            print(error_str)

            # remove the ticker from the config file
            config["tickers"].remove(ticker)

    # save metadata
    iv_metadata = pd.DataFrame(iv_metadata,
                               columns=[
                                   "ticker", "next_earnings_day",
                                   "trading_days", "calendar_days",
                                   "crush_rate"
                               ])
    iv_metadata.to_csv(os.path.join(iv_path, "metadata.csv"), index=False)

    print("Done\n")
Example #4
import csv
import json
import os

# make_absolute and split_into_user_climb are project helpers defined elsewhere

def get_clean_data(data_params):
    """
    This function collates all cleaning logic

    :param:     data_params     A dictionary containing all data parameters. The only ones used are
                                the location at which to download raw data and the location at which
                                to save clean data
    """
    # iterate over every state
    # it is assumed that data is saved and named according to get_raw_data.py
    for state in data_params["states"]:
        # build the path at which the raw data for this state was saved
        raw_data_path = make_absolute(data_params["raw_data_folder"] + state + ".json")
        
        # get the data
        with open(raw_data_path, "r") as f:
            raw_data = json.load(f)

        # store all clean data as a list of lists
        # note that the first input row is the column names
        climb_data = [["climb_id", "name", "description", "image_url", "latitude", "longitude",
            "avg_rating", "num_ratings", "url", "climb_type", "height_ft", "height_m", "pitches",
            "grade", "protection", "difficulty", "rock_climb", "boulder_climb"]]
        #user_data = [["user_id", "climb_id", "rating"]]

        # process the data
        for climb in raw_data:
            # get the climb/user data and add it to the list of lists
            climb_row = split_into_user_climb(climb)
            climb_data.append(climb_row)
            #for user_row in user_rows:
            #    user_data.append(user_row)

        # save the list of lists as csv data in the proper location
        clean_data_path = make_absolute(data_params["clean_data_folder"])
        with open(os.path.join(clean_data_path, state + "_climbs.csv"), "w",
                  encoding="utf-8", newline="") as f:
            writer = csv.writer(f)
            writer.writerows(climb_data)
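
split_into_user_climb is defined elsewhere; the sketch below only illustrates the row shape it must return, assuming each raw climb record is a flat dictionary keyed by the same field names as the header row above.

# hypothetical sketch; the real schema comes from get_route_data in Example #1
CLIMB_COLUMNS = [
    "climb_id", "name", "description", "image_url", "latitude", "longitude",
    "avg_rating", "num_ratings", "url", "climb_type", "height_ft", "height_m",
    "pitches", "grade", "protection", "difficulty", "rock_climb", "boulder_climb"
]

def split_into_user_climb(climb):
    """Flatten one raw climb record into a row matching the header row."""
    # missing fields become None, which csv.writer writes as an empty cell
    return [climb.get(column) for column in CLIMB_COLUMNS]
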
Example #5
import os

import pandas as pd
from tqdm import tqdm

# make_absolute, get_workbook, add_short_term_sheet, add_long_term_sheets, and save
# are project helpers defined elsewhere

def run_sheet(config):
    """
    This function collates the logic needed to put processed data into the sheet

    :param:     config      The config file

    :return:    str         The filename of the saved workbook
    """
    # the folders in which to get data
    data_path = make_absolute(config["data_path"])
    processed_path = os.path.join(data_path, config["processed_folder"])
    xl_path = make_absolute(config["save_location"])

    # create the wb with the proper pages for each sheet
    wb = get_workbook(config["tickers"])

    # add short term data to the wb

    # get the short term metadata
    metadata = pd.read_csv(os.path.join(processed_path, "metadata.csv"))

    # iterate over every ticker with short term data
    print("Adding short term data...")
    for ticker in tqdm(config["tickers"]):
        # get the short term data for this ticker
        data = pd.read_csv(os.path.join(processed_path, ticker + ".csv"))

        # add the short term data and metadata to the wb
        add_short_term_sheet(ticker, wb[ticker], data,
                             metadata[metadata["ticker"] == ticker], config)
    print("Done\n")

    print("Adding long term data...")
    # add all long term sheets
    add_long_term_sheets(wb, config)
    print("Done\n")

    # at the end save the sheet
    return save(wb, xl_path)
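
get_workbook is another project helper; a plausible openpyxl sketch, assuming it simply creates one empty sheet per ticker:

from openpyxl import Workbook

def get_workbook(tickers):
    """Hypothetical sketch: a workbook with one sheet per ticker, in order."""
    wb = Workbook()
    # drop the default sheet that openpyxl creates automatically
    wb.remove(wb.active)
    for ticker in tickers:
        wb.create_sheet(ticker)
    return wb
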
Example #6
import os

import pandas as pd

# make_absolute and get_monthly_for_stock are project helpers defined elsewhere

def process_data(config):
    """
    This function contains the logic to change raw data into processed data, as described in the 
    file header

    :param:     config      The config file
    """
    print("Processing Data...")

    # the folders in which to download/save data
    data_path = make_absolute(config["data_path"])
    raw_path = os.path.join(data_path, config["raw_folder"])
    adj_close_path = os.path.join(raw_path, config["adj_close_folder"])
    iv_path = os.path.join(raw_path, config["iv_folder"])
    processed_path = os.path.join(data_path, config["processed_folder"])

    # store short term data statistics here
    short_term_stats = pd.DataFrame(columns=[
        "ticker", "n", "mean", "20 Day STD", "40 Day STD", "60 Day STD"
    ])

    # iterate over all the raw data
    for ticker in config["tickers"]:
        # open the adj_close csv file and select the most recent 60 rows
        # (the raw file is assumed to be sorted newest-first)
        data = pd.read_csv(os.path.join(adj_close_path, ticker +
                                        ".csv"))[:60].reset_index(drop=True)

        # compute the various short term data stats
        n = len(data.index)  # normally 60, fewer if the file has a short history
        mean = data["Adj_Close"].mean()
        std_20 = data["Adj_Close"][-20:].std()
        std_40 = data["Adj_Close"][-40:].std()
        std_60 = data["Adj_Close"].std()

        # add the short term stats to the df
        short_term_stats.loc[len(short_term_stats.index)] = [
            ticker, n, mean, std_20, std_40, std_60
        ]

        # open the iv csv file and add the columns to data
        iv_data = pd.read_csv(os.path.join(iv_path, ticker + ".csv"))
        data[["IV30 %", "IV30 Rank", "IV30 Rating"
              ]] = iv_data[["Iv30Percentile", "Iv30Rank", "Iv30Rating"]]

        # save the combined columns to a csv file
        data.to_csv(os.path.join(processed_path, ticker + ".csv"), index=False)

    # add short term data statistics to the metadata then save
    metadata = pd.read_csv(os.path.join(iv_path, "metadata.csv"))
    metadata = metadata.merge(short_term_stats, on="ticker", how="inner")
    metadata.to_csv(os.path.join(processed_path, "metadata.csv"), index=False)

    # then compute and save the long term data
    # create the structure to hold the data
    percent_change = []
    std = []
    freq = []

    # get the percent change for each ticker
    for ticker in config["tickers"]:
        data = pd.read_csv(os.path.join(adj_close_path, ticker + ".csv"),
                           parse_dates=["Date"])

        # get the percentage change for every month in the last 10 years for the ticker
        monthly = get_monthly_for_stock(data)

        # add the various values to the proper lists
        percent_change.append([ticker] + monthly[0])
        std.append([ticker] + monthly[1])
        freq.append([ticker] + monthly[2])

    # change the lists of lists to dfs
    columns = [
        "Ticker", "Jan (1)", "Feb (2)", "Mar (3)", "Apr (4)", "May (5)",
        "Jun (6)", "Jul (7)", "Aug (8)", "Sep (9)", "Oct (10)", "Nov (11)",
        "Dec (12)"
    ]
    percent_change = pd.DataFrame(percent_change, columns=columns)
    std = pd.DataFrame(std, columns=columns)
    freq = pd.DataFrame(freq, columns=columns)

    # save the long term data
    # processed_path is already absolute, so no extra make_absolute call is needed
    percent_change.to_csv(os.path.join(processed_path, "perc.csv"), index=False)
    std.to_csv(os.path.join(processed_path, "std.csv"), index=False)
    freq.to_csv(os.path.join(processed_path, "freq.csv"), index=False)

    print("Done\n")
Example #7
import datetime
import os

import pandas as pd
from openpyxl.styles import Alignment
from openpyxl.utils.dataframe import dataframe_to_rows

# make_absolute, get_month_order, integer_to_letter, and get_color are project helpers

def add_long_term_sheet(wb, config, file_name, sheet_name, sheet_location,
                        avg_function):
    """
    Add a specific long term sheet using the data from the file_name parameter.

    :param:     wb                  The workbook to add the sheet to
    :param:     config              The config file. By default "config/default.json"
    :param:     file_name           The name (no extension) of the file where data for this sheet
                                    can be found. The file_name should also reflect the default
                                    settings in "config/default.json" since the coloring looks for
                                    the key "{file_name}_low_high" in the config file for how to 
                                    color
    :param:     sheet_name          The name of the sheet to insert
    :param:     sheet_location      Where in the workbook to put the sheet
    :param:     avg_function        The function used to generate the "4 Month" column. This
                                    function should take in a series with four elements and return
                                    a number (usually a float)
    """
    # get the current year and month
    current_year = datetime.date.today().year
    current_month = datetime.date.today().month

    # create the sheet at the correct location
    sheet = wb.create_sheet(sheet_name, sheet_location)

    # get the data to fill into the sheet

    data_path = os.path.join(make_absolute(config["data_path"]),
                             config["processed_folder"], file_name + ".csv")
    data = pd.read_csv(data_path)

    # get the correct order of months
    month_order = get_month_order(current_month)
    four_month = None
    if current_month >= 10:
        # if the month is Oct, Nov, or Dec, then there will be an empty column placed after Dec,
        # which will cause issues with the averaging functions, so remove the empty column
        # (slicing already copies the list, so no explicit .copy() is needed)
        four_month = month_order[1:6]
        four_month.remove("Empty")
    else:
        four_month = month_order[1:5]

    # compute the four month average based on the specific average function
    data["4 Month"] = data[four_month].apply(avg_function, axis=1)

    # add the empty column
    data["Empty"] = ""

    # reorder the columns
    data = data[month_order]

    # add all the data to the sheet
    for row in dataframe_to_rows(data, index=False, header=True):
        sheet.append(row)

    # remove the "Empty" column header
    # get the letter column it appears in and set value to ""
    column_letter = integer_to_letter(month_order.index("Empty") + 1)
    sheet[column_letter + "1"] = ""

    # format all column headers
    # don't touch ticker, but set all other headers to center alignment
    for letter_int in range(2, 16):
        sheet[integer_to_letter(letter_int) +
              "1"].alignment = Alignment(horizontal="center")

    # move all cells down one
    sheet.move_range("A1:O" + str(1 + len(data.index)), rows=1, cols=0)

    # merge super header cells and add the years
    # if the current month is January, then all data comes from the previous year
    if current_month == 1:
        sheet.merge_cells("B1:N1")
        sheet["B1"] = current_year - 1
    # otherwise...
    else:
        # merge all cells up to but not including the empty column
        sheet.merge_cells("B1:" +
                          integer_to_letter(month_order.index("Empty")) + "1")
        sheet["B1"] = current_year - 1
        sheet["B1"].alignment = Alignment(horizontal="center")

        # merge all cells after the empty column
        start_cell = integer_to_letter(month_order.index("Empty") + 2) + "1"
        end_cell = integer_to_letter((month_order.index("Empty") + 2) +
                                     (current_month - 2)) + "1"
        sheet.merge_cells(start_cell + ":" + end_cell)
        sheet[start_cell] = current_year
        sheet[start_cell].alignment = Alignment(horizontal="center")

    # color the sheet

    # get the starting cell (top left) and ending cell (bottom right) for the coloring
    start_cell = "B3"
    end_cell = "O" + str(len(data.index) + 2)

    # get the low/high color thresholds and the color gradient
    low_high = config[file_name + "_low_high"]
    color_gradient = config["color_gradient"]

    # iterate over every cell
    for row_of_cells in sheet[start_cell + ":" + end_cell]:
        for cell in row_of_cells:
            # color the cell based on the value of the cell, ignoring empty cells
            if cell.value != "":
                cell.fill = get_color(low_high[0], low_high[1],
                                      float(cell.value), color_gradient)
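
integer_to_letter and get_color are helpers defined elsewhere; plausible sketches of both follow, assuming color_gradient is a list of hex color strings ordered from low to high:

from openpyxl.styles import PatternFill
from openpyxl.utils import get_column_letter

def integer_to_letter(index):
    """Hypothetical sketch: 1-based column index to an Excel letter ("A", "B", ...)."""
    return get_column_letter(index)

def get_color(low, high, value, color_gradient):
    """Hypothetical sketch: map a value onto the gradient by linear interpolation."""
    # clamp the value into [low, high], then scale it to a gradient index
    fraction = (min(max(value, low), high) - low) / (high - low)
    index = min(int(fraction * len(color_gradient)), len(color_gradient) - 1)
    hex_color = color_gradient[index]
    return PatternFill(start_color=hex_color, end_color=hex_color,
                       fill_type="solid")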