import os
import time

import pandas as pd
import pytz
import quandl

import quandl_tools

# DS_NAME, START_DATE and END_DATE are module-level constants defined
# elsewhere (e.g. DS_NAME = "SHARADAR/SF1" for the Sharadar fundamentals
# table); helpers such as fnProcessInsiderTrades, fnDownloadData, setPandas
# and the config mapping are likewise defined elsewhere in this project.


def populate_raw_data(tickers, fields, raw_path):
    """tickers is a dict with the ticker string as the key and the SID
    as the value."""
    quandl_tools.set_api_key()

    # existing = listdir(RAW_FLDR)

    for ticker, sid in tickers.items():
        # if "%d.csv" % sid in existing:
        #     continue
        try:
            query_str = "%s %s" % (DS_NAME, ticker)
            print("fetching data for: {}".format(query_str))

            # df = quandl.get_table(query_str, start_date=START_DATE, end_date=END_DATE)
            df = quandl.get_table(
                DS_NAME,
                calendardate={
                    'gte': START_DATE,
                    'lte': END_DATE
                },
                ticker=ticker,
                qopts={'columns': ['dimension', 'datekey'] + fields})

            df = df[df.dimension == "ARQ"]  # only use As-Reported numbers

            # rename datekey to Date and drop the dimension flag
            df = df.rename(columns={"datekey": "Date"})
            df = df.drop(["dimension"], axis=1)

            # write raw file: raw/
            df.to_csv(os.path.join(raw_path, "{}.csv".format(sid)))
        except quandl.errors.quandl_error.NotFoundError:
            print("error with ticker: {}".format(ticker))
def download(
    bundle=KERNEL_BUNDLE,
    start_date='2007-01-01',
    tickers=None,
    fields=None,
    dimensions=None,
):
    """
    this method is a top-level executor of the download
    download volume could be reduced by setting start_date, tickers, fields, dimensions parameters
    with all parameters set as default will need couple of hours to complete the task
    for each field it gets each dimension available - thus returns fields X dimension values
    :param bundle: bundle which to be used to get the universe of tickers, sharadar-prices by default
    :param start_date: first date of the set
    :param tickers: list of tickers, all tickers by default
    :param fields: list of fields, all fields by default
    :param dimensions: list of dimensions, all dimensions by default (skipping MRs)
    """
    quandl_tools.set_api_key()
    data = download_fundamendals_data(
        bundle=bundle,
        start_date=start_date,
        tickers=tickers,
        fields=fields,
        dimensions=dimensions,
    )
    return data
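

# A hedged example of reducing download volume; the field and dimension names
# below are assumptions, not confirmed against the Sharadar schema.
def example_download():
    return download(
        start_date='2015-01-01',
        tickers=['AAPL', 'MSFT'],        # restrict the ticker universe
        fields=['revenue', 'netinc'],    # hypothetical field names
        dimensions=['ARQ'],              # As-Reported Quarterly only
    )
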
def populate_raw_data(tickers, fields, dimensions, raw_path):
    """tickers is a dict with the ticker string as the key and the SID
    as the value.
    For each field a dimension is required, so dimensions should be a list
    of dimensions for each field.
    """
    assert len(fields) == len(dimensions)
    quandl_tools.set_api_key()

    # existing = listdir(RAW_FLDR)

    for ticker, sid in tickers.items():
        # if "%d.csv" % sid in existing:
        #     continue
        try:
            query_str = "%s %s" % (DS_NAME, ticker)
            print("fetching data for: {}".format(query_str))

            # df = quandl.get_table(query_str, start_date=START_DATE, end_date=END_DATE)
            df = quandl.get_table(
                DS_NAME,
                calendardate={
                    'gte': START_DATE,
                    'lte': END_DATE
                },
                ticker=ticker,
                qopts={'columns': ['dimension', 'datekey'] + fields})
            df = df.rename(columns={'datekey': 'Date'}).set_index('Date')

            # loop over the fields and dimensions
            series = []
            for i, field in enumerate(fields):
                s = df[df.dimension == dimensions[i]][field]
                series.append(s)
            df = pd.concat(series, axis=1)
            print(df)  # debug: inspect the assembled frame

            # write raw file: raw/
            df.to_csv(os.path.join(raw_path, "{}.csv".format(sid)))
        except quandl.errors.quandl_error.NotFoundError:
            print("error with ticker: {}".format(ticker))
def populate_raw_data(tickers, input_fields, output_fields, raw_path):
    """tickers is a dict with the ticker string as the key and the SID
    as the value.  """
    quandl_tools.set_api_key()

    # existing = listdir(RAW_FLDR)

    for ticker, sid in tickers.items():
        # if "%d.csv" % sid in existing:
        #     continue
        try:
            time.sleep(0.1)  # throttle requests to respect the API rate limit
            query_str = "%s %s" % (DS_NAME, ticker)
            print("fetching data for: {}".format(query_str))

            # df = quandl.get_table(query_str, start_date=START_DATE, end_date=END_DATE)

            rawData = quandl.get_table(DS_NAME,
                                       filingdate={
                                           'gte': START_DATE,
                                           'lte': END_DATE
                                       },
                                       ticker=ticker,
                                       qopts={'columns': input_fields},
                                       paginate=True)

            df = fnProcessInsiderTrades(rawData, nDaysDiff=3)

            # aggregate by filing date and rename filingdate to Date
            if not df.empty:
                df = df.groupby('filingdate').sum().reset_index()
            df = df.rename(columns={"filingdate": "Date"})

            df = df[output_fields]

            # write raw file: raw/
            df = df.rename_axis('None', axis=0)  # index label becomes the literal string 'None'
            df.to_csv(os.path.join(raw_path, "{}.csv".format(sid)))
        except quandl.errors.quandl_error.NotFoundError:
            print("error with ticker: {}".format(ticker))


# get last updated
# quandl.get_table('SHARADAR/SFP', lastupdated={'gte':'2017-11-03'},  ticker=['SH','SPY'],)

# get more than 10,000 rows of data
# quandl.get_table('SHARADAR/SFP', date={'gte':startDate,'lte':endDate}, ticker=['SH','SPY'], paginate=True)

# get specific date range
# quandl.get_table('SHARADAR/SFP', date={'gte':'2017-01-01', 'lte':'2017-10-30'}, ticker=['SH','SPY'], paginate=True)
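
# Runnable version of the recipes above (assumes SHARADAR/SFP access):
def example_sfp_queries(startDate, endDate):
    # rows updated on or after a given date
    updated = quandl.get_table('SHARADAR/SFP',
                               lastupdated={'gte': '2017-11-03'},
                               ticker=['SH', 'SPY'])
    # full date range, paginating past the 10,000-row single-call cap
    history = quandl.get_table('SHARADAR/SFP',
                               date={'gte': startDate, 'lte': endDate},
                               ticker=['SH', 'SPY'],
                               paginate=True)
    return updated, history
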

if __name__ == '__main__':

    # custom pandas settings
    setPandas()
    # set quandl API key
    quandl_tools.set_api_key()

    startDate = pd.to_datetime(config.get('start_date'),
                               format='%Y%m%d').tz_localize(pytz.utc)
    endDate = pd.to_datetime(config.get('end_date'),
                             format='%Y%m%d').tz_localize(pytz.utc)

    # bulk data download
    df = fnDownloadData(startDate=startDate, endDate=endDate)

    # df.head()