def _write_to_single_csv(self, series_financials_dict: dict):
    """Write the collected per-period financial rows into one CSV file.

    Iterates every (year, quarter) period in chronological product order
    and, for each symbol that has data with a reporting end date for that
    period, writes the period's row dict.

    :param series_financials_dict: mapping symbol -> {period_key -> row dict}
    :return: path of the CSV file written (``self.csv_filename``)
    """
    attr2id = self._get_attr2id(
        series_financials_dict=series_financials_dict)
    symbols = load_symbol_list(self.symbols_list_name)
    # sorted() cannot change a collection's length; compare lengths directly
    # (the original wrapped both sides in sorted() to no effect).
    assert len(self.ordered_symbols) >= len(attr2id)
    with open(self.csv_filename, mode='w') as csv_file:
        # NOTE(review): fieldnames are symbols, while the rows written below
        # are per-period dicts keyed by Tags/attribute names — confirm the
        # intended CSV layout against _get_attr2id / callers.
        writer = csv.DictWriter(csv_file, fieldnames=self.ordered_symbols)
        writer.writeheader()
        period_indices = ['{}{}'.format(year, quarter)
                          for year, quarter in itertools.product(
                              self.year_range, self.quarters_names)]
        for idx in period_indices:
            for symbol in symbols:
                symbol_dict = series_financials_dict[symbol]
                # Only write periods that were actually annotated with a
                # reporting end date (Tags.current_date).
                if idx in symbol_dict and \
                        Tags.current_date in symbol_dict[idx]:
                    writer.writerow(symbol_dict[idx])
    return self.csv_filename
def get_data(thresholds, resample_period='1W', symbols_list_name='sp500',
             start_date='2006-01-01', target_shift=4):
    """Build the dataset for a symbol list: prices + fundamentals + SIC info.

    Loads (or computes and caches) the set of available symbols, drops a
    hard-coded blacklist, merges prices/fundamentals per symbol, and
    post-processes into the normal and z-scored dataset files.

    :param thresholds: threshold values forwarded to process_symbols
    :param resample_period: pandas resample rule for the series
    :param symbols_list_name: name of the symbol universe (e.g. 'sp500')
    :param start_date: first date of data to collect
    :param target_shift: periods to shift the target, forwarded on
    :return: result of post_process on the merged dataframe
    """
    print("Getting data for: %s - %s from %s with thresholds %s" % (
        symbols_list_name, resample_period, start_date, list(thresholds)))
    # while not decided imputation/removals
    symbols = load_symbol_list(symbols_list_name)
    end_date = '2019-12-31'
    df_prices = get_prices(symbols_list_name=symbols_list_name,
                           start_date=start_date,
                           resample_period=resample_period)
    df_fund = get_fundamentals(symbols_list_name=symbols_list_name,
                               start_date=start_date, end_date=end_date,
                               resample_period=resample_period)
    sic_code, sic_industry = load_sic()
    alist_path = os.path.join(DATA_PATH, 'available_%s' % symbols_list_name)
    if os.path.isfile(alist_path):
        # Use a context manager — the original bare open() leaked the handle.
        with open(alist_path) as alist_file:
            available_symbols = [line.strip() for line in alist_file]
    else:
        df_fund = compss_wait_on(df_fund)
        available_symbols = set(
            [symbol for symbol, date in df_fund.index.values])
    unavailable = [s for s in symbols if s not in available_symbols]
    removed_symbols = ['ULTA']
    print("Not available symbols: %s\nRemoved symbols: %s" % (
        unavailable, removed_symbols))
    for s in removed_symbols:
        try:
            available_symbols.remove(s)
        # BUG FIX: available_symbols is a *list* when loaded from the cache
        # file (ValueError on remove) and a *set* when computed (KeyError);
        # the original only caught KeyError, so a missing symbol crashed the
        # list path instead of being reported.
        except (KeyError, ValueError):
            print("Couldn't remove symbol %s" % s)
    with open(os.path.join(DATA_PATH, 'available_%s' % symbols_list_name),
              'w') as f:
        f.write('\n'.join(available_symbols))
    df = process_symbols(available_symbols, df_fund, df_prices, sic_code,
                         sic_industry, thresholds, target_shift)
    normal_name, z_name = get_datasets_name(resample_period,
                                            symbols_list_name, thresholds,
                                            target_shift)
    normal_file = os.path.join(DATA_PATH, normal_name)
    z_file = os.path.join(DATA_PATH, z_name)
    res = post_process(df, (normal_file, z_file))
    return res
def collect(self) -> pd.DataFrame:
    """Fetch the price frame of every configured symbol and stack them.

    :return: single DataFrame concatenating all per-symbol price frames
    """
    frames = [self._get_symbol_prices(symbol=sym)
              for sym in load_symbol_list(self.symbols_list_name)]
    return pd.concat(frames)
def _add_periods_info(self, series_financials_dict: dict,
                      save: bool = True):
    """Annotate each (symbol, period) entry with reporting-period dates.

    For every symbol, downloads its reporting-period metadata and merges
    into ``series_financials_dict[symbol][period_key]`` the fiscal year /
    quarter, the period's start/end dates, and the start dates of the
    neighbouring reporting periods.

    :param series_financials_dict: mapping symbol -> {period_key -> dict},
        updated in place
    :param save: accepted but not used by this method — TODO confirm intent
    """
    # ============================================================================================
    # Compute some target average (or not) price
    # Parameter dependent variables
    for symbol in load_symbol_list(self.symbols_list_name):
        url = self.report_periods_url.substitute(symbol=symbol)
        data_json = call_and_cache(url, cache=self.cache)
        reporting_periods = data_json['data']
        last_idx = len(reporting_periods) - 1
        for i, period_info in enumerate(reporting_periods):
            start_date = period_info['start_date']
            end_date = period_info['end_date']
            quarter = period_info['fiscal_period']
            year = period_info['fiscal_year']
            # The list appears to be ordered newest-first: the next period's
            # start is the previous element; for the newest entry fall back
            # to end_date + 3 months.
            if i > 0:
                next_date = reporting_periods[i - 1]['start_date']
            else:
                next_date = (datetime.strptime(end_date, DATE_FORMAT) +
                             relativedelta(months=3)).strftime(DATE_FORMAT)
            # Previous period's start: the following element, or
            # start_date - 3 months for the oldest entry.
            if i < last_idx:
                prev_date = reporting_periods[i + 1]['start_date']
            else:
                prev_date = (datetime.strptime(start_date, DATE_FORMAT) -
                             relativedelta(months=3)).strftime(DATE_FORMAT)
            period_dict = {
                Tags.symbol: symbol,
                Tags.quarter: quarter,
                Tags.year: year,
                Tags.current_date: end_date,
                Tags.prev_date: prev_date,
                Tags.next_date: next_date,
                Tags.rep_period: '{}:{}'.format(start_date, end_date),
                Tags.next_rep_period: '{}:{}'.format(end_date, next_date),
            }
            period_key = '{}{}'.format(year, quarter)
            try:
                series_financials_dict[symbol][period_key].update(
                    period_dict)
            except KeyError:
                print("No fundamental info for symbol %s in period %s" % (
                    symbol, period_key))
def _collect_fundamentals(self, save=True):
    """Download fundamentals for every symbol / year / quarter / statement.

    :param save: if True, pickle the resulting dict via save_obj
    :return: mapping symbol -> {period_key -> {tag -> value}} where
        period_key is '<year><quarter>'
    """
    # Parameter dependent variables
    year_range = list(range(self.start_year, self.end_year))
    symbols = load_symbol_list(self.symbols_list_name)
    series_financials_dict = {}
    for symbol in symbols:
        period_dict = {}
        for year in year_range:
            for quarter in self.quarters_names:
                symbol_dict = {Tags.symbol: symbol,
                               Tags.year: year,
                               Tags.quarter: quarter}
                for statement in self.statements:
                    url = self.fundamentals_url \
                        .substitute(symbol=symbol, statement=statement,
                                    year=year, period=quarter)
                    data_json, retries = {}, 3
                    while 'data' not in data_json and retries > 0:
                        data_json = call_and_cache(url, cache=self.cache)
                        # BUG FIX: the original never decremented retries,
                        # so this loop could spin forever and the failure
                        # message below was unreachable.
                        retries -= 1
                    if 'data' in data_json:
                        symbol_dict.update(
                            {element['tag']: element['value']
                             for element in data_json['data']})
                    else:
                        print(
                            "Couldn't get data after 3 retries for url: %s"
                            % url)
                period_dict['{}{}'.format(year, quarter)] = symbol_dict
        series_financials_dict[symbol] = period_dict
    if save:
        save_obj(series_financials_dict, '%s/obj/%s_%s-%s_financials' % (
            DATA_PATH, self.symbols_list_name, self.start_year,
            self.end_year))
    return series_financials_dict
def main():
    # Download historical Yahoo price data for each symbol in the given
    # symbol file into <destination-dir>/<symbol>.csv.
    # Uses the legacy matplotlib.finance / urllib2 APIs (Python 2 era).
    if len(sys.argv) != 3:
        print('Usage {}: <symbol-file> <destination-dir>'.format(sys.argv[0]))
        exit(1)
    symbols = utils.load_symbol_list(sys.argv[1])
    base_dir = sys.argv[2]
    for symbol in symbols:
        # Despite the name, dest_dir is the per-symbol CSV file path used
        # as the download cache target.
        dest_dir = os.path.join(base_dir, symbol + '.csv')
        try:
            finance.fetch_historical_yahoo(symbol, (2000, 1, 1),
                                           (2014, 9, 21),
                                           cachename=dest_dir)
        except urllib2.URLError:
            print('Download failed for symbol: {}'.format(symbol))
            # NOTE(review): placed inside the except as a back-off after a
            # failed download — confirm it was not meant as a per-symbol
            # rate limit at loop level.
            time.sleep(30)
def _collect_attr_names(self, save=True):
    """Collect the set of attribute tags present in each statement type.

    Queries every symbol / year / quarter / statement combination and
    accumulates the tags seen per statement.

    :param save: if True, pickle the resulting mapping via save_obj
    :return: defaultdict(set) mapping statement name -> set of tag names
    """
    # Parameter dependent variables
    symbols = load_symbol_list(self.symbols_list_name)
    attr_names = defaultdict(set)
    for symbol in symbols:
        for year in self.year_range:
            for quarter_name in self.quarters_names:
                for statement in self.statements:
                    url = self.fundamentals_url.substitute(
                        symbol=symbol, statement=statement, year=year,
                        period=quarter_name)
                    data_json, retries = {}, 3
                    while 'data' not in data_json and retries > 0:
                        data_json = call_and_cache(url, cache=self.cache)
                        # BUG FIX: the original never decremented retries,
                        # so this loop could spin forever.
                        retries -= 1
                    if 'data' in data_json:
                        attr_names[statement].update(
                            element['tag'] for element in data_json['data'])
                    else:
                        # BUG FIX: the original indexed data_json['data']
                        # unconditionally, raising KeyError once retries
                        # were exhausted instead of reporting the failure.
                        print(
                            "Couldn't get data after 3 retries for url: %s"
                            % url)
    if save:
        save_obj(attr_names, '%s/%s_%s-%s_attr_names' % (
            DATA_PATH, self.symbols_list_name, self.start_year,
            self.end_year))
    return attr_names
dfs.append( self._get_symbol_prices(symbol=symbol)) return pd.concat(dfs) if __name__ == "__main__": symbols_list_name = 'sp500' start_date = '2006-01-01' # df = PriceExtractor(symbols_list_name=symbols_list_name, # start_date=start_date).collect() import fix_yahoo_finance as yf symbols = load_symbol_list(symbols_list_name) end_date = '2018-12-31' dfs = [] for s in symbols: try: data = yf.download(s, start_date, end_date) df = (data .assign(symbol=s)[['Adj Close', 'symbol']] .rename(index=str, columns={'Adj Close': 'price'})) dfs.append(df) except ValueError as e: print(e) print("Exception downloading: %s" % s) pzs = (pd.concat(dfs)