# number of new records collected
new_records_processed = 0
# Append one JSON document per line to the output file; each line of the
# preprint file is itself one JSON API response page.
with open(OUTPUT_FILE, 'a') as o, open(OSF_PREPRINT_FILE, 'r') as f:
    for line in f:
        records = json.loads(line)
        for record in records['data']:
            # Skip ids already handled.
            # NOTE(review): records_processed is never updated inside this
            # loop, so duplicate ids *within* the input file are
            # re-downloaded — confirm whether that is intended.
            if record['id'] not in records_processed:
                contributor_url = record['relationships']['contributors']['links']['related']['href']
                contributor_data = {'id': record['id'], 'data': []}
                # Follow the paginated contributor listing until the last
                # page (links.next is null) or a download failure.
                while True:
                    status, data = util.download_from_url(contributor_url)
                    if status != 'SUCCESS':
                        # Best-effort: abandon the remaining pages for this
                        # record but keep whatever was fetched so far.
                        break
                    contributor_data['data'].append(data)
                    if data['links']['next'] is None:
                        break
                    # get link to next set of records to download
                    contributor_url = data['links']['next']
                json.dump(contributor_data, o)
                o.write('\n')
                o.flush()
                # BUG FIX: the counter above was declared but never
                # incremented, so it always reported 0.
                new_records_processed += 1
def download_cash_flow(symbol, force=True):
    """Download the cash-flow workbook for *symbol* into the data directory.

    Returns whatever download_from_url returns; force=True re-downloads
    an existing file.
    """
    target = os.path.join(data_dir, f'{symbol}.cash_flow.xlsx')
    return download_from_url(cash_flow_url.format(symbol), target, overwrite=force)
def download_income_stmt(symbol, force=True):
    """Download the income-statement workbook for *symbol* into the data directory.

    Returns whatever download_from_url returns; force=True re-downloads
    an existing file.
    """
    target = os.path.join(data_dir, f'{symbol}.income_stmt.xlsx')
    return download_from_url(income_statement_url.format(symbol), target, overwrite=force)
def download_balance_sheet(symbol, force=True):
    """Download the balance-sheet workbook for *symbol* into the data directory.

    Returns whatever download_from_url returns; force=True re-downloads
    an existing file.
    """
    target = os.path.join(data_dir, f'{symbol}.balance_sheet.xlsx')
    return download_from_url(balance_sheet_url.format(symbol), target, overwrite=force)
if record['id'] not in records_processed:
    # Collect, per author, the paginated list of institution pages keyed
    # by author id under 'data'.
    institution_data = {'id': record['id'], 'data': {}}
    for author_list in record['data']:
        for author in author_list['data']:
            # The embeds path is absent for some contributors; any missing
            # key (KeyError) or null intermediate (TypeError) means there
            # is no institution link for this author.
            # BUG FIX: was a bare `except:` that swallowed everything,
            # including KeyboardInterrupt/SystemExit.
            try:
                institution_url = author['embeds']['users'][
                    'data']['relationships']['institutions'][
                    'links']['related']['href']
            except (KeyError, TypeError):
                institution_url = None
            if institution_url is not None:
                institution_list = []
                # Follow pagination until the last page (links.next is
                # null) or a download failure; best-effort on failure.
                while True:
                    status, data = util.download_from_url(
                        institution_url)
                    if status != 'SUCCESS':
                        break
                    institution_list.append(data)
                    if data['links']['next'] is None:
                        break
                    # get link to next set of records to download
                    institution_url = data['links']['next']
                # BUG FIX: results were stored on the top-level dict,
                # leaving the initialized 'data' mapping permanently empty.
                institution_data['data'][author['id']] = institution_list
    json.dump(institution_data, o)
    # BUG FIX: keep one JSON record per line, matching the contributor
    # writer elsewhere in this file — without the newline, consecutive
    # records run together and the output is not parseable line-by-line.
    o.write('\n')