def download_filings(feedpath, args=None):
    """Go through all entries in the given EDGAR RSS feed and download any missing or new filings."""
    logger.info("Processing RSS feed %s", feedpath)
    dir = filings_dir(feedpath)
    os.makedirs(dir, exist_ok=True)

    # Collect the enclosure URLs of all filings that pass the filters and are not already on disk
    filing_urls = []
    for filing in feed_tools.read_feed(feedpath):
        if args:
            if args.company_re and not bool(args.company_re.match(filing['companyName'])):
                continue
            if args.cik and args.cik != filing['cikNumber']:
                continue
            if args.sic and args.sic != filing['assignedSic']:
                continue
            if args.form_type and args.form_type != filing['formType']:
                continue
        if 'enclosureUrl' in filing and not exists_filing(dir, filing['enclosureUrl'], filing['enclosureLength']):
            filing_urls.append(filing['enclosureUrl'])

    logger.info("Start downloading %d new filings", len(filing_urls))
    # The signature allows args=None, so guard the dereferences below (fallback values are assumed defaults)
    max_threads = args.max_threads if args else 1
    max_retries = args.max_retries if args else 3
    # Download filings concurrently on a thread pool
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
        futures = [executor.submit(download_filing, dir, url, max_retries) for url in filing_urls]
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as e:
                logger.error("Download failed: %s", e)
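For reference, a minimal driver for download_filings might look like the sketch below. The flag names are assumptions inferred from the attributes the function reads (company_re, cik, sic, form_type, max_threads, max_retries), not the project's actual CLI.

import argparse
import re

# Hypothetical command-line driver; flag names and defaults are assumptions.
parser = argparse.ArgumentParser(description='Download EDGAR filings listed in a monthly RSS feed')
parser.add_argument('feed', help='path to a monthly EDGAR RSS feed, e.g. xbrlrss-2016-01.xml')
parser.add_argument('--company', dest='company_re', type=re.compile, default=None,
                    help='regular expression matched against the company name')
parser.add_argument('--cik', type=int, default=None, help='filter by CIK number')
parser.add_argument('--sic', type=int, default=None, help='filter by assigned SIC code')
parser.add_argument('--form-type', dest='form_type', default=None, help='e.g. 10-K or 10-Q')
parser.add_argument('--max-threads', type=int, default=8)
parser.add_argument('--max-retries', type=int, default=3)

if __name__ == '__main__':
    args = parser.parse_args()
    download_filings(args.feed, args)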
def build_secdb(feeds):
    # Set up the python logging framework
    setup_logging(args.log_file)

    tickers = load_ticker_symbols()

    # Set up the DB connection
    global db_connect
    db_connect = setup_db_connect(args.db_driver, args.db_name)

    # Create all required DB tables and indices
    if args.create_tables:
        create_db_tables()
        create_db_indices()
        insert_ticker_symbols(tickers)

    # Process all filings in the given RSS feeds one month after another
    for filepath in feeds:
        # Load EDGAR filing metadata from the RSS feed, keeping only 10-K/10-Q filings from companies with an assigned ticker symbol
        filings = {}
        for filing in feed_tools.read_feed(filepath):
            if args.cik is None or args.cik == filing['cikNumber']:
                if filing['formType'] in ('10-K', '10-K/A', '10-Q', '10-Q/A') and filing['cikNumber'] in tickers:
                    filing['ticker'] = tickers[filing['cikNumber']]
                    filings.setdefault(filing['cikNumber'], []).append(filing)

        # Process the selected XBRL filings
        process_filings(filings)
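load_ticker_symbols is not shown here; build_secdb only assumes it returns a CIK-to-ticker mapping. One plausible sketch fetches the SEC's public company_tickers.json — an assumption, since the project may instead ship its own ticker file:

import json
import urllib.request

def load_ticker_symbols():
    """Return a dict mapping CIK numbers (int) to ticker symbols (str).

    Sketch only: uses the SEC's public ticker list; the real project may
    read a bundled data file instead.
    """
    url = 'https://www.sec.gov/files/company_tickers.json'
    # The SEC asks automated clients to send a descriptive User-Agent
    req = urllib.request.Request(url, headers={'User-Agent': 'secdb-example contact@example.com'})
    with urllib.request.urlopen(req) as response:
        data = json.load(response)
    # The JSON is an object like {"0": {"cik_str": 320193, "ticker": "AAPL", "title": "Apple Inc."}, ...}
    return {entry['cik_str']: entry['ticker'] for entry in data.values()}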
def generate_project(feedpath):
    filings = feed_tools.read_feed(feedpath)

    # Group filings by company name for the second folder tree
    filings_by_company = {}
    for filing in filings:
        filings_by_company.setdefault(filing['companyName'], []).append(filing)

    # Derive the YYYY-MM month from the feed filename, e.g. xbrlrss-2016-01.xml
    month = re.fullmatch(r'.*xbrlrss-(\d{4}-\d{2})\.xml', os.path.basename(feedpath)).group(1)
    dir = os.path.join(feed_tools.filings_dir, month)
    file = os.path.join(dir, '%s.spp' % month)
    print('Generating project file', file)

    with open(file, 'w') as f:
        f.write("""\
<?xml version="1.0" encoding="UTF-8"?>
<Project>
""")
        f.write("""\
<Folder FolderName="Filings by name" ExtStr="xml">
""")
        for filing in filings:
            if filing['instanceUrl']:
                f.write("""\
<File FilePath="%s" HomeFolder="Yes"/>
""" % filing['instanceUrl'][len('filings/YYYY-MM/'):].replace('%7Czip/', '|zip\\'))
        f.write("""\
</Folder>
""")
        f.write("""\
<Folder FolderName="Filings by company">
""")
        for company in sorted(filings_by_company.keys()):
            # Escape XML special characters in the company name
            f.write("""\
<Folder FolderName="%s" ExtStr="xml">
""" % company.replace('&', '&amp;').replace('<', '&lt;'))
            for filing in filings_by_company[company]:
                if filing['instanceUrl']:
                    f.write("""\
<File FilePath="%s" HomeFolder="Yes"/>
""" % filing['instanceUrl'][len('filings/YYYY-MM/'):].replace('%7Czip/', '|zip\\'))
            f.write("""\
</Folder>
""")
        f.write("""\
</Folder>
""")
        f.write("""\
</Project>
""")
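For illustration, a feed containing a single filing by a hypothetical "Acme Corp &amp; Sons" would produce a project file shaped roughly like this (the file paths are invented; note the escaped ampersand in the folder name):

<?xml version="1.0" encoding="UTF-8"?>
<Project>
<Folder FolderName="Filings by name" ExtStr="xml">
<File FilePath="0000123456-16-000001|zip\acme-20160331.xml" HomeFolder="Yes"/>
</Folder>
<Folder FolderName="Filings by company">
<Folder FolderName="Acme Corp &amp; Sons" ExtStr="xml">
<File FilePath="0000123456-16-000001|zip\acme-20160331.xml" HomeFolder="Yes"/>
</Folder>
</Folder>
</Project>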
def main():
    # Parse script arguments
    args = parse_args()
    # Set up the python logging framework
    setup_logging(args)

    # Validate all filings in the given RSS feeds one month after another
    for filepath in collect_feeds(args):
        # Load EDGAR filing metadata from the RSS feed, keeping only filings that match the given form type, SIC and CIK filters
        filings = []
        for filing in feed_tools.read_feed(filepath):
            # Google to Alphabet reorganization: map the old Google CIK to Alphabet's CIK
            if filing['cikNumber'] == 1288776:
                filing['cikNumber'] = 1652044
            if args.form_type is None or args.form_type == filing['formType']:
                if args.sic is None or args.sic == filing['assignedSic']:
                    if args.cik is None or args.cik == filing['cikNumber']:
                        filings.append(filing)

        # Validate the selected XBRL filings (capped at the first 100 per feed)
        validate_filings(filings[:100], args.max_threads)
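collect_feeds is not shown here. A plausible sketch, assuming the monthly feeds follow the xbrlrss-YYYY-MM.xml naming that generate_project matches and live in a hypothetical feed_tools.feed_dir:

import glob
import os

def collect_feeds(args):
    """Yield paths of the monthly EDGAR RSS feeds to process, oldest first.

    Sketch only: feed_tools.feed_dir and the glob pattern are assumptions
    based on the filename regex used in generate_project.
    """
    pattern = os.path.join(feed_tools.feed_dir, 'xbrlrss-*.xml')
    yield from sorted(glob.glob(pattern))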