def do_download(path, ticker, form):
    # Download filings of the given form for a ticker (2015 onwards), read the
    # filing date out of each full submission, rename the details file via
    # url_rewrite/find_filename, then remove the raw filing directory.
    dl = Downloader(path)
    dl.get(form, ticker, after="2015-01-01")
    path = os.path.join(path, "sec-edgar-filings", ticker, form)
    if not os.path.isdir(path):
        return
    pattern = re.compile("([0-9]+)")
    for filingDir in os.listdir(path):
        fullSubmissionFname = os.path.join(path, filingDir, "full-submission.txt")
        htmlFname = os.path.join(path, filingDir, "filing-details.html")
        if not os.path.isfile(fullSubmissionFname):
            print("skipping ", fullSubmissionFname)
            continue
        found = False
        with open(fullSubmissionFname) as f:
            for line in f:
                if line.startswith("FILED AS OF DATE"):
                    date = re.search(pattern, line).group(0)
                    found = True
        if not found:
            print("skipping ", filingDir)
            continue
        url_rewrite(htmlFname, find_filename(path, ticker, form, date))
        shutil.rmtree(os.path.join(path, filingDir))

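# A minimal, hypothetical call of the helper above; it assumes url_rewrite and
# find_filename are defined elsewhere in the same module, and that os, re,
# shutil and sec_edgar_downloader's Downloader are imported. The path and
# ticker are placeholders, not values from the source.
do_download("./filings", "AAPL", "10-K")
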
def download_forms(company_index, year: str):
    """ Reads index file and downloads 10-Ks and 10-Qs """
    dl = Downloader(mypath + '/data/')
    for i in company_index.index:
        dl.get("10-K", company_index.TICKER[i], after_date=year + '0101', include_amends=False)
        dl.get("10-Q", company_index.TICKER[i], after_date=year + '0101', include_amends=False)

def download_latest_filing(file_type, ticker):
    dl = Downloader(os.getcwd())
    dl.get(file_type, ticker, amount=1)
    dl_path = os.getcwd() + '/sec-edgar-filings/{}/{}/'.format(ticker, file_type)
    inner_most_dir = [x[0] for x in os.walk(dl_path)][1]
    html_path = f'{inner_most_dir}/filing-details.html'
    txt_path = f'{inner_most_dir}/full-submission.txt'
    return (html_path, txt_path)

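# A short usage sketch for download_latest_filing above; the ticker is a
# placeholder. The function returns the paths to the HTML details file and the
# full text submission of the most recent filing it downloaded.
html_path, txt_path = download_latest_filing("10-K", "MSFT")
print(html_path)
print(txt_path)
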
class FilingsDownloader:
    def __init__(self, downloadPath):
        self.downloadPath = downloadPath
        self.downloader = Downloader(self.downloadPath)

    # Download 10 latest 10-K filings for the given company ticker
    def downloadFilings(self, ticker, filing_type="10-K", latest=10):
        self.downloader.get(filing_type, ticker, latest)

    def removefilings(self, path):
        shutil.rmtree(path, ignore_errors=True)

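# A minimal sketch of how the FilingsDownloader class above might be used; the
# download path and ticker are placeholders, not taken from the source.
downloader = FilingsDownloader("./edgar-downloads")
downloader.downloadFilings("AAPL", filing_type="10-K", latest=5)
# Clean up the downloaded files again (the path here is also a placeholder).
downloader.removefilings("./edgar-downloads")
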
def get_files(CIK, x):
    save_path = '/Users/yijingtan/Downloads/d'
    dl = Downloader(save_path)
    dl.get('13F-HR', CIK, include_amends=True, after_date=x)
    # CIK = CIK.lstrip("0")
    files = os.listdir('/Users/yijingtan/Downloads/d/sec_edgar_filings/' + CIK + '/' + '13F-HR')
    data = [parse(file, CIK) for file in sorted(files)]
    print(data)
    try:
        return pd.concat(data)
    except ValueError:
        print("All Values are None")
        return None

def get_files(filing, CIK, number):
    save_path = r"C:\Users\smore\Documents\13F"
    dl = Downloader(save_path)
    dl.get(filing, CIK, number)
    CIK = CIK.lstrip("0")
    print(CIK)
    files = os.listdir(f"C:/Users/smore/Documents/13F/sec_edgar_filings/{CIK}/{filing}")
    data = [parse(file, CIK) for file in sorted(files)]
    try:
        print(pd.concat(data))
        return pd.concat(data)
    except ValueError:
        print("All Values are None")
        return None

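# A hypothetical call of get_files above; it assumes a parse(file, CIK) helper
# exists in the same module, as the function requires. The CIK is Berkshire
# Hathaway's, used purely as an example.
holdings = get_files("13F-HR", "0001067983", 5)
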
def Bulk_extraction(ticker, filetype, date, location):
    '''
    ticker = company ticker or list of tickers
    filetype = type of financial doc (8-K, 10-K)
    date = download all filings of this type after this date, format: YYYY-MM-DD
    location = local directory to store files, as a string
    '''
    dl = Downloader(str(location))
    for company in ticker:
        dl.get(str(filetype), str(company), after=str(date), download_details=True)
    return "Complete"


# Have to double check to see if all funds are updated for 2019-2020
# Bulk_extraction(holdings, '10-K', '2019-01-01', dl)

class FinancialStatementDatasetBuilder(tfds.core.GeneratorBasedBuilder):

    def __init__(self, args, log):
        self.args = args
        self.log = log
        self.VERSION = tfds.core.Version(self.args.dataset_version)
        self.MANUAL_DOWNLOAD_INSTRUCTIONS = "Dataset already downloaded manually"
        super(tfds.core.GeneratorBasedBuilder, self).__init__()
        self.dl = Downloader(self.args.download_path)
        self.parser = FinancialReportParser()
        self.text_processor = get_text_processor(args.model)(args)

    def _info(self):
        return tfds.core.DatasetInfo(
            builder=self,
            description=("Financial statements data."),
            features=tfds.features.FeaturesDict({
                "documents": tfds.features.Tensor(dtype=tf.string,
                                                  shape=(self.args.number_of_periods, )),
                "label": tfds.features.Tensor(dtype=tf.int64, shape=(2, ))
            }),
            supervised_keys=("documents", "label"),
            homepage="https://xxx",
            citation=r"""@article{my-awesome-dataset-2020, author = {Hurubaru, Sebastian},"}""",
        )

    def _split_generators(self, dl_manager):
        # Specify the splits
        return [
            tfds.core.SplitGenerator(
                name=tfds.Split.TRAIN,
                gen_kwargs={"input_dir": os.path.join(self.args.input_dir, 'train')},
            ),
            tfds.core.SplitGenerator(
                name=tfds.Split.VALIDATION,
                gen_kwargs={"input_dir": os.path.join(self.args.input_dir, 'dev')},
            ),
            tfds.core.SplitGenerator(
                name=tfds.Split.TEST,
                gen_kwargs={"input_dir": os.path.join(self.args.input_dir, 'test')},
            )
        ]

    def _generate_examples(self, input_dir):
        # Get the content of the dataset file
        dataset = tf.data.experimental.make_csv_dataset(
            os.path.join(input_dir, self.args.company_files),
            batch_size=1,
            column_defaults=[tf.string, tf.string, tf.string, tf.int32],
            label_name='label',
            na_value="?",
            num_epochs=1,
            ignore_errors=True)
        for company_info, label in dataset:
            ciks = company_info['cik'].numpy()[0].decode('utf-8').split(';')
            ciks.sort(reverse=True, key=lambda cik: int(cik))
            end_date = company_info['end_date'].numpy()[0].decode('utf-8')
            try:
                documents = []
                # For multiple CIKs take in the descending order the last args.number_of_periods 10-K reports
                for cik in ciks:
                    cik_folder = os.path.join(
                        os.path.expanduser(self.args.download_path),
                        'sec_edgar_filings',
                        cik.strip().lstrip("0"), '10-K')
                    # Download if and only if the directories do not exist
                    if (os.path.exists(cik_folder) is False):
                        self.dl.get("10-K", cik,
                                    before_date=end_date,
                                    num_filings_to_download=self.args.number_of_periods)
                    for r, d, f in os.walk(cik_folder):
                        for file in f:
                            if '.txt' in file:
                                documents.append(
                                    tf.convert_to_tensor(
                                        self.parser.parse_10K_txt_file(os.path.join(r, file)),
                                        dtype=tf.string))
                if len(documents) < self.args.number_of_periods:
                    raise Exception(
                        f'Could not retrieve {self.args.number_of_periods} 10-K records for {cik}')
                yield cik, {
                    'documents': tf.stack(documents)[:self.args.number_of_periods],
                    'label': [1, 0] if label.numpy()[0] == 0 else [0, 1]
                }
            except Exception as e:
                self.log.error(f'Exception occurred for cik {cik}: {e}')

    def _process_text_map_fn(self, text, label):
        processed_text, label = tf.py_function(self._process_text,
                                               inp=[text, label],
                                               Tout=(tf.float32, tf.int64))
        return processed_text, label

    def _process_text(self, text, label):
        # To allow debugging in the combined static eager mode
        # pydevd.settrace(suspend=True)
        # Process the text
        processed_text = self.text_processor.process_text(text)
        return (processed_text, label)

import pandas as pd
from sec_edgar_downloader import Downloader

x = pd.read_csv(
    r"/home/mohit/Dropbox/ra_tasks_ms/bailout_firms/Regular Scheduled SEC scraping/companies.csv"
)
x = x['Column_Name']

dl = Downloader(
    r"/home/mohit/Dropbox/ra_tasks_ms/bailout_firms/Regular Scheduled SEC scraping"
)

for i in x:
    dl.get("8-K", i, after_date="20200328")

from sec_edgar_downloader import Downloader

dl = Downloader('C:\\Users\\willi\\Documents\\Company_filings\\BRK-A\\13F-HR')

# Download using the CIK of Berkshire Hathaway (BRK-A)
dl.get("13F-HR", "0001067983")

total = total + 1
data_file.seek(0)
curr = 1
positionStr = 'Current company: ' + str(curr).rjust(5) + ' Total company: ' + str(total).rjust(6)
print(positionStr)
time1 = time.time()
for ticker in csv_reader:
    if ticker[0] == "Ticker":
        continue
    print(ticker[0])
    try:
        if "1" in ft:
            dl.get(filing_type[0], ticker[0], after_date=date1, before_date=date2)
    except:
        try:
            print("Internet error occurred- retrying")
            dl.get(filing_type[0], ticker[0], after_date=date1, before_date=date2)
        except:
            print("Couldn't resolve, moving to next ticker")
    try:
        if "2" in ft:
            dl.get(filing_type[1], ticker[0],

import numpy as np
import pandas as pd
from sec_edgar_downloader import Downloader

df = pd.read_csv('data/sp500_list.csv')
drop_column = df.columns[0]
df.drop(columns=drop_column, inplace=True)
df.CIK = df.CIK.astype(str)
df['CIK'] = df['CIK'].str.zfill(10)

dl = Downloader('test_data/')
for i in df.index:
    print(f"{df.index[i]}: Pulling 10-Ks and 10-Qs for {df.COMPANY[i]}")
    dl.get("10-K", df.CIK[i], after_date="20190101", include_amends=False)
    dl.get("10-Q", df.CIK[i], after_date="20190101", include_amends=False)
    print(f"{df.index[i]}: {df.COMPANY[i]} done.")

from sec_edgar_downloader import Downloader

path = r'E:\stockdata3\Filings'
dl = Downloader(path)
aapl = dl.get('10-K', 'aapl', 12)

from sec_edgar_downloader import Downloader

# Initialize a downloader instance.
# If no argument is passed to the constructor, the package
# will attempt to locate the user's downloads folder.
dl = Downloader("/j/tmp32/edgar")

# Get the latest 10-K filing for Ubiquiti
dl.get("10-K", "UI", 1)

# Get the latest 10-Q filing for Ubiquiti
dl.get("10-Q", "UI", 1)

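# As in the Berkshire Hathaway snippet above, a CIK can be passed in place of a
# ticker; a hedged sketch reusing the same Downloader instance:
dl.get("13F-HR", "0001067983", 1)
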
from sec_edgar_downloader import Downloader

f = open('companies.txt', 'r')
stickers = f.read().splitlines()
f.close()

dl = Downloader(".")
after_date = "20090101"
before_date = "20200427"

for sticker in stickers:
    dl.get("10-K", sticker, after_date="20100101", before_date="20200325")

import pandas as pd
from sec_edgar_downloader import Downloader

cik = pd.read_csv("FINAL_COMPANY_LIST_SEC_INFO.csv")[['cik']]
cik = cik.values.tolist()
print(cik)

file_location = "/pylon5/tr5pi7p/suli2020/uspto/rebranding/10K"
cik = cik[:10]
dl = Downloader(file_location)

num = 0
for lst in cik:
    for c in lst:
        print('Started: ' + c)
        dl.get("10-K", c, 30)
        num += 1
        print('Downloaded: ' + c + ', ' + str(num) + ' in total')

def getAll10k(company):
    dl = Downloader()
    # Get the 10 most recent 10-K filings for the given company
    dl.get("10-K", company, amount=10)

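# A trivial usage sketch for getAll10k above; the ticker is a placeholder.
getAll10k("MSFT")
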
def download_and_parse(actual_stock, ciks, dict):
    """
    This function is the meat and potatoes of downloading the SEC 10-K filings.
    It uses the sec_edgar_downloader package to download the 10-K filing. Then it
    uses code from https://gist.github.com/anshoomehra/ead8925ea291e233a5aa2dcaa2dc61b2
    to parse the 10-K filing for Item 1A. The code separates Item 1A into sentences
    and outputs it to a dictionary associated with the CIK value.
    :param actual_stock: the stock ticker
    :param ciks: CIK - stock ticker dictionary/crosswalk
    :param dict: a dictionary to store the 10-K Item 1A sentences
    :return: nothing, but it constantly updates/adds to the dictionary
    """
    if actual_stock[0] in ['BF.B', 'BF-B', 'bf-b', 'bf.b', 'HES', 'hes', 'hpe', 'HPE']:
        print("This stock has no CIK... issue there so I am skipping")
        return
    cik = convert_ticker_to_cik(actual_stock, ciks)
    cik = cik.zfill(10)
    dl = Downloader()
    # stock_ticker = "0001067983"
    dl.get("10-K", cik, after="2015-01-01", download_details=False)
    count = 0
    for root, dirs, files in os.walk("./sec-edgar-filings/{}/10-K/".format(cik)):
        # search through each year's 10-K filing
        for file in files:
            # find the txt document of the 10-K filing
            if file == 'full-submission.txt':
                try:
                    year = re.findall(r'\-[0-9][0-9]\-', root)
                    year = year[len(year) - 1][1:-1]
                    # certain stocks have issues for certain years. I will include code to exclude them here
                    # if year == 21 and stock_ticker in ("'APA'", "'ADM'"):
                    #     print("Skipping year {} for ticker {} due to issues...".format(year, stock_ticker))
                    # read the file
                    filing_text = read_file(root + '/' + file)
                    # this code comes from https://gist.github.com/anshoomehra/ead8925ea291e233a5aa2dcaa2dc61b2
                    doc_start_pattern = re.compile(r'<DOCUMENT>')
                    doc_end_pattern = re.compile(r'</DOCUMENT>')
                    # Regex to find the <TYPE> tag followed by any characters, terminating at a new line
                    type_pattern = re.compile(r'<TYPE>[^\n]+')
                    # Create 3 lists with the span indices for each regex
                    ### There are many <DOCUMENT> tags in this text file, each a specific exhibit like 10-K, EX-10.17 etc.
                    ### The first filter gives us each document tag start's <end> and document tag end's <start>;
                    ### we use these later to grab the content in between the tags
                    doc_start_is = [x.end() for x in doc_start_pattern.finditer(filing_text)]
                    doc_end_is = [x.start() for x in doc_end_pattern.finditer(filing_text)]
                    ### The type filter looks for <TYPE> followed by any characters up to the next new line \n,
                    ### which gives us <TYPE> followed by a section name like '10-K'.
                    ### The line below then keeps only the content after <TYPE>, i.e. the section names
                    doc_types = [x[len('<TYPE>'):] for x in type_pattern.findall(filing_text)]
                    document = {}
                    # Go through each section type and save only the 10-K section in the dictionary
                    for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
                        if doc_type == '10-K':
                            document[doc_type] = filing_text[doc_start:doc_end]
                    regex = re.compile(
                        r'(>(Item|ITEM)(\s| | | )(1A|1B)\.{0,1})|(ITEM\s(1A|1B))|(<B>Item</B><B></B><B> 1A</B>)|( Item 1B.)|(Item<font style="font-family:Times New Roman, Times, serif;font-size:10pt;"> 1B)|(Item<font style="font-family: Times New Roman, Times, serif; font-size: 10pt;"> 1B)'
                    )
                    matches = regex.finditer(document['10-K'])
                    test_df = pd.DataFrame([(x.group(), x.span()[0], x.span()[1]) for x in matches])
                    if len(test_df) == 0:
                        print("error... didn't pick up anything")
                        break
                    test_df.columns = ['item', 'start', 'end']
                    test_df['item'] = test_df.item.str.lower()
                    test_df.replace('<font style="font-family:times new roman, times, serif;font-size:10pt;">', ' ', regex=True, inplace=True)
                    test_df.replace('<font style="font-family: times new roman, times, serif; font-size: 10pt;">', ' ', regex=True, inplace=True)
                    test_df.replace(' ', ' ', regex=True, inplace=True)
                    test_df.replace(' ', ' ', regex=True, inplace=True)
                    test_df.replace(' ', ' ', regex=True, inplace=True)
                    test_df.replace(' ', '', regex=True, inplace=True)
                    test_df.replace('\.', '', regex=True, inplace=True)
                    test_df.replace('>', '', regex=True, inplace=True)
                    test_df.replace('<b', '', regex=True, inplace=True)
                    test_df.replace('</b', '', regex=True, inplace=True)
                    pos_dat = test_df.sort_values('start', ascending=True).drop_duplicates(subset=['item'], keep='last')
                    pos_dat.set_index('item', inplace=True)
                    # Check conditionals here
                    if 'item1a' in pos_dat.index and 'item1b' in pos_dat.index:
                        item_1a_raw = document['10-K'][pos_dat['start'].loc['item1a']:pos_dat['start'].loc['item1b']]
                        item_1a_content = BeautifulSoup(item_1a_raw, 'lxml')
                        test_df["text"] = item_1a_content.get_text()
                        test_df.replace('([0-9]|[0-9][0-9])(\s{0,3})Table of Contents', ' ', regex=True, inplace=True)
                        test_df.replace('Table of Contents', ' ', regex=True, inplace=True)
                        test_df.replace('\s\s', ' ', regex=True, inplace=True)
                        test_df.replace('\\u200b', ' ', regex=True, inplace=True)
                        test_df.replace('\\n[0-9]', ' ', regex=True, inplace=True)
                        test_df.replace('[0-9]\\n', ' ', regex=True, inplace=True)
                        test_df.replace('\\xa0', ' ', regex=True, inplace=True)
                        test_df.replace('\\x92', ' ', regex=True, inplace=True)
                        test_df.replace('\\x93', ' ', regex=True, inplace=True)
                        test_df.replace('\\x94', ' ', regex=True, inplace=True)
                        test_df.replace('\\x95', ' ', regex=True, inplace=True)
                        test_df.replace('\\n', ' ', regex=True, inplace=True)
                        test_df.replace('\n', ' ', regex=False, inplace=True)
                        # output the text to the dict
                        sentences = nltk.sent_tokenize(str(test_df['text'][0]))
                        if count == 0:
                            output_frame = pd.DataFrame([[year, sentences]], columns=["year", "text"])
                        else:
                            output_frame = output_frame.append(
                                pd.DataFrame([[year, sentences]], columns=["year", "text"]),
                                ignore_index=True)
                        dict[cik] = output_frame
                        print("finished processing ticker {} ({}) and added to dictionary for year {}".format(cik, actual_stock[0], year))
                        print(75 * '')
                        count += 1
                    else:
                        regex = re.compile(
                            r'(>(Item|ITEM)(\s| | | )(1A|2)\.{0,1})|(ITEM\s(1A|2))|(<B>Item</B><B></B><B> 1A</B>)|( Item 2.)'
                        )
                        matches = regex.finditer(document['10-K'])
                        test_df = pd.DataFrame([(x.group(), x.span()[0], x.span()[1]) for x in matches])
                        if len(test_df) == 0:
                            print("error... didn't pick up anything")
                            break
                        test_df.columns = ['item', 'start', 'end']
                        test_df['item'] = test_df.item.str.lower()
                        test_df.replace(' ', ' ', regex=True, inplace=True)
                        test_df.replace(' ', ' ', regex=True, inplace=True)
                        test_df.replace(' ', ' ', regex=True, inplace=True)
                        test_df.replace(' ', '', regex=True, inplace=True)
                        test_df.replace('\.', '', regex=True, inplace=True)
                        test_df.replace('>', '', regex=True, inplace=True)
                        test_df.replace('<b', '', regex=True, inplace=True)
                        test_df.replace('</b', '', regex=True, inplace=True)
                        pos_dat = test_df.sort_values('start', ascending=True).drop_duplicates(subset=['item'], keep='last')
                        pos_dat.set_index('item', inplace=True)
                        item_1a_raw = document['10-K'][pos_dat['start'].loc['item1a']:pos_dat['start'].loc['item2']]
                        item_1a_content = BeautifulSoup(item_1a_raw, 'lxml')
                        test_df["text"] = item_1a_content.get_text()
                        test_df.replace('([0-9]|[0-9][0-9])(\s{0,3})Table of Contents', ' ', regex=True, inplace=True)
                        test_df.replace('Table of Contents', ' ', regex=True, inplace=True)
                        test_df.replace('\s\s', ' ', regex=True, inplace=True)
                        test_df.replace('\\u200b', ' ', regex=True, inplace=True)
                        test_df.replace('\\n[0-9]', ' ', regex=True, inplace=True)
                        test_df.replace('[0-9]\\n', ' ', regex=True, inplace=True)
                        test_df.replace('\\xa0', ' ', regex=True, inplace=True)
                        test_df.replace('\\x92', ' ', regex=True, inplace=True)
                        test_df.replace('\\x93', ' ', regex=True, inplace=True)
                        test_df.replace('\\x94', ' ', regex=True, inplace=True)
                        test_df.replace('\\x95', ' ', regex=True, inplace=True)
                        test_df.replace('\\n', ' ', regex=True, inplace=True)
                        test_df.replace('\n', ' ', regex=False, inplace=True)
                        # output the text to the dict
                        sentences = nltk.sent_tokenize(str(test_df['text'][0]))
                        if count == 0:
                            output_frame = pd.DataFrame([[year, sentences]], columns=["year", "text"])
                        else:
                            output_frame = output_frame.append(
                                pd.DataFrame([[year, sentences]], columns=["year", "text"]),
                                ignore_index=True)
                        dict[cik] = output_frame
                        print("finished processing ticker {} ({}) and added to dictionary for year {}".format(cik, actual_stock[0], year))
                        print(75 * '')
                        count += 1
                except:
                    print("error occurred")

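# A hedged, hypothetical call of download_and_parse above. It assumes the
# convert_ticker_to_cik and read_file helpers referenced inside the function
# are defined elsewhere, and that cik_crosswalk is the ticker-to-CIK mapping
# the docstring describes; the ticker tuple and result dict are placeholders.
item_1a_sentences = {}
download_and_parse(('AAPL',), cik_crosswalk, item_1a_sentences)
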
# -*- coding: utf-8 -*-
"""
Created on Sun Mar  1 12:30:27 2020

@author: Stephen Sigrist
"""

import pandas as pd
from sec_edgar_downloader import Downloader

tickers_data = pd.read_csv("..."
                           "...TriggeredEvents\\input files\\equity tickers.csv")
tickers_data.head()
# del tickers_data

dl = Downloader("...TriggeredEvents\\downloaded\\SEC")

for i in range(len(tickers_data)):
    print("Begin Downloading " + tickers_data['ticker'][i])
    for filing_type in dl.supported_filings:
        try:
            dl.get(filing_type, tickers_data['ticker'][i], 10)
        except:
            print("An Error Occurred in Downloading Process")
    print("Finished Downloading " + tickers_data['ticker'][i])