def do_download(path, ticker, form):
    dl = Downloader(path)
    dl.get(form, ticker, after="2015-01-01")
    path = os.path.join(path, "sec-edgar-filings", ticker, form)
    if not os.path.isdir(path):
        return
    pattern = re.compile("([0-9]+)")
    for filingDir in os.listdir(path):
        fullSubmissionFname = os.path.join(path, filingDir, "full-submission.txt")
        htmlFname = os.path.join(path, filingDir, "filing-details.html")
        if not os.path.isfile(fullSubmissionFname):
            print("skipping ", fullSubmissionFname)
            continue
        found = False
        with open(fullSubmissionFname) as f:
            for line in f:
                if line.startswith("FILED AS OF DATE"):
                    date = re.search(pattern, line).group(0)
                    found = True
        if not found:
            print("skipping ", filingDir)
            continue
        url_rewrite(htmlFname, find_filename(path, ticker, form, date))
        shutil.rmtree(os.path.join(path, filingDir))
def download_forms(company_index, year: str):
    """Reads the index file and downloads 10-Ks and 10-Qs."""
    dl = Downloader(mypath + '/data/')
    for i in company_index.index:
        dl.get("10-K", company_index.TICKER[i], after_date=year + '0101', include_amends=False)
        dl.get("10-Q", company_index.TICKER[i], after_date=year + '0101', include_amends=False)
def download(tickers):
    path = get_filings_folder()
    dl = Downloader(path)
    n = len(tickers)
    for i in range(n):
        print_progress(i, n)
        if not os.path.exists('../Filings/sec_edgar_filings/' + tickers[i]):
            dl.get_10k_filings(tickers[i])
def download_latest_filing(file_type, ticker):
    dl = Downloader(os.getcwd())
    dl.get(file_type, ticker, amount=1)
    dl_path = os.getcwd() + '/sec-edgar-filings/{}/{}/'.format(ticker, file_type)
    inner_most_dir = [x[0] for x in os.walk(dl_path)][1]
    html_path = f'{inner_most_dir}/filing-details.html'
    txt_path = f'{inner_most_dir}/full-submission.txt'
    return (html_path, txt_path)
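# A minimal usage sketch for download_latest_filing above (not part of the original
# snippet). It assumes the current working directory is writable and that the package
# saves filings under sec-edgar-filings/<ticker>/<form>/ as the helper expects.
html_path, txt_path = download_latest_filing("10-K", "AAPL")
print("Filing details (HTML):", html_path)
print("Full submission (txt):", txt_path)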
class FilingsDownloader:
    def __init__(self, downloadPath):
        self.downloadPath = downloadPath
        self.downloader = Downloader(self.downloadPath)

    # Download the 10 latest 10-K filings for the given company ticker
    def downloadFilings(self, ticker, filing_type="10-K", latest=10):
        self.downloader.get(filing_type, ticker, latest)

    def removefilings(self, path):
        shutil.rmtree(path, ignore_errors=True)
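# A minimal usage sketch for the FilingsDownloader wrapper above (not part of the
# original snippet). The download directory, ticker, and cleanup path are assumptions;
# the cleanup path follows the sec-edgar-filings/<ticker> layout used elsewhere in
# this section.
downloader = FilingsDownloader("./filings")
downloader.downloadFilings("AAPL", filing_type="10-K", latest=5)
# ... process the downloaded filings here ...
downloader.removefilings("./filings/sec-edgar-filings/AAPL")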
def get_files(CIK, x):
    save_path = '/Users/yijingtan/Downloads/d'
    dl = Downloader(save_path)
    dl.get('13F-HR', CIK, include_amends=True, after_date=x)
    # CIK = CIK.lstrip("0")
    files = os.listdir('/Users/yijingtan/Downloads/d/sec_edgar_filings/' + CIK + '/' + '13F-HR')
    data = [parse(file, CIK) for file in sorted(files)]
    print(data)
    try:
        return pd.concat(data)
    except ValueError:
        print("All Values are None")
        return None
def __init__(self, args, log):
    self.args = args
    self.log = log
    self.VERSION = tfds.core.Version(self.args.dataset_version)
    self.MANUAL_DOWNLOAD_INSTRUCTIONS = "Dataset already downloaded manually"
    super(tfds.core.GeneratorBasedBuilder, self).__init__()
    self.dl = Downloader(self.args.download_path)
    self.parser = FinancialReportParser()
    self.text_processor = get_text_processor(args.model)(args)
def pull_filings(corporations):
    print('Pulling filings from SEC...')
    dl = Downloader("./sec/filings")
    added = []
    for corp in corporations:
        been_added = set(added)
        has_pulled = False
        if corp['cik'] in been_added:
            has_pulled = True
        if corp['main'] != True and has_pulled == False:
            dl.get_all_available_filings(corp['cik'])
            print(f'Pulled: {corp["name"]}')
            added.append(corp['cik'])
def get_files(filing, CIK, number):
    save_path = r"C:\Users\smore\Documents\13F"
    dl = Downloader(save_path)
    dl.get(filing, CIK, number)
    CIK = CIK.lstrip("0")
    print(CIK)
    files = os.listdir(f"C:/Users/smore/Documents/13F/sec_edgar_filings/{CIK}/{filing}")
    data = [parse(file, CIK) for file in sorted(files)]
    try:
        print(pd.concat(data))
        return pd.concat(data)
    except ValueError:
        print("All Values are None")
        return None
def Bulk_extraction(ticker, filetype, date, location):
    '''
    ticker   = company ticker or list of tickers
    filetype = type of financial doc (8-K, 10-K)
    date     = all filings after this date, format: year-month-day
    location = local directory to store files, as a string
    '''
    dl = Downloader(str(location))
    for company in ticker:
        dl.get(str(filetype), str(company), after=str(date), download_details=True)
    return "Complete"


# Have to double check to see if all funds are updated for 2019-2020
# Bulk_extraction(holdings, '10-K', '2019-01-01', dl)
def test_constructor_relative_path():
    dl = Downloader("./Downloads")
    expected = Path.cwd().joinpath("Downloads")
    assert dl.download_folder == expected
def test_constructor_no_params():
    dl = Downloader()
    expected = Path.home().joinpath("Downloads")
    assert dl.download_folder == expected
def test_constructor_blank_path():
    dl = Downloader("")
    # pathlib treats blank paths as the current working directory
    expected = Path.cwd()
    assert dl.download_folder == expected
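# A consolidated sketch (not from the original test suite): the three constructor path
# tests above could also be written with pytest.mark.parametrize. The expected values
# simply mirror the assertions already shown.
import pytest
from pathlib import Path
from sec_edgar_downloader import Downloader


@pytest.mark.parametrize(
    "constructor_arg, expected",
    [
        ("./Downloads", Path.cwd().joinpath("Downloads")),
        (None, Path.home().joinpath("Downloads")),
        ("", Path.cwd()),
    ],
)
def test_constructor_paths(constructor_arg, expected):
    dl = Downloader() if constructor_arg is None else Downloader(constructor_arg)
    assert dl.download_folder == expected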
def downloader(tmpdir):
    tmp_dir = Path(tmpdir.mkdir("Downloads"))
    dl = Downloader(tmp_dir)
    yield dl, tmp_dir
    shutil.rmtree(tmp_dir)
def downloader(tmp_path):
    tmp_dir = tmp_path / "Downloads"
    tmp_dir.mkdir()
    dl = Downloader(tmp_dir)
    yield dl, tmp_dir
    shutil.rmtree(tmp_dir)
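# A sketch of how the fixture above might be consumed in a test (not from the original
# suite). It assumes the fixture is registered with @pytest.fixture and that the
# Downloader exposes download_folder as in the constructor tests shown earlier.
def test_downloader_uses_tmp_dir(downloader):
    dl, tmp_dir = downloader
    # the download folder should point at the temporary directory created by the fixture
    assert dl.download_folder == tmp_dir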
# import required packages
import os
import requests
import pandas as pd
import time

from sec_edgar_downloader import Downloader

# api related info.
dl = Downloader('D:/Thesis_data')

tickers = pd.read_excel('D:\\Thesis_data\\cik_ticker.xlsx')
nyse_nas = pd.concat([
    tickers[tickers['Exchange'] == 'NYSE'],
    tickers[tickers['Exchange'] == 'NASDAQ']
], axis=0)
n_cik = nyse_nas.shape[0]
error_list = []
print('Number of Stocks: ' + str(n_cik))
n = 0


# remove already-downloaded CIKs from the list
def diff(first, second):
    n = len(os.listdir('D:\\Thesis_data\\sec_edgar_filings'))
    second = set(second)
    return [item for item in first if str(item) not in second], n


download_list, n = diff(list(nyse_nas['CIK']),
                        os.listdir('D:\\Thesis_data\\sec_edgar_filings'))
# -*- coding: utf-8 -*-
"""
Created on Sun Mar  1 12:30:27 2020

@author: Stephen Sigrist
"""
import pandas as pd
from sec_edgar_downloader import Downloader

tickers_data = pd.read_csv("..."
                           "...TriggeredEvents\\input files\\equity tickers.csv")
tickers_data.head()
# del tickers_data

dl = Downloader("...TriggeredEvents\\downloaded\\SEC")

for i in range(len(tickers_data)):
    print("Begin Downloading " + tickers_data['ticker'][i])
    for filing_type in dl.supported_filings:
        try:
            dl.get(filing_type, tickers_data['ticker'][i], 10)
        except:
            print("An Error Occurred in Downloading Process")
    print("Finished Downloading " + tickers_data['ticker'][i])
def form_valid(self, form):
    # get the user and validate the form
    form.instance.user = self.request.user
    super(Createlisting, self).form_valid(form)

    # get the company CIK
    tik = form.data['ticker']
    URL = 'http://www.sec.gov/cgi-bin/browse-edgar?CIK={}&Find=Search&owner=exclude&action=getcompany'
    CIK_RE = re.compile(r'.*CIK=(\d{10}).*')
    f = requests.get(URL.format(tik), stream=True)
    results = CIK_RE.findall(f.text)
    if len(results):
        cik = results[0]
        cmp_name = self.edgar.getCompanyNameByCik(cik)

        # create the object in the database
        company = Company(ticker=tik, cik=cik, name=cmp_name, user=self.request.user)
        company.save()

        # delete empty company records from the database
        queryset = Company.objects.filter(name='').delete()

        # build the filename and URL structure
        BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        path = os.path.join(BASE_DIR, 'static')
        out_path = path + "/sec_edgar_filings/" + tik + "/10-K/"

        # create the downloader with the download path and fetch the txt file, which is
        # too big to load without XML iteration; an alternative approach should be considered
        dl = Downloader(path)
        # download only the latest filing by passing 1
        dl.get_10k_filings(tik, 1)

        # remove the '-' and '.txt' to get the accession number of the current filing submission
        file_name = [
            f for f in os.listdir(out_path)
            if os.path.isfile(os.path.join(out_path, f))
        ]
        switched_filename = file_name[0]
        switched_filename = switched_filename.replace('-', '').replace('.txt', '/index.json')

        # base URL configuration; this could be done more cleanly
        bare_url = r"https://www.sec.gov/Archives/edgar/data/"
        base_url = r"https://www.sec.gov"
        documents_url = bare_url + str(results[0]) + "/" + switched_filename

        # retrieve the files and get the filing summary
        content = requests.get(documents_url).json()
        for file in content['directory']['item']:
            # grab the filing summary and build a new URL leading to the file so we can download it
            if file['name'] == 'FilingSummary.xml':
                xml_summary = base_url + content['directory']['name'] + "/" + file['name']
                # print info
                print('-' * 50)
                print('File Name: ' + file['name'])
                print('File Path: ' + xml_summary)

        # define a new base URL that represents the filing folder; this will come in handy
        # when we need to download the reports
        base_url = xml_summary.replace('FilingSummary.xml', '')

        # request and parse the content
        content = requests.get(xml_summary).content
        soup = BeautifulSoup(content, 'lxml')

        # find the 'myreports' tag because it contains all the individual reports submitted
        reports = soup.find('myreports')

        # master list to store all the individual components of the report
        master_reports = []

        # loop through each report in the 'myreports' tag, skipping the last one because it produces an error
        for report in reports.find_all('report')[:-1]:
            # dictionary to store all the different parts we need
            report_dict = {}
            report_dict['name_short'] = report.shortname.text
            report_dict['name_long'] = report.longname.text
            report_dict['position'] = report.position.text
            report_dict['menu_category'] = report.menucategory.text
            report_dict['url'] = base_url + report.htmlfilename.text

            # append the dictionary to the master list
            master_reports.append(report_dict)

            if report_dict['name_short'] == 'Consolidated Statements of Cash Flows':
                # print the info
                print('-' * 50)
                print(base_url + report.htmlfilename.text)
                print(report.longname.text)
                print(report.shortname.text)
                print(report.menucategory.text)
                print(report.position.text)

                # hold on to the URL, since a bug was writing a different file path into the database
                redirect_url_to_statement = base_url + report.htmlfilename.text

        # in case of multiple statements
        statements_url = []
        for report_dict in master_reports:
            # define the statements we want to look for
            item1 = r"Consolidated Statements of Cash Flows"
            # store them in a list
            report_list = [item1]

            # if the short name can be found in the report list
            if report_dict['name_short'] in report_list:
                # print some info and store it in the statements url list
                print('-' * 50)
                print(report_dict['name_short'])
                print(report_dict['url'])
                statements_url.append(report_dict['url'])

        statement = Statement(year=2019,
                              type="CONSOLIDATED STATEMENTS OF CASH FLOWS",
                              url=redirect_url_to_statement,
                              company=company)
        statement.save()

        statements_data = []
        # loop through each statement url
        for statementUrl in statements_url:
            # dictionary that will store the different parts of the statement
            statement_data = {}
            statement_data['headers'] = []
            statement_data['sections'] = []
            statement_data['data'] = []

            # request the statement file content
            content = requests.get(statementUrl).content
            report_soup = BeautifulSoup(content, 'html')

            # find all the rows, figure out what type of row each is, parse the elements,
            # and store them in the statement data lists
            for index, row in enumerate(report_soup.table.find_all('tr')):
                # first, get all the elements
                cols = row.find_all('td')

                # if it's a regular row and not a section or a table header
                if (len(row.find_all('th')) == 0 and len(row.find_all('strong')) == 0):
                    reg_row = [ele.text.strip() for ele in cols]
                    statement_data['data'].append(reg_row)
                # if it's a regular row and a section but not a table header
                elif (len(row.find_all('th')) == 0 and len(row.find_all('strong')) != 0):
                    sec_row = cols[0].text.strip()
                    statement_data['sections'].append(sec_row)
                # finally, if it's not any of those it must be a header
                elif (len(row.find_all('th')) != 0):
                    hed_row = [ele.text.strip() for ele in row.find_all('th')]
                    statement_data['headers'].append(hed_row)
                else:
                    print('We encountered an error.')

            # save the data into the database: create each header and iterate through all data values
            print("HEADERSSSSS ")
            print("Saving Headers...")
            for i in range(len(statement_data['headers'][1])):
                print(statement_data['headers'][1][i])
                statementHeader = Statment_element_headers(
                    field=statement_data['headers'][1][i], statement=statement)
                statementHeader.save()

                print("DATAAAAAAAA ")
                print("Saving Data Element...")
                for j in statement_data['data']:
                    print(j)
                    print(j[i + 1])
                    k = j[i + 1]
                    # normalize the data format
                    # (note: this condition is always truthy in Python, so the cleanup runs for every value)
                    if '$' or ',' or '(' in k:
                        k = k.replace('$', '')
                        k = k.replace(' ', '')
                        k = k.replace(',', '.')
                        k = k.replace('(', '-')
                        k = k.replace(')', '')
                        k = float(k)
                    print(k)
                    statementData = Statement_element_data(
                        key=j[0], value=k, statement=statement,
                        company=company, header=statementHeader)
                    statementData.save()
                    print(j)
                print("Saving Data Done for Element")
            print("Saving Headers Done")

            print("SECTIONSSSS ")
            print("Saving Headers ...")
            for i in statement_data['sections']:
                print(i)
                statementSections = Statement_element_section(
                    fieldName=i, statement=statement)
                statementSections.save()
            print("Saving Sections Done...")

            # append it to the master list for future analysis with pandas and NLP
            statements_data.append(statement_data)

        # print(statements_data)
    return redirect('home')
def get_links(self):
    dl = Downloader(desktop_path + "/13f filings")
    # get past 68 13F filings for each company
    # for c in CIKs.values():
    dl.get_13f_hr_filings('0000807985', 68)
def test_invalid_save_path_constructor():
    test_path = str(Path.home().joinpath("Downloads", "invalid_dir"))
    with pytest.raises(IOError) as excinfo:
        Downloader(test_path)
    expected_msg = f"The folder for saving company filings ({test_path}) does not exist."
    assert expected_msg in str(excinfo.value)
import pandas as pd
from sec_edgar_downloader import Downloader

x = pd.read_csv(
    r"/home/mohit/Dropbox/ra_tasks_ms/bailout_firms/Regular Scheduled SEC scraping/companies.csv"
)
x = x['Column_Name']

dl = Downloader(
    r"/home/mohit/Dropbox/ra_tasks_ms/bailout_firms/Regular Scheduled SEC scraping"
)

for i in x:
    dl.get("8-K", i, after_date="20200328")
from sec_edgar_downloader import Downloader

path = r'E:\stockdata3\Filings'
dl = Downloader(path)
aapl = dl.get('10-K', 'aapl', 12)
# EDGAR SEC 10-K extraction
import pandas as pd
from sec_edgar_downloader import Downloader

# import the IBB holdings .csv
file = 'IBB-holdings.csv'
IBB = pd.read_csv(file, skiprows=13)
holdings = list(IBB["Symbol"])

# set the download location
dl = Downloader('/Users/nick/Documents/cs506/project')


def Bulk_extraction(ticker, filetype, date, location):
    '''
    ticker   = company ticker or list of tickers
    filetype = type of financial doc (8-K, 10-K)
    date     = all filings after this date, format: year-month-day
    location = local directory to store files, as a string
    '''
    dl = Downloader(str(location))
    for company in ticker:
        dl.get(str(filetype), str(company), after=str(date), download_details=True)
    return "Complete"


# Have to double check to see if all funds are updated for 2019-2020
# Bulk_extraction(holdings, '10-K', '2019-01-01', dl)
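# An illustrative call (not in the original script). Note that the commented-out call
# above passes the dl Downloader object as `location`, whereas Bulk_extraction expects
# a directory path string; the path below simply reuses the script's own download location.
if __name__ == "__main__":
    Bulk_extraction(holdings, '10-K', '2019-01-01', '/Users/nick/Documents/cs506/project')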
import numpy as np
import pandas as pd
from sec_edgar_downloader import Downloader

df = pd.read_csv('data/sp500_list.csv')
drop_column = df.columns[0]
df.drop(columns=drop_column, inplace=True)
df.CIK = df.CIK.astype(str)
df['CIK'] = df['CIK'].str.zfill(10)

dl = Downloader('test_data/')
for i in df.index:
    print(f"{df.index[i]}: Pulling 10-Ks and 10-Qs for {df.COMPANY[i]}")
    dl.get("10-K", df.CIK[i], after_date="20190101", include_amends=False)
    dl.get("10-Q", df.CIK[i], after_date="20190101", include_amends=False)
    print(f"{df.index[i]}: {df.COMPANY[i]} done.")
def test_constructor_user_path():
    dl = Downloader("~/Downloads")
    expected = Path.home().joinpath("Downloads")
    assert dl.download_folder == expected
class FinancialStatementDatasetBuilder(tfds.core.GeneratorBasedBuilder):
    def __init__(self, args, log):
        self.args = args
        self.log = log
        self.VERSION = tfds.core.Version(self.args.dataset_version)
        self.MANUAL_DOWNLOAD_INSTRUCTIONS = "Dataset already downloaded manually"
        super(tfds.core.GeneratorBasedBuilder, self).__init__()
        self.dl = Downloader(self.args.download_path)
        self.parser = FinancialReportParser()
        self.text_processor = get_text_processor(args.model)(args)

    def _info(self):
        return tfds.core.DatasetInfo(
            builder=self,
            description=("Financial statements data."),
            features=tfds.features.FeaturesDict({
                "documents": tfds.features.Tensor(
                    dtype=tf.string, shape=(self.args.number_of_periods, )),
                "label": tfds.features.Tensor(dtype=tf.int64, shape=(2, ))
            }),
            supervised_keys=("documents", "label"),
            homepage="https://xxx",
            citation=r"""@article{my-awesome-dataset-2020, author = {Hurubaru, Sebastian},"}""",
        )

    def _split_generators(self, dl_manager):
        # Specify the splits
        return [
            tfds.core.SplitGenerator(
                name=tfds.Split.TRAIN,
                gen_kwargs={"input_dir": os.path.join(self.args.input_dir, 'train')},
            ),
            tfds.core.SplitGenerator(
                name=tfds.Split.VALIDATION,
                gen_kwargs={"input_dir": os.path.join(self.args.input_dir, 'dev')},
            ),
            tfds.core.SplitGenerator(
                name=tfds.Split.TEST,
                gen_kwargs={"input_dir": os.path.join(self.args.input_dir, 'test')},
            )
        ]

    def _generate_examples(self, input_dir):
        # Get the content of the dataset file
        dataset = tf.data.experimental.make_csv_dataset(
            os.path.join(input_dir, self.args.company_files),
            batch_size=1,
            column_defaults=[tf.string, tf.string, tf.string, tf.int32],
            label_name='label',
            na_value="?",
            num_epochs=1,
            ignore_errors=True)

        for company_info, label in dataset:
            ciks = company_info['cik'].numpy()[0].decode('utf-8').split(';')
            ciks.sort(reverse=True, key=lambda cik: int(cik))
            end_date = company_info['end_date'].numpy()[0].decode('utf-8')

            try:
                documents = []

                # For multiple CIKs take, in descending order, the last
                # args.number_of_periods 10-K reports
                for cik in ciks:
                    cik_folder = os.path.join(
                        os.path.expanduser(self.args.download_path),
                        'sec_edgar_filings', cik.strip().lstrip("0"), '10-K')

                    # Download if and only if the directory does not exist
                    if (os.path.exists(cik_folder) is False):
                        self.dl.get("10-K", cik,
                                    before_date=end_date,
                                    num_filings_to_download=self.args.number_of_periods)

                    for r, d, f in os.walk(cik_folder):
                        for file in f:
                            if '.txt' in file:
                                documents.append(
                                    tf.convert_to_tensor(
                                        self.parser.parse_10K_txt_file(os.path.join(r, file)),
                                        dtype=tf.string))

                if len(documents) < self.args.number_of_periods:
                    raise Exception(
                        f'Could not retrieve {self.args.number_of_periods} 10-K records for {cik}')

                yield cik, {
                    'documents': tf.stack(documents)[:self.args.number_of_periods],
                    'label': [1, 0] if label.numpy()[0] == 0 else [0, 1]
                }
            except Exception as e:
                self.log.error(f'Exception occurred for cik {cik}: {e}')

    def _process_text_map_fn(self, text, label):
        processed_text, label = tf.py_function(self._process_text,
                                               inp=[text, label],
                                               Tout=(tf.float32, tf.int64))
        return processed_text, label

    def _process_text(self, text, label):
        # To allow debugging in the combined static/eager mode
        # pydevd.settrace(suspend=True)

        # Process the text
        processed_text = self.text_processor.process_text(text)
        return (processed_text, label)
def test_constructor_custom_path():
    custom_path = Path.home().joinpath("Downloads/SEC/EDGAR/Downloader")
    dl = Downloader(custom_path)
    assert dl.download_folder == custom_path
def downloader(tmp_path):
    dl = Downloader(tmp_path)
    yield dl, tmp_path
    shutil.rmtree(tmp_path)
def download_and_parse(actual_stock, ciks, dict):
    """
    This function is the meat and potatoes of downloading the SEC 10-K filings.
    It uses the sec_edgar_downloader package to download the 10-K filing. Then it uses code
    from https://gist.github.com/anshoomehra/ead8925ea291e233a5aa2dcaa2dc61b2 to parse the
    10-K filing for Item 1A. The code separates Item 1A into sentences and outputs it to a
    dictionary associated with the CIK value.

    :param actual_stock: the stock ticker
    :param ciks: CIK - stock ticker dictionary/crosswalk
    :param dict: a dictionary to store the 10-K Item 1A sentences
    :return: nothing, but it constantly updates/adds to the dictionary
    """
    if actual_stock[0] in ['BF.B', 'BF-B', 'bf-b', 'bf.b', 'HES', 'hes', 'hpe', 'HPE']:
        print("This stock has no CIK... issue there so I am skipping")
        return

    cik = convert_ticker_to_cik(actual_stock, ciks)
    cik = cik.zfill(10)

    dl = Downloader()
    # stock_ticker = "0001067983"
    dl.get("10-K", cik, after="2015-01-01", download_details=False)

    count = 0
    for root, dirs, files in os.walk("./sec-edgar-filings/{}/10-K/".format(cik)):
        # search through each year's 10-K filing
        for file in files:
            # find the txt document of the 10-K filing
            if file == 'full-submission.txt':
                try:
                    year = re.findall(r'\-[0-9][0-9]\-', root)
                    year = year[len(year) - 1][1:-1]

                    # certain stocks have issues for certain years; code to exclude them would go here
                    # if year == 21 and stock_ticker in ("'APA'", "'ADM'"):
                    #     print("Skipping year {} for ticker {} due to issues...".format(year, stock_ticker))

                    # read the file
                    filing_text = read_file(root + '/' + file)

                    # this code comes from https://gist.github.com/anshoomehra/ead8925ea291e233a5aa2dcaa2dc61b2
                    doc_start_pattern = re.compile(r'<DOCUMENT>')
                    doc_end_pattern = re.compile(r'</DOCUMENT>')
                    # Regex to find a <TYPE> tag followed by any characters, terminating at a new line
                    type_pattern = re.compile(r'<TYPE>[^\n]+')

                    # Create 3 lists with the span indices for each regex
                    ### There are many <DOCUMENT> tags in this text file, each a specific exhibit like 10-K, EX-10.17 etc.
                    ### The first filter gives us each document tag start's <end> and each document tag end's <start>.
                    ### We will use this later to grab the content in between these tags.
                    doc_start_is = [x.end() for x in doc_start_pattern.finditer(filing_text)]
                    doc_end_is = [x.start() for x in doc_end_pattern.finditer(filing_text)]

                    ### The type filter looks for <TYPE> with a "not a newline" class, i.e. terminate there, with a + sign
                    ### to match any chars afterwards until a newline \n. This gives us <TYPE> followed by a section name
                    ### like '10-K'. The line below keeps the content after <TYPE>, i.e. '10-K', as the section names.
                    doc_types = [x[len('<TYPE>'):] for x in type_pattern.findall(filing_text)]

                    document = {}
                    # go through each section type and save only the 10-K section in the dictionary
                    for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
                        if doc_type == '10-K':
                            document[doc_type] = filing_text[doc_start:doc_end]

                    # locate the Item 1A / Item 1B headings
                    regex = re.compile(
                        r'(>(Item|ITEM)(\s|&#160;|&nbsp;| )(1A|1B)\.{0,1})|(ITEM\s(1A|1B))|'
                        r'(<B>Item</B><B></B><B> 1A</B>)|( Item 1B.)|'
                        r'(Item<font style="font-family:Times New Roman, Times, serif;font-size:10pt;"> 1B)|'
                        r'(Item<font style="font-family: Times New Roman, Times, serif; font-size: 10pt;"> 1B)'
                    )
                    matches = regex.finditer(document['10-K'])
                    test_df = pd.DataFrame([(x.group(), x.span()[0], x.span()[1]) for x in matches])
                    if len(test_df) == 0:
                        print("error... didn't pick up anything")
                        break
                    test_df.columns = ['item', 'start', 'end']
                    test_df['item'] = test_df.item.str.lower()
                    test_df.replace(
                        '<font style="font-family:times new roman, times, serif;font-size:10pt;">',
                        ' ', regex=True, inplace=True)
                    test_df.replace(
                        '<font style="font-family: times new roman, times, serif; font-size: 10pt;">',
                        ' ', regex=True, inplace=True)
                    # normalize HTML whitespace entities, then strip spaces and tag remnants
                    test_df.replace('&#160;', ' ', regex=True, inplace=True)
                    test_df.replace('&nbsp;', ' ', regex=True, inplace=True)
                    test_df.replace('\xa0', ' ', regex=True, inplace=True)
                    test_df.replace(' ', '', regex=True, inplace=True)
                    test_df.replace('\.', '', regex=True, inplace=True)
                    test_df.replace('>', '', regex=True, inplace=True)
                    test_df.replace('<b', '', regex=True, inplace=True)
                    test_df.replace('</b', '', regex=True, inplace=True)

                    pos_dat = test_df.sort_values('start', ascending=True).drop_duplicates(
                        subset=['item'], keep='last')
                    pos_dat.set_index('item', inplace=True)

                    # check conditionals here
                    if 'item1a' in pos_dat.index and 'item1b' in pos_dat.index:
                        item_1a_raw = document['10-K'][
                            pos_dat['start'].loc['item1a']:pos_dat['start'].loc['item1b']]
                        item_1a_content = BeautifulSoup(item_1a_raw, 'lxml')
                        test_df["text"] = item_1a_content.get_text()
                        test_df.replace('([0-9]|[0-9][0-9])(\s{0,3})Table of Contents',
                                        ' ', regex=True, inplace=True)
                        test_df.replace('Table of Contents', ' ', regex=True, inplace=True)
                        test_df.replace('\s\s', ' ', regex=True, inplace=True)
                        test_df.replace('\\u200b', ' ', regex=True, inplace=True)
                        test_df.replace('\\n[0-9]', ' ', regex=True, inplace=True)
                        test_df.replace('[0-9]\\n', ' ', regex=True, inplace=True)
                        test_df.replace('\\xa0', ' ', regex=True, inplace=True)
                        test_df.replace('\\x92', ' ', regex=True, inplace=True)
                        test_df.replace('\\x93', ' ', regex=True, inplace=True)
                        test_df.replace('\\x94', ' ', regex=True, inplace=True)
                        test_df.replace('\\x95', ' ', regex=True, inplace=True)
                        test_df.replace('\\n', ' ', regex=True, inplace=True)
                        test_df.replace('\n', ' ', regex=False, inplace=True)

                        # output the text to the dict
                        sentences = nltk.sent_tokenize(str(test_df['text'][0]))
                        if count == 0:
                            output_frame = pd.DataFrame([[year, sentences]],
                                                        columns=["year", "text"])
                        else:
                            output_frame = output_frame.append(
                                pd.DataFrame([[year, sentences]], columns=["year", "text"]),
                                ignore_index=True)
                        dict[cik] = output_frame
                        print("finished processing ticker {} ({}) and added to dictionary for year {}"
                              .format(cik, actual_stock[0], year))
                        print(75 * '')
                        count += 1
                    else:
                        # fall back to the span between Item 1A and Item 2
                        regex = re.compile(
                            r'(>(Item|ITEM)(\s|&#160;|&nbsp;| )(1A|2)\.{0,1})|(ITEM\s(1A|2))|'
                            r'(<B>Item</B><B></B><B> 1A</B>)|( Item 2.)'
                        )
                        matches = regex.finditer(document['10-K'])
                        test_df = pd.DataFrame([(x.group(), x.span()[0], x.span()[1])
                                                for x in matches])
                        if len(test_df) == 0:
                            print("error... didn't pick up anything")
                            break
                        test_df.columns = ['item', 'start', 'end']
                        test_df['item'] = test_df.item.str.lower()
                        # normalize HTML whitespace entities, then strip spaces and tag remnants
                        test_df.replace('&#160;', ' ', regex=True, inplace=True)
                        test_df.replace('&nbsp;', ' ', regex=True, inplace=True)
                        test_df.replace('\xa0', ' ', regex=True, inplace=True)
                        test_df.replace(' ', '', regex=True, inplace=True)
                        test_df.replace('\.', '', regex=True, inplace=True)
                        test_df.replace('>', '', regex=True, inplace=True)
                        test_df.replace('<b', '', regex=True, inplace=True)
                        test_df.replace('</b', '', regex=True, inplace=True)

                        pos_dat = test_df.sort_values('start', ascending=True).drop_duplicates(
                            subset=['item'], keep='last')
                        pos_dat.set_index('item', inplace=True)

                        item_1a_raw = document['10-K'][
                            pos_dat['start'].loc['item1a']:pos_dat['start'].loc['item2']]
                        item_1a_content = BeautifulSoup(item_1a_raw, 'lxml')
                        test_df["text"] = item_1a_content.get_text()
                        test_df.replace('([0-9]|[0-9][0-9])(\s{0,3})Table of Contents',
                                        ' ', regex=True, inplace=True)
                        test_df.replace('Table of Contents', ' ', regex=True, inplace=True)
                        test_df.replace('\s\s', ' ', regex=True, inplace=True)
                        test_df.replace('\\u200b', ' ', regex=True, inplace=True)
                        test_df.replace('\\n[0-9]', ' ', regex=True, inplace=True)
                        test_df.replace('[0-9]\\n', ' ', regex=True, inplace=True)
                        test_df.replace('\\xa0', ' ', regex=True, inplace=True)
                        test_df.replace('\\x92', ' ', regex=True, inplace=True)
                        test_df.replace('\\x93', ' ', regex=True, inplace=True)
                        test_df.replace('\\x94', ' ', regex=True, inplace=True)
                        test_df.replace('\\x95', ' ', regex=True, inplace=True)
                        test_df.replace('\\n', ' ', regex=True, inplace=True)
                        test_df.replace('\n', ' ', regex=False, inplace=True)

                        # output the text to the dict
                        sentences = nltk.sent_tokenize(str(test_df['text'][0]))
                        if count == 0:
                            output_frame = pd.DataFrame([[year, sentences]],
                                                        columns=["year", "text"])
                        else:
                            output_frame = output_frame.append(
                                pd.DataFrame([[year, sentences]], columns=["year", "text"]),
                                ignore_index=True)
                        dict[cik] = output_frame
                        print("finished processing ticker {} ({}) and added to dictionary for year {}"
                              .format(cik, actual_stock[0], year))
                        print(75 * '')
                        count += 1
                except:
                    print("error occurred")
from sec_edgar_downloader import Downloader

dl = Downloader('C:\\Users\\willi\\Documents\\Company_filings\\BRK-A\\13F-HR')

# Download using the CIK of Berkshire Hathaway (BRK-A)
dl.get("13F-HR", "0001067983")
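# A follow-up sketch (not in the original snippet) that lists whatever was downloaded.
# The subfolder name differs across sec_edgar_downloader versions, so both layouts seen
# in this section are checked; the base path reuses the one passed to Downloader above.
import os

base = 'C:\\Users\\willi\\Documents\\Company_filings\\BRK-A\\13F-HR'
for layout in ('sec-edgar-filings', 'sec_edgar_filings'):
    filings_dir = os.path.join(base, layout, '0001067983', '13F-HR')
    if os.path.isdir(filings_dir):
        for accession_number in sorted(os.listdir(filings_dir)):
            print(accession_number)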