def get_10k(self):
    """Return the path to a locally cached 10-K filing for ``self.ticker``,
    downloading from EDGAR first when no cached copy exists.

    Returns:
        str: Path to the 10-K file, or ``None`` when the download fails
        (or yields no files).
    """
    file_dir = os.getcwd() + '/filings/' + self.ticker + '/10-k/'
    if os.path.isdir(file_dir):
        # List the directory once instead of three times.
        entries = os.listdir(file_dir)
        if entries:
            # NOTE(review): the original loop returned the LAST entry of the
            # listing (despite its comment claiming "first file"); that
            # behavior is preserved here.
            return file_dir + entries[-1]
    try:
        file_dir = os.getcwd() + '/filings/'
        my_filings = Filing(cik_lookup=self.ticker,
                            filing_type=FilingType.FILING_10K,
                            count=self.NUM_10Ks)
        my_filings.save(file_dir)
        print(self.ticker + " 10k downloaded")
        file_dir += self.ticker + '/10-k/'
        entries = os.listdir(file_dir)
        # Same "last entry" convention as the cache hit above.  Previously an
        # empty listing returned the bare directory path; return None instead
        # so callers' existing None-check catches it.
        return file_dir + entries[-1] if entries else None
    except OSError as err:
        print("OS error: {0}".format(err))
        print('Unable to download ' + self.ticker + ' 10k!')
        return None
def filing(ctx, lookups, ftype, start, end, count, directory):
    r"""Click command for downloading filings. Run ``secedgar filing --help`` for info.

    \f

    Args:
        ctx (click.core.Context): Click context.
        lookups (str): Companies and tickers to include in filing download.
        ftype (str): String of FilingType enum.
        start (str): Start date for filings in YYYYMMDD format.
            Will implicitly default to first available filing.
        end (str): End date for filings in YYYYMMDD format.
            Will implicitly default to today.
        count (int): Number of filings to save per ticker/company.
        directory (str): Directory where files should be saved.
            Defaults to current working directory.

    Returns:
        None
    """
    # Map the raw string onto the FilingType enum; an unknown name is a
    # user error and surfaces as FilingTypeError.
    try:
        filing_type = FilingType[ftype]
    except KeyError:
        raise FilingTypeError()

    Filing(cik_lookup=lookups,
           filing_type=filing_type,
           start_date=date_cleanup(start),
           end_date=date_cleanup(end),
           count=count,
           user_agent=ctx.obj['user_agent']).save(directory=directory)
def get_most_recent_10_K(firms, fp):
    """Download the single most recent 10-K for every firm in *firms* into *fp*."""
    for company in firms:
        Filing(cik_lookup=company,
               filing_type=FilingType.FILING_10K,
               count=1).save(fp)
def filing(lookups, ftype, start, end, count, directory):
    """Click command for downloading filings. Run ``secedgar filing --help`` for info."""
    # If given filing type is not valid enum, raise FilingTypeError
    try:
        filing_type = FilingType[ftype]
    except KeyError:
        raise FilingTypeError()

    Filing(cik_lookup=lookups,
           filing_type=filing_type,
           start_date=date_cleanup(start),
           end_date=date_cleanup(end),
           count=count).save(directory=directory)
def run(df):
    """Download 10-K filings for every CIK listed in *df*.

    Args:
        df (pandas.DataFrame): Assumed to contain 'CIK' and 'Name' columns
            (confirmed by the column accesses below).
    """
    for cik, name in zip(df['CIK'], df['Name']):
        # EDGAR CIKs are 10-digit zero-padded strings.  zfill(10) also
        # handles CIKs that are already 10 digits long, which the old
        # ``len(str(c)) < 10`` guard silently skipped.
        padded = str(cik).zfill(10)
        print("SCRAPING {} ...".format(padded))
        my_filings = Filing(cik=padded,
                            filing_type=FilingType.FILING_10K)
        try:
            my_filings.save('./filings/')
        except ValueError:
            # secedgar raises ValueError when no filings exist for the CIK.
            print("No {}".format(name))
def test_filing_raises_warning_when_less_filings_than_count(self, recwarn, count, raises_error, tmp_data_directory, mock_cik_validator_get_single_cik, mock_single_cik_filing_limited_responses):  # noqa:E501
    """Filing.save warns iff fewer filings exist than were requested."""
    filing = Filing(cik_lookup=['aapl', 'msft', 'amzn'],
                    filing_type=FilingType.FILING_10Q,
                    count=count,
                    client=NetworkClient(batch_size=10))
    filing.save(tmp_data_directory)
    if raises_error:
        warning = recwarn.pop(UserWarning)
        assert issubclass(warning.category, UserWarning)
    else:
        try:
            recwarn.pop(UserWarning)
        except AssertionError:
            # pop() raises AssertionError when no UserWarning was recorded,
            # which is exactly what we expect on this branch.
            pass
        else:
            pytest.fail("Expected no UserWarning, but received one.")
def download_filing_4(symbol, data_filings_path, start_date=datetime(2019, 7, 1), end_date=datetime(2020, 6, 30)):
    """
    Download Form 4 from SEC

    Downloads all the info about the form 4 in multiple txt files.
    TODO: Look if it overwrites. Save a file with metadata of already looked dates. Create a folder for the symbol and for the looked filing
    """
    ticker = symbol.lower()
    # Guard clause: skip the download entirely when the symbol's folder exists.
    if os.path.exists(os.path.join(data_filings_path, ticker)):
        print("Already downloaded")
        return

    Filing(cik_lookup=ticker,
           filing_type=FilingType.FILING_4,
           start_date=start_date,
           end_date=end_date).save(data_filings_path)
def test_filing_raises_warning_when_less_filings_than_count(
        self, monkeypatch, recwarn, count, raises_error, tmp_data_directory):
    """Filing.save warns iff fewer filings exist than were requested (mocked client)."""
    monkeypatch.setattr(_CIKValidator, "get_ciks",
                        MockCIKValidatorGetCIKs.get_ciks)
    monkeypatch.setattr(NetworkClient, "get_response",
                        MockSingleCIKFilingLimitedResponses(10))
    filing = Filing(cik_lookup=['aapl', 'msft', 'amzn'],
                    filing_type=FilingType.FILING_10Q,
                    count=count,
                    client=NetworkClient(batch_size=10))
    filing.save(tmp_data_directory)
    if raises_error:
        warning = recwarn.pop(UserWarning)
        assert issubclass(warning.category, UserWarning)
    else:
        try:
            recwarn.pop(UserWarning)
        except AssertionError:
            # No UserWarning recorded -- the expected outcome here.
            pass
        else:
            pytest.fail("Expected no UserWarning, but received one.")
def test_save_no_filings_raises_error(self, tmp_data_directory, monkeypatch, no_urls):
    """save() must raise ValueError when no filing URLs are available."""
    monkeypatch.setattr(Filing, "get_urls", lambda x: no_urls)
    filing = Filing(cik_lookup='aapl', filing_type=FilingType.FILING_10K)
    with pytest.raises(ValueError):
        filing.save(tmp_data_directory)
def test_filing_save_multiple_ciks(self, tmp_data_directory,
                                   mock_cik_validator_get_multiple_ciks,
                                   mock_single_cik_filing,
                                   mock_filing_response):
    """Saving 10-Q filings for several tickers at once should succeed."""
    filing = Filing(["aapl", "amzn", "msft"], FilingType.FILING_10Q, count=3)
    filing.save(tmp_data_directory)
nest_asyncio.apply()

# In[19]:

# 10-K filings for GameStop (ticker "gme").  The duplicate
# ``from secedgar.filings import ...`` line was removed.
from secedgar.filings import Filing, FilingType

my_filings = Filing(cik_lookup=['gme'],
                    filing_type=FilingType.FILING_10K,
                    count=1326380,
                    user_agent='deeptendies')
my_filings.save('filings')

# # Parse Data

# In[1]:

import glob
from pathlib import Path

from secedgar.parser import MetaParser

out_dir = 'parsed_filings'
Path(out_dir).mkdir(parents=True, exist_ok=True)
directory = 'filings'

for filepath in glob.iglob('filings/*/*/*.txt'):
    print(filepath)
from secedgar.filings import Filing, FilingType
import os
from tqdm import tqdm

# Read one ticker symbol per line.  ``with`` closes the handle, and
# str.strip() replaces the old ``ticker[:-1]`` slicing, which corrupted
# the final ticker whenever the file lacked a trailing newline.
# Blank lines are skipped.
with open('tickers.txt', 'r') as f:
    tickers = [line.strip() for line in f if line.strip()]

for ticker in tqdm(tickers):
    try:
        file_dir = os.getcwd() + '/filings/'
        my_filings = Filing(cik_lookup=ticker,
                            filing_type=FilingType.FILING_10K,
                            count=1)
        my_filings.save(file_dir)
        print(ticker + " 10k downloaded")
    except OSError as err:
        print("OS error: {0}".format(err))
        print('Unable to download ' + ticker + ' 10k!')
from secedgar.utils import get_cik_map
from secedgar.filings import Filing, FilingType

# Show a small sample of the ticker -> CIK mapping.
sample = list(get_cik_map().items())[:5]
print(sample)

# Most recent 10-Q for Apple.
latest_10q = Filing(cik_lookup='aapl',
                    filing_type=FilingType.FILING_10Q,
                    count=1)
latest_10q.save('/home/sroot/kaizha/temp')
from secedgar.filings import Filing, FilingType

# Fetch the 15 most recent 10-Q filings for Apple (ticker "aapl").
apple_10qs = Filing(cik_lookup='aapl',
                    filing_type=FilingType.FILING_10Q,
                    count=15)
apple_10qs.save('./data')
def __get_data(self, cik, filing_type, data_set):
    """Fetch (or reuse cached) filings for *cik* and tally covenant violations.

    Args:
        cik: Company identifier used for both the EDGAR fetch and the cache path.
        filing_type (FilingType): Filing type enum; only 10-K/10-Q rows are kept.
        data_set: Label copied verbatim into each result row's 'dataset' column.

    Returns:
        tuple: (pandas.DataFrame of per-filing rows, dict of aggregate word counts).
    """
    frames = []  # accumulate rows; a quadratic DataFrame.append loop is avoided
    filing_word_count = dict()
    my_filings = Filing(cik=str(cik), filing_type=filing_type)
    path = f'../data/company_filings/{cik}_{filing_type.value}/'
    if not os.path.exists(path):
        try:
            print(
                f'Fetching data for cik={cik}, filing_type={filing_type}')
            my_filings.save(path)
        except Exception:  # was a bare except: don't swallow KeyboardInterrupt/SystemExit
            # Best-effort cleanup of a partially written cache directory.
            try:
                if os.path.exists(path):
                    shutil.rmtree(path)
            except OSError as e:
                print("Error: %s : %s" % (path, e.strerror))
    else:
        print(f'Skipping data fetching. Using cache at {path}')
    for subdir, dirs, files in os.walk(path):
        for file in files:
            file_metadata = self.__get_file_metadata(f'{subdir}/{file}')
            # Match the on-disk file name back to its source URL.
            for url in my_filings.get_urls():
                if url.rsplit('/')[-1].strip() == file:
                    file_metadata['url'] = url
                    break
            assert len(
                file_metadata
            ) == 8, "Could not get all relevant metadata: %r" % file_metadata
            # Only 10-K/10-Q filings from 2007 onward are analyzed.
            if file_metadata['year'] < 2007 or \
                    (file_metadata['form_type'] != '10-K'
                     and file_metadata['form_type'] != '10-Q'):
                print(
                    f'Skipping file. year={file_metadata["year"]} form_type={file_metadata["form_type"]}'
                )
                continue
            violations_in_file, local_word_count = self.__get_violations_for_file(
                f'{subdir}/{file}')
            file_info = {
                'cik': cik,
                'firm name': file_metadata['company_name'],
                'firm address': file_metadata['address'],
                'zip code': str(file_metadata['zip']),
                'year': file_metadata['year'],
                'quarter': file_metadata['quarter']
                if filing_type is FilingType.FILING_10Q else None,
                'url': file_metadata['url'],
                'filing type': filing_type.value,
                'dataset': data_set,
                'has covenant violation': 0 if violations_in_file == 0 else 1,
                'total violations': violations_in_file
            }
            frames.append(pd.DataFrame(file_info, index=[0]))
            # Merge this filing's word counts into the running totals.
            for word in local_word_count:
                filing_word_count[word] = (filing_word_count.get(word, 0)
                                           + local_word_count[word])
    # pd.concat replicates the old repeated-append result (duplicate 0 indices).
    result = pd.concat(frames) if frames else pd.DataFrame()
    return result, filing_word_count
def test_filing_simple_example(self, tmp_data_directory):
    """Smoke test: a single-ticker 10-Q download completes without error."""
    ibm_filings = Filing(cik_lookup='IBM',
                         filing_type=FilingType.FILING_10Q)
    ibm_filings.save(tmp_data_directory)
# https://github.com/coyo8/sec-edgar
# pip install secedgar
from secedgar.filings import CIKLookup, Filing, FilingType

lookup = '0000320193'  # Apple's zero-padded CIK
# NOTE(review): constructed but never used below; kept to preserve behavior.
lookups = CIKLookup(['aapl', 'msft', 'Facebook'])

my_filings = Filing(cik_lookup=lookup,
                    filing_type=FilingType.FILING_10Q)
my_filings.save('tempdir')
def test_filing_save_single_cik(self, tmp_data_directory,
                                mock_cik_validator_get_single_cik,
                                mock_single_cik_filing):
    """Saving three 10-Q filings for one ticker should succeed."""
    filing = Filing('aapl', FilingType.FILING_10Q, count=3)
    filing.save(tmp_data_directory)
def test_filing_save_multiple_ciks(self, tmp_data_directory,
                                   mock_cik_validator_get_multiple_ciks,
                                   mock_single_cik_filing):
    """Saving 10-Q filings for several tickers should succeed."""
    filing = Filing(['aapl', 'amzn', 'msft'], FilingType.FILING_10Q, count=3)
    filing.save(tmp_data_directory)
def test_filing_save_multiple_ciks(self, tmp_data_directory, monkeypatch):
    """Saving 10-Q filings for several mocked tickers should succeed."""
    monkeypatch.setattr(_CIKValidator, "get_ciks",
                        MockCIKValidatorMultipleCIKs.get_ciks)
    monkeypatch.setattr(NetworkClient, "get_response", MockSingleCIKFiling)
    filing = Filing(['aapl', 'amzn', 'msft'], FilingType.FILING_10Q, count=3)
    filing.save(tmp_data_directory)
def get_mda(id):
    """Extract MD&A ("Item 7") sections from 10-K filings for one slice of companies.

    ``id`` selects a 14-company slice of df_company.pkl; for each company the
    2010-2020 10-Ks are downloaded, the MD&A section is located heuristically,
    and the results are pickled to Corpus_mda/<gvkey>_mda.pkl.  Returns 0.

    NOTE(review): ``id`` shadows the builtin; kept for interface compatibility.
    """

    def normalize_text(text):
        """Normalize Text
        """
        text = unicodedata.normalize("NFKD", text)  # Normalize
        text = '\n'.join(text.splitlines())  # Unicode break lines

        # Convert to upper
        text = text.upper()  # Convert to upper

        # Take care of breaklines & whitespaces combinations due to beautifulsoup parsing
        text = re.sub(r'[ ]+\n', '\n', text)
        text = re.sub(r'\n[ ]+', '\n', text)
        text = re.sub(r'\n+', '\n', text)

        # To find MDA section, reformat item headers
        text = text.replace('\n.\n', '.\n')  # Move Period to beginning
        text = text.replace('\nI\nTEM', '\nITEM')
        text = text.replace('\nITEM\n', '\nITEM ')
        text = text.replace('\nITEM ', '\nITEM ')
        text = text.replace(':\n', '.\n')

        # Math symbols for clearer looks
        text = text.replace('$\n', '$')
        text = text.replace('\n%', '%')

        # Reformat
        text = text.replace('\n', '\n\n')  # Reformat by additional breakline

        return text

    def find_mda_from_text(text, start=0):
        """Find MDA (Management Discussion and Analysis) section from normalized text

        Args:
            text (str): Normalized filing text (output of normalize_text).
            start (int): Offset to resume searching from (used on retry).

        Returns:
            tuple: (mda text or "", end offset relative to ``text[start:]``).
        """
        debug = False

        mda = ""
        end = 0

        # Define start & end signal for parsing
        item7_begins = [
            '\nITEM 7.', '\nITEM 7 –', '\nITEM 7:', '\nITEM 7 ', '\nITEM 7\n'
        ]
        item7_ends = ['\nITEM 7A']
        if start != 0:
            item7_ends.append('\nITEM 7')  # Case: ITEM 7A does not exist
        item8_begins = ['\nITEM 8']
        """
        Parsing code section
        """
        text = text[start:]

        # Get begin: first header variant that appears wins.
        for item7 in item7_begins:
            begin = text.find(item7)
            if debug:
                print(item7, begin)
            if begin != -1:
                break

        if begin != -1:  # Begin found
            for item7A in item7_ends:
                end = text.find(item7A, begin + 1)
                if debug:
                    print(item7A, end)
                if end != -1:
                    break

            if end == -1:  # ITEM 7A does not exist
                for item8 in item8_begins:
                    end = text.find(item8, begin + 1)
                    if debug:
                        print(item8, end)
                    if end != -1:
                        break

            # Get MDA
            if end > begin:
                mda = text[begin:end].strip()
            else:
                end = 0  # signal "not found" so callers don't retry past garbage

        return mda, end

    # df_names = pd.read_csv('master/stocknames_form10k.csv')
    # df_names.drop_duplicates(subset=['gvkey'], inplace = True)
    df_names = pd.read_pickle('df_company.pkl')
    # Slice this worker's 14-company window by id.
    gvkeys = df_names['gvkey'].values[id * 14:(id + 1) * 14]
    names = df_names['CoName'].values[id * 14:(id + 1) * 14]
    for j in range(len(gvkeys)):
        my_filings = Filing(cik_lookup=names[j],
                            filing_type=FilingType.FILING_10K,
                            start_date=dt.datetime(2010, 1, 1),
                            end_date=dt.datetime(2020, 12, 31))
        # NOTE(review): bare except silently skips companies whose download
        # fails for ANY reason (including KeyboardInterrupt) -- confirm intended.
        try:
            my_filings.save('Corpus_10k')
        except:
            continue
        company = names[j]
        files = glob.glob('Corpus_10k/' + company + '/10-k/*')
        files.sort()
        df = pd.DataFrame()
        for i in files:
            print(i)
            # Skip unreadable files.
            try:
                with open(i) as f:
                    content = f.read()
            except:
                continue
            # Skip files BeautifulSoup cannot parse.
            try:
                soup = bs4.BeautifulSoup(content, "html.parser")
            except:
                continue
            text = soup.get_text("\n")
            text = normalize_text(text)
            mda, end = find_mda_from_text(text)
            # A very short match is likely the table-of-contents entry;
            # retry from past the first hit.
            if mda and len(mda.encode('utf-8')) < 1000:
                mda, _ = find_mda_from_text(text, start=end)
            if len(mda.encode('utf-8')) < 1000:
                continue
            # NOTE(review): DataFrame.append is removed in pandas 2.0.
            df = df.append(
                pd.DataFrame({
                    'company': [company],
                    'filename': [i],
                    'mda': [mda]
                }))
        df.to_pickle('Corpus_mda/' + str(gvkeys[j]) + '_mda.pkl')
        # Free disk space: raw filings are deleted once the MD&A is extracted.
        shutil.rmtree('Corpus_10k/' + company)
    return 0
def test_filing_save_single_cik(self, tmp_data_directory, monkeypatch):
    """Saving three 10-Q filings for one mocked ticker should succeed."""
    filing = Filing('aapl', FilingType.FILING_10Q, count=3)
    # Patch AFTER construction, exactly as the original test did.
    monkeypatch.setattr(_CIKValidator, "get_ciks",
                        MockCIKValidatorGetCIKs.get_ciks)
    monkeypatch.setattr(NetworkClient, "get_response", MockSingleCIKFiling)
    filing.save(tmp_data_directory)
from secedgar.filings import Filing, FilingType
from secedgar.utils import get_cik_map
import pandas as pd
import os

# Raw string: the original 'E:\stockdata3\Filings' only worked because \s
# and \F are not recognized escapes; r'...' makes the intent explicit and
# avoids Python's invalid-escape-sequence DeprecationWarning.
path = r'E:\stockdata3\Filings'

# Download the 10 most recent Microsoft 10-K filings.
filing = Filing(cik_lookup='msft',
                filing_type=FilingType.FILING_10K,
                count=10)
filing.save(path)