Example #1
import os
import re
import shutil

from sec_edgar_downloader import Downloader

# url_rewrite and find_filename are project helpers defined elsewhere.
def do_download(path, ticker, form):
    dl = Downloader(path)
    dl.get(form, ticker, after="2015-01-01")
    path = os.path.join(path, "sec-edgar-filings", ticker, form)

    if not os.path.isdir(path):
        return

    pattern = re.compile("([0-9]+)")
    for filingDir in os.listdir(path):
        fullSubmissionFname = os.path.join(path, filingDir,
                                           "full-submission.txt")
        htmlFname = os.path.join(path, filingDir, "filing-details.html")
        if not os.path.isfile(fullSubmissionFname):
            print("skipping ", fullSubmissionFname)
            continue
        found = False
        with open(fullSubmissionFname) as f:
            for line in f:
                if line.startswith("FILED AS OF DATE"):
                    date = re.search(pattern, line).group(0)
                    found = True
                    break  # the header line appears once, near the top of the file
        if not found:
            print("skipping ", filingDir)
            continue
        url_rewrite(htmlFname, find_filename(path, ticker, form, date))
        shutil.rmtree(os.path.join(path, filingDir))
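# A hypothetical invocation of the helper above; url_rewrite and
# find_filename are assumed to be defined elsewhere in the module.
# do_download("/tmp/edgar", "MSFT", "10-K")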
Example #2
def download_forms(company_index, year: str):
    """Read the index file and download 10-Ks and 10-Qs filed after the given year."""
    # `mypath` is assumed to be defined at module level.
    dl = Downloader(mypath + '/data/')

    for i in company_index.index:
        dl.get("10-K", company_index.TICKER[i], after_date=year + '0101', include_amends=False)
        dl.get("10-Q", company_index.TICKER[i], after_date=year + '0101', include_amends=False)
Example #3
import os

from sec_edgar_downloader import Downloader


def download_latest_filing(file_type, ticker):
    dl = Downloader(os.getcwd())
    dl.get(file_type, ticker, amount=1)
    dl_path = os.path.join(os.getcwd(), 'sec-edgar-filings', ticker, file_type)

    # The first subdirectory of dl_path holds the single downloaded filing.
    inner_most_dir = [x[0] for x in os.walk(dl_path)][1]
    html_path = f'{inner_most_dir}/filing-details.html'
    txt_path = f'{inner_most_dir}/full-submission.txt'

    return (html_path, txt_path)
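# Typical use (hypothetical ticker) unpacks the returned pair:
# html_path, txt_path = download_latest_filing("10-K", "AAPL")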
Example #4
import shutil

from sec_edgar_downloader import Downloader


class FilingsDownloader:
    def __init__(self, downloadPath):
        self.downloadPath = downloadPath
        self.downloader = Downloader(self.downloadPath)

    # Download the `latest` most recent filings of the given type
    # (by default, the 10 latest 10-Ks) for the given company ticker
    def downloadFilings(self, ticker, filing_type="10-K", latest=10):
        self.downloader.get(filing_type, ticker, latest)

    def removefilings(self, path):
        shutil.rmtree(path, ignore_errors=True)
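# A short usage sketch for the wrapper class (illustrative path):
# downloader = FilingsDownloader("/tmp/edgar")
# downloader.downloadFilings("AAPL")  # ten latest 10-Ks by default
# downloader.removefilings("/tmp/edgar/sec-edgar-filings")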
Example #5
import os

import pandas as pd
from sec_edgar_downloader import Downloader


def get_files(CIK, after_date):
    save_path = '/Users/yijingtan/Downloads/d'
    dl = Downloader(save_path)
    dl.get('13F-HR', CIK, include_amends=True, after_date=after_date)
    # CIK = CIK.lstrip("0")
    files = os.listdir(os.path.join(save_path, 'sec_edgar_filings', CIK, '13F-HR'))
    # parse is a project helper that returns a DataFrame per filing.
    data = [parse(file, CIK) for file in sorted(files)]
    print(data)
    try:
        return pd.concat(data)
    except ValueError:
        print("All values are None")
        return None
Example #6
def get_files(filing, CIK, number):
    save_path = r"C:\Users\smore\Documents\13F"
    dl = Downloader(save_path)
    dl.get(filing, CIK, number)
    CIK = CIK.lstrip("0")
    print(CIK)
    files = os.listdir(
        f"C:/Users/smore/Documents/13F/sec_edgar_filings/{CIK}/{filing}")
    # parse is a project helper that returns a DataFrame per filing.
    data = [parse(file, CIK) for file in sorted(files)]
    try:
        result = pd.concat(data)  # concatenate once instead of twice
        print(result)
        return result
    except ValueError:
        print("All values are None")
        return None
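# A hypothetical call of the function above, assuming the parse helper exists:
# holdings = get_files("13F-HR", "0001067983", 4)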
Example #7
def Bulk_extraction(ticker, filetype, date, location):
    '''ticker = company ticker or list of tickers
       filetype = type of financial doc (8-K, 10-K)
       date = download all filings after this date; format: year-month-day
       location = local directory to store files, as a string'''
    dl = Downloader(str(location))
    for company in ticker:
        dl.get(str(filetype),
               str(company),
               after=str(date),
               download_details=True)
    return "Complete"


# Have to double check to see if all funds are updated for 2019-2020
#Bulk_extraction(holdings,'10-K','2019-01-01',dl)
Example #8
class FinancialStatementDatasetBuilder(tfds.core.GeneratorBasedBuilder):
    def __init__(self, args, log):

        self.args = args
        self.log = log

        self.VERSION = tfds.core.Version(self.args.dataset_version)
        self.MANUAL_DOWNLOAD_INSTRUCTIONS = "Dataset already downloaded manually"

        super().__init__()

        self.dl = Downloader(self.args.download_path)

        self.parser = FinancialReportParser()
        self.text_processor = get_text_processor(args.model)(args)

    def _info(self):

        return tfds.core.DatasetInfo(
            builder=self,
            description=("Financial statements data."),
            features=tfds.features.FeaturesDict({
                "documents":
                tfds.features.Tensor(dtype=tf.string,
                                     shape=(self.args.number_of_periods, )),
                "label":
                tfds.features.Tensor(dtype=tf.int64, shape=(2, ))
            }),
            supervised_keys=("documents", "label"),
            homepage="https://xxx",
            citation=r"""@article{my-awesome-dataset-2020,
                                  author = {Hurubaru, Sebastian},"}""",
        )

    def _split_generators(self, dl_manager):

        # Specify the splits
        return [
            tfds.core.SplitGenerator(
                name=tfds.Split.TRAIN,
                gen_kwargs={
                    "input_dir": os.path.join(self.args.input_dir, 'train')
                },
            ),
            tfds.core.SplitGenerator(
                name=tfds.Split.VALIDATION,
                gen_kwargs={
                    "input_dir": os.path.join(self.args.input_dir, 'dev')
                },
            ),
            tfds.core.SplitGenerator(
                name=tfds.Split.TEST,
                gen_kwargs={
                    "input_dir": os.path.join(self.args.input_dir, 'test')
                },
            )
        ]

    def _generate_examples(self, input_dir):

        # Get the content of the dataset file
        dataset = tf.data.experimental.make_csv_dataset(
            os.path.join(input_dir, self.args.company_files),
            batch_size=1,
            column_defaults=[tf.string, tf.string, tf.string, tf.int32],
            label_name='label',
            na_value="?",
            num_epochs=1,
            ignore_errors=True)

        for company_info, label in dataset:

            ciks = company_info['cik'].numpy()[0].decode('utf-8').split(';')
            ciks.sort(reverse=True, key=lambda cik: int(cik))
            end_date = company_info['end_date'].numpy()[0].decode('utf-8')

            try:

                documents = []

                # For multiple CIKs, take the last args.number_of_periods 10-K reports in descending order
                for cik in ciks:

                    cik_folder = os.path.join(
                        os.path.expanduser(self.args.download_path),
                        'sec_edgar_filings',
                        cik.strip().lstrip("0"), '10-K')

                    # Download only if the directory does not already exist
                    if not os.path.exists(cik_folder):
                        self.dl.get("10-K",
                                    cik,
                                    before_date=end_date,
                                    num_filings_to_download=self.args.number_of_periods)

                    for r, d, f in os.walk(cik_folder):
                        for file in f:
                            if '.txt' in file:
                                documents.append(
                                    tf.convert_to_tensor(
                                        self.parser.parse_10K_txt_file(
                                            os.path.join(r, file)),
                                        dtype=tf.string))

                if len(documents) < self.args.number_of_periods:
                    raise Exception(
                        f'Could not retrieve {self.args.number_of_periods} 10-K records for {cik}'
                    )

                yield cik, {
                    'documents':
                    tf.stack(documents)[:self.args.number_of_periods],
                    'label': [1, 0] if label.numpy()[0] == 0 else [0, 1]
                }

            except Exception as e:
                self.log.error(f'Exception occurred for cik {cik}: {e}')

    def _process_text_map_fn(self, text, label):
        processed_text, label = tf.py_function(self._process_text,
                                               inp=[text, label],
                                               Tout=(tf.float32, tf.int64))
        return processed_text, label

    def _process_text(self, text, label):

        # To allow debugging in the combined static eager mode
        # pydevd.settrace(suspend=True)

        # Process the text
        processed_text = self.text_processor.process_text(text)

        return (processed_text, label)
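# A minimal sketch of driving the builder above, assuming an `args`
# namespace exposing the fields the class reads (dataset_version,
# download_path, input_dir, company_files, number_of_periods, model)
# and a standard logger; download_and_prepare/as_dataset are the usual
# tfds entry points.
#
# import argparse, logging
# args = argparse.Namespace(dataset_version="1.0.0", download_path="~/edgar",
#                           input_dir="data", company_files="companies.csv",
#                           number_of_periods=3, model="baseline")
# builder = FinancialStatementDatasetBuilder(args, logging.getLogger(__name__))
# builder.download_and_prepare()
# train_ds = builder.as_dataset(split="train")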
Example #9
import pandas as pd
from sec_edgar_downloader import Downloader

x = pd.read_csv(
    r"/home/mohit/Dropbox/ra_tasks_ms/bailout_firms/Regular Scheduled SEC scraping/companies.csv"
)
x = x['Column_Name']  # 'Column_Name' stands in for the column holding the tickers
dl = Downloader(
    r"/home/mohit/Dropbox/ra_tasks_ms/bailout_firms/Regular Scheduled SEC scraping"
)

for i in x:
    dl.get("8-K", i, after_date="20200328")
Example #10
from sec_edgar_downloader import Downloader

dl = Downloader('C:\\Users\\willi\\Documents\\Company_filings\\BRK-A\\13F-HR')

# Download using the CIK of Berkshire Hathaway (BRK-A)
dl.get("13F-HR", "0001067983")
Example #11
    total = total + 1

data_file.seek(0)
curr = 1
positionStr = 'Current company: ' + str(curr).rjust(
    5) + '     Total company: ' + str(total).rjust(6)
print(positionStr)
time1 = time.time()
for ticker in csv_reader:
    if ticker[0] == "Ticker":
        continue
    print(ticker[0])
    try:
        if "1" in ft:
            dl.get(filing_type[0],
                   ticker[0],
                   after_date=date1,
                   before_date=date2)
    except Exception:
        try:
            print("Internet error occurred - retrying")
            dl.get(filing_type[0],
                   ticker[0],
                   after_date=date1,
                   before_date=date2)
        except Exception:
            print("Couldn't resolve, moving to next ticker")
    try:

        if "2" in ft:
            dl.get(filing_type[1],
                   ticker[0],
Example #12
import numpy as np
import pandas as pd
from sec_edgar_downloader import Downloader

df = pd.read_csv('data/sp500_list.csv')
drop_column = df.columns[0]
df.drop(columns=drop_column, inplace=True)
df.CIK = df.CIK.astype(str)
df['CIK'] = df['CIK'].str.zfill(10)
dl = Downloader('test_data/')
for i in df.index:
    print(f"{df.index[i]}: Pulling 8Ks for {df.COMPANY[i]}")
    dl.get("10-K", df.CIK[i], after_date="20190101", include_amends=False)
    dl.get("10-Q", df.CIK[i], after_date="20190101", include_amends=False)
    print(f"{df.index[i]}: {df.COMPANY[i]} done.")
Example #13
from sec_edgar_downloader import Downloader

path = r'E:\stockdata3\Filings'  # raw string so the backslashes are literal
dl = Downloader(path)
aapl = dl.get('10-K', 'aapl', 12)  # get returns the number of filings downloaded
Example #14
from sec_edgar_downloader import Downloader

# Initialize a downloader instance.
# If no argument is passed to the constructor, the package
# will attempt to locate the user's downloads folder.
dl = Downloader("/j/tmp32/edgar")

# Get the latest 10-K filing for Ubiquiti
dl.get("10-K", "UI", 1)

# Get the latest 10-Q filing for Ubiquiti
dl.get("10-Q", "UI", 1)
Example #15
from sec_edgar_downloader import Downloader

with open('companies.txt', 'r') as f:
    tickers = f.read().splitlines()

dl = Downloader(".")

after_date = "20090101"
before_date = "20200427"

for ticker in tickers:
    dl.get("10-K", ticker, after_date=after_date, before_date=before_date)
Example #16
import pandas as pd
from sec_edgar_downloader import Downloader

cik = pd.read_csv("FINAL_COMPANY_LIST_SEC_INFO.csv")[['cik']]
cik = cik.values.tolist()
print(cik)


file_location = "/pylon5/tr5pi7p/suli2020/uspto/rebranding/10K"
cik = cik[:10]
dl = Downloader(file_location)


num = 0
for lst in cik:
    for c in lst:
        print('Started: ' + c)
        dl.get("10-K", c, 30)
        num += 1
        print(f'Downloaded: {c}, {num} in total')

Example #17
def getAll10k(company):
    dl = Downloader()
    # Get the 10 most recent 10-K filings for the given company
    dl.get("10-K", company, amount=10)
Example #18
import os
import re

import nltk
import pandas as pd
from bs4 import BeautifulSoup
from sec_edgar_downloader import Downloader

# read_file and convert_ticker_to_cik are project helpers defined elsewhere.
def download_and_parse(actual_stock, ciks, out_dict):
    """
    This function is the meat and potatoes of downloading the SEC 10-K filings.
    It uses the sec_edgar_downloader package to download the 10-K filing.
    Then it uses code from https://gist.github.com/anshoomehra/ead8925ea291e233a5aa2dcaa2dc61b2 to parse the 10-K filing for Item 1A.
    The code separates Item 1A into sentences and outputs it to a dictionary associated with the CIK value.
    :param actual_stock: the stock ticker
    :param ciks: CIK - stock ticker dictionary/crosswalk
    :param out_dict: a dictionary to store the 10-K Item 1A sentences
    :return: nothing, but it constantly updates/adds to the dictionary
    """
    if actual_stock[0] in [
            'BF.B', 'BF-B', 'bf-b', 'bf.b', 'HES', 'hes', 'hpe', 'HPE'
    ]:
        print("This stock has no CIK... issue there so I am skipping")
        return

    cik = convert_ticker_to_cik(actual_stock, ciks)
    cik = cik.zfill(10)

    dl = Downloader()
    #stock_ticker = "0001067983"
    dl.get("10-K", cik, after="2015-01-01", download_details=False)
    count = 0
    for root, dirs, files in os.walk(
            "./sec-edgar-filings/{}/10-K/".format(cik)):

        # search through each years' 10-k filing
        for file in files:
            # find the txt document of the 10-K filing
            if file == 'full-submission.txt':

                try:
                    year = re.findall(r'\-[0-9][0-9]\-', root)
                    year = year[len(year) - 1][1:-1]

                    # certain stocks have issues for certain years. I will include code to exclude them here
                    # if year == 21 and stock_ticker in ("'APA'", "'ADM'"):
                    #     print("Skipping year {} for ticker {} due to issues...".format(year, stock_ticker))

                    # read the file
                    filing_text = read_file(root + '/' + file)

                    # this code comes from https://gist.github.com/anshoomehra/ead8925ea291e233a5aa2dcaa2dc61b2

                    doc_start_pattern = re.compile(r'<DOCUMENT>')
                    doc_end_pattern = re.compile(r'</DOCUMENT>')
                    # Regex to find the <TYPE> tag preceding any characters, terminating at a new line
                    type_pattern = re.compile(r'<TYPE>[^\n]+')

                    # Create 3 lists with the span indices for each regex

                    ### There are many <DOCUMENT> tags in this text file, one per exhibit (10-K, EX-10.17, etc.)
                    ### The first filter gives us each document's start tag <end> and end tag's <start>;
                    ### we use these later to grab the content between the tags
                    doc_start_is = [
                        x.end()
                        for x in doc_start_pattern.finditer(filing_text)
                    ]
                    doc_end_is = [
                        x.start()
                        for x in doc_end_pattern.finditer(filing_text)
                    ]

                    ### The type filter looks for <TYPE> followed by any characters up to the next
                    ### newline, which yields the section name that follows <TYPE>, e.g. '10-K'.
                    ### The line below then strips the '<TYPE>' prefix to keep just the section names
                    doc_types = [
                        x[len('<TYPE>'):]
                        for x in type_pattern.findall(filing_text)
                    ]

                    document = {}
                    # Create a loop to go through each section type and save only the 10-K section in the dictionary
                    for doc_type, doc_start, doc_end in zip(
                            doc_types, doc_start_is, doc_end_is):
                        if doc_type == '10-K':
                            document[doc_type] = filing_text[doc_start:doc_end]

                    regex = re.compile(
                        r'(>(Item|ITEM)(\s|&#160;|&nbsp;|&#xa0;)(1A|1B)\.{0,1})|(ITEM\s(1A|1B))|(<B>Item</B><B></B><B>&nbsp;1A</B>)|(&nbsp;Item&nbsp;1B.)|(Item<font style="font-family:Times New Roman, Times, serif;font-size:10pt;">&nbsp;1B)|(Item<font style="font-family: Times New Roman, Times, serif; font-size: 10pt;">&nbsp;1B)'
                    )

                    matches = regex.finditer(document['10-K'])

                    test_df = pd.DataFrame([
                        (x.group(), x.span()[0], x.span()[1]) for x in matches
                    ])

                    if len(test_df) == 0:
                        print("error... didn't pick up anything")
                        break

                    test_df.columns = ['item', 'start', 'end']
                    test_df['item'] = test_df.item.str.lower()

                    test_df.replace(
                        '<font style="font-family:times new roman, times, serif;font-size:10pt;">',
                        ' ',
                        regex=True,
                        inplace=True)
                    test_df.replace(
                        '<font style="font-family: times new roman, times, serif; font-size: 10pt;">',
                        ' ',
                        regex=True,
                        inplace=True)
                    test_df.replace('&#160;', ' ', regex=True, inplace=True)
                    test_df.replace('&nbsp;', ' ', regex=True, inplace=True)
                    test_df.replace('&#xa0;', ' ', regex=True, inplace=True)
                    test_df.replace(' ', '', regex=True, inplace=True)
                    test_df.replace('\.', '', regex=True, inplace=True)
                    test_df.replace('>', '', regex=True, inplace=True)
                    test_df.replace('<b', '', regex=True, inplace=True)
                    test_df.replace('</b', '', regex=True, inplace=True)

                    pos_dat = test_df.sort_values(
                        'start',
                        ascending=True).drop_duplicates(subset=['item'],
                                                        keep='last')

                    pos_dat.set_index('item', inplace=True)

                    # Check conditionals here
                    if 'item1a' in pos_dat.index and 'item1b' in pos_dat.index:

                        item_1a_raw = document['10-K'][
                            pos_dat['start'].loc['item1a']:pos_dat['start'].
                            loc['item1b']]
                        item_1a_content = BeautifulSoup(item_1a_raw, 'lxml')

                        test_df["text"] = item_1a_content.get_text()
                        test_df.replace(
                            '([0-9]|[0-9][0-9])(\s{0,3})Table of Contents',
                            ' ',
                            regex=True,
                            inplace=True)
                        test_df.replace('Table of Contents',
                                        ' ',
                                        regex=True,
                                        inplace=True)
                        test_df.replace('\s\s', ' ', regex=True, inplace=True)
                        test_df.replace('\\u200b',
                                        ' ',
                                        regex=True,
                                        inplace=True)
                        test_df.replace('\\n[0-9]',
                                        ' ',
                                        regex=True,
                                        inplace=True)
                        test_df.replace('[0-9]\\n',
                                        ' ',
                                        regex=True,
                                        inplace=True)
                        test_df.replace('\\xa0', ' ', regex=True, inplace=True)
                        test_df.replace('\\x92', ' ', regex=True, inplace=True)
                        test_df.replace('\\x93', ' ', regex=True, inplace=True)
                        test_df.replace('\\x94', ' ', regex=True, inplace=True)
                        test_df.replace('\\x95', ' ', regex=True, inplace=True)
                        test_df.replace('\\n', ' ', regex=True, inplace=True)
                        test_df.replace('\n', ' ', regex=False, inplace=True)

                        # output the text to the dict

                        sentences = nltk.sent_tokenize(str(test_df['text'][0]))

                        if count == 0:
                            output_frame = pd.DataFrame(
                                [[year, sentences]], columns=["year", "text"])
                        else:
                            # DataFrame.append was removed in pandas 2.0; use pd.concat
                            output_frame = pd.concat(
                                [output_frame,
                                 pd.DataFrame([[year, sentences]],
                                              columns=["year", "text"])],
                                ignore_index=True)

                        out_dict[cik] = output_frame
                        print(
                            "finished processing ticker {} ({}) and added to dictionary for year {}"
                            .format(cik, actual_stock[0], year))
                        print(75 * '-')  # separator line
                        count += 1

                    else:
                        regex = re.compile(
                            r'(>(Item|ITEM)(\s|&#160;|&nbsp;|&#xa0;)(1A|2)\.{0,1})|(ITEM\s(1A|2))|(<B>Item</B><B></B><B>&nbsp;1A</B>)|(&nbsp;Item&nbsp;2.)'
                        )

                        matches = regex.finditer(document['10-K'])

                        test_df = pd.DataFrame([(x.group(), x.span()[0],
                                                 x.span()[1])
                                                for x in matches])

                        if len(test_df) == 0:
                            print("error... didn't pick up anything")
                            break

                        test_df.columns = ['item', 'start', 'end']
                        test_df['item'] = test_df.item.str.lower()

                        test_df.replace('&#160;',
                                        ' ',
                                        regex=True,
                                        inplace=True)
                        test_df.replace('&nbsp;',
                                        ' ',
                                        regex=True,
                                        inplace=True)
                        test_df.replace('&#xa0;',
                                        ' ',
                                        regex=True,
                                        inplace=True)
                        test_df.replace(' ', '', regex=True, inplace=True)
                        test_df.replace('\.', '', regex=True, inplace=True)
                        test_df.replace('>', '', regex=True, inplace=True)
                        test_df.replace('<b', '', regex=True, inplace=True)
                        test_df.replace('</b', '', regex=True, inplace=True)

                        pos_dat = test_df.sort_values(
                            'start',
                            ascending=True).drop_duplicates(subset=['item'],
                                                            keep='last')

                        pos_dat.set_index('item', inplace=True)

                        item_1a_raw = document['10-K'][
                            pos_dat['start'].loc['item1a']:pos_dat['start'].
                            loc['item2']]
                        item_1a_content = BeautifulSoup(item_1a_raw, 'lxml')

                        test_df["text"] = item_1a_content.get_text()
                        test_df.replace(
                            '([0-9]|[0-9][0-9])(\s{0,3})Table of Contents',
                            ' ',
                            regex=True,
                            inplace=True)
                        test_df.replace('Table of Contents',
                                        ' ',
                                        regex=True,
                                        inplace=True)
                        test_df.replace('\s\s', ' ', regex=True, inplace=True)
                        test_df.replace('\\u200b',
                                        ' ',
                                        regex=True,
                                        inplace=True)
                        test_df.replace('\\n[0-9]',
                                        ' ',
                                        regex=True,
                                        inplace=True)
                        test_df.replace('[0-9]\\n',
                                        ' ',
                                        regex=True,
                                        inplace=True)
                        test_df.replace('\\xa0', ' ', regex=True, inplace=True)
                        test_df.replace('\\x92', ' ', regex=True, inplace=True)
                        test_df.replace('\\x93', ' ', regex=True, inplace=True)
                        test_df.replace('\\x94', ' ', regex=True, inplace=True)
                        test_df.replace('\\x95', ' ', regex=True, inplace=True)
                        test_df.replace('\\n', ' ', regex=True, inplace=True)
                        test_df.replace('\n', ' ', regex=False, inplace=True)

                        # output the text to the dict

                        sentences = nltk.sent_tokenize(str(test_df['text'][0]))

                        if count == 0:
                            output_frame = pd.DataFrame(
                                [[year, sentences]], columns=["year", "text"])
                        else:
                            # DataFrame.append was removed in pandas 2.0; use pd.concat
                            output_frame = pd.concat(
                                [output_frame,
                                 pd.DataFrame([[year, sentences]],
                                              columns=["year", "text"])],
                                ignore_index=True)

                        out_dict[cik] = output_frame
                        print(
                            "finished processing ticker {} ({}) and added to dictionary for year {}"
                            .format(cik, actual_stock[0], year))
                        print(75 * '-')  # separator line
                        count += 1

                except Exception as e:
                    print(f"error occurred: {e}")
Example #19
# -*- coding: utf-8 -*-
"""
Created on Sun Mar  1 12:30:27 2020

@author: Stephen Sigrist
"""
import pandas as pd
from sec_edgar_downloader import Downloader
tickers_data = pd.read_csv(
    "..."
    "...TriggeredEvents\\input files\\equity tickers.csv")
tickers_data.head()
#del tickers_data

dl = Downloader("...TriggeredEvents\\downloaded\\SEC")
for i in range(len(tickers_data)):
    print("Begin Downloading " + tickers_data['ticker'][i])
    for filing_type in dl.supported_filings:
        try:
            dl.get(filing_type, tickers_data['ticker'][i], 10)
        except Exception:
            print("An error occurred in the downloading process")
    print("Finished Downloading " + tickers_data['ticker'][i])