def do_download(path, ticker, form):
    dl = Downloader(path)
    dl.get(form, ticker, after="2015-01-01")
    path = os.path.join(path, "sec-edgar-filings", ticker, form)

    if not os.path.isdir(path):
        return

    # The filing date is the first run of digits on the "FILED AS OF DATE"
    # line of the full submission.
    pattern = re.compile("([0-9]+)")
    for filingDir in os.listdir(path):
        fullSubmissionFname = os.path.join(path, filingDir,
                                           "full-submission.txt")
        htmlFname = os.path.join(path, filingDir, "filing-details.html")
        if not os.path.isfile(fullSubmissionFname):
            print("skipping", fullSubmissionFname)
            continue
        found = False
        with open(fullSubmissionFname) as f:
            for line in f:
                if line.startswith("FILED AS OF DATE"):
                    date = re.search(pattern, line).group(0)
                    found = True
        if not found:
            print("skipping ", filingDir)
            continue
        url_rewrite(htmlFname, find_filename(path, ticker, form, date))
        shutil.rmtree(os.path.join(path, filingDir))
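A minimal usage sketch for the function above; the target folder, ticker, and form type are assumptions, and url_rewrite and find_filename must be defined as in the surrounding project:

# Hypothetical call: fetch all AAPL 10-K filings since 2015 into ./filings
# and rewrite each filing's detail page to a date-stamped name.
do_download("./filings", "AAPL", "10-K")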
Example #2
def download_forms(company_index, year: str):
    """Reads the index file and downloads 10-Ks and 10-Qs."""
    dl = Downloader(mypath + '/data/')

    for i in company_index.index:
        dl.get("10-K", company_index.TICKER[i], after_date=year + '0101', include_amends=False)
        dl.get("10-Q", company_index.TICKER[i], after_date=year + '0101', include_amends=False)
Example #3
def download(tickers):
    path = get_filings_folder()
    dl = Downloader(path)
    n = len(tickers)
    for i in range(n):
        print_progress(i, n)
        if not os.path.exists('../Filings/sec_edgar_filings/' +
                              tickers[i]):
            dl.get_10k_filings(tickers[i])
Example #4
def download_latest_filing(file_type, ticker):
    dl = Downloader(os.getcwd())
    dl.get(file_type, ticker, amount=1)
    dl_path = os.path.join(os.getcwd(), 'sec-edgar-filings', ticker,
                           file_type)

    # os.walk yields dl_path itself first, then its subdirectories;
    # index 1 is the folder holding the downloaded filing.
    inner_most_dir = [x[0] for x in os.walk(dl_path)][1]
    html_path = f'{inner_most_dir}/filing-details.html'
    txt_path = f'{inner_most_dir}/full-submission.txt'

    return (html_path, txt_path)
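A usage sketch, assuming the ticker has at least one filing of the requested type on EDGAR:

html_path, txt_path = download_latest_filing("10-K", "AAPL")
print(html_path)  # .../sec-edgar-filings/AAPL/10-K/<accession number>/filing-details.html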
Example #5
class FilingsDownloader:
    def __init__(self, downloadPath):
        self.downloadPath = downloadPath
        self.downloader = Downloader(self.downloadPath)

    # Download 10 latest 10-K filings for the given company ticker
    def downloadFilings(self, ticker, filing_type="10-K", latest=10):
        self.downloader.get(filing_type, ticker, latest)

    def removefilings(self, path):
        shutil.rmtree(path, ignore_errors=True)
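A minimal sketch of driving the class above; the paths and ticker are assumptions, and the filings folder name depends on the library version:

downloader = FilingsDownloader("./filings")
downloader.downloadFilings("MSFT", filing_type="10-K", latest=5)
# Remove the downloaded filings once they have been processed.
downloader.removefilings("./filings/sec-edgar-filings/MSFT")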
Example #6
def get_files(CIK, x):
    save_path = '/Users/yijingtan/Downloads/d'
    dl = Downloader(save_path)
    dl.get('13F-HR', CIK, include_amends=True, after_date=x)
    # CIK = CIK.lstrip("0")
    files = os.listdir('/Users/yijingtan/Downloads/d/sec_edgar_filings/' +
                       CIK + '/13F-HR')
    data = [parse(file, CIK) for file in sorted(files)]
    print(data)
    try:
        return pd.concat(data)
    except ValueError:
        print("All Values are None")
        return None
Example #8
def pull_filings(corporations):
    print('Pulling filings from SEC...')
    dl = Downloader("./sec/filings")
    added = set()

    for corp in corporations:
        # Pull each non-main corporation's filings exactly once.
        if corp['main'] != True and corp['cik'] not in added:
            dl.get_all_available_filings(corp['cik'])
            print(f'Pulled: {corp["name"]}')
            added.add(corp['cik'])
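A usage sketch with a hypothetical corporations list; each entry needs 'cik', 'name', and 'main' keys:

corporations = [
    {"cik": "0000320193", "name": "Apple Inc.", "main": True},
    {"cik": "0000789019", "name": "Microsoft Corp.", "main": False},
]
pull_filings(corporations)  # only non-main entries not yet pulled are fetched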
Example #9
def get_files(filing, CIK, number):
    save_path = r"C:\Users\smore\Documents\13F"
    dl = Downloader(save_path)
    dl.get(filing, CIK, number)
    CIK = CIK.lstrip("0")
    print(CIK)
    files = os.listdir(
        f"C:/Users/smore/Documents/13F/sec_edgar_filings/{CIK}/{filing}")
    data = [parse(file, CIK) for file in sorted(files)]
    try:
        result = pd.concat(data)
        print(result)
        return result
    except ValueError:
        print("All Values are None")
        return None
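A usage sketch, assuming parse is defined as in the surrounding project:

holdings = get_files("13F-HR", "0001067983", 5)
if holdings is not None:
    print(holdings.head())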
Example #10
def Bulk_extraction(ticker, filetype, date, location):
    '''ticker = company ticker or list of tickers
       filetype = type of financial doc (8-K, 10-K)
       date = download all filings of this type after this date; format: year-month-day
       location = local directory in which to store files, as a string'''
    dl = Downloader(str(location))
    for company in ticker:
        dl.get(str(filetype),
               str(company),
               after=str(date),
               download_details=True)
    return "Complete"


# Have to double check to see if all funds are updated for 2019-2020
#Bulk_extraction(holdings,'10-K','2019-01-01',dl)
Example #11
def test_constructor_relative_path():
    dl = Downloader("./Downloads")
    expected = Path.cwd().joinpath("Downloads")
    assert dl.download_folder == expected
Example #12
def test_constructor_no_params():
    dl = Downloader()
    expected = Path.home().joinpath("Downloads")
    assert dl.download_folder == expected
Example #13
def test_constructor_blank_path():
    dl = Downloader("")
    # pathlib treats blank paths as the current working directory
    expected = Path.cwd()
    assert dl.download_folder == expected
Example #14
def downloader(tmpdir):
    tmp_dir = Path(tmpdir.mkdir("Downloads"))
    dl = Downloader(tmp_dir)
    yield dl, tmp_dir
    shutil.rmtree(tmp_dir)
Example #15
def downloader(tmp_path):
    tmp_dir = tmp_path / "Downloads"
    tmp_dir.mkdir()
    dl = Downloader(tmp_dir)
    yield dl, tmp_dir
    shutil.rmtree(tmp_dir)
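A hedged example of consuming the fixture above in a pytest test, assuming a library version whose get() accepts an amount keyword and writes under sec-edgar-filings:

def test_get_creates_filing_folder(downloader):
    dl, tmp_dir = downloader
    dl.get("8-K", "AAPL", amount=1)  # hits EDGAR over the network
    assert (tmp_dir / "sec-edgar-filings" / "AAPL" / "8-K").exists()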
Example #16
# import required packages
import os
import requests
import pandas as pd
import time
from sec_edgar_downloader import Downloader

# Downloader setup and the NYSE/NASDAQ ticker universe.
dl = Downloader('D:/Thesis_data')
tickers = pd.read_excel('D:\\Thesis_data\\cik_ticker.xlsx')
nyse_nas = pd.concat(
    [tickers[tickers['Exchange'] == 'NYSE'],
     tickers[tickers['Exchange'] == 'NASDAQ']],
    axis=0)

n_cik = nyse_nas.shape[0]
error_list = []
print('Number of Stocks: ' + str(n_cik))
n = 0


# Remove already-downloaded CIKs from the list
def diff(first, second):
    n = len(os.listdir('D:\\Thesis_data\\sec_edgar_filings'))
    second = set(second)
    return [item for item in first if str(item) not in second], n


download_list, n = diff(list(nyse_nas['CIK']),
                        os.listdir('D:\\Thesis_data\\sec_edgar_filings'))
Example #17
# -*- coding: utf-8 -*-
"""
Created on Sun Mar  1 12:30:27 2020

@author: Stephen Sigrist
"""
import pandas as pd
from sec_edgar_downloader import Downloader
tickers_data = pd.read_csv(
    "...TriggeredEvents\\input files\\equity tickers.csv")
tickers_data.head()
#del tickers_data

dl = Downloader("...TriggeredEvents\\downloaded\\SEC")
for i in range(len(tickers_data)):
    print("Begin Downloading " + tickers_data['ticker'][i])
    for filing_type in dl.supported_filings:
        try:
            dl.get(filing_type, tickers_data['ticker'][i], 10)
        except Exception:
            print("An Error Occurred in Downloading Process")
    print("Finished Downloading " + tickers_data['ticker'][i])
Example #18
    def form_valid(self, form):
        #get user and validate form
        form.instance.user = self.request.user
        super(Createlisting, self).form_valid(form)

        #get Company CIK
        tik = form.data['ticker']
        URL = 'https://www.sec.gov/cgi-bin/browse-edgar?CIK={}&Find=Search&owner=exclude&action=getcompany'
        CIK_RE = re.compile(r'.*CIK=(\d{10}).*')
        f = requests.get(URL.format(tik), stream=True)
        results = CIK_RE.findall(f.text)
        if not results:
            raise ValueError(f'Could not resolve a CIK for ticker {tik}')
        cik = results[0]

        cmp_name = self.edgar.getCompanyNameByCik(cik)

        #create object in database
        company = Company(ticker=tik,
                          cik=cik,
                          name=cmp_name,
                          user=self.request.user)
        company.save()

        # delete empty database
        queryset = Company.objects.filter(name='').delete()

        # Creating filename and url structure
        BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        path = os.path.join(BASE_DIR, 'static')
        out_path = path + "/sec_edgar_filings/" + tik + "/10-K/"

        # Create the downloader with the download path. The full-submission
        # txt is too large to load without XML iteration, so an alternative
        # approach is worth looking into here.
        dl = Downloader(path)

        # Download only the latest filing by passing 1.
        dl.get_10k_filings(tik, 1)

        # Strip the dashes and '.txt' to turn the filename into the
        # accession-number path of the filing's index.json.
        file_name = [
            f for f in os.listdir(out_path)
            if os.path.isfile(os.path.join(out_path, f))
        ]
        switched_filename = file_name[0]
        switched_filename = switched_filename.replace('-', '').replace(
            '.txt', '/index.json')

        # Base URL configuration; this could be done more cleanly.
        bare_url = r"https://www.sec.gov/Archives/edgar/data/"
        base_url = r"https://www.sec.gov"
        documents_url = bare_url + str(results[0]) + "/" + switched_filename

        # Retrieve the files and get the summary.
        content = requests.get(documents_url).json()

        for file in content['directory']['item']:

            # Grab the filing summary and create a new url leading to the file so we can download it.
            if file['name'] == 'FilingSummary.xml':

                xml_summary = base_url + content['directory'][
                    'name'] + "/" + file['name']

                #print info
                print('-' * 50)
                print('File Name: ' + file['name'])
                print('File Path: ' + xml_summary)

        # define a new base url that represents the filing folder. This will come in handy when we need to download the reports.
        base_url = xml_summary.replace('FilingSummary.xml', '')

        # request and parse the content
        content = requests.get(xml_summary).content
        soup = BeautifulSoup(content, 'lxml')

        # find the 'myreports' tag because this contains all the individual reports submitted.
        reports = soup.find('myreports')

        # A master list to store all the individual components of the report.
        master_reports = []
        # Initialized up front so a missing cash-flow report cannot leave it undefined below.
        redirect_url_to_statement = None

        # Loop through each report in the 'myreports' tag, skipping the last one, which produces an error.
        for report in reports.find_all('report')[:-1]:

            # create a dictionary to store all the different parts we need.
            report_dict = {}
            report_dict['name_short'] = report.shortname.text
            report_dict['name_long'] = report.longname.text
            report_dict['position'] = report.position.text
            report_dict['menu_category'] = report.menucategory.text
            report_dict['url'] = base_url + report.htmlfilename.text

            # append the dictionary to the master list.
            master_reports.append(report_dict)
            if report_dict[
                    'name_short'] == 'Consolidated Statements of Cash Flows':

                # print the info.
                print('-' * 50)
                print(base_url + report.htmlfilename.text)
                print(report.longname.text)
                print(report.shortname.text)
                print(report.menucategory.text)
                print(report.position.text)

                # Hold on to this URL here; a bug was writing a different file path into the database.
                redirect_url_to_statement = base_url + report.htmlfilename.text

        # in case of multiple statements
        statements_url = []

        for report_dict in master_reports:

            # define the statements we want to look for.
            item1 = r"Consolidated Statements of Cash Flows"

            # store them in a list.
            report_list = [item1]

            # if the short name can be found in the report list.
            if report_dict['name_short'] in report_list:

                # print some info and store it in the statements url.
                print('-' * 50)
                print(report_dict['name_short'])
                print(report_dict['url'])

                statements_url.append(report_dict['url'])

        statement = Statement(year=2019,
                              type="CONSOLIDATED STATEMENTS OF CASH FLOWS",
                              url=redirect_url_to_statement,
                              company=company)
        statement.save()

        statements_data = []

        # loop through each statement url
        for statementUrl in statements_url:

            # define a dictionary that will store the different parts of the statement.
            statement_data = {}
            statement_data['headers'] = []

            statement_data['sections'] = []

            statement_data['data'] = []

            # request the statement file content
            content = requests.get(statementUrl).content
            report_soup = BeautifulSoup(content, 'html')

            # find all the rows, figure out what type of row it is, parse the elements, and store in the statement file list.
            for index, row in enumerate(report_soup.table.find_all('tr')):

                # first let's get all the elements.
                cols = row.find_all('td')

                # if it's a regular row and not a section or a table header
                if (len(row.find_all('th')) == 0
                        and len(row.find_all('strong')) == 0):
                    reg_row = [ele.text.strip() for ele in cols]
                    statement_data['data'].append(reg_row)

                # if it's a regular row and a section but not a table header
                elif (len(row.find_all('th')) == 0
                      and len(row.find_all('strong')) != 0):
                    sec_row = cols[0].text.strip()
                    statement_data['sections'].append(sec_row)

                # finally if it's not any of those it must be a header
                elif (len(row.find_all('th')) != 0):
                    hed_row = [ele.text.strip() for ele in row.find_all('th')]
                    statement_data['headers'].append(hed_row)

                else:
                    print('We encountered an error.')

            # Write the parsed statement into the database: create each
            # header, then iterate over all of the data values beneath it.
            print("Saving Headers...")
            for i in range(len(statement_data['headers'][1])):
                print(statement_data['headers'][1][i])
                statementHeader = Statment_element_headers(
                    field=statement_data['headers'][1][i], statement=statement)
                statementHeader.save()
                print("DATAAAAAAAA   ")
                print("Saving Data Element...")
                for j in statement_data['data']:
                    print(j)
                    print(j[i + 1])
                    k = j[i + 1]
                    # Normalize the value: strip '$' and spaces, drop
                    # thousands-separator commas, and turn accounting
                    # parentheses into a minus sign.
                    k = k.replace('$', '').replace(' ', '')
                    k = k.replace(',', '').replace('(', '-').replace(')', '')
                    try:
                        k = float(k)
                    except ValueError:
                        pass  # leave non-numeric cells as-is
                    print(k)
                    statementData = Statement_element_data(
                        key=j[0],
                        value=k,
                        statement=statement,
                        company=company,
                        header=statementHeader)
                    statementData.save()

                    print(j)

                    print("Saving Data Done for Element")
            print("Saving Headers Done")

            print("SECTIONSSSS   ")
            print("Saving Headers ...")
            for i in statement_data['sections']:
                print(i)
                statementSections = Statement_element_section(
                    fieldName=i, statement=statement)
                statementSections.save()
            print("Saving Sections Done...")

            # Append it to the master list for future analysis with pandas and NLP.
            statements_data.append(statement_data)

        # print(statements_data)

        return redirect('home')
Example #19
    def get_links(self):
        dl = Downloader(desktop_path + "/13f filings")
        # Get the past 68 13F filings for each company.
        # for c in CIKs.values():
        dl.get_13f_hr_filings('0000807985', 68)
Example #20
def test_invalid_save_path_constructor():
    test_path = str(Path.home().joinpath("Downloads", "invalid_dir"))
    with pytest.raises(IOError) as excinfo:
        Downloader(test_path)
    expected_msg = f"The folder for saving company filings ({test_path}) does not exist."
    assert expected_msg in str(excinfo.value)
Example #21
import pandas as pd
from sec_edgar_downloader import Downloader

x = pd.read_csv(
    r"/home/mohit/Dropbox/ra_tasks_ms/bailout_firms/Regular Scheduled SEC scraping/companies.csv"
)
x = x['Column_Name']
dl = Downloader(
    r"/home/mohit/Dropbox/ra_tasks_ms/bailout_firms/Regular Scheduled SEC scraping"
)

for i in x:
    dl.get("8-K", i, after_date="20200328")
from sec_edgar_downloader import Downloader

path = r'E:\stockdata3\Filings'
dl = Downloader(path)
aapl = dl.get('10-K', 'aapl', 12)
Example #23
# Edgar SEC 10k extracting
import pandas as pd

from sec_edgar_downloader import Downloader

# Import the IBB holdings CSV.
file = 'IBB-holdings.csv'
IBB = pd.read_csv(file, skiprows=13)
holdings = list(IBB["Symbol"])

# Set the download location.
dl = Downloader('/Users/nick/Documents/cs506/project')


def Bulk_extraction(ticker, filetype, date, location):
    '''ticker = company ticker or list of tickers
       filetype = type of financial doc (8-K, 10-K)
       date = download all filings of this type after this date; format: year-month-day
       location = local directory in which to store files, as a string'''
    dl = Downloader(str(location))
    for company in ticker:
        dl.get(str(filetype),
               str(company),
               after=str(date),
               download_details=True)
    return "Complete"


# Have to double check to see if all funds are updated for 2019-2020
#Bulk_extraction(holdings,'10-K','2019-01-01',dl)
Example #24
import numpy as np
import pandas as pd
from sec_edgar_downloader import Downloader

df = pd.read_csv('data/sp500_list.csv')
drop_column = df.columns[0]
df.drop(columns=drop_column, inplace=True)
df.CIK = df.CIK.astype(str)
df['CIK'] = df['CIK'].str.zfill(10)
dl = Downloader('test_data/')
for i in df.index:
    print(f"{df.index[i]}: Pulling 8Ks for {df.COMPANY[i]}")
    dl.get("10-K", df.CIK[i], after_date="20190101", include_amends=False)
    dl.get("10-Q", df.CIK[i], after_date="20190101", include_amends=False)
    print(f"{df.index[i]}: {df.COMPANY[i]} done.")
def test_constructor_user_path():
    dl = Downloader("~/Downloads")
    expected = Path.home().joinpath("Downloads")
    assert dl.download_folder == expected
Example #26
class FinancialStatementDatasetBuilder(tfds.core.GeneratorBasedBuilder):
    def __init__(self, args, log):

        self.args = args
        self.log = log

        self.VERSION = tfds.core.Version(self.args.dataset_version)
        self.MANUAL_DOWNLOAD_INSTRUCTIONS = "Dataset already downloaded manually"

        super(tfds.core.GeneratorBasedBuilder, self).__init__()

        self.dl = Downloader(self.args.download_path)

        self.parser = FinancialReportParser()
        self.text_processor = get_text_processor(args.model)(args)

    def _info(self):

        return tfds.core.DatasetInfo(
            builder=self,
            description=("Financial statements data."),
            features=tfds.features.FeaturesDict({
                "documents":
                tfds.features.Tensor(dtype=tf.string,
                                     shape=(self.args.number_of_periods, )),
                "label":
                tfds.features.Tensor(dtype=tf.int64, shape=(2, ))
            }),
            supervised_keys=("documents", "label"),
            homepage="https://xxx",
            citation=r"""@article{my-awesome-dataset-2020,
                                  author = {Hurubaru, Sebastian},"}""",
        )

    def _split_generators(self, dl_manager):

        # Specify the splits
        return [
            tfds.core.SplitGenerator(
                name=tfds.Split.TRAIN,
                gen_kwargs={
                    "input_dir": os.path.join(self.args.input_dir, 'train')
                },
            ),
            tfds.core.SplitGenerator(
                name=tfds.Split.VALIDATION,
                gen_kwargs={
                    "input_dir": os.path.join(self.args.input_dir, 'dev')
                },
            ),
            tfds.core.SplitGenerator(
                name=tfds.Split.TEST,
                gen_kwargs={
                    "input_dir": os.path.join(self.args.input_dir, 'test')
                },
            )
        ]

    def _generate_examples(self, input_dir):

        # Get the content of the dataset file
        dataset = tf.data.experimental.make_csv_dataset(
            os.path.join(input_dir, self.args.company_files),
            batch_size=1,
            column_defaults=[tf.string, tf.string, tf.string, tf.int32],
            label_name='label',
            na_value="?",
            num_epochs=1,
            ignore_errors=True)

        for company_info, label in dataset:

            ciks = company_info['cik'].numpy()[0].decode('utf-8').split(';')
            ciks.sort(reverse=True, key=lambda cik: int(cik))
            end_date = company_info['end_date'].numpy()[0].decode('utf-8')

            try:

                documents = []

                # For multiple CIKs take in the descending order the last args.number_of_periods 10-K reports
                for cik in ciks:

                    cik_folder = os.path.join(
                        os.path.expanduser(self.args.download_path),
                        'sec_edgar_filings',
                        cik.strip().lstrip("0"), '10-K')

                    # Download if and only if the directories do not exist
                    if not os.path.exists(cik_folder):
                        self.dl.get(
                            "10-K",
                            cik,
                            before_date=end_date,
                            num_filings_to_download=self.args.number_of_periods)

                    for r, d, f in os.walk(cik_folder):
                        for file in f:
                            if '.txt' in file:
                                documents.append(
                                    tf.convert_to_tensor(
                                        self.parser.parse_10K_txt_file(
                                            os.path.join(r, file)),
                                        dtype=tf.string))

                if len(documents) < self.args.number_of_periods:
                    raise Exception(
                        f'Could not retrieve {self.args.number_of_periods} 10-K records for {cik}'
                    )

                yield cik, {
                    'documents':
                    tf.stack(documents)[:self.args.number_of_periods],
                    'label': [1, 0] if label.numpy()[0] == 0 else [0, 1]
                }

            except Exception as e:
                self.log.error(f'Exception occurred for cik {cik}: {e}')

    def _process_text_map_fn(self, text, label):
        processed_text, label = tf.py_function(self._process_text,
                                               inp=[text, label],
                                               Tout=(tf.float32, tf.int64))
        return processed_text, label

    def _process_text(self, text, label):

        # To allow debugging in the combined static eager mode
        # pydevd.settrace(suspend=True)

        # Process the text
        processed_text = self.text_processor.process_text(text)

        return (processed_text, label)
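A minimal sketch of driving the builder above, assuming tensorflow_datasets is imported as tfds; the args fields are inferred from the class body and may not match the project's real argument parser:

import argparse
import logging

args = argparse.Namespace(
    dataset_version="1.0.0",        # becomes self.VERSION
    download_path="~/Downloads",    # passed to Downloader
    input_dir="data",               # expects train/dev/test subfolders
    company_files="companies.csv",  # CSV consumed by _generate_examples
    model="bert",                   # selects the text processor
    number_of_periods=3,            # 10-K reports per example
)
builder = FinancialStatementDatasetBuilder(args, logging.getLogger(__name__))
builder.download_and_prepare()
train_ds = builder.as_dataset(split=tfds.Split.TRAIN)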
Example #27
def test_constructor_custom_path():
    custom_path = Path.home().joinpath("Downloads/SEC/EDGAR/Downloader")
    dl = Downloader(custom_path)
    assert dl.download_folder == custom_path
Example #28
def downloader(tmp_path):
    dl = Downloader(tmp_path)
    yield dl, tmp_path
    shutil.rmtree(tmp_path)
Example #29
def download_and_parse(actual_stock, ciks, output_dict):
    """
    This function is the meat and potatoes of downloading the SEC 10-K filings.
    It uses the sec_edgar_downloader package to download the 10-K filing,
    then uses code from https://gist.github.com/anshoomehra/ead8925ea291e233a5aa2dcaa2dc61b2 to parse the filing for Item 1A.
    The code splits Item 1A into sentences and stores them in a dictionary keyed by CIK.
    :param actual_stock: the stock ticker
    :param ciks: CIK - stock ticker dictionary/crosswalk
    :param output_dict: a dictionary to store the 10-K Item 1A sentences
    :return: nothing, but it constantly updates/adds to the dictionary
    """
    if actual_stock[0] in [
            'BF.B', 'BF-B', 'bf-b', 'bf.b', 'HES', 'hes', 'hpe', 'HPE'
    ]:
        print("This stock has no CIK, so it is skipped")
        return

    cik = convert_ticker_to_cik(actual_stock, ciks)
    cik = cik.zfill(10)

    dl = Downloader()
    #stock_ticker = "0001067983"
    dl.get("10-K", cik, after="2015-01-01", download_details=False)
    count = 0
    for root, dirs, files in os.walk(
            "./sec-edgar-filings/{}/10-K/".format(cik)):

        # search through each years' 10-k filing
        for file in files:
            # find the txt document of the 10-K filing
            if file == 'full-submission.txt':

                try:
                    year = re.findall(r'\-[0-9][0-9]\-', root)
                    year = year[len(year) - 1][1:-1]

                    # certain stocks have issues for certain years. I will include code to exclude them here
                    # if year == 21 and stock_ticker in ("'APA'", "'ADM'"):
                    #     print("Skipping year {} for ticker {} due to issues...".format(year, stock_ticker))

                    # read the file
                    filing_text = read_file(root + '/' + file)

                    # this code comes from https://gist.github.com/anshoomehra/ead8925ea291e233a5aa2dcaa2dc61b2

                    doc_start_pattern = re.compile(r'<DOCUMENT>')
                    doc_end_pattern = re.compile(r'</DOCUMENT>')
                    # Regex to find a <TYPE> tag preceding any characters, terminating at a new line
                    type_pattern = re.compile(r'<TYPE>[^\n]+')

                    # Create three lists with the span indices for each regex.

                    ### There are many <DOCUMENT> tags in this text file, one per exhibit (10-K, EX-10.17, etc.).
                    ### The first two filters give us each document tag's start <end> and end tag's <start>;
                    ### we will use these later to grab the content between the tags.
                    doc_start_is = [
                        x.end()
                        for x in doc_start_pattern.finditer(filing_text)
                    ]
                    doc_end_is = [
                        x.start()
                        for x in doc_end_pattern.finditer(filing_text)
                    ]

                    ### The type filter looks for <TYPE> followed by one or more characters up to, but not
                    ### including, the newline, which yields <TYPE> followed by a section name like '10-K'.
                    ### The line below then strips the <TYPE> prefix, leaving just the section names.
                    doc_types = [
                        x[len('<TYPE>'):]
                        for x in type_pattern.findall(filing_text)
                    ]

                    document = {}
                    # Create a loop to go through each section type and save only the 10-K section in the dictionary
                    for doc_type, doc_start, doc_end in zip(
                            doc_types, doc_start_is, doc_end_is):
                        if doc_type == '10-K':
                            document[doc_type] = filing_text[doc_start:doc_end]

                    regex = re.compile(
                        r'(>(Item|ITEM)(\s|&#160;|&nbsp;|&#xa0;)(1A|1B)\.{0,1})|(ITEM\s(1A|1B))|(<B>Item</B><B></B><B>&nbsp;1A</B>)|(&nbsp;Item&nbsp;1B.)|(Item<font style="font-family:Times New Roman, Times, serif;font-size:10pt;">&nbsp;1B)|(Item<font style="font-family: Times New Roman, Times, serif; font-size: 10pt;">&nbsp;1B)'
                    )

                    matches = regex.finditer(document['10-K'])

                    test_df = pd.DataFrame([
                        (x.group(), x.span()[0], x.span()[1]) for x in matches
                    ])

                    if len(test_df) == 0:
                        print("error... didn't pick up anything")
                        break

                    test_df.columns = ['item', 'start', 'end']
                    test_df['item'] = test_df.item.str.lower()

                    test_df.replace(
                        '<font style="font-family:times new roman, times, serif;font-size:10pt;">',
                        ' ',
                        regex=True,
                        inplace=True)
                    test_df.replace(
                        '<font style="font-family: times new roman, times, serif; font-size: 10pt;">',
                        ' ',
                        regex=True,
                        inplace=True)
                    test_df.replace('&#160;', ' ', regex=True, inplace=True)
                    test_df.replace('&nbsp;', ' ', regex=True, inplace=True)
                    test_df.replace('&#xa0;', ' ', regex=True, inplace=True)
                    test_df.replace(' ', '', regex=True, inplace=True)
                    test_df.replace('\.', '', regex=True, inplace=True)
                    test_df.replace('>', '', regex=True, inplace=True)
                    test_df.replace('<b', '', regex=True, inplace=True)
                    test_df.replace('</b', '', regex=True, inplace=True)

                    pos_dat = test_df.sort_values(
                        'start',
                        ascending=True).drop_duplicates(subset=['item'],
                                                        keep='last')

                    pos_dat.set_index('item', inplace=True)

                    # Check conditionals here
                    if 'item1a' in pos_dat.index and 'item1b' in pos_dat.index:

                        item_1a_raw = document['10-K'][
                            pos_dat['start'].loc['item1a']:pos_dat['start'].
                            loc['item1b']]
                        item_1a_content = BeautifulSoup(item_1a_raw, 'lxml')

                        test_df["text"] = item_1a_content.get_text()
                        test_df.replace(
                            '([0-9]|[0-9][0-9])(\s{0,3})Table of Contents',
                            ' ',
                            regex=True,
                            inplace=True)
                        test_df.replace('Table of Contents',
                                        ' ',
                                        regex=True,
                                        inplace=True)
                        test_df.replace('\s\s', ' ', regex=True, inplace=True)
                        test_df.replace('\\u200b',
                                        ' ',
                                        regex=True,
                                        inplace=True)
                        test_df.replace('\\n[0-9]',
                                        ' ',
                                        regex=True,
                                        inplace=True)
                        test_df.replace('[0-9]\\n',
                                        ' ',
                                        regex=True,
                                        inplace=True)
                        test_df.replace('\\xa0', ' ', regex=True, inplace=True)
                        test_df.replace('\\x92', ' ', regex=True, inplace=True)
                        test_df.replace('\\x93', ' ', regex=True, inplace=True)
                        test_df.replace('\\x94', ' ', regex=True, inplace=True)
                        test_df.replace('\\x95', ' ', regex=True, inplace=True)
                        test_df.replace('\\n', ' ', regex=True, inplace=True)
                        test_df.replace('\n', ' ', regex=False, inplace=True)

                        # output the text to the dict

                        sentences = nltk.sent_tokenize(str(test_df['text'][0]))

                        if count == 0:
                            output_frame = pd.DataFrame(
                                [[year, sentences]], columns=["year", "text"])
                        else:
                            output_frame = output_frame.append(
                                pd.DataFrame([[year, sentences]],
                                             columns=["year", "text"]),
                                ignore_index=True)

                        output_dict[cik] = output_frame
                        print(
                            "finished processing ticker {} ({}) and added to dictionary for year {}"
                            .format(cik, actual_stock[0], year))
                        print(75 * '-')
                        count += 1

                    else:
                        regex = re.compile(
                            r'(>(Item|ITEM)(\s|&#160;|&nbsp;|&#xa0;)(1A|2)\.{0,1})|(ITEM\s(1A|2))|(<B>Item</B><B></B><B>&nbsp;1A</B>)|(&nbsp;Item&nbsp;2.)'
                        )

                        matches = regex.finditer(document['10-K'])

                        test_df = pd.DataFrame([(x.group(), x.span()[0],
                                                 x.span()[1])
                                                for x in matches])

                        if len(test_df) == 0:
                            print("error... didn't pick up anything")
                            break

                        test_df.columns = ['item', 'start', 'end']
                        test_df['item'] = test_df.item.str.lower()

                        test_df.replace('&#160;',
                                        ' ',
                                        regex=True,
                                        inplace=True)
                        test_df.replace('&nbsp;',
                                        ' ',
                                        regex=True,
                                        inplace=True)
                        test_df.replace('&#xa0;',
                                        ' ',
                                        regex=True,
                                        inplace=True)
                        test_df.replace(' ', '', regex=True, inplace=True)
                        test_df.replace('\.', '', regex=True, inplace=True)
                        test_df.replace('>', '', regex=True, inplace=True)
                        test_df.replace('<b', '', regex=True, inplace=True)
                        test_df.replace('</b', '', regex=True, inplace=True)

                        pos_dat = test_df.sort_values(
                            'start',
                            ascending=True).drop_duplicates(subset=['item'],
                                                            keep='last')

                        pos_dat.set_index('item', inplace=True)

                        item_1a_raw = document['10-K'][
                            pos_dat['start'].loc['item1a']:pos_dat['start'].
                            loc['item2']]
                        item_1a_content = BeautifulSoup(item_1a_raw, 'lxml')

                        test_df["text"] = item_1a_content.get_text()
                        test_df.replace(
                            '([0-9]|[0-9][0-9])(\s{0,3})Table of Contents',
                            ' ',
                            regex=True,
                            inplace=True)
                        test_df.replace('Table of Contents',
                                        ' ',
                                        regex=True,
                                        inplace=True)
                        test_df.replace('\s\s', ' ', regex=True, inplace=True)
                        test_df.replace('\\u200b',
                                        ' ',
                                        regex=True,
                                        inplace=True)
                        test_df.replace('\\n[0-9]',
                                        ' ',
                                        regex=True,
                                        inplace=True)
                        test_df.replace('[0-9]\\n',
                                        ' ',
                                        regex=True,
                                        inplace=True)
                        test_df.replace('\\xa0', ' ', regex=True, inplace=True)
                        test_df.replace('\\x92', ' ', regex=True, inplace=True)
                        test_df.replace('\\x93', ' ', regex=True, inplace=True)
                        test_df.replace('\\x94', ' ', regex=True, inplace=True)
                        test_df.replace('\\x95', ' ', regex=True, inplace=True)
                        test_df.replace('\\n', ' ', regex=True, inplace=True)
                        test_df.replace('\n', ' ', regex=False, inplace=True)

                        # output the text to the dict

                        sentences = nltk.sent_tokenize(str(test_df['text'][0]))

                        if count == 0:
                            output_frame = pd.DataFrame(
                                [[year, sentences]], columns=["year", "text"])
                        else:
                            output_frame = output_frame.append(
                                pd.DataFrame([[year, sentences]],
                                             columns=["year", "text"]),
                                ignore_index=True)

                        output_dict[cik] = output_frame
                        print(
                            "finished processing ticker {} ({}) and added to dictionary for year {}"
                            .format(cik, actual_stock[0], year))
                        print(75 * '-')
                        count += 1

                except Exception as e:
                    print("error occurred: {}".format(e))
Example #30
from sec_edgar_downloader import Downloader

dl = Downloader('C:\\Users\\willi\\Documents\\Company_filings\\BRK-A\\13F-HR')

# Download using the CIK of Berkshire Hathaway (BRK-A).
dl.get("13F-HR", "0001067983")