# In[12]:

# Collect, per ticker, one record for every document whose type is '10-k'.
# Each record stores the ticker's entry from `cik_lookup`, the document
# itself, and the key (`file_date`) the document was grouped under.
ten_ks_by_ticker = {}

for ticker, filling_documents in filling_documents_by_ticker.items():
    # Every ticker gets an entry, even when none of its documents qualify.
    ten_ks_by_ticker[ticker] = []
    for file_date, documents in filling_documents.items():
        for document in documents:
            if get_document_type(document) != '10-k':
                continue  # not a 10-k: leave it out
            ten_ks_by_ticker[ticker].append(
                {'cik': cik_lookup[ticker], 'file': document, 'file_date': file_date}
            )

# Show at most the first five records of the example ticker.
project_helper.print_ten_k_data(
    ten_ks_by_ticker[example_ticker][:5],
    ['cik', 'file', 'file_date'],
)

# ## Preprocess the Data
# ### Clean Up
# As you can see, the text of the documents is very messy. To clean it up, we'll remove the HTML and lowercase all the text.

# In[13]:


def remove_html_tags(text):
    """Return ``text`` with its HTML markup removed.

    The input is parsed by BeautifulSoup using the 'html.parser' backend,
    and the value of ``get_text()`` on the parsed result is returned.
    """
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()


def clean_text(text):
    """Lowercase ``text`` and strip its HTML markup.

    Parameters
    ----------
    text : str
        Raw document text, possibly containing HTML.

    Returns
    -------
    str
        The lowercased text after it has been passed through
        ``remove_html_tags``.
    """
    # Operate on the `text` argument itself: the body used to reference
    # `doc_type`, a name that is not defined in this function, and never
    # removed the HTML.
    text = text.lower()
    text = remove_html_tags(text)

    return text


# Filter out the non-10-K documents from the filings using the get_document_type function. The CIK is looked up in CIK_dict1 here instead of cik_lookup.
# Rebuild `ten_ks_by_ticker` from scratch; the 'cik' value of each record is
# read from `CIK_dict1`.
ten_ks_by_ticker = {}
for ticker, filling_documents in filling_documents_by_ticker.items():
    # Start with an empty list so tickers without any 10-k still get a key.
    ten_ks_by_ticker[ticker] = []
    for file_date, documents in filling_documents.items():
        for document in documents:
            if get_document_type(document) != '10-k':
                continue  # only '10-k' documents are recorded
            ten_ks_by_ticker[ticker].append(
                {'cik': CIK_dict1[ticker], 'file': document, 'file_date': file_date}
            )

# Print up to five of the example ticker's records.
project_helper.print_ten_k_data(
    ten_ks_by_ticker[example_ticker][:5],
    ['cik', 'file', 'file_date'],
)

# Preprocess Data
# Remove the HTML and lowercase all of the text to clean up the documents.


def remove_html_tags(text):
    """Parse ``text`` as HTML and return only its text content.

    Parsing uses BeautifulSoup with the 'html.parser' backend; the returned
    value is whatever ``get_text()`` yields for the parsed input.
    """
    # NOTE(review): a function with this same name and body is defined
    # earlier in the notebook; this definition replaces it when the cell runs.
    parsed = BeautifulSoup(text, 'html.parser')
    return parsed.get_text()


def clean_text(text):
    text = text.lower()
    text = remove_html_tags(text)