# In[12]: ten_ks_by_ticker = {} for ticker, filling_documents in filling_documents_by_ticker.items(): ten_ks_by_ticker[ticker] = [] for file_date, documents in filling_documents.items(): for document in documents: if get_document_type(document) == '10-k': ten_ks_by_ticker[ticker].append({ 'cik': cik_lookup[ticker], 'file': document, 'file_date': file_date }) project_helper.print_ten_k_data(ten_ks_by_ticker[example_ticker][:5], ['cik', 'file', 'file_date']) # ## Preprocess the Data # ### Clean Up # As you can see, the text for the documents are very messy. To clean this up, we'll remove the html and lowercase all the text. # In[13]: def remove_html_tags(text): text = BeautifulSoup(text, 'html.parser').get_text() return text def clean_text(text):
return doc_type.lower() #Filter out the non 10-k documents from the fillings using the get_document_type function.change cil_lookup into CIK_dict1 ten_ks_by_ticker = {} for ticker, filling_documents in filling_documents_by_ticker.items(): ten_ks_by_ticker[ticker] = [] for file_date, documents in filling_documents.items(): for document in documents: if get_document_type(document) == '10-k': ten_ks_by_ticker[ticker].append({ 'cik': CIK_dict1[ticker], 'file': document, 'file_date': file_date }) project_helper.print_ten_k_data(ten_ks_by_ticker[example_ticker][:5], ['cik', 'file', 'file_date']) # Preprocess Data # remove the html and make all text lowercase to clean up the document text def remove_html_tags(text): text = BeautifulSoup(text, 'html.parser').get_text() return text def clean_text(text): text = text.lower() text = remove_html_tags(text)