def get_filings_by_company(company, type, n):
    """Fetch a company's filings of one type in two representations.

    Args:
        company: Object exposing ``get_all_filings(filing_type=...)``.
        type: Filing type, forwarded as ``filing_type``.
        n: Number of documents requested from ``Company.get_documents``.

    Returns:
        A ``(docs_lxml, docs_data)`` tuple of lists. The first holds the
        result of ``Company.get_documents`` with default options, the second
        the result of the same call with ``as_documents=True``.
    """
    filings_tree = company.get_all_filings(filing_type=type)
    raw_docs = Company.get_documents(filings_tree, no_of_documents=n)
    parsed_docs = Company.get_documents(
        filings_tree, no_of_documents=n, as_documents=True
    )

    # Only the raw result is inspected; when it is already a list both
    # results are returned untouched.
    if isinstance(raw_docs, list):
        return (raw_docs, parsed_docs)

    # Otherwise each result is wrapped so callers always receive lists.
    return ([raw_docs], [parsed_docs])
def get_filing_metadata(context, name: str, cik: str, filing: str, no_filings: int):
    """Collect cleaned metadata for a company's most recent filings.

    Args:
        context: Execution context; only ``context.log.info`` is used here.
        name: Company name passed to ``Company``.
        cik: CIK identifier passed to ``Company`` and to ``clean_filings``.
        filing: Filing type passed to ``get_all_filings`` and ``clean_filings``.
        no_filings: Number of documents requested from ``get_documents``.

    Returns:
        A list with one ``clean_filings(document, cik, filing)`` result per
        retrieved document.
    """
    comp = Company(name, cik)
    tree = comp.get_all_filings(filing)
    # NOTE(review): ``True`` is passed positionally as the third argument;
    # confirm it binds to the intended ``get_documents`` parameter.
    docs = comp.get_documents(tree, no_filings, True)
    # Guard against a bare (non-list) result so the loop below always
    # iterates over whole documents.
    if not isinstance(docs, list):
        docs = [docs]

    # TODO #38 change return method to yield AssetMaterialization()
    # ``append`` must be called, not subscripted: the original
    # ``filings.append[...]`` raised TypeError on the first document.
    filings = [clean_filings(document, cik, filing) for document in docs]
    context.log.info(log_assert_type(filings, dict))
    return filings
def pull_10K(company_name, company_id, no_of_documents=3):
    """Download and parse the most recent 10-K filings of one company.

    The previous version wrapped the download in a time/random guard that
    was effectively always true (a positive offset was added to a timestamp
    taken an instant earlier), so it never throttled anything, and it would
    have left ``docs`` unbound had it ever been false. The guard is removed;
    no delay is applied between requests.

    NOTE(review): another ``pull_10K`` is defined later in this file; if both
    live in the same module the later definition replaces this one.

    Args:
        company_name: Company name passed to ``Company``.
        company_id: Company identifier (CIK) passed to ``Company``.
        no_of_documents: How many filings to request. Defaults to 3, the
            value that used to be hard-coded.

    Returns:
        A list with the ``TXTML.parse_full_10K`` result for every filing
        that could be parsed.
    """
    company = Company(company_name, company_id)
    tree = company.get_all_filings(filing_type="10-K")
    docs = Company.get_documents(tree, no_of_documents=no_of_documents)
    # Guard against a bare (non-list) result so each iteration below sees a
    # whole filing.
    if not isinstance(docs, list):
        docs = [docs]

    text_l = []
    for doc in docs:
        try:
            text_l.append(TXTML.parse_full_10K(doc))
        except IndexError:
            # Deliberate best-effort: a filing the parser cannot index into
            # is skipped rather than aborting the whole pull.
            continue
    return text_l
def file_date(com, cik, no_docs):
    """Return the filing dates of a company's most recent 10-K filings.

    The filing date serves as the date of measurement for analyzing returns.

    Args:
        com: Company name passed to ``Company``.
        cik: CIK identifier passed to ``Company``.
        no_docs: Maximum number of filings whose dates are returned.

    Returns:
        A list with the ``'Filing Date'`` entry of each retrieved document's
        ``content``; shorter than ``no_docs`` when fewer filings exist.
    """
    # ``no_docs`` is a document count, not a constructor option: every other
    # call site in this file builds ``Company`` from name and CIK only.
    company = Company(com, cik)
    tree = company.get_all_filings(filing_type="10-K")
    docs = Company.get_documents(tree, no_of_documents=no_docs, as_documents=True)
    # Guard against a bare (non-list) result, which cannot be indexed.
    if not isinstance(docs, list):
        docs = [docs]

    # Slice instead of indexing ``range(no_docs)`` so a company with fewer
    # filings than requested no longer raises IndexError.
    return [doc.content['Filing Date'] for doc in docs[:max(no_docs, 0)]]
def pull_10K(name, company_id, no_of_documents=6):
    '''
    Fetch and parse the most recent 10-K filings of one company.

    Intended to be run while iterating over a list of tickers; the parsed
    text of each ticker can then be collected into a dataframe by the caller.

    Inputs:
        - name            : Company name passed to ``Company``.
        - company_id      : Company identifier (CIK) passed to ``Company``.
        - no_of_documents : How many filings to request. Defaults to 6, the
                            value that used to be hard-coded.

    Returns a list with the ``TXTML.parse_full_10K`` result for every filing
    that could be parsed.
    '''
    company = Company(name, company_id)
    tree = company.get_all_filings(filing_type="10-K")
    docs = Company.get_documents(tree, no_of_documents=no_of_documents)
    # Guard against a bare (non-list) result so each iteration below sees a
    # whole filing.
    if not isinstance(docs, list):
        docs = [docs]

    text_l = []
    for doc in docs:
        try:
            text_l.append(TXTML.parse_full_10K(doc))
        except IndexError:
            # Deliberate best-effort: a filing the parser cannot index into
            # is skipped rather than aborting the whole pull.
            continue
    return text_l
def get_edgar_filing_text(comp_tuples, f_type, n_docs, file_dir, dates_dir):
    '''
    ---------------------------------------------------------------------------
    Scraping function to get the text from company filings from EDGAR.

    Inputs:
        - comp_tuples : A list with pairwise company tuples. The first element
                        must be a string with the company name as listed on
                        the EDGAR database. The second element must be a
                        string with the CIK identifier. See
                        get_sp500_tickers_cik_industry(argin) to easily get
                        the tuples from the S&P500 listed firms.
        - f_type      : A string with the filing type.
        - n_docs      : Number of filings to be fetched, in descending order,
                        i.e. n_docs = 3 will fetch the three newest filings of
                        type f_type. As an integer.
        - file_dir    : The master directory where all filings are to be
                        saved. As a string.
        - dates_dir   : The master directory where all filing dates are saved.
                        If a company's directory is missing, the function will
                        instead scrape the dates using
                        get_edgar_filing_date(argin), which is expected to
                        create the folder with the dates.

    Behaviour notes:
        - Each filing is written to
          <file_dir>/<f_type>/<company name>/<date>.txt.
        - The working directory is no longer changed and the entries of
          comp_tuples are no longer modified in place.

    Example:
        comp_tuples = [['APPLE INC'     , '0000320193'],
                       ['MCDONALDS CORP', '0000063908'],
                       ['MICROSOFT CORP', '0000789019']]
        f_type = '10-K' [Or '10-Q']
        n_docs = 3
        file_dir = 'Users/Tobias/Dropbox/textfolder/Text Reports U.S.'
        dates_dir = 'Users/Tobias/Dropbox/textfolder/Dates Reports U.S'
        get_edgar_filing_text(comp_tuples,f_type,n_docs,file_dir,dates_dir)
    ---------------------------------------------------------------------------
    '''
    # Resolve both roots once. The old code called os.chdir() per company,
    # which silently invalidated relative roots from the second company on.
    file_dir = os.path.abspath(file_dir)
    dates_dir = os.path.abspath(dates_dir)

    print('Fetching data...')
    print('-' * 80 + '\n')

    total = len(comp_tuples)
    for idx, comp_tuple in enumerate(comp_tuples):
        name = comp_tuple[0]
        comp = edgar.Company(name, comp_tuple[1])
        tree = comp.get_all_filings(filing_type=f_type)
        docs = comp.get_documents(tree, no_of_documents=n_docs)
        # Guard against a bare (non-list) result so zip() below pairs dates
        # with whole filings.
        if not isinstance(docs, list):
            docs = [docs]

        # Now that we have the filings, get the filing dates for each
        # document. If we have them already, load them. If not, call
        # get_edgar_filing_date to fetch them for this company first.
        company_dates_dir = os.path.join(dates_dir, f_type, name)
        if not os.path.exists(company_dates_dir):
            print(('\nCannot find the dates for ' + name +
                   '. Attempting to download them...'))
            get_edgar_filing_date([comp_tuple], f_type, dates_dir)

        # File and output-folder names drop one trailing dot; kept in a local
        # so the caller's list is left untouched.
        clean_name = name[:-1] if name.endswith('.') else name

        # NOTE(review): assumes get_edgar_filing_date stores the dates at the
        # same location they are loaded from here - confirm against it.
        # NOTE: pickle executes whatever the file contains; only point
        # dates_dir at files produced by this tool.
        dates_path = os.path.join(company_dates_dir, clean_name + '.pickle')
        with open(dates_path, 'rb') as file:
            dates = pickle.load(file)
        dates = dates[:n_docs]

        # os.path.join instead of hard-coded '\\' so the nested layout is
        # also produced on non-Windows systems.
        out_dir = os.path.join(file_dir, f_type, clean_name)
        os.makedirs(out_dir, exist_ok=True)

        for date, doc in zip(dates, docs):
            out_path = os.path.join(out_dir,
                                    date.replace('.pickle', '') + '.txt')
            # Context manager closes the file even if the write fails.
            with open(out_path, 'w', encoding='utf8') as out_file:
                out_file.write(str(doc))

        mes = ('Status: ' + str(int((idx + 1) / total * 100)) + '% done')
        sys.stdout.write('\r' + mes)