def get_company_by_cik(cik):
    """Look up a company's name by CIK and return a py-edgar Company."""
    cik = str(cik).zfill(10)  # EDGAR CIKs are zero-padded to 10 digits
    name = edgar.get_company_name_by_cik(cik)  # assumes a module-level Edgar() instance
    company = Company(name, cik)
    return company
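# Usage sketch for get_company_by_cik (an illustration, not part of the
# original snippet). It assumes the module-level Edgar() index the function
# relies on; CIK 320193 is Apple's.
from edgar import Company, Edgar

edgar = Edgar()  # builds the company-name/CIK lookup table
apple = get_company_by_cik(320193)
print(apple.name, apple.cik)  # the CIK comes back zero-padded: 0000320193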
def get_10K_doc_raw(name, cik):
    """Get the latest 10-K filing document for a given company using the edgar package."""
    company = Company(name, cik)
    # tree = company.get_all_filings(filing_type="10-K")
    # docs = Company.get_documents(tree, no_of_documents=1)
    docs = company.get_10Ks(no_of_documents=1)
    return docs
def findWord(comp, cik):
    """Report whether the word 'blockchain' appears in a company's latest 10-K."""
    try:
        company = Company(comp, cik)
        doc = company.get_10K()
        text = TXTML.parse_full_10K(doc)
        # print(text)
        if re.search('blockchain', text, re.IGNORECASE):
            return "exists"
        else:
            return "does not"
    except Exception:  # no 10-K available, or parsing failed
        return "No 10-K"
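# Example call for findWord (a hedged sketch; the name/CIK pair is borrowed
# from the Apple example used elsewhere in this collection):
print(findWord('Apple Inc.', '0000320193'))  # -> "exists", "does not", or "No 10-K"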
def main():
    # establish a list of companies to extract data from
    company_list = [('AMAZON COM INC', '0001018724'), ('Apple Inc.', '0000320193')]
    # iterate through the companies, calling the get_xbrl function on each
    xbrl_files = [get_xbrl(Company(name, cik)) for name, cik in company_list]
    # fill pandas with the segment data
    segment_df = xbrl_to_df(xbrl_files[0])
    segment_df.to_csv(Path.cwd() / 'SegmentData.csv', index=False)
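# The get_xbrl and xbrl_to_df helpers called in main() are defined elsewhere.
# A minimal sketch of what get_xbrl might look like, assuming it wraps
# py-edgar's XBRL support (this reconstruction is an assumption, not the
# original helper; the EX-101.INS call appears verbatim in a later snippet):
from edgar import Company, XBRL

def get_xbrl(company):
    # EX-101.INS is the XBRL instance document attached to a 10-K
    results = company.get_data_files_from_10K("EX-101.INS", isxml=True)
    return XBRL(results[0])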
def get_filing_metadata(context, name: str, cik: str, filing: str, no_filings: int):
    comp = Company(name, cik)
    tree = comp.get_all_filings(filing)
    docs = comp.get_documents(tree, no_filings, True)
    filings = []
    # TODO #38: change return method to yield AssetMaterialization()
    for document in docs:
        filings.append(clean_filings(document, cik, filing))  # was append[...], a TypeError
    context.log.info(log_assert_type(filings, dict))
    return filings
def pull_10K(company_name, company_id):
    company = Company(company_name, company_id)
    tree = company.get_all_filings(filing_type="10-K")
    # pause for a random number of seconds before fetching so repeated calls
    # don't hammer EDGAR (the original time-comparison guard was always true,
    # so a plain sleep expresses the apparent intent)
    time.sleep(random.randint(1, 25))
    docs = Company.get_documents(tree, no_of_documents=3)
    text_l = []
    for doc in docs:
        try:
            text_l.append(TXTML.parse_full_10K(doc))
        except IndexError:  # parsing can fail on malformed filings
            pass
    return text_l
def file_date(com, cik, no_docs):
    """
    Pull only the filing dates of a company's 10-Ks.
    Serves as the date of measurement for analyzing returns.
    """
    # note: the original passed no_docs as a third positional argument here,
    # which py-edgar reads as the request timeout, not a document count
    company = Company(com, cik)
    tree = company.get_all_filings(filing_type="10-K")
    docs = Company.get_documents(tree, no_of_documents=no_docs, as_documents=True)
    dates = []
    for doc in docs[:no_docs]:
        dates.append(doc.content['Filing Date'])
    return dates
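# Usage sketch for file_date (illustration only): pull the filing dates of
# Apple's three most recent 10-Ks.
dates = file_date('Apple Inc.', '0000320193', 3)
print(dates)  # list of 'Filing Date' strings, newest first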
def pull_10K(name, company_id):
    '''
    Perform the filing retrieval. Run this function while iterating over a
    list of tickers; each ticker gets parsed and collected into a DataFrame.
    '''
    company = Company(name, company_id)
    tree = company.get_all_filings(filing_type="10-K")
    docs = Company.get_documents(tree, no_of_documents=6)
    # print("checkpoint: retrieving documents...")
    text_l = []
    for doc in docs:
        try:
            text_l.append(TXTML.parse_full_10K(doc))
        except IndexError:  # parsing can fail on malformed filings
            pass
    return text_l
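# Usage sketch for pull_10K (illustration only): iterate over (name, CIK)
# pairs and collect the parsed 10-K texts into a DataFrame, as the docstring
# describes. The ticker list here is assumed.
import pandas as pd

tickers = [('AMAZON COM INC', '0001018724'), ('Apple Inc.', '0000320193')]
texts = {name: pull_10K(name, cik) for name, cik in tickers}
df = pd.DataFrame([(name, text) for name, docs in texts.items() for text in docs],
                  columns=['company', 'text'])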
def has_ex_10(edgar):
    company_df = init_edgar_df(edgar)
    company_df['ex-10'] = False
    for idx, row in company_df.drop_duplicates(
            subset='cik', keep='first').iloc[4000:7000].iterrows():
        cik = row['cik']
        # initialize a Company instance
        company = Company(name=edgar.get_company_name_by_cik(cik), cik=cik)
        # get all the "EX-10" type documents from the company's 10-K filings
        documents = company.get_document_type_from_10K('EX-10', no_of_documents=1)
        if documents:
            company_df.at[idx, 'ex-10'] = True
    ex_10_df = company_df[company_df['ex-10']]
    ex_10_df.to_csv(
        '/Users/sorenlittle/PycharmProjects/edgar_spacy_training/ex_10_df/ex_10_df_4000_7000.csv'
    )
from edgar import Company, XBRL, XBRLElement, TXTML, Edgar, Document
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
import sys

# company = Company("INTERNATIONAL BUSINESS MACHINES CORP", "0000051143")
# company2 = Company("twitter", "0001418091")
# company3 = Company("Oracle Corp", "0001341439")
company4 = Company("GOOGLE INC", "0001288776")

# edgar = Edgar()
# possible_companies = edgar.find_company_name("Cisco System")
# print(possible_companies)

doc = company4.get_10K()
text = TXTML.parse_full_10K(doc)
print('1')

with open("text2.txt", "w+") as f:
    f.write(text)

# f = open('text.txt', 'r')
# for line in f:
#     print(line)
# print()
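# The sumy imports above are never used in this script. A minimal sketch of
# how they would typically be wired up to summarize the 10-K text extracted
# above (the sentence count of 10 is an arbitrary choice):
parser = PlaintextParser.from_string(text, Tokenizer("english"))
summarizer = TextRankSummarizer(Stemmer("english"))
summarizer.stop_words = get_stop_words("english")
for sentence in summarizer(parser.document, 10):
    print(sentence)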
from edgar import Company, TXTML, XBRLElement, XBRL, Edgar

db = Edgar()
comp_name = 'TESLA, INC.'
company = Company(comp_name, db.all_companies_dict[comp_name])

'''
company = Company("Oracle Corp", "0001341439")
tree = company.get_all_filings(filing_type="10-K")
docs = Company.get_documents(tree, no_of_documents=5)
print(docs)
text = TXTML.parse_full_10K(docs[0])
# print(text)
# company = edgar.Company(Ticker, "21344")
# print(company)

company = Company("Oracle Corp", "0001341439")
results = company.get_data_files_from_10K("EX-101.INS", isxml=True)
xbrl = XBRL(results[0])
# returns a dictionary of name, value, and schemaRef
element = XBRLElement(xbrl.relevant_children_parsed[15]).to_dict()
print(element)
'''
def search_company(self, name, cik, filing_type, filing_subtype, no_of_entries,
                   filing_date_before, filing_pattern, filing_rsrc_cache):
    base_url = self.aconfig['args'].endpoint
    acquirePatterns = OrderedDict()
    if len(filing_pattern) == 0 and not filing_rsrc_cache:
        print("Ambiguous options: no search pattern (-P) and no download of resources (-d); choose one mode")
        return
    for pattern in filing_pattern:
        acquirePatterns[pattern] = re.compile(pattern)
    self.alogger.debug("Name:{0} CIK:{1} Filing:{2} Subtype:{3}".format(name, cik, filing_type, filing_subtype))
    company = Company(name, cik)
    print("Filings endpoint:", company.get_filings_url())
    tree = company.get_all_filings(filing_type=filing_type,
                                   no_of_entries=no_of_entries,
                                   prior_to=filing_date_before)
    url_groups = company._group_document_type(tree, filing_type)
    result = OrderedDict()
    for url_group in url_groups:
        for url in url_group:
            url = base_url + url
            self.alogger.debug("In content page: {0}".format(url))
            content_page = Company.get_request(url)
            try:
                table = content_page.find_class("tableFile")[0]
                for row in table.getchildren():
                    # match on the 4th column of the row, `Type`
                    if filing_subtype in row.getchildren()[3].text:
                        self.alogger.debug("Subtype found: {0}".format(row.getchildren()[3].text))
                        href = row.getchildren()[2].getchildren()[0].attrib["href"]
                        href_txt = row.getchildren()[2].getchildren()[0].text_content()
                        if href and not href_txt:
                            self.alogger.debug(" but no link for the resource posted; skipping")
                            continue
                        # SEC XBRL inline viewer. Remove that cruft; get the raw document if applicable
                        href = href.replace("/ix?doc=", "")
                        href = base_url + href
                        self.alogger.debug("Processing resource: {0}".format(href))
                        # fetch the filing doc and process
                        if filing_rsrc_cache:
                            rsrc_cache_path = urlparse(href).path.strip("/")
                            rsrc_cache_dir = os.path.dirname(rsrc_cache_path)
                            r = requests.get(href)
                            self.alogger.debug("Making repository structure")
                            os.makedirs(rsrc_cache_dir, exist_ok=True)
                            print("Storing {} from {} locally: {}".format(href_txt, href, rsrc_cache_path))
                            with open(rsrc_cache_path, 'wb') as f:
                                f.write(r.content)
                        else:
                            print("Working on {} ...".format(href))
                            doc = Company.get_request(href)
                            tree_str = str(etree.tostring(doc), 'utf-8')
                            tree_str_text = html2text.html2text(tree_str)
                            result[href] = tree_str_text
            except IndexError:
                pass
    if not filing_rsrc_cache and len(filing_pattern) != 0:
        self.alogger.debug("Matched filing types count: {}".format(len(result)))
        self.alogger.debug("Performing pattern matching")
        for filing_resource, filing_text in result.items():
            for pattern, cpattern in acquirePatterns.items():
                if re.search(cpattern, filing_text):
                    self.alogger.debug("Pattern matches: {0}".format(filing_resource))
                    self.search_string(filing_text, 1, 1, pattern)
from edgar import Company, TXTML
import re
import pandas as pd

df = pd.read_excel(r'companylist.xls')
expense_estimates = []
for i in df.index:
    print(expense_estimates)
    CIK_string = df['CIK'][i].split("; ")
    print(df['Company Name'][i])
    # pass the actual company name, not the literal string "df['Company Name'][i]"
    company = Company(df['Company Name'][i], CIK_string[0])
    try:
        doc = company.get_10K()
        text = TXTML.parse_full_10K(doc)
    except IndexError:
        expense_estimates.append(float("NaN"))
        continue
    if 'hipping' not in text:
        expense_estimates.append(float("NaN"))
        continue
    # every occurrence of "hipping" (matches both "Shipping" and "shipping")
    matches = [m.start() for m in re.finditer('hipping', text)]
    # print(matches)
    string = ""
    est_available = False
    for pos in matches:  # renamed from `i`, which shadowed the outer loop index
        if '$' in text[pos:pos + 50]:
            string = text[pos:pos + 200]
            est_available = True
            break
    if not est_available:
        # the original snippet breaks off here; recording NaN mirrors the
        # other no-estimate branches above
        expense_estimates.append(float("NaN"))
dfmap['id'] = dfmap['id'].astype(str).str.zfill(10)

# read the source list of tickers
dft = pd.read_csv('et.csv', header=None)
dft.columns = ['ticker']

# join with the SEC ticker master file to add the 'id' column
dft = dft.merge(dfmap, on='ticker', how='inner')
dft = dft.drop_duplicates()

dfsftcols = ['ticker', 'earn_datetime']
dfSECFileTimes = pd.DataFrame(columns=dfsftcols)

for row in dft.itertuples():
    print(row.ticker + ' ' + row.id)
    company = Company(row.ticker, row.id)
    tree = company.get_all_filings(filing_type="8-K")
    hrefs = tree.xpath('//*[@id="documentsbutton"]')
    descs = tree.xpath('//div[4]/div[4]//td[3]')
    for desc, href in zip(descs, hrefs):
        # keep only 8-Ks whose description mentions Item 2.02 (results of operations)
        if desc.text_content().strip().find(' 2.02') > -1:
            lnk = 'https://www.sec.gov' + href.get('href')
            con = Documents(lnk).content
            if con['Accepted'][:4] == '2014':
                break  # filings are newest-first; stop once 2014 is reached
            sleep(0.2)  # be polite to EDGAR between document fetches
            dfSECFileTimes = dfSECFileTimes.append(
                pd.DataFrame([[row.ticker, con['Accepted']]], columns=dfsftcols))
            print(" ".join([row.ticker, con['Accepted'], lnk]))