Code example #1
from edgar import Company, Edgar

# assumes a module-level Edgar() index for the CIK-to-name lookup
edgar = Edgar()


def get_company_by_cik(cik):
    # EDGAR expects a 10-digit, zero-padded CIK
    cik = str(cik).zfill(10)
    name = edgar.get_company_name_by_cik(cik)
    company = Company(name, cik)

    return company
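
A minimal usage sketch, assuming the edgar package above; Apple's CIK comes from code example #4:

company = get_company_by_cik(320193)  # the int CIK is padded to "0000320193"
print(company.name, company.cik)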
Code example #2
from edgar import Company


def get_10K_doc_raw(name, cik):
    """
    Get the latest 10-K filing document for a given company
    using the edgar package.
    """
    company = Company(name, cik)
    # older route: fetch the filing index first, then pull documents from it
    # tree = company.get_all_filings(filing_type="10-K")
    # docs = Company.get_documents(tree, no_of_documents=1)
    docs = company.get_10Ks(no_of_documents=1)
    return docs
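
A quick usage sketch; the Oracle name and CIK come from code example #11:

docs = get_10K_doc_raw("Oracle Corp", "0001341439")
print(docs)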
Code example #3
import re

from edgar import Company, TXTML


def findWord(comp, cik):
    """Report whether a company's latest 10-K mentions 'blockchain'."""
    try:
        company = Company(comp, cik)
        doc = company.get_10K()
        text = TXTML.parse_full_10K(doc)
    except Exception:
        # the filing could not be fetched or parsed
        return "No 10-K"
    if re.search('blockchain', text, re.IGNORECASE):
        return "exists"
    return "does not"
Code example #4
from pathlib import Path

from edgar import Company


def main():
    # establish a list of companies to extract data from
    company_list = [('AMAZON COM INC', '0001018724'),
                    ('Apple Inc.', '0000320193')]

    # iterate through the companies, calling the get_xbrl helper
    # (defined elsewhere in the project) on each
    xbrl_files = [get_xbrl(Company(name, cik)) for name, cik in company_list]

    # fill a pandas DataFrame with the first company's segment data
    segment_df = xbrl_to_df(xbrl_files[0])

    segment_df.to_csv(Path.cwd() / 'SegmentData.csv', index=False)
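
get_xbrl and xbrl_to_df are project helpers that are not shown here. A plausible sketch of get_xbrl, assuming it mirrors the XBRL retrieval commented out in code example #11:

from edgar import XBRL

def get_xbrl(company):
    # pull the XBRL instance document attached to the latest 10-K
    results = company.get_data_files_from_10K("EX-101.INS", isxml=True)
    return XBRL(results[0])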
Code example #5
from edgar import Company


# clean_filings and log_assert_type are project helpers defined elsewhere
def get_filing_metadata(context, name: str, cik: str, filing: str,
                        no_filings: int):
    comp = Company(name, cik)
    tree = comp.get_all_filings(filing)
    docs = comp.get_documents(tree, no_of_documents=no_filings, as_documents=True)

    filings = []

    # TODO #38 change return method to yield AssetMaterialization()
    for document in docs:
        filings.append(clean_filings(document, cik, filing))

    context.log.info(log_assert_type(filings, dict))
    return filings
Code example #6
import random
import time

from edgar import Company, TXTML


def pull_10K(company_name, company_id):
    company = Company(company_name, company_id)
    tree = company.get_all_filings(filing_type="10-K")
    # wait a random interval so repeated calls don't hammer EDGAR
    time.sleep(random.randint(1, 25))  # seconds
    docs = Company.get_documents(tree, no_of_documents=3)
    text_l = []
    for doc in docs:
        try:
            text_l.append(TXTML.parse_full_10K(doc))
        except IndexError:
            # some filings fail to parse; skip them
            pass
    return text_l
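
A usage sketch, borrowing Amazon's identifiers from code example #4:

texts = pull_10K('AMAZON COM INC', '0001018724')
print(len(texts), '10-K filings parsed')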
Code example #7
from edgar import Company


def file_date(com, cik, no_docs):
    """
    Pull only the filing dates of a company's 10-Ks.
    Serves as the date of measurement for analyzing returns.
    """
    company = Company(com, cik)
    tree = company.get_all_filings(filing_type="10-K")
    docs = Company.get_documents(tree,
                                 no_of_documents=no_docs,
                                 as_documents=True)
    dates = []
    for doc in docs:
        dates.append(doc.content['Filing Date'])

    return dates
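
A usage sketch, again with the Oracle identifiers from code example #11:

dates = file_date("Oracle Corp", "0001341439", 3)
print(dates)  # a list of filing-date strings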
Code example #8
from edgar import Company, TXTML


def pull_10K(name, company_id):
    '''
    We use this function to fetch the filings.
    We run it while iterating over our list of tickers;
    each ticker gets parsed and collected into a dataframe.
    '''
    company = Company(name, company_id)
    tree = company.get_all_filings(filing_type="10-K")

    docs = Company.get_documents(tree, no_of_documents=6)
    # print("checkpoint: retrieving documents...")
    text_l = []
    for doc in docs:
        try:
            text_l.append(TXTML.parse_full_10K(doc))
        except IndexError:
            # some filings fail to parse; skip them
            pass
    return text_l
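
The docstring mentions collecting results into a dataframe; a minimal sketch of that outer loop, with an illustrative tickers list of (name, CIK) pairs:

import pandas as pd

tickers = [('AMAZON COM INC', '0001018724'), ('Apple Inc.', '0000320193')]
rows = [{'name': name, 'filings': pull_10K(name, cik)} for name, cik in tickers]
df = pd.DataFrame(rows)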
Code example #9
from edgar import Company


def has_ex_10(edgar):
    company_df = init_edgar_df(edgar)  # project helper defined elsewhere
    company_df['ex-10'] = False

    for idx, row in company_df.drop_duplicates(
            subset='cik', keep='first').iloc[4000:7000].iterrows():
        cik = row['cik']

        # initialize a Company instance
        company = Company(name=edgar.get_company_name_by_cik(cik), cik=cik)

        # get all the "EX-10" type documents from the company's 10-K
        documents = company.get_document_type_from_10K('EX-10',
                                                       no_of_documents=1)

        if documents:
            company_df.at[idx, 'ex-10'] = True

    ex_10_df = company_df[company_df['ex-10']]
    ex_10_df.to_csv(
        '/Users/sorenlittle/PycharmProjects/edgar_spacy_training/ex_10_df/ex_10_df_4000_7000.csv'
    )
Code example #10
from edgar import Company, XBRL, XBRLElement, TXTML, Edgar, Document
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
import sys

# company = Company("INTERNATIONAL BUSINESS MACHINES CORP", "0000051143")
# company2 = Company("twitter", "0001418091")
# company3 = Company("Oracle Corp", "0001341439")
company4 = Company("GOOGLE INC", "0001288776")

# edgar = Edgar()
# possible_companies = edgar.find_company_name("Cisco System")
#
# print(possible_companies)

doc = company4.get_10K()
text = TXTML.parse_full_10K(doc)

print('parsed 10-K text; writing to file')

with open("text2.txt", "w+") as f:
    f.write(text)

# f = open('text.txt', 'r')
# for line in f:
#     print(line)
#     print()
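
The sumy imports above go unused in this snippet; a minimal sketch of how they could summarize the saved filing text, following standard sumy usage:

parser = PlaintextParser.from_file("text2.txt", Tokenizer("english"))
stemmer = Stemmer("english")
summarizer = TextRankSummarizer(stemmer)
summarizer.stop_words = get_stop_words("english")

# print a 10-sentence TextRank summary of the 10-K
for sentence in summarizer(parser.document, 10):
    print(sentence)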
Code example #11
from edgar import Company, TXTML, XBRLElement, XBRL, Edgar

db = Edgar()
comp_name = 'TESLA, INC.'

company = Company(comp_name, db.all_companies_dict[comp_name])
'''
company = Company("Oracle Corp", "0001341439")
tree = company.get_all_filings(filing_type = "10-K")
docs = Company.get_documents(tree, no_of_documents=5)
print (docs)

text = TXTML.parse_full_10K(docs[0])
#print (text)
#company = edgar.Company(Ticker,"21344")
#print company


company = Company("Oracle Corp", "0001341439")
results = company.get_data_files_from_10K("EX-101.INS", isxml=True)
xbrl = XBRL(results[0])
element = XBRLElement(xbrl.relevant_children_parsed[15]).to_dict()  # returns a dictionary of name, value, and schemaRef
print(element)
'''
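
A short follow-up sketch applying the commented-out 10-K pattern to the Tesla instance created above:

doc = company.get_10K()
text = TXTML.parse_full_10K(doc)
print(text[:500])  # first 500 characters of the filing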
Code example #12
    def search_company(self, name, cik,
                       filing_type, filing_subtype, no_of_entries, filing_date_before, filing_pattern,
                       filing_rsrc_cache):
        base_url = self.aconfig['args'].endpoint
        acquirePatterns = OrderedDict()

        if len(filing_pattern) == 0 and not filing_rsrc_cache:
            print("Ambiguous options: no search patterns (-P) and no resource download (-d). Choose one mode.")
            return

        for pattern in filing_pattern:
            acquirePatterns[pattern] = re.compile(pattern)

        self.alogger.debug("Name:{0} CIK:{1} Filing:{2} Subtype:{3}".format(name, cik, filing_type, filing_subtype))
        company = Company(name, cik)

        print("Filings endpoint:", company.get_filings_url())
        tree = company.get_all_filings(filing_type=filing_type,
                                       no_of_entries=no_of_entries, prior_to=filing_date_before)

        url_groups = company._group_document_type(tree, filing_type)
        result = OrderedDict()
        for url_group in url_groups:
            for url in url_group:
                url = base_url + url
                self.alogger.debug("In Content page: {0} ".format(url))
                content_page = Company.get_request(url)
                try:
                    table = content_page.find_class("tableFile")[0]
                    for row in table.getchildren():

                        # Match on 4th column of the row `Type`
                        if filing_subtype in row.getchildren()[3].text:
                            self.alogger.debug("Subtype found: {0}".format(row.getchildren()[3].text))
                            href = row.getchildren()[2].getchildren()[0].attrib["href"]
                            href_txt = row.getchildren()[2].getchildren()[0].text_content()

                            if href and not href_txt:
                                self.alogger.debug(" but no link for the resource posted. skipping")
                                continue

                            # SEC XRBL. Remove that cruft, get raw document if applicable
                            href = href.replace("/ix?doc=", "")
                            href = base_url + href

                            self.alogger.debug("Processing resource: {0}".format(href))
                            # Fetch the filing doc and process
                            if filing_rsrc_cache:
                                rsrc_cache_path = urlparse(href).path.strip("/")
                                rsrc_cache_dir = os.path.dirname(rsrc_cache_path)
                                r = requests.get(href)
                                self.alogger.debug("Making repository structure")
                                os.makedirs(rsrc_cache_dir, exist_ok=True)
                                print("Storing {} from {} locally: {}".format(href_txt, href, rsrc_cache_path))
                                with open(rsrc_cache_path, 'wb') as f:
                                    f.write(r.content)
                            else:
                                print("Working on {} ...".format(href))
                                doc = Company.get_request(href)
                                tree_str = str(etree.tostring(doc), 'utf-8')
                                tree_str_text = html2text.html2text(tree_str)
                                result[href] = tree_str_text

                except IndexError:
                    # page had no "tableFile" listing; skip it
                    pass

        if not filing_rsrc_cache and len(filing_pattern) != 0:
            self.alogger.debug("Matched filing types count: {} ".format(len(result)))

            self.alogger.debug("Performing pattern matching")
            for filing_resource, filing_text in result.items():
                for pattern, cpattern in acquirePatterns.items():
                    if re.search(cpattern, filing_text):
                        self.alogger.debug("Pattern Matches: {0}".format(filing_resource))
                        self.search_string(filing_text, 1, 1, pattern)
Code example #13
from edgar import Company, TXTML
import re
import pandas as pd

df = pd.read_excel(r'companylist.xls')

expense_estimates = []
for i in df.index:
    print(expense_estimates)
    CIK_string = df['CIK'][i].split("; ")
    print(df['Company Name'][i])
    company = Company("df['Company Name'][i]", CIK_string[0])
    try:
        doc = company.get_10K()
        text = TXTML.parse_full_10K(doc)
    except IndexError:
        expense_estimates.append(float("NaN"))
        continue
    if 'hipping' not in text:  # matches both 'Shipping' and 'shipping'
        expense_estimates.append(float("NaN"))
        continue
    matches = [m.start() for m in re.finditer('hipping', text)]
    #print(matches)
    string = ""
    est_available = False
    for pos in matches:
        # keep the first mention with a dollar figure nearby
        if '$' in text[pos:pos + 50]:
            string = text[pos:pos + 200]
            est_available = True
            break
    if not est_available:
        expense_estimates.append(float("NaN"))
        continue
    expense_estimates.append(string)
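
A possible follow-up, with hypothetical column and file names, to attach the collected estimates to the source rows and save them:

df['shipping_estimate'] = expense_estimates
df.to_csv('shipping_estimates.csv', index=False)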
Code example #14
import pandas as pd
from time import sleep

from edgar import Company, Documents  # assuming Documents is the edgar helper used below

# dfmap is the SEC ticker-to-CIK master file (columns: ticker, id), loaded elsewhere;
# zero-pad the CIK to the 10 digits EDGAR expects
dfmap['id'] = dfmap['id'].astype(str).str.zfill(10)

#read the source list of tickers
dft = pd.read_csv('et.csv', header=None)
dft.columns = ['ticker']

#join with the sec ticker master file to add the 'id' column
dft = dft.merge(dfmap, on='ticker', how='inner')
dft = dft.drop_duplicates()

dfsftcols = ['ticker', 'earn_datetime']
dfSECFileTimes = pd.DataFrame(columns=dfsftcols)

for row in dft.itertuples():
    print(row.ticker + ' ' + row.id)
    company = Company(row.ticker, row.id)
    tree = company.get_all_filings(filing_type="8-K")
    hrefs = tree.xpath('//*[@id="documentsbutton"]')
    descs = tree.xpath('//div[4]/div[4]//td[3]')

    for i in zip(descs, hrefs):
        if i[0].text_content().strip().find(' 2.02') > -1:
            lnk = 'https://www.sec.gov' + i[1].get('href')
            con = Documents(lnk).content
            # filings are listed newest first; stop once we reach 2014
            if con['Accepted'][:4] == '2014':
                break
            sleep(0.2)  # throttle requests to EDGAR
            # DataFrame.append was removed in pandas 2.x; concatenate instead
            dfSECFileTimes = pd.concat([
                dfSECFileTimes,
                pd.DataFrame([[row.ticker, con['Accepted']]],
                             columns=dfsftcols)
            ])
            print(" ".join([row.ticker, con['Accepted'], lnk]))