Example #1
from edgar import Company

# Note: clean_filings and log_assert_type are project-specific helpers
# assumed to be defined elsewhere in the surrounding codebase.
def get_filing_metadata(context, name: str, cik: str, filing: str,
                        no_filings: int):
    comp = Company(name, cik)
    tree = comp.get_all_filings(filing)
    docs = comp.get_documents(tree, no_filings, True)

    filings = []

    # TODO #38: change return method to yield AssetMaterialization()
    for document in docs:
        filings.append(clean_filings(document, cik, filing))

    context.log.info(log_assert_type(filings, dict))
    return filings
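
A minimal invocation sketch, assuming `context` is a Dagster-style execution context (suggested by the AssetMaterialization TODO); the stub below only mimics the `.log` attribute the function touches, the Oracle name/CIK pair is borrowed from the py-edgar README, and clean_filings and log_assert_type must still be importable for this to run:

import logging

class StubContext:
    # Hypothetical stand-in exposing only the .log attribute used above
    log = logging.getLogger("filings")

filings = get_filing_metadata(StubContext(), "Oracle Corp", "0001341439",
                              "10-K", 3)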
Example #2
import random
import time

from edgar import Company, TXTML

def pull_10K(company_name, company_id):
    company = Company(company_name, company_id)
    tree = company.get_all_filings(filing_type="10-K")
    # Randomized offset apparently meant to throttle EDGAR requests; since
    # pre_time + offset is always ahead of time.time() at this point, the
    # branch effectively always runs. docs is initialized first so the
    # loop below cannot hit a NameError if the guard ever fails.
    docs = []
    pre_time = time.time()
    offset = random.randint(1, 25)  # seconds
    if pre_time + offset > time.time():
        docs = Company.get_documents(tree, no_of_documents=3)
        pre_time = time.time()
    text_l = []
    for doc in docs:
        try:
            text_l.append(TXTML.parse_full_10K(doc))
        except IndexError:
            # Skip filings TXTML cannot parse
            pass
    return text_l
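
A possible invocation, reusing the Oracle name/CIK pair from the py-edgar README:

texts = pull_10K("Oracle Corp", "0001341439")
print("parsed", len(texts), "10-K filings")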
Example #3
from edgar import Company

def file_date(com, cik, no_docs):
    """
    Pull only the filing dates; these serve as the measurement dates
    when analyzing returns.
    """
    # Note: in py-edgar, Company's third positional parameter is the
    # request timeout, so passing no_docs here is likely unintended.
    company = Company(com, cik, no_docs)
    tree = company.get_all_filings(filing_type="10-K")
    docs = Company.get_documents(tree,
                                 no_of_documents=no_docs,
                                 as_documents=True)
    dates = []
    for doc in docs:
        dates.append(doc.content['Filing Date'])

    return dates
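
A sketch of a call, again with the Oracle identifiers from the py-edgar README; each element of the result is the 'Filing Date' string of one 10-K:

dates = file_date("Oracle Corp", "0001341439", 3)
print(dates)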
Example #4
from edgar import Company, TXTML

def pull_10K(name, company_id):
    '''
    Perform the filings download for a single company. Run this function
    while iterating over a list of tickers; each ticker gets parsed and
    the results are collected into a dataframe.
    '''
    company = Company(name, company_id)
    tree = company.get_all_filings(filing_type="10-K")

    docs = Company.get_documents(tree, no_of_documents=6)
    text_l = []
    for doc in docs:
        try:
            text_l.append(TXTML.parse_full_10K(doc))
        except IndexError:
            # Skip filings TXTML cannot parse
            pass
    return text_l
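
The docstring describes iterating over a list of tickers and collecting the parsed filings into a dataframe; a minimal sketch of that outer loop, where the ticker-to-CIK mapping is a hypothetical placeholder:

import pandas as pd

tickers = {"ORCL": ("Oracle Corp", "0001341439")}  # hypothetical mapping
rows = []
for ticker, (name, cik) in tickers.items():
    for text in pull_10K(name, cik):
        rows.append({"ticker": ticker, "text": str(text)})
df = pd.DataFrame(rows)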
Example #5
    # Method excerpt from a larger CLI class; requires re, os, requests,
    # html2text, collections.OrderedDict, urllib.parse.urlparse, lxml.etree,
    # and edgar.Company in the enclosing module.
    def search_company(self, name, cik,
                       filing_type, filing_subtype, no_of_entries,
                       filing_date_before, filing_pattern, filing_rsrc_cache):
        base_url = self.aconfig['args'].endpoint
        acquirePatterns = OrderedDict()

        if len(filing_pattern) == 0 and not filing_rsrc_cache:
            print("Ambiguous options: no search pattern (-P) given and no "
                  "resource download (-d) requested. Choose one mode.")
            return

        for pattern in filing_pattern:
            acquirePatterns[pattern] = re.compile(pattern)

        self.alogger.debug("Name:{0} CIK:{1} Filing:{2} Subtype:{3}".format(name, cik, filing_type, filing_subtype))
        company = Company(name, cik)

        print("Filings endpoint:", company.get_filings_url())
        tree = company.get_all_filings(filing_type=filing_type,
                                       no_of_entries=no_of_entries, prior_to=filing_date_before)

        url_groups = company._group_document_type(tree, filing_type)
        result = OrderedDict()
        for url_group in url_groups:
            for url in url_group:
                url = base_url + url
                self.alogger.debug("In Content page: {0} ".format(url))
                content_page = Company.get_request(url)
                try:
                    table = content_page.find_class("tableFile")[0]
                    for row in table.getchildren():

                        # Match on 4th column of the row `Type`
                        if filing_subtype in row.getchildren()[3].text:
                            self.alogger.debug("Subtype found: {0}".format(row.getchildren()[3].text))
                            href = row.getchildren()[2].getchildren()[0].attrib["href"]
                            href_txt = row.getchildren()[2].getchildren()[0].text_content()

                            if href and not href_txt:
                                self.alogger.debug("Link has no posted resource text; skipping")
                                continue

                            # SEC inline XBRL viewer URL; strip the prefix to get the raw document
                            href = href.replace("/ix?doc=", "")
                            href = base_url + href

                            self.alogger.debug("Processing resource: {0}".format(href))
                            # Fetch the filing doc and process
                            if filing_rsrc_cache:
                                rsrc_cache_path = urlparse(href).path.strip("/")
                                rsrc_cache_dir = os.path.dirname(rsrc_cache_path)
                                r = requests.get(href)
                                self.alogger.debug("Making repository structure")
                                os.makedirs(rsrc_cache_dir, exist_ok=True)
                                print("Storing {} from {} locally: {}".format(href_txt, href, rsrc_cache_path))
                                with open(rsrc_cache_path, 'wb') as f:
                                    f.write(r.content)
                            else:
                                print("Working on {} ...".format(href))
                                doc = Company.get_request(href)
                                tree_str = str(etree.tostring(doc), 'utf-8')
                                tree_str_text = html2text.html2text(tree_str)
                                result[href] = tree_str_text

                except IndexError:
                    # Content page has no "tableFile" table; nothing to process
                    pass

        if not filing_rsrc_cache and len(filing_pattern) != 0:
            self.alogger.debug("Matched filing types count: {} ".format(len(result)))

            self.alogger.debug("Performing pattern matching")
            for filing_resource, filing_text in result.items():
                for pattern, cpattern in acquirePatterns.items():
                    if re.search(cpattern, filing_text):
                        self.alogger.debug("Pattern Matches: {0}".format(filing_resource))
                        self.search_string(filing_text, 1, 1, pattern)
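
A hypothetical call, assuming `client` is an instance of the CLI class this method belongs to (it relies on self.aconfig and self.alogger); all argument values are illustrative only:

client.search_company(
    name="Oracle Corp",
    cik="0001341439",
    filing_type="10-K",
    filing_subtype="10-K",
    no_of_entries=10,
    filing_date_before="20201231",
    filing_pattern=["total revenue"],
    filing_rsrc_cache=False,
)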
Example #6
# Requires pandas as pd, from time import sleep, and from edgar import
# Company, Documents; `dfmap` (the SEC ticker-to-CIK master file) is
# assumed to be loaded earlier.

# Read the source list of tickers
dft = pd.read_csv('et.csv', header=None)
dft.columns = ['ticker']

# Join with the SEC ticker master file to add the 'id' column
dft = dft.merge(dfmap, on='ticker', how='inner')
dft = dft.drop_duplicates()

dfsftcols = ['ticker', 'earn_datetime']
dfSECFileTimes = pd.DataFrame(columns=dfsftcols)

for row in dft.itertuples():
    print(row.ticker + ' ' + row.id)
    company = Company(row.ticker, row.id)
    tree = company.get_all_filings(filing_type="8-K")
    hrefs = tree.xpath('//*[@id="documentsbutton"]')  # links to each filing's document page
    descs = tree.xpath('//div[4]/div[4]//td[3]')      # filing description column

    for desc, href in zip(descs, hrefs):
        # Keep only 8-K filings that report Item 2.02 (Results of Operations)
        if desc.text_content().strip().find(' 2.02') > -1:
            lnk = 'https://www.sec.gov' + href.get('href')
            con = Documents(lnk).content
            # Stop once filings accepted in 2014 are reached
            if con['Accepted'][:4] == '2014':
                break
            sleep(0.2)  # throttle requests
            dfSECFileTimes = dfSECFileTimes.append(
                pd.DataFrame([[row.ticker, con['Accepted']]],
                             columns=dfsftcols))
            print(" ".join([row.ticker, con['Accepted'], lnk]))
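
Note that DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; an equivalent pattern collects the rows in a plain list and builds the frame once after the loop:

records = []  # inside the loop: records.append([row.ticker, con['Accepted']])
dfSECFileTimes = pd.DataFrame(records, columns=dfsftcols)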