コード例 #1
0
ファイル: edgar.py プロジェクト: mchrzanowski/SEC10KParser
def get_name_of_company_from_cik(CIK):
    ''' look up the company name that belongs to the given CIK.

        the search page for the CIK is pulled and handed, together with
        the name-extracting callback, to the soup-processing helper.
        a name holding at least one ASCII letter is additionally written
        to the corpus as a CIK/name mapping. whatever the helper produced
        (possibly None) is returned unchanged.
    '''
    search_page = _pull_edgar_search_page(CIK)
    name = _process_url_into_soup_and_get_data(search_page, _get_company_name_from_soup)

    # nothing worth caching: no name at all, or a name without any letters.
    if name is None or not re.search("[A-Za-z]+", name):
        return name

    CorpusAccess.write_company_name_and_cik_mapping_to_corpus(CIK, name)
    return name
コード例 #2
0
ファイル: edgar.py プロジェクト: mchrzanowski/SEC10KParser
def get_cik_of_company_from_name(name):
    ''' given a company name, return the company's CIK.

        None is returned when the lookup does not yield a CIK; a CIK that
        is found is also written to the corpus as a CIK/name mapping.
    '''
    search_page = _pull_edgar_search_page(company_name=name)

    # the lookup only succeeds when the name maps to exactly one CIK;
    # when a search (results) page comes back instead, the outcome is None.
    cik = _process_url_into_soup_and_get_data(search_page, _get_cik_from_soup)
    if cik is None:
        return None

    CorpusAccess.write_company_name_and_cik_mapping_to_corpus(cik, name)
    return cik
コード例 #3
0
def _character_count_test(CIK, filing_year, new_data, corpus_file):
    
    parser_alpha_numeric_count =  Utilities.get_alpha_numeric_count(''.join(blob for blob in new_data))

    with open(corpus_file, 'r') as f:
        
        text_from_file = f.read()
        file_alpha_numeric_count = Utilities.get_alpha_numeric_count(text_from_file)
    
    change = (parser_alpha_numeric_count - file_alpha_numeric_count) / file_alpha_numeric_count
    result = abs(change) < Constants.REGRESSION_CHAR_COUNT_CHANGE_THRESHOLD
    
    print "CIK:%r, Year:%r, New Count:%r, " % (CIK, filing_year, parser_alpha_numeric_count),
    print "Corpus Count:%r, Passed:%r" % (file_alpha_numeric_count, result)
    
    if result is False:
        CorpusAccess.write_comparison_to_file(new_data, text_from_file, CIK, filing_year)
コード例 #4
0
def _get_raw_data(CIK, year):
    '''
        process-safe way of accessing a given 10-K as indexed
        by CIK and filing year. method will store the data to disk
        if it's not already there.

        returns the raw data, or None when the corpus has no copy and
        no URL could be determined for the filing.
    '''
    # maintain exclusive zone when acquiring raw data.
    # this section of the code could, based on OS scheduling, easily
    # lead to multiple download attempts of the same data.
    _corpus_access_mutex.acquire()
    try:
        raw_data = CorpusAccess.get_raw_website_data_from_corpus(CIK=CIK, filing_year=year)

        if raw_data is None:
            url = edgar.get_10k_url(CIK=CIK, filing_year=year)

            if url is not None:
                response = urllib2.urlopen(url, timeout=Constants.URL_DOWNLOAD_TIMEOUT_IN_SECS)
                try:
                    raw_data = response.read()
                finally:
                    # do not leak the connection, even if the read fails.
                    response.close()

                CorpusAccess.write_raw_url_data_to_file(raw_data, CIK=CIK, filing_year=year)
    finally:
        # always hand the mutex back; a failed download or disk write
        # must not leave every other worker blocked on it.
        _corpus_access_mutex.release()

    return raw_data
コード例 #5
0
def _litigation_footnote_unit_test(CIK, filing_year, corpus_file):
    '''
        parse the litigation footnotes of one filing and run the
        character-count check of the result against corpus_file.

        processed text and company name are taken from the corpus when
        present; whichever one was missing is written back from the
        parse result before the check runs.
    '''
    cached_text = CorpusAccess.get_processed_website_data_from_corpus(CIK, filing_year)
    cached_name = CorpusAccess.get_company_name_from_corpus(CIK)

    parsed = Litigation10KParsing.parse(
        CIK, filing_year, cached_name, cached_text,
        get_litigation_footnotes_only=True)

    # fill in whatever the corpus did not have yet.
    if cached_text is None:
        CorpusAccess.write_processed_url_data_to_file(
            data=parsed.processed_text,
            CIK=parsed.CIK,
            filing_year=parsed.filing_year)

    if cached_name is None:
        CorpusAccess.write_company_name_and_cik_mapping_to_corpus(parsed.CIK, parsed.company_name)

    _character_count_test(CIK, filing_year, parsed.legal_note_mentions, corpus_file)
コード例 #6
0
def _write_files_to_corpus(root_path, cik):

    cik = Utilities.format_CIK(cik)
    results = _get_results(cik, 2004, 2012)

    name = CorpusAccess.get_company_name_from_corpus(cik)

    name = re.sub("\/", "", name)
    name = name.strip()

    folder_name = cik + " - " + name

    cik_path = os.path.join(root_path, folder_name)
    if not os.path.exists(cik_path):
        os.makedirs(cik_path)

    print "Writing to:", cik_path

    _write_year_files(cik_path, results)
    _write_all_file(cik_path, results)
コード例 #7
0
def main():
    
    CIK = Utilities.format_CIK('0000859475')
    
    for year in xrange(2004, 2012 + 1):
                
        print "Begin:\tCIK:%s\t%s" % (CIK, year)
        
        try:
            
            processed_data = CorpusAccess.get_processed_website_data_from_corpus(CIK, year)
            
            company_name = CorpusAccess.get_company_name_from_corpus(CIK)

            results = Litigation10KParsing.parse(CIK, year, company_name, processed_website_data=processed_data)

            print "Wrote mapping:",
            if CorpusAccess.get_company_name_from_corpus(CIK) is None:
                CorpusAccess.write_company_name_and_cik_mapping_to_corpus(CIK, results.company_name)
                print "\tYES"
            else:
                print "\tNO"
            
            print "Wrote Processed URL Data: ",
            if processed_data is None:        
                CorpusAccess.write_processed_url_data_to_file(data=results.processed_text, CIK=results.CIK, filing_year=results.filing_year)
                print "\tYES"
            else:
                print "\tNO"
            
            print "Wrote Legal Proceeding Data: ",
            if results.legal_proceeding_mention is not None:
                CorpusAccess.write_to_legal_proceeding_corpus(CIK=results.CIK, data=results.legal_proceeding_mention, filing_year=results.filing_year)
                print "\tYES"
            else:
                print "\tNO"
            
            print "Wrote Legal Footnote Data: ",    
            if len(results.legal_note_mentions) > 0:
                CorpusAccess.write_to_litigation_footnote_corpus(results.legal_note_mentions, results.CIK, results.filing_year)
                print "\tYES"
            else:
                print "\tNO"
            
        except Exception as exception:
            print "Exception: ", exception
            traceback.print_exc()
コード例 #8
0
def _get_results(cik, start_year, end_year):
    
    results = dict()

    cik = Utilities.format_CIK(cik)

    for year in xrange(start_year, end_year + 1):

        year = str(year)
        
        print "Processing %s %s" % (cik, year)
        
        lfp_path = os.path.join(Constants.PATH_TO_LEGAL_FOOTNOTE_CORPUS, cik, year + '.txt')
        lpp_path = os.path.join(Constants.PATH_TO_LEGAL_PROCEEDING_CORPUS, cik, year + '.txt')

        processed_data = CorpusAccess.get_processed_website_data_from_corpus(cik, year)    
        company_name = CorpusAccess.get_company_name_from_corpus(cik)

        get_lpp_only = False
        get_lfp_only = False

        if os.path.exists(lfp_path):
            get_lpp_only = True

        if os.path.exists(lpp_path):
            get_lfp_only = True

        try:
            result = Litigation10KParsing.parse(cik, year, company_name, processed_website_data=processed_data, \
                get_legal_proceeding_only=get_lpp_only, get_litigation_footnotes_only=get_lfp_only)

            if get_lpp_only:
                with open(lfp_path) as f:
                    result.legal_note_mentions = f.read()
            else:
                if result.legal_note_mentions is not None:
                    try:
                        CorpusAccess.write_to_litigation_footnote_corpus(result.legal_note_mentions, result.CIK, result.filing_year)
                    except Exception as exception:
                        print "Exception: ", exception
                        traceback.print_exc()
            if get_lfp_only:
                with open(lpp_path) as f:
                    result.legal_proceeding_mention = f.read()
            else:
                if result.legal_proceeding_mention is not None:
                    try:
                        CorpusAccess.write_to_legal_proceeding_corpus(CIK=result.CIK, \
                            data=result.legal_proceeding_mention, filing_year=result.filing_year)
                    except Exception as exception:
                        print "Exception: ", exception
                        traceback.print_exc()

            if company_name is None and result.company_name is not None:
                try:
                    CorpusAccess.write_company_name_and_cik_mapping_to_corpus(result.CIK, result.company_name)
                except Exception as exception:
                    print "Exception: ", exception
                    traceback.print_exc()

            if processed_data is None and result.processed_text is not None:  
                try:      
                    CorpusAccess.write_processed_url_data_to_file(data=result.processed_text, CIK=result.CIK, filing_year=result.filing_year)
                except Exception as exception:
                    print "Exception: ", exception
                    traceback.print_exc()
            
            results[year] = result

        except Exception as exception:
            print "Exception: ", exception
            traceback.print_exc()

    return results
コード例 #9
0
def run_regression_test_suite(legal_footnotes_only, legal_proceeding_only):
    '''
        wipe the previously recorded failed unit tests, then run the
        regression suites.

        legal_footnotes_only  -- when true, the legal proceeding suite
                                 is skipped.
        legal_proceeding_only -- when true, the legal footnotes suite
                                 is skipped.

        with both flags true neither suite runs; with both false the
        legal proceeding suite runs first, then the footnotes suite.
    '''
    CorpusAccess.wipe_existing_failed_unit_tests()

    if legal_footnotes_only:
        pass
    else:
        _run_legal_proceeding_test_suite()

    if legal_proceeding_only:
        return

    _run_legal_footnotes_test_suite()