def get_name_of_company_from_cik(CIK):
    '''
    look up the company name that the EDGAR search page reports for a CIK.

    a name that contains at least one ASCII letter is also recorded in the
    corpus' name/CIK mapping. whatever the lookup produced is returned
    either way, which may be None.
    '''
    search_page = _pull_edgar_search_page(CIK)
    company_name = _process_url_into_soup_and_get_data(search_page, _get_company_name_from_soup)

    # nothing was extracted from the page: hand the None straight back
    if company_name is None:
        return company_name

    # only names with some alphabetic content are worth remembering
    if re.search("[A-Za-z]+", company_name):
        CorpusAccess.write_company_name_and_cik_mapping_to_corpus(CIK, company_name)

    return company_name
def get_cik_of_company_from_name(name):
    '''
    given a company name, return the company's CIK.

    a CIK that was found is also recorded, together with the given name,
    in the corpus' name/CIK mapping. None is returned when no CIK could
    be extracted.
    '''
    search_page = _pull_edgar_search_page(company_name=name)

    # this is only going to work when there is no ambiguity as to
    # which CIK corresponds to this given company name; if a search page
    # is encountered, we return None!
    found_cik = _process_url_into_soup_and_get_data(search_page, _get_cik_from_soup)
    if found_cik is None:
        return None

    CorpusAccess.write_company_name_and_cik_mapping_to_corpus(found_cik, name)
    return found_cik
def _character_count_test(CIK, filing_year, new_data, corpus_file): parser_alpha_numeric_count = Utilities.get_alpha_numeric_count(''.join(blob for blob in new_data)) with open(corpus_file, 'r') as f: text_from_file = f.read() file_alpha_numeric_count = Utilities.get_alpha_numeric_count(text_from_file) change = (parser_alpha_numeric_count - file_alpha_numeric_count) / file_alpha_numeric_count result = abs(change) < Constants.REGRESSION_CHAR_COUNT_CHANGE_THRESHOLD print "CIK:%r, Year:%r, New Count:%r, " % (CIK, filing_year, parser_alpha_numeric_count), print "Corpus Count:%r, Passed:%r" % (file_alpha_numeric_count, result) if result is False: CorpusAccess.write_comparison_to_file(new_data, text_from_file, CIK, filing_year)
def _get_raw_data(CIK, year):
    '''
    process-safe way of accessing a given 10-K as indexed by CIK and
    filing year. method will store the data to disk if it's not
    already there.

    returns the raw data, or None when the corpus does not hold it and
    no 10-K url is found for the CIK/year.
    an exception raised while reading, downloading or storing propagates
    to the caller; the mutex is released before it does.
    '''
    # maintain exclusive zone when acquiring raw data.
    # this section of the code could, based on OS scheduling, easily
    # lead to multiple download attempts of the same data.
    _corpus_access_mutex.acquire()
    try:
        raw_data = CorpusAccess.get_raw_website_data_from_corpus(CIK=CIK, filing_year=year)
        if raw_data is None:
            url = edgar.get_10k_url(CIK=CIK, filing_year=year)
            if url is not None:
                response = urllib2.urlopen(url, timeout=Constants.URL_DOWNLOAD_TIMEOUT_IN_SECS)
                try:
                    raw_data = response.read()
                finally:
                    # do not leave the connection open once the body is read
                    response.close()
                CorpusAccess.write_raw_url_data_to_file(raw_data, CIK=CIK, filing_year=year)
    finally:
        # a failed download (e.g. a timeout) must not leave the mutex
        # held, or every later caller would block forever
        _corpus_access_mutex.release()

    return raw_data
def _litigation_footnote_unit_test(CIK, filing_year, corpus_file):
    '''
    parse only the litigation footnotes of one filing and run the
    character count regression check on them against corpus_file.

    processed text and company name that the corpus did not hold before
    the parse are written to it from the parse result.
    '''
    cached_text = CorpusAccess.get_processed_website_data_from_corpus(CIK, filing_year)
    known_name = CorpusAccess.get_company_name_from_corpus(CIK)

    parsed = Litigation10KParsing.parse(CIK, filing_year, known_name, cached_text,
                                        get_litigation_footnotes_only=True)

    # back-fill whatever the corpus was missing going into the parse
    text_was_missing = cached_text is None
    name_was_missing = known_name is None

    if text_was_missing:
        CorpusAccess.write_processed_url_data_to_file(data=parsed.processed_text,
                                                      CIK=parsed.CIK,
                                                      filing_year=parsed.filing_year)
    if name_was_missing:
        CorpusAccess.write_company_name_and_cik_mapping_to_corpus(parsed.CIK, parsed.company_name)

    _character_count_test(CIK, filing_year, parsed.legal_note_mentions, corpus_file)
def _write_files_to_corpus(root_path, cik): cik = Utilities.format_CIK(cik) results = _get_results(cik, 2004, 2012) name = CorpusAccess.get_company_name_from_corpus(cik) name = re.sub("\/", "", name) name = name.strip() folder_name = cik + " - " + name cik_path = os.path.join(root_path, folder_name) if not os.path.exists(cik_path): os.makedirs(cik_path) print "Writing to:", cik_path _write_year_files(cik_path, results) _write_all_file(cik_path, results)
def main(): CIK = Utilities.format_CIK('0000859475') for year in xrange(2004, 2012 + 1): print "Begin:\tCIK:%s\t%s" % (CIK, year) try: processed_data = CorpusAccess.get_processed_website_data_from_corpus(CIK, year) company_name = CorpusAccess.get_company_name_from_corpus(CIK) results = Litigation10KParsing.parse(CIK, year, company_name, processed_website_data=processed_data) print "Wrote mapping:", if CorpusAccess.get_company_name_from_corpus(CIK) is None: CorpusAccess.write_company_name_and_cik_mapping_to_corpus(CIK, results.company_name) print "\tYES" else: print "\tNO" print "Wrote Processed URL Data: ", if processed_data is None: CorpusAccess.write_processed_url_data_to_file(data=results.processed_text, CIK=results.CIK, filing_year=results.filing_year) print "\tYES" else: print "\tNO" print "Wrote Legal Proceeding Data: ", if results.legal_proceeding_mention is not None: CorpusAccess.write_to_legal_proceeding_corpus(CIK=results.CIK, data=results.legal_proceeding_mention, filing_year=results.filing_year) print "\tYES" else: print "\tNO" print "Wrote Legal Footnote Data: ", if len(results.legal_note_mentions) > 0: CorpusAccess.write_to_litigation_footnote_corpus(results.legal_note_mentions, results.CIK, results.filing_year) print "\tYES" else: print "\tNO" except Exception as exception: print "Exception: ", exception traceback.print_exc()
def _get_results(cik, start_year, end_year): results = dict() cik = Utilities.format_CIK(cik) for year in xrange(start_year, end_year + 1): year = str(year) print "Processing %s %s" % (cik, year) lfp_path = os.path.join(Constants.PATH_TO_LEGAL_FOOTNOTE_CORPUS, cik, year + '.txt') lpp_path = os.path.join(Constants.PATH_TO_LEGAL_PROCEEDING_CORPUS, cik, year + '.txt') processed_data = CorpusAccess.get_processed_website_data_from_corpus(cik, year) company_name = CorpusAccess.get_company_name_from_corpus(cik) get_lpp_only = False get_lfp_only = False if os.path.exists(lfp_path): get_lpp_only = True if os.path.exists(lpp_path): get_lfp_only = True try: result = Litigation10KParsing.parse(cik, year, company_name, processed_website_data=processed_data, \ get_legal_proceeding_only=get_lpp_only, get_litigation_footnotes_only=get_lfp_only) if get_lpp_only: with open(lfp_path) as f: result.legal_note_mentions = f.read() else: if result.legal_note_mentions is not None: try: CorpusAccess.write_to_litigation_footnote_corpus(result.legal_note_mentions, result.CIK, result.filing_year) except Exception as exception: print "Exception: ", exception traceback.print_exc() if get_lfp_only: with open(lpp_path) as f: result.legal_proceeding_mention = f.read() else: if result.legal_proceeding_mention is not None: try: CorpusAccess.write_to_legal_proceeding_corpus(CIK=result.CIK, \ data=result.legal_proceeding_mention, filing_year=result.filing_year) except Exception as exception: print "Exception: ", exception traceback.print_exc() if company_name is None and result.company_name is not None: try: CorpusAccess.write_company_name_and_cik_mapping_to_corpus(result.CIK, result.company_name) except Exception as exception: print "Exception: ", exception traceback.print_exc() if processed_data is None and result.processed_text is not None: try: CorpusAccess.write_processed_url_data_to_file(data=result.processed_text, CIK=result.CIK, filing_year=result.filing_year) except Exception as exception: 
print "Exception: ", exception traceback.print_exc() results[year] = result except Exception as exception: print "Exception: ", exception traceback.print_exc() return results
def run_regression_test_suite(legal_footnotes_only, legal_proceeding_only):
    '''
    wipe the failed unit tests left by an earlier run, then run the
    regression suites.

    legal_footnotes_only  -- when true, the legal proceeding suite is skipped
    legal_proceeding_only -- when true, the legal footnotes suite is skipped

    with both flags true neither suite runs.
    '''
    CorpusAccess.wipe_existing_failed_unit_tests()

    # each suite is paired with the flag that switches it off
    suites = (
        (legal_footnotes_only, _run_legal_proceeding_test_suite),
        (legal_proceeding_only, _run_legal_footnotes_test_suite),
    )
    for skip, run_suite in suites:
        if skip:
            continue
        run_suite()