def _litigation_footnote_unit_test(CIK, filing_year, corpus_file):
    """Parse one filing for litigation footnotes and run the character count test.

    The cached processed text and company name are looked up in the corpus
    first and handed to the parser. Whichever of the two was missing from
    the corpus (lookup returned None) is written back from the parse result.
    Finally the parsed footnote mentions are passed, together with
    ``corpus_file``, to ``_character_count_test``.

    Args:
        CIK: company identifier, passed through unchanged to the corpus
            helpers and the parser.
        filing_year: filing year, passed through unchanged.
        corpus_file: forwarded untouched to ``_character_count_test``.
    """
    cached_text = CorpusAccess.get_processed_website_data_from_corpus(CIK, filing_year)
    cached_name = CorpusAccess.get_company_name_from_corpus(CIK)

    parsed = Litigation10KParsing.parse(
        CIK,
        filing_year,
        cached_name,
        cached_text,
        get_litigation_footnotes_only=True)

    # Only fill in corpus entries that were missing before the parse ran.
    if cached_text is None:
        CorpusAccess.write_processed_url_data_to_file(
            data=parsed.processed_text,
            CIK=parsed.CIK,
            filing_year=parsed.filing_year)
    if cached_name is None:
        CorpusAccess.write_company_name_and_cik_mapping_to_corpus(
            parsed.CIK, parsed.company_name)

    _character_count_test(CIK, filing_year, parsed.legal_note_mentions, corpus_file)
def main(): CIK = Utilities.format_CIK('0000859475') for year in xrange(2004, 2012 + 1): print "Begin:\tCIK:%s\t%s" % (CIK, year) try: processed_data = CorpusAccess.get_processed_website_data_from_corpus(CIK, year) company_name = CorpusAccess.get_company_name_from_corpus(CIK) results = Litigation10KParsing.parse(CIK, year, company_name, processed_website_data=processed_data) print "Wrote mapping:", if CorpusAccess.get_company_name_from_corpus(CIK) is None: CorpusAccess.write_company_name_and_cik_mapping_to_corpus(CIK, results.company_name) print "\tYES" else: print "\tNO" print "Wrote Processed URL Data: ", if processed_data is None: CorpusAccess.write_processed_url_data_to_file(data=results.processed_text, CIK=results.CIK, filing_year=results.filing_year) print "\tYES" else: print "\tNO" print "Wrote Legal Proceeding Data: ", if results.legal_proceeding_mention is not None: CorpusAccess.write_to_legal_proceeding_corpus(CIK=results.CIK, data=results.legal_proceeding_mention, filing_year=results.filing_year) print "\tYES" else: print "\tNO" print "Wrote Legal Footnote Data: ", if len(results.legal_note_mentions) > 0: CorpusAccess.write_to_litigation_footnote_corpus(results.legal_note_mentions, results.CIK, results.filing_year) print "\tYES" else: print "\tNO" except Exception as exception: print "Exception: ", exception traceback.print_exc()
def _get_results(cik, start_year, end_year): results = dict() cik = Utilities.format_CIK(cik) for year in xrange(start_year, end_year + 1): year = str(year) print "Processing %s %s" % (cik, year) lfp_path = os.path.join(Constants.PATH_TO_LEGAL_FOOTNOTE_CORPUS, cik, year + '.txt') lpp_path = os.path.join(Constants.PATH_TO_LEGAL_PROCEEDING_CORPUS, cik, year + '.txt') processed_data = CorpusAccess.get_processed_website_data_from_corpus(cik, year) company_name = CorpusAccess.get_company_name_from_corpus(cik) get_lpp_only = False get_lfp_only = False if os.path.exists(lfp_path): get_lpp_only = True if os.path.exists(lpp_path): get_lfp_only = True try: result = Litigation10KParsing.parse(cik, year, company_name, processed_website_data=processed_data, \ get_legal_proceeding_only=get_lpp_only, get_litigation_footnotes_only=get_lfp_only) if get_lpp_only: with open(lfp_path) as f: result.legal_note_mentions = f.read() else: if result.legal_note_mentions is not None: try: CorpusAccess.write_to_litigation_footnote_corpus(result.legal_note_mentions, result.CIK, result.filing_year) except Exception as exception: print "Exception: ", exception traceback.print_exc() if get_lfp_only: with open(lpp_path) as f: result.legal_proceeding_mention = f.read() else: if result.legal_proceeding_mention is not None: try: CorpusAccess.write_to_legal_proceeding_corpus(CIK=result.CIK, \ data=result.legal_proceeding_mention, filing_year=result.filing_year) except Exception as exception: print "Exception: ", exception traceback.print_exc() if company_name is None and result.company_name is not None: try: CorpusAccess.write_company_name_and_cik_mapping_to_corpus(result.CIK, result.company_name) except Exception as exception: print "Exception: ", exception traceback.print_exc() if processed_data is None and result.processed_text is not None: try: CorpusAccess.write_processed_url_data_to_file(data=result.processed_text, CIK=result.CIK, filing_year=result.filing_year) except Exception as exception: 
print "Exception: ", exception traceback.print_exc() results[year] = result except Exception as exception: print "Exception: ", exception traceback.print_exc() return results