def __init__(self, index, date_field, CIK, plaintiff, case_name):
    '''
    Build one litigation record.

    index, date_field, plaintiff and case_name are stored exactly as given;
    CIK is stored as returned by Utilities.format_CIK.
    '''
    self.index = index
    self.CIK = Utilities.format_CIK(CIK)
    self.date_field = date_field
    self.plaintiff, self.case_name = plaintiff, case_name
    # Starts out as an empty set on every new record.
    self._years_in_which_litigation_is_mentioned = set()
def __init__(self, CIK, filing_year, company_name, processed_text):
    '''
    Build the container for one filing's parse output.

    CIK and filing_year are stored as returned by Utilities.format_CIK and
    Utilities.sanitize_filing_year; company_name and processed_text are
    stored as given.
    '''
    self.CIK = Utilities.format_CIK(CIK)
    self.filing_year = Utilities.sanitize_filing_year(filing_year)
    self.company_name = company_name
    # Both mention fields start as None until something assigns them.
    self.legal_proceeding_mention = self.legal_note_mentions = None
    self.processed_text = processed_text
def get_raw_website_data_from_corpus(CIK, filing_year):
    '''
    Return the contents of the raw-data file stored for a CIK and filing
    year (read in binary mode), or None when that file does not exist.

    The file is looked up at PATH_TO_RAW_URL_DATA/<CIK>/<filing_year>.txt.
    '''
    company_dir = Utilities.format_CIK(CIK)
    year = Utilities.sanitize_filing_year(filing_year)
    file_name = str(year) + ".txt"
    raw_path = os.path.join(Constants.PATH_TO_RAW_URL_DATA, company_dir, file_name)

    if not os.path.exists(raw_path):
        return None
    with open(raw_path, 'rb') as raw_file:
        return raw_file.read()
def get_company_name_from_corpus(CIK):
    '''
    Look up the company name recorded for a CIK in the name/CIK mapping file.

    Each line of the mapping file is split on
    Constants.COMPANY_NAME_AND_CIK_MAPPING_FILE_DELIMITER; lines that do not
    split into exactly two fields are ignored.

    Returns the name of the first matching line, without its line
    terminator, or None when the mapping file does not exist or holds no
    entry for the CIK.
    '''
    CIK = Utilities.format_CIK(CIK)
    mapping_path = Constants.PATH_TO_COMPANY_NAME_AND_CIK_MAPPING_FILE
    if not os.path.exists(mapping_path):
        return None

    with open(mapping_path, 'r') as f:
        for line in f:
            # Remove the line terminator before splitting; otherwise it is
            # returned as part of the company name.
            data = re.split(Constants.COMPANY_NAME_AND_CIK_MAPPING_FILE_DELIMITER,
                            line.rstrip('\r\n'))
            if len(data) != 2:
                continue
            if data[0] == CIK:
                return data[1]
    return None
def write_to_legal_proceeding_corpus(data, CIK, filing_year):
    '''
    Store legal-proceeding text for one filing by delegating to
    write_data_to_corpus. The resulting layout is:

        <PATH_TO_LEGAL_PROCEEDING_CORPUS>
            CIK_1
                filing_year_1.txt
                filing_year_2.txt

    and so on, one directory per CIK.
    '''
    formatted_cik = Utilities.format_CIK(CIK)
    year = Utilities.sanitize_filing_year(filing_year)
    company_dir = os.path.join(Constants.PATH_TO_LEGAL_PROCEEDING_CORPUS, formatted_cik)
    write_data_to_corpus(data, formatted_cik, year, company_dir)
def write_processed_url_data_to_file(data, CIK, filing_year):
    '''
    Save processed page data under
    PATH_TO_PROCESSED_URL_DATA/<CIK>/<filing_year>.txt.

    The CIK directory is created when missing. An already existing file is
    left untouched, so only the first write for a given CIK/year is stored.
    '''
    formatted_cik = Utilities.format_CIK(CIK)
    year = Utilities.sanitize_filing_year(filing_year)

    company_dir = os.path.join(Constants.PATH_TO_PROCESSED_URL_DATA, formatted_cik)
    if not os.path.exists(company_dir):
        os.mkdir(company_dir)

    target = os.path.join(company_dir, year + ".txt")
    if os.path.exists(target):
        return
    with open(target, 'w') as out:
        out.writelines(data)
def write_data_to_corpus(data, CIK, filing_year, path):
    '''
    Write data to <path>/<filing_year>.txt, creating path (including any
    missing parent directories) first. An existing file is overwritten.

    data        -- text (or sequence of strings) to write; must be non-empty.
    CIK         -- passed through Utilities.format_CIK but not used to build
                   the target path; the caller supplies the directory.
    filing_year -- passed through Utilities.sanitize_filing_year and used as
                   the file name.
    path        -- directory that will hold the file.

    Raises Exception("Nothing to write!") when data is None or empty.
    '''
    if data is None or len(data) == 0:
        raise Exception("Nothing to write!")

    # The formatted CIK is not needed here. The call is kept so that whatever
    # format_CIK does with an unusable CIK still happens before any file is
    # touched. NOTE(review): confirm whether this call is needed at all.
    Utilities.format_CIK(CIK)
    filing_year = Utilities.sanitize_filing_year(filing_year)

    if not os.path.exists(path):
        os.makedirs(path)

    path_with_file = os.path.join(path, filing_year + ".txt")
    # Unconditional write: the file is created or replaced.
    with open(path_with_file, 'w') as f:
        f.writelines(data)
def main(): CIK = Utilities.format_CIK('0000859475') for year in xrange(2004, 2012 + 1): print "Begin:\tCIK:%s\t%s" % (CIK, year) try: processed_data = CorpusAccess.get_processed_website_data_from_corpus(CIK, year) company_name = CorpusAccess.get_company_name_from_corpus(CIK) results = Litigation10KParsing.parse(CIK, year, company_name, processed_website_data=processed_data) print "Wrote mapping:", if CorpusAccess.get_company_name_from_corpus(CIK) is None: CorpusAccess.write_company_name_and_cik_mapping_to_corpus(CIK, results.company_name) print "\tYES" else: print "\tNO" print "Wrote Processed URL Data: ", if processed_data is None: CorpusAccess.write_processed_url_data_to_file(data=results.processed_text, CIK=results.CIK, filing_year=results.filing_year) print "\tYES" else: print "\tNO" print "Wrote Legal Proceeding Data: ", if results.legal_proceeding_mention is not None: CorpusAccess.write_to_legal_proceeding_corpus(CIK=results.CIK, data=results.legal_proceeding_mention, filing_year=results.filing_year) print "\tYES" else: print "\tNO" print "Wrote Legal Footnote Data: ", if len(results.legal_note_mentions) > 0: CorpusAccess.write_to_litigation_footnote_corpus(results.legal_note_mentions, results.CIK, results.filing_year) print "\tYES" else: print "\tNO" except Exception as exception: print "Exception: ", exception traceback.print_exc()
def write_raw_url_data_to_file(data, CIK, filing_year):
    '''
    Save raw page data under PATH_TO_RAW_URL_DATA/<CIK>/<filing_year>.txt.

    The CIK directory is created when missing and an already existing file
    is left untouched. The directory check and the write both run while
    _raw_data_writing_mutex is held; the mutex is released even when the
    directory creation or the write raises.
    '''
    CIK = Utilities.format_CIK(CIK)
    filing_year = Utilities.sanitize_filing_year(filing_year)
    path = os.path.join(Constants.PATH_TO_RAW_URL_DATA, CIK)
    path_with_file = os.path.join(path, filing_year + ".txt")

    _raw_data_writing_mutex.acquire()
    try:
        if not os.path.exists(path):
            os.mkdir(path)
        if not os.path.exists(path_with_file):
            with open(path_with_file, 'w') as f:
                f.writelines(data)
    finally:
        # Without this, one failed write would leave the mutex held and
        # block every later caller.
        _raw_data_writing_mutex.release()
def _write_files_to_corpus(root_path, cik): cik = Utilities.format_CIK(cik) results = _get_results(cik, 2004, 2012) name = CorpusAccess.get_company_name_from_corpus(cik) name = re.sub("\/", "", name) name = name.strip() folder_name = cik + " - " + name cik_path = os.path.join(root_path, folder_name) if not os.path.exists(cik_path): os.makedirs(cik_path) print "Writing to:", cik_path _write_year_files(cik_path, results) _write_all_file(cik_path, results)
def write_comparison_to_file(new_output, old_output, CIK, filing_year): CIK = Utilities.format_CIK(CIK) path = os.path.join(Constants.PATH_TO_FAILED_UNIT_TESTS, CIK) if not os.path.exists(path): os.makedirs(path) log_file = os.path.join(path, filing_year + '.txt') with open(log_file, 'w') as f: print "Writing log of failed unit test to %s" % log_file f.write("OLD:\n") f.writelines(old_output) f.write("\n") f.write("================================================") f.write("\n") f.write("NEW:\n") f.writelines(new_output)
def write_company_name_and_cik_mapping_to_corpus(CIK, company_name):
    '''
    Record (or replace) the company name stored for a CIK in the mapping
    file.

    The whole file is rewritten: existing entries for other CIKs are kept,
    the entry for this CIK is set to company_name with surrounding
    whitespace removed. Lines that do not split into exactly two fields on
    the delimiter are dropped. Reading and rewriting both happen while the
    file lock is held.

    Returns None in every case; nothing is written when company_name or the
    formatted CIK is None.
    '''
    if company_name is None:
        return None
    # Plain strip(): a character list such as "\n\r\t\s" would also remove
    # literal backslashes and the letter "s" from both ends of the name.
    company_name = company_name.strip()

    CIK = Utilities.format_CIK(CIK)
    if CIK is None:
        return None

    mapping_path = Constants.PATH_TO_COMPANY_NAME_AND_CIK_MAPPING_FILE
    delimiter = Constants.COMPANY_NAME_AND_CIK_MAPPING_FILE_DELIMITER
    mapping = {CIK: company_name}

    # Hold the lock across the read as well, so that the entries written
    # back are the ones currently in the file.
    with lockfile.FileLock(mapping_path):
        if os.path.exists(mapping_path):
            with open(mapping_path, 'r') as f:
                for line in f:
                    # Drop the line terminator; it is added again on write
                    # and would otherwise double up.
                    data = re.split(delimiter, line.rstrip('\r\n'))
                    if len(data) != 2:
                        continue
                    if data[0] != CIK:
                        mapping[data[0]] = data[1]

        with open(mapping_path, 'w') as f:
            for key in mapping:
                f.write(key + delimiter + mapping[key] + '\n')
def _get_results(cik, start_year, end_year): results = dict() cik = Utilities.format_CIK(cik) for year in xrange(start_year, end_year + 1): year = str(year) print "Processing %s %s" % (cik, year) lfp_path = os.path.join(Constants.PATH_TO_LEGAL_FOOTNOTE_CORPUS, cik, year + '.txt') lpp_path = os.path.join(Constants.PATH_TO_LEGAL_PROCEEDING_CORPUS, cik, year + '.txt') processed_data = CorpusAccess.get_processed_website_data_from_corpus(cik, year) company_name = CorpusAccess.get_company_name_from_corpus(cik) get_lpp_only = False get_lfp_only = False if os.path.exists(lfp_path): get_lpp_only = True if os.path.exists(lpp_path): get_lfp_only = True try: result = Litigation10KParsing.parse(cik, year, company_name, processed_website_data=processed_data, \ get_legal_proceeding_only=get_lpp_only, get_litigation_footnotes_only=get_lfp_only) if get_lpp_only: with open(lfp_path) as f: result.legal_note_mentions = f.read() else: if result.legal_note_mentions is not None: try: CorpusAccess.write_to_litigation_footnote_corpus(result.legal_note_mentions, result.CIK, result.filing_year) except Exception as exception: print "Exception: ", exception traceback.print_exc() if get_lfp_only: with open(lpp_path) as f: result.legal_proceeding_mention = f.read() else: if result.legal_proceeding_mention is not None: try: CorpusAccess.write_to_legal_proceeding_corpus(CIK=result.CIK, \ data=result.legal_proceeding_mention, filing_year=result.filing_year) except Exception as exception: print "Exception: ", exception traceback.print_exc() if company_name is None and result.company_name is not None: try: CorpusAccess.write_company_name_and_cik_mapping_to_corpus(result.CIK, result.company_name) except Exception as exception: print "Exception: ", exception traceback.print_exc() if processed_data is None and result.processed_text is not None: try: CorpusAccess.write_processed_url_data_to_file(data=result.processed_text, CIK=result.CIK, filing_year=result.filing_year) except Exception as exception: 
print "Exception: ", exception traceback.print_exc() results[year] = result except Exception as exception: print "Exception: ", exception traceback.print_exc() return results