def process_filing(s3_path: str, filing_buffer: Union[str, bytes, None] = None, store_raw: bool = False,
                   store_text: bool = False):
    """
    Process a filing from an S3 path or filing buffer.

    Ensures Company and CompanyInfo records exist for the filer, creates the Filing record,
    then creates its document records via create_filing_documents.  The Filing is first saved
    with is_error=True / is_processed=False and only flipped once the documents were created,
    so a failure while creating documents leaves an error-flagged Filing behind.

    :param s3_path: S3 path to process; if filing_buffer is None, the buffer is retrieved from here
    :param filing_buffer: filing contents; if None, they are fetched from s3_path
    :param store_raw: forwarded to create_filing_documents
    :param store_text: forwarded to create_filing_documents and passed to the parser as `extract`
    :return: the saved Filing on success; None if a record for s3_path already exists, the CIK
        cannot be parsed, or the filing / document records could not be created
    """
    # Log entry
    logger.info("Processing filing {0}...".format(s3_path))

    # Check for existing record first; get() either returns a record or raises.
    try:
        filing = Filing.objects.get(s3_path=s3_path)
    except Filing.DoesNotExist:
        logger.info("No existing record found.")
    except Filing.MultipleObjectsReturned:
        logger.error("Multiple existing record found.")
        return None
    else:
        logger.error("Filing {0} has already been created in record {1}".format(s3_path, filing))
        return None

    # Get buffer
    if filing_buffer is None:
        logger.info("Retrieving filing buffer from S3...")
        filing_buffer = openedgar.clients.s3.get_buffer(s3_path)

    # Get main filing data structure
    filing_data = openedgar.parsers.edgar.parse_filing(filing_buffer, extract=store_text)
    if filing_data["cik"] is None:
        logger.error("Unable to parse CIK from filing {0}; assuming broken and halting...".format(s3_path))
        return None

    date_filed = filing_data["date_filed"]
    # CompanyInfo.date gets a plain date on every path (previously only the
    # existing-company path stripped the time component).
    info_date = date_filed.date() if isinstance(date_filed, datetime.datetime) else date_filed

    def _save_company_info(owner):
        # Create and save the CompanyInfo snapshot for this filing date.
        company_info = CompanyInfo()
        company_info.company = owner
        company_info.name = filing_data["company_name"]
        company_info.sic = filing_data["sic"]
        company_info.state_incorporation = filing_data["state_incorporation"]
        company_info.state_location = filing_data["state_location"]
        company_info.date = info_date
        company_info.save()

    try:
        # Get company
        company = Company.objects.get(cik=filing_data["cik"])
        logger.info("Found existing company record.")

        # Check if record exists for date
        try:
            CompanyInfo.objects.get(company=company, date=date_filed)
            logger.info("Found existing company info record.")
        except CompanyInfo.DoesNotExist:
            _save_company_info(company)
            logger.info("Created new company info record.")
    except Company.DoesNotExist:
        # Create company
        company = Company()
        company.cik = filing_data["cik"]

        try:
            # Catch race with another task/thread
            company.save()

            try:
                CompanyInfo.objects.get(company=company, date=date_filed)
            except CompanyInfo.DoesNotExist:
                _save_company_info(company)
        except django.db.utils.IntegrityError:
            # Another task created the company first; use its record instead of ours.
            company = Company.objects.get(cik=filing_data["cik"])
            logger.info("Company record was created concurrently; using the existing record.")
        else:
            logger.info("Created company and company info records.")

    # Now create the filing record
    try:
        # hashlib only accepts bytes-like input, so encode text buffers before hashing.
        buffer_bytes = filing_buffer.encode("utf-8") if isinstance(filing_buffer, str) else filing_buffer

        filing = Filing()
        filing.form_type = filing_data["form_type"]
        filing.accession_number = filing_data["accession_number"]
        filing.date_filed = date_filed
        filing.document_count = filing_data["document_count"]
        filing.company = company
        filing.sha1 = hashlib.sha1(buffer_bytes).hexdigest()
        filing.s3_path = s3_path
        # Flag as errored until the documents have been created successfully.
        filing.is_processed = False
        filing.is_error = True
        filing.save()
    except Exception as e:  # pylint: disable=broad-except
        logger.error("Unable to create filing record: {0}".format(e))
        return None

    # Create filing document records
    try:
        create_filing_documents(filing_data["documents"], filing, store_raw=store_raw, store_text=store_text)
        filing.is_processed = True
        filing.is_error = False
        filing.save()
        return filing
    except Exception as e:  # pylint: disable=broad-except
        logger.error("Unable to create filing documents for {0}: {1}".format(filing, e))
        return None


def process_filing(client, file_path: str, filing_buffer: Union[str, bytes, None] = None, store_raw: bool = False,
                   store_text: bool = False):
    """
    Process a filing from a path or filing buffer.

    Ensures Company and CompanyInfo records exist for the filer, creates the Filing record,
    then creates its document records via create_filing_documents.  The Filing is first saved
    with is_error=True / is_processed=False and only flipped once the documents were created,
    so a failure while creating documents leaves an error-flagged Filing behind.

    :param client: storage client; its get_buffer(path) is used to fetch the filing when no
        buffer is supplied, and it is forwarded to create_filing_documents
    :param file_path: path to process; if filing_buffer is None, the buffer is retrieved from here
    :param filing_buffer: filing contents; if None, they are fetched from file_path via client
    :param store_raw: forwarded to create_filing_documents
    :param store_text: forwarded to create_filing_documents and passed to the parser as `extract`
    :return: the saved Filing on success; None if a record for file_path already exists, the CIK
        cannot be parsed, or the filing / document records could not be created
    """
    # Log entry
    logger.info("Processing filing {0}...".format(file_path))

    # Check for existing record first; get() either returns a record or raises.
    try:
        filing = Filing.objects.get(s3_path=file_path)
    except Filing.DoesNotExist:
        logger.info("No existing record found.")
    except Filing.MultipleObjectsReturned:
        logger.error("Multiple existing record found.")
        return None
    else:
        logger.error("Filing {0} has already been created in record {1}".format(file_path, filing))
        return None

    # Get buffer
    if filing_buffer is None:
        logger.info("Retrieving filing buffer from S3...")
        filing_buffer = client.get_buffer(file_path)

    # Get main filing data structure
    filing_data = openedgar.parsers.edgar.parse_filing(filing_buffer, extract=store_text)
    if filing_data["cik"] is None:
        logger.error("Unable to parse CIK from filing {0}; assuming broken and halting...".format(file_path))
        return None

    date_filed = filing_data["date_filed"]
    # CompanyInfo.date gets a plain date on every path (previously only the
    # existing-company path stripped the time component).
    info_date = date_filed.date() if isinstance(date_filed, datetime.datetime) else date_filed

    def _save_company_info(owner):
        # Create and save the CompanyInfo snapshot for this filing date.
        company_info = CompanyInfo()
        company_info.company = owner
        company_info.name = filing_data["company_name"]
        company_info.sic = filing_data["sic"]
        company_info.state_incorporation = filing_data["state_incorporation"]
        company_info.state_location = filing_data["state_location"]
        company_info.date = info_date
        company_info.save()

    try:
        # Get company
        company = Company.objects.get(cik=filing_data["cik"])
        logger.info("Found existing company record.")

        # Check if record exists for date
        try:
            CompanyInfo.objects.get(company=company, date=date_filed)
            logger.info("Found existing company info record.")
        except CompanyInfo.DoesNotExist:
            _save_company_info(company)
            logger.info("Created new company info record.")
    except Company.DoesNotExist:
        # Create company
        company = Company()
        company.cik = filing_data["cik"]

        try:
            # Catch race with another task/thread
            company.save()

            try:
                CompanyInfo.objects.get(company=company, date=date_filed)
            except CompanyInfo.DoesNotExist:
                _save_company_info(company)
        except django.db.utils.IntegrityError:
            # Another task created the company first; use its record instead of ours.
            company = Company.objects.get(cik=filing_data["cik"])
            logger.info("Company record was created concurrently; using the existing record.")
        else:
            logger.info("Created company and company info records.")

    # Now create the filing record
    try:
        # hashlib only accepts bytes-like input, so encode text buffers before hashing.
        buffer_bytes = filing_buffer.encode("utf-8") if isinstance(filing_buffer, str) else filing_buffer

        filing = Filing()
        filing.form_type = filing_data["form_type"]
        filing.accession_number = filing_data["accession_number"]
        filing.date_filed = date_filed
        filing.document_count = filing_data["document_count"]
        filing.company = company
        filing.sha1 = hashlib.sha1(buffer_bytes).hexdigest()
        filing.s3_path = file_path
        # Flag as errored until the documents have been created successfully.
        filing.is_processed = False
        filing.is_error = True
        filing.save()
    except Exception as e:  # pylint: disable=broad-except
        logger.error("Unable to create filing record: {0}".format(e))
        return None

    # Create filing document records
    try:
        create_filing_documents(client, filing_data["documents"], filing, store_raw=store_raw, store_text=store_text)
        filing.is_processed = True
        filing.is_error = False
        filing.save()
        return filing
    except Exception as e:  # pylint: disable=broad-except
        logger.error("Unable to create filing documents for {0}: {1}".format(filing, e))
        return None


def create_filing_error(row, filing_path: str):
    """
    Create a Filing error record from an index row.

    Ensures a Company (looked up by CIK) and a CompanyInfo record exist, then saves a Filing
    flagged is_error=True / is_processed=False that points at filing_path.

    :param row: index row; read by key for "CIK", "Company Name", "Form Type" and "Date Filed"
    :param filing_path: path stored on the error record as Filing.s3_path
    :return: True once the error Filing has been saved
    :raises django.db.utils.IntegrityError: if the company cannot be saved and no existing
        record with the same CIK can be found either
    """
    # Get vars
    cik = row["CIK"]
    company_name = row["Company Name"]
    form_type = row["Form Type"]

    # An unparseable date is recorded as missing rather than aborting the error record.
    try:
        date_filed = dateutil.parser.parse(str(row["Date Filed"])).date()
    except (ValueError, IndexError, OverflowError):
        date_filed = None

    def _save_company_info(owner):
        # Index rows carry only the company name, so the remaining fields stay empty.
        company_info = CompanyInfo()
        company_info.company = owner
        company_info.name = company_name
        company_info.sic = None
        company_info.state_incorporation = None
        company_info.state_location = None
        company_info.date = date_filed
        company_info.save()

    # Get or create the company
    try:
        company = Company.objects.get(cik=cik)
        company_is_new = False
    except Company.DoesNotExist:
        company = Company()
        company.cik = cik
        try:
            company.save()
            company_is_new = True
        except django.db.utils.IntegrityError:
            # Most likely another task created the company first.  Re-fetch once instead of
            # recursing: if the record still is not there, the error is not a race and
            # retrying would never terminate, so let it propagate.
            company = Company.objects.filter(cik=cik).first()
            if company is None:
                raise
            company_is_new = False

    # Get or create the company info record
    if company_is_new:
        _save_company_info(company)
    else:
        try:
            CompanyInfo.objects.get(company=company, date=date_filed)
        except CompanyInfo.DoesNotExist:
            _save_company_info(company)

    # Finally create and save the error filing record
    filing = Filing()
    filing.form_type = form_type
    filing.date_filed = date_filed
    filing.s3_path = filing_path
    filing.is_error = True
    filing.is_processed = False
    filing.company = company
    filing.save()

    return True