Beispiel #1
0
def uploading_text_in_filing_documents(store_raw: bool = False, store_text: bool = True):
    """
    Upload extracted text for every document of every processed filing.

    Re-parses each processed Filing's stored buffer, matches each of its
    FilingDocument records to a parsed document by sequence number, and, when
    requested, uploads the document's extracted text to storage keyed by its
    sha1. Documents with no matching parsed entry are flagged as errors.

    :param store_raw: accepted for interface parity with related tasks; unused here
    :param store_text: if True, upload each matched document's text content
    :return: None
    """
    client = LocalClient()
    processed_filings = Filing.objects.filter(is_processed=True)

    for filing in processed_filings:
        buffer_data = client.get_buffer(filing.s3_path)
        logger.info("parsing id# {0} s3_path: {1}".format(
            filing.id, filing.s3_path))
        filing_data = openedgar.parsers.edgar.parse_filing(buffer_data,
                                                           extract=True)
        filing_documents = filing.filingdocument_set.all()
        logger.info("number of FilingDocument objects calculated: {0}".format(
            len(filing_documents)))
        documents_data = filing_data["documents"]
        logger.info("number of documents coming from data stream: {0}".format(
            len(documents_data)))

        # Match each FilingDocument to its parsed counterpart by sequence.
        for document in filing_documents:
            # BUG FIX: the original rebound `filing_data` here, clobbering the
            # outer parse result; use a distinct name for the matched entry.
            # As in the original, the LAST matching sequence wins.
            document_data = None
            for d in documents_data:
                if int(d["sequence"]) == document.sequence:
                    document_data = d

            if document_data is not None:
                # Upload text to S3 if requested and content is available
                if store_text and document_data["content_text"] is not None:
                    text_path = pathlib.Path(DOCUMENT_PATH, "text",
                                             document_data["sha1"]).as_posix()
                    if not client.path_exists(text_path):
                        client.put_buffer(text_path,
                                          document_data["content_text"],
                                          write_bytes=False)
                        logger.info(
                            "Uploaded text contents for filing={0}, sequence={1}, sha1={2}"
                            .format(filing, document_data["sequence"],
                                    document_data["sha1"]))
                    else:
                        logger.info(
                            "Text contents for filing={0}, sequence={1}, sha1={2} already exists on S3"
                            .format(filing, document_data["sequence"],
                                    document_data["sha1"]))
            else:
                # No parsed document with this sequence number: mark the
                # record as failed so it can be retried or inspected later.
                document.is_processed = False
                document.is_error = True
                document.save()
 def content(self):
     """Return this document's byte slice of the parent filing's buffer."""
     storage = LocalClient()
     full_buffer = storage.get_buffer(self.filing.s3_path)
     return full_buffer[self.start_pos:self.end_pos]
 def get_buffer(self, filing_path):
     """Fetch the filing at *filing_path* and return the parsed filing data."""
     logger.info("Retrieving buffer from S3...")
     storage = LocalClient()
     raw_buffer = storage.get_buffer(filing_path)
     return openedgar.parsers.edgar.parse_filing(raw_buffer)
Beispiel #4
0
def process_filing_index(client_type: str, file_path: str, filing_index_buffer: Union[str, bytes] = None,
                         form_type_list: Iterable[str] = None, store_raw: bool = False, store_text: bool = False):
    """
    Process a filing index from an S3 path or buffer.
    :param client_type: "S3" selects S3Client; any other value uses LocalClient
    :param file_path: S3 or local path to process; if filing_index_buffer is none, retrieved from here
    :param filing_index_buffer: buffer; if not present, s3_path must be set
    :param form_type_list: optional list of form type to process
    :param store_raw: passed through to process_filing; store raw filing contents
    :param store_text: passed through to process_filing; store extracted text
    :return:
    """
    # Log entry
    logger.info("Processing filing index {0}...".format(file_path))

    if client_type == "S3":
        client = S3Client()
    else:
        client = LocalClient()

    # Retrieve buffer if not passed
    if filing_index_buffer is None:
        logger.info("Retrieving filing index buffer for: {}...".format(file_path))
        filing_index_buffer = client.get_buffer(file_path)

    # Write to disk so the parser can work from a real file path
    temp_file = tempfile.NamedTemporaryFile(delete=False)
    temp_file.write(filing_index_buffer)
    temp_file.close()

    # Get main filing data structure
    filing_index_data = openedgar.parsers.edgar.parse_index_file(temp_file.name)
    logger.info("Parsed {0} records from index".format(filing_index_data.shape[0]))

    # Iterate through rows
    bad_record_count = 0
    for _, row in filing_index_data.iterrows():
        # Check for form type whitelist
        if form_type_list is not None:
            if row["Form Type"] not in form_type_list:
                logger.info("Skipping filing {0} with form type {1}...".format(row["File Name"], row["Form Type"]))
                continue

        # Normalize path to an "edgar/..." storage key.
        # BUG FIX: rows matching neither prefix previously reused the prior
        # iteration's filing_path (or raised NameError on the first row);
        # now they are counted as bad records and skipped.
        file_name = row["File Name"]
        if file_name.lower().startswith("data/"):
            filing_path = "edgar/{0}".format(file_name)
        elif file_name.lower().startswith("edgar/"):
            filing_path = file_name
        else:
            logger.error("Unexpected File Name format {0}, skipping...".format(file_name))
            bad_record_count += 1
            create_filing_error(row, file_name)
            continue

        # Check if filing record exists
        try:
            filing = Filing.objects.get(s3_path=filing_path)
            logger.info("Filing record already exists: {0}".format(filing))
        except Filing.MultipleObjectsReturned as e:
            # Ambiguous duplicates: skip rather than guess which record is right
            logger.error("Multiple Filing records found for s3_path={0}, skipping...".format(filing_path))
            logger.info("Raw exception: {0}".format(e))
            continue
        except Filing.DoesNotExist as f:
            # Create new filing record
            logger.info("No Filing record found for {0}, creating...".format(filing_path))
            logger.info("Raw exception: {0}".format(f))

            # Check if exists; download and upload to S3 if missing
            if not client.path_exists(filing_path):
                # Download from EDGAR
                try:
                    filing_buffer, _ = openedgar.clients.edgar.get_buffer("/Archives/{0}".format(filing_path))
                except RuntimeError as g:
                    logger.error("Unable to access resource {0} from EDGAR: {1}".format(filing_path, g))
                    bad_record_count += 1
                    create_filing_error(row, filing_path)
                    continue

                # Upload
                client.put_buffer(filing_path, filing_buffer)

                logger.info("Downloaded from EDGAR and uploaded to {}...".format(client_type))
            else:
                # Already stored; retrieve the existing copy
                logger.info("File already stored on {}, retrieving and processing...".format(client_type))
                filing_buffer = client.get_buffer(filing_path)

            # Parse and persist the filing
            filing_result = process_filing(client, filing_path, filing_buffer, store_raw=store_raw, store_text=store_text)
            if filing_result is None:
                logger.error("Unable to process filing.")
                bad_record_count += 1
                create_filing_error(row, filing_path)

    # Create or update the filing index record
    edgar_url = "/Archives/{0}".format(file_path).replace("//", "/")
    try:
        filing_index = FilingIndex.objects.get(edgar_url=edgar_url)
        filing_index.total_record_count = filing_index_data.shape[0]
        filing_index.bad_record_count = bad_record_count
        filing_index.is_processed = True
        filing_index.is_error = False
        filing_index.save()
        logger.info("Updated existing filing index record.")
    except FilingIndex.DoesNotExist:
        filing_index = FilingIndex()
        filing_index.edgar_url = edgar_url
        filing_index.date_published = None
        filing_index.date_downloaded = datetime.date.today()
        filing_index.total_record_count = filing_index_data.shape[0]
        filing_index.bad_record_count = bad_record_count
        filing_index.is_processed = True
        filing_index.is_error = False
        filing_index.save()
        logger.info("Created new filing index record.")

    # Delete the temp file only on success (kept on failure for debugging,
    # matching the original "if we make it this far" intent).
    os.remove(temp_file.name)
Beispiel #5
0
def process_company_filings(client_type: str,
                            cik: str,
                            store_raw: bool = False,
                            store_text: bool = False):
    """
    Process the filings for a single company identified by CIK.

    Retrieves the company's filing links (via links_10k), then for each link
    without an existing Filing record downloads the filing from EDGAR (or
    retrieves the stored copy) and processes it.

    :param client_type: "S3" selects S3Client; any other value uses LocalClient
    :param cik: EDGAR CIK identifier of the company to process
    :param store_raw: passed through to process_filing
    :param store_text: passed through to process_filing
    :return:
    """

    # Log entry
    logger.info("Processing company cik {0}...".format(cik))

    # Get path to filings folder for cik
    # NOTE(review): cik_path is not used in the visible portion of this
    # function — confirm whether it is needed further down or is dead code.
    cik_path = openedgar.clients.edgar.get_cik_path(cik)
    links = links_10k(cik)

    if client_type == "S3":
        client = S3Client()
    else:
        client = LocalClient()

    # Iterate through links
    bad_record_count = 0
    for row in links:

        # Normalize link to an "edgar/..." storage key.
        # NOTE(review): if a link matches neither prefix, filing_path keeps
        # its value from the previous iteration (or is unbound on the first
        # one) — confirm links_10k only yields "data/" or "edgar/" paths.
        if row.lower().startswith("data/"):
            filing_path = "edgar/{0}".format(row)
        elif row.lower().startswith("edgar/"):
            filing_path = row

        # Check if filing record exists
        try:
            filing = Filing.objects.get(s3_path=filing_path)
            logger.info("Filing record already exists: {0}".format(filing))
        except Filing.MultipleObjectsReturned as e:
            # Ambiguous duplicates: skip rather than guess which record is right
            logger.error(
                "Multiple Filing records found for s3_path={0}, skipping...".
                format(filing_path))
            logger.info("Raw exception: {0}".format(e))
            continue
        except Filing.DoesNotExist as f:
            # Create new filing record
            logger.info("No Filing record found for {0}, creating...".format(
                filing_path))
            logger.info("Raw exception: {0}".format(f))

            # Check if exists; download and upload to S3 if missing
            if not client.path_exists(filing_path):
                # Download from EDGAR
                try:
                    filing_buffer, _ = openedgar.clients.edgar.get_buffer(
                        "/Archives/{0}".format(filing_path))
                except RuntimeError as g:
                    logger.error(
                        "Unable to access resource {0} from EDGAR: {1}".format(
                            filing_path, g))
                    bad_record_count += 1
                    #create_filing_error(row, filing_path)
                    continue

                # Upload
                client.put_buffer(filing_path, filing_buffer)

                logger.info(
                    "Downloaded from EDGAR and uploaded to {}...".format(
                        client_type))
            else:
                # Already stored; retrieve the existing copy
                logger.info(
                    "File already stored on {}, retrieving and processing...".
                    format(client_type))
                filing_buffer = client.get_buffer(filing_path)

            # Parse and persist the filing
            filing_result = process_filing(client,
                                           filing_path,
                                           filing_buffer,
                                           store_raw=store_raw,
                                           store_text=store_text)
            if filing_result is None:
                logger.error("Unable to process filing.")
                bad_record_count += 1