Example 1
    def update_documents(self, opinion_pks: Iterable, queue_name: str) -> None:
        sys.stdout.write("Graph size is {0:d} nodes.\n".format(self.count))
        sys.stdout.flush()

        index_during_subtask = False
        if self.index == "concurrently":
            index_during_subtask = True

        chunk = []
        chunk_size = 100
        processed_count = 0
        throttle = CeleryThrottle(queue_name=queue_name)
        for opinion_pk in opinion_pks:
            throttle.maybe_wait()
            processed_count += 1
            last_item = self.count == processed_count
            chunk.append(opinion_pk)
            if processed_count % chunk_size == 0 or last_item:
                find_citations_for_opinion_by_pks.apply_async(
                    args=(chunk, index_during_subtask),
                    queue=queue_name,
                )
                chunk = []

            self.log_progress(processed_count, opinion_pk)
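The loop above batches opinion PKs into chunks of 100 and flushes the final partial chunk when the last item is reached (as tracked against self.count). A minimal sketch of just that batching pattern, with a hypothetical dispatch callable standing in for find_citations_for_opinion_by_pks.apply_async and the throttle and progress logging omitted:

from typing import Callable, Iterable, List

def dispatch_in_chunks(
    opinion_pks: Iterable[int],
    total: int,
    dispatch: Callable[[List[int]], None],
    chunk_size: int = 100,
) -> None:
    # Hypothetical helper mirroring the loop above: accumulate PKs and
    # hand off each full chunk, plus the trailing partial chunk once the
    # expected total has been seen.
    chunk: List[int] = []
    processed_count = 0
    for opinion_pk in opinion_pks:
        processed_count += 1
        chunk.append(opinion_pk)
        last_item = processed_count == total
        if processed_count % chunk_size == 0 or last_item:
            dispatch(chunk)
            chunk = []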
Example 2
def extract_doc_content(pk, do_ocr=False, citation_jitter=False):
    """
    Given an opinion PK, we extract it, sniffing its extension, then store its
    contents in the database.  Finally, we asynchronously find citations in
    the document content and match them to other documents.

    This implementation uses local paths.

    :param pk: The opinion primary key to work on
    :param do_ocr: Whether the PDF converting function should use OCR
    :param citation_jitter: Whether to apply jitter before running the citation
    parsing code. This can be useful to spread these tasks out when doing a
    larger scrape.
    """
    opinion = Opinion.objects.get(pk=pk)

    path = opinion.local_path.path

    extension = path.split(".")[-1]
    if extension == "doc":
        content, err = extract_from_doc(path)
    elif extension == "docx":
        content, err = extract_from_docx(path)
    elif extension == "html":
        content, err = extract_from_html(path)
    elif extension == "pdf":
        content, err = extract_from_pdf(path, opinion, do_ocr)
    elif extension == "txt":
        content, err = extract_from_txt(path)
    elif extension == "wpd":
        content, err = extract_from_wpd(path, opinion)
    else:
        print("*****Unable to extract content due to unknown extension: %s "
              "on opinion: %s****" % (extension, opinion))
        return

    assert isinstance(content, str), (
        "content must be of type str, not %s" % type(content)
    )

    # Do page count, if possible
    opinion.page_count = get_page_count(path, extension)

    # Do blocked status
    if extension in ["html", "wpd"]:
        opinion.html, blocked = anonymize(content)
    else:
        opinion.plain_text, blocked = anonymize(content)
    if blocked:
        opinion.cluster.blocked = True
        opinion.cluster.date_blocked = now()

    update_document_from_text(opinion)

    if err:
        print(err)
        print("****Error extracting text from %s: %s****" %
              (extension, opinion))
        return

    # Save item, and index Solr if needed.
    # noinspection PyBroadException
    try:
        opinion.cluster.docket.save()
        opinion.cluster.save(index=False)
        if not citation_jitter:
            # No waiting around. Save to the database now, but don't bother
            # with the index yet because citations are being done imminently.
            opinion.save(index=False)
        else:
            # Save to the index now, citations come later, commit comes
            # according to schedule
            opinion.save(index=True)
    except Exception:
        print("****Error saving text to the db for: %s****\n%s" %
              (opinion, traceback.format_exc()))
        return

    # Identify and link citations within the document content
    find_citations_for_opinion_by_pks.apply_async(
        ([opinion.pk], ), countdown=random.randint(0, 3600))
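A minimal usage sketch for the function above, calling it directly for a few placeholder opinion PKs with OCR enabled for PDFs and jitter turned on so the follow-up citation tasks are spread out:

# Hypothetical invocation; the PK values are placeholders.
for opinion_pk in [101, 102, 103]:
    extract_doc_content(opinion_pk, do_ocr=True, citation_jitter=True)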
Example 3
def extract_doc_content(
    pk: int,
    ocr_available: bool = False,
    citation_jitter: bool = False,
) -> None:
    """
    Given an opinion PK, we extract it, sniffing its extension, then store its
    contents in the database.  Finally, we asynchronously find citations in
    the document content and match them to other documents.

    This implementation uses local paths.

    :param pk: The opinion primary key to work on
    :param ocr_available: Whether the PDF converting function should use OCR
    :param citation_jitter: Whether to apply jitter before running the citation
    parsing code. This can be useful to spread these tasks out when doing a
    larger scrape.
    """
    opinion = Opinion.objects.get(pk=pk)
    extension = opinion.local_path.name.split(".")[-1]

    with NamedTemporaryFile(
            prefix="extract_file_",
            suffix=f".{extension}",
            buffering=0,  # Make sure it's on disk when we try to use it
    ) as tmp:
        # Get file contents from S3 and put them in a temp file.
        tmp.write(opinion.local_path.read())

        if extension == "doc":
            content, err = extract_from_doc(tmp.name)
        elif extension == "docx":
            content, err = extract_from_docx(tmp.name)
        elif extension == "html":
            content, err = extract_from_html(tmp.name)
        elif extension == "pdf":
            content, err = extract_from_pdf(tmp.name, opinion, ocr_available)
        elif extension == "txt":
            content, err = extract_from_txt(tmp.name)
        elif extension == "wpd":
            content, err = extract_from_wpd(tmp.name, opinion)
        else:
            print(
                "*****Unable to extract content due to unknown extension: %s "
                "on opinion: %s****" % (extension, opinion))
            return

        # Do page count, if possible
        opinion.page_count = get_page_count(tmp.name, extension)

    assert isinstance(content, str), (
        f"content must be of type str, not {type(content)}"
    )

    set_blocked_status(opinion, content, extension)
    update_document_from_text(opinion)

    if err:
        print(err)
        print(f"****Error extracting text from {extension}: {opinion}****")
        return

    # Save item, and index Solr if needed.
    # noinspection PyBroadException
    try:
        opinion.cluster.docket.save()
        opinion.cluster.save(index=False)
        if not citation_jitter:
            # No waiting around. Save to the database now, but don't bother
            # with the index yet because citations are being done imminently.
            opinion.save(index=False)
        else:
            # Save to the index now, citations come later, commit comes
            # according to schedule
            opinion.save(index=True)
    except Exception:
        print("****Error saving text to the db for: %s****\n%s" %
              (opinion, traceback.format_exc()))
        return

    # Identify and link citations within the document content
    find_citations_for_opinion_by_pks.apply_async(
        ([opinion.pk], ), countdown=random.randint(0, 3600))
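The NamedTemporaryFile step is the main difference from the local-path variants: the stored file (here read from S3 via opinion.local_path) is copied to a real path on disk so the extractors can operate on it. A minimal sketch of that pattern in isolation, assuming a hypothetical stored_file object with a read() method and an extractor that takes a filesystem path:

from tempfile import NamedTemporaryFile

def extract_via_tempfile(stored_file, extension: str, extractor):
    # Copy the remote file's bytes into a temp file on disk. buffering=0
    # keeps the file unbuffered so the contents are on disk before the
    # extractor opens the path.
    with NamedTemporaryFile(
        prefix="extract_file_",
        suffix=f".{extension}",
        buffering=0,
    ) as tmp:
        tmp.write(stored_file.read())
        return extractor(tmp.name)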
Example 4
def extract_doc_content(pk, do_ocr=False, citation_jitter=False):
    """
    Given an opinion PK, we extract it, sniffing its extension, then store its
    contents in the database.  Finally, we asynchronously find citations in
    the document content and match them to other documents.

    This implementation uses local paths.

    :param pk: The opinion primary key to work on
    :param do_ocr: Whether the PDF converting function should use OCR
    :param citation_jitter: Whether to apply jitter before running the citation
    parsing code. This can be useful to spread these tasks out when doing a
    larger scrape.
    """
    opinion = Opinion.objects.get(pk=pk)

    path = opinion.local_path.path

    extension = path.split('.')[-1]
    if extension == 'doc':
        content, err = extract_from_doc(path)
    elif extension == 'docx':
        content, err = extract_from_docx(path)
    elif extension == 'html':
        content, err = extract_from_html(path)
    elif extension == 'pdf':
        content, err = extract_from_pdf(path, opinion, do_ocr)
    elif extension == 'txt':
        content, err = extract_from_txt(path)
    elif extension == 'wpd':
        content, err = extract_from_wpd(path, opinion)
    else:
        print('*****Unable to extract content due to unknown extension: %s '
              'on opinion: %s****' % (extension, opinion))
        return

    # Do page count, if possible
    opinion.page_count = get_page_count(path, extension)

    # Do blocked status
    if extension in ['html', 'wpd']:
        opinion.html, blocked = anonymize(content)
    else:
        opinion.plain_text, blocked = anonymize(content)
    if blocked:
        opinion.cluster.blocked = True
        opinion.cluster.date_blocked = now()

    if err:
        print ("****Error extracting text from %s: %s****" %
               (extension, opinion))
        return

    # Save item, and index Solr if needed.
    # noinspection PyBroadException
    try:
        if not citation_jitter:
            # No waiting around. Save to the database now, but don't bother
            # with the index yet because citations are being done imminently.
            opinion.cluster.save(index=False)
            opinion.save(index=False)
        else:
            # Save to the index now, citations come later, commit comes
            # according to schedule
            opinion.cluster.save(index=False)
            opinion.save(index=True)
    except Exception:
        print("****Error saving text to the db for: %s****\n%s" %
              (opinion, traceback.format_exc()))
        return

    # Identify and link citations within the document content
    find_citations_for_opinion_by_pks.apply_async(
        ([opinion.pk],),
        countdown=random.randint(0, 3600)
    )
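The closing apply_async call is where the countdown-based jitter lands: each citation task is delayed by a random number of seconds, up to an hour, so a large scrape does not enqueue them all at once. A short sketch of the same idea with a hypothetical Celery task:

import random

from celery import Celery

app = Celery("sketch", broker="memory://")  # placeholder app/broker

@app.task
def find_citations(pks):
    ...  # stand-in for the real citation task

def enqueue_with_jitter(pks, max_delay=3600):
    # Delay each task by a random countdown so the queue fills gradually.
    for pk in pks:
        find_citations.apply_async(
            args=([pk],), countdown=random.randint(0, max_delay)
        )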