Example #1
0
def extract_doc_content(pk, callback=None, citation_countdown=0):
    """Extract an opinion's text, store it, and queue citation matching.

    The opinion's file extension picks the extractor. The extracted text
    is anonymized and stored on the Opinion (as ``html`` for markup-bearing
    formats, otherwise as ``plain_text``), the page count is recorded, and
    the citation-matching task is scheduled asynchronously.

    TODO: this implementation cannot be distributed due to using local paths.
    """
    opinion = Opinion.objects.get(pk=pk)
    file_path = opinion.local_path.path
    ext = file_path.rsplit('.', 1)[-1]

    # Extractors that take only the path and return (content, err).
    simple_extractors = {
        'doc': extract_from_doc,
        'docx': extract_from_docx,
        'html': extract_from_html,
        'txt': extract_from_txt,
    }
    if ext in simple_extractors:
        content, err = simple_extractors[ext](file_path)
    elif ext == 'pdf':
        # PDF and WPD extraction may replace the opinion object itself.
        opinion, content, err = extract_from_pdf(opinion, file_path, callback)
    elif ext == 'wpd':
        opinion, content, err = extract_from_wpd(opinion, file_path)
    else:
        print ('*****Unable to extract content due to unknown extension: %s '
               'on opinion: %s****' % (ext, opinion))
        return 2

    # Record the page count when it can be determined.
    opinion.page_count = get_page_count(file_path, ext)

    # Anonymize, then store on the field matching the content type.
    field = 'html' if ext in ('html', 'wpd') else 'plain_text'
    cleaned, blocked = anonymize(content)
    setattr(opinion, field, cleaned)
    if blocked:
        opinion.cluster.blocked = True
        opinion.cluster.date_blocked = now()

    if err:
        print ("****Error extracting text from %s: %s****" %
               (ext, opinion))
        return opinion

    # Save item, and index Solr if needed. The cluster is never indexed
    # here; the opinion is indexed only when citation work is deferred,
    # since an imminent citation pass handles indexing itself.
    try:
        opinion.cluster.save(index=False)
        opinion.save(index=(citation_countdown != 0))
    except Exception:
        print("****Error saving text to the db for: %s****\n%s" %
              (opinion, traceback.format_exc()))
        return opinion

    # Identify and link citations within the document content.
    update_document_by_id.apply_async(
        (opinion.pk,), countdown=citation_countdown)

    return opinion
Example #2
0
            # with the index yet because citations are being done imminently.
            opinion.cluster.save(index=False)
            opinion.save(index=False)
        else:
            # Save to the index now, citations come later, commit comes
            # according to schedule
            opinion.cluster.save(index=False)
            opinion.save(index=True)
    except Exception, e:
        print "****Error saving text to the db for: %s****" % opinion
        print traceback.format_exc()
        return opinion

    # Identify and link citations within the document content
    update_document_by_id.apply_async(
        (opinion.pk,),
        countdown=citation_countdown
    )

    return opinion


def convert_to_pngs(path, tmp_file_prefix):
    """Rasterize ``path`` into PNG page images via ImageMagick's convert.

    Output files are named ``<tmp_file_prefix>.png`` (ImageMagick appends
    page numbers for multi-page inputs). Returns the subprocess's combined
    stdout/stderr; raises CalledProcessError on a non-zero exit.
    """
    cmd = [
        'convert',
        '-depth', '4',
        '-density', '300',
        '-background', 'white',
        '+matte',  # disable the alpha channel
        path,
        '%s.png' % tmp_file_prefix,
    ]
    return subprocess.check_output(cmd, stderr=subprocess.STDOUT)


def convert_to_txt(tmp_file_prefix):
Example #3
0
            # No waiting around. Save to the database now, but don't bother
            # with the index yet because citations are being done imminently.
            opinion.cluster.save(index=False)
            opinion.save(index=False)
        else:
            # Save to the index now, citations come later, commit comes
            # according to schedule
            opinion.cluster.save(index=False)
            opinion.save(index=True)
    except Exception, e:
        print "****Error saving text to the db for: %s****" % opinion
        print traceback.format_exc()
        return opinion

    # Identify and link citations within the document content
    update_document_by_id.apply_async((opinion.pk, ),
                                      countdown=citation_countdown)

    return opinion


def convert_to_pngs(path, tmp_file_prefix):
    """Convert the document at ``path`` to PNGs using ImageMagick.

    The images land at ``<tmp_file_prefix>.png`` (numbered per page by
    ImageMagick). The convert command's output is captured and returned;
    a failing exit status raises CalledProcessError.
    """
    args = ['convert', '-depth', '4', '-density', '300',
            '-background', 'white', '+matte', path]
    args.append('%s.png' % tmp_file_prefix)
    output = subprocess.check_output(args, stderr=subprocess.STDOUT)
    return output

Example #4
0
def extract_doc_content(pk, callback=None, citation_countdown=0):
    """
    Given a document, we extract it, sniffing its extension, then store its
    contents in the database.  Finally, we asynchronously find citations in
    the document content and match them to other documents.

    :param pk: Primary key of the Opinion to process.
    :param callback: Passed through to the PDF extractor.
    :param citation_countdown: Seconds to delay the citation task. When 0,
        the opinion is saved without indexing because the imminent citation
        pass handles indexing; otherwise the opinion is indexed now.
    :returns: The saved Opinion, or 2 when the extension is unsupported.

    TODO: this implementation cannot be distributed due to using local paths.
    """
    opinion = Opinion.objects.get(pk=pk)

    path = opinion.local_path.path

    extension = path.split(".")[-1]
    if extension == "doc":
        content, err = extract_from_doc(path)
    elif extension == "html":
        content, err = extract_from_html(path)
    elif extension == "pdf":
        opinion, content, err = extract_from_pdf(opinion, path, callback)
    elif extension == "txt":
        content, err = extract_from_txt(path)
    elif extension == "wpd":
        opinion, content, err = extract_from_wpd(opinion, path)
    else:
        # NOTE(review): 'docx' files fall through to this branch and are
        # rejected -- confirm that is intended.
        print(
            "*****Unable to extract content due to unknown extension: %s " "on opinion: %s****" % (extension, opinion)
        )
        return 2

    # Do page count, if possible
    opinion.page_count = get_page_count(path, extension)

    # Do blocked status; markup-bearing formats go to the html field.
    if extension in ["html", "wpd"]:
        opinion.html, blocked = anonymize(content)
    else:
        opinion.plain_text, blocked = anonymize(content)
    if blocked:
        opinion.cluster.blocked = True
        opinion.cluster.date_blocked = now()

    if err:
        print("****Error extracting text from %s: %s****" % (extension, opinion))
        return opinion

    # Save item, and index Solr if needed.
    try:
        if citation_countdown == 0:
            # No waiting around. Save to the database now, but don't bother
            # with the index yet because citations are being done imminently.
            opinion.cluster.save(index=False)
            opinion.save(index=False)
        else:
            # Save to the index now, citations come later, commit comes
            # according to schedule
            opinion.cluster.save(index=False)
            opinion.save(index=True)
    except Exception:
        # Fix: these were Python 2 print *statements*, a SyntaxError under
        # Python 3 -- the rest of this function already uses print() calls.
        print("****Error saving text to the db for: %s****" % opinion)
        print(traceback.format_exc())
        return opinion

    # Identify and link citations within the document content
    update_document_by_id.apply_async((opinion.pk,), countdown=citation_countdown)

    return opinion