def extract_doc_content(pk, callback=None, citation_countdown=0):
    """Extract a stored document's text and queue citation matching.

    Looks up the Opinion by primary key, dispatches on the extension of
    its local file to the matching extractor, anonymizes the extracted
    content, records page count and blocked status, saves the opinion
    and its cluster, and finally schedules the asynchronous
    citation-matching task.

    :param pk: primary key of the Opinion to process.
    :param callback: passed through to the PDF extractor (used for OCR
        callbacks; only consulted for .pdf files).
    :param citation_countdown: seconds to delay the citation task; when
        zero, the Solr index update is skipped because citations are
        about to rewrite the document anyway.
    :return: the (possibly replaced) Opinion on success or soft failure,
        or the sentinel ``2`` when the extension is unrecognized.

    TODO: this implementation cannot be distributed due to using local
    paths.
    """
    opinion = Opinion.objects.get(pk=pk)
    path = opinion.local_path.path
    extension = path.split('.')[-1]

    # Dispatch on extension. The pdf and wpd extractors also hand back a
    # (possibly replaced) opinion object; the others return content only.
    if extension == 'pdf':
        opinion, content, err = extract_from_pdf(opinion, path, callback)
    elif extension == 'wpd':
        opinion, content, err = extract_from_wpd(opinion, path)
    elif extension == 'doc':
        content, err = extract_from_doc(path)
    elif extension == 'docx':
        content, err = extract_from_docx(path)
    elif extension == 'html':
        content, err = extract_from_html(path)
    elif extension == 'txt':
        content, err = extract_from_txt(path)
    else:
        print('*****Unable to extract content due to unknown extension: %s '
              'on opinion: %s****' % (extension, opinion))
        return 2

    # Do page count, if possible
    opinion.page_count = get_page_count(path, extension)

    # Markup-bearing formats land in the html field; everything else is
    # treated as plain text. Either way the content is anonymized first.
    anonymized, blocked = anonymize(content)
    if extension in ('html', 'wpd'):
        opinion.html = anonymized
    else:
        opinion.plain_text = anonymized

    if blocked:
        opinion.cluster.blocked = True
        opinion.cluster.date_blocked = now()

    # Extraction errors are soft failures: report and bail without saving.
    if err:
        print("****Error extracting text from %s: %s****" %
              (extension, opinion))
        return opinion

    # Save item, and index Solr if needed. The cluster is never indexed
    # here; the opinion itself is indexed only when citation work is
    # deferred (otherwise citations are being done imminently and will
    # trigger indexing themselves).
    try:
        opinion.cluster.save(index=False)
        opinion.save(index=(citation_countdown != 0))
    except Exception:
        print("****Error saving text to the db for: %s****\n%s" %
              (opinion, traceback.format_exc()))
        return opinion

    # Identify and link citations within the document content
    update_document_by_id.apply_async(
        (opinion.pk,), countdown=citation_countdown
    )
    return opinion
def convert_to_pngs(path, tmp_file_prefix):
    """Rasterize a document into PNG page images via ImageMagick.

    Shells out to ``convert``, producing 300 DPI, 4-bit-depth,
    white-background PNGs named from *tmp_file_prefix* (ImageMagick
    appends page numbers for multi-page inputs; ``+matte`` drops the
    alpha channel).

    :param path: the input file to rasterize.
    :param tmp_file_prefix: filename prefix for the generated PNGs; the
        output pattern is ``"<tmp_file_prefix>.png"``.
    :return: the combined stdout+stderr output of the ``convert`` run.
    :raises subprocess.CalledProcessError: if ``convert`` exits non-zero.
    """
    image_magick_command = [
        'convert',
        '-depth', '4',
        '-density', '300',
        '-background', 'white',
        '+matte',
        path,
        '%s.png' % tmp_file_prefix,
    ]
    # Fold stderr into the returned output so callers see ImageMagick
    # warnings alongside normal output.
    magick_out = subprocess.check_output(image_magick_command,
                                         stderr=subprocess.STDOUT)
    return magick_out
def extract_doc_content(pk, callback=None, citation_countdown=0):
    """Given a document, we extract it, sniffing its extension, then store
    its contents in the database. Finally, we asynchronously find citations
    in the document content and match them to other documents.

    :param pk: primary key of the Opinion to process.
    :param callback: passed through to the PDF extractor (only consulted
        for .pdf files).
    :param citation_countdown: seconds to delay the citation task; when
        zero, the Solr index update is skipped because citations are done
        imminently.
    :return: the (possibly replaced) Opinion on success or soft failure,
        or the sentinel ``2`` when the extension is unrecognized.

    TODO: this implementation cannot be distributed due to using local
    paths.
    """
    opinion = Opinion.objects.get(pk=pk)
    path = opinion.local_path.path
    extension = path.split(".")[-1]
    if extension == "doc":
        content, err = extract_from_doc(path)
    elif extension == "docx":
        # Restore the .docx branch: without it, .docx files fall through to
        # the unknown-extension path and extraction silently fails.
        content, err = extract_from_docx(path)
    elif extension == "html":
        content, err = extract_from_html(path)
    elif extension == "pdf":
        opinion, content, err = extract_from_pdf(opinion, path, callback)
    elif extension == "txt":
        content, err = extract_from_txt(path)
    elif extension == "wpd":
        opinion, content, err = extract_from_wpd(opinion, path)
    else:
        print(
            "*****Unable to extract content due to unknown extension: %s "
            "on opinion: %s****" % (extension, opinion)
        )
        return 2

    # Do page count, if possible
    opinion.page_count = get_page_count(path, extension)

    # Do blocked status. Markup formats go to the html field; the rest to
    # plain_text. Content is anonymized either way.
    if extension in ["html", "wpd"]:
        opinion.html, blocked = anonymize(content)
    else:
        opinion.plain_text, blocked = anonymize(content)
    if blocked:
        opinion.cluster.blocked = True
        opinion.cluster.date_blocked = now()

    # Extraction errors are soft failures: report and bail without saving.
    if err:
        print("****Error extracting text from %s: %s****" %
              (extension, opinion))
        return opinion

    # Save item, and index Solr if needed.
    try:
        if citation_countdown == 0:
            # No waiting around. Save to the database now, but don't bother
            # with the index yet because citations are being done imminently.
            opinion.cluster.save(index=False)
            opinion.save(index=False)
        else:
            # Save to the index now, citations come later, commit comes
            # according to schedule
            opinion.cluster.save(index=False)
            opinion.save(index=True)
    except Exception:
        # Normalized from Python-2-only `except Exception, e` / print
        # statements to syntax that parses under both Python 2 and 3.
        print("****Error saving text to the db for: %s****" % opinion)
        print(traceback.format_exc())
        return opinion

    # Identify and link citations within the document content
    update_document_by_id.apply_async(
        (opinion.pk,), countdown=citation_countdown
    )
    return opinion