def scan_add(request):
    """
    Display the scan upload form and queue processing of a valid upload.

    A ``.zip`` upload is copied chunk by chunk into a named temporary file
    whose name is handed to ``tasks.process_zip``.  Any other upload is moved
    into place with ``tasks.move_scan_file``, recorded as an
    under-construction ``Scan`` and handed to ``tasks.split_scan``.  In both
    cases the response is a redirect to ``moderation.wait_for_processing``
    for the queued task.  When the form is unbound or invalid, the upload
    page is rendered with the form.
    """
    FormClass = get_org_upload_form(request.user)
    form = FormClass(request.POST or None, request.FILES or None, types={
        "pdf": "application/pdf",
        "zip": "application/zip",
    })
    if not form.is_valid():
        return render(request, "scanning/upload.html", {'form': form})

    upload = request.FILES['file']
    organization = form.cleaned_data['organization']
    if upload.name.lower().endswith(".zip"):
        # delete=False keeps the file on disk after it is closed; only its
        # name is passed on to the task.
        with tempfile.NamedTemporaryFile(delete=False,
                                         suffix="scans.zip") as fh:
            for chunk in upload.chunks():
                fh.write(chunk)
        # Queue the task only after the file has been closed, so that every
        # buffered byte is on disk before anything opens it by name.
        task_id = tasks.process_zip.delay(
            filename=fh.name,
            uploader_id=request.user.pk,
            org_id=organization.pk,
            redirect=reverse("moderation.home"),
        )
    else:
        path = tasks.move_scan_file(uploaded_file=upload)
        scan = Scan.objects.create(
            uploader=request.user,
            pdf=os.path.relpath(path, settings.MEDIA_ROOT),
            under_construction=True,
            org=organization,
        )
        task_id = tasks.split_scan.delay(
            scan_id=scan.pk,
            redirect=reverse("moderation.edit_scan", args=[scan.pk]),
        )
    return redirect('moderation.wait_for_processing', task_id)
def scan_replace(request, scan_id=None):
    """
    Swap the PDF behind an existing scan for a newly uploaded one.

    Raises ``PermissionDenied`` when ``Scan.objects.org_filter`` yields no
    scan with ``scan_id`` for the requesting user.  On a valid upload the new
    file is moved into place, the scan's old files are deleted, the scan is
    saved with the new path and uploader, and ``tasks.split_scan`` is queued;
    the response redirects to the wait-for-processing page.  Otherwise the
    replace page is rendered with the form.
    """
    try:
        scan = Scan.objects.org_filter(request.user, pk=scan_id).get()
    except Scan.DoesNotExist:
        raise PermissionDenied

    accepted_types = {"pdf": "application/pdf"}
    form = ScanUploadForm(request.POST or None, request.FILES or None,
                          types=accepted_types)
    if not form.is_valid():
        return render(request, "scanning/replace.html", {"form": form})

    # Move the new file into place before removing the old files.
    new_path = tasks.move_scan_file(uploaded_file=request.FILES["file"])
    scan.full_delete(filesonly=True)
    scan.uploader = request.user
    scan.pdf = os.path.relpath(new_path, settings.MEDIA_ROOT)
    scan.save()

    edit_url = reverse("moderation.edit_scan", args=[scan.pk])
    task_id = tasks.split_scan.delay(scan_id=scan.pk, redirect=edit_url)
    return redirect("moderation.wait_for_processing", task_id)
def scan_replace(request, scan_id=None):
    """
    Replace the PDF of the scan identified by ``scan_id``.

    The scan is looked up through ``Scan.objects.org_filter`` for the
    requesting user; ``PermissionDenied`` is raised when that lookup finds
    nothing.  A valid PDF upload is moved into place, the scan's previous
    files are deleted, the scan record is updated and ``tasks.split_scan`` is
    queued, after which the user is redirected to the wait-for-processing
    page.  In every other case the replace form is rendered.
    """
    try:
        scan = Scan.objects.org_filter(request.user, pk=scan_id).get()
    except Scan.DoesNotExist:
        raise PermissionDenied

    form = ScanUploadForm(
        request.POST or None,
        request.FILES or None,
        types={"pdf": "application/pdf"},
    )
    if form.is_valid():
        moved_to = tasks.move_scan_file(uploaded_file=request.FILES['file'])
        scan.full_delete(filesonly=True)
        scan.uploader = request.user
        scan.pdf = os.path.relpath(moved_to, settings.MEDIA_ROOT)
        scan.save()
        after_split = reverse("moderation.edit_scan", args=[scan.pk])
        task_id = tasks.split_scan.delay(scan_id=scan.pk,
                                         redirect=after_split)
        return redirect('moderation.wait_for_processing', task_id)

    return render(request, "scanning/replace.html", {'form': form})
def new_scan(filename=None, uploaded_file=None, uploader_id=None,
             scan_id=None):
    """
    Given an absolute filename or a Django UploadedFile instance, write the
    file to MEDIA_ROOT, then hand off processing to the asynchronous task to
    create scans.

    When ``scan_id`` is given, the existing scan's files are deleted and the
    scan is pointed at the new file; otherwise the task is given the new
    file's path and the uploader's id.  A ``TaskMeta`` row with status
    "PENDING" is created for the queued task.

    Returns the id of the queued ``tasks.process_scan`` task.

    Raises ``Exception`` when neither ``filename`` nor ``uploaded_file`` is
    given, or when ``uploader_id`` does not match a user.  Raises
    ``Scan.DoesNotExist`` when ``scan_id`` does not match a scan.
    """
    # Validate all arguments before touching anything on disk: deleting the
    # old scan's files first would destroy them even when the call is about
    # to be rejected.
    if not (filename or uploaded_file):
        raise Exception("Requires one of filename or uploaded_file")
    try:
        uploader = User.objects.get(pk=uploader_id)
    except User.DoesNotExist:
        raise Exception("Requires valid uploader_id.")

    scan = None
    if scan_id:
        scan = Scan.objects.get(pk=scan_id)
        scan.full_delete(filesonly=True)

    dest = tasks.move_scan_file(uploaded_file=uploaded_file,
                                filename=filename)

    after_processing = reverse("moderation.home") + "#/process"
    if scan is not None:
        after_processing += "/scan/%s" % scan_id
    task_kwargs = {'redirect': after_processing}

    if scan is not None:
        scan.pdf = dest
        scan.uploader = uploader
        scan.save()
        task_kwargs['scan_id'] = scan.id
    else:
        task_kwargs['filename'] = dest
        task_kwargs['uploader_id'] = uploader_id

    task_id = tasks.process_scan.delay(**task_kwargs).task_id
    # Create a TaskMeta for us to look at while processing happens.
    TaskMeta.objects.create(
        task_id=task_id,
        status="PENDING",
        result=str(task_kwargs),
        date_done=datetime.datetime.now(),
    )
    return task_id
def load_test_data():
    """
    Populate the database from ``MEDIA_ROOT/test/test_data.yaml``.

    In order, this sets the current site's domain and name, creates the
    admin users, creates the organizations with their moderators, links
    organizations whose outgoing mail is handled by another organization,
    and then, for every user in the file, fills in the profile, adds the
    user to their organizations, builds a scan with documents and comments
    for each "received" correspondence, and finally builds a letter for each
    "sent" correspondence.

    Users and organizations are fetched with ``get_or_create`` so existing
    rows are reused.
    """
    data_file = os.path.join(settings.MEDIA_ROOT, "test", "test_data.yaml")
    uploader = User.objects.get(username='******')
    # get_or_create, like every other fixture object below, so that running
    # the loader a second time does not fail on the existing user.
    commenter, created = User.objects.get_or_create(username="******")
    with open(data_file) as fh:
        data = yaml.safe_load(fh)

    orgs = {}

    print("Setting site...")
    site = Site.objects.get_current()
    site.domain = data['site']['domain']
    site.name = data['site']['name']
    site.save()

    print("Adding admins...")
    for admin_data in data['admins']:
        user, created = User.objects.get_or_create(
            username=admin_data['username'],
            is_superuser=True,
            is_staff=True,
        )
        user.set_password(admin_data['password'])
        user.save()

    print("Adding orgs...")
    for org_data in data['orgs']:
        org, created = Organization.objects.get_or_create(
            name=org_data['name'],
            personal_contact=org_data['personal_contact'],
            slug=slugify(org_data['name']),
            public=org_data['public'],
            mailing_address=org_data['mailing_address'],
            about=org_data.get('about', ''),
            footer=org_data.get('footer', ''),
        )
        orgs[org_data['name']] = org
        for mod_data in org_data['moderators']:
            u, created = User.objects.get_or_create(
                username=mod_data['username'])
            u.set_password(mod_data['password'])
            u.save()
            org.moderators.add(u)
            Group.objects.get(name='moderators').user_set.add(u)

    # Second pass: the handling organization may appear later in the file
    # than the organization that refers to it.
    for org_data in data['orgs']:
        mail_handled_by = org_data.get('outgoing_mail_handled_by', None)
        if mail_handled_by:
            org = Organization.objects.get(name=org_data['name'])
            mailer = Organization.objects.get(name=mail_handled_by)
            org.outgoing_mail_handled_by = mailer
            org.save()

    print("Building pdfs and users...")
    for user_data in data['users']:
        user, created = User.objects.get_or_create(
            username=slugify(user_data['name']))
        if user_data.get('managed', False):
            random_mailing_address = "\n".join([
                # Prisoner number
                "#%s" % "".join(
                    random.choice(string.digits) for a in range(8)),
                # Street
                "%s Cherry Tree Lane" % "".join(
                    random.choice(string.digits) for a in range(3)),
                # City, state, zip
                "City Name, %s %s" % (
                    random.choice(US_STATES)[0],
                    "".join(random.choice(string.digits) for a in range(5)),
                ),
            ])
        else:
            random_mailing_address = ""

        profile = user.profile
        profile.display_name = user_data['name']
        profile.mailing_address = random_mailing_address
        profile.blogger = user_data.get('blogger', False)
        profile.managed = user_data.get('managed', False)
        profile.consent_form_received = user_data.get(
            'consent_form_received', False)
        profile.blog_name = user_data.get('blog_name', None) or ''
        profile.save()

        for org_name in user_data['orgs']:
            orgs[org_name].members.add(user)

        for corresp in user_data['correspondence']:
            # Each correspondence entry is a one-item mapping of
            # direction -> content.
            direction, content = next(iter(corresp.items()))
            if direction != "received":
                continue

            # Build Scan
            pdf = build_pdf(content['parts'], profile)
            path = tasks.move_scan_file(filename=pdf)
            scan = Scan.objects.create(
                uploader=uploader,
                # The scan belongs to the user's last listed organization.
                # Read it explicitly instead of relying on the variable left
                # over from the membership loop above, which would be stale
                # (or undefined) for a user with no organizations.
                org=orgs[user_data['orgs'][-1]],
                author=user,
                pdf=os.path.relpath(path, settings.MEDIA_ROOT),
                under_construction=True,
                processing_complete=True,
                created=content['date'],
            )
            # execute synchronously
            tasks.split_scan(scan_id=scan.pk)

            # Build Documents
            page_count = 1  # ignore envelope
            for part in content['parts']:
                page_count += part["pages"]
                if part["type"] == "ignore":
                    continue
                document = Document.objects.create(
                    scan=scan,
                    editor=uploader,
                    author=user,
                    type=part["type"],
                    date_written=content["date"],
                    created=content["date"],
                    title=part.get("title", None) or "",
                )
                # Attach the scan pages covered by this part, in order.
                first_page = page_count - part["pages"]
                for i, page_index in enumerate(range(first_page, page_count)):
                    scanpage = scan.scanpage_set.get(order=page_index)
                    DocumentPage.objects.create(
                        document=document, scan_page=scanpage, order=i)

                if part["type"] in ("profile", "post"):
                    document.status = "published"
                else:
                    document.status = "unpublishable"
                document.highlight_transform = '{"document_page_id": %s, "crop": [44.5, 58.66667175292969, 582.5, 288.6666717529297]}' % document.documentpage_set.all()[0].pk
                document.save()
                # execute synchronously
                tasks.update_document_images(document.pk)

                for comment in part.get('comments', []):
                    Comment.objects.create(
                        user=commenter,
                        comment="Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec a diam lectus. Sed sit amet ipsum mauris. Maecenas congue ligula ac quam viverra nec consectetur ante hendrerit. Donec et mollis dolor. Praesent et diam eget libero egestas mattis sit amet vitae augue. Nam tincidunt congue enim, ut porta lorem lacinia consectetur. Donec ut libero sed arcu vehicula ultricies a non tortor. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aenean ut gravida lorem. Ut turpis felis, pulvinar a semper sed, adipiscing id dolor. Pellentesque auctor nisi id magna consequat sagittis.",
                        document=document,
                        created=comment['date'],
                    )

        # Finish received scans before parsing letters, to ensure the
        # comments etc. already exist.
        for corresp in user_data['correspondence']:
            direction, content = next(iter(corresp.items()))
            if direction != "sent":
                continue
            letter = Letter(
                type=content['type'],
                auto_generated=True,
                sender=uploader,
                created=content['date'],
                sent=content['date'],
                recipient=user,
                org=Organization.objects.get(name=user_data['orgs'][0]),
            )
            if content['type'] == "comments":
                # The letter must be saved before comments can be attached.
                letter.save()
                comments = Comment.objects.unmailed().filter(
                    document__author=user, created__lt=content['date'])
                for comment in comments:
                    letter.comments.add(comment)
            elif content['type'] == "letter":
                letter.body = content['body']
                letter.save()
def edit_profile(request, user_id=None):
    """
    Show and process the forms for editing a user and their profile.

    Raises ``PermissionDenied`` when the requesting user may edit neither the
    profile nor the user record of ``user_id``.  On a POST where every bound
    form validates, the forms are saved; if a file was uploaded, a ``Scan``
    is created for it and ``process_scan_to_profile`` is queued, redirecting
    to the wait-for-processing page.  Without an upload the user is
    redirected to the profile page with a success message.  In all other
    cases the edit page is rendered.
    """
    may_edit_profile = can_edit_profile(request.user, user_id)
    may_edit_user = can_edit_user(request.user, user_id)
    if not (may_edit_profile or may_edit_user):
        raise PermissionDenied

    user = get_object_or_404(User, pk=user_id)

    # Most recently modified published profile document, if there is one.
    published_profiles = Document.objects.filter(
        type="profile", status="published", author=user,
    ).order_by('-modified')
    try:
        document = published_profiles[0]
    except IndexError:
        document = None

    # XXX Could probably simplify the permissions backflips by assuming that
    # an editor using this interface either has permissions to edit both
    # profile/user, or neither.
    user_form = None
    profile_form = None
    scan_upload_form = None
    ProfileForm = get_profile_form(request.user)

    if request.method != 'POST':
        if may_edit_profile:
            profile_form = ProfileForm(instance=user.profile)
            scan_upload_form = ProfileUploadForm()
        if may_edit_user:
            user_form = UserFormNoEmail(instance=user)
    else:
        if may_edit_profile:
            profile_form = ProfileForm(request.POST, instance=user.profile)
            scan_upload_form = ProfileUploadForm(request.POST, request.FILES)
        if may_edit_user:
            user_form = UserFormNoEmail(request.POST, instance=user)

        # Forms that were not instantiated count as valid; validation stops
        # at the first form that fails.
        candidates = (profile_form, user_form, scan_upload_form)
        if all(not form or form.is_valid() for form in candidates):
            if profile_form:
                profile_form.save()
            if user_form:
                user_form.save()
            if scan_upload_form and 'file' in request.FILES:
                pdf = move_scan_file(uploaded_file=request.FILES['file'])
                scan = Scan.objects.create(uploader=user, author=user,
                                           pdf=pdf)
                profile_url = reverse('profiles.profile_show',
                                      args=[user_id])
                task_id = process_scan_to_profile.delay(scan.pk, profile_url)
                return redirect('moderation.wait_for_processing',
                                task_id=task_id)
            messages.success(request, _("Changes saved."))
            return redirect('profiles.profile_show', user_id)

    context = {
        'document': document,
        'profile_form': profile_form,
        'user_form': user_form,
        'scan_upload_form': scan_upload_form,
        'profile': user.profile,
        'can_edit_profile': may_edit_profile,
        'can_edit_user': may_edit_user,
    }
    return render(request, "profiles/profile_edit.html", context)
def handle(self, *args, **kwargs):
    """
    Import newly scanned letters from mailboxforwarding.com.

    Logs in with the ``MAILBOX_FORWARDING`` credentials from settings and
    parses the item list embedded in the returned page.  For every item
    whose status is "Scanned" and for which no ``Scan`` with the matching
    ``source_id`` exists yet, the PDF is downloaded, moved into place,
    recorded as an under-construction ``Scan`` and passed to
    ``tasks.split_scan``.  Items with any other status are printed for
    manual follow-up.

    Exits with status 1 when the settings are missing.  Raises ``Exception``
    when the item list cannot be found in the login response.
    """
    base_url = "https://www.mailboxforwarding.com/"
    if (not hasattr(settings, "MAILBOX_FORWARDING") or
            "username" not in settings.MAILBOX_FORWARDING or
            "password" not in settings.MAILBOX_FORWARDING):
        print("Requires MAILBOX_FORWARDING settings, e.g.:")
        print('MAILBOX_FORWARDING = {')
        print(' "username": "******",')
        print(' "password": "******",')
        print('}')
        print("exit 1")
        sys.exit(1)

    sess = requests.Session()
    res = sess.post(base_url + "manage/login.php", {
        "action": "login",
        "email": settings.MAILBOX_FORWARDING["username"],
        "password": settings.MAILBOX_FORWARDING["password"],
        "loginsubmit.x": "0",
        "loginsubmit.y": "0"
    })
    # This is a slightly dirty hack -- we're matching a javascript data
    # structure with a regex, converting the quotes to doubles so it
    # resembles JSON, and then loading it as JSON.  This may prove brittle.
    match = re.search(r"Ext\.grid\.dummyData = (\[.*\]\]);", res.text,
                      re.DOTALL)
    if not match:
        # Without this check a failed login surfaces as an AttributeError
        # on None below.
        raise Exception("Can't find data. Are login creds correct?")
    text = match.group(1)
    text = text.replace('"', '\\"')
    text = text.replace("'", '"')
    data = json.loads(text)

    scans = {}
    # A list, not a dict keyed by PDF id: items without a PDF link have no
    # id and would otherwise overwrite one another.
    packages = []
    for _a, _b, date, _c, kind, status, dl in data:
        obj = {"date": date, "kind": kind, "status": status}
        if status != "Scanned":
            packages.append(obj)
            continue
        match = re.search(r"pdfview.php\?id=(\d+)", dl)
        if not match:
            # Marked as scanned but no PDF link to download from; skip it
            # so no bogus "id=None" download is attempted.
            continue
        scans[match.group(1)] = obj

    uploader = User.objects.get(username="******")
    org = Organization.objects.get(pk=1)  # TODO: generalize this?

    new_count = 0
    for id_ in scans:
        source_id = "mailboxforwarding.com-{}".format(id_)
        if Scan.objects.filter(source_id=source_id).exists():
            continue
        new_count += 1
        print("Downloading %s" % source_id)
        res = sess.get("{}manage/pdfview.php?id={}".format(base_url, id_))
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as fh:
            fh.write(res.content)
            name = fh.name
        # Move the file only after it has been closed, so its full contents
        # are on disk.
        path = tasks.move_scan_file(filename=name)
        scan = Scan.objects.create(
            uploader=uploader,
            pdf=os.path.relpath(path, settings.MEDIA_ROOT),
            under_construction=True,
            org=org,
            source_id=source_id
        )
        tasks.split_scan(scan=scan)

    if packages:
        print("Manual action needed on the following at "
              "https://www.mailboxforwarding.com/:")
        for details in packages:
            new_count += 1
            print(details)

    print("Examined {} letters, {} new.".format(len(data), new_count))
def handle(self, *args, **kwargs):
    """
    Import newly scanned letters from mailboxforwarding.com.

    Logs in with the ``MAILBOX_FORWARDING`` credentials from settings and
    parses the item list embedded in the returned page.  Unscanned letters
    and items without a PDF link are skipped.  For each scanned item that
    has no ``Scan`` with the matching ``source_id`` yet, the PDF and the
    envelope image are downloaded, the envelope is prepended to the PDF as
    its first page, and the result is stored as an under-construction
    ``Scan`` and passed to ``tasks.split_scan``.  Remaining non-letter items
    are printed for manual follow-up.

    Exits with status 1 when the settings are missing.  Raises ``Exception``
    when the item list, an item's type/status, or its envelope image cannot
    be parsed.
    """
    base_url = "https://www.mailboxforwarding.com/"
    configured = (hasattr(settings, "MAILBOX_FORWARDING") and
                  "username" in settings.MAILBOX_FORWARDING and
                  "password" in settings.MAILBOX_FORWARDING)
    if not configured:
        print("Requires MAILBOX_FORWARDING settings, e.g.:")
        print('MAILBOX_FORWARDING = {')
        print(' "username": "******",')
        print(' "password": "******",')
        print('}')
        print("exit 1")
        sys.exit(1)

    sess = requests.Session()
    login_fields = {
        "action": "login",
        "email": settings.MAILBOX_FORWARDING["username"],
        "password": settings.MAILBOX_FORWARDING["password"],
        "loginsubmit.x": "0",
        "loginsubmit.y": "0"
    }
    res = sess.post(base_url + "manage/login.php", login_fields)

    # This is a slightly dirty hack -- we're matching a javascript data
    # structure with a regex, converting the quotes to doubles so it
    # resembles JSON, and then loading it as JSON.  This may prove brittle.
    found = re.search(r"Ext\.grid\.dummyData = (\[.*\]\]);", res.text,
                      re.DOTALL)
    if not found:
        raise Exception("Can't find data. Are login creds correct?")
    text = found.group(1).replace('"', '\\"').replace("'", '"')
    data = json.loads(text)

    scans = {}
    packages = {}
    for checkbox, date, envelope, type_status, dl in data:
        found = re.search("Status: <b>([^<]+)</b>.*Type: <b>([^<]+)</b>",
                          type_status)
        if not found:
            raise Exception("Can't match type/status")
        status, kind = found.group(1), found.group(2)
        details = {'kind': kind, 'status': status}
        if kind == "Letter" and status != "Scanned":
            continue

        found = re.search(r"pdfview.php\?id=(\d+)", dl)
        if not found:
            # TODO: Handle packages correctly
            continue
        id_ = found.group(1)

        found = re.search(r'src="([^"]+)"', envelope)
        if not found:
            raise Exception("Can't match envelope image")
        details['envelope'] = found.group(1)

        if status == "Scanned":
            scans[id_] = details
        elif kind != "Letter":
            packages[id_] = details

    uploader = User.objects.get(username="******")
    org = Organization.objects.get(pk=1)  # TODO: generalize this?

    def download(url):
        # Fetch url with the logged-in session into a rewound buffer.
        buf = StringIO()
        buf.write(sess.get(url).content)
        buf.seek(0)
        return buf

    new_count = 0
    for id_, details in scans.items():
        source_id = "mailboxforwarding.com-{}".format(id_)
        if Scan.objects.filter(source_id=source_id).exists():
            continue
        new_count += 1

        print("Downloading pdf %s" % source_id)
        in_pdf_fh = download(
            "{}manage/pdfview.php?id={}".format(base_url, id_))
        reader = PdfFileReader(in_pdf_fh)

        print("Downloading envelope %s" % details['envelope'])
        in_envelope_fh = download(details['envelope'])
        img = Image.open(in_envelope_fh)
        # Convert the envelope image to a one-page PDF held in a buffer.
        out_envelope_fh = StringIO()
        img.save(out_envelope_fh, "pdf")
        envelope_reader = PdfFileReader(out_envelope_fh)

        # Envelope first, then every page of the letter.
        writer = PdfFileWriter()
        writer.addPage(envelope_reader.getPage(0))
        for page_number in range(reader.getNumPages()):
            writer.addPage(reader.getPage(page_number))
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as fh:
            writer.write(fh)
            dest_pdf_name = fh.name

        # The buffers are closed only after the merged PDF has been written.
        in_envelope_fh.close()
        out_envelope_fh.close()
        in_pdf_fh.close()

        path = tasks.move_scan_file(filename=dest_pdf_name)
        scan = Scan.objects.create(
            uploader=uploader,
            pdf=os.path.relpath(path, settings.MEDIA_ROOT),
            under_construction=True,
            org=org,
            source_id=source_id,
        )
        tasks.split_scan(scan=scan)

    if packages:
        print("Manual action needed on the following at "
              "https://www.mailboxforwarding.com/:")
        for details in packages.values():
            new_count += 1
            print(details)

    print("Examined {} letters, {} new.".format(len(data), new_count))
def edit_profile(request, user_id=None):
    """
    Display and handle the edit forms for a user and their profile.

    ``PermissionDenied`` is raised when the requesting user may edit neither
    the profile nor the user record of ``user_id``.  A POST whose bound forms
    all validate saves them; when a file accompanies the POST, a ``Scan`` is
    created from it and ``process_scan_to_profile`` is queued before
    redirecting to the wait-for-processing page, otherwise the response is a
    redirect to the profile page with a success message.  Any other request
    renders the edit page.
    """
    #FIXME: org permission here
    profile_editable = can_edit_profile(request.user, user_id)
    user_editable = can_edit_user(request.user, user_id)
    if not profile_editable and not user_editable:
        raise PermissionDenied

    user = get_object_or_404(User, pk=user_id)

    # Latest published profile document for this author, or None.
    try:
        document = Document.objects.filter(
            type="profile",
            status="published",
            author=user,
        ).order_by('-modified')[0]
    except IndexError:
        document = None

    # XXX Could probably simplify the permissions backflips by assuming that
    # an editor using this interface either has permissions to edit both
    # profile/user, or neither.
    user_form = profile_form = scan_upload_form = None
    ProfileForm = get_profile_form(request.user)

    if request.method == 'POST':
        if profile_editable:
            profile_form = ProfileForm(request.POST, instance=user.profile)
            scan_upload_form = ProfileUploadForm(request.POST, request.FILES)
        if user_editable:
            user_form = UserFormNoEmail(request.POST, instance=user)

        # A form that was not instantiated counts as valid.
        profile_ok = not profile_form or profile_form.is_valid()
        user_ok = profile_ok and (not user_form or user_form.is_valid())
        upload_ok = user_ok and (
            not scan_upload_form or scan_upload_form.is_valid())
        if upload_ok:
            if profile_form:
                profile_form.save()
            if user_form:
                user_form.save()
            if scan_upload_form and 'file' in request.FILES:
                pdf = move_scan_file(uploaded_file=request.FILES['file'])
                scan = Scan.objects.create(
                    uploader=user,
                    author=user,
                    pdf=pdf,
                )
                task_id = process_scan_to_profile.delay(
                    scan.pk,
                    reverse('profiles.profile_show', args=[user_id]),
                )
                return redirect('moderation.wait_for_processing',
                                task_id=task_id)
            messages.success(request, _("Changes saved."))
            return redirect('profiles.profile_show', user_id)
    else:
        if profile_editable:
            profile_form = ProfileForm(instance=user.profile)
            scan_upload_form = ProfileUploadForm()
        if user_editable:
            user_form = UserFormNoEmail(instance=user)

    return render(request, "profiles/profile_edit.html", {
        'document': document,
        'profile_form': profile_form,
        'user_form': user_form,
        'scan_upload_form': scan_upload_form,
        'profile': user.profile,
        'can_edit_profile': profile_editable,
        'can_edit_user': user_editable,
    })
def load_test_data():
    """
    Load fixture data from ``MEDIA_ROOT/test/test_data.yaml``.

    The steps, in order: set the current site's domain and name; create the
    admin users; create the organizations and their moderators; link each
    organization to the one handling its outgoing mail; then, per user in
    the file, fill in the profile, add organization memberships, build a
    scan with documents and comments for every "received" correspondence,
    and build a letter for every "sent" correspondence.

    Users and organizations are fetched with ``get_or_create`` so existing
    rows are reused.
    """
    data_file = os.path.join(settings.MEDIA_ROOT, "test", "test_data.yaml")
    uploader = User.objects.get(username='******')
    # Fetched with get_or_create, as all other fixture objects are, so a
    # second run does not fail because the user already exists.
    commenter, created = User.objects.get_or_create(username="******")
    with open(data_file) as fh:
        data = yaml.safe_load(fh)

    orgs = {}

    print("Setting site...")
    site = Site.objects.get_current()
    site.domain = data['site']['domain']
    site.name = data['site']['name']
    site.save()

    print("Adding admins...")
    for admin_data in data['admins']:
        user, created = User.objects.get_or_create(
            username=admin_data['username'],
            is_superuser=True,
            is_staff=True,
        )
        user.set_password(admin_data['password'])
        user.save()

    print("Adding orgs...")
    for org_data in data['orgs']:
        org, created = Organization.objects.get_or_create(
            name=org_data['name'],
            personal_contact=org_data['personal_contact'],
            slug=slugify(org_data['name']),
            public=org_data['public'],
            mailing_address=org_data['mailing_address'],
            about=org_data.get('about', ''),
            footer=org_data.get('footer', ''),
        )
        orgs[org_data['name']] = org
        for mod_data in org_data['moderators']:
            u, created = User.objects.get_or_create(
                username=mod_data['username']
            )
            u.set_password(mod_data['password'])
            u.save()
            org.moderators.add(u)
            Group.objects.get(name='moderators').user_set.add(u)

    # Done in a separate pass because the handling organization may be
    # listed after the organization that names it.
    for org_data in data['orgs']:
        mail_handled_by = org_data.get('outgoing_mail_handled_by', None)
        if mail_handled_by:
            org = Organization.objects.get(name=org_data['name'])
            mailer = Organization.objects.get(name=mail_handled_by)
            org.outgoing_mail_handled_by = mailer
            org.save()

    print("Building pdfs and users...")
    for user_data in data['users']:
        user, created = User.objects.get_or_create(
            username=slugify(user_data['name'])
        )
        if user_data.get('managed', False):
            random_mailing_address = "\n".join([
                # Prisoner number
                "#%s" % "".join(
                    random.choice(string.digits) for a in range(8)),
                # Street
                "%s Cherry Tree Lane" % "".join(
                    random.choice(string.digits) for a in range(3)),
                # City, state, zip
                "City Name, %s %s" % (
                    random.choice(US_STATES)[0],
                    "".join(random.choice(string.digits) for a in range(5)),
                ),
            ])
        else:
            random_mailing_address = ""

        profile = user.profile
        profile.display_name = user_data['name']
        profile.mailing_address = random_mailing_address
        profile.blogger = user_data.get('blogger', False)
        profile.managed = user_data.get('managed', False)
        profile.consent_form_received = user_data.get(
            'consent_form_received', False)
        profile.blog_name = user_data.get('blog_name', None) or ''
        profile.save()

        for org_name in user_data['orgs']:
            orgs[org_name].members.add(user)

        received = []
        sent = []
        for corresp in user_data['correspondence']:
            # Each entry is a one-item mapping of direction -> content.
            direction, content = next(iter(corresp.items()))
            if direction == "received":
                received.append(content)
            elif direction == "sent":
                sent.append(content)

        for content in received:
            # Build Scan
            pdf = build_pdf(content['parts'], profile)
            path = tasks.move_scan_file(filename=pdf)
            scan = Scan.objects.create(
                uploader=uploader,
                # The user's last listed organization owns the scan.  It is
                # read explicitly here; the variable left over from the
                # membership loop above would be stale (or undefined) for a
                # user with no organizations.
                org=orgs[user_data['orgs'][-1]],
                author=user,
                pdf=os.path.relpath(path, settings.MEDIA_ROOT),
                under_construction=True,
                processing_complete=True,
                created=content['date'],
            )
            # execute synchronously
            tasks.split_scan(scan_id=scan.pk)

            # Build Documents
            page_count = 1  # ignore envelope
            for part in content['parts']:
                page_count += part["pages"]
                if part["type"] == "ignore":
                    continue
                document = Document.objects.create(
                    scan=scan,
                    editor=uploader,
                    author=user,
                    type=part["type"],
                    date_written=content["date"],
                    created=content["date"],
                    title=part.get("title", None) or "",
                )
                # Attach the scan pages this part covers, in order.
                first_page = page_count - part["pages"]
                for i, page_index in enumerate(range(first_page, page_count)):
                    scanpage = scan.scanpage_set.get(order=page_index)
                    DocumentPage.objects.create(
                        document=document, scan_page=scanpage, order=i)

                if part["type"] in ("profile", "post"):
                    document.status = "published"
                else:
                    document.status = "unpublishable"
                document.highlight_transform = '{"document_page_id": %s, "crop": [44.5, 58.66667175292969, 582.5, 288.6666717529297]}' % document.documentpage_set.all()[0].pk
                document.save()
                # execute synchronously
                tasks.update_document_images(document.pk)

                for comment in part.get('comments', []):
                    Comment.objects.create(
                        user=commenter,
                        comment="Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec a diam lectus. Sed sit amet ipsum mauris. Maecenas congue ligula ac quam viverra nec consectetur ante hendrerit. Donec et mollis dolor. Praesent et diam eget libero egestas mattis sit amet vitae augue. Nam tincidunt congue enim, ut porta lorem lacinia consectetur. Donec ut libero sed arcu vehicula ultricies a non tortor. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aenean ut gravida lorem. Ut turpis felis, pulvinar a semper sed, adipiscing id dolor. Pellentesque auctor nisi id magna consequat sagittis.",
                        document=document,
                        created=comment['date'],
                    )

        # Finish received scans before parsing letters, to ensure the
        # comments etc. already exist.
        for content in sent:
            letter = Letter(
                type=content['type'],
                auto_generated=True,
                sender=uploader,
                created=content['date'],
                sent=content['date'],
                recipient=user,
                org=Organization.objects.get(name=user_data['orgs'][0]),
            )
            if content['type'] == "comments":
                # Save first: comments can only be attached to a saved
                # letter.
                letter.save()
                comments = Comment.objects.unmailed().filter(
                    document__author=user,
                    created__lt=content['date']
                )
                for comment in comments:
                    letter.comments.add(comment)
            elif content['type'] == "letter":
                letter.body = content['body']
                letter.save()
def handle(self, *args, **kwargs):
    """
    Fetch newly scanned mail from mailboxforwarding.com and store it.

    After logging in with the ``MAILBOX_FORWARDING`` credentials from
    settings, the item list embedded in the response is parsed.  Letters
    that are not yet scanned, and items without a PDF link, are skipped.
    Each scanned item without an existing ``Scan`` for its ``source_id`` is
    downloaded together with its envelope image; the envelope becomes the
    first page of the combined PDF, which is stored as an under-construction
    ``Scan`` and passed to ``tasks.split_scan``.  Remaining non-letter items
    are printed for manual follow-up.

    Exits with status 1 when the settings are missing.  Raises ``Exception``
    when the item list, an item's type/status, or its envelope image cannot
    be parsed.
    """
    base_url = "https://www.mailboxforwarding.com/"
    settings_present = hasattr(settings, "MAILBOX_FORWARDING")
    if not (settings_present and
            "username" in settings.MAILBOX_FORWARDING and
            "password" in settings.MAILBOX_FORWARDING):
        for line in ("Requires MAILBOX_FORWARDING settings, e.g.:",
                     'MAILBOX_FORWARDING = {',
                     ' "username": "******",',
                     ' "password": "******",',
                     '}',
                     "exit 1"):
            print(line)
        sys.exit(1)

    sess = requests.Session()
    res = sess.post(
        base_url + "manage/login.php",
        {
            "action": "login",
            "email": settings.MAILBOX_FORWARDING["username"],
            "password": settings.MAILBOX_FORWARDING["password"],
            "loginsubmit.x": "0",
            "loginsubmit.y": "0"
        },
    )

    # This is a slightly dirty hack -- we're matching a javascript data
    # structure with a regex, converting the quotes to doubles so it
    # resembles JSON, and then loading it as JSON.  This may prove brittle.
    data_match = re.search(r"Ext\.grid\.dummyData = (\[.*\]\]);", res.text,
                           re.DOTALL)
    if data_match is None:
        raise Exception("Can't find data. Are login creds correct?")
    js_literal = data_match.group(1)
    json_text = js_literal.replace('"', '\\"').replace("'", '"')
    data = json.loads(json_text)

    scans = {}
    packages = {}
    for checkbox, date, envelope, type_status, dl in data:
        kind_match = re.search(
            "Type: <b>([^<]+)</b>.*Status: <b>([^<]+)</b>", type_status)
        if kind_match is None:
            raise Exception("Can't match type/status")
        kind, status = kind_match.groups()
        details = {'kind': kind, 'status': status}
        is_letter = kind == "Letter"
        is_scanned = status == "Scanned"
        if is_letter and not is_scanned:
            continue

        id_match = re.search(r"pdfview.php\?id=(\d+)", dl)
        if id_match is None:
            # TODO: Handle packages correctly
            continue
        id_ = id_match.group(1)

        envelope_match = re.search(r'src="([^"]+)"', envelope)
        if envelope_match is None:
            raise Exception("Can't match envelope image")
        details['envelope'] = envelope_match.group(1)

        if is_scanned:
            scans[id_] = details
        elif not is_letter:
            packages[id_] = details

    uploader = User.objects.get(username="******")
    org = Organization.objects.get(pk=1)  # TODO: generalize this?

    new_count = 0
    for id_, details in scans.items():
        source_id = "mailboxforwarding.com-{}".format(id_)
        already_imported = Scan.objects.filter(source_id=source_id).exists()
        if already_imported:
            continue
        new_count += 1

        print("Downloading pdf %s" % source_id)
        pdf_url = "{}manage/pdfview.php?id={}".format(base_url, id_)
        pdf_response = sess.get(pdf_url)
        in_pdf_fh = StringIO()
        in_pdf_fh.write(pdf_response.content)
        in_pdf_fh.seek(0)
        reader = PdfFileReader(in_pdf_fh)

        print("Downloading envelope %s" % details['envelope'])
        envelope_response = sess.get(details['envelope'])
        in_envelope_fh = StringIO()
        in_envelope_fh.write(envelope_response.content)
        in_envelope_fh.seek(0)
        img = Image.open(in_envelope_fh)

        # Render the envelope image as a one-page PDF in a buffer.
        out_envelope_fh = StringIO()
        img.save(out_envelope_fh, "pdf")
        envelope_reader = PdfFileReader(out_envelope_fh)

        # Combined document: the envelope page, then the letter's pages.
        writer = PdfFileWriter()
        writer.addPage(envelope_reader.getPage(0))
        page_total = reader.getNumPages()
        for index in range(page_total):
            writer.addPage(reader.getPage(index))

        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as fh:
            writer.write(fh)
            dest_pdf_name = fh.name

        # Close the buffers only once the combined PDF has been written.
        for buf in (in_envelope_fh, out_envelope_fh, in_pdf_fh):
            buf.close()

        path = tasks.move_scan_file(filename=dest_pdf_name)
        relative_pdf = os.path.relpath(path, settings.MEDIA_ROOT)
        scan = Scan.objects.create(
            uploader=uploader,
            pdf=relative_pdf,
            under_construction=True,
            org=org,
            source_id=source_id
        )
        tasks.split_scan(scan=scan)

    if packages:
        print("Manual action needed on the following at "
              "https://www.mailboxforwarding.com/:")
        for details in packages.values():
            new_count += 1
            print(details)

    print("Examined {} letters, {} new.".format(len(data), new_count))