def handle(self, *args, **options):
    """Create local records for project documents that are not imported yet.

    Iterates over ``project.document_ids`` of the DocumentCloud project
    identified by ``DOCUMENTCLOUD_PROJECT_ID``. For every id that has no
    ``DocumentCloudProperties`` row, the remote document is fetched, its PDF
    is stored on a new ``Document`` and both records are saved.

    Options read from ``options``:
        dry_run: when truthy, nothing is downloaded into a file or saved.
        verbosity: ``> 1`` logs per-document progress, ``> 2`` also logs the
            remote title/description/access values.

    Raises:
        CommandError: when no project could be loaded (the id setting is
            empty or the client returned a falsy project).

    NOTE(review): the nesting of this method was rebuilt from a
    whitespace-collapsed source; confirm the placement of the per-document
    newline and of ``new_docs_list.append`` against the intended behavior.
    """
    dry_run = options.get('dry_run')
    verbosity = options.get('verbosity', 1)
    if dry_run:
        self.stdout.write(
            'Dry run. No new document records will be created.\n')

    client = get_client()
    self.stdout.write('Connecting to DocumentCloud...\n')
    project = (client.projects.get(id=DOCUMENTCLOUD_PROJECT_ID)
               if DOCUMENTCLOUD_PROJECT_ID else None)
    if not project:
        raise CommandError(
            'No DOCUMENTCLOUD_PROJECT_ID set in settings. Cannot proceed\n')

    self.stdout.write(
        'Pulling document list for "{0}"...\n\n'.format(project.title))
    document_id_list = project.document_ids
    new_docs_list = []

    for doc_id in document_id_list:
        if verbosity > 1:
            self.stdout.write('Checking "{0}"\n'.format(doc_id))

        try:
            DocumentCloudProperties.objects.get(dc_id=doc_id)
        except DocumentCloudProperties.DoesNotExist:
            already_imported = False
        else:
            already_imported = True

        if already_imported:
            if verbosity > 1:
                self.stdout.write(
                    'DocumentCloudProperties record for "{0}" already exists\n'
                    .format(doc_id))
        else:
            dc_obj = client.documents.get(id=doc_id)
            new_doc_props = DocumentCloudProperties(
                dc_id=dc_obj.id, dc_url=dc_obj.canonical_url)
            if verbosity > 1:
                self.stdout.write(
                    'Creating record for {0}\n'.format(dc_obj.id))
            if verbosity > 2:
                self.stdout.write(
                    'with:\n\tTitle: {title}\n\tDescription {description}\n\tAccess: {access}\n'
                    .format(title=dc_obj.title,
                            description=dc_obj.description,
                            access=dc_obj.access))

            new_doc = Document(title=dc_obj.title,
                               description=dc_obj.description,
                               access_level=dc_obj.access)
            filename = os.path.basename('{0}.pdf'.format(dc_obj.id))
            if verbosity > 1:
                self.stdout.write('Saving file named {0}\n'.format(filename))

            if not dry_run:
                # The context manager guarantees the temporary file is
                # released even if saving raises something unexpected.
                with TemporaryFile() as fp:
                    djfp = File(fp)
                    djfp.write(dc_obj.pdf)
                    djfp.seek(0)
                    try:
                        new_doc.file.save(filename, djfp)
                        if not new_doc.file.closed:
                            new_doc.file.close()
                    except AttributeError as e:
                        # Best effort: report the problem and carry on with
                        # the record creation, as the command always did.
                        if verbosity > 1:
                            self.stderr.write(
                                'Error saving doc:\n\t"{error_message}"\n'
                                .format(error_message=repr(e)))
                        file_url = new_doc.file.url
                        if file_url:
                            self.stdout.write(
                                "New file at:\n\t{0}\n".format(file_url))

                new_doc_props.save()
                new_doc.dc_properties = new_doc_props
                new_doc.save()

            new_docs_list.append(doc_id)

        self.stdout.write('\n')

    self.stdout.write(
        'Added {num_docs} new Document records (out of {num_dc_docs})\n'
        .format(num_docs=len(new_docs_list),
                num_dc_docs=len(document_id_list)))
def handle(self, *args, **options):
    """Search DocumentCloud and push default metadata onto every hit.

    Each positional argument is used as a DocumentCloud search string. For
    every document found, a copy of ``DOCUMENTCLOUD_META`` is merged into the
    document's ``data``; when ``CALLSIGN_REG`` matches the title and exactly
    one ``Broadcaster`` has a callsign starting with the match, that callsign
    is added as well.

    Options read from ``options``:
        verbosity: ``> 1`` logs the matched broadcaster callsign.
        add_to_project: when truthy, documents missing from the configured
            project are appended to it and the project is saved at the end.
        overwrite_data: when truthy, new values replace existing keys;
            otherwise only missing keys are filled in.
        dry_run: when truthy, neither documents nor the project are changed.

    Raises:
        CommandError: when no search string is given, or when
            ``add_to_project`` is requested but no project could be loaded.

    NOTE(review): the nesting of this method was rebuilt from a
    whitespace-collapsed source; confirm that the metadata merge is meant to
    run for every document, not only for those with a matched callsign.
    """
    if not args:
        raise CommandError('You must pass one or more search strings')

    verbosity = options.get('verbosity', 1)
    add_to_project = options.get('add_to_project')
    overwrite_data = options.get('overwrite_data')
    dry_run = options.get('dry_run')

    client = get_client()
    prog = re.compile(CALLSIGN_REG)
    project = (client.projects.get_by_id(DOCUMENTCLOUD_PROJECT_ID)
               if DOCUMENTCLOUD_PROJECT_ID else None)
    # The project is only needed for add_to_project; without this guard a
    # missing project id crashed with AttributeError on every invocation.
    if add_to_project and project is None:
        raise CommandError(
            'No DocumentCloud project available (is DOCUMENTCLOUD_PROJECT_ID '
            'set in settings?). Cannot add documents to a project\n')
    project_size = len(project.document_list) if project is not None else 0

    if dry_run:
        self.stdout.write('Dry run. Documents will not be updated.\n')

    for search_string in args:
        self.stdout.write('Search on "{0}"... '.format(search_string))
        document_list = client.documents.search(search_string)
        self.stdout.write(
            'Found {0} documents.\n'.format(len(document_list)))

        for doc in document_list:
            self.stdout.write('Processing doc: "{0}"... '.format(doc.id))
            if add_to_project:
                if doc.id not in project.document_ids:
                    self.stdout.write(
                        'Adding doc to project "{project_title}"\n'.format(
                            project_title=project.title))
                    if not dry_run:
                        project.document_list.append(doc)
                else:
                    self.stdout.write(
                        'Doc exists in project "{0}"\n'.format(project.title))
            else:
                self.stdout.write('\n')

            match_obj = prog.search(doc.title)
            if match_obj is not None:
                possible_callsign = (
                    match_obj.group(1).strip().replace(' ', '-'))
            else:
                possible_callsign = None

            new_data = copy.deepcopy(DOCUMENTCLOUD_META)
            if possible_callsign:
                self.stdout.write(
                    'Found possible callsign "{callsign}" in the title: "{title}"\n'
                    .format(callsign=possible_callsign, title=doc.title))
                broadcaster = None
                try:
                    broadcaster = Broadcaster.objects.get(
                        callsign__startswith=possible_callsign)
                except Broadcaster.DoesNotExist:
                    self.stderr.write(
                        "Can't find a Broadcaster with a callsign that matches {0}. Skipping...\n"
                        .format(possible_callsign))
                except Broadcaster.MultipleObjectsReturned:
                    self.stderr.write(
                        "document's callsign, {0}, matches multiple broadcasters. Skipping...\n"
                        .format(possible_callsign))
                if broadcaster:
                    if verbosity > 1:
                        self.stdout.write(
                            'Found broadcaster callsign "{0}"" to match "{1}"\n'
                            .format(broadcaster.callsign, possible_callsign))
                    new_data.update({'callsign': broadcaster.callsign})

            # Work on a copy so the comparison below can tell whether the
            # merge actually changed anything.
            doc_data = copy.deepcopy(doc.data)
            if overwrite_data:
                doc_data.update(new_data)
            else:
                # items() instead of iteritems(): works on Python 2 and 3.
                for key, value in new_data.items():
                    if key not in doc_data:
                        doc_data[key] = value

            if doc.data != doc_data:
                if not dry_run:
                    doc.data = doc_data
                    doc.put()
                    # Not sure if this is necessary, but let's play nice-ish
                    sleep(0.125)
            else:
                self.stdout.write(
                    'The doc data is already up to date. There are no data changes to push\n')
            self.stdout.write('\n')

    if add_to_project:
        new_size = len(project.document_list)
        new_count = new_size - project_size
        if dry_run:
            self.stdout.write('SIMULATING: ')
        self.stdout.write(
            'Updating project document list... Adding {0} documents. '
            .format(new_count))
        if project_size != new_size:
            self.stdout.write(
                'Project document count from {0} to {1}]\n'.format(
                    project_size, new_size))
            if not dry_run:
                project.put()
        self.stdout.write('\n')
def handle(self, *args, **options):
    """Search DocumentCloud and push default metadata onto every hit.

    Each positional argument is used as a DocumentCloud search string. For
    every document found, a copy of ``DOCUMENTCLOUD_META`` is merged into the
    document's ``data``; when ``CALLSIGN_REG`` matches the title and exactly
    one ``Broadcaster`` has a callsign starting with the match, that callsign
    is added as well.

    Options read from ``options``:
        verbosity: ``> 1`` logs the matched broadcaster callsign.
        add_to_project: when truthy, documents missing from the configured
            project are appended to it and the project is saved at the end.
        overwrite_data: when truthy, new values replace existing keys;
            otherwise only missing keys are filled in.
        dry_run: when truthy, neither documents nor the project are changed.

    Raises:
        CommandError: when no search string is given, or when
            ``add_to_project`` is requested but no project could be loaded.

    NOTE(review): the nesting of this method was rebuilt from a
    whitespace-collapsed source; confirm that the metadata merge is meant to
    run for every document, not only for those with a matched callsign.
    """
    if not args:
        raise CommandError('You must pass one or more search strings')

    verbosity = options.get('verbosity', 1)
    add_to_project = options.get('add_to_project')
    overwrite_data = options.get('overwrite_data')
    dry_run = options.get('dry_run')

    client = get_client()
    prog = re.compile(CALLSIGN_REG)
    project = (client.projects.get_by_id(DOCUMENTCLOUD_PROJECT_ID)
               if DOCUMENTCLOUD_PROJECT_ID else None)
    # The project is only needed for add_to_project; without this guard a
    # missing project id crashed with AttributeError on every invocation.
    if add_to_project and project is None:
        raise CommandError(
            'No DocumentCloud project available (is DOCUMENTCLOUD_PROJECT_ID '
            'set in settings?). Cannot add documents to a project\n')
    project_size = len(project.document_list) if project is not None else 0

    if dry_run:
        self.stdout.write('Dry run. Documents will not be updated.\n')

    for search_string in args:
        self.stdout.write('Search on "{0}"... '.format(search_string))
        document_list = client.documents.search(search_string)
        self.stdout.write(
            'Found {0} documents.\n'.format(len(document_list)))

        for doc in document_list:
            self.stdout.write('Processing doc: "{0}"... '.format(doc.id))
            if add_to_project:
                if doc.id not in project.document_ids:
                    self.stdout.write(
                        'Adding doc to project "{project_title}"\n'.format(
                            project_title=project.title))
                    if not dry_run:
                        project.document_list.append(doc)
                else:
                    self.stdout.write(
                        'Doc exists in project "{0}"\n'.format(project.title))
            else:
                self.stdout.write('\n')

            match_obj = prog.search(doc.title)
            if match_obj is not None:
                possible_callsign = (
                    match_obj.group(1).strip().replace(' ', '-'))
            else:
                possible_callsign = None

            new_data = copy.deepcopy(DOCUMENTCLOUD_META)
            if possible_callsign:
                self.stdout.write(
                    'Found possible callsign "{callsign}" in the title: "{title}"\n'
                    .format(callsign=possible_callsign, title=doc.title))
                broadcaster = None
                try:
                    broadcaster = Broadcaster.objects.get(
                        callsign__startswith=possible_callsign)
                except Broadcaster.DoesNotExist:
                    self.stderr.write(
                        "Can't find a Broadcaster with a callsign that matches {0}. Skipping...\n"
                        .format(possible_callsign))
                except Broadcaster.MultipleObjectsReturned:
                    self.stderr.write(
                        "document's callsign, {0}, matches multiple broadcasters. Skipping...\n"
                        .format(possible_callsign))
                if broadcaster:
                    if verbosity > 1:
                        self.stdout.write(
                            'Found broadcaster callsign "{0}"" to match "{1}"\n'
                            .format(broadcaster.callsign, possible_callsign))
                    new_data.update({'callsign': broadcaster.callsign})

            # Work on a copy so the comparison below can tell whether the
            # merge actually changed anything.
            doc_data = copy.deepcopy(doc.data)
            if overwrite_data:
                doc_data.update(new_data)
            else:
                # items() instead of iteritems(): works on Python 2 and 3.
                for key, value in new_data.items():
                    if key not in doc_data:
                        doc_data[key] = value

            if doc.data != doc_data:
                if not dry_run:
                    doc.data = doc_data
                    doc.put()
                    # Not sure if this is necessary, but let's play nice-ish
                    sleep(0.125)
            else:
                self.stdout.write(
                    'The doc data is already up to date. There are no data changes to push\n')
            self.stdout.write('\n')

    if add_to_project:
        new_size = len(project.document_list)
        new_count = new_size - project_size
        if dry_run:
            self.stdout.write('SIMULATING: ')
        self.stdout.write(
            'Updating project document list... Adding {0} documents. '
            .format(new_count))
        if project_size != new_size:
            self.stdout.write(
                'Project document count from {0} to {1}]\n'.format(
                    project_size, new_size))
            if not dry_run:
                project.put()
        self.stdout.write('\n')
def handle(self, *args, **options):
    """Create local records for project documents that are not imported yet.

    Iterates over ``project.document_ids`` of the DocumentCloud project
    identified by ``DOCUMENTCLOUD_PROJECT_ID``. For every id that has no
    ``DocumentCloudProperties`` row, the remote document is fetched, its PDF
    is stored on a new ``Document`` and both records are saved.

    Options read from ``options``:
        dry_run: when truthy, nothing is downloaded into a file or saved.
        verbosity: ``> 1`` logs per-document progress, ``> 2`` also logs the
            remote title/description/access values.

    Raises:
        CommandError: when no project could be loaded (the id setting is
            empty or the client returned a falsy project).

    NOTE(review): the nesting of this method was rebuilt from a
    whitespace-collapsed source; confirm the placement of the per-document
    newline and of ``new_docs_list.append`` against the intended behavior.
    """
    dry_run = options.get("dry_run")
    verbosity = options.get("verbosity", 1)
    if dry_run:
        self.stdout.write(
            "Dry run. No new document records will be created.\n")

    client = get_client()
    self.stdout.write("Connecting to DocumentCloud...\n")
    project = (client.projects.get(id=DOCUMENTCLOUD_PROJECT_ID)
               if DOCUMENTCLOUD_PROJECT_ID else None)
    if not project:
        raise CommandError(
            "No DOCUMENTCLOUD_PROJECT_ID set in settings. Cannot proceed\n")

    self.stdout.write(
        'Pulling document list for "{0}"...\n\n'.format(project.title))
    document_id_list = project.document_ids
    new_docs_list = []

    for doc_id in document_id_list:
        if verbosity > 1:
            self.stdout.write('Checking "{0}"\n'.format(doc_id))

        try:
            DocumentCloudProperties.objects.get(dc_id=doc_id)
        except DocumentCloudProperties.DoesNotExist:
            already_imported = False
        else:
            already_imported = True

        if already_imported:
            if verbosity > 1:
                self.stdout.write(
                    'DocumentCloudProperties record for "{0}" already exists\n'
                    .format(doc_id))
        else:
            dc_obj = client.documents.get(id=doc_id)
            new_doc_props = DocumentCloudProperties(
                dc_id=dc_obj.id, dc_url=dc_obj.canonical_url)
            if verbosity > 1:
                self.stdout.write(
                    "Creating record for {0}\n".format(dc_obj.id))
            if verbosity > 2:
                self.stdout.write(
                    "with:\n\tTitle: {title}\n\tDescription {description}\n\tAccess: {access}\n"
                    .format(title=dc_obj.title,
                            description=dc_obj.description,
                            access=dc_obj.access))

            new_doc = Document(title=dc_obj.title,
                               description=dc_obj.description,
                               access_level=dc_obj.access)
            filename = os.path.basename("{0}.pdf".format(dc_obj.id))
            if verbosity > 1:
                self.stdout.write("Saving file named {0}\n".format(filename))

            if not dry_run:
                # The context manager guarantees the temporary file is
                # released even if saving raises something unexpected.
                with TemporaryFile() as fp:
                    djfp = File(fp)
                    djfp.write(dc_obj.pdf)
                    djfp.seek(0)
                    try:
                        new_doc.file.save(filename, djfp)
                        if not new_doc.file.closed:
                            new_doc.file.close()
                    except AttributeError as e:
                        # Best effort: report the problem and carry on with
                        # the record creation, as the command always did.
                        if verbosity > 1:
                            self.stderr.write(
                                'Error saving doc:\n\t"{error_message}"\n'
                                .format(error_message=repr(e)))
                        file_url = new_doc.file.url
                        if file_url:
                            self.stdout.write(
                                "New file at:\n\t{0}\n".format(file_url))

                new_doc_props.save()
                new_doc.dc_properties = new_doc_props
                new_doc.save()

            new_docs_list.append(doc_id)

        self.stdout.write("\n")

    self.stdout.write(
        "Added {num_docs} new Document records (out of {num_dc_docs})\n"
        .format(num_docs=len(new_docs_list),
                num_dc_docs=len(document_id_list)))