Exemple #1
0
    def handle(self, *args, **options):
        """Create local records for project documents that are not imported yet.

        Walks every document ID of the configured DocumentCloud project. For
        each ID that has no ``DocumentCloudProperties`` row, the document is
        fetched from DocumentCloud, its PDF is stored on a new ``Document``
        and both records are saved.

        Options read from ``options``:
            dry_run: when truthy, report what would be created; no file is
                written and no record is saved.
            verbosity: values above 1 add per-document progress output;
                values above 2 also print title, description and access.

        Raises:
            CommandError: if no project could be loaded, which is what
                happens when ``DOCUMENTCLOUD_PROJECT_ID`` is not set.
        """
        dry_run = options.get('dry_run')
        verbosity = options.get('verbosity', 1)

        if dry_run:
            self.stdout.write(
                'Dry run. No new document records will be created.\n')

        client = get_client()
        self.stdout.write('Connecting to DocumentCloud...\n')
        project = client.projects.get(
            id=DOCUMENTCLOUD_PROJECT_ID) if DOCUMENTCLOUD_PROJECT_ID else None

        if not project:
            raise CommandError(
                'No DOCUMENTCLOUD_PROJECT_ID set in settings. Cannot proceed\n'
            )

        self.stdout.write('Pulling document list for "{0}"...\n\n'.format(
            project.title))
        document_id_list = project.document_ids
        new_docs_list = []
        for doc_id in document_id_list:
            if verbosity > 1:
                self.stdout.write('Checking "{0}"\n'.format(doc_id))

            try:
                DocumentCloudProperties.objects.get(dc_id=doc_id)
            except DocumentCloudProperties.DoesNotExist:
                already_imported = False
            else:
                already_imported = True

            if already_imported:
                if verbosity > 1:
                    self.stdout.write(
                        'DocumentCloudProperties record for "{0}" already exists\n'
                        .format(doc_id))
                self.stdout.write('\n')
                continue

            dc_obj = client.documents.get(id=doc_id)
            new_doc_props = DocumentCloudProperties(
                dc_id=dc_obj.id, dc_url=dc_obj.canonical_url)
            if verbosity > 1:
                self.stdout.write('Creating record for {0}\n'.format(
                    dc_obj.id))
                if verbosity > 2:
                    self.stdout.write(
                        'with:\n\tTitle: {title}\n\tDescription {description}\n\tAccess: {access}\n'
                        .format(title=dc_obj.title,
                                description=dc_obj.description,
                                access=dc_obj.access))
            new_doc = Document(title=dc_obj.title,
                               description=dc_obj.description,
                               access_level=dc_obj.access)
            filename = os.path.basename('{0}.pdf'.format(dc_obj.id))
            if verbosity > 1:
                self.stdout.write(
                    'Saving file named {0}\n'.format(filename))

            if not dry_run:
                djfp = File(TemporaryFile())
                try:
                    djfp.write(dc_obj.pdf)
                    djfp.seek(0)
                    try:
                        new_doc.file.save(filename, djfp)
                        if not new_doc.file.closed:
                            new_doc.file.close()
                    except AttributeError as e:
                        # Best effort: an AttributeError while storing the
                        # file is reported (at verbosity > 1) but does not
                        # stop the records below from being saved.
                        if verbosity > 1:
                            self.stderr.write(
                                'Error saving doc:\n\t"{error_message}"\n'.
                                format(error_message=repr(e)))
                            # NOTE(review): assumes ``file`` is a Django
                            # FileField, whose ``url`` raises ValueError when
                            # no file is attached -- hence the truthiness
                            # check before reading it.
                            if new_doc.file and new_doc.file.url:
                                self.stdout.write(
                                    "New file at:\n\t{0}\n".format(
                                        new_doc.file.url))
                finally:
                    # Release the temporary file even when the download or
                    # the save raises something other than AttributeError.
                    djfp.close()
                new_doc_props.save()
                new_doc.dc_properties = new_doc_props
                new_doc.save()

            # Counted in dry runs too, so the summary reports what would
            # have been added.
            new_docs_list.append(doc_id)
            self.stdout.write('\n')

        self.stdout.write(
            'Added {num_docs} new Document records (out of {num_dc_docs})\n'
            .format(num_docs=len(new_docs_list),
                    num_dc_docs=len(document_id_list)))
    def handle(self, *args, **options):
        """Tag DocumentCloud search results and optionally add them to the project.

        Each positional argument is a DocumentCloud search string. Every
        document found gets the ``DOCUMENTCLOUD_META`` values merged into its
        ``data``; when ``CALLSIGN_REG`` finds a callsign in the title and
        exactly one ``Broadcaster`` starts with it, that broadcaster's
        callsign is merged in as well.

        Options read from ``options``:
            verbosity: values above 1 report the matched broadcaster.
            add_to_project: when truthy, documents missing from the
                configured project are appended to it.
            overwrite_data: when truthy, new values replace existing keys;
                otherwise only missing keys are filled in.
            dry_run: when truthy, nothing is pushed to DocumentCloud.

        Raises:
            CommandError: if no search string was passed, or if
                ``add_to_project`` is requested while no project is
                available.
        """
        if not args:
            raise CommandError('You must pass one or more search strings')

        verbosity = options.get('verbosity', 1)
        add_to_project = options.get('add_to_project')
        overwrite_data = options.get('overwrite_data')
        dry_run = options.get('dry_run')
        client = get_client()
        prog = re.compile(CALLSIGN_REG)
        project = client.projects.get_by_id(
            DOCUMENTCLOUD_PROJECT_ID) if DOCUMENTCLOUD_PROJECT_ID else None

        # The project is only required for --add-to-project; without it the
        # command can still update document metadata.
        if add_to_project and project is None:
            raise CommandError(
                'No DOCUMENTCLOUD_PROJECT_ID set in settings. '
                'Cannot add documents to a project\n')
        project_size = len(project.document_list) if project is not None else 0

        # Snapshot the project's IDs once: membership tests are O(1) and a
        # document returned by more than one search is queued only once.
        known_ids = set(project.document_ids) if add_to_project else set()

        if dry_run:
            self.stdout.write('Dry run. Documents will not be updated.\n')

        for search_string in args:
            self.stdout.write('Search on "{0}"...  '.format(search_string))
            document_list = client.documents.search(search_string)
            self.stdout.write('Found {0} documents.\n'.format(
                len(document_list)))
            for doc in document_list:
                self.stdout.write('Processing doc: "{0}"... '.format(doc.id))
                if add_to_project:
                    if doc.id not in known_ids:
                        self.stdout.write(
                            'Adding doc to project "{project_title}"\n'.format(
                                project_title=project.title))
                        known_ids.add(doc.id)
                        if not dry_run:
                            project.document_list.append(doc)
                    else:
                        self.stdout.write(
                            'Doc exists in project "{0}"\n'.format(
                                project.title))
                else:
                    self.stdout.write('\n')

                # A missing match, or an unmatched capture group, simply
                # means there is no callsign to look up.
                match_obj = prog.search(doc.title)
                possible_callsign = None
                if match_obj is not None and match_obj.group(1):
                    possible_callsign = match_obj.group(1).strip().replace(
                        ' ', '-')

                new_data = copy.deepcopy(DOCUMENTCLOUD_META)
                if possible_callsign:
                    self.stdout.write(
                        'Found possible callsign "{callsign}" in the title: "{title}"\n'
                        .format(callsign=possible_callsign, title=doc.title))
                    broadcaster = None
                    try:
                        broadcaster = Broadcaster.objects.get(
                            callsign__startswith=possible_callsign)
                    except Broadcaster.DoesNotExist:
                        self.stderr.write(
                            "Can't find a Broadcaster with a callsign that matches {0}. Skipping...\n"
                            .format(possible_callsign))
                    except Broadcaster.MultipleObjectsReturned:
                        self.stderr.write(
                            "document's callsign, {0}, matches multiple broadcasters. Skipping...\n"
                            .format(possible_callsign))
                    if broadcaster:
                        if verbosity > 1:
                            self.stdout.write(
                                'Found broadcaster callsign "{0}"" to match "{1}"\n'
                                .format(broadcaster.callsign,
                                        possible_callsign))
                        new_data.update({'callsign': broadcaster.callsign})

                doc_data = copy.deepcopy(doc.data)
                if overwrite_data:
                    doc_data.update(new_data)
                else:
                    # items() works on Python 2 and 3 (iteritems() is
                    # Python 2 only); setdefault keeps existing values.
                    for key, value in new_data.items():
                        doc_data.setdefault(key, value)

                if doc.data != doc_data:
                    if not dry_run:
                        doc.data = doc_data
                        doc.put()
                        sleep(
                            0.125
                        )  # Not sure if this is necessary, but let's play nice-ish
                else:
                    self.stdout.write(
                        'The doc data is already up to date. There are no data changes to push\n'
                    )
                self.stdout.write('\n')

        if add_to_project:
            new_size = len(project.document_list)
            new_count = new_size - project_size
            if dry_run:
                self.stdout.write('SIMULATING: ')
            self.stdout.write(
                'Updating project document list... Adding {0} documents. '.
                format(new_count))
            if project_size != new_size:
                self.stdout.write(
                    'Project document count from {0} to {1}]\n'.format(
                        project_size, new_size))
            if not dry_run:
                project.put()
            self.stdout.write('\n')
    def handle(self, *args, **options):
        """Apply metadata to documents found by search, optionally growing the project.

        Positional arguments are DocumentCloud search strings. For each
        document returned, a copy of ``DOCUMENTCLOUD_META`` is merged into the
        document's ``data``. If ``CALLSIGN_REG`` captures a callsign from the
        title and a single ``Broadcaster`` has a callsign starting with it,
        that callsign is added to the merged values.

        Options read from ``options``:
            verbosity: above 1, the matched broadcaster is reported.
            add_to_project: truthy to append documents that are not yet in
                the configured project.
            overwrite_data: truthy to let new values replace existing keys;
                falsy to fill in missing keys only.
            dry_run: truthy to skip every write to DocumentCloud.

        Raises:
            CommandError: when called without search strings, or when
                ``add_to_project`` is set but no project is available.
        """
        if not args:
            raise CommandError('You must pass one or more search strings')

        verbosity = options.get('verbosity', 1)
        add_to_project = options.get('add_to_project')
        overwrite_data = options.get('overwrite_data')
        dry_run = options.get('dry_run')
        client = get_client()
        prog = re.compile(CALLSIGN_REG)
        project = client.projects.get_by_id(DOCUMENTCLOUD_PROJECT_ID) if DOCUMENTCLOUD_PROJECT_ID else None

        # Only --add-to-project needs the project; metadata updates can run
        # without one instead of failing on ``None``.
        if add_to_project and project is None:
            raise CommandError('No DOCUMENTCLOUD_PROJECT_ID set in settings. Cannot add documents to a project\n')
        project_size = len(project.document_list) if project is not None else 0

        # IDs already in (or queued for) the project, so a document matched
        # by several searches is appended a single time.
        ids_in_project = set(project.document_ids) if add_to_project else set()

        if dry_run:
            self.stdout.write('Dry run. Documents will not be updated.\n')

        for search_string in args:
            self.stdout.write('Search on "{0}"...  '.format(search_string))
            document_list = client.documents.search(search_string)
            self.stdout.write('Found {0} documents.\n'.format(len(document_list)))
            for doc in document_list:
                self.stdout.write('Processing doc: "{0}"... '.format(doc.id))
                if not add_to_project:
                    self.stdout.write('\n')
                elif doc.id in ids_in_project:
                    self.stdout.write('Doc exists in project "{0}"\n'.format(project.title))
                else:
                    self.stdout.write('Adding doc to project "{project_title}"\n'.format(project_title=project.title))
                    ids_in_project.add(doc.id)
                    if not dry_run:
                        project.document_list.append(doc)

                # No match, or an empty/unmatched capture group, means no
                # callsign lookup for this document.
                match_obj = prog.search(doc.title)
                captured = match_obj.group(1) if match_obj is not None else None
                possible_callsign = captured.strip().replace(' ', '-') if captured else None

                new_data = copy.deepcopy(DOCUMENTCLOUD_META)
                if possible_callsign:
                    self.stdout.write('Found possible callsign "{callsign}" in the title: "{title}"\n'.format(callsign=possible_callsign, title=doc.title))
                    broadcaster = None
                    try:
                        broadcaster = Broadcaster.objects.get(callsign__startswith=possible_callsign)
                    except Broadcaster.DoesNotExist:
                        self.stderr.write("Can't find a Broadcaster with a callsign that matches {0}. Skipping...\n".format(possible_callsign))
                    except Broadcaster.MultipleObjectsReturned:
                        self.stderr.write("document's callsign, {0}, matches multiple broadcasters. Skipping...\n".format(possible_callsign))
                    if broadcaster:
                        if verbosity > 1:
                            self.stdout.write('Found broadcaster callsign "{0}"" to match "{1}"\n'.format(broadcaster.callsign, possible_callsign))
                        new_data.update({'callsign': broadcaster.callsign})

                doc_data = copy.deepcopy(doc.data)
                if overwrite_data:
                    doc_data.update(new_data)
                else:
                    # dict.items() exists on Python 2 and 3, unlike iteritems().
                    for key, value in new_data.items():
                        doc_data.setdefault(key, value)

                if doc.data != doc_data:
                    if not dry_run:
                        doc.data = doc_data
                        doc.put()
                        sleep(0.125)  # Not sure if this is necessary, but let's play nice-ish
                else:
                    self.stdout.write('The doc data is already up to date. There are no data changes to push\n')
                self.stdout.write('\n')

        if add_to_project:
            new_size = len(project.document_list)
            new_count = new_size - project_size
            if dry_run:
                self.stdout.write('SIMULATING: ')
            self.stdout.write('Updating project document list... Adding {0} documents. '.format(new_count))
            if project_size != new_size:
                self.stdout.write('Project document count from {0} to {1}]\n'.format(project_size, new_size))
            if not dry_run:
                project.put()
            self.stdout.write('\n')
    def handle(self, *args, **options):
        """Import DocumentCloud project documents that have no local record.

        Iterates over the document IDs of the configured DocumentCloud
        project. An ID that already has a ``DocumentCloudProperties`` row is
        skipped; otherwise the document is fetched, its PDF is saved onto a
        new ``Document``, and the properties and document records are saved.

        Options read from ``options``:
            dry_run: truthy to only report what would be created, without
                writing a file or saving any record.
            verbosity: above 1 prints per-document progress; above 2 also
                prints title, description and access of each new document.

        Raises:
            CommandError: when no project could be loaded, as happens when
                ``DOCUMENTCLOUD_PROJECT_ID`` is not set.
        """
        dry_run = options.get("dry_run")
        verbosity = options.get("verbosity", 1)

        if dry_run:
            self.stdout.write("Dry run. No new document records will be created.\n")

        client = get_client()
        self.stdout.write("Connecting to DocumentCloud...\n")
        project = client.projects.get(id=DOCUMENTCLOUD_PROJECT_ID) if DOCUMENTCLOUD_PROJECT_ID else None

        if not project:
            raise CommandError("No DOCUMENTCLOUD_PROJECT_ID set in settings. Cannot proceed\n")

        self.stdout.write('Pulling document list for "{0}"...\n\n'.format(project.title))
        document_id_list = project.document_ids
        new_docs_list = []
        for doc_id in document_id_list:
            if verbosity > 1:
                self.stdout.write('Checking "{0}"\n'.format(doc_id))

            try:
                DocumentCloudProperties.objects.get(dc_id=doc_id)
            except DocumentCloudProperties.DoesNotExist:
                record_exists = False
            else:
                record_exists = True

            if record_exists:
                if verbosity > 1:
                    self.stdout.write('DocumentCloudProperties record for "{0}" already exists\n'.format(doc_id))
                self.stdout.write("\n")
                continue

            dc_obj = client.documents.get(id=doc_id)
            new_doc_props = DocumentCloudProperties(dc_id=dc_obj.id, dc_url=dc_obj.canonical_url)
            if verbosity > 1:
                self.stdout.write("Creating record for {0}\n".format(dc_obj.id))
                if verbosity > 2:
                    self.stdout.write(
                        "with:\n\tTitle: {title}\n\tDescription {description}\n\tAccess: {access}\n".format(
                            title=dc_obj.title, description=dc_obj.description, access=dc_obj.access
                        )
                    )
            new_doc = Document(title=dc_obj.title, description=dc_obj.description, access_level=dc_obj.access)
            filename = os.path.basename("{0}.pdf".format(dc_obj.id))
            if verbosity > 1:
                self.stdout.write("Saving file named {0}\n".format(filename))

            if not dry_run:
                djfp = File(TemporaryFile())
                try:
                    djfp.write(dc_obj.pdf)
                    djfp.seek(0)
                    try:
                        new_doc.file.save(filename, djfp)
                        if not new_doc.file.closed:
                            new_doc.file.close()
                    except AttributeError as e:
                        # Best effort: the error is reported at verbosity > 1
                        # and the records are still saved afterwards.
                        if verbosity > 1:
                            self.stderr.write('Error saving doc:\n\t"{error_message}"\n'.format(error_message=repr(e)))
                            # NOTE(review): assumes ``file`` is a Django
                            # FileField; its ``url`` raises ValueError when no
                            # file is attached, so check truthiness first.
                            if new_doc.file and new_doc.file.url:
                                self.stdout.write("New file at:\n\t{0}\n".format(new_doc.file.url))
                finally:
                    # Always release the temporary file, including when the
                    # download or save fails with an unexpected exception.
                    djfp.close()
                new_doc_props.save()
                new_doc.dc_properties = new_doc_props
                new_doc.save()

            # Dry runs are counted as well, so the summary shows what would
            # have been added.
            new_docs_list.append(doc_id)
            self.stdout.write("\n")

        self.stdout.write(
            "Added {num_docs} new Document records (out of {num_dc_docs})\n".format(
                num_docs=len(new_docs_list), num_dc_docs=len(document_id_list)
            )
        )