コード例 #1
0
 def setUp(self):
     """Prepare the search term, document id and API clients used by the tests."""
     self.test_search = 'Calpers special review'
     self.test_id = '74103-report-of-the-calpers-special-review'

     # Client created without credentials.
     self.public_client = DocumentCloud()

     # Client created with the module-level account credentials.
     account = (DOCUMENTCLOUD_USERNAME, DOCUMENTCLOUD_PASSWORD)
     self.private_client = DocumentCloud(*account)

     # Client created with placeholder credentials.
     bogus_user, bogus_password = "John Doe", "TK"
     self.fake_client = DocumentCloud(bogus_user, bogus_password)
コード例 #2
0
 def setUp(self):
     """
     Set up the document id, API clients and version shared by the tests.
     """
     self.test_id = '2511322-lafd-recruitment-report'
     self.public_client = DocumentCloud()

     # The test account's credentials are read from the environment.
     username = os.environ['DOCUMENTCLOUD_TEST_USERNAME']
     password = os.environ['DOCUMENTCLOUD_TEST_PASSWORD']
     self.private_client = DocumentCloud(username, password)

     # Client created with placeholder credentials.
     self.fake_client = DocumentCloud("John Doe", "TK")
     self.version = self.get_version()
コード例 #3
0
ファイル: test.py プロジェクト: qiz264/python-documentcloud
 def setUp(self):
     """
     Set up the document id, API clients and version shared by the tests.
     """
     self.test_id = '74103-report-of-the-calpers-special-review'
     self.public_client = DocumentCloud()

     # The test account's credentials are read from the environment.
     username = os.environ['DOCUMENTCLOUD_TEST_USERNAME']
     password = os.environ['DOCUMENTCLOUD_TEST_PASSWORD']
     self.private_client = DocumentCloud(username, password)

     # Client created with placeholder credentials.
     self.fake_client = DocumentCloud("John Doe", "TK")
     self.version = self.get_version()
コード例 #4
0
def edit():
    """Controller: edit a stored document and push the changes to DocumentCloud.

    The record id is the first URL argument; without it the request is
    redirected to ``document/index``. On a valid submission the form values
    are copied onto the remote document and saved with ``put()``.

    Returns:
        dict with the rendered ``form``.
    """
    from documentcloud import DocumentCloud
    dc_id = request.args(0) or redirect(URL('document', 'index'))
    db.documentCloud.file.requires = None
    record = db.documentCloud(dc_id)
    form = SQLFORM(db.documentCloud, record)

    # Form fields mirrored onto the remote document, in assignment order.
    synced_fields = ('title', 'source', 'description', 'related_article',
                     'published_url', 'access', 'project', 'secure')

    if form.validate():
        api = DocumentCloud(username=dc_username, password=dc_password)
        remote_doc = api.documents.get(record.dc_id)
        for field_name in synced_fields:
            setattr(remote_doc, field_name, form.vars[field_name])
        if remote_doc.put() is not None:
            response.flash = T('Documento actualizado')
    elif form.errors:
        response.flash = T('Hay errores en el formulario')
    else:
        response.flash = T('Por favor llene el formulario')

    return dict(form=form)
コード例 #5
0
    def update_to_documentcloud(self, field, value):
        """Set ``field`` to ``value`` on this attachment's DocumentCloud document.

        Does nothing when the attachment's source type is not one of the
        DocumentCloud source types, or when the remote attribute already
        equals ``value``. A missing document or a failed save is logged as an
        error and not raised.
        """
        is_documentcloud = (
            self.source_type in AttachmentSourceType.DOCUMENTCLOUD_SOURCE_TYPES
        )
        if not is_documentcloud:
            return

        api = DocumentCloud(
            settings.DOCUMENTCLOUD_USER,
            settings.DOCUMENTCLOUD_PASSWORD,
        )

        try:
            remote_doc = api.documents.get(self.external_id)
        except DoesNotExistError:
            logger.error(
                f'Cannot find document with external id {self.external_id} on DocumentCloud'
            )
            return

        already_current = getattr(remote_doc, field, None) == value
        if not already_current:
            setattr(remote_doc, field, value)
            try:
                remote_doc.save()
            except HTTPError:
                logger.error(
                    f'Cannot save document with external id {self.external_id} on DocumentCloud'
                )
コード例 #6
0
def get_project(args):
    """Retrieve project metadata and print it as JSON.

    When ``args.id_or_title`` is None, every project is printed. Otherwise the
    value is tried as a title and as an id (id first when it consists only of
    digits) and the first project found is printed.

    Args:
        args: object providing ``username``, ``password`` and ``id_or_title``.

    Raises:
        SystemExit: with status 1 when no project matches ``id_or_title``.
    """
    client = DocumentCloud(args.username, args.password)

    if args.id_or_title is None:
        # Get all projects
        projects = client.projects.all()
        print(json.dumps({
            'projects': [serialize_project(p) for p in projects],
        }))
        # Nothing to look up. Falling through would pass None to re.match()
        # and raise TypeError.
        return

    match_order = ['title', 'id']

    if re.match(r'^\d+$', args.id_or_title):
        match_order = ['id', 'title']

    for match_term in match_order:
        try:
            project = client.projects.get(**{match_term: args.id_or_title})
            print(json.dumps({
                'projects': [serialize_project(project)],
            }))
            break

        except DoesNotExistError:
            # Try the next way of interpreting the value.
            pass

    else:
        sys.stderr.write("Project with id or title '{}' does not exist\n".format(
            args.id_or_title))
        sys.exit(1)
コード例 #7
0
ファイル: importers.py プロジェクト: invinst/CPDBv2_backend
    def upload_to_documentcloud(self):
        """Upload this importer's pending document attachments to DocumentCloud.

        Selects attachments of ``self.source_type`` that are documents, have no
        pending DocumentCloud id and have not exceeded
        ``UPLOAD_FAIL_MAX_ATTEMPTS``; each one is uploaded from its original
        URL and the id of the new cloud document is stored on it.
        """
        api = DocumentCloud(
            settings.DOCUMENTCLOUD_USER,
            settings.DOCUMENTCLOUD_PASSWORD,
        )

        pending = AttachmentFile.objects.filter(
            source_type=self.source_type,
            file_type=MEDIA_TYPE_DOCUMENT,
            pending_documentcloud_id__isnull=True,
            upload_fail_attempts__lte=UPLOAD_FAIL_MAX_ATTEMPTS,
        )

        pending_count = len(pending)
        self.log_info(f'Uploading {pending_count} documents to DocumentCloud')

        for attachment in tqdm(pending):
            # The mapped source type is used as the document description.
            description = AttachmentSourceType.SOURCE_TYPE_MAPPINGS[
                attachment.source_type]
            upload_options = dict(
                title=format_copa_documentcloud_title(
                    attachment.allegation.crid, attachment.title),
                description=description,
                access='public',
                force_ocr=True,
            )
            uploaded = api.documents.upload(
                attachment.original_url, **upload_options)

            attachment.pending_documentcloud_id = parse_id(uploaded.id)
            attachment.save()

        self.log_info(f'Done uploading!')
コード例 #8
0
def download_ocr_text_per_page(documentcloud_id, page_num, credentials):
    """Return the OCR text of one page of a DocumentCloud document.

    Private documents are switched to public while the text is fetched and
    switched back to private afterwards, as a workaround for
    https://github.com/documentcloud/documentcloud/issues/220.

    Args:
        documentcloud_id: id of the document to read.
        page_num: page number passed to ``get_page_text``.
        credentials: keyword arguments for the ``DocumentCloud`` constructor.

    Returns:
        The page text, or None when anything failed (the error is printed,
        not raised).
    """
    page_text = None

    try:
        dc_client = DocumentCloud(**credentials)
        obj = dc_client.documents.get(documentcloud_id)

        apply_access_workaround = (obj.access == DC_PRIVATE)

        try:
            if apply_access_workaround:
                obj.access = DC_PUBLIC
                obj.put()

                # Re-fetch until the access change is reported.
                while obj.access in [DC_PRIVATE, DC_PENDING]:
                    sleep(WORKAROUND_SLEEP)
                    obj = dc_client.documents.get(documentcloud_id)

            page_text = obj.get_page_text(page_num)
        finally:
            # Restore privacy even when polling or the download failed, so a
            # private document is never left public.
            if apply_access_workaround:
                obj.access = DC_PRIVATE
                obj.put()

    except Exception as e:
        # Single-argument print() calls behave the same on Python 2 and 3.
        print("download_ocr_text_per_page ERROR")
        print("%s %s" % (e, type(e)))

    return page_text
コード例 #9
0
 def __init__(self, logger, force_update=False, custom_search_syntaxes=None):
     """Store the importer options and create the DocumentCloud client.

     ``logger`` is forwarded to the parent initializer; ``force_update`` and
     ``custom_search_syntaxes`` are kept as attributes.
     """
     super(DocumentCloudAttachmentImporter, self).__init__(logger)

     # Bookkeeping lists, empty at construction time.
     self.kept_attachments, self.updated_attachments = [], []

     self.force_update = force_update
     self.custom_search_syntaxes = custom_search_syntaxes

     self.client = DocumentCloud(
         settings.DOCUMENTCLOUD_USER,
         settings.DOCUMENTCLOUD_PASSWORD,
     )
コード例 #10
0
def upload_file(project_name, file_to_upload):
    """Upload one file into the DocumentCloud project titled ``project_name``.

    The project is fetched, or created when missing, by its title.
    """
    connection_options = {'loglevel': logging.INFO, 'timeout': 30}
    client = DocumentCloud(USERNAME, PASSWORD, **connection_options)

    project, _was_created = client.projects.get_or_create_by_title(
        project_name)
    client.documents.upload(
        file_to_upload,
        handle_errors=True,
        project=project.id,
    )
コード例 #11
0
def connect_client():
    '''Return a DocumentCloud client built from USERNAME and PASSWORD.

    Prints a hint and exits the interpreter when either value is empty.'''
    if not (PASSWORD and USERNAME):
        print("You must add your credentials to local_settings.py")
        exit()

    print('Getting DocumentCloud credentials from local_settings.py')
    return DocumentCloud(USERNAME, PASSWORD)
コード例 #12
0
def upload_pdf_to_documentcloud(pdf, credentials):
    """Upload a PDF to DocumentCloud with ``secure`` and ``force_ocr`` enabled.

    Args:
        pdf: the PDF handed to ``documents.upload``.
        credentials: keyword arguments for the ``DocumentCloud`` constructor.

    Returns:
        The object returned by ``documents.upload``, or None when anything
        failed (the error is printed, not raised).
    """
    try:
        dc_client = DocumentCloud(**credentials)
        return dc_client.documents.upload(pdf, secure=True, force_ocr=True)
    except Exception as e:
        # Single-argument print() calls behave the same on Python 2 and 3,
        # unlike the print statement, which is a syntax error on Python 3.
        print("upload_pdf_to_documentcloud ERROR")
        print("%s %s" % (e, type(e)))

    return None
コード例 #13
0
def add_document():
    """Dialog controller: upload a file to DocumentCloud and store its row.

    On a valid submission the uploaded file is sent to DocumentCloud with the
    metadata entered in the form and a ``db.documentCloud`` row is inserted
    with the returned document id. ``response.js`` is then filled with jQuery
    that closes the ``<target>_dialog-form`` dialog and adds and selects a new
    ``<option>`` in the ``<target>`` widget, where ``target`` is
    ``request.args[0]``.

    Returns:
        The SQLFORM in every case (not submitted, invalid, failed upload or
        success).
    """
    import os
    from documentcloud import DocumentCloud

    # This is the controller function that will appear in our dialog.
    form = SQLFORM(db.documentCloud)

    if not form.validate():
        # Not submitted yet, or submitted with errors (the form then carries
        # the error messages): the dialog gets the form back either way.
        return form

    dc_cloud = DocumentCloud(username=dc_username, password=dc_password)
    uploaded = dc_cloud.documents.upload(
        os.path.join(request.folder, 'uploads', form.vars['file']),
        title=form.vars['title'],
        source=form.vars['source'],
        description=form.vars['description'],
        related_article=form.vars['related_article'],
        published_url=form.vars['published_url'],
        access=form.vars['access'],
        project=form.vars['project'],
        secure=form.vars['secure'])
    if uploaded is None:
        response.flash = T('Error en subir Documento a DocumentCloud')
        return form

    form.vars.dc_id = uploaded.id
    # Keep the id of the inserted row on the form: the script below reads
    # form.vars.id, which was previously never assigned in this function.
    form.vars.id = db.documentCloud.insert(
        **db.documentCloud._filter_fields(form.vars))

    # Let the user know adding via the widget worked.
    response.flash = T("Added")
    target = request.args[0]
    # Close the widget's dialog box.
    response.js = '$( "#%s_dialog-form" ).dialog( "close" ); ' % (
        target)
    # Add the new entry to the options of the main form's widget.
    # NOTE(review): form.vars.name assumes the table has a `name` field, and
    # the values are interpolated into the script unescaped - confirm both.
    response.js += """$("#%s").append("<option value='%s'>%s</option>");""" \
            % (target, form.vars.id, form.vars.name)
    # Select the entry that was just added.
    response.js += """var selected=$("#%s").val();""" % (target)
    response.js += """if (selected==null) { selected = [] }"""
    response.js += """selected.push("%s");""" % (form.vars.id)
    response.js += """$("#%s").val(selected);""" % (target)
    # Return the form in case the user wants to add another option.
    return form
コード例 #14
0
def update_all():
    """Mirror the documents of every DocumentCloud project into db.documentCloud.

    For each project, documents without an active local row are inserted
    (id, title, project); existing active rows are refreshed with the remote
    metadata. Projects whose details cannot be loaded are skipped.

    Returns:
        dict with ``projects`` (the project list) and ``docs`` (the results of
        ``validate_and_insert`` for the newly added documents).
    """
    from documentcloud import DocumentCloud
    client = DocumentCloud(username=dc_username, password=dc_password)
    client_docs = DocumentCloud(username=dc_username, password=dc_password)
    insert = []

    projects_list = client.projects.all()
    for project in projects_list:
        try:
            documents = client_docs.projects.get(project.id).document_list
        except Exception:
            # The project could not be loaded. Skip it; previously the
            # failure was swallowed and the lookup below raised KeyError.
            continue

        for doc in documents:
            doc_cloud = db((db.documentCloud.dc_id == doc.id) & (
                db.documentCloud.is_active == True)).select().first()
            if doc_cloud is None:
                docs = db.documentCloud.validate_and_insert(
                    dc_id=doc.id,
                    title=doc.title,
                    project=project.id,
                    is_active=True)
                insert.append(docs)
            else:
                doc_cloud.dc_id = doc.id
                doc_cloud.title = doc.title
                doc_cloud.source = doc.source
                doc_cloud.description = doc.description
                doc_cloud.related_article = doc.related_article
                doc_cloud.published_url = doc.published_url
                doc_cloud.access = doc.access
                doc_cloud.project = project.id
                doc_cloud.update_record()

    return dict(projects=projects_list, docs=insert)
コード例 #15
0
def search():
    """Controller: show a one-field search form and run it against DocumentCloud.

    Returns:
        dict with the ``form`` and ``obj_list``: the search results for the
        submitted query, or an empty dict when the form did not validate.
    """
    from documentcloud import DocumentCloud
    client = DocumentCloud()

    form = FORM(T('Búsqueda:'), INPUT(_name='q'), INPUT(_type='submit'))
    obj_list = client.documents.search(form.vars.q) if form.validate() else {}

    return dict(form=form, obj_list=obj_list)
コード例 #16
0
def get_document(args):
    """Print the metadata of document ``args.id`` as JSON.

    Writes an error to stderr and exits with status 1 when the document does
    not exist.
    """
    client = DocumentCloud(args.username, args.password)

    try:
        payload = serialize_document(client.documents.get(args.id))
        print(json.dumps(payload))
    except DoesNotExistError:
        message = "Document with id '{}' does not exist\n".format(args.id)
        sys.stderr.write(message)
        sys.exit(1)
コード例 #17
0
    def handle(self, *args, **options):
        """Run every configured search and process the cleaned results.

        Afterwards a ``DocumentCrawler`` row is created recording how many
        documents have a ``documentcloud_id`` greater than zero.
        """
        client = DocumentCloud()

        for document_type, syntax in self.search_syntaxes:
            found = client.documents.search(syntax)
            if not found:
                continue

            for cleaned in self.clean_documentcloud_results(found):
                self.process_documentcloud_result(cleaned, document_type)

        crawled_total = Document.objects.filter(
            documentcloud_id__gt=0).count()
        DocumentCrawler.objects.create(num_documents=crawled_total)
コード例 #18
0
ファイル: signals.py プロジェクト: WPMedia/muckrock
def foia_file_delete_dc(sender, **kwargs):
    """Remove the deleted model's file from DocumentCloud, if it has a doc id"""
    # pylint: disable=unused-argument

    foia_file = kwargs["instance"]
    if not foia_file.doc_id:
        return

    connection = {
        "username": settings.DOCUMENTCLOUD_BETA_USERNAME,
        "password": settings.DOCUMENTCLOUD_BETA_PASSWORD,
        "base_uri": f"{settings.DOCCLOUD_API_URL}/api/",
        "auth_uri": f"{settings.SQUARELET_URL}/api/",
    }
    dc_client = DocumentCloud(**connection)
    dc_client.documents.delete(foia_file.doc_id)
コード例 #19
0
def datum_per_page(crowdsource_pk, doc_id, metadata):
    """Add one crowdsource data item, with ``metadata``, per document page"""

    crowdsource = Crowdsource.objects.get(pk=crowdsource_pk)

    connection = {
        "username": settings.DOCUMENTCLOUD_BETA_USERNAME,
        "password": settings.DOCUMENTCLOUD_BETA_PASSWORD,
        "base_uri": f"{settings.DOCCLOUD_API_URL}/api/",
        "auth_uri": f"{settings.SQUARELET_URL}/api/",
    }
    document = DocumentCloud(**connection).documents.get(doc_id)

    # Pages are numbered from 1 up to and including document.pages.
    page_numbers = range(1, document.pages + 1)
    for page_number in page_numbers:
        page_url = f"{document.canonical_url}/pages/{page_number}"
        crowdsource.data.create(url=page_url, metadata=metadata)
コード例 #20
0
def search_all(logger=_logger, custom_search_syntaxes=None):
    """Run every search syntax against DocumentCloud and collect the documents.

    Uses ``custom_search_syntaxes`` when given, otherwise the
    ``(types, query)`` pairs of all ``DocumentCloudSearchQuery`` rows. Pairs
    with an empty query are skipped. Each result set gets attributes added,
    then invalid documents and duplicates removed, before it is appended.

    Returns:
        list of the collected documents.
    """
    client = DocumentCloud(
        settings.DOCUMENTCLOUD_USER,
        settings.DOCUMENTCLOUD_PASSWORD,
    )

    if custom_search_syntaxes:
        search_syntaxes = custom_search_syntaxes
    else:
        search_syntaxes = DocumentCloudSearchQuery.objects.all().values_list(
            'types', 'query')

    all_documents = []
    for document_types, syntax in search_syntaxes:
        if not syntax:
            continue

        logger.info(f'Searching Documentcloud for {syntax}')
        found = client.documents.search(syntax)
        tagged = _add_attributes(found, document_types)
        valid = _remove_invalid_documents(tagged)
        all_documents += _remove_duplicated(valid)

    return all_documents
コード例 #21
0
def batch_upload_files(project_name, files_to_batch):
    """Upload ``files_to_batch`` into the project titled ``project_name``.

    Prints a notice and does nothing when ``files_to_batch`` is empty.
    Otherwise the project is fetched, or created when missing, by its title
    and ``files_to_batch`` is handed to ``documents.upload_directory``.
    """
    if files_to_batch:
        # Connect to DocumentCloud.
        connection_options = {'loglevel': logging.INFO, 'timeout': 30}
        client = DocumentCloud(USERNAME, PASSWORD, **connection_options)

        # Fetch or create the project.
        project, _was_created = client.projects.get_or_create_by_title(
            project_name)

        # Upload all the PDFs.
        client.documents.upload_directory(
            files_to_batch,
            handle_errors=True,
            project=project.id,
        )
    else:
        print('No files available to upload')
コード例 #22
0
def documentcloud_handler(request):
    """Return the canonical URL of the first hit contributed by "Laurent Bastien".

    Args:
        request: the query passed to ``documents.search``.

    Returns:
        The canonical URL of the first matching search result, or None when
        no result has that contributor or when the search fails (the error
        is printed, not raised).
    """
    methodname = documentcloud_handler.__name__
    try:
        client = DocumentCloud('email', 'password')
        obj_list = client.documents.search(request, data=True)
        for document in obj_list:
            if document.contributor == "Laurent Bastien":
                return document.canonical_url

        # No match: return nothing. The index-based loop used to fall through
        # to the last result's URL, and hit a NameError on an empty result.
        return None

    except Exception as error:
        errormsg = "Error in {}. Error is {}".format(methodname, error)
        print(errormsg)
        return None
コード例 #23
0
def get_document_entities(args):
    """Print the entities of document ``args.id`` as JSON.

    Writes an error to stderr and exits with status 1 when the document does
    not exist.
    """
    client = DocumentCloud(args.username, args.password)

    try:
        doc = client.documents.get(args.id)
        entities = [serialize_entity(entity) for entity in doc.entities]
        print(json.dumps({'entities': entities}))
    except DoesNotExistError:
        message = "Document with id '{}' does not exist\n".format(args.id)
        sys.stderr.write(message)
        sys.exit(1)
コード例 #24
0
def upload(args):
    """Upload each entry of ``args.file_or_url``, optionally into a project.

    Args:
        args: object providing ``username``, ``password``, ``file_or_url``
            (iterable of paths or URLs) and ``project`` (a project title, or
            a falsy value to upload without a project).
    """
    client = DocumentCloud(args.username, args.password)

    upload_options = {}
    if args.project:
        # TODO: Handle error if this project doesn't exist
        project = client.projects.get(title=args.project)

        # HACK: The API docs seem to indicate that I should just be able to
        # pass the project ID here. I think things break with newer versions
        # of python.
        upload_options['project'] = str(project.id)

    # The project option is only sent when a project was requested. It used
    # to be read from `project` unconditionally, which failed on None.
    for file_or_url in args.file_or_url:
        client.documents.upload(file_or_url, **upload_options)
コード例 #25
0
def create():
    """Controller: upload a new document to DocumentCloud and store its row.

    On a valid submission the uploaded file is sent to DocumentCloud with the
    metadata entered in the form; when the upload returns a document, a
    ``db.documentCloud`` row is inserted with its id. The flash message
    reports the outcome.

    Returns:
        dict with the rendered ``form``.
    """
    import os
    import json
    from documentcloud import DocumentCloud

    form = SQLFORM(db.documentCloud)

    # Form fields forwarded to the upload call as keyword arguments.
    metadata_fields = ('title', 'source', 'description', 'related_article',
                       'published_url', 'access', 'project', 'secure')

    if form.validate():
        api = DocumentCloud(username=dc_username, password=dc_password)
        local_path = os.path.join(
            request.folder, 'uploads', form.vars['file'])
        metadata = dict(
            (field_name, form.vars[field_name])
            for field_name in metadata_fields)
        uploaded = api.documents.upload(local_path, **metadata)

        if uploaded is None:
            response.flash = T('Error en subir Documento a DocumentCloud')
        else:
            form.vars.dc_id = uploaded.id
            db.documentCloud.insert(
                **db.documentCloud._filter_fields(form.vars))
            response.flash = T('Formulario aceptado')
    elif form.errors:
        response.flash = T('Hay errores en el formulario')
    else:
        response.flash = T('Por favor llene el formulario')

    return dict(form=form)
コード例 #26
0
def import_doccloud_proj(crowdsource_pk, proj_id, metadata,
                         doccloud_each_page):
    """Add the documents of a DocumentCloud project to a crowdsource

    With ``doccloud_each_page`` set, a ``datum_per_page`` task is queued for
    each document; otherwise one data item is created per document.
    """
    crowdsource = Crowdsource.objects.get(pk=crowdsource_pk)

    connection = {
        "username": settings.DOCUMENTCLOUD_BETA_USERNAME,
        "password": settings.DOCUMENTCLOUD_BETA_PASSWORD,
        "base_uri": f"{settings.DOCCLOUD_API_URL}/api/",
        "auth_uri": f"{settings.SQUARELET_URL}/api/",
    }
    project = DocumentCloud(**connection).projects.get(proj_id)

    for document in project.documents:
        if not doccloud_each_page:
            crowdsource.data.create(
                url=document.canonical_url, metadata=metadata)
            continue
        datum_per_page.delay(crowdsource.pk, document.id, metadata)
コード例 #27
0
ファイル: documentcloud_service.py プロジェクト: pdflu/CPDB
    def process_link(self, link, document_type):
        """Resolve a DocumentCloud link to its id, titles and allegation CRID.

        Returns:
            dict with ``documentcloud_id``, ``normalized_title``,
            ``allegation_crid`` and ``title``; or False when the link does not
            parse, the id search finds nothing, or no CRID can be parsed from
            the first result's title.
        """
        matched_link = self.parse_link(link)
        if not matched_link:
            return False

        client = DocumentCloud()
        query = self.ID_SEARCH_SYNTAX.format(
            id=matched_link['documentcloud_id'])
        results = client.documents.search(query)
        if not results:
            return False

        # Only the first search hit is considered.
        title = results[0].title
        crid = self.parse_crid_from_title(title, document_type)
        if not crid:
            return False

        return {
            'documentcloud_id': matched_link['documentcloud_id'],
            'normalized_title': matched_link['normalized_title'],
            'allegation_crid': crid,
            'title': title
        }
コード例 #28
0
ファイル: projects.py プロジェクト: Code4SA/pikoli-upload
from __future__ import print_function
import os
import sys
from documentcloud import DocumentCloud

# Announce the credential prompts.
print("Enter your Document Cloud Credentials")
# NOTE(review): the next line looks mangled - credential masking appears to
# have swallowed the code between the two prompts and the base_uri string.
# It does not parse as written and must be restored from the original script
# (username, password and base_uri are used on the line after it).
sys.stdout.write("Username: "******"Password: "******"https://sourceafrica.net/api/"
client = DocumentCloud(username, password, base_uri)
# NOTE(review): leftover debugger breakpoint - the script stops here and
# waits for interactive input before listing the projects.
import pdb
pdb.set_trace()
# Print all projects returned by the client.
print(client.projects.all())
コード例 #29
0
"""
file: proj_get_doc_urls.py
what: script to get an url for each document in a given project
"""

# import the modules for this script
from documentcloud import DocumentCloud
from config_file import config_settings

# variable to hold the id of the project we're targeting
PROJECT_ID = 16900

# authenticate with DocumentCloud using the "user_name" and "password"
# entries of config_settings (imported from config_file)
client = DocumentCloud(
    config_settings["user_name"], config_settings["password"]
)

def proj_get_doc_urls(project_id):
    """
    begin function to return document ids
    """

    # creates an object that contains the documents in the project
    obj = client.projects.get(id = project_id)

    # list to hold all of the documents ids
    list_of_documents = obj.document_ids

    # begin looping through each document in our list
    for document in list_of_documents:
コード例 #30
0
ファイル: bigcases.py プロジェクト: jroethomas/Big-Cases
class caseShare:

	# When true, __init__ calls bigCasesMessage() before listing new filings.
	VERBOSE = True

	# Filing types that should not be tweeted (because they're routine and seldom interesting)
	DONOTTWEET = ['Notice of Appearance','Pro Hac Vice', 'Appear Pro Hac Vice',  'Appearance', 'LCvR 7.1 Certificate of Disclosure - Corporate Affiliations/Financial Interests']
	
	# Twitter client, created once at class-definition time from settings.
	tw = Twython(
		settings.twitter_app_key,
		settings.twitter_app_secret,
		settings.twitter_oauth_key,
		settings.twitter_oauth_secret
	)
	
	# Database handle and DocumentCloud client, also shared by all instances.
	db = dbconnect_db_add.db(host = settings.db_host, user=settings.db_user, pwd=settings.db_pass, port=settings.db_port, database=settings.db_database)
	dc = DocumentCloud(settings.dc_user, settings.dc_pass)

	# Index the big cases by a unique ID: court + case_number
	bigcases = dict((item['court']+item['case_number'], item) for item in bigcases_list.cases)

	
	def __init__(self):
		"""Call bigCasesMessage() when VERBOSE is set, then process new filings."""
		verbose = self.VERBOSE
		if verbose:
			self.bigCasesMessage()
		self.listNew()
	
	def listNew(self):
		"""Share, then mark as handled, up to 100 filings flagged bigcase = 1."""
		# Rows with the highest pid come first.
		query = """ SELECT * 
						FROM pacer_raw
						WHERE bigcase = 1
						ORDER BY pid DESC
						LIMIT 100 """
		for filing in self.db.getDict(query):
			self.share(filing)
			self.update(filing)

	def update(self, case):
		"""Set bigcase = 2 on the filing's pacer_raw row once it has been handled."""
		statement = """ UPDATE pacer_raw
				SET bigcase = 2
				WHERE pid = %s """
		params = (case['pid'], )
		self.db.run(statement, params)

	def twitter_upload(self, image_list):
		"""Upload images to Twitter and return the media ids that were created.

		Each URL in ``image_list`` is downloaded and uploaded as media. This is
		best effort: an image that cannot be downloaded or uploaded is skipped,
		and reported when VERBOSE is set.

		Returns:
			list of the ``media_id`` values of the successful uploads.
		"""
		media_ids = []
		for image in image_list:
			try:
				res = requests.get(image)
				res.raise_for_status()
				uploadable = BytesIO(res.content)
				response = self.tw.upload_media(media=uploadable)
				media_ids.append(response['media_id'])
			except Exception as e:
				# Catch Exception rather than everything, so KeyboardInterrupt
				# and SystemExit still propagate.
				if self.VERBOSE:
					print('## image upload failed for %s: %s' % (image, e))
		return media_ids
				
	def share(self, case):
		"""Post a new filing to Twitter.

		The filing type is the first ``[bracketed]`` text of the description.
		Nothing is posted when the description has no such text or when the
		type is listed in DONOTTWEET. The tweet links, in order of preference,
		to the DocumentCloud copy (with images of up to four pages attached),
		to the document on PACER, or to the PACER docket of the case.

		Args:
			case: row providing ``court``, ``case_number``, ``description``,
				``dcid``, ``document_location`` and ``link``.
		"""
		uid = case['court'] + case['case_number']
		# Raw strings keep the regex backslashes from being read as string escapes.
		DP1 = re.compile(r'\[(.*?)\].*?<a href="(.*?)"', re.IGNORECASE)
		DP2 = re.compile(r'\[(.*?)\]', re.IGNORECASE)
		d = case['description']
		media_ids = []

		bracketed = DP2.search(d)
		if bracketed is None:
			# No filing type in the description: nothing to post. Calling
			# .group() on the missing match used to raise AttributeError.
			return
		typ = bracketed.group(1)
		if typ in self.DONOTTWEET:
			# Checked before touching DocumentCloud/Twitter so that no images
			# are uploaded for a filing that will not be tweeted.
			return

		if case['dcid'] is not None:
			# Best case: We have the document on DC, and probably attached images
			link = case['document_location']
			nd = typ + '\n\n' + link
			doc = self.dc.documents.get(case['dcid'])
			images = doc.normal_image_url_list[:4]
			if images:
				media_ids = self.twitter_upload(images)
		else:
			linked = DP1.search(d)
			if linked:
				# If the document hasn't made it to DC, send the PACER link
				nd = linked.group(1) + '\n\nDoc. on PACER: ' + linked.group(2)
			else:
				# If there is no document, send a link to the PACER docket for the case
				nd = typ + '\n\nPACER Docket: ' + case['link']

		msg = 'New filing in %s: %s' % (self.bigcases[uid]['name'], nd)
		try:
			if media_ids:
				self.tw.update_status(status=msg, media_ids=media_ids)
			else:
				self.tw.update_status(status=msg)
		except Exception as e:
			# A failed tweet is reported but does not stop the run.
			print('##' + str(e))