Example #1
0
    def _find_spam(self):
        context = {'model': model, 'user': self.user}

        dataset_names = p.toolkit.get_action('package_list')({}, {})
        num_datasets = len(dataset_names)
        page_size = 50
        num_pages = num_datasets / page_size
        spam_datasets = []

        try:
            for page in range(num_pages):
                search_params = {'rows': page_size, 'start': page,
                                 'sort': 'metadata_modified desc'}
                datasets = p.toolkit.get_action('package_search')(
                    {}, search_params)['results']
                for dataset in datasets:
                    if len(dataset.get('resources', [])) == 0:
                        print
                        print 'Name:', dataset['name']
                        print 'Title:', dataset.get('title')
                        print 'Description:',
                        print h.markdown_extract(dataset.get('notes'), 200)
                        is_spam = ''
                        while not is_spam in ['y', 'n']:
                            is_spam = raw_input('Spam? [y/n]  >> ')
                        if is_spam == 'y':
                            spam_datasets.append(dataset)
        except KeyboardInterrupt:
            print
        finally:
            for dataset in spam_datasets:
                self._spam_dataset(context, dataset)
Example #2
0
    def _find_spam(self):
        context = {'model': model, 'user': self.user}

        dataset_names = p.toolkit.get_action('package_list')({}, {})
        num_datasets = len(dataset_names)
        page_size = 50
        num_pages = num_datasets / page_size
        spam_datasets = []

        try:
            for page in range(num_pages):
                search_params = {
                    'rows': page_size,
                    'start': page,
                    'sort': 'metadata_modified desc'
                }
                datasets = p.toolkit.get_action('package_search')(
                    {}, search_params)['results']
                for dataset in datasets:
                    if len(dataset.get('resources', [])) == 0:
                        print
                        print 'Name:', dataset['name']
                        print 'Title:', dataset.get('title')
                        print 'Description:',
                        print h.markdown_extract(dataset.get('notes'), 200)
                        is_spam = ''
                        while not is_spam in ['y', 'n']:
                            is_spam = raw_input('Spam? [y/n]  >> ')
                        if is_spam == 'y':
                            spam_datasets.append(dataset)
        except KeyboardInterrupt:
            print
        finally:
            for dataset in spam_datasets:
                self._spam_dataset(context, dataset)
Example #3
0
def test_extract_markdown():
    '''markdown_extract should cope with HTML-ish and non-ASCII input.'''
    html_text = u"""Data exposed: —
Size of dump and data set: size?
Notes: this is the classic RDF source but historically has had some problems with RDF correctness.
"""

    unicode_text = u"""[From the project website] This project collects information on China’s foreign aid from the China Commerce Yearbook (中国商务年鉴) and the Almanac of China’s Foreign Economic Relations & Trade (中国对外经济贸易年间), published annually by China’s Ministry of Commerce (MOFCOM). Data is reported for each year between 1990 and 2005, with the exception of 2002, in which year China’s Ministry of Commerce published no project-level data on its foreign aid giving."""

    extracted = h.markdown_extract(html_text)
    assert "Data exposed" in extracted
    extracted = h.markdown_extract(unicode_text)
    assert "collects information" in extracted
Example #4
0
def send_comment_notification_mail(recipient_name, recipient_email, dataset, comment):
    '''Send a "new comment" notification email about *dataset* to a recipient.

    Fills the ytp_comments email template with the commenter's details, the
    dataset title and link, and the (markdown-stripped) comment subject and
    body, then mails it via mail_recipient().

    :param recipient_name: display name used for the addressee
    :param recipient_email: address the notification is sent to
    :param dataset: package object providing ``id`` and ``title``
    :param comment: comment object providing ``user_id``, ``subject`` and
        ``comment``
    '''
    from ckanext.ytp_comments import email_template

    # Fill out the message template

    url = str(g.site_url) + toolkit.url_for(controller='package', action='read', id=dataset.id)

    # Default to blanks so the template still renders for anonymous comments
    # or when the user record no longer exists (previously this raised a
    # NameError at message_vars below).
    commenter_name = ''
    commenter_email = ''
    if comment.user_id:
        userobj = model.User.get(comment.user_id)
        if userobj is not None:
            commenter_email = userobj.email
            commenter_name = userobj.name

    subject_vars = {
        'dataset': dataset.title
    }
    subject = email_template.subject.format(**subject_vars)

    message_vars = {
        'user': commenter_name,
        'email': commenter_email,
        'dataset': dataset.title,
        'link': url,
        'comment_subject': helpers.markdown_extract(comment.subject).strip(),
        'comment': helpers.markdown_extract(comment.comment).strip()
    }
    message = email_template.message.format(**message_vars)

    log.debug(subject)
    log.debug(message)

    # Locale fix
    locale = _get_safe_locale()

    if locale == 'en':
        _reset_lang()
    else:
        set_lang(locale)
    # NOTE(review): the previous locale is never restored after sending --
    # TODO confirm whether callers rely on the changed locale.

    # Finally mail the user and reset locale

    try:
        log.debug("LOCALE: " + str(locale))
        log.debug(subject)
        log.debug(message)

        mail_recipient(recipient_name, recipient_email, subject, message)
    except MailerException as e:
        log.error(e)
Example #5
0
def send_comment_notification_mail(recipient_name, recipient_email, dataset, comment):
    '''Send a "new comment" notification email about *dataset* to a recipient.

    Fills the ytp_comments email template with the commenter's details, the
    dataset title and link, and the (markdown-stripped) comment subject and
    body, then mails it via mail_recipient().

    :param recipient_name: display name used for the addressee
    :param recipient_email: address the notification is sent to
    :param dataset: package object providing ``id`` and ``title``
    :param comment: comment object providing ``user_id``, ``subject`` and
        ``comment``
    '''
    from ckanext.ytp_comments import email_template

    # Fill out the message template

    url = str(g.site_url) + toolkit.url_for(controller='package', action='read', id=dataset.id)

    # Default to blanks so the template still renders for anonymous comments
    # or when the user record no longer exists (previously this raised a
    # NameError at message_vars below).
    commenter_name = ''
    commenter_email = ''
    if comment.user_id:
        userobj = model.User.get(comment.user_id)
        if userobj is not None:
            commenter_email = userobj.email
            commenter_name = userobj.name

    subject_vars = {
        'dataset': dataset.title
    }
    subject = email_template.subject.format(**subject_vars)

    message_vars = {
        'user': commenter_name,
        'email': commenter_email,
        'dataset': dataset.title,
        'link': url,
        'comment_subject': helpers.markdown_extract(comment.subject).strip(),
        'comment': helpers.markdown_extract(comment.comment).strip()
    }
    message = email_template.message.format(**message_vars)

    log.debug(subject)
    log.debug(message)

    # Locale fix
    locale = _get_safe_locale()

    if locale == 'en':
        _reset_lang()
    else:
        set_lang(locale)
    # NOTE(review): the previous locale is never restored after sending --
    # TODO confirm whether callers rely on the changed locale.

    # Finally mail the user and reset locale

    try:
        log.debug("LOCALE: " + str(locale))
        log.debug(subject)
        log.debug(message)

        mail_recipient(recipient_name, recipient_email, subject, message)
    except MailerException as e:
        log.error(e)
Example #6
0
def markdown_extract_strip(text, extract_length=190):
    '''Return the plain text representation of markdown encoded text, i.e.
    the text without any html tags.  If extract_length is 0 then it will
    not be truncated.
    '''
    result_text = h.markdown_extract(text, extract_length)
    # Flatten newlines to spaces and HTML-escape double quotes so the value
    # is safe to embed in an HTML attribute.  (The original source had an
    # unterminated triple-quote here -- the replacement literal '&quot;' was
    # mangled by HTML-unescaping.)
    result = result_text.rstrip('\n').replace(
        '\n', ' ').replace('\r', '').replace('"', '&quot;')
    return result
Example #7
0
def markdown_extract_strip(text, extract_length=190):
    '''Return the plain text representation of markdown encoded text, i.e.
    the text without any html tags.  If extract_length is 0 then it will
    not be truncated.
    '''
    result_text = h.markdown_extract(text, extract_length)
    # Flatten newlines to spaces and HTML-escape double quotes so the value
    # is safe to embed in an HTML attribute.  (The original source had an
    # unterminated triple-quote here -- the replacement literal '&quot;' was
    # mangled by HTML-unescaping.)
    result = result_text.rstrip('\n').replace(
        '\n', ' ').replace('\r', '').replace('"', '&quot;')
    return result
 def _mini_pkg_dict(self, pkg_id):
     '''Return the basic details of the given package as a dictionary.

     Quite expensive - it performs two database lookups - so be careful
     about calling it lots of times.
     '''
     package = model.Session.query(model.Package).get(pkg_id)
     publisher = package.get_organization()
     details = OrderedDict()
     details['id'] = pkg_id
     details['name'] = package.name
     details['title'] = package.title
     details['notes'] = markdown_extract(package.notes)
     details['dataset_link'] = '/dataset/%s' % package.name
     details['publisher_title'] = publisher.title if publisher else None
     details['publisher_link'] = ('/publisher/%s' % publisher.name
                                  if publisher else None)
     # Metadata modified is a big query, so leave out unless required
     # details['metadata_modified'] = package.metadata_modified.isoformat()
     return details
Example #9
0
 def _mini_pkg_dict(self, pkg_id):
     '''Return the basic details of the given package as a dictionary.

     Quite expensive - it performs two database lookups - so be careful
     about calling it lots of times.
     '''
     package = model.Session.query(model.Package).get(pkg_id)
     publisher = package.get_organization()
     details = OrderedDict()
     details['id'] = pkg_id
     details['name'] = package.name
     details['title'] = package.title
     details['notes'] = markdown_extract(package.notes)
     details['dataset_link'] = '/dataset/%s' % package.name
     details['publisher_title'] = publisher.title if publisher else None
     details['publisher_link'] = ('/publisher/%s' % publisher.name
                                  if publisher else None)
     # Metadata modified is a big query, so leave out unless required
     # details['metadata_modified'] = package.metadata_modified.isoformat()
     return details
Example #10
0
 def _mini_pkg_dict(self, pkg_id):
     """Return the basic details of the given package as a dictionary.

     Quite expensive - it performs two database lookups - so be careful
     about calling it lots of times.
     """
     package = model.Session.query(model.Package).get(pkg_id)
     groups = package.get_groups()
     # The first group (if any) is treated as the publisher.
     publisher = groups[0] if groups else None
     details = OrderedDict()
     details["id"] = pkg_id
     details["name"] = package.name
     details["title"] = package.title
     details["notes"] = markdown_extract(package.notes)
     details["dataset_link"] = "/dataset/%s" % package.name
     details["publisher_title"] = publisher.title if publisher else None
     details["publisher_link"] = (
         "/publisher/%s" % publisher.name if publisher else None
     )
     # Metadata modified is a big query, so leave out unless required
     # details["metadata_modified"] = package.metadata_modified.isoformat()
     return details
Example #11
0
 def test_extract_markdown(self):
     '''markdown_extract should handle HTML-laden and non-ASCII fixtures.'''
     html_result = h.markdown_extract(WITH_HTML)
     assert "Data exposed" in html_result
     unicode_result = h.markdown_extract(WITH_UNICODE)
     assert "collects information" in unicode_result
Example #12
0
 def test_extract_markdown(self):
     '''markdown_extract should handle HTML-laden and non-ASCII fixtures.'''
     html_result = h.markdown_extract(WITH_HTML)
     assert "Data exposed" in html_result
     unicode_result = h.markdown_extract(WITH_UNICODE)
     assert "collects information" in unicode_result
Example #13
0
def _markdown(translation, length):
    '''Render *translation* as markdown, truncated to *length* characters
    when *length* is an int/long; render it in full otherwise.  The literal
    True is explicitly excluded (bool is an int subclass), but note that
    False deliberately still counts as a length, matching the original.
    '''
    if isinstance(length, (int, long)) and length is not True:
        return helpers.markdown_extract(translation, extract_length=length)
    return helpers.render_markdown(translation)
Example #14
0
def _markdown(translation, length):
    '''Render *translation* as markdown, truncated to *length* characters
    when *length* is an int/long; render it in full otherwise.  The literal
    True is explicitly excluded (bool is an int subclass), but note that
    False deliberately still counts as a length, matching the original.
    '''
    if isinstance(length, (int, long)) and length is not True:
        return helpers.markdown_extract(translation, extract_length=length)
    return helpers.render_markdown(translation)
Example #15
0
    def import_stage(self, harvest_object):
        '''
        The import stage will receive a HarvestObject object and will be
        responsible for:
            - performing any necessary action with the fetched object (e.g
              create a CKAN package).
              Note: if this stage creates or updates a package, a reference
              to the package must be added to the HarvestObject.
              Additionally, the HarvestObject must be flagged as current.
            - creating the HarvestObject - Package relation (if necessary)
            - creating and storing any suitable HarvestObjectErrors that may
              occur.
            - returning True if everything went as expected, False otherwise.

        :param harvest_object: HarvestObject object
        :returns: True if everything went right, False if errors were found
        '''
        logger.debug("in import stage: %s" % harvest_object.guid)
        if not harvest_object:
            logger.error('No harvest object received')
            self._save_object_error('No harvest object received')
            return False
        try:
            self._set_config(harvest_object.job.source.config)

            # The fetched content is a JSON record (the field names --
            # contactPoint, distribution, landingPage -- look DCAT/data.json
            # style; confirm against the remote source).
            package_dict = json.loads(harvest_object.content)
            data_dict = {}
            data_dict['id'] = package_dict['id']
            data_dict['title'] = package_dict['title']
            data_dict['name'] = munge_title_to_name(package_dict['name'])

            # Strip markup from the remote description for the notes field.
            data_dict['notes'] = markdown_extract(
                package_dict.get('description'))

            tags = package_dict.get('keyword', [])
            data_dict['tag_string'] = ', '.join(
                [munge_tag(tag) for tag in tags])

            data_dict['private'] = False

            # The license value is treated as a URI: its last path segment
            # becomes the local license id.
            license_id = package_dict.get('license',
                                          'cc-by').strip('/').split('/')[-1]

            # Map this particular remote license UUID to the local id.
            if license_id == 'de2a56f5-a565-481a-8589-406dc40b5588':
                license_id = 'sprep-public-license'
            data_dict['license_id'] = license_id or 'notspecified'

            data_dict['created'] = _parse_drupal_date(package_dict['issued'])
            data_dict['modified'] = _parse_drupal_date(
                package_dict['modified'])

            # hasEmail is split on ':' to drop a 'mailto:' style prefix,
            # keeping only the address part.
            c_point, c_email = package_dict['contactPoint'][
                'fn'], package_dict['contactPoint']['hasEmail'].split(':')[-1]
            # Skip the redacted placeholder address.
            if c_email != '*****@*****.**':
                data_dict['contact_uri'] = c_point
                data_dict['contact_email'] = c_email
            data_dict['resources'] = []
            for res in package_dict.get('distribution', []):

                # res['issued'] = _parse_drupal_date(res.pop('created'))
                # res['modified'] = _parse_drupal_date(
                #     res.pop('last_modified').replace('Date changed ', '')
                # )
                # Prefer a direct download URL, fall back to the access URL.
                res['url'] = res.get('downloadURL') or res.get('accessURL')
                res['format'] = res['format']
                res['name'] = res['title']
                res['description'] = markdown_extract(res.get('description'))
                data_dict['resources'].append(res)
            if 'spatial' in package_dict:
                data_dict['spatial'] = package_dict['spatial']
                try:
                    # Re-encode the spatial string as a GeoJSON Polygon using
                    # the coordinate pairs captured by RE_SPATIAL.
                    # NOTE(review): if RE_SPATIAL does not match, .group()
                    # raises AttributeError, which is NOT caught by the
                    # KeyError handler below -- it falls through to the outer
                    # bare except instead.
                    data_dict['spatial'] = json.dumps({
                        "type":
                        "Polygon",
                        "coordinates":
                        [[[float(c) for c in pair.split()]
                          for pair in RE_SPATIAL.match(
                              data_dict['spatial']).group(1).split(', ')]]
                    })
                except KeyError:
                    pass
                # package_dict.pop('type')
            # add owner_org
            source_dataset = get_action('package_show')(
                {
                    'ignore_auth': True
                }, {
                    'id': harvest_object.source.id
                })

            owner_org = source_dataset.get('owner_org')
            data_dict['owner_org'] = owner_org
            # Default member-countries entry; refined below when the record
            # says which country site it came from.
            data_dict['member_countries'] = country_mapping[None]
            if 'isPartOf' in package_dict:
                country = package_dict['isPartOf'].split('.')[0]
                data_dict['member_countries'] = country_mapping.get(
                    country, country_mapping[None])
                # Prefer the country-specific organisation when one exists.
                org = model.Session.query(
                    model.Group).filter_by(name=country + '-data').first()
                if org:
                    data_dict['owner_org'] = org.id

            data_dict['source'] = package_dict.get('landingPage')

            # NOTE(review): duplicated assignment, kept as-is (harmless).
            data_dict['theme'] = package_dict.get('theme', [])
            data_dict['theme'] = package_dict.get('theme', [])

            data_dict['thematic_area_string'] = _map_theme_to_topic(
                data_dict['theme'])

            data_dict['harvest_source'] = 'SPREP'

            self._create_or_update_package(data_dict, harvest_object,
                                           'package_show')

            Session.commit()

            logger.debug("Finished record")
        except:
            # NOTE(review): bare except appears deliberate -- any failure on a
            # single record is logged and stored as a harvest error rather
            # than aborting the whole job.
            logger.exception('Something went wrong!')
            self._save_object_error('Exception in import stage',
                                    harvest_object)
            return False
        return True
Example #16
0
    def package_matrix(packages, core_fields):
        '''Render *packages* as an HTML table: one row per package, one
        column per core field / extra key seen in any package, plus select
        checkboxes and an edit link per row.  Returns the markup wrapped in
        toolkit.literal().
        '''
        # Needed for decoding JSON-encoded extra values; import once instead
        # of inside the render loop as before.
        import json

        html = u''

        html += u'<table class="table table-bordered table-condensed packages">' + u"\n"

        # First pass: flatten each package dict into a row and collect the
        # union of column names.
        table_rows = []
        table_heads = {}
        for pkg_dict in packages:
            dic = {}
            for key, value in pkg_dict.iteritems():
                if key == 'tags':
                    tags = []
                    for tag_dict in pkg_dict['tags']:
                        tags += [tag_dict['name']]
                    dic['tags'] = tags
                    table_heads['tags'] = ""
                elif key == 'groups':
                    # Keep the raw group dicts; they are rendered specially
                    # below (ids in the tooltip, titles as the cell value).
                    dic['groups'] = pkg_dict['groups']
                    table_heads['groups'] = ""
                elif key == 'extras':
                    for extra_dict in pkg_dict['extras']:
                        if not extra_dict['key'] in dic.keys():
                            dic[extra_dict['key']] = extra_dict['value']
                            table_heads[extra_dict['key']] = ""
                elif key in core_fields and key not in dic.keys():
                    dic[key] = value
                    table_heads[key] = ""
            table_rows.append(dic)
        # Title and id have dedicated columns, so drop them from the sorted
        # dynamic column set.
        if 'title' in table_heads:
            del table_heads['title']
        if 'id' in table_heads:
            del table_heads['id']
        table_heads_sorted = sorted(table_heads.iterkeys())

        html += u'<thead>' + u"\n"
        html += u'<tr>' + u"\n"
        html += u'<th class="edit narrowTh" style="width: 15px;"><input type="checkbox" name="checkall" value="checkall" class="checkall"/></th>' + u"\n"
        html += u'<th class="title wideTh" style="max-width: 250px;">Title</th>' + u"\n"
        for key in table_heads_sorted:
            html += u'<th class="' + unicode(key) + u' wideTh">' + unicode(
                _(key)) + u'</th>' + u"\n"
        html += u'<th class="single_edit narrowTh" style="width: 35px;">Edit</th>' + u"\n"
        html += u'</tr>' + u"\n"
        html += u'</thead>' + u"\n"
        html += u'<tbody>'

        for row in table_rows:

            html += u'<tr>'

            html += u'<td><input type="checkbox" name="package_select" class="package_select" value="' + unicode(
                row['id']) + u'" /></td>'
            html += u'<td class="title ' + row['id'] + '">'
            html += unicode(
                h.link_to(
                    row['title'] or row['name'],
                    h.url_for(controller='package',
                              action='read',
                              id=row['name'])))
            html += u'</td>'
            for key in table_heads_sorted:

                if key in row:

                    # Extras are often JSON-encoded strings; fall back to the
                    # raw value when decoding fails.
                    try:
                        row_key = json.loads(row[key])
                    except (ValueError, TypeError):
                        row_key = row[key]
                    # BUG FIX: "notes" and "groups" were separate "if"
                    # statements, so the markdown-extracted notes value was
                    # immediately overwritten by the generic else branch.
                    # A single if/elif chain keeps the intended value.
                    if key == "notes":
                        val = h.markdown_extract(row_key)
                    elif key == "groups":
                        group_ids = []
                        group_names = []
                        for group_dict in row[key]:
                            group_ids += [group_dict['id']]
                            group_names += [
                                h.group_name_to_title(group_dict['name'])
                            ]
                        row_key = ", ".join(group_ids)
                        val = ", ".join(group_names)
                    elif isinstance(row_key, list):
                        val = ", ".join(row_key)
                    else:
                        val = row_key

                    # The raw (unformatted) value goes into the cell tooltip.
                    full_val = row_key

                    html += u'<td class="' + unicode(key) + u' ' + unicode(
                        row['id']
                    ) + u'" title="' + unicode(
                        full_val
                    ) + u'" style="max-height: 100px; display: block; overflow-y: auto;">'
                    html += unicode(val)
                    html += u'</td>'
                else:
                    html += u'<td class="' + unicode(key) + u' ' + unicode(
                        row['id']
                    ) + u'" style="max-height: 100px; display: block; overflow-y: scroll;"></td>'
            html += u'<td class="single_edit">' + unicode(
                h.subnav_link(h.icon('package_edit'),
                              controller='package',
                              action='edit',
                              id=row['name'])) + u'</td>'
            html += u'</tr>'
        html += u'</tbody>'
        html += u'</table>'

        return toolkit.literal(html)
    def graph_from_dataset(self, dataset_dict, dataset_ref):

        g = self.g

        for prefix, namespace in namespaces.items():
            g.bind(prefix, namespace)

        # Dataset

        g.add((dataset_ref, RDF.type, DCAT.Dataset))

        ## Simple values
        items = [
            ("title", DCTERMS.title, None, Literal),
            ("name", DCTERMS.identifier, None, Literal),
            ("author", DC.creator, None, Literal),
        ]
        self._add_triples_from_dict(dataset_dict, dataset_ref, items)

        ## Description
        dataset_desc = dataset_dict.get("notes")
        if dataset_desc:
            dataset_desc_value = markdown_extract(dataset_desc,
                                                  extract_length=0)
        g.add((dataset_ref, DCTERMS.description, Literal(dataset_desc)))

        ## Language
        langs = dataset_dict.get("language")
        if langs:
            for lang in langs:
                language_uri = LANG_PREFIX + lang
                g.add((dataset_ref, DCTERMS.language, URIRef(language_uri)))

        ## Tags
        for tag in dataset_dict.get("tags", []):
            g.add((dataset_ref, DCAT.keyword, Literal(tag["name"])))

        ## Wikidata keywords
        for keyword in dataset_dict.get("keywords", []):
            g.add((dataset_ref, DCAT.theme, WD[keyword]))

        ## Data Type
        data_types = dataset_dict.get("data_type")
        if data_types:
            for data_type in data_types:
                g.add((dataset_ref, DCTERMS.type,
                       URIRef(DATA_TYPE_PREFIX + data_type)))

        ## Temporal Resolution
        temp_res = dataset_dict.get("temp_res")
        temp_res_mapping = {"yearly": "P1Y", "daily": "P1D", "monthly": "P1M"}
        if temp_res:
            temp_res_value = temp_res_mapping[temp_res]
            g.add((dataset_ref, DCAT.temporalResolution,
                   Literal(temp_res_value, datatype=XSD.duration)))

        ## Start Time, End Time, and Created Time
        items = [("start_time", SCHEMA.startDate, None, Literal),
                 ("end_time", SCHEMA.endDate, None, Literal),
                 ("created_time", DCTERMS.issued, None, Literal)]
        self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

        ## Spatial Coverage
        spatial = dataset_dict.get("spatial")
        x_min = dataset_dict.get("x_min")
        x_max = dataset_dict.get("x_max")
        y_min = dataset_dict.get("y_min")
        y_max = dataset_dict.get("y_max")

        if any([spatial, x_min, x_max, y_min, y_max]):
            spatial_ref = BNode()
            g.add((spatial_ref, RDF.type, DCTERMS.Location))
            g.add((dataset_ref, DCTERMS.spatial, spatial_ref))

            if spatial:
                g.add((spatial_ref, LOCN.geometry,
                       Literal(spatial, datatype=GEOJSON_IMT)))

            if x_min and x_max and y_min and y_max:
                box_value = "%s %s %s %s" % (y_min, x_min, y_max, x_max)
                box_ref = BNode()
                g.add((box_ref, RDF.type, SCHEMA.GeoShape))
                g.add((box_ref, SCHEMA.box, Literal(box_value)))
                g.add((spatial_ref, LOCN.geometry, box_ref))

        ## Spatial Resolution
        spatial_res = dataset_dict.get("spatial_res")

        if spatial_res:
            g.add((dataset_ref, DCAT.spatialResolutionInMeters,
                   Literal(spatial_res, datatype=XSD.decimal)))

        ## Process Step
        proc_step = dataset_dict.get("process_step")

        if proc_step:
            proc_step_value = markdown_extract(proc_step, extract_length=0)
            proc_ref = BNode()
            g.add((proc_ref, RDF.type, DCTERMS.ProvenanceStatement))
            g.add((proc_ref, RDFS.label, Literal(proc_step_value)))
            g.add((dataset_ref, DCTERMS.provenance, proc_ref))

        ## Project details
        project = dataset_dict.get("organization")

        if project:
            project["description"] = markdown_extract(project["description"],
                                                      extract_length=0)
            project_details = BNode()
            g.add((project_details, RDF.type, ORG.Organization))
            g.add((dataset_ref, DCTERMS.publisher, project_details))
            items = [("title", FOAF.name, None, Literal),
                     ("description", ORG.purpose, None, Literal)]

            self._add_triples_from_dict(project, project_details, items)

        ## Contact details
        contact_person = dataset_dict.get("contact_person")
        contact_email = dataset_dict.get("contact_email")

        if any([contact_person, contact_email]):
            contact_details = BNode()
            g.add((contact_details, RDF.type, VCARD.Individual))
            g.add((dataset_ref, DCAT.contactPoint, contact_details))

            self._add_triple_from_dict(dataset_dict, contact_details, VCARD.fn,
                                       "contact_person")

            self._add_triple_from_dict(dataset_dict,
                                       contact_details,
                                       VCARD.hasEmail,
                                       "contact_email",
                                       _type=URIRef,
                                       value_modifier=self._add_mailto)

        ## Theme
        themes = dataset_dict.get("groups")

        if themes:
            for theme in themes:
                theme_details = BNode()
                g.add((theme_details, RDF.type, SKOS.Concept))
                g.add((theme_details, SKOS.prefLabel, Literal(theme["title"])))
                g.add((dataset_ref, DCAT.theme, theme_details))

        # Resources

        ## Depositar defines license in the dataset level
        license = dataset_dict.get("license_url")

        for resource_dict in dataset_dict.get("resources", []):
            distribution = CleanedURIRef(resource_uri(resource_dict))

            g.add((dataset_ref, DCAT.distribution, distribution))

            g.add((distribution, RDF.type, DCAT.Distribution))

            ## Simple values
            items = [
                ("name", DCTERMS.title, None, Literal),
                ("description", DCTERMS.description, None, Literal),
                ("encoding", CNT.characterEncoding, None, Literal),
                ("url", DCAT.downloadURL, None, URIRef),
            ]
            self._add_triples_from_dict(resource_dict, distribution, items)

            ## License
            if license:
                g.add((distribution, DCTERMS.license, URIRef(license)))

            ## Coordinate Systems
            crs = resource_dict.get("resource_crs")

            if crs:
                crs_value = EPSG_PREFIX + str(crs)
                g.add((distribution, DCTERMS.conformsTo, URIRef(crs_value)))

            ## Format (mimetype)
            mimetype = resource_dict.get("mimetype")

            if mimetype:
                mimetype_value = IMT_PREFIX + mimetype
                g.add((distribution, DCAT.mediaType, URIRef(mimetype_value)))