def _find_spam(self):
    context = {'model': model, 'user': self.user}
    dataset_names = p.toolkit.get_action('package_list')({}, {})
    num_datasets = len(dataset_names)
    page_size = 50
    # Round up so a final partial page is not skipped.
    num_pages = (num_datasets + page_size - 1) // page_size
    spam_datasets = []
    try:
        for page in range(num_pages):
            search_params = {
                'rows': page_size,
                # 'start' is a row offset, not a page index.
                'start': page * page_size,
                'sort': 'metadata_modified desc',
            }
            datasets = p.toolkit.get_action('package_search')(
                {}, search_params)['results']
            for dataset in datasets:
                if len(dataset.get('resources', [])) == 0:
                    print
                    print 'Name:', dataset['name']
                    print 'Title:', dataset.get('title')
                    print 'Description:',
                    print h.markdown_extract(dataset.get('notes'), 200)
                    is_spam = ''
                    while is_spam not in ['y', 'n']:
                        is_spam = raw_input('Spam? [y/n] >> ')
                    if is_spam == 'y':
                        spam_datasets.append(dataset)
    except KeyboardInterrupt:
        print
    finally:
        for dataset in spam_datasets:
            self._spam_dataset(context, dataset)
def test_extract_markdown():
    with_html = u"""Data exposed: —
Size of dump and data set: size?
Notes: this is the classic RDF source but historically has had some
problems with RDF correctness.
"""
    with_unicode = u"""[From the project website] This project collects
information on China’s foreign aid from the China Commerce Yearbook
(中国商务年鉴) and the Almanac of China’s Foreign Economic Relations &
Trade (中国对外经济贸易年间), published annually by China’s Ministry of
Commerce (MOFCOM). Data is reported for each year between 1990 and 2005,
with the exception of 2002, in which year China’s Ministry of Commerce
published no project-level data on its foreign aid giving."""
    assert "Data exposed" in h.markdown_extract(with_html)
    assert "collects information" in h.markdown_extract(with_unicode)
def send_comment_notification_mail(recipient_name, recipient_email,
                                   dataset, comment):
    from ckanext.ytp_comments import email_template

    # Fill out the message template
    url = str(g.site_url) + toolkit.url_for(controller='package',
                                            action='read', id=dataset.id)
    # Default to empty strings so anonymous comments do not raise NameError.
    commenter_name = ''
    commenter_email = ''
    if comment.user_id:
        userobj = model.User.get(comment.user_id)
        commenter_email = userobj.email
        commenter_name = userobj.name

    subject_vars = {'dataset': dataset.title}
    subject = email_template.subject.format(**subject_vars)

    message_vars = {
        'user': commenter_name,
        'email': commenter_email,
        'dataset': dataset.title,
        'link': url,
        'comment_subject': helpers.markdown_extract(comment.subject).strip(),
        'comment': helpers.markdown_extract(comment.comment).strip()
    }
    message = email_template.message.format(**message_vars)

    # Locale fix
    current_locale = get_lang()
    locale = _get_safe_locale()
    if locale == 'en':
        _reset_lang()
    else:
        set_lang(locale)

    # Finally mail the user and reset locale
    try:
        log.debug("LOCALE: " + str(locale))
        log.debug(subject)
        log.debug(message)
        mail_recipient(recipient_name, recipient_email, subject, message)
    except MailerException, e:
        log.error(e)
    finally:
        # Restore the previously captured locale (the assumed intent of the
        # 'reset locale' comment above; current_locale was unused otherwise).
        if current_locale:
            set_lang(current_locale)
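# `email_template` (imported above) is not shown in this excerpt. A plausible
# sketch, hypothetical except for the placeholder names, which are exactly
# the keys the function formats with:
#
#     subject = u'New comment on dataset: {dataset}'
#     message = u'''{user} ({email}) left a comment on {dataset}:
#
#     {comment_subject}
#     {comment}
#
#     View the dataset at {link}
#     '''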
def markdown_extract_strip(text, extract_length=190):
    '''Return the plain text representation of markdown-encoded text,
    i.e. the text without any HTML tags. If extract_length is 0 then
    it will not be truncated.'''
    result_text = h.markdown_extract(text, extract_length)
    result = result_text.rstrip('\n').replace(
        '\n', ' ').replace('\r', '').replace('"', '&quot;')
    return result
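# Minimal usage sketch for markdown_extract_strip (hypothetical values):
# the result is single-line plain text with double quotes escaped as
# &quot;, so it can be embedded safely in an HTML attribute:
#
#     summary = markdown_extract_strip(pkg.notes)
#     html = u'<a title="%s" href="%s">...</a>' % (summary, url)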
def _mini_pkg_dict(self, pkg_id):
    '''For a package id, return the basic details for the package in a
    dictionary.

    Quite expensive - does two database lookups - so be careful with
    running it lots of times.
    '''
    pkg = model.Session.query(model.Package).get(pkg_id)
    pub = pkg.get_organization()
    return OrderedDict((
        ('id', pkg_id),
        ('name', pkg.name),
        ('title', pkg.title),
        ('notes', markdown_extract(pkg.notes)),
        ('dataset_link', '/dataset/%s' % pkg.name),
        ('publisher_title', pub.title if pub else None),
        ('publisher_link', '/publisher/%s' % pub.name if pub else None),
        # Metadata modified is a big query, so leave out unless required
        # ('metadata_modified', pkg.metadata_modified.isoformat()),
    ))
def _mini_pkg_dict(self, pkg_id): """For a package id, return the basic details for the package in a dictionary. Quite expensive - does two database lookups - so be careful with running it lots of times. """ pkg = model.Session.query(model.Package).get(pkg_id) pubs = pkg.get_groups() pub = pubs[0] if pubs else None return OrderedDict( ( ("id", pkg_id), ("name", pkg.name), ("title", pkg.title), ("notes", markdown_extract(pkg.notes)), ("dataset_link", "/dataset/%s" % pkg.name), ("publisher_title", pub.title if pub else None), ("publisher_link", "/publisher/%s" % pub.name if pub else None), # Metadata modified is a big query, so leave out unless required # ('metadata_modified', pkg.metadata_modified.isoformat()), ) )
def test_extract_markdown(self):
    assert "Data exposed" in h.markdown_extract(WITH_HTML)
    assert "collects information" in h.markdown_extract(WITH_UNICODE)
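# WITH_HTML and WITH_UNICODE are module-level fixtures assumed by the test
# above; a sketch mirroring the inline literals of the earlier
# test_extract_markdown (the unicode fixture is abridged here):
WITH_HTML = u'''Data exposed: —
Size of dump and data set: size?
Notes: this is the classic RDF source but historically has had some
problems with RDF correctness.
'''

WITH_UNICODE = u'''[From the project website] This project collects
information on China’s foreign aid from the China Commerce Yearbook
(中国商务年鉴) and the Almanac of China’s Foreign Economic Relations &
Trade (中国对外经济贸易年间), published annually by China’s Ministry of
Commerce (MOFCOM).'''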
def _markdown(translation, length):
    # Truncate to `length` when it is a number (but not the boolean True);
    # otherwise render the full markdown.
    if length is not True and isinstance(length, (int, long)):
        return helpers.markdown_extract(translation, extract_length=length)
    return helpers.render_markdown(translation)
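# Usage sketch (hypothetical values): a numeric `length` yields a truncated
# plain-text extract; anything else (True in particular) renders the full
# markdown to HTML:
#
#     _markdown(u'**Summary** of the dataset', 10)    # short plain extract
#     _markdown(u'**Summary** of the dataset', True)  # full rendered HTML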
def import_stage(self, harvest_object):
    '''
    The import stage will receive a HarvestObject object and will be
    responsible for:
        - performing any necessary action with the fetched object (e.g.
          create a CKAN package).
          Note: if this stage creates or updates a package, a reference
          to the package must be added to the HarvestObject.
          Additionally, the HarvestObject must be flagged as current.
        - creating the HarvestObject - Package relation (if necessary)
        - creating and storing any suitable HarvestObjectErrors that may
          occur.
        - returning True if everything went as expected, False otherwise.

    :param harvest_object: HarvestObject object
    :returns: True if everything went right, False if errors were found
    '''
    logger.debug("in import stage: %s" % harvest_object.guid)
    if not harvest_object:
        logger.error('No harvest object received')
        self._save_object_error('No harvest object received')
        return False

    try:
        self._set_config(harvest_object.job.source.config)

        package_dict = json.loads(harvest_object.content)
        data_dict = {}
        data_dict['id'] = package_dict['id']
        data_dict['title'] = package_dict['title']
        data_dict['name'] = munge_title_to_name(package_dict['name'])
        data_dict['notes'] = markdown_extract(
            package_dict.get('description'))

        tags = package_dict.get('keyword', [])
        data_dict['tag_string'] = ', '.join(
            [munge_tag(tag) for tag in tags])

        data_dict['private'] = False

        license_id = package_dict.get(
            'license', 'cc-by').strip('/').split('/')[-1]
        if license_id == 'de2a56f5-a565-481a-8589-406dc40b5588':
            license_id = 'sprep-public-license'
        data_dict['license_id'] = license_id or 'notspecified'

        data_dict['created'] = _parse_drupal_date(package_dict['issued'])
        data_dict['modified'] = _parse_drupal_date(
            package_dict['modified'])

        c_point = package_dict['contactPoint']['fn']
        c_email = package_dict['contactPoint']['hasEmail'].split(':')[-1]
        if c_email != '*****@*****.**':
            data_dict['contact_uri'] = c_point
            data_dict['contact_email'] = c_email

        data_dict['resources'] = []
        for res in package_dict.get('distribution', []):
            # res['issued'] = _parse_drupal_date(res.pop('created'))
            # res['modified'] = _parse_drupal_date(
            #     res.pop('last_modified').replace('Date changed ', '')
            # )
            res['url'] = res.get('downloadURL') or res.get('accessURL')
            res['name'] = res['title']
            res['description'] = markdown_extract(res.get('description'))
            data_dict['resources'].append(res)

        if 'spatial' in package_dict:
            data_dict['spatial'] = package_dict['spatial']
            try:
                data_dict['spatial'] = json.dumps({
                    "type": "Polygon",
                    "coordinates": [[
                        [float(c) for c in pair.split()]
                        for pair in RE_SPATIAL.match(
                            data_dict['spatial']).group(1).split(', ')
                    ]]
                })
            # A failed regex match raises AttributeError (match() returns
            # None) and bad numbers raise ValueError, so catch those too.
            except (KeyError, AttributeError, ValueError):
                pass

        # package_dict.pop('type')

        # add owner_org
        source_dataset = get_action('package_show')(
            {'ignore_auth': True}, {'id': harvest_object.source.id})
        owner_org = source_dataset.get('owner_org')
        data_dict['owner_org'] = owner_org

        data_dict['member_countries'] = country_mapping[None]
        if 'isPartOf' in package_dict:
            country = package_dict['isPartOf'].split('.')[0]
            data_dict['member_countries'] = country_mapping.get(
                country, country_mapping[None])
            org = model.Session.query(
                model.Group).filter_by(name=country + '-data').first()
            if org:
                data_dict['owner_org'] = org.id

        data_dict['source'] = package_dict.get('landingPage')
        data_dict['theme'] = package_dict.get('theme', [])
        data_dict['thematic_area_string'] = _map_theme_to_topic(
            data_dict['theme'])
        data_dict['harvest_source'] = 'SPREP'

        self._create_or_update_package(
            data_dict, harvest_object, 'package_show')
        Session.commit()

        logger.debug("Finished record")
    except Exception:
        logger.exception('Something went wrong!')
        self._save_object_error('Exception in import stage', harvest_object)
        return False
    return True
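# `_parse_drupal_date` is used above but not defined in this excerpt.
# A hypothetical sketch, assuming the Drupal endpoint emits ISO-8601
# timestamps (e.g. '2017-05-01T12:00:00+00:00'):
#
#     from dateutil import parser as date_parser
#
#     def _parse_drupal_date(value):
#         # Normalise to an ISO date string CKAN will accept.
#         return date_parser.parse(value).isoformat()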
def package_matrix(packages, core_fields):
    import json

    html = u''
    html += u'<table class="table table-bordered table-condensed packages">' + u"\n"
    table_rows = []
    table_heads = {}
    for pkg_dict in packages:
        dic = {}
        for key, value in pkg_dict.iteritems():
            if key == 'tags':
                tags = []
                for tag_dict in pkg_dict['tags']:
                    tags += [tag_dict['name']]
                dic['tags'] = tags
                table_heads['tags'] = ""
            elif key == 'groups':
                #groups = []
                #for group_dict in pkg_dict['groups']:
                #    groups += [group_dict['id']]
                #dic['groups'] = groups
                dic['groups'] = pkg_dict['groups']
                table_heads['groups'] = ""
            elif key == 'extras':
                for extra_dict in pkg_dict['extras']:
                    if not extra_dict['key'] in dic.keys():
                        dic[extra_dict['key']] = extra_dict['value']
                        table_heads[extra_dict['key']] = ""
            elif key in core_fields and key not in dic.keys():
                dic[key] = value
                table_heads[key] = ""
        table_rows.append(dic)

    if 'title' in table_heads:
        del table_heads['title']
    if 'id' in table_heads:
        del table_heads['id']
    table_heads_sorted = sorted(table_heads.iterkeys())

    html += u'<thead>' + u"\n"
    html += u'<tr>' + u"\n"
    html += u'<th class="edit narrowTh" style="width: 15px;">' \
            u'<input type="checkbox" name="checkall" value="checkall" class="checkall"/></th>' + u"\n"
    html += u'<th class="title wideTh" style="max-width: 250px;">Title</th>' + u"\n"
    for key in table_heads_sorted:
        html += u'<th class="' + unicode(key) + u' wideTh">' + unicode(
            _(key)) + u'</th>' + u"\n"
    html += u'<th class="single_edit narrowTh" style="width: 35px;">Edit</th>' + u"\n"
    html += u'</tr>' + u"\n"
    html += u'</thead>' + u"\n"
    html += u'<tbody>'
    for row in table_rows:
        html += u'<tr>'
        html += u'<td><input type="checkbox" name="package_select" class="package_select" value="' + unicode(
            row['id']) + u'" /></td>'
        html += u'<td class="title ' + row['id'] + '">'
        html += unicode(
            h.link_to(
                row['title'] or row['name'],
                h.url_for(controller='package', action='read',
                          id=row['name'])))
        html += u'</td>'
        for key in table_heads_sorted:
            if key in row:
                try:
                    row_key = json.loads(row[key])
                except (ValueError, TypeError):
                    row_key = row[key]
                # One if/elif chain, so the "notes" extract is not clobbered
                # by the fall-through else branch.
                if key == "notes":
                    val = h.markdown_extract(row_key)
                elif key == "groups":
                    group_ids = []
                    group_names = []
                    for group_dict in row[key]:
                        group_ids += [group_dict['id']]
                        group_names += [
                            h.group_name_to_title(group_dict['name'])
                        ]
                    row_key = ", ".join(group_ids)
                    val = ", ".join(group_names)
                elif isinstance(row_key, list):
                    val = ", ".join(row_key)
                else:
                    val = row_key
                full_val = row_key
                html += u'<td class="' + unicode(key) + u' ' + unicode(
                    row['id']
                ) + u'" title="' + unicode(
                    full_val
                ) + u'" style="max-height: 100px; display: block; overflow-y: auto;">'
                html += unicode(val)
                html += u'</td>'
            else:
                html += u'<td class="' + unicode(key) + u' ' + unicode(
                    row['id']
                ) + u'" style="max-height: 100px; display: block; overflow-y: scroll;"></td>'
        html += u'<td class="single_edit">' + unicode(
            h.subnav_link(h.icon('package_edit'),
                          controller='package',
                          action='edit',
                          id=row['name'])) + u'</td>'
        html += u'</tr>'
    html += u'</tbody>'
    html += u'</table>'
    return toolkit.literal(html)
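# Design note: package_matrix interpolates field values into HTML attributes
# unescaped. If it is ever fed untrusted metadata, escaping the interpolated
# values would be safer; a minimal sketch using the Python 2 standard
# library:
#
#     import cgi
#     html += u'" title="' + cgi.escape(unicode(full_val), quote=True)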
def graph_from_dataset(self, dataset_dict, dataset_ref):
    g = self.g

    for prefix, namespace in namespaces.items():
        g.bind(prefix, namespace)

    # Dataset
    g.add((dataset_ref, RDF.type, DCAT.Dataset))

    ## Simple values
    items = [
        ("title", DCTERMS.title, None, Literal),
        ("name", DCTERMS.identifier, None, Literal),
        ("author", DC.creator, None, Literal),
    ]
    self._add_triples_from_dict(dataset_dict, dataset_ref, items)

    ## Description
    dataset_desc = dataset_dict.get("notes")
    if dataset_desc:
        dataset_desc_value = markdown_extract(dataset_desc,
                                              extract_length=0)
        # Use the plain-text extract, not the raw markdown.
        g.add((dataset_ref, DCTERMS.description,
               Literal(dataset_desc_value)))

    ## Language
    langs = dataset_dict.get("language")
    if langs:
        for lang in langs:
            language_uri = LANG_PREFIX + lang
            g.add((dataset_ref, DCTERMS.language, URIRef(language_uri)))

    ## Tags
    for tag in dataset_dict.get("tags", []):
        g.add((dataset_ref, DCAT.keyword, Literal(tag["name"])))

    ## Wikidata keywords
    for keyword in dataset_dict.get("keywords", []):
        g.add((dataset_ref, DCAT.theme, WD[keyword]))

    ## Data Type
    data_types = dataset_dict.get("data_type")
    if data_types:
        for data_type in data_types:
            g.add((dataset_ref, DCTERMS.type,
                   URIRef(DATA_TYPE_PREFIX + data_type)))

    ## Temporal Resolution
    temp_res = dataset_dict.get("temp_res")
    temp_res_mapping = {"yearly": "P1Y", "daily": "P1D", "monthly": "P1M"}
    if temp_res:
        temp_res_value = temp_res_mapping[temp_res]
        g.add((dataset_ref, DCAT.temporalResolution,
               Literal(temp_res_value, datatype=XSD.duration)))

    ## Start Time, End Time, and Created Time
    items = [
        ("start_time", SCHEMA.startDate, None, Literal),
        ("end_time", SCHEMA.endDate, None, Literal),
        ("created_time", DCTERMS.issued, None, Literal),
    ]
    self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

    ## Spatial Coverage
    spatial = dataset_dict.get("spatial")
    x_min = dataset_dict.get("x_min")
    x_max = dataset_dict.get("x_max")
    y_min = dataset_dict.get("y_min")
    y_max = dataset_dict.get("y_max")
    if any([spatial, x_min, x_max, y_min, y_max]):
        spatial_ref = BNode()
        g.add((spatial_ref, RDF.type, DCTERMS.Location))
        g.add((dataset_ref, DCTERMS.spatial, spatial_ref))

        if spatial:
            g.add((spatial_ref, LOCN.geometry,
                   Literal(spatial, datatype=GEOJSON_IMT)))

        if x_min and x_max and y_min and y_max:
            box_value = "%s %s %s %s" % (y_min, x_min, y_max, x_max)
            box_ref = BNode()
            g.add((box_ref, RDF.type, SCHEMA.GeoShape))
            g.add((box_ref, SCHEMA.box, Literal(box_value)))
            g.add((spatial_ref, LOCN.geometry, box_ref))

    ## Spatial Resolution
    spatial_res = dataset_dict.get("spatial_res")
    if spatial_res:
        g.add((dataset_ref, DCAT.spatialResolutionInMeters,
               Literal(spatial_res, datatype=XSD.decimal)))

    ## Process Step
    proc_step = dataset_dict.get("process_step")
    if proc_step:
        proc_step_value = markdown_extract(proc_step, extract_length=0)
        proc_ref = BNode()
        g.add((proc_ref, RDF.type, DCTERMS.ProvenanceStatement))
        g.add((proc_ref, RDFS.label, Literal(proc_step_value)))
        g.add((dataset_ref, DCTERMS.provenance, proc_ref))

    ## Project details
    project = dataset_dict.get("organization")
    if project:
        project["description"] = markdown_extract(project["description"],
                                                  extract_length=0)
        project_details = BNode()
        g.add((project_details, RDF.type, ORG.Organization))
        g.add((dataset_ref, DCTERMS.publisher, project_details))
        items = [
            ("title", FOAF.name, None, Literal),
            ("description", ORG.purpose, None, Literal),
        ]
        self._add_triples_from_dict(project, project_details, items)

    ## Contact details
    contact_person = dataset_dict.get("contact_person")
    contact_email = dataset_dict.get("contact_email")
    if any([contact_person, contact_email]):
        contact_details = BNode()
        g.add((contact_details, RDF.type, VCARD.Individual))
        g.add((dataset_ref, DCAT.contactPoint, contact_details))
        self._add_triple_from_dict(dataset_dict, contact_details,
                                   VCARD.fn, "contact_person")
        self._add_triple_from_dict(dataset_dict, contact_details,
                                   VCARD.hasEmail, "contact_email",
                                   _type=URIRef,
                                   value_modifier=self._add_mailto)

    ## Theme
    themes = dataset_dict.get("groups")
    if themes:
        for theme in themes:
            theme_details = BNode()
            g.add((theme_details, RDF.type, SKOS.Concept))
            g.add((theme_details, SKOS.prefLabel, Literal(theme["title"])))
            g.add((dataset_ref, DCAT.theme, theme_details))

    # Resources
    ## Depositar defines license at the dataset level
    license = dataset_dict.get("license_url")

    for resource_dict in dataset_dict.get("resources", []):
        distribution = CleanedURIRef(resource_uri(resource_dict))

        g.add((dataset_ref, DCAT.distribution, distribution))
        g.add((distribution, RDF.type, DCAT.Distribution))

        ## Simple values
        items = [
            ("name", DCTERMS.title, None, Literal),
            ("description", DCTERMS.description, None, Literal),
            ("encoding", CNT.characterEncoding, None, Literal),
            ("url", DCAT.downloadURL, None, URIRef),
        ]
        self._add_triples_from_dict(resource_dict, distribution, items)

        ## License
        if license:
            g.add((distribution, DCTERMS.license, URIRef(license)))

        ## Coordinate Systems
        crs = resource_dict.get("resource_crs")
        if crs:
            crs_value = EPSG_PREFIX + str(crs)
            g.add((distribution, DCTERMS.conformsTo, URIRef(crs_value)))

        ## Format (mimetype)
        mimetype = resource_dict.get("mimetype")
        if mimetype:
            mimetype_value = IMT_PREFIX + mimetype
            g.add((distribution, DCAT.mediaType, URIRef(mimetype_value)))
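# Usage sketch: graph_from_dataset mutates self.g in place, so once the
# profile has run, the graph can be serialized with rdflib's standard API
# (`profile` here is a hypothetical instance of this profile class):
#
#     turtle = profile.g.serialize(format='turtle')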