예제 #1
0
 def test_munge(self):
     """Check that tag_munge maps each sample string to its expected tag."""
     # (input string, expected munged tag)
     cases = (
         ("pollution", "pollution"),
         ("fish pollution", "fish-pollution"),
         ("dosh$money", "doshmoney"),
         ("under_score", "under-score"),
     )
     for raw, expected in cases:
         assert_equal(tag_munge(raw), expected)
예제 #2
0
 def test_munge(self):
     """Run tag_munge over sample inputs and compare with the expected tags."""
     # Keyed by position: first element is the input, second the wanted tag.
     samples = (
         ('pollution', 'pollution'),
         ('fish pollution', 'fish-pollution'),
         ('dosh$money', 'doshmoney'),
         ('under_score', 'under-score'),
     )
     for source_text, wanted_tag in samples:
         munged = tag_munge(source_text)
         assert_equal(munged, wanted_tag)
예제 #3
0
 def test_munge(self):
     """Verify the tag produced by tag_munge for each sample string."""
     inputs_and_tags = (
         ('pollution', 'pollution'),
         ('fish pollution', 'fish-pollution'),
         ('dosh$money', 'doshmoney'),
         ('under_score', 'under-score'),
     )
     for given, want in inputs_and_tags:
         got = tag_munge(given)
         assert_equal(got, want)
예제 #4
0
파일: theme.py 프로젝트: zfbpb/data.gov.hr
    def __init__(self):
        """Load themes.json and build the keyword-to-theme lookup tables.

        The file is looked up at ``../../themes.json`` relative to this
        module's ``__file__``. Every theme is stored in ``self.data`` under
        its 'stored_as' value, falling back to 'title' when 'stored_as' is
        missing or empty.
        """
        json_path = os.path.abspath(
            os.path.join(__file__, '../../themes.json'))
        assert os.path.exists(json_path), json_path
        log.debug('Reading themes.json')
        with codecs.open(json_path, encoding='utf8') as themes_file:
            raw_json = themes_file.read()
        all_themes = json.loads(raw_json)

        self.data = {}
        self.topic_words = {}  # topic:theme_name
        self.topic_bigrams = {}  # (topicword1, topicword2):theme_name
        self.topic_trigrams = {}  # (topicword1, topicword2, topicword3):theme_name
        self.gemet = {}  # gemet_keyword:theme_name
        self.ons = {}  # ons_keyword:theme_name
        self.lga_functions = {}  # LGA functions extra
        self.lga_services = {}  # LGA services extra

        # Keys that, when present in a theme, must hold a list.
        list_valued_keys = ('topics', 'gemet', 'nscl', 'ons',
                            'lga_functions', 'lga_services')

        for theme in all_themes:
            name = theme.get('stored_as') or theme['title']

            for key in list_valued_keys:
                if key not in theme:
                    continue
                assert isinstance(theme[key], list), (name, key)

            # File each topic by the number of normalized words it contains.
            for topic in theme['topics']:
                tokens = tuple(
                    normalize_token(word) for word in split_words(topic))
                token_count = len(tokens)
                if token_count == 1:
                    self.topic_words[tokens[0]] = name
                elif token_count == 2:
                    self.topic_bigrams[tokens] = name
                elif token_count == 3:
                    self.topic_trigrams[tokens] = name
                else:
                    assert 0, 'Too many words in topic: %s' % topic

            for keyword in theme.get('gemet', []):
                self.gemet[normalize_keyword(keyword)] = name
            # 'nscl' and 'ons' keywords share the one ons lookup table.
            ons_keywords = theme.get('nscl', []) + theme.get('ons', [])
            for keyword in ons_keywords:
                self.ons[tag_munge(keyword)] = name
            for lga_function_id in theme.get('lga_functions', []):
                self.lga_functions[lga_function_id] = name
            for lga_service_id in theme.get('lga_services', []):
                self.lga_services[lga_service_id] = name
            self.data[name] = theme

        # Key views support set-like operations.
        self.topic_words_set = self.topic_words.viewkeys()
        self.topic_bigrams_set = self.topic_bigrams.viewkeys()
        self.topic_trigrams_set = self.topic_trigrams.viewkeys()
예제 #5
0
    def __init__(self):
        """Load themes.json and build the keyword-to-theme lookup tables.

        The file is looked up at ``../../themes.json`` relative to this
        module's ``__file__``. Every theme is stored in ``self.data`` under
        its 'stored_as' value, falling back to 'title' when 'stored_as' is
        missing or empty.
        """
        themes_filepath = os.path.abspath(os.path.join(__file__, '../../themes.json'))
        assert os.path.exists(themes_filepath), themes_filepath
        log.debug('Reading themes.json')
        with codecs.open(themes_filepath, encoding='utf8') as f:
            themes_json = f.read()
        themes_list = json.loads(themes_json)
        self.data = {}
        self.topic_words = {}  # topic:theme_name
        self.topic_bigrams = {}  # (topicword1, topicword2):theme_name
        self.topic_trigrams = {}  # (topicword1, topicword2, topicword3):theme_name
        self.gemet = {}  # gemet_keyword:theme_name
        self.ons = {}  # ons_keyword:theme_name
        self.la_function = {}  # LA functions extra
        self.la_service = {}  # LA services extra
        self.odc = {}  # OpenDataCommunities.org theme extra
        for theme_dict in themes_list:
            name = theme_dict.get('stored_as') or theme_dict['title']

            # Every key that is iterated below must hold a list when present.
            # 'la_functions' is included because that spelling was the one
            # actually read by the loop further down.
            for key in ('topics', 'gemet', 'nscl', 'ons', 'la_function',
                        'la_functions', 'la_service', 'odc'):
                if key in theme_dict:
                    assert isinstance(theme_dict[key], list), (name, key)

            # File each topic by the number of normalized words it contains.
            for topic in theme_dict['topics']:
                words = [normalize_token(word) for word in split_words(topic)]
                if len(words) == 1:
                    self.topic_words[words[0]] = name
                elif len(words) == 2:
                    self.topic_bigrams[tuple(words)] = name
                elif len(words) == 3:
                    self.topic_trigrams[tuple(words)] = name
                else:
                    assert 0, 'Too many words in topic: %s' % topic

            for gemet_keyword in theme_dict.get('gemet', []):
                self.gemet[normalize_keyword(gemet_keyword)] = name
            # 'nscl' and 'ons' keywords share the one ons lookup table.
            for ons_keyword in theme_dict.get('nscl', []) + theme_dict.get('ons', []):
                self.ons[tag_munge(ons_keyword)] = name
            # The validated key was 'la_function' while only 'la_functions'
            # was read, so one spelling was silently ignored. Accept both so
            # the data is loaded whichever spelling the theme uses.
            for function_id in (theme_dict.get('la_function', []) +
                                theme_dict.get('la_functions', [])):
                self.la_function[function_id] = name
            for service_id in theme_dict.get('la_service', []):
                self.la_service[service_id] = name
            for keyword in theme_dict.get('odc', []):
                self.odc[keyword] = name
            self.data[name] = theme_dict
        # Key views support set-like operations.
        self.topic_words_set = self.topic_words.viewkeys()
        self.topic_bigrams_set = self.topic_bigrams.viewkeys()
        self.topic_trigrams_set = self.topic_trigrams.viewkeys()
예제 #6
0
파일: theme.py 프로젝트: palcu/ckanext-dgu
    def __init__(self):
        """Build the keyword-to-theme lookup tables from the taxonomy terms.

        The terms come from the 'taxonomy_term_list' action, called for the
        taxonomy named 'dgu-themes'. Each term's 'extras' dict is used as the
        theme dict, with 'title' and 'description' filled in from the term's
        'label' and 'description'. Themes are stored in ``self.data`` keyed
        by the term label.
        """
        self.data = {}
        self.topic_words = {}  # topic:theme_name
        self.topic_bigrams = {}  # (topicword1, topicword2):theme_name
        self.topic_trigrams = {}  # (topicword1, topicword2, topicword3):theme_name
        self.gemet = {}  # gemet_keyword:theme_name
        self.ons = {}  # ons_keyword:theme_name
        self.la_function = {}  # LA functions extra
        self.la_service = {}  # LA services extra
        self.odc = {}  # OpenDataCommunities.org theme extra

        context = {'model': model}
        terms = get_action('taxonomy_term_list')(context, {'name': 'dgu-themes'})
        for term in terms:
            theme_dict = term['extras']
            theme_dict['title'] = name = term['label']
            theme_dict['description'] = term['description']

            # Every key that is iterated below must hold a list when present.
            # 'la_functions' is included because that spelling was the one
            # actually read by the loop further down.
            for key in ('topics', 'gemet', 'nscl', 'ons', 'la_function',
                        'la_functions', 'la_service', 'odc'):
                if key in theme_dict:
                    assert isinstance(theme_dict[key], list), (name, key)

            # File each topic by the number of normalized words it contains.
            for topic in theme_dict['topics']:
                words = [normalize_token(word) for word in split_words(topic)]
                if len(words) == 1:
                    self.topic_words[words[0]] = name
                elif len(words) == 2:
                    self.topic_bigrams[tuple(words)] = name
                elif len(words) == 3:
                    self.topic_trigrams[tuple(words)] = name
                else:
                    assert 0, 'Too many words in topic: %s' % topic

            for gemet_keyword in theme_dict.get('gemet', []):
                self.gemet[normalize_keyword(gemet_keyword)] = name
            # 'nscl' and 'ons' keywords share the one ons lookup table.
            for ons_keyword in theme_dict.get('nscl', []) + theme_dict.get('ons', []):
                self.ons[tag_munge(ons_keyword)] = name
            # The validated key was 'la_function' while only 'la_functions'
            # was read, so one spelling was silently ignored. Accept both so
            # the data is loaded whichever spelling the theme uses.
            for function_id in (theme_dict.get('la_function', []) +
                                theme_dict.get('la_functions', [])):
                self.la_function[function_id] = name
            for service_id in theme_dict.get('la_service', []):
                self.la_service[service_id] = name
            for keyword in theme_dict.get('odc', []):
                self.odc[keyword] = name
            self.data[name] = theme_dict
        # Key views support set-like operations.
        self.topic_words_set = self.topic_words.viewkeys()
        self.topic_bigrams_set = self.topic_bigrams.viewkeys()
        self.topic_trigrams_set = self.topic_trigrams.viewkeys()
예제 #7
0
파일: theme.py 프로젝트: palcu/ckanext-dgu
def score_by_ons_theme(pkg, scores):
    """Score themes for an ONS hub package from its tags.

    Packages whose 'external_reference' extra is not 'ONSHUB' are left
    untouched. Otherwise every tag is munged with tag_munge and looked up in
    the ONS keyword table of Themes.instance(); each match appends a
    (score, reason) tuple with a score of 10 to ``scores[theme]``.

    :param pkg: package dict; its 'extras' and 'tags' entries are read.
    :param scores: mapping of theme name to a list of (score, reason)
        tuples, appended to in place.
    """
    # There are 11 'Old ONS themes' e.g.: 'Agriculture and Environment', 'Business and Energy'
    # http://www.statistics.gov.uk/hub/browse-by-theme/index.html
    #
    # and there are set to be 4 'New ONS themes' e.g. 'Business, Trade and Industry'
    # which break down further, that we need to look at too.
    # http://digitalpublishing.ons.gov.uk/2013/12/05/no-longer-taxing-we-hope/
    external_reference = pkg['extras'].get('external_reference')
    if external_reference != 'ONSHUB':
        return

    themes = Themes.instance()
    for raw_tag in pkg['tags']:
        keyword = tag_munge(raw_tag)
        if keyword not in themes.ons:
            continue
        matched_theme = themes.ons[keyword]
        reason = '%s matched ONS keyword' % keyword
        score = 10
        scores[matched_theme].append((score, reason))
        log.debug(' %s %s %s' % (matched_theme, score, reason))
예제 #8
0
파일: theme.py 프로젝트: ArunEG/ckanext-dgu
                elif len(words) == 2:
                    topic_dict = self.topic_bigrams
                    key = tuple(words)
                elif len(words) == 3:
                    topic_dict = self.topic_trigrams
                    key = tuple(words)
                else:
                    assert 0, 'Too many words in topic: %s' % topic
                if key not in topic_dict:
                    topic_dict[key] = []
                topic_dict[key].append(name)

            for gemet_keyword in theme_dict.get('gemet', []):
                self.gemet[normalize_keyword(gemet_keyword)] = name
            for ons_keyword in theme_dict.get('nscl', []) + theme_dict.get('ons', []):
                self.ons[tag_munge(ons_keyword)] = name
            for function_id in theme_dict.get('la_functions', []):
                self.la_function[function_id] = name
            for service_id in theme_dict.get('la_service', []):
                self.la_service[service_id] = name
            for keyword in theme_dict.get('odc', []):
                self.odc[keyword] = name
            self.data[name] = theme_dict
        self.topic_words_set = self.topic_words.viewkeys() # can do set-like operations on it
        self.topic_bigrams_set = self.topic_bigrams.viewkeys()
        self.topic_trigrams_set = self.topic_trigrams.viewkeys()


def normalize_text(text):
    words = [normalize_token(w) for w in split_words(text)]
    words_without_stopwords = [word for word in words
예제 #9
0
    def record_2_package(self, item):
        """Convert one ONS hub record into a package dict.

        :param item: dict holding the record's fields. The keys read here are
            'title', 'description', 'guid', 'pubDate', 'link' (optional) and
            the 'hub:' prefixed 'source-agency', 'designation', 'language',
            'altTitle', 'theme', 'coverage', 'geographic-breakdown', 'ipsv',
            'keywords' and 'nscl'.
        :returns: dict with the keys 'name', 'title', 'version', 'url',
            'maintainer', 'maintainer_email', 'notes', 'license_id', 'tags',
            'groups', 'resources' and 'extras'.
        :raises RowParseError: if the GUID does not start with
            ``guid_prefix``, or still contains 'http' once the prefix has
            been removed.
        """
        assert isinstance(item, dict)

        # process item
        title, release = self._split_title(item['title'])
        munged_title = schema.name_munge(title)
        publisher_name = self._source_to_publisher(item['hub:source-agency'])
        if publisher_name:
            publishers = [publisher_name]
        else:
            publishers = []
            log.warn('Did not find publisher for source-agency: %s', item['hub:source-agency'])

        # Resources
        guid = item['guid'] or None
        if guid:
            if not guid.startswith(guid_prefix):
                raise RowParseError('GUID did not start with prefix %r: %r' % (guid_prefix, guid))
            guid = guid[len(guid_prefix):]
            if 'http' in guid:
                raise RowParseError('GUID de-prefixed should not have \'http\' in it still: %r' % (guid))
        download_url = item.get('link', None)

        # Notes are the description followed by the non-empty labelled fields.
        notes_list = []
        if item['description']:
            notes_list.append(item['description'])
        for column, name in [('hub:source-agency', 'Source agency'),
                             ('hub:designation', 'Designation'),
                             ('hub:language', 'Language'),
                             ('hub:altTitle', 'Alternative title'),
                             ]:
            if item[column]:
                notes_list.append('%s: %s' % (name, item[column]))
        notes = '\n\n'.join(notes_list)

        extras = {
            'geographic_coverage': u'',
            'external_reference': u'',
            'temporal_granularity': u'',
            'date_updated': u'',
            'precision': u'',
            'geographic_granularity': u'',
            'temporal_coverage-from': u'',
            'temporal_coverage-to': u'',
            'national_statistic': u'',
            'update_frequency': u'',
            'date_released': u'',
            'categories': u'',
            'series': u'',
            }

        # date_released stays None when the record has no pubDate, so the
        # date-derived fields keep their empty-string defaults instead of
        # raising AttributeError on a plain string.
        date_released = None
        if item['pubDate']:
            date_released = date.parse(item["pubDate"])
            if date_released.qualifier:
                log.warn('Could not read format of publication (release) date: %r',
                         item["pubDate"])
        if date_released is not None:
            extras['date_released'] = date_released.isoformat()
        extras['categories'] = item['hub:theme']
        extras['geographic_coverage'] = self._parse_geographic_coverage(item['hub:coverage'])
        extras['national_statistic'] = 'yes' if item['hub:designation'] == 'National Statistics' else 'no'
        extras['geographic_granularity'] = item['hub:geographic-breakdown']
        extras['external_reference'] = u'ONSHUB'
        extras['series'] = title if release else u''

        # Look for each frequency option in the title and description. The
        # searched text does not depend on the option, so build it once.
        item_info = ('%s %s' % (item['title'], item['description'])).lower()
        for update_frequency_suggestion in schema.update_frequency_options:
            if update_frequency_suggestion in item_info:
                extras['update_frequency'] = update_frequency_suggestion
            elif update_frequency_suggestion.endswith('ly'):
                # Drop exactly the 'ly' suffix. str.rstrip('ly') would strip
                # any run of trailing 'l'/'y' characters instead.
                if update_frequency_suggestion[:-2] in item_info:
                    extras['update_frequency'] = update_frequency_suggestion
        extras['import_source'] = 'ONS-%s' % self._current_filename

        if date_released is not None:
            publish_date = date_released.as_datetime().strftime('%Y-%m-%d')
        else:
            # NOTE(review): empty string chosen to match the empty extras
            # defaults - confirm consumers accept it for 'publish-date'.
            publish_date = u''
        resources = [{
            'url': download_url,
            'description': release,
            'hub-id': guid,
            'publish-date': publish_date,
            }]

        # update package
        pkg_dict = {
            'name': munged_title,
            'title': title,
            'version': None,
            'url': None,
            'maintainer': None,
            'maintainer_email': None,
            'notes': notes,
            'license_id': self._crown_license_id,
            'tags': [],  # post-filled
            'groups': publishers,
            'resources': resources,
            'extras': extras,
            }

        # Suggested tags plus the munged keyword fields; tags of a single
        # character (or empty) are dropped.
        tags = schema.TagSuggester.suggest_tags(pkg_dict)
        for keyword in item['hub:ipsv'].split(';') + \
                item['hub:keywords'].split(';') + \
                item['hub:nscl'].split(';'):
            tag = schema.tag_munge(keyword)
            if tag and len(tag) > 1:
                tags.add(tag)
        pkg_dict['tags'] = sorted(tags)

        return pkg_dict
예제 #10
0
                    topic_dict = self.topic_bigrams
                    key = tuple(words)
                elif len(words) == 3:
                    topic_dict = self.topic_trigrams
                    key = tuple(words)
                else:
                    assert 0, 'Too many words in topic: %s' % topic
                if key not in topic_dict:
                    topic_dict[key] = []
                topic_dict[key].append(name)

            for gemet_keyword in theme_dict.get('gemet', []):
                self.gemet[normalize_keyword(gemet_keyword)] = name
            for ons_keyword in theme_dict.get('nscl', []) + theme_dict.get(
                    'ons', []):
                self.ons[tag_munge(ons_keyword)] = name
            for function_id in theme_dict.get('la_functions', []):
                self.la_function[function_id] = name
            for service_id in theme_dict.get('la_service', []):
                self.la_service[service_id] = name
            for keyword in theme_dict.get('odc', []):
                self.odc[keyword] = name
            self.data[name] = theme_dict
        self.topic_words_set = self.topic_words.viewkeys(
        )  # can do set-like operations on it
        self.topic_bigrams_set = self.topic_bigrams.viewkeys()
        self.topic_trigrams_set = self.topic_trigrams.viewkeys()


def normalize_text(text):
    words = [normalize_token(w) for w in split_words(text)]
예제 #11
0
    def record_2_package(self, item):
        """Convert one ONS hub record into a package dict.

        :param item: dict holding the record's fields. The keys read here are
            'title', 'description', 'guid', 'pubDate', 'link' (optional) and
            the 'hub:' prefixed 'source-agency', 'designation', 'language',
            'altTitle', 'theme', 'coverage', 'geographic-breakdown', 'ipsv',
            'keywords' and 'nscl'.
        :returns: dict with the keys 'name', 'title', 'version', 'url',
            'maintainer', 'maintainer_email', 'notes', 'license_id', 'tags',
            'groups', 'resources' and 'extras'.
        :raises RowParseError: if the GUID does not start with
            ``guid_prefix``, or still contains 'http' once the prefix has
            been removed.
        """
        assert isinstance(item, dict)

        # process item
        title, release = self._split_title(item['title'])
        munged_title = schema.name_munge(title)
        publisher_name = self._source_to_publisher(item['hub:source-agency'])
        if publisher_name:
            publishers = [publisher_name]
        else:
            publishers = []
            log.warn('Did not find publisher for source-agency: %s', item['hub:source-agency'])

        # Resources
        guid = item['guid'] or None
        if guid:
            if not guid.startswith(guid_prefix):
                raise RowParseError('GUID did not start with prefix %r: %r' % (guid_prefix, guid))
            guid = guid[len(guid_prefix):]
            if 'http' in guid:
                raise RowParseError('GUID de-prefixed should not have \'http\' in it still: %r' % (guid))
        download_url = item.get('link', None)

        # Notes are the description followed by the non-empty labelled fields.
        notes_list = []
        if item['description']:
            notes_list.append(item['description'])
        for column, name in [('hub:source-agency', 'Source agency'),
                             ('hub:designation', 'Designation'),
                             ('hub:language', 'Language'),
                             ('hub:altTitle', 'Alternative title'),
                             ]:
            if item[column]:
                notes_list.append('%s: %s' % (name, item[column]))
        notes = '\n\n'.join(notes_list)

        extras = {
            'geographic_coverage': u'',
            'external_reference': u'',
            'temporal_granularity': u'',
            'date_updated': u'',
            'precision': u'',
            'geographic_granularity': u'',
            'temporal_coverage-from': u'',
            'temporal_coverage-to': u'',
            'national_statistic': u'',
            'update_frequency': u'',
            'date_released': u'',
            'categories': u'',
            'series': u'',
            }

        # Only a parsed date has isoformat(); with no pubDate the
        # 'date_released' extra keeps its empty-string default instead of
        # raising AttributeError on a plain string.
        if item['pubDate']:
            date_released = date.parse(item["pubDate"])
            if date_released.qualifier:
                log.warn('Could not read format of publication (release) date: %r',
                         item["pubDate"])
            extras['date_released'] = date_released.isoformat()
        extras['categories'] = item['hub:theme']
        extras['geographic_coverage'] = self._parse_geographic_coverage(item['hub:coverage'])
        extras['national_statistic'] = 'yes' if item['hub:designation'] == 'National Statistics' else 'no'
        extras['geographic_granularity'] = item['hub:geographic-breakdown']
        extras['external_reference'] = u'ONSHUB'
        extras['series'] = title if release else u''

        # Look for each frequency option in the title and description. The
        # searched text does not depend on the option, so build it once.
        item_info = ('%s %s' % (item['title'], item['description'])).lower()
        for update_frequency_suggestion in schema.update_frequency_options:
            if update_frequency_suggestion in item_info:
                extras['update_frequency'] = update_frequency_suggestion
            elif update_frequency_suggestion.endswith('ly'):
                # Drop exactly the 'ly' suffix. str.rstrip('ly') would strip
                # any run of trailing 'l'/'y' characters instead.
                if update_frequency_suggestion[:-2] in item_info:
                    extras['update_frequency'] = update_frequency_suggestion
        extras['import_source'] = 'ONS-%s' % self._current_filename

        resources = [{
            'url': download_url,
            'description': release,
            'hub-id': guid,
            }]

        # update package
        pkg_dict = {
            'name': munged_title,
            'title': title,
            'version': None,
            'url': None,
            'maintainer': None,
            'maintainer_email': None,
            'notes': notes,
            'license_id': self._crown_license_id,
            'tags': [],  # post-filled
            'groups': publishers,
            'resources': resources,
            'extras': extras,
            }

        # Suggested tags plus the munged keyword fields; tags of a single
        # character (or empty) are dropped.
        tags = schema.TagSuggester.suggest_tags(pkg_dict)
        for keyword in item['hub:ipsv'].split(';') + \
                item['hub:keywords'].split(';') + \
                item['hub:nscl'].split(';'):
            tag = schema.tag_munge(keyword)
            if tag and len(tag) > 1:
                tags.add(tag)
        pkg_dict['tags'] = sorted(tags)

        return pkg_dict