Beispiel #1
0
def check_colon(title):
    """Strip a recognized colon-delimited prefix from ``title``.

    The text before the first colon is lower-cased and looked up in
    ``COLON_PREFIXES``. When it is present and the second element of its
    entry equals ``'yes'``, everything after that first colon is passed
    through ``clean_string`` and returned. In every other case ``title``
    is returned unchanged.
    """
    head, colon, tail = title.partition(u':')
    if not colon:
        # no colon in the title: nothing to strip
        return title

    key = head.lower()
    if key not in COLON_PREFIXES.keys():
        return title
    if COLON_PREFIXES[key][1] != 'yes':
        return title

    # tail still contains any further colons, exactly as re-joining the
    # remaining split pieces would
    return clean_string(tail)
Beispiel #2
0
    def _get_resource_from_article(self, article, anchor, context=None):
        """Build a resource for ``anchor`` as it appears in ``article``.

        The title is taken from reconciling the cleaned anchor text with
        ``article.title``; the description is extracted from ``context`` and
        falls back to the title when none is found. Authors, identifiers and
        language are parsed from those texts, keywords additionally use the
        article's categories. Provenance is set from ``article`` before the
        resource is returned.

        Raises:
            IndexError: if title reconciliation yields no title at all.
        """
        logger = logging.getLogger(sys._getframe().f_code.co_name)

        # titles: first entry is the title proper, an optional second entry
        # is the extended title
        link_text = clean_string(anchor.get_text())
        candidates = self._reconcile_titles(link_text, article.title)
        try:
            title = candidates[0]
        except IndexError:
            raise IndexError('could not extract resource title')
        title_extended = candidates[1] if len(candidates) > 1 else None

        # description, with the title as a stand-in when nothing was found
        description = self._get_description(context, title=title)
        if description is None:
            logger.warning(u'could not extract primary resource description from {0}; using title'.format(article.url))
            description = title

        # everything derived from the texts gathered above
        authors = self._parse_authors(description)
        identifiers = self._parse_identifiers(description)
        language = self._get_language(title, title_extended, description)
        keywords = self._parse_keywords(article.title, candidates[-1], article.categories)

        # mandatory fields first, then each optional field only if it
        # actually carries a value
        href = anchor.get('href')
        params = {
            'url': href,
            'domain': domain_from_url(href),
            'title': title
        }
        optional_fields = (
            ('description', description, description is not None),
            ('identifiers', identifiers, len(identifiers.keys()) > 0),
            ('authors', authors, len(authors) > 0),
            ('title_extended', title_extended, title_extended is not None),
            ('languages', language, language is not None),
            ('keywords', keywords, len(keywords) > 0),
        )
        for field_name, field_value, wanted in optional_fields:
            if wanted:
                params[field_name] = field_value
        resource = self._make_resource(**params)

        # provenance
        self._set_provenance(resource, article)

        return resource
Beispiel #3
0
    def _get_primary_resource(self, article):
        """Build the resource that the primary anchor points to.

        The title is taken from reconciling the cleaned text of the primary
        anchor with ``article.title``; the description falls back to the
        title when none can be extracted. Authors, identifiers and language
        are parsed from those texts, keywords additionally use the article's
        categories. Provenance is set from ``article`` before the resource
        is returned.

        Raises:
            IndexError: if title reconciliation yields no title at all.
        """
        # titles: first entry is the title proper, an optional second entry
        # is the extended title
        primary = self._get_primary_anchor()
        link_text = clean_string(primary.get_text())
        candidates = self._reconcile_titles(link_text, article.title)
        try:
            title = candidates[0]
        except IndexError:
            raise IndexError('could not extract resource title')
        title_extended = candidates[1] if len(candidates) > 1 else None

        # description, with the title as a stand-in when nothing was found
        description = self._get_description(title=title)
        if description is None:
            description = title

        # everything derived from the texts gathered above
        authors = self._parse_authors(description)
        identifiers = self._parse_identifiers(description)
        language = self._get_language(title, title_extended, description)
        keywords = self._parse_keywords(article.title, candidates[-1], article.categories)

        # the domain is the href with its http(s) scheme removed, cut at the
        # first slash
        href = primary.get('href')
        schemeless = href.replace('http://', '').replace('https://', '')
        params = {
            'url': href,
            'domain': schemeless.split('/')[0],
            'title': title
        }
        # optional fields are only passed on when they carry a value
        optional_fields = (
            ('description', description, description is not None),
            ('authors', authors, len(authors) > 0),
            ('identifiers', identifiers, len(identifiers.keys()) > 0),
            ('title_extended', title_extended, title_extended is not None),
            ('languages', language, language is not None),
            ('keywords', keywords, len(keywords) > 0),
        )
        for field_name, field_value, wanted in optional_fields:
            if wanted:
                params[field_name] = field_value
        resource = self._make_resource(**params)

        # provenance
        self._set_provenance(resource, article)

        return resource
Beispiel #4
0
    def _get_related_resources(self):
        """Collect resources for the non-primary anchors on this site's own domains.

        The first anchor returned by ``_get_anchors`` is skipped, and of the
        remaining ones only those whose href domain is listed in
        ``DOMAINS_SELF`` are turned into resources.

        Returns:
            list: one resource (as built by ``_make_resource``) per
            qualifying anchor, in anchor order.
        """
        resources = []
        anchors = self._get_anchors()[1:]
        anchors = [a for a in anchors if domain_from_url(a.get('href')) in DOMAINS_SELF]
        for a in anchors:
            # title: cleaned text of the ancestor element chosen for this anchor
            title_context = self._get_anchor_ancestor_for_title(a)
            title = clean_string(title_context.get_text())

            # description: extraction starts at the element that follows the
            # title context
            next_node = title_context.next_element
            desc_text = self._get_description(next_node, title=title)

            # parse identifiers
            identifiers = self._parse_identifiers(desc_text)

            # language
            language = self._get_language(title, desc_text)

            # determine keywords
            keywords = self._parse_keywords(resource_title=title, resource_text=desc_text)

            # create and populate the resource object; the domain is the
            # href with its http(s) scheme removed, cut at the first slash
            href = a.get('href')
            params = {
                'url': href,
                'domain': href.replace('http://', '').replace('https://', '').split('/')[0],
                'title': title
            }
            if desc_text is not None:
                params['description'] = desc_text
            if len(identifiers.keys()) > 0:
                params['identifiers'] = identifiers
            if language is not None:
                params['languages'] = language
            if len(keywords) > 0:
                params['keywords'] = keywords
            resources.append(self._make_resource(**params))
        return resources
Beispiel #5
0
    def _get_subordinate_resources(self, article, parent_package, start_anchor=None):
        """Collect resources for anchors that point into the parent package's domain.

        Args:
            article: passed to ``_set_provenance`` for every resource created.
            parent_package: mapping with a ``'url'`` entry; it is attached to
                each resource as ``is_part_of``.
            start_anchor: optional anchor; when it is found among the anchors
                returned by ``_get_anchors``, all anchors before it are
                skipped. When it is not found, no anchor is skipped.

        Returns:
            list: one resource (as built by ``_make_resource``) per
            qualifying anchor, in anchor order.
        """
        resources = []
        anchors = self._get_anchors()
        if start_anchor is not None:
            start = 0
            for i, a in enumerate(anchors):
                if a == start_anchor:
                    start = i
                    break
            anchors = anchors[start:]

        # keep only anchors whose href mentions the parent package's domain
        parent_domain = domain_from_url(parent_package['url'])
        anchors = [a for a in anchors if parent_domain in a.get('href')]

        for a in anchors:
            # title
            title_context = self._get_anchor_ancestor_for_title(a)
            title = clean_string(title_context.get_text(u' '))

            # try to extract volume, issue and year; a TypeError here means
            # the result could not be unpacked into those three values
            try:
                volume, issue, year = self._grok_analytic_title(title)
            except TypeError:
                volume = year = issue = None
            if volume is not None and year is None and issue is not None:
                # sometimes more than one volume falls in a single list item
                # b/c same year or parts: take the year from the start of the
                # enclosing list item instead
                li_parents = a.find_parents('li')
                if li_parents:
                    raw = li_parents[0].get_text().strip()[0:4]
                    try:
                        cooked = str(int(raw))
                    except ValueError:
                        cooked = None
                    # accept only a plain integer literal (no sign, padding
                    # or inner whitespace survives the round trip)
                    if cooked == raw:
                        year = cooked

            # description
            next_node = title_context.next_sibling
            desc_text = self._get_description(next_node, title=title)

            # parse identifiers
            identifiers = self._parse_identifiers(desc_text)

            # language
            language = self._get_language(title, desc_text)

            # determine keywords
            keywords = self._parse_keywords(resource_title=title, resource_text=desc_text)

            # create and populate the resource object; the domain is the
            # href with its http(s) scheme removed, cut at the first slash
            href = a.get('href')
            params = {
                'url': href,
                'domain': href.replace('http://', '').replace('https://', '').split('/')[0],
                'title': title,
                'is_part_of': parent_package
            }
            if desc_text is not None:
                params['description'] = desc_text
            if len(identifiers.keys()) > 0:
                params['identifiers'] = identifiers
            if language is not None:
                params['languages'] = language
            if len(keywords) > 0:
                params['keywords'] = keywords
            if volume is not None:
                params['volume'] = volume
            if year is not None:
                params['year'] = year
            if issue is not None:
                params['issue'] = issue
            resource = self._make_resource(**params)

            self._set_provenance(resource, article)

            resources.append(resource)
        return resources