def check_colon(title):
    """Drop a recognised prefix that precedes the first colon in ``title``.

    When ``title`` contains a colon, the lower-cased text before the first
    colon is looked up in ``COLON_PREFIXES``. If it is a key there and the
    entry's element at index 1 equals ``'yes'``, everything after that first
    colon is passed through ``clean_string`` and returned. In every other
    case ``title`` is returned unchanged.
    """
    head, colon, remainder = title.partition(u':')
    if not colon:
        # no colon at all: nothing to strip
        return title

    prefix_key = head.lower()
    if prefix_key not in COLON_PREFIXES:
        return title
    if COLON_PREFIXES[prefix_key][1] != 'yes':
        return title

    # remainder still contains any later colons, exactly as they appeared
    return clean_string(remainder)
def _get_resource_from_article(self, article, anchor, context=None):
    """Create a resource for ``anchor`` using data from ``article``.

    The anchor text is cleaned and reconciled with ``article.title`` to get
    the resource title (and, when a second reconciled title exists, an
    extended title). A description is requested from ``context``; if none
    comes back a warning is logged and the title is used as the description.
    Authors, identifiers, language and keywords are derived from those
    values, the resource is built with ``self._make_resource`` and its
    provenance is set from ``article``.

    Raises ``IndexError`` when title reconciliation yields no title.
    Returns the new resource.
    """
    # must stay in this frame: the logger is named after the running function
    logger = logging.getLogger(sys._getframe().f_code.co_name)

    # titles
    link_text = clean_string(anchor.get_text())
    titles = self._reconcile_titles(link_text, article.title)
    try:
        title = titles[0]
    except IndexError:
        raise IndexError('could not extract resource title')
    title_extended = titles[1] if len(titles) > 1 else None

    # description (falls back to the title)
    description = self._get_description(context, title=title)
    if description is None:
        logger.warning(u'could not extract primary resource description from {0}; using title'.format(article.url))
        description = title

    # values derived from the description / titles
    authors = self._parse_authors(description)
    identifiers = self._parse_identifiers(description)
    language = self._get_language(title, title_extended, description)
    keywords = self._parse_keywords(article.title, titles[-1], article.categories)

    # mandatory parameters
    href = anchor.get('href')
    params = {
        'url': href,
        'domain': domain_from_url(href),
        'title': title,
    }

    # optional parameters: (key, value, include?) in insertion order
    optional_fields = (
        ('description', description, description is not None),
        ('identifiers', identifiers, len(identifiers.keys()) > 0),
        ('authors', authors, len(authors) > 0),
        ('title_extended', title_extended, title_extended is not None),
        ('languages', language, language is not None),
        ('keywords', keywords, len(keywords) > 0),
    )
    params.update((key, value) for key, value, wanted in optional_fields if wanted)

    resource = self._make_resource(**params)

    # provenance
    self._set_provenance(resource, article)
    return resource
def _get_primary_resource(self, article):
    """Create the resource described by the primary anchor.

    The anchor comes from ``self._get_primary_anchor()``. Its cleaned text
    is reconciled with ``article.title`` to get the title (plus an extended
    title when a second reconciled title exists). The description comes from
    ``self._get_description`` and silently falls back to the title. Authors,
    identifiers, language and keywords are derived from those values, the
    resource is built with ``self._make_resource`` and its provenance is set
    from ``article``.

    Raises ``IndexError`` when title reconciliation yields no title.
    Returns the new resource.
    """
    # title
    primary = self._get_primary_anchor()
    link_text = clean_string(primary.get_text())
    titles = self._reconcile_titles(link_text, article.title)
    try:
        title = titles[0]
    except IndexError:
        raise IndexError('could not extract resource title')
    title_extended = titles[1] if len(titles) > 1 else None

    # description (falls back to the title)
    description = self._get_description(title=title)
    if description is None:
        description = title

    # values derived from the description / titles
    authors = self._parse_authors(description)
    identifiers = self._parse_identifiers(description)
    language = self._get_language(title, title_extended, description)
    keywords = self._parse_keywords(article.title, titles[-1], article.categories)

    # mandatory parameters; the domain is the href minus its scheme, up to the first slash
    href = primary.get('href')
    schemeless = href.replace('http://', '').replace('https://', '')
    params = {
        'url': href,
        'domain': schemeless.partition('/')[0],
        'title': title,
    }

    # optional parameters: (key, value, include?) in insertion order
    optional_fields = (
        ('description', description, description is not None),
        ('authors', authors, len(authors) > 0),
        ('identifiers', identifiers, len(identifiers.keys()) > 0),
        ('title_extended', title_extended, title_extended is not None),
        ('languages', language, language is not None),
        ('keywords', keywords, len(keywords) > 0),
    )
    params.update((key, value) for key, value, wanted in optional_fields if wanted)

    resource = self._make_resource(**params)

    # provenance
    self._set_provenance(resource, article)
    return resource
def _get_related_resources(self):
    """Build resources for secondary anchors that point at our own domains.

    Every anchor returned by ``self._get_anchors()`` except the first
    (presumably the primary resource's anchor -- confirm against
    ``_get_anchors``) is considered. Only anchors whose ``href`` domain, as
    computed by ``domain_from_url``, is in ``DOMAINS_SELF`` are kept. For
    each kept anchor the title is the cleaned text of the node returned by
    ``self._get_anchor_ancestor_for_title``, and the description is read
    starting from that node's ``next_element``.

    Unlike the primary-resource builders, no fallback is applied when the
    description is ``None``; it is passed on to the parsing helpers as is.

    Returns a list of resources created via ``self._make_resource``, in
    anchor order.
    """
    resources = []
    candidates = self._get_anchors()[1:]
    related = [a for a in candidates if domain_from_url(a.get('href')) in DOMAINS_SELF]
    for a in related:
        # title
        title_context = self._get_anchor_ancestor_for_title(a)
        title = clean_string(title_context.get_text())

        # description
        next_node = title_context.next_element
        desc_text = self._get_description(next_node, title=title)

        # parse identifiers
        identifiers = self._parse_identifiers(desc_text)

        # language
        language = self._get_language(title, desc_text)

        # determine keywords
        keywords = self._parse_keywords(resource_title=title, resource_text=desc_text)

        # create and populate the resource object; _make_resource is the
        # only constructor needed, so no separate Resource() is instantiated
        href = a.get('href')
        params = {
            'url': href,
            'domain': href.replace('http://', '').replace('https://', '').split('/')[0],
            'title': title,
        }
        if desc_text is not None:
            params['description'] = desc_text
        if len(identifiers.keys()) > 0:
            params['identifiers'] = identifiers
        if language is not None:
            params['languages'] = language
        if len(keywords) > 0:
            params['keywords'] = keywords
        resources.append(self._make_resource(**params))
    return resources
def _get_subordinate_resources(self, article, parent_package, start_anchor=None):
    """Build resources for anchors that belong to ``parent_package``.

    Anchors come from ``self._get_anchors()``. When ``start_anchor`` is
    given and found, only it and the anchors after it are considered;
    otherwise all anchors are. An anchor is kept when the domain of
    ``parent_package['url']`` (per ``domain_from_url``) occurs as a
    substring of its ``href``.

    For each kept anchor the title is the cleaned text of the node returned
    by ``self._get_anchor_ancestor_for_title``. ``self._grok_analytic_title``
    is asked for ``(volume, issue, year)``; a ``TypeError`` while unpacking
    its result leaves all three as ``None``. If volume and issue are known
    but the year is not, the first four characters of the enclosing ``li``
    text are used as the year when they form a canonical integer string.

    Each resource is created with ``self._make_resource`` using
    ``parent_package`` as ``is_part_of``, and its provenance is set from
    ``article``.

    Returns the list of created resources, in anchor order.
    """
    resources = []
    anchors = self._get_anchors()

    # start from start_anchor when it is present; otherwise from the beginning
    index = 0
    if start_anchor is not None:
        for i, a in enumerate(anchors):
            if a == start_anchor:
                index = i
                break
    anchors = anchors[index:]

    parent_domain = domain_from_url(parent_package['url'])
    anchors = [a for a in anchors if parent_domain in a.get('href')]

    for a in anchors:
        # title
        title_context = self._get_anchor_ancestor_for_title(a)
        title = clean_string(title_context.get_text(u' '))

        # try to extract volume and year
        try:
            volume, issue, year = self._grok_analytic_title(title)
        except TypeError:
            volume = year = issue = None
        if volume is not None and year is None and issue is not None:
            # sometimes more than one volume falls in a single list item b/c same year or parts
            li_ancestors = a.find_parents('li')
            if li_ancestors:
                # slicing never raises, so only the int conversion needs guarding
                raw = li_ancestors[0].get_text().strip()[0:4]
                try:
                    cooked = str(int(raw))
                except ValueError:
                    pass
                else:
                    # reject forms int() accepts but that are not plain digits
                    # (leading zeros, signs, embedded whitespace)
                    if cooked == raw:
                        year = cooked

        # description
        next_node = title_context.next_sibling
        desc_text = self._get_description(next_node, title=title)

        # parse identifiers
        identifiers = self._parse_identifiers(desc_text)

        # language
        language = self._get_language(title, desc_text)

        # determine keywords
        keywords = self._parse_keywords(resource_title=title, resource_text=desc_text)

        # create and populate the resource object
        href = a.get('href')
        params = {
            'url': href,
            'domain': href.replace('http://', '').replace('https://', '').split('/')[0],
            'title': title,
            'is_part_of': parent_package,
        }
        if desc_text is not None:
            params['description'] = desc_text
        if len(identifiers.keys()) > 0:
            params['identifiers'] = identifiers
        if language is not None:
            params['languages'] = language
        if len(keywords) > 0:
            params['keywords'] = keywords
        if volume is not None:
            params['volume'] = volume
        if year is not None:
            params['year'] = year
        if issue is not None:
            params['issue'] = issue

        resource = self._make_resource(**params)
        self._set_provenance(resource, article)
        resources.append(resource)
    return resources