def get_identifiers(self):
    '''Collect http, doi and ISBN identifiers from the parsed document.

    Stores cleaned 'http' and 'doi' values in ``self.identifiers`` and, when
    any ISBNs are found, records an 'edition_list' through ``self.set``.
    '''
    # The Dublin Core URI meta wins; the canonical link is the fallback.
    uri = self.check_metas(['DC.Identifier.URI'])
    if not uri:
        canonical = self.doc.select_one('link[rel=canonical]')
        uri = canonical['href'] if canonical else None
    uri = identifier_cleaner('http')(uri)
    if uri:
        self.identifiers['http'] = uri

    doi = self.check_metas(['DC.Identifier.DOI', 'citation_doi'])
    doi = identifier_cleaner('doi')(doi)
    if doi:
        self.identifiers['doi'] = doi

    isbns = self.get_isbns()
    if isbns:
        # need to create edition list: one entry per ISBN reported by
        # get_isbns(), noted with the last '_'-separated part of its key
        editions = [
            {
                'edition_note': key.split('_')[-1],
                'edition_identifiers': {'isbn': isbn},
            }
            for key, isbn in isbns.items()
        ]
    else:
        # no typed ISBNs: fall back to every citation_isbn meta value
        editions = []
        for raw_isbn in self.check_metas(['citation_isbn'], list_mode='list'):
            cleaned = identifier_cleaner('isbn')(raw_isbn)
            if cleaned:
                editions.append({
                    '_edition': cleaned,
                    'edition_identifiers': {'isbn': cleaned},
                })

    if editions:
        self.set('edition_list', editions)
def get_isbns(self):
    '''return a dict of edition keys ('paper', 'electronic') and ISBNs

    Reads the text of the ``#print-isbn`` and ``#electronic-isbn`` elements;
    an edition is only included when its element exists and the cleaned
    ISBN is truthy.
    '''
    found = {}
    sources = (('#print-isbn', 'paper'), ('#electronic-isbn', 'electronic'))
    for selector, edition in sources:
        node = self.doc.select_one(selector)
        if not node:
            continue
        isbn = identifier_cleaner('isbn', quiet=True)(node.text)
        if isbn:
            found[edition] = isbn
    return found
def get_identifiers(self):
    '''Collect http, doi, oclc and ISBN identifiers from the document.

    Cleaned 'http', 'doi' and 'oclc' values go into ``self.identifiers``;
    ISBNs are turned into an 'edition_list' stored through ``self.set``.
    '''
    # The Dublin Core URI meta wins; the canonical link is the fallback.
    uri = self.check_metas([r'DC\.Identifier\.URI'])
    if not uri:
        canonical = self.doc.select_one('link[rel=canonical]')
        uri = canonical['href'] if canonical else None
    uri = identifier_cleaner('http', quiet=True)(uri)
    if uri:
        self.identifiers['http'] = uri

    doi = self.check_metas([r'DC\.Identifier\.DOI', 'citation_doi'])
    doi = identifier_cleaner('doi', quiet=True)(doi)
    if doi:
        self.identifiers['doi'] = doi

    # look for oclc numbers; stop at the first link that yields a valid one
    for link in self.doc.find_all(href=CONTAINS_OCLCNUM):
        found = CONTAINS_OCLCNUM.search(link['href'])
        if not found:
            continue
        oclc = identifier_cleaner('oclc', quiet=True)(found.group(1))
        if oclc:
            self.identifiers['oclc'] = oclc
            break

    isbns = self.get_isbns()
    if isbns:
        # need to create edition list: one entry per ISBN reported by
        # get_isbns(), noted with the last '_'-separated part of its key
        editions = [
            {
                'edition_note': key.split('_')[-1],
                'edition_identifiers': {'isbn': isbn},
            }
            for key, isbn in isbns.items()
        ]
    else:
        # no typed ISBNs: fall back to every citation_isbn meta value
        editions = []
        for raw_isbn in self.check_metas(['citation_isbn'], list_mode='list'):
            cleaned = identifier_cleaner('isbn', quiet=True)(raw_isbn)
            if cleaned:
                editions.append({
                    '_edition': cleaned,
                    'edition_identifiers': {'isbn': cleaned},
                })

    if editions:
        self.set('edition_list', editions)
def clean(self):
    '''Clean the submitted identifier unless a new one is being made.

    When 'make_new' is not set, the stripped 'id_value' is run through the
    cleaner for 'id_type' and stored under ``cleaned_data['value']``.
    Returns ``self.cleaned_data``.
    '''
    data = self.cleaned_data
    id_type = data['id_type']
    id_value = data.get('id_value', '').strip()
    if data.get('make_new', False):
        # nothing to validate when a fresh identifier is requested
        return data
    data['value'] = identifier_cleaner(id_type)(id_value)
    return data
def get_identifiers(self):
    '''Run the inherited identifier scrape, then apply the ``#doi-url`` DOI.

    A truthy cleaned DOI taken from the ``#doi-url`` element's text replaces
    whatever 'doi' the parent implementation stored.
    '''
    super(SpringerScraper, self).get_identifiers()
    doi_el = self.doc.select_one('#doi-url')
    if not doi_el:
        return
    doi = identifier_cleaner('doi', quiet=True)(doi_el.text)
    if doi:
        self.identifiers['doi'] = doi
def get_isbns(self):
    '''return a dict of edition keys and ISBNs

    Typed ``citation_isbn`` metas are tried first; each hit is stored under
    an ``isbn_<format>`` key both in the result and in ``self.identifiers``.
    If none is found, the first 'book:isbn'/'books:isbn' meta (or 'isbn'
    itemprop) is returned under the empty-string key.
    '''
    clean_isbn = identifier_cleaner('isbn', quiet=True)
    formats = {
        'epub': 'EPUB',
        'mobi': 'Mobi',
        'paper': 'Paperback',
        'pdf': 'PDF',
        'hard': 'Hardback'
    }
    found = {}
    for fmt, label in formats.items():
        isbn = clean_isbn(self.check_metas(['citation_isbn'], type=label))
        if not isbn:
            continue
        ident_key = 'isbn_{}'.format(fmt)
        found[ident_key] = isbn
        self.identifiers[ident_key] = isbn
    if found:
        return found

    # untyped fallback: only the first candidate is considered
    candidates = self.check_metas(['book:isbn', 'books:isbn'], list_mode='list')
    if not candidates:
        candidates = self.get_itemprop('isbn')
    if candidates:
        isbn = clean_isbn(candidates[0])
        if isbn:
            return {'': isbn}
    return {}
def clean(self):
    '''Validate the submitted identifier and reject duplicates.

    A non-empty 'id_value' is run through the cleaner for 'id_type' and
    looked up among existing ``Identifier`` rows. The cleaned value is
    accepted when no matching row exists, when the form has no instance,
    when the row belongs to this instance's edition, or when the row has no
    edition and belongs to this instance's work. Otherwise a duplicate
    error is attached to 'id_value'. A ``forms.ValidationError`` raised
    inside the try block is reported on 'id_value' as well.

    Returns ``self.cleaned_data``.
    '''
    id_type = self.cleaned_data['id_type']
    id_value = self.cleaned_data.get('id_value', '').strip()
    if id_value:
        try:
            id_value = identifier_cleaner(id_type)(id_value)
            identifier = Identifier.objects.filter(type=id_type, value=id_value)
            ident = identifier[0] if identifier else None
            if not ident or not self.instance:
                # unused identifier, or nothing to compare ownership against
                self.cleaned_data['id_value'] = id_value
            elif ident.edition_id == self.instance.id:
                # already attached to the edition being edited
                self.cleaned_data['id_value'] = id_value
            elif not ident.edition_id and ident.work_id == self.instance.work_id:
                # work-level identifier on this edition's own work
                self.cleaned_data['id_value'] = id_value
            else:
                if ident.edition_id:
                    err_msg = "{} is a duplicate for edition #{}.".format(
                        id_value, ident.edition_id)
                else:
                    err_msg = "{} is a duplicate for work #{}.".format(
                        id_value, ident.work_id)
                self.add_error('id_value', forms.ValidationError(err_msg))
        # 'as' form works on Python 2.6+ and 3; the comma form is 2-only
        except forms.ValidationError as ve:
            self.add_error(
                'id_value',
                forms.ValidationError('{}: {}'.format(ve.message, id_value)))
    return self.cleaned_data
def get_isbns(self):
    '''add isbn identifiers and return a dict of edition keys and ISBNs

    Looks up the 'Ebook ISBN' and 'Print ISBN' labels through
    ``self.get_dt_dd``; each truthy cleaned value is stored in
    ``self.identifiers`` as ``isbn_<key>`` and returned under ``<key>``.
    '''
    found = {}
    editions = (('electronic', 'Ebook ISBN'), ('paper', 'Print ISBN'))
    for edition, dt_label in editions:
        clean_isbn = identifier_cleaner('isbn')
        isbn = clean_isbn(self.get_dt_dd(dt_label))
        if not isbn:
            continue
        self.identifiers['isbn_{}'.format(edition)] = isbn
        found[edition] = isbn
    return found
def get_isbns(self):
    '''return a dict of edition keys and ISBNs

    For each known format, the ``citation_isbn`` meta of that type is
    cleaned; truthy results are stored under ``isbn_<format>`` both in the
    returned dict and in ``self.identifiers``.
    '''
    formats = {'epub': 'EPUB', 'mobi': 'Mobi', 'paper': 'Paperback',
               'pdf': 'PDF', 'hard': 'Hardback'}
    found = {}
    for fmt, label in formats.items():
        raw = self.check_metas(['citation_isbn'], type=label)
        isbn = identifier_cleaner('isbn')(raw)
        if not isbn:
            continue
        ident_key = 'isbn_{}'.format(fmt)
        found[ident_key] = isbn
        self.identifiers[ident_key] = isbn
    return found
def get_identifiers(self):
    '''Extend the inherited identifiers with 'doi' and 'oapn' values.

    The 'Doi' element content is cleaned as a DOI; the 'OAPENURL' element
    content is passed to ``ids_from_urls`` and its 'oapn' entry, if any, is
    kept. Only truthy results are stored in ``self.identifiers``.
    '''
    clean_doi = identifier_cleaner('doi', quiet=True)
    super(KUMultiScraper, self).get_identifiers()

    doi_url = self.fetch_one_el_content('Doi')
    doi = clean_doi(doi_url) if doi_url else None
    if doi:
        self.identifiers['doi'] = doi

    oapen_url = self.fetch_one_el_content('OAPENURL')
    oapn = ids_from_urls(oapen_url).get('oapn') if oapen_url else None
    if oapn:
        self.identifiers['oapn'] = oapn
def get_isbns(self):
    '''return a dict of edition keys and ISBNs

    The text of the ``p.nfo`` element is split on the literal 'ISBN'; every
    piece after the first that matches ``ISBNMATCH`` is assigned, in order
    of appearance, to the labels 'paper', 'pdf' and 'epub'.

    Returns an empty dict when the element is missing. Matches beyond the
    three known labels are ignored, and a match whose cleaned value is
    falsy still uses up its label but is left out of the result.
    '''
    isbns = {}
    info_el = self.doc.select_one('p.nfo')
    if info_el is None:
        # page has no info paragraph: nothing to report
        return isbns

    isbn_cleaner = identifier_cleaner('isbn', quiet=True)
    # pop() takes from the end, so labels are handed out as
    # 'paper', then 'pdf', then 'epub'
    labels = ['epub', 'pdf', 'paper']
    for isbntext in re.split('ISBN', info_el.text)[1:]:
        if not labels:
            # more ISBNs than known edition labels
            break
        isbnmatch = ISBNMATCH.search(isbntext)
        if not isbnmatch:
            continue
        label = labels.pop()
        isbn = isbn_cleaner(isbnmatch.group(0))
        if isbn:
            isbns[label] = isbn
    return isbns
def get_isbns(self):
    '''return a dict of ``isbn_<format>`` keys and ISBNs

    Each of the Hardback, Paperback, Epdf and Epub elements is fetched with
    ``self.fetch_one_el_content`` and cleaned; only truthy results are kept.
    '''
    clean_isbn = identifier_cleaner('isbn', quiet=True)
    element_keys = (
        ('IsbnHardback', 'isbn_hard'),
        ('IsbnPaperback', 'isbn_paper'),
        ('IsbnEpdf', 'isbn_pdf'),
        ('IsbnEpub', 'isbn_epub'),
    )
    found = {}
    for element, ident_key in element_keys:
        isbn = clean_isbn(self.fetch_one_el_content(element))
        if isbn:
            found[ident_key] = isbn
    return found
def clean(self):
    '''Flag duplicate identifiers and store the cleaned identifier value.

    For a non-empty 'id_value': an error is attached to 'id_value' when an
    ``Identifier`` with the same type and (stripped, uncleaned) value
    already exists. The value is then run through the cleaner for
    'id_type' and stored under ``cleaned_data['value']``; a
    ``forms.ValidationError`` from the cleaner is reported on 'id_value'.

    Returns ``self.cleaned_data``.
    '''
    id_type = self.cleaned_data['id_type']
    id_value = self.cleaned_data.get('id_value', '').strip()
    if id_value:
        identifier = Identifier.objects.filter(type=id_type, value=id_value)
        if identifier:
            # index the queryset once and reuse the row for the message
            duplicate = identifier[0]
            err_msg = "{} is a duplicate for work #{}.".format(
                duplicate, duplicate.work.id)
            self.add_error('id_value', forms.ValidationError(err_msg))
        try:
            self.cleaned_data['value'] = identifier_cleaner(id_type)(
                id_value)
        # 'as' form works on Python 2.6+ and 3; the comma form is 2-only
        except forms.ValidationError as ve:
            self.add_error(
                'id_value',
                forms.ValidationError('{}: {}'.format(ve.message, id_value)))
    return self.cleaned_data
return ['aut', fnf(auth)] def creator_list(creators): auths = [] for auth in creators: auths.append(creator(auth)) return auths DOAB_OAIURL = 'https://www.doabooks.org/oai' DOAB_PATT = re.compile(r'[\./]doabooks\.org/doab\?.*rid:(\d{1,8}).*') mdregistry = MetadataRegistry() mdregistry.registerReader('oai_dc', oai_dc_reader) doab_client = Client(DOAB_OAIURL, mdregistry) isbn_cleaner = identifier_cleaner('isbn', quiet=True) ISBNSEP = re.compile(r'[/]+') def add_by_doab(doab_id, record=None): try: record = record if record else doab_client.getRecord( metadataPrefix='oai_dc', identifier='oai:doab-books:{}'.format(doab_id)) metadata = record[1].getMap() isbns = [] url = None for ident in metadata.pop('identifier', []): if ident.startswith('ISBN: '): isbn_strings = ISBNSEP.split(ident[6:].strip()) for isbn_string in isbn_strings:
def get_isbns(self):
    '''return {'print': isbn} from the record's 'issn' entry, else {}

    The 'issn' value of ``self.record`` (default: empty list) is passed to
    the ISBN cleaner; a falsy result yields an empty dict.
    '''
    raw = self.record.get('issn', [])
    isbn = identifier_cleaner('isbn')(raw)
    if isbn:
        return {'print': isbn}
    return {}
def get_isbns(self):
    '''return {'print': isbn} from the record, or defer to the parent

    With a truthy ``self.record`` its 'issn' entry (default: empty list) is
    passed to the ISBN cleaner and a falsy result yields an empty dict.
    Without a record the inherited ``get_isbns`` is used instead.
    '''
    if not self.record:
        return super(HathitrustScraper, self).get_isbns()
    raw = self.record.get('issn', [])
    isbn = identifier_cleaner('isbn', quiet=True)(raw)
    if isbn:
        return {'print': isbn}
    return {}