def scrape_article(url, hhtype):  # pragma: no cover
    """Scrape the metadata of one article page and return it as a ``Source``.

    The page is fetched with ``get_html`` and inspected for a title (first
    ``h3``), a PDF link, page range, date and author links.

    :param url: URL of the article page; its last path segment is used as the \
    id of the returned source.
    :param hhtype: Value stored unchanged under the ``hhtype`` key.
    :return: ``Source('article', <last segment of url>, **metadata)``.
    :raises AssertionError: If no 'Link to item:' PDF link was found, or the \
    link does not end in '.pdf'.
    :raises ValueError: If the volume number cannot be parsed from the PDF URL.
    """
    html = get_html(url)
    md = {
        'title': html.find('h3').text,
        'author': [],
        'hhtype': hhtype,
        'journal': 'Language Documentation and Description',
        'url': url,
    }

    pdf_url = None
    for div in html.find_all('div'):
        if div.text.startswith('Link to item:'):
            pdf_url = div.find('a')['href']
            assert pdf_url.endswith('.pdf')
            code = language_code_from_pdf(pdf_url)
            if code:
                md['lgcode'] = '{} [{}]'.format(get_language_name(md['title']), code)

        # Look the span up once instead of twice.
        span = div.find('span')
        if span and span.text.startswith('Pages'):
            md['pages'] = div.find('div').text

        if div.text.startswith('Date: '):
            md['year'] = div.text.split(':')[1].strip()

    for td in html.find_all('td'):
        link = td.find('a')
        # An anchor without an href attribute must be skipped, not crash the
        # scrape: default to '' so that startswith() is always callable.
        if link and link.attrs.get('href', '').startswith('/authorpage'):
            md['author'].append(link.text)

    assert pdf_url
    match = re.search(r'/ldd(?P<volume>[0-9]+)_[0-9]+\.pdf', pdf_url)
    if match is None:
        # Fail with a message naming the offending URL rather than with an
        # AttributeError on None.
        raise ValueError(
            'Cannot determine volume from PDF URL: {}'.format(pdf_url))
    md.update(match.groupdict())  # adds the 'volume' key

    md['author'] = ' and '.join(md['author'])
    return Source('article', url.split('/')[-1], **md)
def as_source(self):
    """Build a ``Source`` from this object's title, authors and editors.

    The ``author`` and ``editor`` fields are only set when the corresponding
    list is non-empty; names are joined with ' and '.

    :return: ``Source(self.bibtex_type, self.id, **fields)``.
    """
    fields = {'title': self.title}
    # (field name in the source, attribute holding the list of names)
    for field, attribute in (('author', 'authors'), ('editor', 'editors')):
        names = getattr(self, attribute)
        if names:
            fields[field] = ' and '.join(names)
    return Source(self.bibtex_type, self.id, **fields)
def change_request_as_source(id_, rows, ref_ids):
    """Describe one change request as a ``Source`` of type 'misc'.

    Each row's 'Outcome/Effective date' cell is split at the FIRST occurrence
    of '20': the text before it is used as the outcome, the text after it as
    the date (presumably cells look like '<outcome> 20YY-MM-DD' - verify
    against the data). Splitting at every '20' would lose or truncate dates
    such as '2020-01-23' or '2013-01-20'.

    :param id_: Change request identifier (a string); the part before the \
    first '-' is used as the year.
    :param rows: Sequence of mappings with the keys 'Outcome/Effective date', \
    'Change Type', 'Affected Identifier' and 'Language Name'. It is iterated \
    more than once.
    :param ref_ids: Mapping from change request id to a Glottolog reference id.
    :return: ``Source('misc', id_, **fields)``.
    :raises AssertionError: If the rows carry differing dates.
    """
    title = "Change Request Number {0}: ".format(id_)
    title += ", ".join(
        "{0} {1} [{2}]".format(
            row['Outcome/Effective date'].split('20', 1)[0].strip().lower(),
            row['Change Type'].lower(),
            row['Affected Identifier'])
        for row in rows)

    date = None
    for row in rows:
        # partition() splits at the first '20' only, so a later '20' inside
        # the date itself stays part of the date.
        _, separator, remainder = row['Outcome/Effective date'].partition('20')
        if not separator:
            continue
        remainder = remainder.strip()
        if date:
            # All rows of one change request are expected to share one date.
            assert date == remainder
        else:
            date = remainder
    if date:
        title += ' ({0})'.format(date)

    fields = {
        'number': id_,
        'title': title,
        'howpublished': BASE_URL + "/chg_detail.asp?id=" + id_,
        'address': "Dallas",
        'author': "ISO 639-3 Registration Authority",
        'publisher': "SIL International",
        'url': BASE_URL + "/cr_files/{0}.pdf".format(id_),
        'year': id_.split('-')[0],
        'hhtype': "overview",
        'lgcode': ', '.join(
            "{0} [{1}]".format(row['Language Name'].strip(), row['Affected Identifier'])
            for row in rows),
        'src': "iso6393",
    }
    if id_ in ref_ids:
        fields['glottolog_ref_id'] = ref_ids[id_]
    return Source('misc', id_, **fields)
def text(self) -> str:
    """Linearize this entry as plain text.

    A ``Source`` is built from the entry's type, key and fields (with id
    checking switched off) and its ``text()`` result is returned.
    """
    source = Source(self.type, self.key, _check_id=False, **self.fields)
    return source.text()
def test_checks():
    """The id 'a.b' is rejected with ValueError unless ``_check_id=False``."""
    # With the default id check the constructor must refuse the id.
    with pytest.raises(ValueError):
        Source('genre', 'a.b')

    # With the check disabled the id is accepted unchanged ...
    unchecked = Source('genre', 'a.b', _check_id=False)
    assert unchecked.id == 'a.b'

    # ... and the flag is honoured when parsing BibTeX as well.
    parsed = Source.from_bibtex('@misc{a.b,\n}', _check_id=False)
    assert parsed.id == 'a.b'