Esempio n. 1
0
def scrape_article(url, hhtype):  # pragma: no cover
    """Scrape one article page and return its metadata as an ``article`` Source.

    The page is fetched with ``get_html(url)``; the returned object is used
    through ``find``/``find_all``/``.text``/``.attrs`` (presumably a
    BeautifulSoup tree -- confirm against ``get_html``).

    :param url: URL of the article page; its last ``/``-separated segment \
    becomes the ID of the returned Source.
    :param hhtype: Value stored unchanged under the ``hhtype`` key.
    :return: ``Source('article', <id>, **metadata)``.
    :raises AssertionError: If no ``Link to item:`` div was found, or its \
    link does not end in ``.pdf``.
    :raises ValueError: If the PDF URL does not contain the \
    ``/ldd<volume>_<n>.pdf`` pattern the volume is read from.
    """
    html = get_html(url)
    md = {
        'title': html.find('h3').text,
        'author': [],
        'hhtype': hhtype,
        'journal': 'Language Documentation and Description',
        'url': url,
    }
    pdf_url = None
    for div in html.find_all('div'):
        if div.text.startswith('Link to item:'):
            pdf_url = div.find('a')['href']
            assert pdf_url.endswith('.pdf')
            code = language_code_from_pdf(pdf_url)
            if code:
                md['lgcode'] = '{} [{}]'.format(get_language_name(md['title']),
                                                code)
        # Look the span up once instead of searching the div twice.
        span = div.find('span')
        if span and span.text.startswith('Pages'):
            md['pages'] = div.find('div').text
        if div.text.startswith('Date: '):
            md['year'] = div.text.split(':')[1].strip()
    for td in html.find_all('td'):
        link = td.find('a')
        # An anchor without an href (e.g. a named anchor) must be skipped,
        # not crash on ``None.startswith``.
        if link and link.attrs.get('href', '').startswith('/authorpage'):
            md['author'].append(link.text)
    assert pdf_url
    match = re.search(r'/ldd(?P<volume>[0-9]+)_[0-9]+\.pdf', pdf_url)
    if match is None:
        raise ValueError(
            'cannot determine volume from PDF URL: {}'.format(pdf_url))
    md.update(match.groupdict())
    md['author'] = ' and '.join(md['author'])
    return Source('article', url.split('/')[-1], **md)
Esempio n. 2
0
File: pure.py Progetto: clld/ldh
 def as_source(self):
     """Convert this record into a ``Source``.

     The title is always passed on. Authors and editors are passed on only
     when non-empty, each joined into one string with ``' and '``.
     """
     fields = {'title': self.title}
     for key, attr in (('author', 'authors'), ('editor', 'editors')):
         names = getattr(self, attr)
         if names:
             fields[key] = ' and '.join(names)
     return Source(self.bibtex_type, self.id, **fields)
Esempio n. 3
0
def change_request_as_source(id_, rows, ref_ids):
    """Build a ``misc`` Source describing one change request.

    :param id_: Change request identifier (a string); the part before the \
    first ``-`` is used as the year.
    :param rows: Re-iterable collection of mappings with the keys \
    ``'Outcome/Effective date'``, ``'Change Type'``, ``'Affected Identifier'`` \
    and ``'Language Name'``.
    :param ref_ids: Mapping from change request id to a reference id; when \
    ``id_`` is present its value is stored as ``glottolog_ref_id``.
    :return: ``Source('misc', id_, **fields)``.
    :raises AssertionError: If rows carry different effective dates.
    """
    # The 'Outcome/Effective date' value is cut at the FIRST '20' (presumably
    # the century of the date -- confirm against the data): the text before
    # it is the outcome, the text after it the date. ``partition`` splits
    # once only, so a later '20' inside the date (e.g. '2008-01-20') no
    # longer truncates it.
    title = "Change Request Number {0}: ".format(id_)
    title += ", ".join("{0} {1} [{2}]".format(
        r['Outcome/Effective date'].partition('20')[0].strip().lower(),
        r['Change Type'].lower(), r['Affected Identifier']) for r in rows)
    date = None
    for row in rows:
        _, sep, rest = row['Outcome/Effective date'].partition('20')
        if sep:
            rest = rest.strip()
            if date:
                # All rows of one change request must share the same date.
                assert date == rest
            else:
                date = rest
    if date:
        title += ' ({0})'.format(date)
    fields = {
        'number': id_,
        'title': title,
        'howpublished': BASE_URL + "/chg_detail.asp?id=" + id_,
        'address': "Dallas",
        'author': "ISO 639-3 Registration Authority",
        'publisher': "SIL International",
        'url': BASE_URL + "/cr_files/{0}.pdf".format(id_),
        'year': id_.split('-')[0],
        'hhtype': "overview",
        'lgcode': ', '.join(
            "{0} [{1}]".format(r['Language Name'].strip(),
                               r['Affected Identifier']) for r in rows),
        'src': "iso6393",
    }
    if id_ in ref_ids:
        fields['glottolog_ref_id'] = ref_ids[id_]
    return Source('misc', id_, **fields)
Esempio n. 4
0
 def text(self) -> str:
     """Return the text linearization of the entry.

     The entry's type, key and fields are wrapped in a ``Source`` created
     with ``_check_id=False``, and that object's ``text()`` result is
     returned.
     """
     source = Source(self.type, self.key, _check_id=False, **self.fields)
     return source.text()
Esempio n. 5
0
def test_checks():
    """The ID ``'a.b'`` is rejected by default and accepted with ``_check_id=False``."""
    with pytest.raises(ValueError):
        Source('genre', 'a.b')

    unchecked = Source('genre', 'a.b', _check_id=False)
    assert unchecked.id == 'a.b'

    parsed = Source.from_bibtex('@misc{a.b,\n}', _check_id=False)
    assert parsed.id == 'a.b'