def __init__(self, genre, id_, **kw):
    """Initialize the source and build the underlying pybtex entry.

    The name-list fields ('author', 'editor') are popped from the keyword
    arguments and parsed via ``self.persons``; all remaining non-empty
    keyword arguments become entry fields.
    """
    BaseSource.__init__(self, genre, id_, **kw)
    # Pop name lists first so they never end up in the plain fields dict.
    name_lists = {
        role: list(self.persons(kw.pop(role, '')))
        for role in ('author', 'editor')
    }
    assert 'author' not in kw
    fields = {key: value for key, value in kw.items() if value}
    self.entry = database.Entry(genre, fields=fields, persons=name_lists)
def test_Source_from_entry(mocker):
    """Source.from_entry handles entries with and without person lists."""
    entry = mocker.Mock(type='misc', fields={'title': 'abc'}, persons=None)
    src = Source.from_entry('xyz', entry)
    assert src.id == 'xyz'
    assert src.genre == 'misc'
    assert 'author' not in src
    assert src['title'] == 'abc'
    assert '{0}'.format(src) == 'n.d. abc.'
    assert repr(src) == '<Source xyz>'

    entry = mocker.Mock(
        type='misc',
        fields={'title': 'abc'},
        persons={'author': ['Alfred E. Neumann', 'T. M.']})
    src = Source.from_entry('xyz', entry)
    assert src['author'] == 'Alfred E. Neumann and T. M.'
def scrape_article(url, hhtype):  # pragma: no cover
    """Scrape article metadata from a Language Documentation and Description page.

    :param url: URL of the article page.
    :param hhtype: hh-typology value to record for the article.
    :returns: a `Source` of genre 'article' keyed by the last URL path segment.
    :raises AssertionError: if no PDF link is found on the page.
    """
    html = get_html(url)
    md = {
        'title': html.find('h3').text,
        'author': [],
        'hhtype': hhtype,
        'journal': 'Language Documentation and Description',
        'url': url,
    }
    pdf_url = None
    for div in html.find_all('div'):
        if div.text.startswith('Link to item:'):
            pdf_url = div.find('a')['href']
            assert pdf_url.endswith('.pdf')
            code = language_code_from_pdf(pdf_url)
            if code:
                md['lgcode'] = '{} [{}]'.format(get_language_name(md['title']), code)
        if div.find('span') and div.find('span').text.startswith('Pages'):
            md['pages'] = div.find('div').text
        if div.text.startswith('Date: '):
            # maxsplit=1 so date values containing a further ':' are kept intact
            # (the original split(':')[1] would truncate them).
            md['year'] = div.text.split(':', 1)[1].strip()
    for td in html.find_all('td'):
        link = td.find('a')
        # Default to '' so an <a> without an href cannot raise AttributeError.
        if link and link.attrs.get('href', '').startswith('/authorpage'):
            md['author'].append(link.text)
    assert pdf_url
    match = re.search(r'/ldd(?P<volume>[0-9]+)_[0-9]+\.pdf', pdf_url)
    md.update(match.groupdict())
    md['author'] = ' and '.join(md['author'])
    return Source('article', url.split('/')[-1], **md)
def sources(self):
    """Parse sources.bib and map each record's 'key' field to its Source."""
    text = self.path('sources.bib').read_text(encoding='utf8')
    result = {}
    # Records are separated by '@'; re-attach the marker before parsing.
    for chunk in text.split('@'):
        if chunk.strip():
            src = Source.from_bibtex('@' + chunk)
            result[src['key']] = src
    return result
def bibliography(self) -> typing.Dict[str, Source]:
    """
    :returns: `dict` mapping BibTeX IDs to `Reference` instances.
    """
    bib = pybtex.database.parse_string(
        self.bibfile.read_text(encoding='utf8'), bib_format='bibtex')
    return to_dict(
        Source.from_entry(key, entry) for key, entry in bib.entries.items())
def as_source(self):
    """Convert this object to a `Source`, joining name lists BibTeX-style."""
    fields = {'title': self.title}
    for role, names in (('author', self.authors), ('editor', self.editors)):
        if names:
            fields[role] = ' and '.join(names)
    return Source(self.bibtex_type, self.id, **fields)
def bibliography(self):
    """
    :returns: `dict` mapping BibTeX IDs to `Reference` instances.
    """
    with self.bibfile.open(encoding='utf8') as fp:
        bib = pybtex.database.parse_string(fp.read(), bib_format='bibtex')
    return to_dict(
        Source.from_entry(key, entry) for key, entry in bib.entries.items())
def bibtex():
    """Rewrite references/bibtex/iso6393.bib from the current change requests.

    glottolog_ref_id values found in an existing file are carried over into
    the regenerated records.
    """
    bib = references_path("bibtex", "iso6393.bib")
    known_ids = {}
    if bib.exists():
        with bib.open(encoding="utf8") as fp:
            chunks = fp.read().split("@misc")
        for chunk in chunks:
            if not chunk.strip():
                continue
            source = Source.from_bibtex("@misc" + chunk)
            if "glottolog_ref_id" in source:
                known_ids[source.id] = source["glottolog_ref_id"]
    with bib.open("w", encoding="utf8") as fp:
        for cr_id, rows in groupby(iter_change_requests(), lambda c: c["CR Number"]):
            src = change_request_as_source(cr_id, list(rows), known_ids)
            fp.write(src.bibtex())
            fp.write("\n\n")
def doi2bibtex(doi):
    """Resolve a DOI to a BibTeX record via scipython.com's doi2bib service.

    The record is scraped from the <textarea> element of the response page.

    :param doi: DOI string.
    :returns: serialized BibTeX record (str) with id set to ``slug(doi)``
        and a ``key`` field holding the DOI, or None if nothing was scraped.
    """
    url = 'https://scipython.com/apps/doi2bib/?doi={}'.format(
        urllib.parse.quote_plus(doi))
    bibtex, in_bibtex = [], False
    # Use the response as a context manager so the HTTP connection is
    # closed deterministically (the original leaked it).
    with urllib.request.urlopen(url) as req:
        for line in req.read().decode('utf8').split('\n'):
            if line.strip().startswith('</textarea>'):
                break
            if in_bibtex:
                bibtex.append(line)
            if line.strip().startswith('<textarea'):
                in_bibtex = True
    bibtex = '\n'.join(bibtex).strip()
    if bibtex:
        src = Source.from_bibtex(bibtex, _check_id=False)
        src.id = slug(doi)
        src['key'] = doi
        return src.bibtex()
def change_request_as_source(id_, rows, ref_ids):
    """Build a 'misc' `Source` describing one ISO 639-3 change request.

    :param id_: CR number, e.g. '2013-003'.
    :param rows: list of change-request row dicts for this CR.
    :param ref_ids: dict mapping CR numbers to glottolog_ref_id values.
    """
    summaries = [
        "{0} {1} [{2}]".format(
            row['Outcome/Effective date'].split('20')[0].strip().lower(),
            row['Change Type'].lower(),
            row['Affected Identifier'])
        for row in rows]
    title = "Change Request Number {0}: ".format(id_) + ", ".join(summaries)
    # All rows of one CR must agree on the effective date (the part after '20').
    date = None
    for row in rows:
        parts = row['Outcome/Effective date'].split('20')
        if len(parts) > 1:
            if date:
                assert date == parts[1].strip()
            else:
                date = parts[1].strip()
    if date:
        title += ' ({0})'.format(date)
    fields = {
        'number': id_,
        'title': title,
        'howpublished': BASE_URL + "/chg_detail.asp?id=" + id_,
        'address': "Dallas",
        'author': "ISO 639-3 Registration Authority",
        'publisher': "SIL International",
        'url': BASE_URL + "/cr_files/{0}.pdf".format(id_),
        'year': id_.split('-')[0],
        'hhtype': "overview",
        'lgcode': ', '.join(
            "{0} [{1}]".format(row['Language Name'].strip(), row['Affected Identifier'])
            for row in rows),
        'src': "iso6393",
    }
    if id_ in ref_ids:
        fields['glottolog_ref_id'] = ref_ids[id_]
    return Source('misc', id_, **fields)
def bibtex():
    """Regenerate references/bibtex/iso6393.bib from the change requests.

    Existing glottolog_ref_id fields are preserved across the rewrite.
    """
    bib = references_path('bibtex', 'iso6393.bib')
    glottolog_ref_ids = {}
    if bib.exists():
        with bib.open(encoding='utf8') as fp:
            content = fp.read()
        for piece in content.split('@misc'):
            if not piece.strip():
                continue
            record = Source.from_bibtex('@misc' + piece)
            if 'glottolog_ref_id' in record:
                glottolog_ref_ids[record.id] = record['glottolog_ref_id']
    with bib.open('w', encoding='utf8') as fp:
        for number, rows in groupby(iter_change_requests(), lambda c: c['CR Number']):
            fp.write(
                change_request_as_source(number, list(rows), glottolog_ref_ids).bibtex())
            fp.write('\n\n')
# Round-trip test for clldutils.source.Source: each BibTeX fixture must
# linearize (rec.text()) to the expected citation string and serialize
# back (rec.bibtex()) to the same BibTeX it was parsed from.
# NOTE(review): code left byte-identical -- the fixtures' exact internal
# whitespace is significant to the `rec.bibtex().strip() == bib.strip()`
# assertion, so no reformatting was attempted.
def test_linearization(self): from clldutils.source import Source for bib, txt in [ ( """@book{Dayley-1985, address = {Berkeley}, author = {Dayley, Jon P.}, iso_code = {tzt; tzj}, olac_field = {general_linguistics; semantics; morphology; typology; syntax}, publisher = {University of California Press}, series = {University of California Publications in Linguistics}, title = {Tzutujil Grammar}, volume = {107}, wals_code = {tzu}, year = {1985} } """, "Dayley, Jon P. 1985. Tzutujil Grammar. (University of California " "Publications in Linguistics, 107.) Berkeley: University of California " "Press."), ( """@misc{318762, author = {Cook, Eung-Do}, editor = {Some One}, title = {A Tsilhqút'ín Grammar}, issue = {1}, note = {note}, year = {2013} } """, "Cook, Eung-Do. 2013. A Tsilhq\xfat'\xedn Grammar. In Some One (ed.) " "(1). (note)."), ( """@article{467661, address = {Berlin, New York}, author = {Al-Hazemi, Hassan}, journal = {IRAL - International Review of Applied Linguistics in Language Teaching}, number = {2}, issue = {1}, pages = {89-94}, publisher = {Walter de Gruyter}, title = {Listening to the Yes/No vocabulary test}, volume = {38}, year = {2000}, doi = {10.1515/iral.2000.38.2.89}, issn = {0019-042X} }""", "Al-Hazemi, Hassan. 2000. Listening to the Yes/No vocabulary test. IRAL " "- International Review of Applied Linguistics in Language Teaching " "38(1). 89-94. Berlin, New York: Walter de Gruyter."), ( """@book{318762, address = {Vancouver}, author = {Cook, Eung-Do}, pages = {670}, publisher = {UBC Press}, series = {First Nations Languages Series}, title = {A Tsilhqút'ín Grammar}, year = {2013} } """, "Cook, Eung-Do. 2013. A Tsilhqút'ín Grammar. (First Nations Languages " "Series.) Vancouver: UBC Press. 670pp."), ( """@inbook{316361, author = {Healey, Alan}, booktitle = {New Guinea area languages and language study}, pages = {223-232}, title = {History of research in Austronesian languages: Admiralty Islands area}, volume = {2} } """, "Healey, Alan. n.d. 
History of research in Austronesian languages: " "Admiralty Islands area. 2. 223-232."), ( """@incollection{316361, author = {Healey, Alan}, editor = {Peter, Peter}, booktitle = {New Guinea area languages and language study}, pages = {223-232}, title = {History of research in Austronesian languages: Admiralty Islands area}, volume = {2} } """, "Healey, Alan. n.d. History of research in Austronesian languages: " "Admiralty Islands area. In Peter, Peter (ed.), " "New Guinea area languages and language study, 223-232."), ( """@inproceedings{moisikesling2011, author = {Moisik, Scott R. and Esling, John H.}, booktitle = {Proceedings of the Congress of Phonetic Sciences (ICPhS XVII)}, pages = {1406-1409}, title = {The 'whole larynx' approach to laryngeal features}, year = {2011} }""", "Moisik, Scott R. and Esling, John H. 2011. The 'whole larynx' approach " "to laryngeal features. In Proceedings of the Congress of " "Phonetic Sciences (ICPhS XVII), 1406-1409.") ]: rec = Source.from_bibtex(bib, lowercase=True) self.assertEqual(rec.text(), txt) self.assertEqual(rec.bibtex().strip(), bib.strip())
# pytest-style variant of the Source linearization round-trip test: parse
# each BibTeX fixture, check rec.text() against the expected citation and
# rec.bibtex() against the original record. Extends the fixture set with
# @mastersthesis, howpublished-only @misc, and title_english cases.
# NOTE(review): code left byte-identical -- the fixtures' exact internal
# whitespace is significant to `rec.bibtex().strip() == bib.strip()`,
# so no reformatting was attempted.
def test_linearization(): for bib, txt in [ ( """@book{Dayley-1985, address = {Berkeley}, author = {Dayley, Jon P.}, iso_code = {tzt; tzj}, olac_field = {general_linguistics; semantics; morphology; typology; syntax}, publisher = {University of California Press}, series = {University of California Publications in Linguistics}, title = {Tzutujil Grammar}, volume = {107}, wals_code = {tzu}, year = {1985} } """, "Dayley, Jon P. 1985. Tzutujil Grammar. (University of California " "Publications in Linguistics, 107.) Berkeley: University of California " "Press."), ( """@misc{318762, author = {Cook, Eung-Do}, editor = {Some One}, title = {A Tsilhqút'ín Grammar}, issue = {1}, note = {note}, year = {2013} } """, "Cook, Eung-Do. 2013. A Tsilhq\xfat'\xedn Grammar. In Some One (ed.) " "(1). (note)."), ( """@article{467661, address = {Berlin, New York}, author = {Al-Hazemi, Hassan}, journal = {IRAL - International Review of Applied Linguistics in Language Teaching}, number = {2}, issue = {1}, pages = {89-94}, publisher = {Walter de Gruyter}, title = {Listening to the Yes/No vocabulary test}, volume = {38}, year = {2000}, doi = {10.1515/iral.2000.38.2.89}, issn = {0019-042X} }""", "Al-Hazemi, Hassan. 2000. Listening to the Yes/No vocabulary test. IRAL " "- International Review of Applied Linguistics in Language Teaching " "38(1). 89-94. Berlin, New York: Walter de Gruyter."), ( """@book{318762, address = {Vancouver}, author = {Cook, Eung-Do}, pages = {670}, publisher = {UBC Press}, series = {First Nations Languages Series}, title = {A Tsilhqút'ín Grammar}, year = {2013} } """, "Cook, Eung-Do. 2013. A Tsilhqút'ín Grammar. (First Nations Languages " "Series.) Vancouver: UBC Press. 670pp."), ( """@inbook{316361, author = {Healey, Alan}, booktitle = {New Guinea area languages and language study}, pages = {223-232}, title = {History of research in Austronesian languages: Admiralty Islands area}, volume = {2} } """, "Healey, Alan. n.d. 
History of research in Austronesian languages: " "Admiralty Islands area. 2. 223-232."), ( """@incollection{316361, author = {Healey, Alan}, editor = {Peter, Peter}, booktitle = {New Guinea area languages and language study}, pages = {223-232}, title = {History of research in Austronesian languages: Admiralty Islands area}, volume = {2} } """, "Healey, Alan. n.d. History of research in Austronesian languages: " "Admiralty Islands area. In Peter, Peter (ed.), " "New Guinea area languages and language study, 223-232."), ( """@inproceedings{moisikesling2011, author = {Moisik, Scott R. and Esling, John H.}, booktitle = {Proceedings of the Congress of Phonetic Sciences (ICPhS XVII)}, pages = {1406-1409}, title = {The 'whole larynx' approach to laryngeal features}, year = {2011} }""", "Moisik, Scott R. and Esling, John H. 2011. The 'whole larynx' approach " "to laryngeal features. In Proceedings of the Congress of " "Phonetic Sciences (ICPhS XVII), 1406-1409."), ( """@mastersthesis{116989, address = {Ann Arbor}, author = {Bryant, Michael G.}, pages = {ix+151}, publisher = {UMI}, school = {University of Texas at Arlington}, title = {Aspects of Tirmaga Grammar}, year = {1999} }""", "Bryant, Michael G. 1999. Aspects of Tirmaga Grammar. Ann Arbor: UMI. " "(MA thesis, University of Texas at Arlington; ix+151pp.)"), ( """@misc{316754, author = {Radu Voica}, howpublished = {Paper Presented at the APLL-6 Conference, SOAS, London}, title = {Towards and internal classification of the Isabel languages: Th}, year = {2013} }""", "Radu Voica. 2013. Towards and internal classification of the Isabel " "languages: Th. Paper Presented at the APLL-6 Conference, SOAS, London."), ( """@book{312817, address = {Dar es Salaam}, author = {Rugemalira, Josephat Muhozi}, pages = {196}, publisher = {Mradi wa Lugha za Tanzania}, title = {Cigogo: kamusi ya Kigogo-Kiswahili-Kiingereza}, year = {2009}, title_english = {Gogo-Swahili-English, English-Gogo} }""", "Rugemalira, Josephat Muhozi. 2009. 
Cigogo: kamusi ya " "Kigogo-Kiswahili-Kiingereza [Gogo-Swahili-English, " "English-Gogo]. Dar es Salaam: Mradi wa Lugha za Tanzania. 196pp."), ]: rec = Source.from_bibtex(bib, lowercase=True) assert rec.text() == txt assert rec.bibtex().strip() == bib.strip()
# Constructor of a NoRaRe-style API object. It loads per-dataset variable
# annotations from norare.tsv, the bibliography from references/references.bib
# (via Source.from_entry), and dataset metadata from concept_set_meta.tsv;
# datasets mentioned in norare.tsv but absent from concept_set_meta.tsv are
# filled in from a Concepticon clone's conceptlists. Finally every ref of
# every dataset is validated against the combined bibliography.
# NOTE(review): code left byte-identical. The statement
# `concepticon = concepticon` is a no-op as written -- possibly intended as
# `self.concepticon = concepticon`; confirm against upstream before changing.
def __init__(self, repos=None, datasets=None, concepticon=None): API.__init__(self, repos) self.datasets = datasets or collections.OrderedDict() concepticon = concepticon if not concepticon: # pragma: no cover try: concepticon = Concepticon( Config.from_file().get_clone('concepticon')) except KeyError: pass datasets = set() self.annotations = collections.defaultdict( lambda: collections.OrderedDict()) for row in reader(self.repos / 'norare.tsv', delimiter='\t', dicts=True): self.annotations[row['DATASET']][row['NAME'].lower()] = { k.lower(): row[k] for k in [ 'DATASET', 'NAME', 'LANGUAGE', 'STRUCTURE', 'TYPE', 'NORARE', 'RATING', 'SOURCE', 'OTHER', 'NOTE' ] } datasets.add(row['DATASET']) # get bibliography self.refs = collections.OrderedDict() with self.repos.joinpath( 'references', 'references.bib').open(encoding='utf-8') as fp: for key, entry in pybtex.database.parse_string( fp.read(), bib_format='bibtex').entries.items(): self.refs[key] = Source.from_entry(key, entry) all_refs = set(self.refs) if concepticon: all_refs = all_refs.union(concepticon.bibliography) for row in reader(self.repos / 'concept_set_meta.tsv', delimiter='\t', dicts=True): row['norare'] = self row['path'] = self.repos.joinpath('concept_set_meta', row['ID'], row['ID'] + '.tsv-metadata.json') self.datasets[row['ID']] = ConceptSetMeta( **{k.lower(): v for k, v in row.items()}) self.datasets[row['ID']].source_language = [ lg.lower().strip() for lg in self.datasets[row['ID']].source_language.split(',') ] # remaining datasets come from concepticon, we identify them from datasets concepticon_datasets = [d for d in datasets if d not in self.datasets] for dataset in concepticon_datasets: ds = concepticon.conceptlists[dataset] self.datasets[ds.id] = ConceptSetMeta( id=ds.id, author=ds.author, year=ds.year, tags=', '.join(ds.tags), source_language=ds.source_language, target_language=ds.target_language, url=ds.url, refs=ds.refs, note=ds.note, alias=ds.alias, norare=self, 
path=concepticon.repos.joinpath('concepticondata', 'conceptlists', ds.id + '.tsv-metadata.json')) for dataset in self.datasets.values(): if dataset.refs: refs = [dataset.refs] if isinstance(dataset.refs, str) else dataset.refs for ref in refs: if ref not in all_refs: # pragma: no cover raise ValueError( 'missing references.bib: {}'.format(ref))
def test_linearization(bib, txt):
    """Each bib fixture linearizes to txt and round-trips through bibtex()."""
    record = Source.from_bibtex(bib, lowercase=True)
    linearized = record.text()
    roundtripped = record.bibtex()
    assert linearized == txt
    assert roundtripped.strip() == bib.strip()
def text(self) -> str:
    """Return the text linearization of the entry."""
    # _check_id=False: entry keys need not satisfy Source's id constraints.
    source = Source(self.type, self.key, _check_id=False, **self.fields)
    return source.text()
def test_checks():
    """An id containing '.' is rejected unless _check_id=False is passed."""
    with pytest.raises(ValueError):
        Source('genre', 'a.b')
    for src in [
        Source('genre', 'a.b', _check_id=False),
        Source.from_bibtex('@misc{a.b,\n}', _check_id=False),
    ]:
        assert src.id == 'a.b'