Example #1
0
 def __init__(self, genre, id_, **kw):
     """Initialize the source and build the underlying pybtex entry.

     Name fields (``author``/``editor``) are removed from the keyword
     fields and converted into person lists; all remaining non-empty
     keywords become plain BibTeX fields.
     """
     BaseSource.__init__(self, genre, id_, **kw)
     # Pop the name fields *after* the base init so it still sees them.
     persons = {
         role: list(self.persons(kw.pop(role, '')))
         for role in ('author', 'editor')
     }
     assert 'author' not in kw
     non_empty_fields = {key: value for key, value in kw.items() if value}
     self.entry = database.Entry(genre, fields=non_empty_fields, persons=persons)
Example #2
0
def test_Source_from_entry(mocker):
    """`Source.from_entry` copies genre and fields; persons are 'and'-joined."""
    entry = mocker.Mock(type='misc', fields={'title': 'abc'}, persons=None)
    src = Source.from_entry('xyz', entry)
    assert src.id == 'xyz'
    assert src.genre == 'misc'
    assert 'author' not in src
    assert src['title'] == 'abc'
    assert '{0}'.format(src) == 'n.d. abc.'
    assert repr(src) == '<Source xyz>'

    entry = mocker.Mock(
        type='misc',
        fields={'title': 'abc'},
        persons={'author': ['Alfred E. Neumann', 'T. M.']})
    src = Source.from_entry('xyz', entry)
    assert src['author'] == 'Alfred E. Neumann and T. M.'
Example #3
0
def scrape_article(url, hhtype):  # pragma: no cover
    """Scrape article metadata from an LDD article landing page.

    :param url: URL of the article's landing page.
    :param hhtype: document-type annotation recorded in the `hhtype` field.
    :return: a `Source` of genre 'article' keyed by the last URL path segment.
    :raises AssertionError: if no PDF link is found, or the PDF URL does not
        match the expected ``lddNN_MM.pdf`` naming pattern.
    """
    html = get_html(url)
    md = {
        'title': html.find('h3').text,
        'author': [],
        'hhtype': hhtype,
        'journal': 'Language Documentation and Description',
        'url': url,
    }
    pdf_url = None
    for div in html.find_all('div'):
        if div.text.startswith('Link to item:'):
            pdf_url = div.find('a')['href']
            assert pdf_url.endswith('.pdf')
            code = language_code_from_pdf(pdf_url)
            if code:
                md['lgcode'] = '{} [{}]'.format(get_language_name(md['title']),
                                                code)
        if div.find('span') and div.find('span').text.startswith('Pages'):
            md['pages'] = div.find('div').text
        if div.text.startswith('Date: '):
            md['year'] = div.text.split(':')[1].strip()
    for td in html.find_all('td'):
        link = td.find('a')
        # BUGFIX: attrs.get('href') returns None for anchors without an href,
        # which made .startswith raise AttributeError; default to ''.
        if link and link.attrs.get('href', '').startswith('/authorpage'):
            md['author'].append(link.text)
    assert pdf_url
    match = re.search(r'/ldd(?P<volume>[0-9]+)_[0-9]+\.pdf', pdf_url)
    # Fail with a readable message instead of AttributeError on None.groupdict()
    # when the PDF URL does not follow the lddNN_MM.pdf convention.
    assert match, pdf_url
    md.update(match.groupdict())
    md['author'] = ' and '.join(md['author'])
    return Source('article', url.split('/')[-1], **md)
Example #4
0
 def sources(self):
     """Parse ``sources.bib`` and return a mapping of citation key -> Source."""
     raw = self.path('sources.bib').read_text(encoding='utf8')
     result = {}
     # Records are separated by '@'; skip empty fragments between markers.
     for chunk in raw.split('@'):
         if chunk.strip():
             src = Source.from_bibtex('@' + chunk)
             result[src['key']] = src
     return result
Example #5
0
 def bibliography(self) -> typing.Dict[str, Source]:
     """
     :returns: `dict` mapping BibTeX IDs to `Reference` instances.
     """
     content = self.bibfile.read_text(encoding='utf8')
     bibdata = pybtex.database.parse_string(content, bib_format='bibtex')
     return to_dict(
         Source.from_entry(key, entry)
         for key, entry in bibdata.entries.items())
Example #6
0
File: pure.py Project: clld/ldh
 def as_source(self):
     """Convert this record into a `Source`, joining name lists BibTeX-style."""
     fields = {'title': self.title}
     # Only emit author/editor fields when names are actually present.
     for role, names in (('author', self.authors), ('editor', self.editors)):
         if names:
             fields[role] = ' and '.join(names)
     return Source(self.bibtex_type, self.id, **fields)
Example #7
0
 def bibliography(self):
     """
     :returns: `dict` mapping BibTeX IDs to `Reference` instances.
     """
     with self.bibfile.open(encoding='utf8') as fp:
         bibdata = pybtex.database.parse_string(fp.read(), bib_format='bibtex')
     return to_dict(
         Source.from_entry(key, entry)
         for key, entry in bibdata.entries.items())
Example #8
0
def bibtex():
    """Rewrite ``iso6393.bib`` from the current ISO 639-3 change requests.

    Any ``glottolog_ref_id`` fields found in the existing file are preserved
    and re-attached to the regenerated records.
    """
    bib = references_path("bibtex", "iso6393.bib")

    # Harvest glottolog_ref_id values from the current file, if it exists.
    glottolog_ref_ids = {}
    if bib.exists():
        with bib.open(encoding="utf8") as fp:
            content = fp.read()
        for rec in content.split("@misc"):
            if not rec.strip():
                continue
            src = Source.from_bibtex("@misc" + rec)
            if "glottolog_ref_id" in src:
                glottolog_ref_ids[src.id] = src["glottolog_ref_id"]

    # Regenerate the file, one record per change request.
    with bib.open("w", encoding="utf8") as fp:
        for id_, rows in groupby(iter_change_requests(), lambda c: c["CR Number"]):
            src = change_request_as_source(id_, list(rows), glottolog_ref_ids)
            fp.write(src.bibtex())
            fp.write("\n\n")
Example #9
0
def doi2bibtex(doi):
    """Fetch a BibTeX record for ``doi`` via the scipython.com doi2bib service.

    :param doi: the DOI to look up.
    :return: the serialized BibTeX record with the entry id replaced by a slug
        of the DOI (and the DOI itself stored in the ``key`` field), or
        ``None`` if no record could be scraped from the response.
    """
    url = 'https://scipython.com/apps/doi2bib/?doi={}'.format(
        urllib.parse.quote_plus(doi))
    # BUGFIX: close the HTTP response deterministically; the original left
    # the handle open (resource leak).
    with urllib.request.urlopen(url) as response:
        body = response.read().decode('utf8')
    # The record is embedded in a <textarea>...</textarea> of the HTML page.
    bibtex, in_bibtex = [], False
    for line in body.split('\n'):
        if line.strip().startswith('</textarea>'):
            break
        if in_bibtex:
            bibtex.append(line)
        if line.strip().startswith('<textarea'):
            in_bibtex = True
    bibtex = '\n'.join(bibtex).strip()
    if bibtex:
        src = Source.from_bibtex(bibtex, _check_id=False)
        src.id = slug(doi)
        src['key'] = doi
        return src.bibtex()
Example #10
0
def change_request_as_source(id_, rows, ref_ids):
    """Build a ``misc`` `Source` describing one ISO 639-3 change request.

    :param id_: CR number (e.g. ``'2006-001'``; the year is its prefix).
    :param rows: the table rows belonging to this change request.
    :param ref_ids: mapping of CR number to Glottolog reference id.
    """
    summary = ", ".join("{0} {1} [{2}]".format(
        r['Outcome/Effective date'].split('20')[0].strip().lower(),
        r['Change Type'].lower(), r['Affected Identifier']) for r in rows)
    title = "Change Request Number {0}: ".format(id_) + summary

    # All rows of a single CR must agree on the effective date, if given.
    date = None
    for row in rows:
        parts = row['Outcome/Effective date'].split('20')
        if len(parts) > 1:
            if date:
                assert date == parts[1].strip()
            else:
                date = parts[1].strip()
    if date:
        title += ' ({0})'.format(date)

    fields = {
        'number': id_,
        'title': title,
        'howpublished': BASE_URL + "/chg_detail.asp?id=" + id_,
        'address': "Dallas",
        'author': "ISO 639-3 Registration Authority",
        'publisher': "SIL International",
        'url': BASE_URL + "/cr_files/{0}.pdf".format(id_),
        'year': id_.split('-')[0],
        'hhtype': "overview",
        'lgcode': ', '.join(
            "{0} [{1}]".format(r['Language Name'].strip(),
                               r['Affected Identifier']) for r in rows),
        'src': "iso6393",
    }
    if id_ in ref_ids:
        fields['glottolog_ref_id'] = ref_ids[id_]
    return Source('misc', id_, **fields)
Example #11
0
def bibtex():
    """Rewrite ``iso6393.bib`` from the current ISO 639-3 change requests.

    ``glottolog_ref_id`` fields from the existing file are preserved and
    re-attached to the regenerated records.
    """
    bib = references_path('bibtex', 'iso6393.bib')

    # Harvest glottolog_ref_id values from the current file, if present.
    glottolog_ref_ids = {}
    if bib.exists():
        with bib.open(encoding='utf8') as fp:
            content = fp.read()
        for rec in content.split('@misc'):
            if not rec.strip():
                continue
            src = Source.from_bibtex('@misc' + rec)
            if 'glottolog_ref_id' in src:
                glottolog_ref_ids[src.id] = src['glottolog_ref_id']

    # Regenerate the file, one record per change request.
    with bib.open('w', encoding='utf8') as fp:
        for id_, rows in groupby(iter_change_requests(),
                                 lambda c: c['CR Number']):
            src = change_request_as_source(id_, list(rows), glottolog_ref_ids)
            fp.write(src.bibtex())
            fp.write('\n\n')
Example #12
0
    def test_linearization(self):
        """Round-trip check: each BibTeX record linearizes to the expected
        citation text, and re-serializing it reproduces the input record
        byte-for-byte (modulo surrounding whitespace)."""
        from clldutils.source import Source

        # Pairs of (BibTeX record, expected text linearization) covering the
        # genres book, misc, article, inbook, incollection and inproceedings.
        for bib, txt in [
            (
                """@book{Dayley-1985,
  address    = {Berkeley},
  author     = {Dayley, Jon P.},
  iso_code   = {tzt; tzj},
  olac_field = {general_linguistics; semantics; morphology; typology; syntax},
  publisher  = {University of California Press},
  series     = {University of California Publications in Linguistics},
  title      = {Tzutujil Grammar},
  volume     = {107},
  wals_code  = {tzu},
  year       = {1985}
}
                """,
                "Dayley, Jon P. 1985. Tzutujil Grammar. (University of California "
                "Publications in Linguistics, 107.) Berkeley: University of California "
                "Press."),
            (
                """@misc{318762,
  author = {Cook, Eung-Do},
  editor = {Some One},
  title  = {A Tsilhqút'ín Grammar},
  issue  = {1},
  note   = {note},
  year   = {2013}
}
                """,
                "Cook, Eung-Do. 2013. A Tsilhq\xfat'\xedn Grammar. In  Some One (ed.) "
                "(1). (note)."),
            (
                """@article{467661,
  address   = {Berlin, New York},
  author    = {Al-Hazemi, Hassan},
  journal   = {IRAL - International Review of Applied Linguistics in Language Teaching},
  number    = {2},
  issue     = {1},
  pages     = {89-94},
  publisher = {Walter de Gruyter},
  title     = {Listening to the Yes/No vocabulary test},
  volume    = {38},
  year      = {2000},
  doi       = {10.1515/iral.2000.38.2.89},
  issn      = {0019-042X}
}""",
                "Al-Hazemi, Hassan. 2000. Listening to the Yes/No vocabulary test. IRAL "
                "- International Review of Applied Linguistics in Language Teaching "
                "38(1). 89-94. Berlin, New York: Walter de Gruyter."),
            (
                """@book{318762,
  address   = {Vancouver},
  author    = {Cook, Eung-Do},
  pages     = {670},
  publisher = {UBC Press},
  series    = {First Nations Languages Series},
  title     = {A Tsilhqút'ín Grammar},
  year      = {2013}
}
                """,
                "Cook, Eung-Do. 2013. A Tsilhqút'ín Grammar. (First Nations Languages "
                "Series.) Vancouver: UBC Press. 670pp."),
            (
                """@inbook{316361,
  author    = {Healey, Alan},
  booktitle = {New Guinea area languages and language study},
  pages     = {223-232},
  title     = {History of research in Austronesian languages: Admiralty Islands area},
  volume    = {2}
}
                """,
                "Healey, Alan. n.d. History of research in Austronesian languages: "
                "Admiralty Islands area. 2. 223-232."),
            (
                """@incollection{316361,
  author    = {Healey, Alan},
  editor    = {Peter, Peter},
  booktitle = {New Guinea area languages and language study},
  pages     = {223-232},
  title     = {History of research in Austronesian languages: Admiralty Islands area},
  volume    = {2}
}
                """,
                "Healey, Alan. n.d. History of research in Austronesian languages: "
                "Admiralty Islands area. In Peter, Peter (ed.), "
                "New Guinea area languages and language study, 223-232."),
            (
                """@inproceedings{moisikesling2011,
  author    = {Moisik, Scott R. and Esling, John H.},
  booktitle = {Proceedings of the Congress of Phonetic Sciences (ICPhS XVII)},
  pages     = {1406-1409},
  title     = {The 'whole larynx' approach to laryngeal features},
  year      = {2011}
}""",
                "Moisik, Scott R. and Esling, John H. 2011. The 'whole larynx' approach "
                "to laryngeal features. In Proceedings of the Congress of "
                "Phonetic Sciences (ICPhS XVII), 1406-1409.")
        ]:
            # lowercase=True normalizes field names so the serialization of
            # the parsed record can be compared to the input verbatim.
            rec = Source.from_bibtex(bib, lowercase=True)
            self.assertEqual(rec.text(), txt)
            self.assertEqual(rec.bibtex().strip(), bib.strip())
Example #13
0
def test_linearization():
    """Round-trip check: each BibTeX record linearizes to the expected
    citation text, and re-serializing it reproduces the input record
    byte-for-byte (modulo surrounding whitespace)."""
    # Pairs of (BibTeX record, expected text linearization) covering book,
    # misc, article, inbook, incollection, inproceedings, mastersthesis and
    # records with howpublished/title_english fields.
    for bib, txt in [
        (
            """@book{Dayley-1985,
  address    = {Berkeley},
  author     = {Dayley, Jon P.},
  iso_code   = {tzt; tzj},
  olac_field = {general_linguistics; semantics; morphology; typology; syntax},
  publisher  = {University of California Press},
  series     = {University of California Publications in Linguistics},
  title      = {Tzutujil Grammar},
  volume     = {107},
  wals_code  = {tzu},
  year       = {1985}
}
            """,
            "Dayley, Jon P. 1985. Tzutujil Grammar. (University of California "
            "Publications in Linguistics, 107.) Berkeley: University of California "
            "Press."),
        (
            """@misc{318762,
  author = {Cook, Eung-Do},
  editor = {Some One},
  title  = {A Tsilhqút'ín Grammar},
  issue  = {1},
  note   = {note},
  year   = {2013}
}
            """,
            "Cook, Eung-Do. 2013. A Tsilhq\xfat'\xedn Grammar. In  Some One (ed.) "
            "(1). (note)."),
        (
            """@article{467661,
  address   = {Berlin, New York},
  author    = {Al-Hazemi, Hassan},
  journal   = {IRAL - International Review of Applied Linguistics in Language Teaching},
  number    = {2},
  issue     = {1},
  pages     = {89-94},
  publisher = {Walter de Gruyter},
  title     = {Listening to the Yes/No vocabulary test},
  volume    = {38},
  year      = {2000},
  doi       = {10.1515/iral.2000.38.2.89},
  issn      = {0019-042X}
}""",
            "Al-Hazemi, Hassan. 2000. Listening to the Yes/No vocabulary test. IRAL "
            "- International Review of Applied Linguistics in Language Teaching "
            "38(1). 89-94. Berlin, New York: Walter de Gruyter."),
        (
            """@book{318762,
  address   = {Vancouver},
  author    = {Cook, Eung-Do},
  pages     = {670},
  publisher = {UBC Press},
  series    = {First Nations Languages Series},
  title     = {A Tsilhqút'ín Grammar},
  year      = {2013}
}
            """,
            "Cook, Eung-Do. 2013. A Tsilhqút'ín Grammar. (First Nations Languages "
            "Series.) Vancouver: UBC Press. 670pp."),
        (
            """@inbook{316361,
  author    = {Healey, Alan},
  booktitle = {New Guinea area languages and language study},
  pages     = {223-232},
  title     = {History of research in Austronesian languages: Admiralty Islands area},
  volume    = {2}
}
            """,
            "Healey, Alan. n.d. History of research in Austronesian languages: "
            "Admiralty Islands area. 2. 223-232."),
        (
            """@incollection{316361,
  author    = {Healey, Alan},
  editor    = {Peter, Peter},
  booktitle = {New Guinea area languages and language study},
  pages     = {223-232},
  title     = {History of research in Austronesian languages: Admiralty Islands area},
  volume    = {2}
}
            """,
            "Healey, Alan. n.d. History of research in Austronesian languages: "
            "Admiralty Islands area. In Peter, Peter (ed.), "
            "New Guinea area languages and language study, 223-232."),
        (
            """@inproceedings{moisikesling2011,
  author    = {Moisik, Scott R. and Esling, John H.},
  booktitle = {Proceedings of the Congress of Phonetic Sciences (ICPhS XVII)},
  pages     = {1406-1409},
  title     = {The 'whole larynx' approach to laryngeal features},
  year      = {2011}
}""",
            "Moisik, Scott R. and Esling, John H. 2011. The 'whole larynx' approach "
            "to laryngeal features. In Proceedings of the Congress of "
            "Phonetic Sciences (ICPhS XVII), 1406-1409."),
        (
            """@mastersthesis{116989,
  address   = {Ann Arbor},
  author    = {Bryant, Michael G.},
  pages     = {ix+151},
  publisher = {UMI},
  school    = {University of Texas at Arlington},
  title     = {Aspects of Tirmaga Grammar},
  year      = {1999}
}""",
            "Bryant, Michael G. 1999. Aspects of Tirmaga Grammar. Ann Arbor: UMI. "
            "(MA thesis, University of Texas at Arlington; ix+151pp.)"),
        (
            """@misc{316754,
  author       = {Radu Voica},
  howpublished = {Paper Presented at the APLL-6 Conference, SOAS, London},
  title        = {Towards and internal classification of the Isabel languages: Th},
  year         = {2013}
}""",
            "Radu Voica. 2013. Towards and internal classification of the Isabel "
            "languages: Th. Paper Presented at the APLL-6 Conference, SOAS, London."),
        (
            """@book{312817,
  address       = {Dar es Salaam},
  author        = {Rugemalira, Josephat Muhozi},
  pages         = {196},
  publisher     = {Mradi wa Lugha za Tanzania},
  title         = {Cigogo: kamusi ya Kigogo-Kiswahili-Kiingereza},
  year          = {2009},
  title_english = {Gogo-Swahili-English, English-Gogo}
}""",
            "Rugemalira, Josephat Muhozi. 2009. Cigogo: kamusi ya "
            "Kigogo-Kiswahili-Kiingereza [Gogo-Swahili-English, "
            "English-Gogo]. Dar es Salaam: Mradi wa Lugha za Tanzania. 196pp."),
    ]:
        # lowercase=True normalizes field names so the serialization of the
        # parsed record can be compared to the input verbatim.
        rec = Source.from_bibtex(bib, lowercase=True)
        assert rec.text() == txt
        assert rec.bibtex().strip() == bib.strip()
Example #14
0
    def __init__(self, repos=None, datasets=None, concepticon=None):
        """Load datasets, annotations and bibliography from the repository.

        :param repos: path to the data repository (forwarded to `API`).
        :param datasets: optional pre-populated mapping of dataset id to \
            `ConceptSetMeta`; defaults to an empty ordered dict.
        :param concepticon: optional Concepticon API instance; if not given, \
            one is created from the locally configured clone (if any).
        """
        API.__init__(self, repos)
        self.datasets = datasets or collections.OrderedDict()

        # NOTE(review): self-assignment is a no-op; kept byte-identical.
        concepticon = concepticon
        if not concepticon:  # pragma: no cover
            # Fall back to the locally configured 'concepticon' clone; a
            # missing config entry leaves `concepticon` as None.
            try:
                concepticon = Concepticon(
                    Config.from_file().get_clone('concepticon'))
            except KeyError:
                pass

        # Dataset ids seen in norare.tsv; used further down to detect
        # datasets that exist only in Concepticon.
        datasets = set()
        # Annotations indexed by dataset id, then lower-cased variable name.
        self.annotations = collections.defaultdict(
            lambda: collections.OrderedDict())
        for row in reader(self.repos / 'norare.tsv',
                          delimiter='\t',
                          dicts=True):
            self.annotations[row['DATASET']][row['NAME'].lower()] = {
                k.lower(): row[k]
                for k in [
                    'DATASET', 'NAME', 'LANGUAGE', 'STRUCTURE', 'TYPE',
                    'NORARE', 'RATING', 'SOURCE', 'OTHER', 'NOTE'
                ]
            }
            datasets.add(row['DATASET'])

        # get bibliography
        self.refs = collections.OrderedDict()
        with self.repos.joinpath(
                'references', 'references.bib').open(encoding='utf-8') as fp:
            for key, entry in pybtex.database.parse_string(
                    fp.read(), bib_format='bibtex').entries.items():
                self.refs[key] = Source.from_entry(key, entry)

        # Known citation keys: local ones plus (optionally) Concepticon's.
        all_refs = set(self.refs)
        if concepticon:
            all_refs = all_refs.union(concepticon.bibliography)

        for row in reader(self.repos / 'concept_set_meta.tsv',
                          delimiter='\t',
                          dicts=True):
            row['norare'] = self
            row['path'] = self.repos.joinpath('concept_set_meta', row['ID'],
                                              row['ID'] + '.tsv-metadata.json')
            self.datasets[row['ID']] = ConceptSetMeta(
                **{k.lower(): v
                   for k, v in row.items()})
            # Normalize the comma-separated source-language list.
            self.datasets[row['ID']].source_language = [
                lg.lower().strip()
                for lg in self.datasets[row['ID']].source_language.split(',')
            ]

        # remaining datasets come from concepticon, we identify them from datasets
        # NOTE(review): if `concepticon` is still None here and such datasets
        # exist, the attribute access below raises AttributeError — confirm
        # that a configured concepticon clone is guaranteed in that case.
        concepticon_datasets = [d for d in datasets if d not in self.datasets]
        for dataset in concepticon_datasets:
            ds = concepticon.conceptlists[dataset]
            self.datasets[ds.id] = ConceptSetMeta(
                id=ds.id,
                author=ds.author,
                year=ds.year,
                tags=', '.join(ds.tags),
                source_language=ds.source_language,
                target_language=ds.target_language,
                url=ds.url,
                refs=ds.refs,
                note=ds.note,
                alias=ds.alias,
                norare=self,
                path=concepticon.repos.joinpath('concepticondata',
                                                'conceptlists',
                                                ds.id + '.tsv-metadata.json'))

        # Validate that every citation a dataset makes is resolvable.
        for dataset in self.datasets.values():
            if dataset.refs:
                refs = [dataset.refs] if isinstance(dataset.refs,
                                                    str) else dataset.refs
                for ref in refs:
                    if ref not in all_refs:  # pragma: no cover
                        raise ValueError(
                            'missing references.bib: {}'.format(ref))
Example #15
0
def test_linearization(bib, txt):
    """Parametrized round-trip: linearize to ``txt``, re-serialize to ``bib``."""
    record = Source.from_bibtex(bib, lowercase=True)
    assert record.text() == txt
    assert record.bibtex().strip() == bib.strip()
Example #16
0
 def text(self) -> str:
     """Return the text linearization of the entry."""
     source = Source(self.type, self.key, _check_id=False, **self.fields)
     return source.text()
Example #17
0
def test_checks():
    """An id containing '.' is rejected unless ``_check_id=False`` is passed."""
    with pytest.raises(ValueError):
        Source('genre', 'a.b')

    unchecked = Source('genre', 'a.b', _check_id=False)
    assert unchecked.id == 'a.b'
    parsed = Source.from_bibtex('@misc{a.b,\n}', _check_id=False)
    assert parsed.id == 'a.b'