Example #1
def fulltext_to_single_entry(fulltext: str) -> pybtex.Entry:
    """
    Parses a BibTeX entry into a pybtex.Entry
    """
    entry, = pybtex.parse_string(fulltext,
                                 bib_format="bibtex").entries.values()
    return entry
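A minimal usage sketch (not part of the original snippet) showing how this helper might be called; it assumes the module alias implied above, e.g. from pybtex import database as pybtex, and the BibTeX record is made up for illustration:

# Hypothetical caller; the record below is invented for illustration.
record = """
@article{smith2020,
    author = {Smith, Jane},
    title  = {An Example Title},
    year   = {2020},
}
"""
entry = fulltext_to_single_entry(record)
print(entry.key)                              # smith2020
print(entry.fields["title"])                  # An Example Title
print(entry.persons["author"][0].last_names)  # ['Smith']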
Example #2
def _add_file(fname, force_redownload, db, per_file_progress_bar):
    """
    Return #added, #skipped, file_skipped
    """
    if fname.startswith('http'):
        if not force_redownload and db.file_has_been_downloaded(fname):
            return 0, 0, True
        try:
            new_entries = pybtex.parse_string(download_file(fname),
                                              bib_format="bibtex").entries
        except urllib.error.URLError as e:
            raise AddFileError("Error downloading '%s' [%s]" % (fname, str(e)))
        except pybtex.PybtexError:
            raise AddFileError("Error parsing file %s" % fname)
        db.register_file_downloaded(fname)
    else:
        new_entries = pybtex.parse_file(fname, bib_format="bibtex").entries

    added = 0
    skipped = 0
    if per_file_progress_bar:
        iterable = tqdm(
            new_entries.values(),
            ncols=80,
            bar_format="{l_bar}{bar}| [Elapsed: {elapsed} ETA: {remaining}]")
    else:
        iterable = new_entries.values()
    for entry in iterable:
        if db.add(entry):
            added += 1
        else:
            skipped += 1

    return added, skipped, False
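A hypothetical caller sketch (not part of the original snippet); db and the progress-bar flag come from the surrounding application, and "refs.bib" is a made-up local file name:

added, skipped, file_skipped = _add_file(
    "refs.bib", force_redownload=False, db=db, per_file_progress_bar=True)
print("added %d, skipped %d, file skipped: %s" % (added, skipped, file_skipped))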
Example #3
def main():
    doi = _extract_doi(args.identifier[0])

    if doi is None:
        print(args.identifier[0])
    elif args.bibtex:
        result = cn.content_negotiation(doi, format="bibtex")
        bibtex = parse_string(result, "bibtex")
        try:
            name = "".join(
                bibtex.entries.values()[0].persons.values()[0][0].last_names)
            name = name.replace("ä", "ae").replace("ö",
                                                   "oe").replace("ü", "ue")
            name = unidecode(name)
            shortdoi = _short_doi(doi)[3:]
            year = bibtex.entries.values()[0].fields["year"]
            key = "{}_{}_{}".format(name, year, shortdoi)
            new = BibliographyData()
            new.add_entry(key, bibtex.entries[bibtex.entries.keys()[0]])
            print(new.to_string("bibtex"))
        except KeyError:
            print(result)
    else:
        try:
            result = cn.content_negotiation(doi, format=args.format)
            print(result)
        except requests.exceptions.HTTPError:
            print(doi)
    print()
Example #4
def test_backend_html(bib, html):
    style = UnsrtStyle()
    backend = HtmlBackend()
    bib_data = parse_string(bib, 'bibtex')
    for formatted_entry in style.format_entries(bib_data.entries.values()):
        render = formatted_entry.text.render(backend)
        print(render)
        assert render.strip() == html.strip()
Example #5
def push_2_db(bibtex_str, collection_name, db_con):
    """push a collection to database"""

    bib_data = parse_string(bibtex_str, 'bibtex')
    cursor = db_con.cursor()

    # drop existing table if has the same name
    drop_cmd = """DROP TABLE IF EXISTS {}""".format(collection_name)
    cursor.execute(drop_cmd)
    db_con.commit()

    # first create table
    create_cmd = """CREATE TABLE {} (
    ref_tag TEXT PRIMARY KEY, author_list TEXT, journal TEXT,
    volume INTEGER, pages TEXT, year INTEGER, title TEXT, 
    collection TEXT) """.format(collection_name)

    cursor.execute(create_cmd)
    db_con.commit()

    # add each entry to the table
    for entry in bib_data.entries.values():
        entry_fields = entry.lower().fields.keys()
        wanted_fields = {'journal', 'volume', 'pages', 'year', 'title'}

        # valid fields
        valid_fields = []
        for fd in entry_fields:
            if fd in wanted_fields:
                valid_fields += [fd]

        ref_tag, author_list = entry.key, make_author_list(entry)

        # construct insert command
        # we only insert valid fields into database in case of missing data
        # so this takes a few lines to construct sql command
        bracket_1, bracket_2 = "{}", "\"{}\""
        bracket_dict = {
            'journal': bracket_2,
            'volume': bracket_1,
            'pages': bracket_2,
            'year': bracket_1,
            'title': bracket_2
        }
        valid_fd_str = ','.join(valid_fields)
        bracket_str = ','.join([bracket_dict[fd] for fd in valid_fields])
        valid_fd_values = list(
            tidy_string(entry.fields[fd]) for fd in valid_fields)

        insert_cmd = """INSERT INTO {} (ref_tag, author_list, """ + valid_fd_str + \
            """, collection) VALUES """ + """("{}", "{}","""+ bracket_str + """, "{}")"""
        insert_cmd = insert_cmd.format(collection_name, ref_tag, author_list,
                                       *valid_fd_values, collection_name)

        # commit insertion
        cursor.execute(insert_cmd)
        db_con.commit()
    return
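A sketch (not from the original source) of the same per-entry insert written with sqlite3 parameter placeholders instead of hand-quoted values; it assumes the same local variables (collection_name, ref_tag, author_list, valid_fields, valid_fd_values) and the same connection:

# Table and column names cannot be bound as parameters, so they are still
# formatted in; only the values go through "?" placeholders.
columns = ["ref_tag", "author_list"] + valid_fields + ["collection"]
placeholders = ", ".join("?" for _ in columns)
insert_cmd = "INSERT INTO {} ({}) VALUES ({})".format(
    collection_name, ", ".join(columns), placeholders)
cursor.execute(insert_cmd,
               [ref_tag, author_list, *valid_fd_values, collection_name])
db_con.commit()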
Example #6
    def add(self, *entries):
        """
        Add a source, either specified by glottolog reference id, or as bibtex record.
        """
        for entry in entries:
            if isinstance(entry, string_types):
                self._add_entries(database.parse_string(entry, bib_format='bibtex'))
            else:
                self._add_entries(entry)
Example #7
    def add(self, *entries):
        """
        Add a source, either specified by glottolog reference id, or as bibtex record.
        """
        for entry in entries:
            if isinstance(entry, string_types):
                self._add_entries(
                    database.parse_string(entry, bib_format='bibtex'))
            else:
                self._add_entries(entry)
Example #8
def read_bib_entries(*locations):
    """Yield pybtex.database.Entry objects from each location in turn.
    Locations can be file names or strings containing file contents.
    """
    for loc in locations:
        if os.path.isfile(loc):
            with open(loc, encoding='latin1') as f:
                loc = f.read()
        for item in database.parse_string(loc.replace('}.', '},'),
                                          'bibtex').entries.values():
            yield item
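A hypothetical call (not part of the original snippet); it mixes a file path and an in-memory BibTeX string, both of which read_bib_entries accepts, and assumes a refs.bib file exists on disk:

for entry in read_bib_entries("refs.bib",
                              "@book{key1, title={A Title}, year={2000}}"):
    print(entry.key, entry.fields.get("title"))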
Example #9
def aggregate_snls(snls):
    """
    Aggregates a series of SNLs into the fields for a single SNL
    """
    # Choose earliest created_at
    created_at = sorted([snl["about"]["created_at"]["string"] for snl in snls])[0]

    # Choose earliest history
    history = sorted(snls, key=lambda snl: snl["about"]["created_at"]["string"])[0]["about"]["history"]

    # Aggregate all references into one dict to remove duplicates
    refs = {}
    for snl in snls:
        try:
            entries = parse_string(snl["about"]["references"], bib_format="bibtex")
            refs.update(entries.entries)
        except Exception:
            self.logger.debug("Failed parsing bibtex: {}".format(snl["about"]["references"]))

    entries = BibliographyData(entries=refs)
    references = entries.to_string("bibtex")

    # Aggregate all remarks
    remarks = list(set([remark for snl in snls for remark in snl["about"]["remarks"]]))

    # Aggregate all projects
    projects = list(set([projects for snl in snls for projects in snl["about"]["projects"]]))

    # Aggregate all authors - Converting a single dictionary first performs duplicate checking
    authors = {entry["name"].lower(): entry["email"] for snl in snls for entry in snl["about"]["authors"]}
    authors = [{"name": name.title(), "email": email} for name, email in authors.items()]

    # Aggregate all the database IDs
    db_ids = defaultdict(list)
    for snl in snls:
        if len(snl["about"]["history"]) == 1 and \
                snl["about"]["history"][0]["name"] in DB_indexes:
            db_name = snl["about"]["history"][0]["name"]
            db_id_key = DB_indexes[db_name]
            db_ids[db_id_key].append(snl["about"]["history"][0]["description"].get("id", None))
    # remove Nones and empty lists
    db_ids = {k: list(filter(None, v)) for k, v in db_ids.items() if len(list(filter(None, v))) > 0}

    snl_fields = {
        "created_at": created_at,
        "history": history,
        "references": references,
        "remarks": remarks,
        "projects": projects,
        "authors": authors,
        "data": {"_db_ids": db_ids}
    }

    return snl_fields
Example #10
def test_biblio(stagingArea):

    if "References" in ref_field:
        for reference in ref_field["References"]:
            if "bibtex" in reference:
                bibtex_str = reference["bibtex"]
                # force a key if not present, so that the entry parses correctly
                bibtex_str = bibtex_str.replace("@Article{,", "@Article{toto,")
                biblio = None
                #try:
                biblio = parse_string(bibtex_str, "bibtex")
                #except:
                #    print("Failed to parse the bibtext string:", bibtex_str)

                if biblio is not None:
                    for key in biblio.entries:
                        print(key)
                        local_title = biblio.entries[key].fields["title"]
                        local_authors = biblio.entries[key].persons
                        if "author" in local_authors:
                            all_authors = local_authors["author"]
                            first_author_last_name = all_authors[0].last_names[
                                0]

                        text_format_ref = format_from_string(bibtex_str,
                                                             style="plain")
                        res_format_ref = ""
                        for line_format_ref in text_format_ref.split("\n"):
                            if line_format_ref.startswith("\\newblock"):
                                res_format_ref += line_format_ref.replace(
                                    "\\newblock", "")
                            elif len(line_format_ref.strip(
                            )) != 0 and not line_format_ref.startswith("\\"):
                                res_format_ref += line_format_ref

                        res_format_ref = res_format_ref.strip()
                        res_format_ref = res_format_ref.replace("\\emph{", "")
                        res_format_ref = res_format_ref.replace("\\url{", "")
                        res_format_ref = res_format_ref.replace("}", "")
                        print(res_format_ref)

                        print(
                            stagingArea.biblio_glutton_lookup(
                                raw_ref=res_format_ref,
                                title=local_title,
                                first_author_last_name=first_author_last_name))

            if "raw" in reference:
                # this can be sent to GROBID
                print(reference["raw"])
                print(
                    stagingArea.biblio_glutton_lookup(
                        raw_ref=reference["raw"]))
Example #11
def import_citations(path_citations):
    # This function creates a list of pybtex objects; each object holds one citation
    my_file = open(path_citations, 'r', encoding='utf-8')
    strings = my_file.read().split("@")
    bib_datas = []
    for stringa in strings:
        if stringa == "":
            continue
        stringa = '@'+stringa
        bib_data = parse_string(stringa, "bibtex")
        bib_datas.append(bib_data)
    my_file.close()
    return bib_datas
Example #12
def bib2html(bibliography, id_use_order=(), exclude_fields=None):
    exclude_fields = exclude_fields or []
    if exclude_fields:

        bibliography = parse_string(bibliography.to_string('bibtex'), 'bibtex')

        for entry in bibliography.entries.values():

            for ef in exclude_fields:
                if ef in entry.fields.__dict__['_dict']:
                    del entry.fields.__dict__['_dict'][ef]

    # return old_format(bibliography)
    return new_format(bibliography, id_use_order)
Example #13
def bibtex_to_dict(bibtex):
    try:
        bibdict = {}
        #['doi', 'url', 'year', 'month', 'publisher', 'pages', 'title', 'journal', 'volume', 'number', 'booktitle', 'keywords']
        bibtex = parse_string(bibtex, bib_format="bibtex")
        entry = bibtex.entries.values()[0]

        for field in entry.fields.keys():
            bibdict[field] = entry.fields[field]

        for key in entry.persons.keys():
            bibdict[key] = []
            for people in entry.persons[key]:
                bibdict[key].append(str(people))
    except Exception as e:
        pass
    return bibdict
Example #14
def download_from_openreview(url, dirpath='.'):
    url = url.rstrip('\n')
    if '/pdf?' in url:
        url = url.replace('/pdf?', '/forum?')
    page_source = urllib.request.urlopen(url).read().decode('utf-8')
    xml = etree.fromstring(page_source, parser=etree.HTMLParser())
    bib = xml.xpath('//a[@class="action-bibtex-modal"]/@data-bibtex')[0]
    bib_database = database.parse_string(bib, bib_format='bibtex')
    author_lastname = bib_database.entries.values(
    )[0].persons['author'][0].last()[0]
    year = bib_database.entries.values()[0].fields['year'].strip()
    title = bib_database.entries.values()[0].fields['title'].strip()
    out_name = '[{}+{}] {}.pdf'.format(author_lastname, year,
                                       title).replace('{',
                                                      '').replace('}', '')

    path = os.path.join(dirpath, out_name)
    pdf_url = url.replace('/forum?', '/pdf?')
    logger.info('Download "{}" from "{}"'.format(title, pdf_url))
    urlretrieve(pdf_url, path, reporthook=reporthook)
    return path
Example #15
def download_from_acl(url, dirpath='.'):
    if url.endswith('.pdf'):
        url = url[:-4]  # strip '.pdf'

    # get filename
    bib_url = url.strip('\n').rstrip('/') + '.bib'
    bib = urllib.request.urlopen(bib_url).read().decode('utf-8')
    bib_database = database.parse_string(bib, bib_format='bibtex')
    author_lastname = bib_database.entries.values(
    )[0].persons['author'][0].last()[0]
    year = bib_database.entries.values()[0].fields['year'].strip()
    title = bib_database.entries.values()[0].fields['title'].strip()
    out_name = '[{}+{}] {}.pdf'.format(author_lastname, year,
                                       title).replace('{',
                                                      '').replace('}', '')

    # get authorname
    path = os.path.join(dirpath, out_name)
    pdf_url = url.strip('\n').rstrip('/') + '.pdf'
    logger.info('Download "{}" from "{}"'.format(title, pdf_url))
    urlretrieve(pdf_url, path, reporthook=reporthook)
    return path
Example #16
def mostrarSeccionCarga():
    conectarBd()
    uploaded_file = st.file_uploader("Archivo Bibtex con la información de los papers")
    before = len(Paper.objects)
    if uploaded_file is not None:
        # To read file as bytes:
        bytes_data = uploaded_file.read()
        data = bytes_data.decode("utf-8")
        bib_data = parse_string(data, 'bibtex')
        notdoi = []
        papers = []
        with st.spinner("Preprocesando el archivo para la carga..."):
            total = sum(1 for entry in bib_data.entries.values())
        st.success("Se iniciará la carga de "+str(total)+" papers a la base de datos.")
        my_bar = st.progress(.0)
        loaded = 0
        for entry in bib_data.entries.values():
            fields = entry.fields
            title = fields["title"].replace('{', '').replace('}', '')
            doi = fields.get("doi")
            isOnlyReference = False
            loaded+=1
            my_bar.progress(loaded/total)
            if doi is None:
                notdoi.append(title)
                continue
            abstract = fields.get("abstract","")
            paper = Paper(title = title, doi = doi , abstract = abstract, isOnlyReference = isOnlyReference).save()
            papers.append(paper)

        after = len(Paper.objects)
        st.success("Se ingresaron "+ str(after-before) + " papers a la base de datos")
        st.write([x.title for x in papers])
        if len(notdoi):
            st.error ("No se pudo ingresar " + str(len(notdoi)) + " debido a que no se conocía su doi")
            st.write(notdoi)
Example #17
def getSourceFromBibTex(source):
    "utility function to read source from bibtex"
    return database.parse_string(source, bib_format="bibtex")
Example #18
def getEvoBibAsSource(key):
    """Download bibtex format and parse it from EvoBib"""
    return database.parse_string(
        urlopen("http://bibliography.lingpy.org/raw.php?key=" +
                key).read().decode('utf-8'),
        bib_format='bibtex')
Example #19
    def to_cldf(self, ds, concepts):
        """
        :param ds: the dataset object
        :param concepts: a dictionary mapping concept labels to concept ids

        :return: A dataset object, ds.
        """
        source = []
        if self.language.source:
            bib = parse_string(self.language.source, "bibtex")
            try:
                ds.add_sources(
                    *[Source.from_entry(k, e) for k, e in bib.entries.items()])
                source = list(bib.entries.keys())
            except:  # noqa: E722
                self.log.warning("Invalid citekey for %s" % self.language.id)

        ds.add_language(ID=self.language.id,
                        Glottocode=self.language.glottocode,
                        ISO639P3code=self.language.iso,
                        Name=self.language.name,
                        author=self.language.author,
                        url=self.url('language.php?id=%s' % self.language.id),
                        typedby=self.language.typedby,
                        checkedby=self.language.checkedby,
                        notes=self.language.notes,
                        source=";".join(source))

        for entry in self.entries:
            if entry.name is None or len(
                    entry.name) == 0:  # skip empty entries
                continue  # pragma: no cover

            # skip entries marked as incorrect word form due to semantics
            # (x = probably, s = definitely)
            if entry.cognacy and entry.cognacy.lower() in ('s', 'x'):
                continue  # pragma: no cover

            # handle concepts
            cid = concepts.get(entry.word_id)
            if not cid:
                self.dataset.unmapped.add_concept(ID=entry.word_id,
                                                  Name=entry.word)
                # add it if we don't have it.
                ds.add_concept(ID=entry.word_id, Name=entry.word)
                cid = entry.word_id

            # handle lexemes
            try:
                lex = ds.add_forms_from_value(
                    Local_ID=entry.id,
                    Language_ID=self.language.id,
                    Parameter_ID=cid,
                    Value=entry.name,
                    # set source to entry-level sources if they exist, otherwise use
                    # the language level source.
                    Source=[entry.source] if entry.source else source,
                    Cognacy=entry.cognacy,
                    Comment=entry.comment or '',
                    Loan=True if entry.loan and len(entry.loan) else False,
                )
            except:  # NOQA: E722; pragma: no cover
                print("ERROR with %r -- %r" % (entry.id, entry.name))
                raise

            if lex:
                for cognate_set_id in entry.cognates:
                    match = self.dataset.cognate_pattern.match(cognate_set_id)
                    if not match:  # pragma: no cover
                        self.log.warning(
                            'Invalid cognateset ID for entry {0}: {1}'.format(
                                entry.id, cognate_set_id))
                    else:
                        # make global cognate set id
                        cs_id = "%s-%s" % (slug(entry.word), match.group('id'))

                        ds.add_cognate(lexeme=lex[0],
                                       Cognateset_ID=cs_id,
                                       Doubt=bool(match.group('doubt')),
                                       Source=['Greenhilletal2008'] if
                                       self.section == 'austronesian' else [])

        return ds
Example #20
    def from_SNLs(
        cls,
        material_id: Union[MPID, int],
        snls: List[Dict],
    ) -> "ProvenanceDoc":
        """
        Converts legacy Pymatgen SNLs into a single provenance document
        """

        # Choose earliest created_at
        created_at = sorted(
            [get(snl, "about.created_at.string", datetime.max) for snl in snls]
        )[0]

        # Choose earliest history
        history = sorted(
            snls, key=lambda snl: get(snl, "about.created_at.string", datetime.max)
        )[0]["about"]["history"]

        # Aggregate all references into one dict to remove duplicates
        refs = {}
        for snl in snls:
            try:
                entries = parse_string(snl["about"]["references"], bib_format="bibtex")
                refs.update(entries.entries)
            except Exception:
                warnings.warn(f"Failed parsing bibtex: {snl['about']['references']}")

        bib_data = BibliographyData(entries=refs)
        references = [ref.to_string("bibtex") for ref in bib_data.entries]

        # TODO: Maybe we should combine this robocrystallographer?
        # TODO: Refine these tags / remarks
        remarks = list(
            set([remark for snl in snls for remark in snl["about"]["remarks"]])
        )
        tags = [r for r in remarks if len(r) < 140]

        # Aggregate all authors - Converting a single dictionary first
        # performs duplicate checking
        authors_dict = {
            entry["name"].lower(): entry["email"]
            for snl in snls
            for entry in snl["about"]["authors"]
        }
        authors = [
            {"name": name.title(), "email": email}
            for name, email in authors_dict.items()
        ]

        # Check if this entry is experimental
        if any(get(snl, "about.history.0.experimental", False) for snl in snls):
            experimental = True

        # Aggregate all the database IDs
        snl_ids = [snl.get("snl_id", "") for snl in snls]
        db_ids = {
            Database(db_id): [snl_id for snl_id in snl_ids if db_id in snl_id]
            for db_id in map(str, Database)
        }

        # remove Nones and empty lists
        db_ids = {k: list(filter(None, v)) for k, v in db_ids.items()}
        db_ids = {k: v for k, v in db_ids.items() if len(v) > 0}

        # Get experimental bool
        experimental = any(
            get(snl, "about.history.0.experimental", False) for snl in snls
        )

        snl_fields = {
            "created_at": created_at,
            "references": references,
            "authors": authors,
            "remarks": remarks,
            "tags": tags,
            "database_IDs": db_ids,
            "theoretical": not experimental,
            "history": history,
        }

        return ProvenanceDoc(material_id=material_id, **snl_fields)
Example #21
    def deserialize(self, string):
        return parse_string(string, self.bib_format)
Example #22
def main(scripts, dev, glr):
    cldf_dir = Path('cldf')
    bib = parse_string(read_text(cldf_dir / 'sources.bib'),
                       bib_format='bibtex')
    write_text(cldf_dir / 'sources.bib', bib.lower().to_string('bibtex'))

    glottolog = Glottolog(glr)
    ds = StructureDataset.in_dir(cldf_dir)
    ds.tablegroup.notes.append(
        OrderedDict([('dc:title', 'environment'),
                     ('properties',
                      OrderedDict([
                          ('glottolog_version', git_describe(glottolog.repos)),
                      ]))]))
    ds.add_columns('ValueTable', {
        'name': 'Marginal',
        'datatype': 'boolean'
    }, {
        'name': 'Allophones',
        'separator': ' '
    }, 'Contribution_ID')
    features = [
        "tone", "stress", "syllabic", "short", "long", "consonantal",
        "sonorant", "continuant", "delayedRelease", "approximant", "tap",
        "trill", "nasal", "lateral", "labial", "round", "labiodental",
        "coronal", "anterior", "distributed", "strident", "dorsal", "high",
        "low", "front", "back", "tense", "retractedTongueRoot",
        "advancedTongueRoot", "periodicGlottalSource", "epilaryngealSource",
        "spreadGlottis", "constrictedGlottis", "fortis",
        "raisedLarynxEjective", "loweredLarynxImplosive", "click"
    ]
    ds.add_component('ParameterTable', 'SegmentClass', *features)
    ds.add_component('LanguageTable')
    ds.add_table(
        'contributions.csv', 'ID', 'Name', 'Contributor_ID', {
            'name': 'Source',
            'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source',
            'separator': ';'
        }, 'URL')
    ds.add_table(
        'contributors.csv',
        'ID',
        'Name',
        'Description',
        'Readme',
        'Contents',
        {
            'name': 'Source',
            'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source',
            'separator': ';'
        },
        'URL',
    )

    def read(what):
        return reader(scripts / 'to_cldf' / 'cldf' / what, namedtuples=True)

    languoids = {l.id: l for l in glottolog.languoids()}

    values, segments, languages, inventories, sources = [], [], {}, {}, []
    for contrib in read('contributors.csv'):
        sources.append(
            dict(
                ID=contrib.Name,
                Name=contrib.Contributor,
                Description=contrib.Description,
                Readme=desc(dev, contrib.Name),
                Contents=contrib.Contents,
                Source=[
                    c.strip().lower() for c in contrib.Citation.split(';')
                ],
                URL=contrib.SourceURL if contrib.SourceURL != 'NA' else '',
            ))

    pid_map = {}
    for row in read('parameters.csv'):
        pid = md5(row.Description.encode('utf8')).hexdigest().upper()
        pid_map[row.ID] = pid
        segments.append(
            dict(ID=pid,
                 Name=row.Name,
                 Description=row.Description,
                 SegmentClass=row.SegmentClass,
                 **{f: getattr(row, f)
                    for f in features}))

    src = {}
    for row in read('contributions.csv'):
        src[row.ID] = row.References.split(
            ';') if row.References != 'no source given' else []
        src[row.ID] = [sid.lower() for sid in src[row.ID]]
        inventories[row.ID] = dict(ID=row.ID,
                                   Name=row.Name,
                                   Contributor_ID=row.Contributor_ID,
                                   URL=row.URI if row.URI != 'NA' else '',
                                   Source=src[row.ID])

    uniq = set()
    for row in read('values.csv'):
        pk = (row.Language_ID, row.Parameter_ID, row.Contribution_ID)
        if pk in uniq:
            print('skipping duplicate phoneme {0}'.format(pk))
            continue
        uniq.add(pk)
        lid = row.Language_ID if row.Language_ID in languoids else slug(
            inventories[row.Contribution_ID]['Name'])
        if lid not in languages:
            #
            # FIXME: Language_ID == 'NA' for three inventories! This must be mapped!
            #
            lang = languoids.get(lid)
            languages[lid] = dict(
                ID=lid,
                Name=lang.name if lang else None,
                Glottocode=lang.id if lang else None,
                ISO639P3code=row.ISO639P3code
                if row.ISO639P3code != 'NA' else None,
            )
        values.append(
            dict(
                ID=row.ID,
                Language_ID=lid,
                Parameter_ID=pid_map[row.Parameter_ID],
                Contribution_ID=row.Contribution_ID,
                Value=row.Name,
                Marginal=None if row.Marginal == 'NA' else eval(
                    row.Marginal.lower().capitalize()),  # FALSE|TRUE|NA
                Allophones=row.Allophones.split()
                if row.Allophones != 'NA' else [],
                Source=src[row.Contribution_ID],
            ))

    ds.write(
        **{
            'ValueTable': values,
            'LanguageTable': languages.values(),
            'ParameterTable': segments,
            'contributions.csv': inventories.values(),
            'contributors.csv': sources
        })
    ds.validate(logging.getLogger(__name__))
Example #23
    def read(self, fname):
        self._add_entries(
            database.parse_string(read_text(fname), bib_format='bibtex'))
Example #24
    def cmd_makecldf(self, args):
        args.writer.add_sources(self.raw_dir.read("Citations.bib"))
        bib = parse_string(self.raw_dir.read('Borrowing_references.bib'),
                           'bibtex')
        for k, v in bib.entries.items():
            args.writer.add_sources(
                Source.from_entry(slug(k, lowercase=False), v))

        args.writer.cldf.add_component(
            'BorrowingTable', {
                'name': 'Likelihood',
                'dc:description':
                'Likelihood of borrowing (*possible*, *probable* or *clear*).',
                'datatype': {
                    'base': 'string',
                    'format': 'possible|clear|probable'
                }
            }, {
                'name': 'SourceLanguoid',
                'dc:description': 'Borrowing source of lexeme.',
            })
        args.writer.cldf['FormTable', 'form'].required = False
        args.writer.cldf['FormTable', 'value'].null = NULL_ITEMS
        args.writer.cldf['FormTable', 'value'].required = False
        args.writer.cldf['FormTable', 'value'].common_props['dc:description'] = \
            "Lexeme data. Contains a lexeme or '[No equivalent]': no suitable equivalent for a meaning exists), " \
            "'[Form not found]': no suitable equivalent was found, or '[Not reconstructable]': non-recontructable " \
            "meanings in Proto-Uralic."

        for src in self._read("Citation_codes"):
            if src["type"] == "E":
                args.writer.add_sources(
                    Source("misc",
                           src["ref_abbr"],
                           author=src["original_reference"]))

        glottocodes = {
            language["ID"]: language["Glottocode"]
            for language in self.languages
        }
        for language in self._read("Languages"):
            glottocode = glottocodes.get(language["lgid3"])
            if not glottocode:
                glottocode = self.glottolog.glottocode_by_iso.get(
                    language["ISO-639-3"])
            args.writer.add_language(
                ID=language["lgid3"],
                Name=language["language"],
                Glottocode=glottocode,
                Description=language["Description"],
                Subgroup=language["Subgroup"],
                ISO639P3code=language["ISO-639-3"],
            )

        inlists = {r['mng_item']: r for r in self._read('Meaning_lists')}
        attrs = [
            k for k in attr.fields_dict(UralexConcept).keys() if k != 'LJ_rank'
        ]
        for concept in self.concepts:
            if concept['ID'] in inlists:
                memberships = {
                    k.replace('-', '_'): v == '1'
                    for k, v in inlists[concept['ID']].items()
                    if k.replace('-', '_') in attrs
                }
                concept.update(memberships)
            args.writer.add_concept(**concept)

        for (cid, cogid), ll in itertools.groupby(
                sorted(self._read("Data"),
                       key=lambda i: (i["mng_item"], i["cogn_set"])),
                lambda i: (i["mng_item"], i["cogn_set"]),
        ):
            for language in ll:
                if language['item'] in NULL_ITEMS:
                    language['etym_notes'] = language['etym_notes'] + language[
                        'item']
                kw = dict(
                    Value=language["item"],
                    Language_ID=language["lgid3"],
                    Parameter_ID=cid,
                    Comment=language["general_notes"],
                    Source=[
                        slug(rid, lowercase=False) for rid in split_text(
                            language["ref_abbr"], ",", strip=True)
                    ],
                )
                kw.update({
                    k: language[k]
                    for k in [
                        "item_UPA",
                        "item_IPA",
                        "form_set",
                        "etym_notes",
                        "glossing_notes",
                    ]
                })

                for i, lex in enumerate(args.writer.add_lexemes(**kw)):
                    lex['Form'] = None if lex['Form'] in NULL_ITEMS else lex[
                        'Form']
                    if cogid not in ["?", "0"]:
                        args.writer.add_cognate(lexeme=lex,
                                                Cognateset_ID="{0}-{1}".format(
                                                    cid, cogid))
                    if language['borr_qual']:
                        c = ': borrowed to Pre-Permic'
                        ref = language['ref_borr']
                        if c in ref:
                            comment = c[1:].strip()
                            ref = ref.replace(c, '')
                        else:
                            comment = None
                        args.writer.objects['BorrowingTable'].append(
                            dict(
                                ID=lex['ID'],
                                Target_Form_ID=lex['ID'],
                                SourceLanguoid=language['borr_source'],
                                Likelihood=language['borr_qual'],
                                Source=bibkeys(ref),
                                Comment=comment,
                            ))
Example #25
def main(scripts, dev, glr):
    cldf_dir = Path('cldf')
    bib = parse_string(read_text(cldf_dir / 'sources.bib'), bib_format='bibtex')
    for _, e in bib.entries.items():
        for field in e.fields:
            e.fields[field] = e.fields[field].replace('\\', '')
    write_text(cldf_dir / 'sources.bib', bib.lower().to_string('bibtex'))

    glottolog = Glottolog(glr)

    ds = StructureDataset.in_dir(cldf_dir)

    def describe_repos(r, org, name=None):
        return OrderedDict([
            ('dc:title', '{0}/{1}'.format(org, name or r.name)),
            ('dc:description', git_describe(r))])

    ds.tablegroup.common_props['prov:wasDerivedFrom'] = [
        describe_repos(dev, 'phoible'),
        describe_repos(scripts, 'bambooforest'),
        describe_repos(glottolog.repos, 'clld'),
    ]
    ds.tablegroup.common_props['prov:wasGeneratedBy'] = describe_repos(
        Path(__file__).parent, 'cldf-datasets', name='phoible')

    ds.add_columns(
        'ValueTable',
        {'name': 'Marginal', 'datatype': 'boolean'},
        {'name': 'Allophones', 'separator': ' '},
        'Contribution_ID')
    features = ["tone","stress","syllabic","short","long","consonantal","sonorant","continuant","delayedRelease","approximant","tap","trill","nasal","lateral","labial","round","labiodental","coronal","anterior","distributed","strident","dorsal","high","low","front","back","tense","retractedTongueRoot","advancedTongueRoot","periodicGlottalSource","epilaryngealSource","spreadGlottis","constrictedGlottis","fortis","raisedLarynxEjective","loweredLarynxImplosive","click"]
    ds.add_component('ParameterTable', 'SegmentClass', *features)
    ds.add_component('LanguageTable', 'Family_Glottocode', 'Family_Name')
    table = ds.add_table(
        'contributions.csv', 
        'ID', 
        'Name', 
        'Contributor_ID', 
        {'name': 'Source', 'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source', 'separator': ';'},
        'URL',
        {'name': 'count_phonemes', 'required': True, 'datatype': {'base': 'integer', 'minimum': 0}},
        {'name': 'count_consonants', 'required': True, 'datatype': {'base': 'integer', 'minimum': 0}},
        {'name': 'count_vowels', 'required': True, 'datatype': {'base': 'integer', 'minimum': 0}},
        {'name': 'count_tones', 'datatype': {'base': 'integer', 'minimum': 0}, 'null': 'NA'},
    )
    table.tableSchema.primaryKey = ['ID']
    table.tableSchema.foreignKeys.append(ForeignKey.fromdict(dict(
        columnReference='Contributor_ID',
        reference=dict(resource='contributors.csv', columnReference='ID'))))
    table.common_props['dc:conformsTo'] = None
    table = ds.add_table(
        'contributors.csv',
        'ID',
        'Name',
        'Description',
        'Readme',
        'Contents',
        {'name': 'Source', 'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source', 'separator': ';'},
        'URL',
        {'name': 'with_tones', 'datatype': {'base': 'boolean', 'format': '1|0'}},
    )
    table.tableSchema.primaryKey = ['ID']
    table.common_props['dc:conformsTo'] = None

    def read(what):
        return reader(scripts / 'to_cldf' / 'cldf' / what, namedtuples=True)

    languoids = {l.id: l for l in glottolog.languoids()}

    values, segments, languages, inventories, sources = [], [], OrderedDict(), OrderedDict(), []
    with_tones = {}
    for contrib in read('contributors.csv'):
        sources.append(dict(
            ID=contrib.Name,
            Name=contrib.Contributor,
            Description=contrib.Description,
            Readme=desc(dev, contrib.Name),
            Contents=contrib.Contents,
            Source=[c.strip().lower() for c in contrib.Citation.split(';')],
            URL=contrib.SourceURL if contrib.SourceURL != 'NA' else '',
            with_tones=contrib.with_tones == '1',
        ))
        with_tones[contrib.Name] = contrib.with_tones == '1'

    pid_map = {}
    for row in read('parameters.csv'):
        pid = md5(row.Description.encode('utf8')).hexdigest().upper()
        pid_map[row.ID] = (pid, row.SegmentClass)
        segments.append(dict(
            ID=pid,
            Name=row.Name,
            Description=row.Description,
            SegmentClass=row.SegmentClass,
            **{f: getattr(row, f) for f in features}
        ))

    src = {}
    for row in read('contributions.csv'):
        src[row.ID] = row.References.split(';') if row.References != 'no source given' else []
        src[row.ID] = [sid.lower() for sid in src[row.ID]]
        inventories[row.ID] = dict(
            ID=row.ID, 
            Name=row.Name, 
            Contributor_ID=row.Contributor_ID.upper(), 
            URL=row.URI if row.URI != 'NA' else '',
            Source=src[row.ID],
            count_phonemes=0,
            count_consonants=0,
            count_vowels=0,
            count_tones=0,
        )

    uniq, counts = set(), Counter()
    for row in read('values.csv'):
        pk = (row.Language_ID, row.Parameter_ID, row.Contribution_ID)
        if pk in uniq:
            print('skipping duplicate phoneme {0}'.format(pk))
            continue
        uniq.add(pk)
        lid = row.Language_ID if row.Language_ID in languoids else slug(inventories[row.Contribution_ID]['Name'])
        if lid not in languages:
            #
            # FIXME: Language_ID == 'NA' for three inventories! This must be mapped!
            #
            lang = languoids.get(lid)
            fam = lang.lineage[0] if lang and lang.lineage else None
            languages[lid] = dict(
                ID=lid,
                Name=lang.name if lang else None,
                Glottocode=lang.id if lang else None,
                ISO639P3code=row.ISO639P3code if row.ISO639P3code != 'NA' else None,
                Macroarea=lang.macroareas[0].value if lang and lang.macroareas else None,
                Latitude=lang.latitude if lang else None,
                Longitude=lang.longitude if lang else None,
                Family_Glottocode=fam[1] if fam else None,
                Family_Name=fam[0] if fam else None,
            )
        pid, sc = pid_map[row.Parameter_ID]
        counts.update([(row.Contribution_ID, sc)])
        values.append(dict(
            ID=row.ID,
            Language_ID=lid,
            Parameter_ID=pid,
            Contribution_ID=row.Contribution_ID,
            Value=row.Name,
            Marginal=None if row.Marginal == 'NA' else eval(row.Marginal.lower().capitalize()),  # FALSE|TRUE|NA
            Allophones=row.Allophones.split() if row.Allophones != 'NA' else [],
            Source=src[row.Contribution_ID],
        ))
    for key, count in counts.items():
        inventories[key[0]]['count_{0}s'.format(key[1])] = count
        inventories[key[0]]['count_phonemes'] += count

    for inv in inventories.values():
        if not with_tones[inv['Contributor_ID']]:
            assert inv['count_tones'] == 0
            inv['count_tones'] = 'NA'

    ds.write(**{
        'ValueTable': values,
        'LanguageTable': languages.values(),
        'ParameterTable': segments,
        'contributions.csv': inventories.values(),
        'contributors.csv': sources
    })
    ds.validate(logging.getLogger(__name__))
Example #26
    def read(self, fname):
        self._add_entries(database.parse_string(read_text(fname), bib_format='bibtex'))
Example #27
def _edit(args, config):
    db = BibDB(config)

    search_results = db.search(args.terms)
    if not search_results:
        logging.error("Search returned no results. Aborting.")
        sys.exit(1)

    with tempfile.NamedTemporaryFile("w") as temp_file:
        temp_fname = temp_file.name
        with open(temp_fname, "wt") as fp:
            original_entries_text = format_search_results(
                search_results, bibtex_output=True, use_original_key=False)
            fp.write(original_entries_text)
            original_entries = pybtex.parse_string(
                original_entries_text, bib_format="bibtex").entries.values()
        subprocess.run([config.editor, temp_file.name])

        with open(temp_fname, "rt"):
            new_entries = pybtex.parse_file(
                temp_fname, bib_format="bibtex").entries.values()
    deleted_entries = []
    edited_entries = []
    seen_original_keys = set()
    changelog = []
    for new in new_entries:
        original_key = new.fields["original_key"]
        seen_original_keys.add(original_key)
        old = find_entry(original_entries, "original_key", original_key)
        added, deleted, edited = compare_entries(old, new)
        if added or deleted or edited:
            edited_entries.append(new)
            # Report changes
            changelog.append("\nEntry %s" % old.key)
            for field in added:
                changelog.append('\tAdded %s with value "%s"' %
                                 (field, new.fields[field]))
            for field in deleted:
                changelog.append("\tDeleted %s" % field)
            for field in edited:
                changelog.append(
                    '\tChanged %s to "%s"' %
                    (field, new.key if field == "key" else new.fields[field]))
    for old in original_entries:
        if not old.fields["original_key"] in seen_original_keys:
            deleted_entries.append(old)
    if deleted_entries:
        changelog.append("\nDeleted entries:")
        for e in deleted_entries:
            changelog.append("\t%s" % e.key)

    if not edited_entries and not deleted_entries:
        logging.warning("There were not changes in the entries.")
        sys.exit(0)

    print("Summary of changes:")
    print("\n".join(changelog) + "\n")

    confirmation = prompt("Do you want to perform these changes?", "YES", "no")
    if confirmation == "YES":
        for e in edited_entries:
            db.update(e)
        for e in deleted_entries:
            db.remove(e.key)
        db.save()
        print("Updated database.")
    else:
        print("Aborted.")
Example #28
    def read_string(self, text):
        self._add_entries(database.parse_string(text, bib_format='bibtex'))
Example #29
def user_cv(op=None):
    """Create user CV using the CV templage filled with the ORCID profile data."""
    user = User.get(current_user.id)
    if not user.orcid:
        flash("You haven't linked your account with ORCID.", "warning")
        return redirect(request.referrer or url_for("index"))

    record = cache.get(user.orcid)
    if not record:
        token = OrcidToken.select(OrcidToken.access_token).where(
            OrcidToken.user_id == user.id, OrcidToken.org_id == user.organisation_id,
            OrcidToken.scopes.contains("read-limited")).first()
        if token is None:
            flash("You haven't granted your organisation necessary access to your profile..",
                  "danger")
            return redirect(url_for("link"))
        api = MemberAPIV3(user=user, access_token=token.access_token)
        try:
            record = api.get_record()
            works = [w for g in record.get("activities-summary", "works", "group") for w in g.get("work-summary")]
            combine_detail_works_summary = []
            for w in works:
                work_api_response = api.view_workv3(user.orcid, w.get("put-code"), _preload_content=False)
                work_data = json.loads(work_api_response.data, object_pairs_hook=NestedDict)
                combine_detail_works_summary.append(work_data)

            record['detail-works-summary'] = combine_detail_works_summary
            cache.set(user.orcid, record)
        except Exception as ex:
            flash(f"Failed to retrieve the profile: {ex}", "danger")
            return redirect(url_for("link"))

    if op is None:
        return render_template("user_cv.html")
    else:

        work_type_journal = []
        work_type_books = []
        work_type_book_chapter = []
        work_type_conference = []
        work_type_patent = []
        work_type_other = []
        educations = []
        employments = []
        first_name = None
        second_names = None
        family_name = None
        countries = []
        emails = []
        researcher_urls = []

        if record:
            works = record.get("detail-works-summary")
            for w in works:
                publications_dissemination = ""
                # Case 1 of getting Research publications and dissemination: Check for DOI and crossref the value
                external_id_url = [item.get('external-id-url', 'value') for item in
                                   w.get("external-ids").get("external-id") if
                                   item.get('external-id-type') and item.get(
                                       'external-id-type').lower() == 'doi'] if w.get("external-ids") and w.get(
                    "external-ids").get("external-id") else []

                for ex in external_id_url:
                    try:
                        resp = requests.get(ex, headers={"Accept": "text/bibliography; style=apa"})
                        if resp.status_code == 200:
                            publications_dissemination = resp.text.encode('latin-1').decode('utf-8').strip()
                            break
                    except requests.exceptions.RequestException:
                        continue
                    except Exception:
                        continue

                # Case 2 of getting Research publications and dissemination: Check citation types
                if not publications_dissemination and w.get("citation") and w.get(
                        "citation", "citation-type") and w.get("citation", "citation-value"):

                    citation_type = w.get("citation", "citation-type").lower()
                    citation_value = w.get("citation", "citation-value")

                    # Check if the citation is bibtex and try to parse it
                    if citation_type == "bibtex":
                        try:
                            data = parse_string(citation_value, 'bibtex')
                            apa = find_plugin('pybtex.style.formatting', 'apa')()
                            formatted_bib = apa.format_bibliography(data)
                            publications_dissemination = " ".join(
                                entry.text.render_as('text') for entry in formatted_bib)
                        except Exception:
                            # pass any exception and move forward to check for other criteria.
                            pass
                    # Case 3: If the citation is other than bibtex and ris, i.e. non standard citation then reproduce.
                    elif citation_type != "ris":
                        publications_dissemination = citation_value

                # Case 4 of getting Research publications and dissemination: Simple/Parse of work elements
                if not publications_dissemination:
                    publications_dissemination = w

                if w.get("type") in ['journal-article', 'journal-issue']:
                    work_type_journal.append(publications_dissemination)
                elif w.get("type") in ['book', 'book-review']:
                    work_type_books.append(publications_dissemination)
                elif w.get("type") in ['book-chapter', 'edited-book']:
                    work_type_book_chapter.append(publications_dissemination)
                elif w.get("type") in ['conference-paper', 'conference-abstract', 'conference-poster']:
                    work_type_conference.append(publications_dissemination)
                elif w.get("type") in ['patent']:
                    work_type_patent.append(publications_dissemination)
                else:
                    work_type_other.append(publications_dissemination)

            educations = [s.get("education-summary") for ag in
                          record.get("activities-summary", "educations", "affiliation-group", default=[]) for s in
                          ag.get("summaries", default=[])]
            employments = [s.get("employment-summary") for ag in
                           record.get("activities-summary", "employments", "affiliation-group", default=[]) for s in
                           ag.get("summaries", default=[])]

            first_name, *second_names = re.split("[,; \t]", str(
                record.get("person", "name", "given-names", "value", default=user.first_name)))

            family_name = record.get("person", "name", "family-name", "value", default=user.last_name)

            countries = [a.get("country", "value") for a in record.get("person", "addresses", "address")]

            emails = [e.get("email") for e in record.get("person", "emails", "email")] if record.get(
                "person", "emails", "email") else [user.email]

            researcher_urls = [r.get("url", "value") for r in record.get("person", "researcher-urls", "researcher-url")]

        person_data = dict(first_name=first_name, second_names=second_names, family_name=family_name, address=countries,
                           emails=emails, researcher_urls=researcher_urls)
        resp = make_response(
            render_template(
                "CV.html",
                user=user,
                now=datetime.now(),
                record=record,
                person_data=person_data,
                work_type_books=work_type_books,
                work_type_book_chapter=work_type_book_chapter,
                work_type_journal=work_type_journal,
                work_type_conference=work_type_conference,
                work_type_patent=work_type_patent,
                work_type_other=work_type_other,
                educations=educations,
                employments=employments))
        resp.headers["Cache-Control"] = "private, max-age=60"
        if op == "download" or "download" in request.args:
            meta_xml_data = render_template("CV/meta.xml", user=user, now=datetime.now())
            content_xml_data = render_template("CV/content.xml", user=user, now=datetime.now(), record=record,
                                               person_data=person_data, work_type_books=work_type_books,
                                               work_type_book_chapter=work_type_book_chapter,
                                               work_type_journal=work_type_journal,
                                               work_type_conference=work_type_conference,
                                               work_type_patent=work_type_patent, work_type_other=work_type_other,
                                               educations=educations, employments=employments)

            response = Response(cv_generator(meta_xml_data, content_xml_data),
                                mimetype='application/vnd.oasis.opendocument.text')
            response.headers["Content-Type"] = "application/vnd.oasis.opendocument.text"
            response.headers[
                'Content-Disposition'] = f"attachment; filename={current_user.name.replace(' ', '_')}_CV.odt"

            return response

    return resp
Example #30
    def cmd_makecldf(self, args):
        self.create_schema(args.writer.cldf)

        pk2id = collections.defaultdict(dict)

        skip_source = [
            'Lous-1969',  # -> Loos-1969
            'Payne-1990',  # -> Payne-1990a
        ]
        updated_source_keys = {
            'Anonymous-nd': 'North-East-Frontier-Agency-1963',
        }
        updated_source_names = {
            'North-East-Frontier-Agency-1963':
            'North East Frontier Agency 1963',
        }
        sources = parse_string(
            self.raw_dir.joinpath('source.bib').read_text(encoding='utf8'),
            'bibtex')
        gbs_lg_refs = collections.defaultdict(set)
        src_names = {}
        for s in self.read('source', pkmap=pk2id).values():
            if s['id'] in skip_source:
                continue
            s['id'] = updated_source_keys.get(s['id'], s['id'])
            src_names[s['id']] = updated_source_names.get(s['id'], s['name'])
            try:
                jsd = json.loads(s['jsondata'])
                if 'wals_code' in jsd:
                    [gbs_lg_refs[c].add(s['id']) for c in jsd['wals_code']]
                gbs = jsd['gbs']
                if gbs['id'].strip():
                    sef = sources.entries[s['id']].fields
                    sef['google_book_search_id'] = gbs['id'].strip()
                    sef['google_book_viewability'] = gbs['accessInfo'][
                        'viewability'].strip()
            except (json.decoder.JSONDecodeError, KeyError):
                continue

        chapters = self.read('contribution', extended='chapter', pkmap=pk2id)

        refs = []
        crefs = collections.defaultdict(list)
        for row in self.raw_dir.read_csv('valuesetreference.csv', dicts=True):
            if row['source_pk']:
                sid = pk2id['source'][row['source_pk']]
                if sid not in skip_source:
                    refs.append(
                        (row['valueset_pk'], updated_source_keys.get(sid, sid),
                         row['description']))
        srcids = set(r[1] for r in refs)
        for row in self.raw_dir.read_csv('contributionreference.csv',
                                         dicts=True):
            sid = pk2id['source'][row['source_pk']]
            if sid not in crefs[pk2id['contribution'][row['contribution_pk']]]:
                crefs[pk2id['contribution'][row['contribution_pk']]].append(
                    sid)
                srcids.add(sid)
        unused_srcids = []
        for id_, e in sources.entries.items():
            if id_ in skip_source:
                continue
            if id_ in srcids:
                if id_ in src_names:
                    e.fields['wals_ref_name'] = src_names[id_]
                args.writer.cldf.add_sources(Source.from_entry(id_, e))
            else:
                unused_srcids.append(id_)
            # add language references out of bibtex tag 'wals_code'
            # to ensure that nothing was missed in raw/languagesource.csv (37 cases)
            if 'wals_code' in e.fields:
                [
                    gbs_lg_refs[c].add(id_)
                    for c in e.fields['wals_code'].split('; ')
                ]

        for id_, e in sources.entries.items():
            if id_ in skip_source:
                continue
            if id_ in unused_srcids:
                if id_ in src_names:
                    e.fields['wals_ref_name'] = src_names[id_]
                args.writer.cldf.add_sources(Source.from_entry(id_, e))

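        # Editor rank comes from the 'ord' column; non-editors default to 0 when contributors are written below.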
        editors = {
            e['contributor_pk']: int(e['ord'])
            for e in self.read('editor', key=lambda r: int(r['ord'])).values()
        }

        contributors = self.read('contributor',
                                 pkmap=pk2id,
                                 key=lambda r: r['id'])
        for row in contributors.values():
            args.writer.objects['contributors.csv'].append({
                'ID':
                row['id'],
                'Name':
                row['name'],
                'Url':
                row['url'],
                'Editor_Ord':
                editors[row['pk']] if row['pk'] in editors else 0,
            })

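        # cc maps a chapter ID to its (primary flag, contributor ID) pairs, primary authors ordered first.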
        cc = {
            chapters[fid]['id']:
            [(r['primary'], pk2id['contributor'][r['contributor_pk']])
             for r in rows]
            for fid, rows in itertools.groupby(
                self.read('contributioncontributor',
                          key=lambda d: (d['contribution_pk'], d['primary'] ==
                                         'f', int(d['ord']))).values(),
                lambda r: r['contribution_pk'])
        }

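        # Areas, features (ParameterTable) and their values (CodeTable) come from the 'area', 'parameter' and 'domainelement' tables.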
        areas = self.read('area')
        for row in areas.values():
            args.writer.objects['areas.csv'].append({
                'ID':
                row['id'],
                'Name':
                row['name'],
                'dbpedia_url':
                row['dbpedia_url'],
            })

        for row in self.read('parameter',
                             extended='feature',
                             pkmap=pk2id,
                             key=lambda d: fid_key(d['id'])).values():
            args.writer.objects['ParameterTable'].append({
                'ID':
                row['id'],
                'Name':
                row['name'],
                'Chapter_ID':
                chapters[row['contribution_pk']]['id'],
            })

        for row in self.read(
                'domainelement',
                pkmap=pk2id,
                key=lambda d:
            (fid_key(d['id'].split('-')[0]), int(d['number']))).values():
            args.writer.objects['CodeTable'].append({
                'ID':
                row['id'],
                'Parameter_ID':
                pk2id['parameter'][row['parameter_pk']],
                'Name':
                row['name'],
                'Description':
                row['description'],
                'Number':
                int(row['number']),
                'icon':
                json.loads(row['jsondata'])['icon'],
            })

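        # Alternative identifiers (ISO 639-3 codes, Glottocodes, alternative names) are collected per language.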
        identifier = self.read('identifier')
        lang2id = collections.defaultdict(
            lambda: collections.defaultdict(list))
        for row in self.read('languageidentifier').values():
            id_ = identifier[row['identifier_pk']]
            lang2id[row['language_pk']][id_['type']].append(
                (id_['name'], id_['description']))

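        # Genealogy (genus/family), country and source links are read next and attached to languages below.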
        families = self.read('family', pkmap=pk2id)
        genera = self.read('genus', pkmap=pk2id)
        countries = self.read('country', pkmap=pk2id)
        lang2country = collections.defaultdict(list)
        for c in self.read('countrylanguage').values():
            lang2country[c['language_pk']].append(
                pk2id['country'][c['country_pk']])
        lrefs = collections.defaultdict(list)
        for c in self.read('languagesource').values():
            sid = pk2id['source'][c['source_pk']]
            sid = updated_source_keys.get(sid, sid)
            if sid not in lrefs[c['language_pk']]:
                lrefs[c['language_pk']].append(sid)

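        # Assemble LanguageTable rows; a language is treated as an isolate when it shares its name with its genus and family.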
        for row in self.read('language', extended='walslanguage',
                             pkmap=pk2id).values():
            id = row['id']
            genus = genera[row['genus_pk']]
            genus_icon = genus['icon'] if genus else ''
            family = families[genus['family_pk']]
            if row['name'] == genus['name'] == family['name']:
                # an isolate!
                genus = family = None
            iso_codes = set(i[0]
                            for i in lang2id[row['pk']].get('iso639-3', []))
            glottocodes = [
                i[0] for i in lang2id[row['pk']].get('glottolog', [])
            ]
            srcs = lrefs[row['pk']]
            if id in gbs_lg_refs:
                for s in gbs_lg_refs[id]:
                    if s not in srcs:
                        srcs.append(s)
            args.writer.objects['LanguageTable'].append({
                'ID':
                id,
                'Name':
                row['name'].strip(),
                'ISO639P3code':
                list(iso_codes)[0] if len(iso_codes) == 1 else None,
                'Glottocode':
                glottocodes[0] if len(glottocodes) == 1 else None,
                'ISO_codes':
                sorted(iso_codes),
                'Latitude':
                row['latitude'],
                'Longitude':
                row['longitude'],
                'Macroarea':
                row['macroarea'],
                'Genus':
                genus['name'] if genus else None,
                'GenusIcon':
                genus_icon,
                'Subfamily':
                genus['subfamily'] if genus else None,
                'Family':
                family['name'] if family else None,
                'Samples_100':
                row['samples_100'] == 't',
                'Samples_200':
                row['samples_200'] == 't',
                'Country_ID':
                lang2country[row['pk']],
                'Source':
                sorted(srcs),
            })
        args.writer.objects['LanguageTable'].sort(key=lambda d: d['ID'])

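        # Group value-set references by valueset primary key, normalising brackets and semicolons in the page info.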
        refs = {
            dpid: [
                str(
                    Reference(
                        source=str(r[1]),
                        desc=r[2].replace('[', ')').replace(']', ')').replace(
                            ';', '.').strip() if r[2] else None))
                for r in refs_
            ]
            for dpid, refs_ in itertools.groupby(refs, lambda r: r[0])
        }

        vsdict = self.read('valueset', pkmap=pk2id)

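        # Only sentences with a translation, analyzed words and glosses become ExampleTable rows (IGT).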
        examples = self.read('sentence', pkmap=pk2id)
        igts = {}
        for ex in examples.values():
            if all(ex[k] for k in ['description', 'analyzed', 'gloss']):
                a, g = ex['analyzed'].split(), ex['gloss'].split()
                if len(a) != len(g):
                    a, g = [ex['analyzed']], [ex['gloss']]
                igts[ex['pk']] = ex['id']
                args.writer.objects['ExampleTable'].append({
                    'ID':
                    ex['id'],
                    'Language_ID':
                    pk2id['language'][ex['language_pk']],
                    'Primary_Text':
                    ex['name'],
                    'Translated_Text':
                    ex['description'],
                    'Analyzed_Word':
                    a,
                    'Gloss':
                    g,
                })
        example_by_value = {
            vpk: [r['sentence_pk'] for r in rows]
            for vpk, rows in itertools.groupby(
                self.read('valuesentence', key=lambda d: d['value_pk']).values(
                ), lambda d: d['value_pk'])
        }

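        # Emit ValueTable rows; a single linked example without IGT fields becomes a comment instead of an example reference.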
        for row in self.read('value').values():
            vs = vsdict[row['valueset_pk']]
            comment = None
            ex = [examples[spk] for spk in example_by_value.get(row['pk'], [])]
            if len(ex) == 1 and not any(
                    ex[0][k] for k in ['description', 'analyzed', 'gloss']):
                comment = re.sub(r'[\r\n]', '', ex[0]['xhtml'])
                del example_by_value[row['pk']]
            args.writer.objects['ValueTable'].append({
                'ID':
                vs['id'],
                'Language_ID':
                pk2id['language'][vs['language_pk']],
                'Parameter_ID':
                pk2id['parameter'][vs['parameter_pk']],
                'Value':
                pk2id['domainelement'][row['domainelement_pk']].split('-')[1],
                'Code_ID':
                pk2id['domainelement'][row['domainelement_pk']],
                'Comment':
                comment,
                'Source':
                refs.get(vs['pk'], []),
                'Example_ID':
                sorted(igts[epk]
                       for epk in example_by_value.get(row['pk'], [])
                       if epk in igts),
            })

        args.writer.objects['ValueTable'].sort(
            key=lambda d: (d['Language_ID'], fid_key(d['Parameter_ID'])))

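        # Alternative language names are grouped by (provider, name) and written to language_names.csv.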
        altnames = []
        for lpk in lang2id:
            for type in lang2id[lpk]:
                if type == 'name':
                    for name, prov in lang2id[lpk][type]:
                        altnames.append((prov, name, pk2id['language'][lpk]))

        lnid = 0
        for (type, name), rows in itertools.groupby(sorted(altnames), lambda t:
                                                    (t[0], t[1])):
            lnid += 1
            args.writer.objects['language_names.csv'].append({
                'ID':
                str(lnid),
                'Language_ID': [r[2] for r in rows],
                'Name':
                name.strip(),
                'Provider':
                type,
            })

        for c in sorted(countries.values(), key=lambda x: x['id']):
            args.writer.objects['countries.csv'].append({
                'ID': c['id'],
                'Name': c['name'],
            })

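        # Chapter descriptions: image links are replaced with data URLs and one HTML file per chapter is written to cldf/docs.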
        desc_dir = self.raw_dir / 'descriptions'
        src_pattern = re.compile(
            'src="https?://wals.info/static/descriptions/(?P<sid>s?[0-9]+)/images/(?P<fname>[^"]+)"'
        )

        def repl(m):
            p = desc_dir.joinpath(m.group('sid'), 'images', m.group('fname'))
            if p.exists():
                return 'src="{0}"'.format(data_url(p))
            return m.string[m.start():m.end()]

        descs = {}
        docs_dir = self.cldf_dir / 'docs'
        docs_dir.mkdir(exist_ok=True)
        for d in desc_dir.iterdir():
            if d.is_dir():
                descs[d.stem] = src_pattern.sub(
                    repl,
                    d.joinpath('body.xhtml').read_text(encoding='utf8'))

        for c in sorted(chapters.values(), key=lambda x: int(x['sortkey'])):
            if c['id'] in descs:
                fname = docs_dir / 'chapter_{}.html'.format(c['id'])
                with io.open(fname, 'w', encoding='utf-8') as f:
                    f.write(descs[c['id']])
            cid, wcid = [], []
            if c['id'] in cc:
                cid = [co[1] for co in cc[c['id']] if co[0] == 't']
                wcid = [co[1] for co in cc[c['id']] if co[0] == 'f']
            args.writer.objects['chapters.csv'].append({
                'ID':
                c['id'],
                'Name':
                c['name'],
                'wp_slug':
                c['wp_slug'],
                'Number':
                c['sortkey'],
                'Area_ID':
                areas[c['area_pk']]['id'] if c['area_pk'] in areas else '',
                'Source':
                crefs.get(c['id'], []),
                'Contributor_ID':
                cid,
                'With_Contributor_ID':
                wcid,
            })
Example #31
0
    def read_bib(self, fname='sources.bib'):
        bib = database.parse_string(self.read(fname), bib_format='bibtex')
        return [Source.from_entry(k, e) for k, e in bib.entries.items()]

    def cmd_makecldf(self, args):
        """
        Convert the raw data to a CLDF dataset.

        >>> args.writer.objects['LanguageTable'].append(...)
        """

        # CLDF schema

        args.writer.cldf.add_component('ParameterTable')
        args.writer.cldf.add_component('LanguageTable', 'SubBranch', 'Family')
        args.writer.cldf.add_component('CodeTable')

        args.writer.cldf.add_table(
            'constructions.csv', 'http://cldf.clld.org/v1.0/terms.rdf#id',
            'http://cldf.clld.org/v1.0/terms.rdf#name',
            'http://cldf.clld.org/v1.0/terms.rdf#description',
            'http://cldf.clld.org/v1.0/terms.rdf#languageReference',
            'http://cldf.clld.org/v1.0/terms.rdf#source')
        args.writer.cldf.add_table(
            'cvalues.csv', 'http://cldf.clld.org/v1.0/terms.rdf#id',
            'Construction_ID',
            'http://cldf.clld.org/v1.0/terms.rdf#parameterReference',
            'http://cldf.clld.org/v1.0/terms.rdf#value',
            'http://cldf.clld.org/v1.0/terms.rdf#codeReference',
            'http://cldf.clld.org/v1.0/terms.rdf#comment')

        args.writer.cldf.add_foreign_key('cvalues.csv', 'Construction_ID',
                                         'constructions.csv', 'ID')

        # Read data

        data = self.raw_dir.read_csv('Data_to_be_published.csv', dicts=True)
        data = normalise_table(data)
        parameters = self.etc_dir.read_csv('parameters.csv', dicts=True)
        source_map = {
            citation.strip(): key.strip()
            for key, citation in self.etc_dir.read_csv(
                'citations-to-bibtex.csv')
        }
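        # sources.bib provides the BibTeX entries for the keys that source_map maps free citations to.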
        sources = parse_string(self.raw_dir.read('sources.bib'), 'bibtex')

        # Process data

        lang_info = {
            row['Glottolog.Name']: {
                'ID': row['Glottolog.Name'],
                'Name': title_case(row.get('Language', '')),
                'SubBranch': title_case(row.get('Sub-branch', '')),
                'Family': title_case(row.get('Family', '')),
            }
            for row in data
        }
        languages = OrderedDict(
            (l['ID'], l) for l in make_language_table(lang_info))

        code_dict = OrderedDict()
        for column, param_id in PARAMETER_COLUMNS:
            if param_id == 'ap-marker':
                continue
            code_dict[param_id] = sorted(
                {unify_na(row[column])
                 for row in data if row.get(column)})
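        # Codes get IDs of the form '<parameter>-c<N>', numbering the distinct values of each parameter in sorted order.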
        codes = OrderedDict((
            (param_id, name),
            {
                'ID': '{}-c{}'.format(param_id, index + 1),
                'Parameter_ID': param_id,
                'Name': name,
            },
        ) for param_id, code_names in code_dict.items()
                            for index, name in enumerate(code_names))

        constructions = []
        cvalues = []
        ords = defaultdict(int)
        for index, row in enumerate(data):
            lang_id = row['Glottolog.Name']
            lang_name = languages[row['Glottolog.Name']]['Name']
            ords[lang_id] += 1

            constr_ord = ords[lang_id]
            constr_id = '{}-ap{}'.format(lang_id, constr_ord)

            def known_citation(cite):
                if cite in source_map:
                    return True
                else:
                    print('row {}: unknown citation:'.format(index + 2),
                          cite,
                          file=sys.stderr)
                    return False

            citations = [
                source_map[citation.strip()]
                for citation in row.get('Source', '').splitlines()
                if known_citation(citation)
            ]

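            # One antipassive construction per data row; cvalues record its value for each parameter column present.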
            constructions.append({
                'ID':
                constr_id,
                'Name':
                '{} Antipassive Construction {}'.format(lang_name, constr_ord),
                'Language_ID':
                lang_id,
                'Source':
                citations
            })

            cvalues.extend({
                'ID':
                '{}-{}'.format(constr_id, param_id),
                'Construction_ID':
                constr_id,
                'Parameter_ID':
                param_id,
                'Value':
                unify_na(row[column]),
                'Code_ID':
                codes.get((param_id, unify_na(row[column])), {}).get('ID'),
            } for column, param_id in PARAMETER_COLUMNS if row.get(column))

        # Output data

        args.writer.cldf.add_sources(sources)
        args.writer.objects['LanguageTable'] = languages.values()
        args.writer.objects['ParameterTable'] = parameters
        args.writer.objects['CodeTable'] = codes.values()
        args.writer.objects['ValueTable'] = []
        args.writer.objects['constructions.csv'] = constructions
        args.writer.objects['cvalues.csv'] = cvalues
Example #33
0
parser.add_argument(
    "-b",
    "--borr_ref",
    dest="borr_ref",
    help="Use borrowing reference file instead of citations file.",
    default=False,
    action='store_true')

args = parser.parse_args()
if args.borr_ref:
    tsvfile = "Borrowing_references.tsv"
else:
    tsvfile = "Citation_codes.tsv"

out = []
with open(tsvfile, "r") as f:
    for row in csv.reader(f, delimiter="\t", quotechar='"'):
        if row[2] == "type":  # header
            continue
        if row[2] == "E":  # expert (not citable)
            continue

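        # BREAK_PATTERN (defined elsewhere) presumably marks field boundaries; the substitution re-inserts
        # a line break before each field, and parse_string is called only to validate the record.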
        bib = BREAK_PATTERN.sub(lambda m: ',\n {0}='.format(m.group('key')),
                                row[1])
        parse_string(bib, bib_format='bibtex')
        out.append(bib)

print('\n\n'.join(out))
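The snippet relies on a module-level BREAK_PATTERN that is not shown. Below is a minimal sketch of how such a pattern could work, assuming the TSV stores each BibTeX record on a single line; the regex and the sample record are illustrative assumptions, not the original definitions.

import re

from pybtex.database import parse_string

# Hypothetical pattern: a comma followed by a BibTeX field key and '='.
BREAK_PATTERN = re.compile(r',\s*(?P<key>[a-zA-Z_]+)\s*=')

flat = "@book{Smith2001, title={A Grammar}, author={Smith, Jane}, year={2001}}"
# Re-insert a line break before each field so the printed record is readable.
bib = BREAK_PATTERN.sub(lambda m: ',\n {0}='.format(m.group('key')), flat)
parse_string(bib, bib_format='bibtex')  # raises a pybtex error if the record is malformed
print(bib)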