def fulltext_to_single_entry(fulltext: str) -> pybtex.Entry:
    """Parses a BibTeX entry into a pybtex.Entry"""
    entry, = pybtex.parse_string(fulltext, bib_format="bibtex").entries.values()
    return entry
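A minimal usage sketch for fulltext_to_single_entry above, assuming `pybtex` in that snippet is an alias for `pybtex.database` (e.g. `from pybtex import database as pybtex`); the sample entry and its key are illustrative only.

from pybtex import database as pybtex  # assumed alias matching the snippet above

sample = """@article{smith2020,
    author  = {Smith, Jane},
    title   = {An Example Title},
    journal = {Journal of Examples},
    year    = {2020},
}"""

entry = fulltext_to_single_entry(sample)
print(entry.key)              # e.g. 'smith2020'
print(entry.fields["title"])  # e.g. 'An Example Title'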
def _add_file(fname, force_redownload, db, per_file_progress_bar):
    """ Return #added, #skipped, file_skipped """
    if fname.startswith('http'):
        if not force_redownload and db.file_has_been_downloaded(fname):
            return 0, 0, True
        try:
            new_entries = pybtex.parse_string(
                download_file(fname), bib_format="bibtex").entries
        except urllib.error.URLError as e:
            raise AddFileError("Error downloading '%s' [%s]" % (fname, str(e)))
        except pybtex.PybtexError:
            raise AddFileError("Error parsing file %s" % fname)
        db.register_file_downloaded(fname)
    else:
        new_entries = pybtex.parse_file(fname, bib_format="bibtex").entries

    added = 0
    skipped = 0
    if per_file_progress_bar:
        iterable = tqdm(
            new_entries.values(),
            ncols=80,
            bar_format="{l_bar}{bar}| [Elapsed: {elapsed} ETA: {remaining}]")
    else:
        iterable = new_entries.values()

    for entry in iterable:
        if db.add(entry):
            added += 1
        else:
            skipped += 1

    return added, skipped, False
def main():
    doi = _extract_doi(args.identifier[0])
    if doi is None:
        print(item)
    elif args.bibtex:
        result = cn.content_negotiation(doi, format="bibtex")
        bibtex = parse_string(result, "bibtex")
        try:
            # .values()/.keys() return views in Python 3, so wrap them in list() before indexing
            entry = list(bibtex.entries.values())[0]
            name = "".join(list(entry.persons.values())[0][0].last_names)
            name = name.replace("ä", "ae").replace("ö", "oe").replace("ü", "ue")
            name = unidecode(name)
            shortdoi = _short_doi(doi)[3:]
            year = entry.fields["year"]
            key = "{}_{}_{}".format(name, year, shortdoi)
            new = BibliographyData()
            new.add_entry(key, bibtex.entries[list(bibtex.entries.keys())[0]])
            print(new.to_string("bibtex"))
        except KeyError:
            print(result)
    else:
        try:
            result = cn.content_negotiation(doi, format=args.format)
            print(result)
        except requests.exceptions.HTTPError:
            print(doi)
            print()
def test_backend_html(bib, html):
    style = UnsrtStyle()
    backend = HtmlBackend()
    bib_data = parse_string(bib, 'bibtex')
    for formatted_entry in style.format_entries(bib_data.entries.values()):
        render = formatted_entry.text.render(backend)
        print(render)
        assert render.strip() == html.strip()
def push_2_db(bibtex_str, collection_name, db_con):
    """push a collection to database"""
    bib_data = parse_string(bibtex_str, 'bibtex')
    cursor = db_con.cursor()

    # drop an existing table if it has the same name
    drop_cmd = """DROP TABLE IF EXISTS {}""".format(collection_name)
    cursor.execute(drop_cmd)
    db_con.commit()

    # first create the table
    create_cmd = """CREATE TABLE {} (
        ref_tag TEXT PRIMARY KEY,
        author_list TEXT,
        journal TEXT,
        volume INTEGER,
        pages TEXT,
        year INTEGER,
        title TEXT,
        collection TEXT)
    """.format(collection_name)
    cursor.execute(create_cmd)
    db_con.commit()

    # add each entry to the table
    for entry in bib_data.entries.values():
        entry_fields = entry.lower().fields.keys()
        wanted_fields = {'journal', 'volume', 'pages', 'year', 'title'}

        # validate fields
        valid_fields = []
        for fd in entry_fields:
            if fd in wanted_fields:
                valid_fields += [fd]

        ref_tag, author_list = entry.key, make_author_list(entry)

        # construct the insert command
        # we only insert valid fields into the database in case of missing data,
        # so it takes a few lines to construct the sql command
        bracket_1, bracket_2 = "{}", "\"{}\""
        bracket_dict = {
            'journal': bracket_2,
            'volume': bracket_1,
            'pages': bracket_2,
            'year': bracket_1,
            'title': bracket_2
        }
        valid_fd_str = ','.join(valid_fields)
        bracket_str = ','.join([bracket_dict[fd] for fd in valid_fields])
        valid_fd_values = list(
            tidy_string(entry.fields[fd]) for fd in valid_fields)

        insert_cmd = """INSERT INTO {} (ref_tag, author_list, """ + valid_fd_str + \
            """, collection) VALUES """ + """("{}", "{}",""" + bracket_str + """, "{}")"""
        insert_cmd = insert_cmd.format(collection_name, ref_tag, author_list,
                                       *valid_fd_values, collection_name)

        # commit the insertion
        cursor.execute(insert_cmd)
        db_con.commit()
    return
def add(self, *entries):
    """
    Add a source, either specified by glottolog reference id, or as bibtex record.
    """
    for entry in entries:
        if isinstance(entry, string_types):
            self._add_entries(database.parse_string(entry, bib_format='bibtex'))
        else:
            self._add_entries(entry)
def add(self, *entries):
    """
    Add a source, either specified by glottolog reference id, or as bibtex record.
    """
    for entry in entries:
        if isinstance(entry, string_types):
            self._add_entries(
                database.parse_string(entry, bib_format='bibtex'))
        else:
            self._add_entries(entry)
def read_bib_entries(*locations):
    """Yield pybtex.database.Entry objects from each location in turn.

    Locations can be file names or strings containing file contents.
    """
    for loc in locations:
        if os.path.isfile(loc):
            with open(loc, encoding='latin1') as f:
                loc = f.read()
        for item in database.parse_string(loc.replace('}.', '},'),
                                          'bibtex').entries.values():
            yield item
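A short usage sketch for read_bib_entries above, assuming the module-level imports it relies on (`os` and `from pybtex import database`); the inline entry below is illustrative only.

inline = "@book{adams1992, author = {Adams, Douglas}, title = {Mostly Harmless}, year = {1992}}"

for entry in read_bib_entries(inline):
    print(entry.key, entry.fields.get("year"))  # e.g. adams1992 1992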
def aggregate_snls(snls):
    """
    Aggregates a series of SNLs into the fields for a single SNL
    """
    # Choose earliest created_at
    created_at = sorted([snl["about"]["created_at"]["string"] for snl in snls])[0]

    # Choose earliest history
    history = sorted(snls, key=lambda snl: snl["about"]["created_at"]["string"])[0]["about"]["history"]

    # Aggregate all references into one dict to remove duplicates
    refs = {}
    for snl in snls:
        try:
            entries = parse_string(snl["about"]["references"], bib_format="bibtex")
            refs.update(entries.entries)
        except Exception:
            self.logger.debug("Failed parsing bibtex: {}".format(snl["about"]["references"]))

    entries = BibliographyData(entries=refs)
    references = entries.to_string("bibtex")

    # Aggregate all remarks
    remarks = list(set([remark for snl in snls for remark in snl["about"]["remarks"]]))

    # Aggregate all projects
    projects = list(set([projects for snl in snls for projects in snl["about"]["projects"]]))

    # Aggregate all authors - converting to a single dictionary first performs duplicate checking
    authors = {entry["name"].lower(): entry["email"]
               for snl in snls for entry in snl["about"]["authors"]}
    authors = [{"name": name.title(), "email": email} for name, email in authors.items()]

    # Aggregate all the database IDs
    db_ids = defaultdict(list)
    for snl in snls:
        if len(snl["about"]["history"]) == 1 and \
                snl["about"]["history"][0]["name"] in DB_indexes:
            db_name = snl["about"]["history"][0]["name"]
            db_id_key = DB_indexes[db_name]
            db_ids[db_id_key].append(snl["about"]["history"][0]["description"].get("id", None))

    # remove Nones and empty lists
    db_ids = {k: list(filter(None, v)) for k, v in db_ids.items()
              if len(list(filter(None, v))) > 0}

    snl_fields = {
        "created_at": created_at,
        "history": history,
        "references": references,
        "remarks": remarks,
        "projects": projects,
        "authors": authors,
        "data": {"_db_ids": db_ids}
    }

    return snl_fields
def test_biblio(stagingArea):
    if "References" in ref_field:
        for reference in ref_field["References"]:
            if "bibtex" in reference:
                bibtex_str = reference["bibtex"]
                # force a key if not present, for having valid parsing
                bibtex_str = bibtex_str.replace("@Article{,", "@Article{toto,")
                biblio = None
                #try:
                biblio = parse_string(bibtex_str, "bibtex")
                #except:
                #    print("Failed to parse the bibtext string:", bibtex_str)
                if biblio is not None:
                    for key in biblio.entries:
                        print(key)
                        local_title = biblio.entries[key].fields["title"]
                        local_authors = biblio.entries[key].persons
                        if "author" in local_authors:
                            all_authors = local_authors["author"]
                            first_author_last_name = all_authors[0].last_names[0]

                        text_format_ref = format_from_string(bibtex_str, style="plain")
                        res_format_ref = ""
                        for line_format_ref in text_format_ref.split("\n"):
                            if line_format_ref.startswith("\\newblock"):
                                res_format_ref += line_format_ref.replace("\\newblock", "")
                            elif len(line_format_ref.strip()) != 0 and not line_format_ref.startswith("\\"):
                                res_format_ref += line_format_ref
                        res_format_ref = res_format_ref.strip()
                        res_format_ref = res_format_ref.replace("\\emph{", "")
                        res_format_ref = res_format_ref.replace("\\url{", "")
                        res_format_ref = res_format_ref.replace("}", "")
                        print(res_format_ref)

                        print(stagingArea.biblio_glutton_lookup(
                            raw_ref=res_format_ref,
                            title=local_title,
                            first_author_last_name=first_author_last_name))

            if "raw" in reference:
                # this can be sent to GROBID
                print(reference["raw"])
                print(stagingArea.biblio_glutton_lookup(raw_ref=reference["raw"]))
def import_citations(path_citations):
    # This function creates a list of pybtex objects. Every pybtex object is a citation.
    my_file = open(path_citations, 'r', encoding='utf-8')
    strings = my_file.read().split("@")
    bib_datas = []
    for stringa in strings:
        if stringa == "":
            continue
        stringa = '@' + stringa
        bib_data = parse_string(stringa, "bibtex")
        bib_datas.append(bib_data)
    my_file.close()
    return bib_datas
def bib2html(bibliography, id_use_order=(), exclude_fields=None):
    exclude_fields = exclude_fields or []
    if exclude_fields:
        bibliography = parse_string(bibliography.to_string('bibtex'), 'bibtex')
        for entry in bibliography.entries.values():
            for ef in exclude_fields:
                if ef in entry.fields.__dict__['_dict']:
                    del entry.fields.__dict__['_dict'][ef]
    # return old_format(bibliography)
    return new_format(bibliography, id_use_order)
def bibtex_to_dict(bibtex):
    try:
        bibdict = {}
        # ['doi', 'url', 'year', 'month', 'publisher', 'pages', 'title', 'journal', 'volume', 'number', 'booktitle', 'keywords']
        bibtex = parse_string(bibtex, bib_format="bibtex")
        # .values() returns a view in Python 3, so wrap it in list() before indexing
        entry = list(bibtex.entries.values())[0]
        for field in entry.fields.keys():
            bibdict[field] = entry.fields[field]
        for key in entry.persons.keys():
            bibdict[key] = []
            for people in entry.persons[key]:
                bibdict[key].append(str(people))
    except Exception as e:
        pass
    return bibdict
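A small usage sketch for bibtex_to_dict above (assuming `parse_string` is imported from `pybtex.database`); the sample entry is illustrative, and the keys in the returned dict depend on which fields and person roles the input contains.

sample = """@article{doe2019,
    author  = {Doe, John and Roe, Jane},
    title   = {A Sample Paper},
    journal = {Sample Journal},
    year    = {2019},
}"""

info = bibtex_to_dict(sample)
print(info.get("title"))   # e.g. 'A Sample Paper'
print(info.get("author"))  # e.g. ['Doe, John', 'Roe, Jane']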
def download_from_openreview(url, dirpath='.'):
    url = url.rstrip('\n')
    if '/pdf?' in url:
        url = url.replace('/pdf?', '/forum?')
    page_source = urllib.request.urlopen(url).read().decode('utf-8')
    xml = etree.fromstring(page_source, parser=etree.HTMLParser())
    bib = xml.xpath('//a[@class="action-bibtex-modal"]/@data-bibtex')[0]
    bib_database = database.parse_string(bib, bib_format='bibtex')
    # entries.values() is a view in Python 3, so materialise it before indexing
    first_entry = list(bib_database.entries.values())[0]
    author_lastname = first_entry.persons['author'][0].last()[0]
    year = first_entry.fields['year'].strip()
    title = first_entry.fields['title'].strip()
    out_name = '[{}+{}] {}.pdf'.format(author_lastname, year,
                                       title).replace('{', '').replace('}', '')
    path = os.path.join(dirpath, out_name)
    pdf_url = url.replace('/forum?', '/pdf?')
    logger.info('Download "{}" from "{}"'.format(title, pdf_url))
    urlretrieve(pdf_url, path, reporthook=reporthook)
    return path
def download_from_acl(url, dirpath='.'):
    if url.endswith('.pdf'):
        url = url[:-4]  # strip '.pdf'
    # get filename
    bib_url = url.strip('\n').rstrip('/') + '.bib'
    bib = urllib.request.urlopen(bib_url).read().decode('utf-8')
    bib_database = database.parse_string(bib, bib_format='bibtex')
    # entries.values() is a view in Python 3, so materialise it before indexing
    first_entry = list(bib_database.entries.values())[0]
    author_lastname = first_entry.persons['author'][0].last()[0]
    year = first_entry.fields['year'].strip()
    title = first_entry.fields['title'].strip()
    out_name = '[{}+{}] {}.pdf'.format(author_lastname, year,
                                       title).replace('{', '').replace('}', '')
    # get author name
    path = os.path.join(dirpath, out_name)
    pdf_url = url.strip('\n').rstrip('/') + '.pdf'
    logger.info('Download "{}" from "{}"'.format(title, pdf_url))
    urlretrieve(pdf_url, path, reporthook=reporthook)
    return path
def mostrarSeccionCarga():
    conectarBd()
    uploaded_file = st.file_uploader("Archivo Bibtex con la información de los papers")
    before = len(Paper.objects)
    if uploaded_file is not None:
        # To read file as bytes:
        bytes_data = uploaded_file.read()
        data = bytes_data.decode("utf-8")
        bib_data = parse_string(data, 'bibtex')
        notdoi = []
        papers = []
        with st.spinner("Preprocesando el archivo para la carga..."):
            total = sum(1 for entry in bib_data.entries.values())
            st.success("Se iniciará la carga de " + str(total) + " papers a la base de datos.")
            my_bar = st.progress(.0)
            loaded = 0
            for entry in bib_data.entries.values():
                fields = entry.fields
                title = fields["title"].replace('{', '').replace('}', '')
                doi = fields.get("doi")
                isOnlyReference = False
                loaded += 1
                my_bar.progress(loaded / total)
                if doi is None:
                    notdoi.append(title)
                    continue
                abstract = fields.get("abstract", "")
                paper = Paper(title=title, doi=doi, abstract=abstract,
                              isOnlyReference=isOnlyReference).save()
                papers.append(paper)
        after = len(Paper.objects)
        st.success("Se ingresaron " + str(after - before) + " papers a la base de datos")
        st.write([x.title for x in papers])
        if len(notdoi):
            st.error("No se pudo ingresar " + str(len(notdoi)) + " debido a que no se conocía su doi")
            st.write(notdoi)
def getSourceFromBibTex(source):
    "utility function to read source from bibtex"
    return database.parse_string(source, bib_format="bibtex")
def getEvoBibAsSource(key):
    """Download bibtex format and parse it from EvoBib"""
    return database.parse_string(
        urlopen("http://bibliography.lingpy.org/raw.php?key=" + key).read().decode('utf-8'),
        bib_format='bibtex')
def to_cldf(self, ds, concepts):
    """
    :param ds: the dataset object
    :param concepts: a dictionary mapping concept labels to concept ids
    :return: A dataset object, ds.
    """
    source = []
    if self.language.source:
        bib = parse_string(self.language.source, "bibtex")
        try:
            ds.add_sources(
                *[Source.from_entry(k, e) for k, e in bib.entries.items()])
            source = list(bib.entries.keys())
        except:  # noqa: E722
            self.log.warning("Invalid citekey for %s" % self.language.id)

    ds.add_language(ID=self.language.id,
                    Glottocode=self.language.glottocode,
                    ISO639P3code=self.language.iso,
                    Name=self.language.name,
                    author=self.language.author,
                    url=self.url('language.php?id=%s' % self.language.id),
                    typedby=self.language.typedby,
                    checkedby=self.language.checkedby,
                    notes=self.language.notes,
                    source=";".join(source))

    for entry in self.entries:
        if entry.name is None or len(entry.name) == 0:  # skip empty entries
            continue  # pragma: no cover

        # skip entries marked as incorrect word form due to semantics
        # (x = probably, s = definitely)
        if entry.cognacy and entry.cognacy.lower() in ('s', 'x'):
            continue  # pragma: no cover

        # handle concepts
        cid = concepts.get(entry.word_id)
        if not cid:
            self.dataset.unmapped.add_concept(ID=entry.word_id, Name=entry.word)
            # add it if we don't have it.
            ds.add_concept(ID=entry.word_id, Name=entry.word)
            cid = entry.word_id

        # handle lexemes
        try:
            lex = ds.add_forms_from_value(
                Local_ID=entry.id,
                Language_ID=self.language.id,
                Parameter_ID=cid,
                Value=entry.name,
                # set source to entry-level sources if they exist, otherwise use
                # the language level source.
                Source=[entry.source] if entry.source else source,
                Cognacy=entry.cognacy,
                Comment=entry.comment or '',
                Loan=True if entry.loan and len(entry.loan) else False,
            )
        except:  # NOQA: E722; pragma: no cover
            print("ERROR with %r -- %r" % (entry.id, entry.name))
            raise

        if lex:
            for cognate_set_id in entry.cognates:
                match = self.dataset.cognate_pattern.match(cognate_set_id)
                if not match:  # pragma: no cover
                    self.log.warning(
                        'Invalid cognateset ID for entry {0}: {1}'.format(
                            entry.id, cognate_set_id))
                else:
                    # make global cognate set id
                    cs_id = "%s-%s" % (slug(entry.word), match.group('id'))
                    ds.add_cognate(lexeme=lex[0],
                                   Cognateset_ID=cs_id,
                                   Doubt=bool(match.group('doubt')),
                                   Source=['Greenhilletal2008']
                                   if self.section == 'austronesian' else [])

    return ds
def from_SNLs(
    cls,
    material_id: Union[MPID, int],
    snls: List[Dict],
) -> "ProvenanceDoc":
    """
    Converts legacy Pymatgen SNLs into a single provenance document
    """
    # Choose earliest created_at
    created_at = sorted(
        [get(snl, "about.created_at.string", datetime.max) for snl in snls]
    )[0]

    # Choose earliest history
    history = sorted(
        snls, key=lambda snl: get(snl, "about.created_at.string", datetime.max)
    )[0]["about"]["history"]

    # Aggregate all references into one dict to remove duplicates
    refs = {}
    for snl in snls:
        try:
            entries = parse_string(snl["about"]["references"], bib_format="bibtex")
            refs.update(entries.entries)
        except Exception:
            warnings.warn(f"Failed parsing bibtex: {snl['about']['references']}")

    bib_data = BibliographyData(entries=refs)

    # iterate over the Entry objects (iterating .entries directly yields only the keys)
    references = [ref.to_string("bibtex") for ref in bib_data.entries.values()]

    # TODO: Maybe we should combine this robocrystallographer?
    # TODO: Refine these tags / remarks
    remarks = list(
        set([remark for snl in snls for remark in snl["about"]["remarks"]])
    )
    tags = [r for r in remarks if len(r) < 140]

    # Aggregate all authors - Converting a single dictionary first
    # performs duplicate checking
    authors_dict = {
        entry["name"].lower(): entry["email"]
        for snl in snls
        for entry in snl["about"]["authors"]
    }
    authors = [
        {"name": name.title(), "email": email}
        for name, email in authors_dict.items()
    ]

    # Check if this entry is experimental
    if any(get(snl, "about.history.0.experimental", False) for snl in snls):
        experimental = True

    # Aggregate all the database IDs
    snl_ids = [snl.get("snl_id", "") for snl in snls]
    db_ids = {
        Database(db_id): [snl_id for snl_id in snl_ids if db_id in snl_id]
        for db_id in map(str, Database)
    }

    # remove Nones and empty lists
    db_ids = {k: list(filter(None, v)) for k, v in db_ids.items()}
    db_ids = {k: v for k, v in db_ids.items() if len(v) > 0}

    # Get experimental bool
    experimental = any(
        get(snl, "about.history.0.experimental", False) for snl in snls
    )

    snl_fields = {
        "created_at": created_at,
        "references": references,
        "authors": authors,
        "remarks": remarks,
        "tags": tags,
        "database_IDs": db_ids,
        "theoretical": not experimental,
        "history": history,
    }

    return ProvenanceDoc(material_id=material_id, **snl_fields)
def deserialize(self, string):
    return parse_string(string, self.bib_format)
def main(scripts, dev, glr): cldf_dir = Path('cldf') bib = parse_string(read_text(cldf_dir / 'sources.bib'), bib_format='bibtex') write_text(cldf_dir / 'sources.bib', bib.lower().to_string('bibtex')) glottolog = Glottolog(glr) ds = StructureDataset.in_dir(cldf_dir) ds.tablegroup.notes.append( OrderedDict([('dc:title', 'environment'), ('properties', OrderedDict([ ('glottolog_version', git_describe(glottolog.repos)), ]))])) ds.add_columns('ValueTable', { 'name': 'Marginal', 'datatype': 'boolean' }, { 'name': 'Allophones', 'separator': ' ' }, 'Contribution_ID') features = [ "tone", "stress", "syllabic", "short", "long", "consonantal", "sonorant", "continuant", "delayedRelease", "approximant", "tap", "trill", "nasal", "lateral", "labial", "round", "labiodental", "coronal", "anterior", "distributed", "strident", "dorsal", "high", "low", "front", "back", "tense", "retractedTongueRoot", "advancedTongueRoot", "periodicGlottalSource", "epilaryngealSource", "spreadGlottis", "constrictedGlottis", "fortis", "raisedLarynxEjective", "loweredLarynxImplosive", "click" ] ds.add_component('ParameterTable', 'SegmentClass', *features) ds.add_component('LanguageTable') ds.add_table( 'contributions.csv', 'ID', 'Name', 'Contributor_ID', { 'name': 'Source', 'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source', 'separator': ';' }, 'URL') ds.add_table( 'contributors.csv', 'ID', 'Name', 'Description', 'Readme', 'Contents', { 'name': 'Source', 'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source', 'separator': ';' }, 'URL', ) def read(what): return reader(scripts / 'to_cldf' / 'cldf' / what, namedtuples=True) languoids = {l.id: l for l in glottolog.languoids()} values, segments, languages, inventories, sources = [], [], {}, {}, [] for contrib in read('contributors.csv'): sources.append( dict( ID=contrib.Name, Name=contrib.Contributor, Description=contrib.Description, Readme=desc(dev, contrib.Name), Contents=contrib.Contents, Source=[ c.strip().lower() for c in contrib.Citation.split(';') ], URL=contrib.SourceURL if contrib.SourceURL != 'NA' else '', )) pid_map = {} for row in read('parameters.csv'): pid = md5(row.Description.encode('utf8')).hexdigest().upper() pid_map[row.ID] = pid segments.append( dict(ID=pid, Name=row.Name, Description=row.Description, SegmentClass=row.SegmentClass, **{f: getattr(row, f) for f in features})) src = {} for row in read('contributions.csv'): src[row.ID] = row.References.split( ';') if row.References != 'no source given' else [] src[row.ID] = [sid.lower() for sid in src[row.ID]] inventories[row.ID] = dict(ID=row.ID, Name=row.Name, Contributor_ID=row.Contributor_ID, URL=row.URI if row.URI != 'NA' else '', Source=src[row.ID]) uniq = set() for row in read('values.csv'): pk = (row.Language_ID, row.Parameter_ID, row.Contribution_ID) if pk in uniq: print('skipping duplicate phoneme {0}'.format(pk)) continue uniq.add(pk) lid = row.Language_ID if row.Language_ID in languoids else slug( inventories[row.Contribution_ID]['Name']) if lid not in languages: # # FIXME: Language_ID == 'NA' for three inventories! This must be mapped! 
# lang = languoids.get(lid) languages[lid] = dict( ID=lid, Name=lang.name if lang else None, Glottocode=lang.id if lang else None, ISO639P3code=row.ISO639P3code if row.ISO639P3code != 'NA' else None, ) values.append( dict( ID=row.ID, Language_ID=lid, Parameter_ID=pid_map[row.Parameter_ID], Contribution_ID=row.Contribution_ID, Value=row.Name, Marginal=None if row.Marginal == 'NA' else eval( row.Marginal.lower().capitalize()), # FALSE|TRUE|NA Allophones=row.Allophones.split() if row.Allophones != 'NA' else [], Source=src[row.Contribution_ID], )) ds.write( **{ 'ValueTable': values, 'LanguageTable': languages.values(), 'ParameterTable': segments, 'contributions.csv': inventories.values(), 'contributors.csv': sources }) ds.validate(logging.getLogger(__name__))
def read(self, fname):
    self._add_entries(
        database.parse_string(read_text(fname), bib_format='bibtex'))
def cmd_makecldf(self, args): args.writer.add_sources(self.raw_dir.read("Citations.bib")) bib = parse_string(self.raw_dir.read('Borrowing_references.bib'), 'bibtex') for k, v in bib.entries.items(): args.writer.add_sources( Source.from_entry(slug(k, lowercase=False), v)) args.writer.cldf.add_component( 'BorrowingTable', { 'name': 'Likelihood', 'dc:description': 'Likelihood of borrowing (*possible*, *probable* or *clear*).', 'datatype': { 'base': 'string', 'format': 'possible|clear|probable' } }, { 'name': 'SourceLanguoid', 'dc:description': 'Borrowing source of lexeme.', }) args.writer.cldf['FormTable', 'form'].required = False args.writer.cldf['FormTable', 'value'].null = NULL_ITEMS args.writer.cldf['FormTable', 'value'].required = False args.writer.cldf['FormTable', 'value'].common_props['dc:description'] = \ "Lexeme data. Contains a lexeme or '[No equivalent]': no suitable equivalent for a meaning exists), " \ "'[Form not found]': no suitable equivalent was found, or '[Not reconstructable]': non-recontructable " \ "meanings in Proto-Uralic." for src in self._read("Citation_codes"): if src["type"] == "E": args.writer.add_sources( Source("misc", src["ref_abbr"], author=src["original_reference"])) glottocodes = { language["ID"]: language["Glottocode"] for language in self.languages } for language in self._read("Languages"): glottocode = glottocodes.get(language["lgid3"]) if not glottocode: glottocode = self.glottolog.glottocode_by_iso.get( language["ISO-639-3"]) args.writer.add_language( ID=language["lgid3"], Name=language["language"], Glottocode=glottocode, Description=language["Description"], Subgroup=language["Subgroup"], ISO639P3code=language["ISO-639-3"], ) inlists = {r['mng_item']: r for r in self._read('Meaning_lists')} attrs = [ k for k in attr.fields_dict(UralexConcept).keys() if k != 'LJ_rank' ] for concept in self.concepts: if concept['ID'] in inlists: memberships = { k.replace('-', '_'): v == '1' for k, v in inlists[concept['ID']].items() if k.replace('-', '_') in attrs } concept.update(memberships) args.writer.add_concept(**concept) for (cid, cogid), ll in itertools.groupby( sorted(self._read("Data"), key=lambda i: (i["mng_item"], i["cogn_set"])), lambda i: (i["mng_item"], i["cogn_set"]), ): for language in ll: if language['item'] in NULL_ITEMS: language['etym_notes'] = language['etym_notes'] + language[ 'item'] kw = dict( Value=language["item"], Language_ID=language["lgid3"], Parameter_ID=cid, Comment=language["general_notes"], Source=[ slug(rid, lowercase=False) for rid in split_text( language["ref_abbr"], ",", strip=True) ], ) kw.update({ k: language[k] for k in [ "item_UPA", "item_IPA", "form_set", "etym_notes", "glossing_notes", ] }) for i, lex in enumerate(args.writer.add_lexemes(**kw)): lex['Form'] = None if lex['Form'] in NULL_ITEMS else lex[ 'Form'] if cogid not in ["?", "0"]: args.writer.add_cognate(lexeme=lex, Cognateset_ID="{0}-{1}".format( cid, cogid)) if language['borr_qual']: c = ': borrowed to Pre-Permic' ref = language['ref_borr'] if c in ref: comment = c[1:].strip() ref = ref.replace(c, '') else: comment = None args.writer.objects['BorrowingTable'].append( dict( ID=lex['ID'], Target_Form_ID=lex['ID'], SourceLanguoid=language['borr_source'], Likelihood=language['borr_qual'], Source=bibkeys(ref), Comment=comment, ))
def main(scripts, dev, glr): cldf_dir = Path('cldf') bib = parse_string(read_text(cldf_dir / 'sources.bib'), bib_format='bibtex') for _, e in bib.entries.items(): for field in e.fields: e.fields[field] = e.fields[field].replace('\\', '') write_text(cldf_dir / 'sources.bib', bib.lower().to_string('bibtex')) glottolog = Glottolog(glr) ds = StructureDataset.in_dir(cldf_dir) def describe_repos(r, org, name=None): return OrderedDict([ ('dc:title', '{0}/{1}'.format(org, name or r.name)), ('dc:description', git_describe(r))]) ds.tablegroup.common_props['prov:wasDerivedFrom'] = [ describe_repos(dev, 'phoible'), describe_repos(scripts, 'bambooforest'), describe_repos(glottolog.repos, 'clld'), ] ds.tablegroup.common_props['prov:wasGeneratedBy'] = describe_repos( Path(__file__).parent, 'cldf-datasets', name='phoible') ds.add_columns( 'ValueTable', {'name': 'Marginal', 'datatype': 'boolean'}, {'name': 'Allophones', 'separator': ' '}, 'Contribution_ID') features = ["tone","stress","syllabic","short","long","consonantal","sonorant","continuant","delayedRelease","approximant","tap","trill","nasal","lateral","labial","round","labiodental","coronal","anterior","distributed","strident","dorsal","high","low","front","back","tense","retractedTongueRoot","advancedTongueRoot","periodicGlottalSource","epilaryngealSource","spreadGlottis","constrictedGlottis","fortis","raisedLarynxEjective","loweredLarynxImplosive","click"] ds.add_component('ParameterTable', 'SegmentClass', *features) ds.add_component('LanguageTable', 'Family_Glottocode', 'Family_Name') table = ds.add_table( 'contributions.csv', 'ID', 'Name', 'Contributor_ID', {'name': 'Source', 'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source', 'separator': ';'}, 'URL', {'name': 'count_phonemes', 'required': True, 'datatype': {'base': 'integer', 'minimum': 0}}, {'name': 'count_consonants', 'required': True, 'datatype': {'base': 'integer', 'minimum': 0}}, {'name': 'count_vowels', 'required': True, 'datatype': {'base': 'integer', 'minimum': 0}}, {'name': 'count_tones', 'datatype': {'base': 'integer', 'minimum': 0}, 'null': 'NA'}, ) table.tableSchema.primaryKey = ['ID'] table.tableSchema.foreignKeys.append(ForeignKey.fromdict(dict( columnReference='Contributor_ID', reference=dict(resource='contributors.csv', columnReference='ID')))) table.common_props['dc:conformsTo'] = None table = ds.add_table( 'contributors.csv', 'ID', 'Name', 'Description', 'Readme', 'Contents', {'name': 'Source', 'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source', 'separator': ';'}, 'URL', {'name': 'with_tones', 'datatype': {'base': 'boolean', 'format': '1|0'}}, ) table.tableSchema.primaryKey = ['ID'] table.common_props['dc:conformsTo'] = None def read(what): return reader(scripts / 'to_cldf' / 'cldf' / what, namedtuples=True) languoids = {l.id: l for l in glottolog.languoids()} values, segments, languages, inventories, sources = [], [], OrderedDict(), OrderedDict(), [] with_tones = {} for contrib in read('contributors.csv'): sources.append(dict( ID=contrib.Name, Name=contrib.Contributor, Description=contrib.Description, Readme=desc(dev, contrib.Name), Contents=contrib.Contents, Source=[c.strip().lower() for c in contrib.Citation.split(';')], URL=contrib.SourceURL if contrib.SourceURL != 'NA' else '', with_tones=contrib.with_tones == '1', )) with_tones[contrib.Name] = contrib.with_tones == '1' pid_map = {} for row in read('parameters.csv'): pid = md5(row.Description.encode('utf8')).hexdigest().upper() pid_map[row.ID] = (pid, row.SegmentClass) segments.append(dict( ID=pid, 
Name=row.Name, Description=row.Description, SegmentClass=row.SegmentClass, **{f: getattr(row, f) for f in features} )) src = {} for row in read('contributions.csv'): src[row.ID] = row.References.split(';') if row.References != 'no source given' else [] src[row.ID] = [sid.lower() for sid in src[row.ID]] inventories[row.ID] = dict( ID=row.ID, Name=row.Name, Contributor_ID=row.Contributor_ID.upper(), URL=row.URI if row.URI != 'NA' else '', Source=src[row.ID], count_phonemes=0, count_consonants=0, count_vowels=0, count_tones=0, ) uniq, counts = set(), Counter() for row in read('values.csv'): pk = (row.Language_ID, row.Parameter_ID, row.Contribution_ID) if pk in uniq: print('skipping duplicate phoneme {0}'.format(pk)) continue uniq.add(pk) lid = row.Language_ID if row.Language_ID in languoids else slug(inventories[row.Contribution_ID]['Name']) if lid not in languages: # # FIXME: Language_ID == 'NA' for three inventories! This must be mapped! # lang = languoids.get(lid) fam = lang.lineage[0] if lang and lang.lineage else None languages[lid] = dict( ID=lid, Name=lang.name if lang else None, Glottocode=lang.id if lang else None, ISO639P3code=row.ISO639P3code if row.ISO639P3code != 'NA' else None, Macroarea=lang.macroareas[0].value if lang and lang.macroareas else None, Latitude=lang.latitude if lang else None, Longitude=lang.longitude if lang else None, Family_Glottocode=fam[1] if fam else None, Family_Name=fam[0] if fam else None, ) pid, sc = pid_map[row.Parameter_ID] counts.update([(row.Contribution_ID, sc)]) values.append(dict( ID=row.ID, Language_ID=lid, Parameter_ID=pid, Contribution_ID=row.Contribution_ID, Value=row.Name, Marginal=None if row.Marginal == 'NA' else eval(row.Marginal.lower().capitalize()), # FALSE|TRUE|NA Allophones=row.Allophones.split() if row.Allophones != 'NA' else [], Source=src[row.Contribution_ID], )) for key, count in counts.items(): inventories[key[0]]['count_{0}s'.format(key[1])] = count inventories[key[0]]['count_phonemes'] += count for inv in inventories.values(): if not with_tones[inv['Contributor_ID']]: assert inv['count_tones'] == 0 inv['count_tones'] = 'NA' ds.write(**{ 'ValueTable': values, 'LanguageTable': languages.values(), 'ParameterTable': segments, 'contributions.csv': inventories.values(), 'contributors.csv': sources }) ds.validate(logging.getLogger(__name__))
def read(self, fname):
    self._add_entries(database.parse_string(read_text(fname), bib_format='bibtex'))
def _edit(args, config):
    db = BibDB(config)
    search_results = db.search(args.terms)
    if not search_results:
        logging.error("Search returned no results. Aborting.")
        sys.exit(1)

    with tempfile.NamedTemporaryFile("w") as temp_file:
        temp_fname = temp_file.name

        with open(temp_fname, "wt") as fp:
            original_entries_text = format_search_results(
                search_results, bibtex_output=True, use_original_key=False)
            fp.write(original_entries_text)
            original_entries = pybtex.parse_string(
                original_entries_text, bib_format="bibtex").entries.values()

        subprocess.run([config.editor, temp_file.name])

        with open(temp_fname, "rt"):
            new_entries = pybtex.parse_file(
                temp_fname, bib_format="bibtex").entries.values()

        deleted_entries = []
        edited_entries = []
        seen_original_keys = set()
        changelog = []
        for new in new_entries:
            original_key = new.fields["original_key"]
            seen_original_keys.add(original_key)
            old = find_entry(original_entries, "original_key", original_key)
            added, deleted, edited = compare_entries(old, new)
            if added or deleted or edited:
                edited_entries.append(new)
                # Report changes
                changelog.append("\nEntry %s" % old.key)
                for field in added:
                    changelog.append('\tAdded %s with value "%s"' %
                                     (field, new.fields[field]))
                for field in deleted:
                    changelog.append("\tDeleted %s" % field)
                for field in edited:
                    changelog.append(
                        '\tChanged %s to "%s"' %
                        (field, new.key if field == "key" else new.fields[field]))

        for old in original_entries:
            if not old.fields["original_key"] in seen_original_keys:
                deleted_entries.append(old)
        if deleted_entries:
            changelog.append("\nDeleted entries:")
            for e in deleted_entries:
                changelog.append("\t%s" % e.key)

        if not edited_entries and not deleted_entries:
            logging.warning("There were no changes in the entries.")
            sys.exit(0)

        print("Summary of changes:")
        print("\n".join(changelog) + "\n")
        confirmation = prompt("Do you want to perform these changes?", "YES", "no")

        if confirmation == "YES":
            for e in edited_entries:
                db.update(e)
            for e in deleted_entries:
                db.remove(e.key)
            db.save()
            print("Updated database.")
        else:
            print("Aborted.")
def read_string(self, text):
    self._add_entries(database.parse_string(text, bib_format='bibtex'))
def user_cv(op=None): """Create user CV using the CV templage filled with the ORCID profile data.""" user = User.get(current_user.id) if not user.orcid: flash("You haven't linked your account with ORCID.", "warning") return redirect(request.referrer or url_for("index")) record = cache.get(user.orcid) if not record: token = OrcidToken.select(OrcidToken.access_token).where( OrcidToken.user_id == user.id, OrcidToken.org_id == user.organisation_id, OrcidToken.scopes.contains("read-limited")).first() if token is None: flash("You haven't granted your organisation necessary access to your profile..", "danger") return redirect(url_for("link")) api = MemberAPIV3(user=user, access_token=token.access_token) try: record = api.get_record() works = [w for g in record.get("activities-summary", "works", "group") for w in g.get("work-summary")] combine_detail_works_summary = [] for w in works: work_api_response = api.view_workv3(user.orcid, w.get("put-code"), _preload_content=False) work_data = json.loads(work_api_response.data, object_pairs_hook=NestedDict) combine_detail_works_summary.append(work_data) record['detail-works-summary'] = combine_detail_works_summary cache.set(user.orcid, record) except Exception as ex: flash(f"Failed to retrieve the profile: {ex}", "danger") return redirect(url_for("link")) if op is None: return render_template("user_cv.html") else: work_type_journal = [] work_type_books = [] work_type_book_chapter = [] work_type_conference = [] work_type_patent = [] work_type_other = [] educations = [] employments = [] first_name = None second_names = None family_name = None countries = [] emails = [] researcher_urls = [] if record: works = record.get("detail-works-summary") for w in works: publications_dissemination = "" # Case 1 of getting Research publications and dissemination: Check for DOI and crossref the value external_id_url = [item.get('external-id-url', 'value') for item in w.get("external-ids").get("external-id") if item.get('external-id-type') and item.get( 'external-id-type').lower() == 'doi'] if w.get("external-ids") and w.get( "external-ids").get("external-id") else [] for ex in external_id_url: try: resp = requests.get(ex, headers={"Accept": "text/bibliography; style=apa"}) if resp.status_code == 200: publications_dissemination = resp.text.encode('latin-1').decode('utf-8').strip() break except requests.exceptions.RequestException: continue except Exception: continue # Case 2 of getting Research publications and dissemination: Check citation types if not publications_dissemination and w.get("citation") and w.get( "citation", "citation-type") and w.get("citation", "citation-value"): citation_type = w.get("citation", "citation-type").lower() citation_value = w.get("citation", "citation-value") # Check if the citation is bibtex and try to parse it if citation_type == "bibtex": try: data = parse_string(citation_value, 'bibtex') apa = find_plugin('pybtex.style.formatting', 'apa')() formatted_bib = apa.format_bibliography(data) publications_dissemination = " ".join( entry.text.render_as('text') for entry in formatted_bib) except Exception: # pass any exception and move forward to check for other criteria. pass # Case 3: If the citation is other than bibtex and ris, i.e. non standard citation then reproduce. 
elif citation_type != "ris": publications_dissemination = citation_value # Case 4 of getting Research publications and dissemination: Simple/Parse of work elements if not publications_dissemination: publications_dissemination = w if w.get("type") in ['journal-article', 'journal-issue']: work_type_journal.append(publications_dissemination) elif w.get("type") in ['book', 'book-review']: work_type_books.append(publications_dissemination) elif w.get("type") in ['book-chapter', 'edited-book']: work_type_book_chapter.append(publications_dissemination) elif w.get("type") in ['conference-paper', 'conference-abstract', 'conference-poster']: work_type_conference.append(publications_dissemination) elif w.get("type") in ['patent']: work_type_patent.append(publications_dissemination) else: work_type_other.append(publications_dissemination) educations = [s.get("education-summary") for ag in record.get("activities-summary", "educations", "affiliation-group", default=[]) for s in ag.get("summaries", default=[])] employments = [s.get("employment-summary") for ag in record.get("activities-summary", "employments", "affiliation-group", default=[]) for s in ag.get("summaries", default=[])] first_name, *second_names = re.split("[,; \t]", str( record.get("person", "name", "given-names", "value", default=user.first_name))) family_name = record.get("person", "name", "family-name", "value", default=user.last_name) countries = [a.get("country", "value") for a in record.get("person", "addresses", "address")] emails = [e.get("email") for e in record.get("person", "emails", "email")] if record.get( "person", "emails", "email") else [user.email] researcher_urls = [r.get("url", "value") for r in record.get("person", "researcher-urls", "researcher-url")] person_data = dict(first_name=first_name, second_names=second_names, family_name=family_name, address=countries, emails=emails, researcher_urls=researcher_urls) resp = make_response( render_template( "CV.html", user=user, now=datetime.now(), record=record, person_data=person_data, work_type_books=work_type_books, work_type_book_chapter=work_type_book_chapter, work_type_journal=work_type_journal, work_type_conference=work_type_conference, work_type_patent=work_type_patent, work_type_other=work_type_other, educations=educations, employments=employments)) resp.headers["Cache-Control"] = "private, max-age=60" if op == "download" or "download" in request.args: meta_xml_data = render_template("CV/meta.xml", user=user, now=datetime.now()) content_xml_data = render_template("CV/content.xml", user=user, now=datetime.now(), record=record, person_data=person_data, work_type_books=work_type_books, work_type_book_chapter=work_type_book_chapter, work_type_journal=work_type_journal, work_type_conference=work_type_conference, work_type_patent=work_type_patent, work_type_other=work_type_other, educations=educations, employments=employments) response = Response(cv_generator(meta_xml_data, content_xml_data), mimetype='application/vnd.oasis.opendocument.text') response.headers["Content-Type"] = "application/vnd.oasis.opendocument.text" response.headers[ 'Content-Disposition'] = f"attachment; filename={current_user.name.replace(' ', '_')}_CV.odt" return response return resp
def cmd_makecldf(self, args): self.create_schema(args.writer.cldf) pk2id = collections.defaultdict(dict) skip_source = [ 'Lous-1969', # -> Loos-1969 'Payne-1990', # -> Payne-1990a ] updated_source_keys = { 'Anonymous-nd': 'North-East-Frontier-Agency-1963', } updated_source_names = { 'North-East-Frontier-Agency-1963': 'North East Frontier Agency 1963', } sources = parse_string( self.raw_dir.joinpath('source.bib').read_text(encoding='utf8'), 'bibtex') gbs_lg_refs = collections.defaultdict(set) src_names = {} for s in self.read('source', pkmap=pk2id).values(): if s['id'] in skip_source: continue s['id'] = updated_source_keys.get(s['id'], s['id']) src_names[s['id']] = updated_source_names.get(s['id'], s['name']) try: jsd = json.loads(s['jsondata']) if 'wals_code' in jsd: [gbs_lg_refs[c].add(s['id']) for c in jsd['wals_code']] gbs = jsd['gbs'] if gbs['id'].strip(): sef = sources.entries[s['id']].fields sef['google_book_search_id'] = gbs['id'].strip() sef['google_book_viewability'] = gbs['accessInfo'][ 'viewability'].strip() except (json.decoder.JSONDecodeError, KeyError): continue chapters = self.read('contribution', extended='chapter', pkmap=pk2id) refs = [] crefs = collections.defaultdict(list) for row in self.raw_dir.read_csv('valuesetreference.csv', dicts=True): if row['source_pk']: sid = pk2id['source'][row['source_pk']] if sid not in skip_source: refs.append( (row['valueset_pk'], updated_source_keys.get(sid, sid), row['description'])) srcids = set(r[1] for r in refs) for row in self.raw_dir.read_csv('contributionreference.csv', dicts=True): sid = pk2id['source'][row['source_pk']] if sid not in crefs[pk2id['contribution'][row['contribution_pk']]]: crefs[pk2id['contribution'][row['contribution_pk']]].append( sid) srcids.add(sid) unused_srcids = [] for id_, e in sources.entries.items(): if id_ in skip_source: continue if id_ in srcids: if id_ in src_names: e.fields['wals_ref_name'] = src_names[id_] args.writer.cldf.add_sources(Source.from_entry(id_, e)) else: unused_srcids.append(id_) # add language references out of bibtex tag 'wals_code' # to ensure that nothing was missed in raw/languagesource.csv (37 cases) if 'wals_code' in e.fields: [ gbs_lg_refs[c].add(id_) for c in e.fields['wals_code'].split('; ') ] for id_, e in sources.entries.items(): if id_ in skip_source: continue if id_ in unused_srcids: if id_ in src_names: e.fields['wals_ref_name'] = src_names[id_] args.writer.cldf.add_sources(Source.from_entry(id_, e)) editors = { e['contributor_pk']: int(e['ord']) for e in self.read('editor', key=lambda r: int(r['ord'])).values() } contributors = self.read('contributor', pkmap=pk2id, key=lambda r: r['id']) for row in contributors.values(): args.writer.objects['contributors.csv'].append({ 'ID': row['id'], 'Name': row['name'], 'Url': row['url'], 'Editor_Ord': editors[row['pk']] if row['pk'] in editors else 0, }) cc = { chapters[fid]['id']: [(r['primary'], pk2id['contributor'][r['contributor_pk']]) for r in rows] for fid, rows in itertools.groupby( self.read('contributioncontributor', key=lambda d: (d['contribution_pk'], d['primary'] == 'f', int(d['ord']))).values(), lambda r: r['contribution_pk']) } areas = self.read('area') for row in areas.values(): args.writer.objects['areas.csv'].append({ 'ID': row['id'], 'Name': row['name'], 'dbpedia_url': row['dbpedia_url'], }) for row in self.read('parameter', extended='feature', pkmap=pk2id, key=lambda d: fid_key(d['id'])).values(): args.writer.objects['ParameterTable'].append({ 'ID': row['id'], 'Name': row['name'], 'Chapter_ID': 
chapters[row['contribution_pk']]['id'], }) for row in self.read( 'domainelement', pkmap=pk2id, key=lambda d: (fid_key(d['id'].split('-')[0]), int(d['number']))).values(): args.writer.objects['CodeTable'].append({ 'ID': row['id'], 'Parameter_ID': pk2id['parameter'][row['parameter_pk']], 'Name': row['name'], 'Description': row['description'], 'Number': int(row['number']), 'icon': json.loads(row['jsondata'])['icon'], }) identifier = self.read('identifier') lang2id = collections.defaultdict( lambda: collections.defaultdict(list)) for row in self.read('languageidentifier').values(): id_ = identifier[row['identifier_pk']] lang2id[row['language_pk']][id_['type']].append( (id_['name'], id_['description'])) families = self.read('family', pkmap=pk2id) genera = self.read('genus', pkmap=pk2id) countries = self.read('country', pkmap=pk2id) lang2country = collections.defaultdict(list) for c in self.read('countrylanguage').values(): lang2country[c['language_pk']].append( pk2id['country'][c['country_pk']]) lrefs = collections.defaultdict(list) for c in self.read('languagesource').values(): sid = pk2id['source'][c['source_pk']] sid = updated_source_keys.get(sid, sid) if sid not in lrefs[c['language_pk']]: lrefs[c['language_pk']].append(sid) for row in self.read('language', extended='walslanguage', pkmap=pk2id).values(): id = row['id'] genus = genera[row['genus_pk']] genus_icon = genus['icon'] if genus else '' family = families[genus['family_pk']] if row['name'] == genus['name'] == family['name']: # an isolate! genus = family = None iso_codes = set(i[0] for i in lang2id[row['pk']].get('iso639-3', [])) glottocodes = [ i[0] for i in lang2id[row['pk']].get('glottolog', []) ] srcs = lrefs[row['pk']] if id in gbs_lg_refs: [srcs.append(s) for s in gbs_lg_refs[id] if s not in srcs] args.writer.objects['LanguageTable'].append({ 'ID': id, 'Name': row['name'].strip(), 'ISO639P3code': list(iso_codes)[0] if len(iso_codes) == 1 else None, 'Glottocode': glottocodes[0] if len(glottocodes) == 1 else None, 'ISO_codes': sorted(iso_codes), 'Latitude': row['latitude'], 'Longitude': row['longitude'], 'Macroarea': row['macroarea'], 'Genus': genus['name'] if genus else None, 'GenusIcon': genus_icon, 'Subfamily': genus['subfamily'] if genus else None, 'Family': family['name'] if family else None, 'Samples_100': row['samples_100'] == 't', 'Samples_200': row['samples_200'] == 't', 'Country_ID': lang2country[row['pk']], 'Source': sorted(srcs), }) args.writer.objects['LanguageTable'].sort(key=lambda d: d['ID']) refs = { dpid: [ str( Reference( source=str(r[1]), desc=r[2].replace('[', ')').replace(']', ')').replace( ';', '.').strip() if r[2] else None)) for r in refs_ ] for dpid, refs_ in itertools.groupby(refs, lambda r: r[0]) } vsdict = self.read('valueset', pkmap=pk2id) examples = self.read('sentence', pkmap=pk2id) igts = {} for ex in examples.values(): if all(ex[k] for k in ['description', 'analyzed', 'gloss']): a, g = ex['analyzed'].split(), ex['gloss'].split() if len(a) != len(g): a, g = [ex['analyzed']], [ex['gloss']] igts[ex['pk']] = ex['id'] args.writer.objects['ExampleTable'].append({ 'ID': ex['id'], 'Language_ID': pk2id['language'][ex['language_pk']], 'Primary_Text': ex['name'], 'Translated_Text': ex['description'], 'Analyzed_Word': a, 'Gloss': g, }) example_by_value = { vpk: [r['sentence_pk'] for r in rows] for vpk, rows in itertools.groupby( self.read('valuesentence', key=lambda d: d['value_pk']).values( ), lambda d: d['value_pk']) } for row in self.read('value').values(): vs = vsdict[row['valueset_pk']] comment = None ex 
= [examples[spk] for spk in example_by_value.get(row['pk'], [])] if len(ex) == 1 and not any( ex[0][k] for k in ['description', 'analyzed', 'gloss']): comment = re.sub(r'[\r\n]', '', ex[0]['xhtml']) del example_by_value[row['pk']] args.writer.objects['ValueTable'].append({ 'ID': vs['id'], 'Language_ID': pk2id['language'][vs['language_pk']], 'Parameter_ID': pk2id['parameter'][vs['parameter_pk']], 'Value': pk2id['domainelement'][row['domainelement_pk']].split('-')[1], 'Code_ID': pk2id['domainelement'][row['domainelement_pk']], 'Comment': comment, 'Source': refs.get(vs['pk'], []), 'Example_ID': sorted(igts[epk] for epk in example_by_value.get(row['pk'], []) if epk in igts), }) args.writer.objects['ValueTable'].sort( key=lambda d: (d['Language_ID'], fid_key(d['Parameter_ID']))) altnames = [] for lpk in lang2id: for type in lang2id[lpk]: if type == 'name': for name, prov in lang2id[lpk][type]: altnames.append((prov, name, pk2id['language'][lpk])) lnid = 0 for (type, name), rows in itertools.groupby(sorted(altnames), lambda t: (t[0], t[1])): lnid += 1 args.writer.objects['language_names.csv'].append({ 'ID': str(lnid), 'Language_ID': [r[2] for r in rows], 'Name': name.strip(), 'Provider': type, }) for c in sorted(countries.values(), key=lambda x: x['id']): args.writer.objects['countries.csv'].append({ 'ID': c['id'], 'Name': c['name'], }) desc_dir = self.raw_dir / 'descriptions' src_pattern = re.compile( 'src="https?://wals.info/static/descriptions/(?P<sid>s?[0-9]+)/images/(?P<fname>[^"]+)"' ) def repl(m): p = desc_dir.joinpath(m.group('sid'), 'images', m.group('fname')) if p.exists(): return 'src="{0}"'.format(data_url(p)) return m.string[m.start():m.end()] descs = {} docs_dir = self.cldf_dir / 'docs' docs_dir.mkdir(exist_ok=True) for d in desc_dir.iterdir(): if d.is_dir(): descs[d.stem] = src_pattern.sub( repl, d.joinpath('body.xhtml').read_text(encoding='utf8')) for c in sorted(chapters.values(), key=lambda x: int(x['sortkey'])): if c['id'] in descs: fname = docs_dir / 'chapter_{}.html'.format(c['id']) with io.open(fname, 'w', encoding='utf-8') as f: f.write(descs[c['id']]) cid, wcid = [], [] if c['id'] in cc: cid = [co[1] for co in cc[c['id']] if co[0] == 't'] wcid = [co[1] for co in cc[c['id']] if co[0] == 'f'] args.writer.objects['chapters.csv'].append({ 'ID': c['id'], 'Name': c['name'], 'wp_slug': c['wp_slug'], 'Number': c['sortkey'], 'Area_ID': areas[c['area_pk']]['id'] if c['area_pk'] in areas else '', 'Source': crefs.get(c['id'], []), 'Contributor_ID': cid, 'With_Contributor_ID': wcid, })
def read_bib(self, fname='sources.bib'):
    bib = database.parse_string(self.read(fname), bib_format='bibtex')
    return [Source.from_entry(k, e) for k, e in bib.entries.items()]
def cmd_makecldf(self, args):
    """
    Convert the raw data to a CLDF dataset.

    >>> args.writer.objects['LanguageTable'].append(...)
    """
    # CLDF schema
    args.writer.cldf.add_component('ParameterTable')
    args.writer.cldf.add_component('LanguageTable', 'SubBranch', 'Family')
    args.writer.cldf.add_component('CodeTable')
    args.writer.cldf.add_table(
        'constructions.csv',
        'http://cldf.clld.org/v1.0/terms.rdf#id',
        'http://cldf.clld.org/v1.0/terms.rdf#name',
        'http://cldf.clld.org/v1.0/terms.rdf#description',
        'http://cldf.clld.org/v1.0/terms.rdf#languageReference',
        'http://cldf.clld.org/v1.0/terms.rdf#source')
    args.writer.cldf.add_table(
        'cvalues.csv',
        'http://cldf.clld.org/v1.0/terms.rdf#id',
        'Construction_ID',
        'http://cldf.clld.org/v1.0/terms.rdf#parameterReference',
        'http://cldf.clld.org/v1.0/terms.rdf#value',
        'http://cldf.clld.org/v1.0/terms.rdf#codeReference',
        'http://cldf.clld.org/v1.0/terms.rdf#comment')
    args.writer.cldf.add_foreign_key('cvalues.csv', 'Construction_ID',
                                     'constructions.csv', 'ID')

    # Read data
    data = self.raw_dir.read_csv('Data_to_be_published.csv', dicts=True)
    data = normalise_table(data)

    parameters = self.etc_dir.read_csv('parameters.csv', dicts=True)

    source_map = {
        citation.strip(): key.strip()
        for key, citation in self.etc_dir.read_csv('citations-to-bibtex.csv')
    }
    sources = parse_string(self.raw_dir.read('sources.bib'), 'bibtex')

    # Process data
    lang_info = {
        row['Glottolog.Name']: {
            'ID': row['Glottolog.Name'],
            'Name': title_case(row.get('Language', '')),
            'SubBranch': title_case(row.get('Sub-branch', '')),
            'Family': title_case(row.get('Family', '')),
        }
        for row in data
    }
    languages = OrderedDict(
        (l['ID'], l) for l in make_language_table(lang_info))

    code_dict = OrderedDict()
    for column, param_id in PARAMETER_COLUMNS:
        if param_id == 'ap-marker':
            continue
        code_dict[param_id] = sorted(
            {unify_na(row[column]) for row in data if row.get(column)})
    codes = OrderedDict((
        (param_id, name),
        {
            'ID': '{}-c{}'.format(param_id, index + 1),
            'Parameter_ID': param_id,
            'Name': name,
        },
    ) for param_id, code_names in code_dict.items()
        for index, name in enumerate(code_names))

    constructions = []
    cvalues = []
    ords = defaultdict(int)
    for index, row in enumerate(data):
        lang_id = row['Glottolog.Name']
        lang_name = languages[row['Glottolog.Name']]['Name']

        ords[lang_id] += 1
        constr_ord = ords[lang_id]
        constr_id = '{}-ap{}'.format(lang_id, constr_ord)

        def known_citation(cite):
            if cite in source_map:
                return True
            else:
                print('row {}: unknown citation:'.format(index + 2),
                      cite, file=sys.stderr)
                return False

        citations = [
            source_map[citation.strip()]
            for citation in row.get('Source', '').splitlines()
            if known_citation(citation)
        ]

        constructions.append({
            'ID': constr_id,
            'Name': '{} Antipassive Construction {}'.format(lang_name, constr_ord),
            'Language_ID': lang_id,
            'Source': citations
        })

        cvalues.extend({
            'ID': '{}-{}'.format(constr_id, param_id),
            'Construction_ID': constr_id,
            'Parameter_ID': param_id,
            'Value': unify_na(row[column]),
            'Code_ID': codes.get((param_id, unify_na(row[column])), {}).get('ID'),
        } for column, param_id in PARAMETER_COLUMNS if row.get(column))

    # Output data
    args.writer.cldf.add_sources(sources)
    args.writer.objects['LanguageTable'] = languages.values()
    args.writer.objects['ParameterTable'] = parameters
    args.writer.objects['CodeTable'] = codes.values()
    args.writer.objects['ValueTable'] = []
    args.writer.objects['constructions.csv'] = constructions
    args.writer.objects['cvalues.csv'] = cvalues
parser.add_argument(
    "-b", "--borr_ref",
    dest="borr_ref",
    help="Use borrowing reference file instead of citations file.",
    default=False,
    action='store_true')
args = parser.parse_args()

if args.borr_ref:
    tsvfile = "Borrowing_references.tsv"
else:
    tsvfile = "Citation_codes.tsv"

out = []
with open(tsvfile, "r") as f:
    for row in csv.reader(f, delimiter="\t", quotechar='"'):
        if row[2] == "type":  # header
            continue
        if row[2] == "E":  # expert (not citable)
            continue
        bib = BREAK_PATTERN.sub(lambda m: ',\n {0}='.format(m.group('key')), row[1])
        parse_string(bib, bib_format='bibtex')
        out.append(bib)

print('\n\n'.join(out))