def iterrows(core, extended=False):
    res = collections.OrderedDict()
    for row in reader(repos / 'raw' / core, dicts=True):
        res[row['pk']] = row
    if extended:
        for row in reader(repos / 'raw' / extended, dicts=True):
            res[row['pk']].update(row)
    for r in res.values():
        yield typed(r, core)

def rename(args):  # pragma: no cover
    api = Concepticon(args.repos)
    from_, to_ = args.args
    assert CONCEPTLIST_ID_PATTERN.match(to_)
    cl = api.conceptlists[from_]

    # write the adapted concept list to the new path:
    with UnicodeWriter(cl.path.parent / cl.path.name.replace(from_, to_), delimiter='\t') as writer:
        header = []
        for i, row in enumerate(reader(cl.path, delimiter='\t')):
            if i == 0:
                header = row
                writer.writerow(row)
                header = {v: k for k, v in enumerate(header)}  # map column name to column index
            else:
                oid = row[header['ID']]
                assert oid.startswith(from_)
                nid = oid.replace(from_, to_)
                api.add_retirement(
                    'Concept', dict(id=oid, comment='renaming', replacement=nid))
                row[header['ID']] = nid
                writer.writerow(row)

    # write adapted metadata to the new path:
    fname = cl.path.name.replace(from_, to_) + MD_SUFFIX
    md = jsonlib.load(cl.path.parent / (cl.path.name + MD_SUFFIX), object_pairs_hook=OrderedDict)
    md['tables'][0]['url'] = fname
    jsonlib.dump(md, cl.path.parent / fname, indent=4)

    # remove obsolete concept list and metadata:
    cl.path.unlink()
    cl.path.parent.joinpath(cl.path.name + MD_SUFFIX).unlink()

    # adapt conceptlists.tsv
    rows = []
    for row in reader(api.data_path('conceptlists.tsv'), delimiter='\t'):
        rows.append([col.replace(from_, to_) if col else col for col in row])
    with UnicodeWriter(api.data_path('conceptlists.tsv'), delimiter='\t') as writer:
        writer.writerows(rows)

    api.add_retirement('Conceptlist', dict(id=from_, comment='renaming', replacement=to_))

    print("""Please run
grep -r "{0}" concepticondata/ | grep -v retired.json
to confirm the renaming was complete!""".format(from_))

def iter_sources(self, type=None):
    for src in reader(self.repos / 'sources' / 'index.tsv', dicts=True, delimiter='\t'):
        if (type is None) or (type == src['TYPE']):
            graphemesp = self.repos / 'sources' / src['NAME'] / 'graphemes.tsv'
            if graphemesp.exists():
                yield src, list(reader(graphemesp, dicts=True, delimiter='\t'))

def rows_and_sourcesheets(sheet, active):
    allrows, sourcesheets = [], []

    def clean_row(row):
        sourcesheets.extend(row['Sheet'].split())
        coders = '-'.join(s.split('_')[0] for s in row['Sheet'].split())
        coders = coders.split('-')
        del row['Sheet']
        if row.get('Contributed_Datapoint'):
            row['Contributed_Datapoint'] += ' ' + ' '.join(coders)
        else:
            row['Contributed_Datapoint'] = ' '.join(coders)
        allrows.append(row)

    #fids = collections.Counter([r['Feature_ID'] for r in reader(sheet, dicts=True, delimiter='\t')])
    #print(len(fids), sum(fids.values()))
    for fid, rows in itertools.groupby(
        sorted(reader(sheet, dicts=True, delimiter='\t'), key=lambda r: r['Feature_ID']),
        lambda r: r['Feature_ID'],
    ):
        rows = list(rows)
        if len(rows) == 1:
            clean_row(rows[0])
        else:
            row = merged_rows(rows, active)
            if row:
                clean_row(row)
    return allrows, set(sourcesheets)

def iter(cls, table=None, cache_dir=None, log=None):
    content = read_url(
        'sites/iso639-3/files/downloads/iso-639-3_Retirements.tab',
        cache_dir=cache_dir,
        log=log)
    for d in table or dsv.reader(content.splitlines(), dicts=True, delimiter='\t'):
        yield cls(**d)

def test_get_concepts(concepticon):
    res = util.get_concepts(concepticon.conceptlists.values(), [])
    assert len(res) == 1
    assert 'chinese' in res[0].attributes
    assert res[0].number == '1'

    id_lookup, _ = util.get_ids_and_attrs(
        res,
        {},
        id_factory=lambda c: c.number + 'x',
        lookup_factory=lambda c: c.number + 'y')
    assert id_lookup['1y'] == '1x'

    id_lookup, _ = util.get_ids_and_attrs(
        res,
        {'number': 'Number'},
        id_factory=lambda c: c.number + 'x',
        lookup_factory=lambda c: c['Number'] + 'y')
    assert id_lookup['1y'] == '1x'

    csv = Path(__file__).parent / 'repos' / 'datasets' / 'test_dataset' / 'etc' / 'concepts.csv'
    res = util.get_concepts([], list(reader(csv, dicts=True)))
    assert len(res) == 2
    assert 'chinese' in res[0].attributes
    assert res[0].number == '1'

    id_lookup, _ = util.get_ids_and_attrs(
        res,
        {'chinese': 'chi'},
        id_factory=lambda c: c.number + 'x',
        lookup_factory=lambda c: c['chi'])
    assert id_lookup['xyz'] == '1x'

def iterupdated(self, languoids):  # pragma: no cover
    res = requests.post(
        'https://query.wikidata.org/sparql',
        data=dict(query=SPARQL),
        headers=dict(Accept='text/csv'))
    res = {}
    if self.repos:
        res = {
            d['glottocode']: d
            for d in reader(self.repos.path('build', 'glottocode2wikidata.csv'), dicts=True)
        }
    assert res
    for lang in languoids:
        urls = {
            'www.wikidata.org':
                [res[lang.id]['item'].replace('http:', 'https:')] if lang.id in res else [],
            'en.wikipedia.org':
                [res[lang.id]['wikipedia']]
                if (lang.id in res) and res[lang.id]['wikipedia'] else [],
        }
        # Note: We must use list comprehension rather than a generator as first argument
        # to `any` to make sure `update_links` is called for each item in urls!
        if any([lang.update_links(d, u) for d, u in urls.items()]):
            yield lang

def from_path(cls, path, spec=None):
    """
    Instantiate a corpus from a file path.

    :param path: Either a path to a CLDF dataset's metadata file or to a CLDF Examples \
    component as CSV file. Note that in the latter case, the file must use the default \
    column names, as defined in the CLDF ontology.
    :return: `Corpus` instance.
    """
    if isinstance(path, str):
        path = pathlib.Path(path)
    if path.suffix == '.json':
        return cls.from_cldf(Dataset.from_metadata(path), spec=spec)
    # We are given only an ExampleTable. Let's create the appropriate dataset:
    header = None
    for d in reader(path, dicts=True):
        header = list(d.keys())
        break
    ds = Dataset.from_metadata(
        pathlib.Path(pycldf.__file__).parent / 'modules' / 'Generic-metadata.json')
    ds.tablegroup._fname = path.parent / 'cldf-metadata.json'
    t = ds.add_component('ExampleTable')
    t.url = Link(path.name)
    default_cols = [col.name for col in t.tableSchema.columns]
    ds.remove_columns(t, *list(set(default_cols) - set(header)))
    ds.add_columns(t, *list(set(header) - set(default_cols)))
    return cls.from_cldf(ds, spec=spec)

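# Hypothetical usage sketch for the classmethod above; it assumes the method is bound to a
# `Corpus` class (as in pyigt) and the file names below are purely illustrative:
corpus = Corpus.from_path('cldf-metadata.json')  # full CLDF dataset, via its metadata file
corpus = Corpus.from_path('examples.csv')        # bare ExampleTable with default CLDF column names
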
def read_csv(self, fname, normalize=None, **kw) -> list:
    """
    Read CSV data from a file.
    """
    if not normalize:
        return list(dsv.reader(self._path(fname), **kw))
    if kw.get('dicts'):
        return [
            collections.OrderedDict(
                [(k, unicodedata.normalize(normalize, v)) for k, v in row.items()])
            for row in dsv.reader(self._path(fname), **kw)
        ]
    else:
        return [
            [unicodedata.normalize(normalize, k) for k in row]
            for row in dsv.reader(self._path(fname), **kw)
        ]

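# Hypothetical usage, assuming the method above is defined on a directory wrapper (e.g.
# cldfbench's DataDir) exposed here as `data_dir`, and that 'forms.csv' exists in that directory:
rows = data_dir.read_csv('forms.csv', dicts=True, normalize='NFC')  # list of dicts with NFC-normalized values
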
def read_cldf_languages(url):  # pragma: no cover
    r = requests.get(url)
    with zipfile.ZipFile(io.BytesIO(r.content)) as zip:
        for member in zip.namelist():
            if member.endswith(MD_SUFFIX):
                break
        else:
            raise ValueError('No metadata file found')
        with zip.open(member) as fp:
            md = json.loads(fp.read().decode('utf8'))
        for table in md['tables']:
            if table.get('dc:conformsTo') == 'http://cldf.clld.org/v1.0/terms.rdf#LanguageTable':
                lurl, schema = table['url'], table['tableSchema']['columns']
                break
        else:
            raise ValueError('No LanguageTable found')
        for member in zip.namelist():
            if member.endswith(lurl):
                with zip.open(member) as fp:
                    return reader([line.strip() for line in fp.readlines()], dicts=True), schema
        else:
            raise ValueError('LanguageTable url not found in zip')

def test_sndcmp2(sndcmp2_dataset, mocker):
    sndcmp2_dataset.cmd_create_ref_etc_files(mocker.MagicMock())
    csv = sndcmp2_dataset.raw_dir / 'concepts.csv'
    res = list(reader(csv, dicts=True))
    assert len(res) == 3
    assert 'Bislama_Gloss' not in res[0]

def _iter_etc(self, what):
    delimiter = '\t'
    path = self.etc_dir / (what + '.tsv')
    if not path.exists():
        delimiter = ','
        path = path.parent / (what + '.csv')
    return reader(path, dicts=True, delimiter=delimiter) if path.exists() else []

def run(args):
    ordered = [d['species'].lower() for d in reader(args.ordered, dicts=True)]
    ranks = ['phylum', 'klass', 'order', 'family', 'genus']
    ordered_ranks = {r: {} for r in ranks}
    seen = {}
    augmented_species = []
    for ex in args.api.experiments:
        species = ex.gbif.cname
        if species not in seen:
            seen[species] = (ex.gbif.classification, ex.species_latin)
            skey = species.lower()
            if skey not in ordered:
                skey = ' '.join(skey.split()[:2])
            if skey not in ordered:
                skey = [n for n in ordered if n.split()[0] == skey.split()[0]]
                if skey:
                    skey = skey[0]
            if skey in ordered:
                augmented_species.append((species, ordered.index(skey)))
            else:
                augmented_species.append((species, len(ordered) + 1))

    for s, i in sorted(augmented_species, key=lambda t: t[1], reverse=True):
        for r in ranks:
            ordered_ranks[r][getattr(seen[s][0], r)] = i

    fully_augmented_species = {
        s: (
            ordered_ranks['phylum'][seen[s][0].phylum],
            ordered_ranks['klass'][seen[s][0].klass],
            ordered_ranks['order'][seen[s][0].order],
            ordered_ranks['family'][seen[s][0].family],
            ordered_ranks['genus'][seen[s][0].genus],
            i,
        )
        for s, i in sorted(augmented_species, key=lambda t: t[1])
    }

    clf = collections.defaultdict(lambda: [-1, None])
    prefix = {}
    for k, _ in sorted(fully_augmented_species.items(), key=lambda i: i[1], reverse=True):
        for j, a in enumerate(ranks):
            if clf[a][1] != getattr(seen[k][0], a):
                for aa in ranks[j + 1:]:
                    clf[aa][0] = -1
                if a == 'genus':
                    # reset prefix index for all deeper taxonomy ranks:
                    clf['species'][0] = -1
                clf[a][0] += 1
                clf[a][1] = getattr(seen[k][0], a)
                node_name = '_'.join(getattr(seen[k][0], aa) for aa in ranks[:j + 1])
                prefix[node_name] = string.ascii_lowercase[clf[a][0]]
        if clf['species'][1] != k:
            clf['species'][0] += 1
            clf['species'][1] = k
            prefix[k.lower()] = string.ascii_lowercase[clf['species'][0]]
    dump(prefix, args.api.path('taxa_sortkeys.json'), indent=4)

def load_glottolog_data(self):
    """
    Loads the Glottolog classification information from the appropriate
    newick file, parses it and stores the required datastructure in
    self.classification.
    """
    # Don't load if the analysis doesn't use it
    if not self.check_glottolog_required():
        return
    # Don't load if we already have - can this really happen?
    if self.glottolog_loaded:
        log.warning('Glottolog data has already been loaded')
        return
    self.glottolog_loaded = True

    self.classifications, glottocode2node, label2name = monophyly.classifications_from_newick(
        str(get_glottolog_data('newick', self.admin.glottolog_release)))

    # Load geographic metadata
    dialects = []
    for t in reader(get_glottolog_data('geo', self.admin.glottolog_release), dicts=True):
        identifiers = [t['glottocode']] + t['isocodes'].split()
        if t['level'] == "dialect":
            dialects.append((t, identifiers))
        if t['macroarea']:
            for id_ in identifiers:
                self.glotto_macroareas[id_] = t['macroarea']
        if t['latitude'] and t['longitude']:
            latlon = (float(t['latitude']), float(t['longitude']))
            for id_ in identifiers:
                self.locations[id_] = latlon

    # Second pass of geographic data to handle dialects, which inherit
    # their parent language's location
    for t, identifiers in dialects:
        failed = False
        if t['glottocode'] not in glottocode2node:  # pragma: no cover
            # This may only happen for newick downloads of older Glottolog releases, where
            # possibly isolates may not be included.
            continue
        node = glottocode2node[t['glottocode']]
        ancestor = node.ancestor
        while label2name[ancestor.name][1] not in self.locations:
            if not ancestor.ancestor:
                # We've hit the root without finding an ancestral node
                # with location data!
                failed = True
                break
            else:
                ancestor = ancestor.ancestor
        if failed:
            continue
        latlon = self.locations[label2name[ancestor.name][1]]
        for id_ in identifiers:
            self.locations[id_] = latlon

def upgrade():
    csv = Path(phoible.__file__).parent.joinpath(
        '..', 'data', 'InventoryID-InternetArchive.csv')
    ia_urls = {row[0]: row[1] for row in reader(csv) if row[1] != 'NA'}

    conn = Connection(op.get_bind())
    for id_, url in ia_urls.items():
        pk = conn.pk(Contribution, id_)
        conn.update(Inventory, dict(internetarchive_url=url), pk=pk)

def test_sndcmp(sndcmp_dataset, mocker):
    sndcmp_dataset.cmd_create_ref_etc_files(mocker.MagicMock())
    assert (sndcmp_dataset.raw_dir / 'languages.csv').exists()
    assert (sndcmp_dataset.raw_dir / 'concepts.csv').exists()
    csv = sndcmp_dataset.raw_dir / 'concepts.csv'
    res = list(reader(csv, dicts=True))
    assert len(res) == 3
    assert 'Bislama_Gloss' in res[0]
    assert res[0]["IndexInSource"] == '1-0'

def rewrite(self, fname, v):
    rows = list(dsv.reader(self.raw_dir / fname, dicts=True))
    with dsv.UnicodeWriter(self.raw_dir / fname) as w:
        for i, row in enumerate(rows):
            if i == 0:
                w.writerow(row.keys())
            res = v(row)
            if res:
                w.writerow(res.values())

def experiments(self):
    gbif = load(self.path('gbif.json'))
    res = [
        Experiment.from_dict(d, self.sources)
        for d in list(dsv.reader(self.path('data.Sheet1.csv'), dicts=True))[1:]
    ]
    for ex in res:
        key, md = gbif.get(ex.species_latin, (None, None))
        if key:
            ex.gbif = GBIF(key=key, metadata=md)
    return res

def iter_languages(api):  # pragma: no coverage
    meds = {
        row['Language_ID']: row['Value']
        for row in reader(GLOTTOLOG_VENV / 'glottolog-cldf' / 'cldf' / 'values.csv', dicts=True)
        if row['Parameter_ID'] == 'med'
    }
    for l in api.languoids():
        if l.level == api.languoid_levels.language and not l.category.startswith('Pseudo'):
            yield Language(l, meds.get(l.id))

def run(args):
    dicts = list(dsv.reader(
        get_conceptlist(args, path_only=True), delimiter="\t", dicts=True))
    out_dict = collections.OrderedDict()
    for d in dicts:
        out_dict[d[args.column]] = list(d.values())
    with dsv.UnicodeWriter(args.output, delimiter='\t') as w:
        w.writerow(dicts[0].keys())
        w.writerows(out_dict.values())
    if not args.output:
        print(w.read().decode('utf8'))

def run(args):  # pragma: no cover
    for p in pathlib.Path(args.groupings).glob('*.csv'):
        groupings = {r['Feature_ID']: r for r in reader(p, dicts=True)}
        new_features = []
        for feature in args.repos.ordered_features:
            for k, v in groupings[feature.id].items():
                if k != 'Feature_ID' and k not in feature:
                    feature[k] = v
            new_features.append(feature)
        args.repos.gb20.save(new_features)

def read_data(folder, fname, grapheme_col, *cols):
    data, sounds, names = defaultdict(list), [], []
    for row in reader(pkg_path(folder, fname), delimiter='\t', dicts=True):
        grapheme = {"grapheme": row[grapheme_col]}
        for col in cols:
            grapheme[col.lower()] = row[col]
        data[row['BIPA_GRAPHEME']].append(grapheme)
        data[row['CLTS_NAME']].append(grapheme)
        sounds.append(row['BIPA_GRAPHEME'])
        names.append(row['CLTS_NAME'])
    return data, sounds, names

def iter_inventories(p):
    """
    Read the raw PHOIBLE data file, splitting rows into inventory, language and phoneme
    information and grouping the data by inventory.
    """
    for iid, rows in itertools.groupby(
        sorted(dsv.reader(p, dicts=True), key=lambda r: int(r['InventoryID'])),
        lambda r: r['InventoryID'],
    ):
        rows = list(rows)
        yield iid, Doculect.from_row(rows[0]), [Phoneme.from_row(row) for row in rows]

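# Hypothetical usage of the generator above; 'phoible.csv' stands for an illustrative local
# copy of the raw PHOIBLE data file:
for inventory_id, doculect, phonemes in iter_inventories('phoible.csv'):
    print(inventory_id, len(phonemes))
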
def iter_samples(self, repos: 'GEOROC', stdout=False) -> typing.Generator['Sample', None, None]:
    from pygeoroc import errata
    lines = itertools.takewhile(
        lambda l: not (l.startswith('Abbreviations') or l.startswith('References:')),
        self.iter_lines(repos))
    for i, row in enumerate(dsv.reader(lines, dicts=True), start=2):
        try:
            sample = Sample.from_row(row)
        except:  # pragma: no cover  # noqa: E722
            print('{}:{}'.format(self.name, i))
            raise
        errata.fix(sample, self, repos, stdout=stdout)
        yield sample

def check(self, clts=None, log=None, ipa_col=IPA_COLUMN):
    """
    Check a profile for consistency, logging problems.

    For each grapheme, raise:
    - a warning if there are duplicate entries
    - an error if there are inconsistencies
    - an error if the mapping has invalid BIPA
    """
    mapping = collections.defaultdict(list)
    if self.fname:
        # We read the profile from disk because segments.Profile already skips duplicate
        # graphemes, which we want to investigate more closely.
        for spec in dsv.reader(self.fname, dicts=True, delimiter='\t'):
            mapping[spec[self.GRAPHEME_COL]].append(spec[ipa_col])

    for grapheme in mapping:
        # check mapping consistency
        if len(mapping[grapheme]) >= 2:
            if len(set(mapping[grapheme])) == 1:
                log_or_raise(
                    "Duplicate, redundant entry or entries for grapheme [{}].".format(grapheme),
                    log=log,
                    level='warning')
            else:
                log_or_raise(
                    "Inconsistent entries for grapheme [{}]: multiple mappings {}.".format(
                        grapheme, str(mapping[grapheme])),
                    log=log,
                    level='error')

        # check BIPA consistency
        if clts:
            for value in mapping[grapheme]:
                if value:
                    # check for unknown sounds
                    unknown = [
                        isinstance(clts.bipa[segment], pyclts.models.UnknownSound)
                        for segment in ipa2tokens(value)
                        if segment and segment != 'NULL'
                    ]
                    if any(unknown):
                        log_or_raise(
                            "Mapping [{}] ({}) -> [{}] ({}) includes an unknown sound.".format(
                                grapheme,
                                unicode2codepointstr(grapheme),
                                value,
                                unicode2codepointstr(value)),
                            log=log,
                            level='error')

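# Hypothetical invocation of the check above; `profile` stands for an instance of whatever
# class defines `check`, and 'path/to/clts' is an illustrative path to a local CLTS clone:
import logging
from pyclts import CLTS

profile.check(clts=CLTS('path/to/clts'), log=logging.getLogger(__name__))
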
def pytest_generate_tests(metafunc):
    if 'test_sounds' == metafunc.function.__name__:
        fixturenames = None
        tests = []
        for i, test in enumerate(reader(
                Path(__file__).parent / 'data' / 'test_data.tsv', delimiter='\t', dicts=True)):
            if i == 0:
                fixturenames = list(test.keys())
                fixturenames.pop(fixturenames.index('bipa'))
            del test['bipa']
            if None in test:
                del test[None]
            if len(fixturenames) != len(test.keys()):
                raise ValueError(set(test.keys()) - set(fixturenames))
            tests.append(test)

        attrs = ['nfd-normalized', 'clts-normalized', 'aliased', 'generated', 'stressed']
        tests = sorted(tests, key=lambda t: tuple([t[a] for a in attrs]))
        batches = []
        for _, ts in groupby(tests, lambda t: tuple([t[a] for a in attrs])):
            for test in ts:
                batches.append(tuple(test.values()))
                break

        metafunc.parametrize(','.join(n.replace('-', '_') for n in fixturenames), batches)
    elif 'test_clicks' == metafunc.function.__name__:
        tests = []
        for test in reader(
                Path(__file__).parent / 'data' / 'clicks.tsv', delimiter='\t', dicts=True):
            tests.append((test['GRAPHEME'], test['MANNER']))
        metafunc.parametrize('grapheme,gtype', tests)

def get_wordlist(path, delimiter=",", quotechar='"', normalization_form="NFC", **keywords):
    """
    Load a wordlist from a normal CSV file.

    Parameters
    ----------
    path : str
        The path to your CSV file.
    delimiter : str
        The delimiter in the CSV file.
    quotechar : str
        The quote character in your data.
    row : str (default = "concept")
        A string indicating the name of the row that shall be taken as the basis for the
        tabular representation of the word list.
    col : str (default = "doculect")
        A string indicating the name of the column that shall be taken as the basis for the
        tabular representation of the word list.
    conf : str (default = '')
        A string defining the path to the configuration file.

    Notes
    -----
    This function returns a :py:class:`~lingpy.basic.wordlist.Wordlist` object. In contrast to
    the normal way to load a wordlist from a tab-separated file, this allows you to directly
    load a wordlist from any "normal" CSV file, with your own specified delimiters and quote
    characters. If the first cell in the first row of your CSV file is not named "ID", the
    integer identifiers required by LingPy will be created automatically.
    """
    kw = dict(conf="", col="doculect", row="concept")
    kw.update(keywords)
    data = list(dsv.reader(path, delimiter=delimiter, quotechar=quotechar))
    header = [h.lower() for h in data[0]]
    data = data[1:]
    D = {}
    # Note: the header has been lowercased, so compare against 'id', not 'ID'.
    if header[0] == 'id':
        D[0] = header[1:]
        for row in data:
            D[row[0]] = [normalize(normalization_form, n) for n in row[1:]]
    else:
        D[0] = header
        for idx, row in enumerate(data):
            D[idx + 1] = row
    return Wordlist(D, row=kw['row'].lower(), col=kw['col'].lower())

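# A minimal usage sketch for get_wordlist, assuming a comma-separated file 'wordlist.csv'
# (illustrative name) with at least "doculect" and "concept" columns:
wl = get_wordlist('wordlist.csv')
print(wl.height, wl.width)  # number of concepts (rows) and of doculects (columns)
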
def read(encoding):
    with fname.open(encoding=encoding) as csvfile:
        line = csvfile.readline()
        delimiter = ','
        if ';' in line and ((',' not in line) or (line.index(';') < line.index(','))):
            delimiter = ';'
    for row in dsv.reader(
            fname, delimiter=delimiter, quotechar='"', doublequote=True,
            encoding=encoding, dicts=True):
        yield _normalized_row(row)

def read(fname, data):
    concepts, loan = None, None
    for i, row in enumerate(reader(fname)):
        if i == 0:
            concepts = {j: c for j, c in enumerate(row[1:])}
        else:
            for j, c in enumerate(row[1:]):
                if j % 2 == 0:  # even number
                    loan, form = get_loan_and_form(c)
                else:
                    if form.strip():
                        data[row[0]][concepts[j]] = (form, loan, c)
    return data

def cmd_download(self, args):
    for row in reader(DPLACE_DATA / 'phylogenies' / 'index.csv', dicts=True):
        if not row['id'].startswith('glottolog_'):
            self.raw_dir.joinpath(row['id']).mkdir(exist_ok=True)
            for fname in [
                'posterior.trees',
                'source.bib',
                'summary.trees',
                'taxa.csv',
            ]:
                src = DPLACE_DATA / 'phylogenies' / row['id'] / fname
                if src.exists():
                    shutil.copy(str(src), str(self.raw_dir / row['id'] / fname))

from collections import OrderedDict

from csvw.dsv import reader
from clldutils.jsonlib import dump
from sqlalchemy import create_engine

eth17 = OrderedDict()
for l in reader('LanguageCodes.tab', dicts=True, delimiter='\t'):
    eth17[l['LangID']] = l['Name']

db = create_engine('postgresql://robert@/asjp')
in_asjp = set(
    r[0] for r in db.execute('select code_iso from doculect where code_iso is not null'))

missing = [(k, v) for k, v in eth17.items() if k not in in_asjp]
dump(missing, 'missing.json', indent=4)