Example 1
 def iterrows(core, extended=False):
     res = collections.OrderedDict()
     for row in reader(repos / 'raw' / core, dicts=True):
         res[row['pk']] = row
     if extended:
         for row in reader(repos / 'raw' / extended, dicts=True):
             res[row['pk']].update(row)
     for r in res.values():
         yield typed(r, core)
Example 2
def rename(args):  # pragma: no cover
    api = Concepticon(args.repos)

    from_, to_ = args.args
    assert CONCEPTLIST_ID_PATTERN.match(to_)
    cl = api.conceptlists[from_]

    # write the adapted concept list to the new path:
    with UnicodeWriter(cl.path.parent / cl.path.name.replace(from_, to_),
                       delimiter='\t') as writer:
        header = []
        for i, row in enumerate(reader(cl.path, delimiter='\t')):
            if i == 0:
                header = row
                writer.writerow(row)
                header = {v: k
                          for k, v in enumerate(header)
                          }  # Map col name to row index
            else:
                oid = row[header['ID']]
                assert oid.startswith(from_)
                nid = oid.replace(from_, to_)
                api.add_retirement(
                    'Concept', dict(id=oid,
                                    comment='renaming',
                                    replacement=nid))
                row[header['ID']] = nid
                writer.writerow(row)

    # write adapted metadata to the new path:
    fname = cl.path.name.replace(from_, to_) + MD_SUFFIX
    md = jsonlib.load(cl.path.parent / (cl.path.name + MD_SUFFIX),
                      object_pairs_hook=OrderedDict)
    md['tables'][0]['url'] = fname
    jsonlib.dump(md, cl.path.parent / fname, indent=4)

    # remove obsolete concept list and metadata:
    cl.path.unlink()
    cl.path.parent.joinpath(cl.path.name + MD_SUFFIX).unlink()

    # adapt conceptlists.tsv
    rows = []
    for row in reader(api.data_path('conceptlists.tsv'), delimiter='\t'):
        rows.append([col.replace(from_, to_) if col else col for col in row])

    with UnicodeWriter(api.data_path('conceptlists.tsv'),
                       delimiter='\t') as writer:
        writer.writerows(rows)

    api.add_retirement('Conceptlist',
                       dict(id=from_, comment='renaming', replacement=to_))

    print("""Please run
grep -r "{0}" concepticondata/ | grep -v retired.json

to confirm the renaming was complete!""".format(from_))
Example 3
 def iter_sources(self, type=None):
     for src in reader(self.repos / 'sources' / 'index.tsv',
                       dicts=True,
                       delimiter='\t'):
         if (type is None) or (type == src['TYPE']):
             graphemesp = self.repos / 'sources' / src[
                 'NAME'] / 'graphemes.tsv'
             if graphemesp.exists():
                 yield src, list(
                     reader(graphemesp, dicts=True, delimiter='\t'))
Example 4
def rows_and_sourcesheets(sheet, active):
    allrows, sourcesheets = [], []

    def clean_row(row):
        sourcesheets.extend(row['Sheet'].split())
        coders = '-'.join(s.split('_')[0] for s in row['Sheet'].split())
        coders = coders.split('-')
        del row['Sheet']
        if row.get('Contributed_Datapoint'):
            row['Contributed_Datapoint'] += ' ' + ' '.join(coders)
        else:
            row['Contributed_Datapoint'] = ' '.join(coders)
        allrows.append(row)

    #fids = collections.Counter([r['Feature_ID'] for r in reader(sheet, dicts=True, delimiter='\t')])
    #print(len(fids), sum(fids.values()))

    for fid, rows in itertools.groupby(
        sorted(reader(sheet, dicts=True, delimiter='\t'), key=lambda r: r['Feature_ID']),
        lambda r: r['Feature_ID'],
    ):
        rows = list(rows)
        if len(rows) == 1:
            clean_row(rows[0])
        else:
            row = merged_rows(rows, active)
            if row:
                clean_row(row)
    return allrows, set(sourcesheets)
Example 5
 def iter(cls, table=None, cache_dir=None, log=None):
     content = read_url(
         'sites/iso639-3/files/downloads/iso-639-3_Retirements.tab',
         cache_dir=cache_dir,
         log=log)
     for d in table or dsv.reader(content.splitlines(), dicts=True, delimiter='\t'):
         yield cls(**d)
Example 6
def test_get_concepts(concepticon):
    res = util.get_concepts(concepticon.conceptlists.values(), [])
    assert len(res) == 1
    assert 'chinese' in res[0].attributes
    assert res[0].number == '1'

    id_lookup, _ = util.get_ids_and_attrs(
        res, {},
        id_factory=lambda c: c.number + 'x',
        lookup_factory=lambda c: c.number + 'y')
    assert id_lookup['1y'] == '1x'

    id_lookup, _ = util.get_ids_and_attrs(
        res, {'number': 'Number'},
        id_factory=lambda c: c.number + 'x',
        lookup_factory=lambda c: c['Number'] + 'y')
    assert id_lookup['1y'] == '1x'

    csv = Path(
        __file__
    ).parent / 'repos' / 'datasets' / 'test_dataset' / 'etc' / 'concepts.csv'
    res = util.get_concepts([], list(reader(csv, dicts=True)))
    assert len(res) == 2
    assert 'chinese' in res[0].attributes
    assert res[0].number == '1'

    id_lookup, _ = util.get_ids_and_attrs(res, {'chinese': 'chi'},
                                          id_factory=lambda c: c.number + 'x',
                                          lookup_factory=lambda c: c['chi'])
    assert id_lookup['xyz'] == '1x'
Example 7
 def iterupdated(self, languoids):  # pragma: no cover
     res = requests.post('https://query.wikidata.org/sparql',
                         data=dict(query=SPARQL),
                         headers=dict(Accept='text/csv'))
     res = {}
     if self.repos:
         res = {
             d['glottocode']: d
             for d in reader(self.repos.path('build',
                                             'glottocode2wikidata.csv'),
                             dicts=True)
         }
     assert res
     for lang in languoids:
         urls = {
             'www.wikidata.org':
             [res[lang.id]['item'].replace('http:', 'https:')]
             if lang.id in res else [],
             'en.wikipedia.org': [res[lang.id]['wikipedia']] if
             (lang.id in res) and res[lang.id]['wikipedia'] else [],
         }
         if any([lang.update_links(d, u) for d, u in urls.items()]):
             # Note: We must use list comprehension rather than a generator as first argument
             # to `any` to make sure `update_links` is called for each item in urls!
             yield lang
Example 8
    def from_path(cls, path, spec=None):
        """
        Instantiate a corpus from a file path.

        :param path: Either a path to a CLDF dataset's metadata file or to a CLDF Examples \
        component as CSV file. Note that in the latter case, the file must use the default \
        column names, as defined in the CLDF ontology.
        :return: `Corpus` instance.
        """
        if isinstance(path, str):
            path = pathlib.Path(path)
        if path.suffix == '.json':
            return cls.from_cldf(Dataset.from_metadata(path), spec=spec)
        # We are given only an ExampleTable. Let's create the appropriate dataset:
        header = None
        for d in reader(path, dicts=True):
            header = list(d.keys())
            break
        ds = Dataset.from_metadata(
            pathlib.Path(pycldf.__file__).parent / 'modules' / 'Generic-metadata.json')
        ds.tablegroup._fname = path.parent / 'cldf-metadata.json'
        t = ds.add_component('ExampleTable')
        t.url = Link(path.name)
        default_cols = [col.name for col in t.tableSchema.columns]
        ds.remove_columns(t, *list(set(default_cols) - set(header)))
        ds.add_columns(t, *list(set(header) - set(default_cols)))
        return cls.from_cldf(ds, spec=spec)
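The docstring above accepts either a CLDF metadata file or a bare ExampleTable CSV, so both calls below are valid. This is only a usage sketch: the paths are placeholders, and it assumes the enclosing class (a Corpus, per the docstring) is importable from its package.

corpus = Corpus.from_path('tests/fixtures/cldf-metadata.json')  # full CLDF dataset via its metadata file
corpus = Corpus.from_path('tests/fixtures/examples.csv')        # bare ExampleTable using the default column names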
Example 9
 def read_csv(self, fname, normalize=None, **kw) -> list:
     """
     Read CSV data from a file.
     """
     if not normalize:
         return list(dsv.reader(self._path(fname), **kw))
     if kw.get('dicts'):
         return [
             collections.OrderedDict([(k,
                                       unicodedata.normalize(normalize, v))
                                      for k, v in row.items()])
             for row in dsv.reader(self._path(fname), **kw)
         ]
     else:
         return [[unicodedata.normalize(normalize, k) for k in row]
                 for row in dsv.reader(self._path(fname), **kw)]
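Since normalize is passed straight to unicodedata.normalize, it accepts the usual form names ('NFC', 'NFD', 'NFKC', 'NFKD'). A minimal sketch of typical calls, with the receiver object and the file name assumed for illustration:

rows = api.read_csv('languages.csv', dicts=True)                   # cell values exactly as stored on disk
rows = api.read_csv('languages.csv', dicts=True, normalize='NFC')  # the same rows with NFC-normalized cell values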
Example 10
def read_cldf_languages(url):  # pragma: no cover
    r = requests.get(url)
    with zipfile.ZipFile(io.BytesIO(r.content)) as zip:
        for member in zip.namelist():
            if member.endswith(MD_SUFFIX):
                break
        else:
            raise ValueError('No metadata file found')

        with zip.open(member) as fp:
            md = json.loads(fp.read().decode('utf8'))

        for table in md['tables']:
            if table.get(
                    'dc:conformsTo'
            ) == 'http://cldf.clld.org/v1.0/terms.rdf#LanguageTable':
                lurl, schema = table['url'], table['tableSchema']['columns']
                break
        else:
            raise ValueError('No LanguageTable found')

        for member in zip.namelist():
            if member.endswith(lurl):
                with zip.open(member) as fp:
                    return reader([line.strip() for line in fp.readlines()],
                                  dicts=True), schema
        else:
            raise ValueError('LanguageTable url not found in zip')
Example 11
def test_sndcmp2(sndcmp2_dataset, mocker):

    sndcmp2_dataset.cmd_create_ref_etc_files(mocker.MagicMock())
    csv = sndcmp2_dataset.raw_dir / 'concepts.csv'
    res = list(reader(csv, dicts=True))
    assert len(res) == 3
    assert 'Bislama_Gloss' not in res[0]
Example 12
 def _iter_etc(self, what):
     delimiter = '\t'
     path = self.etc_dir / (what + '.tsv')
     if not path.exists():
         delimiter = ','
         path = path.parent / (what + '.csv')
     return reader(path, dicts=True,
                   delimiter=delimiter) if path.exists() else []
Example 13
def run(args):
    ordered = [d['species'].lower() for d in reader(args.ordered, dicts=True)]
    ranks = ['phylum', 'klass', 'order', 'family', 'genus']

    ordered_ranks = {r: {} for r in ranks}
    seen = {}
    augmented_species = []
    for ex in args.api.experiments:
        species = ex.gbif.cname
        if species not in seen:
            seen[species] = (ex.gbif.classification, ex.species_latin)
            skey = species.lower()
            if skey not in ordered:
                skey = ' '.join(skey.split()[:2])
            if skey not in ordered:
                skey = [n for n in ordered if n.split()[0] == skey.split()[0]]
                if skey:
                    skey = skey[0]

            if skey in ordered:
                augmented_species.append((species, ordered.index(skey)))
            else:
                augmented_species.append((species, len(ordered) + 1))

    for s, i in sorted(augmented_species, key=lambda t: t[1], reverse=True):
        for r in ranks:
            ordered_ranks[r][getattr(seen[s][0], r)] = i

    fully_augmented_species = {
        s: (ordered_ranks['phylum'][seen[s][0].phylum],
            ordered_ranks['klass'][seen[s][0].klass],
            ordered_ranks['order'][seen[s][0].order],
            ordered_ranks['family'][seen[s][0].family],
            ordered_ranks['genus'][seen[s][0].genus], i)
        for s, i in sorted(augmented_species, key=lambda t: t[1])
    }
    clf = collections.defaultdict(lambda: [-1, None])
    prefix = {}
    for k, _ in sorted(fully_augmented_species.items(),
                       key=lambda i: i[1],
                       reverse=True):
        for j, a in enumerate(ranks):
            if clf[a][1] != getattr(seen[k][0], a):
                for aa in ranks[j + 1:]:
                    clf[aa][0] = -1
                if a == 'genus':
                    # reset prefix index for all deeper taxonomy ranks:
                    clf['species'][0] = -1
                clf[a][0] += 1
                clf[a][1] = getattr(seen[k][0], a)
                node_name = '_'.join(
                    getattr(seen[k][0], aa) for aa in ranks[:j + 1])
                prefix[node_name] = string.ascii_lowercase[clf[a][0]]
        if clf['species'][1] != k:
            clf['species'][0] += 1
            clf['species'][1] = k
            prefix[k.lower()] = string.ascii_lowercase[clf['species'][0]]
    dump(prefix, args.api.path('taxa_sortkeys.json'), indent=4)
Example 14
    def load_glottolog_data(self):
        """
        Loads the Glottolog classification information from the appropriate
        newick file, parses it and stores the required data structure in
        self.classifications.
        """
        # Don't load if the analysis doesn't use it
        if not self.check_glottolog_required():
            return
        # Don't load if we already have - can this really happen?
        if self.glottolog_loaded:
            log.warning('Glottolog data has already been loaded')
            return
        self.glottolog_loaded = True

        self.classifications, glottocode2node, label2name = monophyly.classifications_from_newick(
            str(get_glottolog_data('newick', self.admin.glottolog_release)))

        # Load geographic metadata
        dialects = []
        for t in reader(get_glottolog_data('geo',
                                           self.admin.glottolog_release),
                        dicts=True):
            identifiers = [t['glottocode']] + t['isocodes'].split()
            if t['level'] == "dialect":
                dialects.append((t, identifiers))
            if t['macroarea']:
                for id_ in identifiers:
                    self.glotto_macroareas[id_] = t['macroarea']

            if t['latitude'] and t['longitude']:
                latlon = (float(t['latitude']), float(t['longitude']))
                for id_ in identifiers:
                    self.locations[id_] = latlon

        # Second pass of geographic data to handle dialects, which inherit
        # their parent language's location
        for t, identifiers in dialects:
            failed = False
            if t['glottocode'] not in glottocode2node:  # pragma: no cover
                # This may only happen for newick downloads of older Glottolog releases, where
                # possibly isolates may not be included.
                continue
            node = glottocode2node[t['glottocode']]
            ancestor = node.ancestor
            while label2name[ancestor.name][1] not in self.locations:
                if not ancestor.ancestor:
                    # We've hit the root without finding an ancestral node
                    # with location data!
                    failed = True
                    break
                else:
                    ancestor = ancestor.ancestor
            if failed:
                continue
            latlon = self.locations[label2name[ancestor.name][1]]
            for id_ in identifiers:
                self.locations[id_] = latlon
Example 15
def upgrade():
    csv = Path(phoible.__file__).parent.joinpath(
        '..', 'data', 'InventoryID-InternetArchive.csv')
    ia_urls = {row[0]: row[1] for row in reader(csv) if row[1] != 'NA'}

    conn = Connection(op.get_bind())
    for id_, url in ia_urls.items():
        pk = conn.pk(Contribution, id_)
        conn.update(Inventory, dict(internetarchive_url=url), pk=pk)
Example 16
def test_sndcmp(sndcmp_dataset, mocker):
    sndcmp_dataset.cmd_create_ref_etc_files(mocker.MagicMock())
    assert (sndcmp_dataset.raw_dir / 'languages.csv').exists()
    assert (sndcmp_dataset.raw_dir / 'concepts.csv').exists()
    csv = sndcmp_dataset.raw_dir / 'concepts.csv'
    res = list(reader(csv, dicts=True))
    assert len(res) == 3
    assert 'Bislama_Gloss' in res[0]
    assert res[0]["IndexInSource"] == '1-0'
Example 17
    def rewrite(self, fname, v):
        rows = list(dsv.reader(self.raw_dir / fname, dicts=True))

        with dsv.UnicodeWriter(self.raw_dir / fname) as w:
            for i, row in enumerate(rows):
                if i == 0:
                    w.writerow(row.keys())
                res = v(row)
                if res:
                    w.writerow(res.values())
Example 18
 def experiments(self):
     gbif = load(self.path('gbif.json'))
     res = [
         Experiment.from_dict(d, self.sources) for d in list(
             dsv.reader(self.path('data.Sheet1.csv'), dicts=True))[1:]
     ]
     for ex in res:
         key, md = gbif.get(ex.species_latin, (None, None))
         if key:
             ex.gbif = GBIF(key=key, metadata=md)
     return res
Example 19
def iter_languages(api):  # pragma: no coverage
    meds = {
        row['Language_ID']: row['Value']
        for row in reader(GLOTTOLOG_VENV / 'glottolog-cldf' / 'cldf' /
                          'values.csv',
                          dicts=True) if row['Parameter_ID'] == 'med'
    }
    for l in api.languoids():
        if l.level == api.languoid_levels.language and not l.category.startswith(
                'Pseudo'):
            yield Language(l, meds.get(l.id))
Example 20
def run(args):
    dicts = list(dsv.reader(get_conceptlist(args, path_only=True), delimiter="\t", dicts=True))
    out_dict = collections.OrderedDict()

    for d in dicts:
        out_dict[d[args.column]] = list(d.values())

    with dsv.UnicodeWriter(args.output, delimiter='\t') as w:
        w.writerow(dicts[0].keys())
        w.writerows(out_dict.values())
    if not args.output:
        print(w.read().decode('utf8'))
Example 21
def run(args):  # pragma: no cover
    for p in pathlib.Path(args.groupings).glob('*.csv'):
        groupings = {r['Feature_ID']: r for r in reader(p, dicts=True)}
        new_features = []
        for feature in args.repos.ordered_features:
            for k, v in groupings[feature.id].items():
                if k != 'Feature_ID':
                    if k not in feature:
                        feature[k] = v
            new_features.append(feature)
    args.repos.gb20.save(new_features)
Example 22
def read_data(folder, fname, grapheme_col, *cols):
    data, sounds, names = defaultdict(list), [], []

    for row in reader(pkg_path(folder, fname), delimiter='\t', dicts=True):
        grapheme = {"grapheme": row[grapheme_col]}
        for col in cols:
            grapheme[col.lower()] = row[col]
        data[row['BIPA_GRAPHEME']].append(grapheme)
        data[row['CLTS_NAME']].append(grapheme)
        sounds.append(row['BIPA_GRAPHEME'])
        names.append(row['CLTS_NAME'])

    return data, sounds, names
Example 23
def iter_inventories(p):
    """
    Read the raw PHOIBLE data file, splitting rows into inventory, language and phoneme information
    and grouping the data by inventory.
    """
    for iid, rows in itertools.groupby(
            sorted(dsv.reader(p, dicts=True),
                   key=lambda r: int(r['InventoryID'])),
            lambda r: r['InventoryID'],
    ):
        rows = list(rows)
        yield iid, Doculect.from_row(
            rows[0]), [Phoneme.from_row(row) for row in rows]
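Each yielded item is a triple of inventory ID, Doculect and list of Phoneme objects, so consuming the generator might look like this; the path to the raw PHOIBLE CSV is a placeholder:

for iid, doculect, phonemes in iter_inventories('raw/phoible.csv'):
    print(iid, doculect, len(phonemes))  # one line per inventory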
Example 24
 def iter_samples(self, repos: 'GEOROC', stdout=False) -> typing.Generator['Sample', None, None]:
     from pygeoroc import errata
     lines = itertools.takewhile(
         lambda l: not (l.startswith('Abbreviations') or l.startswith('References:')),
         self.iter_lines(repos))
     for i, row in enumerate(dsv.reader(lines, dicts=True), start=2):
         try:
             sample = Sample.from_row(row)
         except:  # pragma: no cover # noqa: E722
             print('{}:{}'.format(self.name, i))
             raise
         errata.fix(sample, self, repos, stdout=stdout)
         yield sample
Example 25
    def check(self, clts=None, log=None, ipa_col=IPA_COLUMN):
        """
        Check a profile for consistency, logging problems.

        For each grapheme, log or raise:
        - a warning if there are duplicate entries
        - an error if there are inconsistencies
        - an error if the mapping has invalid BIPA
        """
        mapping = collections.defaultdict(list)
        if self.fname:
            # We read the profile from disk because segments.Profile already skips duplicate
            # graphemes, which we want to investigate more closely.
            for spec in dsv.reader(self.fname, dicts=True, delimiter='\t'):
                mapping[spec[self.GRAPHEME_COL]].append(spec[ipa_col])

        for grapheme in mapping:
            # check mapping consistency
            if len(mapping[grapheme]) >= 2:
                if len(set(mapping[grapheme])) == 1:
                    log_or_raise(
                        "Duplicate, redundant entry or entries for grapheme [{}]."
                        .format(grapheme),
                        log=log,
                        level='warning')
                else:
                    log_or_raise(
                        "Inconsist entries for grapheme [{}]: multiple mappings {}."
                        .format(grapheme, str(mapping[grapheme])),
                        log=log,
                        level='error')

            # check BIPA consistency
            if clts:
                for value in mapping[grapheme]:
                    if value:
                        # check for unknown sounds
                        unknown = [
                            isinstance(clts.bipa[segment],
                                       pyclts.models.UnknownSound)
                            for segment in ipa2tokens(value)
                            if segment and segment != 'NULL'
                        ]
                        if any(unknown):
                            log_or_raise(
                                "Mapping [{}] ({}) -> [{}] ({}) includes an unknown sound."
                                .format(grapheme,
                                        unicode2codepointstr(grapheme), value,
                                        unicode2codepointstr(value)),
                                log=log,
                                level='error')
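A sketch of how this check might be driven, assuming pyclts' CLTS wrapper (which provides the bipa attribute used above), a standard library logger, and a profile instance of the class the method belongs to; the data path is a placeholder:

import logging

from pyclts import CLTS

profile.check(clts=CLTS('path/to/clts-data'), log=logging.getLogger(__name__))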
Example 26
def pytest_generate_tests(metafunc):
    if 'test_sounds' == metafunc.function.__name__:
        fixturenames = None
        tests = []
        for i, test in enumerate(
                reader(Path(__file__).parent / 'data' / 'test_data.tsv',
                       delimiter='\t',
                       dicts=True)):
            if i == 0:
                fixturenames = list(test.keys())
                fixturenames.pop(fixturenames.index('bipa'))
            del test['bipa']
            if None in test:
                del test[None]
            if len(fixturenames) != len(test.keys()):
                raise ValueError(set(test.keys()) - set(fixturenames))
            tests.append(test)

        attrs = [
            'nfd-normalized', 'clts-normalized', 'aliased', 'generated',
            'stressed'
        ]
        tests = sorted(tests, key=lambda t: tuple([t[a] for a in attrs]))
        batches = []
        for _, ts in groupby(tests, lambda t: tuple([t[a] for a in attrs])):
            for test in ts:
                batches.append(tuple(test.values()))
                break

        metafunc.parametrize(
            ','.join(n.replace('-', '_') for n in fixturenames), batches)
    elif 'test_clicks' == metafunc.function.__name__:
        tests = []
        for test in reader(Path(__file__).parent / 'data' / 'clicks.tsv',
                           delimiter='\t',
                           dicts=True):
            tests.append((test['GRAPHEME'], test['MANNER']))
        metafunc.parametrize('grapheme,gtype', tests)
Example 27
def get_wordlist(path,
                 delimiter=",",
                 quotechar='"',
                 normalization_form="NFC",
                 **keywords):
    """
    Load a wordlist from a normal CSV file.

    Parameters
    ----------
    path : str
        The path to your CSV file.
    delimiter : str
        The delimiter in the CSV file.
    quotechar : str
        The quote character in your data.
    row : str (default = "concept")
        A string indicating the name of the row that shall be taken as the
        basis for the tabular representation of the word list.
    col : str (default = "doculect")
        A string indicating the name of the column that shall be taken as the
        basis for the tabular representation of the word list.
    conf : string (default='')
        A string defining the path to the configuration file.
    
    Notes
    -----
    This function returns a :py:class:`~lingpy.basic.wordlist.Wordlist` object.
    In contrast to the normal way of loading a wordlist from a tab-separated
    file, this allows you to load a wordlist directly from any "normal" CSV
    file, with your own delimiters and quote characters. If the first cell in
    the first row of your CSV file is not named "ID", the integer identifiers
    required by LingPy will be created automatically.

    """
    kw = dict(conf="", col="doculect", row="concept")
    kw.update(keywords)
    data = list(dsv.reader(path, delimiter=delimiter, quotechar=quotechar))
    header = [h.lower() for h in data[0]]
    data = data[1:]
    D = {}
    if header[0] == 'id':  # the header was lowercased above
        D[0] = header[1:]
        for row in data:
            D[row[0]] = [normalize(normalization_form, n) for n in row[1:]]
    else:
        D[0] = header
        for idx, row in enumerate(data):
            D[idx + 1] = row
    return Wordlist(D, row=kw['row'].lower(), col=kw['col'].lower())
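A minimal usage sketch for the function above; the file name is a placeholder and the keyword values shown are just the documented defaults. In lingpy, the resulting Wordlist reports its dimensions via height (concepts) and width (doculects):

wl = get_wordlist('forms.csv', delimiter=',', row='concept', col='doculect')
print(wl.height, wl.width)  # number of concepts and number of doculects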
Example 28
 def read(encoding):
     with fname.open(encoding=encoding) as csvfile:
         line = csvfile.readline()
         delimiter = ','
         if ';' in line and ((',' not in line) or
                             (line.index(';') < line.index(','))):
             delimiter = ';'
     for row in dsv.reader(fname,
                           delimiter=delimiter,
                           quotechar='"',
                           doublequote=True,
                           encoding=encoding,
                           dicts=True):
         yield _normalized_row(row)
Example 29
def read(fname, data):
    concepts, loan = None, None

    for i, row in enumerate(reader(fname)):
        if i == 0:
            concepts = {j: c for j, c in enumerate(row[1:])}
        else:
            for j, c in enumerate(row[1:]):
                if j % 2 == 0:  # even number
                    loan, form = get_loan_and_form(c)
                else:
                    if form.strip():
                        data[row[0]][concepts[j]] = (form, loan, c)
    return data
Example 30
 def cmd_download(self, args):
     for row in reader(DPLACE_DATA / 'phylogenies' / 'index.csv',
                       dicts=True):
         if not row['id'].startswith('glottolog_'):
             self.raw_dir.joinpath(row['id']).mkdir(exist_ok=True)
             for fname in [
                     'posterior.trees',
                     'source.bib',
                     'summary.trees',
                     'taxa.csv',
             ]:
                 src = DPLACE_DATA / 'phylogenies' / row['id'] / fname
                 if src.exists():
                     shutil.copy(str(src),
                                 str(self.raw_dir / row['id'] / fname))
Example 31
from collections import OrderedDict

from csvw.dsv import reader
from clldutils.jsonlib import dump
from sqlalchemy import create_engine


eth17 = OrderedDict()
for l in reader('LanguageCodes.tab', dicts=True, delimiter='\t'):
    eth17[l['LangID']] = l['Name']

db = create_engine('postgresql://robert@/asjp')
in_asjp = set(r[0] for r in db.execute('select code_iso from doculect where code_iso is not null'))

missing = [(k, v) for k, v in eth17.items() if k not in in_asjp]
dump(missing, 'missing.json', indent=4)