def _make_package(args):  # pragma: no cover
    """Prepare transcriptiondata from the transcription sources."""
    from lingpy.sequence.sound_classes import token2class
    from lingpy.data import Model

    columns = ['LATEX', 'FEATURES', 'SOUND', 'IMAGE', 'COUNT', 'NOTE']
    bipa = TranscriptionSystem('bipa')
    for src, rows in args.repos.iter_sources(type='td'):
        args.log.info('TranscriptionData {0} ...'.format(src['NAME']))
        uritemplate = URITemplate(src['URITEMPLATE']) if src['URITEMPLATE'] else None
        out = [[
            'BIPA_GRAPHEME', 'CLTS_NAME', 'GENERATED', 'EXPLICIT',
            'GRAPHEME', 'URL'] + columns]
        graphemes = set()
        for row in rows:
            if row['GRAPHEME'] in graphemes:
                args.log.warn('skipping duplicate grapheme: {0}'.format(row['GRAPHEME']))
                continue
            graphemes.add(row['GRAPHEME'])
            if not row['BIPA']:
                bipa_sound = bipa[row['GRAPHEME']]
                explicit = ''
            else:
                bipa_sound = bipa[row['BIPA']]
                explicit = '+'
            generated = '+' if bipa_sound.generated else ''
            if is_valid_sound(bipa_sound, bipa):
                bipa_grapheme = bipa_sound.s
                bipa_name = bipa_sound.name
            else:
                bipa_grapheme, bipa_name = '<NA>', '<NA>'
            url = uritemplate.expand(**row) if uritemplate else row.get('URL', '')
            out.append(
                [bipa_grapheme, bipa_name, generated, explicit, row['GRAPHEME'], url]
                + [row.get(c, '') for c in columns])
        found = len([o for o in out if o[0] != '<NA>'])
        args.log.info('... {0} of {1} graphemes found ({2:.0f}%)'.format(
            found, len(out), found / len(out) * 100))
        with UnicodeWriter(
                pkg_path('transcriptiondata', '{0}.tsv'.format(src['NAME'])),
                delimiter='\t') as writer:
            writer.writerows(out)

    count = 0
    with UnicodeWriter(pkg_path('soundclasses', 'lingpy.tsv'), delimiter='\t') as writer:
        writer.writerow(['CLTS_NAME', 'BIPA_GRAPHEME'] + SOUNDCLASS_SYSTEMS)
        for grapheme, sound in sorted(bipa.sounds.items()):
            if not sound.alias:
                writer.writerow(
                    [sound.name, grapheme]
                    + [token2class(grapheme, Model(cls)) for cls in SOUNDCLASS_SYSTEMS])
                count += 1
    args.log.info('SoundClasses: {0} written to file.'.format(count))
def before(self, req, fp):
    self.writer = UnicodeWriter(fp)
    self.writer.__enter__()
    self.writer.writerow([
        f if isinstance(f, string_types) else f[1]
        for f in self.get_fields(req)])
def writerow(self, row):
    if not self.header_written:
        UnicodeWriter.writerow(
            self, [col.name for col in self.table.schema.columns.values()])
        self.header_written = True
    if not isinstance(row, Row):
        row = Row.from_list(self.table.schema, row)
    else:
        assert row.schema == self.table.schema
    UnicodeWriter.writerow(self, row.to_list())
def __init__(self, table, container=None, **kw):
    self.table, kw['delimiter'] = _table_and_delimiter(table)
    if isinstance(container, Archive):
        f = None
    elif isinstance(container, Path):
        f = container.joinpath(self.table.url)
    else:
        f = self.table.url  # pragma: no cover
    self.container = container
    self.header_written = not self.table.dialect.header
    UnicodeWriter.__init__(self, f, **kw)
def iso2codes(args):
    from clldutils.dsv import UnicodeWriter

    nodes = list(args.repos.languoids())
    res = {}
    for node in nodes:
        if node.iso:
            res[node.id] = (node.iso, set())

    for node in nodes:
        if node.level == Level.family or node.id in res:
            continue
        for nid in res:
            matched = False
            for l in node.lineage:
                if l[1] == nid:
                    res[nid][1].add(node.id)
                    matched = True
                    break
            if matched:
                break

    with UnicodeWriter('iso2glottocodes.csv') as writer:
        writer.writerow(['iso', 'glottocodes'])
        for gc, (iso, gcs) in res.items():
            writer.writerow([iso, ';'.join([gc] + list(gcs))])
def get_values(self, p, language_url_pattern):
    q = DBSession.query(Value).join(Value.valueset)\
        .filter(ValueSet.parameter_pk == p.pk)\
        .options(
            joinedload(Value.valueset, ValueSet.language),
            joinedload(Value.valueset, ValueSet.contribution),
            joinedload(Value.domainelement),
            joinedload_all(Value.valueset, ValueSet.references, ValueSetReference.source))\
        .order_by(ValueSet.parameter_pk, ValueSet.language_pk, Value.pk)

    with UnicodeWriter() as writer:
        writer.writerow([
            'ID',
            'Language_ID',
            'Parameter_ID',
            'Contribution_ID',
            'Value',
            'Source',
            'Comment',
        ])
        for v in page_query(q):
            writer.writerow([
                v.id,
                language_url_pattern.format(v.valueset.language.id),
                p.id,
                v.valueset.contribution.id,
                v.domainelement.name if v.domainelement else v.name,
                ';'.join(self.format_sources(v)),
                getattr(v, 'comment', v.valueset.source) or '',
            ])
    return writer.read()
def iso2codes(args):
    """
    Map ISO codes to the list of all Glottolog languages and dialects subsumed
    "under" it.
    """
    from clldutils.dsv import UnicodeWriter

    nodes = list(args.repos.languoids())
    res = {}
    for node in nodes:
        if node.iso:
            res[node.id] = (node.iso, set())

    for node in nodes:
        if node.level == Level.family or node.id in res:
            continue
        for nid in res:
            matched = False
            for l in node.lineage:
                if l[1] == nid:
                    res[nid][1].add(node.id)
                    matched = True
                    break
            if matched:
                break

    outdir = Path('.') if not args.args else Path(args.args[0])
    with UnicodeWriter(outdir / 'iso2glottocodes.csv') as writer:
        writer.writerow(['iso', 'glottocodes'])
        for gc, (iso, gcs) in res.items():
            writer.writerow([iso, ';'.join([gc] + list(gcs))])
def cmd_download(self, **kw):
    # download source
    self.raw.write('sources.bib', getEvoBibAsBibtex(SOURCE, **kw))

    # download data
    all_records = []
    for i in pb(list(range(1, 20 * self.pages + 1, 20))):
        with self.raw.temp_download(
                self._url(i), 'file-{0}'.format(i), log=self.log) as fname:
            soup = BeautifulSoup(
                fname.open(encoding='utf8').read(), 'html.parser')
            for record in soup.findAll(name='div', attrs={"class": "results_record"}):
                if isinstance(record, bs4.element.Tag):
                    children = list(record.children)
                    number = children[0].findAll('span')[1].text.strip()
                    concept = children[1].findAll('span')[1].text
                    for child in children[2:]:
                        if isinstance(child, bs4.element.Tag):
                            dpoints = child.findAll('span')
                            if len(dpoints) >= 3:
                                lname = dpoints[1].text
                                glottolog = re.findall(
                                    'Glottolog: (........)', str(dpoints[1]))[0]
                                entry = dpoints[2].text
                                cogid = list(child.children)[4].text.strip()
                                all_records.append(
                                    (number, concept, lname, glottolog, entry, cogid))
    with UnicodeWriter(self.raw.posix('output.csv')) as f:
        f.writerows(all_records)
def geo(args):
    with_session(args)
    fname = args.pkg_dir.joinpath(
        'static', 'download', 'languages-and-dialects-geo.csv')
    with transaction.manager, UnicodeWriter(fname) as writer:
        writer.writerow([
            'glottocode', 'name', 'isocodes', 'level', 'macroarea',
            'latitude', 'longitude'])
        for l in DBSession.query(models.Languoid)\
                .filter(or_(
                    models.Languoid.level == models.LanguoidLevel.dialect,
                    models.Languoid.level == models.LanguoidLevel.language))\
                .options(
                    joinedload(models.Languoid.macroareas),
                    joinedload_all(
                        common.Language.languageidentifier,
                        common.LanguageIdentifier.identifier))\
                .order_by(common.Language.name):
            writer.writerow([
                l.id,
                l.name,
                ' '.join(
                    i.name for i in l.get_identifier_objs(common.IdentifierType.iso)),
                l.level,
                l.macroareas[0].name if l.macroareas else '',
                l.latitude if l.latitude is not None else '',
                l.longitude if l.longitude is not None else ''])
    args.log.info('{0} written'.format(fname))
def word_length(args):
    from pyconcepticon.api import Concepticon

    c = Concepticon(args.concepticon_repos)
    res = defaultdict(lambda: defaultdict(list))

    def _word_length(ds, **kw):
        ds.word_length(res)

    with_dataset(args, _word_length)
    concepts = c.conceptsets
    languoids = {l.id: l for l in Glottolog(args.glottolog_repos).languoids()}
    with UnicodeWriter('wordlength.csv') as writer:
        writer.writerow([
            'Concepticon_ID', 'Gloss', 'Semanticfield', 'Category',
            'Glottocode', 'Variety', 'Family', 'Form', 'Length'])
        for pid, langs in res.items():
            if len(langs) >= 500:
                for (lang, variety), forms in langs.items():
                    if lang in languoids:
                        lengths = [len(f.split()) for f in forms]
                        lang = languoids[lang]
                        family = lang.lineage[0][0] if lang.lineage else ''
                        c = concepts[pid]
                        writer.writerow([
                            pid, c['GLOSS'], c['SEMANTICFIELD'],
                            c['ONTOLOGICAL_CATEGORY'], lang.id, variety,
                            family, forms[0],
                            sum(lengths) / len(lengths)])
def markconservative(m, trigs, ref, hht, outfn, verbose=True, rank=None):
    blamefield = "hhtype"
    mafter = markall(m, trigs, verbose=verbose, rank=rank)
    ls = lstat(ref, hht)
    lsafter = lstat_witness(mafter, hht)
    log = []
    for (lg, (stat, wits)) in lsafter.items():
        if not ls.get(lg):
            if verbose:
                # Python 3 print call (the original used a Python 2 print statement)
                print(lg, "lacks status", [mafter[k][1]['srctrickle'] for k in wits])
            continue
        if hht[stat] > hht[ls[lg]]:
            log = log + [
                (lg,
                 [(mafter[k][1].get(blamefield, "No %s" % blamefield),
                   k,
                   mafter[k][1].get('title', 'no title'),
                   mafter[k][1].get('srctrickle', 'no srctrickle'))
                  for k in wits],
                 ls[lg])]
            for k in wits:
                (t, f) = mafter[k]
                if blamefield in f:
                    del f[blamefield]
                mafter[k] = (t, f)
    with UnicodeWriter(outfn, dialect='excel-tab') as writer:
        writer.writerows(
            ((lg, was) + mis for (lg, miss, was) in log for mis in miss))
    return mafter
def markconservative(m, trigs, ref, hht, outfn, verbose=True, rank=None):
    blamefield = "hhtype"
    mafter = markall(m, trigs, verbose=verbose, rank=rank)
    ls = lstat(ref, hht)
    lsafter = lstat_witness(mafter, hht)
    log = []
    no_status = defaultdict(set)
    for (lg, (stat, wits)) in lsafter.items():
        if not ls.get(lg):
            srctrickles = [mafter[k][1]['srctrickle'] for k in wits]
            for t in srctrickles:
                if not t.startswith('iso6393'):
                    no_status[lg].add(t)
            continue
        if hht[stat] > hht[ls[lg]]:
            log = log + [
                (lg,
                 [(mafter[k][1].get(blamefield, "No %s" % blamefield),
                   k,
                   mafter[k][1].get('title', 'no title'),
                   mafter[k][1].get('srctrickle', 'no srctrickle'))
                  for k in wits],
                 ls[lg])]
            for k in wits:
                (t, f) = mafter[k]
                if blamefield in f:
                    del f[blamefield]
                mafter[k] = (t, f)
    for lg in no_status:
        print('{0} lacks status'.format(lg))
    with UnicodeWriter(outfn, dialect='excel-tab') as writer:
        writer.writerows(
            ((lg, was) + mis for (lg, miss, was) in log for mis in miss))
    return mafter
def write(self, path, sep="\t"):
    with UnicodeWriter(path, delimiter=sep) as writer:
        for i, item in enumerate(self):
            if i == 0:
                writer.writerow(list(item.keys()))
            writer.writerow(list(item.values()))
    if path is None:
        return writer.read()
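# Illustrative sketch (not from the source): the `path is None` branch above
# relies on clldutils.dsv.UnicodeWriter buffering rows in memory when it is
# given no file, the same pattern other snippets here use via UnicodeWriter(None, ...).
from clldutils.dsv import UnicodeWriter

with UnicodeWriter(None, delimiter='\t') as writer:
    writer.writerow(['ID', 'GLOSS'])
    writer.writerow(['1', 'hand'])
serialized = writer.read()  # the buffered table; bytes in the clldutils releases
                            # that need .decode('utf-8') (cf. the lookup snippet below)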
def render(self, ctx, req):
    with UnicodeWriter() as writer:
        writer.writerow(self.header(ctx, req))
        for item in ctx.get_query(limit=csv.QUERY_LIMIT):
            writer.writerow(self.row(ctx, req, item))
    return writer.read()
def languoids(langs, outdir):
    with UnicodeWriter(outdir.joinpath('csv', 'glottolog.csv')) as writer:
        writer.writerow(['id', 'name', 'family_id', 'family_name', 'iso_code'])
        for lang in sorted(langs):
            writer.writerow([
                lang.id,
                lang.name,
                lang.lineage[0][1] if lang.lineage else '',
                lang.lineage[0][0] if lang.lineage else '',
                lang.iso or ''])
def locations(glottolog, fid, outpath):
    with UnicodeWriter(outpath) as writer:
        writer.writerow(['name', 'glottocode', 'latitude', 'longitude'])
        for lang in glottolog.languoids():
            if lang.level == Level.language and lang.latitude is not None:
                if fid in [l[1] for l in lang.lineage]:
                    writer.writerow(
                        [lang.name, lang.id, lang.latitude, lang.longitude])
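# Hypothetical invocation (repository path and output name are assumptions):
# dump coordinates for every language of one family, here Austronesian
# ('aust1307'), passing a pyglottolog API object as the `glottolog` argument.
from pathlib import Path
from pyglottolog import Glottolog  # older releases: from pyglottolog.api import Glottolog

glottolog = Glottolog('path/to/glottolog')  # local clone of the glottolog data repository
locations(glottolog, 'aust1307', Path('austronesian-locations.csv'))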
def render(self, ctx, req):
    with UnicodeWriter() as writer:
        rows = iter(ctx.get_query(limit=QUERY_LIMIT))
        first = next(rows, None)
        if first is not None:
            cols = first.csv_head()
            writer.writerow(cols)
            for item in chain([first], rows):
                writer.writerow(item.to_csv(ctx=ctx, req=req, cols=cols))
    return writer.read()
def to_csvfile(self, filename):
    """Write a CSV file with one row for each entry in each bibfile."""
    with self.connect() as conn:
        cursor = conn.execute(
            'SELECT filename, bibkey, hash, cast(id AS text) AS id '
            'FROM entry ORDER BY lower(filename), lower(bibkey), hash, id')
        with UnicodeWriter(filename) as writer:
            writer.writerow([col[0] for col in cursor.description])
            for row in cursor:
                writer.writerow(row)
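# Self-contained sketch of the same pattern (plain sqlite3 stands in for the
# wrapper's connect(); table and rows are made up): write a header row taken
# from cursor.description, then stream the result rows into the CSV.
import sqlite3

from clldutils.dsv import UnicodeWriter

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE entry (filename TEXT, bibkey TEXT)')
conn.execute("INSERT INTO entry VALUES ('a.bib', 'Smith2001')")
cursor = conn.execute('SELECT filename, bibkey FROM entry ORDER BY lower(filename)')
with UnicodeWriter('entries.csv') as writer:
    writer.writerow([col[0] for col in cursor.description])  # column names as header
    for row in cursor:
        writer.writerow(row)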
def render(self, data, accepted_media_type=None, renderer_context=None):
    """Renders a list of SocietyResultSets to CSV."""
    if data is None:
        return ''
    results = DPLACECSVResults(data)
    with UnicodeWriter() as writer:
        writer.writerow([CSV_PREAMBLE])
        writer.writerow(results.field_names)
        for row in results:
            writer.writerow(row)
    return writer.read()
def xls2csv(fname, outdir=None):
    res = {}
    outdir = outdir or fname.parent
    wb = xlrd.open_workbook(as_posix(fname))
    for sname in wb.sheet_names():
        sheet = wb.sheet_by_name(sname)
        if sheet.nrows:
            path = outdir.joinpath(
                fname.stem + '.' + slug(sname, lowercase=False) + '.csv')
            with UnicodeWriter(path) as writer:
                for i in range(sheet.nrows):
                    writer.writerow([col.value for col in sheet.row(i)])
            res[sname] = path
    return res
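# Hypothetical usage (the workbook path is an assumption): one CSV per
# non-empty sheet is written next to the workbook, and the mapping of sheet
# name to output path is returned.
from pathlib import Path

for sheet, csv_path in xls2csv(Path('data/lexicon.xls')).items():
    print(sheet, '->', csv_path)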
def render(self, ctx, req):
    fid = req.route_url('parameter', id='xxx').replace('xxx', '{0}')
    lid = req.route_url('language', id='xxx').replace('xxx', '{0}')
    with UnicodeWriter() as writer:
        writer.writerow(['Language_ID', 'Feature_ID', 'Value'])
        for _lid, _fid, v in DBSession.query(
                Language.id, Parameter.id, Value.name)\
                .filter(Language.pk == ValueSet.language_pk)\
                .filter(Parameter.pk == ValueSet.parameter_pk)\
                .filter(Value.valueset_pk == ValueSet.pk)\
                .order_by(Parameter.pk, Language.id):
            if v:
                writer.writerow([lid.format(_lid), fid.format(_fid), v])
    return writer.read()
def download(dataset, **kw):
    def rp(*names):
        return dataset.raw.joinpath(*names).as_posix()

    download_and_unpack_zipfiles(URL, dataset, FNAME)
    check_call(
        'libreoffice --headless --convert-to docx %s --outdir %s' % (rp(FNAME), rp()),
        shell=True)
    doc = Document(rp(Path(FNAME).stem + '.docx'))
    for i, table in enumerate(doc.tables):
        with UnicodeWriter(rp('%s.csv' % (i + 1,))) as writer:
            for row in table.rows:
                writer.writerow(map(text_and_color, row.cells))
def lookup(args):
    """
    Look up one or more glosses from the command line.

    concepticon lookup <gloss1 gloss2 ... glossN>
    """
    api = Concepticon()
    found = api.lookup(
        args.args,
        language=args.language,
        full_search=args.full_search,
        similarity_level=args.similarity)
    with UnicodeWriter(None, delimiter='\t') as writer:
        writer.writerow(['GLOSS', 'CONCEPTICON_ID', 'CONCEPTICON_GLOSS', 'SIMILARITY'])
        for f in found:
            writer.writerow(f)
    print(writer.read().decode('utf-8'))
def write_tree(tree, fname, taxa_in_dplace, societies_by_glottocode):
    if not fname.exists():
        fname.mkdir()
    tree.prune([n.encode('ascii') for n in taxa_in_dplace])
    with fname.joinpath('summary.trees').open('w', encoding="utf-8") as handle:
        handle.write(NEXUS_TEMPLATE.format(
            tree.name if tree.name else 'UNTITLED', tree.write(format=9)))
    with UnicodeWriter(fname.joinpath('taxa.csv')) as writer:
        writer.writerow(['taxon', 'glottocode', 'xd_ids', 'soc_ids'])
        for gc in sorted(taxa_in_dplace):
            socs = societies_by_glottocode[gc]
            writer.writerow([
                gc,
                gc,
                ', '.join(set(s.xd_id for s in socs)),
                ', '.join(s.id for s in socs)])
    return tree
def orthography(args):  # pragma: no cover
    ds = get_dataset(args)
    out = ds.dir.joinpath('orthography.tsv')
    if out.exists():
        if not confirm(
                'There already is an orthography profile for this dataset. Overwrite?',
                default=False):
            return
    graphemes = Counter()
    for line in ds.iter_raw_lexemes():
        graphemes.update(grapheme_pattern.findall(line))
    with UnicodeWriter(out, delimiter='\t') as writer:
        writer.writerow(['graphemes', 'frequency', 'IPA'])
        for grapheme, frequency in graphemes.most_common():
            writer.writerow([grapheme, '{0}'.format(frequency), grapheme])
    log_dump(out, log=args.log)
def create(self, req, filename=None, verbose=True):  # pragma: no cover
    meanings = [
        (p.name, p.id)
        for p in DBSession.query(Parameter).order_by(Parameter.pk)]
    tmp = mkdtemp()
    path = os.path.join(tmp, 'asjp.tab')
    with UnicodeWriter(f=path, delimiter=binary_type("\t")) as writer:
        writer.writerow([f[0] for f in self.fields] + [m[0] for m in meanings])
        for lang in DBSession.query(Doculect)\
                .order_by(Doculect.pk)\
                .options(
                    joinedload_all(Language.valuesets, ValueSet.values),
                    joinedload_all(Language.valuesets, ValueSet.parameter))\
                .limit(10000):
            row = [f[1](lang) for f in self.fields]
            vss = {vs.parameter.id: vs for vs in lang.valuesets}
            row.extend([Doculect.format_words(vss.get(m[1])) for m in meanings])
            writer.writerow(row)
    Download.create(self, req, filename=path)
    rmtree(tmp)
def _freeze(table, fpath):
    def conv(v, col):
        if v is None:
            return ''
        if isinstance(col.type, DeclEnumType):  # pragma: no cover
            return v.value
        if isinstance(col.type, JSONEncodedDict):
            return json.dumps(v)
        if isinstance(v, (datetime, date)):
            return v.isoformat()
        return v

    keys = [col.name for col in table.columns]
    cols = {col.name: col for col in table.columns}
    rows = [keys]
    for row in DBSession.execute(select([table])):
        rows.append([conv(row[key], cols[key]) for key in keys])
    if len(rows) > 1:
        with UnicodeWriter(fpath) as writer:
            writer.writerows(rows)
class CsvDump(Download):
    """Download of a resource type as csv."""

    ext = 'csv'

    def __init__(self, model, pkg, fields=None, **kw):
        """Initialize.

        fields can be a list of column names or a dictionary mapping model
        attribute names to csv column names.
        """
        super(CsvDump, self).__init__(model, pkg, **kw)
        self.fields = fields
        self.writer = None

    def get_stream(self):
        return StringIO(newline='') if PY3 else BytesIO()

    def read_stream(self, fp):
        res = Download.read_stream(self, fp)
        if PY3:  # pragma: no cover
            res = res.encode('utf8')
        return res

    def get_fields(self, req):
        if not self.fields:
            self.fields = ['id', 'name']
        return self.fields

    def before(self, req, fp):
        self.writer = UnicodeWriter(fp)
        self.writer.__enter__()
        self.writer.writerow([
            f if isinstance(f, string_types) else f[1]
            for f in self.get_fields(req)])

    def row(self, req, fp, item, index):
        return [
            getattr(item, f if isinstance(f, string_types) else f[0])
            for f in self.get_fields(req)]

    def dump(self, req, fp, item, index):
        self.writer.writerow(self.row(req, fp, item, index))
def write_conceptlist(clist, filename, header=False):
    """Write conceptlist to file."""
    def natural_sort(l):
        """
        Code-piece from
        http://stackoverflow.com/questions/4836710/does-python-have-a-built-in-function-for-string-natural-sort
        """
        convert = lambda text: int(text) if text.isdigit() else text.lower()
        alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)]
        return sorted(l, key=alphanum_key)

    header = header or clist['header']
    keys = natural_sort(list(clist.keys()))
    with UnicodeWriter(filename, delimiter='\t') as writer:
        writer.writerow(header)
        for k in keys:
            v = clist[k]
            if k not in ['splits', 'mergers', 'header']:
                writer.writerow([v[h] for h in header])
def main():
    socs = read_win1252(
        'ALL_soc_ids_to_lang_wAltNames_sources_5Sept2017_win1252.csv')
    links = {
        r['soc_id']: r
        for r in read_win1252(
            'ALL_soc_links_to_other_databases_30Aug2017_win1252.csv')}
    locations = {
        'SCCS' + r['soc_id']: r
        for r in reader('../../legacy/LatLong_data.csv', dicts=True)}
    for row in reader(
            '../WNAI/DPLACE_RevisedLatLong_27April2017_inclWNAI_SCCS.csv',
            dicts=True):
        if row['Dataset'] == 'SCCS':
            locations[row['soc_id']]['Lat'] = row['soc.latitude']
            locations[row['soc_id']]['Long'] = row['soc.longitude']

    with UnicodeWriter('societies.csv') as w:
        w.writerow([f.name for f in attr.fields(Society)])
        for soc in socs:
            kw = {
                'id': soc['soc_id'],
                'glottocode': soc['glottolog_id'],
                'glottocode_comment': 'Lang_assignment_change_notes',
            }
            for col in [
                'xd_id',
                'pref_name_for_society',
                'ORIG_name_and_ID_in_this_dataset',
                'alt_names_by_society',
                'main_focal_year',
            ]:
                kw[col] = soc[col]
            for col in ['Lat', 'Long', 'origLat', 'origLong', 'Comment']:
                kw[col] = locations[soc['soc_id']][col]
            kw['HRAF_name_ID'] = links[soc['soc_id']]['HRAF_name_ID']
            kw['HRAF_link'] = links[soc['soc_id']]['HRAF_link']
            w.writerow(attr.astuple(Society(**kw)))

    with UnicodeWriter('societies_mapping.csv') as w:
        w.writerow(['id', 'related'])
        for sid, l in links.items():
            rels = []
            for dsid, suffix in [
                ('EA', '1'), ('EA', '2'),
                ('Binford', '1'), ('Binford', '2'), ('Binford', '3'),
                ('SCCS', ''),
                ('WNAI', '1'), ('WNAI', '2'), ('WNAI', '3'), ('WNAI', '4'), ('WNAI', '5'),
            ]:
                if dsid == 'SCCS':
                    label = l['{0}_society_equivalent{1}'.format(dsid, suffix)]
                else:
                    label = l['{0}_label_society_equivalent{1}'.format(dsid, suffix)]
                id = l['{0}_id_society_equivalent{1}'.format(dsid, suffix)]
                if label and id:
                    rels.append('{0}: {1} [{2}]'.format(dsid, label, id))
            w.writerow([sid, '; '.join(rels)])

    var_info = {
        r['source']: r['APA_reference']
        for r in read_win1252(
            'SCCS_variable_sources_bibtex_to_APA.csv', ignore_dataset=True)}

    with UnicodeWriter('variables.csv') as w:
        fm = OrderedDict([
            ('VarID', 'id'),
            ('Category', 'category'),
            ('VarTitle', 'title'),
            ('VarDefinition', 'definition'),
            ('VarType', 'type'),
            ('UserNotes', 'notes'),
            ('source', 'source'),
            ('VarTitleShort', 'changes'),
            ('Unit', 'units'),
        ])
        w.writerow(fm.values())
        for row in read_win1252('SCCS_Full_VariableList_12Sept2017_win1252.csv'):
            row['VarID'] = 'SCCS' + row['VarID']
            row['VarType'] = row['VarType'].capitalize()
            if row['VarDefinition']:
                row['VarDefinition'] += '\n\n'
            row['VarDefinition'] += var_info.get(row['source'], row['source'])
            w.writerow([row[f] for f in fm.keys()])

    with UnicodeWriter('codes.csv') as w:
        fm = OrderedDict([
            ('VarID', 'var_id'),
            ('Code', 'code'),
            ('CodeDescription', 'description'),
            ('ShortName', 'name'),
        ])
        w.writerow(fm.values())
        for row in read_win1252('SCCS_CodeDescriptions_12Sept2017_win1252.csv'):
            row['VarID'] = 'SCCS' + row['VarID']
            w.writerow([row[f] for f in fm.keys()])

    with UnicodeWriter('data.csv') as w:
        fm = OrderedDict([
            ('soc_id', 'soc_id'),
            ('SubCase', 'sub_case'),
            ('Year', 'year'),
            ('VarID', 'var_id'),
            ('Code', 'code'),
            ('EthnoReferences', 'references'),
            ('AdminComment', 'admin_comment'),
            ('UserComment', 'comment'),
            ('SourceCodedData', 'source_coded_data'),
        ])
        w.writerow(fm.values())
        for row in read_win1252('Full_SCCS_data_12Sept2017_FINAL_329451rows_win1252.csv'):
            row['VarID'] = 'SCCS' + row['VarID']
            w.writerow([row[f] for f in fm.keys()])
def extract(args):
    import argparse

    usage = """
    dplace %(prog)s - extracts subsets of data for further processing.

    To filter societies:
    > dplace %(prog)s --society Cj4,Cj5,Cj6 output.csv

    To filter societies on a given tree:
    > dplace %(prog)s --tree gray_et_al2009 output.csv

    To filter societies only from a given dataset:
    > dplace %(prog)s --dataset EA output.csv
    """
    parser = argparse.ArgumentParser(prog='extract', usage=usage)
    parser.add_argument('filename', help='filename', default=None)
    parser.add_argument('--society', help='restrict to these society ids (x,y,z)', default=None)
    parser.add_argument('--tree', help='restrict to this tree', default=None)
    parser.add_argument('--dataset', help='restrict to these datasets (x,y,z)', default=None)
    parser.add_argument('--variable', help='restrict to these variables (x,y,z)', default=None)
    xargs = parser.parse_args(args.args)

    datasets = xargs.dataset.split(",") if xargs.dataset else None
    variables = xargs.variable.split(",") if xargs.variable else None
    societies = xargs.society.split(",") if xargs.society else None

    # get tree if given
    if xargs.tree:
        # get trees
        trees = {t.id: t for t in args.repos.phylogenies}
        tree = trees.get(xargs.tree)
        if tree is None:  # dict.get never raises, so check explicitly
            raise SystemExit("Failed to find Tree %s" % xargs.tree)
        societies = [
            s for sublist in [t.soc_ids for t in tree.taxa] for s in sublist]

    with UnicodeWriter(f=xargs.filename) as out:
        header = [
            'ID', 'XD_ID', 'Glottocode', 'Name', 'OriginalName',
            'FocalYear', 'Latitude', 'Longitude', 'Variable', 'Value']
        out.writerow(header)
        for record in args.repos.iter_data(
                datasets=datasets, variables=variables, societies=societies):
            s = args.repos.societies.get(record.soc_id, None)
            if s is None:
                # we get these warnings as we are currently missing the SCCS
                # and WNAI data
                args.log.warn("Missing society definition for %s" % record.soc_id)
                continue
            row = [
                s.id, s.xd_id, s.glottocode, s.pref_name_for_society,
                s.ORIG_name_and_ID_in_this_dataset, s.main_focal_year,
                s.Lat, s.Long, record.var_id, record.code]
            out.writerow(row)
def map(self, clist, otherlist=None, out=None, full_search=False,
        similarity_level=5, language='en'):
    assert clist.exists(), "File %s does not exist" % clist
    from_ = []
    for item in read_dicts(clist):
        from_.append((
            item.get('ID', item.get('NUMBER')),
            item.get('GLOSS', item.get('ENGLISH'))))

    to = self._get_map_for_language(language, otherlist)
    if not full_search:
        cmap = concept_map2(
            [i[1] for i in from_],
            [i[1] for i in to],
            similarity_level=similarity_level,
            freqs=self.frequencies,
            language=language)
        good_matches = 0
        with UnicodeWriter(out, delimiter='\t') as writer:
            writer.writerow(
                ['ID', 'GLOSS', 'CONCEPTICON_ID', 'CONCEPTICON_GLOSS', 'SIMILARITY'])
            for i, (fid, fgloss) in enumerate(from_):
                row = [fid, fgloss]
                matches, sim = cmap.get(i, ([], 10))
                if sim <= 5:
                    good_matches += 1
                if not matches:
                    writer.writerow(row + ['', '???', ''])
                elif len(matches) == 1:
                    row.extend([
                        to[matches[0]][0],
                        to[matches[0]][1].split('///')[0],
                        sim])
                    writer.writerow(row)
                else:
                    # we need a list to retain the order by frequency
                    visited = []
                    for j in matches:
                        gls, cid = to[j][0], to[j][1].split('///')[0]
                        if (gls, cid) not in visited:
                            visited += [(gls, cid)]
                    if len(visited) > 1:
                        writer.writerow(['<<<', '', '', ''])
                        for gls, cid in visited:
                            writer.writerow(row + [gls, cid, sim])
                        writer.writerow(['>>>', '', '', ''])
                    else:
                        row.extend([visited[0][0], visited[0][1], sim])
                        writer.writerow(row)
            writer.writerow([
                '#',
                good_matches,
                len(from_),
                '{0:.2f}'.format(good_matches / len(from_))])
    else:
        cmap = concept_map(
            [i[1] for i in from_],
            [i[1] for i in self._get_map_for_language(language, otherlist)],
            similarity_level=similarity_level)
        with UnicodeWriter(out, delimiter='\t') as writer:
            writer.writerow(['ID', 'GLOSS', 'CONCEPTICON_ID', 'CONCEPTICON_GLOSS'])
            for i, (fid, fgloss) in enumerate(from_):
                row = [fid, fgloss]
                match = cmap.get(i)
                row.extend(list(to[match[0]]) if match else ['', ''])
                writer.writerow(row)

    if out is None:
        print(writer.read().decode('utf-8'))
def __exit__(self, type, value, traceback):
    UnicodeWriter.__exit__(self, type, value, traceback)
    if isinstance(self.container, Archive):
        self.container.write_text(self.read(), self.table.url)