def ods2csv(self, fname, outdir=None):
    """
    Dump the data from an OpenDocument Spreadsheet (suffix .ODS) file to CSV.

    One CSV file per table is written to `outdir` (defaulting to `self`),
    named ``<fname stem>.<slugified table name>.csv``.

    :return: `dict` mapping table names to the paths of the written CSV files.

    .. note:: Requires `cldfbench` to be installed with extra "odf".
    """
    if not load_odf:  # pragma: no cover
        raise EnvironmentError(
            'ods2csv is only available when cldfbench is installed with odf support\n'
            'pip install cldfbench[odf]')
    fname = self._path(fname)
    target_dir = outdir or self
    written = {}
    spreadsheet = load_odf(fname).spreadsheet
    for node in spreadsheet.childNodes:
        # Only table elements are dumped; skip any other child nodes.
        if node.qname != (ODF_NS_TABLE, 'table'):
            continue
        table_name = node.attributes[ODF_NS_TABLE, 'name']
        csv_path = target_dir / '{}.{}.csv'.format(
            fname.stem, slug(table_name, lowercase=False))
        with dsv.UnicodeWriter(csv_path) as sink:
            sink.writerows(_ods_to_list(node))
        written[table_name] = csv_path
    return written
def xls2csv(self, fname, outdir=None):
    """
    Dump the data from an Excel XLS file to CSV.

    One CSV file per non-empty sheet is written to `outdir` (defaulting to `self`).

    :return: `dict` mapping sheet names to the paths of the written CSV files.

    .. note:: Requires `cldfbench` to be installed with extra "excel".
    """
    if not xlrd:  # pragma: no cover
        raise EnvironmentError(
            'xls2csv is only available when cldfbench is installed with excel support\n'
            'pip install cldfbench[excel]')
    fname = self._path(fname)
    target_dir = outdir or self
    written = {}
    try:
        workbook = xlrd.open_workbook(str(fname))
    except xlrd.biffh.XLRDError as e:
        # xlrd only reads the legacy binary format; point users at the right method.
        if 'xlsx' in str(e):
            raise ValueError('To read xlsx files, call xlsx2csv!')
        raise  # pragma: no cover
    for sheet_name in workbook.sheet_names():
        sheet = workbook.sheet_by_name(sheet_name)
        if not sheet.nrows:
            # Empty sheets are skipped entirely.
            continue
        csv_path = target_dir.joinpath(
            fname.stem + '.' + slug(sheet_name, lowercase=False) + '.csv')
        with dsv.UnicodeWriter(csv_path) as sink:
            sink.writerows(
                [cell.value for cell in sheet.row(rowno)]
                for rowno in range(sheet.nrows))
        written[sheet_name] = csv_path
    return written
def xlsx2csv(self, fname, outdir=None):
    """
    Dump the data from an Excel XLSX file to CSV.

    One CSV file per sheet is written to `outdir` (defaulting to `self`).

    :return: `dict` mapping sheet names to the paths of the written CSV files.

    .. note:: Requires `cldfbench` to be installed with extra "excel".
    """
    if not openpyxl:  # pragma: no cover
        raise EnvironmentError(
            'xlsx2csv is only available when cldfbench is installed with excel support\n'
            'pip install cldfbench[excel]')

    def _excel_value(x):
        if x is None:
            return ""
        # BUGFIX: only normalize *whole-number* floats. Excel has no integer
        # type, so integers come back as "n.0" and are rendered as "n" here;
        # the previous unconditional int() truncated genuine fractions
        # (e.g. 1.5 -> "1"), silently losing data.
        if isinstance(x, float) and int(x) == x:
            return '{0}'.format(int(x))
        return '{0}'.format(x).strip()

    fname = self._path(fname)
    res = {}
    outdir = outdir or self
    wb = openpyxl.load_workbook(str(fname), data_only=True)
    for sname in wb.sheetnames:
        # `wb[sname]` replaces the deprecated `wb.get_sheet_by_name(sname)`,
        # which was removed in openpyxl 3.x.
        sheet = wb[sname]
        path = outdir.joinpath(fname.stem + '.' + slug(sname, lowercase=False) + '.csv')
        with dsv.UnicodeWriter(path) as writer:
            for row in sheet.rows:
                writer.writerow([_excel_value(col.value) for col in row])
        res[sname] = path
    return res
def rewrite(self, fname, v):
    """
    Rewrite the CSV file `fname` in `self.raw_dir` in place, transforming each
    row with the callable `v`.

    `v` receives each row as a `dict`; rows for which `v` returns a falsy
    value are dropped from the rewritten file.
    """
    target = self.raw_dir / fname
    rows = list(dsv.reader(target, dicts=True))
    with dsv.UnicodeWriter(target) as writer:
        # Header comes from the first row's keys (only written if there are rows).
        if rows:
            writer.writerow(rows[0].keys())
        for row in rows:
            updated = v(row)
            if updated:
                writer.writerow(updated.values())
def run(args):
    """
    Deduplicate rows of a conceptlist by the values in column `args.column`,
    keeping the last occurrence per key, and write the result as TSV to
    `args.output` (or print it when no output path is given).
    """
    rows = list(dsv.reader(
        get_conceptlist(args, path_only=True), delimiter="\t", dicts=True))
    # Later rows with the same key overwrite earlier ones; insertion order is kept.
    deduped = collections.OrderedDict(
        (row[args.column], list(row.values())) for row in rows)
    with dsv.UnicodeWriter(args.output, delimiter='\t') as writer:
        writer.writerow(rows[0].keys())
        writer.writerows(deduped.values())
    if not args.output:
        # With no output path, UnicodeWriter buffers in memory; dump to stdout.
        print(writer.read().decode('utf8'))
def to_csvfile(self, filename, encoding='utf-8', dialect='excel'):
    """Write a CSV file with one row for each entry in each bibfile."""
    # Deterministic order: case-insensitive by filename, then bibkey, then hash/id.
    query = sa.select(
        [
            File.name.label('filename'),
            Entry.bibkey,
            Entry.hash,
            sa.cast(Entry.id, sa.Text).label('id'),
        ]).select_from(sa.join(File, Entry)).order_by(
            sa.func.lower(File.name),
            sa.func.lower(Entry.bibkey),
            Entry.hash,
            Entry.id)
    with self.execute(query) as cursor, \
            dsv.UnicodeWriter(filename, encoding=encoding, dialect=dialect) as writer:
        writer.writerow(cursor.keys())
        writer.writerows(cursor)
def write_tsv(in_, out_, glottocode):
    """
    Convert the spreadsheet `in_` (format chosen by suffix) to a TSV file at
    `out_`, setting each row's 'Language_ID' column to `glottocode`.

    :return: the enumerate index of the last row written — i.e. the number of
        data rows minus one, or 0 when the input is empty.
        NOTE(review): this looks like it is meant to be a row count; confirm
        with callers before changing.
    """
    readers = {
        '.xlsx': iter_xlsx,
        '.xls': iter_xls,
        '.csv': iter_csv,
        '.tsv': iter_tsv,
    }
    rows = list(readers[in_.suffix](in_))
    last_index = 0
    with dsv.UnicodeWriter(out_, delimiter='\t') as writer:
        for last_index, row in enumerate(rows):
            if last_index == 0:
                # Header from the first row's keys.
                writer.writerow(list(row.keys()))
            row['Language_ID'] = glottocode
            writer.writerow(list(row.values()))
    return last_index
def to_csvfile(self, filename, *, dialect: str = 'excel', encoding: str = ENCODING):
    """Write a CSV file with one row for each entry in each .bib file."""
    # Deterministic order: case-insensitive by filename, then bibkey, then hash/id.
    query = (sa.select(File.name.label('filename'),
                       Entry.bibkey,
                       Entry.hash,
                       sa.cast(Entry.id, sa.Text).label('id'))
             .join_from(File, Entry)
             .order_by(sa.func.lower(File.name),
                       sa.func.lower(Entry.bibkey),
                       'hash',
                       Entry.id))
    with self.execute(query) as result:
        with dsv.UnicodeWriter(filename, encoding=encoding, dialect=dialect) as writer:
            writer.writerow(list(result.keys()))
            writer.writerows(result)
def xls2csv(self, fname, outdir=None):
    """
    Dump the data from an Excel XLS file to CSV.

    One CSV file per non-empty sheet is written to `outdir` (defaulting to `self`).

    :return: `dict` mapping sheet names to the paths of the written CSV files.

    .. note:: Requires `cldfbench` to be installed with extra "excel".
    """
    if not xlrd:  # pragma: no cover
        raise EnvironmentError(
            'xls2csv is only available when cldfbench is installed with excel support\n'
            'pip install cldfbench[excel]')
    fname = self._path(fname)
    res = {}
    outdir = outdir or self
    # CONSISTENCY: translate xlrd's opaque error on xlsx input into an
    # actionable message, matching the other xls2csv implementation.
    try:
        wb = xlrd.open_workbook(str(fname))
    except xlrd.biffh.XLRDError as e:
        if 'xlsx' in str(e):
            raise ValueError('To read xlsx files, call xlsx2csv!')
        raise  # pragma: no cover
    for sname in wb.sheet_names():
        sheet = wb.sheet_by_name(sname)
        if sheet.nrows:
            path = outdir.joinpath(fname.stem + '.' + slug(sname, lowercase=False) + '.csv')
            with dsv.UnicodeWriter(path) as writer:
                for i in range(sheet.nrows):
                    writer.writerow([col.value for col in sheet.row(i)])
            res[sname] = path
    return res
def run(args):
    """
    Strip unlabelled (empty-header) columns and fully empty rows from the
    sheets at `args.path`, rewriting each file in place as TSV.

    :raises ValueError: if a cell in an unlabelled column holds a value —
        such data would be silently dropped, so the user must fix it manually.
    """
    for p in args.path:
        # use reader rather than iterrows so we operate on raw file rather than a
        # grambank-ifyed version.
        rows = list(Sheet(p)._reader())
        not_empty = None
        with dsv.UnicodeWriter(p, delimiter='\t', encoding='utf8') as w:
            # BUGFIX: the inner column loop previously reused `i`, shadowing
            # the row index, so the error message reported a COLUMN index as
            # the "line" number. Distinct names fix the report.
            for lineno, row in enumerate(rows):
                if lineno == 0:
                    # Indices of columns that carry a (non-empty) header.
                    not_empty = [col for col, header in enumerate(row) if header]
                if set(row) == {''}:
                    # Drop fully empty rows.
                    continue
                # check other cells are empty
                for col, cell in enumerate(row):
                    if col not in not_empty and cell:  # pragma: no cover
                        raise ValueError(
                            "Unlabelled column has value on line %d. Fix manually!" % lineno)
                w.writerow([row[col] for col in not_empty])
    return
def dump(self):
    """
    Dump each sheet of COMBINED.xlsx to a CSV file ``data.<slugified sheet name>.csv``
    in `self.repos`.

    :return: `dict` mapping sheet names to the paths of the written CSV files.
    """
    def _excel_value(x):
        if x is None:
            return ""
        # BUGFIX: only normalize *whole-number* floats. Excel has no integer
        # type, so integers come back as "n.0" and are rendered as "n" here;
        # the previous unconditional int() truncated genuine fractions
        # (e.g. 1.5 -> "1"), silently losing data.
        if isinstance(x, float) and int(x) == x:
            return '{0}'.format(int(x))
        return '{0}'.format(x).strip()

    res = {}
    outdir = self.repos
    wb = openpyxl.load_workbook(str(self.path('COMBINED.xlsx')), data_only=True)
    for sname in wb.sheetnames:
        sheet = wb[sname]
        path = outdir.joinpath('data.' + slug(sname, lowercase=False) + '.csv')
        with dsv.UnicodeWriter(path) as writer:
            for row in sheet.rows:
                writer.writerow([_excel_value(col.value) for col in row])
        res[sname] = path
    return res
def visit(self, row_visitor=None):
    """
    Apply `row_visitor` to all rows in a sheet.

    :param row_visitor: callable accepting a row `dict`; its return value is
        used only as a keep/drop flag — the row itself is what gets written,
        so visitors are expected to mutate the row in place. Defaults to the
        identity (keep everything).
    :return: Pair of `int`s specifying the number of rows read and written.
    """
    if row_visitor is None:
        row_visitor = lambda r: r  # noqa: E731
    rows = list(self.iterrows())
    written = 0
    with dsv.UnicodeWriter(self.path, delimiter='\t', encoding='utf8') as writer:
        for index, row in enumerate(rows):
            if index == 0:
                writer.writerow(list(row.keys()))
            # NOTE: `row` (possibly mutated by the visitor) is written, not
            # the visitor's return value.
            if row_visitor(row):
                writer.writerow(list(row.values()))
                written += 1
    # Make sure calling iterrows again will re-read from disk:
    self._rows = None
    return (len(rows), written)
def xlsx2csv(self, fname, outdir=None):
    """
    Dump the data from an Excel XLSX file to CSV.

    One CSV file per sheet is written to `outdir` (defaulting to `self`).

    :return: `dict` mapping sheet names to the paths of the written CSV files.

    .. note:: Requires `cldfbench` to be installed with extra "excel".
    """
    if not openpyxl:  # pragma: no cover
        raise EnvironmentError(
            'xlsx2csv is only available when cldfbench is installed with excel support\n'
            'pip install cldfbench[excel]')

    def normalize(value):
        if value is None:
            return ""
        if isinstance(value, float) and int(value) == value:
            # Since Excel does not have an integer type, integers are rendered as "n.0",
            # which in turn confuses type detection of tools like csvkit. Thus, we normalize
            # numbers of the form "n.0" to "n".
            return '{0}'.format(int(value))  # pragma: no cover
        return '{0}'.format(value).strip()

    fname = self._path(fname)
    target_dir = outdir or self
    written = {}
    workbook = openpyxl.load_workbook(str(fname), data_only=True)
    for sheet_name in workbook.sheetnames:
        csv_path = target_dir.joinpath(
            fname.stem + '.' + slug(sheet_name, lowercase=False) + '.csv')
        with dsv.UnicodeWriter(csv_path) as sink:
            sink.writerows(
                [normalize(cell.value) for cell in row]
                for row in workbook[sheet_name].rows)
        written[sheet_name] = csv_path
    return written
def cmd_create_ref_etc_files(self, args):
    """
    Generate raw/languages.csv and raw/concepts.csv from the raw JSON data file,
    so changes can be detected against etc/languages.csv and etc/concepts.csv.

    :param args: CLI args object; provides `args.glottolog.api` and
        `args.concepticon.api` for reference-catalog lookups.
    """
    # Helper command to generate raw/concepts.csv and raw/languages.csv out of
    # the JSON data file which can be used to detect changes for the files
    # etc/concepts.csv and etc/languages.csv

    # Load JSON data
    json_data = self.raw_dir.read_json(self.data_file_name)

    # Map language index -> long name from the region metadata.
    longnames = {
        rl['LanguageIx']: rl['RegionGpMemberLgNameLongInThisSubFamilyWebsite'].strip()
        for rl in json_data['regionLanguages']
    }

    # Create raw/languages.csv for usage as etc/languages.csv
    fname = self.raw_dir / 'languages.csv'
    # Cache of glottocode -> Glottolog languoid, to avoid repeated API lookups.
    seen_codes = {}
    with dsv.UnicodeWriter(fname) as f:
        f.writerow([
            'ID', 'Name', 'LongName', 'Glottocode', 'Glottolog_Name',
            'ISO639P3code', 'Macroarea', 'Latitude', 'Longitude', 'Family',
            'IndexInSource'
        ])
        for language in sorted(json_data['languages'], key=lambda k: int(k['LanguageIx'])):
            # Build ID
            lang_id = slug(language['ShortName']).capitalize()
            language['GlottoCode'] = language['GlottoCode'].strip() \
                if language['GlottoCode'] else ''
            # add to language map
            if language['GlottoCode'] in seen_codes:
                gldata = seen_codes[language['GlottoCode']]
            else:
                gldata = args.glottolog.api.languoid(language['GlottoCode'])
                seen_codes[language['GlottoCode']] = gldata
            f.writerow([
                lang_id,
                language['ShortName'].strip(),
                # LongName is only written when it differs from the short name.
                longnames[language['LanguageIx']]
                if longnames[language['LanguageIx']] != language['ShortName'].strip()
                else '',
                language['GlottoCode'],
                gldata.name if gldata else '',
                language['ISOCode'].strip(),
                gldata.macroareas[0].name if gldata and gldata.macroareas else '',
                language['Latitude'].strip() if language['Latitude'] else '',
                # NOTE: 'Longtitude' (sic) is the key as spelled in the source JSON.
                language['Longtitude'].strip() if language['Longtitude'] else '',
                gldata.family.name if gldata and gldata.family else '',
                language['LanguageIx'].strip(),
            ])

    # Create raw/concepts.csv to compare it against etc/concepts.csv
    fname = self.raw_dir / 'concepts.csv'
    with dsv.UnicodeWriter(fname) as f:
        # The header gains an extra gloss column when a second gloss language is set.
        if self.second_gloss_lang is None:
            f.writerow([
                'ID', 'Name', 'Concepticon_ID', 'Concepticon_Gloss', 'IndexInSource'
            ])
        else:
            f.writerow([
                'ID', 'Name', 'Concepticon_ID', 'Concepticon_Gloss',
                '{0}_Gloss'.format(self.second_gloss_lang), 'IndexInSource'
            ])
        for c_idx, concept in enumerate(
                sorted(json_data['words'],
                       key=lambda k: (int(k['IxElicitation']),
                                      int(k['IxMorphologicalInstance'])))):
            # Build ID
            concept_id = '%i_%s' % (c_idx, slug(concept['FullRfcModernLg01']))
            # Unmapped concepts are reported with int(ID)<1 in source
            if int(concept['StudyDefaultConcepticonID']) > 0:
                concepticon_id = concept['StudyDefaultConcepticonID']
                co_gloss = args.concepticon.api.conceptsets[concepticon_id].gloss
            else:
                concepticon_id = None
                co_gloss = ''
            if self.second_gloss_lang is None:
                f.writerow([
                    concept_id,
                    concept['FullRfcModernLg01'],
                    concepticon_id,
                    co_gloss,
                    '%s-%s' % (concept['IxElicitation'],
                               concept['IxMorphologicalInstance']),
                ])
            else:
                f.writerow([
                    concept_id,
                    concept['FullRfcModernLg01'],
                    concepticon_id,
                    co_gloss,
                    concept['FullRfcModernLg02'],
                    '%s-%s' % (concept['IxElicitation'],
                               concept['IxMorphologicalInstance']),
                ])
def write_csv(self, fname, rows, **kw):
    """Write `rows` to the CSV file at `fname` (resolved via `self._path`).

    Keyword arguments are passed through to `dsv.UnicodeWriter`.
    """
    with dsv.UnicodeWriter(self._path(fname), **kw) as writer:
        for row in rows:
            writer.writerow(row)