def test_rewrite(tmpdir, tsvname=str(TESTDIR / 'tsv.txt'), csvname=str(TESTDIR / 'csv.txt')):
    filename = str(tmpdir / 'test.txt')
    shutil.copy(tsvname, filename)
    rewrite(filename, lambda i, row: [len(row)], delimiter='\t')
    assert next(iterrows(filename)) == ['2']

    shutil.copy(csvname, filename)
    rewrite(filename, lambda i, row: row)
    assert list(iterrows(filename)) == list(iterrows(csvname))
def test_iterrows_dialect(lines=['1,x,y', ' #1,a,b', '#1,1,2', ',,', '1,3, 4\t ']):
    dialect = Dialect(trim=True, skipRows=1, skipColumns=1, skipBlankRows=True)
    r = list(iterrows(lines, dialect=dialect))
    # Make sure comment lines are stripped:
    assert len(r) == 2
    # Make sure cells are trimmed:
    assert r[1][1] == '4'

    r = list(iterrows(lines, dialect=dialect.updated(skipRows=0, skipColumns=0)))
    assert r[2][2] == '4'
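# Sketch (not part of the test suite): how the Dialect options above compose.
# The expected row values are inferred from the assertions in
# test_iterrows_dialect; only names used there appear here.
lines = ['1,x,y', ' #1,a,b', '#1,1,2', ',,', '1,3, 4\t ']
dialect = Dialect(trim=True, skipRows=1, skipColumns=1, skipBlankRows=True)
# skipRows=1 drops '1,x,y'; the default comment prefix '#' drops '#1,1,2'
# (but not ' #1,a,b', whose prefix is preceded by a space); skipBlankRows
# drops ',,' since all its cells are empty; skipColumns=1 then removes the
# first cell of each remaining row and trim=True strips surrounding whitespace:
assert list(iterrows(lines, dialect=dialect)) == [['a', 'b'], ['3', '4']]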
def test_add_delete_rows(tmpdir):
    filename = str(tmpdir / 'test.csv')
    add_rows(filename, ['a', 'b'], [1, 2], [3, 4])
    assert len(list(iterrows(filename, dicts=True))) == 2

    filter_rows_as_dict(filename, lambda item: item['a'] == '1')
    assert len(list(iterrows(filename, dicts=True))) == 1

    add_rows(filename, [2, 2], [2, 4])
    assert len(list(iterrows(filename, dicts=True))) == 3

    nremoved = filter_rows_as_dict(filename, lambda item: item['a'] == '1')
    assert nremoved == 2
def test_roundtrip_with_keyword_dialect(tmpdir, rows=[['1', 'y'], [' "1 ', '3\t4']], dialect='excel'):
    filename = str(tmpdir / 'test.csv')
    with UnicodeWriter(filename, dialect=dialect) as w:
        w.writerows(rows)
    assert list(iterrows(filename, dialect=dialect)) == rows
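# Sketch (assumption: dialect= also accepts a csvw Dialect instance, not only a
# registered dialect name like 'excel' above): the same roundtrip with an
# explicit tab-separated dialect. The embedded '\t' in a cell should survive
# via quoting, as with '3\t4' in the excel-dialect test.
def roundtrip_tsv(filename, rows=[['a', 'b'], ['1', '2\t3']]):
    dialect = Dialect(delimiter='\t')
    with UnicodeWriter(filename, dialect=dialect) as w:
        w.writerows(rows)
    assert list(iterrows(filename, dialect=dialect)) == rows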
def __init__(self, name_and_date, fp):
    parts = name_and_date.split('_')
    digits = map(int, DATESTAMP_PATTERN.match(parts[-1]).groups())
    self.date = datetime.date(*digits)
    name = '_'.join(parts[:-1])
    if name.startswith(('_', '-')):
        name = name[1:]
    if not name:
        name = 'Codes'
    self.name = name
    super(Table, self).__init__(
        iterrows(fp.splitlines(), dicts=True, delimiter='\t'))
def test_iterrows_restkey(lines=['a,b', '1,2,3,4', '1']):
    result = iterrows(lines, dicts=True, restkey='x', restval='y', delimiter=',')
    assert list(result) == [
        {'a': '1', 'b': '2', 'x': ['3', '4']},
        {'a': '1', 'b': 'y'},
    ]
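# For comparison (a sketch, not part of the tests): the restkey/restval
# semantics exercised above mirror the stdlib's csv.DictReader, where surplus
# cells are collected in a list under restkey and missing cells are padded
# with restval:
import csv

rows = list(csv.DictReader(['a,b', '1,2,3,4', '1'], restkey='x', restval='y'))
assert rows[0]['x'] == ['3', '4'] and rows[1]['b'] == 'y'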
def __init__(self, name_and_date, date, fp):
    parts = name_and_date.split('_')
    # The ISO 639-3 code tables from 2020-05-15 contain a table with a
    # malformed name - having an excess "0" in the date stamp.
    if parts[-1] == '202000515':  # pragma: no cover
        date = '20200515'
    digits = map(int, DATESTAMP_PATTERN.match(date).groups())
    self.date = datetime.date(*digits)
    name = '_'.join([p for p in parts if not DATESTAMP_PATTERN.match(p)])
    if name.startswith(('_', '-')):
        name = name[1:]
    if not name:
        name = 'Codes'
    self.name = name
    super(Table, self).__init__(iterrows(
        [line for line in fp.splitlines() if line.strip()],  # strip malformed (blank) lines.
        dicts=True,
        delimiter='\t'))
def from_data(cls, fname):
    fname = Path(fname)
    colnames = next(iterrows(fname), [])
    if not colnames:
        raise ValueError('empty data file!')

    if cls is Dataset:
        try:
            cls = next(mod.cls for mod in get_modules() if mod.match(fname))
        except StopIteration:
            raise ValueError('{0} does not match a CLDF module spec'.format(fname))
        assert issubclass(cls, Dataset) and cls is not Dataset

    res = cls.from_metadata(fname.parent)
    required_cols = {
        c.name for c in res[res.primary_table].tableSchema.columns if c.required}
    if not required_cols.issubset(colnames):
        raise ValueError(
            'missing columns: %r' % sorted(required_cols.difference(colnames)))
    return res
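# Usage sketch (assuming this is the pycldf Dataset.from_data classmethod, with
# the @classmethod decorator not shown in the snippet above): pick the matching
# Dataset subclass from a metadata-free data file and validate its columns.
ds = Dataset.from_data(Path('forms.csv'))  # e.g. yields a Wordlist for forms.csv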
def test_iterrows(rows=[['first', 'line'], ['s\u00fccond', 'l\u00e4ne\u00df']]):
    assert list(iterrows(TESTDIR / 'csv.txt')) == rows

    lines = ['\t'.join(r) for r in rows]
    assert list(iterrows(lines, delimiter='\t')) == rows

    for lt in ['\n', '\r\n', '\r']:
        # Simulate file opened in text mode:
        fp = io.StringIO(lt.join(lines), newline='')
        assert list(iterrows(fp, delimiter='\t')) == rows

    assert list(iterrows(lines, dicts=True, delimiter='\t')) == [OrderedDict(zip(*rows))]

    r = list(iterrows(lines, namedtuples=True, delimiter='\t'))
    assert len(r) == 1 and r[0].first == 's\u00fccond'

    r = list(iterrows([line.replace('\t', ',') for line in lines], namedtuples=True))
    assert len(r) == 1 and r[0].first == 's\u00fccond'
def test_iterrows_tsv(filename=str(TESTDIR / 'tsv.txt')):
    res = list(iterrows(filename, namedtuples=True, delimiter='\t'))
    assert res[0].a_name == 'b'
    # Missing column values should be set to None:
    assert res[2].a_name is None
def test_iterrows_empty():
    assert list(iterrows([], dicts=True, delimiter='\t')) == []
    assert list(iterrows([''], dicts=True, fieldnames=['a', 'b'], delimiter='\t')) == []
    assert list(iterrows(['a,b', ''], dicts=True, delimiter='\t')) == []
def test_iterrows_quote_comment(dialect, lines, expected):
    # Parametrized test: dialect, lines and expected are supplied via
    # pytest.mark.parametrize (decorator not shown here).
    assert list(iterrows(lines, dialect=dialect)) == expected
def add_metadata(fname: Path, logger: cli.logging.Logger = cli.logger):
    if fname.name != "forms.csv":
        cli.Exit.CLI_ARGUMENT_ERROR(
            "A metadata-free Wordlist must be in a file called 'forms.csv'.")
    default_wordlist = TableGroup.from_file(
        pycldf.util.pkg_path("modules", "Wordlist-metadata.json"))
    default_wordlist._fname = fname.with_name("Wordlist-metadata.json")
    ds = pycldf.Wordlist(default_wordlist)

    # `from_data` checks that the required columns of the FormTable are present,
    # but it does not consolidate the columns further.
    colnames = next(iterrows(fname))
    understood_colnames = {
        c.name for c in ds[ds.primary_table].tableSchema.columns if c.name in colnames
    }
    more_columns = {
        c.propertyUrl.uri: c
        for c in ds[ds.primary_table].tableSchema.columns
        if c.name not in understood_colnames
    }
    logger.info(
        "CLDF freely understood the columns %s in your forms.csv.",
        sorted(understood_colnames),
    )

    # Consider the columns that were not understood.
    columns_without_metadata = set(colnames) - understood_colnames
    for column_name in columns_without_metadata:
        column: Column
        # Maybe they are known CLDF properties?
        if column_name in pycldf.terms.TERMS:
            column = pycldf.TERMS[column_name].to_column()
        # Maybe they are CLDF default column names?
        elif column_name in DEFAULT_NAME_COLUMNS:
            column = DEFAULT_NAME_COLUMNS[column_name]
        # Maybe they are columns that Lexedata knows to handle?
        elif column_name in LEXEDATA_COLUMNS:
            column = LEXEDATA_COLUMNS[column_name]
        # Maybe they are columns inherited from LingPy?
        elif column_name.upper() in LINGPY_COLUMNS:
            column = LINGPY_COLUMNS[column_name.upper()]
        # Maybe they are some name we have seen before?
        elif column_name in OTHER_KNOWN_COLUMNS:
            column = OTHER_KNOWN_COLUMNS[column_name]
        else:
            # TODO: Maybe they look like they have a specific type?
            ...
            # Otherwise, they are probably just text to be kept.
            column = Column(
                datatype=Datatype(base="string"),
                default="",
                null=[""],
                name=column_name,
            )
        column.name = column_name

        ds[ds.primary_table].tableSchema.columns.append(column)
        summary = column.propertyUrl or column.datatype
        logger.info(f"Column {column_name} seems to be a {summary} column.")
        if column.propertyUrl:
            to_be_replaced = more_columns.pop(column.propertyUrl.uri, None)
            if to_be_replaced is not None:
                ds[ds.primary_table].tableSchema.columns.remove(to_be_replaced)

    for column in more_columns.values():
        logger.info(
            f"Also added column {column.name}, as expected for a FormTable.")

    ds[ds.primary_table].tableSchema.columns.sort(
        key=lambda k: colnames.index(k.name) if k.name in colnames else 1e10)

    # TODO: Once lexedata is properly published, we can give a better URL.
    ds.properties["dc:contributor"] = [
        "https://github.com/Anaphory/lexedata/blob/master/src/lexedata/edit/add_metadata.py"
    ]

    return ds
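# Minimal usage sketch (hypothetical driver code, not part of lexedata): build
# the enriched dataset for a metadata-free wordlist and serialize the metadata.
# Assumes pycldf's Dataset.write_metadata() writes to the filename attached to
# the TableGroup above, i.e. Wordlist-metadata.json next to forms.csv.
ds = add_metadata(Path("forms.csv"))
ds.write_metadata()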
def test_iterrows_invalid():
    with pytest.raises(ValueError, match=r'either namedtuples or dicts'):
        next(iterrows([], namedtuples=True, dicts=True))
def _read(self, what):
    return iterrows(self.raw_dir / "{0}.tsv".format(what), dicts=True, delimiter="\t")
def test_iterrows_with_bom(tmpdir):
    filepath = tmpdir / 'spam.csv'
    filepath.write_text('\ufeffcol1,col2\nval1,val2', encoding='utf8')
    rows = list(iterrows(str(filepath)))
    assert rows[0] == ['col1', 'col2']