def __init__(self, args, column_names):
    """
    Resolve GeoJSON-related column identifiers once, at construction.

    ``lat``/``lon`` are required arguments and are always resolved; the
    ``type``, ``geometry`` and ``key`` columns are optional and stay ``None``
    when the corresponding argument is falsy.
    """
    self.args = args
    self.column_names = column_names

    # The original pre-assigned lat_column/lon_column to None and then
    # immediately overwrote them; those dead stores are removed here.
    self.lat_column = match_column_identifier(column_names, self.args.lat, self.args.zero_based)
    self.lon_column = match_column_identifier(column_names, self.args.lon, self.args.zero_based)

    self.type_column = None
    if self.args.type:
        self.type_column = match_column_identifier(
            column_names, self.args.type, self.args.zero_based)

    self.geometry_column = None
    if self.args.geometry:
        self.geometry_column = match_column_identifier(
            column_names, self.args.geometry, self.args.zero_based)

    self.id_column = None
    if self.args.key:
        self.id_column = match_column_identifier(
            column_names, self.args.key, self.args.zero_based)
def test_match_column_identifier_string(self):
    """A string identifier resolves to the same index regardless of column_offset."""
    default_result = match_column_identifier(self.headers, 'i_work_here')
    self.assertEqual(2, default_result)

    offset_zero_result = match_column_identifier(self.headers, 'i_work_here', column_offset=0)
    self.assertEqual(2, offset_zero_result)
def test_match_column_identifier_string(self):
    """A string identifier resolves to the same index whether or not zero_based is set."""
    by_default = match_column_identifier(self.headers, 'i_work_here')
    by_zero_based = match_column_identifier(self.headers, 'i_work_here', zero_based=True)

    self.assertEqual(2, by_default)
    self.assertEqual(2, by_zero_based)
def __init__(self, args, column_names):
    """
    Resolve the lat/lon column identifiers (required) and the optional
    type/geometry/id identifiers, which are left as None when the matching
    command-line argument was not supplied.
    """
    self.args = args
    self.column_names = column_names

    zero_based = args.zero_based

    def resolve_optional(identifier):
        # None when the option was not given (or is otherwise falsy).
        if not identifier:
            return None
        return match_column_identifier(column_names, identifier, zero_based)

    self.lat_column = match_column_identifier(column_names, args.lat, zero_based)
    self.lon_column = match_column_identifier(column_names, args.lon, zero_based)
    self.type_column = resolve_optional(args.type)
    self.geometry_column = resolve_optional(args.geometry)
    self.id_column = resolve_optional(args.key)
def test_match_column_identifier_numeric(self):
    """Numeric identifier 3 maps to index 2 by default and to 3 with column_offset=0."""
    shifted = match_column_identifier(self.headers, 3)
    unshifted = match_column_identifier(self.headers, 3, column_offset=0)

    self.assertEqual(2, shifted)
    self.assertEqual(3, unshifted)
def main(self):
    """
    Convert CSV to JSON.

    Output is GeoJSON when --lat/--lon are given, newline-delimited JSON
    when --stream is given with --no-inference, otherwise a JSON array (or
    keyed object) produced via agate.
    """
    # We write JSON ourselves here (not through agate), so handle the
    # Python 2 output encoding explicitly.
    if six.PY2:
        stream = codecs.getwriter('utf-8')(self.output_file)
    else:
        stream = self.output_file

    json_kwargs = {
        'ensure_ascii': False,
        'indent': self.args.indent,
    }

    if six.PY2:
        json_kwargs['encoding'] = 'utf-8'

    def dump_json(data, newline=False):
        json.dump(data, stream, **json_kwargs)
        if newline:
            stream.write("\n")

    # Mutually-dependent argument validation.
    if self.args.lat and not self.args.lon:
        self.argparser.error('--lon is required whenever --lat is specified.')
    if self.args.lon and not self.args.lat:
        self.argparser.error('--lat is required whenever --lon is specified.')
    if self.args.crs and not self.args.lat:
        self.argparser.error('--crs is only allowed when --lat and --lon are also specified.')
    if self.args.streamOutput and (self.args.lat or self.args.lon or self.args.key):
        self.argparser.error('--stream is only allowed if --lat, --lon and --key are not specified.')

    # GeoJSON
    if self.args.lat and self.args.lon:
        rows = agate.reader(self.input_file, **self.reader_kwargs)
        column_names = next(rows)

        features = []
        min_lon = None
        min_lat = None
        max_lon = None
        max_lat = None

        lat_column = match_column_identifier(column_names, self.args.lat, self.args.zero_based)
        lon_column = match_column_identifier(column_names, self.args.lon, self.args.zero_based)

        if self.args.key:
            id_column = match_column_identifier(column_names, self.args.key, self.args.zero_based)
        else:
            id_column = None

        for row in rows:
            feature = OrderedDict()
            feature['type'] = 'Feature'
            properties = OrderedDict()
            geoid = None
            lat = None
            lon = None

            for i, c in enumerate(row):
                if i == lat_column:
                    try:
                        lat = float(c)
                    except ValueError:
                        lat = None

                    # Fix: only fold valid coordinates into the bounding box.
                    # A failed float() leaves lat as None, and comparing None
                    # with a float raises TypeError on Python 3 (and silently
                    # corrupts the bbox on Python 2).
                    if lat is not None:
                        if min_lat is None or lat < min_lat:
                            min_lat = lat
                        if max_lat is None or lat > max_lat:
                            max_lat = lat
                elif i == lon_column:
                    try:
                        lon = float(c)
                    except ValueError:
                        lon = None

                    if lon is not None:
                        if min_lon is None or lon < min_lon:
                            min_lon = lon
                        if max_lon is None or lon > max_lon:
                            max_lon = lon
                elif id_column is not None and i == id_column:
                    geoid = c
                else:
                    properties[column_names[i]] = c

            if id_column is not None:
                feature['id'] = geoid

            feature['geometry'] = OrderedDict([
                ('type', 'Point'),
                ('coordinates', [lon, lat])
            ])
            feature['properties'] = properties
            features.append(feature)

        output = OrderedDict([
            ('type', 'FeatureCollection'),
            ('bbox', [min_lon, min_lat, max_lon, max_lat]),
            ('features', features)
        ])

        if self.args.crs:
            output['crs'] = OrderedDict([
                ('type', 'name'),
                ('properties', {
                    'name': self.args.crs
                })
            ])

        dump_json(output)
    # Streaming with no type inference can bypass agate entirely.
    elif self.args.streamOutput and self.args.no_inference:
        rows = agate.reader(self.input_file, **self.reader_kwargs)
        column_names = next(rows)

        for row in rows:
            data = OrderedDict()
            for i, column in enumerate(column_names):
                try:
                    data[column] = row[i]
                except IndexError:
                    # Short row: pad missing trailing cells with null.
                    data[column] = None
            dump_json(data, newline=True)
    # Everything else goes through agate.
    else:
        table = agate.Table.from_csv(self.input_file, sniff_limit=self.args.sniff_limit, column_types=self.get_column_types())
        table.to_json(stream, key=self.args.key, newline=self.args.streamOutput, indent=self.args.indent)
def main(self):
    """
    Convert CSV to JSON.

    Output is GeoJSON when --lat/--lon are given, a keyed JSON object when
    --key is given, otherwise a JSON array (streamed line-by-line with
    --stream).
    """
    if six.PY2:
        stream = codecs.getwriter('utf-8')(self.output_file)
    else:
        stream = self.output_file

    json_kwargs = {
        'ensure_ascii': False,
        'indent': self.args.indent,
    }

    if six.PY2:
        json_kwargs['encoding'] = 'utf-8'

    def dump_json(data, newline=False):
        json.dump(data, stream, **json_kwargs)
        if newline:
            stream.write("\n")

    # Mutually-dependent argument validation.
    if self.args.lat and not self.args.lon:
        self.argparser.error(
            '--lon is required whenever --lat is specified.')
    if self.args.lon and not self.args.lat:
        self.argparser.error(
            '--lat is required whenever --lon is specified.')
    if self.args.crs and not self.args.lat:
        self.argparser.error(
            '--crs is only allowed when --lat and --lon are also specified.'
        )
    if self.args.streamOutput and (self.args.lat or self.args.lon or self.args.key):
        self.argparser.error(
            '--stream is only allowed if --lat, --lon and --key are not specified.'
        )

    rows = agate.reader(self.input_file, **self.reader_kwargs)
    column_names = next(rows)

    # GeoJSON
    if self.args.lat and self.args.lon:
        features = []
        min_lon = None
        min_lat = None
        max_lon = None
        max_lat = None

        lat_column = match_column_identifier(column_names, self.args.lat, self.args.zero_based)
        lon_column = match_column_identifier(column_names, self.args.lon, self.args.zero_based)

        if self.args.key:
            id_column = match_column_identifier(column_names, self.args.key, self.args.zero_based)
        else:
            id_column = None

        for row in rows:
            feature = OrderedDict()
            feature['type'] = 'Feature'
            properties = OrderedDict()
            geoid = None
            lat = None
            lon = None

            for i, c in enumerate(row):
                if i == lat_column:
                    try:
                        lat = float(c)
                    except ValueError:
                        lat = None

                    # Fix: skip bbox accumulation for unparseable values.
                    # Comparing a None lat against a float raises TypeError
                    # on Python 3 and corrupts the bbox on Python 2.
                    if lat is not None:
                        if min_lat is None or lat < min_lat:
                            min_lat = lat
                        if max_lat is None or lat > max_lat:
                            max_lat = lat
                elif i == lon_column:
                    try:
                        lon = float(c)
                    except ValueError:
                        lon = None

                    if lon is not None:
                        if min_lon is None or lon < min_lon:
                            min_lon = lon
                        if max_lon is None or lon > max_lon:
                            max_lon = lon
                elif id_column is not None and i == id_column:
                    geoid = c
                else:
                    properties[column_names[i]] = c

            if id_column is not None:
                feature['id'] = geoid

            feature['geometry'] = OrderedDict([('type', 'Point'),
                                               ('coordinates', [lon, lat])])
            feature['properties'] = properties
            features.append(feature)

        output = OrderedDict([('type', 'FeatureCollection'),
                              ('bbox', [min_lon, min_lat, max_lon, max_lat]),
                              ('features', features)])

        if self.args.crs:
            output['crs'] = OrderedDict([('type', 'name'),
                                         ('properties', {
                                             'name': self.args.crs
                                         })])

        dump_json(output)
    # Keyed JSON
    elif self.args.key:
        output = OrderedDict()

        for row in rows:
            data = OrderedDict()
            for i, column in enumerate(column_names):
                data[column] = row[i]

            k = data[self.args.key]

            # Duplicate keys would silently overwrite earlier rows.
            if k in output:
                raise NonUniqueKeyColumnException(
                    'Value %s is not unique in the key column.' %
                    six.text_type(k))

            output[k] = data

        dump_json(output)
    # Boring JSON
    else:
        output = []

        for row in rows:
            data = OrderedDict()
            for i, column in enumerate(column_names):
                try:
                    data[column] = row[i]
                except IndexError:
                    # Short row: pad missing trailing cells with null.
                    data[column] = None

            if (self.args.streamOutput):
                dump_json(data, newline=True)
            else:
                output.append(data)

        if not self.args.streamOutput:
            dump_json(output)
def main(self):
    """
    Convert CSV to JSON.

    Output is GeoJSON when --lat/--lon are given, a keyed JSON object when
    --key is given, otherwise a JSON array of row objects.
    """
    # Mutually-dependent argument validation.
    if self.args.lat and not self.args.lon:
        self.argparser.error('--lon is required whenever --lat is specified.')
    if self.args.lon and not self.args.lat:
        self.argparser.error('--lat is required whenever --lon is specified.')
    if self.args.crs and not self.args.lat:
        self.argparser.error('--crs is only allowed when --lat and --lon are also specified.')

    rows = CSVKitReader(self.input_file, **self.reader_kwargs)
    column_names = next(rows)

    if six.PY2:
        stream = codecs.getwriter('utf-8')(self.output_file)
    else:
        stream = self.output_file

    # GeoJSON
    if self.args.lat and self.args.lon:
        features = []
        min_lon = None
        min_lat = None
        max_lon = None
        max_lat = None

        lat_column = match_column_identifier(column_names, self.args.lat, self.args.zero_based)
        lon_column = match_column_identifier(column_names, self.args.lon, self.args.zero_based)

        if self.args.key:
            id_column = match_column_identifier(column_names, self.args.key, self.args.zero_based)
        else:
            id_column = None

        for row in rows:
            feature = OrderedDict()
            feature['type'] = 'Feature'
            properties = OrderedDict()
            geoid = None
            lat = None
            lon = None

            for i, c in enumerate(row):
                if i == lat_column:
                    try:
                        lat = float(c)
                    except ValueError:
                        lat = None

                    # Fix: only fold valid coordinates into the bounding box.
                    # Comparing a None lat against a float raises TypeError on
                    # Python 3 and corrupts the bbox on Python 2.
                    if lat is not None:
                        if min_lat is None or lat < min_lat:
                            min_lat = lat
                        if max_lat is None or lat > max_lat:
                            max_lat = lat
                elif i == lon_column:
                    try:
                        lon = float(c)
                    except ValueError:
                        lon = None

                    if lon is not None:
                        if min_lon is None or lon < min_lon:
                            min_lon = lon
                        if max_lon is None or lon > max_lon:
                            max_lon = lon
                elif id_column is not None and i == id_column:
                    geoid = c
                else:
                    properties[column_names[i]] = c

            if id_column is not None:
                feature['id'] = geoid

            feature['geometry'] = OrderedDict([
                ('type', 'Point'),
                ('coordinates', [lon, lat])
            ])
            feature['properties'] = properties
            features.append(feature)

        output = OrderedDict([
            ('type', 'FeatureCollection'),
            ('bbox', [min_lon, min_lat, max_lon, max_lat]),
            ('features', features)
        ])

        if self.args.crs:
            output['crs'] = OrderedDict([
                ('type', 'name'),
                ('properties', {
                    'name': self.args.crs
                })
            ])
    # Keyed JSON
    elif self.args.key:
        output = OrderedDict()

        for row in rows:
            data = OrderedDict()
            for i, column in enumerate(column_names):
                data[column] = row[i]

            k = data[self.args.key]

            # Duplicate keys would silently overwrite earlier rows.
            if k in output:
                raise NonUniqueKeyColumnException('Value %s is not unique in the key column.' % six.text_type(k))

            output[k] = data
    # Boring JSON
    else:
        output = []

        for row in rows:
            data = OrderedDict()
            for i, column in enumerate(column_names):
                try:
                    data[column] = row[i]
                except IndexError:
                    # Short row: pad missing trailing cells with null.
                    data[column] = None
            output.append(data)

    kwargs = {
        'ensure_ascii': False,
        'indent': self.args.indent,
    }

    if six.PY2:
        kwargs['encoding'] = 'utf-8'

    json.dump(output, stream, **kwargs)
def test_match_column_identifier_numeric(self):
    """Numeric identifier 3 maps to index 2 by default and to 3 when zero_based."""
    one_based = match_column_identifier(self.headers, 3)
    zero_based = match_column_identifier(self.headers, 3, zero_based=True)

    self.assertEqual(2, one_based)
    self.assertEqual(3, zero_based)
def main(self):
    """
    Convert CSV to JSON.

    Output is GeoJSON when --lat/--lon are given, newline-delimited JSON
    when --stream is combined with --no-inference (and no --skip-lines),
    otherwise agate's JSON serialization.
    """
    # We need to do this dance here, because we aren't writing through agate.
    if six.PY2:
        stream = codecs.getwriter('utf-8')(self.output_file)
    else:
        stream = self.output_file

    json_kwargs = {
        'ensure_ascii': False,
        'indent': self.args.indent,
    }

    if six.PY2:
        json_kwargs['encoding'] = 'utf-8'

    def default(obj):
        # agate can yield date/datetime cells; serialize them as ISO 8601.
        if isinstance(obj, (datetime.date, datetime.datetime)):
            return obj.isoformat()
        raise TypeError('%s is not JSON serializable' % repr(obj))

    def dump_json(data, newline=False):
        json.dump(data, stream, default=default, **json_kwargs)
        if newline:
            stream.write("\n")

    # Mutually-dependent argument validation.
    if self.args.lat and not self.args.lon:
        self.argparser.error(
            '--lon is required whenever --lat is specified.')
    if self.args.lon and not self.args.lat:
        self.argparser.error(
            '--lat is required whenever --lon is specified.')
    if self.args.crs and not self.args.lat:
        self.argparser.error(
            '--crs is only allowed when --lat and --lon are also specified.'
        )
    if self.args.streamOutput and (self.args.lat or self.args.lon or self.args.key):
        self.argparser.error(
            '--stream is only allowed if --lat, --lon and --key are not specified.'
        )

    # GeoJSON
    if self.args.lat and self.args.lon:
        table = agate.Table.from_csv(self.input_file,
                                     skip_lines=self.args.skip_lines,
                                     sniff_limit=self.args.sniff_limit,
                                     column_types=self.get_column_types(),
                                     **self.reader_kwargs)

        features = []
        min_lon = None
        min_lat = None
        max_lon = None
        max_lat = None

        lat_column = match_column_identifier(table.column_names, self.args.lat, self.args.zero_based)
        lon_column = match_column_identifier(table.column_names, self.args.lon, self.args.zero_based)

        if self.args.key:
            id_column = match_column_identifier(table.column_names, self.args.key, self.args.zero_based)
        else:
            id_column = None

        for row in table.rows:
            feature = OrderedDict()
            feature['type'] = 'Feature'
            properties = OrderedDict()
            geoid = None
            lat = None
            lon = None

            for i, c in enumerate(row):
                if i == lat_column:
                    try:
                        lat = float(c)
                    except ValueError:
                        lat = None

                    # Fix: only fold valid coordinates into the bounding box.
                    # A failed float() leaves lat as None, and None cannot be
                    # compared with a float on Python 3 (TypeError).
                    if lat is not None:
                        if min_lat is None or lat < min_lat:
                            min_lat = lat
                        if max_lat is None or lat > max_lat:
                            max_lat = lat
                elif i == lon_column:
                    try:
                        lon = float(c)
                    except ValueError:
                        lon = None

                    if lon is not None:
                        if min_lon is None or lon < min_lon:
                            min_lon = lon
                        if max_lon is None or lon > max_lon:
                            max_lon = lon
                elif i == id_column:
                    geoid = c
                else:
                    properties[table.column_names[i]] = c

            if id_column is not None:
                feature['id'] = geoid

            feature['geometry'] = OrderedDict([('type', 'Point'),
                                               ('coordinates', [lon, lat])])
            feature['properties'] = properties
            features.append(feature)

        output = OrderedDict([('type', 'FeatureCollection'),
                              ('bbox', [min_lon, min_lat, max_lon, max_lat]),
                              ('features', features)])

        if self.args.crs:
            output['crs'] = OrderedDict([('type', 'name'),
                                         ('properties', {
                                             'name': self.args.crs
                                         })])

        dump_json(output)
    # Streaming with no inference (and no skipped lines) can bypass agate.
    elif self.args.streamOutput and self.args.no_inference and not self.args.skip_lines:
        rows = agate.csv.reader(self.input_file, **self.reader_kwargs)
        column_names = next(rows)

        for row in rows:
            data = OrderedDict()
            for i, column in enumerate(column_names):
                try:
                    data[column] = row[i]
                except IndexError:
                    # Short row: pad missing trailing cells with null.
                    data[column] = None
            dump_json(data, newline=True)
    # Everything else goes through agate.
    else:
        table = agate.Table.from_csv(self.input_file,
                                     skip_lines=self.args.skip_lines,
                                     sniff_limit=self.args.sniff_limit,
                                     column_types=self.get_column_types(),
                                     **self.reader_kwargs)
        table.to_json(self.output_file,
                      key=self.args.key,
                      newline=self.args.streamOutput,
                      indent=self.args.indent)
def main(self):
    """
    Join two or more CSV files on the requested columns and write the result.
    """
    self.input_files = []

    for path in self.args.input_paths:
        self.input_files.append(self._open_input_file(path))

    if len(self.input_files) < 2:
        self.argparser.error(
            'You must specify at least two files to join.')

    if self.args.columns:
        join_column_names = self._parse_join_column_names(
            self.args.columns)

        # A single column name applies to every input file.
        if len(join_column_names) == 1:
            join_column_names = join_column_names * len(self.input_files)

        if len(join_column_names) != len(self.input_files):
            self.argparser.error(
                'The number of join column names must match the number of files, or be a single column name that exists in all files.'
            )

    wants_outer = self.args.left_join or self.args.right_join or self.args.outer_join
    if wants_outer and not self.args.columns:
        self.argparser.error(
            'You must provide join column names when performing an outer join.'
        )

    if self.args.left_join and self.args.right_join:
        self.argparser.error(
            'It is not valid to specify both a left and a right join.')

    # Materialize each input as a list of rows, closing files as we go.
    tables = []
    for handle in self.input_files:
        tables.append(list(CSVKitReader(handle, **self.reader_kwargs)))
        handle.close()

    join_column_ids = []
    if self.args.columns:
        for position, rows in enumerate(tables):
            join_column_ids.append(
                match_column_identifier(rows[0], join_column_names[position]))

    if self.args.left_join:
        # Left outer join
        jointab = tables[0]
        for position, rows in enumerate(tables[1:]):
            jointab = join.left_outer_join(jointab, join_column_ids[0], rows,
                                           join_column_ids[position + 1])
    elif self.args.right_join:
        # Right outer join: fold the remaining tables in from the right.
        jointab = tables[-1]
        remaining_tables = tables[:-1]
        remaining_tables.reverse()
        for position, rows in enumerate(remaining_tables):
            jointab = join.right_outer_join(rows, join_column_ids[-(position + 2)],
                                            jointab, join_column_ids[-1])
    elif self.args.outer_join:
        # Full outer join
        jointab = tables[0]
        for position, rows in enumerate(tables[1:]):
            jointab = join.full_outer_join(jointab, join_column_ids[0], rows,
                                           join_column_ids[position + 1])
    elif self.args.columns:
        # Inner join
        jointab = tables[0]
        for position, rows in enumerate(tables[1:]):
            jointab = join.inner_join(jointab, join_column_ids[0], rows,
                                      join_column_ids[position + 1])
    else:
        # Sequential join: paste tables side by side, row for row.
        jointab = tables[0]
        for rows in tables[1:]:
            jointab = join.sequential_join(jointab, rows)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    for row in jointab:
        output.writerow(row)
def main(self):
    """
    Convert CSV to JSON.

    Output is GeoJSON when --lat/--lon are given, newline-delimited JSON
    when --stream is given with --no-inference, otherwise agate's JSON
    serialization.
    """
    if six.PY2:
        stream = codecs.getwriter("utf-8")(self.output_file)
    else:
        stream = self.output_file

    json_kwargs = {"ensure_ascii": False, "indent": self.args.indent}

    if six.PY2:
        json_kwargs["encoding"] = "utf-8"

    def default(obj):
        # agate can yield date/datetime cells; serialize them as ISO 8601.
        if isinstance(obj, (datetime.date, datetime.datetime)):
            return obj.isoformat()
        raise TypeError("%s is not JSON serializable" % repr(obj))

    def dump_json(data, newline=False):
        json.dump(data, stream, default=default, **json_kwargs)
        if newline:
            stream.write("\n")

    # Mutually-dependent argument validation.
    if self.args.lat and not self.args.lon:
        self.argparser.error("--lon is required whenever --lat is specified.")
    if self.args.lon and not self.args.lat:
        self.argparser.error("--lat is required whenever --lon is specified.")
    if self.args.crs and not self.args.lat:
        self.argparser.error("--crs is only allowed when --lat and --lon are also specified.")
    if self.args.streamOutput and (self.args.lat or self.args.lon or self.args.key):
        self.argparser.error("--stream is only allowed if --lat, --lon and --key are not specified.")

    # GeoJSON
    if self.args.lat and self.args.lon:
        table = agate.Table.from_csv(
            self.input_file,
            sniff_limit=self.args.sniff_limit,
            column_types=self.get_column_types(),
            **self.reader_kwargs
        )

        features = []
        min_lon = None
        min_lat = None
        max_lon = None
        max_lat = None

        lat_column = match_column_identifier(table.column_names, self.args.lat, self.args.zero_based)
        lon_column = match_column_identifier(table.column_names, self.args.lon, self.args.zero_based)

        if self.args.key:
            id_column = match_column_identifier(table.column_names, self.args.key, self.args.zero_based)
        else:
            id_column = None

        for row in table.rows:
            feature = OrderedDict()
            feature["type"] = "Feature"
            properties = OrderedDict()
            geoid = None
            lat = None
            lon = None

            for i, c in enumerate(row):
                if i == lat_column:
                    try:
                        lat = float(c)
                    except ValueError:
                        lat = None

                    # Fix: only fold valid coordinates into the bounding box.
                    # Comparing a None lat against a float raises TypeError on
                    # Python 3 and corrupts the bbox on Python 2.
                    if lat is not None:
                        if min_lat is None or lat < min_lat:
                            min_lat = lat
                        if max_lat is None or lat > max_lat:
                            max_lat = lat
                elif i == lon_column:
                    try:
                        lon = float(c)
                    except ValueError:
                        lon = None

                    if lon is not None:
                        if min_lon is None or lon < min_lon:
                            min_lon = lon
                        if max_lon is None or lon > max_lon:
                            max_lon = lon
                elif i == id_column:
                    geoid = c
                else:
                    properties[table.column_names[i]] = c

            if id_column is not None:
                feature["id"] = geoid

            feature["geometry"] = OrderedDict([("type", "Point"), ("coordinates", [lon, lat])])
            feature["properties"] = properties
            features.append(feature)

        output = OrderedDict(
            [("type", "FeatureCollection"), ("bbox", [min_lon, min_lat, max_lon, max_lat]), ("features", features)]
        )

        if self.args.crs:
            output["crs"] = OrderedDict([("type", "name"), ("properties", {"name": self.args.crs})])

        dump_json(output)
    # Streaming with no type inference can bypass agate entirely.
    elif self.args.streamOutput and self.args.no_inference:
        rows = agate.csv.reader(self.input_file, **self.reader_kwargs)
        column_names = next(rows)

        for row in rows:
            data = OrderedDict()
            for i, column in enumerate(column_names):
                try:
                    data[column] = row[i]
                except IndexError:
                    # Short row: pad missing trailing cells with null.
                    data[column] = None
            dump_json(data, newline=True)
    # Everything else goes through agate.
    else:
        table = agate.Table.from_csv(
            self.input_file,
            sniff_limit=self.args.sniff_limit,
            column_types=self.get_column_types(),
            **self.reader_kwargs
        )
        table.to_json(self.output_file, key=self.args.key, newline=self.args.streamOutput, **json_kwargs)
def main(self):
    """
    Convert CSV to JSON.

    NOTE(review): Python 2 only — uses ``rows.next()`` and ``unicode``.
    A coordinate cell that fails ``float()`` will raise ValueError here;
    that behavior is preserved deliberately.
    """
    # Validate the mutually-dependent geo arguments first.
    if self.args.lat and not self.args.lon:
        self.argparser.error(
            '--lon is required whenever --lat is specified.')
    if self.args.lon and not self.args.lat:
        self.argparser.error(
            '--lat is required whenever --lon is specified.')
    if self.args.crs and not self.args.lat:
        self.argparser.error(
            '--crs is only allowed when --lat and --lon are also specified.'
        )

    rows = CSVKitReader(self.args.file, **self.reader_kwargs)
    column_names = rows.next()

    stream = codecs.getwriter('utf-8')(self.output_file)

    # GeoJSON
    if self.args.lat and self.args.lon:
        features = []
        min_lon = None
        min_lat = None
        max_lon = None
        max_lat = None

        lat_column = match_column_identifier(column_names, self.args.lat, self.args.zero_based)
        lon_column = match_column_identifier(column_names, self.args.lon, self.args.zero_based)

        id_column = None
        if self.args.key:
            id_column = match_column_identifier(column_names, self.args.key, self.args.zero_based)

        for row in rows:
            properties = {}
            geoid = None
            lat = None
            lon = None

            for index, cell in enumerate(row):
                if index == lat_column:
                    lat = float(cell)

                    if min_lat is None or lat < min_lat:
                        min_lat = lat
                    if max_lat is None or lat > max_lat:
                        max_lat = lat
                elif index == lon_column:
                    lon = float(cell)

                    if min_lon is None or lon < min_lon:
                        min_lon = lon
                    if max_lon is None or lon > max_lon:
                        max_lon = lon
                elif id_column is not None and index == id_column:
                    geoid = cell
                else:
                    properties[column_names[index]] = cell

            feature = {'type': 'Feature'}

            if id_column is not None:
                feature['id'] = geoid

            feature['geometry'] = {
                'type': 'Point',
                'coordinates': [lon, lat]
            }
            feature['properties'] = properties

            features.append(feature)

        output = {
            'type': 'FeatureCollection',
            'bbox': [min_lon, min_lat, max_lon, max_lat],
            'features': features
        }

        if self.args.crs:
            output['crs'] = {
                'type': 'name',
                'properties': {
                    'name': self.args.crs
                }
            }
    # Keyed JSON
    elif self.args.key:
        output = {}

        for row in rows:
            row_dict = dict(zip(column_names, row))
            k = row_dict[self.args.key]

            # Duplicate keys would silently overwrite earlier rows.
            if k in output:
                raise NonUniqueKeyColumnException(
                    'Value %s is not unique in the key column.' % unicode(k))

            output[k] = row_dict
    # Boring JSON
    else:
        output = [dict(zip(column_names, row)) for row in rows]

    json.dump(output, stream, ensure_ascii=False, indent=self.args.indent, encoding='utf-8')
def test_match_column_identifier_numeric(self):
    """Numeric identifier 3 maps to index 2 with default settings."""
    resolved = match_column_identifier(self.headers, 3)
    self.assertEqual(2, resolved)
def test_match_column_identifier_string(self):
    """Looking up a column by name yields index 2, with or without column_offset=0."""
    self.assertEqual(
        2, match_column_identifier(self.headers, 'i_work_here'))
    self.assertEqual(
        2, match_column_identifier(self.headers, 'i_work_here', column_offset=0))
def main(self):
    """
    Join two or more CSV files with agate and write the joined table.
    """
    self.input_files = []
    for path in self.args.input_paths:
        self.input_files.append(self._open_input_file(path))

    if len(self.input_files) < 2:
        self.argparser.error('You must specify at least two files to join.')

    if self.args.columns:
        join_column_names = self._parse_join_column_names(self.args.columns)

        # A single column name applies to every input file.
        if len(join_column_names) == 1:
            join_column_names = join_column_names * len(self.input_files)

        if len(join_column_names) != len(self.input_files):
            self.argparser.error('The number of join column names must match the number of files, or be a single column name that exists in all files.')

    wants_outer = self.args.left_join or self.args.right_join or self.args.outer_join
    if wants_outer and not self.args.columns:
        self.argparser.error('You must provide join column names when performing an outer join.')

    if self.args.left_join and self.args.right_join:
        self.argparser.error('It is not valid to specify both a left and a right join.')

    # Hoist the shared parse settings out of the loop.
    sniff_limit = self.args.sniff_limit
    column_types = self.get_column_types()

    tables = []
    for handle in self.input_files:
        tables.append(agate.Table.from_csv(
            handle,
            skip_lines=self.args.skip_lines,
            sniff_limit=sniff_limit,
            column_types=column_types,
            **self.reader_kwargs
        ))
        handle.close()

    join_column_ids = []
    if self.args.columns:
        for idx, tbl in enumerate(tables):
            join_column_ids.append(match_column_identifier(tbl.column_names, join_column_names[idx]))

    jointab = tables[0]

    if self.args.left_join:
        # Left outer join
        for idx, tbl in enumerate(tables[1:]):
            jointab = agate.Table.join(jointab, tbl,
                                       join_column_ids[0], join_column_ids[idx + 1])
    elif self.args.right_join:
        # Right outer join: fold the remaining tables in from the right.
        jointab = tables[-1]
        remaining_tables = tables[:-1]
        remaining_tables.reverse()
        for idx, tbl in enumerate(remaining_tables):
            jointab = agate.Table.join(jointab, tbl,
                                       join_column_ids[-(idx + 2)], join_column_ids[-1])
    elif self.args.outer_join:
        # Full outer join
        for idx, tbl in enumerate(tables[1:]):
            jointab = agate.Table.join(jointab, tbl,
                                       join_column_ids[0], join_column_ids[idx + 1],
                                       full_outer=True)
    elif self.args.columns:
        # Inner join
        for idx, tbl in enumerate(tables[1:]):
            jointab = agate.Table.join(jointab, tbl,
                                       join_column_ids[0], join_column_ids[idx + 1],
                                       inner=True)
    else:
        # Sequential join: paste tables side by side.
        for tbl in tables[1:]:
            jointab = agate.Table.join(jointab, tbl, full_outer=True)

    jointab.to_csv(self.output_file, **self.writer_kwargs)
def main(self):
    """
    Join the rows of two or more CSV inputs and write the combined rows.
    """
    self.input_files = [self._open_input_file(path)
                        for path in self.args.input_paths]

    if len(self.input_files) < 2:
        self.argparser.error('You must specify at least two files to join.')

    if self.args.columns:
        join_column_names = self._parse_join_column_names(self.args.columns)

        # One name means: join every file on that same column.
        if len(join_column_names) == 1:
            join_column_names = join_column_names * len(self.input_files)

        if len(join_column_names) != len(self.input_files):
            self.argparser.error('The number of join column names must match the number of files, or be a single column name that exists in all files.')

    if (self.args.left_join or self.args.right_join or self.args.outer_join) and not self.args.columns:
        self.argparser.error('You must provide join column names when performing an outer join.')

    if self.args.left_join and self.args.right_join:
        self.argparser.error('It is not valid to specify both a left and a right join.')

    # Read every file fully into memory, closing handles as we go.
    tables = []
    for infile in self.input_files:
        tables.append(list(CSVKitReader(infile, **self.reader_kwargs)))
        infile.close()

    join_column_ids = []
    if self.args.columns:
        for n, rows in enumerate(tables):
            join_column_ids.append(match_column_identifier(rows[0], join_column_names[n]))

    if self.args.left_join:
        # Left outer join
        jointab = tables[0]
        for n, rows in enumerate(tables[1:]):
            jointab = join.left_outer_join(
                jointab, join_column_ids[0], rows, join_column_ids[n + 1])
    elif self.args.right_join:
        # Right outer join, folding in the earlier tables from the right.
        jointab = tables[-1]
        remaining_tables = tables[:-1]
        remaining_tables.reverse()
        for n, rows in enumerate(remaining_tables):
            jointab = join.right_outer_join(
                rows, join_column_ids[-(n + 2)], jointab, join_column_ids[-1])
    elif self.args.outer_join:
        # Full outer join
        jointab = tables[0]
        for n, rows in enumerate(tables[1:]):
            jointab = join.full_outer_join(
                jointab, join_column_ids[0], rows, join_column_ids[n + 1])
    elif self.args.columns:
        # Inner join
        jointab = tables[0]
        for n, rows in enumerate(tables[1:]):
            jointab = join.inner_join(
                jointab, join_column_ids[0], rows, join_column_ids[n + 1])
    else:
        # Sequential join: concatenate columns row by row.
        jointab = tables[0]
        for rows in tables[1:]:
            jointab = join.sequential_join(jointab, rows)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    for row in jointab:
        output.writerow(row)
def main(self):
    """
    Fuzzy-join two CSV files.

    The first input file is the main table, the second is the lookup table.
    Each main-table row is matched to the lookup row whose fuzzy-match column
    is most similar (via fuzzywuzzy's Levenshtein scorer), optionally limited
    to lookup rows that agree exactly on the --hard join columns. The output
    is main-row + match_score + lookup-row; unmatched rows get blank fields.
    """
    tab = table.Table.from_csv(self.args.files[0], **self.reader_kwargs)
    lookup = table.Table.from_csv(self.args.files[1], **self.reader_kwargs)

    # store indices of columns to do the fuzzy match on
    fuzzy_columns = self.args.columns

    if not fuzzy_columns:
        self.argparser.error("Fuzzy match column -f must be specified.")

    # "a=b" maps main-table column a to lookup-table column b; a bare name
    # is used for both sides.
    if "=" in fuzzy_columns:
        (tab_fuzzy_field, lookup_fuzzy_field) = fuzzy_columns.split("=")
    else:
        (tab_fuzzy_field, lookup_fuzzy_field) = (fuzzy_columns, fuzzy_columns)

    tab_fuzzy_index = match_column_identifier(tab.headers(), tab_fuzzy_field)
    lookup_fuzzy_index = match_column_identifier(lookup.headers(), lookup_fuzzy_field)

    # store indices of columns to do a normal left join on.
    # limiting fuzzy matches to a set of hard matches on other columns is optional.
    hard_joins = []

    if self.args.hard:
        for pair in self.args.hard.split(","):
            # Same "a=b" / bare-name convention as the fuzzy column.
            if "=" in pair:
                (tabfield, lookupfield) = pair.split("=")
            else:
                (tabfield, lookupfield) = (pair, pair)

            hard_joins.append(
                (
                    match_column_identifier(tab.headers(), tabfield),
                    match_column_identifier(lookup.headers(), lookupfield),
                )
            )

    if len(hard_joins) == 0:
        # we're not doing any hard joins, so our pool of possible matches is everything
        join_dict = lookup.to_rows()
    else:
        # create a hash using the hard-join keys so we don't recreate a list of possible matches for the fuzzy join each time.
        # in other words, if we're doing a fuzzy join on a lawmaker's name, but limiting via hard join to records that match
        # on state and party, we just have to do the fuzzy join on join_dict[('democrat','ny')]
        join_dict = {}

        for row in lookup.to_rows():
            hashkey = tuple([row[x[1]] for x in hard_joins])

            if hashkey not in join_dict.keys():
                join_dict[hashkey] = []

            join_dict[hashkey].append(row)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    # Output schema: main columns, then the score, then lookup columns.
    newheaders = tab.headers() + ["match_score"] + lookup.headers()
    output.writerow(newheaders)

    def getfirstmatch(lookup_rows, lookup_fuzzy_index, fuzzy_match):
        # it's assumed that most lookup tables will have the intended match appear only once, but if there are
        # multiple rows containing the same value of the fuzzy match column, where that value was selected as
        # the best match for the main table, we just take the first time it appears in the lookup table
        for row in lookup_rows:
            if row[lookup_fuzzy_index] == fuzzy_match:
                return row

    for row in tab.to_rows():
        possible_matches = []

        if type(join_dict) == list:
            # we're not using any hard-join columns, so search the whole table
            possible_matches = join_dict
        else:
            # use the previously assembled hash to rapidly look for sets of rows that match on all hard-join fields
            hashkey = tuple(
                [row[x[0]] for x in hard_joins]
            )  # find values in the columns specified in the table_col side of hard_joins

            if hashkey in join_dict.keys():
                possible_matches = join_dict[hashkey]

        if not len(possible_matches):
            # No candidates at all: emit the main row with blank score/lookup fields.
            output.writerow(list(row) + [""] + ["" for x in lookup.headers()])
        else:
            # use fuzzywuzzy's levenstein implementation to select the best match in our universe of options.
            lookup_vals = [x[lookup_fuzzy_index] for x in possible_matches]

            # the meat is all in this one line.
            # score is a number from 0 to 100 representing how good the fuzzy match is, and it becomes
            # a field in our output table so users can react accordingly.
            (fuzzy_match, fuzzy_score) = process.extractOne(row[tab_fuzzy_index], lookup_vals)

            # we could say a really bad score counts as no match, and set nomatch=True, but we won't
            lookup_row = getfirstmatch(possible_matches, lookup_fuzzy_index, fuzzy_match)

            output.writerow(list(row) + [fuzzy_score] + list(lookup_row))
def test_match_column_identifier_string(self):
    """Name-based lookup is unaffected by the zero_based flag."""
    for kwargs in ({}, {'zero_based': True}):
        self.assertEqual(
            2, match_column_identifier(self.headers, 'i_work_here', **kwargs))
def main(self):
    """
    Convert CSV to JSON.

    Three output modes, selected by the command-line arguments:

    * GeoJSON FeatureCollection when ``--lat``/``--lon`` are given
      (``--key`` then names the feature-id column, ``--crs`` adds a CRS member);
    * a dict keyed by a unique column when only ``--key`` is given
      (raises NonUniqueKeyColumnException on duplicate key values);
    * otherwise a plain list of row dicts.
    """
    # --lat/--lon must be given together; --crs only makes sense with them.
    if self.args.lat and not self.args.lon:
        self.argparser.error('--lon is required whenever --lat is specified.')
    if self.args.lon and not self.args.lat:
        self.argparser.error('--lat is required whenever --lon is specified.')
    if self.args.crs and not self.args.lat:
        self.argparser.error('--crs is only allowed when --lat and --lon are also specified.')

    rows = CSVKitReader(self.input_file, **self.reader_kwargs)
    column_names = next(rows)

    # On Python 2, json.dump produces text that must be encoded on the way out.
    if six.PY2:
        stream = codecs.getwriter('utf-8')(self.output_file)
    else:
        stream = self.output_file

    # GeoJSON
    if self.args.lat and self.args.lon:
        features = []
        # Bounding box of all parsed coordinates, grown row by row.
        min_lon = None
        min_lat = None
        max_lon = None
        max_lat = None

        lat_column = match_column_identifier(column_names, self.args.lat, self.args.zero_based)
        lon_column = match_column_identifier(column_names, self.args.lon, self.args.zero_based)

        if self.args.key:
            id_column = match_column_identifier(column_names, self.args.key, self.args.zero_based)
        else:
            id_column = None

        for row in rows:
            feature = {'type': 'Feature'}
            properties = {}
            geoid = None
            lat = None
            lon = None

            for i, c in enumerate(row):
                if i == lat_column:
                    # Tolerate unparseable coordinate cells instead of crashing
                    # the whole conversion (consistent with the agate-based
                    # implementation); the feature is emitted with a None
                    # coordinate and skipped for bbox purposes.
                    try:
                        lat = float(c)
                    except ValueError:
                        lat = None
                    if lat is not None:
                        if min_lat is None or lat < min_lat:
                            min_lat = lat
                        if max_lat is None or lat > max_lat:
                            max_lat = lat
                elif i == lon_column:
                    try:
                        lon = float(c)
                    except ValueError:
                        lon = None
                    if lon is not None:
                        if min_lon is None or lon < min_lon:
                            min_lon = lon
                        if max_lon is None or lon > max_lon:
                            max_lon = lon
                elif id_column is not None and i == id_column:
                    geoid = c
                else:
                    # Everything that isn't a coordinate or the id becomes a property.
                    properties[column_names[i]] = c

            if id_column is not None:
                feature['id'] = geoid

            feature['geometry'] = {
                'type': 'Point',
                'coordinates': [lon, lat]
            }
            feature['properties'] = properties
            features.append(feature)

        output = {
            'type': 'FeatureCollection',
            'bbox': [min_lon, min_lat, max_lon, max_lat],
            'features': features
        }

        if self.args.crs:
            output['crs'] = {
                'type': 'name',
                'properties': {
                    'name': self.args.crs
                }
            }
    # Keyed JSON
    elif self.args.key:
        output = {}

        for row in rows:
            row_dict = dict(zip(column_names, row))
            k = row_dict[self.args.key]

            if k in output:
                raise NonUniqueKeyColumnException('Value %s is not unique in the key column.' % six.text_type(k))

            output[k] = row_dict
    # Boring JSON
    else:
        output = [dict(zip(column_names, row)) for row in rows]

    kwargs = {
        'ensure_ascii': False,
        'indent': self.args.indent,
    }

    if six.PY2:
        kwargs['encoding'] = 'utf-8'

    json.dump(output, stream, **kwargs)
def test_match_column_which_could_be_integer_name_is_treated_as_positional_id(self):
    """A digit string is interpreted as a column position, not as a header
    name — 1-based by default, 0-based when zero_based=True."""
    default_index = match_column_identifier(self.headers, '1')
    zero_based_index = match_column_identifier(self.headers, '1', zero_based=True)
    self.assertEqual(0, default_index)
    self.assertEqual(1, zero_based_index)
def main(self):
    """
    Convert CSV to JSON.

    Output mode is chosen from the command-line arguments:

    * GeoJSON FeatureCollection when --lat and --lon are given;
    * newline-delimited JSON objects when --stream is combined with
      --no-inference and no skip-lines/sniffing (pure streaming path);
    * otherwise agate's Table.to_json (optionally keyed / streamed).
    """
    if self.additional_input_expected():
        # The streaming fast path can legitimately wait on stdin; every other
        # mode needs a real input source up front.
        if self.args.streamOutput and self.args.no_inference and not self.args.skip_lines and self.args.sniff_limit == 0:
            sys.stderr.write('No input file or piped data provided. Waiting for standard input:\n')
        else:
            self.argparser.error('You must provide an input file or piped data.')

    # We need to do this dance here, because we aren't writing through agate.
    if six.PY2:
        stream = codecs.getwriter('utf-8')(self.output_file)
    else:
        stream = self.output_file

    json_kwargs = {
        'ensure_ascii': False,
        'indent': self.args.indent,
    }

    if six.PY2:
        json_kwargs['encoding'] = 'utf-8'

    def default(obj):
        # json.dump fallback: serialize dates as ISO-8601 and decimals as
        # strings; anything else is a hard error.
        if isinstance(obj, (datetime.date, datetime.datetime)):
            return obj.isoformat()
        elif isinstance(obj, decimal.Decimal):
            return str(obj)
        raise TypeError('%s is not JSON serializable' % repr(obj))

    def dump_json(data, newline=False):
        # Shared writer so all branches use the same kwargs and fallback.
        json.dump(data, stream, default=default, **json_kwargs)
        if newline:
            stream.write("\n")

    """
    Convert CSV to JSON.
    """
    # Argument cross-validation: the geo options only make sense together,
    # and streaming is incompatible with geo/keyed output.
    if self.args.lat and not self.args.lon:
        self.argparser.error('--lon is required whenever --lat is specified.')
    if self.args.lon and not self.args.lat:
        self.argparser.error('--lat is required whenever --lon is specified.')
    if self.args.crs and not self.args.lat:
        self.argparser.error('--crs is only allowed when --lat and --lon are also specified.')
    if self.args.type and not self.args.lat:
        self.argparser.error('--type is only allowed when --lat and --lon are also specified.')
    if self.args.geometry and not self.args.lat:
        self.argparser.error('--geometry is only allowed when --lat and --lon are also specified.')
    if self.args.streamOutput and (self.args.lat or self.args.lon or self.args.key):
        self.argparser.error('--stream is only allowed if --lat, --lon and --key are not specified.')

    # GeoJSON
    if self.args.lat and self.args.lon:
        table = agate.Table.from_csv(self.input_file, skip_lines=self.args.skip_lines, sniff_limit=self.args.sniff_limit, column_types=self.get_column_types(), **self.reader_kwargs)

        features = []
        # Bounding box accumulated across all rows; stored on self so the
        # nested closures below can mutate it.
        self.min_lon = None
        self.min_lat = None
        self.max_lon = None
        self.max_lat = None

        def update_boundary_lat(lat):
            # Grow the latitude extent of the bbox.
            if self.min_lat is None or lat < self.min_lat:
                self.min_lat = lat
            if self.max_lat is None or lat > self.max_lat:
                self.max_lat = lat

        def update_boundary_lon(lon):
            # Grow the longitude extent of the bbox.
            if self.min_lon is None or lon < self.min_lon:
                self.min_lon = lon
            if self.max_lon is None or lon > self.max_lon:
                self.max_lon = lon

        def update_boundary_coordinates(coordinates):
            # Recursively walk a GeoJSON coordinates array of any nesting
            # depth; a leaf is a short list whose first element is numeric.
            if len(coordinates) <= 3 and isinstance(coordinates[0], (float, int)):
                update_boundary_lon(coordinates[0])
                update_boundary_lat(coordinates[1])
            else:
                for coordinate in coordinates:
                    update_boundary_coordinates(coordinate)

        lat_column = match_column_identifier(table.column_names, self.args.lat, self.args.zero_based)
        lon_column = match_column_identifier(table.column_names, self.args.lon, self.args.zero_based)

        if self.args.type:
            type_column = match_column_identifier(table.column_names, self.args.type, self.args.zero_based)
        else:
            type_column = None

        if self.args.geometry:
            geometry_column = match_column_identifier(table.column_names, self.args.geometry, self.args.zero_based)
        else:
            geometry_column = None

        if self.args.key:
            id_column = match_column_identifier(table.column_names, self.args.key, self.args.zero_based)
        else:
            id_column = None

        for row in table.rows:
            feature = OrderedDict()
            feature['type'] = 'Feature'
            properties = OrderedDict()
            feature_id = None
            lat = None
            lon = None
            geometry = None

            for i, c in enumerate(row):
                if c is None:
                    continue
                if i == lat_column:
                    # Unparseable coordinates become None rather than aborting.
                    try:
                        lat = float(c)
                    except ValueError:
                        lat = None
                    if not self.args.no_bbox:
                        update_boundary_lat(lat)
                elif i == lon_column:
                    try:
                        lon = float(c)
                    except ValueError:
                        lon = None
                    if not self.args.no_bbox:
                        update_boundary_lon(lon)
                elif i == id_column:
                    feature_id = c
                elif i == type_column:
                    pass  # Prevent "type" from being added to "properties".
                elif i == geometry_column:
                    # The cell holds a serialized GeoJSON geometry object.
                    geometry = json.loads(c)
                    if not self.args.no_bbox and 'coordinates' in geometry:
                        update_boundary_coordinates(geometry['coordinates'])
                elif c:
                    properties[table.column_names[i]] = c

            if id_column is not None:
                feature['id'] = feature_id

            feature['properties'] = properties

            # Prefer an explicit geometry column; otherwise build a Point
            # when both coordinates are present (a null geometry is emitted
            # when neither coordinate parsed).
            if geometry or lat is None and lon is None:
                feature['geometry'] = geometry
            elif lon and lat:
                feature['geometry'] = OrderedDict([('type', 'Point'), ('coordinates', [lon, lat])])

            features.append(feature)

        items = [
            ('type', 'FeatureCollection'),
            ('features', features),
        ]
        if not self.args.no_bbox:
            # GeoJSON bbox order: [west, south, east, north].
            items.insert(1, ('bbox', [self.min_lon, self.min_lat, self.max_lon, self.max_lat]))

        output = OrderedDict(items)

        if self.args.crs:
            output['crs'] = OrderedDict([('type', 'name'), ('properties', {'name': self.args.crs})])

        dump_json(output)
    elif self.args.streamOutput and self.args.no_inference and not self.args.skip_lines and self.args.sniff_limit == 0:
        # Pure streaming path: one JSON object per CSV row, no type inference,
        # rows never held in memory.
        rows = agate.csv.reader(self.input_file, **self.reader_kwargs)
        column_names = next(rows)

        for row in rows:
            data = OrderedDict()
            for i, column in enumerate(column_names):
                # Pad short rows with None so every object has all columns.
                try:
                    data[column] = row[i]
                except IndexError:
                    data[column] = None
            dump_json(data, newline=True)
    else:
        # General case: let agate handle typing, keying and serialization.
        table = agate.Table.from_csv(self.input_file, skip_lines=self.args.skip_lines, sniff_limit=self.args.sniff_limit, column_types=self.get_column_types(), **self.reader_kwargs)
        table.to_json(self.output_file, key=self.args.key, newline=self.args.streamOutput, indent=self.args.indent, )