Example #1
        def __init__(self, args, column_names):
            self.args = args
            self.column_names = column_names
            self.lat_column = None
            self.lon_column = None
            self.type_column = None
            self.geometry_column = None
            self.id_column = None

            self.lat_column = match_column_identifier(column_names,
                                                      self.args.lat,
                                                      self.args.zero_based)

            self.lon_column = match_column_identifier(column_names,
                                                      self.args.lon,
                                                      self.args.zero_based)

            if self.args.type:
                self.type_column = match_column_identifier(
                    column_names, self.args.type, self.args.zero_based)

            if self.args.geometry:
                self.geometry_column = match_column_identifier(
                    column_names, self.args.geometry, self.args.zero_based)

            if self.args.key:
                self.id_column = match_column_identifier(
                    column_names, self.args.key, self.args.zero_based)
Example #2
 def test_match_column_identifier_string(self):
     self.assertEqual(2, match_column_identifier(self.headers,
                                                 'i_work_here'))
     self.assertEqual(
         2,
         match_column_identifier(self.headers,
                                 'i_work_here',
                                 column_offset=0))
Example #3
 def test_match_column_identifier_string(self):
     self.assertEqual(2, match_column_identifier(self.headers,
                                                 'i_work_here'))
     self.assertEqual(
         2,
         match_column_identifier(self.headers,
                                 'i_work_here',
                                 zero_based=True))
Example #4
        def __init__(self, args, column_names):
            self.args = args
            self.column_names = column_names

            self.lat_column = match_column_identifier(column_names, self.args.lat, self.args.zero_based)

            self.lon_column = match_column_identifier(column_names, self.args.lon, self.args.zero_based)

            if self.args.type:
                self.type_column = match_column_identifier(column_names, self.args.type, self.args.zero_based)
            else:
                self.type_column = None

            if self.args.geometry:
                self.geometry_column = match_column_identifier(column_names, self.args.geometry, self.args.zero_based)
            else:
                self.geometry_column = None

            if self.args.key:
                self.id_column = match_column_identifier(column_names, self.args.key, self.args.zero_based)
            else:
                self.id_column = None
Example #5
 def test_match_column_identifier_numeric(self):
     self.assertEqual(2, match_column_identifier(self.headers, 3))
     self.assertEqual(
         3, match_column_identifier(self.headers, 3, column_offset=0))
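
The tests in Examples #2 through #5 pin down the same behavior under two historical keyword spellings: older releases took column_offset (column_offset=0 selecting zero-based numbering), while later releases take zero_based=True. A minimal Python 3 sketch of these semantics, assuming a simplified signature without csvkit's range checks or ColumnIdentifierError handling:

    def match_column_identifier(column_names, c, zero_based=False):
        # Identifiers that appear in the header row match by name...
        if isinstance(c, str) and not c.isdigit() and c in column_names:
            return column_names.index(c)
        # ...while integers and digit strings are positional:
        # one-based by default, zero-based when requested.
        return int(c) - (0 if zero_based else 1)

Under this sketch, match_column_identifier(headers, 3) returns index 2 and match_column_identifier(headers, 3, zero_based=True) returns 3, matching Examples #5 and #9; a digit string such as '1' is treated as a position even when a column is literally named '1', matching Example #21.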
Example #6
    def main(self):
        if six.PY2:
            stream = codecs.getwriter('utf-8')(self.output_file)
        else:
            stream = self.output_file

        json_kwargs = {
            'ensure_ascii': False,
            'indent': self.args.indent,
        }

        if six.PY2:
            json_kwargs['encoding'] = 'utf-8'

        def dump_json(data, newline=False):
            json.dump(data, stream, **json_kwargs)
            if newline:
                stream.write("\n")

        # Convert CSV to JSON.
        if self.args.lat and not self.args.lon:
            self.argparser.error('--lon is required whenever --lat is specified.')

        if self.args.lon and not self.args.lat:
            self.argparser.error('--lat is required whenever --lon is specified.')

        if self.args.crs and not self.args.lat:
            self.argparser.error('--crs is only allowed when --lat and --lon are also specified.')

        if self.args.streamOutput and (self.args.lat or self.args.lon or self.args.key):
            self.argparser.error('--stream is only allowed if --lat, --lon and --key are not specified.')

        # GeoJSON
        if self.args.lat and self.args.lon:
            rows = agate.reader(self.input_file, **self.reader_kwargs)
            column_names = next(rows)

            features = []
            min_lon = None
            min_lat = None
            max_lon = None
            max_lat = None

            lat_column = match_column_identifier(column_names, self.args.lat, self.args.zero_based)
            lon_column = match_column_identifier(column_names, self.args.lon, self.args.zero_based)

            if self.args.key:
                id_column = match_column_identifier(column_names, self.args.key, self.args.zero_based)
            else:
                id_column = None

            for row in rows:
                feature = OrderedDict()
                feature['type'] = 'Feature'
                properties = OrderedDict()
                geoid = None
                lat = None
                lon = None

                for i, c in enumerate(row):
                    if i == lat_column:
                        try:
                            lat = float(c)
                        except ValueError:
                            lat = None

                        # Only update the bbox with parseable values; comparing
                        # None would corrupt the bounds (and raise on Python 3).
                        if lat is not None:
                            if min_lat is None or lat < min_lat:
                                min_lat = lat

                            if max_lat is None or lat > max_lat:
                                max_lat = lat
                    elif i == lon_column:
                        try:
                            lon = float(c)
                        except ValueError:
                            lon = None

                        if lon is not None:
                            if min_lon is None or lon < min_lon:
                                min_lon = lon

                            if max_lon is None or lon > max_lon:
                                max_lon = lon
                    elif id_column is not None and i == id_column:
                        geoid = c
                    else:
                        properties[column_names[i]] = c

                if id_column is not None:
                    feature['id'] = geoid

                feature['geometry'] = OrderedDict([
                    ('type', 'Point'),
                    ('coordinates', [lon, lat])
                ])

                feature['properties'] = properties

                features.append(feature)

            output = OrderedDict([
                ('type', 'FeatureCollection'),
                ('bbox', [min_lon, min_lat, max_lon, max_lat]),
                ('features', features)
            ])

            if self.args.crs:
                output['crs'] = OrderedDict([
                    ('type', 'name'),
                    ('properties', {
                        'name': self.args.crs
                    })
                ])
            dump_json(output)
        elif self.args.streamOutput and self.args.no_inference:
            rows = agate.reader(self.input_file, **self.reader_kwargs)
            column_names = next(rows)

            for row in rows:
                data = OrderedDict()
                for i, column in enumerate(column_names):
                    try:
                        data[column] = row[i]
                    except IndexError:
                        data[column] = None
                dump_json(data, newline=True)
        else:
            table = agate.Table.from_csv(self.input_file, sniff_limit=self.args.sniff_limit, column_types=self.get_column_types())
            table.to_json(stream, key=self.args.key, newline=self.args.streamOutput, indent=self.args.indent)
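
For reference, the GeoJSON branch above assembles output of the following shape; the key order mirrors the OrderedDict construction in the code, and the concrete values here are hypothetical:

    import json
    from collections import OrderedDict

    feature = OrderedDict([
        ('type', 'Feature'),
        ('id', 'NYC'),  # present only when --key is given
        ('geometry', OrderedDict([
            ('type', 'Point'),
            ('coordinates', [-73.99, 40.73]),  # [lon, lat]
        ])),
        ('properties', OrderedDict([('name', 'New York')])),
    ])
    output = OrderedDict([
        ('type', 'FeatureCollection'),
        ('bbox', [-73.99, 40.73, -73.99, 40.73]),  # [min_lon, min_lat, max_lon, max_lat]
        ('features', [feature]),
    ])

    with open('points.geojson', 'w') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)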
Example #7
    def main(self):
        if six.PY2:
            stream = codecs.getwriter('utf-8')(self.output_file)
        else:
            stream = self.output_file

        json_kwargs = {
            'ensure_ascii': False,
            'indent': self.args.indent,
        }

        if six.PY2:
            json_kwargs['encoding'] = 'utf-8'

        def dump_json(data, newline=False):
            json.dump(data, stream, **json_kwargs)
            if newline:
                stream.write("\n")

        # Convert CSV to JSON.
        if self.args.lat and not self.args.lon:
            self.argparser.error(
                '--lon is required whenever --lat is specified.')

        if self.args.lon and not self.args.lat:
            self.argparser.error(
                '--lat is required whenever --lon is specified.')

        if self.args.crs and not self.args.lat:
            self.argparser.error(
                '--crs is only allowed when --lat and --lon are also specified.'
            )

        if self.args.streamOutput and (self.args.lat or self.args.lon
                                       or self.args.key):
            self.argparser.error(
                '--stream is only allowed if --lat, --lon and --key are not specified.'
            )

        rows = agate.reader(self.input_file, **self.reader_kwargs)
        column_names = next(rows)

        # GeoJSON
        if self.args.lat and self.args.lon:
            features = []
            min_lon = None
            min_lat = None
            max_lon = None
            max_lat = None

            lat_column = match_column_identifier(column_names, self.args.lat,
                                                 self.args.zero_based)
            lon_column = match_column_identifier(column_names, self.args.lon,
                                                 self.args.zero_based)

            if self.args.key:
                id_column = match_column_identifier(column_names,
                                                    self.args.key,
                                                    self.args.zero_based)
            else:
                id_column = None

            for row in rows:
                feature = OrderedDict()
                feature['type'] = 'Feature'
                properties = OrderedDict()
                geoid = None
                lat = None
                lon = None

                for i, c in enumerate(row):
                    if i == lat_column:
                        try:
                            lat = float(c)
                        except ValueError:
                            lat = None

                        # Only update the bbox with parseable values; comparing
                        # None would corrupt the bounds (and raise on Python 3).
                        if lat is not None:
                            if min_lat is None or lat < min_lat:
                                min_lat = lat

                            if max_lat is None or lat > max_lat:
                                max_lat = lat
                    elif i == lon_column:
                        try:
                            lon = float(c)
                        except ValueError:
                            lon = None

                        if lon is not None:
                            if min_lon is None or lon < min_lon:
                                min_lon = lon

                            if max_lon is None or lon > max_lon:
                                max_lon = lon
                    elif id_column is not None and i == id_column:
                        geoid = c
                    else:
                        properties[column_names[i]] = c

                if id_column is not None:
                    feature['id'] = geoid

                feature['geometry'] = OrderedDict([('type', 'Point'),
                                                   ('coordinates', [lon,
                                                                    lat])])

                feature['properties'] = properties

                features.append(feature)

            output = OrderedDict([('type', 'FeatureCollection'),
                                  ('bbox',
                                   [min_lon, min_lat, max_lon, max_lat]),
                                  ('features', features)])

            if self.args.crs:
                output['crs'] = OrderedDict([('type', 'name'),
                                             ('properties', {
                                                 'name': self.args.crs
                                             })])
            dump_json(output)
        # Keyed JSON
        elif self.args.key:
            output = OrderedDict()

            for row in rows:
                data = OrderedDict()

                for i, column in enumerate(column_names):
                    data[column] = row[i]

                k = data[self.args.key]

                if k in output:
                    raise NonUniqueKeyColumnException(
                        'Value %s is not unique in the key column.' %
                        six.text_type(k))

                output[k] = data
            dump_json(output)
        # Boring JSON
        else:
            output = []
            for row in rows:
                data = OrderedDict()

                for i, column in enumerate(column_names):
                    try:
                        data[column] = row[i]
                    except IndexError:
                        data[column] = None
                if self.args.streamOutput:
                    dump_json(data, newline=True)
                else:
                    output.append(data)
            if not self.args.streamOutput:
                dump_json(output)
Example #8
    def main(self):
        """
        Convert CSV to JSON. 
        """
        if self.args.lat and not self.args.lon:
            self.argparser.error('--lon is required whenever --lat is specified.')

        if self.args.lon and not self.args.lat:
            self.argparser.error('--lat is required whenever --lon is specified.')

        if self.args.crs and not self.args.lat:
            self.argparser.error('--crs is only allowed when --lat and --lon are also specified.')

        rows = CSVKitReader(self.input_file, **self.reader_kwargs)
        column_names = next(rows)

        if six.PY2:
            stream = codecs.getwriter('utf-8')(self.output_file)
        else:
            stream = self.output_file 

        # GeoJSON
        if self.args.lat and self.args.lon:
            features = []
            min_lon = None
            min_lat = None
            max_lon = None
            max_lat = None

            lat_column = match_column_identifier(column_names, self.args.lat, self.args.zero_based)
            lon_column = match_column_identifier(column_names, self.args.lon, self.args.zero_based)

            if self.args.key:
                id_column = match_column_identifier(column_names, self.args.key, self.args.zero_based)
            else:
                id_column = None

            for row in rows:
                feature = OrderedDict()
                feature['type'] = 'Feature'
                properties = OrderedDict()
                geoid = None
                lat = None
                lon = None

                for i, c in enumerate(row):
                    if i == lat_column:
                        try:
                            lat = float(c)
                        except ValueError:
                            lat = None

                        # Only update the bbox with parseable values; comparing
                        # None would corrupt the bounds (and raise on Python 3).
                        if lat is not None:
                            if min_lat is None or lat < min_lat:
                                min_lat = lat

                            if max_lat is None or lat > max_lat:
                                max_lat = lat
                    elif i == lon_column:
                        try:
                            lon = float(c)
                        except ValueError:
                            lon = None

                        if lon is not None:
                            if min_lon is None or lon < min_lon:
                                min_lon = lon

                            if max_lon is None or lon > max_lon:
                                max_lon = lon
                    elif id_column is not None and i == id_column:
                        geoid = c
                    else:
                        properties[column_names[i]] = c

                if id_column is not None:
                    feature['id'] = geoid

                feature['geometry'] = OrderedDict([ 
                    ('type', 'Point'),
                    ('coordinates', [lon, lat])
                ])

                feature['properties'] = properties

                features.append(feature)

            output = OrderedDict([
                ('type', 'FeatureCollection'),
                ('bbox', [min_lon, min_lat, max_lon, max_lat]),
                ('features', features)
            ])

            if self.args.crs:
                output['crs'] = OrderedDict([ 
                    ('type', 'name'),
                    ('properties', {
                        'name': self.args.crs
                    })
                ])
        # Keyed JSON
        elif self.args.key:
            output = OrderedDict()
            
            for row in rows:
                data = OrderedDict()

                for i, column in enumerate(column_names):
                    data[column] = row[i]

                k = data[self.args.key]

                if k in output:
                    raise NonUniqueKeyColumnException('Value %s is not unique in the key column.' % six.text_type(k))

                output[k] = data
        # Boring JSON
        else:
            output = []

            for row in rows:
                data = OrderedDict()

                for i, column in enumerate(column_names):
                    try:
                        data[column] = row[i]
                    except IndexError:
                        data[column] = None

                output.append(data)

        kwargs = {
            'ensure_ascii': False,
            'indent': self.args.indent,
        }

        if six.PY2:
            kwargs['encoding'] = 'utf-8'

        json.dump(output, stream, **kwargs)
Example #9
 def test_match_column_identifier_numeric(self):
     self.assertEqual(2, match_column_identifier(self.headers, 3))
     self.assertEqual(3, match_column_identifier(self.headers, 3, zero_based=True))
Example #10
    def main(self):
        # We need to do this dance here, because we aren't writing through agate.
        if six.PY2:
            stream = codecs.getwriter('utf-8')(self.output_file)
        else:
            stream = self.output_file

        json_kwargs = {
            'ensure_ascii': False,
            'indent': self.args.indent,
        }

        if six.PY2:
            json_kwargs['encoding'] = 'utf-8'

        def default(obj):
            if isinstance(obj, (datetime.date, datetime.datetime)):
                return obj.isoformat()
            raise TypeError('%s is not JSON serializable' % repr(obj))

        def dump_json(data, newline=False):
            json.dump(data, stream, default=default, **json_kwargs)
            if newline:
                stream.write("\n")

        # Convert CSV to JSON.
        if self.args.lat and not self.args.lon:
            self.argparser.error(
                '--lon is required whenever --lat is specified.')

        if self.args.lon and not self.args.lat:
            self.argparser.error(
                '--lat is required whenever --lon is specified.')

        if self.args.crs and not self.args.lat:
            self.argparser.error(
                '--crs is only allowed when --lat and --lon are also specified.'
            )

        if self.args.streamOutput and (self.args.lat or self.args.lon
                                       or self.args.key):
            self.argparser.error(
                '--stream is only allowed if --lat, --lon and --key are not specified.'
            )

        # GeoJSON
        if self.args.lat and self.args.lon:
            table = agate.Table.from_csv(self.input_file,
                                         skip_lines=self.args.skip_lines,
                                         sniff_limit=self.args.sniff_limit,
                                         column_types=self.get_column_types(),
                                         **self.reader_kwargs)

            features = []
            min_lon = None
            min_lat = None
            max_lon = None
            max_lat = None

            lat_column = match_column_identifier(table.column_names,
                                                 self.args.lat,
                                                 self.args.zero_based)
            lon_column = match_column_identifier(table.column_names,
                                                 self.args.lon,
                                                 self.args.zero_based)

            if self.args.key:
                id_column = match_column_identifier(table.column_names,
                                                    self.args.key,
                                                    self.args.zero_based)
            else:
                id_column = None

            for row in table.rows:
                feature = OrderedDict()
                feature['type'] = 'Feature'
                properties = OrderedDict()
                geoid = None
                lat = None
                lon = None

                for i, c in enumerate(row):
                    if i == lat_column:
                        try:
                            lat = float(c)
                        except ValueError:
                            lat = None
                        # Skip None so a bad value can't corrupt the bbox.
                        if lat is not None:
                            if min_lat is None or lat < min_lat:
                                min_lat = lat
                            if max_lat is None or lat > max_lat:
                                max_lat = lat
                    elif i == lon_column:
                        try:
                            lon = float(c)
                        except ValueError:
                            lon = None
                        if lon is not None:
                            if min_lon is None or lon < min_lon:
                                min_lon = lon
                            if max_lon is None or lon > max_lon:
                                max_lon = lon
                    elif i == id_column:
                        geoid = c
                    else:
                        properties[table.column_names[i]] = c

                if id_column is not None:
                    feature['id'] = geoid

                feature['geometry'] = OrderedDict([('type', 'Point'),
                                                   ('coordinates', [lon,
                                                                    lat])])
                feature['properties'] = properties
                features.append(feature)

            output = OrderedDict([('type', 'FeatureCollection'),
                                  ('bbox',
                                   [min_lon, min_lat, max_lon, max_lat]),
                                  ('features', features)])

            if self.args.crs:
                output['crs'] = OrderedDict([('type', 'name'),
                                             ('properties', {
                                                 'name': self.args.crs
                                             })])

            dump_json(output)
        elif self.args.streamOutput and self.args.no_inference and not self.args.skip_lines:
            rows = agate.csv.reader(self.input_file, **self.reader_kwargs)
            column_names = next(rows)

            for row in rows:
                data = OrderedDict()
                for i, column in enumerate(column_names):
                    try:
                        data[column] = row[i]
                    except IndexError:
                        data[column] = None
                dump_json(data, newline=True)
        else:
            table = agate.Table.from_csv(self.input_file,
                                         skip_lines=self.args.skip_lines,
                                         sniff_limit=self.args.sniff_limit,
                                         column_types=self.get_column_types(),
                                         **self.reader_kwargs)

            table.to_json(self.output_file,
                          key=self.args.key,
                          newline=self.args.streamOutput,
                          indent=self.args.indent)
Example #11
    def main(self):
        self.input_files = []

        for path in self.args.input_paths:
            self.input_files.append(self._open_input_file(path))

        if len(self.input_files) < 2:
            self.argparser.error(
                'You must specify at least two files to join.')

        if self.args.columns:
            join_column_names = self._parse_join_column_names(
                self.args.columns)

            if len(join_column_names) == 1:
                join_column_names = join_column_names * len(self.input_files)

            if len(join_column_names) != len(self.input_files):
                self.argparser.error(
                    'The number of join column names must match the number of files, or be a single column name that exists in all files.'
                )

        if (self.args.left_join or self.args.right_join
                or self.args.outer_join) and not self.args.columns:
            self.argparser.error(
                'You must provide join column names when performing an outer join.'
            )

        if self.args.left_join and self.args.right_join:
            self.argparser.error(
                'It is not valid to specify both a left and a right join.')

        tables = []

        for f in self.input_files:
            tables.append(list(CSVKitReader(f, **self.reader_kwargs)))
            f.close()

        join_column_ids = []

        if self.args.columns:
            for i, t in enumerate(tables):
                join_column_ids.append(
                    match_column_identifier(t[0], join_column_names[i]))

        jointab = []

        if self.args.left_join:
            # Left outer join
            jointab = tables[0]

            for i, t in enumerate(tables[1:]):
                jointab = join.left_outer_join(jointab, join_column_ids[0], t,
                                               join_column_ids[i + 1])
        elif self.args.right_join:
            # Right outer join
            jointab = tables[-1]

            remaining_tables = tables[:-1]
            remaining_tables.reverse()

            for i, t in enumerate(remaining_tables):
                jointab = join.right_outer_join(t, join_column_ids[-(i + 2)],
                                                jointab, join_column_ids[-1])
        elif self.args.outer_join:
            # Full outer join
            jointab = tables[0]

            for i, t in enumerate(tables[1:]):
                jointab = join.full_outer_join(jointab, join_column_ids[0], t,
                                               join_column_ids[i + 1])
        else:
            if self.args.columns:
                # Inner join
                jointab = tables[0]

                for i, t in enumerate(tables[1:]):
                    jointab = join.inner_join(jointab, join_column_ids[0], t,
                                              join_column_ids[i + 1])
            else:
                jointab = tables[0]

                # Sequential join
                for t in tables[1:]:
                    jointab = join.sequential_join(jointab, t)

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        for row in jointab:
            output.writerow(row)
Example #12
    def main(self):
        if six.PY2:
            stream = codecs.getwriter("utf-8")(self.output_file)
        else:
            stream = self.output_file

        json_kwargs = {"ensure_ascii": False, "indent": self.args.indent}

        if six.PY2:
            json_kwargs["encoding"] = "utf-8"

        def default(obj):
            if isinstance(obj, (datetime.date, datetime.datetime)):
                return obj.isoformat()
            raise TypeError("%s is not JSON serializable" % repr(obj))

        def dump_json(data, newline=False):
            json.dump(data, stream, default=default, **json_kwargs)
            if newline:
                stream.write("\n")

        # Convert CSV to JSON.
        if self.args.lat and not self.args.lon:
            self.argparser.error("--lon is required whenever --lat is specified.")

        if self.args.lon and not self.args.lat:
            self.argparser.error("--lat is required whenever --lon is specified.")

        if self.args.crs and not self.args.lat:
            self.argparser.error("--crs is only allowed when --lat and --lon are also specified.")

        if self.args.streamOutput and (self.args.lat or self.args.lon or self.args.key):
            self.argparser.error("--stream is only allowed if --lat, --lon and --key are not specified.")

        # GeoJSON
        if self.args.lat and self.args.lon:
            table = agate.Table.from_csv(
                self.input_file,
                sniff_limit=self.args.sniff_limit,
                column_types=self.get_column_types(),
                **self.reader_kwargs
            )

            features = []
            min_lon = None
            min_lat = None
            max_lon = None
            max_lat = None

            lat_column = match_column_identifier(table.column_names, self.args.lat, self.args.zero_based)
            lon_column = match_column_identifier(table.column_names, self.args.lon, self.args.zero_based)

            if self.args.key:
                id_column = match_column_identifier(table.column_names, self.args.key, self.args.zero_based)
            else:
                id_column = None

            for row in table.rows:
                feature = OrderedDict()
                feature["type"] = "Feature"
                properties = OrderedDict()
                geoid = None
                lat = None
                lon = None

                for i, c in enumerate(row):
                    if i == lat_column:
                        try:
                            lat = float(c)
                        except ValueError:
                            lat = None
                        # Skip None so a bad value can't corrupt the bbox.
                        if lat is not None:
                            if min_lat is None or lat < min_lat:
                                min_lat = lat
                            if max_lat is None or lat > max_lat:
                                max_lat = lat
                    elif i == lon_column:
                        try:
                            lon = float(c)
                        except ValueError:
                            lon = None
                        if lon is not None:
                            if min_lon is None or lon < min_lon:
                                min_lon = lon
                            if max_lon is None or lon > max_lon:
                                max_lon = lon
                    elif i == id_column:
                        geoid = c
                    else:
                        properties[table.column_names[i]] = c

                if id_column is not None:
                    feature["id"] = geoid

                feature["geometry"] = OrderedDict([("type", "Point"), ("coordinates", [lon, lat])])
                feature["properties"] = properties
                features.append(feature)

            output = OrderedDict(
                [("type", "FeatureCollection"), ("bbox", [min_lon, min_lat, max_lon, max_lat]), ("features", features)]
            )

            if self.args.crs:
                output["crs"] = OrderedDict([("type", "name"), ("properties", {"name": self.args.crs})])

            dump_json(output)
        elif self.args.streamOutput and self.args.no_inference:
            rows = agate.csv.reader(self.input_file, **self.reader_kwargs)
            column_names = next(rows)

            for row in rows:
                data = OrderedDict()
                for i, column in enumerate(column_names):
                    try:
                        data[column] = row[i]
                    except IndexError:
                        data[column] = None
                dump_json(data, newline=True)
        else:
            table = agate.Table.from_csv(
                self.input_file,
                sniff_limit=self.args.sniff_limit,
                column_types=self.get_column_types(),
                **self.reader_kwargs
            )
            table.to_json(self.output_file, key=self.args.key, newline=self.args.streamOutput, **json_kwargs)
Example #13
    def main(self):
        """
        Convert CSV to JSON. 
        """
        if self.args.lat and not self.args.lon:
            self.argparser.error(
                '--lon is required whenever --lat is specified.')

        if self.args.lon and not self.args.lat:
            self.argparser.error(
                '--lat is required whenever --lon is specified.')

        if self.args.crs and not self.args.lat:
            self.argparser.error(
                '--crs is only allowed when --lat and --lon are also specified.'
            )

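        # Python 2-era code: rows.next(), unicode() and json.dump's encoding
        # keyword below predate the six-based 2/3 handling in the other examples.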
        rows = CSVKitReader(self.args.file, **self.reader_kwargs)
        column_names = rows.next()

        stream = codecs.getwriter('utf-8')(self.output_file)

        # GeoJSON
        if self.args.lat and self.args.lon:
            features = []
            min_lon = None
            min_lat = None
            max_lon = None
            max_lat = None

            lat_column = match_column_identifier(column_names, self.args.lat,
                                                 self.args.zero_based)
            lon_column = match_column_identifier(column_names, self.args.lon,
                                                 self.args.zero_based)

            if self.args.key:
                id_column = match_column_identifier(column_names,
                                                    self.args.key,
                                                    self.args.zero_based)
            else:
                id_column = None

            for row in rows:
                feature = {'type': 'Feature'}
                properties = {}
                geoid = None
                lat = None
                lon = None

                for i, c in enumerate(row):
                    if i == lat_column:
                        lat = float(c)

                        if min_lat is None or lat < min_lat:
                            min_lat = lat

                        if max_lat is None or lat > max_lat:
                            max_lat = lat
                    elif i == lon_column:
                        lon = float(c)

                        if min_lon is None or lon < min_lon:
                            min_lon = lon

                        if max_lon is None or lon > max_lon:
                            max_lon = lon
                    elif id_column is not None and i == id_column:
                        geoid = c
                    else:
                        properties[column_names[i]] = c

                if id_column is not None:
                    feature['id'] = geoid

                feature['geometry'] = {
                    'type': 'Point',
                    'coordinates': [lon, lat]
                }

                feature['properties'] = properties

                features.append(feature)

            output = {
                'type': 'FeatureCollection',
                'bbox': [min_lon, min_lat, max_lon, max_lat],
                'features': features
            }

            if self.args.crs:
                output['crs'] = {
                    'type': 'name',
                    'properties': {
                        'name': self.args.crs
                    }
                }
        # Keyed JSON
        elif self.args.key:
            output = {}

            for row in rows:
                row_dict = dict(zip(column_names, row))
                k = row_dict[self.args.key]

                if k in output:
                    raise NonUniqueKeyColumnException(
                        'Value %s is not unique in the key column.' %
                        unicode(k))

                output[k] = row_dict
        # Boring JSON
        else:
            output = [dict(zip(column_names, row)) for row in rows]

        json.dump(output,
                  stream,
                  ensure_ascii=False,
                  indent=self.args.indent,
                  encoding='utf-8')
Example #14
 def test_match_column_identifier_numeric(self):
     self.assertEqual(2, match_column_identifier(self.headers, 3))
Example #15
 def test_match_column_identifier_string(self):
     self.assertEqual(2, match_column_identifier(self.headers, 'i_work_here'))
     self.assertEqual(2, match_column_identifier(self.headers, 'i_work_here', column_offset=0))
Example #16
    def main(self):
        self.input_files = []

        for path in self.args.input_paths:
            self.input_files.append(self._open_input_file(path))

        if len(self.input_files) < 2:
            self.argparser.error('You must specify at least two files to join.')

        if self.args.columns:
            join_column_names = self._parse_join_column_names(self.args.columns)

            if len(join_column_names) == 1:
                join_column_names = join_column_names * len(self.input_files)

            if len(join_column_names) != len(self.input_files):
                self.argparser.error('The number of join column names must match the number of files, or be a single column name that exists in all files.')

        if (self.args.left_join or self.args.right_join or self.args.outer_join) and not self.args.columns:
            self.argparser.error('You must provide join column names when performing an outer join.')

        if self.args.left_join and self.args.right_join:
            self.argparser.error('It is not valid to specify both a left and a right join.')

        tables = []
        sniff_limit = self.args.sniff_limit
        column_types = self.get_column_types()

        for f in self.input_files:
            tables.append(agate.Table.from_csv(
                f,
                skip_lines=self.args.skip_lines,
                sniff_limit=sniff_limit,
                column_types=column_types,
                **self.reader_kwargs
            ))
            f.close()

        join_column_ids = []

        if self.args.columns:
            for i, table in enumerate(tables):
                join_column_ids.append(match_column_identifier(table.column_names, join_column_names[i]))

        jointab = tables[0]

        if self.args.left_join:
            # Left outer join
            for i, table in enumerate(tables[1:]):
                jointab = agate.Table.join(jointab, table, join_column_ids[0], join_column_ids[i + 1])
        elif self.args.right_join:
            # Right outer join
            jointab = tables[-1]

            remaining_tables = tables[:-1]
            remaining_tables.reverse()

            for i, table in enumerate(remaining_tables):
                jointab = agate.Table.join(jointab, table, join_column_ids[-(i + 2)], join_column_ids[-1])
        elif self.args.outer_join:
            # Full outer join
            for i, table in enumerate(tables[1:]):
                jointab = agate.Table.join(jointab, table, join_column_ids[0], join_column_ids[i + 1], full_outer=True)
        elif self.args.columns:
            # Inner join
            for i, table in enumerate(tables[1:]):
                jointab = agate.Table.join(jointab, table, join_column_ids[0], join_column_ids[i + 1], inner=True)
        else:
            # Sequential join
            for table in tables[1:]:
                jointab = agate.Table.join(jointab, table, full_outer=True)

        jointab.to_csv(self.output_file, **self.writer_kwargs)
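
The agate-based rewrite above delegates the join logic to agate.Table.join, whose default is a left outer join, with inner=True and full_outer=True selecting the other variants used in the branches. A minimal standalone sketch of the basic call (file names hypothetical):

    import agate

    left = agate.Table.from_csv('left.csv')
    right = agate.Table.from_csv('right.csv')

    joined = left.join(right, 'id', 'id')  # left outer join on the 'id' column
    joined.to_csv('joined.csv')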
Example #17
    def main(self):
        self.input_files = []

        for path in self.args.input_paths:
            self.input_files.append(self._open_input_file(path))

        if len(self.input_files) < 2:
            self.argparser.error('You must specify at least two files to join.')

        if self.args.columns:
            join_column_names = self._parse_join_column_names(self.args.columns)

            if len(join_column_names) == 1:
                join_column_names = join_column_names * len(self.input_files)

            if len(join_column_names) != len(self.input_files):
                self.argparser.error('The number of join column names must match the number of files, or be a single column name that exists in all files.')

        if (self.args.left_join or self.args.right_join or self.args.outer_join) and not self.args.columns:
            self.argparser.error('You must provide join column names when performing an outer join.')

        if self.args.left_join and self.args.right_join:
            self.argparser.error('It is not valid to specify both a left and a right join.')

        tables = []

        for f in self.input_files:
            tables.append(list(CSVKitReader(f, **self.reader_kwargs)))
            f.close()

        join_column_ids = []
        
        if self.args.columns:
            for i, t in enumerate(tables):
                join_column_ids.append(match_column_identifier(t[0], join_column_names[i]))

        jointab = []
        
        if self.args.left_join:
            # Left outer join
            jointab = tables[0]

            for i, t in enumerate(tables[1:]):
                jointab = join.left_outer_join(jointab, join_column_ids[0], t, join_column_ids[i + 1])
        elif self.args.right_join:
            # Right outer join
            jointab = tables[-1]

            remaining_tables = tables[:-1]
            remaining_tables.reverse()

            for i, t in enumerate(remaining_tables):
                jointab = join.right_outer_join(t, join_column_ids[-(i + 2)], jointab, join_column_ids[-1])
        elif self.args.outer_join:
            # Full outer join
            jointab = tables[0]

            for i, t in enumerate(tables[1:]):
                jointab = join.full_outer_join(jointab, join_column_ids[0], t, join_column_ids[i + 1])
        else:
            if self.args.columns:
                # Inner join
                jointab = tables[0]

                for i, t in enumerate(tables[1:]):
                    jointab = join.inner_join(jointab, join_column_ids[0], t, join_column_ids[i + 1])
            else:
                jointab = tables[0]

                # Sequential join
                for t in tables[1:]:
                    jointab = join.sequential_join(jointab, t)

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        for row in jointab:
            output.writerow(row)
Example #18
    def main(self):

        tab = table.Table.from_csv(self.args.files[0], **self.reader_kwargs)
        lookup = table.Table.from_csv(self.args.files[1], **self.reader_kwargs)

        # store indices of columns to do the fuzzy match on
        fuzzy_columns = self.args.columns
        if not fuzzy_columns:
            self.argparser.error("Fuzzy match column -f must be specified.")
        if "=" in fuzzy_columns:
            (tab_fuzzy_field, lookup_fuzzy_field) = fuzzy_columns.split("=")
        else:
            (tab_fuzzy_field, lookup_fuzzy_field) = (fuzzy_columns, fuzzy_columns)
        tab_fuzzy_index = match_column_identifier(tab.headers(), tab_fuzzy_field)
        lookup_fuzzy_index = match_column_identifier(lookup.headers(), lookup_fuzzy_field)

        # store indices of columns to do a normal left join on.
        # limiting fuzzy matches to a set of hard matches on other columns is optional.
        hard_joins = []
        if self.args.hard:
            for pair in self.args.hard.split(","):
                if "=" in pair:
                    (tabfield, lookupfield) = pair.split("=")
                else:
                    (tabfield, lookupfield) = (pair, pair)
                hard_joins.append(
                    (
                        match_column_identifier(tab.headers(), tabfield),
                        match_column_identifier(lookup.headers(), lookupfield),
                    )
                )

        if len(hard_joins) == 0:
            # we're not doing any hard joins, so our pool of possible matches is everything
            join_dict = lookup.to_rows()
        else:
            # create a hash using the hard-join keys so we don't recreate a list of possible matches for the fuzzy join each time.
            # in other words, if we're doing a fuzzy join on a lawmaker's name, but limiting via hard join to records that match
            # on state and party, we just have to do the fuzzy join on join_dict[('democrat','ny')]
            join_dict = {}

            for row in lookup.to_rows():
                hashkey = tuple([row[x[1]] for x in hard_joins])
                if hashkey not in join_dict:
                    join_dict[hashkey] = []
                join_dict[hashkey].append(row)

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        newheaders = tab.headers() + ["match_score"] + lookup.headers()
        output.writerow(newheaders)

        def getfirstmatch(lookup_rows, lookup_fuzzy_index, fuzzy_match):
            # it's assumed that most lookup tables will have the intended match appear only once, but if there are
            # multiple rows containing the same value of the fuzzy match column, where that value was selected as
            # the best match for the main table, we just take the first time it appears in the lookup table
            for row in lookup_rows:
                if row[lookup_fuzzy_index] == fuzzy_match:
                    return row

        for row in tab.to_rows():

            possible_matches = []
            if isinstance(join_dict, list):  # we're not using any hard-join columns, so search the whole table
                possible_matches = join_dict
            else:
                # use the previously assembled hash to rapidly look for sets of rows that match on all hard-join fields
                hashkey = tuple(
                    [row[x[0]] for x in hard_joins]
                )  # find values in the columns specified in the table_col side of hard_joins
                if hashkey in join_dict:
                    possible_matches = join_dict[hashkey]

            if not len(possible_matches):
                output.writerow(list(row) + [""] + ["" for x in lookup.headers()])
            else:
                # use fuzzywuzzy's levenshtein implementation to select the best match in our universe of options.
                lookup_vals = [x[lookup_fuzzy_index] for x in possible_matches]
                # the meat is all in this one line.
                # score is a number from 0 to 100 representing how good the fuzzy match is, and it becomes
                # a field in our output table so users can react accordingly.
                (fuzzy_match, fuzzy_score) = process.extractOne(row[tab_fuzzy_index], lookup_vals)
                # we could say a really bad score counts as no match, and set nomatch=True, but we won't
                lookup_row = getfirstmatch(possible_matches, lookup_fuzzy_index, fuzzy_match)
                output.writerow(list(row) + [fuzzy_score] + list(lookup_row))
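
The fuzzy selection above hinges on fuzzywuzzy's process.extractOne, which returns the best (choice, score) pair for a query, with the score on a 0-100 scale. A minimal illustration with hypothetical data:

    from fuzzywuzzy import process

    best, score = process.extractOne('Jon Smith', ['John Smith', 'Jane Doe'])
    # best == 'John Smith'; score is an integer between 0 and 100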
Example #19
 def test_match_column_identifier_string(self):
     self.assertEqual(2, match_column_identifier(self.headers, 'i_work_here'))
     self.assertEqual(2, match_column_identifier(self.headers, 'i_work_here', zero_based=True))
Example #20
    def main(self):
        """
        Convert CSV to JSON. 
        """
        if self.args.lat and not self.args.lon:
            self.argparser.error('--lon is required whenever --lat is specified.')

        if self.args.lon and not self.args.lat:
            self.argparser.error('--lat is required whenever --lon is specified.')

        if self.args.crs and not self.args.lat:
            self.argparser.error('--crs is only allowed when --lat and --lon are also specified.')

        rows = CSVKitReader(self.input_file, **self.reader_kwargs)
        column_names = next(rows)

        if six.PY2:
            stream = codecs.getwriter('utf-8')(self.output_file)
        else:
            stream = self.output_file 

        # GeoJSON
        if self.args.lat and self.args.lon:
            features = []
            min_lon = None
            min_lat = None
            max_lon = None
            max_lat = None

            lat_column = match_column_identifier(column_names, self.args.lat, self.args.zero_based)
            lon_column = match_column_identifier(column_names, self.args.lon, self.args.zero_based)

            if self.args.key:
                id_column = match_column_identifier(column_names, self.args.key, self.args.zero_based)
            else:
                id_column = None

            for row in rows:
                feature = {'type': 'Feature'}
                properties = {}
                geoid = None
                lat = None
                lon = None

                for i, c in enumerate(row):
                    if i == lat_column:
                        lat = float(c)

                        if min_lat is None or lat < min_lat:
                            min_lat = lat

                        if max_lat is None or lat > max_lat:
                            max_lat = lat
                    elif i == lon_column:
                        lon = float(c)

                        if min_lon is None or lon < min_lon:
                            min_lon = lon

                        if max_lon is None or lon > max_lon:
                            max_lon = lon
                    elif id_column is not None and i == id_column:
                        geoid = c
                    else:
                        properties[column_names[i]] = c

                if id_column is not None:
                    feature['id'] = geoid

                feature['geometry'] = {
                    'type': 'Point',
                    'coordinates': [lon, lat]
                }

                feature['properties'] = properties

                features.append(feature)

            output = {
                'type': 'FeatureCollection',
                'bbox': [min_lon, min_lat, max_lon, max_lat],
                'features': features 
            }

            if self.args.crs:
                output['crs'] = {
                    'type': 'name',
                    'properties': {
                        'name': self.args.crs
                    }
                }
        # Keyed JSON
        elif self.args.key:
            output = {}
            
            for row in rows:
                row_dict = dict(zip(column_names, row))
                k = row_dict[self.args.key]

                if k in output:
                    raise NonUniqueKeyColumnException('Value %s is not unique in the key column.' % six.text_type(k))

                output[k] = row_dict
        # Boring JSON
        else:
            output = [dict(zip(column_names, row)) for row in rows]

        kwargs = {
            'ensure_ascii': False,
            'indent': self.args.indent,
        }

        if six.PY2:
            kwargs['encoding'] = 'utf-8'

        json.dump(output, stream, **kwargs)
Example #21
 def test_match_column_which_could_be_integer_name_is_treated_as_positional_id(self):
     self.assertEqual(0, match_column_identifier(self.headers, '1'))
     self.assertEqual(1, match_column_identifier(self.headers, '1', zero_based=True))
Example #22
    def main(self):
        if self.additional_input_expected():
            if self.args.streamOutput and self.args.no_inference and not self.args.skip_lines and self.args.sniff_limit == 0:
                sys.stderr.write(
                    'No input file or piped data provided. Waiting for standard input:\n'
                )
            else:
                self.argparser.error(
                    'You must provide an input file or piped data.')

        # We need to do this dance here, because we aren't writing through agate.
        if six.PY2:
            stream = codecs.getwriter('utf-8')(self.output_file)
        else:
            stream = self.output_file

        json_kwargs = {
            'ensure_ascii': False,
            'indent': self.args.indent,
        }

        if six.PY2:
            json_kwargs['encoding'] = 'utf-8'

        def default(obj):
            if isinstance(obj, (datetime.date, datetime.datetime)):
                return obj.isoformat()
            elif isinstance(obj, decimal.Decimal):
                return str(obj)
            raise TypeError('%s is not JSON serializable' % repr(obj))

        def dump_json(data, newline=False):
            json.dump(data, stream, default=default, **json_kwargs)
            if newline:
                stream.write("\n")

        # Convert CSV to JSON.
        if self.args.lat and not self.args.lon:
            self.argparser.error(
                '--lon is required whenever --lat is specified.')
        if self.args.lon and not self.args.lat:
            self.argparser.error(
                '--lat is required whenever --lon is specified.')

        if self.args.crs and not self.args.lat:
            self.argparser.error(
                '--crs is only allowed when --lat and --lon are also specified.'
            )
        if self.args.type and not self.args.lat:
            self.argparser.error(
                '--type is only allowed when --lat and --lon are also specified.'
            )
        if self.args.geometry and not self.args.lat:
            self.argparser.error(
                '--geometry is only allowed when --lat and --lon are also specified.'
            )

        if self.args.streamOutput and (self.args.lat or self.args.lon
                                       or self.args.key):
            self.argparser.error(
                '--stream is only allowed if --lat, --lon and --key are not specified.'
            )

        # GeoJSON
        if self.args.lat and self.args.lon:
            table = agate.Table.from_csv(self.input_file,
                                         skip_lines=self.args.skip_lines,
                                         sniff_limit=self.args.sniff_limit,
                                         column_types=self.get_column_types(),
                                         **self.reader_kwargs)

            features = []

            self.min_lon = None
            self.min_lat = None
            self.max_lon = None
            self.max_lat = None

            def update_boundary_lat(lat):
                if self.min_lat is None or lat < self.min_lat:
                    self.min_lat = lat
                if self.max_lat is None or lat > self.max_lat:
                    self.max_lat = lat

            def update_boundary_lon(lon):
                if self.min_lon is None or lon < self.min_lon:
                    self.min_lon = lon
                if self.max_lon is None or lon > self.max_lon:
                    self.max_lon = lon

            def update_boundary_coordinates(coordinates):
                if len(coordinates) <= 3 and isinstance(
                        coordinates[0], (float, int)):
                    update_boundary_lon(coordinates[0])
                    update_boundary_lat(coordinates[1])
                else:
                    for coordinate in coordinates:
                        update_boundary_coordinates(coordinate)

            lat_column = match_column_identifier(table.column_names,
                                                 self.args.lat,
                                                 self.args.zero_based)
            lon_column = match_column_identifier(table.column_names,
                                                 self.args.lon,
                                                 self.args.zero_based)
            if self.args.type:
                type_column = match_column_identifier(table.column_names,
                                                      self.args.type,
                                                      self.args.zero_based)
            else:
                type_column = None
            if self.args.geometry:
                geometry_column = match_column_identifier(
                    table.column_names, self.args.geometry,
                    self.args.zero_based)
            else:
                geometry_column = None

            if self.args.key:
                id_column = match_column_identifier(table.column_names,
                                                    self.args.key,
                                                    self.args.zero_based)
            else:
                id_column = None

            for row in table.rows:
                feature = OrderedDict()
                feature['type'] = 'Feature'
                properties = OrderedDict()
                feature_id = None
                lat = None
                lon = None
                geometry = None

                for i, c in enumerate(row):
                    if c is None:
                        continue
                    if i == lat_column:
                        try:
                            lat = float(c)
                        except ValueError:
                            lat = None
                        # Skip unparseable values so None never reaches the bbox helpers.
                        if lat is not None and not self.args.no_bbox:
                            update_boundary_lat(lat)
                    elif i == lon_column:
                        try:
                            lon = float(c)
                        except ValueError:
                            lon = None
                        if lon is not None and not self.args.no_bbox:
                            update_boundary_lon(lon)
                    elif i == id_column:
                        feature_id = c
                    elif i == type_column:
                        pass  # Prevent "type" from being added to "properties".
                    elif i == geometry_column:
                        geometry = json.loads(c)
                        if not self.args.no_bbox and 'coordinates' in geometry:
                            update_boundary_coordinates(
                                geometry['coordinates'])
                    elif c:
                        properties[table.column_names[i]] = c

                if id_column is not None:
                    feature['id'] = feature_id

                feature['properties'] = properties
                if geometry or (lat is None and lon is None):
                    feature['geometry'] = geometry
                elif lon is not None and lat is not None:
                    feature['geometry'] = OrderedDict([('type', 'Point'),
                                                       ('coordinates',
                                                        [lon, lat])])
                features.append(feature)

            items = [
                ('type', 'FeatureCollection'),
                ('features', features),
            ]

            if not self.args.no_bbox:
                items.insert(1, ('bbox', [
                    self.min_lon, self.min_lat, self.max_lon, self.max_lat
                ]))

            output = OrderedDict(items)

            if self.args.crs:
                output['crs'] = OrderedDict([('type', 'name'),
                                             ('properties', {
                                                 'name': self.args.crs
                                             })])

            dump_json(output)
        elif self.args.streamOutput and self.args.no_inference and not self.args.skip_lines and self.args.sniff_limit == 0:
            rows = agate.csv.reader(self.input_file, **self.reader_kwargs)
            column_names = next(rows)

            for row in rows:
                data = OrderedDict()
                for i, column in enumerate(column_names):
                    try:
                        data[column] = row[i]
                    except IndexError:
                        data[column] = None
                dump_json(data, newline=True)
        else:
            table = agate.Table.from_csv(self.input_file,
                                         skip_lines=self.args.skip_lines,
                                         sniff_limit=self.args.sniff_limit,
                                         column_types=self.get_column_types(),
                                         **self.reader_kwargs)

            table.to_json(
                self.output_file,
                key=self.args.key,
                newline=self.args.streamOutput,
                indent=self.args.indent,
            )
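
The update_boundary_coordinates helper in the example above recurses through arbitrarily nested GeoJSON coordinates, treating any list of at most three items that starts with a number as a single [lon, lat(, elevation)] position. A standalone sketch of the same walk, with hypothetical names and data:

    def walk_coordinates(coordinates, positions):
        # A flat [lon, lat] or [lon, lat, elevation] list is a position;
        # anything else is a nested ring, part, or geometry.
        if len(coordinates) <= 3 and isinstance(coordinates[0], (float, int)):
            positions.append(coordinates[:2])
        else:
            for part in coordinates:
                walk_coordinates(part, positions)

    positions = []
    walk_coordinates([[[-74.0, 40.7], [-73.9, 40.8]]], positions)
    lons = [p[0] for p in positions]
    lats = [p[1] for p in positions]
    bbox = [min(lons), min(lats), max(lons), max(lats)]  # [-74.0, 40.7, -73.9, 40.8]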