Example #1
def main(argv=None):
    args = parse_args(argv)

    if args.file is None:
        # slurp the whole input since there seems to be a bug in messytables
        # which should be able to handle streams but doesn't
        args.file = cStringIO.StringIO(sys.stdin.read())

    relation_key = args_to_relation_key(args)

    table_set = any_tableset(args.file)
    if len(table_set.tables) != 1:
        raise ValueError("Can only handle files with a single table, not %s" % len(table_set.tables))

    row_set = table_set.tables[0]

    # guess header names and the offset of the header:
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(strip_processor())
    row_set.register_processor(headers_processor(headers))
    # Temporarily, mark the offset of the header
    row_set.register_processor(offset_processor(offset + 1))

    # guess types and register them
    types = type_guess(replace_empty_string(row_set.sample), strict=True, types=[StringType, DecimalType, IntegerType])
    row_set.register_processor(types_processor(types))

    # Messytables seems to not handle the case where there are no headers.
    # Work around this as follows:
    # 1) offset must be 0
    # 2) if the types of the data match the headers, assume there are
    #    actually no headers
    if offset == 0:
        try:
            [t.cast(v) for (t, v) in zip(types, headers)]
        except:
            pass
        else:
            # We don't need the headers_processor or the offset_processor
            row_set._processors = []
            row_set.register_processor(strip_processor())
            row_set.register_processor(types_processor(types))
            headers = None

    # Construct the Myria schema
    schema = messy_to_schema(types, headers)
    logging.info("Myria schema: {}".format(json.dumps(schema)))

    # Prepare data for writing to Myria
    data, kwargs = write_data(row_set, schema)

    if not args.dry:
        # Connect to Myria and send the data
        connection = myria.MyriaConnection(hostname=args.hostname, port=args.port, ssl=args.ssl)
        ret = connection.upload_file(relation_key, schema, data, args.overwrite, **kwargs)

        sys.stdout.write(pretty_json(ret))
    else:
        sys.stdout.write(data)
Example #2
    def test_null_process(self):
        fh = horror_fobj('null.csv')
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        row_set.register_processor(null_processor(['null']))
        data = list(row_set)

        nones = [[x.value is None for x in row] for row in data]
        assert_equal(nones[0], [False, True, False, False])
        assert_equal(nones[1], [False, False, False, True])
        assert_equal(nones[2], [False, True, False, False])

        types = type_guess(row_set.sample, strict=True)
        expected_types = [IntegerType(), BoolType(), BoolType(),
                          BoolType()]
        assert_equal(types, expected_types)

        row_set.register_processor(types_processor(types))

        # after applying the types, '' should become None for int columns
        data = list(row_set)
        nones = [[x.value is None for x in row] for row in data]
        assert_equal(nones[0], [False, True, False, False])
        assert_equal(nones[1], [False, False, False, True])
        assert_equal(nones[2], [False, True, True, True])
Example #3
    def test_null_process(self):
        fh = horror_fobj('null.csv')
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        row_set.register_processor(null_processor(['null']))
        data = list(row_set)

        nones = [[x.value is None for x in row] for row in data]
        assert_equal(nones[0], [False, True, False, False])
        assert_equal(nones[1], [False, False, False, True])
        assert_equal(nones[2], [False, True, False, False])

        types = type_guess(row_set.sample, strict=True)
        expected_types = [
            IntegerType(),
            IntegerType(),
            IntegerType(),
            IntegerType()
        ]
        assert_equal(types, expected_types)

        row_set.register_processor(types_processor(types))

        # after applying the types, '' should become None for int columns
        data = list(row_set)
        nones = [[x.value is None for x in row] for row in data]
        assert_equal(nones[0], [False, True, False, False])
        assert_equal(nones[1], [False, False, False, True])
        assert_equal(nones[2], [False, True, True, True])
Example #4
    def test_apply_null_values(self):
        fh = horror_fobj('null.csv')
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        types = type_guess(row_set.sample, strict=True)
        expected_types = [
            IntegerType(),
            StringType(),
            IntegerType(),
            StringType()
        ]
        assert_equal(types, expected_types)

        row_set.register_processor(types_processor(types))
        data = list(row_set)
        # treat 'null' as non-empty text and 0 as a non-empty integer
        assert [x.empty for x in data[0]] == [False, False, False, False]
        assert [x.empty for x in data[1]] == [False, False, False, False]
        assert [x.empty for x in data[2]] == [False, False, True, True]
        assert [x.empty for x in data[3]] == [False, False, False, False]
        assert [x.empty for x in data[4]] == [False, False, False, True]
        assert [x.empty for x in data[5]] == [False, False, False, True]

        # we expect None for Integers and "" for empty strings in CSV
        assert [x.value for x in data[2]] == [3, "null", None, ""], data[2]
Example #5
def main(basic_config_file, batch_config_file):
    with open(basic_config_file, "r") as f:
        base_settings = yaml.load(f)

    if batch_config_file:
        # RUN MANY
        # parse csv into a list of settings-dicts
        import messytables
        with open(batch_config_file, "rb") as f:
            row_set = messytables.CSVRowSet("", f)
            offset, headers = messytables.headers_guess(row_set.sample)
            row_set.register_processor(messytables.headers_processor(headers))
            row_set.register_processor(messytables.offset_processor(offset +
                                                                    1))
            types = messytables.type_guess(row_set.sample, strict=True)
            row_set.register_processor(messytables.types_processor(types))
            settings_list = row_set.dicts()
        name = batch_config_file.replace(".csv", "")
        run_many(settings_list, name, base_settings=base_settings)
    else:
        # RUN ONE
        # parse yaml into a settings-dict
        settings_file = os.path.join(base_settings["out_dir"], "settings.yml")
        with open(settings_file, "w") as f:
            yaml.dump(base_settings, f)
        training_log, exit_status = run_one(**base_settings)
        training_log_file = os.path.join(base_settings["out_dir"],
                                         "training_log.csv")
        training_log.to_csv(training_log_file)
        stats = compute_final_stats(training_log)
        stats["exit_status"] = exit_status
        training_stats_file = os.path.join(base_settings["out_dir"],
                                           "training_stats.yml")
        with open(training_stats_file, "w") as f:
            yaml.dump(stats, f)
Example #6
    def connect(self,
                host=None,
                port=None,
                database=None,
                username=None,
                password=None,
                file=None):
        # TODO: mysql, pymssql, csv, sqlite3, pymongo, cx_Oracle
        self.database = database
        conn_string = ''
        if self.engine == 'psycopg2':
            if database:
                conn_string += "dbname='%s' " % database
            if username:
                conn_string += "user='******' " % username
            if host:
                conn_string += "host='%s' " % host
            if port:
                conn_string += "port='%s' " % port
            if password:
                conn_string += "password='******' " % password
            self.conn = psycopg2.connect(conn_string)

        elif self.engine == 'pymssql':
            self.conn = pymssql.connect(host,
                                        username,
                                        password,
                                        database,
                                        port=port,
                                        as_dict=True,
                                        charset='LATIN1')

        elif self.engine == 'csv':
            # https://messytables.readthedocs.io/en/latest/
            fh = StringIO.StringIO(self.data)
            #dialect = csv.Sniffer().sniff(f.read(1024))
            #f.seek(0)
            #self.conn = csv.DictReader(f, dialect=dialect)
            #fh = open('messy.csv', 'rb')

            # Load a file object:
            table_set = CSVTableSet(fh)
            row_set = table_set.tables[0]
            offset, headers = headers_guess(row_set.sample)
            row_set.register_processor(headers_processor(headers))
            row_set.register_processor(offset_processor(offset + 1))
            types = type_guess(row_set.sample, strict=True)
            row_set.register_processor(types_processor(types))

            self.conn = row_set

        return self.conn
Example #7
def proc(f, database_name, table_name):

    table_set = messytables.any_tableset(f)
    row_set = table_set.tables[0]

    # guess header names and the offset of the header:
    offset, headers = messytables.headers_guess(row_set.sample)
    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=[
        messytables.types.StringType,
        messytables.types.DateType,
    ], strict=True)
    hive_data_file = tempfile.NamedTemporaryFile(mode='w')

    fields_ddl = ','.join([
        '  {0} {1}\n'.format(
            canonicalize_column_name(colName),
            hive_column_type(colType)
        )
        for colName, colType in zip(headers, types)
    ])
    hive_sql = '''
DROP TABLE IF EXISTS {0};

CREATE TABLE {0} (
{1}
)
STORED AS TEXTFILE
TBLPROPERTIES ("comment"="add_messytable on {3}");

LOAD DATA LOCAL INPATH '{2}' OVERWRITE INTO TABLE {0};
'''.format(table_name, fields_ddl, hive_data_file.name,
        datetime.datetime.now().isoformat())

    hive_cmd_file = tempfile.NamedTemporaryFile(mode='w')
    print(hive_sql, file=hive_cmd_file)
    hive_cmd_file.flush()

    row_set.register_processor(messytables.types_processor(types))

    for row in row_set:
        print('\001'.join(map(str, [ c.value for c in row])),
                file=hive_data_file)
    hive_data_file.flush()

    subprocess.call([
        'hive',
        '--database', database_name,
        '-f', hive_cmd_file.name,
    ])
Example #8
    def test_read_type_guess_simple(self):
        fh = horror_fobj('simple.csv')
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        types = type_guess(row_set.sample)
        expected_types = [DateType("%Y-%m-%d"), IntegerType(), StringType()]
        assert_equal(types, expected_types)

        row_set.register_processor(types_processor(types))
        data = list(row_set)
        header_types = [c.type for c in data[0]]
        assert_equal(header_types, [StringType()] * 3)
        row_types = [c.type for c in data[2]]
        assert_equal(expected_types, row_types)
Example #9
def csvParse(csv_file_path):
    fh = open(csv_file_path, 'rb')
    # Load a file object:
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    # guess header names and the offset of the header:
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    # add one to begin with content, not the header:
    row_set.register_processor(offset_processor(offset + 1))
    # guess column types:
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))
    return row_set, headers, offset, types
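Below is a small, hedged driver for csvParse; the file name is a placeholder, and the cell attributes used (value) are the same ones the other examples rely on.

# Usage sketch ('data.csv' is a placeholder path on disk)
row_set, headers, offset, types = csvParse('data.csv')
print("header row found at offset", offset)
print("columns:", headers)
for row in row_set:
    # each row is a list of messytables Cell objects
    print([cell.value for cell in row])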
Example #10
    def test_read_type_guess_simple(self):
        fh = horror_fobj("simple.csv")
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        types = type_guess(row_set.sample)
        expected_types = [DateType("%Y-%m-%d"), IntegerType(), StringType()]
        assert_equal(types, expected_types)

        row_set.register_processor(types_processor(types))
        data = list(row_set)
        header_types = map(lambda c: c.type, data[0])
        assert_equal(header_types, [StringType()] * 3)
        row_types = map(lambda c: c.type, data[2])
        assert_equal(expected_types, row_types)
Example #11
def prepare_csv_rows(csv_file):
    row_set = CSVTableSet(csv_file).tables[0]

    offset, headers = headers_guess(row_set.sample)
    headers = [convert_header_to_column_name(header) for header in (h for h in headers if h)]

    row_set.register_processor(headers_processor_remove_blank(headers))
    row_set.register_processor(offset_processor(offset + 1))

    DateType.formats = create_date_formats(day_first=False)

    # We are never wanting boolean types, so remove that from the default list
    eligible_types = [StringType, DecimalType, IntegerType, DateType]
    types = type_guess(row_set.sample, types=eligible_types, strict=True)

    row_set.register_processor(types_processor(types))

    return row_set
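A brief usage sketch for prepare_csv_rows, assuming the project-specific helpers it references (convert_header_to_column_name, headers_processor_remove_blank, create_date_formats) are importable from the same module; the file name is a placeholder.

# Usage sketch only; 'input.csv' is a placeholder file
with open('input.csv', 'rb') as csv_file:
    row_set = prepare_csv_rows(csv_file)
    for row in row_set:
        # cell.column carries the normalised header, cell.value the typed value
        print([(cell.column, cell.value) for cell in row])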
Example #12
def csvimport_table(name):
    from messytables import CSVTableSet, type_guess
    from messytables import types_processor, headers_guess
    from messytables import headers_processor, offset_processor
    from spendb.etl.extract import parse_table

    row_set = CSVTableSet(data_fixture(name)).tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    rows = []
    for num_rows, (fields, row, samples) in enumerate(parse_table(row_set)):
        rows.append(row)

    return fields, rows
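For reference, a hedged call to csvimport_table; the fixture name is hypothetical and must be something data_fixture can resolve.

# Sketch: 'example' is a hypothetical fixture name resolved by data_fixture()
fields, rows = csvimport_table('example')
print(fields)
print(len(rows), "rows parsed")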
Example #14
def parse_table(source):
    # This is a work-around because messytables hangs on boto file
    # handles, so we're doing it via plain old HTTP.
    # We're also passing in an extended window size to give more
    # reliable type detection.
    # Because Python's CSV dialect sniffer isn't the best, this also
    # constrains the field quoting character to a double quote.
    table_set = mt.any_tableset(source.fh(),
                                extension=source.meta.get('extension'),
                                mimetype=source.meta.get('mime_type'),
                                quotechar='"',
                                window=20000)
    tables = list(table_set.tables)
    if not len(tables):
        log.error("No tables were found in the source file.")
        return
    row_set = tables[0]
    headers = [c.value for c in next(row_set.sample)]
    row_set.register_processor(mt.headers_processor(headers))
    row_set.register_processor(mt.offset_processor(1))
    types = mt.type_guess(row_set.sample, strict=True)
    row_set.register_processor(mt.types_processor(types, strict=True))

    fields, i = {}, 0
    row_iter = iter(row_set)

    while True:
        i += 1
        try:
            row = row_iter.next()
            if not len(fields):
                fields = generate_field_spec(row)

            data = convert_row(row, fields, i)
            check_empty = set(data.values())
            if None in check_empty and len(check_empty) == 1:
                continue

            yield None, fields, data
        except StopIteration:
            return
        except Exception, e:
            # log.exception(e)
            yield e, fields, None
Example #15
def parse_data(input):
    fh = open(input, 'rb')

    try:
        table_set = messytables.any_tableset(fh)
    except messytables.ReadError as e:
        print(e)
        # Bail out: without a parsed table_set there is nothing to process.
        raise

    get_row_set = lambda table_set: table_set.tables.pop()
    row_set = get_row_set(table_set)
    offset, headers = messytables.headers_guess(row_set.sample)
    # Some headers might have been converted from strings to floats and such.
    headers = [str(header) for header in headers]

    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)

    row_set.register_processor(messytables.types_processor(types))

    headers = [header.strip() for header in headers if header.strip()]
    headers_set = set(headers)

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                column_name = cell.column.strip()
                if column_name not in headers_set:
                    continue
                data_row[column_name] = cell.value
            yield data_row

    result = row_iterator()

    headers_dicts = [
        dict(id=field[0], type=TYPE_MAPPING[str(field[1])])
        for field in zip(headers, types)
    ]

    print('Determined headers and types: {headers}'.format(
        headers=headers_dicts))

    return headers_dicts, result
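A minimal sketch of calling parse_data; note that TYPES and TYPE_MAPPING are module-level constants in the original project and are not shown above, and the file name is a placeholder.

# Usage sketch; assumes TYPES and TYPE_MAPPING are defined as in the source module
headers_dicts, rows = parse_data('input.csv')
print(headers_dicts)          # list of {'id': ..., 'type': ...} dicts, as built above
for data_row in rows:
    print(data_row)           # dict keyed by the stripped header names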
Example #16
def parse_table(source):
    # This is a work-around because messytables hangs on boto file
    # handles, so we're doing it via plain old HTTP.
    # We're also passing in an extended window size to give more
    # reliable type detection.
    # Because Python's CSV dialect sniffer isn't the best, this also
    # constrains the field quoting character to a double quote.
    table_set = mt.any_tableset(source.fh(),
                                extension=source.meta.get('extension'),
                                mimetype=source.meta.get('mime_type'),
                                quotechar='"', window=20000)
    tables = list(table_set.tables)
    if not len(tables):
        log.error("No tables were found in the source file.")
        return
    row_set = tables[0]
    headers = [c.value for c in next(row_set.sample)]
    row_set.register_processor(mt.headers_processor(headers))
    row_set.register_processor(mt.offset_processor(1))
    types = mt.type_guess(row_set.sample, strict=True)
    row_set.register_processor(mt.types_processor(types, strict=True))

    fields, i = {}, 0
    row_iter = iter(row_set)

    while True:
        i += 1
        try:
            row = row_iter.next()
            if not len(fields):
                fields = generate_field_spec(row)

            data = convert_row(row, fields, i)
            check_empty = set(data.values())
            if None in check_empty and len(check_empty) == 1:
                continue

            yield None, fields, data
        except StopIteration:
            return
        except Exception, e:
            # log.exception(e)
            yield e, fields, None
Example #17
def resource_row_set(package, resource):
    """ Generate an iterator over all the rows in this resource's
    source data. """
    # This is a work-around because messytables hangs on boto file
    # handles, so we're doing it via plain old HTTP.
    table_set = any_tableset(resource.fh(),
                             extension=resource.meta.get('extension'),
                             mimetype=resource.meta.get('mime_type'))
    tables = list(table_set.tables)
    if not len(tables):
        log.error("No tables were found in the source file.")
        return

    row_set = tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))
    return row_set
Example #18
    def test_apply_null_values(self):
        fh = horror_fobj('null.csv')
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        types = type_guess(row_set.sample, strict=True)
        expected_types = [IntegerType(), StringType(), IntegerType(), StringType()]
        assert_equal(types, expected_types)

        row_set.register_processor(types_processor(types))
        data = list(row_set)
        # treat 'null' as non-empty text and 0 as a non-empty integer
        assert [x.empty for x in data[0]] == [False, False, False, False]
        assert [x.empty for x in data[1]] == [False, False, False, False]
        assert [x.empty for x in data[2]] == [False, False, True, True]
        assert [x.empty for x in data[3]] == [False, False, False, False]
        assert [x.empty for x in data[4]] == [False, False, False, True]
        assert [x.empty for x in data[5]] == [False, False, False, True]

        # we expect None for Integers and "" for empty strings in CSV
        assert [x.value for x in data[2]] == [3, "null", None, ""], data[2]
Example #19
def _guess_csv_datatype(fh):
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    logger.info("(offset, headers) = ({}, {})".format(offset, headers))

    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    counter = 0
    for row in row_set:
        logger.info(row)
        counter += 1
        if counter >= 32:
            break

    d = {h: t for h, t in zip(headers, types)}
    logger.info(d)
    return d
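A short usage sketch for _guess_csv_datatype, assuming a module-level logger is configured as in the original code; the path is a placeholder.

# Sketch only; 'input.csv' is a placeholder path
with open('input.csv', 'rb') as fh:
    column_types = _guess_csv_datatype(fh)
# column_types maps each guessed header name to a messytables type instance
print(column_types)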
Example #20
def parse_table(row_set, save_func):
    num_rows = 0
    fields = {}

    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    for i, row in enumerate(row_set):
        if not len(fields):
            fields = generate_field_spec(row)

        data = {}
        for cell, field in zip(row, fields):
            value = cell.value
            if isinstance(value, datetime):
                value = value.date()
            if isinstance(value, Decimal):
                # Baby jesus forgive me.
                value = float(value)
            if isinstance(value, basestring) and not len(value.strip()):
                value = None
            data[field['name']] = value
            random_sample(value, field, i)

        check_empty = set(data.values())
        if None in check_empty and len(check_empty) == 1:
            continue

        save_func(data)
        num_rows = i

    fields = {f.get('name'): f for f in fields}
    return num_rows, fields
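A hedged driver for the parse_table helper above; generate_field_spec and random_sample are assumed to live in the same module, and the save function here simply collects rows in a list.

# Usage sketch; 'input.csv' is a placeholder file name
from messytables import CSVTableSet

rows = []
with open('input.csv', 'rb') as fh:
    row_set = CSVTableSet(fh).tables[0]
    num_rows, fields = parse_table(row_set, rows.append)
print(num_rows, "rows saved,", len(fields), "fields detected")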
Example #21
def determine_messytables_types(file_handle, types=messytables.types.TYPES):
    """

    :param file_handle: file handle opened in binary mode
    :return: (headers, types, row_set)
    """

    # Load a file object:
    table_set = messytables.CSVTableSet(file_handle)

    # If you aren't sure what kind of file it is
    # table_set = messytables.any_tableset(file_handle)

    # A table set is a collection of tables:
    row_set = table_set.tables[0]

    # A row set is an iterator over the table, but it can only
    # be run once. To peek, a sample is provided:
    print(next(row_set.sample))

    # guess header names and the offset of the header:
    offset, headers = messytables.headers_guess(row_set.sample)
    row_set.register_processor(messytables.headers_processor(headers))

    # add one to begin with content, not the header:
    row_set.register_processor(messytables.offset_processor(offset + 1))

    # guess column types:
    types = messytables.type_guess(row_set.sample, types, strict=True)

    # and tell the row set to apply these types to
    # each row when traversing the iterator:
    row_set.register_processor(messytables.types_processor(types))

    # now run some operation on the data:
    return headers, types, row_set
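A minimal driver for determine_messytables_types, assuming a CSV file opened in binary mode as the docstring requires; the file name is a placeholder.

# Usage sketch; 'input.csv' is a placeholder path
with open('input.csv', 'rb') as file_handle:
    headers, types, row_set = determine_messytables_types(file_handle)
    print(dict(zip(headers, types)))      # column name -> guessed type
    for row in row_set:
        print([cell.value for cell in row])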
Example #22
def push_to_datastore(task_id, input, dry_run=False):
    '''Download and parse a resource push its data into CKAN's DataStore.

    An asynchronous job that gets a resource from CKAN, downloads the
    resource's data file and, if the data file has changed since last time,
    parses the data and posts it into CKAN's DataStore.

    :param dry_run: Fetch and parse the data file but don't actually post the
        data to the DataStore, instead return the data headers and rows that
        would have been posted.
    :type dry_run: boolean

    '''
    handler = util.StoringHandler(task_id, input)
    logger = logging.getLogger(task_id)
    logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)

    validate_input(input)

    data = input['metadata']

    ckan_url = data['ckan_url']
    resource_id = data['resource_id']
    api_key = input.get('api_key')

    try:
        resource = get_resource(resource_id, ckan_url, api_key)
    except util.JobError as e:
        # try again in 5 seconds just in case CKAN is slow at adding the resource
        time.sleep(5)
        resource = get_resource(resource_id, ckan_url, api_key)

    # check if the resource url_type is a datastore
    if resource.get('url_type') == 'datastore':
        logger.info('Dump files are managed with the Datastore API')
        return

    # check scheme
    url = resource.get('url')
    scheme = urlsplit(url).scheme
    if scheme not in ('http', 'https', 'ftp'):
        raise util.JobError(
            'Only http, https, and ftp resources may be fetched.'
        )

    # fetch the resource data
    logger.info('Fetching from: {0}'.format(url))
    headers = {}
    if resource.get('url_type') == 'upload':
        # If this is an uploaded file to CKAN, authenticate the request,
        # otherwise we won't get file from private resources
        headers['Authorization'] = api_key
    try:
        response = requests.get(
            url,
            headers=headers,
            timeout=DOWNLOAD_TIMEOUT,
            verify=SSL_VERIFY,
            stream=True,  # just gets the headers for now
        )
        response.raise_for_status()

        cl = response.headers.get('content-length')
        try:
            if cl and int(cl) > MAX_CONTENT_LENGTH:
                raise util.JobError(
                    'Resource too large to download: {cl} > max ({max_cl}).'
                    .format(cl=cl, max_cl=MAX_CONTENT_LENGTH))
        except ValueError:
            pass

        tmp = tempfile.TemporaryFile()
        length = 0
        m = hashlib.md5()
        for chunk in response.iter_content(CHUNK_SIZE):
            length += len(chunk)
            if length > MAX_CONTENT_LENGTH:
                raise util.JobError(
                    'Resource too large to process: {cl} > max ({max_cl}).'
                    .format(cl=length, max_cl=MAX_CONTENT_LENGTH))
            tmp.write(chunk)
            m.update(chunk)

        ct = response.headers.get('content-type', '').split(';', 1)[0]

    except requests.HTTPError as e:
        raise HTTPError(
            "DataPusher received a bad HTTP response when trying to download "
            "the data file", status_code=e.response.status_code,
            request_url=url, response=e.response.content)
    except requests.RequestException as e:
        raise HTTPError(
            message=str(e), status_code=None,
            request_url=url, response=None)

    file_hash = m.hexdigest()
    tmp.seek(0)

    if (resource.get('hash') == file_hash
            and not data.get('ignore_hash')):
        logger.info("The file hash hasn't changed: {hash}.".format(
            hash=file_hash))
        return

    resource['hash'] = file_hash

    try:
        table_set = messytables.any_tableset(tmp, mimetype=ct, extension=ct)
    except messytables.ReadError as e:
        # try again with format
        tmp.seek(0)
        try:
            format = resource.get('format')
            table_set = messytables.any_tableset(tmp, mimetype=format, extension=format)
        except:
            raise util.JobError(e)

    get_row_set = web.app.config.get('GET_ROW_SET',
                                     lambda table_set: table_set.tables.pop())
    row_set = get_row_set(table_set)
    offset, headers = messytables.headers_guess(row_set.sample)

    existing = datastore_resource_exists(resource_id, api_key, ckan_url)
    existing_info = None
    if existing:
        existing_info = dict((f['id'], f['info'])
            for f in existing.get('fields', []) if 'info' in f)

    # Some headers might have been converted from strings to floats and such.
    headers = [str(header) for header in headers]

    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)

    # override with types user requested
    if existing_info:
        types = [{
            'text': messytables.StringType(),
            'numeric': messytables.DecimalType(),
            'timestamp': messytables.DateUtilType(),
            }.get(existing_info.get(h, {}).get('type_override'), t)
            for t, h in zip(types, headers)]

    row_set.register_processor(messytables.types_processor(types))

    headers = [header.strip() for header in headers if header.strip()]
    headers_set = set(headers)

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                column_name = cell.column.strip()
                if column_name not in headers_set:
                    continue
                if isinstance(cell.value, str):
                    try:
                        data_row[column_name] = cell.value.encode('latin-1').decode('utf-8')
                    except (UnicodeDecodeError, UnicodeEncodeError):
                        data_row[column_name] = cell.value
                else:
                    data_row[column_name] = cell.value
            yield data_row
    result = row_iterator()

    '''
    Delete the existing datastore resource before proceeding. Otherwise
    'datastore_create' will append to the existing datastore. And if
    the fields have significantly changed, it may also fail.
    '''
    if existing:
        logger.info('Deleting "{res_id}" from datastore.'.format(
            res_id=resource_id))
        delete_datastore_resource(resource_id, api_key, ckan_url)

    headers_dicts = [dict(id=field[0], type=TYPE_MAPPING[str(field[1])])
                     for field in zip(headers, types)]

    # Maintain data dictionaries from matching column names
    if existing_info:
        for h in headers_dicts:
            if h['id'] in existing_info:
                h['info'] = existing_info[h['id']]
                # create columns with types user requested
                type_override = existing_info[h['id']].get('type_override')
                if type_override in list(_TYPE_MAPPING.values()):
                    h['type'] = type_override

    logger.info('Determined headers and types: {headers}'.format(
        headers=headers_dicts))

    if dry_run:
        return headers_dicts, result

    count = 0
    for i, chunk in enumerate(chunky(result, 250)):
        records, is_it_the_last_chunk = chunk
        count += len(records)
        logger.info('Saving chunk {number} {is_last}'.format(
            number=i, is_last='(last)' if is_it_the_last_chunk else ''))
        send_resource_to_datastore(resource, headers_dicts, records,
                                   is_it_the_last_chunk, api_key, ckan_url)

    logger.info('Successfully pushed {n} entries to "{res_id}".'.format(
        n=count, res_id=resource_id))

    if data.get('set_url_type', False):
        update_resource(resource, api_key, ckan_url)
Example #23
    def push_to_datastore(self, context, resource):

        # Get the resource's content hash, which is used to check whether the
        # resource file has changed since last time.
        hash_dict = resource.get('hash')
        if hash_dict:
            original_content_hash = json.loads(hash_dict)['content']
            check_hash = not self.options.force
        else:
            # This resource has no hash yet, it must be a new resource.
            original_content_hash = ''
            check_hash = False

        try:
            result = fetch_resource.download(context,
                                             resource,
                                             self.max_content_length,
                                             DATA_FORMATS,
                                             check_modified=check_hash)
        except fetch_resource.ResourceNotModified as e:
            logger.info(u'Skipping unmodified resource: {0}'.format(
                resource['url']))
            return {'success': True, 'resource': resource['id'], 'error': None}
        except Exception as e:
            logger.exception(e)
            return {
                'success': False,
                'resource': resource['id'],
                'error': 'Could not download resource'
            }

        if check_hash and (result['hash'] == original_content_hash):
            logger.info(u'Skipping unmodified resource: {0}'.format(
                resource['url']))
            os.remove(result['saved_file'])
            return {'success': True, 'resource': resource['id'], 'error': None}

        content_type = result['headers'].get('content-type', '')\
                                        .split(';', 1)[0]  # remove parameters

        f = open(result['saved_file'], 'rb')
        try:
            table_sets = any_tableset(f,
                                      mimetype=content_type,
                                      extension=resource['format'].lower())
            # only first sheet in xls for time being
            row_set = table_sets.tables[0]
            offset, headers = headers_guess(row_set.sample)
        except Exception as e:
            logger.exception(e)
            os.remove(result['saved_file'])
            return {
                'success': False,
                'resource': resource['id'],
                'error': 'Error parsing the resource'
            }

        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))
        row_set.register_processor(datetime_procesor())

        logger.info('Header offset: {0}.'.format(offset))

        guessed_types = type_guess(row_set.sample, [
            messytables.types.StringType, messytables.types.IntegerType,
            messytables.types.FloatType, messytables.types.DecimalType,
            messytables.types.DateUtilType
        ],
                                   strict=True)
        logger.info('Guessed types: {0}'.format(guessed_types))
        row_set.register_processor(types_processor(guessed_types, strict=True))
        row_set.register_processor(stringify_processor())

        guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

        def send_request(data):
            data_dict = {
                'resource_id':
                resource['id'],
                'fields': [
                    dict(id=name, type=typename)
                    for name, typename in zip(headers, guessed_type_names)
                ],
                'records':
                data,
                'force':
                True,
            }
            response = toolkit.get_action('datastore_create')(context,
                                                              data_dict)
            return response

        # Delete any existing data before proceeding. Otherwise
        # 'datastore_create' will append to the existing datastore. And if the
        # fields have significantly changed, it may also fail.
        logger.info('Trying to delete existing datastore for resource {0} '
                    '(may not exist).'.format(resource['id']))
        try:
            toolkit.get_action('datastore_delete')(
                context, {
                    'resource_id': resource['id'],
                    'force': True
                })
        except toolkit.ObjectNotFound:
            logger.info('Datastore not found for resource {0}.'.format(
                resource['id']))
        except Exception as e:
            logger.exception(e)

        logger.info('Creating: {0}.'.format(resource['id']))

        # generates chunks of data that can be loaded into ckan
        # n is the maximum size of a chunk
        def chunky(iterable, n):
            it = iter(iterable)
            while True:
                chunk = list(itertools.imap(dict, itertools.islice(it, n)))
                if not chunk:
                    return
                yield chunk

        count = 0
        try:
            for data in chunky(row_set.dicts(), 100):
                count += len(data)
                send_request(data)
        except Exception as e:
            logger.exception(e)
            os.remove(result['saved_file'])
            return {
                'success': False,
                'resource': resource['id'],
                'error': 'Error pushing data to datastore'
            }

        logger.info("There should be {n} entries in {res_id}.".format(
            n=count, res_id=resource['id']))

        resource.update({
            'webstore_url': 'active',
            'webstore_last_updated': datetime.now().isoformat()
        })

        toolkit.get_action('resource_update')(context, resource)
        os.remove(result['saved_file'])
        return {'success': True, 'resource': resource['id'], 'error': None}
Example #24
    logger.info('Header offset: {0}.'.format(offset))

    guessed_types = type_guess(
        row_set.sample,
        [
            messytables.types.StringType,
            messytables.types.IntegerType,
            messytables.types.FloatType,
            messytables.types.DecimalType,
            messytables.types.DateUtilType
        ],
        strict=True
    )
    logger.info('Guessed types: {0}'.format(guessed_types))
    row_set.register_processor(types_processor(guessed_types, strict=True))
    row_set.register_processor(stringify_processor())

    ckan_url = context['site_url'].rstrip('/')

    datastore_create_request_url = '%s/api/action/datastore_create' % (ckan_url)

    guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

    def send_request(data):
        request = {'resource_id': resource['id'],
                   'fields': [dict(id=name, type=typename) for name, typename in zip(headers, guessed_type_names)],
                   'records': data}
        response = requests.post(datastore_create_request_url,
                         data=json.dumps(request),
                         headers={'Content-Type': 'application/json',
Example #25
            # print(row)
            x += 1
        print("Rows: ", x - 1)  # prints the number of rows (minus the header)
        print("Columns: ", len(row))  # prints the number of columns

        cN_array = np.array(cN)
        print("Header: ", cN_array)

        ft = open(file, "rb")
        tableTypes = CSVTableSet(ft)
        columnTypes = tableTypes.tables[0]
        offset, headers = headers_guess(columnTypes.sample)
        columnTypes.register_processor(headers_processor(headers))
        columnTypes.register_processor(offset_processor(offset + 1))
        types = type_guess(columnTypes.sample, strict=True)
        columnTypes.register_processor(types_processor(types))

        h_length = len(headers)
        h_var = 0
        dt_array = []
        for each_type in range(0, h_length):
            types[h_var] = str(types[h_var])
            if types[h_var] == 'Decimal':
                types[h_var] = "Float"
            randVar = f"{headers[h_var]} : {types[h_var]}"
            dt_array.append(randVar)
            h_var += 1
        print("Datatypes: ", dt_array)  #

except Exception:
    print("File Unavailable")
Example #26
    def push_to_datastore(self, context, resource):

        # Get the resource's content hash, which is used to check whether the
        # resource file has changed since last time.
        hash_dict = resource.get('hash')
        if hash_dict:
            original_content_hash = json.loads(hash_dict)['content']
            check_hash = not self.options.force
        else:
            # This resource has no hash yet, it must be a new resource.
            original_content_hash = ''
            check_hash = False

        try:
            result = fetch_resource.download(context,
                                             resource,
                                             self.max_content_length,
                                             DATA_FORMATS,
                                             check_modified=check_hash)
        except fetch_resource.ResourceNotModified as e:
            logger.info(
                u'Skipping unmodified resource: {0}'.format(resource['url'])
            )
            return {'success': True,
                    'resource': resource['id'],
                    'error': None}
        except Exception as e:
            logger.exception(e)
            return {'success': False,
                    'resource': resource['id'],
                    'error': 'Could not download resource'}

        if check_hash and (result['hash'] == original_content_hash):
            logger.info(
                u'Skipping unmodified resource: {0}'.format(resource['url'])
            )
            os.remove(result['saved_file'])
            return {'success': True,
                    'resource': resource['id'],
                    'error': None}

        content_type = result['headers'].get('content-type', '')\
                                        .split(';', 1)[0]  # remove parameters

        f = open(result['saved_file'], 'rb')
        try:
            table_sets = any_tableset(
                f,
                mimetype=content_type,
                extension=resource['format'].lower()
            )
            # only first sheet in xls for time being
            row_set = table_sets.tables[0]
            offset, headers = headers_guess(row_set.sample)
        except Exception as e:
            logger.exception(e)
            os.remove(result['saved_file'])
            return {'success': False,
                    'resource': resource['id'],
                    'error': 'Error parsing the resource'}

        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))
        row_set.register_processor(datetime_procesor())

        logger.info('Header offset: {0}.'.format(offset))

        guessed_types = type_guess(
            row_set.sample,
            [
                messytables.types.StringType,
                messytables.types.IntegerType,
                messytables.types.FloatType,
                messytables.types.DecimalType,
                messytables.types.DateUtilType
            ],
            strict=True
        )
        logger.info('Guessed types: {0}'.format(guessed_types))
        row_set.register_processor(types_processor(guessed_types, strict=True))
        row_set.register_processor(stringify_processor())

        guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in
                              guessed_types]

        def send_request(data):
            data_dict = {
                'resource_id': resource['id'],
                'fields': [dict(id=name, type=typename) for name, typename
                           in zip(headers, guessed_type_names)],
                'records': data,
                'force': True,
            }
            response = toolkit.get_action('datastore_create')(
                context,
                data_dict
            )
            return response

        # Delete any existing data before proceeding. Otherwise
        # 'datastore_create' will append to the existing datastore. And if the
        # fields have significantly changed, it may also fail.
        logger.info('Trying to delete existing datastore for resource {0} '
                    '(may not exist).'.format(resource['id']))
        try:
            toolkit.get_action('datastore_delete')(
                context,
                {'resource_id': resource['id'], 'force': True}
            )
        except toolkit.ObjectNotFound:
            logger.info('Datastore not found for resource {0}.'.format(
                resource['id']))
        except Exception as e:
            logger.exception(e)

        logger.info('Creating: {0}.'.format(resource['id']))

        # generates chunks of data that can be loaded into ckan
        # n is the maximum size of a chunk
        def chunky(iterable, n):
            it = iter(iterable)
            while True:
                chunk = list(
                    itertools.imap(
                        dict, itertools.islice(it, n)))
                if not chunk:
                    return
                yield chunk

        count = 0
        try:
            for data in chunky(row_set.dicts(), 100):
                count += len(data)
                send_request(data)
        except Exception as e:
            logger.exception(e)
            os.remove(result['saved_file'])
            return {'success': False,
                    'resource': resource['id'],
                    'error': 'Error pushing data to datastore'}

        logger.info("There should be {n} entries in {res_id}.".format(
            n=count,
            res_id=resource['id']
        ))

        resource.update({
            'webstore_url': 'active',
            'webstore_last_updated': datetime.now().isoformat()
        })

        toolkit.get_action('resource_update')(context, resource)
        os.remove(result['saved_file'])
        return {'success': True,
                'resource': resource['id'],
                'error': None}
Example #27
def _datastorer_upload(context, resource, logger):

    excel_types = ['xls', 'application/ms-excel', 'application/xls',
                   'application/vnd.ms-excel']
    tsv_types = ['tsv', 'text/tsv', 'text/tab-separated-values']

    result = download(context, resource, data_formats=DATA_FORMATS)

    content_type = result['headers'].get('content-type', '')\
                                    .split(';', 1)[0]  # remove parameters

    f = open(result['saved_file'], 'rb')

    if content_type in excel_types or resource['format'] in excel_types:
        table_sets = XLSTableSet.from_fileobj(f)
    else:
        is_tsv = (content_type in tsv_types or
                  resource['format'] in tsv_types)
        delimiter = '\t' if is_tsv else ','
        table_sets = CSVTableSet.from_fileobj(f, delimiter=delimiter)

    ##only first sheet in xls for time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    logger.info('Header offset: {0}.'.format(offset))

    guessed_types = type_guess(
        row_set.sample,
        [
            messytables.types.StringType,
            messytables.types.IntegerType,
            messytables.types.FloatType,
            messytables.types.DecimalType,
            messytables.types.DateUtilType
        ],
        strict=True
    )
    logger.info('Guessed types: {0}'.format(guessed_types))
    row_set.register_processor(types_processor(guessed_types, strict=True))
    row_set.register_processor(stringify_processor())

    ckan_url = context['site_url'].rstrip('/')

    datastore_create_request_url = '%s/api/action/datastore_create' % (ckan_url)

    guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

    def send_request(data):
        request = {'resource_id': resource['id'],
                   'fields': [dict(id=name, type=typename) for name, typename in zip(headers, guessed_type_names)],
                   'records': data}
        response = requests.post(datastore_create_request_url,
                         data=json.dumps(request),
                         headers={'Content-Type': 'application/json',
                                  'Authorization': context['apikey']},
                         )
        check_response_and_retry(response, datastore_create_request_url, logger)

    logger.info('Creating: {0}.'.format(resource['id']))

    # generates chunks of data that can be loaded into ckan
    # n is the maximum size of a chunk
    def chunky(iterable, n):
        it = iter(iterable)
        while True:
            chunk = list(
                itertools.imap(
                    dict, itertools.islice(it, n)))
            if not chunk:
                return
            yield chunk

    count = 0
    for data in chunky(row_set.dicts(), 100):
        count += len(data)
        send_request(data)

    logger.info("There should be {n} entries in {res_id}.".format(n=count, res_id=resource['id']))

    ckan_request_url = ckan_url + '/api/action/resource_update'

    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': 'active',
        'webstore_last_updated': datetime.datetime.now().isoformat(),
        'url': resource['url']
    }

    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']})

    if response.status_code not in (201, 200):
        raise DatastorerException('Ckan bad response code (%s). Response was %s' %
                             (response.status_code, response.content))
Example #28
    def push_to_datastore(self, context, resource):
        try:
            result = download(
                context,
                resource,
                self.max_content_length,
                DATA_FORMATS
            )
        except Exception as e:
            logger.exception(e)
            return
        content_type = result['headers'].get('content-type', '')\
                                        .split(';', 1)[0]  # remove parameters

        f = open(result['saved_file'], 'rb')
        table_sets = AnyTableSet.from_fileobj(
            f,
            mimetype=content_type,
            extension=resource['format'].lower()
        )

        ##only first sheet in xls for time being
        row_set = table_sets.tables[0]
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))
        row_set.register_processor(datetime_procesor())

        logger.info('Header offset: {0}.'.format(offset))

        guessed_types = type_guess(
            row_set.sample,
            [
                messytables.types.StringType,
                messytables.types.IntegerType,
                messytables.types.FloatType,
                messytables.types.DecimalType,
                messytables.types.DateUtilType
            ],
            strict=True
        )
        logger.info('Guessed types: {0}'.format(guessed_types))
        row_set.register_processor(types_processor(guessed_types, strict=True))
        row_set.register_processor(stringify_processor())

        guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in
                              guessed_types]

        def send_request(data):
            data_dict = {
                'resource_id': resource['id'],
                'fields': [dict(id=name, type=typename) for name, typename
                           in zip(headers, guessed_type_names)],
                'records': data
            }
            response = logic.get_action('datastore_create')(
                context,
                data_dict
            )
            return response

        # Delete any existing data before proceeding. Otherwise
        # 'datastore_create' will append to the existing datastore. And if the
        # fields have significantly changed, it may also fail.
        logger.info('Deleting existing datastore (it may not exist): '
                    '{0}.'.format(resource['id']))
        try:
            logic.get_action('datastore_delete')(
                context,
                {'resource_id': resource['id']}
            )
        except Exception as e:
            logger.exception(e)

        logger.info('Creating: {0}.'.format(resource['id']))

        # generates chunks of data that can be loaded into ckan
        # n is the maximum size of a chunk
        def chunky(iterable, n):
            it = iter(iterable)
            while True:
                chunk = list(
                    itertools.imap(
                        dict, itertools.islice(it, n)))
                if not chunk:
                    return
                yield chunk

        count = 0
        for data in chunky(row_set.dicts(), 100):
            count += len(data)
            send_request(data)

        logger.info("There should be {n} entries in {res_id}.".format(
            n=count,
            res_id=resource['id']
        ))

        resource.update({
            'webstore_url': 'active',
            'webstore_last_updated': datetime.datetime.now().isoformat()
        })

        logic.get_action('resource_update')(context, resource)
Example #29
# A table set is a collection of tables:
row_set = table_set.tables[0]

# guess header names and the offset of the header:
offset, headers = headers_guess(row_set.sample)
row_set.register_processor(headers_processor(headers))

# add one to begin with content, not the header:
row_set.register_processor(offset_processor(offset + 1))

# guess column types:
types = type_guess(row_set.sample, strict=True)

# and tell the row set to apply these types to
# each row when traversing the iterator:
row_set.register_processor(types_processor(types))

#-------doing rest of operations using pandas------#
dataFrame = pd.read_csv(datafile)
ListColumns = list(dataFrame.columns)
rw, col = dataFrame.shape  #number of rows,number of colmuns
ListOfFunctionalDependecies = {}
ListOfCombinationsOfColumns = GetAllCombinations(ListColumns)
for_primary_key_and_file_breaking = [[]] * (len(ListOfCombinationsOfColumns))
candidate_keys = []
one_to_one_relation = []
many_to_one_relation = []
super_keys = []
StoringAllRelationsInADictionaryOfTuples = {}

for x in xrange(0, len(ListColumns)):
Example #30
def webstorer_upload(context, data):

    context = json.loads(context)
    resource = json.loads(data)
    
    excel_types = ['xls', 'application/ms-excel', 'application/xls']

    result = download(context, resource, data_formats=DATA_FORMATS)
    content_type = result['headers'].get('content-type', '')
    f = open(result['saved_file'], 'rb')

    if content_type in excel_types or resource['format'] in excel_types:
        table_sets = XLSTableSet.from_fileobj(f)
    else:
        table_sets = CSVTableSet.from_fileobj(f)

    ##only first sheet in xls for time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    types = guess_types(list(row_set.dicts(sample=True)))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(types_processor(types))

    rows = []
    
    for row in row_set.dicts():
        rows.append(row)


    webstore_url = context.get('webstore_url').rstrip('/')
    
    webstore_request_url = '%s/%s/%s' % (webstore_url,
                                         context['username'],
                                         resource['id']
                                         )
    #check if resource is already there.
    webstore_response = requests.get(webstore_request_url+'.json')
    check_response_and_retry(webstore_response, webstore_request_url+'.json')

    #should be an empty list as no tables should be there.
    if json.loads(webstore_response.content):
        raise WebstorerError('Webstore already has this resource')

    response = requests.post(webstore_request_url+'/data',
                             data = json.dumps(rows),
                             headers = {'Content-Type': 'application/json',
                                        'Authorization': context['apikey']},
                             )
    check_response_and_retry(response, webstore_request_url+'.json')
    if response.status_code != 201:
        raise WebstorerError('Webstore bad response code (%s). Response was %s'%
                             (response.status_code, response.content)
                            )

    ckan_url = context['site_url'].rstrip('/')
    ckan_request_url =  ckan_url + '/api/action/resource_update'

    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': webstore_request_url+'/data',
        'webstore_last_updated': datetime.datetime.now().isoformat()
    }

    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers = {'Content-Type': 'application/json',
                   'Authorization': context['apikey']},
        )

    if response.status_code not in (201, 200):
        raise WebstorerError('Ckan bad response code (%s). Response was %s'%
                             (response.status_code, response.content)
                            )
Example #31
0
def parse(stream, guess_types=True, **kwargs):
    '''Parse CSV file and return row iterator plus metadata (fields etc).

    Additional CSV arguments as per
    http://docs.python.org/2/library/csv.html#csv-fmt-params

    :param delimiter:
    :param quotechar:
    :param window: the size of the sample used for analysis

    There is also support for:

    :param encoding: file encoding (will be guessed with chardet if not provided)


    You can process csv as well as tsv files using this function. For tsv just
    pass::

        delimiter='\t'
    '''
    metadata = dict(**kwargs)
    delimiter = metadata.get('delimiter', None)
    quotechar = metadata.get('quotechar', None)
    window = metadata.get('window', None)
    encoding = metadata.get('encoding', None)
    table_set = CSVTableSet.from_fileobj(stream, delimiter=delimiter,
            quotechar=quotechar, encoding=encoding, window=window)
    row_set = table_set.tables.pop()
    offset, headers = headers_guess(row_set.sample)

    fields = []
    dup_columns = {}
    noname_count = 1
    if guess_types:
        guessable_types = [StringType, IntegerType, FloatType, DecimalType,
                           DateUtilType]
        row_types = type_guess(row_set.sample, guessable_types)
    for index, field in enumerate(headers):
        field_dict = {}
        if "" == field:
            field = '_'.join(['column', unicode(noname_count)])
            headers[index] = field
            noname_count += 1
        if headers.count(field) == 1:
            field_dict['id'] = field
        else:
            dup_columns[field] = dup_columns.get(field, 0) + 1
            field_dict['id'] = u'_'.join([field, unicode(dup_columns[field])])
        if guess_types:
            if isinstance(row_types[index], DateUtilType):
                field_dict['type'] = 'DateTime'
            else:
                field_dict['type'] = str(row_types[index])
        fields.append(field_dict)
    row_set.register_processor(headers_processor([x['id'] for x in fields]))
    row_set.register_processor(offset_processor(offset + 1))
    if guess_types:
        row_set.register_processor(types_processor(row_types))

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                data_row[cell.column] = cell.value
            yield data_row
    return row_iterator(), {'fields': fields}
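
A minimal usage sketch for the parse() helper above; the file name sample.csv is hypothetical and parse() is assumed to be in scope:

with open('sample.csv', 'rb') as stream:
    rows, metadata = parse(stream, guess_types=True, window=1000)
    print(metadata['fields'])   # list of {'id': ..., 'type': ...} dicts
    for row in rows:
        print(row)              # dict keyed by the de-duplicated header ids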
Example #32
0
def _datastorer_upload(context, resource, logger):
    result = download(context, resource, data_formats=DATA_FORMATS)

    content_type = result['headers'].get('content-type', '')\
                                    .split(';', 1)[0]  # remove parameters

    f = open(result['saved_file'], 'rb')
    table_sets = any_tableset(f,
                              mimetype=content_type,
                              extension=resource['format'].lower())

    # only the first sheet of an xls workbook is used for the time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    logger.info('Header offset: {0}.'.format(offset))

    guessed_types = type_guess(row_set.sample, [
        messytables.types.StringType, messytables.types.IntegerType,
        messytables.types.FloatType, messytables.types.DecimalType,
        messytables.types.DateUtilType
    ],
                               strict=True)
    logger.info('Guessed types: {0}'.format(guessed_types))
    row_set.register_processor(types_processor(guessed_types, strict=True))
    row_set.register_processor(stringify_processor())

    ckan_url = context['site_url'].rstrip('/')

    datastore_create_request_url = '%s/api/action/datastore_create' % (
        ckan_url)

    guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

    def send_request(data):
        request = {
            'resource_id':
            resource['id'],
            'fields': [
                dict(id=name, type=typename)
                for name, typename in zip(headers, guessed_type_names)
            ],
            'force':
            True,
            'records':
            data
        }
        response = requests.post(
            datastore_create_request_url,
            data=json.dumps(request),
            headers={
                'Content-Type': 'application/json',
                'Authorization': context['apikey']
            },
        )
        check_response_and_retry(response, datastore_create_request_url,
                                 logger)

    # Delete any existing data before proceeding. Otherwise 'datastore_create' will
    # append to the existing datastore. And if the fields have significantly changed,
    # it may also fail.
    try:
        logger.info(
            'Deleting existing datastore (it may not exist): {0}.'.format(
                resource['id']))
        response = requests.post('%s/api/action/datastore_delete' % (ckan_url),
                                 data=json.dumps({
                                     'resource_id': resource['id'],
                                     'force': True
                                 }),
                                 headers={
                                     'Content-Type': 'application/json',
                                     'Authorization': context['apikey']
                                 })
        if not response.status_code or response.status_code not in (200, 404):
            # skips 200 (OK) or 404 (datastore does not exist, no need to delete it)
            logger.error('Deleting existing datastore failed: {0}'.format(
                get_response_error(response)))
            raise DatastorerException("Deleting existing datastore failed.")
    except requests.exceptions.RequestException as e:
        logger.error('Deleting existing datastore failed: {0}'.format(str(e)))
        raise DatastorerException("Deleting existing datastore failed.")

    logger.info('Creating: {0}.'.format(resource['id']))

    # generates chunks of data that can be loaded into ckan
    # n is the maximum size of a chunk
    def chunky(iterable, n):
        it = iter(iterable)
        while True:
            chunk = list(itertools.imap(dict, itertools.islice(it, n)))
            if not chunk:
                return
            yield chunk

    count = 0
    for data in chunky(row_set.dicts(), 100):
        count += len(data)
        send_request(data)

    logger.info("There should be {n} entries in {res_id}.".format(
        n=count, res_id=resource['id']))

    ckan_request_url = ckan_url + '/api/action/resource_update'

    resource.update({
        'webstore_url':
        'active',
        'webstore_last_updated':
        datetime.datetime.now().isoformat()
    })

    response = requests.post(ckan_request_url,
                             data=json.dumps(resource),
                             headers={
                                 'Content-Type': 'application/json',
                                 'Authorization': context['apikey']
                             })

    if response.status_code not in (201, 200):
        raise DatastorerException(
            'Ckan bad response code (%s). Response was %s' %
            (response.status_code, response.content))
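
TYPE_MAPPING is not shown in this snippet; it is indexed by the class of each guessed type (TYPE_MAPPING[type(gt)]), so a minimal sketch consistent with that usage could look as follows. The datastore type names below are assumptions; the extension's real mapping may differ:

import messytables

# hypothetical mapping from guessed messytables type classes to datastore column types
TYPE_MAPPING = {
    messytables.types.StringType: 'text',
    messytables.types.IntegerType: 'numeric',
    messytables.types.FloatType: 'float',
    messytables.types.DecimalType: 'numeric',
    messytables.types.DateUtilType: 'timestamp',
}

Keying by the class rather than by an instance works here because type(gt) is the same object for every guess of a given type, even though the guessed instances differ.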
# A table set is a collection of tables:
row_set = table_set.tables[0]

# guess header names and the offset of the header:
offset, headers = headers_guess(row_set.sample)
row_set.register_processor(headers_processor(headers))

# add one to begin with content, not the header:
row_set.register_processor(offset_processor(offset + 1))

# guess column types:
types = type_guess(row_set.sample, strict=True)

# and tell the row set to apply these types to
# each row when traversing the iterator:
row_set.register_processor(types_processor(types))

def update_datetypes(dataFrame, ListColumns, rw):
    # `types` is the module-level list produced by type_guess() above and is
    # updated in place; `getType` and `datetime` are assumed to be defined or
    # imported elsewhere in the original script.
    for x in xrange(0, len(ListColumns)):
        datet = False
        tempList = dataFrame[ListColumns[x]].tolist()
        for i in xrange(1, rw):
            if not isinstance(tempList[i], str):
                break
            if getType(tempList[i]) == datetime:
                datet = True
                break
        if datet:
            types[x] = datetime

#creating dataframe
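
getType() used by update_datetypes() above is not shown in the snippet; a minimal sketch consistent with how it is called (returning the datetime class when a string parses as a date). The real helper and the date formats it accepts are assumptions:

from datetime import datetime

def getType(value):
    # hypothetical: report `datetime` when the string parses as an ISO date,
    # otherwise report the value's own type
    try:
        datetime.strptime(value, '%Y-%m-%d')
        return datetime
    except (ValueError, TypeError):
        return type(value)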
Example #34
0
def main(argv=None):
    args = parse_args(argv)

    if args.file is None:
        # slurp the whole input since there seems to be a bug in messytables
        # which should be able to handle streams but doesn't
        args.file = cStringIO.StringIO(sys.stdin.read())

    relation_key = args_to_relation_key(args)

    table_set = any_tableset(args.file)
    if len(table_set.tables) != 1:
        raise ValueError("Can only handle files with a single table, not %s" %
                         len(table_set.tables))

    row_set = table_set.tables[0]

    # guess header names and the offset of the header:
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(strip_processor())
    row_set.register_processor(headers_processor(headers))
    # Temporarily, mark the offset of the header
    row_set.register_processor(offset_processor(offset + 1))

    # guess types and register them
    types = type_guess(replace_empty_string(row_set.sample),
                       strict=True,
                       types=[StringType, DecimalType, IntegerType])
    row_set.register_processor(types_processor(types))

    # Messytables seems to not handle the case where there are no headers.
    # Work around this as follows:
    # 1) offset must be 0
    # 2) if the types of the data match the headers, assume there are
    #    actually no headers
    if offset == 0:
        try:
            [t.cast(v) for (t, v) in zip(types, headers)]
        except:
            pass
        else:
            # We don't need the headers_processor or the offset_processor
            row_set._processors = []
            row_set.register_processor(strip_processor())
            row_set.register_processor(types_processor(types))
            headers = None

    # Construct the Myria schema
    schema = messy_to_schema(types, headers)
    logging.info("Myria schema: {}".format(json.dumps(schema)))

    # Prepare data for writing to Myria
    data, kwargs = write_data(row_set, schema)

    if not args.dry:
        # Connect to Myria and send the data
        connection = myria.MyriaConnection(hostname=args.hostname,
                                           port=args.port,
                                           ssl=args.ssl)
        ret = connection.upload_file(relation_key, schema, data,
                                     args.overwrite, **kwargs)

        sys.stdout.write(pretty_json(ret))
    else:
        sys.stdout.write(data)
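
A small illustration of the no-headers heuristic above, using hypothetical values: when the row found at offset 0 casts cleanly under the guessed types, it is treated as data rather than as a header row.

from messytables.types import IntegerType, DecimalType, StringType

# hypothetical "headers" guessed at offset 0 that are really a data row
headers = ['1', '2.5', 'foo']
types = [IntegerType(), DecimalType(), StringType()]

try:
    [t.cast(v) for (t, v) in zip(types, headers)]
except Exception:
    print('real header row - keep the headers_processor')
else:
    print('casts succeeded - treat the first row as data')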
Example #35
0
def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
    '''Loads an Excel file (or other tabular data recognized by messytables)
    into Datastore and creates indexes.

    Largely copied from datapusher - see below. Is slower than load_csv.
    '''

    # use messytables to determine the header row
    logger.info('Determining column names and types')
    ct = mimetype
    format = os.path.splitext(table_filepath)[1]  # filename extension
    with open(table_filepath, 'rb') as tmp:

        #
        # Copied from datapusher/jobs.py:push_to_datastore
        #

        try:
            table_set = messytables.any_tableset(tmp,
                                                 mimetype=ct,
                                                 extension=ct)
        except messytables.ReadError as e:
            # try again with format
            tmp.seek(0)
            try:
                table_set = messytables.any_tableset(tmp,
                                                     mimetype=format,
                                                     extension=format)
            except Exception as e:
                raise LoaderError(e)

        if not table_set.tables:
            raise LoaderError('Could not parse file as tabular data')
        row_set = table_set.tables.pop()
        offset, headers = messytables.headers_guess(row_set.sample)

        existing = datastore_resource_exists(resource_id)
        existing_info = None
        if existing:
            existing_info = dict((f['id'], f['info'])
                                 for f in existing.get('fields', [])
                                 if 'info' in f)

        # Some headers might have been converted from strings to floats and such.
        headers = encode_headers(headers)

        row_set.register_processor(messytables.headers_processor(headers))
        row_set.register_processor(messytables.offset_processor(offset + 1))
        TYPES, TYPE_MAPPING = get_types()
        types = messytables.type_guess(row_set.sample,
                                       types=TYPES,
                                       strict=True)

        # override with types user requested
        if existing_info:
            types = [{
                'text': messytables.StringType(),
                'numeric': messytables.DecimalType(),
                'timestamp': messytables.DateUtilType(),
            }.get(existing_info.get(h, {}).get('type_override'), t)
                     for t, h in zip(types, headers)]

        row_set.register_processor(messytables.types_processor(types))

        headers = [
            header.strip()[:MAX_COLUMN_LENGTH] for header in headers
            if header.strip()
        ]
        headers_set = set(headers)

        def row_iterator():
            for row in row_set:
                data_row = {}
                for index, cell in enumerate(row):
                    column_name = cell.column.strip()
                    if column_name not in headers_set:
                        continue
                    data_row[column_name] = cell.value
                yield data_row

        result = row_iterator()
        '''
        Delete existing datastore resource before proceeding. Otherwise
        'datastore_create' will append to the existing datastore. And if
        the fields have significantly changed, it may also fail.
        '''
        if existing:
            logger.info('Deleting "{res_id}" from datastore.'.format(
                res_id=resource_id))
            delete_datastore_resource(resource_id)

        headers_dicts = [
            dict(id=field[0], type=TYPE_MAPPING[str(field[1])])
            for field in zip(headers, types)
        ]

        # Maintain data dictionaries from matching column names
        if existing_info:
            for h in headers_dicts:
                if h['id'] in existing_info:
                    h['info'] = existing_info[h['id']]
                    # create columns with types user requested
                    type_override = existing_info[h['id']].get('type_override')
                    if type_override in _TYPE_MAPPING.values():
                        h['type'] = type_override

        logger.info('Determined headers and types: {headers}'.format(
            headers=headers_dicts))

        ### Commented - this is only for tests
        # if dry_run:
        #     return headers_dicts, result

        logger.info('Copying to database...')
        count = 0
        for i, records in enumerate(chunky(result, 250)):
            count += len(records)
            logger.info('Saving chunk {number}'.format(number=i))
            send_resource_to_datastore(resource_id, headers_dicts, records)
        logger.info('...copying done')

        if count:
            logger.info(
                'Successfully pushed {n} entries to "{res_id}".'.format(
                    n=count, res_id=resource_id))
        else:
            # no datastore table is created
            raise LoaderError('No entries found - nothing to load')
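
get_types() and _TYPE_MAPPING are defined elsewhere in the loader and are not shown here. Below is a sketch consistent with how they are used above (TYPES feeds type_guess(), the mapping is indexed by str(field[1]), and its values are the datastore type names checked against type_override); the concrete types and names are assumptions:

import messytables

# hypothetical mapping, keyed by the string form of a guessed messytables type
_TYPE_MAPPING = {
    'String': 'text',
    'Integer': 'numeric',
    'Decimal': 'numeric',
    'DateUtil': 'timestamp',
}

def get_types():
    TYPES = [messytables.StringType, messytables.DecimalType,
             messytables.IntegerType, messytables.DateUtilType]
    return TYPES, _TYPE_MAPPING

Note that this snippet indexes the mapping by the string form of the guessed type, unlike Example #32 above, which indexes by the type class.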
Example #36
0
def parse(stream, guess_types=True, **kwargs):
    '''Parse CSV file and return row iterator plus metadata (fields etc).

    Additional CSV arguments as per
    http://docs.python.org/2/library/csv.html#csv-fmt-params

    :param delimiter:
    :param quotechar:
    :param window: the size of the sample used for analysis

    There is also support for:

    :param encoding: file encoding (will be guessed with chardet if not provided)


    You can process csv as well as tsv files using this function. For tsv just
    pass::

        delimiter='\t'
    '''
    metadata = dict(**kwargs)
    delimiter = metadata.get('delimiter', None)
    quotechar = metadata.get('quotechar', None)
    window = metadata.get('window', None)
    encoding = metadata.get('encoding', None)
    table_set = CSVTableSet(stream,
                            delimiter=delimiter,
                            quotechar=quotechar,
                            encoding=encoding,
                            window=window)
    strict_type_guess = metadata.get('strict_type_guess', False)
    row_set = table_set.tables.pop()
    offset, headers = headers_guess(row_set.sample)

    fields = []
    dup_columns = {}
    noname_count = 1
    if guess_types:
        guessable_types = [
            StringType, IntegerType, FloatType, DecimalType, DateUtilType
        ]
        sample = row_set.sample
        for _ in range(offset + 1):
            sample.next()
        row_types = type_guess(sample,
                               guessable_types,
                               strict=strict_type_guess)
    for index, field in enumerate(headers):
        field_dict = {}
        if "" == field:
            field = '_'.join(['column', unicode(noname_count)])
            headers[index] = field
            noname_count += 1
        if headers.count(field) == 1:
            field_dict['id'] = field
        else:
            dup_columns[field] = dup_columns.get(field, 0) + 1
            field_dict['id'] = u'_'.join([field, unicode(dup_columns[field])])
        if guess_types:
            if isinstance(row_types[index], DateUtilType):
                field_dict['type'] = 'DateTime'
            else:
                field_dict['type'] = str(row_types[index])
        fields.append(field_dict)
    row_set.register_processor(headers_processor([x['id'] for x in fields]))
    row_set.register_processor(offset_processor(offset + 1))
    if guess_types:
        row_set.register_processor(
            types_processor(row_types, strict=strict_type_guess))

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                data_row[cell.column] = cell.value
            yield data_row

    return row_iterator(), {'fields': fields}
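
As the docstring above notes, TSV input only needs delimiter='\t'. A minimal sketch that also exercises the strict_type_guess option specific to this variant (sample.tsv is hypothetical and parse() is assumed to be in scope):

with open('sample.tsv', 'rb') as stream:
    rows, metadata = parse(stream, delimiter='\t', strict_type_guess=True)
    print([f['type'] for f in metadata['fields']])
    for row in rows:
        print(row)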
Example #37
0
    headers = [unicode(header) for header in headers]

    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)

    # override with types user requested
    if existing_info:
        types = [{
            'text': messytables.StringType(),
            'numeric': messytables.DecimalType(),
            'timestamp': messytables.DateUtilType(),
        }.get(existing_info.get(h, {}).get('type_override'), t)
                 for t, h in zip(types, headers)]

    row_set.register_processor(messytables.types_processor(types))

    headers = [header.strip() for header in headers if header.strip()]
    headers_set = set(headers)

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                column_name = cell.column.strip()
                if column_name not in headers_set:
                    continue
                data_row[column_name] = cell.value
            yield data_row

    result = row_iterator()
Example #38
0
def _datastorer_upload(context, resource):

    excel_types = ['xls', 'application/ms-excel', 'application/xls', 'application/vnd.ms-excel']

    result = download(context, resource, data_formats=DATA_FORMATS)
    content_type = result['headers'].get('content-type', '')
    f = open(result['saved_file'], 'rb')

    if content_type in excel_types or resource['format'] in excel_types:
        table_sets = XLSTableSet.from_fileobj(f)
    else:
        table_sets = CSVTableSet.from_fileobj(f)

    # only the first sheet of an xls workbook is used for the time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    types = guess_types(list(row_set.dicts(sample=True)))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(types_processor(types))


    ckan_url = context['site_url'].rstrip('/')
    
    webstore_request_url = '%s/api/data/%s/' % (ckan_url,
                                                resource['id']
                                                )

    def send_request(data):
        return requests.post(webstore_request_url + '_bulk',
                             data = "%s%s" % ("\n".join(data), "\n"),
                             headers = {'Content-Type': 'application/json',
                                        'Authorization': context['apikey']},
                             )

    data = []
    for count,dict_ in enumerate(row_set.dicts()):
        data.append(json.dumps({"index": {"_id": count+1}}))
        data.append(json.dumps(dict_))
        if (count % 100) == 0:
            response = send_request(data)
            check_response_and_retry(response, webstore_request_url+'_mapping')
            data[:] = []

    if data:
        response = send_request(data)
        check_response_and_retry(response, webstore_request_url+'_mapping')


    ckan_request_url =  ckan_url + '/api/action/resource_update'

    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': webstore_request_url,
        'webstore_last_updated': datetime.datetime.now().isoformat()
    }

    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers = {'Content-Type': 'application/json',
                   'Authorization': context['apikey']},
        )

    if response.status_code not in (201, 200):
        raise WebstorerError('Ckan bad response code (%s). Response was %s'%
                             (response.status_code, response.content)
                            )
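
The send_request() helper above posts newline-delimited JSON in a bulk-index format: an action line followed by the document itself. A sketch of how the request body is assembled, with hypothetical rows:

import json

rows = [{'name': 'alice', 'age': 34}, {'name': 'bob', 'age': 28}]
data = []
for count, dict_ in enumerate(rows):
    data.append(json.dumps({"index": {"_id": count + 1}}))
    data.append(json.dumps(dict_))
body = "%s%s" % ("\n".join(data), "\n")
print(body)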
Example #39
0
        table_set = messytables.any_tableset(f, mimetype=ct, extension=ct)
    except messytables.ReadError as e:
        ## try again with format
        f.seek(0)
        try:
            format = resource.get('format')
            table_set = messytables.any_tableset(f, mimetype=format, extension=format)
        except:
            raise util.JobError(e)

    row_set = table_set.tables.pop()
    offset, headers = messytables.headers_guess(row_set.sample)
    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)
    row_set.register_processor(messytables.types_processor(types))

    headers = [header.strip() for header in headers if header.strip()]
    headers_set = set(headers)

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                column_name = cell.column.strip()
                if column_name not in headers_set:
                    continue
                data_row[column_name] = cell.value
            yield data_row
    result = row_iterator()
def _datastorer_upload(context, resource, logger):
    result = download(context, resource, data_formats=DATA_FORMATS)
    logger.info('Downloaded resource %r' %(resource))

    content_type = result['headers'].get('content-type', '')\
                                    .split(';', 1)[0]  # remove parameters
    
    extension = resource['format'].lower()
    
    fp = open(result['saved_file'], 'rb')
    if zipfile.is_zipfile(result['saved_file']):
        fp, zf = open_zipped_tableset(fp, extension=extension)
        logger.info('Opened entry %s from ZIP archive %s', zf, result['saved_file'])
    else:
        logger.info('Opened file %s' %(result['saved_file']))

    table_sets = any_tableset(fp, extension=extension)
    
    if 'sample_size' in context:
        table_sets.window = max(1000, int(context['sample_size']))
        logger.info('Using a sample window of %d', table_sets.window)

    # only the first sheet of an xls workbook is used for the time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    logger.info('Header offset: {0}.'.format(offset))

    guessed_types = type_guess(
        row_set.sample,
        [
            messytables.types.StringType,
            messytables.types.IntegerType,
            messytables.types.FloatType,
            messytables.types.DecimalType,
            messytables.types.DateUtilType
        ],
        strict=True
    )
    logger.info('Guessed types: {0}'.format(guessed_types))
    row_set.register_processor(types_processor(guessed_types, strict=True))
    row_set.register_processor(stringify_processor())

    ckan_url = context['site_url'].rstrip('/')

    datastore_create_request_url = '%s/api/action/datastore_create' % (ckan_url)

    guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

    def send_request(data):
        request = {'resource_id': resource['id'],
                   'fields': [dict(id=name, type=typename) for name, typename in zip(headers, guessed_type_names)],
                   'force': True,
                   'records': data}
        response = requests.post(datastore_create_request_url,
                         data=json.dumps(request),
                         headers={'Content-Type': 'application/json',
                                  'Authorization': context['apikey']},
                         )
        check_response_and_retry(response, datastore_create_request_url, logger)

    # Delete any existing data before proceeding. Otherwise 'datastore_create' will
    # append to the existing datastore. And if the fields have significantly changed,
    # it may also fail.
    try:
        logger.info('Deleting existing datastore (it may not exist): {0}.'.format(resource['id']))
        response = requests.post('%s/api/action/datastore_delete' % (ckan_url),
                                 data=json.dumps({'resource_id': resource['id'], 'force': True}),
                        headers={'Content-Type': 'application/json',
                                'Authorization': context['apikey']}
                        )
        if not response.status_code or response.status_code not in (200, 404):
            # skips 200 (OK) or 404 (datastore does not exist, no need to delete it)
            logger.error('Deleting existing datastore failed: {0}'.format(get_response_error(response)))
            raise DatastorerException("Deleting existing datastore failed.")
    except requests.exceptions.RequestException as e:
        logger.error('Deleting existing datastore failed: {0}'.format(str(e)))
        raise DatastorerException("Deleting existing datastore failed.")

    logger.info('Creating: {0}.'.format(resource['id']))

    # generates chunks of data that can be loaded into ckan
    # n is the maximum size of a chunk
    def chunky(iterable, n):
        it = iter(iterable)
        while True:
            chunk = list(
                itertools.imap(
                    dict, itertools.islice(it, n)))
            if not chunk:
                return
            yield chunk

    count = 0
    for data in chunky(row_set.dicts(), 100):
        count += len(data)
        send_request(data)

    logger.info("There should be {n} entries in {res_id}.".format(n=count, res_id=resource['id']))

    ckan_request_url = ckan_url + '/api/action/resource_update'

    resource.update({
        'webstore_url': 'active',
        'webstore_last_updated': datetime.datetime.now().isoformat()
    })

    response = requests.post(
        ckan_request_url,
        data=json.dumps(resource),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']})

    if response.status_code not in (201, 200):
        raise DatastorerException('Ckan bad response code (%s). Response was %s' %
                             (response.status_code, response.content))
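
open_zipped_tableset() is not shown in this snippet; a sketch consistent with how it is called above (return a seekable file object for a matching ZIP entry together with the entry name). This is purely an assumption about the real helper:

import cStringIO
import zipfile

def open_zipped_tableset(fp, extension=None):
    # hypothetical: pick the first entry matching the expected extension
    # (falling back to the first entry) and load it into a seekable buffer
    zf = zipfile.ZipFile(fp)
    names = zf.namelist()
    if extension:
        matching = [n for n in names if n.lower().endswith('.' + extension.lower())]
        names = matching or names
    entry = names[0]
    return cStringIO.StringIO(zf.read(entry)), entry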