Example 1
    def write(self, df):

        row_count, col_count = df.shape

        # CSV chunksize needs to be one more than the block size (the number of rows in the
        # batch), so if we do get a UnicodeDecodeError no rows will have been written out
        chunksize = row_count + 1

        try:
            # Test whether the data can be encoded - we need to test for this here, before
            # calling to_csv, which writes line by line - otherwise lines preceding an error
            # would already have been written and would cause duplicate key constraint
            # violations when the batch is imported into the DB
            json.dumps(df.to_dict(outtype='records')).encode('ascii')
        except UnicodeDecodeError:

            # Encoding failed - rather than ditching the whole batch, loop through and write
            # each row individually, logging an error for any failures
            # Some of these failures are just corrupt records in KE EMu,
            # for example: DarFieldNumber:1=ÃÆâ

            # Loop through each row
            for i in range(row_count):
                # Get one row of the dataframe as new frame
                df_row = df[i:i+1]
                try:
                    # Try to write the row
                    df_row.to_csv(self.path, mode='a', columns=self.columns.keys(), index=False, header=False, encoding='utf-8')
                except UnicodeDecodeError:
                    # On failure, log an error with the _id of that row
                    log.critical('UTF8 Encoding error for record irn=%s', df_row.iloc[-1]['_id'])

        else:
            # Batch is good to write to CSV
            df.to_csv(self.path, chunksize=chunksize, mode='a', columns=self.columns.keys(), index=False, header=False, encoding='utf-8')
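Example 1 is a method on a CSV writer class, so it relies on state set up elsewhere: self.path (the file being appended to), self.columns (a mapping of column name to numpy dtype string, whose keys are passed to to_csv as the output column list), a module-level log, and incoming batches that are pandas DataFrames carrying an '_id' column. It is also Python 2 era code - to_dict(outtype='records') is the earlier spelling of what current pandas calls orient='records'. A rough sketch of that surrounding context, with class, path and column names that are illustrative only and not taken from the original source:

    import json      # used inside write() for the encoding pre-check
    import logging

    import pandas as pd

    log = logging.getLogger(__name__)


    class CSVTarget(object):
        """Hypothetical stand-in for the writer class the method above belongs to."""

        def __init__(self, path, columns):
            self.path = path        # destination CSV file, appended to on every write()
            self.columns = columns  # column name -> numpy dtype string, e.g. 'float64'

        # def write(self, df): ...  (exactly as in Example 1)


    target = CSVTarget('specimens.csv', {'_id': 'int64', 'DarFieldNumber': 'object'})
    batch = pd.DataFrame([{'_id': 1, 'DarFieldNumber': 'ABC-123'}])
    # target.write(batch)  # appends this batch to specimens.csv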
Example 2
    def write(self, df):

        log.info("Saving records to CKAN resource %s", self.resource_id)

        # Convert all empty/null values to None, so they become NULL values in postgres
        # Ensure any float fields with the value 0.0 are also set to None
        for col, np_type in self.columns.iteritems():

            if np_type.startswith('float'):
                df[col][df[col] == 0.0] = None
            else:
                # BUGFIX: Multimedia fields are being populated with empty string rather than NULL
                df[col][df[col].astype(str) == ''] = None

        # Loop through all the dataframe columns, removing internal ones (fields starting with _)
        for col in df:
            if col.startswith('_'):
                df.drop(col, axis=1, inplace=True)

        # Convert all NaN to None
        df = df.where(pd.notnull(df), None)

        # Convert records to dictionary
        records = df.to_dict(outtype='records')
        datastore_params = {
            'resource_id': self.resource_id,
            'records': records,
            'force': True
            # 'primary_key': '_id'
        }

        # Check that the data doesn't contain invalid chars
        try:
            json.dumps(datastore_params).encode('ascii')
        except UnicodeDecodeError:
            # At least one of the records contains invalid chars
            # Loop through, validating each of the records

            validated_records = []

            for i, record in enumerate(datastore_params['records']):
                try:
                    json.dumps(record).encode('ascii')
                except UnicodeDecodeError:
                    log.critical('Error encoding record: %s', ' '.join(['%s=%s' % (field, value) for field, value in record.iteritems() if value]))
                else:
                    validated_records.append(record)

            datastore_params['records'] = validated_records

        self.remote_ckan.action.datastore_upsert(**datastore_params)
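The remote_ckan attribute in Example 2 is only ever used as remote_ckan.action.datastore_upsert(...), which matches the attribute-style action proxy exposed by the ckanapi package's RemoteCKAN client. Assuming that is what it is, a minimal sketch of constructing such a client and issuing the same upsert - the URL, API key and resource id below are placeholders, and the resource is assumed to already exist in the datastore with a suitable primary key:

    from ckanapi import RemoteCKAN

    # Placeholder endpoint and credentials - not taken from the original source
    remote_ckan = RemoteCKAN('https://data.example.org', apikey='my-api-key')

    remote_ckan.action.datastore_upsert(
        resource_id='resource-id-goes-here',
        records=[{'_id': 1, 'DarScientificName': 'Panthera leo'}],
        force=True
    )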