def test_strict_type_guessing_with_large_file(self):
    fh = horror_fobj('211.csv')
    rows = CSVTableSet(fh).tables[0]
    offset, headers = headers_guess(rows.sample)
    rows.register_processor(offset_processor(offset + 1))
    types = [StringType, IntegerType, DecimalType, DateUtilType]
    guessed_types = type_guess(rows.sample, types, True)
    assert_equal(len(guessed_types), 96)
    assert_equal(guessed_types, [
        IntegerType(), StringType(), StringType(), StringType(), StringType(), StringType(),
        IntegerType(), StringType(), StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), IntegerType(), StringType(), DecimalType(),
        DecimalType(), StringType(), StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(), StringType(), StringType(),
        IntegerType(), StringType(), StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(), IntegerType(), StringType(),
        StringType(), StringType(), StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(), StringType(), DateUtilType(),
        DateUtilType(), DateUtilType(), DateUtilType(), StringType(), StringType(), StringType()])
def get_schema(self, filename):
    """ Guess schema using messytables """
    table_set = self.read_file(filename)

    # Have I been able to read the file?
    if table_set is None:
        return []

    # Get the first table as a row set
    row_set = table_set.tables[0]

    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)

    # Get a sample as well..
    sample = next(row_set.sample)

    clean = lambda v: str(v) if not isinstance(v, str) else v
    schema = []
    for i, h in enumerate(headers):
        schema.append([h, str(types[i]), clean(sample[i].value)])

    return schema
def test_file_with_few_strings_among_integers(self):
    fh = horror_fobj('mixedGLB.csv')
    rows = CSVTableSet(fh).tables[0]
    offset, headers = headers_guess(rows.sample)
    rows.register_processor(offset_processor(offset + 1))
    types = [StringType, IntegerType, DecimalType, DateUtilType]
    guessed_types = type_guess(rows.sample, types, True)
    assert_equal(len(guessed_types), 19)
    print(guessed_types)
    assert_equal(guessed_types, [
        IntegerType(), IntegerType(), IntegerType(), IntegerType(),
        IntegerType(), IntegerType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        IntegerType(), StringType(), StringType()
    ])
def generate_table(self, meta, sheet, row_set):
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    schema = TabularSchema({
        'sheet_name': row_set.name,
        'content_hash': meta.content_hash,
        'sheet': sheet
    })
    columns = [schema.add_column(h) for h in headers]
    log.info("Creating internal table: %s columns, table: %r",
             len(columns), schema.table_name)
    tabular = Tabular(schema)
    tabular.drop()
    tabular.create()

    def generate_rows():
        for i, row in enumerate(row_set):
            record = {}
            for cell, column in zip(row, columns):
                record[column.name] = string_value(cell.value)
            if len(record):
                for column in columns:
                    record[column.name] = record.get(column.name, None)
                yield record
        log.info("Loaded %s rows.", i)

    tabular.load_iter(generate_rows())
    return schema
def get_column_types(data: io.BytesIO) \
        -> Tuple[List[str], List[types.CellType]]:
    """derive the column types

    Using messytables' CSV API, attempt to derive the column types based on
    a best-guess of a sample of the rows.

    This is still a WIP due to the parlous state of the DV360/CM CSV data
    formats in general.

    Arguments:
        data (io.BytesIO): sample of the CSV file

    Returns:
        Tuple[List[str], List[types.CellType]]: tuple of the list of header
            names and the list of column types
    """
    table_set = messytables.CSVTableSet(data)
    row_set = table_set.tables[0]
    offset, csv_headers = messytables.headers_guess(row_set.sample)
    row_set.register_processor(messytables.headers_processor(csv_headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    csv_types = messytables.type_guess(row_set.sample, strict=True)
    return (csv_headers, csv_types)
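A minimal driver for the function above, sketched for illustration: the in-memory CSV literal and the printed output are assumptions, not part of the original source.

import io

# Hypothetical usage: build a small in-memory CSV and inspect the guesses.
sample = io.BytesIO(b"campaign_id,campaign,spend\n"
                    b"1001,Spring Launch,12.50\n"
                    b"1002,Autumn Push,7.25\n")
headers, column_types = get_column_types(sample)
print(headers)       # ['campaign_id', 'campaign', 'spend']
print(column_types)  # e.g. [Integer, String, Decimal] cell types from messytables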
def convert(self):
    table_set = CSVTableSet.from_fileobj(self.stream)
    row_set = table_set.tables.pop()
    offset, headers = headers_guess(row_set.sample)

    fields = []
    dup_columns = {}
    noname_count = 1
    for index, field in enumerate(headers):
        field_dict = {}
        if "" == field:
            field = '_'.join(['column', str(noname_count)])
            headers[index] = field
            noname_count += 1
        if headers.count(field) == 1:
            field_dict['id'] = field
        else:
            dup_columns[field] = dup_columns.get(field, 0) + 1
            field_dict['id'] = u'_'.join([field, str(dup_columns[field])])
        fields.append(field_dict)

    row_set.register_processor(headers_processor([x['id'] for x in fields]))
    row_set.register_processor(offset_processor(offset + 1))

    result = []
    for row in row_set:
        # build a fresh dict per row so appended rows don't alias each other
        data_row = {}
        for index, cell in enumerate(row):
            data_row[cell.column] = cell.value
        result.append(data_row)
    return fields, result
def main(basic_config_file, batch_config_file):
    with open(basic_config_file, "r") as f:
        base_settings = yaml.load(f)

    if batch_config_file:
        # RUN MANY
        # parse csv into a list of settings-dicts
        import messytables
        with open(batch_config_file, "rb") as f:
            row_set = messytables.CSVRowSet("", f)
            offset, headers = messytables.headers_guess(row_set.sample)
            row_set.register_processor(messytables.headers_processor(headers))
            row_set.register_processor(messytables.offset_processor(offset + 1))
            types = messytables.type_guess(row_set.sample, strict=True)
            row_set.register_processor(messytables.types_processor(types))
            settings_list = row_set.dicts()
            name = batch_config_file.replace(".csv", "")
            run_many(settings_list, name, base_settings=base_settings)
    else:
        # RUN ONE
        # parse yaml into a settings-dict
        settings_file = os.path.join(base_settings["out_dir"], "settings.yml")
        with open(settings_file, "w") as f:
            yaml.dump(base_settings, f)
        training_log, exit_status = run_one(**base_settings)
        training_log_file = os.path.join(base_settings["out_dir"],
                                         "training_log.csv")
        training_log.to_csv(training_log_file)
        stats = compute_final_stats(training_log)
        stats["exit_status"] = exit_status
        training_stats_file = os.path.join(base_settings["out_dir"],
                                           "training_stats.yml")
        with open(training_stats_file, "w") as f:
            yaml.dump(stats, f)
def get_column_types(data: io.BytesIO) -> Tuple[List[str], List[str]]:
    """derive the column types

    Using messytables' CSV API, attempt to derive the column types based on
    a best-guess of a sample of the rows.

    This is still a WIP due to the parlous state of the DV360/CM CSV data
    formats in general.

    Arguments:
        data {io.BytesIO} -- sample of the CSV file

    Returns:
        (List[str], List[str]) -- tuple of list of header names and list of
            column types
    """
    table_set = CSVTableSet(data)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    logging.info(headers)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    logging.info(types)
    return (headers, types)
def generate_table(self, document, meta, sheet, row_set):
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    tabular = self.create_tabular(sheet, row_set.name)
    columns = [tabular.add_column(h) for h in headers]
    if not len(columns):
        return

    def generate_rows():
        for i, row in enumerate(row_set):
            record = {}
            try:
                for cell, column in zip(row, columns):
                    record[column.name] = string_value(cell.value)
                if len(record):
                    for column in columns:
                        record[column.name] = record.get(column.name, None)
                    yield record
            except Exception as exception:
                log.warning("Could not decode row %s in %s: %s",
                            i, meta, exception)

    document.insert_records(sheet, generate_rows())
    return tabular
def main(argv=None):
    args = parse_args(argv)

    if args.file is None:
        # slurp the whole input since there seems to be a bug in messytables
        # which should be able to handle streams but doesn't
        args.file = cStringIO.StringIO(sys.stdin.read())

    relation_key = args_to_relation_key(args)

    table_set = any_tableset(args.file)
    if len(table_set.tables) != 1:
        raise ValueError("Can only handle files with a single table, not %s"
                         % len(table_set.tables))

    row_set = table_set.tables[0]

    # guess header names and the offset of the header:
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(strip_processor())
    row_set.register_processor(headers_processor(headers))
    # Temporarily, mark the offset of the header
    row_set.register_processor(offset_processor(offset + 1))

    # guess types and register them
    types = type_guess(replace_empty_string(row_set.sample), strict=True,
                       types=[StringType, DecimalType, IntegerType])
    row_set.register_processor(types_processor(types))

    # Messytables seems to not handle the case where there are no headers.
    # Work around this as follows:
    # 1) offset must be 0
    # 2) if the types of the data match the headers, assume there are
    #    actually no headers
    if offset == 0:
        try:
            [t.cast(v) for (t, v) in zip(types, headers)]
        except:
            pass
        else:
            # We don't need the headers_processor or the offset_processor
            row_set._processors = []
            row_set.register_processor(strip_processor())
            row_set.register_processor(types_processor(types))
            headers = None

    # Construct the Myria schema
    schema = messy_to_schema(types, headers)
    logging.info("Myria schema: {}".format(json.dumps(schema)))

    # Prepare data for writing to Myria
    data, kwargs = write_data(row_set, schema)

    if not args.dry:
        # Connect to Myria and send the data
        connection = myria.MyriaConnection(hostname=args.hostname,
                                           port=args.port, ssl=args.ssl)
        ret = connection.upload_file(relation_key, schema, data,
                                     args.overwrite, **kwargs)
        sys.stdout.write(pretty_json(ret))
    else:
        sys.stdout.write(data)
def lines(self):
    fh = urlopen(self.source.url)
    row_set = CSVRowSet('data', fh, window=3)
    headers = list(row_set.sample)[0]
    headers = [c.value for c in headers]
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(1))
    for row in row_set:
        yield dict([(c.column, c.value) for c in row])
def parse(stream, excel_type='xls', sheet=1, guess_types=True, **kwargs):
    '''Parse Excel (xls or xlsx) to structured objects.

    :param excel_type: xls | xlsx
    :param sheet: index of sheet in spreadsheet to convert (starting from
        index = 1)
    '''
    sheet_number = int(sheet) - 1

    xlsclass = XLSTableSet
    if excel_type == 'xlsx':
        xlsclass = XLSXTableSet
    table_set = xlsclass.from_fileobj(stream)
    try:
        row_set = table_set.tables[sheet_number]
    except IndexError:
        raise Exception('This file does not have sheet number %d' %
                        (sheet_number + 1))
    offset, headers = headers_guess(row_set.sample)

    fields = []
    dup_columns = {}
    noname_count = 1
    if guess_types:
        guess_types = [StringType, IntegerType, FloatType, DecimalType,
                       DateUtilType]
        row_types = type_guess(row_set.sample, guess_types)
    for index, field in enumerate(headers):
        field_dict = {}
        if "" == field:
            field = '_'.join(['column', str(noname_count)])
            headers[index] = field
            noname_count += 1
        if headers.count(field) == 1:
            field_dict['id'] = field
        else:
            dup_columns[field] = dup_columns.get(field, 0) + 1
            field_dict['id'] = u'_'.join([field, str(dup_columns[field])])
        if guess_types:
            if isinstance(row_types[index], DateUtilType):
                field_dict['type'] = 'DateTime'
            else:
                field_dict['type'] = str(row_types[index])
        fields.append(field_dict)
    row_set.register_processor(headers_processor([x['id'] for x in fields]))
    row_set.register_processor(offset_processor(offset + 1))

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                data_row[cell.column] = cell.value
            yield data_row

    return row_iterator(), {'fields': fields}
def test_read_head_offset_excel(self):
    fh = horror_fobj("simple.xls")
    table_set = XLSTableSet(fh)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    assert_equal(offset, 0)
    row_set.register_processor(offset_processor(offset + 1))
    data = list(row_set.sample)
    assert_equal(int(data[0][1].value), 1)
    data = list(row_set)
    assert_equal(int(data[0][1].value), 1)
def test_read_encoded_characters_csv(self):
    fh = horror_fobj('characters.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    data = list(row_set)
    assert_equal(382, len(data))
    assert_equal(data[0][2].value, u'雲嘉南濱海國家風景區管理處')
    assert_equal(data[-1][2].value, u'沈光文紀念廳')
def test_read_head_offset_csv(self):
    fh = horror_fobj('simple.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    assert_equal(offset, 0)
    row_set.register_processor(offset_processor(offset + 1))
    data = list(row_set.sample)
    assert_equal(int(data[0][1].value), 1)
    data = list(row_set)
    assert_equal(int(data[0][1].value), 1)
def connect(self, host=None, port=None, database=None, username=None,
            password=None, file=None):
    # TODO: mysql, pymssql, csv, sqlite3, pymongo, cx_Oracle
    self.database = database
    conn_string = ''

    if self.engine == 'psycopg2':
        if database:
            conn_string += "dbname='%s' " % database
        if username:
            conn_string += "user='%s' " % username
        if host:
            conn_string += "host='%s' " % host
        if port:
            conn_string += "port='%s' " % port
        if password:
            conn_string += "password='%s' " % password
        self.conn = psycopg2.connect(conn_string)
    elif self.engine == 'pymssql':
        self.conn = pymssql.connect(host, username, password, database,
                                    port=port, as_dict=True,
                                    charset='LATIN1')
    elif self.engine == 'csv':
        # https://messytables.readthedocs.io/en/latest/
        fh = StringIO.StringIO(self.data)
        # dialect = csv.Sniffer().sniff(f.read(1024))
        # f.seek(0)
        # self.conn = csv.DictReader(f, dialect=dialect)
        # fh = open('messy.csv', 'rb')

        # Load a file object:
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))
        types = type_guess(row_set.sample, strict=True)
        row_set.register_processor(types_processor(types))
        self.conn = row_set

    return self.conn
def proc(f, database_name, table_name):
    table_set = messytables.any_tableset(f)
    row_set = table_set.tables[0]

    # guess header names and the offset of the header:
    offset, headers = messytables.headers_guess(row_set.sample)
    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=[
        messytables.types.StringType,
        messytables.types.DateType,
    ], strict=True)

    hive_data_file = tempfile.NamedTemporaryFile(mode='w')
    fields_ddl = ','.join([
        ' {0} {1}\n'.format(
            canonicalize_column_name(colName),
            hive_column_type(colType)
        )
        for colName, colType in zip(headers, types)
    ])
    hive_sql = '''
DROP TABLE IF EXISTS {0};
CREATE TABLE {0} (
{1}
) STORED AS TEXTFILE
TBLPROPERTIES ("comment"="add_messytable on {3}");

LOAD DATA LOCAL INPATH '{2}' OVERWRITE INTO TABLE {0};
'''.format(table_name, fields_ddl, hive_data_file.name,
           datetime.datetime.now().isoformat())

    hive_cmd_file = tempfile.NamedTemporaryFile(mode='w')
    print(hive_sql, file=hive_cmd_file)
    hive_cmd_file.flush()

    row_set.register_processor(messytables.types_processor(types))

    for row in row_set:
        print('\001'.join(map(str, [c.value for c in row])),
              file=hive_data_file)
    hive_data_file.flush()

    subprocess.call([
        'hive',
        '--database', database_name,
        '-f', hive_cmd_file.name,
    ])
def test_read_head_padding_csv(self):
    fh = horror_fobj('weird_head_padding.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    assert 11 == len(headers), headers
    assert_equal('1985', headers[1].strip())
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    data = list(row_set.sample)
    for row in row_set:
        assert_equal(11, len(row))
    value = data[1][0].value.strip()
    assert value == u'Gefäßchirurgie', value
def csvParse(csv_file_path):
    fh = open(csv_file_path, 'rb')

    # Load a file object:
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]

    # guess header names and the offset of the header:
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))

    # add one to begin with content, not the header:
    row_set.register_processor(offset_processor(offset + 1))

    # guess column types:
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))
    return row_set, headers, offset, types
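A brief usage sketch for csvParse; the file name is a placeholder assumption.

# Hypothetical call: iterate the typed rows that csvParse prepares.
row_set, headers, offset, types = csvParse('example.csv')  # placeholder path
print(headers, types)
for row in row_set:
    # cells arrive already cast by the registered types_processor
    print([cell.value for cell in row])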
def get_diff(self, filename1, filename2):
    # print("get_diff", filename1, filename2)
    ext = filename1.split(".")[-1].lower()
    if ext not in ['csv', 'tsv', 'xls']:
        return None

    csvs = {}
    for f in [filename1, filename2]:
        # print("Loading file", f)
        table_set = self.read_file(f)
        if table_set is None:
            raise Exception("Invalid table set")

        row_set = table_set.tables[0]
        # print("Guessing headers")
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))

        # Output of rowset is a structure
        csvs[f] = [headers]
        for row in row_set:
            csvs[f].append([r.value for r in row])
        # print(csvs[f][:3])

    # Loaded csv1 and csv2
    table1 = daff.PythonTableView(csvs[filename1])
    table2 = daff.PythonTableView(csvs[filename2])

    alignment = daff.Coopy.compareTables(table1, table2).align()
    # print("Achieved alignment")

    data_diff = []
    table_diff = daff.PythonTableView(data_diff)
    flags = daff.CompareFlags()
    highlighter = daff.TableDiff(alignment, flags)
    highlighter.hilite(table_diff)

    # Parse the differences
    # print("Parsing diff")
    diff = self.parse_diff(table_diff)
    # print("Computed diff", diff)
    return diff
def load_data(config):
    if 'url' not in config:
        yield {config.get('field'): config.get('value')}
        return

    fh = urlopen(config.get('url'))
    table_set = CSVTableSet.from_fileobj(fh)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    for row in row_set:
        row = [(c.column, c.value) for c in row]
        yield dict(row)
    fh.close()
def test_guess_headers(self):
    fh = horror_fobj('weird_head_padding.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    data = list(row_set)
    assert 'Frauenheilkunde' in data[9][0].value, data[9][0].value

    fh = horror_fobj('weird_head_padding.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    row_set.register_processor(headers_processor(['foo', 'bar']))
    data = list(row_set)
    assert 'foo' in data[12][0].column, data[12][0]
    assert 'Chirurgie' in data[12][0].value, data[12][0].value
def csvimport_table(name):
    from messytables import CSVTableSet, type_guess
    from messytables import types_processor, headers_guess
    from messytables import headers_processor, offset_processor
    from spendb.etl.extract import parse_table

    row_set = CSVTableSet(data_fixture(name)).tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    rows = []
    for num_rows, (fields, row, samples) in enumerate(parse_table(row_set)):
        rows.append(row)

    return fields, rows
def prepare_csv_rows(csv_file):
    row_set = CSVTableSet(csv_file).tables[0]

    offset, headers = headers_guess(row_set.sample)
    headers = [convert_header_to_column_name(header)
               for header in (h for h in headers if h)]

    row_set.register_processor(headers_processor_remove_blank(headers))
    row_set.register_processor(offset_processor(offset + 1))

    DateType.formats = create_date_formats(day_first=False)
    # We never want boolean types, so remove them from the default list
    eligible_types = [StringType, DecimalType, IntegerType, DateType]
    types = type_guess(row_set.sample, types=eligible_types, strict=True)
    row_set.register_processor(types_processor(types))

    return row_set
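A minimal sketch of consuming the prepared row set, assuming the helper functions (convert_header_to_column_name, headers_processor_remove_blank, create_date_formats) are defined elsewhere in the same module; the file name is a placeholder.

# Hypothetical driver: stream typed records out of the prepared row set.
with open('upload.csv', 'rb') as csv_file:    # 'upload.csv' is a placeholder
    row_set = prepare_csv_rows(csv_file)
    for record in row_set.dicts():            # messytables yields dicts keyed by header
        print(record)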
def parse_data(input):
    fh = open(input, 'rb')
    try:
        table_set = messytables.any_tableset(fh)
    except messytables.ReadError as e:
        print(e)
        raise  # without a table set we cannot continue

    get_row_set = lambda table_set: table_set.tables.pop()
    row_set = get_row_set(table_set)
    offset, headers = messytables.headers_guess(row_set.sample)
    # Some headers might have been converted from strings to floats and such.
    headers = [str(header) for header in headers]
    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)
    row_set.register_processor(messytables.types_processor(types))

    headers = [header.strip() for header in headers if header.strip()]
    headers_set = set(headers)

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                column_name = cell.column.strip()
                if column_name not in headers_set:
                    continue
                data_row[column_name] = cell.value
            yield data_row

    result = row_iterator()
    headers_dicts = [
        dict(id=field[0], type=TYPE_MAPPING[str(field[1])])
        for field in zip(headers, types)
    ]
    print('Determined headers and types: {headers}'.format(
        headers=headers_dicts))
    return headers_dicts, result
def parse_table(source):
    # This is a work-around because messytables hangs on boto file
    # handles, so we're doing it via plain old HTTP.
    # We're also passing in an extended window size to give more
    # reliable type detection.
    # Because Python's CSV dialect sniffer isn't the best, this also
    # constrains the field quoting character to a double quote.
    table_set = mt.any_tableset(source.fh(),
                                extension=source.meta.get('extension'),
                                mimetype=source.meta.get('mime_type'),
                                quotechar='"', window=20000)
    tables = list(table_set.tables)
    if not len(tables):
        log.error("No tables were found in the source file.")
        return

    row_set = tables[0]
    headers = [c.value for c in next(row_set.sample)]
    row_set.register_processor(mt.headers_processor(headers))
    row_set.register_processor(mt.offset_processor(1))
    types = mt.type_guess(row_set.sample, strict=True)
    row_set.register_processor(mt.types_processor(types, strict=True))

    fields, i = {}, 0
    row_iter = iter(row_set)

    while True:
        i += 1
        try:
            row = row_iter.next()
            if not len(fields):
                fields = generate_field_spec(row)
            data = convert_row(row, fields, i)
            check_empty = set(data.values())
            if None in check_empty and len(check_empty) == 1:
                continue
            yield None, fields, data
        except StopIteration:
            return
        except Exception, e:
            # log.exception(e)
            yield e, fields, None
def _get_table_columns(self, csv_file_path: str) -> zip:
    """
    Read the csv file and try to guess the type of each column using the
    messytables library. The type can be 'Integer', 'Decimal', 'String'
    or 'Bool'.

    :param csv_file_path: path to the csv file with content in it
    :return: a zip object where each tuple has two elements:
        the first is the column name and the second is the type
    """
    with gzip.open(csv_file_path, 'rb') as f:
        table_set = CSVTableSet(f)
        row_set = table_set.tables[0]
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))
        types = list(map(jts.celltype_as_string,
                         type_guess(row_set.sample, strict=True)))
        return zip(headers, types)
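A short, hypothetical illustration of consuming the returned zip object; the object name and the gzipped file path are placeholders, and the printed pairs are examples of what jts.celltype_as_string typically produces.

# Hypothetical usage: materialize the (column, type) pairs.
columns = loader._get_table_columns('/tmp/export.csv.gz')  # placeholder object and path
for name, col_type in columns:
    print(name, col_type)  # e.g. ('id', 'integer'), ('name', 'string')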
def resource_row_set(package, resource):
    """ Generate an iterator over all the rows in this resource's
    source data. """
    # This is a work-around because messytables hangs on boto file
    # handles, so we're doing it via plain old HTTP.
    table_set = any_tableset(resource.fh(),
                             extension=resource.meta.get('extension'),
                             mimetype=resource.meta.get('mime_type'))
    tables = list(table_set.tables)
    if not len(tables):
        log.error("No tables were found in the source file.")
        return

    row_set = tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))
    return row_set
def convert(self):
    xlsclass = XLSTableSet
    if 'xlsx' == self.excel_type:
        xlsclass = XLSXTableSet
    table_set = xlsclass.from_fileobj(self.stream)
    try:
        row_set = table_set.tables[self.sheet_number]
    except IndexError:
        raise Exception('This file does not have worksheet number %d' %
                        (self.sheet_number + 1))

    offset, headers = headers_guess(row_set.sample)

    fields = []
    dup_columns = {}
    noname_count = 1
    for index, field in enumerate(headers):
        field_dict = {}
        if "" == field:
            field = '_'.join(['column', str(noname_count)])
            headers[index] = field
            noname_count += 1
        if headers.count(field) == 1:
            field_dict['id'] = field
        else:
            dup_columns[field] = dup_columns.get(field, 0) + 1
            field_dict['id'] = u'_'.join([field, str(dup_columns[field])])
        fields.append(field_dict)

    row_set.register_processor(headers_processor([x['id'] for x in fields]))
    row_set.register_processor(offset_processor(offset + 1))

    result = []
    for row in row_set:
        # build a fresh dict per row so appended rows don't alias each other
        info = {}
        for index, cell in enumerate(row):
            if isinstance(cell.value, datetime):
                info[cell.column] = cell.value.isoformat()
            else:
                info[cell.column] = cell.value
        result.append(info)
    return fields, result
def _guess_csv_datatype(fh):
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    logger.info("(offset, headers) = ({}, {})".format(offset, headers))

    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    counter = 0
    for row in row_set:
        logger.info(row)
        counter += 1
        if counter >= 32:
            break

    d = {h: t for h, t in zip(headers, types)}
    logger.info(d)
    return d
def generate_table(self, document, sheet, row_set):
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    tabular = self.create_tabular(sheet, row_set.name)
    columns = [tabular.add_column(h) for h in headers]
    if not len(columns):
        return

    def generate_rows():
        for row in row_set:
            record = {}
            for cell, column in zip(row, columns):
                record[column.name] = string_value(cell.value)
            if len(record):
                for column in columns:
                    record[column.name] = record.get(column.name, None)
                yield record

    document.insert_records(sheet, generate_rows())
    return tabular
def generate_schema(samples: List[Dict], table_spec: Dict) -> Dict:
    """
    Guess column types from the given samples and build a json schema

    :param samples: List of dictionaries containing sample data from
        csv file(s)
    :param table_spec: table/stream specs given in the tap definition
    :return: dictionary where the keys are the headers and values are the
        guessed types - compatible with json schema
    """
    schema = {}

    table_set = CSVTableSet(_csv2bytesio(samples))
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)

    for header, header_type in zip(headers, types):
        date_overrides = set(table_spec.get('date_overrides', []))

        if header in date_overrides:
            schema[header] = {'type': ['null', 'string'],
                              'format': 'date-time'}
        else:
            if isinstance(header_type, IntegerType):
                schema[header] = {'type': ['null', 'integer']}
            elif isinstance(header_type, DecimalType):
                schema[header] = {'type': ['null', 'number']}
            else:
                schema[header] = {'type': ['null', 'string']}
    return schema
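A hedged input/output sketch for generate_schema; the sample records are invented, and the behavior of the private _csv2bytesio helper (serializing the dicts back to CSV bytes) is an assumption.

# Hypothetical input/output sketch.
samples = [
    {'id': '1', 'price': '9.99', 'created_at': '2021-01-01T00:00:00Z'},
    {'id': '2', 'price': '12.50', 'created_at': '2021-02-01T00:00:00Z'},
]
schema = generate_schema(samples, {'date_overrides': ['created_at']})
# Expected shape (assuming strict type guessing succeeds):
# {'id': {'type': ['null', 'integer']},
#  'price': {'type': ['null', 'number']},
#  'created_at': {'type': ['null', 'string'], 'format': 'date-time'}}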
def lines(self):
    fh = urlopen(self.source.url)
    row_set = CSVRowSet('data', fh, window=3)
    headers = list(row_set.sample)[0]
    headers = [c.value for c in headers]
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(1))
    for row in row_set:
        row_dict = dict([(c.column, c.value) for c in row])

        # Rename id to row_id
        row_dict['row_id'] = row_dict.pop('id')

        # Set time as empty string to use the default value
        row_dict['time'] = ''

        # Transform COFOG field into six fields with code and label as
        # the same value
        cofog = row_dict.pop('cofog', None)
        if cofog:
            row_dict['cofog1code'] = self.cofog_code(cofog, level=1)
            row_dict['cofog1label'] = self.cofog_code(cofog, level=1)
            row_dict['cofog2code'] = self.cofog_code(cofog, level=2)
            row_dict['cofog2label'] = self.cofog_code(cofog, level=2)
            row_dict['cofog3code'] = self.cofog_code(cofog, level=3)
            row_dict['cofog3label'] = self.cofog_code(cofog, level=3)

        # Transform gfsm expense field into three fields
        gfsmexpense = row_dict.pop('gfsmexpense', None)
        if gfsmexpense:
            row_dict['gfsmexpense1'] = self.gfsm_code(gfsmexpense, level=1)
            row_dict['gfsmexpense2'] = self.gfsm_code(gfsmexpense, level=2)
            row_dict['gfsmexpense3'] = self.gfsm_code(gfsmexpense, level=3)

        # Transform gfsm revenue field into three fields
        gfsmrevenue = row_dict.pop('gfsmrevenue', None)
        if gfsmrevenue:
            row_dict['gfsmrevenue1'] = self.gfsm_code(gfsmrevenue, level=1)
            row_dict['gfsmrevenue2'] = self.gfsm_code(gfsmrevenue, level=2)
            row_dict['gfsmrevenue3'] = self.gfsm_code(gfsmrevenue, level=3)

        yield row_dict
def _get_table_columns(self, csv_file_path: str) -> zip:
    """
    Read the csv file and derive the type of each column. The type can be
    'integer' or 'string'.

    :param csv_file_path: path to the csv file with content in it
    :return: a zip object where each tuple has two elements:
        the first is the column name and the second is the type
    """
    with gzip.open(csv_file_path, 'rb') as csvfile:
        table_set = CSVTableSet(csvfile, window=1)
        row_set = table_set.tables[0]
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))

        types = [
            'integer' if header == S3Helper.SDC_SOURCE_LINENO_COLUMN
            else 'string'
            for header in headers
        ]
        return zip(headers, types)
def headersDataTypes(CSV):
    '''Get column headers and data types using messytables'''
    table = open(path[0] + CSV, 'rb')
    # Creates a set of tables as file object, although it'll just be one
    tableset = messytables.CSVTableSet(table)
    rowset = tableset.tables[0]  # get first and only table as iterator
    # guesses header names and offset of header, returns headers as list
    offset, headers = messytables.headers_guess(rowset.sample)
    print "Here is the offset", str(offset), "\nHere are the headers:\n", \
        str(headers)  # test
    # establish headers in table
    rowset.register_processor(messytables.headers_processor(headers))
    # begin iterator at content, rather than header
    rowset.register_processor(messytables.offset_processor(offset + 1))
    # guess column types, return as list
    types = messytables.type_guess(rowset.sample, strict=True)
    print "Here are the data types", str(types)
    # empty dictionary to append columns and datatypes, needed
    # for pandas csv to dataframe conversion
    dtypedict = {}
    colcount = 0  # location to append datatypes to match columns in dict
    for column in types:
        dtypedict[headers[colcount]] = column
        colcount += 1
    return headers, dtypedict
def parse_table(row_set, save_func):
    num_rows = 0
    fields = {}
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    for i, row in enumerate(row_set):
        if not len(fields):
            fields = generate_field_spec(row)

        data = {}
        for cell, field in zip(row, fields):
            value = cell.value
            if isinstance(value, datetime):
                value = value.date()
            if isinstance(value, Decimal):
                # Baby jesus forgive me.
                value = float(value)
            if isinstance(value, basestring) and not len(value.strip()):
                value = None
            data[field['name']] = value
            random_sample(value, field, i)

        check_empty = set(data.values())
        if None in check_empty and len(check_empty) == 1:
            continue

        save_func(data)
        num_rows = i

    fields = {f.get('name'): f for f in fields}
    return num_rows, fields
def determine_messytables_types(file_handle, types=messytables.types.TYPES):
    """
    :param file_handle: file handle opened in binary mode
    :return: (headers, types, row_set)
    """
    # Load a file object:
    table_set = messytables.CSVTableSet(file_handle)

    # If you aren't sure what kind of file it is
    # table_set = messytables.any_tableset(file_handle)

    # A table set is a collection of tables:
    row_set = table_set.tables[0]

    # A row set is an iterator over the table, but it can only
    # be run once. To peek, a sample is provided:
    print(next(row_set.sample))

    # guess header names and the offset of the header:
    offset, headers = messytables.headers_guess(row_set.sample)
    row_set.register_processor(messytables.headers_processor(headers))

    # add one to begin with content, not the header:
    row_set.register_processor(messytables.offset_processor(offset + 1))

    # guess column types:
    types = messytables.type_guess(row_set.sample, types, strict=True)

    # and tell the row set to apply these types to
    # each row when traversing the iterator:
    row_set.register_processor(messytables.types_processor(types))

    # now run some operation on the data:
    return headers, types, row_set
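A short usage sketch for the function above, assuming a local CSV file; the path is a placeholder.

# Hypothetical driver for determine_messytables_types.
with open('data.csv', 'rb') as fh:   # 'data.csv' is a placeholder
    headers, types, row_set = determine_messytables_types(fh)
    for row in row_set:              # cells arrive already cast by types_processor
        print([cell.value for cell in row])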
def ku_openlearning(self, filename, source_id):
    CATEGORY_MAPPING = {
        'Assessment of learning': 2298,  # Assessment
        'Finance': 2235,
        'Public Service': 'Criminal Justice',
        'Health Science': 'Health Sciences',
        'Management': 2248,
        'Online Instruction': 'Hybrid and Online Course Development',
        'Early Childhood': ['Career Counseling and Services',
                            'Childhood and Adolescence'],
        'Law, Legal': 'Law',
        'Psychology': 'Psychology',
        'Customer Service': 2246,
        'Communications': 'Communications',
        'Professionalism': 'Personal Development'
    }
    source = Source.objects.get(pk=source_id)
    fh = open(filename, 'rb')
    table_set = XLSTableSet(fh)

    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))

    for row in row_set:
        url = row[0].value
        title = row[1].value
        description = row[2].value
        # language = row[4].value
        # material_type = row[5].value
        license = row[6].value
        categories = row[7].value
        keywords = row[8].value
        # audience = row[9].value

        course, is_created = Course.objects.get_or_create(
            linkurl=url,
            provider=source.provider,
            source=source,
            defaults={
                'title': title,
                'description': description,
                'tags': keywords,
                'language': 'English',
                'license': license,
                'content_medium': 'text',
                'creative_commons': 'Yes',
                'creative_commons_commercial': 'No',
                'creative_commons_derivatives': 'No'
            }
        )

        merlot_cat = CATEGORY_MAPPING[categories]
        if type(merlot_cat) != list:
            merlot_cat = [merlot_cat, ]

        for item in merlot_cat:
            try:
                m = MerlotCategory.objects.get(merlot_id=item)
                course.merlot_categories.add(m)
            except ValueError:
                m = MerlotCategory.objects.get(name=item)
                course.merlot_categories.add(m)
try:
    table_set = messytables.any_tableset(f, mimetype=ct, extension=ct)
except messytables.ReadError as e:
    ## try again with format
    f.seek(0)
    try:
        format = resource.get('format')
        table_set = messytables.any_tableset(f, mimetype=format,
                                             extension=format)
    except:
        raise util.JobError(e)

row_set = table_set.tables.pop()
offset, headers = messytables.headers_guess(row_set.sample)
row_set.register_processor(messytables.headers_processor(headers))
row_set.register_processor(messytables.offset_processor(offset + 1))
types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)
row_set.register_processor(messytables.types_processor(types))

headers = [header.strip() for header in headers if header.strip()]
headers_set = set(headers)


def row_iterator():
    for row in row_set:
        data_row = {}
        for index, cell in enumerate(row):
            column_name = cell.column.strip()
            if column_name not in headers_set:
                continue
            data_row[column_name] = cell.value
        yield data_row
def push_to_datastore(self, context, resource):
    # Get the resource's content hash, which is used to check whether the
    # resource file has changed since last time.
    hash_dict = resource.get('hash')
    if hash_dict:
        original_content_hash = json.loads(hash_dict)['content']
        check_hash = not self.options.force
    else:
        # This resource has no hash yet, it must be a new resource.
        original_content_hash = ''
        check_hash = False

    try:
        result = fetch_resource.download(context, resource,
                                         self.max_content_length,
                                         DATA_FORMATS,
                                         check_modified=check_hash)
    except fetch_resource.ResourceNotModified as e:
        logger.info(
            u'Skipping unmodified resource: {0}'.format(resource['url'])
        )
        return {'success': True, 'resource': resource['id'], 'error': None}
    except Exception as e:
        logger.exception(e)
        return {'success': False, 'resource': resource['id'],
                'error': 'Could not download resource'}

    if check_hash and (result['hash'] == original_content_hash):
        logger.info(
            u'Skipping unmodified resource: {0}'.format(resource['url'])
        )
        os.remove(result['saved_file'])
        return {'success': True, 'resource': resource['id'], 'error': None}

    content_type = result['headers'].get('content-type', '')\
        .split(';', 1)[0]  # remove parameters

    f = open(result['saved_file'], 'rb')

    try:
        table_sets = any_tableset(
            f,
            mimetype=content_type,
            extension=resource['format'].lower()
        )

        # only first sheet in xls for time being
        row_set = table_sets.tables[0]
        offset, headers = headers_guess(row_set.sample)
    except Exception as e:
        logger.exception(e)
        os.remove(result['saved_file'])
        return {'success': False, 'resource': resource['id'],
                'error': 'Error parsing the resource'}

    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    logger.info('Header offset: {0}.'.format(offset))

    guessed_types = type_guess(
        row_set.sample,
        [
            messytables.types.StringType,
            messytables.types.IntegerType,
            messytables.types.FloatType,
            messytables.types.DecimalType,
            messytables.types.DateUtilType
        ],
        strict=True
    )
    logger.info('Guessed types: {0}'.format(guessed_types))
    row_set.register_processor(types_processor(guessed_types, strict=True))
    row_set.register_processor(stringify_processor())

    guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

    def send_request(data):
        data_dict = {
            'resource_id': resource['id'],
            'fields': [dict(id=name, type=typename)
                       for name, typename in zip(headers,
                                                 guessed_type_names)],
            'records': data,
            'force': True,
        }
        response = toolkit.get_action('datastore_create')(
            context, data_dict
        )
        return response

    # Delete any existing data before proceeding. Otherwise
    # 'datastore_create' will append to the existing datastore. And if the
    # fields have significantly changed, it may also fail.
    logger.info('Trying to delete existing datastore for resource {0} '
                '(may not exist).'.format(resource['id']))
    try:
        toolkit.get_action('datastore_delete')(
            context, {'resource_id': resource['id'], 'force': True}
        )
    except toolkit.ObjectNotFound:
        logger.info('Datastore not found for resource {0}.'.format(
            resource['id']))
    except Exception as e:
        logger.exception(e)

    logger.info('Creating: {0}.'.format(resource['id']))

    # generates chunks of data that can be loaded into ckan
    # n is the maximum size of a chunk
    def chunky(iterable, n):
        it = iter(iterable)
        while True:
            chunk = list(
                itertools.imap(
                    dict, itertools.islice(it, n)))
            if not chunk:
                return
            yield chunk

    count = 0
    try:
        for data in chunky(row_set.dicts(), 100):
            count += len(data)
            send_request(data)
    except Exception as e:
        logger.exception(e)
        os.remove(result['saved_file'])
        return {'success': False, 'resource': resource['id'],
                'error': 'Error pushing data to datastore'}

    logger.info("There should be {n} entries in {res_id}.".format(
        n=count, res_id=resource['id']
    ))

    resource.update({
        'webstore_url': 'active',
        'webstore_last_updated': datetime.now().isoformat()
    })

    toolkit.get_action('resource_update')(context, resource)

    os.remove(result['saved_file'])
    return {'success': True, 'resource': resource['id'], 'error': None}
def detect_headers(row_set):
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    return headers
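A minimal sketch of calling detect_headers on a messytables CSV row set; the file name is a placeholder.

# Hypothetical usage with a messytables CSV row set.
from messytables import CSVTableSet

with open('input.csv', 'rb') as fh:      # 'input.csv' is a placeholder
    row_set = CSVTableSet(fh).tables[0]
    headers = detect_headers(row_set)
    print(headers)                       # header names guessed from the sample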
# Uses the Messytables example (https://messytables.readthedocs.io/en/latest/#example)
# to extract from a CSV flat file the BIGQUERY JSON metadata required
# for importing a table into BIGQUERY.
# Example: python csv_to_json_import_bq.py the_csv_file.csv

from messytables import CSVTableSet, type_guess, \
    types_processor, headers_guess, headers_processor, \
    offset_processor, any_tableset
import sys

fh = open(sys.argv[1], 'rb')
table_set = CSVTableSet(fh)
row_set = table_set.tables[0]
offset, headers = headers_guess(row_set.sample)
row_set.register_processor(headers_processor(headers))
row_set.register_processor(offset_processor(offset + 1))
types = type_guess(row_set.sample, strict=True)

# Build the BigQuery schema JSON, mapping messytables date and decimal
# guesses to BigQuery TIMESTAMP and FLOAT column types.
output = "[\n"
for i in range(len(headers)):
    if "DATE" in str(types[i]).upper():
        types[i] = "TIMESTAMP"
    elif "DECIMAL" in str(types[i]).upper():
        types[i] = "FLOAT"
    output += "{\"name\":\"" + str(headers[i]).lower() + \
        "\", \"type\":\"" + str(types[i]).upper() + "\"}"
    if i == (len(headers) - 1):
        output += "\n]"
    else:
        output += ",\n"
print output
def parse(stream, guess_types=True, **kwargs):
    '''Parse CSV file and return row iterator plus metadata (fields etc).

    Additional CSV arguments as per
    http://docs.python.org/2/library/csv.html#csv-fmt-params

    :param delimiter:
    :param quotechar:
    :param window: the size of the sample used for analysis

    There is also support for:

    :param encoding: file encoding (will be guessed with chardet if not
        provided)

    You can process csv as well as tsv files using this function. For tsv
    just pass::

        delimiter='\t'
    '''
    metadata = dict(**kwargs)
    delimiter = metadata.get('delimiter', None)
    quotechar = metadata.get('quotechar', None)
    window = metadata.get('window', None)
    encoding = metadata.get('encoding', None)
    table_set = CSVTableSet.from_fileobj(stream, delimiter=delimiter,
                                         quotechar=quotechar,
                                         encoding=encoding,
                                         window=window)
    row_set = table_set.tables.pop()
    offset, headers = headers_guess(row_set.sample)

    fields = []
    dup_columns = {}
    noname_count = 1
    if guess_types:
        guessable_types = [StringType, IntegerType, FloatType, DecimalType,
                           DateUtilType]
        row_types = type_guess(row_set.sample, guessable_types)
    for index, field in enumerate(headers):
        field_dict = {}
        if "" == field:
            field = '_'.join(['column', unicode(noname_count)])
            headers[index] = field
            noname_count += 1
        if headers.count(field) == 1:
            field_dict['id'] = field
        else:
            dup_columns[field] = dup_columns.get(field, 0) + 1
            field_dict['id'] = u'_'.join([field, unicode(dup_columns[field])])
        if guess_types:
            if isinstance(row_types[index], DateUtilType):
                field_dict['type'] = 'DateTime'
            else:
                field_dict['type'] = str(row_types[index])
        fields.append(field_dict)
    row_set.register_processor(headers_processor([x['id'] for x in fields]))
    row_set.register_processor(offset_processor(offset + 1))
    if guess_types:
        row_set.register_processor(types_processor(row_types))

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                data_row[cell.column] = cell.value
            yield data_row

    return row_iterator(), {'fields': fields}