def test_strict_type_guessing_with_large_file(self):
     """Strict type guessing over the 96 columns of the ``211.csv`` fixture.

     The header row is skipped via an offset processor before guessing,
     and the guessed types are compared against the expected per-column
     types.
     """
     table = CSVTableSet(horror_fobj('211.csv')).tables[0]
     header_offset, _header_names = headers_guess(table.sample)
     table.register_processor(offset_processor(header_offset + 1))
     candidates = [StringType, IntegerType, DecimalType, DateUtilType]
     guessed = type_guess(table.sample, candidates, True)
     assert_equal(len(guessed), 96)

     # Every column is expected to be a string except at these positions.
     non_string_positions = {
         IntegerType: (0, 6, 27, 54, 70),
         DecimalType: (29, 30),
         DateUtilType: (89, 90, 91, 92),
     }
     expected = [StringType() for _ in range(96)]
     for cell_type, positions in non_string_positions.items():
         for position in positions:
             expected[position] = cell_type()
     assert_equal(guessed, expected)
Beispiel #2
0
    def get_schema(self, filename):
        """
        Guess the schema of a tabular file using messytables.

        Returns a list of ``[header, type, sample value]`` entries, one per
        guessed header of the first table, where the type is the string form
        of the guessed column type and the sample value comes from the first
        row yielded by the row set's sample. Returns an empty list when
        ``read_file`` yields ``None``.
        """
        tables = self.read_file(filename)
        if tables is None:
            # The file could not be read
            return []

        # Only the first table is inspected
        first_table = tables.tables[0]

        header_offset, headers = headers_guess(first_table.sample)
        first_table.register_processor(headers_processor(headers))
        first_table.register_processor(offset_processor(header_offset + 1))
        column_types = type_guess(first_table.sample, strict=True)

        # One row of data to illustrate each column
        first_row = next(first_table.sample)

        def as_text(value):
            return value if isinstance(value, str) else str(value)

        return [
            [header,
             str(column_types[position]),
             as_text(first_row[position].value)]
            for position, header in enumerate(headers)
        ]
Beispiel #3
0
    def generate_table(self, meta, sheet, row_set):
        """Create an internal table for ``row_set`` and load its rows.

        Headers are guessed from the row set's sample, one column is added
        to a new ``TabularSchema`` per header, the backing table is dropped
        and re-created, and every non-empty row is loaded as a dict keyed by
        column name.

        :param meta: object providing ``content_hash`` for the schema.
        :param sheet: sheet identifier stored in the schema.
        :param row_set: messytables row set to load.
        :returns: the ``TabularSchema`` describing the created table.
        """
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))
        schema = TabularSchema({
            'sheet_name': row_set.name,
            'content_hash': meta.content_hash,
            'sheet': sheet
        })
        columns = [schema.add_column(h) for h in headers]
        log.info("Creating internal table: %s columns, table: %r", len(columns),
                 schema.table_name)
        tabular = Tabular(schema)
        tabular.drop()
        tabular.create()

        def generate_rows():
            # Start at zero so the final log line works for an empty row set,
            # and count from 1 so the number reported is the number of rows
            # read rather than the index of the last one.
            row_count = 0
            for row_count, row in enumerate(row_set, 1):
                record = {}
                for cell, column in zip(row, columns):
                    record[column.name] = string_value(cell.value)
                if record:
                    # Pad short rows so every record has every column.
                    for column in columns:
                        record.setdefault(column.name, None)
                    yield record
            log.info("Loaded %s rows.", row_count)

        tabular.load_iter(generate_rows())
        return schema
Beispiel #4
0
def rowset_as_jts(rowset, headers=None, types=None):
    ''' Build a JSON table schema describing a rowset.

    The ``headers`` and ``types`` arguments are not consulted: both values
    are always re-derived from ``rowset.sample``.
    '''
    _offset, guessed_headers = messytables.headers_guess(rowset.sample)
    guessed_types = messytables.type_guess(rowset.sample)
    type_names = map(celltype_as_string, guessed_types)

    return headers_and_typed_as_jts(guessed_headers, type_names)
Beispiel #5
0
 def test_file_with_few_strings_among_integers(self):
     """Strict type guessing over the 19 columns of ``mixedGLB.csv``.

     The header row is skipped via an offset processor before guessing.
     """
     fh = horror_fobj('mixedGLB.csv')
     rows = CSVTableSet(fh).tables[0]
     offset, headers = headers_guess(rows.sample)
     rows.register_processor(offset_processor(offset + 1))
     types = [StringType, IntegerType, DecimalType, DateUtilType]
     guessed_types = type_guess(rows.sample, types, True)
     assert_equal(len(guessed_types), 19)
     # Call form of print: valid on both Python 2 and Python 3, whereas the
     # statement form is a SyntaxError on Python 3.
     print(guessed_types)
     assert_equal(guessed_types, [
         IntegerType(),
         IntegerType(),
         IntegerType(),
         IntegerType(),
         IntegerType(),
         IntegerType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         IntegerType(),
         StringType(),
         StringType()
     ])
Beispiel #6
0
def get_column_types(data: io.BytesIO) \
        -> Tuple[List[str], List[types.CellType]]:
    """Derive the header names and column types of a CSV sample.

    Uses messytables' CSV API to guess the header row of the first table,
    skips past it, and then guesses each column's type (strict mode) from
    the row set's sample.

    This is still a WIP due to the parlous state of the DV360/CM CSV data
    formats in general.

    Arguments:
        data (io.BytesIO):  sample of the CSV file

    Returns:
        tuple of the guessed header names and the guessed column types, as
        returned by ``messytables.type_guess``
    """
    first_table = messytables.CSVTableSet(data).tables[0]
    header_offset, header_names = messytables.headers_guess(first_table.sample)
    first_table.register_processor(
        messytables.headers_processor(header_names))
    first_table.register_processor(
        messytables.offset_processor(header_offset + 1))
    column_types = messytables.type_guess(first_table.sample, strict=True)

    return header_names, column_types
Beispiel #7
0
def rowset_as_jts(rowset, headers=None, types=None):
    ''' Create a json table schema from a rowset.

    Note that the ``headers`` and ``types`` parameters are overwritten
    with values guessed from ``rowset.sample`` before being used.
    '''
    header_guess = messytables.headers_guess(rowset.sample)
    _, column_names = header_guess
    column_type_names = map(
        celltype_as_string,
        messytables.type_guess(rowset.sample),
    )

    return headers_and_typed_as_jts(column_names, column_type_names)
Beispiel #8
0
    def generate_table(self, document, meta, sheet, row_set):
        """Register the columns of ``row_set`` and insert its rows.

        Headers are guessed from the row set's sample and added as columns
        to the tabular object obtained from ``create_tabular``. Returns
        ``None`` without inserting anything when no columns result;
        otherwise hands a lazy record generator to
        ``document.insert_records`` and returns the tabular object.
        """
        header_offset, header_names = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(header_names))
        row_set.register_processor(offset_processor(header_offset + 1))
        tabular = self.create_tabular(sheet, row_set.name)
        columns = [tabular.add_column(name) for name in header_names]
        if len(columns) == 0:
            return

        def generate_rows():
            for row_number, row in enumerate(row_set):
                try:
                    record = {}
                    for cell, column in zip(row, columns):
                        record[column.name] = string_value(cell.value)
                    if not record:
                        continue
                    # Make sure every column is present, defaulting to None
                    for column in columns:
                        record.setdefault(column.name, None)
                    yield record
                except Exception as exception:
                    # A failing row is logged and skipped, not fatal
                    log.warning("Could not decode row %s in %s: %s",
                                row_number, meta, exception)

        document.insert_records(sheet, generate_rows())
        return tabular
Beispiel #9
0
    def get_schema(self, filename):
        """
        Guess schema using messytables.

        Produces one ``[header, type, sample value]`` list per guessed
        header of the first table; an empty list is returned if the file
        could not be read (``read_file`` gave ``None``).
        """
        table_set = self.read_file(filename)
        if table_set is None:
            return []

        # Work on the first table only
        row_set = table_set.tables[0]

        offset, headers = headers_guess(row_set.sample)
        for processor in (headers_processor(headers),):
            row_set.register_processor(processor)
        row_set.register_processor(offset_processor(offset + 1))
        types = type_guess(row_set.sample, strict=True)

        # First sampled row, used to show an example value per column
        sample = next(row_set.sample)

        def clean(value):
            if isinstance(value, str):
                return value
            return str(value)

        schema = []
        for position, header in enumerate(headers):
            entry = [header,
                     str(types[position]),
                     clean(sample[position].value)]
            schema.append(entry)

        return schema
Beispiel #10
0
  def get_column_types(data: io.BytesIO) -> Tuple[List[str], List[str]]:
    """Derive the header names and column types of a CSV sample.

    Using messytables' CSV API, guess the header row of the first table,
    skip past it and then guess the column types (strict mode) from a sample
    of the rows. Both the headers and the guessed types are logged at INFO
    level.

    This is still a WIP due to the parlous state of the DV360/CM CSV data
    formats in general.

    Arguments:
        data {io.BytesIO} -- sample of the CSV file

    Returns:
        tuple of the guessed header names and the column types exactly as
        returned by ``type_guess``
    """
    first_table = CSVTableSet(data).tables[0]
    header_offset, header_names = headers_guess(first_table.sample)
    logging.info(header_names)
    first_table.register_processor(headers_processor(header_names))
    first_table.register_processor(offset_processor(header_offset + 1))
    column_types = type_guess(first_table.sample, strict=True)
    logging.info(column_types)

    return header_names, column_types
    def convert(self):
        """Convert the CSV in ``self.stream`` into fields and row dicts.

        Headers are guessed from the last table of the table set. Empty
        header names are replaced by ``column_<n>`` and names occurring more
        than once get a ``_<k>`` suffix so that every field id is distinct.

        :returns: ``(fields, result)`` where ``fields`` is a list of
            ``{'id': name}`` dicts and ``result`` is a list with one
            ``{column: value}`` dict per data row.
        """
        table_set = CSVTableSet.from_fileobj(self.stream)
        row_set = table_set.tables.pop()
        offset, headers = headers_guess(row_set.sample)

        fields = []
        dup_columns = {}
        noname_count = 1
        for index, field in enumerate(headers):
            field_dict = {}
            if "" == field:
                field = '_'.join(['column', str(noname_count)])
                # Written back so later duplicate counting sees the new name
                headers[index] = field
                noname_count += 1
            if headers.count(field) == 1:
                field_dict['id'] = field
            else:
                dup_columns[field] = dup_columns.get(field, 0) + 1
                field_dict['id'] = u'_'.join([field, str(dup_columns[field])])
            fields.append(field_dict)
        row_set.register_processor(headers_processor([x['id'] for x in fields]))
        row_set.register_processor(offset_processor(offset + 1))

        # A fresh dict per row: reusing one dict across iterations would make
        # every entry of ``result`` the same object holding the last row.
        result = [
            {cell.column: cell.value for cell in row}
            for row in row_set
        ]
        return fields, result
Beispiel #12
0
def main(basic_config_file, batch_config_file):
    """Run one training job, or a batch of jobs described by a CSV file.

    ``basic_config_file`` is a YAML file with the base settings. When
    ``batch_config_file`` is truthy it is parsed as CSV (types guessed by
    messytables) into one settings dict per row and handed to ``run_many``.
    Otherwise the base settings are written to ``settings.yml``, ``run_one``
    is called with them, and the training log and final stats are written to
    ``training_log.csv`` and ``training_stats.yml``; all three files go
    under ``base_settings["out_dir"]``.
    """
    with open(basic_config_file, "r") as f:
        # safe_load: yaml.load() without an explicit Loader can construct
        # arbitrary objects and is rejected by PyYAML >= 6.
        base_settings = yaml.safe_load(f)

    if batch_config_file:
        # RUN MANY
        # parse csv into a list of settings-dicts
        import messytables
        with open(batch_config_file, "rb") as f:
            row_set = messytables.CSVRowSet("", f)
            offset, headers = messytables.headers_guess(row_set.sample)
            row_set.register_processor(messytables.headers_processor(headers))
            row_set.register_processor(messytables.offset_processor(offset +
                                                                    1))
            types = messytables.type_guess(row_set.sample, strict=True)
            row_set.register_processor(messytables.types_processor(types))
            # dicts() is lazy; read every row while the file is still open.
            settings_list = list(row_set.dicts())
        name = batch_config_file.replace(".csv", "")
        run_many(settings_list, name, base_settings=base_settings)
    else:
        # RUN ONE
        # parse yaml into a settings-dict
        settings_file = os.path.join(base_settings["out_dir"], "settings.yml")
        with open(settings_file, "w") as f:
            yaml.dump(base_settings, f)
        training_log, exit_status = run_one(**base_settings)
        training_log_file = os.path.join(base_settings["out_dir"],
                                         "training_log.csv")
        training_log.to_csv(training_log_file)
        stats = compute_final_stats(training_log)
        stats["exit_status"] = exit_status
        training_stats_file = os.path.join(base_settings["out_dir"],
                                           "training_stats.yml")
        with open(training_stats_file, "w") as f:
            yaml.dump(stats, f)
Beispiel #13
0
    def convert(self):
        """Convert the CSV in ``self.stream`` into fields and row dicts.

        Headers are guessed from the last table of the table set. Empty
        header names become ``column_<n>`` and names that occur more than
        once are suffixed with ``_<k>`` so that field ids are distinct.

        :returns: ``(fields, result)``: a list of ``{'id': name}`` dicts and
            a list holding one ``{column: value}`` dict per data row.
        """
        table_set = CSVTableSet.from_fileobj(self.stream)
        row_set = table_set.tables.pop()
        offset, headers = headers_guess(row_set.sample)

        fields = []
        dup_columns = {}
        noname_count = 1
        for index, field in enumerate(headers):
            field_dict = {}
            if "" == field:
                field = '_'.join(['column', str(noname_count)])
                # Written back so later duplicate counting sees the new name
                headers[index] = field
                noname_count += 1
            if headers.count(field) == 1:
                field_dict['id'] = field
            else:
                dup_columns[field] = dup_columns.get(field, 0) + 1
                field_dict['id'] = u'_'.join([field, str(dup_columns[field])])
            fields.append(field_dict)
        row_set.register_processor(headers_processor([x['id']
                                                      for x in fields]))
        row_set.register_processor(offset_processor(offset + 1))

        result = []
        for row in row_set:
            # The dict must be created per row; a single dict shared by all
            # iterations would leave ``result`` full of references to the
            # same object containing only the last row's values.
            data_row = {}
            for cell in row:
                data_row[cell.column] = cell.value
            result.append(data_row)
        return fields, result
Beispiel #14
0
 def test_strict_type_guessing_with_large_file(self):
     """Check strict type guessing for all 96 columns of ``211.csv``."""
     row_set = CSVTableSet(horror_fobj('211.csv')).tables[0]
     header_offset, _ = headers_guess(row_set.sample)
     row_set.register_processor(offset_processor(header_offset + 1))
     candidate_types = [StringType, IntegerType, DecimalType, DateUtilType]
     guessed_types = type_guess(row_set.sample, candidate_types, True)
     assert_equal(len(guessed_types), 96)

     # (column position, expected type) for every column that is not a
     # plain string; all remaining columns are expected to be StringType.
     overrides = [
         (0, IntegerType),
         (6, IntegerType),
         (27, IntegerType),
         (29, DecimalType),
         (30, DecimalType),
         (54, IntegerType),
         (70, IntegerType),
         (89, DateUtilType),
         (90, DateUtilType),
         (91, DateUtilType),
         (92, DateUtilType),
     ]
     expected_types = [StringType() for _ in range(96)]
     for position, cell_type in overrides:
         expected_types[position] = cell_type()
     assert_equal(guessed_types, expected_types)
Beispiel #15
0
def main(argv=None):
    """Guess the schema of a single-table file and upload it to Myria.

    Reads the file named in the parsed arguments (or all of stdin), guesses
    headers and column types with messytables, builds a Myria schema and
    either uploads the data or, for a dry run, writes it to stdout.

    Raises:
        ValueError: if the input does not contain exactly one table.
    """
    args = parse_args(argv)

    if args.file is None:
        # slurp the whole input since there seems to be a bug in messytables
        # which should be able to handle streams but doesn't
        args.file = cStringIO.StringIO(sys.stdin.read())

    relation_key = args_to_relation_key(args)

    table_set = any_tableset(args.file)
    if len(table_set.tables) != 1:
        raise ValueError("Can only handle files with a single table, not %s" % len(table_set.tables))

    row_set = table_set.tables[0]

    # guess header names and the offset of the header:
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(strip_processor())
    row_set.register_processor(headers_processor(headers))
    # Temporarily, mark the offset of the header
    row_set.register_processor(offset_processor(offset + 1))

    # guess types and register them
    types = type_guess(replace_empty_string(row_set.sample), strict=True, types=[StringType, DecimalType, IntegerType])
    row_set.register_processor(types_processor(types))

    # Messytables seems to not handle the case where there are no headers.
    # Work around this as follows:
    # 1) offset must be 0
    # 2) if the types of the data match the headers, assume there are
    #    actually no headers
    if offset == 0:
        try:
            for cell_type, header in zip(types, headers):
                cell_type.cast(header)
        except Exception:
            # A failed cast means the first row really is a header row, so
            # there is nothing to undo. Only Exception is caught so that
            # KeyboardInterrupt / SystemExit still propagate.
            pass
        else:
            # We don't need the headers_processor or the offset_processor
            row_set._processors = []
            row_set.register_processor(strip_processor())
            row_set.register_processor(types_processor(types))
            headers = None

    # Construct the Myria schema
    schema = messy_to_schema(types, headers)
    logging.info("Myria schema: %s", json.dumps(schema))

    # Prepare data for writing to Myria
    data, kwargs = write_data(row_set, schema)

    if not args.dry:
        # Connect to Myria and send the data
        connection = myria.MyriaConnection(hostname=args.hostname, port=args.port, ssl=args.ssl)
        ret = connection.upload_file(relation_key, schema, data, args.overwrite, **kwargs)

        sys.stdout.write(pretty_json(ret))
    else:
        sys.stdout.write(data)
Beispiel #16
0
def parse(stream, excel_type='xls', sheet=1, guess_types=True, **kwargs):
    '''Parse Excel (xls or xlsx) to structured objects.

    Returns a pair: a generator yielding one ``{column: value}`` dict per
    data row, and a ``{'fields': [...]}`` dict describing the columns. Each
    field has an ``id`` (empty headers become ``column_<n>``, repeated
    headers get a ``_<k>`` suffix) and, when type guessing is enabled, a
    ``type``.

    :param excel_type: xls | xlsx
    :param sheet: index of sheet in spreadsheet to convert (starting from index = 1)
    :param guess_types: when truthy, guess a type for every column
    '''
    sheet_index = int(sheet) - 1

    table_class = XLSXTableSet if excel_type == 'xlsx' else XLSTableSet
    table_set = table_class.from_fileobj(stream)
    try:
        row_set = table_set.tables[sheet_index]
    except IndexError:
        raise Exception('This file does not have sheet number %d' %
                        (sheet_index + 1))
    offset, headers = headers_guess(row_set.sample)

    should_guess = bool(guess_types)
    if should_guess:
        candidate_types = [
            StringType, IntegerType, FloatType, DecimalType, DateUtilType
        ]
        row_types = type_guess(row_set.sample, candidate_types)

    fields = []
    duplicate_counts = {}
    unnamed_seen = 1
    for position, name in enumerate(headers):
        if "" == name:
            # Give unnamed columns a generated name and store it back so the
            # duplicate check below counts it.
            name = '_'.join(['column', str(unnamed_seen)])
            headers[position] = name
            unnamed_seen += 1

        if headers.count(name) == 1:
            column_id = name
        else:
            duplicate_counts[name] = duplicate_counts.get(name, 0) + 1
            column_id = u'_'.join([name, str(duplicate_counts[name])])

        descriptor = {'id': column_id}
        if should_guess:
            guessed = row_types[position]
            if isinstance(guessed, DateUtilType):
                descriptor['type'] = 'DateTime'
            else:
                descriptor['type'] = str(guessed)
        fields.append(descriptor)

    row_set.register_processor(
        headers_processor([descriptor['id'] for descriptor in fields]))
    row_set.register_processor(offset_processor(offset + 1))

    def row_iterator():
        for row in row_set:
            yield {cell.column: cell.value for cell in row}

    return row_iterator(), {'fields': fields}
Beispiel #17
0
def parse(stream, excel_type='xls', sheet=1, guess_types=True, **kwargs):
    '''Parse Excel (xls or xlsx) to structured objects.

    The result is ``(rows, metadata)``: ``rows`` lazily yields a dict per
    data row keyed by column, and ``metadata`` is ``{'fields': [...]}`` with
    one entry per header carrying a unique ``id`` and, if type guessing is
    on, a ``type``.

    :param excel_type: xls | xlsx
    :param sheet: index of sheet in spreadsheet to convert (starting from index = 1)
    :param guess_types: truthy to guess column types from the sample
    '''
    sheet_number = int(sheet) - 1

    if excel_type == 'xlsx':
        table_set = XLSXTableSet.from_fileobj(stream)
    else:
        table_set = XLSTableSet.from_fileobj(stream)

    try:
        row_set = table_set.tables[sheet_number]
    except IndexError:
        message = 'This file does not have sheet number %d' % (
            sheet_number + 1)
        raise Exception(message)
    offset, headers = headers_guess(row_set.sample)

    with_types = True if guess_types else False
    if with_types:
        row_types = type_guess(
            row_set.sample,
            [StringType, IntegerType, FloatType, DecimalType, DateUtilType])

    fields = []
    seen_duplicates = {}
    next_unnamed = 1
    for index, header in enumerate(headers):
        if "" == header:
            header = '_'.join(['column', str(next_unnamed)])
            # Stored back so that headers.count() below sees the new name
            headers[index] = header
            next_unnamed += 1

        if headers.count(header) != 1:
            seen_duplicates[header] = seen_duplicates.get(header, 0) + 1
            field_id = u'_'.join([header, str(seen_duplicates[header])])
        else:
            field_id = header

        field = {'id': field_id}
        if with_types:
            is_date = isinstance(row_types[index], DateUtilType)
            field['type'] = 'DateTime' if is_date else str(row_types[index])
        fields.append(field)

    field_ids = [field['id'] for field in fields]
    row_set.register_processor(headers_processor(field_ids))
    row_set.register_processor(offset_processor(offset + 1))

    def row_iterator():
        for row in row_set:
            record = {}
            for cell in row:
                record[cell.column] = cell.value
            yield record

    return row_iterator(), {'fields': fields}
Beispiel #18
0
 def test_read_encoded_characters_csv(self):
     """Non-ASCII cell values of ``characters.csv`` are read intact."""
     row_set = CSVTableSet(horror_fobj('characters.csv')).tables[0]
     header_offset, header_names = headers_guess(row_set.sample)
     row_set.register_processor(headers_processor(header_names))
     row_set.register_processor(offset_processor(header_offset + 1))
     rows = list(row_set)
     assert_equal(382, len(rows))
     first_row = rows[0]
     assert_equal(first_row[2].value, u'雲嘉南濱海國家風景區管理處')
     last_row = rows[-1]
     assert_equal(last_row[2].value, u'沈光文紀念廳')
Beispiel #19
0
 def test_read_head_offset_excel(self):
     """With the header row skipped, ``simple.xls`` starts at the data."""
     row_set = XLSTableSet(horror_fobj("simple.xls")).tables[0]
     header_offset, _headers = headers_guess(row_set.sample)
     assert_equal(header_offset, 0)
     row_set.register_processor(offset_processor(header_offset + 1))

     # The sample and the full iteration must both start at the first
     # data row.
     sampled_rows = list(row_set.sample)
     assert_equal(int(sampled_rows[0][1].value), 1)
     all_rows = list(row_set)
     assert_equal(int(all_rows[0][1].value), 1)
Beispiel #20
0
 def test_read_encoded_characters_csv(self):
     """Read ``characters.csv`` and check its encoded cell values."""
     tables = CSVTableSet(horror_fobj('characters.csv')).tables
     rows = tables[0]
     offset, names = headers_guess(rows.sample)
     rows.register_processor(headers_processor(names))
     rows.register_processor(offset_processor(offset + 1))
     records = list(rows)
     assert_equal(382, len(records))
     # Third cell of the first and of the last data row
     assert_equal(records[0][2].value, u'雲嘉南濱海國家風景區管理處')
     assert_equal(records[-1][2].value, u'沈光文紀念廳')
Beispiel #21
0
 def test_read_head_offset_csv(self):
     """With the header row skipped, ``simple.csv`` starts at the data."""
     row_set = CSVTableSet(horror_fobj('simple.csv')).tables[0]
     header_offset, _headers = headers_guess(row_set.sample)
     assert_equal(header_offset, 0)
     row_set.register_processor(offset_processor(header_offset + 1))

     # Check the sample first, then the complete iteration.
     sampled_rows = list(row_set.sample)
     assert_equal(int(sampled_rows[0][1].value), 1)
     all_rows = list(row_set)
     assert_equal(int(all_rows[0][1].value), 1)
def rowset_as_schema(rowset):
    """Build a ``JSONTableSchema`` from the guessed headers and types.

    Headers and column types are guessed from ``rowset.sample``; each header
    is added as a field whose id and label are the header itself.
    """
    _offset, header_names = messytables.headers_guess(rowset.sample)
    guessed_types = messytables.type_guess(rowset.sample)
    type_names = map(celltype_as_string, guessed_types)

    schema = jsontableschema.JSONTableSchema()
    for name, type_name in zip(header_names, type_names):
        schema.add_field(field_id=name, label=name, field_type=type_name)

    return schema
Beispiel #23
0
    def connect(self,
                host=None,
                port=None,
                database=None,
                username=None,
                password=None,
                file=None):
        """Open a connection for ``self.engine`` and return it.

        Supported engines are ``'psycopg2'``, ``'pymssql'`` and ``'csv'``.
        For ``'csv'`` the "connection" is a messytables row set built from
        ``self.data`` with header, offset and type processors registered.
        The result is stored on ``self.conn``; ``database`` is stored on
        ``self.database``. For any other engine ``self.conn`` is returned
        as it already is.

        ``file`` is accepted but not used.
        """
        # TODO: mysql, pymssql, csv, sqlite3, pymongo, cx_Oracle
        self.database = database
        if self.engine == 'psycopg2':
            # Only the settings that were actually supplied go into the
            # connection string. Each fragment needs a %s placeholder:
            # formatting a template without one raises TypeError.
            # NOTE(review): values are not escaped, so a quote inside a
            # value would break the connection string.
            settings = (
                ('dbname', database),
                ('user', username),
                ('host', host),
                ('port', port),
                ('password', password),
            )
            conn_string = ''.join(
                "%s='%s' " % (key, value) for key, value in settings if value)
            self.conn = psycopg2.connect(conn_string)

        elif self.engine == 'pymssql':
            self.conn = pymssql.connect(host,
                                        username,
                                        password,
                                        database,
                                        port=port,
                                        as_dict=True,
                                        charset='LATIN1')

        elif self.engine == 'csv':
            # https://messytables.readthedocs.io/en/latest/
            fh = StringIO.StringIO(self.data)

            # Load a file object:
            table_set = CSVTableSet(fh)
            row_set = table_set.tables[0]
            offset, headers = headers_guess(row_set.sample)
            row_set.register_processor(headers_processor(headers))
            # Skip past the header row
            row_set.register_processor(offset_processor(offset + 1))
            types = type_guess(row_set.sample, strict=True)
            row_set.register_processor(types_processor(types))

            self.conn = row_set

        return self.conn
def proc(f, database_name, table_name):
    """Load the first table of ``f`` into a Hive table.

    Column names and types (string or date) are guessed with messytables.
    A Hive script that drops, re-creates and loads ``table_name`` is written
    to one temporary file, the data rows (cells joined by ``\\001``) to
    another, and ``hive`` is run against ``database_name`` with that script.
    The exit status of ``hive`` is not checked.
    """
    table_set = messytables.any_tableset(f)
    row_set = table_set.tables[0]

    # guess header names and the offset of the header:
    offset, headers = messytables.headers_guess(row_set.sample)
    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=[
        messytables.types.StringType,
        messytables.types.DateType,
    ], strict=True)

    fields_ddl = ','.join(
        '  {0} {1}\n'.format(
            canonicalize_column_name(colName),
            hive_column_type(colType)
        )
        for colName, colType in zip(headers, types)
    )
    hive_sql_template = '''
DROP TABLE IF EXISTS {0};

CREATE TABLE {0} (
{1}
)
STORED AS TEXTFILE
TBLPROPERTIES ("comment"="add_messytable on {3}");

LOAD DATA LOCAL INPATH '{2}' OVERWRITE INTO TABLE {0};
'''

    # Both temporary files must exist until hive has finished; the with
    # block then closes (and thereby removes) them deterministically instead
    # of leaving that to garbage collection.
    with tempfile.NamedTemporaryFile(mode='w') as hive_data_file, \
            tempfile.NamedTemporaryFile(mode='w') as hive_cmd_file:
        hive_sql = hive_sql_template.format(
            table_name, fields_ddl, hive_data_file.name,
            datetime.datetime.now().isoformat())
        print(hive_sql, file=hive_cmd_file)
        hive_cmd_file.flush()

        row_set.register_processor(messytables.types_processor(types))

        for row in row_set:
            print('\001'.join(str(c.value) for c in row),
                  file=hive_data_file)
        # Make sure hive sees the complete data file
        hive_data_file.flush()

        subprocess.call([
            'hive',
            '--database', database_name,
            '-f', hive_cmd_file.name,
        ])
Beispiel #25
0
def validate_file(file_tmp, file_name, tmp_filepath):
    """Validate the headers of an uploaded datastore file.

    Files whose extension is not one of the configured datastore extensions
    are accepted without inspection. For the others the file must contain a
    single data sheet, and every guessed header must be at most 64
    characters long, free of the disallowed characters, and unique.

    On any violation ``rollback_tmp(file_tmp, tmp_filepath)`` is called, the
    problem is logged and ``logic.ValidationError`` is raised.
    """
    log.info("upload: checking file * %s * ", file_name)
    MAX_HEADER_LENGTH = 64
    # not allowed characters ( - ' " ’ ‘) regex
    # NOTE(review): the '|' separators sit inside the character class, so a
    # literal pipe in a header is rejected as well -- confirm this is wanted.
    inappropriate_chars = re.compile(r"[\-|\'|\"|\u2018|\u2019]")
    # Compare against whole extensions. A substring test on the raw setting
    # would also accept an empty extension or fragments such as "s" or "xl".
    configured = config.get('ckan.mimetype_guess', "csv xls xlsx tsv")
    datastore_ext = set(ext for ext in re.split(r"[\s,]+", configured) if ext)
    _, tmp_file_ext = os.path.splitext(file_name)

    # check if datastore file (csv xls xlsx tsv)
    if tmp_file_ext[1:].lower() not in datastore_ext:
        return

    table_set = any_tableset(file_tmp)
    # check if only one data sheet in the file
    if len(table_set.tables) > 1:
        rollback_tmp(file_tmp, tmp_filepath)
        log.error("upload: the file * %s * was not uploaded - There is more then one data sheet in the file", file_name)
        raise logic.ValidationError(
            {'upload': ['There is more then one data sheet in the file']}
        )

    row_set = table_set.tables[0]
    # guess header names and the offset of the header:
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    for header in headers:
        # too long header
        if len(header) > MAX_HEADER_LENGTH:
            rollback_tmp(file_tmp, tmp_filepath)
            log.error("upload: the file * %s * was not uploaded - too long header - * %s *",
                      file_name, header)
            raise logic.ValidationError(
                {'upload': ['too long header (64 max)']}
            )
        # not allowed characters in header ( - ' " ’ ‘)
        if inappropriate_chars.search(header):
            rollback_tmp(file_tmp, tmp_filepath)
            log.error("upload: the file * %s * was not uploaded - there are inappropriate characters in headers * %s *",
                      file_name, header)
            raise logic.ValidationError(
                {'upload': ['there are inappropriate characters in headers (apostrophe/apostrophes/dash)']}
            )
    # Check for duplicate fields
    if len(set(headers)) != len(headers):
        rollback_tmp(file_tmp, tmp_filepath)
        log.error("upload: the file * %s * was not uploaded - Duplicate column names are not supported", file_name)
        raise logic.ValidationError({'upload': ['Duplicate column names are not supported']})
    log.info("passed validation succesfully - the file * %s * was uploaded to CKAN (filestore)", file_name)
Beispiel #26
0
 def test_read_head_padding_csv(self):
     """Headers and rows of ``weird_head_padding.csv`` have 11 columns."""
     row_set = CSVTableSet(horror_fobj("weird_head_padding.csv")).tables[0]
     header_offset, header_names = headers_guess(row_set.sample)
     assert 11 == len(header_names), header_names
     assert_equal(u"1985", header_names[1].strip())
     row_set.register_processor(headers_processor(header_names))
     row_set.register_processor(offset_processor(header_offset + 1))

     # Take the sample before walking the whole row set.
     sampled_rows = list(row_set.sample)
     for row in row_set:
         assert_equal(11, len(row))
     first_cell = sampled_rows[1][0].value.strip()
     assert first_cell == u"Gefäßchirurgie", first_cell
Beispiel #27
0
def csvParse(csv_file_path):
    """Open a CSV file and prepare a typed messytables row set for it.

    Returns a tuple ``(row_set, headers, offset, types)`` where the row set
    has header, offset and type processors registered.  The file is opened
    in binary mode and is not closed here, since the returned row set is
    still to be iterated by the caller.
    """
    row_set = CSVTableSet(open(csv_file_path, 'rb')).tables[0]

    # Guess the header names and the position of the header row.
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    # offset + 1 so iteration begins with content, not the header.
    row_set.register_processor(offset_processor(offset + 1))

    # Guess the column types and have them applied to each row.
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    return row_set, headers, offset, types
Beispiel #28
0
 def test_read_head_padding_csv(self):
     """Verify guessed headers and row widths for 'weird_head_padding.csv'."""
     fixture = horror_fobj('weird_head_padding.csv')
     rows = CSVTableSet(fixture).tables[0]
     skip, names = headers_guess(rows.sample)
     # Eleven columns are expected, the second one named '1985'.
     assert 11 == len(names), names
     assert_equal('1985', names[1].strip())
     rows.register_processor(headers_processor(names))
     rows.register_processor(offset_processor(skip + 1))
     sample_rows = list(rows.sample)
     for record in rows:
         assert_equal(11, len(record))
     text = sample_rows[1][0].value.strip()
     assert text == u'Gefäßchirurgie', text
Beispiel #29
0
    def get_diff(self, filename1, filename2):
        """Compute a daff highlighter diff between two tabular files.

        Returns None when the extension of ``filename1`` is not csv, tsv or
        xls.  Otherwise both files are loaded through ``self.read_file`` and
        the highlighted diff table is handed to ``self.parse_diff``, whose
        result is returned.  Raises Exception("Invalid table set") when
        ``self.read_file`` returns None for either file.
        """
        extension = filename1.split(".")[-1].lower()
        if extension not in ['csv', 'tsv', 'xls']:
            return None

        def load_rows(path):
            # Read one file into a list of rows, header row first.
            tables = self.read_file(path)
            if tables is None:
                raise Exception("Invalid table set")
            sheet = tables.tables[0]
            skip, names = headers_guess(sheet.sample)
            sheet.register_processor(headers_processor(names))
            sheet.register_processor(offset_processor(skip + 1))
            return [names] + [[cell.value for cell in row] for row in sheet]

        loaded = {}
        for path in [filename1, filename2]:
            loaded[path] = load_rows(path)

        left = daff.PythonTableView(loaded[filename1])
        right = daff.PythonTableView(loaded[filename2])
        alignment = daff.Coopy.compareTables(left, right).align()

        # The highlighter writes its output rows into this (initially empty) view.
        diff_rows = []
        diff_view = daff.PythonTableView(diff_rows)
        daff.TableDiff(alignment, daff.CompareFlags()).hilite(diff_view)

        return self.parse_diff(diff_view)
Beispiel #30
0
    def get_diff(self, filename1, filename2):
        """Diff two tabular files with daff and return the parsed result.

        Only csv, tsv and xls inputs (judged by the extension of
        ``filename1``) are handled; anything else yields None.  Raises
        Exception("Invalid table set") if ``self.read_file`` gives None.
        """
        if filename1.split(".")[-1].lower() not in ['csv', 'tsv', 'xls']:
            return None

        tables_by_name = {}
        for name in [filename1, filename2]:
            table_set = self.read_file(name)
            if table_set is None:
                raise Exception("Invalid table set")
            row_set = table_set.tables[0]
            offset, headers = headers_guess(row_set.sample)
            row_set.register_processor(headers_processor(headers))
            row_set.register_processor(offset_processor(offset + 1))

            # Header row first, then the plain values of every data row.
            contents = [headers]
            contents.extend([cell.value for cell in row] for row in row_set)
            tables_by_name[name] = contents

        first_view = daff.PythonTableView(tables_by_name[filename1])
        second_view = daff.PythonTableView(tables_by_name[filename2])
        alignment = daff.Coopy.compareTables(first_view, second_view).align()

        # daff fills this view with the highlighted differences.
        output = []
        output_view = daff.PythonTableView(output)
        compare_flags = daff.CompareFlags()
        daff.TableDiff(alignment, compare_flags).hilite(output_view)

        return self.parse_diff(output_view)
 def test_file_with_few_strings_among_integers(self):
     """Strict type guessing on the 'mixedGLB.csv' fixture.

     Expects 19 guessed column types matching the list asserted below.
     """
     fh = horror_fobj('mixedGLB.csv')
     rows = CSVTableSet(fh).tables[0]
     offset, headers = headers_guess(rows.sample)
     rows.register_processor(offset_processor(offset + 1))
     types = [StringType, IntegerType, DecimalType, DateUtilType]
     guessed_types = type_guess(rows.sample, types, True)
     assert_equal(len(guessed_types), 19)
     # Single-argument call form: valid on Python 3 and prints the same
     # text on Python 2, where the old print statement was the only option.
     print(guessed_types)
     assert_equal(guessed_types, [
         IntegerType(), IntegerType(),
         IntegerType(), IntegerType(), IntegerType(), IntegerType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), IntegerType(), StringType(),
         StringType()])
Beispiel #32
0
def load_data(config):
    """Yield data rows as dicts, as described by ``config``.

    Without a 'url' key a single mapping ``{config['field']:
    config['value']}`` is yielded (missing keys read as None).  With a
    'url' key the URL is opened, parsed as CSV, and one
    ``{column: value}`` dict is yielded per data row.

    The URL handle is closed in a ``finally`` clause so it is released even
    when the consumer stops iterating early or a row fails to parse.
    """
    if 'url' not in config:
        yield {config.get('field'): config.get('value')}
        return

    fh = urlopen(config.get('url'))
    try:
        table_set = CSVTableSet.from_fileobj(fh)
        row_set = table_set.tables[0]

        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        # offset + 1 skips the header row itself.
        row_set.register_processor(offset_processor(offset + 1))

        for row in row_set:
            yield {c.column: c.value for c in row}
    finally:
        fh.close()
Beispiel #33
0
	def create_sql_table(self, rowset, sql_table_name=None, headers=None, types=None):
		"""
		Create a SQL table schema from a MessyTables RowSet.

		Uses ``self.table_name`` when ``sql_table_name`` is falsy, stores the
		guessed headers and type names on ``self.headers`` and
		``self.header_types``, and returns the result of
		``self.headers_and_typed_as_sql``.

		NOTE(review): the ``headers`` and ``types`` arguments are replaced by
		freshly guessed values before they are ever read -- confirm that
		ignoring caller-supplied values is intended.
		"""
		table_name = sql_table_name or self.table_name

		# headers_guess returns (offset, headers); the offset is not needed
		_offset, headers = messytables.headers_guess(rowset.sample)
		guessed = messytables.type_guess(rowset.sample, strict=False)
		types = map(self.celltype_as_string, guessed)

		self.headers, self.header_types = headers, types

		return self.headers_and_typed_as_sql(table_name, headers, types)
Beispiel #34
0
    def test_guess_headers(self):
        """Guessed and explicit headers both label cells of the fixture."""
        # Part 1: headers guessed from the sample, data starts after them.
        guessed = CSVTableSet(horror_fobj("weird_head_padding.csv")).tables[0]
        skip, names = headers_guess(guessed.sample)
        guessed.register_processor(headers_processor(names))
        guessed.register_processor(offset_processor(skip + 1))
        rows = list(guessed)
        cell = rows[9][0]
        assert "Frauenheilkunde" in cell.value, cell.value

        # Part 2: explicit header names, no offset processor registered.
        explicit = CSVTableSet(horror_fobj("weird_head_padding.csv")).tables[0]
        explicit.register_processor(headers_processor(["foo", "bar"]))
        rows = list(explicit)
        cell = rows[12][0]
        assert "foo" in cell.column, cell
        assert "Chirurgie" in cell.value, cell.value
Beispiel #35
0
    def test_guess_headers(self):
        """Check cell labelling with guessed headers, then with fixed ones."""
        fixture_name = 'weird_head_padding.csv'

        table = CSVTableSet(horror_fobj(fixture_name)).tables[0]
        header_offset, header_names = headers_guess(table.sample)
        table.register_processor(headers_processor(header_names))
        table.register_processor(offset_processor(header_offset + 1))
        records = list(table)
        tenth_first = records[9][0]
        assert 'Frauenheilkunde' in tenth_first.value, tenth_first.value

        # Re-open the fixture and apply hand-picked header names instead.
        table = CSVTableSet(horror_fobj(fixture_name)).tables[0]
        table.register_processor(headers_processor(['foo', 'bar']))
        records = list(table)
        thirteenth_first = records[12][0]
        assert 'foo' in thirteenth_first.column, thirteenth_first
        assert 'Chirurgie' in thirteenth_first.value, thirteenth_first.value
Beispiel #36
0
def csvimport_table(name):
    """Parse the named data fixture and return ``(fields, rows)``.

    ``rows`` collects the row item of every tuple yielded by
    ``parse_table``; ``fields`` is the fields item of the last tuple.
    """
    from messytables import CSVTableSet, type_guess
    from messytables import types_processor, headers_guess
    from messytables import headers_processor, offset_processor
    from spendb.etl.extract import parse_table

    table = CSVTableSet(data_fixture(name)).tables[0]
    skip, names = headers_guess(table.sample)
    table.register_processor(headers_processor(names))
    table.register_processor(offset_processor(skip + 1))
    table.register_processor(types_processor(type_guess(table.sample, strict=True)))

    rows = []
    # Each item is (fields, row, samples); samples are not needed here.
    for fields, row, _samples in parse_table(table):
        rows.append(row)

    return fields, rows
Beispiel #37
0
def prepare_csv_rows(csv_file):
    """Build a typed messytables row set for ``csv_file``.

    Blank guessed headers are dropped and the rest converted with
    ``convert_header_to_column_name``.  Note that this assigns
    ``DateType.formats`` on the class itself before guessing types.
    """
    table = CSVTableSet(csv_file).tables[0]

    header_offset, raw_headers = headers_guess(table.sample)
    column_names = [convert_header_to_column_name(h) for h in raw_headers if h]

    table.register_processor(headers_processor_remove_blank(column_names))
    table.register_processor(offset_processor(header_offset + 1))

    DateType.formats = create_date_formats(day_first=False)

    # Boolean is deliberately left out of the candidate types.
    candidates = [StringType, DecimalType, IntegerType, DateType]
    guessed = type_guess(table.sample, types=candidates, strict=True)
    table.register_processor(types_processor(guessed))

    return table
Beispiel #38
0
def csvimport_table(name):
    """Run ``parse_table`` over a CSV data fixture.

    Returns a ``(fields, rows)`` pair: the fields value from the final
    tuple produced by ``parse_table`` and the list of all row values.
    """
    from messytables import CSVTableSet, type_guess
    from messytables import types_processor, headers_guess
    from messytables import headers_processor, offset_processor
    from spendb.etl.extract import parse_table

    fixture = data_fixture(name)
    row_set = CSVTableSet(fixture).tables[0]

    # Headers first, then skip the header row, then apply guessed types.
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    column_types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(column_types))

    collected = []
    for fields, row, _unused_samples in parse_table(row_set):
        collected.append(row)

    return fields, collected
Beispiel #39
0
def parse_data(input):
    """Parse a tabular file into header/type descriptions and a row iterator.

    :param input: path of the file to read (opened in binary mode)
    :return: ``(headers_dicts, result)`` where ``headers_dicts`` is a list of
        ``{'id': header, 'type': mapped type}`` dicts for the non-blank
        headers and ``result`` is a generator of ``{header: value}`` dicts
    :raises messytables.ReadError: when the file cannot be read as a table
        set; the error is printed first and then propagated
    """
    fh = open(input, 'rb')

    try:
        table_set = messytables.any_tableset(fh)
    except messytables.ReadError as e:
        # Nothing below can work without a table set: report the problem and
        # propagate it rather than failing later on an unbound name.
        print(e)
        fh.close()
        raise

    row_set = table_set.tables.pop()
    offset, headers = messytables.headers_guess(row_set.sample)
    # Some headers might have been converted from strings to floats and such.
    headers = [str(header) for header in headers]

    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)

    row_set.register_processor(messytables.types_processor(types))

    # Pair every header with its column type BEFORE dropping blank headers;
    # filtering first would shift the remaining headers onto the types of
    # the wrong columns.
    typed_headers = [
        (header.strip(), cell_type)
        for header, cell_type in zip(headers, types)
        if header.strip()
    ]
    headers = [header.strip() for header in headers if header.strip()]
    headers_set = set(headers)

    def row_iterator():
        # Close the file once the rows are exhausted or the generator is
        # closed; the rows are read lazily so it cannot be closed earlier.
        try:
            for row in row_set:
                data_row = {}
                for cell in row:
                    column_name = cell.column.strip()
                    if column_name not in headers_set:
                        continue
                    data_row[column_name] = cell.value
                yield data_row
        finally:
            fh.close()

    result = row_iterator()

    headers_dicts = [
        dict(id=name, type=TYPE_MAPPING[str(cell_type)])
        for name, cell_type in typed_headers
    ]

    print('Determined headers and types: {headers}'.format(
        headers=headers_dicts))

    return headers_dicts, result
Beispiel #40
0
def load_data(config):
    """Generate row dicts from a constant pair or from a remote CSV.

    If ``config`` has no 'url' key, exactly one mapping
    ``{config.get('field'): config.get('value')}`` is produced.  Otherwise
    the URL is fetched, read as CSV and each data row is produced as a
    ``{column: value}`` dict.

    The remote handle is closed in ``finally`` so that it is released when
    iteration is abandoned early or fails, not only on full exhaustion.
    """
    if 'url' not in config:
        yield {
            config.get('field'): config.get('value')
            }
        return

    fh = urlopen(config.get('url'))
    try:
        table_set = CSVTableSet.from_fileobj(fh)
        row_set = table_set.tables[0]

        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        # Begin with the row after the header.
        row_set.register_processor(offset_processor(offset + 1))

        for row in row_set:
            yield {c.column: c.value for c in row}
    finally:
        fh.close()
Beispiel #41
0
    def _get_table_columns(self, csv_file_path: str) -> zip:
        """
        Guess the name and type of every column of a gzip-compressed csv file.

        Types are guessed strictly with messytables and converted to strings
        through ``jts.celltype_as_string``.
        :param csv_file_path: path to the gzipped csv file with content in it
        :return: a zip object of (column name, type string) pairs
        """
        with gzip.open(csv_file_path, 'rb') as f:
            row_set = CSVTableSet(f).tables[0]

            offset, headers = headers_guess(row_set.sample)
            row_set.register_processor(headers_processor(headers))
            row_set.register_processor(offset_processor(offset + 1))

            guessed = type_guess(row_set.sample, strict=True)
            types = [jts.celltype_as_string(cell_type) for cell_type in guessed]
            return zip(headers, types)
Beispiel #42
0
def resource_row_set(package, resource):
    """ Return a typed row set over the first table of this resource's
    source data, or None (after logging an error) when the source
    contains no tables. """
    # This is a work-around because messytables hangs on boto file
    # handles, so we're doing it via plain old HTTP.
    meta = resource.meta
    table_set = any_tableset(resource.fh(),
                             extension=meta.get('extension'),
                             mimetype=meta.get('mime_type'))
    tables = list(table_set.tables)
    if not tables:
        log.error("No tables were found in the source file.")
        return

    first_table = tables[0]
    skip, names = headers_guess(first_table.sample)
    first_table.register_processor(headers_processor(names))
    first_table.register_processor(offset_processor(skip + 1))
    column_types = type_guess(first_table.sample, strict=True)
    first_table.register_processor(types_processor(column_types))
    return first_table
    def convert(self):
        """Convert one worksheet of ``self.stream`` into fields and row dicts.

        Reads the sheet ``self.sheet_number`` as XLSX when
        ``self.excel_type`` is 'xlsx', otherwise as XLS.  Empty header names
        become ``column_N`` and repeated names get a ``_N`` suffix.

        Returns ``(fields, result)``: ``fields`` is a list of ``{'id': name}``
        dicts and ``result`` holds one ``{column: value}`` dict per data row,
        with datetime values rendered through ``isoformat()``.
        Raises Exception when the requested worksheet does not exist.
        """
        xlsclass = XLSTableSet
        if 'xlsx' == self.excel_type:
            xlsclass = XLSXTableSet
        table_set = xlsclass.from_fileobj(self.stream)
        try:
            row_set = table_set.tables[self.sheet_number]
        except IndexError:
            raise Exception('This file does not have worksheet number %d' %
                            (self.sheet_number + 1))
        offset, headers = headers_guess(row_set.sample)

        fields = []
        dup_columns = {}
        noname_count = 1
        for index, field in enumerate(headers):
            field_dict = {}
            if "" == field:
                # Give unnamed columns a generated name, visible to the
                # duplicate check below because headers is updated in place.
                field = '_'.join(['column', str(noname_count)])
                headers[index] = field
                noname_count += 1
            if headers.count(field) == 1:
                field_dict['id'] = field
            else:
                dup_columns[field] = dup_columns.get(field, 0) + 1
                field_dict['id'] = u'_'.join([field, str(dup_columns[field])])
            fields.append(field_dict)
        row_set.register_processor(headers_processor([x['id']
                                                      for x in fields]))
        row_set.register_processor(offset_processor(offset + 1))

        result = []
        for row in row_set:
            # A fresh dict per row: reusing one dict would make every entry
            # of result the same object holding only the last row's values.
            info = {}
            for cell in row:
                if isinstance(cell.value, datetime):
                    info[cell.column] = cell.value.isoformat()
                else:
                    info[cell.column] = cell.value
            result.append(info)
        return fields, result
Beispiel #44
0
    def generate_table(self, document, sheet, row_set):
        """Create a tabular for ``row_set`` and insert its rows as records.

        One column is added per guessed header.  Returns None without
        inserting anything when there are no columns, otherwise returns the
        tabular after handing a lazy record generator to
        ``document.insert_records``.
        """
        skip, names = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(names))
        row_set.register_processor(offset_processor(skip + 1))
        tabular = self.create_tabular(sheet, row_set.name)
        columns = [tabular.add_column(name) for name in names]
        if len(columns) == 0:
            return

        def generate_rows():
            for row in row_set:
                record = {col.name: string_value(cell.value)
                          for cell, col in zip(row, columns)}
                if not record:
                    continue
                # Rows shorter than the header get None for missing columns.
                for col in columns:
                    record.setdefault(col.name, None)
                yield record

        document.insert_records(sheet, generate_rows())
        return tabular
def _guess_csv_datatype(fh):
    """Guess column types of a CSV file object.

    Logs the guessed offset and headers, up to the first 32 typed rows and
    the resulting mapping, then returns a dict of header -> guessed type.
    """
    row_set = CSVTableSet(fh).tables[0]
    offset, headers = headers_guess(row_set.sample)
    logger.info("(offset, headers) = ({}, {})".format(offset, headers))

    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    # Log at most 32 rows; stop without pulling a 33rd from the row set.
    for seen, row in enumerate(row_set, 1):
        logger.info(row)
        if seen >= 32:
            break

    type_by_header = dict(zip(headers, types))
    logger.info(type_by_header)
    return type_by_header
Beispiel #46
0
def generate_schema(samples: List[Dict], table_spec: Dict) -> Dict:
    """
    Guess columns types from the given samples and build json schema
    :param samples: List of dictionaries containing samples data from csv file(s)
    :param table_spec: table/stream specs given in the tap definition; its
        optional 'date_overrides' list names headers forced to date-time
    :return: dictionary where the keys are the headers and values are the guessed types - compatible with json schema
    """
    table_set = CSVTableSet(_csv2bytesio(samples))
    row_set = table_set.tables[0]

    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))

    types = type_guess(row_set.sample, strict=True)

    # The override set does not depend on the column, so build it once
    # instead of once per header.
    date_overrides = set(table_spec.get('date_overrides', []))

    schema = {}
    for header, header_type in zip(headers, types):
        if header in date_overrides:
            # An explicit override wins over whatever type was guessed.
            schema[header] = {'type': ['null', 'string'], 'format': 'date-time'}
        elif isinstance(header_type, IntegerType):
            schema[header] = {'type': ['null', 'integer']}
        elif isinstance(header_type, DecimalType):
            schema[header] = {'type': ['null', 'number']}
        else:
            # Every other guessed type is exposed as a nullable string.
            schema[header] = {'type': ['null', 'string']}

    return schema
Beispiel #47
0
    def _get_table_columns(self, csv_file_path: str) -> zip:
        """
        Read the headers of a gzip-compressed csv file and pair each with a type name.

        No type guessing is done here: the column named
        ``S3Helper.SDC_SOURCE_LINENO_COLUMN`` is typed 'integer' and every
        other column 'string'.
        :param csv_file_path: path to the gzipped csv file with content in it
        :return: a zip object of (column name, type name) pairs
        """
        with gzip.open(csv_file_path, 'rb') as csvfile:
            row_set = CSVTableSet(csvfile, window=1).tables[0]

            offset, headers = headers_guess(row_set.sample)
            row_set.register_processor(headers_processor(headers))
            row_set.register_processor(offset_processor(offset + 1))

            types = []
            for header in headers:
                if header == S3Helper.SDC_SOURCE_LINENO_COLUMN:
                    types.append('integer')
                else:
                    types.append('string')
            return zip(headers, types)
    def convert(self):
        """Turn one worksheet of ``self.stream`` into field specs and rows.

        The workbook is read as XLSX when ``self.excel_type`` is 'xlsx' and
        as XLS otherwise; the sheet is picked by ``self.sheet_number``.
        Blank headers are renamed ``column_N`` and duplicated headers get a
        numeric ``_N`` suffix.

        Returns ``(fields, result)`` where ``fields`` is a list of
        ``{'id': name}`` dicts and ``result`` contains one independent
        ``{column: value}`` dict per data row (datetimes as ISO strings).
        Raises Exception if the worksheet number is out of range.
        """
        xlsclass = XLSTableSet
        if 'xlsx' == self.excel_type:
            xlsclass = XLSXTableSet
        table_set = xlsclass.from_fileobj(self.stream)
        try:
            row_set = table_set.tables[self.sheet_number]
        except IndexError:
            raise Exception('This file does not have worksheet number %d' % (self.sheet_number + 1))
        offset, headers = headers_guess(row_set.sample)

        fields = []
        dup_columns = {}
        noname_count = 1
        for index, field in enumerate(headers):
            field_dict = {}
            if "" == field:
                # Name the unnamed column and record it in headers so the
                # duplicate count below sees the generated name.
                field = '_'.join(['column', str(noname_count)])
                headers[index] = field
                noname_count += 1
            if headers.count(field) == 1:
                field_dict['id'] = field
            else:
                dup_columns[field] = dup_columns.get(field, 0) + 1
                field_dict['id'] = u'_'.join([field, str(dup_columns[field])])
            fields.append(field_dict)
        row_set.register_processor(headers_processor([x['id'] for x in fields]))
        row_set.register_processor(offset_processor(offset + 1))

        result = []
        for row in row_set:
            # Each row needs its own dict; a single shared dict would leave
            # result full of references to the last row's values.
            info = {}
            for cell in row:
                if isinstance(cell.value, datetime):
                    info[cell.column] = cell.value.isoformat()
                else:
                    info[cell.column] = cell.value
            result.append(info)
        return fields, result
def headersDataTypes(CSV):
    '''Get column headers and data types using messytables.

    CSV is a file name appended to the module-level ``path[0]``.  Returns
    ``(headers, dtypedict)`` where ``dtypedict`` maps each header to its
    guessed messytables type (intended for a pandas csv-to-dataframe
    conversion).  The guessed offset, headers and types are printed.
    '''
    # The file is only needed while guessing, so close it before returning.
    with open(path[0]+CSV, 'rb') as table:
        # Creates a set of tables as file object, although it'll just be one
        tableset = messytables.CSVTableSet(table)
        rowset = tableset.tables[0]  # get first and only table as iterator
        # guesses header names and offset of header, returns headers as list
        offset, headers = messytables.headers_guess(rowset.sample)
        # Single-string print calls behave the same on Python 2 and 3.
        print("Here is the offset " + str(offset)
              + " \nHere are the headers:\n" + str(headers))  # test
        # establish headers in table
        rowset.register_processor(messytables.headers_processor(headers))
        # begin iterator at content, rather than header
        rowset.register_processor(messytables.offset_processor(offset + 1))
        # guess column types, return as list
        types = messytables.type_guess(rowset.sample, strict=True)
    print("Here are the data types " + str(types))
    # Pair headers and types positionally; surplus entries on either side
    # are ignored.
    dtypedict = dict(zip(headers, types))
    return headers, dtypedict
Beispiel #50
0
def determine_messytables_types(file_handle, types=messytables.types.TYPES):
    """
    Guess headers and column types of a CSV file with messytables.

    :param file_handle: file handle opened in binary mode
    :param types: candidate cell types handed to ``type_guess``
    :return: (headers, types, row_set)
    """
    # A table set is a collection of tables; only the first one is used.
    # (messytables.any_tableset could be used when the file kind is unknown.)
    row_set = messytables.CSVTableSet(file_handle).tables[0]

    # Show the first sampled row.
    first_sampled = next(row_set.sample)
    print(first_sampled)

    # Guess header names and the offset of the header, then label cells.
    header_offset, headers = messytables.headers_guess(row_set.sample)
    row_set.register_processor(messytables.headers_processor(headers))

    # One past the header offset, so iteration begins with content.
    row_set.register_processor(messytables.offset_processor(header_offset + 1))

    # Guess the column types and apply them to each row during iteration.
    guessed_types = messytables.type_guess(row_set.sample, types, strict=True)
    row_set.register_processor(messytables.types_processor(guessed_types))

    return headers, guessed_types, row_set
Beispiel #51
0
def parse_table(row_set, save_func):
    """Normalise every row of ``row_set`` and pass it to ``save_func``.

    Field specs come from ``generate_field_spec`` on the first row.  Per
    cell: datetimes are reduced to dates, Decimals become floats and
    blank strings become None.  Rows whose values are all None are not
    saved.

    Returns ``(num_rows, fields)`` with ``fields`` keyed by field name.
    NOTE(review): ``num_rows`` is the zero-based index of the last saved
    row rather than a count -- confirm that callers expect this.
    """
    fields = {}
    num_rows = 0

    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(
        types_processor(type_guess(row_set.sample, strict=True)))

    for index, row in enumerate(row_set):
        if len(fields) == 0:
            fields = generate_field_spec(row)

        record = {}
        for cell, field in zip(row, fields):
            value = cell.value
            if isinstance(value, datetime):
                value = value.date()
            if isinstance(value, Decimal):
                # Baby jesus forgive me.
                value = float(value)
            if isinstance(value, basestring) and len(value.strip()) == 0:
                value = None
            record[field['name']] = value
            random_sample(value, field, index)

        # Skip rows in which every value is None.
        if set(record.values()) == {None}:
            continue

        save_func(record)
        num_rows = index

    return num_rows, {spec.get('name'): spec for spec in fields}
Beispiel #52
0
    resource['hash'] = file_hash

    try:
        table_set = messytables.any_tableset(f, mimetype=ct, extension=ct)
    except messytables.ReadError as e:
        ## try again with format
        f.seek(0)
        try:
            format = resource.get('format')
            table_set = messytables.any_tableset(f, mimetype=format, extension=format)
        except:
            raise util.JobError(e)

    row_set = table_set.tables.pop()
    offset, headers = messytables.headers_guess(row_set.sample)
    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)
    row_set.register_processor(messytables.types_processor(types))

    headers = [header.strip() for header in headers if header.strip()]
    headers_set = set(headers)

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                column_name = cell.column.strip()
                if column_name not in headers_set:
                    continue
Beispiel #53
0
    try:
        table_set = messytables.any_tableset(f, mimetype=ct, extension=ct)
    except messytables.ReadError as e:
        ## try again with format
        f.seek(0)
        try:
            format = resource.get('format')
            table_set = messytables.any_tableset(f,
                                                 mimetype=format,
                                                 extension=format)
        except:
            raise util.JobError(e)

    row_set = table_set.tables.pop()
    offset, headers = messytables.headers_guess(row_set.sample)

    existing = datastore_resource_exists(resource_id, api_key, ckan_url)
    existing_info = None
    if existing:
        existing_info = dict((f['id'], f['info'])
                             for f in existing.get('fields', [])
                             if 'info' in f)

    # Some headers might have been converted from strings to floats and such.
    headers = [unicode(header) for header in headers]

    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)
Beispiel #54
0
    def ku_openlearning(self, filename, source_id):
        """Import courses from an XLS workbook for the given source.

        For every data row of the first sheet a ``Course`` is fetched or
        created (keyed on link URL, provider and source) and linked to the
        Merlot categories mapped from the row's category column.

        :param filename: path of the XLS file to read
        :param source_id: primary key of the ``Source`` the courses belong to
        :raises KeyError: if a row's category is not in CATEGORY_MAPPING
        """
        # Category text -> Merlot category id (int), name (str), or a list
        # of names.
        CATEGORY_MAPPING = {
            'Assessment of learning': 2298, #Assessment,
            'Finance': 2235,
            'Public Service': 'Criminal Justice',
            'Health Science': 'Health Sciences',
            'Management': 2248,
            'Online Instruction': 'Hybrid and Online Course Development',
            'Early Childhood': ['Career Counseling and Services', 'Childhood and Adolescence'],
            'Law, Legal': 'Law',
            'Psychology': 'Psychology',
            'Customer Service': 2246,
            'Communications': 'Communications',
            'Professionalism': 'Personal Development'
        }

        source = Source.objects.get(pk=source_id)

        # Keep the workbook open for the whole import and close it afterwards.
        with open(filename, 'rb') as fh:
            table_set = XLSTableSet(fh)

            row_set = table_set.tables[0]
            offset, headers = headers_guess(row_set.sample)
            row_set.register_processor(headers_processor(headers))
            row_set.register_processor(offset_processor(offset + 1))

            for row in row_set:
                # Columns 4, 5 and 9 (language, material type, audience)
                # are not imported.
                url = row[0].value
                title = row[1].value
                description = row[2].value
                license = row[6].value
                categories = row[7].value
                keywords = row[8].value

                course, _created = Course.objects.get_or_create(
                    linkurl=url,
                    provider=source.provider,
                    source=source,
                    defaults={
                        'title': title,
                        'description': description,
                        'tags': keywords,
                        'language': 'English',
                        'license': license,
                        'content_medium': 'text',
                        'creative_commons': 'Yes',
                        'creative_commons_commercial': 'No',
                        'creative_commons_derivatives': 'No'
                    }
                )

                merlot_cat = CATEGORY_MAPPING[categories]
                if not isinstance(merlot_cat, list):
                    merlot_cat = [merlot_cat]

                for item in merlot_cat:
                    try:
                        m = MerlotCategory.objects.get(merlot_id=item)
                    except ValueError:
                        # Presumably raised when item is a category name
                        # rather than a numeric id; look it up by name.
                        m = MerlotCategory.objects.get(name=item)
                    course.merlot_categories.add(m)
Beispiel #55
0
def detect_headers(row_set):
    """Guess the headers of ``row_set``, register them and return them.

    Also registers an offset processor so iteration starts on the row
    after the guessed header row.
    """
    header_offset, header_names = headers_guess(row_set.sample)
    first_data_row = header_offset + 1
    row_set.register_processor(headers_processor(header_names))
    row_set.register_processor(offset_processor(first_data_row))
    return header_names
# Uses Messytables example (https://messytables.readthedocs.io/en/latest/#example)
# To extract from a CSV flatfile the required BIGQUERY JSON metadata
# For importing a table in BIGQUERY
# Example: python csv_to_json_import_bq.py the_csv_file.csv
from messytables import CSVTableSet, type_guess, \
types_processor, headers_guess, headers_processor, \
offset_processor, any_tableset
import sys

# Read the CSV named on the command line and guess its header row and column
# types. The file is only needed while messytables samples it; ``headers``
# and ``types`` are sequences (they are len()-ed / indexed below), so the
# handle can be closed before the schema is formatted.
with open(sys.argv[1], 'rb') as fh:
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)

# Build one {"name": ..., "type": ...} entry per column. Guessed types whose
# string form contains DATE or DECIMAL are renamed to TIMESTAMP and FLOAT;
# every other type is emitted as its upper-cased string form.
fields = []
for i, header in enumerate(headers):
    # Indexing ``types`` (rather than zipping) keeps the original fail-loud
    # behaviour if fewer types than headers were guessed.
    type_name = str(types[i]).upper()
    if "DATE" in type_name:
        type_name = "TIMESTAMP"
    elif "DECIMAL" in type_name:
        type_name = "FLOAT"
    fields.append(
        "{\"name\":\"" + str(header).lower()
        + "\", \"type\":\"" + type_name + "\"}"
    )

# Emit the whole array at once. The previous loop assigned "[\n" to ``output``
# and then immediately overwrote it, so the opening bracket was never printed
# and the result was not valid JSON. A single-argument print() call behaves
# the same on Python 2 and Python 3.
print("[\n" + ",\n".join(fields) + "\n]")
    def push_to_datastore(self, context, resource):
        """Download a resource file and load its first table into the datastore.

        The resource is skipped when the download reports it as unmodified,
        or when its content hash equals the stored one (the hash check is
        disabled when ``self.options.force`` is set or when the resource has
        no hash yet). Otherwise the file is parsed, any existing datastore
        for the resource is deleted, the rows are pushed in chunks of 100
        through ``datastore_create`` and the resource is finally updated via
        ``resource_update``.

        Once the download has succeeded, the saved file is always closed and
        removed before this method returns or raises.

        :param context: passed through to ``fetch_resource.download`` and to
            every toolkit action.
        :param resource: resource dict; the ``id``, ``url``, ``format`` and
            optional ``hash`` keys are read, and on success it is updated in
            place with the ``webstore_url`` / ``webstore_last_updated`` keys.
        :returns: dict with ``success`` (bool), ``resource`` (the resource
            id) and ``error`` (a message string, or ``None``).
        """
        # Get the resource's content hash, which is used to check whether the
        # resource file has changed since last time.
        hash_dict = resource.get('hash')
        if hash_dict:
            original_content_hash = json.loads(hash_dict)['content']
            check_hash = not self.options.force
        else:
            # This resource has no hash yet, it must be a new resource.
            original_content_hash = ''
            check_hash = False

        try:
            result = fetch_resource.download(context,
                                             resource,
                                             self.max_content_length,
                                             DATA_FORMATS,
                                             check_modified=check_hash)
        except fetch_resource.ResourceNotModified:
            logger.info(
                u'Skipping unmodified resource: {0}'.format(resource['url'])
            )
            return {'success': True,
                    'resource': resource['id'],
                    'error': None}
        except Exception as e:
            logger.exception(e)
            return {'success': False,
                    'resource': resource['id'],
                    'error': 'Could not download resource'}

        saved_file = result['saved_file']
        try:
            if check_hash and (result['hash'] == original_content_hash):
                logger.info(
                    u'Skipping unmodified resource: {0}'.format(
                        resource['url'])
                )
                return {'success': True,
                        'resource': resource['id'],
                        'error': None}

            content_type = result['headers'].get('content-type', '')\
                                            .split(';', 1)[0]  # remove parameters

            # The handle must stay open while rows are streamed out of the
            # row set, and must be closed before the file is removed below.
            with open(saved_file, 'rb') as f:
                return self._load_into_datastore(
                    context, resource, f, content_type)
        finally:
            # Single cleanup point: runs on every return path above and also
            # when an unexpected exception propagates.
            os.remove(saved_file)

    def _load_into_datastore(self, context, resource, f, content_type):
        """Parse the open file ``f`` and push its first table to the datastore.

        Private helper of ``push_to_datastore``; it neither closes ``f`` nor
        removes the underlying file (the caller does both).

        :param context: passed through to every toolkit action.
        :param resource: resource dict (``id`` and ``format`` are read; it is
            updated in place on success).
        :param f: file object opened in binary mode on the downloaded file.
        :param content_type: MIME type of the download, without parameters.
        :returns: the same result dict shape as ``push_to_datastore``.
        """
        try:
            table_sets = any_tableset(
                f,
                mimetype=content_type,
                extension=resource['format'].lower()
            )
            # only first sheet in xls for time being
            row_set = table_sets.tables[0]
            offset, headers = headers_guess(row_set.sample)
        except Exception as e:
            logger.exception(e)
            return {'success': False,
                    'resource': resource['id'],
                    'error': 'Error parsing the resource'}

        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))
        row_set.register_processor(datetime_procesor())

        logger.info('Header offset: {0}.'.format(offset))

        guessed_types = type_guess(
            row_set.sample,
            [
                messytables.types.StringType,
                messytables.types.IntegerType,
                messytables.types.FloatType,
                messytables.types.DecimalType,
                messytables.types.DateUtilType
            ],
            strict=True
        )
        logger.info('Guessed types: {0}'.format(guessed_types))
        row_set.register_processor(types_processor(guessed_types, strict=True))
        row_set.register_processor(stringify_processor())

        # Translate each guessed type instance into the type name sent to
        # the datastore, keyed by the instance's class.
        guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in
                              guessed_types]

        def send_request(data):
            """Send one chunk of records to ``datastore_create``."""
            data_dict = {
                'resource_id': resource['id'],
                'fields': [dict(id=name, type=typename) for name, typename
                           in zip(headers, guessed_type_names)],
                'records': data,
                'force': True,
            }
            response = toolkit.get_action('datastore_create')(
                context,
                data_dict
            )
            return response

        # Delete any existing data before proceeding. Otherwise
        # 'datastore_create' will append to the existing datastore. And if the
        # fields have significantly changed, it may also fail.
        logger.info('Trying to delete existing datastore for resource {0} '
                    '(may not exist).'.format(resource['id']))
        try:
            toolkit.get_action('datastore_delete')(
                context,
                {'resource_id': resource['id'], 'force': True}
            )
        except toolkit.ObjectNotFound:
            logger.info('Datastore not found for resource {0}.'.format(
                resource['id']))
        except Exception as e:
            # Best effort: a failed delete is logged and the push continues.
            logger.exception(e)

        logger.info('Creating: {0}.'.format(resource['id']))

        # generates chunks of data that can be loaded into ckan
        # n is the maximum size of a chunk
        def chunky(iterable, n):
            it = iter(iterable)
            while True:
                # Equivalent to the former itertools.imap(dict, ...) call,
                # but also valid on Python 3.
                chunk = [dict(row) for row in itertools.islice(it, n)]
                if not chunk:
                    return
                yield chunk

        count = 0
        try:
            for data in chunky(row_set.dicts(), 100):
                count += len(data)
                send_request(data)
        except Exception as e:
            logger.exception(e)
            return {'success': False,
                    'resource': resource['id'],
                    'error': 'Error pushing data to datastore'}

        logger.info("There should be {n} entries in {res_id}.".format(
            n=count,
            res_id=resource['id']
        ))

        resource.update({
            'webstore_url': 'active',
            'webstore_last_updated': datetime.now().isoformat()
        })

        toolkit.get_action('resource_update')(context, resource)
        return {'success': True,
                'resource': resource['id'],
                'error': None}