Beispiel #1
0
def generate_sample_task_without_check(label_config, mode='upload'):
    """ Build an example task from a labeling config, without validating it

    Every tag below the root that carries a ``value`` attribute of the form
    ``$name`` contributes a ``name`` key to the result. The sample is looked
    up in ``data_examples(mode=mode)`` by the full ``$name`` first, then by
    the tag name, and finally defaults to the string 'Something'.

    :param label_config: labeling config as an XML string
    :param mode: forwarded unchanged to ``data_examples``
    :return: dict mapping placeholder names (without '$') to sample values
    """
    # parse the config
    root = etree.fromstring(label_config, etree.XMLParser())
    if root is None:
        raise etree.XMLSchemaParseError('Project config is empty or incorrect')

    # pool of sample values, keyed by '$name' or by tag name
    examples = data_examples(mode=mode)

    task = {}
    for node in root.findall('.//*[@value]'):
        value = node.get('value')

        # List samples are dicts keyed by the tag's elementValue placeholder;
        # this is registered even when the List's own value is not a '$' one
        if node.tag == 'List':
            key = node.get('elementValue').replace('$', '')
            examples['List'] = [{key: 'Hello world'}, {key: 'Goodbye world'}]

        # only '$'-prefixed values are data placeholders
        if not value or value[0] != '$':
            continue

        # prefer an example registered under the variable name itself ...
        sample = examples.get(value, None)
        if sample is None:
            # ... otherwise fall back to an example for the tag type
            sample = examples.get(node.tag, 'Something')
        task[value[1:]] = sample

    return task
def parse_config_to_json(config_string):
    """ Parse an XML config string and convert it with ``xmljson.badgerfish``

    The parser is created with ``recover=False``.

    :param config_string: XML document as a string
    :return: whatever ``xmljson.badgerfish.data`` yields for the root element
    """
    strict_parser = etree.XMLParser(recover=False)
    root = etree.fromstring(config_string, strict_parser)
    if root is None:
        raise etree.XMLSchemaParseError('xml is empty or incorrect')
    return xmljson.badgerfish.data(root)
Beispiel #3
0
    def supported_formats(self):
        """Return the input file formats this project can import.

        Reads ``self.label_config`` (labeling config XML string) and
        ``self.data_types`` (mapping with one entry per data placeholder).

        * json, csv and tsv are offered by default;
        * txt is added when the project has exactly one data type;
        * when the config contains any ``List`` tag below the root, only
          json remains.

        :return: set of supported file type names
        """
        # load config
        parser = etree.XMLParser()
        xml = etree.fromstring(self.label_config, parser)
        if xml is None:
            raise etree.XMLSchemaParseError(
                "Project config is empty or incorrect")

        supported = {"json", "csv", "tsv"}

        if len(self.data_types.keys()) == 1:
            supported.add("txt")

        # if any Lists are present only json is allowed; use a set difference
        # rather than remove() because "txt" is in the set only for
        # single-data-type projects and remove() would raise KeyError otherwise
        if xml.findall(".//List"):
            supported -= {"csv", "tsv", "txt"}

        return supported
Beispiel #4
0
def loadSchema(uri, base_uri=None):
    """Return an :class:`lxml.etree.XMLSchema` built from the XSD document
    at ``uri`` (a filename or URL).

    Schemas are cached in ``_loaded_schemas`` keyed by ``uri`` only; a cached
    schema is returned without parsing again.

    :param uri: filename or URL of the XSD document
    :param base_uri: optional base URL passed to ``etree.parse``
    :raises IOError: when loading the document raises an IOError
    :raises etree.XMLSchemaParseError: when the schema cannot be parsed
    """
    # serve repeated requests from the cache
    if uri in _loaded_schemas:
        return _loaded_schemas[uri]

    # label used when reporting errors - mentions the base URI if there is one
    if base_uri is None:
        error_uri = uri
    else:
        error_uri = uri + ' (base URI %s)' % base_uri

    try:
        logger.debug('Loading schema %s' % uri)
        document = etree.parse(uri, parser=_get_xmlparser(),
                               base_url=base_uri)
        schema = etree.XMLSchema(document)
        _loaded_schemas[uri] = schema
        return schema
    except IOError as io_err:
        # same exception type, with the schema location added to the message
        raise IOError('Failed to load schema %s : %s' % (error_uri, io_err))
    except etree.XMLSchemaParseError as parse_err:
        # same exception type, with the schema location added to the message
        raise etree.XMLSchemaParseError('Failed to parse schema %s -- %s' %
                                        (error_uri, parse_err))
Beispiel #5
0
def loadSchema(uri, base_uri=None, override_proxy_requirement=False):
    """Return an :class:`lxml.etree.XMLSchema` built from the XSD document
    at ``uri`` (a filename or URL).

    Schemas are cached in ``_loaded_schemas`` keyed by ``uri`` only; a cached
    schema is returned without any further checks.

    Frequently loading a schema without a web proxy may use significant
    network resources and becomes unstable if the schema is unavailable.
    When ``HTTP_PROXY`` is absent from the environment and ``_http_uri(uri)``
    is true, this function therefore emits a ``UserWarning`` and returns
    ``None`` instead of a schema. With ``override_proxy_requirement`` set,
    the warning is logged instead and loading goes ahead.

    :param uri: filename or URL of the XSD document
    :param base_uri: optional base URL passed to ``etree.parse``
    :param override_proxy_requirement: load even when no proxy is configured
    :return: the schema, or ``None`` when loading was refused for lack of
        a proxy
    :raises IOError: when loading the document raises an IOError
    :raises etree.XMLSchemaParseError: when the schema cannot be parsed
    """
    # serve repeated requests from the cache
    if uri in _loaded_schemas:
        return _loaded_schemas[uri]

    # label used when reporting errors - mentions the base URI if there is one
    error_uri = uri if base_uri is None else uri + ' (base URI %s)' % base_uri

    # typical reliable use should include a proxy
    if 'HTTP_PROXY' not in os.environ and _http_uri(uri):
        message = (
            'Loading schema %s without a web proxy may introduce '
            'significant network resource usage as well as '
            'instability if that server becomes inaccessible. '
            'The HTTP_PROXY environment variable is required '
            'for loading schemas.  Schema validation will be disabled.'
        ) % (error_uri,)

        if not override_proxy_requirement:
            warnings.warn(message, UserWarning)
            # hand back None rather than a schema so that callers relying
            # on a loaded schema can detect its absence
            return None

        message += (' (overridden: Requesting without proxy. Please '
                    'set HTTP_PROXY as soon as possible.)')
        logger.warning(message)

    try:
        logger.debug('Loading schema %s' % uri)
        document = etree.parse(uri, parser=_get_xmlparser(),
                               base_url=base_uri)
        schema = etree.XMLSchema(document)
        _loaded_schemas[uri] = schema
        return schema
    except IOError as io_err:
        # same exception type, with the schema location added to the message
        raise IOError('Failed to load schema %s : %s' % (error_uri, io_err))
    except etree.XMLSchemaParseError as parse_err:
        # same exception type, with the schema location added to the message
        raise etree.XMLSchemaParseError('Failed to parse schema %s -- %s' %
                                        (error_uri, parse_err))
Beispiel #6
0
    def extract_data_types(cls, label_config):
        """Map each data placeholder in a labeling config to its tag name.

        A placeholder is a ``value`` attribute that starts with '$' and has
        at least one more character; the '$' is dropped from the key. Only
        tags below the root are inspected.

        :param label_config: labeling config as an XML string
        :return: dict of placeholder name -> tag name
        """
        # parse the config
        root = etree.fromstring(label_config, etree.XMLParser())
        if root is None:
            raise etree.XMLSchemaParseError('Project config is empty or incorrect')

        data_type = {}
        for node in root.findall('.//*[@value]'):
            raw = node.get('value')
            # skip literal values and a bare '$'
            if len(raw) <= 1 or raw[0] != '$':
                continue
            data_type[raw[1:]] = node.tag

        return data_type
Beispiel #7
0
def generate_sample_task_without_check(label_config, mode='upload'):
    """ Generate sample task only

    Builds an example task from the labeling config without validating it.
    Every tag below the root whose ``value`` attribute has the form ``$name``
    adds a ``name`` key; its sample is looked up in
    ``data_examples(mode=mode)`` by ``$name`` first, then by tag name, and
    defaults to 'Something'. Samples of TimeSeries tags are then replaced:
    a string sample becomes a '/samples/time-series.csv' URL, a dict sample
    becomes the output of ``generate_time_series_json``.

    :param label_config: labeling config as an XML string
    :param mode: forwarded unchanged to ``data_examples``
    :return: dict mapping placeholder names (without '$') to sample values
    """
    # load config
    parser = etree.XMLParser()
    xml = etree.fromstring(label_config, parser)
    if xml is None:
        raise etree.XMLSchemaParseError('Project config is empty or incorrect')

    # make examples pretty
    examples = data_examples(mode=mode)

    # iterate over xml tree and find values with '$'
    task = {}
    for p in xml.findall('.//*[@value]'):  # all tags with value attribute
        value = p.get('value')
        value_type = p.get('valueType', p.get('valuetype', None))

        # process List
        if p.tag == 'List':
            key = p.get('elementValue').replace('$', '')
            examples['List'] = [{key: 'Hello world'}, {key: 'Goodbye world'}]

        # valueType="url": choose which Text / TimeSeries example applies to
        # the current tag (TimeSeries defaults to the URL form)
        examples['Text'] = examples[
            'TextUrl'] if value_type == 'url' else examples['TextRaw']
        examples['TimeSeries'] = examples[
            'TimeSeriesUrl'] if value_type == 'url' or value_type is None else examples[
                'TimeSeriesRaw']

        if value and value[0] == '$':
            # try get example by variable name
            by_name = examples.get(value, None)
            # not found by name, try get example by type
            task[value[1:]] = examples.get(
                p.tag, 'Something') if by_name is None else by_name

    # TimeSeries special case
    for ts_tag in xml.findall('.//TimeSeries'):
        # Only placeholder values were given a task entry above. Skip the
        # rest instead of failing with KeyError, and derive the key exactly
        # as the first pass does (drop one leading '$', not all of them).
        ts_value = ts_tag.get('value')
        if not ts_value or ts_value[0] != '$':
            continue
        tag_value = ts_value[1:]

        time_column = ts_tag.get('timeColumn')
        value_columns = [
            ts_child.get('column') for ts_child in ts_tag
            if ts_child.tag == 'Channel'
        ]
        sep = ts_tag.get('sep')
        time_format = ts_tag.get('timeFormat')

        ts_task = task[tag_value]
        if isinstance(ts_task, str):
            # data is URL
            params = {'time': time_column, 'values': ','.join(value_columns)}
            if sep:
                params['sep'] = sep
            if time_format:
                params['tf'] = time_format
            task[tag_value] = '/samples/time-series.csv?' + urlencode(params)

        elif isinstance(ts_task, dict):
            # data is JSON
            task[tag_value] = generate_time_series_json(
                time_column, value_columns, time_format)
    return task
def generate_sample_task_without_check(label_config,
                                       mode='upload',
                                       secure_mode=False):
    """ Build an example task from a labeling config, without validating it

    Every tag below the root whose ``value`` attribute has the form ``$name``
    adds a ``name`` key to the result. The sample is chosen as follows:

    * a truthy entry of ``data_examples(mode=mode)`` under ``$name`` wins;
    * Paragraphs: the tag-type example is re-keyed with the tag's
      nameKey / textKey attributes (defaults 'author' / 'text');
    * TimeSeries: a '/samples/time-series.csv' URL when URLs are required,
      otherwise the output of ``generate_time_series_json``;
    * anything else: the example for the tag name, or 'Something'.

    URLs are required when ``secure_mode`` is set or the tag has
    valueType="url".

    :param label_config: labeling config as an XML string
    :param mode: forwarded unchanged to ``data_examples``
    :param secure_mode: serve objects as URLs regardless of valueType
    :return: dict mapping placeholder names (without '$') to sample values
    """
    # parse the config
    root = etree.fromstring(label_config, etree.XMLParser())
    if root is None:
        raise etree.XMLSchemaParseError('Project config is empty or incorrect')

    # pool of sample values, keyed by '$name' or by tag name
    examples = data_examples(mode=mode)

    task = {}
    for node in root.findall('.//*[@value]'):

        # only '$'-prefixed values are data placeholders
        placeholder = node.get('value')
        if not (placeholder and placeholder.startswith('$')):
            continue
        field = placeholder[1:]

        # secured mode or valueType="url": objects are served as URLs
        value_type = node.get('valueType') or node.get('valuetype')
        only_urls = secure_mode or value_type == 'url'

        # an example registered under the variable name takes precedence
        named_example = examples.get('$' + field)
        if named_example:
            task[field] = named_example
            continue

        if node.tag == 'Paragraphs':
            # rename the author/text keys if the tag asks for other names
            name_key = node.get('nameKey') or node.get('namekey') or 'author'
            text_key = node.get('textKey') or node.get('textkey') or 'text'
            task[field] = [
                {name_key: item['author'], text_key: item['text']}
                for item in examples[node.tag]
            ]
            continue

        if node.tag == 'TimeSeries':
            # signals are described by the tag and its Channel children
            time_column = node.get('timeColumn')
            value_columns = [
                child.get('column') for child in node
                if child.tag == 'Channel'
            ]
            sep = node.get('sep')
            time_format = node.get('timeFormat')

            if not only_urls:
                # data is JSON
                task[field] = generate_time_series_json(
                    time_column, value_columns, time_format)
                continue

            # data is URL
            params = {
                'time': time_column,
                'values': ','.join(value_columns)
            }
            if sep:
                params['sep'] = sep
            if time_format:
                params['tf'] = time_format
            task[field] = '/samples/time-series.csv?' + urlencode(params)
            continue

        # generic tag: pick the Text flavour for this tag, then look the
        # example up by tag type
        if only_urls:
            examples['Text'] = examples['TextUrl']
        else:
            examples['Text'] = examples['TextRaw']
        task[field] = examples.get(node.tag, 'Something')

    return task