Beispiel #1
0
    def parse(s):
        """Parse a single tag pattern, like #tag+foo-bar.

        @param s: the pattern text, or an already-parsed TagPattern.
        @return: a TagPattern (s itself when it is already a TagPattern).
        @raise hxl.HXLException: if s is empty/falsy or does not match TagPattern.PATTERN.
        @raise ValueError: if the pattern is absolute (third match group is "!") but lists excluded attributes.
        """
        # Guard clauses for the two edge cases: nothing to parse / nothing left to do.
        if not s:
            raise hxl.HXLException('Attempt to parse empty tag pattern')
        if isinstance(s, TagPattern):
            return s

        match = re.match(TagPattern.PATTERN, s)
        if not match:
            raise hxl.HXLException('Malformed tag: ' + s)

        tag = '#' + match.group(1).lower()

        # Splitting on a captured sign yields [prefix, sign, name, sign, name, ...],
        # so signs sit at the odd indices and attribute names at the following even ones.
        tokens = re.split(r'\s*([+-])', match.group(2))
        included = set()
        excluded = set()
        for sign, name in zip(tokens[1::2], tokens[2::2]):
            bucket = included if sign == '+' else excluded
            bucket.add(name.lower())

        is_absolute = (match.group(3) == '!')
        if is_absolute and excluded:
            raise ValueError('Exclusions not allowed in absolute patterns')

        return TagPattern(
            tag,
            include_attributes=included,
            exclude_attributes=excluded,
            is_absolute=is_absolute
        )
 def __next__(self):
     """Return the next line of input (including the new tags).

     Until tagging has succeeded once, self._add_tags() is called first. Rows held
     in self._cache are handed out before anything further is read from self.input.

     @return: the next row, taken from the cache if it is non-empty, else from self.input.
     @raise hxl.HXLException: if self._add_tags() returns a falsy value.
     """
     if not self._found_tags:
         # Tags not added yet: ask _add_tags() to find a match
         # (per the original comment, it searches the first 25 rows).
         if not self._add_tags():
             # no match: throw an exception
             raise hxl.HXLException("Tagging failed")
         self._found_tags = True

     if len(self._cache) == 0:
         # nothing buffered: read straight from the underlying input
         return next(self.input)

     # buffered rows are served first, oldest first
     return self._cache.pop(0)
Beispiel #3
0
 def operator_is(s, condition):
     """Apply an "is <condition>" test to a value.

     Note: this won't be called for aggregate values like "is min" or "is not max";
     for these, the aggregate will already be calculated, and a simple comparison
     operator substituted by L{calc_aggregate}.

     @param s: the value to test.
     @param condition: one of 'empty', 'not empty', 'number', 'not number', 'date', 'not date'.
     @return: the result of the matching hxl.datatypes check (negated for the 'not ...' forms).
     @raise hxl.HXLException: if the condition is not one of the names above.
     """
     # Ordered (name, predicate) pairs; the lambdas defer the hxl.datatypes lookup
     # until the matching condition is actually selected.
     predicates = (
         ('empty', lambda value: hxl.datatypes.is_empty(value)),
         ('not empty', lambda value: not hxl.datatypes.is_empty(value)),
         ('number', lambda value: hxl.datatypes.is_number(value)),
         ('not number', lambda value: not hxl.datatypes.is_number(value)),
         ('date', lambda value: hxl.datatypes.is_date(value)),
         # identity comparison kept as-is: only a literal False counts as "not a date"
         ('not date', lambda value: hxl.datatypes.is_date(value) is False),
     )
     for name, predicate in predicates:
         if condition == name:
             return predicate(s)

     raise hxl.HXLException('Unknown is condition: {}'.format(condition))
Beispiel #4
0
def from_spec(spec):
    """Build a full spec (including source) from a JSON-like data structure.

    Keys read from the spec: 'input' (required), 'allow_local', 'sheet_index',
    'timeout', 'verify_ssl', 'http_headers', 'encoding', 'tagger' and 'recipe'.

    @param spec: a dict-like object, or a JSON string that is decoded first.
    @return: the result of hxl.filters.from_recipe() applied to the opened source.
    @raise hxl.HXLException: if the spec has no (truthy) 'input' property.
    """
    if isinstance(spec, six.string_types):
        # a JSON string: decode it before reading any properties
        spec = json.loads(spec)

    # properties describing the data source
    input_spec = spec.get('input')
    input_options = {
        'allow_local': spec.get('allow_local', False),
        'sheet_index': spec.get('sheet_index', None),
        'timeout': spec.get('timeout', None),
        'verify_ssl': spec.get('verify_ssl', True),
        'http_headers': spec.get('http_headers', None),
        'encoding': spec.get('encoding', None),
    }

    # properties describing the processing steps
    tagger_spec = spec.get('tagger', None)
    recipe_spec = spec.get('recipe', [])

    if not input_spec:
        raise hxl.HXLException("No input property specified.")

    # open the raw input
    data_input = make_input(raw_source=input_spec, **input_options)

    # autotag when a tagger spec is present; otherwise read the data as HXL directly
    if tagger_spec:
        source = hxl.converters.Tagger._load(data_input, tagger_spec)
    else:
        source = HXLReader(data_input)

    # compile the main recipe on top of the source
    return hxl.filters.from_recipe(source=source, recipe=recipe_spec)
Beispiel #5
0
 def parse(raw_string, header=None, use_exception=False, column_number=None):
     """Attempt to parse a full hashtag specification.

     @param raw_string: the hashtag text to parse, or an existing Column.
     @param header: passed through to the new Column as its header.
     @param use_exception: if True, raise on a malformed tag instead of returning None.
     @param column_number: passed through to the new Column as its column number.
     @return: a Column (raw_string itself when it is already a Column), or None
         when the text does not match Column.PATTERN and use_exception is false.
     @raise hxl.HXLException: if the text does not match and use_exception is true.
     """
     # Nothing to do for something that is already a Column.
     if isinstance(raw_string, Column):
         return raw_string

     match = re.match(Column.PATTERN, raw_string)
     if not match:
         if use_exception:
             raise hxl.HXLException("Malformed tag expression: " + raw_string)
         return None

     tag = match.group(1)
     attribute_string = match.group(2)
     # Trim surrounding whitespace and '+' signs so the split yields only attribute names.
     attributes = (
         re.split(r'\s*\+', attribute_string.strip().strip('+'))
         if attribute_string else []
     )
     return Column(tag=tag, attributes=attributes, header=header, column_number=column_number)
Beispiel #6
0
def make_input(raw_source,
               allow_local=False,
               sheet_index=None,
               timeout=None,
               verify_ssl=True,
               http_headers=None,
               selector=None,
               encoding=None):
    """Figure out what kind of input to create.

    Can detect a URL or filename, an input stream, or an array.
    Will also try to detect HTML and Excel before defaulting to CSV.
    The result is an object that can deliver rows of data for the HXL library to parse.

    @param raw_source: the raw data source (e.g. a URL or input stream).
    @param allow_local: if True, allow opening local files as well as remote URLs (default: False).
    @param sheet_index: if a number, read that sheet from an Excel workbook (default: None).
    @param timeout: if supplied, time out an HTTP(S) request after the specified number of seconds with no data received (default: None)
    @param verify_ssl: if False, don't try to verify SSL certificates (e.g. for self-signed certs).
    @param http_headers: an optional dict of HTTP headers to send with a request.
    @param selector: a property to select the data in a JSON record (may later extend to spreadsheet tabs).
    @param encoding: specify a character encoding; if None, the encoding reported when opening a URL/file is used, falling back to UTF-8.
    @return: an object belonging to a subclass of AbstractInput, returning rows of raw data.
    @raise HXLIOException: if the content has an XLSX/zip signature but is neither a readable workbook nor a zip archive containing a .csv member.
    """
    def make_tempfile(input):
        """Copy a stream into a rewound temporary file and close the stream."""
        tmpfile = tempfile.NamedTemporaryFile()
        shutil.copyfileobj(input, tmpfile)
        tmpfile.seek(0)
        input.close()
        return tmpfile  # have to return the object, so it doesn't get garbage collected and delete the file

    def wrap_stream(stream):
        """Return a stream that supports peek(), buffering it if necessary."""
        if hasattr(stream, 'peek'):
            # already buffered
            return stream
        # Wrap exactly once (the previous code applied RawIOWrapper twice).
        return io.BufferedReader(io_wrapper.RawIOWrapper(stream))

    def match_sigs(sig, sigs):
        """Return True if sig starts with any of the signatures in sigs."""
        return any(sig.startswith(s) for s in sigs)

    if isinstance(raw_source, AbstractInput):
        # already an input source: no op
        return raw_source

    elif hasattr(raw_source,
                 '__len__') and (not isinstance(raw_source, six.string_types)):
        # it's an array
        logger.debug('Making input from an array')
        return ArrayInput(raw_source)

    else:
        mime_type = None
        file_ext = None

        if hasattr(raw_source, 'read'):
            # it's an input stream
            logger.debug('Making input from a stream')
            input = wrap_stream(raw_source)
        else:
            # assume a URL or filename
            logger.debug('Opening source %s as a URL or file', raw_source)
            (input, mime_type, file_ext,
             specified_encoding) = open_url_or_file(raw_source,
                                                    allow_local=allow_local,
                                                    timeout=timeout,
                                                    verify_ssl=verify_ssl,
                                                    http_headers=http_headers)
            input = wrap_stream(input)
            if encoding is None:  # if no encoding was provided, use the inferred one
                if specified_encoding:
                    encoding = specified_encoding

        if not encoding:  # if we still have no character encoding, default to UTF-8
            encoding = "utf-8"

        # look at the leading bytes without consuming them
        sig = input.peek(4)[:4]

        if (mime_type in HTML5_MIME_TYPES) or match_sigs(sig, HTML5_SIGS):
            # NOTE(review): this only logs the problem and then carries on with the
            # remaining format checks - confirm that not raising here is intended.
            # logger.error() rather than logger.exception(): we are not inside an
            # except handler, so there is no active traceback to attach.
            logger.error(
                hxl.HXLException(
                    "Received HTML5 markup.\nCheck that the resource (e.g. a Google Sheet) is publicly readable.",
                    {
                        'input':
                        input,
                        'source':
                        raw_source,
                        'munged':
                        munge_url(raw_source, http_headers=http_headers)
                        if str(raw_source).startswith('http') else None
                    }))

        if match_sigs(sig, XLS_SIGS):  # legacy XLS Excel workbook
            tmpfile = make_tempfile(input)
            return XLSInput(tmpfile, sheet_index=sheet_index)

        if match_sigs(
                sig, XLSX_SIGS
        ):  # superset of ZIP_SIGS - could be zipfile or XLSX Excel workbook
            tmpfile = make_tempfile(input)

            try:
                logger.debug('Trying input from an Excel file')
                return XLSXInput(tmpfile, sheet_index=sheet_index)
            except Exception:
                # Not a readable workbook: fall back to treating it as a plain zip
                # archive. (Catching Exception rather than everything lets
                # KeyboardInterrupt/SystemExit propagate.)
                if match_sigs(sig, ZIP_SIGS):  # more-restrictive
                    # the context manager releases the archive handle on every exit path
                    with zipfile.ZipFile(tmpfile, "r") as zf:
                        for name in zf.namelist():
                            if os.path.splitext(name)[1].lower() == ".csv":
                                # use the first .csv member found
                                return CSVInput(wrap_stream(
                                    io.BytesIO(zf.read(name))),
                                                encoding=encoding)

            raise HXLIOException(
                "Cannot find CSV file or Excel content in zip archive")

        elif (mime_type in JSON_MIME_TYPES) or (
                file_ext in JSON_FILE_EXTS) or match_sigs(sig, JSON_SIGS):
            logger.debug('Trying to make input as JSON')
            return JSONInput(input, selector=selector, encoding=encoding)

        # fall back to CSV if all else fails
        logger.debug('Making input from CSV')
        return CSVInput(input, encoding=encoding)
Beispiel #7
0
def make_input(raw_source,
               allow_local=False,
               sheet_index=None,
               timeout=None,
               verify_ssl=True,
               http_headers=None,
               selector=None):
    """Figure out what kind of input to create.

    Can detect a URL or filename, an input stream, or an array.
    Will also try to detect HTML and Excel before defaulting to CSV.
    The result is an object that can deliver rows of data for the HXL library to parse.

    @param raw_source: the raw data source (e.g. a URL or input stream).
    @param allow_local: if True, allow opening local files as well as remote URLs (default: False).
    @param sheet_index: if a number, read that sheet from an Excel workbook (default: None).
    @param timeout: if supplied, time out an HTTP(S) request after the specified number of seconds with no data received (default: None)
    @param verify_ssl: if False, don't try to verify SSL certificates (e.g. for self-signed certs).
    @param http_headers: an optional dict of HTTP headers to send with a request.
    @param selector: a property to select the data in a JSON record (may later extend to spreadsheet tabs).
    @return: an object belonging to a subclass of AbstractInput, returning rows of raw data.
    """
    def wrap_stream(stream):
        """Return a stream that supports peek(), buffering it if necessary."""
        if hasattr(stream, 'peek'):
            # already buffered
            return stream
        # Wrap exactly once (the previous code applied RawIOWrapper twice).
        return io.BufferedReader(io_wrapper.RawIOWrapper(stream))

    def match_sigs(sig, sigs):
        """Return True if sig starts with any of the signatures in sigs."""
        return any(sig.startswith(s) for s in sigs)

    if isinstance(raw_source, AbstractInput):
        # already an input source: no op
        return raw_source

    elif hasattr(raw_source,
                 '__len__') and (not isinstance(raw_source, six.string_types)):
        # it's an array
        logger.debug('Making input from an array')
        return ArrayInput(raw_source)

    else:
        mime_type = None
        file_ext = None

        if hasattr(raw_source, 'read'):
            # it's an input stream
            logger.debug('Making input from a stream')
            input = wrap_stream(raw_source)
        else:
            # assume a URL or filename
            logger.debug('Opening source %s as a URL or file', raw_source)
            # the fourth value (an encoding) is not used anywhere in this function
            (input, mime_type, file_ext,
             _encoding) = open_url_or_file(raw_source,
                                           allow_local=allow_local,
                                           timeout=timeout,
                                           verify_ssl=verify_ssl,
                                           http_headers=http_headers)
            input = wrap_stream(input)

        # look at the leading bytes without consuming them
        sig = input.peek(4)[:4]

        if (mime_type in HTML5_MIME_TYPES) or match_sigs(sig, HTML5_SIGS):
            # NOTE(review): this only logs the problem and then carries on with the
            # remaining format checks - confirm that not raising here is intended.
            # logger.error() rather than logger.exception(): we are not inside an
            # except handler, so there is no active traceback to attach.
            logger.error(
                hxl.HXLException(
                    "Received HTML5 markup.\nCheck that the resource (e.g. a Google Sheet) is publicly readable.",
                    {
                        'input':
                        input,
                        'source':
                        raw_source,
                        'munged':
                        munge_url(raw_source)
                        if str(raw_source).startswith('http') else None
                    }))

        elif (mime_type in ZIP_MIME_TYPES) or (file_ext in ZIP_FILE_EXTS):
            # Reading the archive drains the stream, so keep the bytes around.
            archive_bytes = input.read()
            # the context manager releases the archive handle on every exit path
            with zipfile.ZipFile(io.BytesIO(archive_bytes), "r") as zf:
                for name in zf.namelist():
                    if os.path.splitext(name)[1].lower() == ".csv":
                        # use the first .csv member found
                        return CSVInput(wrap_stream(io.BytesIO(zf.read(name))))
            # No CSV member: rebuild the stream so the checks below see the full
            # content instead of an exhausted stream.
            input = wrap_stream(io.BytesIO(archive_bytes))

        if (mime_type in EXCEL_MIME_TYPES) or (
                file_ext in EXCEL_FILE_EXTS) or match_sigs(sig, EXCEL_SIGS):
            logger.debug('Making input from an Excel file')
            return ExcelInput(input, sheet_index=sheet_index)

        elif (mime_type in JSON_MIME_TYPES) or (
                file_ext in JSON_FILE_EXTS) or match_sigs(sig, JSON_SIGS):
            logger.debug('Trying to make input as JSON')
            return JSONInput(input, selector=selector)

        # fall back to CSV if all else fails
        logger.debug('Making input from CSV')
        return CSVInput(input)