def parse(s):
    """Parse a single tag pattern, like #tag+foo-bar."""
    if not s:
        # edge case: null value
        raise hxl.HXLException('Attempt to parse empty tag pattern')
    elif isinstance(s, TagPattern):
        # edge case: already parsed
        return s

    result = re.match(TagPattern.PATTERN, s)
    if result:
        tag = '#' + result.group(1).lower()
        include_attributes = set()
        exclude_attributes = set()
        attribute_specs = re.split(r'\s*([+-])', result.group(2))
        for i in range(1, len(attribute_specs), 2):
            if attribute_specs[i] == '+':
                include_attributes.add(attribute_specs[i + 1].lower())
            else:
                exclude_attributes.add(attribute_specs[i + 1].lower())
        if result.group(3) == '!':
            is_absolute = True
            if exclude_attributes:
                raise ValueError('Exclusions not allowed in absolute patterns')
        else:
            is_absolute = False
        return TagPattern(
            tag,
            include_attributes=include_attributes,
            exclude_attributes=exclude_attributes,
            is_absolute=is_absolute
        )
    else:
        raise hxl.HXLException('Malformed tag: ' + s)
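# Usage sketch (illustrative, not part of the original source). Assumes this is
# the static TagPattern.parse from hxl.model; the property names checked in the
# comments below are assumptions about the returned object.
from hxl.model import TagPattern   # assumed module path

pattern = TagPattern.parse('#affected+f-adults')
# expected: pattern.tag == '#affected', with 'f' required and 'adults' excluded
absolute = TagPattern.parse('#adm1+code!')   # absolute pattern: exact attribute match
same = TagPattern.parse(pattern)             # already parsed, returned unchanged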
def __next__(self):
    """Return the next line of input (including the new tags)."""
    if not self._found_tags:
        # Search the first 25 rows for a match.
        if self._add_tags():
            self._found_tags = True
        else:
            # if no match, throw an exception
            raise hxl.HXLException("Tagging failed")
    if len(self._cache) > 0:
        # read from the cache first
        return self._cache.pop(0)
    else:
        return next(self.input)
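# Hedged consumption sketch. Assumes this __next__ belongs to
# hxl.converters.Tagger, that the class is iterable, and that its constructor
# accepts (header substring, hashtag) pairs as specs; these are assumptions
# about the surrounding class, not confirmed by the excerpt above.
input = make_input('https://example.org/untagged.csv')   # placeholder URL
tagger = Tagger(input, specs=[('organisation', '#org'), ('cluster', '#sector')])
for raw_row in tagger:
    # original rows plus the hashtag row generated from the matched headers
    print(raw_row)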
def operator_is(s, condition):
    """Advanced tests

    Note: this won't be called for aggregate values like "is min" or
    "is not max"; for these, the aggregate will already be calculated,
    and a simple comparison operator substituted by L{calc_aggregate}.
    """
    if condition == 'empty':
        return hxl.datatypes.is_empty(s)
    elif condition == 'not empty':
        return not hxl.datatypes.is_empty(s)
    elif condition == 'number':
        return hxl.datatypes.is_number(s)
    elif condition == 'not number':
        return not hxl.datatypes.is_number(s)
    elif condition == 'date':
        return hxl.datatypes.is_date(s)
    elif condition == 'not date':
        return not hxl.datatypes.is_date(s)
    else:
        raise hxl.HXLException('Unknown is condition: {}'.format(condition))
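# Illustrative calls (assuming hxl.datatypes.is_empty/is_number/is_date behave
# as their names suggest); these are examples, not tests from the original.
operator_is('', 'empty')             # True: empty string
operator_is('100', 'number')         # True
operator_is('2025-03-01', 'date')    # True
operator_is('N/A', 'not number')     # True: not parseable as a number
operator_is('100', 'biggest')        # raises hxl.HXLException (unknown condition)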
def from_spec(spec):
    """Build a full spec (including source) from a JSON-like data structure."""
    if isinstance(spec, six.string_types):
        # a JSON string (parse it first)
        spec = json.loads(spec)

    # source
    input_spec = spec.get('input')
    allow_local = spec.get('allow_local', False)
    sheet_index = spec.get('sheet_index', None)
    timeout = spec.get('timeout', None)
    verify_ssl = spec.get('verify_ssl', True)
    http_headers = spec.get('http_headers', None)
    encoding = spec.get('encoding', None)

    # recipe
    tagger_spec = spec.get('tagger', None)
    recipe_spec = spec.get('recipe', [])

    if not input_spec:
        raise hxl.HXLException("No input property specified.")

    # set up the input
    input = make_input(
        raw_source=input_spec,
        allow_local=allow_local,
        sheet_index=sheet_index,
        timeout=timeout,
        verify_ssl=verify_ssl,
        http_headers=http_headers,
        encoding=encoding
    )

    # autotag if requested
    if tagger_spec:
        source = hxl.converters.Tagger._load(input, tagger_spec)
    else:
        source = HXLReader(input)

    # compile the main recipe
    return hxl.filters.from_recipe(source=source, recipe=recipe_spec)
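# Hedged example of the JSON-like spec this function expects. The recipe entry
# follows the HXL recipe convention ("filter" plus options); the URL and query
# below are placeholders, not values from the original source.
spec = {
    "input": "https://example.org/dataset.csv",
    "allow_local": False,
    "recipe": [
        {"filter": "with_rows", "queries": "sector=WASH"},
    ],
}
source = from_spec(spec)   # a JSON string with the same structure also works
for row in source:
    print(row.get('#org'))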
def parse(raw_string, header=None, use_exception=False, column_number=None):
    """Attempt to parse a full hashtag specification."""
    # Already parsed?
    if isinstance(raw_string, Column):
        return raw_string

    # Pattern for a single tag
    result = re.match(Column.PATTERN, raw_string)
    if result:
        tag = result.group(1)
        attribute_string = result.group(2)
        if attribute_string:
            attributes = re.split(r'\s*\+', attribute_string.strip().strip('+'))
        else:
            attributes = []
        return Column(tag=tag, attributes=attributes, header=header, column_number=column_number)
    else:
        if use_exception:
            raise hxl.HXLException("Malformed tag expression: " + raw_string)
        else:
            return None
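# Usage sketch (illustrative). Assumes this is Column.parse from hxl.model; the
# attribute names read back below are assumptions about the Column object.
column = Column.parse('#affected+f+children', header='Affected girls')
# expected: column.tag == '#affected' with attributes ('f', 'children')
missing = Column.parse('not a hashtag')              # returns None
Column.parse('not a hashtag', use_exception=True)    # raises hxl.HXLException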
def make_input(raw_source, allow_local=False, sheet_index=None, timeout=None,
               verify_ssl=True, http_headers=None, selector=None, encoding=None):
    """Figure out what kind of input to create.

    Can detect a URL or filename, an input stream, or an array. Will also try
    to detect HTML and Excel before defaulting to CSV. The result is an object
    that can deliver rows of data for the HXL library to parse.

    @param raw_source: the raw data source (e.g. a URL or input stream).
    @param allow_local: if True, allow opening local files as well as remote URLs (default: False).
    @param sheet_index: if a number, read that sheet from an Excel workbook (default: None).
    @param timeout: if supplied, time out an HTTP(S) request after the specified number of seconds with no data received (default: None).
    @param verify_ssl: if False, don't try to verify SSL certificates (e.g. for self-signed certs).
    @param http_headers: an optional dict of HTTP headers to send with a request.
    @param selector: a property to select the data in a JSON record (may later extend to spreadsheet tabs).
    @param encoding: specify a character encoding.
    @return: an object belonging to a subclass of AbstractInput, returning rows of raw data.
    """

    def make_tempfile(input):
        tmpfile = tempfile.NamedTemporaryFile()
        shutil.copyfileobj(input, tmpfile)
        tmpfile.seek(0)
        input.close()
        # have to return the object, so it doesn't get garbage collected and delete the file
        return tmpfile

    def wrap_stream(stream):
        if hasattr(stream, 'peek'):
            # already buffered
            return stream
        else:
            stream = io_wrapper.RawIOWrapper(stream)
            return io.BufferedReader(io_wrapper.RawIOWrapper(stream))

    def match_sigs(sig, sigs):
        for s in sigs:
            if sig.startswith(s):
                return True
        return False

    if isinstance(raw_source, AbstractInput):
        # already an input source: no op
        return raw_source

    elif hasattr(raw_source, '__len__') and (not isinstance(raw_source, six.string_types)):
        # it's an array
        logger.debug('Making input from an array')
        return ArrayInput(raw_source)

    else:
        mime_type = None
        file_ext = None

        if hasattr(raw_source, 'read'):
            # it's an input stream
            logger.debug('Making input from a stream')
            input = wrap_stream(raw_source)
        else:
            # assume a URL or filename
            logger.debug('Opening source %s as a URL or file', raw_source)
            (input, mime_type, file_ext, specified_encoding) = open_url_or_file(
                raw_source,
                allow_local=allow_local,
                timeout=timeout,
                verify_ssl=verify_ssl,
                http_headers=http_headers
            )
            input = wrap_stream(input)
            if encoding is None:
                # if no encoding was provided, use the inferred one
                if specified_encoding:
                    encoding = specified_encoding

        if not encoding:
            # if we still have no character encoding, default to UTF-8
            encoding = "utf-8"

        sig = input.peek(4)[:4]

        if (mime_type in HTML5_MIME_TYPES) or match_sigs(sig, HTML5_SIGS):
            logger.exception(hxl.HXLException(
                "Received HTML5 markup.\nCheck that the resource (e.g. a Google Sheet) is publicly readable.",
                {
                    'input': input,
                    'source': raw_source,
                    'munged': munge_url(raw_source, http_headers=http_headers) if str(raw_source).startswith('http') else None
                }
            ))

        if match_sigs(sig, XLS_SIGS):
            # legacy XLS Excel workbook
            tmpfile = make_tempfile(input)
            return XLSInput(tmpfile, sheet_index=sheet_index)

        if match_sigs(sig, XLSX_SIGS):
            # superset of ZIP_SIGS - could be zipfile or XLSX Excel workbook
            tmpfile = make_tempfile(input)
            try:
                logger.debug('Trying input from an Excel file')
                return XLSXInput(tmpfile, sheet_index=sheet_index)
            except:
                if match_sigs(sig, ZIP_SIGS):
                    # more-restrictive
                    zf = zipfile.ZipFile(tmpfile, "r")
                    for name in zf.namelist():
                        if os.path.splitext(name)[1].lower() == ".csv":
                            return CSVInput(wrap_stream(io.BytesIO(zf.read(name))), encoding=encoding)
                raise HXLIOException("Cannot find CSV file or Excel content in zip archive")

        elif (mime_type in JSON_MIME_TYPES) or (file_ext in JSON_FILE_EXTS) or match_sigs(sig, JSON_SIGS):
            logger.debug('Trying to make input as JSON')
            return JSONInput(input, selector=selector, encoding=encoding)

        # fall back to CSV if all else fails
        logger.debug('Making input from CSV')
        return CSVInput(input, encoding=encoding)
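# Hedged usage sketch. Assumes this is make_input from hxl.input (hxl.io in
# older releases) and that HXLReader is the companion parser referenced in
# from_spec above; the URL is a placeholder.
input = make_input(
    'https://example.org/data.xlsx',
    sheet_index=0,
    timeout=30,
    verify_ssl=True,
)
source = HXLReader(input)   # parse the detected format (Excel here) as HXL
for row in source:
    print(row.get('#adm1'))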
def make_input(raw_source, allow_local=False, sheet_index=None, timeout=None,
               verify_ssl=True, http_headers=None, selector=None):
    """Figure out what kind of input to create.

    Can detect a URL or filename, an input stream, or an array. Will also try
    to detect HTML and Excel before defaulting to CSV. The result is an object
    that can deliver rows of data for the HXL library to parse.

    @param raw_source: the raw data source (e.g. a URL or input stream).
    @param allow_local: if True, allow opening local files as well as remote URLs (default: False).
    @param sheet_index: if a number, read that sheet from an Excel workbook (default: None).
    @param timeout: if supplied, time out an HTTP(S) request after the specified number of seconds with no data received (default: None).
    @param verify_ssl: if False, don't try to verify SSL certificates (e.g. for self-signed certs).
    @param http_headers: an optional dict of HTTP headers to send with a request.
    @param selector: a property to select the data in a JSON record (may later extend to spreadsheet tabs).
    @return: an object belonging to a subclass of AbstractInput, returning rows of raw data.
    """

    def wrap_stream(stream):
        if hasattr(stream, 'peek'):
            # already buffered
            return stream
        else:
            stream = io_wrapper.RawIOWrapper(stream)
            return io.BufferedReader(io_wrapper.RawIOWrapper(stream))

    def match_sigs(sig, sigs):
        for s in sigs:
            if sig.startswith(s):
                return True
        return False

    if isinstance(raw_source, AbstractInput):
        # already an input source: no op
        return raw_source

    elif hasattr(raw_source, '__len__') and (not isinstance(raw_source, six.string_types)):
        # it's an array
        logger.debug('Making input from an array')
        return ArrayInput(raw_source)

    else:
        mime_type = None
        file_ext = None
        encoding = None

        if hasattr(raw_source, 'read'):
            # it's an input stream
            logger.debug('Making input from a stream')
            input = wrap_stream(raw_source)
        else:
            # assume a URL or filename
            logger.debug('Opening source %s as a URL or file', raw_source)
            (input, mime_type, file_ext, encoding) = open_url_or_file(
                raw_source,
                allow_local=allow_local,
                timeout=timeout,
                verify_ssl=verify_ssl,
                http_headers=http_headers
            )
            input = wrap_stream(input)

        sig = input.peek(4)[:4]

        if (mime_type in HTML5_MIME_TYPES) or match_sigs(sig, HTML5_SIGS):
            logger.exception(hxl.HXLException(
                "Received HTML5 markup.\nCheck that the resource (e.g. a Google Sheet) is publicly readable.",
                {
                    'input': input,
                    'source': raw_source,
                    'munged': munge_url(raw_source) if str(raw_source).startswith('http') else None
                }
            ))

        elif (mime_type in ZIP_MIME_TYPES) or (file_ext in ZIP_FILE_EXTS):
            zf = zipfile.ZipFile(io.BytesIO(input.read()), "r")
            for name in zf.namelist():
                if os.path.splitext(name)[1].lower() == ".csv":
                    return CSVInput(wrap_stream(io.BytesIO(zf.read(name))))

        if (mime_type in EXCEL_MIME_TYPES) or (file_ext in EXCEL_FILE_EXTS) or match_sigs(sig, EXCEL_SIGS):
            logger.debug('Making input from an Excel file')
            return ExcelInput(input, sheet_index=sheet_index)

        elif (mime_type in JSON_MIME_TYPES) or (file_ext in JSON_FILE_EXTS) or match_sigs(sig, JSON_SIGS):
            logger.debug('Trying to make input as JSON')
            return JSONInput(input, selector=selector)

        # fall back to CSV if all else fails
        logger.debug('Making input from CSV')
        return CSVInput(input)