def __init__(self):
    """Load the Universal Sentence Encoder (TF Hub v2) for sentence embedding.

    Imports tensorflow and tensorflow_hub lazily, raising ImportError with an
    install hint if either is missing.
    """
    message = "In order to use the UniversalSentenceEncoder primitive install 'nlp_primitives[complete]'"
    self.tf = import_or_raise("tensorflow", message)
    hub = import_or_raise("tensorflow_hub", message)
    # Hub v2 modules run in graph mode, so eager execution must be off.
    self.tf.compat.v1.disable_eager_execution()
    self.module_url = "https://tfhub.dev/google/universal-sentence-encoder/2"
    self.embed = hub.Module(self.module_url)
    # The encoder always produces 512-dimensional sentence embeddings.
    self.number_output_features = 512
    self.n = 512
def save(self, location, profile_name):
    """Serialize features and write them to the given destination.

    Args:
        location: None to return a JSON string, a str for a local path or
            S3 path, or a writable file-like object.
        profile_name: AWS profile for S3 writes; False for anonymous access.

    Raises:
        ValueError: If ``location`` is a URL (writing to URLs is unsupported).
    """
    features_dict = self.to_dict()
    if location is None:
        # No destination: hand back the serialized JSON directly.
        return json.dumps(features_dict)
    if not isinstance(location, str):
        # Anything non-string is treated as an open file-like object.
        json.dump(features_dict, location)
        return
    if _is_url(location):
        raise ValueError("Writing to URLs is not supported")
    if _is_s3(location):
        boto3 = import_or_raise("boto3", BOTO3_ERR_MSG)
        session = boto3.Session()
        if isinstance(profile_name, str):
            # Named profile: route smart_open through that profile's session.
            transport_params = {'session': boto3.Session(profile_name=profile_name)}
            use_smartopen_features(location, features_dict, transport_params, read=False)
        elif profile_name is False:
            # Explicitly anonymous: fall back to s3fs.
            use_s3fs_features(location, features_dict, read=False)
        elif session.get_credentials() is not None:
            use_smartopen_features(location, features_dict, read=False)
        else:
            use_s3fs_features(location, features_dict, read=False)
    else:
        with open(location, "w") as f:
            json.dump(features_dict, f)
def use_s3fs_es(file_path, path, read=True):
    """Transfer an entityset archive between local disk and S3 anonymously.

    Args:
        file_path (str): Local file location.
        path (str): S3 location.
        read (bool): Download from S3 when True, upload to S3 when False.
    """
    s3fs = import_or_raise("s3fs", S3FS_ERR_MSG)
    filesystem = s3fs.S3FileSystem(anon=True)
    if read:
        filesystem.get(path, file_path)
    else:
        filesystem.put(file_path, path)
def write_data_description(entityset, path, profile_name=None, **kwargs):
    '''Serialize entityset to data description and write to disk or S3 path.

    Args:
        entityset (EntitySet) : Instance of :class:`.EntitySet`.
        path (str) : Location on disk or S3 path to write `data_description.json` and entity data.
        profile_name (str, bool): The AWS profile specified to write to S3. Will default to None and search for AWS credentials. Set to False to use an anonymous profile.
        kwargs (keywords) : Additional keyword arguments to pass as keywords arguments to the underlying serialization method or to specify AWS profile.
    '''
    if _is_s3(path):
        boto3 = import_or_raise("boto3", BOTO3_ERR_MSG)
        # Stage the serialized entityset in a temp dir, archive it, then upload.
        with tempfile.TemporaryDirectory() as tmpdir:
            os.makedirs(os.path.join(tmpdir, 'data'))
            dump_data_description(entityset, tmpdir, **kwargs)
            archive_path = create_archive(tmpdir)
            session = boto3.Session()
            if isinstance(profile_name, str):
                # Named profile: upload through that profile's session.
                transport_params = {'session': boto3.Session(profile_name=profile_name)}
                use_smartopen_es(archive_path, path, transport_params, read=False)
            elif profile_name is False:
                # Explicitly anonymous: use s3fs instead of smart_open.
                use_s3fs_es(archive_path, path, read=False)
            elif session.get_credentials() is not None:
                use_smartopen_es(archive_path, path, read=False)
            else:
                use_s3fs_es(archive_path, path, read=False)
    elif _is_url(path):
        raise ValueError("Writing to URLs is not supported")
    else:
        # Local path: write the description and data directly in place.
        path = os.path.abspath(path)
        os.makedirs(os.path.join(path, 'data'), exist_ok=True)
        dump_data_description(entityset, path, **kwargs)
def get_transport_params(profile_name):
    """Build smart_open transport params (``client`` style) for S3 access.

    Args:
        profile_name (str, bool, None): Named AWS profile, False for
            anonymous (unsigned) access, or None to use default credentials.

    Returns:
        dict or None: ``{'client': <s3 client>}``, or None when the default
        session already has usable credentials.
    """
    boto3 = import_or_raise("boto3", BOTO3_ERR_MSG)
    UNSIGNED = import_or_raise("botocore", BOTOCORE_ERR_MSG).UNSIGNED
    Config = import_or_raise("botocore.config", BOTOCORE_ERR_MSG).Config
    if isinstance(profile_name, str):
        # Named profile: signed client from that profile's credentials.
        client = boto3.Session(profile_name=profile_name).client('s3')
        return {'client': client}
    if profile_name is False or boto3.Session().get_credentials() is None:
        # Anonymous access: issue unsigned requests.
        client = boto3.Session().client('s3', config=Config(signature_version=UNSIGNED))
        return {'client': client}
    return None
def load(cls, features, profile_name):
    """Deserialize features from JSON text, a URL, an S3 path, a local file,
    or an open file-like object, and construct an instance of ``cls``.

    Args:
        features: JSON string, location string, or file-like object.
        profile_name: AWS profile for S3 reads; False for anonymous access.
    """
    if not isinstance(features, str):
        # Non-string input is assumed to be an open file-like object.
        return cls(json.load(features))
    try:
        # First try to parse the string as raw JSON.
        features_dict = json.loads(features)
    except ValueError:
        # Not raw JSON — interpret the string as a location instead.
        if _is_url(features):
            features_dict = use_smartopen_features(features)
        elif _is_s3(features):
            boto3 = import_or_raise("boto3", BOTO3_ERR_MSG)
            session = boto3.Session()
            if isinstance(profile_name, str):
                transport_params = {
                    'session': boto3.Session(profile_name=profile_name)
                }
                features_dict = use_smartopen_features(
                    features, transport_params)
            elif profile_name is False:
                features_dict = use_s3fs_features(features)
            elif session.get_credentials() is not None:
                features_dict = use_smartopen_features(features)
            else:
                features_dict = use_s3fs_features(features)
        else:
            with open(features, 'r') as f:
                features_dict = json.load(f)
    return cls(features_dict)
def get_transport_params(profile_name):
    """Build smart_open transport params (``session`` style) for S3 access.

    Args:
        profile_name (str, bool, None): Named AWS profile, False for
            anonymous (unsigned) access, or None to use default credentials.

    Returns:
        dict or None: Transport params for smart_open, or None when the
        default session already has usable credentials.
    """
    boto3 = import_or_raise("boto3", BOTO3_ERR_MSG)
    UNSIGNED = import_or_raise("botocore", BOTOCORE_ERR_MSG).UNSIGNED
    Config = import_or_raise("botocore.config", BOTOCORE_ERR_MSG).Config
    if isinstance(profile_name, str):
        # Named profile: hand smart_open a session built from that profile.
        return {'session': boto3.Session(profile_name=profile_name)}
    if profile_name is False or boto3.Session().get_credentials() is None:
        # Anonymous access: issue unsigned requests via resource kwargs.
        return {'resource_kwargs': {'config': Config(signature_version=UNSIGNED)}}
    return None
def use_smartopen_features(path, features_dict=None, transport_params=None, read=True):
    """Read or write a features dict as JSON via smart_open.

    Args:
        path (str): Local, URL, or S3 location.
        features_dict (dict): Features to write (ignored when reading).
        transport_params (dict): smart_open transport configuration.
        read (bool): Load and return the dict when True, write it when False.
    """
    # Avoid shadowing the builtin: bind smart_open's opener to its own name.
    smart_open = import_or_raise("smart_open", SMART_OPEN_ERR_MSG)
    open_fn = smart_open.open
    if read:
        with open_fn(path, 'r', encoding='utf-8', transport_params=transport_params) as f:
            return json.load(f)
    with open_fn(path, "w", transport_params=transport_params) as f:
        json.dump(features_dict, f)
def use_smartopen_es(file_path, path, transport_params=None, read=True):
    """Copy an entityset archive between a local file and a remote location.

    Args:
        file_path (str): Local file location.
        path (str): Remote (URL or S3) location.
        transport_params (dict): smart_open transport configuration.
        read (bool): Download remote -> local when True, upload when False.
    """
    smart_open = import_or_raise("smart_open", SMART_OPEN_ERR_MSG)
    open_fn = smart_open.open
    if read:
        source, dest = (path, "rb", transport_params), (file_path, 'wb', None)
    else:
        source, dest = (file_path, 'rb', None), (path, 'wb', transport_params)
    with open_fn(source[0], source[1], transport_params=source[2]) as fin:
        with open_fn(dest[0], dest[1], transport_params=dest[2]) as fout:
            shutil.copyfileobj(fin, fout)
def use_s3fs_features(file_path, features_dict=None, read=True):
    """Read or write a features dict as JSON on S3 using anonymous s3fs.

    Args:
        file_path (str): S3 location of the features JSON.
        features_dict (dict): Features to write (ignored when reading).
        read (bool): Load and return the dict when True, write it when False.
    """
    s3fs = import_or_raise("s3fs", S3FS_ERR_MSG)
    filesystem = s3fs.S3FileSystem(anon=True)
    if read:
        with filesystem.open(file_path, "r", encoding='utf-8') as f:
            return json.load(f)
    with filesystem.open(file_path, "w", encoding='utf-8') as f:
        # Serialize explicitly so non-ASCII feature names survive round-trip.
        f.write(json.dumps(features_dict, ensure_ascii=False))
def check_graphviz():
    """Return the graphviz module after confirming a rendering backend exists.

    Raises:
        ImportError: If the graphviz Python package is missing.
        RuntimeError: If the package imports but no system backend is found.
    """
    GRAPHVIZ_ERR_MSG = (
        'Please install graphviz to plot.'
        ' (See https://docs.featuretools.com/en/stable/getting_started/install.html#installing-graphviz for'
        ' details)')
    graphviz = import_or_raise("graphviz", GRAPHVIZ_ERR_MSG)
    # Render a trivial graph so a missing system backend is caught up front.
    try:
        graphviz.Digraph().pipe()
    except graphviz.backend.ExecutableNotFound:
        raise RuntimeError(
            "To plot entity sets, a graphviz backend is required.\n"
            "Install the backend using one of the following commands:\n"
            " Mac OS: brew install graphviz\n"
            " Linux (Ubuntu): sudo apt-get install graphviz\n"
            " Windows: conda install python-graphviz\n"
            " For more details visit: https://docs.featuretools.com/en/stable/getting_started/install.html"
        )
    return graphviz
def check_graphviz():
    """Return the graphviz module after confirming a rendering backend exists.

    Raises:
        ImportError: If the graphviz Python package is missing.
        RuntimeError: If the package imports but no system backend (the
            graphviz executables) can be found.
    """
    GRAPHVIZ_ERR_MSG = (
        "Please install graphviz to plot." +
        " (See https://featuretools.alteryx.com/en/stable/install.html#installing-graphviz for" +
        " details)"
    )
    graphviz = import_or_raise("graphviz", GRAPHVIZ_ERR_MSG)
    # Try rendering a dummy graph to see if a working backend is installed
    try:
        graphviz.Digraph().pipe()
    except graphviz.backend.ExecutableNotFound:
        raise RuntimeError(
            "To plot entity sets, a graphviz backend is required.\n" +
            "Install the backend using one of the following commands:\n" +
            " Mac OS: brew install graphviz\n" +
            " Linux (Ubuntu): $ sudo apt install graphviz\n" +
            " Windows (conda): conda install -c conda-forge python-graphviz\n" +
            " Windows (pip): pip install graphviz\n" +
            # FIX: this line was missing its trailing newline, so the download
            # URL ran straight into the "For more details" sentence.
            " Windows (EXE required if graphviz was installed via pip): https://graphviz.org/download/#windows\n" +
            " For more details visit: https://featuretools.alteryx.com/en/stable/install.html#installing-graphviz"
        )
    return graphviz
def read_entityset(path, profile_name=None, **kwargs):
    '''Read entityset from disk, S3 path, or URL.

    Args:
        path (str): Directory on disk, S3 path, or URL to read `data_description.json`.
        profile_name (str, bool): The AWS profile specified to write to S3. Will default to None and search for AWS credentials. Set to False to use an anonymous profile.
        kwargs (keywords): Additional keyword arguments to pass as keyword arguments to the underlying deserialization method.
    '''
    if not (_is_url(path) or _is_s3(path)):
        # Plain local directory: read the description in place.
        data_description = read_data_description(path)
        return description_to_entityset(data_description, **kwargs)
    boto3 = import_or_raise("boto3", BOTO3_ERR_MSG)
    with tempfile.TemporaryDirectory() as tmpdir:
        file_path = os.path.join(tmpdir, Path(path).name)
        session = boto3.Session()
        if _is_url(path):
            use_smartopen_es(file_path, path)
        elif isinstance(profile_name, str):
            transport_params = {'session': boto3.Session(profile_name=profile_name)}
            use_smartopen_es(file_path, path, transport_params)
        elif profile_name is False:
            use_s3fs_es(file_path, path)
        elif session.get_credentials() is not None:
            use_smartopen_es(file_path, path)
        else:
            use_s3fs_es(file_path, path)
        # NOTE(review): extractall trusts the archive's member paths; a
        # malicious archive could escape tmpdir — consider a safe-extract
        # filter if untrusted archives are possible.
        with tarfile.open(str(file_path)) as tar:
            tar.extractall(path=tmpdir)
        data_description = read_data_description(tmpdir)
        return description_to_entityset(data_description, **kwargs)
def test_import_or_raise_errors():
    """A missing module should raise ImportError carrying the given message."""
    with pytest.raises(ImportError, match="error message"):
        import_or_raise("_featuretools", "error message")
def read_entity_data(description, path):
    '''Read description data from disk.

    Args:
        description (dict) : Description of :class:`.Entity`.
        path (str): Location on disk to read entity data.

    Returns:
        df (DataFrame) : Instance of dataframe.
    '''
    loading_info = description['loading_info']
    data_file = os.path.join(path, loading_info['location'])
    kwargs = loading_info.get('params', {})
    load_format = loading_info['type']
    entity_type = loading_info.get('entity_type', 'pandas')
    read_kwargs = {}
    # Pick the dataframe library matching the serialized entity type.
    if entity_type == 'dask':
        lib = dd
    elif entity_type == 'koalas':
        import_error = 'Cannot load Koalas entityset - unable to import Koalas. ' \
            'Consider doing a pip install with featuretools[koalas] to install Koalas with pip'
        lib = import_or_raise('databricks.koalas', import_error)
        # Koalas needs multiline parsing and a string compression argument.
        read_kwargs['multiline'] = True
        kwargs['compression'] = str(kwargs['compression'])
    else:
        lib = pd
    if load_format == 'csv':
        dataframe = lib.read_csv(data_file,
                                 engine=kwargs['engine'],
                                 compression=kwargs['compression'],
                                 encoding=kwargs['encoding'],
                                 **read_kwargs)
    elif load_format == 'parquet':
        dataframe = lib.read_parquet(data_file, engine=kwargs['engine'])
    elif load_format == 'pickle':
        # Pickle is only supported through pandas regardless of entity type.
        dataframe = pd.read_pickle(data_file, **kwargs)
    else:
        error = 'must be one of the following formats: {}'
        raise ValueError(error.format(', '.join(FORMATS)))
    dtypes = description['loading_info']['properties']['dtypes']
    if entity_type == 'koalas':
        # Koalas cannot astype to numpy object dtype; map to supported types.
        for col, dtype in dtypes.items():
            if dtype == 'object':
                dtypes[col] = 'str'
            if dtype == 'datetime64[ns]':
                dtypes[col] = np.datetime64
    dataframe = dataframe.astype(dtypes)
    if load_format in ['parquet', 'csv']:
        # LatLong columns were serialized as "(lat, long)" strings; parse
        # them back into tuples (or lists for koalas).
        latlongs = [variable["id"] for variable in description['variables']
                    if variable['type']['value'] == LatLong.type_string]

        def parse_latlong_tuple(x):
            return tuple(float(y) for y in x[1:-1].split(","))

        def parse_latlong_list(x):
            return list(float(y) for y in x[1:-1].split(","))

        for column in latlongs:
            if entity_type == 'dask':
                # Dask requires explicit result metadata for apply.
                meta = (column, tuple([float, float]))
                dataframe[column] = dataframe[column].apply(
                    parse_latlong_tuple, meta=meta)
            elif entity_type == 'koalas':
                dataframe[column] = dataframe[column].apply(parse_latlong_list)
            else:
                dataframe[column] = dataframe[column].apply(parse_latlong_tuple)
    return dataframe
def test_import_or_raise_imports():
    """import_or_raise returns the module object when it is importable."""
    imported = import_or_raise("math", "error message")
    assert imported.ceil(0.1) == 1
def plot(self, to_file=None):
    """
    Create a UML diagram-ish graph of the EntitySet.

    Args:
        to_file (str, optional) : Path to where the plot should be saved.
            If set to None (as by default), the plot will not be saved.

    Returns:
        graphviz.Digraph : Graph object that can directly be displayed in
            Jupyter notebooks.
    """
    GRAPHVIZ_ERR_MSG = (
        'Please install graphviz to plot entity sets.'
        ' (See https://docs.featuretools.com/en/stable/getting_started/install.html#installing-graphviz for'
        ' details)')
    graphviz = import_or_raise("graphviz", GRAPHVIZ_ERR_MSG)
    # Render a trivial graph first so a missing system backend fails fast.
    try:
        graphviz.Digraph().pipe()
    except graphviz.backend.ExecutableNotFound:
        raise RuntimeError(
            "To plot entity sets, a graphviz backend is required.\n"
            "Install the backend using one of the following commands:\n"
            " Mac OS: brew install graphviz\n"
            " Linux (Ubuntu): sudo apt-get install graphviz\n"
            " Windows: conda install python-graphviz\n"
            " For more details visit: https://docs.featuretools.com/en/stable/getting_started/install.html"
        )
    if to_file:
        # Explicitly cast to str in case a Path object was passed in
        to_file = str(to_file)
        pieces = to_file.split('.')
        if len(pieces) < 2:
            raise ValueError("Please use a file extension like '.pdf'"
                             " so that the format can be inferred")
        out_format = pieces[-1]
        valid_formats = graphviz.backend.FORMATS
        if out_format not in valid_formats:
            raise ValueError("Unknown format. Make sure your format is"
                             " amongst the following: %s" % valid_formats)
    else:
        out_format = None
    # Initialize a new directed graph
    graph = graphviz.Digraph(self.id, format=out_format,
                             graph_attr={'splines': 'ortho'})
    # Draw entities: one record node per entity listing its variables.
    for entity in self.entities:
        variables_string = '\l'.join([var.id + ' : ' + var.type_string  # noqa: W605
                                      for var in entity.variables])
        nrows = entity.shape[0]
        label = '{%s (%d row%s)|%s\l}' % (entity.id, nrows, 's' * (nrows > 1), variables_string)  # noqa: W605
        graph.node(entity.id, shape='record', label=label)
    # Draw relationships as edges from child to parent entity.
    for rel in self.relationships:
        # Display the key only once if is the same for both related entities
        if rel._parent_variable_id == rel._child_variable_id:
            label = rel._parent_variable_id
        else:
            label = '%s -> %s' % (rel._parent_variable_id, rel._child_variable_id)
        graph.edge(rel._child_entity_id, rel._parent_entity_id, xlabel=label)
    if to_file:
        # Graphviz always appends the format to the file name, so we need to
        # remove it manually to avoid file names like 'file_name.pdf.pdf'
        offset = len(out_format) + 1  # Add 1 for the dot
        graph.render(to_file[:-offset], cleanup=True)
    return graph