class HDFSEnsureNew(HadoopBaseModule):
    """ Make sure the file is removed """
    _settings = ModuleSettings(namespace='hadoop')
    # HDFS entry name to delete, plus the remote machine to run commands on.
    _input_ports = [IPort('Name', String),
                    IPort('Machine', '(org.vistrails.vistrails.remoteq:Machine)')]
    _output_ports = [OPort('Machine', '(org.vistrails.vistrails.remoteq:Machine)'),
                     OPort('Name', String)]

    def __init__(self):
        HadoopBaseModule.__init__(self)

    def compute(self):
        # Removes the named HDFS entry (if present) exactly once per
        # signature: the result is cached in the job monitor, so reruns
        # with the same signature skip the remote commands entirely.
        machine = self.get_machine()
        jm = self.job_monitor()
        id = self.signature
        job = jm.getCache(id)
        if not job:
            entry_name = self.get_input('Name')
            if '://' not in entry_name:
                # Bare paths get the configured HDFS URI prefix.
                entry_name = self.add_prefix(entry_name, machine)
            # 'dfs -test -e' exits 0 when the entry exists; the echoed
            # status is therefore 0 (falsy) exactly when there is
            # something to delete.
            if not int(self.call_hdfs('dfs -test -e ' + entry_name +
                                      '; echo $?', machine)):
                #self.call_hdfs('dfs -rm -r ' + entry_name, machine)
                # we are using -rmr but it is deprecated
                self.call_hdfs('dfs -rmr ' + entry_name, machine)
            d = {'entry_name':entry_name}
            self.set_job_machine(d, machine)
            jm.setCache(id, d, self.job_name())
            job = jm.getCache(id)
        # Outputs come from the cached record so cached and fresh runs
        # report identical values.
        self.set_output('Name', job.parameters['entry_name'])
        self.set_output('Machine', machine)
class FindData(Module): _settings = ModuleSettings(namespace="io") _input_ports = [ IPort(name="file name", label="file name and extension to search for", signature="basic:String"), IPort(name="seed path", label="path corresponding to the " "search starting point. Defaults to " "the user's home directory.", default="~", signature="basic:String"), ] _output_ports = [OPort(name="file path", signature="basic:String")] def compute(self): seed_path = self.get_input("seed path") file_name = self.get_input("file name") print file_name existing_files = [] for dir_tree in os.walk(os.path.expanduser(seed_path)): if file_name in dir_tree[2]: existing_files.append(os.path.join(dir_tree[0], file_name)) print existing_files for path in existing_files: if 'Demos' in path: file_path = path self.set_output("file path", file_path)
class PythonSourceToFile(Module):
    """ This is the class for specifying a python code snippet for
        running with Hadoop Streaming, it will take its contents and
        output to a temporary Python file. The code will not be passed
        around.
    """
    _settings = ModuleSettings(
            namespace='hadoop',
            configure_widget=PythonSourceToFileConfigurationWidget)
    _input_ports = [IPort('Input File', File),
                    IPort('source', String, optional=True)]
    _output_ports = [OPort('Temporary File', File)]

    def compute(self):
        # An explicit input file takes precedence over inline source.
        inputFile = self.force_get_input('Input File')
        if inputFile is not None:  # idiom fix: was 'inputFile != None'
            # tempFile = file_pool.make_local_copy(inputFile.name)
            tempFile = inputFile
        else:
            # The configuration widget stores the snippet URL-quoted.
            source = urllib.unquote(self.force_get_input('source', ''))
            tempFile = self.interpreter.filePool.create_file()
            # Ensure the file handle is released even if write() fails.
            f = open(tempFile.name, 'w')
            try:
                f.write(source)
            finally:
                f.close()
        self.set_output('Temporary File', tempFile)
class GMapHeatmap(GMapValueVis):
    """Visualize per-position weights as a Google Maps HeatmapLayer."""
    TEMPLATE = Template("""
      var data = $heatmap_data;
      var options = $heatmap_options;
      options["data"] = data;
      options["map"] = map;
      heatmap = new google.maps.visualization.HeatmapLayer(options);
""")
    # Option ports forwarded verbatim to the HeatmapLayer constructor.
    SPECS = ['dissipating', 'maxIntensity', 'opacity', 'radius']
    _input_ports = [IPort("dissipating", "basic:Boolean", optional=True,
                          default=True),
                    IPort("maxIntensity", "basic:Float", optional=True),
                    IPort("opacity", "basic:Float", optional=True,
                          default=0.6),
                    IPort("radius", "basic:Float", optional=True)]

    def compute(self):
        (positions, center) = self.get_positions()
        values = self.get_values()
        # One weighted datum per position; indexing into values keeps
        # the original behavior (IndexError if there are fewer values).
        heatmap_data = []
        for idx, pos in enumerate(positions):
            heatmap_data.append({"location": pos,
                                 "weight": float(values[idx])})
        heatmap_options = self.get_options(self.SPECS)
        payload = {"heatmap_data": heatmap_data,
                   "heatmap_options": heatmap_options}
        self.set_output("self",
                        GMapVisData([], self.TEMPLATE, payload, center))
class GMapMarkers(GMapVis, TitlesMixin): TEMPLATE = Template(""" var positions = $marker_data; var options = $marker_options; var titles = $marker_titles; for (var i=0; i < positions.length; i++) { marker = new google.maps.Marker({"position": positions[i], "map": map}); marker.setOptions(options); if (titles) { marker.setTitle(titles[i]); } } """) SPECS = ['flat'] _input_ports = [ IPort("flat", "basic:Boolean", optional=True), IPort('titleColIdx', 'basic:Integer', optional=True), IPort('titleColName', 'basic:String', optional=True) ] def compute(self): (positions, center) = self.get_positions() marker_options = self.get_options(self.SPECS) titles = self.get_titles() print "got titles:", titles data = { "marker_options": marker_options, "marker_data": positions, "marker_titles": titles } vis_data = GMapVisData([], self.TEMPLATE, data, center) self.set_output("self", vis_data)
class URICreator(HadoopBaseModule):
    """ The class for caching HDFS file onto the TaskNode local drive """
    _settings = ModuleSettings(namespace='hadoop')
    _input_ports = [IPort('HDFS File/URI', String),
                    IPort('Symlink', String),
                    IPort('Machine', '(org.vistrails.vistrails.remoteq:Machine)')]
    _output_ports = [OPort('Machine', '(org.vistrails.vistrails.remoteq:Machine)'),
                     OPort('URI', String)]

    def compute(self):
        # Builds a 'uri#symlink' string once per signature and caches it
        # in the job monitor, like the other hadoop modules.
        machine = self.get_machine()
        jm = self.job_monitor()
        id = self.signature
        job = jm.getCache(id)
        if not job:
            uri = self.force_get_input('HDFS File/URI')
            symlink = self.force_get_input('Symlink')
            # Idiom fix: identity test ('is None') is the correct way to
            # detect a missing optional input, not '== None'.
            if uri is None or symlink is None:
                raise ModuleError(self,
                                  "Missing 'HDFS File/URI' or 'Symlink' values")
            if '://' not in uri:
                # Bare paths get the configured HDFS URI prefix.
                uri = self.add_prefix(uri, machine)
            uri += '#' + symlink
            d = {'uri':uri}
            self.set_job_machine(d, machine)
            jm.setCache(id, d, self.job_name())
            job = jm.getCache(id)
        self.set_output('URI', job.parameters['uri'])
        self.set_output('Machine', machine)
class GMapCell(SpreadsheetCell, OptionsMixin):
    """ GMapCell is a custom Module to view TabularData geographically """
    # 'zoom' is a (name, default, required) spec; 'center' passes through.
    SPECS = [('zoom', None, True), 'center']
    _input_ports = [IPort("layers", "GMapVis"),
                    IPort("zoom", "basic:Integer", optional=True,
                          default=11),
                    IPort("center", "basic:Float,basic:Float",
                          optional=True)]

    def compute(self):
        """compute() -> None
        Dispatch the URL to the spreadsheet
        """
        layers = self.get_input_list("layers")
        if not layers:
            raise ModuleError(self, "Must provide at least one layer")
        map_options = self.get_options(self.SPECS)
        self.displayAndWait(GMapCellWidget,
                            (layers, map_options, self.interpreter))
class HDFSGet(HadoopBaseModule):
    """ Getting a file from the Hadoop DFS
        Then getting it from the server
    """
    _settings = ModuleSettings(namespace='hadoop')
    _input_ports = [IPort('Local File', Path),
                    IPort('Remote Location', String),
                    IPort('Override', Boolean),
                    IPort('Machine', '(org.vistrails.vistrails.remoteq:Machine)')]
    _output_ports = [OPort('Machine', '(org.vistrails.vistrails.remoteq:Machine)'),
                     OPort('Local File', File)]

    def __init__(self):
        HadoopBaseModule.__init__(self)

    def compute(self):
        # Copies an HDFS entry to the local machine in two hops
        # (HDFS -> server temp dir -> local path); the result is cached
        # in the job monitor so identical signatures skip the transfer.
        machine = self.get_machine()
        jm = self.job_monitor()
        id = self.signature
        job = jm.getCache(id)
        if not job:
            remote = self.get_input('Remote Location')
            local = self.get_input('Local File')
            override = self.force_get_input('Override', False)
            if '://' not in remote:
                # Bare paths get the configured HDFS URI prefix.
                remote = self.add_prefix(remote, machine)
            if os.path.exists(local.name):
                if override == False:
                    raise ModuleError(self, 'Output already exists')
                else:
                    # Clear the destination (file or directory) first.
                    if os.path.isdir(local.name):
                        shutil.rmtree(local.name)
                    else:
                        os.unlink(local.name)
            # Unique server-side staging directory name; '-u' means only
            # the name is generated ('dfs -get' creates the directory).
            tempfile = machine.remote.send_command('mktemp -d -u').strip()
            result = self.call_hdfs('dfs -get %s %s' % (remote, tempfile),
                                    machine)
            # too slow with many files
            #res = machine.send_command("get -r %s %s" % (tempfile, local.name) )
            # tar files to increase speed
            result = machine.local.send_command('mkdir %s' % local.name)
            result = machine.sync(local.name, tempfile,
                                  mode=machine.MODE_REMOTE_LOCAL,
                                  use_tar=True)
            # Clean up the staging directory on the server.
            result = machine.remote.rm(tempfile, force=True,
                                       recursively=True)
            d = {'remote': remote, 'local': local.name}
            self.set_job_machine(d, machine)
            jm.setCache(id, d, self.job_name())
            job = jm.getCache(id)
        self.set_output('Local File', PathObject(job.parameters['local']))
        self.set_output('Machine', machine)
class PersistedPath(Module):
    """Records a file in the file store.
    """
    _input_ports = [
            IPort('path', Path),
            IPort('metadata', Metadata, optional=True)]
    _output_ports = [
            OPort('path', Path)]

    # Filename of a matching store entry found during update_upstream(),
    # or None when upstream had to be computed.
    _cached = None

    def update_upstream(self):
        """A modified version of the update_upstream method.

        Only updates upstream if the file is not found in the store.
        """
        if not hasattr(self, 'signature'):
            raise ModuleError(self, "Module has no signature")
        file_store = get_default_store()
        # Look for previous results recorded under the same signature
        # and keep the most recent one.
        entries = file_store.query({KEY_SIGNATURE: self.signature})
        best = None
        for entry in entries:
            if best is None or entry[KEY_TIME] > best[KEY_TIME]:
                best = entry
        if best is not None:
            self._cached = best.filename
        else:
            # Cache miss: fall back to normal upstream computation.
            super(PersistedPath, self).update_upstream()

    def compute(self):
        if self._cached is not None:
            # Found in the store during update_upstream(); reuse it.
            self._set_result(self._cached)
        else:
            file_store = get_default_store()
            newpath = self.get_input('path').name
            # Subclasses restrict the path kind (file vs directory).
            self.check_path_type(newpath)
            metadata = self.get_input_list('metadata')
            metadata = dict(m.metadata for m in metadata)
            metadata[KEY_TYPE] = TYPE_OUTPUT
            metadata[KEY_TIME] = datetime.strftime(datetime.utcnow(),
                                                   '%Y-%m-%d %H:%M:%S')
            metadata[KEY_SIGNATURE] = self.signature
            # Record workflow provenance when a locator is available.
            locator = self.moduleInfo.get('locator')
            if locator is not None:
                metadata[KEY_WORKFLOW] = "%s:%s" % (
                        locator.name,
                        self.moduleInfo['version'])
                metadata[KEY_MODULE_ID] = self.moduleInfo['moduleId']
            entry = file_store.add(newpath, metadata)
            self.annotate({'added_file': entry['hash']})
            self._set_result(entry.filename)

    def check_path_type(self, path):
        # Overridden by subclasses; the base class accepts any path.
        pass

    def _set_result(self, path):
        # Single exit point for setting the output port.
        self.set_output('path', PathObject(path))
class QueriedInputPath(Module):
    """Base class for file-querying modules.

    This uses QueryConditions instead of Metadata, allowing for more
    complex queries than what PersistedInputPath provides (equality in
    metadata).
    """
    _input_ports = [
            IPort('query', QueryCondition),
            IPort('unique', Boolean, optional=True, default='False')]
    # TODO: Order by more conditions than only `vistrails_timestamp`
    _output_ports = [
            OPort('most_recent', Path),
            OPort('results', List),
            OPort('count', Integer, optional=True)]

    # TODO: Set query from `configure_widget`

    def compute(self):
        # Do the query
        queries = self.get_input_list('query')
        conditions = {}
        # BUGFIX: the original iterated over the (empty) `conditions`
        # dict instead of `queries`, so the conditions were never
        # collected and every query matched the entire store.
        for c in queries:
            conditions.update(c.conditions)
        file_store = get_default_store()

        nb = 0
        best = None
        entries = list(file_store.query(conditions))
        for entry in entries:
            nb += 1
            self.check_path_type(entry.filename)
            # Keep the most recent entry among those that carry a
            # timestamp.
            if best is None or (KEY_TIME in entry.metadata and
                                KEY_TIME in best.metadata and
                                entry[KEY_TIME] > best[KEY_TIME]):
                best = entry
        if best is None:
            raise ModuleError(self, "No match")

        if nb > 1 and self.get_input('unique'):
            raise ModuleError(self,
                              "Query returned %d results and 'unique' is "
                              "True" % nb)

        self._set_result(entries, best)

    def check_path_type(self, path):
        # Overridden by subclasses; the base class accepts any path.
        pass

    def _set_result(self, results, latest):
        self.set_output('most_recent', PathObject(latest.filename))
        self.set_output('results', [PathObject(e.filename)
                                    for e in results])
        self.set_output('count', len(results))
class EqualString(Metadata):
    """A string metadata.

    A piece of metadata with a value of type string. When used in a
    query, means "key has a value of type string equal to <value>".
    """
    _input_ports = [IPort('key', String),
                    IPort('value', String)]

    # Type tag stored in the condition dict built by Metadata.set_results().
    _type = 'str'
class OutputPort(Module):
    """Forwards a subworkflow's internal value out through its
    external pipe."""
    _input_ports = [IPort("name", "String", optional=True),
                    IPort("optional", "Boolean", optional=True),
                    IPort("spec", "String"),
                    IPort("InternalPipe", "Variant")]
    _output_ports = [OPort("ExternalPipe", "Variant", optional=True)]

    def compute(self):
        # Straight pass-through: internal -> external.
        self.set_output('ExternalPipe', self.get_input('InternalPipe'))
class SpecFileParams(Module):
    """Bundle spec-file parameters by storing them on the module itself
    and emitting the module as the parameter object."""
    _input_ports = [
        IPort(name="spec_file_root", label="Spec File Root",
              signature="basic:String"),
        IPort(name="data_folder_path", label="Data Folder Path",
              signature="basic:String"),
        IPort(name="scan_number", label="Scan number",
              signature="basic:Integer"),
    ]
    _output_ports = [
        OPort(name="spec_file_params",
              signature="gov.nsls2.spec.SpecData:SpecFileParams"),
    ]

    def compute(self):
        # Copy each input port onto an attribute of the same name, then
        # hand this module out as the parameter bundle.
        for port in ("spec_file_root", "data_folder_path", "scan_number"):
            setattr(self, port, self.get_input(port))
        self.set_output("spec_file_params", self)
class HDFSPut(HadoopBaseModule):
    """ Putting a local file to the Hadoop DFS
        First copying it to the server
    """
    _settings = ModuleSettings(namespace='hadoop')
    _input_ports = [IPort('Local File', File),
                    IPort('Remote Location', String),
                    IPort('Override', Boolean),
                    IPort('Machine', '(org.vistrails.vistrails.remoteq:Machine)')]
    _output_ports = [OPort('Machine', '(org.vistrails.vistrails.remoteq:Machine)'),
                     OPort('Remote Location', String)]

    def __init__(self):
        HadoopBaseModule.__init__(self)

    def compute(self):
        # Uploads local -> server temp file -> HDFS, once per signature;
        # the result is cached in the job monitor.
        machine = self.get_machine()
        jm = self.job_monitor()
        id = self.signature
        job = jm.getCache(id)
        if not job:
            remote = self.get_input('Remote Location')
            local = self.get_input('Local File')
            override = self.force_get_input('Override', False)
            if '://' not in remote:
                # Bare paths get the configured HDFS URI prefix.
                remote = self.add_prefix(remote, machine)
            # 'dfs -test -e' exits 0 when the entry exists; the echoed
            # status is falsy exactly when the destination already
            # exists and must be removed (or reported).
            if not int(
                    self.call_hdfs('dfs -test -e ' + remote +
                                   '; echo $?', machine)):
                if override:
                    self.call_hdfs('dfs -rm -r ' + remote, machine)
                else:
                    raise ModuleError(self, 'Remote entry already exists')
            # Stage the file on the server under a unique temp name.
            tempfile = machine.remote.send_command('mktemp -u').strip()
            result = machine.sendfile(local.name, tempfile)
            self.call_hdfs('dfs -put %s %s' % (tempfile, remote), machine)
            result = machine.remote.rm(tempfile, force=True,
                                       recursively=True)
            d = {'remote': remote, 'local': local.name}
            self.set_job_machine(d, machine)
            jm.setCache(id, d, self.job_name())
            # CONSISTENCY FIX: was jm.getJob(id).  Every sibling hadoop
            # module (HDFSEnsureNew, HDFSGet, URICreator) reads back the
            # record it just stored with jm.getCache(id); getJob is for
            # queued jobs and need not return the cached entry.
            job = jm.getCache(id)
        self.set_output('Remote Location', job.parameters['remote'])
        self.set_output('Machine', machine)
class PersistedDir(PersistedPath):
    """Records a directory in the file store.
    """
    _settings = ModuleSettings(configure_widget=
            'vistrails.packages.persistent_archive.widgets:SetMetadataWidget')
    _input_ports = [
            IPort('path', Directory),
            IPort('metadata', Metadata, optional=True)]
    _output_ports = [
            OPort('path', Directory)]

    def check_path_type(self, path):
        # Accept directories only; anything else is an error.
        if os.path.isdir(path):
            return
        raise ModuleError(self, "Path is not a directory")
class EqualInt(Metadata):
    """An integer metadata.

    A piece of metadata with a value of type integer. When used in a
    query, means "key has a value of type integer equal to <value>".
    """
    _input_ports = [IPort('key', String),
                    IPort('value', Integer)]
    _type = 'int'

    def __init__(self, *args):
        # When constructed directly as EqualInt(key, value), insist the
        # value is an integer before handing off to the base class.
        if args:
            k, v = args
            assert isinstance(v, (int, long))
        Metadata.__init__(self, *args)
class IntInRange(QueryCondition):
    """An integer range condition.

    Means "key has a value of type integer which lies between
    <lower_bound> and <higher_bound>". Note that you can omit one of
    the bounds.
    """
    _input_ports = [IPort('key', String),
                    IPort('lower_bound', Integer, optional=True),
                    IPort('higher_bound', Integer, optional=True)]

    def __init__(self, *args):
        super(IntInRange, self).__init__()
        if not args:
            self.key, self.low, self.high = None, None, None
        else:
            # Direct construction: both bounds must be integers.
            self.key, self.low, self.high = args
            assert isinstance(self.low, (int, long))
            assert isinstance(self.high, (int, long))
            self.set_results()

    def compute(self):
        self.key = self.get_input('key')
        if self.has_input('lower_bound'):
            self.low = self.get_input('lower_bound')
        if self.has_input('higher_bound'):
            self.high = self.get_input('higher_bound')
        # At least one bound is required (De Morgan of the original
        # "not (low set or high set)").
        if self.low is None and self.high is None:
            raise ModuleError(self, "No bound set")
        self.set_results()

    def set_results(self):
        # Build the query dict from whichever bounds are present.
        dct = {'type': 'int'}
        if self.low is not None:
            dct['gt'] = self.low
        if self.high is not None:
            dct['lt'] = self.high
        self.conditions = {self.key: dct}
        self.set_output('value', self)

    def __str__(self):
        return 'IntInRange(%r, %r, %r)' % (self.key, self.low, self.high)
class ImageStackSum(Module):
    """Sum a stack of 2D images into a single 2D image."""
    _input_ports = [
        IPort(name="img_stack", label="Stack of 2D Images",
              signature="basic:List"),
    ]
    _output_ports = [
        OPort(name="2D_img", signature="basic:List"),
    ]

    def compute(self):
        img_stack = self.get_input("img_stack")
        # BUGFIX: the original called img_sum.__add__(...) and
        # img_sum.__mul__(...) and discarded the results (these return
        # new objects rather than mutating in place), so the output was
        # just the first image.  The discarded scaling step also used
        # Python 2 integer division (1 / total_images == 0) and counted
        # len-1 images; it is dropped entirely, since the module
        # advertises a sum, not a mean.
        img_sum = img_stack[0]
        for img in img_stack[1:]:
            img_sum = img_sum + img
        self.set_output("2D_img", img_sum)
class SwapAxes(Module):
    """Swap two axes of an N-dimensional array."""
    _input_ports = [
        IPort(name="axis1", label="Axis to swap",
              signature="basic:Integer"),
        IPort(name="axis2", label="Axis to swap",
              signature="basic:Integer"),
        IPort(name="ndarray", label="ndarray to swap the axes of",
              signature="basic:List"),
    ]
    _output_ports = [
        OPort(name="swapped_ndarray", signature="basic:List"),
    ]

    def compute(self):
        first_axis = self.get_input("axis1")
        second_axis = self.get_input("axis2")
        data = self.get_input("ndarray")
        self.set_output("swapped_ndarray",
                        np.swapaxes(data, first_axis, second_axis))
class MplFigure(Module):
    """Assemble MplPlot callables into a single matplotlib figure."""
    _input_ports = [IPort("addPlot", "(MplPlot)", depth=1),
                    ("axesProperties", "(MplAxesProperties)"),
                    ("figureProperties", "(MplFigureProperties)"),
                    ("setLegend", "(MplLegend)")]
    _output_ports = [("figure", "(MplFigure)")]

    def compute(self):
        # Create a figure
        figInstance = pylab.figure()
        pylab.hold(True)

        # Run the plots: each 'addPlot' input is a callable that draws
        # onto the figure.
        plots = self.get_input("addPlot")
        for plot in plots:
            plot(figInstance)

        # Apply optional figure- and axes-level property objects.
        if self.has_input("figureProperties"):
            figure_props = self.get_input("figureProperties")
            figure_props.update_props(figInstance)
        if self.has_input("axesProperties"):
            axes_props = self.get_input("axesProperties")
            axes_props.update_props(figInstance.gca())
        if self.has_input("setLegend"):
            # NOTE(review): `legend` is fetched but never used -- the
            # legend is created with default options regardless of the
            # MplLegend value.  Presumably it was meant to configure the
            # legend() call; confirm against MplLegend's interface.
            legend = self.get_input("setLegend")
            figInstance.gca().legend()

        self.set_output("figure", figInstance)
class ModelAggregator(Module):
    """Combine 1+ models into an aggregate model
    """
    _settings = ModuleSettings(namespace='fitting')
    _input_ports = [
        IPort(name='models', label='models to aggregate',
              signature='basic:Variant')
    ]
    _output_ports = [
        OPort(name='aggregated_models', signature='basic:Variant')
    ]

    def compute(self):
        """Mandatory override of parent `Module` class.

        Loop over however many models are connected to the 'models'
        input port and combine them into a single aggregate model
        """
        models = self.get_input_list('models')
        aggregated = models[0]
        # BUGFIX: the original guarded the loop with `len(models) > 2`,
        # silently dropping the second model when exactly two were
        # connected.  Folding in everything after the first is correct
        # for any count (the slice is empty for a single model).
        for model in models[1:]:
            aggregated += model
        self.set_output('aggregated_models', aggregated)
class SpecMetadata(Module):
    """Sink for metadata coming from a Spec scan.

    Currently a placeholder: the dictionary is accepted on the input
    port but compute() does nothing with it yet.
    """
    _input_ports = [
        IPort(name="meta_data", label="Metadata from Spec Scan", \
              signature="basic:Dictionary"),
    ]

    def compute(self):
        # Intentionally empty; see class docstring.
        pass
class Metadata(QueryCondition):
    """Base class for metadata pairs.

    This is abstract and implemented by modules Equal*

    This both provides a metadata pair, as the 'metadata' attribute, for
    inserting, and conditions, through the 'conditions' attribute.
    """
    _input_ports = [
            IPort('key', String),
            IPort('value', Module)]

    def __init__(self, *args):
        super(Metadata, self).__init__()
        if args:
            # Allow direct construction as Metadata(key, value); the
            # results are then available immediately, without compute().
            self.key, self.value = args
            self.set_results()
        else:
            self.key, self.value = None, None

    @staticmethod
    def translate_to_python(c):
        # Parse a serialized constant, restricted to Metadata subclasses
        # and without free-text queries.
        return QueryCondition.translate_to_python(
                c, top_class=Metadata, text_query=False)

    def compute(self):
        self.key = self.get_input('key')
        self.value = self.get_input('value')
        self.set_results()

    def set_results(self):
        # 'conditions' is the query form; the '_type' tag is supplied by
        # the concrete subclass (e.g. 'str' or 'int').
        self.conditions = {self.key: {'type': self._type,
                                      'equal': self.value}}
        # 'metadata' is the (key, value) pair form, used for inserting.
        self.metadata = (self.key, self.value)
        self.set_output('value', self)

    def __str__(self):
        return '%s(%r, %r)' % (self.__class__.__name__,
                               self.key, self.value)

    @staticmethod
    def get_widget_class():
        # Imported lazily to avoid pulling in GUI code at module load.
        from .widgets import MetadataConstantWidget
        return MetadataConstantWidget
class SwapAxes(Module):
    """Exchange two axes of an N-dimensional array."""
    _settings = ModuleSettings(namespace="utility")
    _input_ports = [
        IPort(name='arr', label='N-D array', signature='basic:List'),
        IPort(name='ax0', label='Axis to swap from',
              signature='basic:Integer'),
        IPort(name='ax1', label='Axis to swap from',
              signature='basic:Integer'),
    ]
    _output_ports = [OPort(name='out', signature='basic:List')]

    def compute(self):
        data = self.get_input('arr')
        first_axis = self.get_input('ax0')
        second_axis = self.get_input('ax1')
        # Coerce to an ndarray before swapping, as the port carries a
        # plain list.
        swapped = np.swapaxes(np.asarray(data), first_axis, second_axis)
        self.set_output('out', swapped)
class TimedJob(JobMixin, Module):
    """ A module that suspends until 'how_long' seconds have passed """
    _input_ports = [IPort("how_long", "basic:Integer", default=10)]
    _output_ports = [OPort("finished", "basic:Boolean")]

    def job_read_inputs(self):
        """Collect job parameters from the input ports.

        Returns the `params` dict consumed by the other job_* hooks.
        """
        how_long = self.force_get_input('how_long')
        return {'how_long': how_long if how_long else 10}

    def job_start(self, params):
        """Submit the job.

        For this timer, "submitting" just means recording the
        wall-clock start time; that is all the handle needs later to
        decide whether the job is done.
        """
        params['start_time'] = time.time()
        return params

    def job_finish(self, params):
        """Collect results from the finished job.

        A pure timer has nothing to fetch and nothing to clean up, so
        the params pass through unchanged.
        """
        return params

    def job_set_results(self, params):
        """Set the output ports from the (possibly cached) params."""
        self.set_output('finished', True)

    def job_get_handle(self, params):
        """Return a JobHandle-like object for the JobMonitor.

        The monitor only requires a finished() method; TimedJobMonitor
        reports completion once 'how_long' seconds have elapsed since
        'start_time'.
        """
        return TimedJobMonitor(params['start_time'], params['how_long'])
class Stack1DCell(SpreadsheetCell):
    """Display a stack of 1-D datasets in a spreadsheet cell."""
    _settings = ModuleSettings(namespace="vis")
    _input_ports = [
        IPort(name="data", label="Data to display",
              signature="basic:List"),
        IPort(name="keys", label="Names of the data",
              signature="basic:List"),
    ]
    _output_ports = [
        OPort(name="displayed_data", signature="basic:List"),
    ]

    def compute(self):
        data = self.get_input("data")
        # Fall back to positional labels when no names are connected.
        try:
            keys = self.get_input("keys")
        except ModuleError:
            keys = range(len(data))
        self.cellWidget = self.displayAndWait(Stack1DWidget, (data, keys))
class SpecFileProcessor(Module):
    """Process selected scans from a Spec file against an image stack.

    Currently a stub: the ports are declared but compute() does nothing
    yet.
    """
    _input_ports = [
        IPort(name="img_stack", label="Stack of 2D Images", \
              signature="basic:List"),
        IPort(name="has_dark", label="Are dark files present?", \
              signature="basic:Boolean", default=True, optional=True),
        IPort(name="spec_file", label="Spec File Object", \
              signature="gov.nsls2.spec.SpecData:SpecFile"),
        IPort(name="scan_numbers", label="List of scan numbers",
              signature="basic:List"),
    ]
    _output_ports = [
        OPort(name="single_img_array", signature="basic:List"),
    ]

    def compute(self):
        # Not implemented yet; see class docstring.
        pass
class ImageStackImageSelector(Module): _input_ports = [ IPort(name="img_stack", label="Stack of 2D Images", \ signature="basic:List"), IPort(name="img_no", label="Desired Image Number", \ signature="basic:Integer"), ] _output_ports = [ OPort(name="2D_img", signature="basic:List"), ] def compute(self): img_stack = self.get_input("img_stack") print "Image Stack class: {0}".format(img_stack.__class__) img_no = self.get_input("img_no") single_img = img_stack[img_no] self.set_output("2D_img", single_img)
class NestedDictCell(SpreadsheetCell):
    """Render a list of (nested) dictionaries in a spreadsheet cell."""
    _settings = ModuleSettings(namespace="vis")
    _input_ports = [
        IPort(name="dict_list", label="Dictionary to display",
              signature="basic:List"),
    ]

    def compute(self):
        payload = self.get_input("dict_list")
        self.cellWidget = self.displayAndWait(NestedDictWidget,
                                              (payload,))
class InputPort(Module):
    """Feeds an external value into a subworkflow, falling back to a
    default, then to InvalidOutput."""
    _settings = ModuleSettings(signature=input_port_signature)
    _input_ports = [IPort("name", "String", optional=True),
                    IPort("optional", "Boolean", optional=True),
                    IPort("spec", "String"),
                    IPort("ExternalPipe", "Variant", optional=True),
                    IPort("Default", "Variant")]
    _output_ports = [OPort("InternalPipe", "Variant")]

    def compute(self):
        # Precedence: external pipe, then explicit default, then the
        # InvalidOutput sentinel.
        pipe = self.force_get_input('ExternalPipe')
        if pipe is None:
            if self.has_input('Default'):
                pipe = self.get_input('Default')
            else:
                pipe = InvalidOutput
        self.set_output('InternalPipe', pipe)