def build_relation_graph(cls, inputs, output, options=None, program=None, extras=None) -> 'RelationGraph':
    #  Avoid a shared mutable default: construct a fresh options object per call.
    if options is None:
        options = RelationGraphOptions()

    relation_graph: RelationGraph = RelationGraph(options)
    input_dfs, output_dfs, extra_dfs = cls.convert_input_output(
        inputs, output, options, extras)

    for wrapped_out in output_dfs:
        out_df = wrapped_out.df
        if len(out_df) > 100 or len(out_df.columns) > 100:
            raise NotImplementedError(
                "Cannot handle outputs with >100 rows or columns")

    try:
        relation_graph.add_dfs(input_dfs, output_dfs, extra_dfs)
    except Exception as e:
        logger.err("Error while adding dfs")
        logging.exception(e)
        raise NotImplementedError("Caught exception while adding dfs")

    relation_graph.input_dfs = input_dfs
    relation_graph.output_dfs = output_dfs
    return relation_graph
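# --- Illustrative usage sketch (added for exposition; not part of the original
# module). Shows a minimal call to build_relation_graph on toy pandas
# DataFrames, assuming it is exposed as a classmethod on RelationGraph (as its
# cls parameter and return annotation suggest) and that pandas inputs are
# accepted by convert_input_output.
def _example_build_relation_graph():
    import pandas as pd
    inp = pd.DataFrame({'name': ['a', 'b', 'c'], 'score': [1, 2, 3]})
    out = pd.DataFrame({'name': ['a', 'b', 'c']})
    # Both frames are well under the 100-row/100-column guard above.
    return RelationGraph.build_relation_graph([inp], out)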
def run_upload(cmd_args: ArgNamespace):
    home_dir = os.path.expanduser("~")
    gdrive_bin = os.path.join(home_dir, 'gdrive')
    if not os.path.exists(gdrive_bin):
        logger.err(
            "Could not find gdrive at {gdrive_bin}. "
            "Please download the binary from https://github.com/gdrive-org/gdrive\n"
            "WARNING: Delete gdrive and {home_dir}/.gdrive after use".format(
                gdrive_bin=gdrive_bin, home_dir=home_dir))
        return

    runner = GDriveRunner(home_dir, cmd_args)
    if cmd_args.parent_id is None and cmd_args.parent is None:
        raise Exception("One of --parent-id and --parent should be provided")

    if cmd_args.parent_id is not None:
        parent = cmd_args.parent_id
    else:
        parent = runner.get_id(cmd_args.parent)

    cmd = '{gdrive} upload -p {parent} {path}'

    #  Upload the data file, plus its .index companion if one exists.
    paths = [cmd_args.path]
    if os.path.exists(cmd_args.path + ".index"):
        paths.append(cmd_args.path + ".index")

    for path in paths:
        p_cmd = cmd.format(gdrive=gdrive_bin, parent=parent, path=path)
        if cmd_args.desc is not None:
            #  Quote the description so spaces and shell metacharacters survive,
            #  and append it after formatting so braces in it cannot break .format().
            p_cmd += ' --description ' + shlex.quote(cmd_args.desc)
        runner.run(p_cmd)
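# --- Illustrative sketch (added for exposition): invoking run_upload
# programmatically. ArgNamespace is assumed to behave like argparse's
# Namespace; the attribute names mirror the flags read above. The folder id
# is hypothetical.
def _example_run_upload():
    from argparse import Namespace
    args = Namespace(parent_id='0B_hypothetical_folder_id', parent=None,
                     path='data/training.pkl', desc='training data dump')
    run_upload(args)  # uploads training.pkl (and training.pkl.index if present)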
def compile_gens_from_module(
        spec_ast: ast.Module, cmd_args: ArgNamespace,
        parse_cache: Dict[str, Optional[IGenerator]] = None
) -> Dict[ast.FunctionDef, Optional[ast.ClassDef]]:
    #  All the function-defs containing the signature decorator will be treated as generators
    gen_defs: Dict[Tuple[str, str], ast.FunctionDef] = GenCollector().collect(spec_ast)
    compiled_map: Dict[ast.FunctionDef, Optional[ast.ClassDef]] = {}

    if parse_cache is None:
        parse_cache = {}

    parse_cache.update(
        parse_gens_from_defs(gen_defs, cmd_args, parse_cache=parse_cache))

    for (namespace, gen_id), gen_def in gen_defs.items():
        igen: IGenerator = parse_cache[namespace + '.' + gen_id]
        if igen is None:
            logger.err("Skipping {}.{} because of parse error".format(
                namespace, gen_id))
            compiled_map[gen_def] = None
            continue

        try:
            logger.info("Compiling {}.{}".format(namespace, gen_id))
            compiled_def: ast.ClassDef = compile_gen(igen)
            compiled_map[gen_def] = compiled_def
        except Exception as e:
            logger.err("Compilation of {}.{} failed".format(namespace, gen_id))
            logging.exception(e)
            compiled_map[gen_def] = None

    return compiled_map
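# --- Illustrative sketch (added for exposition): compiling all generators
# found in a spec file. The file path is hypothetical, and the attributes
# cmd_args must carry are not shown here; compile_gens_from_module itself only
# needs an ast.Module plus the usual ArgNamespace.
def _example_compile_gens():
    from argparse import Namespace
    with open('specs/pandas_gens.py') as f:  # hypothetical spec file
        spec_ast = ast.parse(f.read())
    compiled = compile_gens_from_module(spec_ast, Namespace())
    #  Entries map each generator's FunctionDef to its compiled ClassDef,
    #  or to None if parsing/compilation failed.
    return compiled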
def process_with_tracking(cls, raw_data: Dict):
    spec: GeneratorInversionSpec = GeneratorInversionSpec(
        raw_data['inputs'], raw_data['output'],
        raw_data['intermediates'], raw_data['generator_tracking'])

    results: List[Tuple[str, Dict[str, List[Any]]]] = []
    # print(raw_data['program'])
    # print([t.record for t in raw_data['generator_tracking']])
    for depth, fn in enumerate(raw_data['function_sequence'], 1):
        if fn not in cls.generators:
            logger.warn("Generator not defined for {}".format(fn), use_cache=True)
            continue

        try:
            tracker = spec.trackers[depth - 1]
            results.append(
                (fn, cls.generators[fn].generate_arguments_training_data(
                    spec, depth=depth, tracker=tracker)))
        except SilentException:
            pass
        except Exception as e:
            logger.err("Encountered Exception for {}".format(fn))
            logging.exception(e)

    return results
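# --- Illustrative sketch (added for exposition): the dict shape that
# process_with_tracking consumes, inferred from the keys read above. The
# concrete value types are assumptions.
_example_raw_data = {
    'inputs': [],              # input values for the spec
    'output': None,            # expected output value
    'intermediates': [],       # intermediate values along the program
    'generator_tracking': [],  # per-depth trackers (becomes spec.trackers)
    'function_sequence': [],   # function names, one per depth
    'program': None,           # source program (only referenced for debugging)
}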
def init(self):
    if (not os.path.exists(self.args.outdir)) or self.args.force:
        os.system('rm -rf {}'.format(self.args.outdir))
        os.system('mkdir -p {}'.format(self.args.outdir))

    if not os.path.exists(self.args.outdir):
        logger.err("Failed to create output directory at {}".format(
            self.args.outdir))
        sys.exit(1)

    self.file_map: Dict[str, Dict[str, IndexedFileWriter]] = {}
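# --- Illustrative alternative (added for exposition; not the author's code):
# the same recreate-the-outdir step written with the standard library instead
# of shelling out, which avoids quoting issues in the directory name.
def _recreate_outdir(outdir: str):
    import shutil
    shutil.rmtree(outdir, ignore_errors=True)
    os.makedirs(outdir, exist_ok=True)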
def add_external_edges(self, other: 'GraphNodeCollection', collector: EdgeCollection, is_reverse=False):
    if is_reverse:
        return

    if self.options.EQUALITY_EDGES and self.source[0] != other.source[0]:
        #  We only add equality edges between collections from different kinds of sources.
        #  There is little point in having equality edges between, say, the groupby groups produced:
        #  we want to capture relationships between the input and output, not amongst the outputs themselves.
        for val1, nodes1 in self.value_map.items():
            if val1 in other.value_map:
                val2 = val1
                nodes2 = other.value_map[val2]
                try:
                    #  This comparison can fail (or be False) for NaNs etc.
                    if val1 == val2:
                        for n1, n2 in itertools.product(nodes1, nodes2):
                            collector.add_edge(n1, n2, GraphEdgeType.EQUALITY, directed=False)
                except Exception as e:
                    logger.err("Error comparing {} and {}".format(val1, val2))
                    logging.exception(e)

    if self.options.SUBSTR_EDGES and self.source[0] != other.source[0]:
        #  We only add substr edges between collections from different kinds of sources.
        #  The reasoning is the same as in the equality-edges case.
        for val1, nodes1 in self.value_map.items():
            for val2, nodes2 in other.value_map.items():
                if isinstance(val1, str) or isinstance(val2, str):
                    # if (str(val1) in str(val2)) or (str(val2) in str(val1)):
                    if str(val1) in str(val2):
                        for n1, n2 in itertools.product(nodes1, nodes2):
                            collector.add_edge(n1, n2, GraphEdgeType.SUBSTR)
                            collector.add_edge(n2, n1, GraphEdgeType.SUPSTR)
                    elif str(val2) in str(val1):
                        for n1, n2 in itertools.product(nodes1, nodes2):
                            collector.add_edge(n2, n1, GraphEdgeType.SUBSTR)
                            collector.add_edge(n1, n2, GraphEdgeType.SUPSTR)
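# --- Illustrative sketch (added for exposition): the substring test above in
# isolation. A SUBSTR edge points from the substring value to the superstring
# value, and the reverse direction is recorded as SUPSTR. Here 'sub' occurs
# inside 'sub_total', so the first branch fires.
def _example_substr_direction(val1='sub', val2='sub_total'):
    if str(val1) in str(val2):
        return [('SUBSTR', val1, val2), ('SUPSTR', val2, val1)]
    elif str(val2) in str(val1):
        return [('SUBSTR', val2, val1), ('SUPSTR', val1, val2)]
    return []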
def run(self, cmd: str):
    attempts = 0
    sleep_time = 5
    max_sleep_time = 20

    code = os.system(cmd)
    while code != 0:
        attempts += 1
        if attempts <= self.max_gdrive_retries:
            logger.info("Retrying after {sleep} seconds...".format(sleep=sleep_time))
            time.sleep(sleep_time)
            sleep_time = min(sleep_time + 5, max_sleep_time)
            code = os.system(cmd)
            continue

        logger.err("Command {cmd} failed with exit code {code}".format(cmd=cmd, code=code))
        sys.exit(1)
def get_output(self, cmd: str):
    attempts = 0
    sleep_time = 5
    max_sleep_time = 20

    while True:
        attempts += 1
        try:
            out = subprocess.check_output(cmd, shell=True)
            return out.decode("utf-8")
        except subprocess.CalledProcessError as e:
            e.output = str(e.output)
            if 'rateLimitExceeded' in e.output and attempts <= self.max_gdrive_retries:
                logger.info("Rate Limit Exceeded. Waiting {sleep} seconds...".format(sleep=sleep_time))
                time.sleep(sleep_time)
                sleep_time = min(sleep_time + 5, max_sleep_time)
                continue

            logger.err("Command {cmd} failed with exit code {code} "
                       "and output {output}".format(cmd=cmd, code=e.returncode, output=e.output))
            sys.exit(1)
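# --- Illustrative sketch (added for exposition): the retry schedule shared by
# run and get_output above. Sleeps grow linearly by 5s and are capped at 20s,
# so a run with max_gdrive_retries = 5 waits 5, 10, 15, 20, 20 seconds.
def _example_backoff_schedule(retries=5, step=5, cap=20):
    sleep_time, schedule = step, []
    for _ in range(retries):
        schedule.append(sleep_time)
        sleep_time = min(sleep_time + step, cap)
    return schedule  # [5, 10, 15, 20, 20]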
def add_equality_edges(self, wrapped_df1: DfTypeWrapper, df1_idx: str,
                       wrapped_df2: DfTypeWrapper, df2_idx: str):
    df1_values_to_nodes = self.value_to_node_map(wrapped_df1, df1_idx)
    df2_values_to_nodes = self.value_to_node_map(wrapped_df2, df2_idx)

    for df1_value, df1_nodes in df1_values_to_nodes.items():
        for df2_value, df2_nodes in df2_values_to_nodes.items():
            try:
                if df1_value == df2_value:
                    for df1_node in df1_nodes:
                        for df2_node in df2_nodes:
                            self.add_edge(df1_node, df2_node,
                                          RelationGraphEdgeType.EQUALITY)
            except (TypeError, ValueError, SyntaxError):
                #  Comparisons between incompatible cell types are simply skipped.
                pass
            except Exception as e:
                logger.err("Error comparing {} and {}".format(
                    df1_value, df2_value))
                logging.exception(e)
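# --- Illustrative sketch (added for exposition): why the comparison above is
# wrapped in try/except. Some cell types refuse comparison outright
# (TypeError), while array-like cells compare elementwise, so truth-testing
# the result raises ValueError.
def _example_comparison_failure():
    import numpy as np
    try:
        if np.array([1, 2]) == np.array([1, 2]):  # elementwise -> ambiguous truth value
            pass
    except ValueError as e:
        return e  # "The truth value of an array with more than one element is ambiguous..."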
def parse_gens_from_defs(
        gen_defs: Dict[Tuple[str, str], ast.FunctionDef], cmd_args: ArgNamespace,
        parse_cache: Dict[str, Optional[IGenerator]] = None
) -> Dict[str, Optional[IGenerator]]:
    parse_results: Dict[str, Optional[IGenerator]] = {}
    if parse_cache is not None:
        parse_results.update(parse_cache)

    for (namespace, gen_id), gen_def in gen_defs.items():
        try:
            logger.info("Parsing {}.{}".format(namespace, gen_id))
            igen: IGenerator = parse_gen_from_ast(gen_def, namespace, gen_id,
                                                 parse_results, cmd_args)
            parse_results[namespace + '.' + gen_id] = igen
        except Exception as e:
            logger.err("Parsing of {}.{} failed".format(namespace, gen_id))
            logging.exception(e)
            parse_results[namespace + '.' + gen_id] = None

    return parse_results
def run_download(cmd_args: ArgNamespace):
    home_dir = os.path.expanduser("~")
    gdrive_bin = os.path.join(home_dir, 'gdrive')
    if not os.path.exists(gdrive_bin):
        logger.err(
            "Could not find gdrive at {gdrive_bin}. "
            "Please download the binary from https://github.com/gdrive-org/gdrive\n"
            "WARNING: Delete gdrive and {home_dir}/.gdrive after use".format(
                gdrive_bin=gdrive_bin, home_dir=home_dir))
        return

    if cmd_args.path is None and cmd_args.path_id is None:
        raise Exception("One of --path and --path-id should be provided")

    runner = GDriveRunner(home_dir, cmd_args)
    if cmd_args.path_id is not None:
        path = cmd_args.path_id
    else:
        path = runner.get_id(cmd_args.path)

    cmd = '{gdrive} download {path} --force --path {outdir}'.format(
        gdrive=gdrive_bin, path=path, outdir=cmd_args.outdir)
    runner.run(cmd)
def convert_input_output(cls, inputs, output, options: RelationGraphOptions, extras=None):
    input_dfs = []
    for input_ in inputs:
        try:
            input_dfs += cls.get_df(input_)
        except NotImplementedError:
            raise
        except Exception as e:
            logger.err("Error while getting df for input : {}".format(input_))
            logging.exception(e)
            raise NotImplementedError(
                "Caught exception for input : {}".format(input_))

    for wrapped_inp in input_dfs:
        inp_df = wrapped_inp.df
        if len(inp_df) > 100 or len(inp_df.columns) > 100:
            raise NotImplementedError(
                "Cannot handle inputs with >100 rows or columns")

    try:
        output_dfs = cls.get_df(output, mode='output')
    except NotImplementedError:
        raise
    except Exception as e:
        logger.err("Error while getting df for output : {}".format(output))
        logging.exception(e)
        raise NotImplementedError(
            "Caught exception for output : {}".format(output))

    extra_dfs = []
    if extras:
        for extra_ in extras:
            try:
                extra_dfs += cls.get_df(extra_, mode='extra')
            except NotImplementedError:
                raise
            except Exception as e:
                logger.err("Error while getting df for extra : {}".format(extra_))
                logging.exception(e)
                raise NotImplementedError(
                    "Caught exception for extra : {}".format(extra_))

    return input_dfs, output_dfs, extra_dfs
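# --- Illustrative sketch (added for exposition): the size guard above in
# action, assuming these classmethods live on RelationGraph as the annotations
# in build_relation_graph suggest. A 101-column frame trips the
# NotImplementedError even though conversion itself would succeed.
def _example_size_guard():
    import pandas as pd
    wide = pd.DataFrame([list(range(101))])  # 1 row, 101 columns
    try:
        RelationGraph.convert_input_output([wide], wide, RelationGraphOptions())
    except NotImplementedError as e:
        return str(e)  # "Cannot handle inputs with >100 rows or columns"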