コード例 #1
0
    def build_relation_graph(cls,
                             inputs,
                             output,
                             options=None,
                             extras=None) -> 'RelationGraph':
        """
        Build a RelationGraph relating the given inputs to the output.

        Args:
            inputs: Iterable of raw input values, converted to dataframe
                wrappers via convert_input_output.
            output: Raw output value, converted likewise.
            options: RelationGraphOptions controlling graph construction.
                Defaults to a fresh RelationGraphOptions() per call; the
                previous `options=RelationGraphOptions()` default was a
                single mutable instance shared across every call.
            extras: Optional iterable of extra values to include.

        Returns:
            The populated RelationGraph (with input_dfs/output_dfs attached).

        Raises:
            NotImplementedError: if any output is too large (>100 rows or
                columns) or if adding the dataframes to the graph fails.
        """
        if options is None:
            #  Fresh options object per call — avoids leaking mutated state
            #  between invocations through a shared default instance.
            options = RelationGraphOptions()

        relation_graph: RelationGraph = RelationGraph(options)
        input_dfs, output_dfs, extra_dfs = cls.convert_input_output(
            inputs, output, options, extras)

        for out_wrapper in output_dfs:
            df = out_wrapper.df
            #  Large outputs make graph construction intractable; bail early.
            if len(df) > 100 or len(df.columns) > 100:
                raise NotImplementedError(
                    "Cannot handle outputs with >100 rows or columns")

        try:
            relation_graph.add_dfs(input_dfs, output_dfs, extra_dfs)
        except Exception as e:
            logger.err("Error while adding dfs")
            print(e)
            raise NotImplementedError("Caught exception while adding dfs")

        relation_graph.input_dfs = input_dfs
        relation_graph.output_dfs = output_dfs

        return relation_graph
コード例 #2
0
def run_upload(cmd_args: ArgNamespace):
    """
    Upload cmd_args.path (and its ".index" companion file, if present) to
    Google Drive using the gdrive binary expected at ~/gdrive.

    Requires one of --parent-id / --parent to identify the destination
    folder. Returns early (after logging) if the gdrive binary is missing.
    """
    import shlex  # local import: quote shell arguments safely

    home_dir = os.path.expanduser("~")
    gdrive_bin = home_dir + '/gdrive'
    if not os.path.exists(gdrive_bin):
        logger.err(
            "Could not find gdrive at {home_dir}. "
            "Please download binary from https://github.com/gdrive-org/gdrive \n"
            "WARNING : Delete gdrive and {home_dir}/.gdrive after use".format(
                home_dir=home_dir))
        return

    #  Validate arguments before constructing the runner, consistent with
    #  run_download.
    if cmd_args.parent_id is None and cmd_args.parent is None:
        raise Exception("One of --parent-id and --parent should be provided")

    runner = GDriveRunner(home_dir, cmd_args)

    if cmd_args.parent_id is not None:
        parent = cmd_args.parent_id
    else:
        parent = runner.get_id(cmd_args.parent)

    cmd = '{gdrive} upload -p {data_url} {path}'

    if cmd_args.desc is not None:
        #  Quote the description: previously an unquoted description with
        #  spaces or shell metacharacters broke the command.
        cmd += ' --description ' + shlex.quote(cmd_args.desc)

    paths = [cmd_args.path]
    if os.path.exists(cmd_args.path + ".index"):
        paths.append(cmd_args.path + ".index")

    for path in paths:
        #  Quote the path for the same reason as the description.
        p_cmd = cmd.format(gdrive=gdrive_bin, data_url=parent,
                           path=shlex.quote(path))
        runner.run(p_cmd)
コード例 #3
0
ファイル: compiler.py プロジェクト: rbavishi/autopandas
def compile_gens_from_module(
    spec_ast: ast.Module,
    cmd_args: ArgNamespace,
    parse_cache: Optional[Dict[str, Optional[IGenerator]]] = None
) -> Dict[ast.FunctionDef, Optional[ast.ClassDef]]:
    """
    Parse and compile every generator definition found in a spec module.

    Args:
        spec_ast: Parsed module AST to scan for generator function-defs.
        cmd_args: Command-line arguments forwarded to the parsing stage.
        parse_cache: Optional pre-populated cache mapping
            "namespace.gen_id" -> IGenerator (or None on parse failure).
            Updated in place with newly parsed entries. (Annotation fixed
            to Optional[...]: the parameter defaults to None.)

    Returns:
        Mapping of each generator's original FunctionDef to its compiled
        ClassDef, or None if parsing/compilation failed for it.
    """
    #  All the function-defs containing the signature decorator will be treated as generators
    gen_defs: Dict[Tuple[str, str],
                   ast.FunctionDef] = GenCollector().collect(spec_ast)
    compiled_map: Dict[ast.FunctionDef, Optional[ast.ClassDef]] = {}
    if parse_cache is None:
        parse_cache = {}

    parse_cache.update(
        parse_gens_from_defs(gen_defs, cmd_args, parse_cache=parse_cache))

    for (namespace, gen_id), gen_def in gen_defs.items():
        igen: IGenerator = parse_cache[namespace + '.' + gen_id]
        if igen is None:
            #  Parsing already failed and was logged; record the failure.
            logger.err("Skipping {}.{} because of parse error".format(
                namespace, gen_id))
            compiled_map[gen_def] = None
            continue

        try:
            logger.info("Compiling {}.{}".format(namespace, gen_id))
            compiled_def: ast.ClassDef = compile_gen(igen)
            compiled_map[gen_def] = compiled_def
        except Exception as e:
            #  Best-effort: a failed compilation is recorded as None rather
            #  than aborting the remaining generators.
            logger.err("Compilation of {}.{} failed".format(namespace, gen_id))
            logging.exception(e)
            compiled_map[gen_def] = None

    return compiled_map
コード例 #4
0
ファイル: generation.py プロジェクト: rbavishi/autopandas
        def process_with_tracking(cls, raw_data: Dict):
            """
            Generate per-function training data for each function in the
            recorded function sequence, using the tracked generator state.

            Functions without a registered generator are skipped with a
            (cached) warning; SilentException aborts a single function
            quietly, any other exception is logged and skipped.

            Returns:
                List of (function_name, training_data_dict) pairs.
            """
            spec = GeneratorInversionSpec(raw_data['inputs'],
                                          raw_data['output'],
                                          raw_data['intermediates'],
                                          raw_data['generator_tracking'])

            results: List[Tuple[str, Dict[str, List[Any]]]] = []
            for depth, fn_name in enumerate(raw_data['function_sequence'], 1):
                if fn_name not in cls.generators:
                    logger.warn("Generator not defined for {}".format(fn_name),
                                use_cache=True)
                    continue

                try:
                    #  Trackers are aligned with the 1-based depth sequence.
                    training_data = \
                        cls.generators[fn_name].generate_arguments_training_data(
                            spec, depth=depth, tracker=spec.trackers[depth - 1])
                    results.append((fn_name, training_data))
                except SilentException:
                    pass
                except Exception as e:
                    logger.err("Encountered Exception for {}".format(fn_name))
                    logging.exception(e)

            return results
コード例 #5
0
ファイル: generation.py プロジェクト: rbavishi/autopandas
    def init(self):
        """
        (Re)create the output directory at self.args.outdir and reset the
        writer map.

        The directory is wiped and recreated when it does not exist yet or
        when force was requested. Exits the process with status 1 if the
        directory still cannot be created.
        """
        import shutil  # local import: replaces shelling out to rm/mkdir

        if (not os.path.exists(self.args.outdir)) or self.args.force:
            #  Equivalent of `rm -rf` + `mkdir -p`, but without spawning a
            #  shell — robust to spaces/special characters in the path and
            #  not limited to POSIX shells.
            shutil.rmtree(self.args.outdir, ignore_errors=True)
            os.makedirs(self.args.outdir, exist_ok=True)

        if not os.path.exists(self.args.outdir):
            logger.err("Failed to create output directory at {}".format(
                self.args.outdir))
            sys.exit(1)

        #  Nested map of IndexedFileWriter instances, reset on every init;
        #  keys are set by callers (presumably category -> filename — confirm).
        self.file_map: Dict[str, Dict[str, IndexedFileWriter]] = {}
コード例 #6
0
    def add_external_edges(self,
                           other: 'GraphNodeCollection',
                           collector: EdgeCollection,
                           is_reverse=False):
        """
        Record EQUALITY and SUBSTR/SUPSTR edges between this collection's
        nodes and ``other``'s nodes into ``collector``.

        Both kinds of edges are added only when the two collections come
        from different kinds of sources (``self.source[0] != other.source[0]``)
        and the corresponding option flag is enabled.

        Args:
            other: The collection whose nodes are compared against ours.
            collector: Accumulator that receives the new edges.
            is_reverse: When True this call is a no-op — presumably the
                caller invokes the pair in both directions and one pass
                suffices; confirm against the call site.
        """
        if is_reverse:
            return

        if self.options.EQUALITY_EDGES and self.source[0] != other.source[0]:
            #  We only add equality edges between collections from different kinds of sources
            #  I don't see much point in having equality edges between say the groupby groups produced
            #  We want to capture relationships between the input and output, not amongst the outputs themselves
            for val1, nodes1 in self.value_map.items():
                if val1 in other.value_map:
                    val2 = val1
                    nodes2 = other.value_map[val2]
                    try:
                        #  This can fail for NaNs etc.
                        if val1 == val2:
                            #  Equality is symmetric, hence an undirected edge
                            #  between every cross-pair of nodes.
                            for n1, n2 in itertools.product(nodes1, nodes2):
                                collector.add_edge(n1,
                                                   n2,
                                                   GraphEdgeType.EQUALITY,
                                                   directed=False)

                    except Exception as e:
                        logger.err("Error comparing {} and {}".format(
                            val1, val2))
                        logging.exception(e)

        if self.options.SUBSTR_EDGES and self.source[0] != other.source[0]:
            #  We only add substr edges between collections from different kinds of sources.
            #  The reasoning is the same as in the equality edges case
            for val1, nodes1 in self.value_map.items():
                for val2, nodes2 in other.value_map.items():
                    #  At least one side must already be a str; both sides are
                    #  stringified before the containment checks below.
                    if isinstance(val1, str) or isinstance(val2, str):
                        # if (str(val1) in str(val2)) or (str(val2) in str(val1)):
                        #  NOTE: equal strings take only this first branch, so
                        #  each pair yields exactly one SUBSTR/SUPSTR edge pair.
                        if str(val1) in str(val2):
                            for n1, n2 in itertools.product(nodes1, nodes2):
                                collector.add_edge(n1, n2,
                                                   GraphEdgeType.SUBSTR)
                                collector.add_edge(n2, n1,
                                                   GraphEdgeType.SUPSTR)

                        elif str(val2) in str(val1):
                            #  Mirror case: val2 is the substring of val1.
                            for n1, n2 in itertools.product(nodes1, nodes2):
                                collector.add_edge(n2, n1,
                                                   GraphEdgeType.SUBSTR)
                                collector.add_edge(n1, n2,
                                                   GraphEdgeType.SUPSTR)
コード例 #7
0
    def run(self, cmd: str):
        """Execute ``cmd`` through the shell, retrying on non-zero exit.

        Waits 5 seconds before the first retry and adds 5 seconds per
        attempt, capped at 20 seconds. After self.max_gdrive_retries failed
        retries, logs the failure and terminates the process with status 1.
        """
        backoff = 5
        backoff_cap = 20
        retries_used = 0

        exit_code = os.system(cmd)
        while exit_code != 0:
            retries_used += 1
            if retries_used > self.max_gdrive_retries:
                #  Retry budget exhausted — give up and stop the program.
                logger.err("Command {cmd} failed with exit code {code}".format(cmd=cmd, code=exit_code))
                sys.exit(1)

            logger.info("Retrying after {sleep} seconds...".format(sleep=backoff))
            time.sleep(backoff)
            backoff = min(backoff + 5, backoff_cap)
            exit_code = os.system(cmd)
コード例 #8
0
    def get_output(self, cmd: str):
        """Run ``cmd`` through the shell and return its decoded stdout.

        A failure whose output mentions a rate limit is retried (up to
        self.max_gdrive_retries times) with a backoff growing by 5 seconds
        per attempt, capped at 20 seconds. Any other failure — or an
        exhausted retry budget — is logged and terminates the process.
        """
        backoff = 5
        backoff_cap = 20
        tries = 0
        while True:
            tries += 1
            try:
                raw = subprocess.check_output(cmd, shell=True)
            except subprocess.CalledProcessError as e:
                e.output = str(e.output)
                retryable = ('rateLimitExceeded' in e.output
                             and tries <= self.max_gdrive_retries)
                if not retryable:
                    logger.err("Command {cmd} failed with exit code {code} "
                               "and output {output}".format(cmd=cmd, code=e.returncode, output=e.output))
                    sys.exit(1)

                logger.info("Rate Limit Exceeded. Waiting {sleep} seconds...".format(sleep=backoff))
                time.sleep(backoff)
                backoff = min(backoff + 5, backoff_cap)
            else:
                return raw.decode("utf-8")
コード例 #9
0
 def add_equality_edges(self, wrapped_df1: DfTypeWrapper, df1_idx: str,
                        wrapped_df2: DfTypeWrapper, df2_idx: str):
     """Add EQUALITY edges between nodes of the two dataframes whose
     values compare equal.

     Pairs whose comparison raises TypeError/ValueError/SyntaxError
     (e.g. incomparable types) are skipped silently; any other
     comparison failure is logged.
     """
     values_to_nodes_1 = self.value_to_node_map(wrapped_df1, df1_idx)
     values_to_nodes_2 = self.value_to_node_map(wrapped_df2, df2_idx)
     for value_1, nodes_1 in values_to_nodes_1.items():
         for value_2, nodes_2 in values_to_nodes_2.items():
             try:
                 if value_1 == value_2:
                     #  Connect every cross-pair of matching nodes.
                     for node_1 in nodes_1:
                         for node_2 in nodes_2:
                             self.add_edge(node_1, node_2,
                                           RelationGraphEdgeType.EQUALITY)
             except (TypeError, ValueError, SyntaxError):
                 pass
             except Exception as e:
                 logger.err("Error comparing {} and {}".format(
                     value_1, value_2))
                 logging.exception(e)
コード例 #10
0
ファイル: compiler.py プロジェクト: rbavishi/autopandas
def parse_gens_from_defs(
    gen_defs: Dict[Tuple[str, str], ast.FunctionDef],
    cmd_args: ArgNamespace,
    parse_cache: Optional[Dict[str, Optional[IGenerator]]] = None
) -> Dict[str, Optional[IGenerator]]:
    """
    Parse an IGenerator from each collected generator function-def.

    Args:
        gen_defs: Mapping of (namespace, gen_id) -> generator FunctionDef.
        cmd_args: Command-line arguments forwarded to the parser.
        parse_cache: Optional previously-parsed results, copied into the
            returned mapping up front so parsers can resolve references to
            already-parsed generators. (Annotation fixed to Optional[...]:
            the parameter defaults to None.)

    Returns:
        Mapping of "namespace.gen_id" -> IGenerator, or None for entries
        whose parsing failed (the failure is logged, not raised).
    """
    parse_results: Dict[str, Optional[IGenerator]] = {}
    if parse_cache is not None:
        parse_results.update(parse_cache)

    for (namespace, gen_id), gen_def in gen_defs.items():
        qualified_name = namespace + '.' + gen_id
        try:
            logger.info("Parsing {}.{}".format(namespace, gen_id))
            igen: IGenerator = parse_gen_from_ast(gen_def, namespace, gen_id,
                                                  parse_results, cmd_args)
            parse_results[qualified_name] = igen
        except Exception as e:
            #  Best-effort: record the failure and continue with the rest.
            logger.err("Parsing of {}.{} failed".format(namespace, gen_id))
            logging.exception(e)
            parse_results[qualified_name] = None

    return parse_results
コード例 #11
0
def run_download(cmd_args: ArgNamespace):
    """
    Download a file/folder from Google Drive using the gdrive binary
    expected at ~/gdrive.

    Requires one of --path / --path-id to identify the source. Returns
    early (after logging) if the gdrive binary is missing.
    """
    import shlex  # local import: quote shell arguments safely

    home_dir = os.path.expanduser("~")
    gdrive_bin = home_dir + '/gdrive'
    if not os.path.exists(gdrive_bin):
        logger.err(
            "Could not find gdrive at {home_dir}. "
            "Please download binary from https://github.com/gdrive-org/gdrive \n"
            "WARNING : Delete gdrive and {home_dir}/.gdrive after use".format(
                home_dir=home_dir))
        return

    if cmd_args.path is None and cmd_args.path_id is None:
        raise Exception("One of --path and --path-id should be provided")

    runner = GDriveRunner(home_dir, cmd_args)

    if cmd_args.path_id is not None:
        path = cmd_args.path_id
    else:
        path = runner.get_id(cmd_args.path)

    #  Quote the interpolated values: previously an output directory (or id)
    #  containing spaces or shell metacharacters broke the command.
    cmd = '{gdrive} download {path} --force --path {outdir}'.format(
        gdrive=gdrive_bin, path=shlex.quote(path),
        outdir=shlex.quote(cmd_args.outdir))
    runner.run(cmd)
コード例 #12
0
    def convert_input_output(cls,
                             inputs,
                             output,
                             options: RelationGraphOptions,
                             extras=None):
        """
        Convert raw inputs, output, and extras into lists of dataframe
        wrappers via cls.get_df.

        Args:
            inputs: Iterable of raw input values.
            output: Raw output value (converted with mode='output').
            options: RelationGraphOptions — accepted for interface
                stability; not read here.
            extras: Optional iterable of extra values (mode='extra').

        Returns:
            Tuple (input_dfs, output_dfs, extra_dfs).

        Raises:
            NotImplementedError: when conversion fails for any value
                (the underlying exception is printed first), or when an
                input exceeds 100 rows or columns.
        """
        input_dfs = []
        for input_ in inputs:
            try:
                input_dfs += cls.get_df(input_)
            except NotImplementedError:
                raise
            except Exception as e:
                logger.err(
                    "Error while getting df for input : {}".format(input_))
                print(e)
                raise NotImplementedError(
                    "Caught exception for input : {}".format(input_))

        for inp_wrapper in input_dfs:
            df = inp_wrapper.df
            #  Large inputs blow up downstream graph construction.
            if len(df) > 100 or len(df.columns) > 100:
                raise NotImplementedError(
                    "Cannot handle inputs with >100 rows or columns")

        try:
            output_dfs = cls.get_df(output, mode='output')
        except NotImplementedError:
            raise
        except Exception as e:
            logger.err("Error while getting df for output : {}".format(output))
            print(e)
            raise NotImplementedError(
                "Caught exception for output : {}".format(output))

        extra_dfs = []
        if extras:
            for extra_ in extras:
                try:
                    extra_dfs += cls.get_df(extra_, mode='extra')
                except NotImplementedError:
                    raise
                except Exception as e:
                    logger.err(
                        "Error while getting df for extra : {}".format(extra_))
                    print(e)
                    #  Fixed copy-paste bug: this message previously said
                    #  "input" even though it reports an extra value.
                    raise NotImplementedError(
                        "Caught exception for extra : {}".format(extra_))

        return input_dfs, output_dfs, extra_dfs