def get_keys_config_file(self):
        """Return the top-level keys of the data-pipeline template config.

        Downloads the template schema, parses it as YAML and exposes the
        resulting dictionary's keys so callers can list the config sections.
        """
        logging.info("List of keys config file")
        schema = self.download_template_schema()
        reader = YAMLReader(schema)
        config = reader.read_yaml(True)
        return config.keys()
    def create_config_file(self, list_files, prefix_file=None):
        """Create the data-pipeline YAML config file from downloaded files.

        Args:
            list_files: dict mapping filename -> metadata dict; each metadata
                dict carries a 'resource' key naming the template entry the
                file belongs to.
            prefix_file: optional prefix forwarded to ``open_config_file``.
        """
        logging.info("Creating data pipeline config file.")
        data_pipeline_schema = self.download_template_schema()
        data_pipeline_yaml = YAMLReader(data_pipeline_schema)
        data_pipeline_config = data_pipeline_yaml.read_yaml(True)

        # Group the downloaded filenames by the resource they belong to.
        list_resources = {}
        for filename, meta in list_files.items():
            list_resources.setdefault(meta['resource'], []).append(filename)

        for resource, filenames in list_resources.items():
            if resource in data_pipeline_config:
                # The template file is the reference for creating a list or a
                # single entry: String vs. List.
                # BUG FIX: `basestring` exists only in Python 2 and raises
                # NameError on Python 3 (which the surrounding code targets);
                # use `str` instead.
                if len(filenames) == 1 and isinstance(
                        data_pipeline_config[resource], str):
                    data_pipeline_config[resource] = filenames[0]
                else:
                    data_pipeline_config[resource] = filenames
            else:
                logging.error("The key %s does not exist", resource)

        with self.open_config_file(prefix_file) as outfile:
            yaml.safe_dump(data_pipeline_config,
                           outfile,
                           default_flow_style=False,
                           allow_unicode=True)

        logging.info("Data Pipeline YAML file created.")
def main():
    """Entry point: parse CLI args, read the YAML config, list steps and run.

    Side effects: configures logging and executes the resource retrieval.
    """
    cfg.setup_parser()
    args = cfg.get_args()
    # FIX: renamed local from `yaml` to `yaml_reader` — the old name shadowed
    # the PyYAML `yaml` module used elsewhere in this file (yaml.safe_dump).
    yaml_reader = YAMLReader(args.config)
    yaml_dict = yaml_reader.read_yaml()
    print_list_steps(yaml_reader.get_list_keys())
    cfg.set_up_logging(args)

    resources = RetrieveResource(args, yaml_dict)
    resources.run()
def main():
    """Entry point: parse CLI args, load config, check Google params and run.

    Side effects: configures logging and executes the resource retrieval.
    """
    cfg.setup_parser()
    args = cfg.get_args()
    # FIX: renamed local from `yaml` to `yaml_reader` to avoid shadowing the
    # PyYAML `yaml` module used elsewhere in this file.
    yaml_reader = YAMLReader()
    yaml_dict = yaml_reader.read_yaml()
    get_list_steps_on_request(args.list_steps, yaml_reader.get_list_keys())
    cfg.set_up_logging(args)

    # Validate the Google credential/bucket parameter pair.
    # FIX: the return value was bound to an unused `google_opts` local; keep
    # the call for its validation side effect and drop the dead binding.
    GoogleBucketResource.has_google_parameters(args.google_credential_key,
                                               args.google_bucket)
    resources = RetrieveResource(args, yaml_dict,
                                 yaml_dict.data_pipeline_schema)
    resources.run()
# ---- Example #5 ----
def main():
    """Entry point: parse CLI args, load the config file, and run retrieval.

    Side effects: configures logging and executes the resource retrieval.
    """
    cfg.setup_parser()
    args = cfg.get_args()
    # FIX: renamed local from `yaml` to `yaml_reader` to avoid shadowing the
    # PyYAML `yaml` module used elsewhere in this file.
    yaml_reader = YAMLReader(args.config)
    yaml_dict = yaml_reader.read_yaml()
    get_list_steps_on_request(args.list_steps, yaml_reader.get_list_keys())
    cfg.set_up_logging(args)

    #--gkey and --google_bucket are mandatory for the google storage access. Both keys must be parameters or none.
    GoogleBucketResource.has_google_parameters(args.google_credential_key,
                                               args.google_bucket)
    resources = RetrieveResource(args, yaml_dict,
                                 yaml_dict.data_pipeline_schema)
    resources.run()
# ---- Example #6 ----
class TestYamlReader(unittest.TestCase):
    """
    YamlReader reads the 'config.yaml' file in the base directory and returning a dictionary representation.

    This test module includes basic validation tests (eg. 'test_config_parses'). Additional 'key specific' tests
    should be included here too as a validation mechanism (eg. chembl_indexes_have_fields).
    """
    def setUp(self):
        # Parse the default config once before every test.
        default_conf_file = ROOT_DIR + '/' + 'config.yaml'
        print(default_conf_file)
        self.yaml_reader = YAMLReader(default_conf_file)
        self.yaml_dict = self.yaml_reader.read_yaml()

    def test_config_parses(self):
        """
        Basic test to see if there are yaml parse errors: if the config file has a syntax problem an empty dictionary
        is returned.
        """
        self.assertGreater(len(self.yaml_dict), 0,
                           "Yaml dict should not be empty.")

    def test_chembl_indexes_have_fields(self):
        """
        Checks that all listed ChEMBL elasticsearch indexes in config select at least one query field.
        """
        indexes = self.yaml_dict.ChEMBL.datasources.indices
        for i in list(indexes.values()):
            for k, v in list(i.items()):
                print(k)
                # BUG FIX: `k is 'fields'` tests object identity and only
                # worked by accident of CPython string interning; equality
                # (`==`) is the correct comparison for string values.
                if k == 'fields':
                    self.assertGreater(
                        len(v), 0, 'No fields provided on index {}'.format(k))
# ---- Example #7 ----
class TestDrugStep(unittest.TestCase):
    """
    YamlReader reads the 'config.yaml' file in the base directory and returning a dictionary representation.
    """
    def setUp(self):
        # Parse the default configuration once for every test case.
        config_path = ROOT_DIR + '/' + 'config.yaml'
        self.yaml_reader = YAMLReader(config_path)
        self.config = self.yaml_reader.read_yaml()

    @patch('modules.Drug.Drug._download_elasticsearch_data')
    def test_output_has_fields_to_write_to_GCP(self, mock1):
        """
        Step should return a dictionary with GCP target directory so that if `RetrieveResource` is configured to upload
        results it can save the files somewhere valid.
        """
        # Given: the file names the mocked ES download reports as saved.
        saved_files = ['f1', 'f2', 'f3']
        mock1.return_value = saved_files
        # When: the drug step collects all of its outputs.
        step = Drug.Drug(self.config['drug'])
        results = step.get_all()
        # Then: one result entry per saved file ...
        self.assertEqual(len(results), len(saved_files))
        # ... and each entry carries 'resource' and 'gs_output_dir' fields.
        for name in saved_files:
            self.assertTrue(name in results)
            self.assertTrue('resource' in results[name])
            self.assertTrue('gs_output_dir' in results[name])
# ---- Example #8 ----
class TestDrugStep(unittest.TestCase):
    """
    YamlReader reads the 'config.yaml' file in the base directory and returning a dictionary representation.
    """
    def setUp(self):
        # Parse the default configuration and build the output-dir fixture.
        config_path = ROOT_DIR + '/' + 'config.yaml'
        self.yaml_reader = YAMLReader(config_path)
        self.config = self.yaml_reader.read_yaml()
        self.output = self.create_output_dir_test()

    def create_output_dir_test(self):
        # Minimal stand-in for the output-directory object the step expects.
        dirs = Dict()
        dirs.prod_dir = "prod"
        dirs.staging_dir = "staging"
        return dirs

    @patch('plugins.Drug.Drug._download_elasticsearch_data')
    def test_output_has_fields_to_write_to_GCP(self, mock1):
        """
        Step should return a dictionary with GCP target directory so that if `RetrieveResource` is configured to upload
        results it can save the files somewhere valid.
        """
        # Given: the file names the mocked ES download reports as saved.
        saved_files = ['f1', 'f2', 'f3']
        mock1.return_value = saved_files
        # Restrict the drug config to ES results only: drop plain downloads.
        es_config = self.config['drug']
        es_config.datasources.pop('downloads', None)
        # When: only the index download is exercised.
        step = Drug.Drug()
        results = step.download_indices(self.config.drug, self.output)
        # Then: every saved file appears in the returned dictionary.
        self.assertEqual(len(results), len(saved_files))
        for name in saved_files:
            self.assertTrue(name in results)
# ---- Example #9 ----
 def setUp(self):
     """Parse the default 'config.yaml' from ROOT_DIR into self.yaml_dict before each test."""
     default_conf_file = ROOT_DIR + '/' + 'config.yaml'
     # Echo the path being loaded to aid debugging test-environment issues.
     print(default_conf_file)
     self.yaml_reader = YAMLReader(default_conf_file)
     self.yaml_dict = self.yaml_reader.read_yaml()
# ---- Example #10 ----
 def setUp(self):
     """Parse the default 'config.yaml' into self.config and build the output-dir fixture before each test."""
     default_conf_file = ROOT_DIR + '/' + 'config.yaml'
     self.yaml_reader = YAMLReader(default_conf_file)
     self.config = self.yaml_reader.read_yaml()
     # create_output_dir_test() supplies the prod/staging directory fixture.
     self.output = self.create_output_dir_test()