Example 1
    def __init__(self, config_file):
        self._logger = logging.getLogger(__name__)
        self._dao = RemoteBackboneDAO()

        if os.getenv('LOCAL_TEST'):
            self._dao = LocalBackboneDAO('upload_test', [
                'cn=editor,ou=sims,ou=projects,ou=groups,dc=malariagen,dc=net'
            ])

        self._config_file = config_file

        try:
            with open(config_file) as json_file:
                args = json.load(json_file)
                if args.get('debug'):
                    log_time = datetime.datetime.now().strftime(
                        "%Y_%m_%d_%H_%M")
                    log_file = 'uploader_{}.log'.format(log_time)
                    print("Debugging to {}".format(log_file))
                    logging.basicConfig(level=logging.DEBUG,
                                        filename=log_file)
                if args.get('dao_type') == 'local':
                    if 'database' in args:
                        os.environ['POSTGRES_DB'] = args['database']
                    self._logger.debug('Using database {}'.format(
                        os.getenv('POSTGRES_DB', 'backbone_service')))
                    self._dao = LocalBackboneDAO(args['username'],
                                                 args['auths'])
        except FileNotFoundError:
            print('No config file found: {}'.format(config_file))

        self._dao.setup(config_file)
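
The constructor reads a handful of optional keys from the JSON config file. A minimal sketch of a config that exercises the local-DAO branch; the key names are the ones the code reads, while the file name and values are illustrative only:

import json

# 'debug' enables DEBUG logging to a timestamped file, 'dao_type' of
# 'local' selects LocalBackboneDAO with the given username/auths, and
# 'database' overrides POSTGRES_DB (default 'backbone_service')
config = {
    'debug': True,
    'dao_type': 'local',
    'database': 'backbone_service',
    'username': 'upload_test',
    'auths': ['cn=editor,ou=sims,ou=projects,ou=groups,dc=malariagen,dc=net']
}

with open('upload.json', 'w') as json_file:
    json.dump(config, json_file)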
Example 2
    def getDAO(self):
        dao = RemoteBackboneDAO()

        if os.getenv('LOCAL_TEST'):
            dao = LocalBackboneDAO('upload_test', [
                'cn=editor,ou=sims,ou=projects,ou=groups,dc=malariagen,dc=net'
            ])

        dao.setup(self._config_file)
        return dao
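
getDAO() decides purely on the LOCAL_TEST environment variable, and os.getenv() returns a truthy value for any non-empty string. A usage sketch, where obj stands for a hypothetical instance of the enclosing class:

import os

os.environ['LOCAL_TEST'] = '1'
local_dao = obj.getDAO()     # LocalBackboneDAO, set up from the config file

del os.environ['LOCAL_TEST']
remote_dao = obj.getDAO()    # RemoteBackboneDAO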
Example 3
class SetStudies():

    _auth_token = ''
    _api_client = None

    def __init__(self, config_file, cmis_config):

        self._logger = logging.getLogger(__name__)
        self.get_cmis_client(cmis_config)

        self._dao = RemoteBackboneDAO()

        if os.getenv('LOCAL_TEST'):
            self._dao = LocalBackboneDAO('upload_test', [
                'cn=editor,ou=sims,ou=projects,ou=groups,dc=malariagen,dc=net'
            ])

        self._config_file = config_file

        try:
            with open(config_file) as json_file:
                args = json.load(json_file)
                if args.get('debug'):
                    log_time = datetime.datetime.now().strftime(
                        "%Y_%m_%d_%H_%M")
                    log_file = 'uploader_{}.log'.format(log_time)
                    print("Debugging to {}".format(log_file))
                    logging.basicConfig(level=logging.DEBUG,
                                        filename=log_file)
                if args.get('dao_type') == 'local':
                    if 'database' in args:
                        os.environ['POSTGRES_DB'] = args['database']
                    self._logger.debug('Using database {}'.format(
                        os.getenv('POSTGRES_DB', 'backbone_service')))
                    self._dao = LocalBackboneDAO(args['username'],
                                                 args['auths'])
        except FileNotFoundError:
            print('No config file found: {}'.format(config_file))

        self._dao.setup(config_file)

    def get_cmis_client(self, config_file):

        with open(config_file) as json_file:
            config = json.load(json_file)

        self.cmis_client = CmisClient(config['endpoint'], config['username'],
                                      config['password'])
        self.repo = self.cmis_client.defaultRepository

    def update_study_names(self):

        studies = self._dao.download_studies()

        studies_dict = {}
        for study in studies.studies:
            studies_dict[study.code] = study.name

        results = self.repo.query("select * from cggh:collaborationFolder")

        for result in results:
            code = result.getName()[:4]
            if code in studies_dict:
                if studies_dict[code] != result.getName():
                    print('Updating {} to {}'.format(studies_dict[code],
                                                     result.getName()))
                    study_detail = self._dao.download_study(code)
                    study_detail.name = result.getName()
                    self._dao.update_study(code, study_detail)
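
get_cmis_client() expects a second JSON file with the CMIS connection details. A sketch with the three keys the code reads; the endpoint URL and credentials are placeholders:

cmis_config = {
    'endpoint': 'https://repo.example.org/alfresco/cmisatom',
    'username': 'cmis_user',
    'password': 'secret'
}

update_study_names() then treats the first four characters of each collaboration folder name as a study code and renames the matching backbone study to the folder name.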
Example 4
class SetTaxa():

    _taxa_map = {}
    _auth_token = ''
    _api_client = None

    def __init__(self, config_file):
        self._logger = logging.getLogger(__name__)
        self._dao = RemoteBackboneDAO()

        if os.getenv('LOCAL_TEST'):
            self._dao = LocalBackboneDAO('upload_test', [
                'cn=editor,ou=sims,ou=projects,ou=groups,dc=malariagen,dc=net'
            ])

        self._config_file = config_file

        try:
            with open(config_file) as json_file:
                args = json.load(json_file)
                if args.get('debug'):
                    log_time = datetime.datetime.now().strftime(
                        "%Y_%m_%d_%H_%M")
                    log_file = 'uploader_{}.log'.format(log_time)
                    print("Debugging to {}".format(log_file))
                    logging.basicConfig(level=logging.DEBUG,
                                        filename=log_file)
                if args.get('dao_type') == 'local':
                    if 'database' in args:
                        os.environ['POSTGRES_DB'] = args['database']
                    self._logger.debug('Using database {}'.format(
                        os.getenv('POSTGRES_DB', 'backbone_service')))
                    self._dao = LocalBackboneDAO(args['username'],
                                                 args['auths'])
        except FileNotFoundError:
            print('No config file found: {}'.format(config_file))

        self._dao.setup(config_file)

    def load_taxa_map(self):

        with open('taxon_mapping.csv') as csvfile:
            data_reader = csv.reader(csvfile)

            for row in data_reader:
                taxas = []
                for taxa in row[6].split(';'):
                    taxas.append(
                        openapi_client.Taxonomy(taxonomy_id=int(taxa)))

                self._taxa_map[row[0]] = taxas

    def set_taxa(self):

        studies = self._dao.download_studies()

        for study in studies.studies:
            # Reset per study, otherwise a single change would trigger
            # an update for every following study as well
            update = False
            study_detail = self._dao.download_study(study.code)
            for species in study_detail.partner_species:
                if species.partner_species in self._taxa_map:
                    taxas = self._taxa_map[species.partner_species]
                    for taxa in taxas:
                        found = False
                        for st in species.taxa:
                            if int(taxa.taxonomy_id) == int(st.taxonomy_id):
                                found = True
                        if not found:
                            print(
                                "In study {} Setting taxa for {} to {} from {}"
                                .format(study.code, species.partner_species,
                                        taxas, species.taxa))
                            species.taxa = taxas
                            update = True
                else:
                    # No taxa mapping defined for this partner species
                    pass

            if update:
                #print(study_detail)
                self._dao.update_study(study.code, study_detail)
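
load_taxa_map() reads only two columns of taxon_mapping.csv: column 0, the partner species name used as the lookup key, and column 6, a semicolon-separated list of taxonomy IDs. Illustrative rows (the IDs and the empty padding columns are placeholders):

Plasmodium falciparum,,,,,,5833
Plasmodium vivax/falciparum mixed,,,,,,5855;5833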
Example 5
class Uploader():

    _data_file = None
    _event_set = None
    _dao = None
    _config_file = None

    def __init__(self, config_file):
        self._logger = logging.getLogger(__name__)

        self._dao = RemoteBackboneDAO()

        if os.getenv('LOCAL_TEST'):
            self._dao = LocalBackboneDAO('upload_test', [
                'cn=editor,ou=sims,ou=projects,ou=groups,dc=malariagen,dc=net'
            ])

        self._config_file = config_file

        try:
            with open(config_file) as json_file:
                args = json.load(json_file)
                if args.get('debug'):
                    log_time = datetime.datetime.now().strftime(
                        "%Y_%m_%d_%H_%M")
                    log_file = 'uploader_{}.log'.format(log_time)
                    print("Debugging to {}".format(log_file))
                    logging.basicConfig(level=logging.DEBUG,
                                        filename=log_file)
                if args.get('dao_type') == 'local':
                    if 'database' in args:
                        os.environ['POSTGRES_DB'] = args['database']
                    self._logger.debug('Using database {}'.format(
                        os.getenv('POSTGRES_DB', 'backbone_service')))
                    self._dao = LocalBackboneDAO(args['username'],
                                                 args['auths'])
        except FileNotFoundError:
            print('No config file found: {}'.format(config_file))

        self._dao.setup(config_file)

    @property
    def message_buffer(self):
        return BaseEntity.message_buffer

    @property
    def use_message_buffer(self):
        return BaseEntity.use_message_buffer

    @use_message_buffer.setter
    def use_message_buffer(self, use_buffer):
        BaseEntity.set_use_message_buffer(use_buffer)

    def setup(self, filename):

        self._data_file = os.path.basename(filename)

        self._event_set = os.path.basename(filename).split('.')[0]

        event_set_id = self._event_set  # str | ID of eventSet to create

        self.se_processor = SamplingEventProcessor(self._dao, self._event_set)
        self.os_processor = OriginalSampleProcessor(self._dao, self._event_set)
        self.os_processor.sampling_event_processor = self.se_processor
        self.ds_processor = DerivativeSampleProcessor(self._dao,
                                                      self._event_set)
        self.ad_processor = AssayDataProcessor(self._dao, self._event_set)
        self.i_processor = IndividualProcessor(self._dao, self._event_set)

        self._dao.create_event_set(event_set_id)

    def load_data_file(self, data_def, filename):

        self.setup(filename)

        input_stream = open(filename)

        if self._logger.isEnabledFor(logging.DEBUG):
            import cProfile
            profile = cProfile.Profile()
            profile.enable()

        ret = self.load_data(data_def, input_stream, True, False)

        if self._logger.isEnabledFor(logging.DEBUG):
            profile.disable()
            import io
            import pstats
            s = io.StringIO()
            sortby = 'cumulative'
            ps = pstats.Stats(profile, stream=s).sort_stats(sortby)
            ps.print_stats(.1, 'uploader')
            self._logger.debug(s.getvalue())
            profile.dump_stats('upload_source_stats.cprof')

        return ret

    def parse_date(self, defn, date_value):

        accuracy = None
        # Strip any time component; formats that contain spaces are
        # matched against the full, unsplit value instead
        data_value = date_value.split(' ')[0]

        # Try the configured format first, then a series of fallbacks.
        # The final '%Y' attempt only looks at the first four characters
        # and records that the date is accurate to the year only.
        attempts = [
            (defn.get('date_format', '%Y-%m-%d'), data_value, None),
            ('%d/%m/%Y', data_value, None),
            ('%d-%b-%Y', data_value, None),
            ('%d/%m/%y', data_value, None),
            ('%d %b %Y', date_value, None),
            ('%Y', data_value[:4], 'year'),
        ]

        for i, (date_format, value, acc) in enumerate(attempts):
            try:
                data_value = datetime.datetime(
                    *(time.strptime(value, date_format))[:6]).date()
                accuracy = acc
                break
            except ValueError:
                if i == len(attempts) - 1:
                    # Completely unparseable - let the caller handle it
                    raise

        return data_value, accuracy

    def load_data(self, data_def, input_stream, skip_header, update_only):

        processed = 0

        with input_stream as csvfile:
            data_reader = csv.reader(csvfile, delimiter='\t')

            if skip_header:
                next(data_reader)

            for row in data_reader:
                values = {}
                processed = processed + 1
                # Ensure columns are processed in order, for more
                # predictable behaviour - see also the doc_accuracy comment below
                for name, defn in sorted(data_def['values'].items(),
                                         key=lambda x: x[1]['column']):
                    #print(repr(defn))
                    #print(repr(row))
                    data_value = row[defn['column']]
                    if data_value == '\\N':
                        continue
                    #Convert data value - make sure you set data_value
                    try:
                        if 'regex' in defn:
                            re_match = re.search(defn['regex'], data_value)
                            if re_match:
                                #print("Groupdict:" + repr(re_match.groupdict()))
                                try:
                                    data_value = re_match.group(1)
                                except IndexError as iere:
                                    raise InvalidDataValueException(
                                        "Failed to parse {} using {}".format(
                                            data_value,
                                            defn['regex'])) from iere
                                #print("Transformed value is:" + data_value + " from " + row[defn['column']])
                                #print(repr(re_match.groupdict()))
                                #if row[defn['column']] != "" and data_value == "":
                                #    print("Empty match: {} {}".format(defn['regex'], row[defn['column']]))
                            #else:
                            #    print("No match: {} {}".format(defn['regex'], data_value))
                        if defn['type'] == 'datetime':
                            if data_value not in ('', 'NULL', '-', 'None'):
                                try:
                                    data_value, values[
                                        name + '_accuracy'] = self.parse_date(
                                            defn, data_value)
                                except ValueError as dpe:
                                    self.se_processor.report(
                                        "Failed to parse date '{}'".format(
                                            data_value), values)
                                    continue
                            else:
                                #Skip this property
                                continue

                        if 'replace' in defn:
                            for subs in defn['replace']:
                                data_value = re.sub("^" + subs[0] + "$",
                                                    subs[1], data_value)
                                #print("Transformed value is:" + data_value + " from " + row[defn['column']])

                    except IndexError:
                        self._logger.critical(repr(defn))
                        self._logger.critical(repr(row))
                        raise

                    if defn['type'] == 'string':
                        #Ignore empty values
                        #This can be important e.g. if date is parsed and set doc_accuracy to year
                        #and doc_accuracy accuracy defined column is empty
                        if data_value and data_value.strip():
                            values[name] = data_value.strip()
                        #else:
                        #    print('Ignoring {} {} {}'.format(name, data_value, values))
                    else:
                        values[name] = data_value

                self.process_item(values)

        return processed

    def process_item(self, values):

        #Reset connections for each item
        #Have had problems with pool.join blocking in ApiClient for
        #larger input files
        self._dao.setup(self._config_file)

        if 'study_id' not in values:
            values['study_id'] = '0000-Unknown'

        o_sample = self.os_processor.create_original_sample_from_values(values)

        o_existing = self.os_processor.lookup_original_sample(o_sample, values)

        if o_existing and values['study_id'][:4] == '0000':
            values['study_id'] = o_existing.study_name

        samp = self.se_processor.create_sampling_event_from_values(values)

        location_name, location = self.se_processor.process_location(
            values, '')
        proxy_location_name, proxy_location = self.se_processor.process_location(
            values, 'proxy_')

        #print(samp)

        existing = self.se_processor.lookup_sampling_event(
            o_existing, samp, location, proxy_location, values)

        if location:
            samp.location_id = location.location_id

        if proxy_location:
            samp.proxy_location_id = proxy_location.location_id

        indiv = self.i_processor.create_individual_from_values(values)
        existing_indiv = None
        if existing and existing.individual_id:
            existing_indiv = self._dao.download_individual(
                existing.individual_id)
        else:
            existing_indiv = self.i_processor.lookup_individual(indiv, values)

        individual = self.i_processor.process_individual(
            values, indiv, existing_indiv)
        if individual:
            samp.individual_id = individual.individual_id
        sampling_event = self.se_processor.process_sampling_event(
            values, samp, existing)

        if sampling_event:
            o_sample.sampling_event_id = sampling_event.sampling_event_id

        original_sample = self.os_processor.process_original_sample(
            values, o_sample, o_existing)

        d_sample = self.ds_processor.create_derivative_sample_from_values(
            values)

        dsamp = self.ds_processor.lookup_derivative_sample(d_sample, values)

        derivative_sample = self.ds_processor.process_derivative_sample(
            d_sample, dsamp, original_sample, values)

        ad_sample = self.ad_processor.create_assay_datum_from_values(values)

        adsamp = self.ad_processor.lookup_assay_datum(ad_sample, values)

        self.ad_processor.process_assay_datum(ad_sample, adsamp,
                                              derivative_sample, values)

        return sampling_event
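
Putting Example 5 together: a data definition maps tab-separated column positions to typed properties, and load_data_file() drives each row through process_item(), which creates or updates the individual, sampling event, original/derivative samples and assay data. A hedged end-to-end sketch; the file names and column layout are invented, while the defn keys ('values', 'column', 'type', 'regex', 'date_format') are the ones load_data() consumes:

data_def = {
    'values': {
        'sample_id': {'column': 0, 'type': 'string'},
        'study_id': {'column': 1, 'type': 'string', 'regex': r'^(\w{4})'},
        'doc': {'column': 2, 'type': 'datetime', 'date_format': '%Y-%m-%d'},
    }
}

uploader = Uploader('upload.json')
# The event set name is derived from the file's base name ('event_set_1')
# and the first row of the file is skipped as a header
uploader.load_data_file(data_def, 'event_set_1.tsv')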