def filter_metadata(self, raw_metadata):
    """
    Reduce the bulk of the collected metadata to the part wanted downstream.

    The returned dict is whatever basic_filter_metadata() produces.  Before
    returning it, this verifies that raw_metadata holds at least one entry
    whose key contains 'experiment.json', that all such entries are equal,
    and that the entry provides channelNames/channelNamesArray.  Any failed
    check raises MetadataError.
    """
    filtered = self.basic_filter_metadata(raw_metadata)

    experiment_files = [value for key, value in raw_metadata.items()
                        if 'experiment.json' in key]
    if not experiment_files:
        raise MetadataError('Cannot find experiment.json')

    # Several copies are tolerated only when they are all identical.
    experiment_md, *other_copies = experiment_files
    if any(other != experiment_md for other in other_copies):
        raise MetadataError("Multiple experiment.json files that do not match")

    # (metadata.tsv field, experiment.json field, comparison type) triples.
    # Only consumed by the consistency check that is disabled below.
    tests = [
        ("resolution_x_value", "xyResolution", float),
        ("resolution_y_value", "xyResolution", float),
        ("resolution_z_value", "zPitch", float),
        ("number_of_cycles", "cycle_upper_limit", int),
        ("execution_datetime", "dateProcessed", datetime),
        #("acquisition_instrument_model", "version", str),
    ]

    has_channel_names = ('channelNames' in experiment_md
                         and 'channelNamesArray' in experiment_md['channelNames'])
    if not has_channel_names:
        raise MetadataError('experiment.json is missing expected element channelNamesArray')

    # Internal consistency checks are disabled because we trust metadata.tsv:
    #self.internal_consistency_checks(filtered, experiment_md, tests,
    #                                 experiment_md['channelNames']['channelNamesArray'])
    return filtered
def internal_consistency_checks(self, rslt, experiment_md, test_tpl_lst, channel_names):
    """
    Check a variety of relationships that are believed to hold between
    [Ee]xperiment.json and metadata found in the metadata.tsv file.

    rslt:          dict of metadata values to be validated
    experiment_md: dict of values read from experiment.json
    test_tpl_lst:  iterable of (rslt key, experiment_md key, type) triples;
                   each pair of values is compared with close_enough_match()
                   using the given type
    channel_names: iterable of channel name strings, or None to skip the
                   number_of_antibodies comparison

    Raises MetadataError when a required element is missing or when a pair
    of values does not match.  Returns None.
    """
    # Diagnostic dump of everything about to be compared.  It must never
    # raise itself: missing keys are reported properly by the validation
    # loop below, as MetadataError rather than a raw KeyError.
    for rslt_nm, expt_nm, _ in test_tpl_lst:
        if rslt_nm not in rslt:
            print('#### rslt: ', rslt_nm, 'KeyError: ', repr(rslt_nm))
        elif expt_nm not in experiment_md:
            print('#### rslt: ', rslt_nm, type(rslt[rslt_nm]), rslt[rslt_nm],
                  'KeyError: ', repr(expt_nm))
        else:
            print('#### rslt: ', rslt_nm, type(rslt[rslt_nm]), rslt[rslt_nm],
                  'exp.json: ', expt_nm, type(experiment_md[expt_nm]),
                  experiment_md[expt_nm])
    print('####')

    for rslt_nm, expt_nm, tp in test_tpl_lst:
        if rslt_nm not in rslt:
            raise MetadataError("metadata is missing expected element {}".format(rslt_nm))
        if expt_nm not in experiment_md:
            raise MetadataError("experiment.json is missing expected element {}"
                                .format(expt_nm))
        if not close_enough_match(rslt[rslt_nm], experiment_md[expt_nm], tp):
            raise MetadataError("metadata field {} does not match experiment.json field {}"
                                .format(rslt_nm, expt_nm))

    if 'number_of_antibodies' not in rslt:
        raise MetadataError("metadata is missing element number_of_antibodies")

    if channel_names is not None:
        # Channels named blank/empty (any case) or starting with DAPI/HOECHST
        # are not counted as antibodies.
        antibody_channels = [
            elt for elt in channel_names
            if elt.lower() not in ('blank', 'empty')
            and not elt.startswith(('DAPI', 'HOECHST'))
        ]
        if not close_enough_match(rslt['number_of_antibodies'], len(antibody_channels), int):
            raise MetadataError("metadata field number_of_antibodies does not match length of "
                                "experiment.json channelNamesArray")
def internal_consistency_checks(self, rslt, readme_md):
    """
    Check the relationship that is believed to hold between the README
    metadata and the filtered metadata in rslt: rslt['tissue_id'] must be
    present and equal to readme_md['UUID Identifier'].

    Raises MetadataError if either element is missing or the two values
    differ.  Returns None.
    """
    required_elements = (
        (rslt, 'tissue_id', 'metadata is missing tissue_id'),
        (readme_md, 'UUID Identifier', 'README metadata is missing UUID Identifier'),
    )
    for container, key, complaint in required_elements:
        if key not in container:
            raise MetadataError(complaint)

    tissue_id = rslt['tissue_id']
    uuid_identifier = readme_md['UUID Identifier']
    if tissue_id != uuid_identifier:
        raise MetadataError('tissue_id does not match UUID Identifier')
def basic_filter_metadata(self, raw_metadata):
    """
    Make sure the basic components of the metadata are present and promote
    them into a single flat dict.

    The result starts with raw_metadata['collectiontype'] and is then updated
    with the one and only entry of raw_metadata['components'].  MetadataError
    is raised when there is not exactly one component, or when the promoted
    assay_type is not 'scRNAseq-10xGenomics'.
    """
    promoted = {'collectiontype': raw_metadata['collectiontype']}

    components = raw_metadata['components']
    if len(components) != 1:
        raise MetadataError("Only one line of metadata.tsv info is currently supported")
    promoted.update(components[0])

    # Hard-coded reality check: a missing assay_type fails the same way as a
    # wrong one.
    if promoted.get('assay_type') != 'scRNAseq-10xGenomics':
        raise MetadataError('assay_type is not scRNAseq-10xGenomics')
    return promoted
def scan(target_dir, out_fname, schema_fname, yaml_flag=False):
    """
    Collect, filter, validate and write out the metadata for target_dir.

    target_dir:   directory handed to each collection type's test_match()
                  and to the constructor of the first type that matches
    out_fname:    path of the output file, or None to write to sys.stdout
    schema_fname: passed to assert_json_matches_schema() along with the
                  filtered metadata
    yaml_flag:    when true the metadata is written with yaml.dump,
                  otherwise with json.dump

    Raises MetadataError if no known data collection type matches
    target_dir.
    """
    global _KNOWN_DATA_COLLECTION_TYPES
    if _KNOWN_DATA_COLLECTION_TYPES is None:
        # Build the list of candidate classes once and cache it at module level.
        candidates = []
        for nm in dir(data_collection_types):
            elt = getattr(data_collection_types, nm)
            if isinstance(elt, type) and issubclass(elt, DataCollection):
                candidates.append(elt)
        # Highest match_priority first, then category_name descending.  An
        # explicit key means a complete tie never falls through to comparing
        # the class objects themselves, which would raise TypeError.
        candidates.sort(key=lambda cls: (cls.match_priority, cls.category_name),
                        reverse=True)
        _KNOWN_DATA_COLLECTION_TYPES = candidates

    for collection_type in _KNOWN_DATA_COLLECTION_TYPES:
        if collection_type.test_match(target_dir):
            print('collector match: ', collection_type.category_name)
            collector = collection_type(target_dir)
            metadata = collector.filter_metadata(collector.collect_metadata())
            break
    else:
        raise MetadataError(
            '%s does not match any known data collection type' % target_dir)

    assert_json_matches_schema(metadata, schema_fname)

    dump = yaml.dump if yaml_flag else json.dump
    if out_fname is None:
        # Do not wrap sys.stdout in a `with` block: leaving the block would
        # close stdout for the rest of the process.
        dump(metadata, sys.stdout)
    else:
        with open(out_fname, 'w') as f:
            dump(metadata, f)
def close_enough_match(v1, v2, tp):
    """
    Converts both values to the given type, and returns True if they match
    closely enough to satisfy consistency rules for metadata.tsv, False
    otherwise.

    tp selects the comparison rule:
      str      - exact equality of str(v1) and str(v2)
      float    - relative difference |f1 - f2| / (|f1| + |f2|) below 1e-5;
                 two zeros are considered a match
      int      - exact equality of int(v1) and int(v2)
      datetime - v1 and v2 are timestamp strings; they match when they are
                 at most one hour apart

    Raises MetadataError for any other tp.
    """
    if tp == str:
        return str(v1) == str(v2)

    if tp == float:
        f1 = float(v1)
        f2 = float(v2)
        scale = abs(f1) + abs(f2)
        if scale == 0.0:
            # Both values are zero; the relative difference below would
            # divide by zero.
            return True
        return abs(f1 - f2) / scale < 0.00001

    if tp == int:
        return int(v1) == int(v2)

    if tp == datetime:
        # A trailing '[...]' suffix is stripped from both strings.  Only the
        # suffix on v2 is interpreted (as a pytz timezone name); it serves as
        # the default timezone when parsing BOTH values.
        if '[' in v1:
            v1 = v1[:v1.find('[')]
        if '[' in v2:
            tz = pytz.timezone(v2[v2.find('[')+1 : v2.find(']')])
            v2 = v2[:v2.find('[')]
        else:
            tz = pytz.utc
        d1 = translate_timestamp(v1, tz)
        d2 = translate_timestamp(v2, tz)
        # We will allow up to an hour delta
        return abs(d1 - d2).total_seconds() <= 3600.0

    raise MetadataError('close_enough_match does not know how to compare values of type {}'
                        .format(tp.__name__))
def basic_filter_metadata(self, raw_metadata):
    """
    Make sure the basic components of the metadata are present and promote
    them into a single flat dict.

    The result starts with raw_metadata['collectiontype'] and is then updated
    with the one and only entry of raw_metadata['components'].  MetadataError
    is raised when there is not exactly one component.
    """
    promoted = {'collectiontype': raw_metadata['collectiontype']}

    components = raw_metadata['components']
    if len(components) != 1:
        raise MetadataError("Only one line of metadata.tsv info is currently supported")

    promoted.update(components[0])
    return promoted
def collect_metadata(self):
    """
    Parse the tab-separated file at self.path and return its rows.

    The file is read as ASCII with csv.DictReader, so the first line supplies
    the keys.  Returns a list with one dict per data row.

    Raises MetadataError if the file cannot be decoded as ASCII, or if any
    row has a None/empty key or a None value (the usual symptom of a
    missing or extra delimiter).
    """
    print('parsing tsv from %s' % self.path)
    try:
        # Plain 'r': the legacy 'U' flag is rejected by open() on
        # Python 3.11+, and newline='' already hands line-ending handling
        # to the csv module.
        with open(self.path, 'r', newline='', encoding='ascii') as f:
            reader = csv.DictReader(f, delimiter='\t')
            md = [dict(row) for row in reader]
    except UnicodeDecodeError as e:
        raise MetadataError(f'{e} in {self.path}') from e

    # Scan for the common error of bad keys/values due to missing delimiters
    for row in md:
        bad_key = any(k in (None, '') for k in row)
        bad_value = any(v is None for v in row.values())
        if bad_key or bad_value:
            raise MetadataError(
                '{} has empty keys or values. Delimiter error?'.format(
                    self.path))
    return md
def filter_metadata(self, raw_metadata):
    """
    This extracts the metadata which is actually desired downstream from the
    bulk of the metadata which has been collected.

    The result is whatever basic_filter_metadata() produces, plus an
    'other_metadata' entry holding the first element of the README.csv
    entry of raw_metadata.

    Raises MetadataError if raw_metadata has no entry whose key contains
    'README.csv', or if it has several and their first elements differ.
    """
    rslt = self.basic_filter_metadata(raw_metadata)

    # Only the first element of each README.csv entry is used.
    candidate_l = [raw_metadata[k][0] for k in raw_metadata if 'README.csv' in k]
    if not candidate_l:
        raise MetadataError('Cannot find README.csv')

    readme_md = candidate_l[0]
    # Several README.csv files are tolerated only when they all agree.
    if any(elt != readme_md for elt in candidate_l[1:]):
        raise MetadataError("Multiple README.csv files that do not match")

    # Disable internal consistency checks because we trust metadata.tsv
    #self.internal_consistency_checks(rslt, readme_md)
    rslt['other_metadata'] = readme_md
    return rslt
def translate_timestamp(tstr, default_tz):
    """
    Parse the timestamp string tstr into a datetime.

    The following formats are tried in order:
      '%Y-%m-%dT%H:%M:%S.%f%z'  (carries its own UTC offset)
      '%Y-%m-%d %H:%M:%S.%f'
      '%Y-%m-%d %H:%M:%S'
      '%Y-%m-%d %H:%M'
    A value parsed with one of the last three formats has no offset of its
    own and is passed through default_tz.localize() before being returned.

    Raises MetadataError if tstr matches none of the formats.  Only parse
    failures (ValueError) move on to the next format; any other exception,
    including one raised by default_tz.localize(), propagates unchanged.
    """
    try:
        return datetime.strptime(tstr, '%Y-%m-%dT%H:%M:%S.%f%z')
    except ValueError:
        pass  # fall through to the formats without an offset

    naive_formats = (
        '%Y-%m-%d %H:%M:%S.%f',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M',
    )
    for fmt in naive_formats:
        try:
            naive = datetime.strptime(tstr, fmt)
        except ValueError:
            continue
        return default_tz.localize(naive)

    raise MetadataError('Cannot translate time string {}'.format(tstr))