def filter_metadata(self, raw_metadata):
        """
        Reduce the collected metadata to the subset wanted downstream.

        The returned dict is whatever self.basic_filter_metadata(raw_metadata) produces.
        Before returning, the entries of raw_metadata whose key contains 'experiment.json'
        are validated: at least one must exist, all of them must compare equal, and the
        first one must contain channelNames -> channelNamesArray.  MetadataError is raised
        when any of these conditions fails.
        """
        filtered = self.basic_filter_metadata(raw_metadata)

        experiment_copies = []
        for key in raw_metadata:
            if 'experiment.json' in key:
                experiment_copies.append(raw_metadata[key])
        if len(experiment_copies) == 0:
            raise MetadataError('Cannot find experiment.json')

        # Every copy of experiment.json that was collected has to agree with the first one
        experiment_md, *other_copies = experiment_copies
        if any(copy != experiment_md for copy in other_copies):
            raise MetadataError("Multiple experiment.json files that do not match")

        # (metadata.tsv field, experiment.json field, comparison type) triples meant for
        # internal_consistency_checks; an ("acquisition_instrument_model", "version", str)
        # entry was already switched off in the original table.
        field_checks = [
            ("resolution_x_value", "xyResolution", float),
            ("resolution_y_value", "xyResolution", float),
            ("resolution_z_value", "zPitch", float),
            ("number_of_cycles", "cycle_upper_limit", int),
            ("execution_datetime", "dateProcessed", datetime),
        ]

        has_channel_array = ('channelNames' in experiment_md
                             and 'channelNamesArray' in experiment_md['channelNames'])
        if not has_channel_array:
            raise MetadataError('experiment.json is missing expected element channelNamesArray')

        # The internal consistency checks are intentionally not run because metadata.tsv
        # is trusted; field_checks and channelNamesArray would be their inputs.
        return filtered
    def internal_consistency_checks(self, rslt, experiment_md, test_tpl_lst, channel_names):
        """
        Check a variety of relationships that are believed to hold between [Ee]xperiment.json and
        metadata found in the metadata.tsv file.

        rslt is the mapping of filtered metadata values and experiment_md the mapping read
        from experiment.json.  test_tpl_lst holds (rslt key, experiment.json key, type)
        tuples; it is traversed twice, so it must be re-iterable.  Each pair of values is
        compared with close_enough_match using the given type.  When channel_names is not
        None, the number of its entries that are not blank/empty (case-insensitive) and do
        not start with 'DAPI' or 'HOECHST' must match rslt['number_of_antibodies'].

        Raises MetadataError when an expected element is missing or a comparison fails.
        Returns None.
        """
        # Diagnostic dump of the values about to be compared.  Membership is tested up
        # front so that a missing key is merely reported here; the MetadataError for it is
        # raised by the validation loop below rather than a stray KeyError escaping.
        for rslt_nm, expt_nm, _ in test_tpl_lst:
            if rslt_nm not in rslt:
                print('#### rslt: ', rslt_nm, 'KeyError: ', repr(rslt_nm))
            elif expt_nm not in experiment_md:
                print('#### rslt: ', rslt_nm, type(rslt[rslt_nm]), rslt[rslt_nm],
                      'KeyError: ', repr(expt_nm))
            else:
                print('#### rslt: ', rslt_nm, type(rslt[rslt_nm]), rslt[rslt_nm],
                      'exp.json: ', expt_nm, type(experiment_md[expt_nm]), experiment_md[expt_nm])
        print('####')

        for rslt_nm, expt_nm, tp in test_tpl_lst:
            if rslt_nm not in rslt:
                raise MetadataError("metadata is missing expected element {}".format(rslt_nm))
            if expt_nm not in experiment_md:
                raise MetadataError("experiment.json is missing expected element {}".format(expt_nm))
            if not close_enough_match(rslt[rslt_nm], experiment_md[expt_nm], tp):
                raise MetadataError("metadata field {} does not match experiment.json field {}"
                                    .format(rslt_nm, expt_nm))

        if 'number_of_antibodies' not in rslt:
            raise MetadataError("metadata is missing element number_of_antibodies")
        if channel_names is not None:
            # Only channels that are neither placeholders nor DAPI/HOECHST are counted
            arr = [elt for elt in channel_names
                   if elt.lower() not in ('blank', 'empty')
                   and not elt.startswith(('DAPI', 'HOECHST'))]
            if not close_enough_match(rslt['number_of_antibodies'], len(arr), int):
                raise MetadataError("metadata field number_of_antibodies does not match length of "
                                    "experiment.json channelNamesArray")
Ejemplo n.º 3
0
 def internal_consistency_checks(self, rslt, readme_md):
     """
     Verify that the filtered metadata and the README metadata identify the same tissue.

     rslt must contain 'tissue_id', readme_md must contain 'UUID Identifier', and the two
     values must be equal; otherwise MetadataError is raised.  Returns None.
     """
     required = (
         (rslt, 'tissue_id', 'metadata is missing tissue_id'),
         (readme_md, 'UUID Identifier', 'README metadata is missing UUID Identifier'),
     )
     for mapping, key, complaint in required:
         if key not in mapping:
             raise MetadataError(complaint)

     tissue_id = rslt['tissue_id']
     uuid_identifier = readme_md['UUID Identifier']
     if tissue_id != uuid_identifier:
         raise MetadataError('tissue_id does not match UUID Identifier')
Ejemplo n.º 4
0
    def basic_filter_metadata(self, raw_metadata):
        """
        Promote the basic metadata elements into one flat dict and sanity-check them.

        The result holds raw_metadata['collectiontype'] plus every entry of the single
        element of raw_metadata['components'].  MetadataError is raised when 'components'
        does not have exactly one element, or when the promoted 'assay_type' is absent or
        is not 'scRNAseq-10xGenomics'.
        """
        promoted = {'collectiontype': raw_metadata['collectiontype']}

        component_rows = raw_metadata['components']
        if len(component_rows) != 1:
            raise MetadataError("Only one line of metadata.tsv info is currently supported")
        promoted.update(component_rows[0])

        # Hard-coded reality check: this collection type accepts a single assay type
        if promoted.get('assay_type') != 'scRNAseq-10xGenomics':
            raise MetadataError('assay_type is not scRNAseq-10xGenomics')

        return promoted
def scan(target_dir, out_fname, schema_fname, yaml_flag=False):
    """
    Collect, filter, validate and write out the metadata found under target_dir.

    The DataCollection subclasses exposed by the data_collection_types module are
    discovered once and cached in _KNOWN_DATA_COLLECTION_TYPES, ordered by descending
    (match_priority, category_name).  The first class whose test_match(target_dir) is
    true is instantiated and used to collect and filter the metadata, which is then
    passed to assert_json_matches_schema together with schema_fname.

    target_dir: directory handed to the collection types.
    out_fname: path of the output file, or None to write to sys.stdout.
    schema_fname: schema file name handed to assert_json_matches_schema.
    yaml_flag: write YAML when true, JSON otherwise.

    Raises MetadataError when no collection type matches target_dir.
    """
    global _KNOWN_DATA_COLLECTION_TYPES

    if _KNOWN_DATA_COLLECTION_TYPES is None:
        candidates = []
        for nm in dir(data_collection_types):
            elt = getattr(data_collection_types, nm)
            if isinstance(elt, type) and issubclass(elt, DataCollection):
                candidates.append(elt)
        # Sort on an explicit key so that two classes sharing priority and category name
        # never fall through to comparing the class objects themselves (a TypeError).
        candidates.sort(key=lambda cls: (cls.match_priority, cls.category_name),
                        reverse=True)
        _KNOWN_DATA_COLLECTION_TYPES = candidates

    for collection_type in _KNOWN_DATA_COLLECTION_TYPES:
        if collection_type.test_match(target_dir):
            print('collector match: ', collection_type.category_name)
            collector = collection_type(target_dir)
            metadata = collector.filter_metadata(collector.collect_metadata())
            break
    else:
        raise MetadataError(
            '%s does not match any known data collection type' % target_dir)
    assert_json_matches_schema(metadata, schema_fname)

    dump = yaml.dump if yaml_flag else json.dump
    if out_fname is None:
        # sys.stdout must not be used as a context manager: leaving the `with` block
        # would close it and break every later write to standard output.
        dump(metadata, sys.stdout)
    else:
        with open(out_fname, 'w') as f:
            dump(metadata, f)
def close_enough_match(v1, v2, tp):
    """
    Converts both values to the given type, and returns True if they match closely enough to
    satisfy consistency rules for metadata.tsv, false otherwise.

    tp selects the comparison:
      str      - the str() forms must be equal
      float    - the difference relative to the sum of magnitudes must be below 0.00001;
                 two zero values count as a match
      int      - the int() forms must be equal
      datetime - both values are timestamp strings, optionally followed by a bracketed
                 '[...]' suffix; they must lie within one hour of each other

    Raises MetadataError for any other tp.
    """
    if tp == str:
        return str(v1) == str(v2)
    elif tp == float:
        f1 = float(v1)
        f2 = float(v2)
        scale = abs(f1) + abs(f2)
        if scale == 0.0:
            # Both values are zero; the relative difference would divide by zero
            return True
        return abs(f1 - f2)/scale < 0.00001
    elif tp == int:
        return int(v1) == int(v2)
    elif tp == datetime:
        # Any bracketed suffix is stripped from both strings; only the one on v2 is
        # looked up as a timezone name, and it then applies to both timestamps.
        if '[' in v1:
            v1 = v1[:v1.find('[')]
        if '[' in v2:
            timezone = pytz.timezone(v2[v2.find('[')+1 : v2.find(']')])
            v2 = v2[:v2.find('[')]
        else:
            timezone = pytz.utc
        d1 = translate_timestamp(v1, timezone)
        d2 = translate_timestamp(v2, timezone)
        dlt = d1 - d2
        return abs(dlt).total_seconds() <= 3600.0  # We will allow up to an hour delta
    else:
        raise MetadataError('close_enough_match does not know how to compare values of type {}'
                            .format(tp.__name__))
Ejemplo n.º 7
0
 def basic_filter_metadata(self, raw_metadata):
     """
     Promote the basic metadata elements into one flat dict.

     The result holds raw_metadata['collectiontype'] plus every entry of the single
     element of raw_metadata['components'].  MetadataError is raised when 'components'
     does not have exactly one element.
     """
     promoted = {'collectiontype': raw_metadata['collectiontype']}

     component_rows = raw_metadata['components']
     if len(component_rows) != 1:
         raise MetadataError("Only one line of metadata.tsv info is currently supported")

     promoted.update(component_rows[0])
     return promoted
    def collect_metadata(self):
        """
        Parse the tab-separated file at self.path into a list of dicts, one per data row.

        The file is read as ASCII; the header row supplies the keys.  Prints a progress
        line naming the file before parsing.

        Raises MetadataError when the file is not valid ASCII, or when any row has a key
        that is None or '' or a value that is None.
        """
        print('parsing tsv from %s' % self.path)
        try:
            # Plain 'r' with newline='' is what the csv module asks for; the former 'rU'
            # mode is rejected with ValueError by Python 3.11 and later.
            with open(self.path, 'r', newline='', encoding='ascii') as f:
                md = [dict(row) for row in csv.DictReader(f, delimiter='\t')]
        except UnicodeDecodeError as e:
            raise MetadataError(str(e) + f' in {self.path}') from e

        # Scan for the common error of bad keys/values due to missing delimiters
        for row in md:
            has_bad_key = any(k in [None, ''] for k in row)
            has_bad_value = any(v is None for v in row.values())
            if has_bad_key or has_bad_value:
                raise MetadataError(
                    '{} has empty keys or values. Delimiter error?'.format(
                        self.path))

        return md
Ejemplo n.º 9
0
    def filter_metadata(self, raw_metadata):
        """
        This extracts the metadata which is actually desired downstream from the bulk of the
        metadata which has been collected.

        The result starts from self.basic_filter_metadata(raw_metadata).  The first element
        of every raw_metadata entry whose key contains 'README.csv' is taken as a README
        record; at least one must exist and all of them must compare equal.  That record
        is stored in the result under 'other_metadata'.

        Raises MetadataError when no README.csv entry is found or the entries disagree.
        """
        rslt = self.basic_filter_metadata(raw_metadata)

        candidate_l = [raw_metadata[k][0] for k in raw_metadata if 'README.csv' in k]
        if not candidate_l:
            raise MetadataError('Cannot find README.csv')

        readme_md = candidate_l[0]
        for elt in candidate_l[1:]:
            # Compare against the first README record (this used to reference an
            # undefined name and failed with NameError whenever a second file existed)
            if elt != readme_md:
                raise MetadataError("Multiple README.csv files that do not match")

        # The internal consistency checks are intentionally not run because
        # metadata.tsv is trusted.

        rslt['other_metadata'] = readme_md

        return rslt
def translate_timestamp(tstr, default_tz):
    """
    Parse a timestamp string into a timezone-aware datetime.

    A string in the form '%Y-%m-%dT%H:%M:%S.%f%z' carries its own UTC offset and is
    returned as parsed.  Otherwise the formats '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S' and '%Y-%m-%d %H:%M' are tried in that order and the result is
    passed through default_tz.localize().

    Raises MetadataError when tstr matches none of the formats.  Only parse failures
    (ValueError) move on to the next format; any other exception, including one raised
    by default_tz.localize(), propagates to the caller.
    """
    try:
        return datetime.strptime(tstr, '%Y-%m-%dT%H:%M:%S.%f%z')
    except ValueError:
        pass

    naive_formats = ('%Y-%m-%d %H:%M:%S.%f', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M')
    for fmt in naive_formats:
        try:
            naive = datetime.strptime(tstr, fmt)
        except ValueError:
            continue
        return default_tz.localize(naive)

    raise MetadataError('Cannot translate time string {}'.format(tstr))