Ejemplo n.º 1
0
    def __call__(self, input_file: InputFile) -> List[Indexable]:
        """
        Load study variables from a tab-delimited ``.csv`` file.

        The file is read with ``csv.DictReader`` using a tab delimiter. Header
        names are stripped of surrounding whitespace before lookup, and every
        row must provide the columns ``variable_full_accession``,
        ``variable_name``, ``variable_desc``, ``study_full_accession`` and
        ``study_name``.

        :param input_file: Path of the variable file. Inputs whose name does not
                           end in ".csv" are skipped.
        :returns: One DugElement per row, in file order, each with its DbGaP study
                  and variable links attached. An empty list is returned when
                  input_file does not end in ".csv".
        :raises KeyError: If a row lacks one of the required columns.
        :raises IndexError: If the part of the variable accession before the first
                            "." does not contain "phv".
        """

        logger.debug(input_file)

        # Coerce to str so a path-like input_file (e.g. pathlib.Path) is handled
        # as well as a plain string; calling .endswith() on it directly would fail.
        if not str(input_file).endswith(".csv"):
            return []

        # Build one element per row of the variable file
        elements: List[Indexable] = []
        with open(input_file, newline='') as csvfile:
            reader = csv.DictReader(csvfile, delimiter='\t')
            for row in reader:
                # Header cells may carry stray whitespace; normalise the keys
                row = {k.strip(): v for k, v in row.items()}
                elem = DugElement(elem_id=row['variable_full_accession'],
                                  name=row['variable_name'],
                                  desc=row['variable_desc'],
                                  elem_type="DbGaP",
                                  collection_id=row['study_full_accession'],
                                  collection_name=row['study_name'])

                # Create DBGaP links as study/variable actions
                elem.collection_action = utils.get_dbgap_study_link(
                    study_id=elem.collection_id)
                # The variable link takes the text that follows "phv" in the
                # first dot-separated part of the accession
                elem.action = utils.get_dbgap_var_link(
                    study_id=elem.collection_id,
                    variable_id=elem.id.split(".")[0].split("phv")[1])

                # Add element to list of elements
                elements.append(elem)
                logger.debug(elem)

        return elements
Ejemplo n.º 2
0
    def __call__(self, input_file: InputFile) -> List[Indexable]:
        """
        Parse a DbGaP data dictionary XML file into a list of elements.

        The root node supplies the ``study_id`` attribute and, optionally,
        ``participant_set`` (treated as '0' when absent). The study name is
        obtained from the file name via ``parse_study_name_from_filename``.
        One DugElement is built for every ``variable`` node found under the root.

        :param input_file: The XML data dictionary to parse.
        :returns: The parsed elements, in document order. No concepts are created.
        :raises KeyError: If the root has no ``study_id`` attribute or a variable
                          node has no ``id`` attribute.
        :raises IOError: If no study name can be parsed from the file name.
        """
        logger.debug(input_file)
        root = ET.parse(input_file).getroot()
        study_id = root.attrib['study_id']
        participant_set = root.get('participant_set', '0')

        # The study name comes from the file name, not from the XML content
        study_name = self.parse_study_name_from_filename(str(input_file))
        if study_name is None:
            err_msg = f"Unable to parse DbGaP study name from data dictionary: {input_file}!"
            logger.error(err_msg)
            raise IOError(err_msg)

        # Every variable in the file shares the same collection identifier
        collection_id = f"{study_id}.p{participant_set}"

        parsed: List[Indexable] = []
        for var_node in root.iter('variable'):
            var_accession = f"{var_node.attrib['id']}.p{participant_set}"
            var_name = var_node.find('name').text
            # Descriptions are stored lower-cased
            var_desc = var_node.find('description').text.lower()

            elem = DugElement(
                elem_id=var_accession,
                name=var_name,
                desc=var_desc,
                elem_type="DbGaP",
                collection_id=collection_id,
                collection_name=study_name)

            # Attach the DbGaP study and variable links as actions
            elem.collection_action = utils.get_dbgap_study_link(
                study_id=elem.collection_id)
            # The variable link takes the text that follows "phv" in the first
            # dot-separated part of the element id
            phv_part = elem.id.split(".")[0]
            elem.action = utils.get_dbgap_var_link(
                study_id=elem.collection_id,
                variable_id=phv_part.split("phv")[1])

            logger.debug(elem)
            parsed.append(elem)

        return parsed
Ejemplo n.º 3
0
    def __call__(self, input_file: InputFile) -> List[Indexable]:
        """
        Load tagged variables.
        Presumes a harmonized variable list as a tab-delimited CSV file as input.
        A naming convention such that <prefix>_variables_<version>.csv will be the form of the filename.
        An adjacent file called <prefix>_tags_<version>.json must exist.

        :param input_file: A list of study variables linked to tags, studies, and other relevant data.
                           Inputs whose name does not end in ".csv" are skipped.
        :returns: The concepts built from the tags file followed by the elements built from the
                  variable file, as a single list. An empty list is returned when input_file does
                  not end in ".csv".
        :raises ValueError: If the accompanying tags file does not exist.
        :raises IOError: If a variable row refers to a tag_pk that is absent from the tags file.
        :raises KeyError: If a row lacks ``variable_full_accession``, ``study_full_accession``,
                          ``study_name`` or ``tag_pk``, or a tag lacks an expected field.
        """

        logger.debug(input_file)

        # Work on the string form so path-like inputs (e.g. pathlib.Path) behave
        # like plain strings for the suffix check and the filename rewrite below.
        variables_file = str(input_file)
        if not variables_file.endswith(".csv"):
            return []
        tags_input_file = variables_file.replace(".csv", ".json").replace(
            "_variables_", "_tags_")
        if not os.path.exists(tags_input_file):
            raise ValueError(
                f"Accompanying tags file: {tags_input_file} must exist.")

        # Read in human-created tags/concepts from json file before reading in elements
        with open(tags_input_file, "r") as stream:
            tags = json.load(stream)

        # Loop through tags and create concepts for each one
        concepts = {}
        for tag in tags:
            concept_id = f"TOPMED.TAG:{tag['pk']}"
            concept = DugConcept(
                concept_id,
                name=tag['fields']['title'],
                desc=
                f'{tag["fields"]["description"]}. {tag["fields"]["instructions"]}',
                concept_type="TOPMed Phenotype Concept")

            # Only use the description for annotation
            concept.ml_ready_desc = tag["fields"]["description"]
            # Keyed by the string form of pk so it matches the text read from the CSV
            concepts[str(tag['pk'])] = concept
            logger.debug(concept)

        # Now loop through associated variables and associate each with its parent concept/tag
        elements: List[Indexable] = []
        with open(input_file, newline='') as csvfile:
            reader = csv.DictReader(csvfile, delimiter='\t')
            for row in reader:
                # Header cells may carry stray whitespace; normalise the keys
                row = {k.strip(): v for k, v in row.items()}
                accession = row['variable_full_accession']
                # Name and description are optional columns; each falls back to
                # the accession independently when its own column is absent.
                elem = DugElement(elem_id=accession,
                                  name=row.get('variable_name', accession),
                                  desc=row.get('variable_desc', accession),
                                  elem_type="DbGaP",
                                  collection_id=row['study_full_accession'],
                                  collection_name=row['study_name'])

                # Create DBGaP links as study/variable actions
                elem.collection_action = utils.get_dbgap_study_link(
                    study_id=elem.collection_id)
                # The variable link takes the text that follows "phv" in the
                # first dot-separated part of the accession
                elem.action = utils.get_dbgap_var_link(
                    study_id=elem.collection_id,
                    variable_id=elem.id.split(".")[0].split("phv")[1])

                # Add concept parsed from tag file to element
                concept_group = row['tag_pk']
                if concept_group not in concepts:
                    # Raise error if the tag_id parsed from the variable file didn't exist in the tag file
                    err_msg = f"DbGaP variable '{elem.id}' maps to a tag group that doesn't exist in tag file: '{concept_group}'"
                    logger.error(err_msg)
                    raise IOError(err_msg)
                elem.add_concept(concepts[concept_group])

                # Add element to list of elements
                elements.append(elem)
                logger.debug(elem)

        return list(concepts.values()) + elements