def generate_cross_species_template(taxonomy_file_path, output_filepath):
    """
    Generates the cross species template of a taxonomy. For every eligible node, the classes of the cross species
    table (CROSS_SPECIES_PATH) are collected whose aligned alias equals the aligned alias of the node, or whose
    preferred alias equals one of the additional aliases of the node (case insensitive).
    Eligible nodes are the ones that belong to one of the configured subtrees and have a preferred alias or
    additional aliases. Nodes without any match are left out. Nothing is written if no configuration is found
    for the taxonomy.
    Args:
        taxonomy_file_path: path of the taxonomy file (a dendrogram '.json' file; any other file is read as a
            nomenclature table)
        output_filepath: path of the tab separated template file to write
    """
    taxon = extract_taxonomy_name_from_path(taxonomy_file_path)
    taxonomy_config = read_taxonomy_config(taxon)

    if taxonomy_config:
        if str(taxonomy_file_path).endswith(".json"):
            dend = dend_json_2_nodes_n_edges(taxonomy_file_path)
        else:
            dend = nomenclature_2_nodes_n_edges(taxonomy_file_path)
        dend_tree = generate_dendrogram_tree(dend)
        subtrees = get_subtrees(dend_tree, taxonomy_config)
        # Build the union once; set().union() also tolerates an empty subtree list.
        subtree_nodes = set().union(*subtrees)
        cross_species_template = []

        # The same table is indexed twice, once per alias column.
        _, cs_by_preferred_alias = read_csv_to_dict(
            CROSS_SPECIES_PATH,
            id_column_name="cell_set_preferred_alias",
            id_to_lower=True)
        _, cs_by_aligned_alias = read_csv_to_dict(
            CROSS_SPECIES_PATH,
            id_column_name="cell_set_aligned_alias",
            id_to_lower=True)

        for o in dend['nodes']:
            if o['cell_set_accession'] not in subtree_nodes:
                continue
            additional_aliases_value = o.get('cell_set_additional_aliases')
            if not (o.get('cell_set_preferred_alias')
                    or additional_aliases_value):
                continue

            cross_species_classes = set()

            aligned_alias = o.get("cell_set_aligned_alias")
            if aligned_alias:
                aligned_key = str(aligned_alias).lower()
                if aligned_key in cs_by_aligned_alias:
                    cross_species_classes.add(PCL_BASE + get_class_id(
                        cs_by_aligned_alias[aligned_key]
                        ["cell_set_accession"]))

            if additional_aliases_value:
                additional_aliases = str(
                    additional_aliases_value).lower().split(
                        EXPRESSION_SEPARATOR)
                for additional_alias in additional_aliases:
                    if additional_alias in cs_by_preferred_alias:
                        cross_species_classes.add(PCL_BASE + get_class_id(
                            cs_by_preferred_alias[additional_alias]
                            ["cell_set_accession"]))

            if cross_species_classes:
                cross_species_template.append({
                    'defined_class':
                    PCL_BASE + get_class_id(o['cell_set_accession']),
                    # sorted so that the generated file is reproducible
                    'cross_species_classes':
                    EXPRESSION_SEPARATOR.join(sorted(cross_species_classes)),
                })

        class_robot_template = pd.DataFrame.from_records(
            cross_species_template)
        class_robot_template.to_csv(output_filepath, sep="\t", index=False)
Ejemplo n.º 2
0
    def test_base_class_template_generation_with_nomenclature(self):
        """Base class template generated from a nomenclature table only contains the expected classes."""
        generate_base_class_template(PATH_NOMENCLATURE_TABLE,
                                     PATH_OUTPUT_CLASS_TSV)
        output = read_tsv(PATH_OUTPUT_CLASS_TSV)

        # assertIn/assertNotIn report the missing (or unexpected) class id on failure
        # assert only descendants of the root nodes (except root nodes itself) exist
        self.assertNotIn(PCL_BASE + get_class_id("CS201912131_149"),
                         output)  # root

        self.assertIn(PCL_BASE + get_class_id("CS201912131_22"),
                      output)  # child
        self.assertIn(PCL_BASE + get_class_id("CS201912131_70"),
                      output)  # child

        self.assertIn(PCL_BASE + get_class_id("CS201912131_125"),
                      output)  # root & leaf
        self.assertNotIn(PCL_BASE + get_class_id("CS201912131_148"),
                         output)  # parent
def generate_homologous_to_template(taxonomy_file_path, all_base_files,
                                    output_filepath):
    """
    Homologous_to relations require a separate template. If this operation is driven by the nomenclature tables,
    some dangling classes may be generated due to root classes that don't have a class and should not be aligned.
    So, instead of nomenclature tables, base files are used for populating homologous to relations. This ensures
    that all alignments have a corresponding class.
    A row is written for every node that belongs to one of the configured subtrees and has a preferred alias or
    additional aliases; its 'homologous_to' value is empty when no other taxonomy has a matching aligned alias.
    Nothing is written if no configuration is found for the taxonomy.
    Args:
        taxonomy_file_path: path of the taxonomy file
        all_base_files: paths of the all class template base files
        output_filepath: template output file path
    """
    taxon = extract_taxonomy_name_from_path(taxonomy_file_path)
    taxonomy_config = read_taxonomy_config(taxon)

    # Base files whose path contains the taxon name belong to this taxonomy and are skipped.
    other_taxonomy_aliases = index_base_files(
        [t for t in all_base_files if taxon not in t])

    if taxonomy_config:
        if str(taxonomy_file_path).endswith(".json"):
            dend = dend_json_2_nodes_n_edges(taxonomy_file_path)
        else:
            dend = nomenclature_2_nodes_n_edges(taxonomy_file_path)
        dend_tree = generate_dendrogram_tree(dend)
        subtrees = get_subtrees(dend_tree, taxonomy_config)
        # Build the union once; set().union() also tolerates an empty subtree list.
        subtree_nodes = set().union(*subtrees)

        data_template = []

        for o in dend['nodes']:
            if o['cell_set_accession'] not in subtree_nodes:
                continue
            if not (o.get('cell_set_preferred_alias')
                    or o.get('cell_set_additional_aliases')):
                continue

            homologous_to = []
            aligned_alias = o.get("cell_set_aligned_alias")
            if aligned_alias:
                # aliases are looked up in lower case
                aligned_key = str(aligned_alias).lower()
                homologous_to = [
                    other_aliases[aligned_key]["defined_class"]
                    for other_aliases in other_taxonomy_aliases
                    if aligned_key in other_aliases
                ]

            data_template.append({
                'defined_class':
                PCL_BASE + get_class_id(o['cell_set_accession']),
                'homologous_to':
                "|".join(homologous_to),
            })

        robot_template = pd.DataFrame.from_records(data_template)
        robot_template.to_csv(output_filepath, sep="\t", index=False)
def generate_non_taxonomy_classification_template(taxonomy_file_path,
                                                  output_filepath):
    """
    Generates the classification template for the children of the non taxonomy roots. Each child of a configured
    'non_taxonomy_roots' node is classified with the 'Cell_type' of that root, provided that the child is not a
    non taxonomy root itself and has a value in the first column of its nomenclature record.
    Nothing is written if no configuration is found for the taxonomy or its nomenclature table does not exist.
    Args:
        taxonomy_file_path: path of the taxonomy file, only used for extracting the taxonomy name
        output_filepath: path of the tab separated template file to write
    """
    taxon = extract_taxonomy_name_from_path(taxonomy_file_path)

    # Zero based column indexes of the nomenclature table.
    # NOTE(review): column 0 is presumably cell_set_preferred_alias - confirm against the table layout.
    preferred_alias = 0
    cell_set_accession = 3
    child_cell_set_accessions = 14
    nomenclature_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        NOMENCLATURE_TABLE_PATH.format(taxon))

    taxonomy_config = read_taxonomy_config(taxon)

    if taxonomy_config and os.path.exists(nomenclature_path):
        nomenclature_records = read_csv(nomenclature_path,
                                        id_column=cell_set_accession)
        nomenclature_template = []

        # A configuration without non taxonomy roots yields an empty template.
        if 'non_taxonomy_roots' in taxonomy_config:
            configured_roots = taxonomy_config['non_taxonomy_roots']
        else:
            configured_roots = []
        non_taxo_roots = {
            root["Node"]: root["Cell_type"]
            for root in configured_roots
        }

        for record in nomenclature_records:
            columns = nomenclature_records[record]
            root_accession = columns[cell_set_accession]
            if root_accession not in non_taxo_roots:
                continue
            # The dendrogram is not mandatory for human & marmoset, so it is not checked whether the root also
            # exists in the dendrogram.
            for child in columns[child_cell_set_accessions].split("|"):
                # An empty children cell splits into [''] and a child may be missing from the table;
                # such entries are skipped.
                if child in non_taxo_roots or child not in nomenclature_records:
                    continue
                # only children that have a value in the first column get a classification
                if nomenclature_records[child][preferred_alias]:
                    nomenclature_template.append({
                        'defined_class':
                        PCL_BASE + get_class_id(child),
                        'Classification':
                        non_taxo_roots[root_accession],
                    })

        class_robot_template = pd.DataFrame.from_records(nomenclature_template)
        class_robot_template.to_csv(output_filepath, sep="\t", index=False)
def generate_curated_class_template(taxonomy_file_path, output_filepath):
    """
    Generates the seed of the curated class template: a row with the class id and label for every node that belongs
    to one of the configured subtrees and has a preferred alias or additional aliases. All curation columns are
    created empty. Nothing is written if no configuration is found for the taxonomy.
    Args:
        taxonomy_file_path: path of the taxonomy file (a dendrogram '.json' file; any other file is read as a
            nomenclature table)
        output_filepath: path of the tab separated template file to write
    """
    taxon = extract_taxonomy_name_from_path(taxonomy_file_path)
    taxonomy_config = read_taxonomy_config(taxon)

    if taxonomy_config:
        if str(taxonomy_file_path).endswith(".json"):
            dend = dend_json_2_nodes_n_edges(taxonomy_file_path)
        else:
            dend = nomenclature_2_nodes_n_edges(taxonomy_file_path)
        dend_tree = generate_dendrogram_tree(dend)
        subtrees = get_subtrees(dend_tree, taxonomy_config)
        # Build the union once; set().union() also tolerates an empty subtree list.
        subtree_nodes = set().union(*subtrees)

        class_curation_seed = [
            'defined_class', 'Curated_synonyms', 'Classification',
            'Classification_comment', 'Classification_pub', 'Expresses',
            'Expresses_comment', 'Expresses_pub', 'Projection_type', 'Layers',
            'Cross_species_text', 'Comment'
        ]
        class_template = []

        for o in dend['nodes']:
            if o['cell_set_accession'] not in subtree_nodes:
                continue
            preferred_alias = o.get('cell_set_preferred_alias')
            additional_aliases = o.get('cell_set_additional_aliases')
            if not (preferred_alias or additional_aliases):
                continue

            d = dict()
            d['defined_class'] = PCL_BASE + get_class_id(
                o['cell_set_accession'])
            if preferred_alias:
                d['prefLabel'] = preferred_alias
            else:
                # fall back to the first additional alias
                d['prefLabel'] = str(additional_aliases).split(
                    EXPRESSION_SEPARATOR)[0]

            # curation columns that were not populated are created empty
            for k in class_curation_seed:
                d.setdefault(k, '')
            class_template.append(d)

        class_robot_template = pd.DataFrame.from_records(class_template)
        class_robot_template.to_csv(output_filepath, sep="\t", index=False)
    def test_human_mtg_ids(self):
        """Class and individual ids generated for accessions of the CS1908210 taxonomy."""
        expected_class_ids = (
            ("CS1908210001", "0023001"),
            ("CS1908210148", "0023148"),
        )
        for accession, class_id in expected_class_ids:
            self.assertEqual(get_class_id(accession), class_id)

        expected_individual_ids = (
            ("CS1908210001", "0023501"),
            ("CS1908210148", "0023648"),
        )
        for accession, individual_id in expected_individual_ids:
            self.assertEqual(get_individual_id(accession), individual_id)
    def test_marmoset_ids(self):
        """Class and individual ids generated for accessions of the CS201912132 taxonomy."""
        expected_class_ids = (
            ("CS201912132_1", "0019001"),
            ("CS201912132_121", "0019121"),
        )
        for accession, class_id in expected_class_ids:
            self.assertEqual(get_class_id(accession), class_id)

        expected_individual_ids = (
            ("CS201912132_1", "0019501"),
            ("CS201912132_121", "0019621"),
        )
        for accession, individual_id in expected_individual_ids:
            self.assertEqual(get_individual_id(accession), individual_id)
    def test_human_ids(self):
        """Class and individual ids generated for accessions of the CS201912131 taxonomy."""
        expected_class_ids = (
            ("CS201912131_1", "0015001"),
            ("CS201912131_121", "0015121"),
        )
        for accession, class_id in expected_class_ids:
            self.assertEqual(get_class_id(accession), class_id)

        expected_individual_ids = (
            ("CS201912131_1", "0015501"),
            ("CS201912131_121", "0015621"),
        )
        for accession, individual_id in expected_individual_ids:
            self.assertEqual(get_individual_id(accession), individual_id)
    def test_mouse_ids(self):
        """Class and individual ids generated for accessions of the CS202002013 taxonomy."""
        expected_class_ids = (
            ("CS202002013_1", "0011001"),
            ("CS202002013_121", "0011121"),
        )
        for accession, class_id in expected_class_ids:
            self.assertEqual(get_class_id(accession), class_id)

        expected_individual_ids = (
            ("CS202002013_1", "0011501"),
            ("CS202002013_121", "0011621"),
        )
        for accession, individual_id in expected_individual_ids:
            self.assertEqual(get_individual_id(accession), individual_id)
Ejemplo n.º 10
0
    def test_base_class_template_generation(self):
        """Base class template of the mouse nomenclature only contains the expected classes."""
        generate_base_class_template(PATH_MOUSE_NOMENCLATURE,
                                     PATH_OUTPUT_CLASS_TSV)
        output = read_tsv(PATH_OUTPUT_CLASS_TSV)

        # (accession, should exist in the template, role of the node);
        # only descendants of the root nodes (except root nodes itself) should exist
        expectations = [
            ("CS202002013_150", True, "child"),
            ("CS202002013_117", False, "root"),
            ("CS202002013_123", False, "root"),
            ("CS202002013_150", True, "child"),
            ("CS202002013_124", True, "child"),
            ("CS202002013_158", False,
             "grand child, empty cell_set_preferred_alias"),
            ("CS202002013_3", True, "grand child"),
            ("CS202002013_122", False, "parent"),
            ("CS202002013_120", False, "grand parent"),
            ("CS202002013_103", True, "root & leaf"),
            ("CS202002013_220", False, "parent"),
            ("CS202002013_179", False, "root"),
            ("CS202002013_180", False,
             "child, empty cell_set_preferred_alias"),
            ("CS202002013_207", True, "child"),
            ("CS202002013_208", False,
             "grand child, empty cell_set_preferred_alias"),
            ("CS202002013_83", True, "grand child"),
            ("CS202002013_219", False, "parent"),
            ("CS202002013_220", False, "grand parent"),
        ]

        for accession, should_exist, role in expectations:
            defined_class = PCL_BASE + get_class_id(accession)
            # assertIn/assertNotIn report the offending class id, the message adds the node
            message = "{} ({})".format(accession, role)
            if should_exist:
                self.assertIn(defined_class, output, message)
            else:
                self.assertNotIn(defined_class, output, message)
Ejemplo n.º 11
0
    def test_homologous_to_template_generation(self):
        """Homologous_to template links the classes to the aligned classes of the other taxonomies."""
        generate_homologous_to_template(PATH_NOMENCLATURE_TABLE,
                                        ALL_BASE_FILES, PATH_OUTPUT_CLASS_TSV)
        output = read_tsv(PATH_OUTPUT_CLASS_TSV)

        # (accession, accessions of the classes it is expected to be homologous to)
        expected_alignments = [
            ("CS201912131_164", ["CS201912132_039", "CS202002013_244"]),
            ("CS201912131_176", ["CS201912132_060", "CS202002013_067"]),
            ("CS201912131_157", ["CS201912132_002"]),
        ]

        for accession, homolog_accessions in expected_alignments:
            defined_class = PCL_BASE + get_class_id(accession)
            self.assertIn(defined_class, output)
            # the homologous_to value is read from index 1 of the row
            homologous_to = output[defined_class][1].split("|")
            self.assertEqual(len(homolog_accessions), len(homologous_to))
            for homolog_accession in homolog_accessions:
                self.assertIn(PCL_BASE + get_class_id(homolog_accession),
                              homologous_to)

        astrocyte_class = PCL_BASE + get_class_id(
            "CS201912131_142")  # human Astrocyte
        self.assertIn(astrocyte_class, output)
        homologous_to = output[astrocyte_class][1]
        self.assertFalse(homologous_to)  # mouse and marmoset astro not exists
def generate_ind_template(taxonomy_file_path, output_filepath):
    """
    Generates the individuals template of a taxonomy: one row per node of the taxonomy, preceded by the row that
    holds the ROBOT template strings of the columns.
    Args:
        taxonomy_file_path: path of the taxonomy file (a dendrogram '.json' file; any other file is read as a
            nomenclature table whose name is expected to start with 'nomenclature_table_')
        output_filepath: path of the tab separated template file to write
    """
    # The taxonomy name is the file name up to its first dot.
    taxon = os.path.basename(str(taxonomy_file_path)).split(".")[0]

    if str(taxonomy_file_path).endswith(".json"):
        dend = dend_json_2_nodes_n_edges(taxonomy_file_path)
    else:
        dend = nomenclature_2_nodes_n_edges(taxonomy_file_path)
        taxon = taxon.replace("nomenclature_table_", "")

    dend_tree = generate_dendrogram_tree(dend)
    taxonomy_config = read_taxonomy_config(taxon)
    allen_descriptions = read_allen_descriptions(
        ALLEN_DESCRIPTIONS_PATH, taxonomy_config['Species_abbv'][0])

    subtrees = get_subtrees(dend_tree, taxonomy_config)
    # Build the union once instead of once per node.
    subtree_nodes = set().union(*subtrees)

    robot_template_seed = {
        'ID': 'ID',
        'Label': 'LABEL',
        'PrefLabel': 'A skos:prefLabel',
        'Entity Type': 'TI %',
        'TYPE': 'TYPE',
        'Property Assertions': "I 'subcluster of' SPLIT=|",
        'Synonyms': 'A oboInOwl:hasExactSynonym SPLIT=|',
        'Cluster_ID': "A 'cluster id'",
        'Function': 'TI capable_of some %',
        'cell_set_preferred_alias': "A n2o:cell_set_preferred_alias",
        'original_label': "A n2o:original_label",
        'cell_set_label': "A n2o:cell_set_label",
        'cell_set_aligned_alias': "A n2o:cell_set_aligned_alias",
        'cell_set_additional_aliases':
        "A n2o:cell_set_additional_aliases SPLIT=|",
        'cell_set_alias_assignee': "A n2o:cell_set_alias_assignee SPLIT=|",
        'cell_set_alias_citation': "A n2o:cell_set_alias_citation SPLIT=|",
        'Metadata': "A n2o:node_metadata",
        'Exemplar_of': "TI 'exemplar data of' some %",
        'Comment': "A rdfs:comment",
        'Aliases': "A oboInOwl:hasRelatedSynonym SPLIT=|",
        'Rank': "A 'cell_type_rank' SPLIT=|"
    }
    dl = [robot_template_seed]

    synonym_properties = [
        'cell_set_aligned_alias', 'cell_set_additional_aliases'
    ]
    meta_properties = [
        'cell_set_preferred_alias', 'original_label', 'cell_set_label',
        'cell_set_aligned_alias', 'cell_set_additional_aliases',
        'cell_set_alias_assignee', 'cell_set_alias_citation'
    ]

    # Index the edges by their first element, so that the edge list is not scanned for every node.
    edge_targets = {}
    for edge in dend['edges']:
        edge_targets.setdefault(edge[0], []).append(edge[1])

    for o in dend['nodes']:
        accession = o['cell_set_accession']
        d = dict()
        d['ID'] = 'PCL:' + get_individual_id(accession)
        d['TYPE'] = 'owl:NamedIndividual'
        d['Label'] = o['cell_set_label'] + ' - ' + accession
        # nodes without a preferred alias are labelled with their accession
        d['PrefLabel'] = o.get('cell_set_preferred_alias') or accession
        d['Entity Type'] = 'PCL:0010001'  # Cluster
        d['Metadata'] = json.dumps(o)
        d['Synonyms'] = '|'.join(
            [o[prop] for prop in synonym_properties if o.get(prop)])
        d['Property Assertions'] = '|'.join(
            sorted('PCL:' + get_individual_id(target)
                   for target in edge_targets.get(accession, [])))
        for prop in meta_properties:
            if prop in o:
                d[prop] = '|'.join([
                    prop_val.strip() for prop_val in str(o[prop]).split("|")
                    if prop_val
                ])
            else:
                d[prop] = ''
        d['Cluster_ID'] = accession
        if accession in subtree_nodes and o.get('cell_set_preferred_alias'):
            d['Exemplar_of'] = PCL_BASE + get_class_id(accession)

        if "cell_type_card" in o:
            # NOTE(review): this is a substring replacement, a value that already reads "None" would become
            # "Nonene" - confirm the possible values of cell_type_card.
            d['Rank'] = '|'.join([
                cell_type.strip().replace("No", "None")
                for cell_type in str(o["cell_type_card"]).split(",")
            ])

        if accession in allen_descriptions:
            allen_data = allen_descriptions[accession]
            d['Comment'] = allen_data["summary"][0]
            if allen_data["aliases"][0]:
                d['Aliases'] = '|'.join([
                    alias.strip()
                    for alias in str(allen_data["aliases"][0]).split("|")
                ])

        dl.append(d)
    robot_template = pd.DataFrame.from_records(dl)
    robot_template.to_csv(output_filepath, sep="\t", index=False)
def generate_base_class_template(taxonomy_file_path, output_filepath):
    """
    Generates the base class template of a taxonomy: one row for every node that belongs to one of the configured
    subtrees and has a preferred alias or additional aliases. Columns of the class seed that could not be populated
    for a node are created empty. Nothing is written if no configuration is found for the taxonomy.
    Args:
        taxonomy_file_path: path of the taxonomy file (a dendrogram '.json' file; any other file is read as a
            nomenclature table)
        output_filepath: path of the tab separated template file to write
    """
    taxon = extract_taxonomy_name_from_path(taxonomy_file_path)
    taxonomy_config = read_taxonomy_config(taxon)

    if taxonomy_config:
        if str(taxonomy_file_path).endswith(".json"):
            dend = dend_json_2_nodes_n_edges(taxonomy_file_path)
        else:
            dend = nomenclature_2_nodes_n_edges(taxonomy_file_path)
        dend_tree = generate_dendrogram_tree(dend)
        subtrees = get_subtrees(dend_tree, taxonomy_config)
        # Build the union once; set().union() also tolerates an empty subtree list.
        subtree_nodes = set().union(*subtrees)

        # Markers are only available when the configuration names a reference gene list.
        if "Reference_gene_list" in taxonomy_config:
            gene_db_path = ENSEMBLE_PATH.format(
                str(taxonomy_config["Reference_gene_list"][0]).strip().lower())
            gene_names = read_gene_data(gene_db_path)
            # marker file names use the taxonomy name without the 'CCN' / 'CS' parts
            marker_taxon = taxon.replace("CCN", "").replace("CS", "")
            minimal_markers = read_markers(MARKER_PATH.format(marker_taxon),
                                           gene_names)
            allen_markers = read_markers(
                ALLEN_MARKER_PATH.format(marker_taxon), gene_names)
        else:
            minimal_markers = {}
            allen_markers = {}

        class_seed = [
            'defined_class', 'prefLabel', 'Alias_citations',
            'Synonyms_from_taxonomy', 'Gross_cell_type', 'Taxon',
            'Brain_region', 'Minimal_markers', 'Allen_markers', 'Individual',
            'Brain_region_abbv', 'Species_abbv', 'Cluster_ID', 'part_of',
            'has_soma_location', 'aligned_alias', 'marker_gene_set'
        ]
        class_template = []

        for o in dend['nodes']:
            accession = o['cell_set_accession']
            if accession not in subtree_nodes:
                continue
            preferred_alias = o.get('cell_set_preferred_alias')
            additional_aliases = o.get('cell_set_additional_aliases')
            if not (preferred_alias or additional_aliases):
                continue

            # The order of the assignments below is kept on purpose: the column order of the generated file
            # follows the insertion order of the keys.
            d = dict()
            d['defined_class'] = PCL_BASE + get_class_id(accession)
            if preferred_alias:
                d['prefLabel'] = preferred_alias
            else:
                # fall back to the first additional alias
                d['prefLabel'] = str(additional_aliases).split(
                    EXPRESSION_SEPARATOR)[0]
            d['Synonyms_from_taxonomy'] = get_synonyms_from_taxonomy(o)
            d['Gross_cell_type'] = get_gross_cell_type(
                accession, subtrees, taxonomy_config)
            d['Taxon'] = taxonomy_config['Species'][0]
            d['Brain_region'] = taxonomy_config['Brain_region'][0]
            d['Cluster_ID'] = accession
            if o.get('cell_set_alias_citation'):
                alias_citations = [
                    citation.strip() for citation in str(
                        o["cell_set_alias_citation"]).split("|")
                    if citation and citation.strip()
                ]
                d["Alias_citations"] = "|".join(alias_citations)
            if accession in minimal_markers:
                d['Minimal_markers'] = minimal_markers[accession]
            if accession in allen_markers:
                d['Allen_markers'] = allen_markers[accession]
            else:
                d['Allen_markers'] = ''
            if 'Brain_region_abbv' in taxonomy_config:
                d['Brain_region_abbv'] = taxonomy_config[
                    'Brain_region_abbv'][0]
            if 'Species_abbv' in taxonomy_config:
                d['Species_abbv'] = taxonomy_config['Species_abbv'][0]
            d['Individual'] = PCL_BASE + get_individual_id(accession)

            # The location relation comes from the root node configuration of the subtree that holds the node.
            for index, subtree in enumerate(subtrees):
                if accession in subtree:
                    location_rel = taxonomy_config['Root_nodes'][index][
                        'Location_relation']
                    if location_rel == "part_of":
                        d['part_of'] = taxonomy_config['Brain_region'][0]
                        d['has_soma_location'] = ''
                    elif location_rel == "has_soma_location":
                        d['part_of'] = ''
                        d['has_soma_location'] = taxonomy_config[
                            'Brain_region'][0]

            if o.get("cell_set_aligned_alias"):
                d['aligned_alias'] = o["cell_set_aligned_alias"]
            if accession in minimal_markers:
                d['marker_gene_set'] = PCL_PREFIX + get_marker_gene_set_id(
                    accession)

            # seed columns that were not populated are created empty
            for k in class_seed:
                d.setdefault(k, '')
            class_template.append(d)

        class_robot_template = pd.DataFrame.from_records(class_template)
        class_robot_template.to_csv(output_filepath, sep="\t", index=False)