Code Example #1
class GeoXrefETL(ETL):
    """GEO XREF ETL"""

    logger = logging.getLogger(__name__)

    geo_xref_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
        
        MATCH (o:Gene) where o.primaryKey = row.genePrimaryKey
        """ + ETLHelper.get_cypher_xref_text()

    gene_crossref_query_template = """
                   MATCH (g:Gene)-[crr:CROSS_REFERENCE]-(cr:CrossReference)
                   WHERE cr.globalCrossRefId IN {parameter}
                   RETURN g.primaryKey, g.modLocalId, cr.name, cr.globalCrossRefId"""

    def __init__(self, config):
        super().__init__()
        self.data_type_config = config

    def _load_and_process_data(self):

        for sub_type in self.data_type_config.get_sub_type_objects():

            species_encoded = urllib.parse.quote_plus(
                ETLHelper.species_lookup_by_data_provider(
                    sub_type.get_data_provider()))

            commit_size = self.data_type_config.get_neo4j_commit_size()
            # The configured batch size is bypassed here in favor of a
            # fixed value:
            # batch_size = self.data_type_config.get_generator_batch_size()
            batch_size = 100000

            generators = self.get_generators(sub_type, batch_size,
                                             species_encoded)

            query_template_list = [
                [
                    self.geo_xref_query_template, commit_size,
                    "geo_xref_data_" + sub_type.get_data_provider() + ".csv"
                ],
            ]

            query_and_file_list = self.process_query_params(
                query_template_list)
            CSVTransactor.save_file_static(generators, query_and_file_list)
            Neo4jTransactor.execute_query_batch(query_and_file_list)

    def get_generators(self, sub_type, batch_size, species_encoded):
        """Get Generators"""

        entrez_ids = []

        geo_data_file_contents = Path(sub_type.get_filepath()).read_text()
        # xmltodict returns OrderedDicts; round-tripping through JSON
        # normalizes them to plain dicts.
        geo_data = json.loads(
            json.dumps(xmltodict.parse(geo_data_file_contents)))
        for efetch_value in geo_data.values():
            # IdList is a value returned from efetch XML spec,
            # within IdList, there is another map with "Id"
            # as the key and the entrez local ids a list value.
            for sub_map_key, sub_map_value in efetch_value.items():
                if sub_map_key == 'IdList':
                    for id_list in sub_map_value.values():
                        for entrez_id in id_list:
                            self.logger.debug("here is the entrez id: %s",
                                              entrez_id)
                            entrez_ids.append("NCBI_Gene:" + entrez_id)

        geo_data_list = []
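        # Look up genes already in Neo4j that carry these NCBI_Gene
        # cross-references.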
        return_set = Neo4jHelper.run_single_parameter_query(
            self.gene_crossref_query_template, entrez_ids)

        for record in return_set:
            gene_primary_key = record["g.primaryKey"]
            mod_local_id = record["g.modLocalId"]
            global_cross_ref_id = record["cr.globalCrossRefId"]
            geo_xref = ETLHelper.get_xref_dict(global_cross_ref_id.split(":")[1],
                                               "NCBI_Gene",
                                               "gene/other_expression",
                                               "gene/other_expression",
                                               "GEO",
                                               "https://www.ncbi.nlm.nih.gov/sites/entrez?" \
                                                       + "Db=geoprofiles"\
                                                       + "&DbFrom=gene"\
                                                       + "&Cmd=Link"\
                                                       + "&LinkName=gene_geoprofiles"\
                                                       + "&LinkReadableName=GEO%20Profiles"\
                                                       + "&IdsFromResult="\
                                                       + global_cross_ref_id.split(":")[1],
                                               global_cross_ref_id+"gene/other_expression")

            geo_xref["genePrimaryKey"] = gene_primary_key
            geo_xref["modLocalId"] = mod_local_id

            geo_data_list.append(geo_xref)

        yield [geo_data_list]
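
A note on the contract these ETLs share: each list yielded by get_generators pairs positionally with an entry in the query template list, so the first yielded list becomes the CSV consumed by the first query, the second feeds the second, and so on. Below is a minimal sketch of that pairing; MiniETL, Thing, query_a, and query_b are hypothetical names for illustration, not part of the loader.

class MiniETL(ETL):
    """Hedged sketch of the generator/template pairing, not a real loader."""

    query_a = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
            MERGE (n:Thing {primaryKey:row.primaryKey})"""

    query_b = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
            MATCH (n:Thing {primaryKey:row.primaryKey})
            MERGE (s:Synonym {primaryKey:row.synonym})
            MERGE (n)-[:ALSO_KNOWN_AS]->(s)"""

    def get_generators(self):
        rows_a = [{"primaryKey": "X:1"}]
        rows_b = [{"primaryKey": "X:1", "synonym": "x-1"}]
        # Position 0 feeds query_a's CSV; position 1 feeds query_b's CSV.
        yield [rows_a, rows_b]
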
Code Example #2
class ConstructETL(ETL):
    """Construct ETL"""

    logger = logging.getLogger(__name__)
    xref_url_map = ResourceDescriptorHelper().get_data()

    # Query templates which take params and will be processed later

    construct_query_template = """
          USING PERIODIC COMMIT %s
          LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

              //Create the Construct node and set properties. primaryKey is required.
              MERGE (o:Construct {primaryKey:row.primaryId})
                  ON CREATE SET o.name = row.name,
                   o.dateProduced = row.dateProduced,
                   o.release = row.release,
                   o.localId = row.localId,
                   o.globalId = row.globalId,
                   o.uuid = row.uuid,
                   o.nameText = row.nameText,
                   o.modCrossRefCompleteUrl = row.modGlobalCrossRefId,
                   o.dataProviders = row.dataProviders,
                   o.dataProvider = row.dataProvider,
                   o.symbol = row.symbol

            """

    construct_secondary_ids_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (f:Construct {primaryKey:row.data_id})

            MERGE (second:SecondaryId {primaryKey:row.secondary_id})
                SET second.name = row.secondary_id
            MERGE (f)-[aka1:ALSO_KNOWN_AS]->(second) """

    construct_synonyms_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (a:Construct {primaryKey:row.data_id})

            MERGE(syn:Synonym {primaryKey:row.synonym})
                SET syn.name = row.synonym
            MERGE (a)-[aka2:ALSO_KNOWN_AS]->(syn) """

    construct_xrefs_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (o:Construct {primaryKey:row.dataId}) """ + ETLHelper.get_cypher_xref_text()

    construct_gene_component_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (o:Construct {primaryKey:row.constructID}), (g:Gene {primaryKey:row.componentID})
            CALL apoc.create.relationship(g, row.componentRelation, {}, o) yield rel
            REMOVE rel.noOp"""

    construct_no_gene_component_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (o:Construct {primaryKey:row.constructID}), (g:NonBGIConstructComponent {primaryKey:row.componentSymbol})
            CALL apoc.create.relationship(g, row.componentRelation, {}, o) yield rel
            REMOVE rel.noOp"""

    non_bgi_component_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
        
            MERGE (o:NonBGIConstructComponent {primaryKey:row.componentSymbol})"""

    def __init__(self, config):
        super().__init__()
        self.data_type_config = config

    def _load_and_process_data(self):
        thread_pool = []

        for sub_type in self.data_type_config.get_sub_type_objects():
            process = multiprocessing.Process(target=self._process_sub_type,
                                              args=(sub_type, ))
            process.start()
            thread_pool.append(process)

        ETL.wait_for_threads(thread_pool)

    def _process_sub_type(self, sub_type):

        self.logger.info("Loading Construct Data: %s",
                         sub_type.get_data_provider())
        filepath = sub_type.get_filepath()
        data = JSONFile().get_data(filepath)
        self.logger.info("Finished Loading Construct Data: %s",
                         sub_type.get_data_provider())

        if data is None:
            self.logger.warning("No Data found for %s skipping",
                                sub_type.get_data_provider())
            return

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in this format (template, param1, params2) others will be ignored
        query_template_list = [
            [
                ConstructETL.construct_query_template, commit_size,
                "Construct_data_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                ConstructETL.construct_secondary_ids_query_template,
                commit_size, "Construct_secondary_ids_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                ConstructETL.construct_synonyms_query_template, commit_size,
                "Construct_synonyms_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                ConstructETL.construct_xrefs_query_template, commit_size,
                "Construct_xrefs_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                ConstructETL.non_bgi_component_query_template, commit_size,
                "Construct_non_bgi_component_" + sub_type.get_data_provider() +
                ".csv"
            ],
            [
                ConstructETL.construct_gene_component_query_template,
                commit_size, "Construct_components_gene" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                ConstructETL.construct_no_gene_component_query_template,
                commit_size, "Construct_components_no_gene" +
                sub_type.get_data_provider() + ".csv"
            ]
        ]

        # Obtain the generator
        generators = self.get_generators(data, sub_type.get_data_provider(),
                                         batch_size)

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)

    def get_generators(self, construct_data, data_provider, batch_size):
        """Create Generators"""

        data_providers = []
        release = ""
        constructs = []
        construct_synonyms = []
        construct_secondary_ids = []
        cross_reference_list = []
        component_details = []
        component_no_gene_details = []
        non_bgi_components = []

        counter = 0
        date_produced = construct_data['metaData']['dateProduced']

        data_provider_object = construct_data['metaData']['dataProvider']

        data_provider_cross_ref = data_provider_object.get('crossReference')
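        # Note: this overwrites the data_provider argument with the value
        # from the file's metaData section.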
        data_provider = data_provider_cross_ref.get('id')
        self.logger.info("DataProvider: " + data_provider)
        data_provider_pages = data_provider_cross_ref.get('pages')
        data_provider_cross_ref_set = []

        load_key = date_produced + data_provider + "_construct"

        # TODO: get SGD to fix their files.

        if data_provider_pages is not None:
            for data_provider_page in data_provider_pages:
                cross_ref_complete_url = ETLHelper.get_page_complete_url(
                    data_provider, self.xref_url_map, data_provider,
                    data_provider_page)

                data_provider_cross_ref_set.append(
                    ETLHelper.get_xref_dict(
                        data_provider, data_provider, data_provider_page,
                        data_provider_page, data_provider,
                        cross_ref_complete_url,
                        data_provider + data_provider_page))

                data_providers.append(data_provider)
                self.logger.info("data provider: %s", data_provider)

        if 'release' in construct_data['metaData']:
            release = construct_data['metaData']['release']

        for construct_record in construct_data['data']:

            counter = counter + 1
            global_id = construct_record['primaryId']
            local_id = global_id.split(":")[1]
            mod_global_cross_ref_id = ""

            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(
                    global_id)
                if is_it_test_entry is False:
                    counter = counter - 1
                    continue

            name_text = TextProcessingHelper.cleanhtml(
                construct_record.get('name'))

            construct_dataset = {
                "symbol": construct_record.get('name'),
                "primaryId": construct_record.get('primaryId'),
                "globalId": global_id,
                "localId": local_id,
                "dataProviders": data_providers,
                "dateProduced": date_produced,
                "loadKey": load_key,
                "release": release,
                "modGlobalCrossRefId": mod_global_cross_ref_id,
                "uuid": str(uuid.uuid4()),
                "dataProvider": data_provider,
                "nameText": name_text,
                "name": construct_record.get('name')
            }
            constructs.append(construct_dataset)

            if 'crossReferences' in construct_record:

                for cross_ref in construct_record.get('crossReferences'):
                    cross_ref_id = cross_ref.get('id')
                    local_crossref_id = cross_ref_id.split(":")[1]
                    prefix = cross_ref.get('id').split(":")[0]
                    pages = cross_ref.get('pages')

                    # some pages collection have 0 elements
                    if pages is not None and len(pages) > 0:
                        for page in pages:
                            if page == 'construct':
                                mod_global_cross_ref_id = ETLHelper.get_page_complete_url(
                                    local_crossref_id, self.xref_url_map,
                                    prefix, page)
                                xref = ETLHelper.get_xref_dict(
                                    local_crossref_id, prefix, page, page,
                                    cross_ref_id, mod_global_cross_ref_id,
                                    cross_ref_id + page)
                                xref['dataId'] = global_id
                                cross_reference_list.append(xref)

            if 'constructComponents' in construct_record:
                for component in construct_record.get('constructComponents'):
                    component_relation = component.get(
                        'componentRelation').upper()
                    component_symbol = component.get('componentSymbol')
                    component_id = component.get('componentID')

                    if component_id is not None:
                        component_detail = {
                            "componentRelation": component_relation.upper(),
                            "componentSymbol": component_symbol,
                            "componentID": component_id,
                            "constructID": construct_record.get('primaryId')
                        }
                        component_details.append(component_detail)
                    else:
                        component_detail = {
                            "componentRelation": component_relation.upper(),
                            "componentSymbol": component_symbol,
                            "constructID": construct_record.get('primaryId')
                        }
                        non_bgi_component = {
                            "componentSymbol": component_symbol
                        }
                        non_bgi_components.append(non_bgi_component)
                        component_no_gene_details.append(component_detail)

            if 'synonyms' in construct_record:
                for syn in construct_record.get('synonyms'):
                    construct_synonym = {
                        "data_id": construct_record.get('primaryId'),
                        "synonym": syn.strip()
                    }
                    construct_synonyms.append(construct_synonym)

            if 'secondaryIds' in construct_record:
                for secondary_id in construct_record.get('secondaryIds'):
                    construct_secondary_id = {
                        "data_id": construct_record.get('primaryId'),
                        "secondary_id": secondary_id
                    }
                    construct_secondary_ids.append(construct_secondary_id)

            if counter == batch_size:
                yield [
                    constructs, construct_secondary_ids, construct_synonyms,
                    cross_reference_list, non_bgi_components,
                    component_details, component_no_gene_details
                ]
                constructs = []
                construct_secondary_ids = []
                construct_synonyms = []
                cross_reference_list = []
                non_bgi_components = []
                component_details = []
                component_no_gene_details = []
                counter = 0

        if counter > 0:
            yield [
                constructs, construct_secondary_ids, construct_synonyms,
                cross_reference_list, non_bgi_components, component_details,
                component_no_gene_details
            ]
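
One detail worth noting in the component queries above: Cypher cannot take a relationship type from a parameter in CREATE or MERGE, so the loader builds the relationship dynamically with apoc.create.relationship; the REMOVE rel.noOp clause appears to exist only to consume the yielded rel variable with a harmless write. A standalone sketch of the pattern follows; the labels, keys, and the 'EXPRESSES' type are illustrative values only.

dynamic_relationship_example = """
    MATCH (g:Gene {primaryKey:'GENE:1'}), (o:Construct {primaryKey:'CONSTRUCT:1'})
    CALL apoc.create.relationship(g, 'EXPRESSES', {}, o) YIELD rel
    REMOVE rel.noOp"""
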
Code Example #3
class HTPMetaDatasetSampleETL(ETL):
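    """HTP Metadata Dataset Sample ETL"""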

    htp_dataset_sample_query_template = """
    
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

        
        MATCH (o:OBITerm {primaryKey:row.sampleType})
        MATCH (s:Species {primaryKey: row.taxonId})
        MATCH (a:MMOTerm {primaryKey: row.assayType})
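        // Note: these three MATCHes still filter rows (a row with no matching
        // OBITerm, Species, or MMOTerm creates no sample node), even though
        // the ASSAY_TYPE and SAMPLE_TYPE merges below are commented out.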
    
        MERGE (ds:HTPDatasetSample {primaryKey:row.datasetSampleId})
          ON CREATE SET ds.dateAssigned = row.dateAssigned,
              ds.abundance = row.abundance,
              ds.sex = row.sex,
              ds.notes = row.notes,
              ds.dateAssigned = row.dateAssigned,
              //ds.biosampleText = row.biosampleText,
              ds.sequencingFormat = row.sequencingFormat,
              ds.title = row.sampleTitle,
              ds.sampleAge = row.sampleAge
              
        MERGE (ds)-[dssp:FROM_SPECIES]-(s)
        //MERGE (ds)-[dsat:ASSAY_TYPE]-(a)
        //MERGE (ds)-[dsst:SAMPLE_TYPE]-(o)
        
          
    """

    htp_dataset_sample_agm_query_template = """
        USING PERIODIC COMMIT %s
           LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
           
        MATCH (ds:HTPDatasetSample {primaryKey:row.datasetSampleId})
        MATCH (agm:AffectedGenomicModel {primaryKey:row.biosampleId})

        MERGE (agm)-[agmds:ASSOCIATION]-(ds)
    
    """

    htp_dataset_sample_agmtext_query_template = """
    
        USING PERIODIC COMMIT %s
           LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
    
        MATCH (ds:HTPDatasetSample {primaryKey:row.datasetSampleId})
        MERGE (agm:AffectedGenomicModel {primaryKey:row.biosampleText})

        MERGE (agm)-[agmds:ASSOCIATION]-(ds)
    
    """

    htp_bio_entity_expression_query_template = """

       USING PERIODIC COMMIT %s
           LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

       MATCH (dss:HTPDatasetSample {primaryKey:row.datasetSampleId})
       
       MERGE (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
            ON CREATE SET e.whereExpressedStatement = row.whereExpressedStatement
       
       MERGE (dss)-[dsdss:STRUCTURE_SAMPLED]-(e)
            
    """

    htp_stages_query_template = """

       USING PERIODIC COMMIT %s
           LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

       MATCH (dss:HTPDatasetSample {primaryKey:row.datasetSampleId})
       MATCH (st:Stage {primaryKey:row.stageName})
               
       MERGE (dss)-[eotcctq:SAMPLED_DURING]-(st)
       
    """

    htp_dataset_join_query_template = """

       USING PERIODIC COMMIT %s
           LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

       MATCH (ds:HTPDataset {primaryKey:row.datasetId})
       MATCH (dss:HTPDatasetSample {primaryKey:row.datasetSampleId})
       
       MERGE (ds)-[dsdss:ASSOCIATION]-(dss)
    
    """

    htp_secondaryIds_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

        MATCH (dss:HTPDatasetSample {primaryKey: row.datasetSampleId})

        MERGE (sec:SecondaryId:Identifier {primaryKey:row.secondaryId})
                ON CREATE SET sec.name = row.secondaryId

        MERGE (dss)<-[aka:ALSO_KNOWN_AS]-(sec)


    """

    ao_substructures_query_template = """
     USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (otasst:Ontology {primaryKey:row.anatomicalSubStructureTermId})
                WHERE NOT 'UBERONTerm' in LABELS(otasst)
                    AND NOT 'FBCVTerm' in LABELS(otasst)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})       
            MERGE (e)-[eotasst:ANATOMICAL_SUBSTRUCTURE]->(otasst) 
    
    """

    ao_qualifiers_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (otasst:Ontology {primaryKey:row.anatomicalStructureQualifierTermId})
                WHERE NOT 'UBERONTerm' in LABELS(otasst)
                    AND NOT 'FBCVTerm' in LABELS(otasst)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})       
            MERGE (e)-[eotasst:ANATOMICAL_SUBSTRUCTURE]->(otasst) 
    
    
    """

    ao_ss_qualifiers_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (otasst:Ontology {primaryKey:row.anatomicalSubStructureQualifierTermId})
                WHERE NOT 'UBERONTerm' in LABELS(otasst)
                    AND NOT 'FBCVTerm' in LABELS(otasst)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})       
            MERGE (e)-[eotasst:ANATOMICAL_SUBSTRUCTURE]->(otasst) 


    """

    ao_terms_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (otasst:Ontology {primaryKey:row.anatomicalStructureTermId})
                WHERE NOT 'FBCVTerm' in LABELS(otasst)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})       
            MERGE (e)-[eotasst:ANATOMICAL_SUBSTRUCTURE]->(otasst) 
    """

    cc_term_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (otasst:Ontology {primaryKey:row.cellularComponentTermId})
                WHERE NOT 'FBCVTerm' in LABELS(otasst)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})       
            MERGE (e)-[eotasst:ANATOMICAL_SUBSTRUCTURE]->(otasst) 
    """

    eas_substructure_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (otasst:Ontology {primaryKey:row.anatomicalSubStructureTermId})
                WHERE NOT 'FBCVTerm' in LABELS(otasst)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})       
            MERGE (e)-[eotasst:ANATOMICAL_SUB_SUBSTRUCTURE]->(otasst) """

    eas_qualified_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (otastq:Ontology {primaryKey:row.anatomicalStructureQualifierTermId})
                WHERE NOT 'FBCVTerm' in LABELS(otastq)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
            MERGE (e)-[eotastq:ANATOMICAL_STRUCTURE_QUALIFIER]-(otastq) """

    eass_qualified_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (otasstq:Ontology {primaryKey:row.anatomicalSubStructureQualifierTermId})
                WHERE NOT 'UBERONTerm' in LABELS(otasstq)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})

            MERGE (e)-[eotasstq:ANATOMICAL_SUB_STRUCTURE_QUALIFIER]-(otasstq) """

    ccq_expression_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (otcctq:Ontology {primaryKey:row.cellularComponentQualifierTermId})
                WHERE NOT 'UBERONTerm' in LABELS(otcctq)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})

            MERGE (e)-[eotcctq:CELLULAR_COMPONENT_QUALIFIER]-(otcctq) """

    stage_expression_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (ei:BioEntityGeneExpressionJoin {primaryKey:row.ei_uuid})
            MERGE (s:Stage {primaryKey:row.stageName})
                ON CREATE SET s.name = row.stageName
            MERGE (ei)-[eotcctq:DURING]-(s) """

    uberon_ao_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (ebe:ExpressionBioEntity {primaryKey:row.ebe_uuid})  
            MATCH (o:Ontology:UBERONTerm {primaryKey:row.aoUberonId})     
            MERGE (ebe)-[ebeo:ANATOMICAL_RIBBON_TERM]-(o) """

    uberon_stage_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (ei:BioEntityGeneExpressionJoin {primaryKey:row.ei_uuid})  
            MATCH (o:Ontology:UBERONTerm {primaryKey:row.uberonStageId})

            MERGE (ei)-[eio:STAGE_RIBBON_TERM]-(o) """

    uberon_ao_other_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (ebe:ExpressionBioEntity {primaryKey:row.ebe_uuid}) 
            MATCH (u:Ontology:UBERONTerm:Ontology {primaryKey:'UBERON:AnatomyOtherLocation'}) 
            MERGE (ebe)-[ebeu:ANATOMICAL_RIBBON_TERM]-(u) """

    uberon_stage_other_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (ei:BioEntityGeneExpressionJoin {primaryKey:row.ei_uuid})
            MATCH (u:Ontology:UBERONTerm:Ontology {primaryKey:'UBERON:PostEmbryonicPreAdult'})

            MERGE (ei)-[eiu:STAGE_RIBBON_TERM]-(u) """

    htp_dataset_sample_assemblies_query_template = """
            USING PERIODIC COMMIT %s
            LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

                MATCH (ds:HTPDatasetSample {primaryKey:row.datasetSampleId})
                MATCH (u:Assembly {primaryKey:row.assembly})

                MERGE (ds)-[dsu:ASSEMBLY]-(u) """

    htpdatasetsample_xrefs_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (o:HTPDatasetSample {primaryKey:row.datasetId}) """ + ETLHelper.get_cypher_xref_text()

    def __init__(self, config):
        super().__init__()
        self.data_type_config = config

    def _load_and_process_data(self):
        thread_pool = []

        for sub_type in self.data_type_config.get_sub_type_objects():
            p = multiprocessing.Process(target=self._process_sub_type,
                                        args=(sub_type, ))
            p.start()
            thread_pool.append(p)

        ETL.wait_for_threads(thread_pool)

    def _process_sub_type(self, sub_type):

        logger.info("Loading HTP metadata sample data: %s" %
                    sub_type.get_data_provider())
        filepath = sub_type.get_filepath()
        logger.info(filepath)
        data = JSONFile().get_data(filepath)
        logger.info("Finished Loading HTP metadata sample data: %s" %
                    sub_type.get_data_provider())

        if data is None:
            logger.warn("No Data found for %s skipping" %
                        sub_type.get_data_provider())
            return

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in this format (template, param1, params2) others will be ignored
        query_list = [
            [
                HTPMetaDatasetSampleETL.htp_dataset_sample_query_template,
                commit_size, "htp_metadataset_sample_samples_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.
                htp_bio_entity_expression_query_template, commit_size,
                "htp_metadataset_sample_bioentities_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.htp_secondaryIds_query_template,
                commit_size, "htp_metadataset_sample_secondaryIds_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.htp_dataset_join_query_template,
                commit_size, "htp_metadataset_sample_datasets_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.htp_stages_query_template, commit_size,
                "htp_metadataset_sample_stages_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.ao_terms_query_template, commit_size,
                "htp_metadataset_sample_aoterms_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.ao_substructures_query_template,
                commit_size, "htp_metadataset_sample_ao_substructures_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.ao_qualifiers_query_template,
                commit_size, "htp_metadataset_sample_ao_qualifiers_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.ao_ss_qualifiers_query_template,
                commit_size, "htp_metadataset_sample_ao_ss_qualifiers_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.cc_term_query_template, commit_size,
                "htp_metadataset_sample_ccterms" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.ccq_expression_query_template,
                commit_size, "htp_metadataset_sample_ccqterms_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.uberon_ao_query_template, commit_size,
                "htp_metadataset_sample_uberon_ao_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.uberon_ao_other_query_template,
                commit_size, "htp_metadataset_sample_uberon_ao_other_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.htp_dataset_sample_agm_query_template,
                commit_size, "htp_metadataset_sample_agms_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.
                htp_dataset_sample_agmtext_query_template, commit_size,
                "htp_metadataset_sample_agmstext_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.
                htp_dataset_sample_assemblies_query_template, commit_size,
                "htp_metadataset_sample_assemblies_" +
                sub_type.get_data_provider() + ".csv"
            ]
        ]

        # Obtain the generator
        generators = self.get_generators(data, batch_size)

        query_and_file_list = self.process_query_params(query_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)

    def get_generators(self, htp_datasetsample_data, batch_size):

        htp_datasetsamples = []
        secondaryIds = []
        datasetIds = []
        assemblies = []
        uberon_ao_data = []
        ao_qualifiers = []
        bio_entities = []
        ao_ss_qualifiers = []
        ao_substructures = []
        ao_terms = []
        uberon_ao_other_data = []
        stages = []
        ccq_components = []
        cc_components = []
        biosamples = []
        biosamplesTexts = []
        counter = 0

        data_provider_object = htp_datasetsample_data['metaData'][
            'dataProvider']

        data_provider_cross_ref = data_provider_object.get('crossReference')

        for datasample_record in htp_datasetsample_data['data']:

            counter = counter + 1

            biosampleId = ''
            biosampleText = ''
            sampleId = ''
            sampleTitle = ''

            if 'sampleId' in datasample_record:
                sampleIdObj = datasample_record.get('sampleId')
                sampleId = sampleIdObj.get('primaryId')

                if 'secondaryIds' in sampleIdObj:
                    for secId in sampleIdObj.get('secondaryIds'):
                        secid = {
                            "datasetSampleId": sampleId,
                            "secondaryId": secId
                        }
                        secondaryIds.append(secid)

            if 'sampleTitle' in datasample_record:
                sampleTitle = datasample_record.get('sampleTitle')

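            # The dataset sample's primary key is the concatenation of its
            # sample ID and title.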
            datasetSampleId = sampleId + sampleTitle

            if 'datasetIds' in datasample_record:
                datasetIdSet = datasample_record.get('datasetIds')
                for datasetID in datasetIdSet:
                    datasetsample = {
                        "datasetSampleId": datasetSampleId,
                        "datasetId": datasetID
                    }
                    datasetIds.append(datasetsample)

                    if self.test_object.using_test_data() is True:
                        is_it_test_entry = self.test_object.check_for_test_id_entry(
                            datasetID)
                        if is_it_test_entry is False:
                            counter = counter - 1
                            continue

            if 'genomicInformation' in datasample_record:
                genomicInformation = datasample_record.get(
                    'genomicInformation')
                if 'biosampleId' in genomicInformation:
                    biosampleId = genomicInformation.get('biosampleId')
                if 'bioSampleText' in genomicInformation:
                    biosampleText = genomicInformation.get('bioSampleText')

                if biosampleId is not None and biosampleId != '':
                    biosample = {
                        "biosampleId": biosampleId,
                        "datasetSampleId": datasetSampleId
                    }
                    biosamples.append(biosample)

                if biosampleText is not None and biosampleText != '' and biosampleId == '':
                    biosample_text_entry = {
                        "biosampleText": biosampleText,
                        "datasetSampleId": datasetSampleId
                    }
                    biosamplesTexts.append(biosample_text_entry)

            if 'assemblyVersions' in datasample_record:
                for assembly in datasample_record.get('assemblyVersions'):

                    datasetsample = {
                        "datasetSampleId": datasetSampleId,
                        "assembly": assembly
                    }
                    assemblies.append(datasetsample)

            age = ''
            if 'sampleAge' in datasample_record:
                sampleAge = datasample_record.get('sampleAge')
                stageId = ""
                if 'age' in sampleAge:
                    age = sampleAge.get('age')
                    stageId = stageId + age
                if 'stage' in sampleAge:
                    stage = sampleAge.get('stage')
                    stageId = stageId + stage.get('stageName')

                    stage = {
                        "stageId": stageId,
                        "stageTermId": stage.get('stageTermId'),
                        "stageName": stage.get('stageName'),
                        "stageUberonSlimTerm":
                        stage.get('stageUberonSlimTerm'),
                        "sampleAge": age,
                        "datasetSampleId": datasetSampleId
                    }
                    stages.append(stage)
                else:
                    stage = {"stageId": stageId, "sampleAge": age}
                    stages.append(stage)

            if 'sampleLocations' in datasample_record:
                sampleLocations = datasample_record.get('sampleLocations')

                for location in sampleLocations:

                    cellular_component_qualifier_term_id = location.get(
                        'cellularComponentQualifierTermId')
                    cellular_component_term_id = location.get(
                        'cellularComponentTermId')
                    anatomical_structure_term_id = location.get(
                        'anatomicalStructureTermId')
                    anatomical_structure_qualifier_term_id = location.get(
                        'anatomicalStructureQualifierTermId')
                    anatomical_sub_structure_term_id = location.get(
                        'anatomicalSubStructureTermId')
                    anatomical_sub_structure_qualifier_term_id = location.get(
                        'anatomicalSubStructureQualifierTermId')
                    where_expressed_statement = location.get(
                        'whereExpressedStatement')

                    expression_unique_key = datasetSampleId
                    expression_entity_unique_key = ''

                    if anatomical_structure_term_id is not None:
                        expression_unique_key += anatomical_structure_term_id
                        expression_entity_unique_key = anatomical_structure_term_id

                        if anatomical_structure_qualifier_term_id is not None:
                            expression_unique_key += anatomical_structure_qualifier_term_id
                            expression_entity_unique_key += anatomical_structure_qualifier_term_id

                    if cellular_component_term_id is not None:
                        expression_unique_key += cellular_component_term_id
                        expression_entity_unique_key += cellular_component_term_id

                        if cellular_component_qualifier_term_id is not None:
                            expression_unique_key += cellular_component_qualifier_term_id
                            expression_entity_unique_key += cellular_component_qualifier_term_id

                    if anatomical_sub_structure_term_id is not None:
                        expression_unique_key += anatomical_sub_structure_term_id

                        if anatomical_sub_structure_qualifier_term_id is not None:
                            expression_unique_key += anatomical_sub_structure_qualifier_term_id
                            expression_entity_unique_key += anatomical_sub_structure_qualifier_term_id

                    if where_expressed_statement is None:
                        where_expressed_statement = ""

                    expression_entity_unique_key += where_expressed_statement

                    if location.get('anatomicalStructureUberonSlimTermIds'
                                    ) is not None:

                        for uberon_structure_term_object in location.get(
                                'anatomicalStructureUberonSlimTermIds'):
                            structure_uberon_term_id = uberon_structure_term_object.get(
                                'uberonTerm')

                            if structure_uberon_term_id is not None and structure_uberon_term_id != 'Other':
                                structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key,
                                    "aoUberonId": structure_uberon_term_id
                                }
                                uberon_ao_data.append(structure_uberon_term)

                            elif structure_uberon_term_id is not None and structure_uberon_term_id == 'Other':
                                other_structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key
                                }
                                uberon_ao_other_data.append(
                                    other_structure_uberon_term)

                    if location.get('anatomicalSubStructureUberonSlimTermIds'
                                    ) is not None:

                        for uberon_sub_structure_term_object in location.get(
                                'anatomicalSubStructureUberonSlimTermIds'):
                            sub_structure_uberon_term_id = uberon_sub_structure_term_object.get(
                                'uberonTerm')

                            if sub_structure_uberon_term_id is not None and sub_structure_uberon_term_id != 'Other':
                                sub_structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key,
                                    "aoUberonId": sub_structure_uberon_term_id
                                }
                                uberon_ao_data.append(
                                    sub_structure_uberon_term)

                            elif sub_structure_uberon_term_id is not None and sub_structure_uberon_term_id == 'Other':
                                other_structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key
                                }
                                uberon_ao_other_data.append(
                                    other_structure_uberon_term)

                    if cellular_component_term_id is not None:
                        cc_term = {
                            "ebe_uuid": expression_entity_unique_key,
                            "cellularComponentTermId":
                            cellular_component_term_id
                        }
                        cc_components.append(cc_term)

                    if cellular_component_qualifier_term_id is not None:
                        ccq_term = {
                            "ebe_uuid":
                            expression_entity_unique_key,
                            "cellularComponentQualifierTermId":
                            cellular_component_qualifier_term_id
                        }
                        ccq_components.append(ccq_term)

                    if anatomical_structure_term_id is not None:
                        ao_term = {
                            "ebe_uuid":
                            expression_entity_unique_key,
                            "anatomicalStructureTermId":
                            anatomical_structure_term_id
                        }
                        ao_terms.append(ao_term)

                    if anatomical_structure_qualifier_term_id is not None:
                        ao_qualifier = {
                            "ebe_uuid":
                            expression_entity_unique_key,
                            "anatomicalStructureQualifierTermId":
                            anatomical_structure_qualifier_term_id
                        }

                        ao_qualifiers.append(ao_qualifier)

                    if anatomical_sub_structure_term_id is not None:
                        ao_substructure = {
                            "ebe_uuid":
                            expression_entity_unique_key,
                            "anatomicalSubStructureTermId":
                            anatomical_sub_structure_term_id
                        }

                        ao_substructures.append(ao_substructure)

                    if anatomical_sub_structure_qualifier_term_id is not None:
                        ao_ss_qualifier = {
                            "ebe_uuid":
                            expression_entity_unique_key,
                            "anatomicalSubStructureQualifierTermId":
                            anatomical_sub_structure_qualifier_term_id
                        }

                        ao_ss_qualifiers.append(ao_ss_qualifier)

                    bio_entity = {
                        "ebe_uuid": expression_entity_unique_key,
                        "whereExpressedStatement": where_expressed_statement,
                        "datasetSampleId": datasetSampleId
                    }
                    bio_entities.append(bio_entity)

            htp_dataset_sample = {
                "datasetSampleId": datasetSampleId,
                "abundance": datasample_record.get('abundance'),
                "sampleType": datasample_record.get('sampleType'),
                "taxonId": datasample_record.get('taxonId'),
                "sex": datasample_record.get('sex'),
                "assayType": datasample_record.get('assayType'),
                "notes": datasample_record.get('notes'),
                "dateAssigned": datasample_record.get('dateAssigned'),
                "sequencingFormat": datasample_record.get('sequencingFormat'),
                "sampleTitle": sampleTitle,
                "sampleAge": age
            }

            htp_datasetsamples.append(htp_dataset_sample)

            #
            # if self.test_object.using_test_data() is True:
            #     is_it_test_entry = self.test_object.check_for_test_id_entry(datasetID)
            #     if is_it_test_entry is True:
            #         self.logger.info(htp_dataset_sample)

            if counter == batch_size:
                yield [
                    htp_datasetsamples,
                    bio_entities,
                    secondaryIds,
                    datasetIds,
                    stages,
                    ao_terms,
                    ao_substructures,
                    ao_qualifiers,
                    ao_ss_qualifiers,
                    cc_components,
                    ccq_components,
                    uberon_ao_data,
                    uberon_ao_other_data,
                    biosamples,
                    biosamplesTexts,
                    assemblies,
                ]
                counter = 0
                htp_datasetsamples = []
                datasetIds = []
                uberon_ao_data = []
                ao_qualifiers = []
                bio_entities = []
                ao_ss_qualifiers = []
                ao_substructures = []
                ao_terms = []
                uberon_ao_other_data = []
                stages = []
                ccq_components = []
                cc_components = []
                biosamples = []
                secondaryIds = []
                biosamplesTexts = []
                assemblies = []

        if counter > 0:
            yield [
                htp_datasetsamples, bio_entities, secondaryIds, datasetIds,
                stages, ao_terms, ao_substructures, ao_qualifiers,
                ao_ss_qualifiers, cc_components, ccq_components,
                uberon_ao_data, uberon_ao_other_data, biosamples,
                biosamplesTexts, assemblies
            ]
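
The accumulate/yield/reset idiom above recurs in every get_generators method in this section: rows collect per record, all accumulators are yielded together once counter reaches batch_size, every yielded list is then reset, and a final partial batch is flushed after the loop. A stripped-down sketch with hypothetical names:

def batched_rows(records, batch_size):
    """Hedged sketch of the accumulate/yield/reset batching idiom."""
    rows = []
    counter = 0
    for record in records:
        counter += 1
        rows.append({"primaryKey": record})
        if counter == batch_size:
            yield [rows]
            rows = []      # every yielded accumulator must be reset here
            counter = 0
    if counter > 0:        # flush the final partial batch
        yield [rows]
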
Code Example #4
class HTPMetaDatasetETL(ETL):
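    """HTP Metadata Dataset ETL"""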

    htp_dataset_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
        
        CREATE (ds:HTPDataset {primaryKey:row.datasetId})
          SET ds.dateAssigned = row.dateAssigned,
              ds.summary = row.summary,
              ds.numChannels = row.numChannels,
              ds.subSeries = row.subSeries
         """

    htp_dataset_pub_query_template = """
        
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
        
        MATCH (ds:HTPDataset {primaryKey: row.datasetId})
        
        MERGE (p:Publication {primaryKey: row.pubPrimaryKey})
            ON CREATE SET p.pubModId = row.pubModId,
                          p.pubMedId = row.pubMedId,
                          p.pubModUrl = row.pubModUrl,
                          p.pubMedUrl = row.pubMedUrl
                          
        MERGE (p)-[:ASSOCIATION]-(ds)
    
    """

    htp_category_tags_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
        
        MATCH (ds:HTPDataset {primaryKey:row.datasetId})
        
        MERGE (ct:CategoryTag {primaryKey:row.tag})
        
        MERGE (ds)-[:CATEGORY_TAG]-(ct)    
            
    """

    htp_secondaryIds_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

        MATCH (ds:HTPDataset {primaryKey: row.datasetId})
        
        MERGE (s:SecondaryId:Identifier {primaryKey:row.secondaryId})
                ON CREATE SET s.name = row.secondaryId
                
        MERGE (ds)-[aka:ALSO_KNOWN_AS]-(s)
   

    """

    htpdataset_xrefs_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (o:HTPDataset {primaryKey:row.datasetId}) """ + ETLHelper.get_cypher_xref_text()

    def __init__(self, config):
        super().__init__()
        self.data_type_config = config

    def _load_and_process_data(self):
        thread_pool = []

        for sub_type in self.data_type_config.get_sub_type_objects():
            p = multiprocessing.Process(target=self._process_sub_type,
                                        args=(sub_type, ))
            p.start()
            thread_pool.append(p)

        ETL.wait_for_threads(thread_pool)

    def _process_sub_type(self, sub_type):

        logger.info("Loading HTP metadata Data: %s" %
                    sub_type.get_data_provider())
        filepath = sub_type.get_filepath()
        logger.info(filepath)
        data = JSONFile().get_data(filepath)
        logger.info("Finished Loading HTP metadata Data: %s" %
                    sub_type.get_data_provider())

        if data is None:
            logger.warn("No Data found for %s skipping" %
                        sub_type.get_data_provider())
            return

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in this format (template, param1, params2) others will be ignored
        query_list = [
            [
                HTPMetaDatasetETL.htp_dataset_query_template, commit_size,
                "htp_metadataset_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetETL.htp_category_tags_query_template,
                commit_size,
                "htp_metadataset_tags_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetETL.htp_dataset_pub_query_template, commit_size,
                "htp_metadataset_publications_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetETL.htpdataset_xrefs_template, commit_size,
                "htp_metadataset_xrefs_" + sub_type.get_data_provider() +
                ".csv"
            ],
            [
                HTPMetaDatasetETL.htp_secondaryIds_query_template, commit_size,
                "htp_metadataset_secondaryIds_" +
                sub_type.get_data_provider() + ".csv"
            ],
        ]

        # Obtain the generator
        generators = self.get_generators(data, batch_size)

        query_and_file_list = self.process_query_params(query_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)

    def get_generators(self, htp_dataset_data, batch_size):
        dataset_tags = []
        data_providers = []
        htp_datasets = []
        publications = []
        secondaryIds = []
        cross_reference_list = []
        counter = 0
        date_produced = htp_dataset_data['metaData']['dateProduced']

        data_provider_object = htp_dataset_data['metaData']['dataProvider']

        data_provider_cross_ref = data_provider_object.get('crossReference')
        data_provider = data_provider_cross_ref.get('id')
        data_provider_pages = data_provider_cross_ref.get('pages')
        data_provider_cross_ref_set = []

        for dataset_record in htp_dataset_data['data']:

            counter = counter + 1

            dataset = dataset_record.get('datasetId')
            datasetId = dataset.get('primaryId')

            # spoke to RGD and they wish to remove these datasets as they overlap with SGD.

            if (datasetId == 'GEO:GSE18157'
                    or datasetId == 'GEO:GSE33497') and data_provider == 'RGD':
                continue
            if 'secondaryIds' in dataset:
                for secId in dataset.get('secondaryIds'):
                    secid = {"datasetId": datasetId, "secondaryId": secId}
                    secondaryIds.append(secid)

            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(
                    datasetId)
                if is_it_test_entry is False:
                    counter = counter - 1
                    continue

            if 'crossReference' in dataset:
                crossRefO = dataset.get('crossReference')
                if crossRefO is not None:
                    crossRefId = crossRefO.get('id')
                    local_crossref_id = crossRefId.split(":")[1]
                    prefix = crossRefId.split(":")[0]
                    pages = crossRefO.get('pages')

                    # some pages collection have 0 elements
                    if pages is not None and len(pages) > 0:
                        for page in pages:
                            mod_global_cross_ref_url = self.etlh.rdh2.return_url_from_key_value(
                                prefix, local_crossref_id, page)
                            xref = ETLHelper.get_xref_dict(
                                local_crossref_id, prefix, page, page,
                                crossRefId, mod_global_cross_ref_url,
                                crossRefId + page)
                            xref['dataId'] = datasetId
                            cross_reference_list.append(xref)

            category_tags = dataset_record.get('categoryTags')

            if category_tags is not None:
                for tag in category_tags:
                    dataset_category_tag = {"datasetId": datasetId, "tag": tag}
                    dataset_tags.append(dataset_category_tag)

            publicationNew = dataset_record.get('publications')
            if publicationNew is not None:
                for pub in publicationNew:
                    pid = pub.get('publicationId')
                    publication_mod_id = ""
                    pub_med_id = ""
                    pub_mod_url = ""
                    pub_med_url = ""
                    if pid is not None and pid.startswith('PMID:'):
                        pub_med_id = pid
                        local_pub_med_id = pub_med_id.split(":")[1]
                        pub_med_url = self.etlh.get_no_page_complete_url(
                            local_pub_med_id, 'PMID', pub_med_id)
                        if 'crossReference' in pub:
                            pub_xref = pub.get('crossReference')
                            publication_mod_id = pub_xref.get('id')
                            pub_mod_url = self.etlh.rdh2.return_url_from_identifier(
                                publication_mod_id)
                    elif pid is not None and not pid.startswith('PMID:'):
                        publication_mod_id = pub.get('publicationId')
                        pub_mod_url = self.etlh.rdh2.return_url_from_identifier(
                            publication_mod_id)

                    publication = {
                        "datasetId": datasetId,
                        "pubPrimaryKey": publication_mod_id + pub_med_id,
                        "pubModId": publication_mod_id,
                        "pubMedId": pub_med_id,
                        "pubMedUrl": pub_med_url,
                        "pubModUrl": pub_mod_url
                    }
                    publications.append(publication)

            htp_dataset = {
                "datasetId": datasetId,
                "dateAssigned": dataset_record.get('dateAssigned'),
                "title": dataset_record.get('title'),
                "summary": dataset_record.get('summary'),
                "numChannels": dataset_record.get('numChannels'),
                "subSeries": dataset_record.get('subSeries')
            }
            htp_datasets.append(htp_dataset)

            if counter == batch_size:
                yield [
                    htp_datasets, dataset_tags, publications,
                    cross_reference_list, secondaryIds
                ]
                counter = 0
                htp_datasets = []
                dataset_tags = []
                publications = []
                cross_reference_list = []
                secondaryIds = []

        if counter > 0:
            yield [
                htp_datasets, dataset_tags, publications, cross_reference_list,
                secondaryIds
            ]
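
This generator follows the batching pattern used throughout these loaders: accumulate row dictionaries per output file, yield all of the lists once batch_size records have been seen, then flush whatever remains after the loop. Below is a minimal, self-contained sketch of that pattern; batched_rows and the record shape are invented for illustration and are not part of agr_loader.

# Minimal sketch of the batch-yielding generator pattern (illustrative only).
def batched_rows(records, batch_size):
    """Accumulate row dicts and yield them in lists of at most batch_size."""
    rows = []
    counter = 0
    for record in records:
        counter += 1
        rows.append({"datasetId": record.get("primaryId")})
        if counter == batch_size:
            yield [rows]
            counter = 0
            rows = []
    if counter > 0:
        yield [rows]  # flush the final partial batch


# Five invented records produce two full batches of two and a final batch of one.
records = [{"primaryId": "GEO:GSE%d" % i} for i in range(5)]
for batch in batched_rows(records, 2):
    print(len(batch[0]))  # 2, 2, 1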
Code example #5
class VariationETL(ETL):
    """Variation ETL."""

    logger = logging.getLogger(__name__)

    # Query templates which take params and will be processed later

    variation_query_template = """
            USING PERIODIC COMMIT %s
            LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

                MATCH (a:Allele {primaryKey:row.alleleId})
                MATCH (g:Gene)-[:IS_ALLELE_OF]-(a)

                //Create the variant node and set properties. primaryKey is required.
                MERGE (o:Variant {primaryKey:row.hgvs_nomenclature})
                    ON CREATE SET
                     o.name = row.variantHGVSSynonym,
                     o.hgvsNomenclature = row.hgvs_nomenclature,
                     o.genomicReferenceSequence = row.genomicReferenceSequence,
                     o.paddingLeft = row.paddingLeft,
                     o.paddingRight = row.paddingRight,
                     o.genomicVariantSequence = row.genomicVariantSequence,
                     o.dateProduced = row.dateProduced,
                     o.release = row.release,
                     o.dataProviders = row.dataProviders,
                     o.dataProvider = row.dataProvider

                MERGE (s:Synonym:Identifier {primaryKey:row.hgvs_nomenclature})
                    SET s.name = row.hgvs_nomenclature
                MERGE (o)-[aka2:ALSO_KNOWN_AS]->(s)

                MERGE (o)-[:VARIATION]->(a)
                MERGE (g)-[:COMPUTED_GENE]->(o) """

    so_terms_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (o:Variant {primaryKey:row.variantId})
            MATCH (s:SOTerm {primaryKey:row.soTermId})
            CREATE (o)-[:VARIATION_TYPE]->(s)"""

    genomic_locations_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (o:Variant {primaryKey:row.variantId})
            MATCH (chrm:Chromosome {primaryKey:row.chromosome})
            MERGE (a:Assembly {primaryKey:row.assembly})
             ON CREATE SET a.dataProvider = row.dataProvider

            CREATE (o)-[gchrm:LOCATED_ON]->(chrm)

            CREATE (gchrmn:GenomicLocation {primaryKey:row.uuid})
              SET gchrmn.start = apoc.number.parseInt(row.start),
                gchrmn.end = apoc.number.parseInt(row.end),
                gchrmn.assembly = row.assembly,
                gchrmn.strand = row.strand,
                gchrmn.chromosome = row.chromosome

            CREATE (o)-[of:ASSOCIATION]->(gchrmn)
            CREATE (gchrmn)-[ofc:ASSOCIATION]->(chrm)
            CREATE (gchrmn)-[ao:ASSOCIATION]->(a)
    """

    xrefs_query_template = """

        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (o:Variant {primaryKey:row.dataId})
    """ + ETLHelper.get_cypher_xref_text()

    def __init__(self, config):
        """Initialise object."""
        super().__init__()
        self.data_type_config = config

    def _load_and_process_data(self):
        thread_pool = []

        for sub_type in self.data_type_config.get_sub_type_objects():
            process = multiprocessing.Process(target=self._process_sub_type,
                                              args=(sub_type, ))
            process.start()
            thread_pool.append(process)

        ETL.wait_for_threads(thread_pool)

    def _process_sub_type(self, sub_type):

        self.logger.info("Loading Variation Data: %s",
                         sub_type.get_data_provider())
        filepath = sub_type.get_filepath()
        data = JSONFile().get_data(filepath)
        self.logger.info("Finished Loading Variation Data: %s",
                         sub_type.get_data_provider())

        if data is None:
            self.logger.warning("No Data found for %s skipping",
                                sub_type.get_data_provider())
            return

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in this format (template, param1, param2); other entries will be ignored

        query_template_list = [
            [
                self.variation_query_template, commit_size,
                "variation_data_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.genomic_locations_query_template, commit_size,
                "variant_genomiclocations_" + sub_type.get_data_provider() +
                ".csv"
            ],
            [
                self.so_terms_query_template, commit_size,
                "variant_so_terms_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.xrefs_query_template, commit_size,
                "variant_xrefs_" + sub_type.get_data_provider() + ".csv"
            ]
        ]

        generators = self.get_generators(data, batch_size)
        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        self.error_messages("Var-{}: ".format(sub_type.get_data_provider()))

    def get_hgvs_nomenclature(self, refseq_id, variant_type, start_position,
                              end_position, reference_sequence,
                              variant_sequence, assembly, chromosome):
        """Get HGVS nomenclature."""
        if start_position is None:
            start_position_str = ""
        else:
            start_position_str = str(start_position)

        if end_position is None:
            end_position_str = ""
        else:
            end_position_str = str(end_position)

        if variant_sequence is None:
            variant_sequence_str = ""
        else:
            variant_sequence_str = variant_sequence

        if reference_sequence is None:
            reference_sequence_str = ""
        else:
            reference_sequence_str = reference_sequence

        hgvs_nomenclature = refseq_id.split(
            ":")[1] + ':g.' + start_position_str
        hgvs_synonym = '(' + assembly + ')' + chromosome + ':' + start_position_str

        if variant_type in ['SO:1000002',
                            'SO:1000008']:  # point mutation/substitution
            hgvs_nomenclature += reference_sequence_str + ">" + variant_sequence_str
            hgvs_synonym += reference_sequence_str + ">" + variant_sequence_str
        elif variant_type == "SO:0000667":  # insertion
            hgvs_nomenclature += '_' + end_position_str + 'ins' + variant_sequence_str
            hgvs_synonym += '_' + end_position_str + 'ins' + variant_sequence_str
        elif variant_type == "SO:0000159":  # deletion
            hgvs_nomenclature += '_' + end_position_str + 'del'
            hgvs_synonym += '_' + end_position_str + 'del'
        elif variant_type == "SO:0002007":  # MNV
            hgvs_nomenclature += '_' + end_position_str + 'delins' + variant_sequence_str
            hgvs_synonym += '_' + end_position_str + 'delins' + variant_sequence_str
        elif variant_type == "SO:1000032":  # DELIN
            hgvs_nomenclature += '_' + end_position_str + 'delins' + variant_sequence_str
            hgvs_synonym += '_' + end_position_str + 'delins' + variant_sequence_str
        else:
            hgvs_nomenclature = ''
            hgvs_synonym = ''
        return hgvs_nomenclature, hgvs_synonym

    def get_generators(self, variant_data, batch_size):  # noqa
        """Get Generators."""

        data_providers = []
        release = ""
        variants = []
        variant_genomic_locations = []
        variant_so_terms = []
        cross_references = []
        counter = 0
        date_produced = variant_data['metaData']['dateProduced']

        self.data_providers_process(variant_data)
        load_key = date_produced + self.data_provider + "_VARIATION"

        if 'release' in variant_data['metaData']:
            release = variant_data['metaData']['release']

        assemblies = {}
        for allele_record in variant_data['data']:
            chromosome = allele_record["chromosome"]
            if chromosome.startswith("chr"):
                chromosome_str = chromosome[3:]
            else:
                chromosome_str = chromosome

            assembly = allele_record["assembly"]

            if assembly not in assemblies:
                self.logger.info(assembly)
                context_info = ContextInfo()
                data_manager = DataFileManager(
                    context_info.config_file_location)
                assemblies[assembly] = AssemblySequenceHelper(
                    assembly, data_manager)

            so_term_id = allele_record.get('type')
            genomic_reference_sequence = allele_record.get(
                'genomicReferenceSequence')
            genomic_variant_sequence = allele_record.get(
                'genomicVariantSequence')

            if genomic_reference_sequence == 'N/A':
                genomic_reference_sequence = ""
            if genomic_variant_sequence == 'N/A':
                genomic_variant_sequence = ""

            padding_left = ""
            padding_right = ""
            if allele_record.get('start') != "" and allele_record.get(
                    'end') != "":

                # not insertion
                if so_term_id != "SO:0000667" and chromosome_str != "Unmapped_Scaffold_8_D1580_D1567":
                    genomic_reference_sequence = assemblies[
                        assembly].get_sequence(chromosome_str,
                                               allele_record.get('start'),
                                               allele_record.get('end'))

                if allele_record.get('start') < allele_record.get('end'):
                    start = allele_record.get('start')
                    end = allele_record.get('end')
                else:
                    start = allele_record.get('end')
                    end = allele_record.get('start')

                padding_width = 500
                if so_term_id != "SO:0000667":  # not insertion
                    start = start - 1
                    end = end + 1

                left_padding_start = start - padding_width
                if left_padding_start < 1:
                    left_padding_start = 1

                padding_left = assemblies[assembly].get_sequence(
                    chromosome_str, left_padding_start, start)
                right_padding_end = end + padding_width
                padding_right = assemblies[assembly].get_sequence(
                    chromosome_str, end, right_padding_end)
            counter = counter + 1
            global_id = allele_record.get('alleleId')
            mod_global_cross_ref_id = ""
            # Note: cross_references accumulates across the batch and is
            # reset only after each yield below.

            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(
                    global_id)
                if is_it_test_entry is False:
                    counter = counter - 1
                    continue

            cross_ref_primary_id = allele_record.get(
                'sequenceOfReferenceAccessionNumber')
            if cross_ref_primary_id is not None:
                local_cross_ref_id = cross_ref_primary_id.split(":")[1]
                prefix = cross_ref_primary_id.split(":")[0]

                cross_ref_complete_url = self.etlh.get_no_page_complete_url(
                    local_cross_ref_id, prefix, global_id)
                xref_map = ETLHelper.get_xref_dict(
                    local_cross_ref_id, prefix, "variant_sequence_of_reference",
                    "sequence_of_reference_accession_number", global_id,
                    cross_ref_complete_url,
                    cross_ref_primary_id + "variant_sequence_of_reference")

                xref_map['dataId'] = global_id
                cross_references.append(xref_map)

            if genomic_reference_sequence is not None:
                if len(genomic_reference_sequence) > 1000 and (
                        allele_record.get('type') == 'SO:1000002'
                        or allele_record.get('type') == 'SO:1000008'):
                    self.logger.debug("%s genomicReferenceSequence",
                                      allele_record.get('alleleId'))

            if genomic_variant_sequence is not None:
                if len(genomic_variant_sequence) > 1000 and (
                        allele_record.get('type')
                        in ['SO:1000002', 'SO:1000008']):
                    self.logger.debug("%s genomicVariantSequence",
                                      allele_record.get('alleleId'))

            hgvs_nomenclature, hgvs_synonym = self.get_hgvs_nomenclature(
                allele_record.get('sequenceOfReferenceAccessionNumber'),
                allele_record.get('type'), allele_record.get('start'),
                allele_record.get('end'),
                genomic_reference_sequence, genomic_variant_sequence,
                allele_record.get('assembly'), chromosome_str)

            if (genomic_reference_sequence is not None and len(genomic_reference_sequence) > 30000) \
                    or (genomic_variant_sequence is not None and len(genomic_variant_sequence) > 30000):
                self.logger.debug(
                    "%s potentially has too long a sequence",
                    allele_record.get('alleleId'))

            # TODO: fix typo in MGI Submission for this variant so
            # that it doesn't list a 40K bp point mutation.
            if allele_record.get('alleleId') != 'MGI:6113870':

                variant_dataset = {
                    "hgvs_nomenclature": hgvs_nomenclature,
                    "genomicReferenceSequence": genomic_reference_sequence,
                    "genomicVariantSequence": genomic_variant_sequence,
                    "paddingLeft": padding_left,
                    "paddingRight": padding_right,
                    "alleleId": allele_record.get('alleleId'),
                    "dataProviders": data_providers,
                    "dateProduced": date_produced,
                    "loadKey": load_key,
                    "release": release,
                    "modGlobalCrossRefId": mod_global_cross_ref_id,
                    "dataProvider": self.data_provider,
                    "variantHGVSSynonym": hgvs_synonym
                }

                variant_genomic_location_dataset = {
                    "variantId": hgvs_nomenclature,
                    "assembly": allele_record.get('assembly'),
                    "chromosome": chromosome_str,
                    "start": allele_record.get('start'),
                    "end": allele_record.get('end'),
                    "uuid": str(uuid.uuid4()),
                    "dataProvider": self.data_provider
                }

                variant_so_term = {
                    "variantId": hgvs_nomenclature,
                    "soTermId": allele_record.get('type')
                }

                variant_so_terms.append(variant_so_term)
                variant_genomic_locations.append(
                    variant_genomic_location_dataset)
                variants.append(variant_dataset)

            if counter == batch_size:
                yield [
                    variants, variant_genomic_locations, variant_so_terms,
                    cross_references
                ]
                counter = 0
                variants = []
                variant_genomic_locations = []
                variant_so_terms = []
                cross_references = []

        if counter > 0:
            yield [
                variants, variant_genomic_locations, variant_so_terms,
                cross_references
            ]
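
As a worked illustration of what get_hgvs_nomenclature produces, here is a hedged, standalone restatement of its substitution branch (SO:1000002 / SO:1000008). The function name and the accession, assembly, and coordinate values are invented sample inputs, not output from a real submission.

# Standalone sketch of the substitution branch of get_hgvs_nomenclature.
def hgvs_for_substitution(refseq_id, start, ref, var, assembly, chromosome):
    # Builds "<accession>:g.<start><ref>><var>" plus a "(assembly)chr:pos" synonym.
    nomenclature = refseq_id.split(":")[1] + ':g.' + str(start) + ref + ">" + var
    synonym = '(' + assembly + ')' + chromosome + ':' + str(start) + ref + ">" + var
    return nomenclature, synonym


# Invented sample values:
print(hgvs_for_substitution("RefSeq:NC_005100.4", 1234, "A", "T", "Rnor_6.0", "1"))
# -> ('NC_005100.4:g.1234A>T', '(Rnor_6.0)1:1234A>T')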
Code example #6
File: do_etl.py  Project: sierra-moxon/agr_loader
class DOETL(ETL):
    """DO ETL."""

    logger = logging.getLogger(__name__)

    # Query templates which take params and will be processed later

    do_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

        //Create the DOTerm node and set properties. primaryKey is required.
        MERGE (doterm:DOTerm:Ontology {primaryKey:row.oid})
            SET doterm.name = row.name,
             doterm.nameKey = row.name_key,
             doterm.definition = row.definition,
             doterm.defLinks = apoc.convert.fromJsonList(row.defLinksProcessed),
             doterm.isObsolete = row.is_obsolete,
             doterm.subset = row.subset,
             doterm.doDisplayId = row.oid,
             doterm.doUrl = row.oUrl,
             doterm.doPrefix = "DOID",
             doterm.doId = row.oid,
             doterm.rgdLink = row.rgd_link,
             doterm.ratOnlyRgdLink = row.rat_only_rgd_link,
             doterm.humanOnlyRgdLink = row.human_only_rgd_link,
             doterm.mgiLink = row.mgi_link,
             doterm.zfinLink = row.zfin_link,
             doterm.flybaseLink = row.flybase_link,
             doterm.wormbaseLink = row.wormbase_link,
             doterm.sgdLink = row.sgd_link

            MERGE (doterm)-[ggcg:IS_A_PART_OF_CLOSURE]->(doterm)"""

    doterm_synonyms_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (d:DOTerm {primaryKey:row.primary_id})

            MERGE (syn:Synonym:Identifier {primaryKey:row.synonym})
                SET syn.name = row.synonym
            MERGE (d)-[aka2:ALSO_KNOWN_AS]->(syn) """

    doterm_isas_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (d1:DOTerm:Ontology {primaryKey:row.primary_id})
            MATCH (d2:DOTerm:Ontology {primaryKey:row.primary_id2})
            MERGE (d1)-[aka:IS_A]->(d2) """

    xrefs_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (o:DOTerm {primaryKey:row.oid}) """ + ETLHelper.get_cypher_xref_text()

    doterm_alt_ids_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (d:DOTerm {primaryKey:row.primary_id})

            MERGE (sec:SecondaryId:Identifier {primaryKey:row.secondary_id})

            MERGE (d)-[aka2:ALSO_KNOWN_AS]->(sec) """

    def __init__(self, config):
        """Initialise object."""
        super().__init__()
        self.data_type_config = config

    def _load_and_process_data(self):
        filepath = self.data_type_config.get_single_filepath()

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        generators = self.get_generators(filepath, batch_size)

        query_template_list = [
            [self.do_query_template, commit_size, "do_term_data.csv"],
            [self.doterm_isas_query_template, commit_size, "do_isas_data.csv"],
            [
                self.doterm_synonyms_query_template, commit_size,
                "do_synonyms_data.csv"
            ], [self.xrefs_query_template, commit_size, "do_xrefs_data.csv"],
            [
                self.doterm_alt_ids_query_template, commit_size,
                "do_alt_ids_data.csv"
            ]
        ]

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        self.error_messages("DO-?: ")

    def get_generators(self, filepath,
                       batch_size):  # noqa TODO: needs splitting up
        """Get Generators."""
        ont = OntologyFactory().create(filepath)
        parsed_line = ont.graph.copy().node

        do_term_list = []
        do_isas_list = []
        do_synonyms_list = []
        do_alt_ids_list = []
        xrefs = []
        counter = 0

        # Convert parsed obo term into a schema-friendly AGR dictionary.
        for key, line in parsed_line.items():
            counter = counter + 1
            node = ont.graph.node[key]
            if len(node) == 0:
                continue

            # Switch id to CURIE form and save the URI in "uri";
            # this may break downstream consumers that expect the URI.
            node["uri"] = node["id"]
            node["id"] = key

            syns = []

            def_links_unprocessed = []
            def_links_processed = []
            subset = []
            definition = ""
            is_obsolete = "false"
            ident = key

            if "meta" in node:
                if "synonyms" in node["meta"]:
                    syns = [s["val"] for s in node["meta"]["synonyms"]]
                    for synonym in syns:
                        do_synonym = {"primary_id": key, "synonym": synonym}
                        do_synonyms_list.append(do_synonym)

                if "basicPropertyValues" in node["meta"]:
                    alt_ids = [
                        s["val"] for s in node["meta"]["basicPropertyValues"]
                    ]
                    for alt_id in alt_ids:
                        if "DOID:" in alt_id:
                            secondary_id = {
                                "primary_id": key,
                                "secondary_id": alt_id
                            }
                            do_alt_ids_list.append(secondary_id)

                if "xrefs" in node["meta"]:
                    o_xrefs = node["meta"].get('xrefs')
                    self.ortho_xrefs(o_xrefs, ident, xrefs)

                if node["meta"].get('is_obsolete'):
                    is_obsolete = "true"
                elif node["meta"].get('deprecated'):
                    is_obsolete = "true"
                if "definition" in node["meta"]:
                    definition = node["meta"]["definition"]["val"]
                    def_links_unprocessed = node["meta"]["definition"]["xrefs"]
                if "subsets" in node["meta"]:
                    new_subset = node['meta'].get('subsets')
                    if isinstance(new_subset, (list, tuple)):
                        subset = new_subset
                    else:
                        if new_subset is not None:
                            subset.append(new_subset)
                if len(subset) > 1:
                    converted_subsets = []
                    for subset_str in subset:
                        if "#" in subset_str:
                            subset_str = subset_str.split("#")[-1]
                        converted_subsets.append(subset_str)
                    subset = converted_subsets

            all_parents = ont.parents(key)
            all_parents.append(key)

            # Improves performance when traversing relations
            all_parents_subont = ont.subontology(all_parents)
            isas_without_names = all_parents_subont.parents(
                key, relations=['subClassOf'])

            for item in isas_without_names:
                dictionary = {"primary_id": key, "primary_id2": item}

                do_isas_list.append(dictionary)

            def_links_processed = []
            def_links = ""
            if definition is None:
                definition = ""
            else:
                # Remove new lines that cause this to split across two lines in the file
                # definition = definition.replace('\n', ' ')

                # Remove any extra double space that might have been introduced by the last replace
                # definition = definition.replace('  ', ' ')

                if "\"" in definition:
                    split_definition = re.split(r'(?<!\\)"', definition)
                    if len(split_definition) > 1:
                        if len(split_definition
                               ) > 2 and "[" in split_definition[2].strip():
                            def_links = split_definition[2].strip()
                            def_links = def_links.rstrip("]").replace("[", "")
                            def_links_unprocessed.append(def_links)

            for def_link in def_links_unprocessed:
                def_link = def_link.replace("url:www", "http://www")
                def_link = def_link.replace("url:", "")
                def_link = def_link.replace("URL:", "")
                def_link = def_link.replace("\\:", ":")
                def_link = def_link.replace('\\', '')

                if "," in def_link:
                    def_link = def_link.split(",")
                    for link in def_link:
                        if link.strip().startswith("http"):
                            def_links_processed.append(link)
                else:
                    if def_link.strip().startswith("http"):
                        def_links_processed.append(def_link)

            # TODO: make this a generic section based on the resourceDescriptor.yaml file.
            # need to have MODs add disease pages to their yaml stanzas

            # NU: alt_ids = node.get('alt_id')
            # if alt_ids:
            #     if not isinstance(alt_ids, (list, tuple)):
            #         alt_ids = [alt_ids]
            # else:
            #     alt_ids = []

            # TODO: Need to add urls to resource descriptors for SGD and MGI.
            # NOTE: MGI had one, but it has 'MGI:' at the end of the url, which is not required here.
            dict_to_append = {
                'oid': node['id'],
                'name': node.get('label'),
                'name_key': node.get('label'),
                'definition': definition,
                'defLinksProcessed': def_links_processed,
                'is_obsolete': is_obsolete,
                'subset': subset,
                'oUrl': self.etlh.rdh2.return_url_from_key_value(
                    'DOID', node['id']),
                'rgd_link': self.etlh.rdh2.return_url_from_key_value(
                    'RGD', node['id'], 'disease/all'),
                'rat_only_rgd_link': self.etlh.rdh2.return_url_from_key_value(
                    'RGD', node['id'], 'disease/rat'),
                'human_only_rgd_link': self.etlh.rdh2.return_url_from_key_value(
                    'RGD', node['id'], 'disease/human'),
                'mgi_link': 'http://www.informatics.jax.org/disease/' + node['id'],
                'zfin_link': self.etlh.rdh2.return_url_from_key_value(
                    'ZFIN', node['id'], 'disease'),
                'flybase_link': self.etlh.rdh2.return_url_from_key_value(
                    'FB', node['id'], 'disease'),
                'wormbase_link': self.etlh.rdh2.return_url_from_key_value(
                    'WB', node['id'], 'disease'),
                'sgd_link': 'https://yeastgenome.org/disease/' + node['id']
            }

            do_term_list.append(dict_to_append)

            if counter == batch_size:
                yield [
                    do_term_list, do_isas_list, do_synonyms_list, xrefs,
                    do_alt_ids_list
                ]
                do_term_list = []
                do_isas_list = []
                do_synonyms_list = []
                do_alt_ids_list = []
                xrefs = []
                counter = 0

        if counter > 0:
            yield [
                do_term_list, do_isas_list, do_synonyms_list, xrefs,
                do_alt_ids_list
            ]
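
The definition-link cleanup inside get_generators is easier to follow in isolation. The sketch below restates the same rules under invented sample inputs: rewrite url:/URL: prefixes, unescape backslashes, split comma-joined entries, and keep only links that start with http.

# Compact, standalone restatement of the definition-link cleanup loop above.
def clean_def_links(def_links_unprocessed):
    """Normalise definition xref strings and keep only http(s) links."""
    processed = []
    for def_link in def_links_unprocessed:
        def_link = def_link.replace("url:www", "http://www")
        def_link = def_link.replace("url:", "").replace("URL:", "")
        def_link = def_link.replace("\\:", ":").replace("\\", "")
        # comma-separated entries carry several links in one string
        links = def_link.split(",") if "," in def_link else [def_link]
        processed.extend(link for link in links if link.strip().startswith("http"))
    return processed


# Invented sample inputs:
print(clean_def_links([
    "url:www.example.org/doid",
    "URL:https://meshb.nlm.nih.gov/record/ui?ui=D004194",
]))
# -> ['http://www.example.org/doid', 'https://meshb.nlm.nih.gov/record/ui?ui=D004194']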
Code example #7
class ExpressionAtlasETL(ETL):
    """Expression Atlas ETL."""

    logger = logging.getLogger(__name__)

    # Queries which do not take params and can be used as-is

    get_all_gene_primary_to_ensmbl_ids_query = """
        MATCH (g:Gene)-[:CROSS_REFERENCE]-(c:CrossReference)
        WHERE c.prefix = 'ENSEMBL'
        RETURN g.primaryKey, c.localId"""

    get_mod_gene_symbol_to_primary_ids_query = """
        MATCH (g:Gene)
        WHERE g.dataProvider = {parameter}
        RETURN g.primaryKey, g.symbol"""

    get_genes_with_expression_atlas_links_query = """
        MATCH (g:Gene)
        WHERE LOWER(g.primaryKey) IN {parameter}
        RETURN g.primaryKey, g.modLocalId"""

    # Query templates which take params and will be processed later

    add_expression_atlas_crossreferences_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

        MATCH (o:Gene)
        WHERE o.primaryKey = row.genePrimaryKey
        """ + ETLHelper.get_cypher_xref_text()

    def __init__(self, config):
        """Initialise object."""
        super().__init__()
        self.data_type_config = config

    def _load_and_process_data(self):
        thread_pool = []
        ensg_to_gene_primary_id_map = self._get_primary_gene_ids_to_ensembl_ids()

        for sub_type in self.data_type_config.get_sub_type_objects():
            process = multiprocessing.Process(
                target=self._process_sub_type,
                args=(sub_type, ensg_to_gene_primary_id_map))
            process.start()
            thread_pool.append(process)

        ETL.wait_for_threads(thread_pool)

    @staticmethod
    def _get_primary_gene_ids_to_ensembl_ids():
        return_set = Neo4jHelper.run_single_query(
            ExpressionAtlasETL.get_all_gene_primary_to_ensmbl_ids_query)
        return {
            record["c.localId"].lower(): record["g.primaryKey"]
            for record in return_set
        }

    @staticmethod
    def _get_mod_gene_symbol_to_primary_ids(data_provider):
        return_set = Neo4jHelper.run_single_parameter_query(
            ExpressionAtlasETL.get_mod_gene_symbol_to_primary_ids_query,
            data_provider)
        return {
            record["g.symbol"].lower(): record["g.primaryKey"]
            for record in return_set
        }

    # Returns only pages for genes that we have in the Alliance
    def _get_expression_atlas_gene_pages(self, sub_type, data_provider,
                                         ensg_to_gene_primary_id_map):
        filepath = sub_type.get_filepath()
        gene_symbol_to_primary_id_map = self._get_mod_gene_symbol_to_primary_ids(
            data_provider)

        expression_atlas_gene_pages = {}
        with open(filepath) as file_handle:
            doc = xmltodict.parse(file_handle.read())["urlset"]
            for value in doc.values():
                if isinstance(value, list):
                    for element in value:
                        url = element['loc']
                        expression_atlas_gene = url.split("/")[-1]
                        expression_atlas_gene = expression_atlas_gene.lower()
                        if expression_atlas_gene in ensg_to_gene_primary_id_map:
                            expression_atlas_gene_pages[
                                ensg_to_gene_primary_id_map[
                                    expression_atlas_gene].lower()] = url
                        elif expression_atlas_gene in gene_symbol_to_primary_id_map:
                            expression_atlas_gene_pages[
                                gene_symbol_to_primary_id_map[
                                    expression_atlas_gene].lower()] = url
                        else:
                            alliance_gene = data_provider + ":" + expression_atlas_gene
                            expression_atlas_gene_pages[
                                alliance_gene.lower()] = url

        return expression_atlas_gene_pages

    def _process_sub_type(self, sub_type, ensg_to_gene_primary_id_map):

        data_provider = sub_type.get_data_provider()
        expression_atlas_gene_pages = self._get_expression_atlas_gene_pages(
            sub_type, data_provider, ensg_to_gene_primary_id_map)

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        generators = self.get_generators(expression_atlas_gene_pages,
                                         data_provider, batch_size)

        query_template_list = [
            [
                self.add_expression_atlas_crossreferences_query_template,
                commit_size, "expression_atlas_" + data_provider + "_data.csv"
            ],
        ]

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        self.error_messages("ExpAtlas-{}: ".format(
            sub_type.get_data_provider()))

    def get_generators(self, expression_atlas_gene_pages, data_provider,
                       batch_size):
        """Get Generators."""
        return_set = Neo4jHelper.run_single_parameter_query(
            ExpressionAtlasETL.get_genes_with_expression_atlas_links_query,
            list(expression_atlas_gene_pages.keys()))

        counter = 0
        cross_reference_list = []
        for record in return_set:
            counter += 1
            cross_reference = ETLHelper.get_xref_dict(
                record["g.primaryKey"].split(":")[1], "ExpressionAtlas_gene",
                "gene/expression-atlas", "gene/expressionAtlas",
                record["g.modLocalId"],
                expression_atlas_gene_pages[record["g.primaryKey"].lower()],
                data_provider + ":" + record["g.modLocalId"] +
                "gene/expression-atlas")
            cross_reference["genePrimaryKey"] = record["g.primaryKey"]
            cross_reference_list.append(cross_reference)
            if counter > batch_size:
                yield [cross_reference_list]
                counter = 0
                cross_reference_list = []

        if counter > 0:
            yield [cross_reference_list]
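
The lookup in _get_expression_atlas_gene_pages resolves each sitemap URL to an Alliance gene key in three steps: try the ENSEMBL local id, then the MOD gene symbol, and finally fall back to a MOD-prefixed identifier. A hedged sketch of that resolution follows; the maps and URL are invented for illustration.

# Sketch of the three-way URL-to-gene resolution used above (invented inputs).
def page_key_for_url(url, ensg_map, symbol_map, data_provider):
    gene = url.split("/")[-1].lower()   # last path segment names the gene
    if gene in ensg_map:                # ENSEMBL local id -> Alliance primary key
        return ensg_map[gene].lower()
    if gene in symbol_map:              # MOD gene symbol -> Alliance primary key
        return symbol_map[gene].lower()
    return (data_provider + ":" + gene).lower()  # fall back to a MOD-local id


print(page_key_for_url("https://www.ebi.ac.uk/gxa/genes/ENSG00000141510",
                       {"ensg00000141510": "HGNC:11998"}, {}, "HGNC"))
# -> hgnc:11998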
Code example #8
class ExpressionETL(ETL):
    """Expression ETL"""

    logger = logging.getLogger(__name__)

    # Query templates which take params and will be processed later

    xrefs_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (o:BioEntityGeneExpressionJoin:Association {primaryKey:row.ei_uuid})
        """ + ETLHelper.get_cypher_xref_text()

    bio_entity_expression_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MERGE (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
                ON CREATE SET e.whereExpressedStatement = row.whereExpressedStatement"""

    bio_entity_gene_expression_join_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (assay:MMOTerm:Ontology {primaryKey:row.assay})
            MERGE (gej:BioEntityGeneExpressionJoin:Association {primaryKey:row.ei_uuid})
                ON CREATE SET gej.joinType = 'expression'

            MERGE (gej)-[geja:ASSAY]->(assay)"""

    bio_entity_gene_ao_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (g:Gene {primaryKey:row.geneId})
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
            MATCH (otast:Ontology {primaryKey:row.anatomicalStructureTermId}) 
                WHERE NOT 'UBERONTerm' in LABELS(otast)
                AND NOT 'FBCVTerm' in LABELS(otast)
            
            MERGE (g)-[gex:EXPRESSED_IN]->(e)
                    ON CREATE SET gex.uuid = row.ei_uuid
            MERGE (e)-[gejotast:ANATOMICAL_STRUCTURE]->(otast)"""

    add_pubs_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (gej:BioEntityGeneExpressionJoin:Association {primaryKey:row.ei_uuid})

            MERGE (pubf:Publication {primaryKey:row.pubPrimaryKey})
                ON CREATE SET pubf.pubModId = row.pubModId,
                 pubf.pubMedId = row.pubMedId,
                 pubf.pubModUrl = row.pubModUrl,
                 pubf.pubMedUrl = row.pubMedUrl

            CREATE (gej)-[gejpubf:EVIDENCE]->(pubf) """

    ao_expression_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            // GET PRIMARY DATA OBJECTS

            // LOAD NODES
            MATCH (g:Gene {primaryKey:row.geneId}) 
            MATCH (gej:BioEntityGeneExpressionJoin:Association {primaryKey:row.ei_uuid})  
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
                
            MERGE (g)-[ggej:ASSOCIATION]->(gej)     
            MERGE (e)-[egej:ASSOCIATION]->(gej)"""

    sgd_cc_expression_query_template = """

        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            // GET PRIMARY DATA OBJECTS

            // LOAD NODES
            MATCH (g:Gene {primaryKey:row.geneId})
            MATCH (assay:MMOTerm:Ontology {primaryKey:row.assay})
            MATCH (otcct:GOTerm:Ontology {primaryKey:row.cellularComponentTermId})

            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
            MATCH (gej:BioEntityGeneExpressionJoin:Association {primaryKey:row.ei_uuid})

                MERGE (g)-[gex:EXPRESSED_IN]->(e)
                    ON CREATE SET gex.uuid = row.ei_uuid
                MERGE (gej)-[geja:ASSAY]->(assay)

                MERGE (g)-[ggej:ASSOCIATION]->(gej)

                MERGE (e)-[egej:ASSOCIATION]->(gej)

                MERGE (e)-[eotcct:CELLULAR_COMPONENT]->(otcct)"""

    cc_expression_query_template = """

        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            // GET PRIMARY DATA OBJECTS

            // LOAD NODES
            MATCH (g:Gene {primaryKey:row.geneId})
            MATCH (assay:MMOTerm:Ontology {primaryKey:row.assay})
            MATCH (otcct:GOTerm:Ontology {primaryKey:row.cellularComponentTermId})

            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
            MATCH (gej:BioEntityGeneExpressionJoin:Association {primaryKey:row.ei_uuid})         
               
                MERGE (g)-[gex:EXPRESSED_IN]->(e)
                    ON CREATE SET gex.uuid = row.ei_uuid

                
                MERGE (gej)-[geja:ASSAY]->(assay)

                MERGE (g)-[ggej:ASSOCIATION]->(gej)
                    
                MERGE (e)-[egej:ASSOCIATION]->(gej)
                    
                MERGE (e)-[eotcct:CELLULAR_COMPONENT]->(otcct)"""

    ao_cc_expression_query_template = """
        
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            // GET PRIMARY DATA OBJECTS

            // LOAD NODES
            MATCH (g:Gene {primaryKey:row.geneId})
            MATCH (assay:MMOTerm:Ontology {primaryKey:row.assay})
            MATCH (otcct:GOTerm:Ontology {primaryKey:row.cellularComponentTermId})
            MATCH (otast:Ontology {primaryKey:row.anatomicalStructureTermId})                 
                WHERE NOT 'UBERONTerm' in LABELS(otast)
                    AND NOT 'FBCVTerm' in LABELS(otast)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
            MATCH (gej:BioEntityGeneExpressionJoin:Association {primaryKey:row.ei_uuid})   
                
            WITH g, e, gej, assay, otcct, otast, row WHERE NOT otast IS NULL AND NOT otcct IS NULL
                
   
                MERGE (g)-[gex:EXPRESSED_IN]->(e)
                    ON CREATE SET gex.uuid = row.ei_uuid
                            
                
                MERGE (gej)-[geja:ASSAY]->(assay)

                MERGE (g)-[ggej:ASSOCIATION]->(gej)
                    
                MERGE (e)-[egej:ASSOCIATION]->(gej)
                
                
                MERGE (e)-[eotcct:CELLULAR_COMPONENT]->(otcct)
                    
                MERGE (e)-[gejotast:ANATOMICAL_STRUCTURE]-(otast)"""

    eas_substructure_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (otasst:Ontology {primaryKey:row.anatomicalSubStructureTermId})
                WHERE NOT 'UBERONTerm' in LABELS(otasst)
                    AND NOT 'FBCVTerm' in LABELS(otasst)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})       
            MERGE (e)-[eotasst:ANATOMICAL_SUB_SUBSTRUCTURE]->(otasst) """

    eas_qualified_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (otastq:Ontology {primaryKey:row.anatomicalStructureQualifierTermId})
                WHERE NOT 'UBERONTerm' in LABELS(otastq)
                    AND NOT 'FBCVTerm' in LABELS(otastq)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
            MERGE (e)-[eotastq:ANATOMICAL_STRUCTURE_QUALIFIER]-(otastq) """

    eass_qualified_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (otasstq:Ontology {primaryKey:row.anatomicalSubStructureQualifierTermId})
                WHERE NOT 'UBERONTerm' in LABELS(otasstq)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
            
            MERGE (e)-[eotasstq:ANATOMICAL_SUB_STRUCTURE_QUALIFIER]-(otasstq) """

    ccq_expression_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (otcctq:Ontology {primaryKey:row.cellularComponentQualifierTermId})
                WHERE NOT 'UBERONTerm' in LABELS(otcctq)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
                      
            MERGE (e)-[eotcctq:CELLULAR_COMPONENT_QUALIFIER]-(otcctq) """

    stage_expression_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (ei:BioEntityGeneExpressionJoin {primaryKey:row.ei_uuid})
            MERGE (s:Stage {primaryKey:row.stageName})
                ON CREATE SET s.name = row.stageName
            MERGE (ei)-[eotcctq:DURING]-(s) """

    uberon_ao_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (ebe:ExpressionBioEntity {primaryKey:row.ebe_uuid})  
            MATCH (o:Ontology:UBERONTerm {primaryKey:row.aoUberonId})     
            MERGE (ebe)-[ebeo:ANATOMICAL_RIBBON_TERM]-(o) """

    uberon_stage_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (ei:BioEntityGeneExpressionJoin {primaryKey:row.ei_uuid})  
            MATCH (o:Ontology:UBERONTerm {primaryKey:row.uberonStageId})
            
            MERGE (ei)-[eio:STAGE_RIBBON_TERM]-(o) """

    uberon_ao_other_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (ebe:ExpressionBioEntity {primaryKey:row.ebe_uuid}) 
            MATCH (u:Ontology:UBERONTerm:Ontology {primaryKey:'UBERON:AnatomyOtherLocation'}) 
            MERGE (ebe)-[ebeu:ANATOMICAL_RIBBON_TERM]-(u) """

    uberon_stage_other_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (ei:BioEntityGeneExpressionJoin {primaryKey:row.ei_uuid})
            MATCH (u:Ontology:UBERONTerm:Ontology {primaryKey:'UBERON:PostEmbryonicPreAdult'})
            
            MERGE (ei)-[eiu:STAGE_RIBBON_TERM]-(u) """

    def __init__(self, config):
        super().__init__()
        self.data_type_config = config

    def _load_and_process_data(self):
        # add the 'other' nodes to support the expression ribbon components.
        self.add_other()

        thread_pool = []
        query_tracking_list = multiprocessing.Manager().list()
        for sub_type in self.data_type_config.get_sub_type_objects():
            process = multiprocessing.Process(target=self._process_sub_type,
                                              args=(sub_type,
                                                    query_tracking_list))
            process.start()
            thread_pool.append(process)

        ETL.wait_for_threads(thread_pool)

        queries = []
        for item in query_tracking_list:
            queries.append(item)

        Neo4jTransactor.execute_query_batch(queries)

    def _process_sub_type(self, sub_type, query_tracking_list):

        self.logger.info("Loading Expression Data: %s",
                         sub_type.get_data_provider())
        data_file = sub_type.get_filepath()
        data_provider = sub_type.get_data_provider()

        if data_file is None:
            self.logger.warning("No Data found for %s skipping",
                                sub_type.get_data_provider())
            return

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in this format (template, param1, param2); other entries will be ignored
        query_template_list = [
            [
                self.bio_entity_expression_query_template, commit_size,
                "expression_entities_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.bio_entity_gene_ao_query_template, commit_size,
                "expression_gene_ao_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.bio_entity_gene_expression_join_query_template,
                commit_size, "expression_entity_joins_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                self.ao_expression_query_template, commit_size,
                "expression_ao_expression_" + sub_type.get_data_provider() +
                ".csv"
            ]
        ]

        if data_provider == 'SGD':
            query_template_list += [[
                self.sgd_cc_expression_query_template, commit_size,
                "expression_SGD_cc_expression_" +
                sub_type.get_data_provider() + ".csv"
            ]]
        else:
            query_template_list += [[
                self.cc_expression_query_template, commit_size,
                "expression_cc_expression_" + sub_type.get_data_provider() +
                ".csv"
            ]]

        query_template_list += [
            [
                self.ao_cc_expression_query_template, commit_size,
                "expression_ao_cc_expression_" + sub_type.get_data_provider() +
                ".csv"
            ],
            [
                self.eas_qualified_query_template, commit_size,
                "expression_eas_qualified_" + sub_type.get_data_provider() +
                ".csv"
            ],
            [
                self.eas_substructure_query_template, commit_size,
                "expression_eas_substructure_" + sub_type.get_data_provider() +
                ".csv"
            ],
            [
                self.eass_qualified_query_template, commit_size,
                "expression_eass_qualified_" + sub_type.get_data_provider() +
                ".csv"
            ],
            [
                self.ccq_expression_query_template, commit_size,
                "expression_ccq_expression_" + sub_type.get_data_provider() +
                ".csv"
            ],
            [
                self.stage_expression_query_template, commit_size,
                "expression_stage_expression_" + sub_type.get_data_provider() +
                ".csv"
            ],
            [
                self.uberon_stage_query_template, commit_size,
                "expression_uberon_stage_" + sub_type.get_data_provider() +
                ".csv"
            ],
            [
                self.uberon_ao_query_template, commit_size,
                "expression_uberon_ao_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.uberon_ao_other_query_template, commit_size,
                "expression_uberon_ao_other_" + sub_type.get_data_provider() +
                ".csv"
            ],
            [
                self.uberon_stage_other_query_template, commit_size,
                "expression_uberon_stage_other_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                self.xrefs_query_template, commit_size,
                "expression_cross_references_" + sub_type.get_data_provider() +
                ".csv"
            ],
            [
                self.add_pubs_query_template, commit_size,
                "expression_add_pubs_" + sub_type.get_data_provider() + ".csv"
            ]
        ]

        # Obtain the generator
        generators = self.get_generators(data_file, batch_size)

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)

        for item in query_and_file_list:
            query_tracking_list.append(item)

        self.logger.info("Finished Loading Expression Data: %s",
                         sub_type.get_data_provider())

    def add_other(self):
        """Add Other"""

        self.logger.debug("made it to the addOther statement")

        add_other_query = """

            MERGE(other:UBERONTerm:Ontology {primaryKey:'UBERON:AnatomyOtherLocation'})
                ON CREATE SET other.name = 'other'
            MERGE(otherstage:UBERONTerm:Ontology {primaryKey:'UBERON:PostEmbryonicPreAdult'})
                ON CREATE SET otherstage.name = 'post embryonic, pre-adult'
            MERGE(othergo:GOTerm:Ontology {primaryKey:'GO:otherLocations'})
                ON CREATE SET othergo.name = 'other locations'
                ON CREATE SET othergo.definition = 'temporary node to group expression entities up to ribbon terms'
                ON CREATE SET othergo.type = 'other'
                ON CREATE SET othergo.subset = 'goslim_agr' """

        Neo4jHelper.run_single_query(add_other_query)

    def get_generators(self, expression_file, batch_size):
        """Get Generators"""

        self.logger.debug("made it to the expression generator")

        counter = 0

        cross_references = []
        bio_entities = []
        bio_join_entities = []
        bio_entity_gene_aos = []
        pubs = []
        ao_expressions = []
        cc_expressions = []
        ao_qualifiers = []
        ao_substructures = []
        ao_ss_qualifiers = []
        cc_qualifiers = []
        ao_cc_expressions = []
        stage_list = []
        stage_uberon_data = []
        uberon_ao_data = []
        uberon_ao_other_data = []
        uberon_stage_other_data = []

        self.logger.debug("streaming json data from %s ...", expression_file)
        with codecs.open(expression_file, 'r', 'utf-8') as file_handle:
            for xpat in ijson.items(file_handle, 'data.item'):
                counter = counter + 1

                pub_med_url = None
                pub_mod_url = None
                pub_med_id = ""
                publication_mod_id = ""
                stage_term_id = ""
                stage_name = ""
                stage_uberon_term_id = ""
                gene_id = xpat.get('geneId')

                if self.test_object.using_test_data() is True:
                    is_it_test_entry = self.test_object.check_for_test_id_entry(
                        gene_id)
                    if is_it_test_entry is False:
                        counter = counter - 1
                        continue

                evidence = xpat.get('evidence')

                if 'publicationId' in evidence:
                    if evidence.get('publicationId').startswith('PMID:'):
                        pub_med_id = evidence.get('publicationId')
                        local_pub_med_id = pub_med_id.split(":")[1]
                        pub_med_prefix = pub_med_id.split(":")[0]
                        pub_med_url = ETLHelper.get_no_page_complete_url(
                            local_pub_med_id, self.xref_url_map,
                            pub_med_prefix, gene_id)
                        if pub_med_id is None:
                            pub_med_id = ""

                        if 'crossReference' in evidence:
                            pub_xref = evidence.get('crossReference')
                            publication_mod_id = pub_xref.get('id')

                            if publication_mod_id is not None:
                                pub_mod_url = ETLHelper.get_expression_pub_annotation_xref( \
                                        publication_mod_id)

                    else:
                        publication_mod_id = evidence['publicationId']
                        if publication_mod_id is not None:
                            pub_mod_url = ETLHelper.get_expression_pub_annotation_xref(\
                                              publication_mod_id)

                    if publication_mod_id is None:
                        publication_mod_id = ""

                assay = xpat.get('assay')

                if 'whereExpressed' in xpat:

                    where_expressed = xpat.get('whereExpressed')
                    cellular_component_qualifier_term_id = \
                           where_expressed.get('cellularComponentQualifierTermId')
                    cellular_component_term_id = where_expressed.get(
                        'cellularComponentTermId')
                    anatomical_structure_term_id = where_expressed.get(
                        'anatomicalStructureTermId')
                    anatomical_structure_qualifier_term_id = where_expressed.get(
                        'anatomicalStructureQualifierTermId')
                    anatomical_sub_structure_term_id = \
                            where_expressed.get('anatomicalSubStructureTermId')
                    anatomical_sub_structure_qualifier_term_id = where_expressed.get(
                        'anatomicalSubStructureQualifierTermId')
                    where_expressed_statement = where_expressed.get(
                        'whereExpressedStatement')

                    when_expressed_stage = xpat.get('whenExpressed')

                    # whenExpressed may be absent; guard before membership tests.
                    if when_expressed_stage is not None:
                        if 'stageTermId' in when_expressed_stage:
                            stage_term_id = when_expressed_stage.get(
                                'stageTermId')
                        if 'stageName' in when_expressed_stage:
                            stage_name = when_expressed_stage.get('stageName')

                    # TODO: making unique BioEntityGeneExpressionJoin nodes
                    # and ExpressionBioEntity nodes is tedious.
                    # TODO: Lets get the DQMs to fix this.
                    expression_unique_key = gene_id + assay + stage_name
                    expression_entity_unique_key = ""

                    if anatomical_structure_term_id is not None:
                        expression_unique_key += anatomical_structure_term_id
                        expression_entity_unique_key = anatomical_structure_term_id

                        if anatomical_structure_qualifier_term_id is not None:
                            expression_unique_key += anatomical_structure_qualifier_term_id
                            expression_entity_unique_key += anatomical_structure_qualifier_term_id

                    if cellular_component_term_id is not None:
                        expression_unique_key += cellular_component_term_id
                        expression_entity_unique_key += cellular_component_term_id

                        if cellular_component_qualifier_term_id is not None:
                            expression_unique_key += cellular_component_qualifier_term_id
                            expression_entity_unique_key += cellular_component_qualifier_term_id

                    if anatomical_sub_structure_term_id is not None:
                        expression_unique_key += anatomical_sub_structure_term_id

                        if anatomical_sub_structure_qualifier_term_id is not None:
                            expression_unique_key += anatomical_sub_structure_qualifier_term_id
                            expression_entity_unique_key \
                                    += anatomical_sub_structure_qualifier_term_id

                    # Guard against a missing statement before concatenating;
                    # otherwise the += below raises a TypeError on None.
                    if where_expressed_statement is None:
                        where_expressed_statement = ""

                    expression_entity_unique_key += where_expressed_statement
                    expression_unique_key += where_expressed_statement
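                    # Illustration with hypothetical values: for gene "MOD:G1",
                    # assay "MMO:0000655", stage "adult", structure "UBERON:0000955"
                    # and statement "brain", the keys built above would be:
                    #   expression_unique_key        = "MOD:G1MMO:0000655adultUBERON:0000955brain"
                    #   expression_entity_unique_key = "UBERON:0000955brain"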

                    if where_expressed.get(
                            'anatomicalStructureUberonSlimTermIds'
                    ) is not None:
                        for uberon_structure_term_object in \
                                where_expressed.get('anatomicalStructureUberonSlimTermIds'):
                            structure_uberon_term_id = \
                                    uberon_structure_term_object.get('uberonTerm')
                            if structure_uberon_term_id is not None \
                                    and structure_uberon_term_id != 'Other':
                                structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key,
                                    "aoUberonId": structure_uberon_term_id
                                }
                                uberon_ao_data.append(structure_uberon_term)
                            elif structure_uberon_term_id is not None \
                                    and structure_uberon_term_id == 'Other':
                                other_structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key
                                }
                                uberon_ao_other_data.append(
                                    other_structure_uberon_term)

                    if where_expressed.get(
                            'anatomicalSubStructureUberonSlimTermIds'
                    ) is not None:
                        for uberon_sub_structure_term_object in \
                                where_expressed.get('anatomicalSubStructureUberonSlimTermIds'):
                            sub_structure_uberon_term_id = \
                                    uberon_sub_structure_term_object.get('uberonTerm')
                            if sub_structure_uberon_term_id is not None \
                                    and sub_structure_uberon_term_id != 'Other':
                                sub_structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key,
                                    "aoUberonId": sub_structure_uberon_term_id
                                }
                                uberon_ao_data.append(
                                    sub_structure_uberon_term)
                            elif sub_structure_uberon_term_id is not None \
                                    and sub_structure_uberon_term_id == 'Other':
                                other_structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key
                                }
                                uberon_ao_other_data.append(
                                    other_structure_uberon_term)

                    if cellular_component_term_id is None:
                        cellular_component_term_id = ""

                    if when_expressed_stage.get(
                            'stageUberonSlimTerm') is not None:
                        stage_uberon_term_object = when_expressed_stage.get(
                            'stageUberonSlimTerm')
                        stage_uberon_term_id = stage_uberon_term_object.get(
                            "uberonTerm")
                        if stage_uberon_term_id is not None \
                                and stage_uberon_term_id != "post embryonic, pre-adult":
                            stage_uberon = {
                                "uberonStageId": stage_uberon_term_id,
                                "ei_uuid": expression_unique_key
                            }
                            stage_uberon_data.append(stage_uberon)
                        elif stage_uberon_term_id == "post embryonic, pre-adult":
                            stage_uberon_other = {
                                "ei_uuid": expression_unique_key
                            }
                            uberon_stage_other_data.append(stage_uberon_other)

                    if stage_term_id is None or stage_name == 'N/A':
                        stage_term_id = ""
                        stage_name = ""
                        stage_uberon_term_id = ""

                    if stage_name is not None:
                        stage = {
                            "stageTermId": stage_term_id,
                            "stageName": stage_name,
                            "ei_uuid": expression_unique_key
                        }
                        stage_list.append(stage)
                    else:
                        stage_uberon_term_id = ""

                    if 'crossReference' in xpat:
                        cross_ref = xpat.get('crossReference')
                        cross_ref_id = cross_ref.get('id')
                        local_cross_ref_id = cross_ref_id.split(":")[1]
                        prefix = cross_ref.get('id').split(":")[0]
                        pages = cross_ref.get('pages')

                        # some pages collections have 0 elements
                        if pages is not None and len(pages) > 0:
                            for page in pages:
                                if page == 'gene/expression/annotation/detail':
                                    mod_global_cross_ref_id = ETLHelper.get_page_complete_url(\
                                            local_cross_ref_id,
                                            self.xref_url_map,
                                            prefix, page)

                                    xref = ETLHelper.get_xref_dict(
                                        local_cross_ref_id, prefix, page, page,
                                        cross_ref_id, mod_global_cross_ref_id,
                                        cross_ref_id + page)
                                    xref['ei_uuid'] = expression_unique_key
                                    cross_references.append(xref)

                    bio_entity = {
                        "ebe_uuid": expression_entity_unique_key,
                        "whereExpressedStatement": where_expressed_statement
                    }
                    bio_entities.append(bio_entity)

                    bio_join_entity = {
                        "ei_uuid": expression_unique_key,
                        "assay": assay
                    }
                    bio_join_entities.append(bio_join_entity)

                    bio_entity_gene_ao = {
                        "geneId": gene_id,
                        "ebe_uuid": expression_entity_unique_key,
                        "anatomicalStructureTermId":
                        anatomical_structure_term_id,
                        "ei_uuid": expression_unique_key
                    }
                    bio_entity_gene_aos.append(bio_entity_gene_ao)

                    pub = {
                        "ei_uuid": expression_unique_key,
                        "pubPrimaryKey": pub_med_id + publication_mod_id,
                        "pubMedId": pub_med_id,
                        "pubMedUrl": pub_med_url,
                        "pubModId": publication_mod_id,
                        "pubModUrl": pub_mod_url
                    }
                    pubs.append(pub)

                    ao_expression = {
                        "geneId": gene_id,
                        "whenExpressedStage": when_expressed_stage,
                        "pubMedId": pub_med_id,
                        "pubMedUrl": pub_med_url,
                        "pubModId": publication_mod_id,
                        "pubModUrl": pub_mod_url,
                        "pubPrimaryKey": pub_med_id + publication_mod_id,
                        "uuid": str(uuid.uuid4()),
                        "assay": assay,
                        "anatomicalStructureTermId":
                        anatomical_structure_term_id,
                        "whereExpressedStatement": where_expressed_statement,
                        "ei_uuid": expression_unique_key,
                        "ebe_uuid": expression_entity_unique_key
                    }
                    ao_expressions.append(ao_expression)

                    if cellular_component_qualifier_term_id is not None:

                        cc_qualifier = {
                            "ebe_uuid":
                            expression_entity_unique_key,
                            "cellularComponentQualifierTermId":
                            cellular_component_qualifier_term_id
                        }
                        cc_qualifiers.append(cc_qualifier)

                    if anatomical_structure_term_id is None:
                        anatomical_structure_term_id = ""

                        cc_expression = {
                            "geneId": gene_id,
                            "whenExpressedStage": when_expressed_stage,
                            "pubMedId": pub_med_id,
                            "pubMedUrl": pub_med_url,
                            "pubModId": publication_mod_id,
                            "pubModUrl": pub_mod_url,
                            "pubPrimaryKey": pub_med_id + publication_mod_id,
                            "assay": assay,
                            "whereExpressedStatement":
                            where_expressed_statement,
                            "cellularComponentTermId":
                            cellular_component_term_id,
                            "ei_uuid": expression_unique_key,
                            "ebe_uuid": expression_entity_unique_key
                        }
                        cc_expressions.append(cc_expression)

                    if anatomical_structure_qualifier_term_id is not None:
                        ao_qualifier = {
                            "ebe_uuid":
                            expression_entity_unique_key,
                            "anatomicalStructureQualifierTermId":
                            anatomical_structure_qualifier_term_id
                        }

                        ao_qualifiers.append(ao_qualifier)

                    if anatomical_sub_structure_term_id is not None:
                        ao_substructure = {
                            "ebe_uuid":
                            expression_entity_unique_key,
                            "anatomicalSubStructureTermId":
                            anatomical_sub_structure_term_id
                        }

                        ao_substructures.append(ao_substructure)

                    if anatomical_sub_structure_qualifier_term_id is not None:
                        ao_ss_qualifier = {
                            "ebe_uuid":
                            expression_entity_unique_key,
                            "anatomicalSubStructureQualifierTermId":
                            anatomical_sub_structure_qualifier_term_id
                        }

                        ao_ss_qualifiers.append(ao_ss_qualifier)

                    if where_expressed_statement is None:
                        where_expressed_statement = ""

                    if anatomical_structure_term_id is not None \
                            and anatomical_structure_term_id != "" \
                            and cellular_component_term_id is not None \
                            and cellular_component_term_id != "":

                        ao_cc_expression = {
                            "geneId": gene_id,
                            "whenExpressedStage": when_expressed_stage,
                            "pubMedId": pub_med_id,
                            "pubMedUrl": pub_med_url,
                            "pubModId": publication_mod_id,
                            "pubModUrl": pub_mod_url,
                            "pubPrimaryKey": pub_med_id + publication_mod_id,
                            "uuid": str(uuid.uuid4()),
                            "stageTermId": stage_term_id,
                            "stageName": stage_name,
                            "stageUberonTermId": stage_uberon_term_id,
                            "assay": assay,
                            "cellularComponentTermId":
                            cellular_component_term_id,
                            "anatomicalStructureTermId":
                            anatomical_structure_term_id,
                            "whereExpressedStatement":
                            where_expressed_statement,
                            "ei_uuid": expression_unique_key,
                            "ebe_uuid": expression_entity_unique_key
                        }

                        ao_cc_expressions.append(ao_cc_expression)

                if counter == batch_size:
                    yield [
                        bio_entities, bio_entity_gene_aos, bio_join_entities,
                        ao_expressions, cc_expressions, ao_cc_expressions,
                        ao_qualifiers, ao_substructures, ao_ss_qualifiers,
                        cc_qualifiers, stage_list, stage_uberon_data,
                        uberon_ao_data, uberon_ao_other_data,
                        uberon_stage_other_data, cross_references, pubs
                    ]
                    bio_entities = []
                    bio_join_entities = []
                    ao_expressions = []
                    cc_expressions = []
                    ao_qualifiers = []
                    ao_substructures = []
                    ao_ss_qualifiers = []
                    cc_qualifiers = []
                    ao_cc_expressions = []
                    stage_list = []
                    uberon_stage_other_data = []
                    stage_uberon_data = []
                    uberon_ao_other_data = []
                    uberon_ao_data = []
                    cross_references = []
                    bio_entity_gene_aos = []
                    pubs = []
                    counter = 0

            if counter > 0:
                yield [
                    bio_entities, bio_entity_gene_aos, bio_join_entities,
                    ao_expressions, cc_expressions, ao_cc_expressions,
                    ao_qualifiers, ao_substructures, ao_ss_qualifiers,
                    cc_qualifiers, stage_list, stage_uberon_data,
                    uberon_ao_data, uberon_ao_other_data,
                    uberon_stage_other_data, cross_references, pubs
                ]
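
The generator above follows a batch-and-flush pattern shared by all of these ETL classes: records accumulate in per-type lists, a full batch is yielded once counter reaches batch_size, and whatever remains after the loop is flushed by the final "if counter > 0" block so trailing records are not lost. A minimal sketch of the same pattern, with an illustrative record source (the names here are not from the original code):

def batched(records, batch_size):
    """Yield lists of records, flushing a batch every batch_size items."""
    batch = []
    counter = 0
    for record in records:
        counter += 1
        batch.append(record)
        if counter == batch_size:
            yield batch
            batch = []
            counter = 0
    # Flush the partial final batch, mirroring the trailing yield above.
    if counter > 0:
        yield batch
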
コード例 #9
class AlleleETL(ETL):
    """Allele ETL."""

    allele_construct_no_gene_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (c:Construct {primaryKey: row.constructId})
            MATCH (s:Species {primaryKey: row.taxonId})
            //Create the Allele node and set properties. primaryKey is required.
            MERGE (o:Allele:Feature {primaryKey:row.primaryId})
                ON CREATE SET o.symbol = row.symbol,
                 o.taxonId = row.taxonId,
                 o.dateProduced = row.dateProduced,
                 o.release = row.release,
                 o.localId = row.localId,
                 o.globalId = row.globalId,
                 o.uuid = row.uuid,
                 o.symbolText = row.symbolText,
                 o.modCrossRefCompleteUrl = row.modGlobalCrossRefId,
                 o.dataProviders = row.dataProviders,
                 o.dataProvider = row.dataProvider,
                 o.symbolWithSpecies = row.symbolWithSpecies,
                 o.symbolTextWithSpecies = row.symbolTextWithSpecies,
                 o.description = row.alleleDescription
            MERGE (o)-[:FROM_SPECIES]-(s)
            MERGE (o)-[:CONTAINS]-(c) """

    allele_construct_gene_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (g:Gene {primaryKey: row.geneId})
            MATCH (c:Construct {primaryKey: row.constructId})
            MATCH (s:Species {primaryKey: row.taxonId})
            //Create the Allele node and set properties. primaryKey is required.
            MERGE (o:Allele:Feature {primaryKey:row.primaryId})
                ON CREATE SET o.symbol = row.symbol,
                 o.taxonId = row.taxonId,
                 o.dateProduced = row.dateProduced,
                 o.release = row.release,
                 o.localId = row.localId,
                 o.globalId = row.globalId,
                 o.uuid = row.uuid,
                 o.symbolText = row.symbolText,
                 o.modCrossRefCompleteUrl = row.modGlobalCrossRefId,
                 o.dataProviders = row.dataProviders,
                 o.dataProvider = row.dataProvider,
                 o.symbolWithSpecies = row.symbolWithSpecies,
                 o.symbolTextWithSpecies = row.symbolTextWithSpecies,
                 o.description = row.alleleDescription
            MERGE (o)-[:FROM_SPECIES]-(s)
            MERGE (o)-[:IS_ALLELE_OF]-(g)
            MERGE (o)-[:CONTAINS]-(c) """

    allele_gene_no_construct_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (g:Gene {primaryKey: row.geneId})
            MATCH (s:Species {primaryKey: row.taxonId})
            //Create the Allele node and set properties. primaryKey is required.
            MERGE (o:Allele:Feature {primaryKey:row.primaryId})
                ON CREATE SET o.symbol = row.symbol,
                 o.taxonId = row.taxonId,
                 o.dateProduced = row.dateProduced,
                 o.release = row.release,
                 o.localId = row.localId,
                 o.globalId = row.globalId,
                 o.uuid = row.uuid,
                 o.symbolText = row.symbolText,
                 o.modCrossRefCompleteUrl = row.modGlobalCrossRefId,
                 o.dataProviders = row.dataProviders,
                 o.dataProvider = row.dataProvider,
                 o.symbolWithSpecies = row.symbolWithSpecies,
                 o.symbolTextWithSpecies = row.symbolTextWithSpecies,
                 o.description = row.alleleDescription
            MERGE (o)-[:FROM_SPECIES]-(s)
            MERGE (o)-[:IS_ALLELE_OF]->(g) """

    allele_no_gene_no_construct_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (s:Species {primaryKey: row.taxonId})
            //Create the Allele node and set properties. primaryKey is required.
            MERGE (o:Allele:Feature {primaryKey:row.primaryId})
                ON CREATE SET o.symbol = row.symbol,
                 o.taxonId = row.taxonId,
                 o.dateProduced = row.dateProduced,
                 o.release = row.release,
                 o.localId = row.localId,
                 o.globalId = row.globalId,
                 o.uuid = row.uuid,
                 o.symbolText = row.symbolText,
                 o.modCrossRefCompleteUrl = row.modGlobalCrossRefId,
                 o.dataProviders = row.dataProviders,
                 o.dataProvider = row.dataProvider,
                 o.symbolWithSpecies = row.symbolWithSpecies,
                 o.symbolTextWithSpecies = row.symbolTextWithSpecies,
                 o.description = row.alleleDescription
            MERGE (o)-[:FROM_SPECIES]-(s)
    """

    allele_secondaryids_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (f:Allele:Feature {primaryKey:row.data_id})
            MERGE (second:SecondaryId {primaryKey:row.secondary_id})
                SET second.name = row.secondary_id
            MERGE (f)-[aka1:ALSO_KNOWN_AS]->(second) """

    allele_synonyms_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (a:Allele:Feature {primaryKey:row.data_id})
            MERGE(syn:Synonym {primaryKey:row.synonym})
                SET syn.name = row.synonym
            MERGE (a)-[aka2:ALSO_KNOWN_AS]->(syn) """

    allele_xrefs_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (o:Allele {primaryKey:row.dataId}) """ + ETLHelper.get_cypher_xref_text()

    def __init__(self, config):
        super().__init__()
        self.data_type_config = config

    def _load_and_process_data(self):
        thread_pool = []

        for sub_type in self.data_type_config.get_sub_type_objects():
            p = multiprocessing.Process(target=self._process_sub_type, args=(sub_type,))
            p.start()
            thread_pool.append(p)

        ETL.wait_for_threads(thread_pool)

    def _process_sub_type(self, sub_type):

        logger.info("Loading Allele Data: %s" % sub_type.get_data_provider())
        filepath = sub_type.get_filepath()
        logger.info(filepath)
        data = JSONFile().get_data(filepath)
        logger.info("Finished Loading Allele Data: %s" % sub_type.get_data_provider())

        if data is None:
            logger.warn("No Data found for %s skipping" % sub_type.get_data_provider())
            return

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in the format (template, param1, param2); other entries will be ignored.
        query_list = [
            [AlleleETL.allele_gene_no_construct_query_template, commit_size,
             "allele_gene_no_construct_data_" + sub_type.get_data_provider() + ".csv"],
            [AlleleETL.allele_construct_gene_query_template, commit_size,
             "allele_construct_gene_data_" + sub_type.get_data_provider() + ".csv"],
            [AlleleETL.allele_construct_no_gene_query_template, commit_size,
             "allele_construct_no_gene_data_" + sub_type.get_data_provider() + ".csv"],
            [AlleleETL.allele_no_gene_no_construct_query_template, commit_size,
             "allele_no_gene_no_construct_data_" + sub_type.get_data_provider() + ".csv"],
            [AlleleETL.allele_secondaryids_template, commit_size,
             "allele_secondaryids_" + sub_type.get_data_provider() + ".csv"],
            [AlleleETL.allele_synonyms_template, commit_size,
             "allele_synonyms_" + sub_type.get_data_provider() + ".csv"],
            [AlleleETL.allele_xrefs_template, commit_size,
             "allele_xrefs_" + sub_type.get_data_provider() + ".csv"],
        ]

        # Obtain the generator
        generators = self.get_generators(data, batch_size)

        query_and_file_list = self.process_query_params(query_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)

    def get_generators(self, allele_data, batch_size):

        data_providers = []
        release = ""
        alleles_no_construct_no_gene = []
        alleles_construct_gene = []
        alleles_no_construct = []
        alleles_no_gene = []
        allele_synonyms = []
        allele_secondary_ids = []
        cross_reference_list = []

        counter = 0
        date_produced = allele_data['metaData']['dateProduced']

        data_provider_object = allele_data['metaData']['dataProvider']

        data_provider_cross_ref = data_provider_object.get('crossReference')
        data_provider = data_provider_cross_ref.get('id')
        data_provider_pages = data_provider_cross_ref.get('pages')
        data_provider_cross_ref_set = []

        load_key = date_produced + data_provider + "_ALLELE"

        # TODO: get SGD to fix their files.

        if data_provider_pages is not None:
            for data_provider_page in data_provider_pages:
                cross_ref_complete_url = ETLHelper.get_page_complete_url(
                    data_provider, self.xref_url_map, data_provider, data_provider_page)

                data_provider_cross_ref_set.append(
                    ETLHelper.get_xref_dict(data_provider, data_provider, data_provider_page,
                                            data_provider_page, data_provider,
                                            cross_ref_complete_url,
                                            data_provider + data_provider_page))

                data_providers.append(data_provider)
                logger.info("data provider: %s", data_provider)

        if 'release' in allele_data['metaData']:
            release = allele_data['metaData']['release']

        for allele_record in allele_data['data']:
            counter = counter + 1
            global_id = allele_record['primaryId']
            # fixing parsing error on this end while MGI fixes on their end.
            if global_id == 'MGI:3826848':
                description = allele_record.get('description')[:-2]
            else:
                description = allele_record.get('description')

            local_id = global_id.split(":")[1]
            mod_global_cross_ref_id = ""

            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(global_id)
                if is_it_test_entry is False:
                    counter = counter - 1
                    continue

            gene_id = ''
            construct_id = ''
            association_type = ''

            short_species_abbreviation = ETLHelper.get_short_species_abbreviation(allele_record.get('taxonId'))
            symbol_text = TextProcessingHelper.cleanhtml(allele_record.get('symbol'))

            if allele_record.get('alleleObjectRelations') is not None:
                for relation in allele_record.get('alleleObjectRelations'):
                    association_type = relation.get('objectRelation').get('associationType')
                    if relation.get('objectRelation').get('gene') is not None:
                        gene_id = relation.get('objectRelation').get('gene')
                    if relation.get('objectRelation').get('construct') is not None:
                        construct_id = relation.get('objectRelation').get('construct')

                    if gene_id != '' and construct_id != '':
                        allele_construct_gene_dataset = {
                            "symbol": allele_record.get('symbol'),
                            "geneId": gene_id,
                            "primaryId": allele_record.get('primaryId'),
                            "globalId": global_id,
                            "localId": local_id,
                            "taxonId": allele_record.get('taxonId'),
                            "dataProviders": data_providers,
                            "dateProduced": date_produced,
                            "loadKey": loadKey,
                            "release": release,
                            "modGlobalCrossRefId": mod_global_cross_ref_id,
                            "uuid": str(uuid.uuid4()),
                            "dataProvider": data_provider,
                            "symbolWithSpecies": allele_record.get('symbol') + " (" + short_species_abbreviation + ")",
                            "symbolTextWithSpecies": symbol_text + " (" + short_species_abbreviation + ")",
                            "symbolText": symbol_text,
                            "alleleDescription": description,
                            "constructId": construct_id,
                            "associationType": association_type
                        }
                        alleles_construct_gene.append(allele_construct_gene_dataset)

                    elif construct_id != '' and gene_id == '':
                        allele_construct_no_gene_dataset = {
                            "symbol": allele_record.get('symbol'),
                            "primaryId": allele_record.get('primaryId'),
                            "globalId": global_id,
                            "localId": local_id,
                            "taxonId": allele_record.get('taxonId'),
                            "dataProviders": data_providers,
                            "dateProduced": date_produced,
                            "loadKey": loadKey,
                            "release": release,
                            "modGlobalCrossRefId": mod_global_cross_ref_id,
                            "uuid": str(uuid.uuid4()),
                            "dataProvider": data_provider,
                            "symbolWithSpecies": allele_record.get('symbol') + " (" + short_species_abbreviation + ")",
                            "symbolTextWithSpecies": symbol_text + " (" + short_species_abbreviation + ")",
                            "symbolText": symbol_text,
                            "alleleDescription": description,
                            "constructId": construct_id,
                            "associationType": association_type
                        }

                        alleles_no_gene.append(allele_construct_no_gene_dataset)

                    elif gene_id != '' and construct_id == '':
                        allele_gene_no_construct_dataset = {
                            "symbol": allele_record.get('symbol'),
                            "geneId": gene_id,
                            "primaryId": allele_record.get('primaryId'),
                            "globalId": global_id,
                            "localId": local_id,
                            "taxonId": allele_record.get('taxonId'),
                            "dataProviders": data_providers,
                            "dateProduced": date_produced,
                            "loadKey": loadKey,
                            "release": release,
                            "modGlobalCrossRefId": mod_global_cross_ref_id,
                            "uuid": str(uuid.uuid4()),
                            "dataProvider": data_provider,
                            "symbolWithSpecies": allele_record.get('symbol') + " (" + short_species_abbreviation + ")",
                            "symbolTextWithSpecies": symbol_text + " (" + short_species_abbreviation + ")",
                            "symbolText": symbol_text,
                            "alleleDescription": description,
                            "associationType": association_type
                        }

                        alleles_no_construct.append(allele_gene_no_construct_dataset)

                    elif gene_id == '' and construct_id == '':
                        allele_no_gene_no_construct_dataset = {
                            "symbol": allele_record.get('symbol'),
                            "primaryId": allele_record.get('primaryId'),
                            "globalId": global_id,
                            "localId": local_id,
                            "taxonId": allele_record.get('taxonId'),
                            "dataProviders": data_providers,
                            "dateProduced": date_produced,
                            "loadKey": loadKey,
                            "release": release,
                            "modGlobalCrossRefId": mod_global_cross_ref_id,
                            "uuid": str(uuid.uuid4()),
                            "dataProvider": data_provider,
                            "symbolWithSpecies": allele_record.get('symbol') + " (" + short_species_abbreviation + ")",
                            "symbolTextWithSpecies": symbol_text + " (" + short_species_abbreviation + ")",
                            "symbolText": symbol_text,
                            "alleleDescription": description,
                            "associationType": association_type
                        }

                        alleles_no_construct_no_gene.append(allele_no_gene_no_construct_dataset)

            else:
                allele_no_gene_no_construct_dataset = {
                    "symbol": allele_record.get('symbol'),
                    "primaryId": allele_record.get('primaryId'),
                    "globalId": global_id,
                    "localId": local_id,
                    "taxonId": allele_record.get('taxonId'),
                    "dataProviders": data_providers,
                    "dateProduced": date_produced,
                    "loadKey": loadKey,
                    "release": release,
                    "modGlobalCrossRefId": mod_global_cross_ref_id,
                    "uuid": str(uuid.uuid4()),
                    "dataProvider": data_provider,
                    "symbolWithSpecies": allele_record.get('symbol') + " (" + short_species_abbreviation + ")",
                    "symbolTextWithSpecies": symbol_text + " (" + short_species_abbreviation + ")",
                    "symbolText": symbol_text,
                    "alleleDescription": description,
                    "associationType": association_type
                }
                alleles_no_construct_no_gene.append(allele_no_gene_no_construct_dataset)

            if 'crossReferences' in allele_record:

                for cross_ref in allele_record['crossReferences']:
                    cross_ref_id = cross_ref.get('id')
                    local_cross_ref_id = cross_ref_id.split(":")[1]
                    prefix = cross_ref_id.split(":")[0]
                    pages = cross_ref.get('pages')

                    # some pages collections have 0 elements
                    if pages is not None and len(pages) > 0:
                        for page in pages:
                            if page in ('allele', 'allele/references', 'transgene', 'construct',
                                        'transgene/references', 'construct/references'):
                                mod_global_cross_ref_id = ETLHelper.get_page_complete_url(
                                    local_cross_ref_id, self.xref_url_map, prefix, page)
                                xref = ETLHelper.get_xref_dict(local_cross_ref_id, prefix, page,
                                                               page, cross_ref_id,
                                                               mod_global_cross_ref_id,
                                                               cross_ref_id + page)
                                xref['dataId'] = global_id
                                cross_reference_list.append(xref)

            if 'synonyms' in allele_record:
                for syn in allele_record.get('synonyms'):
                    allele_synonym = {
                        "data_id": allele_record.get('primaryId'),
                        "synonym": syn.strip()
                    }
                    allele_synonyms.append(allele_synonym)

            if 'secondaryIds' in allele_record:
                for secondary_id in allele_record.get('secondaryIds'):
                    allele_secondary_id = {
                        "data_id": allele_record.get('primaryId'),
                        "secondary_id": secondary_id
                    }
                    allele_secondary_ids.append(allele_secondary_id)

            if counter == batch_size:
                yield [alleles_no_construct, alleles_construct_gene, alleles_no_gene, alleles_no_construct_no_gene,
                       allele_secondary_ids, allele_synonyms, cross_reference_list]
                alleles_no_construct = []
                alleles_construct_gene = []
                alleles_no_gene = []
                alleles_no_construct_no_gene = []

                allele_secondary_ids = []
                allele_synonyms = []
                cross_reference_list = []
                counter = 0

        if counter > 0:
            yield [alleles_no_construct, alleles_construct_gene, alleles_no_gene, alleles_no_construct_no_gene,
                   allele_secondary_ids, allele_synonyms, cross_reference_list]
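
Note that the lists in each yielded value line up positionally with the entries in query_list from _process_sub_type: the first list is written to the first CSV and loaded by the first Cypher template, and so on. A hedged sketch of that positional pairing, assuming the CSV file names are available in the same order (the consumer below is illustrative, not the real CSVTransactor):

def preview_batches(generators, csv_filenames):
    # Each batch is a list of row-dict lists; zip pairs every list with
    # the CSV file (and therefore the query template) at the same position.
    for batch in generators:
        for rows, csv_filename in zip(batch, csv_filenames):
            print("%d rows -> %s" % (len(rows), csv_filename))
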
コード例 #10
class MolecularInteractionETL(ETL):
    """Molecular Interaction ETL."""

    logger = logging.getLogger(__name__)

    # Query templates which take params and will be processed later

    main_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
        MATCH (g1:Gene {primaryKey:row.interactor_A})
        MATCH (g2:Gene {primaryKey:row.interactor_B})

        MATCH (mi:MITerm) WHERE mi.primaryKey = row.detection_method
        MATCH (sdb:MITerm) WHERE sdb.primaryKey = row.source_database
        MATCH (adb:MITerm) WHERE adb.primaryKey = row.aggregation_database
        MATCH (ita:MITerm) WHERE ita.primaryKey = row.interactor_A_type
        MATCH (itb:MITerm) WHERE itb.primaryKey = row.interactor_B_type
        MATCH (ira:MITerm) WHERE ira.primaryKey = row.interactor_A_role
        MATCH (irb:MITerm) WHERE irb.primaryKey = row.interactor_B_role
        MATCH (it:MITerm) WHERE it.primaryKey = row.interaction_type

        //Create the relationship between the two genes.
        CREATE (g1)-[iw:INTERACTS_WITH {uuid:row.uuid}]->(g2)

        //Create the Association node to be used for the object.
        CREATE (oa:Association {primaryKey:row.uuid})
            SET oa :InteractionGeneJoin
            SET oa.joinType = 'molecular_interaction'
        CREATE (g1)-[a1:ASSOCIATION]->(oa)
        CREATE (oa)-[a2:ASSOCIATION]->(g2)

        //Create the publication nodes and link them to the Association node.
        MERGE (pn:Publication {primaryKey:row.pub_med_id})
            ON CREATE SET pn.pubMedUrl = row.pub_med_url,
            pn.pubMedId = row.pub_med_id
        CREATE (oa)-[ev:EVIDENCE]->(pn)

        //Link detection method to the MI ontology.
        CREATE (oa)-[dm:DETECTION_METHOD]->(mi)

        //Link source database to the MI ontology.
        CREATE (oa)-[sd:SOURCE_DATABASE]->(sdb)

        //Link aggregation database to the MI ontology.
        CREATE (oa)-[ad:AGGREGATION_DATABASE]->(adb)

        //Link interactor roles and types to the MI ontology.
        CREATE (oa)-[ita1:INTERACTOR_A_TYPE]->(ita)
        CREATE (oa)-[itb1:INTERACTOR_B_TYPE]->(itb)
        CREATE (oa)-[ira1:INTERACTOR_A_ROLE]->(ira)
        CREATE (oa)-[irb1:INTERACTOR_B_ROLE]->(irb)

        //Link interaction type to the MI ontology.
        CREATE (oa)-[it1:INTERACTION_TYPE]->(it)
    """

    xref_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

        // This needs to be a MERGE below.
        MATCH (o:InteractionGeneJoin :Association) WHERE o.primaryKey = row.reference_uuid
        """ + ETLHelper.get_cypher_xref_text()

    mod_xref_query_template = """

        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (o:Gene {primaryKey:row.dataId}) """ + ETLHelper.get_cypher_xref_text()

    def __init__(self, config):
        """Initiaslise object."""
        super().__init__()
        self.data_type_config = config

        # Initialize an instance of ResourceDescriptor for processing external links.
        # self.resource_descriptor_dict = ResourceDescriptorHelper2()
        self.missed_database_linkouts = set()
        self.successful_database_linkouts = set()
        self.ignored_database_linkouts = set()
        self.successful_mod_interaction_xrefs = []

    def _load_and_process_data(self):

        # filepath = self.data_type_config.get_single_filepath()
        # Temporary fix for 3.0 release.
        filepath = 'tmp/alliance_molecular_interactions.tsv'

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        generators = self.get_generators(filepath, batch_size)

        query_template_list = [
            [self.main_query_template, commit_size, "mol_int_data.csv"],
            [self.xref_query_template, commit_size, "mol_int_xref.csv"],
            [self.mod_xref_query_template, commit_size, "mol_int_mod_xref.csv"]
        ]

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)

    @staticmethod
    def populate_genes():
        """Populate Genes."""
        master_gene_set = set()

        query = "MATCH (g:Gene) RETURN g.primaryKey"

        result = Neo4jHelper().run_single_query(query)

        for record in result:
            master_gene_set.add(record['g.primaryKey'])

        return master_gene_set

    @staticmethod
    def query_crossreferences(crossref_prefix):
        """Query Cross References."""
        query = """MATCH (g:Gene)-[C:CROSS_REFERENCE]-(cr:CrossReference)
                   WHERE cr.prefix = {parameter}
                   RETURN g.primaryKey, cr.globalCrossRefId"""
        return Neo4jHelper().run_single_parameter_query(query, crossref_prefix)

    def populate_crossreference_dictionary(self):
        """Populate the crossreference dictionary.

        We're populating a rather large dictionary to use for looking up Alliance genes by
        their crossreferences.
        Edit the list below if you'd like to add more crossreferences to the dictionary.
        The key of the dictionary is the crossreference and the value is the Alliance
        gene to which it resolves.
        """
        master_crossreference_dictionary = dict()

        # If additional crossreferences need to be used to find interactors, they can be added here.
        # Use the crossreference prefix as the dictionary name.
        # Also add a regex entry to the resolve_identifier function.
        master_crossreference_dictionary['UniProtKB'] = dict()
        master_crossreference_dictionary['ENSEMBL'] = dict()
        master_crossreference_dictionary['NCBI_Gene'] = dict()
        master_crossreference_dictionary['RefSeq'] = dict()

        for key in master_crossreference_dictionary:
            self.logger.info('Querying for %s cross references.', key)
            result = self.query_crossreferences(key)
            for record in result:
                cross_ref_record = None
                # Modify the cross reference ID to match the PSI MITAB format if necessary.
                # So far, this is just converting 'NCBI_Gene' to 'entrez gene/locuslink'.
                if record['cr.globalCrossRefId'].startswith('NCBI_Gene'):
                    cross_ref_record_split = record['cr.globalCrossRefId'].split(':')[1]
                    cross_ref_record = 'entrez gene/locuslink:' + cross_ref_record_split
                else:
                    cross_ref_record = record['cr.globalCrossRefId']

                # The crossreference dictionary is a list of genes
                # linked to a single crossreference.
                # Append the gene if the crossref dict entry exists.
                # Otherwise, create a list and append the entry.
                if cross_ref_record.lower() in master_crossreference_dictionary[key]:
                    master_crossreference_dictionary[key][cross_ref_record.lower()].append(record['g.primaryKey'])
                else:
                    master_crossreference_dictionary[key][cross_ref_record.lower()] = []
                    master_crossreference_dictionary[key][cross_ref_record.lower()].append(record['g.primaryKey'])

                # The ids in PSI-MITAB files are lower case, hence the .lower() used above.

        return master_crossreference_dictionary
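
    # Illustrative shape of the returned dictionary (hypothetical values):
    #   {'UniProtKB': {'uniprotkb:p38398': ['HGNC:1100']},
    #    'NCBI_Gene': {'entrez gene/locuslink:672': ['HGNC:1100']},
    #    'ENSEMBL': {...},
    #    'RefSeq': {...}}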

    def process_interaction_identifier(self, entry, additional_row):
        """Create cross references for all the external identifiers."""
        xref_main_list = []
        entries = None

        # Identifier types on this list DO NOT receive a
        # cross_ref_complete_url field for external linking.
        ignored_identifier_database_list = [
            # The following entries are not currently required.
            'brenda',
            'bmrb',
            'cell ontology',
            'chebi',
            'chembl compound',
            'efo',
            'flannotator',
            'intenz',
            'interpro',
            'mpidb',
            'omim',
            'pdbj',
            'pmc',
            'pride',
            'prints',
            'proteomexchange',
            'psi-mi',
            'pubmed',
            'go',
            'reactome',
            'refseq',
            'tissue list',
            'uniprotkb'
        ]

        if '|' in entry:
            entries = entry.split('|')
        else:
            entries = [entry]

        for individual in entries:
            # These links are for the individual interaction identifiers
            # and link to the respective database.
            xref_dict = {}
            page = 'gene/interactions'

            individual_prefix, individual_body, _ = self.etlh.rdh2.split_identifier(individual)
            # Capitalize the prefix to match the YAML
            # and change the prefix if necessary to match the YAML.
            xref_dict['prefix'] = individual_prefix
            xref_dict['localId'] = individual_body

            # Special case for dealing with FlyBase.
            # The identifier link needs to use row 25 from the psi-mitab file.
            # TODO Regex to check for FBig in additional_row?
            if individual.startswith('flybase:FBrf'):
                if '|' in additional_row:
                    individual = additional_row.split('|')[0]
                else:
                    individual = additional_row

                regex_check = re.match('^flybase:FBig\\d{10}$', individual)
                if regex_check is None:
                    self.logger.critical(
                        """Fatal Error: During special handling of FlyBase molecular interaction
                           links, an FBig ID was not found.""")
                    self.logger.critical('Failed identifier: %s', individual)
                    self.logger.critical('PSI-MITAB row entry: %s', additional_row)
                    sys.exit(-1)

            # TODO Optimize and re-add this error tracking.
            if not individual.startswith(tuple(ignored_identifier_database_list)):
                try:
                    individual_url = self.etlh.rdh2.return_url_from_key_value(individual_prefix, individual_body, page)
                    xref_dict['crossRefCompleteUrl'] = individual_url
                except KeyError:
                    pass

            xref_dict['uuid'] = str(uuid.uuid4())
            xref_dict['globalCrossRefId'] = individual
            xref_dict['id'] = individual  # Used for name.
            xref_dict['displayName'] = individual_body
            xref_dict['primaryKey'] = individual
            xref_dict['crossRefType'] = 'interaction'
            xref_dict['page'] = page
            xref_dict['reference_uuid'] = None  # For association interactions (later).

            # Special case for FlyBase as "individual" is not unique in their case.
            # Individual_body needs to be used instead.

            if individual.startswith('flybase'):
                xref_dict['primaryKey'] = individual_body
            xref_main_list.append(xref_dict)

        return xref_main_list

    def add_mod_interaction_links(self, gene_id):
        """Create an XREF linking back to interaction pages at each MOD for a particular gene.
        These links appear at the top of the molecular interactions table once per gene page.
        """
        xref_dict = {}
        page = 'gene/MODinteractions'

        individual_prefix, individual_body, _ = self.etlh.rdh2.split_identifier(gene_id)
        individual_url = self.etlh.rdh2.return_url_from_identifier(gene_id, page)

        # Exception for MGI
        if individual_prefix == 'MGI':
            xref_dict['displayName'] = gene_id
            xref_dict['id'] = gene_id
            xref_dict['globalCrossRefId'] = gene_id
            xref_dict['primaryKey'] = gene_id + page
        else:
            xref_dict['displayName'] = individual_body
            xref_dict['id'] = individual_body
            xref_dict['globalCrossRefId'] = individual_body
            xref_dict['primaryKey'] = individual_body + page

        xref_dict['prefix'] = individual_prefix
        xref_dict['localId'] = individual_body
        xref_dict['crossRefCompleteUrl'] = individual_url
        xref_dict['uuid'] = str(uuid.uuid4())
        xref_dict['crossRefType'] = page
        xref_dict['page'] = page
        xref_dict['reference_uuid'] = str(uuid.uuid4())

        # For matching to the gene when creating the xref relationship in Neo4j.
        xref_dict['dataId'] = gene_id
        # Add the gene_id of the identifier to a global list so we don't create unnecessary xrefs.
        self.successful_mod_interaction_xrefs.append(gene_id)

        return xref_dict

    def resolve_identifiers_by_row(self, row, master_gene_set, master_crossreference_dictionary):
        """Resolve Identifiers by Row."""
        interactor_a_rows = [0, 2, 4, 22]
        interactor_b_rows = [1, 3, 5, 23]

        interactor_a_resolved = None
        interactor_b_resolved = None

        for row_entry in interactor_a_rows:
            try:
                # We need to change uniprot/swiss-prot to uniprotkb for interactor a and b.
                # This is the only current prefix adjustment.
                # If we need to do more, we should break this out into a function or small piece of code.
                interactor_a = row[row_entry].replace("uniprot/swiss-prot:", "uniprotkb:")
                interactor_a_resolved = self.resolve_identifier(interactor_a,
                                                                master_gene_set,
                                                                master_crossreference_dictionary)
                if interactor_a_resolved is not None:
                    break
            except IndexError:  # BioGRID rows have fewer columns than other files; continue on IndexError.
                continue

        for row_entry in interactor_b_rows:
            try:
                interactor_b = row[row_entry].replace("uniprot/swiss-prot:", "uniprotkb:")
                interactor_b_resolved = self.resolve_identifier(interactor_b,
                                                                master_gene_set,
                                                                master_crossreference_dictionary)
                if interactor_b_resolved is not None:
                    break
            except IndexError:  # BioGRID rows have fewer columns than other files; continue on IndexError.
                continue

        return interactor_a_resolved, interactor_b_resolved

    def resolve_identifier(self, row_entry, master_gene_set, master_crossreference_dictionary):  # noqa
        """Resolve Identifier."""
        list_of_crossref_regex_to_search = [
            'uniprotkb:[\\w\\d_-]*$',
            'ensembl:[\\w\\d_-]*$',
            'entrez gene/locuslink:.*',
            'refseq:[\\w\\d_-]*$'
        ]
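        # Identifiers these patterns are meant to match, for illustration
        # (hypothetical values): 'uniprotkb:P38398', 'ensembl:ENSG00000012048',
        # 'entrez gene/locuslink:672', 'refseq:NM_007294'.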

        # If we're dealing with multiple identifiers separated by a pipe.
        if '|' in row_entry:
            row_entries = row_entry.split('|')
        else:
            row_entries = [row_entry]

        for individual_entry in row_entries:

            # For use in wormbase / flybase lookups.
            # If we run into an IndexError, there's no identifier to resolve and we return None.
            # All valid identifiers in the PSI-MI TAB file should be "splittable".
            try:
                entry_stripped = individual_entry.split(':')[1]
            except IndexError:
                return None

            if individual_entry.startswith('uniprotkb:'):
                individual_entry = individual_entry.split('-')[0]

            prefixed_identifier = None

            # TODO implement regex for WB / FB gene identifiers.
            if entry_stripped.startswith('WB'):
                prefixed_identifier = 'WB:' + entry_stripped
                if prefixed_identifier in master_gene_set:
                    return [prefixed_identifier]  # Always return a list for later processing.
                return None
            # TODO implement regex for WB / FB gene identifiers.
            elif entry_stripped.startswith('FB'):
                prefixed_identifier = 'FB:' + entry_stripped
                if prefixed_identifier in master_gene_set:
                    return [prefixed_identifier]  # Always return a list for later processing.
                return None

            for regex_entry in list_of_crossref_regex_to_search:
                # re.findall returns a list (possibly empty), never None.
                regex_output = re.findall(regex_entry, individual_entry)
                if regex_output:
                    # We might have multiple regex matches.
                    # Search them all against our crossreferences.
                    for regex_match in regex_output:
                        identifier = regex_match
                        for crossreference_type in master_crossreference_dictionary.keys():
                            # Using lowercase in the identifier to be consistent
                            # with Alliance lowercase identifiers.
                            if identifier.lower() in \
                                     master_crossreference_dictionary[crossreference_type]:
                                # Return the corresponding Alliance gene(s).
                                return master_crossreference_dictionary[crossreference_type][identifier.lower()]
        # If we can't resolve any of the crossReferences, return None

        # print('Could not resolve identifiers.')
        # print(row_entries)

        return None

    def get_generators(self, filepath, batch_size):  # noqa
        """Get Generators."""
        list_to_yield = []
        xref_list_to_yield = []
        mod_xref_list_to_yield = []

        # TODO Taxon species needs to be pulled out into a standalone
        # module to be used by other scripts.
        # TODO External configuration script for these types of filters?
        # Not a fan of hard-coding.

        # Populate our master dictionary for resolving cross references.
        master_crossreference_dictionary = self.populate_crossreference_dictionary()
        self.logger.info('Obtained the following number of cross references from Neo4j:')
        for entry in master_crossreference_dictionary:
            self.logger.info('%s: %s', entry, len(master_crossreference_dictionary[entry]))

        # Populate our master gene set for filtering Alliance genes.
        master_gene_set = self.populate_genes()
        self.logger.info('Obtained %s gene primary ids from Neo4j.', len(master_gene_set))

        resolved_a_b_count = 0
        unresolved_a_b_count = 0
        total_interactions_loaded_count = 0
        unresolved_publication_count = 0

        # Used for debugging.
        # unresolved_entries = []
        # unresolved_crossref_set = set()

        self.logger.info('Attempting to open %s', filepath)
        with open(filepath, 'r', encoding='utf-8') as tsvin:
            tsvin = csv.reader(tsvin, delimiter='\t')
            counter = 0
            total_counter = 0
            for row in tsvin:
                counter += 1
                total_counter += 1
                if total_counter % 100000 == 0:
                    self.logger.info('Processing row %s.', total_counter)

                # Skip commented rows.
                if row[0].startswith('#'):
                    continue

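                # The column indices used below appear to follow the 0-based
                # PSI-MI TAB 2.7 layout: 6 = detection method, 8 = publication
                # ids, 9/10 = interactor taxids, 11 = interaction type,
                # 12 = source database, 13 = interaction identifiers,
                # 18/19 = interactor roles, 20/21 = interactor types,
                # 24 = interaction xrefs. The mapping is inferred from usage.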
                taxon_id_1 = row[9]
                taxon_id_2 = row[10]

                # After we pass all our filtering / continue opportunities,
                # we start working with the variables.
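                # Assumes the taxid column always contains a run of digits
                # (e.g. 'taxid:6239(caeel)'); otherwise re.search returns None
                # and .group(0) below would raise AttributeError.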
                taxon_id_1_re = re.search(r'\d+', taxon_id_1)
                taxon_id_1_to_load = 'NCBITaxon:' + taxon_id_1_re.group(0)

                taxon_id_2_to_load = None
                if taxon_id_2 != '-':
                    taxon_id_2_re = re.search(r'\d+', taxon_id_2)
                    taxon_id_2_to_load = 'NCBITaxon:' + taxon_id_2_re.group(0)
                else:
                    taxon_id_2_to_load = taxon_id_1_to_load  # self interaction

                try:
                    # Interactor IDs for the UI table.
                    identifier_linkout_list = self.process_interaction_identifier(row[13], row[24])
                except IndexError:
                    # Shorter rows lack column 24; fall back to None.
                    identifier_linkout_list = self.process_interaction_identifier(row[13], None)
                # Grab the MI identifier between two quotes "".
                source_database = re.findall(r'"([^"]*)"', row[12])[0]

                aggregation_database = 'MI:0670'  # IMEx

                if source_database == 'MI:0478':  # FlyBase
                    aggregation_database = 'MI:0478'
                elif source_database == 'MI:0487':  # WormBase
                    aggregation_database = 'MI:0487'
                elif source_database == 'MI:0463':  # BioGRID
                    aggregation_database = 'MI:0463'

                detection_method = 'MI:0686'  # Default to unspecified.
                try:
                    # grab the MI identifier between two quotes ""
                    detection_method = re.findall(r'"([^"]*)"', row[6])[0]
                except IndexError:
                    pass  # Default to unspecified, see above.

                # TODO Replace this publication work with a service.
                # Re-think publication implementation in Neo4j.
                publication = None
                publication_url = None

                if row[8] != '-':
                    # Check for a PubMed publication first.
                    publication_re = re.search(r'pubmed:\d+', row[8])
                    if publication_re is not None:
                        publication = publication_re.group(0)  # matching bit
                        publication = publication.replace('pubmed', 'PMID')
                        publication_url = self.etlh.rdh2.return_url_from_identifier(publication)
                    else:
                        # No PubMed id; fall back to a DOI,
                        # e.g. DOI:10.1101/2020.03.31.019216
                        publication_re = re.search(r'^(DOI\:)?\d{2}\.\d{4}.*$', row[8])
                        if publication_re is not None:
                            publication = publication_re.group(0)
                            publication = publication.replace('DOI', 'doi')
                            publication_url = self.etlh.rdh2.return_url_from_identifier(publication)
                        else:
                            # Neither PubMed nor DOI resolved: skip the row.
                            unresolved_publication_count += 1
                            continue
                else:
                    unresolved_publication_count += 1
                    continue

                # Other hardcoded values to be used for now.
                interactor_a_role = 'MI:0499'  # Default to unspecified.
                interactor_b_role = 'MI:0499'  # Default to unspecified.
                interactor_a_type = 'MI:0499'  # Default to unspecified.
                interactor_b_type = 'MI:0499'  # Default to unspecified.
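                # MI:0499 is the PSI-MI term for 'unspecified role'; it appears
                # to be reused here as the default for interactor types too.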

                try:
                    interactor_a_role = re.findall(r'"([^"]*)"', row[18])[0]
                except IndexError:
                    pass  # Default to unspecified, see above.
                try:
                    interactor_b_role = re.findall(r'"([^"]*)"', row[19])[0]
                except IndexError:
                    pass  # Default to unspecified, see above.

                try:
                    interactor_a_type = re.findall(r'"([^"]*)"', row[20])[0]
                except IndexError:
                    pass  # Default to unspecified, see above.

                try:
                    interactor_b_type = re.findall(r'"([^"]*)"', row[21])[0]
                except IndexError:
                    pass  # Default to unspecified, see above.

                # Grab the MI identifier between two quotes "".
                interaction_type = re.findall(r'"([^"]*)"', row[11])[0]

                interactor_a_resolved, interactor_b_resolved = self.resolve_identifiers_by_row(
                    row,
                    master_gene_set,
                    master_crossreference_dictionary)

                if interactor_a_resolved is None or interactor_b_resolved is None:
                    unresolved_a_b_count += 1  # Tracking unresolved identifiers.

                    # Uncomment the line below for debugging.
                    # unresolved_entries.append([row[0], interactor_a_resolved, row[1], interactor_b_resolved, row[8]])
                    # if interactor_a_resolved is None:
                    #     unresolved_crossref_set.add(row[0])
                    # if interactor_b_resolved is None:
                    #     unresolved_crossref_set.add(row[1])

                    continue  # Skip this entry.

                mol_int_dataset = {
                    'interactor_A': None,
                    'interactor_B': None,
                    'interactor_A_type': interactor_a_type,
                    'interactor_B_type': interactor_b_type,
                    'interactor_A_role': interactor_a_role,
                    'interactor_B_role': interactor_b_role,
                    'interaction_type': interaction_type,
                    'taxon_id_1': taxon_id_1_to_load,
                    'taxon_id_2': taxon_id_2_to_load,
                    'detection_method': detection_method,
                    'pub_med_id': publication,
                    'pub_med_url': publication_url,
                    'uuid': None,
                    'source_database': source_database,
                    'aggregation_database': aggregation_database
                }

                # Remove possible duplicates from interactor lists.
                interactor_a_resolved_no_dupes = list(set(interactor_a_resolved))
                interactor_b_resolved_no_dupes = list(set(interactor_b_resolved))

                # Get every possible combination of interactor A x interactor B
                # (if multiple ids resulted from resolving the identifier.)
                int_combos = list(itertools.product(interactor_a_resolved_no_dupes,
                                                    interactor_b_resolved_no_dupes))
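                # e.g. ['WB:GeneA'] x ['WB:GeneB', 'WB:GeneC'] yields two (A, B)
                # pairs and therefore two interaction rows (values illustrative).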

                # Update the dictionary with every possible combination of
                # interactor A x interactor B.
                list_of_mol_int_dataset = [dict(mol_int_dataset,
                                                interactor_A=x,
                                                interactor_B=y,
                                                uuid=str(uuid.uuid4())) for x, y in int_combos]
                # Tracking successfully loaded identifiers.
                total_interactions_loaded_count += len(list_of_mol_int_dataset)
                # Tracking successfully resolved identifiers.
                resolved_a_b_count += 1
                # We need to also create new crossreference dicts for every
                # new possible interaction combination.
                new_identifier_linkout_list = []
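                # Each copy gets reference_uuid set to its interaction row's
                # uuid, presumably so the loader can attach every cross
                # reference to the right interaction during the CSV load.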
                for dataset_entry in list_of_mol_int_dataset:
                    for identifier_linkout in identifier_linkout_list:
                        new_identifier_linkout_list.append(
                            dict(identifier_linkout,
                                 reference_uuid=dataset_entry['uuid']))

                # Create dictionaries for xrefs from Alliance genes
                # to MOD interaction sections of gene reports.
                for primary_gene_to_link in interactor_a_resolved_no_dupes:
                    # We have the potential for numerous duplicate xrefs.
                    # Check whether we've made this xref previously by looking in a list.
                    # Should cut down loading time for Neo4j significantly.
                    # Hopefully the lookup is not too long -- this should be refined if it's slow.
                    # Ignore ZFIN interaction pages and REFSEQ.
                    if not primary_gene_to_link.startswith('ZFIN') and not primary_gene_to_link.startswith('RefSeq'):
                        if primary_gene_to_link not in self.successful_mod_interaction_xrefs:
                            mod_xref_dataset = self.add_mod_interaction_links(primary_gene_to_link)
                            mod_xref_list_to_yield.append(mod_xref_dataset)

                # Accumulate entries; a full batch is yielded once the row
                # counter reaches batch_size (see below).
                xref_list_to_yield.extend(new_identifier_linkout_list)
                list_to_yield.extend(list_of_mol_int_dataset)

                # counter also counts skipped rows, so use >= here: an exact
                # == check can be stepped over on a skipped row and would then
                # never yield a mid-stream batch.
                if counter >= batch_size:
                    counter = 0
                    yield list_to_yield, xref_list_to_yield, mod_xref_list_to_yield
                    list_to_yield = []
                    xref_list_to_yield = []
                    mod_xref_list_to_yield = []

            if counter > 0:
                yield list_to_yield, xref_list_to_yield, mod_xref_list_to_yield

        # TODO Clean up the set output.
        # for entry in unresolved_entries:
        #     self.logger.info(*entry)

        # self.logger.info('A set of unique unresolvable cross references:')
        # for unique_entry in unresolved_crossref_set:
        #     self.logger.info(unique_entry)

        self.logger.info('Resolved identifiers for %s PSI-MITAB interactions.',
                         resolved_a_b_count)
        self.logger.info('Prepared to load %s total interactions '
                         '(accounting for multiple possible identifier resolutions).',
                         total_interactions_loaded_count)

        self.logger.info('Note: Interactions missing valid publications will be skipped, even if their identifiers'
                         ' resolve correctly.')

        self.logger.info('Could not resolve [and subsequently will not load] '
                         '%s interactions due to missing publications.',
                         unresolved_publication_count)

        self.logger.info('Could not resolve [and subsequently will not load] %s interactions due to unresolved'
                         ' identifiers.',
                         unresolved_a_b_count)
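
    # Minimal consumption sketch (hypothetical helper name write_batch): each
    # yielded tuple carries the interaction rows plus the two xref lists, so a
    # caller might drain the generator like this:
    #
    #     for interactions, xrefs, mod_xrefs in self.get_generators(path, 10000):
    #         write_batch(interactions)
    #         write_batch(xrefs)
    #         write_batch(mod_xrefs)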