Ejemplo n.º 1
0
    def read_file(self):
        """Parse the OBO file at ``self.in_path`` into a data frame.

        The file is parsed with ``OboParser.obo_to_df`` using
        ``self.quadruple_list``; the fourth element of every quadruple is
        taken as the name of a column the result is expected to have.

        If the number of parsed columns differs from the number of expected
        columns, the expected columns absent from the frame are reported:

        * interactive mode: the user is asked (GUI or CLI) whether to go on;
          afterwards the absent columns are added, filled with NaN.
        * otherwise: the problem is logged as an error and the process exits.

        Returns:
            The parsed data frame.
        """
        parser = OboParser()
        with FileReader.open_file(self.in_path) as obo_file:
            df = parser.obo_to_df(obo_file, self.quadruple_list)
            parsed_columns = df.columns
            expected_columns = [quadruple[3] for quadruple in self.quadruple_list]

            # Nothing to reconcile when the column counts agree.
            if len(parsed_columns) == len(expected_columns):
                return df

            missing_columns = [
                name for name in expected_columns if name not in parsed_columns
            ]
            info_string = "Reader %s should parse %s but there are no occurrences in file %s. " % (
                str(self.readerType),
                str(missing_columns),
                self.in_path,
            )

            if not globalConfig.INTERACTIVE_MODE:
                logging.error(info_string)
                sys.exit(info_string)

            ask_continue_string = info_string + "Continue if you do not need these edges in your graph"
            if globConst.GUI_MODE:
                # Imported lazily, only when the GUI is actually in use.
                from openbiolink.gui import gui

                gui.askForExit(ask_continue_string)
            else:
                Cli.ask_for_exit(ask_continue_string)

            # Add the absent columns as all-NaN so the frame has every expected column.
            for name in missing_columns:
                df[name] = np.nan

            return df
Ejemplo n.º 2
0
    def create_nodes_and_edges(self, edge_metadata, tn=None):
        """Build the node and edge sets of one edge type from its edge file.

        The ';'-separated file ``edge_metadata.edges_file_path`` is read row by
        row. The two id columns are translated through the optional mapping
        dictionaries and the optional alt-id mapping dictionaries, the optional
        quality cutoff is applied, and the surviving ids are turned into
        ``Node`` and ``Edge`` objects. Counters gathered along the way are
        handed to ``self.print_graph_stats``.

        Args:
            edge_metadata: Description of the edge type. The attributes read
                here are the edge file path, column indices, mapping file
                names and indices, cutoffs, node types, edge type and
                ``is_directional``.
            tn: Passed through unchanged to ``self.print_graph_stats``.

        Returns:
            A tuple ``(nodes1, nodes2, edges)`` of sets. Three empty sets are
            returned when the edge file or one of the configured mapping files
            does not exist.
        """

        def report_missing_file(missing_path):
            """Tell the user (or the log) that ``missing_path`` is absent."""
            message = 'File does not exist: %s ! Edgetype %s will not be created' % (
                missing_path, str(edge_metadata.edgeType))
            if globalConfig.INTERACTIVE_MODE:
                if globConst.GUI_MODE:
                    from openbiolink.gui import gui
                    gui.askForExit(message)
                else:
                    Cli.ask_for_exit(message)
            else:
                logging.error(message)

        if not os.path.isfile(edge_metadata.edges_file_path):
            report_missing_file(edge_metadata.edges_file_path)
            return set(), set(), set()

        # --- mapping ---
        # Check the mapping files *before* loading them, so a missing file is
        # reported here instead of depending on how the loader reacts to it
        # (the loader's behaviour on a missing file is not visible from here).
        mapping_files = (
            edge_metadata.mapping1_file,
            edge_metadata.mapping2_file,
            edge_metadata.altid_mapping1_file,
            edge_metadata.altid_mapping2_file,
        )
        for mapping in mapping_files:
            if mapping is not None:
                infile_folder = os.path.join(globConst.WORKING_DIR, gcConst.IN_FILE_FOLDER_NAME)
                mapping_path = os.path.join(infile_folder, mapping)
                if not os.path.isfile(mapping_path):
                    # Name the mapping file that is missing, not the edge file.
                    report_missing_file(mapping_path)
                    return set(), set(), set()

        mapping1 = utils.db_mapping_file_to_dic(
            edge_metadata.mapping1_file, edge_metadata.map1_sourceindex, edge_metadata.map1_targetindex)
        mapping2 = utils.db_mapping_file_to_dic(
            edge_metadata.mapping2_file, edge_metadata.map2_sourceindex, edge_metadata.map2_targetindex)
        altid_mapping1 = utils.db_mapping_file_to_dic(
            edge_metadata.altid_mapping1_file, edge_metadata.altid_map1_sourceindex,
            edge_metadata.altid_map1_targetindex)
        altid_mapping2 = utils.db_mapping_file_to_dic(
            edge_metadata.altid_mapping2_file, edge_metadata.altid_map2_sourceindex,
            edge_metadata.altid_map2_targetindex)

        # --- edges ---
        nodes1 = set()
        nodes2 = set()
        edges = set()
        ids1_no_mapping = set()
        ids2_no_mapping = set()
        ids1 = set()
        ids2 = set()
        nr_edges = 0
        nr_edges_return_dir = 0
        nr_edges_incl_dup = 0
        nr_edges_below_cutoff = 0
        nr_edges_no_mapping = 0

        no_cutoff_defined = edge_metadata.cutoff_num is None and edge_metadata.cutoff_txt is None

        with open(edge_metadata.edges_file_path, "r", encoding="utf8") as edge_content:
            reader = csv.reader(edge_content, delimiter=";")

            for row in reader:
                raw_id1 = row[edge_metadata.colindex1]
                raw_id2 = row[edge_metadata.colindex2]
                if edge_metadata.colindex_qscore is not None:
                    qscore = row[edge_metadata.colindex_qscore]
                else:
                    qscore = None
                edge_id1 = None
                edge_id2 = None
                ids1.add(raw_id1)
                ids2.add(raw_id2)

                # apply mapping: without a mapping file the raw id is used as is;
                # with one, an id absent from the mapping stays None (unmapped)
                if edge_metadata.mapping1_file is not None and raw_id1 in mapping1:
                    edge_id1 = mapping1.get(raw_id1)
                elif edge_metadata.mapping1_file is None:
                    edge_id1 = [raw_id1]
                if edge_metadata.mapping2_file is not None and raw_id2 in mapping2:
                    edge_id2 = mapping2.get(raw_id2)
                elif edge_metadata.mapping2_file is None:
                    edge_id2 = [raw_id2]

                # if mapped successfully
                if edge_id1 is not None and edge_id2 is not None:
                    for id1 in edge_id1:
                        # apply alt_id mapping 1
                        if edge_metadata.altid_mapping1_file is not None and id1 in altid_mapping1:
                            id1 = altid_mapping1[id1][0]  # there should only be one
                        for id2 in edge_id2:
                            # apply alt_id mapping 2
                            if edge_metadata.altid_mapping2_file is not None and id2 in altid_mapping2:
                                id2 = altid_mapping2[id2][0]  # there should only be one
                            # check for quality cutoff
                            within_num_cutoff = (
                                edge_metadata.cutoff_num is not None
                                and float(qscore) > edge_metadata.cutoff_num
                            )
                            within_text_cutoff = (
                                edge_metadata.cutoff_txt is not None
                                and qscore not in edge_metadata.cutoff_txt
                            )
                            if no_cutoff_defined or within_num_cutoff or within_text_cutoff:
                                bimeg_id1 = edge_metadata.node1_type.name + '_' + id1
                                bimeg_id2 = edge_metadata.node2_type.name + '_' + id2
                                edges.add(Edge(bimeg_id1, edge_metadata.edgeType, bimeg_id2, None, qscore))
                                # add an edge in the other direction when edge is
                                # undirectional and graph is directional
                                if (not edge_metadata.is_directional) and graphProp.DIRECTED:
                                    edges.add(Edge(bimeg_id2, edge_metadata.edgeType, bimeg_id1, None, qscore))
                                    nr_edges_incl_dup += 1
                                    nr_edges_return_dir += 1
                                nodes1.add(Node(bimeg_id1, edge_metadata.node1_type))
                                nodes2.add(Node(bimeg_id2, edge_metadata.node2_type))

                                nr_edges_incl_dup += 1
                            else:
                                nr_edges_below_cutoff += 1

                # if not mapped successfully
                else:
                    nr_edges_no_mapping += 1
                    if edge_id1 is None and edge_metadata.mapping1_file is not None:
                        ids1_no_mapping.add(raw_id1)
                    if edge_id2 is None and edge_metadata.mapping2_file is not None:
                        ids2_no_mapping.add(raw_id2)
                nr_edges += 1

        # `edges` is a set, so this counts distinct edges only.
        nr_edges_after_mapping = len(edges)

        if not no_cutoff_defined and nr_edges_below_cutoff == 0:
            logging.warning(
                "No edges of type %s were cut off by quality cutoff, maybe the metric has changed?",
                edge_metadata.edgeType.name)
        if nr_edges_after_mapping == 0:
            logging.warning(
                "No edges of type %s are left after mapping and cutoff!",
                edge_metadata.edgeType.name)

        # print statistics
        stats_dic = {
            'edge_type': edge_metadata.edgeType,
            'node1_type': edge_metadata.node1_type,
            'node2_type': edge_metadata.node2_type,
            'nr_edges': nr_edges,
            'nr_edges_below_cutoff': nr_edges_below_cutoff,
            'nr_edges_no_mapping': nr_edges_no_mapping,
            'nr_edges_after_mapping': nr_edges_after_mapping,
            'nr_edges_incl_dup': nr_edges_incl_dup,
            'nr_edges_return_dir': nr_edges_return_dir,
            'ids1_no_mapping': ids1_no_mapping,
            'ids2_no_mapping': ids2_no_mapping,
            'ids1': ids1,
            'ids2': ids2,
        }
        self.print_graph_stats(stats_dic, tn)

        return nodes1, nodes2, edges
Ejemplo n.º 3
0
    def init_custom_sources_bottom_up(self, use_db_metdata_classes):
        """Helper for ``__init__``: restrict all sources to custom db metadata.

        Starting from the given db metadata, each following stage keeps only
        the entries whose key is still provided by the previous stage: file
        readers (by ``dbType``), file processors (by ``readerType``), infile
        metadata (by ``infileType``) and edge metadata (by the infile type of
        their ``EDGE_INMETA_CLASS``). The lookup maps for readers, processors
        and infile metadata are rebuilt, and what was removed is logged.

        Edges that survive this but use a mapping whose infile type was
        dropped are reported in a warning; depending on the mode the user is
        asked whether to exit or the process exits. If execution continues,
        those edges are removed as well.

        Args:
            use_db_metdata_classes: Iterable of db metadata classes or
                instances. Classes are instantiated without arguments.
        """

        # make sure to use instances of classes
        self.db_file_metadata = [
            x() if inspect.isclass(x) else x for x in use_db_metdata_classes
        ]

        # remove readers
        keep_dbType = [x.dbType for x in self.db_file_metadata]
        logging.info("readers removed: %s", [
            x.__class__.__name__
            for x in self.file_readers if x.dbType not in keep_dbType
        ])
        self.file_readers = [
            x for x in self.file_readers if x.dbType in keep_dbType
        ]
        self.dbType_reader_map = utils.cls_list_to_dic(self.file_readers,
                                                       "dbType")

        # remove processors
        keep_readerType = [x.readerType for x in self.file_readers]
        logging.info("processors removed: %s", [
            x.__class__.__name__ for x in self.file_processors
            if x.readerType not in keep_readerType
        ])
        self.file_processors = [
            x for x in self.file_processors if x.readerType in keep_readerType
        ]
        self.readerType_processor_map = utils.cls_list_to_dic(
            self.file_processors, "readerType")

        # remove infile metadata
        keep_infileType = [x.infileType for x in self.file_processors]
        # This stage drops infile metadata, so label the log line accordingly.
        logging.info("infile metadata removed: %s", [
            x.__class__.__name__ for x in self.infile_metadata
            if x.infileType not in keep_infileType
        ])
        self.infile_metadata = [
            x for x in self.infile_metadata if x.infileType in keep_infileType
        ]
        self.infileType_inMetadata_map = {
            x.infileType: x
            for x in self.infile_metadata
        }

        # remove edge metadata
        logging.info("edges removed: %s", [
            x.__class__.__name__
            for x in self.edge_metadata + self.tn_edge_metadata
            if x.EDGE_INMETA_CLASS.INFILE_TYPE not in keep_infileType
        ])
        self.edge_metadata = [
            x for x in self.edge_metadata
            if x.EDGE_INMETA_CLASS.INFILE_TYPE in keep_infileType
        ]
        self.tn_edge_metadata = [
            x for x in self.tn_edge_metadata
            if x.EDGE_INMETA_CLASS.INFILE_TYPE in keep_infileType
        ]

        # check for deleted dependencies of mappings
        additional_remove_metaEdges = []
        additional_remove_mapping_infileType = []
        for metaEdge in self.edge_metadata + self.tn_edge_metadata:
            mappings = [
                metaEdge.MAP1_META_CLASS,
                metaEdge.MAP2_META_CLASS,
                metaEdge.MAP1_ALT_ID_META_CLASS,
                metaEdge.MAP2_ALT_ID_META_CLASS,
            ]
            for mapping in mappings:
                if mapping is None or mapping.INFILE_TYPE in keep_infileType:
                    continue
                # An edge may lose several mappings and a mapping may be shared
                # by several edges; record each only once so the warning below
                # does not repeat names.
                if metaEdge not in additional_remove_metaEdges:
                    additional_remove_metaEdges.append(metaEdge)
                if mapping.INFILE_TYPE not in additional_remove_mapping_infileType:
                    additional_remove_mapping_infileType.append(
                        mapping.INFILE_TYPE)

        if additional_remove_metaEdges:
            message = (
                "\nDue to manual exclusion of DB resources, also the edges: %s\n "
                "will be removed due to deleted dependencies of used mappings (i.e. %s)\n "
                "Consider manually exclude edges instead of DB resources." % (
                    str([
                        x.__class__.__name__
                        for x in additional_remove_metaEdges
                    ]),
                    str([str(x)
                         for x in additional_remove_mapping_infileType]),
                ))
            logging.warning(message)
            if globConst.GUI_MODE:
                # Imported lazily, only when the GUI is actually in use.
                from openbiolink.gui import gui

                gui.askForExit(message)
            elif globConst.INTERACTIVE_MODE:
                Cli.ask_for_exit(message)
            else:
                # NOTE(review): sys.exit() without an argument terminates with
                # exit status 0; confirm whether a non-zero status is wanted.
                sys.exit()

            self.edge_metadata = [
                x for x in self.edge_metadata
                if x not in additional_remove_metaEdges
            ]
            self.tn_edge_metadata = [
                x for x in self.tn_edge_metadata
                if x not in additional_remove_metaEdges
            ]