Exemple #1
0
# Create a Protein Disease graph from the DB adapter 'OlegDB'

from ProteinGraphML.DataAdapter import OlegDB
from ProteinGraphML.GraphTools import ProteinDiseaseAssociationGraph

## We construct a base map of protein to disease just by creating the ProteinDiseaseAssociationGraph.

# Build the base protein-disease graph straight from the DB adapter.
dbAdapter = OlegDB()
proteinGraph = ProteinDiseaseAssociationGraph(dbAdapter)

# The wrapper object has helper methods of its own, but the underlying
# networkx graph is also reachable directly through its `graph` attribute.
print('Total nodes: %d' % len(proteinGraph.graph.nodes))

# Restrict everything to the proteins we are interested in. Here the list
# comes from the DB adapter, but any set of protein ids will do.
proteins = dbAdapter.loadTotalProteinList().protein_id
filterByProteins = set(proteins)

# `.attach` adds the edges returned by an adapter loader to the graph. The
# resulting graph can be saved, which avoids rebuilding it while working on
# different diseases or analyses. Every edge set is filtered by the protein
# set computed above. Loaders are looked up one at a time so that each one
# is resolved, called and attached before the next one is touched.
for loaderName in ('loadPPI', 'loadKegg', 'loadReactome', 'loadInterpro',
                   'loadGo'):
    proteinGraph.attach(getattr(dbAdapter, loaderName)(filterByProteins))

# networkx provides an API we can use to access the nodes of the graph;
# here I exploit the unique features of each node to count them
Exemple #2
0
    parser.add_argument("-v", "--verbose", action="count", default=0)
    args = parser.parse_args()

    logging.basicConfig(format="%(levelname)s:%(message)s", level=(logging.DEBUG if args.verbose > 1 else logging.INFO))

    if not args.sources:
        parser.error("--sources required.")

    sources = re.split("[, ]+", args.sources.strip())
    if len(set(sources) - set(SOURCES)) > 0:
        parser.error("Invalid sources: {0}".format(','.join(list(set(sources) - set(SOURCES)))))

    t0 = time.time()

    # Make TCRD as the default DB
    dbad = OlegDB() if args.db == "olegdb" else TCRD()

    if "gtex" in sources:
        # Export the GTEx static-feature table as a tab-separated file in the
        # output directory and report its size and the elapsed time.
        ofile_gtex = args.outputdir + "/gtex.tsv"
        logging.info("GTEX: writing {0}".format(ofile_gtex))
        # NOTE(review): staticData.gtex presumably returns a pandas DataFrame
        # (it is used via .shape/.round/.to_csv below) -- confirm.
        gtex = staticData.gtex(dbad)
        logging.info("GTEX: rows: {0}; cols: {1}".format(gtex.shape[0], gtex.shape[1]))
        # The separator must be passed by keyword: positional arguments after
        # the path are deprecated in pandas 2.1 and rejected by pandas 3.0.
        gtex.round(args.decimals).to_csv(ofile_gtex, sep="\t", index=True)
        logging.info('{0}: elapsed time: {1}'.format(os.path.basename(sys.argv[0]),
                                                     time.strftime('%Hh:%Mm:%Ss', time.gmtime(time.time() - t0))))

    if "hpa" in sources:
        ofile_hpa = args.outputdir + "/hpa.tsv"
        logging.info("HPA: writing {0}".format(ofile_hpa))
        hpa = staticData.hpa(dbad)
        logging.info("HPA: rows: {0}; cols: {1}".format(hpa.shape[0], hpa.shape[1]))
    #df['feature'] = df['feature'].map(processFeature)
    r = df.head(count).plot(kind='barh',
                            title=TITLE,
                            x='feature',
                            y='gain',
                            color='tomato',
                            legend=False,
                            figsize=(10, 12))
    r.set_xlabel('Importance')
    r.set_ylabel('Features')
    r.invert_yaxis()

    r.figure.savefig(FILETITLE + '.png', bbox_inches='tight')


dbAdapter = OlegDB()


def load_obj(name):
    """Unpickle and return the object stored in ``results/<name>.pkl``.

    The path is relative to the current working directory; ``name`` is the
    file name without the ``.pkl`` extension.
    """
    pickle_path = ''.join(('results/', name, '.pkl'))
    with open(pickle_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded


# Load the pickled importance mapping stored as results/firsty.pkl.
importance = load_obj('firsty')

# Translate the importance keys into labels using the DB adapter.
# NOTE(review): convertLabels and selectAsDF are defined elsewhere; the
# result is presumably a key -> label mapping -- confirm.
labelMap = convertLabels(importance.keys(), dbAdapter, selectAsDF, type='plot')

print('HERE ARE LABELS')

#importance = {'hsa01100': 0.31735258141642814, 'hsa04740': 0.2208299216149202, 'hsa05100': 0.1847905733996812, 'hsa04930': 0.10625980494746863, 'hsa04514': 0.047493659101048136, 'hsa04114': 0.03542724660274679, 'hsa04810': 0.03365848585388666, 'hsa04144': 0.030556051003490892}#{"MP_0000180":34,343:1.0,30001:0.3}
Exemple #4
0
# How many of the top features to draw, and for which disease.
numOfFeatures = argData['num']
diseaseName = argData['disease']

logging.info('Running visualization using file...{0}'.format(fileName))

# The visualizations are written next to the input file: keep everything of
# `fileName` up to (not including) the last '/'-separated component.
tmpPath = fileName.split('/')[:-1]
filePath = '/'.join(tmpPath)

# Fetch the saved important features.
importance = load_obj(fileName)

# Access the database to get the description of important features.
if argData['db'] == "tcrd":
    dbAdapter = TCRD()
else:
    dbAdapter = OlegDB()

currentGraph = ProteinDiseaseAssociationGraph.load(argData['kgfile'])

# The graph visualization needs the original importance entries, so each
# of the top `numOfFeatures` items is passed on unchanged.
for imp in importance.most_common(numOfFeatures):
    print(imp)
    Visualize(imp,
              currentGraph.graph,
              diseaseName,
              filePath,
              dbAdapter=dbAdapter)

#newSet = {}
#for key in importance.keys():
#	newSet[labelMap[key]] = importance[key]
Exemple #5
0
# Static HTML emitted before the feature id: style sheet, the cytoscape.js
# scripts and the container <div> the graph is rendered into.
_VISUALIZE_HEADER_HEAD = """
		<style type="text/css">
		.disease {
			background-color: blue;
			color: blue;
		}
	</style>

	<!--cy.getElementById("GO:0016323").addClass()-->

	<script type="text/javascript" src=https://cdnjs.cloudflare.com/ajax/libs/cytoscape/3.7.3/cytoscape.min.js></script>
	<script src="https://unpkg.com/layout-base/layout-base.js"></script>
	<script src="https://unpkg.com/avsdf-base/avsdf-base.js"></script>
	<script type="text/javascript" src="https://ivis-at-bilkent.github.io/cytoscape.js-avsdf/cytoscape-avsdf.js"></script>

	<div id="cy" style="width:900px; height:750px; border-style: solid">


	</div>
        """

# Static HTML emitted after the feature id: opens the <script> element and
# starts the `data = {...}` assignment that the serialized graph completes.
_VISUALIZE_HEADER_TAIL = """
	<script type="text/javascript">
	data = 
	"""

# JavaScript appended after the serialized graph. The graph's closing brace
# is stripped by Visualize so that the 'container' and 'style' entries below
# become part of the same object literal before it is closed here.
_VISUALIZE_FOOTER = """
	,'container':document.getElementById('cy')
	,
	'style': [{
      selector: "node",
      css: {
        label: "data(name)"
       }
		}]


	};

	var cy = cytoscape(data);

let options = {
  name: 'avsdf', //'breadthfirst',

  fit: false, // whether to fit the viewport to the graph
  directed: false, // whether the tree is directed downwards (or edges can point in any direction if false)
  padding: 30, // padding on fit
  nodeSep: 40,
  circle: false, // put depths in concentric circles if true, put depths top down if false
  grid: false, // whether to create an even grid into which the DAG is placed (circle:false only)
  spacingFactor: 1.75, // positive spacing factor, larger => more space between nodes (N.B. n/a if causes overlap)
  boundingBox: undefined, // constrain layout bounds; { x1, y1, x2, y2 } or { x1, y1, w, h }
  avoidOverlap: true, // prevents node overlap, may overflow boundingBox if not enough space
  nodeDimensionsIncludeLabels: false, // Excludes the label when calculating node bounding boxes for the layout algorithm
  roots: undefined, // the roots of the trees
  maximal: false, // whether to shift nodes down their natural BFS depths in order to avoid upwards edges (DAGS only)
  animate: false, // whether to transition the node positions
  animationDuration: 500, // duration of animation in ms if enabled
  animationEasing: undefined, // easing of animation if enabled,
  animateFilter: function ( node, i ){ return true; }, // a function that determines whether the node should be animated.  All nodes animated by default on animate enabled.  Non-animated nodes are positioned immediately when the layout starts
  ready: undefined, // callback on layoutready
  stop: undefined, // callback on layoutstop
  transform: function (node, position ){ return position; } // transform a given node position. Useful for changing flow direction in discrete layouts
};

cy.layout(options).run();

//cy.$('#j, #e').addClass('foo'); ## ADD A CLASS TO THE MP nodes, and their label


// * 
// here we can build a harness that will color the nodes, and set edge weights? 
// the more this map is annotated, tbe better 

function isMPNode(input) {
	if(input[0]+input[1] == "MP") {
		return true;
	}

	return false;
}
function isGoNode(input) {
	if(input[0]+input[1] == "GO") {
		return true;
	}

	return false;
}

function isNotProteinOrMP(input) {
	if(input.length > 5) {
		return true && !isMPNode(input);
	}
	return false;
}

function edgeHasNode(inputEdge,nodeCheck) {
	if(nodeCheck(inputEdge.source) || nodeCheck(inputEdge.target)) {		
		return true
	}
	else {
		return false
	}
}


for(var node of Object.values(cy.nodes())) {
	//console.log(node)
	if(node.id) { 
		//cy.getElementById(id).style('label',id)
		var id = node.id()
		//console.log(id);
		if(isMPNode(id)) {
			cy.getElementById(id).style('background-color','#0081CF')			
		} else if(id.length > 5) {
			cy.getElementById(id).style('background-color','lightgreen')
		} else {
			cy.getElementById(id).style('background-color','rgb(0, 149, 149)')
		}
	}
	//node.addClass("disease")
}

// for each edge in the graph, if its got an association, color that, if its got a combined score, color that


for(var edge of Object.values(cy.edges())) {
	if(edge.data && edge.data().association != undefined) {
		console.log(edge.data().association)
		if(edge.data().association) {
			edge.style('line-color','#00C9A7');
			edge.style('width','12px');
			edge.style('label','KNOWN POSITIVE ASSOCIATION');
			edge.style('text-rotation','autorotate')
		} else {
			//edge.style('line-color','#FF6F91');
			//edge.style('width','3px');



			var data = edge.data();
			var noderemove = null;
			if(isMPNode(data.source)) {
				noderemove = data.target;
			} else {
				noderemove = data.source;
			}


			cy.remove(edge)
			cy.remove(cy.getElementById(noderemove))
		}
	} else {

		if(edge.data) {
			if(edgeHasNode(edge.data(),isNotProteinOrMP)){
				edge.style('line-color','lightgreen')
				edge.style('width','6px');
			} else if(edgeHasNode(edge.data(),isMPNode)){
				edge.style('line-color','lightblue')
				edge.style('label','')
				edge.style('width','6px');
			} else {
				var score = parseFloat(edge.data().combined_score)/1000.0
				edge.style('line-color','#444')
				console.log(parseFloat(edge.data().combined_score)/1000.0,score)
				edge.style('opacity',score.toString())
				edge.style('width','5%');
				edge.style('line-style','dotted')
			}

		}

		
		//console.log("NO!",edge)



	}
}

cy.layout(options).run();

//for(var edge in nod)

</script>
	"""


def _toJsonSafe(value):
    """``json.dumps`` fallback for values it cannot encode natively.

    Objects exposing a scalar ``item()`` accessor (presumably numpy scalars
    originating from DataFrames -- TODO confirm) are unwrapped to the plain
    Python value; anything else is written as its ``str()`` text, which is
    no less strict than the ``str(dict)`` dump this replaces.
    """
    try:
        return value.item()
    except (AttributeError, TypeError, ValueError):
        return str(value)


def Visualize(importance, graph, disease, resultDir, dbAdapter=None):
    """Write an HTML/cytoscape.js page linking one feature to a disease.

    Every node lying on a simple path between ``disease`` and the feature
    node (path length bounded by ``FindCutoff``) is collected, the induced
    subgraph is exported with ``nx.cytoscape_data`` and embedded into an
    HTML page saved as ``<resultDir>/<feature><unix time><disease>.html``.
    The cutoff and the output path are printed.

    Args:
        importance: indexable whose element 0 is the feature node id, e.g.
            one ``(feature, score)`` item; only element 0 is read.
        graph: networkx graph to search; must contain both end nodes.
        disease: node id of the disease, also used in the file name.
        resultDir: existing directory the HTML file is written into.
        dbAdapter: adapter handed to ``convertLabels`` to resolve node
            labels; a new ``OlegDB()`` is created when omitted.
    """
    import html
    import json

    firstFeature = importance

    # Nodes for which ProteinInteractionNode.isThisNode is true are looked
    # up in the graph by their integer form.
    middleNode = firstFeature[0]
    if ProteinInteractionNode.isThisNode(firstFeature[0]):
        middleNode = int(middleNode)

    cutoff = FindCutoff(graph, disease, middleNode)
    print("here is the cuttoffe", cutoff)

    # Union of all nodes on any simple path disease -> feature.
    nodesInGraph = set()
    for path in nx.all_simple_paths(graph,
                                    source=disease,
                                    target=middleNode,
                                    cutoff=cutoff):
        nodesInGraph.update(path)

    if dbAdapter is None:
        dbAdapter = OlegDB()

    niceLabels = convertLabels(list(nodesInGraph),
                               dbAdapter,
                               selectAsDF,
                               type='graph')

    finalGraph = graph.subgraph(list(nodesInGraph))
    cytoscapeDump = nx.cytoscape_data(finalGraph)

    for node in cytoscapeDump['elements']['nodes']:
        nodeData = node['data']
        # Show the human readable label instead of the raw id.
        nodeData['name'] = niceLabels[nodeData['value']]
        # The page's JavaScript inspects ids as strings, so protein
        # interaction node values are stringified.
        if ProteinInteractionNode.isThisNode(nodeData['value']):
            nodeData['value'] = str(nodeData['value'])

    # Serialize with json instead of str(dict) plus True/False text
    # replacement: that approach rewrote "True"/"False" inside labels and
    # emitted None/nan, neither of which is valid JavaScript. "</" is
    # escaped so a label cannot terminate the surrounding <script> element.
    # The final "}" is dropped because the footer adds more keys to the
    # same object literal before closing it.
    dataOut = json.dumps(cytoscapeDump,
                         default=_toJsonSafe).replace("</", "<\\/")[:-1]

    # The feature id may be a non-string (see the int() handling above), so
    # convert it explicitly and escape it before embedding it in the HTML.
    header = (_VISUALIZE_HEADER_HEAD + html.escape(str(firstFeature[0])) +
              _VISUALIZE_HEADER_TAIL)

    filePath = resultDir + "/{0}{1}{2}.html".format(str(
        firstFeature[0]), str(int(time.time())), disease)
    # Context manager guarantees the file is closed even if writing fails.
    with open(filePath, "w") as htmlFile:
        htmlFile.write(header + dataOut + _VISUALIZE_FOOTER)
    print("Output visualization file: {0}".format(filePath))
Exemple #6
0
    logging.error('Result directory is needed')
    exit()
# Number of cross-validation folds requested on the command line.
nfolds = argData['crossval_folds']  # applicable for average CV

# Node classes for which metapath features will be built.
nodes = [ProteinInteractionNode, KeggNode, ReactomeNode, GoNode, InterproNode]

# Static feature set names arrive as one comma-separated string.
# NOTE(review): ''.split(',') yields [''], so an empty 'static_data' value
# is reported below as one static feature set -- confirm this is intended.
staticFeatures = argData['static_data'].split(',')
logging.info(staticFeatures)

logging.info("--- USING {0} METAPATH FEATURE SETS".format(len(nodes)))
logging.info("--- USING {0} STATIC FEATURE SETS".format(len(staticFeatures)))

# Fetch the descriptions of pathway ids and the name/symbol of proteins.
# NOTE(review): the adapter is hard-coded to OlegDB here -- confirm no
# database selection option applies to this script.
dbAdapter = OlegDB()
idDescription = dbAdapter.fetchPathwayIdDescription()  # pathway id descriptions
idNameSymbol = dbAdapter.fetchSymbolForProteinId(
)  # name and symbol for each protein id
if fileData is not None:
    #logging.info("FOUND {0} POSITIVE LABELS".format(len(fileData[True])))
    #logging.info("FOUND {0} NEGATIVE LABELS".format(len(fileData[False])))
    trainData = metapathFeatures(disease,
                                 currentGraph,
                                 nodes,
                                 idDescription,
                                 staticFeatures,
                                 loadedLists=fileData).fillna(0)
else:
    trainData = metapathFeatures(disease, currentGraph, nodes, idDescription,
Exemple #7
0
        logging.info('Create the output directory')
        os.makedirs(args.outputdir)
    logging.info('Output directory for ML data(Training/predict): {0}'.format(
        args.outputdir))

    # check whether file or disease was given
    if args.trainingfile is None and args.disease is None:
        parser.error("--disease or -- training file must be specified.")

    # fetch KG data
    # graphString = args.kgfile
    currentGraph = ProteinDiseaseAssociationGraph.load(args.kgfile)
    logging.info("GRAPH {0} LOADED".format(args.kgfile))

    # Access the db adaptor. Make TCRD as the default DB
    dbAdapter = OlegDB() if args.db == "olegdb" else TCRD()

    if args.trainingfile is not None and args.disease is None:
        # trainingPklFile = trainingfile
        logging.info('Input training file: {0}'.format(args.trainingfile))
        # Load the pickled training set. Anything that goes wrong while
        # opening or unpickling the file is reported and ends the run.
        # NOTE: pickle can execute arbitrary code while loading; only feed
        # this files produced by the project's own tools.
        try:
            with open(args.trainingfile, 'rb') as f:
                fileData = pickle.load(f)
        except Exception:
            # `except Exception` instead of a bare `except` so that
            # KeyboardInterrupt/SystemExit are not swallowed; exc_info
            # records the real cause (missing file, corrupt pickle, ...).
            logging.error('Invalid pickled training set file', exc_info=True)
            exit()

        # Also add predict data if provided
        if args.predictfile is not None:
            # predictPklFile = predictfile
            logging.info('Input predict file: {0}'.format(args.predictfile))
Exemple #8
0
logging.info("Python version: {0}".format(sys.version.split()[0]))
logging.info("Pandas version: {0}".format(pd.__version__))
logging.info("NetworkX version: {0}".format(nx.__version__))

from ProteinGraphML.DataAdapter import OlegDB, selectAsDF
from ProteinGraphML.GraphTools import ProteinDiseaseAssociationGraph
from ProteinGraphML.Analysis import Visualize
from ProteinGraphML.MLTools.MetapathFeatures import metapathFeatures, ProteinInteractionNode, KeggNode, ReactomeNode, GoNode, InterproNode
from ProteinGraphML.MLTools.MetapathFeatures import getMetapaths
from ProteinGraphML.MLTools.Data import BinaryLabel
from ProteinGraphML.MLTools.Models import XGBoostModel
from ProteinGraphML.MLTools.Procedures import *  #XGBCrossVal

# Start time of the run.
t0 = time.time()

dbAdapter = OlegDB()

# Disease (mp_term_id) to work on. "MP_0000180" was tried before but
# apparently could not be found, hence the replacement id below.
# disease = "MP_0000180"
disease = "MP_0000184"
# Look up the disease name for logging.
# NOTE(review): the SQL text is built by string concatenation. That is
# harmless for the hard-coded id above, but switch to a bound parameter if
# `disease` ever comes from user input.
with db_session:
    dname = dbAdapter.db.get("SELECT name FROM mp_onto WHERE mp_term_id = '" +
                             disease + "'")
    logging.info("disease: {0}: \"{1}\"".format(disease, dname))

# No pre-built label file is used in this script.
fileData = None

# Saved protein-disease graph, given relative to the working directory.
# NOTE(review): an earlier remark says this file could not be found --
# confirm where the pickle is expected to live.
pickleFile = "ProteinDisease_GRAPH.pickle"
currentGraph = ProteinDiseaseAssociationGraph.load(pickleFile)
Exemple #9
0
# Report which procedure (looked up by name in the current namespace) runs.
logging.info('Procedure: {0} ({1})'.format(args.procedure,
                                           locals()[args.procedure]))

# A model file is mandatory for prediction.
if args.modelfile is None:
    logging.error("--modelfile required.")
    exit()
logging.info("Model '{0}' will be used for prediction".format(
    args.modelfile))

# So is a result directory.
if args.resultdir is None:
    logging.error('Result directory is needed')
    exit()
logging.info('Results will be saved in directory: {0}'.format(
    'results/' + args.resultdir))

# Access the db adaptor; TCRD is the default database.
if args.db == "olegdb":
    dbAdapter = OlegDB()
else:
    dbAdapter = TCRD()

# Descriptions of pathway ids, and name/symbol of each protein.
idDescription = dbAdapter.fetchPathwayIdDescription()
idNameSymbol = dbAdapter.fetchSymbolForProteinId()

# Wrap the prediction data and hand everything to the selected procedure.
d = BinaryLabel()
d.loadPredictData(predictData)

locals()[args.procedure](d, idDescription, idNameSymbol, args.modelfile,
                         args.resultdir, infofile)

logging.info('{0}: elapsed time: {1}'.format(
    os.path.basename(sys.argv[0]),
    time.strftime('%Hh:%Mm:%Ss', time.gmtime(time.time() - t0))))
Exemple #10
0
    logging.basicConfig(
        format='%(levelname)s:%(message)s',
        level=(logging.DEBUG if args.verbose > 1 else logging.INFO))

    fileName = os.path.basename(args.ifile)
    fileExt = fileName.split('.')[-1]
    dataDir = os.path.dirname(os.path.abspath(args.ifile))
    outBaseName = fileName.split('.')[0]

    if fileExt.lower() not in ('csv', 'tsv', 'txt', 'rds', 'xlsx', 'xls'):
        parser.error('Unsupported filetype: {0} ({1})'.format(
            fileName, fileExt))

    # Access the db adaptor. Make TCRD as the default DB
    dbAdapter = OlegDB() if args.db == "olegdb" else TCRD()

    allProteinIds = dbAdapter.fetchAllProteinIds()
    allProteinIds = set(allProteinIds['protein_id'].tolist())

    # check if negative labels need to be fetched from the database
    if args.use_default_negatives:
        logging.info(
            'INFO: Default negative protein ids will be selected for negative labels.'
        )
        negProteinIds = dbAdapter.fetchNegativeClassProteinIds()
        negProteinIds = set(negProteinIds['protein_id'].tolist())

    # Generate a dictionary to store the protein_ids for class 0 and class 1.
    # The dictionary will be saved in pickle format.
Exemple #11
0
                        default=0,
                        help="verbosity")

    args = parser.parse_args()

    logging.basicConfig(
        format='%(asctime)s %(levelname)s:%(message)s',
        level=(logging.DEBUG if args.verbose > 1 else logging.INFO))

    t0 = time.time()

    ## Construct base protein-disease map from ProteinDiseaseAssociationGraph.
    ## Db is PonyORM db (https://docs.ponyorm.org/api_reference.html).

    # Make TCRD as the default DB
    dbad = OlegDB() if args.db == "olegdb" else TCRD()

    pdg = ProteinDiseaseAssociationGraph(dbad)

    ## ProteinDiseaseAssociationGraph object has helper methods, but
    ## NetworkX methods also available.
    ## https://networkx.github.io/documentation/stable/reference/

    logging.info('Total nodes: %d; edges: %d' %
                 (pdg.graph.order(), pdg.graph.size()))

    ## Filter by proteins of interest; this list comes from a DB adapter, but any set will do.
    proteins = dbad.loadTotalProteinList().protein_id
    proteinSet = set(proteins)
    logging.info('Protein set: %d' % (len(proteinSet)))