def Main(annotFile=None, trailFile=None, outFile=None, header=None, new=None, old=None, skip=None, log=None):
    """Transfer annotations and write the resulting annotation table.

    Calls ``transferAnnotationsSkipOld`` when ``old`` is truthy and
    ``transferAnnotations`` otherwise, prints the ``'header'`` entry of the
    result (``None`` when it has none) and passes the result to
    ``outputDict2``.

    Args:
        annotFile: first positional argument of the transfer helper.
        trailFile: second positional argument of the transfer helper.
        outFile: destination handed to ``outputDict2``.
        header: forwarded to the transfer helper as ``header=``.
        new: third positional argument of the transfer helper.
        old: selects the "skip old" variant of the transfer when truthy.
        skip: forwarded to the transfer helper as ``skip=``.
        log: handle forwarded to ``myTimer``.

    Returns:
        None.
    """
    # NOTE(review): time.clock() was removed in Python 3.8. It is kept here
    # because myTimer (defined elsewhere) presumably measures against the same
    # clock -- confirm before switching both to time.perf_counter().
    i0 = time.clock()
    inext = i0

    ## Body ========================================================
    if old:
        newAnnot = transferAnnotationsSkipOld(annotFile, trailFile, new, header=header, skip=skip)
    else:
        newAnnot = transferAnnotations(annotFile, trailFile, new, header=header, skip=skip)

    ## Output ======================================================
    # Only a missing key / non-subscriptable result means "no header"; a bare
    # except would also hide KeyboardInterrupt and SystemExit.
    try:
        HEADER = newAnnot['header']
    except (KeyError, TypeError):
        HEADER = None
    print(HEADER)
    outputDict2(newAnnot, outFile, header=HEADER)
    inext = myTimer(inext, "New annotation writing", handle=log)

    ## Ending ======================================================
    prog = myModule()
    if prog == "__main__.py":
        prog = sys.argv[0].split("/")[-1]
    inext = myTimer(i0, "Total computing time for %s" % prog, handle=log)
    return
def Main(edgeFile=None, outEdgeFile=None, nodeFile=None, nodeList=None, comp=None, subType=None, sep=None, log=None):
    """Extract the subgraph of ``edgeFile`` induced by a selection of nodes.

    The selection comes from the first truthy option among ``nodeFile``
    (read with ``ut.file2list``), ``nodeList`` (comma-separated string) and
    ``comp`` (a ``"file,id"`` pair resolved with ``ut.getNodes``); when none
    is given ``None`` is passed on.  The actual work is delegated to
    ``ut.inducedSubgraph``.

    Args:
        edgeFile: input edge file handed to ``ut.inducedSubgraph``.
        outEdgeFile: forwarded as ``outFile=``.
        nodeFile: file listing the nodes to keep.
        nodeList: comma-separated node names.
        comp: ``"componentFile,componentID"`` string.
        subType: forwarded as ``nodeType=``.
        sep: forwarded as ``sep=``.
        log: handle forwarded to ``myTimer``.

    Returns:
        None.
    """
    # NOTE(review): time.clock() no longer exists from Python 3.8 on; kept
    # because myTimer presumably relies on the same clock -- confirm.
    lap = time.clock()
    # The original re-reads the clock for the global timer; preserved as is.
    start = time.clock()

    ## Node selection ==============================================
    if nodeFile:
        selection = ut.file2list(nodeFile)
    elif nodeList:
        selection = nodeList.strip().split(",")
    elif comp:
        comp_path, comp_label = comp.strip().split(",")
        selection = ut.getNodes(comp_path, comp_label)
    else:
        selection = None

    ## Subgraph extraction =========================================
    ut.inducedSubgraph(
        edgeFile,
        subnodes=selection,
        nodeType=subType,
        outFile=outEdgeFile,
        sep=sep,
    )
    lap = myTimer(lap, handle=log)

    ## Ending ======================================================
    script = myModule()
    if script == "__main__.py":
        script = sys.argv[0].split("/")[-1]
    myTimer(start, "Total computing time for %s" % script, handle=log)
    return
def Main(edgeFile=None, outFile=None, method=None, log=None):
    """Cluster the graph stored in ``edgeFile`` and write node memberships.

    The graph is read with ``igraph.read`` in ``ncol`` format.  If that
    raises ``SystemError``, columns 1, 2 and 9 of ``edgeFile`` are extracted
    with ``cut`` into a temporary ``<edgeFile>_<random>.edges`` file, which is
    read instead and then deleted.  ``Clustering(g, method=method)`` is then
    called and one ``name<TAB>membership`` line per vertex is written.

    Args:
        edgeFile: path of the edge file.
        outFile: output path; defaults to ``<edgeFile>_<method>.comp``.
        method: forwarded to ``Clustering`` and used in the default
            output name.
        log: handle forwarded to ``myTimer``.

    Returns:
        None.  Exits through ``sys.exit`` if the clustering raises
        ``igraph._igraph.InternalError``.
    """
    import os  # local import: only needed for the temporary-file cleanup

    # NOTE(review): time.clock() was removed in Python 3.8; kept because
    # myTimer presumably measures against the same clock -- confirm.
    i0 = time.clock()
    inext = i0
    if not outFile:
        outFile = edgeFile + "_" + method + ".comp"

    ## File reading ================================================
    try:
        g = igraph.read(edgeFile, format="ncol", directed=False, names=True)
    except SystemError:
        tag = random.randint(100000, 1000000)
        tempFile = edgeFile + "_" + str(tag) + ".edges"
        try:
            # Argument list + explicit stdout instead of a shell string, so
            # file names containing spaces or shell metacharacters are safe.
            with open(tempFile, "w") as tmp:
                proc = Popen(args=["cut", "-f1,2,9", edgeFile], stdout=tmp)
                proc.communicate()
            g = igraph.read(tempFile, format="ncol", directed=False, names=True)
        finally:
            # Remove the temporary file even when the second read fails.
            if os.path.exists(tempFile):
                os.remove(tempFile)

    ## Clustering ==================================================
    names = g.vs["name"]
    try:
        clustering = Clustering(g, method=method)
    except igraph._igraph.InternalError as e:
        sys.exit(e)

    ## Output ======================================================
    with open(outFile, 'w') as f:
        for node in g.vs:
            f.write("""%s\t%s\n""" % (names[node.index], clustering.membership[node.index]))
    inext = myTimer(inext, handle=log)

    ## Ending ======================================================
    prog = myModule()
    if prog == "__main__.py":
        prog = sys.argv[0].split("/")[-1]
    inext = myTimer(i0, "Total computing time for %s" % prog, handle=log)
    return
def Main(edgeFile=None, outEdgeFile=None, degree=None, nodeType=None, sep=None, log=None):
    """Select the nodes of degree at most ``degree`` and call ``ut.inducedSubgraph``.

    The two adjacency maps returned by ``ut.adjacencyList`` are used as
    follows: the first one when ``nodeType == 1``, the second one when
    ``nodeType == 2``, otherwise the first one updated in place with the
    second.  Every node whose neighbour collection has at most ``degree``
    elements is collected, and the list is passed to ``ut.inducedSubgraph``
    with ``nodeType=-1``.

    Args:
        edgeFile: input edge file.
        outEdgeFile: forwarded as ``outFile=``.
        degree: degree threshold, converted with ``int``.
        nodeType: ``1`` or ``2`` to restrict to one adjacency map.
        sep: forwarded as ``sep=``.
        log: handle forwarded to ``myTimer``.

    Returns:
        None.
    """
    max_degree = int(degree)
    # NOTE(review): time.clock() no longer exists from Python 3.8 on; kept
    # because myTimer presumably relies on the same clock -- confirm.
    start = time.clock()
    lap = start

    ## Adjacency selection =========================================
    heads, tails = ut.adjacencyList(edgeFile)
    if nodeType == 1:
        neighbours = heads
    elif nodeType == 2:
        neighbours = tails
    else:
        # In-place merge: entries of `tails` replace same-key entries of `heads`.
        heads.update(tails)
        neighbours = heads

    low_degree = []
    for node, linked in neighbours.items():
        if len(linked) <= max_degree:
            low_degree.append(node)

    ## Subgraph extraction =========================================
    ut.inducedSubgraph(edgeFile, low_degree, nodeType=-1, outFile=outEdgeFile, sep=sep)
    lap = myTimer(lap, "Removed %s nodes" % len(low_degree), handle=log)

    ## Ending ======================================================
    script = myModule()
    if script == "__main__.py":
        script = sys.argv[0].split("/")[-1]
    myTimer(start, "Total computing time for %s" % script, handle=log)
    return
def Main(blastFile=None, genome2sequence=None, sep=None, thr=None, cov=None, in_network=None, fasta=None, aln=None, clust=None, annot=None, key=None, keyList=None, log=None, directory=None, config=None):
    """Run the pipeline from a BLAST output to one analysis per threshold.

    Steps performed here:
      1. resolve the starting directory (that of ``blastFile``) and the root
         directory (``directory``, created if missing, else the cwd);
      2. optionally run the sequence comparison (``runBlast``/``runDiamond``)
         when ``fasta`` is given;
      3. unless ``in_network`` is supplied, run the ``cleanblast`` command on
         ``blastFile`` and log its output;
      4. call ``completeAnalysis`` once per threshold listed in ``thr``.

    Args:
        blastFile: BLAST output file (required).
        genome2sequence: genome-to-sequence file (required).
        sep: field separator used for the annotation header and forwarded
            to ``completeAnalysis``.
        thr: comma-separated integer similarity thresholds.
        cov: coverage value, converted with ``float``.
        in_network: pre-computed gene network; skips the cleaning step.
        fasta: FASTA file; triggers the sequence comparison.
        aln: ``"b"`` for BLAST or ``"d"`` for DIAMOND.
        clust: forwarded as ``clustType=``.
        annot: annotation file, resolved relative to the starting directory.
        key: forwarded as ``UniqID=``.
        keyList: comma-separated keys; defaults to the annotation header
            minus its first column.
        log: log file name (joined to the root directory) or ``sys.stderr``.
        directory: root directory for the outputs.
        config: forwarded as ``config=``.

    Returns:
        An empty tuple.
    """
    import shlex  # local import: quoting of the path given to the shell

    # blastFile may be None at this point; the explicit check comes below.
    try:
        startWD = os.path.abspath(os.path.dirname(blastFile))
    except (TypeError, AttributeError):
        startWD = os.path.abspath(os.getcwd())
    os.chdir(startWD)
    if directory:
        rootDir = os.path.abspath(directory)
        if not os.path.exists(rootDir):
            os.makedirs(rootDir)
    else:
        rootDir = os.getcwd()
    if log != sys.stderr:
        # A missing log name falls back to stderr instead of crashing in join.
        try:
            log = os.path.join(rootDir, log)
        except TypeError:
            log = sys.stderr

    ### Argument processing ========================================
    if not blastFile or not genome2sequence:
        sys.exit("Required files %s and %s" % ("blastFile", "genome2sequence"))
    blastFile = os.path.abspath(blastFile)
    genome2sequence = os.path.abspath(genome2sequence)
    ThresholdList = list(map(int, thr.strip().split(",")))
    cover = float(cov)
    print("Starting directory: %s" % startWD)
    print("Root directory: %s" % rootDir)
    if fasta:
        if aln == "b":
            runBlast(fasta, blastFile)
        elif aln == "d":
            runDiamond(fasta, blastFile)
        else:
            sys.exit(
                "Wrong sequence comparison option -- use (b) for BLAST - (d) for DIAMOND"
            )

    ## Filename definitions ========================================
    if in_network:
        geneNetwork = os.path.abspath(in_network)
    else:
        geneNetwork = blastFile + ".cleanNetwork"
    if annot:
        annot = os.path.abspath(os.path.join(startWD, annot))
        if keyList:
            keyList = keyList.split(",")
        else:
            # Default keys: every header column except the first one.
            with open(annot, 'r') as ANNOT:
                keyList = ANNOT.readline().strip().split(sep)[1:]
    else:
        annot = None
        keyList = None

    ## Program body ================================================
    os.chdir(rootDir)
    ## A) from the blast output to the sequence families
    # a) filter self-hits and keep only best hit
    if not in_network:
        # The outputs are three files named blastFile".cleanNetwork",
        # blastFile".cleanNetwork.dico" and blastFile".cleanNetwork.genes".
        # The path is quoted because the command goes through a shell.
        cmd1 = "%s -n 1 -i %s" % (cleanblast, shlex.quote(blastFile))
        printLog(
            "--------------------------------------------------\nRunning %s" % cmd1,
            log)
        proc1 = Popen(args=[cmd1], shell=True, stdout=PIPE, executable="/bin/bash")
        out = proc1.communicate()[0]
        printLog(out.decode('utf-8'), log)
    # b) perform complete analysis for each threshold
    for n in ThresholdList:
        STR = """--------------------------------------------------\nSimilarity threshold %d%%""" % n
        printLog(STR, log)
        completeAnalysis(geneNetwork, genome2sequence, n, cover, a=annot,
                         clustType=clust, UniqID=key, sep=sep, keyList=keyList,
                         handle=log, config=config)
        # Come back to the root directory before the next threshold.
        os.chdir(rootDir)
    return ()
def Main(edgeFile=None, outEdgeFile=None, outTrailFile=None, direct=None, community=None, comm_fasta=None, comm_id=None, in_trail=None, inType=None, outType=None, sep=None, weight=None, log=None, header=None):
    """Build the community network of ``edgeFile`` and write edge/trail/type files.

    A node -> community mapping is read from ``community``
    (``ut.node2community``) or ``comm_fasta`` (``ut.node2communityFasta``);
    with neither, ``None`` is passed and ``ut.network2communityNetwork``
    receives no initial dictionary.  The resulting edges are written with
    or without weights, followed by the trail file and, when ``inType`` is
    given, the type file.

    Args:
        edgeFile: input edge file; must not be empty.
        outEdgeFile: output edge file name (default ``edgeFile + ".out"``).
        outTrailFile: output trail file name (default ``edgeFile + ".trail"``).
        direct: output directory, relative to the cwd; created if missing.
        community: community file name.
        comm_fasta: FASTA-based community definition.
        comm_id: forwarded to ``ut.node2community`` as ``ID=``.
        in_trail: previous trail file; selects ``ut.outputTrailFile``.
        inType: input type file; triggers the type-file output.
        outType: output type file name (default ``outEdgeFile + ".type"``).
        sep: field separator forwarded to the ``ut`` helpers.
        weight: write weighted edges when truthy.
        log: handle forwarded to ``myTimer``.
        header: forwarded to the trail-file writers.

    Returns:
        None.

    Raises:
        IOError: if ``edgeFile`` is empty and the module is not run as a
            script (a script run exits through ``sys.exit`` instead).
    """
    if not outEdgeFile:
        outEdgeFile = edgeFile + ".out"
    if not outTrailFile:
        outTrailFile = edgeFile + ".trail"

    ### Option processing ===========================================
    if not os.stat(edgeFile).st_size:
        if myModule() == "__main__.py":
            sys.exit("Error: Empty file %s" % edgeFile)
        else:
            raise IOError("Empty file %s" % edgeFile)
    if direct:  # output directory processing
        directory = os.path.join(os.getcwd(), direct)
        if not os.path.exists(directory):
            os.makedirs(directory)
    else:
        directory = os.getcwd()

    ## Filename definitions ========================================
    # NOTE(review): time.clock() was removed in Python 3.8; kept because
    # myTimer presumably measures against the same clock -- confirm.
    i0 = time.clock()
    inext = i0
    outFile = os.path.join(directory, outEdgeFile)
    outDict = os.path.join(directory, outTrailFile)

    ## File reading options ========================================
    # Read the clustering file and attribute a community identifier to nodes.
    if community:
        # Explicitly redefined nodes -- some nodes may be missing and are
        # handled by the completion step below.
        newNodes = ut.node2community(community, sep=sep, ID=comm_id)
        inext = myTimer(inext, "Community reading", handle=log)
    elif comm_fasta:
        newNodes = ut.node2communityFasta(comm_fasta, sep=sep)
    else:
        # Otherwise all nodes are missing nodes and appear in the completion.
        newNodes = None

    ## Body ========================================================
    # newDictionary: old_nodes --> new_nodes, obtained as the completion of
    # the clustering file.
    newDictionary, edges_dict, edges_std = ut.network2communityNetwork(
        edgeFile, dictionary=newNodes, sep=sep, useWeights=True)
    inext = myTimer(inext, "Community construction", handle=log)
    if weight:
        ut.outputEdgesDict_Weights(edges_dict, edges_std=edges_std, outFile=outFile, sep=sep)
    else:
        ut.outputEdgesDict_NoWeights(edges_dict, outFile=outFile, sep=sep)
    inext = myTimer(inext, "New graph writing", handle=log)

    ## Output options ==============================================
    if in_trail:
        ut.outputTrailFile(newDictionary, in_trail, outfile=outDict, sep=sep, header=header)
    else:
        ut.outputFile(newDictionary, outfile=outDict, sep=sep, header=header)
    inext = myTimer(inext, "TrailFile writing", handle=log)
    if inType:
        if outType:
            outType = os.path.join(directory, outType)
        else:
            # Was `output_network + ".type"`, a name never defined in this
            # function (NameError); the output edge file is the evident base.
            outType = os.path.join(directory, outEdgeFile + ".type")
        ut.outputTypeFile(newDictionary, inType, outfile=outType, sep=sep)
        inext = myTimer(inext, "TypeFile writing", handle=log)

    ## Ending ======================================================
    prog = myModule()
    if prog == "__main__.py":
        prog = sys.argv[0].split("/")[-1]
    inext = myTimer(i0, "Total computing time for %s" % prog, handle=log)
    return
def Main(edgeFile=None, annotFile=None, sep=None, outFile=None, Xout=None, restrAnnot=None, nodeList=None, NodeType=None, nodeID=None, unilat=None, track=None, empty=None,
         x=None, X=None, K=None, hist=None, display=None, trail=None, comp=None, keyList=None, log=None):
    """Describe the nodes of ``edgeFile`` from an annotation file.

    Without ``x`` the function only generates an XML configuration file
    (``X`` or ``"config.xml"``) through ``ut.generateXML`` and returns.
    With ``x`` the configuration is read back with ``ut.readConfigFile``,
    the node and entry lists are built, the annotations are loaded and
    ``ut.xmlDescription`` writes the description.

    Args:
        edgeFile: input edge file; must not be empty.
        annotFile: annotation file.
        sep: field separator forwarded to ``ut.readNodes``.
        outFile: description output (default ``<edgeFile stem>.desc``).
        Xout: XML output file forwarded as ``xmlOutFile=``.
        restrAnnot: use ``ut.restrictAnnot`` instead of
            ``ut.myLoadAnnotations`` when truthy.
        nodeList: file listing the node identifiers to restrict to.
        NodeType: ``'1'``, ``'2'`` or other; selects how nodes are listed.
        nodeID: name of the identifier column of the annotation file.
        unilat: comma-separated node types to keep.
        track: forwarded as ``track=`` (reset to ``None`` when ``empty``).
        empty: disables ``track`` when truthy.
        x: configuration file to read.
        X: configuration file to generate.
        K: open the configuration in ``xmlform`` before reading it.
        hist: trail-history file used to build the trail objects.
        display: forwarded to ``ut.generateXML``.
        trail: unused in this function.
        comp: component file wrapped in ``ut.myMod``.
        keyList: key selection processed by ``ut.processOptions``.
        log: log file name (joined to the cwd) or ``sys.stderr``.

    Returns:
        ``()`` after generating a configuration file, ``None`` otherwise.

    Raises:
        IOError: if ``edgeFile`` is empty and the module is not run as a
            script (a script run exits through ``sys.exit`` instead).
    """
    startWD = os.getcwd()
    if log != sys.stderr:
        try:
            log = os.path.join(startWD, log)
        except TypeError:
            log = sys.stderr

    ## Filename definitions ========================================
    # NOTE(review): time.clock() was removed in Python 3.8; kept because
    # myTimer presumably measures against the same clock -- confirm.
    i0 = time.clock()
    inext = i0
    if not outFile:
        inRad = edgeFile.split(".")[0]
        outFile = inRad + ".desc"
    if empty:
        track = None

    ## File reading options ========================================
    # Step 1) Store the node type (top(1)/bottom(2) in the bipartite graph),
    # adapted for the k-partite case.
    if not os.stat(edgeFile).st_size:
        if myModule() == "__main__.py":
            sys.exit("Error: Empty file %s" % edgeFile)
        else:
            raise IOError("Empty file %s" % edgeFile)
    # Only the values of the node-type mapping are needed at this stage.
    nodeType = ut.readNodeType(edgeFile, Type=NodeType)
    try:
        nodeTypes = list(set(nodeType.values()))
    except AttributeError:
        # No mapping was returned (presumably the unipartite case).
        nodeTypes = [1]
    inext = myTimer(inext, "Reading nodeType", handle=log)

    ## Step 2) Read the XML configuration file, or generate it and exit.
    # a) set variables.
    if keyList:
        keyDict = ut.processOptions(keyList, nodeTypes)
    else:
        selectedKeys = list(ut.getHeader(annotFile).keys())
        selectedKeys.remove(nodeID)
        # Every node type shares the same list of selected keys.
        keyDict = {n: selectedKeys for n in nodeTypes}
    trailObjects = []
    compObject = None
    root = os.getcwd()
    if comp:
        compObject = ut.myMod(fileName=comp, attDict={0: "Module"})
    if hist:
        # Generate the complete trail history for the XML file: `hist` is the
        # main trail file (from rootDir to cwDir).
        history = ut.trailTrack(hist)
        root = ut.trailHist(hist)['root']
        for k, trailName in enumerate(history, start=1):
            trailKeyDict = {i: "NodeType" + str(i) for i in nodeTypes}
            Trail = ut.myTrail(fileName=trailName, rank=k, attDict=trailKeyDict)
            trailObjects.append(Trail)
    if x:
        # A configuration file was given: proceed with the description.
        configFile = x
        if X:
            # X names the configuration file to (re)generate.
            if x == X:
                configFile = ut.generateXML(nodeTypes, trailObjects=trailObjects,
                                            compObject=compObject, attDict=keyDict,
                                            outFile=X, display=display, root=root)
            else:
                sys.exit("Conflicting fields -x and -X. Check and run again.")
        if K:
            ret = xmlform.main(xmlFile=configFile)
            if ret == "Cancel":
                sys.exit(0)
        trailObjects, compObject, keyDict, selectedKeys, XML = ut.readConfigFile(
            configFile)
        ut.printDescription(trailObjects, compObject, keyDict, selectedKeys,
                            handle=sys.stderr)
    else:
        # Generate the configuration file and stop.
        outConf = X if X else "config.xml"
        configFile = ut.generateXML(nodeTypes, trailObjects=trailObjects,
                                    compObject=compObject, attDict=keyDict,
                                    outFile=outConf, display=display, root=root)
        if myModule() == "__main__.py":
            printLog(
                "Configured file %s: please check options, and pass it with -x option"
                % outConf, log)
        return ()

    ## Step 3) Define the node lists of currentID and UniqID.
    if NodeType == '2':
        nodes = nodeType.keys()
    elif NodeType == '1':
        nodes = ut.readNodes(edgeFile, sep=sep)
        nodeType = ut.initDict(nodes, value=1)
    else:
        nodes = ut.readNodes(edgeFile, sep=sep)
    if nodeList:
        # Explicit file with the currentIDs to restrict to.  The original read
        # `options.n`, a name that does not exist in this function.
        nodes = ut.file2set(nodeList)
        inext = myTimer(inext, "Reading nodeFile", handle=log)
    if unilat:
        nTypes = set(unilat.strip().split(","))
        # A list (not a generator) is required: len() is taken just below.
        nodes = [node for node in nodes if nodeType[node] in nTypes]
    printLog("""Loaded %d nodes""" % len(nodes), log)

    # Selected UniqIDs: ========
    if trailObjects:
        # Load the dictionaries of the main trail file.
        trailObjects[-1].getDict()
        current2UniqID = trailObjects[-1].dict_inv
        myEntries = ut.unList(map(lambda node: current2UniqID[node], nodes))
    else:
        myEntries = nodes
        current2UniqID = None
    printLog("""Found %d entries""" % len(myEntries), log)
    inext = myTimer(inext, "Reading allEntries", handle=log)

    # Annotation file processing: ==========
    if restrAnnot:
        annotationDict, fields = ut.restrictAnnot(annotFile, mainKey=str(nodeID),
                                                  valueKeyList=selectedKeys)
    else:
        annotationDict, fields = ut.myLoadAnnotations(
            annotFile, mainKey=str(nodeID), valueKeyList=selectedKeys, counter=0)
    inext = myTimer(inext, "Reading annotations", handle=log)

    ## Step 4) Construct the actual description.
    OutFile = Xout
    ut.xmlDescription(annot=annotationDict, nodeDict=current2UniqID,
                      entries=myEntries, compObject=compObject,
                      trails=trailObjects, nodeType=nodeType, keyDict=keyDict,
                      xmlOutFile=OutFile, outFile=outFile, track=track, X=XML,
                      handle=log)
    if Xout:
        printLog("XML output written to %s" % OutFile, log)
    else:
        printLog("Description written to %s" % outFile, log)

    ## Output and exit =============================================
    prog = myModule()
    if prog == "__main__.py":
        prog = sys.argv[0].split("/")[-1]
    inext = myTimer(i0, "Total computing time for %s" % prog, handle=log)
    return
def Main(edgeFile=None, outFile=None, sep=None, unilat=None, twin_supp=None, Twin_Supp=None, min_supp=None, min_size=None, nodeType=None, comp=None, debug=None, log=None):
    """Detect twin nodes in the graph of ``edgeFile`` and write the results.

    The graph is read with ``ut.myReadGraph``, an adjacency list is computed
    (optionally restricted to the node types listed in ``unilat``) and
    ``ut.detectRepeated`` returns ``support`` (groupID -> common list of
    neighbours) and ``twins`` (node -> groupID of its twin class).  Both are
    renamed from vertex indices to vertex names before being written.

    Args:
        edgeFile: input edge file; must not be empty.
        outFile: node -> twin-class output (default ``edgeFile + ".twins"``).
        sep: separator used in ``comp`` and forwarded to ``ut.outputDict``.
        unilat: comma-separated integer node types to restrict to.
        twin_supp: output file with lines ``twinID twinNb supportNb support...``.
        Twin_Supp: output file with lines ``twinID twinNodes twinSupport``
            (tab-separated; nodes and support are comma-joined).
        min_supp: minimal support size for a class to be reported.
        min_size: minimal number of twins for a class to be reported.
        nodeType: ``1``, ``2``/empty, or a node-type file name.
        comp: component file receiving twins and then support nodes.
        debug: forwarded to ``ut.detectRepeated``.
        log: handle forwarded to ``myTimer`` and ``printLog``.

    Returns:
        None.

    Raises:
        IOError: if ``edgeFile`` is empty and the module is not run as a
            script (a script run exits through ``sys.exit`` instead).
    """
    # NOTE(review): time.clock() was removed in Python 3.8; kept because
    # myTimer presumably measures against the same clock -- confirm.
    i0 = time.clock()
    inext = i0

    ## Option processing ===========================================
    if not outFile:
        outFile = edgeFile + ".twins"
    thr = min_supp
    try:
        k_part = int(nodeType)
    except (TypeError, ValueError):
        # Not a number: kept as is (used as a node-type file name below).
        k_part = nodeType

    ## File reading ================================================
    if not os.stat(edgeFile).st_size:
        if myModule() == "__main__.py":
            sys.exit("Error: Empty file %s" % edgeFile)
        else:
            raise IOError("Empty file %s" % edgeFile)
    g = ut.myReadGraph(edgeFile)
    print(g.summary())
    id2name = {n.index: n['name'] for n in g.vs()}
    inext = myTimer(i0, "Loading graph", handle=log)

    ## Program body ================================================
    # Adjacency list computation ------------------------------
    nodes = None
    if unilat:
        typeSet = set(map(int, unilat.strip().split(",")))
        typeDict = defaultdict(int)
        if k_part == 2 or not k_part:
            typeDict.update(ut.rawNodeType(edgeFile))
        elif k_part != 1:
            typeDict.update(ut.loadNodeType(k_part))
        nodes = (n.index for n in g.vs() if typeDict[n['name']] in typeSet)
    ADJ = ut.getAdjlist(g, nodes=nodes)
    inext = myTimer(inext, "Computation of adjacency list", handle=log)

    # Twin computation ----------------------------------------
    # support: groupID -> common_list_of_neighbours
    # twins: node -> groupID_of_its_twin_class
    support, twins = ut.detectRepeated(ADJ, k_init=0, debug=debug)
    inext = myTimer(inext, "Computation of twins", handle=log)
    support = {gid: tuple(id2name[v] for v in support[gid]) for gid in support}
    twins = {id2name[node]: twins[node] for node in twins}
    inext = myTimer(inext, "Renumbering of twins", handle=log)
    sniwt = ut.InvertMap(twins)  # groupID -> list_of_twin_nodes
    inext = myTimer(inext, "Computation of twin support", handle=log)

    # Components (twins + support)
    if comp:
        with open(comp, 'w') as h:
            for key, val in twins.items():
                h.write(str(key) + sep + str(val) + "\n")
            inext = myTimer(inext, "Writing twins file", handle=log)
            for val, members in support.items():
                for node in members:
                    h.write(str(node) + sep + str(val) + "\n")
        inext = myTimer(inext, "Writing twins component file", handle=log)

    # twinSupport (twinID twinNb twinSupport)
    if twin_supp:
        # Handle named `out` so that the graph `g` is not shadowed.
        with open(twin_supp, 'w') as out:
            for i, nodeList in sniwt.items():
                supp = support[i]
                # Threshold for trivial twins (new option 15/12/15)
                if len(supp) >= thr and len(nodeList) >= min_size:
                    vals = [str(i), str(len(nodeList)), str(len(supp))]
                    vals.extend(map(str, supp))
                    out.write("\t".join(vals) + "\n")
        inext = myTimer(inext, "Writing twins support file", handle=log)

    # TwinSupport (twinID twinNodes twinSupport)
    if Twin_Supp:
        with open(Twin_Supp, 'w') as out:
            for i, nodeList in sniwt.items():
                supp = support[i]
                # Threshold for trivial twins (new option 15/12/15)
                if len(supp) >= thr and len(nodeList) >= min_size:
                    myTwins = ",".join(map(str, nodeList))
                    mySupport = ",".join(map(str, supp))
                    # The original computed both strings but wrote only the
                    # raw support entries, dropping the twin nodes announced
                    # by the documented format.
                    out.write("\t".join([str(i), myTwins, mySupport]) + "\n")
        inext = myTimer(inext, "Writing Twins Support file", handle=log)

    ut.outputDict(twins, outFile, sep=sep)

    # Summary statistics -------------------------------------------
    allTwins = len(sniwt)
    t = sum(1 for members in sniwt.values() if len(members) == 1)
    nt = allTwins - t
    # With no twin class at all both percentages are reported as 0.
    tp = 100 * float(t) / float(allTwins) if allTwins else 0
    ntp = 100 * float(nt) / float(allTwins) if allTwins else 0
    printLog("""Found %s twins, %s trivial twins (%.2f%%) and %s non-trivial twins (%.2f%%)""" % (allTwins, t, tp, nt, ntp), log)

    ## Ending ======================================================
    prog = myModule()
    if prog == "__main__.py":
        prog = sys.argv[0].split("/")[-1]
    inext = myTimer(i0, "Total computing time for %s" % prog, handle=log)
    return