Ejemplo n.º 1
0
def Main(annotFile=None,trailFile=None,outFile=None,header=None,new=None,old=None,skip=None,log=None):
    """Transfer annotations through a trail file and write the result.

    Args:
        annotFile: first positional argument handed to the transfer helper.
        trailFile: second positional argument handed to the transfer helper.
        outFile: destination passed to ``outputDict2``.
        header: forwarded to the transfer helper as ``header=``.
        new: third positional argument handed to the transfer helper.
        old: when truthy, ``transferAnnotationsSkipOld`` is used instead of
            ``transferAnnotations``.
        skip: forwarded to the transfer helper as ``skip=``.
        log: handle forwarded to ``myTimer``.

    Returns:
        None.
    """
    # time.clock() was removed in Python 3.8; use it where it still exists and
    # fall back to time.perf_counter() otherwise.
    # NOTE(review): myTimer must measure with the same clock -- confirm.
    clock = getattr(time, "clock", None) or time.perf_counter
    i0 = clock()
    inext = i0
    ## Body: pick the transfer routine according to the `old` flag
    transfer = transferAnnotationsSkipOld if old else transferAnnotations
    newAnnot = transfer(annotFile, trailFile, new, header=header, skip=skip)
    ## Output
    try:
        HEADER = newAnnot['header']
    except (KeyError, TypeError, IndexError):
        # No 'header' entry (or result not subscriptable by key): write without header.
        HEADER = None
    print(HEADER)
    outputDict2(newAnnot, outFile, header=HEADER)
    inext = myTimer(inext, "New annotation writing", handle=log)
    ## Ending: report total time under the invoking script's name
    prog = myModule()
    if prog == "__main__.py":
        prog = sys.argv[0].split("/")[-1]
    inext = myTimer(i0, "Total computing time for %s" % prog, handle=log)
    return
Ejemplo n.º 2
0
def Main(edgeFile=None,outEdgeFile=None,nodeFile=None,nodeList=None,comp=None,subType=None,sep=None,log=None):
    """Write the subgraph of ``edgeFile`` induced by a selection of nodes.

    The node selection comes from the first of these that is truthy:
    ``nodeFile`` (read with ``ut.file2list``), ``nodeList`` (comma-separated
    string), ``comp`` (``"compFile,compID"`` resolved with ``ut.getNodes``).
    If none is given, ``subnodes=None`` is passed to ``ut.inducedSubgraph``.

    Args:
        edgeFile: input edge file passed to ``ut.inducedSubgraph``.
        outEdgeFile: forwarded as ``outFile=``.
        nodeFile: file holding the node selection.
        nodeList: comma-separated node identifiers.
        comp: string of the form ``"compFile,compID"``.
        subType: forwarded as ``nodeType=``.
        sep: forwarded as ``sep=``.
        log: handle forwarded to ``myTimer``.

    Returns:
        None.
    """
    # time.clock() was removed in Python 3.8; use it where it still exists and
    # fall back to time.perf_counter() otherwise.
    # NOTE(review): myTimer must measure with the same clock -- confirm.
    clock = getattr(time, "clock", None) or time.perf_counter
    i0 = clock()
    inext = i0
    ## File reading: build the node selection
    if nodeFile:
        subnodes = ut.file2list(nodeFile)
    elif nodeList:
        subnodes = nodeList.strip().split(",")
    elif comp:
        # Raises ValueError unless comp holds exactly one comma.
        compFile, compID = comp.strip().split(",")
        subnodes = ut.getNodes(compFile, compID)
    else:
        subnodes = None
    ## Program body
    ut.inducedSubgraph(edgeFile, subnodes=subnodes, nodeType=subType, outFile=outEdgeFile, sep=sep)
    ## Output
    inext = myTimer(inext, handle=log)
    ## Ending: report total time under the invoking script's name
    prog = myModule()
    if prog == "__main__.py":
        prog = sys.argv[0].split("/")[-1]
    inext = myTimer(i0, "Total computing time for %s" % prog, handle=log)
    return
Ejemplo n.º 3
0
def Main(edgeFile=None,outFile=None,method=None,log=None):
    """Cluster the graph in ``edgeFile`` and write one ``name<TAB>cluster`` line per node.

    The graph is read with ``igraph.read(..., format="ncol")``. If that raises
    ``SystemError``, columns 1, 2 and 9 of ``edgeFile`` are extracted with
    ``cut`` into a temporary file next to ``edgeFile`` and the read is retried
    on that file.

    Args:
        edgeFile: path of the edge file.
        outFile: output path; defaults to ``edgeFile + "_" + method + ".comp"``.
        method: forwarded to ``Clustering`` as ``method=``; also used to build
            the default output name.
        log: handle forwarded to ``myTimer``.

    Returns:
        None. Exits via ``sys.exit`` if ``Clustering`` raises
        ``igraph._igraph.InternalError``.
    """
    # Local imports: the fallback path below needs them and the block must not
    # rely on file-level imports it did not use before.
    import os
    import tempfile

    # time.clock() was removed in Python 3.8; use it where it still exists and
    # fall back to time.perf_counter() otherwise.
    # NOTE(review): myTimer must measure with the same clock -- confirm.
    clock = getattr(time, "clock", None) or time.perf_counter
    i0 = clock()
    inext = i0
    if not outFile:
        outFile = edgeFile+"_"+method+".comp"
    ## File reading ========================================
    try:
        g = igraph.read(edgeFile, format="ncol", directed=False, names=True)
    except SystemError:
        # Retry on a 3-column extract (fields 1, 2 and 9). The temp file is
        # created atomically beside edgeFile and always removed, and `cut` is
        # run without a shell so unusual characters in paths are harmless.
        workDir = os.path.dirname(os.path.abspath(edgeFile))
        fd, tempFile = tempfile.mkstemp(prefix=os.path.basename(edgeFile) + "_",
                                        suffix=".edges",
                                        dir=workDir)
        try:
            with os.fdopen(fd, "wb") as tmp:
                Popen(["cut", "-f1,2,9", "--", edgeFile], stdout=tmp).communicate()
            g = igraph.read(tempFile, format="ncol", directed=False, names=True)
        finally:
            os.remove(tempFile)
    names = g.vs["name"]
    try:
        clustering = Clustering(g,method=method)
    except igraph._igraph.InternalError as e:
        sys.exit(e)
    membership = clustering.membership
    with open(outFile,'w') as f:
        for node in g.vs:
            f.write("""%s\t%s\n""" % (names[node.index], membership[node.index]))
    ## Output ======================================================
    inext = myTimer(inext,handle=log)
    ## Ending: report total time under the invoking script's name
    prog = myModule()
    if prog == "__main__.py":
        prog = sys.argv[0].split("/")[-1]
    inext = myTimer(i0,"Total computing time for %s" % prog,handle=log)
    return
Ejemplo n.º 4
0
def Main(edgeFile=None,
         outEdgeFile=None,
         degree=None,
         nodeType=None,
         sep=None,
         log=None):
    """Select the nodes of degree at most ``degree`` and call ``ut.inducedSubgraph`` on them.

    Args:
        edgeFile: edge file read by ``ut.adjacencyList`` and ``ut.inducedSubgraph``.
        outEdgeFile: forwarded as ``outFile=``.
        degree: degree threshold; converted with ``int`` (so ``None`` raises
            ``TypeError``).
        nodeType: ``1`` uses the first adjacency map returned by
            ``ut.adjacencyList``, ``2`` the second, anything else both.
        sep: forwarded as ``sep=``.
        log: handle forwarded to ``myTimer``.

    Returns:
        None.
    """
    d = int(degree)
    # time.clock() was removed in Python 3.8; use it where it still exists and
    # fall back to time.perf_counter() otherwise.
    # NOTE(review): myTimer must measure with the same clock -- confirm.
    clock = getattr(time, "clock", None) or time.perf_counter
    i0 = clock()
    inext = i0
    ## File reading ========================================
    h2t, t2h = ut.adjacencyList(edgeFile)
    if nodeType == 1:
        adj = h2t
    elif nodeType == 2:
        adj = t2h
    else:
        # NOTE(review): update() replaces the entry of any node present in both
        # maps instead of merging the neighbour lists -- fine only if the two
        # key sets are disjoint; confirm.
        adj = h2t
        adj.update(t2h)
    subnodes = [node for node, neighbours in adj.items() if len(neighbours) <= d]
    # inducedSubgraph is always called with nodeType=-1, whatever was passed in.
    nodeType = -1
    ## Program body ===========================================
    ut.inducedSubgraph(edgeFile,
                       subnodes,
                       nodeType=nodeType,
                       outFile=outEdgeFile,
                       sep=sep)
    ## Output ======================================================
    inext = myTimer(inext, "Removed %s nodes" % len(subnodes), handle=log)
    ## Ending: report total time under the invoking script's name
    prog = myModule()
    if prog == "__main__.py":
        prog = sys.argv[0].split("/")[-1]
    inext = myTimer(i0, "Total computing time for %s" % prog, handle=log)
    return
Ejemplo n.º 5
0
def Main(blastFile=None,
         genome2sequence=None,
         sep=None,
         thr=None,
         cov=None,
         in_network=None,
         fasta=None,
         aln=None,
         clust=None,
         annot=None,
         key=None,
         keyList=None,
         log=None,
         directory=None,
         config=None):
    """Run the cleaning step and one ``completeAnalysis`` per similarity threshold.

    Args:
        blastFile: sequence-comparison output file (required).
        genome2sequence: second required input file, forwarded to
            ``completeAnalysis``.
        sep: separator used to split the annotation header and forwarded to
            ``completeAnalysis``.
        thr: comma-separated integer thresholds.
        cov: coverage value, converted with ``float``.
        in_network: existing network file; when given, the cleaning command is
            skipped.
        fasta: when given, ``runBlast`` or ``runDiamond`` is called first.
        aln: ``"b"`` for BLAST or ``"d"`` for DIAMOND (only read when ``fasta``
            is given).
        clust: forwarded as ``clustType=``.
        annot: annotation file, resolved relative to the starting directory.
        key: forwarded as ``UniqID=``.
        keyList: comma-separated keys; when empty and ``annot`` is given, the
            keys are read from the first line of ``annot`` (first field dropped).
        log: log target; anything other than ``sys.stderr`` is treated as a
            file name relative to the root directory. ``None`` falls back to
            ``sys.stderr``.
        directory: root working directory, created if missing.
        config: forwarded to ``completeAnalysis``.

    Returns:
        An empty tuple.

    Raises:
        SystemExit: if a required file is missing or ``aln`` is invalid.
    """
    import shlex  # local import: quotes the path interpolated into the shell command

    try:
        startWD = os.path.abspath(os.path.dirname(blastFile))
    except (TypeError, AttributeError):
        # blastFile is not a path (e.g. None): start from the current directory.
        startWD = os.path.abspath(os.getcwd())
    os.chdir(startWD)
    if directory:
        rootDir = os.path.abspath(directory)
        if not os.path.exists(rootDir):
            os.makedirs(rootDir)
    else:
        rootDir = os.getcwd()
    if log != sys.stderr:
        try:
            log = os.path.join(rootDir, log)
        except (TypeError, AttributeError):
            # log is not a file name (e.g. the default None): log to stderr
            # instead of crashing in os.path.join.
            log = sys.stderr
    ### Argument processing
    if not blastFile or not genome2sequence:
        sys.exit("Required files %s and %s" % ("blastFile", "genome2sequence"))
    blastFile = os.path.abspath(blastFile)
    genome2sequence = os.path.abspath(genome2sequence)
    ThresholdList = [int(value) for value in thr.strip().split(",")]
    cover = float(cov)
    print("Starting directory: %s" % startWD)
    print("Root directory: %s" % rootDir)
    if fasta:
        if aln == "b":
            runBlast(fasta, blastFile)
        elif aln == "d":
            runDiamond(fasta, blastFile)
        else:
            sys.exit(
                "Wrong sequence comparison option -- use (b) for BLAST - (d) for DIAMOND"
            )
    ## Filename definitions
    if in_network:
        geneNetwork = os.path.abspath(in_network)
    else:
        geneNetwork = blastFile + ".cleanNetwork"
    if annot:
        annot = os.path.abspath(os.path.join(startWD, annot))
        if keyList:
            keyList = keyList.split(",")
        else:
            # Default keys: header line of the annotation file minus its first field.
            with open(annot, 'r') as ANNOT:
                keyList = ANNOT.readline().strip().split(sep)[1:]
    else:
        annot = None
        keyList = None
    ## Program body
    os.chdir(rootDir)
    ## A) from the blast output to the sequence families
    # a) filter self-hits and keep only best hit
    if not in_network:
        # Outputs: blastFile".cleanNetwork", ".cleanNetwork.dico" and
        # ".cleanNetwork.genes". The path is quoted because the command goes
        # through a shell.
        cmd1 = "%s -n 1 -i %s" % (cleanblast, shlex.quote(blastFile))
        printLog(
            "--------------------------------------------------\nRunning %s" %
            cmd1, log)
        proc1 = Popen(args=[cmd1],
                      shell=True,
                      stdout=PIPE,
                      executable="/bin/bash")
        out = proc1.communicate()[0]
        printLog(out.decode('utf-8'), log)
    # b) perform complete analysis for each threshold
    for n in ThresholdList:
        STR = """--------------------------------------------------\nSimilarity threshold %d%%""" % n
        printLog(STR, log)
        completeAnalysis(geneNetwork,
                         genome2sequence,
                         n,
                         cover,
                         a=annot,
                         clustType=clust,
                         UniqID=key,
                         sep=sep,
                         keyList=keyList,
                         handle=log,
                         config=config)
        # Return to the root directory before the next threshold.
        os.chdir(rootDir)
    ## End
    # NOTE(review): `prog` is computed but never used here; kept in case
    # myModule() is relied on for a side effect -- confirm and drop.
    prog = myModule()
    if prog == "__main__.py":
        prog = sys.argv[0].split("/")[-1]
    return ()
Ejemplo n.º 6
0
def Main(edgeFile=None,
         outEdgeFile=None,
         outTrailFile=None,
         direct=None,
         community=None,
         comm_fasta=None,
         comm_id=None,
         in_trail=None,
         inType=None,
         outType=None,
         sep=None,
         weight=None,
         log=None,
         header=None):
    """Build the community network of ``edgeFile`` and write edge, trail and type files.

    Args:
        edgeFile: input edge file; must be non-empty.
        outEdgeFile: output edge file name (default ``edgeFile + ".out"``).
        outTrailFile: output trail file name (default ``edgeFile + ".trail"``).
        direct: output directory relative to the current directory, created if
            missing; defaults to the current directory.
        community: file assigning a community to each node, read with
            ``ut.node2community``.
        comm_fasta: alternative community source, read with
            ``ut.node2communityFasta`` (only when ``community`` is not given).
        comm_id: forwarded to ``ut.node2community`` as ``ID=``.
        in_trail: existing trail file; when given, ``ut.outputTrailFile`` is
            used instead of ``ut.outputFile``.
        inType: input type file; when given, a type file is written as well.
        outType: name of the output type file (default: output edge file name
            plus ``".type"``).
        sep: separator forwarded to the ``ut`` helpers.
        weight: when truthy, edges are written with weights.
        log: handle forwarded to ``myTimer``.
        header: forwarded to the trail-file writers.

    Returns:
        None.

    Raises:
        IOError: if ``edgeFile`` is empty and the caller is not the main
            script (the main script exits via ``sys.exit`` instead).
    """
    if not outEdgeFile:
        outEdgeFile = edgeFile + ".out"
    if not outTrailFile:
        outTrailFile = edgeFile + ".trail"
    ### Option processing
    if not os.stat(edgeFile).st_size:
        if myModule() == "__main__.py":
            sys.exit("Error: Empty file %s" % edgeFile)
        else:
            raise IOError("Empty file %s" % edgeFile)
    if direct:  # output directory processing
        directory = os.path.join(os.getcwd(), direct)
        if not os.path.exists(directory):
            os.makedirs(directory)
    else:
        directory = os.getcwd()
    ## Filename definitions
    # time.clock() was removed in Python 3.8; use it where it still exists and
    # fall back to time.perf_counter() otherwise.
    # NOTE(review): myTimer must measure with the same clock -- confirm.
    clock = getattr(time, "clock", None) or time.perf_counter
    i0 = clock()
    inext = i0
    outFile = os.path.join(directory, outEdgeFile)
    outDict = os.path.join(directory, outTrailFile)
    ## File reading: attribute a community identifier to nodes
    if community:
        # Explicitly redefined nodes; nodes missing from the file are handled
        # by the completion step below.
        newNodes = ut.node2community(community, sep=sep, ID=comm_id)
        inext = myTimer(inext, "Community reading", handle=log)
    elif comm_fasta:
        newNodes = ut.node2communityFasta(comm_fasta, sep=sep)
    else:
        # All nodes are "missing" and appear in the completion step below.
        newNodes = None
    ## Body: old_nodes --> new_nodes dictionary obtained as the completion of the clustering file
    newDictionary, edges_dict, edges_std = ut.network2communityNetwork(
        edgeFile, dictionary=newNodes, sep=sep, useWeights=True)
    inext = myTimer(inext, "Community construction", handle=log)
    if weight:
        ut.outputEdgesDict_Weights(edges_dict,
                                   edges_std=edges_std,
                                   outFile=outFile,
                                   sep=sep)
    else:
        ut.outputEdgesDict_NoWeights(edges_dict, outFile=outFile, sep=sep)
    inext = myTimer(inext, "New graph writing", handle=log)
    ## Output options
    if in_trail:
        ut.outputTrailFile(newDictionary,
                           in_trail,
                           outfile=outDict,
                           sep=sep,
                           header=header)
    else:
        ut.outputFile(newDictionary, outfile=outDict, sep=sep, header=header)
    inext = myTimer(inext, "TrailFile writing", handle=log)
    if inType:
        if outType:
            outType = os.path.join(directory, outType)
        else:
            # Default name derives from the output edge file; the previous code
            # referenced an undefined name here and raised NameError.
            outType = os.path.join(directory, outEdgeFile + ".type")
        ut.outputTypeFile(newDictionary, inType, outfile=outType, sep=sep)
        inext = myTimer(inext, "TypeFile writing", handle=log)
    ## Ending: report total time under the invoking script's name
    prog = myModule()
    if prog == "__main__.py":
        prog = sys.argv[0].split("/")[-1]
    inext = myTimer(i0, "Total computing time for %s" % prog, handle=log)
    return
Ejemplo n.º 7
0
def Main(edgeFile=None,annotFile=None,sep=None,outFile=None,Xout=None,restrAnnot=None,nodeList=None,NodeType=None,nodeID=None,unilat=None,track=None,empty=None,\
         x=None,X=None,K=None,hist=None,display=None,trail=None,comp=None,keyList=None,log=None):
    """Generate an XML configuration file, or use one to write a description of the graph.

    Without ``x`` the function only generates a configuration file (named by
    ``X``, default ``"config.xml"``) and returns ``()``. With ``x`` it reads
    that configuration and calls ``ut.xmlDescription``.

    Args:
        edgeFile: input edge file; must be non-empty.
        annotFile: annotation file.
        sep: separator forwarded to ``ut.readNodes``.
        outFile: description output (default: ``edgeFile`` up to its first
            ``"."`` plus ``".desc"``).
        Xout: XML output file forwarded as ``xmlOutFile=``.
        restrAnnot: when truthy, annotations are loaded with
            ``ut.restrictAnnot`` instead of ``ut.myLoadAnnotations``.
        nodeList: file listing the node identifiers to restrict to.
        NodeType: forwarded to ``ut.readNodeType``; the strings ``'1'`` and
            ``'2'`` also select how the node list is built.
        nodeID: annotation key used as main key; removed from the default
            selected keys.
        unilat: comma-separated node types to keep.
        track: forwarded as ``track=``; reset to ``None`` when ``empty`` is set.
        empty: see ``track``.
        x: configuration file to read.
        X: configuration file to generate; must equal ``x`` when both are given.
        K: when truthy, ``xmlform.main`` is run on the configuration file first.
        hist: main trail file used to build the trail history.
        display: forwarded to ``ut.generateXML``.
        trail: unused in this function.
        comp: component file wrapped with ``ut.myMod``.
        keyList: key selection processed by ``ut.processOptions``.
        log: log target; anything other than ``sys.stderr`` is treated as a
            file name relative to the current directory, falling back to
            ``sys.stderr`` if it cannot be joined.

    Returns:
        ``()`` when only the configuration file was generated, otherwise None.

    Raises:
        IOError: if ``edgeFile`` is empty and the caller is not the main script.
        SystemExit: on an empty file from the main script, on conflicting
            ``x``/``X``, or when the form returns ``"Cancel"``.
    """
    startWD = os.getcwd()
    if log != sys.stderr:
        try:
            log = os.path.join(startWD, log)
        except TypeError:
            log = sys.stderr
    ## Filename definitions
    # time.clock() was removed in Python 3.8; use it where it still exists and
    # fall back to time.perf_counter() otherwise.
    # NOTE(review): myTimer must measure with the same clock -- confirm.
    clock = getattr(time, "clock", None) or time.perf_counter
    i0 = clock()
    inext = i0
    if not outFile:
        outFile = edgeFile.split(".")[0] + ".desc"
    if empty:
        track = None
    ## Step 1) Store the node types (adapted for the k-partite case)
    if not os.stat(edgeFile).st_size:
        if myModule() == "__main__.py":
            sys.exit("Error: Empty file %s" % edgeFile)
        else:
            raise IOError("Empty file %s" % edgeFile)
    nodeType = ut.readNodeType(edgeFile, Type=NodeType)
    try:
        nodeTypes = list(set(nodeType.values()))
    except AttributeError:
        # readNodeType returned something without .values(); a single type is assumed.
        nodeTypes = [1]
    inext = myTimer(inext, "Reading nodeType", handle=log)
    ## Step 2) Read XML configuration file, or generate it and return
    # a) set variables
    if keyList:
        keyDict = ut.processOptions(keyList, nodeTypes)
    else:
        selectedKeys = list(ut.getHeader(annotFile).keys())
        selectedKeys.remove(nodeID)
        # Every node type shares the same list object.
        keyDict = {n: selectedKeys for n in nodeTypes}
    trailObjects = []
    compObject = None
    root = os.getcwd()
    if comp:
        compObject = ut.myMod(fileName=comp, attDict={0: "Module"})
    if hist:  # record the complete trail history in the XML file
        history = ut.trailTrack(hist)
        root = ut.trailHist(hist)['root']
        for rank, trailName in enumerate(history, 1):
            trailKeyDict = {i: "NodeType" + str(i) for i in nodeTypes}
            trailObjects.append(
                ut.myTrail(fileName=trailName, rank=rank, attDict=trailKeyDict))
    if not x:
        # b) no configuration given: generate one and stop
        outConf = X if X else "config.xml"
        ut.generateXML(nodeTypes,
                       trailObjects=trailObjects,
                       compObject=compObject,
                       attDict=keyDict,
                       outFile=outConf,
                       display=display,
                       root=root)
        if myModule() == "__main__.py":
            printLog(
                "Configured file %s: please check options, and pass it with -x option"
                % outConf, log)
        return ()
    # c) configuration given: optionally regenerate it, then read it
    configFile = x
    if X:
        if x != X:
            sys.exit("Conflicting fields -x and -X. Check and run again.")
        configFile = ut.generateXML(nodeTypes,
                                    trailObjects=trailObjects,
                                    compObject=compObject,
                                    attDict=keyDict,
                                    outFile=X,
                                    display=display,
                                    root=root)
    if K:
        ret = xmlform.main(xmlFile=configFile)
        if ret == "Cancel":
            sys.exit(0)
    trailObjects, compObject, keyDict, selectedKeys, XML = ut.readConfigFile(
        configFile)
    ut.printDescription(trailObjects,
                        compObject,
                        keyDict,
                        selectedKeys,
                        handle=sys.stderr)
    ## Step 3) Define node lists of currentID and UniqID
    if NodeType == '2':
        nodes = nodeType.keys()
    elif NodeType == '1':
        nodes = ut.readNodes(edgeFile, sep=sep)
        nodeType = ut.initDict(nodes, value=1)
    else:
        nodes = ut.readNodes(edgeFile, sep=sep)
    if nodeList:
        # Restrict to the identifiers listed in the given file. The previous
        # code read an undefined `options.n` here and raised NameError.
        nodes = ut.file2set(nodeList)
        inext = myTimer(inext, "Reading nodeFile", handle=log)
    if unilat:
        nTypes = set(unilat.strip().split(","))
        # Materialised as a list: len() is taken below and the nodes may be
        # iterated more than once.
        # NOTE(review): nTypes holds strings; confirm nodeType values are
        # strings too (initDict above is given the integer 1).
        nodes = [node for node in nodes if nodeType[node] in nTypes]
    printLog("""Loaded %d nodes""" % len(nodes), log)
    # Selected UniqIDs
    if trailObjects:
        # Load the dictionaries of the main (last) trail file.
        trailObjects[-1].getDict()
        current2UniqID = trailObjects[-1].dict_inv
        myEntries = ut.unList(map(lambda node: current2UniqID[node], nodes))
    else:
        myEntries = nodes
        current2UniqID = None
    printLog("""Found %d entries""" % len(myEntries), log)
    inext = myTimer(inext, "Reading allEntries", handle=log)
    # Annotation file processing
    if restrAnnot:
        annotationDict, fields = ut.restrictAnnot(annotFile,
                                                  mainKey=str(nodeID),
                                                  valueKeyList=selectedKeys)
    else:
        annotationDict, fields = ut.myLoadAnnotations(
            annotFile,
            mainKey=str(nodeID),
            valueKeyList=selectedKeys,
            counter=0)
    inext = myTimer(inext, "Reading annotations", handle=log)
    ## Step 4) Construct the actual description
    OutFile = Xout
    ut.xmlDescription(annot=annotationDict,
                      nodeDict=current2UniqID,
                      entries=myEntries,
                      compObject=compObject,
                      trails=trailObjects,
                      nodeType=nodeType,
                      keyDict=keyDict,
                      xmlOutFile=OutFile,
                      outFile=outFile,
                      track=track,
                      X=XML,
                      handle=log)
    if Xout:
        printLog("XML output written to %s" % OutFile, log)
    else:
        printLog("Description written to %s" % outFile, log)
    ## Output and exit: report total time under the invoking script's name
    prog = myModule()
    if prog == "__main__.py":
        prog = sys.argv[0].split("/")[-1]
    inext = myTimer(i0, "Total computing time for %s" % prog, handle=log)
    return
Ejemplo n.º 8
0
def Main(edgeFile=None,outFile=None,sep=None,unilat=None,twin_supp=None,Twin_Supp=None,min_supp=None,min_size=None,nodeType=None,comp=None,debug=None,log=None):
    """Detect twin nodes (as grouped by ``ut.detectRepeated``) and write them out.

    Args:
        edgeFile: input edge file; must be non-empty.
        outFile: node -> twin group output (default ``edgeFile + ".twins"``).
        sep: separator used in the ``comp`` file and by ``ut.outputDict``.
        unilat: comma-separated integer node types; restricts the adjacency
            computation to nodes of these types.
        twin_supp: optional output file with lines
            ``twinID, number of twins, support size, support nodes...``.
        Twin_Supp: optional output file with lines
            ``twinID, comma-joined twin nodes, comma-joined support nodes``.
        min_supp: minimum support size for a group to be written to the two
            support files.
        min_size: minimum number of twins for a group to be written there.
        nodeType: converted to ``int`` when possible; ``2`` or a falsy value
            uses ``ut.rawNodeType(edgeFile)``, ``1`` uses no type map, anything
            else is passed to ``ut.loadNodeType``. Only read when ``unilat`` is set.
        comp: optional output file listing twins then support nodes with their
            group identifier.
        debug: forwarded to ``ut.detectRepeated``.
        log: handle forwarded to ``myTimer`` and ``printLog``.

    Returns:
        None.

    Raises:
        IOError: if ``edgeFile`` is empty and the caller is not the main
            script (the main script exits via ``sys.exit`` instead).
    """
    # time.clock() was removed in Python 3.8; use it where it still exists and
    # fall back to time.perf_counter() otherwise.
    # NOTE(review): myTimer must measure with the same clock -- confirm.
    clock = getattr(time, "clock", None) or time.perf_counter
    i0 = clock()
    inext = i0
    ## Option processing
    if not outFile:
        outFile = edgeFile+".twins"
    thr = min_supp
    try:
        k_part = int(nodeType)
    except (TypeError,ValueError):
        k_part = nodeType
    ## File reading
    if not os.stat(edgeFile).st_size:
        if myModule() == "__main__.py":
            sys.exit("Error: Empty file %s" % edgeFile)
        else:
            raise IOError("Empty file %s" % edgeFile)
    g = ut.myReadGraph(edgeFile)
    print(g.summary())
    id2name = {vertex.index: vertex['name'] for vertex in g.vs()}
    inext = myTimer(i0,"Loading graph",handle=log)
    ## Program body
    # Adjacency list computation
    nodes = None
    if unilat:
        typeSet = set(map(int, unilat.strip().split(",")))
        typeDict = defaultdict(int)   # nodes without a known type get type 0
        if k_part == 2 or not k_part:
            typeDict.update(ut.rawNodeType(edgeFile))
        elif k_part != 1:
            typeDict.update(ut.loadNodeType(k_part))
        nodes = (n.index for n in g.vs() if typeDict[n['name']] in typeSet)
    ADJ = ut.getAdjlist(g,nodes=nodes)
    inext = myTimer(inext,"Computation of adjacency list",handle=log)
    # Twin computation
    # support: groupID -> common list of neighbours; twins: node -> groupID of its twin class
    support,twins = ut.detectRepeated(ADJ,k_init=0,debug=debug)
    inext = myTimer(inext,"Computation of twins",handle=log)
    # Translate vertex indices back to vertex names.
    support = {gid: tuple(id2name[idx] for idx in support[gid]) for gid in support}
    twins = {id2name[node]: twins[node] for node in twins}
    inext = myTimer(inext,"Renumbering of twins",handle=log)
    sniwt = ut.InvertMap(twins)   # groupID -> list of twin nodes
    inext = myTimer(inext,"Computation of twin support",handle=log)
    # Components (twins + support)
    if comp:
        with open(comp,'w') as h:
            for key,val in twins.items():
                h.write(str(key)+sep+str(val)+"\n")
            inext = myTimer(inext,"Writing twins file",handle=log)
            for val,members in support.items():
                for node in members:
                    h.write(str(node)+sep+str(val)+"\n")
            inext = myTimer(inext,"Writing twins component file",handle=log)
    # twinSupport file (twinID twinNb supportSize supportNodes...)
    if twin_supp:
        with open(twin_supp,'w') as out:
            for i,nodeList in sniwt.items():
                supp = support[i]
                # Threshold for trivial twins
                if len(supp) >= thr and len(nodeList) >= min_size:
                    vals = [str(i),str(len(nodeList)),str(len(supp))]
                    vals.extend(str(s) for s in supp)
                    out.write("\t".join(vals)+"\n")
        inext = myTimer(inext,"Writing twins support file",handle=log)
    # TwinSupport file (twinID twinNodes twinSupport)
    if Twin_Supp:
        with open(Twin_Supp,'w') as out:
            for i,nodeList in sniwt.items():
                supp = support[i]
                # Threshold for trivial twins
                if len(supp) >= thr and len(nodeList) >= min_size:
                    myTwins = ",".join(str(node) for node in nodeList)
                    mySupport = ",".join(str(s) for s in supp)
                    # Previously myTwins/mySupport were built but the raw
                    # support nodes were written instead, so the twin nodes
                    # never reached the file.
                    out.write("\t".join([str(i), myTwins, mySupport])+"\n")
        inext = myTimer(inext,"Writing Twins Support file",handle=log)
    ut.outputDict(twins,outFile,sep=sep)
    # Summary: a twin class with a single member is "trivial".
    allTwins = len(sniwt)
    t = sum(1 for _, members in sniwt.items() if len(members) == 1)
    nt = allTwins - t
    if allTwins:
        tp = 100*float(t)/float(allTwins)
        ntp = 100*float(nt)/float(allTwins)
    else:
        # No twin class at all: avoid dividing by zero.
        tp = 0
        ntp = 0
    printLog("""Found %s twins, %s trivial twins (%.2f%%) and %s non-trivial twins (%.2f%%)""" % (allTwins,t,tp,nt,ntp),log)
    ## Ending: report total time under the invoking script's name
    prog = myModule()
    if prog == "__main__.py":
        prog = sys.argv[0].split("/")[-1]
    inext = myTimer(i0,"Total computing time for %s" % prog,handle=log)
    return