def main():
    """
    1: ...
    """
    
    desc = """... ask me later! I'm on a deadline! ..."""
    
    parser = argparse.ArgumentParser(description=desc)
    
    logger = logging.getLogger(sys.argv[0].split('/')[-1])
    
    parser.add_argument('--hfile', type=str, required=True, nargs='+',
                        help="""Quoted ;-delimited list describing each file of homology relationships:
[--hfile "path;header1;header2"].  At LEAST one hfile is required and each MUST contain exactly three ;-separated fields.""")
    
    parser.add_argument('--xprnfile', type=str, required=True, nargs='+',
                        help="""Quoted ;-delimited list describing each file of expression data:
[--xprnfile "path;nameHeader;conditionHeader1;...;conditionHeaderN"].  At LEAST one xprnfile is required and each MUST have 
exactly one <path>, exactly one <nameHeader>, and at LEAST one <conditionHeader>. It is VERY important that 
you list the same number of conditions for each xprnfile and that the order reflects which condition values are to be compared.""")
    
    parser.add_argument('--cmap', type=str, required=True, nargs='+',
                        help="""A list of species-prefix:color combinations to set the node colors:
[--cmap <AAEL:b ...>]. The number of combinations should match the number of files given to --xprnfile.""")
    
    parser.add_argument('--log', action='store_true',
                        help="""Plot the points on a log:log scale. (Default: %(default)s)""")
    parser.add_argument('--show', action='store_true',
                        help="""Plot the image for interactive manipulation, otherwise just write the file. (Default: %(default)s)""")
    parser.add_argument('--pdf', action='store_true',
                        help="""Plot the image as a pdf: png otherwise. Png is preferable when data size is large. (Default: %(default)s)""")
    parser.add_argument('--out', type=str, default='',
                        help="""Base path for output. (Default: current working directory)""")
    parser.add_argument('--load-pickle', type=str, default='',
                        help="""Path to a gpickle of precomputed subgraphs to load instead of rebuilding the graph. (Default: %(default)s)""")
    
    args = parser.parse_args()
    
    # some manual arg set-up and checking
    for i in range(len(args.hfile)):
        args.hfile[i] = args.hfile[i].split(';')
        if len(args.hfile[i]) != 3:
            raise SanityCheckError('EXACTLY 3 values must follow --hfile: you gave %s' % (args.hfile[i]))
        
    xLen = set()
    for i in range(len(args.xprnfile)):
        args.xprnfile[i] = args.xprnfile[i].split(';')
        if len(args.xprnfile[i]) < 3:
            raise SanityCheckError('At LEAST 3 values must follow --xprnfile: you gave %s' % (args.xprnfile[i]))
        else:
            xLen.add(len(args.xprnfile[i]))
    if len(xLen) != 1:
        raise SanityCheckError('The same number of values must follow every --xprnfile flag.')
    
    if len(args.xprnfile) != len(args.cmap):
        raise SanityCheckError('The number of values following --xprnfile and --cmap must be the same.')
    
    cDict = {}
    for combo in args.cmap:
        try:
            prefix,color = combo.split(':')
        except ValueError:
            raise SanityCheckError('Each --cmap entry must look like <prefix:color>: you gave %s' % (combo))
        cDict[prefix] = color
    
    
    
    # read in the expression vector data
    tmpDict = {}
    xDict = {}
    for xfile in args.xprnfile:
        tmpDict.update(mangle_expn_vectors(expnPath=xfile[0],txNameHeader=xfile[1],condHeaders=xfile[2:],manualHeaders=False))
        
    # convert -RX into -PX
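    # (expression tables index transcripts with -R* suffixes while the homology graph
    #  uses protein IDs with -P* suffixes; a plain suffix swap is assumed to map one to the other)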
    for k,v in tmpDict.iteritems():
        xDict[k.replace('-R','-P')] = v
        
    del(tmpDict)
    
    if args.load_pickle:
        subgraphs = nx.read_gpickle(args.load_pickle)
    else:
        # lets get started: init the graph
        graph = nx.Graph()
        
        for f in args.hfile:
            import_edges(graphObj=graph,edgeTablePath=f[0],startNodeHeader=f[1],endNodeHeader=f[2])
        
            
        # remove the '' node caused by unpaired relationships
        if graph.has_node(''):
            graph.remove_node('')
        
    
        # weight the edges in each graph by the pearsonr between their expression vectors
        weight_edges_with_pearsonr(graphObj=graph,dataVectors=xDict,uni=False)
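        # (weight_edges_with_pearsonr is assumed to annotate each edge with 'rVal', 'pVal',
        #  and 'weight' attributes; the disabled clean-up block below and the plotting
        #  helpers rely on them)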
        
            
        # if the edge length is impossible to graph (inf or nan) kill the edge
        #badEdges = []
        #edgesMissingNodes = []
        #for i,j in graph.edges_iter():
            #try:
                #if math.isnan(graph[i][j]['rVal']) or math.isinf(graph[i][j]['rVal']):
                    #badEdges.append((i,j))
            #except KeyError:
                #edgesMissingNodes.append((i,j))
                    
        #graph.remove_edges_from(badEdges)
        #graph.remove_edges_from(edgesMissingNodes)
        
        # Get all subgraphs
        subgraphs = nx.connected_component_subgraphs(graph)  
        nx.write_gpickle(subgraphs,"/tmp/ortho_weighted_subgraphs.gpickle")
        print "I layed a pickle!!"
    
    args.galaxy = False
    #args.label2 = "Pct w/ significant positive corr (r >= 0.5, p <= 0.05)"
    args.label2 = "Pct with significant correlation" 
    for prefix in cDict:
        args.label1 = "Usable paralogs per subgraph within %s" % (prefix)
        #args.label1 = "%s x" % (prefix) 
        pearsonStats,data = get_within_data(prefix,subgraphs)
        plotScatter(pearsonStats,data,args,color=cDict[prefix])
        
    args.label1 = "Usable orthologs per subgraph between AGAP and CPIJ" 
    #args.label1 = "both x"  
    pearsonStats,data = get_between_data(prefixes=cDict.keys(),subgraphs=subgraphs)
    plotScatter(pearsonStats,data,args,color='green')
    
    print "Done."
def main():
    """
    1: ...
    """
    
    desc = """... ask me later! I'm on a deadline! ..."""
    
    parser = argparse.ArgumentParser(description=desc)
    
    logger = logging.getLogger(sys.argv[0].split('/')[-1])
    
    parser.add_argument('--hfile', type=str, required=True, nargs='+',
                        help="""Quoted ;-delimited list describing each file of homology relationships:
[--hfile "path;header1;header2"].  At LEAST one hfile is required and each MUST contain exactly three ;-separated fields.""")
    
    parser.add_argument('--xprnfile', type=str, required=True, nargs='+',
                        help="""Quoted ;-delimited list describing each file of expression data:
[--xprnfile "path;nameHeader;conditionHeader1;...;conditionHeaderN"].  At LEAST one xprnfile is required and each MUST have 
exactly one <path>, exactly one <nameHeader>, and at LEAST one <conditionHeader>. It is VERY important that 
you list the same number of conditions for each xprnfile and that the order reflects which condition values are to be compared.""")
    
    parser.add_argument('--targets', type=str, required=True, nargs='+',
                        help="""A list of the gene/tx/protein symbols to use for pulling out all connected nodes.""")
    
    #parser.add_argument('--pfixlen', type=str, required=True, nargs='+',
                        #help="""One length of the symbol prefixes (AAEL for AAEL007639-PA) or a list of prefix lengths.""")
    
    parser.add_argument('--cmap', type=str, required=True, nargs='+',
                        help="""A list of species-prefix:color combinations to set the node colors:
[--cmap <AAEL:b ...>]. The number of combinations should match the number of files given to --xprnfile.""")
    
    parser.add_argument('--out', type=str, required=True,
                        help="""Path to outfile.  Its file extension determines the file type.""")
    
    parser.add_argument('--graphml', type=str, required=False,
                        help="""Include a file path if you would like a graphML version of the final graph. (optional)""")
    
    parser.add_argument('--nonames', action='store_true',
                        help="""If used: gene/tx names will NOT be displayed.""")
    
    parser.add_argument('--noshow', action='store_true',
                        help="""If used: the graph will NOT be displayed interactively.""")
    
    args = parser.parse_args()
    
    # some manual arg set-up and checking
    for i in range(len(args.hfile)):
        args.hfile[i] = args.hfile[i].split(';')
        if len(args.hfile[i]) != 3:
            raise SanityCheckError('EXACTLY 3 values must follow --hfile: you gave %s' % (args.hfile[i]))
        
    xLen = set()
    for i in range(len(args.xprnfile)):
        args.xprnfile[i] = args.xprnfile[i].split(';')
        if len(args.xprnfile[i]) < 3:
            raise SanityCheckError('At LEAST 3 values must follow --xprnfile: you gave %s' % (args.xprnfile[i]))
        else:
            xLen.add(len(args.xprnfile[i]))
    if len(xLen) != 1:
        raise SanityCheckError('The same number of values must follow every --xprnfile flag.')
    
    if len(args.xprnfile) != len(args.cmap):
        raise SanityCheckError('The number of values following --xprnfile and --cmap must be the same.')
    
    cDict = {}
    for combo in args.cmap:
        try:
            prefix,color = combo.split(':')
        except ValueError:
            raise SanityCheckError('Each --cmap entry must look like <prefix:color>: you gave %s' % (combo))
        cDict[prefix] = color
    
    
    
    # read in the expression vector data
    tmpDict = {}
    xDict = {}
    for xfile in args.xprnfile:
        tmpDict.update(mangle_expn_vectors(expnPath=xfile[0],txNameHeader=xfile[1],condHeaders=xfile[2:],manualHeaders=False))
        
    # convert -RX into -PX
    for k,v in tmpDict.iteritems():
        xDict[k.replace('-R','-P')] = v
        
    del(tmpDict)
    
    # lets get started: init the graph
    graph = nx.Graph()
    
    for f in args.hfile:
        import_edges(graphObj=graph,edgeTablePath=f[0],startNodeHeader=f[1],endNodeHeader=f[2])
    
        
    # remove the '' node caused by unpaired relationships
    if graph.has_node(''):
        graph.remove_node('')
    
    # for debugging
    nx.write_gpickle(graph,"/tmp/ortho1.gpickle")    
        
    # Cut out a subgraph using provided targets
    subgraph = graph_connected_nodes(graphObj=graph,nodeList=args.targets)
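    # (graph_connected_nodes is assumed to return the subgraph induced by every node
    #  connected to at least one of the requested target symbols)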
    
    
    # weight the edges in subgraph by the pearsonr between their expression vectors
    weight_edges_with_pearsonr(graphObj=subgraph,dataVectors=xDict,uni=False)
    
    # if the edge length is impossible to graph (inf or nan) kill the edge
    badEdges = []
    edgesMissingNodes = []
    for i,j in subgraph.edges_iter():
        try:
            if math.isnan(subgraph[i][j]['rVal']) or math.isinf(subgraph[i][j]['rVal']):
                badEdges.append((i,j))
        except KeyError:
            edgesMissingNodes.append((i,j))
                
    subgraph.remove_edges_from(badEdges)
    subgraph.remove_edges_from(edgesMissingNodes)
    
    
    # begin drawing the graph by setting the node positions
    #a = nx.to_agraph(subgraph)
    #a.layout()
    #a.draw('%s.gv.png' % args.out)
    #h = nx.from_agraph(a)
    #pos = nx.graphviz_layout(h)
    #pos= nx.spring_layout(subgraph,iterations=100)
    pos = nx.graphviz_layout(subgraph, args='-LC1000000000')
    
    # set node colors
    nodelist = subgraph.nodes()
    node_colors = []
    prefixes = cDict.keys()
    
    # get edge labels:
    eLab = {}
    for i,j in subgraph.edges_iter():
        eLab[(i,j)] = 'r = %s\np = %s' % (round(subgraph[i][j]['rVal'],3),round(subgraph[i][j]['pVal'],3))
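    # (eLab is only consumed by the draw_networkx_edge_labels call that is currently
    #  commented out near the bottom of this function)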
    
    for n in nodelist:
        # build node_colors in the same order as nodelist by matching each node's species prefix
        node_colors.extend([cDict[x] for x in prefixes if n.startswith(x)])
    nx.draw_networkx_nodes(subgraph, pos, nodelist, node_color=node_colors, node_size=1000, node_shape='o', alpha=.7)
    sigEdges = []
    nonSigEdges = []
    for e in subgraph.edges_iter():
        if float(subgraph[e[0]][e[1]]['pVal']) <= 0.05:
            sigEdges.append(e)
        else:
            nonSigEdges.append(e)
            
    # Define color map for edge 'heats'
    g2r = {'green': ((0.0, 0.0, 0.0),
                     (0.66, 0.0, 0.0),
                     (1.0, 1.0, 1.0)),
 
          'blue': ((0.0, 0.0, 0.0),
                   (1.0, 0.0, 0.0)),
 
          'red':  ((0.0, 1.0, 1.0),
                   (0.33, 0.0, 0.0),
                   (1.0, 0.0, 0.0))} 
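    # NOTE: g2r is defined but currently unused; the plot below registers and uses b2g2y2o2r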
    
    
    b2g2y2o2r = {'red':   ((0.0,  0.0, 0.0),
                          #(0.9,  1.0, 1.0),
                          (1.0,  1.0, 1.0)),
       
                'green': ((0.0,  0.0, 0.0),
                          (0.4, 1.0, 1.0),
                          (0.6, 1.0, 1.0),
                          (1.0, 0.0, 0.0)),
       
                'blue':  ((0.0,  1.0,1.0),
                          #(0.1,  0.0, 0.0),
                          (1.0,  0.0, 0.0))}
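    # (matplotlib segmented-colormap format: for each channel, (x, y0, y1) anchor points
    #  over [0,1]; with edge_vmin=-1/edge_vmax=1 below, r = -1 draws blue, r ~ 0 light
    #  green, and r = +1 red)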
    
    plt.register_cmap(name='corrMap', data=b2g2y2o2r)
    corrMap = plt.get_cmap('corrMap')
    
            
    
    nx.draw_networkx_edges(subgraph, pos, edgelist=nonSigEdges, width=2.0, edge_cmap=corrMap,
                           edge_vmin=-1,
                           edge_vmax=1,                           
                           edge_color=[subgraph[e[0]][e[1]]['weight'] for e in nonSigEdges],
                           style='dashed', alpha=.7)
    sigEdgeCollection = nx.draw_networkx_edges(subgraph, pos, edgelist=sigEdges, width=2.0, edge_cmap=corrMap,
                               edge_vmin=-1,
                               edge_vmax=1,                           
                               edge_color=[subgraph[e[0]][e[1]]['weight'] for e in sigEdges],
                               style='solid', alpha=1)
    nx.draw_networkx_edges(subgraph, pos, edgelist=badEdges, width=1.0,                         
                               edge_color='grey',
                               style='solid', alpha=.3)    
    
    # add color bar as key to heats (pass the edge LineCollection so pyplot has a mappable)
    plt.colorbar(sigEdgeCollection)
    
    #nx.draw_networkx_edge_labels(subgraph,pos,edge_labels=eLab)
    if not args.nonames:
        nx.draw_networkx_labels(subgraph, pos, font_weight='bold', font_size=8)
    plt.axis('off')
    
    # write out the file(s)
    try:
        plt.savefig(args.out)
    except ValueError:
        plt.savefig('%s.png' % (args.out))
        
    if args.graphml:
        raise NotImplementedError('graphML output is not implemented yet')
        #nx.write_graphml(subgraph,args.graphml)
        
    if not args.noshow:
        plt.show()
def main():
    """
    1: Collect Tx from one or more species that are within at least some r value of similarity to
       a provided example Tx or a submitted hypothetical expression vector.
    2: Use GTFs, BEDtools, and genome FASTAs to extract the upstream flanking sequences into a new FASTA
       for use in motif discovery.
    """
    
    desc = """(1) Collect Tx from one or more species that are within 
at least some r value of similarity to a provided example Tx or a 
submitted hypothetical expression vector. (2) Use GTFs, BEDtools, and 
genome FASTAs to extract the upstream flanking sequences into a new 
FASTA for use in motif discovery."""
    
    parser = argparse.ArgumentParser(description=desc)
    FileType = argparse.FileType
    
    logger = logging.getLogger(sys.argv[0].split('/')[-1])
    
    parser.add_argument('--expn-path', type=str, required=True,
                        help="""Path to expression table file.""")
    parser.add_argument('--tx-name', type=str, required=True,
                        help="""Name of the Tx you want to use as a model.""")
    parser.add_argument('--pearson-filter-type', type=str, default='>=', choices=['>=','<='],
                        help="""Use >= to find similar expn profiles or <= to find opposite profiles. (default: %(default)s)""")
    parser.add_argument('--pearson-filter-thresh', type=float, default=0.7,
                        help="""Set the threshold of the Pearson r value for the filter. (default: %(default)s)""")
    parser.add_argument('--pval-filter-thresh', type=float, default=0.05,
                        help="""Set the upper threshold for the p-value of the Pearson r values to keep. (default: %(default)s)""")
    parser.add_argument('--tx-name-header', type=str, required=True,
                        help="""The text of the header in the expn table where tx names are stored.""")
    parser.add_argument('--cond-headers', type=str, required=True, nargs='+',
                        help="""A list of the text of the headers in the expn table where the values for each condition are stored (--cond-headers cond1 cond2 ...).""")
    parser.add_argument('--manual-headers', type=str, required=False, nargs='+',
                        help="""If the expn table does not have headers, provide a list of ordered names for them here. (default: %(default)s)""")
    parser.add_argument('--gtf', type=str, required=True,
                        help="""The path to the gtf file that you want to use for your annotation.""")
    parser.add_argument('--gtf-index', type=str, required=True,
                        help="""The path to the gtf index file generated by "gtf_to_genes".""")
    parser.add_argument('--genome-fastas', type=str, required=True, nargs='+',
                        help="""A list of paths to genomic fasta files or directories where they are stored.""")
    parser.add_argument('--flank-len', type=int, default=2000,
                        help="""The length in bp that should be harvested from the 5' end of the tx. (default: %(default)s)""")
    parser.add_argument('--out-dir', type=str, default='.',
                        help="""A path to a directory where you would like the output files to be stored. (default: %(default)s)""")
    parser.add_argument('--dump-megafasta', action='store_true',
                        help="""Save the concatenated fasta file for debugging. (default: %(default)s)""")
    parser.add_argument('--dump-stats', action='store_true',
                        help="""Print a list of Tx/gene names and the r and p values that passed the filter, then exit without getting fastas. (default: %(default)s)""")
    
    args = parser.parse_args()
    
    # tmp files will be stored here
    tmp_files = Bag()
    
    # 1: Use a correlation filter to pull out any Tx that is sufficiently similar to the model Tx
    vectDict = mangle_expn_vectors(expnPath=args.expn_path,txNameHeader=args.tx_name_header,condHeaders=args.cond_headers,manualHeaders=args.manual_headers)
    
    import operator  # avoid eval: argparse already restricts the operator choices
    cmpOp = {'>=': operator.ge, '<=': operator.le}[args.pearson_filter_type]
    filterFunc = lambda x: cmpOp(x, args.pearson_filter_thresh)
    filterDict = pearsonExpnFilter(modelVector=vectDict[args.tx_name], targetVectors=vectDict, filterFunc=filterFunc)
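    # (the keys of filterDict appear to be (rVal, pVal, txName) tuples: key[1] is tested
    #  against the p-value threshold below, and x[0]/x[2] are used as r and tx name later)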
    
    # remove vectors whose r's pVal is above the significance threshold (--pval-filter-thresh)
    sigVectors = {}
    for key in filterDict:
        if key[1] <= args.pval_filter_thresh:
            sigVectors[key] = filterDict[key]
    matchVectors = sigVectors
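    # matchVectors is a plain alias for sigVectors while the bootstrap distance filter
    # below remains disabled; the commented block would reassign it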
    
    ## Impose a distance filter to further refine the gene set,
    ## incorporating magnitudes of the absolute levels of gene expression
    
    ## set the boundaries of acceptable deviation for the target gene mean expression
    ## magnitude by bootstrapping.  The metric for comparison will be the average of
    ## the differences of each point in remaining vectors against the target
    ## vector.
    
    ## 1) calc the metrics for each remaining gene's vector
    ##    PS: numpy rocks.
    ##avgDists = {}
    ##for key in sigVectors:
        ##avgDist_i = np.mean(np.subtract(vectDict[args.tx_name],
                                           ##sigVectors[key]))
        ##avgDists[key] = avgDist_i
        
    ### 2) bootstrap the distances to estimate a stdErr
    ##medianEst,stdErrEst,lo95,hi95 = basic_bootstrap_est(avgDists.values())
    
    ### 3) recover keys that fall within +/- 1 SE
    ##matchVectors = {}
    ##for key in avgDists:
        ##avgDist = avgDists[key]
        ##if (avgDist >= -stdErrEst) and (avgDist <= stdErrEst):
            ##matchVectors[key] = sigVectors[key]
    
        
    
    # Sort txList so that the highest r values are at the top
    # and save vectors and this info out to file
    txList = sorted(matchVectors.keys(),key=lambda x: x[0], reverse=True)
    sortedTxListFile = NamedTemporaryFile(mode='w+t',prefix='txExpnVectFilteredBy_r.',suffix=".tsv",delete=False)
    for row in txList:
        if args.dump_stats:
            sys.stdout.write('%s\t%s\n' % ('\t'.join(map(str,row)),'\t'.join(map(str,matchVectors[row]))))
        else:
            sortedTxListFile.write('%s\t%s\n' % ('\t'.join(map(str,row)),'\t'.join(map(str,matchVectors[row]))))
    if args.dump_stats:
        sortedTxListFile.close()
        exit(0)
        
    tmp_files['sortedTxListFile'] = sortedTxListFile
    sortedTxListFile.close()
    

    
    g2gObj = gtf_to_genes.get_indexed_genes_matching_gtf_file_name(index_file_name=args.gtf_index, logger=logger, regex_str=args.gtf)[-1]
    txDict = filter_GTF_4_Tx(txList=[x[2] for x in txList],g2gObj=g2gObj)
    tmp_files['txBedFile'] = convert_2_bed(txDict=txDict)
    
    # 2: Use GTFs, BEDtools, and genome FASTAs to extract the upstream flanking sequences into a new FASTA
    fastaRecLengths,fastaSeqs = fastaRec_length_indexer(fastaFiles=args.genome_fastas)
    tmpFastaRecLengthFile = NamedTemporaryFile(mode='w+b',prefix='tmpFastaRecLengthFile.',suffix=".txt")
    for seqRec in fastaRecLengths:
        tmpFastaRecLengthFile.write("%s\t%s\n" % (seqRec,fastaRecLengths[seqRec]))
    tmpFastaRecLengthFile.flush()
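    # flush so the length index is on disk before get_fastas/BEDtools reads it by file name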

    # concatenate all fasta records into one temporary 'mega' fasta file
    megaFastaFile = NamedTemporaryFile(mode='w+b',prefix='tmpMegaFastaFile.',suffix=".fas")
    for fasta in fastaSeqs:
        megaFastaFile.write('>%s\n%s\n' % (fasta,fastaSeqs[fasta]))
    megaFastaFile.flush()
        
    tmp_files['flankBed'] = get_fastas(txBed=tmp_files.txBedFile.name,genomeFasta=megaFastaFile.name,lenIndex=tmpFastaRecLengthFile.name,lenFlanks=args.flank_len)
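    # (flankBed is assumed to be a pybedtools BedTool: .saveas() and .seqfn are used
    #  during clean-up below, which matches the pybedtools API)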
    
    
    # CLEAN UP:
    # close all tmp_files (disabling auto-delete where possible), then move them to args.out_dir
    mkdirp(args.out_dir)
    for f in tmp_files:
        try:
            tmp_files[f].delete = False
        except AttributeError:
            pass
        try:
            tmp_files[f].close()
        except AttributeError:
            pass
    # ['sortedTxListFile', 'flankBed', 'txBedFile', 'flankFasta']
    sortedTxListFile = "%s/sortedTxList.tsv" % (args.out_dir)
    flankBed         = "%s/flankBed.bed" % (args.out_dir)
    txBedFile        = "%s/txBed.bed" % (args.out_dir)
    flankFasta       = "%s/flankFasta.fas" % (args.out_dir)
    
    
    shutil.move(tmp_files.sortedTxListFile.name, sortedTxListFile)
    os.chmod(sortedTxListFile,0775)
    
    tmp_files.flankBed.saveas(flankBed)
    os.chmod(flankBed,0775)
    
    shutil.move(tmp_files.txBedFile.name, txBedFile)
    os.chmod(txBedFile,0775)
    
    shutil.move(tmp_files.flankBed.seqfn, flankFasta)
    os.chmod(flankFasta,0775)
    
    if args.dump_megafasta:
        megaFasta = "%s/megaFasta.fas" % (args.out_dir)
        megaFastaFile.delete = False
        megaFastaFile.close()
        shutil.move(megaFastaFile.name, megaFasta)
        os.chmod(megaFasta,0775)