コード例 #1
def main():
    # Command-line driver (Python 2 syntax: print statements, dict.iteritems).
    # Parses and sanity-checks the --hfile/--xprnfile/--cmap arguments, obtains
    # `subgraphs`, then calls get_within_data() once per species prefix and
    # get_between_data() once across all prefixes.
    # NOTE(review): this listing appears to have lines missing (loop headers
    # with no body, an over-indented statement with no opener, a residual
    # "1: ..." line). Confirm against the complete file before changing logic.
    1: ...
    desc = """... ask me later! I'm on a deadline! ..."""
    parser = argparse.ArgumentParser(description=desc)
    # Logger is named after the last '/'-separated component of argv[0].
    logger = logging.getLogger(sys.argv[0].split('/')[-1])
    parser.add_argument('--hfile', type=str, required=True, nargs='+',
                        help="""Quoted ;-delimited list containing info about the files containing homology relationships:
[--hfile "path;header1;header2"].  At LEAST one hfile is required and all MUST have all three trailing data.""")
    parser.add_argument('--xprnfile', type=str, required=True, nargs='+',
                        help="""Quoted ;-delimited list containing info about the files containing expression data:
[--xprnfile "path;nameHeader;conditionHeader1;...;conditionHeaderN"].  At LEAST one xprnfile is required and all MUST have 
exactly one <path>, exactly one <nameHeader>, and at LEAST one <conditionHeader>. It is VERY important that 
you list the same number of conditions for each expnfile and that the order reflects which condition values are to be compared.""")
    parser.add_argument('--cmap', type=str, required=True, nargs='+',
                        help="""A list of species-prefix:color combinations to set the node colors:
[--cmap <AAEL:b ...>]. the number of combinations should match the number of files given to --xprnfile.""")
    parser.add_argument('--log', action='store_true',
                        help="""Plot the points on a log:log scale. (Default: %(default)s)""")
    parser.add_argument('--show', action='store_true',
                        help="""Plot the image for interactive manipulation, otherwise just write the file. (Default: %(default)s)""")
    parser.add_argument('--pdf', action='store_true',
                        help="""Plot the image as a pdf: png otherwise. Png is preferable when data size is large. (Default: %(default)s)""")
    parser.add_argument('--out', type=str, default='',
                        help="""Base path for output. (Default: current working directory)""")
    parser.add_argument('--load-pickle', type=str, default=False,
                        help="""Load graph from a gpickle. (Default: %(default)s)""")
    args = parser.parse_args()
    # some manual arg set-up and checking
    # Each --hfile value is split in place on ';' (path;header1;header2 per the
    # help text); anything other than exactly 3 fields is rejected.
    for i in range(len(args.hfile)):
        args.hfile[i] = args.hfile[i].split(';')
        if len(args.hfile[i]) != 3:
            raise SanityCheckError('EXACTLY 3 values must follow --hfile: you gave %s' % (args.hfile[i]))
    # NOTE(review): nothing visible ever adds to xLen, so as shown the
    # len(xLen) == 1 check below always raises. Presumably a line recording
    # len(args.xprnfile[i]) inside the loop is missing -- confirm.
    xLen = set()
    # Each --xprnfile value is split in place on ';' and must have >= 3 fields.
    for i in range(len(args.xprnfile)):
        args.xprnfile[i] = args.xprnfile[i].split(';')
        if not len(args.xprnfile[i]) >= 3:
            raise SanityCheckError('At LEAST 3 values must follow --xprnfile: you gave %s' % (args.xprnfile[i]))
    if not len(xLen) == 1:
        raise SanityCheckError('The same number of values must follow every --xprnfile flag.')
    # One --cmap entry is required per --xprnfile entry.
    if not len(args.xprnfile) == len(args.cmap):
        raise SanityCheckError('The length of values following --xprnfile and --cmap must be the same.')
    # Build the prefix -> color lookup from the "PREFIX:color" strings.
    cDict = {}
    for combo in args.cmap:
            # NOTE(review): the extra indentation suggests this statement sat
            # inside a block (e.g. try:) whose opener is not visible here.
            prefix,color = combo.split(':')
        cDict[prefix] = color
    # read in the expression vector data
    tmpDict = {}
    xDict = {}
    # NOTE(review): the body of this loop is not visible; presumably it is what
    # fills tmpDict -- confirm.
    for xfile in args.xprnfile:
    # convert -RX into -PX
    # Copies tmpDict into xDict with every '-R' in a key replaced by '-P'.
    for k,v in tmpDict.iteritems():
        xDict[k.replace('-R','-P')] = v
    if args.load_pickle:
        # NOTE: the value given to --load-pickle is only tested for truth; the
        # subgraphs are always read from this fixed /tmp path.
        subgraphs = nx.read_gpickle('/tmp/ortho_weighted_subgraphs.gpickle')
        # NOTE(review): the graph-building code below is shown at the same
        # level as the pickle load; presumably an `else:` separated the two
        # branches -- confirm.
        # lets get started: init the graph
        graph = nx.Graph()
        # NOTE(review): the body of this loop is not visible.
        for f in args.hfile:
        # remove the '' node caused by unpaired relationships
        # weight the edges in each graph by the pearsonr between their expression vectors
        # if the edge length is impossible to graph (inf or nan) kill the edge
        #badEdges = []
        #edgesMissingNodes = []
        #for i,j in graph.edges_iter():
                #if math.isnan(graph[i][j]['rVal']) or math.isinf(graph[i][j]['rVal']):
            #except KeyError:
        # Get all subgraphs
        subgraphs = nx.connected_component_subgraphs(graph)  
        print "I layed a pickle!!"
    # Extra settings are attached to args here; where they are consumed is not
    # visible in this function.
    args.galaxy = False
    #args.label2 = "Pct w/ significant positive corr (r >= 0.5, p <= 0.05)"
    args.label2 = "Pct with significant correlation" 
    # Within-species pass: one get_within_data() call per prefix from --cmap.
    for prefix in cDict:
        args.label1 = "Usable paralogs per subgraph within %s" % (prefix)
        #args.label1 = "%s x" % (prefix) 
        # NOTE(review): pearsonStats/data are rebound on every pass and are not
        # used in the visible code.
        pearsonStats,data = get_within_data(prefix,subgraphs)
    # NOTE: the species names in this label are hard-coded, while the prefixes
    # passed to get_between_data() come from --cmap.
    args.label1 = "Usable orthologs per subgraph between AGAP and CPIJ" 
    #args.label1 = "both x"  
    pearsonStats,data = get_between_data(prefixes=cDict.keys(),subgraphs=subgraphs)
    print "Done."
def main():
    # Command-line driver for the two steps described in the text below.
    # NOTE(review): the four lines that follow read like docstring text whose
    # triple-quote delimiters are missing, and several statements further down
    # also look truncated. Confirm against the complete file.
    1: Collect Tx from one or more species that are within at least some r value of similarity to
       a provided example Tx or a submitted hypothetical expression vector.
    2: Use GTFs, BEDtools, and genome FASTAs to extract the upstream flanking sequences into a new FASTA
       for use in motif discovery.
    desc = """(1) Collect Tx from one or more species that are within 
at least some r value of similarity to a provided example Tx or a 
submitted hypothetical expression vector. (2) Use GTFs, BEDtools, and 
genome FASTAs to extract the upstream flanking sequences into a new 
FASTA for use in motif discovery."""
    parser = argparse.ArgumentParser(description=desc)
    # This alias is not referenced again in the visible code.
    FileType = argparse.FileType
    # Logger is named after the last '/'-separated component of argv[0].
    logger = logging.getLogger(sys.argv[0].split('/')[-1])
    parser.add_argument('--expn-path', type=str, required=True,
                        help="""Path to expression table file. \n(default: %(default)s)""")
    parser.add_argument('--tx-name', type=str, required=True,
                        help="""Name of the Tx you want to use as a model. (default: %(default)s)""")
    parser.add_argument('--pearson-filter-type', type=str, default='>=', choices=['>=','<='],
                        help="""Use >= to find similar expn profiles or <= to find opposite profiles. (default: %(default)s)""")
    parser.add_argument('--pearson-filter-thresh', type=float, default=0.7,
                        help="""Set the threshold of the Pearson r value for the filter. (default: %(default)s)""")
    parser.add_argument('--pval-filter-thresh', type=float, default=0.05,
                            help="""Set the upper threshold for the p-value of the Pearson r values to keep. (default: %(default)s)""")    
    parser.add_argument('--tx-name-header', type=str, required=True,
                        help="""The text of the header in the expn table where tx names are stored. (default: %(default)s)""")
    parser.add_argument('--cond-headers', type=str, required=True, nargs='+',
                        help="""A list of the text of the headers in the expn table where the values for each condition are stored (--cond-headers cond1 cond2 ...). (default: %(default)s)""")
    # NOTE(review): nargs='?' accepts at most one value, although the help text
    # asks for a list of names -- confirm the intended arity.
    parser.add_argument('--manual-headers', type=str, required=False, nargs='?',
                        help="""If the expn table does not have headers, provide a list of ordered names for them here. (default: %(default)s)""")
    parser.add_argument('--gtf', type=str, required=True,
                        help="""The path to the gtf file that you want to use for your annotation. (default: %(default)s)""")
    parser.add_argument('--gtf-index', type=str, required=True,
                        help="""The path to the gtf index file generated from "gtf_to_genes". (default: %(default)s)""")
    parser.add_argument('--genome-fastas', type=str, required=True, nargs='+',
                        help="""A list of paths to genomic fasta files or directories where they are stored. (default: %(default)s)""")
    parser.add_argument('--flank-len', type=int, default=2000,
                        help="""The length in bp that should be harvested from the 5' end of the tx. (default: %(default)s)""")
    parser.add_argument('--out-dir', type=str, default='.',
                        help="""A path to a directory where you would like the output files to be stored. (default: %(default)s)""")
    parser.add_argument('--dump-megafasta', action='store_true',
                        help="""Save concatonated fasta file for debugging. (default: %(default)s)""")
    parser.add_argument('--dump-stats', action='store_true',
                            help="""Print a list of Tx/gene names and the r- p-values that passed the filter and exit without getting fastas. (default: %(default)s)""")    
    args = parser.parse_args()
    # tmp files will be stored here
    # Bag is used below with both item access (tmp_files['x']) and attribute
    # access (tmp_files.x).
    tmp_files = Bag()
    # 1: Use a correlation filter to pull out any Tx that is sufficiently similar to the model Tx
    vectDict = mangle_expn_vectors(expnPath=args.expn_path,txNameHeader=args.tx_name_header,condHeaders=args.cond_headers,manualHeaders=args.manual_headers)
    # The r-value predicate is built by eval()-ing a generated lambda. The
    # operator is limited to '>=' or '<=' by argparse `choices`, and the
    # threshold is a float rendered with %f (six decimal places), so no
    # free-form text reaches eval().
    # NOTE(review): a lookup into the `operator` module would avoid eval().
    filterFunc = eval("lambda x: x %s %f" % (args.pearson_filter_type, args.pearson_filter_thresh))
    # Raises KeyError if --tx-name is not a key of vectDict.
    filterDict = pearsonExpnFilter(modelVector=vectDict[args.tx_name], targetVectors=vectDict, filterFunc=filterFunc)
    # Keys of filterDict are indexed below as key[1] (compared with the p-value
    # threshold), x[0] (sort key) and x[2] (passed on as tx names); presumably
    # (r, p, txName) tuples -- verify against pearsonExpnFilter.
    # remove vectors whose r's pVal is not significant (p <= --pval-filter-thresh, default 0.05)
    sigVectors = {}
    for key in filterDict:
        if key[1] <= args.pval_filter_thresh:
            sigVectors[key] = filterDict[key]
    # The distance filter below is commented out, so matchVectors is simply
    # another name for sigVectors.
    matchVectors = sigVectors
    ## Impose a distance filter to further refine the gene set
    ## incorporating magnitudes of the absolute levels of gene expression
    ## set the boundaries of acceptable deviation for the target gene mean expression
    ## magnitude by bootstrapping.  The metric for comparison will be the average of
    ## the differences of each point in remaining vectors against the target
    ## vector.
    ## 1) calc the metrics for each remaining gene's vector
    ##    PS: numpy rocks.
    ##avgDists = {}
    ##for key in sigVectors:
        ##avgDist_i = np.mean(np.subtract(vectDict[args.tx_name],
        ##avgDists[key] = avgDist_i
    ### 2) bootstrap the average distances to get a standard error
    ##medianEst,stdErrEst,lo95,hi95 = basic_bootstrap_est(avgDists.values())
    ### 3) recover keys that fall within +/- 1 SE
    ##matchVectors = {}
    ##for key in avgDists:
        ##avgDist = avgDists[key]
        ##if (avgDist >= -stdErrEst) and (avgDist <= stdErrEst):
            ##matchVectors[key] = sigVectors[key]
    # Sort txList so that the highest r values are at the top
    # and save vectors and this info out to file
    txList = sorted(matchVectors.keys(),key=lambda x: x[0], reverse=True)
    # delete=False: this file is moved into args.out_dir at the end of main().
    sortedTxListFile = NamedTemporaryFile(mode='w+t',prefix='txExpnVectFilteredBy_r.',suffix=".tsv",delete=False)
    # Each row is written as the tab-joined key fields followed by the
    # tab-joined vector values.
    # NOTE(review): as shown, both writes sit under `if args.dump_stats:`;
    # presumably an `else:` between them is missing -- confirm.
    for row in txList:
        if args.dump_stats:
            sys.stdout.write('%s\t%s\n' % ('\t'.join(map(str,row)),'\t'.join(map(str,matchVectors[row]))))
            sortedTxListFile.write('%s\t%s\n' % ('\t'.join(map(str,row)),'\t'.join(map(str,matchVectors[row]))))
    # NOTE(review): the body of this `if` is not visible; the --dump-stats help
    # text says the program exits at this point -- confirm.
    if args.dump_stats:
    tmp_files['sortedTxListFile'] = sortedTxListFile

    # [-1] keeps only the last element of whatever the index lookup returns.
    g2gObj = gtf_to_genes.get_indexed_genes_matching_gtf_file_name(index_file_name=args.gtf_index, logger=logger, regex_str=args.gtf)[-1]
    txDict = filter_GTF_4_Tx(txList=[x[2] for x in txList],g2gObj=g2gObj)
    tmp_files['txBedFile'] = convert_2_bed(txDict=txDict)
    # 2: Use GTFs, BEDtools, and genome FASTAs to extract the upstream flanking sequences into a new FASTA
    fastaRecLengths,fastaSeqs = fastaRec_length_indexer(fastaFiles=args.genome_fastas)
    # One "<record name><TAB><length>" line per entry of fastaRecLengths.
    # NOTE(review): this temp file and megaFastaFile below use the default
    # delete=True, and no flush() is visible before their .name is handed to
    # get_fastas(); confirm the data is on disk by then.
    tmpFastaRecLengthFile = NamedTemporaryFile(mode='w+b',prefix='tmpFastaRecLengthFile.',suffix=".txt")
    for seqRec in fastaRecLengths:
        tmpFastaRecLengthFile.write("%s\t%s\n" % (seqRec,fastaRecLengths[seqRec]))

    # TODO: concatenate fasta files
    # Every sequence in fastaSeqs is written as one FASTA record into a single
    # temporary file.
    megaFastaFile = NamedTemporaryFile(mode='w+b',prefix='tmpMegaFastaFile.',suffix=".fas")
    for fasta in fastaSeqs:
        megaFastaFile.write('>%s\n%s\n' % (fasta,fastaSeqs[fasta]))
    # The result is stored as 'flankBed'; its .seqfn attribute is read further
    # down as the path of the flanking-sequence FASTA.
    tmp_files['flankBed'] = get_fastas(txBed=tmp_files.txBedFile.name,genomeFasta=megaFastaFile.name,lenIndex=tmpFastaRecLengthFile.name,lenFlanks=args.flank_len)
    # CLEAN UP:
    # TODO: Close all tmp_files, and move to args.outDir
    # NOTE(review): the try: opener(s) and the except bodies are not visible.
    # What remains sets .delete = False on each tmp_files entry and names
    # AttributeError as the handled error -- confirm the full logic.
    for f in tmp_files:
            tmp_files[f].delete = False
        except AttributeError:
        except AttributeError:
    # ['sortedTxListFile', 'flankBed', 'txBedFile', 'flankFasta']
    # Destination paths inside args.out_dir.
    # NOTE: the `flankBed` path is computed but never used in the visible code;
    # only three files are moved.
    sortedTxListFile = "%s/sortedTxList.tsv" % (args.out_dir)
    flankBed         = "%s/flankBed.bed" % (args.out_dir)
    txBedFile        = "%s/txBed.bed" % (args.out_dir)
    flankFasta       = "%s/flankFasta.fas" % (args.out_dir)
    shutil.move(tmp_files.sortedTxListFile.name, sortedTxListFile)
    shutil.move(tmp_files.txBedFile.name, txBedFile)
    shutil.move(tmp_files.flankBed.seqfn, flankFasta)
    # Optionally keep the concatenated FASTA. .delete is cleared before the
    # move, presumably so closing the temp file does not try to remove a file
    # that is no longer at its original path -- confirm.
    if args.dump_megafasta:
        megaFasta = "%s/megaFasta.fas" % (args.out_dir)
        megaFastaFile.delete = False
        shutil.move(megaFastaFile.name, megaFasta)