def mangle_expn_vectors(expnPath,txNameHeader,condHeaders,manualHeaders=False):
    """Build a dict mapping each transcript name to its expression vector.

    GIVEN:
    1) expnPath: expn table file path.
    2) txNameHeader: header name where txName lives.
    3) condHeaders: list of header names of conditions to be included
       (order should be meaningful).
    4) manualHeaders: manually set headers for the expn table file
       (provide a header name for EVERY column in the file, ONLY if no
       headers are already present).

    DO:
    1) Extract the txName and condition expn levels from expnPath and
       store in vectDict with txName as key and expn vector (list of
       floats, ordered as condHeaders) as value.

    RETURN:
    1) vectDict: see "DO:".
    """
    # Sanity checks -- isinstance is the idiomatic type test.
    if not isinstance(txNameHeader, str):
        raise SanityCheckError("txNameHeader must be type(''); you gave: %s." % (txNameHeader))
    if not isinstance(condHeaders, list):
        raise SanityCheckError("condHeaders must be type([]); you gave: %s." % (condHeaders))
    if manualHeaders:
        if not isinstance(manualHeaders, list):
            raise SanityCheckError("manualHeaders must be type([]); you gave: %s." % (manualHeaders))

    # lets go
    expnTable = tableFile2namedTuple(tablePath=expnPath, sep='\t', headers=manualHeaders)
    vectDict = {}
    for row in expnTable:
        # getattr replaces the direct __getattribute__ calls: same
        # lookup, clearer intent.
        vectDict[getattr(row, txNameHeader)] = [float(getattr(row, h)) for h in condHeaders]
    return vectDict
def main():
    """Rename every file in a directory according to a key table.

    For each file in 'dir', find the first key row whose Library string
    appears in the filename (and whose Index field is truthy), then
    prefix the filename with that row's Name. Files matching no key row
    are reported but left untouched.
    """
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('key', type=str,
                        help="""Path to file with rename data as in Description.""")
    parser.add_argument('dir', type=str,
                        help="""Path to directory with files to rename.""")

    # print the called command:
    print(" ".join(sys.argv))

    args = parser.parse_args()
    keyData = tableFile2namedTuple(args.key)
    wrkDirFiles = os.listdir(args.dir)
    for f in wrkDirFiles:
        success = False
        for k in keyData:
            if k.Index and k.Library in f:
                newName = '%s.%s' % (k.Name, f)
                # BUGFIX: os.listdir() returns bare filenames, so the
                # rename must happen inside args.dir -- the original
                # os.rename(f, newName) only worked when the CWD
                # happened to be args.dir.
                os.rename(os.path.join(args.dir, f),
                          os.path.join(args.dir, newName))
                success = True
                print("Renamed %s as %s." % (f, newName))
                break
        if not success:
            print("%s was not in the key file." % (f))
def import_edges(graphObj,edgeTablePath,startNodeHeader,endNodeHeader):
    """Populate a graph with edges read from a labeled tsv file.

    GIVEN:
    1) graphObj = networkx.Graph() instance
    2) edgeTablePath = path to labeled tsv file containing the edge
       relationships.
    3) startNodeHeader = header label containing the first node of the edge.
    4) endNodeHeader = header label containing the second node of the edge.

    DO:
    1) Populate graphObj with the edges.

    RETURN:
    1) None
    """
    edgeTable = tableFile2namedTuple(edgeTablePath)
    # getattr replaces the direct __getattribute__ calls; a generator
    # avoids materializing the intermediate edge list.
    graphObj.add_edges_from(
        (getattr(edge, startNodeHeader), getattr(edge, endNodeHeader))
        for edge in edgeTable)
def MB_2_gff3(resultTablePath,gff3Path):
    """Convert a Cufflinks result table into a GFF3 file.

    Each row's 'locus' field ("chrm:left-right") supplies the
    coordinates; rows whose left coordinate is < 1 are skipped.

    Returns the number of skipped rows so the caller can report them.
    """
    gff3_lines = []
    mb_table = tableFile2namedTuple(resultTablePath, sep='\t')
    skipped = 0
    for line in mb_table:
        # Normalize "chrm:left-right" to a single delimiter, then split.
        chrm, left, right = line.locus.replace('-', ':').split(':')
        if int(left) < 1:
            skipped += 1
            continue
        gff3_attributes = 'ID=%s;Alias=%s;Note=%s' % \
            (line.tracking_id, line.nearest_ref_id, line.class_code)
        gff3_lines.append([chrm,                      # seqid
                           'Cufflinks',               # source
                           'Assembled Tx boundries',  # type
                           left,                      # start
                           right,                     # end
                           line.q_value,              # score
                           '?',                       # strand (unknown)
                           '.',                       # phase
                           gff3_attributes])
    # 'with' guarantees the handle is closed even if a write fails.
    with open(gff3Path, 'w') as gff3Out:
        for line in gff3_lines:
            gff3Out.write('%s\n' % ('\t'.join(line)))
    return skipped
def main():
    """Inputs:
    -- Data tables containing at least 1 column of data tied to a column
       of data symbols
    -- Column names/positions to ID data symbols and data
    -- Plotting options
    -- Outfile path
    Outputs:
    -- Image file containing the scatterplot, Pearson stats, other
       useful info."""
    desc = """This script creates a scatterplot of FPKM transcript read-counts and reports the Pearson Correlation."""
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('table1', type=str,
                        help="""Path to first data-table.""")
    parser.add_argument('data1', type=str,
                        help="""Column header name or 0-based column number where data for table1 lives.""")
    parser.add_argument('ids1', type=str,
                        help="""Column header name or 0-based column number where the data symbols for table1 lives.""")
    parser.add_argument('--label1', type=str, default=' ',
                        help="""Axes name for data1. (Default:%(default)s)""")
    parser.add_argument('table2', type=str,
                        help="""Path to second data-table.""")
    parser.add_argument('data2', type=str,
                        help="""Column header name or 0-based column number where data for table2 lives.""")
    parser.add_argument('ids2', type=str,
                        help="""Column header name or 0-based column number where the data symbols for table2 lives.""")
    parser.add_argument('--label2', type=str, default=' ',
                        help="""Axes name for data2. (Default:%(default)s)""")
    parser.add_argument('--log', action='store_true',
                        help="""Plot the points on a log:log scale. (Default: %(default)s)""")
    parser.add_argument('--show', action='store_true',
                        help="""Plot the image for interactive manipulation, otherwise just write the file. (Default: %(default)s)""")
    parser.add_argument('--pdf', action='store_true',
                        help="""Plot the image as a pdf: png otherwise. Png is preferable when data size is large. (Default: %(default)s)""")
    parser.add_argument('--galaxy', action='store_true',
                        help="""Use symplified namings suitable for use with Galaxy tool template. 
(Default: %(default)s)""")
    parser.add_argument('--out', type=str, default='',
                        help="""Base path for output [superseded by --galaxy]. (Default: current working directory)""")

    # print the called command:
    print(" ".join(sys.argv))

    args = parser.parse_args()
    # The original wrapped these two calls in a bare `try/except: raise`,
    # which re-raises unconditionally -- a no-op removed here.
    args.table1 = tableFile2namedTuple(args.table1)
    args.table2 = tableFile2namedTuple(args.table2)

    # deal with names/header posibilities
    data = collectData(args)
    pearsonStats = stats.pearsonr(data[0], data[1])
    plotScatter(pearsonStats, data, args)
    print("Good bye!")
else: for i in range(len(opts.bins)): opts.bins[i] = int(opts.bins[i]) #if opts.range != None: #opts.range = opts.range.split() #if not (len(opts.range) == 2): #print "ERROR: range has len %s and not 2." #exit(1) #else: #opts.range[0],opts.range[1] = float(opts.range[0]),float(opts.range[1]) #opts.range = (min(opts.range), max(opts.range)) # --- Parse inFile --- table = tableFile2namedTuple(tablePath=args[0],sep='\t',headers=None) for i in range(len(table)): table[i] = Bag(table[i]._asdict()) if not opts.include: opts.include = table[0].keys() else: opts.include = opts.include.split(',') for i in range(len(table)): for column in opts.include: try: table[i][column] = float(table[i][column]) except ValueError: raise ValueError('A column you asked me to include contains a data-type other than "int" or "float"!: %s' % table[i][column]) # ------- Build Figs -------
def vectorBaseESTs_2_gff3(resultTablePath,gff3Path):
    """Write multiline GFF3 EST_match entries from an EBI alignment table.

    GIVEN:
    1) resultTablePath: table resulting from mySQL query of EBI's "other
       features" database schema with following query:
       SELECT s.name AS "chr", d.*
       FROM dna_align_feature d LEFT JOIN seq_region s
         ON (d.seq_region_id = s.seq_region_id)
       WHERE s.seq_region_id < 8
         AND d.analysis_id = 227
         AND s.seq_region_id = d.seq_region_id
    2) gff3Path: path to new gff3 file

    DO:
    1) For each row, create multiline GFF3 entries using
       dna_align_feature_id as 'ID' and hit_name as 'Alias', score as
       'Score' column, etc., using the cigar_line to generate the
       feature coords.
    2) Write data to gff3Path.

    RETURN:
    1) None.  (NOTE(review): the original docstring promised the gff3
       path, but the code never returned it; behavior kept as-is.)
    """
    # -- cigar operator handlers --------------------------------------
    # Each takes (current_loc, op_length) and returns
    # (new_current_loc, left, right); left/right are None when the op
    # emits no genome-anchored feature.
    def match(current_loc, value):
        """Consume genome coords and emit a feature span."""
        left = current_loc + 1
        right = left + value - 1
        return (right, left, right)

    def insertion(current_loc, value):
        """Consume genome coords but emit no feature.

        NOTE(review): advancing the cursor on 'I' mirrors the original
        code and is presumably correct for EBI-style cigar strings --
        confirm against the cigar spec if coordinates look shifted.
        """
        return (current_loc + value, None, None)

    def deletion(current_loc, value):
        """Consume nothing and emit no feature."""
        return (current_loc, None, None)

    cigar_operations = {'I': insertion, 'M': match, 'D': deletion}
    strandConversions = {'1': '+', '-1': '-'}

    gff3_lines = []
    est_table = tableFile2namedTuple(resultTablePath, sep=',')
    for align_feat in est_table:
        cig_tupl = parseCigarString(align_feat.cigar_line, kind='EBI')
        align_feat_lines = []
        far_left = int(align_feat.seq_region_start)
        far_right = int(align_feat.seq_region_end)
        current_loc = far_left - 1  # cursor sits just before the feature
        for op, value in cig_tupl:
            current_loc, left, right = cigar_operations[op.upper()](current_loc, int(value))
            if left is not None:
                # Construct the gff line for the match feature.
                gff3_attributes = 'ID=%s;Alias=%s' % (align_feat.dna_align_feature_id,
                                                      align_feat.hit_name)
                align_feat_lines.append([align_feat.chr,   # seqid
                                         'Exonerate',      # source
                                         'EST_match',      # type
                                         left,             # start
                                         right,            # end
                                         align_feat.score, # score
                                         strandConversions[align_feat.seq_region_strand],
                                         '.',              # phase
                                         gff3_attributes])
        # Sanity check after each align_feat: the cigar walk must end
        # exactly at the annotated right edge.
        if not current_loc == far_right:
            raise SanityCheckError()
        gff3_lines.extend(align_feat_lines)

    # Add sort code here if needed
    # ---- sort code here ----
    with open(gff3Path, 'w') as gff3out:
        for line in gff3_lines:
            gff3out.write('%s\n' % ('\t'.join([str(x) for x in line])))
) parser.add_option( "--cigars", dest="cigars", type="str", default=False, help="""Exact Title of Column holding the cigar strings. Exp: cigar_string (default=%default)""", ) cigTypes = ["ensembl", "exonerate"] parser.add_option( "--cigar-type", dest="cigar_type", type="str", default=False, help="""Type of cigar string. REQUIRED when using '--cigars'. Options: %s (default=%default)""" % (cigTypes), ) (opts, args) = parser.parse_args() if len(args) != 1: parser.print_help() exit() features = tableFile2namedTuple(args[0], sep=opts.sep) rowsByAlgn = groupFeatureAlignments(features, opts) print """track name=%s description="%s" useScore=0""" % (opts.track_name, opts.description) for alnmnt in rowsByAlgn: printBEDline(rowsByAlgn[alnmnt], opts)
def getPossumHitTable(path,headers=possumHeaders):
    """Load a PoSSuMSearch result file as an immutable sequence.

    Parses the tab-separated file at *path* with the given column
    *headers* and returns a tuple of named tuples, one per hit row.
    """
    hitRows = tableFile2namedTuple(path, sep='\t', headers=headers)
    return tuple(hitRows)