def mangle_expn_vectors(expnPath,txNameHeader,condHeaders,manualHeaders=False):
    """Build a dict mapping each transcript name to its expression vector.

    GIVEN:
    1) expnPath: expn table file path.
    2) txNameHeader: header name where txName lives.
    3) condHeaders: list of header names of conditions to be included
       (order should be meaningful).
    4) manualHeaders: manually set headers for the expn table file
       (provide a header name for EVERY column in the file, ONLY if no
       headers are already present).

    DO:
    1) Extract the txName and condition expn levels from expnPath and
       store in vectDict with txName as key and expn vector (list of
       floats, ordered as condHeaders) as value.

    RETURN:
    1) vectDict: see "DO:".
    """
    # Sanity checks -- isinstance is the idiomatic type test.
    if not isinstance(txNameHeader, str):
        raise SanityCheckError("txNameHeader must be type(''); you gave: %s." % (txNameHeader))
    if not isinstance(condHeaders, list):
        raise SanityCheckError("condHeaders must be type([]); you gave: %s." % (condHeaders))
    if manualHeaders:
        if not isinstance(manualHeaders, list):
            raise SanityCheckError("manualHeaders must be type([]); you gave: %s." % (manualHeaders))

    # lets go
    expnTable = tableFile2namedTuple(tablePath=expnPath, sep='\t', headers=manualHeaders)
    vectDict = {}
    for row in expnTable:
        # getattr replaces the direct __getattribute__ calls: same
        # lookup, clearer intent.
        vectDict[getattr(row, txNameHeader)] = [float(getattr(row, h)) for h in condHeaders]
    return vectDict
def main():
    """Rename every file in a directory according to a key table.

    For each file in 'dir', find the first key row whose Library string
    appears in the filename (and whose Index field is truthy), then
    prefix the filename with that row's Name. Files matching no key row
    are reported but left untouched.
    """
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('key', type=str,
                        help="""Path to file with rename data as in Description.""")
    parser.add_argument('dir', type=str,
                        help="""Path to directory with files to rename.""")

    # print the called command:
    print(" ".join(sys.argv))

    args = parser.parse_args()
    keyData = tableFile2namedTuple(args.key)
    wrkDirFiles = os.listdir(args.dir)
    for f in wrkDirFiles:
        success = False
        for k in keyData:
            if k.Index and k.Library in f:
                newName = '%s.%s' % (k.Name, f)
                # BUGFIX: os.listdir() returns bare filenames, so the
                # rename must happen inside args.dir -- the original
                # os.rename(f, newName) only worked when the CWD
                # happened to be args.dir.
                os.rename(os.path.join(args.dir, f),
                          os.path.join(args.dir, newName))
                success = True
                print("Renamed %s as %s." % (f, newName))
                break
        if not success:
            print("%s was not in the key file." % (f))
def import_edges(graphObj,edgeTablePath,startNodeHeader,endNodeHeader):
    """Populate a graph with edges read from a labeled tsv file.

    GIVEN:
    1) graphObj = networkx.Graph() instance
    2) edgeTablePath = path to labeled tsv file containing the edge
       relationships.
    3) startNodeHeader = header label containing the first node of the edge.
    4) endNodeHeader = header label containing the second node of the edge.

    DO:
    1) Populate graphObj with the edges.

    RETURN:
    1) None
    """
    edgeTable = tableFile2namedTuple(edgeTablePath)
    # getattr replaces the direct __getattribute__ calls; a generator
    # avoids materializing the intermediate edge list.
    graphObj.add_edges_from(
        (getattr(edge, startNodeHeader), getattr(edge, endNodeHeader))
        for edge in edgeTable)
def MB_2_gff3(resultTablePath,gff3Path):
    """Convert a Cufflinks result table into a GFF3 file.

    Each row's 'locus' field ("chrm:left-right") supplies the
    coordinates; rows whose left coordinate is < 1 are skipped.

    Returns the number of skipped rows so the caller can report them.
    """
    gff3_lines = []
    mb_table = tableFile2namedTuple(resultTablePath, sep='\t')
    skipped = 0
    for line in mb_table:
        # Normalize "chrm:left-right" to a single delimiter, then split.
        chrm, left, right = line.locus.replace('-', ':').split(':')
        if int(left) < 1:
            skipped += 1
            continue
        gff3_attributes = 'ID=%s;Alias=%s;Note=%s' % \
            (line.tracking_id, line.nearest_ref_id, line.class_code)
        gff3_lines.append([chrm,                      # seqid
                           'Cufflinks',               # source
                           'Assembled Tx boundries',  # type
                           left,                      # start
                           right,                     # end
                           line.q_value,              # score
                           '?',                       # strand (unknown)
                           '.',                       # phase
                           gff3_attributes])
    # 'with' guarantees the handle is closed even if a write fails.
    with open(gff3Path, 'w') as gff3Out:
        for line in gff3_lines:
            gff3Out.write('%s\n' % ('\t'.join(line)))
    return skipped
def main():
    """Inputs:
    -- Data tables containing at least 1 column of data tied to a column
       of data symbols
    -- Column names/positions to ID data symbols and data
    -- Plotting options
    -- Outfile path
    Outputs:
    -- Image file containing the scatterplot, Pearson stats, other
       useful info."""
    desc = """This script creates a scatterplot of FPKM transcript read-counts and reports the Pearson Correlation."""
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('table1', type=str,
                        help="""Path to first data-table.""")
    parser.add_argument('data1', type=str,
                        help="""Column header name or 0-based column number where data for table1 lives.""")
    parser.add_argument('ids1', type=str,
                        help="""Column header name or 0-based column number where the data symbols for table1 lives.""")
    parser.add_argument('--label1', type=str, default=' ',
                        help="""Axes name for data1. (Default:%(default)s)""")
    parser.add_argument('table2', type=str,
                        help="""Path to second data-table.""")
    parser.add_argument('data2', type=str,
                        help="""Column header name or 0-based column number where data for table2 lives.""")
    parser.add_argument('ids2', type=str,
                        help="""Column header name or 0-based column number where the data symbols for table2 lives.""")
    parser.add_argument('--label2', type=str, default=' ',
                        help="""Axes name for data2. (Default:%(default)s)""")
    parser.add_argument('--log', action='store_true',
                        help="""Plot the points on a log:log scale. (Default: %(default)s)""")
    parser.add_argument('--show', action='store_true',
                        help="""Plot the image for interactive manipulation, otherwise just write the file. (Default: %(default)s)""")
    parser.add_argument('--pdf', action='store_true',
                        help="""Plot the image as a pdf: png otherwise. Png is preferable when data size is large. (Default: %(default)s)""")
    parser.add_argument('--galaxy', action='store_true',
                        help="""Use symplified namings suitable for use with Galaxy tool template. 
(Default: %(default)s)""")
    parser.add_argument('--out', type=str, default='',
                        help="""Base path for output [superseded by --galaxy]. (Default: current working directory)""")

    # print the called command:
    print(" ".join(sys.argv))

    args = parser.parse_args()
    # The original wrapped these two calls in a bare `try/except: raise`,
    # which re-raises unconditionally -- a no-op removed here.
    args.table1 = tableFile2namedTuple(args.table1)
    args.table2 = tableFile2namedTuple(args.table2)

    # deal with names/header posibilities
    data = collectData(args)
    pearsonStats = stats.pearsonr(data[0], data[1])
    plotScatter(pearsonStats, data, args)
    print("Good bye!")
else: for i in range(len(opts.bins)): opts.bins[i] = int(opts.bins[i]) #if opts.range != None: #opts.range = opts.range.split() #if not (len(opts.range) == 2): #print "ERROR: range has len %s and not 2." #exit(1) #else: #opts.range[0],opts.range[1] = float(opts.range[0]),float(opts.range[1]) #opts.range = (min(opts.range), max(opts.range)) # --- Parse inFile --- table = tableFile2namedTuple(tablePath=args[0],sep='\t',headers=None) for i in range(len(table)): table[i] = Bag(table[i]._asdict()) if not opts.include: opts.include = table[0].keys() else: opts.include = opts.include.split(',') for i in range(len(table)): for column in opts.include: try: table[i][column] = float(table[i][column]) except ValueError: raise ValueError('A column you asked me to include contains a data-type other than "int" or "float"!: %s' % table[i][column]) # ------- Build Figs -------
def vectorBaseESTs_2_gff3(resultTablePath,gff3Path):
    """Write multiline GFF3 EST_match entries from an EBI alignment table.

    GIVEN:
    1) resultTablePath: table resulting from mySQL query of EBI's "other
       features" database schema with following query:
       SELECT s.name AS "chr", d.*
       FROM dna_align_feature d LEFT JOIN seq_region s
         ON (d.seq_region_id = s.seq_region_id)
       WHERE s.seq_region_id < 8
         AND d.analysis_id = 227
         AND s.seq_region_id = d.seq_region_id
    2) gff3Path: path to new gff3 file

    DO:
    1) For each row, create multiline GFF3 entries using
       dna_align_feature_id as 'ID' and hit_name as 'Alias', score as
       'Score' column, etc., using the cigar_line to generate the
       feature coords.
    2) Write data to gff3Path.

    RETURN:
    1) None.  (NOTE(review): the original docstring promised the gff3
       path, but the code never returned it; behavior kept as-is.)
    """
    # -- cigar operator handlers --------------------------------------
    # Each takes (current_loc, op_length) and returns
    # (new_current_loc, left, right); left/right are None when the op
    # emits no genome-anchored feature.
    def match(current_loc, value):
        """Consume genome coords and emit a feature span."""
        left = current_loc + 1
        right = left + value - 1
        return (right, left, right)

    def insertion(current_loc, value):
        """Consume genome coords but emit no feature.

        NOTE(review): advancing the cursor on 'I' mirrors the original
        code and is presumably correct for EBI-style cigar strings --
        confirm against the cigar spec if coordinates look shifted.
        """
        return (current_loc + value, None, None)

    def deletion(current_loc, value):
        """Consume nothing and emit no feature."""
        return (current_loc, None, None)

    cigar_operations = {'I': insertion, 'M': match, 'D': deletion}
    strandConversions = {'1': '+', '-1': '-'}

    gff3_lines = []
    est_table = tableFile2namedTuple(resultTablePath, sep=',')
    for align_feat in est_table:
        cig_tupl = parseCigarString(align_feat.cigar_line, kind='EBI')
        align_feat_lines = []
        far_left = int(align_feat.seq_region_start)
        far_right = int(align_feat.seq_region_end)
        current_loc = far_left - 1  # cursor sits just before the feature
        for op, value in cig_tupl:
            current_loc, left, right = cigar_operations[op.upper()](current_loc, int(value))
            if left is not None:
                # Construct the gff line for the match feature.
                gff3_attributes = 'ID=%s;Alias=%s' % (align_feat.dna_align_feature_id,
                                                      align_feat.hit_name)
                align_feat_lines.append([align_feat.chr,   # seqid
                                         'Exonerate',      # source
                                         'EST_match',      # type
                                         left,             # start
                                         right,            # end
                                         align_feat.score, # score
                                         strandConversions[align_feat.seq_region_strand],
                                         '.',              # phase
                                         gff3_attributes])
        # Sanity check after each align_feat: the cigar walk must end
        # exactly at the annotated right edge.
        if not current_loc == far_right:
            raise SanityCheckError()
        gff3_lines.extend(align_feat_lines)

    # Add sort code here if needed
    # ---- sort code here ----
    with open(gff3Path, 'w') as gff3out:
        for line in gff3_lines:
            gff3out.write('%s\n' % ('\t'.join([str(x) for x in line])))
) parser.add_option( "--cigars", dest="cigars", type="str", default=False, help="""Exact Title of Column holding the cigar strings. Exp: cigar_string (default=%default)""", ) cigTypes = ["ensembl", "exonerate"] parser.add_option( "--cigar-type", dest="cigar_type", type="str", default=False, help="""Type of cigar string. REQUIRED when using '--cigars'. Options: %s (default=%default)""" % (cigTypes), ) (opts, args) = parser.parse_args() if len(args) != 1: parser.print_help() exit() features = tableFile2namedTuple(args[0], sep=opts.sep) rowsByAlgn = groupFeatureAlignments(features, opts) print """track name=%s description="%s" useScore=0""" % (opts.track_name, opts.description) for alnmnt in rowsByAlgn: printBEDline(rowsByAlgn[alnmnt], opts)
def getPossumHitTable(path,headers=possumHeaders):
    """Load a PoSSuMSearch result file as an immutable sequence.

    Parses the tab-separated file at *path* with the given column
    *headers* and returns a tuple of named tuples, one per hit row.
    """
    hitRows = tableFile2namedTuple(path, sep='\t', headers=headers)
    return tuple(hitRows)