Example #1
def reduceCPseriesFiles(outputFiles,
                        reducedOutputFile,
                        indices=None,
                        tileOutputFile=None):
    """Concatenate the per-tile outputs and reduce to only include indices that are relevant."""
    # load all files in dict outputFiles
    allTiles = [fileio.loadFile(filename) for filename in outputFiles.values()]
    combined = pd.concat(allTiles)
    # clusters appearing on multiple tiles keep their first occurrence
    combined = combined.groupby(level=0).first()

    if indices is None:
        combined.to_pickle(reducedOutputFile)
    else:
        combined.loc[indices].to_pickle(reducedOutputFile)

    # find tile dict if tile output file is given
    if tileOutputFile is not None:
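        # .keys() and .values() iterate in matching order over an unmodified dict,
        # so each loaded table below pairs with its originating tile ID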
        tiles = pd.concat([
            pd.Series(index=s.index, data=tile)
            for s, tile in itertools.izip(allTiles, outputFiles.keys())
        ])
        tiles = tiles.groupby(level=0).first()
        if indices is None:
            tiles.to_pickle(tileOutputFile)
        else:
            tiles.loc[indices].to_pickle(tileOutputFile)

    return
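
# A minimal self-contained sketch of the reduction step above: pd.concat followed by
# groupby(level=0).first() deduplicates cluster IDs that appear on more than one tile,
# keeping the first occurrence. The toy cluster IDs below are made up for illustration.
import pandas as pd

tileA = pd.Series({'cluster1': 1.0, 'cluster2': 2.0})
tileB = pd.Series({'cluster2': 9.0, 'cluster3': 3.0})
combined = pd.concat([tileA, tileB]).groupby(level=0).first()
# combined holds cluster1 -> 1.0, cluster2 -> 2.0 (first tile wins), cluster3 -> 3.0
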
def main():

    # Get options and arguments from command line
    parser = argparse.ArgumentParser(description="summarize a CPseries file by variants given a CPannot file")
    parser.add_argument('--outSuffix', help="suffix for the output file; default is _processed.CPvariant")
    parser.add_argument('annotFilePath', help="path to the CPannot.pkl file")
    parser.add_argument('signalsFilePaths', nargs='*', help="paths to the CPseries to be combined")
    args = parser.parse_args()
    
    # Read CPannot files
    annot = pd.read_pickle(args.annotFilePath)

    # Read signals from CPseries and combine tiles
    allFiles = [fileio.loadFile(filePath) for filePath in args.signalsFilePaths]
    signals = pd.concat(allFiles, axis=1, join='outer')
    signals.columns = range(len(allFiles))

    # Define output file path
    # argparse with nargs='*' always yields a list, so split the first signals file path
    (signalDir, signalFilename) = os.path.split(args.signalsFilePaths[0])
    suffix = '_processed.CPvariant' if args.outSuffix is None else args.outSuffix
    outputFilePath = os.path.join(signalDir, signalFilename[:9] + suffix)

    # Join signals with CPannot
    signals_with_variants = signals.join(annot, how='inner')

    # Group by and summarize
    signals_by_variants = signals_with_variants.groupby('variant_number').median()
    
    # Write to output file
    signals_by_variants.to_csv(outputFilePath, sep='\t')

    return 1
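
# The core of the summarization above is an inner join on cluster ID followed by a
# per-variant median; a minimal sketch with made-up cluster and variant IDs:
import pandas as pd

signals = pd.DataFrame({0: [1.0, 2.0, 3.0]}, index=['c1', 'c2', 'c3'])
annot = pd.DataFrame({'variant_number': [7, 7, 8]}, index=['c1', 'c2', 'c3'])
by_variant = signals.join(annot, how='inner').groupby('variant_number').median()
# variant 7 -> median(1.0, 2.0) = 1.5; variant 8 -> 3.0
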
import os
import numpy as np
import pandas as pd
import itertools
from fittinglibs import fileio
from tectolibs import tectplots
from hjh.junction import Junction
from hjh.helix import Helix


# to start, load old helix model
model = fileio.loadFile('/home/sarah/JunctionLibrary/seq_params/linear_regression_length10.p')
seqs_per_length = {}
for length in [9, 10, 11]:
    # get all helix seqs of a particular length
    all_seqs = Junction(tuple(['W']*(length-1) + ['G'])).sequences
    
    # get rid of homopolymeric tracts
    max_homo_length = 3
    bases = ['A', 'C', 'G', 'U']
    max_num_GC_or_AU = np.ceil(length/2.)+1
    sub_index = []
    for idx, seq in all_seqs.side1.iteritems():
        too_homopolymeric = any([seq.find(base*(max_homo_length+1))>-1 for base in bases])
        too_gc_rich = len([s for s in seq if s=='C' or s=='G']) > max_num_GC_or_AU
        too_au_rich = len([s for s in seq if s=='A' or s=='U']) > max_num_GC_or_AU
        
        if not too_homopolymeric and not too_gc_rich and not too_au_rich:
            sub_index.append(idx)
    seqs_sub = all_seqs.loc[sub_index].copy()
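
# The composition filters above can be spot-checked on a single toy sequence; here
# the homopolymer test flags any run longer than max_homo_length:
toy_seq = 'AAAAGCU'  # contains an A run of length 4
max_homo = 3
bases = ['A', 'C', 'G', 'U']
too_homopolymeric = any(toy_seq.find(base * (max_homo + 1)) > -1 for base in bases)
# too_homopolymeric is True, so this sequence would be excluded from seqs_sub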
    
Example #4
def getSignalFromCPFluor(CPfluorfilename):
    """Starting from CPfluor, determine integrated signal."""
    fitResults = fileio.loadFile(CPfluorfilename)
    signal = 2 * np.pi * fitResults.amplitude * fitResults.sigma ** 2
    signal.loc[~fitResults.success.astype(bool)] = np.nan
    return signal
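
# The closed form used above, 2*pi*amplitude*sigma^2, is the volume under a
# symmetric 2D Gaussian. A quick numeric sanity check of that identity,
# independent of any CPfluor file:
import numpy as np

amplitude, sigma = 1.5, 2.0
x = np.linspace(-30, 30, 601)
xx, yy = np.meshgrid(x, x)
dx = x[1] - x[0]
numeric = (amplitude * np.exp(-(xx ** 2 + yy ** 2) / (2 * sigma ** 2))).sum() * dx ** 2
analytic = 2 * np.pi * amplitude * sigma ** 2
# numeric and analytic agree to well under 0.1%
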
        sys.exit()

    bindingSeriesFile = args.cpseries
    tileFile = args.tile_file
    timeDeltaFile = args.time_dict
    annotatedClusterFile = args.annotated_clusters
    #tile_to_subset = args.tile

    time_series = args.time_series

    # load files
    if args.out_file is None:
        outFile = fileio.stripExtension(bindingSeriesFile)
    else:
        outFile = args.out_file
    bindingSeries = fileio.loadFile(bindingSeriesFile)
    timeDict = fileio.loadFile(timeDeltaFile)
    tileSeries = fileio.loadFile(tileFile)
    '''
    # look only at clusters in tile
    print 'Only looking at clusters in tile %s...'%tile_to_subset
    index = tileSeries==tile_to_subset
    bindingSeries = bindingSeries.loc[index].copy()
    tileSeries = tileSeries.loc[index].copy()
    times = timeDict[tile_to_subset] 
    '''

    print 'Using time series from tile %s...' % time_series
    times = timeDict[time_series]
    '''
    # remove zero point if given
Example #6
    if args.out is None:
        print 'Error: please supply --out parameter. Something that looks like: '
        print 'paper/01_expt/results_tables/flow_3455.151204.error_scaled.results.pkl'
        sys.exit()

    if os.path.exists(args.out) and not args.force_overwrite:
        print 'Error: filename %s already exists. Use -f to overwrite.' % args.out
        sys.exit()

    key = args.key[0]
    if not os.path.exists(filename_table.loc[key, 'variant_table']):
        print 'Error: need to supply variant table filename for key %s in filename table %s.' % (
            key, args.data)
        sys.exit()

    variant_table = fileio.loadFile(filename_table.loc[key, 'variant_table'])
    if 'numTests' not in variant_table.columns.tolist():
        variant_table.rename(columns={'num_tests': 'numTests'}, inplace=True)
    affinity_data = exptplots.PerVariant(variant_table=variant_table)
    result_table = affinity_data.getResultsFromVariantTable()
    fileio.saveFile(args.out, result_table)

if args.mode == 'combine_results_table':
    """Combine two variant_tables to form a results table."""
    if args.out is None:
        print 'Error: please supply --out parameter. Something that looks like: '
        print 'paper/01_expt/results_tables/flow_3455.151204.error_scaled.results.pkl'
        sys.exit()

    if os.path.exists(args.out) and not args.force_overwrite:
        print 'Error: filename %s already exists. Use -f to overwrite.' % args.out
def main():
	################ Parse input parameters ################

	#set up command line argument parser
	parser = argparse.ArgumentParser(description='script for relating variantIDs from CPannot file to sequence')
	group = parser.add_argument_group('required arguments')
	group.add_argument('-a', '--annot_file', required=True,
						help='A .CPannot.pkl file')
	group.add_argument('-sd', '--seq_dir', required=True,
						help='A directory of .CPseq files')

	group = parser.add_argument_group('optional arguments for running script')
	group.add_argument('-l','--length', default="short",
						help='translate in "long" or "short" format: long will include every cluster in CPannot file, short will only show variantID and one sequence match. Default = short')
	group.add_argument('-sc','--seq_cols', default="3",
						help='Which sequence columns to output in the CPtranslate file. May use multiple columns for long format (separate by commas). key: 3 = r1, 5 = r2, 7 = i7, 9 = i5')
	group.add_argument('-od','--output_dir', default=os.getcwd(),
						help='Output directory; default is the current directory')
	group.add_argument('-n','--num_cores', default=1,
						help='How many cores to use for parallel processing')

	if not len(sys.argv) > 1:
		parser.print_help()
		sys.exit()

	##### parse command line arguments #####
	args = parser.parse_args()

	annot_file = args.annot_file
	seq_dir = args.seq_dir

	length = args.length
	if length != "short" and length != "long":
		print "Error: length must be either 'short' or 'long'. Exiting..."
		sys.exit()

	seq_cols = [0] + [int(n) - 1 for n in args.seq_cols.split(',')]
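	# e.g. the default '3' yields usecols [0, 2]: the cluster ID column plus read 1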
	seq_col_names = assign_names(seq_cols)

	output_dir = args.output_dir
	if not os.path.isdir(output_dir):
		print "Error: output directory is invalid. Exiting..."
		sys.exit()
	if output_dir[-1] != '/':
		output_dir = output_dir + '/'

	num_cores = int(args.num_cores)

	########################################


	# Read in CPannot file
	print "Reading in CPannot file..."
	start = time.time()
	annot_df = fileio.loadFile(annot_file)
	print "file loaded: {0:.2f} seconds\n".format(time.time() - start)

	# Read in CPseq files as a concatenated data frame
	print "Reading in CPseq files..."
	start = time.time()
	seq_files = cpfiletools.find_files_in_directory(seq_dir, ['.CPseq'])
	print "found CPseq files: "
	cpfiletools.printList(seq_files)
	seq_df = pd.DataFrame()
	for seq_file in seq_files:
		new_df = pd.read_csv(os.path.join(seq_dir, seq_file), sep='\t', index_col=0, usecols=seq_cols, header=None)
		seq_df = pd.concat([seq_df, new_df])
	seq_df.columns = seq_col_names
	print str(len(seq_files)) + " files loaded: {0:.2f} seconds\n".format(time.time() - start)
	
	# Merge the data frames 
	print "Merging data frames..."
	start = time.time()
	merged_df = annot_df.merge(seq_df, how='left', left_index=True, right_index=True)
	print "Merged: {0:.2f} seconds\n".format(time.time() - start)


	# Save long format CPtranslate if requested
	if length == "long":
		print "Saving long format CPtranslate.pkl..."
		start = time.time()
		# note: str.rstrip strips a trailing character set, not a suffix, so use replace
		filename = os.path.basename(annot_file).replace('.CPannot.pkl', '') + ".long.CPtranslate.pkl"
		print "filename = "+filename
		merged_df.to_pickle(output_dir+filename)
		print "Saved: {0:.2f} seconds\n".format(time.time() - start)

	# Create the short format CPtranslate:
	if length == "short":
		print "Generating short format CPtranslate..."
		start = time.time()
		# Group clusters by variant_ID
		grouped_variants = merged_df.groupby('variant_ID')

		all_variants = (Parallel(n_jobs=num_cores, verbose=10)
						(delayed(fetch_sequences)(name, group, seq_col_names) for name, group in grouped_variants))

		short_df = pd.DataFrame(all_variants)
		short_df.columns = ['variant_ID', 'count']+seq_col_names
		print "short format generated: {0:.2f} seconds\n".format(time.time() - start)
		print short_df.head()

		print "Saving short format CPtranslate.pkl..."
		start = time.time()
		filename = os.path.basename(annot_file).replace('.CPannot.pkl', '') + ".short.CPtranslate.pkl"
		short_df.to_pickle(output_dir+filename)
		print "Saved: {0:.2f} seconds\n".format(time.time() - start)