def reduceCPseriesFiles(outputFiles, reducedOutputFile, indices=None, tileOutputFile=None):
    """Concatenate the per-tile outputs and reduce to only include indices that are relevant."""
    # load all files in dict outputFiles (tile id -> filename)
    allTiles = [fileio.loadFile(filename) for filename in outputFiles.values()]
    a = pd.concat(allTiles)
    # collapse clusters that appear in more than one tile, keeping the first occurrence
    a = a.groupby(level=0).first()
    if indices is None:
        a.to_pickle(reducedOutputFile)
    else:
        a.loc[indices].to_pickle(reducedOutputFile)

    # save the cluster -> tile map if a tile output file is given
    # (keys() and values() pair up because the dict is not modified in between)
    if tileOutputFile is not None:
        tiles = pd.concat([pd.Series(index=s.index, data=tile)
                           for s, tile in itertools.izip(allTiles, outputFiles.keys())])
        tiles = tiles.groupby(level=0).first()
        if indices is None:
            tiles.to_pickle(tileOutputFile)
        else:
            tiles.loc[indices].to_pickle(tileOutputFile)
    return
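
# Hedged usage sketch (not part of the original pipeline): the tile keys and
# filenames below are hypothetical. outputFiles maps tile IDs to per-tile
# pickles, and indices restricts the concatenated output to annotated clusters.
def _example_reduce():
    outputFiles = {'tile001': 'mydata_tile001.pkl',   # hypothetical paths
                   'tile002': 'mydata_tile002.pkl'}
    goodClusters = fileio.loadFile('mydata.CPannot.pkl').index  # clusters to keep
    reduceCPseriesFiles(outputFiles, 'mydata_reduced.pkl',
                        indices=goodClusters, tileOutputFile='mydata_tiles.pkl')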
def main():
    # Get options and arguments from command line
    parser = argparse.ArgumentParser(description="summarize a CPseries file by variants given a CPannot file")
    parser.add_argument('--outSuffix', help="suffix for the output file; default is _processed.CPvariant")
    parser.add_argument('annotFilePath', help="path to the CPannot.pkl file")
    parser.add_argument('signalsFilePaths', nargs='*', help="paths to the CPseries files to be combined")
    args = parser.parse_args()

    # Read CPannot file
    annot = pd.read_pickle(args.annotFilePath)

    # Read signals from CPseries files and combine tiles
    allFiles = []
    for filePath in args.signalsFilePaths:
        allFiles.append(fileio.loadFile(filePath))
    signals = pd.concat(allFiles, axis=1, join='outer')
    signals.columns = range(len(allFiles))

    # Define output file path (nargs='*' always yields a list, so take the first path)
    signalDir, signalFilename = os.path.split(args.signalsFilePaths[0])
    if args.outSuffix is None:
        outputFilePath = os.path.join(signalDir, signalFilename[0:9] + '_processed.CPvariant')
    else:
        outputFilePath = os.path.join(signalDir, signalFilename[0:9] + args.outSuffix)

    # Join signals with CPannot
    signals_with_variants = signals.join(annot, how='inner')

    # Group by variant and summarize with the median across clusters
    signals_by_variants = signals_with_variants.groupby('variant_number').median()

    # Write to output file
    signals_by_variants.to_csv(outputFilePath, sep='\t')
    return 1
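
# Minimal illustration (synthetic data, not from the source) of the join/groupby
# step above: cluster-level signals are matched to variant numbers via the shared
# cluster index, then reduced to one median row per variant.
def _example_summarize():
    signals = pd.DataFrame({0: [1.0, 2.0, 4.0], 1: [2.0, 3.0, 5.0]},
                           index=['c1', 'c2', 'c3'])    # clusters x timepoints
    annot = pd.DataFrame({'variant_number': [7, 7, 8]},
                         index=['c1', 'c2', 'c3'])      # cluster -> variant
    return signals.join(annot, how='inner').groupby('variant_number').median()
    # variant 7 -> median of c1 and c2; variant 8 -> c3's values unchanged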
import os
import numpy as np
import pandas as pd
import itertools
from fittinglibs import fileio
from tectolibs import tectplots
from hjh.junction import Junction
from hjh.helix import Helix

# to start, load old helix model
model = fileio.loadFile('/home/sarah/JunctionLibrary/seq_params/linear_regression_length10.p')

seqs_per_length = {}
for length in [9, 10, 11]:
    # get all helix seqs of a particular length
    all_seqs = Junction(tuple(['W']*(length-1) + ['G'])).sequences

    # get rid of homopolymeric tracts and strongly biased base composition
    max_homo_length = 3
    bases = ['A', 'C', 'G', 'U']
    max_num_GC_or_AU = np.ceil(length/2.) + 1
    sub_index = []
    for idx, seq in all_seqs.side1.iteritems():
        too_homopolymeric = any([seq.find(base*(max_homo_length+1)) > -1 for base in bases])
        too_gc_rich = len([s for s in seq if s == 'C' or s == 'G']) > max_num_GC_or_AU
        too_au_rich = len([s for s in seq if s == 'A' or s == 'U']) > max_num_GC_or_AU
        if not too_homopolymeric and not too_gc_rich and not too_au_rich:
            sub_index.append(idx)
    seqs_sub = all_seqs.loc[sub_index].copy()
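
# Worked check (illustrative, not from the source) of the sequence filters above:
# a length-10 sequence is kept only if it has no run of 4+ identical bases and at
# most ceil(10/2)+1 = 6 G/C bases and 6 A/U bases.
def _passes_filters(seq, max_homo_length=3):
    max_num = np.ceil(len(seq)/2.) + 1
    too_homopolymeric = any(base*(max_homo_length+1) in seq for base in 'ACGU')
    too_gc_rich = sum(s in 'CG' for s in seq) > max_num
    too_au_rich = sum(s in 'AU' for s in seq) > max_num
    return not (too_homopolymeric or too_gc_rich or too_au_rich)

# _passes_filters('AAAACGUCGU') -> False (AAAA run)
# _passes_filters('AACGUCGUAG') -> True (no long run, 5 G/C and 5 A/U)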
def getSignalFromCPFluor(CPfluorfilename):
    """Starting from CPfluor, determine integrated signal."""
    fitResults = fileio.loadFile(CPfluorfilename)
    # volume under a symmetric 2D Gaussian with amplitude A and width sigma is 2*pi*A*sigma^2
    signal = 2*np.pi*fitResults.amplitude*fitResults.sigma*fitResults.sigma
    # mask clusters whose fits did not converge
    signal.loc[~fitResults.success.astype(bool)] = np.nan
    return signal
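
# Sanity check (illustrative aside): the 2*pi*A*sigma^2 factor is the integral of
# A*exp(-(x^2 + y^2)/(2*sigma^2)) over the plane, which a coarse grid sum reproduces.
def _check_gaussian_volume(A=3.0, sigma=1.5):
    x = np.linspace(-10*sigma, 10*sigma, 2001)
    dx = x[1] - x[0]
    xx, yy = np.meshgrid(x, x)
    numeric = (A*np.exp(-(xx**2 + yy**2)/(2*sigma**2))).sum() * dx * dx
    analytic = 2*np.pi*A*sigma**2
    return numeric, analytic  # should agree to several decimal places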
    sys.exit()

bindingSeriesFile = args.cpseries
tileFile = args.tile_file
timeDeltaFile = args.time_dict
annotatedClusterFile = args.annotated_clusters
#tile_to_subset = args.tile
time_series = args.time_series

# load files
if args.out_file is None:
    outFile = fileio.stripExtension(bindingSeriesFile)
else:
    outFile = args.out_file

bindingSeries = fileio.loadFile(bindingSeriesFile)
timeDict = fileio.loadFile(timeDeltaFile)
tileSeries = fileio.loadFile(tileFile)

'''
# look only at clusters in tile
print 'Only looking at clusters in tile %s...'%tile_to_subset
index = tileSeries==tile_to_subset
bindingSeries = bindingSeries.loc[index].copy()
tileSeries = tileSeries.loc[index].copy()
times = timeDict[tile_to_subset]
'''
print 'Using time series from tile %s...' % time_series
times = timeDict[time_series]

'''
# remove zero point if given
    if args.out is None:
        print 'Error: please supply --out parameter. Something that looks like: '
        print 'paper/01_expt/results_tables/flow_3455.151204.error_scaled.results.pkl'
        sys.exit()
    if os.path.exists(args.out) and not args.force_overwrite:
        print 'Error: filename %s already exists. Use -f to overwrite.' % args.out
        sys.exit()

    key = args.key[0]
    if not os.path.exists(filename_table.loc[key, 'variant_table']):
        print 'Error: need to supply variant table filename for key %s in filename table %s.' % (key, args.data)
        sys.exit()
    variant_table = fileio.loadFile(filename_table.loc[key, 'variant_table'])
    if 'numTests' not in variant_table.columns.tolist():
        variant_table.rename(columns={'num_tests': 'numTests'}, inplace=True)
    affinity_data = exptplots.PerVariant(variant_table=variant_table)
    result_table = affinity_data.getResultsFromVariantTable()
    fileio.saveFile(args.out, result_table)

if args.mode == 'combine_results_table':
    """Combine two variant_tables to form a results table."""
    if args.out is None:
        print 'Error: please supply --out parameter. Something that looks like: '
        print 'paper/01_expt/results_tables/flow_3455.151204.error_scaled.results.pkl'
        sys.exit()
    if os.path.exists(args.out) and not args.force_overwrite:
        print 'Error: filename %s already exists. Use -f to overwrite.' % args.out
def main():
    ################ Parse input parameters ################

    # set up command line argument parser
    parser = argparse.ArgumentParser(description='script for relating variantIDs from CPannot file to sequence')
    group = parser.add_argument_group('required arguments')
    group.add_argument('-a', '--annot_file', required=True,
                       help='A .CPannot.pkl file')
    group.add_argument('-sd', '--seq_dir', required=True,
                       help='A directory of .CPseq files')
    group = parser.add_argument_group('optional arguments for running script')
    group.add_argument('-l', '--length', default="short",
                       help='translate in "long" or "short" format: long will include every cluster in the CPannot file, short will only show variantID and one sequence match. Default = short')
    group.add_argument('-sc', '--seq_cols', default="3",
                       help='Which sequence columns to output in the CPtranslate file. May use multiple columns for long format (separate by commas). key: 3 = r1, 5 = r2, 7 = i7, 9 = i5')
    group.add_argument('-od', '--output_dir', default=os.getcwd(),
                       help='Output directory. Default is current directory')
    group.add_argument('-n', '--num_cores', default=1,
                       help='How many cores to use for parallel processing')

    if not len(sys.argv) > 1:
        parser.print_help()
        sys.exit()

    ##### parse command line arguments #####
    args = parser.parse_args()
    annot_file = args.annot_file
    seq_dir = args.seq_dir

    length = args.length
    if length != "short" and length != "long":
        print "Error: length must be either 'short' or 'long'. Exiting..."
        sys.exit()

    seq_cols = [0] + [int(n) - 1 for n in args.seq_cols.split(',')]
    seq_col_names = assign_names(seq_cols)

    output_dir = args.output_dir
    if not os.path.isdir(output_dir):
        print "Error: output directory is invalid. Exiting..."
        sys.exit()
    if output_dir[-1] != '/':
        output_dir = output_dir + '/'

    num_cores = int(args.num_cores)
    ########################################

    # Read in CPannot file
    print "Reading in CPannot file..."
    start = time.time()
    annot_df = fileio.loadFile(annot_file)
    print "file loaded: {0:.2f} seconds\n".format(time.time() - start)

    # Read in CPseq files as a concatenated data frame
    print "Reading in CPseq files..."
    start = time.time()
    seq_files = cpfiletools.find_files_in_directory(seq_dir, ['.CPseq'])
    print "found CPseq files: "
    cpfiletools.printList(seq_files)
    seq_df = pd.DataFrame()
    for seq_file in seq_files:
        new_df = pd.read_csv(seq_dir+seq_file, sep='\t', index_col=0, usecols=seq_cols, header=None)
        seq_df = pd.concat([seq_df, new_df])
    seq_df.columns = seq_col_names
    print str(len(seq_files)) + " files loaded: {0:.2f} seconds\n".format(time.time() - start)

    # Merge the data frames
    print "Merging data frames..."
    start = time.time()
    merged_df = annot_df.merge(seq_df, how='left', left_index=True, right_index=True)
    print "Merged: {0:.2f} seconds\n".format(time.time() - start)

    # base name for output files; note that str.rstrip strips a character set, not
    # a suffix, so use replace to drop the '.CPannot.pkl' extension instead
    base_name = os.path.basename(annot_file).replace('.CPannot.pkl', '')

    # Save long format CPtranslate if requested
    if length == "long":
        print "Saving long format CPtranslate.pkl..."
        start = time.time()
        filename = base_name + ".long.CPtranslate.pkl"
        print "filename = " + filename
        merged_df.to_pickle(output_dir+filename)
        print "Saved: {0:.2f} seconds\n".format(time.time() - start)

    # Create the short format CPtranslate:
    if length == "short":
        print "Generating short format CPtranslate..."
        start = time.time()
        # Make a list of unique variant_IDs and summarize each group in parallel
        grouped_variants = merged_df.groupby('variant_ID')
        all_variants = (Parallel(n_jobs=num_cores, verbose=10)
                        (delayed(fetch_sequences)(name, group, seq_col_names)
                         for name, group in grouped_variants))
        short_df = pd.DataFrame(all_variants)
        short_df.columns = ['variant_ID', 'count'] + seq_col_names
        print "short format generated: {0:.2f} seconds\n".format(time.time() - start)
        print short_df.head()

        print "Saving short format CPtranslate.pkl..."
        start = time.time()
        filename = base_name + ".short.CPtranslate.pkl"
        short_df.to_pickle(output_dir+filename)
        print "Saved: {0:.2f} seconds\n".format(time.time() - start)
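
# Illustrative aside on the suffix fix above: str.rstrip treats its argument as a
# set of characters to remove, not a literal suffix, so filenames ending in any of
# those characters get mangled; replace() removes the intended extension safely.
def _rstrip_vs_replace_demo():
    name = 'all_anneal.CPannot.pkl'
    assert name.rstrip('.CPannot.pkl') == 'all_anne'         # trailing 'al' eaten too
    assert name.replace('.CPannot.pkl', '') == 'all_anneal'  # intended base name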