import argparse
import os
import random
import re
import string
import subprocess
import sys
import time

import numpy as np
import pandas as pd

import cpfiletools


def main():
    ################ Parse input parameters ################

    # set up command line argument parser
    parser = argparse.ArgumentParser(
        description='Script for assigning unique IDs to variants in CPseries files')
    group = parser.add_argument_group('required arguments')
    group.add_argument('-sd', '--series_directory', required=True,
                       help='directory that holds the CPseries files that need variant IDs')
    group.add_argument('-sc', '--seq_column', required=True,
                       help='which column in the CPseries file you want to use for assigning variants')
    group = parser.add_argument_group('optional arguments for processing data')
    group.add_argument('-pi', '--previous_ID_file', default="",
                       help='an ID file previously created for variants expected in the new CPseries files')
    group.add_argument('-st', '--seq_start', default=0,
                       help='start position within sequence for matching. Will use beginning of sequence if none specified.')
    group.add_argument('-ed', '--seq_end', default=0,
                       help='end position within sequence for matching. Will use end of sequence if none specified.')
    group.add_argument('-lb', '--label', default="ID_ed",
                       help='label attached to output files. Default is "ID_ed"')
    group.add_argument('-od', '--output_directory', default="",
                       help='output directory for series files with labeled variants (default will use series_directory)')
    group.add_argument('-if', '--ID_file', default="ID_file.txt",
                       help='file name for the list of IDs and corresponding sequences. Default is "ID_file.txt"')

    if not len(sys.argv) > 1:
        parser.print_help()
        sys.exit()

    # parse command line arguments
    args = parser.parse_args()

    # If no output directory given, use input directory
    if args.output_directory == "":
        args.output_directory = args.series_directory

    # This script will run through each of the provided CPseries files sequentially in order to
    # ensure that each variant gets assigned only one variant ID.
    CPseriesFiles = cpfiletools.find_files_in_directory(args.series_directory, ['.CPseries'])

    # Count total clusters so we can pre-generate enough unique random IDs
    numLines = 0
    for seriesFile in CPseriesFiles:
        numLines += int(subprocess.check_output(
            ("wc -l {} | ".format(os.path.join(args.series_directory, seriesFile))
             + " awk '{print $1}'"), shell=True).strip())

    start = time.time()
    randID_set = set()
    print "Generating random IDs..."
    while len(randID_set) < numLines:
        randID = ''.join([random.choice(string.ascii_uppercase + string.digits)
                          for n in range(8)])  # 36^8 ~ 2.8e12 possible IDs
        randID_set.add(randID)
    print "ID generation: {0:.2f} seconds".format(time.time() - start)

    # This dictionary will contain all the variants assigned, keyed by sequence match.
    # The entries in variantDict are three-element lists: the first is the ID, the second is
    # the filter associated with that variant (if any), and the third is the number of times
    # that variant has been seen.
    variantDict = {}

    # If a previous ID file was provided, pre-populate the variantDict.
    # Note: it is up to the user to ensure that seq_column, seq_start and seq_end match those
    # used to create the previous ID file!
    if args.previous_ID_file != "":
        with open(args.previous_ID_file, 'r') as f:
            for line in f:
                seq, ID, filtr, n = line.split()
                variantDict[seq] = [ID, filtr, int(n)]

    fileNum = 1
    # Loop through each CPseries file to assign variants:
    for seriesFile in CPseriesFiles:
        print "Working on file: {}...{} of {}".format(seriesFile, fileNum, len(CPseriesFiles))
        labeled_filename = os.path.join(
            args.output_directory,
            ".".join(['_'.join([os.path.splitext(seriesFile)[0], args.label]), 'CPseries']))

        # Time each loop for now:
        start = time.time()

        # Read in CPseries file as pandas df
        series_df = pd.read_table(os.path.join(args.series_directory, seriesFile), header=None)

        # set sequence selection parameters:
        seq_col = int(args.seq_column) - 1  # Allow for intuitive column selection (i.e. start at 1)
        if seq_col < 0 or seq_col >= len(series_df.columns):
            print "Error: invalid seq column selected. Out of range. Must be within {} and {}".format(
                1, len(series_df.columns))
            sys.exit()

        # Test to ensure the provided column contains sequence data:
        test_seq = series_df.iloc[0, seq_col]
        if not re.match("^[a-zA-Z]+$", test_seq):
            print "Error: provided column does not contain sequence data, e.g. {}".format(test_seq)
            sys.exit()

        # Test to ensure the start and end sequence positions are valid:
        seq_length = len(series_df.iloc[0, seq_col])
        strt = int(args.seq_start)
        if strt < 0 or strt > seq_length - 1:
            print "Error: invalid start position selected. Must be non-negative and less than seq length"
            sys.exit()
        end = int(args.seq_end)
        # If no end position provided, use the entire sequence length
        # (handled before validation so the default of 0 is not rejected when strt > 0)
        if end == 0:
            end = seq_length
        if end < strt or end > seq_length:
            print "Error: invalid end position selected. Must be >= start position and <= seq length"
            sys.exit()

        # Fill in the list of IDs to be used as a new column
        IDs = []
        total_rows = len(series_df.index)
        # Iterate through the entire CPseries file:
        for row in range(total_rows):
            seq = series_df.iloc[row, seq_col][strt:end]
            # If the sub-sequence has already been seen, assign the existing ID
            if seq in variantDict:
                IDs.append(variantDict[seq][0])
                variantDict[seq][2] += 1  # Count how many times a variant has been seen
            else:
                newID = randID_set.pop()
                IDs.append(newID)
                variantDict[seq] = [newID, series_df.iloc[row, 1], 1]
            # Curtis' cool progress bar:
            cpfiletools.update_progress(row, total_rows)

        # Add in the new ID column (currently puts it next to the filter column):
        series_df.insert(loc=2, column="IDs", value=IDs)
        np.savetxt(labeled_filename, series_df.values, fmt='%s', delimiter='\t')
        print "finished file: {0:.2f} seconds".format(time.time() - start)
        fileNum += 1

    # Now write a file containing the key for all the assigned IDs:
    print "Creating ID file: {}...".format(args.ID_file)
    variant_df = pd.DataFrame(variantDict).transpose()
    seqs = list(variant_df.index)
    variant_df.insert(loc=0, column="sequence", value=seqs)
    # Sort by number of times each variant was seen, then by sequence
    sorted_df = variant_df.sort([2, "sequence"], ascending=[False, True])
    np.savetxt(os.path.join(args.output_directory, args.ID_file),
               sorted_df.values, fmt='%s', delimiter='\t')
    print "Done"
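# Entry-point guard so this script can be run directly from the command line.
# The invocation below is illustrative only: the script file name and paths
# are assumptions, not part of the original source.
#
#   python assign_variant_ids_cpseries.py -sd ./series_files -sc 3
if __name__ == '__main__':
    main()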
import argparse
import os
import sys
import time

import numpy as np
import pandas as pd
from joblib import Parallel, delayed

import cpfiletools

# The column indices clusterIdCol, filterCol, variantIDCol and fluorSeriesCol, and the
# helpers loadAndConcatAllTiles, selectData and bootstrapOneVariant, are defined elsewhere
# in this script.


def main():
    ################ Parse input parameters ################

    # set up command line argument parser
    parser = argparse.ArgumentParser(
        description='Script for bootstrapping fluorescence values from CPsignal files')
    group = parser.add_argument_group('required arguments for processing data')
    group.add_argument('-sd', '--CPsignal_dir', required=True,
                       help='directory that holds the CPsignal files you want to get data from')
    group = parser.add_argument_group('optional arguments for processing data')
    group.add_argument('-bt', '--bootstrap_type', default="v",
                       help='how to subset data for bootstrapping: f = by filter, v = by variant. Default = "v"')
    group.add_argument('-fs', '--filter_set', default="all",
                       help='which filters you want to bootstrap. Default = "all"')
    group.add_argument('-vs', '--variant_set', default="all",
                       help='which variants you want to bootstrap. Default = "all"')
    group.add_argument('-st', '--statistic', default='median',
                       help='statistic you want to bootstrap. Default = "median". Options: "median", "mean"')
    group.add_argument('-nb', '--num_bootstraps', default=1000,
                       help='number of times to bootstrap. Default = 1000')
    group.add_argument('-mr', '--min_replicates', default=10,
                       help='minimum number of replicates a variant must have for bootstrapping. Default = 10')
    group.add_argument('-ci', '--confidence_interval', default='95',
                       help='percent confidence interval to report on the bootstrapped statistic. Default = 95')
    group.add_argument('-od', '--output_dir', default="CPsignal_dir",
                       help='save output files here. Default is the provided CPsignal directory')
    group.add_argument('-op', '--output_prefix', default="bootstrap_fluorescence",
                       help='output file prefix. Default = "bootstrap_fluorescence"')
    group = parser.add_argument_group('other settings')
    group.add_argument('-n', '--num_cores', type=int, default=1,
                       help='maximum number of cores to use. default=1')

    if not len(sys.argv) > 1:
        parser.print_help()
        sys.exit()

    # parse command line arguments and check for problems
    args = parser.parse_args()
    numCores = int(args.num_cores)
    signal_files = cpfiletools.find_files_in_directory(args.CPsignal_dir, ['.CPseries'])
    bootstrap_type = args.bootstrap_type
    if bootstrap_type != 'f' and bootstrap_type != 'v':
        print >> sys.stderr, "Error: bootstrap type invalid (must be either 'f' or 'v'). Exiting..."
        sys.exit()
    filter_set = str.split(args.filter_set, ',')
    variant_set = str.split(args.variant_set, ',')
    statistic = args.statistic
    if statistic != 'median' and statistic != 'mean':
        print >> sys.stderr, "Error: statistic choice invalid. Exiting..."
        sys.exit()
    num_bootstraps = int(args.num_bootstraps)
    min_replicates = int(args.min_replicates)
    if 0 < float(args.confidence_interval) < 100:
        confidence_interval = [(100 - float(args.confidence_interval)) / 2,
                               100 - ((100 - float(args.confidence_interval)) / 2)]
    else:
        print >> sys.stderr, "Error: confidence interval must be between 0 and 100. Exiting..."
        sys.exit()
    if args.output_dir == "CPsignal_dir":
        output_directory = args.CPsignal_dir
    else:
        output_directory = args.output_dir
    if not os.path.isdir(output_directory):
        print >> sys.stderr, "Error: output directory is not a valid directory. Exiting..."
        sys.exit()

    # Read in the CPseries files:
    print "Reading in data and subsetting if necessary..."
    start = time.time()
    series = loadAndConcatAllTiles(signal_files, args.CPsignal_dir)

    # Subset the data of interest.
    # (If you don't reset the index here, pandas gives you a hard time concatenating the two
    # data frames in the next step.)
    series = selectData(filter_set, variant_set, filterCol, variantIDCol, series).reset_index(drop=True)

    print "\nStructuring data for bootstrapping..."
    ### Restructure the data frame such that fluorescence values are in their own columns
    all_fluor_series = []
    indexes = range(len(series.iloc[0, fluorSeriesCol].split(',')))
    # Pull out the fluorescence series and put them into a data frame
    for i in xrange(len(series)):
        fluorescence_series = np.array(
            [float(j) for j in series.iloc[i, fluorSeriesCol].split(',')])
        # Label unquantified clusters now, since it allows for fast removal later:
        # all-NaN series get a 0, quantified clusters get a 1
        if all(np.isnan(fluorescence_series)):
            fluorescence_series = np.append(fluorescence_series, 0)
        else:
            fluorescence_series = np.append(fluorescence_series, 1)
        all_fluor_series.append(fluorescence_series)
    fluor_data_df = pd.DataFrame(all_fluor_series)
    fluor_data_df.columns = indexes + ['Quantified']

    # Separate out the ID columns from the working series and give them names
    id_cols = series.iloc[:, [clusterIdCol, filterCol, variantIDCol]]
    id_cols.columns = ["clusterID", "filterID", "variantID"]

    # Create the new working series
    frames = [id_cols, fluor_data_df]
    series = pd.concat(frames, axis=1)
    print "Done: {0:.2f} seconds".format(time.time() - start)

    # Remove all clusters that have no associated values
    print "\nRemoving unquantified clusters..."
    start = time.time()
    count = len(series.index)
    series = series.loc[series["Quantified"] == 1]
    series.drop("Quantified", axis=1, inplace=True)
    count = count - len(series.index)
    print "Removed " + str(count) + " unquantified clusters: {0:.2f} seconds".format(time.time() - start)

    ### Perform bootstrapping ###
    print "\nPerforming bootstrapping..."
    start = time.time()
    if bootstrap_type == 'v':
        allVariants = set(series.iloc[:, variantIDCol])
        namesToBootstrap = list(allVariants)
        label = "variantID"
    if bootstrap_type == 'f':
        allFilters = set(series.iloc[:, filterCol])
        namesToBootstrap = list(allFilters)
        label = "filterID"
    print "bootstrapping {} unique variants...".format(len(namesToBootstrap))

    # bootstrapOneVariant(variantSeries, indexes, variantName, numBootstraps,
    #                     minReplicates, statistic, confidence_interval)
    if numCores > 1:
        allBootstrappedValues = (Parallel(n_jobs=numCores, verbose=10)
                                 (delayed(bootstrapOneVariant)(series.loc[series[label] == name, :],
                                                               indexes, name, num_bootstraps,
                                                               min_replicates, statistic,
                                                               confidence_interval)
                                  for name in namesToBootstrap))
    else:
        allBootstrappedValues = [bootstrapOneVariant(series.loc[series[label] == name, :],
                                                     indexes, name, num_bootstraps,
                                                     min_replicates, statistic,
                                                     confidence_interval)
                                 for name in namesToBootstrap]
    # Variants below the replicate cutoff return None; drop them
    allBootstrappedValues = filter(None, allBootstrappedValues)
    print "Done: {0:.2f} seconds".format(time.time() - start)
    print "{} variants passed minimum replicate cutoff of {}".format(
        len(allBootstrappedValues), min_replicates)

    ### Write to file ###
    with open(os.path.join(output_directory, args.output_prefix + ".CPdata"), 'w') as f:
        for variant in allBootstrappedValues:
            for line in variant:
                for i in line:
                    f.write(str(i) + '\t')
                f.write('\n')
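# Entry-point guard so this script can be run directly. The invocation below is
# illustrative only; the script file name and directory are assumptions.
#
#   python bootstrap_fluorescence.py -sd ./CPsignal -st median -ci 95 -n 4
if __name__ == '__main__':
    main()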
import argparse
import os
import sys

import cpfiletools

# The helpers parse_tile_input and add_data_to_tile_dict are defined elsewhere
# in this script.


def main():
    ################ Parse input parameters ################

    # set up command line argument parser
    parser = argparse.ArgumentParser(
        description='script for generating input files for image stack quantification')
    group = parser.add_argument_group('required arguments for processing data')
    group.add_argument('-id', '--input_directory', required=True,
                       help='directory that holds the image files of an array experiment')
    group = parser.add_argument_group('optional arguments for processing data')
    group.add_argument('-tl', '--tile_list', default="",
                       help='which tiles to use when generating input files (default is all)')
    group.add_argument('-od', '--output_directory', default="",
                       help='save output files here. default = input directory')
    group.add_argument('-op', '--output_prefix', default="",
                       help='optional output file prefix')
    group.add_argument('-bf', '--baseline_flag', default="",
                       help='flag denoting image files that contain baseline measurements')
    group.add_argument('-ef', '--experiment_flag', default="",
                       help='flag denoting image files that contain experimental measurements')

    # print help if no arguments provided
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit()

    # parse command line arguments
    args = parser.parse_args()
    if args.output_directory == "":
        args.output_directory = args.input_directory
    # os.path.join with an empty final component appends the platform-specific separator
    args.absPath = os.path.join(os.path.abspath(args.input_directory), '')
    # add underscore for better formatting
    if args.output_prefix != "":
        args.output_prefix = args.output_prefix + "_"

    ################ Make input files ################

    # Gather all image files in the input directory and extract all tiles
    allFiles = cpfiletools.find_files_in_directory(args.input_directory, ['tif'])
    allTiles = set()
    for filename in allFiles:
        allTiles.add(cpfiletools.get_tile_number_from_filename(filename))

    # Decide which tiles to use for making input files
    if args.tile_list == "":
        tilesToUse = set(allTiles)
    else:
        tilesToUse = set(parse_tile_input(args.tile_list)) & allTiles

    # Tile dictionary for storing file data later on
    tileDict = {}
    for tile in tilesToUse:
        tileDict[tile] = []

    # Make a list of files (filtered by tile) that will be used to create input files
    filteredFiles = []
    for filename in allFiles:
        if cpfiletools.get_tile_number_from_filename(filename) in tilesToUse \
                and (args.baseline_flag in filename or args.experiment_flag in filename):
            filteredFiles.append(filename)
            print "will use:\t{}".format(filename)

    # Make separate lists for differently flagged files
    baselineFiles = []
    expFiles = []
    for filename in filteredFiles:
        if args.baseline_flag != "" and args.baseline_flag in filename:
            baselineFiles.append(filename)
        if args.experiment_flag != "" and args.experiment_flag in filename:
            expFiles.append(filename)
    if len(baselineFiles) < 1 and len(expFiles) < 1:
        print "ERROR: no tiles selected!"
        sys.exit()

    # Add all baseline files to the tile dictionary
    if len(baselineFiles) > 0:
        add_data_to_tile_dict(tileDict, args, baselineFiles, args.baseline_flag, 0)

    # Add all experimental files to the tile dictionary
    if len(expFiles) > 0:
        # Assumes that the earliest experimental image timestamp (over all tiles)
        # is the pseudo-zero timestamp (THIS NEEDS A MORE ELEGANT SOLUTION)
        minTimeStamp = cpfiletools.parse_timestamp_from_filename(expFiles[0])
        for filename in expFiles:
            if cpfiletools.parse_timestamp_from_filename(filename) < minTimeStamp:
                minTimeStamp = cpfiletools.parse_timestamp_from_filename(filename)
        add_data_to_tile_dict(tileDict, args, expFiles, args.experiment_flag, minTimeStamp)

    # Time table for use in hacky analysis (20160201)
    timeTable = {}

    # Sort output, add sequence numbers to experimental file entries, and print all files
    for tile in sorted(tilesToUse):
        tileDict[tile].sort()
        count = 1
        timeTable[tile] = []
        # Fill in the time table for each tile
        for filedata in tileDict[tile]:
            timeTable[tile].append(filedata.timestamp)
            if args.experiment_flag in filedata.ID:
                filedata.ID = filedata.ID + "_" + str(count)
                count += 1
        filename = args.output_prefix + "tile" + tile + ".ipf"
        with open(os.path.join(args.output_directory, filename), 'w') as f:
            header = "{}".format("time")
            f.write(header + "\n")
            for filedata in tileDict[tile]:
                f.write("{}\n".format(filedata))
            f.write("\n")
        print "Successfully made file: {}".format(filename)

    # Print out the time table (20160201)
    with open("timeTable.txt", 'w') as f:
        tiles = sorted(timeTable.keys())
        for tile in tiles:
            f.write(tile)
            for t in timeTable[tile]:  # named 't' to avoid shadowing the time module
                f.write("\t" + str(t))
            f.write("\n")
    print "successfully made file: timeTable.txt"
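# Entry-point guard so this script can be run directly. The invocation below is
# illustrative only; the script file name, paths and flag strings are assumptions.
#
#   python make_quant_input_files.py -id ./images -bf baseline -ef exp
if __name__ == '__main__':
    main()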
import argparse
import os
import sys

import cpfiletools


def main():
    ################ Parse input parameters ################

    # set up command line argument parser
    parser = argparse.ArgumentParser(
        description='Script for splitting a directory of images into multiple directories')
    group = parser.add_argument_group('required arguments for processing data')
    group.add_argument('-id', '--image_directory', required=True,
                       help='directory that holds images to be split (.tif)')
    group = parser.add_argument_group('optional arguments for processing data')
    group.add_argument('-p', '--prefix', default="set",
                       help='prefix for new directories. default = set')
    group.add_argument('-od', '--output_directory', default='image_directory',
                       help='directory in which new directories will be made')
    group.add_argument('-a', '--action', default='l',
                       help='what to do with the images (m = move, l = symbolic link). Default is to link.')

    if not len(sys.argv) > 1:
        parser.print_help()
        sys.exit()

    # parse command line arguments
    args = parser.parse_args()
    if args.action != "m" and args.action != "l":
        print "Error: action must be either 'm' (move) or 'l' (link)!"
        sys.exit()

    # Gather the image files in the provided image directory
    print "Finding image files in directory {}...".format(args.image_directory)
    imageFiles = cpfiletools.find_files_in_directory(args.image_directory, ['tif', 'tiff'])
    if len(imageFiles) < 1:
        print "Error: no image files found in directory: " + args.image_directory
        sys.exit()

    # Make a dictionary of all the image files keyed by tile number
    imageDirectory = os.path.abspath(args.image_directory)
    imageDict = cpfiletools.make_tile_dict_multiple(imageFiles, imageDirectory)
    tileList = imageDict.keys()
    numImagesPerTile = len(imageDict[tileList[0]])

    # Now make new directories to hold the split images:
    if args.output_directory == 'image_directory':
        outputPath = args.image_directory
    else:
        outputPath = args.output_directory
    if not os.path.exists(outputPath):
        print "Error: directory {} does not exist!".format(outputPath)
        sys.exit()

    newDirList = []
    for n in range(numImagesPerTile):
        dirname = os.path.join(outputPath, args.prefix + "{:02}".format(n + 1))
        os.mkdir(dirname)
        newDirList.append(dirname)
        print "made directory: {}".format(dirname)

    # Now that the directories are made, move (or link) the images into them
    count = 0
    while count < numImagesPerTile:
        for tile in tileList:
            fullFileName = imageDict[tile].pop(0)
            prevPath, fileName = os.path.split(fullFileName)
            if args.action == "m":
                os.rename(fullFileName, os.path.join(newDirList[count], fileName))
            if args.action == "l":
                os.symlink(fullFileName, os.path.join(newDirList[count], fileName))
        count += 1
    print "Files split successfully"
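# Entry-point guard so this script can be run directly. The invocation below is
# illustrative only; the script file name and path are assumptions.
#
#   python split_image_directory.py -id ./images -p set -a l
if __name__ == '__main__':
    main()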
import argparse
import os
import sys

import cpfiletools

# The helper getTimestamp (parses a timestamp out of a filename) is defined
# elsewhere in this script.


def main():
    ################ Parse input parameters ################

    # set up command line argument parser
    parser = argparse.ArgumentParser(
        description='Script for generating phony CPfluors for unquantified images')
    group = parser.add_argument_group('required arguments:')
    group.add_argument('-id', '--image_dir', required=True,
                       help='directory that holds all the images on which quantification was attempted (successful or not)')
    group.add_argument('-fd', '--fluor_dir', required=True,
                       help='directory containing the CPfluor files that were generated')
    group.add_argument('-sd', '--seq_dir', required=True,
                       help='directory that contains the CPseq files for this experiment')
    group = parser.add_argument_group('optional arguments for processing data')
    group.add_argument('-od', '--output_dir', default="fluor_dir",
                       help='where the output files will be saved. Default is the fluor_dir.')
    group.add_argument('-fl', '--flag', default="phony",
                       help='optional flag to be inserted at the front of phony CPfluor file names.')

    if not len(sys.argv) > 1:
        parser.print_help()
        sys.exit()

    # parse command line arguments
    args = parser.parse_args()

    # check that the output directory is valid:
    if args.output_dir == "fluor_dir":
        output_dir = args.fluor_dir
    else:
        if os.path.isdir(args.output_dir):
            output_dir = args.output_dir
        else:
            print "Error: output directory " + args.output_dir + " is not a directory. Exiting..."
            sys.exit()

    # import fluor files
    print "Finding fluor files in directory " + args.fluor_dir + " ..."
    fluorFilenames = cpfiletools.find_files_in_directory(args.fluor_dir, ['.CPfluor'])
    if len(fluorFilenames) < 1:
        print "Error: No fluor files found in directory: " + args.fluor_dir
        sys.exit()

    # import image files
    print "Finding image files in directory " + args.image_dir + " ..."
    imageFilenames = cpfiletools.find_files_in_directory(args.image_dir, ['.tif', '.tiff'])
    if len(imageFilenames) < 1:
        print "Error: No image files found in directory: " + args.image_dir
        sys.exit()

    # find the relevant CPseq files:
    print "Finding CPseq files in directory " + args.seq_dir + " ..."
    seqFilenames = cpfiletools.find_files_in_directory(args.seq_dir, ['.CPseq'])
    if len(seqFilenames) < 1:
        print "Error: No CPseq files found in directory: " + args.seq_dir
        sys.exit()

    # Make a set of timestamps from the fluor files.
    # This script assumes that no two images will have the same timestamp.
    fluorTimestamps = set()
    for filename in fluorFilenames:
        fluorTimestamps.add(getTimestamp(filename))

    # Now identify which images do not have corresponding CPfluor files:
    lonelyImageFiles = []
    for filename in imageFilenames:
        timestamp = getTimestamp(filename)
        if timestamp not in fluorTimestamps:
            lonelyImageFiles.append(filename)
    if len(lonelyImageFiles) < 1:
        print "No need for phony files. Exiting..."
        sys.exit()

    # Make a CPseq dict keyed by tile number:
    seq_dict = cpfiletools.make_tile_dict(seqFilenames, args.seq_dir)

    # Now make the new phony files
    for filename in lonelyImageFiles:
        root, ext = os.path.splitext(filename)
        # Use the extension-free root of the filename; str.strip(ext) would wrongly
        # remove any of the extension's characters from both ends of the name.
        newFluorName = args.flag + root + ".CPfluor"
        # find the CPseq file relevant to this image:
        tile = cpfiletools.get_tile_number_from_filename(filename)
        cpseq = seq_dict[tile]
        with open(os.path.join(output_dir, newFluorName), 'w') as outfile, \
                open(cpseq, 'r') as infile:
            # Every cluster gets an all-zero quantification entry
            for line in infile:
                cluster_ID = line.split()[0]
                outfile.write(cluster_ID + ':0:0.000000:0.000000:0.000000:0.000000\n')
        print "Generated phony file: " + newFluorName
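# Entry-point guard so this script can be run directly. The invocation below is
# illustrative only; the script file name and paths are assumptions.
#
#   python make_phony_cpfluors.py -id ./images -fd ./CPfluor -sd ./CPseq
if __name__ == '__main__':
    main()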
import argparse
import os
import sys
import time

import pandas as pd
from joblib import Parallel, delayed

import cpfiletools

# The helpers get_variant_dict, annotate_clusters and longestSubstring are defined
# elsewhere in this script.


def main():
    start = time.time()

    ################ Parse input parameters ################

    # set up command line argument parser
    parser = argparse.ArgumentParser(
        description='Script for generating a CPannot file based on previously designed variants')
    group = parser.add_argument_group('required arguments')
    group.add_argument('-sd', '--seq_directory', required=True,
                       help='directory that holds the CPseq files that need variant IDs')
    group.add_argument('-vt', '--variant_table', required=True,
                       help='a tab-delimited table containing the variant information '
                            '(first column sequence, second column variant ID)')
    group = parser.add_argument_group('optional arguments for processing data')
    group.add_argument('-od', '--output_directory',
                       help='output directory for the CPannot file (default is the current directory)')
    group.add_argument('-n', '--num_cores', type=int, default=19,
                       help='number of cores to use')

    if not len(sys.argv) > 1:
        parser.print_help()
        sys.exit()

    # parse command line arguments
    args = parser.parse_args()
    numCores = args.num_cores

    # If no output directory given, use the current directory
    if not args.output_directory:
        args.output_directory = "./"
    output_directory = args.output_directory
    if not os.path.isdir(output_directory):
        print "Error: invalid output directory selection. Exiting..."
        sys.exit()

    # Construct variant dict:
    print "Reading in variant dict: {}".format(args.variant_table)
    variant_dict = get_variant_dict(args.variant_table)

    # Find CPseqs in seq_directory:
    print "Finding CPseq files in directory: {}".format(args.seq_directory)
    CPseqFiles = cpfiletools.find_files_in_directory(args.seq_directory, ['.CPseq'])

    if numCores > 1:
        print "Annotating clusters in parallel on {} cores...".format(numCores)
        annotated_cluster_lists = (Parallel(n_jobs=numCores, verbose=10)
                                   (delayed(annotate_clusters)(os.path.join(args.seq_directory, CPseq),
                                                               variant_dict)
                                    for CPseq in CPseqFiles))
    else:
        print "Annotating clusters on a single core"
        annotated_cluster_lists = [annotate_clusters(os.path.join(args.seq_directory, CPseq),
                                                     variant_dict)
                                   for CPseq in CPseqFiles]

    # Combine the cluster lists:
    print "Formatting and saving CPannot file..."
    all_annotations = []
    map(all_annotations.extend, annotated_cluster_lists)
    CPannot_df = pd.DataFrame(all_annotations)
    CPannot_df.columns = ['cluster_ID', 'variant_ID']

    # Save the CPannot file as a pickle
    CPannotFilename = "_".join(longestSubstring(CPseqFiles).split("_")[:-1]) + ".CPannot.pkl"
    print "Creating CPannot.pkl file: {}...".format(CPannotFilename)
    CPannot_df = CPannot_df.set_index("cluster_ID")
    CPannot_df.to_pickle(os.path.join(output_directory, CPannotFilename))
    print "Done. {} minutes".format(round((time.time() - start) / 60, 2))
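# Entry-point guard so this script can be run directly. The invocation below is
# illustrative only; the script file name and paths are assumptions.
#
#   python make_cpannot_from_variant_table.py -sd ./CPseq/ -vt variants.txt -n 4
if __name__ == '__main__':
    main()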
import argparse
import os
import sys
import time

import pandas as pd
from joblib import Parallel, delayed

import cpfiletools
import fileio

# The helpers assign_names and fetch_sequences are defined elsewhere in this script.


def main():
    ################ Parse input parameters ################

    # set up command line argument parser
    parser = argparse.ArgumentParser(
        description='script for relating variant_IDs from a CPannot file to sequence')
    group = parser.add_argument_group('required arguments')
    group.add_argument('-a', '--annot_file', required=True,
                       help='a .CPannot.pkl file')
    group.add_argument('-sd', '--seq_dir', required=True,
                       help='a directory of .CPseq files')
    group = parser.add_argument_group('optional arguments for running script')
    group.add_argument('-l', '--length', default="short",
                       help='translate in "long" or "short" format: long will include every cluster '
                            'in the CPannot file, short will only show the variant_ID and one sequence '
                            'match. Default = short')
    group.add_argument('-sc', '--seq_cols', default="3",
                       help='which sequence columns to output in the CPtranslate file. May use multiple '
                            'columns for long format (separate by commas). Key: 3 = r1, 5 = r2, 7 = i7, 9 = i5')
    group.add_argument('-od', '--output_dir', default=os.getcwd(),
                       help='output directory. Default is the current directory')
    group.add_argument('-n', '--num_cores', default=1,
                       help='how many cores to use for parallel processing')

    if not len(sys.argv) > 1:
        parser.print_help()
        sys.exit()

    ##### parse command line arguments #####
    args = parser.parse_args()
    annot_file = args.annot_file
    seq_dir = args.seq_dir
    length = args.length
    if length != "short" and length != "long":
        print "Error: length must be either 'short' or 'long'. Exiting..."
        sys.exit()
    seq_cols = [0] + [int(n) - 1 for n in args.seq_cols.split(',')]
    seq_col_names = assign_names(seq_cols)
    output_dir = args.output_dir
    if not os.path.isdir(output_dir):
        print "Error: output directory is invalid. Exiting..."
        sys.exit()
    if output_dir[-1] != '/':
        output_dir = output_dir + '/'
    num_cores = int(args.num_cores)
    ########################################

    # Read in the CPannot file
    print "Reading in CPannot file..."
    start = time.time()
    annot_df = fileio.loadFile(annot_file)
    print "file loaded: {0:.2f} seconds\n".format(time.time() - start)

    # Read in the CPseq files as one concatenated data frame
    print "Reading in CPseq files..."
    start = time.time()
    seq_files = cpfiletools.find_files_in_directory(seq_dir, ['.CPseq'])
    print "found CPseq files: "
    cpfiletools.printList(seq_files)
    seq_df = pd.DataFrame()
    for seq_file in seq_files:
        new_df = pd.read_csv(os.path.join(seq_dir, seq_file), sep='\t',
                             index_col=0, usecols=seq_cols, header=None)
        seq_df = pd.concat([seq_df, new_df])
    seq_df.columns = seq_col_names
    print str(len(seq_files)) + " files loaded: {0:.2f} seconds\n".format(time.time() - start)

    # Merge the data frames
    print "Merging data frames..."
    start = time.time()
    merged_df = annot_df.merge(seq_df, how='left', left_index=True, right_index=True)
    print "Merged: {0:.2f} seconds\n".format(time.time() - start)

    # Save long format CPtranslate if requested
    if length == "long":
        print "Saving long format CPtranslate.pkl..."
        start = time.time()
        # Note: str.rstrip strips a character set, not a suffix, so use replace instead
        filename = os.path.basename(annot_file).replace('.CPannot.pkl', '') + ".long.CPtranslate.pkl"
        print "filename = " + filename
        merged_df.to_pickle(output_dir + filename)
        print "Saved: {0:.2f} seconds\n".format(time.time() - start)

    # Create the short format CPtranslate:
    if length == "short":
        print "Generating short format CPtranslate..."
        start = time.time()
        # Group clusters by variant_ID, then pull one sequence match per variant
        grouped_variants = merged_df.groupby('variant_ID')
        all_variants = (Parallel(n_jobs=num_cores, verbose=10)
                        (delayed(fetch_sequences)(name, group, seq_col_names)
                         for name, group in grouped_variants))
        short_df = pd.DataFrame(all_variants)
        short_df.columns = ['variant_ID', 'count'] + seq_col_names
        print "short format generated: {0:.2f} seconds\n".format(time.time() - start)
        print short_df.head()

        print "Saving short format CPtranslate.pkl..."
        start = time.time()
        filename = os.path.basename(annot_file).replace('.CPannot.pkl', '') + ".short.CPtranslate.pkl"
        short_df.to_pickle(output_dir + filename)
        print "Saved: {0:.2f} seconds\n".format(time.time() - start)
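# Entry-point guard so this script can be run directly. The invocation below is
# illustrative only; the script file name and paths are assumptions.
#
#   python translate_cpannot.py -a exp.CPannot.pkl -sd ./CPseq/ -l short -sc 3
if __name__ == '__main__':
    main()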
import argparse
import os
import random
import re
import string
import subprocess
import sys
import time

import numpy as np
import pandas as pd

import cpfiletools

# The helper longestSubstring is defined elsewhere in this script.


def main():
    ################ Parse input parameters ################

    # set up command line argument parser
    parser = argparse.ArgumentParser(
        description='Script for assigning unique IDs to variants in CPseq files')
    group = parser.add_argument_group('required arguments')
    group.add_argument('-sd', '--seq_directory', required=True,
                       help='directory that holds the CPseq files that need variant IDs')
    group.add_argument('-sc', '--seq_column', required=True,
                       help='which column in the CPseq file you want to use for assigning variants')
    group = parser.add_argument_group('optional arguments for processing data')
    group.add_argument('-pi', '--previous_ID_file', default="",
                       help='an ID file previously created for variants expected in the new CPseq files')
    group.add_argument('-fk', '--filters_to_use', default="",
                       help='which filters should be kept. Separate by commas: '
                            'filter1,filter2,filter3,etc. If you want to use clusters without a filter, '
                            'include "blank" (filter1,filter2,blank). Default is to use all filters.')
    group.add_argument('-st', '--seq_start', default=0,
                       help='start position within sequence for matching. Will use beginning of sequence if none specified.')
    group.add_argument('-ed', '--seq_end', default=0,
                       help='end position within sequence for matching. Will use end of sequence if none specified.')
    group.add_argument('-lb', '--label', default="ID_ed",
                       help='label attached to output files. Default is "ID_ed"')
    group.add_argument('-od', '--output_directory', default="",
                       help='output directory for series files with labeled variants (default will use seq_directory)')
    group.add_argument('-if', '--ID_file', default="ID_file.txt",
                       help='file name for the list of IDs and corresponding sequences. Default is "ID_file.txt"')

    if not len(sys.argv) > 1:
        parser.print_help()
        sys.exit()

    # parse command line arguments
    args = parser.parse_args()

    # If no output directory given, use input directory
    if args.output_directory == "":
        args.output_directory = args.seq_directory
    output_directory = args.output_directory
    if not os.path.isdir(output_directory):
        print "Error: invalid output directory selection. Exiting..."
        sys.exit()

    # Create a set of filters to be kept:
    filters = set(args.filters_to_use.split(','))
    if "blank" in filters:
        filters.remove("blank")
        # The pandas dataframes are coerced to contain 'no_filter' instead of NaNs
        filters.add("no_filter")
    # If no filters were given (the default empty string), keep everything.
    # (Popping an arbitrary element here would silently drop a user-supplied filter.)
    if "" in filters:
        filters.remove("")
        filters.add("all")
    print "Using filters: {}".format(filters)

    # This script will run through each of the provided CPseq files sequentially in order to
    # ensure that each variant gets assigned only one variant ID.
    CPseqFiles = cpfiletools.find_files_in_directory(args.seq_directory, ['.CPseq'])

    # Count total clusters so we can pre-generate enough unique random IDs
    numLines = 0
    for seqFile in CPseqFiles:
        numLines += int(subprocess.check_output(
            ("wc -l {} | ".format(os.path.join(args.seq_directory, seqFile))
             + " awk '{print $1}'"), shell=True).strip())

    start = time.time()
    randID_set = set()
    print "Generating random IDs..."
    while len(randID_set) < numLines:
        randID = ''.join([random.choice(string.ascii_uppercase + string.digits)
                          for n in range(8)])  # 36^8 ~ 2.8e12 possible IDs
        randID_set.add(randID)
    print "ID generation: {0:.2f} seconds".format(time.time() - start)

    # This dictionary will contain all the variants assigned, keyed by sequence match.
    # The entries in variantDict are three-element lists: the first is the ID, the second is
    # the filter associated with that variant (if any), and the third is the number of times
    # that variant has been seen.
    variantDict = {}

    # If a previous ID file was provided, pre-populate the variantDict.
    # Note: it is up to the user to ensure that seq_column, seq_start and seq_end match those
    # used to create the previous ID file!
    if args.previous_ID_file != "":
        with open(args.previous_ID_file, 'r') as f:
            for line in f:
                seq, ID, filtr, n = line.split()
                variantDict[seq] = [ID, filtr, int(n)]

    fileNum = 1
    CPannot_df = pd.DataFrame()
    # Loop through each CPseq file to assign variants:
    for seqFile in CPseqFiles:
        print "Working on file: {}...{} of {}".format(seqFile, fileNum, len(CPseqFiles))

        # Time each loop for now:
        start = time.time()

        # Read in CPseq file as pandas df
        seq_df = pd.read_table(os.path.join(args.seq_directory, seqFile), header=None)
        seq_df = seq_df.fillna('no_filter')
        print "length pre-filter " + str(len(seq_df))
        # filter df by the filters to keep (if any)
        if "all" not in filters:
            seq_df = seq_df[seq_df.iloc[:, 1].isin(filters)]
        print "length post-filter " + str(len(seq_df))

        # set sequence selection parameters:
        seq_col = int(args.seq_column) - 1  # Allow for intuitive column selection (i.e. start at 1)
        if seq_col < 0 or seq_col >= len(seq_df.columns):
            print "Error: invalid seq column selected. Out of range. Must be within {} and {}".format(
                1, len(seq_df.columns))
            sys.exit()

        # Test to ensure the provided column contains sequence data:
        test_seq = seq_df.iloc[0, seq_col]
        if not re.match("^[a-zA-Z]+$", test_seq):
            print "Error: provided column does not contain sequence data, e.g. {}".format(test_seq)
            sys.exit()

        # Test to ensure the start and end sequence positions are valid:
        seq_length = len(seq_df.iloc[0, seq_col])
        strt = int(args.seq_start)
        if strt < 0 or strt > seq_length - 1:
            print "Error: invalid start position selected. Must be non-negative and less than seq length"
            sys.exit()
        end = int(args.seq_end)
        # If no end position provided, use the entire sequence length
        # (handled before validation so the default of 0 is not rejected when strt > 0)
        if end == 0:
            end = seq_length
        if end < strt or end > seq_length:
            print "Error: invalid end position selected. Must be >= start position and <= seq length"
            sys.exit()

        # Fill in the list of IDs to be used as a new column
        clusterIDs = []
        IDs = []
        total_rows = len(seq_df.index)
        # Iterate through the entire CPseq file:
        for row in range(total_rows):
            seq = seq_df.iloc[row, seq_col][strt:end]
            # If the sub-sequence has already been seen, assign the existing ID
            if seq in variantDict:
                IDs.append(variantDict[seq][0])
                variantDict[seq][2] += 1  # Count how many times a variant has been seen
            else:
                newID = randID_set.pop()
                IDs.append(newID)
                variantDict[seq] = [newID, seq_df.iloc[row, 1], 1]
            clusterIDs.append(seq_df.iloc[row, 0])
            # Curtis' cool progress bar:
            cpfiletools.update_progress(row, total_rows)

        # Start making the CPannot file:
        if fileNum == 1:
            CPannot_df = pd.DataFrame({"cluster_ID": clusterIDs, "variant_ID": IDs})
        else:
            CPannot_df = pd.concat([CPannot_df,
                                    pd.DataFrame({"cluster_ID": clusterIDs, "variant_ID": IDs})])
        print "finished file: {0:.2f} seconds".format(time.time() - start)
        fileNum += 1

    # Save the CPannot file as a pickle
    CPannotFilename = "_".join(longestSubstring(CPseqFiles).split("_")[:-1]) + ".CPannot.pkl"
    print "Creating CPannot.pkl file: {}...".format(CPannotFilename)
    CPannot_df = CPannot_df.set_index("cluster_ID")
    CPannot_df.to_pickle(os.path.join(args.output_directory, CPannotFilename))

    # Now write a file containing the key for all the assigned IDs:
    print "Creating ID file: {}...".format(args.ID_file)
    variant_df = pd.DataFrame(variantDict).transpose()
    seqs = list(variant_df.index)
    variant_df.insert(loc=0, column="sequence", value=seqs)
    # Sort by number of times each variant was seen, then by sequence
    sorted_df = variant_df.sort([2, "sequence"], ascending=[False, True])
    np.savetxt(os.path.join(args.output_directory, args.ID_file),
               sorted_df.values, fmt='%s', delimiter='\t')
    print "Done"
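# Entry-point guard so this script can be run directly. The invocation below is
# illustrative only; the script file name, paths and filter names are assumptions.
#
#   python assign_variant_ids_cpseq.py -sd ./CPseq -sc 3 -fk filter1,blank
if __name__ == '__main__':
    main()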
import argparse
import os
import sys

from joblib import Parallel, delayed

import cpfiletools

# The helper printList is defined elsewhere in this script.


def main():
    ################ Parse input parameters ################

    # set up command line argument parser
    parser = argparse.ArgumentParser(
        description='Script for generating CPseries files from CPseq and CPfluor files')
    group = parser.add_argument_group('required arguments for processing data')
    group.add_argument('-fs', '--filtered_CPseqs', required=True,
                       help='directory that holds the filtered sequence data (CPseq)')
    group.add_argument('-bs', '--bsCPfluors', required=True,
                       help='directory containing binding series CPfluor files')
    group = parser.add_argument_group('optional arguments for processing data')
    group.add_argument('-od', '--output_dir', default="CPseries",
                       help='save output files here. default = ./CPseries')
    group.add_argument('-ar', '--allRNA', default='',
                       help='directory containing allRNA CPfluor files')
    group = parser.add_argument_group('other settings')
    group.add_argument('-n', '--num_cores', type=int, default=20,
                       help='maximum number of cores to use. default=20')

    if not len(sys.argv) > 1:
        parser.print_help()
        sys.exit()

    # parse command line arguments
    args = parser.parse_args()
    numCores = args.num_cores

    # import CPseq filtered files split by tile
    print "Finding CPseq files in directory {}...".format(args.filtered_CPseqs)

    # Gather all of the CPseq files in the 'filtered_CPseqs' directory
    CPseqFilenames = cpfiletools.find_files_in_directory(args.filtered_CPseqs, ['CPseq'])
    if len(CPseqFilenames) < 1:
        print "Error: No CPseq files found in directory: " + args.filtered_CPseqs
        sys.exit()
    print "Found CPseq files: "
    printList(CPseqFilenames)

    # Create a dictionary of the CPseq files keyed by tile
    CPseqDict = cpfiletools.make_tile_dict(CPseqFilenames, args.filtered_CPseqs)
    tileList = CPseqDict.keys()

    # Gather all of the CPfluor files for allRNA images, if provided
    allRNA_Dict = {}
    if args.allRNA != '':
        print "Finding allRNA CPfluor files in directory {}...".format(args.allRNA)
        allRNAfilenames = cpfiletools.find_files_in_directory(args.allRNA, ['CPfluor'])
        if len(allRNAfilenames) < 1:
            print "Error: no CPfluor files found in directory: " + args.allRNA
            sys.exit()
        print "Found allRNA files: "
        printList(allRNAfilenames)
        allRNA_Dict = cpfiletools.make_tile_dict(allRNAfilenames, args.allRNA)
    else:
        for tile in tileList:
            allRNA_Dict[tile] = ''

    # Gather all of the CPfluor files for creating the cluster binding series
    print "Finding binding series CPfluor files in directory {}...".format(args.bsCPfluors)
    bindingSeriesList = cpfiletools.find_files_in_directory(args.bsCPfluors, ['CPfluor'])
    print "Found CPfluor files: "
    printList(bindingSeriesList)
    bindingSeriesDict = cpfiletools.make_tile_dict_multiple(bindingSeriesList, args.bsCPfluors)

    # Make sure the output directory is ready:
    outputDirectory = args.output_dir
    if os.path.isdir(outputDirectory):
        print "Output directory {} already exists".format(outputDirectory)
    else:
        outputDirectory = os.path.join(os.getcwd(), outputDirectory)
        print "Making output directory: {}".format(outputDirectory)
        os.makedirs(outputDirectory)

    # Name the output CPseries file for each tile
    CPseriesDict = {}
    for tile, fileName in CPseqDict.items():
        path, baseFile = os.path.split(fileName)
        CPseriesDict[tile] = os.path.join(outputDirectory, baseFile.split('.')[0] + '.CPseries')

    # Make CPseries files in parallel:
    print "Making CPseries files..."
    (Parallel(n_jobs=numCores, verbose=10)
     (delayed(cpfiletools.generate_CPseries_files)
      (CPseqDict[tile], allRNA_Dict[tile], bindingSeriesDict[tile], CPseriesDict[tile], tile)
      for i, tile in enumerate(tileList)))
    print "Done"
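# Entry-point guard so this script can be run directly. The invocation below is
# illustrative only; the script file name and paths are assumptions.
#
#   python make_cpseries.py -fs ./filtered_CPseq -bs ./binding_CPfluors -n 20
if __name__ == '__main__':
    main()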