Example 1
def main():
    ################ Parse input parameters ################

    #set up command line argument parser
    parser = argparse.ArgumentParser(
        description=
        'Script for assigning unique IDs to variants in CPseries files')
    group = parser.add_argument_group('required arguments')
    group.add_argument(
        '-sd',
        '--series_directory',
        required=True,
        help='directory that holds the CPseries files that need variant IDs')
    group.add_argument(
        '-sc',
        '--seq_column',
        required=True,
        help=
        'which column in the CPseries file you want to use for assigning variants'
    )

    group = parser.add_argument_group('optional arguments for processing data')
    group.add_argument(
        '-pi',
        '--previous_ID_file',
        default="",
        help=
        'An ID file previously created for variants expected in the new CPseries files'
    )
    group.add_argument(
        '-st',
        '--seq_start',
        default=0,
        help=
        'start position within sequence for matching. Will use beginning of sequence if none specified.'
    )
    group.add_argument(
        '-ed',
        '--seq_end',
        default=0,
        help=
        'end position within sequence for matching. Will use end of sequence if none specified.'
    )
    group.add_argument(
        '-lb',
        '--label',
        default="ID_ed",
        help='label attached to output files. Default is "ID_ed"')
    group.add_argument(
        '-od',
        '--output_directory',
        default="",
        help=
        'output directory for series files with labeled variants (default will use series_directory)'
    )
    group.add_argument(
        '-if',
        '--ID_file',
        default="ID_file.txt",
        help=
        'file name for the list of IDs and corresponding sequences. Default is "ID_file.txt"'
    )

    if not len(sys.argv) > 1:
        parser.print_help()
        sys.exit()

    #parse command line arguments
    args = parser.parse_args()

    # If no output directory given, use input directory
    if args.output_directory == "":
        args.output_directory = args.series_directory

    # This script will run through each of the provided CPseries files sequentially in order to
    # ensure that each variant gets assigned only one variant ID.

    CPseriesFiles = cpfiletools.find_files_in_directory(
        args.series_directory, ['.CPseries'])

    numLines = 0
    for seriesFile in CPseriesFiles:
        numLines += int(
            subprocess.check_output(("wc -l {} | ".format(
                os.path.join(args.series_directory, seriesFile)) +
                                     " awk \'{print $1}\'"),
                                    shell=True).strip())
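    # (Shelling out to wc -l is presumably a speed choice for large files; a
    # pure-Python equivalent would be: numLines += sum(1 for line in open(path)).)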

    start = time.time()
    randID_set = set()
    print "Generating random IDs..."
    while len(randID_set) < numLines:
        randID = ''.join([
            random.choice(string.ascii_uppercase + string.digits)
            for n in range(8)
        ])  # 36^8 ~ 2.8e12 possible IDs
        randID_set.add(randID)
    print "ID generation: {0:.2f} seconds".format(time.time() - start)

    # This dictionary will contain all the variants assigned, keyed by sequence match
    # The entries in variant dict will be three-element lists, the first is the ID, the second is the filter
    # associated with that variant (if any), and the third is the number of times that variant has been seen
    variantDict = {}

    # If a previous ID file was provided, it will pre-populate the variantDict.
    # Note: it is up to the user to ensure that seq_column, seq_start and seq_end match those used to
    # create the previous ID file!
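    # A hypothetical example line (whitespace-delimited: sequence, ID, filter, count):
    #   ACGTACGTACGTACGT  7K2M9QXZ  anyRNA  42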
    if args.previous_ID_file != "":
        with open(args.previous_ID_file, 'r') as f:
            for line in f:
                seq, ID, filtr, n = line.split()
                variantDict[seq] = [ID, filtr, int(n)]

    fileNum = 1

    # Loop through each CPseries file to assign variants:
    for seriesFile in CPseriesFiles:
        print "Working on file: {}...{} of {}".format(seriesFile, fileNum,
                                                      len(CPseriesFiles))
        labeled_filename = os.path.join(
            args.output_directory, ".".join([
                '_'.join([os.path.splitext(seriesFile)[0], args.label]),
                'CPseries'
            ]))
        # Time each loop for now:
        start = time.time()
        # Read in CPseries file as pandas df
        series_df = pd.read_table(os.path.join(args.series_directory,
                                               seriesFile),
                                  header=None)

        # set sequence selection parameters:
        seq_col = int(
            args.seq_column
        ) - 1  # Allow for intuitive column selection (i.e. start at 1)
        if seq_col < 0 or seq_col >= len(series_df.columns):
            print "Error: invalid seq column selected. Out of range. Must be within {} and {}".format(
                1, len(series_df.columns))
            sys.exit()

        # Test to ensure provided column contains sequence data:
        test_seq = series_df.iloc[0, seq_col]
        if not re.match("^[a-zA-Z]+$", test_seq):
            print "Error: provided column does not contain sequence data, e.g. {}".format(
                test_seq)
            sys.exit()

        # Test to ensure start and end sequence positions are valid:
        seq_length = len(series_df.iloc[0, seq_col])
        strt = int(args.seq_start)
        if strt < 0 or strt > seq_length - 1:
            print "Error: invalid start position selected. Must be positive and less than seq length"
            sys.exit()
        end = int(args.seq_end)
        # If no end position provided, use the entire sequence length
        if end == 0:
            end = seq_length
        if end < strt or end > seq_length:
            print "Error: invalid end position selected. Must be greater than start position and <= seq length"
            sys.exit()

        # Fill in list of IDs to be used as new column
        IDs = []
        total_rows = len(series_df.index)

        # Iterate through entire CPseries file:
        for row in range(total_rows):
            seq = series_df.iloc[row, seq_col][strt:end]
            # If sub-sequence has already been seen, assign existing ID
            if seq in variantDict:
                IDs.append(variantDict[seq][0])
                variantDict[seq][
                    2] += 1  # Count how many times a variant has been seen
            else:
                newID = randID_set.pop()
                IDs.append(newID)
                variantDict[seq] = [newID, series_df.iloc[row, 1], 1]
            # Curtis' cool progress bar:
            cpfiletools.update_progress(row, total_rows)

        # Add in new ID column: (currently puts it next to the filter column)
        series_df.insert(loc=2, column="IDs", value=IDs)
        np.savetxt(labeled_filename,
                   series_df.values,
                   fmt='%s',
                   delimiter='\t')
        print "finished file: {0:.2f} seconds".format(time.time() - start)
        fileNum += 1

    # Now write a file containing the key for all the assigned IDs:
    print "Creating ID file: {}...".format(args.ID_file)
    variant_df = pd.DataFrame(variantDict).transpose()
    seqs = list(variant_df.index)
    variant_df.insert(loc=0, column="sequence", value=seqs)
    sorted_df = variant_df.sort_values(by=[2, "sequence"], ascending=[
        False, True
    ])  # Sort by times each variant was seen, then by sequence
    np.savetxt(os.path.join(args.output_directory, args.ID_file),
               sorted_df.values,
               fmt='%s',
               delimiter='\t')
    print "Done"
Example 2
def main():
	################ Parse input parameters ################

	#set up command line argument parser
	parser = argparse.ArgumentParser(description='Script for bootstrapping fluorescence values from CPsignal files')
	group = parser.add_argument_group('required arguments for processing data')
	group.add_argument('-sd', '--CPsignal_dir', required=True,
	                    help='directory that holds the CPsignal files you want to get data from')

	group = parser.add_argument_group('optional arguments for processing data')
	group.add_argument('-bt','--bootstrap_type', default="v",
	                    help='How to subset data for bootstrapping: f = by filter, v = by variant. Default = "v"')
	group.add_argument('-fs','--filter_set', default="all",
	                    help='which filters you want to bootstrap. Default = "all"')
	group.add_argument('-vs','--variant_set', default="all",
	                    help='which variants you want to bootstrap. Default = "all"')
	group.add_argument('-st','--statistic', default='median',
	                    help='statistic you want to bootstrap. Default = "median", Options: "median","mean"')
	group.add_argument('-nb','--num_bootstraps', default=1000,
	                    help='number of times to bootstrap. Default = 1000')
	group.add_argument('-mr','--min_replicates', default=10,
	                    help='minimum number of replicates a variant must have for bootstrapping. Default = 10')
	group.add_argument('-ci','--confidence_interval', default='95',
	                    help='percent confidence interval to provide on bootstrapped statistic. Default = 95')
	group.add_argument('-od','--output_dir', default="CPsignal_dir",
	                    help='save output files to here. Default is provided CPsignal directory')
	group.add_argument('-op','--output_prefix', default="bootstrap_fluorescence",
	                    help='output file prefix. Default = "bootstrap_fluorescence"')
	group = parser.add_argument_group('other settings')
	group.add_argument('-n','--num_cores', type=int, default=1,
	                    help='maximum number of cores to use. default=1')
	
	if not len(sys.argv) > 1:
	    parser.print_help()
	    sys.exit()


	#parse command line arguments and check for problems
	args = parser.parse_args()
	numCores = int(args.num_cores)

	signal_files = cpfiletools.find_files_in_directory(args.CPsignal_dir, ['.CPseries'])
	bootstrap_type = args.bootstrap_type
	if bootstrap_type != 'f' and bootstrap_type != 'v':
		print >> sys.stderr, "Error: bootstrap type invalid (must be either 'f' or 'v'). Exiting..."
		sys.exit()
	filter_set = str.split(args.filter_set, ',')
	variant_set = str.split(args.variant_set, ',')
	statistic = args.statistic
	if statistic != 'median' and statistic != 'mean':
		print >> sys.stderr, "Error: statistic choice invalid. Exiting..."
		sys.exit()
	num_bootstraps = int(args.num_bootstraps)
	min_replicates = int(args.min_replicates)
	if 0 < int(args.confidence_interval) < 100:
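		# a central CI percentage becomes percentile bounds, e.g. 95 -> [2.5, 97.5]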
		confidence_interval = [(100-float(args.confidence_interval))/2, 
								100-((100-float(args.confidence_interval))/2)]
	else:
		print >> sys.stderr, "Error: confidence interval must be between 0 and 100. Exiting..."
		sys.exit()
	
	if args.output_dir == "CPsignal_dir":
		output_directory = args.CPsignal_dir
	else:
		output_directory = args.output_dir
	if not os.path.isdir(output_directory):
		print >> sys.stderr, "Error: output directory is not a valid directory. Exiting..."
		sys.exit()


	# Read in the CPseries files:
	print "Reading in data and subsetting if necessary..."
	start = time.time()
	series = loadAndConcatAllTiles(signal_files, args.CPsignal_dir)


	# Subset data of interest:
	# (If you don't reset the index here, pandas gives you a hard time concatenating the two data 
	# frames in the next step)
	series = selectData(filter_set, variant_set, filterCol, variantIDCol, series).reset_index(drop=True)
	print "\nStructuring data for bootstrapping..."

	### Restructure data frame such that fluorescence values are in their own columns
	all_fluor_series = []
	indexes = range(len(series.iloc[0, fluorSeriesCol].split(',')))

	# Pull out the fluorescence series and put into a data frame
	for i in xrange(len(series)):
		fluorescence_series = np.array([float(j) for j in series.iloc[i, fluorSeriesCol].split(',')])
		# Take the time to label unquantified clusters now, since it allows for fast removal later
		if all(np.isnan(fluorescence_series)):
			fluorescence_series = np.append(fluorescence_series, 0)
		else:
			fluorescence_series = np.append(fluorescence_series, 1)
		all_fluor_series.append(fluorescence_series)
	fluor_data_df = pd.DataFrame(all_fluor_series)
	# Quantified clusters get a '1'
	fluor_data_df.columns = indexes + ['Quantified']

	# separate out the ID columns from the working series and give them names
	id_cols = series.iloc[:,[clusterIdCol, filterCol, variantIDCol]]
	id_cols.columns = ["clusterID", "filterID", "variantID"]
	# Create the new working series
	frames = [id_cols, fluor_data_df]
	series = pd.concat(frames, axis=1)
	print "Done: {0:.2f} seconds".format(time.time() - start)


	# Remove all clusters that have no associated values
	print "\nRemoving unquantified clusters..."
	start = time.time()
	count = len(series.index)
	series = series.loc[series["Quantified"] == 1]
	series.drop("Quantified", axis=1, inplace=True)
	count = count - len(series.index)
	print "Removed "+str(count)+" unquantified clusters: {0:.2f} seconds".format(time.time() - start)



	### Perform Bootstrapping ###
	print "\nPerforming bootstrapping..."
	start = time.time()

	# use the renamed columns (the positional constants refer to the
	# pre-restructuring column layout)
	if bootstrap_type == 'v':
		namesToBootstrap = list(set(series["variantID"]))
		label = "variantID"
	if bootstrap_type == 'f':
		namesToBootstrap = list(set(series["filterID"]))
		label = "filterID"

	print "bootstrapping {} unique variants...".format(len(namesToBootstrap))
	# bootstrapOneVariant(variantSeries, indexes, variantName, numBootstraps, minReplicates, statistic, confidence_interval):
	if numCores > 1:
		allBootstrappedValues = (Parallel(n_jobs=numCores, verbose = 10)(delayed(bootstrapOneVariant)(series.loc[series[label] == name,:], 
							indexes, name, num_bootstraps, min_replicates, statistic, 
							confidence_interval) for name in namesToBootstrap))
	else:
		allBootstrappedValues = [bootstrapOneVariant(series.loc[series[label] == name,:], 
								indexes, name, num_bootstraps, min_replicates, statistic, 
								confidence_interval) for name in namesToBootstrap]
	allBootstrappedValues = filter(None, allBootstrappedValues)
	print "Done: {0:.2f} seconds".format(time.time() - start)
	print "{} variants passed minimum replicate cutoff of {}".format(len(allBootstrappedValues), min_replicates)
	

	### Write to file ###
	with open(os.path.join(output_directory, args.output_prefix + ".CPdata"), 'w') as f:
		for variant in allBootstrappedValues:
			for line in variant:
				for i in line:
					f.write(str(i)+'\t')
				f.write('\n')
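
The commented-out signature above belongs to bootstrapOneVariant, which is defined elsewhere in this script along with loadAndConcatAllTiles, selectData, and the column constants. A minimal sketch of bootstrapOneVariant consistent with how its return value is consumed (None below the replicate cutoff, otherwise one row per fluorescence timepoint), assuming numpy-based resampling:

import numpy as np

def bootstrapOneVariant(variantSeries, indexes, variantName, numBootstraps,
                        minReplicates, statistic, confidence_interval):
    # Hypothetical sketch; the real implementation is defined elsewhere.
    n = len(variantSeries.index)
    if n < minReplicates:
        return None  # dropped later by filter(None, allBootstrappedValues)
    stat_func = np.median if statistic == 'median' else np.mean
    rows = []
    for col in indexes:
        values = variantSeries[col].values
        stats = [stat_func(np.random.choice(values, size=n, replace=True))
                 for _ in range(numBootstraps)]
        low, high = np.percentile(stats, confidence_interval)
        rows.append([variantName, col, n, stat_func(values), low, high])
    return rows
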
Example 3
def main():
    ################ Parse input parameters ################

    #set up command line argument parser
    parser = argparse.ArgumentParser(
        description=
        'script for generating input files for image stack quantification')
    group = parser.add_argument_group('required arguments for processing data')
    group.add_argument(
        '-id',
        '--input_directory',
        required=True,
        help='directory that holds the image files of an array experiment')
    group = parser.add_argument_group('optional arguments for processing data')
    group.add_argument(
        '-tl',
        '--tile_list',
        default="",
        help='which tiles to use when generating input files (default is all)')
    group.add_argument(
        '-od',
        '--output_directory',
        default="",
        help='save output files to here. default = input directory')
    group.add_argument('-op',
                       '--output_prefix',
                       default="",
                       help='optional output file prefix')
    group.add_argument(
        '-bf',
        '--baseline_flag',
        default="",
        help='flag denoting image files that contain baseline measurements')
    group.add_argument(
        '-ef',
        '--experiment_flag',
        default="",
        help='flag denoting image files that contain experimental measurements'
    )

    # print help if no arguments provided
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit()

    #parse command line arguments
    args = parser.parse_args()

    if args.output_directory == "":
        args.output_directory = args.input_directory

    # os.path.join with an empty final component appends the OS-specific separator
    args.absPath = os.path.join(os.path.abspath(args.input_directory), "")

    # add underscore for better formatting
    if args.output_prefix != "":
        args.output_prefix = args.output_prefix + "_"

    ################ Make input files ################

    # Gather all image files in input directory and extract all tiles
    allFiles = cpfiletools.find_files_in_directory(args.input_directory,
                                                   ['tif'])
    allTiles = set()
    for filename in allFiles:
        allTiles.add(cpfiletools.get_tile_number_from_filename(filename))

    # decide which tiles you want to use for making inputFiles
    if args.tile_list == "":
        tilesToUse = set(allTiles)
    else:
        tilesToUse = set(parse_tile_input(args.tile_list)) & allTiles

    # Tile dictionary for storing file data later on
    tileDict = {}
    for tile in tilesToUse:
        tileDict[tile] = []

    # Make a list of files (filtered by tile) that will be used to create input files
    filteredFiles = []
    for filename in allFiles:
        if cpfiletools.get_tile_number_from_filename(filename) in tilesToUse \
        and (args.baseline_flag in filename or args.experiment_flag in filename):
            filteredFiles.append(filename)
            print "will use:\t{}".format(filename)

    # Make separate lists for differently flagged files
    baselineFiles = []
    expFiles = []

    for filename in filteredFiles:
        if args.baseline_flag != "" and args.baseline_flag in filename:
            baselineFiles.append(filename)
        if args.experiment_flag != "" and args.experiment_flag in filename:
            expFiles.append(filename)

    if len(baselineFiles) < 1 and len(expFiles) < 1:
        print "ERROR: no tiles selected!"
        sys.exit()

    # Add all baseline files to the tile dictionary
    if len(baselineFiles) > 0:
        add_data_to_tile_dict(tileDict, args, baselineFiles,
                              args.baseline_flag, 0)

    # Add all experimental files to the tile dictionary
    if len(expFiles) > 0:
        # assumes that the earliest experimental image timestamp (over all
        # tiles) is the pseudo-zero timestamp
        minTimeStamp = min(
            cpfiletools.parse_timestamp_from_filename(filename)
            for filename in expFiles)
        add_data_to_tile_dict(tileDict, args, expFiles, args.experiment_flag,
                              minTimeStamp)

    # Time table for use in hacky analysis (20160201)
    timeTable = {}

    # sort output, add sequence number to experimental file entries, and print all files
    for tile in sorted(tilesToUse):
        tileDict[tile].sort()
        count = 1
        timeTable[tile] = []  # Fill in time table for each tile
        for filedata in tileDict[tile]:
            timeTable[tile].append(filedata.timestamp)
            if args.experiment_flag in filedata.ID:
                filedata.ID = filedata.ID + "_" + str(count)
                count += 1
        filename = args.output_prefix + "tile" + tile + ".ipf"
        with open(os.path.join(args.output_directory, filename), 'w') as f:
            f.write("time\n")
            for filedata in tileDict[tile]:
                f.write("{}\n".format(filedata))
            f.write("\n")
        print "Successfully made file: {}".format(filename)

    # Print out the time Table (20160201)
    with open("timeTable.txt", 'w') as f:
        tiles = sorted(timeTable.keys())
        for tile in tiles:
            f.write(tile)
            for timestamp in timeTable[tile]:  # avoid shadowing the time module
                f.write("\t" + str(timestamp))
            f.write("\n")
    print "successfully made file: timeTable.txt"
Example 4
def main():
    ################ Parse input parameters ################

    #set up command line argument parser
    parser = argparse.ArgumentParser(
        description=
        'Script for splitting a directory of images into multiple directories')
    group = parser.add_argument_group('required arguments for processing data')
    group.add_argument('-id',
                       '--image_directory',
                       required=True,
                       help='directory that holds images to be split (.tif)')

    group = parser.add_argument_group('optional arguments for processing data')
    group.add_argument('-p',
                       '--prefix',
                       default="set",
                       help='prefix for new directories. default = set')
    group.add_argument('-od',
                       '--output_directory',
                       default='image_directory',
                       help='directory in which new directories will be made')
    group.add_argument(
        '-a',
        '--action',
        default='l',
        help=
        'what to do with the images (m = move, l = symbolic link). Default is to link.'
    )

    if not len(sys.argv) > 1:
        parser.print_help()
        sys.exit()

    #parse command line arguments
    args = parser.parse_args()
    if args.action != "m" and args.action != "l":
        print "Error: action must be either 'm' (move) or 'l' (link)!"
        sys.exit()

    # Gather the image files in the provided image directory
    print "Finding image files in directory {}...".format(args.image_directory)

    imageFiles = cpfiletools.find_files_in_directory(args.image_directory,
                                                     ['tif', 'tiff'])
    if len(imageFiles) < 1:
        print "Error: no image files found in directory: " + args.image_directory
        sys.exit()

    # Make a dictionary of all the image files keyed by tile number
    imageDirectory = os.path.abspath(args.image_directory)

    imageDict = cpfiletools.make_tile_dict_multiple(imageFiles, imageDirectory)
    tileList = imageDict.keys()

    numImagesPerTile = len(imageDict[tileList[0]])

    # now make new directories to hold split images:
    if args.output_directory == 'image_directory':
        outputPath = args.image_directory
    else:
        outputPath = args.output_directory
        if not os.path.exists(outputPath):
            print "Error: directory {} does not exist!".format(outputPath)
            sys.exit()

    newDirList = []
    for n in range(numImagesPerTile):
        dirname = os.path.join(outputPath, args.prefix + "{:02}".format(n + 1))
        os.mkdir(dirname)
        newDirList.append(dirname)
        print "made directory: {}".format(dirname)

    # Now that directories are made, move images into those directories (or link)
    count = 0
    while count < numImagesPerTile:
        for tile in tileList:
            fullFileName = imageDict[tile].pop(0)
            prevPath, fileName = os.path.split(fullFileName)
            if args.action == "m":
                os.rename(fullFileName, newDirList[count] + "/" + fileName)
            if args.action == "l":
                os.symlink(fullFileName, newDirList[count] + "/" + fileName)
        count += 1

    print "Files split successfully"
Example 5
def main():
    ################ Parse input parameters ################

    #set up command line argument parser
    parser = argparse.ArgumentParser(
        description=
        'Script for generating phony CPfluors for unquantified images')
    group = parser.add_argument_group('required arguments:')
    group.add_argument(
        '-id',
        '--image_dir',
        required=True,
        help=
        'directory that holds the all the images on which quantification was attempted (successful or not)'
    )
    group.add_argument(
        '-fd',
        '--fluor_dir',
        required=True,
        help='directory containing CPfluor files that were generated')
    group.add_argument(
        '-sd',
        '--seq_dir',
        required=True,
        help='directory that contains the CPseq files for this experiment.')

    group = parser.add_argument_group('optional arguments for processing data')
    group.add_argument(
        '-od',
        '--output_dir',
        default="fluor_dir",
        help='where the output files will be saved. Default is the fluor_dir.')
    group.add_argument(
        '-fl',
        '--flag',
        default="phony",
        help=
        'optional flag to be inserted at the front of phony CPfluor file names.'
    )

    if not len(sys.argv) > 1:
        parser.print_help()
        sys.exit()

    #parse command line arguments
    args = parser.parse_args()

    # check that output directory is valid:
    if args.output_dir == "fluor_dir":
        output_dir = args.fluor_dir
    else:
        if os.path.isdir(args.output_dir):
            output_dir = args.output_dir
        else:
            print "Error: output directory " + args.output_dir + " is not a directory. Exiting..."
            sys.exit()

    # import fluor files
    print "Finding fluor files in directory " + args.fluor_dir + " ..."
    fluorFilenames = cpfiletools.find_files_in_directory(
        args.fluor_dir, ['.CPfluor'])
    if len(fluorFilenames) < 1:
        print "Error: No fluor files found in directory: " + args.fluor_dir
        sys.exit()

    # import image files
    print "Finding image files in directory " + args.image_dir + " ..."
    imageFilenames = cpfiletools.find_files_in_directory(
        args.image_dir, ['.tif', '.tiff'])
    if len(imageFilenames) < 1:
        print "Error: No image files found in directory: " + args.image_dir
        sys.exit()

    # find the relevant CPseq files:
    print "Finding CPseq files in directory " + args.seq_dir + " ..."
    seqFilenames = cpfiletools.find_files_in_directory(args.seq_dir,
                                                       ['.CPseq'])
    if len(seqFilenames) < 1:
        print "Error: No CPseq files found in directory: " + args.seq_dir
        sys.exit()

    # Make a set of timestamps from the fluor files
    # This script assumes that no two images will have the same timestamp

    fluorTimestamps = set()
    for filename in fluorFilenames:
        fluorTimestamps.add(getTimestamp(filename))

    # Now identify which images do not have corresponding CPfluor files:
    lonelyImageFiles = []
    for filename in imageFilenames:
        timestamp = getTimestamp(filename)
        if timestamp not in fluorTimestamps:
            lonelyImageFiles.append(filename)

    if len(lonelyImageFiles) < 1:
        print "No need for phony files. Exiting..."
        sys.exit()

    # Make a CPseq dict keyed by tile number:
    seq_dict = cpfiletools.make_tile_dict(seqFilenames, args.seq_dir)

    # Now make the new phony files
    for filename in lonelyImageFiles:
        root, ext = os.path.splitext(filename)
        # (str.strip(ext) strips characters, not the extension; use the root)
        newFluorName = args.flag + root + ".CPfluor"
        # find the CPseq file relevant to this image:
        tile = cpfiletools.get_tile_number_from_filename(filename)
        cpseq = seq_dict[tile]
        with open(os.path.join(output_dir, newFluorName), 'w') as outfile, \
                open(cpseq, 'r') as infile:
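            # Each phony row mimics the CPfluor line format: the cluster ID
            # followed by colon-separated, zeroed-out fit fields that mark the
            # cluster as unquantified (assumed field layout).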
            for line in infile:
                cluster_ID = line.split()[0]
                outfile.write(cluster_ID +
                              ':0:0.000000:0.000000:0.000000:0.000000\n')
        print "Generated phony file: " + newFluorName
Example 6
def main():
    start = time.time()
    ################ Parse input parameters ################

    #set up command line argument parser
    parser = argparse.ArgumentParser(description='Script for generating a \
		CPannot file based on previously designed variants')
    group = parser.add_argument_group('required arguments')
    group.add_argument(
        '-sd',
        '--seq_directory',
        required=True,
        help='directory that holds the CPseq files that need variant IDs')
    group.add_argument(
        '-vt',
        '--variant_table',
        required=True,
        help='A tab-delimited table containing the variant information \
		(first column sequence, second column variant ID)')
    group = parser.add_argument_group('optional arguments for processing data')
    group.add_argument('-od',
                       '--output_directory',
                       help='output directory for the CPannot file \
		(default is the current directory)')
    group.add_argument('-n',
                       '--num_cores',
                       type=int,
                       default=19,
                       help='number of cores to use')

    if not len(sys.argv) > 1:
        parser.print_help()
        sys.exit()

    #parse command line arguments
    args = parser.parse_args()
    numCores = args.num_cores

    # If no output directory given, use current directory
    if not args.output_directory:
        args.output_directory = "./"
    output_directory = args.output_directory
    if not os.path.isdir(output_directory):
        print "Error: invalid output directory selection. Exiting..."
        sys.exit()

    # Construct variant dict:
    print "Reading in variant dict: {}".format(args.variant_table)
    variant_dict = get_variant_dict(args.variant_table)

    # Find CPseqs in seq_directory:
    print "Finding CPseq files in directory: {}".format(args.seq_directory)
    CPseqFiles = cpfiletools.find_files_in_directory(args.seq_directory,
                                                     ['.CPseq'])

    if numCores > 1:
        print "Annotating clusters in parallel on {} cores...".format(numCores)
        annotated_cluster_lists = (Parallel(n_jobs=numCores, verbose=10)(
            delayed(annotate_clusters)(os.path.join(args.seq_directory, CPseq),
                                       variant_dict) for CPseq in CPseqFiles))
    else:
        print "Annotating clusters on a single core"
        annotated_cluster_lists = [
            annotate_clusters(os.path.join(args.seq_directory, CPseq),
                              variant_dict) for CPseq in CPseqFiles
        ]

    # Combine cluster lists:
    print "Formatting and saving CPannot file..."
    all_annotations = []
    map(all_annotations.extend, annotated_cluster_lists)
    CPannot_df = pd.DataFrame(all_annotations)
    CPannot_df.columns = ['cluster_ID', 'variant_ID']

    # Save the CPannot file as a pickle
    CPannotFilename = "_".join(
        longestSubstring(CPseqFiles).split("_")[:-1]) + ".CPannot.pkl"
    print "Creating CPannot.pkl file: {}...".format(CPannotFilename)
    CPannot_df = CPannot_df.set_index("cluster_ID")

    CPannot_df.to_pickle(os.path.join(output_directory, CPannotFilename))
    print "Done. {} minutes".format(round((time.time() - start) / 60, 2))
Example 7
def main():
	################ Parse input parameters ################

	#set up command line argument parser
	parser = argparse.ArgumentParser(description='script for relating variantIDs from CPannot file to sequence')
	group = parser.add_argument_group('required arguments')
	group.add_argument('-a', '--annot_file', required=True,
						help='A .CPannot.pkl file')
	group.add_argument('-sd', '--seq_dir', required=True,
						help='A directory of .CPseq files')

	group = parser.add_argument_group('optional arguments for running script')
	group.add_argument('-l','--length', default="short",
						help='translate in "long" or "short" format: long will include every cluster in CPannot file, short will only show variantID and one sequence match. Default = short')
	group.add_argument('-sc','--seq_cols', default="3",
						help='Which sequence columns to output in CPtranslate file. May use multiple columns for long format (separate by commas). key: 3 = r1, 5 = r2, 7 = i7, 9 = i5')
	group.add_argument('-od','--output_dir', default=os.getcwd(),
						help='Output directory. default is current directory')
	group.add_argument('-n','--num_cores', default=1,
						help='How many cores to use for parallel processing')

	if not len(sys.argv) > 1:
		parser.print_help()
		sys.exit()

	##### parse command line arguments #####
	args = parser.parse_args()

	annot_file = args.annot_file
	seq_dir = args.seq_dir

	length = args.length
	if length != "short" and length != "long":
		print "Error: length must be either 'short' or 'long'. Exiting..."
		sys.exit()

	seq_cols = [0] + [int(n) - 1 for n in args.seq_cols.split(',')]
	seq_col_names = assign_names(seq_cols)

	output_dir = args.output_dir
	if not os.path.isdir(output_dir):
		print "Error: output directory is invalid. Exiting..."
		sys.exit()
	if output_dir[-1] != '/':
		output_dir = output_dir + '/'

	num_cores = int(args.num_cores)

	########################################


	# Read in CPannot file
	print "Reading in CPannot file..."
	start = time.time()
	annot_df = fileio.loadFile(annot_file)
	print "file loaded: {0:.2f} seconds\n".format(time.time() - start)

	# Read in CPseq files as a concatenated data frame
	print "Reading in CPseq files..."
	start = time.time()
	seq_files = cpfiletools.find_files_in_directory(seq_dir, ['.CPseq'])
	print "found CPseq files: "
	cpfiletools.printList(seq_files)
	seq_df = pd.DataFrame()
	for seq_file in seq_files:
		new_df = pd.read_csv(os.path.join(seq_dir, seq_file), sep='\t', index_col=0, usecols=seq_cols, header=None)
		seq_df = pd.concat([seq_df, new_df])
	seq_df.columns = seq_col_names
	print str(len(seq_files)) + " files loaded: {0:.2f} seconds\n".format(time.time() - start)
	
	# Merge the data frames 
	print "Merging data frames..."
	start = time.time()
	merged_df = annot_df.merge(seq_df, how='left', left_index=True, right_index=True)
	print "Merged: {0:.2f} seconds\n".format(time.time() - start)


	# Save long format CPtranslate if requested
	if length == "long":
		print "Saving long format CPtranslate.pkl..."
		start = time.time()
		# (str.rstrip strips characters, not a suffix, so use replace instead)
		filename = os.path.basename(annot_file).replace('.CPannot.pkl', '') + ".long.CPtranslate.pkl"
		print "filename = "+filename
		merged_df.to_pickle(output_dir+filename)
		print "Saved: {0:.2f} seconds\n".format(time.time() - start)

	# Create the short format CPtranslate:
	if length == "short":
		print "Generating short format CPtranslate..."
		start = time.time()
		# Make a list of unique variant_IDs

		grouped_variants = merged_df.groupby('variant_ID')

		all_variants = (Parallel(n_jobs=num_cores, verbose=10)
						(delayed(fetch_sequences)(name, group, seq_col_names) for name, group in grouped_variants))

		short_df = pd.DataFrame(all_variants)
		short_df.columns = ['variant_ID', 'count']+seq_col_names
		print "short format generated: {0:.2f} seconds\n".format(time.time() - start)
		print short_df.head()

		print "Saving short format CPtranslate.pkl..."
		start = time.time()
		filename = os.path.basename(annot_file).replace('.CPannot.pkl', '') + ".short.CPtranslate.pkl"
		short_df.to_pickle(output_dir+filename)
		print "Saved: {0:.2f} seconds\n".format(time.time() - start)
Example 8
def main():
    ################ Parse input parameters ################

    #set up command line argument parser
    parser = argparse.ArgumentParser(
        description='Script for assigning unique IDs to variants in CPseq files'
    )
    group = parser.add_argument_group('required arguments')
    group.add_argument(
        '-sd',
        '--seq_directory',
        required=True,
        help='directory that holds the CPseq files that need variant IDs')
    group.add_argument(
        '-sc',
        '--seq_column',
        required=True,
        help=
        'which column in the CPseq file you want to use for assigning variants'
    )

    group = parser.add_argument_group('optional arguments for processing data')
    group.add_argument(
        '-pi',
        '--previous_ID_file',
        default="",
        help=
        'An ID file previously created for variants expected in the new CPseq files'
    )
    group.add_argument(
        '-fk',
        '--filters_to_use',
        default="",
        help=
        'Which filters should be kept. Separate by commas: filter1,filter2,filter3,etc. If you want to use clusters without a filter, include "blank" (filter1,filter2,blank). Default is to use all filters.'
    )
    group.add_argument(
        '-st',
        '--seq_start',
        default=0,
        help=
        'start position within sequence for matching. Will use beginning of sequence if none specified.'
    )
    group.add_argument(
        '-ed',
        '--seq_end',
        default=0,
        help=
        'end position within sequence for matching. Will use end of sequence if none specified.'
    )
    group.add_argument(
        '-lb',
        '--label',
        default="ID_ed",
        help='label attached to output files. Default is "ID_ed"')
    group.add_argument(
        '-od',
        '--output_directory',
        default="",
        help=
        'output directory for series files with labeled variants (default will use seq_directory)'
    )
    group.add_argument(
        '-if',
        '--ID_file',
        default="ID_file.txt",
        help=
        'file name for the list of IDs and corresponding sequences. Default is "ID_file.txt"'
    )

    if not len(sys.argv) > 1:
        parser.print_help()
        sys.exit()

    #parse command line arguments
    args = parser.parse_args()

    # If no output directory given, use input directory
    if args.output_directory == "":
        args.output_directory = args.seq_directory
    output_directory = args.output_directory
    if not os.path.isdir(output_directory):
        print "Error: invalid output directory selection. Exiting..."
        sys.exit()

    # Create the set of filters to be kept:
    filters = set(args.filters_to_use.split(','))
    if "blank" in filters:
        filters.remove("blank")
        filters.add(
            "no_filter"
        )  # the pandas dataframes are coerced to contain 'no_filter' instead of NaNs
    # (set.pop() would discard an arbitrary filter; test the set instead)
    if filters == {""}:
        filters = {"all"}
    print filters
    # This script will run through each of the provided CPseq files sequentially in order to
    # ensure that each variant gets assigned only one variant ID.

    CPseqFiles = cpfiletools.find_files_in_directory(args.seq_directory,
                                                     ['.CPseq'])

    numLines = 0
    for seqFile in CPseqFiles:
        numLines += int(
            subprocess.check_output(
                ("wc -l {} | ".format(os.path.join(
                    args.seq_directory, seqFile)) + " awk \'{print $1}\'"),
                shell=True).strip())

    start = time.time()
    randID_set = set()
    print "Generating random IDs..."
    while len(randID_set) < numLines:
        randID = ''.join([
            random.choice(string.ascii_uppercase + string.digits)
            for n in range(8)
        ])  # 36^8 ~ 2.8e12 possible IDs
        randID_set.add(randID)
    print "ID generation: {0:.2f} seconds".format(time.time() - start)

    # This dictionary will contain all the variants assigned, keyed by sequence match
    # The entries in variant dict will be three-element lists, the first is the ID, the second is the filter
    # associated with that variant (if any), and the third is the number of times that variant has been seen
    variantDict = {}

    # If a previous ID file was provided, it will pre-populate the variantDict.
    # Note: it is up to the user to ensure that seq_column, seq_start and seq_end match those used to
    # create the previous ID file!
    if args.previous_ID_file != "":
        with open(args.previous_ID_file, 'r') as f:
            for line in f:
                seq, ID, filtr, n = line.split()
                variantDict[seq] = [ID, filtr, int(n)]

    fileNum = 1
    CPannot_df = pd.DataFrame()

    # Loop through each CPseq file to assign variants:
    for seqFile in CPseqFiles:
        print "Working on file: {}...{} of {}".format(seqFile, fileNum,
                                                      len(CPseqFiles))
        # Time each loop for now:
        start = time.time()
        # Read in CPseq file as pandas df
        seq_df = pd.read_table(os.path.join(args.seq_directory, seqFile),
                               header=None)
        seq_df = seq_df.fillna('no_filter')

        print "length pre-filter " + str(len(seq_df))
        # filter df by filters to keep (if any)
        if "all" not in filters:
            seq_df = seq_df[seq_df.iloc[:, 1].isin(filters)]
        print "length post-filter " + str(len(seq_df))

        # set sequence selection parameters:
        seq_col = int(
            args.seq_column
        ) - 1  # Allow for intuitive column selection (i.e. start at 1)
        if seq_col < 0 or seq_col >= len(seq_df.columns):
            print "Error: invalid seq column selected. Out of range. Must be within {} and {}".format(
                1, len(seq_df.columns))
            sys.exit()

        # Test to ensure provided column contains sequence data:
        test_seq = seq_df.iloc[0, seq_col]
        if not re.match("^[a-zA-Z]+$", test_seq):
            print "Error: provided column does not contain sequence data, e.g. {}".format(
                test_seq)
            sys.exit()

        # Test to ensure start and end sequence positions are valid:
        seq_length = len(seq_df.iloc[0, seq_col])
        strt = int(args.seq_start)
        if strt < 0 or strt > seq_length - 1:
            print "Error: invalid start position selected. Must be positive and less than seq length"
            sys.exit()
        end = int(args.seq_end)
        # If no end position provided, use the entire sequence length
        if end == 0:
            end = seq_length
        if end < strt or end > seq_length:
            print "Error: invalid end position selected. Must be greater than start position and <= seq length"
            sys.exit()

        # Fill in list of IDs to be used as new column
        clusterIDs = []
        IDs = []
        total_rows = len(seq_df.index)

        # Iterate through entire CPseq file:
        for row in range(total_rows):
            seq = seq_df.iloc[row, seq_col][strt:end]
            # If sub-sequence has already been seen, assign existing ID
            if seq in variantDict:
                IDs.append(variantDict[seq][0])
                variantDict[seq][
                    2] += 1  # Count how many times a variant has been seen
            else:
                newID = randID_set.pop()
                IDs.append(newID)
                variantDict[seq] = [newID, seq_df.iloc[row, 1], 1]
            clusterIDs.append(seq_df.iloc[row, 0])
            # Curtis' cool progress bar:
            cpfiletools.update_progress(row, total_rows)

        # Start making the CPannot file:
        if fileNum == 1:
            CPannot_df = pd.DataFrame({
                "cluster_ID": clusterIDs,
                "variant_ID": IDs
            })
        else:
            CPannot_df = pd.concat([
                CPannot_df,
                pd.DataFrame({
                    "cluster_ID": clusterIDs,
                    "variant_ID": IDs
                })
            ])

        print "finished file: {0:.2f} seconds".format(time.time() - start)
        fileNum += 1

    # Save the CPannot file as a pickle
    CPannotFilename = "_".join(
        longestSubstring(CPseqFiles).split("_")[:-1]) + ".CPannot.pkl"
    print "Creating CPannot.pkl file: {}...".format(CPannotFilename)
    CPannot_df = CPannot_df.set_index("cluster_ID")

    CPannot_df.to_pickle(os.path.join(output_directory, CPannotFilename))

    # Now write a file containing the key for all the assigned IDs:
    print "Creating ID file: {}...".format(args.ID_file)
    variant_df = pd.DataFrame(variantDict).transpose()
    seqs = list(variant_df.index)
    variant_df.insert(loc=0, column="sequence", value=seqs)
    sorted_df = variant_df.sort_values(by=[2, "sequence"], ascending=[
        False, True
    ])  # Sort by times each variant was seen, then by sequence
    np.savetxt(os.path.join(args.output_directory, args.ID_file),
               sorted_df.values,
               fmt='%s',
               delimiter='\t')
    print "Done"
Example 9
def main():
	################ Parse input parameters ################

	#set up command line argument parser
	parser = argparse.ArgumentParser(description='Script for generating CPseries files from CPseq and CPfluor files')
	group = parser.add_argument_group('required arguments for processing data')
	group.add_argument('-fs', '--filtered_CPseqs', required=True,
	                    help='directory that holds the filtered sequence data (CPseq)')
	group.add_argument('-bs', '--bsCPfluors', required=True,
	                    help='directory containing binding series CPfluor files')

	group = parser.add_argument_group('optional arguments for processing data')
	group.add_argument('-od','--output_dir', default="CPseries",
	                    help='save output files to here. default = ./CPseries')
	group.add_argument('-ar','--allRNA', default='',
	                    help='directory containing allRNA CPfluor files')
	group = parser.add_argument_group('other settings')
	group.add_argument('-n','--num_cores', type=int, default=20,
	                    help='maximum number of cores to use. default=20')

	if not len(sys.argv) > 1:
	    parser.print_help()
	    sys.exit()


	#parse command line arguments
	args = parser.parse_args()
	numCores = args.num_cores

	# import CPseq filtered files split by tile
	print "Finding CPseq files in directory {}...".format(args.filtered_CPseqs)

	# Gather all of the CPseq files in the 'filtered_CPseqs' file directory
	CPseqFilenames = cpfiletools.find_files_in_directory(args.filtered_CPseqs, ['CPseq'])
	if len(CPseqFilenames) < 1:
		print "Error: No CPseq files found in directory: " + args.filtered_CPseqs
		sys.exit()

	print "Found CPseq files: "
	printList(CPseqFilenames)
	# Create a dictionary of the CPseq files keyed by tile
	CPseqDict = cpfiletools.make_tile_dict(CPseqFilenames, args.filtered_CPseqs)
	tileList = CPseqDict.keys()

	# Gather all of the CPfluor files for all RNA images, if provided
	allRNA_Dict = {}
	if args.allRNA != '':
		print "Finding allRNA CPfluor files in directory {}...".format(args.allRNA)
		allRNAfilenames = cpfiletools.find_files_in_directory(args.allRNA, ['CPfluor'])
		print "Found allRNA files: "
		printList(allRNAfilenames)
		if len(allRNAfilenames) < 1:
			print "Error: no CPfluor files found in directory: " + args.allRNA
			sys.exit()
		allRNA_Dict = cpfiletools.make_tile_dict(allRNAfilenames, args.allRNA)
	else:
		for tile in tileList:
			allRNA_Dict[tile] = ''

	# Gather all of the CPfluor files for creating the cluster binding series
	print "Finding binding series CPfluor files in directory {}...".format(args.bsCPfluors)
	bindingSeriesList = cpfiletools.find_files_in_directory(args.bsCPfluors, ['CPfluor'])
	print "Found CPfluor files: "
	printList(bindingSeriesList)
	bindingSeriesDict = cpfiletools.make_tile_dict_multiple(bindingSeriesList, args.bsCPfluors)



	# Make sure output directory is ready:
	outputDirectory = args.output_dir
	if os.path.isdir(outputDirectory):
		print "Output directory {} already exists".format(outputDirectory)
	else:
		outputDirectory = os.path.join(os.getcwd(), outputDirectory)
		print "Making output directory: {}".format(outputDirectory)
		os.makedirs(outputDirectory)

	# Make CPseries files
	
	CPseriesDict = {}
	for tile, fileName in CPseqDict.items():
		path, baseFile = os.path.split(fileName)
		CPseriesDict[tile] = os.path.join(outputDirectory, baseFile.split('.')[0]+'.CPseries')
	
	# Make CPseries files in parallel:
	print "Making CPseries files..."
	(Parallel(n_jobs=numCores, verbose = 10)
		(delayed(cpfiletools.generate_CPseries_files)
			(CPseqDict[tile], 
			allRNA_Dict[tile], 
			bindingSeriesDict[tile], 
			CPseriesDict[tile], 
			tile)
		for tile in tileList))
	print "Done"