def cluster_count_matrix(config_file, lane_id, strain_fmt_string, cond_fmt_string):

    config_params = cfp.parse(config_file)

    sample_detection_limit, control_detection_limit = get_detection_limits(config_params)

    # If the file does not exist, then do not attempt to cluster it!
    try:
        genes, conditions, matrix = load_dumped_count_matrix(config_params, lane_id)
    except IOError:
        print "could not find '{}' count matrix".format(lane_id)
        return None

    # Threshold on a copy so the caller's raw count matrix is not modified in place
    thresholded_matrix = matrix.copy()

    thresholded_matrix[thresholded_matrix < sample_detection_limit] = sample_detection_limit
    logged_matrix = np.log2(thresholded_matrix)

    # Customize the strain and condition names for interpretable visualization!
    custom_genes = customize_strains(genes, config_params, strain_fmt_string)
    custom_conditions = customize_conditions(conditions, config_params, cond_fmt_string)

    dataset = [custom_genes, custom_conditions, logged_matrix]

    record, rows_tree, cols_tree = clus.cluster(dataset)

    f = get_clustered_count_matrix_filename(config_params, lane_id)
    record.save(f, rows_tree, cols_tree)
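
# Throughout this pipeline a "dataset" is the 3-item list
# [row_labels, column_labels, matrix]. A tiny constructed example of that
# convention (the values below are illustrative, not real data):
import numpy as np

example_dataset = [
    ['strain_A', 'strain_B'],                      # genes / barcodes (rows)
    ['cond_1', 'cond_2', 'cond_3'],                # conditions (columns)
    np.array([[5.0, 2.0, 9.0],
              [1.0, 7.0, 3.0]]),                   # counts, one row per strain
]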
def cluster_zscore_matrix(config_file, lane_id, strain_fmt_string, cond_fmt_string):

    config_params = cfp.parse(config_file)

    # If the file does not exist, then do not attempt to cluster it!
    try:
        genes, conditions, matrix = load_dumped_zscore_matrix(config_params, lane_id)
    except IOError:
        print "could not find '{}' zscore matrix".format(lane_id)
        return None

    # Customize the strain and condition names for interpretable visualization!
    strain_table = get_barcode_table(config_params)
    sample_table = get_sample_table(config_params)
    custom_genes = customize_strains(genes, strain_table, strain_fmt_string)
    custom_conditions = customize_conditions(conditions, sample_table, cond_fmt_string)

    dataset = [custom_genes, custom_conditions, matrix]

    record, rows_tree, cols_tree = clus.cluster(dataset)

    f = get_clustered_zscore_matrix_filename(config_params, lane_id)
    record.save(f, rows_tree, cols_tree)

    # return the filename so the cdt/atr/gtr files can be copied to a directory with all
    # of the other clustergrams and eventually tarred/gzipped for distribution!
    return f
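
# The comment above implies the returned job names are later collected and
# tarred/gzipped. A hedged sketch of that downstream step, assuming
# record.save() follows the Pycluster convention of writing
# <jobname>.cdt/.atr/.gtr files (the helper below is an illustration, not
# the pipeline's actual code):
import glob
import os
import tarfile

def tar_clustergrams(clustered_job_names, out_tarball):
    with tarfile.open(out_tarball, 'w:gz') as tar:
        for job in clustered_job_names:
            # Pick up every file sharing the job name (.cdt/.atr/.gtr)
            for sibling in glob.glob(job + '.*'):
                tar.add(sibling, arcname=os.path.basename(sibling))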
def get_configuration():
    """ Get the configuration from command line and config files """
    # This is the dict we will return
    configuration = {"global": {}, "logging": {}, "tables": {}}

    # Read the command line options
    cmd_line_options = command_line_parser.parse()

    # If a configuration file is specified, read that as well
    if "config" in cmd_line_options:
        conf_file_options = config_file_parser.parse(cmd_line_options["config"])
    else:
        conf_file_options = None

    # Extract global config
    configuration["global"] = __get_global_options(cmd_line_options, conf_file_options)

    # Extract logging config
    configuration["logging"] = __get_logging_options(cmd_line_options, conf_file_options)

    # Extract table configuration
    # If the --table cmd line option is set, it indicates that only table
    # options from the command line should be used
    if "table_name" in cmd_line_options:
        configuration["tables"] = __get_cmd_table_options(cmd_line_options)
    else:
        configuration["tables"] = __get_config_table_options(conf_file_options)

    # Ensure some basic rules
    __check_table_rules(configuration)

    return configuration
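
# __get_global_options is not shown in this excerpt; a minimal sketch of the
# likely precedence rule (command-line values override config-file values),
# assuming both parsers return plain dicts -- the real helper may differ:
def merge_options(cmd_line_options, conf_file_options, keys):
    merged = {}
    for key in keys:
        if conf_file_options and key in conf_file_options:
            merged[key] = conf_file_options[key]
        if key in cmd_line_options:
            merged[key] = cmd_line_options[key]  # command line wins
    return merged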
def main(config_file):

    # Read in the config params
    config_params = cfp.parse(config_file)
    sample_table = get_sample_table(config_params)

    # Create the folder where all the filtered condition/strain info is dumped
    create_filtered_output_folder(config_params)

    # Get parameters that specify if some steps should be run or not
    bool_dict = {'True': True, 'TRUE': True, 'False': False, 'FALSE': False}
    remove_mtag_offenders = bool_dict[config_params['remove_correlated_index_tags']]
    remove_barcode_specific_conds = bool_dict[config_params['remove_barcode_specific_conditions']]

    dataset = load_dumped_count_matrix(config_params, 'all_lanes')
    if get_verbosity(config_params) >= 2:
        print dataset[2].shape

    if get_verbosity(config_params) >= 1:
        print 'Filtering out...'
        print '\tPrespecified conditions to exclude'
    dataset, filtered_include_tab = filter_dataset_for_include_2(dataset, sample_table)
    if get_verbosity(config_params) >= 2:
        print dataset[2].shape
    write_filtered_include_table(filtered_include_tab, config_params)

    if get_verbosity(config_params) >= 1:
        print '\tPrespecified barcodes to exclude'
    dataset, filtered_barcodes = filter_dataset_for_barcodes(dataset, config_params)
    if get_verbosity(config_params) >= 2:
        print dataset[2].shape
    
    write_filtered_strain_file(filtered_barcodes, config_params)
    
    if remove_mtag_offenders:
        if get_verbosity(config_params) >= 1:
            print '\tHighly-correlated index tags'
        dataset, filtered_index_tag_condition_table = filter_dataset_for_index_tags(dataset, config_params)
        if get_verbosity(config_params) >= 2:
            print dataset[2].shape
        write_correlated_index_tags_excluded_conditions(filtered_index_tag_condition_table, config_params)

    if remove_barcode_specific_conds:
        if get_verbosity(config_params) >= 1:
            print '\tConditions with barcode-specific signatures'
        dataset, filtered_barcode_specific_condition_table = filter_dataset_for_barcode_specific_patterns(dataset, config_params)
        if get_verbosity(config_params) >= 2:
            print dataset[2].shape
        write_barcode_specific_excluded_conditions(filtered_barcode_specific_condition_table, config_params)

    if get_verbosity(config_params) >= 1:
        print '\tConditions and strains with low counts'
    dataset, filtered_degree_condition_table, filtered_degree_barcodes = filter_dataset_for_count_degree(dataset, config_params, sample_table)
    if get_verbosity(config_params) >= 2:
        print dataset[2].shape
    write_count_degree_excluded_conditions(filtered_degree_condition_table, config_params)
    write_count_degree_excluded_strains(filtered_degree_barcodes, config_params)

    # Dump the dataset out to file
    dump_filtered_count_matrix(config_params, dataset)
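
# Each filtering step above consumes a [genes, conditions, matrix] dataset
# and returns a reduced dataset plus a record of what was removed. A minimal
# sketch of that pattern for dropping conditions (columns), assuming a NumPy
# matrix with one column per condition (the function name is illustrative):
import numpy as np

def drop_conditions(dataset, conditions_to_drop):
    genes, conditions, matrix = dataset
    keep = np.array([c not in conditions_to_drop for c in conditions])
    kept_conditions = [c for c, k in zip(conditions, keep) if k]
    removed_conditions = [c for c, k in zip(conditions, keep) if not k]
    return [genes, kept_conditions, matrix[:, keep]], removed_conditions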
def main(config_file, lane_id):

    config_params = cfp.parse(config_file)
    species_config_params = get_species_config_params(config_params)

    # Loop over the raw fastq files, write out the "index_tag\tbarcode" file,
    # and return all encountered index tags and barcodes
    if get_verbosity(config_params) >= 1:
        print 'parsing fastq file(s)...'
    total_counts, common_primer_counts, barcodes_in_data, index_tags_in_data = fastq_to_barseq(config_params, species_config_params, lane_id)

    # Get maps of barcode to barcode_gene (keeps the strains unique/traceable), and index tag to condition
    if get_verbosity(config_params) >= 1:
        print 'creating mappings from barcodes and index tags...'
    barcode_to_gene = get_barcode_to_gene(species_config_params)
    index_tag_to_condition = get_index_tag_to_condition(config_params, lane_id)

    if get_verbosity(config_params) >= 1:
        print 'barcode to gene map: {}'.format(barcode_to_gene.items()[0:5])
        print 'index tag to condition map: {}'.format(index_tag_to_condition.items()[0:5])

    # Correct the barcodes within the specified error tolerance
    # (There is no function to correct the index tags - this could easily be
    # written in later, although we see no need for it)
    if get_verbosity(config_params) >= 1:
        print 'correcting barcodes...'
    barcode_correcting_map = correct_barcode_map(config_params, barcodes_in_data, barcode_to_gene)
    if get_verbosity(config_params) >= 1:
        print 'number of barcodes that will be counted: {}'.format(len(barcode_correcting_map))

    # Loop over the barseq file (index_tag\tbarcode\n) and assemble the matrix of read counts
    if get_verbosity(config_params) >= 1:
        print 'generating barseq matrix...'
    corrected_barcodes, index_tags, matrix = get_barseq_matrix(config_params, lane_id, barcode_to_gene, barcode_correcting_map, index_tag_to_condition)

    if get_verbosity(config_params) >= 1:
        print 'number of barcodes: {}'.format(len(corrected_barcodes))
        print 'number of index tags: {}'.format(len(index_tags))
        print 'matrix shape: {0} rows x {1} columns'.format(*matrix.shape)

    # Generate reports for index tags and barcodes
    if get_verbosity(config_params) >= 1:
        print 'generating reports...'
    generate_reports(config_params, lane_id, corrected_barcodes, index_tags, matrix, total_counts, common_primer_counts)

    # Convert the barcodes to their condition names and unique gene/barcode names
    barcode_gene_ids = np.array([barcode_to_gene[bc] for bc in corrected_barcodes])
    condition_ids = np.array([index_tag_to_condition[tag] for tag in index_tags])

    # Dump out the final count matrix to file - other scripts will read it and turn it into a readable matrix/CDT
    if get_verbosity(config_params) >= 1:
        print 'dumping count matrix...'
    dump_count_matrix(config_params, lane_id, barcode_gene_ids, condition_ids, matrix)

    # Remove the temporary barseq file
    remove_barseq_file(config_params, lane_id)
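
# correct_barcode_map is not shown in this excerpt; a hedged sketch of what
# "correcting barcodes within the specified error tolerance" typically means:
# map each observed barcode to a reference barcode when exactly one reference
# lies within the allowed Hamming distance (all names here are assumptions):
def hamming(a, b):
    return sum(x != y for x, y in zip(a, b))

def build_barcode_correcting_map(observed_barcodes, reference_barcodes, max_mismatches):
    correcting_map = {}
    for obs in observed_barcodes:
        hits = [ref for ref in reference_barcodes
                if len(ref) == len(obs) and hamming(obs, ref) <= max_mismatches]
        if len(hits) == 1:  # keep only unambiguous matches
            correcting_map[obs] = hits[0]
    return correcting_map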
def main(config_file, lane_id):
    
    # Read in the config params
    config_params = cfp.parse(config_file)
    sample_table = get_sample_table(config_params)

    # Get the interactions output folder
    outfolder = get_lane_interactions_path(config_params, lane_id)
    if not os.path.isdir(outfolder):
        os.makedirs(outfolder)
    
    # Read in the count matrix from dumped file
    dataset = load_dumped_count_matrix(config_params, lane_id)

    # Filter out samples flagged as "do not include" (include? == True)
    filtered_dataset = filter_dataset_for_include(dataset, sample_table, config_params)

    # TODO: this is probably the best spot to split the dataset so that
    # different controls can be used for different samples.

    # Get list of control samples (control? = True)
    control_condition_ids = get_control_condition_ids(dataset, sample_table)

    # Proceed with algorithm to obtain chemical genetic interaction zscores (scaled deviations)
    if get_verbosity(config_params) >= 1:
        print "Normalizing ... "
    normalized_dataset, mean_control_profile = normalizeUsingAllControlsAndSave(config_params, outfolder, filtered_dataset, control_condition_ids, lane_id)
    if get_verbosity(config_params) >= 1:
        print "Column means: "
        print np.nanmean(normalized_dataset[2], axis = 0)
        print "Done"
        print "Calculating deviations ... "
    deviation_dataset = deviations_globalmean(config_params, outfolder, normalized_dataset, mean_control_profile, lane_id)
    if get_verbosity(config_params) >= 1:
        print "Column means: "
        print np.nanmean(deviation_dataset[2], axis = 0)
        print "Done"
        print "Scaling interactions ... "
    scaled_dev_dataset = scaleInteractions(config_params, outfolder, deviation_dataset, filtered_dataset, control_condition_ids, lane_id)
    if get_verbosity(config_params) >= 1:
        print "Column means: "
        print np.nanmean(scaled_dev_dataset[2], axis = 0)
        print "Done"
    if 'generate_scatterplots' in config_params:
        if config_params['generate_scatterplots'] == 'Y' and lane_id == 'all_lanes_filtered':
            if get_verbosity(config_params) >= 1:
                print "Generating scatterplots"
            generate_scatterplots(config_params, outfolder, mean_control_profile, filtered_dataset, normalized_dataset, deviation_dataset, scaled_dev_dataset)
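
# The normalize -> deviations -> scale sequence above is the pipeline's own
# implementation; below is only a compact sketch of the general idea (deviation
# from a mean control profile, scaled per strain by control variability), not
# the actual normalizeUsingAllControlsAndSave logic:
import numpy as np

def scaled_deviations(matrix, control_columns):
    controls = matrix[:, control_columns]
    mean_control_profile = np.nanmean(controls, axis=1, keepdims=True)
    deviations = matrix - mean_control_profile
    # Per-strain noise estimate from how much the controls themselves deviate
    control_sd = np.nanstd(controls - mean_control_profile, axis=1, keepdims=True)
    return deviations / control_sd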
def main(config_file):

    # Read in the config params
    config_params = cfp.parse(config_file)
    sample_table = get_sample_table(config_params)

    # Read in all of the count matrices and combine into one matrix
    dataset = combine_count_matrices(config_params)

    # Get a new folder to house the combined count matrix
    combined_count_folder = get_lane_data_path(config_params, 'all_lanes')
    if not os.path.isdir(combined_count_folder):
        os.makedirs(combined_count_folder)

    # Dump out the combined count matrix!
    combined_count_filename = get_dumped_count_matrix_filename(config_params, 'all_lanes')
    dump_dataset(dataset, combined_count_filename)
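
# combine_count_matrices is not shown; a minimal sketch of combining per-lane
# [genes, conditions, matrix] datasets along the condition axis, assuming
# every lane shares the same gene (row) ordering -- an assumption this sketch
# does not verify:
import numpy as np

def combine_datasets(datasets):
    genes = datasets[0][0]
    conditions = [c for (_, conds, _) in datasets for c in conds]
    matrix = np.hstack([mat for (_, _, mat) in datasets])
    return [genes, conditions, matrix]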
def main(config_file):

    # Read in the config params
    config_params = cfp.parse(config_file)
    sample_table = get_sample_table(config_params)

    # Read in all of the z-score matrices and combine into one matrix
    dataset = combine_zscore_matrices(config_params)

    # Get directory for index_tag_correlation analysis
    index_tag_path = get_index_tag_correlation_path(config_params)
    if not os.path.isdir(index_tag_path):
        os.makedirs(index_tag_path)

    # Export the initial combined z-score matrix
    per_lane_zscore_dataset_filename = os.path.join(index_tag_path, 'combined_per_lane_zscore_dataset.dump.gz')
    dump_dataset(dataset, per_lane_zscore_dataset_filename)

    # Get just the control dataset, and dump that out too
    control_condition_ids = get_control_condition_ids(dataset, sample_table)
    control_dataset = get_control_dataset(dataset, control_condition_ids)
    per_lane_control_zscore_dataset_filename = os.path.join(index_tag_path, 'combined_per_lane_control_zscore_dataset.dump.gz')
    dump_dataset(control_dataset, per_lane_control_zscore_dataset_filename)

    # Get the sorted index tag correlations for control conditions
    index_tags_sorted, control_index_tag_correlations_sorted = get_control_index_tag_correlations(control_dataset, sample_table, config_params)
    
    # Export the sorted index tag correlations to dump and text files
    write_index_tag_corrs(index_tags_sorted, control_index_tag_correlations_sorted, index_tag_path)   

    # Get the correlations of each profile to the barcode-specific template profiles
    template_profile_ids, template_profile_mat = generate_barcode_specific_template_profiles(dataset[0])
    condition_ids_sorted, barcode_specific_template_correlations_sorted, template_profile_ids_sorted = compute_max_correlation_barcode_specific_offenders(template_profile_ids, template_profile_mat, dataset[1], dataset[2], config_params)

    # Export the sorted correlations of profiles to the barcode-specific template profiles
    write_barcode_specific_template_corrs(condition_ids_sorted, barcode_specific_template_correlations_sorted, template_profile_ids_sorted, index_tag_path)   
    
    ## Plot a histogram of the index tag correlations
    plot_control_index_tag_correlations(control_index_tag_correlations_sorted, index_tag_path)
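
# get_control_index_tag_correlations is not shown; a hedged sketch of one
# plausible reading -- for each index tag, the mean pairwise correlation of
# the control profiles sharing that tag (names are illustrative, and the
# sketch assumes a matrix without NaNs, which np.corrcoef cannot skip):
import numpy as np

def mean_index_tag_correlation(matrix, condition_index_tags, tag):
    cols = [i for i, t in enumerate(condition_index_tags) if t == tag]
    if len(cols) < 2:
        return np.nan
    corrs = np.corrcoef(matrix[:, cols], rowvar=False)
    # Average only the off-diagonal entries (pairwise profile correlations)
    return np.mean(corrs[np.triu_indices(len(cols), k=1)])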
 
def get_sample_table(config_params):

    filename = config_params['sample_table_file']

    # Read everything in as a string, to prevent vexing
    # number interpretation problems! Methods further down
    # can coerce to different types.
    tab = pd.read_table(filename, dtype='S')
    return tab

###########################################
#######  Here is the main script  #########
###########################################

# Get the config file, which is the only argument needed for the pipeline
config_file = args.config_file
config_params = cfp.parse(config_file)

# Read in the sample table
sample_table = get_sample_table(config_params)

# Grab all of the ids of the lanes to process
lane_ids = get_all_lane_ids(sample_table)

## Or, if you were silly and ran all lanes but the newest 4, add this in
#lane_ids = ['lane51', 'lane52', 'lane53', 'lane54']

# First, get one strain X condition count matrix per lane
# This only needs to be run once, unless the barcodes
# or index tags change for some reason.
if start <= 1:
    for lane_id in lane_ids: