def initialize_directories(dock=None):
    """
    Initialize directories for procedure's product files.

    arguments:
        dock (str): path to root or dock directory for source and product
            directories and files

    raises:

    returns:
        (dict<str>): collection of paths to directories for procedure's files

    """

    # Assemble paths to directories in the collection directly.
    paths = dict()
    paths["permutation"] = os.path.join(dock, "permutation")
    paths["genes"] = os.path.join(paths["permutation"], "genes")
    # Remove previous files to avoid version or batch confusion.
    utility.remove_directory(path=paths["permutation"])
    # Initialize directories.
    utility.create_directory(paths["permutation"])
    utility.create_directory(paths["genes"])
    # Return information.
    return paths
def initialize_directories(
    restore=None,
    path_dock=None,
):
    """
    Initialize directories for procedure's product files.

    arguments:
        restore (bool): whether to remove previous versions of data
        path_dock (str): path to dock directory for source and product
            directories and files

    raises:

    returns:
        (dict<str>): collection of paths to directories for procedure's files

    """

    # Determine paths to the relevant directories.
    path_organization = os.path.join(path_dock, "organization")
    path_coombes = os.path.join(
        path_dock, "coombes_prs_gems_gain_mayo_all_2020-10-13"
    )
    # Remove previous files to avoid version or batch confusion.
    if restore:
        utility.remove_directory(path=path_organization)
    # Initialize the product directory.
    utility.create_directories(path=path_organization)
    # Collect paths.
    paths = dict()
    paths["dock"] = path_dock
    paths["organization"] = path_organization
    paths["coombes_polygene"] = path_coombes
    # Return information.
    return paths
def initialize_directories(dock=None):
    """
    Initialize directories for procedure's product files.

    arguments:
        dock (str): path to root or dock directory for source and product
            directories and files

    raises:

    returns:
        (dict<str>): collection of paths to directories for procedure's files

    """

    # Determine the path to the product directory.
    path_function = os.path.join(dock, "function")
    # Remove previous files to avoid version or batch confusion, then
    # recreate the directory empty.
    utility.remove_directory(path=path_function)
    utility.create_directory(path=path_function)
    # Collect and return paths.
    return {
        "dock": dock,
        "function": path_function,
    }
def copy_example_files():
    """Copy the driver example files into the release tree.

    Replaces any previous copy of the examples under the release
    directory, then copies 'stdbool.h' separately — presumably because
    the copytree ignore filter (ignored_example_files) would otherwise
    exclude it; confirm against the filter's definition.
    """
    source_dir = posixpath.join(config.CODE_DIR, 'examples')
    destination_dir = posixpath.join(config.RELEASE_DIR, 'driver', 'examples')
    # Start from a clean destination to avoid stale files.
    utility.remove_directory(destination_dir)
    shutil.copytree(source_dir, destination_dir, ignore=ignored_example_files)
    # Copy 'stdbool.h' explicitly into the destination.
    shutil.copy2(posixpath.join(source_dir, 'stdbool.h'), destination_dir)
def execute_procedure(dock=None, count=None):
    """
    Function to execute module's main behavior.

    arguments:
        dock (str): path to root or dock directory for source and product
            directories and files
        count (int): count of shuffles to create and store

    raises:

    returns:

    """

    # Remove previous files to avoid version or batch confusion.
    utility.remove_directory(path=os.path.join(dock, "shuffle"))
    # Read source information from file.
    source = read_source(dock=dock)
    # Report.
    utility.print_terminal_partition(level=3)
    message = (
        "Creating " + str(count) + " shuffles for matrices of dimension " +
        "zero: " + str(source["tissues_selection"]) +
        " by dimension one: " + str(source["persons_selection"]) + ". "
        "Notice that shuffles occur across dimension one (tissues for each " +
        "person)."
    )
    print(message)
    print(
        "Hence, values will stay matched to their respective tissues, but " +
        "they will be shuffled with respect to persons."
    )
    utility.print_terminal_partition(level=3)
    # Create shuffle indices.
    shuffles = create_shuffle_indices(
        count=count,
        dimension_zero=source["tissues_selection"],
        dimension_one=source["persons_selection"],
    )
    # Compile and write product information to file.
    write_product(dock=dock, information={"shuffles": shuffles})
    pass
def write_product_sets(dock=None, information=None):
    """
    Writes product information to file.

    Writes one pair of tab-delimited files (samples and genes) per tissue
    set under the "tissue/sets" directory, recreating that directory on
    every run.

    arguments:
        dock (str): path to root or dock directory for source and product
            directories and files.
        information (object): information to write to file; "sets" maps to a
            list of entries, each with keys "tissue" (str), "sample" and
            "gene" (pandas DataFrames).

    raises:

    returns:

    """

    # Specify directories and files.
    path_tissue = os.path.join(dock, "tissue")
    utility.create_directory(path_tissue)
    path_sets = os.path.join(path_tissue, "sets")
    # Remove previous files since they change from run to run.
    utility.remove_directory(path=path_sets)
    utility.create_directory(path_sets)
    # Iterate on sets.
    # Name the loop variable "entry" rather than "set" to avoid shadowing
    # the builtin set type.
    for entry in information["sets"]:
        # Access information.
        tissue = entry["tissue"]
        data_sample = entry["sample"]
        data_gene = entry["gene"]
        # Specify directories and files.
        path_sample = os.path.join(
            path_sets, (tissue + "_samples.tsv")
        )
        path_gene = os.path.join(
            path_sets, (tissue + "_genes.tsv")
        )
        # Write information to file.
        data_sample.to_csv(
            path_or_buf=path_sample,
            sep="\t",
            header=True,
            index=True,
        )
        data_gene.to_csv(
            path_or_buf=path_gene,
            sep="\t",
            header=True,
            index=True,
        )
    pass
def package_release():
    """Assemble the release directory into a compressed archive artifact.

    Copies the release tree into the build directory under the
    serialization release name, then archives it as gzipped tar (or zip
    on Windows) and logs the resulting archive filename.
    """
    package_name = config.get_serialization_release_name()
    package_path = posixpath.join(config.BUILD_DIR, package_name)
    # Start from a clean copy to avoid stale files in the package.
    utility.remove_directory(package_path)
    shutil.copytree(config.RELEASE_DIR, package_path)
    # Windows consumers expect a zip archive; elsewhere use gzipped tar.
    package_format = 'zip' if config.PLATFORM_SYSTEM == 'Windows' else 'gztar'
    # Use shutil.make_archive: distutils.archive_util.make_archive is
    # deprecated and removed along with distutils in Python 3.12; the
    # shutil equivalent takes the same arguments and returns the filename.
    filename = shutil.make_archive(
        package_path,
        package_format,
        root_dir=config.BUILD_DIR,
        base_dir=package_name,
    )
    logger.info("Release artifact package filename is \'%s\'", filename)
def execute_procedure(dock=None):
    """
    Function to execute module's main behavior.

    arguments:
        dock (str): path to root or dock directory for source and product
            directories and files

    raises:

    returns:

    """

    # Remove previous files to avoid version or batch confusion.
    utility.remove_directory(path=os.path.join(dock, "split"))
    # Read source information from file.
    source = read_source(dock=dock)
    # Split, report, and write gene signals for each cohort of persons.
    for cohort in ["selection", "respiration", "ventilation"]:
        split_report_write_genes_signals(
            cohort=cohort,
            persons=source["persons_sets"][cohort],
            data_samples_tissues_persons=(
                source["data_samples_tissues_persons"]
            ),
            data_gene_signal=source["data_gene_signal"],
            path_directory=os.path.join(dock, "split", cohort),
            report=True,
        )
    pass
def initialize_directories(dock=None):
    """
    Initialize directories for procedure's product files.

    arguments:
        dock (str): path to root or dock directory for source and product
            directories and files

    raises:

    returns:
        (dict<str>): collection of paths to directories for procedure's files

    """

    # Collect paths.
    paths = dict()
    paths["dock"] = dock
    paths["stratification"] = os.path.join(dock, "stratification")
    # Remove previous files to avoid version or batch confusion.
    utility.remove_directory(path=paths["stratification"])
    utility.create_directory(path=paths["stratification"])
    # Define and initialize paths for each cohort of persons.
    for cohort in ["selection", "respiration", "ventilation"]:
        paths[cohort] = dict()
        for category in ["component", "regression", "summary"]:
            paths[cohort][category] = os.path.join(
                paths["stratification"], cohort, category
            )
            utility.create_directories(path=paths[cohort][category])
    # Return information.
    return paths
def execute_procedure(dock=None):
    """
    Function to execute module's main behavior.

    arguments:
        dock (str): path to root or dock directory for source and product
            directories and files

    raises:

    returns:

    """

    # NOTE(review): this download branch is deliberately disabled with a
    # constant condition; flip it to re-enable retrieval and decompression
    # of the ExPecto data archive.
    if False:
        # Remove previous files to avoid version or batch confusion.
        path_expecto = os.path.join(dock, "expecto")
        utility.remove_directory(path=path_expecto)
        path_remote = "http://deepsea.princeton.edu/media/code/expecto/combined_snps.0.3.zip"
        path_local = os.path.join(path_expecto, "combined_snps.0.3.zip")
        utility.remove_file(path_local)
        wget.download(path_remote, path_local)
        utility.decompress_file_gzip(path_local)
    # Read source information from file and report it.
    source = read_source(dock=dock)
    print(source["data_expecto"])
    # Compile and write product information to file.
    write_product(
        dock=dock,
        information={"data_expecto": source["data_expecto"]},
    )
    pass
def run(self):
    """
    Run the application's main comparison workflow.

    Splits the pre-joined dataframe (self.join_df) into intersection,
    cogo-labs-only, and live-works-only subsets; logs distinct-user counts
    for each; computes the share of intersection users whose job titles
    differ; builds JSON-style job/company summaries per emd5; and writes
    the final result as a single CSV file.

    :return: None
    """
    join_df = self.join_df
    # Cache upstream dataframes: several Spark actions below re-traverse them.
    self.persist()
    # Create dataframes for intersection , cogo labs and liveworks only
    # dataframes.
    logger.info(
        " Creating common dataframe emd5 present both in live works and cogo labs"
    )
    """
    Cogo labs
    emd5 Name
    1    Sam
    2    Henry

    Liveworks
    emd5 Name
    2    John
    3    Smith

    Full Outer Join
    c_emd5 l_emd5 c_name l_name
    1      Null   Sam    Null
    2      2      Henry  John
    Null   3      Null   Smith

    Intersection from cogo labs and Live works, where c_emd5 and l_emd5 is not null
    c_emd5 l_emd5 c_name l_name
    2      2      Henry  John

    Users only from cogo labs, where c_emd5 is not null and l_emd5 is null
    c_emd5 l_emd5 c_name l_name
    1      Null   Sam    Null

    Users only from live works, where l_emd is not null and c_emd5 is null
    c_emd5 l_emd5 c_name l_name
    Null   3      Null   Smith
    """
    # Intersection: rows where both sides of the full outer join matched.
    common_df = join_df.filter(~join_df.emd5.isNull() & ~join_df.cogo_emd5.isNull())
    logger.info(
        " Creating cogo labs only dataframe where emd5 present in cogo labs and not present in live works"
    )
    cogo_labs_only_df = join_df.filter(~join_df.cogo_emd5.isNull() & join_df.emd5.isNull())
    logger.info(
        " Creating live works only dataframe where emd5 present in liveworks and not present in cogo labs"
    )
    live_works_only_df = join_df.filter(join_df.cogo_emd5.isNull() & ~join_df.emd5.isNull())
    # counting distinct emd5 counts
    # Each .count() is a Spark action that triggers a job.
    intersection_count = common_df.select(
        common_df.cogo_emd5).distinct().count()
    cogo_labs_only_count = cogo_labs_only_df.select(
        cogo_labs_only_df.cogo_emd5).distinct().count()
    live_works_only_count = live_works_only_df.select(
        live_works_only_df.emd5).distinct().count()
    logger.info(
        "Number of Unique users present in both cogo labs and liveworks %s",
        intersection_count)
    logger.info("Number of Unique users present only in cogo labs data %s",
                cogo_labs_only_count)
    logger.info(
        "Number of Unique users present only in live works data %s",
        live_works_only_count)
    # Create common job data frame with users having same job title.
    logger.info(
        "Creating common job data frame where common users have same job title"
    )
    common_job_df = common_df.where(common_df.cogo_job == common_df.job)
    # Cached because it is counted and shown below.
    common_job_df.persist(StorageLevel.DISK_ONLY)
    print("Output with common emd5 users having same job title")
    common_job_df.show()
    common_job_count = common_job_df.count()
    # Calculate percentage common emd5 users have different job titles.
    different_jobs_percent = (
        (intersection_count - common_job_count) / intersection_count) * 100
    # NOTE(review): this log call invokes common_job_df.count() again even
    # though common_job_count already holds the value — a second action on
    # the cached dataframe; consider reusing the variable.
    logger.info(
        "Number of users with common job present in both cogo labs and liveworks %s",
        common_job_df.count())
    logger.info("Percent have different job titles in intersection %s",
                different_jobs_percent)
    # jsonsify data from common data frame
    """
    Create Key:Value pair
    Key = Job title , Value = Company Name
    cogolabs_emd5 cogolabs_job        cogolabs_company liveworks_job liveworks_company
    1             Hotel manager       Bender PLC       Barrister     Brown PLC
    1             Immigration officer Diaz Ltd

    cogolabs_emd5 cogo_labs_c                         liveworks_c
    1             {"Hotel manager":"Bender PLC"}      {"Barrister":"Brown PLC"}
    1             {"Immigration officer":"Diaz Ltd"}
    """
    # NOTE(review): JSON is built by string concatenation; a job or company
    # value containing '"' would yield invalid JSON — confirm upstream
    # sanitization or switch to to_json.
    common_json_df = common_df.withColumn("live_works_c",
                                          concat(lit("{\""), common_df.job,
                                                 lit("\":\""),
                                                 common_df.company,
                                                 lit("\"}"))) \
        .withColumn("cogo_labs_c",
                    concat(lit("{\""), common_df.cogo_job, lit("\":\""),
                           common_df.cogo_company, lit("\"}")))
    """
    Concatenate results to form Array of key value pairs group by emd5
    emd5 cogolabs_json                                                       liveworks_json
    1    [{"Hotel manager":"Bender PLC"},{"Immigration officer":"Diaz Ltd"}] [{"Barrister":"Brown PLC"}]
    """
    # Aggregate the per-row pairs into comma-separated lists per emd5; the
    # withColumnRenamed calls rewrite Spark's auto-generated column names.
    common_agg_df = common_json_df.select(common_json_df.emd5,
                                          common_json_df.cogo_labs_c,
                                          common_json_df.live_works_c) \
        .groupBy(common_json_df.emd5) \
        .agg(concat_ws(",", collect_list(common_json_df.cogo_labs_c)),
             concat_ws(",", collect_list(common_json_df.live_works_c))) \
        .withColumnRenamed("concat_ws(,, collect_list(cogo_labs_c))",
                           "cogo_labs_json") \
        .withColumnRenamed("concat_ws(,, collect_list(live_works_c))",
                           "live_works_json")
    # Wrap the comma-separated object lists in brackets to form JSON arrays.
    final_df = common_agg_df.withColumn("cogo_labs_json",
                                        concat(lit("["),
                                               common_agg_df.cogo_labs_json,
                                               lit("]"))) \
        .withColumn("live_works_json",
                    concat(lit("["), common_agg_df.live_works_json, lit("]")))
    print("Final output with emd5 , cogolabs json and liveworks json")
    final_df.persist(StorageLevel.DISK_ONLY)
    final_df.show()
    # save final output as csv
    # NOTE(review): the existence check uses the literal "data/final_output/"
    # while the save targets os.path.join(DATA_PATH, "final_output") —
    # presumably DATA_PATH == "data"; confirm, otherwise the stale-output
    # cleanup and the save location diverge.
    if os.path.exists("data/final_output/"):
        remove_directory("data/final_output/")
    logger.info("Save final output as csv")
    # repartition(1) forces a single CSV part file.
    final_df.repartition(1).write.format("csv").save(
        os.path.join(DATA_PATH, "final_output"))
    # Release the dataframes cached by self.persist().
    self.unpersist()
def initialize_directories(dock=None):
    """
    Initialize directories for procedure's product files.

    arguments:
        dock (str): path to root or dock directory for source and product
            directories and files

    raises:

    returns:
        (dict<str>): collection of paths to directories for procedure's files

    """

    # Collect paths.
    paths = dict()
    paths["dock"] = dock
    paths["candidacy"] = os.path.join(dock, "candidacy")
    # Remove previous files to avoid version or batch confusion.
    utility.remove_directory(path=paths["candidacy"])
    utility.create_directory(path=paths["candidacy"])
    # Define paths for cohorts of persons.
    for cohort in ["selection", "respiration", "ventilation"]:
        paths[cohort] = dict()
        paths[cohort]["threshold"] = os.path.join(
            paths["candidacy"], cohort, "threshold"
        )
        # Define paths for groups of genes by their distributions.
        paths[cohort]["distribution"] = dict()
        for modality in ["multimodal", "unimodal", "nonmultimodal", "any"]:
            paths[cohort]["distribution"][modality] = os.path.join(
                paths["candidacy"], cohort, "distribution", modality
            )
        # Initialize directories.
        utility.create_directories(path=paths[cohort]["threshold"])
        for modality in ["any", "multimodal", "unimodal", "nonmultimodal"]:
            utility.create_directories(
                path=paths[cohort]["distribution"][modality]
            )
    # Return information.
    return paths
def clean_sdk(self):
    """Delete the previously unpacked SDK directory."""
    sdk_path = self.path
    utility.remove_directory(sdk_path)
def _print_bimodal_expectations():
    # Print the expected metric outcomes for a bimodal normal distribution.
    utility.print_terminal_partition(level=2)
    print("Simulation on 1,000,000 random values with a bimodal normal " +
          "distribution.")
    print("Expectations for bimodal normal distribution...")
    print("skewness = ?")
    print("kurtosis = ?")
    print("bimodality coefficient > 0.55")
    print("dip statistic > 0.05")
    utility.print_terminal_partition(level=3)
    pass


def _report_bimodal_simulation(
    name=None, parameters_one=None, parameters_two=None, dock=None
):
    # Simulate a mixture of two normal distributions and report its metrics.
    # parameters_one/parameters_two supply mean, deviation, and count for
    # each component of the mixture.
    _print_bimodal_expectations()
    series_one = generate_random_values_normal(
        method="random", **parameters_one
    )
    series_two = generate_random_values_normal(
        method="random", **parameters_two
    )
    # Concatenate the two components into a single series.
    series = series_one + series_two
    report_metrics(name=name, series=series, dock=dock)
    utility.print_terminal_partition(level=3)
    pass


def execute_procedure(dock=None):
    """
    Function to execute module's main behavior.

    Tests metrics of modality (skewness, kurtosis, bimodality coefficient,
    dip statistic) on simulated unimodal and bimodal normal distributions
    of 1,000,000 values each.

    arguments:
        dock (str): path to root or dock directory for source and product
            directories and files

    raises:

    returns:

    """

    # Remove previous files.
    # Specify directories and files.
    path_metric = os.path.join(dock, "metric")
    utility.create_directory(path_metric)
    path_figure = os.path.join(path_metric, "figure")
    # NOTE(review): the figure directory is removed here but not recreated;
    # presumably report_metrics recreates it — confirm.
    utility.remove_directory(path=path_figure)

    utility.print_terminal_partition(level=1)
    print("Test of metrics of modality.")

    # Unimodal normal distribution.
    utility.print_terminal_partition(level=2)
    print("Simulation on 1,000,000 random values with a unimodal normal " +
          "distribution.")
    print("Expectations for unimodal normal distribution...")
    print("skewness = 0")
    print("kurtosis = 0")
    print("bimodality coefficient < 0.55")
    print("dip statistic < 0.05")
    utility.print_terminal_partition(level=3)
    series = generate_random_values_normal(
        mean=1.0, deviation=3.0, count=1000000, method="random"
    )
    report_metrics(name="unimodality", series=series, dock=dock)
    utility.print_terminal_partition(level=3)

    # Bimodal normal distributions: separated, well-separated, and
    # well-separated with unequal component sizes.
    _report_bimodal_simulation(
        name="bimodality_1",
        parameters_one=dict(mean=1.0, deviation=1.0, count=500000),
        parameters_two=dict(mean=5.0, deviation=2.0, count=500000),
        dock=dock,
    )
    _report_bimodal_simulation(
        name="bimodality_2",
        parameters_one=dict(mean=1.0, deviation=1.0, count=500000),
        parameters_two=dict(mean=10.0, deviation=2.0, count=500000),
        dock=dock,
    )
    _report_bimodal_simulation(
        name="bimodality_3",
        parameters_one=dict(mean=1.0, deviation=1.0, count=100000),
        parameters_two=dict(mean=10.0, deviation=2.0, count=900000),
        dock=dock,
    )
    pass