def initialize_directories(dock=None):
    """
    Initialize directories for procedure's product files.

    arguments:
        dock (str): path to root or dock directory for source and product
            directories and files

    raises:

    returns:
        (dict<str>): collection of paths to directories for procedure's files

    """

    # Define paths to directories.
    path_permutation = os.path.join(dock, "permutation")
    path_genes = os.path.join(path_permutation, "genes")

    # Remove previous files to avoid version or batch confusion.
    utility.remove_directory(path=path_permutation)

    # Initialize directories.
    utility.create_directory(path_permutation)
    utility.create_directory(path_genes)

    # Collect information.
    paths = dict()
    paths["permutation"] = path_permutation
    paths["genes"] = path_genes

    # Return information.
    return paths
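The examples on this page call a project-local `utility` module whose implementation is not shown. A minimal sketch of the directory helpers they rely on, assuming they simply wrap the standard library (the project's real implementations may differ):

import os
import shutil


def remove_directory(path=None):
    """Remove a directory tree if it exists (assumed behavior of utility.remove_directory)."""
    if os.path.isdir(path):
        shutil.rmtree(path)


def create_directory(path=None):
    """Create a directory, tolerating one that already exists (assumed behavior)."""
    os.makedirs(path, exist_ok=True)


def create_directories(path=None):
    """Create a directory along with any missing parents (assumed behavior)."""
    os.makedirs(path, exist_ok=True)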
def initialize_directories(
    restore=None,
    path_dock=None,
):
    """
    Initialize directories for procedure's product files.

    arguments:
        restore (bool): whether to remove previous versions of data
        path_dock (str): path to dock directory for source and product
            directories and files

    raises:

    returns:
        (dict<str>): collection of paths to directories for procedure's files

    """

    # Collect paths.
    paths = dict()
    # Define paths to directories.
    paths["dock"] = path_dock
    paths["organization"] = os.path.join(path_dock, "organization")
    paths["coombes_polygene"] = os.path.join(
        path_dock, "coombes_prs_gems_gain_mayo_all_2020-10-13")
    # Remove previous files to avoid version or batch confusion.
    if restore:
        utility.remove_directory(path=paths["organization"])
    # Initialize directories.
    utility.create_directories(path=paths["organization"])
    # Return information.
    return paths
Example #3
def initialize_directories(dock=None):
    """
    Initialize directories for procedure's product files.

    arguments:
        dock (str): path to root or dock directory for source and product
            directories and files

    raises:

    returns:
        (dict<str>): collection of paths to directories for procedure's files

    """

    # Collect paths.
    paths = dict()
    # Define paths to directories.
    paths["dock"] = dock
    paths["function"] = os.path.join(paths["dock"], "function")
    # Remove previous files to avoid version or batch confusion.
    utility.remove_directory(path=paths["function"])
    utility.create_directory(path=paths["function"])
    # Return information.
    return paths
def copy_example_files():
    examples_dir = posixpath.join(config.CODE_DIR, 'examples')
    examples_destination = posixpath.join(config.RELEASE_DIR, 'driver', 'examples')

    utility.remove_directory(examples_destination)

    shutil.copytree(examples_dir, examples_destination, ignore=ignored_example_files)

    stdbool_h_path = posixpath.join(examples_dir, 'stdbool.h')
    shutil.copy2(stdbool_h_path, examples_destination)
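The `ignored_example_files` argument above is assumed to follow shutil.copytree's ignore contract: a callable that receives a directory and the names inside it and returns the subset to skip. A hypothetical equivalent built with the standard library (the actual patterns are not shown in this snippet):

import shutil

# Illustrative patterns only; the real project may exclude different files.
ignored_example_files = shutil.ignore_patterns('*.o', '*.obj', 'build')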
Example #5
def execute_procedure(dock=None, count=None):
    """
    Function to execute module's main behavior.

    arguments:
        dock (str): path to root or dock directory for source and product
            directories and files
        count (int): count of shuffles to create and store

    raises:

    returns:

    """

    # Remove previous files to avoid version or batch confusion.
    path_shuffle = os.path.join(dock, "shuffle")
    utility.remove_directory(path=path_shuffle)

    # Read source information from file.
    source = read_source(dock=dock)

    # Report.
    utility.print_terminal_partition(level=3)
    print(
        "Creating " + str(count) + " shuffles for matrices of dimension " +
        "zero: " + str(source["tissues_selection"]) + " by dimension one: " +
        str(source["persons_selection"]) + ". "
        "Notice that shuffles occur across dimension one (tissues for each " +
        "person)."
    )
    print(
        "Hence, values will stay matched to their respective tissues, but " +
        "they will be shuffled with respect to persons."
    )
    utility.print_terminal_partition(level=3)

    # Create shuffle indices.
    shuffles = create_shuffle_indices(
        count=count,
        dimension_zero=source["tissues_selection"],
        dimension_one=source["persons_selection"],
    )

    # Compile information.
    information = {
        "shuffles": shuffles
    }
    # Write product information to file.
    write_product(dock=dock, information=information)

    pass
Example #7
def write_product_sets(dock=None, information=None):
    """
    Writes product information to file.

    arguments:
        dock (str): path to root or dock directory for source and product
            directories and files.
        information (object): information to write to file.

    raises:

    returns:

    """

    # Specify directories and files.
    path_tissue = os.path.join(dock, "tissue")
    utility.create_directory(path_tissue)
    path_sets = os.path.join(path_tissue, "sets")
    # Remove previous files since they change from run to run.
    utility.remove_directory(path=path_sets)
    utility.create_directory(path_sets)
    # Iterate on sets.
    for entry in information["sets"]:
        # Access information.
        tissue = entry["tissue"]
        data_sample = entry["sample"]
        data_gene = entry["gene"]
        # Specify directories and files.
        path_sample = os.path.join(
            path_sets, (tissue + "_samples.tsv")
        )
        path_gene = os.path.join(
            path_sets, (tissue + "_genes.tsv")
        )
        # Write information to file.
        data_sample.to_csv(
            path_or_buf=path_sample,
            sep="\t",
            header=True,
            index=True,
        )
        data_gene.to_csv(
            path_or_buf=path_gene,
            sep="\t",
            header=True,
            index=True,
        )
        pass
    pass
Example #8
def package_release():
    package_name = config.get_serialization_release_name()
    package_path = posixpath.join(config.BUILD_DIR, package_name)

    utility.remove_directory(package_path)
    shutil.copytree(config.RELEASE_DIR, package_path)

    package_format = 'gztar'

    if config.PLATFORM_SYSTEM == 'Windows':
        package_format = 'zip'

    filename = distutils.archive_util.make_archive(package_path, package_format,
                                                   root_dir=config.BUILD_DIR, base_dir=package_name)

    logger.info("Release artifact package filename is \'%s\'", filename)
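distutils.archive_util.make_archive still works on older interpreters, but distutils is deprecated and removed in Python 3.12. shutil.make_archive takes the same leading arguments and also returns the archive filename, so a drop-in sketch using the same config values would be:

import shutil

# Same behavior as the distutils call above, without the deprecated dependency.
filename = shutil.make_archive(package_path, package_format,
                               root_dir=config.BUILD_DIR, base_dir=package_name)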
Example #9
def execute_procedure(dock=None):
    """
    Function to execute module's main behavior.

    arguments:
        dock (str): path to root or dock directory for source and product
            directories and files

    raises:

    returns:

    """

    # Remove previous files to avoid version or batch confusion.
    path_split = os.path.join(dock, "split")
    utility.remove_directory(path=path_split)

    # Read source information from file.
    source = read_source(dock=dock)

    split_report_write_genes_signals(
        cohort="selection",
        persons=source["persons_sets"]["selection"],
        data_samples_tissues_persons=source["data_samples_tissues_persons"],
        data_gene_signal=source["data_gene_signal"],
        path_directory=os.path.join(dock, "split", "selection"),
        report=True,
    )
    split_report_write_genes_signals(
        cohort="respiration",
        persons=source["persons_sets"]["respiration"],
        data_samples_tissues_persons=source["data_samples_tissues_persons"],
        data_gene_signal=source["data_gene_signal"],
        path_directory=os.path.join(dock, "split", "respiration"),
        report=True,
    )
    split_report_write_genes_signals(
        cohort="ventilation",
        persons=source["persons_sets"]["ventilation"],
        data_samples_tissues_persons=source["data_samples_tissues_persons"],
        data_gene_signal=source["data_gene_signal"],
        path_directory=os.path.join(dock, "split", "ventilation"),
        report=True,
    )

    pass
Example #10
def initialize_directories(dock=None):
    """
    Initialize directories for procedure's product files.

    arguments:
        dock (str): path to root or dock directory for source and product
            directories and files

    raises:

    returns:
        (dict<str>): collection of paths to directories for procedure's files

    """

    # Collect paths.
    paths = dict()
    # Define paths to directories.
    paths["dock"] = dock
    paths["stratification"] = os.path.join(paths["dock"], "stratification")
    # Remove previous files to avoid version or batch confusion.
    utility.remove_directory(path=paths["stratification"])
    utility.create_directory(path=paths["stratification"])

    # Define paths for cohorts of persons.
    cohorts = list()
    cohorts.append("selection")
    cohorts.append("respiration")
    cohorts.append("ventilation")
    for cohort in cohorts:
        paths[cohort] = dict()
        paths[cohort]["component"] = os.path.join(paths["stratification"],
                                                  cohort, "component")
        paths[cohort]["regression"] = os.path.join(paths["stratification"],
                                                   cohort, "regression")
        paths[cohort]["summary"] = os.path.join(paths["stratification"],
                                                cohort, "summary")
        # Initialize directories.
        utility.create_directories(path=paths[cohort]["component"])
        utility.create_directories(path=paths[cohort]["regression"])
        utility.create_directories(path=paths[cohort]["summary"])
    # Return information.
    return paths
Example #11
def execute_procedure(dock=None):
    """
    Function to execute module's main behavior.

    arguments:
        dock (str): path to root or dock directory for source and product
            directories and files

    raises:

    returns:

    """

    if False:
        # Remove previous files to avoid version or batch confusion.
        path_expecto = os.path.join(dock, "expecto")
        utility.remove_directory(path=path_expecto)

        path_remote = "http://deepsea.princeton.edu/media/code/expecto/combined_snps.0.3.zip"
        path_local = os.path.join(path_expecto, "combined_snps.0.3.zip")
        utility.remove_file(path_local)
        wget.download(path_remote, path_local)
        utility.decompress_file_gzip(path_local)

    # Read source information from file.
    source = read_source(dock=dock)

    print(source["data_expecto"])

    # Compile information.
    information = {"data_expecto": source["data_expecto"]}

    # Write product information to file.
    write_product(dock=dock, information=information)

    pass
Example #12
    def run(self):
        """
        Run application
        :return:
        """
        join_df = self.join_df
        self.persist()

        # Create dataframes for the intersection, cogo labs only, and liveworks only records.
        logger.info(
            "Creating common dataframe for emd5 values present in both live works and cogo labs"
        )
        """
        Cogo labs
        emd5    Name
        1       Sam
        2       Henry
        
        Liveworks
        emd5    Name
        2       John
        3       Smith
        
        Full Outer Join
        c_emd5    l_emd5   c_name  l_name
        1           Null     Sam     Null
        2           2       Henry   John
        Null        3        Null    Smith
        
        Intersection from cogo labs and Live works, where c_emd5 and l_emd5 are not null
        c_emd5    l_emd5   c_name  l_name
        2           2       Henry   John   
        
        Users only from cogo labs, where c_emd5 is not null and l_emd5 is null
        c_emd5    l_emd5   c_name  l_name
        1           Null     Sam     Null
        
        Users only from live works, where l_emd5 is not null and c_emd5 is null
        c_emd5    l_emd5   c_name  l_name
        Null        3        Null    Smith
        
        """

        common_df = join_df.filter(~join_df.emd5.isNull()
                                   & ~join_df.cogo_emd5.isNull())
        logger.info(
            "Creating cogo labs only dataframe where emd5 is present in cogo labs and not in live works"
        )
        cogo_labs_only_df = join_df.filter(~join_df.cogo_emd5.isNull()
                                           & join_df.emd5.isNull())
        logger.info(
            "Creating live works only dataframe where emd5 is present in live works and not in cogo labs"
        )
        live_works_only_df = join_df.filter(join_df.cogo_emd5.isNull()
                                            & ~join_df.emd5.isNull())

        # counting distinct emd5 counts
        intersection_count = common_df.select(
            common_df.cogo_emd5).distinct().count()
        cogo_labs_only_count = cogo_labs_only_df.select(
            cogo_labs_only_df.cogo_emd5).distinct().count()
        live_works_only_count = live_works_only_df.select(
            live_works_only_df.emd5).distinct().count()

        logger.info(
            "Number of unique users present in both cogo labs and liveworks %s",
            intersection_count)
        logger.info("Number of unique users present only in cogo labs data %s",
                    cogo_labs_only_count)
        logger.info(
            "Number of unique users present only in live works data %s",
            live_works_only_count)

        # Create common job data frame with users having same job title
        logger.info(
            "Creating common job data frame where common users have the same job title"
        )
        common_job_df = common_df.where(common_df.cogo_job == common_df.job)
        common_job_df.persist(StorageLevel.DISK_ONLY)
        print("Output with common emd5 users having same job title")
        common_job_df.show()
        common_job_count = common_job_df.count()

        # Calculate the percentage of common emd5 users that have different job titles
        different_jobs_percent = (
            (intersection_count - common_job_count) / intersection_count) * 100
        logger.info(
            "Number of users with common job present in both cogo labs and liveworks %s",
            common_job_count)
        logger.info("Percent have different job titles in intersection %s",
                    different_jobs_percent)

        # JSONify data from the common data frame
        """
        Create Key:Value pair
        Key = Job title , Value = Company Name
        
        cogolabs_emd5   cogolabs_job    cogolabs_company    liveworks_job   liveworks_company
        1              Hotel manager     Bender PLC          Barrister           Brown PLC
        1            Immigration officer Diaz Ltd                                           
        
        cogolabs_emd5   cogo_labs_c                         liveworks_c
        1               {"Hotel manager":"Bender PLC"}      {"Barrister":"Brown PLC"}
        1               {"Immigration officer":"Diaz Ltd"} 
        """

        common_json_df = common_df.withColumn("live_works_c", concat(lit("{\""), common_df.job, lit("\":\""),
                                                                     common_df.company, lit("\"}"))) \
            .withColumn("cogo_labs_c", concat(lit("{\""), common_df.cogo_job,
                                              lit("\":\""), common_df.cogo_company, lit("\"}")))
        """
        Concatenate results to form Array of key value pairs group by emd5
        emd5            cogolabs_json                                               liveworks_json
        1   [{"Hotel manager":"Bender PLC"},{"Immigration officer":"Diaz Ltd"}]     [{"Barrister":"Brown PLC"}]

        """

        common_agg_df = common_json_df.select(common_json_df.emd5, common_json_df.cogo_labs_c,
                                               common_json_df.live_works_c) \
            .groupBy(common_json_df.emd5).agg(concat_ws(",", collect_list(common_json_df.cogo_labs_c)),
                                                concat_ws(",", collect_list(common_json_df.live_works_c))) \
            .withColumnRenamed("concat_ws(,, collect_list(cogo_labs_c))", "cogo_labs_json") \
            .withColumnRenamed("concat_ws(,, collect_list(live_works_c))", "live_works_json")

        final_df = common_agg_df.withColumn("cogo_labs_json",
                                             concat(lit("["), common_agg_df.cogo_labs_json, lit("]"))) \
            .withColumn("live_works_json", concat(lit("["), common_agg_df.live_works_json, lit("]")))
        print("Final output with emd5 , cogolabs json and liveworks json")
        final_df.persist(StorageLevel.DISK_ONLY)
        final_df.show()

        # save final output as csv

        # Remove any previous final output so the save path is clean and
        # consistent with the location written below.
        final_output_path = os.path.join(DATA_PATH, "final_output")
        if os.path.exists(final_output_path):
            remove_directory(final_output_path)
        logger.info("Save final output as csv")
        final_df.repartition(1).write.format("csv").save(final_output_path)

        self.unpersist()
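Building the per-row JSON by concatenating literal quote characters breaks as soon as a job title or company name itself contains a double quote. A sketch of the same {"job":"company"} pairs built with Spark's own functions, assuming a Spark version (2.4+) whose to_json accepts MapType columns; the column names match those used above:

from pyspark.sql.functions import create_map, to_json

# Each map has the job title as key and the company as value, matching the
# hand-built JSON strings in the example above.
common_json_df = common_df.withColumn(
    "live_works_c", to_json(create_map(common_df.job, common_df.company))
).withColumn(
    "cogo_labs_c", to_json(create_map(common_df.cogo_job, common_df.cogo_company))
)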
Example #13
def initialize_directories(dock=None):
    """
    Initialize directories for procedure's product files.

    arguments:
        dock (str): path to root or dock directory for source and product
            directories and files

    raises:

    returns:
        (dict<str>): collection of paths to directories for procedure's files

    """

    # Collect paths.
    paths = dict()
    # Define paths to directories.
    paths["dock"] = dock
    paths["candidacy"] = os.path.join(paths["dock"], "candidacy")
    # Remove previous files to avoid version or batch confusion.
    utility.remove_directory(path=paths["candidacy"])
    utility.create_directory(path=paths["candidacy"])
    # Define paths for cohorts of persons.
    cohorts = list()
    cohorts.append("selection")
    cohorts.append("respiration")
    cohorts.append("ventilation")
    for cohort in cohorts:
        paths[cohort] = dict()
        paths[cohort]["threshold"] = os.path.join(
            paths["candidacy"], cohort, "threshold"
        )
        # Define paths for groups of genes by their distributions.
        paths[cohort]["distribution"] = dict()
        paths[cohort]["distribution"]["multimodal"] = os.path.join(
            paths["candidacy"], cohort, "distribution", "multimodal"
        )
        paths[cohort]["distribution"]["unimodal"] = os.path.join(
            paths["candidacy"], cohort, "distribution", "unimodal"
        )
        paths[cohort]["distribution"]["nonmultimodal"] = os.path.join(
            paths["candidacy"], cohort, "distribution", "nonmultimodal"
        )
        paths[cohort]["distribution"]["any"] = os.path.join(
            paths["candidacy"], cohort, "distribution", "any"
        )
        # Initialize directories.
        utility.create_directories(path=paths[cohort]["threshold"])
        utility.create_directories(
            path=paths[cohort]["distribution"]["any"]
        )
        utility.create_directories(
            path=paths[cohort]["distribution"]["multimodal"]
        )
        utility.create_directories(
            path=paths[cohort]["distribution"]["unimodal"]
        )
        utility.create_directories(
            path=paths[cohort]["distribution"]["nonmultimodal"]
        )
    # Return information.
    return paths
Example #14
    def clean_sdk(self):
        """Deletes SDK that has been unpacked."""
        utility.remove_directory(self.path)
Example #15
def execute_procedure(dock=None):
    """
    Function to execute module's main behavior.

    arguments:
        dock (str): path to root or dock directory for source and product
            directories and files

    raises:

    returns:

    """

    # Remove previous files.
    # Specify directories and files.
    path_metric = os.path.join(dock, "metric")
    utility.create_directory(path_metric)
    path_figure = os.path.join(path_metric, "figure")
    utility.remove_directory(path=path_figure)

    # Read source information from file.
    #source = read_source(dock=dock)

    utility.print_terminal_partition(level=1)
    print("Test of metrics of modality.")

    # Unimodal normal distribution.

    utility.print_terminal_partition(level=2)
    print("Simulation on 1,000,000 random values with a unimodal normal " +
          "distribution.")
    print("Expectations for unimodal normal distribution...")
    print("skewness = 0")
    print("kurtosis = 0")
    print("bimodality coefficient < 0.55")
    print("dip statistic < 0.05")
    utility.print_terminal_partition(level=3)
    # Generate random values with a normal distribution.
    series = generate_random_values_normal(mean=1.0,
                                           deviation=3.0,
                                           count=1000000,
                                           method="random")
    report_metrics(name="unimodality", series=series, dock=dock)
    utility.print_terminal_partition(level=3)

    # Bimodal normal distribution 1.
    utility.print_terminal_partition(level=2)
    print("Simulation on 1,000,000 random values with a bimodal normal " +
          "distribution.")
    print("Expectations for bimodal normal distribution...")
    print("skewness = ?")
    print("kurtosis = ?")
    print("bimodality coefficient > 0.55")
    print("dip statistic > 0.05")
    utility.print_terminal_partition(level=3)
    # Generate random values with a normal distribution.
    series_one = generate_random_values_normal(mean=1.0,
                                               deviation=1.0,
                                               count=500000,
                                               method="random")
    series_two = generate_random_values_normal(mean=5.0,
                                               deviation=2.0,
                                               count=500000,
                                               method="random")
    #series_one.extend(series_two)
    series = series_one + series_two
    report_metrics(name="bimodality_1", series=series, dock=dock)
    utility.print_terminal_partition(level=3)

    # Bimodal normal distribution 2.
    utility.print_terminal_partition(level=2)
    print("Simulation on 1,000,000 random values with a bimodal normal " +
          "distribution.")
    print("Expectations for bimodal normal distribution...")
    print("skewness = ?")
    print("kurtosis = ?")
    print("bimodality coefficient > 0.55")
    print("dip statistic > 0.05")
    utility.print_terminal_partition(level=3)
    # Generate random values with a normal distribution.
    series_one = generate_random_values_normal(mean=1.0,
                                               deviation=1.0,
                                               count=500000,
                                               method="random")
    series_two = generate_random_values_normal(mean=10.0,
                                               deviation=2.0,
                                               count=500000,
                                               method="random")
    #series_one.extend(series_two)
    series = series_one + series_two
    report_metrics(name="bimodality_2", series=series, dock=dock)
    utility.print_terminal_partition(level=3)

    # Bimodal normal distribution 3.
    utility.print_terminal_partition(level=2)
    print("Simulation on 1,000,000 random values with a bimodal normal " +
          "distribution.")
    print("Expectations for bimodal normal distribution...")
    print("skewness = ?")
    print("kurtosis = ?")
    print("bimodality coefficient > 0.55")
    print("dip statistic > 0.05")
    utility.print_terminal_partition(level=3)
    # Generate random values with a normal distribution.
    series_one = generate_random_values_normal(mean=1.0,
                                               deviation=1.0,
                                               count=100000,
                                               method="random")
    series_two = generate_random_values_normal(mean=10.0,
                                               deviation=2.0,
                                               count=900000,
                                               method="random")
    #series_one.extend(series_two)
    series = series_one + series_two
    report_metrics(name="bimodality_3", series=series, dock=dock)
    utility.print_terminal_partition(level=3)

    # Compile information.
    information = {}
    # Write product information to file.
    # write_product(dock=dock, information=information)

    pass
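generate_random_values_normal is project-specific and not shown here. Because the results are concatenated with `+` above, it presumably returns a plain Python list; a minimal sketch of that assumed behavior with numpy (the `method` parameter is accepted but ignored in this sketch):

import numpy as np


def generate_random_values_normal(mean=None, deviation=None, count=None, method="random"):
    """Draw `count` values from a normal distribution and return them as a list (assumed behavior)."""
    return np.random.normal(loc=mean, scale=deviation, size=count).tolist()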