Example #1
    def test_init_hail_context_twice(self):
        hl.init(idempotent=True)  # Should be no error
        hl.stop()

        hl.init(idempotent=True)
        hl.experimental.define_function(lambda x: x + 2, hl.tint32)
        # ensure functions are cleaned up without error
        hl.stop()

        hl.init(idempotent=True)  # Should be no error
        hl.init(hl.spark_context(), idempotent=True)  # Should be no error
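The pattern above works because idempotent=True makes a repeated hl.init call a no-op rather than an error. A minimal standalone sketch of the same idea, assuming only the hl.init / hl.stop calls shown in the example (the ensure_hail helper name is hypothetical):

import hail as hl

def ensure_hail():
    # With idempotent=True, calling init again on a running backend is a no-op.
    hl.init(idempotent=True)

ensure_hail()
ensure_hail()  # safe: the second call should not raise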
Example #2
 def test_init_hail_context_twice(self):
     hl.init(hl.spark_context(), idempotent=True) # Should be no error
Example #3
def create_meta(related_data: GnomADRelatedData, fake_fam_prop: float,
                old_version: str, overwrite: bool) -> None:
    """
    Creates and writes a dataframe with metadata to evaluate gnomAD trios from the raw ped file.
    In order to compare against the raw ped, metadata is also generated for:
    1) A number of generated fake families
    2) The previous iteration of the ped file (old_version)

    :param GnomADRelatedData related_data: Input data
    :param float fake_fam_prop: Number of fake trios to generate, as a proportion of the number of real complete trios in the data
    :param str old_version: Version of previous iteration to load
    :param bool overwrite: Whether to overwrite previous data
    :return: Nothing
    :rtype: None
    """

    raw_ped = hl.Pedigree.read(raw_fam_path(related_data.data_type),
                               delimiter="\\t")

    n_fake_trios = int(fake_fam_prop * len(raw_ped.complete_trios()))
    logger.info(
        f"Generating fake pedigree with {n_fake_trios} trios for {related_data.data_type}"
    )
    fake_fams = create_fake_pedigree(n_fake_trios,
                                     list(related_data.meta_pd.s), raw_ped)

    fake_fams.write(fake_fam_path(related_data.data_type))

    logger.info(f"Running mendel_errors on {related_data.data_type}")

    # Run mendel errors on families made of random samples to establish expectation in non-trios:
    pedigrees = [('new', raw_ped),
                 ('old',
                  hl.Pedigree.read(fam_path(related_data.data_type,
                                            version=old_version),
                                   delimiter="\\t")),
                 ('fake',
                  hl.Pedigree.read(fake_fam_path(related_data.data_type),
                                   delimiter="\\t"))]

    ped_pd = merge_pedigree_pandas([(name, ped_to_pandas(ped))
                                    for name, ped in pedigrees],
                                   related_data.sample_to_dups, True)

    # Run mendel_errors
    all_ped = pandas_to_ped(ped_pd)
    gnomad = get_gnomad_data(related_data.data_type)
    fam_samples = hl.literal({
        s
        for trio in all_ped.trios for s in [trio.s, trio.mat_id, trio.pat_id]
    })
    gnomad = gnomad.filter_cols(fam_samples.contains(gnomad.s))
    all_errors, per_fam, per_sample, _ = hl.mendel_errors(
        gnomad['GT'], all_ped)

    all_errors.write(sample_qc_mendel_ht_path(related_data.data_type,
                                              "all_errors"),
                     overwrite=overwrite)
    per_fam.write(sample_qc_mendel_ht_path(related_data.data_type, "per_fam"),
                  overwrite=overwrite)
    per_sample.write(sample_qc_mendel_ht_path(related_data.data_type,
                                              "per_sample"),
                     overwrite=overwrite)

    # Merge all metadata
    ped_pd = add_pedigree_meta(ped_pd=ped_pd,
                               meta_pd=related_data.meta_pd,
                               kin_ht=related_data.kin_ht,
                               mendel_per_sample_ht=per_sample)

    # Write merged pedigrees as HT
    sql_context = SQLContext(hl.spark_context())
    hl.Table.from_spark(sql_context.createDataFrame(ped_pd)).write(
        merged_pedigrees_ht_path(related_data.data_type), overwrite=overwrite)
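For orientation, a hedged sketch of how create_meta might be driven; get_related_data is a hypothetical helper standing in for however the GnomADRelatedData object is built upstream, and the argument values are placeholders rather than values from the source:

related_data = get_related_data("exomes")  # hypothetical helper returning a GnomADRelatedData
create_meta(related_data,
            fake_fam_prop=0.1,    # placeholder: fake trios as 10% of the real complete trios
            old_version="2.0.2",  # placeholder: previous ped iteration to compare against
            overwrite=True)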
Example #4
 def test_init_hail_context_twice(self):
     hl.init(idempotent=True)  # Should be no error
     hl.stop()
     hl.init(idempotent=True)  # Should be no error
     hl.init(hl.spark_context(), idempotent=True)  # Should be no error
Example #5
        print(
            "Error! One of --genes, --variants, or --variant_list must be given!"
        )
        exit()

    if (args.pheno_col is not None) and (args.female_col is None):
        print("Error! if giving --pheno_col, --female_col must also be given")

    args.output_stem = os.path.join(args.output_dir, args.output_name)

    ##########################
    # Import python scripts  #
    ##########################
    scripts = ["variant_annotation.py", "find_putative_causal_variants.py"]
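    # addPyFile ships each local script to the Spark driver and executors,
    # so the imports below resolve on the cluster as well as locally.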
    for script in scripts:
        hl.spark_context().addPyFile(os.path.join(args.scripts_dir, script))

    import variant_annotation as va
    from find_putative_causal_variants import count_case_control_carriers

    ########################
    # Load in matrix table #
    ########################
    fullmt = hl.read_matrix_table(args.mt)

    ####################################
    # Check if variant annotation done #
    ####################################
    try:
        fullmt.gene.describe()
    except Exception as e:
Example #6
def main(sqlContext, configuration, chrom, nchroms, step):
    call(["ls", "-l"])

    if (chrom == "" or step == ""):
        usage()
        sys.exit(2)

    destination = configuration["destination"] + "/" + configuration["version"]
    sourceFileName = utils.buildFileName(configuration["source_path"], chrom)
    fileName = "variants" + chrom + ".ht"
    fileNameCnv = "variants.ht"
    number_partitions = configuration["number_of_partitions"]
    current_dir = utils.buildFileName(configuration["origin_path"], chrom)

    print("sourcefilename is " + sourceFileName)

    # Pipeline steps

    if ("createIndex" in step):
        if ("createIndexCNV" in step):
            print("step to create index CNV")
            index.create_index_cnv(
                configuration["elasticsearch"]["host"],
                configuration["elasticsearch"]["port"],
                configuration["elasticsearch"]["index_cnv_name"],
                configuration["version"],
                configuration["elasticsearch"]["num_shards"],
                configuration["elasticsearch"]["num_replicas"],
                configuration["elasticsearch"]["user"],
                configuration["elasticsearch"]["pwd"])
        else:
            print("step to create index")
            index.create_index_snv(
                configuration["elasticsearch"]["host"],
                configuration["elasticsearch"]["port"],
                configuration["elasticsearch"]["index_name"],
                configuration["version"],
                configuration["elasticsearch"]["num_shards"],
                configuration["elasticsearch"]["num_replicas"],
                configuration["elasticsearch"]["user"],
                configuration["elasticsearch"]["pwd"])

    if ("loadGermline" in step):
        print("step loadGermline")
        annotations.importGermline(hl, current_dir, sourceFileName,
                                   destination + "/loaded/" + fileName,
                                   number_partitions)
        current_dir = destination + "/loaded/" + "variants" + chrom + ".ht"

    if ("loadSomatic" in step):
        print("step loadSomatics")
        print("Somatics list path: " +
              utils.buildFileName(configuration["somatic_paths"], chrom))
        # Read somatic vcf file
        sc = hl.spark_context()
        somatic_paths = sc.textFile(
            utils.buildFileName(configuration["somatic_paths"],
                                chrom)).collect()
        # Import and merge somatic files
        annotations.importSomatic(hl, current_dir, somatic_paths,
                                  destination + "/loadedSomatic/" + fileName,
                                  number_partitions)
        current_dir = destination + "/loadedSomatic/" + fileName

    if ("loadCNV" in step):
        print("step loadCNV")
        annotations.loadCNV(hl, configuration["source_path_cnv"],
                            destination + "/loadedCNV/" + fileNameCnv,
                            number_partitions)

    if ("loaddbNSFP" in step):
        print("step loaddbNSFP")
        annotations.importDbNSFPTable(
            hl, utils.buildFileName(configuration["dbNSFP_Raw"], chrom),
            utils.buildFileName(configuration["dnNSFP_path"], chrom),
            number_partitions)

    if ("loadcadd" in step):
        print("step loadCADD")
        annotations.importDBVcf(
            hl, utils.buildFileName(configuration["cadd_Raw"], chrom),
            utils.buildFileName(configuration["cadd_path"], chrom),
            number_partitions)

    if ("loadclinvar" in step):
        print("step loadclinvar")
        annotations.importDBVcf(
            hl, utils.buildFileName(configuration["clinvar_Raw"], ""),
            utils.buildFileName(configuration["clinvar_path"], ""),
            number_partitions)

    if ("loadExomesGnomad" in step):
        print("step load exomes gnomad")
        annotations.importDBVcf(
            hl, utils.buildFileName(configuration["exomesGnomad_Raw"], chrom),
            utils.buildFileName(configuration["exomesGnomad_path"], chrom),
            number_partitions)

    if ("loadExAC" in step):
        print("step load ExAC")
        annotations.importDBVcf(
            hl, utils.buildFileName(configuration["ExAC_Raw"], chrom),
            utils.buildFileName(configuration["ExAC_path"], chrom),
            number_partitions)

    if ("loadCGI" in step):
        print("step load CGI")
        annotations.importCGITable(
            hl, utils.buildFileName(configuration["CGI_Raw"], ""),
            utils.buildFileName(configuration["CGI_path"], ""),
            number_partitions)

    if ("annotateCGI" in step):
        print("step annotate CGI")
        variants = hl.read_table(current_dir)
        annotations.annotateCGI(
            hl, variants, utils.buildFileName(configuration["CGI_path"],
                                              chrom),
            destination + "/annotatedCGI/" + fileName)
        current_dir = destination + "/annotatedCGI/" + fileName

    if ("annotateVEP" in step):
        print("step annotate VEP")
        print("source file is " + current_dir)
        variants = hl.read_table(current_dir)
        annotations.annotateVEP(hl, variants,
                                destination + "/annotatedVEP/" + fileName,
                                configuration["vep"], number_partitions)

    if ("annotatedbNSFP" in step):
        print("step annotate dbNSFP")
        variants = hl.read_table(destination + "/annotatedVEP/" + fileName)
        annotations.annotateDbNSFP(
            hl, variants,
            utils.buildFileName(configuration["dnNSFP_path"], chrom),
            destination + "/annotatedVEPdbnSFP/" + fileName)

    if ("annotatecadd" in step):
        print("step annotate dbcadd")
        variants = hl.read_table(destination + "/annotatedVEPdbnSFP/" +
                                 fileName)
        annotations.annotateCADD(
            hl, variants, utils.buildFileName(configuration["cadd_path"],
                                              chrom),
            destination + "/annotatedVEPdbnSFPCadd/" + fileName)

    if ("annotateclinvar" in step):
        print("step annotate clinvar")
        variants = hl.read_table(destination + "/annotatedVEPdbnSFPCadd/" +
                                 fileName)
        annotations.annotateClinvar(
            hl, variants, utils.buildFileName(configuration["clinvar_path"],
                                              ""),
            destination + "/annotatedVEPdbnSFPCaddClinvar/" + fileName)

    if ("annotateExomesGnomad" in step):
        print("step annotate exomes gnomad")
        variants = hl.read_table(destination +
                                 "/annotatedVEPdbnSFPCaddClinvar/" + fileName)
        annotations.annotateGnomADEx(
            hl, variants,
            utils.buildFileName(configuration["exomesGnomad_path"], chrom),
            destination + "/annotatedVEPdbnSFPCaddClinvarExGnomad/" + fileName)

    if ("annotateExAC" in step):
        print("step annotate ExAC")
        variants = hl.read_table(destination +
                                 "/annotatedVEPdbnSFPCaddClinvarExGnomad/" +
                                 fileName)
        annotations.annotateExAC(
            hl, variants, utils.buildFileName(configuration["ExAC_path"],
                                              chrom), destination +
            "/annotatedVEPdbnSFPCaddClinvarExGnomadExAC/" + fileName)

    # Transforming step. It sets all fields to the corresponding ElasticSearch format
    if ("transform" in step):
        print("step transform")
        annotated = hl.read_table(
            destination + "/annotatedVEPdbnSFPCaddClinvarExGnomadExAC/" +
            fileName)
        transform.transform(annotated, destination, chrom)

    # Uploading step. It uploads all annotated variants to ElasticSearch
    if ("toElastic" in step):
        print("step to elastic")
        es_conf = {
            "es.net.http.auth.user": configuration["elasticsearch"]["user"],
            "es.net.http.auth.pass": configuration["elasticsearch"]["pwd"],
            "es.nodes": configuration["elasticsearch"]["host"],
            "es.port": configuration["elasticsearch"]["port"]
        }
        #print(es_conf)
        index_name = configuration["elasticsearch"]["index_name"]
        if ("toElasticCNV" in step):
            print("step toElasticCNV")
            variants = hl.read_table(destination + "/loadedCNV/" +
                                     fileNameCnv).to_spark()
            variants = variants.withColumn("chrom", variants["chrom"].cast(IntegerType())) \
                               .withColumn("start", variants["start"].cast(IntegerType())) \
                               .withColumn("end", variants["end"].cast(IntegerType())) \
                               .withColumn("cnt", variants["cnt"].cast(IntegerType())) \
                               .withColumn("bf", variants["bf"].cast(FloatType())) \
                               .withColumn("omim_number", variants["omim_number"].cast(IntegerType())) \
                               .withColumn("tool",lit("ExomeDepth"))
            index_name = configuration["elasticsearch"]["index_cnv_name"]
            variants.printSchema()
        else:
            # Getting annotated variants and adding the chromosome column
            variants = sqlContext.read.load(destination+"/variants/chrom="+chrom)\
                                      .withColumn("chrom",lit(chrom))
            variants.printSchema()
        variants.write.format("org.elasticsearch.spark.sql").options(
            **es_conf).save(index_name + "/" + configuration["version"],
                            mode='append')

    # Counting step to check whether the number of variants in Spark corresponds to the number of variants that
    # have been uploaded to ElasticSearch
    if ("count" in step):
        if (nchroms == ""):
            usage()
            sys.exit(2)
        count = 0
        for chrom in range(1, int(nchroms) + 1):
            variants = sqlContext.read.load(destination + "/variants/chrom=" +
                                            str(chrom))
            count += variants.count()
        print("\nTotal number of variants: " + str(count) + "\n")
Example #7
    # Counting step to check whether the number of variants in Spark corresponds to the number of variants that
    # have been uploaded to ElasticSearch
    if ("count" in step):
        if (nchroms == ""):
            usage()
            sys.exit(2)
        count = 0
        for chrom in range(1, int(nchroms) + 1):
            variants = sqlContext.read.load(destination + "/variants/chrom=" +
                                            str(chrom))
            count += variants.count()
        print("\nTotal number of variants: " + str(count) + "\n")


if __name__ == "__main__":
    # Command line options parsing
    chrom, path, nchroms, step, cores = optionParser(sys.argv[1:])
    main_conf = config.readConfig(path)
    spark_conf = SparkConf().setAppName(APP_NAME).set('spark.executor.cores',
                                                      cores)
    spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()
    spark.sparkContext._jsc.hadoopConfiguration().setInt(
        "dfs.block.size", main_conf["dfs_block_size"])
    spark.sparkContext._jsc.hadoopConfiguration().setInt(
        "parquet.block.size", main_conf["dfs_block_size"])
    hl.init(spark.sparkContext)
    sqlContext = SQLContext(hl.spark_context())
    # Execute Main functionality
    main(sqlContext, main_conf, chrom, nchroms, step)
    parser.add_argument("--chr_prefix", action='store_true', help="Chromosomes are of form 'chr1', NOT '1' etc.")
    parser.add_argument("--force_bgz", action='store_true', help="Force blog gzip import? Default true.")
    parser.add_argument("--call_fields", default="PGT", help="Name of genotype call field in VCF, default PGT.")
    parser.add_argument("--test", action='store_true', help="Filters data to just chr 22 for testing purposes.")

    args = parser.parse_args()

    ##################
    # Import scripts #
    ##################
    hl.init()

    scripts = ["helper_scripts.py"]

    for script in scripts:
        hl.spark_context().addPyFile(args.scripts_dir + script)

    import helper_scripts as h

    #####################################
    # Configure logging, define outputs #
    #####################################
    logstem = 'import_vep_annotate-'
    datestr, timestr, log_file = h.configure_logging(logstem=logstem)

    log_dir = os.path.join(args.log_dir, logstem + datestr)

    # Configure logger
    root = logging.getLogger()
    log_formatter = '%(asctime)s - %(levelname)s - %(message)s'
    logging.basicConfig(filename=log_file, format=log_formatter, level=logging.INFO)