Esempio n. 1
0
def load(parser, args):
    """Entry point for the ``load`` command.

    Checks that both a VCF file and a database file were supplied,
    force-skips the CADD and GERP-per-bp annotations when their data
    files are not among the installed annotation files, and then hands
    off to the loader matching the requested execution mode.

    Args:
        parser: argument parser; only ``print_help()`` is called on it.
        args: parsed command-line namespace. ``args.skip_cadd`` and
            ``args.skip_gerp_bp`` may be set to True in place.

    Raises:
        SystemExit: if ``args.db`` or ``args.vcf`` is None.
    """
    if args.db is None or args.vcf is None:
        parser.print_help()
        # Use sys.exit rather than the ``exit`` helper injected by the
        # ``site`` module, which is not guaranteed to exist (python -S).
        sys.exit("ERROR: load needs both a VCF file and a database file\n")

    annos = annotations.get_anno_files(args)

    # Force skipping CADD and GERP if the data files have not been installed.
    # The ``is False`` test is deliberate: only an explicit False triggers
    # the check.
    if args.skip_cadd is False:
        if 'cadd_score' not in annos:
            sys.stderr.write(
                "\nCADD scores are not being loaded because the"
                " annotation file could not be found.\n"
                "Run `gemini update --dataonly --extra cadd_score`"
                " to install the annotation file.\n\n")
            args.skip_cadd = True
        else:
            sys.stderr.write(
                "CADD scores are being loaded (to skip use:--skip-cadd).\n")

    if args.skip_gerp_bp is False:
        if 'gerp_bp' not in annos:
            sys.stderr.write(
                "\nGERP per bp is not being loaded because the annotation file"
                " could not be found.\n    Run `gemini update --dataonly --extra gerp_bp`"
                " to install the annotation file.\n\n")
            args.skip_gerp_bp = True
        else:
            sys.stderr.write(
                "GERP per bp is being loaded (to skip use:--skip-gerp-bp).\n")

    # Collect all of the additional annotation files.
    annotations.load_annos(args)

    # Pick the loader: a scheduler takes precedence over a multicore run,
    # which takes precedence over the single-core default.
    if args.scheduler:
        load_ipython(args)
    elif args.cores > 1:
        load_multicore(args)
    else:
        load_singlecore(args)
Esempio n. 2
0
def load(parser, args):
    """Run the ``load`` command: validate input, then start a loader.

    Both a VCF file and a database file are required. CADD and GERP
    per-bp loading is switched off (with a note on stderr) when the
    matching annotation file is not installed. The actual load is
    delegated to the scheduler, multicore, or single-core loader.

    Args:
        parser: argument parser; only ``print_help()`` is called on it.
        args: parsed command-line namespace. ``args.skip_cadd`` and
            ``args.skip_gerp_bp`` may be flipped to True in place.

    Raises:
        SystemExit: if ``args.db`` or ``args.vcf`` is None.
    """
    if args.db is None or args.vcf is None:
        parser.print_help()
        # sys.exit is used instead of the interactive ``exit`` helper,
        # which only exists when the ``site`` module has been loaded.
        sys.exit("ERROR: load needs both a VCF file and a database file\n")

    annos = annotations.get_anno_files(args)

    # Force skipping CADD and GERP if the data files have not been installed.
    # Only an explicit False (not None or another falsy value) is checked.
    if args.skip_cadd is False:
        if 'cadd_score' not in annos:
            sys.stderr.write(
                "\nCADD scores are not being loaded because the"
                " annotation file could not be found.\n"
                "Run `gemini update --dataonly --extra cadd_score`"
                " to install the annotation file.\n\n")
            args.skip_cadd = True
        else:
            sys.stderr.write(
                "CADD scores are being loaded (to skip use:--skip-cadd).\n")

    if args.skip_gerp_bp is False:
        if 'gerp_bp' not in annos:
            sys.stderr.write(
                "\nGERP per bp is not being loaded because the annotation file"
                " could not be found.\n    Run `gemini update --dataonly --extra gerp_bp`"
                " to install the annotation file.\n\n")
            args.skip_gerp_bp = True
        else:
            sys.stderr.write(
                "GERP per bp is being loaded (to skip use:--skip-gerp-bp).\n")

    # Collect all of the additional annotation files.
    annotations.load_annos(args)

    # Dispatch order: scheduler first, then multicore, else single core.
    if args.scheduler:
        load_ipython(args)
    elif args.cores > 1:
        load_multicore(args)
    else:
        load_singlecore(args)
Esempio n. 3
0
def load(parser, args):
    """Entry point for the ``load`` command, with per-phase timing.

    Requires a VCF file, force-skips the CADD and GERP-per-bp
    annotations when their data files are not installed, optionally
    performs database setup on node 1, runs the multicore or single-core
    loader, records the resulting variant count, and reports how long
    each phase took.

    Args:
        parser: argument parser; only ``print_help()`` is called on it.
        args: parsed command-line namespace. ``args.skip_cadd`` and
            ``args.skip_gerp_bp`` may be set to True in place.

    Raises:
        SystemExit: if ``args.vcf`` is None.
    """
    # Only the VCF is mandatory here; the database-file check that used to
    # accompany it is intentionally not performed.
    if args.vcf is None:
        parser.print_help()
        # Use sys.exit rather than the ``exit`` helper injected by ``site``.
        sys.exit("ERROR: load needs a VCF file\n")

    start_time = time.time()
    annos = annotations.get_anno_files(args)

    # Force skipping CADD and GERP if the data files have not been installed.
    if args.skip_cadd is False:
        if 'cadd_score' not in annos:
            sys.stderr.write(
                "\nCADD scores are not being loaded because the"
                " annotation file could not be found.\n"
                "Run `geminicassandra update --dataonly --extra cadd_score`"
                " to install the annotation file.\n\n")
            args.skip_cadd = True
        else:
            sys.stderr.write(
                "CADD scores are being loaded (to skip use:--skip-cadd).\n")

    if args.skip_gerp_bp is False:
        if 'gerp_bp' not in annos:
            sys.stderr.write(
                "\nGERP per bp is not being loaded because the annotation file"
                " could not be found.\n    Run `geminicassandra update --dataonly --extra gerp_bp`"
                " to install the annotation file.\n\n")
            args.skip_gerp_bp = True
        else:
            sys.stderr.write(
                "GERP per bp is being loaded (to skip use:--skip-gerp-bp).\n")

    # Collect all of the additional annotation files.
    annotations.load_annos(args)

    # Phase boundaries default to start_time so that, when the node-1 setup
    # below is skipped, the setup durations come out as zero.
    time_2 = start_time
    time_3 = start_time

    # Database setup and the single-core preparation run on node 1 only.
    if args.node_num == 1:
        gemini_loader = GeminiLoader(args)
        gemini_loader.setup_db()
        time_2 = time.time()
        gemini_loader.single_core_stuff()
        time_3 = time.time()

    if args.cores > 1:
        n_variants = load_multicore(args)
    else:
        n_variants = load_singlecore(args)

    # contact_points is a comma-separated string; every entry goes through
    # ``strip`` before being handed over together with the variant count.
    insert_n_variants(map(strip, args.contact_points.split(',')),
                      args.keyspace, n_variants)

    end_time = time.time()
    total_time = str(end_time - start_time)
    db_creation_time = str(time_2 - start_time)
    single_core_time = str(time_3 - time_2)
    parallel_time = str(end_time - time_3)
    # Parenthesised single-argument form prints identically under the
    # Python 2 print statement and the Python 3 print function.
    print("Finished loading in %s s" % total_time)
    if args.timing_log is not None:
        # Append one CSV row per run:
        # exp_id,total,db_creation,single_core,parallel
        with open(args.timing_log, "a") as myfile:
            myfile.write(",".join([
                args.exp_id, total_time, db_creation_time,
                single_core_time, parallel_time,
            ]) + "\n")
Esempio n. 4
0
    def __init__(self, args, buffer_size=10000, prepare_db=True):
        """Open the VCF, determine the effect fields and optionally
        prepare the database.

        Args:
            args: parsed command-line namespace, kept as ``self.args``.
            buffer_size: stored as ``self.buffer_size`` (only when
                ``prepare_db`` is true).
            prepare_db: when false, return right after the VCF reader and
                effect-field attributes are set; no database is created
                and none of the later attributes are assigned.
        """
        self.args = args
        self.seen_multi = False

        # Reader for the VCF file.
        self.vcf_reader = self._get_vcf_reader()

        # Field names (lower-case) that never become extra "vep_*" columns.
        known_fields = "consequence,codons,amino_acids,gene,symbol,feature,exon,polyphen,sift,protein_position,biotype,warning".split(
            ",")

        # Effect fields are only read from the VCF for VEP annotations.
        uses_vep = self.args.anno_type == "VEP"
        self._effect_fields = (
            self._get_vep_csq(self.vcf_reader) if uses_vep else [])
        # Tuples of (db_column, CSQ name); empty when there are no
        # effect fields.
        self._extra_effect_fields = [
            ("vep_%s" % name.lower(), name)
            for name in self._effect_fields
            if name.lower() not in known_fields
        ]

        if not prepare_db:
            return

        # Create the database, passing the extra column names.
        extra_columns = [column for column, _ in self._extra_effect_fields]
        self._create_db(extra_columns)

        if self.args.no_genotypes or self.args.no_load_genotypes:
            self.num_samples = 0
        else:
            # Load the sample info from the VCF file, then initialize
            # genotype counts for each sample.
            self._prepare_samples()
            self._init_sample_gt_counts()
            self.num_samples = len(self.samples)

        anno_files = annotations.get_anno_files(self.args)
        self.clinvar_chrom_gene_lookup = load_clinvar(anno_files['clinvar'])

        self.buffer_size = buffer_size
        self._get_anno_version()

        if not args.skip_gene_tables:
            self._get_gene_detailed()
            self._get_gene_summary()
Esempio n. 5
0
    def __init__(self, args, buffer_size=10000, prepare_db=True):
        """Initialise the loader from parsed arguments.

        A VCF reader is always created and the effect-field attributes
        are always set. Everything else (database creation, sample
        preparation, ClinVar lookup, annotation version, gene tables)
        happens only when ``prepare_db`` is true.

        Args:
            args: parsed command-line namespace, kept as ``self.args``.
            buffer_size: stored as ``self.buffer_size``.
            prepare_db: set to False to stop after the reader and
                effect-field setup.
        """
        self.args = args
        self.seen_multi = False

        # Create a reader for the VCF file.
        self.vcf_reader = self._get_vcf_reader()

        # Lower-case field names excluded from the extra "vep_*" columns.
        standard_fields = "consequence,codons,amino_acids,gene,symbol,feature,exon,polyphen,sift,protein_position,biotype,warning".split(",")

        vep_annotated = self.args.anno_type == "VEP"
        if vep_annotated:
            csq_fields = self._get_vep_csq(self.vcf_reader)
        else:
            csq_fields = []
        self._effect_fields = csq_fields

        # Tuples of (db_column, CSQ name); stays empty without effect fields.
        self._extra_effect_fields = [
            ("vep_%s" % field.lower(), field)
            for field in self._effect_fields
            if field.lower() not in standard_fields
        ]

        if not prepare_db:
            return

        # Create the database with one extra column per extra effect field.
        self._create_db([pair[0] for pair in self._extra_effect_fields])

        skip_genotypes = (self.args.no_genotypes
                          or self.args.no_load_genotypes)
        if skip_genotypes:
            self.num_samples = 0
        else:
            # Load the sample info from the VCF file.
            self._prepare_samples()
            # Initialize genotype counts for each sample.
            self._init_sample_gt_counts()
            self.num_samples = len(self.samples)

        clinvar_file = annotations.get_anno_files(self.args)['clinvar']
        self.clinvar_chrom_gene_lookup = load_clinvar(clinvar_file)

        self.buffer_size = buffer_size
        self._get_anno_version()

        if not args.skip_gene_tables:
            self._get_gene_detailed()
            self._get_gene_summary()
Esempio n. 6
0
def load(parser, args):
    """Run the ``load`` command and report how long each phase took.

    A VCF file is required. CADD and GERP per-bp loading is switched
    off (with a note on stderr) when the matching annotation file is not
    installed. On node 1 the database is set up first; afterwards the
    multicore or single-core loader runs, the variant count is recorded,
    and the timings are printed and optionally appended to a log file.

    Args:
        parser: argument parser; only ``print_help()`` is called on it.
        args: parsed command-line namespace. ``args.skip_cadd`` and
            ``args.skip_gerp_bp`` may be flipped to True in place.

    Raises:
        SystemExit: if ``args.vcf`` is None.
    """
    # Only the VCF is mandatory; the former database-file check is
    # intentionally not performed.
    if args.vcf is None:
        parser.print_help()
        # sys.exit instead of the interactive ``exit`` helper from ``site``.
        sys.exit("ERROR: load needs a VCF file\n")

    start_time = time.time()
    annos = annotations.get_anno_files(args)

    # Force skipping CADD and GERP if the data files have not been installed.
    if args.skip_cadd is False:
        if 'cadd_score' not in annos:
            sys.stderr.write(
                "\nCADD scores are not being loaded because the"
                " annotation file could not be found.\n"
                "Run `geminicassandra update --dataonly --extra cadd_score`"
                " to install the annotation file.\n\n")
            args.skip_cadd = True
        else:
            sys.stderr.write(
                "CADD scores are being loaded (to skip use:--skip-cadd).\n")

    if args.skip_gerp_bp is False:
        if 'gerp_bp' not in annos:
            sys.stderr.write(
                "\nGERP per bp is not being loaded because the annotation file"
                " could not be found.\n    Run `geminicassandra update --dataonly --extra gerp_bp`"
                " to install the annotation file.\n\n")
            args.skip_gerp_bp = True
        else:
            sys.stderr.write(
                "GERP per bp is being loaded (to skip use:--skip-gerp-bp).\n")

    # Collect all of the additional annotation files.
    annotations.load_annos(args)

    # Both checkpoints start at start_time, so the setup durations are
    # zero whenever the node-1 branch below does not run.
    time_2 = start_time
    time_3 = start_time

    # Database setup and the single-core preparation run on node 1 only.
    if args.node_num == 1:
        gemini_loader = GeminiLoader(args)
        gemini_loader.setup_db()
        time_2 = time.time()
        gemini_loader.single_core_stuff()
        time_3 = time.time()

    if args.cores > 1:
        n_variants = load_multicore(args)
    else:
        n_variants = load_singlecore(args)

    # contact_points is a comma-separated string; each entry is passed
    # through ``strip`` before being handed over with the variant count.
    insert_n_variants(map(strip, args.contact_points.split(',')),
                      args.keyspace, n_variants)

    end_time = time.time()
    total_time = str(end_time - start_time)
    db_creation_time = str(time_2 - start_time)
    single_core_time = str(time_3 - time_2)
    parallel_time = str(end_time - time_3)
    # Parenthesised single-argument form behaves the same as a Python 2
    # print statement and is also valid as a Python 3 function call.
    print("Finished loading in %s s" % total_time)
    if args.timing_log is not None:
        # Append one CSV row per run:
        # exp_id,total,db_creation,single_core,parallel
        with open(args.timing_log, "a") as myfile:
            myfile.write(",".join([
                args.exp_id, total_time, db_creation_time, single_core_time,
                parallel_time
            ]) + "\n")