def load(parser, args):
    """Entry point for ``gemini load``: annotate a VCF and load it into a DB.

    Validates the required --db/--vcf arguments, disables CADD/GERP
    annotation when their data files are missing, opens all annotation
    files, then dispatches to the scheduler-, multicore-, or
    single-core loader.

    parser -- the argparse parser (used only to print help on bad input)
    args   -- parsed command-line arguments
    """
    if args.db is None or args.vcf is None:
        parser.print_help()
        # sys.exit is the canonical exit call; the `exit` builtin is a
        # site.py convenience that may be absent (e.g. under python -S).
        sys.exit("ERROR: load needs both a VCF file and a database file\n")

    annos = annotations.get_anno_files(args)
    # force skipping CADD and GERP if the data files have not been installed
    if not args.skip_cadd:
        if 'cadd_score' not in annos:
            sys.stderr.write("\nCADD scores are not being loaded because the"
                             " annotation file could not be found.\n"
                             "`Run gemini update --dataonly --extra cadd_score`"
                             " to install the annotation file.\n\n")
            args.skip_cadd = True
        else:
            sys.stderr.write("CADD scores are being loaded (to skip use:--skip-cadd).\n")
    if not args.skip_gerp_bp:
        if 'gerp_bp' not in annos:
            sys.stderr.write("\nGERP per bp is not being loaded because the annotation file"
                             " could not be found.\n Run `gemini update --dataonly --extra gerp_bp`"
                             " to install the annotation file.\n\n")
            args.skip_gerp_bp = True
        else:
            sys.stderr.write("GERP per bp is being loaded (to skip use:--skip-gerp-bp).\n")

    # collect the add'l annotation files
    annotations.load_annos(args)

    # dispatch to the appropriate loading strategy
    if args.scheduler:
        load_ipython(args)
    elif args.cores > 1:
        load_multicore(args)
    else:
        load_singlecore(args)
def load(parser, args):
    """Load an annotated VCF into a gemini database.

    Checks the mandatory inputs, downgrades CADD/GERP loading when their
    annotation files are not installed, opens the annotation files, and
    hands off to the configured loading backend.
    """
    missing_input = args.db is None or args.vcf is None
    if missing_input:
        parser.print_help()
        exit("ERROR: load needs both a VCF file and a database file\n")

    annos = annotations.get_anno_files(args)

    # Turn off CADD/GERP when the corresponding data file is absent.
    if args.skip_cadd is False:
        if 'cadd_score' in annos:
            sys.stderr.write("CADD scores are being loaded (to skip use:--skip-cadd).\n")
        else:
            sys.stderr.write("\nCADD scores are not being loaded because the"
                             " annotation file could not be found.\n"
                             "`Run gemini update --dataonly --extra cadd_score`"
                             " to install the annotation file.\n\n")
            args.skip_cadd = True
    if args.skip_gerp_bp is False:
        if 'gerp_bp' in annos:
            sys.stderr.write("GERP per bp is being loaded (to skip use:--skip-gerp-bp).\n")
        else:
            sys.stderr.write("\nGERP per bp is not being loaded because the annotation file"
                             " could not be found.\n Run `gemini update --dataonly --extra gerp_bp`"
                             " to install the annotation file.\n\n")
            args.skip_gerp_bp = True

    # collect of the the add'l annotation files
    annotations.load_annos(args)

    # Pick the loading backend, then run it.
    if args.scheduler:
        run_loader = load_ipython
    elif args.cores > 1:
        run_loader = load_multicore
    else:
        run_loader = load_singlecore
    run_loader(args)
def load(parser, args):
    """Entry point for ``geminicassandra load``: annotate a VCF and load it.

    Validates the VCF argument, disables CADD/GERP annotation when their
    data files are missing, loads the annotation files, performs the
    node-1-only database setup, runs the (single- or multi-core) variant
    load, records the variant count in Cassandra, and optionally appends
    timing data to ``args.timing_log``.
    """
    if args.vcf is None:
        parser.print_help()
        exit("ERROR: load needs a VCF file\n")

    start_time = time.time()
    annos = annotations.get_anno_files(args)
    # force skipping CADD and GERP if the data files have not been installed
    if not args.skip_cadd:
        if 'cadd_score' not in annos:
            sys.stderr.write("\nCADD scores are not being loaded because the"
                             " annotation file could not be found.\n"
                             "`Run geminicassandra update --dataonly --extra cadd_score`"
                             " to install the annotation file.\n\n")
            args.skip_cadd = True
        else:
            sys.stderr.write("CADD scores are being loaded (to skip use:--skip-cadd).\n")
    if not args.skip_gerp_bp:
        if 'gerp_bp' not in annos:
            sys.stderr.write("\nGERP per bp is not being loaded because the annotation file"
                             " could not be found.\n Run `geminicassandra update --dataonly --extra gerp_bp`"
                             " to install the annotation file.\n\n")
            args.skip_gerp_bp = True
        else:
            sys.stderr.write("GERP per bp is being loaded (to skip use:--skip-gerp-bp).\n")

    # collect the add'l annotation files
    annotations.load_annos(args)

    # Only node 1 creates the schema and does the single-core setup work;
    # on other nodes the timing deltas below collapse to zero.
    time_2 = start_time
    time_3 = start_time
    if args.node_num == 1:
        gemini_loader = GeminiLoader(args)
        gemini_loader.setup_db()
        time_2 = time.time()
        gemini_loader.single_core_stuff()
        time_3 = time.time()

    if args.cores > 1:
        n_variants = load_multicore(args)
    else:
        n_variants = load_singlecore(args)

    # contact_points is a comma-separated host list; strip stray whitespace.
    contact_points = [point.strip() for point in args.contact_points.split(',')]
    insert_n_variants(contact_points, args.keyspace, n_variants)

    end_time = time.time()
    total_time = str(end_time - start_time)
    db_creation_time = str(time_2 - start_time)
    single_core_time = str(time_3 - time_2)
    parallel_time = str(end_time - time_3)
    # Parenthesized so the statement is valid in both Python 2 and 3.
    print("Finished loading in %s s" % total_time)
    if args.timing_log is not None:
        with open(args.timing_log, "a") as myfile:
            myfile.write(",".join([args.exp_id, total_time, db_creation_time,
                                   single_core_time, parallel_time]) + "\n")
def __init__(self, args, buffer_size=10000, prepare_db=True):
    """Initialize the loader from parsed command-line arguments.

    Opens the input VCF, discovers any extra VEP CSQ fields, and —
    unless ``prepare_db`` is False — creates the gemini database and
    loads sample, clinvar, and gene metadata.

    args        -- parsed command-line arguments (VCF path, flags, etc.)
    buffer_size -- number of variants buffered before a DB flush
                   (presumably — consumed by later load code; confirm)
    prepare_db  -- when False, stop after reading VCF/CSQ metadata and
                   skip all database creation and metadata loading
    """
    self.args = args
    # tracks whether a multi-allelic site has been seen
    self.seen_multi = False
    # create the gemini database
    # create a reader for the VCF file
    self.vcf_reader = self._get_vcf_reader()
    # load sample information

    # CSQ sub-fields that already map to standard variant columns; any
    # other VEP field becomes an extra "vep_*" column.
    expected = "consequence,codons,amino_acids,gene,symbol,feature,exon,polyphen,sift,protein_position,biotype,warning".split(",")
    if self.args.anno_type == "VEP":
        self._effect_fields = self._get_vep_csq(self.vcf_reader)
        # tuples of (db_column, CSQ name)
        self._extra_effect_fields = [("vep_%s" % x.lower(), x) for x in
                                     self._effect_fields if not x.lower() in expected]
    else:
        self._effect_fields = []
        self._extra_effect_fields = []

    if not prepare_db:
        return
    # Schema must exist before samples/genotype counts are loaded below.
    self._create_db([x[0] for x in self._extra_effect_fields])

    if not self.args.no_genotypes and not self.args.no_load_genotypes:
        # load the sample info from the VCF file.
        self._prepare_samples()
        # initialize genotype counts for each sample
        self._init_sample_gt_counts()
        self.num_samples = len(self.samples)
    else:
        self.num_samples = 0

    # chrom/gene lookup built from the clinvar annotation file
    self.clinvar_chrom_gene_lookup = load_clinvar(
        annotations.get_anno_files(self.args)['clinvar'])

    self.buffer_size = buffer_size
    self._get_anno_version()
    if not args.skip_gene_tables:
        self._get_gene_detailed()
        self._get_gene_summary()
def __init__(self, args, buffer_size=10000, prepare_db=True):
    """Set up the loader: open the VCF, collect extra VEP CSQ fields,
    and (unless ``prepare_db`` is False) create the database and load
    sample, clinvar, and gene metadata.
    """
    self.args = args
    self.seen_multi = False

    # Reader over the input VCF file.
    self.vcf_reader = self._get_vcf_reader()

    # CSQ sub-fields that already map onto standard variant columns.
    known_csq = set(
        "consequence,codons,amino_acids,gene,symbol,feature,exon,"
        "polyphen,sift,protein_position,biotype,warning".split(","))

    if self.args.anno_type == "VEP":
        self._effect_fields = self._get_vep_csq(self.vcf_reader)
        # (db_column, CSQ name) pairs for any non-standard VEP field.
        self._extra_effect_fields = []
        for field in self._effect_fields:
            if field.lower() not in known_csq:
                self._extra_effect_fields.append(("vep_%s" % field.lower(), field))
    else:
        self._effect_fields = []
        self._extra_effect_fields = []

    if not prepare_db:
        return

    self._create_db([column for column, _ in self._extra_effect_fields])

    load_genotypes = not (self.args.no_genotypes or self.args.no_load_genotypes)
    if load_genotypes:
        # Pull sample info from the VCF and zero out per-sample counts.
        self._prepare_samples()
        self._init_sample_gt_counts()
        self.num_samples = len(self.samples)
    else:
        self.num_samples = 0

    anno_files = annotations.get_anno_files(self.args)
    self.clinvar_chrom_gene_lookup = load_clinvar(anno_files['clinvar'])

    self.buffer_size = buffer_size
    self._get_anno_version()
    if not args.skip_gene_tables:
        self._get_gene_detailed()
        self._get_gene_summary()
def load(parser, args):
    """Annotate a VCF and load it into a geminicassandra keyspace.

    Requires ``args.vcf``; skips CADD/GERP when their data files are not
    installed; node 1 additionally creates the schema and runs the
    single-core setup.  After the (single- or multi-core) load, the
    variant count is written to Cassandra and timings are optionally
    appended to ``args.timing_log``.
    """
    if args.vcf is None:
        parser.print_help()
        exit("ERROR: load needs a VCF file\n")

    start_time = time.time()
    annos = annotations.get_anno_files(args)
    # force skipping CADD and GERP if the data files have not been installed
    if not args.skip_cadd:
        if 'cadd_score' not in annos:
            sys.stderr.write(
                "\nCADD scores are not being loaded because the"
                " annotation file could not be found.\n"
                "`Run geminicassandra update --dataonly --extra cadd_score`"
                " to install the annotation file.\n\n")
            args.skip_cadd = True
        else:
            sys.stderr.write(
                "CADD scores are being loaded (to skip use:--skip-cadd).\n")
    if not args.skip_gerp_bp:
        if 'gerp_bp' not in annos:
            sys.stderr.write(
                "\nGERP per bp is not being loaded because the annotation file"
                " could not be found.\n Run `geminicassandra update --dataonly --extra gerp_bp`"
                " to install the annotation file.\n\n")
            args.skip_gerp_bp = True
        else:
            sys.stderr.write(
                "GERP per bp is being loaded (to skip use:--skip-gerp-bp).\n")

    # collect the add'l annotation files
    annotations.load_annos(args)

    # Only node 1 creates the schema / runs single-core setup; elsewhere
    # the db-creation and single-core timing deltas are zero.
    time_2 = start_time
    time_3 = start_time
    if args.node_num == 1:
        gemini_loader = GeminiLoader(args)
        gemini_loader.setup_db()
        time_2 = time.time()
        gemini_loader.single_core_stuff()
        time_3 = time.time()

    if args.cores > 1:
        n_variants = load_multicore(args)
    else:
        n_variants = load_singlecore(args)

    # contact_points is a comma-separated host list; strip whitespace.
    contact_points = [point.strip() for point in args.contact_points.split(',')]
    insert_n_variants(contact_points, args.keyspace, n_variants)

    end_time = time.time()
    total_time = str(end_time - start_time)
    db_creation_time = str(time_2 - start_time)
    single_core_time = str(time_3 - time_2)
    parallel_time = str(end_time - time_3)
    # Parenthesized so the statement is valid in both Python 2 and 3.
    print("Finished loading in %s s" % total_time)
    if args.timing_log is not None:
        with open(args.timing_log, "a") as myfile:
            myfile.write(",".join([
                args.exp_id, total_time, db_creation_time, single_core_time,
                parallel_time
            ]) + "\n")