def load_singlecore(args):
    """Load the VCF described by ``args`` with a single ``GeminiLoader``.

    A loader is built from ``args`` and asked to populate the database from
    the VCF.  Per-sample genotype counts are stored afterwards unless either
    ``args.no_genotypes`` or ``args.no_load_genotypes`` is set.

    Returns the ``GeminiLoader`` instance that performed the load.
    """
    loader = GeminiLoader(args)
    loader.populate_from_vcf()

    # Genotype counts are only stored when genotypes are actually loaded.
    if not (args.no_genotypes or args.no_load_genotypes):
        loader.store_sample_gt_counts()

    return loader
def load(parser, args):
    """Entry point of the ``load`` command: load a VCF and record timings.

    Steps, in order:

    1. Abort with the parser's help text if no VCF was given.
    2. Look up the installed annotation files and force ``args.skip_cadd`` /
       ``args.skip_gerp_bp`` to True when the matching file is missing.
    3. Load the additional annotation files.
    4. On node 1 only (``args.node_num == 1``), set up the database and run
       the loader's single-core preparation.
    5. Load the variants with ``load_multicore`` (``args.cores > 1``) or
       ``load_singlecore`` and record the resulting count through
       ``insert_n_variants``.
    6. Print the total time and, when ``args.timing_log`` is set, append a
       CSV line ``exp_id,total,db_creation,single_core,parallel`` to it.

    Parameters:
        parser: object exposing ``print_help()``.
        args: parsed command-line namespace.  Attributes read here: ``vcf``,
            ``skip_cadd``, ``skip_gerp_bp``, ``node_num``, ``cores``,
            ``contact_points``, ``keyspace``, ``timing_log`` and ``exp_id``.
            ``skip_cadd`` and ``skip_gerp_bp`` may be overwritten.

    Raises:
        SystemExit: when ``args.vcf`` is None.
    """
    if args.vcf is None:
        parser.print_help()
        # sys.exit rather than the site-provided exit(), which is meant for
        # the interactive shell and is not guaranteed to exist.
        sys.exit("ERROR: load needs a VCF file\n")

    start_time = time.time()

    annos = annotations.get_anno_files(args)
    # force skipping CADD and GERP if the data files have not been installed
    if args.skip_cadd is False:
        if 'cadd_score' not in annos:
            sys.stderr.write("\nCADD scores are not being loaded because the"
                             " annotation file could not be found.\n"
                             "`Run geminicassandra update --dataonly --extra cadd_score`"
                             " to install the annotation file.\n\n")
            args.skip_cadd = True
        else:
            sys.stderr.write("CADD scores are being loaded (to skip use:--skip-cadd).\n")
    if args.skip_gerp_bp is False:
        if 'gerp_bp' not in annos:
            sys.stderr.write("\nGERP per bp is not being loaded because the annotation file"
                             " could not be found.\n Run `geminicassandra update --dataonly --extra gerp_bp`"
                             " to install the annotation file.\n\n")
            args.skip_gerp_bp = True
        else:
            sys.stderr.write("GERP per bp is being loaded (to skip use:--skip-gerp-bp).\n")

    # collect the additional annotation files
    annotations.load_annos(args)

    # Both marks default to start_time so that nodes other than node 1
    # report zero for the db-creation and single-core phases.
    time_2 = start_time
    time_3 = start_time
    if args.node_num == 1:
        gemini_loader = GeminiLoader(args)
        gemini_loader.setup_db()
        time_2 = time.time()
        gemini_loader.single_core_stuff()
        time_3 = time.time()

    # NOTE(review): the return value is used as the variant count; confirm
    # that both load_multicore and load_singlecore actually return it.
    if args.cores > 1:
        n_variants = load_multicore(args)
    else:
        n_variants = load_singlecore(args)

    # `strip` is resolved from module scope (not defined in this function).
    insert_n_variants(map(strip, args.contact_points.split(',')),
                      args.keyspace, n_variants)

    end_time = time.time()
    total_time = str(end_time - start_time)
    db_creation_time = str(time_2 - start_time)
    single_core_time = str(time_3 - time_2)
    parallel_time = str(end_time - time_3)

    # Single-argument call form prints the same text on Python 2 and 3.
    print("Finished loading in %s s" % total_time)

    if args.timing_log is not None:
        with open(args.timing_log, "a") as myfile:
            myfile.write(",".join([args.exp_id, total_time, db_creation_time,
                                   single_core_time, parallel_time]) + "\n")
def load_singlecore(args):
    """Run the complete single-process load for the VCF described by ``args``.

    The loader stores its resources, version and VCF header, then populates
    the database from the VCF.  Outside of test mode the gene table is
    updated (unless ``args.skip_gene_tables`` is set) and the indices are
    built, which also disconnects the loader.  Sample genotype counts are
    stored unless ``args.no_genotypes`` or ``args.no_load_genotypes`` is set.
    Finally ``gemini_annotate.add_extras`` is invoked on ``args.db``.

    Returns None.
    """
    loader = GeminiLoader(args)

    # Metadata is written before the variants themselves.
    loader.store_resources()
    loader.store_version()
    loader.store_vcf_header()
    loader.populate_from_vcf()

    if not (args.skip_gene_tables or args.test_mode):
        loader.update_gene_table()

    if not args.test_mode:
        loader.build_indices_and_disconnect()

    if not (args.no_genotypes or args.no_load_genotypes):
        loader.store_sample_gt_counts()

    database_path = args.db
    gemini_annotate.add_extras(database_path, [args.db])
def load_singlecore(args):
    """Load the VCF described by ``args`` in a single process.

    The loader stores its resources and version, populates the database from
    the VCF and builds the indices (disconnecting in the same call).  Sample
    genotype counts are stored afterwards unless ``args.no_genotypes`` or
    ``args.no_load_genotypes`` is set.

    Returns None.
    """
    loader = GeminiLoader(args)

    loader.store_resources()
    loader.store_version()
    loader.populate_from_vcf()
    loader.build_indices_and_disconnect()

    # Genotype counts are only stored when genotypes are actually loaded.
    if not (args.no_genotypes or args.no_load_genotypes):
        loader.store_sample_gt_counts()
def finish(args, loader=None):
    """Run the post-load steps shared by every loading path.

    Everything that the single-core, multi-core and IPython loaders all have
    to do once the variants are in place is collected here.

    Parameters:
        args: parsed command-line namespace; ``skip_gene_tables`` and
            ``test_mode`` are read.
        loader: loader to finish with.  When None, a new
            ``GeminiLoader(args, prepare_db=False)`` is created.

    Steps:
        * store resources, version and VCF header;
        * unless ``args.skip_gene_tables``: store the detailed and summary
          gene tables and, outside test mode, update the gene table;
        * outside test mode: build indices and disconnect; in test mode:
          commit and close through ``database.close_and_commit(loader.c)``.

    Progress messages are printed to stdout.  Returns None.
    """
    if loader is None:
        loader = GeminiLoader(args, prepare_db=False)

    # print(...) with a single string argument emits the same text on
    # Python 2 and Python 3, unlike the Python-2-only print statement.
    print("storing version, header, etc.")
    loader.store_resources()
    loader.store_version()
    loader.store_vcf_header()

    if not args.skip_gene_tables:
        print("storing gene-detailed")
        loader._get_gene_detailed()
        print("storing gene-summary")
        loader._get_gene_summary()
        if not args.test_mode:
            print("updating gene-table")
            loader.update_gene_table()

    if not args.test_mode:
        print("building indices")
        loader.build_indices_and_disconnect()
    else:
        # Imported lazily, only on the test-mode path.
        # NOTE(review): this is a Python 2 implicit relative import; confirm
        # the package layout before running on Python 3.
        import database
        database.close_and_commit(loader.c)
def load_singlecore(args):
    """Load the VCF described by ``args`` into the geminicassandra db.

    A ``GeminiLoader`` is created and connected to the database.  When
    genotypes are being loaded (neither ``args.no_genotypes`` nor
    ``args.no_load_genotypes`` is set) the sample genotype counters are
    initialised before the VCF is read and stored after it.  Outside test
    mode the loader is disconnected at the end.

    Returns None.
    """
    loader = GeminiLoader(args)
    loader.connect_to_db()

    if not (args.no_genotypes or args.no_load_genotypes):
        loader._init_sample_gt_counts()

    loader.populate_from_vcf()

    # The gene-table update (update_gene_table() when neither
    # skip_gene_tables nor test_mode is set) is disabled in this loader.

    if not (args.no_genotypes or args.no_load_genotypes):
        loader.store_sample_gt_counts()

    if not args.test_mode:
        loader.disconnect()
def load(parser, args):
    """Entry point of the ``load`` command: load a VCF and record timings.

    Steps, in order:

    1. Abort with the parser's help text if no VCF was given.
    2. Look up the installed annotation files and force ``args.skip_cadd`` /
       ``args.skip_gerp_bp`` to True when the matching file is missing.
    3. Load the additional annotation files.
    4. On node 1 only (``args.node_num == 1``), set up the database and run
       the loader's single-core preparation.
    5. Load the variants with ``load_multicore`` (``args.cores > 1``) or
       ``load_singlecore`` and record the resulting count through
       ``insert_n_variants``.
    6. Print the total time and, when ``args.timing_log`` is set, append a
       CSV line ``exp_id,total,db_creation,single_core,parallel`` to it.

    Parameters:
        parser: object exposing ``print_help()``.
        args: parsed command-line namespace.  Attributes read here: ``vcf``,
            ``skip_cadd``, ``skip_gerp_bp``, ``node_num``, ``cores``,
            ``contact_points``, ``keyspace``, ``timing_log`` and ``exp_id``.
            ``skip_cadd`` and ``skip_gerp_bp`` may be overwritten.

    Raises:
        SystemExit: when ``args.vcf`` is None.
    """
    if args.vcf is None:
        parser.print_help()
        # sys.exit rather than the site-provided exit(), which is meant for
        # the interactive shell and is not guaranteed to exist.
        sys.exit("ERROR: load needs a VCF file\n")

    start_time = time.time()

    annos = annotations.get_anno_files(args)
    # force skipping CADD and GERP if the data files have not been installed
    if args.skip_cadd is False:
        if 'cadd_score' not in annos:
            sys.stderr.write(
                "\nCADD scores are not being loaded because the"
                " annotation file could not be found.\n"
                "`Run geminicassandra update --dataonly --extra cadd_score`"
                " to install the annotation file.\n\n")
            args.skip_cadd = True
        else:
            sys.stderr.write(
                "CADD scores are being loaded (to skip use:--skip-cadd).\n")
    if args.skip_gerp_bp is False:
        if 'gerp_bp' not in annos:
            sys.stderr.write(
                "\nGERP per bp is not being loaded because the annotation file"
                " could not be found.\n Run `geminicassandra update --dataonly --extra gerp_bp`"
                " to install the annotation file.\n\n")
            args.skip_gerp_bp = True
        else:
            sys.stderr.write(
                "GERP per bp is being loaded (to skip use:--skip-gerp-bp).\n")

    # collect the additional annotation files
    annotations.load_annos(args)

    # Both marks default to start_time so that nodes other than node 1
    # report zero for the db-creation and single-core phases.
    time_2 = start_time
    time_3 = start_time
    if args.node_num == 1:
        gemini_loader = GeminiLoader(args)
        gemini_loader.setup_db()
        time_2 = time.time()
        gemini_loader.single_core_stuff()
        time_3 = time.time()

    # NOTE(review): the return value is used as the variant count; confirm
    # that both load_multicore and load_singlecore actually return it.
    if args.cores > 1:
        n_variants = load_multicore(args)
    else:
        n_variants = load_singlecore(args)

    # `strip` is resolved from module scope (not defined in this function).
    insert_n_variants(map(strip, args.contact_points.split(',')),
                      args.keyspace, n_variants)

    end_time = time.time()
    total_time = str(end_time - start_time)
    db_creation_time = str(time_2 - start_time)
    single_core_time = str(time_3 - time_2)
    parallel_time = str(end_time - time_3)

    # Single-argument call form prints the same text on Python 2 and 3.
    print("Finished loading in %s s" % total_time)

    if args.timing_log is not None:
        with open(args.timing_log, "a") as myfile:
            myfile.write(",".join([
                args.exp_id, total_time, db_creation_time, single_core_time,
                parallel_time
            ]) + "\n")