Ejemplo n.º 1
0
def load(parser, args):
    """Check the load arguments, settle the CADD/GERP options and start loading.

    Args:
        parser: object whose ``print_help()`` is called when a required
            argument is missing.
        args: parsed options. Reads ``db``, ``vcf``, ``skip_cadd``,
            ``skip_gerp_bp``, ``scheduler`` and ``cores``. ``skip_cadd`` and
            ``skip_gerp_bp`` are set to True when the matching annotation
            file is not reported by ``annotations.get_anno_files``.

    Raises:
        SystemExit: when ``args.db`` or ``args.vcf`` is None.
    """
    if args.db is None or args.vcf is None:
        parser.print_help()
        # sys.exit instead of the site-provided ``exit`` helper: the latter
        # is meant for interactive sessions and is missing under ``python -S``.
        sys.exit("ERROR: load needs both a VCF file and a database file\n")

    annos = annotations.get_anno_files(args)
    # Force skipping CADD and GERP if the data files have not been installed.
    if args.skip_cadd is False:
        if 'cadd_score' not in annos:
            sys.stderr.write(
                "\nCADD scores are not being loaded because the"
                " annotation file could not be found.\n"
                "`Run gemini update --dataonly --extra cadd_score`"
                " to install the annotation file.\n\n")
            args.skip_cadd = True
        else:
            sys.stderr.write("CADD scores are being loaded (to skip use:--skip-cadd).\n")
    if args.skip_gerp_bp is False:
        if 'gerp_bp' not in annos:
            sys.stderr.write(
                "\nGERP per bp is not being loaded because the annotation file"
                " could not be found.\n    Run `gemini update --dataonly --extra gerp_bp`"
                " to install the annotation file.\n\n")
            args.skip_gerp_bp = True
        else:
            sys.stderr.write("GERP per bp is being loaded (to skip use:--skip-gerp-bp).\n")

    # Collect the additional annotation files.
    annotations.load_annos(args)

    # Pick the loader: scheduler first, then multi-core, else single-core.
    if args.scheduler:
        load_ipython(args)
    elif args.cores > 1:
        load_multicore(args)
    else:
        load_singlecore(args)
Ejemplo n.º 2
0
def load(parser, args):
    """Validate the arguments and populate the database from the VCF.

    The load is attempted at most twice. If an attempt raises
    ``sqlite3.OperationalError`` the error is written to stderr; the second
    attempt first sets ``args.tmp_db`` to a uniquely named ``.db`` file in
    ``args.tempdir`` and, once loading succeeds, moves that file to
    ``args.db``.

    Args:
        parser: object whose ``print_help()`` is called on invalid arguments.
        args: parsed options. Reads ``db``, ``vcf``, ``anno_type``,
            ``tempdir``, ``no_genotypes`` and ``no_load_genotypes``; may set
            ``tmp_db``.

    Raises:
        SystemExit: when ``args.db`` or ``args.vcf`` is None, or when
            ``args.anno_type`` is not one of 'snpEff', 'VEP' or None.
    """
    if args.db is None or args.vcf is None:
        parser.print_help()
        # sys.exit instead of the site-provided ``exit`` helper, which is
        # intended for interactive sessions only.
        sys.exit("ERROR: load needs both a VCF file and a database file\n")
    if args.anno_type not in ['snpEff', 'VEP', None]:
        parser.print_help()
        sys.exit("\nERROR: Unsupported selection for -t\n")

    # Collect the additional annotation files.
    annotations.load_annos(args)

    # Create a new gemini loader and populate the gemini db and files
    # from the VCF.
    # NOTE(review): if the second attempt fails as well, the error is only
    # logged and the function returns normally -- confirm this is intended.
    for try_count in range(2):
        try:
            if try_count > 0:
                args.tmp_db = os.path.join(args.tempdir, "%s.db" % uuid.uuid4())

            gemini_loader = GeminiLoader(args)
            gemini_loader.store_resources()
            gemini_loader.store_version()
            gemini_loader.store_vcf_header()
            gemini_loader.populate_from_vcf()
            gemini_loader.update_gene_table()
            # gemini_loader.build_indices_and_disconnect()

            if not args.no_genotypes and not args.no_load_genotypes:
                gemini_loader.store_sample_gt_counts()

            if try_count > 0:
                shutil.move(args.tmp_db, args.db)
            break
        except sqlite3.OperationalError as e:
            sys.stderr.write("sqlite3.OperationalError: %s\n" % e)
Ejemplo n.º 3
0
def load(parser, args):
    """Validate the arguments and populate the database from the VCF.

    The load is attempted at most twice. If an attempt raises
    ``sql.exc.OperationalError`` the error is written to stderr; the second
    attempt first sets ``args.tmp_db`` to a uniquely named ``.db`` file in
    ``args.tempdir`` and, once loading succeeds, moves that file to
    ``args.db``.

    Args:
        parser: object whose ``print_help()`` is called on invalid arguments.
        args: parsed options. Reads ``db``, ``vcf``, ``anno_type``,
            ``tempdir``, ``no_genotypes`` and ``no_load_genotypes``; may set
            ``tmp_db``.

    Raises:
        SystemExit: when ``args.db`` or ``args.vcf`` is None, or when
            ``args.anno_type`` is not one of 'snpEff', 'VEP', None or "all".
    """
    if args.db is None or args.vcf is None:
        parser.print_help()
        # sys.exit instead of the site-provided ``exit`` helper, which is
        # intended for interactive sessions only.
        sys.exit("ERROR: load needs both a VCF file and a database file\n")
    if args.anno_type not in ['snpEff', 'VEP', None, "all"]:
        parser.print_help()
        sys.exit("\nERROR: Unsupported selection for -t\n")

    # Collect the additional annotation files.
    annotations.load_annos(args)

    # Create a new gemini loader and populate the gemini db and files
    # from the VCF.
    # NOTE(review): if the second attempt fails as well, the error is only
    # logged and the function returns normally -- confirm this is intended.
    for try_count in range(2):
        try:
            if try_count > 0:
                args.tmp_db = os.path.join(args.tempdir, "%s.db" % uuid.uuid4())

            gemini_loader = GeminiLoader(args)
            gemini_loader.store_resources()
            gemini_loader.store_version()
            gemini_loader.store_vcf_header()
            gemini_loader.populate_from_vcf()
            gemini_loader.update_gene_table()
            gemini_loader.build_indices_and_disconnect()

            if not args.no_genotypes and not args.no_load_genotypes:
                gemini_loader.store_sample_gt_counts()

            if try_count > 0:
                shutil.move(args.tmp_db, args.db)
            break
        except sql.exc.OperationalError as e:
            sys.stderr.write("sqlalchemy.OperationalError: %s\n" % e)
Ejemplo n.º 4
0
def load(parser, args):
    """Validate the arguments, resolve CADD/GERP availability and dispatch.

    Args:
        parser: object whose ``print_help()`` is called when a required
            argument is missing.
        args: parsed options. Reads ``db``, ``vcf``, ``skip_cadd``,
            ``skip_gerp_bp``, ``scheduler`` and ``cores``. ``skip_cadd`` /
            ``skip_gerp_bp`` are switched to True when 'cadd_score' /
            'gerp_bp' is not among ``annotations.get_anno_files(args)``.

    Raises:
        SystemExit: when ``args.db`` or ``args.vcf`` is None.
    """
    if args.db is None or args.vcf is None:
        parser.print_help()
        # The builtin-looking ``exit`` is injected by the ``site`` module for
        # interactive use; sys.exit is the reliable way to leave a program.
        sys.exit("ERROR: load needs both a VCF file and a database file\n")

    annos = annotations.get_anno_files(args)
    # Force skipping CADD and GERP if the data files have not been installed.
    if args.skip_cadd is False:
        if 'cadd_score' in annos:
            sys.stderr.write("CADD scores are being loaded (to skip use:--skip-cadd).\n")
        else:
            sys.stderr.write(
                "\nCADD scores are not being loaded because the"
                " annotation file could not be found.\n"
                "`Run gemini update --dataonly --extra cadd_score`"
                " to install the annotation file.\n\n")
            args.skip_cadd = True
    if args.skip_gerp_bp is False:
        if 'gerp_bp' in annos:
            sys.stderr.write("GERP per bp is being loaded (to skip use:--skip-gerp-bp).\n")
        else:
            sys.stderr.write(
                "\nGERP per bp is not being loaded because the annotation file"
                " could not be found.\n    Run `gemini update --dataonly --extra gerp_bp`"
                " to install the annotation file.\n\n")
            args.skip_gerp_bp = True

    # Collect the additional annotation files.
    annotations.load_annos(args)

    # Scheduler takes precedence, then multi-core, otherwise single-core.
    if args.scheduler:
        load_ipython(args)
    elif args.cores > 1:
        load_multicore(args)
    else:
        load_singlecore(args)
Ejemplo n.º 5
0
def load(parser, args):
    """Load a VCF, record the number of variants and report timings.

    Steps, in order: validate ``args.vcf``; disable CADD / GERP-per-bp
    loading when their annotation files are not reported by
    ``annotations.get_anno_files``; load the annotation files; on node 1
    only, run ``GeminiLoader.setup_db()`` and ``single_core_stuff()``; run the
    multi-core or single-core loader; pass the variant count to
    ``insert_n_variants``; print the total time and, when ``args.timing_log``
    is set, append a CSV line of timings to that file.

    Args:
        parser: object whose ``print_help()`` is called when the VCF is
            missing.
        args: parsed options. Reads ``vcf``, ``skip_cadd``, ``skip_gerp_bp``,
            ``node_num``, ``cores``, ``contact_points``, ``keyspace``,
            ``timing_log`` and ``exp_id``; may set ``skip_cadd`` and
            ``skip_gerp_bp`` to True.

    Raises:
        SystemExit: when ``args.vcf`` is None.
    """
    # Only the VCF is required here; no database file argument is checked.
    if args.vcf is None:
        parser.print_help()
        # sys.exit instead of the site-provided ``exit`` helper, which is
        # intended for interactive sessions only.
        sys.exit("ERROR: load needs a VCF file\n")

    start_time = time.time()
    annos = annotations.get_anno_files(args)
    # Force skipping CADD and GERP if the data files have not been installed.
    if args.skip_cadd is False:
        if 'cadd_score' not in annos:
            sys.stderr.write(
                "\nCADD scores are not being loaded because the"
                " annotation file could not be found.\n"
                "`Run geminicassandra update --dataonly --extra cadd_score`"
                " to install the annotation file.\n\n")
            args.skip_cadd = True
        else:
            sys.stderr.write("CADD scores are being loaded (to skip use:--skip-cadd).\n")
    if args.skip_gerp_bp is False:
        if 'gerp_bp' not in annos:
            sys.stderr.write(
                "\nGERP per bp is not being loaded because the annotation file"
                " could not be found.\n    Run `geminicassandra update --dataonly --extra gerp_bp`"
                " to install the annotation file.\n\n")
            args.skip_gerp_bp = True
        else:
            sys.stderr.write("GERP per bp is being loaded (to skip use:--skip-gerp-bp).\n")

    # Collect the additional annotation files.
    annotations.load_annos(args)

    # Both marks default to start_time so that, on nodes other than node 1,
    # the db-creation and single-core phases are reported as 0 seconds.
    time_2 = start_time
    time_3 = start_time

    if args.node_num == 1:
        gemini_loader = GeminiLoader(args)
        gemini_loader.setup_db()
        time_2 = time.time()
        gemini_loader.single_core_stuff()
        time_3 = time.time()

    if args.cores > 1:
        n_variants = load_multicore(args)
    else:
        n_variants = load_singlecore(args)

    # NOTE(review): ``strip`` is not defined in this function; presumably a
    # module-level import such as string.strip -- confirm.
    insert_n_variants(map(strip, args.contact_points.split(',')), args.keyspace, n_variants)

    end_time = time.time()
    total_time = str(end_time - start_time)
    db_creation_time = str(time_2 - start_time)
    single_core_time = str(time_3 - time_2)
    parallel_time = str(end_time - time_3)
    # Call form with a single argument prints the same text on Python 2 and 3.
    print("Finished loading in %s s" % total_time)
    if args.timing_log is not None:
        with open(args.timing_log, "a") as myfile:
            myfile.write(",".join([args.exp_id, total_time, db_creation_time,
                                   single_core_time, parallel_time]) + "\n")
Ejemplo n.º 6
0
def load(parser, args):
    """Validate the required arguments and run the selected loader.

    Args:
        parser: object whose ``print_help()`` is called when a required
            argument is missing.
        args: parsed options; reads ``db``, ``vcf``, ``scheduler`` and
            ``cores``.

    Raises:
        SystemExit: when ``args.db`` or ``args.vcf`` is None.
    """
    if args.db is None or args.vcf is None:
        parser.print_help()
        # Raise SystemExit directly: the ``exit`` helper is injected by the
        # ``site`` module for interactive use and is absent under ``python -S``.
        raise SystemExit("ERROR: load needs both a VCF file and a database file\n")

    # Collect the additional annotation files.
    annotations.load_annos()

    # Scheduler takes precedence, then multi-core, otherwise single-core.
    if args.scheduler:
        load_ipython(args)
    elif args.cores > 1:
        load_multicore(args)
    else:
        load_singlecore(args)
Ejemplo n.º 7
0
def load(parser, args):
    """Check that a database and a VCF were given, then start the loader.

    Args:
        parser: object whose ``print_help()`` is called when a required
            argument is missing.
        args: parsed options; reads ``db``, ``vcf``, ``scheduler`` and
            ``cores``.

    Raises:
        SystemExit: when ``args.db`` or ``args.vcf`` is None.
    """
    missing_input = args.db is None or args.vcf is None
    if missing_input:
        parser.print_help()
        # SystemExit is raised explicitly because the ``exit`` helper only
        # exists when the ``site`` module has been loaded.
        raise SystemExit("ERROR: load needs both a VCF file and a database file\n")

    # Collect the additional annotation files.
    annotations.load_annos()

    if args.scheduler:
        load_ipython(args)
    elif args.cores > 1:
        load_multicore(args)
    else:
        load_singlecore(args)
Ejemplo n.º 8
0
def load(parser, args):
    """Validate the arguments and load the VCF through a ``GeminiLoader``.

    After validation the annotation files are loaded, then the loader's
    ``populate_from_vcf``, ``build_indices_and_disconnect`` and
    ``store_sample_gt_counts`` methods are called in that order.

    Args:
        parser: object whose ``print_help()`` is called on invalid arguments.
        args: parsed options; reads ``db``, ``vcf`` and ``anno_type`` and is
            handed to ``GeminiLoader``.

    Raises:
        SystemExit: when ``args.db`` or ``args.vcf`` is None, or when
            ``args.anno_type`` is not one of "snpEff", "VEP" or None.
    """
    if args.db is None or args.vcf is None:
        parser.print_help()
        # Raise SystemExit directly: the ``exit`` helper is injected by the
        # ``site`` module for interactive use and is absent under ``python -S``.
        raise SystemExit("ERROR: load needs both a VCF file and a database file\n")
    if args.anno_type not in ["snpEff", "VEP", None]:
        parser.print_help()
        raise SystemExit("\nERROR: Unsupported selection for -t\n")

    # Collect the additional annotation files.
    annotations.load_annos()

    # Create a new gemini loader and populate the gemini db and files
    # from the VCF.
    gemini_loader = GeminiLoader(args)
    gemini_loader.populate_from_vcf()
    gemini_loader.build_indices_and_disconnect()
    gemini_loader.store_sample_gt_counts()
Ejemplo n.º 9
0
def load(parser, args):
    """Validate the arguments and dispatch to the appropriate loader.

    ``load_ipython`` is used when ``use_scheduler(args)`` is truthy,
    ``load_multicore`` when ``args.cores > 1`` and ``load_singlecore``
    otherwise.

    Args:
        parser: object whose ``print_help()`` is called on invalid arguments.
        args: parsed options; reads ``db``, ``vcf``, ``anno_type`` and
            ``cores``.

    Raises:
        SystemExit: when ``args.db`` or ``args.vcf`` is None, or when
            ``args.anno_type`` is not one of 'snpEff', 'VEP' or None.
    """
    if args.db is None or args.vcf is None:
        parser.print_help()
        # Raise SystemExit directly: the ``exit`` helper is injected by the
        # ``site`` module for interactive use and is absent under ``python -S``.
        raise SystemExit("ERROR: load needs both a VCF file and a database file\n")
    if args.anno_type not in ['snpEff', 'VEP', None]:
        parser.print_help()
        raise SystemExit("\nERROR: Unsupported selection for -t\n")

    # Collect the additional annotation files.
    annotations.load_annos()

    if use_scheduler(args):
        load_ipython(args)
    elif args.cores > 1:
        load_multicore(args)
    else:
        load_singlecore(args)
Ejemplo n.º 10
0
def load(parser, args):
    """Check the arguments, then fill the database from the VCF.

    Once the arguments pass validation and the annotation files are loaded,
    a ``GeminiLoader`` built from ``args`` runs ``populate_from_vcf``,
    ``build_indices_and_disconnect`` and ``store_sample_gt_counts`` in order.

    Args:
        parser: object whose ``print_help()`` is called on invalid arguments.
        args: parsed options; reads ``db``, ``vcf`` and ``anno_type`` and is
            handed to ``GeminiLoader``.

    Raises:
        SystemExit: when ``args.db`` or ``args.vcf`` is None, or when
            ``args.anno_type`` is not one of 'snpEff', 'VEP' or None.
    """
    if args.db is None or args.vcf is None:
        parser.print_help()
        # SystemExit is raised explicitly because the ``exit`` helper only
        # exists when the ``site`` module has been loaded.
        raise SystemExit("ERROR: load needs both a VCF file and a database file\n")
    if args.anno_type not in ['snpEff', 'VEP', None]:
        parser.print_help()
        raise SystemExit("\nERROR: Unsupported selection for -t\n")

    # Collect the additional annotation files.
    annotations.load_annos()

    # Create a new gemini loader and populate the gemini db and files
    # from the VCF.
    gemini_loader = GeminiLoader(args)
    gemini_loader.populate_from_vcf()
    gemini_loader.build_indices_and_disconnect()
    gemini_loader.store_sample_gt_counts()
Ejemplo n.º 11
0
def load(parser, args):
    """Validate the arguments, announce CADD/GERP loading and dispatch.

    A notice is written to stdout for each of CADD and GERP-per-bp whose
    skip flag (``args.skip_cadd`` / ``args.skip_gerp_bp``) is exactly False.

    Args:
        parser: object whose ``print_help()`` is called when a required
            argument is missing.
        args: parsed options; reads ``db``, ``vcf``, ``skip_cadd``,
            ``skip_gerp_bp``, ``scheduler`` and ``cores``.

    Raises:
        SystemExit: when ``args.db`` or ``args.vcf`` is None.
    """
    if args.db is None or args.vcf is None:
        parser.print_help()
        # sys.exit instead of the site-provided ``exit`` helper, which is
        # intended for interactive sessions only.
        sys.exit("ERROR: load needs both a VCF file and a database file\n")
    if args.skip_cadd is False:
        sys.stdout.write("CADD is being loaded (to skip use:--skip-cadd).\n")
    if args.skip_gerp_bp is False:
        sys.stdout.write("GERP per bp is being loaded (to skip use:--skip-gerp-bp).\n")

    # Collect the additional annotation files.
    annotations.load_annos()

    # Scheduler takes precedence, then multi-core, otherwise single-core.
    if args.scheduler:
        load_ipython(args)
    elif args.cores > 1:
        load_multicore(args)
    else:
        load_singlecore(args)
Ejemplo n.º 12
0
def load(parser, args):
    """Check the arguments and hand off to the scheduler, multi- or single-core loader.

    Args:
        parser: object whose ``print_help()`` is called on invalid arguments.
        args: parsed options; reads ``db``, ``vcf``, ``anno_type`` and
            ``cores``, and is passed to ``use_scheduler`` and the loader.

    Raises:
        SystemExit: when ``args.db`` or ``args.vcf`` is None, or when
            ``args.anno_type`` is not one of 'snpEff', 'VEP' or None.
    """
    if args.db is None or args.vcf is None:
        parser.print_help()
        # SystemExit is raised explicitly because the ``exit`` helper only
        # exists when the ``site`` module has been loaded.
        raise SystemExit("ERROR: load needs both a VCF file and a database file\n")
    if args.anno_type not in ['snpEff', 'VEP', None]:
        parser.print_help()
        raise SystemExit("\nERROR: Unsupported selection for -t\n")

    # Collect the additional annotation files.
    annotations.load_annos()

    if use_scheduler(args):
        load_ipython(args)
    elif args.cores > 1:
        load_multicore(args)
    else:
        load_singlecore(args)
Ejemplo n.º 13
0
def load(parser, args):
    """Validate the arguments and populate the database from the VCF.

    After validation and annotation loading, a ``GeminiLoader`` built from
    ``args`` stores resources, version and VCF header, populates from the
    VCF and updates the gene table. Sample genotype counts are stored unless
    ``args.no_genotypes`` or ``args.no_load_genotypes`` is truthy.

    Args:
        parser: object whose ``print_help()`` is called on invalid arguments.
        args: parsed options; reads ``db``, ``vcf``, ``anno_type``,
            ``no_genotypes`` and ``no_load_genotypes``.

    Raises:
        SystemExit: when ``args.db`` or ``args.vcf`` is None, or when
            ``args.anno_type`` is not one of 'snpEff', 'VEP' or None.
    """
    if args.db is None or args.vcf is None:
        parser.print_help()
        # Raise SystemExit directly: the ``exit`` helper is injected by the
        # ``site`` module for interactive use and is absent under ``python -S``.
        raise SystemExit("ERROR: load needs both a VCF file and a database file\n")
    if args.anno_type not in ['snpEff', 'VEP', None]:
        parser.print_help()
        raise SystemExit("\nERROR: Unsupported selection for -t\n")

    # Collect the additional annotation files.
    annotations.load_annos(args)

    # Create a new gemini loader and populate the gemini db and files
    # from the VCF.
    gemini_loader = GeminiLoader(args)
    gemini_loader.store_resources()
    gemini_loader.store_version()
    gemini_loader.store_vcf_header()
    gemini_loader.populate_from_vcf()
    gemini_loader.update_gene_table()
    # gemini_loader.build_indices_and_disconnect()

    if not args.no_genotypes and not args.no_load_genotypes:
        gemini_loader.store_sample_gt_counts()
Ejemplo n.º 14
0
def load(parser, args):
    """Load a VCF, store the variant count and log how long each phase took.

    Steps, in order: validate ``args.vcf``; disable CADD / GERP-per-bp
    loading when their annotation files are not reported by
    ``annotations.get_anno_files``; load the annotation files; on node 1
    only, run ``GeminiLoader.setup_db()`` and ``single_core_stuff()``; run the
    multi-core or single-core loader; pass the variant count to
    ``insert_n_variants``; print the total time and, when ``args.timing_log``
    is set, append a CSV line of timings to that file.

    Args:
        parser: object whose ``print_help()`` is called when the VCF is
            missing.
        args: parsed options. Reads ``vcf``, ``skip_cadd``, ``skip_gerp_bp``,
            ``node_num``, ``cores``, ``contact_points``, ``keyspace``,
            ``timing_log`` and ``exp_id``; may set ``skip_cadd`` and
            ``skip_gerp_bp`` to True.

    Raises:
        SystemExit: when ``args.vcf`` is None.
    """
    # Only the VCF is required here; no database file argument is checked.
    if args.vcf is None:
        parser.print_help()
        # sys.exit instead of the site-provided ``exit`` helper, which is
        # intended for interactive sessions only.
        sys.exit("ERROR: load needs a VCF file\n")

    start_time = time.time()
    annos = annotations.get_anno_files(args)
    # Force skipping CADD and GERP if the data files have not been installed.
    if args.skip_cadd is False:
        if 'cadd_score' not in annos:
            sys.stderr.write(
                "\nCADD scores are not being loaded because the"
                " annotation file could not be found.\n"
                "`Run geminicassandra update --dataonly --extra cadd_score`"
                " to install the annotation file.\n\n")
            args.skip_cadd = True
        else:
            sys.stderr.write(
                "CADD scores are being loaded (to skip use:--skip-cadd).\n")
    if args.skip_gerp_bp is False:
        if 'gerp_bp' not in annos:
            sys.stderr.write(
                "\nGERP per bp is not being loaded because the annotation file"
                " could not be found.\n    Run `geminicassandra update --dataonly --extra gerp_bp`"
                " to install the annotation file.\n\n")
            args.skip_gerp_bp = True
        else:
            sys.stderr.write(
                "GERP per bp is being loaded (to skip use:--skip-gerp-bp).\n")

    # Collect the additional annotation files.
    annotations.load_annos(args)

    # Both marks default to start_time so that, on nodes other than node 1,
    # the db-creation and single-core phases are reported as 0 seconds.
    time_2 = start_time
    time_3 = start_time

    if args.node_num == 1:
        gemini_loader = GeminiLoader(args)
        gemini_loader.setup_db()
        time_2 = time.time()
        gemini_loader.single_core_stuff()
        time_3 = time.time()

    if args.cores > 1:
        n_variants = load_multicore(args)
    else:
        n_variants = load_singlecore(args)

    # NOTE(review): ``strip`` is not defined in this function; presumably a
    # module-level import such as string.strip -- confirm.
    insert_n_variants(map(strip, args.contact_points.split(',')),
                      args.keyspace, n_variants)

    end_time = time.time()
    total_time = str(end_time - start_time)
    db_creation_time = str(time_2 - start_time)
    single_core_time = str(time_3 - time_2)
    parallel_time = str(end_time - time_3)
    # Call form with a single argument prints the same text on Python 2 and 3.
    print("Finished loading in %s s" % total_time)
    if args.timing_log is not None:
        with open(args.timing_log, "a") as myfile:
            myfile.write(",".join([
                args.exp_id, total_time, db_creation_time, single_core_time,
                parallel_time
            ]) + "\n")