Example #1
def _annotate_variants(args, conn, metadata, get_val_fn, col_names=None, col_types=None, col_ops=None):
    """Generalized annotation of variants with a new column.

    get_val_fn takes a list of annotations in a region and returns
    the value for that region to update the database with.

    Separates selection and identification of values from update,
    to avoid concurrent database access errors from sqlite, especially on
    NFS systems. The retained to_update list is small, but batching
    could help if memory issues emerge.
    """
    # For each variant, use Tabix to detect overlaps with the user-defined
    # annotation file. Update the variant row with T/F if overlaps are found.
    anno = pysam.Tabixfile(args.anno_file)
    naming = guess_contig_naming(anno)
    cursor = conn.bind.connect()
    add_requested_columns(args, cursor, col_names, col_types)
    conn.commit()
    cursor.close()

    conn, metadata = database.get_session_metadata(str(conn.bind.url))
    cursor = conn.bind.connect()

    last_id = 0
    current_id = 0
    total = 0
    CHUNK_SIZE = 100000
    to_update = []

    select_res = cursor.execution_options(stream_results=True).execute('''SELECT chrom, start, end, ref, alt, variant_id FROM variants''')
    while True:
        for row in select_res.fetchmany(CHUNK_SIZE):

            # update_data starts out as a list of the values that should
            # be used to populate the new columns for the current row.
            # Prefer no pysam parsing over tuple parsing to work around bug in pysam 0.8.0
            # https://github.com/pysam-developers/pysam/pull/44
            if args.anno_file.endswith(('.vcf', '.vcf.gz')):
                update_data = get_val_fn(annotations_in_vcf(row, anno, None, naming, args.region_only, True))
            else:
                update_data = get_val_fn(annotations_in_region(row, anno, None, naming))
            #update_data = get_val_fn(annotations_in_region(row, anno, "tuple", naming))
            # were there any hits for this row?
            if len(update_data) > 0:
                # we add the primary key to update_data for the
                # where clause in the SQL UPDATE statement.
                update_data.append(str(row["variant_id"]))
                to_update.append(tuple(update_data))

            current_id = row["variant_id"]

        if current_id <= last_id:
            break
        else:
            _update_variants(metadata, to_update, col_names, cursor)

            total += len(to_update)
            print "updated", total, "variants"
            last_id = current_id
        to_update = []
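
The docstring leaves the shape of get_val_fn implicit. Below is a minimal sketch of one for the boolean case, written against the update loop above: it returns a non-empty value list only when the variant overlaps the annotation, so rows with no hits are never queued. The helper and column names are hypothetical.

def _has_hit(hits):
    # value list for the new column: [1] if any annotation overlaps
    # the variant; [] so that no UPDATE is queued when there are no hits
    return [1] if len(list(hits)) > 0 else []

# hypothetical wiring, mirroring the signature of _annotate_variants:
# _annotate_variants(args, conn, metadata, _has_hit,
#                    col_names=['in_my_anno'], col_types=['integer'])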
Example #2
def windower(parser, args):
    check_dependencies("windower", [["bedtools", "--version"]])

    conn, metadata = database.get_session_metadata(args.db)
    pid = os.getpid()
    temp_file = ".".join(['.temp', str(pid)])
    make_windows(conn, args, temp_file)
Example #3
def load(db, query=None):
    import database

    t0 = time.time()
    conn, metadata = database.get_session_metadata(db)

    gt_cols = get_gt_cols(metadata)
    samples = get_samples(metadata)
    bcpath = get_bcolz_dir(db)

    carrays = {}
    n = 0
    for gtc in gt_cols:
        if not gtc in query: continue
        carrays[gtc] = []
        for s in samples:
            if not s in query and not fix_sample_name(s) in query:
                # need to add anyway as place-holder
                carrays[gtc].append(None)
                continue
            path = "%s/%s/%s" % (bcpath, s, gtc)
            if os.path.exists(path):
                carrays[gtc].append(bcolz.open(path, mode="r"))
                n += 1
    if os.environ.get("GEMINI_DEBUG") == "TRUE":
        print >>sys.stderr, "it took %.2f seconds to load %d arrays" \
            % (time.time() - t0, n)
    return carrays
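
A hedged usage sketch, assuming the database has already been indexed with gemini bcolz_index (the path and sample name are hypothetical):

carrays = load("my.db", query="gt_types__NA12878 == HET")
# carrays maps each genotype column named in the query to a per-sample
# list of bcolz carrays; samples absent from the query get a None
# place-holder so positional indexing by sample still lines up.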
Example #4
def pathways(parser, args):
    import database

    conn, metadata = database.get_session_metadata(args.db)

    if (not args.lof):
        get_ind_pathways(conn, metadata, args)
    else:
        get_ind_lof_pathways(conn, metadata, args)
Example #5
def db_info(parser, args):

    conn, metadata = database.get_session_metadata(args.db)
    # column widths for the output
    out_template = "{0:20}{1:30}{2:10}"

    # header
    print out_template.format("table_name", "column_name", "type")
    for table in ['variants', 'variant_impacts', 'samples', 'gene_detailed', 'gene_summary']:
        get_table_info(metadata, table, out_template)
Example #6
def dump(parser, args):

    conn, metadata = database.get_session_metadata(args.db)

    if args.variants:
        get_variants(conn, metadata, args)
    elif args.genotypes:
        get_genotypes(conn, metadata, args)
    elif args.samples:
        get_samples(conn, metadata, args)
    elif args.tfam:
        tfam(args)
Example #7
def db_info(parser, args):

    conn, metadata = database.get_session_metadata(args.db)
    # column widths for the output
    out_template = "{0:20}{1:30}{2:10}"

    # header
    print out_template.format("table_name", "column_name", "type")
    for table in [
            'variants', 'variant_impacts', 'samples', 'gene_detailed',
            'gene_summary'
    ]:
        get_table_info(metadata, table, out_template)
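
The template pads its three fields to 20, 30, and 10 characters, so the header and every row print as aligned columns; a quick check:

out_template = "{0:20}{1:30}{2:10}"
print(out_template.format("table_name", "column_name", "type"))
# table_name          column_name                   type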
Example #8
def get_families(db, selected_families=None):
    """
    Query the samples table to return a list of Family
    objects that each contain all of the Subjects in a Family.
    """
    conn, metadata = database.get_session_metadata(db)

    families_dict = Family.from_cursor(conn)

    # if the user has specified a set of selected families
    # to which the analysis should be restricted, then
    # first sanity check that the family ids they specified are valid.
    if selected_families is not None:
        for family in selected_families.split(','):
            if family not in families_dict:
                sys.exit("ERROR: family \"%s\" is not a valid family_id\n" % family)

    families = []
    for fam in families_dict:
        if selected_families is None or fam in selected_families:
            families.append(families_dict[fam])
    return families
Example #9
def get_families(db, selected_families=None):
    """
    Query the samples table to return a list of Family
    objects that each contain all of the Subjects in a Family.
    """
    conn, metadata = database.get_session_metadata(db)

    families_dict = Family.from_cursor(conn)

    # if the user has specified a set of selected families
    # to which the analysis should be restricted, then
    # first sanity check that the family ids they specified are valid.
    if selected_families is not None:
        for family in selected_families.split(','):
            if family not in families_dict:
                raise ValueError("Family \"%s\" is not a valid family_id\n" % family)

    families = []
    for fam in families_dict:
        if selected_families is None or fam in selected_families:
            families.append(families_dict[fam])
    return families
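
A hypothetical call restricting the analysis to two family ids (the attribute read from Family is an assumption):

families = get_families("my.db", selected_families="fam1,fam2")
for fam in families:
    print(fam.family_id)  # assuming Family exposes family_id

Note that the final membership test, fam in selected_families, checks against the raw comma-separated string, so it behaves as a substring match rather than an exact id comparison.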
Example #10
    def __init__(self, args, buffer_size=10000, prepare_db=True):
        self.args = args
        self.seen_multi = False

        # create a reader for the VCF file
        self.vcf_reader = self._get_vcf_reader()
        # CSQ fields that already map to dedicated columns; any other
        # VEP field becomes an extra vep_* column below.
        expected = "consequence,codons,amino_acids,gene,symbol,feature,exon,polyphen,sift,protein_position,biotype,warning".split(",")

        if self.args.anno_type == "VEP":
            self._effect_fields = self._get_vep_csq(self.vcf_reader)
            # tuples of (db_column, CSQ name)
            self._extra_effect_fields = [("vep_%s" % x.lower(), x) for x in self._effect_fields if not x.lower() in expected]

        else:
            self._effect_fields = []
            self._extra_effect_fields = []
        if not prepare_db:
            self.c, self.metadata = database.get_session_metadata(args.db)
            return
        self._create_db([x[0] for x in self._extra_effect_fields])

        self._extra_empty = dict((x[0], None) for x in self._extra_effect_fields)

        if not self.args.no_genotypes and not self.args.no_load_genotypes:
            # load the sample info from the VCF file.
            self._prepare_samples()
            # initialize genotype counts for each sample
            self._init_sample_gt_counts()
            self.num_samples = len(self.samples)
        else:
            self.num_samples = 0

        self.clinvar_chrom_gene_lookup = load_clinvar(annotations.get_anno_files(self.args)['clinvar'])

        self.buffer_size = buffer_size
        self._get_anno_version()
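
A small worked example of the CSQ bookkeeping above: any VEP field outside the expected set becomes an extra vep_-prefixed database column (the field names here are hypothetical):

effect_fields = ["Consequence", "SYMBOL", "LoF"]
expected = ["consequence", "symbol"]
extra = [("vep_%s" % x.lower(), x) for x in effect_fields
         if not x.lower() in expected]
print(extra)  # [('vep_lof', 'LoF')]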
Example #11
def stats(parser, args):

    import database
    conn, metadata = database.get_session_metadata(args.db)

    if args.tstv:
        get_tstv(conn, metadata, args)
    elif args.tstv_coding:
        get_tstv_coding(conn, metadata, args)
    elif args.tstv_noncoding:
        get_tstv_noncoding(conn, metadata, args)
    elif args.snp_counts:
        get_snpcounts(conn, metadata, args)
    elif args.sfs:
        get_sfs(conn, metadata, args)
    elif args.variants_by_sample:
        get_variants_by_sample(conn, metadata, args)
    elif args.genotypes_by_sample:
        get_gtcounts_by_sample(conn, metadata, args)
    elif args.mds:
        get_mds(conn, metadata, args)
    elif args.query:
        summarize_query_by_sample(args)
Example #12
    try:
        if res.shape[0] == 1 and len(res.shape) > 1:
            res = res[0]
    except AttributeError:
        return []
    variant_ids, = np.where(res)
    #variant_ids = np.array(list(bcolz.eval(query, user_dict=user_dict,
    #    vm="numexpr").wheretrue()))
    # variant ids are 1-based.
    if len(variant_ids) > 0:
        return 1 + variant_ids
    else:
        return []

if __name__ == "__main__":

    db = sys.argv[1]
    #create(sys.argv[1])
    carrays = load(db)
    conn, metadata = database.get_session_metadata(db)
    if len(sys.argv) > 2:
        q = sys.argv[2]
    else:
        q = "gt_types.1094PC0012 == HET and gt_types.1719PC0016 == HET and gts.1094PC0012 == 'A/C'"

    print filter(db, carrays, q, user_dict=dict(HET=1, HOM_REF=0, HOM_ALT=3,
        UNKNOWN=2))
    print "compare to:", ("""gemini query -q "select variant_id, gts.1719PC0016 from variants" """
                          """ --gt-filter "%s" %s""" % (q, db))

Example #13
def annotate(parser, args):
    check_dependencies("annotate", [["tabix", "-h"],
                                    ["bgzip", "-h"]])
    def _validate_args(args):
        if (args.col_operations or args.col_types or args.col_extracts):
            raise ValueError('You must not specify a column type (-t), operation (-o), '
                     'or extract (-e) when using \"-a boolean\" or \"-a count\".\n')

        col_names = args.col_names.split(',')
        if len(col_names) > 1:
            raise ValueError('You may only specify a single column name (-c) '
                     'when using \"-a boolean\" or \"-a count\".\n')

        if not args.anno_file.endswith(('.vcf', '.vcf.gz')) and args.region_only and parser is not None:
            raise ValueError('You may only specify --region-only when annotation is a VCF.')

        return col_names

    def _validate_extract_args(args):
        if args.anno_file.endswith(('.vcf', '.vcf.gz')):
            if not args.col_names:
                args.col_names = args.col_extracts
            elif not args.col_extracts:
                args.col_extracts = args.col_names
        elif args.region_only and parser is not None:
            raise ValueError('You may only specify --region-only when annotation is a VCF.')

        if not args.col_types:
            raise ValueError('You must specify the column types (\"-t\").\n')
        col_ops = args.col_operations.split(',')
        col_idxs = args.col_extracts.split(',')

        col_names = args.col_names.split(',')
        col_types = args.col_types.split(',')

        supported_types = ['text', 'float', 'integer']
        for col_type in col_types:
            if col_type not in supported_types:
                raise ValueError('Column type [%s] not supported.\n' %
                         (col_type))

        supported_ops = op_funcs.keys()

        for col_op in col_ops:
            if col_op not in supported_ops:
                raise ValueError('Column operation [%s] not supported.\n' %
                         (col_op))

        if not (len(col_ops) == len(col_names) ==
                len(col_types) == len(col_idxs)):
            raise ValueError('The number of column names, numbers, types, and '
                     'operations must match: [%s], [%s], [%s], [%s]\n' %
                     (args.col_names, args.col_extracts, args.col_types, args.col_operations))

        return col_names, col_types, col_ops, col_idxs

    if (args.db is None):
        parser.print_help()
        exit(1)
    if not os.path.exists(args.anno_file):
        sys.stderr.write("Error: cannot find annotation file.")
        exit(1)

    conn, metadata = database.get_session_metadata(args.db)

    if args.anno_type == "boolean":
        col_names = _validate_args(args)
        annotate_variants_bool(args, conn, metadata, col_names)
    elif args.anno_type == "count":
        col_names = _validate_args(args)
        annotate_variants_count(args, conn, metadata, col_names)
    elif args.anno_type == "extract":
        if args.col_extracts is None and not args.anno_file.endswith('.vcf.gz'):
            raise RuntimeError("You must specify which column to "
                               "extract from your annotation file.")
        else:
            col_names, col_types, col_ops, col_idxs = _validate_extract_args(args)
            annotate_variants_extract(args, conn, metadata, col_names, col_types, col_ops, col_idxs)
    else:
        raise RuntimeError("Unknown column type requested. Exiting.")

    conn.close()

    # index on the newly created columns
    for col_name in col_names:
        with database_transaction(args.db) as c:
            c.execute('''drop index if exists %s''' % (col_name + "idx"))
            c.execute('''create index %s on variants(%s)''' % (col_name + "idx", col_name))
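
The index name is simply the column name with an idx suffix, so for a hypothetical column af the loop above issues:

col_name = "af"
print('drop index if exists %s' % (col_name + "idx"))
print('create index %s on variants(%s)' % (col_name + "idx", col_name))
# drop index if exists afidx
# create index afidx on variants(af)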
Example #14
def lofgenequery(parser, args):
    conn, metadata = database.get_session_metadata(args.db)
    samples = sample_variants(conn, metadata, args)
    sample_lof_variants(conn, metadata, args, samples)
Example #15
def annotate(parser, args):
    check_dependencies("annotate", [["tabix", "-h"], ["bgzip", "-h"]])

    def _validate_args(args):
        if (args.col_operations or args.col_types or args.col_extracts):
            raise ValueError(
                'You must not specify a column type (-t), op (-o) or extract (-e) when '
                'using \"-a boolean\" or \"-a count\".\n')

        col_names = args.col_names.split(',')
        if len(col_names) > 1:
            raise ValueError('You may only specify a single column name (-c) '
                             'when using \"-a boolean\" or \"-a count\".\n')

        if not args.anno_file.endswith(
            ('.vcf', '.vcf.gz')) and args.region_only and parser is not None:
            raise ValueError(
                'You may only specify --region-only when annotation is a VCF.')

        return col_names

    def _validate_extract_args(args):
        if args.anno_file.endswith(('.vcf', '.vcf.gz')):
            if not args.col_names:
                args.col_names = args.col_extracts
            elif not args.col_extracts:
                args.col_extracts = args.col_names
        elif args.region_only and parser is not None:
            raise ValueError(
                'You may only specify --region-only when annotation is a VCF.'
            )

        if not args.col_types:
            raise ValueError('You must specify the column types (\"-t\").\n')
        col_ops = args.col_operations.split(',')
        col_idxs = args.col_extracts.split(',')

        col_names = args.col_names.split(',')
        col_types = args.col_types.split(',')

        supported_types = ['text', 'float', 'integer']
        for col_type in col_types:
            if col_type not in supported_types:
                raise ValueError('Column type [%s] not supported.\n' %
                                 (col_type))

        supported_ops = op_funcs.keys()

        for col_op in col_ops:
            if col_op not in supported_ops:
                raise ValueError('Column operation [%s] not supported.\n' %
                                 (col_op))

        if not (len(col_ops) == len(col_names) == len(col_types) ==
                len(col_idxs)):
            raise ValueError(
                'The number of column names, numbers, types, and '
                'operations must match: [%s], [%s], [%s], [%s]\n' %
                (args.col_names, args.col_extracts, args.col_types,
                 args.col_operations))

        return col_names, col_types, col_ops, col_idxs

    if (args.db is None):
        parser.print_help()
        exit(1)
    if not os.path.exists(args.anno_file):
        sys.stderr.write("Error: cannot find annotation file.")
        exit(1)

    conn, metadata = database.get_session_metadata(args.db)

    if args.anno_type == "boolean":
        col_names = _validate_args(args)
        annotate_variants_bool(args, conn, metadata, col_names)
    elif args.anno_type == "count":
        col_names = _validate_args(args)
        annotate_variants_count(args, conn, metadata, col_names)
    elif args.anno_type == "extract":
        if args.col_extracts is None and not args.anno_file.endswith(
                '.vcf.gz'):
            raise RuntimeError("You must specify which column to "
                               "extract from your annotation file.")
        else:
            col_names, col_types, col_ops, col_idxs = _validate_extract_args(
                args)
            annotate_variants_extract(args, conn, metadata, col_names,
                                      col_types, col_ops, col_idxs)
    else:
        raise RuntimeError("Unknown column type requested. Exiting.")

    conn.close()

    # index on the newly created columns
    for col_name in col_names:
        with database_transaction(args.db) as c:
            c.execute('''drop index if exists %s''' % (col_name + "idx"))
            c.execute('''create index %s on variants(%s)''' %
                      (col_name + "idx", col_name))
Example #16
def lof_sieve(parser, args):

    import database
    conn, metadata = database.get_session_metadata(args.db)
    get_ind_lof(conn, metadata, args)
Example #17
    except AttributeError:
        return []
    variant_ids, = np.where(res)
    #variant_ids = np.array(list(bcolz.eval(query, user_dict=user_dict,
    #    vm="numexpr").wheretrue()))
    # variant ids are 1-based.
    if len(variant_ids) > 0:
        return 1 + variant_ids
    else:
        return []


if __name__ == "__main__":

    db = sys.argv[1]
    #create(sys.argv[1])
    carrays = load(db)
    conn, metadata = database.get_session_metadata(db)
    if len(sys.argv) > 2:
        q = sys.argv[2]
    else:
        q = "gt_types.1094PC0012 == HET and gt_types.1719PC0016 == HET and gts.1094PC0012 == 'A/C'"

    print filter(db,
                 carrays,
                 q,
                 user_dict=dict(HET=1, HOM_REF=0, HOM_ALT=3, UNKNOWN=2))
    print "compare to:", (
        """gemini query -q "select variant_id, gts.1719PC0016 from variants" """
        """ --gt-filter "%s" %s""" % (q, db))
Example #18
def filter(db, query, user_dict):
    # these should be translated to a bunch of or/and statements within gemini
    # so they are supported, but must be translated before getting here.
    if query == "False" or query is None or query is False:
        return []
    if "any(" in query or "all(" in query or \
       ("sum(" in query and not query.startswith("sum(") and query.count("sum(") == 1):
        return None
    user_dict['where'] = np.where

    if query.startswith("not "):
        # "~" is not to numexpr.
        query = "~" + query[4:]
    sum_cmp = False
    if query.startswith("sum("):
        assert query[-1].isdigit()
        query, sum_cmp = query[4:].rsplit(")", 1)
        query = "(%s) %s" % (query, sum_cmp)

    query = query.replace(".", "__")
    query = " & ".join("(%s)" % token for token in query.split(" and "))
    query = " | ".join("(%s)" % token for token in query.split(" or "))

    import database
    conn, metadata = database.get_session_metadata(db)
    samples = get_samples(metadata)
    # convert gt_col[index] to gt_col__sample_name
    patt = "(%s)\[(\d+)\]" % "|".join((g[0] for g in gt_cols_types))

    def subfn(x):
        """Turn gt_types[1] into gt_types__sample"""
        field, idx = x.groups()
        return "%s__%s" % (field, fix_sample_name(samples[int(idx)]))

    query = re.sub(patt, subfn, query)
    if os.environ.get('GEMINI_DEBUG') == 'TRUE':
        print >> sys.stderr, query[:250] + "..."
    carrays = load(db, query=query)

    if len(carrays) == 0 or max(len(carrays[c]) for c in carrays) == 0 or \
       any(not any(carrays[c]) for c in carrays):
        # need this 2nd check above because of the place-holders in load()
        raise NoGTIndexException

    # loop through and create a cache of "$gt__$sample"
    for gt_col in carrays:
        if not gt_col in query: continue
        for i, sample_array in enumerate(carrays[gt_col]):
            sample = fix_sample_name(samples[i])
            if not sample in query: continue
            user_dict["%s__%s" % (gt_col, sample)] = sample_array

    # had to special-case count. it won't quite be as efficient
    if "|count|" in query:
        tokens = query[2:-2].split("|count|")
        icmp = tokens[-1]
        # a list of carrays, so not in memory.
        res = [bcolz.eval(tok, user_dict=user_dict) for tok in tokens[:-1]]
        # in memory after this, but just a single axis array.
        res = np.sum(res, axis=0)
        res = ne.evaluate('res%s' % icmp)
    else:
        res = bcolz.eval(query, user_dict=user_dict)

    try:
        if res.shape[0] == 1 and len(res.shape) > 1:
            res = res[0]
    except AttributeError:
        return []
    variant_ids, = np.where(res)
    #variant_ids = np.array(list(bcolz.eval(query, user_dict=user_dict,
    #    vm="numexpr").wheretrue()))
    # variant ids are 1-based.
    if len(variant_ids) > 0:
        return 1 + variant_ids
    else:
        return []
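
The string rewriting at the top of filter is easiest to see on a concrete query (the sample name is hypothetical); HET and friends are resolved later through user_dict, e.g. dict(HET=1, HOM_REF=0, HOM_ALT=3, UNKNOWN=2) as in the __main__ block shown earlier:

query = "gt_types.NA12878 == HET and gts.NA12878 == 'A/C'"
query = query.replace(".", "__")
query = " & ".join("(%s)" % token for token in query.split(" and "))
query = " | ".join("(%s)" % token for token in query.split(" or "))
print(query)
# ((gt_types__NA12878 == HET) & (gts__NA12878 == 'A/C'))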
Example #19
def tag_somatic_mutations(args):

    t_n_pairs = gemini_subjects.get_families(args.db)

    gq = GeminiQuery.GeminiQuery(args.db)

    depth_string, qual_string, ssc_string, chrom_string = ("", "", "", "")
    if args.min_depth:
        depth_string = " AND depth >= %s" % args.min_depth
    if args.min_qual:
        qual_string = " AND qual >= %s" % args.min_qual
    if args.min_somatic_score:
        ssc_string = " AND (type='sv' \
                         OR somatic_score >= %s)" % args.min_somatic_score
    if args.chrom:
        chrom_string = " AND chrom = '%s'" % args.chrom

    # chrom_string already restricts to the requested chromosome (or is
    # empty), so a single query covers both cases.
    query = "SELECT variant_id, chrom, start, end, \
                    ref, alt, gene, impact, gts, gt_types, \
                    gt_ref_depths, gt_alt_depths \
             FROM variants \
             WHERE 1 \
             %s \
             %s \
             %s \
             %s" % (depth_string, qual_string, ssc_string, chrom_string)

    gq.run(query)
    smp2idx = gq.sample_to_idx

    somatic_counter = 0
    somatic_v_ids = []

    if args.dry_run:
        print '\t'.join(['tum_name', 'tum_gt', 'tum_alt_freq', 'tum_alt_depth', 'tum_depth', \
                        'nrm_name', 'nrm_gt', 'nrm_alt_freq', 'nrm_alt_depth', 'nrm_depth',
                        'chrom', 'start', 'end', 'ref', 'alt', 'gene'])

    for row in gq:
        # we can skip variants where all genotypes are identical
        if len(set(row['gt_types'])) == 1:
            continue

        for pair in t_n_pairs:

            samples = pair.subjects
            if len(samples) != 2:
                continue

            tumor = pair.subjects[0]
            normal = pair.subjects[1]
            # swap if we guessed the tumor incorrectly
            if tumor.affected is False:
                tumor, normal = normal, tumor

            tum_idx = smp2idx[tumor.name]
            nrm_idx = smp2idx[normal.name]

            tum_gt = row['gts'][tum_idx]
            nrm_gt = row['gts'][nrm_idx]

            tum_gt_type = row['gt_types'][tum_idx]
            nrm_gt_type = row['gt_types'][nrm_idx]

            if nrm_gt_type == tum_gt_type:
                continue

            if nrm_gt_type == UNKNOWN or tum_gt_type == UNKNOWN:
                continue

            # the genotypes pass the smell test for somatic
            # mutations if in this block.
            if (nrm_gt_type == HOM_REF and tum_gt_type != HOM_REF):

               tum_ref_depth = row['gt_ref_depths'][tum_idx]
               nrm_ref_depth = row['gt_ref_depths'][nrm_idx]

               tum_alt_depth = row['gt_alt_depths'][tum_idx]
               nrm_alt_depth = row['gt_alt_depths'][nrm_idx]

               # total observed depth
               nrm_depth = nrm_alt_depth + nrm_ref_depth
               tum_depth = tum_alt_depth + tum_ref_depth

               if (nrm_depth < args.min_norm_depth \
                  or \
                  tum_depth < args.min_tumor_depth):
                  continue

               try:
                   tum_alt_freq = float(tum_alt_depth) / \
                                  (float(tum_alt_depth) + float(tum_ref_depth))
               except ZeroDivisionError:
                   tum_alt_freq = 'NA'

               try:
                   nrm_alt_freq = float(nrm_alt_depth) / \
                                  (float(nrm_alt_depth) + float(nrm_ref_depth))
               except ZeroDivisionError:
                   nrm_alt_freq = 'NA'

               # apply evidence thresholds.
               if (args.max_norm_alt_freq and nrm_alt_freq > args.max_norm_alt_freq) \
                  or \
                  (args.max_norm_alt_count and nrm_alt_depth > args.max_norm_alt_count):
                  continue

               somatic_counter += 1
               somatic_v_ids.append((1, row['variant_id']))

               print '\t'.join(str(s) for s in [tumor.name, tum_gt, tum_alt_freq, tum_alt_depth, tum_depth, \
                                   normal.name, nrm_gt, nrm_alt_freq, nrm_alt_depth, nrm_depth, \
                                   row['chrom'], row['start'], row['end'], row['ref'], row['alt'], row['gene']])

    if not args.dry_run:
        import database
        conn, metadata = database.get_session_metadata(args.db)

        # now set the identified mutations to True.
        update_qry = "UPDATE variants SET is_somatic = 1 "
        update_qry += " WHERE variant_id IN (%s)"
        update_qry %= ",".join(str(x[1]) for x in somatic_v_ids)
        res = conn.execute(update_qry)
        assert res.rowcount == somatic_counter
        print "Identified and set", somatic_counter, "somatic mutations"
        conn.commit()
    else:
        print "Would have identified and set", somatic_counter, "somatic mutations"
Example #20
def create(db, cols=None):
    if cols is None:
        cols = [x[0] for x in gt_cols_types if x[0] != 'gts']
        print >> sys.stderr, (
            "indexing all columns except 'gts'; to index that column, "
            "run gemini bcolz_index %s --cols gts" % db)

    conn, metadata = database.get_session_metadata(db)
    gt_cols = [x for x in get_gt_cols(metadata) if x in cols]
    samples = get_samples(metadata)
    bcpath = get_bcolz_dir(db)

    mkdir(bcpath)

    nv = get_n_variants(conn)

    sys.stderr.write("loading %i variants for %i samples into bcolz\n" %
                     (nv, len(samples)))

    if nv == 0 or len(samples) == 0:
        return

    carrays = {}
    tmps = {}
    try:
        for gtc in gt_cols:
            carrays[gtc] = []
            tmps[gtc] = []

            dt = dict(gt_cols_types)[gtc]
            for s in samples:
                mkdir("%s/%s" % (bcpath, s))
                carrays[gtc].append(
                    bcolz.carray(np.empty(0, dtype=dt),
                                 expectedlen=nv,
                                 rootdir="%s/%s/%s" % (bcpath, s, gtc),
                                 chunklen=16384 * 8,
                                 mode="w"))
                tmps[gtc].append([])

        t0 = time.time()
        # scale step by number of samples to limit memory use.
        step = max(100, 2000000 / len(samples))
        sys.stderr.write("step-size: %i\n" % step)
        del gtc
        decomp = compression.unpack_genotype_blob

        empty = [-1] * len(samples)
        for i, row in enumerate(
                conn.execute(
                    sql.text("select %s from variants" % ", ".join(gt_cols)))):
            if i == 0:
                try:
                    decomp(row[0])
                except zlib.error:
                    decomp = compression.snappy_unpack_blob

            for j, gt_col in enumerate(gt_cols):
                vals = decomp(row[j])
                if vals is None or len(vals) == 0:  # empty gt_phred_ll
                    vals = empty
                for isamp, sample in enumerate(samples):
                    tmps[gt_col][isamp].append(vals[isamp])
                    if (i > 0 and i % step == 0) or i == nv - 1:
                        carrays[gt_col][isamp].append(tmps[gt_col][isamp])
                        tmps[gt_col][isamp] = []
                        carrays[gt_col][isamp].flush()

            if i % step == 0 and i > 0:
                print >> sys.stderr, "at %.1fM (%.0f rows / second)" % (
                    i / 1000000., i / float(time.time() - t0))

        t = float(time.time() - t0)
        print >> sys.stderr, "loaded %d variants at %.1f / second" % (len(
            carrays[gt_col][0]), nv / t)
    except:
        # on error, we remove the dirs so we can't have weird problems.
        for k, li in carrays.items():
            for i, ca in enumerate(li):
                if i < 5:
                    print >> sys.stderr, "removing:", ca.rootdir
                if i == 5:
                    print >> sys.stderr, "not reporting further removals for %s" % k
                ca.flush()
                shutil.rmtree(ca.rootdir)
        raise
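
The step-size heuristic above trades memory for flush frequency; worked through for a hypothetical cohort:

n_samples = 1000
step = max(100, 2000000 // n_samples)   # buffers 2000 rows per flush
# with 50000 samples, 2000000 // 50000 == 40, so the floor keeps step at 100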
Example #21
def create(db, cols=None):
    if cols is None:
        cols = [x[0] for x in gt_cols_types if x[0] != 'gts']
        print >>sys.stderr, (
                "indexing all columns except 'gts'; to index that column, "
                "run gemini bcolz_index %s --cols gts" % db)

    conn, metadata = database.get_session_metadata(db)
    gt_cols = [x for x in get_gt_cols(metadata) if x in cols]
    samples = get_samples(metadata)
    bcpath = get_bcolz_dir(db)

    mkdir(bcpath)

    nv = get_n_variants(conn)

    sys.stderr.write("loading %i variants for %i samples into bcolz\n"
                     % (nv, len(samples)))

    if nv == 0 or len(samples) == 0:
        return

    carrays = {}
    tmps = {}
    try:
        for gtc in gt_cols:
            carrays[gtc] = []
            tmps[gtc] = []

            dt = dict(gt_cols_types)[gtc]
            for s in samples:
                mkdir("%s/%s" % (bcpath, s))
                carrays[gtc].append(bcolz.carray(np.empty(0, dtype=dt),
                                    expectedlen=nv,
                                    rootdir="%s/%s/%s" % (bcpath, s, gtc),
                                    chunklen=16384*8,
                                    mode="w"))
                tmps[gtc].append([])

        t0 = time.time()
        # scale step by number of samples to limit memory use.
        step = max(100, 2000000 / len(samples))
        sys.stderr.write("step-size: %i\n" % step)
        del gtc
        # choose the genotype-blob decompressor; fall back to snappy
        # on the first row, as in the variant of this function above.
        decomp = compression.unpack_genotype_blob

        empty = [-1] * len(samples)
        for i, row in enumerate(conn.execute(sql.text("select %s from variants" % ", ".join(gt_cols)))):
            if i == 0:
                try:
                    decomp(row[0])
                except zlib.error:
                    decomp = compression.snappy_unpack_blob
            for j, gt_col in enumerate(gt_cols):
                vals = decomp(row[j])
                if vals is None or len(vals) == 0:  # empty gt_phred_ll
                    vals = empty
                for isamp, sample in enumerate(samples):
                    tmps[gt_col][isamp].append(vals[isamp])
                    if (i > 0 and i % step == 0) or i == nv - 1:
                        carrays[gt_col][isamp].append(tmps[gt_col][isamp])
                        tmps[gt_col][isamp] = []
                        carrays[gt_col][isamp].flush()

            if i % step == 0 and i > 0:
                print >>sys.stderr, "at %.1fM (%.0f rows / second)" % (i / 1000000., i / float(time.time() - t0))

        t = float(time.time() - t0)
        print >>sys.stderr, "loaded %d variants at %.1f / second" % (len(carrays[gt_col][0]), nv / t)
    except:
        # on error, we remove the dirs so we can't have weird problems.
        for k, li in carrays.items():
            for i, ca in enumerate(li):
                if i < 5:
                    print >>sys.stderr, "removing:", ca.rootdir
                if i == 5:
                    print >>sys.stderr, "not reporting further removals for %s" % k
                ca.flush()
                shutil.rmtree(ca.rootdir)
        raise
Example #22
def filter(db, query, user_dict):
    # these should be translated to a bunch of or/and statements within gemini
    # so they are supported, but must be translated before getting here.
    if query == "False" or query is None or query is False:
        return []
    if "any(" in query or "all(" in query or \
       ("sum(" in query and not query.startswith("sum(") and query.count("sum(") == 1):
        return None
    user_dict['where'] = np.where

    if query.startswith("not "):
        # "~" is not to numexpr.
        query = "~" + query[4:]
    sum_cmp = False
    if query.startswith("sum("):
        assert query[-1].isdigit()
        query, sum_cmp = query[4:].rsplit(")", 1)
        query = "(%s) %s" % (query, sum_cmp)

    query = query.replace(".", "__")
    query = " & ".join("(%s)" % token for token in query.split(" and "))
    query = " | ".join("(%s)" % token for token in query.split(" or "))

    import database
    conn, metadata = database.get_session_metadata(db)
    samples = get_samples(metadata)
    # convert gt_col[index] to gt_col__sample_name
    patt = "(%s)\[(\d+)\]" % "|".join((g[0] for g in gt_cols_types))

    def subfn(x):
        """Turn gt_types[1] into gt_types__sample"""
        field, idx = x.groups()
        return "%s__%s" % (field, fix_sample_name(samples[int(idx)]))

    query = re.sub(patt, subfn, query)
    if os.environ.get('GEMINI_DEBUG') == 'TRUE':
        print >>sys.stderr, query[:250] + "..."
    carrays = load(db, query=query)

    if len(carrays) == 0 or max(len(carrays[c]) for c in carrays) == 0 or \
       any(not any(carrays[c]) for c in carrays):
        # need this 2nd check above because of the place-holders in load()
        raise NoGTIndexException

    # loop through and create a cache of "$gt__$sample"
    for gt_col in carrays:
        if not gt_col in query: continue
        for i, sample_array in enumerate(carrays[gt_col]):
            sample = fix_sample_name(samples[i])
            if not sample in query: continue
            user_dict["%s__%s" % (gt_col, sample)] = sample_array

    # had to special-case count. it won't quite be as efficient
    if "|count|" in query:
        tokens = query[2:-2].split("|count|")
        icmp = tokens[-1]
        # a list of carrays, so not in memory.
        res = [bcolz.eval(tok, user_dict=user_dict) for tok in tokens[:-1]]
        # in memory after this, but just a single axis array.
        res = np.sum(res, axis=0)
        res = ne.evaluate('res%s' % icmp)
    else:
        res = bcolz.eval(query, user_dict=user_dict)

    try:
        if res.shape[0] == 1 and len(res.shape) > 1:
            res = res[0]
    except AttributeError:
        return []
    variant_ids, = np.where(res)
    #variant_ids = np.array(list(bcolz.eval(query, user_dict=user_dict,
    #    vm="numexpr").wheretrue()))
    # variant ids are 1-based.
    if len(variant_ids) > 0:
        return 1 + variant_ids
    else:
        return []