Python unpack_genotype_blob Examples, compression.unpack_genotype_blob Python Examples

Example #1

0

Show file

File: tool_interactions.py Project: nfarzaneh/gemini

def get_variant_genes(res, args, idx_to_sample):
    """
    Group variant annotations by the samples that carry each variant.

    For every row of ``res`` whose gene is set (``str(r['gene'])`` is not
    "None"), one annotation tuple is appended to the list of each sample
    whose entry in the unpacked ``gt_types`` blob equals HET or HOM_ALT.

    res: iterable of rows supporting ``r[column_name]`` lookups.
    args: not used by this function; kept so the signature is unchanged.
    idx_to_sample: maps a position in the unpacked genotype array to the
        key under which that sample is stored in the result.

    Returns a defaultdict(list) keyed by ``idx_to_sample[idx]``.  Each
    list element is the tuple (gene, variant_id, chrom, start, end, impact,
    biotype, in_dbsnp, clinvar_sig, clinvar_disease_name, aaf_1kg_all,
    aaf_esp_all), with every member converted by ``str()``.
    """
    samples = defaultdict(list)
    for r in res:
        gene = str(r['gene'])
        # rows without a gene can never contribute, so skip them before
        # paying for the genotype decompression.
        if gene == "None":
            continue

        value = (gene,
                 str(r['variant_id']),
                 str(r['chrom']),
                 str(r['start']),
                 str(r['end']),
                 str(r['impact']),
                 str(r['biotype']),
                 str(r['in_dbsnp']),
                 str(r['clinvar_sig']),
                 str(r['clinvar_disease_name']),
                 str(r['aaf_1kg_all']),
                 str(r['aaf_esp_all']))

        # only gt_types is needed here; the gts blob is never consulted.
        gt_types = Z.unpack_genotype_blob(r['gt_types'])
        for idx, gt_type in enumerate(gt_types):
            if gt_type == HET or gt_type == HOM_ALT:
                samples[idx_to_sample[idx]].append(value)
    return samples

Example #2

0

Show file

def get_variant_genes(c, args, idx_to_sample):
    """
    Collect, per sample, the annotated variants that the sample carries.

    A row of ``c`` contributes only when ``str(r['gene'])`` is not "None".
    For such a row, one annotation tuple is appended to the list of every
    sample whose unpacked ``gt_types`` entry equals HET or HOM_ALT.

    c: iterable of rows supporting ``r[column_name]`` lookups.
    args: not used by this function; kept so the signature is unchanged.
    idx_to_sample: maps a position in the unpacked genotype array to the
        key under which that sample is stored in the result.

    Returns a defaultdict(list) keyed by ``idx_to_sample[idx]``; each list
    element is (gene, variant_id, chrom, start, end, impact, biotype,
    in_dbsnp, clinvar_sig, clinvar_disease_name, aaf_1kg_all, aaf_esp_all)
    with every member converted by ``str()``.
    """
    # columns reported after the gene, in output order.
    annotation_cols = ('variant_id', 'chrom', 'start', 'end', 'impact',
                       'biotype', 'in_dbsnp', 'clinvar_sig',
                       'clinvar_disease_name', 'aaf_1kg_all', 'aaf_esp_all')

    samples = defaultdict(list)
    for r in c:
        gene = str(r['gene'])
        if gene == "None":
            # nothing to report for this row; skip the blob decompression.
            continue

        value = (gene,) + tuple([str(r[col]) for col in annotation_cols])

        # the gts blob is not needed to decide who carries the variant.
        for idx, gt_type in enumerate(Z.unpack_genotype_blob(r['gt_types'])):
            if gt_type == HET or gt_type == HOM_ALT:
                samples[idx_to_sample[idx]].append(value)
    return samples

Example #3

0

Show file

File: tool_pathways.py Project: shameer/gemini

def _report_variant_pathways(c, args, idx_to_sample):
    """
    Print one tab-separated line per (variant, carrier sample) for every
    variant whose (gene, transcript) pair has at least one pathway.

    The three mappings returned by ``get_pathways(args)`` are consulted in
    order (agn, hgnc, ensembl); the first one containing the pair supplies
    the pathways via ``_get_pathways(..., allow_none=False)``.  A sample is
    a carrier when its unpacked ``gt_types`` entry equals HET or HOM_ALT.

    c: iterable of rows supporting ``r[column_name]`` lookups.
    args: passed through to ``get_pathways``.
    idx_to_sample: maps a genotype-array position to the printed sample.
    """
    agn_paths, hgnc_paths, ensembl_paths = get_pathways(args)
    lookup_order = (agn_paths, hgnc_paths, ensembl_paths)

    for r in c:
        gt_types = Z.unpack_genotype_blob(r['gt_types'])
        gts = Z.unpack_genotype_blob(r['gts'])
        gene = str(r['gene'])
        trans = str(r['transcript'])
        pair = (gene, trans)

        # first mapping that knows this gene/transcript wins.
        pathways = []
        for source in lookup_order:
            if pair in source:
                pathways = _get_pathways(gene,
                                         trans,
                                         source[pair],
                                         allow_none=False)
                break
        pathlist = ",".join(pathways)

        # without pathways there is nothing to report for this variant.
        if len(pathways) == 0:
            continue

        for idx, gt_type in enumerate(gt_types):
            if not (gt_type == HET or gt_type == HOM_ALT):
                continue
            columns = [r['chrom'], str(r['start']), str(r['end']),
                       r['ref'], r['alt'], r['impact'],
                       idx_to_sample[idx], gts[idx], gene, trans,
                       pathlist]
            print("\t".join(columns))

Example #4

0

Show file

File: tool_pathways.py Project: nfarzaneh/gemini

def _report_variant_pathways(res, args, idx_to_sample):
    """
    Report, for each variant that maps to at least one pathway, every
    sample carrying it (unpacked ``gt_types`` entry equal to HET or
    HOM_ALT) as a tab-separated line on standard output.

    Pathways are looked up by (gene, transcript) in the agn, hgnc and
    ensembl mappings returned by ``get_pathways(args)``, in that order;
    only the first mapping that contains the pair is used.

    res: iterable of rows supporting ``r[column_name]`` lookups.
    args: passed through to ``get_pathways``.
    idx_to_sample: maps a genotype-array position to the printed sample.
    """
    (agn_paths, hgnc_paths, ensembl_paths) = get_pathways(args)

    for r in res:
        gt_types = Z.unpack_genotype_blob(r['gt_types'])
        gts = Z.unpack_genotype_blob(r['gts'])
        gene = str(r['gene'])
        trans = str(r['transcript'])

        # pick the first annotation source that has this gene/transcript.
        matching = next((paths
                         for paths in (agn_paths, hgnc_paths, ensembl_paths)
                         if (gene, trans) in paths), None)
        if matching is None:
            pathways = []
        else:
            pathways = _get_pathways(gene, trans, matching[(gene, trans)],
                                     allow_none=False)
        pathlist = ",".join(pathways)

        for idx, gt_type in enumerate(gt_types):
            is_carrier = (gt_type == HET or gt_type == HOM_ALT)
            if is_carrier and len(pathways) > 0:
                print("\t".join([r['chrom'], str(r['start']), str(r['end']),
                                 r['ref'], r['alt'], r['impact'],
                                 idx_to_sample[idx], gts[idx], gene, trans,
                                 pathlist]))

Example #5

0

Show file

def get_ind_lof(c, args):
    """
    Print, one line per carrier sample, every loss-of-function SNP
    together with where the change falls within its transcript.

    The query joins ``variants`` with ``variant_impacts`` and keeps rows
    with ``is_lof='1'`` and ``type = 'snp'``.  A header line is printed
    first.  For each row the position of the change is taken from
    ``aa_change`` (digits enclosed by non-space characters) or, failing
    that, from the part of ``aa_length`` before a "/".  The percentage is
    that position divided by the transcript length.  When either value
    cannot be determined the literal "None" is printed instead of raising.

    c: database cursor; it is executed and then iterated.
    args: not used by this function; kept so the signature is unchanged.
    """
    idx_to_sample = util.map_indices_to_samples(c)

    query = "SELECT v.chrom, v.start, v.end, v.ref, v.alt, \
                             v.impact, v.aa_change, v.aa_length, \
                             v.gt_types, v.gts, i.gene, \
                             i.transcript,  i.biotype\
             FROM variants v, variant_impacts i \
             WHERE v.variant_id = i.variant_id \
             AND i.is_lof='1' \
             AND v.type = 'snp'"

    c.execute(query)

    # header
    print('\t'.join([
        'chrom', 'start', 'end', 'ref', 'alt', 'highest_impact', 'aa_change',
        'var_trans_pos', 'trans_aa_length', 'var_trans_pct', 'sample',
        'genotype', 'gene', 'transcript', 'trans_type'
    ]))

    for r in c:
        gt_types = Z.unpack_genotype_blob(r['gt_types'])
        gts = Z.unpack_genotype_blob(r['gts'])
        gene = str(r['gene'])
        trans = str(r['transcript'])

        aa_change = str(r['aa_change'])
        aa_length = str(r['aa_length'])

        transcript_pos = None
        if aa_change != 'None':
            # transcript_pos for snpEff annotated VCF
            embedded = re.findall(r'\S(\d+)\S', aa_change)
            if embedded:
                transcript_pos = embedded[0]
            # transcript_pos for VEP annotated VCF
            elif aa_length != 'None' and aa_length.split("/")[0] != "-":
                transcript_pos = aa_length.split("/")[0]

        # the percentage is only defined when a position was found; without
        # this guard float(None) would abort the whole report.
        transcript_pct = None
        if transcript_pos is not None and aa_length != 'None':
            # snpEff stores the bare length, VEP stores "position/length"
            if "/" in aa_length:
                total_length = aa_length.split("/")[1]
            else:
                total_length = aa_length
            try:
                transcript_pct = float(transcript_pos) / float(total_length)
            except (ValueError, ZeroDivisionError):
                # non-numeric or zero length: leave the percentage unset
                transcript_pct = None

        for idx, gt_type in enumerate(gt_types):
            if gt_type == HET or gt_type == HOM_ALT:
                print("\t".join([
                    r['chrom'],
                    str(r['start']),
                    str(r['end']), r['ref'], r['alt'], r['impact'],
                    r['aa_change'] or 'None', transcript_pos or 'None',
                    r['aa_length'] or 'None',
                    str(transcript_pct), idx_to_sample[idx],
                    gts[idx], gene, trans, r['biotype'] or 'None'
                ]))

Example #6

0

Show file

File: tool_autosomal_dominant.py Project: egafni/gemini

def get_auto_dominant_candidates(c):
    """
    Report candidate variants that meet an autosomal dominant
    inheritance model.

    This is a generator.  For each family returned by
    ``subjects.get_families(c)`` it yields a header list ("family_id", the
    non-genotype query columns, then the family's genotype labels) followed
    by one list of strings per variant that satisfies the family's
    dominant filter.  A family whose filter is None is skipped entirely.

    c: database cursor; it is re-executed for every family.
    """

    families = subjects.get_families(c)

    for family in families:

        query = "SELECT chrom, start, end, ref, alt, gene, \
                        impact, impact_severity, gt_types, gts \
                 FROM variants \
                 WHERE impact_severity != 'LOW'"

        c.execute(query)
        # columns whose name starts with "gt" are reported per sample below
        all_query_cols = [str(desc[0]) for desc in c.description
                          if not desc[0].startswith("gt")]

        family_genotype_mask = family.get_auto_dominant_filter()
        family_sample_gt_columns = family.get_subject_genotype_columns()
        family_sample_gt_labels = family.get_subject_genotype_labels()

        # skip this family if it has no filter to evaluate; eval(None)
        # would otherwise raise a TypeError and end the whole report.
        if family_genotype_mask is None:
            continue

        # yield a header
        header = ["family_id"]
        header.extend(all_query_cols)
        header.extend(family_sample_gt_labels)
        yield header

        # yield the resulting auto_dom variants for this family
        for row in c:

            # unpack the genotype arrays so that we can interrogate
            # the genotypes present in each family member.
            # NOTE: the eval'd expressions below are resolved against this
            # function's local names -- presumably `gt_types` and `gts`;
            # do not rename them.
            gt_types = compression.unpack_genotype_blob(row['gt_types'])
            gts = compression.unpack_genotype_blob(row['gts'])

            # skip if the variant doesn't meet a dominant model
            # for this family
            # NOTE(review): eval() runs code built by the family object;
            # confirm it can never contain untrusted input.
            if not eval(family_genotype_mask):
                continue

            # first report all of the non-genotype columns
            result = [str(family.family_id)]
            for col in all_query_cols:
                result.append(str(row[col]))

            # now report all of the genotype columns.  This stays a plain
            # loop so that eval() sees the local variables above.
            for col in family_sample_gt_columns:
                result.append(str(eval(col)))

            yield result

Example #7

0

Show file

File: tool_lof_sieve.py Project: IMPIMBA/gemini

def get_ind_lof(c, args):
    """
    Report each loss-of-function SNP once per sample that carries it.

    Runs a query joining ``variants`` and ``variant_impacts`` (rows with
    ``is_lof='1'`` and ``type = 'snp'``), prints a header, then prints one
    tab-separated line for every sample whose unpacked ``gt_types`` entry
    equals HET or HOM_ALT.  The position of the change within the
    transcript and its fraction of the transcript length are derived from
    ``aa_change`` / ``aa_length``; whenever they cannot be derived, "None"
    is printed rather than raising.

    c: database cursor; it is executed and then iterated.
    args: not used by this function; kept so the signature is unchanged.
    """

    idx_to_sample = util.map_indices_to_samples(c)

    query = "SELECT v.chrom, v.start, v.end, v.ref, v.alt, \
                             v.impact, v.aa_change, v.aa_length, \
                             v.gt_types, v.gts, i.gene, \
                             i.transcript,  i.biotype\
             FROM variants v, variant_impacts i \
             WHERE v.variant_id = i.variant_id \
             AND i.is_lof='1' \
             AND v.type = 'snp'"

    c.execute(query)

    # header
    print('\t'.join(['chrom', 'start', 'end', 'ref', 'alt',
                     'highest_impact', 'aa_change', 'var_trans_pos',
                     'trans_aa_length', 'var_trans_pct',
                     'sample', 'genotype', 'gene', 'transcript', 'trans_type']))

    for r in c:
        gt_types = Z.unpack_genotype_blob(r['gt_types'])
        gts = Z.unpack_genotype_blob(r['gts'])
        gene = str(r['gene'])
        trans = str(r['transcript'])

        aa_change = str(r['aa_change'])
        aa_length = str(r['aa_length'])
        has_length = aa_length != 'None'

        transcript_pos = None
        if aa_change != 'None':
            # transcript_pos for snpEff annotated VCF
            digits = re.findall(r'\S(\d+)\S', aa_change)
            if digits:
                transcript_pos = digits[0]
            elif has_length:
                # transcript_pos for VEP annotated VCF ("position/length")
                vep_pos = aa_length.split("/")[0]
                if vep_pos != "-":
                    transcript_pos = vep_pos

        # Only compute the percentage when a position is known; calling
        # float() on a missing position used to raise a TypeError.
        transcript_pct = None
        if has_length and transcript_pos is not None:
            # snpEff: bare length.  VEP: length follows the "/".
            denominator = aa_length.split("/")[1] if "/" in aa_length \
                else aa_length
            try:
                transcript_pct = float(transcript_pos) / float(denominator)
            except (ValueError, ZeroDivisionError):
                # length is zero or not a number: report "None"
                transcript_pct = None

        for idx, gt_type in enumerate(gt_types):
            if gt_type == HET or gt_type == HOM_ALT:
                print("\t".join([r['chrom'], str(r['start']),
                                 str(r['end']), r['ref'], r['alt'],
                                 r['impact'],
                                 r['aa_change'] or 'None',
                                 transcript_pos or 'None',
                                 r['aa_length'] or 'None',
                                 str(transcript_pct),
                                 idx_to_sample[idx],
                                 gts[idx], gene, trans, r['biotype'] or 'None']))

Example #8

0

Show file

File: tool_interactions.py Project: nfarzaneh/gemini

def get_lof_genes(res, args, idx_to_sample):
    """
    Map each sample to the genes of the rows it carries.

    For every row of ``res`` whose gene is set (``str(r['gene'])`` is not
    "None"), the gene is appended to the list of each sample whose
    unpacked ``gt_types`` entry equals HET or HOM_ALT.  A gene appears once
    per qualifying row, so duplicates are possible.

    res: iterable of rows supporting ``r[column_name]`` lookups.
    args: not used by this function; kept so the signature is unchanged.
    idx_to_sample: maps a genotype-array position to the result key.

    Returns a defaultdict(list) keyed by ``idx_to_sample[idx]``.
    """
    lof = defaultdict(list)
    for r in res:
        gene = str(r['gene'])
        # a row without a gene contributes nothing; skip it before
        # decompressing its genotype blob.
        if gene == "None":
            continue

        # the gts blob is not needed to decide who carries the variant.
        gt_types = Z.unpack_genotype_blob(r['gt_types'])
        for idx, gt_type in enumerate(gt_types):
            if gt_type == HET or gt_type == HOM_ALT:
                lof[idx_to_sample[idx]].append(gene)
    return lof

Example #9

0

Show file

def get_lof_genes(c, args, idx_to_sample):
    """
    Collect, per sample, the genes of the rows that the sample carries.

    A row of ``c`` is considered only when ``str(r['gene'])`` is not
    "None".  Its gene is then appended to the list of every sample whose
    unpacked ``gt_types`` entry equals HET or HOM_ALT (one append per
    qualifying row, so a gene may repeat).

    c: iterable of rows supporting ``r[column_name]`` lookups.
    args: not used by this function; kept so the signature is unchanged.
    idx_to_sample: maps a genotype-array position to the result key.

    Returns a defaultdict(list) keyed by ``idx_to_sample[idx]``.
    """
    lof = defaultdict(list)
    for r in c:
        gene = str(r['gene'])
        if gene != "None":
            # only gt_types is required; gts was decompressed but unused.
            for idx, gt_type in enumerate(
                    Z.unpack_genotype_blob(r['gt_types'])):
                if gt_type == HET or gt_type == HOM_ALT:
                    lof[idx_to_sample[idx]].append(gene)
    return lof

Example #10

0

Show file

File: GeminiQuery.py Project: jgoecks/gemini

    def __getitem__(self, key):
        """
        Return the value stored under ``key`` for this row.

        * The five ``*_samples`` keys are answered from ``genotype_dict``,
          which is filled on first use by grouping ``self['gt_types']``.
        * Anything already in ``self.cache`` is returned from there.
        * ``'info'`` and the columns listed in ``self.query.gt_cols`` are
          unpacked from the raw row once and cached.
        * Every other key is read straight from ``self.row``.
        """
        # we cache what we can.
        if key in ('het_samples', 'hom_alt_samples', 'unknown_samples',
                   'variant_samples', 'hom_ref_samples'):
            if self.genotype_dict == {}:
                self.genotype_dict = self.query._group_samples_by_genotype(self['gt_types'])
            by_genotype = self.genotype_dict
            if key == 'variant_samples':
                # carriers are the heterozygous plus the hom-alt samples
                return by_genotype[HET] + by_genotype[HOM_ALT]
            genotype_for_key = {
                'het_samples': HET,
                'hom_alt_samples': HOM_ALT,
                'hom_ref_samples': HOM_REF,
                'unknown_samples': UNKNOWN,
            }
            return by_genotype[genotype_for_key[key]]

        if key in self.cache:
            return self.cache[key]

        # from here on the key is known not to be cached yet.
        if key == 'info':
            unpacked_info = compression.unpack_ordereddict_blob(self.row['info'])
            self.cache['info'] = unpacked_info
            return self.cache['info']

        if key in self.query.gt_cols:
            self.cache[key] = compression.unpack_genotype_blob(self.row[key])
            return self.cache[key]

        # plain column: no unpacking, no caching.
        return self.row[key]

Example #11

0

Show file

File: gemini_dump.py Project: shameer/gemini

def get_genotypes(c, args):
    """For each variant, report each sample's genotype
       on a separate line.

       Every output line holds the variant's selected columns (all but the
       trailing v.gts blob), the sample and that sample's genotype, all
       joined with ``args.separator``.  When ``args.use_header`` is set, a
       header line with the column names is printed first.

       c: database cursor; it is executed and then iterated.
       args: must provide ``separator`` and ``use_header``.
    """
    idx_to_sample = util.map_indices_to_samples(c)

    query = "SELECT  v.chrom, v.start, v.end, \
                     v.ref, v.alt, \
                     v.type, v.sub_type, \
                     v.aaf, v.in_dbsnp, v.gene, \
                     v.gts \
             FROM    variants v \
             ORDER BY chrom, start"

    c.execute(query)

    # names of the columns that are NOT gt_* columns; these are the
    # columns reported.  The accompanying index list is not needed here.
    (col_names, _) = \
        util.get_col_names_and_indices(c.description, ignore_gt_cols=True)
    col_names.append('sample')
    col_names.append('genotype')

    if args.use_header:
        print(args.separator.join(col_names))
    for row in c:
        gts = Z.unpack_genotype_blob(row['gts'])
        # len(row) - 1 to avoid printing v.gts; the variant columns are the
        # same for every sample, so build them once per row.
        variant_fields = [str(row[i]) for i in range(len(row) - 1)]
        for idx, gt in enumerate(gts):
            # join everything with the requested separator: the variant
            # columns used to be separated from the sample columns by the
            # print statement's implicit space instead.
            print(args.separator.join(
                variant_fields + [idx_to_sample[idx], gt]))

Example #12

0

Show file

File: gemini_dump.py Project: nfarzaneh/gemini

def get_genotypes(conn, metadata, args):
    """For each variant, report each sample's genotype
       on a separate line.

       Every output line holds the variant's selected columns (all but the
       trailing v.gts blob), the sample and that sample's genotype, all
       joined with ``args.separator``.  When ``args.use_header`` is set, a
       header line with the column names is printed first.

       conn: connection used to execute the query.
       metadata: passed to ``util.map_indices_to_samples`` and used for
           its ``tables["variants"]`` entry.
       args: must provide ``separator`` and ``use_header``.
    """
    idx_to_sample = util.map_indices_to_samples(metadata)

    query = "SELECT  v.chrom, v.start, v.end, \
                     v.ref, v.alt, \
                     v.type, v.sub_type, \
                     v.aaf, v.in_dbsnp, v.gene, \
                     v.gts \
             FROM    variants v \
             ORDER BY chrom, start"
    res = conn.execute(sql.text(query))

    # names of the columns that are NOT gt_* columns; these are the
    # columns reported.  The accompanying index list is not needed here.
    (col_names, _) = \
        util.get_col_names_and_indices(metadata.tables["variants"], ignore_gt_cols=True)
    col_names.append('sample')
    col_names.append('genotype')

    if args.use_header:
        print(args.separator.join(col_names))
    for row in res:
        gts = Z.unpack_genotype_blob(row['gts'])
        # len(row) - 1 to avoid printing v.gts.  These columns do not
        # depend on the sample, so they are formatted once per row rather
        # than once per sample.
        variant_fields = [str(row[i]) for i in range(len(row) - 1)]
        for idx, gt in enumerate(gts):
            print(args.separator.join(
                variant_fields + [idx_to_sample[idx], gt]))

Example #13

0

Show file

File: gemini_query.py Project: angelinasusan/gemini

def apply_query_w_genotype_select(c, query, use_header):
    """
    Execute a query that contains gt* columns only in the SELECT.

    This is a generator.  When ``use_header`` is true it first yields an
    OrderedDict that maps every reported column name to itself; it then
    yields one OrderedDict of column -> value per result row.

    c: database cursor; it is executed and then iterated.
    query: the SQL text; it is lower-cased and passed through
        ``add_gt_cols_to_query`` before execution.
    use_header: whether to yield the header mapping first.
    """
    # construct a mapping of sample names to list indices
    sample_to_idx = util.map_samples_to_indicies(c)

    # gt_col_map is returned by the helper but not used in this function.
    (select_cols, all_cols_new, all_cols_orig, gt_col_map) = _split_select(query, sample_to_idx)

    query = add_gt_cols_to_query(query.lower())
    c.execute(query)

    # what are the columns that were actually selected by the user.
    # (every returned column whose name does not start with "gt")
    all_query_cols = [str(tuple[0]) for tuple in c.description if not tuple[0].startswith("gt")]

    # replace a "*" placeholder with the concrete column names.
    if "*" in select_cols:
        select_cols.remove("*")
        # all_cols_orig.remove("*")
        all_cols_new.remove("*")
        select_cols += all_query_cols

    if use_header:
        # header: query columns, then the originally requested columns
        # that are not already among the selected ones.
        h = [col for col in all_query_cols] + [col for col in oset(all_cols_orig) - oset(select_cols)]
        yield OrderedDict(itertools.izip(h, h))

    report_cols = all_query_cols + list(oset(all_cols_new) - oset(select_cols))
    for row in c:
        # NOTE: these locals look unused but are referenced by name from
        # the eval() below (presumably expressions such as gts[<index>]);
        # do not rename or remove them.
        gts = compression.unpack_genotype_blob(row["gts"])
        gt_types = compression.unpack_genotype_blob(row["gt_types"])
        gt_phases = compression.unpack_genotype_blob(row["gt_phases"])
        gt_depths = compression.unpack_genotype_blob(row["gt_depths"])

        fields = OrderedDict()
        for idx, col in enumerate(report_cols):
            if col == "*":
                continue
            if not col.startswith("gt") and not col.startswith("GT"):
                # ordinary column: copy straight from the row
                fields[col] = row[col]
            else:
                # genotype column: evaluate the column expression against
                # the unpacked arrays above.
                # NOTE(review): eval() on text derived from the query --
                # confirm the query can never come from an untrusted source.
                fields[col] = eval(col.strip())
        yield fields

Example #14

0

Show file

File: tool_interactions.py Project: jsh58/gemini

def get_variant_genes(c, args, idx_to_sample):
    """
    Build a per-sample list of the annotated variants each sample carries.

    Rows of ``c`` whose ``str(r["gene"])`` is "None" are ignored.  For any
    other row, one annotation tuple is appended to the list of every
    sample whose unpacked ``gt_types`` entry equals HET or HOM_ALT.

    c: iterable of rows supporting ``r[column_name]`` lookups.
    args: not used by this function; kept so the signature is unchanged.
    idx_to_sample: maps a position in the unpacked genotype array to the
        key under which that sample is stored in the result.

    Returns a defaultdict(list) keyed by ``idx_to_sample[idx]``; each list
    element is (gene, variant_id, chrom, start, end, impact, biotype,
    in_dbsnp, clinvar_sig, clinvar_disease_name, aaf_1kg_all, aaf_esp_all)
    with every member converted by ``str()``.
    """
    samples = defaultdict(list)
    for r in c:
        gene = str(r["gene"])
        if gene == "None":
            # no gene, nothing to record: avoid unpacking the genotypes
            continue

        value = (
            gene,
            str(r["variant_id"]),
            str(r["chrom"]),
            str(r["start"]),
            str(r["end"]),
            str(r["impact"]),
            str(r["biotype"]),
            str(r["in_dbsnp"]),
            str(r["clinvar_sig"]),
            str(r["clinvar_disease_name"]),
            str(r["aaf_1kg_all"]),
            str(r["aaf_esp_all"]),
        )

        # gt_types alone decides who carries the variant; the gts blob
        # is not needed.
        carriers = [
            idx
            for idx, gt_type in enumerate(
                Z.unpack_genotype_blob(r["gt_types"]))
            if gt_type == HET or gt_type == HOM_ALT
        ]
        for idx in carriers:
            samples[idx_to_sample[idx]].append(value)
    return samples

Example #15

0

Show file

File: gemini_stats.py Project: nfarzaneh/gemini

def get_mds(conn, metadata, args):
    """
    Compute the pairwise genetic distance between each sample.

    The distance for a pair is the sum of squared genotype-type
    differences over the SNPs where neither sample is UNKNOWN, divided by
    the number of such SNPs.  A pair with no such SNP is reported as
    ``nan`` instead of raising ZeroDivisionError.  Every ordered pair
    (including a sample with itself) is printed as a tab-separated line
    after a header line.

    conn: connection used to run both queries.
    metadata, args: not used by this function; kept so the signature is
        unchanged.
    """
    # sample_id is 1-based in the table; array positions are 0-based.
    idx_to_sample = {}
    res = conn.execute(sql.text("select sample_id, name from samples"))
    for row in res:
        idx_to_sample[int(row['sample_id']) - 1] = row['name']

    query = "SELECT DISTINCT v.variant_id, v.gt_types\
    FROM variants v\
    WHERE v.type = 'snp'"
    res = conn.execute(query)

    # keep a list of numeric genotype values
    # for each sample
    genotypes = collections.defaultdict(list)
    for row in res:
        gt_types = Z.unpack_genotype_blob(row['gt_types'])
        for idx, gt_type in enumerate(gt_types):
            genotypes[idx_to_sample[idx]].append(gt_type)

    # convert the genotype list for each sample to a numpy array for
    # vectorized arithmetic.  known[sample] is a boolean array that is
    # True where the genotype is known and False where it is UNKNOWN.
    # Separate dicts keyed by the sample itself are used so the dict being
    # iterated is never indexed with a converted (possibly new) key.
    calls = {}
    known = {}
    for sample, values in genotypes.items():
        calls[sample] = np.array(values)
        known[sample] = calls[sample] != UNKNOWN

    # for each sample1/sample2 combination, AND the two boolean arrays to
    # find the variants where __both__ samples were called, then average
    # the squared genotype differences over those variants.
    mds = collections.defaultdict(float)
    for sample1 in calls:
        for sample2 in calls:
            both_mask = known[sample1] & known[sample2]
            num_shared = float(np.sum(both_mask))

            if num_shared == 0:
                # no variant is called in both samples: distance undefined
                mds[(sample1, sample2)] = float('nan')
                continue

            diffs = (calls[sample1] - calls[sample2])[both_mask]
            mds[(sample1, sample2)] = \
                float(np.sum(np.square(diffs))) / num_shared

    # report the pairwise MDS for each sample pair.
    print("sample1\tsample2\tdistance")
    for pair in mds:
        print("\t".join([str(pair[0]), str(pair[1]), str(round(mds[pair], 4))]))

Example #16

0

Show file

File: GeminiQuery.py Project: chenyu600/gemini

    def next(self):
        """
        Return the GeminiRow object for the next query result.

        Rows that fail ``self.gt_filter`` are skipped.  When
        ``self.for_browser`` is set, the plain ``fields`` OrderedDict is
        returned instead of a GeminiRow.  Raises StopIteration when the
        cursor is exhausted -- and, because of the broad ``except`` at the
        bottom, also when any other exception occurs while building a row.
        """
        # we use a while loop since we may skip records based upon
        # genotype filters.  if we need to skip a record, we just
        # throw a continue and keep trying. the alternative is to just
        # recursively call self.next() if we need to skip, but this
        # can quickly exceed the stack.
        while (1):
            try:
                row = self.c.next()

                if self._query_needs_genotype_info():
                    # NOTE: the names of these locals matter -- the eval()
                    # calls below resolve the filter / column expressions
                    # against them.  Do not rename.
                    gts = compression.unpack_genotype_blob(row['gts'])
                    gt_types = \
                        compression.unpack_genotype_blob(row['gt_types'])
                    gt_phases = \
                        compression.unpack_genotype_blob(row['gt_phases'])
                    gt_depths = \
                        compression.unpack_genotype_blob(row['gt_depths'])
                    gt_ref_depths = \
                        compression.unpack_genotype_blob(row['gt_ref_depths'])
                    gt_alt_depths = \
                        compression.unpack_genotype_blob(row['gt_alt_depths'])
                    gt_quals = \
                        compression.unpack_genotype_blob(row['gt_quals'])

                    # skip the record if it does not meet the user's genotype filter
                    # NOTE(review): eval() of a user-supplied filter string;
                    # confirm it is validated before it reaches this point.
                    if self.gt_filter and not eval(self.gt_filter):
                        continue

                # column name -> value, in report order
                fields = OrderedDict()

                for idx, col in enumerate(self.report_cols):
                    if col == "*":
                        continue
                    if not col.startswith("gt") and not col.startswith("GT"):
                        # ordinary column: copy straight from the row
                        fields[col] = row[col]
                    else:
                        # reuse the original column name the user requested
                        # e.g. replace gts[1085] with gts.NA20814
                        if '[' in col:
                            orig_col = self.gt_idx_to_name_map[col]
                            fields[orig_col] = eval(col.strip())
                        else:
                            # asked for a whole genotype array, e.g. "gts"
                            # or "gt_types": report it comma-separated.
                            if col == "gts":
                                fields[col] = ','.join(gts)
                            elif col == "gt_types":
                                fields[col] = \
                                    ','.join(str(t) for t in gt_types)
                            elif col == "gt_phases":
                                fields[col] = \
                                    ','.join(str(p) for p in gt_phases)
                            elif col == "gt_depths":
                                fields[col] = \
                                    ','.join(str(d) for d in gt_depths)
                            elif col == "gt_quals":
                                fields[col] = \
                                    ','.join(str(d) for d in gt_quals)
                            elif col == "gt_ref_depths":
                                fields[col] = \
                                    ','.join(str(d) for d in gt_ref_depths)
                            elif col == "gt_alt_depths":
                                fields[col] = \
                                    ','.join(str(d) for d in gt_alt_depths)

                if self.show_variant_samples:
                    # gt_types is unpacked again here because the block
                    # above only runs when genotype info is needed.
                    gt_types = compression.unpack_genotype_blob(row['gt_types'])
                    # indices of samples that are HET or HOM_ALT
                    variant_samples = [x for x, y in enumerate(gt_types) if y == HET or
                                       y == HOM_ALT]
                    variant_names = [self.idx_to_sample[x] for x in variant_samples]
                    fields["variant_samples"] = ",".join(variant_names)
                    het_samples = [x for x, y in enumerate(gt_types) if y == HET]
                    het_names = [self.idx_to_sample[x] for x in het_samples]
                    fields["HET_samples"] = ",".join(het_names)
                    hom_alt_samples = [x for x, y in enumerate(gt_types) if y == HOM_ALT]
                    hom_alt_names = [self.idx_to_sample[x] for x in hom_alt_samples]
                    fields["HOM_ALT_samples"] = ",".join(hom_alt_names)

                if self._query_needs_genotype_info():
                    if not self.for_browser:
                        # hand the unpacked arrays to the row object too
                        return GeminiRow(fields,
                                         gts, gt_types, gt_phases,
                                         gt_depths, gt_ref_depths,
                                         gt_alt_depths, gt_quals)
                    else:
                        return fields
                else:
                    if not self.for_browser:
                        return GeminiRow(fields)
                    else:
                        return fields
            # NOTE(review): this catches every exception raised above (bad
            # column names, errors inside eval, ...) and turns it into a
            # silent end of iteration.  Presumably only the cursor's own
            # exhaustion was meant to be handled -- confirm and narrow.
            except Exception as e:
                raise StopIteration

Example #17

0

Show file

File: tool_compound_hets.py Project: kunalbhutani/gemini

def get_compound_hets(c, args):
    """
    Report candidate compound heterozygous mutations.

    Pass 1 walks every variant whose impact severity is not 'LOW' and
    records, per sample and gene, the sites at which the sample's genotype
    type equals HET.  Pass 2 prints, per sample and gene, every ordered
    pair of distinct recorded sites; unless ``args.ignore_phasing`` is set,
    a pair is printed only when the alternate allele sits at a different
    position of the "|"-split genotype at the two sites.

    c: database cursor; it is executed and then iterated.
    args: must provide ``allow_other_hets``, ``only_lof`` and
        ``ignore_phasing``.
    """
    # position in the genotype arrays -> sample name
    # e.g. 0 == 109400005
    #     37 == 147800025
    idx_to_sample = util.map_indicies_to_samples(c)

    # sample -> gene -> list of candidate heterozygous sites
    comp_hets = collections.defaultdict(lambda: collections.defaultdict(list))

    query = "SELECT * FROM variants \
             WHERE impact_severity != 'LOW'"  # is_exonic - what about splice?
    c.execute(query)

    # pass 1: gather the candidate heterozygotes for all genes and
    # samples.  the candidates are narrowed down in pass 2.
    for row in c:
        gt_types = compression.unpack_genotype_blob(row['gt_types'])
        gt_phases = compression.unpack_genotype_blob(row['gt_phases'])
        gt_bases = compression.unpack_genotype_blob(row['gts'])

        site = Site(row)

        # drop sites the user asked us to ignore
        if (site.num_hets > 1 and not args.allow_other_hets) or \
                (not site.is_lof and args.only_lof):
            continue

        for idx, gt_type in enumerate(gt_types):
            # only heterozygous samples are of interest at this site
            if gt_type != HET:
                continue

            sample = idx_to_sample[idx]
            # each sample gets its own copy so that its phase and
            # genotype can be attached to it
            sample_site = copy(site)
            sample_site.phased = gt_phases[idx]

            # unphased genotypes only count when phasing is ignored
            if not (sample_site.phased or args.ignore_phasing):
                continue

            sample_site.gt = gt_bases[idx]
            comp_hets[sample][site.gene].append(sample_site)

    # header
    print("sample\tgene\thet1\thet2")

    # pass 2: keep the het pairs whose alternate alleles were inherited
    # on opposite haplotypes (or every pair when phasing is ignored).
    for sample in comp_hets:
        for gene in comp_hets[sample]:
            candidates = comp_hets[sample][gene]
            for site1 in candidates:
                for site2 in candidates:
                    if site1 == site2:
                        continue

                    # expand each genotype into its alleles,
                    # e.g. A|G -> ['A', 'G'].  the alleles are split in
                    # both modes; only the phased comparison uses them.
                    if args.ignore_phasing:
                        # split on phased (|) or unphased (/) genotypes
                        alleles_site1 = re.split('\||/', site1.gt)
                        alleles_site2 = re.split('\||/', site2.gt)
                        # user has asked us to not care about phasing
                        on_opposite_haplotypes = True
                    else:
                        alleles_site1 = site1.gt.split('|')
                        alleles_site2 = site2.gt.split('|')
                        # haplotype on which the alternate allele was seen,
                        # e.g. ALT=G and alleles ['A', 'G'] -> 1; ALT=A -> 0
                        alt_hap_1 = alleles_site1.index(site1.alt)
                        alt_hap_2 = alleles_site2.index(site2.alt)
                        on_opposite_haplotypes = alt_hap_1 != alt_hap_2

                    if on_opposite_haplotypes:
                        print("\t".join([sample,
                                         gene,
                                         str(site1),
                                         str(site2)]))

Example #18

0

Show file

def get_auto_recessive_candidates(c):
    """
    Report candidate variants that meet an autosomal recessive
    inheritance model.

    This is a generator.  For every family returned by
    subjects.get_families(c) whose autosomal recessive filter is not
    None it yields:

      1. a header row: ["family_id"], the non-genotype query columns,
         then the family's genotype labels;
      2. one list of strings per variant that satisfies the filter.

    Families whose filter is None are skipped without running the query.

    c is the cursor used to run the variants query; it is re-executed
    for each tested family.
    """
    # the query does not depend on the family, so build it only once.
    query = "SELECT chrom, start, end, ref, alt, gene, \
                        impact, impact_severity, gt_types, gts \
                 FROM variants \
                 WHERE impact_severity != 'LOW'"

    families = subjects.get_families(c)

    for family in families:

        # skip this family if it cannot meet an autosomal_recessive model.
        # this is checked first so that no query is run for such families.
        family_genotype_mask = family.get_auto_recessive_filter()
        if family_genotype_mask is None:
            continue

        c.execute(query)

        # names of the selected columns, minus the packed genotype blobs
        # (every column whose name starts with "gt").
        all_query_cols = [
            str(desc[0]) for desc in c.description
            if not desc[0].startswith("gt")
        ]

        family_sample_gt_columns = family.get_subject_genotype_columns()
        family_sample_gt_labels = family.get_subject_genotype_labels()

        # yield a header
        yield ["family_id"] + all_query_cols + list(family_sample_gt_labels)

        # yield the resulting auto_rec variants for this family
        for row in c:

            # unpack the genotype arrays so that the family's filter and
            # genotype columns can be evaluated against them.
            # NOTE: the local names gt_types and gts must be kept; the
            # eval()'ed expressions below are resolved in this scope and
            # presumably refer to these arrays by name.
            gt_types = compression.unpack_genotype_blob(row['gt_types'])
            gts = compression.unpack_genotype_blob(row['gts'])

            # skip if the variant doesn't meet a recessive model
            # for this family
            if not eval(family_genotype_mask):
                continue

            # first report the family and all of the non-genotype columns
            result = [str(family.family_id)]
            for name in all_query_cols:
                result.append(str(row[name]))

            # now report all of the genotype columns.  this stays a plain
            # loop: eval() must run directly in the function's scope.
            for col in family_sample_gt_columns:
                result.append(str(eval(col)))

            yield result

Example #19

0

Show file

File: GeminiQuery.py Project: AafreenUCSD/gemini

    def next(self):
        """
        Return the GeminiRow object for the next query result.

        Rows that fail the genotype filter (self.gt_filter) or any of
        self.predicates are skipped.  When self.for_browser is set, the
        OrderedDict of reported fields is returned instead of a GeminiRow.

        Raises StopIteration (after closing self.conn) as soon as the
        cursor fails to supply another row.
        """
        # we use a while loop since we may skip records based upon
        # genotype filters.  if we need to skip a record, we just
        # throw a continue and keep trying. the alternative is to just
        # recursively call self.next() if we need to skip, but this
        # can quickly exceed the stack.

        while True:
            try:
                row = self.c.next()
            except Exception:
                # any failure to fetch is treated as the end of the results
                self.conn.close()
                raise StopIteration

            # per-row defaults, used when genotype info is not unpacked.
            # NOTE: these local names must not be renamed; the eval()
            # calls below resolve user-requested expressions against them.
            gts = None
            gt_types = None
            gt_phases = None
            gt_depths = None
            gt_ref_depths = None
            gt_alt_depths = None
            gt_quals = None
            variant_names = []
            het_names = []
            hom_alt_names = []
            hom_ref_names = []
            unknown_names = []
            # initialised like the other per-row values so that
            # show_families cannot hit an unbound local.
            families = []
            info = None

            if 'info' in self.report_cols:
                info = compression.unpack_ordereddict_blob(row['info'])

            if self._query_needs_genotype_info():
                gts = compression.unpack_genotype_blob(row['gts'])
                gt_types = \
                    compression.unpack_genotype_blob(row['gt_types'])
                gt_phases = \
                    compression.unpack_genotype_blob(row['gt_phases'])
                gt_depths = \
                    compression.unpack_genotype_blob(row['gt_depths'])
                gt_ref_depths = \
                    compression.unpack_genotype_blob(row['gt_ref_depths'])
                gt_alt_depths = \
                    compression.unpack_genotype_blob(row['gt_alt_depths'])
                gt_quals = \
                    compression.unpack_genotype_blob(row['gt_quals'])

                # sample indices, and the matching names, per genotype class
                variant_samples = [x for x, y in enumerate(gt_types)
                                   if y == HET or y == HOM_ALT]
                variant_names = [self.idx_to_sample[x] for x in variant_samples]
                het_samples = [x for x, y in enumerate(gt_types) if y == HET]
                het_names = [self.idx_to_sample[x] for x in het_samples]
                hom_alt_samples = [x for x, y in enumerate(gt_types) if y == HOM_ALT]
                hom_alt_names = [self.idx_to_sample[x] for x in hom_alt_samples]
                hom_ref_samples = [x for x, y in enumerate(gt_types) if y == HOM_REF]
                hom_ref_names = [self.idx_to_sample[x] for x in hom_ref_samples]
                unknown_samples = [x for x, y in enumerate(gt_types) if y == UNKNOWN]
                unknown_names = [self.idx_to_sample[x] for x in unknown_samples]

                # distinct family ids of the samples carrying the variant
                families = map(str, list(set([self.idx_to_sample_object[x].family_id
                            for x in variant_samples])))

                # skip the record if it does not meet the user's genotype filter
                # NOTE(review): the filter is eval()'ed with this function's
                # locals as its namespace; it must never come from an
                # untrusted source.
                if self.gt_filter and not eval(self.gt_filter, locals()):
                    continue

            # whole-array genotype columns that are reported as a comma
            # separated list of str()'ed values ("gts" is joined as-is).
            stringified_arrays = {
                "gt_types": gt_types,
                "gt_phases": gt_phases,
                "gt_depths": gt_depths,
                "gt_quals": gt_quals,
                "gt_ref_depths": gt_ref_depths,
                "gt_alt_depths": gt_alt_depths,
            }

            fields = OrderedDict()

            for col in self.report_cols:
                if col == "*":
                    continue
                if col == "info":
                    fields[col] = self._info_dict_to_string(info)
                elif not col.startswith(("gt", "GT")):
                    fields[col] = row[col]
                elif '[' in col:
                    # reuse the original column name user requested
                    # e.g. replace gts[1085] with gts.NA20814
                    orig_col = self.gt_idx_to_name_map[col]
                    val = eval(col.strip())
                    # report plain Python numbers instead of numpy scalars
                    if isinstance(val, (np.int8, np.int32, np.bool_)):
                        fields[orig_col] = int(val)
                    elif isinstance(val, np.float32):
                        fields[orig_col] = float(val)
                    else:
                        fields[orig_col] = val
                elif col == "gts":
                    fields[col] = ','.join(gts)
                elif col in stringified_arrays:
                    fields[col] = \
                        ','.join(str(v) for v in stringified_arrays[col])

            if self.show_variant_samples:
                fields["variant_samples"] = \
                    self.variant_samples_delim.join(variant_names)
                fields["HET_samples"] = \
                    self.variant_samples_delim.join(het_names)
                fields["HOM_ALT_samples"] = \
                    self.variant_samples_delim.join(hom_alt_names)
            if self.show_families:
                fields["families"] = self.variant_samples_delim.join(families)

            gemini_row = GeminiRow(fields, gts, gt_types, gt_phases,
                                   gt_depths, gt_ref_depths, gt_alt_depths,
                                   gt_quals, variant_names, het_names, hom_alt_names,
                                   hom_ref_names, unknown_names, info,
                                   formatter=self.formatter)

            # every registered predicate is called on the row; skip the
            # row unless all of them accept it.
            if not all([predicate(gemini_row) for predicate in self.predicates]):
                continue

            if self.for_browser:
                return fields
            return gemini_row

Example #20

0

Show file

File: gemini_stats.py Project: shameer/gemini

def get_mds(c, args):
    """
    Compute the pairwise genetic distance between each sample.

    The distance reported for a pair of samples is the mean squared
    difference of their numeric genotype codes, taken over the SNP
    variants at which neither sample's genotype is UNKNOWN.  A pair
    with no such variant is reported with a distance of nan.

    One tab-separated line per ordered pair (including each sample
    paired with itself) is printed to stdout after a header line.

    c is the cursor used to run the queries; args is not used here.
    """
    # map the 0-based position within a genotype array to a sample name
    idx_to_sample = {}
    c.execute("select sample_id, name from samples")
    for row in c:
        idx_to_sample[int(row['sample_id']) - 1] = row['name']

    query = "SELECT DISTINCT v.variant_id, v.gt_types\
    FROM variants v\
    WHERE v.type = 'snp'"

    c.execute(query)

    # keep a list of numeric genotype values
    # for each sample
    genotypes = collections.defaultdict(list)
    for row in c:
        gt_types = Z.unpack_genotype_blob(row['gt_types'])

        # position idx in gt_types belongs to sample idx_to_sample[idx]
        for idx, gt_type in enumerate(gt_types):
            genotypes[idx_to_sample[idx]].append(gt_type)

    # convert the genotype list for each sample
    # to a numpy array for performance.
    # called[sample] is a boolean array that is True wherever the
    # sample's genotype is not UNKNOWN.
    # iterate over a snapshot of the keys: the values are replaced
    # while looping.
    called = {}
    for sample in list(genotypes):
        as_array = np.array(genotypes[sample])
        genotypes[sample] = as_array
        called[sample] = as_array != UNKNOWN

    # compute the distance for each s1/s2 combination using numpy's
    # vectorized sum() and square() operations.  AND-ing the two
    # samples' "called" arrays selects the variants at which __both__
    # samples have a known genotype.
    mds = collections.defaultdict(float)
    for sample1 in genotypes:
        for sample2 in genotypes:
            pair = (sample1, sample2)
            # which variants have known genotypes for both samples?
            both_mask = called[sample1] & called[sample2]
            num_shared = int(np.sum(both_mask))

            if num_shared == 0:
                # nothing to compare; avoid dividing by zero.
                mds[pair] = float('nan')
                continue

            genotype1 = genotypes[sample1]
            genotype2 = genotypes[sample2]
            squared_diffs = np.square((genotype1 - genotype2)[both_mask])
            mds[pair] = float(np.sum(squared_diffs)) / float(num_shared)

    # report the pairwise MDS for each sample pair.
    # a single parenthesised argument prints the same text under both
    # the print statement and the print function.
    print("sample1\tsample2\tdistance")
    for pair in mds:
        print("\t".join([str(pair[0]), str(pair[1]), str(round(mds[pair], 4))]))

Example #21

0

Show file

File: gemini_query.py Project: angelinasusan/gemini

def filter_query(c, query, gt_filter, use_header):
    """
    Run a SQL query and yield only the rows that pass a genotype filter.

    This is a generator.  When use_header is true, the first item
    yielded is an OrderedDict mapping every reported column name to
    itself.  Every later item is an OrderedDict of column -> value for
    one row that satisfied gt_filter.

    Example filter:
    --gt_filter "(gt_types.1478PC0011 == 1 or gt_types.1478PC0012 == 1)
    """

    def correct_genotype_filter(gt_filter, sample_to_idx):
        """
        Turn the user's "raw" genotype filter into an expression that
        can be eval()'ed: every token mentioning a genotype column is
        passed through _correct_genotype_col so that a _named_ sample
        is replaced by its _numerical_ index into the genotype arrays.

        For example, converts:
        --gt-filter "(gt_types.1478PC0011 == 1)"
        to
        (gt_types[11] == 1)
        """
        rewritten = []
        # NOTE(review): the character class [\s+] matches a literal '+'
        # as well as whitespace, so '+' characters in the filter are
        # dropped by this split -- confirm that this is intended.
        for token in re.split(r"[\s+]+", gt_filter):
            if "gt" in token or "GT" in token:
                token = _correct_genotype_col(token, sample_to_idx)
            rewritten.append(token)
        return " ".join(rewritten)

    # construct a mapping of sample names to list indices
    sample_to_idx = util.map_samples_to_indicies(c)

    gt_filter = correct_genotype_filter(gt_filter, sample_to_idx)
    (select_cols, all_cols_new, all_cols_orig, gt_col_map) = _split_select(query, sample_to_idx)

    # NOTE(review): lower() is applied to the entire statement, which
    # includes any quoted literals it contains.
    query = add_gt_cols_to_query(query.lower())

    c.execute(query)

    # the columns the query actually returned, minus every column
    # whose name starts with "gt".
    all_query_cols = []
    for desc in c.description:
        if not desc[0].startswith("gt"):
            all_query_cols.append(str(desc[0]))

    # expand a "*" selection into the concrete column names
    if "*" in select_cols:
        for col_list in (select_cols, all_cols_orig, all_cols_new):
            col_list.remove("*")
        select_cols += all_query_cols

    if use_header:
        h = list(all_query_cols) + list(oset(all_cols_orig) - oset(select_cols))
        yield OrderedDict(zip(h, h))

    report_cols = all_query_cols + list(oset(all_cols_new) - oset(select_cols))
    for row in c:
        # NOTE: these four local names must be kept as they are; the
        # eval() calls below resolve the filter and the genotype
        # columns against this function's local scope.
        gts = compression.unpack_genotype_blob(row["gts"])
        gt_types = compression.unpack_genotype_blob(row["gt_types"])
        gt_phases = compression.unpack_genotype_blob(row["gt_phases"])
        gt_depths = compression.unpack_genotype_blob(row["gt_depths"])

        # NOTE(review): gt_filter is derived from user input and is
        # eval()'ed; it must never come from an untrusted source.
        if not eval(gt_filter):
            continue

        fields = OrderedDict()
        for col in report_cols:
            if col == "*":
                continue
            if col.startswith(("gt", "GT")):
                # genotype columns are expressions over the arrays above
                fields[col] = eval(col.strip())
            else:
                fields[col] = row[col]
        yield fields

Example #22

0

Show file

File: tool_de_novo_mutations.py Project: kunalbhutani/gemini

def get_de_novo_candidates(c, min_sample_depth=30):
    """
    Report candidate variants that appear to be de novo mutations in
    the child. We cannot distinguish mutations that occurred in the
    parental germline from those that occurred early in development
    in the child post-conception.

    This is a generator.  For every family returned by
    subjects.get_families(c) whose de novo filter is not None it
    yields:

      1. a header row: ["family_id"], the non-genotype query columns,
         the family's genotype labels, then its depth labels;
      2. one list of strings per variant that satisfies the filter and
         for which every depth column evaluates to at least
         min_sample_depth.

    c is the cursor used to run the variants query; it is re-executed
    for each tested family.
    """
    # the query does not depend on the family, so build it only once.
    query = "SELECT chrom, start, end, ref, alt, gene, \
                        impact, impact_severity, in_dbsnp, \
                        rs_ids, aaf_1kg_all, aaf_esp_all, \
                        clinvar_sig, clinvar_disease_name, \
                        clinvar_dbsource, gt_types, \
                        gt_depths, gts \
                 FROM variants \
                 WHERE impact_severity != 'LOW' \
                 AND num_het = 1"

    families = subjects.get_families(c)

    for family in families:

        # skip this family if no de novo filter is available for it;
        # eval() of a None mask would otherwise raise a TypeError.
        family_genotype_mask = family.get_de_novo_filter()
        if family_genotype_mask is None:
            continue

        c.execute(query)

        # names of the selected columns, minus the packed genotype blobs
        # (every column whose name starts with "gt").
        all_query_cols = [str(desc[0]) for desc in c.description
                          if not desc[0].startswith("gt")]

        family_sample_gt_columns = family.get_subject_genotype_columns()
        family_sample_depth_columns = family.get_subject_depth_columns()
        family_sample_gt_labels = family.get_subject_genotype_labels()
        family_sample_dp_labels = family.get_subject_depth_labels()

        # yield a header
        yield (["family_id"] + all_query_cols +
               list(family_sample_gt_labels) +
               list(family_sample_dp_labels))

        # report the resulting de_novo variants for this family
        for row in c:

            # unpack the genotype arrays so that the family's filter and
            # genotype/depth columns can be evaluated against them.
            # NOTE: the local names gt_types, gt_depths and gts must be
            # kept; the eval()'ed expressions below are resolved in this
            # scope and presumably refer to these arrays by name.
            gt_types = compression.unpack_genotype_blob(row['gt_types'])
            gt_depths = compression.unpack_genotype_blob(row['gt_depths'])
            gts = compression.unpack_genotype_blob(row['gts'])

            # does the variant meet a de novo model for this family?
            # if not, ignore.
            if not eval(family_genotype_mask):
                continue

            # make sure each sample's genotype had sufficient coverage.
            # otherwise, ignore.  this stays a plain loop: eval() must
            # run directly in the function's scope.
            insufficient_depth = False
            for col in family_sample_depth_columns:
                if int(eval(col)) < min_sample_depth:
                    insufficient_depth = True
                    break
            if insufficient_depth:
                continue

            # first report the family and all of the non-genotype columns
            result = [str(family.family_id)]
            for name in all_query_cols:
                result.append(str(row[name]))

            # now report all of the genotype columns
            for col in family_sample_gt_columns:
                result.append(str(eval(col)))

            # now report all of the depth columns
            for col in family_sample_depth_columns:
                result.append(str(eval(col)))

            yield result

Example #23

0

Show file

File: GeminiQuery.py Project: yyxql/Genomics_Docker

    def next(self):
        """
        Return the GeminiRow object for the next query result.

        Rows that fail the genotype filter (self.gt_filter) or any of
        self.predicates are skipped.  When self.for_browser is set, the
        OrderedDict of reported fields is returned instead of a GeminiRow.

        Raises StopIteration (after closing self.conn) as soon as the
        cursor fails to supply another row.
        """
        # we use a while loop since we may skip records based upon
        # genotype filters.  if we need to skip a record, we just
        # throw a continue and keep trying. the alternative is to just
        # recursively call self.next() if we need to skip, but this
        # can quickly exceed the stack.

        while True:
            try:
                row = self.c.next()
            except Exception:
                # any failure to fetch is treated as the end of the results
                self.conn.close()
                raise StopIteration

            # per-row defaults, used when genotype info is not unpacked.
            # NOTE: these local names must not be renamed; the eval()
            # calls below resolve user-requested expressions against them.
            gts = None
            gt_types = None
            gt_phases = None
            gt_depths = None
            gt_ref_depths = None
            gt_alt_depths = None
            gt_quals = None
            variant_names = []
            het_names = []
            hom_alt_names = []
            hom_ref_names = []
            unknown_names = []
            # initialised like the other per-row values so that
            # show_families cannot hit an unbound local.
            families = []
            info = None

            if 'info' in self.report_cols:
                info = compression.unpack_ordereddict_blob(row['info'])

            if self._query_needs_genotype_info():
                gts = compression.unpack_genotype_blob(row['gts'])
                gt_types = \
                    compression.unpack_genotype_blob(row['gt_types'])
                gt_phases = \
                    compression.unpack_genotype_blob(row['gt_phases'])
                gt_depths = \
                    compression.unpack_genotype_blob(row['gt_depths'])
                gt_ref_depths = \
                    compression.unpack_genotype_blob(row['gt_ref_depths'])
                gt_alt_depths = \
                    compression.unpack_genotype_blob(row['gt_alt_depths'])
                gt_quals = \
                    compression.unpack_genotype_blob(row['gt_quals'])

                # sample indices, and the matching names, per genotype class
                variant_samples = [
                    x for x, y in enumerate(gt_types)
                    if y == HET or y == HOM_ALT
                ]
                variant_names = [
                    self.idx_to_sample[x] for x in variant_samples
                ]
                het_samples = [x for x, y in enumerate(gt_types) if y == HET]
                het_names = [self.idx_to_sample[x] for x in het_samples]
                hom_alt_samples = [
                    x for x, y in enumerate(gt_types) if y == HOM_ALT
                ]
                hom_alt_names = [
                    self.idx_to_sample[x] for x in hom_alt_samples
                ]
                hom_ref_samples = [
                    x for x, y in enumerate(gt_types) if y == HOM_REF
                ]
                hom_ref_names = [
                    self.idx_to_sample[x] for x in hom_ref_samples
                ]
                unknown_samples = [
                    x for x, y in enumerate(gt_types) if y == UNKNOWN
                ]
                unknown_names = [
                    self.idx_to_sample[x] for x in unknown_samples
                ]

                # distinct family ids of the samples carrying the variant
                families = map(
                    str,
                    list(
                        set([
                            self.idx_to_sample_object[x].family_id
                            for x in variant_samples
                        ])))

                # skip the record if it does not meet the user's genotype filter
                # NOTE(review): the filter is eval()'ed with this function's
                # locals as its namespace; it must never come from an
                # untrusted source.
                if self.gt_filter and not eval(self.gt_filter, locals()):
                    continue

            # whole-array genotype columns that are reported as a comma
            # separated list of str()'ed values ("gts" is joined as-is).
            stringified_arrays = {
                "gt_types": gt_types,
                "gt_phases": gt_phases,
                "gt_depths": gt_depths,
                "gt_quals": gt_quals,
                "gt_ref_depths": gt_ref_depths,
                "gt_alt_depths": gt_alt_depths,
            }

            fields = OrderedDict()

            for col in self.report_cols:
                if col == "*":
                    continue
                if col == "info":
                    fields[col] = self._info_dict_to_string(info)
                elif not col.startswith(("gt", "GT")):
                    fields[col] = row[col]
                elif '[' in col:
                    # reuse the original column name user requested
                    # e.g. replace gts[1085] with gts.NA20814
                    orig_col = self.gt_idx_to_name_map[col]
                    val = eval(col.strip())
                    # report plain Python numbers instead of numpy scalars
                    if isinstance(val, (np.int8, np.int32, np.bool_)):
                        fields[orig_col] = int(val)
                    elif isinstance(val, np.float32):
                        fields[orig_col] = float(val)
                    else:
                        fields[orig_col] = val
                elif col == "gts":
                    fields[col] = ','.join(gts)
                elif col in stringified_arrays:
                    fields[col] = \
                        ','.join(str(v) for v in stringified_arrays[col])

            if self.show_variant_samples:
                fields["variant_samples"] = \
                    self.variant_samples_delim.join(variant_names)
                fields["HET_samples"] = \
                    self.variant_samples_delim.join(het_names)
                fields["HOM_ALT_samples"] = \
                    self.variant_samples_delim.join(hom_alt_names)
            if self.show_families:
                fields["families"] = self.variant_samples_delim.join(families)

            gemini_row = GeminiRow(fields,
                                   gts,
                                   gt_types,
                                   gt_phases,
                                   gt_depths,
                                   gt_ref_depths,
                                   gt_alt_depths,
                                   gt_quals,
                                   variant_names,
                                   het_names,
                                   hom_alt_names,
                                   hom_ref_names,
                                   unknown_names,
                                   info,
                                   formatter=self.formatter)

            # every registered predicate is called on the row; skip the
            # row unless all of them accept it.
            if not all(
                [predicate(gemini_row) for predicate in self.predicates]):
                continue

            if self.for_browser:
                return fields
            return gemini_row

Example #24

0

Show file

File: GeminiQuery.py Project: kunalbhutani/gemini

    def next(self):
        """
        Return the GeminiRow object for the next query result.

        Rows that fail the genotype filter (self.gt_filter) are skipped.
        When self.for_browser is set, the OrderedDict of reported fields
        is returned instead of a GeminiRow.

        Raises StopIteration as soon as the cursor fails to supply
        another row.  Errors raised while filtering or formatting a row
        are propagated to the caller rather than being reported as the
        end of the results.
        """
        # we use a while loop since we may skip records based upon
        # genotype filters.  if we need to skip a record, we just
        # throw a continue and keep trying. the alternative is to just
        # recursively call self.next() if we need to skip, but this
        # can quickly exceed the stack.
        while True:
            # only the fetch is guarded: a failure here is treated as the
            # end of the results.  "except Exception" (not a bare except)
            # lets KeyboardInterrupt / SystemExit through.
            try:
                row = self.c.next()
            except Exception:
                raise StopIteration

            # per-row defaults, used when genotype info is not unpacked.
            # NOTE: these local names must not be renamed; the eval()
            # calls below resolve user-requested expressions against them.
            gts = None
            gt_types = None
            gt_phases = None
            gt_depths = None

            needs_genotype_info = self._query_needs_genotype_info()
            if needs_genotype_info:
                gts = compression.unpack_genotype_blob(row['gts'])
                gt_types = \
                    compression.unpack_genotype_blob(row['gt_types'])
                gt_phases = \
                    compression.unpack_genotype_blob(row['gt_phases'])
                gt_depths = \
                    compression.unpack_genotype_blob(row['gt_depths'])

                # skip the record if it does not meet the user's genotype filter
                # NOTE(review): the filter is eval()'ed in this function's
                # scope; it must never come from an untrusted source.
                if self.gt_filter and not eval(self.gt_filter):
                    continue

            # whole-array genotype columns that are reported as a comma
            # separated list of str()'ed values ("gts" is joined as-is).
            stringified_arrays = {
                "gt_types": gt_types,
                "gt_phases": gt_phases,
                "gt_depths": gt_depths,
            }

            fields = OrderedDict()
            for col in self.report_cols:
                if col == "*":
                    continue
                if not col.startswith(("gt", "GT")):
                    fields[col] = row[col]
                elif '[' in col:
                    # reuse the original column name user requested
                    # e.g. replace gts[1085] with gts.NA20814
                    orig_col = self.gt_idx_to_name_map[col]
                    fields[orig_col] = eval(col.strip())
                elif col == "gts":
                    fields[col] = ','.join(gts)
                elif col in stringified_arrays:
                    fields[col] = \
                        ','.join(str(v) for v in stringified_arrays[col])

            if self.for_browser:
                return fields
            if needs_genotype_info:
                return GeminiRow(fields, gts, gt_types, gt_phases,
                                 gt_depths)
            return GeminiRow(fields)