Exemple #1
0
    def _get_individuals(self):
        """Build an Individual for every row of the gemini samples table.

            Runs ``SELECT * from samples`` against ``self.db`` and logs each
            sample that is found.

            Returns:
                individuals (list): List of Individuals

        """
        gq = GeminiQuery(self.db)
        # Dictionary with sample name to index in the gemini database
        sample_to_idx = gq.sample_to_idx

        gq.run("SELECT * from samples")

        individuals = []
        for sample in gq:
            ind_id = sample['name']
            case_id = sample['family_id']
            logger.info("Found individual {0} with family id {1}".format(
                ind_id, case_id))

            individual_obj = Individual(
                ind_id=ind_id,
                case_id=case_id,
                mother=sample['maternal_id'],
                father=sample['paternal_id'],
                sex=sample['sex'],
                phenotype=sample['phenotype'],
                # None when the sample name has no entry in the index mapping
                index=sample_to_idx.get(ind_id),
                variant_source=self.db,
                bam_path=None,
            )
            individuals.append(individual_obj)

        return individuals
Exemple #2
0
    def _get_transcripts(self, gemini_variant):
        """Yield a Transcript for every impact row of a variant

        Selects all rows of the variant_impacts table that share the
        variant's ``variant_id``.

            Args:
                gemini_variant (GeminiQueryRow): The gemini variant

            Yields:
                transcript (puzzle.models.Transcript)

        """
        variant_id = gemini_variant['variant_id']

        impacts = GeminiQuery(self.db)
        impacts.run(
            "SELECT * from variant_impacts WHERE variant_id = {0}".format(
                variant_id))

        for impact in impacts:
            yield Transcript(
                hgnc_symbol=impact['gene'],
                transcript_id=impact['transcript'],
                consequence=impact['impact_so'],
                biotype=impact['biotype'],
                polyphen=impact['polyphen_pred'],
                sift=impact['sift_pred'],
                HGVSc=impact['codon_change'],
                HGVSp=impact['aa_change'],
            )
Exemple #3
0
    def _get_transcripts(self, gemini_variant):
        """Return the transcripts stored for a variant

            Every row of the variant_impacts table with the variant's
            ``variant_id`` is converted to a Transcript.

            Args:
                gemini_variant (GeminiQueryRow): The gemini variant

            Returns:
                transcripts (list): List of affected transcripts

        """
        variant_id = gemini_variant['variant_id']

        impacts = GeminiQuery(self.db)
        impacts.run(
            "SELECT * from variant_impacts WHERE variant_id = {0}".format(
                variant_id))

        return [
            Transcript(
                hgnc_symbol=impact['gene'],
                transcript_id=impact['transcript'],
                consequence=impact['impact_so'],
                biotype=impact['biotype'],
                polyphen=impact['polyphen_pred'],
                sift=impact['sift_pred'],
                HGVSc=impact['codon_change'],
                HGVSp=impact['aa_change'],
            )
            for impact in impacts
        ]
Exemple #4
0
    def _add_transcripts(self, variant_obj, gemini_variant):
        """
        Add all transcripts for a variant

        Every row of the variant_impacts table with the variant's
        ``variant_id`` becomes a Transcript that is handed to
        ``variant_obj.add_transcript``. Nothing is returned.

            Args:
                variant_obj: Object that receives the transcripts through
                             its ``add_transcript`` method
                gemini_variant (GeminiQueryRow): The gemini variant

        """
        variant_id = gemini_variant['variant_id']

        impacts = GeminiQuery(self.db)
        impacts.run(
            "SELECT * from variant_impacts WHERE variant_id = {0}".format(
                variant_id))

        for impact in impacts:
            variant_obj.add_transcript(Transcript(
                hgnc_symbol=impact['gene'],
                transcript_id=impact['transcript'],
                consequence=impact['impact_so'],
                biotype=impact['biotype'],
                polyphen=impact['polyphen_pred'],
                sift=impact['sift_pred'],
                HGVSc=impact['codon_change'],
                # Falsy values are replaced by '' so the join always
                # receives two strings
                HGVSp=', '.join(
                    (impact['aa_change'] or '', impact['aa_length'] or '')),
            ))
def get_query_results(gemini_db, query, gt_filter="", out_format=DefaultRowFormat(None)):
    """
    Run ``query`` on ``gemini_db`` and return the executed GeminiQuery.

    ``gt_filter`` is forwarded to ``run`` and ``out_format`` to the
    GeminiQuery constructor. The returned object is what callers iterate
    to obtain the rows.
    """
    gq = GeminiQuery(gemini_db, out_format=out_format)
    gq.run(query, gt_filter=gt_filter)

    return gq
def run_query(args):
    """Run the query described by ``args`` and print its rows.

    The header is printed first when ``args.use_header`` is set and the
    query produced one. With ``args.dgidb`` the query is executed twice:
    once to collect the genes that are sent to ``query_dgidb`` and once
    more to print every row with its DGIdb information appended after a
    tab.
    """
    predicates = get_row_predicates(args)
    add_required_columns_to_query(args)
    formatter = select_formatter(args)
    genotypes_needed = needs_genotypes(args)
    gene_needed = needs_gene(args)
    try:
        subjects = get_subjects(args)
    except KeyError:
        subjects = []

    # Optional constructor arguments for GeminiQuery
    init_kwargs = {}
    if args.bcolz:
        from . import gemini_bcolz
        init_kwargs['variant_id_getter'] = gemini_bcolz.filter

    def _execute():
        # Build a fresh GeminiQuery and run the user's query on it. Every
        # execution goes through here so that the optional
        # variant_id_getter always reaches the constructor.
        query_obj = GeminiQuery.GeminiQuery(args.db, out_format=formatter,
                                            **init_kwargs)
        query_obj.run(args.query,
                      args.gt_filter,
                      args.show_variant_samples,
                      args.sample_delim,
                      predicates,
                      genotypes_needed,
                      gene_needed,
                      args.show_families,
                      subjects=subjects)
        return query_obj

    gq = _execute()

    if args.use_header and gq.header:
        print(gq.header)

    if not args.dgidb:
        for row in gq:
            print(row)
        return

    # collect a list of all the genes that need to be queried
    # from DGIdb
    genes = defaultdict()
    for row in gq:
        genes[row['gene']] = True

    # collect info from DGIdb
    dgidb_info = query_dgidb(genes)

    # rerun the query (the cursor is now consumed)
    gq = _execute()

    # report the query results with DGIdb info added at the end.
    for row in gq:
        print(str(row) + "\t" + str(dgidb_info[row['gene']]))
Exemple #7
0
def _prep_priority_filter(gemini_db, data):
    """Prepare tabix indexed file with priority based filters and supporting information

    A ``<gemini_db base>-priority.tsv`` table is written from the variants
    table unless that file, or its ``.gz`` counterpart, already exists. The
    result of bgzipping and indexing the table is returned either way.
    """
    from gemini import GeminiQuery

    out_file = "%s-priority.tsv" % utils.splitext_plus(gemini_db)[0]
    already_built = utils.file_exists(out_file) or utils.file_exists(out_file + ".gz")
    if not already_built:
        ref_chroms = {contig.name for contig in ref.file_contigs(dd.get_ref_file(data), data["config"])}
        with file_transaction(data, out_file) as tx_out_file:
            gq = GeminiQuery(gemini_db)
            pops = [
                "aaf_esp_ea", "aaf_esp_aa", "aaf_esp_all",
                "aaf_1kg_amr", "aaf_1kg_eas", "aaf_1kg_sas", "aaf_1kg_afr", "aaf_1kg_eur", "aaf_1kg_all",
                "aaf_adj_exac_all", "aaf_adj_exac_afr", "aaf_adj_exac_amr", "aaf_adj_exac_eas",
                "aaf_adj_exac_fin", "aaf_adj_exac_nfe", "aaf_adj_exac_oth", "aaf_adj_exac_sas",
            ]
            attrs = (
                "chrom, start, end, ref, alt, impact_so, impact_severity, in_dbsnp, "
                "cosmic_ids, clinvar_sig, clinvar_origin, fitcons, gt_ref_depths, gt_alt_depths"
            ).split(", ")
            gq.run("SELECT %s FROM variants" % ", ".join(attrs + pops))
            sidx = gq.sample_to_idx[dd.get_sample_name(data)]

            # Output columns: the five position/allele fields, the computed
            # filter, the remaining attributes minus the two depth columns,
            # the "_all" population frequencies and the computed freq.
            overall_pops = [pop for pop in pops if pop.endswith("_all")]
            header = attrs[:5] + ["filter"] + attrs[5:-2] + overall_pops + ["freq"]

            with open(tx_out_file, "w") as out_handle:
                writer = csv.writer(out_handle, dialect="excel-tab")
                # The header line starts with '#'
                writer.writerow(["#" + header[0]] + header[1:])
                for row in gq:
                    ref_depth = tz.get_in(["gt_ref_depths", sidx], row, 0)
                    alt_depth = tz.get_in(["gt_alt_depths", sidx], row, 0)
                    out_vals = dict(row.row)
                    try:
                        alt_fraction = float(alt_depth) / float(ref_depth + alt_depth)
                    except ZeroDivisionError:
                        out_vals["freq"] = "0.00"
                    else:
                        out_vals["freq"] = "%.2f" % alt_fraction
                    out_vals["filter"] = _calc_priority_filter(row, pops)
                    # Use the converted contig name when only that one is
                    # present in the reference
                    chrom = out_vals["chrom"]
                    if chrom not in ref_chroms and _hg19_to_GRCh37(chrom) in ref_chroms:
                        out_vals["chrom"] = _hg19_to_GRCh37(chrom)
                    writer.writerow([out_vals[column] for column in header])
    return vcfutils.bgzip_and_index(out_file, data["config"], tabix_args="-0 -c '#' -s 1 -b 2 -e 3")
def get_actionable_mutations(parser, args):
    """Print a tab-separated report built from the query below plus DGIdb info.

    The query is executed twice: a first pass collects the genes that are
    handed to ``query_dgidb``, a second pass prints one line per row and
    two-subject family, labelled with the name of the subject taken to be
    the tumor. ``parser`` is not used.
    """
    t_n_pairs = gemini_subjects.get_families(args.db)

    query = "SELECT variants.chrom, start, end, ref, alt, \
                    variants.gene, impact, is_somatic, \
                    gene_summary.in_cosmic_census \
             FROM variants, gene_summary \
             WHERE variants.is_somatic = 1 \
             AND (variants.type = 'snp' \
                 OR variants.type = 'indel') \
             AND (variants.impact_severity = 'HIGH' \
                 OR variants.impact_severity = 'MED') \
             AND variants.chrom = gene_summary.chrom \
             AND variants.gene = gene_summary.gene \
             AND gene_summary.in_cosmic_census = 1"

    # first pass: collect the relevant genes and query DGIDB
    gene_pass = GeminiQuery.GeminiQuery(args.db)
    gene_pass.run(query)

    genes = defaultdict()
    for hit in gene_pass:
        genes[hit['gene']] = True
    dgidb_info = query_dgidb(genes)

    # second pass: rerun the query and report one line per row and pair
    report_pass = GeminiQuery.GeminiQuery(args.db)
    report_pass.run(query)

    columns = ['chrom', 'start', 'end', 'ref', 'alt',
               'gene', 'impact', 'is_somatic', 'in_cosmic_census']
    print('\t'.join(['tum_name'] + columns + ['dgidb_info']))

    for hit in report_pass:
        for pair in t_n_pairs:
            subjects = pair.subjects
            # only families with exactly two subjects are reported
            if len(subjects) != 2:
                continue

            first, second = subjects[0], subjects[1]
            # the first subject is assumed to be the tumor unless it is
            # explicitly flagged as not affected
            tumor = second if first.affected is False else first

            fields = [tumor.name]
            fields.extend(hit[column] for column in columns)
            fields.append(str(dgidb_info[hit['gene']]))
            print('\t'.join(str(field) for field in fields))
def generate_phenotypes(database):
    """Group the sample names of ``database`` by phenotype code.

    Returns a dict with the keys 1 and 2, each mapping to the list of
    sample names whose ``phenotype`` column converts to that integer.
    Only those two keys exist, so any other phenotype value raises
    KeyError.
    """
    gq = GeminiQuery(database)
    phenotypes = {1: [], 2: []}

    gq.run("SELECT name, phenotype FROM samples")

    for sample in gq:
        code = int(sample["phenotype"])
        phenotypes[code].append(sample["name"])

    return phenotypes
def _prep_priority_filter(gemini_db, data):
    """Prepare tabix indexed file with priority based filters and supporting information

    The ``-priority.tsv`` table is only generated when neither it nor its
    ``.gz`` version exists yet; the bgzip-and-index step runs in both cases
    and its result is returned.
    """
    from gemini import GeminiQuery
    out_file = "%s-priority.tsv" % utils.splitext_plus(gemini_db)[0]
    have_output = utils.file_exists(out_file) or utils.file_exists(out_file + ".gz")
    if not have_output:
        contigs = ref.file_contigs(dd.get_ref_file(data), data["config"])
        ref_chroms = {contig.name for contig in contigs}
        with file_transaction(data, out_file) as tx_out_file:
            gq = GeminiQuery(gemini_db)
            pops = [
                "aaf_esp_ea",
                "aaf_esp_aa",
                "aaf_esp_all",
                "aaf_1kg_amr",
                "aaf_1kg_eas",
                "aaf_1kg_sas",
                "aaf_1kg_afr",
                "aaf_1kg_eur",
                "aaf_1kg_all",
                "aaf_adj_exac_all",
                "aaf_adj_exac_afr",
                "aaf_adj_exac_amr",
                "aaf_adj_exac_eas",
                "aaf_adj_exac_fin",
                "aaf_adj_exac_nfe",
                "aaf_adj_exac_oth",
                "aaf_adj_exac_sas",
            ]
            attrs = (
                "chrom, start, end, ref, alt, impact_so, impact_severity, in_dbsnp, "
                "cosmic_ids, clinvar_sig, clinvar_origin, fitcons, gt_ref_depths, gt_alt_depths"
            ).split(", ")
            gq.run("SELECT %s FROM variants" % ", ".join(attrs + pops))
            sidx = gq.sample_to_idx[dd.get_sample_name(data)]

            # Columns written per variant; the two depth attributes are
            # replaced by the computed "freq" column.
            all_pops = [pop for pop in pops if pop.endswith("_all")]
            header = attrs[:5] + ["filter"] + attrs[5:-2] + all_pops + ["freq"]

            with open(tx_out_file, "w") as out_handle:
                writer = csv.writer(out_handle, dialect="excel-tab")
                # First column name is prefixed with '#' in the header line
                writer.writerow(["#" + header[0]] + header[1:])
                for row in gq:
                    ref_depth = tz.get_in(["gt_ref_depths", sidx], row, 0)
                    alt_depth = tz.get_in(["gt_alt_depths", sidx], row, 0)
                    out_vals = dict(row.row)
                    try:
                        fraction = float(alt_depth) / float(ref_depth + alt_depth)
                    except ZeroDivisionError:
                        out_vals["freq"] = "0.00"
                    else:
                        out_vals["freq"] = "%.2f" % fraction
                    out_vals["filter"] = _calc_priority_filter(row, pops)
                    # Swap in the converted contig name when the original
                    # is missing from the reference but the converted one
                    # is present
                    chrom = out_vals["chrom"]
                    if chrom not in ref_chroms and _hg19_to_GRCh37(chrom) in ref_chroms:
                        out_vals["chrom"] = _hg19_to_GRCh37(chrom)
                    writer.writerow([out_vals[name] for name in header])
    return vcfutils.bgzip_and_index(out_file, data["config"], tabix_args="-0 -c '#' -s 1 -b 2 -e 3")
def get_query_results(gemini_db, query, gt_filter="", as_dataframe=False):
    """
    Run ``query`` on ``gemini_db`` and return its results.

    By default the executed GeminiQuery object is returned for iteration.
    With ``as_dataframe=True`` every row is converted to text, split on
    tabs and collected in a DataFrame whose columns come from the
    tab-separated query header.
    """
    gq = GeminiQuery(gemini_db)
    gq.run(query, gt_filter=gt_filter)

    if not as_dataframe:
        return gq

    # Rows are consumed before the header is read
    records = [str(row).split('\t') for row in gq]
    return pd.DataFrame(records, columns=gq.header.split('\t'))
Exemple #12
0
def get_query_results(gemini_db, query, gt_filter="", as_dataframe=False):
    """
    Execute ``query`` against ``gemini_db``.

    Returns the GeminiQuery object itself (an iterator over the rows)
    unless ``as_dataframe`` is true, in which case a DataFrame is built
    from the tab-split text of every row, with the tab-split header as
    column names.
    """
    results = GeminiQuery(gemini_db)
    results.run(query, gt_filter=gt_filter)

    if as_dataframe:
        # Rows are read first; the header is split afterwards
        cells = [str(row).split('\t') for row in results]
        column_names = results.header.split('\t')
        return pd.DataFrame(cells, columns=column_names)

    return results
Exemple #13
0
def get_variant_type(variant_source):
    """Try to find out what type of variants that exists in a variant source

        The first 1000 variants are inspected; as soon as one of them is a
        snv the answer is 'snv', otherwise it is 'sv'.

        Args:
            variant_source (str): Path to variant source

        Returns:
            variant_type (str): 'sv' or 'snv'

        Raises:
            ValueError: If the file type of the source is neither 'vcf'
                        nor 'gemini'
    """
    max_checked = 1000

    file_type = get_file_type(variant_source)
    if file_type == 'vcf':
        variants = VCF(variant_source)
    elif file_type == 'gemini':
        variants = GeminiQuery(variant_source)
        variants.run("SELECT * from variants")
    else:
        # Without this branch the loop below would fail on an unbound name
        raise ValueError(
            "Unsupported file type {0!r} for variant source {1}".format(
                file_type, variant_source))

    for i, variant in enumerate(variants):
        if i >= max_checked:
            break

        if file_type == 'vcf':
            is_snv = variant.is_snp
        else:
            is_snv = variant['type'] == 'snp'

        # One snv is enough to decide, no need to look any further
        if is_snv:
            return 'snv'

    return 'sv'
Exemple #14
0
def main():
    """Main function which parses arguments and calls relevant functions

    The modes "sample", "variant" and "table" each build a list of output
    lines that is written, newline separated, to ``arguments["output"]``.
    The mode "info" prints every tab-separated field returned by
    ``get_fields`` on its own line. Any other mode does nothing.
    """
    # Parsing arguments
    arguments = parse_arguments()

    # Processing the presets config file
    presets = classes.Presets(arguments["presets_config"])

    # Passing the arguments and presets to a query constructor object
    queryformatter = classes.QueryConstructor(arguments, presets)

    # Creating the gemini database object
    gemini_db = GeminiQuery.GeminiQuery(arguments["input"])

    # Modes that produce a table share the same call signature and the
    # same way of writing the result
    table_builders = {
        "sample": get_sample_variants,
        "variant": get_variant_information,
        "table": get_table,
    }

    mode = arguments["mode"]
    if mode in table_builders:
        output_table = table_builders[mode](gemini_db, arguments, queryformatter)
        with open(arguments["output"], 'w') as outputfile:
            outputfile.write('\n'.join(output_table))
    elif mode == "info":
        for field in get_fields(gemini_db).split('\t'):
            print(field)
Exemple #15
0
    def _variants(self, case_id, gemini_query, filters=None):
        """Yield the formatted variants that a gemini query returns

            When ``self.variant_type`` is 'sv' every row is formatted.
            Otherwise a row is only formatted when ``self._is_variant``
            accepts it for the individuals of the case. The running index
            passed to the formatter is increased for each formatted row.

            Args:
                case_id (str): The case for which we want to see information
                gemini_query (str): What variants should be chosen
                filters (dict): A dictionary with filters

            Yields:
                variant_obj (dict): A Variant formatted dictionary
        """
        gq = GeminiQuery(self.db)
        gq.run(gemini_query)

        # Get the individuals for the case
        individuals = list(self.case(case_id).individuals)

        index = 0
        for gemini_variant in gq:
            if self.variant_type == 'sv':
                index += 1
                variant = self._format_sv_variants(
                    gemini_variant=gemini_variant,
                    index=index,
                    filters=filters,
                )
            elif self._is_variant(gemini_variant, individuals):
                index += 1
                logger.debug("Updating index to: {0}".format(index))
                variant = self._format_variants(
                    gemini_variant=gemini_variant,
                    index=index,
                    filters=filters,
                )
            else:
                # Not a variant in any of the individuals
                continue

            # The formatters may return something falsy, skip those
            if variant:
                yield variant
def csv_to_query_string(csv_handle, db):
    """Print the variants of ``db`` found at the positions listed in a CSV.

    Every non-blank line of ``csv_handle`` must hold ``chrom,pos,ref,alt``;
    only chrom and pos are used for the lookup. Positions are queried in
    batches of 100 and a final, smaller batch covers the remaining lines.
    The query header is printed once, before the rows of the first batch.

    Raises:
        ValueError: If a line does not have exactly four comma separated
            fields or its position is not an integer.
    """
    batch_size = 100

    def _condition_batches():
        # Yield lists of at most batch_size SQL conditions, one per line.
        batch = []
        for line in csv_handle:
            line = line.strip()
            if not line:
                continue
            chrom, pos, _ref, _alt = line.split(",")
            # Double any quote in chrom and force pos to be a number so the
            # CSV content cannot alter the structure of the query.
            batch.append("(chrom='{0}' and start={1})".format(
                chrom.replace("'", "''"), int(pos)))
            if len(batch) == batch_size:
                yield batch
                batch = []
        if batch:
            yield batch

    for batch_number, conditions in enumerate(_condition_batches()):
        gq = GeminiQuery(db)
        gq.run("select * from variants where %s" % " or ".join(conditions))
        if batch_number == 0:
            print(gq.header)
        for row in gq:
            print(row)
Exemple #17
0
def query_json():
    """Run the query given in the ``query`` GET parameter and return its rows.

    The parameter is stripped of surrounding whitespace (a missing
    parameter becomes an empty string). The result is a dict whose
    ``gemini_results`` entry lists every row converted to a dict.
    """
    sql = request.GET.get('query', '').strip()

    gq = GeminiQuery.GeminiQuery(database)
    gq._set_gemini_browser(True)
    gq.run(sql)

    results = []
    for row in gq:
        results.append(dict(row))

    return {'gemini_results': results}
Exemple #18
0
    def _variants(self, case_id, gemini_query):
        """Yield the formatted variants that a gemini query returns

            ``self.db`` and ``self.variant_type`` are set from the case
            before the query is run. For variant type 'snv', rows that
            ``self._is_variant`` rejects for the individuals of the case
            are skipped; all other rows are formatted with a running index.

            Args:
                case_id (str): The case for which we want to see information
                gemini_query (str): What variants should be chosen

            Yields:
                variant_obj (dict): A Variant formatted dictionary
        """
        # Get the individuals for the case
        case_obj = self.case(case_id)
        individuals = list(case_obj.individuals)

        self.db = case_obj.variant_source
        self.variant_type = case_obj.variant_type

        gq = GeminiQuery(self.db)
        gq.run(gemini_query)

        index = 0
        for gemini_variant in gq:
            # Check if variant is non ref in the individuals
            is_variant = self._is_variant(gemini_variant, individuals)
            if self.variant_type == 'snv' and not is_variant:
                continue

            index += 1
            logger.debug("Updating index to: {0}".format(index))
            variant = self._format_variant(
                case_id=case_id,
                gemini_variant=gemini_variant,
                individual_objs=individuals,
                index=index,
            )

            # The formatter may return something falsy, skip those
            if variant:
                yield variant
Exemple #19
0
def csv_to_query_string(csv_handle, db):
    """Query ``db`` for the positions listed in a CSV and print the matches.

    Each non-blank line of ``csv_handle`` is expected to be
    ``chrom,pos,ref,alt``; chrom and pos select the variants. The lines
    are processed in batches of 100, followed by one smaller batch for any
    remaining lines. The query header is printed once, ahead of the rows
    of the first batch.

    Raises:
        ValueError: If a line does not split into exactly four fields or
            its position is not an integer.
    """
    batch_size = 100

    def _batched_conditions():
        # Group the per-line SQL conditions into lists of batch_size items.
        pending = []
        for line in csv_handle:
            stripped = line.strip()
            if not stripped:
                continue
            chrom, pos, _ref, _alt = stripped.split(",")
            # Quotes in chrom are doubled and pos must be an integer, so
            # the file content cannot change the shape of the query.
            pending.append("(chrom='{0}' and start={1})".format(
                chrom.replace("'", "''"), int(pos)))
            if len(pending) == batch_size:
                yield pending
                pending = []
        if pending:
            yield pending

    header_printed = False
    for conditions in _batched_conditions():
        gq = GeminiQuery(db)
        gq.run("select * from variants where %s" % " or ".join(conditions))
        if not header_printed:
            print(gq.header)
            header_printed = True
        for row in gq:
            print(row)
Exemple #20
0
def _run_gemini_query_and_filter(db, genes):
    """Use the GeminiQuery API to filter results based on severity and specific annotations
    :param db: GEMINI database.
    :type db: str.
    :param genes: When truthy, only variants accepted by ``gemini_interface.var_in_gene`` are considered.
    :returns:  tuple -- The header line for the requested columns and all rows that pass filters.
    """

    # NOTE(review): aaf_esp_aa is listed twice in the column list below.
    query = (
        "SELECT chrom, start, end, ref, alt, vcf_id, rs_ids, cosmic_ids, filter, qual, qual_depth, depth, "
        "gene, transcript, exon, codon_change, aa_change, biotype, impact, impact_so, impact_severity, "
        "aa_length, is_lof, is_conserved, pfam_domain, in_omim, clinvar_sig, clinvar_disease_name, "
        "clinvar_origin, clinvar_causal_allele, clinvar_dbsource, clinvar_dbsource_id, clinvar_on_diag_assay, "
        "rmsk, in_segdup, strand_bias, rms_map_qual, in_hom_run, num_mapq_zero, num_reads_w_dels, grc, "
        "gms_illumina, in_cse, num_alleles, allele_count, haplotype_score, is_somatic, somatic_score, "
        "aaf_esp_ea, aaf_esp_aa, aaf_esp_aa, aaf_esp_all, aaf_1kg_amr, aaf_1kg_eas, aaf_1kg_sas, aaf_1kg_afr, "
        "aaf_1kg_eur, aaf_1kg_all, aaf_exac_all, aaf_adj_exac_all, aaf_adj_exac_afr, aaf_adj_exac_amr, "
        "aaf_adj_exac_eas, aaf_adj_exac_fin, aaf_adj_exac_nfe, aaf_adj_exac_oth, aaf_adj_exac_sas, "
        "max_aaf_all, in_esp, in_1kg, in_exac FROM variants"
    )
    gq = GeminiQuery(db)
    gq.run(query)
    header = gq.header
    print(header)

    # A variant passes when it is in ClinVar, or when it is both rare and
    # protein effecting. The gene restriction is applied first.
    passing_rows = []
    for variant_data in gq:
        if genes and not gemini_interface.var_in_gene(variant_data, genes):
            continue

        in_clinvar = gemini_interface.var_is_in_clinvar(variant_data)
        if in_clinvar or (gemini_interface.var_is_rare(variant_data)
                          and gemini_interface.var_is_protein_effecting(variant_data)):
            passing_rows.append(variant_data)

    return header, passing_rows
Exemple #21
0
    def variant(self, case_id, variant_id):
        """Return a specific variant.

            The variant is looked up by its gemini ``variant_id`` in the
            variant source of the case. ``self.db`` and
            ``self.variant_type`` are set from the case along the way.

            Args:
                case_id (str): Identifier handed to ``self.case``
                variant_id (int): A gemini variant id

            Returns:
                variant_obj (dict): A puzzle variant, or None when the
                                    query returns no rows

        """
        # Use the gemini id for fast lookup; int() makes sure that only a
        # number ends up in the query
        gemini_query = "SELECT * from variants WHERE variant_id = {0}".format(
            int(variant_id))

        # Get the individuals for the case
        case_obj = self.case(case_id)
        individuals = list(case_obj.individuals)

        self.db = case_obj.variant_source
        self.variant_type = case_obj.variant_type

        gq = GeminiQuery(self.db)
        gq.run(gemini_query)

        for gemini_variant in gq:
            # Only the first row is used
            return self._format_variant(
                case_id=case_id,
                gemini_variant=gemini_variant,
                individual_objs=individuals,
                index=gemini_variant['variant_id'],
                add_all_info=True,
            )

        return None
Exemple #22
0
def identify_low_complexity(name, in_vcf, in_bed):
    """Print how many variants of a region subset fall below the GMS threshold.

    The VCF is subset by region and loaded into a gemini database. Two
    lines are printed: ``name``, then the number of variants with any of
    gms_illumina, gms_solid or gms_iontorrent below 50.0, the total number
    of variants and the former as a percentage of the latter. The
    percentage is reported as 0.0000 when the database holds no variants.
    """
    gms_thresh = 50.0
    subset_vcf = subset_by_region(name, in_vcf, in_bed)
    gemini_db = create_gemini_db(subset_vcf)
    print(name)

    def _count(query):
        # Run a "SELECT count(*)" query and return the single count.
        gq = GeminiQuery(gemini_db)
        gq.run(query)
        return list(gq)[0]["count(*)"]

    total = _count("SELECT count(*) from variants")
    low_gms = _count("SELECT count(*) from variants WHERE gms_illumina < %s OR "
                     "gms_solid < %s OR gms_iontorrent < %s" % (gms_thresh, gms_thresh, gms_thresh))

    # An empty variants table would otherwise divide by zero
    percent = float(low_gms) / float(total) * 100.0 if total else 0.0
    print("%s %s %.4f" % (low_gms, total, percent))
Exemple #23
0
def summarize_by_gene_and_sample(db, coding_only=True):
    """Count variants per gene and sample (copied from GEMINI's own burden tool).

    Selects variants whose impact is not 'synonymous_coding' and that have
    in_1kg=0; with ``coding_only`` the codon_change must also differ from
    'None'. For every row the HET samples are counted once and the HOM_ALT
    samples twice. Rows without a gene are assigned the result of
    ``get_nearby_gene``.

    Returns a DataFrame with one row per gene and one column per sample,
    missing combinations filled with 0.
    """
    query = ("select chrom, start, end, gene, impact, info from variants where"
             " impact != 'synonymous_coding' and in_1kg=0 ")
    if coding_only:
        query = query + " and codon_change != 'None'"

    gq = GeminiQuery(db)
    gq.run(query, show_variant_samples=True)

    burden = defaultdict(Counter)
    for row in gq:
        gene_name = row['gene'] or get_nearby_gene(row["chrom"], row["start"],
                                                   row["end"])

        het_counts = Counter(row["HET_samples"])
        hom_alt_counts = Counter(row["HOM_ALT_samples"])
        # Counter can't do scalar multiplication, so HOM_ALT is added twice
        sample_counts = het_counts + hom_alt_counts + hom_alt_counts

        # Drop whatever was counted under the empty-string key
        del sample_counts['']
        burden[gene_name] += sample_counts

    gene_frames = [
        pd.DataFrame(counts, columns=counts.keys(), index=[gene_name])
        for gene_name, counts in burden.items()
    ]

    return pd.concat(gene_frames).fillna(0)
def summarize_gene_region(args, gene):
    """Print the variants of ``gene`` on ``args.chrom`` that have a NULL filter.

    Variants are read from ``args.geminidb`` ordered by start. A variant
    whose impact_severity is not "LOW" is printed in full, followed by one
    indented line (sample, genotype, depth) for every sample whose gt_type
    is greater than zero. Of the "LOW" variants only indels are reported,
    as chrom, start, ref and alt.
    """
    gq = GeminiQuery(args.geminidb)
    gq.run("SELECT chrom, start, end, ref, alt, type, "
           "num_hom_ref, num_het, num_hom_alt, "
           "aaf_1kg_all, "
           "gene, impact, impact_severity, aa_change, clinvar_sig, "
           "grc, gms_illumina, in_cse, rmsk "
           "FROM variants WHERE "
           "chrom == '{chrom}' AND gene == '{gene}' AND filter is NULL "
           "ORDER BY start".format(chrom=args.chrom, gene=gene))
    for row in gq:
        if row["impact_severity"] != "LOW":
            print(row)
            for i, (gt, gt_type, gt_depth) in enumerate(
                    zip(row.gts, row.gt_types, row.gt_depths)):
                if gt_type > 0:
                    # Space separated, starting with a two-space indent
                    print(" ".join(
                        str(item)
                        for item in ("  ", gq.index2sample[i], gt, gt_depth)))
        elif row["type"] == "indel":
            print(" ".join(
                str(row[key]) for key in ("chrom", "start", "ref", "alt")))
def summarize_gene_region(args, gene):
    """Print a per-variant summary for ``gene`` on ``args.chrom``.

    Only variants with a NULL filter are selected from ``args.geminidb``,
    ordered by start. Variants whose impact_severity differs from "LOW"
    are printed in full, each followed by an indented line with sample
    name, genotype and depth for every sample whose gt_type is greater
    than zero. "LOW" variants are only reported when they are indels, as
    chrom, start, ref and alt.
    """
    gq = GeminiQuery(args.geminidb)
    gq.run("SELECT chrom, start, end, ref, alt, type, "
           "num_hom_ref, num_het, num_hom_alt, "
           "aaf_1kg_all, "
           "gene, impact, impact_severity, aa_change, clinvar_sig, "
           "grc, gms_illumina, in_cse, rmsk "
           "FROM variants WHERE "
           "chrom == '{chrom}' AND gene == '{gene}' AND filter is NULL "
           "ORDER BY start"
           .format(chrom=args.chrom, gene=gene))
    for row in gq:
        if row["impact_severity"] != "LOW":
            print(row)
            genotypes = zip(row.gts, row.gt_types, row.gt_depths)
            for i, (gt, gt_type, gt_depth) in enumerate(genotypes):
                if gt_type > 0:
                    # Fields are space separated after a two-space indent
                    fields = ("  ", gq.index2sample[i], gt, gt_depth)
                    print(" ".join(str(field) for field in fields))
        elif row["type"] == "indel":
            fields = (row["chrom"], row["start"], row["ref"], row["alt"])
            print(" ".join(str(field) for field in fields))
Exemple #26
0
def summarize_by_gene_and_sample(db, coding_only=True):
    """Count variant alleles per gene and per sample.

    This is copied from GEMINI's own burden tool.

    Variants whose impact is not 'synonymous_coding' and that have
    ``in_1kg=0`` are selected. Each sample listed in the row's
    "HET_samples" contributes 1 to its gene, each sample listed in
    "HOM_ALT_samples" contributes 2. Rows without a gene name are assigned
    the result of ``get_nearby_gene``.

    Args:
        db: GEMINI database handed to GeminiQuery.
        coding_only (bool): When true, additionally require
            ``codon_change != 'None'``.

    Returns:
        pandas.DataFrame: One row per gene and one column per sample, with
        missing combinations filled with 0. An empty DataFrame is returned
        when the query selects no variants (``pd.concat`` would otherwise
        raise ValueError on an empty list).
    """
    query = ("select chrom, start, end, gene, impact, info from variants where"
             " impact != 'synonymous_coding' and in_1kg=0 ")

    if coding_only:
        query += " and codon_change != 'None'"

    gq = GeminiQuery(db)
    gq.run(query, show_variant_samples=True)

    burden = defaultdict(Counter)

    for row in gq:
        gene_name = row['gene']
        if not gene_name:
            gene_name = get_nearby_gene(row["chrom"], row["start"], row["end"])

        # Counter can't do scalar multiplication, so the homozygous
        # alternate samples are simply added twice.
        hom_alt_counts = Counter(row["HOM_ALT_samples"])
        new_counts = Counter(row["HET_samples"]) + hom_alt_counts + hom_alt_counts

        # Drop the empty sample name (Counter deletion never raises).
        del new_counts['']
        burden[gene_name] += new_counts

    if not burden:
        return pd.DataFrame()

    dfs = [
        pd.DataFrame(counts, columns=list(counts), index=[gene_name])
        for gene_name, counts in burden.items()
    ]

    return pd.concat(dfs).fillna(0)
Exemple #27
0
    def variant(self, case_id, variant_id):
        """Fetch one variant from the gemini database by its variant id.

            A gemini query on the variant id is built and the first row it
            yields is formatted together with the individuals of the case.

            Args:
                case_id (str): Name of the case whose individuals are used
                variant_id (int): A gemini variant id

            Returns:
                variant_obj: The formatted puzzle variant, or None when the
                             query yields no rows

        """
        # int() makes sure that only a number ends up in the query string
        gemini_query = "SELECT * from variants WHERE variant_id = {0}".format(
            int(variant_id)
        )

        # Individuals of every case whose name matches case_id
        individuals = [
            individual
            for case in self.cases() if case['name'] == case_id
            for individual in case['individuals']
        ]

        gq = GeminiQuery(self.db)
        gq.run(gemini_query)

        for gemini_variant in gq:
            # Only the first row is of interest
            return self._format_variant(
                gemini_variant=gemini_variant,
                individual_objs=individuals,
                index=gemini_variant['variant_id']
            )

        return None
Exemple #28
0
    def variant(self, case_id, variant_id):
        """Look up a single variant of a case by its gemini variant id.

            Before the query is run the adapter is pointed at the case:
            self.db and self.variant_type are overwritten with the values
            stored on the case object.

            Args:
                case_id (str): Id handed to self.case() to fetch the case
                variant_id (int): A gemini variant id

            Returns:
                variant_obj: The formatted puzzle variant (with all info
                             added), or None when the query yields no rows

        """
        # The gemini id gives a fast lookup; int() rejects non numeric ids
        gemini_query = "SELECT * from variants WHERE variant_id = {0}".format(
            int(variant_id))

        # Get the individuals for the case
        case_obj = self.case(case_id)
        individuals = list(case_obj.individuals)

        self.db = case_obj.variant_source
        self.variant_type = case_obj.variant_type

        gq = GeminiQuery(self.db)
        gq.run(gemini_query)

        for gemini_variant in gq:
            # Stop at the first row
            return self._format_variant(
                case_id=case_id,
                gemini_variant=gemini_variant,
                individual_objs=individuals,
                index=gemini_variant['variant_id'],
                add_all_info=True)

        return None
Exemple #29
0
    def _variants(self, case_id, gemini_query):
        """Yield the variants of a gemini query that are seen in a case

            Rows are kept only when self._is_variant() accepts them for the
            indexes of the individuals in the case. Kept rows are numbered
            from 1 in the order they are found.

            Args:
                case_id (str): The case for which we want to see information
                gemini_query (str): What variants should be chosen

            Yields:
                variant_obj: A formatted variant
        """
        gq = GeminiQuery(self.db)
        gq.run(gemini_query)

        # Get the individuals for the case
        individuals = [
            individual
            for case in self.cases() if case.name == case_id
            for individual in case.individuals
        ]
        indexes = [individual.index for individual in individuals]

        index = 0
        for gemini_variant in gq:
            # Skip rows that are not a variant in any of the individuals
            if not self._is_variant(gemini_variant, indexes):
                continue

            index += 1
            logger.debug("Updating index to: {0}".format(index))

            yield self._format_variant(
                gemini_variant=gemini_variant,
                individual_objs=individuals,
                index=index
            )
Exemple #30
0
def family_wise_predicate(args):
    """Build a row predicate that combines one predicate per family.

    For every family returned by ``get_family_dict(args)`` a predicate is
    created with ``select_subjects_predicate``. The returned function sums
    the results of all family predicates for a row and reports whether that
    sum reaches ``args.min_kindreds``.

    Args:
        args: Parsed arguments; ``db`` and ``min_kindreds`` are read here,
            the helpers called below read whatever else they need.

    Returns:
        callable: ``predicate(row)`` giving the comparison result.
    """
    formatter = select_formatter(args)
    families = get_family_dict(args)
    # Not used afterwards; kept so the constructor still runs as before.
    gq = GeminiQuery.GeminiQuery(args.db, out_format=formatter)

    def build_family_predicate(family):
        member_names = [member.name for member in family]
        subjects = get_subjects_in_family(args, family).values()
        return select_subjects_predicate(subjects, args, member_names)

    predicates = [build_family_predicate(family)
                  for family in families.values()]

    def predicate(row):
        matching = sum(family_predicate(row)
                       for family_predicate in predicates)
        return matching >= args.min_kindreds

    return predicate
Exemple #31
0
def gemini_query(db):
    """Run a fixed, wide SELECT on the variants table of a GEMINI database.

    The query selects a long list of annotation columns plus the per-sample
    genotype, depth, reference depth and alternate depth wildcards.

    :param db: A GEMINI database name
    :type db: str
    :returns: the GeminiQuery object after ``run``; iterate it to get rows
    """
    query = (
        "SELECT chrom, start, end, ref, alt, vcf_id, rs_ids, cosmic_ids, filter, qual, qual_depth, depth, "
        "type, sub_type, "
        "gene, transcript, exon, codon_change, aa_change, biotype, impact, impact_so, impact_severity, "
        "aa_length, is_lof, is_conserved, pfam_domain, in_omim, clinvar_sig, clinvar_disease_name, "
        "is_exonic, is_coding, is_splicing, "
        "clinvar_origin, clinvar_causal_allele, clinvar_dbsource, clinvar_dbsource_id, "
        "clinvar_on_diag_assay, rmsk, in_segdup, strand_bias, rms_map_qual, in_hom_run, num_mapq_zero, "
        "num_reads_w_dels, grc, gms_illumina, in_cse, num_alleles, allele_count, haplotype_score, "
        "is_somatic, somatic_score, aaf_esp_ea, aaf_esp_aa, aaf_esp_all, aaf_1kg_amr, "
        "aaf_1kg_eas, aaf_1kg_sas, aaf_1kg_afr, aaf_1kg_eur, aaf_1kg_all, aaf_exac_all, aaf_adj_exac_all, "
        "aaf_adj_exac_afr, aaf_adj_exac_amr, aaf_adj_exac_eas, aaf_adj_exac_fin, aaf_adj_exac_nfe, "
        "aaf_adj_exac_oth, aaf_adj_exac_sas, max_aaf_all, in_esp, in_1kg, in_exac, info,"
        "(gts).(*), (gt_depths).(*), (gt_ref_depths).(*), (gt_alt_depths).(*) FROM variants"
    )

    runner = GeminiQuery(db)
    runner.run(query)
    return runner
Exemple #32
0
def stats_region(chrom):
    """Return the start/end of every variant inside a chromosome region.

    ``chrom`` is given as an argument; ``start`` and ``end`` are read from
    the HTTP GET parameters of the current request. Because all three values
    come from the client and the query is assembled as text, they are
    sanitised first: ``start`` and ``end`` must parse as integers and single
    quotes in ``chrom`` are doubled (the SQL way of escaping them inside a
    string literal).

    Args:
        chrom (str): Chromosome name to match against the ``chrom`` column.

    Returns:
        dict: ``{'features': [...]}`` with one dict per selected row. The
        list is empty when ``start`` or ``end`` is missing or not an integer.
    """
    start = request.GET.get('start', '').strip()
    end = request.GET.get('end', '').strip()

    try:
        start = int(start)
        end = int(end)
    except ValueError:
        # Nothing sensible can be selected without numeric coordinates, and
        # passing the raw text on would let the client inject SQL.
        return {'features': []}

    safe_chrom = chrom.replace("'", "''")

    # construct a query
    query = ("SELECT start, end from variants"
             " WHERE chrom = '{0}'"
             " AND start >= {1}"
             " AND end <= {2}").format(safe_chrom, start, end)

    # issue the query
    gq = GeminiQuery.GeminiQuery(database)
    gq._set_gemini_browser(True)
    gq.run(query)

    # return query results as a dict of row dicts
    return {'features': [dict(row) for row in gq]}
Exemple #33
0
def phase_genotypes(database):
    """Print a Mendelian classification and phasing line per child genotype.

    Every variant in the GEMINI database is visited. For each family from
    ``subjects.get_families`` and each child of that family, the genotype
    type codes of child, father and mother are looked up and classified.
    One tab-separated line is printed per (variant, child) with: chrom,
    start, end, family id, child name, ref, alt, gene, impact, the ref
    depths (kid, dad, mom), the alt depths (kid, dad, mom), the genotypes
    (kid, dad, mom), the genotype type codes (kid, dad, mom), and the
    classification columns mendelian, phasable, inheritance, origin and
    phasedata.

    The classification fields are reset for every child. Previously
    ``phasedata`` was only reset once per variant, so an unphasable child
    printed after a phasable one repeated the earlier child's phasing text.

    Genotypes are split on "/" to obtain alleles; a genotype without "/"
    can therefore raise IndexError in the phasable branches.

    Args:
        database: GEMINI database handed to GeminiQuery and
            ``subjects.get_families``.
    """
    gq = GeminiQuery(database)
    families = subjects.get_families(database)
    gq.run("select chrom, start, end, ref, alt, gene, impact, gts, gt_types, gt_ref_depths, gt_alt_depths from variants")
    s2i = gq.sample_to_idx

    # (kid, dad, mom) genotype type codes grouped by the verdict they get.
    # None of these contains the code 2, which is handled on its own below.
    inherited = ((0, 1, 1), (3, 1, 1), (0, 0, 0), (1, 1, 1), (3, 3, 3))
    inherited_or_unlikely_de_novo = (
        (0, 0, 1), (0, 1, 0), (3, 3, 1), (3, 1, 3))
    rare_de_novo = ((0, 3, 3), (3, 0, 0))
    de_novo = ((1, 0, 0), (1, 3, 3),
               (0, 1, 3), (0, 3, 1), (3, 0, 1), (3, 1, 0),
               (3, 3, 0), (0, 0, 3),
               (0, 3, 0), (3, 0, 3))

    for row in gq:
        chrom = str(row['chrom'])
        start = str(row['start'])
        end = str(row['end'])
        ref = str(row['ref'])
        alt = str(row['alt'])
        gene = str(row['gene'])
        impact = str(row['impact'])
        for family in families:
            dad_idx = s2i[family.father_name]
            mom_idx = s2i[family.mother_name]
            dad_gt = str(row['gts'][dad_idx])
            mom_gt = str(row['gts'][mom_idx])
            dad_gt_type = row['gt_types'][dad_idx]
            mom_gt_type = row['gt_types'][mom_idx]
            dad_gt_ref_depths = str(row['gt_ref_depths'][dad_idx])
            mom_gt_ref_depths = str(row['gt_ref_depths'][mom_idx])
            dad_gt_alt_depths = str(row['gt_alt_depths'][dad_idx])
            mom_gt_alt_depths = str(row['gt_alt_depths'][mom_idx])
            dad_alleles = dad_gt.split("/")
            mom_alleles = mom_gt.split("/")
            for child in family.children:
                kid_idx = s2i[str(child.name)]
                kid_gt = str(row['gts'][kid_idx])
                kid_gt_type = row['gt_types'][kid_idx]
                kid_gt_ref_depths = str(row['gt_ref_depths'][kid_idx])
                kid_gt_alt_depths = str(row['gt_alt_depths'][kid_idx])

                # Start every child from a clean slate so nothing leaks
                # over from the previous child or family.
                mendelian = ""
                phasable = ""
                inheritance = ""
                origin = ""
                phasedata = ""

                trio = (kid_gt_type, dad_gt_type, mom_gt_type)
                if trio in inherited:
                    mendelian = "mendelian"
                    phasable = "unphasable"
                    inheritance = "unknown"
                    origin = "inherited from both parents"
                elif 2 in trio:
                    mendelian = "missing allele - unknown"
                    phasable = "missing allele - unknown"
                    inheritance = "missing allele - unknown"
                    origin = "missing allele - unknown"
                elif trio in de_novo:
                    mendelian = "non-mendelian"
                    phasable = "unphasable"
                    inheritance = "unknown"
                    origin = "de novo or erroneous data"
                elif trio in rare_de_novo:
                    mendelian = "non-mendelian"
                    phasable = "unphasable"
                    inheritance = "unknown"
                    origin = "extremely rare de novo or bad data"
                elif trio in inherited_or_unlikely_de_novo:
                    mendelian = "mendelian"
                    phasable = "unphasable"
                    inheritance = "unknown"
                    origin = "inherited from both parents or unlikely de novo"
                elif trio in ((1, 3, 1), (1, 0, 1)):
                    # Type code 1 for kid and mom, dad 3 or 0: report dad's
                    # first allele and the mom allele that differs from it.
                    mendelian = "mendelian"
                    phasable = "phasable"
                    if dad_gt_type == 3:
                        inheritance = "homozygous alternate from dad, heterozygous allele from mom"
                    else:
                        inheritance = "homozygous reference from dad_gt_type, heterozygous allele from mom_gt_type"
                    origin = "inherited from both parents or unlikely de novo"
                    if dad_alleles[0] == mom_alleles[0]:
                        mom_allele = mom_alleles[1]
                    else:
                        mom_allele = mom_alleles[0]
                    phasedata = dad_alleles[0] + " from dad " + mom_allele + " from mom"
                elif trio in ((1, 1, 3), (1, 1, 0)):
                    # Mirror image: type code 1 for kid and dad, mom 3 or 0.
                    mendelian = "mendelian"
                    phasable = "phasable"
                    if mom_gt_type == 3:
                        inheritance = "heterozygous allele from dad_gt_type, homozygous alternate from mom_gt_type"
                    else:
                        inheritance = "heterozygous allele from dad_gt_type, homozygous reference from mom_gt_type"
                    origin = "inherited from both parents or unlikely de novo"
                    if dad_alleles[0] == mom_alleles[0]:
                        dad_allele = dad_alleles[1]
                    else:
                        dad_allele = dad_alleles[0]
                    phasedata = dad_allele + " from dad " + mom_alleles[0] + " from mom"
                elif trio in ((1, 3, 0), (1, 0, 3)):
                    # Parents have type codes 3 and 0 (either way round):
                    # the first allele of each parent is reported.
                    mendelian = "mendelian"
                    phasable = "phasable"
                    if dad_gt_type == 3:
                        inheritance = "homozygous alternate from dad_gt_type, homozygous reference from mom_gt_type"
                    else:
                        inheritance = "homozygous reference from dad_gt_type, homozygous alternate from mom_gt_type"
                    origin = "inherited from both parents or unlikely de novo"
                    phasedata = dad_alleles[0] + " from dad " + mom_alleles[0] + " from mom"

                # The former "except TypeError" fallback repeated the same
                # concatenation minus phasedata (always a str), so it could
                # never succeed where the first attempt failed; a non-string
                # field still raises TypeError here.
                print("\t".join([
                    chrom, start, end, family.family_id, child.name,
                    ref, alt, gene, impact,
                    kid_gt_ref_depths, dad_gt_ref_depths, mom_gt_ref_depths,
                    kid_gt_alt_depths, dad_gt_alt_depths, mom_gt_alt_depths,
                    kid_gt, dad_gt, mom_gt,
                    str(kid_gt_type), str(dad_gt_type), str(mom_gt_type),
                    mendelian, phasable, inheritance, origin, phasedata,
                ]))
def get_impact_gene(args):
    """Return the ``gene`` value of the first variant found at a position.

    The variants table is searched for rows on ``args.chrom`` whose ``start``
    equals ``args.pos - 1``. The first row's gene is returned; if the query
    yields no rows, ``gq.next()`` raises (StopIteration, by the iterator
    protocol).

    Args:
        args: Object with ``geminidb``, ``chrom`` and ``pos`` attributes.
    """
    gq = GeminiQuery(args.geminidb)
    query = "SELECT gene FROM variants WHERE chrom == '{chrom}' AND start == {pos}".format(
        chrom=args.chrom, pos=args.pos - 1)
    gq.run(query)
    first_row = gq.next()
    return first_row["gene"]
#!/usr/share/gemini/anaconda/bin/python -E
# -*- coding: utf-8 -*-
# usage: gemini_summarize.py <query> <gemini.db>

import sys
import locale
from gemini import GeminiQuery

# Thresholds; neither is referenced in the visible part of this fragment.
DP_THRESHOLD = 8
GQ_THRESHOLD = 20

# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
# the default string encoding can be set to the locale's preferred encoding.
reload(sys)
sys.setdefaultencoding(locale.getpreferredencoding())

# Per the usage line: argv[1] is the query, argv[2] the gemini database.
gq = GeminiQuery(sys.argv[2], include_gt_cols=True)
# NOTE(review): the positional None/True are presumably the genotype filter
# and show_variant_samples arguments of run() -- confirm against GeminiQuery.
gq.run(sys.argv[1], None, True)

header_printed = False
genotype_columns = [
    'gt_depths', 'gt_ref_depths', 'gt_alt_depths', 'gts', 'gt_quals'
]
for row in gq:
    # All row keys except the last 11.
    # NOTE(review): presumably the trailing genotype/sample columns -- confirm.
    columns = row.row.keys()[:-11]
    if (not header_printed):
        # Print file header once: SAMPLE_ID, the genotype column names with
        # their last character removed, then the remaining row columns
        print '\t'.join(['SAMPLE_ID'] + [s[:-1]
                                         for s in genotype_columns] + columns)
        header_printed = True

    # Output only het & hom alt variants
    for sample in row['variant_samples']:
#!/usr/share/gemini/anaconda/bin/python -E
# -*- coding: utf-8 -*-
# usage: gemini_summarize.py <query> <gemini.db>

import sys
import locale
from gemini import GeminiQuery

# Minimum genotype depth used in the filter below; GQ_THRESHOLD is not
# referenced in the visible part of this fragment.
DP_THRESHOLD = 8
GQ_THRESHOLD = 20

# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
# the default string encoding can be set to the locale's preferred encoding.
reload(sys)
sys.setdefaultencoding(locale.getpreferredencoding())

# Per the usage line: argv[1] is the query, argv[2] the gemini database.
gq = GeminiQuery(sys.argv[2], include_gt_cols=True)
# NOTE(review): the positional None/True are presumably the genotype filter
# and show_variant_samples arguments of run() -- confirm against GeminiQuery.
gq.run(sys.argv[1], None, True)

header_printed = False
genotype_columns = ["gt_depths", "gt_ref_depths", "gt_alt_depths", "gts", "gt_quals"]
for row in gq:
    # All row keys except the last 3.
    # NOTE(review): presumably the trailing sample-list columns -- confirm.
    columns = row.row.keys()[:-3]
    if not header_printed:
        # Print file header once: SAMPLE_ID, the genotype column names with
        # their last character removed, then the remaining row columns
        print "\t".join(["SAMPLE_ID"] + [s[:-1] for s in genotype_columns] + columns)
        header_printed = True

        # Output only het & hom alt variants
    for sample in row.variant_samples:
        # Drop low depth and low quality variants
        if (
            row["gt_depths"][gq.sample2index[sample]] < DP_THRESHOLD
Exemple #37
0
def get_individuals(variant_source, case_lines=None, case_type='ped', variant_mode='vcf'):
    """Collect the individuals of a vcf file or a gemini database.

        In 'vcf' mode the individuals come from the ped like lines if these
        are given (exactly one family is allowed), otherwise from the vcf
        header. In 'gemini' mode they are read from the samples table. Any
        other mode gives an empty list.

        Args:
            variant_source (str): Path to a variant source
            case_lines(Iterable): Ped like lines
            case_type(str): Format of ped lines
            variant_mode(str): 'vcf' or 'gemini'

        Returns:
            individuals (list): list with Individuals

        Raises:
            IOError: if the ped lines describe another number of families
                     than one
            PedigreeError: if a KeyError occurs while building an individual
                           from the ped lines (individual not in the vcf)
    """
    individuals = []

    if variant_mode == 'gemini':
        gq = GeminiQuery(variant_source)
        # Dictionary with sample to index in the gemini database
        sample_index = gq.sample_to_idx
        gq.run("SELECT * from samples")
        for sample in gq:
            logger.debug("Found individual {0} with family id {1}".format(
                sample['name'], sample['family_id']))
            individuals.append(
                Individual(
                    ind_id=sample['name'],
                    case_id=sample['family_id'],
                    mother=sample['maternal_id'],
                    father=sample['paternal_id'],
                    sex=sample['sex'],
                    phenotype=sample['phenotype'],
                    ind_index=sample_index.get(sample['name']),
                    variant_source=variant_source,
                    bam_path=None)
            )
        return individuals

    if variant_mode != 'vcf':
        return individuals

    head = get_header(variant_source)
    # Dictionary with ind_id:index where index shows where in the vcf the
    # info of the individual is
    vcf_index = {}
    for position, vcf_ind in enumerate(head.individuals):
        vcf_index[vcf_ind] = position

    if not case_lines:
        # No ped information, use the individuals of the vcf header
        case_id = os.path.basename(variant_source)
        for vcf_ind in vcf_index:
            individuals.append(
                Individual(
                    ind_id=vcf_ind,
                    case_id=case_id,
                    variant_source=variant_source,
                    ind_index=vcf_index[vcf_ind]
                )
            )
            logger.debug("Found individual {0} in {1}".format(
                         vcf_ind, variant_source))
        return individuals

    # read individuals from ped file
    family_parser = FamilyParser(case_lines, family_type=case_type)
    families = family_parser.families
    logger.debug("Found families {0}".format(
                ','.join(list(families.keys()))))
    if len(families) != 1:
        logger.error("Only one family can be used with vcf adapter")
        raise IOError

    case_id = list(families.keys())[0]
    logger.debug("Family used in analysis: {0}".format(case_id))

    for ind_id in family_parser.individuals:
        ped_ind = family_parser.individuals[ind_id]
        logger.info("Found individual {0}".format(ped_ind.individual_id))
        try:
            individuals.append(
                Individual(
                    ind_id=ind_id,
                    case_id=case_id,
                    mother=ped_ind.mother,
                    father=ped_ind.father,
                    sex=str(ped_ind.sex),
                    phenotype=str(ped_ind.phenotype),
                    variant_source=variant_source,
                    ind_index=vcf_index[ind_id],
                )
            )
        except KeyError:
            # This is the case when individuals in ped does not exist
            # in vcf
            raise PedigreeError(
                family_id=case_id,
                individual_id=ind_id,
                message="Individual {0} exists in ped file but not in vcf".format(ind_id)
                )

    return individuals
def get_impact_gene(args):
    """Look up the gene annotated on the variant at ``args.chrom``/``args.pos``.

    The query matches the ``start`` column against ``args.pos - 1`` and only
    the first row of the result is used. With an empty result ``gq.next()``
    raises instead of returning (StopIteration, by the iterator protocol).

    Args:
        args: Object with ``geminidb``, ``chrom`` and ``pos`` attributes.

    Returns:
        The ``gene`` column of the first matching row.
    """
    gq = GeminiQuery(args.geminidb)
    template = "SELECT gene FROM variants WHERE chrom == '{chrom}' AND start == {pos}"
    gq.run(template.format(chrom=args.chrom, pos=args.pos - 1))
    return gq.next()["gene"]
Exemple #39
0
def find_de_novo():
    """Command line entry point: print candidate de novo variants.

    Command line options (all read from ``sys.argv``):
        -f / --input_file: SQLite .db that gemini can read (required)
        -p / --min_total_parent_depth: minimum ref + alt depth per parent
        -c / --min_total_child_depth: minimum ref + alt depth of the child
        -m / --max_alt_parent_depth: maximum alt depth per parent
        -a / --max_alt_child_depth: maximum alt depth of the child

    For every variant and every child of every family, the genotype type
    codes of (kid, dad, mom) are compared with the combinations listed
    below. Matching trios that also pass the depth limits are printed as
    one tab-separated line: chrom, start, end, family id, child name, ref,
    alt, gene, impact, ref depths (kid, dad, mom), alt depths (kid, dad,
    mom), genotypes (kid, dad, mom), genotype type codes (kid, dad, mom),
    "non-mendelian", "unphasable", "unknown" and the origin text.

    Exits through ``sys.exit`` with a message when no argument at all is
    given (after printing the help) or when no input file is supplied.
    """
    # defines input arguments
    parser = argparse.ArgumentParser(description='Finds de novos')
    parser.add_argument('-f', '--input_file', default='', help='The input file; should be a SQLite .db that gemini can read')
    parser.add_argument('-p', '--min_total_parent_depth', default='0', help='The minimum total read depth for parental alleles for variants to be considered')
    parser.add_argument('-c', '--min_total_child_depth', default='0', help='The minimum total read depth for child alleles for variants to be considered')
    parser.add_argument('-m', '--max_alt_parent_depth', default='0', help='The maximum alternate read depth for parental alleles for de novos to be considered')
    parser.add_argument('-a', '--max_alt_child_depth', default='0', help='The maximum alternate read depth for child alleles for de novos to be considered')

    # checks minimum number of arguments; sys.argv always holds the program
    # name, so "no arguments" means fewer than two entries
    if len(sys.argv) < 2:
        parser.print_help()
        sys.exit("Where is the input file?")

    # parses arguments
    args = parser.parse_args()
    database = args.input_file
    mtpd = int(args.min_total_parent_depth)
    mtcd = int(args.min_total_child_depth)
    mapd = int(args.max_alt_parent_depth)
    macd = int(args.max_alt_child_depth)
    if database == '':
        sys.exit("You must supply an input file")

    gq = GeminiQuery(database)
    families = subjects.get_families(database)
    gq.run("select chrom, start, end, ref, alt, gene, impact, gts, gt_types, gt_ref_depths, gt_alt_depths from variants")
    s2i = gq.sample_to_idx

    # (kid, dad, mom) genotype type codes that are reported, by origin text.
    # Trios holding the code 2, and every other combination, are skipped.
    rare_de_novo = ((0, 3, 3), (3, 0, 0))
    de_novo = ((1, 0, 0), (1, 3, 3),
               (0, 1, 3), (0, 3, 1), (3, 0, 1), (3, 1, 0),
               (3, 3, 0), (0, 0, 3),
               (0, 3, 0), (3, 0, 3))

    for row in gq:
        chrom = str(row['chrom'])
        start = str(row['start'])
        end = str(row['end'])
        ref = str(row['ref'])
        alt = str(row['alt'])
        gene = str(row['gene'])
        impact = str(row['impact'])
        for family in families:
            dad_idx = s2i[family.father_name]
            mom_idx = s2i[family.mother_name]
            dad_gt = str(row['gts'][dad_idx])
            mom_gt = str(row['gts'][mom_idx])
            dad_gt_type = row['gt_types'][dad_idx]
            mom_gt_type = row['gt_types'][mom_idx]
            dad_gt_ref_depths = str(row['gt_ref_depths'][dad_idx])
            mom_gt_ref_depths = str(row['gt_ref_depths'][mom_idx])
            dad_gt_alt_depths = str(row['gt_alt_depths'][dad_idx])
            mom_gt_alt_depths = str(row['gt_alt_depths'][mom_idx])
            for child in family.children:
                kid_idx = s2i[str(child.name)]
                kid_gt = str(row['gts'][kid_idx])
                kid_gt_type = row['gt_types'][kid_idx]
                kid_gt_ref_depths = str(row['gt_ref_depths'][kid_idx])
                kid_gt_alt_depths = str(row['gt_alt_depths'][kid_idx])

                # code for removing variants that do not meet parameters
                if (int(dad_gt_ref_depths) + int(dad_gt_alt_depths) < mtpd
                        or int(mom_gt_ref_depths) + int(mom_gt_alt_depths) < mtpd
                        or int(kid_gt_ref_depths) + int(kid_gt_alt_depths) < mtcd
                        or int(dad_gt_alt_depths) > mapd
                        or int(mom_gt_alt_depths) > mapd
                        or int(kid_gt_alt_depths) > macd):
                    continue

                trio = (kid_gt_type, dad_gt_type, mom_gt_type)
                if trio in de_novo:
                    origin = "de novo or erroneous data"
                elif trio in rare_de_novo:
                    origin = "extremely rare de novo or bad data"
                else:
                    continue

                print("\t".join([
                    chrom, start, end, family.family_id, child.name,
                    ref, alt, gene, impact,
                    kid_gt_ref_depths, dad_gt_ref_depths, mom_gt_ref_depths,
                    kid_gt_alt_depths, dad_gt_alt_depths, mom_gt_alt_depths,
                    kid_gt, dad_gt, mom_gt,
                    str(kid_gt_type), str(dad_gt_type), str(mom_gt_type),
                    "non-mendelian", "unphasable", "unknown", origin,
                ]))
Exemple #40
0
def run_gemini_query(self, id, query, genotype_filter, json_filename, mode, results_string, user_id):
    """Run a GEMINI query and cache its rows as a JSON file plus a result record.

    The query is always executed so that the header is available. The JSON
    result file is named from the database name, query and genotype filter;
    when it already exists, writing the file and storing the result record
    are both skipped.

    Progress is reported through ``self.update_state`` and debug messages
    are written to stderr.

    Args:
        self: Task object exposing ``update_state``.
        id: Id of the ``models.GDatabase`` entry (wrapped in ObjectId).
        query (str): GEMINI query text.
        genotype_filter: Genotype filter passed to ``GeminiQuery.run``.
        json_filename (str): File name of the JSON cache in STATIC_FOLDER.
        mode: Not used by this function.
        results_string (str): "_"-separated string; element 3 becomes the
            query slug of the stored result.
        user_id: Id of the ``models.User`` creating the result.

    Returns:
        tuple: (header, js_header, results_file, gdb.file, query,
        genotype_filter, results_string, json_results_fh)
    """
    json_results_fh = os.path.join(STATIC_FOLDER, json_filename)
    results_file = "/static/%s" % json_filename

    self.update_state(state='PROGRESS', meta={'status': 'Setup'})
    sys.stderr.write("DEBUG: Retrieving GEMINI db object\n")
    gdb = models.GDatabase.objects.get(id=ObjectId(id))

    sys.stderr.write("DEBUG: Setup Query Object\n")
    gq = GeminiQuery(gdb.file, out_format=JSONRowFormat(None))

    self.update_state(state='PROGRESS', meta={'status': 'Running Query'})
    sys.stderr.write("DEBUG: Run GEMINi Query\n")
    gq.run(query, genotype_filter)

    sys.stderr.write("DEBUG: Getting header\n")
    header = gq.header
    # Same column names with every "." prefixed by two backslashes.
    js_header = [key.replace('.', '\\\\.') for key in header]

    # The json result file is a unique name generated from the database name, query, and genotype_filter
    # If the json results file already exists we save some time by skipping generating the file.
    # We only re-executed the query to get the header object.
    sys.stderr.write("DEBUG: Checking if file exists\n")
    self.update_state(state='PROGRESS', meta={'status': 'Checking for Existence of File'})
    if not os.path.isfile(json_results_fh):
        sys.stderr.write("DEBUG: Opening results file\n")
        self.update_state(state='PROGRESS', meta={'status': 'Writing data to JSON file'})
        with open(json_results_fh, "wb") as json_out:
            json_out.write("""{\n"data": [\n""")
            for count, row in enumerate(gq):
                # Rows are comma separated; the first one has no leading comma.
                if count == 0:
                    json_out.write("%s" % row)
                else:
                    json_out.write(",\n%s" % row)
            json_out.write("""\n]\n}\n""")

        self.update_state(state='PROGRESS', meta={'status': 'File Writing Complete'})
        sys.stderr.write("DEBUG: Done writing results file\n")

        # Save Results to database
        self.update_state(state='PROGRESS', meta={'status': 'Sending Results to Database'})
        sys.stderr.write("DEBUG: Saving results to database\n")
        sys.stderr.write("DEBUG: Fetching user\n")
        user = models.User.objects.get(id=user_id)
        result_elements = results_string.split('_')

        sys.stderr.write("DEBUG: Creating result object\n")
        # NOTE(review): datetime.datetime.now is passed uncalled, as before;
        # presumably the model field resolves callables -- confirm.
        r = models.GResult(header=header, js_header=js_header, query=query, query_slug=result_elements[3],
                           created_on=datetime.datetime.now, created_by=user, last_accessed=datetime.datetime.now)

        sys.stderr.write("DEBUG: JSON Opening file\n")
        # The handle is closed as soon as the upload call returns; it used
        # to be left open.
        with open(json_results_fh, 'rb') as json_in:
            sys.stderr.write("DEBUG: Adding file\n")
            r.json.put(json_in, content_type='application/json')
        sys.stderr.write("DEBUG: Saving results\n")
        r.save()

        sys.stderr.write("DEBUG: Appending results to GEMINI database entry\n")
        gdb.results.append(r)
        sys.stderr.write("DEBUG: Saving\n")
        gdb.save()
        self.update_state(state='PROGRESS', meta={'status': 'Complete'})

    sys.stderr.write("DEBUG: Returning Results\n")
    self.update_state(state='SUCCESS', meta={'status': 'Results completed'})
    return (header, js_header, results_file, gdb.file, query, genotype_filter, results_string, json_results_fh)
def extract_shared_mutations(database, reference_file=None):
    """Tabulate variants seen in samples of both phenotype group 1 and 2.

    Variants with ``in_1kg=0`` are read together with their variant samples.
    A variant is kept when at least one of its samples is in
    ``phenotypes[1]`` and at least one is in ``phenotypes[2]``. Its samples
    are grouped by the second "_"-separated field of the sample name; every
    group of exactly two samples contributes one output row labelled with
    the (renamed) group id.

    NOTE(review): ``groupby`` only merges consecutive items, so this relies
    on the variant samples arriving ordered by group id -- confirm.
    NOTE(review): ``alt`` is overwritten by ``check_multiple_alts`` and the
    latest value (possibly None) is reused for later groups and for the
    output row -- confirm this is intended.

    Args:
        database: GEMINI database handed to GeminiQuery and
            ``generate_phenotypes``.
        reference_file: Optional file handed to ``check_multiple_alts``;
            when None the ALT value of the row is taken as-is.

    Returns:
        pandas.DataFrame: Indexed by chrom, start, end, gene, ref and alt,
        restricted to rows that have a group id in "variants_with_pairs".
    """
    phenotypes = generate_phenotypes(database)
    query = GeminiQuery(database)
    query_string = ("SELECT chrom, start, end, gene, ref, alt, type, sub_type,"
                    "impact, codon_change, aa_change, vcf_id, cosmic_ids"
                    " FROM variants WHERE in_1kg=0")
    query.run(query_string, show_variant_samples=True)

    records = []
    for row in query:
        samples = row.variant_samples

        shared = (any(item in phenotypes[1] for item in samples) and
                  any(item in phenotypes[2] for item in samples))
        if not shared:
            continue

        chrom = row["chrom"]
        start = row["start"]
        end = row["end"]
        alt = row["alt"]

        # In the case of intergenic regions, get the name of the
        # closest gene
        gene = row["gene"]
        if gene is None:
            gene = get_nearby_gene(chrom, start, end)

        valid_groups = []
        for gid, members in groupby(samples, lambda x: x.split("_")[1]):

            # Rename according to Pandora guidelines
            # Starts with 0: 1 + number
            # 3 digits: 20 + number
            # 4 digits: 2 + number
            if len(gid) >= 5:
                newgid = gid
            elif len(gid) == 4:
                newgid = "2" + gid
            else:
                newgid = "20" + gid

            members = list(members)
            if len(members) != 2:
                continue

            # Check if we have different ALT bases for the samples
            # in the same pair. If this occurs, it is a false positive
            # and should be discarded. To do so, we need a VCF file to
            # query by base, otherwise we take the value as-is.
            if reference_file is not None:
                alt = check_multiple_alts(chrom, start, end, alt,
                                          members, reference_file)
                if alt is None:
                    # Biallelic site for pair - discard
                    continue

            valid_groups.append(newgid)

        cosmic_data = "Yes" if row["cosmic_ids"] else "No"

        data = [chrom, start, end, gene, row["ref"], alt, row["type"],
                row["sub_type"], row["impact"], row["codon_change"],
                row["aa_change"], row["vcf_id"], cosmic_data]

        if valid_groups:
            records.extend(data + [gid] for gid in valid_groups)
        else:
            records.append(data + [np.nan])

    colnames = ["chrom", "start", "end", "gene", "ref", "alt", "type",
                "sub_type", "impact", "codon_change", "aa_change",
                "dbsnp_id", "in_cosmic", "variants_with_pairs"]

    df = pd.DataFrame.from_records(records, columns=colnames)
    df.set_index(["chrom", "start", "end", "gene", "ref", "alt"], inplace=True)
    # Get rid of loci without pairs
    return df.dropna(subset=["variants_with_pairs"])
Exemple #42
0
 def test_gemini_db(self):
     """Check that self.db can be opened as a gemini database

         The check consists of constructing a GeminiQuery on self.db; the
         constructor is expected to raise sqlite3.DatabaseError when the
         file is not a valid database.

         Returns:
             bool: True when the constructor did not raise
     """
     GeminiQuery(self.db)
     return True
Exemple #43
0
def query():
    """Handle the query page: run a query, save its results, or show the form.

    Reads the GET parameters of the current request and renders the
    ``query.j2`` template for one of three cases:

    * ``submit`` is set: run the query and hand the result rows to the
      template.
    * ``save`` is set: run the query and write the rows, tab-separated, to
      ``/tmp.txt`` under ``_static_folder``; the template receives the file
      name.
    * neither is set: render the blank form.

    In both the ``submit`` and ``save`` cases an empty query string renders
    the blank form without creating or running a GeminiQuery.

    Returns:
        The value returned by ``template`` for the selected case.
    """
    def _get_fields():
        """Return (query, gt_filter, use_header, igv_links) from the request.

        ``query`` and ``gt_filter`` are stripped; an empty ``gt_filter`` is
        returned as None.
        """
        query = request.GET.get('query', '').strip()
        gt_filter = request.GET.get('gt_filter', '').strip()
        use_header = request.GET.get('use_header')
        igv_links = request.GET.get('igv_links')
        return query, gt_filter or None, use_header, igv_links

    # user clicked the "submit" button
    if request.GET.get('submit', '').strip():

        query, gt_filter, use_header, igv_links = _get_fields()

        # Nothing to run: show the blank form before touching the database.
        if not query:
            return template('query.j2', dbfile=database)

        if use_header:
            use_header = True
        if igv_links:
            igv_links = True

        gq = GeminiQuery.GeminiQuery(database)
        gq._set_gemini_browser(True)
        gq.run(query, gt_filter)

        # IGV links were requested but the query text does not mention all
        # of chrom, start and end: flag the error for the template.
        lowered = query.lower()
        igv_links_error = bool(igv_links) and not all(
            column in lowered for column in ('chrom', 'start', 'end'))

        return template('query.j2',
                        dbfile=database,
                        rows=gq,
                        igv_links=igv_links,
                        igv_links_error=igv_links_error,
                        use_header=use_header,
                        gt_filter=gt_filter,
                        query=query)

    # user clicked the "save to file" button
    elif request.GET.get('save', '').strip():

        query, gt_filter, use_header, igv_links = _get_fields()

        # Nothing to run: show the blank form before touching the database.
        if not query:
            return template('query.j2', dbfile=database)

        gq = GeminiQuery.GeminiQuery(database)
        gq.run(query, gt_filter)

        # dump the results to a text file.  this will be
        # stored in /static and a link will be given to
        # the user.
        tmp_file = '/tmp.txt'

        # The context manager closes the file even if iterating the
        # results or writing a row raises.
        with open(_static_folder + tmp_file, 'w') as tmp:
            for i, row in enumerate(gq):
                keys = row.keys()
                if i == 0 and use_header:
                    tmp.write('\t'.join(str(key) for key in keys) + '\n')

                tmp.write('\t'.join(str(row[key]) for key in keys) + '\n')

        return template('query.j2',
                        dbfile=database,
                        tmp_file=tmp_file,
                        igv_links=igv_links,
                        igv_links_error=True,
                        use_header=use_header,
                        gt_filter=gt_filter,
                        query=query)
    # user did nothing.
    else:
        return template('query.j2', dbfile=database)