Exemple #1
0
    def handle(self, *args, **options):
        """
        Attach the CNV calls from a tab-separated file to the variants they overlap.

        Options read: 'project_id', 'cnv_filename' and 'bed_files_directory'.
        The first line of the CNV file names the columns; 'chr', 'start', 'end' and
        'sample' are the ones used here. For every data row, the whole row is stored
        under genotypes.<indiv_id>.extras.cnvs on each variant whose xpos lies between
        the row's start and end (both inclusive) in the project and family collections.

        Raises ValueError if the CNV file does not exist.
        """
        project_id = options['project_id']
        print("Loading data into project: " + project_id)
        project = Project.objects.get(project_id=project_id)

        cnv_filename = options['cnv_filename']
        bed_files_directory = options['bed_files_directory']

        if not os.path.isfile(cnv_filename):
            raise ValueError("CNV file %s doesn't exist" % options['cnv_filename'])

        with open(cnv_filename) as cnv_file:
            column_names = cnv_file.readline().rstrip('\n').split('\t')
            for cnv_line in cnv_file:
                row_dict = dict(zip(column_names, cnv_line.rstrip('\n').split('\t')))

                chrom = "chr" + row_dict['chr']
                start = int(row_dict['start'])
                end = int(row_dict['end'])
                sample_id = row_dict['sample']

                # rows whose individual lookup fails for any reason are skipped with a warning
                try:
                    individual = Individual.objects.get(project=project, indiv_id__istartswith=sample_id)
                except Exception as e:
                    print("WARNING: %s: %s not found in %s" % (e, sample_id, project))
                    continue

                bed_file_path = os.path.join(bed_files_directory, "%s.bed" % sample_id)
                bed_file_missing = not os.path.isfile(bed_file_path)
                if bed_file_missing:
                    print("WARNING: .bed file not found: " + bed_file_path)

                # NOTE(review): the path is only recorded when the .bed file is missing, which
                # looks inverted or mis-indented - confirm the intended behavior
                if bed_file_missing and individual.cnv_bed_file != bed_file_path:
                    print("Setting cnv_bed_file path to %s" % bed_file_path)
                    individual.cnv_bed_file = bed_file_path
                    individual.save()

                project_collection = get_project_datastore(project)._get_project_collection(project_id)
                family_collection = get_mall(project).variant_store._get_family_collection(
                    project_id, individual.family.family_id)

                for collection in (project_collection, family_collection):
                    if not collection:
                        # falsy entries are skipped
                        continue

                    xpos_lower_bound = {'xpos': {'$gte': genomeloc.get_single_location(chrom, start)}}
                    xpos_upper_bound = {'xpos': {'$lte': genomeloc.get_single_location(chrom, end)}}
                    cnv_field = 'genotypes.%s.extras.cnvs' % individual.indiv_id
                    collection.update_many(
                        {'$and': [xpos_lower_bound, xpos_upper_bound]},
                        {'$set': {cnv_field: row_dict}})

        print("Done")
Exemple #2
0
def iterate_coverage_bed_file(bed_file):
    """
    Yield a CoverageInterval for each line of a tab-separated coverage .bed file.

    The first four columns of every line are read: chromosome (the 'chr' prefix is
    added here), start, end, and a coverage tag that is translated through
    constants.COVERAGE_TAG_MAP. The interval stop is derived from end - 1
    (presumably converting an exclusive end to an inclusive one - confirm).
    """
    for bed_line in bed_file:
        columns = bed_line.strip().split('\t')
        chrom = 'chr' + columns[0]
        first_pos = int(columns[1])
        last_pos = int(columns[2]) - 1

        interval_xstart = genomeloc.get_single_location(chrom, first_pos)
        interval_xstop = genomeloc.get_single_location(chrom, last_pos)
        coverage_value = constants.COVERAGE_TAG_MAP[columns[3]]

        yield CoverageInterval(
            xstart=interval_xstart,
            xstop=interval_xstop,
            coverage=coverage_value,
        )
Exemple #3
0
def get_variant_from_vcf_fields(vcf_fields, alt_allele_pos):
    """
    Get a basic variant from vcf_fields, for allele given by alt_allele_pos

    Args:
        vcf_fields: sequence of VCF column values; columns 0-4 are read here
            (chromosome, position, ID, ref allele, comma-separated alt alleles)
        alt_allele_pos: index into the comma-separated alt alleles of column 4

    Returns:
        A Variant built from the minimal representation of (xpos, ref, alt), with the
        extras 'alt_allele_pos' and 'orig_alt_alleles' set, and vcf_id set when the
        ID column is non-empty and not '.'.
        None if genomeloc.valid_pos rejects the chromosome/position.
    """

    # add the 'chr' prefix unless the name already contains 'chr'
    chrom = vcf_fields[0] if 'chr' in vcf_fields[0] else 'chr' + vcf_fields[0]
    pos = int(vcf_fields[1])

    # if we can't get a genomic location, just ignore it and print a message humans will ignore too
    # obviously need a better way to approach this
    if not genomeloc.valid_pos(chrom, pos):
        # print() with a single pre-formatted string behaves the same on Python 2 and 3
        print("ERROR: could not figure out coordinates for %s:%d...maybe a nonstandard chromosome?" % (chrom, pos))
        return None

    ref = vcf_fields[3]
    orig_alt_alleles = vcf_fields[4].split(',')
    alt = orig_alt_alleles[alt_allele_pos]

    xpos = genomeloc.get_single_location(chrom, pos)
    xpos, ref, alt = get_minimal_representation(xpos, ref, alt)

    variant = Variant(xpos, ref, alt)
    variant.set_extra('alt_allele_pos', alt_allele_pos)
    variant.set_extra('orig_alt_alleles', orig_alt_alleles)

    if vcf_fields[2] and vcf_fields[2] != '.':
        variant.vcf_id = vcf_fields[2]

    return variant
def iterate_coverage_bed_file(bed_file):
    """
    Yield one coverage dict for each line of a tab-separated coverage .bed file.

    The first four columns of every line are read: chromosome (the 'chr' prefix is
    added here), start, end, and a coverage tag that is translated through
    constants.COVERAGE_TAG_MAP. Each yielded dict has the keys 'xstart', 'xend',
    'xpos' (a nested dict repeating xstart/xend) and 'coverage'.
    """
    for bed_line in bed_file:
        columns = bed_line.strip().split('\t')
        chrom = 'chr' + columns[0]
        start_pos = int(columns[1])
        end_pos = int(columns[2])

        interval = {
            'xstart': genomeloc.get_single_location(chrom, start_pos),
            'xend': genomeloc.get_single_location(chrom, end_pos),
        }
        coverage_value = constants.COVERAGE_TAG_MAP[columns[3]]

        # 'xpos' repeats the interval as its own sub-document, for geospatial indexing
        yield dict(interval, xpos=dict(interval), coverage=coverage_value)
Exemple #5
0
 def get_all_exons(self):
     """
     Return every ensembl exon as a dict with keys exon_id, xstart and xstop
     (order not guaranteed).
     Read from the database behind self.db_conn, not from the REST API.
     Exons whose seq region name doesn't map to a chromosome are left out.
     """
     query = ("select exon.stable_id, seq_region.name, exon.seq_region_start, exon.seq_region_end from exon "
              "join seq_region on exon.seq_region_id=seq_region.seq_region_id")
     cursor = self.db_conn.cursor()
     cursor.execute(query)

     all_exons = []
     for stable_id, region_name, region_start, region_end in cursor:
         chrom = ensembl_parsing_utils.get_chr_from_seq_region_name(region_name)
         if chrom is not None:
             exon = {'exon_id': stable_id}
             exon['xstart'] = genomeloc.get_single_location(chrom, region_start)
             exon['xstop'] = genomeloc.get_single_location(chrom, region_end)
             all_exons.append(exon)
     return all_exons
def load_coverage_file(path):
    """
    Load the given gzipped, tab-separated ExAC coverage file into COVERAGE_DB.exac_v3_coverage.

    The first line names the columns (a leading "#chrom" is read as "chrom"). One
    document is inserted per data line: columns from the third onwards are stored as
    floats, 'pos' as an int, and an 'xpos' computed from 'chrom' and 'pos' is added.
    """
    print("Loading file: " + path)
    with gzip.open(path) as coverage_file:
        header_line = next(coverage_file)
        column_names = header_line.replace("#chrom", "chrom").rstrip('\n').split('\t')

        for data_line in coverage_file:
            cells = data_line.rstrip('\n').split('\t')
            # the first two cells stay as text for now; the remaining stats become floats
            typed_cells = cells[:2] + [float(cell) for cell in cells[2:]]

            record = dict(zip(column_names, typed_cells))
            chrom = 'chr' + record['chrom']
            record['pos'] = int(record['pos'])
            record['xpos'] = genomeloc.get_single_location(chrom, record['pos'])

            COVERAGE_DB.exac_v3_coverage.insert(record)
def get_exac_af(chrom, pos, ref, alt):
    """
    Look up the ExAC allele frequencies of a variant.

    Fetches the exac_vcf records in a window of len(ref) + len(alt) positions on either
    side of pos, reduces each of their alt alleles with get_minimal_representation, and
    compares the result with the given variant. If several ExAC alleles match, an error
    is printed and the last match is used.

    Args:
        chrom: chromosome name; every "chr" substring is removed before querying exac_vcf
        pos: position of the variant on chrom
        ref: reference allele
        alt: alternate allele

    Returns:
        (global_af, pop_max_af, pop_max_population), or (None, None, None) if no ExAC
        allele matches. global_af is AC_Adj / AN_Adj (0 when AN_Adj is 0). pop_max_af is
        the largest AC / AN over the populations with AN > 0; if there is none it stays
        -1 and pop_max_population stays None.
    """
    populations = ['AMR', 'EAS', 'FIN', 'NFE', 'SAS', 'AFR']

    chrom_without_chr = chrom.replace("chr", "")
    xpos = genomeloc.get_single_location(chrom, pos)
    variant_length = len(ref) + len(alt)

    # Clamp the window start: for a variant close to the start of a chromosome,
    # pos - variant_length is negative, which is not a valid coordinate to fetch.
    fetch_start = max(0, pos - variant_length)
    fetch_end = pos + variant_length

    # check whether the alleles match
    matching_exac_variant = None
    matching_exac_variant_i = None
    for record in exac_vcf.fetch(chrom_without_chr, fetch_start, fetch_end):
        exac_xpos = genomeloc.get_xpos(record.CHROM, record.POS)
        for exac_alt_i, exac_alt in enumerate(record.ALT):
            exac_variant_xpos, exac_ref, minimal_exac_alt = get_minimal_representation(
                exac_xpos, str(record.REF), str(exac_alt))
            if exac_variant_xpos == xpos and exac_ref == ref and minimal_exac_alt == alt:
                if matching_exac_variant is not None:
                    print("ERROR: multiple exac variants match the variant: %s %s %s %s" % (chrom, pos, ref, alt))
                matching_exac_variant = record
                matching_exac_variant_i = exac_alt_i

    if matching_exac_variant is None:
        return None, None, None

    info = matching_exac_variant.INFO

    pop_max_af = -1
    pop_max_population = None
    for p in populations:
        # populations without any called alleles can't have a frequency
        if info['AN_'+p] > 0:
            pop_af = info['AC_'+p][matching_exac_variant_i]/float(info['AN_'+p])
            if pop_af > pop_max_af:
                pop_max_af = pop_af
                pop_max_population = p

    if info['AN_Adj'] != 0:
        global_af = float(info['AC_Adj'][matching_exac_variant_i])/float(info['AN_Adj'])
    else:
        # no adjusted allele number, so there must be no adjusted allele count either
        assert float(info['AC_Adj'][matching_exac_variant_i]) == 0
        global_af = 0

    return global_af, pop_max_af, pop_max_population
    def handle_individual(self, project, individual):
        """
        Write the tab-separated report files for one individual into the working directory.

        report_for_<project_id>_<indiv_id>.flagged.txt is written only when the
        individual's family has variants tagged "REPORT" or carrying a note that starts
        with "REPORT". It has one row per such variant, and the comments column holds all
        notes left on that variant, each as "date|email|note" and joined by "||".

        report_for_<project_id>_<indiv_id>.genes.txt is always written. It has one row
        per variant that the variant store returns for each region in the module-level
        gene_loc mapping, prefixed with the gene coordinates and followed by a dump of
        the variant's genotypes.

        Variants for which self.get_output_row returns None are left out of both files.

        Raises:
            ValueError: if a flagged variant can't be found in the variant store.
        """
        project_id = project.project_id
        individual_id = individual.indiv_id

        print("Processing individual %s" % individual_id)
        # get variants that have been tagged or that have a note that starts with "REPORT"
        # Maps (xpos, ref, alt) -> combined notes text. A plain dict is used on purpose:
        # with a defaultdict, merely looking up a variant below would insert it.
        variants_in_report_and_notes = {}
        for vt in VariantTag.objects.filter(project_tag__project=project,
                                            project_tag__tag="REPORT",
                                            family=individual.family):
            variants_in_report_and_notes[(vt.xpos, vt.ref, vt.alt)] = ""

        for vn in VariantNote.objects.filter(project=project, family=individual.family):
            if vn.note and vn.note.strip().startswith("REPORT"):
                variants_in_report_and_notes[(vn.xpos, vn.ref, vn.alt)] = ""

        header = ["gene_name", "genotype", "variant", "functional_class",
                  "hgvs_c", "hgvs_p", "rsid",
                  "exac_global_af", "exac_pop_max_af", "exac_pop_max_population",
                  "clinvar_clinsig", "clinvar_clnrevstat", "number_of_stars",
                  "clinvar_url", "comments"]

        if variants_in_report_and_notes:
            with open("report_for_%s_%s.flagged.txt" % (project_id, individual_id), "w") as out:
                out.write("\t".join(header) + "\n")

                # retrieve text of all notes that were left for any of these variants
                for vn in VariantNote.objects.filter(project=project, family=individual.family):
                    variant_key = (vn.xpos, vn.ref, vn.alt)
                    if vn.note and variant_key in variants_in_report_and_notes:
                        other_notes = variants_in_report_and_notes[variant_key]
                        if other_notes:
                            other_notes += "||"
                        variants_in_report_and_notes[variant_key] = other_notes + "%s|%s|%s" % (
                            vn.date_saved, vn.user.email, vn.note.strip())

                for (xpos, ref, alt), notes in variants_in_report_and_notes.items():
                    v = get_mall(project_id).variant_store.get_single_variant(
                        project_id, individual.family.family_id, xpos, ref, alt)
                    if v is None:
                        raise ValueError("Couldn't find variant in variant store for: %s, %s, %s %s %s" % (project_id, individual.family.family_id, xpos, ref, alt))

                    row = self.get_output_row(v, xpos, ref, alt, individual_id, individual.family, all_fields=True, comments=notes)
                    if row is None:
                        continue
                    out.write("\t".join(row) + "\n")

        with open("report_for_%s_%s.genes.txt" % (project_id, individual_id), "w") as out:
            header = ["gene_chrom", "gene_start", "gene_end"] + header + ["json_dump"]
            out.write("\t".join(header) + "\n")
            for gene_id, (chrom, start, end) in gene_loc.items():
                xpos_start = genomeloc.get_single_location("chr" + chrom, start)
                xpos_end = genomeloc.get_single_location("chr" + chrom, end)
                for v in get_mall(project_id).variant_store.get_variants_in_range(project_id, individual.family.family_id, xpos_start, xpos_end):

                    json_dump = str(v.genotypes)
                    # variants that were not flagged simply have no notes
                    notes = variants_in_report_and_notes.get((v.xpos, v.ref, v.alt), "")
                    row = self.get_output_row(v, v.xpos, v.ref, v.alt, individual_id, individual.family, comments=notes, gene_id=gene_id)
                    if row is None:
                        continue
                    row = [str(value) for value in [chrom, start, end] + row + [json_dump]]

                    out.write("\t".join(row) + "\n")
Exemple #9
0
    def get_gene_structure(self, gene_id):
        """
        Query ensembl API for the transcript/exon structure of a gene
        This is the foundation of the elements in db.genes

        Returns:
            dict with the keys chr, start, stop, xstart, xstop, gene_id, symbol,
            description, biotype, transcripts, exons and cds.
            transcripts: one dict per transcript whose Parent is the gene, with its
                exon IDs ordered by start.
            exons: one dict per distinct exon ID belonging to one of those transcripts.
            cds: one dict per distinct (start, end) coding interval of those transcripts,
                sorted by (start, stop) and numbered "<gene_id>-<n>" starting at 1.

        Raises:
            Exception: if no or more than one ensembl gene has this ID, or if the gene
                is on a nonstandard chromosome.
        """

        def fetch_features(feature_id, feature_type):
            # Every lookup hits the same endpoint; only the ID and the feature type vary
            url = self._get_rest_url() + '/feature/id/%s' % feature_id
            params = {'content-type': 'application/json', 'feature': feature_type}
            return requests.get(url, params=params).json()

        gene = {}

        # gene basics
        gene_list_json = [item for item in fetch_features(gene_id, 'gene') if item['ID'] == gene_id]
        if len(gene_list_json) == 0:
            raise Exception("No genes with ID %s" % gene_id)
        if len(gene_list_json) > 1:
            raise Exception(">1 ensembl genes with ID %s" % gene_id)
        gene_json = gene_list_json[0]

        seq_region_name = gene_json['seq_region_name']
        chrom = ensembl_parsing_utils.get_chr_from_seq_region_name(seq_region_name)
        if chrom is None:
            # report the seq region name: chrom itself is always None on this path
            raise Exception("Gene %s is on a nonstandard chromosome: %s" % (gene_id, seq_region_name))

        gene['chr'] = chrom
        gene['start'] = gene_json['start']
        gene['stop'] = gene_json['end']
        gene['xstart'] = genomeloc.get_single_location(chrom, gene['start'])
        gene['xstop'] = genomeloc.get_single_location(chrom, gene['stop'])

        gene['gene_id'] = gene_json['ID']
        gene['symbol'] = gene_json['external_name']
        gene['description'] = gene_json['description']
        gene['biotype'] = gene_json['biotype']

        # transcripts
        transcript_json = [t for t in fetch_features(gene_id, 'transcript') if t['Parent'] == gene_id]

        gene['transcripts'] = []
        for t in transcript_json:
            transcript_id = t['ID']
            transcript = dict(
                transcript_id=transcript_id,
                biotype=t['biotype'],
                start=t['start'],
                stop=t['end']
            )
            transcript['xstart'] = genomeloc.get_single_location(chrom, transcript['start'])
            transcript['xstop'] = genomeloc.get_single_location(chrom, transcript['stop'])

            # exons_for_transcript
            transcript_exon_json = fetch_features(transcript_id, 'exon')
            transcript['exons'] = [
                e['ID'] for e in sorted(transcript_exon_json, key=lambda x: x['start']) if e['Parent'] == transcript_id
            ]

            gene['transcripts'].append(transcript)

        # exons
        exon_json = fetch_features(gene_id, 'exon')

        transcript_ids = {t['transcript_id'] for t in gene['transcripts']}
        exon_ids_seen = set()
        gene['exons'] = []
        for e in exon_json:
            exon_id = e['ID']
            # skip exons that aren't actually in one of this gene's transcripts
            if e['Parent'] not in transcript_ids:
                continue
            # the same exon is listed once per transcript that contains it; keep the first
            if exon_id in exon_ids_seen:
                continue
            exon = {
                'exon_id': exon_id,
                'start': e['start'],
                'stop': e['end'],
            }
            exon['xstart'] = genomeloc.get_single_location(chrom, exon['start'])
            exon['xstop'] = genomeloc.get_single_location(chrom, exon['stop'])
            gene['exons'].append(exon)
            exon_ids_seen.add(exon_id)

        # cds
        cds_json = fetch_features(gene_id, 'cds')

        cds_map = {}  # map from (start, stop) -> {start, stop, transcripts}
        for c in cds_json:
            # skip cds that aren't actually in one of this gene's transcripts
            if c['Parent'] not in transcript_ids:
                continue
            cds_t = (c['start'], c['end'])
            if cds_t not in cds_map:
                cds_map[cds_t] = {
                    'start': c['start'],
                    'stop': c['end'],
                    'xstart': genomeloc.get_single_location(chrom, c['start']),
                    'xstop': genomeloc.get_single_location(chrom, c['end']),
                    'transcripts': [],
                }
            cds_map[cds_t]['transcripts'].append(c['Parent'])
        gene['cds'] = sorted(cds_map.values(), key=lambda x: (x['start'], x['stop']))
        for i, cds in enumerate(gene['cds']):
            cds['cds_id'] = '%s-%i' % (gene['gene_id'], i+1)
        return gene
Exemple #10
0
    def handle_individual(self, project, individual):
        """
        Write the tab-separated report files for one individual into the working directory.

        Nothing is written (a message is printed instead) when the individual's family has
        no variant tagged "REPORT" and no variant with a note that starts with "REPORT".

        report_for_<project_id>_<indiv_id>.flagged.txt has one row per such variant; the
        comments column holds all notes left on that variant, each as "date|email|note"
        and joined by "||".

        report_for_<project_id>_<indiv_id>.genes.txt has one row per alt allele of every
        variant that the variant store returns for each region in the module-level
        gene_loc mapping, prefixed with the gene coordinates and followed by a dump of
        the variant's genotypes.

        Rows for which self.get_output_row returns None are left out of both files.

        Raises:
            ValueError: if a flagged variant can't be found in the variant store.
        """
        project_id = project.project_id
        individual_id = individual.indiv_id

        # get variants that have been tagged or that have a note that starts with "REPORT"
        # Maps (xpos, ref, alt) -> combined notes text. A plain dict is used on purpose:
        # with a defaultdict, merely looking up a variant below would insert it.
        variants_in_report_and_notes = {}
        for vt in VariantTag.objects.filter(project_tag__project=project,
                                            project_tag__tag="REPORT",
                                            family=individual.family):
            variants_in_report_and_notes[(vt.xpos, vt.ref, vt.alt)] = ""

        for vn in VariantNote.objects.filter(project=project,
                                             family=individual.family):
            if vn.note and vn.note.strip().startswith("REPORT"):
                variants_in_report_and_notes[(vn.xpos, vn.ref, vn.alt)] = ""

        if not variants_in_report_and_notes:
            print(
                "skipping individual %s since no variants are tagged in family %s..."
                % (individual_id, individual.family.family_id))
            return

        header = [
            "gene_name", "genotype", "variant", "hgvs_c", "hgvs_p", "rsid",
            "exac_global_af", "exac_pop_max_af", "exac_pop_max_population",
            "clinvar_clinsig", "clinvar_clnrevstat", "number_of_stars",
            "clinvar_url", "comments"
        ]
        with open("report_for_%s_%s.flagged.txt" % (project_id, individual_id),
                  "w") as out:
            out.write("\t".join(header) + "\n")

            # retrieve text of all notes that were left for any of these variants
            for vn in VariantNote.objects.filter(project=project,
                                                 family=individual.family):
                variant_key = (vn.xpos, vn.ref, vn.alt)
                if vn.note and variant_key in variants_in_report_and_notes:
                    other_notes = variants_in_report_and_notes[variant_key]
                    if other_notes:
                        other_notes += "||"
                    variants_in_report_and_notes[variant_key] = other_notes + "%s|%s|%s" % (
                        vn.date_saved, vn.user.email, vn.note.strip())

            for (xpos, ref, alt), notes in variants_in_report_and_notes.items():
                v = get_mall(project_id).variant_store.get_single_variant(
                    project_id, individual.family.family_id, xpos, ref, alt)
                if v is None:
                    raise ValueError(
                        "Couldn't find variant in variant store for: %s, %s, %s %s %s"
                        % (project_id, individual.family.family_id, xpos, ref,
                           alt))

                row = self.get_output_row(v,
                                          v.ref,
                                          v.alt,
                                          individual.indiv_id,
                                          individual.family,
                                          all_fields=True,
                                          comments=notes)
                if row is None:
                    continue
                out.write("\t".join(row) + "\n")

        with open("report_for_%s_%s.genes.txt" % (project_id, individual_id),
                  "w") as out:
            header = ["gene_chrom", "gene_start", "gene_end"
                      ] + header + ["json_dump"]
            out.write("\t".join(header) + "\n")
            for gene_name, (chrom, start, end) in gene_loc.items():
                xpos_start = genomeloc.get_single_location(
                    "chr" + chrom, start)
                xpos_end = genomeloc.get_single_location("chr" + chrom, end)
                for v in get_mall(
                        project_id).variant_store.get_variants_in_range(
                            project_id, individual.family.family_id,
                            xpos_start, xpos_end):
                    json_dump = str(v.genotypes)
                    # one output row per alt allele of the variant
                    for alt in v.alt.split(","):
                        # alleles that were not flagged simply have no notes
                        notes = variants_in_report_and_notes.get(
                            (v.xpos, v.ref, alt), "")
                        row = self.get_output_row(v,
                                                  v.ref,
                                                  alt,
                                                  individual.indiv_id,
                                                  individual.family,
                                                  comments=notes)
                        if row is None:
                            continue
                        row = [str(value) for value in [chrom, start, end] + row + [json_dump]]
                        out.write("\t".join(row) + "\n")
def write_snp_fileset(family, output_dir_path):
    """
    Write a set of files for a family that can be passed to linkage engine
    Creates the following files in output_dir_path:
        [family_id].fam
        markers.txt (a copy of settings.COMMON_SNP_FILE)
        disease_model.txt
        variants.txt

    Args:
        family: provides family_id, project.project_id and get_individuals()
        output_dir_path: directory to write the files into
    """

    individuals = family.get_individuals()

    # fam file
    fam_file_path = os.path.join(output_dir_path, family.family_id + '.fam')
    with open(fam_file_path, 'w') as fam_file:
        for indiv in individuals:
            fields = [
                family.family_id,
                indiv.indiv_id,
                indiv.paternal_id if indiv.paternal_id else '.',
                indiv.maternal_id if indiv.maternal_id else '.',
                # sex column: 2 = female, 1 = male, 0 = anything else
                # (assumes 'M' is the male counterpart of the 'F' gender code)
                '2' if indiv.gender == 'F' else ('1' if indiv.gender == 'M' else '0'),
                # phenotype column: 2 = affected ('A'), 1 = unaffected ('N'), 0 = anything else
                '2' if indiv.affected == 'A' else ('1' if indiv.affected == 'N' else '0'),
            ]
            fam_file.write('\t'.join(fields)+'\n')

    # markers.txt
    markers_path = os.path.join(output_dir_path, 'markers.txt')
    shutil.copy(settings.COMMON_SNP_FILE, markers_path)

    # disease model
    disease_model_path = os.path.join(output_dir_path, 'disease_model.txt')
    with open(disease_model_path, 'w') as disease_model_file:
        disease_model_file.writelines([
            "DD\t.001\n",
            "Dd\t.001\n",
            "dd\t.999\n",
        ])

    # variants.txt: one row per common SNP, one genotype column per individual
    variants_file_path = os.path.join(output_dir_path, 'variants.txt')
    with open(variants_file_path, 'w') as variants_file, open(settings.COMMON_SNP_FILE) as snp_file:
        variants_file.write('#CHR\tPOS\tREF\tALT')
        for indiv in individuals:
            variants_file.write('\t'+indiv.indiv_id)
        variants_file.write('\n')

        for _line in snp_file:
            snp_fields = _line.strip('\n').split('\t')
            xpos = genomeloc.get_single_location('chr'+snp_fields[0], int(snp_fields[1]))
            ref = snp_fields[2]
            alt = snp_fields[3]
            variant = get_mall().variant_store.get_single_variant(family.project.project_id, family.family_id, xpos, ref, alt)

            fields = snp_fields[:4]
            for indiv in individuals:
                if variant:
                    genotype = variant.get_genotype(indiv.indiv_id)
                    fields.append(str(genotype.num_alt) if genotype.num_alt is not None else '.')
                else:
                    # SNPs missing from the variant store are written as 0 alt alleles
                    fields.append('0')
            variants_file.write('\t'.join(fields)+'\n')
Exemple #12
0
    def handle(self, *args, **options):
        """
        Load CNV calls from a tab-separated file onto the variants of a project.

        Uses the options 'project_id', 'cnv_filename' and 'bed_files_directory'.
        The CNV file starts with a header line naming its columns, of which 'chr',
        'start', 'end' and 'sample' are read here. Each following row is written, as a
        whole, to genotypes.<indiv_id>.extras.cnvs of every variant in the project and
        family collections whose xpos is within the row's start..end range (inclusive).

        Raises ValueError if the CNV file does not exist.
        """
        project_id = options['project_id']
        print("Loading data into project: " + project_id)
        project = Project.objects.get(project_id=project_id)

        cnv_filename = options['cnv_filename']
        bed_files_directory = options['bed_files_directory']

        cnv_file_exists = os.path.isfile(cnv_filename)
        if not cnv_file_exists:
            raise ValueError("CNV file %s doesn't exist" % options['cnv_filename'])

        with open(cnv_filename) as cnv_table:
            header_line = cnv_table.readline()
            columns = header_line.rstrip('\n').split('\t')

            for record_line in cnv_table:
                values = record_line.rstrip('\n').split('\t')
                row_dict = dict(zip(columns, values))

                chrom = "chr" + row_dict['chr']
                start = int(row_dict['start'])
                end = int(row_dict['end'])
                sample_id = row_dict['sample']

                # a row is skipped, with a warning, if its individual can't be looked up
                try:
                    indiv = Individual.objects.get(
                        project=project, indiv_id__istartswith=sample_id)
                except Exception as e:
                    print("WARNING: %s: %s not found in %s" % (e, sample_id, project))
                    continue

                bed_file_name = "%s.bed" % sample_id
                bed_file_path = os.path.join(bed_files_directory, bed_file_name)
                if not os.path.isfile(bed_file_path):
                    print("WARNING: .bed file not found: " + bed_file_path)

                    # NOTE(review): the path is stored only on this "file not found" branch,
                    # which looks inverted or mis-indented - confirm the intended behavior
                    path_changed = indiv.cnv_bed_file != bed_file_path
                    if path_changed:
                        print("Setting cnv_bed_file path to %s" % bed_file_path)
                        indiv.cnv_bed_file = bed_file_path
                        indiv.save()

                project_datastore = get_project_datastore(project)
                project_collection = project_datastore._get_project_collection(project_id)
                variant_store = get_mall(project).variant_store
                family_collection = variant_store._get_family_collection(
                    project_id, indiv.family.family_id)

                # only truthy collections are updated
                target_collections = [
                    c for c in (project_collection, family_collection) if c
                ]
                for collection in target_collections:
                    xpos_from = genomeloc.get_single_location(chrom, start)
                    xpos_to = genomeloc.get_single_location(chrom, end)
                    xpos_in_cnv = {
                        '$and': [
                            {'xpos': {'$gte': xpos_from}},
                            {'xpos': {'$lte': xpos_to}},
                        ]
                    }
                    store_cnv_row = {
                        '$set': {'genotypes.%s.extras.cnvs' % indiv.indiv_id: row_dict}
                    }
                    collection.update_many(xpos_in_cnv, store_cnv_row)

        print("Done")
Exemple #13
0
    def handle_individual(self, project, individual):
        project_id = project.project_id
        individual_id = individual.indiv_id

        print("Processing individual %s" % individual_id)
        # get variants that have been tagged or that have a note that starts with "REPORT"
        variants_in_report_and_notes = defaultdict(str)
        for vt in VariantTag.objects.filter(project_tag__project=project,
                                            project_tag__tag="REPORT",
                                            family=individual.family):

            variants_in_report_and_notes[(vt.xpos, vt.ref, vt.alt)] = ""

        for vn in VariantNote.objects.filter(project=project,
                                             family=individual.family):
            if vn.note and vn.note.strip().startswith("REPORT"):
                variants_in_report_and_notes[(vn.xpos, vn.ref, vn.alt)] = ""

        header = [
            "gene_name", "genotype", "variant", "functional_class", "hgvs_c",
            "hgvs_p", "rsid", "exac_global_af", "exac_pop_max_af",
            "exac_pop_max_population", "clinvar_clinsig", "clinvar_clnrevstat",
            "number_of_stars", "clinvar_url", "comments"
        ]

        if len(variants_in_report_and_notes) != 0:
            with open(
                    "report_for_%s_%s.flagged.txt" %
                (project_id, individual_id), "w") as out:
                #print("\t".join(header))
                out.write("\t".join(header) + "\n")

                # retrieve text of all notes that were left for any of these variants
                for vn in VariantNote.objects.filter(project=project,
                                                     family=individual.family):
                    if vn.note and (vn.xpos, vn.ref,
                                    vn.alt) in variants_in_report_and_notes:
                        other_notes = variants_in_report_and_notes[(vn.xpos,
                                                                    vn.ref,
                                                                    vn.alt)]
                        if len(other_notes) > 0:
                            other_notes += "||"
                        variants_in_report_and_notes[(
                            vn.xpos, vn.ref,
                            vn.alt)] = other_notes + "%s|%s|%s" % (
                                vn.date_saved, vn.user.email, vn.note.strip())

                for (xpos, ref,
                     alt), notes in variants_in_report_and_notes.items():

                    #chrom, pos = genomeloc.get_chr_pos(xpos)

                    v = get_mall(project).variant_store.get_single_variant(
                        project_id, individual.family.family_id, xpos, ref,
                        alt)
                    if v is None:
                        print(
                            "Rerieving variant from previous callset version (MYOSEQ_v20_previous1)"
                        )
                        v = get_mall(project).variant_store.get_single_variant(
                            'MYOSEQ_v20_previous1',
                            individual.family.family_id, xpos, ref, alt)
                    if v is None:
                        raise ValueError(
                            "Couldn't find variant in variant store for: %s, %s, %s %s %s"
                            % (project_id, individual.family.family_id, xpos,
                               ref, alt))

                    row = self.get_output_row(v,
                                              xpos,
                                              ref,
                                              alt,
                                              individual_id,
                                              individual.family,
                                              all_fields=True,
                                              comments=notes)
                    if row is None:
                        continue

                    out.write("\t".join(row) + "\n")

        with open("report_for_%s_%s.genes.txt" % (project_id, individual_id),
                  "w") as out:
            header = ["gene_chrom", "gene_start", "gene_end"
                      ] + header + ["json_dump"]

            out.write("\t".join(header) + "\n")
            for gene_id, (chrom, start, end) in gene_loc.items():
                xpos_start = genomeloc.get_single_location(
                    "chr" + chrom, start)
                xpos_end = genomeloc.get_single_location("chr" + chrom, end)
                variant_filter = VariantFilter(locations=[(xpos_start,
                                                           xpos_end)])
                for v in get_mall(project).variant_store.get_variants(
                        project_id,
                        individual.family.family_id,
                        variant_filter=variant_filter):

                    json_dump = str(v.genotypes)
                    try:
                        notes = variants_in_report_and_notes[(v.xpos, v.ref,
                                                              v.alt)]
                    except KeyError:
                        notes = ""
                    row = self.get_output_row(v,
                                              v.xpos,
                                              v.ref,
                                              v.alt,
                                              individual_id,
                                              individual.family,
                                              comments=notes,
                                              gene_id=gene_id)
                    if row is None:
                        continue
                    row = map(str,
                              ["chr" + chrom.replace("chr", ""), start, end] +
                              row + [json_dump])

                    out.write("\t".join(row) + "\n")
Exemple #14
0
    def get_gene_structure(self, gene_id):
        """
        Query the ensembl REST API for the transcript/exon structure of a gene.
        This is the foundation of the elements in db.genes

        Args:
            gene_id: ID of the gene to look up; only features whose 'ID'
                equals this value exactly are treated as the gene.

        Returns:
            dict with the keys 'chr', 'start', 'stop', 'xstart', 'xstop',
            'gene_id', 'symbol', 'description', 'biotype', 'transcripts',
            'exons' and 'cds'. The 'x*' values are whatever
            genomeloc.get_single_location returns for (chromosome, position).
            'transcripts', 'exons' and 'cds' are lists of dicts.

        Raises:
            Exception: if zero or more than one gene matches gene_id, or if
                the gene's seq_region_name does not map to a chromosome.
            requests.exceptions.HTTPError: if the REST service answers with
                an error status.
        """

        def fetch_features(feature_id, feature_type):
            # All lookups hit the same endpoint and differ only in the id and
            # the requested feature type.
            response = requests.get(
                self._get_rest_url() + '/feature/id/%s' % feature_id,
                params={
                    'content-type': 'application/json',
                    'feature': feature_type,
                })
            # Fail here with a clear HTTP error instead of later, when an
            # error payload would be indexed as if it were a feature list.
            response.raise_for_status()
            return response.json()

        gene = {}

        # gene basics
        gene_matches = [
            item for item in fetch_features(gene_id, 'gene')
            if item['ID'] == gene_id
        ]
        if not gene_matches:
            raise Exception("No genes with ID %s" % gene_id)
        if len(gene_matches) > 1:
            raise Exception(">1 ensembl genes with ID %s" % gene_id)
        gene_json = gene_matches[0]

        seq_region_name = gene_json['seq_region_name']
        chrom = ensembl_parsing_utils.get_chr_from_seq_region_name(
            seq_region_name)
        if chrom is None:
            # Report the raw region name: the parsed value is None on this
            # branch and would tell the reader nothing.
            raise Exception("Gene %s is on a nonstandard chromosome: %s" %
                            (gene_id, seq_region_name))

        gene['chr'] = chrom
        gene['start'] = gene_json['start']
        gene['stop'] = gene_json['end']
        gene['xstart'] = genomeloc.get_single_location(chrom, gene['start'])
        gene['xstop'] = genomeloc.get_single_location(chrom, gene['stop'])

        gene['gene_id'] = gene_json['ID']
        gene['symbol'] = gene_json['external_name']
        gene['description'] = gene_json['description']
        gene['biotype'] = gene_json['biotype']

        # transcripts: keep only those whose parent is this gene
        transcript_json = [
            t for t in fetch_features(gene_id, 'transcript')
            if t['Parent'] == gene_id
        ]

        gene['transcripts'] = []
        for t in transcript_json:
            transcript_id = t['ID']
            transcript = dict(transcript_id=transcript_id,
                              biotype=t['biotype'],
                              start=t['start'],
                              stop=t['end'])
            transcript['xstart'] = genomeloc.get_single_location(
                chrom, transcript['start'])
            transcript['xstop'] = genomeloc.get_single_location(
                chrom, transcript['stop'])

            # exons_for_transcript: IDs of this transcript's exons, ordered
            # by start coordinate
            transcript_exon_json = fetch_features(transcript_id, 'exon')
            transcript['exons'] = [
                e['ID']
                for e in sorted(transcript_exon_json, key=lambda x: x['start'])
                if e['Parent'] == transcript_id
            ]

            gene['transcripts'].append(transcript)

        # exons
        exon_json = fetch_features(gene_id, 'exon')

        transcript_ids = {t['transcript_id'] for t in gene['transcripts']}
        exon_ids_seen = set()
        gene['exons'] = []
        for e in exon_json:
            exon_id = e['ID']
            # skip exons that aren't actually in one of this gene's transcripts
            if e['Parent'] not in transcript_ids:
                continue
            # the same exon ID can be listed more than once; record it once
            if exon_id in exon_ids_seen:
                continue
            exon = {
                'exon_id': exon_id,
                'start': e['start'],
                'stop': e['end'],
            }
            exon['xstart'] = genomeloc.get_single_location(chrom, exon['start'])
            exon['xstop'] = genomeloc.get_single_location(chrom, exon['stop'])
            gene['exons'].append(exon)
            exon_ids_seen.add(exon_id)

        # cds
        cds_json = fetch_features(gene_id, 'cds')

        cds_map = {}  # map from (start, stop) -> {start, stop, transcripts}
        for c in cds_json:
            # skip cds that aren't actually in one of this gene's transcripts
            if c['Parent'] not in transcript_ids:
                continue
            cds_t = (c['start'], c['end'])
            if cds_t not in cds_map:
                cds_map[cds_t] = {
                    'start': c['start'],
                    'stop': c['end'],
                    'xstart': genomeloc.get_single_location(chrom, c['start']),
                    'xstop': genomeloc.get_single_location(chrom, c['end']),
                    'transcripts': [],
                }
            cds_map[cds_t]['transcripts'].append(c['Parent'])
        gene['cds'] = sorted(cds_map.values(),
                             key=lambda x: (x['start'], x['stop']))
        # cds IDs are 1-based and follow the (start, stop) sort order
        for i, cds in enumerate(gene['cds']):
            cds['cds_id'] = '%s-%i' % (gene['gene_id'], i + 1)
        return gene