def poly_aaa_vs_spidex(variants_by_gene):
    """Analyse poly(A) track changing mutations against SPIDEX data.

    Builds the SPIDEX report for all poly(A) variants, divides the report
    into groups, plots the groups and finally runs the KS tests on them.
    """
    raw_report = spidex_from_list(all_poly_a_variants(variants_by_gene))
    print('Unique points', len(raw_report))

    print('Plotting')
    groups = divide_variants_by_poly_aaa(raw_report)
    plot_aaa_vs_spidex(groups)

    print('ks test')
    # The groups are already computed, so tell the test not to redo the split.
    spidex_aaa_ks_test(groups, already_divided=True)
def get_data_from_ensembl_api(variants):
    """Collect eQTL p-value data for poly(A) variants from the Ensembl REST API.

    For every variant yielded by ``all_poly_a_variants`` (called with
    ``merge_variants_with_multiple_id=False``) the
    ``/eqtl/variant_name/homo_sapiens/<snp_id>`` endpoint is queried. Every
    datum of a successful response is combined with every poly_aaa entry of
    every affected transcript of the variant into one report tuple.

    The function is best-effort: any exception raised while fetching or
    processing a variant is printed and processing moves on to the next
    variant.

    Returns:
        A list of tuples ``(snp_id, tissue, value, gene, aaa increased,
        aaa decreased, aaa change, chr_name, chr_start, ref, alt, strand,
        transcript ensembl_id, cds_start, cds_end)``.
    """
    import requests

    server = 'http://rest.ensembl.org'
    # server = 'http://grch37.rest.ensembl.org/' GRCH 37 has no eqtls implemented
    headers = {'content-type': 'application/json'}

    api_report = []

    for variant in all_poly_a_variants(variants, merge_variants_with_multiple_id=False):
        ext = '/eqtl/variant_name/homo_sapiens/%s?statistic=p-value;content-type=application/json' % variant.snp_id

        # NOTE(review): the try block is assumed to be per-variant, so that
        # a single failing request does not abort the whole collection.
        try:
            r = requests.get(server + ext, headers=headers)
            # Raises requests.HTTPError for 4xx/5xx answers; nothing after a
            # failed status check is reachable, so no explicit exit is needed.
            r.raise_for_status()

            decoded = r.json()

            if 'error' not in decoded:
                print('Got data for %s' % variant.snp_id)
                for datum in decoded:
                    for transcript in variant.affected_transcripts:
                        for alt, aaa_data in transcript.poly_aaa.items():
                            api_report.append((
                                variant.snp_id,
                                datum['tissue'],
                                datum['value'],
                                datum['gene'],
                                aaa_data.increased,
                                aaa_data.decreased,
                                aaa_data.change,
                                variant.chr_name,
                                variant.chr_start,
                                variant.ref,
                                alt,
                                transcript.strand,
                                transcript.ensembl_id,
                                transcript.cds_start,
                                transcript.cds_end,
                            ))
        except Exception as e:
            # Deliberately broad: report the problem and keep going.
            print(e)

    return api_report
def poly_aaa_vs_expression(variants_by_gene):
    """Relate poly(A) changes of variants to expression data and report them.

    For each poly(A) variant, transcript and alternative allele, the
    expression records from the expression database are split into tissues
    with positive and with negative slope. The balance of both groups gives
    a trend ('up', 'rather_up', 'down', 'rather_down' or 'constant') which is
    stored in ``transcript.expression[alt]``. Two tables are passed to
    ``report`` (per-variant summary and per-tissue rows) and the per-tissue
    rows are handed to ``summarize_tissue_eqtl_aaa_correlation``.
    """
    bdb = ExpressionDatabase(GTEX_DATABASE)

    def is_length_difference_big(l1, l2):
        """Is the first list much longer than the second?"""
        longer, shorter = len(l1), len(l2)
        assert longer > shorter
        # The short-circuit protects the division from an empty second list.
        return shorter == 0 or longer // shorter > 10

    def classify_trend(up_tissues, down_tissues):
        """Name the expression trend given the tissues going up and down."""
        ups, downs = len(up_tissues), len(down_tissues)
        if ups > downs:
            # rather up; is this certainly up?
            if is_length_difference_big(up_tissues, down_tissues):
                return 'up'
            return 'rather_up'
        if downs > ups:
            # rather down; is this certainly down?
            if is_length_difference_big(down_tissues, up_tissues):
                return 'down'
            return 'rather_down'
        # neither direction prevails
        return 'constant'

    gtex_report = []
    gtex_report_with_tissue = []

    aaa_variants_list = list(all_poly_a_variants(variants_by_gene))

    print('Analysing %s poly_a related variants (out of %s total).' %
          (len(aaa_variants_list), len(variants_by_gene)))

    for variant in aaa_variants_list:
        for transcript in variant.affected_transcripts:
            if not transcript.poly_aaa:
                continue

            expression_data_by_alt = bdb.get_by_mutation(variant, transcript)
            transcript.expression = {}

            for alt, aaa_data in transcript.poly_aaa.items():
                expression_data = expression_data_by_alt.get(alt, None)
                if not expression_data:
                    continue

                print('Expression data for', variant.snp_id, 'found:',
                      expression_data)

                # Columns shared by the rows of both report tables.
                common_tail = (
                    aaa_data.increased,
                    aaa_data.decreased,
                    aaa_data.change,
                    variant.chr_name,
                    variant.chr_start,
                    variant.ref,
                    alt,
                    transcript.strand,
                    transcript.ensembl_id,
                    transcript.cds_start,
                    transcript.cds_end,
                )

                tissues_up = []
                tissues_down = []

                for tissue_name, slope, gene in expression_data:
                    # The slope is reported as received; it is converted to
                    # float only for the sign check below.
                    gtex_report_with_tissue.append(
                        (variant.snp_id, tissue_name, slope, gene) + common_tail
                    )
                    numeric_slope = float(slope)
                    if numeric_slope > 0:
                        tissues_up.append(tissue_name)
                    elif numeric_slope < 0:
                        tissues_down.append(tissue_name)

                trend = classify_trend(tissues_up, tissues_down)
                transcript.expression[alt] = trend

                gtex_report.append(
                    (variant.snp_id, len(tissues_up), len(tissues_down), trend)
                    + common_tail
                )

    # Disabled aggregate row, kept for reference:
    # gtex_report += [(
    #     sum('up' in v.expression.values() for v in poly_a_related_variants),
    #     sum('down' in v.expression.values() for v in poly_a_related_variants),
    #     sum(
    #         sum('up' == expr for expr in v.expression.values())
    #         for v in poly_a_related_variants
    #     ),
    #     sum(
    #         sum('down' == expr for expr in v.expression.values())
    #         for v in poly_a_related_variants
    #     ),
    #     sum(data.increased for v in poly_a_related_variants for data in v.poly_aaa.values()),
    #     sum(data.decreased for v in poly_a_related_variants for data in v.poly_aaa.values())
    # )]

    report(
        'expression table for variants (based on data from gtex)',
        ['\t'.join(str(field) for field in row) for row in gtex_report],
        [
            'variant', 'expression+', 'expression-', 'trend', 'aaa+', 'aaa-',
            'aaa_change', 'chrom', 'pos', 'ref', 'alt', 'strand',
            'transcript', 'cds_start', 'cds_end'
        ])

    report(
        'expression table for variants with tissues (based on data from gtex)',
        ['\t'.join(str(field) for field in row)
         for row in gtex_report_with_tissue],
        [
            'variant', 'tissue', 'slope', 'gene', 'aaa+', 'aaa-',
            'aaa_change', 'chrom', 'pos', 'ref', 'alt', 'strand',
            'transcript', 'cds_start', 'cds_end'
        ])

    summarize_tissue_eqtl_aaa_correlation(gtex_report_with_tissue)

    # report(
    #     'Expression table for genes (based on data from GTEx)',
    #     ['\t'.join(map(str, line)) for line in gtex_report_by_genes],
    #     # note: alleles is not the same as variants
    #     [
    #         'gene', 'alleles with expression+', 'alleles with expression-',
    #         'variants with expression+', 'variants with expression-', '#aaa+', '#aaa-'
    #     ]
    # )

    print('Done')
def spidex_aaa_ks_test(variants_groups, already_divided=False):
    """Run two-sample KS tests on SPIDEX dpsi z-scores of poly(A) variants.

    Two families of tests are printed:

    * every pair of groups from ``variants_groups`` is compared directly;
    * for every observed ``new_aaa_length`` value x, the z-scores of points
      with length <= x are compared with those with length > x.

    The ``-log(p-value)`` of the second family is drawn as a histogram over
    x and handed to ``save_plot``.

    Args:
        variants_groups: mapping of group name to a list of points, each
            point providing the keys 'dpsi_zscore' and 'new_aaa_length'.
            When ``already_divided`` is false, this is instead the input for
            ``all_poly_a_variants`` and the groups are computed first.
        already_divided: whether ``variants_groups`` is already the grouped
            mapping described above.
    """
    if not already_divided:
        aaa_variants_list = all_poly_a_variants(variants_groups)
        raw_report = spidex_from_list(aaa_variants_list)
        variants_groups = divide_variants_by_poly_aaa(raw_report)

    groups_zscores = {
        name: [point['dpsi_zscore'] for point in group]
        for name, group in variants_groups.items()
    }

    for group_1, group_2 in combinations(groups_zscores, 2):
        print('%s vs %s:' % (group_1, group_2))
        ks_result = ks_2samp(groups_zscores[group_1], groups_zscores[group_2])
        print(ks_result)

    # Regroup the z-scores by the poly(A) length resulting from the mutation.
    groups_new_aaa_lengths = defaultdict(list)
    for group in variants_groups.values():
        for point in group:
            groups_new_aaa_lengths[point['new_aaa_length']].append(
                point['dpsi_zscore'])

    ks_results = {}

    for new_aaa_length in sorted(groups_new_aaa_lengths):
        print(
            'All mutations causing poly_aaa to be <= %s vs all mutations causing poly_aaa to be > %s:'
            % (new_aaa_length, new_aaa_length))

        # Decide per length bucket first, then take all its z-scores.
        z_scores_1 = [
            zscore
            for length, zscores in groups_new_aaa_lengths.items()
            if length <= new_aaa_length
            for zscore in zscores
        ]
        z_scores_2 = [
            zscore
            for length, zscores in groups_new_aaa_lengths.items()
            if length > new_aaa_length
            for zscore in zscores
        ]

        if not z_scores_2:
            # Happens for the largest length: nothing to compare against.
            print('No mutations causing poly_aaa to be > %s' % new_aaa_length)
            continue

        ks_result = ks_2samp(z_scores_1, z_scores_2)
        print(new_aaa_length, ks_result)
        ks_results[new_aaa_length] = -np.log(ks_result.pvalue)

    lengths = list(ks_results.keys())

    plt.hist(lengths, weights=list(ks_results.values()), bins=lengths,
             rwidth=0.9)
    plt.xticks(lengths)
    plt.xlabel('Length of poly(A) track: $x$')
    plt.ylabel(r'$-\log($P-Value$)$')
    # Raw literal: '\l' is not a valid escape sequence in a normal string.
    plt.title(
        'Ks-test for groups: '
        r'mutations effecting in poly(A) length $\leq$ $x$ vs mutations effecting in poly(A) length > $x$'
    )
    plt.grid(True)
    save_plot(plt)
def summarize_poly_aaa_variants(variants):
    """Report and print summary statistics of poly(A) related variants.

    One record is created per (variant, transcript, alt) combination that
    has poly_aaa data. Three tables are passed to ``report`` (the records,
    the counts of variant sources and all variant ids) and a set of counters
    is printed.
    """
    columns = [
        'snp_id', 'gene', 'poly_aaa_increase', 'poly_aaa_decrease',
        'poly_aaa_change', 'chr', 'start', 'end', 'ref', 'alt', 'transcript',
        'cds_start', 'cds_end'
    ]
    Record = recordclass('RecordPolyA', columns)

    aaa_records = []
    all_variants_ids = []
    variants_sources = Counter()

    aaa_variants = set()
    up_variants = set()
    down_variants = set()
    transcripts = set()

    new_poly_a = 0
    in_poly_a = 0

    for variant in all_poly_a_variants(variants, preserve_sources=True):
        # A variant may carry several comma separated identifiers.
        all_variants_ids.extend(variant.snp_id.split(','))

        creates_new_track = False
        hits_existing_track = False

        for transcript in variant.affected_transcripts:
            if not transcript.poly_aaa:
                continue

            for alt, aaa_data in transcript.poly_aaa.items():
                aaa_records.append(Record(
                    variant.snp_id,
                    None,  # variant.ensembl_gene_stable_id # TODO
                    aaa_data.increased,
                    aaa_data.decreased,
                    aaa_data.change,
                    variant.chr_name,
                    variant.chr_start,
                    variant.chr_end,
                    variant.ref,
                    alt,
                    transcript.ensembl_id,
                    transcript.cds_start,
                    transcript.cds_end,
                ))

                if aaa_data.has:
                    hits_existing_track = True
                elif aaa_data.will_have:
                    creates_new_track = True

                if aaa_data.increased:
                    up_variants.add(variant)
                if aaa_data.decreased:
                    down_variants.add(variant)

                transcripts.add(transcript.ensembl_id)
                aaa_variants.add(variant)

        if creates_new_track:
            new_poly_a += 1
        if hits_existing_track:
            in_poly_a += 1

        # Count every distinct source once per variant.
        for source in set(variant.source.split(',')):
            variants_sources[source] += 1

    report('poly aaa increase and decrease by variants', aaa_records, columns)
    report('poly aaa sources', variants_sources.items(), ['source', 'count'])
    report('all ids', all_variants_ids)

    print('Variants creating new poly(A) tracks: %s' % new_poly_a)
    print('Variants in existing poly(A) tracks: %s' % in_poly_a)
    print('Affected transcripts: %s' % len(transcripts))
    print('Down variants: %s' % len(down_variants))
    print('Up variants: %s' % len(up_variants))
    print('Unique variants: %s' % len(aaa_variants))
    print('Variants identifiers: %s' %
          sum(v.snp_id.count(',') + 1 for v in aaa_variants))
    print(variants_sources)
def poly_aaa_consequences(variants):
    """Summarise VEP consequences of poly(A) variants per change category.

    Steps:

    1. Every (variant, transcript, alt) with poly_aaa data is put into the
       category 'increased', 'decreased' or 'constant' and rendered with
       ``transcript.as_hgvs``; the share of indels per category is printed.
    2. The HGVS lists (per category plus 'all') are written with ``report``
       and the resulting files are passed to ``vep``; consequences from the
       tab separated VEP output are counted per category.
    3. The counts are propagated over the sequence ontology graph and, for
       each category, a coloured graph is drawn to
       ``reports/poly_a_consequences_dag_<category>.svg``.
    4. Pie charts of selected consequence groups are produced for each
       category and handed to ``save_plot``.
    """
    mutations_in_cds_hgvs_format = defaultdict(list)

    indels = Counter()
    totals = Counter()  # named so that the builtin all() is not shadowed

    for variant in all_poly_a_variants(variants, preserve_sources=True):
        for transcript in variant.affected_transcripts:
            if not transcript.poly_aaa:
                continue

            for alt, aaa_data in transcript.poly_aaa.items():
                if aaa_data.increased:
                    category = 'increased'
                elif aaa_data.decreased:
                    category = 'decreased'
                else:
                    category = 'constant'

                hgvs = transcript.as_hgvs(variant.ref, alt)

                if 'del' in hgvs or 'ins' in hgvs:
                    indels[category] += 1
                totals[category] += 1

                mutations_in_cds_hgvs_format[category].append(hgvs)
                mutations_in_cds_hgvs_format['all'].append(hgvs)

    print('Indels enrichment:')
    for category in indels:
        print(category, indels[category] / totals[category] * 100, '%')

    # NOTE(review): the same reports are generated again in the loop below;
    # this first pass looks redundant but is kept because the side effects
    # of report() are not visible here - confirm before removing.
    for category, muts in mutations_in_cds_hgvs_format.items():
        report(
            'Mutations which result in ' + category + ' in cds hgvs formats',
            muts)

    consequences = defaultdict(Counter)
    skipped = Counter()

    for category, muts in mutations_in_cds_hgvs_format.items():
        filename = report(
            'Mutations which result in ' + category + ' in cds hgvs formats',
            muts)

        vep_filename = vep(filename)

        # The context manager makes sure the VEP output file gets closed.
        with open(vep_filename) as vep_output:
            for line in vep_output:
                if line.startswith('#'):
                    continue

                fields = line.split('\t')

                # The first column starts with '<transcript>:'.
                tested_transcript = fields[0].split(':')[0]
                vep_transcript = fields[4]

                if fields[5] != 'Transcript':
                    skipped['Not a transcript feature'] += 1
                    continue

                if tested_transcript != vep_transcript:
                    skipped['Different transcript'] += 1
                    continue

                for consequence in fields[6].split(','):
                    consequences[category][consequence] += 1

        print(skipped)

    print('Raw consequences')
    print(consequences)

    graph = load_sequence_ontology()

    expanded_consequences = propagate_consequences(graph, consequences)

    for category, counts in expanded_consequences.items():
        consequences_to_include = ['coding_sequence_variant']
        consequences_to_include.extend(counts.keys())

        g = graph.subgraph([
            node
            for node, data in graph.nodes(data=True)
            if data['name'] in consequences_to_include
        ])
        g = g.reverse()

        max_count = max(counts.values())

        # nodes(data=True) yields the attribute dictionaries themselves, so
        # they are updated in place; this avoids nodes_iter() and g.node,
        # which are not available in newer networkx releases.
        for node, data in g.nodes(data=True):
            name = data['name']
            count = counts[name]

            # The more frequent the consequence, the less red is left in
            # the fill colour.
            color = (255 - int(log((count / max_count) + 1) * 255), 255, 255)
            data['style'] = 'filled'
            data['shape'] = 'box'
            data['fillcolor'] = '#%02x%02x%02x' % color

            # Mark terms which are not among the raw consequences.
            if name not in consequences[category]:
                data['style'] = 'dashed,filled'

        g = nx.relabel_nodes(
            g, {
                node: data['name'].replace('variant', 'v.') +
                ': %s' % counts.get(data['name'])
                for node, data in g.nodes(data=True)
            })

        a = nx_agraph.to_agraph(g)
        a.layout(
            'dot',
            args='-Nfontsize=14 -Nwidth=".2" -Nheight=".2" -Nmargin=.1 -Gfontsize=8 -Earrowsize=.5'
        )
        a.draw('reports/poly_a_consequences_dag_' + category + '.svg')

    selected_consequences_groups = {
        'General coding':
        ['synonymous_variant', 'frameshift_variant', 'inframe_variant'],
        'Inframe': [
            'inframe_deletion', 'inframe_insertion', 'missense_variant',
            'stop_gained', 'stop_lost'
        ]
    }

    for group, selected_consequences in selected_consequences_groups.items():
        for category, counts in expanded_consequences.items():
            data = {
                consequence: counts[consequence]
                for consequence in selected_consequences
            }
            # Order the slices by count, ascending.
            data = OrderedDict(sorted(data.items(), key=itemgetter(1)))

            # Create a pie chart
            wedges = plt.pie(
                list(data.values()),
                labels=list(data.keys()),
                shadow=False,
                colors=plt.cm.tab20(
                    numpy.linspace(1, 0, len(selected_consequences))),
                startangle=0,
                autopct='%1.1f%%',
            )

            # The first element of the pie() result holds the wedges.
            for pie_wedge in wedges[0]:
                pie_wedge.set_edgecolor('black')

            # Equal axis scaling
            plt.axis('equal')
            plt.title(group + ' consequences for variants causing ' +
                      category + ' in poly(A) length')
            plt.tight_layout()
            save_plot(plt, hide_title=True)