def plot():
    """Draw the exploratory plots for the module-level ``data`` table and save them.

    Reads the module-level ``data`` frame (columns used here: ``same_strand``,
    ``start``, ``end``, ``distance``, ``transcription``) and produces:

    * a scatter matrix of the numeric columns (without ``start``/``end``),
    * scatter plots of ``transcription`` against ``start`` and ``distance``,
    * a box plot of ``transcription`` by ``distance`` (positive distances),
    * a square heat map of transcription summed over 10 000 position bins,
    * a heat map of the Spearman correlation matrix,
    * a histogram of ``transcription``.

    All open figures are then handed to ``save_all_figs()``.

    The module-level table is not modified: the cleaned-up version lives in
    a separate local name.
    """
    print('Plotando...')

    # ###### PLOT ALL NUMERIC!
    # Keep the cleaned table under its own name. Rebinding ``data`` inside the
    # function would turn it into a local variable and make the very first
    # read raise UnboundLocalError.
    numeric = data.dropna().drop(columns='same_strand')
    numeric = numeric.infer_objects().select_dtypes(exclude=['object'])

    pd.plotting.scatter_matrix(numeric.drop(columns=['end', 'start']))

    numeric.plot.scatter('start', 'transcription', alpha=.2)
    numeric.plot.scatter('distance', 'transcription', alpha=.2)
    plot_box(numeric[numeric.distance > 0], 'distance', 'transcription', bins=50)

    # Transcription summed per genomic bin, laid out as a square image.
    by_position = numeric.sort_values('start')[['start', 'transcription']]
    binned = by_position.assign(start=pd.cut(by_position.start, int(1e4)))
    # observed=False keeps empty bins, so the number of rows stays equal to
    # the number of bins and the square reshape below remains valid.
    genome_map = binned.groupby('start', observed=False).sum()
    side = int(len(genome_map.transcription) ** .5)
    plt.figure()
    plt.pcolor(genome_map.transcription.values.reshape(side, side))
    plt.colorbar()

    plt.figure()
    plt.pcolor(numeric.corr(method='spearman'))

    plt.figure()
    numeric.transcription.hist()

    save_all_figs()
def plot(counts):
    """Print repetition statistics and draw bar histograms of ``counts``.

    Args:
        counts: object with a ``value_counts()`` method (e.g. a pandas
            Series) holding one repetition count per sequence. The values
            are assumed to be numeric, since they are compared with ``1``
            and used as bar positions.

    Prints the frequency table, its total, the fraction of sequences with
    zero repetitions and the fraction with at most one repetition. Then
    draws the frequency table as a bar chart twice (linear and log y-axis)
    and calls ``save_all_figs()``.
    """
    hist = counts.value_counts()
    print(hist)

    soma = hist.sum()
    # ``value_counts`` orders by frequency, not by value, so select by label:
    # a positional slice would pick the two most common values instead of
    # the values 0 and 1, and a plain ``hist[0]`` fails when 0 never occurs.
    zero_reps = hist.get(0, 0)
    up_to_one_rep = hist[hist.index <= 1].sum()
    print('soma:', soma,
          'fração de 0 repetições:', zero_reps/soma,
          'fração com até 1 rep.', up_to_one_rep/soma)

    for log in (False, True):
        print(f'Construindo histograma, log={log}')
        plt.figure(figsize=(11, 4.8))
        plt.bar(hist.index, hist, log=log)
        plt.xlabel('Número de repetições')
        plt.ylabel('Número de sequências')
        plt.title('Quantidade de cópias repetidas')

    print("Histograma salvo.")
    save_all_figs()
def main():
    """Run the subset comparison for every column of interest in ``d``.

    For each column: print a header, keep the rows of the module-level
    table ``d`` where that column is not NaN, build the subsets with
    ``get_subsets`` and pass them to ``compare``. For gene-level columns
    (name contains ``'gene'``) only the closest copy per neighbor gene is
    kept. Finally the figures are shown when ``show_flag`` is truthy and
    saved with ``save_all_figs()`` otherwise.
    """
    columns = (
        'transcription',
        'complement_transcription',
        'gene_transcription',
        'gene_complement_transcription',
        'correlation',
        'complement_correlation',
        'motherlength',
        'repetitions',
    )

    for column in columns:
        u.print_header(column)

        valid_rows = d.dropna(subset=[column])
        if 'gene' in column:
            # Sorting by distance first means drop_duplicates keeps the
            # closest copy for each neighbor gene.
            valid_rows = (valid_rows
                          .sort_values('distance')
                          .drop_duplicates(subset='neighbor_gene'))

        compare(get_subsets(valid_rows, column), column)

    if not show_flag:
        save_all_figs()
    else:
        plt.show()
#         olap_mask = (max(mask.start - hstart, 0),
#                      min(mask.end - hstart, 1000))
#         for i in range(*olap_mask):
#             mask_count[i] += 1
#         # plt.fill_between(olap_mask, *axesy, alpha=.1)
#         count += 1
#
#         print(irow, '/', n_heads, 'plotted:', count)
#
# except KeyboardInterrupt:
#     pass
#
# return mask_count

# NORMALIZE
# Scale the read counts so the largest value becomes 1. The maximum is
# computed once up front instead of once per element; an empty sequence
# stays empty, as before.
complete_counts = list(complete_counts)
if complete_counts:
    peak = max(complete_counts)
    complete_counts = [c / peak for c in complete_counts]
#mask_count = [c/max(mask_count) for c in mask_count]

# Overall transcription profile along the probe sequences.
plt.figure(figsize=(9, 4.8))
plt.plot(complete_counts, label='Read count')
#plt.plot(mask_count, label='Masked count')
plt.title('Perfil geral de transcrição das sequências sonda')
plt.xlabel('Distância ao início da sonda (pb)')
plt.ylabel('Contagem de reads normalizada')

if show_flag:
    plt.show()

save_all_figs()
print('Pronto.')
def main(func):
    """Compare gene transcription with and without a neighboring Perere-3.

    Args:
        func: reducer called as ``func(counts)`` on the module-level
            ``counts`` table, e.g. ``pd.DataFrame.sum`` or
            ``pd.DataFrame.max``. Its result must be indexed by name
            (string index).

    Three groups are drawn side by side with ``u.multibox_compare`` in a
    new figure:

    * genes with a Perere-3 neighbor (``head_data.gene_transcription``),
    * genes that are not listed in ``head_data.neighbor_gene``,
    * all genes.

    Also prints how many entries of ``head_data.neighbor_gene`` are
    duplicated.
    """
    counts_sum = func(counts)
    # Skip the first entry of the reduced table.
    # NOTE(review): presumably a non-count column such as a name/id; confirm.
    counts_sum = counts_sum.iloc[1:]

    # Genes are the entries named 'Smp...' that are not '...complement'.
    genes_data = counts_sum[counts_sum.index.str.startswith('Smp')]
    genes_data = genes_data.iloc[~genes_data.index.str.endswith('complement')]

    with_perere = head_data.gene_transcription.dropna().drop_duplicates()
    lone_genes = genes_data.drop(
        head_data.neighbor_gene.dropna().unique()).dropna()

    print(head_data.neighbor_gene.duplicated().sum())

    plt.figure(figsize=(6, 10))
    u.multibox_compare(
        (with_perere, lone_genes, genes_data),
        ('Com Perere-3 vizinho', 'Sem Perere-3 vizinho', 'Total'),
        margin=3)


if __name__ == '__main__':
    for func in (pd.DataFrame.sum, pd.DataFrame.max):
        # Pass the reducer itself; passing ``counts`` here would make
        # ``main`` try to call the table as a function.
        main(func)
    u.save_all_figs()
def main():
    """Compare transcriptional correlation of downstream vs. upstream heads.

    Steps:

    1. Load ``genome_annotation/all_together_now.tsv`` (relative to the
       module-level ``pardir``) and drop every row containing NaN, which
       keeps only heads that have a neighbor gene.
    2. Print the Spearman correlation table of ``transcription``,
       ``correlation`` and ``distance``.
    3. Drop heads overlapping their neighbor (``relative_position == 'olap'``)
       and split the rest into *downstream* (``+`` strand with ``'dir'``, or
       ``-`` strand with ``'esq'``) and *upstream* (everything else).
    4. For several distance thresholds, run a Mann-Whitney U test on the
       ``correlation`` of both groups and draw one box plot per threshold,
       annotated with the p-value.
    5. Plot the p-value and the number of points in each group as a
       function of the distance threshold.
    6. Save all figures with ``save_all_figs()``.
    """

    def correlations_within(frame, limit):
        # Correlation values of the rows whose distance is at most `limit`.
        return frame.loc[frame.distance <= limit].correlation

    head_data = pd.read_table(pardir/'genome_annotation/all_together_now.tsv')

    # Very important to drop NaN's! Later filters compare columns directly
    # (e.g. relative_position != 'olap'). This keeps only heads with
    # neighbor genes.
    head_data = head_data.dropna().reset_index(drop=True)

    # ################# CORRELATION ########################
    print("TABELA DE CORRELAÇÕES DE SPEARMAN")
    print(head_data[['transcription', 'correlation', 'distance']].corr(method='spearman'))

    # #################### SPLITS ##########################
    # drop overlapped
    head_data = head_data[head_data.relative_position != 'olap']

    # ##### UP/DOWN-STREAM: -----> -->
    # pd.concat replaces DataFrame.append, which no longer exists in
    # pandas >= 2.0; row order and index labels are the same as before.
    downstream = pd.concat([
        head_data[(head_data.strand == '+') &
                  (head_data.relative_position == 'dir')],
        head_data[(head_data.strand == '-') &
                  (head_data.relative_position == 'esq')],
    ])
    upstream = head_data.drop(downstream.index)

    # ################# Wilcoxon #####################
    thresholds = (1e3, 1e4, 2e4,
                  pd.concat([upstream, downstream]).distance.max())

    fig, axs = plt.subplots(1, len(thresholds), figsize=(11, 4.8))
    for ax in axs:
        ax.get_xaxis().set_ticks([])
        ax.get_yaxis().set_ticks([])

    # Invisible full-figure axes, used only to carry the shared y label.
    ax = fig.add_subplot(111, frameon=False)
    ax.grid(False)
    plt.tick_params(labelcolor='none', top=False,
                    bottom=False, left=False, right=False)
    plt.ylabel('Correlação transcricional com o gene vizinho')

    for i, thresh in enumerate(thresholds):
        # Data selection
        a = correlations_within(downstream, thresh)
        b = correlations_within(upstream, thresh)

        pvalue = mannwhitneyu(a, b).pvalue
        plabel = f'p-valor:\n{pvalue:.5f}'
        label = f"Distância < {thresh:.0f}"
        print(label, pvalue, 'Medians:', a.median(), b.median())

        fig.add_subplot(1, len(thresholds), i + 1, frameon=False)
        plt.title(label)
        boxplot([a, b])
        plt.annotate(plabel, (.5, .2), xycoords='axes fraction', ha='center')
        plt.xticks([0, 1], labels=['Downstream', 'Upstream'])

    plt.tight_layout()

    # ======================= P VS. DIS ==========================
    plt.figure(figsize=(11, 4.8))
    plt.subplot(211)

    xdistances = range(100, int(upstream.distance.max()), 100)
    ypvalues = []
    updataamounts = []
    downdataamounts = []
    # Filter each group once per threshold and reuse the result for both
    # the test and the point counts.
    for limit in xdistances:
        down_corr = correlations_within(downstream, limit)
        up_corr = correlations_within(upstream, limit)
        ypvalues.append(mannwhitneyu(down_corr, up_corr).pvalue)
        downdataamounts.append(len(down_corr))
        updataamounts.append(len(up_corr))

    plt.plot(xdistances, ypvalues)
    plt.semilogx()
    plt.ylabel('p-valor')

    # ----------------------------------
    plt.subplot(212)
    plt.plot(xdistances, downdataamounts, label="Downstream")
    plt.plot(xdistances, updataamounts, label="Upstream")
    plt.legend()
    plt.semilogx()
    plt.xlabel('Distância ao vizinho (pb)')
    plt.ylabel('Quantidade de pontos')

    save_all_figs()