def main():
    """Relate each head to its nearest gene, one contig at a time.

    For every contig that has both heads and genes, splits the contig's
    heads into ``n_cpu`` chunks and parses them in a process pool
    (``parse_chunk``).  Heads on contigs with no annotated gene are
    counted into the module-level ``no_neighbor_count``.

    Returns:
        tuple: (relations DataFrame with columns
        ['head_id', 'gene_id', 'flag', 'distance'],
        updated ``no_neighbor_count``).
    """
    global no_neighbor_count

    with safe_open(outpath, exist_ok='exit') as outfile:
        # Collect chunk results in a list and concatenate once at the end:
        # DataFrame.append in a loop is deprecated (removed in pandas 2.0)
        # and quadratic.
        collected = []

        for head_group_name, head_group in tqdm(head_groups, desc='Contigs'):
            # Contig has heads but no annotated genes: count them as
            # neighborless and move on.
            if head_group_name not in gene_groups.groups:
                prinf(
                    'Não há nenhum gene no cromossomo. As heads abaixo não possuem NG.'
                )
                prinf(head_group)
                no_neighbor_count += head_group.shape[0]
                continue

            gene_group = gene_groups.get_group(head_group_name)
            chunks = np.array_split(head_group, n_cpu)

            # One worker per chunk; the chunk number is forwarded for
            # per-worker progress-bar placement.
            with mp.Pool() as pool:
                pool_results = pool.starmap(
                    parse_chunk,
                    ((c, gene_group, cn) for cn, c in enumerate(chunks)))

            collected.extend(pool_results)

        if collected:
            relations = pd.concat(collected)
            relations.columns = ['head_id', 'gene_id', 'flag', 'distance']
        else:
            # Guard: assigning 4 column names to an empty 0-column frame
            # would raise; build the empty frame with the schema instead.
            relations = pd.DataFrame(
                columns=['head_id', 'gene_id', 'flag', 'distance'])

        relations.to_csv(outfile, sep='\t', index=False)

    log(f'\nConcluído. Relações salvas em {str(outpath)}.')
    return relations, no_neighbor_count
def parse_head_row(head_row, gene_group, outfile):
    """Find the gene nearest to ``head_row`` and write the relation as TSV.

    A gene overlapping the head wins outright (flag 'olap', distance 0).
    Otherwise the closest gene is searched both behind ('b' → flag 'hg')
    and ahead ('f' → flag 'gh') by position, keeping the smaller distance.
    Writes one line (head_id, gene_id, flag, distance) to ``outfile``,
    or nothing when no neighbor exists in either direction.

    Raises:
        pd.core.indexes.base.InvalidIndexError: if gene positions are
            duplicated (position-indexed lookup would be ambiguous).
    """
    # Kept for post-mortem inspection of the last processed arguments.
    parse_head_row.last_args = head_row, gene_group, outfile

    # Sentinels so we can detect the "no neighbor found at all" case.
    chosen_gene_id = flag = None

    for _, gene_row in gene_group.iterrows():
        if overlaps((gene_row.start, gene_row.end),
                    (head_row.start, head_row.end)):
            flag = 'olap'
            chosen_gene_id = gene_row.id
            distance = 0
            break
    else:
        # No gene overlaps: 'b'/'f' select backfill or forward-fill lookup.
        distance = float('inf')
        for meth in ['b', 'f']:
            fill_back = meth == 'b'
            # Backfill compares the head's end against gene starts;
            # forward-fill compares the head's start against gene ends.
            gene_pos = ['end', 'start'][fill_back]
            head_pos = ['end', 'start'][not fill_back]

            if gene_group[gene_pos].duplicated().sum():
                raise pd.core.indexes.base.InvalidIndexError(
                    'You appear to have duplicated gene positions. '
                    'Indexes won\'t work properly.')

            indexed_group = gene_group.sort_values(gene_pos).set_index(
                gene_pos)
            try:
                near_gene_index = indexed_group.index.get_loc(
                    head_row[head_pos], meth + 'fill')
                gene_row = gene_group.iloc[near_gene_index]
                new_distance = abs(gene_row[gene_pos] - head_row[head_pos])
                if new_distance < distance:
                    chosen_gene_id = gene_row.id
                    flag = ["gh", "hg"][fill_back]
                    distance = new_distance
            except KeyError:
                # No gene exists in this direction for this head.
                prinf(
                    f'\nNão há gene {meth.upper()} de {head_row.id}. (B=atrás, F=à frente)\n'
                )

    # BUG FIX: if both directional lookups failed, chosen_gene_id/flag
    # were unbound and the write below raised NameError — skip instead.
    if chosen_gene_id is None:
        return

    outfile.write('\t'.join([head_row.id, chosen_gene_id, flag,
                             str(distance)]) + '\n')
def main():
    """Filter perere3 BLAST hits, discarding those out-scored by SR3.

    Reads the perere3 and SR3 vs-genome BLAST tables; for each scaffold,
    crosses every perere3 hit with every SR3 hit, keeps pairs where the
    SR3 bitscore is strictly higher, and checks them for overlap in a
    process pool (``parse_product``).  Overlapped perere3 rows are
    dropped; both the discarded rows and the filtered table are written
    out as TSV.

    Returns:
        tuple: (filtered perere3 DataFrame, discarded rows DataFrame).
    """
    filtered_outfile = safe_open(filtered_outpath, exist_ok=False)
    discarded_outfile = safe_open(discarded_outpath, exist_ok=False)
    n_cpu = mp.cpu_count()

    #================== LER E FILTRAR ALINHAMENTOS ==================#
    print('Lendo resultados do Blast...', end=' ')
    perere3_vs_genoma = pd.read_table(perere3_inpath, header=None,
                                      names=BL_COLUMNS)
    sr3_vs_genoma = pd.read_table(sr3_inpath, header=None, names=BL_COLUMNS)
    print('Resultados lidos.')

    print('Buscando alinhamentos em que o SR3 é melhor...')
    # Collect per-scaffold results and concatenate once: DataFrame.append
    # in a loop is deprecated (removed in pandas 2.0) and quadratic.
    discarded_parts = []
    filtered_perere3_vs_genoma = perere3_vs_genoma.copy()
    p_groups = perere3_vs_genoma.groupby('saccver')
    s_groups = sr3_vs_genoma.groupby('saccver')

    print(
        'Iterando para cada scaffold no genoma e para cada perere3 no scaffold.'
    )
    for p_group_name in tqdm(p_groups.groups, desc='Scaffolds'):
        # BUG FIX: scaffolds with perere3 hits but no SR3 hits made
        # get_group raise KeyError — nothing can out-score perere3 there.
        if p_group_name not in s_groups.groups:
            continue
        s_group = s_groups.get_group(p_group_name)
        p_group = p_groups.get_group(p_group_name)

        prinf('Combinando DataFrames...', end='\r')
        product = cartesian_product(
            p_group[['sstart', 'send', 'bitscore']].reset_index(),
            s_group[['sstart', 'send', 'bitscore']].reset_index())

        # Keep only pairs where SR3 (y) beats perere3 (x) on bitscore.
        prinf('Filtrando por bitscore do SR3...', end='\r')
        product = product.loc[product.bitscore_x < product.bitscore_y]
        if product.empty:
            continue

        prinf('Subdividindo produto...', end='\r')
        product_chunks = np.array_split(product, n_cpu)

        prinf('Procurando sobreposições...', end='\r')
        with mp.Pool() as pool:
            chunks_discarded = pool.starmap(parse_product,
                                            enumerate(product_chunks))
        discarded_parts.append(pd.concat(chunks_discarded))

    # Two-level header: (perere3|sr3) x (index, sstart, ssend, bitscore).
    # NOTE(review): 'ssend' looks like a typo for 'send' but is kept to
    # preserve the output file schema.
    out_columns = pd.MultiIndex.from_product([('perere3', 'sr3'),
                                              ('index', 'sstart', 'ssend',
                                               'bitscore')])
    if discarded_parts:
        discarded = pd.concat(discarded_parts)
        discarded.columns = out_columns
    else:
        # Guard the empty case: assigning 8 names to a 0-column frame raises.
        discarded = pd.DataFrame(columns=out_columns)

    print(
        f"Escrevendo posições das linhas removidas de '{str(perere3_inpath)}' em '{str(discarded_outpath)}'...",
        end=' ')
    discarded.to_csv(discarded_outfile, sep='\t', index=False)
    print('Arquivo escrito.')

    print('Filtrando...', end=' ')
    filtered_perere3_vs_genoma.drop(discarded[('perere3', 'index')],
                                    inplace=True)
    print(f'\nFiltragem concluída. {len(discarded)} alinhamentos removidos.')

    print(
        f"Escrevendo alinhamentos filtrados do perere3 em '{str(filtered_outpath)}'...",
        end=' ')
    filtered_perere3_vs_genoma.to_csv(filtered_outfile, sep='\t', index=False)
    print('Arquivo escrito.')

    # Release the handles opened by safe_open (they were never closed).
    filtered_outfile.close()
    discarded_outfile.close()

    return filtered_perere3_vs_genoma, discarded
print('Concluído. Calculando correlações...') gridx, gridy = 5, 8 plt.rcParams['font.size'] = 4 plt.rcParams['figure.figsize'] = (16, 9) i = 0 for _, relation_row in relations.iterrows(): hid = relation_row.head_id gid = relation_row.gene_id if hid not in counts: prinf( f'WARNING: {hid} não presente nas contagens, só nas relações head-gene. Talvez a contagem deva ser refeita.' ) continue head_col = counts[hid] gene_col = counts[gid] n_non_zero = (head_col.astype(bool) & gene_col.astype(bool)).sum() # if number of significative points (head and gene counts != 0) is less than 4 if n_non_zero < 4: prinf('\nCorrelação descartada:\n', counts[[gid, hid]]) continue prinf( '\nCorrelação plotada:\n', pd.concat([ counts[[gid, hid]],
# genes.loc[genes.duplicated('end', 'last'), 'end'] += 1 gene_groups = genes.groupby(COLS_TO_GROUP) if __name__ == '__main__': # write header outfile.write('\t'.join(['head_id', 'gene_id', 'flag', 'distance']) + '\n') print('Iterate for each contig.') for head_group_name, head_group in tqdm(head_groups): try: gene_group = gene_groups.get_group(head_group_name) except KeyError: prinf( 'Não há nenhum gene no cromossomo. As heads abaixo são "desgenadas".' ) prinf(head_group) continue # parse head_row for each head_row head_group_chunks = pd.np.array_split(head_group, mp.cpu_count()) def parse_chunk(df): global n_cpu tqdm.pandas(position=parse_chunk.bar_pos + 1) parse_chunk.bar_pos += 1 print(parse_chunk.bar_pos, '****', n_cpu) parse_chunk.bar_pos %= n_cpu return df.progress_apply(parse_head_row, axis=1,
genes['id'] = parse_gff_attributes(genes.attributes).index head_groups = heads.groupby(COLS_TO_GROUP) gene_groups = genes.groupby(COLS_TO_GROUP) if __name__ == '__main__': # write header outfile.write('\t'.join(['head_id', 'gene_id', 'flag', 'distance']) + '\n') print('Iterate for each contig and for each head in contig.') for head_group_name, head_group in tqdm(head_groups): try: gene_group = gene_groups.get_group(head_group_name) except KeyError: prinf( 'Não há nenhum gene no cromossomo. As heads abaixo são "desgenadas".' ) prinf(head_group) continue for _, head_row in tqdm(list(head_group.iterrows())): for _, gene_row in gene_group.iterrows(): if overlaps((gene_row.start, gene_row.end), (head_row.start, head_row.end)): flag = 'olap' chosen_gene_id = gene_row.id distance = 0 break # if none overlaps
def main():
    """Relate every head to its nearest gene, contig by contig (serial).

    Reads head and gene GFF annotations, groups both by contig and, for
    each head, picks an overlapping gene ('olap', distance 0) or the
    closest gene behind ('gh') / ahead ('hg') by position.  Writes one
    TSV line (head_id, gene_id, flag, distance) per head to ``outpath``.

    Raises:
        pd.core.indexes.base.InvalidIndexError: if gene positions are
            duplicated (position-indexed lookup would be ambiguous).
    """
    heads = pd.read_table(head_annotations_path,
                          names=GFF3_COLUMNS,
                          usecols=GFF_COLS_SUBSET)
    genes = pd.read_table(gene_annotations_path,
                          names=GFF3_COLUMNS,
                          usecols=GFF_COLS_SUBSET)

    heads['id'] = parse_gff_attributes(heads.attributes).index
    genes['id'] = parse_gff_attributes(genes.attributes).index

    head_groups = heads.groupby(COLS_TO_GROUP)
    gene_groups = genes.groupby(COLS_TO_GROUP)

    # Context manager guarantees the file is closed on every exit path
    # (the original close() was skipped when an exception propagated).
    with safe_open(outpath, exist_ok=False) as outfile:
        # write header
        outfile.write('\t'.join(['head_id', 'gene_id', 'flag', 'distance']) +
                      '\n')

        print('Iterate for each contig and for each head in contig.')
        for head_group_name, head_group in tqdm(head_groups):
            try:
                gene_group = gene_groups.get_group(head_group_name)
            except KeyError:
                prinf(
                    'Não há nenhum gene no cromossomo. As heads abaixo são "desgenadas".'
                )
                prinf(head_group)
                continue

            for _, head_row in tqdm(list(head_group.iterrows())):
                # Sentinels to detect "no neighbor found at all".
                chosen_gene_id = flag = None

                for _, gene_row in gene_group.iterrows():
                    if overlaps((gene_row.start, gene_row.end),
                                (head_row.start, head_row.end)):
                        flag = 'olap'
                        chosen_gene_id = gene_row.id
                        distance = 0
                        break
                else:
                    # No overlap: 'b'/'f' select backfill or forward-fill.
                    distance = float('inf')
                    for meth in ['b', 'f']:
                        fill_back = meth == 'b'
                        # Backfill compares head end vs gene starts;
                        # forward-fill compares head start vs gene ends.
                        gene_pos = ['end', 'start'][fill_back]
                        head_pos = ['end', 'start'][not fill_back]

                        if gene_group[gene_pos].duplicated().sum():
                            raise pd.core.indexes.base.InvalidIndexError(
                                'You appear to have duplicated gene positions. '
                                'Indexes won\'t work properly.')

                        indexed_group = gene_group.sort_values(
                            gene_pos).set_index(gene_pos)
                        try:
                            near_gene_index = indexed_group.index.get_loc(
                                head_row[head_pos], meth + 'fill')
                            gene_row = gene_group.iloc[near_gene_index]
                            new_distance = abs(gene_row[gene_pos] -
                                               head_row[head_pos])
                            if new_distance < distance:
                                chosen_gene_id = gene_row.id
                                flag = ["gh", "hg"][fill_back]
                                distance = new_distance
                        except KeyError:
                            # BUG FIX: the original message interpolated
                            # `flag`, which may be unbound here; report
                            # the failed search direction instead.
                            prinf(
                                f'Não há gene {meth.upper()} de {head_row.id}.'
                            )

                # BUG FIX: when neither direction found a gene, the write
                # below raised NameError — skip the head instead.
                if chosen_gene_id is None:
                    continue

                outfile.write('\t'.join(
                    [head_row.id, chosen_gene_id, flag,
                     str(distance)]) + '\n')

    print(f'\nConcluído. Relações salvas em {str(outpath)}.')
def main():
    """Extract HEAD_LEN-bp "heads" downstream of Perere-3 genome copies.

    For every filtered perere3 alignment whose query reaches the element's
    3' end (within MAX_DISTANCE_FROM_END), takes the genomic window right
    after the copy (reverse-complemented for minus-sense hits), skips the
    leading GTAA repeats and writes the head to a FASTA file, a GFF-style
    annotation file and a motherlength table.  Optionally plots
    per-position nucleotide histograms when ``u.plot_flag`` is set.
    """
    u.log(f'{__file__}: Generating heads of {HEAD_LEN} bp.')
    truncated_count = 0

    #======================== LEITURA ========================#
    heads_annotations_file = u.safe_open(heads_annotations_path,
                                         exist_ok=False)
    heads_outfile = u.safe_open(heads_outpath, exist_ok=False)
    motherlength_outfile = u.safe_open(motherlength_path, exist_ok=False)

    print('Lendo alinhamentos filtrados do Perere3...', end=' ')
    filtered_perere3_vs_genoma = read_table(inpath)
    print(f"'{inpath}' lido.")

    print('Lendo genoma de S. mansoni...', end=' ')
    genomedict = to_dict(parse(str(u.genome_path), 'fasta'))
    print('Dicionário criado.')

    #======================== GET HEADS ========================#
    print('Searching for Perere-3 copies in S. mansoni\'s genome...')

    # Element length = concatenated sequence lines of the FASTA (skip header).
    with (u.pardir / 'seqs/perere3.fa').open() as per_file:
        perere_len = len(''.join(
            [l.strip() for l in per_file.readlines()][1:]))

    heads = []
    for index, row in filtered_perere3_vs_genoma.iterrows():
        # Discard copies without 3' end.
        if abs(row['qend'] - perere_len) < MAX_DISTANCE_FROM_END:
            genome_piece = genomedict[row['saccver']].seq
            plus_sense = row['sstart'] < row['send']

            if plus_sense:
                head_slice = slice(row['send'], row['send'] + GTAA_WINDOW_LEN)
                proto_head = genome_piece[head_slice]
                if u.verbose:
                    prefix = genome_piece[head_slice.start -
                                          PREFIX_LEN:head_slice.start]
            else:
                head_slice = slice(row['send'] - GTAA_WINDOW_LEN - 1,
                                   row['send'] - 1)
                proto_head = genome_piece[head_slice].reverse_complement()
                if u.verbose:
                    prefix = genome_piece[head_slice.stop:head_slice.stop +
                                          PREFIX_LEN].reverse_complement()

            # Windows that fall off the contig start produce negative
            # slice bounds (only possible for minus-sense hits) — skip.
            if head_slice.start < 0 or head_slice.stop < 0:
                u.prinf('Head descartada com posições:', head_slice)
                continue

            # Skip the leading GTAA repeats, then take HEAD_LEN bases.
            skip_gtaa = find_gtaa_break(proto_head)
            head = proto_head[skip_gtaa:skip_gtaa + HEAD_LEN]

            #======================== ANOTAR HEAD NO GFF ========================#
            if plus_sense:
                start_pos = row['send'] + 1 + skip_gtaa
                end_pos = start_pos + HEAD_LEN
            else:
                end_pos = row['send'] - 1 - skip_gtaa
                start_pos = end_pos - HEAD_LEN

            heads_annotations_file.write('\t'.join([
                row['saccver'], 'WormBase_imported', 'gene',
                str(start_pos),
                str(end_pos), '.', ['-', '+'][plus_sense], '.',
                f'gene_id=head{index};motherlength={row["length"]};length={HEAD_LEN}'
            ]) + '\n')

            motherlength_outfile.write(f"head{index}\t{row['length']}\n")

            #======================== ESCREVER HEAD EM FASTA ========================#
            heads_outfile.write(f'>head{index}\n' + str(head) + '\n')

            # BUG FIX: `heads` was never populated, so the histogram
            # section below always plotted from an empty list.
            heads.append(str(head))

            #========================================================================#
            if u.verbose:
                print(
                    ['-', '+'][plus_sense], prefix,
                    proto_head[:skip_gtaa] + ' | ' + head[:30 - skip_gtaa] +
                    '...',
                    f" {len(head)}bp\t{row['pident']:.2f}%\t{row['evalue']:.2e}\t{row['bitscore']:5}\t{row['saccver']}\t{head_slice.start}-{head_slice.stop}"
                )
        else:
            truncated_count += 1

    u.log(
        f'\n{filtered_perere3_vs_genoma.shape[0] - truncated_count} heads written:',
        heads_outpath,
        heads_annotations_path,
        sep='\n\t')
    u.log(f'{filtered_perere3_vs_genoma.shape[0]} alignments considered.')
    u.log(truncated_count, 'heads discarded as truncated.')

    heads_annotations_file.close()
    heads_outfile.close()
    motherlength_outfile.close()

    #======================== PLOTAR HISTOGRAMAS ========================#
    if u.plot_flag:
        # One column per head position; rows are the per-head nucleotides.
        heads_df = DataFrame.from_dict(dict(enumerate(zip(*heads))))
        for j in range(8):
            plt.figure(figsize=(16, 9))
            for i in range(12):
                plt.subplot(3, 4, i + 1)
                heads_df[i + j * 12].value_counts().plot(kind='bar')
            plt.show()