Example #1
0
def main():
    """Map each head to a neighbor gene on its contig and save the relations.

    Iterates over head groups (one per contig), pairs each with the gene
    group of the same contig and delegates the row-wise matching to
    `parse_chunk` across `n_cpu` worker processes. Heads on contigs with no
    gene are counted in the global `no_neighbor_count`.

    Returns:
        tuple: (relations DataFrame, updated global `no_neighbor_count`).
    """
    global no_neighbor_count

    with safe_open(outpath, exist_ok='exit') as outfile:
        # Accumulate per-chunk frames and concatenate once at the end:
        # DataFrame.append was deprecated and removed in pandas 2.0, and
        # appending inside the loop is quadratic in the number of chunks.
        chunk_frames = []

        for head_group_name, head_group in tqdm(head_groups, desc='Contigs'):

            if head_group_name not in gene_groups.groups:
                # Contig has heads but no gene at all: no neighbor possible.
                prinf(
                    'Não há nenhum gene no cromossomo. As heads abaixo não possuem NG.'
                )
                prinf(head_group)
                no_neighbor_count += head_group.shape[0]
                continue

            gene_group = gene_groups.get_group(head_group_name)
            chunks = np.array_split(head_group, n_cpu)

            with mp.Pool() as pool:
                pool_results = pool.starmap(parse_chunk,
                                            ((c, gene_group, cn)
                                             for cn, c in enumerate(chunks)))
                chunk_frames.extend(pool_results)

        columns = ['head_id', 'gene_id', 'flag', 'distance']
        if chunk_frames:
            relations = pd.concat(chunk_frames, ignore_index=True)
            relations.columns = columns
        else:
            # No contig produced relations; still write a header-only file.
            relations = pd.DataFrame(columns=columns)

        relations.to_csv(outfile, sep='\t', index=False)
        log(f'\nConcluído. Relações salvas em {str(outpath)}.')

    return relations, no_neighbor_count
Example #2
0
def parse_head_row(head_row, gene_group, outfile):
    """Find the gene nearest to `head_row` and append the relation to `outfile`.

    Writes one tab-separated line: head id, chosen gene id, flag and distance,
    where flag is 'olap' (overlap, distance 0), 'gh' (gene before head) or
    'hg' (head before gene).

    Raises:
        InvalidIndexError: if `gene_group` has duplicated gene positions.
    """
    # Keep the last call's arguments around for post-mortem debugging.
    parse_head_row.last_args = head_row, gene_group, outfile

    for _, gene_row in gene_group.iterrows():
        if overlaps((gene_row.start, gene_row.end),
                    (head_row.start, head_row.end)):
            flag = 'olap'
            chosen_gene_id = gene_row.id
            distance = 0
            break

    # if none overlaps
    else:
        # 'b'/'f' stand for backfill/forward-fill nearest-position lookups.
        distance = 9e99  # sentinel "infinity"; create Inf() class for that?
        for meth in ['b', 'f']:
            fill_back = meth == 'b'
            gene_pos = ['end', 'start'][fill_back]
            head_pos = ['end', 'start'][not fill_back]

            if gene_group[gene_pos].duplicated().sum():
                raise pd.core.indexes.base.InvalidIndexError(
                    'You appear to have duplicated gene positions. '
                    'Indexes won\'t work properly.')

            # Bug fix: get_loc returns a POSITION in the sorted index, so the
            # row must be taken from the equally sorted frame — indexing the
            # unsorted gene_group would silently pick the wrong gene.
            sorted_group = gene_group.sort_values(gene_pos)
            indexed_group = sorted_group.set_index(gene_pos)

            try:
                near_gene_index = indexed_group.index.get_loc(
                    head_row[head_pos], meth + 'fill')
                gene_row = sorted_group.iloc[near_gene_index]
                new_distance = abs(gene_row[gene_pos] - head_row[head_pos])

                if new_distance < distance:
                    chosen_gene_id = gene_row.id
                    flag = ["gh", "hg"][fill_back]
                    distance = new_distance

            except KeyError:
                # No gene on this side of the head; try the other direction.
                prinf(
                    f'\nNão há gene {meth.upper()} de {head_row.id}. (B=atrás, F=à frente)\n'
                )

    outfile.write('\t'.join([head_row.id, chosen_gene_id, flag,
                             str(distance)]) + '\n')
Example #3
0
def main():
    """Filter perere3 alignments, discarding those where SR3 aligns better.

    Reads the blast tables of perere3 and SR3 against the genome and, per
    scaffold, builds the cartesian product of alignment pairs, keeps pairs
    where SR3 outscores perere3 and delegates overlap detection to
    `parse_product` workers. Discarded rows and the filtered perere3 table
    are written to their respective output files.

    Returns:
        tuple: (filtered perere3 DataFrame, discarded DataFrame).
    """
    filtered_outfile = safe_open(filtered_outpath, exist_ok=False)
    discarded_outfile = safe_open(discarded_outpath, exist_ok=False)
    n_cpu = mp.cpu_count()

    #================== LER E FILTRAR ALINHAMENTOS ==================#

    print('Lendo resultados do Blast...', end=' ')
    perere3_vs_genoma = pd.read_table(perere3_inpath,
                                      header=None,
                                      names=BL_COLUMNS)
    sr3_vs_genoma = pd.read_table(sr3_inpath, header=None, names=BL_COLUMNS)
    print('Resultados lidos.')

    print('Buscando alinhamentos em que o SR3 é melhor...')
    # Collect per-group frames and concatenate once: DataFrame.append was
    # deprecated (removed in pandas 2.0) and loop-append is quadratic.
    discarded_frames = []
    filtered_perere3_vs_genoma = perere3_vs_genoma.copy()

    p_groups = perere3_vs_genoma.groupby('saccver')
    s_groups = sr3_vs_genoma.groupby('saccver')

    print(
        'Iterando para cada scaffold no genoma e para cada perere3 no scaffold.'
    )
    for p_group_name in tqdm(p_groups.groups, desc='Scaffolds'):
        # Bug fix: scaffolds with perere3 hits but no SR3 hit used to raise
        # KeyError here; with no SR3 alignment nothing can outscore perere3,
        # so the scaffold is simply kept.
        if p_group_name not in s_groups.groups:
            continue

        s_group = s_groups.get_group(p_group_name)
        p_group = p_groups.get_group(p_group_name)

        prinf('Combinando DataFrames...', end='\r')
        product = cartesian_product(
            p_group[['sstart', 'send', 'bitscore']].reset_index(),
            s_group[['sstart', 'send', 'bitscore']].reset_index())

        # discard when perere3 aligns better
        prinf('Filtrando por bitscore do SR3...', end='\r')
        product = product.loc[product.bitscore_x < product.bitscore_y]

        if product.empty:
            continue

        prinf('Subdividindo produto...         ', end='\r')
        product_chunks = np.array_split(product, n_cpu)

        prinf('Procurando sobreposições...', end='\r')
        with mp.Pool() as pool:
            chunks_discarded = pool.starmap(parse_product,
                                            enumerate(product_chunks))

        discarded_frames.append(pd.concat(chunks_discarded))

    print(
        f"Escrevendo posições das linhas removidas de '{str(perere3_inpath)}' em '{str(discarded_outpath)}'...",
        end=' ')
    multi_columns = pd.MultiIndex.from_product([('perere3', 'sr3'),
                                                ('index', 'sstart', 'ssend',
                                                 'bitscore')])
    if discarded_frames:
        discarded = pd.concat(discarded_frames, ignore_index=True)
        discarded.columns = multi_columns
    else:
        # Nothing was discarded: keep an empty table with the same header.
        discarded = pd.DataFrame(columns=multi_columns)
    discarded.to_csv(discarded_outfile, sep='\t', index=False)
    print('Arquivo escrito.')

    print('Filtrando...', end=' ')
    filtered_perere3_vs_genoma.drop(discarded[('perere3', 'index')],
                                    inplace=True)
    print(f'\nFiltragem concluída. {len(discarded)} alinhamentos removidos.')

    print(
        f"Escrevendo alinhamentos filtrados do perere3 em '{str(filtered_outpath)}'...",
        end=' ')
    filtered_perere3_vs_genoma.to_csv(filtered_outfile, sep='\t', index=False)
    print('Arquivo escrito.')

    # Close the output handles so the data is flushed deterministically.
    discarded_outfile.close()
    filtered_outfile.close()

    return filtered_perere3_vs_genoma, discarded
Example #4
0
print('Concluído. Calculando correlações...')

# Subplot grid layout and matplotlib defaults for the correlation figures.
gridx, gridy = 5, 8
plt.rcParams['font.size'] = 4
plt.rcParams['figure.figsize'] = (16, 9)
i = 0  # running subplot counter

for _, relation_row in relations.iterrows():

    hid = relation_row.head_id
    gid = relation_row.gene_id

    if hid not in counts:
        prinf(
            f'WARNING: {hid} não presente nas contagens, só nas relações head-gene. Talvez a contagem deva ser refeita.'
        )
        continue
    head_col = counts[hid]
    gene_col = counts[gid]

    n_non_zero = (head_col.astype(bool) & gene_col.astype(bool)).sum()
    # if number of significative points (head and gene counts != 0) is less than 4
    if n_non_zero < 4:
        prinf('\nCorrelação descartada:\n', counts[[gid, hid]])
        continue

    prinf(
        '\nCorrelação plotada:\n',
        pd.concat([
            counts[[gid, hid]],
Example #5
0
# genes.loc[genes.duplicated('end', 'last'), 'end'] += 1

# Group gene annotations per contig so heads can be matched contig-by-contig.
gene_groups = genes.groupby(COLS_TO_GROUP)

if __name__ == '__main__':
    # write header
    outfile.write('\t'.join(['head_id', 'gene_id', 'flag', 'distance']) + '\n')
    print('Iterate for each contig.')

    for head_group_name, head_group in tqdm(head_groups):
        try:
            gene_group = gene_groups.get_group(head_group_name)

        except KeyError:
            prinf(
                'Não há nenhum gene no cromossomo. As heads abaixo são "desgenadas".'
            )
            prinf(head_group)
            continue

        # parse head_row for each head_row
        head_group_chunks = pd.np.array_split(head_group, mp.cpu_count())

        def parse_chunk(df):
            global n_cpu
            tqdm.pandas(position=parse_chunk.bar_pos + 1)
            parse_chunk.bar_pos += 1
            print(parse_chunk.bar_pos, '****', n_cpu)
            parse_chunk.bar_pos %= n_cpu
            return df.progress_apply(parse_head_row,
                                     axis=1,
Example #6
0
# Extract each gene's id from the GFF attributes column.
genes['id'] = parse_gff_attributes(genes.attributes).index
# Group heads and genes per contig for the per-contig matching loop below.
head_groups = heads.groupby(COLS_TO_GROUP)
gene_groups = genes.groupby(COLS_TO_GROUP)

if __name__ == '__main__':
    # write header
    outfile.write('\t'.join(['head_id', 'gene_id', 'flag', 'distance']) + '\n')
    print('Iterate for each contig and for each head in contig.')

    for head_group_name, head_group in tqdm(head_groups):
        try:
            gene_group = gene_groups.get_group(head_group_name)

        except KeyError:
            prinf(
                'Não há nenhum gene no cromossomo. As heads abaixo são "desgenadas".'
            )
            prinf(head_group)
            continue

        for _, head_row in tqdm(list(head_group.iterrows())):

            for _, gene_row in gene_group.iterrows():
                if overlaps((gene_row.start, gene_row.end),
                            (head_row.start, head_row.end)):
                    flag = 'olap'
                    chosen_gene_id = gene_row.id
                    distance = 0
                    break

            # if none overlaps
Example #7
0
def main():
    """Relate each head to its nearest gene and save one relation per head.

    Loads head and gene GFF annotations, groups both by contig and, for each
    head, writes a line with: head id, chosen gene id, a flag ('olap' =
    overlap, 'gh' = gene before head, 'hg' = head before gene) and the
    distance (0 on overlap).
    """
    heads = pd.read_table(head_annotations_path,
                          names=GFF3_COLUMNS,
                          usecols=GFF_COLS_SUBSET)
    genes = pd.read_table(gene_annotations_path,
                          names=GFF3_COLUMNS,
                          usecols=GFF_COLS_SUBSET)
    heads['id'] = parse_gff_attributes(heads.attributes).index
    genes['id'] = parse_gff_attributes(genes.attributes).index
    head_groups = heads.groupby(COLS_TO_GROUP)
    gene_groups = genes.groupby(COLS_TO_GROUP)

    outfile = safe_open(outpath, exist_ok=False)
    # write header
    outfile.write('\t'.join(['head_id', 'gene_id', 'flag', 'distance']) + '\n')
    print('Iterate for each contig and for each head in contig.')

    for head_group_name, head_group in tqdm(head_groups):
        try:
            gene_group = gene_groups.get_group(head_group_name)

        except KeyError:
            # Contig has heads but no gene at all: nothing to relate.
            prinf(
                'Não há nenhum gene no cromossomo. As heads abaixo são "desgenadas".'
            )
            prinf(head_group)
            continue

        for _, head_row in tqdm(list(head_group.iterrows())):

            # First look for a gene overlapping the head.
            for _, gene_row in gene_group.iterrows():
                if overlaps((gene_row.start, gene_row.end),
                            (head_row.start, head_row.end)):
                    flag = 'olap'
                    chosen_gene_id = gene_row.id
                    distance = 0
                    break

            # if none overlaps
            else:
                # 'b'/'f' stand for backfill/forward-fill lookups.
                distance = 9e99  # sentinel "infinity"; create Inf() class for that?
                for meth in ['b', 'f']:
                    fill_back = meth == 'b'
                    gene_pos = ['end', 'start'][fill_back]
                    head_pos = ['end', 'start'][not fill_back]

                    if gene_group[gene_pos].duplicated().sum():
                        raise pd.core.indexes.base.InvalidIndexError(
                            'You appear to have duplicated gene positions. '
                            'Indexes won\'t work properly.')

                    # Bug fix: get_loc returns a POSITION in the sorted index,
                    # so the row must come from the equally sorted frame —
                    # indexing the unsorted gene_group picks the wrong gene.
                    sorted_group = gene_group.sort_values(gene_pos)
                    indexed_group = sorted_group.set_index(gene_pos)

                    try:
                        near_gene_index = indexed_group.index.get_loc(
                            head_row[head_pos], meth + 'fill')
                        gene_row = sorted_group.iloc[near_gene_index]
                        new_distance = abs(gene_row[gene_pos] -
                                           head_row[head_pos])

                        if new_distance < distance:
                            chosen_gene_id = gene_row.id
                            flag = ["gh", "hg"][fill_back]
                            distance = new_distance

                    except KeyError:
                        # Bug fix: the old message interpolated `flag`, which
                        # is unbound when the very first lookup fails; report
                        # the search direction (`meth`) instead.
                        prinf(
                            f'\nNão há gene {meth.upper()} de {head_row.id}. (B=atrás, F=à frente)\n'
                        )

            outfile.write(
                '\t'.join([head_row.id, chosen_gene_id, flag,
                           str(distance)]) + '\n')

    print(f'\nConcluído. Relações salvas em {str(outpath)}.')

    outfile.close()
Example #8
0
def main():
    """Extract HEAD_LEN-bp 'heads' past the 3' end of each Perere-3 copy.

    Writes three outputs: a GFF with the head annotations, a FASTA with the
    head sequences and a table of mother-copy lengths. When `u.plot_flag` is
    set, also plots per-position base histograms of the collected heads.
    """
    u.log(f'{__file__}: Generating heads of {HEAD_LEN} bp.')
    truncated_count = 0
    #======================== READ INPUTS ========================#
    heads_annotations_file = u.safe_open(heads_annotations_path,
                                         exist_ok=False)
    heads_outfile = u.safe_open(heads_outpath, exist_ok=False)
    motherlength_outfile = u.safe_open(motherlength_path, exist_ok=False)

    print('Lendo alinhamentos filtrados do Perere3...', end=' ')
    filtered_perere3_vs_genoma = read_table(inpath)
    print(f"'{inpath}' lido.")

    print('Lendo genoma de S. mansoni...', end=' ')
    genomedict = to_dict(parse(str(u.genome_path), 'fasta'))
    print('Dicionário criado.')

    #======================== GET HEADS ========================#
    print('Searching for Perere-3 copies in S. mansoni\'s genome...')

    # Reference length of Perere-3: concatenate the FASTA body lines.
    with (u.pardir / 'seqs/perere3.fa').open() as per_file:
        perere_len = len(''.join([l.strip()
                                  for l in per_file.readlines()][1:]))

    heads = []

    for index, row in filtered_perere3_vs_genoma.iterrows():

        # Discard copies without 3' end.
        if abs(row['qend'] - perere_len) < MAX_DISTANCE_FROM_END:
            genome_piece = genomedict[row['saccver']].seq
            plus_sense = row['sstart'] < row['send']

            if plus_sense:
                head_slice = slice(row['send'], row['send'] + GTAA_WINDOW_LEN)
                proto_head = genome_piece[head_slice]

                if u.verbose:
                    prefix = genome_piece[head_slice.start -
                                          PREFIX_LEN:head_slice.start]

            else:
                # Minus strand: take the upstream window, reverse-complemented.
                head_slice = slice(row['send'] - GTAA_WINDOW_LEN - 1,
                                   row['send'] - 1)
                proto_head = genome_piece[head_slice].reverse_complement()

                if u.verbose:
                    prefix = genome_piece[head_slice.stop:head_slice.stop +
                                          PREFIX_LEN].reverse_complement()

            ###### Some slices come out negative (???) — skip those.
            if head_slice.start < 0 or head_slice.stop < 0:
                u.prinf(f'Head descartada com posições:', head_slice)
                continue

            skip_gtaa = find_gtaa_break(proto_head)
            head = proto_head[skip_gtaa:skip_gtaa + HEAD_LEN]
            # Bug fix: collect the head so the histogram section below has
            # data to plot — previously `heads` stayed empty forever.
            heads.append(str(head))

            #======================== WRITE HEAD TO GFF ========================#

            if plus_sense:
                start_pos = row['send'] + 1 + skip_gtaa
                end_pos = start_pos + HEAD_LEN

            else:
                end_pos = row['send'] - 1 - skip_gtaa
                start_pos = end_pos - HEAD_LEN

            heads_annotations_file.write('\t'.join([
                row['saccver'], 'WormBase_imported', 'gene',
                str(start_pos),
                str(end_pos), '.', ['-', '+'][plus_sense], '.',
                f'gene_id=head{index};motherlength={row["length"]};length={HEAD_LEN}'
            ]) + '\n')

            motherlength_outfile.write(f"head{index}\t{row['length']}\n")

            #======================== WRITE HEAD TO FASTA ========================#
            heads_outfile.write(f'>head{index}\n' + str(head) + '\n')
            #=====================================================================#

            if u.verbose:
                print(
                    ['-', '+'][plus_sense], prefix, proto_head[:skip_gtaa] +
                    ' | ' + head[:30 - skip_gtaa] + '...',
                    f" {len(head)}bp\t{row['pident']:.2f}%\t{row['evalue']:.2e}\t{row['bitscore']:5}\t{row['saccver']}\t{head_slice.start}-{head_slice.stop}"
                )

        else:
            truncated_count += 1

    u.log(
        f'\n{filtered_perere3_vs_genoma.shape[0] - truncated_count} heads written:',
        heads_outpath,
        heads_annotations_path,
        sep='\n\t')
    u.log(f'{filtered_perere3_vs_genoma.shape[0]} alignments considered.')
    u.log(truncated_count, 'heads discarded as truncated.')

    heads_annotations_file.close()
    heads_outfile.close()
    motherlength_outfile.close()

    #======================== PLOT HISTOGRAMS ========================#

    if u.plot_flag:
        # One column per head position; value counts show base composition.
        heads_df = DataFrame.from_dict(dict(enumerate(zip(*heads))))

        for j in range(8):
            plt.figure(figsize=(16, 9))
            for i in range(12):
                plt.subplot(3, 4, i + 1)
                heads_df[i + j * 12].value_counts().plot(kind='bar')

            plt.show()