Beispiel #1
0
 def save_bin(self):
     """Parse the input Ensembl GTF and save it as a binary gene-models file.

     Refuses to run when the input was already supplied as a binary
     (``gtf_bin``), since there is no GTF text to convert; in that case an
     error is printed to stderr and the process exits with status 1.
     """
     if self.args.gtf_bin:
         sys.stderr.write(
             "Error: Can't save since the binary provided at input\n")
         # BUG FIX: exit nonzero on the error path (bare sys.exit() exits 0).
         sys.exit(1)
     gene_models = pygenes.GeneModels()
     # BUG FIX: was `args.gtf` (undefined global); read from self.args like
     # the rest of this class.
     gene_models.load_ensembl_gtf(self.args.gtf)
     gene_models.save_binary(self.args.outfile + '.gene_models.binary')
Beispiel #2
0
    def write_output(self):
        """Annotate each TITAN data row with the genes it hits.

        Skips leading ``#`` comment lines, re-emits the header with a
        ``Pygenes(gene_id,gene_name;)`` column appended, then writes every
        data row followed by a ``gene_id,gene_name;`` list of the genes
        contained in (or overlapping) the row's coordinates. Rows whose
        position columns are not integers are passed through with an error
        marker instead. Closes ``self.outfile`` when done.
        """
        gene_models = pygenes.GeneModels()

        # Prefer the pre-built binary gene models when available; parsing
        # the GTF text from scratch is the slow path.
        if self.args.gtf_bin:
            gene_models.load_binary(self.args.gtf_bin)
        else:
            gene_models.load_ensembl_gtf(self.args.gtf)

        # BUG FIX: was `open(args.infile, ...)` — `args` is an undefined
        # global here; the parsed arguments live on self.args.
        with open(self.args.infile, 'r') as titan_output:
            while True:
                line = titan_output.readline()
                if not line:
                    # BUG FIX: readline() returns '' at EOF, so `line[0]`
                    # raised IndexError on an empty or comments-only file.
                    break
                if line.startswith('#'):
                    continue
                header = line.rstrip()
                self.outfile.write("%s\tPygenes(gene_id,gene_name;)\n" %
                                   header)
                break

            # Annotate the data lines that follow the header.
            for row in titan_output:
                row = row.rstrip()
                col = row.split('\t')

                # demix output carries one extra leading column, shifting
                # chrom/start/end right by one.
                if self.args.demix:
                    chrom = col[2]
                else:
                    chrom = col[1]
                try:
                    if self.args.demix:
                        start = int(col[3])
                        end = int(col[4])
                    else:
                        start = int(col[2])
                        end = int(col[3])
                except (IndexError, ValueError):
                    # BUG FIX: narrowed from a bare `except:` so that
                    # KeyboardInterrupt/SystemExit are no longer swallowed;
                    # int() / indexing can only raise these two here.
                    self.outfile.write(
                        "%s\t%s\n" %
                        (row, "[ERROR - position not of type int()]"))
                    continue

                # BUG FIX: was `args.is_contained` (undefined global).
                if self.args.is_contained:
                    gene_ids = gene_models.find_contained_genes(
                        chrom, start, end)
                else:
                    gene_ids = gene_models.find_overlapping_genes(
                        chrom, start, end)

                pygenes_addition = ""
                for gene_id in gene_ids:
                    gene_name = gene_models.get_gene(gene_id).name
                    pygenes_addition += "%s,%s;" % (gene_id, gene_name)
                self.outfile.write("%s\t%s\n" % (row, pygenes_addition))
            self.outfile.close()
Beispiel #3
0
    def __init__(self,
                 infile,
                 outfile,
                 gtf_bin=None,
                 gtf=None,
                 is_contained=False):
        """Store the I/O handles and load gene models.

        Gene models come from the binary file when *gtf_bin* is given,
        otherwise they are parsed from the Ensembl GTF *gtf*.

        Raises:
            InputArgsException: if neither *gtf* nor *gtf_bin* is supplied.
        """
        self.infile = infile
        self.outfile = outfile
        self.is_contained = is_contained

        if not (gtf or gtf_bin):
            raise InputArgsException('Requires either gtf or gtf_bin files')

        self.gene_models = pygenes.GeneModels()
        # Binary models take precedence over parsing the GTF text.
        loader = (self.gene_models.load_binary
                  if gtf_bin else self.gene_models.load_ensembl_gtf)
        loader(gtf_bin or gtf)
Beispiel #4
0
def tabulate_results(breakpoints_filename, likelihoods_filename, library_ids,
                     genome_fasta, gtf_filename, dgv_filename,
                     breakpoint_table, breakpoint_library_table):
    """Aggregate breakpoint predictions and read likelihoods into two TSVs.

    Reads the raw breakpoint and per-read likelihood tables, computes
    per-cluster and per-(cluster, library) read counts and likelihood
    statistics, annotates each breakpoint with its rearrangement type,
    inserted-sequence length, predicted sequence, gene overlaps and DGV
    hits, then writes the breakpoint table and the per-library table.

    Args:
        breakpoints_filename: headerless TSV with
            destruct.predict_breaks.breakpoint_fields columns.
        likelihoods_filename: headerless TSV with
            destruct.predict_breaks.likelihoods_fields columns.
        library_ids: mapping of library name -> library_id.
        genome_fasta: reference genome FASTA path.
        gtf_filename: Ensembl GTF used for gene annotation.
        dgv_filename: Database of Genomic Variants file path.
        breakpoint_table: output path for the breakpoint TSV.
        breakpoint_library_table: output path for the per-library TSV.

    Returns:
        None; both tables are written to disk with NA for missing values.
    """

    # Lookup table mapping numeric library_id back to the library name.
    lib_names = pd.DataFrame(library_ids.items(),
                             columns=['library', 'library_id'])

    # NOTE(review): `converters` is not defined in this function — presumably
    # a module-level dict of per-column converters; verify at module scope.
    breakpoints = pd.read_csv(breakpoints_filename,
                              sep='\t',
                              names=destruct.predict_breaks.breakpoint_fields,
                              converters=converters)
    breakpoints = breakpoints.drop(['breakpoint_id'], axis=1)
    breakpoints = breakpoints.rename(columns={'count': 'num_split'})
    # '.' is the file's placeholder for "no inserted sequence".
    breakpoints.loc[breakpoints['inserted'] == '.', 'inserted'] = ''

    likelihoods = pd.read_csv(likelihoods_filename,
                              sep='\t',
                              names=destruct.predict_breaks.likelihoods_fields,
                              converters=converters)
    likelihoods = likelihoods.drop(['breakpoint_id'], axis=1)

    # Total supporting reads per (cluster, library).
    breakpoint_reads = (likelihoods.groupby(['cluster_id', 'library_id'
                                             ]).size().reset_index())
    breakpoint_reads.columns = ['cluster_id', 'library_id', 'num_reads']

    # Unique supporting reads: duplicates collapsed by template lengths,
    # i.e. reads from the same fragment count once.
    breakpoint_unique_reads = (likelihoods.drop_duplicates([
        'cluster_id', 'library_id', 'template_length_1', 'template_length_2'
    ]).groupby(['cluster_id', 'library_id']).size().reset_index())
    breakpoint_unique_reads.columns = [
        'cluster_id', 'library_id', 'num_unique_reads'
    ]

    # Per-library table: counts joined to library names, numeric id dropped.
    breakpoint_library = (
        breakpoint_reads.merge(breakpoint_unique_reads).merge(lib_names).drop(
            ['library_id'], axis=1))

    # Per-cluster summary: mean likelihoods, max template lengths.
    agg_f = {
        'log_likelihood': np.average,
        'log_cdf': np.average,
        'template_length_1': max,
        'template_length_2': max,
    }

    breakpoint_stats = (
        likelihoods.groupby('cluster_id').agg(agg_f).reset_index())

    # Row-wise minimum of the two max template lengths.
    breakpoint_stats['template_length_min'] = breakpoint_stats[[
        'template_length_1', 'template_length_2'
    ]].min(axis=1)

    # Total and unique read counts per cluster (across all libraries).
    breakpoint_counts = (
        likelihoods.groupby('cluster_id').size().reset_index())
    breakpoint_counts.columns = ['cluster_id', 'num_reads']

    breakpoint_unique_counts = (likelihoods.drop_duplicates(
        ['cluster_id', 'library_id', 'template_length_1',
         'template_length_2']).groupby('cluster_id').size().reset_index())
    breakpoint_unique_counts.columns = ['cluster_id', 'num_unique_reads']

    # Attach all per-cluster statistics to the breakpoint rows.
    breakpoints = breakpoints.merge(breakpoint_stats,
                                    on='cluster_id',
                                    how='inner')
    breakpoints = breakpoints.merge(breakpoint_counts,
                                    on='cluster_id',
                                    how='inner')
    breakpoints = breakpoints.merge(breakpoint_unique_counts,
                                    on='cluster_id',
                                    how='inner')

    # Calculate breakpoint type
    def breakpoint_type(row):
        """Classify a breakpoint row as translocation / inversion /
        deletion / duplication from its chromosomes, strands and
        position ordering."""
        if row['chromosome_1'] != row['chromosome_2']:
            return 'translocation'
        if row['strand_1'] == row['strand_2']:
            return 'inversion'
        # Same chromosome, opposite strands: the strand of the leftmost
        # breakend distinguishes deletion from duplication.
        positions = sorted([(row['position_{0}'.format(side)],
                             row['strand_{0}'.format(side)])
                            for side in (1, 2)])
        if positions[0][1] == '+':
            return 'deletion'
        else:
            return 'duplication'

    breakpoints['type'] = breakpoints.apply(breakpoint_type, axis=1)

    # Calculate number inserted at the breakpoint
    def calculate_num_inserted(row):
        """Length of the sequence inserted at the breakpoint."""
        # NOTE(review): 'inserted' values of '.' were already replaced with
        # '' above, so this branch looks dead — len('') is 0 either way.
        if row['inserted'] == '.':
            return 0
        else:
            return len(row['inserted'])

    breakpoints['num_inserted'] = breakpoints.apply(calculate_num_inserted,
                                                    axis=1)

    # Annotate sequence
    # Load the whole reference into memory, keyed by sequence id.
    reference_sequences = dict()
    for id, seq in destruct.utils.seq.read_sequences(open(genome_fasta, 'rt')):
        reference_sequences[id] = seq

    breakpoints['sequence'] = breakpoints.apply(
        lambda row: create_sequence(row, reference_sequences), axis=1)

    # Annotate gene information
    gene_models = pygenes.GeneModels()
    gene_models.load_ensembl_gtf(gtf_filename)

    breakpoints = breakpoints.apply(
        lambda row: annotate_genes(row, gene_models), axis=1)

    # Annotate database of genomic variants
    dgv = DGVDatabase(dgv_filename)

    breakpoints['dgv_ids'] = breakpoints.apply(lambda row: query_dgv(row, dgv),
                                               axis=1)

    # Expose cluster_id under its public name before writing.
    breakpoints = breakpoints.rename(columns={'cluster_id': 'prediction_id'})

    breakpoints.to_csv(breakpoint_table,
                       sep='\t',
                       na_rep='NA',
                       header=True,
                       index=False)

    breakpoint_library = breakpoint_library.rename(
        columns={'cluster_id': 'prediction_id'})

    breakpoint_library.to_csv(breakpoint_library_table,
                              sep='\t',
                              na_rep='NA',
                              header=True,
                              index=False)