def save_bin(self):
    if self.args.gtf_bin:
        sys.stderr.write(
            "Error: cannot save a gene models binary because a binary "
            "was already provided as input\n")
        sys.exit(1)
    gene_models = pygenes.GeneModels()
    gene_models.load_ensembl_gtf(self.args.gtf)
    gene_models.save_binary(self.args.outfile + '.gene_models.binary')
def write_output(self):
    gene_models = pygenes.GeneModels()
    if self.args.gtf_bin:
        gene_models.load_binary(self.args.gtf_bin)
    else:
        gene_models.load_ensembl_gtf(self.args.gtf)

    with open(self.args.infile, 'r') as titan_output:
        # Skip comment lines, then write the header with the new annotation column
        while True:
            line = titan_output.readline()
            if line.startswith('#'):
                continue
            header = line.rstrip()
            self.outfile.write("%s\tPygenes(gene_id,gene_name;)\n" % header)
            break

        # Annotate each data line with the genes it overlaps (or fully contains)
        for row in titan_output:
            row = row.rstrip()
            col = row.split('\t')
            if self.args.demix:
                chrom = col[2]
            else:
                chrom = col[1]
            try:
                if self.args.demix:
                    start = int(col[3])
                    end = int(col[4])
                else:
                    start = int(col[2])
                    end = int(col[3])
            except ValueError:
                self.outfile.write(
                    "%s\t%s\n" % (row, "[ERROR - position not of type int()]"))
                continue
            if self.args.is_contained:
                gene_ids = gene_models.find_contained_genes(chrom, start, end)
            else:
                gene_ids = gene_models.find_overlapping_genes(chrom, start, end)
            pygenes_addition = ""
            for gene_id in gene_ids:
                gene_name = gene_models.get_gene(gene_id).name
                pygenes_addition += "%s,%s;" % (gene_id, gene_name)
            self.outfile.write("%s\t%s\n" % (row, pygenes_addition))
    self.outfile.close()
def __init__(self, infile, outfile, gtf_bin=None, gtf=None, is_contained=False):
    self.infile = infile
    self.outfile = outfile
    self.is_contained = is_contained
    if not gtf and not gtf_bin:
        raise InputArgsException('Requires either gtf or gtf_bin files')
    self.gene_models = pygenes.GeneModels()
    if gtf_bin:
        self.gene_models.load_binary(gtf_bin)
    else:
        self.gene_models.load_ensembl_gtf(gtf)
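# Standalone sketch of the pygenes lookup used above, assuming a prebuilt
# Ensembl gene models binary at 'ensembl.gene_models.binary' (hypothetical
# path).  It loads the models once and prints "gene_id,gene_name;" pairs for
# genes overlapping a single region, mirroring the per-row annotation done in
# write_output().
def _example_overlap_lookup(gtf_bin='ensembl.gene_models.binary',
                            chrom='1', start=1000000, end=1100000):
    gene_models = pygenes.GeneModels()
    gene_models.load_binary(gtf_bin)
    for gene_id in gene_models.find_overlapping_genes(chrom, start, end):
        gene_name = gene_models.get_gene(gene_id).name
        print("%s,%s;" % (gene_id, gene_name))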
def tabulate_results(breakpoints_filename, likelihoods_filename, library_ids,
                     genome_fasta, gtf_filename, dgv_filename,
                     breakpoint_table, breakpoint_library_table):
    lib_names = pd.DataFrame(library_ids.items(),
                             columns=['library', 'library_id'])

    breakpoints = pd.read_csv(breakpoints_filename, sep='\t',
                              names=destruct.predict_breaks.breakpoint_fields,
                              converters=converters)
    breakpoints = breakpoints.drop(['breakpoint_id'], axis=1)
    breakpoints = breakpoints.rename(columns={'count': 'num_split'})
    breakpoints.loc[breakpoints['inserted'] == '.', 'inserted'] = ''

    likelihoods = pd.read_csv(likelihoods_filename, sep='\t',
                              names=destruct.predict_breaks.likelihoods_fields,
                              converters=converters)
    likelihoods = likelihoods.drop(['breakpoint_id'], axis=1)

    # Per-library read and unique read counts for each cluster
    breakpoint_reads = (
        likelihoods.groupby(['cluster_id', 'library_id'])
        .size().reset_index())
    breakpoint_reads.columns = ['cluster_id', 'library_id', 'num_reads']

    breakpoint_unique_reads = (
        likelihoods.drop_duplicates(
            ['cluster_id', 'library_id', 'template_length_1', 'template_length_2'])
        .groupby(['cluster_id', 'library_id']).size().reset_index())
    breakpoint_unique_reads.columns = ['cluster_id', 'library_id', 'num_unique_reads']

    breakpoint_library = (
        breakpoint_reads
        .merge(breakpoint_unique_reads)
        .merge(lib_names)
        .drop(['library_id'], axis=1))

    # Aggregate likelihood statistics per cluster
    agg_f = {
        'log_likelihood': np.average,
        'log_cdf': np.average,
        'template_length_1': max,
        'template_length_2': max,
    }
    breakpoint_stats = (
        likelihoods.groupby('cluster_id').agg(agg_f).reset_index())
    breakpoint_stats['template_length_min'] = breakpoint_stats[
        ['template_length_1', 'template_length_2']].min(axis=1)

    breakpoint_counts = (
        likelihoods.groupby('cluster_id').size().reset_index())
    breakpoint_counts.columns = ['cluster_id', 'num_reads']

    breakpoint_unique_counts = (
        likelihoods.drop_duplicates(
            ['cluster_id', 'library_id', 'template_length_1', 'template_length_2'])
        .groupby('cluster_id').size().reset_index())
    breakpoint_unique_counts.columns = ['cluster_id', 'num_unique_reads']

    breakpoints = breakpoints.merge(breakpoint_stats, on='cluster_id', how='inner')
    breakpoints = breakpoints.merge(breakpoint_counts, on='cluster_id', how='inner')
    breakpoints = breakpoints.merge(breakpoint_unique_counts, on='cluster_id', how='inner')

    # Calculate breakpoint type
    def breakpoint_type(row):
        if row['chromosome_1'] != row['chromosome_2']:
            return 'translocation'
        if row['strand_1'] == row['strand_2']:
            return 'inversion'
        positions = sorted([(row['position_{0}'.format(side)],
                             row['strand_{0}'.format(side)]) for side in (1, 2)])
        if positions[0][1] == '+':
            return 'deletion'
        else:
            return 'duplication'

    breakpoints['type'] = breakpoints.apply(breakpoint_type, axis=1)

    # Calculate number inserted at the breakpoint
    def calculate_num_inserted(row):
        if row['inserted'] == '.':
            return 0
        else:
            return len(row['inserted'])

    breakpoints['num_inserted'] = breakpoints.apply(calculate_num_inserted, axis=1)

    # Annotate sequence
    reference_sequences = dict()
    for id, seq in destruct.utils.seq.read_sequences(open(genome_fasta, 'rt')):
        reference_sequences[id] = seq

    breakpoints['sequence'] = breakpoints.apply(
        lambda row: create_sequence(row, reference_sequences), axis=1)

    # Annotate gene information
    gene_models = pygenes.GeneModels()
    gene_models.load_ensembl_gtf(gtf_filename)

    breakpoints = breakpoints.apply(
        lambda row: annotate_genes(row, gene_models), axis=1)

    # Annotate database of genomic variants
    dgv = DGVDatabase(dgv_filename)
    breakpoints['dgv_ids'] = breakpoints.apply(lambda row: query_dgv(row, dgv), axis=1)
    breakpoints = breakpoints.rename(columns={'cluster_id': 'prediction_id'})
    breakpoints.to_csv(breakpoint_table, sep='\t',
                       na_rep='NA', header=True, index=False)

    breakpoint_library = breakpoint_library.rename(
        columns={'cluster_id': 'prediction_id'})
    breakpoint_library.to_csv(breakpoint_library_table, sep='\t',
                              na_rep='NA', header=True, index=False)
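# Invocation sketch with hypothetical file paths and library ids: the function
# reads the destruct breakpoint and likelihood tables, annotates them with
# breakpoint type, sequence, genes, and DGV hits, and writes two tab-separated
# tables keyed by prediction_id.
if __name__ == '__main__':
    tabulate_results(
        breakpoints_filename='breakpoints.tsv',
        likelihoods_filename='likelihoods.tsv',
        library_ids={'tumour': 1, 'normal': 2},
        genome_fasta='genome.fa',
        gtf_filename='ensembl.gtf',
        dgv_filename='dgv.txt',
        breakpoint_table='breakpoint_table.tsv',
        breakpoint_library_table='breakpoint_library_table.tsv')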