def merge_frames(insertion_frames, sample_names):
    """Merge per-sample insertion frames into a single frame.

    Each frame is validated, tagged with its sample name in a ``sample``
    column, and has its ``id`` values prefixed with ``<sample>.`` so that
    ids remain unique after merging.

    Args:
        insertion_frames: Iterable of insertion DataFrames (one per sample).
        sample_names: Sample name for each frame, in the same order.

    Returns:
        A single merged DataFrame, formatted via ``Insertion.format_frame``.

    Raises:
        ValueError: If any sample name occurs more than once.
    """

    # Check sample names for duplicates.
    duplicate_samples = [
        s for s, count in Counter(sample_names).items() if count > 1
    ]

    # BUG FIX: the original tested `len(duplicate_samples) > 1`, which
    # silently accepted a single duplicated sample name. Any duplicate
    # must be rejected, since ids are prefixed per sample.
    if duplicate_samples:
        raise ValueError('Duplicate sample names given ({})'.format(
            ', '.join(duplicate_samples)))

    # Merge frames.
    frames = []
    for frame, sample_name in zip(insertion_frames, sample_names):
        # Check if frame is valid.
        Insertion.check_frame(frame)

        # Augment a copy of the frame (avoid mutating the caller's data).
        frame = frame.copy()
        frame['sample'] = sample_name
        frame['id'] = add_prefix(frame['id'], prefix=sample_name + '.')

        frames.append(frame)

    merged = pd.concat(frames, axis=0)
    merged = Insertion.format_frame(merged)

    return merged
def cis_insertions():
    """Return example insertions that carry a CIS annotation in metadata."""

    # 1000 bp upstream of Trp53bp2.
    near_trp53bp2 = Insertion(
        id='INS1',
        chromosome='1',
        position=182408172,
        strand=1,
        support=2,
        metadata=frozendict({'cis_id': 'CIS1'}))

    # Different chromosome.
    other_chrom = Insertion(
        id='INS2',
        chromosome='4',
        position=77843175,
        strand=1,
        support=2,
        metadata=frozendict({'cis_id': 'CIS2'}))

    return [near_trp53bp2, other_chrom]
def insertions():
    """Return example insertions near two genes plus one unrelated insertion.

    Reference locations:
      - Trp53bp2: 1: 182,409,172-182,462,432.
      - Myh9: 15: 77,760,587-77,842,175.
    """

    upstream_trp53bp2 = Insertion(
        id='INS1',
        chromosome='1',
        position=182408172,  # 1000 bp upstream of Trp53bp2.
        strand=1,
        support=2,
        metadata=frozendict())

    downstream_myh9 = Insertion(
        id='INS2',
        chromosome='15',
        position=77758587,  # 2000 bp downstream of Myh9.
        strand=1,
        support=2,
        metadata=frozendict())

    unrelated = Insertion(
        id='INS3',
        chromosome='4',  # Different chromosome.
        position=77843175,
        strand=1,
        support=2,
        metadata=frozendict())

    return [upstream_trp53bp2, downstream_myh9, unrelated]
def main():
    """Main function for pyim-annotate."""

    args = parse_args()

    # Read insertions and annotate them with the selected annotator.
    insertions = Insertion.from_csv(args.insertions, sep='\t')
    annotator = args.caller.from_args(args)
    annotated = list(annotator.annotate(insertions))

    # Convert to a frame and order rows naturally by insertion id.
    frame = Insertion.to_frame(annotated)
    natural_order = order_by_index(frame.index, index_natsorted(frame.id))
    frame = frame.reindex(index=natural_order)

    frame.to_csv(str(args.output), sep='\t', index=False)
def main():
    """Split a merged insertion frame into one output file per sample.

    Reads insertions from ``args.insertions``, optionally subsets to the
    samples given in ``args.samples``, and writes ``<sample>.txt`` files
    into ``args.output_dir``.
    """

    parser = setup_parser()
    args = parser.parse_args()

    # Read frame.
    ins_frame = Insertion.from_csv(args.insertions, sep='\t', as_frame=True)

    # Create output directory if it doesn't exist.
    args.output_dir.mkdir(exist_ok=True, parents=True)

    if args.samples is not None:
        # Subset for samples and convert to categorical.
        # FIX: DataFrame.ix was deprecated in pandas 0.20 and removed in
        # 1.0; label/boolean selection is done with .loc.
        ins_frame = ins_frame.loc[ins_frame['sample'].isin(args.samples)]
        ins_frame['sample'] = pd.Categorical(
            ins_frame['sample'], categories=args.samples)

    # Split and write individual outputs.
    for sample, sample_frame in ins_frame.groupby('sample'):
        if args.remove_prefix:
            # Copy before modifying: groupby yields views of the parent
            # frame, and assigning into them triggers SettingWithCopy
            # warnings (and can silently fail to stick).
            sample_frame = sample_frame.copy()
            sample_frame['id'] = remove_prefix(
                sample_frame['id'], prefix=sample + '.')

        if len(sample_frame) == 0:
            print('WARNING: no insertions found for sample {}'.format(sample))

        sample_path = args.output_dir / '{}.txt'.format(sample)
        sample_frame.to_csv(str(sample_path), sep='\t', index=False)
def main():
    """Main function for pyim-cis: export insertions as a BED track.

    Each insertion becomes a BED interval of ``args.width`` bp centered on
    its position, colored by strand (blue = forward, red = reverse,
    grey = unknown).
    """

    args = parse_args()

    # Read insertions.
    ins_frame = Insertion.from_csv(args.insertions, sep='\t', as_frame=True)

    # Drop any columns if needed.
    if args.drop is not None:
        ins_frame = ins_frame.drop(args.drop, axis=1)
        ins_frame = ins_frame.drop_duplicates()

    # Convert to BED frame.
    half_width = args.width // 2
    start = (ins_frame['position'] - half_width).astype(int)
    end = (ins_frame['position'] + half_width).astype(int)

    # FIX: the original mapped ``{..., np.nan: '.'}``. A NaN dict key is
    # only matched by object identity (NaN != NaN), so NaNs produced by
    # file parsing were not reliably translated. Map the known strands
    # and fill everything unmatched instead — same result, robust to any
    # NaN object.
    strand = ins_frame['strand'].map({1: '+', -1: '-'}).fillna('.')
    color = strand.map({'+': '0,0,255', '-': '255,0,0'}).fillna('60,60,60')

    bed_frame = pd.DataFrame(
        OrderedDict([
            ('chrom', ins_frame['chromosome']),
            ('chromStart', start),
            ('chromEnd', end),
            ('name', ins_frame['id']),
            ('score', ins_frame['support']),
            ('strand', strand),
            ('thickStart', start),
            ('thickEnd', end),
            ('itemRgb', color)
        ])
    )  # yapf: disable

    # Write output.
    bed_frame.to_csv(str(args.output), sep='\t', index=False, header=False)
def _insertions_to_cimpl(self, insertions):
    """Build the input frame expected by CIMPL from a set of insertions."""

    # Convert insertions to frame representation.
    frame = Insertion.to_frame(insertions)

    # Select the required columns and rename them to CIMPL's names.
    renames = {
        'id': 'id',
        'chromosome': 'chr',
        'position': 'location',
        'sample': 'sampleID'
    }
    cimpl_frame = frame[list(renames)].rename(columns=renames)

    # CIMPL expects chromosome names carrying a 'chr' prefix.
    cimpl_frame['chr'] = add_prefix(cimpl_frame['chr'], prefix='chr')

    # Include read depth when available (preferring 'support').
    if 'support' in frame:
        cimpl_frame['contig_depth'] = frame['support']
    elif 'depth_unique' in frame:
        cimpl_frame['contig_depth'] = frame['depth_unique']

    return cimpl_frame
def _to_insertion(ref, pos, strand, ends, id_=None, **kwargs):
    """Construct an Insertion from grouped alignment end positions.

    ``depth`` counts all ends, ``depth_unique`` counts distinct end
    positions; both are stored in the metadata together with any extra
    keyword arguments.
    """

    depth = len(ends)
    depth_unique = len(set(ends))

    # kwargs are merged last, so explicit keyword values take precedence
    # over the computed depths (same merge order as before).
    metadata = toolz.merge({
        'depth': depth,
        'depth_unique': depth_unique
    }, kwargs)

    return Insertion(
        id=id_,
        chromosome=ref,
        position=pos,
        strand=strand,
        support=metadata['depth_unique'],
        metadata=frozendict(metadata))
def run(self, read_path, output_dir, read2_path=None):
    """Run the paired-end pipeline: trim, align, and extract insertions.

    Args:
        read_path: Path to the first mate FASTQ file.
        output_dir: Directory for intermediate and final outputs
            (created if missing).
        read2_path: Path to the second mate FASTQ file; required.

    Raises:
        ValueError: If ``read2_path`` is not given (this pipeline only
            supports paired-end data).
    """

    if read2_path is None:
        raise ValueError('This pipeline requires paired-end data')

    logger = logging.getLogger()

    output_dir.mkdir(exist_ok=True, parents=True)

    # Extract genomic sequences.
    # NOTE: logging.getLogger() never returns None, so the original
    # `if logger is not None` guard was dead code and has been removed.
    logger.info('Extracting genomic sequences')
    logger.info(' %-18s: %s', 'Transposon',
                shorten_path(self._transposon_path))
    logger.info(' %-18s: %s', 'Minimum length', self._min_length)

    # Trim reads and align to reference.
    genomic_path, genomic2_path = self._extract_genomic(
        read_path, read2_path, output_dir, logger=logger)

    alignment_path = self._align(
        genomic_path, genomic2_path, output_dir, logger=logger)

    # Extract insertions from bam file. The context manager replaces the
    # original try/finally and guarantees the file is closed on error.
    with pysam.AlignmentFile(str(alignment_path)) as bam_file:
        insertions = extract_insertions(
            iter(bam_file),
            func=_process_mates,
            paired=True,
            merge_dist=self._merge_distance,
            min_mapq=self._min_mapq,
            min_support=self._min_support,
            logger=logger)

    # Write insertions to output file.
    insertion_path = output_dir / 'insertions.txt'

    ins_frame = Insertion.to_frame(insertions)
    ins_frame.to_csv(str(insertion_path), sep='\t', index=False)