Example #1
0
def merge_frames(insertion_frames, sample_names):
    """Merge per-sample insertion frames into a single frame.

    Each frame is tagged with its sample name in a 'sample' column, and
    insertion ids are prefixed with '<sample>.' to keep them unique
    across samples.

    Args:
        insertion_frames: Iterable of insertion DataFrames, one per sample.
        sample_names: Iterable of sample names, parallel to insertion_frames.

    Returns:
        pandas.DataFrame: The merged, formatted insertion frame.

    Raises:
        ValueError: If sample_names contains any duplicate name.
    """

    # Check sample names for duplicates.
    duplicate_samples = [
        s for s, count in Counter(sample_names).items() if count > 1
    ]

    # BUG FIX: was `len(duplicate_samples) > 1`, which silently accepted the
    # common case of exactly one duplicated sample name. Any duplicate at all
    # must raise, otherwise ids from the two samples collide after prefixing.
    if duplicate_samples:
        raise ValueError('Duplicate sample names given ({})'.format(
            ', '.join(duplicate_samples)))

    # Merge frames.
    frames = []
    for (frame, sample_name) in zip(insertion_frames, sample_names):
        # Check if frame is valid.
        Insertion.check_frame(frame)

        # Augment a copy with the sample name (don't mutate the input).
        frame = frame.copy()
        frame['sample'] = sample_name
        frame['id'] = add_prefix(frame['id'], prefix=sample_name + '.')

        frames.append(frame)

    merged = pd.concat(frames, axis=0)
    merged = Insertion.format_frame(merged)

    return merged
Example #2
0
def cis_insertions():
    """Return example insertions annotated with CIS ids (test fixture)."""
    # 1000 bp upstream of Trp53bp2.
    near_trp53bp2 = Insertion(id='INS1', chromosome='1', position=182408172,
                              strand=1, support=2,
                              metadata=frozendict({'cis_id': 'CIS1'}))
    # Different chromosome.
    other_chrom = Insertion(id='INS2', chromosome='4', position=77843175,
                            strand=1, support=2,
                            metadata=frozendict({'cis_id': 'CIS2'}))
    return [near_trp53bp2, other_chrom]
Example #3
0
def insertions():
    """Return example insertions near known genes (test fixture)."""
    # Trp53bp2 location: 1: 182,409,172-182,462,432.
    # Myh9 location: 15: 77,760,587-77,842,175.
    positions = [
        # 1000 bp upstream of Trp53bp2.
        ('INS1', '1', 182408172),
        # 2000 bp downstream of Myh9.
        ('INS2', '15', 77758587),
        # Different chromosome.
        ('INS3', '4', 77843175),
    ]
    return [
        Insertion(id=id_, chromosome=chrom, position=pos,
                  strand=1, support=2, metadata=frozendict())
        for id_, chrom, pos in positions
    ]
Example #4
0
def main():
    """Main function for pyim-annotate."""
    args = parse_args()

    # Read insertions and run the selected annotator over them.
    ins = Insertion.from_csv(args.insertions, sep='\t')
    annotator = args.caller.from_args(args)
    annotated = list(annotator.annotate(ins))

    # Convert to a frame and natural-sort rows by insertion id.
    frame = Insertion.to_frame(annotated)
    natural_order = order_by_index(frame.index, index_natsorted(frame.id))
    frame = frame.reindex(index=natural_order)

    frame.to_csv(str(args.output), sep='\t', index=False)
Example #5
0
def main():
    """Split a merged insertion frame into one output file per sample."""
    parser = setup_parser()
    args = parser.parse_args()

    # Read frame.
    ins_frame = Insertion.from_csv(args.insertions, sep='\t', as_frame=True)

    # Create output directory if it doesn't exist.
    args.output_dir.mkdir(exist_ok=True, parents=True)

    if args.samples is not None:
        # Subset for samples and convert to categorical.
        # FIX: DataFrame.ix was deprecated in pandas 0.20 and removed in
        # 1.0; boolean row selection is .loc.
        ins_frame = ins_frame.loc[ins_frame['sample'].isin(args.samples)]
        ins_frame['sample'] = pd.Categorical(ins_frame['sample'],
                                             categories=args.samples)

    # Split and write individual outputs.
    for sample, sample_frame in ins_frame.groupby('sample'):
        # Copy the group before assigning a column, to avoid writing into
        # a view of the parent frame (SettingWithCopy).
        sample_frame = sample_frame.copy()

        if args.remove_prefix:
            sample_frame['id'] = remove_prefix(sample_frame['id'],
                                               prefix=sample + '.')

        if len(sample_frame) == 0:
            print('WARNING: no insertions found for sample {}'.format(sample))

        sample_path = args.output_dir / '{}.txt'.format(sample)
        sample_frame.to_csv(str(sample_path), sep='\t', index=False)
Example #6
0
def main():
    """Main function for pyim-cis."""

    args = parse_args()

    # Read insertions.
    ins_frame = Insertion.from_csv(args.insertions, sep='\t', as_frame=True)

    # Drop requested columns, then any duplicate rows that result.
    if args.drop is not None:
        ins_frame = ins_frame.drop(args.drop, axis=1)
        ins_frame = ins_frame.drop_duplicates()

    # Derive the BED fields: a window of `width` bp centered on each
    # insertion, stranded and colored by orientation.
    half_width = args.width // 2
    start = (ins_frame['position'] - half_width).astype(int)
    end = (ins_frame['position'] + half_width).astype(int)
    strand = ins_frame['strand'].map({1: '+', -1: '-', np.nan: '.'})
    color = strand.map({'+': '0,0,255', '-': '255,0,0', '.': '60,60,60'})

    bed_columns = [
        ('chrom', ins_frame['chromosome']),
        ('chromStart', start),
        ('chromEnd', end),
        ('name', ins_frame['id']),
        ('score', ins_frame['support']),
        ('strand', strand),
        ('thickStart', start),
        ('thickEnd', end),
        ('itemRgb', color),
    ]
    bed_frame = pd.DataFrame(OrderedDict(bed_columns))

    # Write output (headerless, BED-style).
    bed_frame.to_csv(str(args.output), sep='\t', index=False, header=False)
Example #7
0
    def _insertions_to_cimpl(self, insertions):
        """Convert insertions into the frame layout expected by CIMPL."""
        ins_frame = Insertion.to_frame(insertions)

        # Map our column names onto CIMPL's expected names.
        column_map = {
            'id': 'id',
            'chromosome': 'chr',
            'position': 'location',
            'sample': 'sampleID'
        }
        selected = ins_frame[list(column_map.keys())]
        cimpl_ins = selected.rename(columns=column_map)

        # CIMPL expects UCSC-style chromosome names ('chr' prefix).
        cimpl_ins['chr'] = add_prefix(cimpl_ins['chr'], prefix='chr')

        # Carry over read depth if present, preferring 'support'.
        if 'support' in ins_frame:
            cimpl_ins['contig_depth'] = ins_frame['support']
        elif 'depth_unique' in ins_frame:
            cimpl_ins['contig_depth'] = ins_frame['depth_unique']

        return cimpl_ins
Example #8
0
def _to_insertion(ref, pos, strand, ends, id_=None, **kwargs):
    """Build an Insertion from a group of read ends at one site.

    Depth counts all supporting ends; unique depth counts distinct end
    positions. Extra keyword arguments are merged into the metadata and,
    as with toolz.merge, override the computed depth fields on conflict.
    """
    meta = {
        'depth': len(ends),
        'depth_unique': len(set(ends)),
        **kwargs,
    }
    return Insertion(
        id=id_,
        chromosome=ref,
        position=pos,
        strand=strand,
        support=meta['depth_unique'],
        metadata=frozendict(meta))
Example #9
0
    def run(self, read_path, output_dir, read2_path=None):
        """Run the paired-end pipeline: trim reads, align, extract insertions.

        Args:
            read_path: Path to the first-mate reads.
            output_dir: Output directory (created if missing); receives
                intermediate files and the final 'insertions.txt'.
            read2_path: Path to the second-mate reads. Required — this
                pipeline only supports paired-end data.

        Raises:
            ValueError: If read2_path is not given.
        """
        if read2_path is None:
            raise ValueError('This pipeline requires paired-end data')

        logger = logging.getLogger()

        output_dir.mkdir(exist_ok=True, parents=True)

        # Extract genomic sequences.
        # NOTE(review): logger is always non-None here (getLogger() returns
        # a logger), so this guard is vacuous — possibly a leftover from a
        # version where logger was a parameter.
        if logger is not None:
            logger.info('Extracting genomic sequences')
            logger.info('  %-18s: %s', 'Transposon',
                        shorten_path(self._transposon_path))
            logger.info('  %-18s: %s', 'Minimum length', self._min_length)

        # Trim reads and align to reference.
        genomic_path, genomic2_path = self._extract_genomic(
            read_path, read2_path, output_dir, logger=logger)
        alignment_path = self._align(
            genomic_path, genomic2_path, output_dir, logger=logger)

        # Extract insertions from bam file.
        bam_file = pysam.AlignmentFile(str(alignment_path))

        # try/finally ensures the BAM handle is closed even if extraction
        # fails part-way through.
        try:
            insertions = extract_insertions(
                iter(bam_file),
                func=_process_mates,
                paired=True,
                merge_dist=self._merge_distance,
                min_mapq=self._min_mapq,
                min_support=self._min_support,
                logger=logger)
        finally:
            bam_file.close()

        # Write insertions to output file.
        insertion_path = output_dir / 'insertions.txt'

        ins_frame = Insertion.to_frame(insertions)
        ins_frame.to_csv(str(insertion_path), sep='\t', index=False)