def setUpModule():
    """Populate the module-level REFERENCE_GENOME and BAM_CACHE fixtures.

    The genome is loaded from 'mock_reference_genome.fa' and sanity-checked by
    comparing the first 50 bases of its 'fake' sequence against a known prefix.

    Raises:
        AssertionError: if the loaded 'fake' sequence does not start with the
            expected bases.
    """
    global REFERENCE_GENOME, BAM_CACHE

    expected_prefix = 'CTCCAAAGAAATTGTAGTTTTCTTCTGGCTTAGAGGTAGATCATCTTGGT'
    REFERENCE_GENOME = load_reference_genome(get_data('mock_reference_genome.fa'))
    observed_prefix = REFERENCE_GENOME['fake'].seq[0:50].upper()
    if expected_prefix != observed_prefix:
        raise AssertionError('fake genome file does not have the expected contents')

    BAM_CACHE = BamCache(get_data('mini_mock_reads_for_events.sorted.bam'))
def setUpModule():
    """Populate the module-level REFERENCE_GENOME fixture.

    The genome is loaded from 'mock_reference_genome.fa' and sanity-checked by
    comparing the first 50 bases of the REF_CHR sequence against a known prefix.

    Raises:
        AssertionError: if the loaded sequence does not start with the
            expected bases.
    """
    global REFERENCE_GENOME

    expected_prefix = 'CTCCAAAGAAATTGTAGTTTTCTTCTGGCTTAGAGGTAGATCATCTTGGT'
    REFERENCE_GENOME = load_reference_genome(get_data('mock_reference_genome.fa'))
    observed_prefix = REFERENCE_GENOME[REF_CHR].seq[0:50].upper()
    if expected_prefix != observed_prefix:
        raise AssertionError('fake genome file does not have the expected contents')
def setUpModule():
    """Silence warnings and populate the module-level REFERENCE_GENOME fixture.

    The genome is loaded from REFERENCE_GENOME_FILE and sanity-checked by
    comparing the first 50 bases of its 'fake' sequence against a known prefix.

    Raises:
        AssertionError: if the loaded 'fake' sequence does not start with the
            expected bases.
    """
    global REFERENCE_GENOME

    # Installed before anything is loaded so the filter is already active
    # while the genome is being read.
    warnings.simplefilter('ignore')

    expected_prefix = 'CTCCAAAGAAATTGTAGTTTTCTTCTGGCTTAGAGGTAGATCATCTTGGT'
    REFERENCE_GENOME = load_reference_genome(REFERENCE_GENOME_FILE)
    observed_prefix = REFERENCE_GENOME['fake'].seq[0:50].upper()
    if expected_prefix != observed_prefix:
        raise AssertionError('fake genome file does not have the expected contents')
def setUpModule():
    """Populate the module-level REFERENCE_GENOME and BAM_CACHE fixtures.

    The genome is loaded from REFERENCE_GENOME_FILE and sanity-checked by
    comparing the first 50 bases of its 'fake' sequence against a known prefix;
    the BAM cache is then opened on BAM_INPUT.

    Raises:
        AssertionError: if the loaded 'fake' sequence does not start with the
            expected bases.
    """
    global REFERENCE_GENOME, BAM_CACHE

    expected_prefix = 'CTCCAAAGAAATTGTAGTTTTCTTCTGGCTTAGAGGTAGATCATCTTGGT'
    REFERENCE_GENOME = load_reference_genome(REFERENCE_GENOME_FILE)
    observed_prefix = REFERENCE_GENOME['fake'].seq[0:50].upper()
    if expected_prefix != observed_prefix:
        raise AssertionError('fake genome file does not have the expected contents')

    BAM_CACHE = BamCache(BAM_INPUT)
def setUpModule():
    """Load the module-level fixtures used by this module.

    Assigns, in this order: ``annotations``, ``reference_genome``,
    ``template_metadata``, ``genome_bam_fh`` and ``trans_bam_fh``. A progress
    line is printed before and after loading.

    ``masking`` is declared global here but is not assigned by this function.
    """
    global annotations, reference_genome, template_metadata
    global genome_bam_fh, trans_bam_fh, masking

    print('setup start')

    # Reference data parsed from files.
    annotations = load_reference_genes(FULL_REFERENCE_ANNOTATIONS_FILE_JSON)
    reference_genome = load_reference_genome(REFERENCE_GENOME_FILE)
    template_metadata = load_templates(TEMPLATE_METADATA_FILE)

    # Alignment file handles; they are left open for the tests to read from.
    genome_bam_fh = pysam.AlignmentFile(FULL_BAM_INPUT)
    trans_bam_fh = pysam.AlignmentFile(TRANSCRIPTOME_BAM_INPUT)

    print('setup loading is complete')
def __init__(self, input_bams, reference_genome, max_event_size=6, buffer=1):
    """Store the reference genome, load the input BAMs and set scan options.

    Args:
        input_bams: passed unchanged to ``self._load_bams``.
        reference_genome: either an already-loaded genome object, or a ``str``
            which is handed to ``load_reference_genome`` (and logged) first.
        max_event_size: stored as ``self.max_event_size``.
        buffer: stored as ``self.buffer``.
    """
    if isinstance(reference_genome, str):
        # A string is treated as something to load rather than a genome.
        log('loading:', reference_genome, time_stamp=True)
        reference_genome = load_reference_genome(reference_genome)
    self.reference_genome = reference_genome

    self._load_bams(input_bams)

    self.bpp_cache = {}
    self.max_event_size = max_event_size
    self.buffer = buffer
def __init__(self, input_bams, reference_genome, max_event_size=6, buffer=1):
    """Store the reference genome, load the input BAMs and set scan options.

    Args:
        input_bams: passed unchanged to ``self._load_bams``.
        reference_genome: either an already-loaded genome object, or a ``str``
            which is handed to ``load_reference_genome`` (and logged) first.
        max_event_size: stored as ``self.max_event_size``.
        buffer: stored as ``self.buffer``.
    """
    if isinstance(reference_genome, str):
        # A string is treated as something to load rather than a genome.
        logger.info(f'loading: {reference_genome}')
        reference_genome = load_reference_genome(reference_genome)
    self.reference_genome = reference_genome

    self._load_bams(input_bams)

    self.bpp_cache = {}
    self.max_event_size = max_event_size
    self.buffer = buffer
def main():
    """Scan a reference genome for tandem repeats and write them to a file.

    Reads the command-line arguments via ``parse_arguments``, loads the genome
    from ``args.input`` and writes to ``args.output``: a block of ``##``
    comment lines, a ``chr/start/end/name`` header, then one tab-delimited row
    per repeat span found.

    For every distinct sequence and every (lower-cased, de-duplicated) repeat
    unit, consecutive back-to-back copies of the unit are merged into a single
    span. A span is kept when its length is at least ``args.min_length`` and
    at least two repeat units long. The reported start is the match offset
    plus one; a leading ``chr`` is stripped from the sequence name.
    """
    args = parse_arguments()

    # An empty repeat unit can never advance the scan below
    # (``str.find('', i) == i`` and the run extension adds 0 each time), so
    # empty strings are dropped here instead of hanging the program.
    repeat_sequences = sorted({s.lower() for s in args.repeat_seq if s})

    log('loading:', args.input)
    reference_genome = load_reference_genome(args.input)
    comments = [
        os.path.basename(__file__),
        'input: {}'.format(args.input),
        'min_length: {}'.format(args.min_length),
        'repeat_seq: {}'.format(', '.join(args.repeat_seq)),
    ]

    log('writing:', args.output)
    with open(args.output, 'w') as fh:
        for comment in comments:
            fh.write('## {}\n'.format(comment))
        fh.write('chr\tstart\tend\tname\n')

        visited = set()
        for chrom, record in sorted(reference_genome.items()):
            if chrom.startswith('chr'):
                chrom = chrom[3:]
            seq = str(record.seq).lower()

            # The same sequence registered under several names is scanned once.
            if seq in visited:
                continue
            visited.add(seq)

            spans = []
            for repseq in repeat_sequences:
                unit = len(repseq)
                log(
                    'finding {}_repeat (min_length: {}), for chr{} (length: {})'.format(
                        repseq, args.min_length, chrom, len(seq)
                    )
                )
                index = 0
                while index < len(seq):
                    next_n = seq.find(repseq, index)
                    if next_n < 0:
                        break
                    # Extend over every back-to-back copy of the unit.
                    index = next_n
                    while seq.startswith(repseq, index):
                        index += unit
                    span = BioInterval(chrom, next_n + 1, index, name='repeat_{}'.format(repseq))
                    if len(span) >= args.min_length and len(span) >= 2 * unit:
                        spans.append(span)

            log('found', len(spans), 'spans', time_stamp=False)
            for span in spans:
                fh.write(
                    '{}\t{}\t{}\t{}\n'.format(
                        span.reference_object, span.start, span.end, span.name
                    )
                )
def set_example_genes():
    """Build a lookup of the example genes keyed by name and by every alias.

    Genes are read from 'example_genes.json' and sequences from
    'example_genes.fa', both under DATA_DIR. A gene whose name has an entry in
    the sequence file gets that sequence attached as ``gene.seq`` (as a str).
    The keys of the resulting mapping are printed before it is returned.

    Returns:
        dict: gene name or alias -> gene object.
    """
    annotations = load_annotations(os.path.join(DATA_DIR, 'example_genes.json'))
    sequences = load_reference_genome(os.path.join(DATA_DIR, 'example_genes.fa'))

    genes_by_name = {}
    every_gene = (g for chrom_genes in annotations.values() for g in chrom_genes)
    for gene in every_gene:
        if gene.name in sequences:
            gene.seq = str(sequences[gene.name].seq)
        genes_by_name[gene.name] = gene
        # Falsy aliases (None / empty) contribute no extra keys.
        for alias in gene.aliases or ():
            genes_by_name[alias] = gene

    print(genes_by_name.keys())
    return genes_by_name
def setUpModule():
    """Populate the module-level genome, BAM cache and read-pair fixtures.

    Sets REFERENCE_GENOME (sanity-checked against a known 50-base prefix of
    its 'fake' sequence), BAM_CACHE, FULL_BAM_CACHE and READS. READS maps each
    query name fetched from 'reference3' 1-8000 in BAM_CACHE to a two-item
    list ``[read1, other]``; supplementary reads create the entry but never
    fill a slot.

    Raises:
        AssertionError: if the loaded 'fake' sequence does not start with the
            expected bases.
    """
    global REFERENCE_GENOME, BAM_CACHE, FULL_BAM_CACHE, READS

    expected_prefix = 'CTCCAAAGAAATTGTAGTTTTCTTCTGGCTTAGAGGTAGATCATCTTGGT'
    REFERENCE_GENOME = load_reference_genome(get_data('mock_reference_genome.fa'))
    observed_prefix = REFERENCE_GENOME['fake'].seq[0:50].upper()
    if expected_prefix != observed_prefix:
        raise AssertionError('fake genome file does not have the expected contents')

    BAM_CACHE = BamCache(get_data('mini_mock_reads_for_events.sorted.bam'))
    FULL_BAM_CACHE = BamCache(get_data('mock_reads_for_events.sorted.bam'))

    READS = {}
    for read in BAM_CACHE.fetch('reference3', 1, 8000):
        mates = READS.setdefault(read.qname, [None, None])
        if read.is_supplementary:
            continue
        # Slot 0 holds read1, slot 1 holds anything else.
        slot = 0 if read.is_read1 else 1
        mates[slot] = read