def overlap_repeats(self):
    """Overlaps event coordinates with repeatmasker, simple repeats.

    Events are grouped by chromosome via ``self.group_by_chr()``.  For
    every group of events, the reference span of the *first* member is
    passed to ``repeat.find_overlaps`` together with
    ``self.repeat_overlaps``.  For each repeat track that reports at least
    one overlap, the overlapping name with the shortest length is recorded
    on every member of the group under the attribute listed below.

    Repeat types that have no entry in the table below are ignored, so an
    unexpected key returned by ``find_overlaps`` can neither raise
    ``NameError`` nor be stored under the attribute of a previously seen
    repeat type.

    Once all chromosomes are processed, ``finish()`` is called on every
    object in ``self.repeat_overlaps``.
    """
    # repeat track name -> event attribute that records the overlap
    attr_by_repeat_type = {
        'simple_repeats': 'within_simple_repeats',
        'segdup': 'within_segdup',
        'rmsk': 'repeatmasker',
    }

    event_groups_by_chr = self.group_by_chr()

    for chrom, events in event_groups_by_chr.items():
        proper_chrom = tools.proper_chrom(chrom, chrom_proper=self.chrom_proper)

        for snv_type, snv_groups in events.items():
            # single pre-formatted argument prints identically under the
            # Python 2 print statement and the Python 3 print function
            print('processing repeat %s' % (snv_type,))

            for snvs in snv_groups:
                # the first member's coordinates stand in for the whole group
                first = snvs[0]
                region = {
                    'chrom': proper_chrom,
                    'start': int(first.ref_start),
                    'end': int(first.ref_end),
                }
                overlaps = repeat.find_overlaps(region, self.repeat_overlaps)
                if not overlaps:
                    continue

                attrs = {}
                for repeat_type, types in overlaps.items():
                    attr = attr_by_repeat_type.get(repeat_type)
                    if attr is None or not types:
                        continue
                    # only report one with shortest name; on ties min()
                    # keeps the first candidate, as a stable sort would
                    attrs[attr] = min(types, key=len)

                if attrs:
                    for snv in snvs:
                        tools.set_attrs(snv, attrs)

    # clears cache
    for repeat_olap in self.repeat_overlaps.values():
        repeat_olap.finish()
def parse(cls, record):
    """Parses tabulated output into object.

    ``record`` is one tab-delimited line.  The field names are
    ``cls.headers`` followed by ``cls.headers_support``; the n-th name is
    paired with column n+1 of the record, so column 0 is not stored.

    The 'spanning_reads' and 'coverage' fields are converted to ``int``
    (or to a list of ``int`` when the column contains commas) unless the
    column is the literal 'na'.  All other fields are kept as strings.

    Returns an ``Event`` built from column 1 with the parsed fields
    applied through ``set_attrs``.  A record with fewer columns than
    field names raises ``IndexError``.
    """
    fields = record.rstrip('\n').split('\t')

    # copy so that the class-level header list is not extended in place
    field_names = cls.headers[:]
    field_names.extend(cls.headers_support)

    parsed = {}
    for position, field_name in enumerate(field_names):
        raw = fields[position + 1]
        if raw == 'na' or field_name not in ('spanning_reads', 'coverage'):
            parsed[field_name] = raw
        elif ',' in raw:
            parsed[field_name] = [int(token) for token in raw.split(',')]
        else:
            parsed[field_name] = int(raw)

    event = Event(fields[1])
    set_attrs(event, parsed)
    return event
def parse_results(self, snv_file, select_types=None, chrom=None):
    """Parses results from single file into SNV objects.

    Each line of ``snv_file`` is split on tabs.  Lines whose first column
    is 'id' are skipped.  For every other line, columns 1 and up are
    named by position from ``SNVCaller.output_headers`` (column 0 is not
    stored), renamed to object attribute names where a conversion exists,
    and type-converted:

    * 'expansion', 'from_end'        -> ``int``
    * 'confirm_contig_region'        -> list split on '-', first two
                                        items converted to ``int``
    * 'at_least_1_read_opposite'     -> ``True`` only for the string 'true'

    A line is dropped when ``select_types`` is given and the line's
    'snv_type' is not in it, or when ``chrom`` is given and differs from
    the line's 'ref'.  Every remaining line becomes an
    ``SNV(method='psl')`` appended to ``self.snvs``.

    The input file is opened in a ``with`` block so the handle is closed
    even if a line fails to parse.
    """
    names = SNVCaller.output_headers

    # conversion between header name and object attribute
    field_name_conversion = {
        'type': 'snv_type',
        'chr': 'ref',
        'chr_start': 'ref_start',
        'chr_end': 'ref_end',
        'ctg': 'var',
        'ctg_len': 'var_len',
        'ctg_start': 'var_start',
        'ctg_end': 'var_end',
        'len': 'snv_len',
        'ref': 'ref_seq',
        'alt': 'var_seq',
        'event_reads': 'nreads_event',
        'contig_reads': 'nreads_contig',
        'genome_reads': 'nreads_genome',
        'gene': 'gene',
        'from_end': 'from_end',
        'ctg_strand': 'query_strand',
    }

    with open(snv_file, 'r') as handle:
        for line in handle:
            cols = line.rstrip('\n').split('\t')
            if cols[0] == 'id':
                continue

            attributes = {}
            for i in range(1, len(cols)):
                # headers without a conversion keep their own name
                name = field_name_conversion.get(names[i], names[i])
                value = cols[i]

                if name in ('expansion', 'from_end'):
                    value = int(value)
                elif name == 'confirm_contig_region':
                    value = value.split('-')
                    value[0] = int(value[0])
                    value[1] = int(value[1])
                elif name == 'at_least_1_read_opposite':
                    value = (value == 'true')

                attributes[name] = value

            if select_types and attributes['snv_type'] not in select_types:
                continue
            if chrom and attributes['ref'] != chrom:
                continue

            snv = SNV(method='psl')
            tools.set_attrs(snv, attributes)
            self.snvs.append(snv)