def read_scanner_data(handle):
    """
    Helper function that builds a ScannerData object from the fields
    stored at the current position of a file handle.

    Five fields are consumed, in this order: name (string), green PMT
    (int), red PMT (int), scanner version (string) and imaging user
    (string).

    Args:
        handle (file): File handle

    Returns:
        ScannerData
    """
    # Call arguments are evaluated left to right, so the reads below
    # consume the handle in exactly the order the fields are listed.
    return ScannerData(
        read_string(handle),  # name
        read_int(handle),  # pmt_green
        read_int(handle),  # pmt_red
        read_string(handle),  # scanner_version
        read_string(handle),  # imaging_user
    )
def get_base_calls(self):
    """
    Read the base calls section of the GTC file.

    When the ploidy type is 1 the two stored characters of every entry
    are returned unchanged. For any other ploidy type the stored pair is
    combined with the genotype of the same locus: each "A" allele of the
    AB genotype maps to the first stored character, every other allele
    maps to the second one, and "NC"/"NULL" genotypes become "-".

    Returns:
        list(string): The genotype basecalls
        The characters are A, C, G, T, or - for a no-call/null.
        The calls are relative to the top strand.
    """
    try:
        ploidy_type = self.get_ploidy_type()
    except Exception:
        # Deliberate best-effort fallback: when the ploidy type cannot be
        # determined, treat the file as ploidy type 1.
        # NOTE(review): presumably this covers files without a ploidy
        # entry - confirm which exception get_ploidy_type() raises and
        # narrow this further. Catching Exception (instead of a bare
        # except) lets KeyboardInterrupt/SystemExit propagate.
        ploidy_type = 1

    use_stored_calls = ploidy_type == 1
    # Genotypes are only needed when the stored calls must be translated.
    genotypes = None if use_stored_calls else self.get_genotypes()

    with open(self.filename, "rb") as gtc_handle:
        gtc_handle.seek(self.toc_table[GenotypeCalls.__ID_BASE_CALLS])
        num_entries = read_int(gtc_handle)
        result = []
        for idx in range(num_entries):
            # Every entry is two bytes wide regardless of ploidy type.
            byte_string = gtc_handle.read(2).decode()
            if use_stored_calls:
                result.append(byte_string)
                continue
            ab_genotype = code2genotype[genotypes[idx]]
            if ab_genotype in ("NC", "NULL"):
                result.append("-")
            else:
                result.append("".join(
                    byte_string[0] if allele == "A" else byte_string[1]
                    for allele in ab_genotype))
    return result
def __get_generic_array(self, toc_entry, parse_function, item_size, offset, count):
    """
    Internal helper that reads a length-prefixed array from the GTC file.

    The section located through the table of contents starts with an
    integer entry count, followed by the entries themselves.

    Args:
        toc_entry (int): Identifier for entry in table of contents
        parse_function (function): Called with the file handle once per
            entry; must consume and return a single value
        item_size (int): Size (in bytes) of an individual entry, used to
            skip the first `offset` entries
        offset (int): Number of leading entries to skip
        count (int): Maximum number of entries to read (None reads all
            remaining entries)

    Returns:
        list(type): The parsed values (type dependent on parse_function)
    """
    with open(self.filename, "rb") as gtc_handle:
        section_start = self.toc_table[toc_entry]
        gtc_handle.seek(section_start)
        remaining = read_int(gtc_handle) - offset
        to_read = remaining if count is None else min(remaining, count)
        if offset > 0:
            # 4 bytes are skipped for the count prefix that was just read.
            gtc_handle.seek(section_start + 4 + offset * item_size)
        # A non-positive to_read yields an empty list.
        return [parse_function(gtc_handle) for _ in range(to_read)]
def __init__(self, filename, ignore_version=False, check_write_complete=True):
    """
    Constructor

    Validates the file header and loads the table of contents, which maps
    each entry identifier to the offset stored for it in the file.

    Args:
        filename (string): GTC filename
        ignore_version (bool): boolean to ignore automated checks on
            file version, not recommended (default: False)
        check_write_complete (bool): when True, raise if
            is_write_complete() reports that the file is incomplete
            (default: True)

    Returns:
        GenotypeCalls

    Raises:
        Exception: bad format identifier, unsupported file version or
            incomplete file
    """
    self.filename = filename
    with open(self.filename, "rb") as gtc_handle:
        # Compare the raw bytes: decoding first would turn an arbitrary
        # binary (non UTF-8) file into a UnicodeDecodeError instead of
        # the intended format error.
        if gtc_handle.read(3) != b"gtc":
            raise Exception("GTC format error: bad format identifier")

        self.version = read_byte(gtc_handle)
        if self.version not in GenotypeCalls.supported_version and not ignore_version:
            raise Exception("Unsupported GTC File version (" +
                            str(self.version) + ")")

        number_toc_entries = read_int(gtc_handle)

        #
        # Parse the table of contents and map the toc entry
        # to the lookup
        #
        self.toc_table = {}
        for _ in range(number_toc_entries):
            # Each record is 6 bytes: little-endian short identifier
            # followed by an unsigned int offset.
            entry_id, entry_offset = struct.unpack("<hI", gtc_handle.read(6))
            self.toc_table[entry_id] = entry_offset

    if check_write_complete and not self.is_write_complete():
        raise Exception("GTC file is incomplete")
def get_num_intensity_only(self):
    """
    Read the intensity-only SNP count from the GTC file.

    Returns:
        int: The number of intensity only SNPs
    """
    with open(self.filename, "rb") as gtc_handle:
        # The value is stored 12 bytes past the offset recorded for the
        # GC50 entry in the table of contents.
        field_position = self.toc_table[GenotypeCalls.__ID_GC50] + 12
        gtc_handle.seek(field_position)
        num_intensity_only = read_int(gtc_handle)
    return num_intensity_only
def get_num_no_calls(self):
    """
    Read the no-call count from the GTC file.

    Returns:
        int: The number of no calls
    """
    with open(self.filename, "rb") as gtc_handle:
        # The value is stored 8 bytes past the offset recorded for the
        # GC50 entry in the table of contents.
        field_position = self.toc_table[GenotypeCalls.__ID_GC50] + 8
        gtc_handle.seek(field_position)
        num_no_calls = read_int(gtc_handle)
    return num_no_calls
def __parse_locus_version_6(self, handle):
    """
    Helper function to populate this object from a version 6 locus entry.

    Fields are consumed strictly in file order; fields this class does
    not keep are read and discarded so the handle stays aligned.

    Args:
        handle (file): File handle at start of locus entry record

    Returns:
        None

    Raises:
        Exception: Manifest format error
    """
    self.ilmn_id = read_string(handle)
    # The source strand is the second-to-last "_"-separated token of the
    # Illumina ID.
    strand_token = self.ilmn_id.split("_")[-2]
    self.source_strand = SourceStrand.from_string(strand_token)
    self.name = read_string(handle)

    # Skip three strings, four raw bytes, then two more strings.
    for _ in range(3):
        read_string(handle)
    handle.read(4)
    for _ in range(2):
        read_string(handle)

    self.snp = read_string(handle)
    self.chrom = read_string(handle)

    for _ in range(2):
        read_string(handle)

    # The map position is stored as text and converted to an integer.
    self.map_info = int(read_string(handle))

    for _ in range(2):
        read_string(handle)

    self.address_a = read_int(handle)
    self.address_b = read_int(handle)

    # Skip seven strings and three raw bytes ahead of the assay type.
    for _ in range(7):
        read_string(handle)
    handle.read(3)

    self.assay_type = read_byte(handle)
    if self.assay_type not in (0, 1, 2):
        raise Exception(
            "Format error in reading assay type from locus entry")

    # Assay type 0 is only valid together with a zero address B, and a
    # zero address B is only valid together with assay type 0.
    lacks_address_b = self.address_b == 0
    is_assay_type_zero = self.assay_type == 0
    if lacks_address_b != is_assay_type_zero:
        raise Exception(
            "Manifest format error: Assay type is inconsistent with address B"
        )
def __parse_file(self, handle):
    """
    Helper function to initialize this object from a file handle.

    Reads the locus entry version and hands the handle to the parser
    for that version (6, 7 or 8).

    Args:
        handle (file handle): File handle at start of locus entry record

    Returns:
        None

    Raises:
        Exception: unknown locus entry version
    """
    version = read_int(handle)

    # Reject anything that has no dedicated parser before dispatching.
    if version not in (6, 7, 8):
        message = ("Manifest format error: unknown version for locus entry ("
                   + str(version) + ")")
        raise Exception(message)

    if version == 6:
        self.__parse_locus_version_6(handle)
    elif version == 7:
        self.__parse_locus_version_7(handle)
    else:
        self.__parse_locus_version_8(handle)
def __get_generic_array_numpy(self, toc_entry, numpy_type, offset=0, count=None):
    """
    Internal helper that reads a length-prefixed array from the GTC file
    directly into a numpy array.

    The section located through the table of contents starts with an
    integer entry count, followed by the fixed-size entries.

    Args:
        toc_entry (int): Identifier for entry in table of contents
        numpy_type (numpy.dtype): Data type to read into array (anything
            accepted by numpy.dtype)
        offset (int): Offset (in element counts) to start reading
        count (int): Number of entries to read (None will read remaining
            entries)

    Returns:
        numpy.ndarray: Array of dtype numpy_type built with frombuffer
        over the bytes read from the file. It is empty when offset lies
        at or beyond the stored entry count, or when count is not
        positive.
    """
    numpy_type = dtype(numpy_type)
    with open(self.filename, "rb") as gtc_handle:
        section_start = self.toc_table[toc_entry]
        gtc_handle.seek(section_start)
        num_entries = read_int(gtc_handle) - offset
        if count is not None:
            num_entries = min(num_entries, count)
        # A negative size passed to read() means "read until EOF", which
        # would return unrelated bytes from the rest of the file. Clamp
        # to zero so an out-of-range request yields an empty array.
        num_entries = max(num_entries, 0)
        if offset > 0:
            # 4 bytes are skipped for the count prefix that was just read.
            gtc_handle.seek(
                section_start + 4 + offset * numpy_type.itemsize)
        raw = gtc_handle.read(num_entries * numpy_type.itemsize)
        return frombuffer(raw, dtype=numpy_type)
def read_cluster_file(handle):
    """
    Parse an EGT cluster file into a ClusterFile object.

    Args:
        handle (file): EGT cluster file handle

    Returns:
        ClusterFile

    Raises:
        Exception: Incompatible cluster file format
        AssertionError: per-genotype cluster counts disagree with the
            cluster records
    """
    version = read_int(handle)
    if version != 3:
        raise Exception("Cluster file version " + str(version) +
                        " not supported")

    gencall_version = read_string(handle)
    cluster_version = read_string(handle)
    call_version = read_string(handle)
    normalization_version = read_string(handle)
    date_created = read_string(handle)

    # The next byte flags a WGT cluster file; nothing else is handled.
    if read_byte(handle) != 1:
        raise Exception("Only WGT cluster file version supported")

    manifest_name = read_string(handle)
    result = ClusterFile(gencall_version, cluster_version, call_version,
                         normalization_version, date_created, manifest_name)

    data_block_version = read_int(handle)
    if data_block_version not in (8, 9):
        raise Exception("Data block version in cluster file " +
                        str(data_block_version) + " not supported")

    # opa: read only to advance the handle
    read_string(handle)

    num_records = read_int(handle)

    def read_versioned_record(handle):
        # Binds the data block version so the reader takes only a handle.
        return ClusterRecord.read_record(handle, data_block_version)

    cluster_records = ClusterFile.read_array(handle, num_records,
                                             read_versioned_record)
    cluster_scores = ClusterFile.read_array(handle, num_records,
                                            ClusterScore.read_record)

    # genotypes: read only to advance the handle
    ClusterFile.read_array(handle, num_records, read_string)

    loci_names = ClusterFile.read_array(handle, num_records, read_string)
    addresses = ClusterFile.read_array(handle, num_records, read_int)

    # cluster counts: one group of 3 ints per record, matching the number
    # of genotypes (AA, AB, BB)
    cluster_counts = [
        ClusterFile.read_array(handle, 3, read_int)
        for _ in range(num_records)
    ]

    # NOTE: these consistency checks are assert statements, so they are
    # skipped when Python runs with -O.
    for cluster_record, count_record in zip(cluster_records, cluster_counts):
        assert cluster_record.aa_cluster_stats.N == count_record[0]
        assert cluster_record.ab_cluster_stats.N == count_record[1]
        assert cluster_record.bb_cluster_stats.N == count_record[2]

    # Attach the address and score to each record and register it under
    # its locus name.
    per_locus = zip(loci_names, addresses, cluster_records, cluster_scores)
    for locus_name, address, cluster_record, cluster_score in per_locus:
        cluster_record.address = address
        cluster_record.cluster_score = cluster_score
        result.add_record(locus_name, cluster_record)

    return result
def __parse_file(self, manifest_file):
    """
    Helper function to initialize this object from a file.

    Reads the header, the locus names, the normalization IDs and the
    locus entries, then derives a dense normalization lookup index for
    every locus.

    NOTE(review): self.names and self.normalization_ids are appended to
    here, so they are expected to exist (presumably as empty lists
    created by the constructor) before this is called - confirm.

    Args:
        manifest_file (string): Location of BPM (bead pool manifest) file

    Returns:
        None

    Raises:
        Exception: Unsupported or unknown BPM version
        Exception: Manifest format error
    """
    with open(manifest_file, "rb") as manifest_handle:
        # Compare raw bytes so that an arbitrary binary file is reported
        # as an invalid BPM instead of failing with a decoding error.
        if manifest_handle.read(3) != b"BPM":
            raise Exception("Invalid BPM format")

        version = read_byte(manifest_handle)
        if version != 1:
            # version is already a number here; wrapping it in ord()
            # raised a TypeError that hid this message.
            raise Exception("Unknown BPM version (" + str(version) + ")")

        version = read_int(manifest_handle)
        # Strip the 0x1000 flag bit when it is set on the version.
        version_flag = 0x1000
        if version & version_flag == version_flag:
            version = version ^ version_flag
        if version > 5 or version < 3:
            raise Exception("Unsupported BPM version (" + str(version) + ")")

        self.manifest_name = read_string(manifest_handle)
        # Only versions 3 to 5 reach this point, so the control
        # configuration is always read.
        self.control_config = read_string(manifest_handle)

        self.num_loci = read_int(manifest_handle)
        # Skip one 4-byte value per locus, relative to the current position.
        manifest_handle.seek(4 * self.num_loci, 1)

        # Map every locus name to its position in the name table.
        name_lookup = {}
        for idx in range(self.num_loci):
            locus_name = read_string(manifest_handle)
            self.names.append(locus_name)
            name_lookup[locus_name] = idx

        for _ in range(self.num_loci):
            normalization_id = read_byte(manifest_handle)
            if normalization_id >= 100:
                raise Exception(
                    "Manifest format error: read invalid normalization ID")
            self.normalization_ids.append(normalization_id)

        self.assay_types = [0] * self.num_loci
        self.addresses = [0] * self.num_loci
        self.snps = [""] * self.num_loci
        self.chroms = [""] * self.num_loci
        self.map_infos = [0] * self.num_loci
        self.ref_strands = [RefStrand.Unknown] * self.num_loci
        self.source_strands = [SourceStrand.Unknown] * self.num_loci

        # Every locus entry is stored at the index its name has in the
        # name table, not at its position in this loop.
        for _ in range(self.num_loci):
            locus_entry = LocusEntry(manifest_handle)
            locus_idx = name_lookup[locus_entry.name]
            self.assay_types[locus_idx] = locus_entry.assay_type
            self.addresses[locus_idx] = locus_entry.address_a
            self.snps[locus_idx] = locus_entry.snp
            self.chroms[locus_idx] = locus_entry.chrom
            self.map_infos[locus_idx] = locus_entry.map_info
            self.ref_strands[locus_idx] = locus_entry.ref_strand
            self.source_strands[locus_idx] = locus_entry.source_strand

    if len(self.normalization_ids) != len(self.assay_types):
        raise Exception(
            "Manifest format error: read invalid number of assay entries"
        )

    # Fold the assay type into the normalization ID; raw IDs are below
    # 100, so IDs of different assay types cannot collide.
    all_norm_ids = set()
    for locus_idx in range(self.num_loci):
        self.normalization_ids[locus_idx] += 100 * self.assay_types[locus_idx]
        all_norm_ids.add(self.normalization_ids[locus_idx])

    # Rank the distinct combined IDs to get a dense 0-based lookup index.
    lookup_dictionary = {
        normalization_id: rank
        for rank, normalization_id in enumerate(sorted(all_norm_ids))
    }
    self.normalization_lookups = [
        lookup_dictionary[normalization_id]
        for normalization_id in self.normalization_ids
    ]