def __update_file_information(self, fp: IO[bytes] = None):
    """Rebuild the list of file entries from the container's entry table.

    Each entry of the table (which starts at offset 0x10) is 0x18 bytes:
    an unsigned 64-bit data offset relative to the body, an unsigned
    64-bit data size, an unsigned 32-bit string-table offset of the
    filename, and 4 separator bytes. All integers are little-endian.

    The result is stored as a list of
    ``(filename, data_size, absolute_data_offset)`` tuples.

    Params:
        fp: IO[bytes] = file object opened in a binary read mode ('rb')

    Exceptions:
        ValueError -> fp is missing or was not opened in 'rb' mode

    The position of ``fp`` at the time of the call is restored before
    returning, even when parsing fails.
    """
    if fp is None or 'rb' not in fp.mode:
        self.log(
            "err",
            "invalid file mode, need 'rb', got '%s'" % getattr(fp, "mode", None)
        )
        raise ValueError("Invalid file mode")

    # store current position
    old_pos = fp.tell()
    try:
        self.__file_information = []
        for n in range(self.__number_of_file_entries):
            # Seek to every entry explicitly so that parsing does not depend
            # on where __read_filename leaves the file position.
            fp.seek(0x10 + 0x18 * n)
            # data offset (rel. to body), data size, string table offset;
            # the trailing "4x" skips the 4 separator bytes.
            data_offset, data_size, string_offset = upk("<QQI4x", fp.read(0x18))
            self.__file_information.append((
                self.__read_filename(fp, string_offset),  # filename as string
                data_size,
                data_offset + self.__body_offset,  # convert to absolute offset
            ))
    finally:
        # restore fp position, also on failure
        fp.seek(old_pos)
def unpack(fmt, f):
    """Read and unpack one struct record from a file handle.

    Reads exactly as many bytes as the format string describes and unpacks
    them. Any string element of the result has leading and trailing
    whitespace stripped.

    Inputs:
       fmt - format string for struct.unpack
       f   - file handle

    Returns: tuple of read variables.
    """
    ans = upk(fmt, f.read(csize(fmt)))
    # On Python 3 the "s"/"p" formats yield bytes rather than str, so both
    # types must be checked or the stripping silently never happens.
    return tuple(
        an.strip() if isinstance(an, (bytes, str)) else an
        for an in ans
    )
def unpack(fmt, f):
    """Read and unpack one struct record from a file handle.

    Reads the number of bytes required by the format string and unpacks
    them. String elements of the result are stripped of surrounding
    whitespace.

    Inputs:
       fmt - format string for struct.unpack
       f   - file handle

    Returns: tuple of read variables.
    """
    raw = f.read(csize(fmt))
    values = upk(fmt, raw)
    # struct returns bytes for "s"/"p" on Python 3 and str on Python 2;
    # accept both so the stripping works on either interpreter.
    return tuple(
        value.strip() if isinstance(value, (bytes, str)) else value
        for value in values
    )
def read_motif_total_num(motif_fhd, species):
    """Return the total number of motif scan hits, regardless of cutoff.

    Only the header of the binary file and the file size are used. The
    header holds one 128-byte slot per chromosome; the first 4 bytes of
    each slot are a little-endian int32 giving the file position at which
    that chromosome's hits start. Every hit is counted as 8 bytes.

    motif_fhd : a seekable file handler for the binary motif scan result
    species   : must be "mm8" for mouse or "hg18" for human

    Raises Exception if the species is not supported.
    """
    if species == "hg18":
        autosome_count = 22
    elif species == "mm8":
        autosome_count = 19
    else:
        raise Exception("Only hg18/mm8 supported!")
    # An explicit, ordered sequence: dict.keys() is neither ordered on
    # Python 2 nor subscriptable on Python 3.
    chromosomes = ["chr%d" % i for i in range(1, autosome_count + 1)]
    chromosomes += ["chrX", "chrY"]

    # unpack the start position of every chromosome
    motif_fhd.seek(0)
    boundaries = []
    for _ in chromosomes:
        boundaries.append(upk("<i", motif_fhd.read(4))[0])
        motif_fhd.seek(124, 1)  # skip the rest of the 128-byte slot

    # the end of the file closes the last chromosome's section
    motif_fhd.seek(0, 2)
    boundaries.append(motif_fhd.tell())

    # Floor division keeps the counts integral on Python 3 as well.
    total_motif_hits = 0
    for begin, end in zip(boundaries, boundaries[1:]):
        total_motif_hits += (end - begin) // 8
    return total_motif_hits
def read_u64(f, off):
    """Return the unsigned little-endian 64-bit integer at offset ``off``.

    The 8 raw bytes are obtained through ``read_at(f, off, 8)``, which is
    defined elsewhere in the file.
    """
    raw = read_at(f, off, 8)
    (value,) = upk('<Q', raw)
    return value
def read_u48(f, off):
    """Return the unsigned little-endian 48-bit integer at offset ``off``.

    The 6 raw bytes come from ``read_at(f, off, 6)`` (defined elsewhere in
    the file) and are decoded as a low 16-bit part followed by a high
    32-bit part.
    """
    low, high = upk('<HI', read_at(f, off, 6))
    return (high << 16) + low
def read_u32(f, off):
    """Return the unsigned little-endian 32-bit integer at offset ``off``.

    The 4 raw bytes are obtained through ``read_at(f, off, 4)``, which is
    defined elsewhere in the file.
    """
    raw = read_at(f, off, 4)
    (value,) = upk('<I', raw)
    return value
def read_u16(f, off):
    """Return the unsigned little-endian 16-bit integer at offset ``off``.

    The 2 raw bytes are obtained through ``read_at(f, off, 2)``, which is
    defined elsewhere in the file.
    """
    raw = read_at(f, off, 2)
    (value,) = upk('<H', raw)
    return value
def read_u8(f, off):
    """Return the unsigned 8-bit integer at offset ``off``.

    The single raw byte is obtained through ``read_at(f, off, 1)``, which
    is defined elsewhere in the file.
    """
    raw = read_at(f, off, 1)
    (value,) = upk('<B', raw)
    return value
def read_motif2(motif_fhd, species, cutoff=0):
    """Read motif scan result, and return a FWTrackI object containing
    the motif locations.

    * The whole file is loaded into memory first. If the motif scan data
    file is not big, this may be faster than read_motif().

    The header holds one 128-byte slot per chromosome; the first 4 bytes
    of each slot are a little-endian int32 giving the position at which
    that chromosome's hits start. Every hit is 8 bytes: a little-endian
    int32 location followed by a float32 score. A negative score marks
    strand 1, otherwise the strand is 0; the absolute score is compared
    against the cutoff.

    motif_fhd : a file handler for binary motif scan result
    species   : must be "mm8" for mouse or "hg18" for human
    cutoff    : cutoff for the motif scan score (hits with a score
                strictly greater than the cutoff are kept)

    Raises Exception if the species is not supported.
    """
    motif_range_list = FWTrackI(fw=0)
    if species == "hg18":
        autosome_count = 22
    elif species == "mm8":
        autosome_count = 19
    else:
        raise Exception("Only hg18/mm8 supported!")
    chromosomes = ["chr%d" % i for i in range(1, autosome_count + 1)]
    chromosomes += ["chrX", "chrY"]

    motif_fhd.seek(0)
    data = motif_fhd.read()

    # unpack the start position of every chromosome (one 128-byte slot each)
    starts = [
        upk("<i", data[slot:slot + 4])[0]
        for slot in range(0, 128 * len(chromosomes), 128)
    ]

    # Number of hits per chromosome; the end of the data closes the last
    # section. Floor division keeps the counts usable by range() on
    # Python 3 as well.
    boundaries = starts + [len(data)]
    hit_counts = [
        (end - begin) // 8
        for begin, end in zip(boundaries, boundaries[1:])
    ]
    total_motif_hits = sum(hit_counts)

    # read and write
    read_motif_hits = 0
    for chromosome, p, hit_count in zip(chromosomes, starts, hit_counts):
        for _ in range(hit_count):
            read_motif_hits += 1
            if LOG:
                portion = float(read_motif_hits) / total_motif_hits
                sys.stdout.write(
                    "\r %.1f%% %s" % (portion * 100, "#" * int(portion * 50))
                )
                sys.stdout.flush()
            loc, score = upk("<if", data[p:p + 8])
            p += 8
            if score < 0:
                strand = 1
                score = -score
            else:
                strand = 0
            if score > cutoff:
                motif_range_list.add_loc(chromosome, loc - 1, strand)
    if LOG:
        sys.stdout.write("\n")
    data = None  # drop the file contents before merging
    motif_range_list.merge_overlap()
    return motif_range_list
def read_u48(f, off):
    """Return the unsigned little-endian 48-bit integer at offset ``off``.

    The 6 raw bytes come from ``read_at(f, off, 6)`` (defined elsewhere in
    the file) and are decoded as a low 32-bit part followed by a high
    16-bit part. Both parts are combined; returning only the first
    unpacked element would truncate every value of 2**32 or more.
    """
    low, high = upk('<IH', read_at(f, off, 6))
    return (high << 32) | low
def read_motif(motif_fhd, species, cutoff=0):
    """Read motif scan result, and return a TabIO.FWTrackI object
    containing the motif locations.

    The header holds one 128-byte slot per chromosome; the first 4 bytes
    of each slot are a little-endian int32 giving the file position at
    which that chromosome's hits start. A negative score marks strand -1,
    otherwise the strand is 1; the absolute score is compared against the
    cutoff.

    motif_fhd : a seekable file handler for binary motif scan result
    species   : must be "mm8" for mouse or "hg18" for human
    cutoff    : cutoff for the motif scan score (hits with a score
                strictly greater than the cutoff are kept)

    Raises Exception if the species is not supported.
    """
    motif_range_list = FWTrackI(fw=0)
    if species == "hg18":
        autosome_count = 22
    elif species == "mm8":
        autosome_count = 19
    else:
        raise Exception("Only hg18/mm8 supported!")
    # An explicit, ordered sequence: dict.keys() is neither ordered on
    # Python 2 nor subscriptable on Python 3.
    # Assumes the header stores its slots in the order chr1..chrN, chrX,
    # chrY -- TODO confirm against the program that writes the file.
    chromosomes = ["chr%d" % i for i in range(1, autosome_count + 1)]
    chromosomes += ["chrX", "chrY"]

    # unpack the start position of every chromosome
    motif_fhd.seek(0)
    starts = []
    for _ in chromosomes:
        starts.append(upk("<i", motif_fhd.read(4))[0])
        motif_fhd.seek(124, 1)  # skip the rest of the 128-byte slot

    # Number of hits per chromosome; the end of the file closes the last
    # section. Floor division keeps the counts usable by range() on
    # Python 3 as well.
    motif_fhd.seek(0, 2)
    boundaries = starts + [motif_fhd.tell()]
    hit_counts = [
        (end - begin) // 8
        for begin, end in zip(boundaries, boundaries[1:])
    ]
    total_motif_hits = sum(hit_counts)

    # read and write
    read_motif_hits = 0
    for chromosome, start, hit_count in zip(chromosomes, starts, hit_counts):
        motif_fhd.seek(start, 0)
        for _ in range(hit_count):
            read_motif_hits += 1
            if LOG:
                portion = float(read_motif_hits) / total_motif_hits
                sys.stdout.write(
                    "\r%.1f%% %s" % (portion * 100, "#" * int(portion * 50))
                )
                sys.stdout.flush()
            loc = upk("<i", motif_fhd.read(4))[0]
            score = upk("<f", motif_fhd.read(4))[0]
            # NOTE(review): this consumes 12 bytes per hit although the hit
            # counts above are derived from 8-byte records -- confirm the
            # record layout.
            motif_fhd.read(4)
            if score < 0:
                strand = -1
                score = -score
            else:
                strand = 1
            if score > cutoff:
                motif_range_list.add_range(
                    chromosome,
                    RangeI(start=loc - 1, end=loc, strand=strand)
                )
    motif_range_list.merge_overlap()
    return motif_range_list
def __init__(self, path: str, logger: Callable[[str, str], any] = None):
    """PFS0File constructor

    Opens a pfs0 container file and reads its header.

    Params:
        path: str = path to file
        logger: Callable[[str,str],any] = logger function

    Exceptions:
        FileNotFoundError -> file at 'path' was not found

    logger:
        Any function that takes in two strings (log_level, message)
        where log_level is one of:
            "info"
            "warn"
            "err"
        When omitted, the module-level LOGGER is used.

    If the header cannot be read, the error is logged, the file is
    closed again and ``self.opened`` stays False; no exception is raised
    in that case. On success ``self.fp`` holds the open file (positioned
    at offset 0) and ``self.opened`` is True.
    """
    self.opened = False

    # set the logger function
    self.log = logger if logger is not None else LOGGER

    # check if the supplied file-path exists
    if not os.path.isfile(path):
        self.log("err", "File '%s' not found! " % path)
        raise FileNotFoundError("The file '%s' could not be opened!" % path)

    fp = None
    try:
        # read file header
        #   4 bytes file magic
        #   4 bytes uint LE no. of file entries
        #   4 bytes uint LE size of string table in bytes
        #   4 bytes seperator \x00\x00\x00\x00
        #   0x18 * no. of file entries:
        #       8 bytes unsigned long long LE data offset (rel. to body)
        #       8 bytes unsigned long long LE data size
        #       4 bytes uint string table offset
        #   var. bytes of NULL-terminated strings
        self.log("info", "Opening file '%s' for reading..." % path)
        fp = open(path, 'rb')

        # file magic
        magic = fp.read(4)
        if magic != FILE_MAGIC:
            # errors="replace" keeps arbitrary bytes from raising a
            # UnicodeDecodeError while the message is being built
            err_msg = "Invalid file magic, expected 'PFS0', got: '%s'" % (
                magic.decode(errors="replace")
            )
            self.log("err", err_msg)
            raise ValueError(err_msg)

        # number of file entries and size of the string table (filenames)
        self.__number_of_file_entries = upk("<I", fp.read(4))[0]
        self.__string_table_size = upk("<I", fp.read(4))[0]
        fp.read(4)  # skip seperator (4 zero-bytes)

        # 0x10 = current position, 0x18 = size of 1 (one) file entry
        self.__string_table_offset = 0x10 + 0x18 * self.__number_of_file_entries

        # get offset of file body
        self.__body_offset = self.__string_table_offset + self.__string_table_size

        # file information
        self.__update_file_information(fp)

        fp.seek(0)
    except Exception as err:
        # Do not leak the handle when the header turns out to be unreadable.
        if fp is not None:
            fp.close()
        self.log("err", "Could not read header:\n\n%s" % (str(err) or repr(err)))
        return

    self.fp = fp
    self.opened = True
def read_u48(fp, off):
    """Return the unsigned little-endian 48-bit integer at offset ``off``.

    The 6 raw bytes come from ``read_at(fp, off, 6)`` (defined elsewhere
    in the file) and are decoded as a low 16-bit part followed by a high
    32-bit part.
    """
    raw = read_at(fp, off, 6)
    low, high = upk('<HI', raw)
    return (high << 16) | low
def read_motif(motif_fhd, species, cutoff=0):
    """Read motif scan result, and return a TabIO.FWTrackI object
    containing the motif locations.

    The header holds one 128-byte slot per chromosome; the first 4 bytes
    of each slot are a little-endian int32 giving the file position at
    which that chromosome's hits start. A negative score marks strand -1,
    otherwise the strand is 1; the absolute score is compared against the
    cutoff.

    motif_fhd : a seekable file handler for binary motif scan result
    species   : must be "mm8" for mouse or "hg18" for human
    cutoff    : cutoff for the motif scan score (hits with a score
                strictly greater than the cutoff are kept)

    Raises Exception if the species is not supported.
    """
    motif_range_list = FWTrackI(fw=0)
    if species == "hg18":
        autosome_count = 22
    elif species == "mm8":
        autosome_count = 19
    else:
        raise Exception("Only hg18/mm8 supported!")
    # An explicit, ordered sequence: dict.keys() is neither ordered on
    # Python 2 nor subscriptable on Python 3.
    # Assumes the header stores its slots in the order chr1..chrN, chrX,
    # chrY -- TODO confirm against the program that writes the file.
    chromosomes = ["chr%d" % i for i in range(1, autosome_count + 1)]
    chromosomes += ["chrX", "chrY"]

    # unpack the start position of every chromosome
    motif_fhd.seek(0)
    starts = []
    for _ in chromosomes:
        starts.append(upk("<i", motif_fhd.read(4))[0])
        motif_fhd.seek(124, 1)  # skip the rest of the 128-byte slot

    # Number of hits per chromosome; the end of the file closes the last
    # section. Floor division keeps the counts usable by range() on
    # Python 3 as well.
    motif_fhd.seek(0, 2)
    boundaries = starts + [motif_fhd.tell()]
    hit_counts = [
        (end - begin) // 8
        for begin, end in zip(boundaries, boundaries[1:])
    ]
    total_motif_hits = sum(hit_counts)

    # read and write
    read_motif_hits = 0
    for chromosome, start, hit_count in zip(chromosomes, starts, hit_counts):
        motif_fhd.seek(start, 0)
        for _ in range(hit_count):
            read_motif_hits += 1
            if LOG:
                portion = float(read_motif_hits) / total_motif_hits
                sys.stdout.write(
                    "\r%.1f%% %s" % (portion * 100, "#" * int(portion * 50))
                )
                sys.stdout.flush()
            loc = upk("<i", motif_fhd.read(4))[0]
            score = upk("<f", motif_fhd.read(4))[0]
            # NOTE(review): this consumes 12 bytes per hit although the hit
            # counts above are derived from 8-byte records -- confirm the
            # record layout.
            motif_fhd.read(4)
            if score < 0:
                strand = -1
                score = -score
            else:
                strand = 1
            if score > cutoff:
                motif_range_list.add_range(
                    chromosome,
                    RangeI(start=loc - 1, end=loc, strand=strand)
                )
    motif_range_list.merge_overlap()
    return motif_range_list