Esempio n. 1
0
    def __update_file_information( self, fp: IO[bytes] = None ):
        """Rebuild the cached list of file entries from the entry table.

        The entry table starts at offset 0x10. Each entry is 0x18 bytes:
        an 8-byte LE data offset (relative to the body), an 8-byte LE data
        size, a 4-byte LE string-table offset and 4 separator bytes.

        The result is stored in ``self.__file_information`` as a list of
        ``(filename, data_size, absolute_data_offset)`` tuples.

        Params:
            fp: IO[bytes] = binary file object opened with mode 'rb';
                when omitted, ``self.fp`` is used

        Exceptions:
            ValueError -> 'rb' is not part of the file object's mode
        """
        if fp is None:
            # The declared default used to crash on ``fp.mode``; fall back
            # to the handle stored on the instance instead.
            fp = self.fp

        if 'rb' not in fp.mode:
            self.log( "err", "invalid file mode, need 'rb', got '%s'" % fp.mode )
            raise ValueError( "Invalid file mode" )

        # store current position
        old_pos = fp.tell()

        self.__file_information = []

        try:
            for n in range( self.__number_of_file_entries ):
                # Seek to every entry explicitly so the loop does not depend
                # on where __read_filename leaves the file position.
                fp.seek( 0x10 + 0x18 * n )

                # data offset (rel. to pfs0 body), data size and string
                # table offset of the filename; the trailing 4 separator
                # bytes of the entry are never needed.
                data_offset, data_size, string_offset = upk( "<QQI", fp.read( 20 ) )

                self.__file_information.append((
                    self.__read_filename( fp, string_offset ), # get filename as string
                    data_size,
                    data_offset + self.__body_offset           # convert to absolute offset
                ))
        finally:
            # restore fp position, even when parsing failed half-way
            fp.seek( old_pos )
Esempio n. 2
0
def unpack(fmt,f):
  """A utility function that reads the appropriate number of bytes for the
  format string passed and, if any of the elements are strings, strips them
  of surrounding whitespace.

  String fields come back from struct as ``bytes`` on Python 3 (and as
  ``str`` on Python 2, where ``bytes is str``), so both types are stripped.

  Inputs:
    fmt - format string for struct.unpack
    f - file handle (opened in binary mode)
  Returns:
    tuple of read variables."""
  ans = upk(fmt, f.read(csize(fmt)))
  return tuple(an.strip() if isinstance(an, (bytes, str)) else an for an in ans)
Esempio n. 3
0
def unpack(fmt, f):
    """Read one struct record from a file handle and tidy its string fields.

    Exactly ``calcsize(fmt)`` bytes are read from ``f`` and unpacked with
    ``fmt``. Any string element has its surrounding whitespace stripped;
    every other element is returned untouched. String fields arrive as
    ``bytes`` on Python 3 (``str`` on Python 2, where ``bytes is str``),
    so both types are handled.

    Inputs:
      fmt - format string for struct.unpack
      f - file handle (opened in binary mode)
    Returns:
      tuple of read variables.
    """
    ans = upk(fmt, f.read(csize(fmt)))
    return tuple(
        an.strip() if isinstance(an, (bytes, str)) else an for an in ans
    )
Esempio n. 4
0
def read_motif_total_num (motif_fhd,species):
    """Only read the header of binary file, return the total number of
    motif scan hits regardless of cutoff.

    The header holds one 128-byte slot per chromosome; the first four
    bytes of a slot are a little-endian signed 32-bit file offset marking
    where that chromosome's hits start. A chromosome's hits run up to the
    next chromosome's start (or the end of the file for the last one) and
    are counted as 8 bytes each.

    motif_fhd : a seekable binary file handler for the motif scan result
    species   : must be "mm8" for mouse or "hg18" for human

    Returns the hit count as an integer.
    Raises Exception for any other species.
    """
    # Number of header slots: autosomes plus chrX and chrY.
    if species == "hg18":
        chromosome_count = 22 + 2
    elif species == "mm8":
        chromosome_count = 19 + 2
    else:
        raise Exception("Only hg18/mm8 supported!")

    # unpack the start pos of every chromosome, in header order
    motif_fhd.seek(0)
    starts = []
    for _ in range(chromosome_count):
        starts.append(upk("<i",motif_fhd.read(4))[0])
        motif_fhd.seek(124,1)          # skip the rest of the 128-byte slot

    # the last chromosome ends where the file ends
    motif_fhd.seek(0,2)
    ends = starts[1:] + [motif_fhd.tell()]

    # Floor division keeps the counts integral on Python 2 and Python 3.
    return sum((end - start) // 8 for start, end in zip(starts, ends))
Esempio n. 5
0
def read_u64(f, off):
    """Decode an unsigned 64-bit little-endian integer.

    The 8 raw bytes are obtained from ``read_at(f, off, 8)``
    (presumably 8 bytes of ``f`` starting at offset ``off``).
    """
    raw = read_at(f, off, 8)
    (value,) = upk('<Q', raw)
    return value
Esempio n. 6
0
def read_u48(f, off):
    """Decode an unsigned 48-bit little-endian integer.

    The 6 raw bytes come from ``read_at(f, off, 6)`` and are split into a
    16-bit low part followed by a 32-bit high part.
    """
    low16, high32 = upk('<HI', read_at(f, off, 6))
    return (high32 << 16) + low16
Esempio n. 7
0
def read_u32(f, off):
    """Decode an unsigned 32-bit little-endian integer.

    The 4 raw bytes are obtained from ``read_at(f, off, 4)``
    (presumably 4 bytes of ``f`` starting at offset ``off``).
    """
    raw = read_at(f, off, 4)
    (value,) = upk('<I', raw)
    return value
Esempio n. 8
0
def read_u16(f, off):
    """Decode an unsigned 16-bit little-endian integer.

    The 2 raw bytes are obtained from ``read_at(f, off, 2)``
    (presumably 2 bytes of ``f`` starting at offset ``off``).
    """
    raw = read_at(f, off, 2)
    (value,) = upk('<H', raw)
    return value
Esempio n. 9
0
def read_u8(f, off):
    """Decode a single unsigned byte.

    The raw byte is obtained from ``read_at(f, off, 1)``
    (presumably the byte of ``f`` at offset ``off``).
    """
    raw = read_at(f, off, 1)
    (value,) = upk('<B', raw)
    return value
Esempio n. 10
0
def read_motif2 (motif_fhd,species,cutoff=0):
    """Read motif scan result, and return a FWTrackI object
    containing the motif locations.

    * If the motif scan data file is not big, use this function to
      load the whole file into memory. It may be faster than
      read_motif().

    File layout as read by this function: one 128-byte header slot per
    chromosome (chr1..chrN, chrX, chrY) whose first four bytes are the
    little-endian int32 offset of that chromosome's first hit, followed by
    8-byte hits (int32 location, float32 score). A negative score is
    recorded as strand 1, a non-negative score as strand 0; the absolute
    score is compared against the cutoff.

    motif_fhd : a file handler for binary motif scan result
    species   : must be "mm8" for mouse or "hg18" for human
    cutoff    : cutoff for the motif scan score; only hits scoring
                strictly above it are kept

    Raises Exception for any other species.
    """
    motif_range_list = FWTrackI(fw=0)
    if species == "hg18":
        autosome_count = 22
    elif species == "mm8":
        autosome_count = 19
    else:
        raise Exception("Only hg18/mm8 supported!")
    chromosomes = ["chr%d" % i for i in range(1, autosome_count + 1)] + ["chrX", "chrY"]

    motif_fhd.seek(0)
    data = motif_fhd.read()

    # unpack the start pos of every chromosome from its header slot
    starts = [upk("<i",data[p:p+4])[0]
              for p in range(0, 128 * len(chromosomes), 128)]

    # calculate number of hits; the last chromosome runs to the end of the
    # data. Floor division keeps the counts usable by range() on Python 3.
    ends = starts[1:] + [len(data)]
    hit_counts = [(end - start) // 8 for start, end in zip(starts, ends)]
    total_motif_hits = sum(hit_counts)

    # read and collect
    read_motif_hits = 0
    for chromosome, p, hit_count in zip(chromosomes, starts, hit_counts):
        for _ in range(hit_count):
            read_motif_hits += 1
            if LOG:
                portion = float(read_motif_hits)/total_motif_hits
                sys.stdout.write("\r  %.1f%% %s" % (portion*100,"#"*int(portion*50)))
                sys.stdout.flush()
            loc, score = upk("<if",data[p:p+8])
            p += 8
            # the sign of the score carries the strand
            if score < 0:
                strand = 1
                score = -score
            else:
                strand = 0
            if score > cutoff:
                motif_range_list.add_loc(chromosome,loc-1,strand)
    if LOG:
        sys.stdout.write("\n")
    data = None                        # drop the file contents before merging
    motif_range_list.merge_overlap()
    return motif_range_list
Esempio n. 11
0
def read_u48(f, off):
    """Decode an unsigned 48-bit little-endian integer.

    The 6 raw bytes come from ``read_at(f, off, 6)`` and are unpacked as a
    32-bit low part followed by a 16-bit high part. Both parts are
    combined; previously the high 16 bits were silently discarded.
    """
    low32, high16 = upk('<IH', read_at(f, off, 6))
    return (high16 << 32) | low32
Esempio n. 12
0
def read_motif (motif_fhd,species,cutoff=0):
    """Read motif scan result, and return a FWTrackI object
    containing the motif locations.

    The header holds one 128-byte slot per chromosome whose first four
    bytes are the little-endian int32 file offset of that chromosome's
    first hit. Slots are assumed to be stored in the order chr1..chrN,
    chrX, chrY (TODO confirm against the file writer). A negative score is
    recorded as strand -1, a non-negative score as strand 1; the absolute
    score is compared against the cutoff.

    motif_fhd : a seekable file handler for binary motif scan result
    species   : must be "mm8" for mouse or "hg18" for human
    cutoff    : cutoff for the motif scan score; only hits scoring
                strictly above it are kept

    Raises Exception for any other species.
    """
    motif_range_list = FWTrackI(fw=0)
    if species == "hg18":
        autosome_count = 22
    elif species == "mm8":
        autosome_count = 19
    else:
        raise Exception("Only hg18/mm8 supported!")
    # An explicit list fixes the header order; dict key order is arbitrary
    # on Python 2 and dict views cannot be indexed on Python 3.
    chromosomes = ["chr%d" % i for i in range(1, autosome_count + 1)] + ["chrX", "chrY"]

    # unpack the start pos
    motif_fhd.seek(0)
    starts = []
    for _ in chromosomes:
        starts.append(upk("<i",motif_fhd.read(4))[0])
        motif_fhd.seek(124,1)          # skip the rest of the 128-byte slot
    motif_fhd.seek(0,2)

    # calculate number of hits; the last chromosome runs to the end of the
    # file. Floor division keeps the counts usable by range() on Python 3.
    ends = starts[1:] + [motif_fhd.tell()]
    hit_counts = [(end - start) // 8 for start, end in zip(starts, ends)]
    total_motif_hits = sum(hit_counts)

    # read and collect
    read_motif_hits = 0
    for chromosome, start, hit_count in zip(chromosomes, starts, hit_counts):
        motif_fhd.seek(start,0)
        for _ in range(hit_count):
            read_motif_hits += 1
            if LOG:
                portion = float(read_motif_hits)/total_motif_hits
                sys.stdout.write("\r%.1f%% %s" % (portion*100,"#"*int(portion*50)))
                sys.stdout.flush()
            loc = upk("<i",motif_fhd.read(4))[0]
            score = upk("<f",motif_fhd.read(4))[0]
            # NOTE(review): 4 more bytes are skipped per hit (12-byte
            # stride) although hits are counted as 8 bytes each above.
            # Kept as in the original; confirm the record layout.
            motif_fhd.read(4)
            # the sign of the score carries the strand
            if score < 0:
                strand = -1
                score = -score
            else:
                strand = 1
            if score > cutoff:
                motif_range_list.add_range(chromosome,RangeI(start=loc-1,end=loc,strand=strand))
    motif_range_list.merge_overlap()
    return motif_range_list
Esempio n. 13
0
    def __init__(self, path: str, logger: Callable[[str, str], any] = None):
        """PFS0File constructor

        Opens a pfs0 container file and reads its header

        Params:
            path: str = path to file
            logger: Callable[[str,str],any] = logger function

        Exceptions:
            FileNotFoundError -> file at 'path' was not found

        Any other failure while reading the header is logged with level
        "err", the file is closed again and the instance is left with
        ``opened == False``.

        logger:
            Any function that takes in two strings (log_level, message)
            where log_level is one of:
                "info"
                "warn"
                "err"
        """
        self.opened = False

        # set the logger function
        self.log = logger if logger is not None else LOGGER

        # check if the supplied file-path exists
        if not os.path.isfile( path ):
            self.log( "err", "File '%s' not found! " % path )
            raise FileNotFoundError( "The file '%s' could not be opened!" % path )

        fp = None
        try:
            # read file header
            # 4 bytes file magic
            # 4 bytes uint LE no. of file entries
            # 4 bytes uint LE size of string table in bytes
            # 4 bytes separator \x00\x00\x00\x00
            # 0x18 * no. of file entries:
            #   8 bytes unsigned long long LE data offset (rel. to body)
            #   8 bytes unsigned long long LE data size
            #   4 bytes uint string table offset
            # var. bytes of NULL-terminated strings

            self.log( "info", "Opening file '%s' for reading..." % path )

            fp = open( path, 'rb' )

            # file magic
            magic = fp.read( 4 )

            if magic != FILE_MAGIC:
                # errors="replace": arbitrary bytes must not make the
                # error report itself fail with UnicodeDecodeError
                err_msg = "Invalid file magic, expected 'PFS0', got: '%s'" % magic.decode( errors="replace" )
                self.log( "err", err_msg )
                raise ValueError( err_msg )

            # sizes and offsets
            self.__number_of_file_entries = upk( "<I", fp.read( 4 ) )[ 0 ] # number of files/file entries in container
            self.__string_table_size      = upk( "<I", fp.read( 4 ) )[ 0 ] # size of string table(filenames) in bytes

            fp.read( 4 ) # skip separator (4 zero-bytes)

            # 0x10 = current position, 0x18 = size of 1 (one) file entry
            self.__string_table_offset    = 0x10 + 0x18 * self.__number_of_file_entries

            # get offset of file body
            self.__body_offset = self.__string_table_offset + self.__string_table_size

            # file information
            self.__update_file_information( fp )

            fp.seek( 0 )
            self.fp = fp
        except Exception as err:
            # do not leak the handle when the header could not be parsed
            if fp is not None:
                fp.close()
            # Parenthesised on purpose: '%' binds tighter than 'or'.
            self.log( "err", "Could not read header:\n\n%s" % ( getattr( err, "message", None ) or str( err ) ) )
            return

        self.opened = True
Esempio n. 14
0
def read_u48(fp, off):
    """Decode an unsigned 48-bit little-endian integer.

    The 6 raw bytes come from ``read_at(fp, off, 6)``; the first two form
    the low 16 bits and the remaining four the high 32 bits.
    """
    low_half, high_word = upk('<HI', read_at(fp, off, 6))
    return high_word * 0x10000 + low_half
Esempio n. 15
0
def read_motif(motif_fhd, species, cutoff=0):
    """Read motif scan result, and return a FWTrackI object
    containing the motif locations.

    The header holds one 128-byte slot per chromosome whose first four
    bytes are the little-endian int32 file offset of that chromosome's
    first hit. Slots are assumed to be stored in the order chr1..chrN,
    chrX, chrY (TODO confirm against the file writer). A negative score is
    recorded as strand -1, a non-negative score as strand 1; the absolute
    score is compared against the cutoff.

    motif_fhd : a seekable file handler for binary motif scan result
    species   : must be "mm8" for mouse or "hg18" for human
    cutoff    : cutoff for the motif scan score; only hits scoring
                strictly above it are kept

    Raises Exception for any other species.
    """
    motif_range_list = FWTrackI(fw=0)
    if species == "hg18":
        autosome_count = 22
    elif species == "mm8":
        autosome_count = 19
    else:
        raise Exception("Only hg18/mm8 supported!")

    # An explicit list fixes the header order; dict key order is arbitrary
    # on Python 2 and dict views cannot be indexed on Python 3.
    chromosomes = ["chr%d" % i for i in range(1, autosome_count + 1)]
    chromosomes += ["chrX", "chrY"]

    # unpack the start pos
    motif_fhd.seek(0)
    starts = []
    for _ in chromosomes:
        starts.append(upk("<i", motif_fhd.read(4))[0])
        motif_fhd.seek(124, 1)  # skip the rest of the 128-byte slot
    motif_fhd.seek(0, 2)

    # calculate number of hits; the last chromosome runs to the end of the
    # file. Floor division keeps the counts usable by range() on Python 3.
    ends = starts[1:] + [motif_fhd.tell()]
    hit_counts = [(end - start) // 8 for start, end in zip(starts, ends)]
    total_motif_hits = sum(hit_counts)

    # read and collect
    read_motif_hits = 0
    for chromosome, start, hit_count in zip(chromosomes, starts, hit_counts):
        motif_fhd.seek(start, 0)
        for _ in range(hit_count):
            read_motif_hits += 1
            if LOG:
                portion = float(read_motif_hits) / total_motif_hits
                sys.stdout.write("\r%.1f%% %s" %
                                 (portion * 100, "#" * int(portion * 50)))
                sys.stdout.flush()
            loc = upk("<i", motif_fhd.read(4))[0]
            score = upk("<f", motif_fhd.read(4))[0]
            # NOTE(review): 4 more bytes are skipped per hit (12-byte
            # stride) although hits are counted as 8 bytes each above.
            # Kept as in the original; confirm the record layout.
            motif_fhd.read(4)
            # the sign of the score carries the strand
            if score < 0:
                strand = -1
                score = -score
            else:
                strand = 1
            if score > cutoff:
                motif_range_list.add_range(
                    chromosome, RangeI(start=loc - 1, end=loc, strand=strand))
    motif_range_list.merge_overlap()
    return motif_range_list