Example #1
    def read_file(filename):
        tmp_dir = tempfile.mkdtemp()
        tmp_file_path = os.path.join(tmp_dir, 'tempfile')

        try:
            with lzma.open(filename, 'rb') as xz_file:
                with open(tmp_file_path, 'wb') as tmp_file:
                    tmp_file.write(xz_file.read())

            costs = [[]]
            #scales = []
            with open(tmp_file_path, 'rb') as f:
                width, height, n_scales = struct.unpack('iii', f.read(12))
                scales = struct.unpack('%df' % n_scales, f.read(4*n_scales))

                #costs = [[]]
                i = 0
                count = 0
                a = f.read(width)
                while a:
                    costs[i].append(list(struct.unpack('%dB' % width, a)))
                    a = f.read(width)

                    count += 1
                    if count == height:
                        i += 1
                        count = 0
                        if i < 2:
                            costs.append([])
        finally:
            h_fs.rm(tmp_dir, ignore_errors=True)

        return {'costs': costs, 'scales': scales}
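
A minimal, self-contained sketch of the header parsing used above, operating on an in-memory buffer instead of a temporary file; the field layout (three int32 values, then n_scales float32 scales) follows the reads in the example, and the sample values are made up.

import io
import lzma
import struct

# made-up payload: width=4, height=2, n_scales=3, then three float scales
raw = struct.pack('iii', 4, 2, 3) + struct.pack('3f', 0.5, 1.0, 2.0)
buf = io.BytesIO(lzma.compress(raw))

with lzma.open(buf, 'rb') as f:
    width, height, n_scales = struct.unpack('iii', f.read(12))
    scales = struct.unpack('%df' % n_scales, f.read(4 * n_scales))

print(width, height, scales)   # 4 2 (0.5, 1.0, 2.0)
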
Example #2
def _get_xzfile(filenames):
    tar_data = _get_tarfile(filenames)
    lzma_fobj = StringIO.StringIO()
    xz_file = lzma.open(lzma_fobj, 'w')
    xz_file.write(tar_data.read())
    xz_file.close()  # finalize the xz stream so all data is flushed into the buffer
    lzma_fobj.seek(0)
    return lzma_fobj
    def test_xz_archive(self):
        tfobj = _get_xzfile(self.filenames)
        xzfobj = lzma.open(tfobj)

        with layers.LayerArchive(xzfobj) as tar:
            members = tar.getmembers()
            for tarinfo in members:
                self.assertIn(tarinfo.name, self.filenames)
def cca_items(args):
    '''This generator takes an s3_paths_fname file, fetches the data,
    constructs a CCA record, and yields it.

    '''
    for path in lzma.open(args.s3_paths_fname):
        if args.date_hour is not None:
            if not path.startswith(args.date_hour):
                continue                
        s3_path = args.s3_path_prefix + path.strip()
        url = args.s3_http_host + s3_path
        logger.info( url )
        retries = 0
        max_retries = 10
        while retries < max_retries:
            retries += 1
            sys.stderr.flush()
            try:
                resp = requests.get(url)
                errors, data = decrypt_and_uncompress(resp.content, gpg_private='trec-kba-rsa')
                logger.info( '\n'.join(errors) )
                for si in Chunk(file_obj=StringIO(data)):

                    item = {
                        'key': si.stream_id,
                        'url': si.abs_url,
                        'timestamp': si.stream_time.epoch_ticks,
                        'request': None,  ## not part of this data set
                        'response': {
                            'headers': [
                                ['Content-Type', 'text/html'],
                            ],
                            'body': si.body.clean_html,
                            ## alternatively, could use si.body.raw and
                            ## si.body.media_type for the Content-Type
                            ## header, but that would cause the Serif NER
                            ## to be useless to teams...
                        },
                        'imported': None,
                    }
                    yield item

                    #print cbor.dumps(rec)

                    ## do something with the data
                    logger.info(
                        '%d bytes of html, or %d bytes of tag-stripped clean_visible, and %d sentences with %d tokens' % (
                        len(si.body.clean_html), len(si.body.clean_visible), 
                        len(si.body.sentences['serif']),
                        len(list(chain(*map(attrgetter('tokens'), si.body.sentences['serif'])))),
                        ))
                break # break out of retry loop
            except Exception, exc:
                logger.critical( traceback.format_exc(exc) )
                logger.critical( 'retrying %d of %d times to fetch and access: %s' % (retries, max_retries, url) )
                time.sleep(1)
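
The core pattern in the loop above is line-by-line iteration over an xz-compressed path listing; a stripped-down sketch of just that part (the file name is hypothetical):

import lzma

with lzma.open('s3-paths.txt.xz', 'rt') as paths:
    for path in paths:
        path = path.strip()
        if not path:
            continue
        # fetch and process each listed path here
        print(path)
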
Example #5
def read_datafile(file_name):
   with lzma.open(file_name, "r") as f:
      for line in f:
         if line.startswith('ts,'):
            names=line.rstrip().split(',')
            break
      data = np.genfromtxt(f, delimiter=',', comments='#', skiprows=2, names=names,
                           usecols=(range(0,7)),
                           converters={0: mdates.strpdate2num('%Y-%m-%d %H:%M:%S')})
   return data
Example #6
    def load(self):
        """Loads the neuron weights from the associated file and returns it.

        :returns: Loaded weights vector.
        """
        with open(self.file, "wb") as target:
            with lzma.open(self.compressedFile, "r") as uncompressor:
                target.write(uncompressor.read())
        weights = numpy.load(self.file)
        remove(self.file)
        return weights
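
As an aside, numpy.load can read straight from the file object returned by lzma.open, which would avoid the intermediate uncompressed file; a sketch under the assumption that the .xz file wraps a single array written with numpy.save:

import lzma
import numpy

def load_compressed_weights(compressed_path):
    # LZMAFile is seekable in read mode, so numpy.load can consume it directly
    with lzma.open(compressed_path, "rb") as f:
        return numpy.load(f)
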
 def configure(self, mode, id_type):
     mode_type = '%s_%s' % (mode, id_type)
     data = set(self.config.get(mode_type, []))
     path = self.config.get(mode_type + '_path')
     if path:
         if path.endswith('.xz'):
             fh = lzma.open(path)
         else:
             fh = open(path)
          data.update(fh.read().splitlines())
     setattr(self, mode_type, data)
     logger.info('finished loading %d %s to %s', len(data), id_type, mode)
Example #8
def zopen(path):
    if path == '-':
        return sys.stdin
    lpath = path.lower()
    if lpath.endswith('.gz'):
        return gzip.open(path, 'rb')
    elif lpath.endswith('.bz2'):
        return bz2.BZ2File(path, 'rb')
    elif lpath.endswith('.xz'):
        assert lzma, "path ends with .xz but lzma library not available"
        return lzma.open(path, 'rb')
    else:
        return open(path, 'r')
Example #9
    def save(self, weights):
        """Saves the given weights to this neuron associated file.

        Weights vector is first saved as serialized numpy array, then LZMA
        compression algorithm is applied in order to minimize neuron file weight.

        :param weights: Weights vector to save.
        """
        numpy.save(self.file, weights)
        with open(self.file, "rb") as source:
            with lzma.open(self.compressedFile, "w") as compressor:
                compressor.write(source.read())
        remove(self.file)
Example #10
def zopenw(path):
    if path == '-':
        return sys.stdout
    lpath = path.lower()
    # TODO: if prefix is s3: or http:, open some stream to such an interface
    if lpath.endswith('.gz'):
        return gzip.open(path, 'wb')
    elif lpath.endswith('.bz2'):
        return bz2.BZ2File(path, 'wb')
    elif lpath.endswith('.xz'):
        assert lzma, "path ends with .xz but lzma library not available"
        return lzma.open(path, 'wb')
    else:
        return open(path, 'w')
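
A hypothetical round trip with the two helpers above; the file name is made up, and the extension alone selects the codec:

out = zopenw('numbers.txt.xz')
out.write(b'1\n2\n3\n')
out.close()

for line in zopen('numbers.txt.xz'):
    print(line.strip())
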
 def GetPackagesContainingDescription(self, text):
     """Get the list of every packages that are installable on the system."""
     active_sources = self.GetActiveSources()  #[source for source in self.all_sources if not source.ignore]   
     containing_longnames = {}
     for source in active_sources:
         file = lzma.open(source.hdlist)
         for line in file:
             if line[:9] == '@summary@':
                 fields = line.strip().split('@')
                 description = fields[2]
             elif line[:6] == '@info@':
                 fields = line.strip().split('@')
                 longname = fields[2]
                 if description.lower().find(text) != -1:
                     containing_longnames[longname] = True
     return containing_longnames
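
The fields[2] indexing above relies on how str.split handles the leading delimiter; a worked example with a made-up hdlist line:

line = '@summary@A text editor for the console\n'
fields = line.strip().split('@')
# fields == ['', 'summary', 'A text editor for the console']
assert fields[2] == 'A text editor for the console'
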
Example #12
    def mkarc(self, arc_engine_id, arc_name, file_infos):
        if arc_engine_id == 'arc_xz':
            try:
                zfile = lzma.open(arc_name, 'wb')
                file_info = file_infos[0]

                rname = file_info.get_filename()
                data = open(rname, 'rb').read()

                zfile.write(data)
                zfile.close()

                return True
            except:
                pass

        return False
Example #13
def num_lines(filepath):
    """Returns the number of lines in a specified file"""
    if filepath.endswith('.gz'):
        fp = gzip.open(filepath, 'rb')
    elif filepath.endswith('.xz'):
        import backports.lzma as lzma
        fp = lzma.open(filepath, 'rb')
    else:
        fp = open(filepath)

    # count number of lines (0 if the file is empty)
    i = 0
    for i, line in enumerate(fp, 1):
        pass

    fp.close()
    return i
Example #14
def parse(hdlist, add_raw=False):
    """Create a generator of packages parsed from synthesis hdlist
    file."""

    pkg = {}

    try:
        for line in gzip.open(hdlist, 'rb'):
            handleline(pkg, line, add_raw)
            if 'name' in pkg:
                yield pkg
                pkg = {}
    except IOError:
        for line in lzma.open(hdlist, 'rb'):
            handleline(pkg, line, add_raw)
            if 'name' in pkg:
                yield pkg
                pkg = {}
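
A hypothetical use of the parse() generator above (the hdlist file name is made up):

names = [pkg['name'] for pkg in parse('synthesis.hdlist.cz')]
print(len(names), 'packages parsed')
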
Example #15
    def _compress_image_stream(self, stream):
        outfile = os.path.join(self.workflow.source.workdir, EXPORTED_COMPRESSED_IMAGE_NAME_TEMPLATE)
        if self.method == "gzip":
            outfile = outfile.format("gz")
            fp = gzip.open(outfile, "wb", compresslevel=6)
        elif self.method == "lzma":
            outfile = outfile.format("xz")
            fp = lzma.open(outfile, "wb")
        else:
            raise RuntimeError("Unsupported compression format {0}".format(self.method))

        _chunk_size = 1024 ** 2  # 1 MB chunk size for reading/writing
        self.log.info("compressing image %s to %s using %s method", self.workflow.image, outfile, self.method)
        data = stream.read(_chunk_size)
        while data != b"":
            fp.write(data)
            data = stream.read(_chunk_size)
        fp.close()  # finalize the compressed stream before returning

        return outfile
    def _compress_image_stream(self, stream):
        outfile = os.path.join(self.workflow.source.workdir,
                               EXPORTED_COMPRESSED_IMAGE_NAME_TEMPLATE)
        if self.method == 'gzip':
            outfile = outfile.format('gz')
            fp = gzip.open(outfile, 'wb', compresslevel=6)
        elif self.method == 'lzma':
            outfile = outfile.format('xz')
            fp = lzma.open(outfile, 'wb')
        else:
            raise RuntimeError('Unsupported compression format {0}'.format(self.method))

        _chunk_size = 1024**2  # 1 MB chunk size for reading/writing
        self.log.info('compressing image %s to %s using %s method',
                      self.workflow.image, outfile, self.method)
        data = stream.read(_chunk_size)
        while data != b'':
            fp.write(data)
            data = stream.read(_chunk_size)
        fp.close()  # finalize the compressed stream before returning

        return outfile
Example #17
    def test_1010(self, tmp_bdb_root, test_docs):
        """Minter yields identifiers matching N2T through a template extensions.

        This checks identifiers in an area where where the minter template must be
        extended before it can be stepped to the next state.
        """
        with lzma.open(PERL_MINTED_PATH) as f:
            for i in range(6218):
                f.readline()
            for i, python_sping in enumerate(
                    nog.minter.mint_by_bdb_path(
                        test_docs.joinpath(
                            '77913_r7_last_before_template_extend.bdb'),
                        10,
                        dry_run=True,
                    )):
                perl_sping = f.readline().strip()
                assert (
                    perl_sping == python_sping
                ), "Mismatch after minting {} identifiers. python={} != perl={}".format(
                    i, python_sping, perl_sping)
Example #18
def compress_str(filepath, strbuffer, chunk_size=65536):
    """Takes a StringIO buffer and writes the output to a gzip- or xz-compressed
    file."""
    # go to beginning of string buffer
    strbuffer.seek(0)

    # output path
    if filepath.endswith('.gz'):
        # gzip compression
        fp = gzip.open(filepath, 'wb')
    elif filepath.endswith('.xz'):
        # xz compression
        fp = lzma.open(filepath, 'wb')
    else:
        raise ValueError('unsupported output extension: %s' % filepath)

    # to avoid overflow errors, we will read from the stream in chunks
    contents = strbuffer.read(chunk_size)

    while contents != '':
        fp.write(contents)
        contents = strbuffer.read(chunk_size)

    fp.close()
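
A small hypothetical usage of compress_str: buffer some text in memory, then let the output extension pick gzip or xz.

import cStringIO

buf = cStringIO.StringIO()
buf.write('read line 1\nread line 2\n')
compress_str('reads.txt.xz', buf)
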
Example #19
def extract_7zip(fname):
    import backports.lzma as lzma
    import tarfile
    lz = lzma.open(str(fname))
    print('Extracting "%s"...' % fname)
    print('  decompressing...')
    tar = tarfile.open(fileobj=lz)

    def progress_generator(tar):
        prog = 0
        so_far = 0
        total = len(tar.getmembers())
        last = 0.0
        for ti in tar:
            so_far += 1
            percent = int((float(so_far) / float(total)) * 100.0)
            if last is None or percent - last >= (100.0 / 5.0):
                last = percent
                print('  %3d%% extracted' % percent)
            yield ti
    tar.extractall(members=progress_generator(tar))
    return None
Example #20
    def _compress_image_stream(self, stream):
        outfile = os.path.join(self.workflow.source.workdir,
                               EXPORTED_COMPRESSED_IMAGE_NAME_TEMPLATE)
        if self.method == 'gzip':
            outfile = outfile.format('gz')
            fp = gzip.open(outfile, 'wb', compresslevel=6)
        elif self.method == 'lzma':
            outfile = outfile.format('xz')
            fp = lzma.open(outfile, 'wb')
        else:
            raise RuntimeError('Unsupported compression format {0}'.format(
                self.method))

        _chunk_size = 1024**2  # 1 MB chunk size for reading/writing
        self.log.info('compressing image %s to %s using %s method',
                      self.workflow.image, outfile, self.method)
        data = stream.read(_chunk_size)
        while data != b'':
            fp.write(data)
            data = stream.read(_chunk_size)
        fp.close()  # finalize the compressed stream before returning

        return outfile
def downloadLatestRDS():
	cwd = os.getcwd()
	rdsurl = 'https://nsrllookup.com/hashes/Sep2019.txz'
	try:
		print('[+] Downloading latest RDS list from https://nsrllookup.com/hashes/Sep2019.txz. Its size is around 2.5GB\n')
		r = requests.get(rdsurl)
	except:
		sys.exit('[ERROR] Do you have a direct, working Internet connection?')
	
	try:
		with open(cwd + '/latesthashes.txz', 'wb') as f:
			f.write(r.content)
	except:
		sys.exit('[ERROR] can\'t write data to disk.')

	# unzip and set name to latesthashes.txt (thanks to http://tiny.cc/ssxdfz):
	i = 'latesthashes.txz'
	with lzma.open(i) as compressed:
		o = rdshashes
		with open(o, 'wb') as destination:
			shutil.copyfileobj(compressed, destination)

	return True
Example #22
    def __init__(self,
                 path=None,
                 data=None,
                 file_obj=None,
                 mode='rb',
                 message=StreamItem_v0_3_0,
                 read_wrapper=None,
                 write_wrapper=None,
                 inline_md5=True):
        '''Load a chunk from an existing file handle or buffer of data.
        If no data is passed in, then chunk starts as empty and
        chunk.add(message) can be called to append to it.

        mode is only used if you specify a path to an existing file to
        open.

        :param path: path to a file in the local file system.  If path
        ends in .xz then mode must be 'rb' and the entire file is
        loaded into memory and decompressed before the Chunk is ready
        for reading.

        :param mode: read/write mode for opening the file; if
        mode='wb', then a file will be created.

        :param file_obj: already opened file, mode must agree with mode
        parameter.

        :param data: bytes of data from which to read messages

        :param message: defaults to StreamItem_v0_3_0; you can specify
        your own Thrift-generated class here.

        :param read_wrapper: a function that takes a deserialized
        message as input and returns a new object to yield from
        __iter__

        :param write_wrapper: a function used in Chunk.add(obj) that
        takes the added object as input and returns another object
        that is a thrift class that can be serialized.
        '''

        self.read_wrapper = read_wrapper
        self.write_wrapper = write_wrapper

        allowed_modes = ['wb', 'ab', 'rb']
        assert mode in allowed_modes, 'mode=%r not in %r' % (mode,
                                                             allowed_modes)
        self.mode = mode

        ## class for constructing messages when reading
        self.message = message

        ## initialize internal state before figuring out what data we
        ## are acting on
        self._count = 0
        self._md5_hexdigest = None

        ## might not have any output parts
        self._o_chunk_fh = None

        ## might not have any input parts
        self._i_chunk_fh = None

        ## open an existing file from path, or create it
        if path is not None:
            assert data is None and file_obj is None, \
                'Must specify only path or data or file_obj'
            if os.path.exists(path):
                ## if the file is there, then use mode
                if mode not in ['rb', 'ab']:
                    exc = IOError('mode=%r would overwrite existing %s' %
                                  (mode, path))
                    exc.errno = errno.EEXIST
                    raise exc
                if path.endswith('.xz'):
                    if xz is None:
                        if mode != 'rb':
                            raise Exception(
                                'backports.lzma is not installed and mode=%r but only "rb" is allowed without backports.lzma'
                                % mode)
                        ## launch xz child
                        xz_child = subprocess.Popen(['xzcat', path],
                                                    stdout=subprocess.PIPE,
                                                    stderr=subprocess.PIPE)
                        file_obj = xz_child.stdout
                        ## what to do with stderr
                    else:
                        file_obj = xz.open(path, mode)

                elif path.endswith('.gz'):
                    assert mode == 'rb', 'mode=%r for .gz' % mode
                    file_obj = gz.open(path)
                elif path.endswith('.xz.gpg'):
                    assert mode == 'rb', 'mode=%r for .xz' % mode
                    ## launch xz child
                    xz_child = subprocess.Popen(
                        ['gpg -d %s | xz --decompress' % path],
                        stdout=subprocess.PIPE,
                        shell=True)
                    #stderr=subprocess.PIPE)
                    file_obj = xz_child.stdout
                    ## what to do with stderr?
                else:
                    file_obj = open(path, mode)
            else:
                ## otherwise make one for writing
                if mode not in ['wb', 'ab']:
                    exc = IOError('%s does not exist but mode=%r' %
                                  (path, mode))
                    exc.errno = errno.ENOENT
                    raise exc
                dirname = os.path.dirname(path)
                if dirname and not os.path.exists(dirname):
                    os.makedirs(dirname)
                if path.endswith('.gz'):
                    file_obj = gz.open(path, mode)
                elif path.endswith('.xz'):
                    if xz is None:
                        raise Exception(
                            'file extension is .xz but backports.lzma is not installed'
                        )
                    file_obj = xz.open(path, mode)
                else:
                    file_obj = open(path, mode)

        ## if created without any arguments, then prepare to add
        ## messages to an in-memory file object
        if data is None and file_obj is None:
            ## make the default behavior when instantiated as Chunk()
            ## to write to an in-memory buffer
            file_obj = StringIO()
            self.mode = 'wb'
            mode = self.mode

        elif file_obj is None:  ## --> must have 'data'
            ## wrap the data in a file obj for reading
            if mode == 'rb':
                file_obj = StringIO(data)
                file_obj.seek(0)
            elif mode == 'ab':
                file_obj = StringIO()
                file_obj.write(data)
                ## and let it just keep writing to it
            else:
                raise Exception('mode=%r but specified "data"' % mode)

        elif file_obj is not None and hasattr(file_obj, 'mode'):
            if isinstance(file_obj.mode, int):
                ## some tools, like python gzip library, use int modes
                file_obj_mode = {1: 'r', 2: 'w'}[file_obj.mode]
            else:
                file_obj_mode = file_obj.mode

            assert file_obj_mode[0] == mode[0], 'file_obj.mode=%r != %r=mode'\
                % (file_obj_mode, mode)
            ## use the file object for writing out the data as it
            ## happens, i.e. in streaming mode.

        if mode in ['ab', 'wb']:
            if inline_md5:
                self._o_chunk_fh = md5_file(file_obj)
            else:
                self._o_chunk_fh = file_obj

        else:
            assert mode == 'rb', mode
            if inline_md5:
                self._i_chunk_fh = md5_file(file_obj)
            else:
                self._i_chunk_fh = file_obj
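
A hypothetical read-side use of the Chunk class documented above: iterating a Chunk yields one deserialized message per record (the path is made up).

for si in Chunk(path='corpus-part-000.sc.xz', mode='rb'):
    print(si)
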
Example #23
 def __init__(self):
     self.empty = True
     self.buffer = io.BytesIO()
     self.dump = lzma.open(self.buffer, mode="wt", preset=9)
     self.dump.write(u"[")
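
A sketch of how such a writer might be finished later, assuming a close() counterpart that terminates the JSON array opened in __init__ (the method name and return value are assumptions, not part of the original):

 def close(self):
     self.dump.write(u"]")          # terminate the JSON array
     self.dump.close()              # finish the xz stream into self.buffer
     return self.buffer.getvalue()  # compressed JSON bytes
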
Example #24
def find_sequence(input_file, feature_name, sequence_filter, feature_regex,
                  build_dir, sample_id, read_num, minimum_trimmed_length,
                  max_dist_from_edge, log_handle):
    """
    Loads a collection of RNA-Seq reads and filters the reads so as to only
    return those containing a specified sequence of interest.

    Arguments
    ---------
    input_file: str
        Filepath to a FASTQ file containing reads to scan.
    feature_name: str
        Type of feature being searched for; used in naming filing and
        directories and in choosing logs to write to. [sl|polya]
    sequence_filter: str
        A short sequence string used for initial filtering. All reads will be
        checked to see if it contains this string, and those that do will be
        further checked using a regular expression to find the location of the
        match.
    feature_regex: str
        A regular expression string indicating the exact sequence to be
        searched for. This will be either a set of spliced leader prefixes or
        suffixes, or a string of A's or T's, possibly anchored at one end of
        the read.
    build_dir: str
        Base directory to save output to.
    sample_id: str
        ID of the sample being scanned.
    read_num: str
        Which of the mated reads should be scanned. [1|2]
    minimum_trimmed_length: int
        Minimum length of read allowed after matching feature is trimmed.
    max_dist_from_edge: int
        Maximum distance SL/Poly(A) feature can be from the edge of read.
    log_handle: logging.Handle
        Handler to use for logging.

    Output files
    ------------
    There are three possible sets of output files for this function depending
    on whether the input read comes from a mated-pair of reads or a single
    read, and whether (for the case of mated-pair reads) it is the left read
    or right read:

    1. Sequence found in R1
        *_1_1_xxx_untrimmed.fastq
        *_1_1_xxx_trimmed.fastq
        *_1_2.fastq
    2. Sequence found in R2
        *_2_2_xxx_untrimmed.fastq
        *_2_2_xxx_trimmed.fastq
        *_2_1.fastq
    """
    #--------------------------------------
    # FASTQ row indices
    #--------------------------------------
    ID_IDX = 0
    SEQUENCE_IDX = 1
    QUALITY_IDX = 2

    log_handle.info("# Processing %s" % os.path.basename(input_file))

    # determine whether the regular expression is anchored
    anchored = (feature_regex.startswith("^") or feature_regex.endswith("$"))

    # list to keep track of potential matches
    matches = []

    # output filepaths
    output_base = '%s/%s/fastq/%s_%s_%s' % (
        build_dir, sample_id, sample_id, read_num, read_num[-1]
    )

    # determine compression type to use
    file_ext = os.path.splitext(input_file)[-1]

    # for uncompressed fastq files, we don't need the final extension
    if file_ext not in ['.gz', '.xz']:
        file_ext = ''

    output_untrimmed = "%s_%s_untrimmed.fastq%s" % (output_base, feature_name,
                                                     file_ext)
    output_trimmed = "%s_%s_trimmed.fastq%s" % (output_base, feature_name,
                                                file_ext)

    # Also keep track of match lengths which will be used for more rigorous
    # filtering when comparing to the genome sequence near where the read is
    # mapped.
    match_lengths_dir = os.path.join(build_dir, sample_id, 'results')
    output_lengths = "%s/match_lengths_%s.csv.gz" % (match_lengths_dir, read_num)
    match_lengths_fp = gzip.open(output_lengths, 'wb')

    # mated reads
    read_num_other = "1" if read_num == "2" else "2"
    input_file_mated = input_file.replace("." + read_num[-1],
                                          "." + read_num_other[-1])
    output_mated_reads = "%s_%s.fastq%s" % (output_base[:-2], read_num_other,
                                             file_ext)

    # compile regex
    read_regex = re.compile(feature_regex)

    # total number of reads
    num_reads = num_lines(input_file) / 4

    # Start sample log
    log_handle.info("# Scanning %d reads for %s" % (num_reads, feature_name))
    log_handle.info("# Using Regex pattern:\n %s" % feature_regex)

    # open output string buffer (will write to compressed file later)
    reads_trimmed = cStringIO.StringIO()
    reads_untrimmed = cStringIO.StringIO()
    mated_reads_buffer = cStringIO.StringIO()

    # Keep track of matched read IDs
    read_ids = []

    # Keep track of ways in which reads are filtered out
    num_filtered_no_seq_match = 0
    num_filtered_too_small = 0
    num_filtered_far_from_edge = 0

    # Find all reads containing the sequence of interest
    if file_ext == '.gz':
        fastq = gzip.open(input_file, 'rb')
        fastq_mated = gzip.open(input_file_mated, 'rb')
    elif file_ext == '.xz':
        import backports.lzma as lzma
        fastq = lzma.open(input_file, 'rb')
        fastq_mated = lzma.open(input_file_mated, 'rb')
    else:
        fastq = open(input_file, 'r')
        fastq_mated = open(input_file_mated, 'r')

    # iterate over mated reads at same time
    mated_reads = readfq(fastq_mated)

    for i, read in enumerate(readfq(fastq)):
        # get mated read
        mated_read = mated_reads.next()

        # ignore any reads that don't contain at least the smallest part of
        # the sequence of interest
        # this just speeds up the search so we don't have to use regex on all
        # reads
        if sequence_filter not in read[SEQUENCE_IDX]:
            num_filtered_no_seq_match += 1
            continue

        # check for match

        # When looking for internal matches, there may be multiple hits. Choose
        # the one that is closest to the edge of the read where the feature is
        # expected to be found.
        try:
            # for polya, reverse sequence to find match closest to right
            # side
            if (feature_name == 'polya' and (not anchored)):
                match = re.search(read_regex, read[SEQUENCE_IDX][::-1])
                match_start = len(read[SEQUENCE_IDX]) - match.end() 
                match_end = len(read[SEQUENCE_IDX]) - match.start() 
            else:
                match = re.search(read_regex, read[SEQUENCE_IDX])

                # for anchored reads, it's possible that a read passes the
                # quick filter check but the regex does not match
                if match is None:
                    num_filtered_no_seq_match += 1
                    continue

                match_start = match.start()
                match_end = match.end()
        except:
            import pdb; pdb.set_trace();

        # match length
        match_length = match.end() - match.start()

        # For SL sequence, trim everything up to end of match
        if feature_name == 'sl':
            trimmed_read = [read[ID_IDX],
                            read[SEQUENCE_IDX][match_end:],
                            "+",
                            read[QUALITY_IDX][match_end:]]
        else:
            # otherwise trim from the start of the match to the end of the read
            trimmed_read = [read[ID_IDX],
                            read[SEQUENCE_IDX][:match_start],
                            "+",
                            read[QUALITY_IDX][:match_start]]

        # skip reads that are less than the required amount after trimming
        if len(trimmed_read[SEQUENCE_IDX]) < minimum_trimmed_length:
            num_filtered_too_small += 1
            continue

        # length of portion trimmed off
        trimmed_part_length = (len(read[SEQUENCE_IDX]) - 
                               len(trimmed_read[SEQUENCE_IDX]))

        # for internal matches, skip reads where match is not close enough to
        # the edge of the read
        if (trimmed_part_length - match_length) > max_dist_from_edge:
            num_filtered_far_from_edge += 1
            continue

        # write length
        match_lengths_fp.write(",".join([read[ID_IDX], str(match_length)]) + "\n")

        # take reverse complement if requested
        # this will return the read back to the expected orientation (SL
        # upstream/Poly(A) downstream)
        #if reverse:
        #    trimmed_read[SEQUENCE_IDX] = str(
        #        Seq.Seq(trimmed_read[SEQUENCE_IDX]).reverse_complement())
        #    trimmed_read[QUALITY_IDX] = trimmed_read[QUALITY_IDX][::-1]

        # Otherwise add trimmed read to output
        reads_trimmed.write("\n".join(trimmed_read) + "\n")

        # Also save complete (untrimmed) reads containing the matched sequence.
        # By mapping these reads to the genome we can eliminate false hits;
        # i.e. reads that contain a portion of the sequence of interest but
        # are not actual trans-splicing / poly-adenylation reads.
        untrimmed_read = [read[ID_IDX],
                          read[SEQUENCE_IDX],
                          "+",
                          read[QUALITY_IDX]]
        reads_untrimmed.write("\n".join(untrimmed_read) + "\n")

        # paired-end reads
        untrimmed_mated_read = [mated_read[ID_IDX],
                                mated_read[SEQUENCE_IDX],
                                "+",
                                mated_read[QUALITY_IDX]]
        mated_reads_buffer.write("\n".join(untrimmed_mated_read) + "\n")

        # save id
        read_ids.append(read[ID_IDX])

    # log numbers
    log_handle.info("# Found %d reads with possible %s fragment" %
             (len(read_ids), feature_name))
    log_handle.info(
        "# Excluded %d reads with no feature matches."
        % num_filtered_no_seq_match)
    log_handle.info(
        "# Excluded %d reads which were too short after trimming."
        % num_filtered_too_small)
    log_handle.info(
        "# Excluded %d reads with matched feature too far from read edge."
        % num_filtered_far_from_edge)

    # Create output directory
    output_dir = os.path.dirname(output_base)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, mode=0o755)

    # write trimmed and untrimmed reads to fastq.gz / fastq.xz
    compress_str(output_trimmed, reads_trimmed)
    compress_str(output_untrimmed, reads_untrimmed)
    compress_str(output_mated_reads, mated_reads_buffer)

    # clean up
    fastq.close()
    fastq_mated.close()
    reads_trimmed.close()
    reads_untrimmed.close()
    mated_reads_buffer.close()
    match_lengths_fp.close()

    log_handle.info("# Finished processing %s" % os.path.basename(input_file))
Example #25
  parser.add_argument('-k', '--k_trials', type=int, default=100)
  parser.add_argument('-n', '--n_words', type=int, default=5000)
  parser.add_argument('files', nargs='+')
  
  args = parser.parse_args()
  type_counter = Counter()
  for filename in args.files:
    with codecs.open(filename, encoding='utf-8', errors='ignore') as f:
      for line in f:
        line = line.strip()
        if line:
          if not re.match('#|[0-9]+[-.][0-9]+', line):
            type_counter[line.split('\t')[1]] += 1
  
  types = type_counter.keys()
  total = float(sum(type_counter.values()))  # float so probs sum to 1 under Python 2 division
  probs = [type_counter[type_] / total for type_ in types]
  
  trials = []
  n_words = min(args.n_words, len(types)) or len(types)
  for _ in xrange(args.k_trials):
    chosen_types = np.random.choice(types, size=n_words, replace=False, p=probs)
    with codecs.open('uncompressed.txt', 'w', encoding='utf-8', errors='ignore') as f:
      f.write('\n'.join(chosen_types))
    with lzma.open('compressed.txt.xz', 'wb') as f:
      f.write('\n'.join(chosen_types).encode('utf-8', 'ignore'))
    trials.append(float(os.path.getsize('compressed.txt.xz')) / os.path.getsize('uncompressed.txt'))
  os.remove('uncompressed.txt')
  os.remove('compressed.txt.xz')
  print(np.mean(trials))
Example #26
    def __init__(self, path=None, data=None, file_obj=None, mode='rb',
                 message=StreamItem_v0_3_0,
                 read_wrapper=None, write_wrapper=None,
        ):
        '''Load a chunk from an existing file handle or buffer of data.
        If no data is passed in, then chunk starts as empty and
        chunk.add(message) can be called to append to it.

        mode is only used if you specify a path to an existing file to
        open.

        :param path: path to a file in the local file system.  If path
        ends in .xz then mode must be 'rb' and the entire file is
        loaded into memory and decompressed before the Chunk is ready
        for reading.

        :param mode: read/write mode for opening the file; if
        mode='wb', then a file will be created.

        :param file_obj: already opened file, mode must agree with mode
        parameter.

        :param data: bytes of data from which to read messages

        :param message: defaults to StreamItem_v0_3_0; you can specify
        your own Thrift-generated class here.

        :param read_wrapper: a function that takes a deserialized
        message as input and returns a new object to yield from
        __iter__

        :param write_wrapper: a function used in Chunk.add(obj) that
        takes the added object as input and returns another object
        that is a thrift class that can be serialized.
        '''
        if not fastbinary_import_failure:
            logger.debug('using TBinaryProtocolAccelerated (fastbinary)')

        else:
            logger.warn('import fastbinary failed; falling back to 15x slower TBinaryProtocol: %r'\
                            % fastbinary_import_failure)

        self.read_wrapper = read_wrapper
        self.write_wrapper = write_wrapper

        allowed_modes = ['wb', 'ab', 'rb']
        assert mode in allowed_modes, 'mode=%r not in %r' % (mode, allowed_modes)
        self.mode = mode

        ## class for constructing messages when reading
        self.message = message

        ## initialize internal state before figuring out what data we
        ## are acting on
        self._count = 0
        self._md5_hexdigest = None

        ## might not have any output parts
        self._o_chunk_fh = None
        self._o_transport = None
        self._o_protocol = None

        ## might not have any input parts
        self._i_chunk_fh = None
        self._i_transport = None
        self._i_protocol = None

        ## open an existing file from path, or create it
        if path is not None:
            assert data is None and file_obj is None, \
                'Must specify only path or data or file_obj'
            if os.path.exists(path):
                ## if the file is there, then use mode 
                if mode not in ['rb', 'ab']:
                    exc = IOError('mode=%r would overwrite existing %s' % (mode, path))
                    exc.errno = errno.EEXIST
                    raise exc
                if path.endswith('.xz'):
                    if lzma is None:
                        if mode != 'rb':
                            raise Exception('backports.lzma is not installed and mode=%r but only "rb" is allowed without backports.lzma' % mode)
                        ## launch xz child
                        xz_child = subprocess.Popen(
                            ['xzcat', path],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
                        file_obj = xz_child.stdout
                        ## what to do with stderr
                    else:
                        file_obj = lzma.open(path, mode)
                        
                elif path.endswith('.gz'):
                    assert mode == 'rb', 'mode=%r for .gz' % mode
                    file_obj  = gzip.open(path)
                elif path.endswith('.xz.gpg'):
                    assert mode == 'rb', 'mode=%r for .xz' % mode
                    ## launch xz child
                    xz_child = subprocess.Popen(
                        ['gpg -d %s | xz --decompress' % path],
                        stdout=subprocess.PIPE, shell=True)
                        #stderr=subprocess.PIPE)
                    file_obj = xz_child.stdout
                    ## what to do with stderr?
                else:
                    file_obj = open(path, mode)
            else:
                ## otherwise make one for writing
                if mode not in ['wb', 'ab']:
                    exc = IOError('%s does not exist but mode=%r' % (path, mode))
                    exc.errno = errno.ENOENT
                    raise exc
                dirname = os.path.dirname(path)
                if dirname and not os.path.exists(dirname):
                    os.makedirs(dirname)
                if path.endswith('.gz'):
                    file_obj = gzip.open(path, mode)
                elif path.endswith('.xz'):
                    if lzma is None:
                        raise Exception('file extension is .xz but backports.lzma is not installed')
                    file_obj = lzma.open(path, mode)
                else:
                    file_obj = open(path, mode)

        ## if created without any arguments, then prepare to add
        ## messages to an in-memory file object
        if data is None and file_obj is None:
            ## make the default behavior when instantiated as Chunk()
            ## to write to an in-memory buffer
            file_obj = StringIO()
            self.mode = 'wb'
            mode = self.mode

        elif file_obj is None: ## --> must have 'data'
            ## wrap the data in a file obj for reading
            if mode == 'rb':
                file_obj = StringIO(data)
                file_obj.seek(0)
            elif mode == 'ab':
                file_obj = StringIO()
                file_obj.write(data)
                ## and let it just keep writing to it
            else:
                raise Exception('mode=%r but specified "data"' % mode)

        elif file_obj is not None and hasattr(file_obj, 'mode'):
            if isinstance(file_obj.mode, int):
                ## some tools, like python gzip library, use int modes
                file_obj_mode = {1: 'r', 2: 'w'}[file_obj.mode]
            else:
                file_obj_mode = file_obj.mode

            assert file_obj_mode[0] == mode[0], 'file_obj.mode=%r != %r=mode'\
                % (file_obj_mode, mode)
            ## use the file object for writing out the data as it
            ## happens, i.e. in streaming mode.

        if mode in ['ab', 'wb']:
            self._o_chunk_fh = md5_file( file_obj )
            self._o_transport = TTransport.TBufferedTransport(self._o_chunk_fh)
            self._o_protocol = protocol(self._o_transport)

        else:
            assert mode == 'rb', mode
            self._i_chunk_fh = md5_file( file_obj )
Example #27
# pip install requests backports.lzma streamcorpus
## installation on CentOS/RHEL is similar using yum instead of apt-get
from backports import lzma
import requests
from streamcorpus import Chunk, decrypt_and_uncompress, compress_and_encrypt

logging.basicConfig()
logger = logging.getLogger()

s3_http_host = 'https://aws-publicdatasets.s3.amazonaws.com/'
s3_path_prefix = 'trec/dd/local-politics-streamcorpus-v0_3_0/'
s3_paths_fname = 'local-politics-streamcorpus-v0_3_0-s3-paths.txt.xz'
if not os.path.exists(s3_paths_fname):
    sys.exit('please download %strec/dd/%s' % (s3_http_host, s3_paths_fname))

for path in lzma.open(s3_paths_fname):
    s3_path = s3_path_prefix + path.strip()
    url = s3_http_host + s3_path
    logger.info(url)
    retries = 0
    max_retries = 10
    while retries < max_retries:
        retries += 1
        sys.stderr.flush()
        try:
            resp = requests.get(url)
            errors, data = decrypt_and_uncompress(resp.content,
                                                  gpg_private='trec-kba-rsa')
            logger.info('\n'.join(errors))
            for si in Chunk(file_obj=StringIO(data)):
def compressSingle (fileName, compressionMethod):

    testFile = open(fileName,'r')

    #standard filter values
    dict_size = 1610612736
    lc = 1
    lp = 0
    pb = 2
    mode = lzma.MODE_NORMAL
    nice_len = 273
    mf = lzma.MF_BT2
    depth = 0


    #Filters for various LZMA compressions
    BJC = lzma.FILTER_ARM
    BJC_LZMA2_filter = [
            {"id": BJC},
            {"id": lzma.FILTER_LZMA2,  "lc": lc, "lp":lp, "pb": pb , "mode":mode,  "mf":mf,\
             "nice_len": nice_len, "dict_size": dict_size, "depth":depth}
    ]
    LZMA2_HPB_filter = [
            {"id": lzma.FILTER_LZMA2,  "lc": lc, "lp":lp, "pb": 4 , "mode":mode,  "mf":mf,\
             "nice_len": nice_len, "dict_size": dict_size, "depth":depth}
        ]
    std_LZMA2_filter = [
            {"id": lzma.FILTER_LZMA2,  "lc": lc, "lp":lp, "pb":pb , "mode":mode,  "mf":mf, "nice_len": nice_len, "dict_size": dict_size, "depth":depth}
        ]
    std_LZMA1_filter = [
            {"id": lzma.FILTER_LZMA1,  "lc": lc, "lp":lp, "pb":pb , "mode":mode,  "mf":mf,\
             "nice_len": nice_len, "dict_size": dict_size, "depth":depth}
        ]
    LZMA2_DELTA1_filter = [
            {"id": lzma.FILTER_DELTA, "dist": 1},
            {"id": lzma.FILTER_LZMA2,  "lc": lc, "lp":lp, "pb":pb , "mode":mode,  "mf":mf,\
             "nice_len": nice_len, "dict_size": dict_size, "depth":depth}
        ]
    LZMA2_HLP_filter = [
            {"id": lzma.FILTER_LZMA2,  "lc": 0, "lp":4, "pb":pb , "mode":mode,  "mf":mf,\
             "nice_len": nice_len, "dict_size": dict_size, "depth":depth}
        ]
    LZMA2_HLC_filter = [
            {"id": lzma.FILTER_LZMA2,  "lc": 4, "lp":0, "pb":pb , "mode":mode,  "mf":mf,\
             "nice_len": nice_len, "dict_size": dict_size, "depth":depth}
        ]

    if compressionMethod == 0:
        #zip, no compression
        compressionMethod = '.zip'
        compressedName = (fileName + compressionMethod)
        with zipfile.ZipFile(compressedName,'w',zipfile.ZIP_STORED) as newFile:
            newFile.write(fileName)
            newFile.close()
    #zip deflate
    if compressionMethod == 1:
        compressionMethod = '.zip'
        compressedName = (fileName + compressionMethod)
        with zipfile.ZipFile( compressedName,'w',zipfile.ZIP_DEFLATED) as newFile:
            newFile.write(fileName)
            newFile.close()

    #bzip
    if compressionMethod == 2:
        compressionMethod = '.bz'
        compressedName = (fileName + compressionMethod)
        with bz2.BZ2File( compressedName,'w') as newFile:
            newFile.write(testFile.read())
            newFile.close()

    #lzma modes

    #std lzma2
    if compressionMethod == 3:
        compressionMethod = '.lzma'
        compressedName = (fileName + compressionMethod)
        newFile = lzma.open( compressedName,'wb', filters=std_LZMA2_filter)
        newFile.write(testFile.read())
        newFile.close()



    #std lzma 1
    if compressionMethod == 4:
        compressionMethod = '.lzma'
        compressedName = (fileName + compressionMethod)
        newFile = lzma.open(compressedName,'wb',filters=std_LZMA1_filter, format=lzma.FORMAT_ALONE)
        newFile.write(testFile.read())
        newFile.close()


        #delta LZMA2
    if compressionMethod == 5:
        compressionMethod = '.lzma'
        compressedName = (fileName + compressionMethod)
        newFile = lzma.open(compressedName,'wb',filters=LZMA2_DELTA1_filter)
        newFile.write(testFile.read())
        newFile.close()


    #HLP lzma
    if compressionMethod == 6:
        compressionMethod = '.lzma'
        compressedName = (fileName + compressionMethod)
        newFile = lzma.open(compressedName,'wb',filters=LZMA2_HLP_filter)
        newFile.write(testFile.read())
        newFile.close()


    #HLC lzma
    if compressionMethod == 7:
        compressionMethod = '.lzma'
        compressedName = (fileName + compressionMethod)
        newFile = lzma.open(compressedName,'wb',filters=LZMA2_HLC_filter)
        newFile.write(testFile.read())
        newFile.close()



     #HPB lzma
    if compressionMethod == 8:
        compressionMethod = '.lzma'
        compressedName = (fileName + compressionMethod)
        newFile = lzma.open(compressedName,'wb',filters=LZMA2_HPB_filter)
        newFile.write(testFile.read())
        newFile.close()




    #ARM lzma
    if compressionMethod == 9:
        BJC = lzma.FILTER_ARM
        BJC_LZMA2_filter[0].update({"id":BJC})
        compressionMethod = '.lzma'
        compressedName = (fileName + compressionMethod)

        newFile = lzma.open(compressedName,'wb',filters=BJC_LZMA2_filter)
        newFile.write(testFile.read())
        newFile.close()


    #FILTER_ARMTHUMB lzma
    if compressionMethod == 10:
        BJC = lzma.FILTER_ARMTHUMB
        BJC_LZMA2_filter[0].update({"id":BJC})
        compressionMethod = '.lzma'
        compressedName = (fileName + compressionMethod)
        newFile = lzma.open(compressedName,'wb',filters=BJC_LZMA2_filter)
        newFile.write(testFile.read())
        newFile.close()



    #IA64 lzma
    if compressionMethod == 11:
        BJC = lzma.FILTER_IA64
        BJC_LZMA2_filter[0].update({"id":BJC})
        compressionMethod = '.lzma'
        compressedName = (fileName + compressionMethod)
        newFile = lzma.open(  compressedName,'wb',filters=BJC_LZMA2_filter)
        newFile.write(testFile.read())
        newFile.close()


    #POWERPC lzma
    if compressionMethod == 12:
        BJC = lzma.FILTER_POWERPC
        BJC_LZMA2_filter[0].update({"id":BJC})
        compressionMethod = '.lzma'
        compressedName = (fileName + compressionMethod)
        newFile = lzma.open(  compressedName,'wb',filters=BJC_LZMA2_filter)
        newFile.write(testFile.read())
        newFile.close()

    #SPARC lzma
    if compressionMethod == 13:
        BJC = lzma.FILTER_SPARC
        BJC_LZMA2_filter[0].update({"id":BJC})
        compressionMethod = '.lzma'
        compressedName = (fileName + compressionMethod)
        newFile = lzma.open(compressedName,'wb',filters=BJC_LZMA2_filter)
        newFile.write(testFile.read())
        newFile.close()


    #X86 lzma
    if compressionMethod == 14:
        BJC = lzma.FILTER_X86
        BJC_LZMA2_filter[0].update({"id":BJC})
        compressionMethod = '.lzma'
        compressedName = (fileName + compressionMethod)


        newFile = lzma.open(  compressedName,'wb',filters=BJC_LZMA2_filter)
        newFile.write(testFile.read())
        newFile.close()



        #GZIP
    if compressionMethod == 15:
        compressionMethod = '.gz'
        compressedName = (fileName + compressionMethod)
        newFile = gzip.open(compressedName,'w')
        newFile.write(testFile.read())
        newFile.close()

        #zlib
    if compressionMethod == 16:
        compressionMethod = '.gzip'
        compressedName = (fileName + compressionMethod)
        newFile = open(   compressedName,'w')
        compressedContent = zlib.compress(testFile.read(),9)
        newFile.write(compressedContent)
        newFile.close()
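
A compact, self-contained sketch of the filter-chain idea used throughout compressSingle, with a delta filter ahead of LZMA2 and an in-memory payload (the values are arbitrary):

import lzma

delta_then_lzma2 = [
    {"id": lzma.FILTER_DELTA, "dist": 1},
    {"id": lzma.FILTER_LZMA2, "preset": 9},
]
payload = bytes(bytearray(range(256)) * 64)
packed = lzma.compress(payload, format=lzma.FORMAT_XZ, filters=delta_then_lzma2)
assert lzma.decompress(packed) == payload
print(len(payload), '->', len(packed), 'bytes')
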
Example #29
  import wget
  import tarfile

  # if using python2
  if (sys.version_info < (3,0)):
    from backports import lzma
    from contextlib import closing

  for directory in ['rtems-source-builder', 'rtems', 'rtems-examples']:
    vprint('Downloading '+options.release_url + '/sources/' + directory+'-' + options.tag + '.tar.xz...')
    wget.download(options.release_url + '/sources/' + directory+'-' + options.tag + '.tar.xz',options.dir)
    vprint('Done...')
    vprint('Unpacking ' + options.dir + '/' + directory + '-' + options.tag + '.tar.xz...')

    # if using python3
    if (sys.version_info > (3,0)):
      with tarfile.open(options.dir + '/' + directory + '-' + options.tag + '.tar.xz') as f:
        f.extractall('.')

    else:
      with closing(lzma.open(options.dir + '/' + directory + '-' + options.tag + '.tar.xz')) as xz:
        with tarfile.open(fileobj=xz) as f:
          f.extractall('.')

    vprint('Done...')
    os.remove(options.dir + '/' + directory + '-' + options.tag + '.tar.xz')
    os.rename(options.dir + '/' + directory + '-' + options.tag, options.dir + '/' + directory)

vprint('\n\nScript finished successfully...')
sys.exit(0)
Example #30
 def _from_file(clazz, filename, header_only=False, strict=False):
     """
     :param filename: name of the file to read from
     :type filename: string
     :param header_only: read header only
     :rtype: Graph
     :return: imported hypergraph
     """
     num_edges = None
     num_verts = None
     is_dimacs = False
     stream = None
     graph = clazz()
     try:
         mtype = mimetypes.guess_type(filename)[1]
         if mtype is None:
             stream = open(filename, 'r')
         elif mtype == 'bzip2':
             stream = BZ2File(filename, 'r')
         elif mtype == 'gz' or mtype == 'gzip':
             stream = gzip.open(filename, 'r')
         elif mtype == 'xz' and xz:
             stream = xz.open(filename, 'r')
         else:
             raise IOError('Unknown input type "%s" for file "%s"' %
                           (mtype, filename))
         nr = 0
         header_seen = False
         for line in stream:
             nr += 1
             line = line.split()
             if line == [] or line[0] in ('x', 'n'):
                 continue
             elif line[0] == 'p':
                 if header_seen:
                     logging.critical('L(%s). Duplicate header. Exiting.' %
                                      nr)
                     exit(3)
                 if len(line) > 4:
                     logging.critical(
                         'L(%s). Too many arguments. Exiting.' % nr)
                     exit(3)
                 is_dimacs = line[1] == 'edge'
                 is_formula = line[1] == 'cnf'
                 num_verts = int(line[2])
                 num_edges = int(line[3])
                 if header_only:
                     return num_verts, num_edges
                 if num_verts == 0:
                     logging.warning("Empty graph.")
                     return graph
                 header_seen = True
             elif line[0] != 'c' and (
                     is_dimacs or (line[0] != 'a' and line[0] != 'e')
             ):  #now also ignores forAll and Exists :P
                 if not header_seen:
                     logging.critical(
                         'L(%s). Lines before header. Exiting.' % nr)
                     exit(3)
                 try:
                     if is_dimacs:
                         graph.add_edge(int(line[1]), int(line[2]))
                     elif is_formula:
                         atoms = map(lambda x: abs(int(x)), line[0:-1])
                         #print("formula{0}".format(atoms))
                         for i in atoms:
                             for j in atoms:
                                 if i < j:
                                     graph.add_edge(
                                         i, j
                                     )  #abs -> then it also works for qbf
                         num_edges += (len(atoms) *
                                       (len(atoms) - 1)) / 2 - 1
                     else:
                         graph.add_edge(int(line[0]), int(line[1]))
                     assert (0 not in graph.nodes())
                 except ValueError, e:
                     logging.critical('L(%s). Invalid integer. Exiting.' %
                                      nr)
                     logging.critical('Error was: %s' % e)
                     exit(3)
                 except IndexError, e:
                     logging.critical('L(%s). Incomplete edge. Exiting' %
                                      nr)
                     logging.critical('Error was: %s' % e)
                     exit(3)
def read_ascii_file(spectrum_file,
                    start_row=0,
                    end_row=-1,
                    blackbody=False,
                    stellar_radius=1.0,
                    distance=1.,
                    include_dilution_factor=True):
    """
    Reads in a BT-NextGen spectrum and outputs the wavelengths in microns and 
    flux in W / m**2 / um
    start_row and end_row allow for reading only a subset of the file to save time
    (and to skip headers). But note that this way you can't read in the last row.
    blackbody : Set to true to return the blackbody spectrum instead
    stellar_radius : Stellar radius in solar radii
    distance : distance in pc
    """

    if spectrum_file.endswith('.bz2'):
        with bz2.BZ2File(spectrum_file, 'r') as myf:
            x = myf.read()
    elif spectrum_file.endswith('.gz'):
        with gzip.open(spectrum_file, 'r') as myf:
            x = myf.read()
    elif spectrum_file.endswith('.xz'):

        from backports import lzma  # This can be tricky to install...
        x = lzma.open(spectrum_file).read()

    else:
        raise IOError("Unrecognized file type: " + str(spectrum_file))

    start_row = int(start_row)  # builtin int; the np.int alias was removed from NumPy
    end_row = int(end_row)

    data = x.split('\n')
    output = []
    all_output = []

    for ix, row in enumerate(data[start_row:end_row]):
        # remove duplicate white space, not sure how to do this easily
        temp = row.strip()
        for rep in range(50):
            temp = temp.replace('  ', ' ')

        # And change Ds to Es for the exponential
        temp = temp.replace('D', 'e')

        # Split it based on white space and append the results to the output array
        if temp != '':
            split = temp.split(' ')
            all_output.append(split)
            output.append(np.float64(split[0:3]))
    output = np.array(output)  # make it a numpy array so it is 2D
    if output.ndim != 2:
        raise Exception("Couldnt read the spectra file!" + spectrum_file)

    wavs = output[:, 0] * 1e-4  # turn to microns
    flux = output[:, 1]

    if blackbody:
        flux = output[:, 2]

    DF = -8.  # For "all most recent models"
    #    DF=-28.9007901434 # For NextGen T > 5000K
    #    DF=-26.9007901434 # For NextGen T < 5000K

    # convert flux to ergs/sec/cm**2/A
    flux = 10**(
        flux + DF
    )  # this is the conversion eqn from the website (https://phoenix.ens-lyon.fr/Grids/FORMAT)

    # convert to W/m**2/um
    # 1 erg/s = 1e-7 W
    # 1 cm**-2 = 1e4 m**-2
    # 1 A**-1 = 1e4 um**-1
    flux_si = flux * 1e-7 * 1e4 * 1e4  #
    #    flux2=flux*units.erg/units.s/(units.cm**2)/units.angstrom
    #    flux_si=flux2.to(units.watt/(units.m**2)/units.micron)
    #    print 'median log flux:',np.median(np.log10(flux_si))

    # Scale by the distance and stellar radii
    # * (radius/distance)^2 in same units.
    rsol_pc = 2.25461e-8
    if include_dilution_factor:
        dilution_factor = (stellar_radius * rsol_pc / distance)**2
        flux_si *= dilution_factor

    return [wavs, flux_si]
def lzwrite(data, path):
    with lzma.open(path, "wb") as outfile:
        outfile.write(data)
def lzd(src, tgt):
    with lzma.open(src, "rb") as srcfile:
        with open(tgt, "wb") as tgtfile:
            tgtfile.write(srcfile.read())
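A minimal usage sketch for the helpers above, assuming numpy is imported as np; the file names are hypothetical, and the dilution factor applied inside read_ascii_file is simply (stellar_radius * rsol_pc / distance)**2 with the radius converted from solar radii to parsecs:

import numpy as np

# Hypothetical input file; any .bz2 / .gz / .xz BT-NextGen spectrum should work
wavs, flux_si = read_ascii_file('bt-nextgen-spectrum.txt.xz',
                                stellar_radius=0.5,   # solar radii
                                distance=10.0)        # parsecs

print('wavelength range: %.3f - %.3f um' % (wavs.min(), wavs.max()))
print('median flux: %.3e W / m**2 / um' % np.median(flux_si))

# Round-trip some raw bytes through xz with the two small helpers above
lzwrite(b'1000.0 -8.5 -9.0\n', 'spectrum.txt.xz')
lzd('spectrum.txt.xz', 'spectrum_copy.txt')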
Example #34
0
        lp = 0
        pb = 2
        mode = lzma.MODE_NORMAL
        nice_len = 273
        mf = lzma.MF_BT2
        depth = 0

        std_LZMA2_filter = [
            {"id": lzma.FILTER_LZMA2, "lc": lc, "lp": lp, "pb": pb, "mode": mode,
             "mf": mf, "nice_len": nice_len, "dict_size": dict_size, "depth": depth}
        ]
        start = time.time()
        compressionMethod = 'LZMA_STD'
        compressedName = (path + compressionMethod)
        testFile.close()
        # Reopen the input in binary mode: an lzma file opened with 'wb' expects bytes
        testFile = open(path, 'rb')
        newFile = lzma.open(compressedName, 'wb', filters=std_LZMA2_filter)
        newFile.write(testFile.read())
        newFile.close()
        print(compressionMethod + ": " + str(float(os.path.getsize(compressedName)) / 1024) + " KB " +
              str(float(os.path.getsize(compressedName)) / float(os.path.getsize(path))))
        print(str((time.time() - start) * 1000) + " milliseconds")
        size = float(os.path.getsize(compressedName))
        tm = time.time()-start
        storageList.append(compressionMethod)
        storageList.append(size)
        storageList.append(size/uncompSize)
        storageList.append(tm)
        print()
        os.remove(compressedName)

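Since lzma.open writes the .xz container format by default, the custom filter chain above is stored in the output's headers, so the file can be read back with a plain lzma.open(..., 'rb') and no filter specification. A minimal round-trip sketch with hypothetical file names and an arbitrary filter choice:

import lzma

# Illustrative LZMA2 filter chain (preset supplies defaults, dict_size overrides it)
filters = [
    {"id": lzma.FILTER_LZMA2, "preset": 9, "dict_size": 1 << 24},
]

payload = b"some payload " * 1000
with lzma.open("sample.xz", "wb", filters=filters) as fout:
    fout.write(payload)

# The .xz container records the filter metadata, so reading needs no arguments
with lzma.open("sample.xz", "rb") as fin:
    assert fin.read() == payload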
Example #35
0
    def __init__(self):
        self.empty = True
        # In-memory buffer that will hold the compressed bytes
        self.buffer = io.BytesIO()
        # Text-mode xz stream layered on top of the buffer
        self.dump = lzma.open(self.buffer, mode="wt", preset=9)
        # Write the opening bracket of the dump
        self.dump.write(u"[")
        # (A sketch of finishing and reading back such a stream follows this example.)
Example #36
0
    def create_assignments(self):
        # a segmentation problem can only have assignments if details about how
        # the assignments should be created are provided
        h_utils.require(self.has_details,
            InvalidStateError(_('This segmentation problem is lacking '
                                'details.')))

        # clear any previous assignments for this segmentation problem
        self.clear_assignments()

        details = self.details
        # Create the temporary directory before the try block so the cleanup in
        # the finally clause never references an undefined name if mkdtemp() fails
        tmp_dir = tempfile.mkdtemp()
        try:

            # generate the tiles for the assignments
            tiles_info = h_dip.generate_tiles(
                img_path=self.image.path,
                tiles_dim=details.tiles_dimension,
                overlap_rel=details.tiles_overlap,
                border_rel=details.tiles_border,
                dst_path=tmp_dir,
                workable_checker=h_dip.simple_content_detection()
            )[0]

            assignments = []
            for info in tiles_info:
                a = self.assignments.model(
                    seg_prob=self,
                    tile_bbox_x0=info['cropped_bbox'][0],
                    tile_bbox_y0=info['cropped_bbox'][1],
                    tile_bbox_x1=info['cropped_bbox'][2],
                    tile_bbox_y1=info['cropped_bbox'][3],
                    workable=info['workable']
                )
                with open(info['path'], 'rb') as f:
                    a.tile.save(os.path.basename(info['path']), File(f),
                        save=False)
                assignments.append(a)

            if details.algorithm == u'LIVEVESSEL':
                files_temp_path = []
                live_script_rel_path = '../externals/matlab/livevessel_pre_process/'
                livevessel_preprocess_script = h_fs.get_absolute_path(live_script_rel_path, settings.PROJECT_ROOT)
                processes = []
                for a in assignments:
                    tile_number = os.path.basename(os.path.splitext(a.tile.path)[0]).split('_')[1]
                    basedir = 'liv_preprocess_' + tile_number
                    temp_path = os.path.join(tmp_dir, basedir)
                    files_temp_path.append(temp_path)

                    code = "addpath(genpath('"+ livevessel_preprocess_script + "'));"
                    code = code + "offline('" + a.tile.path + "', '" + temp_path + "');"
                    code = code + "exit;"

                    processes.append(subprocess.Popen(
                        ["/usr/local/bin/matlab", "-nosplash", "-nodesktop", "-nojvm", "-r", code],
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE))


                # communicate() waits for each MATLAB process and returns its
                # (stdout, stderr); the exit status is then on process.returncode
                outputs = [process.communicate() for process in processes]

                index = 0
                for a in assignments:
                    compressed_file_path = files_temp_path[index] + '.xz'
                    with open(files_temp_path[index], 'rb') as f:
                        with lzma.open(compressed_file_path, 'wb') as xz_file:
                            xz_file.write(f.read())

                    with open(compressed_file_path, 'rb') as f:
                        basename = os.path.basename(compressed_file_path)
                        a.preprocess_file.save(basename, File(f), save=False)
                    index += 1



            if details.pre_seg:
                # generate pre segs. for the tiles
                pre_seg_tiles_info, border = h_dip.generate_tiles(
                    img_path=details.pre_seg.path,
                    tiles_dim=details.tiles_dimension,
                    overlap_rel=details.tiles_overlap,
                    border_rel=details.tiles_border,
                    dst_path=tmp_dir,
                    tiles_prefix='pre_seg_'
                )
                assert len(tiles_info) == len(pre_seg_tiles_info)
                for a, info in zip(assignments, pre_seg_tiles_info):
                    img = Image.open(info['path'])
                    img_w, img_h = img.size
                    img.crop((border, border, img_w - border,
                              img_h - border)).save(info['path'])
                    with open(info['path'], 'rb') as f:
                        a.pre_seg.save(os.path.basename(info['path']), File(f),
                            save=False)

            # effectively create the assignments
            try:
                self.assignments.bulk_create(assignments)
            except DatabaseError:
                debug_logger.exception('bulk creation of %d assignments '
                                       'failed... falling back to individual '
                                       'creation' % len(assignments))
                # map() is lazy in Python 3; iterate explicitly so the saves run
                for a in assignments:
                    a.save()

            # in case of success, return the number of assignments created
            return len(assignments)
        except Exception:
            # in case any thing goes wrong, undo and log the problem to the
            # administrators
            self.clear_assignments()
            internal_errors_logger.exception('assignments for segmentation '
                                             'problem %d could not be '
                                             'created' % self.pk)
        finally:
            h_fs.rm(tmp_dir, ignore_errors=True)
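The bulk-create-with-fallback step above can be isolated into a small helper; this is a hedged sketch with hypothetical names (create_all, queryset, instances, logger), assuming Django's QuerySet.bulk_create and django.db.DatabaseError:

from django.db import DatabaseError

def create_all(queryset, instances, logger):
    # Try a single bulk INSERT; fall back to per-object saves on failure
    try:
        queryset.bulk_create(instances)
    except DatabaseError:
        logger.exception('bulk creation of %d objects failed; saving individually'
                         % len(instances))
        for obj in instances:
            obj.save()
    return len(instances)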
Example #37
0
    def from_sat_file(clazz, filename, mode):
        header = {}
        clauses = []
        stream = None

        nr = 0

        def log_error(msg):
            logging.error(msg)
            exit(2)

        try:
            mtype = mimetypes.guess_type(filename)[1]
            if mtype is None:
                stream = open(filename, 'r')
            elif mtype == 'bzip2':
                stream = BZ2File(filename, 'r')
            elif mtype == 'gz' or mtype == 'gzip':
                stream = gzip.open(filename, 'r')
            elif mtype == 'xz' and xz:
                stream = xz.open(filename, 'r')
            else:
                raise IOError('Unknown input type "%s" for file "%s"' %
                              (mtype, filename))
            for line in stream:
                if isinstance(line, bytes):
                    line = line.decode("utf-8")
                if len(line.rstrip()) == 0:
                    continue

                line = line.split()
                nr += 1

                if line[0] == "p":
                    logging.info("Reading header")

                    if len(header.keys()) != 0:
                        log_error("Multiple header in line {}".format(nr))
                    if len(line) != 4:
                        log_error(
                            "Wrong header. expected 4 tokens (p cnf num_variables num_edges). Got {} instead"
                            .format(len(line)))

                    if line[1] != "cnf":
                        log_error(
                            "Expected cnf identifier. Got {} instead".format(
                                line[1]))

                    try:
                        header["num_variables"] = int(line[2])
                        header["num_clauses"] = int(line[3])
                    except ValueError as e:
                        logging.error(e)
                        log_error(
                            "Invalid format for number of variables or clauses. Expected integer"
                        )

                elif line[0] == "c" or line[0] == "%" or line[0] == "w":
                    logging.info("#" * 20 + "Reading comment")
                    logging.info(" ".join(line))
                else:
                    if len(header.keys()) == 0:
                        log_error(
                            "Reading edge before header in line {}".format(nr))

                    try:
                        if int(line[-1]) == 0:
                            line = line[:-1]
                        line = [int(x) for x in line]
                    except ValueError as e:
                        logging.error(e)
                        log_error(
                            "Invalid format for clause. Variables should be in integer"
                        )

                    for v in line:
                        if abs(v) < 1 or abs(v) > header["num_variables"]:
                            log_error(
                                "Vertex {} out of bounds. Expected abs in range [1, {}]"
                                .format(v, header["num_variables"]))

                    if len(line) > 0:
                        clauses.append(line)
        finally:
            if stream:
                stream.close()

        t_sat = clazz.convert_to_threesat(header["num_variables"], clauses,
                                          mode)
        return t_sat
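For reference, a minimal sketch of the DIMACS CNF layout this parser expects; the sample string and the stripped-down loop below are illustrative only and not part of the class above. Comment lines start with "c", the "p cnf" header gives the variable and clause counts, and each clause is a whitespace-separated list of signed literals terminated by 0:

import io

# A tiny DIMACS CNF instance: 3 variables, 2 clauses
sample = u"""c example instance
p cnf 3 2
1 -2 3 0
-1 2 0
"""

clauses = []
for line in io.StringIO(sample):
    tok = line.split()
    if not tok or tok[0] in ("c", "p", "%", "w"):
        continue
    lits = [int(x) for x in tok]
    if lits and lits[-1] == 0:  # drop the clause-terminating 0
        lits = lits[:-1]
    clauses.append(lits)

print(clauses)  # [[1, -2, 3], [-1, 2]]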