def read_file(filename):
    tmp_dir = tempfile.mkdtemp()
    tmp_file_path = os.path.join(tmp_dir, 'tempfile')
    try:
        with lzma.open(filename, 'rb') as xz_file:
            with open(tmp_file_path, 'wb') as tmp_file:
                tmp_file.write(xz_file.read())

        costs = [[]]
        #scales = []
        with open(tmp_file_path, 'rb') as f:
            width, height, n_scales = struct.unpack('iii', f.read(12))
            scales = struct.unpack('%df' % n_scales, f.read(4*n_scales))
            #costs = [[]]
            i = 0
            count = 0
            a = f.read(width)
            while a != '':
                costs[i].append(list(struct.unpack('%dB' % width, a)))
                a = f.read(width)
                count += 1
                if count == height:
                    i += 1
                    count = 0
                    if i < 2:
                        costs.append([])
    finally:
        h_fs.rm(tmp_dir, ignore_errors=True)
    return {'costs': costs, 'scales': scales}
def _get_xzfile(filenames):
    tar_data = _get_tarfile(filenames)
    lzma_fobj = StringIO.StringIO()
    xz_file = lzma.open(lzma_fobj, 'w')
    xz_file.write(tar_data.read())
    xz_file.close()  # close to flush the xz stream into the underlying buffer
    lzma_fobj.seek(0)
    return lzma_fobj
def test_xz_archive(self):
    tfobj = _get_xzfile(self.filenames)
    xzfobj = lzma.open(tfobj)
    with layers.LayerArchive(xzfobj) as tar:
        members = tar.getmembers()
        for tarinfo in members:
            self.assertIn(tarinfo.name, self.filenames)
def cca_items(args):
    '''This generator takes an s3_paths_fname file, fetches the data,
    constructs a CCA record, and yields it.
    '''
    for path in lzma.open(args.s3_paths_fname):
        if args.date_hour is not None:
            if not path.startswith(args.date_hour):
                continue
        s3_path = args.s3_path_prefix + path.strip()
        url = args.s3_http_host + s3_path
        logger.info(url)
        retries = 0
        max_retries = 10
        while retries < max_retries:
            retries += 1
            sys.stderr.flush()
            try:
                resp = requests.get(url)
                errors, data = decrypt_and_uncompress(resp.content, gpg_private='trec-kba-rsa')
                logger.info('\n'.join(errors))
                for si in Chunk(file_obj=StringIO(data)):
                    item = {
                        'key': si.stream_id,
                        'url': si.abs_url,
                        'timestamp': si.stream_time.epoch_ticks,
                        'request': None,  ## not part of this data set
                        'response': {
                            'headers': [
                                ['Content-Type', 'text/html'],
                            ],
                            'body': si.body.clean_html,
                            ## alternatively, could use si.body.raw and
                            ## si.body.media_type for the Content-Type
                            ## header, but that would cause the Serif NER
                            ## to be useless to teams...
                        },
                        'imported': None,
                    }
                    yield item

                    #print cbor.dumps(rec)

                    ## do something with the data
                    logger.info(
                        '%d bytes of html, or %d bytes of tag-stripped clean_visible, '
                        'and %d sentences with %d tokens' % (
                            len(si.body.clean_html),
                            len(si.body.clean_visible),
                            len(si.body.sentences['serif']),
                            len(list(chain(*map(attrgetter('tokens'), si.body.sentences['serif'])))),
                        ))
                break  # break out of retry loop
            except Exception, exc:
                logger.critical(traceback.format_exc(exc))
                logger.critical('retrying %d of %d times to fetch and access: %s' %
                                (retries, max_retries, url))
                time.sleep(1)
def read_datafile(file_name):
    with lzma.open(file_name, "r") as f:
        for line in f:
            if line.startswith('ts,'):
                names = line.rstrip().split(',')
                break
        data = np.genfromtxt(f, delimiter=',', comments='#', skiprows=2,
                             names=names, usecols=(range(0, 7)),
                             converters={0: mdates.strpdate2num('%Y-%m-%d %H:%M:%S')})
    return data
def load(self):
    """Loads the neuron weights from the associated file and returns them.

    :returns: Loaded weights vector.
    """
    with open(self.file, "wb") as target:
        with lzma.open(self.compressedFile, "r") as uncompressor:
            target.write(uncompressor.read())
    weights = numpy.load(self.file)
    remove(self.file)
    return weights
def configure(self, mode, id_type):
    mode_type = '%s_%s' % (mode, id_type)
    data = set(self.config.get(mode_type, []))
    path = self.config.get(mode_type + '_path')
    if path:
        if path.endswith('.xz'):
            fh = lzma.open(path)
        else:
            fh = open(path)
        map(data.add, fh.read().splitlines())
    setattr(self, mode_type, data)
    logger.info('finished loading %d %s to %s', len(data), id_type, mode)
def zopen(path):
    if path == '-':
        return sys.stdin
    lpath = path.lower()
    if lpath.endswith('.gz'):
        return gzip.open(path, 'rb')
    elif lpath.endswith('.bz2'):
        return bz2.BZ2File(path, 'rb')
    elif lpath.endswith('.xz'):
        assert lzma, "path ends with .xz but lzma library not available"
        return lzma.open(path, 'rb')
    else:
        return open(path, 'r')
def save(self, weights):
    """Saves the given weights to this neuron's associated file.

    The weights vector is first saved as a serialized numpy array, then the
    LZMA compression algorithm is applied in order to minimize the neuron
    file size.

    :param weights: Weights vector to save.
    """
    numpy.save(self.file, weights)
    with open(self.file, "rb") as source:
        with lzma.open(self.compressedFile, "w") as compressor:
            compressor.write(source.read())
    remove(self.file)
def zopenw(path):
    if path == '-':
        return sys.stdout
    lpath = path.lower()
    # TODO: if prefix is s3: or http:, open some stream to such an interface
    if lpath.endswith('.gz'):
        return gzip.open(path, 'wb')
    elif lpath.endswith('.bz2'):
        return bz2.BZ2File(path, 'wb')
    elif lpath.endswith('.xz'):
        assert lzma, "path ends with .xz but lzma library not available"
        return lzma.open(path, 'wb')
    else:
        return open(path, 'w')
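# A minimal usage sketch (not from the original sources) showing how the zopen/zopenw
# helpers above could be paired for an extension-driven recompression round trip.
# It assumes the Python 2-style str handling used by the helpers themselves, and that
# gzip, bz2, lzma and sys are already imported as in those helpers; the function name
# copy_compressed is purely illustrative.
def copy_compressed(src_path, dst_path):
    """Copy src_path to dst_path, (de)compressing transparently by file extension."""
    fin = zopen(src_path)
    fout = zopenw(dst_path)
    try:
        fout.write(fin.read())
    finally:
        # don't close the process-wide standard streams returned for '-'
        if fin is not sys.stdin:
            fin.close()
        if fout is not sys.stdout:
            fout.close()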
def GetPackagesContainingDescription(self, text):
    """Return the long names of installable packages whose summary contains the given text."""
    active_sources = self.GetActiveSources()
    #[source for source in self.all_sources if not source.ignore]
    containing_longnames = {}
    for source in active_sources:
        file = lzma.open(source.hdlist)
        for line in file:
            if line[:9] == '@summary@':
                fields = line.strip().split('@')
                description = fields[2]
            elif line[:6] == '@info@':
                fields = line.strip().split('@')
                longname = fields[2]
                if description.lower().find(text) != -1:
                    containing_longnames[longname] = True
    return containing_longnames
def mkarc(self, arc_engine_id, arc_name, file_infos):
    if arc_engine_id == 'arc_xz':
        try:
            zfile = lzma.open(arc_name, 'wb')
            file_info = file_infos[0]
            rname = file_info.get_filename()
            data = open(rname, 'rb').read()
            zfile.write(data)
            zfile.close()
            return True
        except:
            pass
    return False
def num_lines(filepath):
    """Returns the number of lines in a specified file"""
    if filepath.endswith('.gz'):
        fp = gzip.open(filepath, 'rb')
    elif filepath.endswith('.xz'):
        import backports.lzma as lzma
        fp = lzma.open(filepath, 'rb')
    else:
        fp = open(filepath)

    # count number of lines
    for i, line in enumerate(fp, 1):
        pass
    fp.close()
    return i
def parse(hdlist, add_raw=False):
    """Create a generator of packages parsed from synthesis hdlist file."""
    pkg = {}
    try:
        for line in gzip.open(hdlist, 'rb'):
            handleline(pkg, line, add_raw)
            if 'name' in pkg:
                yield pkg
                pkg = {}
    except IOError:
        for line in lzma.open(hdlist, 'rb'):
            handleline(pkg, line, add_raw)
            if 'name' in pkg:
                yield pkg
                pkg = {}
def _compress_image_stream(self, stream):
    outfile = os.path.join(self.workflow.source.workdir,
                           EXPORTED_COMPRESSED_IMAGE_NAME_TEMPLATE)
    if self.method == "gzip":
        outfile = outfile.format("gz")
        fp = gzip.open(outfile, "wb", compresslevel=6)
    elif self.method == "lzma":
        outfile = outfile.format("xz")
        fp = lzma.open(outfile, "wb")
    else:
        raise RuntimeError("Unsupported compression format {0}".format(self.method))

    _chunk_size = 1024 ** 2  # 1 MB chunk size for reading/writing
    self.log.info("compressing image %s to %s using %s method",
                  self.workflow.image, outfile, self.method)
    data = stream.read(_chunk_size)
    while data != b"":
        fp.write(data)
        data = stream.read(_chunk_size)
    return outfile
def test_1010(self, tmp_bdb_root, test_docs):
    """Minter yields identifiers matching N2T through a template extension.

    This checks identifiers in an area where the minter template must be
    extended before it can be stepped to the next state.
    """
    with lzma.open(PERL_MINTED_PATH) as f:
        for i in range(6218):
            f.readline()
        for i, python_sping in enumerate(
            nog.minter.mint_by_bdb_path(
                test_docs.joinpath('77913_r7_last_before_template_extend.bdb'),
                10,
                dry_run=True,
            )
        ):
            perl_sping = f.readline().strip()
            assert (
                perl_sping == python_sping
            ), "Mismatch after minting {} identifiers. python={} != perl={}".format(
                i, python_sping, perl_sping)
def compress_str(filepath, strbuffer, chunk_size=65536):
    """Takes a StringIO buffer and writes the output to a gzip- or
    xz-compressed file."""
    # go to beginning of string buffer
    strbuffer.seek(0)

    # output path
    if filepath.endswith('.gz'):
        # gzip compression
        fp = gzip.open(filepath, 'wb')
    elif filepath.endswith('.xz'):
        # xz compression
        fp = lzma.open(filepath, 'wb')

    # to avoid overflow errors, we will read from the stream in chunks
    contents = strbuffer.read(chunk_size)
    while contents != '':
        fp.write(contents)
        contents = strbuffer.read(chunk_size)

    fp.close()
def extract_7zip(fname):
    import backports.lzma as lzma
    import tarfile

    lz = lzma.open(str(fname))
    print('Extracting "%s"...' % fname)
    print(' decompressing...')
    tar = tarfile.open(fileobj=lz)

    def progress_generator(tar):
        prog = 0
        so_far = 0
        total = len(tar.getmembers())
        last = 0.0
        for ti in tar:
            so_far += 1
            percent = int((float(so_far) / float(total)) * 100.0)
            if last is None or percent - last >= (100.0 / 5.0):
                last = percent
                print(' %3d%% extracted' % percent)
            yield ti

    tar.extractall(members=progress_generator(tar))
    return None
def downloadLatestRDS():
    cwd = os.getcwd()
    rdsurl = 'https://nsrllookup.com/hashes/Sep2019.txz'
    try:
        print('[+] Downloading latest RDS list from https://nsrllookup.com/hashes/Sep2019.txz. Its size is around 2.5GB\n')
        r = requests.get(rdsurl)
    except:
        sys.exit('[ERROR] Do you have a direct, working Internet connection?')
    try:
        with open(cwd + '/latesthashes.txz', 'wb') as f:
            f.write(r.content)
    except:
        sys.exit('[ERROR] can\'t write data to disk.')
    f.close()

    # unzip and set name to latesthashes.txt (thanks to http://tiny.cc/ssxdfz):
    i = 'latesthashes.txz'
    with lzma.open(i) as compressed:
        o = rdshashes
        with open(o, 'wb') as destination:
            shutil.copyfileobj(compressed, destination)
    return True
def __init__(self, path=None, data=None, file_obj=None, mode='rb',
             message=StreamItem_v0_3_0,
             read_wrapper=None, write_wrapper=None,
             inline_md5=True):
    '''Load a chunk from an existing file handle or buffer of data.
    If no data is passed in, then chunk starts as empty and
    chunk.add(message) can be called to append to it.

    mode is only used if you specify a path to an existing file to open.

    :param path: path to a file in the local file system. If path ends in .xz
    then mode must be 'rb' and the entire file is loaded into memory and
    decompressed before the Chunk is ready for reading.

    :param mode: read/write mode for opening the file; if mode='wb', then a
    file will be created.

    :file_obj: already opened file, mode must agree with mode parameter.

    :param data: bytes of data from which to read messages

    :param message: defaults to StreamItem_v0_3_0; you can specify your own
    Thrift-generated class here.

    :param read_wrapper: a function that takes a deserialized message as
    input and returns a new object to yield from __iter__

    :param write_wrapper: a function used in Chunk.add(obj) that takes the
    added object as input and returns another object that is a thrift class
    that can be serialized.
    '''
    self.read_wrapper = read_wrapper
    self.write_wrapper = write_wrapper

    allowed_modes = ['wb', 'ab', 'rb']
    assert mode in allowed_modes, 'mode=%r not in %r' % (mode, allowed_modes)
    self.mode = mode

    ## class for constructing messages when reading
    self.message = message

    ## initialize internal state before figuring out what data we
    ## are acting on
    self._count = 0
    self._md5_hexdigest = None

    ## might not have any output parts
    self._o_chunk_fh = None

    ## might not have any input parts
    self._i_chunk_fh = None

    ## open an existing file from path, or create it
    if path is not None:
        assert data is None and file_obj is None, \
            'Must specify only path or data or file_obj'
        if os.path.exists(path):
            ## if the file is there, then use mode
            if mode not in ['rb', 'ab']:
                exc = IOError('mode=%r would overwrite existing %s' % (mode, path))
                exc.errno = errno.EEXIST
                raise exc
            if path.endswith('.xz'):
                if xz is None:
                    if mode != 'rb':
                        raise Exception(
                            'backports.lzma is not installed and mode=%r but only "rb" is allowed without backports.lzma' % mode)
                    ## launch xz child
                    xz_child = subprocess.Popen(['xzcat', path],
                                                stdout=subprocess.PIPE,
                                                stderr=subprocess.PIPE)
                    file_obj = xz_child.stdout
                    ## what to do with stderr
                else:
                    file_obj = xz.open(path, mode)
            elif path.endswith('.gz'):
                assert mode == 'rb', 'mode=%r for .gz' % mode
                file_obj = gz.open(path)
            elif path.endswith('.xz.gpg'):
                assert mode == 'rb', 'mode=%r for .xz' % mode
                ## launch xz child
                xz_child = subprocess.Popen(
                    ['gpg -d %s | xz --decompress' % path],
                    stdout=subprocess.PIPE, shell=True)
                    #stderr=subprocess.PIPE)
                file_obj = xz_child.stdout
                ## what to do with stderr?
            else:
                file_obj = open(path, mode)
        else:
            ## otherwise make one for writing
            if mode not in ['wb', 'ab']:
                exc = IOError('%s does not exist but mode=%r' % (path, mode))
                exc.errno = errno.ENOENT
                raise exc
            dirname = os.path.dirname(path)
            if dirname and not os.path.exists(dirname):
                os.makedirs(dirname)
            if path.endswith('.gz'):
                file_obj = gz.open(path, mode)
            elif path.endswith('.xz'):
                if xz is None:
                    raise Exception(
                        'file extension is .xz but backports.lzma is not installed')
                file_obj = xz.open(path, mode)
            else:
                file_obj = open(path, mode)

    ## if created without any arguments, then prepare to add
    ## messages to an in-memory file object
    if data is None and file_obj is None:
        ## make the default behavior when instantiated as Chunk()
        ## to write to an in-memory buffer
        file_obj = StringIO()
        self.mode = 'wb'
        mode = self.mode
    elif file_obj is None:
        ## --> must have 'data'
        ## wrap the data in a file obj for reading
        if mode == 'rb':
            file_obj = StringIO(data)
            file_obj.seek(0)
        elif mode == 'ab':
            file_obj = StringIO()
            file_obj.write(data)
            ## and let it just keep writing to it
        else:
            raise Exception('mode=%r but specified "data"' % mode)
    elif file_obj is not None and hasattr(file_obj, 'mode'):
        if isinstance(file_obj.mode, int):
            ## some tools, like python gzip library, use int modes
            file_obj_mode = {1: 'r', 2: 'w'}[file_obj.mode]
        else:
            file_obj_mode = file_obj.mode
        assert file_obj_mode[0] == mode[0], 'file_obj.mode=%r != %r=mode' \
            % (file_obj_mode, mode)

    ## use the file object for writing out the data as it
    ## happens, i.e. in streaming mode.
    if mode in ['ab', 'wb']:
        if inline_md5:
            self._o_chunk_fh = md5_file(file_obj)
        else:
            self._o_chunk_fh = file_obj
    else:
        assert mode == 'rb', mode
        if inline_md5:
            self._i_chunk_fh = md5_file(file_obj)
        else:
            self._i_chunk_fh = file_obj
def __init__(self):
    self.empty = True
    self.buffer = io.BytesIO()
    self.dump = lzma.open(self.buffer, mode="wt", preset=9)
    self.dump.write(u"[")
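# Hedged sketch (not from the original source): one way the writer initialized above
# could be rounded out into a streaming JSON-array dumper backed by an in-memory xz
# buffer. The class name and the add()/close() methods are assumptions for
# illustration only, not the original project's API.
import io
import json
import lzma

class XzJsonArrayWriter(object):
    """Stream a JSON array into an in-memory xz-compressed buffer."""

    def __init__(self):
        self.empty = True
        self.buffer = io.BytesIO()
        self.dump = lzma.open(self.buffer, mode="wt", preset=9)
        self.dump.write(u"[")

    def add(self, record):
        # comma-separate all records after the first one
        if not self.empty:
            self.dump.write(u",")
        self.dump.write(json.dumps(record))
        self.empty = False

    def close(self):
        # close the JSON array and flush the xz stream into self.buffer
        self.dump.write(u"]")
        self.dump.close()
        return self.buffer.getvalue()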
def find_sequence(input_file, feature_name, sequence_filter, feature_regex,
                  build_dir, sample_id, read_num, minimum_trimmed_length,
                  max_dist_from_edge, log_handle):
    """
    Loads a collection of RNA-Seq reads and filters the reads so as to only
    return those containing a specified sequence of interest.

    Arguments
    ---------
    input_file: str
        Filepath to a FASTQ file containing reads to scan.
    feature_name: str
        Type of feature being searched for; used in naming files and
        directories and in choosing logs to write to. [sl|polya]
    sequence_filter: str
        A short sequence string used for initial filtering. All reads will be
        checked to see if it contains this string, and those that do will be
        further checked using a regular expression to find the location of
        the match.
    feature_regex: str
        A regular expression string indicating the exact sequence to be
        searched for. This will be either a set of spliced leader prefixes or
        suffixes, or a string of A's or T's, possibly anchored at one end of
        the read.
    build_dir: str
        Base directory to save output to.
    sample_id: str
        ID of the sample being scanned.
    read_num: str
        Which of the mated reads should be scanned. [1|2]
    minimum_trimmed_length: int
        Minimum length of read allowed after matching feature is trimmed.
    max_dist_from_edge: int
        Maximum distance SL/Poly(A) feature can be from the edge of read.
    log_handle: logging.Handle
        Handler to use for logging.

    Output files
    ------------
    There are three possible sets of output files for this function depending
    on whether the input read comes from a mated-pair of reads or a single
    read, and whether (for the case of mated-pair reads) it is the left read
    or right read:

    1. Sequence found in R1
        *_1_1_xxx_untrimmed.fastq
        *_1_1_xxx_trimmed.fastq
        *_1_2.fastq
    2. Sequence found in R2
        *_2_2_xxx_untrimmed.fastq
        *_2_2_xxx_trimmed.fastq
        *_2_1.fastq
    """
    #--------------------------------------
    # FASTQ row indices
    #--------------------------------------
    ID_IDX = 0
    SEQUENCE_IDX = 1
    QUALITY_IDX = 2

    log_handle.info("# Processing %s" % os.path.basename(input_file))

    # determine whether regular expression is anchored
    anchored = (feature_regex.startswith("^") or feature_regex.endswith("$"))

    # list to keep track of potential matches
    matches = []

    # output filepaths
    output_base = '%s/%s/fastq/%s_%s_%s' % (
        build_dir, sample_id, sample_id, read_num, read_num[-1]
    )

    # determine compression type to use
    file_ext = os.path.splitext(input_file)[-1]

    # for uncompressed fastq files, we don't need the final extension
    if file_ext not in ['.gz', '.xz']:
        file_ext = ''

    output_untrimmed = "%s_%s_untrimmed.fastq%s" % (output_base, feature_name, file_ext)
    output_trimmed = "%s_%s_trimmed.fastq%s" % (output_base, feature_name, file_ext)

    # Also keep track of match lengths which will be used for more rigorous
    # filtering when comparing to the genome sequence near where the read is
    # mapped.
    match_lengths_dir = os.path.join(build_dir, sample_id, 'results')
    output_lengths = "%s/match_lengths_%s.csv.gz" % (match_lengths_dir, read_num)
    match_lengths_fp = gzip.open(output_lengths, 'wb')

    # mated reads
    read_num_other = "1" if read_num == "2" else "2"
    input_file_mated = input_file.replace("." + read_num[-1],
                                          "." + read_num_other[-1])
    output_mated_reads = "%s_%s.fastq%s" % (output_base[:-2], read_num_other, file_ext)

    # compile regex
    read_regex = re.compile(feature_regex)

    # total number of reads
    num_reads = num_lines(input_file) / 4

    # Start sample log
    log_handle.info("# Scanning %d reads for %s" % (num_reads, feature_name))
    log_handle.info("# Using Regex pattern:\n %s" % feature_regex)

    # open output string buffer (will write to compressed file later)
    reads_trimmed = cStringIO.StringIO()
    reads_untrimmed = cStringIO.StringIO()
    mated_reads_buffer = cStringIO.StringIO()

    # Keep track of matched read IDs
    read_ids = []

    # Keep track of ways in which reads are filtered out
    num_filtered_no_seq_match = 0
    num_filtered_too_small = 0
    num_filtered_far_from_edge = 0

    # Find all reads containing the sequence of interest
    if file_ext == '.gz':
        fastq = gzip.open(input_file, 'rb')
        fastq_mated = gzip.open(input_file_mated, 'rb')
    elif file_ext == '.xz':
        import backports.lzma as lzma
        fastq = lzma.open(input_file, 'rb')
        fastq_mated = lzma.open(input_file_mated, 'rb')
    else:
        fastq = open(input_file, 'r')
        fastq_mated = open(input_file_mated, 'r')

    # iterate over mated reads at same time
    mated_reads = readfq(fastq_mated)

    for i, read in enumerate(readfq(fastq)):
        # get mated read
        mated_read = mated_reads.next()

        # ignore any reads that don't contain at least the smallest part of
        # the sequence of interest; this just speeds up the search so we
        # don't have to use regex on all reads
        if sequence_filter not in read[SEQUENCE_IDX]:
            num_filtered_no_seq_match += 1
            continue

        # check for match
        # When looking for internal matches, there may be multiple hits. Choose
        # the one that is closest to the edge of the read where the feature is
        # expected to be found.
        try:
            # for polya, reverse sequence to find match closest to right side
            if (feature_name == 'polya' and (not anchored)):
                match = re.search(read_regex, read[SEQUENCE_IDX][::-1])
                match_start = len(read[SEQUENCE_IDX]) - match.end()
                match_end = len(read[SEQUENCE_IDX]) - match.start()
            else:
                match = re.search(read_regex, read[SEQUENCE_IDX])

                # for anchored reads, its possible that a read passes the
                # quick filter check but the regex does not match
                if match is None:
                    num_filtered_no_seq_match += 1
                    continue

                match_start = match.start()
                match_end = match.end()
        except:
            import pdb; pdb.set_trace();

        # match length
        match_length = match.end() - match.start()

        # For SL sequence, trim everything up to end of match
        if feature_name == 'sl':
            trimmed_read = [read[ID_IDX],
                            read[SEQUENCE_IDX][match_end:],
                            "+",
                            read[QUALITY_IDX][match_end:]]
        else:
            # otherwise trim from the start of the match to the end of the read
            trimmed_read = [read[ID_IDX],
                            read[SEQUENCE_IDX][:match_start],
                            "+",
                            read[QUALITY_IDX][:match_start]]

        # skip reads that are less than the required amount after trimming
        if len(trimmed_read[SEQUENCE_IDX]) < minimum_trimmed_length:
            num_filtered_too_small += 1
            continue

        # length of portion trimmed off
        trimmed_part_length = (len(read[SEQUENCE_IDX]) -
                               len(trimmed_read[SEQUENCE_IDX]))

        # for internal matches, skip reads where match is not close enough to
        # the edge of the read
        if (trimmed_part_length - match_length) > max_dist_from_edge:
            num_filtered_far_from_edge += 1
            continue

        # write length
        match_lengths_fp.write(",".join([read[ID_IDX], str(match_length)]) + "\n")

        # take reverse complement if requested
        # this will return the read back to the expected orientation (SL
        # upstream/Poly(A) downstream)
        #if reverse:
        #    trimmed_read[SEQUENCE_IDX] = str(
        #        Seq.Seq(trimmed_read[SEQUENCE_IDX]).reverse_complement())
        #    trimmed_read[QUALITY_IDX] = trimmed_read[QUALITY_IDX][::-1]

        # Otherwise add trimmed read to output
        reads_trimmed.write("\n".join(trimmed_read) + "\n")

        # Also save complete (untrimmed) reads containing the matched sequence.
        # By mapping these reads to the genome we can eliminate false hits;
        # i.e. reads that contain a portion of the sequence of interest but
        # are not actual trans-splicing / poly-adenylation reads.
        untrimmed_read = [read[ID_IDX],
                          read[SEQUENCE_IDX],
                          "+",
                          read[QUALITY_IDX]]
        reads_untrimmed.write("\n".join(untrimmed_read) + "\n")

        # paired-end reads
        untrimmed_mated_read = [mated_read[ID_IDX],
                                mated_read[SEQUENCE_IDX],
                                "+",
                                mated_read[QUALITY_IDX]]
        mated_reads_buffer.write("\n".join(untrimmed_mated_read) + "\n")

        # save id
        read_ids.append(read[ID_IDX])

    # log numbers
    log_handle.info("# Found %d reads with possible %s fragment" %
                    (len(read_ids), feature_name))
    log_handle.info("# Excluded %d reads with no feature matches." %
                    num_filtered_no_seq_match)
    log_handle.info("# Excluded %d reads which were too short after trimming." %
                    num_filtered_too_small)
    log_handle.info("# Excluded %d reads with matched feature too far from read edge." %
                    num_filtered_far_from_edge)

    # Create output directory
    output_dir = os.path.dirname(output_base)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, mode=0o755)

    # write trimmed and untrimmed reads to fastq.gz / fastq.xz
    compress_str(output_trimmed, reads_trimmed)
    compress_str(output_untrimmed, reads_untrimmed)
    compress_str(output_mated_reads, mated_reads_buffer)

    # clean up
    fastq.close()
    fastq_mated.close()
    reads_trimmed.close()
    reads_untrimmed.close()
    mated_reads_buffer.close()
    match_lengths_fp.close()

    log_handle.info("# Finished processing %s" % os.path.basename(input_file))
parser.add_argument('-k', '--k_trials', type=int, default=100)
parser.add_argument('-n', '--n_words', type=int, default=5000)
parser.add_argument('files', nargs='+')
args = parser.parse_args()

type_counter = Counter()
for filename in args.files:
    with codecs.open(filename, encoding='utf-8', errors='ignore') as f:
        for line in f:
            line = line.strip()
            if line:
                if not re.match('#|[0-9]+[-.][0-9]+', line):
                    type_counter[line.split('\t')[1]] += 1

types = type_counter.keys()
total = sum(type_counter.values())
probs = [type_counter[type_] / total for type_ in types]

trials = []
n_words = min(args.n_words, len(types)) or len(types)
for _ in xrange(args.k_trials):
    chosen_types = np.random.choice(types, size=n_words, replace=False, p=probs)
    with codecs.open('uncompressed.txt', 'w', encoding='utf-8', errors='ignore') as f:
        f.write('\n'.join(chosen_types))
    with lzma.open('compressed.txt.xz', 'wb') as f:
        f.write('\n'.join(chosen_types).encode('utf-8', 'ignore'))
    trials.append(os.path.getsize('compressed.txt.xz') / os.path.getsize('uncompressed.txt'))
    os.remove('uncompressed.txt')
    os.remove('compressed.txt.xz')

print(np.mean(trials))
def __init__(self, path=None, data=None, file_obj=None, mode='rb',
             message=StreamItem_v0_3_0,
             read_wrapper=None, write_wrapper=None,
             ):
    '''Load a chunk from an existing file handle or buffer of data.
    If no data is passed in, then chunk starts as empty and
    chunk.add(message) can be called to append to it.

    mode is only used if you specify a path to an existing file to open.

    :param path: path to a file in the local file system. If path ends in .xz
    then mode must be 'rb' and the entire file is loaded into memory and
    decompressed before the Chunk is ready for reading.

    :param mode: read/write mode for opening the file; if mode='wb', then a
    file will be created.

    :file_obj: already opened file, mode must agree with mode parameter.

    :param data: bytes of data from which to read messages

    :param message: defaults to StreamItem_v0_3_0; you can specify your own
    Thrift-generated class here.

    :param read_wrapper: a function that takes a deserialized message as
    input and returns a new object to yield from __iter__

    :param write_wrapper: a function used in Chunk.add(obj) that takes the
    added object as input and returns another object that is a thrift class
    that can be serialized.
    '''
    if not fastbinary_import_failure:
        logger.debug('using TBinaryProtocolAccelerated (fastbinary)')
    else:
        logger.warn('import fastbinary failed; falling back to 15x slower TBinaryProtocol: %r'
                    % fastbinary_import_failure)

    self.read_wrapper = read_wrapper
    self.write_wrapper = write_wrapper

    allowed_modes = ['wb', 'ab', 'rb']
    assert mode in allowed_modes, 'mode=%r not in %r' % (mode, allowed_modes)
    self.mode = mode

    ## class for constructing messages when reading
    self.message = message

    ## initialize internal state before figuring out what data we
    ## are acting on
    self._count = 0
    self._md5_hexdigest = None

    ## might not have any output parts
    self._o_chunk_fh = None
    self._o_transport = None
    self._o_protocol = None

    ## might not have any input parts
    self._i_chunk_fh = None
    self._i_transport = None
    self._i_protocol = None

    ## open an existing file from path, or create it
    if path is not None:
        assert data is None and file_obj is None, \
            'Must specify only path or data or file_obj'
        if os.path.exists(path):
            ## if the file is there, then use mode
            if mode not in ['rb', 'ab']:
                exc = IOError('mode=%r would overwrite existing %s' % (mode, path))
                exc.errno = errno.EEXIST
                raise exc
            if path.endswith('.xz'):
                if lzma is None:
                    if mode != 'rb':
                        raise Exception(
                            'backports.lzma is not installed and mode=%r but only "rb" is allowed without backports.lzma' % mode)
                    ## launch xz child
                    xz_child = subprocess.Popen(
                        ['xzcat', path],
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE)
                    file_obj = xz_child.stdout
                    ## what to do with stderr
                else:
                    file_obj = lzma.open(path, mode)
            elif path.endswith('.gz'):
                assert mode == 'rb', 'mode=%r for .gz' % mode
                file_obj = gzip.open(path)
            elif path.endswith('.xz.gpg'):
                assert mode == 'rb', 'mode=%r for .xz' % mode
                ## launch xz child
                xz_child = subprocess.Popen(
                    ['gpg -d %s | xz --decompress' % path],
                    stdout=subprocess.PIPE, shell=True)
                    #stderr=subprocess.PIPE)
                file_obj = xz_child.stdout
                ## what to do with stderr?
            else:
                file_obj = open(path, mode)
        else:
            ## otherwise make one for writing
            if mode not in ['wb', 'ab']:
                exc = IOError('%s does not exist but mode=%r' % (path, mode))
                exc.errno = errno.ENOENT
                raise exc
            dirname = os.path.dirname(path)
            if dirname and not os.path.exists(dirname):
                os.makedirs(dirname)
            if path.endswith('.gz'):
                file_obj = gzip.open(path, mode)
            elif path.endswith('.xz'):
                if lzma is None:
                    raise Exception(
                        'file extension is .xz but backports.lzma is not installed')
                file_obj = lzma.open(path, mode)
            else:
                file_obj = open(path, mode)

    ## if created without any arguments, then prepare to add
    ## messages to an in-memory file object
    if data is None and file_obj is None:
        ## make the default behavior when instantiated as Chunk()
        ## to write to an in-memory buffer
        file_obj = StringIO()
        self.mode = 'wb'
        mode = self.mode
    elif file_obj is None:
        ## --> must have 'data'
        ## wrap the data in a file obj for reading
        if mode == 'rb':
            file_obj = StringIO(data)
            file_obj.seek(0)
        elif mode == 'ab':
            file_obj = StringIO()
            file_obj.write(data)
            ## and let it just keep writing to it
        else:
            raise Exception('mode=%r but specified "data"' % mode)
    elif file_obj is not None and hasattr(file_obj, 'mode'):
        if isinstance(file_obj.mode, int):
            ## some tools, like python gzip library, use int modes
            file_obj_mode = {1: 'r', 2: 'w'}[file_obj.mode]
        else:
            file_obj_mode = file_obj.mode
        assert file_obj_mode[0] == mode[0], 'file_obj.mode=%r != %r=mode' \
            % (file_obj_mode, mode)

    ## use the file object for writing out the data as it
    ## happens, i.e. in streaming mode.
    if mode in ['ab', 'wb']:
        self._o_chunk_fh = md5_file(file_obj)
        self._o_transport = TTransport.TBufferedTransport(self._o_chunk_fh)
        self._o_protocol = protocol(self._o_transport)
    else:
        assert mode == 'rb', mode
        self._i_chunk_fh = md5_file(file_obj)
# pip install requests backports.lzma streamcorpus
## installation on CentOS/RHEL is similar using yum instead of apt-get
from backports import lzma
import requests
from streamcorpus import Chunk, decrypt_and_uncompress, compress_and_encrypt

logging.basicConfig()
logger = logging.getLogger()

s3_http_host = 'https://aws-publicdatasets.s3.amazonaws.com/'
s3_path_prefix = 'trec/dd/local-politics-streamcorpus-v0_3_0/'
s3_paths_fname = 'local-politics-streamcorpus-v0_3_0-s3-paths.txt.xz'
if not os.path.exists(s3_paths_fname):
    sys.exit('please download %strec/dd/%s' % (s3_http_host, s3_paths_fname))

for path in lzma.open(s3_paths_fname):
    s3_path = s3_path_prefix + path.strip()
    url = s3_http_host + s3_path
    logger.info(url)
    retries = 0
    max_retries = 10
    while retries < max_retries:
        retries += 1
        sys.stderr.flush()
        try:
            resp = requests.get(url)
            errors, data = decrypt_and_uncompress(resp.content, gpg_private='trec-kba-rsa')
            logger.info('\n'.join(errors))
            for si in Chunk(file_obj=StringIO(data)):
                ## (snippet truncated here; per-item handling continues as in
                ## the cca_items example above)
                pass
def compressSingle(fileName, compressionMethod):
    testFile = open(fileName, 'r')

    # standard filter values
    dict_size = 1610612736
    lc = 1
    lp = 0
    pb = 2
    mode = lzma.MODE_NORMAL
    nice_len = 273
    mf = lzma.MF_BT2
    depth = 0

    # Filters for various LZMA compressions
    BJC = lzma.FILTER_ARM
    BJC_LZMA2_filter = [
        {"id": BJC},
        {"id": lzma.FILTER_LZMA2, "lc": lc, "lp": lp, "pb": pb, "mode": mode, "mf": mf,
         "nice_len": nice_len, "dict_size": dict_size, "depth": depth}
    ]
    LZMA2_HPB_filter = [
        {"id": lzma.FILTER_LZMA2, "lc": lc, "lp": lp, "pb": 4, "mode": mode, "mf": mf,
         "nice_len": nice_len, "dict_size": dict_size, "depth": depth}
    ]
    std_LZMA2_filter = [
        {"id": lzma.FILTER_LZMA2, "lc": lc, "lp": lp, "pb": pb, "mode": mode, "mf": mf,
         "nice_len": nice_len, "dict_size": dict_size, "depth": depth}
    ]
    std_LZMA1_filter = [
        {"id": lzma.FILTER_LZMA1, "lc": lc, "lp": lp, "pb": pb, "mode": mode, "mf": mf,
         "nice_len": nice_len, "dict_size": dict_size, "depth": depth}
    ]
    LZMA2_DELTA1_filter = [
        {"id": lzma.FILTER_DELTA, "dist": 1},
        {"id": lzma.FILTER_LZMA2, "lc": lc, "lp": lp, "pb": pb, "mode": mode, "mf": mf,
         "nice_len": nice_len, "dict_size": dict_size, "depth": depth}
    ]
    LZMA2_HLP_filter = [
        {"id": lzma.FILTER_LZMA2, "lc": 0, "lp": 4, "pb": pb, "mode": mode, "mf": mf,
         "nice_len": nice_len, "dict_size": dict_size, "depth": depth}
    ]
    LZMA2_HLC_filter = [
        {"id": lzma.FILTER_LZMA2, "lc": 4, "lp": 0, "pb": pb, "mode": mode, "mf": mf,
         "nice_len": nice_len, "dict_size": dict_size, "depth": depth}
    ]

    # zip, no compression
    if compressionMethod == 0:
        compressionMethod = '.zip'
        compressedName = (fileName + compressionMethod)
        with zipfile.ZipFile(compressedName, 'w', zipfile.ZIP_STORED) as newFile:
            newFile.write(fileName)
            newFile.close()

    # zip deflate
    if compressionMethod == 1:
        compressionMethod = '.zip'
        compressedName = (fileName + compressionMethod)
        with zipfile.ZipFile(compressedName, 'w', zipfile.ZIP_DEFLATED) as newFile:
            newFile.write(fileName)
            newFile.close()

    # bzip
    if compressionMethod == 2:
        compressionMethod = '.bz'
        compressedName = (fileName + compressionMethod)
        with bz2.BZ2File(compressedName, 'w') as newFile:
            newFile.write(testFile.read())
            newFile.close()

    # lzma modes
    # std lzma2
    if compressionMethod == 3:
        compressionMethod = '.lzma'
        compressedName = (fileName + compressionMethod)
        newFile = lzma.open(compressedName, 'wb', filters=std_LZMA2_filter)
        newFile.write(testFile.read())
        newFile.close()

    # std lzma 1
    if compressionMethod == 4:
        compressionMethod = '.lzma'
        compressedName = (fileName + compressionMethod)
        newFile = lzma.open(compressedName, 'wb', filters=std_LZMA1_filter,
                            format=lzma.FORMAT_ALONE)
        newFile.write(testFile.read())
        newFile.close()

    # delta LZMA2
    if compressionMethod == 5:
        compressionMethod = '.lzma'
        compressedName = (fileName + compressionMethod)
        newFile = lzma.open(compressedName, 'wb', filters=LZMA2_DELTA1_filter)
        newFile.write(testFile.read())
        newFile.close()

    # HLP lzma
    if compressionMethod == 6:
        compressionMethod = '.lzma'
        compressedName = (fileName + compressionMethod)
        newFile = lzma.open(compressedName, 'wb', filters=LZMA2_HLP_filter)
        newFile.write(testFile.read())
        newFile.close()

    # HLC lzma
    if compressionMethod == 7:
        compressionMethod = '.lzma'
        compressedName = (fileName + compressionMethod)
        newFile = lzma.open(compressedName, 'wb', filters=LZMA2_HLC_filter)
        newFile.write(testFile.read())
        newFile.close()

    # HPB lzma
    if compressionMethod == 8:
        compressionMethod = '.lzma'
        compressedName = (fileName + compressionMethod)
        newFile = lzma.open(compressedName, 'wb', filters=LZMA2_HPB_filter)
        newFile.write(testFile.read())
        newFile.close()

    # ARM lzma
    if compressionMethod == 9:
        BJC = lzma.FILTER_ARM
        BJC_LZMA2_filter[0].update({"id": BJC})
        compressionMethod = '.lzma'
        compressedName = (fileName + compressionMethod)
        newFile = lzma.open(compressedName, 'wb', filters=BJC_LZMA2_filter)
        newFile.write(testFile.read())
        newFile.close()

    # FILTER_ARMTHUMB lzma
    if compressionMethod == 10:
        BJC = lzma.FILTER_ARMTHUMB
        BJC_LZMA2_filter[0].update({"id": BJC})
        compressionMethod = '.lzma'
        compressedName = (fileName + compressionMethod)
        newFile = lzma.open(compressedName, 'wb', filters=BJC_LZMA2_filter)
        newFile.write(testFile.read())
        newFile.close()

    # IA64 lzma
    if compressionMethod == 11:
        BJC = lzma.FILTER_IA64
        BJC_LZMA2_filter[0].update({"id": BJC})
        compressionMethod = '.lzma'
        compressedName = (fileName + compressionMethod)
        newFile = lzma.open(compressedName, 'wb', filters=BJC_LZMA2_filter)
        newFile.write(testFile.read())
        newFile.close()

    # POWERPC lzma
    if compressionMethod == 12:
        BJC = lzma.FILTER_POWERPC
        BJC_LZMA2_filter[0].update({"id": BJC})
        compressionMethod = '.lzma'
        compressedName = (fileName + compressionMethod)
        newFile = lzma.open(compressedName, 'wb', filters=BJC_LZMA2_filter)
        newFile.write(testFile.read())
        newFile.close()

    # SPARC lzma
    if compressionMethod == 13:
        BJC = lzma.FILTER_SPARC
        BJC_LZMA2_filter[0].update({"id": BJC})
        compressionMethod = '.lzma'
        compressedName = (fileName + compressionMethod)
        newFile = lzma.open(compressedName, 'wb', filters=BJC_LZMA2_filter)
        newFile.write(testFile.read())
        newFile.close()

    # X86 lzma
    if compressionMethod == 14:
        BJC = lzma.FILTER_X86
        BJC_LZMA2_filter[0].update({"id": BJC})
        compressionMethod = '.lzma'
        compressedName = (fileName + compressionMethod)
        newFile = lzma.open(compressedName, 'wb', filters=BJC_LZMA2_filter)
        newFile.write(testFile.read())
        newFile.close()

    # GZIP
    if compressionMethod == 15:
        compressionMethod = '.gz'
        compressedName = (fileName + compressionMethod)
        newFile = gzip.open(compressedName, 'w')
        newFile.write(testFile.read())
        newFile.close()

    # zlib
    if compressionMethod == 16:
        compressionMethod = '.gzip'
        compressedName = (fileName + compressionMethod)
        newFile = open(compressedName, 'w')
        compressedContent = zlib.compress(testFile.read(), 9)
        newFile.write(compressedContent)
        newFile.close()
import wget
import tarfile

# if using python2
if (sys.version_info < (3, 0)):
    from backports import lzma
    from contextlib import closing

for directory in ['rtems-source-builder', 'rtems', 'rtems-examples']:
    vprint('Downloading ' + options.release_url + '/sources/' + directory + '-' +
           options.tag + '.tar.xz...')
    wget.download(options.release_url + '/sources/' + directory + '-' +
                  options.tag + '.tar.xz', options.dir)
    vprint('Done...')
    vprint('Unpacking ' + options.dir + '/' + directory + '-' + options.tag + '.tar.xz...')
    # if using python3
    if (sys.version_info > (3, 0)):
        with tarfile.open(options.dir + '/' + directory + '-' + options.tag + '.tar.xz') as f:
            f.extractall('.')
    else:
        with closing(lzma.open(options.dir + '/' + directory + '-' + options.tag + '.tar.xz')) as xz:
            with tarfile.open(fileobj=xz) as f:
                f.extractall('.')
    vprint('Done...')
    os.remove(options.dir + '/' + directory + '-' + options.tag + '.tar.xz')
    os.rename(options.dir + '/' + directory + '-' + options.tag,
              options.dir + '/' + directory)

vprint('\n\nScript finished successfully...')
sys.exit(0)
def _from_file(clazz, filename, header_only=False, strict=False):
    """
    :param filename: name of the file to read from
    :type filename: string
    :param header_only: read header only
    :rtype: Graph
    :return: imported hypergraph
    """
    num_edges = None
    num_verts = None
    is_dimacs = False
    stream = None
    graph = clazz()
    try:
        mtype = mimetypes.guess_type(filename)[1]
        if mtype is None:
            stream = open(filename, 'r')
        elif mtype == 'bzip2':
            stream = BZ2File(filename, 'r')
        elif mtype == 'gz' or mtype == 'gzip':
            stream = gzip.open(filename, 'r')
        elif mtype == 'xz' and xz:
            stream = xz.open(filename, 'r')
        else:
            raise IOError('Unknown input type "%s" for file "%s"' % (mtype, filename))

        nr = 0
        header_seen = False
        for line in stream:
            nr += 1
            line = line.split()
            if line == [] or line[0] in ('x', 'n'):
                continue
            elif line[0] == 'p':
                if header_seen:
                    logging.critical('L(%s). Duplicate header. Exiting.' % nr)
                    exit(3)
                if len(line) > 4:
                    logging.critical('L(%s). Too many arguments. Exiting.' % nr)
                    exit(3)
                is_dimacs = line[1] == 'edge'
                is_formula = line[1] == 'cnf'
                num_verts = int(line[2])
                num_edges = int(line[3])
                if header_only:
                    return num_verts, num_edges
                if num_verts == 0:
                    logging.warning("Empty graph.")
                    return graph
                header_seen = True
            elif line[0] != 'c' and (
                    is_dimacs or (line[0] != 'a' and line[0] != 'e')):
                # now also ignores forAll and Exists :P
                if not header_seen:
                    logging.critical('L(%s). Lines before header. Exiting.' % nr)
                    exit(3)
                try:
                    if is_dimacs:
                        graph.add_edge(int(line[1]), int(line[2]))
                    elif is_formula:
                        atoms = map(lambda x: abs(int(x)), line[0:-1])
                        #print("formula{0}".format(atoms))
                        for i in atoms:
                            for j in atoms:
                                if i < j:
                                    # abs -> then it also works for qbf
                                    graph.add_edge(i, j)
                        num_edges += (len(atoms) * (len(atoms) - 1)) / 2 - 1
                    else:
                        graph.add_edge(int(line[0]), int(line[1]))
                    assert (0 not in graph.nodes())
                except ValueError, e:
                    logging.critical('L(%s). Invalid integer. Exiting.' % nr)
                    logging.critical('Error was: %s' % e)
                    exit(3)
                except IndexError, e:
                    logging.critical('L(%s). Incomplete edge. Exiting' % nr)
                    logging.critical('Error was: %s' % e)
                    exit(3)
def read_ascii_file(spectrum_file, start_row=0, end_row=-1, blackbody=False,
                    stellar_radius=1.0, distance=1., include_dilution_factor=True):
    """ Reads in a BT-NextGen spectrum and outputs the wavelengths in microns
    and flux in W / m**2 / um

    start_row and end_row allow for reading only a subset of the file to save
    time (and to skip headers). But note that this way you can't read in the
    last row.

    blackbody : Set to true to return the blackbody spectrum instead
    stellar_radius : Stellar radius in solar radii
    distance : distance in pc
    """
    if spectrum_file.endswith('.bz2'):
        with bz2.BZ2File(spectrum_file, 'r') as myf:
            x = myf.read()
    elif spectrum_file.endswith('.gz'):
        with gzip.open(spectrum_file, 'r') as myf:
            x = myf.read()
    elif spectrum_file.endswith('.xz'):
        from backports import lzma  # This can be tricky to install...
        x = lzma.open(spectrum_file).read()
    else:
        raise IOError("Unrecognized file type: " + str(spectrum_file))

    start_row = np.int(start_row)
    end_row = np.int(end_row)

    data = x.split('\n')

    output = []
    all_output = []
    for ix, row in enumerate(data[start_row:end_row]):
        # remove duplicate white space, not sure how to do this easily
        temp = row.strip()
        for rep in range(50):
            temp = temp.replace('  ', ' ')

        # And change Ds to Es for the exponential
        temp = temp.replace('D', 'e')

        # Split it based on white space and append the results to the output array
        if temp != '':
            split = temp.split(' ')
            all_output.append(split)
            output.append(np.float64(split[0:3]))

    output = np.array(output)  # make it a numpy array so it is 2D
    if output.ndim != 2:
        raise Exception("Couldn't read the spectra file! " + spectrum_file)

    wavs = output[:, 0] * 1e-4  # turn to microns
    flux = output[:, 1]
    if blackbody:
        flux = output[:, 2]

    DF = -8.  # For "all most recent models"
    # DF = -28.9007901434  # For NextGen T > 5000K
    # DF = -26.9007901434  # For NextGen T < 5000K

    # convert flux to ergs/sec/cm**2/A
    # this is the conversion eqn from the website (https://phoenix.ens-lyon.fr/Grids/FORMAT)
    flux = 10**(flux + DF)

    # convert to W/m**2/um
    # 1 erg/s = 1e-7 W
    # 1 cm**-2 = 1e4 m**-2
    # 1 A**-1 = 1e4 um**-1
    flux_si = flux * 1e-7 * 1e4 * 1e4

    # flux2 = flux*units.erg/units.s/(units.cm**2)/units.angstrom
    # flux_si = flux2.to(units.watt/(units.m**2)/units.micron)
    # print 'median log flux:', np.median(np.log10(flux_si))

    # Scale by the distance and stellar radii
    # * (radius/distance)^2 in same units.
    rsol_pc = 2.25461e-8
    if include_dilution_factor:
        dilution_factor = (stellar_radius * rsol_pc / distance)**2
        flux_si *= dilution_factor

    return [wavs, flux_si]
def lzwrite(data, path):
    with lzma.open(path, "wb") as outfile:
        outfile.write(data)
def lzd(src, tgt):
    with lzma.open(src, "rb") as srcfile:
        with open(tgt, "wb") as tgtfile:
            tgtfile.write(srcfile.read())
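# A small usage sketch (not from the original sources) pairing the lzwrite/lzd helpers
# above for a compress/decompress round trip. It assumes those helpers and the lzma
# import are in scope; the file names are arbitrary illustrations.
payload = b"some bytes worth keeping"
lzwrite(payload, "payload.bin.xz")      # compress the bytes to an .xz file
lzd("payload.bin.xz", "payload.bin")    # decompress back to a plain file
with open("payload.bin", "rb") as fh:
    assert fh.read() == payload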
lp = 0
pb = 2
mode = lzma.MODE_NORMAL
nice_len = 273
mf = lzma.MF_BT2
depth = 0

std_LZMA2_filter = [
    {"id": lzma.FILTER_LZMA2, "lc": lc, "lp": lp, "pb": pb, "mode": mode, "mf": mf,
     "nice_len": nice_len, "dict_size": dict_size, "depth": depth}
]

start = time.time()
compressionMethod = 'LZMA_STD'
compressedName = (path + compressionMethod)
testFile.close()
testFile = open(path, 'r')
newFile = lzma.open(compressedName, 'wb', filters=std_LZMA2_filter)
newFile.write(testFile.read())
newFile.close()

print(compressionMethod + ": " + str(float(os.path.getsize(compressedName)) / 1024) + " Kb " +
      str(float(os.path.getsize(compressedName)) / float(os.path.getsize(path))))
print(str((time.time() - start) * 1000) + " miliseconds")

size = float(os.path.getsize(compressedName))
tm = time.time() - start
storageList.append(compressionMethod)
storageList.append(size)
storageList.append(size / uncompSize)
storageList.append(tm)
print()
os.remove(compressedName)
def create_assignments(self):
    # a segmentation problem can only have assignments if details about how
    # the assignments should be created are provided
    h_utils.require(self.has_details,
                    InvalidStateError(_('This segmentation problem is lacking '
                                        'details.')))

    # clear any previous assignments for this segmentation problem
    self.clear_assignments()

    details = self.details
    try:
        tmp_dir = tempfile.mkdtemp()

        # generate the tiles for the assignments
        tiles_info = h_dip.generate_tiles(
            img_path=self.image.path,
            tiles_dim=details.tiles_dimension,
            overlap_rel=details.tiles_overlap,
            border_rel=details.tiles_border,
            dst_path=tmp_dir,
            workable_checker=h_dip.simple_content_detection()
        )[0]

        assignments = []
        for info in tiles_info:
            a = self.assignments.model(
                seg_prob=self,
                tile_bbox_x0=info['cropped_bbox'][0],
                tile_bbox_y0=info['cropped_bbox'][1],
                tile_bbox_x1=info['cropped_bbox'][2],
                tile_bbox_y1=info['cropped_bbox'][3],
                workable=info['workable']
            )
            with open(info['path'], 'rb') as f:
                a.tile.save(os.path.basename(info['path']), File(f), save=False)
            assignments.append(a)

        if details.algorithm == u'LIVEVESSEL':
            files_temp_path = []
            live_script_rel_path = '../externals/matlab/livevessel_pre_process/'
            livevessel_preprocess_script = h_fs.get_absolute_path(live_script_rel_path,
                                                                  settings.PROJECT_ROOT)
            processes = []
            for a in assignments:
                tile_number = os.path.basename(os.path.splitext(a.tile.path)[0]).split('_')[1]
                basedir = 'liv_preprocess_' + tile_number
                temp_path = os.path.join(tmp_dir, basedir)
                files_temp_path.append(temp_path)
                code = "addpath(genpath('" + livevessel_preprocess_script + "'));"
                code = code + "offline('" + a.tile.path + "', '" + temp_path + "');"
                code = code + "exit;"
                processes.append(subprocess.Popen(
                    ["/usr/local/bin/matlab", "-nosplash", "-nodesktop", "-nojvm", "-r", code],
                    stdout=subprocess.PIPE, stderr=subprocess.PIPE))
            exit_code = [process.communicate() for process in processes]
            #print "next process"
            #out, err = process.communicate()

            index = 0
            for a in assignments:
                compressed_file_path = files_temp_path[index] + '.xz'
                with open(files_temp_path[index], 'rb') as f:
                    with lzma.open(compressed_file_path, 'wb') as xz_file:
                        xz_file.write(f.read())
                with open(compressed_file_path, 'rb') as f:
                    basename = os.path.basename(compressed_file_path)
                    a.preprocess_file.save(basename, File(f), save=False)
                index += 1

        if details.pre_seg:
            # generate pre segs. for the tiles
            pre_seg_tiles_info, border = h_dip.generate_tiles(
                img_path=details.pre_seg.path,
                tiles_dim=details.tiles_dimension,
                overlap_rel=details.tiles_overlap,
                border_rel=details.tiles_border,
                dst_path=tmp_dir,
                tiles_prefix='pre_seg_'
            )
            assert len(tiles_info) == len(pre_seg_tiles_info)
            for a, info in zip(assignments, pre_seg_tiles_info):
                img = Image.open(info['path'])
                img_w, img_h = img.size
                img.crop((border, border, img_w - border, img_h - border)).save(info['path'])
                with open(info['path'], 'rb') as f:
                    a.pre_seg.save(os.path.basename(info['path']), File(f), save=False)

        # effectively create the assignments
        try:
            self.assignments.bulk_create(assignments)
        except DatabaseError:
            debug_logger.exception('bulk creation of %d assignments '
                                   'failed... falling back to individual '
                                   'creation' % len(assignments))
            map(lambda a: a.save(), assignments)

        # in case of success, return the number of assignments created
        return len(assignments)
    except:
        # in case anything goes wrong, undo and log the problem to the
        # administrators
        self.clear_assignments()
        internal_errors_logger.exception('assignments for segmentation '
                                         'problem %d could not be '
                                         'created' % self.pk)
    finally:
        h_fs.rm(tmp_dir, ignore_errors=True)
def from_sat_file(clazz, filename, mode):
    header = {}
    clauses = []
    stream = None
    nr = 0

    def log_error(msg):
        logging.error(msg)
        exit(2)

    try:
        mtype = mimetypes.guess_type(filename)[1]
        if mtype is None:
            stream = open(filename, 'r')
        elif mtype == 'bzip2':
            stream = BZ2File(filename, 'r')
        elif mtype == 'gz' or mtype == 'gzip':
            stream = gzip.open(filename, 'r')
        elif mtype == 'xz' and xz:
            stream = xz.open(filename, 'r')
        else:
            raise IOError('Unknown input type "%s" for file "%s"' % (mtype, filename))

        for line in stream:
            if isinstance(line, bytes):
                line = line.decode("utf-8")
            if len(line.rstrip()) == 0:
                continue
            line = line.split()
            nr += 1
            if line[0] == "p":
                logging.info("Reading header")
                if len(header.keys()) != 0:
                    log_error("Multiple header in line {}".format(nr))
                if len(line) != 4:
                    log_error("Wrong header. expected 4 tokens (p cnf num_variables num_edges). "
                              "Got {} instead".format(len(line)))
                if line[1] != "cnf":
                    log_error("Expected cnf identifier. Got {} instead".format(line[1]))
                try:
                    header["num_variables"] = int(line[2])
                    header["num_clauses"] = int(line[3])
                except ValueError as e:
                    logging.error(e)
                    log_error("Invalid format for number of variables or clauses. Expected integer")
            elif line[0] == "c" or line[0] == "%" or line[0] == "w":
                logging.info("#" * 20 + "Reading comment")
                logging.info(" ".join(line))
            else:
                if len(header.keys()) == 0:
                    log_error("Reading edge before header in line {}".format(nr))
                try:
                    if int(line[-1]) == 0:
                        line = line[:-1]
                    line = [int(x) for x in line]
                except ValueError as e:
                    logging.error(e)
                    log_error("Invalid format for clause. Variables should be in integer")
                for v in line:
                    if abs(v) < 1 or abs(v) > header["num_variables"]:
                        log_error("Vertex {} out of bounds. Expected abs in range [1, {}]"
                                  .format(v, header["num_variables"]))
                if len(line) > 0:
                    clauses.append(line)
    finally:
        if stream:
            stream.close()

    t_sat = clazz.convert_to_threesat(header["num_variables"], clauses, mode)
    return t_sat