def open(tarname, nojson=False):
    """Iterate over a tar archive of bzip2-compressed chunks of lines.

    Each member of the tar is a bz2 stream of lines; every line is either
    yielded raw (``nojson=True``) or parsed as a JSON object.

    Use as an iterator, like this:

        for obj in tarchunk.open("blah.tar"):
            print o['text']

        for s in tarchunk.open("blah.tar", nojson=True):
            # s is a string
    """
    global good, bad
    archive = tarfile.open(tarname, mode='r|*')
    for member in archive:
        member_name = member.name
        try:
            chunk = archive.extractfile(member)
            if chunk is None:
                # Directories and other non-file members have no payload.
                continue
            stream = bz2.open(chunk)
            if nojson:
                yield from stream
            else:
                for raw in stream:
                    yield json.loads(raw.decode('utf8'))
            good += 1
        except Exception as e:
            print("Choked on {0}: {1}".format(member_name, e))
            bad += 1
def bz2_open(file, mode='r'):
    """Abstract the numerous ways BZ2 files are handled in Python.

    @param file:    The file path to open.
    @type file:     str
    @keyword mode:  The mode to open the file with.  Only the values of 'r'
                    and 'w' for reading and writing respectively are
                    supported.
    @type mode:     str
    @return:        The bzip2 file object.
    @rtype:         file object
    """
    # Reject anything but plain read/write modes.
    if mode not in ('r', 'w'):
        raise RelaxError("The mode '%s' must be one or 'r' or 'w'." % mode)

    # Without the bz2 module, nothing can be done.
    if not bz2_module:
        if mode == 'r':
            raise RelaxError("Cannot open the file %s, try uncompressing first. %s." % (file, bz2_module_message))
        else:
            raise RelaxError("Cannot create bzip2 file %s, the bz2 Python module cannot be found." % file)

    # Text-mode handling differs per interpreter generation.
    py3 = sys.version_info[0] == 3
    has_bz2_open = py3 and sys.version_info[1] >= 3

    if mode == 'r':
        if has_bz2_open:
            # Python >= 3.3 supports text mode directly.
            return bz2.open(file, 't')
        if py3:
            # Python 3.0-3.2: wrap the binary stream for text access.
            return io.TextIOWrapper(Bzip2Fixed(file, 'r'))
        # Python 2 text mode.
        return bz2.BZ2File(file, 'r')

    # Writing.
    if has_bz2_open:
        return bz2.open(file, 'wt')
    if py3:
        return io.TextIOWrapper(Bzip2Fixed(file, 'w'))
    return bz2.BZ2File(file, 'w')
def open_file(filename, mode, encoding=None):
    """Open *filename* with transparent (de)compression.

    "-" maps to stdin/stdout.  On read, the compression format is sniffed
    from magic bytes; on write, it is chosen from the file extension
    (.gz/.bz2/.xz).  Unless a binary mode was requested, the stream is
    wrapped in a line-buffered text layer using surrogateescape.
    """
    import sys, io
    binary = mode.endswith("b")
    mode = mode.rstrip("b") + "b"
    if mode.startswith("r"):
        fileobj = sys.stdin.buffer if filename == "-" else open(filename, mode)
        # Sniff the first bytes without consuming them.
        head = fileobj.peek(100)
        if head.startswith(b"\x1f\x8b\x08"):
            import gzip
            fileobj = gzip.open(fileobj, mode)
        elif head[0:3] == b"BZh" and head[4:10] == b"1AY&SY":
            import bz2
            fileobj = bz2.open(fileobj, mode)
        elif head.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
            import lzma
            fileobj = lzma.open(fileobj, mode)
    elif filename == "-":
        fileobj = sys.stdout.buffer
    elif filename.endswith(".gz"):
        import gzip
        fileobj = gzip.open(filename, mode)
    elif filename.endswith(".bz2"):
        import bz2
        fileobj = bz2.open(filename, mode)
    elif filename.endswith(".xz"):
        import lzma
        fileobj = lzma.open(filename, mode)
    else:
        fileobj = open(filename, mode)
    if binary:
        return fileobj
    return io.TextIOWrapper(fileobj, encoding=encoding,
                            errors="surrogateescape", line_buffering=True)
def _build_vocabulary(self, vocabulary_size):
    """
    Count words in the pages file and write a list of the most frequent
    words to the vocabulary file.

    The vocabulary starts with the '<unk>' placeholder followed by the
    (vocabulary_size - 1) most common words, one word per line.
    """
    counter = collections.Counter()
    with bz2.open(self._pages_path, 'rt') as pages:
        for page in pages:
            counter.update(page.strip().split())
    # Bug fix: the previous code did
    #     common = ['<unk>'] + counter.most_common(...)
    #     common = [x[0] for x in common]
    # which indexed into the *string* '<unk>' and wrote '<' into the
    # vocabulary.  Extract the words from the (word, count) tuples first.
    common = ['<unk>'] + [word for word, _ in
                          counter.most_common(vocabulary_size - 1)]
    with bz2.open(self._vocabulary_path, 'wt') as vocabulary:
        for word in common:
            vocabulary.write(word + '\n')
def _read_pages(self, url):
    """
    Extract plain words from a Wikipedia dump and store them to the pages
    file. Each page will be a line with words separated by spaces.
    """
    dump_path = download(url, self._cache_dir)
    with bz2.open(dump_path) as dump, \
            bz2.open(self._pages_path, 'wt') as pages:
        for _, element in etree.iterparse(dump, tag='{*}page'):
            if element.find('./{*}redirect') is not None:
                # Redirect stubs carry no article text worth keeping.
                continue
            text = element.findtext('./{*}revision/{*}text')
            pages.write(' '.join(self._tokenize(text)) + '\n')
            # Free the parsed element to keep memory bounded.
            element.clear()
def test_output_files(logger, args, result):
    """Publish a test's output files: plain copies of text/diff/RESULT
    files and bzip2-compressed copies of .log files, skipping anything
    already up to date in the destination."""
    if not args.publish_results:
        return
    # Only when there is an output directory does it need publishing.
    dstdir = _mkdir_test_output(logger, args, result)
    if not dstdir:
        return
    plain_pat = re.compile(r"(\.txt|\.diff|^RESULT)$")
    log_pat = re.compile(r"(\.log)$")
    for name in os.listdir(result.output_directory):
        src = os.path.join(result.output_directory, name)
        dst = os.path.join(dstdir, name)
        # Never copy a file onto itself.
        if os.path.isfile(dst) and os.path.samefile(src, dst):
            continue
        # Skip when the published copy is already newer than the source.
        if os.path.isfile(dst) and os.path.getmtime(src) < os.path.getmtime(dst):
            continue
        if plain_pat.search(name):
            logger.info("copying '%s' to '%s'", src, dst)
            shutil.copyfile(src, dst)
            continue
        # Compressed target: same freshness rules against the .bz2 name.
        dst = dst + ".bz2"
        if os.path.isfile(dst) and os.path.getmtime(src) < os.path.getmtime(dst):
            continue
        if log_pat.search(name):
            logger.info("compressing '%s' to '%s'", src, dst)
            with open(src, "rb") as f:
                data = f.read()
            with bz2.open(dst, "wb") as f:
                f.write(data)
            continue
def asHandle(fileNameOrHandle, mode='r'):
    """
    Decorator for file opening that makes it easy to open compressed files.
    Based on L{Bio.File.as_handle}.

    @param fileNameOrHandle: Either a C{str} or a file handle.
    @return: A generator that can be turned into a context manager via
        L{contextlib.contextmanager}.
    """
    if not isinstance(fileNameOrHandle, six.string_types):
        # Already an open handle: pass it straight through.
        yield fileNameOrHandle
    elif fileNameOrHandle.endswith('.gz'):
        if six.PY3:
            yield gzip.open(fileNameOrHandle, mode='rt', encoding='UTF-8')
        else:
            yield gzip.GzipFile(fileNameOrHandle)
    elif fileNameOrHandle.endswith('.bz2'):
        if six.PY3:
            yield bz2.open(fileNameOrHandle, mode='rt', encoding='UTF-8')
        else:
            yield bz2.BZ2File(fileNameOrHandle)
    else:
        with open(fileNameOrHandle) as fp:
            yield fp
def main():
    """Trim FASTQ reads to a fixed length and print them to stdout."""
    usage = 'usage: %prog [options] <trim_length> <fastq_file>'
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()
    if len(args) != 2:
        # parser.error() exits the program.
        parser.error('Must provide trim length and FASTQ file')
    trim_length = int(args[0])
    fastq_file = args[1]
    # Transparently handle gzip/bzip2-compressed input.
    if fastq_file.endswith('.gz'):
        fastq_in = gzip.open(fastq_file, 'rt')
    elif fastq_file.endswith('.bz2'):
        fastq_in = bz2.open(fastq_file, 'rt')
    else:
        fastq_in = open(fastq_file)
    header = fastq_in.readline().rstrip()
    while header:
        seq = fastq_in.readline().rstrip()
        mid = fastq_in.readline().rstrip()
        qual = fastq_in.readline().rstrip()
        # Trim sequence and quality to the requested length.
        print('%s\n%s\n%s\n%s' % (header, seq[:trim_length], mid,
                                  qual[:trim_length]))
        header = fastq_in.readline().rstrip()
    fastq_in.close()
def unbzip(fname):
    """Decompress the bzip2 file *fname* into a new temporary file.

    @param fname: path of the bzip2-compressed input file.
    @return: path of the temporary file holding the decompressed bytes;
        the caller is responsible for deleting it.
    """
    import os
    # Bug fixes versus the original:
    #  * mkstemp()'s file descriptor was leaked (only the name was used);
    #  * the temp file was opened in text mode while bz2 yields bytes,
    #    which raises TypeError on Python 3;
    #  * the bz2 handle was never closed.
    fd, tmp_fname = tempfile.mkstemp()
    with os.fdopen(fd, "wb") as tmpf, bz2.open(fname) as bz:
        tmpf.write(bz.read())
    return tmp_fname
def commit(self):
    """Write this node's values to disk atomically (tmp file + rename)."""
    if not self.to_write:
        raise DBException('Need to be in write mode to commit')
    try:
        os.makedirs(self.node_dir)
    except FileExistsError:
        pass  # Directory already present: fine.
    tmp_path = self.node_file + '.tmp'
    with bz2.open(tmp_path, 'wt', encoding='utf-8') as w:
        start_pos = self.key.get_last_key()
        if self.db.is_sparse:
            # Sparse layout: a tab-separated header of occupied absolute
            # positions, then one repr()'d value per line.
            poses = [i for i, val in enumerate(self._vals) if val is not None]
            vals = [self._vals[i] for i in poses]
            w.write('\t'.join([str(x + start_pos) for x in poses]))
            w.write('\n')
            for v in vals:
                w.write('%s\n' % repr(v))
        else:
            # Dense layout: every value, one per line.
            for v in self._vals:
                w.write('%s\n' % repr(v))
    # Atomic replace so readers never see a half-written node file.
    os.rename(tmp_path, self.node_file)
def xml_writer(self, filename):
    """Coroutine that receives text lines via send() and writes them out.

    Three modes: XOWA (echo to stdout), bz2-compressed output file, or a
    plain UTF-8 text file.  Closing the generator finalizes the file.
    """
    if self.xowa:
        try:
            while True:
                line = (yield)
                #f.write(line.encode('utf-8'))
                print(line, end='')
        except GeneratorExit:
            pass
        logging.info('XML-Stream: %s done.', filename)
    elif self.compress:
        with bz2.open(filename + '.bz2', 'w') as sink:
            try:
                while True:
                    sink.write((yield).encode('utf-8'))
            except GeneratorExit:
                pass
        logging.info('File: %s.bz2 done.', filename)
    else:
        with open(filename, 'w', encoding='utf-8') as sink:
            try:
                while True:
                    sink.write((yield))
            except GeneratorExit:
                pass
        logging.info('File: %s done.', filename)
def zopen(filename, *args, **kwargs):
    """
    This function wraps around the bz2, gzip and standard python's open
    function to deal intelligently with bzipped, gzipped or standard text
    files.

    Args:
        filename (str/Path): filename or pathlib.Path.
        \*args: Standard args for python open(..). E.g., 'r' for read, 'w'
            for write.
        \*\*kwargs: Standard kwargs for python open(..).

    Returns:
        File-like object. Supports with context.
    """
    if Path is not None and isinstance(filename, Path):
        filename = str(filename)
    ext = os.path.splitext(filename)[1].upper()
    if ext == ".BZ2":
        if PY_VERSION[0] >= 3:
            return bz2.open(filename, *args, **kwargs)
        # Python 2's BZ2File has no text mode: strip any 't' flags.
        args = list(args)
        if args:
            args[0] = "".join([c for c in args[0] if c != "t"])
        if "mode" in kwargs:
            kwargs["mode"] = "".join([c for c in kwargs["mode"] if c != "t"])
        return bz2.BZ2File(filename, *args, **kwargs)
    if ext in (".GZ", ".Z"):
        return gzip.open(filename, *args, **kwargs)
    return io.open(filename, *args, **kwargs)
def make_available(self):
    """Ensure self.filename exists locally: fetch the .bz2 archive if it
    is missing, then decompress it to self.filename."""
    bz2filename = "{}.bz2".format(self.filename)
    if not os.path.isfile(bz2filename):
        download_and_save_file(self.url, bz2filename)
    with bz2.open(bz2filename, 'r') as f:
        # The archived payload is plain ASCII text.
        data = f.read().decode("ascii")
    save_file(data, self.filename)
def _open(self, filename):
    """
    Open the input file. Set self._fp to point to it. Read the first line
    of parameters.

    @param filename: A C{str} filename containing JSON BLAST records.
    @raise ValueError: if the first line of the file isn't valid JSON, if
        the input file is empty, or if the JSON does not contain an
        'application' key.
    """
    if filename.endswith('.bz2'):
        # Python 2's BZ2File lacks text mode / encoding support.
        self._fp = (bz2.open(filename, mode='rt', encoding='UTF-8')
                    if six.PY3 else bz2.BZ2File(filename))
    else:
        self._fp = open(filename)
    line = self._fp.readline()
    if not line:
        raise ValueError('JSON file %r was empty.' % self._filename)
    try:
        # Drop the trailing newline before parsing.
        self.params = loads(line[:-1])
    except ValueError as e:
        raise ValueError(
            'Could not convert first line of %r to JSON (%s). '
            'Line is %r.' % (self._filename, e, line[:-1]))
    else:
        if 'application' not in self.params:
            raise ValueError(
                '%r appears to be an old JSON file with no BLAST global '
                'parameters. Please re-run convert-blast-xml-to-json.py '
                'to convert it to the newest format.' % self._filename)
def zopen(filename, *args, **kwargs):
    """
    This function wraps around the bz2, gzip and standard python's open
    function to deal intelligently with bzipped, gzipped or standard text
    files.

    Args:
        filename (str): filename
        \*args: Standard args for python open(..). E.g., 'r' for read, 'w'
            for write.
        \*\*kwargs: Standard kwargs for python open(..).

    Returns:
        File-like object. Supports with context.
    """
    suffix = filename.split(".")[-1].upper()
    if suffix == "BZ2":
        if PY_VERSION[0] >= 3:
            return bz2.open(filename, *args, **kwargs)
        # Python 2's BZ2File has no text mode: strip any 't' flags.
        args = list(args)
        if args:
            args[0] = "".join([c for c in args[0] if c != "t"])
        if "mode" in kwargs:
            kwargs["mode"] = "".join([c for c in kwargs["mode"] if c != "t"])
        return bz2.BZ2File(filename, *args, **kwargs)
    if suffix in ("GZ", "Z"):
        return gzip.open(filename, *args, **kwargs)
    return open(filename, *args, **kwargs)
def get_uncompressed_stream(input_stream, compression="auto"):
    """
    Returns a file-like object (aka stream) providing an uncompressed
    version of the content read on the input stream provided.

    :param input_stream: The file-like object providing compressed data.
    :param compression: The compression type. Specify "auto" to let the
        function guess it out of the associated filename (the input_stream
        needs to have a name attribute, otherwise a ValueError is raised).
    :type compression: str
    """
    if compression == "auto":
        # Guessing needs a filename to inspect.
        if not hasattr(input_stream, 'name'):
            raise ValueError("Can't retrieve a name out of %r" % input_stream)
        compression = guess_compression_method(input_stream.name)
    if compression is None:
        return input_stream
    if compression == "gzip":
        import gzip
        return gzip.open(filename=input_stream, mode="rb")
    if compression == "bzip2":
        import bz2
        return bz2.open(filename=input_stream, mode="rb")
    if compression == "xz":
        import lzma
        return lzma.open(filename=input_stream, mode="rb")
    raise NotImplementedError(
        "Unknown compression method: %r" % compression)
def __enter__(self):
    """Context-manager entry: open self.f_name into self.handle and
    return self.

    The file is opened in text mode with the locale's preferred encoding
    and surrogateescape error handling.  gzip/bz2 files are only opened
    when both the extension AND the magic bytes match; on any validation
    failure the problem is logged and self is returned with self.handle
    left untouched.
    """
    if self.f_name is None:
        self.logger.error('File name cannot be empty.')
    elif self.pat_archive.search(self.f_name):
        # NOTE(review): pat_archive presumably matches archive extensions
        # this reader refuses to handle — confirm against its definition.
        self.logger.warning('File \'{f}\' not a supported extension.'.format(f=self.f_name))
    elif not os.access(self.f_name, os.R_OK):
        self.logger.warning('File \'{f}\'cannot be read.'.format(f=self.f_name))
    else:
        f_mode = 'rt'
        f_codec = locale.getpreferredencoding(False)  # Or 'UTF-8'
        f_err = 'surrogateescape'  # Or 'ignore'
        try:
            if self.f_name.endswith('.gz') or self.f_name.endswith('.gzip'):
                # Verify the gzip magic number (1f 8b 08) before trusting
                # the extension.
                with open(self.f_name, 'rb') as byte_handle:
                    if bytearray.fromhex('1f8b08') in byte_handle.read(3):
                        self.handle = gzip.open(self.f_name, mode=f_mode, encoding=f_codec, errors=f_err)
            elif self.f_name.endswith('.bz') or self.f_name.endswith('.bz2'):
                # Verify the bzip2 magic number ('BZh' == 42 5a 68).
                with open(self.f_name, 'rb') as byte_handle:
                    if bytearray.fromhex('425a68') in byte_handle.read(3):
                        self.handle = bz2.open(self.f_name, mode=f_mode, encoding=f_codec, errors=f_err)
            # elif self.f_name.endswith('.lzma') or self.f_name.endswith('.lzma'):
            #     with open(self.f_name, 'rb') as byte_handle:
            #         if bytearray.fromhex('5d0000') in byte_handle.read(3):
            #             self.handle = lzma.open(self.f_name, mode=f_mode, encoding=f_codec, errors=f_err)
            else:
                self.handle = open(self.f_name, mode=f_mode, encoding=f_codec, errors=f_err)
            return self
        except IOError:
            self.logger.error('Exception opening \'{f}\'.'.format(f=self.f_name))
    # Fallback: validation failed or an IOError occurred.
    return self
def load_model(self):
    """Load the pre-trained model pickle from its bz2 archive.

    When the file is missing: return empty dicts if training from
    scratch, otherwise report a download hint via error().
    """
    model_path = self.get_filename(absolute=True)
    if not os.path.exists(model_path):
        if args.train:
            # No snapshot yet — start with empty parameter dictionaries.
            return {}, {}
        error("Model file with pre-trained convolution layers not found. Download it here...",
              "https://github.com/alexjc/neural-enhance/releases/download/v%s/%s"%(__version__, self.get_filename()))
    print(' - Loaded file `{}` with trained model.'.format(self.get_filename()))
    return pickle.load(bz2.open(model_path, 'rb'))
def prepare_command_line(self):
    ''' Develops the Commandline to run FastQC in Galaxy '''
    # NOTE(review): this method mixes `self.opts` and a bare `opts` —
    # presumably `opts` is a module-level alias of the same object; verify.
    # NOTE(review): the `ur'...'` literal below is Python-2-only syntax and
    # is a SyntaxError on Python 3 — this wrapper only runs under Python 2.
    # Check whether a given file compression format is valid
    # This prevents uncompression of already uncompressed files
    # infname = self.opts.inputfilename
    infname = self.opts.input
    ### http://dev.list.galaxyproject.org/FastQC-wrapper-not-seeing-files-at-gzipped-td4666363.html
    linf = infname.lower()
    trimext = False
    # decompression at upload currently does NOT remove this now bogus ending - fastqc will barf
    # patched may 29 2013 until this is fixed properly
    if ( linf.endswith('.gz') or linf.endswith('.gzip') ):
        # If the first line can't be read as gzip, the extension is bogus.
        f = gzip.open(self.opts.input)
        try:
            f.readline()
        except:
            trimext = True
        f.close()
    elif linf.endswith('bz2'):
        f = bz2.open(self.opts.input,'rb')
        try:
            f.readline()
        except:
            trimext = True
        f.close()
    elif linf.endswith('.zip'):
        if not zipfile.is_zipfile(self.opts.input):
            trimext = True
    if trimext:
        # Confirm the file is at least readable as plain text before
        # stripping the misleading compression extension.
        f = open(self.opts.input)
        try:
            f.readline()
        except:
            raise Exception("Input file corruption, could not identify the filetype")
        infname = os.path.splitext(infname)[0]
    # Replace unwanted or problematic charaters in the input file name
    self.fastqinfilename = re.sub(ur'[^a-zA-Z0-9_\-\.]', '_', os.path.basename(infname))
    # check that the symbolic link gets a proper ending, fastqc seems to ignore the given format otherwise
    if 'fastq' in opts.informat:
        # with fastq the .ext is ignored, but when a format is actually passed it must comply with fastqc's
        # accepted formats..
        opts.informat = 'fastq'
    elif not self.fastqinfilename.endswith(opts.informat):
        self.fastqinfilename += '.%s' % opts.informat
    # Build the Commandline from the given parameters
    command_line = [opts.executable, '--outdir %s' % opts.outputdir]
    if opts.contaminants != None:
        command_line.append('--contaminants %s' % opts.contaminants)
    if opts.limits != None:
        command_line.append('--limits %s' % opts.limits)
    command_line.append('--quiet')
    command_line.append('--extract')  # to access the output text file
    command_line.append(self.fastqinfilename)
    command_line.append('-f %s' % opts.informat)
    command_line.append('-t ${GALAXY_SLOTS:-4}')
    self.command_line = ' '.join(command_line)
def __iter__(self):
    """Iterate over pages represented as lists of word indices."""
    with bz2.open(self._pages_path, 'rt') as pages:
        for page in pages:
            # One page per line; map every word through the vocabulary.
            yield [self.encode(word) for word in page.strip().split()]
def getSiteInfoFromBz2File(somebz2filename):
    """Scan a bz2-compressed MediaWiki dump for the wiki base URL and the
    namespace list.

    Returns (wikiurl, nses) where each nses entry is 'number#name'.
    NOTE(review): the function returns as soon as <base> is matched, so
    namespaces are only collected when they appear before <base> in the
    dump — behavior preserved as-is.
    """
    wikiurl = ''
    nses = []
    try:
        with bz2.open(somebz2filename, mode='rt') as fsource:
            innamespaces = False
            for line in fsource:
                if innamespaces:
                    if line.strip() == '</namespaces>':
                        break
                    # Lines look like: <namespace key="1" ...>Talk</namespace>
                    line = line.split('key="')[1]
                    num = line.split('"')[0]
                    if line.find('<') > 1:
                        nsname = line.split('>')[1].split('<')[0]
                        nses.append(num + '#' + nsname)
                    continue
                if line.strip() == '<namespaces>':
                    innamespaces = True
                    nses.append('0#main')
                    continue
                #b = myre1.split(line)
                matchobject = re.search(r'<base>(?P<url>.*?)</base>', line.strip())
                if matchobject:
                    #wikiurl = matchobject.group('url')
                    parts = urllib.parse.urlparse(matchobject.group('url'))
                    # Keep only scheme and host, then add a trailing slash.
                    wikiurl = urllib.parse.urlunparse(
                        (parts[0], parts[1], '', '', '', '')) + '/'
                    #wikiurl = urllib.parse.unquote(matchobject.group('url'))
                    print(wikiurl)
                    return wikiurl, nses
    except:
        raise
def open_zipped(infile, mode='r'):
    """return file handle of file regardless of compressed or not.

    also returns already opened files unchanged, text mode automatic for
    compatibility with python2.  '-' maps to stdin/stdout; .gz/.bz2 paths
    are opened through gzip/bz2.

    Raises ValueError for non-string, non-file arguments.
    """
    # return already open files
    if hasattr(infile, 'write'):
        return infile
    # make text mode automatic
    if len(mode) == 1:
        mode = mode + 't'
    # refuse to handle non-strings that aren't files.
    if not isinstance(infile, str):
        raise ValueError("i cannot open a filename that isn't a string.")
    # treat '-' appropriately
    if infile == '-':
        if 'w' in mode:
            return sys.stdout
        return sys.stdin
    # if possible open zipped files
    if infile.endswith('.gz'):
        return _gzip.open(infile, mode)
    if infile.endswith('.bz2'):
        if hasattr(_bz2, 'open'):
            return _bz2.open(infile, mode)
        # Bug fix: the fallback called `_bz2.bz2file`, which does not
        # exist; the Python 2 class is `BZ2File`.
        return _bz2.BZ2File(infile, mode)
    # fall back on regular open
    return open(infile, mode)
def read_bz2(filepath):
    '''
    This opens a bzip file, assuming that it contains a HathiTrust feature
    JSON, and extracts a list of pages.

    Returns (pagelist, successflag); on failure pagelist is empty and the
    flag names the stage that failed.
    '''
    successflag = 'success'
    try:
        with bz2.open(filepath, mode='rt', encoding='utf-8') as f:
            jsonstring = f.read()
    except:
        successflag = 'bzip2 failed'
        jsonstring = ''
    try:
        jobj = json.loads(jsonstring)
    except:
        successflag = 'json decoding failed'
        jobj = dict()
    try:
        pagelist = jobj['features']['pages']
    except:
        # Don't overwrite an earlier, more specific failure.
        if successflag == 'success':
            successflag = 'json format unexpected'
        pagelist = []
    return pagelist, successflag
def _read_xml(self, xml_file):
    """Parse the bz2-compressed, Latin-1-encoded XML file and return the
    votings element tree (also persisted to the Django database by the
    surrounding workflow)."""
    with bz2.open(xml_file, mode='rt', encoding="iso-8859-1") as f:
        tree = etree.fromstring(f.read())
    return tree
def interlanguage_mapping(interlang_path, ok_concepts):
    """Build a mapping from DBpedia subject URLs to their cross-language
    target URLs, restricted to concepts whose prefix is in *ok_concepts*
    and skipping media/disambiguation senses."""
    quads = parse_nquads(bz2.open(str(interlang_path), 'rt'))
    mapping = {}
    skip_senses = ('album', 'film', 'series', 'disambiguation', 'song', 'band')
    for subj, values in itertools.groupby(quads, itemgetter(0)):
        subj_url = subj['url']
        subj_concept = translate_dbpedia_url(subj_url)
        pieces = split_uri(subj_concept)
        if len(pieces) >= 6:
            sense = pieces[5]
            # Skip works/media senses rather than real concepts.
            if any(tag in sense for tag in skip_senses):
                continue
        if uri_prefix(subj_concept) not in ok_concepts:
            continue
        targets = [subj_url]
        for _subj, _pred, obj, _graph in values:
            url = obj['url']
            if 'www.wikidata.org' in url:
                continue
            if url.startswith('http://wikidata.dbpedia.org/'):
                wikidata_id = resource_name(url)
                # Return early when we see a high-numbered Wikidata ID
                if int(wikidata_id[1:]) >= 1000000:
                    return mapping
            targets.append(url)
        mapping[subj_url] = targets
    return mapping
def from_file(self, filename):
    """
    Read data from the file and return an intensity object with that data.

    Each CSV row is (bin_left, bin_right, count_ch0, count_ch1, ...);
    time bins go to self.times and per-channel count arrays to self[ch].
    Falls back on `<filename>.bz2` when the plain file is missing.
    """
    self.filename = filename
    if not os.path.exists(filename):
        bz2_name = "{}.bz2".format(filename)
        if os.path.exists(bz2_name):
            filename = bz2_name
    opener = (lambda x: bz2.open(x, "rt")) if filename.endswith("bz2") else open
    raw_counts = []
    with opener(filename) as stream_in:
        for row in csv.reader(stream_in):
            self.times.append((int(row[0]), int(row[1])))
            raw_counts.append(tuple(map(int, row[2:])))
    # Transpose rows-of-channels into channels-of-rows.
    for channel, counts in enumerate(numpy.transpose(raw_counts)):
        self[channel] = counts
    return self
def main():
    """Filter FASTQ reads, printing only those at least -l bases long."""
    usage = 'usage: %prog [options] <fastq_file>'
    parser = OptionParser(usage)
    parser.add_option('-l', dest='length_min', default=None, type='int',
                      help='Minimum read length')
    (options, args) = parser.parse_args()
    if len(args) != 1:
        # parser.error() exits the program.
        parser.error('Must provide FASTQ file')
    fastq_file = args[0]
    # Transparently handle gzip/bzip2-compressed input.
    if fastq_file.endswith('.gz'):
        fastq_in = gzip.open(fastq_file, 'rt')
    elif fastq_file.endswith('.bz2'):
        fastq_in = bz2.open(fastq_file, 'rt')
    else:
        fastq_in = open(fastq_file)
    header = fastq_in.readline()
    while header:
        seq = fastq_in.readline()
        mid = fastq_in.readline()
        qual = fastq_in.readline()
        # Bug fix: previously nothing was printed when -l was omitted
        # (the print sat inside `if options.length_min is not None`);
        # with no minimum every read should pass through.
        # len(seq) - 1 ignores the trailing newline.
        if options.length_min is None or len(seq) - 1 >= options.length_min:
            print('%s%s%s%s' % (header, seq, mid, qual), end='')
        header = fastq_in.readline()
    fastq_in.close()
def _open(self, filename):
    """
    Open the input file. Set self._fp to point to it. Read the first line
    of parameters.

    @param filename: A C{str} filename containing JSON DIAMOND records.
    @raise ValueError: if the first line of the file isn't valid JSON or
        if the input file is empty.
    """
    if filename.endswith('.bz2'):
        # Python 2's BZ2File lacks text mode / encoding support.
        self._fp = (bz2.open(filename, mode='rt', encoding='UTF-8')
                    if six.PY3 else bz2.BZ2File(filename))
    else:
        self._fp = open(filename)
    line = self._fp.readline()
    if not line:
        raise ValueError('JSON file %r was empty.' % self._filename)
    try:
        # Drop the trailing newline before parsing.
        self.params = loads(line[:-1])
    except ValueError as e:
        raise ValueError(
            'Could not convert first line of %r to JSON (%s). '
            'Line is %r.' % (self._filename, e, line[:-1]))
def decompress(zip_name, target_directory):
    """
    Decompresses the provided archive to the target directory. The
    following file extensions are supported:

    * zip
    * bz2
    * gz
    * tar
    * tar.gz
    * tgz
    * tar.bz2

    The decompression method is chosen based on the file extension.

    :param zip_name: The full path name to the file that should be
        decompressed.
    :param target_directory: The directory to which files should be
        decompressed. May or may not exist prior to calling this function.
    """
    path_without_extension, extension = splitext(zip_name)
    filename = basename(path_without_extension)
    # Bug fix: os.path.splitext("x.tar.gz") yields ".gz", so the
    # ".tar.gz"/".tar.bz2" entries in the list below could never match and
    # such archives were merely gunzipped/bunzipped instead of untarred.
    # Detect the double extension explicitly.
    if extension in (".gz", ".bz2") and path_without_extension.endswith(".tar"):
        extension = ".tar" + extension
    if extension == ".zip":
        _do_decompress(target_directory, zipfile.ZipFile(zip_name))
    elif extension == ".bz2":
        _do_decompress_manually(target_directory, filename, bz2.open(zip_name))
    elif extension == ".gz":
        _do_decompress_manually(target_directory, filename, gzip.open(zip_name))
    elif extension in [".tar", ".tar.gz", ".tgz", ".tar.bz2"]:
        # tarfile transparently handles gzip/bzip2-compressed tars.
        _do_decompress(target_directory, tarfile.open(zip_name))
    else:
        raise RuntimeError("Unsupported file extension [%s]. Cannot decompress [%s]" % (extension, zip_name))
def open_compressed(filename, mode='rb'):
    """
    Open a file for reading with automatic decompression.

    Detects gzip, xz, and bz2 files via the file extension.

    Arguments
    ---------
    filename to open

    Returns
    -------
    open file object
    """
    suffix = filename.rsplit('.', 1)[-1]
    if suffix == 'gz':
        import gzip
        return gzip.open(filename, mode)
    if suffix == 'xz':
        import lzma
        return lzma.open(filename, mode)
    if suffix == 'bz2':
        import bz2
        return bz2.open(filename, mode)
    # No recognized compression extension: plain open.
    return open(filename, mode)
import bz2
from tqdm import tqdm

c = 0
# Normalize the bz2-compressed Wikipedia excerpt into a plain-text corpus:
# strip surrounding punctuation from each token and drop blank lines.
with bz2.open("section09/enwiki-20150112-400-r100-10576.txt.bz2", "rt") as f, \
        open("section09/corpus.txt", mode='wt') as g:
    # Perf fix: iterate the file lazily instead of f.readlines(), which
    # loaded the entire dump into memory before processing.
    for line in tqdm(f):
        tokens = line.split()
        # Strip leading/trailing punctuation from every token (tokens that
        # strip to '' are intentionally kept, as before).
        tokens = [tok.strip(".,!?;:()[]\'\"") for tok in tokens]
        if len(tokens) == 0:
            continue
        g.write(" ".join(tokens))
        g.write("\n")
# Convert captured profiler samples into a "time;pc0" CSV time series.
# NOTE(review): `header`, `samples`, `sampleCount`, `stackTraces` and
# `args` are defined earlier in the script, outside this fragment.

# header[3] is the sampling period in microseconds — TODO confirm.
samplingPeriod = float(header[3]) / 1000000.0
samplingFrequency = 1 / samplingPeriod
if args.time:
    # Re-derive the period so the profile spans the user-supplied duration.
    samplingPeriod = args.time / len(samples)
    print(
        f'Adjusting profile frequency {samplingFrequency:.2f} Hz to {1 / samplingPeriod:.2f} Hz'
    )
    samplingFrequency = 1 / samplingPeriod
print(
    f"Extracted {sampleCount} samples taken at {samplingFrequency:.2f} Hz (ignored {stackTraces} stack traces) for {len(samples) * samplingPeriod:.2f}s time"
)
# Write bzip2-compressed output when the target name asks for it.
if args.output.endswith(".bz2"):
    csvFile = bz2.open(args.output, "wt")
else:
    csvFile = open(args.output, "w")
csvFile.write('time;pc0\n')
runningTime = 0.0
for sample in samples:
    # Each row is the cumulative timestamp plus the sampled program counter.
    runningTime += samplingPeriod
    csvFile.write(f'{runningTime:.16f};{sample}\n')
print(f"Wrote to {args.output}")
csvFile.close()
# NOTE(review): the body of this `if` continues beyond the visible chunk.
if (not args.vmmap):
def __iter__(self):
    """Yield each line of every bz2 file in the corpus directory as a
    list of space-normalized character tokens."""
    for fname in os.listdir(self.dirname):
        print("processing~ '{}'".format(fname))
        path = os.path.join(self.dirname, fname)
        for line in bz2.open(path, "rt"):
            yield sent_to_spacing_chars(line.strip()).split(splitc)
def __init__(self, volumepath, volumeid):
    '''Initializes a LoadedVolume by reading wordcounts from a json file.
    By default it reads all the pages. But if skip-front and skip-back
    are set to positive values, it will skip n pages.'''

    # Read the whole (optionally bz2-compressed) JSON feature file.
    if volumepath.endswith('bz2'):
        with bz2.open(volumepath, mode='rt', encoding='utf-8') as f:
            thestring = f.read()
    else:
        with open(volumepath, encoding='utf-8') as f:
            thestring = f.read()
    thejson = json.loads(thestring)
    # NOTE(review): the volumeid parameter is ignored; the id stored is
    # the one inside the JSON — confirm that is intended.
    self.volumeid = thejson['id']
    pagedata = thejson['features']['pages']
    self.numpages = len(pagedata)
    # Per-page Counters of normalized tokens.
    self.pagecounts = []
    # Volume-wide token counts and totals.
    self.totalcounts = Counter()
    self.totaltokens = 0
    self.bodytokens = 0
    chunktokens = 0
    typesinthischunk = set()
    # a set of types in the current 10k-word chunk; progress
    # toward which is tracked by chunktokens
    self.integerless_pages = 0
    self.skipped_pages = 0
    compromise_pg = 0
    capitalizedbodytokens = 0

    for i in range(self.numpages):
        thispagecounts = Counter()
        thisbodytokens = 0
        thisheadertokens = 0
        thispage = pagedata[i]

        # There are really two ways of numbering pages. They come in an order,
        # which gives them an inherent ordinality (this is the *first* page). But
        # they also have cardinal *labels* attached, in the "seq" field. These labels
        # are usually, but not necessarily, convertible to integers. (Usually "00000001",
        # but could be "notes.") *Usually* they are == to the ordinal number,
        # but again, not necessarily! The world is full of fun edge cases!
        # In this loop, i is the ordinal page number, and cardinal_page is the cardinal
        # label; its value will be -1 if it can't be converted to an integer.

        # compromise_pg skips pages that have no integer seq, but otherwise
        # proceeds ordinally
        try:
            cardinal_page = int(thispage['seq'])
        except:
            cardinal_page = -1

        if cardinal_page > 0:
            compromise_pg += 1
        elif cardinal_page < 0:
            self.integerless_pages += 1

        if cardinal_page >= 0:
            # Body tokens count toward bodytokens and the running chunk.
            bodywords = thispage['body']['tokenPosCount']
            for token, partsofspeech in bodywords.items():
                normaltokenlist = normalize_token(token)
                # Notice that we treat each word as a list, to permit
                # counting both parts of a hyphenated word.
                # But usually this will be a list of one.
                for normaltoken in normaltokenlist:
                    for part, count in partsofspeech.items():
                        thisbodytokens += count
                        chunktokens += count
                        thispagecounts[normaltoken] += count

            # Header tokens are kept distinct with a "#header" prefix.
            headerwords = thispage['header']['tokenPosCount']
            for token, partsofspeech in headerwords.items():
                normaltokenlist = normalize_token(token)
                for normaltoken in normaltokenlist:
                    normaltoken = "#header" + normaltoken
                    for part, count in partsofspeech.items():
                        thisheadertokens += count
                        thispagecounts[normaltoken] += count

            # You will notice that I treat footers (mostly) as part of the body
            # Footers are rare, and rarely interesting.
            footerwords = thispage['footer']['tokenPosCount']
            for token, partsofspeech in footerwords.items():
                normaltokenlist = normalize_token(token)
                for normaltoken in normaltokenlist:
                    for part, count in partsofspeech.items():
                        thisbodytokens += count
                        chunktokens += count
                        thispagecounts[normaltoken] += count

            # Fold this page's counts into the volume totals.
            self.pagecounts.append(thispagecounts)
            for key, value in thispagecounts.items():
                self.totalcounts[key] += value
            self.totaltokens += thisbodytokens
            self.totaltokens += thisheadertokens
            self.bodytokens += thisbodytokens
        else:
            # print(i, cardinal_page, compromise_pg)
            self.skipped_pages += 1
def get_tweet_gen(tweet_fpath: str) -> Iterator:
    """Yield the "tweet" payload of every JSON line in a bz2 file."""
    with bz2.open(tweet_fpath) as fbz:
        for raw in fbz:
            record = json.loads(raw)
            yield record["tweet"]
def parse_seq_pe(opts, bc_dict, Flowcell, Lane):
    """Fastq/a-parser for PE-reads"""
    # Demultiplex paired-end reads by dual barcodes, optionally tagging
    # reads with RG/ST/RN attributes and trimming the barcode bases.
    # NOTE(review): compressed inputs are opened in 'rb' but lines are
    # later joined/written as text — assumes a text-compatible stream;
    # confirm against the callers.
    if opts.reads1.endswith('.gz'):
        seq1_handle = gzip.open(opts.reads1, "rb")
        seq2_handle = gzip.open(opts.reads2, "rb")
    elif opts.reads1.endswith('.bz2'):
        seq1_handle = bz2.open(opts.reads1, "rb")
        seq2_handle = bz2.open(opts.reads2, "rb")
    else:
        try:
            seq1_handle = open(opts.reads1, "r")
            seq2_handle = open(opts.reads2, "r")
        except IOError:
            # Fall back on gzipped variants of the given paths.
            seq1_handle = gzip.open(opts.reads1 + '.gz', "rb")
            seq2_handle = gzip.open(opts.reads2 + '.gz', "rb")
            opts.reads1 += '.gz'
    if not opts.split:
        # Single combined output per read side, named by flowcell/lane.
        seq1_name = '%(code)s_%(Flowcell)s_s_%(lane)s_fastq.txt' % \
            ({'code': 'R1_%s' % opts.output.split('/')[-2], 'Flowcell': Flowcell, 'lane': Lane})
        seq2_name = '%(code)s_%(Flowcell)s_s_%(lane)s_fastq.txt' % \
            ({'code': 'R2_%s' % opts.output.split('/')[-2], 'Flowcell': Flowcell, 'lane': Lane})
        if opts.reads1.endswith('.gz'):
            seq1_name += '.gz'
            seq2_name += '.gz'
            seq1_out = gzip.open(os.path.join(opts.output, seq1_name), 'a')
            seq2_out = gzip.open(os.path.join(opts.output, seq2_name), 'a')
        else:
            seq1_out = open(os.path.join(opts.output, seq1_name), 'a')
            seq2_out = open(os.path.join(opts.output, seq2_name), 'a')
    # Reads whose barcodes cannot be matched go to the nomatch files.
    if opts.reads1.endswith('.gz'):
        nomatch1_out = gzip.open(opts.nomatch1, "w")
        nomatch2_out = gzip.open(opts.nomatch2, "w")
    else:
        nomatch1_out = open(opts.nomatch1, "w")
        nomatch2_out = open(opts.nomatch2, "w")
    seq = 0
    # Left/right barcode alphabets from the barcode dictionary keys.
    bc_set_left = set(k[0] for k in bc_dict.keys())
    bc_set_right = set(k[1] for k in bc_dict.keys())
    elements_1 = [entry.enz_remnant_R1 for entry in bc_dict.values()]
    elements_2 = [entry.enz_remnant_R2 for entry in bc_dict.values()]
    enz_sites_left = []
    enz_sites_right = []
    if opts.control_nucleotide:
        # Prefix each enzyme remnant with the C/T control nucleotide.
        for nt in ['C', 'T']:
            for element in elements_1[0]:
                if nt + element[0] not in enz_sites_left:
                    #implement search which includes control nucleotide
                    enz_sites_left += [nt + element]
            for element in elements_2[0]:
                if nt + element[0] not in enz_sites_right:
                    enz_sites_right += [nt + element]
    else:
        for element in elements_1[0]:
            if element[0] not in enz_sites_left:
                # implement search which includes control nucleotide
                enz_sites_left += [element]
        for element in elements_2[0]:
            if element[0] not in enz_sites_right:
                enz_sites_right += [element]
    # Maximum span to scan for a barcode: offset + barcode + enzyme site.
    max_bc_len_left = max(k[0][0] + len(k[0][1]) for k in bc_dict.keys()) + max(len(k) for k in enz_sites_left)
    max_bc_len_right = max(k[1][0] + len(k[1][1]) for k in bc_dict.keys()) + max(len(k) for k in enz_sites_right)
    left_read = [True]
    while left_read[0]:
        seq += 1
        # Pull one 4-line FASTQ record from each side.
        left_read = []
        right_read = []
        for i in range(4):
            try:
                left_read += [seq1_handle.readline()]
                right_read += [seq2_handle.readline()]
            except StopIteration:
                break
        # Fuzzy barcode match (levenshtein) on both mates.
        left_bc, wobble_left, left_start, control_left = levenshtein(left_read,
            bc_set_left, enz_sites_left, opts.mismatch, max_bc_len_left)
        right_bc, wobble_right, right_start, control_right = levenshtein(right_read,
            bc_set_right, enz_sites_right, opts.mismatch, max_bc_len_right)
        if left_bc and right_bc:
            #Put the correct sequence of the barcode
            try:
                bc_dict['%s_%s' % (left_bc, right_bc) + '_count'] += 1
            except KeyError:
                bc_dict['%s_%s' % (left_bc, right_bc) + '_count'] = 1
            if opts.addRG:
                #determine if read is watson or crick.
                try:
                    SM_id = bc_dict[((3, left_bc), (3, right_bc))].Sample
                except KeyError:
                    #This can only happen if the barcode is incorrectly read
                    try:
                        SM_id = bc_dict[((0, left_bc), (0, right_bc))].Sample
                    except KeyError:
                        continue
                #one control nucleotide should be converted the other not. If this succeeds than call read type (watson,crick)
                #based on left nucleotide.
                # NOTE(review): the next statement is garbled in the source
                # ("if this is if ...") and both branches assign the same
                # value — almost certainly corrupted text; left verbatim.
                if this is if control_left != control_right:
                    strand = control_left
                else:
                    strand = control_left
                RG_id = '%s_%s_%s' % (Flowcell, Lane, SM_id)
                if wobble_left == '':
                    wobble_left = 'NNN'
                if wobble_right == '':
                    wobble_right = 'NNN'
                wobble = wobble_left + "_" + wobble_right
                # Tag the header line with barcode / read-group / strand.
                left_read[0] = left_read[0].split(' ')[0].rstrip('\n') \
                    + '\tBC:Z:%s\tBC:Z:%s\tRG:Z:%s\tST:Z:%s\n' % (left_bc, right_bc, RG_id, strand)
                right_read[0] = right_read[0].split(' ')[0].rstrip('\n') \
                    + '\tBL:Z:%s\tBR:Z:%s\tRG:Z:%s\tST:Z:%s\n' % (left_bc, right_bc, RG_id, strand)
                if opts.control_nucleotide:
                    left_read[0] = left_read[0][:-1] + '\tRN:Z:%s\n' % wobble
                    right_read[0] = right_read[0][:-1] + '\tRN:Z:%s\n' % wobble
            else:
                id = left_read[0][:-1]
            if opts.delete:
                #+1 because of control nucleotide after barcode
                if opts.control_nucleotide:
                    control_NT = 'C'
                else:
                    control_NT = ''
                # Trim barcode (and control nucleotide) from seq + qual lines.
                left_read[1] = left_read[1][left_start + len(left_bc + control_NT):]
                left_read[3] = left_read[3][left_start + len(left_bc + control_NT):]
                right_read[1] = right_read[1][right_start + len(right_bc + control_NT):]
                right_read[3] = right_read[3][right_start + len(right_bc + control_NT):]
            if not opts.split:
                seq1_out.write(''.join(left_read))
                seq2_out.write(''.join(right_read))
            else:
                #If splitting is activated, compression takes too long, disable!
                output_location_1 = os.path.join(opts.output, "%s_%s_1.fastq" % (bc_dict[((3, left_bc), (3, right_bc))].Sample))
                output_location_2 = os.path.join(opts.output, "%s_%s_2.fastq" % (bc_dict[((3, left_bc), (3, right_bc))].Sample))
                output_handle_1 = open(output_location_1, 'a')
                output_handle_2 = open(output_location_2, 'a')
                output_handle_1.write(''.join(left_read))
                output_handle_2.write(''.join(right_read))
        else:
            #Barcode sequence was not recognized
            nomatch1_out.write(''.join(left_read))
            nomatch2_out.write(''.join(right_read))
    seq1_out.close()
    seq2_out.close()
    nomatch1_out.close()
    nomatch2_out.close()
    return bc_dict
def load(pklfile):
    """Deserialize and return the object stored in a bz2-compressed pickle file.

    @param pklfile: path to a ``.bz2`` file produced by ``pickle.dump``
    @return: the unpickled Python object
    """
    with bz2.open(pklfile, "rb") as handle:
        return pickle.load(handle)
def convert(filepath_or_fileobj, dbpath, table, headerspath_or_fileobj=None, compression=None, typespath_or_fileobj=None):
    """Load a (possibly compressed) CSV file into an SQLite table.

    @param filepath_or_fileobj: path to the CSV data, or an open file object
    @param dbpath: path of the SQLite database to write into
    @param table: destination table name (created if it does not exist)
    @param headerspath_or_fileobj: optional separate CSV source for the column
        headers; when omitted the first row of the data file is used
    @param compression: None, 'bz2' or 'gzip'; only honoured for a file *path*
    @param typespath_or_fileobj: optional separate CSV source for the column
        types; when omitted types are guessed from the data
    """
    if isinstance(filepath_or_fileobj, string_types):
        if compression is None:
            fo = open(filepath_or_fileobj, mode=read_mode)
        elif compression == 'bz2':
            try:
                fo = bz2.open(filepath_or_fileobj, mode=read_mode)
            except AttributeError:
                # Python 2's bz2 module has no open(); fall back to BZ2File.
                fo = bz2.BZ2File(filepath_or_fileobj, mode='r')
        elif compression == 'gzip':
            fo = gzip.open(filepath_or_fileobj, mode=read_mode)
        else:
            # BUG FIX: an unknown value previously fell through and caused a
            # confusing NameError on the first use of `fo`.
            raise ValueError("unsupported compression: %r" % (compression,))
    else:
        fo = filepath_or_fileobj

    # Sniff the CSV dialect from the first line.
    try:
        dialect = csv.Sniffer().sniff(fo.readline())
    except TypeError:
        # Binary-mode streams yield bytes; retry on the string form.
        dialect = csv.Sniffer().sniff(str(fo.readline()))
    fo.seek(0)

    # Get the headers: either from a dedicated source, or from the first
    # data row (in which case the stream is rewound afterwards).
    header_given = headerspath_or_fileobj is not None
    if header_given:
        if isinstance(headerspath_or_fileobj, string_types):
            ho = open(headerspath_or_fileobj, mode=read_mode)
        else:
            ho = headerspath_or_fileobj
        header_reader = csv.reader(ho, dialect)
        headers = [header.strip() for header in next(header_reader)]
        ho.close()
    else:
        reader = csv.reader(fo, dialect)
        headers = [header.strip() for header in next(reader)]
        fo.seek(0)

    # Get the column types, explicit or guessed from the data.
    if typespath_or_fileobj is not None:
        if isinstance(typespath_or_fileobj, string_types):
            to = open(typespath_or_fileobj, mode=read_mode)
        else:
            to = typespath_or_fileobj
        type_reader = csv.reader(to, dialect)
        types = [_type.strip() for _type in next(type_reader)]
        to.close()
    else:
        # guess types
        type_reader = csv.reader(fo, dialect)
        if not header_given:
            next(type_reader)
        types = _guess_types(type_reader, len(headers))
        fo.seek(0)

    # Now load the data.
    _columns = ','.join([
        '"%s" %s' % (header, _type)
        for (header, _type) in zip(headers, types)
    ])

    reader = csv.reader(fo, dialect)
    if not header_given:  # Skip the header
        next(reader)

    conn = sqlite3.connect(dbpath)
    # shz: fix error with non-ASCII input
    conn.text_factory = str
    c = conn.cursor()

    try:
        create_query = 'CREATE TABLE %s (%s)' % (table, _columns)
        c.execute(create_query)
    except sqlite3.OperationalError:
        # Table already exists (or the DDL is invalid); keep appending.
        # Previously a bare `except:` swallowed every exception type.
        pass

    _insert_tmpl = 'INSERT INTO %s VALUES (%s)' % (table, ','.join(
        ['?'] * len(headers)))
    line = 0
    for row in reader:
        line += 1
        if len(row) == 0:
            continue
        # we need to take out commas from int and floats for sqlite to
        # recognize them properly ...
        try:
            row = [
                None if x == ''
                else float(x.replace(',', '')) if y == 'real'
                else int(x) if y == 'integer'
                else x
                for (x, y) in zip(row, types)
            ]
            c.execute(_insert_tmpl, row)
        except ValueError as e:
            # BUG FIX: `x` and `y` are comprehension-local in Python 3, so the
            # old message ("Unable to convert value '%s' to type '%s'") raised
            # a NameError here instead of reporting the real problem.
            print("Unable to convert a value on line %d: %s" % (line, e),
                  file=sys.stderr)
        except Exception as e:
            print("Error on line %d: %s" % (line, e), file=sys.stderr)

    conn.commit()
    c.close()
    conn.close()  # BUG FIX: the connection was previously leaked
def _resolve_archive(self, filename, subpath=None):
    """Prepare *filename* for reading based on its file extension.

    Depending on the archive type this sets one of:
    ``self._file`` (a plain path to read), ``self._text`` (decompressed
    bytes held in memory) or ``self._result`` (an already-parsed workbook).
    *subpath* selects a single member inside a zip archive; a leading "/"
    is stripped from it first.
    """
    ext = os.path.splitext(filename)[1]
    if subpath and subpath[0] == "/":
        subpath = subpath[1:]

    if ext == ".zip":
        import zipfile
        zf = zipfile.ZipFile(filename)
        # MacOS is found guilty of adding extra files into the Zip archives
        # it creates. The files are hidden, and in the directory __MACOSX/.
        # We remove those files from the list, since they are not real user
        # files, and have an unknown binary format.
        zff = [
            name for name in zf.namelist()
            if not (name.startswith("__MACOSX/") or name.endswith("/"))
        ]
        if subpath:
            if subpath in zff:
                zff = [subpath]
            else:
                raise TValueError("File `%s` does not exist in archive "
                                  "`%s`" % (subpath, filename))
        if len(zff) > 1:
            warnings.warn(
                "Zip file %s contains multiple compressed "
                "files: %r. Only the first of them will be used."
                % (filename, zff),
                category=FreadWarning)
        if len(zff) == 0:
            raise TValueError("Zip file %s is empty" % filename)
        if self._verbose:
            self._logger.debug("Extracting %s to temporary directory %s"
                               % (filename, self.tempdir))
        # The extracted member is recorded in self._tempfiles, presumably so
        # it can be cleaned up later -- confirm against the class destructor.
        self._tempfiles.append(zf.extract(zff[0], path=self.tempdir))
        self._file = self._tempfiles[-1]

    elif ext == ".gz":
        import gzip
        # NOTE(review): unlike the .bz2/.xz branches below, this GzipFile is
        # never explicitly closed -- confirm whether that is intentional.
        zf = gzip.GzipFile(filename, mode="rb")
        if self._verbose:
            self._logger.debug("Extracting %s into memory" % filename)
        self._text = zf.read()
        if self._verbose:
            self._logger.debug("Extracted: size = %d" % len(self._text))

    elif ext == ".bz2":
        import bz2
        with bz2.open(filename, mode="rb") as zf:
            if self._verbose:
                self._logger.debug("Extracting %s into memory" % filename)
            self._text = zf.read()
        if self._verbose:
            self._logger.debug("Extracted: size = %d" % len(self._text))

    elif ext == ".xz":
        import lzma
        with lzma.open(filename, mode="rb") as zf:
            if self._verbose:
                self._logger.debug("Extracting %s into memory" % filename)
            self._text = zf.read()
        if self._verbose:
            self._logger.debug("Extracted: size = %d" % len(self._text))

    elif ext == ".xlsx" or ext == ".xls":
        # Spreadsheets are parsed immediately rather than staged as text.
        self._result = read_xls_workbook(filename, subpath)

    else:
        # Not an archive: read the file as-is.
        self._file = filename
# Сжатие с помощью gzip
import gzip
with gzip.open('somefile.gz', 'rt') as f:
    text = f.read()

# Сжатие с помощью bz2
import bz2
with bz2.open('somefile.bz2', 'rt') as f:
    text = f.read()

Как показано выше, весь ввод и вывод будет использовать текст и проводить кодирование/декодирование в Unicode. Если же вы хотите работать с бинарными данными, используйте файловый режим rb или wb.

При записи сжатых данных с помощью необязательного именованного аргумента compresslevel может быть установлен уровень компрессии. Например:

with gzip.open('somefile.gz', 'wt', compresslevel=5) as f:
    f.write(text)

Уровень по умолчанию – это 9, то есть наивысший. Более низкие уровни увеличивают скорость, но снижают степень сжатия данных.
def do_bz2_open(user_input, size=0):
    """Best-effort read of up to *size* bytes from a bz2-compressed file.

    @param user_input: path (or file object) to open with ``bz2.open``
    @param size: number of decompressed bytes to read (0 reads nothing)
    @return: the decompressed bytes, or None on any failure
    """
    try:
        bz2file = bz2.open(user_input)
        try:
            return bz2file.read(size)
        finally:
            bz2file.close()
    except Exception:
        # Deliberately swallow all errors: callers treat None as "unreadable".
        return None
def threadfun(startf, endf, pid):
    """Worker: compute metapath-probability feature vectors for the graphml
    files at indices [startf:endf) and pickle each result, bz2-compressed,
    under ``features/``.

    @param startf: index of the first graph file (inclusive) for this worker
    @param endf: index one past the last graph file for this worker
    @param pid: logical worker id, used only in log messages
    """
    ## Individual and Co-Occurence Counts
    print("Started Process : {} , PID :{}, Startf:{} Endf: {}".format(
        pid, os.getpid(), startf, endf))
    print("Loading Individual count files..")
    # Unigram frequency tables, one per node category.
    with open('CNG_count.pickle', 'rb') as handle:
        CNG_count = pickle.load(handle)
    with open('CNGG_count.pickle', 'rb') as handle:
        CNGG_count = pickle.load(handle)
    with open('Word_count.pickle', 'rb') as handle:
        Word_count = pickle.load(handle)
    with open('Lemma_count.pickle', 'rb') as handle:
        Lemma_count = pickle.load(handle)

    def from_dict(Type_1, Type_2):
        # Load the co-occurrence table stored as '<Type_1>|<Type_2>.json'.
        with open(Type_1 + '|' + Type_2 + '.json', 'r') as fp:
            d = json.load(fp)
        return d

    print("Loading Co-Occurence count files...")
    # Used as the add-one-smoothing denominator offset below.
    CNG_Distinct = len(CNG_count.keys())
    # Pairwise co-occurrence tables, keyed by the concatenated type names.
    graph = {
        'LemmaLemma': from_dict('Lemma', 'Lemma'),
        'LemmaWord': from_dict('Lemma', 'Word'),
        'LemmaCNG': from_dict('Lemma', 'CNG'),
        'LemmaCNG_Group': from_dict('Lemma', 'CNG_Group'),
        'WordLemma': from_dict('Word', 'Lemma'),
        'WordWord': from_dict('Word', 'Word'),
        'WordCNG': from_dict('Word', 'CNG'),
        'WordCNG_Group': from_dict('Word', 'CNG_Group'),
        'CNGLemma': from_dict('CNG', 'Lemma'),
        'CNGWord': from_dict('CNG', 'Word'),
        'CNGCNG': from_dict('CNG', 'CNG'),
        'CNGCNG_Group': from_dict('CNG', 'CNG_Group'),
        'CNG_GroupLemma': from_dict('CNG_Group', 'Lemma'),
        'CNG_GroupWord': from_dict('CNG_Group', 'Word'),
        'CNG_GroupCNG': from_dict('CNG_Group', 'CNG'),
        'CNG_GroupCNG_Group': from_dict('CNG_Group', 'CNG_Group')
    }
    savedir = 'features/'
    ## Reading Metapaths
    # metapaths = []
    # with open('feature_ranklist_BM2_t2.txt','r') as file:
    #     rd = file.readlines()
    #     for row in rd:
    #         metapaths.append(row.split(',')[1])
    # print(len(metapaths))
    # Reading Metapaths
    df = pd.read_csv("featureStats.csv")
    # Keep only the metapaths flagged as selected in this feature set.
    metapaths = list(df[df["p2_4K_bigram_mir"] == 1]['FeatureName'])
    print(len(metapaths))

    ##Some utility functions
    def checktype(el):
        # Classify a metapath element: a (possibly negative) integer string
        # is a CNG id; the single letters C/T/L are placeholders resolved to
        # edge-endpoint attributes below; anything else is a CNG group name.
        if (el.lstrip("-").isdigit()):
            return "CNG"
        elif (el == 'C'):
            return 'C'
        elif (el == 'T'):
            return "T"
        elif (el == 'L'):
            return "L"
        else:
            return "CNG_Group"

    def denfun(el, eltype):
        # Unigram count for `el` in its category's table (0 when unseen).
        if (eltype == 'CNG' or eltype == 'C'):
            return CNG_count.get(int(el), 0)
        elif (eltype == 'L'):
            return Lemma_count.get(el, 0)
        elif (eltype == 'W'):
            return Word_count.get(el, 0)
        else:
            return CNGG_count.get(el, 0)

    def changetype(typ):
        # Map placeholder letters to the key names used in `graph`.
        if (typ == 'L'):
            return "Lemma"
        elif (typ == 'C'):
            return "CNG"
        elif (typ == 'T'):
            return "Word"
        else:
            return typ

    ##Actual Work Starts here
    gdir = 'After_graphml/'
    x = os.listdir(gdir)
    x.sort()
    fc = 0  # count of files processed so far (log counter only)
    print("Started Iterating over files :{} - {}".format(startf, endf))
    for gfile in x[startf:endf]:  ##iterating over 119k files
        try:
            G = read_graphml(gdir + gfile)
            # cur[s][e] will hold the feature vector for edge (s, e);
            # initialised to 0 for every node pair.
            cur = []
            for i in range(1 + G.number_of_nodes()):
                cur.append([])
                for j in range(1 + G.number_of_nodes()):
                    cur[i].append(0)
            glemma = nx.get_node_attributes(G, 'lemma')
            gword = nx.get_node_attributes(G, 'word')
            gcng = nx.get_node_attributes(G, 'cng')
            ec = 0
            for snode, enode, d in G.edges_iter(data=True):  ##iterating over all edges
                # print(snode,enode)
                # One probability slot per metapath (1500 assumed -- must
                # match len(metapaths); confirm against featureStats.csv).
                ar = np.zeros(1500)
                r = 0
                c = 0
                w = 0
                l = 0
                g = 0
                o = 0
                for row in metapaths:  ##iterating over 1500 metapaths
                    row = row.split('*')
                    if (len(row) == 2):
                        node1 = row[0]
                        type1 = checktype(node1)
                        # Placeholders at the path start resolve to the
                        # *source* node's attributes.
                        if (type1 == 'T'):
                            node1 = glemma[snode] + '_' + str(gcng[snode])
                        elif (type1 == 'L'):
                            node1 = glemma[snode]
                        elif (type1 == 'C'):
                            node1 = gcng[snode]
                        den1 = denfun(node1, type1)
                        node2 = row[1]
                        type2 = checktype(node2)
                        # Placeholders at the path end resolve to the
                        # *target* node's attributes.
                        if (type2 == 'T'):
                            node2 = glemma[enode] + '_' + str(gcng[enode])
                        elif (type2 == 'L'):
                            node2 = glemma[enode]
                        elif (type2 == 'C'):
                            node2 = gcng[enode]
                        type1 = changetype(type1)
                        type2 = changetype(type2)
                        type12 = type1 + type2
                        num12 = graph[type12].get(
                            str(node1) + '|' + str(node2), 0)
                        # Add-one smoothed transition probability.
                        prob12 = (float(num12) + 1) / (den1 + CNG_Distinct)
                        prob = prob12
                    elif (len(row) == 3):
                        node1 = row[0]
                        type1 = checktype(node1)
                        if (type1 == 'T'):
                            node1 = glemma[snode] + '_' + str(gcng[snode])
                        elif (type1 == 'L'):
                            node1 = glemma[snode]
                        elif (type1 == 'C'):
                            node1 = gcng[snode]
                        den1 = denfun(node1, type1)
                        node2 = row[1]
                        type2 = checktype(node2)
                        den2 = denfun(node2, type2)
                        node3 = row[2]
                        type3 = checktype(node3)
                        if (type3 == 'T'):
                            node3 = glemma[enode] + '_' + str(gcng[enode])
                        elif (type3 == 'L'):
                            node3 = glemma[enode]
                        elif (type3 == 'C'):
                            node3 = gcng[enode]
                        type1 = changetype(type1)
                        type2 = changetype(type2)
                        type3 = changetype(type3)
                        type12 = type1 + type2
                        type23 = type2 + type3
                        num12 = graph[type12].get(
                            str(node1) + '|' + str(node2), 0)
                        num23 = graph[type23].get(
                            str(node2) + '|' + str(node3), 0)
                        prob12 = (float(num12) + 1) / (den1 + CNG_Distinct)
                        prob23 = (float(num23) + 1) / (den2 + CNG_Distinct)
                        # Path probability = product of smoothed hop probabilities.
                        prob = prob12 * prob23
                    elif (len(row) == 4):
                        node1 = row[0]
                        type1 = checktype(node1)
                        if (type1 == 'T'):
                            node1 = glemma[snode] + '_' + str(gcng[snode])
                        elif (type1 == 'L'):
                            node1 = glemma[snode]
                        elif (type1 == 'C'):
                            node1 = gcng[snode]
                        den1 = denfun(node1, type1)
                        node2 = row[1]
                        type2 = checktype(node2)
                        den2 = denfun(node2, type2)
                        node3 = row[2]
                        type3 = checktype(node3)
                        den3 = denfun(node3, type3)
                        node4 = row[3]
                        type4 = checktype(node4)
                        if (type4 == 'T'):
                            node4 = glemma[enode] + '_' + str(gcng[enode])
                        elif (type4 == 'L'):
                            node4 = glemma[enode]
                        elif (type4 == 'C'):
                            # NOTE(review): reads gcng[snode] although every
                            # other end-of-path branch reads the *enode*
                            # attributes -- looks like a copy/paste bug,
                            # confirm before relying on this feature.
                            node4 = gcng[snode]
                        den4 = denfun(node4, type4)
                        type1 = changetype(type1)
                        type2 = changetype(type2)
                        type3 = changetype(type3)
                        type4 = changetype(type4)
                        type12 = type1 + type2
                        type23 = type2 + type3
                        type34 = type3 + type4
                        num12 = graph[type12].get(
                            str(node1) + '|' + str(node2), 0)
                        num23 = graph[type23].get(
                            str(node2) + '|' + str(node3), 0)
                        num34 = graph[type34].get(
                            str(node3) + '|' + str(node4), 0)
                        prob12 = (float(num12) + 1) / (den1 + CNG_Distinct)
                        prob23 = (float(num23) + 1) / (den2 + CNG_Distinct)
                        prob34 = (float(num34) + 1) / (den3 + CNG_Distinct)
                        prob = prob12 * prob23 * prob34
                    else:
                        print("Invalid Metapath length")
                    ar[r] = prob
                    r += 1
                cur[int(snode)][int(enode)] = ar
            fc += 1
            print("File Number :{}; pid: {}".format(fc, pid))
            print("fine till here")
            # Persist this file's edge-feature matrix, bz2-compressed.
            with bz2.open(
                    str(savedir) + str(gfile.split(".graphml")[0]) + '.bz2',
                    'wb') as f:
                pickle.dump(cur, f)
            # with bz2.open(str(savedir)+str(gfile.split(".graphml")[0])+'.bz2', 'rb') as f:
            #     y = pickle.load(f)
        except Exception as e:
            # Best-effort batch job: log the failure and move on to the
            # next graph file rather than killing the whole worker.
            print(e)
            print("Error at file :{}".format(str(gfile)))
            continue
    print("All Done for pid :{}".format(pid))


# threadfun(0,1000,1)
def write_bz2(self, data):
    """Compress *data* with bz2 and write it to ``self.path``,
    replacing any existing file."""
    sink = bz2.open(self.path, 'wb')
    try:
        sink.write(data)
    finally:
        sink.close()
print('uvloop is not installed!') exit(1) if not cargs.stateless: # Logging starts here # Create directory for logs if it doesn't exist if not os.path.exists('logs'): os.mkdir('logs') # Compress logfiles that were left over from the last run os.chdir('logs') if not os.path.exists('old'): os.mkdir('old') for item in os.listdir('.'): if item.endswith('.log'): with bz2.open(item + '.bz2', 'w') as f: f.write(open(item, 'rb').read()) os.remove(item) for item in os.listdir('.'): if item.endswith('.gz') or item.endswith('.bz2'): os.rename(item, 'old/' + item) os.chdir('..') # Define a format now = str(datetime.datetime.now()).replace(' ', '_').replace(':', '-').split('.')[0] formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s') # Setting up loggers logger = logging.getLogger('liara')
def xopen(filename, mode='r'):
    """
    Replacement for the "open" function that can also open files that have
    been compressed with gzip, bzip2 or xz. If the filename is '-', standard
    output (mode 'w') or input (mode 'r') is returned. If the filename ends
    with .gz, the file is opened with a pipe to the gzip program. If that
    does not work, then gzip.open() is used (the gzip module is slower than
    the pipe to the gzip program). If the filename ends with .bz2, it's
    opened as a bz2.BZ2File. Otherwise, the regular open() is used.

    mode can be: 'rt', 'rb', 'at', 'ab', 'wt', or 'wb'
    Instead of 'rt', 'wt' and 'at', 'r', 'w' and 'a' can be used as
    abbreviations.

    In Python 2, the 't' and 'b' characters are ignored.

    Append mode ('a', 'at', 'ab') is unavailable with BZ2 compression and
    will raise an error.
    """
    # Normalize the short forms 'r'/'w'/'a' to explicit text mode.
    if mode in ('r', 'w', 'a'):
        mode += 't'
    if mode not in ('rt', 'rb', 'wt', 'wb', 'at', 'ab'):
        raise ValueError("mode '{0}' not supported".format(mode))
    if not _PY3:
        # Python 2 ignores the text/binary qualifier entirely.
        mode = mode[0]
    if not isinstance(filename, basestring):
        raise ValueError("the filename must be a string")

    # standard input and standard output handling
    if filename == '-':
        return dict(
            r=sys.stdin, rt=sys.stdin, rb=sys.stdin.buffer,
            w=sys.stdout, wt=sys.stdout, wb=sys.stdout.buffer)[mode]

    if filename.endswith('.bz2'):
        # `bz2` may have been set to None at import time if unavailable.
        if bz2 is None:
            raise ImportError(
                "Cannot open bz2 files: The bz2 module is not available")
        if _PY3:
            return bz2.open(filename, mode)
        else:
            if mode[0] == 'a':
                raise ValueError(
                    "mode '{0}' not supported with BZ2 compression".format(
                        mode))
            if sys.version_info[:2] <= (2, 6):
                # 2.6's BZ2File is not a context manager; use the wrapper.
                return ClosingBZ2File(filename, mode)
            else:
                return bz2.BZ2File(filename, mode)
    elif filename.endswith('.xz'):
        if lzma is None:
            raise ImportError(
                "Cannot open xz files: The lzma module is not available (use Python 3.3 or newer)"
            )
        return lzma.open(filename, mode)
    elif filename.endswith('.gz'):
        if _PY3:
            if 't' in mode:
                # gzip.open in Python 3.2 does not support modes 'rt' and 'wt''
                if sys.version_info > (3, 3):
                    return gzip.open(filename, mode)
                else:
                    return io.TextIOWrapper(gzip.open(filename, mode[0]))
            else:
                # Wrap binary streams in a buffer for faster small reads/writes.
                if 'r' in mode:
                    return io.BufferedReader(gzip.open(filename, mode))
                else:
                    return io.BufferedWriter(gzip.open(filename, mode))
        else:
            # rb/rt are equivalent in Py2
            if 'r' in mode:
                try:
                    return PipedGzipReader(filename)
                except OSError:
                    # gzip not installed
                    return buffered_reader(gzip.open(filename, mode))
            else:
                try:
                    return PipedGzipWriter(filename, mode)
                except OSError:
                    return buffered_writer(gzip.open(filename, mode))
    else:
        return open(filename, mode)
def read_bz2(self):
    """Return the fully decompressed contents of ``self.path`` as bytes."""
    source = bz2.open(self.path, 'rb')
    try:
        return source.read()
    finally:
        source.close()
def detect_archive_format_and_open(path):
    """Open *path* for text reading, transparently decompressing
    ``.bz2`` and ``.gz`` files based on the filename suffix."""
    openers = (
        (".bz2", bz2.open),
        (".gz", gzip.open),
    )
    for suffix, opener in openers:
        if path.endswith(suffix):
            return opener(path, mode='rt')
    return open(path)
def read(self):
    """Parse ``self.path`` as bz2-compressed JSON-lines and return the
    decoded objects, one list entry per input line."""
    records = []
    with bz2.open(self.path) as stream:
        for raw_line in stream.readlines():
            records.append(json.loads(raw_line))
    return records
# creating prbe_id list of EU countries and are hosting prb_ID_EU = pd.read_csv("m_AS_EU_hosting.csv") prb_id_list = list(prb_ID_EU["prb_id"]) #%% # asking for user to provide filenam to work with filename = input("enter the filename you want to work with") print("name of file you entered is", filename) bz2Filename = str(filename) # counting total number of lines and the time taken for reading the lines ## fname should be like "ping-2020-02-20T0000.bz2" starttimereadinglines = time.time() bz2File = bz2.open(bz2Filename, 'rt') count_nrlines = 0 for line in bz2File: count_nrlines += 1 if count_nrlines > 10000: ## todo acts as counter to check break print("Total number of lines is:", count_nrlines) # closinf file bz2File.close() # creating end time variable endtimereadingfiles = time.time() # printing total times for reading lines print("total times for reading ", str(count_nrlines), "lines is", endtimereadingfiles - starttimereadinglines, "in seconds") #%%
def open_files():
    '''
    Goes through the directory containing all the data files.

    For each monthly Reddit submissions dump listed in `files`, accumulates
    [title, log(score)] pairs per subreddit into the matching per-year JSON
    output file and into the shared master JSON file.
    '''
    #path = os.path.expanduser('/data/files.pushshift.io/reddit/submissions')
    os.chdir('/data/files.pushshift.io/reddit/submissions')
    #files = [f for f in os.listdir(path)]
    #issue with RS_2011-01.bz2 having some non unicode-32 characters.
    files = ['RS_2011-01.bz2']
    for i in files:
        year = i[3:7]  # e.g. 'RS_2011-01.bz2' -> '2011'
        # Only years with an existing per-year output file are handled
        # (matches the original explicit per-year branches).
        if year in ('2011', '2012', '2013', '2014', '2015',
                    '2016', '2017', '2018', '2019'):
            _process_dump(i, year)


def _open_dump(path):
    """Open a pushshift dump with the decompressor matching its extension.

    .bz2 -> bz2, .xz -> lzma (text mode), .gz -> gzip, otherwise plain open.
    """
    if path.endswith('.bz2'):
        return bz2.open(path, "r")
    if path.endswith('.xz'):
        return lzma.open(path, mode='rt')
    if path.endswith('.gz'):
        return gzip.open(path)
    return open(path)


def _process_dump(i, year):
    """Fold one monthly dump into the per-year and master JSON outputs.

    This replaces nine copy-pasted year branches that differed only in the
    output filename and the decompressor used.
    """
    date_path = "/home/bmountain/dm_project/output_%s.json" % year
    with open(date_path, "r+") as json_date_file:
        data = json.load(json_date_file)
    with open("/home/bmountain/dm_project/output_master.json", "r+") as json_master:
        master_data = json.load(json_master)

    with _open_dump(i) as content:
        print(datetime.datetime.now(), 'opening ' + i)
        for line in content:
            try:
                post = json.loads(line)
                sub = post.get("subreddit")
                if sub in subreddit_list:
                    if post.get("score") > 10:  # arbitrary threshold
                        log_normalized_score = math.log(post.get("score")) * 1.0
                        entry = [post.get("title"), log_normalized_score]
                        if sub in data:
                            data[sub].append(entry)
                        else:
                            data[sub] = [entry]
                        if sub in master_data:
                            master_data[sub].append(entry)
                        else:
                            master_data[sub] = [entry]
            except Exception:
                # Best-effort: malformed JSON lines or posts missing fields
                # are skipped, as in the original bare `except: pass`
                # (narrowed so Ctrl-C still interrupts the run).
                pass

    with open("/home/bmountain/dm_project/output_master.json", "w") as master_write:
        json.dump(master_data, master_write)
    with open(date_path, "w") as j_file:
        json.dump(data, j_file)
# Load model, feature extractor model = keras.applications.VGG16(weights='imagenet', include_top=True) targetSize = model.input_shape[1:3] print("Target size: %s x %s" % targetSize) feat_extractor = Model(inputs=model.input, outputs=model.get_layer("fc2").output) print("Extracting features from each image...") features = np.zeros((fileCount, 4096), dtype=np.float32) for i, fn in enumerate(files): im = image.load_img(fn, target_size=targetSize) x = image.img_to_array(im) x = np.expand_dims(x, axis=0) x = preprocess_input(x) feat = feat_extractor.predict(x)[0] features[i] = feat printProgress(i + 1, fileCount) print("Reducing feature vectors down to %s features..." % a.PCA_COMPONENTS) pca = PCA(n_components=a.PCA_COMPONENTS) pca.fit(features) pca_features = pca.transform(features) print("Saving features file %s..." % a.OUTPUT_FILE) makeDir(a.OUTPUT_FILE) pickle.dump(pca_features, bz2.open(a.OUTPUT_FILE, 'wb')) print("Done.")
#!/usr/bin/env python3 # encoding: utf-8 # # Copyright (c) 2015 Doug Hellmann All rights reserved. # """Write and read unicode data to a file. """ #end_pymotw_header import bz2 import os data = 'Character with an åccent.' with bz2.open('example.bz2', 'wt', encoding='utf-8') as output: output.write(data) with bz2.open('example.bz2', 'rt', encoding='utf-8') as input: print('Full file: {}'.format(input.read())) # Move to the beginning of the accented character. with bz2.open('example.bz2', 'rt', encoding='utf-8') as input: input.seek(18) print('One character: {}'.format(input.read(1))) # Move to the middle of the accented character. with bz2.open('example.bz2', 'rt', encoding='utf-8') as input: input.seek(19) try: print(input.read(1)) except UnicodeDecodeError:
def _open_bz2(filename, mode: str) -> IO: return bz2.open(filename, mode)
args = parser.parse_args() # Load MM corpus and dictionary corpus = load_mm_corpus(args.mm_fname) dict = gensim.corpora.Dictionary.load(args.dict_fname) with open(args.categories_fname, 'rb') as categories_file: categories = pickle.load(categories_file) prepared_query_funcs = {} for name, search_query in ALL_SEARCH_QUERIES.items(): prepared_query_funcs[name] = search_query(corpus, dict, categories) query_funcs = [ eval(query_func, prepared_query_funcs) for query_func in args.query_funcs ] logging.info('Exploring %s with functions %s', args.wiki_dump_fname, ', '.join(map(str, args.query_funcs))) with bz2.open(args.wiki_dump_fname, 'rt') as wiki_dump_file: with open(args.results_fname, 'w') as results_file: results = csv.writer(results_file) # Write header row results.writerow(['Title'] + args.query_funcs) for title, content, pageid in \ wikicorpus.extract_pages(wiki_dump_file, filter_namespaces=('0',)): results.writerow( [title] + [query_func(title, content) for query_func in query_funcs])
def file_type(f_name):
    """Open *f_name* for text reading, transparently decompressing when the
    name ends in ``.bz2``.

    @param f_name: path to a plain-text or bz2-compressed text file
    @return: an open text-mode file object; the caller is responsible
        for closing it
    """
    # str.endswith states the intent directly, instead of the original
    # slice comparison `f_name[-4:] == '.bz2'`.
    if f_name.endswith('.bz2'):
        return bz2.open(f_name, "rt")
    return open(f_name, "r")
#!/usr/bin/env python3 import os, sys, bz2, csv, re, json, sqlite3 import urllib.request import pprint myUA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:58.0) Gecko/20100101 Firefox/58.0' conn = sqlite3.connect('giga.authors.sqlite') conn.execute( 'CREATE TABLE PubDat (DOI TEXT, Title TEXT, Type TEXT, Authors TEXT, RefList TEXT)' ) with bz2.open('giga.tsv.bz2', 'rt', encoding='utf-8') as tsvin: tsvin = csv.reader(tsvin, delimiter='\t') for row in tsvin: if row[0] == 'Title' or len(row) == 0: continue print((len(row), row[0])) theurl = 'https://academic.oup.com/gigascience/article-lookup/doi/' + row[ 16] req = urllib.request.Request(theurl) req.add_header('Referer', 'https://academic.oup.com/gigascience/') req.add_header('User-Agent', myUA) with urllib.request.urlopen(req) as r: htm = r.read().decode('utf-8') it = iter(htm.split('\n')) data = {'strAuthors': 'NA', 'reflist': 'NA', 'tocSections': 'NA'} for line, secline in zip(it, it): if re.search('<script type="application\/ld\+json">', line): datAuthors = json.loads(secline) data['strAuthors'] = json.dumps(datAuthors['author'])
def open(cls,
         column_names: typing.List[str],
         file_path: typing.Optional[Path],
         who: str = "output",
         require_all_columns: bool = True,
         prohibit_extra_columns: bool = True,
         fill_missing_columns: bool = False,
         error_file: typing.TextIO = sys.stderr,
         header_error_action: ValidationAction = ValidationAction.EXIT,
         use_mgzip: bool = False,
         mgzip_threads: int = MGZIP_THREAD_COUNT_DEFAULT,
         gzip_in_parallel: bool = False,
         gzip_queue_size: int = GZIP_QUEUE_SIZE_DEFAULT,
         column_separator: str = KgtkFormat.COLUMN_SEPARATOR,
         mode: Mode = Mode.AUTO,
         output_format: typing.Optional[str] = None,
         output_column_names: typing.Optional[typing.List[str]] = None,
         old_column_names: typing.Optional[typing.List[str]] = None,
         new_column_names: typing.Optional[typing.List[str]] = None,
         verbose: bool = False,
         very_verbose: bool = False)->"KgtkWriter":
    """Open a KgtkWriter for the given destination.

    The destination is chosen from *file_path*:
      * ``None`` or ``"-"``            -> standard output
      * ``">NN"``                      -> already-open file descriptor NN
      * ``*.gz/.bz2/.xz/.lz4``         -> compressed file (format inferred
                                          from the suffix before the
                                          compression suffix, if any)
      * anything else                  -> plain file (format inferred from
                                          the suffix)

    @return: a configured KgtkWriter (built by cls._setup).
    @raise ValueError: if a compression suffix is matched but unhandled
        (defensive; cannot occur with the current suffix list).
    """

    # Every exit path funnels into cls._setup with the same long keyword
    # list; only the recorded path, the destination stream, and the
    # resolved output format vary, so centralize the call here.
    def _finish(fp: typing.Optional[Path],
                file_out: typing.TextIO,
                fmt: typing.Optional[str])->"KgtkWriter":
        return cls._setup(column_names=column_names,
                          file_path=fp,
                          who=who,
                          file_out=file_out,
                          require_all_columns=require_all_columns,
                          prohibit_extra_columns=prohibit_extra_columns,
                          fill_missing_columns=fill_missing_columns,
                          error_file=error_file,
                          header_error_action=header_error_action,
                          use_mgzip=use_mgzip,
                          mgzip_threads=mgzip_threads,
                          gzip_in_parallel=gzip_in_parallel,
                          gzip_queue_size=gzip_queue_size,
                          column_separator=column_separator,
                          mode=mode,
                          output_format=fmt,
                          output_column_names=output_column_names,
                          old_column_names=old_column_names,
                          new_column_names=new_column_names,
                          verbose=verbose,
                          very_verbose=very_verbose,
                          )

    # Maps a format suffix to the corresponding output format constant.
    format_for_suffix = {
        ".md": cls.OUTPUT_FORMAT_MD,
        ".csv": cls.OUTPUT_FORMAT_CSV,
        ".json": cls.OUTPUT_FORMAT_JSON,
        ".jsonl": cls.OUTPUT_FORMAT_JSONL,
    }

    # Case 1: standard output.
    if file_path is None or str(file_path) == "-":
        if verbose:
            print("KgtkWriter: writing stdout", file=error_file, flush=True)
        if output_format is None:
            output_format = cls.OUTPUT_FORMAT_DEFAULT
        return _finish(None, sys.stdout, output_format)

    # Case 2: an already-open file descriptor, spelled ">NN".
    if str(file_path).startswith(">"):
        fd: int = int(str(file_path)[1:])
        if verbose:
            print("%s: writing file descriptor %d" % (who, fd), file=error_file, flush=True)
        if output_format is None:
            output_format = cls.OUTPUT_FORMAT_DEFAULT
        return _finish(file_path, open(fd, "w"), output_format)

    if verbose:
        print("File_path.suffix: %s" % file_path.suffix, file=error_file, flush=True)

    # Case 3: a compressed file.
    if file_path.suffix in [".gz", ".bz2", ".xz", ".lz4"]:
        # TODO: find a better way to coerce typing.IO[Any] to typing.TextIO
        gzip_file: typing.TextIO
        if file_path.suffix == ".gz":
            if use_mgzip:
                if verbose:
                    print("KgtkWriter: writing gzip with %d threads: %s" % (mgzip_threads, str(file_path)), file=error_file, flush=True)
                import mgzip
                gzip_file = mgzip.open(str(file_path), mode="wt", thread=mgzip_threads) # type: ignore
            else:
                if verbose:
                    print("KgtkWriter: writing gzip %s" % str(file_path), file=error_file, flush=True)
                import gzip
                gzip_file = gzip.open(file_path, mode="wt") # type: ignore
        elif file_path.suffix == ".bz2":
            if verbose:
                print("KgtkWriter: writing bz2 %s" % str(file_path), file=error_file, flush=True)
            import bz2
            gzip_file = bz2.open(file_path, mode="wt") # type: ignore
        elif file_path.suffix == ".xz":
            if verbose:
                print("KgtkWriter: writing lzma %s" % str(file_path), file=error_file, flush=True)
            import lzma
            gzip_file = lzma.open(file_path, mode="wt") # type: ignore
        elif file_path.suffix == ".lz4":
            if verbose:
                print("KgtkWriter: writing lz4 %s" % str(file_path), file=error_file, flush=True)
            # BUGFIX: `import lz4` does not expose the `frame` submodule;
            # import it explicitly.
            import lz4.frame # type: ignore
            # BUGFIX: the original referenced the undefined name
            # `file_or_path`, which raised NameError on this path.
            gzip_file = lz4.frame.open(str(file_path), mode="wt") # type: ignore
        else:
            # Defensive: unreachable given the suffix check above.
            # TODO: throw a better exception.
            # (BUGFIX: message typo "suffiz" corrected.)
            raise ValueError("Unexpected file_path.suffix = '%s'" % file_path.suffix)

        # Infer the data format from the suffix *before* the compression
        # suffix, e.g. "out.csv.gz" -> CSV.
        if output_format is None:
            if len(file_path.suffixes) < 2:
                output_format = cls.OUTPUT_FORMAT_DEFAULT
            else:
                format_suffix: str = file_path.suffixes[-2]
                output_format = format_for_suffix.get(format_suffix, cls.OUTPUT_FORMAT_DEFAULT)
        return _finish(file_path, gzip_file, output_format)

    # Case 4: a plain (uncompressed) file.
    else:
        if output_format is None:
            output_format = format_for_suffix.get(file_path.suffix, cls.OUTPUT_FORMAT_DEFAULT)
        if verbose:
            print("KgtkWriter: writing file %s" % str(file_path), file=error_file, flush=True)
        return _finish(file_path, open(file_path, "w"), output_format)
def open_with_compression(filename, mode='r'):
    """
    Open *filename* as if it were a plain file, guessing any compression
    (``gz`` gzip, ``bz2`` bzip2, ``xz`` lzma) from its name.

    Either Python 3 or the ``backports.lzma`` module are required for
    ``xz``.

    Supported modes are:

    * 'r', 'rt', 'w', 'wt' for text mode read and write.
    * 'rb, 'wb' for binary read and write.

    Depending on the Python version, you may get errors trying to write
    the wrong string type to the file.

    Parameters
    ==========

    filename: str
        Path to the file to open, including any extensions that indicate
        the compression used.

    mode: str
        Mode to open the file, same as for builtin ``open``, e.g 'r', 'w'.

    Returns
    =======

    fd: file
        File-like object open with the specified mode.
    """

    if sys.version_info[0] > 2:
        # Compressed streams sometimes default to binary in Python 3, so
        # promote the bare modes to their explicit text equivalents.
        mode = {'r': 'rt', 'w': 'wt', 'a': 'at'}.get(mode, mode)
    else:
        # The version of gzip in Anaconda Python 2 on Windows forcibly
        # adds a 'b', so strip any 't' and let the string conversions
        # be carried out implicitly by Python.
        mode = mode.strip('t')

    _, compression = get_compression(filename)

    if compression == 'gz':
        import gzip
        return gzip.open(filename, mode=mode)

    if compression == 'bz2':
        import bz2
        if hasattr(bz2, 'open'):
            # Python 3 only
            return bz2.open(filename, mode=mode)
        # Python 2
        return bz2.BZ2File(filename, mode=mode)

    if compression == 'xz':
        try:
            from lzma import open as lzma_open
        except ImportError:
            from backports.lzma import open as lzma_open
        return lzma_open(filename, mode)

    # No (or unrecognized) compression: fall back to the plain builtin.
    return open(filename, mode)
def save(self, pklfile):
    """Serialize this object to *pklfile* as a bzip2-compressed pickle."""
    sink = bz2.open(pklfile, "wb")
    try:
        pickle.dump(self, sink)
    finally:
        sink.close()
    # (Tail of a helper defined above this chunk; indentation reconstructed.)
    # "Синоними" is Bulgarian for "Synonyms": if the entry text has a
    # synonyms section, extract and clean it, returning (word, synonyms).
    if "Синоними" in text:
        result = syns_list_re.search(text)
        if result and result.group(0).strip():
            syns = cleanup_syns_list(result.group(0).strip())
            return (word, syns)


# -- Top-level state for the streaming parse of the wiktionary dump --
waiting_ns = False     # inside a <page> element, waiting for its <ns>
waiting_text = False   # page is in namespace 0; its <text> is wanted
text = ''
title = None
all_count = 0
bg_count = 0
syn_count = 0

# Markers of a Bulgarian-language section in wiktionary markup.
lang_re = re.compile(r"\{\{-bg-\}\}|ЕЗИК\s*=\s*bg|ЕЗИК\s*=\s*български",
                     re.M | re.UNICODE)

with open('bg_wiktionary_syns.txt', 'w') as output_file:
    # Stream the compressed XML dump; iterparse keeps memory bounded.
    for (event, elem) in etree.iterparse(bz2.open("wiktionary.xml.bz2"),
                                         events=['start', 'end']):
        # Element tags carry a namespace prefix, so match on the suffix.
        if (event == 'start' and elem.tag[-4:] == 'page'):
            waiting_ns = True
            continue
        if (event == 'end' and elem.tag[-4:] == 'page'):
            # Page finished: reset per-page state.
            waiting_ns = False
            waiting_text = False
            title = None
            continue
        if (event == 'end' and waiting_ns and elem.tag[-5:] == 'title'):
            title = elem.text
            continue
        if (waiting_ns and event == 'end' and 'ns' == elem.tag[-2:]):
            # Namespace 0 is the main (article) namespace.
            if elem.text.strip() == '0':
                waiting_text = True
            continue