def univ_open(file_path, mode='r'):
    # If the file ends with ".gz" then open it through GZip
    if file_path.split('.')[-1].lower() == 'gz':
        from gzip import open as gzopen
        if mode in ('w', 'wb', 'w+', 'wb+'):
            return gzopen(file_path, mode, 6)
        else:
            return gzopen(file_path, mode)
    else:
        return open(file_path, mode)
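# Minimal usage sketch for univ_open above (file names are hypothetical): the same
# call transparently returns a gzip handle for ".gz" paths and a plain handle otherwise.
log_fh = univ_open('events.log.gz')   # routed through gzip.open
raw_fh = univ_open('events.log')      # routed through the builtin open
first_line = log_fh.readline()
log_fh.close()
raw_fh.close()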
def open_pdb(structure, verbose=True, try_web=True):
    '''Return an opened PDB file handle from STDIN, file, local PDB cache, or web'''
    # STDIN
    if "<open file '<stdin>', mode 'r' at" in str(structure):
        pdb_filehandle = structure
    # AS UNCOMPRESSED PDB FILE
    elif os.path.exists(structure) and is_binary_file(structure) == False:
        # file exists and is a text-based file
        pdb_filehandle = open(structure, 'r')
    # AS GZIPPED PDB FILE
    elif os.path.exists(structure) and is_binary_file(structure) == True:
        # file exists and is likely a gzipped file
        try:
            testopen = gzopen(structure, 'r')
            testopen.readline()
            testopen.close()
            pdb_filehandle = gzopen(structure, 'r')
        except IOError:
            if verbose:
                print 'Invalid structure file-type. Structure file must be a plain-text PDB file or a gzipped PDB file.'
            return
    # AS PDB FILE FROM LOCAL COPY OF THE PDB -OR- FROM THE WEB
    elif len(structure) == 4:
        pdb_storage_path = os.path.join(
            PDB_DATA_DIR, '%s/pdb%s.ent.gz' % (structure[1:3].lower(), structure.lower()))
        # local file
        if os.path.exists(pdb_storage_path):
            pdb_filehandle = gzopen(pdb_storage_path, 'r')
        # try the web
        elif try_web:
            try:
                pdb_filehandle = urlopen('http://www.rcsb.org/pdb/files/%s.pdb' % (structure.upper()))
            except HTTPError:
                if verbose:
                    print 'Invalid structure input: %s. Not found as local file, as PDB structure in %s, or on the web.' % (structure, PDB_DATA_DIR)
                return
        else:
            return
    else:
        if verbose:
            print 'Invalid structure input: %s. Not found as local file, and wrong number of characters for direct PDB reference.' % (structure)
        return
    return pdb_filehandle
def sample_reads(self):
    if file_exists(self.filenames['sampled_reads']):
        log.info(f"Will use existing {self.filenames['sampled_reads']}")
    else:
        if self.coverage == 0:
            os.symlink(os.path.abspath(self.readfile), self.filenames['sampled_reads'])
            log.info(f"Using all reads as coverage option is 0")
        else:
            log.info(
                f"Sampling {self.coverage} times coverage of {len(self)/1000000:.1f} Mb assembly "
                f"from >{self.minreadlength}bp reads in {self.readfile}"
            )
            with gzopen(self.readfile, 'rt') as all_reads, \
                 gzopen(self.filenames['sampled_reads'], 'wt', compresslevel=6) as sampled_reads, \
                 tqdm(total=self.coverage, leave=False) as pbar:
                sampled_bases = read_count = times_coverage = 0
                readset = []
                for title, sequence, quality in FastqGeneralIterator(all_reads):
                    if len(sequence) < self.minreadlength:
                        continue
                    readset.append(f"@{title}\n{sequence}\n+\n{quality}\n")
                    read_count += 1
                    sampled_bases += len(sequence)
                    new_times_coverage = round(sampled_bases / len(self))
                    if new_times_coverage > times_coverage:
                        print(''.join(readset), file=sampled_reads, end='')
                        readset = []
                        pbar.update()
                        times_coverage = new_times_coverage
                        if times_coverage == self.coverage:
                            break
                print(''.join(readset), file=sampled_reads, end='')
            log.info(
                f"Wrote {read_count} reads ({sampled_bases} bases, {times_coverage} times coverage) "
                f"to {self.filenames['sampled_reads']}"
            )
            if times_coverage < self.coverage:
                log.warning(
                    f"Only found {times_coverage} times coverage in reads longer than {self.minreadlength}, "
                    f"not {self.coverage} times; consider reducing minimum read length (-l)"
                )
def close_spider(self, spider):
    self.dump_data()
    # The end of JSON dicts
    f = gzopen(self.fname_crawled_items, "a", 6)
    f.write("\n}")
    f.close()
    f = gzopen(self.fname_web_graph, "ab", 6)
    f.write("\n}")
    f.close()
    self.gzip_dump_timer.cancel()
    self.gzip_dump_timer = None
    self.stop_gzip_timer = True
    # And execute it by hand synchronously to be sure
    # we finish dumping everything before closing
    self.dump_gzipped_contents()
def load_crawled_items(self, spider):
    self.fname_crawled_items = spider.export_results_filename.replace(".json", "") + ".json.gz"
    # The start of a JSON dict
    f = gzopen(self.fname_crawled_items, "ab", 6)
    f.write("{")
    f.close()
    # Not directly replacing the ".json" pattern so that if the pattern is not here,
    # we just append to fname!
    self.fname_web_graph = spider.export_results_filename.replace(".json", "") + "_graph.json.gz"
    # The start of a JSON dict
    f = gzopen(self.fname_web_graph, "ab", 6)
    f.write("{")
    f.close()
    self.crawled_items = {}
    self.nodes_edges = {}
def executaParalelo(arquivo, saida):
    # lock = th.Lock()
    with gzopen('/home/snoopy/base_de_dados/Base_original/logs-leticia.gz', 'rt') as baseDados:
        threads = []
        linha = baseDados.readline()
        while linha != "":
            for i in range(NUMERO_PROCESSOS_POR_VEZ):
                # read a line
                linha = baseDados.readline()
                # create a thread
                processo = th.Thread(target=leSubarquivos, args=(str(linha), saida))
                threads.append(processo)
                # start the thread
                processo.start()
            # wait for the n threads to finish
            for thread in threads:
                thread.join()
def cli_traversal(self):
    step = 0
    self.report.branch = None
    for chksum in self.climfobj.keys():
        yield self.report.setStep(step, len(self.climfobj))
        if chksum in self.srvmfobj:
            self.report.branch = 'match'
            # all files in the current packages are not changed.
            assert self.climfobj[chksum].rflist == self.srvmfobj[chksum].rflist
            self.report.incKeeps(len(self.climfobj[chksum].rflist))
            del self.srvmfobj[chksum]
            step = step + 1
        else:
            chksum0 = self.srvmfobj.find(chksum)
            if chksum0 is None:
                self.report.branch = 'discard'
                # orphan package in client, discard it.
                # remember the rflist, remove them if required.
                self.rflist.extend(self.climfobj[chksum].rflist)
                del self.climfobj[chksum]
            else:
                self.report.branch = 'patch'
                # cached patch found, use it.
                patchbody = self.urlpost('patch', chksum)
                rflist = self.climfobj[chksum].rflist
                self.patchflist(patchbody, rflist)
                del self.climfobj[chksum]
                self.climfobj[chksum0] = self.srvmfobj.pop(chksum0)
                self.climfobj[chksum0].clean_history()
            step = step + 1
    self.climfobj.save(gzopen(self.manifest, 'wb'))
    yield self.report.setStep(step, len(self.climfobj))
    self.report.branch = None
    yield None
def w2p_unpack(filename, path, delete_tar=True):
    if filename == 'welcome.w2p' and (
            not os.path.exists('welcome.w2p') or os.path.exists('NEWINSTALL')):
        try:
            w2p_pack('welcome.w2p', 'applications/welcome')
            os.unlink('NEWINSTALL')
        except:
            msg = "New installation: unable to create welcome.w2p file"
            sys.stderr.write(msg)
    filename = abspath(filename)
    path = abspath(path)
    if filename[-4:] == '.w2p' or filename[-3:] == '.gz':
        if filename[-4:] == '.w2p':
            tarname = filename[:-4] + '.tar'
        else:
            tarname = filename[:-3] + '.tar'
        fgzipped = gzopen(filename, 'rb')
        tarfile = open(tarname, 'wb')
        tarfile.write(fgzipped.read())
        tarfile.close()
        fgzipped.close()
    else:
        tarname = filename
    untar(tarname, path)
    if delete_tar:
        os.unlink(tarname)
def __init__(self, f_in, lemma=False):
    self.lemma = lemma
    self.current_line = None
    if f_in.endswith("gz"):
        self.source = gzopen(f_in, 'rt', encoding='latin-1')
    else:
        self.source = open(f_in, 'r', encoding='latin-1')
def main():
    r = requests.get('http://mobile.njit.edu/parking/data.php',
                     headers={'Referer': 'http://mobile.njit.edu/parking/',
                              'Origin': 'http://mobile.njit.edu/'})
    current_time = time.strftime('%Y-%m-%d_%H-%M-%S')
    if not r:
        print "[{}] Failed to connect".format(current_time)
        return
    else:
        print "[{}] Connected".format(current_time)
    with gzopen('/opt/parking/data/{}.json.gz'.format(current_time), 'wt') as o:
        o.write(r.text)
    decks = r.json['decks']
    with engine.connect() as db:
        for d in decks:
            deck = decks[d]['SiteName']
            available = int(decks[d]['Available'])
            occupied = int(decks[d]['Occupied'])
            total = int(decks[d]['Total'])
            db.execute(
                "INSERT INTO NJITParking (deck, available, occupied, total) VALUES (%s, %s, %s, %s);",
                (deck, available, occupied, total))
def w2p_pack(filename, path, compiled=False, filenames=None):
    """Packs a web2py application.

    Args:
        filename(str): path to the resulting archive
        path(str): path to the application
        compiled(bool): if `True` packs the compiled version
        filenames(list): adds filenames to the archive
    """
    filename = abspath(filename)
    path = abspath(path)
    tarname = filename + '.tar'
    if compiled:
        tar_compiled(tarname, path, r'^[\w.-]+$',
                     exclude_content_from=['cache', 'sessions', 'errors'])
    else:
        tar(tarname, path, r'^[\w.-]+$', filenames=filenames,
            exclude_content_from=['cache', 'sessions', 'errors'])
    with open(tarname, 'rb') as tarfp, gzopen(filename, 'wb') as gzfp:
        shutil.copyfileobj(tarfp, gzfp, 4194304)  # 4 MB buffer
    os.unlink(tarname)
def open_file_by_mimetype(filename, mode):
    """
    This function determines the compression MIME type of a file as gz, bz, or none,
    and returns an open file handle of the requested mode ('w', 'r', or 'a')
    """
    if mode != 'r' and mode != 'w' and mode != 'a':
        print("please specify a valid mode: w, r, a")
        return

    if guess_type(filename)[1] == 'gzip':
        try:
            fh = gzopen(filename, mode)
        except Exception as error:
            print("Error opening file ", filename, ": ", error)
            return
    elif guess_type(filename)[1] == 'bzip2':
        try:
            fh = bzopen(filename, mode)
        except Exception as error:
            print("Error opening file ", filename, ": ", error)
            return
    else:
        try:
            fh = open(filename, mode)
        except Exception as error:
            print("Error opening file ", filename, ": ", error)
            return

    return fh
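# Usage sketch for open_file_by_mimetype above, assuming the surrounding module
# imports guess_type from mimetypes, gzip.open as gzopen and bz2.open as bzopen
# (names taken from the function body); the file name here is hypothetical.
fh = open_file_by_mimetype('table.tsv.gz', 'r')   # dispatched to gzip.open
if fh is not None:
    header = fh.readline()
    fh.close()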
def create_wp_table(conn):
    """
    Creates the wp table from a sqlite connection. This is basically here for
    posterity, shouldn't be used.

    :param conn: a sqlite connection
    :type conn: sqlite3.Connection
    """
    print 'creating'
    cur = conn.cursor()
    cur.execute('''CREATE TABLE IF NOT EXISTS `titles` (title TEXT UNIQUE);''')
    print "Extracting/Inserting..."
    counter = 0
    for line in list(set(map(lambda x: preprocess(x.strip()),
                             gzopen('/'.join(os.path.realpath(__file__).split('/')[:-1])
                                    + '/enwiki-20131001-all-titles-in-ns0.gz')))):
        cur.execute("INSERT INTO `titles` (`title`) VALUES (?)", (line,))
        counter += 1
        if counter % 500 == 0:
            print counter
    print "Committing..."
    conn.commit()
def split_file(input, line_number=10000000):
    # set starting count values
    a = 1
    b = 0
    c = line_number
    # open first output file and add to list
    files = ['split' + str(a) + '.tmp']
    out = open('split' + str(a) + '.tmp', 'w')
    # open input file
    with TextIOWrapper(gzopen(input, 'rb')) as e:
        # iterate over each line, adding an index
        for index, line in enumerate(e):
            # test if index fits between upper and lower limits and write to file
            if index <= c:
                if index > b:
                    out.write(str(line))
            else:
                # close last output
                out.close()
                # reset count values
                a += 1
                b = c
                c += line_number
                # open new output and add to list
                files.append('split' + str(a) + '.tmp')
                out = open('split' + str(a) + '.tmp', 'w')
                # output line
                out.write(str(line))
    # close last output
    out.close()
    # return list of temporary files for use in other functions
    return files
def compute_removed_queries_because_of_null_clustering(pickle_path_removed_queries, clusters, join_clusters=None):
    """
    This function will compute the set of queries that should be removed because
    they have a null clustering over the clusters we are passed in argument.

    :param clusters: a dict {qid: clustering_vector}
    :param join_clusters: if the clusters are being loaded in a background process,
        the function to be executed to force to wait for this background process to
        have finished before accessing the clusters object
    """
    from numpy.linalg import norm
    print "Looking for queries with null cluster vector..."
    t0 = time()
    try:
        print "Trying to pickle from disk...", pickle_path_removed_queries
        with gzopen(pickle_path_removed_queries, 'r') as f:
            print "File", pickle_path_removed_queries, "was found!"
            removed_queries = set(load_pickled_list(f))
            pickled = True
    except Exception as err:
        if not isinstance(err, IOError):
            print "Error for", pickle_path_removed_queries, "was:", err
        print "No pickled files or error loading it, recomputing..."
        pickled = False
        removed_queries = set()
        # In case of recomputation we need to wait for the clusters data to be
        # available, if they're loaded in bg
        if join_clusters is not None:
            join_clusters()
        for qid, cl in clusters.items():
            if norm(cl) < ZERO_FLOAT:  # Should be precise enough?
                removed_queries.add(qid)
    print "Done ", time() - t0
    pickle_ask(pickled, pickle_path_removed_queries, removed_queries, dump_f=pickle_list)
    return removed_queries
def write(path, text):
    from gzip import open as gzopen
    print 'writing', path, text.count('\n')
    f = gzopen(path, 'w')
    f.write(text)
    f.close()
    os.system("gzip " + path)
def _write_table(profile_dir, table_name, rows, fields,
                 append=False, gzip=False):
    # don't gzip if empty
    rows = iter(rows)
    try:
        first_row = next(rows)
    except StopIteration:
        gzip = False
    else:
        rows = chain([first_row], rows)

    if gzip and append:
        logging.warning('Appending to a gzip file may result in '
                        'inefficient compression.')

    if not os.path.exists(profile_dir):
        raise ItsdbError(
            'Profile directory does not exist: {}'.format(profile_dir))

    tbl_filename = os.path.join(profile_dir, table_name)
    mode = 'a' if append else 'w'
    if gzip:
        # text mode only from py3.3; until then use TextIOWrapper
        # mode += 't'  # text mode for gzip
        f = TextIOWrapper(gzopen(tbl_filename + '.gz', mode=mode))
    else:
        f = open(tbl_filename, mode=mode)

    for row in rows:
        f.write(make_row(row, fields) + '\n')

    f.close()
def filter_and_read_tsv(dat, gzipped, integer_samfilters):
    """If filters supplied, subset DAT first, then read with pandas"""
    number_retained = 0
    if gzipped:
        opener = gzopen
    else:
        opener = open
    with opener(dat, mode="rt") as dat_handle:
        with TemporaryDirectory() as tempdir:
            datflt_name = path.join(tempdir, "dat.gz")
            with gzopen(datflt_name, mode="wt") as datflt:
                decorated_line_iterator = progressbar(
                    dat_handle, desc="Filtering", unit=" lines",
                )
                for line in decorated_line_iterator:
                    if line[0] == "#":
                        print(line, end="", file=datflt)
                    else:
                        fields = line.split("\t")
                        line_passes_filter = entry_filters_ok(
                            int(fields[1]), int(fields[4]), integer_samfilters,
                        )
                        if line_passes_filter:
                            number_retained += 1
                            print(line, end="", file=datflt)
            print("Kept {} records".format(number_retained), file=stderr)
            print("Loading DAT...", file=stderr, flush=True)
            return read_csv(datflt_name, sep="\t", escapechar="#")
def split_fastq_for_sample_barcodes(path_to_splitted_fastq, read1_file):
    from Bio.Seq import Seq
    from Bio import SeqIO
    from gzip import open as gzopen
    for record in SeqIO.parse(gzopen(read1_file, "rt"), format="fastq"):
        BGI_header = record.id
        tile_fastq = int(BGI_header[20:-2])
        tile_fastq = str(tile_fastq)
        x_pos = int(BGI_header[13:16])
        x_pos = str(x_pos)
        y_pos = int(BGI_header[17:20])
        y_pos = str(y_pos)
        read1_pos = tile_fastq + ":" + x_pos + ":" + y_pos
        try:
            read1_info = read1_header_dict[read1_pos]
            sample = read1_info[0]
            read1_header = read1_info[1]
            record.description = record.description.replace(record.id, "")
            record.id = read1_header
            splitted_fastq_file = path_to_splitted_fastq + sample + "_read1.fq"
            with open(splitted_fastq_file, "a") as output_handle:
                SeqIO.write(record, output_handle, "fastq")
        except:
            continue
def w2p_pack(filename, path, compiled=False, filenames=None):
    """Packs a web2py application.

    Args:
        filename(str): path to the resulting archive
        path(str): path to the application
        compiled(bool): if `True` packs the compiled version
        filenames(list): adds filenames to the archive
    """
    filename = abspath(filename)
    path = abspath(path)
    tarname = filename + '.tar'
    if compiled:
        tar_compiled(tarname, path, '^[\w\.\-]+$',
                     exclude_content_from=['cache', 'sessions', 'errors'])
    else:
        tar(tarname, path, '^[\w\.\-]+$', filenames=filenames,
            exclude_content_from=['cache', 'sessions', 'errors'])
    w2pfp = gzopen(filename, 'wb')
    tarfp = open(tarname, 'rb')
    w2pfp.write(tarfp.read())
    w2pfp.close()
    tarfp.close()
    os.unlink(tarname)
def open(dir: util.PathLike, name: str, encoding: Optional[str] = None) -> IO[str]:
    """
    Open a TSDB database file.

    Unlike a normal `open()` call, this function takes a base directory *dir*
    and a filename *name* and determines whether the plain text *dir*/*name*
    or compressed *dir*/*name*.gz file is opened. Furthermore, this function
    only opens files in read-only text mode. For writing database files, see
    :func:`write`.

    Args:
        dir: path to the database directory
        name: name of the file to open
        encoding: character encoding of the file
    Example:
        >>> sentences = []
        >>> with tsdb.open('my-profile', 'item') as item:
        ...     for line in item:
        ...         sentences.append(tsdb.split(line)[6])
    """
    path = get_path(dir, name)
    if path.suffix.lower() == '.gz':
        return gzopen(path, mode='rt', encoding=encoding)
    else:
        return path.open(encoding=encoding)
def _open_table(tbl_filename):
    if tbl_filename.endswith('.gz'):
        gz_filename = tbl_filename
        tbl_filename = tbl_filename[:-3]
    else:
        gz_filename = tbl_filename + '.gz'

    if os.path.exists(tbl_filename) and os.path.exists(gz_filename):
        logging.warning(
            'Both gzipped and plaintext files were found; attempting to '
            'use the plaintext one.'
        )
    if os.path.exists(tbl_filename):
        with open(tbl_filename) as f:
            yield f
    elif os.path.exists(gz_filename):
        # text mode only from py3.3; until then use TextIOWrapper
        with TextIOWrapper(
                BufferedReader(gzopen(tbl_filename + '.gz', mode='r'))) as f:
            yield f
    else:
        raise ItsdbError(
            'Table does not exist at {}(.gz)'
            .format(tbl_filename)
        )
def check_seq(file):
    if file == '-':
        try:
            seq = stdin.read()
        except UnicodeDecodeError:
            exit("[!] Cannot read STDIN, if gzipped try: file | gunzip -c | sideroscanner")
    else:
        if not Path(file).exists():
            return print(f"[!] {file} does not exist, skipping...")
        try:
            with open(file, 'r') as fr:
                seq = fr.read()
        except IsADirectoryError:
            return print(f"[!] {file} is a directory, skipping...")
        except UnicodeDecodeError:
            try:
                with gzopen(file, "rt") as fr:
                    seq = fr.read()
            except OSError:
                return print(f"[!] Could not open {file}, skipping...")
    if len(seq) <= 10:
        return print(f"[!] {file} is too small, skipping...")
    if ">" in seq[0] or "@" in seq[0]:
        return seq
    else:
        return print(f"[!] {file} is not a fasta file, skipping...")
def fastq_info(fastq, chunksize=10000):
    '''
    Extract flowcell and other metadata from a FASTQ

    Parameters
    ----------
    fastq : str
        Path to FASTQ file
    chunksize : int
        Number of records to read simultaneously
    '''
    # check for file format
    if fastq.endswith('.gz'):
        fq_handle = gzopen(fastq, 'rt')
    else:
        fq_handle = open(fastq, 'r')
    # load reads for random access
    records = SeqIO.parse(fq_handle, 'fastq')
    # data structure for managing metadata
    metadata = {'flowcells': set([]), 'n_reads': 0}
    # load `chunksize` reads at a time
    chunked_records = grouper(records, chunksize)
    for chunk in chunked_records:
        # filter chunk
        chunk = [r for r in chunk if r is not None]
        # count reads
        metadata['n_reads'] += len(chunk)
        # extract unique flowcell IDs
        flowcells = set([parse_read_id(r.id, 'flowcell') for r in chunk])
        for f in flowcells:
            metadata['flowcells'].add(f)
    fq_handle.close()
    return metadata
def add_ICRA_probs(jsdel_f, in_bam_f, out_bam_f, remove_unmapped=True,
                   remove_not_in_delta=False, delta_thresh=0.9):
    dt = datetime.now()
    with gzopen(jsdel_f, 'rt') as jsdel_fh:
        delta = dict(ujson.load(jsdel_fh, precise_float=True))
    log_.info('Loaded delta from file {}. Time: {}'.format(jsdel_f, datetime.now() - dt))
    in_bam = pysam.AlignmentFile(in_bam_f)  # @UndefinedVariable
    out_bam = pysam.AlignmentFile(out_bam_f, "wb", header=in_bam.header)  # @UndefinedVariable
    for rid, grp in groupby(in_bam, attrgetter('query_name')):
        alngrp = list(grp)
        if remove_unmapped:
            if (len(alngrp) == 1 and alngrp[0].is_unmapped) \
                    or (len(alngrp) == 2 and alngrp[0].is_unmapped and alngrp[1].is_unmapped):
                continue
        try:
            d_rid = delta[rid]
        except KeyError:
            if remove_not_in_delta:
                continue
            for aln in alngrp:
                aln = _add_zw_tag(aln, 0)
            _write_to_sam(out_bam, alngrp)
            continue
        alngrp = _add_deltas(alngrp, d_rid, delta_thresh)
        _write_to_sam(out_bam, alngrp)
def open_fasta(filename):
    """Open FASTA with 'open' if plaintext, 'gzip.open' if gzipped"""
    with open(filename, mode="rb") as bytes_handle:
        is_gzipped = (hexlify(bytes_handle.read(2)) == b"1f8b")
    if is_gzipped:
        yield gzopen(filename, mode="rt")
    else:
        yield open(filename, mode="rt")
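# Usage sketch for open_fasta above. Since it yields a handle, the assumption here
# is that the original module wraps it with contextlib.contextmanager; the wrapping
# is done explicitly below and the file name is hypothetical.
from contextlib import contextmanager
open_fasta_cm = contextmanager(open_fasta)
with open_fasta_cm('genome.fa.gz') as fasta:
    first_header = fasta.readline()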
def prepare(self):
    """
    Fill all properties not already populated at initialization with values.

    Returns:
        True in case of success, False otherwise.
    """
    logger = logging.getLogger("shared.embeddings_config.prepare")

    # Inspired by https://github.com/bplank/bilstm-aux/blob/master/src/lib/mio.py#L5-L22
    logger.debug("Opening embeddings file from %s with %s encoding", self.path, self.encoding)
    logger.debug("Using separator '%s'", self.separator)
    if self.lower:
        logger.debug("All words will be converted to lowercase")

    if self.gzip:
        f = gzopen(self.path, mode="r")
        lines = codecs.getreader("utf-8")(f).readlines()
    else:
        f = codecs.open(self.path, mode="r", encoding="utf-8")
        lines = f

    for line in lines:
        try:
            fields = line.strip().split(self.separator)
            # All fields but the first are values of the embedding vector
            vec = [float(value) for value in fields[1:]]
            # The first field is the word
            word = fields[0]
            # Apply lower case
            if self.lower:
                word = word.lower()
            self._vectors[word] = vec
        except ValueError:
            logger.warn(
                "Failed to prepare embeddings because line in embeddings file could not be read: %s",
                line)
            return False

    # Close file
    f.close()

    # Check if the length of the vectors is actually the specified embeddings size
    logger.debug("Vectors should have dimensionality of %d", self.size)
    logger.debug("Vectors from embedding file have dimensionality of %d", len(vec) if vec else 0)
    assert len(vec) == self.size

    logger.info(
        "Finished reading the embeddings file. Loaded vectors for %d distinct words.",
        len(self.vectors))

    self._prepared = True
    return True
def generate_region_file(bam_region, region, options):
    reads, pos_range = get_reads_and_ranges(bam_region, *region, options)
    wps_list, cov_sites = get_wps(reads, pos_range, *region, options)
    if cov_sites or options.empty:
        if region.strand == "-":
            wps_list = reversed(wps_list)
        with gzopen(options.outfile % region.cid, "wt") as wps_handle:
            for line in wps_list:
                print(*line, sep="\t", file=wps_handle)
def _open_fastq(in_path):
    """Returns compressed or uncompressed FASTQ file handle"""
    try:
        in_fq = gzopen(in_path, 'r')
        in_fq.readline()
    except IOError:
        in_fq = open(in_path, 'r')
    in_fq.seek(0)
    return in_fq
def parse_file(path):
    ranges = {}
    if '.gz' in path:
        with gzopen(path, 'rt') as f:
            parse_lines(f, ranges)
    else:
        with open(path) as f:
            parse_lines(f, ranges)
    return ranges
def load_delve(dataset_path, dataset_spec, n=None):
    """
    Load a delve dataset. Specification is given by the spec file.

    :param dataset_path: Path to the .data.gz file.
    :param dataset_spec: Path to the .spec file.
    :param n: If defined, read only first n rows.
    :return: Dictionary with data, target.
    """
    rdict = dict()
    sd = parse_spec(dataset_spec)
    fp = gzopen(dataset_path, "r")
    line = str(fp.readline())
    count = 0
    X = list()
    y = list()
    while line:
        if line.count('\\'):
            # Must read another line
            line = line.strip().replace("\\", "") + str(fp.readline())
        x = zeros((sd.num_vars, ))
        for i, v in enumerate(line.strip().split()):
            if i in sd:
                if sd[i] == sd.TARGET:
                    y.append(float(v))
                else:
                    j = sd[i]
                    x[j] = float(v)
            elif (i, v) in sd:
                j = sd[i, v]
                x[j] = 1
            else:
                pass
        X.append(x)
        line = str(fp.readline())
        count += 1
        if n is not None and count == n:
            break
    rdict["data"] = array(X)
    rdict["target"] = array(y)
    rdict["labels"] = [sd.labels[i] for i in range(len(X[0]))]
    return rdict
def make_single_fastq_gz(read_sets, out_dir, include_reverse):
    """Recovers read set information from kneaddata output

    Parameters
    ----------
    read_sets: list of tup
        list of 7-tuples with run prefix, sample name, fwd paired read fp,
        rev paired read fp, fwd unpaired read fp, rev unpaired read fp,
        and single fwd read fp.
    out_dir : str
        The path to a directory in which to write files
    include_reverse : bool
        Whether to include reverse sequences in combined file

    Returns
    -------
    combined_reads: list of tup
        list of 3-tuples with run prefix, sample name, combined gzip fastq

    Raises
    ------
    OSError
        If the Popen process call to cat returns with value other than 0

    Notes
    -----
    If all input files are empty for a sample, will not output that sample
    in the `sample` list.
    """
    combined_reads = []
    for run_prefix, sample, f_p, r_p, f_u, r_u, s in read_sets:
        out_fp = join(out_dir, '%s.fastq.gz' % run_prefix)
        if s is None:
            if include_reverse:
                cmd = 'cat %s %s %s %s > %s' % (f_p, r_p, f_u, r_u, out_fp)
            else:
                cmd = 'cat %s %s > %s' % (f_p, f_u, out_fp)
        else:
            cmd = 'cat %s > %s' % (s, out_fp)
        proc = Popen(cmd, shell=True)
        failure = proc.wait()
        if failure != 0:
            raise OSError('Problem with cat of files: %s' % cmd)
        # Check to make sure that the combined gzip is not totally empty
        with gzopen(out_fp, 'rb') as f:
            data = f.read(1).strip()
        if data:
            combined_reads.append((run_prefix, sample, out_fp))
    return combined_reads
def _open_table(tbl_filename):
    path = _table_filename(tbl_filename)
    if path.endswith('.gz'):
        # text mode only from py3.3; until then use TextIOWrapper
        with TextIOWrapper(
                BufferedReader(gzopen(tbl_filename + '.gz', mode='r'))) as f:
            yield f
    else:
        with open(tbl_filename) as f:
            yield f
def main():
    blocks = []
    with open("Blocks.txt", "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip() or line[0] == "#":
                continue
            bdata0 = line.split(";")
            bdata0[1] = bdata0[1].strip()
            bdata1 = bdata0[0].split("..")
            blocks.append(UnicodeBlock(bdata1[0], bdata1[1], bdata0[1]))
    han_re = re_compile(r"[A-Z]")
    unihan = {}
    with open("Unihan_Readings.txt", "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip() or line[0] == "#":
                continue
            bdata0 = line.split("\t")
            cp = bdata0[0][2:]
            if cp not in unihan:
                unihan[cp] = UnihanData(cp)
            prop = bdata0[1][1:]
            prop = han_re.sub(lambda m: "_" + m.group(0).lower(), prop)[1:]
            setattr(unihan[cp], prop, bdata0[2].strip())
    data = []
    with open("UnicodeData.txt", "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip() or line[0] == "#":
                continue
            udata = line.split(";")
            uval = int(udata[0], 16)
            for block in blocks:
                if block.start <= uval and block.end >= uval:
                    tblock = block
                    break
            data.append(UnicodeCodepoint(udata, tblock))
    unidata = {
        "blocks": blocks,
        "unihan": list(unihan.values()),
        "characters": data
    }
    unidata_json = dumps(unidata, default=json_callback)
    with gzopen("unicode_data.json.gz", "wb") as f:
        f.write(unidata_json.encode("utf-8"))
def count_fastq_sequences(filename, min_length=MIN_READ_LENGTH):
    counts = collections.defaultdict(int)
    with gzopen(filename, mode="rt") as handle:
        for idx, line in enumerate(handle):
            if idx % 4 == 1:
                read = line.strip().upper()
                if len(read) > min_length:
                    counts[read] += 1
    return dict(counts)
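# Usage sketch for count_fastq_sequences above (file name and length cutoff are
# hypothetical): counts identical reads longer than min_length in a gzipped FASTQ.
read_counts = count_fastq_sequences('sample_R1.fastq.gz', min_length=50)
most_common_read = max(read_counts, key=read_counts.get) if read_counts else None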
def read_file(self, filename):
    """Reads zipped NetCDF file and returns its file pointer."""
    # Uncompress NetCDF file.
    f = gzopen('%s' % (filename), 'rb')
    g = open('%s_%s.nc' % (self.params['uuid'], 'dump'), 'wb')
    g.write(f.read())
    f.close()
    g.close()
    return netcdf('%s_%s.nc' % (self.params['uuid'], 'dump'), 'r')
def save_fake_fib(fname):
    """returns a dict to get saved"""
    inds = np.arange(QSDR_SHAPE[0] * QSDR_SHAPE[1] * QSDR_SHAPE[2])
    mx, my, mz = np.unravel_index(inds, QSDR_SHAPE, order="F")
    fop = gzopen(fname, "wb")
    savemat(fop,
            {"dimension": np.array(QSDR_SHAPE), "mx": mx, "my": my, "mz": mz},
            format='4')
    fop.close()
def do_process_clusters_pickle(pickle_path_clusters):
    global big_queries_set, clusters
    try:
        print "Trying to pickle from disk...", pickle_path_clusters
        with gzopen(pickle_path_clusters, 'r') as f:
            print "File", pickle_path_clusters, "was found!"
            clusters = load_pickled_dict_to_np_arrays(f, pre_initialized_dict=clusters)
    except Exception as err:
        print "Error for", pickle_path_clusters, "was:", err
        return False
    return clusters
def w2p_pack(filename, path, compiled=False):
    tarname = filename + '.tar'
    if compiled:
        tar_compiled(tarname, path, '^[\w\.\-]+$')
    else:
        tar(tarname, path, '^[\w\.\-]+$')
    w2pfp = gzopen(filename, 'wb')
    tarfp = open(tarname, 'rb')
    w2pfp.write(tarfp.read())
    w2pfp.close()
    tarfp.close()
    os.unlink(tarname)
def dump_gzipped_contents(self):
    while self.gzipped_io_queue:
        fname, content = self.gzipped_io_queue.pop()
        print "Dumping gzipped data to file", fname
        f = gzopen(fname, "wb", 6)  # 6 is supposed to offer very good perf/size ratio
        f.write(content)
        f.close()
    if self.gzip_dump_timer is None or self.stop_gzip_timer is True:
        # It means we should not repeat ourselves
        return
    else:
        self.gzip_dump_timer = Timer(self.DUMP_PAGES_EVERY_X_SECONDS, self.dump_gzipped_contents)
        self.gzip_dump_timer.start()
def testBatchUpload(self):
    with gzopen(join(self.bulkLoadDir, "test.rdf.gz"), 'w') as f:
        f.write("""<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
    <rdf:Description>
        <rdf:type>uri:testBatchUpload</rdf:type>
    </rdf:Description>
</rdf:RDF>""")
    self.runBatchUpload(graph="uri:example.org")
    json = self.query('SELECT ?s WHERE { ?s ?p "uri:testBatchUpload" }')
    self.assertEquals(1, len(json['results']['bindings']))

    self.clearBatches()
    with gzopen(join(self.bulkLoadDir, "test2.rdf.gz"), 'w') as f:
        f.write("""<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
    <rdf:Description>
        <rdf:type>uri:testBatchUpload2</rdf:type>
    </rdf:Description>
</rdf:RDF>""")
    self.runBatchUpload(graph="uri:example.org")
    json = self.query('SELECT ?s WHERE { ?s ?p "uri:testBatchUpload" }')
    self.assertEquals(1, len(json['results']['bindings']))
    json = self.query('SELECT ?s WHERE { ?s ?p "uri:testBatchUpload2" }')
    self.assertEquals(1, len(json['results']['bindings']))
def __init__(self, source):
    self.singles = u''
    self.pinyin_dict = {}
    self.frequency = {}
    for ln in gzopen(source, 'rt').readlines():
        body = ln.strip().split()
        thechr = unichr(int(body[0], base=16))
        pinyins = map(lambda pinyin: pinyin.replace('u:', 'v'), body[1][1:-1].split(','))
        if len(pinyins) == 1:
            self.singles += thechr
        self.frequency[thechr] = 0
        for pinyin in pinyins:
            if pinyin not in self.pinyin_dict:
                self.pinyin_dict[pinyin] = u''
            self.pinyin_dict[pinyin] += thechr
def to_arb(self, ids, aln_seq_field, directio_basename=None, size=10000):
    """Fetch ARB records

    If direct IO, data written direct to file(s). Data are written gzip'd,
    and spread over multiple files. directio_basename is the base filename,
    and that name is tagged with a unique number
    """
    bin_ids = (ids[i:i + size] for i in xrange(0, len(ids), size))

    if directio_basename is not None:
        file_count = 0
    else:
        out = []

    cursor = self.con.cursor()
    for chunk in bin_ids:
        if directio_basename is not None:
            out = gzopen(directio_basename + '_%d.txt.gz' % file_count, 'w')

        joined_ids = ','.join(map(str, chunk))
        cursor.execute(FULL_RECORD_DUMP % (aln_seq_field, joined_ids))

        for rec in cursor.fetchall():
            rec_lines = []
            rec_lines.append("BEGIN\n")
            for o, x in zip(FULL_RECORD_ORDER, rec):
                if o == 'aligned_seq':
                    rec_lines.append("warning=\n")
                if x is not None:
                    rec_lines.append("%s=%s\n" % (o, str(x)))
                else:
                    rec_lines.append("%s=\n" % o)
            rec_lines.append("END\n\n")

            if directio_basename is not None:
                out.write(''.join(rec_lines))
            else:
                out.extend(rec_lines)

        if directio_basename is not None:
            out.close()
            file_count += 1

    cursor.close()

    if directio_basename is None:
        return out
    else:
        return []
def w2p_pack(filename, path, compiled=False):
    filename = abspath(filename)
    path = abspath(path)
    tarname = filename + ".tar"
    if compiled:
        tar_compiled(tarname, path, "^[\w\.\-]+$")
    else:
        tar(tarname, path, "^[\w\.\-]+$")
    w2pfp = gzopen(filename, "wb")
    tarfp = open(tarname, "rb")
    w2pfp.write(tarfp.read())
    w2pfp.close()
    tarfp.close()
    os.unlink(tarname)
def load(g, pm, fname):
    print "Loading file..."
    with gzopen(fname, 'r') as f:
        data = json.load(f)
    print "File loaded"
    vertices = [None] * 16667698
    n = -1
    n_e = -1
    n0 = n
    n_e0 = n_e
    # comma_offset = 0
    t0 = time()
    t1 = t0
    for node, edges in data:
        # node, edges = loads(l.strip()[comma_offset:])
        # comma_offset = 1
        if vertices[node] is not None:
            v_node = g.vertex(vertices[node])
            # log("Node", node, "already exists")
        else:
            # log("Creating node for", node)
            v_node = g.add_vertex()
            n += 1
            vertices[node] = n
            pm[v_node] = node  # Register the actual id of the node as a property of the node
        for e in edges:
            v = None
            if vertices[e] is not None:
                v = g.vertex(vertices[e])
                # log("Node", e, "already exists")
            else:
                # log("Creating node for", e, "(", type(e), ") to create the corresponding edge")
                v = g.add_vertex()
                n += 1
                vertices[e] = n
                pm[v] = e  # Register the actual id of the node as a property of the node
            n_e += 1
            g.add_edge(v_node, v)
        if n % 10000 == 0:
            print "======"
            print "Loaded", n, "nodes in", time() - t0, ". Average:", n / (time() - t0), "nodes/s. Current pace:", (n - n0) / (time() - t1), "n/s"
            print "Loaded", n_e, "edges in", time() - t0, ". Average:", n_e / (time() - t0), "edges/s. Current pace:", (n_e - n_e0) / (time() - t1), "e/s"
            n0 = n
            n_e0 = n_e
            t1 = time()
    print "Loaded ", n, "nodes"
def loadSimulation(f):
    '''
    Inverse operation of L{saveSimulation}. Given a file or filename this
    returns the parameters passed to L{saveSimulation} saved to that file,
    in the same order.

    @param f: file or filename to load
    @type f: file or str
    @return: list of simulation results, see L{saveSimulation} for format.
    @rtype: list
    '''
    if type(f) == str:
        f = gzopen(f, 'rb')
    header, sim, tf, nxf, wf, yf, yt, desc = load(f)
    f.close()
    return (sim, tf, nxf, wf, yf, yt, desc)
def create_wp_table(conn):
    print 'creating'
    cur = conn.cursor()
    cur.execute('''CREATE TABLE IF NOT EXISTS `titles` (title TEXT UNIQUE);''')
    print "Extracting/Inserting..."
    counter = 0
    for line in list(set(map(lambda x: preprocess(x.strip()),
                             gzopen('/'.join(os.path.realpath(__file__).split('/')[:-1])
                                    + '/enwiki-20131001-all-titles-in-ns0.gz')))):
        cur.execute("INSERT INTO `titles` (`title`) VALUES (?)", (line,))
        counter += 1
        if counter % 500 == 0:
            print counter
    print "Committing..."
    conn.commit()
def w2p_unpack(filename, path, delete_tar=True):
    if filename[-4:] == '.w2p' or filename[-3:] == '.gz':
        if filename[-4:] == '.w2p':
            tarname = filename[:-4] + '.tar'
        else:
            tarname = filename[:-3] + '.tar'
        fgzipped = gzopen(filename, 'rb')
        tarfile = open(tarname, 'wb')
        tarfile.write(fgzipped.read())
        tarfile.close()
        fgzipped.close()
    else:
        tarname = filename
    untar(tarname, path)
    if delete_tar:
        os.unlink(tarname)
def w2p_unpack(filename, path, delete_tar=True):
    if filename == 'welcome.w2p':
        create_welcome_w2p()
    filename = abspath(filename)
    tarname = None
    if filename.endswith('.w2p'):
        tarname = filename[:-4] + '.tar'
    elif filename.endswith('.gz'):
        tarname = filename[:-3] + '.tar'
    if tarname is not None:
        with gzopen(filename, 'rb') as gzfp, open(tarname, 'wb') as tarfp:
            shutil.copyfileobj(gzfp, tarfp, 4194304)  # 4 MB buffer
    else:
        tarname = filename
    path = abspath(path)
    untar(tarname, path)
    if delete_tar:
        os.unlink(tarname)
def get_from_file(self, doc_id):
    '''
    Return a response with the XML of the parsed text

    :param doc_id: the id of the document in Solr
    '''
    response = {}
    (wid, id) = doc_id.split('_')
    xmlPath = '%s/%s/%s/%s.xml' % (XML_PATH, wid, id[0], id)
    gzXmlPath = xmlPath + '.gz'
    if path.exists(gzXmlPath):
        response['status'] = 200
        response[doc_id] = ''.join(gzopen(gzXmlPath).readlines())
    elif path.exists(xmlPath):
        response['status'] = 200
        response[doc_id] = ''.join(open(xmlPath).readlines())
    else:
        response['status'] = 500
        response['message'] = 'File not found for document %s' % doc_id
    return response
def _open_table(self, table):
    tbl_filename = os.path.join(self.root, table)
    gz_filename = tbl_filename + '.gz'
    if os.path.exists(tbl_filename) and os.path.exists(gz_filename):
        logging.warning('Both gzipped and plaintext files for table "{}" '
                        'were found; attempting to use the plaintext one.'
                        .format(table))
    if os.path.exists(tbl_filename):
        f = open(tbl_filename)
    elif os.path.exists(gz_filename):
        # text mode only from py3.3; until then use TextIOWrapper
        f = TextIOWrapper(
            BufferedReader(gzopen(tbl_filename + '.gz', mode='r'))
        )
    else:
        raise ItsdbError(
            'Table {} does not exist at {}(.gz)'
            .format(table, tbl_filename)
        )
    return f
def srv_traversal(self):
    maxstep = len(self.srvmfobj)
    # self.srvmfobj.save(open('xxx.mf', 'wt'))
    self.report.branch = None
    for chksum in self.srvmfobj.keys():
        yield self.report.setStep(maxstep - len(self.srvmfobj), maxstep)
        rflist = self.srvmfobj[chksum].rflist
        self.signprepare()
        self.signobj.push_rflist(rflist)
        self.signobj.push_end()
        fpathes = self.signobj.fpathes
        if not self.signobj.fpathes:
            signbody = ''
        else:
            self.signobj.run(self.jsz.buf)
            signbody = self.signobj.sinkobj.get_string()
        self.signobj = None
        limit = self.jsz.signlimit * len(rflist) + \
            sum(map(lambda rfpath: len(rfpath), rflist))
        if len(signbody) < limit:
            self.report.branch = 'zip'
            # new package or not worth to do cmps, use cached zip directly.
            # clean up firstly.
            rmcnt = 0
            for fpath in fpathes:
                if not pathexists(fpath):
                    continue
                remove(fpath)
                rmcnt = rmcnt + 1
            self.report.incDels(rmcnt)
            zipbody = self.urlpost('zip', chksum)
            self.applyzip(zipbody)
        else:
            self.report.branch = 'cmp'
            # no cache, do signature, patch.
            patchbody = self.urlpost('cmp', signbody)
            self.patchflist(patchbody)
        self.climfobj[chksum] = self.srvmfobj.pop(chksum)
    self.climfobj.save(gzopen(self.manifest, 'wb'))
    assert self.srvmfobj == {}
    yield self.report.setStep(maxstep, maxstep)
    self.report.branch = None
    yield None
def greengenes_open(file_fp, permission='U'):
    """Read or write the contents of a file

    file_fp : file path
    permission : either 'U','r','w','a'

    NOTE: universal line breaks are always used, so 'r' is automatically
    changed into 'U'
    """
    if permission not in ['U', 'r', 'w', 'a']:
        raise IOError, "Unknown permission: %s" % permission

    if file_fp.endswith('gz'):
        # gzip doesn't support Ub
        if permission == 'U':
            permission = 'r'
        return gzopen(file_fp, permission)
    else:
        if permission == 'r':
            permission = 'U'
        return open(file_fp, permission)
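# Hypothetical usage sketch for greengenes_open above (file names are made up):
# gzipped input is routed through gzip.open, everything else through the builtin open.
seqs = greengenes_open('gg_13_5.fasta.gz')    # gzip handle, 'U' downgraded to 'r'
notes = greengenes_open('notes.txt', 'w')     # plain file opened for writing
notes.write('example\n')
notes.close()
seqs.close()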
def w2p_unpack(filename, path, delete_tar=True):
    if filename == "welcome.w2p":
        create_welcome_w2p()
    filename = abspath(filename)
    path = abspath(path)
    if filename[-4:] == ".w2p" or filename[-3:] == ".gz":
        if filename[-4:] == ".w2p":
            tarname = filename[:-4] + ".tar"
        else:
            tarname = filename[:-3] + ".tar"
        fgzipped = gzopen(filename, "rb")
        tarfile = open(tarname, "wb")
        tarfile.write(fgzipped.read())
        tarfile.close()
        fgzipped.close()
    else:
        tarname = filename
    untar(tarname, path)
    if delete_tar:
        os.unlink(tarname)