def univ_open(file_path, mode='r'):
    # If the file ends with ".gz" then open it through GZip
    if file_path.split('.')[-1].lower() == 'gz':
        from gzip import open as gzopen
        if mode in ('w', 'wb', 'w+', 'wb+'):
            return gzopen(file_path, mode, 6)
        else:
            return gzopen(file_path, mode)
    else:
        return open(file_path, mode)
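A minimal usage sketch for univ_open above; the file names are placeholders chosen here for illustration:

# Plain-text and gzipped files go through the same helper; a '.gz' suffix routes to gzip.open.
with univ_open('results.txt', 'r') as plain_handle:
    plain_text = plain_handle.read()
with univ_open('results.txt.gz', 'r') as gz_handle:
    gz_data = gz_handle.read()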
Example #2
def open_pdb(structure, verbose=True, try_web=True):
    '''Return an opened PDB file handle from STDIN, file, local PDB cache, or web'''

    # STDIN
    if "<open file '<stdin>', mode 'r' at" in str(structure):
        pdb_filehandle = structure

    # AS UNCOMPRESSED PDB FILE
    elif os.path.exists(structure) and is_binary_file(
            structure) == False:  #file exists and is a text-based file
        pdb_filehandle = open(structure, 'r')

    # AS GZIPPED PDB FILE
    elif os.path.exists(structure) and is_binary_file(
            structure) == True:  #file exists and is likely a gzipped file
        try:
            testopen = gzopen(structure, 'r')
            testopen.readline()
            testopen.close()
            pdb_filehandle = gzopen(structure, 'r')
        except IOError:
            if (verbose):
                print 'Invalid structure file-type. Structure file must be a plain-text PDB file or a gzipped PDB file.'
            return

    # AS PDB FILE FROM LOCAL COPY OF THE PDB -OR- FROM THE WEB
    elif len(structure) == 4:

        pdb_storage_path = os.path.join(
            PDB_DATA_DIR,
            '%s/pdb%s.ent.gz' % (structure[1:3].lower(), structure.lower()))

        #local file
        if os.path.exists(pdb_storage_path):
            pdb_filehandle = gzopen(pdb_storage_path, 'r')
        #try the web
        elif (try_web):
            try:
                pdb_filehandle = urlopen(
                    'http://www.rcsb.org/pdb/files/%s.pdb' %
                    (structure.upper()))
            except HTTPError:
                if (verbose):
                    print 'Invalid structure input: %s. Not found as local file, as PDB structure in %s, or on the web.' % (
                        structure, PDB_DATA_DIR)
                return
        else:
            return
    else:
        if (verbose):
            print 'Invalid structure input: %s. Not found as local file, and wrong number of characters for direct PDB reference.' % (
                structure)
        return

    return pdb_filehandle
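A hypothetical call to open_pdb above; '1abc' stands in for a real 4-character PDB identifier:

# The handle may come from STDIN, a plain or gzipped file, the local PDB mirror, or the RCSB web site.
handle = open_pdb('1abc', verbose=True, try_web=True)
if handle is not None:
    first_line = handle.readline()
    handle.close()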
Example #3
    def sample_reads(self):
        if file_exists(self.filenames['sampled_reads']):
            log.info(f"Will use existing {self.filenames['sampled_reads']}")
        else:
            if self.coverage == 0:
                os.symlink(os.path.abspath(self.readfile),
                           self.filenames['sampled_reads'])
                log.info(f"Using all reads as coverage option is 0")
            else:
                log.info(
                    f"Sampling {self.coverage} times coverage of {len(self)/1000000:.1f} Mb assembly from >{self.minreadlength}bp reads in {self.readfile}"
                )

                with gzopen(self.readfile,
                            'rt') as all_reads, gzopen(
                                self.filenames['sampled_reads'],
                                'wt',
                                compresslevel=6) as sampled_reads, tqdm(
                                    total=self.coverage, leave=False) as pbar:
                    sampled_bases = read_count = times_coverage = 0
                    readset = []

                    for title, sequence, quality in FastqGeneralIterator(
                            all_reads):

                        if len(sequence) < self.minreadlength:
                            continue

                        readset.append(f"@{title}\n{sequence}\n+\n{quality}\n")

                        read_count += 1
                        sampled_bases += len(sequence)
                        new_times_coverage = round(sampled_bases / len(self))

                        if new_times_coverage > times_coverage:
                            print(''.join(readset), file=sampled_reads, end='')
                            readset = []
                            pbar.update()
                            times_coverage = new_times_coverage

                        if times_coverage == self.coverage:
                            break
                    print(''.join(readset), file=sampled_reads, end='')

                log.info(
                    f"Wrote {read_count} reads ({sampled_bases} bases, {times_coverage} times coverage) to {self.filenames['sampled_reads']}"
                )
                if times_coverage < self.coverage:
                    log.warning(
                        f"Only found {times_coverage} times coverage in reads longer than {self.minreadlength}, not {self.coverage} times; consider reducing minimum read length (-l)"
                    )
 def close_spider(self, spider):
     self.dump_data()
     # The end of JSON dicts
     f = gzopen(self.fname_crawled_items, "a", 6)
     f.write("\n}")
     f.close()
     f = gzopen(self.fname_web_graph, "ab", 6)
     f.write("\n}")
     f.close()
     self.gzip_dump_timer.cancel()
     self.gzip_dump_timer = None
     self.stop_gzip_timer = True
     # And execute it by hand synchronously to be sure
     # we finish dumping everything before closing
     self.dump_gzipped_contents()
 def load_crawled_items(self, spider):
     self.fname_crawled_items = spider.export_results_filename.replace(".json", "") + ".json.gz"
     # The start of a JSON dict
     f = gzopen(self.fname_crawled_items, "ab", 6)
     f.write("{")
     f.close()
     self.fname_web_graph = (
         spider.export_results_filename.replace(".json", "") + "_graph.json.gz"
     )  # Not directly replacing ".json" pattern so that if pattern not here, we just append to fname!
     # The start of a JSON dict
     f = gzopen(self.fname_web_graph, "ab", 6)
     f.write("{")
     f.close()
     self.crawled_items = {}
     self.nodes_edges = {}
def executaParalelo(arquivo, saida):
    # Open the gzipped log database in text mode.
    with gzopen('/home/snoopy/base_de_dados/Base_original/logs-leticia.gz',
                'rt') as baseDados:
        threads = []
        linha = baseDados.readline()
        while linha != "":
            for i in range(NUMERO_PROCESSOS_POR_VEZ):
                # Read one line.
                linha = baseDados.readline()
                # Create a thread for this line.
                processo = th.Thread(target=leSubarquivos,
                                     args=(str(linha), saida))
                threads.append(processo)
                # Start the thread.
                processo.start()
                # Wait for the started threads to finish.
                for thread in threads:
                    thread.join()
Example #7
 def cli_traversal(self):
     step = 0
     self.report.branch = None
     for chksum in self.climfobj.keys():
         yield self.report.setStep(step, len(self.climfobj))
         if chksum in self.srvmfobj:
             self.report.branch = 'match'
             # all files in the current packages are not changed.
             assert (self.climfobj[chksum].rflist ==
                     self.srvmfobj[chksum].rflist)
             self.report.incKeeps(len(self.climfobj[chksum].rflist))
             del (self.srvmfobj[chksum])
             step = step + 1
         else:
             chksum0 = self.srvmfobj.find(chksum)
             if chksum0 is None:
                 self.report.branch = 'discard'
                 # orphan package in client, discard it.
                 # remember the rflist, remove them if required.
                 self.rflist.extend(self.climfobj[chksum].rflist)
                 del (self.climfobj[chksum])
             else:
                 self.report.branch = 'patch'
                 # cached patch found, use it.
                 patchbody = self.urlpost('patch', chksum)
                 rflist = self.climfobj[chksum].rflist
                 self.patchflist(patchbody, rflist)
                 del (self.climfobj[chksum])
                 self.climfobj[chksum0] = self.srvmfobj.pop(chksum0)
                 self.climfobj[chksum0].clean_history()
                 step = step + 1
             self.climfobj.save(gzopen(self.manifest, 'wb'))
     yield self.report.setStep(step, len(self.climfobj))
     self.report.branch = None
     yield None
Example #8
def w2p_unpack(filename, path, delete_tar=True):

    if filename=='welcome.w2p' and (
        not os.path.exists('welcome.w2p') or \
            os.path.exists('NEWINSTALL')):
        try:
            w2p_pack('welcome.w2p', 'applications/welcome')
            os.unlink('NEWINSTALL')
        except:
            msg = "New installation: unable to create welcome.w2p file"
            sys.stderr.write(msg)

    filename = abspath(filename)
    path = abspath(path)
    if filename[-4:] == '.w2p' or filename[-3:] == '.gz':
        if filename[-4:] == '.w2p':
            tarname = filename[:-4] + '.tar'
        else:
            tarname = filename[:-3] + '.tar'
        fgzipped = gzopen(filename, 'rb')
        tarfile = open(tarname, 'wb')
        tarfile.write(fgzipped.read())
        tarfile.close()
        fgzipped.close()
    else:
        tarname = filename
    untar(tarname, path)
    if delete_tar:
        os.unlink(tarname)
Example #10
 def __init__(self, f_in, lemma=False):
     self.lemma = lemma
     self.current_line = None
     if f_in.endswith("gz"):
         self.source = gzopen(f_in, 'rt', encoding='latin-1')
     else:
         self.source = open(f_in, 'r', encoding='latin-1')
Example #11
def main():
    r = requests.get('http://mobile.njit.edu/parking/data.php',
                     headers={
                         'Referer': 'http://mobile.njit.edu/parking/',
                         'Origin': 'http://mobile.njit.edu/'
                     })
    current_time = time.strftime('%Y-%m-%d_%H-%M-%S')

    if not r:
        print "[{}] Failed to connect".format(current_time)
        return
    else:
        print "[{}] Connected".format(current_time)

    with gzopen('/opt/parking/data/{}.json.gz'.format(current_time),
                'wt') as o:
        o.write(r.text)

    decks = r.json['decks']

    with engine.connect() as db:
        for d in decks:
            deck = decks[d]['SiteName']
            available = int(decks[d]['Available'])
            occupied = int(decks[d]['Occupied'])
            total = int(decks[d]['Total'])

            db.execute(
                "INSERT INTO NJITParking (deck, available, occupied, total) VALUES (%s, %s, %s, %s);",
                (deck, available, occupied, total))
Example #12
def w2p_pack(filename, path, compiled=False, filenames=None):
    """Packs a web2py application.

    Args:
        filename(str): path to the resulting archive
        path(str): path to the application
        compiled(bool): if `True` packs the compiled version
        filenames(list): adds filenames to the archive
    """
    filename = abspath(filename)
    path = abspath(path)
    tarname = filename + '.tar'
    if compiled:
        tar_compiled(tarname,
                     path,
                     r'^[\w.-]+$',
                     exclude_content_from=['cache', 'sessions', 'errors'])
    else:
        tar(tarname,
            path,
            r'^[\w.-]+$',
            filenames=filenames,
            exclude_content_from=['cache', 'sessions', 'errors'])
    with open(tarname, 'rb') as tarfp, gzopen(filename, 'wb') as gzfp:
        shutil.copyfileobj(tarfp, gzfp, 4194304)  # 4 MB buffer
    os.unlink(tarname)
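A hypothetical packing call for the w2p_pack variant above; both paths are placeholders:

# Archive the application directory into a gzip-compressed .w2p file.
w2p_pack('/tmp/myapp.w2p', 'applications/myapp', compiled=False)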
Example #13
def open_file_by_mimetype(filename, mode):
    """
    This function determines the compression MIME type of a file as gz, bz, or none, and returns
    an open file handle of the requested mode ('w', 'r', or 'a')
    """

    if mode != 'r' and mode != 'w' and mode != 'a':
        print("please specific a valid mode:  w, r, a")
        return

    if guess_type(filename)[1] == 'gzip':
        try:
            fh = gzopen(filename, mode)
        except Exception as error:
            print("Error opening file ", filename, ": ", error)
            return
    elif guess_type(filename)[1] == 'bzip2':
        try:
            fh = bzopen(filename, mode)
        except Exception as error:
            print("Error opening file ", filename, ": ", error)
            return
    else:
        try:
            fh = open(filename, mode)
        except Exception as error:
            print("Error opening file ", filename, ": ", error)
            return

    return fh
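A short usage sketch for open_file_by_mimetype above, assuming guess_type comes from the mimetypes module and the path is a placeholder:

# A '.gz' suffix routes to gzip, bzip2-encoded files to bz2, anything else to plain open().
fh = open_file_by_mimetype('counts.tsv.gz', 'r')
if fh is not None:
    header = fh.readline()
    fh.close()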
Example #14
def create_wp_table(conn):
    """
    Creates the wp table from a sqlite connection. This is basically here for posterity, shouldn't be used.

    :param conn: a sqlite connection
    :type conn: sqlite3.Connection

    """
    print 'creating'
    cur = conn.cursor()
    cur.execute('''CREATE TABLE IF NOT EXISTS `titles`
                    (title TEXT UNIQUE);''')

    print "Extracting/Inserting..."
    counter = 0
    for line in list(set(map(lambda x: preprocess(x.strip()),
                             gzopen('/'.join(os.path.realpath(__file__).split('/')[:-1])
                                     + '/enwiki-20131001-all-titles-in-ns0.gz')))):
        cur.execute("INSERT INTO `titles` (`title`) VALUES (?)", (line,))
        counter += 1
        if counter % 500 == 0:
            print counter

    print "Committing..."
    conn.commit()
Example #15
def split_file(input, line_number=10000000):
    #set starting count values
    a = 1
    b = 0
    c = line_number
    #open first output file and add to list
    files = ['split' + str(a) + '.tmp']
    out = open('split' + str(a) + '.tmp', 'w')
    #open input file
    with TextIOWrapper(gzopen(input, 'rb')) as e:
        #iterate over each line, adding an index
        for index, line in enumerate(e):
            #test if index fits between upper and lower limits and write to file
            if index <= c:
                if index > b:
                    out.write(str(line))
            else:
                #close last output
                out.close()
                #reset count values
                a += 1
                b = c
                c += line_number
                #open new output and add to list
                files.append('split' + str(a) + '.tmp')
                out = open('split' + str(a) + '.tmp', 'w')
                #output line
                out.write(str(line))
        #close last output
        out.close()
    #return number of temporary files for use in other functions
    return (files)
def compute_removed_queries_because_of_null_clustering(pickle_path_removed_queries, clusters, join_clusters=None):
    """
        This function computes the set of queries that should be removed because they have a null
        clustering vector over the clusters passed in as an argument.

        :param clusters: a dict {qid: clustering_vector}
        :param join_clusters: if the clusters are being loaded in a background process, the function to be executed to
            force to wait for this background process to have finished before accessing the clusters object
    """
    from numpy.linalg import norm
    print "Looking for queries with null cluster vector..."
    t0 = time()
    try:
        print "Trying to pickle from disk...", pickle_path_removed_queries
        with gzopen(pickle_path_removed_queries, 'r') as f:
            print "File", pickle_path_removed_queries, "was found!"
            removed_queries = set(load_pickled_list(f))
        pickled = True
    except Exception as err:
        if not isinstance(err, IOError):
            print "Error for", pickle_path_removed_queries, "was:", err
        print "No pickled files or error loading it, recomputing..."
        pickled = False
        removed_queries = set()
        # In case of recomputation we need to wait for the clusters data to be available, if they're loaded in bg
        if join_clusters is not None:
            join_clusters()
        for qid, cl in clusters.items():
            if norm(cl) < ZERO_FLOAT:  # Should be precise enough?
                removed_queries.add(qid)
    print "Done ", time()-t0
    pickle_ask(pickled, pickle_path_removed_queries, removed_queries, dump_f=pickle_list)
    return removed_queries
Example #17
def write(path, text):
    from gzip import open as gzopen
    print 'writing', path, text.count('\n')
    f = gzopen(path, 'w')
    f.write(text)
    f.close()
    os.system("gzip " + path)
Example #18
def _write_table(profile_dir,
                 table_name,
                 rows,
                 fields,
                 append=False,
                 gzip=False):
    # don't gzip if empty
    rows = iter(rows)
    try:
        first_row = next(rows)
    except StopIteration:
        gzip = False
    else:
        rows = chain([first_row], rows)
    if gzip and append:
        logging.warning('Appending to a gzip file may result in '
                        'inefficient compression.')

    if not os.path.exists(profile_dir):
        raise ItsdbError(
            'Profile directory does not exist: {}'.format(profile_dir))

    tbl_filename = os.path.join(profile_dir, table_name)
    mode = 'a' if append else 'w'
    if gzip:
        # text mode only from py3.3; until then use TextIOWrapper
        #mode += 't'  # text mode for gzip
        f = TextIOWrapper(gzopen(tbl_filename + '.gz', mode=mode))
    else:
        f = open(tbl_filename, mode=mode)

    for row in rows:
        f.write(make_row(row, fields) + '\n')

    f.close()
Example #19
def filter_and_read_tsv(dat, gzipped, integer_samfilters):
    """If filters supplied, subset DAT first, then read with pandas"""
    number_retained = 0
    if gzipped:
        opener = gzopen
    else:
        opener = open
    with opener(dat, mode="rt") as dat_handle:
        with TemporaryDirectory() as tempdir:
            datflt_name = path.join(tempdir, "dat.gz")
            with gzopen(datflt_name, mode="wt") as datflt:
                decorated_line_iterator = progressbar(
                    dat_handle,
                    desc="Filtering",
                    unit=" lines",
                )
                for line in decorated_line_iterator:
                    if line[0] == "#":
                        print(line, end="", file=datflt)
                    else:
                        fields = line.split("\t")
                        line_passes_filter = entry_filters_ok(
                            int(fields[1]),
                            int(fields[4]),
                            integer_samfilters,
                        )
                        if line_passes_filter:
                            number_retained += 1
                            print(line, end="", file=datflt)
                print("Kept {} records".format(number_retained), file=stderr)
            print("Loading DAT...", file=stderr, flush=True)
            return read_csv(datflt_name, sep="\t", escapechar="#")
def split_fastq_for_sample_barcodes(path_to_splitted_fastq, read1_file):
    from Bio.Seq import Seq
    from Bio import SeqIO
    from gzip import open as gzopen

    for record in SeqIO.parse(gzopen(read1_file, "rt"), format="fastq"):
        BGI_header = record.id
        tile_fastq = int(BGI_header[20:-2])
        tile_fastq = str(tile_fastq)
        x_pos = int(BGI_header[13:16])
        x_pos = str(x_pos)
        y_pos = int(BGI_header[17:20])
        y_pos = str(y_pos)
        read1_pos = tile_fastq + ":" + x_pos + ":" + y_pos
        try:
            read1_info = read1_header_dict[read1_pos]
            sample = read1_info[0]
            read1_header = read1_info[1]
            record.description = record.description.replace(record.id, "")
            record.id = read1_header
            splitted_fastq_file = path_to_splitted_fastq + sample + "_read1.fq"
            with open(splitted_fastq_file, "a") as output_handle:
                SeqIO.write(record, output_handle, "fastq")
        except:
            continue
Example #21
def w2p_pack(filename, path, compiled=False, filenames=None):
    """Packs a web2py application.

    Args:
        filename(str): path to the resulting archive
        path(str): path to the application
        compiled(bool): if `True` packs the compiled version
        filenames(list): adds filenames to the archive
    """
    filename = abspath(filename)
    path = abspath(path)
    tarname = filename + '.tar'
    if compiled:
        tar_compiled(tarname,
                     path,
                     r'^[\w\.\-]+$',
                     exclude_content_from=['cache', 'sessions', 'errors'])
    else:
        tar(tarname,
            path,
            r'^[\w\.\-]+$',
            filenames=filenames,
            exclude_content_from=['cache', 'sessions', 'errors'])
    w2pfp = gzopen(filename, 'wb')
    tarfp = open(tarname, 'rb')
    w2pfp.write(tarfp.read())
    w2pfp.close()
    tarfp.close()
    os.unlink(tarname)
Example #22
def open(dir: util.PathLike,
         name: str,
         encoding: Optional[str] = None) -> IO[str]:
    """
    Open a TSDB database file.

    Unlike a normal `open()` call, this function takes a base
    directory *dir* and a filename *name* and determines whether the
    plain text *dir*/*name* or compressed *dir*/*name*.gz file is
    opened. Furthermore, this function only opens files in read-only
    text mode. For writing database files, see :func:`write`.

    Args:
        dir: path to the database directory
        name: name of the file to open
        encoding: character encoding of the file
    Example:
        >>> sentences = []
        >>> with tsdb.open('my-profile', 'item') as item:
        ...     for line in item:
        ...         sentences.append(tsdb.split(line)[6])
    """
    path = get_path(dir, name)
    if path.suffix.lower() == '.gz':
        return gzopen(path, mode='rt', encoding=encoding)
    else:
        return path.open(encoding=encoding)
Example #23
def _write_table(profile_dir, table_name, rows, fields,
                 append=False, gzip=False):
    # don't gzip if empty
    rows = iter(rows)
    try:
        first_row = next(rows)
    except StopIteration:
        gzip = False
    else:
        rows = chain([first_row], rows)
    if gzip and append:
        logging.warning('Appending to a gzip file may result in '
                        'inefficient compression.')

    if not os.path.exists(profile_dir):
        raise ItsdbError('Profile directory does not exist: {}'
                         .format(profile_dir))

    tbl_filename = os.path.join(profile_dir, table_name)
    mode = 'a' if append else 'w'
    if gzip:
        # text mode only from py3.3; until then use TextIOWrapper
        #mode += 't'  # text mode for gzip
        f = TextIOWrapper(gzopen(tbl_filename + '.gz', mode=mode))
    else:
        f = open(tbl_filename, mode=mode)

    for row in rows:
        f.write(make_row(row, fields) + '\n')

    f.close()
Example #24
def _open_table(tbl_filename):
    if tbl_filename.endswith('.gz'):
        gz_filename = tbl_filename
        tbl_filename = tbl_filename[:-3]
    else:
        gz_filename = tbl_filename + '.gz'

    if os.path.exists(tbl_filename) and os.path.exists(gz_filename):
        logging.warning(
            'Both gzipped and plaintext files were found; attempting to '
            'use the plaintext one.'
        )
    if os.path.exists(tbl_filename):
        with open(tbl_filename) as f:
            yield f
    elif os.path.exists(gz_filename):
        # text mode only from py3.3; until then use TextIOWrapper
        with TextIOWrapper(
                BufferedReader(gzopen(tbl_filename + '.gz', mode='r'))
             ) as f:
            yield f
    else:
        raise ItsdbError(
            'Table does not exist at {}(.gz)'
            .format(tbl_filename)
        )
Example #25
def check_seq(file):
    if file == '-':
        try:
            seq = stdin.read()
        except UnicodeDecodeError:
            exit(
                "[!] Cannot read STDIN, if gzipped try: file | gunzip -c | sideroscanner"
            )
    else:
        if not Path(file).exists():
            return print(f"[!] {file} does not exist, skipping...")
        try:
            with open(file, 'r') as fr:
                seq = fr.read()
        except IsADirectoryError:
            return print(f"[!] {file} is a directory, skipping...")

        except UnicodeDecodeError:
            try:
                with gzopen(file, "rt") as fr:
                    seq = fr.read()
            except OSError:
                return print(f"[!] Could not open {file}, skipping...")

    if len(seq) <= 10:
        return print(f"[!] {file} is too small, skipping...")

    if ">" in seq[0] or "@" in seq[0]:
        return seq
    else:
        return print(f"[!] {file} is not a fasta file, skipping...")
Example #26
def fastq_info(fastq, chunksize=10000):
    '''
    Extract flowcell and other metadata from a FASTQ

    Parameters
    ----------
    fastq : str
        Path to FASTQ file
    chunksize : int
        Number of records to read simultaneously
    '''
    # check for file format
    if fastq.endswith('.gz'):
        fq_handle = gzopen(fastq, 'rt')
    else:
        fq_handle = open(fastq, 'r')
    # load reads for random access
    records = SeqIO.parse(fq_handle, 'fastq')
    # data structure for managing metadata
    metadata = {'flowcells': set([]), 'n_reads': 0}
    # load `chunksize` reads at a time
    chunked_records = grouper(records, chunksize)
    for chunk in chunked_records:
        # filter chunk
        chunk = [r for r in chunk if r is not None]
        # count reads
        metadata['n_reads'] += len(chunk)
        # extract unique flowcell IDs
        flowcells = set([parse_read_id(r.id, 'flowcell') for r in chunk])
        for f in flowcells:
            metadata['flowcells'].add(f)

    fq_handle.close()
    return metadata
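A usage sketch for fastq_info above; the FASTQ path is a placeholder:

# Summarise a possibly gzipped FASTQ: total read count plus the set of flowcell IDs observed.
info = fastq_info('sample.fastq.gz', chunksize=5000)
print('%d reads from flowcells: %s' % (info['n_reads'], ', '.join(sorted(info['flowcells']))))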
def add_ICRA_probs(jsdel_f,
                   in_bam_f,
                   out_bam_f,
                   remove_unmapped=True,
                   remove_not_in_delta=False,
                   delta_thresh=0.9):
    dt = datetime.now()
    with gzopen(jsdel_f, 'rt') as jsdel_fh:
        delta = dict(ujson.load(jsdel_fh, precise_float=True))
    log_.info('Loaded delta from file {}. Time: {}'.format(
        jsdel_f,
        datetime.now() - dt))
    in_bam = pysam.AlignmentFile(in_bam_f)  # @UndefinedVariable
    out_bam = pysam.AlignmentFile(out_bam_f, "wb",
                                  header=in_bam.header)  # @UndefinedVariable
    for rid, grp in groupby(in_bam, attrgetter('query_name')):
        alngrp = list(grp)
        if remove_unmapped:
            if (len(alngrp) == 1 and alngrp[0].is_unmapped) \
                    or (len(alngrp) == 2 and alngrp[0].is_unmapped and alngrp[1].is_unmapped):
                continue
        try:
            d_rid = delta[rid]
        except KeyError:
            if remove_not_in_delta:
                continue
            for aln in alngrp:
                aln = _add_zw_tag(aln, 0)
            _write_to_sam(out_bam, alngrp)
            continue
        alngrp = _add_deltas(alngrp, d_rid, delta_thresh)
        _write_to_sam(out_bam, alngrp)
Example #28
def open_fasta(filename):
    """Open FASTA with 'open' if plaintext, 'gzip.open' if gzipped"""
    with open(filename, mode="rb") as bytes_handle:
        is_gzipped = (hexlify(bytes_handle.read(2)) == b"1f8b")
    if is_gzipped:
        yield gzopen(filename, mode="rt")
    else:
        yield open(filename, mode="rt")
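open_fasta above yields a handle instead of returning one, which suggests it is wrapped as a context manager in its original project (e.g. with contextlib.contextmanager); a sketch of that assumed usage, with a placeholder path:

from contextlib import contextmanager

# Sniff the gzip magic bytes, then read the FASTA through the resulting text handle.
with contextmanager(open_fasta)('genome.fa.gz') as fasta_handle:
    first_header = fasta_handle.readline()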
Example #29
    def prepare(self):
        """
        Fill all properties not already populated at initialization with values.
        Returns:
            True in case of success, False otherwise.
        """
        logger = logging.getLogger("shared.embeddings_config.prepare")

        # Inspired by https://github.com/bplank/bilstm-aux/blob/master/src/lib/mio.py#L5-L22
        logger.debug("Opening embeddings file from %s with %s encoding",
                     self.path, self.encoding)
        logger.debug("Using separator '%s'", self.separator)
        if self.lower:
            logger.debug("All words will be converted to lowercase")

        if self.gzip:
            f = gzopen(self.path, mode="r")
            lines = codecs.getreader("utf-8")(f).readlines()
        else:
            f = codecs.open(self.path, mode="r", encoding="utf-8")
            lines = f

        for line in lines:
            try:
                fields = line.strip().split(self.separator)
                # All fields but the first are values of the embedding vector
                vec = [float(value) for value in fields[1:]]
                # The first field is the word
                word = fields[0]

                # Apply lower case
                if self.lower:
                    word = word.lower()

                self._vectors[word] = vec

            except ValueError:
                logger.warn(
                    "Failed to prepare embeddings because line in embeddings file could not be read: %s",
                    line)
                return False

        # Close file
        f.close()

        # Check if the length of the vectors is actually the specified embeddings size
        logger.debug("Vectors should have dimensionality of %d", self.size)
        logger.debug("Vectors from embedding file have dimensionality of %d",
                     len(vec) if vec else 0)

        assert len(vec) == self.size

        logger.info(
            "Finished reading the embeddings file. Loaded vectors for %d distinct words.",
            len(self.vectors))

        self._prepared = True
        return True
def generate_region_file(bam_region, region, options):
    reads, pos_range = get_reads_and_ranges(bam_region, *region, options)
    wps_list, cov_sites = get_wps(reads, pos_range, *region, options)
    if cov_sites or options.empty:
        if region.strand == "-":
            wps_list = reversed(wps_list)
        with gzopen(options.outfile%region.cid, "wt") as wps_handle:
            for line in wps_list:
                print(*line, sep="\t", file=wps_handle)
Example #31
def _open_fastq(in_path):
    """Returns compressed or uncompressed FASTQ file handle"""
    try:
        in_fq = gzopen(in_path, 'r')
        in_fq.readline()
    except IOError:
        in_fq = open(in_path, 'r')
    in_fq.seek(0)
    return in_fq
def parse_file(path):
    ranges = {}
    if '.gz' in path:
        with gzopen(path, 'rt') as f:
            parse_lines(f, ranges)
    else:
        with open(path) as f:
            parse_lines(f, ranges)
    return ranges
Example #33
def load_delve(dataset_path, dataset_spec, n=None):
    """
        Load an delve dataset. Specification is given by the spec file.

        :param dataset_path
            Path to the .data.gz file.
        :param dataset_spec
            Path to the .spec file.
        :param n
            If defined, read only first n rows.

        :return
            Dictionary data, target.
    """
    rdict = dict()
    sd = parse_spec(dataset_spec)
    fp = gzopen(dataset_path, "r")
    line = str(fp.readline())
    count = 0

    X = list()
    y = list()

    while line:

        if line.count('\\'):
            # Must read another line
            line = line.strip().replace("\\", "") + str(fp.readline())

        x = zeros((sd.num_vars, ))
        for i, v in enumerate(line.strip().split()):
            if i in sd:
                if sd[i] == sd.TARGET:
                    y.append(float(v))
                else:
                    j = sd[i]
                    x[j] = float(v)
            elif (i, v) in sd:
                j = sd[i, v]
                x[j] = 1
            else:
                pass

        X.append(x)

        line = str(fp.readline())
        count += 1

        if n is not None and count == n:
            break

    rdict["data"] = array(X)
    rdict["target"] = array(y)
    rdict["labels"] = [sd.labels[i] for i in range(len(X[0]))]

    return rdict
Example #34
def make_single_fastq_gz(read_sets, out_dir, include_reverse):
    """Recovers read set information from kneaddata output

    Parameters
    ----------
    read_sets: list of tup
        list of 7-tuples with run prefix, sample name, fwd paired read fp,
        rev paired read fp, fwd unpaired read fp, rev unpaired read fp, and
        single fwd read fp.
    out_dir : str
        The path to a directory in which to write files
    include_reverse : bool
        Whether to include reverse sequences in combined file

    Returns
    -------
    combined_reads: list of tup
        list of 3-tuples with run prefix, sample name, combined gzip fastq

    Raises
    ------
    OSError
        If the Popen process call to cat returns with value other than 0

    Notes
    -----
    If all input files are empty for a sample, will not output that sample in
    the `sample` list.
    """
    combined_reads = []
    for run_prefix, sample, f_p, r_p, f_u, r_u, s in read_sets:
        out_fp = join(out_dir, '%s.fastq.gz' % run_prefix)

        if s is None:
            if include_reverse:
                cmd = 'cat %s %s %s %s > %s' % (f_p, r_p, f_u, r_u, out_fp)
            else:
                cmd = 'cat %s %s > %s' % (f_p, f_u, out_fp)
        else:
            cmd = 'cat %s > %s' % (s, out_fp)

        proc = Popen(cmd, shell=True)

        failure = proc.wait()

        if failure != 0:
            raise OSError('Problem with cat of files: %s' % cmd)

        # Check to make sure that the combined gzip is not totally empty
        with gzopen(out_fp, 'rb') as f:
            data = f.read(1).strip()

        if data:
            combined_reads.append((run_prefix, sample, out_fp))

    return (combined_reads)
Example #35
def _open_table(tbl_filename):
    path = _table_filename(tbl_filename)
    if path.endswith('.gz'):
        # text mode only from py3.3; until then use TextIOWrapper
        with TextIOWrapper(
                BufferedReader(gzopen(tbl_filename + '.gz', mode='r'))) as f:
            yield f
    else:
        with open(tbl_filename) as f:
            yield f
Example #36
def main():
    blocks = []
    with open("Blocks.txt", "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip() or line[0] == "#":
                continue

            bdata0 = line.split(";")
            bdata0[1] = bdata0[1].strip()
            bdata1 = bdata0[0].split("..")

            blocks.append(UnicodeBlock(bdata1[0], bdata1[1], bdata0[1]))

    han_re = re_compile(r"[A-Z]")
    unihan = {}
    with open("Unihan_Readings.txt", "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip() or line[0] == "#":
                continue

            bdata0 = line.split("\t")
            cp = bdata0[0][2:]
            if cp not in unihan:
                unihan[cp] = UnihanData(cp)

            prop = bdata0[1][1:]
            prop = han_re.sub(lambda m: "_" + m.group(0).lower(), prop)[1:]

            setattr(unihan[cp], prop, bdata0[2].strip())

    data = []
    with open("UnicodeData.txt", "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip() or line[0] == "#":
                continue

            udata = line.split(";")
            uval = int(udata[0], 16)

            for block in blocks:
                if block.start <= uval and block.end >= uval:
                    tblock = block
                    break

            data.append(UnicodeCodepoint(udata, tblock))

    unidata = {
        "blocks": blocks,
        "unihan": list(unihan.values()),
        "characters": data
    }
    unidata_json = dumps(unidata, default=json_callback)

    with gzopen("unicode_data.json.gz", "wb") as f:
        f.write(unidata_json.encode("utf-8"))
Example #37
def count_fastq_sequences(filename, min_length=MIN_READ_LENGTH):
    counts = collections.defaultdict(int)

    with gzopen(filename, mode="rt") as handle:
        for idx, line in enumerate(handle):
            if idx % 4 == 1:
                read = line.strip().upper()
                if len(read) > min_length:
                    counts[read] += 1

    return dict(counts)
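A hypothetical invocation of count_fastq_sequences above; MIN_READ_LENGTH is defined elsewhere in its source module and the path is a placeholder:

# Count identical reads longer than 30 bp in a gzipped FASTQ.
read_counts = count_fastq_sequences('trimmed.fastq.gz', min_length=30)
total_unique = len(read_counts)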
Example #38
 def read_file(self, filename):
     """Reads zipped NetCDF file and returns its file pointer."""
     #
     # Uncompress NetCDF file.
     f = gzopen('%s' % (filename), 'rb')
     g = open('%s_%s.nc' % (self.params['uuid'], 'dump'), 'wb')
     g.write(f.read())
     f.close()
     g.close()
     #
     return netcdf('%s_%s.nc' % (self.params['uuid'], 'dump'), 'r')
Example #39
def save_fake_fib(fname):
    """returns a dict to get saved"""
    inds = np.arange(QSDR_SHAPE[0] * QSDR_SHAPE[1] * QSDR_SHAPE[2])
    mx, my, mz = np.unravel_index(inds,QSDR_SHAPE,order="F")
    fop = gzopen(fname,"wb")
    savemat(fop,
            {"dimension":np.array(QSDR_SHAPE),
                     "mx":mx,"my":my,"mz":mz},
            format='4'
            )
    fop.close()
def do_process_clusters_pickle(pickle_path_clusters):    
    global big_queries_set, clusters

    try:
        print "Trying to pickle from disk...", pickle_path_clusters
        with gzopen(pickle_path_clusters, 'r') as f:
            print "File", pickle_path_clusters, "was found!"
            clusters = load_pickled_dict_to_np_arrays(f, pre_initialized_dict=clusters)
    except Exception as err:
        print "Error for", pickle_path_clusters, "was:", err
        return False
    return clusters
def w2p_pack(filename, path, compiled=False):
    tarname = filename + '.tar'
    if compiled:
        tar_compiled(tarname, path, r'^[\w\.\-]+$')
    else:
        tar(tarname, path, r'^[\w\.\-]+$')
    w2pfp = gzopen(filename, 'wb')
    tarfp = open(tarname, 'rb')
    w2pfp.write(tarfp.read())
    w2pfp.close()
    tarfp.close()
    os.unlink(tarname)
 def dump_gzipped_contents(self):
     while self.gzipped_io_queue:
         fname, content = self.gzipped_io_queue.pop()
         print "Dumping gzipped data to file", fname
         f = gzopen(fname, "wb", 6)  # 6 is supposed to offer very good perf/size ratio
         f.write(content)
         f.close()
     if self.gzip_dump_timer is None or self.stop_gzip_timer is True:
         # It means we should not repeat ourselves
         return
     else:
         self.gzip_dump_timer = Timer(self.DUMP_PAGES_EVERY_X_SECONDS, self.dump_gzipped_contents)
         self.gzip_dump_timer.start()
Example #44
    def testBatchUpload(self):
        with gzopen(join(self.bulkLoadDir, "test.rdf.gz"), 'w') as f:
            f.write("""<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
    <rdf:Description>
        <rdf:type>uri:testBatchUpload</rdf:type>
    </rdf:Description>
</rdf:RDF>""")
        self.runBatchUpload(graph="uri:example.org")
        json = self.query('SELECT ?s WHERE { ?s ?p "uri:testBatchUpload" }')
        self.assertEquals(1, len(json['results']['bindings']))
        self.clearBatches()
        with gzopen(join(self.bulkLoadDir, "test2.rdf.gz"), 'w') as f:
            f.write("""<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
    <rdf:Description>
        <rdf:type>uri:testBatchUpload2</rdf:type>
    </rdf:Description>
</rdf:RDF>""")
        self.runBatchUpload(graph="uri:example.org")
        json = self.query('SELECT ?s WHERE { ?s ?p "uri:testBatchUpload" }')
        self.assertEquals(1, len(json['results']['bindings']))
        json = self.query('SELECT ?s WHERE { ?s ?p "uri:testBatchUpload2" }')
        self.assertEquals(1, len(json['results']['bindings']))
Example #45
 def __init__(self, source):
     self.singles = u''
     self.pinyin_dict = {}
     self.frequency = {}
     for ln in gzopen(source, 'rt').readlines():
         body = ln.strip().split()
         thechr = unichr(int(body[0], base=16))
         self.frequency[thechr] = 0
         pinyins = map(lambda pinyin: pinyin.replace('u:', 'v'), body[1][1:-1].split(','))
         if len(pinyins) == 1: self.singles += thechr
         for pinyin in pinyins:
             if pinyin not in self.pinyin_dict: self.pinyin_dict[pinyin] = u''
             self.pinyin_dict[pinyin] += thechr
Example #46
    def to_arb(self, ids, aln_seq_field, directio_basename=None, size=10000):
        """Fetch ARB records

        If direct IO, data written direct to file(s). Data are written gzip'd,
        and spread over multiple files. directio_basename is the base
        filename, and that name is tagged with a unique number
        """
        bin_ids = (ids[i:i+size] for i in xrange(0, len(ids), size))

        if directio_basename is not None:
            file_count = 0
        else:
            out = []

        cursor = self.con.cursor()
        for chunk in bin_ids:
            if directio_basename is not None:
                out = gzopen(directio_basename + '_%d.txt.gz' % file_count,
                             'w')

            joined_ids = ','.join(map(str, chunk))
            cursor.execute(FULL_RECORD_DUMP % (aln_seq_field, joined_ids))

            for rec in cursor.fetchall():
                rec_lines = []
                rec_lines.append("BEGIN\n")
                for o, x in zip(FULL_RECORD_ORDER, rec):
                    if o == 'aligned_seq':
                        rec_lines.append("warning=\n")

                    if x is not None:
                        rec_lines.append("%s=%s\n" % (o, str(x)))
                    else:
                        rec_lines.append("%s=\n" % o)
                rec_lines.append("END\n\n")

                if directio_basename is not None:
                    out.write(''.join(rec_lines))
                else:
                    out.extend(rec_lines)

            if directio_basename is not None:
                out.close()
                file_count += 1

        cursor.close()
        if directio_basename is None:
            return out
        else:
            return []
Example #47
def w2p_pack(filename, path, compiled=False):
    filename = abspath(filename)
    path = abspath(path)
    tarname = filename + ".tar"
    if compiled:
        tar_compiled(tarname, path, r"^[\w\.\-]+$")
    else:
        tar(tarname, path, r"^[\w\.\-]+$")
    w2pfp = gzopen(filename, "wb")
    tarfp = open(tarname, "rb")
    w2pfp.write(tarfp.read())
    w2pfp.close()
    tarfp.close()
    os.unlink(tarname)
def load(g, pm, fname):
    print "Loading file..."
    with gzopen(fname, 'r') as f:
        data = json.load(f)
    print "File loaded"
    vertices = [None] * 16667698
    n = -1
    n_e = -1
    n0 = n
    n_e0 = n_e
    # comma_offset = 0
    t0 = time()
    t1 = t0
    for node, edges in data:
        # node, edges = loads(l.strip()[comma_offset:])
        # comma_offset = 1

        if vertices[node] is not None:
            v_node = g.vertex(vertices[node])
            # log("Node", node, "already exists")
        else:
            # log("Creating node for", node)
            v_node = g.add_vertex()
            n += 1
            vertices[node] = n
            pm[v_node] = node  # Register the actual id of the node as a property of the node
            
        for e in edges:
            v = None
            if vertices[e] is not None:
                v = g.vertex(vertices[e])
                # log("Node", e, "already exists")
            else:
                # log("Creating node for", e, "(", type(e), ") to create the corresponding edge")
                v = g.add_vertex()
                n += 1
                vertices[e] = n
                pm[v] = e  # Register the actual id of the node as a property of the node
            n_e += 1
            g.add_edge(v_node, v)

        if n % 10000 == 0:
            print "======"
            print "Loaded", n, "nodes in", time()-t0, ". Average:", n/(time()-t0), "nodes/s. Current pace:", (n-n0)/(time()-t1), "n/s"
            print "Loaded", n_e, "edges in", time()-t0, ". Average:", n_e/(time()-t0), "edges/s. Current pace:", (n_e-n_e0)/(time()-t1), "e/s"
            n0 = n
            n_e0 = n_e
            t1 = time()
    print "Loaded ", n, "nodes"
Example #49
def loadSimulation(f):
  '''
  Inverse operation of L{saveSimulation}. Given a file or filename 
  this returns the parameters passed to L{saveSimulation} saved to 
  that file in the same order.
  @param f: file or filename to load
  @type f: file or str
  @return: list or simulation results, see L{saveSimulation} for format.
  @rtype: list
  '''
  if type(f)==str:
    f = gzopen(f,'rb')
  header,sim,tf,nxf,wf,yf,yt,desc = load(f)
  f.close()
  return(sim,tf,nxf,wf,yf,yt,desc)
def create_wp_table(conn):
    print 'creating'
    cur = conn.cursor()
    cur.execute('''CREATE TABLE IF NOT EXISTS `titles`
                    (title TEXT UNIQUE);''')

    print "Extracting/Inserting..."
    counter = 0
    for line in list(set(map(lambda x: preprocess(x.strip()), gzopen('/'.join(os.path.realpath(__file__).split('/')[:-1])+'/enwiki-20131001-all-titles-in-ns0.gz')))):
        cur.execute("INSERT INTO `titles` (`title`) VALUES (?)", (line,))
        counter += 1
        if counter % 500 == 0:
            print counter

    print "Committing..."
    conn.commit()
def w2p_unpack(filename, path, delete_tar=True):
    if filename[-4:] == '.w2p' or filename[-3:] == '.gz':
        if filename[-4:] == '.w2p':
            tarname = filename[:-4] + '.tar'
        else:
            tarname = filename[:-3] + '.tar'
        fgzipped = gzopen(filename, 'rb')
        tarfile = open(tarname, 'wb')
        tarfile.write(fgzipped.read())
        tarfile.close()
        fgzipped.close()
    else:
        tarname = filename
    untar(tarname, path)
    if delete_tar:
        os.unlink(tarname)
Example #52
def w2p_unpack(filename, path, delete_tar=True):
    if filename == 'welcome.w2p':
        create_welcome_w2p()
    filename = abspath(filename)
    tarname = None
    if filename.endswith('.w2p'):
        tarname = filename[:-4] + '.tar'
    elif filename.endswith('.gz'):
        tarname = filename[:-3] + '.tar'
    if tarname is not None:
        with gzopen(filename, 'rb') as gzfp, open(tarname, 'wb') as tarfp:
            shutil.copyfileobj(gzfp, tarfp, 4194304) # 4 MB buffer
    else:
        tarname = filename
    path = abspath(path)
    untar(tarname, path)
    if delete_tar:
        os.unlink(tarname)
Example #53
    def get_from_file(self, doc_id):
        ''' Return a response with the XML of the parsed text 
        :param doc_id: the id of the document in Solr
        '''

        response = {}
        (wid, id) = doc_id.split('_')
        xmlPath = '%s/%s/%s/%s.xml' % (XML_PATH, wid, id[0], id)
        gzXmlPath = xmlPath + '.gz'
        if path.exists(gzXmlPath):
            response['status'] = 200
            response[doc_id] = ''.join(gzopen(gzXmlPath).readlines())
        elif path.exists(xmlPath):
            response['status'] = 200
            response[doc_id] = ''.join(open(xmlPath).readlines())
        else:
            response['status'] = 500
            response['message'] = 'File not found for document %s' % doc_id
        return response
Example #54
 def _open_table(self, table):
     tbl_filename = os.path.join(self.root, table)
     gz_filename = tbl_filename + '.gz'
     if os.path.exists(tbl_filename) and os.path.exists(gz_filename):
         logging.warning('Both gzipped and plaintext files for table "{}" '
                         'were found; attempting to use the plaintext one.'
                         .format(table))
     if os.path.exists(tbl_filename):
         f = open(tbl_filename)
     elif os.path.exists(gz_filename):
         # text mode only from py3.3; until then use TextIOWrapper
         f = TextIOWrapper(
             BufferedReader(gzopen(tbl_filename + '.gz', mode='r'))
         )
     else:
         raise ItsdbError(
             'Table {} does not exist at {}(.gz)'
             .format(table, tbl_filename)
         )
     return f
Example #55
 def srv_traversal(self):
     maxstep = len(self.srvmfobj)
     #self.srvmfobj.save(open('xxx.mf', 'wt'))
     self.report.branch = None
     for chksum in self.srvmfobj.keys():
         yield self.report.setStep(maxstep - len(self.srvmfobj), maxstep)
         rflist = self.srvmfobj[chksum].rflist
         self.signprepare()
         self.signobj.push_rflist(rflist)
         self.signobj.push_end()
         fpathes = self.signobj.fpathes
         if not self.signobj.fpathes: signbody = ''
         else:
             self.signobj.run(self.jsz.buf)
             signbody = self.signobj.sinkobj.get_string()
         self.signobj = None
         limit = self.jsz.signlimit * len(rflist) +\
                 sum(map(lambda rfpath: len(rfpath), rflist))
         if len(signbody) < limit:
             self.report.branch = 'zip'
             # new package or not worth to do cmps, use cached zip directly.
             # clean up firstly.
             rmcnt = 0
             for fpath in fpathes:
                 if not pathexists(fpath): continue
                 remove(fpath)
                 rmcnt = rmcnt + 1
             self.report.incDels(rmcnt)
             zipbody = self.urlpost('zip', chksum)
             self.applyzip(zipbody)
         else:
             self.report.branch = 'cmp'
             # no cache, do signature, patch.
             patchbody = self.urlpost('cmp', signbody)
             self.patchflist(patchbody)
         self.climfobj[chksum] = self.srvmfobj.pop(chksum)
         self.climfobj.save(gzopen(self.manifest, 'wb'))
     assert(self.srvmfobj == {})
     yield self.report.setStep(maxstep, maxstep)
     self.report.branch = None
     yield None
Example #56
def w2p_pack(filename, path, compiled=False, filenames=None):
    """Packs a web2py application.

    Args:
        filename(str): path to the resulting archive
        path(str): path to the application
        compiled(bool): if `True` packs the compiled version
        filenames(list): adds filenames to the archive
    """
    filename = abspath(filename)
    path = abspath(path)
    tarname = filename + '.tar'
    if compiled:
        tar_compiled(tarname, path, r'^[\w.-]+$',
                     exclude_content_from=['cache', 'sessions', 'errors'])
    else:
        tar(tarname, path, r'^[\w.-]+$', filenames=filenames,
            exclude_content_from=['cache', 'sessions', 'errors'])
    with open(tarname, 'rb') as tarfp, gzopen(filename, 'wb') as gzfp:
        shutil.copyfileobj(tarfp, gzfp, 4194304) # 4 MB buffer
    os.unlink(tarname)
Example #57
def greengenes_open(file_fp, permission='U'):
    """Read or write the contents of a file
    
    file_fp : file path
    permission : either 'U','r','w','a'
    
    NOTE: universal line breaks are always used, so 'r' is automatically changed
    into 'U'
    """
    if permission not in ['U','r','w','a']:
        raise IOError, "Unknown permission: %s" % permission

    if file_fp.endswith('gz'):
        # gzip doesn't support Ub
        if permission == 'U':
            permission = 'r'
        return gzopen(file_fp, permission)
    else:
        if permission == 'r':
            permission = 'U'
        return open(file_fp, permission)
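A small usage sketch for greengenes_open above; both paths are placeholders:

# Read a gzipped taxonomy file and copy it to a plain-text file line by line.
in_fh = greengenes_open('gg_taxonomy.txt.gz')
out_fh = greengenes_open('taxonomy_copy.txt', 'w')
for line in in_fh:
    out_fh.write(line)
in_fh.close()
out_fh.close()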
Example #58
def w2p_unpack(filename, path, delete_tar=True):

    if filename == "welcome.w2p":
        create_welcome_w2p()
    filename = abspath(filename)
    path = abspath(path)
    if filename[-4:] == ".w2p" or filename[-3:] == ".gz":
        if filename[-4:] == ".w2p":
            tarname = filename[:-4] + ".tar"
        else:
            tarname = filename[:-3] + ".tar"
        fgzipped = gzopen(filename, "rb")
        tarfile = open(tarname, "wb")
        tarfile.write(fgzipped.read())
        tarfile.close()
        fgzipped.close()
    else:
        tarname = filename
    untar(tarname, path)
    if delete_tar:
        os.unlink(tarname)