def open_by_suffix(filename):
    """Open *filename* for reading as text, decompressing by filename suffix.

    Args:
        filename: Path string of the file to open.  Names ending in ``.gz``
            are opened with ``gzip``, names ending in ``.bz2`` with ``bz2``,
            and anything else is opened as an ordinary text file.

    Returns:
        An open text-mode file handle.  The caller is responsible for
        closing it (for example by using it in a ``with`` statement).
    """
    if filename.endswith('.gz'):
        return gzip.open(filename, 'rt')
    if filename.endswith('.bz2'):
        # The class is spelled ``BZ2File`` (``bz2.BZ2file`` raises
        # AttributeError) and only supports binary modes; ``bz2.open``
        # gives a text handle, matching the gzip and plain-text branches.
        return bz2.open(filename, 'rt')
    # assume text file
    return open(filename, 'r')
def open_by_suffix(filename):
    """Detect a compressed file by its filename suffix and open it as text.

    Args:
        filename: Path (string or path-like) of the file to open.  A final
            suffix of ``.gz`` selects ``gzip``, ``.bz2`` selects ``bz2``;
            any other suffix is treated as an uncompressed text file.

    Returns:
        An open text-mode file handle (gzip, bz2, or normal).  The caller
        is responsible for closing it.
    """
    suffix = Path(filename).suffix
    if suffix == '.gz':
        return gzip.open(filename, 'rt')
    if suffix == '.bz2':
        # ``bz2.BZ2file`` does not exist (the class is ``BZ2File``) and that
        # class rejects text modes; ``bz2.open`` accepts 'rt'.
        return bz2.open(filename, 'rt')
    return open(filename, 'r')
def open_by_suffix(filename, mode='rt'):
    """Open a compressed or uncompressed file, chosen by filename suffix.

    Args:
        filename: Path string of the file to open.  Names ending in ``.gz``
            are opened with ``gzip``, names ending in ``.bz2`` with ``bz2``,
            anything else with the built-in ``open``.
        mode: Mode string passed unchanged to ``gzip.open`` / ``bz2.open``.
            For uncompressed files only its first character is used.

    Returns:
        An open file handle.  The caller is responsible for closing it.
    """
    if filename.endswith('.gz'):
        return gzip.open(filename, mode)
    if filename.endswith('.bz2'):
        # ``bz2.BZ2file`` does not exist (the class is ``BZ2File``) and that
        # class cannot open in text mode, which is the default here;
        # ``bz2.open`` accepts the same mode strings as ``gzip.open``.
        return bz2.open(filename, mode)
    # NOTE(review): only the first mode character is forwarded, so a plain
    # file is always opened in text mode even for e.g. 'rb' — kept as-is for
    # backward compatibility; confirm whether that is intended.
    return open(filename, mode[0])
def _phred_tar_lines(path):
    """Yield the raw byte lines of every regular file stored in tar archive *path*."""
    with tarfile.open(path, "r") as archive:
        for member in archive:
            # Iterating a TarFile yields TarInfo entries, not lines; the
            # content has to be read through extractfile().
            if not member.isfile():
                continue
            with archive.extractfile(member) as stream:
                yield from stream


def _phred_marked_lines(path):
    """Yield ``(is_header, text)`` for each line of *path*, opened by its last extension."""
    extension = path.split('.')[-1]
    if extension in ('gz', 'gzip'):
        raw_lines = gzip.open(path, 'r')
    elif extension in ('bz2', 'bzip2'):
        # The class is ``BZ2File``; ``bz2.BZ2file`` raises AttributeError.
        raw_lines = bz2.BZ2File(path, 'r')
    elif extension == 'tar':
        raw_lines = _phred_tar_lines(path)
    else:
        with open(path) as handle:
            for line in handle:
                yield line[:1] == "@", line
        return
    try:
        for raw in raw_lines:
            # Indexing bytes gives an int, so the header test must compare
            # against a bytes prefix.
            # NOTE(review): the [2:-3] slice is kept from the original; it
            # drops the first two and last three characters of the decoded
            # line, which looks like a leftover from slicing str(bytes) —
            # confirm against what do_line expects.
            yield raw[:1] == b"@", raw.decode()[2:-3]
    finally:
        raw_lines.close()


def phred_parse(fastqlist, sample_size):
    """Feed lines of the given files to ``do_line`` and return its result dict.

    Every file in *fastqlist* is opened according to its last dot-separated
    extension (``gz``/``gzip``, ``bz2``/``bzip2``, ``tar``, anything else as
    plain text).  Lines that start with ``@`` or whose text is shorter than
    three characters are skipped; every other line is passed to ``do_line``
    together with the running ``switch``, the result dict and the counter.

    Args:
        fastqlist: Iterable of file path strings; each path is printed
            before it is scanned.
        sample_size: Scanning of a file stops as soon as the counter
            returned by ``do_line`` exceeds this value.  The counter is
            shared across all files and is not reset between them.

    Returns:
        The dict filled in by ``do_line``, with the value ``"33"`` added for
        every path in *fastqlist* that is not already a key.
    """
    phred64dict = {}
    counter = 0
    for element in fastqlist:
        switch = False
        print(element)
        lines = _phred_marked_lines(element)
        try:
            for is_header, text in lines:
                if counter > sample_size:
                    break
                if is_header or len(text) < 3:
                    continue
                switch, counter, breakswitch = do_line(
                    text, switch, phred64dict, counter)
                # do_line's return type is not visible here, so the original
                # equality comparison is kept rather than a truthiness test.
                if breakswitch == True:
                    break
        finally:
            # Close the underlying file even when the scan stops early.
            lines.close()
    for element in fastqlist:
        phred64dict.setdefault(element, "33")
    return phred64dict
def _readlen_open_streams(path, compression):
    """Yield open binary streams holding the decompressed content of *path*."""
    if compression == "gzip":
        with gzip.open(path, "rb") as stream:
            yield stream
    elif compression == "bz2":
        # The class is ``BZ2File``; ``bz2.BZ2file`` raises AttributeError.
        with bz2.BZ2File(path, "rb") as stream:
            yield stream
    elif compression == "tar":
        with tarfile.open(path, "r") as archive:
            # Iterating a TarFile yields TarInfo entries, not lines; each
            # regular member is exposed as its own stream.
            for member in archive:
                if member.isfile():
                    with archive.extractfile(member) as stream:
                        yield stream
    else:
        with open(path, "rb") as stream:
            yield stream


def _readlen_lengths(streams, is_fasta):
    """Yield one sequence length per record read from *streams*."""
    import io

    for stream in streams:
        if is_fasta:
            # SeqIO.parse needs a text handle for the "fasta" format.
            for record in SeqIO.parse(io.TextIOWrapper(stream), "fasta"):
                yield len(record)
            continue
        expect_quality = False
        for raw in stream:
            if expect_quality:
                # The line after a "+" separator is the one being measured;
                # it is never itself treated as a separator, even though a
                # quality string may legitimately start with "+".
                yield len(raw.rstrip(b"\r\n"))
                expect_quality = False
            elif raw.startswith(b"+"):
                expect_quality = True


def get_mean_read_len(fastqfile, sample_size, compressed_dict):
    """Estimate the mean and standard deviation of read lengths in a file.

    Files whose name ends in ``.fa`` or ``.fasta`` (for compressed files,
    also after removing the last dot-separated suffix) are parsed with
    ``SeqIO.parse(..., "fasta")`` and the length of each record is sampled.
    For every other file, the length of each line that follows a line
    starting with ``+`` is sampled, without its line terminator.

    Args:
        fastqfile: Path string of the file to sample.
        sample_size: Sampling stops once this many lengths were collected
            (at least one length is always collected if available).
        compressed_dict: Mapping from file path to ``"gzip"``, ``"bz2"``,
            ``"tar"`` or ``"no-compression"``; any other value is treated
            like an uncompressed file.

    Returns:
        A two-element list ``[int(mean), std]`` computed with ``numpy``.

    Raises:
        KeyError: If *fastqfile* is not a key of *compressed_dict*.
        ValueError: If no length could be sampled from the file.
    """
    compression = compressed_dict[fastqfile]
    fasta_suffixes = (".fa", ".fasta")
    is_fasta = fastqfile.endswith(fasta_suffixes)
    if not is_fasta and compression != "no-compression":
        # e.g. "reads.fa.gz": test the name without the compression suffix.
        is_fasta = fastqfile.rpartition(".")[0].endswith(fasta_suffixes)

    sampling = []
    streams = _readlen_open_streams(fastqfile, compression)
    lengths = _readlen_lengths(streams, is_fasta)
    try:
        for length in lengths:
            sampling.append(length)
            if len(sampling) >= sample_size:
                break
    finally:
        # Close the generators so the underlying files are released even
        # when sampling stops before the end of the file.
        lengths.close()
        streams.close()

    if not sampling:
        # numpy.mean([]) is NaN and int(NaN) raises an opaque ValueError;
        # raise the same exception type with an explicit message instead.
        raise ValueError(
            "no read lengths could be sampled from %r" % (fastqfile,))
    return [int(numpy.mean(sampling)), numpy.std(sampling)]