Exemple #1
0
def open_by_suffix(filename):
    """Open *filename* for reading as text, decompressing by file extension.

    Names ending in ``.gz`` are opened with gzip, names ending in ``.bz2``
    with bz2, and anything else is assumed to be a plain text file.

    Args:
        filename: Path of the file to open, as a string.

    Returns:
        A text-mode file object; the caller is responsible for closing it.
    """
    if filename.endswith('.gz'):
        return gzip.open(filename, 'rt')
    if filename.endswith('.bz2'):
        # bz2.open() supports text mode; the BZ2File constructor is
        # binary-only, which would make this branch return bytes while the
        # other two return str.
        return bz2.open(filename, 'rt')
    # assume text file
    return open(filename, 'r')
def open_by_suffix(filename):
    """Detect a compressed file by its suffix and open it for reading as text.

    Only the final suffix is inspected, so ``reads.fastq.gz`` is treated as
    gzip. Anything that is not ``.gz`` or ``.bz2`` is opened as a normal file.

    Args:
        filename: Path of the file to open (string or path-like).

    Returns:
        A text-mode file object from gzip, bz2, or the builtin ``open``; the
        caller is responsible for closing it.
    """
    suffix = Path(filename).suffix
    if suffix == '.gz':
        return gzip.open(filename, 'rt')
    if suffix == '.bz2':
        # The BZ2File constructor rejects text modes such as 'rt';
        # bz2.open() is the API that wraps the stream for text reading.
        return bz2.open(filename, 'rt')
    return open(filename, 'r')
Exemple #3
0
def open_by_suffix(filename, mode='rt'):
    """Open compressed or uncompressed files.

    The opener is chosen from the file name: ``.gz`` uses gzip, ``.bz2`` uses
    bz2, and everything else uses the builtin ``open``.

    Args:
        filename: Path of the file to open, as a string.
        mode: Mode string handed unchanged to the selected opener. Prefer an
            explicit ``'t'`` or ``'b'``: a bare ``'r'``/``'w'`` means binary
            for gzip and bz2 but text for the builtin ``open``.

    Returns:
        An open file object; the caller is responsible for closing it.
    """
    if filename.endswith('.gz'):
        return gzip.open(filename, mode)
    if filename.endswith('.bz2'):
        # bz2.open() understands text modes; the BZ2File constructor does not.
        return bz2.open(filename, mode)
    # Pass the full mode through: truncating it to its first character would
    # silently turn 'rb' into text mode and drop any '+' flag.
    return open(filename, mode)
def _iter_fastq_lines(path):
    """Yield the text lines of *path*, choosing a reader from its extension.

    ``gz``/``gzip`` and ``bz2``/``bzip2`` files are decompressed on the fly,
    ``tar`` archives yield the lines of every regular-file member in archive
    order, and any other extension is read as a plain text file.
    """
    extension = path.split('.')[-1]
    if extension in ('gz', 'gzip'):
        with gzip.open(path, 'rt', encoding='utf-8') as handle:
            yield from handle
    elif extension in ('bz2', 'bzip2'):
        with bz2.open(path, 'rt', encoding='utf-8') as handle:
            yield from handle
    elif extension == 'tar':
        with tarfile.open(path, 'r') as archive:
            # Iterating a TarFile yields TarInfo members, not lines, so each
            # regular member has to be extracted and read separately.
            for member in archive:
                if not member.isfile():
                    continue
                for raw_line in archive.extractfile(member):
                    yield raw_line.decode()
    else:
        with open(path) as handle:
            yield from handle


def phred_parse(fastqlist, sample_size):
    """Scan FASTQ files and return a Phred-offset entry for every file.

    Each candidate line is handed to ``do_line`` together with the shared
    state (``switch``, the result dict and ``counter``); ``do_line`` returns
    the updated ``switch`` and ``counter`` plus a flag that ends the scan of
    the current file. Files that have no entry in the result dict after
    scanning are assigned the default ``"33"``.

    Args:
        fastqlist: Iterable of file paths (strings). The reader is chosen from
            the extension: gz/gzip, bz2/bzip2, tar, otherwise plain text.
        sample_size: Scanning of a file stops once ``counter`` exceeds this.

    Returns:
        dict keyed by the entries of ``fastqlist`` (plus whatever ``do_line``
        stored in it).
    """
    phred64dict = {}
    # NOTE(review): counter is intentionally left shared across files, as in
    # the original; whether do_line resets it per file is not visible here.
    counter = 0
    for element in fastqlist:
        switch = False
        print(element)
        lines = _iter_fastq_lines(element)
        try:
            for line in lines:
                if counter > sample_size:
                    break
                # Skip near-empty lines and lines starting with "@".
                # NOTE(review): a quality string may itself start with "@";
                # such lines are skipped too, as the plain-text path always did.
                if len(line) < 3 or line[0] == "@":
                    continue
                # Every file type now passes the same thing to do_line: the
                # decoded text line, exactly as the plain-text path does.
                switch, counter, breakswitch = do_line(
                    line, switch, phred64dict, counter)
                if breakswitch:
                    break
        finally:
            # Close the underlying file even when the scan stops early.
            lines.close()

    for element in fastqlist:
        phred64dict.setdefault(element, "33")
    return phred64dict
def _iter_read_lines(path, compression):
    """Yield the text lines of *path*, decompressing according to *compression*.

    ``"gzip"`` and ``"bz2"`` are decompressed on the fly, ``"tar"`` yields the
    lines of every regular-file member in archive order, and any other label
    reads *path* as a plain text file.
    """
    # cp437 maps every byte value, so decoding can never fail on stray bytes.
    if compression == "gzip":
        with gzip.open(path, "rt", encoding="cp437") as handle:
            yield from handle
    elif compression == "bz2":
        with bz2.open(path, "rt", encoding="cp437") as handle:
            yield from handle
    elif compression == "tar":
        with tarfile.open(path, "r") as archive:
            # Iterating a TarFile yields TarInfo members, not lines.
            for member in archive:
                if not member.isfile():
                    continue
                for raw_line in archive.extractfile(member):
                    yield raw_line.decode("cp437")
    else:
        with open(path) as handle:
            yield from handle


def get_mean_read_len(fastqfile, sample_size, compressed_dict):
    """Estimate mean and standard deviation of read length from a sample.

    FASTA input (``.fa``/``.fasta``, optionally followed by one compression
    suffix) is parsed with ``SeqIO`` and the record lengths are sampled.
    Anything else is treated as FASTQ: the line following each line that
    starts with ``+`` is taken as a quality string and its length is sampled.

    Args:
        fastqfile: Path of the read file (string).
        sample_size: Stop after this many lengths have been collected.
        compressed_dict: Maps ``fastqfile`` to its compression label:
            ``"gzip"``, ``"bz2"``, ``"tar"`` or ``"no-compression"``.

    Returns:
        ``[mean, std]`` where ``mean`` is truncated to ``int`` and ``std`` is
        the value returned by ``numpy.std``.

    Raises:
        KeyError: If ``fastqfile`` is not a key of ``compressed_dict``.
        ValueError: If no read length could be sampled from the file.
    """
    compression = compressed_dict[fastqfile]
    fasta_suffixes = (".fa", ".fasta")

    # For compressed input, look at the name without its last suffix so that
    # e.g. "genome.fa.gz" is still recognised as FASTA.
    stem = fastqfile
    if compression != "no-compression":
        head, dot, _ = fastqfile.rpartition(".")
        if dot:
            stem = head

    sampling = []
    if stem.endswith(fasta_suffixes) or fastqfile.endswith(fasta_suffixes):
        # SeqIO reads text, so compressed FASTA needs a decompressing text
        # handle. NOTE(review): tar-packed FASTA is still passed by file name.
        if compression == "gzip":
            handle = gzip.open(fastqfile, "rt")
        elif compression == "bz2":
            handle = bz2.open(fastqfile, "rt")
        else:
            handle = None
        try:
            source = fastqfile if handle is None else handle
            for record in SeqIO.parse(source, "fasta"):
                sampling.append(len(record))
                if len(sampling) >= sample_size:
                    break
        finally:
            if handle is not None:
                handle.close()
    else:
        lines = _iter_read_lines(fastqfile, compression)
        try:
            expect_quality = False
            for line in lines:
                if expect_quality:
                    sampling.append(len(line.rstrip("\r\n")))
                    expect_quality = False
                    if len(sampling) >= sample_size:
                        break
                    # A quality string may itself start with "+"; it must not
                    # be mistaken for the next separator line.
                    continue
                if line.startswith("+"):
                    expect_quality = True
        finally:
            # Close the underlying file even when sampling stops early.
            lines.close()

    if not sampling:
        raise ValueError(
            "no read lengths could be sampled from %r" % (fastqfile,))
    return [int(numpy.mean(sampling)), numpy.std(sampling)]