Beispiel #1
0
def filter_fastq(fname, barcodes, tmp_folder):
    """
    Filter reads belonging to unselected barcodes

    Every fastq record (4 lines) whose barcode - the text between the
    leading '@' and the first ':' of the name line - is in `barcodes` is
    copied to a new temporary file; all other records are dropped.

    args:
    ------
    fname:
        a fastq file, support .gz, .txt, .bz2 file

    barcodes:
        a list contains selected barcodes

    tmp_folder:
        folder to store temp file

    output:
    ------
    temporary file name (the file is NOT deleted automatically; the caller
    is responsible for removing it)

    The process exits with status 1 if the file type is not supported or if
    `barcodes` is empty.
    """
    ftype = file_type(fname)
    if ftype not in ("gz", "bz2", "txt"):
        print(("error: unrecoginized fastq " + fname +
               " file format, only supports .gz, .bz2, .fastq"))
        sys.exit(1)

    if len(barcodes) == 0:
        print("error: no barcode is selected")
        sys.exit(1)
    barcodes = set(barcodes)

    # Open every input in text mode so that all three formats yield str;
    # binary mode would yield bytes for .gz/.bz2 and break the str-based
    # parsing and EOF detection below.
    if ftype == "gz":
        fin = gzip.open(fname, 'rt')
    elif ftype == "bz2":
        fin = bz2.open(fname, 'rt')
    else:
        fin = open(fname, 'r')

    # mode='w' so that str can be written; delete=False keeps the file on
    # disk after it is closed so the caller can read it.
    with fin, tempfile.NamedTemporaryFile(mode='w',
                                          delete=False,
                                          dir=tmp_folder) as fout:
        while True:
            cur_name = fin.readline()
            cur_read = fin.readline()
            cur_plus = fin.readline()
            cur_qual = fin.readline()
            if not cur_name:  # end of file
                break
            # same barcode extraction as count_barcode_cov_from_fastq
            cur_barcode = cur_name.strip()[1:].split(":")[0]
            if cur_barcode in barcodes:
                fout.write(cur_name)
                fout.write(cur_read)
                fout.write(cur_plus)
                fout.write(cur_qual)
    return fout.name
Beispiel #2
0
def get_barcode_cov_from_bed(barcode_list, input_bed):
    """
    Get barcode coverage from bed file

    The barcode of a record is the part of the 4th whitespace-separated
    column that precedes the first ':', upper-cased. Every record in the
    file is counted; `barcode_list` is only checked for being non-empty.

    Args:
    -----
    barcode_list:
        a list of pre-defined barcodes

    input_bed:
        a bed file, supports plain text, .gz and .bz2

    Returns:
    ------
    a dictionary (collections.defaultdict) mapping barcode to the number of
    records carrying it

    The process exits with status 1 if `barcode_list` is empty or the file
    type is not supported.
    """

    if len(barcode_list) == 0:
        print("error: @get_barcode_cov_from_bed: barcode_list is empty!")
        sys.exit(1)

    # Text mode gives str lines for all three formats, so a single counting
    # loop serves every file type.
    ftype = file_type(input_bed)
    if ftype == "gz":
        fin = gzip.open(input_bed, 'rt')
    elif ftype == "bz2":
        fin = bz2.open(input_bed, 'rt')
    elif ftype == "txt":
        fin = open(input_bed, 'r')
    else:
        print(
            "error: unrecoginized bed file format, only supports .gz, .bz2, .bed"
        )
        sys.exit(1)

    barcode_dict = collections.defaultdict(int)
    with fin:  # closed even if a malformed line raises
        for _read in fin:
            barcode = _read.split()[3].split(":")[0].upper()
            # approximate counting, a read is half fragment
            barcode_dict[barcode] += 1
    return barcode_dict
Beispiel #3
0
def group_reads_by_barcode_bed(input_bed):
    """ Group fragments based on the barcodes

    Consecutive records that share a barcode (the part of the 4th column
    before the first ':', upper-cased) form one group, so the file has to
    be ordered by barcode for each barcode to come out as a single group.

    Args:
        input_bed: a bed file, supports plain text, .gz and .bz2

    Returns:
        Generator that contains reads sharing the same barcode; each item
        is itself a generator over the raw lines of one group. Nothing is
        yielded for an empty file.

    The process exits with status 1 (on first iteration) if the file does
    not exist or its type is not supported.
    """
    if not os.path.exists(input_bed):
        print(("Error @group_reads_by_barcode_bed: " + input_bed +
               " does not exist!"))
        sys.exit(1)

    # Text mode yields str for every format, so no per-line decoding.
    ftype = file_type(input_bed)
    if ftype == "gz":
        fin = gzip.open(input_bed, 'rt')
    elif ftype == "bz2":
        fin = bz2.open(input_bed, 'rt')
    elif ftype == "txt":
        fin = open(input_bed, 'r')
    else:
        print(
            "error: unrecoginized bed file format, only supports .gz, .bz2, .bed"
        )
        sys.exit(1)

    read_group_list = []
    pre_barcode = ""

    # `with` closes the file even when the consumer stops iterating early
    # and the generator is closed or garbage-collected.
    with fin:
        for cur_read in fin:
            cur_barcode = cur_read.split()[3].split(":")[0].upper()
            if cur_barcode == pre_barcode:
                read_group_list.append(cur_read)
            else:
                if pre_barcode != "":
                    # return read group
                    yield (x for x in read_group_list)
                # start a NEW list (do not clear the old one: the group
                # just yielded may not have been consumed yet)
                read_group_list = [cur_read]
                pre_barcode = cur_barcode
        # reads from the last barcode; skipped when the file was empty
        if read_group_list:
            yield (x for x in read_group_list)
Beispiel #4
0
def count_barcode_cov_from_fastq(fname):
    """
    Count barcode coverage from fastq file

    The barcode of a record is the text between the leading '@' and the
    first ':' of its name line.

    args:
    -----
    fname:
        a fastq file, support .gz, .txt, .bz2 file

    output:
    -----
    a dictionary (collections.defaultdict) contains barcode and its
    coverage, i.e. the number of records carrying that barcode

    The process exits with status 1 if the file type is not supported.
    """
    # Text mode so that gz/bz2 input yields str as well; with bytes the
    # end-of-file comparison against "" below would never be true.
    ftype = file_type(fname)
    if ftype == "gz":
        fin = gzip.open(fname, 'rt')
    elif ftype == "bz2":
        fin = bz2.open(fname, 'rt')
    elif ftype == "txt":
        fin = open(fname, 'r')
    else:
        print(
            "error: unrecoginized fastq file format, only supports .gz, .bz2, .fastq"
        )
        sys.exit(1)

    barcode_cov = collections.defaultdict(int)
    with fin:
        while True:
            cur_name = fin.readline().strip()[1:]
            # sequence, '+' and quality lines are consumed but not needed
            fin.readline()
            fin.readline()
            fin.readline()
            if cur_name == "":  # end of file (or empty read name)
                break
            barcode_cov[cur_name.split(":")[0]] += 1

    return barcode_cov
Beispiel #5
0
def dex_fastq(input_fastq, output_fastq, index_fastq_list):
    """
    De-multiplex fastq files by adding barcode to the beginning of each read name.

    For every record of `input_fastq` the sequences of the corresponding
    records in all index files are concatenated (in list order) into a
    barcode, and the record is written as '@<barcode>:<original name>'.

    Required:
    --------
    input_fastq:
        a fastq format file that contains the sequencing reads;

    output_fastq:
        path of the fastq file to create; written gzip-compressed if the
        name ends with "gz", bzip2-compressed if it ends with "bz2", plain
        text otherwise. Must not exist yet;

    index_fastq_list:
        a non-empty list of fastq files that contains the barcode; records
        must be in the same order as in input_fastq

    The process exits if an input is missing, the output exists, a file
    type is not supported, the index list is empty, or the read names of
    the input and any index file disagree.
    """

    # check whether the input/output files exist
    if not os.path.exists(input_fastq):
        print(('error: ' + input_fastq + ' does not exist!'))
        sys.exit(1)

    if os.path.exists(output_fastq):
        print(('error: ' + output_fastq + ' already exists, remove it first!'))
        sys.exit(1)

    if len(index_fastq_list) == 0:
        print('error: no index fastq file is given!')
        sys.exit(1)

    for index_fastq in index_fastq_list:
        if not os.path.exists(index_fastq):
            print(('error: ' + index_fastq + ' does not exist!'))
            sys.exit(1)

    def _open_fastq(path):
        # Open a fastq file for reading in text mode (str lines) whatever
        # its compression; exit on an unsupported file type.
        ftype = file_type(path)
        if ftype == "gz":
            return gzip.open(path, 'rt')
        if ftype == "bz2":
            return bz2.open(path, 'rt')
        if ftype == "txt":
            return open(path, 'r')
        print(('error: unrecognized fastq ' + path +
               ' file format, only supports .gz, .bz2, .fastq'))
        sys.exit(1)

    handles = []  # everything opened so far, closed in the finally below
    try:
        fr1 = _open_fastq(input_fastq)
        handles.append(fr1)

        index_files = []
        for index_fastq in index_fastq_list:
            fix = _open_fastq(index_fastq)
            handles.append(fix)
            index_files.append(fix)

        # text mode for writing as well: the records are built as str
        if output_fastq.endswith("gz"):
            fout = gzip.open(output_fastq, 'wt')
        elif output_fastq.endswith("bz2"):
            fout = bz2.open(output_fastq, 'wt')
        else:
            fout = open(output_fastq, 'w')
        handles.append(fout)

        while True:
            cur_r1_name = fr1.readline().strip()[1:]
            if cur_r1_name == "":  # end of the read file
                break
            cur_r1_read = fr1.readline().strip()
            fr1.readline()  # '+' separator line, regenerated on output
            cur_r1_qual = fr1.readline().strip()
            cur_r1_id = cur_r1_name.split()[0]

            cur_idex_list = []
            for fix in index_files:
                cur_name = fix.readline().strip()[1:]
                cur_read = fix.readline().strip()
                fix.readline()  # '+' line
                fix.readline()  # quality line
                # every index file must be in sync with the read file; an
                # empty name means the index file ended early
                if cur_name == "" or cur_name.split()[0] != cur_r1_id:
                    sys.exit("read name does not match")
                cur_idex_list.append(cur_read)
            cur_barcode = "".join(cur_idex_list)

            fout.write('@' + cur_barcode + ':' + cur_r1_name + "\n")
            fout.write(cur_r1_read + "\n")
            fout.write("+\n")
            fout.write(cur_r1_qual + "\n")
    finally:
        for handle in handles:
            handle.close()
Beispiel #6
0
def run_align_se(input_reference, input_fastq1, output_bam, aligner,
                 path_to_aligner, read_fastq_command, num_threads, min_cov,
                 aligner_options, if_sort, tmp_folder, overwrite):
    """
    Map single-cell ATAC-seq reads in single-end mode

    Required
    --------
    input_reference: reference genome file generated by index_reference

    input_fastq1: a fastq file contains R1 reads, supports .fq, .fastq, .gz, .bz2

    output_bam: a bam file contains alignments

    Optional
    --------
    path_to_aligner: directory path access to the aligner; if None the
        aligner is looked up on PATH

    aligner: aligner name "bwa", "bowtie", "bowtie2" or "minimap2"

    aligner_options is a list of strings indicating options you'd like passed
        to aligner; the list is not modified. Defaults when None:
        (bowtie2: "--phred33-quals")
        (bowtie: "-S -k 1 -m 1 --best --strata --chunkmbs 3072 -n 1 -e 100 --phred33-quals")
        (bwa: "mem")
        (minimap2: "-ax sr --secondary=no")

    num_threads: number of mapping threads; appended as -p (bowtie, bowtie2)
        or -t (bwa, minimap2) unless that flag is already among the options;

    if_sort: if sort the alignment based on read name;

    tmp_folder: where to store the temporary files (None for the system
        default);

    min_cov:
        if > 0, only reads of barcodes with more than min_cov reads are
        aligned (NOTE(review): the comparison is strict, '>' not '>=');

    read_fastq_command: command to uncompress a compressed fastq file i.e.
        'zcat', 'bzcat'; auto-detected from the file type when None;

    overwrite: whether to overwrite the output file if it already exists;

    Returns 0 on success; exits the process on any validation or alignment
    failure. Temporary files are removed in both cases.
    """
    import shlex
    import shutil

    # if the aligner path given, need to check the existence of the aligner
    if path_to_aligner is not None:
        path_to_aligner += "/"
        if not os.path.isdir(path_to_aligner):
            print('Error: path_to_aligner is not a folder')
            sys.exit(1)
        if not os.path.exists(path_to_aligner + aligner):
            print('Error: aligner does not exist')
            sys.exit(1)
    else:
        # look the executable up on PATH instead of launching it: launching
        # left an un-reaped child process behind
        if shutil.which(aligner) is None:
            print(('Error: ' + aligner + ' does not exist!'))
            sys.exit(1)
        path_to_aligner = ""

    if tmp_folder is not None:
        if not os.path.isdir(tmp_folder):
            print('Error: tmp_folder is not a folder or does not exist')
            sys.exit(1)

    # check validity of aligner before any expensive work is done
    aligner = aligner.lower()
    if aligner not in ["bwa", "bowtie", "bowtie2", "minimap2"]:
        sys.exit('Error: only support bwa, bowtie, bowtie2, minimap2')

    # check the existence of input and output files
    if not os.path.exists(input_fastq1):
        sys.exit('Error: ' + input_fastq1 + ' does not exist!')

    if os.path.isfile(output_bam):
        if overwrite:
            os.remove(output_bam)
        else:
            sys.exit("error: \'%s\' already exists, remove it first" %
                     output_bam)

    # check if can create the output_bam file
    try:
        with open(output_bam, "w") as outfile:
            outfile.write('')
    except IOError:
        print(("error: could not create %s, check if the folder exists." %
               output_bam))
        sys.exit(1)
    os.remove(output_bam)

    # default aligner option; always work on a copy so that the caller's
    # list is never mutated by the thread option appended below
    if aligner_options is None:
        if aligner == "minimap2":
            options = ["-ax", "sr", "--secondary=no"]
        elif aligner == "bowtie":
            options = [
                "-S", "-k 1", "-m 1", "--best", "--strata", "--chunkmbs 3072",
                "-n 1", "-e 100", "--phred33-quals"
            ]
        elif aligner == "bowtie2":
            options = ["--phred33-quals"]
        else:  # bwa
            options = ["mem"]
    else:
        options = list(aligner_options)

    # add num_threads unless the thread flag is already given; compare whole
    # tokens so the flag is found at any position and whether it was passed
    # as "-p 4" or as "-p", "4"
    thread_flag = "-p" if aligner in ["bowtie", "bowtie2"] else "-t"
    if thread_flag not in " ".join(options).split():
        options.append(thread_flag + " " + str(num_threads))

    # path of the filtered temporary fastq, if one gets created
    filtered_fastq = None
    if min_cov > 0:
        barcode_dict = count_barcode_cov_from_fastq(input_fastq1)
        barcode_sel = set(
            [key for key in barcode_dict if barcode_dict[key] > min_cov])
        if len(barcode_sel) == 0:
            print(
                "error: no barcode contains fragments more than --min-cov, lower --min-cov and try it again!"
            )
            sys.exit(1)
        filtered_fastq = filter_fastq(input_fastq1, barcode_sel, tmp_folder)
        input_fastq1 = filtered_fastq
        # the filtered file is always uncompressed text
        read_fastq_command = "cat"

    ftmp = None
    try:
        # if cat_cmd is not given, automatically detect file type and choose
        # cat_cmd
        if read_fastq_command is None:
            ftype = file_type(input_fastq1)
            if ftype == "gz":
                read_fastq_command = "zcat"
            elif ftype == "bz2":
                read_fastq_command = "bzcat"
            elif ftype == "txt":  # .fq or fastq file
                read_fastq_command = "cat"
            else:
                sys.exit(
                    'Error: unrecoganized fastq file, supports .fq, .fastq, .gz, .bz2 file'
                )

        # The command is run through bash because the reads are fed with
        # process substitution; paths are quoted so that spaces or shell
        # metacharacters in them cannot break the command line.
        reads = ("<(" + read_fastq_command + " " + shlex.quote(input_fastq1) +
                 ")")
        reference = shlex.quote(input_reference)
        args = [shlex.quote(path_to_aligner + aligner)]
        args.extend(options)
        if aligner == "bowtie2":
            # unpaired reads are passed with -U; -1 is only valid together
            # with -2 (paired-end)
            args.append("-x " + reference)
            args.append("-U " + reads)
        else:
            # bowtie, bwa mem and minimap2 all take
            # <reference> <single-end reads> as positional arguments
            args.append(reference)
            args.append(reads)

        # mapping and write the alignments into a temporary file
        ftmp = tempfile.NamedTemporaryFile(delete=False, dir=tmp_folder)
        try:
            subprocess.check_call(" ".join(args),
                                  stdout=ftmp,
                                  shell=True,
                                  executable='/bin/bash')
        except subprocess.CalledProcessError:
            sys.exit(
                'Error: failed to run alignment, check if aligner and reference genome is correct!'
            )
        finally:
            ftmp.close()

        if if_sort:
            pysam.sort("-n", "-@", str(num_threads), "-o", output_bam,
                       ftmp.name)
        else:
            samfile = pysam.AlignmentFile(ftmp.name, "r")
            try:
                fout = pysam.AlignmentFile(output_bam, "wb", template=samfile)
                try:
                    for read in samfile.fetch():
                        fout.write(read)
                finally:
                    fout.close()
            finally:
                samfile.close()
    finally:
        # remove the temporary alignment and the temporary filtered fastq
        # on success as well as on failure (sys.exit passes through here)
        if ftmp is not None:
            os.remove(ftmp.name)
        if filtered_fastq is not None:
            os.remove(filtered_fastq)
    return 0
Beispiel #7
0
def dex_fastq(input_fastq, output_fastq, index1_fastq, index2_fastq,
              index_list):
    """
    Decomplex fastq file by adding barcode to the beginning of the read name.

    For every record, r7 is taken from the start and i7 from the end of the
    index1 read, i5 from the start and r5 from the end of the index2 read.
    Only records whose four indices are all listed in `index_list` are
    written, renamed to '@<r7><i7><i5><r5>:<original name>'. A usage report
    is printed at the end.

    Args:
        input_fastq: fastq file contains sequencing reads (demo.R1.fastq.gz), support .fastq, .gz, .bz2

        output_fastq: path of the fastq file to create; compressed if the
            name ends with "gz" or "bz2". Must not exist yet

        index1_fastq: fastq file contains r7, i7 barcode (demo.I1.fastq.gz), support .fastq, .gz, .bz2

        index2_fastq: fastq file contains r5, i5 barcode (demo.I2.fastq.gz), support .fastq, .gz, .bz2

        index_list: txt file contains pre-designed r7, i7, r5, i5 barcodes (barcodes.txt);
            one "<sequence> <type>" pair per line, other lines are ignored

    Returns:
        0 on success; exits the process on invalid input.
    """

    # check if those files exist
    for path in (input_fastq, index1_fastq, index2_fastq, index_list):
        if not os.path.isfile(path):
            sys.exit("error: \'%s\' not exist" % path)

    if os.path.isfile(output_fastq):
        sys.exit("error: \'%s\' already exists, remove it first" %
                 output_fastq)

    # pre-designed barcodes per index type, each mapped to its usage count
    index_kinds = ("r7", "i7", "r5", "i5")
    index_count = dict((kind, {}) for kind in index_kinds)
    with open(index_list) as fin:
        for line in fin:
            elems = line.split()
            if len(elems) != 2:
                continue
            if elems[1] not in index_count:
                sys.exit(
                    "error: unorganized index  \'%s\', only support r7, i7, r5, i5"
                    % elems[1])
            index_count[elems[1]][elems[0].upper()] = 0

    # all barcodes of one type must share one length (this also rejects a
    # type without any barcode)
    index_len = {}
    for kind in index_kinds:
        lengths = set(map(len, index_count[kind]))
        if len(lengths) != 1:
            sys.exit("error: %s index has different length" % kind)
        index_len[kind] = lengths.pop()

    def _open_fastq(path):
        # Open a fastq file for reading in text mode (str lines) whatever
        # its compression; exit on an unsupported file type.
        ftype = file_type(path)
        if ftype == "gz":
            return gzip.open(path, 'rt')
        if ftype == "bz2":
            return bz2.open(path, 'rt')
        if ftype == "txt":
            return open(path, 'r')
        sys.exit(
            "error: unrecognized fastq \'%s\' file format, only supports .gz, .bz2, .fastq"
            % path)

    total_reads = 0  # number of totally sequenced reads
    quali_reads = 0  # number of usable reads

    handles = []  # everything opened so far, closed in the finally below
    try:
        fi1 = _open_fastq(index1_fastq)
        handles.append(fi1)
        fi2 = _open_fastq(index2_fastq)
        handles.append(fi2)
        fr1 = _open_fastq(input_fastq)
        handles.append(fr1)

        # text mode for writing as well: the records are built as str
        if output_fastq.endswith("gz"):
            fout = gzip.open(output_fastq, 'wt')
        elif output_fastq.endswith("bz2"):
            fout = bz2.open(output_fastq, 'wt')
        else:
            fout = open(output_fastq, 'w')
        handles.append(fout)

        while True:
            cur_i1_name = fi1.readline().strip()[1:]
            cur_i1_read = fi1.readline().strip()
            fi1.readline()  # '+' line
            fi1.readline()  # quality line

            cur_i2_name = fi2.readline().strip()[1:]
            cur_i2_read = fi2.readline().strip()
            fi2.readline()  # '+' line
            fi2.readline()  # quality line

            cur_r1_name = fr1.readline().strip()[1:]
            cur_r1_read = fr1.readline().strip()
            fr1.readline()  # '+' line, regenerated on output
            cur_r1_qual = fr1.readline().strip()

            # stop as soon as any of the three files is exhausted
            if cur_i1_name == "" or cur_i2_name == "" or cur_r1_name == "":
                break
            if not (cur_i1_name.split()[0] == cur_i2_name.split()[0] ==
                    cur_r1_name.split()[0]):
                sys.exit("error: read name does not match")
            total_reads += 1

            # each index is sliced with its OWN length
            cur_r7 = cur_i1_read[:index_len["r7"]].upper()
            cur_i7 = cur_i1_read[-index_len["i7"]:].upper()
            cur_i5 = cur_i2_read[:index_len["i5"]].upper()
            cur_r5 = cur_i2_read[-index_len["r5"]:].upper()
            if (cur_i5 in index_count["i5"]) and (
                    cur_i7 in index_count["i7"]) and (
                        cur_r5 in index_count["r5"]) and (
                            cur_r7 in index_count["r7"]):
                quali_reads += 1
                cur_barcode_cur = cur_r7 + cur_i7 + cur_i5 + cur_r5
                fout.write('@' + cur_barcode_cur + ':' + cur_r1_name + "\n")
                fout.write(cur_r1_read + "\n")
                fout.write("+\n")
                fout.write(cur_r1_qual + "\n")
                index_count["r7"][cur_r7] += 1
                index_count["r5"][cur_r5] += 1
                index_count["i7"][cur_i7] += 1
                index_count["i5"][cur_i5] += 1
    finally:
        for handle in handles:
            handle.close()

    #### generate a report
    print("Total number of sequencing reads: ", total_reads)
    print("Total number of usable reads: ", quali_reads)
    print("==========================================")
    for kind in ("r7", "i7", "r5", "i5"):
        for key in index_count[kind]:
            if quali_reads > 0:
                percent = round(
                    float(index_count[kind][key]) / quali_reads * 100, 2)
            else:
                # no usable read at all: report 0% instead of dividing by 0
                percent = 0.0
            print('%s\t%s\t%.2f%%' % (key, kind, percent))
    return 0