Beispiel #1
0
def move_barcode_to_name_in_fastq(filename, out_dir):
    """Trim the 9-base barcode region off each read and record it in the name.

    For every read of ``filename`` that passes the filters, the first nine
    bases (and their qualities) are removed, and bases 0-2 plus bases 7-8 of
    the original sequence are appended to the read name after a ``#``.  Only
    the part of the name before the first space is kept.  The result is
    written to ``out_dir`` under the same base name as the input.

    Reads are skipped when they are shorter than 14 bases or when any of the
    first nine quality values in ``read.qual`` is below 30.

    If the output file already exists it is left untouched.

    Args:
        filename: Path of the input FASTQ file.
        out_dir: Directory to write to; created if it does not exist.

    Returns:
        None.
    """
    # Create the directory in-process: the previous `mkdir` shell call broke
    # on paths containing spaces and could not create missing parents.
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    outfilename = os.path.join(out_dir, os.path.basename(filename))

    if os.path.exists(outfilename):
        print("Output file {} exists... Not overwriting...".format(outfilename))
        return
    print("Writing {}".format(outfilename))

    # The context manager guarantees the output is flushed and closed, even
    # if reading or writing fails part-way through.
    with open(outfilename, 'w') as outf:
        for read in HTSeq.FastqReader(filename):
            if len(read.seq) < 14:
                continue
            if min(read.qual[:9]) < 30:
                continue

            # read.seq is bytes here; the name must be text.
            _seq = read.seq.decode()

            n_read = HTSeq.SequenceWithQualities(
                read.seq[9:],
                read.name.partition(' ')[0] + '#' + _seq[0:3] + _seq[7:9],
                read.qualstr[9:])
            n_read.write_to_fastq_file(outf)
Beispiel #2
0
def create_alnmt(start, end, chrom, name):
    """Wrap a BLAST hit in an HTSeq Alignment object.

    ``start`` and ``end`` are the 1-based BLAST coordinates on ``chrom``;
    they are handed to ``create_genomic_interval`` to build the interval.
    The alignment carries a placeholder read with an empty sequence and
    empty qualities, named ``str(name)``.
    """
    interval = create_genomic_interval(start, end, chrom)
    # Only the name and the position matter, so the read itself is empty.
    placeholder_read = HTSeq.SequenceWithQualities(b"", str(name), b"")
    return HTSeq.Alignment(placeholder_read, interval)
Beispiel #3
0
def process_file(filename, output_filename):
    """Shorten the barcode at the end of every read name in a FASTQ file.

    The last nine characters of each read name are treated as a barcode and
    replaced by their first three characters followed by the characters from
    position 7 onward.  Sequences and quality strings are copied unchanged.

    Args:
        filename: Path of the input FASTQ file.
        output_filename: Path of the FASTQ file to write (overwritten).
    """
    # `with` closes the output even if reading or writing raises; the
    # explicit close() at the end was skipped on any error.
    with open(output_filename, 'w') as outf:
        for read in HTSeq.FastqReader(filename):
            barcode = read.name[-9:]
            new_barcode = barcode[:3] + barcode[7:]
            n_read = HTSeq.SequenceWithQualities(
                read.seq,
                read.name[:-9] + new_barcode,
                read.qualstr)
            n_read.write_to_fastq_file(outf)
Beispiel #4
0
def move_barcode_to_name_in_fastq(filename, out_dir):
    """Trim the first nine bases off each read and append them to its name.

    For every read of ``filename`` that passes the filters, the first nine
    bases (and their qualities) are removed and appended to the read name
    after a ``#``.  Only the part of the name before the first space is
    kept.  The result is written to ``out_dir`` under the same base name as
    the input, overwriting any existing file.

    Reads are skipped when they are shorter than 14 bases or when any of the
    first nine bases has a quality below 20.

    Args:
        filename: Path of the input FASTQ file.
        out_dir: Directory to write to; created if it does not exist.
    """
    # Create the directory in-process instead of shelling out to `mkdir`,
    # which broke on paths with spaces and could not create parents.
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    out_path = out_dir + '/%s' % os.path.basename(filename)

    # The context manager closes (and flushes) the output; it was previously
    # never closed at all.
    with open(out_path, 'w') as outf:
        for read in HTSeq.FastqReader(filename):
            if len(read.seq) < 14:
                continue
            # The old test compared a raw quality character to 53 (ASCII '5',
            # i.e. quality 20 at offset 33).  When qualstr is a text string
            # that comparison never filters anything, so use the decoded
            # scores in read.qual with the equivalent threshold.
            if min(read.qual[:9]) < 20:
                continue

            barcode = read.seq[0:9]
            if not isinstance(barcode, str):
                # Sequence is bytes; the read name must be text.
                barcode = barcode.decode()

            n_read = HTSeq.SequenceWithQualities(
                read.seq[9:],
                read.name.partition(' ')[0] + '#' + barcode,
                read.qualstr[9:])
            n_read.write_to_fastq_file(outf)
def filter_n_split_read(intuple):
    """Classify one read against the quality/length filters.

    ``intuple`` is ``(read, min_base_qual, mean_quality, max_length)`` where
    ``read`` is indexed as ``(sequence, name, quality_string)``.

    Returns ``(filter_code, fastq_entry)``.  ``fastq_entry`` is the tuple
    ``('@' + name, sequence, '+', quality_string)``.  ``filter_code`` is:

    *  0 -- the read passes every filter;
    *  1 -- the read passes the quality filters but is longer than
            ``max_length``;
    * -1 -- the read fails a quality filter (two or more bases below
            ``min_base_qual``, or mean quality below ``mean_quality``).

    The module-level shared counters are updated under their own locks.
    """
    global processed_read_count
    global low_mean_base_qual_read
    global long_read_count
    global low_base_qual_read_count

    record = intuple[0]
    min_base_qual = intuple[1]
    mean_quality = intuple[2]
    max_length = intuple[3]

    with processed_read_count.get_lock():
        processed_read_count.value += 1

    parsed = HTSeq.SequenceWithQualities(record[0].encode(), record[1],
                                         record[2].encode())
    scores = parsed.qual

    n_bases = len(scores)
    mean_score = sum(scores) / float(n_bases)
    n_low_bases = sum(scores < min_base_qual)

    # FASTQ-formatted entry handed back to the caller for printing.
    fastq_entry = ('@' + record[1], record[0], '+', record[2])

    # Too many low-quality bases.
    if not (n_low_bases < 2):
        with low_base_qual_read_count.get_lock():
            low_base_qual_read_count.value += 1
        return (-1, fastq_entry)

    # Mean quality too low.
    if not (mean_score >= mean_quality):
        with low_mean_base_qual_read.get_lock():
            low_mean_base_qual_read.value += 1
        return (-1, fastq_entry)

    # Quality is fine: the read passes outright if it is short enough.
    if n_bases <= max_length:
        return (0, fastq_entry)

    # Good quality but too long: goes to the length-discarded output.
    with long_read_count.get_lock():
        long_read_count.value += 1
    return (1, fastq_entry)
Beispiel #6
0
def trim_filter_read(input_fastq_read1, input_fastq_read2,
                     trimmed_read2_output, tagged_read1_output, **kwargs):
    """Trim read 2 down to UMI + barcodes and keep only good-quality reads.

    Both FASTQ files are read in lockstep.  A read 2 is considered only if
    bases 80-85 equal the linker ``CATTCG``.  From it the UMI (bases 0-9)
    and the three barcodes (bases 10-17, 48-55 and 86-93) are extracted.
    The read is kept when at most one UMI base and at most one base across
    the three barcodes have a quality below 20.  Kept reads are written to
    ``trimmed_read2_output`` as UMI + BC3 + BC2 + BC1 under the original
    read name.

    Args:
        input_fastq_read1: Path of the read-1 FASTQ file (only used to keep
            the pair index aligned; its reads are not modified).
        input_fastq_read2: Path of the read-2 FASTQ file.
        trimmed_read2_output: Path of the FASTQ file to write (overwritten).
        tagged_read1_output: Currently unused.
        **kwargs: Ignored; accepted for call compatibility.

    Returns:
        List of the zero-based pair indices of the reads that were written.
    """
    fastq_1_file = HTSeq.FastqReader(input_fastq_read1)
    fastq_2_file = HTSeq.FastqReader(input_fastq_read2)
    output_indices = []

    # 'wa' is not a valid mode on Python 3 (open() raises ValueError); plain
    # 'w' truncates.  The context manager also closes the file, which was
    # previously never done.
    with open(trimmed_read2_output, 'w') as output_file:
        read_pairs = zip(fastq_1_file, fastq_2_file)
        for index, (_read1, read) in tqdm(enumerate(read_pairs)):
            # Compare the sequence text explicitly rather than comparing the
            # read object itself with a str.
            if str(read[80:86]) != 'CATTCG':
                continue

            read_umi = read[:10]
            bc3 = read[10:18]
            bc2 = read[48:56]
            bc1 = read[86:94]

            # check for UMI/barcode quality
            if sum(read_umi.qual < 20) > 1:
                continue
            low_barcode_bases = (sum(bc3.qual < 20) + sum(bc2.qual < 20) +
                                 sum(bc1.qual < 20))
            if low_barcode_bases > 1:
                continue

            # TODO: tag read1 with the barcode + UMI?
            new_read = HTSeq.SequenceWithQualities(
                read_umi.seq + bc3.seq + bc2.seq + bc1.seq, read.name,
                read_umi.qualstr + bc3.qualstr + bc2.qualstr + bc1.qualstr,
                'phred')
            output_indices.append(index)
            # write_to_fastq_file is the method used throughout the rest of
            # this file; `write_to_fastq` does not match it.
            new_read.write_to_fastq_file(output_file)

    return output_indices
Beispiel #7
0
# Script: copy the unaligned reads longer than 200 bases from a SAM file into
# a FASTQ file, and save a histogram of the lengths of all unaligned reads
# next to it as "<output>.png".
#
# Usage: <script> input.sam output.fastq
import sys

import matplotlib.pyplot as plt
import HTSeq
import numpy as np

USAGE = "Please enter input file (.sam) and output file (.fastq)!"

if len(sys.argv) < 3:
    print(USAGE)
    # sys.exit is always available (the bare exit() helper comes from the
    # site module); a non-zero status lets calling pipelines see the failure.
    sys.exit(1)
input_file = sys.argv[1]
output_file = sys.argv[2]
if not (input_file.endswith(".sam") and output_file.endswith(".fastq")):
    print(USAGE)
    sys.exit(1)

alignment_file = HTSeq.SAM_Reader(input_file)
len_reads = []
# The context manager closes the FASTQ output even if parsing fails.
with open(output_file, "w") as my_fastq_file:
    for aln in alignment_file:
        if aln.aligned:
            continue
        read_length = len(aln.read.seq)
        # Every unaligned read contributes to the histogram ...
        len_reads.append(read_length)
        # ... but only the long ones are written out.
        if read_length > 200:
            myread = HTSeq.SequenceWithQualities(
                aln.read.seq, aln.read.name, aln.read.qualstr)
            myread.write_to_fastq_file(my_fastq_file)

# The IPython-only "%matplotlib inline" magic was dropped: it is a syntax
# error when this file runs as a script, and savefig() does not need it.
plt.hist(len_reads, bins=10)
plt.savefig(output_file + ".png")
Beispiel #8
0
        # Fragment: the enclosing loop/function header is not part of this
        # excerpt.  Narrows the running list of read IDs to those also
        # recorded as not aligned for the current SAM file, then writes the
        # matching reads of `read_file` to a FASTQ file.
        sam_file_name = samfile

        # Seed the running list from this SAM file while it is still empty.
        # NOTE(review): this also re-seeds whenever an earlier intersection
        # came out empty -- confirm that is intended.
        if not intersected_reads:
            intersected_reads = sam_file_to_dict[sam_file_name][
                "idNotAlignedReads"]
        # Keep only IDs that are also not aligned in this SAM file.
        intersected_reads = list(
            set(intersected_reads).intersection(
                sam_file_to_dict[sam_file_name]["idNotAlignedReads"]))
        import HTSeq
        fastq_file = HTSeq.FastqReader(read_file)
        my_fastq_file = open(
            os.path.join(output_dir,
                         extract_prefix + "_not_aligned_reads.fastq"), "w")
        for read in fastq_file:
            # The read name up to the first space is tested with `in` against
            # every retained ID, one by one.
            # NOTE(review): if the IDs are strings this is a substring match
            # and costs one scan of the whole list per read -- confirm an
            # exact set lookup is not what was meant.
            if any(read.name.split(" ")[0] in s for s in intersected_reads):
                myread = HTSeq.SequenceWithQualities(read.seq, read.name,
                                                     read.qualstr)
                myread.write_to_fastq_file(my_fastq_file)
        my_fastq_file.close()

# When `extracted_aligned` is set, compute the read IDs recorded under
# "idAlignedReads" for every SAM file in `sam_fasta_pairs`, then open
# `read_file` for reading.  The excerpt ends right after the reader is
# created.
if extracted_aligned:
    intersected_reads = []
    for fqfile, samfile in sam_fasta_pairs:
        sam_file_name = samfile
        # Seed the running list from this SAM file while it is still empty.
        # NOTE(review): this also re-seeds whenever an earlier intersection
        # came out empty -- confirm that is intended.
        if not intersected_reads:
            intersected_reads = sam_file_to_dict[sam_file_name][
                "idAlignedReads"]
        # Keep only IDs that are also aligned in this SAM file.
        intersected_reads = list(
            set(intersected_reads).intersection(
                sam_file_to_dict[sam_file_name]["idAlignedReads"]))
    import HTSeq
    fastq_file = HTSeq.FastqReader(read_file)
Beispiel #9
0
    "SRR2078287": "ACTCAGCAG",
    "SRR2078288": "ACGCAGCAG",
    "SRR2078289": "AGACAGCAG",
    "SRR2078290": "ATCCAGCAG",
    "SRR2078291": "ATGCAGCAG",
    "SRR2078292": "CTTCAGCAG"
}

# Look up the adapter prefix for this sample.
# NOTE(review): .get() yields None for an unknown `input_id`, in which case
# the concatenation on the next line raises TypeError -- confirm the ID is
# validated before this point.
inadapter = sample2inadapter.get(input_id)
# Six "." characters are appended to the adapter prefix; presumably they act
# as wildcards inside search_adapter -- TODO confirm.
adapter = inadapter + "......"
# Minimum read length: the adapter pattern plus 32 more bases.
sequence_lthreshold = int(len(adapter)) + 32
# Reads need a mean quality strictly above this value.
sequence_qthreshold = 39

idread = 0
for seq, name, quals in fqr.reads(quality_values=True):
    # `name` arrives as bytes and is decoded for HTSeq.
    allread = HTSeq.SequenceWithQualities(seq, name.decode("utf-8"), quals)
    if mean_quals(allread.qual) > sequence_qthreshold and len(
            seq) >= sequence_lthreshold:
        pass_quality += 1
        # A result of exactly two elements is used as the (start, end) slice
        # bounds of the tag within the read.
        pos_tag = search_adapter(adapter, seq)
        if len(pos_tag) == 2:
            has_adapter += 1
            cage_tag_seq = seq[pos_tag[0]:pos_tag[1]]
            cage_tag_quals = quals[pos_tag[0]:pos_tag[1]]
            old_length = "length=" + str(len(seq))
            # Tags shorter than 21 bases are dropped.
            if len(cage_tag_seq) >= 21:
                pass_length += 1
                # Update the "length=" field of the read name to the length
                # of the extracted tag.
                new_length = "length=" + str(len(cage_tag_seq))
                str_name = name.decode("utf-8").replace(old_length, new_length)
                cutread = HTSeq.SequenceWithQualities(cage_tag_seq, str_name,
                                                      cage_tag_quals)
Beispiel #10
0
                # Fragment: the enclosing function, loop and `if` headers are
                # not part of this excerpt (Python 2 code).
                # Copy through only the reads whose first 10 quality values
                # are all at least 20.
                for read in fastq:
                    if min(read.qual[0:10]) < 20: continue
                    else: read.write_to_fastq_file(fastq_out)
            # Skip to the next iteration of the enclosing loop when the
            # output already exists.
            if os.path.exists(output_filename): continue
            # Run the external stripBarcode.pl script on the filtered file.
            cmd = 'perl ../clip/CIMS/stripBarcode.pl -len 9 -format fastq'
            cmd += ' %s %s' % (filtered_filename, output_filename)
            print cmd
            os.system(cmd)
        else:
            # Same external script, applied to the unfiltered input.
            cmd = 'perl ../clip/CIMS/stripBarcode.pl -len 9 -format fastq'
            cmd += ' %s %s' % (input_fastq, output_filename)
            print cmd
            os.system(cmd)
    else:
        # Pure-Python path: filter on the first 10 qualities and trim the
        # first 10 bases in one pass.
        # NOTE(review): this path removes 10 bases while the perl commands
        # above pass "-len 9" -- confirm which length is intended.
        fastq_out = open(output_filename, 'w')
        fastq = HTSeq.FastqReader(input_fastq)
        for read in fastq:
            # read is a SequenceWithQualities object.
            if min(read.qual[0:10]) < 20:
                continue
            else:
                # Trim and write out.
                read_out = HTSeq.SequenceWithQualities(read.seq[10:],
                                                       read.name,
                                                       read.qualstr[10:])
                #            read.seq = read.seq[10:]
                #            read.qualstr = read.qualstr[10:]
                #            read.qual = read.qual[10:]
                read_out.write_to_fastq_file(fastq_out)
        fastq_out.close()
def extract_bc_umi_seq(input_files, barcodes):
    """Split FASTQ reads into per-sample files by the barcode at bases 8-15.

    For each read, bases 8-15 are taken as the barcode.  When the barcode is
    a key of ``barcodes``, the read is split in two: bases 16 onward go to
    ``<sample>_genomic.fastq.gz`` and bases 0-15 go to
    ``<sample>_BC_UMI.fastq.gz``, both under the original read name.  All
    other reads are written unchanged to ``undetermined.fastq.gz``.  Output
    files are created in the current working directory.

    After processing, a summary over ALL input files and the 100 most
    frequent barcodes are printed.

    Args:
        input_files: Iterable of FASTQ paths; names ending in ``.gz`` are
            opened through gzip.
        barcodes: Mapping of barcode sequence to sample name.

    Returns:
        None.
    """
    seq_ofile_handles = {}
    index_ofile_handles = {}
    excluded_ofile = None

    # Tallies span every input file.  They used to be reset inside the file
    # loop, so the summary reflected only the last file and an empty
    # `input_files` raised NameError.
    total_read_cnt = 0
    accepted_read_cnt = 0
    all_bcs = {}

    try:
        for bc, sample in barcodes.items():
            seq_ofile_handles[bc] = gzip.GzipFile(
                "%s_genomic.fastq.gz" % sample, "w")
            index_ofile_handles[bc] = gzip.GzipFile(
                "%s_BC_UMI.fastq.gz" % sample, "w")

        excluded_ofile = gzip.GzipFile("undetermined.fastq.gz", "w")

        for ifile_name in input_files:
            gz_input = None
            if ifile_name.endswith(".gz"):
                gz_input = gzip.GzipFile(ifile_name, "r")
                ifile = gz_input
            else:
                ifile = ifile_name

            try:
                for r in HTSeq.FastqReader(ifile):
                    total_read_cnt += 1
                    bc = r.seq[8:16]
                    all_bcs[bc] = all_bcs.get(bc, 0) + 1

                    if bc not in barcodes:
                        r.write_to_fastq_file(excluded_ofile)
                        continue

                    accepted_read_cnt += 1
                    new_genomic_item = HTSeq.SequenceWithQualities(
                        r.seq[16:], r.name, r.qualstr[16:])
                    new_index_item = HTSeq.SequenceWithQualities(
                        r.seq[0:16], r.name, r.qualstr[0:16])

                    new_genomic_item.write_to_fastq_file(
                        seq_ofile_handles[bc])
                    new_index_item.write_to_fastq_file(
                        index_ofile_handles[bc])
            finally:
                # Only close inputs this function opened itself.
                if gz_input is not None:
                    gz_input.close()
    finally:
        # Close every output even when reading or writing failed, so the
        # gzip trailers are written and no handle leaks.
        for handle in list(seq_ofile_handles.values()) + list(
                index_ofile_handles.values()):
            handle.close()
        if excluded_ofile is not None:
            excluded_ofile.close()

    undetermined_cnt = total_read_cnt - accepted_read_cnt
    if total_read_cnt:
        accepted_pct = round(
            float(accepted_read_cnt) / total_read_cnt * 100, 2)
        undetermined_pct = round(
            float(undetermined_cnt) / total_read_cnt * 100, 2)
    else:
        # No reads at all: report 0 % instead of dividing by zero.
        accepted_pct = 0.0
        undetermined_pct = 0.0

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("\nTotal reads seen: %d\tAssigned to known barcodes: %d (%g %%)\tUndetermined: %d (%g %%)" % (
        total_read_cnt, accepted_read_cnt, accepted_pct,
        undetermined_cnt, undetermined_pct))

    print("\nTop 100 observed barcodes with frequencies:")
    for k, v in sorted(all_bcs.items(), key=itemgetter(1),
                       reverse=True)[0:100]:
        found = barcodes[k] if k in barcodes else "?"
        print("%s\t%d\t%s" % (k, v, found))