コード例 #1
0
def run_mBP_mBPN_pair(f1_in, f1_out, f2_in, f2_out, min_bp_qual_in_read,
                      min_av_read_qual, min_bp_qual_or_N):
    """Filter paired FASTQ streams on minimum base quality and mask weak bases.

    The two inputs are consumed in lockstep.  A pair is written only when the
    lowest per-base quality of *both* mates is at least
    ``min_bp_qual_in_read``.  In every written read, bases whose quality is
    below ``min_bp_qual_or_N`` are replaced by ``N``; the quality strings are
    written unchanged.

    Args:
        f1_in, f2_in: Text handles of the two input FASTQ files.
        f1_out, f2_out: Writable text handles receiving the kept reads.
        min_bp_qual_in_read: Minimum quality every base of both mates must reach.
        min_av_read_qual: Not used by this function; kept so the signature
            stays unchanged.
        min_bp_qual_or_N: Bases scoring below this value are masked with ``N``.

    Raises:
        StopIteration: If ``f2_in`` holds fewer records than ``f1_in``.
    """

    def decode_quals(qual_line):
        # Quality characters are decoded with an ASCII offset of 33.
        # NOTE(review): the original comment said "assume illumina 1.7";
        # confirm that offset 33 is the intended encoding.
        return numpy.frombuffer(
            qual_line.encode('ascii'), dtype=numpy.uint8) - 33

    def mask_bases(seq_line, quals):
        # frombuffer() yields a read-only view of the bytes, hence the copy.
        bases = numpy.frombuffer(
            seq_line.encode('ascii'), dtype='S1').copy()
        bases[quals < min_bp_qual_or_N] = b'N'
        return bases.tobytes().decode('ascii')

    iter2 = FastqGeneralIterator(f2_in)
    for idLine, seqLine, qualLine in FastqGeneralIterator(f1_in):
        idLine2, seqLine2, qualLine2 = next(iter2)
        quals1 = decode_quals(qualLine)
        quals2 = decode_quals(qualLine2)
        lowest1 = numpy.min(quals1)
        lowest2 = numpy.min(quals2)
        if lowest1 >= min_bp_qual_in_read and lowest2 >= min_bp_qual_in_read:
            f1_out.write(
                "@%s\n%s\n%s\n%s\n" %
                (idLine, mask_bases(seqLine, quals1), "+", qualLine))
            f2_out.write(
                "@%s\n%s\n%s\n%s\n" %
                (idLine2, mask_bases(seqLine2, quals2), "+", qualLine2))
コード例 #2
0
def fastqtrimmer(threeprimetrim, forreads, revreads):
    """Trim a fixed number of bases from the 3' end of paired gzipped FASTQ files.

    Reads ``forreads`` and ``revreads`` (gzip-compressed FASTQ) and writes the
    trimmed records to ``tempf.fastq`` and ``tempr.fastq`` in the current
    working directory.

    Args:
        threeprimetrim: Number of bases to remove from the 3' end of every
            read (anything accepted by ``int()``).  A value of 0 leaves the
            reads untouched.
        forreads: Path of the gzipped forward-read FASTQ file.
        revreads: Path of the gzipped reverse-read FASTQ file.
    """
    import sys

    threeprimetrim = int(threeprimetrim)
    # seq[:-0] is seq[:0] (an empty string), so a trim of 0 must slice to None.
    end = -threeprimetrim if threeprimetrim else None

    def trim_stream(inname, infh, outfh):
        """Copy every record of *infh* to *outfh* with the 3' end removed."""
        counter = 0
        try:
            for title, seq, qual in FastqGeneralIterator(infh):
                counter += 1
                if counter % 1000000 == 0:
                    print('On read {0} of {1}.'.format(counter, inname))
                outfh.write('@{0}\n{1}\n+\n{2}\n'.format(
                    title, seq[:end], qual[:end]))
        except ValueError as err:
            # Stay best-effort on a malformed record, but say so instead of
            # silently producing a truncated output file.
            sys.stderr.write(
                'Warning: stopped reading {0} after {1} reads: {2}\n'.format(
                    inname, counter, err))

    foutfilename = 'tempf.fastq'
    routfilename = 'tempr.fastq'
    # Text mode: FastqGeneralIterator parses str, not bytes.
    with gzip.open(forreads, 'rt') as forinfh, \
            gzip.open(revreads, 'rt') as revinfh, \
            open(foutfilename, 'w') as foroutfh, \
            open(routfilename, 'w') as revoutfh:
        trim_stream(forreads, forinfh, foroutfh)
        trim_stream(revreads, revinfh, revoutfh)

    print('Done trimming {0} and {1}.'.format(forreads, revreads))
コード例 #3
0
ファイル: amptklib.py プロジェクト: irawand07/amptk
def FastMaxEEFilter(input, trunclen, maxee, output):
    """Truncate FASTQ reads and keep those within an expected-error budget.

    Every read of ``input`` is cut to its first ``trunclen`` bases.  The
    expected error of the truncated read is the sum of ``10 ** (-q / 10)``
    over its quality characters, where ``q`` is looked up in the module-level
    ``ASCII`` mapping.  Reads whose expected error is at most ``maxee`` are
    written to ``output``.

    Args:
        input: Path of the FASTQ file to read.
        trunclen: Number of leading bases to keep (anything ``int()`` accepts).
        maxee: Maximum expected error allowed (anything ``float()`` accepts).
        output: Path of the FASTQ file to write.
    """
    from Bio.SeqIO.QualityIO import FastqGeneralIterator

    # Convert once instead of once per record.
    trunclen = int(trunclen)
    maxee = float(maxee)
    # Plain 'r': the 'U' flag is rejected by Python 3.11+, and Python 3 text
    # mode already translates newlines.
    with open(output, 'w') as out, open(input, 'r') as handle:
        for title, seq, qual in FastqGeneralIterator(handle):
            Seq = seq[:trunclen]
            Qual = qual[:trunclen]
            ee = 0
            for Q in Qual:
                q = int(ASCII.get(Q))
                ee += 10**(float(-q) / 10)
            if ee <= maxee:
                out.write("@%s\n%s\n+\n%s\n" % (title, Seq, Qual))
コード例 #4
0
def _fastq_generic(in_handle, out_handle, mapping):
    """Copy FASTQ records while translating their quality strings (PRIVATE).

    Each quality string is passed through ``str.translate(mapping)``.  A NUL
    character in the translated string is treated as an invalid input quality
    and aborts the conversion, so no data is lost by truncation.

    Returns the number of records read.

    Raises:
        ValueError: If a translated quality string contains ``chr(0)``.
    """
    from Bio.SeqIO.QualityIO import FastqGeneralIterator

    nul_char = chr(0)
    n_records = 0
    # Work on raw (title, sequence, quality) string triples for speed; no
    # SeqRecord or Seq objects are built.
    for n_records, (title, seq, raw_qual) in enumerate(
            FastqGeneralIterator(in_handle), 1):
        new_qual = raw_qual.translate(mapping)
        if nul_char in new_qual:
            raise ValueError("Invalid character in quality string")
        out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, new_qual))
    return n_records
コード例 #5
0
def read_fastq(fname):
    """Yield ``(title, sequence, quality)`` tuples from a FASTQ file.

    When ``fname`` is empty or ``None`` the generator never ends: it yields
    the placeholder ``("", None, None)`` forever, so it can be zipped against
    a real read stream.

    Files whose name ends in ``.gz`` are opened with ``gzip``.  Both kinds are
    opened in text mode, because ``gzip.open`` defaults to binary and the
    FASTQ parser works on text.
    """
    if not fname:
        # Infinite by design; nothing below this loop runs in this case.
        for info in itertools.repeat(("", None, None)):
            yield info

    is_gzipped = os.path.splitext(fname)[1] == ".gz"
    open_file = gzip.open if is_gzipped else open

    with open_file(fname, "rt") as in_handle:
        for info in FastqGeneralIterator(in_handle):
            yield info
コード例 #6
0
ファイル: _convert.py プロジェクト: JPoziemski/biopython
def _fastq_convert_tab(in_handle, out_handle, alphabet=None):
    """Convert FASTQ to a simple two-column tabbed format (PRIVATE).

    Each record becomes one line: the first whitespace-delimited word of the
    title, a tab, then the sequence.  Quality strings are read but neither
    decoded nor validated, and no SeqRecord or Seq objects are created, which
    keeps the conversion fast.

    The ``alphabet`` argument is accepted but not used.

    Returns the number of records converted.
    """
    from Bio.SeqIO.QualityIO import FastqGeneralIterator

    n_written = 0
    for n_written, (title, seq, _qual) in enumerate(
            FastqGeneralIterator(in_handle), 1):
        read_id = title.split(None, 1)[0]
        out_handle.write("%s\t%s\n" % (read_id, seq))
    return n_written
コード例 #7
0
def filter_my_fastq_file(in_fastq, trim_len, out_fastq):
    """Copy a FASTQ file, dropping the first ``trim_len`` bases of every read.

    Args:
        in_fastq: Path of the FASTQ file to read.
        trim_len: Number of leading bases (and quality characters) to remove;
            used directly as a slice start.
        out_fastq: Path of the FASTQ file to create.
    """
    # The context managers close both files even if parsing fails part-way.
    with open(out_fastq, "w") as out_file, open(in_fastq) as in_file:
        for title, seq, qual in FastqGeneralIterator(in_file):
            out_file.write("@%s\n%s\n+\n%s\n" % (title,
                                                 seq[trim_len:],
                                                 qual[trim_len:]))
コード例 #8
0
def RandomReadIndexes(indexFile, indexFileRand, probability):
    """Write a random subset of the reads in ``indexFile`` to ``indexFileRand``.

    One keep/drop decision is drawn per record up front, each record being
    kept when ``random.random() < probability``.  The number of decisions
    comes from ``GetTotalSeqRecords(indexFile)``, so a file that yields more
    records than that count raises ``IndexError``.

    Args:
        indexFile: Path of the input FASTQ file (opened through ``nopen``).
        indexFileRand: Path of the FASTQ file to create.
        probability: Chance of keeping each individual read.
    """
    records = GetTotalSeqRecords(indexFile)
    bar = progressbar.ProgressBar(
        maxval=records,
        widgets=[progressbar.Bar(left='<', marker='.', right='>')]).start()
    # range() instead of the Python 2-only xrange(); the draws still happen
    # in record order, so a seeded run selects the same reads as before.
    keep_flags = [random.random() < probability for _ in range(records)]
    with open(indexFileRand, "w") as handle:
        for t, (title, seq, qual) in enumerate(
                FastqGeneralIterator(nopen(indexFile))):
            if keep_flags[t]:
                handle.write("@{}\n{}\n+\n{}\n".format(title, seq, qual))
            bar.update(t)
    bar.finish()
コード例 #9
0
def _open_fastq_text(path: str):
    """Open *path* for text reading, using gzip when it ends in ``.gz``."""
    opener = gzip.open if os.path.splitext(path)[-1] == '.gz' else open
    return opener(path, 'rt')


def load_fastq(fastq_file: str,
               pair: Optional[str] = None) -> Tuple[fastq.Read, ...]:
    """Load a FASTQ file, optionally attaching reverse reads from a mate file.

    Every record of ``fastq_file`` becomes a ``fastq.Read`` keyed by its
    title; a later record with the same title replaces the earlier one.  If
    ``pair`` is given, each of its records is attached to the forward read
    with the same title through ``add_reverse``.  Reverse reads without a
    matching forward read are reported with ``logging.error`` and skipped.

    Args:
        fastq_file: Path of the forward FASTQ file (plain or ``.gz``).
        pair: Optional path of the reverse FASTQ file (plain or ``.gz``).

    Returns:
        A tuple with all loaded reads.
    """
    logging.info("Reading in FASTQ file %s", fastq_file)
    read_start = time.time()
    reads = {}  # type: Dict[str, fastq.Read]
    with _open_fastq_text(fastq_file) as ffile:
        for name, seq, qual in FastqGeneralIterator(ffile):
            reads[name] = fastq.Read(read_id=name, seq=seq, qual=qual)
    logging.debug("Reading in FASTQ file %s took %s seconds", fastq_file,
                  round(time.time() - read_start, 3))
    if pair:
        logging.info("Reading in reverse FASTQ file %s", pair)
        reverse_start = time.time()
        with _open_fastq_text(pair) as rfile:
            for rname, rseq, rqual in FastqGeneralIterator(rfile):
                try:
                    reads[rname].add_reverse(seq=rseq, qual=rqual)
                except KeyError:
                    logging.error(
                        "Reverse read %s doesn't match any reads in the forward FASTQ file",
                        rname)
        logging.debug("Reading in reverse FASTQ file took %s seconds",
                      round(time.time() - reverse_start, 3))
    return tuple(reads.values())
コード例 #10
0
def splitDemux2(input, outputdir):
    """Split a demultiplexed FASTQ file into one file per sample.

    The sample name is the text following ``barcodelabel=`` in the read
    title, with every ``;`` removed.  Records are appended to
    ``<outputdir>/<sample>.fastq``.

    The module-level ``args.length`` controls truncation: when it is falsy
    reads are written unchanged; otherwise reads shorter than that length are
    skipped and the remaining ones are cut to exactly that length.

    Raises:
        IndexError: If a read title contains no ``barcodelabel=`` marker.
    """
    max_len = int(args.length) if args.length else None
    with open(input) as handle:
        for title, seq, qual in FastqGeneralIterator(handle):
            sample = title.split('barcodelabel=')[1].replace(';', '')
            if max_len is not None:
                if len(seq) < max_len:
                    continue
                seq, qual = seq[:max_len], qual[:max_len]
            # Text append mode: the record is a str, which a handle opened
            # with 'ab' rejects on Python 3.
            with open(os.path.join(outputdir, sample + '.fastq'),
                      'a') as output:
                output.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
コード例 #11
0
def main(basename, input_dir, output_dir):
    """Build barcode+UMI reads from the R2/R3 files and copy R1 alongside them.

    Exactly one gzipped FASTQ file per read tag (``R1``, ``R2``, ``R3``) whose
    name contains ``basename`` must exist in ``input_dir``.  For every record
    pair, the first ``CONFIG['barcode-length']`` bases of the R2 record and
    the first ``CONFIG['umi-length']`` bases of the R3 record (and the
    matching quality characters) are concatenated and written to
    ``<basename>_R1.fastq.gz``.  The input R1 file is then copied unchanged
    to ``<basename>_R2.fastq.gz``.

    Args:
        basename: Sample name used in the input glob and the output names.
        input_dir: Directory holding the input files (``~`` is expanded).
        output_dir: Directory receiving the output files (``~`` is expanded).

    Raises:
        Exception: If a read tag does not match exactly one input file.
        AssertionError: If the R2 and R3 record titles disagree.
    """
    input_dir = abspath(expanduser(input_dir))
    output_dir = abspath(expanduser(output_dir))
    output_r1 = path.join(output_dir, basename + "_R1.fastq.gz")
    output_r2 = path.join(output_dir, basename + "_R2.fastq.gz")

    def find_single(read_tag):
        """Return the only input file for *read_tag*, or raise."""
        matches = glob(
            input_dir + '/*{}*{}*.fastq.gz'.format(basename, read_tag))
        if len(matches) != 1:
            # Covers "none found" as well as "several found".
            raise Exception(
                "Expected exactly one {} file for '{}' in {}, found {}".format(
                    read_tag, basename, input_dir, len(matches)))
        return matches[0]

    r1_file = find_single('R1')
    r2_file = find_single('R2')
    r3_file = find_single('R3')
    barcode_len = CONFIG['barcode-length']
    umi_len = CONFIG['umi-length']

    with gzip.open(r2_file, 'rt') as barcode_handle, \
            gzip.open(r3_file, 'rt') as umi_handle, \
            gzip.open(output_r1, 'wt') as out_handle:
        # NOTE: zip() stops at the shorter input, so surplus records in the
        # longer file are dropped without an error.
        for barcode_record, umi_record in zip(
                FastqGeneralIterator(barcode_handle),
                FastqGeneralIterator(umi_handle)):
            barcode_id = barcode_record[0].split()[0]
            umi_id = umi_record[0].split()[0]
            if barcode_id != umi_id:
                # Raised explicitly so the check survives ``python -O``.
                raise AssertionError(
                    "record titles do not match: {} vs {}".format(
                        barcode_id, umi_id))
            seq_string = barcode_record[1][:barcode_len] + \
                umi_record[1][:umi_len]
            qual_string = barcode_record[2][:barcode_len] + \
                umi_record[2][:umi_len]
            out_handle.write("@" + barcode_record[0] + "\n")
            out_handle.write(seq_string + "\n")
            out_handle.write("+\n")
            out_handle.write(qual_string + "\n")

    copyfile(r1_file, output_r2)
コード例 #12
0
def filter_reads(readfile):
    """Keep the part of each read that follows a match to ``tn_seq``.

    Every read of ``readfile`` is aligned against the module-level ``tn_seq``
    using ``min_score`` and ``min_match_length``.  When the aligner returns a
    result and at least ``min_remaining_length`` bases follow the end of the
    match, the bases after the match (with their qualities) are written to
    ``filtered_filename``.  A summary line is printed at the end.
    """
    print("Filtering reads\n")
    ssw = Aligner(tn_seq)
    total = 0
    matched = 0
    # Both files are managed so the input handle is closed as well.
    with open(filtered_filename, 'w') as f, open(readfile) as reads:
        for title, seq, qual in FastqGeneralIterator(reads):
            total += 1
            res = ssw.align(seq, min_score, min_match_length)
            if not res:
                continue
            end = res.query_end + 1
            if len(seq) - end >= min_remaining_length:
                matched += 1
                f.write('@%s\n%s\n+\n%s\n' % (title, seq[end:], qual[end:]))
    print("%s of %s read had the tn seq\n" % (matched, total))
コード例 #13
0
def run_mBP_mRQ_mBPN(f1_in, f1_out, min_bp_qual_in_read, min_av_read_qual,
                     min_bp_qual_or_N):
    """Filter a FASTQ stream on base and mean quality, masking weak bases.

    A read is written when its lowest per-base quality is at least
    ``min_bp_qual_in_read`` and its mean quality is at least
    ``min_av_read_qual``.  In written reads, bases whose quality is below
    ``min_bp_qual_or_N`` are replaced by ``N``; the quality string is written
    unchanged.

    Args:
        f1_in: Text handle of the input FASTQ file.
        f1_out: Writable text handle receiving the kept reads.
        min_bp_qual_in_read: Minimum quality every base must reach.
        min_av_read_qual: Minimum mean quality of the read.
        min_bp_qual_or_N: Bases scoring below this value are masked with ``N``.
    """
    for idLine, seqLine, qualLine in FastqGeneralIterator(f1_in):
        # Quality characters are decoded with an ASCII offset of 33.
        # NOTE(review): the original comment said "assume illumina 1.7";
        # confirm that offset 33 is the intended encoding.
        quals = numpy.frombuffer(
            qualLine.encode('ascii'), dtype=numpy.uint8) - 33
        lowest = numpy.min(quals)
        if lowest >= min_bp_qual_in_read:
            average = numpy.mean(quals)
            if average >= min_av_read_qual:
                # frombuffer() yields a read-only view, hence the copy.
                bases = numpy.frombuffer(
                    seqLine.encode('ascii'), dtype='S1').copy()
                bases[quals < min_bp_qual_or_N] = b'N'
                f1_out.write("@%s\n%s\n%s\n%s\n" %
                             (idLine, bases.tobytes().decode('ascii'),
                              "+", qualLine))
コード例 #14
0
ファイル: deconvolute.py プロジェクト: zjwang6/jcvi
def split_barcode(t):
    """Write the reads belonging to one barcode to their own FASTQ file.

    Args:
        t: A 4-tuple ``(barcode, excludebarcode, outdir, inputfile)``.  The
            barcode object provides ``id`` and ``seq``.

    Reads accepted by ``is_barcode_sample`` are written to
    ``<outdir>/<barcode.id>.<barcode.seq>.fastq`` with the first
    ``len(barcode.seq)`` bases and quality characters removed.
    """
    barcode, excludebarcode, outdir, inputfile = t
    trim = len(barcode.seq)
    outfastq = op.join(outdir, "{0}.{1}.fastq".format(barcode.id, barcode.seq))

    # Opened before the output file, as in the original; left unclosed
    # because what must_open returns is not visible from here.
    fp = must_open(inputfile)
    # The context manager closes the output even if parsing fails part-way.
    with open(outfastq, "w") as fw:
        for title, seq, qual in FastqGeneralIterator(fp):
            if not is_barcode_sample(seq, barcode, excludebarcode, trim):
                continue
            fw.write("@{0}\n{1}\n+\n{2}\n".format(title, seq[trim:],
                                                  qual[trim:]))
コード例 #15
0
def analysis1():
    """Score every read of ``reads_fa_file`` against the EMX1 reference.

    The reference is read from ``/data/compbio2/linyi/emx1.fa``.  Each read is
    globally aligned to it with ``pairwise2.align.globalds`` (``blosum62``,
    gap open -10, gap extend -0.5) and the score of the first returned
    alignment is recorded.

    Returns:
        A float ``numpy`` array with one score per read, in file order.  Its
        length follows the file rather than a hard-coded record count.
    """
    seq1 = SeqIO.read("/data/compbio2/linyi/emx1.fa", "fasta")
    scores = []

    with open(reads_fa_file) as in_handle:
        for title, seq, qual in FastqGeneralIterator(in_handle):
            alignments = pairwise2.align.globalds(seq1.seq, seq, blosum62, -10,
                                                  -0.5)
            scores.append(alignments[0][2])

    return np.array(scores, dtype=float)
コード例 #16
0
def main():
  """Copy the reads of a FASTQ file that are at least ``--size`` bases long.

  Command-line options:
    -i/--input   input FASTQ path (required)
    -s/--size    minimum read length to keep (default 1000)
    -o/--output  output FASTQ path (required)

  Exits with a usage error when a required option is missing.
  """
  usage = "usage: %prog [options]"
  p = optparse.OptionParser(usage)

  p.add_option('-i', '--input', help='Input fastq [None,REQD]')
  p.add_option('-s', '--size', type="int", default=1000, help="Minimum size of read to keep [1000]")
  p.add_option('-o', '--output', help='Output fastq [None,REQD]')

  opts, args = p.parse_args()

  # optparse does not enforce required options; without this check a missing
  # path reaches open(None) and fails with an unrelated TypeError.
  if not opts.input or not opts.output:
    p.error("both --input and --output are required")

  with open(opts.input, "r") as fin:
    with open(opts.output, "w") as fout:
      for title, seq, qual in FastqGeneralIterator(fin):
        if len(seq) >= opts.size:
          fout.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
コード例 #17
0
def filter_my_fastq_file(in_fastq, number_of_seq, out_fastq):
    """Subsample a FASTQ file by keeping every ``number_of_seq``-th record.

    Records are counted from zero, so the first record is always kept.

    Args:
        in_fastq: Path of the FASTQ file to read.
        number_of_seq: Sampling step (anything ``int()`` accepts).
        out_fastq: Path of the FASTQ file to create.

    Returns:
        True once the file has been written.

    Raises:
        ZeroDivisionError: If ``number_of_seq`` is 0 and the input holds at
            least one record.
    """
    number_of_seq = int(number_of_seq)
    # The context managers close both files even if parsing fails part-way.
    with open(in_fastq) as in_file, open(out_fastq, "w") as out_file:
        for i, (title, seq, qual) in enumerate(FastqGeneralIterator(in_file)):
            if i % number_of_seq == 0:
                out_file.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
    return True
コード例 #18
0
def get_info(handle, barcode=None, UMI=None):
    """Accumulate Q20/Q30 statistics over all reads of a FASTQ handle.

    Quality characters are decoded as ``ord(char) - 33``.  The per-read
    counts come from ``get_qual_num``, called with the decoded scores, a
    threshold (20 or 30) and, for the region statistics, an index list.

    Args:
        handle: Text handle of the FASTQ file.
        barcode: Optional index collection describing the barcode region.
        UMI: Optional index collection describing the UMI region.

    Returns:
        A tuple starting with ``(reads, bases, q20, q30)``.  Depending on
        which of ``barcode`` / ``UMI`` are truthy it is followed by the
        barcode Q20/Q30 totals, the UMI Q20/Q30 totals, and the "remaining"
        Q20/Q30 totals (computed with ``[max_index]``, the largest index of
        the supplied regions).
    """
    n_reads = n_bases = 0
    all_q20 = all_q30 = 0
    bc_q20 = bc_q30 = 0
    umi_q20 = umi_q30 = 0
    rest_q20 = rest_q30 = 0

    for _title, _seq, qual in FastqGeneralIterator(handle):
        n_reads += 1
        scores = [ord(ch) - 33 for ch in qual]
        n_bases += len(scores)
        all_q20 += get_qual_num(scores, 20)
        all_q30 += get_qual_num(scores, 30)

        if not (barcode or UMI):
            continue

        if barcode:
            bc_q20 += get_qual_num(scores, 20, barcode)
            bc_q30 += get_qual_num(scores, 30, barcode)
        if UMI:
            umi_q20 += get_qual_num(scores, 20, UMI)
            umi_q30 += get_qual_num(scores, 30, UMI)

        # Largest index covered by whichever regions were supplied.
        if barcode and UMI:
            last_index = max(barcode + UMI)
        elif barcode:
            last_index = max(barcode)
        else:
            last_index = max(UMI)
        rest_q20 += get_qual_num(scores, 20, [last_index])
        rest_q30 += get_qual_num(scores, 30, [last_index])

    totals = (n_reads, n_bases, all_q20, all_q30)
    if barcode and UMI:
        return totals + (bc_q20, bc_q30, umi_q20, umi_q30, rest_q20, rest_q30)
    if barcode:
        return totals + (bc_q20, bc_q30, rest_q20, rest_q30)
    if UMI:
        return totals + (umi_q20, umi_q30, rest_q20, rest_q30)
    return totals
コード例 #19
0
ファイル: QFilter.py プロジェクト: bennet-heida/ngs-mrd
def adjust_quality_distr(list_AQD_path_input_files, int_AQD_phred_cutoff,
                         int_AQD_input_max_low_q_percent,
                         int_AQD_max_read_filter_percent, int_AQD_read_chunks,
                         int_AQD_number_processes):
    """Return the low-quality threshold to use, raising it if too many reads would be filtered.

    All input FASTQ files are read together.  Chunks of reads are handed to
    ``quality_read_chunk`` in worker processes, and the 101-element lists it
    returns are summed element-wise into one distribution.
    NOTE(review): the 101 bins presumably stand for 0..100 percent of
    low-quality bases per read -- confirm against ``quality_read_chunk``.

    The distribution is then accumulated from bin 0 upwards until the
    accumulated share of reads (in percent) reaches
    ``100 - int_AQD_max_read_filter_percent``; the index of that bin is the
    candidate threshold.

    Returns:
        The candidate threshold if it is larger than
        ``int_AQD_input_max_low_q_percent`` (a message is printed in that
        case), otherwise ``int_AQD_input_max_low_q_percent`` unchanged.
    """

    # One counter per bin, indices 0..100.
    list_AQD_quality_distr = [0 for _ in range(101)]

    # ExitStack closes every opened input file when the block is left.
    with contextlib.ExitStack() as stack:
        list_AQD_input_files = [stack.enter_context(open(str_AQD_temp_path_input_file, 'r')) for str_AQD_temp_path_input_file \
         in list_AQD_path_input_files]
        # One FASTQ parser per input file, combined by the project helpers
        # fastq_zip_equal and fastq_iterator.
        it_AQD_input_fastq = fastq_iterator(fastq_zip_equal(*[FastqGeneralIterator(obj_AQD_temp_input_file) \
         for obj_AQD_temp_input_file in list_AQD_input_files]))

        for list_AQD_reads_for_processing in iter_double_chunked(
                it_AQD_input_fastq, int_AQD_read_chunks,
                int_AQD_number_processes):
            # A new pool is created for every outer chunk, with the default
            # worker count; int_AQD_number_processes is only passed to
            # iter_double_chunked above.
            with concurrent.futures.ProcessPoolExecutor() as executor:

                list_AQD_read_chunk_distr = executor.map(
                    quality_read_chunk, list_AQD_reads_for_processing,
                    itertools.repeat(int_AQD_phred_cutoff))

                # Add each chunk's distribution to the running total, bin by bin.
                for list_AQD_temp_read_chunk_distr in list_AQD_read_chunk_distr:
                    list_AQD_quality_distr = [
                        int_AQD_temp_this_score + int_AQD_temp_total
                        for int_AQD_temp_this_score, int_AQD_temp_total in zip(
                            list_AQD_temp_read_chunk_distr,
                            list_AQD_quality_distr)
                    ]

    int_AQD_total_reads = sum(list_AQD_quality_distr)

    # Walk the bins from 0 upwards, accumulating the read count, and stop at
    # the first bin where the accumulated share reaches the required level.
    int_AQD_reads_pot_filter = 0
    for int_AQD_i in range(101):
        int_AQD_reads_pot_filter += list_AQD_quality_distr[int_AQD_i]
        int_AQD_min_low_q = int_AQD_i
        # NOTE(review): division_zero_tolerant presumably copes with a zero
        # total (no reads) -- confirm its return value in that case.
        if division_zero_tolerant(
                int_AQD_reads_pot_filter, int_AQD_total_reads
        ) * 100 >= 100 - int_AQD_max_read_filter_percent:
            break

    # Only ever loosen the criterion: the caller's value is the lower bound.
    if int_AQD_min_low_q > int_AQD_input_max_low_q_percent:
        print('Adjusted filter criterion to ' + str(int_AQD_min_low_q) + '%')
        return int_AQD_min_low_q
    else:
        return int_AQD_input_max_low_q_percent
コード例 #20
0
def write_output(fastx, ftype, comp_vectors, lengths_d, target_range):
    """Call ``print_comp_vectors`` for each read of a FASTQ or FASTA file.

    Reads are numbered from zero in file order.  Iteration stops as soon as
    ``print_comp_vectors`` returns ``"over"``.

    Args:
        fastx: Path of the sequence file.
        ftype: ``"fastq"`` or ``"fasta"``; any other value does nothing.
        comp_vectors, lengths_d, target_range: Passed through unchanged to
            ``print_comp_vectors``.
    """
    if ftype == "fastq":
        parser = FastqGeneralIterator
    elif ftype == "fasta":
        parser = SimpleFastaParser
    else:
        return

    # Both parsers yield tuples whose first item is the read id, so a single
    # loop serves both formats; the context manager closes the input file.
    with open(fastx) as handle:
        for read_num, record in enumerate(parser(handle)):
            status = print_comp_vectors(read_num, target_range, comp_vectors,
                                        record[0], lengths_d)
            if status == "over":
                break
コード例 #21
0
def splitread(args):
    """
    %prog splitread fastqfile

    Split fastqfile into two read fastqfiles, cut in the middle.
    """
    # The docstring above doubles as the usage text handed to OptionParser,
    # so it is kept verbatim.
    p = OptionParser(splitread.__doc__)
    p.add_option("-n",
                 dest="n",
                 default=76,
                 type="int",
                 help="Split at N-th base position [default: %default]")
    p.add_option("--rc",
                 default=False,
                 action="store_true",
                 help="Reverse complement second read [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    pairsfastq, = args

    # Outputs are named after the input base name and are created in the
    # current working directory.
    base = op.basename(pairsfastq).split(".")[0]
    fq1 = base + ".1.fastq"
    fq2 = base + ".2.fastq"
    fw1 = must_open(fq1, "w")
    fw2 = must_open(fq2, "w")

    fp = must_open(pairsfastq)
    n = opts.n

    for name, seq, qual in FastqGeneralIterator(fp):

        name = "@" + name
        # First n bases go to read 1, the remainder to read 2.
        rec1 = FastqLite(name, seq[:n], qual[:n])
        rec2 = FastqLite(name, seq[n:], qual[n:])
        if opts.rc:
            rec2.rc()

        # Explicit write(): the Python 2 "print >> handle" form is a
        # TypeError at runtime on Python 3.
        fw1.write(str(rec1) + "\n")
        fw2.write(str(rec2) + "\n")

    logging.debug("Reads split into `{0},{1}`".format(fq1, fq2))
    fw1.close()
    fw2.close()
コード例 #22
0
def fastqtrimmer(trim, outfile, infile):
    """Cut every read of a FASTQ file down to its first ``trim`` bases.

    Reads shorter than ``trim``, or whose sequence and quality lengths
    differ, are dropped.  If the parser raises ``ValueError`` the function
    stops reading, prints a message and keeps what was written so far.

    Args:
        trim: Number of leading bases to keep (anything ``int()`` accepts).
        outfile: Path of the FASTQ file to create.
        infile: Path of the FASTQ file to read.
    """
    trim = int(trim)
    counter = 0
    # Defined up front so the except branch cannot hit an unbound name when
    # the very first record is malformed.
    title = None
    with open(outfile, "w") as handle, open(infile) as reads:
        try:
            for title, seq, qual in FastqGeneralIterator(reads):
                counter += 1
                if counter % 1000000 == 0:
                    print('On read {0}'.format(counter))
                if len(seq) >= trim and len(seq) == len(qual):
                    handle.write("@%s\n%s\n+\n%s\n" %
                                 (title, seq[:trim], qual[:trim]))
        except ValueError:
            # title is the last record parsed successfully (None if none was).
            print('Title and second title line don\'t match for read {0}.'.format(
                title))
コード例 #23
0
def _trim_by_read(in_handles, to_trim, min_length):
    """Lazily yield trimmed reads for all input files, one record set at a time.

    The input files are read in lockstep.  Each record is passed to
    ``_trim_quality``; a dictionary mapping every file key to its
    ``(name, trimmed_seq, trimmed_qual)`` tuple is yielded only when the
    trimmed sequence is truthy for *all* files.

    Iteration ends when the first file is exhausted, or as soon as any other
    file runs out of records.

    Args:
        in_handles: Mapping of file key to open FASTQ handle.
        to_trim, min_length: Passed through to ``_trim_quality``.
    """
    iterators = [(f, FastqGeneralIterator(h)) for f, h in in_handles.items()]
    first_key, first_iter = iterators[0]
    others = iterators[1:]
    for name, seq, qual in first_iter:
        out = {}
        tseq, tqual = _trim_quality(seq, qual, to_trim, min_length)
        if tseq:
            out[first_key] = (name, tseq, tqual)
        for other_key, other_iter in others:
            try:
                oname, oseq, oqual = next(other_iter)
            except StopIteration:
                # A StopIteration escaping a generator body is a RuntimeError
                # under PEP 479; ending the generator keeps the old behaviour.
                return
            tseq, tqual = _trim_quality(oseq, oqual, to_trim, min_length)
            if tseq:
                out[other_key] = (oname, tseq, tqual)
        if len(out) == len(iterators):
            yield out
コード例 #24
0
def gc_content(fastx_fn, ftype):
    """Compute the GC fraction of every read in a FASTQ or FASTA file.

    Args:
        fastx_fn: Path of the sequence file.
        ftype: ``"fastq"`` or ``"fasta"``; any other value yields an empty
            table without opening the file.

    Returns:
        A DataFrame with columns ``read`` (the read id up to the first
        space) and ``GC`` (count of ``G`` and ``C`` divided by read length).

    Raises:
        ZeroDivisionError: If a read has an empty sequence.
    """
    if ftype == "fastq":
        parser = FastqGeneralIterator
    elif ftype == "fasta":
        parser = SimpleFastaParser
    else:
        parser = None

    gc = []
    if parser is not None:
        # Both parsers yield tuples starting with (read id, sequence), so one
        # loop serves both formats; the context manager closes the file.
        with open(fastx_fn) as handle:
            for record in parser(handle):
                read_id, seq = record[0], record[1]
                read_gc = (seq.count("G") + seq.count("C")) / float(len(seq))
                gc.append((read_id.split(" ")[0], read_gc))

    gc_df = pd.DataFrame(gc, columns=["read", "GC"])
    return gc_df
コード例 #25
0
ファイル: barcode.py プロジェクト: parveezsha/gitlabjuly
def convert_illumina_oldstyle(in_file):
    """Rewrite an old-style Illumina barcoded FASTQ file in the current layout.

    The output path is the input path with every ``s_`` and ``_sequence``
    removed; the two paths must differ.  If the output already exists it is
    returned untouched.  Otherwise each record's barcode -- the text between
    ``#`` and ``/`` in the read name -- is appended to the sequence, and the
    quality string is padded with copies of its first character to match.

    Returns the output path.
    """
    out_file = in_file.replace("s_", "").replace("_sequence", "")
    assert out_file != in_file
    if os.path.exists(out_file):
        return out_file

    with open(in_file) as in_handle, open(out_file, "w") as out_handle:
        for name, seq, qual in FastqGeneralIterator(in_handle):
            barcode = name.split("#")[1].split("/")[0]
            padding = qual[0] * len(barcode)
            out_handle.write("@%s\n%s\n+\n%s\n" %
                             (name, seq + barcode, qual + padding))
    return out_file
コード例 #26
0
def main(fqInputName, outputName, minScore):
    """Copy the reads accepted by ``checkHighQual`` into a new gzipped FASTQ.

    After processing, one comma-separated line is printed: input name, number
    of reads read, output name, number of reads written, and the kept
    fraction with eight decimals.  For an input without reads the fraction is
    reported as 0 instead of failing on a division by zero.

    Args:
        fqInputName: Path of the gzipped input FASTQ file.
        outputName: Path of the gzipped FASTQ file to create.
        minScore: Threshold forwarded to ``checkHighQual``.
    """
    inCounter = 0
    outCounter = 0
    with gzip.open(outputName, 'wb') as outHandle, \
            gzip.open(fqInputName, 'rt') as inHandle:
        for title, seq, qual in FastqGeneralIterator(inHandle):
            inCounter += 1
            if checkHighQual(qual, minScore=minScore):
                outEntry = '@' + title + '\n' + seq + '\n+\n' + qual + '\n'
                # The output handle is binary, so the record is encoded.
                outHandle.write(outEntry.encode())
                outCounter += 1
    keptFraction = outCounter / inCounter if inCounter else 0.0
    print(','.join([
        fqInputName,
        str(inCounter), outputName,
        str(outCounter),
        format(keptFraction, '.8f')
    ]))
コード例 #27
0
def primerStrip(file, GoodOut, BadOut, fwdprimer, revprimer):
    """Strip primers from reads and sort the reads into good and bad files.

    A read whose prefix matches ``fwdprimer`` with at most
    ``args.primer_mismatch`` differences has the first ``len(fwdprimer)``
    bases removed and goes to ``GoodOut``.  If ``revprimer`` is given and its
    reverse complement is found at a position greater than 0 in the
    remaining sequence, the read is also cut at that position.  All other
    reads are written unchanged to ``BadOut``.

    Args:
        file: Path of the input FASTQ file.
        GoodOut: Path of the FASTQ file for primer-stripped reads.
        BadOut: Path of the FASTQ file for reads without the forward primer.
        fwdprimer: Forward primer sequence.
        revprimer: Reverse primer sequence, or a falsy value to skip it.
    """
    PL = len(fwdprimer)
    # Computed once; it does not depend on the read.
    rev_rc = revcomp_lib.RevComp(revprimer) if revprimer else None
    with open(GoodOut, 'w') as good, open(BadOut, 'w') as bad, \
            open(file) as reads:
        for title, seq, qual in FastqGeneralIterator(reads):
            Diffs = primer.MatchPrefix(seq, fwdprimer)
            if Diffs <= args.primer_mismatch:
                Seq = seq[PL:]
                Qual = qual[PL:]
                if revprimer:
                    BestPosRev, BestDiffsRev = primer.BestMatch2(
                        Seq, rev_rc, args.primer_mismatch)
                    if BestPosRev > 0:  # reverse primer was found
                        Seq = Seq[:BestPosRev]
                        Qual = Qual[:BestPosRev]
                good.write("@%s\n%s\n+\n%s\n" % (title, Seq, Qual))
            else:
                bad.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
コード例 #28
0
def find_percentage(filename):
    """Report and plot the G/C fraction of every read in a FASTQ file.

    For each read, two lines are written to ``results.txt`` in the current
    working directory: the 1-based read number and its G/C fraction (rounded
    to three decimals), then the sequence.  A 75-bin histogram of the
    fractions is then shown with matplotlib.

    Args:
        filename: Path of the FASTQ file to analyse.

    Raises:
        ZeroDivisionError: If a read has an empty sequence.
    """
    resultlist = []
    # The report file is closed (and therefore flushed) before plotting.
    with open("results.txt", "w") as f, open(filename) as handle:
        for seqcounter, (title, sequence, quality) in enumerate(
                FastqGeneralIterator(handle), 1):
            gc_count = sequence.count("C") + sequence.count("G")
            result = round(gc_count / len(sequence), 3)
            resultlist.append(result)
            f.write("%s %s\n%s\n" % (seqcounter, result, sequence))

    plt.hist(resultlist, bins=75)
    plt.xticks(np.arange(0, 1, 0.05))

    # Axis labels and title
    plt.xlabel('G/C dažnis')
    plt.ylabel('seku skaičius')
    plt.title('Grafikas')

    # Display the plot
    plt.show()
コード例 #29
0
ファイル: undr_rover.py プロジェクト: khalidm/undr_rover
def complete_blocks(args, blocks, fastq_pair):
    """Organise the reads of a FASTQ pair into their primer blocks.

    ``blocks`` maps a primer prefix to a list.  Two shapes are handled:

    * 9 elements -- a forward-primer entry; index 7 holds the forward primer
      sequence and index 3 a dictionary of reads keyed by read name.
    * 2 elements -- a reverse-primer entry ``[reverse_primer_seq,
      forward_key]`` that points at its forward entry.

    Each stored read entry is a list ``[forward_read, reverse_read,
    forward_primer_len, reverse_primer_len, sample]``, with ``0`` standing in
    for a mate that has not been seen.

    Args:
        args: Options object; ``primer_prefix_size`` and ``qualthresh`` are
            read from it.
        blocks: The primer lookup described above; its read dictionaries are
            cleared first and then filled in place.
        fastq_pair: The two FASTQ paths of the sample.

    Returns:
        The first five elements of every entry that has more than two.
    """
    # Start from empty read dictionaries.
    for block in blocks:
        if len(blocks[block]) > 2:
            blocks[block][3].clear()
    sample = os.path.basename(fastq_pair[0]).split('_')
    if len(sample) > 1:
        sample = '_'.join(sample[:3])
        logging.info("Processing sample {}".format(sample))
    else:
        exit('Cannot deduce sample name from fastq filename {}'.\
            format(fastq_pair[0]))
    for fastq_file in fastq_pair:
        # Plain "r": the "U" flag is rejected by Python 3.11+, and Python 3
        # text mode already translates newlines.
        with open(fastq_file, "r") as fastq:
            for (title, seq, qual) in FastqGeneralIterator(fastq):
                # Each read is also stored as a dictionary.
                read = {'name': title.partition(' ')[0], 'seq': seq}
                read['qual'] = qual if args.qualthresh else []
                # Try to match each read with an expected primer.
                read_bases = read['seq']
                primer_key = read_bases[:args.primer_prefix_size]
                entry = blocks.get(primer_key, [])
                if len(entry) == 9:
                    # Possible forward primer matched.
                    fseq = entry[7]
                    if fseq == read_bases[:len(fseq)]:
                        block_reads = entry[3]
                        if read['name'] not in block_reads:
                            block_reads[read['name']] = [read, 0, \
                            len(fseq), 0, sample]
                        else:
                            block_reads[read['name']][0] = read
                            block_reads[read['name']][2] = len(fseq)
                elif len(entry) == 2:
                    # Possible reverse primer matched.
                    rseq = entry[0]
                    if rseq == read_bases[:len(rseq)]:
                        forward_key = entry[1]
                        block_reads = blocks[forward_key][3]
                        if read['name'] not in block_reads:
                            block_reads[read['name']] = [0, read, \
                            0, len(rseq), sample]
                        else:
                            block_reads[read['name']][1] = read
                            block_reads[read['name']][3] = len(rseq)
    # For the next stage, we only need the actual blocks.
    result = [b[:5] for b in blocks.values() if len(b) > 2]
    return result
コード例 #30
0
ファイル: _convert.py プロジェクト: JPoziemski/biopython
def _fastq_convert_fasta(in_handle, out_handle, alphabet=None):
    """Convert FASTQ to FASTA quickly (PRIVATE).

    Each record is written as a ``>title`` line followed by the sequence
    wrapped at 60 characters per line.  Quality strings are read but neither
    decoded nor validated, and no SeqRecord or Seq objects are created.

    The ``alphabet`` argument is accepted but not used.

    Returns the number of records converted.
    """
    from Bio.SeqIO.QualityIO import FastqGeneralIterator

    line_width = 60
    n_records = 0
    for n_records, (title, seq, _qual) in enumerate(
            FastqGeneralIterator(in_handle), 1):
        out_handle.write(">%s\n" % title)
        # Emit the sequence one fixed-width line at a time.
        start = 0
        while start < len(seq):
            out_handle.write(seq[start:start + line_width] + "\n")
            start += line_width
    return n_records