Example #1
def test_fastq_parser():
    print("Test Fastq parser")
    for seq in HTSeq.FastqReader('example_data/fastqEx.fastq'):
        pass
    print("Test passed")
    print("Test Fastq parser on gzip input")
    for seq in HTSeq.FastqReader('example_data/fastqExgzip.fastq.gz'):
        pass
    print("Test passed")
    print("Test Fastq parser on gzip input (raw iterator)")
    for seq in HTSeq.FastqReader('example_data/fastqExgzip.fastq.gz',
                                 raw_iterator=True):
        pass
    print("Test passed")
Example #2
    def sample(self):
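        # Write each read pair to the output files with probability self.rate
        # (Python 2 code: itertools.izip pairs the two readers).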
        fraction = float(self.rate)
        in1 = iter(HTSeq.FastqReader(self.fq1))
        in2 = iter(HTSeq.FastqReader(self.fq2))
        o1 = open(self.o1, "w")
        o2 = open(self.o2, "w")

        for read1, read2 in itertools.izip(in1, in2):
            if random.random() < fraction:
                read1.write_to_fastq_file(o1)
                read2.write_to_fastq_file(o2)

        o1.close()
        o2.close()
def count_reads(data):
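    # With raw_iterator=True the reader yields plain tuples; the first element is the read sequence.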
    reads = {'readlengths': [], 'readcount': 0}
    reads['readlengths'] = [
        len(s[0]) for s in HTSeq.FastqReader(data, raw_iterator=True)
    ]
    reads['readcount'] = len(reads['readlengths'])
    return reads
Example #4
def move_barcode_to_name_in_fastq(filename, out_dir):

    if not os.path.exists(out_dir):
        os.system('mkdir ' + out_dir)

    outfilename = out_dir + '/%s' % os.path.basename(filename)

    if os.path.exists(outfilename):
        print("Output file {} exists... Not overwriting...".format(outfilename))
        return
    else:
        print("Writing {}".format(outfilename))

    outf = open(outfilename, 'w')
    fastq = HTSeq.FastqReader(filename)
    obs_let = set()

    # phred 33
    for read in fastq:

        if len(read.seq) < 14:
            continue
        if min(read.qual[:9]) < 30:
            continue

        _seq = read.seq.decode()

        n_read = HTSeq.SequenceWithQualities(
            read.seq[9:],
            read.name.partition(' ')[0] + '#' + _seq[0:3] + _seq[7:9],
            read.qualstr[9:])

        n_read.write_to_fastq_file(outf)

    return
Example #5
def catagorize_fastq(matched, filename):
    """
    Catagoriz each read of a fastq file based on whether it is aligned to
    the reference genome (therefore name appears in matched) or not.
    Output two files - "filename_aligned" & "filename_unaligned".
    """
    fastq_reader = HTSeq.FastqReader(filename)
    counter = 0

    aligned_output = open(filename[:-6] + "_aligned.fq", "w")
    unaligned_output = open(filename[:-6] + "_unaligned.fq", "w")

    for read in fastq_reader:
        if read.name.split(" ")[0] in matched:
            counter += 1
            read.write_to_fastq_file(aligned_output)
        else:
            read.write_to_fastq_file(unaligned_output)

    aligned_output.close()
    unaligned_output.close()

    # Compress as .gz
    subprocess.call(["gzip", filename[:-6] + "_aligned.fq"])
    subprocess.call(["gzip", filename[:-6] + "_unaligned.fq"])

    print("Find %d aligns in fastq" % counter)
    return 0
Example #6
def main():
    ifile = sys.argv[1]
    ofile = sys.argv[2]
    r = HTSeq.FastqReader(ifile)
    with open(ofile, 'wb') as outstream:
        for ent in r:
            ent.write_to_fastq_file(outstream)
Example #7
def count_barcodes(read1, read2=None):
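    # Tally the first 3 bases (barcode) and first 10 bases (barcode region) of roughly the first million reads.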
    
    fastq_file = HTSeq.FastqReader(read1)
    
    barcode_counts = collections.defaultdict(int) 
    barcode_reg_counts = collections.defaultdict(int) 
    
    for n, read in enumerate(fastq_file):
        
        if (n > 1000000):
            break 
            
        barcode_reg = read.seq[:10] 
        barcode = read.seq[:3]
        barcode_counts[barcode] += 1 
        barcode_reg_counts[barcode_reg] += 1 
        
    with open('barcode_counts.txt', 'w') as f: 
        for barcode in sorted(barcode_counts, key=lambda x: barcode_counts[x]):
        
            if barcode_counts[barcode] < 100:
                continue
                
            f.write("{b}\t{c}\n".format(b=barcode, c=str(barcode_counts[barcode]))) 
            
    with open('barcode_region_counts.txt', 'w') as f: 
        for barcode in barcode_reg_counts: 
            f.write("{b}\t{c}\n".format(b=barcode, c=str(barcode_reg_counts[barcode]))) 
Example #8
def get_counts(sequences, infile, seq2regex):
    """Get number of reads within a file that contain
    each sequence (sequences should be a list)"""
    fastq_file = HTSeq.FastqReader(infile)
    counts = collections.Counter()
    old_id = None
    old_seq = None
    barcode = False
    for read in fastq_file:
        counts["reads"] += 1
        read_id = read.name.split(' ')[0]
        if old_id == read_id:
            counts["read pairs"] += 1
        else:
            old_id = read_id
            if barcode == False:
                counts["nothin"] += 1
            barcode = False
            old_seq = read.seq
        for sequence in sequences:
            if re.search(seq2regex[sequence], read.seq):
                if barcode:
                    counts["double"] += 1
                if barcode==sequence:
                    counts["double same"] += 1
                counts[sequence] += 1
                barcode=sequence
    print("Reads: %d" % (counts["reads"]))
    print("Read pairs: %d" % (counts["read pairs"]))
    print("Nothin: %d" % (counts["nothin"]))
    for sequence in sequences:
        print("%s: %d" % (sequence, counts[sequence]))
    print("Double: %d" % (counts["double"]))
    print("Double same: %d" % (counts["double same"]))
    print("\n")
Example #9
def read_barcodes(bc_file, coder):
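    # Map each read name to the sample code that coder assigns to its barcode sequence.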
    namer = {}
    for r in HTSeq.FastqReader(bc_file):
        if r.seq.decode() in coder:
            namer[r.name.split()[0]] = coder[r.seq.decode()]

    return namer
def demultiplexing(sampleIndex, read1, read2, outdir, mismatch):
    Read1 = HTSeq.FastqReader(read1)
    Read2 = HTSeq.FastqReader(read2)
    for eachSample in sampleIndex:
        sampleIndex[eachSample]['Read1'] = gzip.open(os.path.join(
            outdir, eachSample + '_' + sampleIndex[eachSample]['i7Index'] +
            '_1.fq.gz'),
                                                     mode='wb',
                                                     compresslevel=1)
        sampleIndex[eachSample]['Read2'] = gzip.open(os.path.join(
            outdir, eachSample + '_' + sampleIndex[eachSample]['i7Index'] +
            '_2.fq.gz'),
                                                     mode='wb',
                                                     compresslevel=1)
    undetermined1 = gzip.open(os.path.join(outdir, 'undetermined_1.fq.gz'),
                              mode='wb',
                              compresslevel=1)
    undetermined2 = gzip.open(os.path.join(outdir, 'undetermined_2.fq.gz'),
                              mode='wb',
                              compresslevel=1)
    for R1, R2 in itertools.izip(Read1, Read2):
        if "+" in R1.name.split(':')[-1]:
            i7IndexSeq, i5IndexSeq = R1.name.split(':')[-1].strip().split('+')
        else:
            i7IndexSeq = R1.name.split(':')[-1].strip()
        bestMatch = None
        bestMisCount = 1000
        for eachSample in sampleIndex:
            i7HD = hamming_distance(i7IndexSeq,
                                    sampleIndex[eachSample]['i7Index'])
            if i7HD == 0:
                bestMatch = eachSample
                break
            if i7HD <= mismatch and i7HD < bestMisCount:
                bestMatch = eachSample
                bestMisCount = i7HD
        if bestMatch:
            R1.write_to_fastq_file(sampleIndex[bestMatch]['Read1'])
            R2.write_to_fastq_file(sampleIndex[bestMatch]['Read2'])
        else:
            R1.write_to_fastq_file(undetermined1)
            R2.write_to_fastq_file(undetermined2)
    for eachSample in sampleIndex:
        sampleIndex[eachSample]['Read1'].close()
        sampleIndex[eachSample]['Read2'].close()
    undetermined1.close()
    undetermined2.close()
def head(fastqs, outputs, sequences=100, qual_scale=_default_qual_scale):
    if len(fastqs) != len(outputs):
        raise ValueError("Length of fastq and output parameters must match")
    for (i, fastq) in enumerate(fastqs):
        fastq_iterator = HTSeq.FastqReader(fastq, qual_scale)
        with open(outputs[i], 'w') as headFile:
            for sequence in itertools.islice(fastq_iterator, sequences):
                sequence.write_to_fastq_file(headFile)
Example #12
def converting_fasta_to_fastq(align_clusters, merged_reads, seqtk):
    print "Converting Fasta to Fastq step has been started...."

    if os.path.isfile(os.path.join(align_clusters, "temp.fastq")):
        os.remove(os.path.join(align_clusters, "temp.fastq"))
    merged_fq = open(os.path.join(align_clusters, "temp.fastq"), "a")

    print merged_reads
    for fq in merged_reads:
        fq_reads = HTSeq.FastqReader(fq)
        for r in fq_reads:
            r.write_to_fastq_file(merged_fq)
    merged_fq.close()

    fastq_dir = glob.glob(os.path.join(align_clusters, '*_cl*', '*.fq'))

    reads_merged = os.path.join(align_clusters, "temp.fastq")
    for fq in fastq_dir:
        cl_files = HTSeq.FastqReader(fq)
        reads_merged_header_l = os.path.join(align_clusters, "temp.list")
        proc1 = subprocess.Popen(['grep', '^@M', fq],
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT,
                                 shell=False)
        (out1, err1) = proc1.communicate()
        if not err1:
            with open(reads_merged_header_l, "w") as f:
                f.write(out1.replace("@", ""))

        else:
            print "Errors:", err1

        proc2 = subprocess.Popen(
            [seqtk, 'subseq', reads_merged, reads_merged_header_l],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            shell=False)
        (out2, err2) = proc2.communicate()
        if not err2:
            new_fq_w_q = os.path.splitext(fq)[0] + "_q.fq"
            with open(new_fq_w_q, "w") as f:
                f.write(out2)

    print "Converting Fasta to Fastq step has been finished...."
Example #13
    def __init__(self, original_fastq, trimmed_fastq, unpaired_fastq,
                 shorter_file, dropped_file,
                 outputdir=os.getcwd(),
                 qual_scale=_default_qual_scale):
        original_reader = HTSeq.FastqReader(original_fastq, qual_scale)
        original_iterator = iter(original_reader)
        trimmed_reader = HTSeq.FastqReader(trimmed_fastq, qual_scale)
        trimmed_iterator = iter(trimmed_reader)
        unpaired_reader = HTSeq.FastqReader(unpaired_fastq, qual_scale)
        unpaired_iterator = iter(unpaired_reader)
        brenninc_comp_counter.Comparer.__init__(self, original_iterator,
                                                trimmed_iterator,
                                                unpaired_iterator)
        self.shorter = collections.Counter()
        self.same = 0
        self.update_factor = 100000
        self.shorter_file = shorter_file
        self.dropped_file = dropped_file
Example #14
def sub_fraction_reads(fq1, fq2, fraction, fq1_out, fq2_out):
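    # Paired-end subsampling: keep each read pair with probability `fraction`.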
    fraction = float(fraction)
    in1 = iter(HTSeq.FastqReader(fq1))
    in2 = iter(HTSeq.FastqReader(fq2))
    out1 = open(fq1_out, "w")
    out2 = open(fq2_out, "w")

    while True:
        try:
            read1 = next(in1)
            read2 = next(in2)
            if random.random() < fraction:
                read1.write_to_fastq_file(out1)
                read2.write_to_fastq_file(out2)
        except StopIteration:
            break
    out1.close()
    out2.close()
Example #15
def read_fastQ():
    '''Reads 200,000 sequences of a fastQ file and writes every sequence to a new file called result.txt'''
    fastq_file = HTSeq.FastqReader(".fastq.gz")
    result_f = open("result.txt", "w")

    for read in itertools.islice(fastq_file, 200000):
        result_f.write(str(read))

    result_f.close()
Example #16
def split_by_barcode(sample1_filename, sample2_filename, sample3_filename,
                     missing_barcode_filename, initial_filename):
    sample1f = open(sample1_filename, 'w')
    sample2f = open(sample2_filename, 'w')
    sample3f = open(sample3_filename, 'w')
    missingf = open(missing_barcode_filename, 'w')
    fastq_file = HTSeq.FastqReader(initial_filename)
    total_reads = 0
    sample1_num = 0
    sample2_num = 0
    sample3_num = 0
    found = 0
    for read in fastq_file:
        total_reads += 1
        if (not (total_reads % 100000)):
            sum_with_barcode = sample1_num + sample2_num + sample3_num
            print "Read: %i (%i, %i, %i). Barcode in %f reads" % (
                total_reads, sample1_num, sample2_num, sample3_num,
                float(sum_with_barcode) / float(total_reads))
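        # NOTE: `strain` is assumed to be defined in the enclosing scope ('n2' or 'fbf').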
        if strain == 'n2':
            if (re.match('\w{3}TTGT.*', read.seq)):
                sample1_num += 1
                read.write_to_fastq_file(sample1f)
                continue
            if (re.match('\w{3}CCGG.*', read.seq)):
                sample2_num += 1
                read.write_to_fastq_file(sample2f)
                continue
            if (re.match('\w{3}GGCA.*', read.seq)):
                sample3_num += 1
                read.write_to_fastq_file(sample3f)
                continue
            read.write_to_fastq_file(missingf)
        if strain == 'fbf':
            if (re.match('\w{3}TGGC.*', read.seq)):
                sample1_num += 1
                read.write_to_fastq_file(sample1f)
                continue
            if (re.match('\w{3}GGTT.*', read.seq)):
                sample2_num += 1
                read.write_to_fastq_file(sample2f)
                continue
            if (re.match('\w{3}CGGA.*', read.seq)):
                sample3_num += 1
                read.write_to_fastq_file(sample3f)
                continue
            read.write_to_fastq_file(missingf)
    print """Results:
Reads: %i
Sample 1: %i (%f)
Sample 2: %i (%f)
(sum sample 1 + 2): %i
remaining: %i""" % (total_reads, sample1_num,
                    float(sample1_num) / float(total_reads), sample2_num,
                    float(sample2_num) / float(total_reads), sample1_num +
                    sample2_num, total_reads - (sample1_num + sample2_num))
Example #17
def head(path, sequences=100, outputdir=None, qual_scale=_default_qual_scale):
    extra = "_head" + str(sequences)
    new_path = brenninc_utils.create_new_file(path,
                                              extra,
                                              outputdir=outputdir,
                                              gzipped=False)
    fastq_iterator = HTSeq.FastqReader(path, qual_scale)
    with open(new_path, 'w') as headFile:
        for sequence in itertools.islice(fastq_iterator, sequences):
            sequence.write_to_fastq_file(headFile)
Example #18
def clip_adapters_if_not_already_clipped(in_dir, out_dir, args):

    # Do we need to remove adapters, or are they already removed?
    # Check the first N sequences to determine.
    n_lines_to_check = 4e3
    need_to_clip = True

    for _file in glob.glob(in_dir + '/*.fastq'):

        fastq_reader = HTSeq.FastqReader(_file)

        read_lens = set()

        for i, read in enumerate(fastq_reader):
            read_lens.add(len(read.seq))

            if i > n_lines_to_check:
                break

        if len([x for x in read_lens if x<20]) > 2:
            need_to_clip = False
            break

    if not need_to_clip:
        print("Adapters in {0} are apparently already clipped...".format(in_dir))
        return

    # If adapters are not already removed:
    print("Clipping adapters in {0}...".format(in_dir))

    if in_dir == out_dir:
        print("Input and output dir can't be the same.")
        sys.exit()

    args.input_dir = in_dir
    args.output_dir = 'temp_fastq/'

    if os.path.exists('temp_fastq/'):
        os.system('rm -r temp_fastq/')

    # Remove the 3' linker first:
    args.adapter = ''
    args.three_prime_linker = True
    args.rt_primer = False

    clip_adapter.run(args)

    # Remove the RT primer second:
    for k, v in list(
        {'three_prime_linker': False, 'rt_primer': True,
        'input_dir': 'temp_fastq/', 'output_dir': out_dir, 'adapter': ''}.items()):
        setattr(args, k, v)

    clip_adapter.run(args)
Example #19
    def getInstrument(self):
        for r in HTSeq.FastqReader(self.r1):
            serialNumber = r.name.split(":")[0]
            try:
                instrument = Instruments.objects.get(
                    serial_number=serialNumber)
            except:
                sys.exit("Instrument with Serial Number " + serialNumber +
                         " not found in the database!")
            break
        return instrument
Example #20
def process_file(filename, output_filename):
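    # Shorten the 9-character barcode at the end of each read name to its first three
    # and last two characters, then write the renamed read back out.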
    outf = open(output_filename, 'w')
    fastq = HTSeq.FastqReader(filename)
    for read in fastq:
        barcode = read.name[-9:]
        new_barcode = barcode[:3] + barcode[7:]
        n_read = HTSeq.SequenceWithQualities(
            read.seq,
            read.name[:-9] + new_barcode,
            read.qualstr)
        n_read.write_to_fastq_file(outf)
    outf.close()
Example #21
def openDNA(filename):
    extension = os.path.splitext(filename)[1]
    if extension in ['.fna', '.fasta', '.ffn', '.faa', '.frn']:
        print('File ' + filename + ' is type FastA.')
        file = HTSeq.FastaReader(filename)
        num_lines = sum(1 for line in open(filename))
    elif extension in ['.fq', '.fastq']:
        print('File ' + filename + ' is type FastQ.')
        file = HTSeq.FastqReader(filename)
        num_lines = int(sum(1 for line in open(filename)) / 4)  # 1/4 of lines are sequences in FastQ
    else:
        raise Exception('Unknown file type, exiting.')
    return file, num_lines
Example #22
def collapse_reads(infile, outfile=None, min_length=15):
    """Collapse identical reads, writing collapsed reads to a new fasta file.
      Retains copy number in fasta headers. Each sequence in the resulting file
      should be unique.

    Args:
        infile: input fastq file
        outfile: output fasta file with collapsed reads
        min_length: minimum length of read to include
    Returns:
        True if successful, otherwise False
    """

    #from itertools import islice
    if outfile == None:
        outfile = os.path.splitext(infile)[0] + '_collapsed.fa'
    print('collapsing reads %s' % infile)
    ext = os.path.splitext(infile)[1]
    if ext == '.fastq':
        fastfile = HTSeq.FastqReader(infile, "solexa")
    elif ext == '.fa' or ext == '.fasta':
        fastfile = HTSeq.FastaReader(infile)
    else:
        print('not fasta or fastq')
        return False

    i = 0
    total = 0
    f = {}
    #print (fastfile)
    for s in fastfile:
        seq = s.seq.decode()
        if seq in f:
            f[seq]['reads'] += 1
        else:
            f[seq] = {'name': s.name, 'reads': 1}
        total += 1

    df = pd.DataFrame.from_dict(f, orient='index')
    df.index.name = 'seq'
    df = df.reset_index()
    l = df.seq.str.len()
    df = df[l >= min_length]
    df = df.drop(['name'], axis=1)
    df = df.sort_values(by='reads', ascending=False).reset_index()
    df['read_id'] = df.index.copy()
    df['read_id'] = df.apply(lambda x: str(x.read_id) + '_' + str(x.reads), axis=1)
    #print df[:10]
    utils.dataframe_to_fasta(df, idkey='read_id', outfile=outfile)
    #df.to_csv(os.path.splitext(outfile)[0]+'.csv', index=False)
    print('collapsed %s reads to %s' % (total, len(df)))
    return True
Example #23
def fastq_to_fasta(infile, rename=True):
    """Fastq to fasta"""

    fastqfile = HTSeq.FastqReader(infile, "solexa")
    outfile = open(os.path.splitext(infile)[0] + '.fa', 'w')
    i = 1
    for s in fastqfile:
        if rename == True:
            s.name = str(i)
        s.write_to_fasta_file(outfile)
        i += 1
    outfile.close()
    return
Example #24
def summary(fastq_file, qual_scale="phred"):
    fastq_iterator = HTSeq.FastqReader(fastq_file, qual_scale)

    for sequence in itertools.islice(fastq_iterator, 1):
        qualsum = numpy.zeros(len(sequence), int)
        counts = numpy.zeros((len(sequence), 5), int)
    nsequence = 0
    for sequence in fastq_iterator:
        qualsum += sequence.qual
        nsequence += 1
        sequence.add_bases_to_count_array(counts)

    return (qualsum / float(nsequence), counts)
Example #25
def create_random_fastq(sourcefile, path, sizes=None):
    """Generate multiple random subsets of reads for testing"""

    fastqfile = HTSeq.FastqReader(sourcefile, "solexa")
    sequences = [s for s in fastqfile]
    print('source file has %s seqs' % len(sequences))
    if sizes == None:
        sizes = np.arange(5e5, 7.e6, 5e5)
    for s in sizes:
        label = str(s / 1e6)
        name = os.path.join(path, 'test_%s.fa' % label)
        create_random_subset(sequences=sequences, size=s, outfile=name)
    return
Example #26
def subSampleFastQSE(ReadFraction, FastQFileIn, FastQFileOut, Zip=False):
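    # Single-end subsampling: keep each read with probability ReadFraction and optionally gzip the result.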

    in1 = iter(HTSeq.FastqReader(FastQFileIn))
    out1 = open(FastQFileOut, "w")

    for read1 in in1:
        if random.random() < ReadFraction:
            read1.write_to_fastq_file(out1)

    in1.close()
    out1.close()
    if Zip:
        system('gzip %s' % (FastQFileOut))
Example #27
def move_barcode_to_name_in_fastq(filename, out_dir):
    if not os.path.exists(out_dir): os.system('mkdir ' + out_dir)
    outf = open(out_dir + '/%s' % os.path.basename(filename), 'w')
    fastq = HTSeq.FastqReader(filename)
    obs_let = set()
    # phred 33
    for read in fastq:
        if len(read.seq) < 14: continue
        if min(read.qualstr[:9]) < 53: continue
        n_read = HTSeq.SequenceWithQualities(
            read.seq[9:],
            read.name.partition(' ')[0] + '#' + read.seq[0:9].decode(),
            read.qualstr[9:])
        n_read.write_to_fastq_file(outf)
Example #28
def generate_seq_stats(seqfile, header, table=None, fastqfile=True):
    '''
    This function creates the JSON files table.j, hist.j and edges.j, which are the
    basis for the sequence statistics table and graph visualized in the Sequence
    distribution tab.
    If no table object is provided, headers are created and a table object is
    returned with two columns, headers and values.
    If a table object is provided, the function will add a new column to the table.

    table: existing table (for adding a column)
    seqfile: path to sequence file (fasta/fastq)
    header: name of column
    fastqfile: the function assumes a fastq file. "False" will accept fasta
    '''
    if not table:
        table = {
            'Statistic': [
                'Count (#)', 'Length (bp)', 'Over 100 bp', 'Over 500 bp',
                'Over 1000 bp', 'Over 5000 bp', 'Over 10000 bp',
                'Largest (bp)', 'Smallest (bp)', 'Average length (bp)',
                'Median (bp)', 'N50'
            ]
        }

    # Parse sequencefile
    if fastqfile:
        seqlengths = [
            len(s[0]) for s in HTSeq.FastqReader(seqfile, raw_iterator=True)
        ]
    else:
        seqlengths = [
            len(s[0]) for s in HTSeq.FastaReader(seqfile, raw_iterator=True)
        ]

    # Calculate statistics
    table[header] = []
    table[header].append(len(seqlengths))
    table[header].append(sum(seqlengths))
    table[header].append(len([x for x in seqlengths if x > 100]))
    table[header].append(len([x for x in seqlengths if x > 500]))
    table[header].append(len([x for x in seqlengths if x > 1000]))
    table[header].append(len([x for x in seqlengths if x > 5000]))
    table[header].append(len([x for x in seqlengths if x > 10000]))
    table[header].append(max(seqlengths))
    table[header].append(min(seqlengths))
    table[header].append(np.mean(seqlengths))
    table[header].append(np.median(seqlengths))
    table[header].append(calculate_n50(seqlengths))

    # Create histogram data
    hist, edges = np.histogram(seqlengths,
                               density=False,
                               bins=int(max(seqlengths) / 10))
    return (table, hist.tolist(), edges.tolist())
Example #29
def demultiplex(infile, outfile, sequences, seq2regex):
    fastq_file = HTSeq.FastqReader(infile)
    with open(outfile, "w+") as outf:
        for read in fastq_file:
            for sequence in sequences:
                if 'r' not in sequence:
                    continue
                match = re.search(seq2regex[sequence], read.seq)
                if not match:
                    continue
                barcode = HTSeq.Sequence(match.group(0))
                read2 = read.trim_left_end_with_quals(barcode)
                read2.write_to_fastq_file(outf)
                break
def readFile(filename, fileType):
    """

    :rtype : return type is DNA Sequence as a list of characters
    """
    fasta_file = ""  #dummy initialization
    if (fileType == FASTA):
        fasta_file = HTSeq.FastaReader(filename)
    elif (fileType == FASTQ):
        fasta_file = HTSeq.FastqReader(filename)
    sequence = ""
    for read in fasta_file:
        sequence += read.seq
    return (map(lambda x: x.upper(), list(sequence)))