def sequence_from_bedfile(fastafile, features=None, bedfile=None, pad5=0, pad3=0):
    """Fasta sequences from set of genomic features in a bed file.

    Args:
        fastafile: fasta file with genomic sequence
        features: dataframe of features/coords with bed file col names
            (chr, chromStart, chromEnd, strand and name are read)
        bedfile: optionally provide a bed file instead; when given it
            takes precedence over ``features``
        pad5,pad3: flanking sequence at 5' or 3' ends
    Returns:
        a pandas dataframe with name, seq and coords columns
    Raises:
        ValueError: if neither ``features`` nor ``bedfile`` is supplied
    """
    if bedfile is not None:
        features = utils.bed_to_dataframe(bedfile)
    if features is None:
        raise ValueError('either features or bedfile must be provided')

    # Imported after validation so bad arguments are reported even when
    # pybedtools is not installed.
    from pybedtools import BedTool

    rows = []
    for _, r in features.iterrows():
        forward = r.strand == '+'
        # On the reverse strand the 5' end is at chromEnd, so the pads swap.
        left, right = (pad5, pad3) if forward else (pad3, pad5)
        coords = (r.chr, r.chromStart - left, r.chromEnd + right)
        seq = str(BedTool.seq(coords, fastafile))
        if not forward:
            # HTSeq.Sequence is built from bytes elsewhere in this code
            # base; encode so the reverse complement also works there.
            seq = str(HTSeq.Sequence(seq.encode()).get_reverse_complement())
        rows.append([r['name'], seq, coords])
    return pd.DataFrame(rows, columns=['name', 'seq', 'coords'])
def test_pickle():
    """Round-trip HTSeq objects through pickle and compare attributes.

    Each case is pickled, unpickled, and (when it lists
    'assert_properties') checked attribute by attribute against the
    original object. Progress is printed at every step.
    """
    import pickle

    print('Test pickling and inpickling')

    cases = [
        {
            'name': 'HTSeq.Sequence',
            'object': HTSeq.Sequence(b'ACTG', 'sequence'),
            'assert_properties': ('seq', 'name', 'descr'),
        },
    ]

    for case in cases:
        label = case['name']
        original = case['object']

        print('Pickling ' + label)
        payload = pickle.dumps(original)
        print('Done')

        print('Unpickling ' + label)
        restored = pickle.loads(payload)
        print('Done')

        if 'assert_properties' not in case:
            continue
        print('Checking serialized/deserialized')
        for attr in case['assert_properties']:
            assert getattr(original, attr) == getattr(restored, attr)
        print('Done')

    print("Test passed")
def get_consensus(sites, seq_type, m=-1):
    """Return the consensus sequence of a set of sites as a string.

    Args:
        sites: sequence of mappings; ``site[seq_type]`` is converted with
            ``str()`` and counted as one motif occurrence.
        seq_type: key selecting the sequence inside each site.
        m: motif length. With the default of -1 the length is taken from
            the first site (0 when ``sites`` is empty).

    Returns:
        A string of length ``m``. Per position: the upper-case base when
        more than 75% of sites share it, the lower-case base when more
        than 50% share it, and "." otherwise.
    """
    bases = ['A', 'C', 'G', 'T', 'N']
    n = len(sites)  # number of sites
    if m == -1:
        # An empty input has no first site to measure; use length 0
        # instead of building a negative-sized array.
        m = len(sites[0][seq_type]) if sites else 0

    # rows correspond to positions in motif sequence
    # columns correspond to bases: "A", "C", "G", "T", and "N"
    # (builtin int replaces the removed np.int alias; same dtype)
    counts = np.zeros((m, 5), int)
    for s in sites:
        # HTSeq.Sequence is built from bytes elsewhere in this code base.
        seq = HTSeq.Sequence(str(s[seq_type]).encode())
        seq.add_bases_to_count_array(counts)

    base_idx = np.argmax(counts, 1)
    consensus = []
    for i in range(m):
        best = base_idx[i]
        fraction = counts[i, best] / float(n) if n > 0 else 0.0
        if fraction > 0.75:
            consensus.append(bases[best])
        elif fraction > 0.5:
            consensus.append(bases[best].lower())
        else:
            consensus.append(".")
    return "".join(consensus)
def filter_fasta(infile):
    """Copy a FASTA file to 'filtered.fa', dropping placeholder records.

    Records whose sequence is exactly 'Sequence unavailable' are skipped;
    all others are rewritten with their name and sequence (the
    description is read but not written).

    Args:
        infile: path of the FASTA file to read.
    """
    # Read everything before opening the output, as the original did, so
    # that filtering 'filtered.fa' onto itself does not truncate the input.
    records = [(s.name, s.seq, s.descr) for s in HTSeq.FastaReader(infile)]

    # Sequences are handled as bytes elsewhere in this code base, so
    # accept either spelling of the placeholder.
    placeholders = ('Sequence unavailable', b'Sequence unavailable')

    # The context manager closes (and flushes) the output even on error.
    with open('filtered.fa', "w") as out:
        for name, seq, _descr in records:
            if seq in placeholders:
                continue
            HTSeq.Sequence(seq, name).write_to_fasta_file(out)
    return
def readdna(filename):
    """
    Reads in the dna sequence of the given fasta.

    If the file holds several records, the last one is returned.

    @type  filename: string
    @param filename: Fasta-file used as input.
    @rtype:   HTSeq Sequence object
    @return:  Reference Fasta.
    @raise ValueError: if the file contains no records.
    """
    last_record = None
    for last_record in HTSeq.FastaReader(filename):
        pass

    if last_record is None:
        # Previously this surfaced as an UnboundLocalError.
        raise ValueError("no sequence found in fasta file: %s" % filename)

    # Rebuild from seq and name only, as before.
    return HTSeq.Sequence(last_record.seq, last_record.name)
def demultiplex(infile, outfile, sequences, seq2regex):
    """Write barcode-trimmed reads from a FASTQ file to ``outfile``.

    For every read, the entries of ``sequences`` that contain the letter
    'r' are tried in order. The first one whose regex
    (``seq2regex[entry]``) matches the read sequence is used: the matched
    text is trimmed from the left end of the read (with qualities) and
    the trimmed read is written. Reads with no match are dropped.
    """
    reader = HTSeq.FastqReader(infile)
    with open(outfile, "w+") as sink:
        for read in reader:
            for tag in sequences:
                if 'r' in tag:
                    hit = re.search(seq2regex[tag], read.seq)
                    if hit:
                        matched = HTSeq.Sequence(hit.group(0))
                        trimmed = read.trim_left_end_with_quals(matched)
                        trimmed.write_to_fastq_file(sink)
                        # only the first matching barcode is used
                        break
def sequence_from_coords(fastafile, coords):
    """Fasta sequence from genome coords.

    Args:
        fastafile: input fasta file
        coords: genome coordinates as a tuple of the form
            (chrom, start, end, strand)
    Returns:
        the sequence as a string, reverse-complemented unless strand is
        '+'; None if the file does not exist or retrieval fails (the
        error is printed).
    """
    from pybedtools import BedTool

    # Unpacking validates the shape of coords; only strand is used here.
    chrom, start, end, strand = coords
    if not os.path.exists(fastafile):
        print('no such file')
        return
    try:
        seq = str(BedTool.seq(coords, fastafile))
        if strand != '+':
            # reverse strand; HTSeq.Sequence is built from bytes
            # elsewhere in this code base, so encode first. A TypeError
            # here would otherwise be swallowed below and return None.
            seq = str(HTSeq.Sequence(seq.encode()).get_reverse_complement())
    except Exception as e:
        # best effort: report and return None
        print(e)
        return
    return seq
def trim_adapters(infile, adapter, outfile='cut.fastq', method='default'):
    """Trim an adapter from the right end of reads in a fastq file.

    Args:
        infile: input fastq file
        adapter: adapter sequence as a string
        outfile: path of the trimmed fastq file to write
        method: 'default' trims with HTSeq (exact match, input read with
            the "solexa" quality scale); 'cutadapt' runs the external
            cutadapt program. Any other value does nothing.
    Returns:
        None. Prints a message and returns early if adapter is not a str.
    Raises:
        subprocess.CalledProcessError: if cutadapt exits non-zero.
    """
    if not isinstance(adapter, str):
        print ('not valid adapter')
        return
    if method == 'default':
        # The context manager closes the output even if a read fails.
        with open(outfile, "w") as newfile:
            fastfile = HTSeq.FastqReader(infile, "solexa")
            # HTSeq.Sequence is built from bytes elsewhere in this code base.
            a = HTSeq.Sequence(adapter.encode())
            for s in fastfile:
                new = s.trim_right_end(a, mismatch_prop = 0.)
                new.write_to_fastq_file(newfile)
    elif method == 'cutadapt':
        # An argument list keeps paths with spaces or shell
        # metacharacters intact. NOTE: shell expansion such as '~' in
        # paths no longer happens.
        args = ['cutadapt', '-m', '18', '-O', '5', '-q', '20',
                '-a', adapter, infile, '-o', outfile]
        print (' '.join(args))
        subprocess.check_output(args)
    return
def dataframe_to_fasta(df, outfile='out.fa', seqkey='seq', idkey='id'):
    """Convert dataframe to fasta.

    Args:
        df: dataframe with one sequence per row
        outfile: path of the fasta file to write
        seqkey: column holding the sequence; rows without it are skipped
        idkey: column holding the record id. If it is not a column the
            index is reset first, and the row label is the fallback id.

    Sequences are upper-cased and 'U' is replaced by 'T' before writing.
    """
    if idkey not in df.columns:
        df = df.reset_index()
    # The context manager closes the file even if a row fails.
    with open(outfile, "w") as fastafile:
        for _, row in df.iterrows():
            try:
                seq = row[seqkey]
            except KeyError:
                # no sequence column for this row; nothing to write
                continue
            seq = seq.upper().replace('U', 'T')
            d = str(row[idkey]) if idkey in row else row.name
            myseq = HTSeq.Sequence(seq.encode(), str(d))
            myseq.write_to_fasta_file(fastafile)
    return
def match_read_primers(read1, read2, primer_sets):
    """Find the best-scoring amplicon primers for a read pair.

    For each amplicon, the start of read1 is scored with ``fuzz.ratio``
    against the reverse complement of the 'dlso' primer, and the start
    of read2 against the 'ulso' primer. The compared region has the
    length of the corresponding primer.

    Returns:
        A defaultdict (missing keys read as 0.0) holding, for each read,
        the best ratio, its amplicon and the read region compared:
        'ratio1', 'ratio1_amplicon', 'read1_primer_seq', and the same
        three keys for read 2.
    """
    best = defaultdict(float)

    for amplicon in primer_sets:
        primers = primer_sets[amplicon]
        region1 = read1.seq[:len(primers['dlso'])]
        region2 = read2.seq[:len(primers['ulso'])]

        dlso = HTSeq.Sequence(primers['dlso'], "DLSO")
        score1 = fuzz.ratio(region1, dlso.get_reverse_complement())
        score2 = fuzz.ratio(region2, primers['ulso'])

        # strictly greater: on ties the earlier amplicon is kept
        if score1 > best['ratio1']:
            best['ratio1_amplicon'] = amplicon
            best['ratio1'] = score1
            best['read1_primer_seq'] = region1

        if score2 > best['ratio2']:
            best['ratio2_amplicon'] = amplicon
            best['ratio2'] = score2
            best['read2_primer_seq'] = region2

    return best
def separate_demultiplex(libraries_dir, libname, libpath, lib2seq, sequences, seq2regex):
    """Split a library's FASTQ reads into one file per barcode number.

    For every read, the entries of ``sequences`` containing the letter
    'r' are tried in order. The first whose regex
    (``seq2regex[entry]``) matches the read sequence is used: the
    matched text is trimmed from the left end of the read (with
    qualities) and the read is stored under ``entry[:2]``. All reads are
    held in memory, then written to
    ``<libraries_dir>/<libname>/<libname>.filter-RNA.demulti.<barcode>.fastq``.
    A file is written for every barcode in ``lib2seq[libname]``, even
    when it received no reads.

    Raises:
        KeyError: if a matching entry's barcode number is not listed in
            ``lib2seq[libname]``.
    """
    fastq_file = HTSeq.FastqReader(libpath)

    # A plain dict of lists: the Counter used before returned 0 for
    # unknown barcodes, which failed with "'int' has no attribute 'append'".
    seq_storage = {barcode_num: [] for barcode_num in lib2seq[libname]}

    for read in fastq_file:
        for sequence in sequences:
            if 'r' not in sequence:
                continue
            match = re.search(seq2regex[sequence], read.seq)
            if not match:
                continue
            barcode_num = sequence[:2]
            if barcode_num not in seq_storage:
                raise KeyError(
                    "barcode {!r} is not listed for library {!r}".format(
                        barcode_num, libname))
            barcode = HTSeq.Sequence(match.group(0))
            seq_storage[barcode_num].append(
                read.trim_left_end_with_quals(barcode))
            # only the first matching barcode is used for a read
            break

    for barcode_num, reads in seq_storage.items():
        out_path = "{}/{}/{}.filter-RNA.demulti.{}.fastq".format(
            libraries_dir, libname, libname, barcode_num)
        with open(out_path, "w+") as outf:
            for read in reads:
                read.write_to_fastq_file(outf)
def sequence_from_coords(fastafile, coords):
    """Fetch the sequence at the given genome coordinates from a fasta file.

    Args:
        fastafile: input fasta file
        coords: genome coordinates as tuple of the form
            (chrom,start,end,strand)
    Returns:
        the sequence string, reverse-complemented when strand is '-';
        None when the file is missing or the lookup fails (the error is
        printed).
    """
    if not os.path.exists(fastafile):
        print ('no such file')
        return None

    chrom, start, end, strand = coords

    from pyfaidx import Fasta
    genome = Fasta(fastafile)

    try:
        # slice the chromosome record and convert to a plain string
        bases = str(genome[chrom][start:end])
        if strand != '-':
            return bases
        revcomp = HTSeq.Sequence(bases.encode()).get_reverse_complement()
        return str(revcomp)
    except Exception as err:
        print (err)
        return None
transcript[feature.attr["Parent"]]['CDSfeats'].append(feature.iv) ## Future worry: do I need CDS.frame in transcript object? CDSfeat[feature.iv] = feature print( "# Chrom\tPos\tPos in CDS\tBase change\tAA change\tAA pos in transcript\ttranscript ID" ) vcfr = HTSeq.VCF_Reader(sys.argv[3]) for vc in vcfr: vCDS = CDSfeat[vc.pos] # vCDS.iv.start is base before 1st base of CDS if not vCDS == None and not vc.pos.start == vCDS.iv.start: vTranscript = transcript[vCDS.attr["Parent"]] refseq = str( HTSeq.Sequence( sequences[vCDS.iv.chrom].seq[vCDS.iv.start:vCDS.iv.end])) refseqT = ''.join( str(HTSeq.Sequence(sequences[CDS.chrom].seq[CDS.start:CDS.end])) for CDS in vTranscript['CDSfeats']) relpos = vc.pos.start - vCDS.iv.start # if variant is 1st base of CDS, relpos=1 if refseq[relpos - 1] != vc.ref: print("ERROR: Reference Base not according to SNP") for alt in vc.alt: # vc.alt is a list of alternative base(s) alternateSeq = refseq[0:relpos - 1] + alt + refseq[relpos:] altSeqT = '' relposT = 0 exonFound = False transLen = 0 for exon in vTranscript['CDSfeats']: transLen += exon.length
def parse_matrix_scan(inFile):
    """
    Parses the RSAT matrix-scan output file and returns a list of dicts
    ordered by score (highest first).

    Keeps only one hit if 2 hits at the same position on both strands
    (the one with the higher score).
    Genomic coordinates are transformed from 1-based (in matrix-scan
    output format) into zero-based half open (like BED format) for
    internal representation and HTSeq compatibility.
    Assumes the "galaxy format" for sequence IDs in the first column.
    Lines starting with ';' or '#' and blank lines are ignored.
    """
    # dict for unique sites, keyed by one-based motif locus
    unique_sites = {}

    # count number of total input sites
    n_sites = 0

    # the context manager closes the input file when parsing ends
    with open(inFile) as handle:
        for line in handle:
            # ignore comment lines and blank lines
            if line.startswith((';', '#')) or not line.strip():
                continue

            sp = line.strip().split('\t')
            loc = sp[0]
            loc_parts = loc.split('_')
            chrom = loc_parts[1]

            # the peak-coordinates are assumed ONE-based in matrix-scan
            # output format
            peak_start = int(loc_parts[2]) - 1

            # the motif coordinates are ONE-based and relative to peak start
            start = peak_start + int(sp[4]) - 1
            end = peak_start + int(sp[5])
            strand = sp[3].replace('DR', '+').replace('D', '+').replace('R', '-')
            score = float(sp[7])

            # one based locus coordinates of the motif:
            motif_loc = chrom + ":" + str(start + 1) + "-" + str(end)

            # keep it only if score is greater than sites with same location
            if (motif_loc not in unique_sites
                    or score > unique_sites[motif_loc]["score"]):
                # HTSeq.Sequence is built from bytes elsewhere in this
                # code base.
                seq = HTSeq.Sequence(sp[6].encode(), loc)

                # store region as dict with all annotations
                unique_sites[motif_loc] = {
                    "chr": chrom, "start": start, "end": end,
                    "strand": strand, "score": score, "type": sp[1],
                    "ft_name": sp[2], "motif_seq": seq,
                    "motif_loc": motif_loc, "seq_id": loc,
                    "name": motif_loc,
                }

            n_sites += 1  # increase counter for total number of sites

    # get list of sites sorted by motif score; key= works on Python 2
    # and 3 and, like the old cmp=, keeps ties in their original order
    sorted_sites = sorted(unique_sites.values(),
                          key=lambda site: site['score'], reverse=True)

    print("INFO: Read {0} of {1} input regions.".format(
        len(sorted_sites), n_sites))

    return sorted_sites
* -i --input fastq file * -b --basename Three letter prefix which identifies a sample output: * -o --output fasta file ''' import argparse import HTSeq parser = argparse.ArgumentParser() parser.add_argument('-i', '--input', dest="input", required=True) parser.add_argument('-b', '--basename', dest="basename", required=True) parser.add_argument('-o', '--output', dest="output", required=True) args = parser.parse_args() uniques = {} for s in HTSeq.FastqReader(args.input): if s.seq in uniques: uniques[s.seq] += 1 else: uniques[s.seq] = 1 progressive_num = 1 with open(args.output, "w") as out: for sequence in uniques.keys(): name = "{0}_{1}_x{2}".format(args.basename, progressive_num, uniques[sequence]) fasta_read = HTSeq.Sequence(name=name, seq=sequence) progressive_num += 1 fasta_read.write_to_fasta_file(out)