from Bio.SeqIO.FastaIO import FastaIterator
from Bio.SeqIO.QualityIO import FastqGeneralIterator
from pybloomfilter import BloomFilter


def create_ref_bloom_filter(reference_file, error_rate, bf_file, format="fasta"):
    """From a given FASTA/FASTQ reference file, create a bloom filter file
    containing fixed-length windows ("reads") of the first reference sequence.
    """
    if format == "fasta":
        file_it = FastaIterator
        record = lambda it: (seq.seq for seq in it)
    elif format == "fastq":
        file_it = FastqGeneralIterator
        record = lambda it: (seq for _, seq, _ in it)
    else:
        raise ValueError("format must be 'fasta' or 'fastq'")

    # total_reads() is a helper defined elsewhere in this module; it sizes the filter.
    capacity = total_reads(reference_file)
    read_len = 109  # window length used when splitting the reference sequence

    with open(reference_file) as handle:
        read_it = record(file_it(handle))
        bf = BloomFilter(capacity, error_rate, bf_file)
        # Only the first record in the file is processed.
        sequence = next(read_it)
        step = read_len
        i = 0
        while i < len(sequence):
            # Take the next non-overlapping window of read_len characters and
            # add it as a single element (update() would add each character).
            read = str(sequence[i:i + read_len])
            i += step
            bf.add(read)
        bf.close()
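
# Hedged usage sketch: the file names and error rate below are hypothetical, and
# the call still relies on a total_reads() helper being defined elsewhere in this
# module; adjust to the real inputs before running.
if __name__ == "__main__":
    create_ref_bloom_filter("reference.fa", error_rate=0.005, bf_file="reference.bloom")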
def create(infile, outfile, capacity: int, error_rate: float = 0.05):
    """Build a bloom filter file from a newline-delimited word list."""
    import urllib.parse

    import tqdm
    from pybloomfilter import BloomFilter

    bf = BloomFilter(capacity, error_rate, outfile)
    with open(infile) as f:
        for word in tqdm.tqdm(f, total=capacity):
            # URL-encoded entries are decoded and lowercased before insertion.
            if "%" in word:
                word = urllib.parse.unquote(word).lower()
            word = word.rstrip()
            bf.add(word)
    bf.close()
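
# Hedged usage sketch: builds a filter from a newline-delimited word list.
# "wordlist.txt", "words.bloom", and the capacity below are placeholder values,
# not taken from the original code; capacity should match the number of lines.
if __name__ == "__main__":
    create("wordlist.txt", "words.bloom", capacity=1_000_000, error_rate=0.01)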
import fileinput
import os

from pybloomfilter import BloomFilter


def process(files):
    """Print each record the first time it is seen, using a persistent bloom filter.

    Iterates over the lines of all files in `files`, defaulting to sys.stdin if
    the list is empty. If a filename is '-', it is also replaced by sys.stdin.
    """
    if os.path.isfile(bloomfile):
        # Reuse the existing on-disk filter so previously seen records stay deduplicated.
        UNIQUES = BloomFilter.open(bloomfile)
    else:
        UNIQUES = BloomFilter(MAXUNIQUES, ACCUACY, bloomfile)

    for record in fileinput.input(files):
        record = record.strip()
        if record not in UNIQUES:
            UNIQUES.add(record)
            print(record)

    UNIQUES.sync()
    UNIQUES.close()
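
# Hedged usage sketch: module-level settings that process() expects, with assumed
# placeholder values, plus a command-line entry point. None of these values come
# from the original code.
import sys

bloomfile = "uniques.bloom"   # assumed path for the persisted filter
MAXUNIQUES = 10_000_000       # assumed upper bound on distinct records
ACCUACY = 0.001               # assumed false-positive rate (identifier kept from the original)

if __name__ == "__main__":
    process(sys.argv[1:])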