Beispiel #1
0
def create_ref_bloom_filter(reference_file, error_rate, bf_file, format="fasta",
                            read_len=109):
    """Build a Bloom filter file from fixed-size windows of a reference.

    Only the first record of ``reference_file`` is used. Its sequence is cut
    into consecutive, non-overlapping windows of ``read_len`` characters (the
    last window may be shorter) and every window is added to the filter as a
    plain string.

    Args:
        reference_file: Path of the FASTA/FASTQ file to read.
        error_rate: Error rate handed to ``BloomFilter``.
        bf_file: Path of the Bloom filter file handed to ``BloomFilter``.
        format: Either ``"fasta"`` or ``"fastq"``; selects the parser.
        read_len: Window length in characters; must be positive.

    Raises:
        ValueError: If ``format`` is not supported or ``read_len`` is not
            positive.
    """
    # Fail early with a clear message instead of a NameError further down.
    if format not in ("fasta", "fastq"):
        raise ValueError(
            "unsupported format %r (expected 'fasta' or 'fastq')" % (format,)
        )
    if read_len <= 0:
        raise ValueError("read_len must be positive, got %r" % (read_len,))

    capacity = total_reads(reference_file)
    with open(reference_file) as handle:
        if format == "fasta":
            read_it = (rec.seq for rec in FastaIterator(handle))
        else:
            read_it = (seq for _, seq, _ in FastqGeneralIterator(handle))

        bf = BloomFilter(capacity, error_rate, bf_file)
        try:
            # next() works on both Python 2 and 3; a file without records
            # is treated as an empty sequence, so nothing is added.
            sequence = next(read_it, "")

            for start in range(0, len(sequence), read_len):
                # Slice the full window: the end index is exclusive, so no
                # "- 1" is needed (that dropped one character per window).
                read = str(sequence[start:start + read_len])
                print(read)
                # add() stores the window as one item; update() would
                # iterate the string and store single characters instead.
                bf.add(read)
        finally:
            # Release the filter even if parsing fails.
            bf.close()
Beispiel #2
0
    def create(infile, outfile, capacity: int, error_rate: float = 0.05):
        """Fill a new Bloom filter file with the lines of ``infile``.

        Each line is added with trailing whitespace removed. Lines that
        contain ``%`` are first URL-unquoted and lower-cased; other lines are
        added unchanged apart from the trailing-whitespace strip.

        Args:
            infile: Path of the text file to read, one entry per line.
            outfile: Path of the Bloom filter file handed to ``BloomFilter``.
            capacity: Capacity handed to ``BloomFilter``; also used as the
                progress-bar total.
            error_rate: Error rate handed to ``BloomFilter``.
        """
        # Import the submodule explicitly: a bare ``import urllib`` does not
        # guarantee that ``urllib.parse`` is loaded.
        import urllib.parse

        import tqdm
        from pybloomfilter import BloomFilter

        bf = BloomFilter(capacity, error_rate, outfile)
        try:
            with open(infile) as f:
                for word in tqdm.tqdm(f, total=capacity):
                    if "%" in word:
                        word = urllib.parse.unquote(word).lower()
                    bf.add(word.rstrip())
        finally:
            # Release the filter even if reading the input fails.
            bf.close()
Beispiel #3
0
def process(files):
    """Print every input line that the Bloom filter has not seen before.

    Lines are read with ``fileinput.input(files)``, stripped of surrounding
    whitespace, and tested against the filter. A line that is not yet in the
    filter is added to it and printed; a line the filter reports as present
    is skipped.

    Args:
        files: File name(s) passed straight to ``fileinput.input``. Per the
            ``fileinput`` documentation, ``'-'`` stands for ``sys.stdin`` and
            an empty list falls back to ``sys.stdin``.
    """
    # Reuse the filter file when it already exists, otherwise create it.
    if os.path.isfile(bloomfile):
        seen = BloomFilter.open(bloomfile)
    else:
        seen = BloomFilter(MAXUNIQUES, ACCUACY, bloomfile)

    try:
        for line in fileinput.input(files):
            record = str(line).strip()
            if record not in seen:
                seen.add(record)
                # Single-argument call form prints identically on
                # Python 2 and Python 3.
                print(record)
        seen.sync()
    finally:
        # Release the filter even if reading the input fails.
        seen.close()
Beispiel #4
0
def process(files):
    """Print every input line that the Bloom filter has not seen before.

    Lines are read with ``fileinput.input(files)``, stripped of surrounding
    whitespace, and tested against the filter. A line that is not yet in the
    filter is added to it and printed; a line the filter reports as present
    is skipped.

    Args:
        files: File name(s) passed straight to ``fileinput.input``. Per the
            ``fileinput`` documentation, ``'-'`` stands for ``sys.stdin`` and
            an empty list falls back to ``sys.stdin``.
    """
    # Reuse the filter file when it already exists, otherwise create it.
    if os.path.isfile(bloomfile):
        seen = BloomFilter.open(bloomfile)
    else:
        seen = BloomFilter(MAXUNIQUES, ACCUACY, bloomfile)

    try:
        for line in fileinput.input(files):
            record = str(line).strip()
            if record not in seen:
                seen.add(record)
                # Single-argument call form prints identically on
                # Python 2 and Python 3.
                print(record)
        seen.sync()
    finally:
        # Release the filter even if reading the input fails.
        seen.close()