Ejemplo n.º 1
0
class TestContains(unittest.TestCase):
    """Membership checks on an Elf that was filled one key at a time."""

    def setUp(self):
        # Elf(100): the argument is presumably the expected number of
        # items -- confirm against the Elf constructor.
        self.e = Elf(100)
        for i in range(100):
            self.e.add(str(i))

    def testContains(self):
        # Every key added in setUp must be reported as present.
        # assertTrue replaces the deprecated assert_ alias, which was
        # removed from unittest in Python 3.12.
        for i in range(100):
            key = str(i)
            self.assertTrue(key in self.e, "%r not found in Elf" % key)
Ejemplo n.º 2
0
class TestAddMany(unittest.TestCase):
    """Bulk insertion into an Elf through addmany() and update()."""

    def setUp(self):
        # Each test starts from an Elf with nothing added yet.
        self.e = Elf(100)

    def _assertAllPresent(self):
        # Shared check: the strings "0".."99" must all be members.
        # assertTrue replaces the deprecated assert_ alias, which was
        # removed from unittest in Python 3.12.
        for i in range(100):
            key = str(i)
            self.assertTrue(key in self.e, "%r not found in Elf" % key)

    def testAdd(self):
        # addmany() is handed a generator, not a materialized list.
        self.e.addmany(str(i) for i in range(100))
        self._assertAllPresent()

    def testUpdate(self):
        # update() is exercised with the same generator input as addmany().
        self.e.update(str(i) for i in range(100))
        self._assertAllPresent()
Ejemplo n.º 3
0
class TestSerialize(TestContains):
    """Round-trip an Elf through save() and Elf.load().

    Because this class derives from TestContains, the inherited
    testContains also runs against the Elf built here.
    """

    def setUp(self):
        # Same population as TestContains, then written to disk so that
        # testSave and testLoad have a file to inspect.
        self.e = Elf(100)
        for i in range(100):
            self.e.add(str(i))
        self.e.save("_t.elf")

    def testSave(self):
        # save() must have produced the file in the working directory.
        # assertTrue replaces the deprecated assert_ alias, which was
        # removed from unittest in Python 3.12.
        self.assertTrue(os.path.exists("_t.elf"), "_t.elf was not written")

    def testLoad(self):
        # Spot-check two of the saved keys on a freshly loaded Elf.
        e = Elf.load("_t.elf")
        self.assertTrue("1" in e, "'1' not found in loaded Elf")
        self.assertTrue("10" in e, "'10' not found in loaded Elf")

    def tearDown(self):
        # Remove the file written by setUp.
        os.unlink("_t.elf")
Ejemplo n.º 4
0
__doc__ %= sys.argv[0]
if len(sys.argv) > 2:
    print sys.argv
    print __doc__
    sys.exit()

print >> sys.stderr, "Command: ", " ".join(sys.argv)
infile = sys.argv[1]
fp = FastQParser(infile)
for _ in fp:
    pass
records = fp.rread()
print >> sys.stderr, records, "records in file ", infile

# say 1 out of 1000 is false positive.
bloom = Elf(records, error_rate=1e-3)
fp.seek(0)
checks = []
for _, seq, _, _ in fp:
    if seq in bloom:
        checks.append(seq)
    bloom.add(seq)

# now checks contains anything that could be a duplicate according to
# the bloomfilter. for some, they were false positives.
# for actual duplicated, just choose the first, but can also sort by quality.
fp.seek(0)
checks = frozenset(checks)
print >>sys.stderr, "checking %s potential duplicates in a python set" \
                                            % len(checks)
outfile = "%s-unique.fastq.gz" % infile.split(".")[0]
Ejemplo n.º 5
0
 def setUp(self):
     # Start from an Elf built with 100 and nothing added to it yet.
     empty_filter = Elf(100)
     self.e = empty_filter
Ejemplo n.º 6
0
 def testLoad(self):
     # Load the Elf back from _t.elf and spot-check two keys.
     # assertTrue replaces the deprecated assert_ alias, which was
     # removed from unittest in Python 3.12.
     e = Elf.load("_t.elf")
     self.assertTrue("1" in e, "'1' not found in loaded Elf")
     self.assertTrue("10" in e, "'10' not found in loaded Elf")
Ejemplo n.º 7
0
 def setUp(self):
     # Fill an Elf with the strings "0".."99", then write it to _t.elf.
     populated = Elf(100)
     self.e = populated
     for key in map(str, range(100)):
         populated.add(key)
     populated.save("_t.elf")
Ejemplo n.º 8
0
 def setUp(self):
     # Build the Elf under test and add the strings "0".."99" one by one.
     self.e = Elf(100)
     add_key = self.e.add
     for number in range(100):
         add_key(str(number))
Ejemplo n.º 9
0
    %s < in.fastq > out.unique.fastq
"""
from bloomfaster import Elf
import collections
import sys
# Substitute the script name into the usage text held in the module docstring.
__doc__ %= sys.argv[0]
# Input arrives on stdin, so any command-line argument is answered with
# the usage text.
if len(sys.argv) > 1:
    print sys.argv
    print __doc__
    sys.exit()

# First pass over stdin: count the lines and halve the count to size the
# filter.
# NOTE(review): dividing by 2 implies two lines per record -- confirm this
# matches the input format.
records = sum(1 for _ in sys.stdin) / 2
print >>sys.stderr, records, "records in file"

# say 1 out of 1000 is false positive.
bloom = Elf(records, error_rate=1e-3)
# Rewind for the second pass.
# NOTE(review): seek() presumably requires stdin to be a redirected file
# (as in the usage line, "< in.fastq") rather than a pipe -- confirm.
sys.stdin.seek(0)
readline = sys.stdin.readline

checks = []
header = readline().rstrip()
# Lines are consumed in pairs: a header line, then its sequence line.
# The loop stops at the first header that is empty after rstrip().
while header:
    seq = readline().rstrip()
    
    # A hit means the filter reports seq as already added; keep it as a
    # candidate duplicate. The sequence is added to the filter either way.
    if seq in bloom:
        checks.append(seq)
    bloom.add(seq)
    header = readline().rstrip()

# now checks contains anything that could be a duplicate according to
# the bloomfilter. for some, they were false positives.