Example 1
import sys

import tre
from Bio import SeqIO

import gene_lib


def filter_potential_sines(in_fname,
                           sine_string,
                           sine_header=67,
                           maxerr=19,
                           reverse_complement=False):
    """
    Finds candidate SINEs: records whose sequence matches the first
    `sine_header` bases of `sine_string` within `maxerr` edits.
    Intended for preliminary screening (its output feeds later steps).
    """
    with gene_lib.open_compressed(in_fname, 'rt') as in_file_handle:
        records = SeqIO.parse(in_file_handle, format="fastq")
        re = tre.compile(sine_string[:sine_header], tre.EXTENDED)
        fuzziness = tre.Fuzzyness(maxerr=maxerr)

        for rec in records:
            if reverse_complement:
                cur_seq = rec.seq.reverse_complement()
            else:
                cur_seq = rec.seq

            match = re.search(str(cur_seq), fuzziness)
            if match:
                # match.groups() returns a tuple of (start, end) pairs, e.g. ((2, 78),)
                SeqIO.write(rec, sys.stdout, 'fastq')
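A minimal usage sketch for the filter above (the file name and SINE prefix below are hypothetical):

# Hypothetical inputs: a gzipped FASTQ file and a SINE prefix string.
B1_PREFIX = "GCCGGGCGTGGTGGCGCACGCCTTTAATCCCAGCA"

# Matching records are written to stdout in FASTQ format.
filter_potential_sines("reads.fastq.gz", B1_PREFIX, sine_header=30, maxerr=5)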
Example 2
    def check(self, path):
        """ the real check """
        super(UniventionPackageCheck, self).check(path)

        fz = tre.Fuzzyness(maxerr=2)
        pt = tre.compile("\<univention\>", tre.EXTENDED | tre.ICASE)

        for fn in uub.FilteredDirWalkGenerator(path):
            with open(fn, 'r') as fd:
                for lnr, line in enumerate(fd, start=1):
                    origline = line
                    if UniventionPackageCheck.RE_WHITELINE.match(line):
                        continue
                    pos = 0
                    while True:
                        m = pt.search(line[pos:], fz)
                        if not m:
                            break
                        if not UniventionPackageCheck.RE_WHITEWORD.match(m[0]):
                            self.debug('%s:%d: found="%s"  origline="%s"' %
                                       (fn, lnr, m[0], origline))
                            self.addmsg(
                                '0015-2',
                                'univention is incorrectly spelled: %s' % m[0],
                                filename=fn,
                                line=lnr)
                        # Advance past the end of this match before searching again.
                        pos += m.groups()[0][1]
Example 3
    def __init__(self, regex, target_group=0, maxerr=1, caseSensitive=True):
        self.regex = regex
        self.target_group = target_group
        self.fuzzyness = tre.Fuzzyness(maxerr=maxerr)
        if not caseSensitive:
            self.r = tre.compile(regex, tre.ICASE | tre.EXTENDED)
        else:
            self.r = tre.compile(regex, tre.EXTENDED)
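The __init__ above only configures a matcher; a minimal sketch of the equivalent direct tre calls (the pattern and input string are illustrative, not from the original):

import tre

fz = tre.Fuzzyness(maxerr=1)
pat = tre.compile("colou?r", tre.ICASE | tre.EXTENDED)

m = pat.search("the COLER is wrong", fz)
if m:
    # m[0] is the matched substring; m.cost is the edit distance actually used.
    print(m[0], m.cost)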
Example 4
import tre


def filter_potential_sines(records, sine_pattern, sine_header=67, maxerr=14):
    # Match only the first `sine_header` bases of the SINE, allowing `maxerr` edits.
    re = tre.compile(sine_pattern[:sine_header], tre.EXTENDED)
    fuzziness = tre.Fuzzyness(maxerr=maxerr)

    for rec in records:
        match = re.search(str(rec.seq), fuzziness)
        if match:
            yield rec
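A hedged usage sketch for this generator version (file name and SINE pattern are hypothetical; assumes Biopython):

from Bio import SeqIO

with open("reads.fastq") as handle:
    records = SeqIO.parse(handle, "fastq")
    for rec in filter_potential_sines(records, "GCCGGGCGTGGTGGCGCACGCCTTTAATCCCAGCA"):
        print(rec.id)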
Example 5
    def search_cdr3_seq(self, seq, mmaxerr, end_to_end=True):
        # Try increasing error budgets so the first match found uses the
        # smallest possible maxerr.
        for maxerr in range(0, 1 + mmaxerr):
            fuzzyness = tre.Fuzzyness(maxerr=maxerr)
            for p in (self.patterns_end_to_end
                      if end_to_end else self.patterns_any):
                if p[1].search(seq, fuzzyness):
                    return p[0]
        return np.nan
Example 6
def new_SINES_filter_for_histogram(in_file_initial_filtering, main_dict,
                                   noDuplicate, distribution_of_neighbors,
                                   lenght, key_size=9, maxerr=3):
    fuzziness = tre.Fuzzyness(maxerr=maxerr)

    with open_any(in_file_initial_filtering, "rt") as handle_read_initial_filtering:
        records = gene_records_parse(handle_read_initial_filtering)
        new_SINES_filter_proc_histogram(records, main_dict, noDuplicate,
                                        key_size, fuzziness,
                                        distribution_of_neighbors, lenght)
Example 7
def new_SINES_filter_for_graph(in_file_initial_filtering,
                               main_dict,
                               i=0,
                               key_size=9,
                               maxerr=3):
    fuzziness = tre.Fuzzyness(maxerr=maxerr)

    with open_compressed(in_file_initial_filtering,
                         "rt") as handle_read_initial_filtering:

        records = gene_records_parse(handle_read_initial_filtering)

        new_SINES_filter_proc_graph(records, main_dict, key_size, fuzziness, i)
Example 8
def getNTweetsWithFuzz(locations, numTweets, fuzz):
    """ 
    Give this function like 500 results if numTweets is 200 so it has enough to pull entries
    with the requested fuzz/cost. We use this function to check the precision of the algorithm
    at various edit distances. These will be unique matches,
    """

    global fz
    fz = tre.Fuzzyness(maxerr=fuzz)

    res = []

    print()
    print("Starting search...")

    startTime = time.time()

    seen = set()

    while len(res) < numTweets:
        # Pass the function chunks until it finds enough hits with the requested cost.
        randomLocations = [
            locations.pop(random.randrange(len(locations)))
            for _ in range(numTweets * 100)
        ]
        print("Testing {} random locations.".format(len(randomLocations)))
        res = res + mainSearch(randomLocations, fuzz)

        # Filter matches we've already had and items of the wrong cost.
        new = []
        for r in res:
            if r["match"] not in seen and r["cost"] == fuzz:
                new.append(r)
                seen.append(r["match"])

        res = new
        print("Number of tweets at cost {} found so far: {}".format(
            fuzz, len(res)))

    print("--- %s seconds ---" % (time.time() - startTime))

    return res[:numTweets]
Example 9
def filter_potential_sines_and_locations(in_file_unify,
                                         in_file_sine,
                                         out_file_with_sine,
                                         out_file_location,
                                         sine_header=67,
                                         maxerr=14):
    sine = gene_lib.get_sine_forward(in_file_sine)  # e.g. "B1.fasta"
    re = tre.compile(sine[:sine_header], tre.EXTENDED)
    fuzziness = tre.Fuzzyness(maxerr=maxerr)

    with open_compressed(in_file_unify, "rt") as handle_read, \
      open_compressed(out_file_with_sine, "wt") as handle_write_sine,\
      open_compressed(out_file_location, "wt") as handle_write_loc:

        records = gene_records_parse(handle_read)
        filter_potential_sines_and_locations_proc(records, re, fuzziness,
                                                  handle_write_sine,
                                                  handle_write_loc)
Example 10
def mainSearch(locations, fuzz):
    """
    Main functionality: tests all 1.3 million locations against the small tweet file.
    The majority of the study was conducted with this function.
    """
    # Creating the fuzziness object. This maxerr represents the max local edit distance.
    global fz  # The global fz prevents having to pass fz to checkLocations each call in pool.map
    fz = tre.Fuzzyness(maxerr=fuzz)

    print()
    print("Starting search...")

    startTime = time.time()

    pool = Pool(16)
    res = pool.map(checkLocations, chunkGen(locations))

    res = [item for sublist in res
           for item in sublist]  # Flattening the list of dicts.

    print("--- %s seconds ---" % (time.time() - startTime))

    return res
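checkLocations and chunkGen are not shown in these snippets; a hypothetical sketch consistent with how mainSearch uses them (the target pattern is illustrative):

import tre

pt = tre.compile("new york city", tre.EXTENDED | tre.ICASE)  # illustrative pattern

def chunkGen(items, size=1000):
    # Yield fixed-size chunks so each pool worker processes a batch.
    for i in range(0, len(items), size):
        yield items[i:i + size]

def checkLocations(chunk):
    # Reads the module-global fz set by mainSearch / getNTweetsWithFuzz.
    hits = []
    for loc in chunk:
        m = pt.search(loc, fz)
        if m:
            hits.append({"match": m[0], "cost": m.cost, "location": loc})
    return hits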
Example 11
import nltk

import gene_lib
import tre


def showResult(file_centers, in_file_sine, sine_header=67, maxerr=19):
    total = 0
    hist = {}
    sine = gene_lib.get_sine_forward(in_file_sine)  # e.g. "B1.fasta"
    re = tre.compile(sine[:sine_header], tre.EXTENDED)
    print('original sine', sine)
    fuzziness = tre.Fuzzyness(maxerr=maxerr)
    with open(file_centers, "r") as centerFile:
        for line in centerFile:
            currentLine = line.strip()
            match = re.search(currentLine, fuzziness)
            if not match:
                continue
            # match.groups() returns a tuple of (start, end) pairs.
            start, end = match.groups()[0]
            # Exact edit distance between the matched SINE slice and this center.
            dist = nltk.edit_distance(sine[start:end], currentLine)
            hist[dist] = hist.get(dist, 0) + 1
            total += dist
        print(total / 1000)  # mean distance, assuming 1000 centers
        print(sorted(hist.items()))
Example 12
def filter_potential_sines_and_locations(in_file_unify,
                                         in_file_sine,
                                         out_file_with_sine,
                                         out_file_location,
                                         sine_header=67,
                                         maxerr=14):
    sine = gene_lib.get_sine_forward(in_file_sine)  # "B1.fasta"
    re = tre.compile(sine[:sine_header], tre.EXTENDED)
    fuzziness = tre.Fuzzyness(maxerr=maxerr)

    with open_compressed(in_file_unify, "rt") as handle_read, \
            open_compressed(out_file_with_sine, "wt") as handle_write_sine, \
            open_compressed(out_file_location, "wt") as handle_write_loc:

        records = gene_records_parse(handle_read)

        for rec in tqdm(records, miniters=100):
            match = re.search(str(rec.seq), fuzziness)
            if match:
                sine_location = match.groups()
                gene_record_write(rec, handle_write_sine, 'fasta')
                handle_write_loc.write(
                    ",".join([str(i) for i in sine_location[0]]) + "\n")
Example 13
import tre

fz = tre.Fuzzyness(maxerr=3)
print(fz)

pt = tre.compile("Don(ald( Ervin)?)? Knuth", tre.EXTENDED)
data = """
In addition to fundamental contributions in several branches of
theoretical computer science, Donnald Erwin Kuth is the creator of the
TeX computer typesetting system, the related METAFONT font definition
language and rendering system, and the Computer Modern family of
typefaces.

"""

m = pt.search(data, fz)

if m:
    print(m.groups())
    print(m[0])
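The match object also exposes the total edit cost and the match offsets; a small hedged extension of the demo above:

# m.cost is the edit distance of the approximate match (likely 3 here:
# one edit in each of "Donnald", "Erwin" and "Kuth").
print(m.cost)
start, end = m.groups()[0]
print(data[start:end])  # the misspelled name as matched in the text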
Example 14
def new_SINES_filter(in_file_initial_filtering,
                     out_file_new_SINES,
                     out_file_inherited_SINES,
                     main_dict,
                     key_size=9,
                     maxerr=3):

    fuzziness = tre.Fuzzyness(maxerr=maxerr)

    # Create slave processes
    procs = []
    for _ in range(multiprocessing.cpu_count() - 3):
        # Create a communication queue between this process and the slave process
        q = GeneDQueue()

        # Create and start slave process
        p = Process(target=new_SINES_filter_proc,
                    args=(q, main_dict, key_size, fuzziness))
        p.start()

        procs.append({'p': p, 'q': q, 'batch': [], 'write_i': 0})

    with open_compressed(in_file_initial_filtering, "rt") as handle_read_initial_filtering,\
         open_compressed(out_file_new_SINES, "wt") as handle_write_new,\
         open_compressed(out_file_inherited_SINES, "wt") as handle_write_inherited:

        records = gene_records_parse(handle_read_initial_filtering)
        rec_i = 0
        for rec in tqdm(records):
            # Simple round-robin between the slave processes
            proc = procs[rec_i % len(procs)]
            # Append the record to this slave's local batch
            proc['batch'].append(rec)

            if len(proc['batch']) >= 10:
                new_SINES_filter_write(proc['q'], handle_write_inherited,
                                       handle_write_new)

                # Put batch of new records into slave process queue
                proc['q'].put(proc['batch'])

                # Reset local batch of slave process
                proc['batch'] = []

            # Uncomment for testing a small amount of records
            # if rec_i == 100000:
            #     break

            rec_i += 1

        print_step("cleanup")

        # Cleanup slave processes
        for proc in procs:
            # Drain found potential SINEs from the slave queue before the last batch
            new_SINES_filter_write(proc['q'], handle_write_inherited,
                                   handle_write_new)

        for proc in procs:
            # Put the last batch, if available
            if len(proc['batch']):
                proc['q'].put(proc['batch'])
                proc['batch'] = []

        for proc in procs:
            # Make the slave process terminate
            proc['q'].put(None)

        for proc in procs:
            # Drain found potential SINEs from the slave queue one last time
            new_SINES_filter_write(proc['q'],
                                   handle_write_inherited,
                                   handle_write_new,
                                   wait_none=True)

        for proc in procs:
            # Wait for termination
            proc['p'].join()
Example 15
def search_sines(sines,
                 r1_f,
                 override=0,
                 upper_mut_dist=20,
                 step_print=1000000,
                 nlines=100000000,
                 sine_l=70):

    print('override =', override)
    sine_set = []
    stats = collections.Counter()

    global bar_codes
    bar_codes = {}

    global detailed_stats
    detailed_stats = collections.Counter()

    global distances_from_combined_regexp
    distances_from_combined_regexp = {}

    complete_regexp = '|'.join([sine[:sine_l] for sine in sines])
    p = tre.compile(complete_regexp, tre.EXTENDED)

    if override == 1:
        bases = ['A', 'C', 'G', 'T']
        ind_list = [random.randrange(4) for i in range(sine_l)]
        r_sine = ''.join([bases[ind_list[i]] for i in range(sine_l)])
        r_sine_rc = ''.join([bases[3 - ind_list[i]] for i in range(sine_l)])
        sine_set = [r_sine, r_sine_rc]
        complete_regexp = '|'.join(sine_set)
        p = tre.compile(complete_regexp, tre.EXTENDED)

    # Also specifies the shift range
    if override > 1:
        if override > 2:
            d = override - 1  #random.randrange(2, override)
            print('skipping ', d)
            for (i, cur_seq) in enumerate(r1_f):
                if i == d:
                    break

        sine_set = []
        for (i, s) in enumerate(r1_f):
            cur_seq = Seq(s[:sine_l], IUPAC.IUPACAmbiguousDNA())
            cur_seq_rc = cur_seq.reverse_complement()
            sine_set.append(str(cur_seq))
            sine_set.append(str(cur_seq_rc))
            if i == 2:
                break

        complete_regexp = '|'.join(sine_set)
        p = tre.compile(complete_regexp, tre.EXTENDED)

    total = 0
    cnt = 0
    start_time = time()
    print('sequences = ')

    # bar_code_len = 60
    for cur_seq in r1_f:
        total += 1
        m = p.search(cur_seq, tre.Fuzzyness(maxerr=sine_l - 10))
        if m:
            res = m.group(0)
            d = m.cost
            # Filter out strings that were cut out. Approximate by max-length matches
            # 10 is arbitrary, not very small
            # barcodes are not in place here

            stats[d] += 1
            bar_code_min_len = 23
            #  if (m.groups()[0][1] < len(cur_seq) - 5) and (d <= upper_mut_dist):

            if (m.groups()[0][0] >= bar_code_min_len) and (d <= upper_mut_dist):
                cnt += 1
                detailed_stats[res] += 1
                bar_code = cur_seq[m.groups()[0][0] -
                                   bar_code_min_len:m.groups()[0][0]]

                bar_codes.setdefault(bar_code, 0)
                bar_codes[bar_code] += 1

        #    distances_from_combined_regexp[res] = d

        if (total % step_print == 0) or (total == nlines):
            print('stats for first', total, 'segments\n')
            print('========================')
            print('time elapsed', (time() - start_time) / 60.0, 'minutes')

            for k in sorted(stats):
                print('edit distance =', k, 'matches =', stats[k], '/', cnt)
        #   pprint.pprint(collections.Counter(detailed_stats.values()))

        if (total == nlines):
            return bar_codes
Example 16
def merged_paired_ends(records1, records2):
    tot_good = 0
    tot_great = 0
    tot = 0
    for (rec1, rec2) in zip(records1, records2):
        tot += 1
        str1 = str(rec1.seq)
        str2 = str(rec2.seq.reverse_complement())
        end1 = str1[-common_req:]
        re = tre.compile(end1, tre.EXTENDED)
        # we expect small errors here
        res_seq = None
        match = re.search(str2, tre.Fuzzyness(maxerr=init_err))
        if match:
            tot_good += 1
            match_loc = match.groups()[0][0]
            to_search_len = match_loc + common_req
            fuzzyness = max(tot_err, ceil(0.1 * to_search_len))
            re = tre.compile(str1[-to_search_len:], tre.EXTENDED)
            match_tot = re.search(str2, tre.Fuzzyness(maxerr=fuzzyness))
            if match_tot:
                tot_great += 1
                # An arbitrary decision: take the common string from r2
                res_str = str1[:-to_search_len] + str2
                # TODO: preserve qualities
                res_seq = SeqRecord(Seq(res_str),
                                    id=rec1.id,
                                    name=rec1.name,
                                    description=rec1.description,
                                    letter_annotations={
                                        "phred_quality":
                                        [30 for i in range(len(res_str))]
                                    })
                if (tot_great % step == 0):
                    log('nicely matched ', str1, '\n', str2, to_search_len,
                        match_tot.group(0), match.group(0), match_tot.cost,
                        match.cost)
                yield res_seq
                continue

        res_str = str1 + ('N' * padding) + str2
        res_seq = SeqRecord(Seq(res_str),
                            id=rec1.id,
                            name=rec1.name,
                            description=rec1.description,
                            letter_annotations={
                                "phred_quality":
                                [30 for i in range(len(res_str))]
                            })
        if (tot % step == 0):
            log(tot, tot_good, tot_great)
        yield res_seq
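merged_paired_ends relies on module-level helpers and settings the snippet does not show; hypothetical imports and values that would make it self-contained (names come from the code above, values are illustrative only):

from math import ceil

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

common_req = 20   # bases of overlap required between the two read ends
init_err = 2      # edit-distance budget for the initial seed match
tot_err = 5       # baseline budget when re-matching the full overlap
padding = 10      # number of 'N's inserted when the ends cannot be merged
step = 1000       # progress-logging interval

def log(*args):
    # Stand-in for the project's logging helper.
    print(*args)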
Example 17
def search_sines(sine_f, r1_f, override=0, upper_mut_dist=30,
                 step_print=10000, nlines=500000, sine_l=80):
    print('override =', override)
    sine_set = []
    stats = collections.Counter()

    global bar_codes
    bar_codes = {}
    
    global detailed_stats
    detailed_stats = collections.Counter()
    
    global distances_from_combined_regexp
    distances_from_combined_regexp = {}

    matcher = difflib.SequenceMatcher()
    
    for sine_record in SeqIO.parse(sine_f, "fasta"):
        cur_seq = Seq(str(sine_record.seq)[:sine_l], IUPAC.IUPACAmbiguousDNA())
        cur_seq_rc = cur_seq.reverse_complement()
        sine_set.append(str(cur_seq))
        sine_set.append(str(cur_seq_rc))
        print(cur_seq, cur_seq_rc, '\n ======================')

    complete_regexp = '|'.join(sine_set)
    p = tre.compile(complete_regexp, tre.EXTENDED)

    if override == 1:
        bases = ['A', 'C', 'G', 'T']
        ind_list = [random.randrange(4) for i in range(sine_l)]
        r_sine = ''.join([bases[ind_list[i]] for i in range(sine_l)])
        r_sine_rc = ''.join([bases[3 - ind_list[i]] for i in range(sine_l)])
        sine_set = [r_sine, r_sine_rc]
        complete_regexp = '|'.join(sine_set)
        p = tre.compile(complete_regexp, tre.EXTENDED)

    # Also specifies the shift range
    if override > 1:
        if override > 2:
            d = override - 1  # random.randrange(2, override)
            print('skipping ', d)
            for (i, cur_seq) in enumerate(r1_f):
                if i == d:
                    break

        sine_set = []
        for (i, s) in enumerate(r1_f):
            cur_seq = Seq(s[:sine_l], IUPAC.IUPACAmbiguousDNA())
            cur_seq_rc = cur_seq.reverse_complement()
            sine_set.append(str(cur_seq))
            sine_set.append(str(cur_seq_rc))
            if i == 2:
                break

        complete_regexp = '|'.join(sine_set)
        p = tre.compile(complete_regexp, tre.EXTENDED)

        
    total = 0
    cnt = 0
    start_time = time()
    print('sequences = ')

    bar_code_len = 60                         
    for cur_seq in r1_f:
        total += 1
        m = p.search(cur_seq, tre.Fuzzyness(maxerr=upper_mut_dist))
        if m:
            res = m.group(0)
            d = m.cost
            # Filter out strings that were cut out. Approximate by max-length matches
            # 10 is arbitrary, not very small
            if (m.groups()[0][1] < len(cur_seq) - 10) and (m.groups()[0][0] > 40):
                cnt += 1
                stats[d] += 1

                # The 40 bases immediately before the match are taken as the bar code.
                bar_code = cur_seq[m.groups()[0][0] - 40:m.groups()[0][0]]
                bar_codes[bar_code] = bar_codes.get(bar_code, 0) + 1

            detailed_stats[res] += 1
            distances_from_combined_regexp[res] = d

        if (total % step_print == 0 or total == nlines):
            print('distances for first', total, 'segments\n')
            print('========================')
            print('time elapsed', (time() - start_time) / 60.0, 'minutes')
            for k in sorted(stats):
                print('edit distance =', k, 'matches =', stats[k], '/', cnt)
        
        if (total == nlines):
            break
Example 18
def search_sines2(sine,
                  r1_f,
                  frac_bound,
                  pref_bound,
                  start_line=0,
                  step_print=1000000,
                  nlines=200000000,
                  thresh=9,
                  pref=60):

    global stats
    stats = {}

    print('step ', step_print, nlines)
    sine = sine[:pref]
    matcher = difflib.SequenceMatcher(isjunk=None,
                                      a=sine,
                                      b='',
                                      autojunk=False)

    total = 0
    cnt = 0
    start_time = time()
    print('candidates for SINE = ')

    if start_line > 0:
        for (i, cur_seq) in enumerate(r1_f):
            if i == start_line - 1:
                break

    for cur_seq in r1_f:

        if (total % step_print == 0 or total == nlines):
            print('distances for first', total, 'segments\n')
            print('========================')
            print('time elapsed', (time() - start_time) / 60.0, 'minutes')
            for k in sorted(stats):
                n = sum([i for i in stats[k][1].values()])
                print('longest common =', k, 'num matches =', n, stats[k][0],
                      '/', cnt)
                if (total >= nlines) and (k >= thresh):
                    for (i, frac) in enumerate(sorted(stats[k][1])):
                        print(k, 'Fraction = ', frac)
                        if i == 20:
                            break

        if (total == nlines):
            break

        total += 1
        matcher.set_seq2(cur_seq)
        res = matcher.find_longest_match(0, len(sine), 0, len(cur_seq))
        com = res[2]

        complete_regexp = sine[:res[0]] + '$'
        p = tre.compile(complete_regexp, tre.EXTENDED)
        # int(frac_bound * res[0]) is perhaps better, but keep it trivial for now.
        max_fuzz = res[0]
        m = p.search(
            cur_seq[:res[1]],
            tre.Fuzzyness(maxcost=max_fuzz,
                          delcost=int(1 / 4.0 * max_fuzz) + 1,
                          inscost=int(1 / 4.0 * max_fuzz) + 1))
        if m is None:
            continue

        start_p = m.groups()[0][0]
        d = m.cost

        # This is the fraction of edit distance out of all.
        # In most cases, this is the right edit distance for the overall prefix

        if (res[0] + com) == 0:
            print('How peculiar!', 'com =', com, 'res[0] = ', res[0], m.cost)
            continue

        frac = Fraction(d, res[0] + com)

        stats.setdefault(com, [0, collections.Counter()])
        stats[com][0] += 1

        try:
            if (start_p >= pref_bound) and Fraction(d, res[0]) <= frac_bound:
                stats[com][1][frac] += 1
                cnt += 1
        except ZeroDivisionError:
            pass
Example 19
import tre

fz = tre.Fuzzyness(maxcost=3)

print(fz)

pt = tre.compile("(foo)(bar)", tre.EXTENDED)

m = pt.match("zoobag", fz)

if m:
    print(m.groups())
    print(m[2])
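Why this matches: "zoobag" differs from "foobar" by two substitutions (f to z, r to g), within the maxcost=3 budget, and group 2, "(bar)", maps onto "bag". A hedged check:

print(m.cost)  # expected: 2
print(m[1])    # expected: 'zoo' (group 1, "(foo)")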
Example 20
#!/usr/bin/env python

import difflib
import sys
import tre
import gzip

# define barcode format; build regex objects for approximate string matching
linker1 = "CCTAGTCGCGTAGAC"
l1reg = tre.compile(linker1)
linker1Length = len(linker1)

# define Fuzzyness for tre matching
fz = tre.Fuzzyness(maxins=0, maxdel=0, maxsub=1)


# pull in one read for parsing (four lines per record, FASTQ-style)
def readread(s):
    return [
        s.readline().rstrip('\n'),
        s.readline().rstrip('\n'),
        s.readline().rstrip('\n'),
        s.readline().rstrip('\n')
    ]


def diff_letters(a, b):
    return sum(a[i] != b[i] for i in range(len(a)))


def parseRead(s, o):
Example 21
# along with this program.  If not, see
# <http://www.gnu.org/licenses/>.
#

# OCR for hand-written digits
#
import tre

from . import geometry as g

param_cross_num_lines = 15
param_cell_margin = 2
param_max_errors = 4

# Tre initializations
tre_fz = tre.Fuzzyness(maxerr=param_max_errors)
regexps = [
    (
        r'^1{0,2}222+1{0,2}$',  # zero
        r'^1{0,2}222+1{0,2}$',
        r'^/(XXX/|X._/|_.X/|.X./)+X_X/(X_X/)+(XXX/|X._/|_.X/|.X./)+$',
        r'^/(XXX/|X._/|_.X/|.X./)+X_X/(X_X/)+(XXX/|X._/|_.X/|.X./)+$'),
    (
        r'^11+(22+1+|211+|111+)11+$',  # one
        r'^1+$|^1{0,2}2+11+$',
        r'^/(_.X/|_X./)(_.X/|_X./)+(X.X/)*(_X_/|__X/|XX_/)+' +
        r'(.XX/|XX./){0,2}$',
        r'(XXX/|XX_/_XX)'),
    (
        r'^1+2{0,4}111+2{0,2}11{0,2}$',  # two
        r'^1{0,3}2*3+2+1{0,3}$',
Example 22
def filter_potential_sines_and_locations(in_file_unify, in_file_sine,
                                         out_file_with_sine, out_file_location,
                                         sine_header=67, maxerr=14):
    sine = gene_lib.get_sine_forward(in_file_sine)  # e.g. "B1.fasta"
    re = tre.compile(sine[:sine_header], tre.EXTENDED)
    fuzziness = tre.Fuzzyness(maxerr=maxerr)

    # Create slave processes
    procs = []
    for _ in range(multiprocessing.cpu_count() - 3):
        # Create a communication queue between this process and the slave process
        q = GeneDQueue()
        
        # Create and start slave process
        p = Process(target=filter_potential_sines_and_locations_proc, args=(q, re, fuzziness))
        p.start()

        procs.append({
            'p': p,
            'q': q,
            'batch': [],
            'write_i': 0
        })

    with open_any(in_file_unify, "rt") as handle_read, \
         open_any(out_file_with_sine, "wt") as handle_write_sine,\
         open_any(out_file_location, "wt") as handle_write_loc:


        records = gene_records_parse(handle_read)
        rec_i = 0

        for rec in tqdm(records, miniters=100):
            # Simple round-robin between the slave processes
            proc = procs[rec_i % len(procs)]

            # Append the record to this slave's local batch
            proc['batch'].append(rec)

            if len(proc['batch']) >= 20:
                # Get found potential SINEs from the slave process queue.
                #
                # Optimization: don't check the slave queue on every iteration,
                # as the check is slow and most records won't yield a potential SINE.
                if proc['write_i'] > 3:
                    filter_potential_sines_and_locations_write(proc['q'], handle_write_sine, handle_write_loc)
                    proc['write_i'] = 0
                else:
                    proc['write_i'] += 1

                # Put batch of new records into slave process queue
                proc['q'].put(proc['batch'])

                # Reset local batch of slave process
                proc['batch'] = []

            # Uncomment for testing a small amount of records
            # if rec_i == 100000:
            #     break

            rec_i += 1
        
        # Cleanup slave processes
        for proc in procs:
            # Drain found potential SINEs from the slave queue before the last batch
            filter_potential_sines_and_locations_write(proc['q'], handle_write_sine, handle_write_loc)

            # Put the last batch, if available
            if len(proc['batch']):
                proc['q'].put(proc['batch'])
                proc['batch'] = []
            
            # Make the slave process terminate
            proc['q'].put(None)

            # Wait for termination
            proc['p'].join()
            
            # Drain found potential SINEs from the slave queue one last time
            filter_potential_sines_and_locations_write(proc['q'], handle_write_sine, handle_write_loc)
Example 23
            except ValueError:
                print "Invalid product length for region", primername
                max_product_len = 10000
            min_product_len = 0
        else:
            max_product_len = 10000
            min_product_len = 0

        primeroutput[primername] = open(options.prefix + primername + ".fasta",
                                        "w")
        patterns[primername].append(max_product_len)
        patterns[primername].append(min_product_len)

    print "Found", len(patterns), "primer pairs"

    fz = tre.Fuzzyness(maxerr=options.maxcost)

    for file in seqorder:

        alltaboutput = open(options.prefix + file[0] + "_" + "pcr_regions.tab",
                            'w')
        print('ID   pcr_regions', file=alltaboutput)

        print("Searching file: " + file[0])

        totallength = 0
        for x, seq in enumerate(file[1]):
            for primername in patterns.keys():

                #print "Searching for", primername