def __init__(self, regex, target_group=0, maxerr=1, caseSensitive=True):
    self.regex = regex
    self.target_group = target_group
    self.fuzzyness = tre.Fuzzyness(maxerr=maxerr)
    if not caseSensitive:
        self.r = tre.compile(regex, tre.ICASE | tre.EXTENDED)
    else:
        self.r = tre.compile(regex, tre.EXTENDED)
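# Usage sketch (an assumption: only __init__ is shown above, so the enclosing
# class name is not visible; call it FuzzyMatcher here). The call below uses
# only the attributes __init__ actually sets (.r and .fuzzyness).
matcher = FuzzyMatcher("univention", maxerr=2, caseSensitive=False)
m = matcher.r.search("Univenton GmbH", matcher.fuzzyness)
if m:
    print(m[0], m.cost)  # matched text and its edit cost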
def __init__(self, true_clones_df, nt=True):
    self.true_clones_df = true_clones_df
    column = 'nSeqCDR3' if nt else 'aaSeqCDR3'
    patterns_end_to_end = []
    patterns_any = []
    # Fixed: the original iterated over an undefined name `true_clones`.
    for row in true_clones_df.iterrows():
        patterns_end_to_end += [(row[1][column], tre.compile("^" + row[1][column] + "$"))]
        # patterns_any += [tre.compile(row[1].cdr3)]
        patterns_any += [(row[1][column], tre.compile(row[1][column][3:-3]))]
    self.patterns_end_to_end = patterns_end_to_end
    self.patterns_any = patterns_any
def filter_potential_sines(in_fname, sine_string, sine_header=67, maxerr=19, reverse_complement=False):
    """
    Finds candidate SINEs within maxerr edit distance of the first sine_header
    characters of the SINE sequence.
    To be used for preliminary screening (input for later steps).
    """
    with gene_lib.open_compressed(in_fname, 'rt') as in_file_handle:
        records = SeqIO.parse(in_file_handle, format="fastq")
        # Fixed: the original compiled from an undefined name `sine`.
        re = tre.compile(sine_string[:sine_header], tre.EXTENDED)
        fuzziness = tre.Fuzzyness(maxerr=maxerr)
        for rec in records:
            if reverse_complement:
                cur_seq = rec.seq.reverse_complement()
            else:
                cur_seq = rec.seq
            match = re.search(str(cur_seq), fuzziness)
            if match:
                # match.groups() returns a tuple of (start, end) tuples,
                # e.g. ((2, 78),) for a single match.
                SeqIO.write(rec, sys.stdout, 'fastq')
def findApproxText(
        # Text to search for
        searchText,
        # Text in which to find the searchText
        targetText,
        # Maximum allowable cost for an approximate match. None indicates no
        # maximum cost.
        cost=None):
    # tre.LITERAL specifies that searchText is a literal search string, not
    # a regex.
    pat = tre.compile(searchText, tre.LITERAL)
    fz = tre.Fuzzyness(maxerr=cost) if cost else tre.Fuzzyness()
    match = pat.search(targetText, fz)
    # Guard added: the original dereferenced match without checking for None.
    if not match:
        return None, 0, 0
    # Store the index into the target string of the first and last matched chars.
    beginInTarget, endInTarget = match.groups()[0]
    # TRE picks the first match it finds, even if there is more than one match
    # with identical error. So, manually call it again excluding the found text
    # to check. In addition, make sure this match is unique: it should be 10%
    # better than the next best match.
    matchAgain = pat.search(targetText[:beginInTarget] + targetText[endInTarget:], fz)
    if matchAgain and (matchAgain.cost <= match.cost * 1.1):
        return None, 0, 0
    else:
        return match, beginInTarget, endInTarget
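# Usage sketch for findApproxText above; the inputs are made up. When a second
# hit has a cost within 10% of the best one, the function returns (None, 0, 0);
# here the exact hit is strictly cheaper than the misspelled one, so it wins.
target = "Donald Kunth created TeX. Donald Knuth created TeX."
match, begin, end = findApproxText("Knuth created", target, cost=2)
if match is not None:
    print("unique match %r at [%d:%d), cost %d" % (match[0], begin, end, match.cost))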
def check(self, path):
    """ the real check """
    super(UniventionPackageCheck, self).check(path)

    fz = tre.Fuzzyness(maxerr=2)
    pt = tre.compile(r"\<univention\>", tre.EXTENDED | tre.ICASE)
    for fn in uub.FilteredDirWalkGenerator(path):
        with open(fn, 'r') as fd:
            for lnr, line in enumerate(fd, start=1):
                origline = line
                if UniventionPackageCheck.RE_WHITELINE.match(line):
                    continue
                pos = 0
                while True:
                    m = pt.search(line[pos:], fz)
                    if m:
                        if not UniventionPackageCheck.RE_WHITEWORD.match(m[0]):
                            self.debug('%s:%d: found="%s" origline="%s"' % (fn, lnr, m[0], origline))
                            self.addmsg('0015-2', 'univention is incorrectly spelled: %s' % m[0],
                                        filename=fn, line=lnr)
                        # Advance past the end of this match (offsets are
                        # relative to the sliced line).
                        pos += m.groups()[0][1]
                    else:
                        break
def mismatch_search(haystack, needle, mismatches=1):
    """Return the number of times needle occurs in haystack, allowing mismatches.

    tre doesn't support multiple results out of the box, but it starts from the
    end of the sequence and works to the left, so use each result's indices to
    pare down the haystack and search again.
    """
    haystack = haystack.encode('utf-8')
    needle = needle.encode('utf-8')
    if not using_tre:
        raise RBSError("tre isn't loaded.")
    # Allow only substitutions, no insertions or deletions.
    fz = tre.Fuzzyness(maxerr=mismatches, maxsub=mismatches, maxdel=0, maxins=0)
    needle = ".*(%s).*" % needle
    pt = tre.compile(needle, tre.EXTENDED)
    incidence = 0
    while True:
        m = pt.search(haystack, fz)
        if m:
            # End index of capture group 1 (the needle itself).
            index = m.groups()[1][1] - 1
            incidence += 1
            haystack = haystack[:index]
        else:
            break
    return incidence
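# Usage sketch for mismatch_search above (assumes the module-level `using_tre`
# flag is set and tre is importable). The sequences are made up: GGAGG occurs
# once exactly and once with a single substitution (GGAGC).
hits = mismatch_search("AGGAGGTTTGGAGC", "GGAGG", mismatches=1)
print(hits)  # expected: 2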
def new_SINES_filter_proc_histogram(recs, main_dict, noDuplicate, key_size, fuzziness,
                                    distribution_of_neighbors, length):
    with open_compressed(noDuplicate, "wt") as handle_noDuplicate:
        count = 0
        for rec in recs:
            str_barc = str(rec.seq)
            re = tre.compile(str_barc, tre.EXTENDED)
            barc_parts_list = barcode_parts(rec, key_size)
            match = []
            for rec_part in barc_parts_list:
                is_match_barcodes_hist(main_dict[str(rec_part.seq)], rec.id, re,
                                       fuzziness, match, length)
            count = count + 1
            if count % 100000 == 0:
                print_step(count)
            if len(match) == 1:
                gene_record_write(rec, handle_noDuplicate)
            if len(match) >= length:
                distribution_of_neighbors[length - 1] = distribution_of_neighbors[length - 1] + 1
            else:
                distribution_of_neighbors[len(match)] = distribution_of_neighbors[len(match)] + 1
def test_search():
    """Test searching for matches in a bytestring"""
    pattern = re.compile('a([0-9])a')
    m = pattern.search('bcda7aefga8ah')
    assert m.groups() == ('7',)
    assert m.group(0) == 'a7a'
    assert m.group(1) == '7'
def checkLocations(locations):
    """
    Takes a chunk of locations and checks the tweets for these locations.
    Will be run in parallel.
    """
    output = []
    for l, origL in locations:
        # Match only locations delimited by word boundaries (spaces/tabs/etc.).
        # This eliminates some potential matches, but most of them would be
        # garbage: it trades recall for precision.
        cmpl = tre.compile(r"\b{}\b".format(l), tre.EXTENDED)
        for t, origT in tweets:
            m = cmpl.search(t, fz)
            if m:
                out = {
                    "tweet": origT,
                    "location": origL,
                    "match": m[0],
                    "cost": m.cost,
                    "numDel": m.numdel,
                    "numIns": m.numins,
                    "numSub": m.numsub,
                }
                output.append(out)
    return output
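# Hypothetical driver for checkLocations; in the original, `tweets` and `fz`
# are module-level globals shared by the parallel workers. All data is made up.
fz = tre.Fuzzyness(maxerr=2)
tweets = [("landed in new yrok city", "Landed in New Yrok City!"),
          ("going to boston", "Going to Boston")]
locations = [("new york city", "New York City"), ("boston", "Boston")]
for hit in checkLocations(locations):
    print(hit["location"], "->", hit["tweet"], "| cost:", hit["cost"])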
def new_SINES_filter_proc_graph(q, main_dict, key_size, fuzziness):
    while True:
        recs = q.get()
        G = nx.Graph()  # create an empty graph
        if recs is None:
            q.put(None)
            break
        for rec in recs:
            str_barc = str(rec.seq)
            G.add_node((rec.seq, rec.id))
            re = tre.compile(str_barc, tre.EXTENDED)
            # break the barcode into 4 parts
            barc_parts_list = barcode_parts(rec, key_size)
            # tuple connecting a barcode to the SINE ids within edit distance at most 3
            match = ()
            for rec_part in barc_parts_list:
                match = is_match_barcodes_graph(main_dict[str(rec_part.seq)], rec.id,
                                                re, fuzziness, match)  # build the match
            print(type(match))
            print("this is match: ", match)
            for m in match:
                # create an edge between the barcode and its neighbors
                G.add_edge((rec.seq, rec.id), (m[0], tuple(m[1])))
            q.put((rec, match))
        nx.draw(G)
        plt.show()
    log("Slave process exited")
def test_search_unicode():
    """Test searching for matches in a unicode string"""
    pattern = re.compile(u'ä([0-9])ö')
    m = pattern.search(u'bcdä7öefga8ah')
    assert m.groups() == (u'7',)
    assert m.group(0) == u'ä7ö'
    assert m.group(1) == u'7'
def filter_potential_sines(records, sine_pattern, sine_header=67, maxerr=14):
    re = tre.compile(sine_pattern[:sine_header], tre.EXTENDED)
    fuzziness = tre.Fuzzyness(maxerr=maxerr)
    for rec in records:
        match = re.search(str(rec.seq), fuzziness)
        if match:
            yield rec
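# Illustrative driver for the generator above; assumes Biopython and a FASTQ
# file named reads.fastq, and the SINE prefix string is made up.
from Bio import SeqIO

records = SeqIO.parse("reads.fastq", "fastq")
sine_pattern = "GGGGCTGGAGAGATGGCTCAGTGG"  # hypothetical SINE consensus prefix
for rec in filter_potential_sines(records, sine_pattern, sine_header=20, maxerr=3):
    print(rec.id)  # reads whose sequence fuzzily contains the prefix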
def test_match_groups():
    pattern = re.compile('a([0-9])')
    m = pattern.match('a4ra6')
    assert m is not None
    # Fixed: group values are strings, so ('4',), not (4,).
    assert m.groups() == ('4',)
    assert m.group() == 'a4'
    assert m.group(0) == 'a4'
    assert m.group(1) == '4'
def test_match():
    """Test matching"""
    pattern = re.compile('zat')
    m = pattern.match('zatazata')
    assert m is not None
    assert m.groups() == tuple()
    assert m.group() == 'zat'
    assert m.group(0) == 'zat'
def test_search_approx():
    """Test approximate search"""
    pattern = re.compile(u'abc([0-9])abc')
    m = pattern.approx(u'asdfabc5acbasdfsd', cost_subst=1, max_costs=10,
                       max_subst=10, max=10)
    assert m is not None
    assert m.groups() == ('5',)
    assert m.group(0) == 'abc5acb'
    assert m.cost == 2
    assert m.num == (0, 0, 2)
def test_module_match():
    """Tests whether tre.match() finds the same as a compiled regex"""
    regex = r'a([0-9])a'
    text = 'a3abda6ama7ada'
    m1 = re.compile(regex).match(text)
    m2 = re.match(regex, text)
    assert m1.groups() == m2.groups()
    assert m1.group(0) == m2.group(0)
    assert m1.group(1) == m2.group(1)
def test_finditer():
    """Test whether finditer() returns the proper matches"""
    pattern = re.compile('[0-9]')
    results = pattern.finditer('d3t4 ru7e5!')
    # Check each match and the final exception, rather than using list(results).
    # next(results) replaces the Python-2-only results.next().
    assert next(results) == '3'
    assert next(results) == '4'
    assert next(results) == '7'
    assert next(results) == '5'
    assert_raises(StopIteration, next, results)
def filter_potential_sines_and_locations(in_file_unify, in_file_sine, out_file_with_sine,
                                         out_file_location, sine_header=67, maxerr=14):
    sine = gene_lib.get_sine_forward(in_file_sine)  # "B1.fasta"
    re = tre.compile(sine[:sine_header], tre.EXTENDED)
    fuzziness = tre.Fuzzyness(maxerr=maxerr)
    with open_compressed(in_file_unify, "rt") as handle_read, \
            open_compressed(out_file_with_sine, "wt") as handle_write_sine, \
            open_compressed(out_file_location, "wt") as handle_write_loc:
        records = gene_records_parse(handle_read)
        filter_potential_sines_and_locations_proc(records, re, fuzziness,
                                                  handle_write_sine, handle_write_loc)
def new_SINES_filter_proc_graph(recs, main_dict, key_size, fuzziness, i=0):
    G = nx.Graph()  # create an empty graph
    if i == 0:
        graph_file = 'graphPart'
    else:
        graph_file = 'graphPart' + str(i)
    main_key_len = int(36 / (3 + 1))
    # The loop variable no longer shadows the parameter `i` used above.
    for rec in recs:
        rec_part = list(barcode_wins(rec, main_key_len))[0]
        str_barc_part = str(rec_part.seq)
        sec_dict = main_dict[str_barc_part]
        str_barc = str(rec.seq)
        if sec_dict[str_barc] == rec.id:
            G.add_node((rec.seq, rec.id))
            re = tre.compile(str_barc, tre.EXTENDED)
            # break the barcode into 4 parts
            barc_parts_list = barcode_parts(rec, key_size)
            # tuple connecting a barcode to the SINE ids within edit distance at most 3
            match = ()
            for rec_part in barc_parts_list:
                match = is_match_barcodes_graph(main_dict[str(rec_part.seq)], rec.id,
                                                re, fuzziness, match)  # build the match
            for m in match:
                if str(rec.seq) != str(m[0]):
                    # create an edge between the barcode and its neighbors
                    G.add_edge((rec.seq, rec.id), (m[0], m[1]))
    with open(graph_file, 'wb') as outfile:
        pickle.dump(G, outfile)
    nx.draw(G)
    log("Slave process exited")
def new_SINES_filter_proc_histogram(q, main_dict, key_size, fuzziness):
    while True:
        recs = q.get()
        if recs is None:
            q.put(None)
            break
        for rec in recs:
            str_barc = str(rec.seq)
            re = tre.compile(str_barc, tre.EXTENDED)
            barc_parts_list = barcode_parts(rec, key_size)
            match = []
            for rec_part in barc_parts_list:
                is_match_barcodes_hist(main_dict[str(rec_part.seq)], rec.id, re,
                                       fuzziness, match)
            q.put((rec, match))
    log("Slave process exited")
def showResult(file_centers, in_file_sine, sine_header=67, maxerr=19):
    total = 0  # renamed from `sum`, which shadowed the builtin
    hist = {}
    sine = gene_lib.get_sine_forward(in_file_sine)  # "B1.fasta"
    re = tre.compile(sine[:sine_header], tre.EXTENDED)
    string_sine = sine
    print('original sine', string_sine)
    fuzziness = tre.Fuzzyness(maxerr=maxerr)
    with open(file_centers, "r") as center_file:
        for line in center_file:
            current_line = line.strip()
            # Alternative direction: search the center in the sine instead.
            # re2 = tre.compile(current_line, tre.EXTENDED)
            # match = re2.search(string_sine, fuzziness)
            match = re.search(current_line, fuzziness)
            if not match:
                continue
            start, end = match.groups()[0]
            # Compute the edit distance once instead of three times.
            dist = nltk.edit_distance(string_sine[start:end], current_line)
            hist[dist] = hist.get(dist, 0) + 1
            total += dist
    print(total / 1000)  # average over the (assumed) 1000 centers
    print(sorted(hist.items()))
def __init__(self, pattern, maxErrors=None):
    """
    Initialize an ARE with pattern `pattern`, given as a string.
    The number of allowed errors is maxErrors. If it is None, any number of
    errors is allowed, and the ARE is basically used to count the number of
    errors required to match.
    """
    # Copy constructor
    if type(pattern) == ARE:
        self.maxErrors = pattern.maxErrors
        self.pattern = pattern.pattern
    # Normal constructor
    else:
        # self.pattern = unidecode(pattern)
        self.pattern = pattern
        self.maxErrors = maxErrors
    self.fuzzyness = tre.Fuzzyness()
    if self.maxErrors is not None:
        self.fuzzyness.maxerr = self.maxErrors
        # self.fuzzyness.maxcost = self.maxErrors
        self.fuzzyness.subcost = 1.5
    self.re = tre.compile(pattern, tre.EXTENDED)
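# Usage sketch: only __init__ of the ARE class is shown above, so this uses
# just the attributes it sets (.re and .fuzzyness). Inputs are made up.
are = ARE("Knuth", maxErrors=2)
m = are.re.search("Donald Kunth", are.fuzzyness)
if m:
    print(m[0], m.cost)  # matched text and the cost needed to match it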
def filter_potential_sines_and_locations(in_file_unify, in_file_sine, out_file_with_sine,
                                         out_file_location, sine_header=67, maxerr=14):
    sine = gene_lib.get_sine_forward(in_file_sine)  # "B1.fasta"
    re = tre.compile(sine[:sine_header], tre.EXTENDED)
    fuzziness = tre.Fuzzyness(maxerr=maxerr)
    with open_compressed(in_file_unify, "rt") as handle_read, \
            open_compressed(out_file_with_sine, "wt") as handle_write_sine, \
            open_compressed(out_file_location, "wt") as handle_write_loc:
        records = gene_records_parse(handle_read)
        for rec in tqdm(records, miniters=100):
            match = re.search(str(rec.seq), fuzziness)
            if match:
                sine_location = match.groups()
                gene_record_write(rec, handle_write_sine, 'fasta')
                handle_write_loc.write(",".join([str(i) for i in sine_location[0]]) + "\n")
import tre

fz = tre.Fuzzyness(maxerr=3)
print(fz)
pt = tre.compile("Beulan Lake", tre.EXTENDED)

data = """
In addition to fundamental contributions in several branches of theoretical
computer science, Beulahh Lake is the creator of the TeX computer typesetting
system, the related METAFONT font definition language and rendering system,
and the Computer Modern family of typefaces.
"""

m = pt.search(data, fz)
if m:
    print(m.groups())
    print(m[0])
def search_sines(sine_f, r1_f, override=0, upper_mut_dist=30, step_print=10000,
                 nlines=500000, sine_l=80):
    print('override =', override)
    sine_set = []
    stats = collections.Counter()
    global bar_codes
    bar_codes = {}
    global detailed_stats
    detailed_stats = collections.Counter()
    global distances_from_combined_regexp
    distances_from_combined_regexp = {}
    matcher = difflib.SequenceMatcher()
    for sine_record in SeqIO.parse(sine_f, "fasta"):
        cur_seq = Seq(str(sine_record.seq)[:sine_l], IUPAC.IUPACAmbiguousDNA())
        cur_seq_rc = cur_seq.reverse_complement()
        sine_set.append(str(cur_seq))
        sine_set.append(str(cur_seq_rc))
        print(cur_seq, cur_seq_rc, '\n======================')
    complete_regexp = '|'.join(sine_set)
    p = tre.compile(complete_regexp, tre.EXTENDED)

    if override == 1:
        bases = ['A', 'C', 'G', 'T']
        ind_list = [random.randrange(4) for i in range(sine_l)]
        r_sine = ''.join([bases[ind_list[i]] for i in range(sine_l)])
        r_sine_rc = ''.join([bases[3 - ind_list[i]] for i in range(sine_l)])
        sine_set = [r_sine, r_sine_rc]
        complete_regexp = '|'.join(sine_set)
        p = tre.compile(complete_regexp, tre.EXTENDED)

    # Also specifies the shift range
    if override > 1:
        if override > 2:
            d = override - 1  # random.randrange(2, override)
            print('skipping ', d)
            for (i, cur_seq) in enumerate(r1_f):
                if i == d:
                    break
        sine_set = []
        for (i, s) in enumerate(r1_f):
            cur_seq = Seq(s[:sine_l], IUPAC.IUPACAmbiguousDNA())
            cur_seq_rc = cur_seq.reverse_complement()
            sine_set.append(str(cur_seq))
            sine_set.append(str(cur_seq_rc))
            if i == 2:
                break
        complete_regexp = '|'.join(sine_set)
        p = tre.compile(complete_regexp, tre.EXTENDED)

    total = 0
    cnt = 0
    start_time = time()
    print('sequences = ')
    bar_code_len = 60
    for cur_seq in r1_f:
        total += 1
        m = p.search(cur_seq, tre.Fuzzyness(maxerr=upper_mut_dist))
        if m:
            res = m.group(0)
            d = m.cost
            # Filter out strings that were cut off. Approximate by max-length
            # matches; 10 is arbitrary, not very small.
            if (m.groups()[0][1] < len(cur_seq) - 10) and (m.groups()[0][0] > 40):
                cnt += 1
                stats[d] += 1
                bar_code = cur_seq[m.groups()[0][0] - 40:m.groups()[0][0]]
                if bar_code in bar_codes:
                    bar_codes[bar_code] += 1
                else:
                    bar_codes[bar_code] = 1
                detailed_stats[res] += 1
                distances_from_combined_regexp[res] = d
        if total % step_print == 0 or total == nlines:
            print('distances for first', total, 'segments\n')
            print('========================')
            print('time elapsed', (time() - start_time) / 60.0, 'minutes')
            for k in sorted(stats):
                print('edit distance =', k, 'matches =', stats[k], '/', cnt)
        if total == nlines:
            break
import tre

fz = tre.Fuzzyness(maxerr=3)
print(fz)
pt = tre.compile("San Francisco", tre.EXTENDED)

data = """
In addition to fundamental contributions in several branches of theoretical
computer science, Donnald Erwin Kuth is the creator of the TeX computer
typesetting system, the related METAFONT font definition language and
rendering system, and the Computer Modern family of typefaces in San
Francisco hey bois.
"""

m = pt.search(data, fz)
if m:
    print(m.groups())
    print(m[0])
import tre

fz = tre.Fuzzyness(maxcost=3)
print(fz)
pt = tre.compile("(foo)(bar)", tre.EXTENDED)

m = pt.match("zoobag", fz)
if m:
    print(m.groups())
    print(m[2])
def search_sines(sines, r1_f, override=0, upper_mut_dist=20, step_print=1000000,
                 nlines=100000000, sine_l=70):
    print('override =', override)
    sine_set = []
    stats = collections.Counter()
    global bar_codes
    bar_codes = {}
    global detailed_stats
    detailed_stats = collections.Counter()
    global distances_from_combined_regexp
    distances_from_combined_regexp = {}
    complete_regexp = '|'.join([sine[:sine_l] for sine in sines])
    p = tre.compile(complete_regexp, tre.EXTENDED)

    if override == 1:
        bases = ['A', 'C', 'G', 'T']
        ind_list = [random.randrange(4) for i in range(sine_l)]
        # Fixed: the original indexed an undefined name `ipnd_list`.
        r_sine = ''.join([bases[ind_list[i]] for i in range(sine_l)])
        r_sine_rc = ''.join([bases[3 - ind_list[i]] for i in range(sine_l)])
        sine_set = [r_sine, r_sine_rc]
        complete_regexp = '|'.join(sine_set)
        p = tre.compile(complete_regexp, tre.EXTENDED)

    # Also specifies the shift range
    if override > 1:
        if override > 2:
            d = override - 1  # random.randrange(2, override)
            print('skipping ', d)
            for (i, cur_seq) in enumerate(r1_f):
                if i == d:
                    break
        sine_set = []
        for (i, s) in enumerate(r1_f):
            cur_seq = Seq(s[:sine_l], IUPAC.IUPACAmbiguousDNA())
            cur_seq_rc = cur_seq.reverse_complement()
            sine_set.append(str(cur_seq))
            sine_set.append(str(cur_seq_rc))
            if i == 2:
                break
        complete_regexp = '|'.join(sine_set)
        p = tre.compile(complete_regexp, tre.EXTENDED)

    total = 0
    cnt = 0
    start_time = time()
    print('sequences = ')
    # bar_code_len = 60
    for cur_seq in r1_f:
        total += 1
        m = p.search(cur_seq, tre.Fuzzyness(maxerr=sine_l - 10))
        if m:
            res = m.group(0)
            d = m.cost
            # Filter out strings that were cut off. Approximate by max-length
            # matches; 10 is arbitrary, not very small.
            # Barcodes are not in place here.
            stats[d] += 1
            bar_code_min_len = 23
            # if (m.groups()[0][1] < len(cur_seq) - 5) and (d <= upper_mut_dist):
            if (m.groups()[0][0] >= bar_code_min_len) and (d <= upper_mut_dist):
                cnt += 1
                detailed_stats[res] += 1
                bar_code = cur_seq[m.groups()[0][0] - bar_code_min_len:m.groups()[0][0]]
                bar_codes.setdefault(bar_code, 0)
                bar_codes[bar_code] += 1
                # distances_from_combined_regexp[res] = d
        if (total % step_print == 0) or (total == nlines):
            print('stats for first', total, 'segments\n')
            print('========================')
            print('time elapsed', (time() - start_time) / 60.0, 'minutes')
            for k in sorted(stats):
                print('edit distance =', k, 'matches =', stats[k], '/', cnt)
            # pprint.pprint(collections.Counter(detailed_stats.values()))
        if total == nlines:
            return bar_codes
import tre

fz = tre.Fuzzyness(maxerr=3)
print(fz)
pt = tre.compile("Don(ald( Ervin)?)? Knuth", tre.EXTENDED)

data = """
In addition to fundamental contributions in several branches of theoretical
computer science, Donnald Erwin Kuth is the creator of the TeX computer
typesetting system, the related METAFONT font definition language and
rendering system, and the Computer Modern family of typefaces.
"""

m = pt.search(data, fz)
print(dir(pt))
if m:
    print(m.groups())
    print(m[0])
    print(m[1])
def test_valid_compile():
    """Tests for compilation of patterns which should be ok"""
    pattern = tre.compile("a")
import tre

fz = tre.Fuzzyness(maxerr=3)
print(fz)
pt = tre.compile("Don(ald( Ervin)?)? Knuth", tre.EXTENDED)

data = """
In addition to fundamental contributions in several branches of theoretical
computer science, Donnald Erwin Kuth is the creator of the TeX computer
typesetting system, the related METAFONT font definition language and
rendering system, and the Computer Modern family of typefaces.
"""

m = pt.search(data, fz)
if m:
    print(m.groups())
    print(m[0])
def test_search_nomatch():
    """Test whether a string with no match returns None"""
    pattern = re.compile("Doesn't exist")
    assert pattern.search('In this text') is None
def test_match_nomatch():
    """Test matching with strings that don't match"""
    pattern = re.compile('a')
    m = pattern.match('zzzzaaaa')
    assert m is None
elif len(words) < 3:
    print("Found invalid line in primer file", line.strip())
    continue

primername = words[0]
primer1seq = replace_ambiguity_codes(words[1].upper())
primer2seq = replace_ambiguity_codes(words[2].upper())
if options.rcreverse:
    primer2seq = revcomp(primer2seq)
# `in` replaces the Python-2-only dict.has_key().
if primername in patterns:
    print("Error: Two of your regions have the same name:", primername)
    print("Skipping...")
    continue
patterns[primername] = [[], []]
patterns[primername][0].append(tre.compile(primer1seq, tre.EXTENDED))
patterns[primername][0].append(tre.compile(revcomp(primer1seq), tre.EXTENDED))
patterns[primername][1].append(tre.compile(primer2seq, tre.EXTENDED))
patterns[primername][1].append(tre.compile(revcomp(primer2seq), tre.EXTENDED))
if len(words) >= 5:
    try:
        min_product_len = int(words[4])
    except ValueError:
        print("Invalid minimum product length for region", primername)
        min_product_len = 0
if len(words) >= 4:
    try:
        max_product_len = int(words[3])
def basic_scan(image_name):
    full_image_path = images_location + image_name

    # First pass: find lines that end in a price.
    tre_fuzzyness = tre.Fuzzyness(delcost=3, inscost=1, subcost=2, maxcost=2)
    tre_matcher = tre.compile(ere_end_of_line_price, tre.EXTENDED)

    os.system(os.path.join(parser_location, 'ocr.sh') + ' ' + full_image_path)

    raw_tab_data = []
    tab_items = []
    with open(full_image_path + '1.txt', 'r') as ocr_file:
        for line in ocr_file.read().splitlines():
            debug(line)
            tre_match = tre_matcher.search(line, tre_fuzzyness)
            if tre_match:
                tmp_description = re.sub(r'[\s:]*' + re.escape(tre_match.group(0)), '', line).lower()
                tmp_value = tre_match.group(0).strip()
                if len(tmp_description) > 2:
                    raw_tab_data.append({'description': tmp_description, 'value': tmp_value})

    # Second pass: classify each priced line against the configured parsers.
    tre_fuzzyness = tre.Fuzzyness(maxerr=3)
    tab_meta = {}
    cut_off_meta = 0
    for raw_item in raw_tab_data:
        raw_item_description = raw_item['description']
        raw_item_value = raw_item['value']
        matches_to_compare = []
        for parser_key in config['mid_parsers']:
            tre_matcher = tre.compile(config['mid_parsers'][parser_key]['ere'], tre.EXTENDED)
            tre_match = tre_matcher.search(name_fix(raw_item_description), tre_fuzzyness)
            debug('xxxxxxxxxxxxxxxxxxxxxxxxxx')
            debug(name_fix(raw_item_description) + ' XXX ' +
                  config['mid_parsers'][parser_key]['ere'] + ' XXX ' + raw_item_value)
            if tre_match:
                debug('match')
                matches_to_compare.append((tre_match, config['mid_parsers'][parser_key]['string']))
        if matches_to_compare:
            # If there were matches, keep the cheapest one.
            # (`best` renamed from `min`, which shadowed the builtin.)
            cut_off_meta += 1
            best = matches_to_compare[0]
            for match in matches_to_compare:
                if match[0].cost < best[0].cost:
                    best = match
            tab_meta[best[1]] = raw_item_value
        else:
            debug('SHOULD HAVE BEEN CUT OFF')
            if cut_off_meta < 1:
                tab_items.append(raw_item)

    tab = {'tab_items': tab_items, 'tab_meta': tab_meta}
    print(tab)
    x = analyze_tab(tab)
    print(x)
    return x
def filter_potential_sines_and_locations(in_file_unify, in_file_sine, out_file_with_sine,
                                         out_file_location, sine_header=67, maxerr=14):
    sine = gene_lib.get_sine_forward(in_file_sine)  # "B1.fasta"
    re = tre.compile(sine[:sine_header], tre.EXTENDED)
    fuzziness = tre.Fuzzyness(maxerr=maxerr)

    # Create slave processes
    procs = []
    for _ in range(multiprocessing.cpu_count() - 3):
        # Create a communication queue between this process and the slave process
        q = GeneDQueue()
        # Create and start the slave process
        p = Process(target=filter_potential_sines_and_locations_proc, args=(q, re, fuzziness))
        p.start()
        procs.append({'p': p, 'q': q, 'batch': [], 'write_i': 0})

    with open_any(in_file_unify, "rt") as handle_read, \
            open_any(out_file_with_sine, "wt") as handle_write_sine, \
            open_any(out_file_location, "wt") as handle_write_loc:
        records = gene_records_parse(handle_read)
        rec_i = 0
        for rec in tqdm(records, miniters=100):
            # Simple round-robin between the slave processes
            proc = procs[rec_i % len(procs)]
            # Add the new record to the local batch array of the slave process
            proc['batch'].append(rec)
            if len(proc['batch']) >= 20:
                # Get found potential SINEs from the slave process queue.
                #
                # Optimization: don't check the slave queue every iteration,
                # as the check slows things down; moreover, we won't get a
                # potential SINE for every record.
                if proc['write_i'] > 3:
                    filter_potential_sines_and_locations_write(proc['q'], handle_write_sine,
                                                               handle_write_loc)
                    proc['write_i'] = 0
                else:
                    proc['write_i'] += 1
                # Put the batch of new records into the slave process queue
                proc['q'].put(proc['batch'])
                # Reset the local batch of the slave process
                proc['batch'] = []
            # Uncomment for testing a small amount of records
            # if rec_i == 100000:
            #     break
            rec_i += 1

        # Clean up slave processes
        for proc in procs:
            # Get found potential SINEs from the slave process queue, before the last batch
            filter_potential_sines_and_locations_write(proc['q'], handle_write_sine,
                                                       handle_write_loc)
            # Put the last batch, if available
            if len(proc['batch']):
                proc['q'].put(proc['batch'])
                proc['batch'] = []
            # Make the slave process terminate
            proc['q'].put(None)
            # Wait for termination
            proc['p'].join()
            # Get found potential SINEs from the slave process queue, one very last time
            filter_potential_sines_and_locations_write(proc['q'], handle_write_sine,
                                                       handle_write_loc)
def merged_paired_ends(records1, records2):
    tot_good = 0
    tot_great = 0
    tot = 0
    for (rec1, rec2) in zip(records1, records2):
        tot += 1
        str1 = str(rec1.seq)
        str2 = str(rec2.seq.reverse_complement())
        end1 = str1[-common_req:]
        re = tre.compile(end1, tre.EXTENDED)
        res_seq = None
        # We expect small errors here
        match = re.search(str2, tre.Fuzzyness(maxerr=init_err))
        if match:
            tot_good += 1
            match_loc = match.groups()[0][0]
            to_search_len = match_loc + common_req
            fuzzyness = max(tot_err, ceil(0.1 * to_search_len))
            re = tre.compile(str1[-to_search_len:], tre.EXTENDED)
            match_tot = re.search(str2, tre.Fuzzyness(maxerr=fuzzyness))
            if match_tot:
                tot_great += 1
                # An arbitrary decision: take the common string from r2
                res_str = str1[:-to_search_len] + str2
                # TODO: preserve qualities
                res_seq = SeqRecord(Seq(res_str), id=rec1.id, name=rec1.name,
                                    description=rec1.description,
                                    letter_annotations={"phred_quality": [30 for i in range(len(res_str))]})
                if tot_great % step == 0:
                    log('nicely matched ', str1, '\n', str2, to_search_len,
                        match_tot.group(0), match.group(0), match_tot.cost, match.cost)
                yield res_seq
                continue
        # No reliable overlap: concatenate the reads with an N spacer.
        res_str = str1 + ('N' * padding) + str2
        res_seq = SeqRecord(Seq(res_str), id=rec1.id, name=rec1.name,
                            description=rec1.description,
                            letter_annotations={"phred_quality": [30 for i in range(len(res_str))]})
        if tot % step == 0:
            log(tot, tot_good, tot_great)
        yield res_seq
def test_compile_twice():
    """Tests whether a pattern can be compiled twice"""
    old = tre.compile("a")
    new = tre.compile(old)
    assert old is new
    (r'^1{0,2}22+1+22+1{0,2}$',  # eight
     r'^1{0,2}2{0,2}333+2{0,2}1{0,2}$',
     r'^/(.../)(.../){0,3}(X_X/)+(.../)*(.X_/|_X./|X__/|__X/)+' +
     r'(.../)*(X_X/)+',
     r'(XXX/)(XXX/)(XXX/)'),
    (r'^1{0,2}22+3?2?12?11+$',  # nine
     r'^1{0,2}(2+3+2*|222+)1+$',
     r'^/(.../)+(X_X/)+(_XX/|XXX/)(_XX/|XXX/)?(_.X/)+(_X./)*$',
     r'^/(X._/|.X_/)(X._/|.X_/)+(XX./|.XX/)+$')
]

re_compiled = []
for row in regexps:
    re_compiled.append((tre.compile(row[0], tre.EXTENDED),
                        tre.compile(row[1], tre.EXTENDED),
                        tre.compile(row[2], tre.EXTENDED),
                        tre.compile(row[3], tre.EXTENDED)))

# limits: for each digit (min_len_num_hcrossings, min_len_num_vcrossings,
#         max_num_hcrossings, max_num_vcrossings,
#         min_num_hcrossings, min_num_vcrossings,
#         min_max_num_hcrossings, min_max_num_vcrossings)
limits = [
    (4, 4, 3, 3, 1, 1, 2, 2),  # zero
    (4, 1, 2, 2, 1, 1, 1, 1),  # one
    (4, 4, 3, 3, 1, 1, 1, 2),  # two
    (4, 4, 2, 4, 1, 1, 1, 3),  # three
    (4, 4, 3, 2, 1, 1, 2, 1),  # four
    (4, 4, 2, 3, 1, 1, 1, 3),  # five
    (4, 4, 2, 3, 1, 1, 2, 2),  # six
    (4, 3, 2, 3, 1, 1, 1, 2),  # seven
def test_findall():
    """Test whether findall() returns the proper list of matches"""
    pattern = re.compile('[0-9]')
    results = pattern.findall('d3t4 ru7e5!')
    assert results == ['3', '4', '7', '5']
     r'1+2(2|1)+11?',
     r'^/(X../|..X/){0,3}(_X./|_.X/)+(.XX/)(.XX/)?(_X./|_.X/)' +
     r'(_X./|_.X/)+(.X_/|X._/)*$',
     r'^/(_X./)*(X._/)+(.XX/)+.*$'),
    (r'^1{0,2}22+1+22+1{0,2}$',  # eight
     r'^1{0,2}2{0,2}333+2{0,2}1{0,2}$',
     r'^/(.../)(.../){0,3}(X_X/)+(.../)*(.X_/|_X./|X__/|__X/)+' +
     r'(.../)*(X_X/)+',
     r'(XXX/)(XXX/)(XXX/)'),
    (r'^1{0,2}22+3?2?12?11+$',  # nine
     r'^1{0,2}(2+3+2*|222+)1+$',
     r'^/(.../)+(X_X/)+(_XX/|XXX/)(_XX/|XXX/)?(_.X/)+(_X./)*$',
     r'^/(X._/|.X_/)(X._/|.X_/)+(XX./|.XX/)+$')]

re_compiled = []
for row in regexps:
    re_compiled.append((tre.compile(row[0], tre.EXTENDED),
                        tre.compile(row[1], tre.EXTENDED),
                        tre.compile(row[2], tre.EXTENDED),
                        tre.compile(row[3], tre.EXTENDED)))

# limits: for each digit (min_len_num_hcrossings, min_len_num_vcrossings,
#         max_num_hcrossings, max_num_vcrossings,
#         min_num_hcrossings, min_num_vcrossings,
#         min_max_num_hcrossings, min_max_num_vcrossings)
limits = [(4, 4, 3, 3, 1, 1, 2, 2),  # zero
          (4, 1, 2, 2, 1, 1, 1, 1),  # one
          (4, 4, 3, 3, 1, 1, 1, 2),  # two
          (4, 4, 2, 4, 1, 1, 1, 3),  # three
          (4, 4, 3, 2, 1, 1, 2, 1),  # four
          (4, 4, 2, 3, 1, 1, 1, 3),  # five
          (4, 4, 2, 3, 1, 1, 2, 2),  # six
def search_sines2(sine, r1_f, frac_bound, pref_bound, start_line=0, step_print=1000000,
                  nlines=200000000, thresh=9, pref=60):
    global stats
    stats = {}
    print('step ', step_print, nlines)
    sine = sine[:pref]
    matcher = difflib.SequenceMatcher(isjunk=None, a=sine, b='', autojunk=False)
    total = 0
    cnt = 0
    start_time = time()
    print('candidates for sine = ')
    if start_line > 0:
        for (i, cur_seq) in enumerate(r1_f):
            if i == start_line - 1:
                break
    for cur_seq in r1_f:
        if total % step_print == 0 or total == nlines:
            print('distances for first', total, 'segments\n')
            print('========================')
            print('time elapsed', (time() - start_time) / 60.0, 'minutes')
            for k in sorted(stats):
                n = sum([i for i in stats[k][1].values()])
                print('longest common =', k, 'num matches =', n, stats[k][0], '/', cnt)
                if (total >= nlines) and (k >= thresh):
                    for (i, frac) in enumerate(sorted(stats[k][1])):
                        print(k, 'Fraction = ', frac)
                        if i == 20:
                            break
        if total == nlines:
            break
        total += 1
        matcher.set_seq2(cur_seq)
        res = matcher.find_longest_match(0, len(sine), 0, len(cur_seq))
        com = res[2]
        complete_regexp = sine[:res[0]] + '$'
        p = tre.compile(complete_regexp, tre.EXTENDED)
        # int(frac_bound * res[0]) is better perhaps, but want to trivialize it for now
        max_fuzz = res[0]
        m = p.search(cur_seq[:res[1]],
                     tre.Fuzzyness(maxcost=max_fuzz,
                                   delcost=int(1 / 4.0 * max_fuzz) + 1,
                                   inscost=int(1 / 4.0 * max_fuzz) + 1))
        if m is None:
            continue
        start_p = m.groups()[0][0]
        d = m.cost
        # This is the fraction of edit distance out of all.
        # In most cases, this is the right edit distance for the overall prefix.
        if (res[0] + com) == 0:
            print('How peculiar!', 'com =', com, 'res[0] = ', res[0], m.cost)
            continue
        frac = Fraction(d, res[0] + com)
        stats.setdefault(com, [0, collections.Counter()])
        stats[com][0] += 1
        try:
            if (start_p >= pref_bound) and Fraction(d, res[0]) <= frac_bound:
                stats[com][1][frac] += 1
                cnt += 1
        except ZeroDivisionError:
            pass
#!/usr/bin/env python
import difflib
import gzip
import sys

import tre

# define barcode format; build regex objects for approximate string matching
linker1 = "CCTAGTCGCGTAGAC"
l1reg = tre.compile(linker1)
linker1Length = len(linker1)

# define Fuzzyness for tre matching
fz = tre.Fuzzyness(maxins=0, maxdel=0, maxsub=1)


# pull in a read (four FASTQ lines) for parsing
def readread(s):
    return [s.readline().rstrip('\n'),
            s.readline().rstrip('\n'),
            s.readline().rstrip('\n'),
            s.readline().rstrip('\n')]


def diff_letters(a, b):
    return sum(a[i] != b[i] for i in range(len(a)))


def parseRead(s, o):