Code example #1
def __init__(self, regex, target_group=0, maxerr=1, caseSensitive=True):
    self.regex = regex
    self.target_group = target_group
    self.fuzzyness = tre.Fuzzyness(maxerr=maxerr)
    if not caseSensitive:
        self.r = tre.compile(regex, tre.ICASE | tre.EXTENDED)
    else:
        self.r = tre.compile(regex, tre.EXTENDED)
Code example #2
    def __init__(self, true_clones_df, nt=True):
        self.true_clones_df = true_clones_df

        column = 'nSeqCDR3' if nt else 'aaSeqCDR3'
        patterns_end_to_end = []
        patterns_any = []
        for row in true_clones_df.iterrows():
            patterns_end_to_end += [(row[1][column],
                                     tre.compile("^" + row[1][column] + "$"))]
            #patternsAny += [tre.compile(row[1].cdr3)]
            patterns_any += [(row[1][column],
                              tre.compile(row[1][column][3:-3]))]

        self.patterns_end_to_end = patterns_end_to_end
        self.patterns_any = patterns_any
Code example #3
def filter_potential_sines(in_fname,
                           sine_string,
                           sine_header=67,
                           maxerr=19,
                           reverse_complement=False):
    """
    Find candidate SINEs: reads that approximately contain the first
    sine_header characters of sine_string (within maxerr errors).
    To be used for preliminary screening (input for later steps).
    """
    with gene_lib.open_compressed(in_fname, 'rt') as in_file_handle:
        records = SeqIO.parse(in_file_handle, format="fastq")
        re = tre.compile(sine_string[:sine_header], tre.EXTENDED)
        fuzziness = tre.Fuzzyness(maxerr=maxerr)

        for rec in records:
            if reverse_complement:
                cur_seq = rec.seq.reverse_complement()
            else:
                cur_seq = rec.seq

            match = re.search(str(cur_seq), fuzziness)
            if match:
                # log(rec.seq)
                # sine_location = match.groups()  # tuple of tuples, e.g. ((2, 78),)
                SeqIO.write(rec, sys.stdout, 'fastq')
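A hypothetical invocation of the filter above (the file name and SINE prefix are placeholders, not from the original project):

# Matching records are written to sys.stdout in FASTQ format.
filter_potential_sines("reads.fastq.gz", sine_string="ACGT" * 25,
                       sine_header=67, maxerr=19)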
Code example #4
File: approx_match.py Project: lukeandrew/enki
def findApproxText(
  # Text to search for
  searchText,
  # Text in which to find the searchText
  targetText,
  # Maximum allowable cost for an approximate match. None indicates no maximum cost.
  cost = None):

    # tre.LITERAL specifies that searchText is a literal search string, not
    # a regex.
    pat = tre.compile(searchText, tre.LITERAL)
    fz = tre.Fuzzyness(maxerr = cost) if cost else tre.Fuzzyness()
    match = pat.search(targetText, fz)
    if not match:
        return None, 0, 0
    # Store the index into the target string of the first and last matched chars.
    beginInTarget, endInTarget = match.groups()[0]

    # TRE picks the first match it finds, even if there is
    # more than one match with identical error. So, manually
    # call it again excluding the found text to check. In addition,
    # make sure this match is unique: it should be 10%
    # better than the next best match.
    matchAgain = pat.search(targetText[:beginInTarget] + targetText[endInTarget:], fz)

    if matchAgain and (matchAgain.cost <= match.cost*1.1):
        ## print('Multiple matches ' + str(matchAgain.groups()))
        return None, 0, 0
    else:
        ## print(searchText + '\n' + targetText[beginInTarget:endInTarget])
        return match, beginInTarget, endInTarget
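A minimal usage sketch for findApproxText (the sample strings are invented for illustration):

# "saerch" is two edits away from "search", within the cost budget of 2.
match, begin, end = findApproxText('search text', 'Some saerch text here', cost=2)
if match:
    print(match[0], begin, end)  # matched substring and its span in the target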
Code example #5
    def check(self, path):
        """ the real check """
        super(UniventionPackageCheck, self).check(path)

        fz = tre.Fuzzyness(maxerr=2)
        pt = tre.compile("\<univention\>", tre.EXTENDED | tre.ICASE)

        for fn in uub.FilteredDirWalkGenerator(path):
            fd = open(fn, 'r')
            try:
                for lnr, line in enumerate(fd, start=1):
                    origline = line
                    if UniventionPackageCheck.RE_WHITELINE.match(line):
                        continue
                    pos = 0
                    while True:
                        m = pt.search(line[pos:], fz)
                        if m:
                            if not UniventionPackageCheck.RE_WHITEWORD.match(
                                    m[0]):
                                self.debug('%s:%d: found="%s"  origline="%s"' %
                                           (fn, lnr, m[0], origline))
                                self.addmsg(
                                    '0015-2',
                                    'univention is incorrectly spelled: %s' %
                                    m[0],
                                    filename=fn,
                                    line=lnr)
                            pos += m.groups()[0][1]
                        else:
                            break
            finally:
                fd.close()
Code example #6
File: Strings.py Project: barendt/RBS-Utilities
def mismatch_search(haystack, needle, mismatches=1):
    """Return the number of times needle occurs in haystack, allowing 
    mismatches.

    tre doesn't support multiple results out of the box, but it starts from
    the end of the sequence and works to the left, so use each result's indices
    to pare down the haystack and search again.

    """
    haystack = haystack.encode('utf-8')
    needle = needle.encode('utf-8')
    if not using_tre:
        raise RBSError("tre isn't loaded.")
    fz = tre.Fuzzyness(maxerr=mismatches, maxsub=mismatches,
                       maxdel=0, maxins=0)
    needle = ".*(%s).*" % needle
    pt = tre.compile(needle, tre.EXTENDED)
    incidence = 0
    while True:
        m = pt.search(haystack, fz)
        if m:
            index = m.groups()[1][1]-1
            incidence += 1
            haystack = haystack[:index]
        else:
            break
    return incidence
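A hypothetical call, assuming the module loaded tre successfully (using_tre is True); the sequences are invented:

n = mismatch_search(u"CCAGGAGGTTAGGCGGAA", u"AGGAGG", mismatches=1)
print n  # number of approximate occurrences (the module is Python 2)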
Code example #7
File: organized_all.py Project: anpc/sines-in-aging
def new_SINES_filter_proc_histogram(recs, main_dict, noDuplicate, key_size,
                                    fuzziness, distribution_of_neighbors,
                                    length):

    with open_compressed(noDuplicate, "wt") as handle_noDuplicate:

        count = 0
        for rec in recs:
            str_barc = str(rec.seq)
            re = tre.compile(str_barc, tre.EXTENDED)
            barc_parts_list = barcode_parts(rec, key_size)
            match = []

            for rec_part in barc_parts_list:
                is_match_barcodes_hist(main_dict[str(rec_part.seq)], rec.id,
                                       re, fuzziness, match, length)

            count = count + 1
            if count % 100000 == 0:
                print_step(count)

            if len(match) == 1:
                gene_record_write(rec, handle_noDuplicate)

            if len(match) >= length:
                distribution_of_neighbors[length - 1] += 1
            else:
                distribution_of_neighbors[len(match)] += 1
Code example #8
File: test_basic.py Project: Leonidas-from-XIV/tre
def test_search():
    """Test searching for matches in a bytestring"""
    pattern = re.compile('a([0-9])a')
    m = pattern.search('bcda7aefga8ah')
    assert m.groups() == ('7',)
    assert m.group(0) == 'a7a'
    assert m.group(1) == '7'
Code example #9
File: treMain.py Project: banool/comp30018-assn1
def checkLocations(locations):
    """
    Takes a chunk of locations and checks the tweets for these locations.
    Will be run in parallel.
    """
    output = []
    for l, origL in locations:
        # Check only for locations with spaces/tabs/etc. at the start and end.
        # This eliminates some potential matches, but most of them would be
        # garbage: it trades recall for precision.
        cmpl = tre.compile(r"\b{}\b".format(l), tre.EXTENDED)
        for t, origT in tweets:
            m = cmpl.search(t, fz)
            if m:
                out = {
                    "tweet": origT,
                    "location": origL,
                    "match": m[0],
                    "cost": m.cost,
                    "numDel": m.numdel,
                    "numIns": m.numins,
                    "numSub": m.numsub
                }
                output.append(out)
    return output
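checkLocations relies on the module-level fz and tweets; a hypothetical setup (the values are invented and the error budget is a guess):

fz = tre.Fuzzyness(maxerr=2)
# (normalized, original) pairs, matching how the loops unpack them
tweets = [("some tweet from melbourne", "Some tweet from Melbourne!")]
locations = [("melbourne", "Melbourne")]
print(checkLocations(locations))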
Code example #10
def new_SINES_filter_proc_graph(q, main_dict, key_size, fuzziness):
    while True:
        recs = q.get()
        # log(rec)

        G = nx.Graph()  # create an empty graph

        if recs is None:
            q.put(None)
            break
        for rec in recs:
            str_barc = str(rec.seq)
            G.add_node((rec.seq, rec.id))
            re = tre.compile(str_barc, tre.EXTENDED)
            # break the barcode into 4 parts
            barc_parts_list = barcode_parts(rec, key_size)
            # tuple connecting a barcode to the SINE ids within edit distance 3
            match = ()

            for rec_part in barc_parts_list:
                match = is_match_barcodes_graph(main_dict[str(rec_part.seq)],
                                                rec.id, re, fuzziness,
                                                match)  # create the match

            print(type(match))
            print("this is match: ", match)
            for m in match:
                # create an edge between the barcode and its...
                G.add_edge((rec.seq, rec.id), (m[0], tuple(m[1])))

        q.put((rec, match))
        nx.draw(G)
        plt.show()

    log("Slave process exited")
Code example #11
File: test_basic.py Project: Leonidas-from-XIV/tre
def test_search_unicode():
    """Test searching for matches in a unicode string"""
    pattern = re.compile(u'ä([0-9])ö')
    m = pattern.search(u'bcdä7öefga8ah')
    assert m.groups() == (u'7',)
    assert m.group(0) == u'ä7ö'
    assert m.group(1) == u'7'
Code example #12
File: sines.py Project: ayeletYe/sines-in-aging
def filter_potential_sines(records, sine_pattern, sine_header=67, maxerr=14):
    re = tre.compile(sine_pattern[:sine_header], tre.EXTENDED)
    fuzziness = tre.Fuzzyness(maxerr=maxerr)

    for rec in records:
        match = re.search(str(rec.seq), fuzziness)
        if match:
            yield rec
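A minimal driver for the generator above (the path and SINE consensus are placeholders; the surrounding project reads the real B1 SINE from a FASTA file):

from Bio import SeqIO
import tre

sine_pattern = "ACGT" * 20  # placeholder; use the real SINE consensus here
records = SeqIO.parse("reads.fastq", "fastq")
for rec in filter_potential_sines(records, sine_pattern, sine_header=67, maxerr=14):
    print(rec.id)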
Code example #13
File: test_basic.py Project: Leonidas-from-XIV/tre
def test_match_groups():
    pattern = re.compile('a([0-9])')
    m = pattern.match('a4ra6')
    assert m is not None
    assert m.groups() == ('4',)
    assert m.group() == 'a4'
    assert m.group(0) == 'a4'
    assert m.group(1) == '4'
Code example #14
File: test_basic.py Project: Leonidas-from-XIV/tre
def test_match():
    """Test matching"""
    pattern = re.compile('zat')
    m = pattern.match('zatazata')
    assert m is not None
    assert m.groups() == tuple()
    assert m.group() == 'zat'
    assert m.group(0) == 'zat'
Code example #15
File: test_basic.py Project: Leonidas-from-XIV/tre
def test_search_approx():
    """Test approximate search"""
    pattern = re.compile(u'abc([0-9])abc')
    m = pattern.approx(u'asdfabc5acbasdfsd', cost_subst=1, max_costs=10, max_subst=10, max=10)
    assert m is not None
    assert m.groups() == ('5',)
    assert m.group(0) == 'abc5acb'
    assert m.cost == 2
    assert m.num == (0, 0, 2)
Code example #16
File: test_basic.py Project: Leonidas-from-XIV/tre
def test_module_match():
    """Tests whether tre.match() finds the same as a compiled regex"""
    regex = r'a([0-9])a'
    text = 'a3abda6ama7ada'
    m1 = re.compile(regex).match(text)
    m2 = re.match(regex, text)
    assert m1.groups() == m2.groups()
    assert m1.group(0) == m2.group(0)
    assert m1.group(1) == m2.group(1)
Code example #17
File: test_basic.py Project: Leonidas-from-XIV/tre
def test_finditer():
    """Test whether finditer() returns the proper matches"""
    pattern = re.compile('[0-9]')
    results = pattern.finditer('d3t4 ru7e5!')
    # check for each one and for the exception, not by using list(results)
    assert results.next() == '3'
    assert results.next() == '4'
    assert results.next() == '7'
    assert results.next() == '5'
    assert_raises(StopIteration, results.next)
Code example #18
File: organized_all.py Project: anpc/sines-in-aging
def filter_potential_sines_and_locations(in_file_unify,
                                         in_file_sine,
                                         out_file_with_sine,
                                         out_file_location,
                                         sine_header=67,
                                         maxerr=14):
    sine = gene_lib.get_sine_forward(in_file_sine)  #"B1.fasta"
    re = tre.compile(sine[:sine_header], tre.EXTENDED)
    fuzziness = tre.Fuzzyness(maxerr=maxerr)


    with open_compressed(in_file_unify, "rt") as handle_read, \
      open_compressed(out_file_with_sine, "wt") as handle_write_sine,\
      open_compressed(out_file_location, "wt") as handle_write_loc:

        records = gene_records_parse(handle_read)
        rec_i = 0
        filter_potential_sines_and_locations_proc(records, re, fuzziness,
                                                  handle_write_sine,
                                                  handle_write_loc)
Code example #19
def new_SINES_filter_proc_graph(recs, main_dict, key_size, fuzziness, i=0):
    G = nx.Graph()  # create an empty graph
    if (i == 0):
        graph_file = 'graphPart'
    else:
        graph_file = 'graphPart' + str(i)
    main_key_len = int(36 / (3 + 1))
    for i, rec in enumerate(recs):
        rec_part = list(barcode_wins(rec, main_key_len))[0]
        str_barc_part = str(rec_part.seq)
        sec_dict = main_dict[str_barc_part]
        str_barc = str(rec.seq)
        #        print ('sec_dict type', type(sec_dict[str_barc]))
        #        print(type(rec.id))
        if (sec_dict[str_barc] == rec.id):
            G.add_node((rec.seq, rec.id))

            re = tre.compile(str_barc, tre.EXTENDED)
            # break the barcode into 4 parts
            barc_parts_list = barcode_parts(rec, key_size)
            # tuple connecting a barcode to the SINE ids within edit distance 3
            match = ()

            for rec_part in barc_parts_list:
                match = is_match_barcodes_graph(main_dict[str(rec_part.seq)],
                                                rec.id, re, fuzziness,
                                                match)  # create the match
                # print(type(match))
                # print("this is match: ", match)
                for m in match:
                    if str(rec.seq) != str(m[0]):
                        # create an edge between the barcode and its...
                        G.add_edge((rec.seq, rec.id), (m[0], m[1]))

    outfile = open(graph_file, 'wb')
    pickle.dump(G, outfile)
    outfile.close()
    nx.draw(G)
    log("Slave process exited")
Code example #20
def new_SINES_filter_proc_histogram(q, main_dict, key_size, fuzziness):
    while True:
        recs = q.get()
        # log(rec)

        if recs is None:
            q.put(None)
            break

        for rec in recs:
            str_barc = str(rec.seq)
            re = tre.compile(str_barc, tre.EXTENDED)
            barc_parts_list = barcode_parts(rec, key_size)
            match = []

            for rec_part in barc_parts_list:
                is_match_barcodes_hist(main_dict[str(rec_part.seq)], rec.id,
                                       re, fuzziness, match)

            q.put((rec, match))

    log("Slave process exited")
Code example #21
File: showResults.py Project: anpc/sines-in-aging
def showResult(file_centers, in_file_sine, sine_header=67, maxerr=19):
    total = 0
    hist = {}
    sine = gene_lib.get_sine_forward(in_file_sine)  # "B1.fasta"
    re = tre.compile(sine[:sine_header], tre.EXTENDED)
    print('original sine', sine)
    fuzziness = tre.Fuzzyness(maxerr=maxerr)
    with open(file_centers, "r") as centerFile:
        for line in centerFile:
            currentLine = line.strip()
            match = re.search(currentLine, fuzziness)
            if match is None:
                continue
            sine_location = match.groups()
            # edit distance between the matched stretch of the SINE and the center
            dist = nltk.edit_distance(
                sine[sine_location[0][0]:sine_location[0][1]], currentLine)
            hist[dist] = hist.get(dist, 0) + 1
            total += dist
        print(total / 1000)  # mean, assuming 1000 centers in the input file
        print(sorted(hist.items()))
Code example #22
File: are.py Project: DataEssential/corp-data
def __init__(self, pattern, maxErrors=None):
    """ Initialize an ARE with pattern `pattern`, given as a string.
        The number of allowed errors is maxErrors.
        If it is None, any number of errors is allowed, and the ARE is
        essentially used to count the number of errors required to match.
    """
    # Copy constructor
    if isinstance(pattern, ARE):
        self.maxErrors = pattern.maxErrors
        self.pattern = pattern.pattern
    # Normal constructor
    else:
        # self.pattern = unidecode(pattern)
        self.pattern = pattern
        self.maxErrors = maxErrors

    self.fuzzyness = tre.Fuzzyness()
    if self.maxErrors is not None:
        self.fuzzyness.maxerr = self.maxErrors
    # self.fuzzyness.maxcost = self.maxErrors
    self.fuzzyness.subcost = 1.5

    self.re = tre.compile(pattern, tre.EXTENDED)
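A hypothetical use of the ARE wrapper above, counting the errors needed to match a misspelled name (the strings are invented):

are = ARE("John Smith", maxErrors=2)
m = are.re.search("Jonh Smith", are.fuzzyness)
if m:
    print(m.cost)  # weighted edit cost of the best approximate match (subcost = 1.5)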
Code example #23
def filter_potential_sines_and_locations(in_file_unify,
                                         in_file_sine,
                                         out_file_with_sine,
                                         out_file_location,
                                         sine_header=67,
                                         maxerr=14):
    sine = gene_lib.get_sine_forward(in_file_sine)  # "B1.fasta"
    re = tre.compile(sine[:sine_header], tre.EXTENDED)
    fuzziness = tre.Fuzzyness(maxerr=maxerr)


    with open_compressed(in_file_unify, "rt") as handle_read, \
            open_compressed(out_file_with_sine, "wt") as handle_write_sine, \
            open_compressed(out_file_location, "wt") as handle_write_loc:

        records = gene_records_parse(handle_read)

        for rec in tqdm(records, miniters=100):
            match = re.search(str(rec.seq), fuzziness)
            if match:
                sine_location = match.groups()
                gene_record_write(rec, handle_write_sine, 'fasta')
                handle_write_loc.write(
                    ",".join([str(i) for i in sine_location[0]]) + "\n")
Code example #24
import tre

fz = tre.Fuzzyness(maxerr = 3)
print fz

pt = tre.compile("Beulan Lake", tre.EXTENDED)
data = """
In addition to fundamental contributions in several branches of
theoretical computer science, Beulahh Lake is the creator of the
TeX computer typesetting system, the related METAFONT font definition
language and rendering system, and the Computer Modern family of
typefaces.
"""

m = pt.search(data, fz)

if m:
    print m.groups()
    print m[0]
Code example #25
File: sines.py Project: Yitzhakbin9/sines-in-aging
def search_sines(sine_f, r1_f, override = 0, upper_mut_dist = 30, step_print = 10000, nlines = 500000, sine_l = 80):
    print ('override =',override)
    sine_set = []
    stats = collections.Counter()

    global bar_codes
    bar_codes = {}
    
    global detailed_stats
    detailed_stats = collections.Counter()
    
    global distances_from_combined_regexp
    distances_from_combined_regexp = {}

    matcher = difflib.SequenceMatcher()
    
    for sine_record in SeqIO.parse(sine_f, "fasta"):
        cur_seq = Seq(str(sine_record.seq)[:sine_l], IUPAC.IUPACAmbiguousDNA())
        cur_seq_rc = cur_seq.reverse_complement()
        sine_set.append(str(cur_seq))
        sine_set.append(str(cur_seq_rc))
        print(cur_seq, cur_seq_rc, '''\n ======================''')

    complete_regexp = '''|'''.join(sine_set)
    p = tre.compile(complete_regexp, tre.EXTENDED)

    if override == 1:
        bases = ['A','C','G','T']
        ind_list = [random.randrange(4) for i in range(sine_l)]
        r_sine = ''.join( [bases[ind_list[i]] for i in range(sine_l)] )
        r_sine_rc = ''.join( [bases[3-ind_list[i]] for i in range(sine_l)] )
        sine_set = [r_sine, r_sine_rc]
        complete_regexp = '''|'''.join(sine_set)
        p = tre.compile(complete_regexp, tre.EXTENDED)

    # Also specifies the shift  range   
    if override > 1:
        if override > 2:
            d = override - 1 #random.randrange(2, override)
            print('skipping ',d)
            for (i,cur_seq) in enumerate(r1_f):
                if i == d:
                    break
                
        sine_set = []
        for (i,s) in enumerate(r1_f):
            cur_seq = Seq(s[:sine_l], IUPAC.IUPACAmbiguousDNA())
            cur_seq_rc = cur_seq.reverse_complement()
            sine_set.append(str(cur_seq))
            sine_set.append(str(cur_seq_rc))
            if i == 2:
                break
            
        complete_regexp = '''|'''.join(sine_set)
        p = tre.compile(complete_regexp, tre.EXTENDED)     

        
    total = 0
    cnt = 0
    start_time = time()
    print('''sequences = ''')

    bar_code_len = 60                         
    for cur_seq in r1_f:
        total += 1
        m = p.search(cur_seq, tre.Fuzzyness(maxerr = upper_mut_dist))
        if m:
            res = m.group(0)
            d = m.cost
            # Filter out strings that were cut out. Approximate by max-length matches
            # 10 is arbitrary, not very small
            if (m.groups()[0][1] < len(cur_seq) - 10) and (m.groups()[0][0] > 40):
                # print(m.groups(), len(cur_seq))
                cnt += 1      
                stats[d] += 1

                bar_code = cur_seq[m.groups()[0][0] - 40 : m.groups()[0][0]]

                if bar_code in bar_codes:
                   bar_codes[bar_code] +=  1
                else:
                    bar_codes[bar_code] = 1

            detailed_stats[res] += 1
            distances_from_combined_regexp[res] = d 

        if (total % step_print == 0 or total == nlines):
            print('''distances for first''', total, '''segments \n''')
            print('''========================''')
            print('''time elapsed''', (time() - start_time)/60.0, '''minutes''')
            for k in sorted(stats):
                print('edit distance =', k, 'matches =', stats[k], '''/''',cnt)
        
        if (total == nlines):
            break
Code example #26
File: example.py Project: banool/comp30018-assn1
import tre

fz = tre.Fuzzyness(maxerr = 3)
print fz

pt = tre.compile("San Francisco", tre.EXTENDED)
data = """
In addition to fundamental contributions in several branches of
theoretical computer science, Donnald Erwin Kuth is the creator of the
TeX computer typesetting system, the related METAFONT font definition
language and rendering system, and the Computer Modern family of
typefaces in San Francisco hey bois.
"""

m = pt.search(data, fz)

if m:
    print m.groups()
    print m[0]
Code example #27
import tre

fz = tre.Fuzzyness(maxcost = 3)

print fz

pt = tre.compile("(foo)(bar)", tre.EXTENDED)

m = pt.match("zoobag", fz)

if m:
    print m.groups()
    print m[2]
Code example #28
File: sines.py Project: ayeletYe/sines-in-aging
def search_sines(sines,
                 r1_f,
                 override=0,
                 upper_mut_dist=20,
                 step_print=1000000,
                 nlines=100000000,
                 sine_l=70):

    print('override =', override)
    sine_set = []
    stats = collections.Counter()

    global bar_codes
    bar_codes = {}

    global detailed_stats
    detailed_stats = collections.Counter()

    global distances_from_combined_regexp
    distances_from_combined_regexp = {}

    complete_regexp = '''|'''.join([sine[:sine_l] for sine in sines])
    p = tre.compile(complete_regexp, tre.EXTENDED)

    if override == 1:
        bases = ['A', 'C', 'G', 'T']
        ind_list = [random.randrange(4) for i in range(sine_l)]
        r_sine = ''.join([bases[ind_list[i]] for i in range(sine_l)])
        r_sine_rc = ''.join([bases[3 - ind_list[i]] for i in range(sine_l)])
        sine_set = [r_sine, r_sine_rc]
        complete_regexp = '|'.join(sine_set)
        p = tre.compile(complete_regexp, tre.EXTENDED)

    # Also specifies the shift  range
    if override > 1:
        if override > 2:
            d = override - 1  #random.randrange(2, override)
            print('skipping ', d)
            for (i, cur_seq) in enumerate(r1_f):
                if i == d:
                    break

        sine_set = []
        for (i, s) in enumerate(r1_f):
            cur_seq = Seq(s[:sine_l], IUPAC.IUPACAmbiguousDNA())
            cur_seq_rc = cur_seq.reverse_complement()
            sine_set.append(str(cur_seq))
            sine_set.append(str(cur_seq_rc))
            if i == 2:
                break

        complete_regexp = '|'.join(sine_set)
        p = tre.compile(complete_regexp, tre.EXTENDED)

    total = 0
    cnt = 0
    start_time = time()
    print('''sequences = ''')

    # bar_code_len = 60
    for cur_seq in r1_f:
        total += 1
        m = p.search(cur_seq, tre.Fuzzyness(maxerr=sine_l - 10))
        if m:
            res = m.group(0)
            d = m.cost
            # Filter out strings that were cut out. Approximate by max-length matches
            # 10 is arbitrary, not very small
            # barcodes are not in place here

            stats[d] += 1
            bar_code_min_len = 23
            #  if (m.groups()[0][1] < len(cur_seq) - 5) and (d <= upper_mut_dist):

            if (m.groups()[0][0] >= bar_code_min_len) and (d <=
                                                           upper_mut_dist):
                cnt += 1
                detailed_stats[res] += 1
                bar_code = cur_seq[m.groups()[0][0] -
                                   bar_code_min_len:m.groups()[0][0]]

                bar_codes.setdefault(bar_code, 0)
                bar_codes[bar_code] += 1

        #    distances_from_combined_regexp[res] = d

        if (total % step_print == 0) or (total == nlines):
            print('''stats for first''', total, '''segments \n''')
            print('''========================''')
            print('''time elapsed''', (time() - start_time) / 60.0,
                  '''minutes''')

            for k in sorted(stats):
                print('edit distance =', k, 'matches =', stats[k], '''/''',
                      cnt)
        #   pprint.pprint(collections.Counter(detailed_stats.values()))

        if (total == nlines):
            return bar_codes
Code example #29
File: example.py Project: jsanch/okra
import tre

fz = tre.Fuzzyness(maxerr = 3)
print fz

pt = tre.compile("Don(ald( Ervin)?)? Knuth", tre.EXTENDED)
data = """
In addition to fundamental contributions in several branches of
theoretical computer science, Donnald Erwin Kuth is the creator of the
TeX computer typesetting system, the related METAFONT font definition
language and rendering system, and the Computer Modern family of
typefaces.

"""

m = pt.search(data, fz)

print dir(pt)
if m:
    print m.groups()
    print m[0]
    print m[1]
Code example #30
File: tests_compat.py Project: Leonidas-from-XIV/tre
def test_valid_compile():
    """Tests for compilation of patterns which should be ok"""
    pattern = tre.compile("a")
Code example #31
File: example.py Project: digitalbiology/tre
import tre

fz = tre.Fuzzyness(maxerr=3)
print(fz)

pt = tre.compile("Don(ald( Ervin)?)? Knuth", tre.EXTENDED)
data = """
In addition to fundamental contributions in several branches of
theoretical computer science, Donnald Erwin Kuth is the creator of the
TeX computer typesetting system, the related METAFONT font definition
language and rendering system, and the Computer Modern family of
typefaces.

"""

m = pt.search(data, fz)

if m:
    print(m.groups())
    print(m[0])
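The search above succeeds only because of the error budget: the text misspells the name as "Donnald Erwin Kuth". As a sketch, the same search with a zero budget should fail:

strict = pt.search(data, tre.Fuzzyness(maxerr=0))
print(strict)  # expected: None, since the text contains no exact match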
Code example #32
File: test_basic.py Project: Leonidas-from-XIV/tre
def test_search_nomatch():
    """Test whether a string with no match returns None"""
    pattern = re.compile("Doesn't exist")
    assert pattern.search('In this text') is None
Code example #33
File: test_basic.py Project: Leonidas-from-XIV/tre
def test_match_nomatch():
    """Test matching with strings that don't match"""
    pattern = re.compile('a')
    m = pattern.match('zzzzaaaa')
    assert m is None
Code example #34
        elif len(words) < 3:
            print "Found invalid line in primer file", line.strip()
            continue
        primername = words[0]
        primer1seq = replace_ambiguity_codes(words[1].upper())
        primer2seq = replace_ambiguity_codes(words[2].upper())
        if options.rcreverse:
            primer2seq = revcomp(primer2seq)

        if patterns.has_key(primername):
            print "Error: Two of your regions have the same name:", primername
            print "Skipping..."
            continue

        patterns[primername] = [[], []]
        patterns[primername][0].append(tre.compile(primer1seq, tre.EXTENDED))
        patterns[primername][0].append(
            tre.compile(revcomp(primer1seq), tre.EXTENDED))
        patterns[primername][1].append(tre.compile(primer2seq, tre.EXTENDED))
        patterns[primername][1].append(
            tre.compile(revcomp(primer2seq), tre.EXTENDED))

        if len(words) >= 5:
            try:
                min_product_len = int(words[4])
            except ValueError:
                print "Invalid minimum product length for region", primername
                min_product_len = 0
        if len(words) >= 4:
            try:
                max_product_len = int(words[3])
Code example #35
File: okraparser.py Project: jsanch/okra
def basic_scan(image_name):
	full_image_path = images_location + image_name
	# New file
	# dest = open(os.path.join(parser_location), 'w')
	# shutil.copy(image.buffer(), dest)

	tre_fuzzyness = tre.Fuzzyness(delcost = 3, inscost = 1, subcost = 2, maxcost = 2)
	tre_matcher = tre.compile(ere_end_of_line_price, tre.EXTENDED)
	
	# print os.path.join(parser_location, 'ocr.sh') + ' ' + full_image_path
	os.system(os.path.join(parser_location, 'ocr.sh') + ' ' + full_image_path)
	# os.path.join(parser_location, '/ocr.sh')

	raw_tab_data = []
	tab_items = []
	tab_meta = []

	with open(full_image_path + '1.txt','r') as file:
		for line in file.read().splitlines():
			# print line
			# line = line.encode('punycode')
			debug(line)
			tre_match = tre_matcher.search(line, tre_fuzzyness)
			if tre_match:
				tmp_description = re.sub(r'[\s:]*' + re.escape(tre_match.group(0)), '', line).lower()
				tmp_value = tre_match.group(0).strip()
				if len(tmp_description) > 2:
					raw_tab_data.append({'description' : tmp_description, 'value' : tmp_value})


	tre_fuzzyness = tre.Fuzzyness(maxerr = 3)
	tab_meta = {}

	cut_off_meta = 0

	for raw_item in raw_tab_data:
		raw_item_description = raw_item['description']
		raw_item_value = raw_item['value']
		matches_to_compare = []
		for parser_key in config['mid_parsers']:
			tre_matcher = tre.compile(config['mid_parsers'][parser_key]['ere'], tre.EXTENDED)
			tre_match = tre_matcher.search(name_fix(raw_item_description), tre_fuzzyness)
			debug('xxxxxxxxxxxxxxxxxxxxxxxxxx')
			debug(name_fix(raw_item_description) + ' XXX ' + config['mid_parsers'][parser_key]['ere'] + ' XXX ' + raw_item_value)
			if tre_match:
				debug('match')
				matches_to_compare.append((tre_match, config['mid_parsers'][parser_key]['string']))

		if matches_to_compare:
			# If there were matches
			cut_off_meta += 1
			min = matches_to_compare[0]
			for match in matches_to_compare:
				if match[0].cost < min[0].cost:
					min = match
			tab_meta[min[1]] = raw_item_value
		else:
			debug('SHOULD HAVE BEEN CUT OFF')
			if cut_off_meta < 1:
				tab_items.append(raw_item)

	tab = {'tab_items' : tab_items, 'tab_meta' : tab_meta}
	print tab

	# print analyze_tab(tab)
	x = analyze_tab(tab)
	print x
	return x


Code example #36
def filter_potential_sines_and_locations(in_file_unify, in_file_sine, out_file_with_sine, out_file_location, sine_header=67, maxerr=14):
    sine = gene_lib.get_sine_forward(in_file_sine)  #"B1.fasta"
    re = tre.compile(sine[:sine_header], tre.EXTENDED)
    fuzziness = tre.Fuzzyness(maxerr=maxerr)

    # Create slave processes
    procs = []
    for _ in range(multiprocessing.cpu_count() - 3):
        # Create a communication queue between this process and slave process
        q = GeneDQueue()
        
        # Create and start slave process
        p = Process(target=filter_potential_sines_and_locations_proc, args=(q, re, fuzziness))
        p.start()

        procs.append({
            'p': p,
            'q': q,
            'batch': [],
            'write_i': 0
        })

    with open_any(in_file_unify, "rt") as handle_read, \
         open_any(out_file_with_sine, "wt") as handle_write_sine,\
         open_any(out_file_location, "wt") as handle_write_loc:


        records = gene_records_parse(handle_read)
        rec_i = 0

        for rec in tqdm(records, miniters=100):
            # Simple round-robin between the slave processes
            proc = procs[rec_i % len(procs)]

            # Add a new record into a local batch array of slave process
            proc['batch'].append(rec)

            if len(proc['batch']) >= 20:
                # Get found potential sine from slave process queue
                #
                # Optimization:
                # Don't check the slave queue every iteration, as the check
                # slows things down. Moreover, we won't get a potential sine
                # for every record.
                if proc['write_i'] > 3:
                    filter_potential_sines_and_locations_write(proc['q'], handle_write_sine, handle_write_loc)
                    proc['write_i'] = 0
                else:
                    proc['write_i'] += 1

                # Put batch of new records into slave process queue
                proc['q'].put(proc['batch'])

                # Reset local batch of slave process
                proc['batch'] = []

            # Uncomment for testing on a small number of records
            # if rec_i == 100000:
            #     break

            rec_i += 1
        
        # Cleanup slave processes
        for proc in procs:
            # Get found potential sine from slave process queue, before last batch
            filter_potential_sines_and_locations_write(proc['q'], handle_write_sine, handle_write_loc)

            # Put the last batch, if available
            if len(proc['batch']):
                proc['q'].put(proc['batch'])
                proc['batch'] = []
            
            # Make the slave process terminate
            proc['q'].put(None)

            # Wait for termination
            proc['p'].join()
            
            # Get found potential sine from slave process queue, very last time
            filter_potential_sines_and_locations_write(proc['q'], handle_write_sine, handle_write_loc)
Code example #37
def merged_paired_ends(records1, records2):
    tot_good = 0
    tot_great = 0
    tot = 0
    #    log('in merged_paired_ends',records1,records2)
    for (rec1, rec2) in zip(records1, records2):
        tot += 1
        str1 = str(rec1.seq)
        str2 = str(rec2.seq.reverse_complement())
        #        log('-------------------------------------------\n matching ',str1,'\n',str2,'\n===================================================')
        end1 = str1[-common_req:]
        re = tre.compile(end1, tre.EXTENDED)
        # we expect small errors here
        res_seq = None
        match = re.search(str2, tre.Fuzzyness(maxerr=init_err))
        if match:
            tot_good += 1
            match_loc = match.groups()[0][0]
            to_search_len = match_loc + common_req
            fuzzyness = max(tot_err, ceil(0.1 * to_search_len))
            re = tre.compile(str1[-to_search_len:], tre.EXTENDED)
            match_tot = re.search(str2, tre.Fuzzyness(maxerr=fuzzyness))
            #           log('step1: matched ',end1,' at',match_loc,' testing prefix ',str2[:to_search_len],'cost ',match.cost)
            if match_tot:
                #    if (tot_good % 100 == 0):
                #        log('fuzzyness = ', fuzzyness)
                #              log('step2: matched ',str1[-to_search_len:],' at',match_tot.groups()[0][0],' testing prefix ','cost ',match.cost)
                tot_great += 1
                # An arbitrary decision: take the common string from r2
                res_str = str1[:-to_search_len] + str2
                # TODO: preserve qualities
                res_seq = SeqRecord(Seq(res_str),
                                    id=rec1.id,
                                    name=rec1.name,
                                    description=rec1.description,
                                    letter_annotations={
                                        "phred_quality":
                                        [30 for i in range(len(res_str))]
                                    })
                if (tot_great % step == 0):
                    log('nicely matched ', str1, '\n', str2, to_search_len,
                        match_tot.group(0), match.group(0), match_tot.cost,
                        match.cost)
#             log('result = ',str(res_seq.seq))
                yield res_seq
                continue

        res_str = str1 + ('N' * padding) + str2
        res_seq = SeqRecord(Seq(res_str),
                            id=rec1.id,
                            name=rec1.name,
                            description=rec1.description,
                            letter_annotations={
                                "phred_quality":
                                [30 for i in range(len(res_str))]
                            })
        if (tot % step == 0):
            log(tot, tot_good, tot_great)
        # log('matched ',str1,'\n',str2, len(str1), len(str2))
#      log('result = ',str(res_seq.seq))
        yield res_seq
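A minimal driver sketch for the merger above (file names are placeholders; the module-level constants it uses, such as common_req, init_err, tot_err, padding and step, are assumed to be defined elsewhere in the project):

from Bio import SeqIO

records1 = SeqIO.parse("sample_R1.fastq", "fastq")
records2 = SeqIO.parse("sample_R2.fastq", "fastq")
with open("merged.fastq", "w") as out:
    SeqIO.write(merged_paired_ends(records1, records2), out, "fastq")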
Code example #38
File: tests_compat.py Project: Leonidas-from-XIV/tre
def test_compile_twice():
    """Tests whether a pattern can be compiled twice"""
    old = tre.compile("a")
    new = tre.compile(old)
    assert old is new
Code example #39
    (
        r'^1{0,2}22+1+22+1{0,2}$',  # eight
        r'^1{0,2}2{0,2}333+2{0,2}1{0,2}$',
        r'^/(.../)(.../){0,3}(X_X/)+(.../)*(.X_/|_X./|X__/|__X/)+' +
        r'(.../)*(X_X/)+',
        r'(XXX/)(XXX/)(XXX/)'),
    (
        r'^1{0,2}22+3?2?12?11+$',  # nine
        r'^1{0,2}(2+3+2*|222+)1+$',
        r'^/(.../)+(X_X/)+(_XX/|XXX/)(_XX/|XXX/)?(_.X/)+(_X./)*$',
        r'^/(X._/|.X_/)(X._/|.X_/)+(XX./|.XX/)+$')
]
re_compiled = []
for row in regexps:
    re_compiled.append(
        (tre.compile(row[0], tre.EXTENDED), tre.compile(row[1], tre.EXTENDED),
         tre.compile(row[2], tre.EXTENDED), tre.compile(row[3], tre.EXTENDED)))

# limits: for each digit (min_len_num_hcrossings, min_len_num_vcrossings,
#                         max_num_hcrossings, max_num_vcrossings,
#                         min_num_hcrossings, min_num_vcrossings,
#                         min_max_num_hcrossings, min_max_num_vcrossings)
limits = [
    (4, 4, 3, 3, 1, 1, 2, 2),  # zero
    (4, 1, 2, 2, 1, 1, 1, 1),  # one
    (4, 4, 3, 3, 1, 1, 1, 2),  # two
    (4, 4, 2, 4, 1, 1, 1, 3),  # three
    (4, 4, 3, 2, 1, 1, 2, 1),  # four
    (4, 4, 2, 3, 1, 1, 1, 3),  # five
    (4, 4, 2, 3, 1, 1, 2, 2),  # six
    (4, 3, 2, 3, 1, 1, 1, 2),  # seven
Code example #40
File: test_basic.py Project: Leonidas-from-XIV/tre
def test_findall():
    """Test whether findall() returns the proper list of matches"""
    pattern = re.compile('[0-9]')
    results = pattern.findall('d3t4 ru7e5!')
    assert results == ['3', '4', '7', '5']
Code example #41
File: ocr.py Project: Alvibanez/eyegrade
            r'1+2(2|1)+11?',
            r'^/(X../|..X/){0,3}(_X./|_.X/)+(.XX/)(.XX/)?(_X./|_.X/)'
            + r'(_X./|_.X/)+(.X_/|X._/)*$',
            r'^/(_X./)*(X._/)+(.XX/)+.*$'),
           (r'^1{0,2}22+1+22+1{0,2}$', # eight
            r'^1{0,2}2{0,2}333+2{0,2}1{0,2}$',
            r'^/(.../)(.../){0,3}(X_X/)+(.../)*(.X_/|_X./|X__/|__X/)+'
            + r'(.../)*(X_X/)+',
            r'(XXX/)(XXX/)(XXX/)'),
           (r'^1{0,2}22+3?2?12?11+$', # nine
            r'^1{0,2}(2+3+2*|222+)1+$',
            r'^/(.../)+(X_X/)+(_XX/|XXX/)(_XX/|XXX/)?(_.X/)+(_X./)*$',
            r'^/(X._/|.X_/)(X._/|.X_/)+(XX./|.XX/)+$')]
re_compiled = []
for row in regexps:
    re_compiled.append((tre.compile(row[0], tre.EXTENDED),
                        tre.compile(row[1], tre.EXTENDED),
                        tre.compile(row[2], tre.EXTENDED),
                        tre.compile(row[3], tre.EXTENDED)))

# limits: for each digit (min_len_num_hcrossings, min_len_num_vcrossings,
#                         max_num_hcrossings, max_num_vcrossings,
#                         min_num_hcrossings, min_num_vcrossings,
#                         min_max_num_hcrossings, min_max_num_vcrossings)
limits = [(4, 4, 3, 3, 1, 1, 2, 2), # zero
          (4, 1, 2, 2, 1, 1, 1, 1), # one
          (4, 4, 3, 3, 1, 1, 1, 2), # two
          (4, 4, 2, 4, 1, 1, 1, 3), # three
          (4, 4, 3, 2, 1, 1, 2, 1), # four
          (4, 4, 2, 3, 1, 1, 1, 3), # five
          (4, 4, 2, 3, 1, 1, 2, 2), # six
Code example #42
File: sines.py Project: ayeletYe/sines-in-aging
def search_sines2(sine,
                  r1_f,
                  frac_bound,
                  pref_bound,
                  start_line=0,
                  step_print=1000000,
                  nlines=200000000,
                  thresh=9,
                  pref=60):

    global stats
    stats = {}

    print('step ', step_print, nlines)
    sine = sine[:pref]
    matcher = difflib.SequenceMatcher(isjunk=None,
                                      a=sine,
                                      b='',
                                      autojunk=False)

    total = 0
    cnt = 0
    start_time = time()
    print('''candidates for sine = ''')

    if start_line > 0:
        for (i, cur_seq) in enumerate(r1_f):
            if i == start_line - 1:
                break

    for cur_seq in r1_f:

        if (total % step_print == 0 or total == nlines):
            print('''distances for first''', total, '''segments \n''')
            print('''========================''')
            print('''time elapsed''', (time() - start_time) / 60.0,
                  '''minutes''')
            for k in sorted(stats):
                n = sum([i for i in stats[k][1].values()])
                print('longest common =', k, 'num matches =', n, stats[k][0],
                      '''/''', cnt)
                if (total >= nlines) and (k >= thresh):
                    for (i, frac) in enumerate(sorted(stats[k][1])):
                        print(k, 'Fraction = ', frac)
                        if i == 20:
                            break

        if (total == nlines):
            break

        total += 1
        matcher.set_seq2(cur_seq)
        res = matcher.find_longest_match(0, len(sine), 0, len(cur_seq))
        com = res[2]

        complete_regexp = sine[:res[0]] + '$'
        p = tre.compile(complete_regexp, tre.EXTENDED)
        # int(frac_bound * res[0]) is perhaps better, but keep it trivial for now
        max_fuzz = res[0]
        m = p.search(
            cur_seq[:res[1]],
            tre.Fuzzyness(maxcost=max_fuzz,
                          delcost=int(1 / 4.0 * max_fuzz) + 1,
                          inscost=int(1 / 4.0 * max_fuzz) + 1))
        if m is None:
            continue

        start_p = m.groups()[0][0]
        d = m.cost

        # This is the fraction of edit distance out of all.
        # In most cases, this is the right edit distance for the overall prefix

        if (res[0] + com) == 0:
            print('How peculiar!', 'com =', com, 'res[0] = ', res[0], m.cost)
            continue

        frac = Fraction(d, res[0] + com)

        stats.setdefault(com, [0, collections.Counter()])
        stats[com][0] += 1

        try:
            if (start_p >= pref_bound) and Fraction(d, res[0]) <= frac_bound:
                stats[com][1][frac] += 1
                cnt += 1
        except (ZeroDivisionError):
            pass
Code example #43
#!/usr/bin/env python

import difflib
import sys
import tre
import gzip

# define barcode format; build regex objects for approximate string matching
linker1 = "CCTAGTCGCGTAGAC"
l1reg = tre.compile(linker1)
linker1Length = len(linker1)

# define Fuzzyness for tre matching
fz = tre.Fuzzyness(maxins=0, maxdel=0, maxsub=1)


# pull in read for parsing
def readread(s):
    return [
        s.readline().rstrip('\n'),
        s.readline().rstrip('\n'),
        s.readline().rstrip('\n'),
        s.readline().rstrip('\n')
    ]


def diff_letters(a, b):
    return sum(a[i] != b[i] for i in range(len(a)))


def parseRead(s, o):