def __init__(self, regex, target_group=0, maxerr=1, caseSensitive=True):
    self.regex = regex
    self.target_group = target_group
    self.fuzzyness = tre.Fuzzyness(maxerr=maxerr)
    if not caseSensitive:
        self.r = tre.compile(regex, tre.ICASE | tre.EXTENDED)
    else:
        self.r = tre.compile(regex, tre.EXTENDED)
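# Usage sketch (an assumption: only __init__ is shown above, so the enclosing
# class name is not visible; call it FuzzyMatcher here). The call below uses
# only the attributes __init__ actually sets (.r and .fuzzyness).
matcher = FuzzyMatcher("univention", maxerr=2, caseSensitive=False)
m = matcher.r.search("Univenton GmbH", matcher.fuzzyness)
if m:
    print(m[0], m.cost)  # matched text and its edit cost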
def __init__(self, true_clones_df, nt=True):
    self.true_clones_df = true_clones_df
    column = 'nSeqCDR3' if nt else 'aaSeqCDR3'
    patterns_end_to_end = []
    patterns_any = []
    # Fixed: the original iterated over an undefined name `true_clones`.
    for row in true_clones_df.iterrows():
        patterns_end_to_end += [(row[1][column], tre.compile("^" + row[1][column] + "$"))]
        # patterns_any += [tre.compile(row[1].cdr3)]
        patterns_any += [(row[1][column], tre.compile(row[1][column][3:-3]))]
    self.patterns_end_to_end = patterns_end_to_end
    self.patterns_any = patterns_any
def filter_potential_sines(in_fname, sine_string, sine_header=67, maxerr=19, reverse_complement=False):
    """
    Finds candidate SINEs within maxerr edit distance of the first sine_header
    characters of the SINE sequence.
    To be used for preliminary screening (input for later steps).
    """
    with gene_lib.open_compressed(in_fname, 'rt') as in_file_handle:
        records = SeqIO.parse(in_file_handle, format="fastq")
        # Fixed: the original compiled from an undefined name `sine`.
        re = tre.compile(sine_string[:sine_header], tre.EXTENDED)
        fuzziness = tre.Fuzzyness(maxerr=maxerr)
        for rec in records:
            if reverse_complement:
                cur_seq = rec.seq.reverse_complement()
            else:
                cur_seq = rec.seq
            match = re.search(str(cur_seq), fuzziness)
            if match:
                # match.groups() returns a tuple of (start, end) tuples,
                # e.g. ((2, 78),) for a single match.
                SeqIO.write(rec, sys.stdout, 'fastq')
def findApproxText(
        # Text to search for
        searchText,
        # Text in which to find the searchText
        targetText,
        # Maximum allowable cost for an approximate match. None indicates no
        # maximum cost.
        cost=None):
    # tre.LITERAL specifies that searchText is a literal search string, not
    # a regex.
    pat = tre.compile(searchText, tre.LITERAL)
    fz = tre.Fuzzyness(maxerr=cost) if cost else tre.Fuzzyness()
    match = pat.search(targetText, fz)
    # Guard added: the original dereferenced match without checking for None.
    if not match:
        return None, 0, 0
    # Store the index into the target string of the first and last matched chars.
    beginInTarget, endInTarget = match.groups()[0]
    # TRE picks the first match it finds, even if there is more than one match
    # with identical error. So, manually call it again excluding the found text
    # to check. In addition, make sure this match is unique: it should be 10%
    # better than the next best match.
    matchAgain = pat.search(targetText[:beginInTarget] + targetText[endInTarget:], fz)
    if matchAgain and (matchAgain.cost <= match.cost * 1.1):
        return None, 0, 0
    else:
        return match, beginInTarget, endInTarget
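# Usage sketch for findApproxText above; the inputs are made up. When a second
# hit has a cost within 10% of the best one, the function returns (None, 0, 0);
# here the exact hit is strictly cheaper than the misspelled one, so it wins.
target = "Donald Kunth created TeX. Donald Knuth created TeX."
match, begin, end = findApproxText("Knuth created", target, cost=2)
if match is not None:
    print("unique match %r at [%d:%d), cost %d" % (match[0], begin, end, match.cost))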
def check(self, path):
    """ the real check """
    super(UniventionPackageCheck, self).check(path)

    fz = tre.Fuzzyness(maxerr=2)
    pt = tre.compile(r"\<univention\>", tre.EXTENDED | tre.ICASE)
    for fn in uub.FilteredDirWalkGenerator(path):
        with open(fn, 'r') as fd:
            for lnr, line in enumerate(fd, start=1):
                origline = line
                if UniventionPackageCheck.RE_WHITELINE.match(line):
                    continue
                pos = 0
                while True:
                    m = pt.search(line[pos:], fz)
                    if m:
                        if not UniventionPackageCheck.RE_WHITEWORD.match(m[0]):
                            self.debug('%s:%d: found="%s" origline="%s"' % (fn, lnr, m[0], origline))
                            self.addmsg('0015-2', 'univention is incorrectly spelled: %s' % m[0],
                                        filename=fn, line=lnr)
                        # Advance past the end of this match (offsets are
                        # relative to the sliced line).
                        pos += m.groups()[0][1]
                    else:
                        break
def mismatch_search(haystack, needle, mismatches=1):
    """Return the number of times needle occurs in haystack, allowing mismatches.

    tre doesn't support multiple results out of the box, but it starts from the
    end of the sequence and works to the left, so use each result's indices to
    pare down the haystack and search again.
    """
    haystack = haystack.encode('utf-8')
    needle = needle.encode('utf-8')
    if not using_tre:
        raise RBSError("tre isn't loaded.")
    # Allow only substitutions, no insertions or deletions.
    fz = tre.Fuzzyness(maxerr=mismatches, maxsub=mismatches, maxdel=0, maxins=0)
    needle = ".*(%s).*" % needle
    pt = tre.compile(needle, tre.EXTENDED)
    incidence = 0
    while True:
        m = pt.search(haystack, fz)
        if m:
            # End index of capture group 1 (the needle itself).
            index = m.groups()[1][1] - 1
            incidence += 1
            haystack = haystack[:index]
        else:
            break
    return incidence
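# Usage sketch for mismatch_search above (assumes the module-level `using_tre`
# flag is set and tre is importable). The sequences are made up: GGAGG occurs
# once exactly and once with a single substitution (GGAGC).
hits = mismatch_search("AGGAGGTTTGGAGC", "GGAGG", mismatches=1)
print(hits)  # expected: 2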
def new_SINES_filter_proc_histogram(recs, main_dict, noDuplicate, key_size, fuzziness,
                                    distribution_of_neighbors, length):
    with open_compressed(noDuplicate, "wt") as handle_noDuplicate:
        count = 0
        for rec in recs:
            str_barc = str(rec.seq)
            re = tre.compile(str_barc, tre.EXTENDED)
            barc_parts_list = barcode_parts(rec, key_size)
            match = []
            for rec_part in barc_parts_list:
                is_match_barcodes_hist(main_dict[str(rec_part.seq)], rec.id, re,
                                       fuzziness, match, length)
            count = count + 1
            if count % 100000 == 0:
                print_step(count)
            if len(match) == 1:
                gene_record_write(rec, handle_noDuplicate)
            if len(match) >= length:
                distribution_of_neighbors[length - 1] = distribution_of_neighbors[length - 1] + 1
            else:
                distribution_of_neighbors[len(match)] = distribution_of_neighbors[len(match)] + 1
def test_search():
    """Test searching for matches in a bytestring"""
    pattern = re.compile('a([0-9])a')
    m = pattern.search('bcda7aefga8ah')
    assert m.groups() == ('7',)
    assert m.group(0) == 'a7a'
    assert m.group(1) == '7'
def checkLocations(locations):
    """
    Takes a chunk of locations and checks the tweets for these locations.
    Will be run in parallel.
    """
    output = []
    for l, origL in locations:
        # Match only locations delimited by word boundaries (spaces/tabs/etc.).
        # This eliminates some potential matches, but most of them would be
        # garbage: it trades recall for precision.
        cmpl = tre.compile(r"\b{}\b".format(l), tre.EXTENDED)
        for t, origT in tweets:
            m = cmpl.search(t, fz)
            if m:
                out = {
                    "tweet": origT,
                    "location": origL,
                    "match": m[0],
                    "cost": m.cost,
                    "numDel": m.numdel,
                    "numIns": m.numins,
                    "numSub": m.numsub,
                }
                output.append(out)
    return output
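# Hypothetical driver for checkLocations; in the original, `tweets` and `fz`
# are module-level globals shared by the parallel workers. All data is made up.
fz = tre.Fuzzyness(maxerr=2)
tweets = [("landed in new yrok city", "Landed in New Yrok City!"),
          ("going to boston", "Going to Boston")]
locations = [("new york city", "New York City"), ("boston", "Boston")]
for hit in checkLocations(locations):
    print(hit["location"], "->", hit["tweet"], "| cost:", hit["cost"])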
def new_SINES_filter_proc_graph(q, main_dict, key_size, fuzziness):
    while True:
        recs = q.get()
        G = nx.Graph()  # create an empty graph
        if recs is None:
            q.put(None)
            break
        for rec in recs:
            str_barc = str(rec.seq)
            G.add_node((rec.seq, rec.id))
            re = tre.compile(str_barc, tre.EXTENDED)
            # break the barcode into 4 parts
            barc_parts_list = barcode_parts(rec, key_size)
            # tuple connecting a barcode to the SINE ids within edit distance at most 3
            match = ()
            for rec_part in barc_parts_list:
                match = is_match_barcodes_graph(main_dict[str(rec_part.seq)], rec.id,
                                                re, fuzziness, match)  # build the match
            print(type(match))
            print("this is match: ", match)
            for m in match:
                # create an edge between the barcode and its neighbors
                G.add_edge((rec.seq, rec.id), (m[0], tuple(m[1])))
            q.put((rec, match))
        nx.draw(G)
        plt.show()
    log("Slave process exited")
def test_search_unicode():
    """Test searching for matches in a unicode string"""
    pattern = re.compile(u'ä([0-9])ö')
    m = pattern.search(u'bcdä7öefga8ah')
    assert m.groups() == (u'7',)
    assert m.group(0) == u'ä7ö'
    assert m.group(1) == u'7'
def filter_potential_sines(records, sine_pattern, sine_header=67, maxerr=14):
    re = tre.compile(sine_pattern[:sine_header], tre.EXTENDED)
    fuzziness = tre.Fuzzyness(maxerr=maxerr)
    for rec in records:
        match = re.search(str(rec.seq), fuzziness)
        if match:
            yield rec
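# Illustrative driver for the generator above; assumes Biopython and a FASTQ
# file named reads.fastq, and the SINE prefix string is made up.
from Bio import SeqIO

records = SeqIO.parse("reads.fastq", "fastq")
sine_pattern = "GGGGCTGGAGAGATGGCTCAGTGG"  # hypothetical SINE consensus prefix
for rec in filter_potential_sines(records, sine_pattern, sine_header=20, maxerr=3):
    print(rec.id)  # reads whose sequence fuzzily contains the prefix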
def test_match_groups():
    pattern = re.compile('a([0-9])')
    m = pattern.match('a4ra6')
    assert m is not None
    # Fixed: group values are strings, so ('4',), not (4,).
    assert m.groups() == ('4',)
    assert m.group() == 'a4'
    assert m.group(0) == 'a4'
    assert m.group(1) == '4'
def test_match():
    """Test matching"""
    pattern = re.compile('zat')
    m = pattern.match('zatazata')
    assert m is not None
    assert m.groups() == tuple()
    assert m.group() == 'zat'
    assert m.group(0) == 'zat'
def test_search_approx():
    """Test approximate search"""
    pattern = re.compile(u'abc([0-9])abc')
    m = pattern.approx(u'asdfabc5acbasdfsd', cost_subst=1, max_costs=10,
                       max_subst=10, max=10)
    assert m is not None
    assert m.groups() == ('5',)
    assert m.group(0) == 'abc5acb'
    assert m.cost == 2
    assert m.num == (0, 0, 2)
def test_module_match():
    """Tests whether tre.match() finds the same as a compiled regex"""
    regex = r'a([0-9])a'
    text = 'a3abda6ama7ada'
    m1 = re.compile(regex).match(text)
    m2 = re.match(regex, text)
    assert m1.groups() == m2.groups()
    assert m1.group(0) == m2.group(0)
    assert m1.group(1) == m2.group(1)
def test_finditer():
    """Test whether finditer() returns the proper matches"""
    pattern = re.compile('[0-9]')
    results = pattern.finditer('d3t4 ru7e5!')
    # Check each match and the final exception, rather than using list(results).
    # next(results) replaces the Python-2-only results.next().
    assert next(results) == '3'
    assert next(results) == '4'
    assert next(results) == '7'
    assert next(results) == '5'
    assert_raises(StopIteration, next, results)
def filter_potential_sines_and_locations(in_file_unify, in_file_sine, out_file_with_sine,
                                         out_file_location, sine_header=67, maxerr=14):
    sine = gene_lib.get_sine_forward(in_file_sine)  # "B1.fasta"
    re = tre.compile(sine[:sine_header], tre.EXTENDED)
    fuzziness = tre.Fuzzyness(maxerr=maxerr)
    with open_compressed(in_file_unify, "rt") as handle_read, \
            open_compressed(out_file_with_sine, "wt") as handle_write_sine, \
            open_compressed(out_file_location, "wt") as handle_write_loc:
        records = gene_records_parse(handle_read)
        filter_potential_sines_and_locations_proc(records, re, fuzziness,
                                                  handle_write_sine, handle_write_loc)
def new_SINES_filter_proc_graph(recs, main_dict, key_size, fuzziness, i=0):
    G = nx.Graph()  # create an empty graph
    if i == 0:
        graph_file = 'graphPart'
    else:
        graph_file = 'graphPart' + str(i)
    main_key_len = int(36 / (3 + 1))
    # The loop variable no longer shadows the parameter `i` used above.
    for rec in recs:
        rec_part = list(barcode_wins(rec, main_key_len))[0]
        str_barc_part = str(rec_part.seq)
        sec_dict = main_dict[str_barc_part]
        str_barc = str(rec.seq)
        if sec_dict[str_barc] == rec.id:
            G.add_node((rec.seq, rec.id))
            re = tre.compile(str_barc, tre.EXTENDED)
            # break the barcode into 4 parts
            barc_parts_list = barcode_parts(rec, key_size)
            # tuple connecting a barcode to the SINE ids within edit distance at most 3
            match = ()
            for rec_part in barc_parts_list:
                match = is_match_barcodes_graph(main_dict[str(rec_part.seq)], rec.id,
                                                re, fuzziness, match)  # build the match
            for m in match:
                if str(rec.seq) != str(m[0]):
                    # create an edge between the barcode and its neighbors
                    G.add_edge((rec.seq, rec.id), (m[0], m[1]))
    with open(graph_file, 'wb') as outfile:
        pickle.dump(G, outfile)
    nx.draw(G)
    log("Slave process exited")
def new_SINES_filter_proc_histogram(q, main_dict, key_size, fuzziness):
    while True:
        recs = q.get()
        if recs is None:
            q.put(None)
            break
        for rec in recs:
            str_barc = str(rec.seq)
            re = tre.compile(str_barc, tre.EXTENDED)
            barc_parts_list = barcode_parts(rec, key_size)
            match = []
            for rec_part in barc_parts_list:
                is_match_barcodes_hist(main_dict[str(rec_part.seq)], rec.id, re,
                                       fuzziness, match)
            q.put((rec, match))
    log("Slave process exited")
def showResult(file_centers, in_file_sine, sine_header=67, maxerr=19):
    total = 0  # renamed from `sum`, which shadowed the builtin
    hist = {}
    sine = gene_lib.get_sine_forward(in_file_sine)  # "B1.fasta"
    re = tre.compile(sine[:sine_header], tre.EXTENDED)
    string_sine = sine
    print('original sine', string_sine)
    fuzziness = tre.Fuzzyness(maxerr=maxerr)
    with open(file_centers, "r") as center_file:
        for line in center_file:
            current_line = line.strip()
            # Alternative direction: search the center in the sine instead.
            # re2 = tre.compile(current_line, tre.EXTENDED)
            # match = re2.search(string_sine, fuzziness)
            match = re.search(current_line, fuzziness)
            if not match:
                continue
            start, end = match.groups()[0]
            # Compute the edit distance once instead of three times.
            dist = nltk.edit_distance(string_sine[start:end], current_line)
            hist[dist] = hist.get(dist, 0) + 1
            total += dist
    print(total / 1000)  # average over the (assumed) 1000 centers
    print(sorted(hist.items()))
def __init__(self, pattern, maxErrors=None):
    """
    Initialize an ARE with pattern `pattern`, given as a string.
    The number of allowed errors is maxErrors. If it is None, any number of
    errors is allowed, and the ARE is basically used to count the number of
    errors required to match.
    """
    # Copy constructor
    if type(pattern) == ARE:
        self.maxErrors = pattern.maxErrors
        self.pattern = pattern.pattern
    # Normal constructor
    else:
        # self.pattern = unidecode(pattern)
        self.pattern = pattern
        self.maxErrors = maxErrors
    self.fuzzyness = tre.Fuzzyness()
    if self.maxErrors is not None:
        self.fuzzyness.maxerr = self.maxErrors
        # self.fuzzyness.maxcost = self.maxErrors
        self.fuzzyness.subcost = 1.5
    self.re = tre.compile(pattern, tre.EXTENDED)
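# Usage sketch: only __init__ of the ARE class is shown above, so this uses
# just the attributes it sets (.re and .fuzzyness). Inputs are made up.
are = ARE("Knuth", maxErrors=2)
m = are.re.search("Donald Kunth", are.fuzzyness)
if m:
    print(m[0], m.cost)  # matched text and the cost needed to match it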
def filter_potential_sines_and_locations(in_file_unify, in_file_sine, out_file_with_sine,
                                         out_file_location, sine_header=67, maxerr=14):
    sine = gene_lib.get_sine_forward(in_file_sine)  # "B1.fasta"
    re = tre.compile(sine[:sine_header], tre.EXTENDED)
    fuzziness = tre.Fuzzyness(maxerr=maxerr)
    with open_compressed(in_file_unify, "rt") as handle_read, \
            open_compressed(out_file_with_sine, "wt") as handle_write_sine, \
            open_compressed(out_file_location, "wt") as handle_write_loc:
        records = gene_records_parse(handle_read)
        for rec in tqdm(records, miniters=100):
            match = re.search(str(rec.seq), fuzziness)
            if match:
                sine_location = match.groups()
                gene_record_write(rec, handle_write_sine, 'fasta')
                handle_write_loc.write(",".join([str(i) for i in sine_location[0]]) + "\n")
import tre

fz = tre.Fuzzyness(maxerr=3)
print(fz)
pt = tre.compile("Beulan Lake", tre.EXTENDED)

data = """
In addition to fundamental contributions in several branches of theoretical
computer science, Beulahh Lake is the creator of the TeX computer typesetting
system, the related METAFONT font definition language and rendering system,
and the Computer Modern family of typefaces.
"""

m = pt.search(data, fz)
if m:
    print(m.groups())
    print(m[0])
def search_sines(sine_f, r1_f, override=0, upper_mut_dist=30, step_print=10000,
                 nlines=500000, sine_l=80):
    print('override =', override)
    sine_set = []
    stats = collections.Counter()
    global bar_codes
    bar_codes = {}
    global detailed_stats
    detailed_stats = collections.Counter()
    global distances_from_combined_regexp
    distances_from_combined_regexp = {}
    matcher = difflib.SequenceMatcher()
    for sine_record in SeqIO.parse(sine_f, "fasta"):
        cur_seq = Seq(str(sine_record.seq)[:sine_l], IUPAC.IUPACAmbiguousDNA())
        cur_seq_rc = cur_seq.reverse_complement()
        sine_set.append(str(cur_seq))
        sine_set.append(str(cur_seq_rc))
        print(cur_seq, cur_seq_rc, '\n======================')
    complete_regexp = '|'.join(sine_set)
    p = tre.compile(complete_regexp, tre.EXTENDED)

    if override == 1:
        bases = ['A', 'C', 'G', 'T']
        ind_list = [random.randrange(4) for i in range(sine_l)]
        r_sine = ''.join([bases[ind_list[i]] for i in range(sine_l)])
        r_sine_rc = ''.join([bases[3 - ind_list[i]] for i in range(sine_l)])
        sine_set = [r_sine, r_sine_rc]
        complete_regexp = '|'.join(sine_set)
        p = tre.compile(complete_regexp, tre.EXTENDED)

    # Also specifies the shift range
    if override > 1:
        if override > 2:
            d = override - 1  # random.randrange(2, override)
            print('skipping ', d)
            for (i, cur_seq) in enumerate(r1_f):
                if i == d:
                    break
        sine_set = []
        for (i, s) in enumerate(r1_f):
            cur_seq = Seq(s[:sine_l], IUPAC.IUPACAmbiguousDNA())
            cur_seq_rc = cur_seq.reverse_complement()
            sine_set.append(str(cur_seq))
            sine_set.append(str(cur_seq_rc))
            if i == 2:
                break
        complete_regexp = '|'.join(sine_set)
        p = tre.compile(complete_regexp, tre.EXTENDED)

    total = 0
    cnt = 0
    start_time = time()
    print('sequences = ')
    bar_code_len = 60
    for cur_seq in r1_f:
        total += 1
        m = p.search(cur_seq, tre.Fuzzyness(maxerr=upper_mut_dist))
        if m:
            res = m.group(0)
            d = m.cost
            # Filter out strings that were cut off. Approximate by max-length
            # matches; 10 is arbitrary, not very small.
            if (m.groups()[0][1] < len(cur_seq) - 10) and (m.groups()[0][0] > 40):
                cnt += 1
                stats[d] += 1
                bar_code = cur_seq[m.groups()[0][0] - 40:m.groups()[0][0]]
                if bar_code in bar_codes:
                    bar_codes[bar_code] += 1
                else:
                    bar_codes[bar_code] = 1
                detailed_stats[res] += 1
                distances_from_combined_regexp[res] = d
        if total % step_print == 0 or total == nlines:
            print('distances for first', total, 'segments\n')
            print('========================')
            print('time elapsed', (time() - start_time) / 60.0, 'minutes')
            for k in sorted(stats):
                print('edit distance =', k, 'matches =', stats[k], '/', cnt)
        if total == nlines:
            break
import tre

fz = tre.Fuzzyness(maxerr=3)
print(fz)
pt = tre.compile("San Francisco", tre.EXTENDED)

data = """
In addition to fundamental contributions in several branches of theoretical
computer science, Donnald Erwin Kuth is the creator of the TeX computer
typesetting system, the related METAFONT font definition language and
rendering system, and the Computer Modern family of typefaces in San
Francisco hey bois.
"""

m = pt.search(data, fz)
if m:
    print(m.groups())
    print(m[0])
import tre

fz = tre.Fuzzyness(maxcost=3)
print(fz)
pt = tre.compile("(foo)(bar)", tre.EXTENDED)

m = pt.match("zoobag", fz)
if m:
    print(m.groups())
    print(m[2])
def search_sines(sines, r1_f, override=0, upper_mut_dist=20, step_print=1000000,
                 nlines=100000000, sine_l=70):
    print('override =', override)
    sine_set = []
    stats = collections.Counter()
    global bar_codes
    bar_codes = {}
    global detailed_stats
    detailed_stats = collections.Counter()
    global distances_from_combined_regexp
    distances_from_combined_regexp = {}
    complete_regexp = '|'.join([sine[:sine_l] for sine in sines])
    p = tre.compile(complete_regexp, tre.EXTENDED)

    if override == 1:
        bases = ['A', 'C', 'G', 'T']
        ind_list = [random.randrange(4) for i in range(sine_l)]
        # Fixed: the original indexed an undefined name `ipnd_list`.
        r_sine = ''.join([bases[ind_list[i]] for i in range(sine_l)])
        r_sine_rc = ''.join([bases[3 - ind_list[i]] for i in range(sine_l)])
        sine_set = [r_sine, r_sine_rc]
        complete_regexp = '|'.join(sine_set)
        p = tre.compile(complete_regexp, tre.EXTENDED)

    # Also specifies the shift range
    if override > 1:
        if override > 2:
            d = override - 1  # random.randrange(2, override)
            print('skipping ', d)
            for (i, cur_seq) in enumerate(r1_f):
                if i == d:
                    break
        sine_set = []
        for (i, s) in enumerate(r1_f):
            cur_seq = Seq(s[:sine_l], IUPAC.IUPACAmbiguousDNA())
            cur_seq_rc = cur_seq.reverse_complement()
            sine_set.append(str(cur_seq))
            sine_set.append(str(cur_seq_rc))
            if i == 2:
                break
        complete_regexp = '|'.join(sine_set)
        p = tre.compile(complete_regexp, tre.EXTENDED)

    total = 0
    cnt = 0
    start_time = time()
    print('sequences = ')
    # bar_code_len = 60
    for cur_seq in r1_f:
        total += 1
        m = p.search(cur_seq, tre.Fuzzyness(maxerr=sine_l - 10))
        if m:
            res = m.group(0)
            d = m.cost
            # Filter out strings that were cut off. Approximate by max-length
            # matches; 10 is arbitrary, not very small.
            # Barcodes are not in place here.
            stats[d] += 1
            bar_code_min_len = 23
            # if (m.groups()[0][1] < len(cur_seq) - 5) and (d <= upper_mut_dist):
            if (m.groups()[0][0] >= bar_code_min_len) and (d <= upper_mut_dist):
                cnt += 1
                detailed_stats[res] += 1
                bar_code = cur_seq[m.groups()[0][0] - bar_code_min_len:m.groups()[0][0]]
                bar_codes.setdefault(bar_code, 0)
                bar_codes[bar_code] += 1
                # distances_from_combined_regexp[res] = d
        if (total % step_print == 0) or (total == nlines):
            print('stats for first', total, 'segments\n')
            print('========================')
            print('time elapsed', (time() - start_time) / 60.0, 'minutes')
            for k in sorted(stats):
                print('edit distance =', k, 'matches =', stats[k], '/', cnt)
            # pprint.pprint(collections.Counter(detailed_stats.values()))
        if total == nlines:
            return bar_codes
import tre

fz = tre.Fuzzyness(maxerr=3)
print(fz)
pt = tre.compile("Don(ald( Ervin)?)? Knuth", tre.EXTENDED)

data = """
In addition to fundamental contributions in several branches of theoretical
computer science, Donnald Erwin Kuth is the creator of the TeX computer
typesetting system, the related METAFONT font definition language and
rendering system, and the Computer Modern family of typefaces.
"""

m = pt.search(data, fz)
print(dir(pt))
if m:
    print(m.groups())
    print(m[0])
    print(m[1])
def test_valid_compile():
    """Tests for compilation of patterns which should be ok"""
    pattern = tre.compile("a")
import tre

fz = tre.Fuzzyness(maxerr=3)
print(fz)
pt = tre.compile("Don(ald( Ervin)?)? Knuth", tre.EXTENDED)

data = """
In addition to fundamental contributions in several branches of theoretical
computer science, Donnald Erwin Kuth is the creator of the TeX computer
typesetting system, the related METAFONT font definition language and
rendering system, and the Computer Modern family of typefaces.
"""

m = pt.search(data, fz)
if m:
    print(m.groups())
    print(m[0])
def test_search_nomatch():
    """Test whether a string with no match returns None"""
    pattern = re.compile("Doesn't exist")
    assert pattern.search('In this text') is None
def test_match_nomatch():
    """Test matching with strings that don't match"""
    pattern = re.compile('a')
    m = pattern.match('zzzzaaaa')
    assert m is None
elif len(words) < 3:
    print("Found invalid line in primer file", line.strip())
    continue

primername = words[0]
primer1seq = replace_ambiguity_codes(words[1].upper())
primer2seq = replace_ambiguity_codes(words[2].upper())
if options.rcreverse:
    primer2seq = revcomp(primer2seq)
# `in` replaces the Python-2-only dict.has_key().
if primername in patterns:
    print("Error: Two of your regions have the same name:", primername)
    print("Skipping...")
    continue
patterns[primername] = [[], []]
patterns[primername][0].append(tre.compile(primer1seq, tre.EXTENDED))
patterns[primername][0].append(tre.compile(revcomp(primer1seq), tre.EXTENDED))
patterns[primername][1].append(tre.compile(primer2seq, tre.EXTENDED))
patterns[primername][1].append(tre.compile(revcomp(primer2seq), tre.EXTENDED))
if len(words) >= 5:
    try:
        min_product_len = int(words[4])
    except ValueError:
        print("Invalid minimum product length for region", primername)
        min_product_len = 0
if len(words) >= 4:
    try:
        max_product_len = int(words[3])
def basic_scan(image_name):
    full_image_path = images_location + image_name

    # First pass: find lines that end in a price.
    tre_fuzzyness = tre.Fuzzyness(delcost=3, inscost=1, subcost=2, maxcost=2)
    tre_matcher = tre.compile(ere_end_of_line_price, tre.EXTENDED)

    os.system(os.path.join(parser_location, 'ocr.sh') + ' ' + full_image_path)

    raw_tab_data = []
    tab_items = []
    with open(full_image_path + '1.txt', 'r') as ocr_file:
        for line in ocr_file.read().splitlines():
            debug(line)
            tre_match = tre_matcher.search(line, tre_fuzzyness)
            if tre_match:
                tmp_description = re.sub(r'[\s:]*' + re.escape(tre_match.group(0)), '', line).lower()
                tmp_value = tre_match.group(0).strip()
                if len(tmp_description) > 2:
                    raw_tab_data.append({'description': tmp_description, 'value': tmp_value})

    # Second pass: classify each priced line against the configured parsers.
    tre_fuzzyness = tre.Fuzzyness(maxerr=3)
    tab_meta = {}
    cut_off_meta = 0
    for raw_item in raw_tab_data:
        raw_item_description = raw_item['description']
        raw_item_value = raw_item['value']
        matches_to_compare = []
        for parser_key in config['mid_parsers']:
            tre_matcher = tre.compile(config['mid_parsers'][parser_key]['ere'], tre.EXTENDED)
            tre_match = tre_matcher.search(name_fix(raw_item_description), tre_fuzzyness)
            debug('xxxxxxxxxxxxxxxxxxxxxxxxxx')
            debug(name_fix(raw_item_description) + ' XXX ' +
                  config['mid_parsers'][parser_key]['ere'] + ' XXX ' + raw_item_value)
            if tre_match:
                debug('match')
                matches_to_compare.append((tre_match, config['mid_parsers'][parser_key]['string']))
        if matches_to_compare:
            # If there were matches, keep the cheapest one.
            # (`best` renamed from `min`, which shadowed the builtin.)
            cut_off_meta += 1
            best = matches_to_compare[0]
            for match in matches_to_compare:
                if match[0].cost < best[0].cost:
                    best = match
            tab_meta[best[1]] = raw_item_value
        else:
            debug('SHOULD HAVE BEEN CUT OFF')
            if cut_off_meta < 1:
                tab_items.append(raw_item)

    tab = {'tab_items': tab_items, 'tab_meta': tab_meta}
    print(tab)
    x = analyze_tab(tab)
    print(x)
    return x
def filter_potential_sines_and_locations(in_file_unify, in_file_sine, out_file_with_sine,
                                         out_file_location, sine_header=67, maxerr=14):
    sine = gene_lib.get_sine_forward(in_file_sine)  # "B1.fasta"
    re = tre.compile(sine[:sine_header], tre.EXTENDED)
    fuzziness = tre.Fuzzyness(maxerr=maxerr)

    # Create slave processes
    procs = []
    for _ in range(multiprocessing.cpu_count() - 3):
        # Create a communication queue between this process and the slave process
        q = GeneDQueue()
        # Create and start the slave process
        p = Process(target=filter_potential_sines_and_locations_proc, args=(q, re, fuzziness))
        p.start()
        procs.append({'p': p, 'q': q, 'batch': [], 'write_i': 0})

    with open_any(in_file_unify, "rt") as handle_read, \
            open_any(out_file_with_sine, "wt") as handle_write_sine, \
            open_any(out_file_location, "wt") as handle_write_loc:
        records = gene_records_parse(handle_read)
        rec_i = 0
        for rec in tqdm(records, miniters=100):
            # Simple round-robin between the slave processes
            proc = procs[rec_i % len(procs)]
            # Add the new record to the local batch array of the slave process
            proc['batch'].append(rec)
            if len(proc['batch']) >= 20:
                # Get found potential SINEs from the slave process queue.
                #
                # Optimization: don't check the slave queue every iteration,
                # as the check slows things down; moreover, we won't get a
                # potential SINE for every record.
                if proc['write_i'] > 3:
                    filter_potential_sines_and_locations_write(proc['q'], handle_write_sine,
                                                               handle_write_loc)
                    proc['write_i'] = 0
                else:
                    proc['write_i'] += 1
                # Put the batch of new records into the slave process queue
                proc['q'].put(proc['batch'])
                # Reset the local batch of the slave process
                proc['batch'] = []
            # Uncomment for testing a small amount of records
            # if rec_i == 100000:
            #     break
            rec_i += 1

        # Clean up slave processes
        for proc in procs:
            # Get found potential SINEs from the slave process queue, before the last batch
            filter_potential_sines_and_locations_write(proc['q'], handle_write_sine,
                                                       handle_write_loc)
            # Put the last batch, if available
            if len(proc['batch']):
                proc['q'].put(proc['batch'])
                proc['batch'] = []
            # Make the slave process terminate
            proc['q'].put(None)
            # Wait for termination
            proc['p'].join()
            # Get found potential SINEs from the slave process queue, one very last time
            filter_potential_sines_and_locations_write(proc['q'], handle_write_sine,
                                                       handle_write_loc)
def merged_paired_ends(records1, records2):
    tot_good = 0
    tot_great = 0
    tot = 0
    for (rec1, rec2) in zip(records1, records2):
        tot += 1
        str1 = str(rec1.seq)
        str2 = str(rec2.seq.reverse_complement())
        end1 = str1[-common_req:]
        re = tre.compile(end1, tre.EXTENDED)
        res_seq = None
        # We expect small errors here
        match = re.search(str2, tre.Fuzzyness(maxerr=init_err))
        if match:
            tot_good += 1
            match_loc = match.groups()[0][0]
            to_search_len = match_loc + common_req
            fuzzyness = max(tot_err, ceil(0.1 * to_search_len))
            re = tre.compile(str1[-to_search_len:], tre.EXTENDED)
            match_tot = re.search(str2, tre.Fuzzyness(maxerr=fuzzyness))
            if match_tot:
                tot_great += 1
                # An arbitrary decision: take the common string from r2
                res_str = str1[:-to_search_len] + str2
                # TODO: preserve qualities
                res_seq = SeqRecord(Seq(res_str), id=rec1.id, name=rec1.name,
                                    description=rec1.description,
                                    letter_annotations={"phred_quality": [30 for i in range(len(res_str))]})
                if tot_great % step == 0:
                    log('nicely matched ', str1, '\n', str2, to_search_len,
                        match_tot.group(0), match.group(0), match_tot.cost, match.cost)
                yield res_seq
                continue
        # No reliable overlap: concatenate the reads with an N spacer.
        res_str = str1 + ('N' * padding) + str2
        res_seq = SeqRecord(Seq(res_str), id=rec1.id, name=rec1.name,
                            description=rec1.description,
                            letter_annotations={"phred_quality": [30 for i in range(len(res_str))]})
        if tot % step == 0:
            log(tot, tot_good, tot_great)
        yield res_seq
def test_compile_twice():
    """Tests whether a pattern can be compiled twice"""
    old = tre.compile("a")
    new = tre.compile(old)
    assert old is new
    (r'^1{0,2}22+1+22+1{0,2}$',  # eight
     r'^1{0,2}2{0,2}333+2{0,2}1{0,2}$',
     r'^/(.../)(.../){0,3}(X_X/)+(.../)*(.X_/|_X./|X__/|__X/)+' +
     r'(.../)*(X_X/)+',
     r'(XXX/)(XXX/)(XXX/)'),
    (r'^1{0,2}22+3?2?12?11+$',  # nine
     r'^1{0,2}(2+3+2*|222+)1+$',
     r'^/(.../)+(X_X/)+(_XX/|XXX/)(_XX/|XXX/)?(_.X/)+(_X./)*$',
     r'^/(X._/|.X_/)(X._/|.X_/)+(XX./|.XX/)+$')
]

re_compiled = []
for row in regexps:
    re_compiled.append((tre.compile(row[0], tre.EXTENDED),
                        tre.compile(row[1], tre.EXTENDED),
                        tre.compile(row[2], tre.EXTENDED),
                        tre.compile(row[3], tre.EXTENDED)))

# limits: for each digit (min_len_num_hcrossings, min_len_num_vcrossings,
#         max_num_hcrossings, max_num_vcrossings,
#         min_num_hcrossings, min_num_vcrossings,
#         min_max_num_hcrossings, min_max_num_vcrossings)
limits = [
    (4, 4, 3, 3, 1, 1, 2, 2),  # zero
    (4, 1, 2, 2, 1, 1, 1, 1),  # one
    (4, 4, 3, 3, 1, 1, 1, 2),  # two
    (4, 4, 2, 4, 1, 1, 1, 3),  # three
    (4, 4, 3, 2, 1, 1, 2, 1),  # four
    (4, 4, 2, 3, 1, 1, 1, 3),  # five
    (4, 4, 2, 3, 1, 1, 2, 2),  # six
    (4, 3, 2, 3, 1, 1, 1, 2),  # seven
def test_findall():
    """Test whether findall() returns the proper list of matches"""
    pattern = re.compile('[0-9]')
    results = pattern.findall('d3t4 ru7e5!')
    assert results == ['3', '4', '7', '5']
     r'1+2(2|1)+11?',
     r'^/(X../|..X/){0,3}(_X./|_.X/)+(.XX/)(.XX/)?(_X./|_.X/)' +
     r'(_X./|_.X/)+(.X_/|X._/)*$',
     r'^/(_X./)*(X._/)+(.XX/)+.*$'),
    (r'^1{0,2}22+1+22+1{0,2}$',  # eight
     r'^1{0,2}2{0,2}333+2{0,2}1{0,2}$',
     r'^/(.../)(.../){0,3}(X_X/)+(.../)*(.X_/|_X./|X__/|__X/)+' +
     r'(.../)*(X_X/)+',
     r'(XXX/)(XXX/)(XXX/)'),
    (r'^1{0,2}22+3?2?12?11+$',  # nine
     r'^1{0,2}(2+3+2*|222+)1+$',
     r'^/(.../)+(X_X/)+(_XX/|XXX/)(_XX/|XXX/)?(_.X/)+(_X./)*$',
     r'^/(X._/|.X_/)(X._/|.X_/)+(XX./|.XX/)+$')]

re_compiled = []
for row in regexps:
    re_compiled.append((tre.compile(row[0], tre.EXTENDED),
                        tre.compile(row[1], tre.EXTENDED),
                        tre.compile(row[2], tre.EXTENDED),
                        tre.compile(row[3], tre.EXTENDED)))

# limits: for each digit (min_len_num_hcrossings, min_len_num_vcrossings,
#         max_num_hcrossings, max_num_vcrossings,
#         min_num_hcrossings, min_num_vcrossings,
#         min_max_num_hcrossings, min_max_num_vcrossings)
limits = [(4, 4, 3, 3, 1, 1, 2, 2),  # zero
          (4, 1, 2, 2, 1, 1, 1, 1),  # one
          (4, 4, 3, 3, 1, 1, 1, 2),  # two
          (4, 4, 2, 4, 1, 1, 1, 3),  # three
          (4, 4, 3, 2, 1, 1, 2, 1),  # four
          (4, 4, 2, 3, 1, 1, 1, 3),  # five
          (4, 4, 2, 3, 1, 1, 2, 2),  # six
def search_sines2(sine, r1_f, frac_bound, pref_bound, start_line=0, step_print=1000000,
                  nlines=200000000, thresh=9, pref=60):
    global stats
    stats = {}
    print('step ', step_print, nlines)
    sine = sine[:pref]
    matcher = difflib.SequenceMatcher(isjunk=None, a=sine, b='', autojunk=False)
    total = 0
    cnt = 0
    start_time = time()
    print('candidates for sine = ')
    if start_line > 0:
        for (i, cur_seq) in enumerate(r1_f):
            if i == start_line - 1:
                break
    for cur_seq in r1_f:
        if total % step_print == 0 or total == nlines:
            print('distances for first', total, 'segments\n')
            print('========================')
            print('time elapsed', (time() - start_time) / 60.0, 'minutes')
            for k in sorted(stats):
                n = sum([i for i in stats[k][1].values()])
                print('longest common =', k, 'num matches =', n, stats[k][0], '/', cnt)
                if (total >= nlines) and (k >= thresh):
                    for (i, frac) in enumerate(sorted(stats[k][1])):
                        print(k, 'Fraction = ', frac)
                        if i == 20:
                            break
        if total == nlines:
            break
        total += 1
        matcher.set_seq2(cur_seq)
        res = matcher.find_longest_match(0, len(sine), 0, len(cur_seq))
        com = res[2]
        complete_regexp = sine[:res[0]] + '$'
        p = tre.compile(complete_regexp, tre.EXTENDED)
        # int(frac_bound * res[0]) is better perhaps, but want to trivialize it for now
        max_fuzz = res[0]
        m = p.search(cur_seq[:res[1]],
                     tre.Fuzzyness(maxcost=max_fuzz,
                                   delcost=int(1 / 4.0 * max_fuzz) + 1,
                                   inscost=int(1 / 4.0 * max_fuzz) + 1))
        if m is None:
            continue
        start_p = m.groups()[0][0]
        d = m.cost
        # This is the fraction of edit distance out of all.
        # In most cases, this is the right edit distance for the overall prefix.
        if (res[0] + com) == 0:
            print('How peculiar!', 'com =', com, 'res[0] = ', res[0], m.cost)
            continue
        frac = Fraction(d, res[0] + com)
        stats.setdefault(com, [0, collections.Counter()])
        stats[com][0] += 1
        try:
            if (start_p >= pref_bound) and Fraction(d, res[0]) <= frac_bound:
                stats[com][1][frac] += 1
                cnt += 1
        except ZeroDivisionError:
            pass
#!/usr/bin/env python
import difflib
import gzip
import sys

import tre

# define barcode format; build regex objects for approximate string matching
linker1 = "CCTAGTCGCGTAGAC"
l1reg = tre.compile(linker1)
linker1Length = len(linker1)

# define Fuzzyness for tre matching
fz = tre.Fuzzyness(maxins=0, maxdel=0, maxsub=1)


# pull in a read (four FASTQ lines) for parsing
def readread(s):
    return [s.readline().rstrip('\n'),
            s.readline().rstrip('\n'),
            s.readline().rstrip('\n'),
            s.readline().rstrip('\n')]


def diff_letters(a, b):
    return sum(a[i] != b[i] for i in range(len(a)))


def parseRead(s, o):