def _match_by_edit_distance(full_text, text_to_match):
    # Normalise PTB-style bracket tokens back to literal brackets.
    text_to_match = text_to_match.replace("-LRB-", "(").replace("-RRB-", ")")
    text_to_match = text_to_match.replace("-LCB-", "{").replace("-RCB-", "}")
    text_to_match = re.sub(r'\[\\\]\\\)\]$', ')', text_to_match)

    try:
        end_point = (text_to_match.index(" ") if " " in text_to_match
                     else len(text_to_match))
        potential_matches = [full_text[m.start():(m.start() + len(text_to_match) + 1)]
                             for m in re.finditer(re.escape(text_to_match[0:end_point]),
                                                  full_text, re.U | re.I)]
    except Exception:
        # Debugging fallback: dump the inputs and abort.
        import sys
        print(full_text)
        print()
        print(text_to_match)
        sys.exit(1)

    if len(potential_matches) == 0:
        # Fall back to matching on the first character only.
        potential_matches = [full_text[m.start():(m.start() + len(text_to_match) + 1)]
                             for m in re.finditer(re.escape(text_to_match[0]),
                                                  full_text, re.U)]

    if len(potential_matches) == 0:
        text_to_match = text_to_match.replace("(", "[")
        potential_matches = [full_text[m.start():(m.start() + len(text_to_match) + 1)]
                             for m in re.finditer(re.escape(text_to_match[0]),
                                                  full_text, re.U)]

    potential_matches = [(p[0:p.rindex(text_to_match[-1]) + 1]
                          if text_to_match[-1] in p and len(p) > len(text_to_match)
                          else p)
                         for p in potential_matches]

    if len(potential_matches) == 0:
        # No idea why this would ever happen, but it does
        return text_to_match

    match_with_lowest_edit_distance = ""
    lowest_edit_distance = -1
    for match in potential_matches:
        e_d = edit_distance(match, text_to_match)
        if lowest_edit_distance == -1 or e_d <= lowest_edit_distance:
            lowest_edit_distance = e_d
            match_with_lowest_edit_distance = match

    result = match_with_lowest_edit_distance.strip()

    # Trim or extend the result so it ends on the same character as text_to_match.
    if text_to_match[-1] in result:
        while result[-1] != text_to_match[-1]:
            result = result[0:-1]
    elif text_to_match[-1] == '"' and re.search(r'["”\u201d]', result):
        while result[-1] not in ['"', '”', "\u201d"]:
            result = result[0:-1]
    elif text_to_match[-1] not in [']', '}', ')'] and text_to_match[-2:] != "..":
        while result[-1] != text_to_match[-1]:
            result += full_text[full_text.index(result) + len(result)]

    return result
def _sustituirReg(self, aux, regOrig, regDest):
    # The regex module is needed here: it is more powerful than re and can scan
    # in both directions (forward, then right-to-left with the (?r) flag).
    regAux = "^" + regOrig + "$"
    for m in regex.finditer(regAux, aux, overlapped=True):
        if self.parentesisCoherentes1(m.groupdict()):
            aux2 = regDest.format(**agregaCTX(m.groupdict())) + "\n"
            return aux2
    regAux = "(?r)^" + regOrig + "$"
    for m in regex.finditer(regAux, aux, overlapped=True):
        if self.parentesisCoherentes1(m.groupdict()):
            aux2 = regDest.format(**agregaCTX(m.groupdict())) + "\n"
            return aux2
    return aux
def parse_ind_vars(self): """Define values of independent variables by parsing first example of form: [var name] [value 0] [value 1] ... [value n] [blank] *or* [text] :return: """ selem = np.array([[1, 0], [1, 1]]) # some weird bug with np.pad and string dtype s_type = 'S%d' % (max([len(x) for y in self.sheet for x in y]) + 10) xs_values = np.zeros(np.array(self.sheet.shape) + 1, dtype=s_type) xs_values[:-1, :-1] = self.sheet mask = (xs_values[:, :2] != '').astype(int) mask[:, 1] *= 2 mask_string = ''.join(['ABCD'[i] for i in mask.sum(axis=1)]) ind_vars = {} for x in re.finditer('(DC+)[ABD]', mask_string): name = xs_values[x.span()[0], 0] values = xs_values[x.span()[0]:x.span()[1] - 1, 1] ind_vars[name] = list(values) self.ind_vars.update(ind_vars)
def _match(self, text): matches = [] if self.mapping: seq = self.map.keys() else: seq = self.regex_sequence for r in seq: for matchobj in re.finditer(r, text, overlapped=True): groups = (matchobj.groupdict()) result = { 'start': matchobj.start(), 'end': matchobj.end(), 'regex': r, 'groups':groups } if self.mapping: for k, v in self.map[r].items(): if k not in result.keys(): result[k] = v matches.append( result ) return matches
def prune_by_precision(self, min_precision, text_data_pairs): """ Removes patterns from the model that don't reach a minimum precision :param float min_precision: the minimum precision required of a pattern when applied to the given data :param collections.Iterable text_data_pairs: an iterable of `(text, data)` pairs where `text` is a string and `data` is an anafora.AnaforaData object """ pattern_scores = collections.defaultdict(lambda: anafora.evaluate.Scores()) for text, data in text_data_pairs: # collect the spans of each type of reference annotation reference_type_spans_map = collections.defaultdict(lambda: set()) for annotation in data.annotations: reference_type_spans_map[annotation.type].add(annotation.spans) # make predictions with each pattern in the model for pattern in self.regex_type_attributes_map: predicted_spans = {((m.start(), m.end()),) for m in regex.finditer(pattern, text)} if predicted_spans: predicted_type, _ = self.regex_type_attributes_map[pattern] # update the scores for this pattern pattern_scores[pattern].add(reference_type_spans_map[predicted_type], predicted_spans) # delete any pattern with a precision lower than the minimum requested for pattern, scores in pattern_scores.items(): if scores.precision() < min_precision: del self.regex_type_attributes_map[pattern]
def parse(self, data, regex = None, encoding = "utf-8"): regex = regex or self.master is_unicode = appier.legacy.is_unicode(data) if not is_unicode: data = data.decode(encoding) nodes = [] matches = regex.finditer(data) current = 0 for match in matches: name = match.lastgroup parts = match.groupdict() start, end = match.span() if start > current: value = data[current:start] value = value.replace("\r", "") value = value.replace("\n", " ") if value: nodes.append(value) method = getattr(self, "parse_" + name) node = method(parts) nodes.append(node) current = end remaining = data[current:] remaining = remaining.replace("\r", "") remaining = remaining.replace("\n", " ") if remaining: nodes.append(remaining) return nodes
def tokenize(text):
    """
    Yield tokens.

    Args:
        text (str): The original text.

    Yields:
        dict: The next token.
    """
    stem = SnowballStemmer('norwegian').stem

    # Raw string so that \p{L} (any Unicode letter) reaches the regex engine intact.
    tokens = regex.finditer(r'\p{L}+', text.lower())

    for offset, match in enumerate(tokens):
        # Get the raw token.
        unstemmed = match.group(0)

        # Emit the token.
        yield {
            'stemmed': stem(unstemmed),
            'unstemmed': unstemmed,
            'offset': offset,
        }
def determine_match(commentary_name, commentary_regex): issues = 0 full_pattern = u'{} (?>[@!/*][^ ]* )*(?P<dh>[^ ]+)'.format(commentary_regex) full_mechaber = Root('../../Even_HaEzer.xml').get_base_text() error_counter = Counter() for siman_num, siman in enumerate(full_mechaber.get_simanim()): for seif_num, seif in enumerate(siman.get_child()): matches = regex.finditer(full_pattern, unicode(seif)) for regex_match in matches: c_ref = Ref(u'{} {}:{}'.format(commentary_name, siman_num+1, getGematria(regex_match.group('ref')))) try: c_text = c_ref.text('he').text.split()[0] except IndexError: continue c_text = re.sub(u'[^\u05d0-\u05ea]', u'', c_text) dh_text = re.sub(u'[^\u05d0-\u05ea]', u'',regex_match.group('dh')) ratio = fuzz.ratio(dh_text, c_text) if ratio < 75.0: issues += 1 print u"Potential mismatch:" print u"Shulchan Arukh, Even HaEzer {}:{} {}".format(siman_num+1, seif_num+1, dh_text) print u"{} {}".format(c_ref.normal(), c_text) print u"Score: {}".format(ratio) error_counter[(dh_text, c_text)] += 1 print u"Total issues: {}".format(issues) return error_counter
def __init__(self, names, features={}, ftstr='', weights=None):
    """Construct a `Segment` object

    Args:
        names (list): ordered list of feature names
        features (dict): name-value pairs for specified features
        ftstr (unicode): a string, each /(+|0|-)\w+/ sequence of which is
            interpreted as a feature specification
        weights (list): ordered list of feature weights/saliences
    """
    self.n2s = {-1: '-', 0: '0', 1: '+'}
    self.s2n = {k: v for (v, k) in self.n2s.items()}
    self.names = names
    # Set the feature specification: explicit `features` first, 0 for the rest.
    self.data = {}
    for name in names:
        if name in features:
            self.data[name] = features[name]
        else:
            self.data[name] = 0
    # Override with any specifications given in the feature string.
    for m in re.finditer(r'(\+|0|-)(\w+)', ftstr):
        v, k = m.groups()
        self.data[k] = self.s2n[v]
    if weights:
        self.weights = weights
    else:
        self.weights = [1 for _ in names]
def search_strand(pattern, sequence_to_scan, strand=1): ''' take a sequence pattern (element) and find occurrences of that on the provided, larger 5'-->3' sequence. Assumes strand is first unless provided. Tracks the start and end points of each occurrence, returning a list of that information where each element is a tuple of the start and end points along with the strand. Works with overlapped sequences because now "regex.findall and regex.finditer support an ‘overlapped’ flag which permits overlapped matches." , see https://pypi.python.org/pypi/regex/2018.02.21 based on https://www.biostars.org/p/209383/ (specifically steve's answer) ''' occurrences = [] for match in regex.finditer( pattern.upper(), str(sequence_to_scan.upper()),overlapped=True): if strand == 1: start_pos = match.start() + 1 end_pos = match.end() + 1 else: start_pos = (len(sequence_to_scan) - match.start() ) + 1 end_pos = (len(sequence_to_scan) - match.end() ) + 1 # print (start_pos, '\t', end_pos, '\t',strand) # for debugging occurrences.append((start_pos, end_pos,strand)) return occurrences
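# Usage sketch for search_strand above (illustrative sequence, not from the source):
# with overlapped=True the two ATA sites sharing a base are both reported, which a
# plain non-overlapped scan would miss. Coordinates follow the +1 offsets used above.
example_hits = search_strand("ATA", "GATATATGCATA", strand=1)
print(example_hits)   # expected: [(2, 5, 1), (4, 7, 1), (10, 13, 1)]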
def tokenize(self, value): """ Perform the tokenizing. Required Argument value -- The unicode string to tokenize. """ t = Token() # The token instance we will reuse if not self._gaps: # The default: expression matches are used as tokens for pos, match in enumerate(self._regexp.finditer(value)): yield t.update(match.group(0), index=(match.start(), match.end(),), position=pos) else: # When gaps=True, iterate through the matches and # yield the text between them. left = 0 last_pos = 0 for pos, match in enumerate(regex.finditer(self._regexp, value)): right, next = match.span() if right != 0: yield t.update(value[left:right], position=pos, index=(left, right,)) left = next last_pos = pos if left != len(value): yield t.update(value[left:], position=last_pos+1, index=(left, len(value),))
def findall_p_in_s(p, s):
    """Return a list of Match objects (string, start, end) for a pattern (p) in a string (s)."""
    match_strs = regex.findall(p, s)
    # Get pairs of left and right indexes for each match.
    match_indexes = [(i.start(0), i.end(0)) for i in regex.finditer(p, s)]
    all_p_in_s = [Match(match_strs[i], match_indexes[i][0], match_indexes[i][1])
                  for i in range(0, len(match_strs))]
    return all_p_in_s
def _find(self, *args):
    global index
    try:
        for match in regex.finditer(r'^.+?:\d+?:.*%s.*$' % args[0], index.data,
                                    regex.MULTILINE | regex.IGNORECASE | regex.V1,
                                    concurrent=True):
            self._print(match.group(0))
    except sre_constants.error as e:
        # Report malformed search patterns instead of printing a blank line.
        print(e)
def plx_wrapper(text): before = text text = delimToPanlex(text) idx_list = [ex_match.start() for ex_match in re.finditer('⫷(?:ex|df)(?::\w{1,4}-\d{1,3})?⫸', text)] if len(idx_list) == 0: return process_synonyms(proc)(text) idx_list.append( len(text)) if len(text[ 0:idx_list[0] ].strip()) > 0: idx_list.insert(0,0) final_exp = [] for idx in range(len(idx_list) - 1): ex = text[ idx_list[idx] : idx_list[idx+1]] tag,ex_text,attributes = get_plx_fields(ex) result = proc(ex_text) result_match = re.search('(⫷(?:ex|df)(?::\w{1,4}-\d{1,3})?⫸)(.+)', result) if result_match: if len(result_match[1].strip()) > 0: final_exp.append('%s%s' % (result,attributes)) else: if len(result.strip()) > 0: final_exp.append('%s%s%s' % (tag,result,attributes)) final_exp = filter_unique_meanings(final_exp) text = ''.join(final_exp) return text
def process(self, file): with open(file, 'r', encoding='utf8') as f: string = f.read() dropping = True last_is_comment = False out = [self.head()] for m in regex.finditer(r'#[.:]\s*(?<comment>.*)|msgstr (?:"(?<msgstr>.*)"\n?)+', string): print(m[0], dropping) if dropping: if m['comment']: if regex.match(r'[\s\p{punct}]*VAR', m['comment']): continue body_m = regex.search(r'(?is)<body.*', m['comment']) if body_m: dropping = False out.append(body_m[0]) else: if m['comment']: out.append(m['comment']) last_is_comment = True else: passage = self.unescape(' '.join(m.captures('msgstr'))) passage = passage.strip() if passage: if not last_is_comment: out.append(' ') last_is_comment = False out.append(passage) html_string = ''.join(out) return self.pretty_print(html_string)
def plot_location(needle, haystack, cluster_id=None, nbins=20, size=(17, 2), fname=None): """plot_location.""" locs = [] for h, s in haystack: for match in re.finditer(needle, s): s = match.start() e = match.end() m = s + (e - s) / 2 locs.append(m) plt.figure(figsize=size) n, bins, patches = plt.hist( locs, nbins, normed=0, facecolor='blue', alpha=0.3) plt.grid() plt.title(needle) plt.xlabel('Position') plt.ylabel('Num occurrences') if fname: plt.draw() figname = '%s_loc_%d.png' % (fname, cluster_id) plt.savefig( figname, bbox_inches='tight', transparent=True, pad_inches=0) else: figname = None plt.show() plt.close() return figname
def process_verses(chap_string, expression): """ Take an entire chapters as a string and break up into verses. The new chapter index (number followed by a space) must be stripped out. :param chap_string: All verses in a chapter combined as one string. :param expression: A compiled regular expression with which to find new verses. :return: A list of strings (jagged array), with each verse as a separate string. """ # find all new verses with the regular expression matches = expression.finditer(chap_string) # save start position of first verse and initiate list of verses try: start = next(matches) except StopIteration: return [chap_string] verses = [] # loop through matches until StopIteration is raised at the last verse while True: try: end = next(matches) verses.append(chap_string[start.end()-1:end.start()]) start = end except StopIteration: verses.append(chap_string[start.end()-1:]) break # error correction - look for numbers in each verse and compare to verse number # This will differentiate between incorrectly formatted verses numbers and other numbers in the text. corrected_verses = [] for index, verse in enumerate(verses): nums = re.finditer(u'\d{1,3} ', verse) good = True for num in nums: if int(num.group()) - index == 2: # add first verse corrected_verses.append(verse[:num.start()]) # edit second verse second = verse[num.start():] second = second.replace(num.group(), num.group()[:len(num.group())]) corrected_verses.append(second) good = False break if good: corrected_verses.append(verse) # strip out the * marker used to help differentiate numbers and verses for index, verse in enumerate(corrected_verses): corrected_verses[index] = verse.replace(u'*', u'') return corrected_verses
def find_motif(s, t):
    """
    Return all 1-based locations of t as a substring of s.

    :param s: the DNA string to search in
    :param t: the DNA motif to search for
    """
    matches = finditer(t, s, overlapped=True)
    ans = [str(element.start() + 1) for element in matches]
    return " ".join(ans)
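# Usage sketch for find_motif above (Rosalind-style example, assumed): overlapped
# matching reports occurrences that share characters, returned as 1-based starts.
# Assumes `from regex import finditer` at module level, as the function requires.
print(find_motif("GATATATGCATA", "ATA"))   # -> "2 4 10"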
def glycocheck(protein): ind = [] import regex as re matches = re.finditer("N[^P][ST][^P]", protein, overlapped=True) for i in matches: ind.append(i.start() + 1) return ind
def MultipleApproxPatternMatch(inputset):
    # `text` (the sequence to search) is expected to be defined at module level.
    patterns, d = inputset
    all_matched_index = []
    for eachp in patterns:
        # Build an approximate-matching regex allowing up to d mismatches.
        reg = GetRE(eachp, d)
        for m in regex.finditer(reg, text, overlapped=True):
            all_matched_index.append(m.start())
    return all_matched_index
def finditer( pattern, string, flags=0, pos=None, endpos=None, overlapped=False, partial=False, concurrent=None, **kwargs ): return regex.finditer( _apply_search_backrefs(pattern, flags), string, flags, pos, endpos, overlapped, partial, concurrent, **kwargs )
def get_word_groups_from_line(self, song_line): word_groups = [] # for match in re.finditer(r"(?:^|\s)([A-Za-z,'! &?.]+)(?:\s|$)", song_line): for match in re.finditer(r"(?:^|\s)([^-0-9;]+)(?:\s|$)", song_line): # print(song_line + " => '" + m.group(0).strip() + "'") word_group = Group(match.start(), match.group(0).strip()) word_groups.append(word_group) return word_groups
def key(word):
    out = []
    for m in regex.finditer('(' + '|'.join(a) + ')|.', word):
        if m.group(1):
            if alpha[m[0]] is not None:
                out.append(alpha[m[0]])
        else:
            out.append(-1)
    return out
def get_glycan_sites(seq, regex_pattern, strip_gap):
    # overlapped=True needs the third-party regex module (assumed imported as `re`).
    if strip_gap:
        seq = seq.replace("-", "")
    sites = []
    iterator = re.finditer(regex_pattern, seq, overlapped=True)
    for match in iterator:
        start, end = match.span()
        sites.append(start + 1)
    return sites
def parse_scad_callables(scad_code_str): callables = [] # Note that this isn't comprehensive; tuples or nested data structures in # a module definition will defeat it. # Current implementation would throw an error if you tried to call a(x, y) # since Python would expect a(x); OpenSCAD itself ignores extra arguments, # but that's not really preferable behavior # TODO: write a pyparsing grammar for OpenSCAD, or, even better, use the yacc parse grammar # used by the language itself. -ETJ 06 Feb 2011 no_comments_re = r'(?mxs)(//.*?\n|/\*.*?\*/)' # Also note: this accepts: 'module x(arg) =' and 'function y(arg) {', both # of which are incorrect syntax mod_re = r'(?mxs)^\s*(?:module|function)\s+(?P<callable_name>\w+)\s*\((?P<all_args>.*?)\)\s*(?:{|=)' # See https://github.com/SolidCode/SolidPython/issues/95; Thanks to https://github.com/Torlos args_re = r'(?mxs)(?P<arg_name>\w+)(?:\s*=\s*(?P<default_val>([\w.\"\s\?:\-+\\\/*]+|\((?>[^()]|(?2))*\)|\[(?>[^\[\]]|(?2))*\])+))?(?:,|$)' # remove all comments from SCAD code scad_code_str = re.sub(no_comments_re, '', scad_code_str) # get all SCAD callables mod_matches = re.finditer(mod_re, scad_code_str) for m in mod_matches: callable_name = m.group('callable_name') args = [] kwargs = [] all_args = m.group('all_args') if all_args: arg_matches = re.finditer(args_re, all_args) for am in arg_matches: arg_name = am.group('arg_name') if am.group('default_val'): kwargs.append(arg_name) else: args.append(arg_name) callables.append({'name': callable_name, 'args': args, 'kwargs': kwargs}) return callables
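# Illustrative call for parse_scad_callables above. The recursive (?2) group in
# args_re needs the third-party regex module (assumed imported as `re`); the SCAD
# snippet and the expected result shown are assumptions, not verified output.
scad_src = "module box(w = 10, center = false) { cube([w, w, w], center); }"
print(parse_scad_callables(scad_src))
# roughly: [{'name': 'box', 'args': [], 'kwargs': ['w', 'center']}]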
def pep_end(pep,seq): if len(findall(pep,seq)) > 1: runs = finditer(pep,seq) coord = [] for match in runs: coord.append(match.end()) return coord elif len(findall(pep,seq)) == 1: return search(pep,seq).end() else: return 'Not found'
def find_out_regex(pattern, lines): end = len(lines) for match in regex.finditer(pattern, lines, regex.REVERSE): out = lines[match.end():end].rstrip('\n') if '\n' in out: return out else: end = match.start() # Skipping command raise Exception('No output to copy')
def write_index(seqname, seq, strings, index_file): """ Finds and then writes the indexes of the substrings """ if len(seq) > 1: tups = [] for string in strings: positions = [m.start() for m in re.finditer(string, seq.upper(), overlapped=True)] positions_rev = [m.end()-1 for m in re.finditer(reverse_complement(string), seq.upper(), overlapped=True)] tups += zip(positions, ["+"] * len(positions), [string] * len(positions)) tups += zip(positions_rev, ["-"] * len(positions_rev), [string] * len(positions_rev)) print("indexed " + str(len(positions) + len(positions_rev)) + " " + string + " sites") # Sort by position tups = sorted(tups, key=lambda x: x[0]) # print(tups) for t in tups: index_file.write(seqname + "\t" + str(t[0]) + "\t" + t[1] + "\t" + t[2] + "\n")
def parse_sequence(json_str): re_item = r'(?P<value>' + '|'.join( (re_bool, re_float, re_int, re_none, re_object, re_sequence, re_str)) + ')' items = regex.finditer(re_item, json_str[1:-1]) return [from_json(i.group('value')) for i in items]
def get_start_end_indexes(seq, seq_file): ''' Find a given string in a given file and get the indexes of said string in the file ''' find = [(m.start(0), m.end(0)) for m in regex.finditer(r'(?:'+seq+'){s<=2}' , seq_file)][0] start = find[0] end = find[1] matched_seq = seq_file[start:end] return (start, end, matched_seq)
def finditer(pattern, string, flags=0, pos=None, endpos=None, overlapped=False, partial=False, concurrent=None, **kwargs): """Wrapper for finditer.""" return regex.finditer(_apply_search_backrefs(pattern, flags), string, flags, pos, endpos, overlapped, partial, concurrent, **kwargs)
def search_motif(protein, motif): """Search for a motif in a protein. Args: protein: sequence of amino acids motif: Return: a list of indexes of the motif's locations within protein """ motif_regex = motif.replace("{", "[^").replace("}", "]") indexes = re.finditer(motif_regex, protein, overlapped=True) return [i.start() + 1 for i in indexes]
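# Usage sketch for search_motif above: a Prosite-style N-glycosylation motif,
# where {P} means "any residue except proline". The protein string is illustrative.
print(search_motif("ANQSGNVTA", "N{P}[ST]{P}"))   # -> [2, 6] (1-based positions)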
def find_gap_positions_and_matching_characters( gap_string: str, character_string: str) -> List[Operation]: assert len(gap_string) == len(character_string) positions = [m.start() for m in re.finditer('<', gap_string)] grouped_positions = group_consecutive_indexes(positions) operations = [] for group in grouped_positions: if len(group) == 0: continue characters = "" for index in group: characters += character_string[index] operations.append(Operation(group[0], characters)) return operations
def __init__(self, match, parent, parent_scan_position): NObject.__init__(self, match.start() + parent_scan_position, parent) # process condition. A non-boolean value will be true if different from null. self.if_condition = self.scan_expression( match.group('IF_CONDITION'), parent_scan_position + match.start('IF_CONDITION')) # process if_body self.if_body = [] for m in re.finditer(full_regex, match.group('IF_BODY')): if self.is_match_valid(m, parent_scan_position): self.if_body.append( self.scan_fun_body( m, parent_scan_position + match.start('IF_BODY'))) # process else_body self.else_body = [] if match.group('ELSE_BODY') != None: for m in re.finditer(full_regex, match.group('ELSE_BODY')): if self.is_match_valid( m, parent_scan_position + match.start('ELSE_BODY')): self.else_body.append( self.scan_fun_body( m, parent_scan_position + match.start('ELSE_BODY')))
def parse_footnotes(s): footnote_pattern = r'(?<!\d\s*)\\(?P<i>\d+)\\' # Parse footnotes footnote_paragraph_match = re.match(footnote_pattern, s) if footnote_paragraph_match: footnote = footnote_paragraph_match.group('i') footnotes = [] else: footnote = np.nan footnotes = [m.group('i') for m in re.finditer(footnote_pattern, s)] s = re.sub(footnote_pattern, '', s) return footnote, footnotes, s
def explicit_and(line, listAnd): """ Parse string of type "course code and course code and course code ..." where course code is used interchangbly with placeholder #ids Attributes --------- line: str pre-requisite string listAnd: str used in regex expression to check if course codes are explicitly joined by "and" Returns ---------- matchObj: dict dictionary with id as key and parsed json array as value as {'id': [{ 'type': 'AND', 'groups': [(str) code, (str) code, ...] }] } line: str modified pre-requisite string with group replaced by #id """ matchObj = {} matches = regex.finditer(listAnd, line) if matches != None: for x in matches: spl = regex.split(r' and |, ', x.group()) codes = [] for y in spl: codex = y if codex in matchObj: codex = matchObj[codex] else: codex = [codex] codes += codex obj = {'type': 'AND', 'groups': codes} id = genId() matchObj[id] = [obj] line = line.replace( x.group(), id) # replace group substring with placeholder id return matchObj, line
def perform_camel_case_splitting(self): ''' Convert all camelcase terms into individual terms ret1: processed content without any camelcase terms ''' content = self.current_content # self.camel_case_split_content = regex.sub(r'([a-z]*)([A-Z])', r'\1 \2', content) matches = regex.finditer( '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', content, regex.DOTALL) self.camel_case_split_content = " ".join([m.group(0) for m in matches]) self.current_content = self.camel_case_split_content return self.camel_case_split_content
def try_merge_modifier_token(self, extract_result: ExtractResult, pattern: Pattern, source: str, potential_ambiguity: bool = False) -> bool: before_str = source[0:extract_result.start] after_str = source[extract_result.start:extract_result.length] # Avoid adding mod for ambiguity cases, such as "from" in "from ... to ..." should not add mod if potential_ambiguity and self.config.ambiguous_range_modifier_prefix and \ regex.search(self.config.ambiguous_range_modifier_prefix, before_str): matches = list( regex.finditer(self.config.potential_ambiguous_range_regex, source)) if matches and len(matches): return any(match.start() < extract_result.start + extract_result.length and match.end() > extract_result.start for match in matches) # return self._filter_item(extract_result, matches) token = self.has_token_index(before_str.strip(), pattern) if token.matched: mod_len = len(before_str) - token.index extract_result.length += mod_len extract_result.start -= mod_len extract_result.text = source[extract_result. start:extract_result.start + extract_result.length] extract_result.meta_data = self.assign_mod_metadata( extract_result.meta_data) return True elif self.config.check_both_before_after: # check also after_str after_str = source[extract_result.start:extract_result.length] token = self.has_token_index(after_str.strip(), pattern) if token.matched: mod_len = token.index + len(after_str) - len(after_str.strip()) extract_result.length += mod_len extract_result.text = source[extract_result. start:extract_result.start + extract_result.length] extract_result.data = Constants.HAS_MOD extract_result.meta_data = self.assign_mod_metadata( extract_result.meta_data) return True return False
def get_range_conditions(range_string): for matches in regex.finditer(r"(?:([0-9]*)[:\-.]([0-9]*)|([0-9]+))", range_string): start, end, singular = matches.groups() if start and end and int(start) > int(end): start, end = end, start yield (lambda x, s=singular: int(s) == x) if singular else ( lambda x: True ) if not (start or end) else ( lambda x, s=start: x >= int(s) ) if start and not end else ( lambda x, e=end: x <= int(e) ) if not start and end else ( lambda x, s=start, e=end: int(s) <= x <= int(e) )
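# Brief usage sketch for get_range_conditions above (illustrative input): each
# yielded callable tests whether an integer satisfies one comma-separated term.
conds = list(get_range_conditions("2-4,7"))
print([c(3) for c in conds])   # -> [True, False]: 3 lies in 2-4 but is not 7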
def sort_by_barcode(list_sParameters): print('Processing %s' % list_sParameters[1]) sSplitTag = list_sParameters[0] sInFastq = list_sParameters[1] sTempOut = list_sParameters[2] sBarcodeFile = list_sParameters[3] sRE = '[T]{7}' nBarcode3Cut = 3 nBarcode5Ext = 18 dict_sBarcodes = load_PE_input(sBarcodeFile) dict_sOutput = {} InFile = open(sInFastq, 'r') for i, sReadLine in enumerate(InFile): if i % 4 == 0: sReadID = sReadLine.replace('\n', '') if i % 4 != 1: continue sNGSSeq = sReadLine.replace('\n', '').upper() for sReIndex in regex.finditer(sRE, sNGSSeq, overlapped=True): nIndexStart = sReIndex.start() nIndexEnd = sReIndex.end() sBarcode = sNGSSeq[nIndexStart + nBarcode3Cut:nIndexEnd + nBarcode5Ext] if nIndexStart > (len(sNGSSeq) / 2): continue # SKIP barcode in back of read ### Skip Non-barcodes ### try: cPE = dict_sBarcodes[sBarcode] except KeyError: continue ######################### if sBarcode not in dict_sOutput: dict_sOutput[sBarcode] = [] dict_sOutput[sBarcode].append([sReadID, sNGSSeq]) #loop END: i, sReadLine #loop END: cPE InFile.close() ## Pickle Out ## sOutFile = '%s/%s.data' % (sTempOut, sSplitTag) OutFile = open(sOutFile, 'wb') pickle.dump(dict_sOutput, OutFile) OutFile.close()
def _get_locations(symbol_name, text): locations = [] for sentence in finditer(r'(^|[\.\!\?])[\s]*([^\.\!\?]+([\.\!\?]|$))', text, overlapped=True): existences = _locator(symbol_name, sentence.group(2)) if len(existences) == 0: continue start = sentence.start(2) end_shift = sentence.end(2) - start sent_locations = [(start, ex[0], ex[1] - ex[0], end_shift) for ex in existences] locations += sent_locations return locations
def find_motif(dna: str, motif: str, re: bool) -> list:
    locs = []
    # Case where the motif is a plain sequence rather than a regular expression.
    if not re:
        ml = len(motif)
        for i in range(len(dna)):
            if dna[i] == motif[0]:
                # <= so that a motif ending on the last base is not skipped.
                if i + ml <= len(dna) and dna[i: i + ml] == motif:
                    locs.append(i)
    else:
        motif = regex.compile(motif)
        for match in regex.finditer(motif, dna, overlapped=True):
            locs.append(match.start())
    return locs
def annotate(self, doc): doc = doc.upper().replace('\r\n', '\n') #negation detection works better with newlines removed doc = doc.replace('\n', ' ') doc = doc.replace('\r', ' ') doc = regex.sub(r' +', ' ', doc) doc = doc.replace(" .", ".") doc = regex.sub(r'\.+', ".", doc) doc_data = {'mentions': []} for dr in self.all_drugs: #{'mentioned':False, 'negated':False, 'status': False} reg = self.positive_regex[dr] m = regex.finditer(reg.pattern, doc) for match in m: # each time the drug name is found, test for negation mention = { 'drug': dr, 'mentioned': True, 'negated': False, 'allergic': False } mention.update({x: False for x in self.user_regex}) # get context and search within it for negation ctx_from = max(match.span()[0] - self.review_window, 0) ctx_to = min(match.span()[1] + self.review_window, len(doc)) ctx = doc[ctx_from:ctx_to] mention['ctx'] = ctx mention['start'] = match.span()[0] mention['end'] = match.span()[1] ctx_from = max(match.span()[0] - self.negation_window, 0) ctx_to = min(match.span()[1] + self.negation_window, len(doc)) ctx = doc[ctx_from:ctx_to] for negator in self.negative_regex[dr]: if len(negator.findall(ctx)) > 0: mention['negated'] = True for name in self.user_regex: for pattern in self.user_regex[name]: if len(pattern.findall(ctx)) > 0: mention[name] = True al = self.detect_allergy(ctx, dr) mention['allergic'] = al doc_data['mentions'].append(mention) return doc_data
def basic_regex_match(self, source: str) -> []: from .utilities import Token result: List[Token] = list() for pattern in self.config.time_regex_list: matches = list(regex.finditer(pattern, source)) # @TODO Workaround to avoid incorrect partial-only matches. Remove after time regex reviews across languages. matches = list(filter(lambda match: self.lth_check(match), matches)) result.extend(map(lambda x: Token(x.start(), x.end()), matches)) return result
def __analyze_patterns(self, text: str, flags: int = None): """ Evaluate all patterns in the provided text. Logic includes detecting words in the provided deny list. In a sentence we could get a false positive at the end of our regex, were we want to find the IBAN but not the false positive at the end of the match. i.e. "I want my deposit in DE89370400440532013000 2 days from today." :param text: text to analyze :param flags: regex flags :return: A list of RecognizerResult """ results = [] for pattern in self.patterns: matches = re.finditer(pattern.regex, text, flags=self.flags) for match in matches: for grp_num in reversed(range(1, len(match.groups()) + 1)): start = match.span(0)[0] end = (match.span(grp_num)[1] if match.span(grp_num)[1] > 0 else match.span(0)[1]) current_match = text[start:end] # Skip empty results if current_match == "": continue score = pattern.score validation_result = self.validate_result(current_match) description = PatternRecognizer.build_regex_explanation( self.name, pattern.name, pattern.regex, score, validation_result) pattern_result = RecognizerResult( self.supported_entities[0], start, end, score, description) if validation_result is not None: if validation_result: pattern_result.score = EntityRecognizer.MAX_SCORE else: pattern_result.score = EntityRecognizer.MIN_SCORE if pattern_result.score > EntityRecognizer.MIN_SCORE: results.append(pattern_result) break return results
def hightlight_keywords(self, text, keywords, light_color='#ffea593d', deep_color='#ffc107'): hightlighted_html = '' all_hightlights = [] tokens = self.full_tokenize(text) for t in tokens: t['background_color'] = [] for w in keywords: matches = regex.finditer( r'\b{}\b'.format(regex.escape(w, special_only=True)), text, flags=regex.IGNORECASE ) all_hightlights.extend([ { 'start': m.start(), 'end': m.end(), 'text': m.group(), } for m in matches ]) all_hightlights = sorted(all_hightlights, key=lambda x: x['start']) for h in all_hightlights: for t in tokens: if (t['start'] >= h['start'] and t['end'] <= h['end']): t['background_color'].append(light_color) for t in tokens: color_len = len(t['background_color']) if color_len == 0: hightlighted_html += t['text'] elif color_len == 1: hightlighted_html += \ '<span style="background-color:{background_color};">{text}</span>'.format( background_color=t['background_color'][0], text=t['text'] ) else: hightlighted_html += \ '<span style="background-color:{background_color};">{text}</span>'.format( background_color=deep_color, text=t['text'] ) return hightlighted_html
def match_utterance(user_input): """ match_utterance accepts the user's utterance and tries to find a template matching it """ for key in dialogue_pairs: matches = list(re.finditer(key, user_input)) if matches: choice = np.random.randint(len(dialogue_pairs[key])) answer = re.sub(key, dialogue_pairs[key][choice], user_input, count=1) return answer
def split_into_words(token: str) -> List[ParsedToken]: """ >>> split_into_words(" var = 9.4\\t\\n") [<Tab>, SplitContainer[Word(('var', none))], NonCodeChar(=), <Number>(9), \ NonCodeChar(.), <Number>(4), <Tab>, <NewLine>] """ res = [] four_char_whitespace = " " * 4 for m in regex.finditer(f"(\\w+|[^ ]|{four_char_whitespace})", token): if m[0] == four_char_whitespace: res.append(Tab()) else: res.append(to_parsed_token(m[0])) return res
def _calculate_representation_original_array(self): original_interval_start = max(0, self.list_repeats_starts[0] - 5) # flanks for regex original_interval_end = self.list_repeats_starts[-1] + len( self.list_repeats[-1]) + 5 # flanks for regex original_dna_interval = self.full_dna[ original_interval_start:original_interval_end] search_pattern = f"(?e)({self.repeat_seq_candidate})" + "{" + "i<=3,d<=3,s<=3,s+i+d<=6" + "}" original_matches = list( regex.finditer(search_pattern, original_dna_interval)) self.list_repeats_original = [ match.group() for match in original_matches ] self.list_repeats_starts_original = [ original_interval_start + match.start() for match in original_matches ] if len(original_matches) > 1: internal_spacer_starts = [ match.end() for match in original_matches ][:-1] internal_spacer_end = [ match.start() for match in original_matches ][1:] internal_spacer_coordinates = [(start, end) for start, end in zip( internal_spacer_starts, internal_spacer_end)] self.list_spacers_original = [ original_dna_interval[start_end[0]:start_end[1]] for start_end in internal_spacer_coordinates ] else: self.list_spacers_original = [] relative_error_indexes = [] for match in original_matches: tuple_match_errors = match.fuzzy_changes list_relative_errors = [[e - match.start() for e in err_type] for err_type in tuple_match_errors] relative_error_indexes.append(list_relative_errors) self.list_repeats_error_indexes_original = relative_error_indexes self.list_fuzzy_counts_original = [ match.fuzzy_counts for match in original_matches ]
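# Minimal illustration of the fuzzy-matching syntax used above (toy input, assumed):
# "(?e)" asks the regex module for an enhanced/best fit, the {...} suffix bounds
# insertions (i), deletions (d) and substitutions (s), and fuzzy_counts reports
# the (substitutions, insertions, deletions) actually used.
m = regex.search(r"(?e)(ACGTACGT){s<=1}", "TTACGAACGTTT")
print(m.group(), m.fuzzy_counts)   # e.g. ACGAACGT (1, 0, 0)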
def format_internal_header_links(document, notes): """Formats Obsidian style header links""" matches = re.finditer("\\[{2}([^|#\\]]*?)#(.*?)\\]{2}", document) for match in matches: text = match.group(2) link = slug_case(match.group(1)) + ".html#" + slug_case(match.group(2)) files = [note['filename'] for note in notes] if match.group(1) in files: document = document.replace(match.group(), md_link(text, link)) else: document = document.replace(match.group(), text) return document
def xp_single_sub_check(self, sub, regex, error_type, color, info_text, reflags):
    flags = re.U | reflags if reflags is not None else re.UNICODE
    res = re.finditer(regex, sub.text, flags=flags)
    res_list = [(item.start(), item.end()) for item in res]
    if len(res_list) > 0:
        #sub.info = (error_type, (u'<span foreground="'+color+'">' + info_text + u'</span>', res_list))
        sub.info = (error_type, (info_text, res_list))
    else:
        sub.info = (error_type, '')
def get_label_masks(self, label, source_pattern, translation, source, target: Optional[str] = None) -> Tuple[str, str, Dict]: """ Search for `source_pattern` in `source`. If `target` is None, replace the matched text with `label`. If `target` is not None, replace the matched text with `label` only if `translation` can be found in `target`. If `translation` is None, the matched text is used to match against `target` instead. """ masks = [] source_mods = [] target_mods = [] for source_match in re.finditer(source_pattern, source): matched_text = source_match.group() source_match_position = source_match.start() replacement_text = translation if translation is not None else matched_text loc_in_target = -1 if target is None else target.find(replacement_text) if target is None or loc_in_target != -1: self.counts[label] += 1 labelstr = self.get_mask_string(label, self.counts[label]) # Run the regex again so we make sure to get the exact place # (str.replace here would be quicker but may find a match in the sentence # not exactly like the matched regex) source_mods.append((source_match_position, len(matched_text), labelstr)) if target is not None: target_mods.append((loc_in_target, len(replacement_text), labelstr)) mask = { "maskstr" : labelstr.strip(), "matched" : matched_text, "replacement" : replacement_text } masks.append(mask) else: self.counts_missed[label] += 1 def apply_mods(text, mod_list): if mod_list: offset = 0 for i, matched_len, mask in mod_list: i += offset text = text[0:i] + mask + text[i + matched_len:] offset += len(mask) - matched_len return text source = apply_mods(source, source_mods) target = apply_mods(target, target_mods) return source, target, masks
def plot_location(needle, haystack, nbins=20, size=(17, 2)): """plot_location.""" locs = [] for h, s in haystack: for match in re.finditer(needle, s): s = match.start() e = match.end() m = s + (e - s) / 2 locs.append(m) plt.figure(figsize=size) n, bins, patches = plt.hist( locs, nbins, normed=0, facecolor='blue', alpha=0.3) plt.grid() plt.show()
def match(original_text, word_or_token_list_to_match, clean_text=None): '''See README.md for a description of how to use this function.''' regex_flags = re.U | re.I if len(word_or_token_list_to_match) == 0: return [] if not (clean_text): clean_text = _cleanup_text(original_text) if type(word_or_token_list_to_match) is list: to_match = untokenize(" ".join(word_or_token_list_to_match).strip()) matches = [ (m.start(), m.end(), original_text[m.start():m.end()]) for m in re.finditer(re.escape(to_match), clean_text, regex_flags) ] if len(matches) == 0: matches = [(m.start(), m.end(), original_text[m.start():m.end()]) for m in re.finditer( r'\s*'.join( re.escape(w) for w in word_or_token_list_to_match), original_text, regex_flags)] if len(matches) == 0: edit_distance_match = _match_by_edit_distance( clean_text, re.sub(r'\\s[\*\+]', r' ', to_match)) matches = [(m.start(), m.end(), original_text[m.start():m.end()]) for m in re.finditer(re.escape(edit_distance_match), clean_text, regex_flags)] if len(matches) == 0: edit_distance_match = _match_by_edit_distance( original_text, re.sub(r'\\s[\*\+]', r' ', to_match)) matches = [ (m.start(), m.end(), original_text[m.start():m.end()]) for m in re.finditer(re.escape(edit_distance_match), original_text, regex_flags) ] if len(matches) == 0: edit_distance_match = _match_by_edit_distance( original_text, " ".join(word_or_token_list_to_match)) matches = [(m.start(), m.end(), original_text[m.start():m.end()]) for m in re.finditer( re.escape(edit_distance_match), original_text, regex_flags)] if len(matches) == 0: return [] else: matches = [(m.start(), m.end(), original_text[m.start():m.end()]) for m in re.finditer( r'\b' + re.escape(word_or_token_list_to_match) + r'\b', clean_text, regex_flags)] return sorted(matches)
def process_marked_lines(lines, markers, return_flags=[False, -1, -1]): """Run regexes against message's marked lines to strip quotations. Return only last message lines. >>> mark_message_lines(['Hello', 'From: [email protected]', '', '> Hi', 'tsem']) ['Hello'] Also returns return_flags. return_flags = [were_lines_deleted, first_deleted_line, last_deleted_line] """ markers = ''.join(markers) # if there are no splitter there should be no markers if 's' not in markers and not re.search('(me*){3}', markers): markers = markers.replace('m', 't') if re.match('[te]*f', markers): return_flags[:] = [False, -1, -1] return lines # inlined reply # use lookbehind assertions to find overlapping entries e.g. for 'mtmtm' # both 't' entries should be found for inline_reply in re.finditer('(?<=m)e*((?:t+e*)+)m', markers): # long links could break sequence of quotation lines but they shouldn't # be considered an inline reply links = (RE_PARENTHESIS_LINK.search(lines[inline_reply.start() - 1]) or RE_PARENTHESIS_LINK.match( lines[inline_reply.start()].strip())) if not links: return_flags[:] = [False, -1, -1] return lines # cut out text lines coming after splitter if there are no markers there quotation = re.search('(se*)+((t|f)+e*)+', markers) if quotation: return_flags[:] = [True, quotation.start(), len(lines)] return lines[:quotation.start()] # handle the case with markers quotation = (RE_QUOTATION.search(markers) or RE_EMPTY_QUOTATION.search(markers)) if quotation: return_flags[:] = True, quotation.start(1), quotation.end(1) return lines[:quotation.start(1)] + lines[quotation.end(1):] return_flags[:] = [False, -1, -1] return lines
def match_simple_cases(self, regexp: Pattern, source: str) -> List[Token]: result: List[Token] = list() for regexp in self.config.simple_cases_regex: matches: [Match] = regex.finditer(regexp, source) if matches: for match in matches: if RegExpUtility.get_group(match, Constants.MINUTE_GROUP_NAME) or\ RegExpUtility.get_group(match, Constants.SECOND_GROUP_NAME): end_with_valid_token = True if (source.index(match.group()) + (match.end() - match.start())) == len(source): end_with_valid_token = True else: after_str = source[source.index(match.group()) + (match.end() - match.start())] end_with_general_endings = self.config.general_ending_regex.match(after_str) end_with_am_pm = RegExpUtility.get_group(match, Constants.RIGHT_AM_PM_GROUP_NAME) if end_with_general_endings or end_with_am_pm or\ after_str.lstrip().startswith(self.config.token_before_date): end_with_valid_token = True elif (self.config.options & DateTimeOptions.ENABLE_PREVIEW) != 0: # When TimeZone be migrated enable it end_with_valid_token = False if end_with_valid_token: result.append(Token(source.index(match.group()), source.index(match.group()) + (match.end() - match.start()))) else: match_pm_str = RegExpUtility.get_group(match, Constants.PM_GROUP_NAME) match_am_str = RegExpUtility.get_group(match, Constants.AM_GROUP_NAME) desc_str = RegExpUtility.get_group(match, Constants.DESC_GROUP_NAME) if match_pm_str or match_am_str or desc_str: result.append(Token(source.index(match.group()), source.index(match.group()) + (match.end() - match.start()))) else: after_str = source[source.index(match.group()) + (match.end() - match.start()):] # When TimeZone be migrated enable it if (self.config.options & DateTimeOptions.ENABLE_PREVIEW) != 0: result.append(Token(source.index(match.group()), source.index(match.group()) + (match.end() - match.start()))) return result
def detect_reference_start(self): self.references_start_index = 0 pdftext = self.get_fulltext() #str(self.pdf) reference_regexes = [ r"\breferences\b", # Title: References r"\bliterature cited\b" ] # Title: Literature cited potential_reference_starts = [] for title_wording in reference_regexes: potential_reference_starts += [ r.end() for r in re.finditer(title_wording, pdftext, re.IGNORECASE) ] print("Found these starts of potential reference section") for ref in potential_reference_starts: print(f"MATCH: {pdftext[ref-1:ref+20]}... at location {ref}") print( f"There are {len(potential_reference_starts)} potential reference start points" ) if len(potential_reference_starts) == 0: print("Could not detect references start point.") elif len(potential_reference_starts) == 1: self.references_start_index = potential_reference_starts[0] else: print( "There were more than one probable reference section. Detecting the most probable one." ) most_probable_start_point = 0 most_year_mentions = 0 # Test of correct reference point is based on how many year numbers are followed by the keyword # As it is more likely for references to be in the end of the self (although not always the case) # The number of year numbers is multiplied by index of the finding... for index, test_start in enumerate(potential_reference_starts): test_text = pdftext[test_start:test_start + 1000] nr_year_mentions = (index + 1) * len( re.findall(r"[\s\(\.,;]((?:19|20)\d\d)[\s\)\.,;]", test_text)) print( f"Position {test_start} is followed by {nr_year_mentions} year mentions." ) if nr_year_mentions > most_year_mentions: most_probable_start_point = test_start most_year_mentions = nr_year_mentions self.references_start_index = most_probable_start_point
def extract(self, source: str, reference: datetime = None) -> List[ExtractResult]: if reference is None: reference = datetime.now() result: List[ExtractResult] = list() if not source: return result match_source: Dict[Match, any] = dict() matched: List[bool] = [False] * len(source) collections = list( map(lambda x: (list(regex.finditer(x[0], source)), x[1]), self._regex_dict.items())) collections = list(filter(lambda x: len(x[0]) > 0, collections)) for collection in collections: for match in collection[0]: for j in range(len(match.group())): matched[match.start() + j] = True match_source[match] = collection[1] last = -1 for i in range(len(source)): if matched[i]: if i + 1 == len(source) or not matched[i + 1]: start = last + 1 length = i - last text = source[start:start + length].strip() src_match = next( (x for x in iter(match_source) if (x.start() == start and (x.end() - x.start()) == length)), None) if src_match: value = ExtractResult() value.start = start value.length = length value.text = text value.type = self.extractor_type_name value.data = self.__get_data(match_source, src_match) result.append(value) else: last = i return result
def valueRe(value):
    '''CHECKS WHETHER A VALUE HAS ANY TRAILING SPACES (TEST FURTHER)'''
    # 1. Define regex expression
    r = r'^[-\w/#]*|[-\w/#]*$'
    # 2. Return the first non-empty match in the value string
    for x in regex.finditer(r, value):
        if x.group(0) != '':
            return x.group(0)
    # Fall back to the value itself if the regex finds nothing non-empty
    # (e.g. corner cases the pattern was never tested against).
    return value
def init_mapper(reads, strand): """ Basic mapping function, based on regex, finds a match in a genome and retrieves the location of insertion within the genome. reads - a list of queries strand - the genome sequence as a string (posstive of negative) """ found = list() for read in reads: for match in re.finditer(read, strand): info = '%02d-%02d: %s' % (match.start(), match.end(), match.group(0)) found.append((read, info)) return found # the hits found by mapping