def processText(text):
    '''
    Strips some unwanted characters. Originally stripped the "references"
    section according to pubGeneric, but that wasn't working. Splits full
    text strings with a simple sentence filter.
    '''
    # Strip ^G (bell) and carriage returns.
    text = re.sub(r'\x07|\r', '', text)
    # Earlier attempts, kept for reference:
    #text = re.sub(r'\x07|\r|[(\s{0,3}\d{1,3}\s{0,3})(,\s{0,3}\d{1,3}\s{0,3}){0,7}\]', '', text)  # strip ^G, \r, and inline citations
    #sections = pubGeneric.sectionRanges(text)
    #if sections is not None:
    #    try:
    #        dropRange = sections['ack']
    #        text = text[:dropRange[0]] + text[dropRange[1]:]
    #    except KeyError:
    #        pass
    #    try:
    #        dropRange = sections['refs']
    #        text = text[:dropRange[0]] + text[dropRange[1]:]
    #    except KeyError:
    #        pass

    # Split at a period followed by a capital letter within the next
    # three characters.
    previousThreshold = -2
    for match in re.finditer(r'\..?.?([A-Z])', text):
        threshold = match.start()
        yield text[previousThreshold + 2:threshold + 1]
        previousThreshold = threshold
    # Yield the tail after the final boundary; slicing from
    # previousThreshold + 2 matches the loop's segment starts and also
    # covers the no-match case (previousThreshold == -2).
    yield text[previousThreshold + 2:]
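# A minimal usage sketch of processText; the sample sentence is made up.
for sentence in processText("One fish. Two fish. Red fish."):
    print(repr(sentence))
# 'One fish.'
# 'Two fish.'
# 'Red fish.'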
def search(self, regex, flags=0, all=False):
    if all:
        result = dict()
        result["detail"] = []
        matches = []
        # Scan every chunk of every mapping and collect all matches.
        for mapping in self.address_space:
            for chunk in mapping["chunks"]:
                self.dumpfile.seek(chunk["offset"])
                match = re.finditer(regex, self.dumpfile.read(chunk["end"] - chunk["start"]), flags)
                thismatch = []
                try:
                    while True:
                        m = next(match)
                        thismatch.append(m)
                        matches.append(m.group(0))
                except StopIteration:
                    pass
                if thismatch:
                    result["detail"].append({"match": thismatch, "chunk": chunk})
        result["matches"] = matches
        return result
    else:
        # Return on the first match found in any chunk.
        for mapping in self.address_space:
            for chunk in mapping["chunks"]:
                self.dumpfile.seek(chunk["offset"])
                match = re.search(regex, self.dumpfile.read(chunk["end"] - chunk["start"]), flags)
                if match:
                    result = dict()
                    result["match"] = match
                    result["chunk"] = chunk
                    return result
def predict_segment(self, corpus_segment):
    from data_utilities.util import get_mapping_after_normalization, convert_normalized_indices_to_unnormalized_indices
    norm_text = TextNormalizer.normalize_text(corpus_segment.language, corpus_segment.text)
    mentions = []
    for match in re.finditer(self.named_entity_regex, norm_text):
        mentions += [Mention(match.start(1), match.end(1), match.group(1),
                             ref=corpus_segment.ref,
                             versionTitle=corpus_segment.versionTitle,
                             language=corpus_segment.language)]
    # Match offsets refer to the normalized text; map them back to
    # offsets in the original, unnormalized text.
    mention_indices = [(mention.start, mention.end) for mention in mentions]
    norm_map = get_mapping_after_normalization(corpus_segment.text, TextNormalizer.find_text_to_remove)
    mention_indices = convert_normalized_indices_to_unnormalized_indices(mention_indices, norm_map)
    for mention, (unnorm_start, unnorm_end) in zip(mentions, mention_indices):
        mention.add_metadata(start=unnorm_start, end=unnorm_end)
    mentions = self.filter_already_found_mentions(mentions, corpus_segment.text, corpus_segment.language)
    return mentions
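# Standalone sketch of the normalize-then-remap idea used in
# predict_segment above. Everything here is hypothetical and
# self-contained; the real code delegates the index bookkeeping to
# get_mapping_after_normalization and
# convert_normalized_indices_to_unnormalized_indices.
import re

def remap_demo(original, junk=r'\s+', needle=r'abc'):
    # Build the normalized text and, for each normalized index, remember
    # which original index the character came from.
    back, norm_chars, pos = [], [], 0
    removed = [(m.start(), m.end()) for m in re.finditer(junk, original)]
    for start, end in removed + [(len(original), len(original))]:
        for i in range(pos, start):
            norm_chars.append(original[i])
            back.append(i)
        pos = end
    norm = ''.join(norm_chars)
    # Search the normalized text, then translate the span back.
    m = re.search(needle, norm)
    if m:
        return original[back[m.start()]:back[m.end() - 1] + 1]

print(remap_demo("a b c!"))  # -> 'a b c'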
def get_tlsmaster(self):
    """Obtain the client/server random to TLS master secrets mapping that
    we have obtained through dynamic analysis."""
    tlsmaster = {}
    dump_tls_log = os.path.join(self.analysis_path, "tlsdump", "tlsdump.log")
    if not os.path.exists(dump_tls_log):
        return tlsmaster

    for entry in open(dump_tls_log, "r").readlines() or []:
        try:
            for m in re.finditer(
                r"client_random:\s*(?P<client_random>[a-f0-9]+)\s*,\s*server_random:\s*(?P<server_random>[a-f0-9]+)\s*,\s*(?P<master_secret>[a-f0-9]+)\s*",
                entry,
                re.I,
            ):
                try:
                    client_random = binascii.a2b_hex(m.group("client_random").strip())
                    server_random = binascii.a2b_hex(m.group("server_random").strip())
                    master_secret = binascii.a2b_hex(m.group("master_secret").strip())
                except Exception as e:
                    log.warning("Problem dealing with tlsdump error:{0} line:{1}".format(e, m.group(0)))
                    # Skip this match rather than storing stale values.
                    continue
                tlsmaster[client_random, server_random] = master_secret
        except Exception as e:
            log.warning("Problem dealing with tlsdump error:{0} line:{1}".format(e, entry))
    return tlsmaster
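# Quick illustration of the tlsdump.log line format the regex above
# expects; the hex values are synthetic, and real log lines may differ.
import re
import binascii

sample = "client_random: aabbcc, server_random: ddeeff, 00112233"
m = re.search(
    r"client_random:\s*(?P<client_random>[a-f0-9]+)\s*,\s*server_random:\s*(?P<server_random>[a-f0-9]+)\s*,\s*(?P<master_secret>[a-f0-9]+)",
    sample,
    re.I,
)
print(binascii.a2b_hex(m.group("master_secret")))  # b'\x00\x11"3'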
def test_bug_581080(self):
    it = re.finditer(r"\s", "a b")
    self.assertEqual(next(it).span(), (1, 2))
    self.assertRaises(StopIteration, next, it)

    scanner = re.compile(r"\s").scanner("a b")
    self.assertEqual(scanner.search().span(), (1, 2))
    self.assertEqual(scanner.search(), None)
def __check_pattern(self, doc, results, field):
    """Check for specific pattern in text

    Args:
        doc: spacy document to analyze
        results: array containing the created results
        field: current field type (pattern)
    """
    max_matched_strength = -1.0
    for pattern in field.patterns:
        # Patterns are assumed to be ordered by descending strength, so a
        # weaker pattern can never improve on a match already found.
        if pattern.strength <= max_matched_strength:
            break
        result_found = False

        match_start_time = datetime.datetime.now()
        matches = re.finditer(pattern.regex, doc.text,
                              flags=re.IGNORECASE | re.DOTALL | re.MULTILINE)
        match_time = datetime.datetime.now() - match_start_time
        self.logger.debug('--- match_time[{}]: {}.{} seconds'.format(
            field.name, match_time.seconds, match_time.microseconds))

        for match in matches:
            start, end = match.span()
            field.text = doc.text[start:end]

            # Skip empty results
            if field.text == '':
                continue

            # Don't add duplicates
            if len(field.patterns) > 1 and any(
                    ((x.location.start == start) or (x.location.end == end)) and
                    (x.field.name == field.name) for x in results):
                continue

            res = self.__create_result(doc, pattern.strength, field, start, end)
            if res is None or res.score == 0:
                continue

            # Don't add overlap
            # if any(x.location.end >= start and x.score == 1.0
            #        for x in results):
            #     continue

            results.append(res)
            result_found = True
        if result_found:
            max_matched_strength = pattern.strength
def splitGenbankAcc(acc):
    """ split a string like AY1234 into letter-number tuple, e.g. (AY, 1234)
    >>> splitGenbankAcc("AY1234")
    ('AY', 1234, 4)
    """
    matches = list(re.finditer(r"([A-Z]+)([0-9]+)", acc))  # re2 has trouble with the .match function
    # Reject accessions that are not exactly one letter run followed by
    # one digit run.
    if len(matches) > 1 or len(matches) == 0:
        return None

    match = matches[0]
    letters, numbers = match.groups()
    return (letters, int(numbers), len(numbers))
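# Illustrative calls; the accession strings are made up.
print(splitGenbankAcc("AY1234"))     # ('AY', 1234, 4)
print(splitGenbankAcc("NM_007294")) # None: no letter run directly followed by digits
print(splitGenbankAcc("AB12CD34"))  # None: more than one letter-digit run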
def __analyze_patterns(self, text, flags=None):
    """
    Evaluates all patterns in the provided text, including words in
    the provided blacklist

    :param text: text to analyze
    :param flags: regex flags
    :return: A list of RecognizerResult
    """
    flags = flags if flags else re.DOTALL | re.MULTILINE
    results = []
    for pattern in self.patterns:
        match_start_time = datetime.datetime.now()
        matches = re.finditer(pattern.regex, text, flags=flags)
        match_time = datetime.datetime.now() - match_start_time
        self.logger.debug('--- match_time[%s]: %s.%s seconds',
                          pattern.name, match_time.seconds, match_time.microseconds)

        for match in matches:
            start, end = match.span()
            current_match = text[start:end]

            # Skip empty results
            if current_match == '':
                continue

            score = pattern.score

            validation_result = self.validate_result(current_match)
            description = self.build_regex_explanation(
                self.name, pattern.name, pattern.regex, score, validation_result)
            pattern_result = RecognizerResult(
                self.supported_entities[0], start, end, score, description)

            # Validation and invalidation override the pattern score.
            if validation_result is not None:
                if validation_result:
                    pattern_result.score = EntityRecognizer.MAX_SCORE
                else:
                    pattern_result.score = EntityRecognizer.MIN_SCORE

            invalidation_result = self.invalidate_result(current_match)
            if invalidation_result is not None and invalidation_result:
                pattern_result.score = EntityRecognizer.MIN_SCORE

            if pattern_result.score > EntityRecognizer.MIN_SCORE:
                results.append(pattern_result)

    return results
def __analyze_patterns(self, text):
    """
    Evaluates all patterns in the provided text, including words in
    the provided blacklist

    In a sentence we could get a false positive at the end of our regex,
    where we want to find the IBAN but not the false positive at the end
    of the match, e.g. "I want my deposit in DE89370400440532013000 2
    days from today."

    :param text: text to analyze
    :return: A list of RecognizerResult
    """
    results = []
    for pattern in self.patterns:
        matches = re.finditer(pattern.regex, text, flags=self.flags)

        for match in matches:
            # Walk the capture groups from last to first and trim the
            # match to the last group that actually participated.
            for grp_num in reversed(range(1, len(match.groups()) + 1)):
                start = match.span(0)[0]
                end = (match.span(grp_num)[1]
                       if match.span(grp_num)[1] > 0
                       else match.span(0)[1])
                current_match = text[start:end]

                # Skip empty results
                if current_match == "":
                    continue

                score = pattern.score

                validation_result = self.validate_result(current_match)
                description = PatternRecognizer.build_regex_explanation(
                    self.name, pattern.name, pattern.regex, score, validation_result)
                pattern_result = RecognizerResult(
                    self.supported_entities[0], start, end, score, description)

                if validation_result is not None:
                    if validation_result:
                        pattern_result.score = EntityRecognizer.MAX_SCORE
                    else:
                        pattern_result.score = EntityRecognizer.MIN_SCORE

                if pattern_result.score > EntityRecognizer.MIN_SCORE:
                    results.append(pattern_result)
                break

    return results
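# Minimal illustration of the group-trimming idea above; the pattern and
# sentence are made up, not the real IBAN recognizer patterns.
import re

text = "I want my deposit in DE89370400440532013000 2 days from today."
match = re.search(r"(DE\d{20})\s+\d", text)
for grp_num in reversed(range(1, len(match.groups()) + 1)):
    end = match.span(grp_num)[1] if match.span(grp_num)[1] > 0 else match.span(0)[1]
    print(text[match.span(0)[0]:end])  # DE89370400440532013000
    break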
def cleanup_report(report):
    # remove (Over) ... (Cont) inserts
    spans = []
    iterator = re.finditer(r'\(Over\)', report)
    for match_over in iterator:
        start = match_over.start()
        chunk = report[match_over.end():]
        match_cont = re.search(r'\(Cont\)', chunk)
        if match_cont:
            end = match_over.end() + match_cont.end()
            spans.append((start, end))
    report = erase_spans(report, spans)

    # insert a space between list numbers and subsequent text; makes
    # lists and start-of-sentence negations easier to identify
    prev_end = 0
    new_report = ''
    iterator = regex_list_start_no_space.finditer(report)
    for match in iterator:
        # end of the list number (digits followed by '.' or ')')
        end = match.end('listnum')
        # start of the following (concatenated) word
        start = match.start('word')
        new_report += report[prev_end:end]
        new_report += ' '
        prev_end = start
    new_report += report[prev_end:]
    report = new_report

    # remove numbering in lists
    spans = []
    iterator = regex_list_item.finditer(report)
    for match in iterator:
        start = match.start('listnum')
        end = match.end('listnum')
        spans.append((start, end))
    report = erase_spans(report, spans)

    # remove long runs of dashes, underscores, or stars
    report = sub1_regex.sub(' ', report)

    # convert unicode left and right quotation marks to ascii
    report = sub3_regex.sub("'", report)

    return report
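# The module-level patterns and erase_spans helper used by cleanup_report
# are defined elsewhere in the original source; these are hypothetical
# stand-ins that match the comments above, for illustration only.
import re

regex_list_start_no_space = re.compile(r'(?P<listnum>\d+[.)])(?P<word>[A-Za-z]+)')
regex_list_item = re.compile(r'^\s*(?P<listnum>\d+[.)]\s*)', re.MULTILINE)
sub1_regex = re.compile(r'[-_*]{3,}')       # long runs of dashes, underscores, stars
sub3_regex = re.compile(u'[\u2018\u2019]')  # unicode single quotation marks

def erase_spans(report, spans):
    # Remove the given (start, end) spans from the string, back to front
    # so earlier offsets stay valid.
    for start, end in reversed(spans):
        report = report[:start] + report[end:]
    return report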
def search_regex(pattern, filename):
    with open(filename) as file:
        line = file.readline()
        cnt = 1
        while line:
            is_match = False
            for m in re.finditer(pattern, line):
                print("{}:{}:{}".format(filename, cnt, '%02d:%s' % (m.start(), m.group(0))), end=" ")
                is_match = True
            if is_match:
                print()
            line = file.readline()
            cnt += 1
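# Hypothetical invocation: write a small sample file, then scan it.
with open("sample.txt", "w") as f:
    f.write("foo bar\nbar baz\n")
search_regex(r"ba.", "sample.txt")
# sample.txt:1:04:bar
# sample.txt:2:00:bar sample.txt:2:04:baz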
def run(self):
    metakeys = {}

    # Build server random <-> session id mapping from the PCAP.
    for row in self.results.get("network", {}).get("tls", []) or []:
        metakeys[row["server_random"]] = row["session_id"]

    results = {}
    dump_tls_log = os.path.join(self.analysis_path, "tlsdump", "tlsdump.log")
    if not os.path.exists(dump_tls_log):
        return results

    for entry in open(dump_tls_log, "r").readlines() or []:
        try:
            for m in re.finditer(
                r"client_random:\s*(?P<client_random>[a-f0-9]+)\s*,\s*server_random:\s*(?P<server_random>[a-f0-9]+)\s*,\s*master_secret:\s*(?P<master_secret>[a-f0-9]+)\s*",
                entry,
                re.I,
            ):
                try:
                    server_random = binascii.a2b_hex(m.group("server_random").strip())
                    master_secret = binascii.a2b_hex(m.group("master_secret").strip())
                    if server_random not in metakeys:
                        log.debug(
                            "Was unable to extract TLS master secret for server random %s, skipping it",
                            server_random)
                        continue
                    results[metakeys[server_random]] = master_secret
                except Exception as e:
                    log.warning("Problem dealing with tlsdump error: %s line: %s", e, m.group(0))
        except Exception as e:
            log.warning("Problem dealing with tlsdump error: %s line: %s", e, entry)

    if results:
        # Write the TLS master secrets file in the session key log format
        # understood by tools such as Wireshark; master_secret is raw
        # bytes, so hex-encode it for the text file.
        with open(self.tlsmaster_path, "w") as f:
            for session_id, master_secret in sorted(results.items()):
                f.write("RSA Session-ID:{} Master-Key:{}\n".format(
                    session_id, binascii.hexlify(master_secret).decode()))
def main():
    filename = os.path.abspath(sys.argv[1])
    pattern = sys.argv[2]
    with open(filename) as tmp:
        data = tmp.read()

    print("RE-2 :")
    print("search pattern: " + pattern)
    print("in file: " + filename)

    results = re2.finditer(pattern, data)
    for res in results:
        # getResMatch is defined elsewhere in the original source; it
        # presumably extracts the matched string from the result object.
        # The re-search below is timed for benchmarking; its result is
        # otherwise unused.
        start_time = time.time()
        match = re2.search(getResMatch(res), data)
        end_time = time.time()
        pos_str = '[' + str(res.span()[0]) + ',' + str(res.span()[1] - 1) + ']'
        print(getResMatch(res) + ', ' + pos_str + ', ' + str((end_time - start_time) * 1000))
def search_regex(pattern, filename):
    name_len = len(filename)
    with open(filename) as file:
        line = file.readline()
        cnt = 1
        while line:
            is_match = False
            # 26 characters of fixed message text, plus the filename and
            # the line number, precede the echoed line in the output.
            indent = 26 + name_len + len(str(cnt))
            new_line = [' '] * (len(line) + indent)
            for m in re.finditer(pattern, line):
                # Mark the matched span with carets under the echoed line.
                for index in range(indent + m.start(), indent + m.end()):
                    new_line[index] = '^'
                is_match = True
            if is_match:
                print("The file {} line number {} is {}".format(filename, cnt, line[:-1]))
                print("".join(new_line))
            line = file.readline()
            cnt += 1
def findSWF(d):
    # d = open file object for the file being scanned
    # Search for SWF header signatures (compressed CWS and uncompressed
    # FWS) anywhere in the file and return their byte offsets.
    return [tmp.start() for tmp in re.finditer(b'CWS|FWS', d.read())]
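# Illustrative call on an in-memory buffer; the bytes are made up.
import io
print(findSWF(io.BytesIO(b'xxFWSyyCWS')))  # [2, 7]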
def test_finditer(self):
    it = re.finditer(r":+", "a:b::c:::d")
    self.assertEqual([item.group(0) for item in it], [":", "::", ":::"])
def re2lib_experiment(regex, doc):
    # Count how many matches the re2 library produces over the document.
    n_outputs = 0
    for _ in re2.finditer(regex, doc):
        n_outputs += 1
    return n_outputs
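# Hypothetical sanity check, assuming the re2 module is installed and its
# finditer agrees with the stdlib's on this simple pattern.
import re
doc = "a1 b22 c333"
assert re2lib_experiment(r"\d+", doc) == sum(1 for _ in re.finditer(r"\d+", doc)) == 3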
def test_bug_817234(self):
    it = re.finditer(r".*", "asdf")
    self.assertEqual(next(it).span(), (0, 4))
    self.assertEqual(next(it).span(), (4, 4))
    self.assertRaises(StopIteration, next, it)
def find_text_to_remove(cls, s):
    # Pair every occurrence of the normalizing pattern with its replacement.
    return [(m, cls.normalizing_rep) for m in re.finditer(cls.normalizing_reg, s)]
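# Usage sketch with hypothetical stand-ins for the class attributes; the
# method returns (Match, replacement) pairs for each span to normalize.
import re

class DemoNormalizer:
    normalizing_reg = r'\s+'
    normalizing_rep = ' '

    @classmethod
    def find_text_to_remove(cls, s):
        return [(m, cls.normalizing_rep) for m in re.finditer(cls.normalizing_reg, s)]

for m, rep in DemoNormalizer.find_text_to_remove("a  b\tc"):
    print(m.span(), repr(rep))
# (1, 3) ' '
# (4, 5) ' '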