Example #1
import re

def processText(text):
    '''
    Strips some unwanted characters. Originally stripped the "references"
    section according to pubGeneric, but that wasn't working. Splits
    full-text strings with a simple sentence filter.
    '''
    # strip ^G and \r; the disabled variants below also stripped inline
    # citations and dropped the "ack"/"refs" sections via pubGeneric
    text = re.sub(r'\x07|\r', '', text)
    #text = re.sub(r'\x07|\r|[(\s{0,3}\d{1,3}\s{0,3})(,\s{0,3}\d{1,3}\s{0,3}){0,7}\]', '', text)
    #sections = pubGeneric.sectionRanges(text)
    #if sections is not None:
    #    try:
    #        dropRange = sections['ack']
    #        text = text[:dropRange[0]] + text[dropRange[1]:]
    #    except KeyError:
    #        pass
    #    try:
    #        dropRange = sections['refs']
    #        text = text[:dropRange[0]] + text[dropRange[1]:]
    #    except KeyError:
    #        pass

    # split at a period followed by a capital letter within the next
    # three characters
    previousThreshold = -2
    for match in re.finditer(r'\..?.?([A-Z])', text):
        threshold = match.start()
        yield text[previousThreshold + 2:threshold + 1]
        previousThreshold = threshold
    # yield the remainder after the last split point (the whole text
    # if no split point was found)
    yield text[previousThreshold + 2:]
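
A quick sanity check of the sentence splitter above (processText is a generator, so iterate it to see the pieces):

for sentence in processText("First sentence. Second one. Third."):
    print(repr(sentence))
# 'First sentence.'
# 'Second one.'
# 'Third.'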
Example #3
 def search(self, regex, flags=0, all=False):
     if all:
         result = dict()
         result["detail"] = []
         matches = []
         for map in self.address_space:
             for chunk in map["chunks"]:
                 self.dumpfile.seek(chunk["offset"])
                 match = re.finditer(regex, self.dumpfile.read(chunk["end"] - chunk["start"]), flags)
                 thismatch = []
                 try:
                     while True:
                         m = next(match)
                         thismatch.append(m)
                         matches.append(m.group(0))
                 except StopIteration:
                     pass
                 if thismatch:
                     result["detail"].append({"match": thismatch, "chunk": chunk})
         result["matches"] = matches
         return result
     else:
         for map in self.address_space:
             for chunk in map["chunks"]:
                 self.dumpfile.seek(chunk["offset"])
                 match = re.search(regex, self.dumpfile.read(chunk["end"] - chunk["start"]), flags)
                 if match:
                     result = dict()
                     result["match"] = match
                     result["chunk"] = chunk
                     return result
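
Since re.finditer returns an iterator, the while True / next() / StopIteration drain above is equivalent to a plain for loop; a minimal sketch:

import re

it = re.finditer(rb"\x00+", b"a\x00\x00b\x00c")
# draining the iterator with a for loop instead of next()/StopIteration
matches = [m.group(0) for m in it]
print(matches)  # [b'\x00\x00', b'\x00']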
Example #4
 def search(self, regex, flags=0, all=False):
     if all:
         result = dict()
         result["detail"] = []
         matches = []
         for map in self.address_space:
             for chunk in map["chunks"]:
                 self.dumpfile.seek(chunk["offset"])
                 match = re.finditer(regex, self.dumpfile.read(chunk["end"] - chunk["start"]), flags)
                 thismatch = []
                 try:
                     while True:
                         m = next(match)
                         thismatch.append(m)
                         matches.append(m.group(0))
                 except StopIteration:
                     pass
                 if thismatch:
                     result["detail"].append({"match": thismatch, "chunk": chunk})
         result["matches"] = matches
         return result
     else:
         for map in self.address_space:
             for chunk in map["chunks"]:
                 self.dumpfile.seek(chunk["offset"])
                 match = re.search(regex, self.dumpfile.read(chunk["end"] - chunk["start"]), flags)
                 if match:
                     result = dict()
                     result["match"] = match
                     result["chunk"] = chunk
                     return result
Example #5
    def predict_segment(self, corpus_segment):
        from data_utilities.util import get_mapping_after_normalization, convert_normalized_indices_to_unnormalized_indices

        norm_text = TextNormalizer.normalize_text(corpus_segment.language,
                                                  corpus_segment.text)
        mentions = []
        for match in re.finditer(self.named_entity_regex, norm_text):
            mentions += [
                Mention(match.start(1),
                        match.end(1),
                        match.group(1),
                        ref=corpus_segment.ref,
                        versionTitle=corpus_segment.versionTitle,
                        language=corpus_segment.language)
            ]
        mention_indices = [(mention.start, mention.end)
                           for mention in mentions]
        norm_map = get_mapping_after_normalization(
            corpus_segment.text, TextNormalizer.find_text_to_remove)
        mention_indices = convert_normalized_indices_to_unnormalized_indices(
            mention_indices, norm_map)
        for mention, (unnorm_start, unnorm_end) in zip(mentions,
                                                       mention_indices):
            mention.add_metadata(start=unnorm_start, end=unnorm_end)
        mentions = self.filter_already_found_mentions(mentions,
                                                      corpus_segment.text,
                                                      corpus_segment.language)
        return mentions
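
The interesting step above is remapping match offsets found in normalized text back to the raw text. A self-contained toy illustration of that idea (not the data_utilities implementation):

import re

def toy_offset_map(raw, to_remove=r"\s+"):
    # Build the normalized text plus, per normalized character, how many
    # characters were removed before it in the raw text.
    norm_chars, removed_before = [], []
    removed, last = 0, 0
    for m in re.finditer(to_remove, raw):
        for i in range(last, m.start()):
            norm_chars.append(raw[i])
            removed_before.append(removed)
        removed += m.end() - m.start()
        last = m.end()
    for i in range(last, len(raw)):
        norm_chars.append(raw[i])
        removed_before.append(removed)
    return "".join(norm_chars), removed_before

raw = "foo   bar"
norm, offsets = toy_offset_map(raw)
m = re.search("bar", norm)              # match in normalized text
start = m.start() + offsets[m.start()]  # map back to the raw text
print(raw[start:start + 3])             # bar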
Example #6
    def get_tlsmaster(self):
        """Obtain the client/server random to TLS master secrets mapping that we have obtained through dynamic analysis."""
        tlsmaster = {}
        dump_tls_log = os.path.join(self.analysis_path, "tlsdump",
                                    "tlsdump.log")
        if not os.path.exists(dump_tls_log):
            return tlsmaster

        for entry in open(dump_tls_log, "r").readlines() or []:
            try:
                for m in re.finditer(
                        r"client_random:\s*(?P<client_random>[a-f0-9]+)\s*,\s*server_random:\s*(?P<server_random>[a-f0-9]+)\s*,\s*(?P<master_secret>[a-f0-9]+)\s*",
                        entry,
                        re.I,
                ):
                    try:
                        client_random = binascii.a2b_hex(
                            m.group("client_random").strip())
                        server_random = binascii.a2b_hex(
                            m.group("server_random").strip())
                        master_secret = binascii.a2b_hex(
                            m.group("master_secret").strip())
                    except Exception as e:
                        log.warning(
                            "Problem dealing with tlsdump error:{0} line:{1}".
                            format(e, m.group(0)))
                        tlsmaster[client_random, server_random] = master_secret
            except Exception as e:
                log.warning(
                    "Problem dealing with tlsdump error:{0} line:{1}".format(
                        e, entry))

        return tlsmaster
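
The named groups keep the extraction readable; a minimal, self-contained version of the same pattern (the sample line is invented to match the regex shape):

import re
import binascii

line = "client_random: 0a0b, server_random: 0c0d, 0e0f"
pat = (r"client_random:\s*(?P<client_random>[a-f0-9]+)\s*,\s*"
       r"server_random:\s*(?P<server_random>[a-f0-9]+)\s*,\s*"
       r"(?P<master_secret>[a-f0-9]+)\s*")
for m in re.finditer(pat, line, re.I):
    print(binascii.a2b_hex(m.group("client_random")))  # b'\n\x0b'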
Example #7
    def test_bug_581080(self):
        it = re.finditer(r"\s", "a b")
        self.assertEqual(next(it).span(), (1, 2))
        self.assertRaises(StopIteration, next, it)

        scanner = re.compile(r"\s").scanner("a b")
        self.assertEqual(scanner.search().span(), (1, 2))
        self.assertEqual(scanner.search(), None)
Example #8
    def test_bug_581080(self):
        iter = re.finditer(r"\s", "a b")
        self.assertEqual(iter.next().span(), (1,2))
        self.assertRaises(StopIteration, iter.next)

        scanner = re.compile(r"\s").scanner("a b")
        self.assertEqual(scanner.search().span(), (1, 2))
        self.assertEqual(scanner.search(), None)
Example #9
    def __check_pattern(self, doc, results, field):
        """Check for specific pattern in text

        Args:
          doc: spacy document to analyze
          results: array containing the created results
          field: current field type (pattern)
        """

        max_matched_strength = -1.0
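        # assumption: field.patterns is sorted by decreasing strength,
        # so weaker patterns can be skipped once a stronger one has matched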
        for pattern in field.patterns:
            if pattern.strength <= max_matched_strength:
                break
            result_found = False

            match_start_time = datetime.datetime.now()
            matches = re.finditer(pattern.regex,
                                  doc.text,
                                  flags=re.IGNORECASE | re.DOTALL
                                  | re.MULTILINE)
            match_time = datetime.datetime.now() - match_start_time
            self.logger.debug('--- match_time[{}]: {}.{} seconds'.format(
                field.name, match_time.seconds, match_time.microseconds))

            for match in matches:
                start, end = match.span()
                field.text = doc.text[start:end]

                # Skip empty results
                if field.text == '':
                    continue

                # Don't add duplicate
                if len(field.patterns) > 1 and any(
                        ((x.location.start == start) or (x.location.end == end))
                        and (x.field.name == field.name) for x in results):
                    continue

                res = self.__create_result(doc, pattern.strength, field, start,
                                           end)

                if res is None or res.score == 0:
                    continue

                # Don't add overlap
                # if any(x.location.end >= start and x.score == 1.0
                #        for x in results):
                #     continue

                results.append(res)
                result_found = True

            if result_found:
                max_matched_strength = pattern.strength
Example #10
def splitGenbankAcc(acc):
    """ split a string like AY1234 into letter-number tuple, e.g. (AY, 1234)
    >>> splitGenbankAcc("AY1234")
    ('AY', 1234, 4)
    """
    matches = list(re.finditer(r"([A-Z]+)([0-9]+)", acc))
    # re2 has trouble with the .match function
    if len(matches)>1 or len(matches)==0:
        return None
    match = matches[0]
    letters, numbers = match.groups()
    return (letters, int(numbers), len(numbers))
Example #11
def splitGenbankAcc(acc):
    """ split a string like AY1234 into letter-number tuple, e.g. (AY, 1234)
    >>> splitGenbankAcc("AY1234")
    ('AY', '1234', 4)
    """
    matches = list(re.finditer(r"([A-Z]+)([0-9]+)", acc))
    # re2 has trouble with the .match function
    if len(matches) > 1 or len(matches) == 0:
        return None
    match = matches[0]
    letters, numbers = match.groups()
    return (letters, numbers, len(numbers))
Example #12
    def __analyze_patterns(self, text, flags=None):
        """
        Evaluates all patterns in the provided text, including words in
        the provided blacklist.

        :param text: text to analyze
        :param flags: regex flags
        :return: A list of RecognizerResult
        """
        flags = flags if flags else re.DOTALL | re.MULTILINE
        results = []
        for pattern in self.patterns:
            match_start_time = datetime.datetime.now()
            matches = re.finditer(pattern.regex, text, flags=flags)
            match_time = datetime.datetime.now() - match_start_time
            self.logger.debug('--- match_time[%s]: %s.%s seconds',
                              pattern.name, match_time.seconds,
                              match_time.microseconds)

            for match in matches:
                start, end = match.span()
                current_match = text[start:end]

                # Skip empty results
                if current_match == '':
                    continue

                score = pattern.score

                validation_result = self.validate_result(current_match)
                description = self.build_regex_explanation(
                    self.name, pattern.name, pattern.regex, score,
                    validation_result)
                pattern_result = RecognizerResult(self.supported_entities[0],
                                                  start, end, score,
                                                  description)

                if validation_result is not None:
                    if validation_result:
                        pattern_result.score = EntityRecognizer.MAX_SCORE
                    else:
                        pattern_result.score = EntityRecognizer.MIN_SCORE

                invalidation_result = self.invalidate_result(current_match)
                if invalidation_result is not None and invalidation_result:
                    pattern_result.score = EntityRecognizer.MIN_SCORE

                if pattern_result.score > EntityRecognizer.MIN_SCORE:
                    results.append(pattern_result)

        return results
Example #13
    def __analyze_patterns(self, text):
        """
        Evaluates all patterns in the provided text, including words in
        the provided blacklist.

        In a sentence we could get a false positive at the end of our
        regex, where we want to find the IBAN but not the false positive
        at the end of the match,

        e.g. "I want my deposit in DE89370400440532013000 2 days from today."

        :param text: text to analyze
        :return: A list of RecognizerResult
        """
        results = []
        for pattern in self.patterns:
            matches = re.finditer(pattern.regex, text, flags=self.flags)

            for match in matches:
                for grp_num in reversed(range(1, len(match.groups()) + 1)):
                    start = match.span(0)[0]
                    end = (match.span(grp_num)[1]
                           if match.span(grp_num)[1] > 0 else match.span(0)[1])
                    current_match = text[start:end]

                    # Skip empty results
                    if current_match == "":
                        continue

                    score = pattern.score

                    validation_result = self.validate_result(current_match)
                    description = PatternRecognizer.build_regex_explanation(
                        self.name, pattern.name, pattern.regex, score,
                        validation_result)
                    pattern_result = RecognizerResult(
                        self.supported_entities[0], start, end, score,
                        description)

                    if validation_result is not None:
                        if validation_result:
                            pattern_result.score = EntityRecognizer.MAX_SCORE
                        else:
                            pattern_result.score = EntityRecognizer.MIN_SCORE

                    if pattern_result.score > EntityRecognizer.MIN_SCORE:
                        results.append(pattern_result)
                        break

        return results
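
The group-boundary trick above ends the reported match at the last non-empty capture group, trimming trailing false positives; a toy demonstration (the regex is illustrative, not the recognizer's real IBAN pattern):

import re

pat = r"([A-Z]{2}\d{2}[A-Z0-9]{1,30})( \d)?"
text = "I want my deposit in DE89370400440532013000 2 days from today."
m = re.search(pat, text)
# span(0) covers the whole match including " 2"; span(1) stops at the IBAN
print(text[m.span(0)[0]:m.span(1)[1]])  # DE89370400440532013000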
Example #14
def cleanup_report(report):

	# remove (Over) ... (Cont) inserts
	spans = []
	iterator = re.finditer(r'\(Over\)', report)
	for match_over in iterator:
		start = match_over.start()
		chunk = report[match_over.end():]
		match_cont = re.search(r'\(Cont\)', chunk)
		if match_cont:
			end = match_over.end() + match_cont.end()
			spans.append((start, end))

	report = erase_spans(report, spans)

	# insert a space between list numbers and subsequent text, makes
	# lists and start-of-sentence negations easier to identify
	prev_end = 0
	new_report = ''
	iterator = regex_list_start_no_space.finditer(report)
	for match in iterator:
		# end of list num (digits followed by '.' or ')')
		end = match.end('listnum')
		# start of following (concatenated) word
		start = match.start('word')
		new_report += report[prev_end:end]
		new_report += ' '
		prev_end = start
	new_report += report[prev_end:]
	report = new_report

	# remove numbering in lists
	spans = []
	iterator = regex_list_item.finditer(report)
	for match in iterator:
		start = match.start('listnum')
		end   = match.end('listnum')
		spans.append((start, end))

	report = erase_spans(report, spans)

	# Remove long runs of dashes, underscores, or stars
	report = sub1_regex.sub(' ', report)

	# convert unicode left and right quotation marks to ascii
	report = sub3_regex.sub("'", report)
	
	return report
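
erase_spans is not shown in this snippet; one plausible implementation, assuming it simply deletes the collected, non-overlapping (start, end) spans:

def erase_spans(report, spans):
    # spans are assumed sorted and non-overlapping, as built above
    new_report = ''
    prev_end = 0
    for start, end in spans:
        new_report += report[prev_end:start]
        prev_end = end
    new_report += report[prev_end:]
    return new_report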
Example #15
def search_regex(pattern, filename):
    with open(filename) as file:
        # enumerate(file, 1) yields 1-based line numbers directly
        for cnt, line in enumerate(file, 1):
            is_match = False
            for m in re.finditer(pattern, line):
                print("{}:{}:{}".format(filename, cnt,
                                        '%02d:%s' % (m.start(), m.group(0))),
                      end=" ")
                is_match = True
            if is_match:
                print()
Example #16
    def run(self):
        metakeys = {}

        # Build server random <-> session id mapping from the PCAP.
        for row in self.results.get("network", {}).get("tls", []) or []:
            metakeys[row["server_random"]] = row["session_id"]

        results = {}
        dump_tls_log = os.path.join(self.analysis_path, "tlsdump",
                                    "tlsdump.log")
        if not os.path.exists(dump_tls_log):
            return results

        for entry in open(dump_tls_log, "r").readlines() or []:
            try:
                for m in re.finditer(
                        r"client_random:\s*(?P<client_random>[a-f0-9]+)\s*,\s*server_random:\s*(?P<server_random>[a-f0-9]+)\s*,\s*master_secret:\s*(?P<master_secret>[a-f0-9]+)\s*",
                        entry,
                        re.I,
                ):
                    try:
                        server_random = binascii.a2b_hex(
                            m.group("server_random").strip())
                        master_secret = binascii.a2b_hex(
                            m.group("master_secret").strip())
                        if server_random not in metakeys:
                            log.debug(
                                "Was unable to extract TLS master secret for server random %s, skipping it",
                                server_random)
                            continue
                        results[metakeys[server_random]] = master_secret
                    except Exception as e:
                        log.warning(
                            "Problem dealing with tlsdump error: %s line: %s",
                            e, m.group(0))
            except Exception as e:
                log.warning("Problem dealing with tlsdump error: %s line: %s",
                            e, entry)

        if results:
            # Write the TLS master secrets file.
            with open(self.tlsmaster_path, "w") as f:
                for session_id, master_secret in sorted(results.items()):
                    f.write(
                        "RSA Session-ID:{0} Master-Key:{1}\n".format(
                            session_id, master_secret))
Example #17
def main():
    filename = os.path.abspath(sys.argv[1])
    pattern = sys.argv[2]
    with open(filename) as tmp:
        data = tmp.read()
    print("RE-2 :")
    print("search pattern: " + pattern)
    print("in file: " + filename)

    results = re2.finditer(pattern, data)

    for res in results:
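        # time a fresh re2.search for this match's text (the result itself is unused)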
        start_time = time.time()
        match = re2.search(getResMatch(res), data)
        end_time = time.time()
        pos_str = '[' + str(res.span()[0]) + ',' + str(res.span()[1] - 1) + ']'
        print(
            getResMatch(res) + ', ' + pos_str + ', ' +
            str((end_time - start_time) * 1000))
Example #18
def search_regex(pattern, filename):
    name_len = len(filename)
    with open(filename) as file:
        line = file.readline()
        cnt = 1
        while line:
            is_match = False
            indent = 26 + name_len + len(str(cnt))
            new_line = [' '] * (len(line) + indent)  # caret markers, offset by the printed prefix width
            for m in re.finditer(pattern, line):
                for index in range(indent + m.start(), indent + m.end()):
                    new_line[index] = '^'
                is_match = True
            if is_match:
                print("The file {} line number {} is {}".format(filename,
                                                                cnt,
                                                                line[:-1]))
                print("".join(new_line))
            line = file.readline()
            cnt += 1
Example #19
def findSWF(d):
    # d = file object opened in binary mode
    # search its contents for SWF header signatures (CWS/FWS)
    return [tmp.start() for tmp in re.finditer(b'CWS|FWS', d.read())]
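
Hypothetical usage (sample.bin is a placeholder name):

with open("sample.bin", "rb") as fh:
    offsets = findSWF(fh)
print(offsets)  # byte offset of each CWS/FWS signature found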
Example #20
 def test_bug_817234(self):
     iter = re.finditer(r".*", "asdf")
     self.assertEqual(iter.next().span(), (0, 4))
     self.assertEqual(iter.next().span(), (4, 4))
     self.assertRaises(StopIteration, iter.next)
Example #21
 def test_finditer(self):
     iter = re.finditer(r":+", "a:b::c:::d")
     self.assertEqual([item.group(0) for item in iter],
                      [":", "::", ":::"])
Example #22
def re2lib_experiment(regex, doc):
    n_outputs = 0
    for _ in re2.finditer(regex, doc):
        n_outputs += 1

    return n_outputs
Example #23
 def test_bug_817234(self):
     it = re.finditer(r".*", "asdf")
     self.assertEqual(next(it).span(), (0, 4))
     self.assertEqual(next(it).span(), (4, 4))
     self.assertRaises(StopIteration, next, it)
Example #24
 def test_finditer(self):
     iter = re.finditer(r":+", "a:b::c:::d")
     self.assertEqual([item.group(0) for item in iter], [":", "::", ":::"])
Example #25
 def find_text_to_remove(cls, s):
     return [(m, cls.normalizing_rep)
             for m in re.finditer(cls.normalizing_reg, s)]
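
For context, a runnable toy around this helper; the normalizing_reg and normalizing_rep values here are assumptions, not the project's actual normalizer:

import re

class TextNormalizer:
    normalizing_reg = r"\s+"   # assumed: collapse runs of whitespace
    normalizing_rep = " "

    @classmethod
    def find_text_to_remove(cls, s):
        return [(m, cls.normalizing_rep)
                for m in re.finditer(cls.normalizing_reg, s)]

# applying the (match, replacement) pairs from the end keeps earlier offsets valid
s = "a  b   c"
for m, rep in reversed(TextNormalizer.find_text_to_remove(s)):
    s = s[:m.start()] + rep + s[m.end():]
print(s)  # a b c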