Esempio n. 1
0
 def match(self, text):
     """Return the set of XML-element matches found in ``text``.

     ``text`` is scanned twice: once with ``XML_PATTERN_RE`` and once with
     ``FUZZY_XML_PATTERN_RE``.  A hit is skipped when the anchor URL or
     anchor e-mail pattern accepts it; every remaining hit is turned into
     a match via ``create_match`` with the children produced by
     ``get_xml_pair``.

     :param text: the string to scan.
     :return: a ``set`` of the objects returned by ``create_match``.
     """
     found = set()

     # Each pass pairs an element pattern with the URL-anchor test applied
     # to its hits.
     # NOTE(review): the strict pass uses ``search`` while the fuzzy pass
     # uses ``match``; this asymmetry is preserved as-is -- confirm it is
     # intentional.
     passes = (
         (XML_PATTERN_RE, ANCHOR_URL_PATTERN_RE.search),
         (FUZZY_XML_PATTERN_RE, ANCHOR_URL_PATTERN_RE.match),
     )

     for element_re, url_test in passes:
         for hit in element_re.finditer(text):
             fragment = hit.group(0)
             if url_test(fragment) or \
                     ANCHOR_EMAIL_PATTERN_RE.match(fragment):
                 continue
             # The hit's start position is passed to get_xml_pair as the
             # offset of the fragment within ``text``.
             kids = get_xml_pair(fragment, hit.start(), self.priority)
             parent = (hit.start(), hit.end(), 'xml element',
                       self.priority)
             found.add(create_match(parent, kids))

     return found
Esempio n. 2
0
 def match(self, text):
     """Collect matches for the XML elements detected in ``text``.

     Hits of ``XML_PATTERN_RE`` are processed first, then hits of
     ``FUZZY_XML_PATTERN_RE``.  Hits accepted by the anchor URL pattern or
     the anchor e-mail pattern are ignored.  For every other hit,
     ``get_xml_pair`` supplies the children and ``create_match`` builds
     the object stored in the result.

     :param text: the string to scan.
     :return: a ``set`` of the objects returned by ``create_match``.
     """
     results = set()

     def scan(element_re, url_test):
         # Add one match to ``results`` per non-anchor hit of element_re.
         for found in element_re.finditer(text):
             snippet = found.group(0)
             if url_test(snippet):
                 continue
             if ANCHOR_EMAIL_PATTERN_RE.match(snippet):
                 continue
             begin = found.start()
             nested = get_xml_pair(snippet, begin, self.priority)
             span = (begin, found.end(), 'xml element', self.priority)
             results.add(create_match(span, nested))

     # NOTE(review): the URL-anchor test is ``search`` for the strict
     # pattern but ``match`` for the fuzzy one; kept exactly as found --
     # confirm this difference is intended.
     scan(XML_PATTERN_RE, ANCHOR_URL_PATTERN_RE.search)
     scan(FUZZY_XML_PATTERN_RE, ANCHOR_URL_PATTERN_RE.match)
     return results
Esempio n. 3
0
def is_xml_lines(lines):
    """Estimate whether a sequence of text lines is XML.

    Every non-blank line is classified as XML-like or not; the confidence
    is the fraction of non-blank lines classified as XML-like.  Blank
    (whitespace-only) lines are excluded from both counts.

    A line counts as XML-like when either:

    * it matches one of the strict patterns (``XML_STRICT_PATTERN1_RE``,
      ``XML_STRICT_PATTERN2_RE``, ``XML_STRICT_COMMENT_RE``) and neither
      anchor pattern is found in it; or
    * the input has more than one line and the line matches the strict
      opening/closing patterns, starts or ends an XML comment, or
      contains a strict attribute/value pair.

    :param lines: a sized sequence of strings (``len()`` is taken).
    :return: a tuple ``(is_xml, confidence)`` where ``is_xml`` is
        ``confidence >= THRESHOLD_XML`` and ``confidence`` is a float
        (``0.0`` when there are no non-blank lines).
    """
    xml_lines = 0
    empty_lines = 0
    confidence = 0.0
    lines_size = len(lines)

    for line in lines:
        stripped = line.strip()
        if not stripped:
            empty_lines += 1
        elif ((XML_STRICT_PATTERN1_RE.match(line) or
               XML_STRICT_PATTERN2_RE.match(line) or
               XML_STRICT_COMMENT_RE.match(line)) and
              not (ANCHOR_URL_PATTERN_RE.search(line) or
                   ANCHOR_EMAIL_PATTERN_RE.search(line))):
            xml_lines += 1
        elif lines_size > 1 and (
                XML_STRICT_OPENING_RE.match(line) or
                XML_STRICT_CLOSING_RE.match(line) or
                # An XML comment opens with '<!--'.  The original test
                # only checked the typo '<--', so the first line of a
                # multi-line comment was missed; '<--' is still accepted
                # for backward compatibility.
                stripped.startswith(('<!--', '<--')) or
                stripped.endswith('-->') or
                XML_ATTRIBUTE_VALUE_PAIR_STRICT_RE.search(line)):
            # These looser checks only apply to multi-line input.
            xml_lines += 1

    non_empty_lines = lines_size - empty_lines

    if non_empty_lines > 0:
        # float() keeps this a true division under Python 2 as well.
        confidence = float(xml_lines) / non_empty_lines

    # Log inputs that land within 0.10 below the threshold.  The range is
    # inclusive at 0, so an input exactly at the threshold is logged too.
    if -0.10 <= (confidence - THRESHOLD_XML) <= 0:
        logger.info('Almost reached XML threshold: {0}'.format(lines))

    return (confidence >= THRESHOLD_XML, confidence)
Esempio n. 4
0
def is_xml_lines(lines):
    """Estimate whether a sequence of text lines is XML.

    Every non-blank line is classified as XML-like or not; the confidence
    is the fraction of non-blank lines classified as XML-like.  Blank
    (whitespace-only) lines are excluded from both counts.

    A line counts as XML-like when either:

    * it matches one of the strict patterns (``XML_STRICT_PATTERN1_RE``,
      ``XML_STRICT_PATTERN2_RE``, ``XML_STRICT_COMMENT_RE``) and neither
      anchor pattern is found in it; or
    * the input has more than one line and the line matches the strict
      opening/closing patterns, starts or ends an XML comment, or
      contains a strict attribute/value pair.

    :param lines: a sized sequence of strings (``len()`` is taken).
    :return: a tuple ``(is_xml, confidence)`` where ``is_xml`` is
        ``confidence >= THRESHOLD_XML`` and ``confidence`` is a float
        (``0.0`` when there are no non-blank lines).
    """
    xml_lines = 0
    empty_lines = 0
    confidence = 0.0
    lines_size = len(lines)

    for line in lines:
        stripped = line.strip()
        if not stripped:
            empty_lines += 1
        elif ((XML_STRICT_PATTERN1_RE.match(line) or
               XML_STRICT_PATTERN2_RE.match(line) or
               XML_STRICT_COMMENT_RE.match(line)) and
              not (ANCHOR_URL_PATTERN_RE.search(line) or
                   ANCHOR_EMAIL_PATTERN_RE.search(line))):
            xml_lines += 1
        elif lines_size > 1 and (
                XML_STRICT_OPENING_RE.match(line) or
                XML_STRICT_CLOSING_RE.match(line) or
                # An XML comment opens with '<!--'.  The original test
                # only checked the typo '<--', so the first line of a
                # multi-line comment was missed; '<--' is still accepted
                # for backward compatibility.
                stripped.startswith(('<!--', '<--')) or
                stripped.endswith('-->') or
                XML_ATTRIBUTE_VALUE_PAIR_STRICT_RE.search(line)):
            # These looser checks only apply to multi-line input.
            xml_lines += 1

    non_empty_lines = lines_size - empty_lines

    if non_empty_lines > 0:
        # float() keeps this a true division under Python 2 as well.
        confidence = float(xml_lines) / non_empty_lines

    # Log inputs that land within 0.10 below the threshold.  The range is
    # inclusive at 0, so an input exactly at the threshold is logged too.
    if -0.10 <= (confidence - THRESHOLD_XML) <= 0:
        logger.info('Almost reached XML threshold: {0}'.format(lines))

    return (confidence >= THRESHOLD_XML, confidence)