Ejemplo n.º 1
0
 def hit_check(self, node):
     """
     Decide whether the node is a direct hit for a short description.

     The node is accepted when ``DataValidator.in_class_ids`` reports a
     match against ``ShortDescValidator.ids_and_classes``, or when its
     flattened text has at least 20 words.
     :param node: node offering a ``text`` attribute
     :return: bool : True for a hit, False otherwise
     """
     labelled = DataValidator.in_class_ids(
         node, ShortDescValidator.ids_and_classes)
     if labelled:
         return True
     # The text is only inspected when the label lookup found nothing.
     word_total = tp.number_of_words(DataValidator.flatten_text(node.text))
     return word_total >= 20
Ejemplo n.º 2
0
    def score_check(self, node):
        """
        Rate how strongly the node looks like a title.

        Six weighted conditions are evaluated; the return value is the sum
        of the satisfied weights divided by the sum of all weights.
        :param node: node offering ``text`` and ``has_tag``
        :return: float : score between 0.0 and 1.0
        """
        weights = [2, 3, 2, 2, 2, 1]
        max_value = sum(weights)
        # Flatten once; every text based condition works on the same string.
        text = DataValidator.flatten_text(node.text)
        score = 0

        # Condition 1: contains_more_lower_than(node, 0.5) is false
        # (presumably: the text is not dominated by lowercase words)
        if not DataValidator.contains_more_lower_than(node, 0.5):
            score += weights[0]

        # Condition 2: Contains a header tag
        header_tags = ('h', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')
        if any(node.has_tag(tag) for tag in header_tags):
            score += weights[1]

        # Condition 3: Contains at most 10 words
        if tp.number_of_words(text) <= 10:
            score += weights[2]
            # Condition 4: Contains the symbols - or :
            # (only rewarded for short texts, as in the original nesting)
            if re.search(r'[:\-]+', text):
                score += weights[3]

        # Condition 5: Contains a link node
        if node.has_tag('a'):
            score += weights[4]

        # Condition 6: Contains no numbers or slashes
        if re.search(r'[0-9\/]+', text) is None:
            score += weights[5]

        return float(score) / float(max_value)
Ejemplo n.º 3
0
    def score_check(self, node):
        """
        Rate how strongly the node looks like a date.

        Short texts (at most 15 words) earn a base weight and a bonus when
        they contain a date keyword; longer texts only earn a small weight
        for a keyword. A match of ``DateValidator.hit_labels`` adds a
        further weight.
        :param node: node offering a ``text`` attribute
        :return: float : achieved weight divided by the reachable maximum
        """
        keywords = ('am', 'vom', 'bis', 'zum', 'datum')
        weights = (2, 3, 1, 3)
        # The short-text bonus and the long-text weight are mutually
        # exclusive, hence the smaller one (1) is left out of the maximum.
        max_value = sum(weights) - 1

        is_short = tp.number_of_words(
            DataValidator.flatten_text(node.text)) <= 15
        lowered = DataValidator.flatten_text(node.text.lower())
        has_keyword = any(keyword in lowered for keyword in keywords)

        total = 0
        if is_short:
            # Condition 1: the text has at most 15 words
            total += weights[0]
            if has_keyword:
                # Condition 2: additionally contains a suitable word
                total += weights[1]
        elif has_keyword:
            # Condition 3: suitable word inside a longer text
            total += weights[2]

        # Condition 4: carries a suitable class / id label
        if DataValidator.in_class_ids(node, DateValidator.hit_labels):
            total += weights[3]

        return float(total) / float(max_value)
Ejemplo n.º 4
0
 def hit_check(self, node):
     """
     Decide whether the node is a direct hit for a location.

     The node is accepted when ``DataValidator.in_class_ids`` matches
     ``LocationValidator.ids_classes``, or when its text has at most 10
     words and starts with one of ``LocationValidator.key_words`` followed
     by whitespace or a colon and some further text.
     :param node: node offering a ``text`` attribute
     :return: bool : True for a hit, False otherwise
     """
     if DataValidator.in_class_ids(node, LocationValidator.ids_classes):
         return True

     text = DataValidator.flatten_text(node.text)
     if tp.number_of_words(text) > 10:
         return False

     # Lowercase once instead of once per keyword.
     lowered = text.lower()
     # NOTE(review): key words are interpolated into the pattern unescaped,
     # so regex metacharacters in them are interpreted as such.
     return any(
         re.match(r'(\s+|^)%s(\s+|:)\s*\S+' % word, lowered)
         for word in LocationValidator.key_words)
Ejemplo n.º 5
0
 def base_check(self, node):
     """
     Cheap pre-filter deciding whether the node is worth scoring.

     The node is rejected when its flattened text has more than 15 words,
     contains one of the characters ``/``, ``?`` or ``!``, or when
     ``DataValidator.contains_more_lower_than(node, 1/3)`` is true.
     :param node: node offering a ``text`` attribute
     :return: bool : True when the node passes all checks
     """
     text = DataValidator.flatten_text(node.text)
     if tp.number_of_words(text) > 15:
         return False
     # Raw string: '\/' is not a valid escape in a normal string literal.
     if re.search(r'[\/?!]', text):
         return False
     # Checking the cases
     return not DataValidator.contains_more_lower_than(node, (1.0 / 3.0))
Ejemplo n.º 6
0
    def hit_check(self, node):
        """
        Decide whether the node is a direct hit.

        A node is a hit when its flattened text contains no words at all or
        mentions a copyright marker. The keyword comparison is done on the
        lowercased text so that "Copyright" and "COPYRIGHT" are recognised
        as well.
        :param node: node offering a ``text`` attribute
        :return: bool : True for a hit, False otherwise
        """
        keywords = ('©', 'copyright')

        text = DataValidator.flatten_text(node.text)
        if tp.number_of_words(text) == 0:
            return True

        lowered = text.lower()
        return any(keyword in lowered for keyword in keywords)
Ejemplo n.º 7
0
 def base_check(self, node):
     """
     Cheap pre-filter deciding whether the node is worth scoring.

     The node passes when its flattened text has at most 15 words and it
     has at most one child.
     :param node: node offering ``text`` and ``get_children``
     :return: bool : True when the node passes both checks
     """
     word_total = tp.number_of_words(DataValidator.flatten_text(node.text))
     if word_total > 15:
         return False
     # Children are only looked at for sufficiently short texts.
     return len(node.get_children()) <= 1
Ejemplo n.º 8
0
    def score_check(self, node):
        """
        Rate how strongly the node's text resembles a postal address.

        Four equally weighted conditions are evaluated; the return value is
        the sum of the satisfied weights divided by the sum of all weights.
        :param node: node offering ``text`` and ``has_tag``
        :return: float : score between 0.0 and 1.0
        """
        weights = [2, 2, 2, 2]
        max_value = sum(weights)
        text = DataValidator.flatten_text(node.text)
        score = 0

        # Condition 1: has NO header tags (the weight rewards their absence)
        header_tags = ('header', 'h', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')
        if not any(node.has_tag(tag) for tag in header_tags):
            score += weights[0]

        # Condition 2: Contains address and house number
        # Regexp for searching for house numbers, allowed are e.g.:
        # Somestreet 12, Somestreet 12a
        # but not:
        # Somestreet 12sometextwithoutanywhitespace
        # NOTE(review): [a-z] does not cover umlauts or 'ß', so names such
        # as "Hauptstraße 12" are not recognised - confirm whether wanted.
        if re.search(
                r'([A-Z]{1}[a-z]+\s+[0-9]{1,3}\w?\s+|[A-Z]{1}[a-z]+\s+[0-9]{1,3}\w?$)',
                text) is not None:
            score += weights[1]

        # Condition 3: Contains postal code (Germany): five digits
        # surrounded by whitespace or placed at the end of the text
        if re.search(r'(\s+[0-9]{5}\s+|\s+[0-9]{5}$)', text) is not None:
            score += weights[2]

        # Condition 4: Contains less than 11 words
        if tp.number_of_words(text) <= 10:
            score += weights[3]

        return float(score) / float(max_value)
Ejemplo n.º 9
0
    def base_check(self, node):
        """
        Cheap pre-filter deciding whether the node may contain a date.

        The node passes when one of the following holds:
        - it has a ``class`` attribute and ``DataValidator.in_class_ids``
          matches ``DateValidator.hit_labels``
        - its text contains a numeric date like 21.07.2019 or 21.07
        - its text starts with a day followed by a German month name or
          abbreviation, like 21.August or 21 August
        :param node: node offering ``text`` and ``attributes``
        :return: bool : True when the node may contain a date
        """
        month_names = {
            'januar', 'jan',
            'februar', 'feb',
            'märz',
            'april', 'apr',
            'mai',
            'juni', 'jun',
            'juli', 'jul',
            'august', 'aug',
            'september', 'sep',
            'oktober', 'okt',
            'november', 'nov',
            'dezember', 'dez',
        }
        text = DataValidator.flatten_text(node.text)

        if 'class' in node.attributes and DataValidator.in_class_ids(
                node, DateValidator.hit_labels):
            return True

        # Regex searching for full dates like 21.07.2019 or 21.07
        if re.search(
                r'([0-3]?[0-9]{1})[.:\-\s]+(0[1-9]{1}|1[012]{1})([.:\-\s]+(2[0-9]{3}|[0-1]{1}[0-9]{1})|\s+|$)',
                text):
            return True

        # Regex searching for dates containing month names like 21.August
        # or 21 August. [^\W\d_] matches any letter including umlauts, so
        # that "März" is captured completely.
        # NOTE(review): the day pattern does not accept 10, 20 or one-digit
        # days; left unchanged because the matching extractor uses the same
        # day pattern.
        match_data = re.match(
            r'\s*([012]{1}[1-9]{1}|[3]{1}[01]{1})[.:\-\s]+([^\W\d_]+)', text)
        if match_data is None:
            return False
        # Group 2 is the word following the day; compare it as a whole.
        return match_data.group(2).lower() in month_names
Ejemplo n.º 10
0
    def extract(self, node):
        """
        Extract a start date from the node.

        Three sources are tried in order: a purely numeric date in the text
        (21.07.19 or 21.07), a day followed by a month name in the text
        (21. August 2019), and finally child nodes whose ``class``
        attribute contains 'day', 'month' or 'year'. A missing year is
        replaced by the current year, a two-digit year is prefixed with
        '20'.
        :param node: node offering ``text`` and ``get_children``
        :return: dict : {'startDate': '<year>-<month>-<day>'} or
                 {'startDate': None} when no usable date was found
        :raises errors.errors.NotFoundError: when a month name in the text
                 cannot be resolved by ``self.month_lookup``
        """
        text = DataValidator.flatten_text(node.text)
        day = None
        month = None
        year_text = None

        # Finds date, which contains only numbers, like 21.07.19 or 21.07
        reg_number_date = (
            r'(\s+|^)([0-3]?[0-9]{1})[.:\-\s]+(0[1-9]{1}|1[012])'
            r'([.:\-\s]+(2[0-9]{3}|[0-2]{1}[0-9]{1})|\s+|$)')
        # Day, month name and the (possibly empty) remainder. The remainder
        # is optional so that a month name at the very end of the text is
        # not cut short; [^\W\d_] also accepts umlauts ("März").
        reg_full_month = (
            r'(\s+|^)([012]{1}[1-9]{1}|[3]{1}[01]{1})[.:\-\s]+'
            r'([^\W\d_]+)(.*)')
        # Year directly following the month name
        reg_year_suffix = r'^[\s:.\-]+(2[0-9]{3}|[0-1]{1}[0-9]{1})(\s+|$)'

        number_match = re.search(reg_number_date, text)
        month_match = None
        if number_match is None:
            month_match = re.search(reg_full_month, text)

        if number_match is not None:
            day = number_match.group(2)
            month = number_match.group(3)
            # None when the text carries no year (e.g. "21.07")
            year_text = number_match.group(5)
        elif month_match is not None:
            day = month_match.group(2)
            month = self.month_lookup(month_match.group(3))
            if month is None:
                raise errors.errors.NotFoundError(
                    'Month was not found while date extraction')
            year_match = re.search(reg_year_suffix, month_match.group(4))
            if year_match is not None:
                year_text = year_match.group(1)
        else:
            for child in node.get_children():
                if 'class' not in child.attributes:
                    continue
                css_class = child.attributes['class']
                if 'day' in css_class:
                    day = child.text.strip()
                if 'month' in css_class:
                    month = self.month_lookup(child.text.strip())
                if 'year' in css_class:
                    # Routed through the normalisation below instead of
                    # being overwritten by the current year.
                    year_text = child.text.strip()

        if not year_text:
            year = datetime.datetime.now().year
        elif len(year_text) == 2:
            year = '20' + year_text
        elif len(year_text) == 4:
            year = year_text
        else:
            return {'startDate': None}

        # Without day and month there is no date to report.
        if not day or month is None:
            return {'startDate': None}

        return {'startDate': "%s-%s-%s" % (year, month, day)}
Ejemplo n.º 11
0
    def extract(self, node):
        """
        Extract the title text from the node.

        When the node has no TITLE entry in its label hits but is a hit for
        DATE or TIME, nothing is extracted.
        :param node: node offering ``text`` and ``data_container``
        :return: dict : {'name': <cleaned text>} or None
        """
        label_hits = node.data_container['label']['hits']
        if DataLabel.TITLE not in label_hits:
            competing_labels = (DataLabel.DATE, DataLabel.TIME)
            if any(DataExtractor.is_hit(node, label)
                   for label in competing_labels):
                return None

        flattened = DataValidator.flatten_text(node.text)
        cleaned = DataExtractor.remove_extra_whitespaces(flattened)
        return {'name': cleaned.strip()}
Ejemplo n.º 12
0
 def base_check(self, node):
     """
     Cheap pre-filter deciding whether the node may contain a time.

     The node passes when its flattened text contains a clock time such as
     19:15 delimited by whitespace or the text borders, or a number
     followed by whitespace and the word "uhr" (case-insensitive).
     :param node: node offering a ``text`` attribute
     :return: bool : True when the node may contain a time
     """
     text = DataValidator.flatten_text(node.text)
     # Raw strings: '\s' is not a valid escape in a normal string literal.
     clock_time = re.search(
         r'(\s+|^)(([0-1]{1}[0-9]{1}|2[0-4]{1})[:]+([0-5]{1}[0-9]{1}))(\s+|$)',
         text)
     hour_word = re.search(r'([0-9]{1}|[0-5]{1}[0-9]{1})\s+uhr',
                           text.lower())
     return clock_time is not None or hour_word is not None
Ejemplo n.º 13
0
    def extract(self, node):
        """
        Extract the short description text from the node.

        When the node has no SHORT_DESC entry in its label hits but is a
        hit for DATE, TIME or TITLE, nothing is extracted.
        :param node: node offering ``text`` and ``data_container``
        :return: dict : {'disambiguatingDescription': <cleaned text>}
                 or None
        """
        label_hits = node.data_container['label']['hits']
        if DataLabel.SHORT_DESC not in label_hits:
            competing_labels = (
                DataLabel.DATE, DataLabel.TIME, DataLabel.TITLE)
            if any(DataExtractor.is_hit(node, label)
                   for label in competing_labels):
                return None

        flattened = DataValidator.flatten_text(node.text)
        description = self.remove_extra_whitespaces(flattened).strip()
        return {'disambiguatingDescription': description}
Ejemplo n.º 14
0
    def extract(self, node):
        """
        Extract the location text from the node.

        When the node has no LOCATION entry in its label hits but is a hit
        for DATE, TIME or TITLE, nothing is extracted. Everything up to and
        including the last colon of the cleaned text is dropped.
        :param node: node offering ``text`` and ``data_container``
        :return: dict : {'location': <cleaned text>} or None
        """
        label_hits = node.data_container['label']['hits']
        # The other labels only matter when the node is no location hit.
        if DataLabel.LOCATION not in label_hits:
            competing_labels = (
                DataLabel.DATE, DataLabel.TIME, DataLabel.TITLE)
            if any(de.DataExtractor.is_hit(node, label)
                   for label in competing_labels):
                return None

        flattened = DataValidator.flatten_text(node.text)
        collapsed = self.remove_extra_whitespaces(flattened)
        # Greedy match: strips a leading "<something>:" prefix
        without_prefix = re.sub('.+:', '', collapsed)
        return {'location': without_prefix.strip()}
Ejemplo n.º 15
0
    def score_check(self, node):
        """
        Rate how strongly the node looks like a short description.

        Five weighted conditions are evaluated; the return value is the sum
        of the satisfied weights divided by the sum of all weights, so a
        node fulfilling every condition scores 1.0.
        :param node: node offering ``text`` and ``has_tag``
        :return: float : score between 0.0 and 1.0
        """
        weights = [2, 1, 3, 2, 2]
        # Sum (not product) of the weights, so the score is normalised.
        max_value = sum(weights)
        text = DataValidator.flatten_text(node.text)
        score = 0

        # Condition 1: at least 15 words
        if tp.number_of_words(text) >= 15:
            score += weights[0]

        # Condition 2: Contains quotation marks (anywhere in the text)
        if re.search(r'".+"', text) is not None:
            score += weights[1]

        # Condition 3: Does not contain any headers
        # NOTE(review): 'stronger' is not an HTML tag; possibly 'strong'
        # was meant - confirm before changing.
        header_tags = (
            'stronger', 'header', 'h', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')
        if not any(node.has_tag(tag) for tag in header_tags):
            score += weights[2]

        # Condition 4: Contains more lowercase words than uppercase
        if DataValidator.contains_more_lower_than(node):
            score += weights[3]

        # Condition 5: Contains full stops, exclamation signs,
        #              questions signs, semicolons, colons
        #              followed by whitespace or placed at the end
        if re.search(r'[.:;?!]\s+|[.:;?!]$', text):
            score += weights[4]

        # Returning score
        return float(score) / float(max_value)
Ejemplo n.º 16
0
    def score_check(self, node):
        """
        Rate how strongly the node looks like a time.

        Short texts (at most 15 words) earn ``weights[1]`` plus
        ``weights[2]`` when they contain a time keyword; longer texts earn
        ``weights[0]`` for a keyword.
        :param node: node offering a ``text`` attribute
        :return: float : achieved weight divided by ``max_value``
        """
        keywords = ('von', 'bis', 'ab', 'uhrzeit', 'um')
        weights = (2, 3, 1)
        # 1 gets subtracted because the two keyword cases exclude each other
        # NOTE(review): with the indices used below the highest reachable
        # score is weights[1] + weights[2] = 4 while max_value is 5, so the
        # result never reaches 1.0 - confirm the intended weight order.
        max_value = sum(weights) - 1

        is_short = tp.number_of_words(
            DataValidator.flatten_text(node.text)) <= 15
        lowered = DataValidator.flatten_text(node.text.lower())
        has_keyword = any(keyword in lowered for keyword in keywords)

        if is_short:
            # Condition 1: contains less than 16 words
            points = weights[1]
            if has_keyword:
                # Condition 2: short text that also contains a keyword
                points += weights[2]
        elif has_keyword:
            # Condition 3: contains suitable words
            points = weights[0]
        else:
            points = 0

        return float(points) / float(max_value)
Ejemplo n.º 17
0
    def base_check(self, node):
        """
        Cheap pre-filter deciding whether the node is worth scoring.

        The node is rejected when its flattened text has fewer than 3 words
        or when any child reports a 'div', 'span' or 'td' tag. Otherwise it
        passes when its own type is 'span', 'div' or 'td', or when the text
        has at least 15 words.
        :param node: node offering ``text``, ``type`` and ``get_children``
        :return: bool : True when the node passes
        """
        container_tags = ('div', 'span', 'td')
        word_total = tp.number_of_words(DataValidator.flatten_text(node.text))
        if word_total < 3:
            return False

        for child in node.get_children():
            if any(child.has_tag(tag) for tag in container_tags):
                return False

        return node.type in ('span', 'div', 'td') or word_total >= 15
Ejemplo n.º 18
0
 def hit_check(self, node):
     """
     Decide whether the node is a direct hit for a time.

     A node of type 'time' is always a hit; any other node is a hit when
     ``DataValidator.in_class_ids`` matches ``TimeValidator.hit_labels``.
     :param node: node offering a ``type`` attribute
     :return: True for 'time' nodes, otherwise the result of the lookup
     """
     if node.type == 'time':
         return True
     label_match = DataValidator.in_class_ids(node, TimeValidator.hit_labels)
     return label_match
Ejemplo n.º 19
0
    def extract(self, node):
        """
        Actual extraction function

        Four patterns are tried in order and the first one that matches
        decides the result:
        1. a full clock time like 19:15, 12:00, 09:23
        2. an hour range like "12 - 19 uhr" (the first hour is used)
        3. an hour followed by "uhr", optionally with ":MM" minutes
        4. a loose "H.MM" time, accepted only when one of the up to three
           words before it starts with "von", "ab" or "um"; the first such
           time in the text wins
        :param HTMLNode node: the node where to extract from
        :return: dict : {'doorTime': 'HH:MM'} or {'doorTime': None} when no
                 time was found
        """
        text = DataValidator.flatten_text(node.text)
        lowered = text.lower()
        hour = None
        minutes = None

        # Those regex are very strict, no checks needed when extracting
        # Matches strings like 19:15, 12:00, 09:23
        reg_full_time = r'(\s+|^)(([0-1]?[0-9]{1}|2[0-4]{1})[:]+([0-5]{1}[0-9]{1}))(\s+|$)'
        # Matches strings like 18:12 uhr
        reg_part_time = r'(\s+|^)(0?[0-9]{1}|1[0-9]{1}|2[0-4]{1})([:](0[0-9]{1}|[1-5]{1}[0-9]{1})|\s*)' \
                        r'\s+uhr'
        # Matches strings like 12 - 19 uhr
        re_part_time_hours = r'(\s+|^)([01]{1}[0-9]{1}|2[0-4]{1})\s*([-:]{1}|bis)\s*' \
                             r'([01]{1}[0-9]{1}|2[0-4]{1})\s+uhr'
        # This regex is much more loose and needs additional checks
        reg_loose = r'(\s+|^)((0?[0-9]{1}|1[0-9]{1}|2[0-4]{1})\.([0-5]{1}[0-9]{1}))(-|\s+|$)'
        # Words that may introduce a loosely written time
        leading_word = re.compile(r'^(?:von|ab|um)', flags=re.IGNORECASE)

        full_match = re.search(reg_full_time, text)
        range_match = re.search(re_part_time_hours, lowered)
        part_match = re.search(reg_part_time, lowered)

        if full_match is not None:
            hour = full_match.group(3)
            minutes = full_match.group(4)
        elif range_match is not None:
            hour = range_match.group(2)
            minutes = '00'
        elif part_match is not None:
            hour = part_match.group(2)
            # Group 4 holds the digits after the colon, if there were any.
            minutes = part_match.group(4) or '00'
        else:
            for loose_match in re.finditer(reg_loose, text):
                # Slicing by position works for a time at the very start of
                # the text and needs no regex built from the matched time.
                words_before = text[:loose_match.start(2)].split()
                # checking up to 3 words before the time
                if any(leading_word.search(word)
                       for word in words_before[-3:]):
                    hour = loose_match.group(3)
                    minutes = loose_match.group(4)
                    break

        if hour is None:
            return {'doorTime': None}

        # Adding a leading zero if needed
        return {'doorTime': "%s:%s" % (hour.zfill(2), minutes)}
Ejemplo n.º 20
0
 def hit_check(self, node):
     """
     Decide whether the node is a direct hit for a title.

     :param node: node handed to ``DataValidator.in_class_ids``
     :return: result of matching the node against
              ``TitleValidator.hit_labels``
     """
     label_match = DataValidator.in_class_ids(node, TitleValidator.hit_labels)
     return label_match