Beispiel #1
0
    def detect(self, start_position, end_position):
        """Find the end of the subject context via upper-case headings.

        The line obtained for ``start_position`` must look like an upper-case
        heading: its first run of capitals/spaces has to cover more than half
        of the line. The end is then the first later candidate, before
        ``end_position``, whose own line passes the same check.

        Args:
            start_position (int): raw start position of the context
            end_position (int): raw end position of the context

        Returns:
            int: position of the identified subject context end, or None
            when the start line is not a heading or no later heading is found
        """
        # Capital letters (Czech diacritics included) or spaces, 4+ in a row.
        heading_regex = '[A-ZĚŠČŘŽÝÁÍÉÚŮŇŤÓĎ ]{4,}'

        def is_heading(line):
            # Only the first capital run on the line is measured.
            found = re.search(heading_regex, line)
            if found is None:
                return False
            return len(found.group(0)) / len(line) > 0.5

        first_line = get_current_line(self._text, start_position)
        if not is_heading(first_line):
            return None

        # Continue the search behind the heading the context starts with.
        search_from = start_position + len(first_line)
        candidates = find_all_occurrences_in_string(
            heading_regex, self._text[search_from:end_position])
        for offset in candidates:
            # Offsets are relative to the searched slice; make them absolute.
            position = search_from + offset
            if is_heading(get_current_line(self._text, position)):
                self._end_position = position
                return self._end_position
        return None
Beispiel #2
0
    def detect(self, start_position, end_position):
        """Find the end of the subject context for a 'název' heading.

        Applies only when the keyword 'název' is found in the window of 50
        characters on either side of ``start_position``. The end is then the
        first line break between ``start_position`` and ``end_position``
        whose line, as returned by ``get_current_line``, is longer than 30
        characters.

        Args:
            start_position (int): raw start position of the context
            end_position (int): raw end position of the context

        Returns:
            int: position of the identified subject context end, or None
            when the keyword is absent or no line break qualifies
        """
        # Title-only heading ("název" is Czech for "name"/"title").
        name_keyword = 'název'
        window_from = max(start_position - 50, 0)
        window_to = start_position + 50  # slicing clamps at the text end
        keyword_hits = find_all_occurrences_in_string(
            name_keyword, self._text[window_from:window_to])
        if len(keyword_hits) == 0:
            return None

        line_breaks = find_all_occurrences_in_string(
            '\n', self._text[start_position:end_position])
        for offset in line_breaks:
            # Offsets are relative to the searched slice; make them absolute.
            position = start_position + offset
            if len(get_current_line(self._text, position)) > 30:
                self._end_position = position
                return self._end_position

        return None
Beispiel #3
0
    def detect(self, start_position, end_position):
        """Find the end of the subject context using 'článek' headings.

        Applies only when the keyword 'článek' is found in the 50 characters
        preceding ``start_position``. The end is then the first occurrence of
        the keyword between ``start_position`` and ``end_position`` whose
        line, as returned by ``get_current_line``, is shorter than 50
        characters.

        Args:
            start_position (int): raw start position of the context
            end_position (int): raw end position of the context

        Returns:
            int: position of the identified subject context end, or None
            when the keyword does not precede the start or no later
            occurrence qualifies
        """
        # Article header ("článek" is Czech for "article").
        keyword = 'článek'
        preceding_text = self._text[max(start_position - 50, 0):start_position]
        if len(find_all_occurrences_in_string(keyword, preceding_text)) == 0:
            return None

        following_text = self._text[start_position:end_position]
        for offset in find_all_occurrences_in_string(keyword, following_text):
            # Offsets are relative to the searched slice; make them absolute.
            position = start_position + offset
            if len(get_current_line(self._text, position)) < 50:
                self._end_position = position
                return self._end_position
        return None
Beispiel #4
0
    def detect(self, start_position, end_position):
        """Detects the roman numbering of sections to identify the end.

        Applies only when a roman numeral (built from X, V and I, surrounded
        by whitespace / non-word characters) is found in the 50 characters
        preceding ``start_position``. The end is then the first such numeral
        between ``start_position`` and ``end_position`` whose line, as
        returned by ``get_current_line``, is shorter than 50 characters.

        Args:
            start_position (int): raw start position of the context
            end_position (int): raw end position of the context

        Returns:
            int: position of the identified subject context end, or None
            when no roman numeral precedes the start or no later numeral
            qualifies
        """
        # Roman numbering. The pattern is a raw string because '\s' and '\W'
        # are regex escapes, not Python string escapes; a plain literal
        # raises an invalid-escape warning on current Python versions.
        roman_numeral_pattern = r'\s(?=[XVI])(X{0,3})(I[XV]|V?I{0,3})[\s\W]+'
        subj_prefix = self._text[max(start_position - 50, 0):start_position]

        # NOTE(review): lower=False presumably keeps the search
        # case-sensitive - confirm against find_all_occurrences_in_string.
        start_occurrences = find_all_occurrences_in_string(
            roman_numeral_pattern, subj_prefix, lower=False)
        if len(start_occurrences) == 0:
            return None

        end_occurrences = find_all_occurrences_in_string(
            roman_numeral_pattern,
            self._text[start_position:end_position],
            lower=False)
        for occ in end_occurrences:
            # Offsets are relative to the searched slice; make them absolute.
            position = start_position + occ
            # NOTE(review): the occurrence presumably points at the leading
            # whitespace of the match, hence the line lookup one character
            # later - confirm against find_all_occurrences_in_string.
            if len(get_current_line(self._text, position + 1)) < 50:
                self._end_position = position
                return self._end_position
        return None
Beispiel #5
0
    def get_all_occurrences(self, text):
        """Collect every keyword occurrence in ``text`` together with a rating.

        Each keyword of ``self._keywords`` contributes its stored weight as
        the base rating. For every occurrence a coefficient starting at 1 is
        accumulated from local characteristics of the surrounding text, and
        the final rating is the base rating multiplied by that coefficient.

        Args:
            text (str): text to find the keywords in

        Returns:
            list: one dictionary per occurrence with the keys ``'keyword'``
            (the keyword in the casing that matched, lower case by default),
            ``'rat'`` (the rating) and ``'occ'`` (the occurrence position)
        """
        found = []
        for keyword, base_rating in self._keywords.items():
            keyword_lower = keyword.lower()
            keyword_upper = keyword.upper()
            keyword_len = len(keyword)
            for position in find_all_occurrences_in_string(keyword, text):
                coefficient = 1
                label = keyword_lower
                line = get_current_line(text, position)
                # Slices past the end of the text are clamped by Python.
                matched_text = text[position:position + keyword_len]
                before = text[max(position - 20, 0):position]

                # The keyword makes up the whole line.
                if line.lower() == keyword_lower:
                    coefficient += 2
                # Occurrence is spelled exactly like the keyword.
                if matched_text == keyword:
                    coefficient += 1.5
                    label = keyword
                # Occurrence is the keyword in upper case.
                if matched_text == keyword_upper:
                    coefficient += 1.5
                    label = keyword_upper
                # Line break shortly after the keyword (chapter title).
                if '\n' in text[position:position + keyword_len * 3]:
                    coefficient += 2
                # Line break followed by a digit shortly before the keyword
                # (chapter numbering).
                if re.search(r"\n[ ]*[0-9]", before):
                    coefficient += 2
                # Verb ' je ' ("is") shortly after the keyword (sentence
                # describing the subject).
                if ' je ' in text[position:position + keyword_len * 2]:
                    coefficient += 2
                # Word 'článek' ("article") shortly before the keyword
                # (chapter header).
                if 'článek' in before.lower():
                    coefficient += 2
                # More than one 'I' shortly before the keyword (roman
                # chapter numbering).
                if before.count('I') > 1:
                    coefficient += 2
                # Bonus computed from the text 50-100 characters further on.
                coefficient += chars_occurrence_ratio(
                    text[position + 50:position + 100])
                # Noun 'Zboží' ("goods") within 150 characters halves the
                # coefficient.
                if 'Zboží' in text[position:position + 150]:
                    coefficient *= 0.5

                found.append({
                    'keyword': label,
                    'rat': base_rating * coefficient,
                    'occ': position,
                })
        return found
Beispiel #6
0
    def detect(self, start_position, end_position):
        """Detects the numbering of sections to identify the end.

        Applies only when a numbered line start (line break, optional
        blanks, digits, then a character other than '/') is found in the 50
        characters preceding ``start_position``. The number found there is
        taken as the current section number. The end is the first numbered
        line start between ``start_position`` and ``end_position`` that

        * carries a number 1 or 2 higher than the current section number,
        * sits on a line (per ``get_current_line``) shorter than 50
          characters, and
        * has at most 5 digit occurrences on that line.

        Args:
            start_position (int): raw start position of the context
            end_position (int): raw end position of the context

        Returns:
            int: position of the identified subject context end, or None
            when no numbering precedes the start or no later number qualifies
        """
        # Classic numbering: "\n<num>. word". Raw strings keep '\d' from
        # being read as an (invalid) Python string escape; '\n' and '\t' are
        # resolved by the regex engine instead.
        numeral_pattern = r'\n[ \t]*[\d]+[^/]'
        subj_prefix = self._text[max(start_position - 50, 0):start_position]
        start_occurrences = find_all_occurrences_in_string(
            numeral_pattern, subj_prefix, lower=False)
        if len(start_occurrences) == 0:
            return None

        # Reverse the prefix and search it with the mirrored pattern, so the
        # number that follows a line break is found scanning backwards from
        # the start position; the digits are flipped back before conversion.
        reversed_hit = re.search(r'[^/][\d]+[ \t]*\n', subj_prefix[::-1])
        if reversed_hit is None:
            # Defensive: not expected when the helper reported a match above,
            # but avoids an AttributeError should the two searches disagree.
            return None
        reversed_digits = re.search(r'[\d]+', reversed_hit.group(0)).group(0)
        article_num = int(reversed_digits[::-1])

        end_occurrences = find_all_occurrences_in_string(
            numeral_pattern,
            self._text[start_position:end_position],
            lower=False)
        for occ in end_occurrences:
            # Offsets are relative to the searched slice; make them absolute.
            position = start_position + occ
            number_match = re.search(r'[\d]+',
                                     self._text[position:position + 10])
            if number_match is None:
                continue
            num = int(number_match.group(0))
            # Accept only the next section number or the one after it.
            if not article_num < num < article_num + 3:
                continue
            # NOTE(review): the occurrence presumably points at the line
            # break of the match, hence the lookup one character later -
            # confirm against find_all_occurrences_in_string.
            current_line = get_current_line(self._text, position + 1)
            if len(current_line) >= 50:
                continue
            num_numbers_in_line = len(
                find_all_occurrences_in_string(r'\d', current_line))
            if num_numbers_in_line <= 5:
                self._end_position = position
                return self._end_position
        return None
Beispiel #7
0
    def detect(self, start_position, end_position):
        """Searches for special keywords to identify the end.

        The keywords 'Cena', 'Doba' and 'Místo' (Czech for "price",
        "period" and "place") are tried in that order. For each keyword the
        occurrences between ``start_position`` and ``end_position`` are
        checked, and the first one whose line, as returned by
        ``get_current_line``, is shorter than 50 characters is stored in
        ``self._end_position`` and returned.

        Args:
            start_position (int): raw start position of the context
            end_position (int): raw end position of the context

        Returns:
            int: position of the identified subject context end; when no
            occurrence qualifies the loops below finish without returning a
            position
        """
        # Keywords are tried in list order, so a qualifying occurrence of an
        # earlier keyword wins over any occurrence of a later keyword.
        end_words = ['Cena', 'Doba', 'Místo']
        for word in end_words:
            end_occurrences = find_all_occurrences_in_string(
                word, self._text[start_position:end_position])
            if len(end_occurrences) > 0:
                for occ in end_occurrences:
                    # Offsets are relative to the searched slice; make them
                    # absolute positions in the full text.
                    occ = start_position + occ
                    if len(get_current_line(self._text, occ)) < 50:
                        self._end_position = occ
                        return self._end_position