Beispiel #1
0
    def process(self, stream: t.Iterator[lark.Token]) -> t.Iterator[Token]:
        """Rewrite spelled-out numbers and Roman numerals in a token stream.

        Consecutive tokens that ``self._english_parser.parse`` accepts as a
        single number are collapsed into one ``UNSIGNED_INTEGER`` (when the
        parsed value is an ``int``) or ``UNSIGNED_REAL`` token that borrows
        its position from the first token of the run.  Any other token that
        is at least two characters long and that ``numeral.roman2int`` can
        decode is replaced by an ``INTEGER`` token.  Every remaining token
        is passed through unchanged.

        Args:
            stream: The tokens to rewrite.

        Yields:
            The rewritten tokens, in stream order.
        """
        def build_number_token(tokens):
            # Turn an accepted run of number words into one numeric token.
            value = self._english_parser.parse(tokens)
            kind = ("UNSIGNED_INTEGER"
                    if isinstance(value, int) else "UNSIGNED_REAL")
            return Token.new_borrow_pos(kind, str(value), tokens[0])

        number_tokens: t.List[lark.Token] = []

        for token in stream:
            try:
                # Probe whether the run is still a valid number once this
                # token is added; the parser signals "no" with ValueError.
                self._english_parser.parse(number_tokens + [token])
            except ValueError:
                if number_tokens:
                    yield build_number_token(number_tokens)
                    number_tokens = []
                # NOTE(review): the token that ended the run is not retried
                # as the start of a new run; it falls through to the
                # Roman-numeral check below. Confirm that this is intended.
            else:
                number_tokens.append(token)
                continue

            # Roman numerals must be at least 2 characters long,
            # otherwise we can't use 'i' or 'x'...
            if len(str(token).strip()) >= 2:
                try:
                    roman_value = numeral.roman2int(token)
                except (ValueError, NotImplementedError):
                    pass
                else:
                    yield Token.new_borrow_pos(
                        "INTEGER", str(roman_value), token)
                    continue
            yield token

        # A run of number words that reaches the end of the stream has no
        # following token to trigger the flush above, so emit it here
        # instead of dropping it.
        if number_tokens:
            yield build_number_token(number_tokens)
Beispiel #2
0
 def parse_roman_num(s):
     """Return the decimal string for a Roman numeral, or the input as text.

     The argument is converted with ``str``.  A value that is exactly the
     lowercase letter ``'l'`` is treated as the numeral ``'I'``.  If the
     text consists only of the characters ``X``, ``I`` and ``V`` it is
     decoded with ``roman2int`` and returned as a string; any other text
     is returned unchanged.
     """
     text = str(s)
     if text == 'l':
         text = 'I'
     # Guard clause: anything that is not purely X/I/V is passed through.
     if re.search('(^[XIV]+$)', text) is None:
         return text
     return str(roman2int(text))
Beispiel #3
0
def getTierInfo(summoner_id):
    """Fetch league statistics for a summoner from the Riot NA1 API.

    Only the first entry of the JSON response is used.  The request is
    authenticated with the module-level ``access_key``.

    Returns:
        A tuple ``(rank, league_points, wins, losses, win_ratio)`` where
        ``rank`` is the title-cased tier followed by the division converted
        from a Roman numeral to a number, and ``win_ratio`` is the rounded
        win percentage as a string.
    """
    endpoint = "https://na1.api.riotgames.com/lol/league/v4/entries/by-summoner/{}?api_key={}"
    response = requests.get(endpoint.format(summoner_id, access_key))
    first_entry = response.json()[0]

    division = str(roman2int(first_entry["rank"]))
    rank = " ".join((first_entry["tier"], division)).title()

    league_points = first_entry["leaguePoints"]
    wins = first_entry["wins"]
    losses = first_entry["losses"]

    games_won = int(wins)
    games_played = games_won + int(losses)
    percentage = round((games_won * 100) / games_played)
    win_ratio = str(int(percentage))

    return rank, league_points, wins, losses, win_ratio
    def _get_symbol_ox_number(self, parsed_string: str) -> Tuple[str, int]:
        """Split a parser hit into its symbol and integer oxidation number.

        Args:
            parsed_string (str): regex match of the form
                metalname(romanoxidationstate)

        Returns:
            str: symbol, looked up in ``self.name_symbol_dict`` by the
                lowercased metal name
            int: oxidation number

        """
        metal_name, oxidation_state = parsed_string.strip(")").split("(")
        symbol = self.name_symbol_dict[metal_name.lower()]

        # A literal "0" is not a Roman numeral, so it is handled directly.
        if oxidation_state == "0":
            return symbol, 0

        return symbol, roman2int(oxidation_state)
Beispiel #5
0
    def identify_headers_footers_pagenos(self, placement):
        """
        Attempts to identify the presence of headers, footers, or page numbers

        1. Build a dict of first & last lines, indexed by page number.
        2. Try to identify headers and footers.

        Headers and footers can appear on every page, or on alternating pages
        (for example if one page has header of the title, the facing page
        might have the header of the chapter name).

        They may include a page number, or the page number might be a
        standalone header or footer.

        The presence of headers and footers in the document does not mean they
        appear on every page (for example, chapter openings or illustrated
        pages sometimes don't contain the header/footer, or contain a modified
        version, such as a standalone page number).

        Page numbers may be in Arabic or Roman numerals.

        This method does not attempt to look for all edge cases. For example,
        it will not find:
        - constantly varied headers, as in a dictionary
        - page numbers that don't steadily increase
        - page numbers that were misidentified in the OCR process, eg. IO2
        - page numbers that have characters around them, eg. '* 45 *'

        Args:
            placement: 'first' to analyse ``self.firsts`` (headers); any
                other value analyses ``self.lasts`` (footers).  Only blocks
                in ``self.blocks`` that contain ``placement`` as a key are
                considered.

        Side effects:
            Fills the selected lines dict with per-page 'text',
            'ocr_digits' / 'ocr_roman' and fuzzy-ratio entries, and may set
            ``self.pagenums_found``, ``self.rpagenums_found``,
            ``self.headers_present`` and ``self.footers_present``.
        """

        # running this on first lines or last lines?
        if placement == 'first':
            mylines = self.firsts
        else:
            mylines = self.lasts
        self.logger.debug("Looking for headers/footers: {}".format(placement))

        # Look for standalone strings of digits
        digits = re.compile(r'^\d+$')
        # The flag must be given at compile time: the second positional
        # argument of a compiled pattern's search() is a start position,
        # not a flags value.
        romans = re.compile(r'^[xicmlvd]+$', re.IGNORECASE)
        candidate_digits = []
        candidate_romans = []
        for block in self.blocks:
            if placement not in block:
                continue
            line = block['text']
            ourpageno = block['page_no']
            mylines[ourpageno] = {'text': line}
            if romans.search(line):
                # Is this a roman numeral?
                try:
                    # The numeral.roman2int method is very permissive
                    # for archaic numeral forms, which is good.
                    num = roman2int(line)
                except (ValueError, NotImplementedError):
                    # Not a roman numeral: skip the line so that no
                    # unbound or stale value is recorded as a candidate.
                    continue
                mylines[ourpageno]['ocr_roman'] = placement
                candidate_romans.append(num)
            elif digits.search(line):
                mylines[ourpageno]['ocr_digits'] = placement
                candidate_digits.append(int(line))

        # The algorithms to find false positives in page number candidates
        # are resource intensive, so this excludes anything where the candidate
        # numbers aren't monotonically increasing.
        if candidate_digits and is_increasing(candidate_digits):
            self.pagenums_found = True
            self.logger.debug("Page #s found: {}".format(candidate_digits))
        if candidate_romans and is_increasing(candidate_romans):
            self.rpagenums_found = True
            self.logger.debug(
                "Roman page #s found: {}".format(candidate_romans))

        # identify match ratio
        fuzz_consecutive = 0
        fuzz_alternating = 0
        for k, v in mylines.items():
            # Check to see if there's still one page forward
            if k + 1 in mylines:
                ratio_consecutive = fuzz.ratio(v['text'],
                                               mylines[k + 1]['text'])
                v['ratio_consecutive'] = ratio_consecutive
                fuzz_consecutive += ratio_consecutive
            # Check to see if there's still two pages forward
            if k + 2 in mylines:
                ratio_alternating = fuzz.ratio(v['text'],
                                               mylines[k + 2]['text'])
                v['ratio_alternating'] = ratio_alternating
                fuzz_alternating += ratio_alternating

        # occasional similar first/last lines might happen in all texts,
        # so only identify headers & footers if there are many of them
        HEADERS_PRESENT_THRESHOLD = int(
            config.get('Main', 'HEADERS_PRESENT_THRESHOLD'))
        if len(mylines) > 2:
            average_consecutive = fuzz_consecutive / (len(mylines) - 1)
            average_alternating = fuzz_alternating / (len(mylines) - 2)
            self.logger.debug("{}: consecutive fuzz avg.: {}".format(
                placement, average_consecutive))
            self.logger.debug("{}: alternating fuzz avg.: {}".format(
                placement, average_alternating))
            if average_consecutive > HEADERS_PRESENT_THRESHOLD:
                if placement == 'first':
                    self.headers_present = 'consecutive'
                else:
                    self.footers_present = 'consecutive'
                self.logger.debug(
                    "{} repeated, consecutive pages".format(placement))
            elif average_alternating > HEADERS_PRESENT_THRESHOLD:
                if placement == 'first':
                    self.headers_present = 'alternating'
                else:
                    self.footers_present = 'alternating'
                self.logger.debug(
                    "{} repeated, alternating pages".format(placement))