def process(self, stream: t.Iterator[lark.Token]) -> t.Iterator[Token]:
    """Rewrite spelled-out numbers and Roman numerals in a token stream.

    Consecutive tokens that ``self._english_parser.parse`` accepts as one
    number are merged into a single ``UNSIGNED_INTEGER`` (when the parsed
    value is an ``int``) or ``UNSIGNED_REAL`` token that borrows its
    position from the first token of the run.  Any other token whose
    stripped text is at least two characters long and that
    ``numeral.roman2int`` accepts is replaced by an ``INTEGER`` token.
    All remaining tokens are passed through unchanged.

    Args:
        stream: The incoming tokens.

    Yields:
        The rewritten tokens, in stream order.
    """

    def number_token(tokens: t.List[lark.Token]) -> Token:
        # Parse once and reuse the value for both the type and the text.
        value = self._english_parser.parse(tokens)
        kind = "UNSIGNED_INTEGER" if isinstance(value, int) else "UNSIGNED_REAL"
        return Token.new_borrow_pos(kind, str(value), tokens[0])

    number_tokens: t.List[lark.Token] = []
    for token in stream:
        try:
            # Does the run still parse as a number with this token appended?
            self._english_parser.parse(number_tokens + [token])
        except ValueError:
            if number_tokens:
                # The run ended just before this token: emit it.
                yield number_token(number_tokens)
                number_tokens = []
            # NOTE(review): the token that ended a run is not retried as the
            # start of a new run - confirm whether that is intended.
        else:
            number_tokens.append(token)
            continue

        # Roman numerals must be at least 2 characters long,
        # otherwise we can't use 'i' or 'x'...
        if len(str(token).strip()) >= 2:
            try:
                roman_value = numeral.roman2int(token)
            except (ValueError, NotImplementedError):
                pass
            else:
                yield Token.new_borrow_pos("INTEGER", str(roman_value), token)
                continue

        yield token

    # Emit a number that was still being collected when the stream ended;
    # without this the trailing tokens would be dropped.
    if number_tokens:
        yield number_token(number_tokens)
def parse_roman_num(s):
    """Return *s* as text, converting an X/I/V-only numeral to decimal digits.

    The input is stringified first.  A lone lowercase ``'l'`` is treated as
    the numeral ``'I'``.  When the text consists solely of the capital
    letters X, I and V it is passed to ``roman2int`` and the result is
    returned as a string; any other text is returned unchanged.
    """
    text = str(s)
    if text == 'l':
        text = 'I'

    # Anything that is not made up purely of X, I and V is left alone.
    if re.search('(^[XIV]+$)', text) is None:
        return text
    return str(roman2int(text))
def getTierInfo(summoner_id):
    """Fetch ranked statistics for a summoner from the Riot NA1 league API.

    Only the first league entry of the JSON response is used.

    Args:
        summoner_id: Summoner id interpolated into the request URL.

    Returns:
        tuple: ``(rank, league_points, wins, losses, win_ratio)``.  ``rank``
        is the title-cased tier followed by the division converted from a
        Roman numeral to a number; ``win_ratio`` is the rounded win
        percentage as a string (``"0"`` when no games are recorded).

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
        IndexError: If the response list is empty.
    """
    url = "https://na1.api.riotgames.com/lol/league/v4/entries/by-summoner/{}?api_key={}"
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url.format(summoner_id, access_key), timeout=10)
    entry = response.json()[0]

    rank = (entry["tier"] + " " + str(roman2int(entry["rank"]))).title()
    league_points = entry["leaguePoints"]
    wins = entry["wins"]
    losses = entry["losses"]

    # Guard against a division by zero when no games have been played.
    games_played = int(wins) + int(losses)
    if games_played:
        win_ratio = str(int(round((int(wins) * 100) / games_played)))
    else:
        win_ratio = "0"
    return rank, league_points, wins, losses, win_ratio
def _get_symbol_ox_number(self, parsed_string: str) -> Tuple[str, int]:
    """Split a parser hit into element symbol and integer oxidation number.

    Args:
        parsed_string (str): regex match of the form
            metalname(romanoxidationstate)

    Returns:
        str: symbol looked up in ``self.name_symbol_dict`` by the
            lower-cased metal name
        int: oxidation number (``0`` when the state is written as "0",
            otherwise the Roman numeral converted with ``roman2int``)
    """
    metal, oxidation_state = parsed_string.strip(")").split("(")
    symbol = self.name_symbol_dict[metal.lower()]
    # "0" is not a Roman numeral, so it is handled without conversion.
    if oxidation_state == "0":
        return symbol, 0
    return symbol, roman2int(oxidation_state)
def identify_headers_footers_pagenos(self, placement):
    """
    Attempts to identify the presence of headers, footers, or page numbers

    1. Build a dict of first & last lines, indexed by page number.
    2. Try to identify headers and footers.

    Headers and footers can appear on every page, or on alternating pages
    (for example if one page has header of the title, the facing page
    might have the header of the chapter name).

    They may include a page number, or the page number might be a
    standalone header or footer.

    The presence of headers and footers in the document does not mean they
    appear on every page (for example, chapter openings or illustrated
    pages sometimes don't contain the header/footer, or contain a modified
    version, such as a standalone page number).

    Page numbers may be in Arabic or Roman numerals.

    This method does not attempt to look for all edge cases. For example,
    it will not find:
    - constantly varied headers, as in a dictionary
    - page numbers that don't steadily increase
    - page numbers that were misidentified in the OCR process, eg. IO2
    - page numbers that have characters around them, eg. '* 45 *'

    Args:
        placement: Key looked for in each block; ``'first'`` works on
            ``self.firsts`` and sets ``self.headers_present``, any other
            value works on ``self.lasts`` and sets ``self.footers_present``.

    Side effects:
        Fills the selected line dict, and may set ``self.pagenums_found``,
        ``self.rpagenums_found``, ``self.headers_present`` or
        ``self.footers_present``.
    """
    # running this on first lines or last lines?
    mylines = self.firsts if placement == 'first' else self.lasts
    self.logger.debug("Looking for headers/footers: {}".format(placement))

    # Look for standalone strings of digits or of roman-numeral letters.
    # The case flag has to be given at compile time: the second positional
    # argument of a compiled pattern's search() is a start position, so
    # passing re.IGNORECASE there made the anchored pattern never match.
    digits = re.compile(r'^\d+$')
    romans = re.compile(r'^[xicmlvd]+$', re.IGNORECASE)
    candidate_digits = []
    candidate_romans = []
    for block in self.blocks:
        if placement not in block:
            continue
        line = block['text']
        ourpageno = block['page_no']
        mylines[ourpageno] = {'text': line}
        if romans.search(line):
            # Is this a roman numeral?
            try:
                # The numeral.roman2int method is very permissive
                # for archaic numeral forms, which is good.
                num = roman2int(line)
            except (ValueError, NotImplementedError):
                # Not a roman numeral: record nothing for this page rather
                # than appending an unbound or stale value.
                # NOTE(review): NotImplementedError is presumably raised for
                # unsupported numeral forms - confirm against roman2int.
                continue
            mylines[ourpageno]['ocr_roman'] = placement
            candidate_romans.append(num)
        elif digits.search(line):
            mylines[ourpageno]['ocr_digits'] = placement
            candidate_digits.append(int(line))

    # The algorithms to find false positives in page number candidates
    # are resource intensive, so this excludes anything where the candidate
    # numbers aren't monotonically increasing.
    if candidate_digits and is_increasing(candidate_digits):
        self.pagenums_found = True
        self.logger.debug("Page #s found: {}".format(candidate_digits))

    if candidate_romans and is_increasing(candidate_romans):
        self.rpagenums_found = True
        self.logger.debug(
            "Roman page #s found: {}".format(candidate_romans))

    # identify match ratio
    fuzz_consecutive = 0
    fuzz_alternating = 0
    for k, v in mylines.items():
        # Check to see if there's still one page forward
        if k + 1 in mylines:
            ratio_consecutive = fuzz.ratio(v['text'], mylines[k + 1]['text'])
            mylines[k]['ratio_consecutive'] = ratio_consecutive
            fuzz_consecutive += ratio_consecutive

        # Check to see if there's still two pages forward
        if k + 2 in mylines:
            ratio_alternating = fuzz.ratio(v['text'], mylines[k + 2]['text'])
            mylines[k]['ratio_alternating'] = ratio_alternating
            fuzz_alternating += ratio_alternating

    # occasional similar first/last lines might happen in all texts,
    # so only identify headers & footers if there are many of them
    HEADERS_PRESENT_THRESHOLD = int(
        config.get('Main', 'HEADERS_PRESENT_THRESHOLD'))
    if len(mylines) > 2:
        average_consecutive = fuzz_consecutive / (len(mylines) - 1)
        average_alternating = fuzz_alternating / (len(mylines) - 2)
        self.logger.debug("{}: consecutive fuzz avg.: {}".format(
            placement, average_consecutive))
        self.logger.debug("{}: alternating fuzz avg.: {}".format(
            placement, average_alternating))
        if average_consecutive > HEADERS_PRESENT_THRESHOLD:
            if placement == 'first':
                self.headers_present = 'consecutive'
            else:
                self.footers_present = 'consecutive'
            self.logger.debug(
                "{} repeated, consecutive pages".format(placement))
        elif average_alternating > HEADERS_PRESENT_THRESHOLD:
            if placement == 'first':
                self.headers_present = 'alternating'
            else:
                self.footers_present = 'alternating'
            self.logger.debug(
                "{} repeated, alternating pages".format(placement))