def parse_page(self, page):
    # Extract the page title and the body of the <text> element. The
    # lookbehind stops at '<text', so any tag attributes remain at the
    # start of the capture.
    title = regex.search(r'(?<=<title>).*(?=<\/title>)', page).group(0)
    text = regex.search(r'(?<=<text).*(?=<\/text>)', page,
                        flags=regex.DOTALL).group(0)
    infobox = None
    # Recursive pattern: (?1) re-enters group 1, so group 1 matches the
    # whole balanced {...} block that starts at "{Infobox".
    infobox_regex = regex.search(r'(?=\{Infobox)(\{([^{}]|(?1))*\})', text)
    text_start_index = 0
    if infobox_regex:
        # Skip past the infobox so the body text starts after it.
        text_start_index = infobox_regex.end()
        infobox = infobox_regex.group(1)
    page = Page(title, infobox, text[text_start_index:])
    return page.get_parsed_date_tokens()
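# A minimal sketch of the recursive pattern used above: (?1) recurses into
# group 1, so the expression matches an arbitrarily nested, balanced {...}
# block (the sample wikitext is illustrative).
m = regex.search(r'(?=\{Infobox)(\{([^{}]|(?1))*\})',
                 "lead text {Infobox person | name = {nested {deep}} } body")
print(m.group(1))  # -> {Infobox person | name = {nested {deep}} }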
def get_lang(string):
    # TODO: use nltk language detection
    has_cyrillic = regex.search(r'\p{IsCyrillic}', string)
    if has_cyrillic:
        return "russian"
    return "english"
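# Usage sketch for get_lang; the sample strings are illustrative.
print(get_lang("Привет, мир"))   # -> "russian" (Cyrillic detected)
print(get_lang("Hello, world"))  # -> "english" (fallback)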
def parse_number_of_jobs_found(input_html: str) -> int:
    """
    The top of the results page states how many postings were found for
    the search query. This function parses that number.

    :param input_html: HTML string of the results page
    :return: number of job postings found
    """
    # Matches the count inside "(123 Jobs gefunden)" ("123 jobs found").
    return int(
        re.search(r"(?<=\()[0-9]{2,4}(?= Jobs gefunden\))",
                  input_html).group())
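# Usage sketch with a made-up fragment of the results page:
sample_html = "<h1>Suchergebnis (137 Jobs gefunden)</h1>"
print(parse_number_of_jobs_found(sample_html))  # -> 137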
async def event_message(self, ctx):
    if ctx.author.name.lower() == self.nick.lower():
        # Commands are handled only for the bot user itself.
        await self.handle_commands(ctx)
    if self.active and ctx.author.name.lower() == self.target_user.lower():
        # Reply logic applies only to the targeted user.
        for rule in self.rules:
            if regex.search(rule.get('pattern'), ctx.content):
                # asyncio.sleep (not time.sleep) keeps the event loop
                # responsive while we delay the reply.
                await asyncio.sleep(0.5)
                await ctx.channel.send(rule.get('reply'))
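# Sketch of the rule structure event_message expects: each rule pairs a
# regex pattern with a canned reply (field names taken from the .get calls
# above; the values are made up).
rules = [
    {'pattern': r'(?i)\bhello\b', 'reply': 'Hey there!'},
    {'pattern': r'\?$', 'reply': 'Good question.'},
]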
def find_date(text):
    # Match either "Month DD, YYYY" / "DD Month YYYY" style dates or a
    # bare "Month YYYY" / "in YYYY" reference.
    date = regex.search(
        r'(( [a-zA-Z]{3,8}|\d{1,2})[ ]\d{1,2}([ ]|(\,? ))\d{1,4})'
        r'|(([a-zA-Z]{3,8}|in) \d{4})(?<=[^\}])',
        text)
    if date:
        start = date.start()
        end = date.end()
        date = date.group(0)
        return DateInText(date, start, end)
    return None
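# Usage sketch; the attribute names on DateInText are assumptions based on
# the constructor arguments above.
hit = find_date("The armistice was signed on November 11, 1918 in France.")
if hit:
    print(hit.date, hit.start, hit.end)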
def get_data_from_row(row):
    try:
        text = row.text
    except AttributeError:
        return
    # Expect rows shaped like "Show Name (2008) (TV Series)".
    match = re.search(r"(.+) (\(.+\)) (\(.+\))", text)
    try:
        name = match.group(1).strip()
        year = match.group(2).strip("()")
        cat = match.group(3).strip("()")
    except AttributeError:
        return
    if "series" not in cat.lower():
        return
    try:
        link = row.select_one(".result_text a").get("href").strip()
        link_match = re.search(r"title\/(tt\d+)", link)
        show_id = link_match.group(1)
    except AttributeError:
        return
    return dict(showname=name, category=cat, year=year, id=show_id)
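# Usage sketch; builds a row with BeautifulSoup (assumed from the
# .select_one call above) using made-up IMDb-style markup.
from bs4 import BeautifulSoup
html = ('<tr><td class="result_text"><a href="/title/tt0903747/">'
        'Breaking Bad</a> (2008) (TV Series)</td></tr>')
row = BeautifulSoup(html, "html.parser").tr
print(get_data_from_row(row))
# -> {'showname': 'Breaking Bad', 'category': 'TV Series',
#     'year': '2008', 'id': 'tt0903747'}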
def __merge_links(self, links):
    """
    Adds links to the object's list only if their job ID is not already
    present in the job_ids set.

    :param links: list of URLs as strings
    """
    for link in links:
        try:
            # Job IDs are hex-digit/hyphen sequences of at least 9 chars.
            job_id = re.search(r"[0-9a-fA-F-]{9,}", link).group()
            if job_id not in self.job_ids:
                self.links.append(link)
                self.job_ids.add(job_id)
        except (TypeError, AttributeError):
            # TypeError: link is None; AttributeError: no match found.
            pass
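# Sketch of the job-id extraction on a made-up link (the URL shape is an
# assumption):
link = "https://example.com/jobs/view/3581234567/"
print(re.search(r"[0-9a-fA-F-]{9,}", link).group())  # -> 3581234567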
def parse_page(self, page):
    tree = ET.fromstring(page)
    title = tree.find('title').text
    text = tree.find('revision').find('text').text
    infobox = None
    # Recursive pattern: group 1 matches the balanced {...} block that
    # starts at "{Infobox". Searching `text` keeps the match offsets
    # consistent with the slice below.
    infobox_regex = regex.search(r'(?=\{Infobox)(\{([^{}]|(?1))*\})', text)
    text_start_index = 0
    if infobox_regex:
        text_start_index = infobox_regex.end()
        infobox = infobox_regex.group(1)
    page = Page(title, infobox, text[text_start_index:])
    return page.get_parsed_date_tokens()
def __vectorize(self, sentences, training=False):
    sentenceKeys = list(sentences[0].keys())
    for sentence in sentences:
        for name in sentenceKeys:
            if name == "NER_IOBX":
                continue
            if name.startswith("NER_"):
                # Build or extend the label-to-id mapping for this column.
                if name not in self.__mappings and training:
                    self.__mappings[name] = {"O": 1}
                    if self.__special_labels:
                        self.__mappings[name]["[CLS]"] = len(
                            self.__mappings[name]) + 1
                        self.__mappings[name]["[SEP]"] = len(
                            self.__mappings[name]) + 1
                for (id_, item) in enumerate(sentence[name]):
                    if item not in self.__mappings[name]:
                        if training:
                            self.__mappings[name][item] = len(
                                self.__mappings[name]) + 1
                        else:
                            print(f"Issue with the label {item} in {name}")
                            exit(1)
                    sentence[name][id_] = self.__mappings[name][item]
            if name == "tokens" and training and self.__tokenizer is not None:
                # Collect tokens the BERT tokenizer cannot reproduce so
                # they can be added to its vocabulary later.
                for token in sentence["tokens"]:
                    bert_tokens = self.__tokenizer.tokenize(token)
                    if self.__validateBertTokens(
                            token, bert_tokens
                    ) == 1 and token not in self.__add_tokens:
                        if regex.search(r"\p{P}|\p{S}", token):
                            # Split on punctuation/symbols and re-check
                            # the pieces individually.
                            new_tokens = list(
                                filter(
                                    None,
                                    regex.split(r"(\p{P}|\p{S})", token)))
                            for sub_token in new_tokens:
                                bert_tokens = self.__tokenizer.tokenize(
                                    sub_token)
                                if self.__validateBertTokens(
                                        sub_token, bert_tokens) == 1:
                                    self.__add_tokens.add(sub_token)
                        else:
                            self.__add_tokens.add(token)
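# Sketch of the punctuation/symbol split used above: the capturing group
# makes regex.split keep the delimiters, and filter(None, ...) drops any
# empty fragments.
print(list(filter(None, regex.split(r"(\p{P}|\p{S})", "state-of-the-art"))))
# -> ['state', '-', 'of', '-', 'the', '-', 'art']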
def _get_additional_details(details):
    remove_whitespace = lambda x: x.strip()
    additional_data = list(map(remove_whitespace, details.text.split("|")))
    maturity = ""
    # A leading maturity rating is optional; peel it off when present.
    if len(additional_data) == 4:
        maturity = additional_data[0]
        additional_data = additional_data[1:]
    try:
        ep_time, tags, date = additional_data
    except ValueError:
        return {}
    else:
        # Keep only the year span inside the parentheses.
        result = re.search(r".+\((.+)\)", date)
        date = result.group(1)
        tags = list(map(remove_whitespace, tags.split(",")))
        return {
            "tags": tags,
            "time_per_episode": ep_time,
            "running_date": date,
            "maturity": maturity,
        }
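# Usage sketch; the real `details` element only needs a .text attribute
# here, so a simple stub stands in (the sample text is made up).
from types import SimpleNamespace
details = SimpleNamespace(
    text="TV-MA | 45 min | Crime, Drama | Series (2008-2013)")
print(_get_additional_details(details))
# -> {'tags': ['Crime', 'Drama'], 'time_per_episode': '45 min',
#     'running_date': '2008-2013', 'maturity': 'TV-MA'}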
def compiles(self, tex_file_path, n=1, clean=False):
    path, filename, extension, filename_without_extension = \
        get_path_filename_extension(tex_file_path)
    if clean:
        # Globs are only expanded by the shell, so run the cleanup
        # through it.
        subprocess.run('rm -f *.pdf.html *.pdf *.aux *.log',
                       shell=True,
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL)
    for i in range(n):
        print(f"trying to compile {path} + {filename}")
        process = subprocess.Popen(
            f'cd {path} && echo $(pwd) && pdflatex '
            f'-interaction=nonstopmode -halt-on-error -file-line-error '
            f'{filename}',
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            shell=True)
        # Give pdflatex a bounded amount of time, then interrupt it.
        time.sleep(self.timeout_sec)
        process.send_signal(signal.SIGINT)
        output = process.stdout.read().decode('utf-8', errors="ignore")
        errors = process.stderr.read().decode('utf-8', errors="ignore")
        if any(error in output.lower()
               for error in ["latex error", "fatal error"]):
            where = output.lower().index('error')
            error_msg_at = output[where - 150:where + 150]
            self.path_spec.logger.error(
                f'{tex_file_path} -->> compilation failed on \n'
                f'""" {error_msg_at}"""')
            # -file-line-error makes pdflatex print "file:line: message".
            line_number_match = regex.search(r":(\d+):", error_msg_at)
            if line_number_match:
                line_number = int(line_number_match.group(1))
                try:
                    with open(path + "/" + filename) as f:
                        lines = f.readlines()
                except UnicodeDecodeError:
                    self.path_spec.logger.error(
                        "Could not read latex file because of encoding")
                    break
                faulty_code = "\n".join(
                    lines[max(0, line_number - 1):
                          min(len(lines), line_number + 1)])
                self.path_spec.logger.error(
                    f' ---> see file {tex_file_path}: '
                    f'"""\n{faulty_code}"""')
            return None
        if process.returncode:
            print(errors)
            return None
    self.path_spec.logger.info(f"{tex_file_path} compiled")
    pdf_path = path + "/" + filename_without_extension + ".pdf"
    return pdf_path
def hh_parse(burl, hdr):
    # soup_content, base_url and template are expected at module level.
    jobs = []
    urls = [burl]
    soup = soup_content(burl, hdr)
    # Collect every page of the paginated search results.
    pagination = soup.find_all('a', attrs={'data-qa': 'pager-page'})
    count = int(pagination[-1].text)
    for i in range(count):
        url = base_url.format(page_num=i)
        if url not in urls:
            urls.append(url)
    for url in urls:
        soup = soup_content(url, hdr)
        divs = soup.find_all('div', attrs={'class': 'vacancy-serp-item'})
        for div in divs:
            location = ""
            salary = ""
            title = div.find('a', attrs={
                'data-qa': 'vacancy-serp__vacancy-title'
            }).text
            href = div.find('a', attrs={
                'data-qa': 'vacancy-serp__vacancy-title'
            })['href']
            try:
                company = div.find('a', attrs={
                    'data-qa': 'vacancy-serp__vacancy-employer'
                }).text
            except Exception as e:
                print(e)
            try:
                salary = div.find('span', attrs={
                    'data-qa': 'vacancy-serp__vacancy-compensation'
                }).text
            except Exception as e:
                print(e)
            try:
                location = div.find('span', attrs={
                    'class': 'metro-station'
                }).text
            except Exception as e:
                print(e)
            text1 = div.find('div', attrs={
                'data-qa': 'vacancy-serp__vacancy_snippet_responsibility'
            }).text
            text2 = div.find('div', attrs={
                'data-qa': 'vacancy-serp__vacancy_snippet_requirement'
            }).text
            content = text1 + ' ' + text2
            # Keep the vacancy only if the title contains one of the
            # template keywords (regex-module named list).
            if regex.search(r"\L<words>", title, words=template):
                jobs.append({
                    'title': title,
                    'href': href,
                    'company': company,
                    'location': location,
                    'salary': salary,
                    'content': content
                })
    print("Vacancies found:", len(jobs))
    return jobs
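# Sketch of the \L<words> named-list feature from the `regex` module used
# above: it matches any literal from the supplied list, so `template` is
# assumed to be a list of keyword strings.
template = ["Python", "Django"]
print(bool(regex.search(r"\L<words>", "Senior Python Developer",
                        words=template)))  # -> True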
def get_id_from_link(link):
    result = re.search(r"\/title\/([A-Za-z0-9]+)\/", link)
    if result:
        return result.group(1)
    return ""
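# Usage sketch with a made-up IMDb-style link:
print(get_id_from_link("/title/tt0944947/?ref_=fn_al_tt_1"))  # -> tt0944947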