def get_all_matches(cls, obj, message):
    from xoutil.future.itertools import map
    from re import compile as _re_compile
    config = obj.env['ir.config_parameter']
    pattern = config.get_param(
        'evaneos.mailrouter.pattern',
        # The default allows the tests to pass.
        default=r'_(?P<thread>\d+)(?:_[^@]+)?@.*(?<=[@\.])evaneos\.com$')
    senders = cls.get_senders(message)
    regex = _re_compile(pattern)
    search = regex.search
    return (match for match in map(search, senders) if match)
def parse_with_next(text):
    num_pat = _re_compile(r'^[+-]?[0-9]+(\.[0-9]*)?')
    m = num_pat.search(text)
    if m is None:
        return None
    else:
        num = text[m.start():m.end()]
        end = text[m.end():]
        if '.' in num:
            num = YJNumber(float(num))
        else:
            num = YJNumber(int(num))
        return num, end
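# A minimal usage sketch for parse_with_next. YJNumber is defined elsewhere in
# the original module; the stub below is a hypothetical stand-in so the example
# is self-contained.
class YJNumber:
    def __init__(self, value):
        self.value = value

num, rest = parse_with_next("3.14 remains")
print(num.value, repr(rest))            # 3.14 ' remains'
print(parse_with_next("no leading number"))  # None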
def _initialize_self_mention(self, client):
    """
    Initialize self mention detection

    :param client: Web client passed along with the message
    """
    # Build a regex to detect when the bot is mentioned in the message and
    # react to the command.
    # Retrieve the bot ID first
    r = client.auth_test()
    if r['ok']:
        bot_id = r['user_id']
    else:
        # Something went wrong
        raise RuntimeError('Could not retrieve bot ID: {}'.format(
            r['error']))
    # The bot should react to: @<BotID> <command>, but not to <...> @<BotID> <...>
    # self._self_mention = _re_compile(r"^<@{}>".format(bot_id))
    # Groups: (mention) (command) (arguments)
    self._self_mention = _re_compile(
        r"^(<@{}>) (\w*) ?(.*)".format(bot_id))
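# Illustrative only: what the compiled pattern captures, with 'U0XXXXXXX'
# standing in for the real bot ID returned by auth_test().
_example_mention = _re_compile(r"^(<@{}>) (\w*) ?(.*)".format('U0XXXXXXX'))
m = _example_mention.match('<@U0XXXXXXX> deploy staging')
if m:
    print(m.group(2), m.group(3))  # 'deploy' 'staging'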
    while i != 0:
        xynames.append(_odr.XYCurves.Name())
        i = _odr.XYCurves.Next()
    return xynames

# </editor-fold>

# <editor-fold desc="Loads and declarations">
_this_module = _sys_modules[__name__]
_classmap = _get_classmap()
_command_manager = stdout_redirected
_DISKFULL_RE = _re_compile('Disk Full')
_ID = 'OpendssdirectEnhancer'
setattr(_this_module, 'utils', _odr.utils)

# loads chains of functions through which to pass the rough outputs of opendss.
with open(TREATMENTS_PATH, 'r') as _tfile:
    _rtrt = _json_load(_tfile)

_trt = dict()
for _subdic_name, _subdic in _rtrt.items():
    _nsd = {
        k: tuple([globals()[_t] for _t in _v])
        for k, _v in _subdic.items()
    }
    _trt[_subdic_name] = _nsd
__author__ = 'Dug Song <*****@*****.**>'
__copyright__ = 'Copyright (c) 2004 Dug Song'
__license__ = 'BSD'
__url__ = 'http://monkey.org/~dugsong/dpkt/'
__version__ = '1.2'

try:
    from itertools import izip as _it_izip
except ImportError:
    _it_izip = zip

from struct import calcsize as _st_calcsize, \
    pack as _st_pack, unpack as _st_unpack, error as _st_error
from re import compile as _re_compile

intchr = _re_compile(r"(?P<int>[0-9]+)(?P<chr>.)")


class MetaPacket(type):
    def __new__(cls, clsname, clsbases, clsdict):
        if '__hdr__' in clsdict:
            st = clsdict['__hdr__']
            clsdict['__hdr_fields__'] = [x[0] for x in st]
            clsdict['__hdr_fmt__'] = clsdict.get('__byte_order__', '>') + \
                ''.join([x[1] for x in st])
            clsdict['__hdr_len__'] = _st_calcsize(clsdict['__hdr_fmt__'])
            clsdict['__hdr_defaults__'] = \
                dict(zip(clsdict['__hdr_fields__'], [x[2] for x in st]))
            clsdict['__slots__'] = clsdict['__hdr_fields__']
        return type.__new__(cls, clsname, clsbases, clsdict)
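# A minimal sketch of how a class might use MetaPacket, assuming the
# (field name, struct format, default) layout of __hdr__ entries implied by
# the metaclass above; the Foo class and its fields are purely illustrative.
class Foo(object, metaclass=MetaPacket):
    __hdr__ = (
        ('type', 'B', 0),    # one unsigned byte
        ('length', 'H', 0),  # two-byte unsigned, big-endian by default ('>')
    )

print(Foo.__hdr_fmt__)  # '>BH'
print(Foo.__hdr_len__)  # 3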
def scrape_talks(talk_links, outfile, skippedfile, start_at=0):
    '''
    Scrapes the talk pages for a TED talk, getting information and the
    transcript tokenized by cue cards. Returns nothing.
    Takes a list of talk links, a filepath for the scrape output, and a
    filepath for the skipped talks.
    '''
    for index, talk_link in enumerate(talk_links):
        # Allow for starting part way through the list in case of exception
        if index < start_at:
            continue

        ### Get information about the talk ###
        talk_res = requests.get(base_url + talk_link)
        talk_page_html = talk_res.text
        talk_soup = BeautifulSoup(talk_page_html, "html.parser")

        # Scrape the <script> that contains the "__INITIAL_DATA__" object
        # Maybe I should scrape using javascript... this should already be loaded in as an object
        # ...
        # Just tested, web console showed _q as empty, couldn't locate the INITIAL object.
        # _q and q() are being used by an external script with the use of eval.
        # Stick with Python here.
        script = talk_soup.find("script", string=_re_compile("__INITIAL_DATA__"))

        # Get the html after the object declaration, removing the newline and outer object close
        talk_data_string = script.text.split("\"__INITIAL_DATA__\":")[1][:-3]
        talk_data = json.loads(talk_data_string)
        talk = talk_data["talks"][0]

        # If multiple speakers, a performance, or grabbed from the web (no transcript), just skip to the next talk
        if (len(talk["speakers"]) > 1) or ("performance" in talk["tags"]) or (
                talk["video_type"]["id"] == '5'):
            # Log the talk name and primary speaker so I can check to make sure I'm not
            # skipping something that I shouldn't!
            print("Skipping", talk["title"])
            with open(skippedfile, 'a', encoding="utf-8") as file:
                file.write(
                    str((talk["title"], talk["id"], talk["speaker_name"],
                         talk_link)) + '\n')
            # Remember to sleep
            if (index + 1) % 5 == 0:
                print("Scraped", index + 1, "TALKS so far, taking a little break.")
                _time_sleep(60)
            continue

        # Add talk information
        data = {}
        data["title"] = talk["title"]
        data["talk_id"] = talk["id"]
        data["speaker"] = talk["speaker_name"]
        data["speaker_id"] = talk["speakers"][0]["id"]
        data["num_views"] = talk["viewed_count"]
        data["num_comments"] = talk_data["comments"]["count"]
        data["date"] = talk["recorded_at"]
        data["tags"] = talk["tags"]
        data["categories"] = talk["ratings"]
        data["language"] = talk_data["language"]
        data["duration"] = talk["duration"]
        data["event"] = talk_data["event"]

        ### Get the transcript ###
        # I have two options here: I could request the transcript from the page itself,
        # which handles parsing the json that stores the transcript, and I could
        # very easily scrape the <p> tags with attribute "dir": "ltr".
        #
        # Or I can request just the json and parse it myself.
        #
        # Scraping the page would be much simpler, but I think parsing would be
        # faster for scraping purposes.
        # The splitting of the transcript into cues may be useful for tokenization.
        # I'll store the text as a list of the cue text. I can always concat
        # then tokenize again if I want.
        transc_res = requests.get(base_url + "/talks/" + data["talk_id"] +
                                  transcript_url_extension)
        transcript_json = transc_res.json()

        tokens = []
        try:
            # Get the list of paragraphs from the transcript
            for paragraph in transcript_json["paragraphs"]:
                # Each paragraph contains a list of cue sets
                for cue in paragraph["cues"]:
                    # Append the text from the cue to our list of tokens
                    tokens.append(cue["text"])
            data["tokens"] = tokens
        except KeyError:
            print("Skipping", talk["title"])
            with open(skippedfile, 'a', encoding="utf-8") as file:
                file.write(
                    str((talk["title"], talk["id"], talk["speaker_name"],
                         talk_link)) + '\n')
            # Remember to sleep
            if (index + 1) % 5 == 0:
                print("Scraped", index + 1, "TALKS so far, taking a little break.")
                _time_sleep(60)
            continue

        # Debug print
        print("Just scraped \"", data["title"], "\" Now writing.")

        ### Append data of this talk to file ###
        # Opening and appending to the file each time to preserve memory and
        # allow thrown exceptions to not destroy progress!
        # Every line of this file will contain a dictionary object for the talk
        with open(outfile, 'a', encoding="utf-8") as talk_file:
            talk_file.write(str(data) + '\n')

        # Debug
        print("Written.")

        ### Add some rest after every fifth scrape ###
        # As mentioned above, too many requests leads to a rate limit
        #
        # Ran into an issue with the transcript this time, need to wait longer.
        #
        # Previously paused every ninth, more frequent now to hopefully avoid it.
        #
        # Still didn't work. Ran into a stricter limit this time (7). The response
        # does not have headers that reveal the limit! Looking into solutions
        # besides simply a longer sleep.
        #
        # Next day, decided that it doesn't matter if I just leave this scrape
        # running, I can wait between scrapes no worries.
        if (index + 1) % 5 == 0:
            print("Scraped", index + 1, "TALKS so far, taking a little break.")
            _time_sleep(60)
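# For reference, a minimal sketch of the transcript JSON shape the nested loop
# above expects; the cue texts below are invented.
sample_transcript_json = {
    "paragraphs": [
        {"cues": [{"text": "Hello everyone."}, {"text": "Thank you."}]},
        {"cues": [{"text": "Let's begin."}]},
    ]
}
sample_tokens = [cue["text"]
                 for paragraph in sample_transcript_json["paragraphs"]
                 for cue in paragraph["cues"]]
print(sample_tokens)  # ['Hello everyone.', 'Thank you.', "Let's begin."]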
from re import compile as _re_compile
from inspect import (currentframe as _curr_frame,
                     getframeinfo as _get_frame_info)


def is_pos_int(obj):
    return type(obj) is int and obj >= 1


_valid_pattern = _re_compile(r"^[a-zA-Z_]\w*$")


def is_valid_name(obj):
    return (type(obj) is str) and (_valid_pattern.match(obj) is not None)


class Sym:
    _unq_count = 1

    def __init__(self, nm):
        if not is_valid_name(nm):
            raise TypeError(f"expected an alphanumeric name string, "
                            f"but got '{nm}'")
        self._nm = nm
        self._id = Sym._unq_count
        Sym._unq_count += 1

    def __str__(self):
        return self._nm

    def __repr__(self):
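# Quick, illustrative checks of the helpers above (sample values made up):
assert is_valid_name("foo_1")
assert not is_valid_name("1foo")   # names may not start with a digit
assert not is_valid_name("foo-1")  # hyphens are not word characters
assert is_pos_int(3) and not is_pos_int(0)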
    and message.
    """
    pass


class StopGeneration(Exception):
    """StopGeneration: Raised when the top-level soundscape reaches its
    end -- no more agents or sounds to be run.
    """
    pass


from re import compile as _re_compile

# Regular expression for valid event/property names: one or more elements,
# separated by periods. Each element must contain only letters, digits,
# and underscores. An element may not start with a digit.
_prop_name_regexp = _re_compile(r'\A[a-zA-Z_][a-zA-Z_0-9]*(\.([a-zA-Z_][a-zA-Z_0-9]*))*\Z')

# A cache of valid event/property names. We keep this so that we don't
# have to regexp them every time.
_valid_prop_names = {}


def check_prop_name(val):
    """check_prop_name(val) -> str

    Ensure that the value is a valid event or property name.
    If it isn't, raise BoodlerError. If it is, return a str version of it
    (in case it was a unicode object).
    """
    res = _valid_prop_names.get(val)
    if (res):
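# A small illustration of which names the pattern above accepts (the sample
# names are made up):
for _name in ('volume', 'agent.fade_time', '2bad', 'trailing.'):
    _m = _prop_name_regexp.match(_name)
    print(_name, '->', 'valid' if _m else 'invalid')
# volume -> valid, agent.fade_time -> valid, 2bad -> invalid, trailing. -> invalid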
import unittest
from ATL import *

import numpy as np
import random
import time

from functools import wraps
from collections import namedtuple
from re import compile as _re_compile

_data_method_RE = _re_compile("^data_")

# --------------------------------------------------------------------------- #
# --------------------------------------------------------------------------- #

DataCase = namedtuple('DataCase', ['name', 'input', 'output'])
InputCase = namedtuple('InputCase', ['name', 'input'])
DInputCase = namedtuple('DInputCase', ['name', 'input', 'dinput'])
DInOutCase = namedtuple('DInOutCase', ['name', 'input', 'dinput', 'doutput'])


class FunctionTestCase:
    """ A testing mixin that should be used alongside unittest.TestCase

    Expects gen_func() to be defined returning an ATL function.
    Then every method with name data_*() is expected to return a sample
    input/output data pair for testing the function with.
    """

    def _init_rand(self):
        if not hasattr(self, 'rand'):
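# One plausible way the data_* case methods might be discovered with the regex
# above; the _Example class is made up for illustration and is not part of the
# original test suite.
class _Example:
    def data_small(self): ...
    def data_large(self): ...
    def helper(self): ...

case_methods = [name for name in dir(_Example)
                if _data_method_RE.match(name)]
print(case_methods)  # ['data_large', 'data_small']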
__author__ = 'Dug Song <*****@*****.**>'
__copyright__ = 'Copyright (c) 2004 Dug Song'
__license__ = 'BSD'
__url__ = 'http://monkey.org/~dugsong/dpkt/'
__version__ = '1.2'

try:
    from itertools import izip as _it_izip
except ImportError:
    _it_izip = zip

from struct import calcsize as _st_calcsize, \
    pack as _st_pack, unpack as _st_unpack, error as _st_error
from re import compile as _re_compile

intchr = _re_compile(r"(?P<int>[0-9]+)(?P<chr>.)")


class MetaPacket(type):
    def __new__(cls, clsname, clsbases, clsdict):
        if '__hdr__' in clsdict:
            st = clsdict['__hdr__']
            clsdict['__hdr_fields__'] = [x[0] for x in st]
            clsdict['__hdr_fmt__'] = clsdict.get('__byte_order__', '>') + \
                ''.join([x[1] for x in st])
            clsdict['__hdr_len__'] = _st_calcsize(clsdict['__hdr_fmt__'])
            clsdict['__hdr_defaults__'] = \
                dict(zip(clsdict['__hdr_fields__'], [x[2] for x in st]))
            clsdict['__slots__'] = clsdict['__hdr_fields__']
        return type.__new__(cls, clsname, clsbases, clsdict)


class Packet(object):
def scrape_talks(filepath, outfile):
    '''
    Scrapes the talk pages for a TED talk, getting information and the
    transcript tokenized by cue cards. Returns nothing.
    Takes the filepath of the file listing talk links and the filepath for
    the scrape output.
    '''
    with open(filepath, 'r') as file:
        talk_links = file.read()
    talk_links = talk_links.split("\n")[:-1]

    collection = {}
    for index, talk_link in enumerate(talk_links):
        ### Get information about the talk ###
        talk_page_html = requests.get(base_url + talk_link,
                                      headers=headers_talklinks).text
        talk_soup = BeautifulSoup(talk_page_html, "html.parser")

        # Scrape the <script> that contains the "__INITIAL_DATA__" object
        script = talk_soup.find("script", string=_re_compile("__INITIAL_DATA__"))

        # Get the html after the object declaration, removing the newline and outer object close
        talk_data_string = script.text.split("\"__INITIAL_DATA__\":")[1][:-3]
        talk_data = json.loads(talk_data_string)
        talk = talk_data["talks"][0]

        data = {}
        # I couldn't find a way to extract the comments from www.ted.com.
        # But anyway, all comments from www.ted.com seem to be positive (only
        # people who love TED talks go to www.ted.com), so I only use the
        # comments from YouTube (and YouTube is much more popular across the world).
        data["title"] = talk["title"]
        data["speaker"] = talk["speaker_name"]
        data["description"] = talk["description"]
        data["date"] = talk["recorded_at"][:10]
        data["duration"] = talk["duration"]
        data["thumbnails"] = talk["player_talks"][0]["thumb"]
        data["tags"] = talk["tags"]
        data["num_views"] = talk["viewed_count"]
        data["num_comments"] = talk_data["comments"]["count"] if talk_data[
            "comments"] is not None else 0
        data["num_transcripts"] = len(talk["downloads"]["languages"])
        if data["num_transcripts"] != 0:
            data["transcript_language"] = [
                d['endonym'] for d in talk["downloads"]["languages"]
            ]
        else:
            data["transcript_language"] = []
        data["categories"] = talk["ratings"]
        data["event"] = talk_data["event"]
        data["talk_link"] = base_url + talk_link

        ### Get the transcript ###
        if data["num_transcripts"] != 0:
            transc = requests.get(base_url + talk_link + transcript_url_extension,
                                  headers=headers_talklinks)
            transc = transc.json()
            if "paragraphs" in transc:
                for t in transc["paragraphs"]:
                    for cue in t["cues"]:
                        if "transcript" not in data:
                            data["transcript"] = cue["text"]
                        else:
                            data["transcript"] = data["transcript"] + " " + cue["text"]
            else:
                data["transcript"] = ""
        else:
            data["transcript"] = ""

        collection[str(index + 1)] = data

    with open(outfile, "w") as f:
        json.dump(collection, f, indent=4)
import logging

from email.utils import getaddresses, formataddr
from re import compile as _re_compile

from xoutil.future.itertools import map
from xoutil.future.functools import curry

from xoeuf.odoo.addons.xopgi_mail_threads import MailRouter, MailTransportRouter
from xoeuf.odoo.addons.base.ir.ir_mail_server import encode_rfc2822_address_header  # noqa
from xoeuf import models, fields

logger = logging.getLogger(__name__)

EVANEOS_REGEXP = _re_compile(
    r'_(?P<thread>\d+)(?P<uuid>[_-][^@]+)?(?P<host>@.*(?<=[@\.])evaneos\.com)$'
)


class MATCH_TYPE:
    SENDER = 0
    RECIPIENT = 1


class Message(models.Model):
    _inherit = 'mail.message'

    # Make the email_from create an index, the 'search' in the router is slow
    # without it.
    email_from = fields.Char(index=True)
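# A small illustration of what EVANEOS_REGEXP captures; the sample address
# below is made up.
_m = EVANEOS_REGEXP.search('partner_12345_ab-cd@mail.evaneos.com')
if _m:
    print(_m.group('thread'))  # '12345'
    print(_m.group('uuid'))    # '_ab-cd'
    print(_m.group('host'))    # '@mail.evaneos.com'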
def match_path_by_pattern(pattern, path):
    # Match a file path against a glob-style pattern, e.g.
    #   pattern: "/root/*.txt"
    #   path:    "/root/model.txt"
    regex = _fn_translate(pattern)
    reobj = _re_compile(regex)
    return reobj.match(path)
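# Usage sketch, assuming (as the aliases suggest) that _fn_translate is
# fnmatch.translate and _re_compile is re.compile.
from fnmatch import translate as _fn_translate
from re import compile as _re_compile

print(bool(match_path_by_pattern("/root/*.txt", "/root/model.txt")))  # True
print(bool(match_path_by_pattern("/root/*.txt", "/root/model.csv")))  # False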
f = logging.Formatter('%(relativeCreated)d, %(message)s')
sh = logging.StreamHandler()
sh.setFormatter(f)
fh = logging.FileHandler(filename='11112.log', mode='w', encoding='utf8')
fh.setFormatter(f)
__log__.addHandler(sh)
__log__.addHandler(fh)
del f
del sh
del fh

###############################################################################

REGEX_SEARCH_IMAGE = _re_compile(
    r'<meta data-vue-meta="true" itemprop="image" content="([^"]+)"/>')
REGEX_VIDEO_IDENT = _re_compile(r'av\d+')
DICT_HEADERS = {
    'User-Agent': 'Mozilla/5.0(Macintosh;U;IntelMacOSX10_6_8;en-us)'
                  'AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50'
}
STRING_URL_BASE = 'https://search.bilibili.com/all?'
EVENT_ALL_DONE = threading.Event()
INT_TIME_OUT = 12


class MyThread(threading.Thread):
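# Illustrative only: how REGEX_SEARCH_IMAGE pulls a cover image URL out of a
# page; the HTML line and URL below are made up.
_sample_html = '<meta data-vue-meta="true" itemprop="image" content="https://example.com/cover.jpg"/>'
_m = REGEX_SEARCH_IMAGE.search(_sample_html)
print(_m.group(1) if _m else None)  # https://example.com/cover.jpg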