Example #1
 def get_all_matches(cls, obj, message):
     from xoutil.future.itertools import map
     from re import compile as _re_compile
     config = obj.env['ir.config_parameter']
     pattern = config.get_param(
         'evaneos.mailrouter.pattern',
         # The default allows the tests to pass.
         default=r'_(?P<thread>\d+)(?:_[^@]+)?@.*(?<=[@\.])evaneos\.com$')
     senders = cls.get_senders(message)
     regex = _re_compile(pattern)
     search = regex.search
     return (match for match in map(search, senders) if match)
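
# A quick standalone sketch (not part of the method above; the address is
# hypothetical) of what the default pattern's 'thread' group captures.
from re import compile as _re_compile
_default = r'_(?P<thread>\d+)(?:_[^@]+)?@.*(?<=[@\.])evaneos\.com$'
_m = _re_compile(_default).search('agency_4321_xyz@mail.evaneos.com')
assert _m is not None and _m.group('thread') == '4321'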
Example #2
File: yson.py Project: yassu/Yson
 def parse_with_next(text):
     num_pat = _re_compile(r'^[+-]?[0-9]+(\.[0-9]*)?')
     m = num_pat.search(text)
     if m is None:
         return None
     else:
         num = text[m.start(): m.end()]
         end = text[m.end():]
         if '.' in num:
             num = YJNumber(float(num))
         else:
             num = YJNumber(int(num))
         return num, end
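
# A standalone sketch (not from yson.py) of the same leading-number regex,
# showing the split parse_with_next performs on a hypothetical input.
from re import compile as _re_compile
_num_pat = _re_compile(r'^[+-]?[0-9]+(\.[0-9]*)?')
_m = _num_pat.search('-12.5, 7]')
assert _m.group(0) == '-12.5' and '-12.5, 7]'[_m.end():] == ', 7]'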
Example #3
    def _initialize_self_mention(self, client):
        """
    Initialize self mention detection
    :param client:  Web client passed to through the message
    """

        #  Build regex to detect when bot is mentionned in the message and reach to
        #  the command.
        #  Retrieve bot ID first
        r = client.auth_test()
        if r['ok']:
            bot_id = r['user_id']
        else:
            #  Something went wrong
            raise RuntimeError('Could not retrieve bot ID: {}'.format(
                r['error']))

        # Bot should react to: @<BotID> <command> but not to <...> @<BotID> <...>
        # self._self_mention = _re_compile(r"^<@{}>".format(bot_id))
        self._self_mention = _re_compile(
            r"^(<@{}>) (\w*) ?(.*)".format(bot_id))
Example #4
    while i != 0:
        xynames.append(_odr.XYCurves.Name())
        i = _odr.XYCurves.Next()

    return xynames


# </editor-fold>

# <editor-fold desc="Loads and declarations">
_this_module = _sys_modules[__name__]
_classmap = _get_classmap()

_command_manager = stdout_redirected

_DISKFULL_RE = _re_compile('Disk Full')

_ID = 'OpendssdirectEnhancer'
setattr(_this_module, 'utils', _odr.utils)

# Load the chains of functions through which to pass the raw outputs of opendss.
with open(TREATMENTS_PATH, 'r') as _tfile:
    _rtrt = _json_load(_tfile)
_trt = dict()
for _subdic_name, _subdic in _rtrt.items():
    _nsd = {
        k: tuple([globals()[_t] for _t in _v])
        for k, _v in _subdic.items()
    }
    _trt[_subdic_name] = _nsd
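
# A minimal standalone sketch (names and the file shape are hypothetical, not
# the real opendss treatments) of the resolution the loop above performs:
# every leaf list of names is turned into a tuple of callables via globals().
def _example_scale(x):
    return x * 10

_example_raw = {'XYCurves': {'Points': ['_example_scale']}}
_example_trt = {
    name: {k: tuple(globals()[t] for t in v) for k, v in sub.items()}
    for name, sub in _example_raw.items()
}
assert _example_trt['XYCurves']['Points'] == (_example_scale,)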
Example #5
__author__ = 'Dug Song <*****@*****.**>'
__copyright__ = 'Copyright (c) 2004 Dug Song'
__license__ = 'BSD'
__url__ = 'http://monkey.org/~dugsong/dpkt/'
__version__ = '1.2'

try:
    from itertools import izip as _it_izip
except ImportError:
    _it_izip = zip

from struct import calcsize as _st_calcsize, \
     pack as _st_pack, unpack as _st_unpack, error as _st_error
from re import compile as _re_compile

intchr = _re_compile(r"(?P<int>[0-9]+)(?P<chr>.)")


class MetaPacket(type):
    def __new__(cls, clsname, clsbases, clsdict):
        if '__hdr__' in clsdict:
            st = clsdict['__hdr__']
            clsdict['__hdr_fields__'] = [x[0] for x in st]
            clsdict['__hdr_fmt__'] = clsdict.get('__byte_order__', '>') + \
                ''.join([ x[1] for x in st ])
            clsdict['__hdr_len__'] = _st_calcsize(clsdict['__hdr_fmt__'])
            clsdict['__hdr_defaults__'] = \
                dict(zip(clsdict['__hdr_fields__'], [ x[2] for x in st ]))
            clsdict['__slots__'] = clsdict['__hdr_fields__']
        return type.__new__(cls, clsname, clsbases, clsdict)
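
# A minimal sketch (not part of dpkt; the Python 3 metaclass syntax and the
# field values here are assumptions) of what MetaPacket derives from __hdr__:
class _EchoHeader(metaclass=MetaPacket):
    __hdr__ = (
        ('type', 'B', 8),   # field name, struct format char, default value
        ('code', 'B', 0),
        ('id',   'H', 0),
    )

assert _EchoHeader.__hdr_fmt__ == '>BBH'
assert _EchoHeader.__hdr_len__ == 4
assert _EchoHeader.__hdr_defaults__ == {'type': 8, 'code': 0, 'id': 0}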
Example #6
def scrape_talks(talk_links, outfile, skippedfile, start_at=0):
    '''
    Scrapes the talk pages for a TED talk, getting information and the
    transcript tokenized by cue cards.
    Returns nothing.
    Takes list of talk links, filepath to output of scrapes, and filepath to
    output of skipped talks.
    '''

    for index, talk_link in enumerate(talk_links):

        # Allow for starting part way through the list in case of exception
        if index < start_at:
            continue

        ### Get information about the talk ###

        talk_res = requests.get(base_url + talk_link)
        talk_page_html = talk_res.text
        talk_soup = BeautifulSoup(talk_page_html, "html.parser")

        # Scrape the <script> that contains the "__INITIAL_DATA__" object
        # Maybe I should scrape using javascript... this should already be loaded in as an object
        # ...
        # Just tested, web console showed _q as empty, couldn't locate INITIAL object.
        # _q and q() are being used by an external script with the use of eval.
        # Stick with Python here.
        script = talk_soup.find("script",
                                string=_re_compile("__INITIAL_DATA__"))

        # Take the text after the "__INITIAL_DATA__" declaration, dropping the
        # trailing newline and the outer object's closing braces
        talk_data_string = script.text.split("\"__INITIAL_DATA__\":")[1][:-3]

        talk_data = json.loads(talk_data_string)
        talk = talk_data["talks"][0]

        # If multiple speakers, a performance, or grabbed from the web (no transcript), just skip to the next talk
        if (len(talk["speakers"]) > 1) or ("performance" in talk["tags"]) or (
                talk["video_type"]["id"] == '5'):
            # Log talk name and primary speaker so I can check to make sure I'm not
            # skipping something that I shouldn't!
            print("Skipping", talk["title"])
            with open(skippedfile, 'a', encoding="utf-8") as file:
                file.write(
                    str((talk["title"], talk["id"], talk["speaker_name"],
                         talk_link)) + '\n')

            # Remember to sleep
            if (index + 1) % 5 == 0:
                print("Scraped", index + 1,
                      "TALKS so far, taking a little break.")
                _time_sleep(60)

            continue

        # Add talk information
        data = {}
        data["title"] = talk["title"]
        data["talk_id"] = talk["id"]
        data["speaker"] = talk["speaker_name"]
        data["speaker_id"] = talk["speakers"][0]["id"]
        data["num_views"] = talk["viewed_count"]
        data["num_comments"] = talk_data["comments"]["count"]
        data["date"] = talk["recorded_at"]
        data["tags"] = talk["tags"]
        data["categories"] = talk["ratings"]
        data["language"] = talk_data["language"]
        data["duration"] = talk["duration"]
        data["event"] = talk_data["event"]

        ### Get the transcript ###

        # I have two options here: I could request the transcript page itself,
        # which handles parsing the JSON that stores the transcript, and then
        # very easily scrape the <p> tags with attribute "dir": "ltr".
        #
        # Or I can request just the JSON and parse it myself.
        #
        # Scraping the page would be much simpler, but I think parsing should be
        # fast enough for scraping purposes.

        # The splitting of the transcript into cues may be useful for tokenization.
        # I'll store the text as a list of cue texts.  I can always concatenate
        # and tokenize again later if I want.
        transc_res = requests.get(base_url + "/talks/" + data["talk_id"] +
                                  transcript_url_extension)
        transcript_json = transc_res.json()

        tokens = []

        try:
            # Get list of paragraphs from transcript
            for paragraph in transcript_json["paragraphs"]:
                # Each paragraph contains a list of cue sets
                for cue in paragraph["cues"]:
                    # Append text from the cue to our list of tokens
                    tokens.append(cue["text"])

            data["tokens"] = tokens
        except KeyError:
            print("Skipping", talk["title"])
            with open(skippedfile, 'a', encoding="utf-8") as file:
                file.write(
                    str((talk["title"], talk["id"], talk["speaker_name"],
                         talk_link)) + '\n')

            # Remember to sleep
            if (index + 1) % 5 == 0:
                print("Scraped", index + 1,
                      "TALKS so far, taking a little break.")
                _time_sleep(60)

            continue

        # Debug print
        print("Just scraped \"", data["title"], "\" Now writing.")

        ### Append data of this talk to file ###

        # Open and append to the file each time to conserve memory and so that
        # a thrown exception doesn't destroy progress!

        # Every line of this file will contain a dictionary object for one talk

        with open(outfile, 'a', encoding="utf-8") as talk_file:
            talk_file.write(str(data) + '\n')

        # Debug
        print("Written.")

        ### Add some rest after every fifth scrape ###
        # As mentioned above, too many requests lead to a rate limit
        #
        # Ran into an issue with the transcript this time, need to wait longer.
        #
        # Previously paused every ninth request; pausing more frequently now to hopefully avoid it
        #
        # Still didn't work. Ran into a stricter limit this time (7). Response
        # does not have headers that reveal the limit!  Looking into solutions
        # besides simply a longer sleep.
        #
        # Next day, decided that it doesn't matter if I just leave this scrape
        # running, I can wait between scrapes no worries.

        if (index + 1) % 5 == 0:
            print("Scraped", index + 1, "TALKS so far, taking a little break.")
            _time_sleep(60)
Example #7
from re import compile as _re_compile
from inspect import (currentframe as _curr_frame, getframeinfo as
                     _get_frame_info)


def is_pos_int(obj):
    return type(obj) is int and obj >= 1


_valid_pattern = _re_compile(r"^[a-zA-Z_]\w*$")


def is_valid_name(obj):
    return (type(obj) is str) and (_valid_pattern.match(obj) is not None)
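
# A quick sanity-check sketch (not part of the original snippet) of what the
# two predicates above accept.
assert is_pos_int(3) and not is_pos_int(0) and not is_pos_int(2.0)
assert is_valid_name('x_1') and not is_valid_name('1x') and not is_valid_name('')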


class Sym:
    _unq_count = 1

    def __init__(self, nm):
        if not is_valid_name(nm):
            raise TypeError(f"expected an alphanumeric name string, "
                            f"but got '{nm}'")
        self._nm = nm
        self._id = Sym._unq_count
        Sym._unq_count += 1

    def __str__(self):
        return self._nm

    def __repr__(self):
Example #8
    and message.
    """
    pass

class StopGeneration(Exception):
    """StopGeneration: Raised when the top-level soundscape reaches its
    end -- no more agents or sounds to be run.
    """
    pass

from re import compile as _re_compile

# Regular expression for valid event/property names: one or more elements,
# separated by periods. Each element must contain only letters, digits,
# and underscores. An element may not start with a digit.
_prop_name_regexp = _re_compile(r'\A[a-zA-Z_][a-zA-Z_0-9]*(\.([a-zA-Z_][a-zA-Z_0-9]*))*\Z')
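
# A quick illustration (not from the original module) of which names the
# pattern accepts: dot-separated elements of letters, digits, and underscores,
# where no element starts with a digit.
assert _prop_name_regexp.match('voice.pitch')
assert _prop_name_regexp.match('_env1')
assert not _prop_name_regexp.match('note.3rd')
assert not _prop_name_regexp.match('9lives')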

# A cache of valid event/property names. We keep this so that we don't
# have to regexp them every time.
_valid_prop_names = {}

def check_prop_name(val):
    """check_prop_name(val) -> str

    Ensure that the value is a valid event or property name. If it isn't, 
    raise BoodlerError. If it is, return a str version of it (in case it 
    was a unicode object).
    """
    
    res = _valid_prop_names.get(val)
    if (res):
Example #9
import unittest
from ATL import *

import numpy as np
import random
import time

from functools import wraps
from collections import namedtuple

from re import compile as _re_compile
_data_method_RE = _re_compile("^data_")
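
# A quick illustration (not part of the original test file) of the method-name
# convention the regex above selects:
assert _data_method_RE.match('data_simple_case')
assert not _data_method_RE.match('gen_func')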

# --------------------------------------------------------------------------- #
# --------------------------------------------------------------------------- #

DataCase    = namedtuple('DataCase',['name','input','output'])
InputCase   = namedtuple('InputCase',['name','input'])
DInputCase  = namedtuple('DInputCase',['name','input','dinput'])
DInOutCase  = namedtuple('DInOutCase',['name','input','dinput','doutput'])

class FunctionTestCase:
  """ A testing mixin that should be used alongside unittest.TestCase
  Expects gen_func() to be defined returning an ATL function.  Then every
  method with name data_*() is expected to return a sample input/output
  data pair for testing the function with.
  """

  def _init_rand(self):
    if not hasattr(self,'rand'):
Example #10
__author__ = 'Dug Song <*****@*****.**>'
__copyright__ = 'Copyright (c) 2004 Dug Song'
__license__ = 'BSD'
__url__ = 'http://monkey.org/~dugsong/dpkt/'
__version__ = '1.2'

try:
    from itertools import izip as _it_izip
except ImportError:
    _it_izip = zip
    
from struct import calcsize as _st_calcsize, \
     pack as _st_pack, unpack as _st_unpack, error as _st_error
from re import compile as _re_compile

intchr = _re_compile(r"(?P<int>[0-9]+)(?P<chr>.)")

class MetaPacket(type):
    def __new__(cls, clsname, clsbases, clsdict):
        if '__hdr__' in clsdict:
            st = clsdict['__hdr__']
            clsdict['__hdr_fields__'] = [ x[0] for x in st ]
            clsdict['__hdr_fmt__'] = clsdict.get('__byte_order__', '>') + \
                ''.join([ x[1] for x in st ])
            clsdict['__hdr_len__'] = _st_calcsize(clsdict['__hdr_fmt__'])
            clsdict['__hdr_defaults__'] = \
                dict(zip(clsdict['__hdr_fields__'], [ x[2] for x in st ]))
            clsdict['__slots__'] = clsdict['__hdr_fields__']
        return type.__new__(cls, clsname, clsbases, clsdict)
                        
class Packet(object):
Example #11
def scrape_talks(filepath, outfile):
    '''
    Scrapes the talk pages for a TED talk, getting information and the
    transcript tokenized by cue cards.
    Returns nothing.
    Takes the filepath of the talk-link list and the filepath for the JSON
    output of the scrapes.
    '''

    with open(filepath, 'r') as file:
        talk_links = file.read()
        talk_links = talk_links.split("\n")[:-1]

    collection = {}

    for index, talk_link in enumerate(talk_links):

        ### Get information about the talk ###

        talk_page_html = requests.get(base_url + talk_link,
                                      headers=headers_talklinks).text
        talk_soup = BeautifulSoup(talk_page_html, "html.parser")

        # Scrape the <script> that contains the "__INITIAL_DATA__" object

        script = talk_soup.find("script",
                                string=_re_compile("__INITIAL_DATA__"))

        # Take the text after the "__INITIAL_DATA__" declaration, dropping the
        # trailing newline and the outer object's closing braces
        talk_data_string = script.text.split("\"__INITIAL_DATA__\":")[1][:-3]
        talk_data = json.loads(talk_data_string)
        talk = talk_data["talks"][0]

        data = {}

        # I couldn't find a way to extract the comments from www.ted.com.
        # But anyway, all comments on www.ted.com seem to be positive (only people
        # who love TED talks go to www.ted.com), so I only use the comments from
        # YouTube (and YouTube is much more popular across the world).

        data["title"] = talk["title"]
        data["speaker"] = talk["speaker_name"]
        data["description"] = talk["description"]
        data["date"] = talk["recorded_at"][:10]
        data["duration"] = talk["duration"]
        data["thumbnails"] = talk["player_talks"][0]["thumb"]
        data["tags"] = talk["tags"]
        data["num_views"] = talk["viewed_count"]
        data["num_comments"] = talk_data["comments"]["count"] if talk_data[
            "comments"] is not None else 0
        data["num_transcripts"] = len(talk["downloads"]["languages"])
        if data["num_transcripts"] != 0:
            data["transcript_language"] = [
                d['endonym'] for d in talk["downloads"]["languages"]
            ]
        else:
            data["transcript_language"] = []
        data["categories"] = talk["ratings"]
        data["event"] = talk_data["event"]
        data["talk_link"] = base_url + talk_link

        ### Get the transcript ###
        if data["num_transcripts"] != 0:
            transc = requests.get(base_url + talk_link +
                                  transcript_url_extension,
                                  headers=headers_talklinks)

            transc = transc.json()
            if "paragraphs" in transc:
                for t in transc["paragraphs"]:
                    for cue in t["cues"]:
                        if "transcript" not in data:
                            data["transcript"] = cue["text"]
                        else:
                            data["transcript"] = data[
                                "transcript"] + " " + cue["text"]
            else:
                data["transcript"] = ""

        else:
            data["transcript"] = ""

        collection[str(index + 1)] = data

    with open(outfile, "w") as f:
        json.dump(collection, f, indent=4)
Example #12
import logging
from email.utils import getaddresses, formataddr
from re import compile as _re_compile
from xoutil.future.itertools import map
from xoutil.future.functools import curry

from xoeuf.odoo.addons.xopgi_mail_threads import MailRouter, MailTransportRouter
from xoeuf.odoo.addons.base.ir.ir_mail_server import encode_rfc2822_address_header  # noqa

from xoeuf import models, fields

logger = logging.getLogger(__name__)

EVANEOS_REGEXP = _re_compile(
    r'_(?P<thread>\d+)(?P<uuid>[_-][^@]+)?(?P<host>@.*(?<=[@\.])evaneos\.com)$'
)
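
# A quick standalone check (the address is hypothetical) of the named groups
# EVANEOS_REGEXP extracts from a matching sender.
_m = EVANEOS_REGEXP.search('agency_12345_abcdef@partner.evaneos.com')
assert _m.group('thread') == '12345'
assert _m.group('uuid') == '_abcdef'
assert _m.group('host') == '@partner.evaneos.com'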


class MATCH_TYPE:
    SENDER = 0
    RECIPIENT = 1


class Message(models.Model):
    _inherit = 'mail.message'
    # Make email_from create an index; the 'search' in the router is slow
    # without it.
    email_from = fields.Char(index=True)

Example #13
def match_path_by_pattern(pattern, path):
    # Match a file path against a glob pattern, e.g. pattern "/root/*.txt"
    # and path "/root/model.txt".
    regex = _fn_translate(pattern)
    reobj = _re_compile(regex)
    return reobj.match(path)
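
# A minimal usage sketch, with the imports the snippet above assumes
# (_fn_translate is taken to be fnmatch.translate).
from fnmatch import translate as _fn_translate
from re import compile as _re_compile

assert match_path_by_pattern('/root/*.txt', '/root/model.txt')
assert not match_path_by_pattern('/root/*.txt', '/root/model.csv')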
Example #14
f = logging.Formatter('%(relativeCreated)d, %(message)s')
sh = logging.StreamHandler()
sh.setFormatter(f)
fh = logging.FileHandler(filename='11112.log', mode='w', encoding='utf8')
fh.setFormatter(f)

__log__.addHandler(sh)
__log__.addHandler(fh)
del f
del sh
del fh

###############################################################################

REGEX_SEARCH_IMAGE = _re_compile(
    r'<meta data-vue-meta="true" itemprop="image" content="([^"]+)"/>')
REGEX_VIDEO_IDENT = _re_compile(r'av\d+')
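
# A quick standalone check (the URL and meta tag are hypothetical) of what the
# two patterns above pick out.
assert REGEX_VIDEO_IDENT.search('https://www.bilibili.com/video/av170001').group(0) == 'av170001'
_meta = '<meta data-vue-meta="true" itemprop="image" content="https://example.com/cover.jpg"/>'
assert REGEX_SEARCH_IMAGE.search(_meta).group(1) == 'https://example.com/cover.jpg'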

DICT_HEADERS = {
    'User-Agent':
    'Mozilla/5.0(Macintosh;U;IntelMacOSX10_6_8;en-us)'
    'AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50'
}

STRING_URL_BASE = 'https://search.bilibili.com/all?'
EVENT_ALL_DONE = threading.Event()

INT_TIME_OUT = 12


class MyThread(threading.Thread):