def _check_hostname(name):
    """Validate that *name* is an endpoint URL of the form
    http(s)://<hostname>[:<port>].

    Delegates basic type checking to _check_endpoint_type(), then matches
    the URL shape; raises ValueError when it does not conform.
    """
    _check_endpoint_type(name)
    # Raw string: '\.' in a plain string is an invalid escape (DeprecationWarning
    # in Python 3.6+).  Dots are allowed so dotted hostnames match.
    hostname_checker = _compile(r'^http(s)?://[a-zA-Z0-9-_\.]+(:[0-9]+)?$')
    if not hostname_checker.match(name):
        raise ValueError('endpoint name {} should be in http(s)://<hostname>[:<port>] and hostname may consist only of:'
                         ' a-z, A-Z, 0-9, underscore and hyphens.'.format(name))
def convert(rome):
    """Accepts a Roman number in string format and returns an integer
    holding the value it represents.

    The regex checks, in portions, whether each part of the string can be a
    Roman number.  The ordering of the match groups also checks plausibility:
    thousands, then hundreds, tens and ones, so e.g. a hundreds code cannot
    come after a tens code.

    The idea comes from Mark Pilgrim's "Dive Into Python" book, tweaked for
    re-usability.  Exception messages describe concisely what is wrong with
    the provided Roman number.

    Raises InvalidRoman for malformed input or values of 4000 and above.
    """
    # '*' replaces the original '{,}' quantifier — identical semantics
    # ({0,infinity}) but unambiguous and portable.
    match = _compile(r"^\s*(M*)(CM|CD|D?C*)(XC|XL|L?X*)(IX|IV|V?I*)\s*$").search(rome.upper())
    if not match:
        raise InvalidRoman("{} is not a valid Roman number.".format(rome))
    # The original used filter(); under Python 3 a filter object is always
    # truthy and not subscriptable, so every valid numeral raised TypeError.
    offenders = [group for group in match.groups() if len(group) > 3]
    if offenders:
        if 'M' in offenders[0]:
            # Only numbers below 4000 are supported, hence the distinct message.
            raise InvalidRoman("{} is not a supported Roman number".format(rome))
        else:
            raise InvalidRoman("{} is not a valid Roman number.".format(rome))
    return __assess(match.groups())
def import_raw_svg(raw_svg=maps_dir + "raw_svg/core.svg"):
    """parse an svg from the patched version of bigg and save it in the maps directory"""
    # Matches any non-empty attribute value (used to select marked paths).
    non_empty = _compile(".")
    with open(raw_svg) as infile:
        svg = _SVGsoup(infile)
    rxn_layer = svg.findChild(name="g", id="Layer_rxn")
    met_layer = svg.findChild(name="g", id="Layer_met")
    # Strip per-reaction presentation attributes so styling can later be
    # driven by the stylesheet element added below.
    for svg_rxn in rxn_layer.findChildren(name="g", recursive=False):
        del(svg_rxn["stroke"])
        del(svg_rxn.a["xlink:href"])
        # Replace arrow-marker attributes with CSS classes "end"/"start".
        for path in svg_rxn.findChildren(name="path", attrs={"marker-end": non_empty}):
            del(path["marker-end"])
            path["class"] = "end"
        for path in svg_rxn.findChildren(name="path", attrs={"marker-start": non_empty}):
            del(path["marker-start"])
            path["class"] = "start"
    # Metabolite groups only lose their hyperlinks.
    for met_rxn in met_layer.findChildren(name="g", recursive=False):
        del(met_rxn.a["xlink:href"])
    # Empty <style id="object_styles"> element — presumably filled in later
    # by the map-rendering code (TODO confirm against caller).
    rxn_colors = _Tag(svg, name="style")
    rxn_colors["id"] = "object_styles"
    svg.defs.append(rxn_colors)
    # write the processed file out to the maps directory
    with open(maps_dir + _path.split(raw_svg)[1], "w") as outfile:
        outfile.write(str(svg))
def apply_solution(self, flux_dict, color_map=default_color_map):
    """Color the map according to a flux solution.

    flux_dict -- mapping of reaction id -> flux value
    color_map -- callable mapping a 0..1 scale value to a color
    Returns self (fluent style).
    """
    self.object_styles.clear()
    # Only reactions that are both on the map and in the solution.
    fluxes = dict((i, flux_dict[i]) for i in self.included_reactions.intersection(flux_dict))
    # Cap magnitudes at 20 so one huge flux does not flatten the color scale.
    # .values()/.items() (not itervalues/iteritems) work on Python 2 and 3.
    abs_fluxes = [min(abs(i), 20) for i in fluxes.values()]
    x_min = min(abs_fluxes)
    x_max = max(abs_fluxes)
    # Guard the zero span (all fluxes equal) that previously divided by zero.
    span = (x_max - x_min) or 1
    scale_func = lambda value: min(1, (abs(value) - x_min) / span * 3)
    for reaction, value in fluxes.items():
        # Append the flux value to the reaction tooltip.
        self._rxn_layer.findChild("g", id=reaction).title.string += "\n%.2f" % (value)
        try:
            t = _Tag(name="title")
            t.string = "%.2f" % (value)
            self._rxn_label_layer.findChild(name="text", text=_compile(reaction)).append(t)
        except Exception:
            # Best effort: not every reaction has a text label on the map.
            pass
        if str(reaction) in self.included_reactions:
            self.set_object_color(str(reaction), color_map(scale_func(value)))
        # Hide the arrowhead on the inactive direction.
        if value < 0:
            self.object_styles["%s .end" % str(reaction)] = {"marker-end": "none"}
        if value > 0:
            self.object_styles["%s .start" % str(reaction)] = {"marker-start": "none"}
    # Reactions with no flux value are painted black.
    for reaction in self.included_reactions.difference(flux_dict.keys()):
        self.set_object_color(reaction, (0, 0, 0))
    self._update_svg()
    return self
def _get_new_urls(self, page_url, soup): new_urls = set() links = soup.find_all('a', herf=re._compile(r"/view/d+\.htm")) for link in links: new_url = link['herf'] new_full_url = urlparse.urljoin(page_url, new_url) new_urls.add(new_full_url) return new_urls
def get_urls(url_str):
    """Fetch *url_str* and return every href target found in its <a> tags."""
    page_source = urlopen(url_str).read()
    # The href value may be single-, double- or un-quoted; the conditional
    # group (?(quote)...) requires a closing quote only when one was opened.
    anchor_pattern = _compile(
        '''<a\s+href=(?P<quote>["'])?(?P<obf>[^"']+)(?(quote)(?P=quote))[^>]*?>''',
        IGNORECASE | VERBOSE)
    return [found.group('obf') for found in anchor_pattern.finditer(page_source)]
def _assertRegex(self, text, expected_regexp, msg=None):
    """Fail the test unless the text matches the regular expression.
    """
    # A plain string pattern is compiled on the fly.
    if isinstance(expected_regexp, _basestring):
        expected_regexp = _compile(expected_regexp)
    if expected_regexp.search(text):
        return
    failure = '%s: %r not found in %r' % (
        msg or "Regexp didn't match", expected_regexp.pattern, text)
    raise self.failureException(failure)
def enableBranches(self, branches):
    '''
    Add one or more items to the list of branches that are needed
    for this set of cuts.
    '''
    # Accept a single pattern string as a convenience.
    if isinstance(branches, str):
        branches = [branches]
    # Each entry is stored as a compiled regular expression.
    self.branchesNeeded.extend(_compile(pattern) for pattern in branches)
def _assertNotRegex(self, text, unexpected_regexp, msg=None):
    """Fail the test if the text matches the regular expression.
    """
    # A plain string pattern is compiled on the fly.
    if isinstance(unexpected_regexp, _basestring):
        unexpected_regexp = _compile(unexpected_regexp)
    found = unexpected_regexp.search(text)
    if found:
        base = msg or 'Regexp matched'
        raise self.failureException(
            '%s: %r matches %r in %r' % (base,
                                         text[found.start():found.end()],
                                         unexpected_regexp.pattern,
                                         text))
def init(dllpath = None, root = "C:\\", bypass_check=False):
    """ Initialize the underlying tos-databridge DLL

    dllpath: string of the exact path of the DLL
    root: string of the directory to start walking/searching to find the DLL

    Returns True once the DLL is loaded; raises TOSDB_CLibError on failure.
    """
    global _dll
    rel = set()
    # With all-default arguments, warn the user before scanning the whole drive.
    if not bypass_check and dllpath is None and root == "C:\\":
        if abort_init_after_warn():
            return
    try:
        if dllpath is None:
            matcher = _partial( _match, _REGEX_DLL_NAME)  # regex match function
            for nfile in map( matcher, _listdir( _curdir )):
                if nfile:  # try the current dir first
                    rel.add( _curdir+ _sep + nfile.string )
            if not rel:
                for root,dirs, files in _walk(root):  # no luck, walk the dir tree
                    for file in map( matcher, files):
                        if file:
                            rel.add( root + _sep + file.string )
                if not rel:  # if still nothing throw
                    raise TOSDB_Error(" could not locate DLL")
            if len(rel) > 1:  # only use the most recent version(s)
                # version fragment looks like '-MAJ.MIN-' in the DLL file name
                ver = _compile('-[\d]{1,2}.[\d]{1,2}-')
                vers = tuple( zip( map( lambda x: _search(ver,x).group().strip('-'), rel), rel) )
                vers_max = max(vers)[0].split('.')[0]
                # keep only entries with the highest major, then highest minor
                mtup = tuple( (x[0].split('.')[1],x[1]) for x in vers if x[0].split('.')[0] == vers_max)
                mtup_max = max(mtup)[0]
                rel = set( x[1] for x in mtup if x[0] == mtup_max )
            # find the most recently updated (by file modification time)
            d = dict( zip(map( lambda x : _stat(x).st_mtime, rel), rel ) )
            rec = max(d)
            dllpath = d[ rec ]
        _dll = _WinDLL( dllpath )
        print( "+ Using Module ", dllpath )
        print( "+ Last Update ", _asctime(_localtime(_stat(dllpath).st_mtime)))
        if connect():
            print("+ Succesfully Connected to Service \ Engine")
        else:
            print("- Failed to Connect to Service \ Engine")
        return True # indicate the lib was loaded
    except Exception as e:
        raise TOSDB_CLibError( "unable to initialize library", e )
from django.contrib.sessions.middleware import SessionMiddleware from django.http import HttpResponseForbidden, HttpResponseRedirect from amsp import settings from re import compile as _compile __author__ = "mohammad" EXEMPT_URLS = [_compile(settings.LOGIN_URL.lstrip("/"))] if hasattr(settings, "LOGIN_EXEMPT_URLS"): EXEMPT_URLS += [_compile(expr) for expr in settings.LOGIN_EXEMPT_URLS] class LoginRequiredMiddleware(SessionMiddleware): """ Middleware that requires a user to be authenticated to view any page other than LOGIN_URL. Exemptions to this requirement can optionally be specified in settings via a list of regular expressions in LOGIN_EXEMPT_URLS (which you can copy from your urls.py). Requires authentication middleware and template context processors to be loaded. You'll get an error if they aren't. """ def process_response(self, request, response): assert hasattr( request, "user" ), "The Login Required middleware\ requires authentication middleware to be installed. Edit your\ MIDDLEWARE_CLASSES setting to insert\ 'django.contrib.auth.middlware.AuthenticationMiddleware'. If that doesn't\
<a href="?99929___" class="">猴头菇</a> <a href="?99933___" class="">草菇</a> <a href="?99936___" class="">竹笋</a> <a href="?99937___" class="">冬笋</a> <a href="?99938___" class="">黄豆芽</a> <a href="?99939___" class="">绿豆芽</a> <a href="?99940___" class="">玉米棒</a> """ res_vage_number = r'<a href="(.*)" class=' res_vage_number_com = re._compile(res_vage_number,0) res_vage_name = '<a href=".*" class="">(.*)</a>' res_vage_name_com = re._compile(res_vage_name,0) res_pro_number = r'<a href="(.*)" class=' res_pro_number_com = re._compile(res_pro_number,0) res_pro_name = '<a href=".*" class="">(.*)</a>' res_pro_name_com = re._compile(res_pro_name,0) get_data_vage_number = re.findall(res_vage_number_com,str(get_data_vage)) get_data_vage_name = re.findall(res_vage_name_com,str(get_data_vage)) get_data_pro_number = re.findall(res_pro_number_com,str(get_data_pro)) get_data_pro_name = re.findall(res_pro_name_com,str(get_data_pro)) with open(r'C:\Users\Attack\Desktop\vage.txt','w+') as f:# C:\Users\Attack\Desktop\name_price.txt d = dict(zip(get_data_vage_name,get_data_vage_number)) d = pickle.dumps(d) f.write(d) with open(r'C:\Users\Attack\Desktop\pro.txt','w+') as f:# C:\Users\Attack\Desktop\name_price.txt
def update_event(self, inp=-1):
    """Recompute the node output: compile input 0 (pattern) with input 1
    (flags) and publish the resulting regex object on output 0.

    inp -- index of the input that triggered the update (unused here).
    """
    # re.compile is the public API; re._compile is a private helper with the
    # same (pattern, flags) signature and is not guaranteed to stay stable.
    self.set_output_val(0, re.compile(self.input(0), self.input(1)))
def _add_or_update_endpoint(self, action, name, version, request_data):
    """ Add or update an endpoint

    Tornado generator coroutine (note the ``yield`` for the staging copy).
    action -- "add" to create the endpoint, anything else updates it
    name -- endpoint name (letters, digits, underscore, hyphen, space)
    version -- endpoint version written into the state file
    request_data -- request body dict; recognised keys: description,
        docstring, type, methods, dependencies, target, schema, src_path
    Raises gen.Return with an error message for invalid input; raises
    TypeError / RuntimeError for a bad name type or a concurrent update.
    """
    self.logger.log(logging.DEBUG, f"Adding/updating model {name}...")

    _name_checker = _compile(r"^[a-zA-Z0-9-_\s]+$")
    if not isinstance(name, str):
        msg = "Endpoint name must be a string"
        self.logger.log(logging.CRITICAL, msg)
        raise TypeError(msg)

    if not _name_checker.match(name):
        raise gen.Return("endpoint name can only contain: a-z, A-Z, 0-9,"
                         " underscore, hyphens and spaces.")

    # Only one endpoint update may run at a time; the flag doubles as a lock.
    if self.settings.get("add_or_updating_endpoint"):
        msg = ("Another endpoint update is already in progress"
               ", please wait a while and try again")
        self.logger.log(logging.CRITICAL, msg)
        raise RuntimeError(msg)

    request_uuid = random_uuid()
    self.settings["add_or_updating_endpoint"] = request_uuid
    try:
        description = (request_data["description"]
                       if "description" in request_data else None)
        if "docstring" in request_data:
            # Un-escape the docstring sent over the wire (e.g. '\\n' -> '\n').
            docstring = str(
                bytes(request_data["docstring"],
                      "utf-8").decode("unicode_escape"))
        else:
            docstring = None
        endpoint_type = request_data[
            "type"] if "type" in request_data else None
        methods = request_data[
            "methods"] if "methods" in request_data else []
        dependencies = (request_data["dependencies"]
                        if "dependencies" in request_data else None)
        target = request_data[
            "target"] if "target" in request_data else None
        schema = request_data[
            "schema"] if "schema" in request_data else None
        src_path = request_data[
            "src_path"] if "src_path" in request_data else None
        target_path = get_query_object_path(
            self.settings[SettingsParameters.StateFilePath], name, version)
        self.logger.log(logging.DEBUG, f"Checking source path {src_path}...")
        # Windows and POSIX path characters are both accepted.
        _path_checker = _compile(r"^[\\\:a-zA-Z0-9-_~\s/\.\(\)]+$")
        # copy from staging
        if src_path:
            if not isinstance(request_data["src_path"], str):
                raise gen.Return("src_path must be a string.")
            if not _path_checker.match(src_path):
                raise gen.Return(
                    "Endpoint source path name can only contain: "
                    "a-z, A-Z, 0-9, underscore, hyphens and spaces.")
            yield self._copy_po_future(src_path, target_path)
        elif endpoint_type != "alias":
            raise gen.Return("src_path is required to add/update an "
                             "endpoint.")
        # alias special logic: an alias depends only on its target endpoint.
        if endpoint_type == "alias":
            if not target:
                raise gen.Return("Target is required for alias endpoint.")
            dependencies = [target]
        # update local config
        try:
            if action == "add":
                self.tabpy_state.add_endpoint(
                    name=name,
                    description=description,
                    docstring=docstring,
                    endpoint_type=endpoint_type,
                    methods=methods,
                    dependencies=dependencies,
                    target=target,
                    schema=schema,
                )
            else:
                self.tabpy_state.update_endpoint(
                    name=name,
                    description=description,
                    docstring=docstring,
                    endpoint_type=endpoint_type,
                    methods=methods,
                    dependencies=dependencies,
                    target=target,
                    schema=schema,
                    version=version,
                )
        except Exception as e:
            raise gen.Return(f"Error when changing TabPy state: {e}")
        on_state_change(self.settings, self.tabpy_state,
                        self.python_service, self.logger)
    finally:
        # Always release the update "lock", even on failure.
        self.settings["add_or_updating_endpoint"] = None
# *-*coding:utf-8*-*
'''
@version:python2.7.11
@author:Attack
@time: 2016/2/17 001719:54

Scrape vegetable names and their link fragments from jiage.shucaiyuan.com
and append "name:number" lines to a local text file.  (Python 2 script.)
'''
import re
import urllib2
import sys

url = 'http://jiage.shucaiyuan.com/'
html = urllib2.urlopen(url)
data = html.read()

# re.compile is the public API; re._compile is a private helper with the same
# (pattern, flags) signature.
res_number = r'<a href="(.*)" class='
res_number_com = re.compile(res_number, 0)
res_name = '<a href=".*" class="">(.*)</a>'
res_name_com = re.compile(res_name, 0)
get_data_number = re.findall(res_number_com, data)
get_data_name = re.findall(res_name_com, data)

# Decode the scraped names from utf-8 so the Chinese text round-trips.
new = []
for i in get_data_name:
    i = i.decode('utf-8')
    new.append(i)
d = dict(zip(new, get_data_number))

# reload() restores sys.setdefaultencoding, which site.py removes at startup.
reload(sys)
sys.setdefaultencoding("utf-8")
with open(r'C:\Users\liuyunhai\Desktop\name_price.txt', 'r+') as f:
    for k, v in d.items():
        key = str(k) + ':' + str(v) + '\n'
        f.write(key)
from urllib.parse import (urlparse, urljoin)
from html import unescape
from re import compile as _compile

from requests import RequestException

from .request import request
from .status import Status
from .exceptions import (
    DeadlinksIgnoredURL,
    DeadlinksRedirectionURL,
)

# -- Constants -----------------------------------------------------------------
# Matches an opening <a ...> tag, capturing its attribute string.
__RE_LINKS__ = _compile(r'<a\s{1}([^>]+)>')  # pylint: disable=W1401

# filters
CLEANER = lambda x: x.strip("\"'\n ")  # removes quotes, spaces and new lines
ANCHORS = lambda x: x.split("#")[0]  # removed part after anchor
UNESCPE = lambda x: unescape(x)  # pylint: disable=W0108


class URL:
    """ URL abstraction representation. """

    # NOTE(review): the class continues beyond this chunk; only __init__ is
    # visible here.
    def __init__(self, location: str) -> None:
        # print(urlparse(location))
        # Parsed components of the location string.
        self._url = urlparse(location)
        # Link-check status, refined later by the crawler.
        self._status = Status.UNDEFINED  # type: Status
# encoding:utf-8
#! python3
# renameDates.py - Rename filenames with American MM-DD-YYYY date format to Chinese YYYY-MM-DD
import re,shutil,os

workdir = 'C:\\Temp\\'

# Create a regex that matches files with American date format.
# re.compile is the public API (the original called the private re._compile).
# VERBOSE allows whitespace and comments inside the pattern for readability.
datePattern = re.compile(r'''^(.*?) # all text before the date
((0|1)?\d)- # One or two digits for the month
((0|1|2|3)?\d)- # One or two digits for the day
((19|20)\d\d) # Four digits for the year
(.*?)$ # all text after the date
''', re.VERBOSE)

# Loop over the files in the working directory.
for amerFilename in os.listdir(workdir):
    mo = datePattern.search(amerFilename)
    # Skip files without a date ('is None', not '== None', for the singleton).
    if mo is None:
        continue
    # Get different parts of the filename.
    beforePart = mo.group(1)
    monthPart = mo.group(2)
    dayPart = mo.group(4)
    yearPart = mo.group(6)
    afterPart = mo.group(8)
    # Form the Chinese-style filename.
    chnFilename = beforePart + yearPart + '-' + monthPart + '-' + dayPart + afterPart
    # Get the full, absolute file paths.
    amerFilename = os.path.join(workdir, amerFilename)
    chnFilename = os.path.join(workdir, chnFilename)
    # Rename the files ("Renamingg" typo fixed in the message).
    print('Renaming %s to %s...' % (amerFilename, chnFilename))
listdir as _listdir, sep as _sep from re import compile as _compile, search as _search, match as _match, \ split as _split from ctypes import WinDLL as _WinDLL, cast as _cast, pointer as _pointer, \ create_string_buffer as _BUF_, POINTER as _PTR_, c_double as _double_, \ c_float as _float_, c_ulong as _ulong_, c_long as _long_, \ c_longlong as _longlong_, c_char_p as _str_, c_char as _char_, \ c_ubyte as _uchar_, c_int as _int_, c_void_p as _pvoid_, c_uint as _uint_ _pchar_ = _PTR_(_char_) _ppchar_ = _PTR_(_pchar_) DLL_BASE_NAME = "tos-databridge" SYS_ARCH_TYPE = "x64" if (_log(_maxsize * 2, 2) > 33) else "x86" MIN_MARGIN_OF_SAFETY = 10 _REGEX_NON_ALNUM = _compile("[\W+]") _REGEX_DLL_NAME = _compile('^(' + DLL_BASE_NAME + '-)[\d]{1,2}.[\d]{1,2}-' + SYS_ARCH_TYPE + '(.dll)$') _dll = None ### we added a lock to the _call from VTOSDB_DataBlock ### how do we want to handle concurrent calls at this level ??? def init(dllpath=None, root="C:\\", bypass_check=False): """ Initialize the underlying tos-databridge DLL dllpath: string of the exact path of the DLL root: string of the directory to start walking/searching to find the DLL """
GithubRateLimitException, GithubRateLimitWarning, ) from airfs.storage.http import ( HTTPRawIO as _HTTPRawIO, HTTPBufferedIO as _HTTPBufferedIO, ) __all__ = [ "GithubRateLimitException", "GithubRateLimitWarning", "GithubRawIO", "GithubBufferedIO", ] _RAW_GITHUB = _compile(r"^https?://raw\.githubusercontent\.com") class _GithubSystem(_SystemBase): """ GitHub system. Args: storage_parameters (dict): "github.MainClass.Github" keyword arguments. """ SUPPORTS_SYMLINKS = True _SIZE_KEYS = ( "size", "Content-Length",
def GetUrl2(html):
    """Extract every thread link (href value preceding a title attribute)
    from *html* and return them as absolute tieba.baidu.com URLs.

    html -- page source to scan
    Returns a list of URL strings (empty when nothing matches).
    """
    # Fixes: '(*?)' is invalid regex (nothing to repeat) — '(.*?)' lazily
    # captures the href value; re.compile is the public API (re._compile is
    # private); and re.findall returns a *list*, so the original
    # 'prefix + list' concatenation raised TypeError.
    RegUrl = r'<a href="(.*?)" title'
    Urlre = re.compile(RegUrl)
    return ['http://tieba.baidu.com' + link for link in re.findall(Urlre, html)]
# --- tail of a save() method; the method's start lies outside this chunk ---
        n = self.name
        # Use just the basename when the configured name is an existing path.
        if _path.exists(n):
            n = _path.basename(n)
        return DataHolder([(n, d)], warn=warn)


class NumPySaver(PythonSaver):
    # Saves a dataset to a NumPy .npy file.
    def save(self, data):
        import numpy as np #@UnresolvedImport
        # A .npy file holds a single array, so anything past the first entry
        # is dropped with a warning.
        if len(list(data.items())) > 1:
            print('Only saving first dataset')
        np.save(self.name, data[0])


from re import compile as _compile
# Matches the leading numeric token of a data row: optional sign, optional
# digit, optional decimal point, then a digit.
_begin_number = _compile(r'^[-+]?[\d]?\.?\d')


class SRSLoader(PythonLoader):
    '''
    Loads an SRS dat file and returns a dataholder object
    '''

    def load(self, warn=True):
        '''
        warn -- if True (default), print warnings about key names
        Returns a DataHolder object
        '''
        # NOTE(review): load() continues beyond this chunk.
        f = open(self.name)
def _add_or_update_endpoint(self, action, name, version, request_data):
    '''
    Add or update an endpoint

    Tornado generator coroutine (note the ``yield`` for the staging copy);
    Python 2/3 compatible variant.
    action -- 'add' to create the endpoint, anything else updates it
    name -- endpoint name (letters, digits, underscore, hyphen, space)
    version -- endpoint version written into the state file
    request_data -- request body dict; recognised keys: description,
        docstring, type, methods, dependencies, target, schema, src_path
    '''
    logging.debug("Adding/updating model {}...".format(name))

    _name_checker = _compile('^[a-zA-Z0-9-_\\s]+$')
    if not isinstance(name, (str, unicode)):
        log_and_raise("Endpoint name must be a string or unicode", TypeError)

    if not _name_checker.match(name):
        raise gen.Return('endpoint name can only contain: a-z, A-Z, 0-9,'
                         ' underscore, hyphens and spaces.')

    # Only one endpoint update may run at a time; the flag doubles as a lock.
    if self.settings.get('add_or_updating_endpoint'):
        log_and_raise(
            "Another endpoint update is already in progress"
            ", please wait a while and try again", RuntimeError)

    request_uuid = random_uuid()
    self.settings['add_or_updating_endpoint'] = request_uuid
    try:
        description = (request_data['description']
                       if 'description' in request_data else None)
        if 'docstring' in request_data:
            # Un-escape the docstring sent over the wire; the escape codec
            # differs between Python 3 and Python 2.
            if sys.version_info > (3, 0):
                docstring = str(
                    bytes(request_data['docstring'],
                          "utf-8").decode('unicode_escape'))
            else:
                docstring = request_data['docstring'].decode(
                    'string_escape')
        else:
            docstring = None
        endpoint_type = (request_data['type']
                         if 'type' in request_data else None)
        methods = (request_data['methods']
                   if 'methods' in request_data else [])
        dependencies = (request_data['dependencies']
                        if 'dependencies' in request_data else None)
        target = (request_data['target']
                  if 'target' in request_data else None)
        schema = (request_data['schema']
                  if 'schema' in request_data else None)
        src_path = (request_data['src_path']
                    if 'src_path' in request_data else None)
        target_path = get_query_object_path(
            self.settings['state_file_path'], name, version)
        # NOTE(review): inside this pattern '\\a' is the BEL character, not a
        # backslash-plus-letters range — confirm Windows paths still match.
        _path_checker = _compile('^[\\a-zA-Z0-9-_\\s/]+$')
        # copy from staging
        if src_path:
            if not isinstance(request_data['src_path'], (str, unicode)):
                raise gen.Return("src_path must be a string.")
            if not _path_checker.match(src_path):
                raise gen.Return('Endpoint name can only contain: a-z, A-'
                                 'Z, 0-9,underscore, hyphens and spaces.')
            yield self._copy_po_future(src_path, target_path)
        elif endpoint_type != 'alias':
            raise gen.Return("src_path is required to add/update an "
                             "endpoint.")
        # alias special logic: an alias depends only on its target endpoint.
        if endpoint_type == 'alias':
            if not target:
                raise gen.Return('Target is required for alias endpoint.')
            dependencies = [target]
        # update local config
        try:
            if action == 'add':
                self.tabpy_state.add_endpoint(name=name,
                                              description=description,
                                              docstring=docstring,
                                              endpoint_type=endpoint_type,
                                              methods=methods,
                                              dependencies=dependencies,
                                              target=target,
                                              schema=schema)
            else:
                self.tabpy_state.update_endpoint(
                    name=name,
                    description=description,
                    docstring=docstring,
                    endpoint_type=endpoint_type,
                    methods=methods,
                    dependencies=dependencies,
                    target=target,
                    schema=schema,
                    version=version)
        except Exception as e:
            raise gen.Return("Error when changing TabPy state: %s" % e)
        on_state_change(self.settings, self.tabpy_state,
                        self.python_service)
    finally:
        # Always release the update "lock", even on failure.
        self.settings['add_or_updating_endpoint'] = None
def workload_parser(workload_line, attrs=None, avoid_data_tokens=(';',)):
    """Parse one line of an SWF (Standard Workload Format) trace.

    The 18 SWF fields, separated by whitespace and in this order, are:
    job_number, submit_time, wait_time, duration, allocated_processors,
    avg_cpu_time, used_memory, requested_number_processors, requested_time,
    requested_memory, status, user_id, group_id, executable_number,
    queue_number, partition_number, preceding_job_number and
    think_time_prejob.  All are parsed as int except avg_cpu_time, which may
    be a float.  See the SWF specification for each field's exact meaning
    (times in seconds, memory in KB, -1 for "not available", etc.).

    :param workload_line: a line of the workload file
    :param attrs: attribute name or list of names to extract; None (default)
        extracts all 18 fields
    :param avoid_data_tokens: tokens marking a non-data (comment/header) line;
        an immutable tuple default replaces the original mutable list
    :return: dict mapping each requested attribute to its converted value, or
        the original line unchanged when it starts with an avoided token
    """
    # Comment/header lines are returned untouched so the caller can skip them.
    if workload_line[0] in avoid_data_tokens:
        return workload_line
    # (regex fragment, converter) pairs; raw strings keep the '\s'/'\d'
    # escapes valid (non-raw they are deprecated escapes).
    _common_int_pattern = (r'\s*(?P<{}>[-+]?\d+)', int)
    _common_float_pattern = (r'\s*(?P<{}>[-+]?\d+\.\d+|[-+]?\d+)', float)
    _dict = {
        'job_number': _common_int_pattern,
        'submit_time': _common_int_pattern,
        'wait_time': _common_int_pattern,
        'duration': _common_int_pattern,
        'allocated_processors': _common_int_pattern,
        'avg_cpu_time': _common_float_pattern,
        'used_memory': _common_int_pattern,
        'requested_number_processors': _common_int_pattern,
        'requested_time': _common_int_pattern,
        'requested_memory': _common_int_pattern,
        'status': _common_int_pattern,
        'user_id': _common_int_pattern,
        'group_id': _common_int_pattern,
        'executable_number': _common_int_pattern,
        'queue_number': _common_int_pattern,
        'partition_number': _common_int_pattern,
        'preceding_job_number': _common_int_pattern,
        'think_time_prejob': _common_int_pattern
    }
    # Preserve SWF field order; a bare string selects a single attribute.
    _sequence = _dict.keys() if not attrs else ((
        attrs, ) if isinstance(attrs, str) else attrs)
    # Build one anchored regex from the per-field fragments.
    reg_exp = ''.join(_dict[_key][0].format(_key) for _key in _sequence)
    _matches = _compile(reg_exp).match(workload_line)
    _dict_line = _matches.groupdict()
    return {key: _dict[key][1](_dict_line[key]) for key in _sequence}
cat *.malt | ./malt2connlX.py > output.conll NOTE: Beware of nasty Windows newlines: dos2unix *.malt Author: Pontus Stenetorp <pontus stenetorp se> Version: 2011-12-05 """ from re import compile as _compile from sys import stdin, stdout # Constants MALT_REGEX = _compile(r'^(?P<token>.*?)\t(?P<pos>[^\t]+)\t' r'(?P<head>[^\t]+)\t(?P<rel>[^\t]+)$') # NOTE: My interpretation from reversing the format by example OUTPUT_LINE = '{token_num}\t{token}\t_\t{pos}\t{pos}\t_\t{head}\t{rel}\t_\t_' ### def main(args): token_cnt = 0 for line in (l.decode('utf-8').rstrip('\n') for l in stdin): if not line: # Done with the sentence token_cnt = 0 stdout.write('\n') continue else: token_cnt += 1
listdir as _listdir, sep as _sep from re import compile as _compile, search as _search, match as _match, \ split as _split from ctypes import WinDLL as _WinDLL, cast as _cast, pointer as _pointer, \ create_string_buffer as _BUF_, POINTER as _PTR_, c_double as _double_, \ c_float as _float_, c_ulong as _ulong_, c_long as _long_, \ c_longlong as _longlong_, c_char_p as _str_, c_char as _char_, \ c_ubyte as _uchar_, c_int as _int_, c_void_p as _pvoid_, c_uint as _uint_ _pchar_ = _PTR_( _char_ ) _ppchar_ = _PTR_( _pchar_ ) DLL_BASE_NAME = "tos-databridge" SYS_ARCH_TYPE = "x64" if ( _log( _maxsize * 2, 2) > 33 ) else "x86" MIN_MARGIN_OF_SAFETY = 10 _REGEX_NON_ALNUM = _compile("[\W+]") _REGEX_DLL_NAME = _compile('^('+DLL_BASE_NAME + '-)[\d]{1,2}.[\d]{1,2}-' + SYS_ARCH_TYPE +'(.dll)$') _dll = None ### we added a lock to the _call from VTOSDB_DataBlock ### how do we want to handle concurrent calls at this level ??? def init(dllpath = None, root = "C:\\", bypass_check=False): """ Initialize the underlying tos-databridge DLL dllpath: string of the exact path of the DLL root: string of the directory to start walking/searching to find the DLL """
with open(r'C:\Users\Attack\Desktop\pro.txt','r+') as f: pro_name =pickle.loads(f.read()) with open(r'C:\Users\Attack\Desktop\vage.txt','r+') as f: vage_name =pickle.loads(f.read()) root_url = 'http://jiage.shucaiyuan.com/' url_base,data = [],{} pro_and_vage_name_base = [] for k,v in pro_name.items(): for i,x in vage_name.items(): pro_and_vage_name = k+i new_url = root_url+x.replace('___','')+v.strip('?').replace('__','_0_0') #拼接URL url_base.append(new_url) pro_and_vage_name_base.append(pro_and_vage_name) res_mon = '<th>(.*)</th>' res_pri = '<td>(.*)</td>' res_mon_com = re._compile(res_mon) res_pri_com = re._compile(res_pri) month = [] price = [] for i in xrange(100): url,name = url_base[i],pro_and_vage_name_base[i] soup = BeautifulSoup(urllib2.urlopen(url), "html.parser") # 通过html.parser解析url对应的页面,转换为BeautifulSoup对象 month = list(re.findall(soup,res_mon_com)) #获取 月份 并转换为列表 price = list(re.findall(soup,res_pri_com))#获取 价格 并转换为列表 data[name] = with open(r'C:\Users\Attack\Desktop\data.txt','w+') as f: f.write(pickle.dumps(data)) end = time.clock() print 'time',end-satrt
def str2date(value):
    # Thin wrapper exposing the externally-imported _str2date under a public
    # name.
    return _str2date(value)


# Re-export the percent parser under its public name.
str2percent = _str2percent


def isnan(value):
    # True when *value* is NaN; non-numeric values (which make _isnan raise
    # TypeError) are treated as "not NaN".
    try:
        return _isnan(value)
    except TypeError:
        return False


# following masks are used to recognize string patterns
FLOAT_MASK = _compile(r'^[-+]?[0-9]\d*\.\d*$|[-+]?\.?[0-9]\d*$')
PERCENT_MASK = _compile(r'^[-+]?[0-9]\d*\.\d*%$|[-+]?\.?[0-9]\d*%$')
# NOTE(review): the '-' inside [-0-9] also accepts '-' in later digit
# positions (e.g. '1-2' matches) — confirm whether that is intended.
INT_MASK = _compile(r'^[-+]?[-0-9]\d*$')
# Calendar-aware YYYY-MM-DD (also '/', '.' or no separator), incl. leap days.
DATE_MASK = _compile(
    '^(?:(?!0000)[0-9]{4}([-/.]?)(?:(?:0?[1-9]|1[0-2])([-/.]?)(?:0?[1-9]|1[0-9]|2[0-8])|(?:0?[13-9]|1[0-2])([-/.]?)(?:29|30)|(?:0?[13578]|1[02])([-/.]?)31)|(?:[0-9]{2}(?:0[48]|[2468][048]|[13579][26])|(?:0[48]|[2468][048]|[13579][26])00)([-/.]?)0?2([-/.]?)29)$'
)
# Boolean words, including the Chinese 是/否 (given as \u escapes).
BOOL_MASK = _compile(
    '^(true)|(false)|(yes)|(no)|(\u662f)|(\u5426)|(on)|(off)$')


# NOTE(review): this definition is cut off mid-docstring by the chunking.
def auto_str2value(value, dtype=None):
    '''using preview masks to auto transfer a string to matchest date type

    Parameters
    ----------
    value : str
def parse_detail(self, response):
    """Parse a 51job (前程无忧) job-detail page into offer and firm items.

    Args:
        response: Scrapy response for a single job-detail page.

    Yields:
        dict with keys 'offer' (CrawlendItem) and 'firm' (FirmItem).
    """
    item = {}
    offer = CrawlendItem()
    firm = FirmItem()
    soup = bs4.BeautifulSoup(response.body, 'lxml')
    offer['url'] = response.url
    offer['resource'] = '前程无忧'
    # Job title and company header block.
    # 'class' is a Python keyword, hence BeautifulSoup's 'class_' kwarg.
    # strip=True trims whitespace around the extracted text.
    soup_cn = soup.find('div', class_='cn')
    offer['name'] = soup_cn.find('h1').get_text(strip=True)
    offer['work_place'] = soup_cn.find('span', class_='lname').get_text(strip=True)
    firm['work_place'] = offer['work_place']
    # Salary range. [万千] matches the unit (10k / 1k yuan); (\d+\.?\d*)
    # matches decimal numbers.
    # BUG FIX: use the public re.compile — re._compile is a private helper
    # that requires a second `flags` argument and raised TypeError here.
    p_salary_1 = re.compile(r'(\d+\.?\d*)-(\d+\.?\d*)[万千]')
    str_salary = soup_cn.find('strong').get_text(strip=True)
    r = re.match(p_salary_1, str_salary)
    # Normalize both bounds to yuan.
    if r:
        if '万' in str_salary:
            lst_r = [float(i) * 10000 for i in r.groups()]
        else:
            lst_r = [float(i) * 1000 for i in r.groups()]
        offer['salary_from'], offer['salary_to'] = lst_r
    else:
        # No parsable range: treat the salary as negotiable.
        offer['is_negotiable'] = True
    # Company name.
    firm['firm_name'] = soup_cn.find('p', class_='cname').get_text(strip=True)
    # Company nature / scale / industry, '|'-separated in one string.
    soup_firm_msg = soup_cn.find('p', class_='msg ltype')
    str_msg = soup_firm_msg.string.replace(' ', '')
    nature, scale, indus = [i.strip() for i in str_msg.split('|')]
    # Company nature, normalized to the DB's code values.
    if '外企' in nature:
        firm['firm_nature'] = '2'
    elif '合资' in nature:
        firm['firm_nature'] = '3'
    elif '国企' in nature:
        firm['firm_nature'] = '1'
    elif '民营' in nature:
        firm['firm_nature'] = '4'
    elif '上市' in nature:
        firm['firm_nature'] = '7'
    else:
        firm['firm_nature'] = '5'
    # Company scale: extract the employee-count bounds.
    p_scale = re.compile(r'\d+')
    lst_scale = re.findall(p_scale, scale)
    lst_scale = [int(i) for i in lst_scale]
    lst_scale.sort()
    if len(lst_scale) == 2:
        firm['firm_scale_from'], firm['firm_scale_to'] = lst_scale
    elif len(lst_scale) == 1:
        firm['firm_scale_from'] = lst_scale[0]
    # Industry.
    firm['firm_industry'] = indus
    # Job requirement tags.
    soup_job_qua = soup.find('div', class_='jtag inbox')
    soup_qua_div = soup_job_qua.find('div', class_='t1')
    soup_qua_span = soup_qua_div.find_all('span')
    # Regex for years of experience / head counts.
    p_exp = re.compile(r'\d+')
    # Regex for the posting date (month-day).
    p_date = re.compile(r'\d+-\d+')
    today = datetime.date.today()
    year_ = today.year
    for span in soup_qua_span:
        text = span.get_text(strip=True)
        # Years of experience.
        if '经验' in text:
            r_exp = re.findall(p_exp, text)
            r_exp.sort()
            if len(r_exp) == 2:
                offer['years_of_work_from'], offer['years_of_work_to'] = r_exp
            elif len(r_exp) == 1:
                offer['years_of_work_from'] = r_exp[0]
        # Education level, normalized to DB codes.
        if '高中' in text:
            offer['degree'] = '1'
        elif '大专' in text:
            offer['degree'] = '2'
        elif '本科' in text:
            offer['degree'] = '3'
        elif '研究生' in text:
            offer['degree'] = '4'
        elif '博士' in text:
            offer['degree'] = '5'
        # Number of openings.
        # BUG FIX: original tested the constant `if '招聘':` (always true);
        # membership in the span text was clearly intended, matching the
        # sibling `'经验' in text` branch.
        if '招聘' in text:
            r_mem = re.findall(p_exp, text)
            if len(r_mem) == 1:
                offer['member'] = r_mem[0]
        # Posting date.
        # BUG FIX: same always-true constant test as above.
        if '发布' in text:
            r_date = re.findall(p_date, text)
            if r_date:
                date_ = str(year_) + '-' + r_date[0]
                offer['release'] = datetime.datetime.strptime(date_, "%Y-%m-%d").date()
    offer.setdefault('degree', '6')
    offer.setdefault('release', today)
    # Perks / selling points of the position.
    soup_r = soup_job_qua.find('p', class_='t2')
    if soup_r:
        offer['temptation'] = soup_r.get_text(';', strip=True)
    soup_qua = soup.find('div', class_='bmsg job_msg inbox')
    # Job description, with the "report"/"share" link texts stripped out.
    offer['description'] = soup_qua.get_text(strip=True).replace('举报', '').replace('分享', '')
    # Company address: the last stripped string of the footer paragraph.
    soup_loc_div = soup.find('div', class_='bmsg inbox')
    soup_loc_p = soup_loc_div.find('p', class_='fp')
    firm['firm_location'] = [i for i in soup_loc_p.stripped_strings][-1]
    # Company introduction.
    soup_intro = soup.find('div', class_='tmsg inbox')
    firm['firm_introduction'] = soup_intro.get_text(strip=True)
    item['offer'] = offer
    item['firm'] = firm
    yield item
RESTServiceClient as _RESTServiceClient, Endpoint as _Endpoint, AliasEndpoint as _AliasEndpoint, ) from .custom_query_object import CustomQueryObject as \ _CustomQueryObject import os as _os import logging as _logging _logger = _logging.getLogger(__name__) _name_checker = _compile('^[a-zA-Z0-9-_\ ]+$') if sys.version_info.major == 3: unicode = str def _check_endpoint_name(name): """Checks that the endpoint name is valid by comparing it with an RE and checking that it is not reserved.""" if not isinstance(name, (str,unicode)): raise TypeError("Endpoint name must be a string or unicode") if name == '': raise ValueError("Endpoint name cannot be empty") if not _name_checker.match(name):
d = np.load(self.name) import os.path as _path n = self.name if _path.exists(n): n = _path.basename(n) return DataHolder([(n, d)], warn=warn) class NumPySaver(PythonSaver): def save(self, data): import numpy as np #@UnresolvedImport if len(data.items()) > 1: print 'Only saving first dataset' np.save(self.name, data[0]) from re import compile as _compile _begin_number = _compile(r'^[-+]?[\d]?\.?\d') class SRSLoader(PythonLoader): ''' Loads an SRS dat file and returns a dataholder object ''' def load(self, warn=True): ''' warn -- if True (default), print warnings about key names Returns a DataHolder object ''' f = open(self.name) try:
''' This module provides functionality required for managing endpoint objects in TabPy. It provides a way to download endpoint files from remote and then properly cleanup local the endpoint files on update/remove of endpoint objects. The local temporary files for TabPy will by default located at /tmp/query_objects ''' import logging import os import shutil from re import compile as _compile _name_checker = _compile(r'^[a-zA-Z0-9-_\s]+$') def _check_endpoint_name(name, logger=logging.getLogger(__name__)): """Checks that the endpoint name is valid by comparing it with an RE and checking that it is not reserved.""" if not isinstance(name, str): msg = 'Endpoint name must be a string' logger.log(logging.CRITICAL, msg) raise TypeError(msg) if name == '': msg = 'Endpoint name cannot be empty' logger.log(logging.CRITICAL, msg) raise ValueError(msg)
def _add_or_update_endpoint(self, action, name, version, request_data): ''' Add or update an endpoint ''' _name_checker = _compile('^[a-zA-Z0-9-_\ ]+$') if not isinstance(name, (str,unicode)): raise TypeError("Endpoint name must be a string or unicode") if not _name_checker.match(name): raise gen.Return('endpoint name can only contain: a-z, A-Z, 0-9,' ' underscore, hyphens and spaces.') if self.settings.get('add_or_updating_endpoint'): raise RuntimeError("Another endpoint update is already in progress, " "please wait a while and try again") request_uuid = random_uuid() self.settings['add_or_updating_endpoint'] = request_uuid try: description = request_data['description'] if 'description' in request_data else None if 'docstring' in request_data: if sys.version_info > (3, 0): docstring = str(bytes(request_data['docstring'], "utf-8").decode('unicode_escape')) else: docstring = request_data['docstring'].decode('string_escape') else: docstring=None endpoint_type = request_data['type'] if 'type' in request_data else None methods = request_data['methods'] if 'methods' in request_data else [] dependencies = request_data['dependencies'] if 'dependencies' in request_data else None target = request_data['target'] if 'target' in request_data else None schema = request_data['schema'] if 'schema' in request_data else None src_path = request_data['src_path'] if 'src_path' in request_data else None target_path = get_query_object_path(self.settings['state_file_path'], name, version) _path_checker = _compile('^[\\a-zA-Z0-9-_\ /]+$') # copy from staging if src_path: if not isinstance(request_data['src_path'], (str,unicode)): raise gen.Return("src_path must be a string.") if not _path_checker.match(src_path): raise gen.Return('Endpoint name can only contain: a-z, A-Z, 0-9,underscore, hyphens and spaces.') yield self._copy_po_future(src_path, target_path) elif endpoint_type != 'alias': raise gen.Return("src_path is required to add/update an endpoint.") # alias special logic: if 
endpoint_type == 'alias': if not target: raise gen.Return('Target is required for alias endpoint.') dependencies = [target] # update local config try: if action == 'add': self.tabpy.add_endpoint( name=name, description=description, docstring=docstring, endpoint_type=endpoint_type, methods=methods, dependencies=dependencies, target=target, schema=schema) else: self.tabpy.update_endpoint( name=name, description=description, docstring=docstring, endpoint_type=endpoint_type, methods=methods, dependencies=dependencies, target=target, schema=schema, version=version) except Exception as e: raise gen.Return("Error when changing TabPy state: %s" % e) on_state_change(self.settings) finally: self.settings['add_or_updating_endpoint'] = None
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module implements base64 encoding-related functionality.
"""
from __future__ import absolute_import

from base64 import b64decode as _b64decode
from binascii import Error
from re import compile as _compile

# Accept only the URL-safe base64 alphabet, optionally followed by up to
# two '=' padding characters.
_b64decode_validator = _compile(b'^[A-Za-z0-9-_]*={0,2}$')


def urlsafe_b64decode(s):
    """
    Like ``base64.b64decode`` but with validation.
    """
    if _b64decode_validator.match(s):
        return _b64decode(s, altchars=b"-_")
    raise Error('Non-base64 digit found')
with open(r'C:\Users\Attack\Desktop\pro.txt','r+') as f: pro_name =pickle.loads(f.read()) with open(r'C:\Users\Attack\Desktop\vage.txt','r+') as f: vage_name =pickle.loads(f.read()) root_url = 'http://jiage.shucaiyuan.com/' url_base,data = [],{} pro_and_vage_name_base = [] for k,v in pro_name.items(): for i,x in vage_name.items(): pro_and_vage_name = k+i new_url = root_url+x.replace('___','')+v.strip('?').replace('__','_0_0') #拼接URL url_base.append(new_url) pro_and_vage_name_base.append(pro_and_vage_name) res_pro = '<th>(.*)</th>' res_pri = '<td>(.*)</td>' res_pro_com = re._compile(res_pro) res_pri_com = re._compile(res_pri) month = [] price = [] for i in xrange(100): url,name = url_base[i],pro_and_vage_name_base[i] soup = BeautifulSoup(urllib2.urlopen(url), "html.parser") # 通过html.parser解析url对应的页面,转换为BeautifulSoup对象 month = list(re.findall(soup,res_pro_com)) #获取 月份 并转换为列表 price = list(re.findall(soup,res_pri_com))#获取 价格 并转换为列表 l_price = l_price[14:26] #剔除 价格 del l_month[0] #剔除表头 d = dict(zip(l_month,l_price)).copy() data[name] = d with open(r'C:\Users\Attack\Desktop\data.txt','w+') as f: f.write(pickle.dumps(data)) end = time.clock()
from constants import BAD_REQUEST, DENIED, NOT_FOUND from database import query_all, save_to_db from response_caching import cache from util import js_time, map_to_list, safe_int, sanitize from .common import ( get_ques_by_id, get_user_by_id, post_level_up_webhook, post_incorrect_webhook, save_log_to_file_system, run_in_thread, ) pid = "halocrypt" # getpid() replace = _compile(r"\s").sub no_question = lambda idx: {"game_over": True} def clean_node(a): x = a.as_json x.pop("secure_data") return x # LEADERBOARD_LIMIT = 100 @cache(lambda: f"{pid}_leaderboard_temp_cache") def generate_leaderboard():
# ctypes helpers for converting C strings/buffers to Python objects.
_cast_cstr = lambda x: _cast(x, _str_).value.decode()
# Build `n` fixed-size C char buffers, each of size `sz`.
_gen_str_buffers = lambda sz, n: [_BUF_(sz) for _ in range(n)]
# Build a C array of char* pointers, one per buffer in `bufs`.
_gen_str_buffers_ptrs = lambda bufs: (_pchar_ * len(bufs))(
    *[_cast(b, _pchar_) for b in bufs])
# Decode an iterable of C strings / wrap an iterable in TOSDB_DateTime.
_map_cstr = _partial(map, _cast_cstr)
_map_dt = _partial(map, TOSDB_DateTime)
# Pair each decoded C string with its corresponding DateTime object.
_zip_cstr_dt = lambda cstr, dt: zip(_map_cstr(cstr), _map_dt(dt))

DLL_BASE_NAME = "tos-databridge"
DLL_DEPENDS1_NAME = "_tos-databridge"
# Detect 64- vs 32-bit interpreter from the bit width of sys.maxsize.
SYS_ARCH_TYPE = "x64" if (_log(_maxsize * 2, 2) > 33) else "x86"
MIN_MARGIN_OF_SAFETY = 10

# Matches any non-alphanumeric character (used to scrub strings).
_REGEX_NON_ALNUM = _compile("[\W+]")
_REGEX_LETTER = _compile("[a-zA-Z]")
# Version suffix embedded in the DLL file name, e.g. '0.9'.
_VER_SFFX = '[\d]{1,2}.[\d]{1,2}'
_REGEX_VER_SFFX = _compile('-' + _VER_SFFX + '-')
# Release DLL file name, e.g. 'tos-databridge-0.9-x64.dll'.
_REGEX_DLL_NAME = _compile('^(' + DLL_BASE_NAME + '-)' \
                           + _VER_SFFX + '-' \
                           + SYS_ARCH_TYPE +'(.dll)$')
# Debug-build DLL path, e.g. '...tos-databridge-0.9-x64_d.dll'.
_REGEX_DBG_DLL_PATH = _compile('^.+(' + DLL_BASE_NAME + '-)' \
                               + _VER_SFFX + '-' \
                               + SYS_ARCH_TYPE + '_d(.dll)$')

# Handles to the loaded DLLs; populated later (presumably by init()) —
# NOTE(review): confirm against the module's init function.
_dll = None
_dll_depend1 = None
# Utility Functions # ================================================== from email.utils import parseaddr as _parseaddr from functools import wraps as _wraps from http import HTTPStatus from json import dumps as _dumps from re import compile as _compile from time import time from traceback import print_exc as _print_exc from flask import Response as _Response from flask import request as _request from werkzeug.datastructures import Headers # maybe only strip whitespace? _sub = _compile(r"([^\w])").sub sanitize = lambda x: _sub("", f"{x}").strip().lower() def validate_email_address(email_id: str) -> str: if not email_id: raise AppException("Invalid email") email_id = email_id.lower() if "@" in _parseaddr(email_id)[1]: return email_id raise AppException("Invalid Email", HTTPStatus.BAD_REQUEST) def get_origin(headers: Headers, or_="*") -> str: """ for CORS requests
class Version: """ Version. Args: version (str): Version.. pre (bool): If True, and no prerelease specified, is always lower than any other prerelease when comparing. """ # Semantic version regex _RE = _compile( # Handle proper "major.minor.patch", # but also 'major' or 'major.minor' cases r'^(?P<major>0|[1-9]\d*)?' r'(?P<minor>\.(0|[1-9]\d*))?' r'(?P<patch>\.(0|[1-9]\d*))?' # Handle properly formatted prereleases and builds r'(?P<prerelease>-(0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)' r'(\.(0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*)?' r'(?P<build>\+[0-9a-zA-Z-]+(\.[0-9a-zA-Z-]+)*)?' # Keep extra trailling non semantic versionning characters. r'(?P<trail>.*)?$') # Prerelease and build characters filter _FILTER = _compile(r'[^a-zA-Z0-9-.]') # Prerelease comparison behavior _PRE_COMPARE = { # Use an empty value to to ensure stable < prerelease True: (), # Use an ASCII table late character to ensure stable > prerelease False: ('~', ) } def __init__(self, version, pre=False): self._version = parts = { key: value for key, value in self._RE.match(version).groupdict().items() if value } # Set if this version should be before or after prereleases self._pre = pre # Get core version number as integers for key in ('major', 'minor', 'patch'): parts[key] = int(parts.get(key, '0').lstrip('.')) # Remove delimiters for key in ('prerelease', 'build'): try: parts[key] = tuple(parts[key][1:].split('.')) except KeyError: continue # Try to handle trailing characters that does not match semantic version # as prerelease or build information to allow comparison try: prerelease = parts.pop('trail') except KeyError: pass else: # Get build information if any if 'build' not in parts: try: prerelease, build = prerelease.split('+', 1) except ValueError: pass else: self.build = build self.prerelease = '.'.join(parts.get('prerelease', ())) + prerelease def __lt__(self, other): return self._compare() < other._compare() def __le__(self, other): return self._compare() <= other._compare() def __eq__(self, 
other): return self._compare() == other._compare() def __ge__(self, other): return self._compare() >= other._compare() def __gt__(self, other): return self._compare() > other._compare() def __ne__(self, other): return self._compare() != other._compare() def _compare(self): """ Comparable version. Returns: tuple: Comparable version. """ ver = self._version return (ver['major'], ver['minor'], ver['patch'], ver.get('prerelease', self._PRE_COMPARE[self._pre])) @property def major(self): """ Major version Returns: int: Major version. """ return self._version['major'] @major.setter def major(self, value): """ Major version Args: value (int): New value. """ self._version['major'] = int(value) @property def minor(self): """ Minor version Returns: int: Minor version. """ return self._version['minor'] @minor.setter def minor(self, value): """ Minor version Args: value (int): New value. """ self._version['minor'] = int(value) @property def patch(self): """ Patch version Returns: int: Patch version. """ return self._version['patch'] @patch.setter def patch(self, value): """ Patch version Args: value (int): New value. """ self._version['patch'] = int(value) @property def prerelease(self): """ Prerelease version Returns: str: Prerelease version. """ return '.'.join(self._version.get('prerelease', ())) @prerelease.setter def prerelease(self, value): """ Prerelease version Args: value (str): New value. """ self._version['prerelease'] = tuple( element.lstrip('0') for element in self._FILTER.sub('', value).strip('-.').split('.')) @property def build(self): """ Build information. Returns: str: Build version. """ return '.'.join(self._version.get('build', ())) @build.setter def build(self, value): """ Build information. Args: value (str): New value. """ self._version['build'] = tuple( self._FILTER.sub('', value).strip('.').split('.'))
def featurise(self, document, sentence, annotation):
    # Yield features for up to three tokens on either side of the annotated
    # span, each prefixed with its position relative to the span.
    before_ann = sentence.text[:annotation.start].split()
    before_ann.reverse()  # nearest-token-first ordering for the left context
    after_ann = sentence.text[annotation.end:].split()
    for i, tok in izip(xrange(1, 4), before_ann):
        for f_tup in self.feature._featurise(tok):
            yield ('-BEFORE-{}-{}'.format(i, f_tup[0]), f_tup[1])
    for i, tok in izip(xrange(1, 4), after_ann):
        for f_tup in self.feature._featurise(tok):
            yield ('-AFTER-{}-{}'.format(i, f_tup[0]), f_tup[1])

from re import compile as _compile

# ISO-like date: 19xx/20xx year, 01-12 month, 01-31 day, separated by
# '-', ' ', '/' or '.'.
DATE_REGEX = _compile(
    r'^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$')

#TODO: Window BoW!
### Features to capture NP internal performance

class SpanBoWFeature(object):
    """Bag-of-words feature over the tokens inside the annotated span."""

    def get_id(self):
        return 'SPAN-BOW'

    def featurise(self, document, sentence, annotation):
        # One (token, 1) feature per whitespace-separated token in the span.
        span_text = sentence.annotation_text(annotation)
        for tok in span_text.split():
            yield (tok, 1)
cat *.malt | ./malt2connlX.py > output.conll NOTE: Beware of nasty Windows newlines: dos2unix *.malt Author: Pontus Stenetorp <pontus stenetorp se> Version: 2011-12-05 ''' from sys import stdin, stdout from re import compile as _compile from codecs import open as _open ### Constants MALT_REGEX = _compile(ur'^(?P<token>.*?)\t(?P<pos>[^\t]+)\t' ur'(?P<head>[^\t]+)\t(?P<rel>[^\t]+)$') # NOTE: My interpretation from reversing the format by example OUTPUT_LINE = u'{token_num}\t{token}\t_\t{pos}\t{pos}\t_\t{head}\t{rel}\t_\t_' ### def main(args): token_cnt = 0 for line in (l.decode('utf-8').rstrip('\n') for l in stdin): if not line: # Done with the sentence token_cnt = 0 stdout.write('\n') continue else: token_cnt += 1
def _add_or_update_endpoint(self, action, name, version, request_data): ''' Add or update an endpoint ''' self.logger.log(logging.DEBUG, f'Adding/updating model {name}...') _name_checker = _compile(r'^[a-zA-Z0-9-_\s]+$') if not isinstance(name, str): msg = 'Endpoint name must be a string' self.logger.log(logging.CRITICAL, msg) raise TypeError(msg) if not _name_checker.match(name): raise gen.Return('endpoint name can only contain: a-z, A-Z, 0-9,' ' underscore, hyphens and spaces.') if self.settings.get('add_or_updating_endpoint'): msg = ('Another endpoint update is already in progress' ', please wait a while and try again') self.logger.log(logging.CRITICAL, msg) raise RuntimeError(msg) request_uuid = random_uuid() self.settings['add_or_updating_endpoint'] = request_uuid try: description = (request_data['description'] if 'description' in request_data else None) if 'docstring' in request_data: docstring = str( bytes(request_data['docstring'], "utf-8").decode('unicode_escape')) else: docstring = None endpoint_type = (request_data['type'] if 'type' in request_data else None) methods = (request_data['methods'] if 'methods' in request_data else []) dependencies = (request_data['dependencies'] if 'dependencies' in request_data else None) target = (request_data['target'] if 'target' in request_data else None) schema = (request_data['schema'] if 'schema' in request_data else None) src_path = (request_data['src_path'] if 'src_path' in request_data else None) target_path = get_query_object_path( self.settings[SettingsParameters.StateFilePath], name, version) self.logger.log(logging.DEBUG, f'Checking source path {src_path}...') _path_checker = _compile(r'^[\\\:a-zA-Z0-9-_~\s/\.]+$') # copy from staging if src_path: if not isinstance(request_data['src_path'], str): raise gen.Return("src_path must be a string.") if not _path_checker.match(src_path): raise gen.Return( 'Endpoint source path name can only contain: ' 'a-z, A-Z, 0-9, underscore, hyphens and spaces.') yield 
self._copy_po_future(src_path, target_path) elif endpoint_type != 'alias': raise gen.Return("src_path is required to add/update an " "endpoint.") # alias special logic: if endpoint_type == 'alias': if not target: raise gen.Return('Target is required for alias endpoint.') dependencies = [target] # update local config try: if action == 'add': self.tabpy_state.add_endpoint(name=name, description=description, docstring=docstring, endpoint_type=endpoint_type, methods=methods, dependencies=dependencies, target=target, schema=schema) else: self.tabpy_state.update_endpoint( name=name, description=description, docstring=docstring, endpoint_type=endpoint_type, methods=methods, dependencies=dependencies, target=target, schema=schema, version=version) except Exception as e: raise gen.Return(f'Error when changing TabPy state: {e}') on_state_change(self.settings, self.tabpy_state, self.python_service, self.logger) finally: self.settings['add_or_updating_endpoint'] = None
from re import search as _search from typing import AnyStr as _AnyStr from typing import List as _List from typing import Union as _Union from typing import Tuple as _Tuple from . import miscellaneous as _utils # ---------- Constants ---------- DEFAULT_EMBED_INLINE: bool = True MAXIMUM_CHARACTERS: int = 1900 MAXIMUM_CHARACTERS_EMBED_DESCRIPTION: int = 2048 RX_DISCORD_INVITE: _Pattern = _compile( r'(?:https?://)?discord(?:(?:app)?\.com/invite|\.gg)/?[a-zA-Z0-9]+/?') ZERO_WIDTH_SPACE: str = '\u200b' # ---------- Functions ---------- def convert_color_string_to_embed_color(color_string: str) -> _Colour: if color_string: split_color_string = color_string.split(',') r, g, b = [int(c) for c in split_color_string] result = _Colour.from_rgb(r, g, b) else: result = _Embed.Empty return result
ord_dict = OrderedDict( sorted(freq.items(), key=lambda k: k[1], reverse=True)[:num] ) try: size = list(range(len(ord_dict))) plt.clf() plt.bar(size, list(ord_dict.values()), align="center") plt.xticks(size, list(ord_dict.keys())) plt.autoscale() plt.savefig(path, bbox_inches="tight") except NameError: print( "Common n-grams where n = {}:".format(len(list(ord_dict.keys())[0])) ) for i in ord_dict: step = max(ord_dict[i] for i in ord_dict) / len(ord_dict) bars = "▇" * int(ord_dict[i] / step) print("{}: {} ({})".format(i, bars, ord_dict[i])) if __name__ == "__main__": if len(argv) != 2: raise SystemExit("Invalid number of parameters!") with open(argv[1]) as raw: TEXT = "".join(i.lower().replace("\n", " ") for i in raw.readlines()) SIMPLE = _compile("[^a-z ]+").sub("", TEXT) for n, m in zip([1, 2, 3], [26, 20, 15]): show_output(ngram_frequency(n, SIMPLE), m, "n{}.png".format(n))
from re import compile as _compile

from .decorators import parser as _parser, spliter as _spliter
from .parse import parse as _parse

# Characters that delimit keys inside an alias path: [ . / \ ] >
ALIAS_SPLIT = _compile(r"[\[./\\\]>]")
# CamelCase boundary: acronym followed by a capitalized word (e.g. 'HTTPServer').
CAMEL_CASE_SPACE1 = _compile(r"([A-Z])([A-Z][a-z]+)")
# CamelCase boundary: lowercase letter or digit followed by an uppercase letter.
CAMEL_CASE_SPACE2 = _compile(r"([a-z\d])([A-Z])")
CAPITALS = _compile(r"([A-Z])([A-Z])")
CAPITAL_LOWER = _compile(r"([A-Z])([a-z])")
# Any non-word character.
INVALID_CHARS = _compile(r"\W")
# Leading characters that are not ASCII letters.
INVALID_LEAD = _compile(r"^[^a-zA-Z]+")
# Runs of whitespace/punctuation to collapse into a single underscore.
REPLACEABLE_WITH_UNDERSCORE = _compile(r"[\s/,.+-]+")
SPACE = _compile(r" ")
UNDERSCORE = _compile(r"_")
# Consecutive repeated underscores.
UNDERSCORES = _compile(r"(_)\1+")


@_spliter
def alias2keys(alias):
    """Split an alias path on delimiter characters and parse each non-empty
    segment into a key (parse errors suppressed via errors=False)."""
    return [
        _parse(key, errors=False) for key in filter(None, ALIAS_SPLIT.split(alias))
    ]


def _base_case(string):
    """Normalize *string* toward snake_case: collapse separators to '_',
    strip invalid characters, then underscore CamelCase boundaries."""
    string = REPLACEABLE_WITH_UNDERSCORE.sub(r"_", string)
    string = INVALID_CHARS.sub(r"", string)
    string = CAMEL_CASE_SPACE1.sub(r"\1_\2", string)
    return CAMEL_CASE_SPACE2.sub(r"\1_\2", string)
def _add_or_update_endpoint(self, action, name, version, request_data): ''' Add or update an endpoint ''' _name_checker = _compile('^[a-zA-Z0-9-_\ ]+$') if not isinstance(name, basestring): raise TypeError("Endpoint name must be a string or unicode") if not _name_checker.match(name): raise gen.Return('endpoint name can only contain: a-z, A-Z, 0-9,' ' underscore, hyphens and spaces.') if self.settings.get('add_or_updating_endpoint'): raise RuntimeError( "Another endpoint update is already in progress, " "please wait a while and try again") request_uuid = random_uuid() self.settings['add_or_updating_endpoint'] = request_uuid try: description = request_data[ 'description'] if 'description' in request_data else None docstring = request_data[ 'docstring'] if 'docstring' in request_data else None endpoint_type = request_data[ 'type'] if 'type' in request_data else None methods = request_data[ 'methods'] if 'methods' in request_data else [] dependencies = request_data[ 'dependencies'] if 'dependencies' in request_data else None target = request_data[ 'target'] if 'target' in request_data else None schema = request_data[ 'schema'] if 'schema' in request_data else None src_path = request_data[ 'src_path'] if 'src_path' in request_data else None target_path = get_query_object_path( self.settings['state_file_path'], name, version) _path_checker = _compile('^[\\a-zA-Z0-9-_\ /]+$') # copy from staging if src_path: if not isinstance(request_data['src_path'], basestring): raise gen.Return("src_path must be a string.") if not _path_checker.match(src_path): raise gen.Return( 'Endpoint name can only contain: a-z, A-Z, 0-9,underscore, hyphens and spaces.' 
) yield self._copy_po_future(src_path, target_path) elif endpoint_type != 'alias': raise gen.Return( "src_path is required to add/update an endpoint.") # alias special logic: if endpoint_type == 'alias': if not target: raise gen.Return('Target is required for alias endpoint.') dependencies = [target] # update local config try: if action == 'add': self.tabpy.add_endpoint(name=name, description=description, docstring=docstring, endpoint_type=endpoint_type, methods=methods, dependencies=dependencies, target=target, schema=schema) else: self.tabpy.update_endpoint(name=name, description=description, docstring=docstring, endpoint_type=endpoint_type, methods=methods, dependencies=dependencies, target=target, schema=schema, version=version) except Exception as e: raise gen.Return("Error when changing TabPy state: %s" % e) on_state_change(self.settings) finally: self.settings['add_or_updating_endpoint'] = None
""" This module provides functionality required for managing endpoint objects in TabPy. It provides a way to download endpoint files from remote and then properly cleanup local the endpoint files on update/remove of endpoint objects. The local temporary files for TabPy will by default located at /tmp/query_objects """ import logging import os import shutil from re import compile as _compile _name_checker = _compile(r"^[a-zA-Z0-9-_\s]+$") def _check_endpoint_name(name, logger=logging.getLogger(__name__)): """Checks that the endpoint name is valid by comparing it with an RE and checking that it is not reserved.""" if not isinstance(name, str): msg = "Endpoint name must be a string" logger.log(logging.CRITICAL, msg) raise TypeError(msg) if name == "": msg = "Endpoint name cannot be empty" logger.log(logging.CRITICAL, msg) raise ValueError(msg)
from resources import Annotation, Document, Sentence ### Constants TXT_EXT = '.txt' SS_EXT = '.ss' A1_EXT = '.a1' A2_EXT = '.a2' RES_EXTS = set((TXT_EXT, SS_EXT, A1_EXT, A2_EXT)) # We use this mapping for sorting PRIO_BY_RES_EXT = { TXT_EXT: 0, SS_EXT: 10, A1_EXT: 20, A2_EXT: 30, } TB_SO_REGEX = _compile((r'^T[0-9]+\t(?P<type>[^ ]+) (?P<start>\d+) ' r'(?P<end>\d+)(?:\t(?P<text>.*?))?$')) # TODO: data dir could be referred elsewhere DATA_DIR = join_path(dirname(__file__), '../data/corpora') BIONLP_2011_DIR = join_path(DATA_DIR, 'bionlp_2011_st') BIONLP_2009_DIR = join_path(DATA_DIR, 'bionlp_2009_st') GREC_DIR = join_path(DATA_DIR, 'grec') CALBC_CII_DIR = join_path(DATA_DIR, 'calbc_ii_st_format_500_sample') NLPBA_DIR = join_path(DATA_DIR, 'nlpba_slightly_wrong') NLPBA_DOWN_DIR = join_path(DATA_DIR, 'nlpba_slightly_wrong_downsized') # 'CALBC.CII.75k.cos98.3.EBI.CL.20101008_st_format') # Epi and PTM data BIONLP_2011_EPI_TRAIN_DIR = join_path(BIONLP_2011_DIR, 'BioNLP-ST_2011_Epi_and_PTM_training_data')
from server.util import ParsedRequest as _Parsed
from server.util import get_bearer_token, json_response

from .common import (
    add_to_db,
    clean_secure,
    get_user_by_id,
    save_to_db,
    send_acount_creation_webhook,
    send_admin_action_webhook,
)
from .cred_manager import CredManager

# Regex to find the offending column in an integrity-error message of the
# form: Key ("col")=(value) ... (quotes around the column are optional).
# there must be a better way - RH
find_error = _compile(
    r"Key\s*\(\"?(?P<key>.*?)\"?\)=\((?P<val>.*?)\)", IGNORECASE
).search


def get_integrity_error_cause(error_message: str):
    """Extract the (column, value) pair from a DB integrity-error message.

    Returns:
        (key, value) tuple when the message matches, otherwise None.
    """
    try:
        match = find_error(error_message)
        print(error_message)
        if not match:
            return None
        k = match.group("key")
        v = match.group("val")
        return k, v
    except Exception as e:
        # Best-effort parser: swallow and report anything unexpected rather
        # than let a malformed message break the caller.
        print(e)
        return None
c_void_p as _pvoid_, \ c_uint as _uint_, \ c_uint32 as _uint32_, \ c_uint8 as _uint8_ _pchar_ = _PTR_(_char_) _ppchar_ = _PTR_(_pchar_) _cast_cstr = lambda x: _cast(x,_str_).value.decode() DLL_BASE_NAME = "tos-databridge" DLL_DEPENDS1_NAME = "_tos-databridge" SYS_ARCH_TYPE = "x64" if (_log(_maxsize * 2, 2) > 33) else "x86" MIN_MARGIN_OF_SAFETY = 10 _REGEX_NON_ALNUM = _compile("[\W+]") _REGEX_LETTER = _compile("[a-zA-Z]") _VER_SFFX = '[\d]{1,2}.[\d]{1,2}' _REGEX_VER_SFFX = _compile('-' + _VER_SFFX + '-') _REGEX_DLL_NAME = _compile('^(' + DLL_BASE_NAME + '-)' \ + _VER_SFFX + '-' \ + SYS_ARCH_TYPE +'(.dll)$') _dll = None _dll_depend1 = None def init(dllpath=None, root="C:\\", bypass_check=False): """ Initialize the underlying tos-databridge DLL
def compile(rule):
    # Wrap re.compile so a bad pattern reports the offending rule text.
    # NOTE: intentionally shadows the builtin ``compile`` in this module.
    # (Python 2 ``except ..., e`` syntax preserved.)
    try:
        return _compile(rule)
    except Exception, e:
        # Re-raise with the rule appended so callers can see which rule failed.
        raise Exception(str(e) + ': ' + rule)
return 'WINDOW-FORMATTING-STRING' def featurise(self, document, sentence, annotation): before_ann = sentence.text[:annotation.start].split() before_ann.reverse() after_ann = sentence.text[annotation.end:].split() for i, tok in izip(xrange(1, 4), before_ann): for f_tup in self.feature._featurise(tok): yield ('-BEFORE-{}-{}'.format(i, f_tup[0]), f_tup[1]) for i, tok in izip(xrange(1, 4), after_ann): for f_tup in self.feature._featurise(tok): yield ('-AFTER-{}-{}'.format(i, f_tup[0]), f_tup[1]) from re import compile as _compile DATE_REGEX = _compile(r'^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$') #TODO: Window BoW! ### Features to capture NP internal performance class SpanBoWFeature(object): def get_id(self): return 'SPAN-BOW' def featurise(self, document, sentence, annotation): span_text = sentence.annotation_text(annotation) for tok in span_text.split(): yield (tok, 1)
class Simplifier(object):
    """Rewrites chains of kinship terms ("parent's child's child ...") into
    conventional English relationship names (sibling, cousin, grandchild, ...).

    :meth:`simplify` repeatedly applies the *_operations lists and the cousin
    collapser until the string reaches a fixed point.
    """

    # Cheap normalisations applied before everything else
    pre_operations = [
        lambda x: x.replace("parent's partner", "parent"),
        lambda x: x.replace("partner's child", "child"),
        # lambda x: x.replace("parent's child", ""),
        lambda x: x.replace("child's parent", ""),
        lambda x: x.replace(" 's", ""),
        # NOTE(review): this replace looks like a no-op (same single space on
        # both sides) — presumably meant to collapse double spaces; confirm
        lambda x: x.replace(" ", " "),
        lambda x: x if not x.startswith("'s") else x[2:],
        lambda x: x.strip(),
    ]
    # Core term rewrites (aunt/uncle, cousin, niece/nephew, ...)
    operations = [
        lambda x: x.replace("parent's sibling", "aunt/uncle"),
        lambda x: x.replace("aunt/uncle's child", "cousin"),
        lambda x: x.replace("parent's child", "sibling"),
        lambda x: x.replace("sibling's child", "niece/nephew"),
        lambda x: x.replace("sibling's partner's child", "niece/nephew"),
        lambda x: x.replace("parent's niece/nephew", "cousin"),
        lambda x: x.replace("aunt/uncle's child", "cousin"),
        lambda x: x.replace("niece/nephew's sibling", "niece/nephew"),
        lambda x: x.replace("niece/nephew's child", "grandniece/nephew").
        replace("grandgrandniece/nephew", "great grandniece/nephew"),
    ]
    # Collapsers for long child/parent chains and cousin phrasing
    short_operations = [
        lambda x: Simplifier.relation_simplify_simple(x, "child"),
        lambda x: Simplifier.relation_simplify_simple(x, "parent"),
        # lambda x: Simplifier.relation_simplify_simple(x, "niece/nephew"),
        # lambda x: Simplifier.get_cousin_string(x),
        lambda x: x.replace("grandsibling", "great aunt/uncle"),
        lambda x: Simplifier.sibling_cousin_remover(x),
    ]
    # Final tidy-up (possessives, spacing, stray leading "'s")
    post_operations = [
        lambda x: x.replace(" 's", ""),
        # NOTE(review): same apparent no-op replace as in pre_operations
        lambda x: x.replace(" ", " "),
        lambda x: x if not x.startswith("'s") else x[2:],
        lambda x: x.strip(),
    ]
    # cousin_matcher = _compile(r"(((great )*?)(grand)?(parent)'s )?(cousin)('s)? ?((((great )*? ?(grand)?)child('s)?)*)") # magic
    # Matches a "parent ... (parent|child)* ..." chain that encodes a cousin
    cousin_matcher = _compile(
        r"(parent('s)?) (((parent('s)?)|(child('s)?)) ?)+($|(partner))")
    # Matches "sibling's Nth cousin" so the redundant "sibling's" can go
    sibling_cousin_matcher = _compile(
        r"sibling's \d+((st)|(nd)|(rd)|(th)) cousin")
    nephew_child_matcher = _compile(r"(niece\/nephew's )((child('s )?)+)")

    @staticmethod
    def relation_simplify_simple(string: str, search_string: str) -> str:
        ''' Simplifies down a range of "child's child's child's..."
        to one set of "[great...] grandchild

        Params:
            string: str
                The string to be searched and modified
            search_string: str
                The name to be searched for and expanded upon
        '''
        # Split it to be able to iterate through
        split = string.strip().split(' ')
        new_string = ''
        counter = 0
        # counter tracks how many consecutive search_string tokens were seen;
        # a run of N collapses to "[great * (N-2)] grand<search_string>"
        for i in split:
            if i in [f"{search_string}'s", search_string]:
                counter += 1
            elif counter == 1:
                new_string += f"{search_string}'s {i} "
                counter = 0
            elif counter == 2:
                new_string += f"grand{search_string}'s {i} "
                counter = 0
            elif counter > 2:
                new_string += f"{'great ' * (counter - 2)}grand{search_string}'s {i} "
                counter = 0
            else:
                new_string += i + ' '
        # And repeat again for outside of the loop
        # (flush a run that ended at the end of the string)
        if counter == 1:
            new_string += f"{search_string}'s "
            counter = 0
        elif counter == 2:
            new_string += f"grand{search_string}'s "
            counter = 0
        elif counter > 2:
            new_string += f"{'great ' * (counter - 2)}grand{search_string}'s"
            counter = 0
        # Return new string, dropping a trailing possessive if present
        new_string = new_string.strip()
        if new_string.endswith("'s"):
            return new_string[:-2]
        return new_string

    @classmethod
    def relation_simplify_nephew(cls, string: str) -> str:
        ''' Simplifies down a range of "niece/nephew's child's childs..."
        to one set of "[great...] grandniece/nephew

        Params:
            string: str
                The string to be searched and modified
        '''
        # Currently a no-op: the implementation below is commented out
        # k = cls.nephew_child_matcher.search(string)
        # great_count = k.group(2).count(' ')
        # if 'child' in k.group(2):
        #     span = k.span()
        #     return string[:span[0]] + f"{'great ' * great_count}grandniece/nephew" + string[:span[1]]
        return string

    @staticmethod
    def get_cousin_parent_count(k):
        '''Gets the amount of generations UP the cousin count goes'''
        p = 0
        if k.group(3):  # greats
            p += k.group(3).strip().count(' ')
        if k.group(4):  # grand
            p += 1
        if k.group(5):  # parent
            p += 1
        return p

    @staticmethod
    def get_cousin_child_count(k):
        '''Gets the amount of generations DOWN the cousin count goes'''
        # group 5 is cousin, so we get an extra space
        # group 7 is [child's child's...]
        # group 12 is GRAND-child
        return (k.group(6) + k.group(8)).strip().count(' ') + {
            True: 1,
            False: 0
        }[bool(k.group(12))]

    @classmethod
    def get_cousin_string(cls, string: str):
        '''Gets the full cousin string'''
        k = cls.cousin_matcher.search(string)
        if not k:
            return string
        # if k.group(0).startswith("parent's child"):
        #     span = k.span()
        #     return string[:span[0]] + " sibling's " + string[span[1]:]
        # Generations up/down are counted by occurrences of the literal words
        p = k.group(0).count(
            'parent')  # p = cls.get_cousin_parent_count(k)  # parent
        c = k.group(0).count(
            'child')  # c = cls.get_cousin_child_count(k)  # child
        if p < 2:
            # Make sure we're not just working on nieces/children/siblings
            return string
        if c == 1:
            # This is a variation on aunt/uncle
            if p <= 2:
                return string[:k.span()[0]] + "aunt/uncle" + string[k.span(
                )[1]:]
            # NOTE(review): when p == 3 this yields "... grand aunt/uncle"
            # with a double space before "grand" — confirm intended
            return string[:k.span(
            )[0]] + f"{'great ' * (p - 3)} grand aunt/uncle" + string[k.span(
            )[1]:]
        p -= 2
        c -= 2
        # x = min(c, p) + 1, y = |p - c| (standard cousin degree/removal)
        x = c + 1 if (c + 1) < p + 1 else p + 1  # nth cousin
        y = abs(p - c)  # y times removed
        if x < 1:
            return string
        if x == 1 and y == 0:
            return string[:k.span()[0]] + "cousin" + string[k.span()[1]:]
        # Ordinal suffix for the cousin degree (11/12/13 take "th")
        cousin_string = ""
        if str(x).endswith('1') and x != 11:
            cousin_string += f"{x}st cousin "
        elif str(x).endswith('2') and x != 12:
            cousin_string += f"{x}nd cousin "
        elif str(x).endswith('3') and x != 13:
            cousin_string += f"{x}rd cousin "
        else:
            cousin_string += f"{x}th cousin "
        if y == 0:
            return string[:k.span()[0]] + cousin_string.strip(
            ) + string[k.span()[1]:]
        return string[:k.span()[0]] + (cousin_string + {
            True: "1 time removed",
            False: f"{y} times removed"
        }[y == 1]).strip() + string[k.span()[1]:]

    @classmethod
    def sibling_cousin_remover(cls, string: str):
        '''Removes "sibling's nth cousin" to "nth cousin"'''
        k = cls.sibling_cousin_matcher.search(string)
        if not k:
            return string
        span = k.span()
        return string[:span[0]] + k.group(0).replace("sibling's ",
                                                     "") + string[span[1]:]

    @classmethod
    def simplify(cls, string: str):
        '''Runs the given input through the shortening operations
        a number of times so as to shorten the input
        to a nice family relationship string'''
        # Each pass re-applies an operation list a bounded number of times,
        # tracking the previous value to detect a fixed point.
        # NOTE(review): 'continue' here only skips the bookkeeping assignment;
        # the loops always run their full count — confirm 'break' wasn't meant
        before = string
        for i in range(10):
            for o in cls.pre_operations:
                string = o(string)
            if string == before:
                continue
            else:
                before = string
        for i in range(5):
            string = cls.get_cousin_string(string)
            if string == before:
                continue
            else:
                before = string
        for i in range(10):
            for o in cls.operations:
                string = o(string)
            if string == before:
                continue
            else:
                before = string
        for i in range(10):
            for o in cls.short_operations:
                string = o(string)
            if string == before:
                continue
            else:
                before = string
        for i in range(10):
            for o in cls.post_operations:
                string = o(string)
            if string == before:
                continue
            else:
                before = string
        for i in range(10):
            for o in cls.short_operations:
                string = o(string)
            if string == before:
                continue
            else:
                before = string
        return string
from pathlib import Path
from textwrap import dedent
from re import compile as _compile

import json
import requests
from jinja2 import Template

try:
    from packaging.version import parse
except ImportError:
    from pip._vendor.packaging.version import parse

# -- Code ----------------------------------------------------------------------

# Captures `__name__ = "value"` dunder assignments, one per line:
# group 1 = whole assignment, group 2 = dunder name, group 3 = value
DUNDER_REGEXP = _compile(r'(__(.*?)__ = "(.*?)")\n')


def read_data() -> Dict[str, str]:
    """ Read data from __versions__ py """
    # Path to the package's version module, relative to the repo root
    init = Path(".").parent / "deadlinks" / "__version__.py"

    if not Path(init).is_file():
        raise RuntimeError("Can not find source for deadlinks/__version__.py")

    # dunder name -> assigned value, e.g. {"version": "1.2.3"}
    values = dict()  # type: Dict[str, str]

    with open(str(init)) as fh:
        content = "".join(fh.readlines())
        for match in DUNDER_REGEXP.findall(content):
            # match = (full assignment, dunder name, quoted value)
            values[match[1]] = match[2]
#TODO: Lehvenstein tolerance, warn if within #TODO: No-warn #TODO: Print filename if mismatch flag ### Constants ARGPARSER = ArgumentParser(description=('reads paths to PTB files on stdin ' 'adds a suffix to the path and compares the resulting files ' 'for consistency')) ARGPARSER.add_argument('dep_format', choices=['conll', 'sd'], help='format of the output to sanity check') # It is a bit dirty to assume suffixing, but it makes thing a little easier ARGPARSER.add_argument('--dep_suffix', default=None, help=('suffix to add to ' 'the input file path to get the path to the resulting dependency file ' '(default: dep_format value)')) PTB_TOK_REGEX = _compile(r'\([^(]+?\ ([^(]+?)\)') PTB_ESCAPE_MAP = { '-LRB-': '(', '-RRB-': ')', '-LSB-': '[', '-RSB-': ']', '-LCB-': '{', '-RCB-': '}', #Apparently at least the CoNLL converter don't want this done #'``': '"', } ### #XXX: We preserve espaces inside tokens, for, well, others do it def _ptb_unescape(tok): if tok in PTB_ESCAPE_MAP:
from graphlab.connect.aws._ec2 import get_credentials as _get_credentials
import graphlab as _gl
import graphlab.connect as _mt

# since _predictive_service_environment imports these, need to have them defined first
_MAX_CREATE_TIMEOUT_SECS = 600 # 10m

from _predictive_service._predictive_service_environment import Ec2PredictiveServiceEnvironment as _Ec2PredictiveServiceEnvironment
from _predictive_service._predictive_service_environment import LocalPredictiveServiceEnvironment as _LocalPredictiveServiceEnvironment
from _predictive_service._file_util import parse_s3_path as _parse_s3_path, s3_recursive_delete as _s3_recursive_delete, s3_delete_key as _s3_delete_key
from _predictive_service._predictive_service import PredictiveService as _PredictiveService

_logger = _getLogger(__name__)

# Valid service names: ASCII letters and hyphens only (enforced on create)
_name_checker = _compile('^[a-zA-Z-]+$')

def create(name, environment, state_path, description = None, api_key = None, admin_key = None, ssl_credentials = None):
    '''
    Launch a Predictive Services cluster. This cluster can currently be launched
    on EC2 by specifying an EC2 environment.

    Parameters
    ----------
    name : str
        The name of the Predictive Service that will be launched. This string
        can only contain: a-z, A-Z and hyphens.

    environment : :class:`~graphlab.deploy.environment.EC2` or str
def compile(rule):
    """Compile a regex *rule*, tagging compilation errors with the rule text.

    Delegates to ``re.compile``; if that fails for any reason, re-raises as
    an ``Exception`` whose message appends the offending rule for context.
    """
    try:
        pattern = _compile(rule)
    except Exception as e:
        raise Exception(str(e) + ': ' + rule)
    return pattern