import re
import urllib


def get_info(Term, Subject):
	url = "https://ssbp.mycampus.ca/prod/bwckschd.p_get_crse_unsec?TRM=U&term_in=" + Term + "&sel_subj=dummy&sel_day=dummy&sel_schd=dummy&sel_insm=dummy&sel_camp=dummy&sel_levl=dummy&sel_sess=dummy&sel_instr=dummy&sel_ptrm=dummy&sel_attr=dummy&sel_subj=" + Subject + "&sel_crse=&sel_title=&sel_from_cred=&sel_to_cred=&sel_camp=UON&begin_hh=0&begin_mi=0&begin_ap=a&end_hh=0&end_mi=0&end_ap=a"
	htmltext = urllib.urlopen(url).read()
	regex = '<TH CLASS="ddheader" scope="col" >(.+?)<BR><BR></TH>'
	pattern = re.compile(regex)
	courses = re.split(pattern, htmltext)
	re.purge()
	for course in courses:
		regex = '<TD CLASS="dbdefault">(.+?)</TD>\n<TD CLASS="dbdefault">(.+?)</TD>\n<TD CLASS="dbdefault">(.+?) - (.+?)</TD>\n<TD CLASS="dbdefault">(.+?)</TD>\n<TD CLASS="dbdefault">(.+?)</TD>\n<TD CLASS="dbdefault">(.+?) - (.+?)</TD>\n<TD CLASS="dbdefault">(.+?)</TD>\n<TD CLASS="dbdefault">(.+?) \(<ABBR title= "Primary">P</ABBR>\)</TD>'
		regex2 = '<TD CLASS="dbdefault">(.+?)</TD>\n<TD CLASS="dbdefault">(.+?)</TD>\n<TD CLASS="dbdefault">(.+?) - (.+?)</TD>\n<TD CLASS="dbdefault">(.+?)</TD>\n<TD CLASS="dbdefault">(.+?)</TD>\n<TD CLASS="dbdefault">(.+?) - (.+?)</TD>\n<TD CLASS="dbdefault">(.+?)</TD>\n<TD CLASS="dbdefault"><ABBR title = "To Be Announced">(.+?)</ABBR></TD>'
		regex3 = '<TD CLASS="dbdefault">(.+?)</TD>\n<TD CLASS="dbdefault">(.+?)</TD>\n<TD CLASS="dbdefault"><ABBR title = "To Be Announced">(.+?)</ABBR></TD>\n<TD CLASS="dbdefault">(.+?)</TD>\n<TD CLASS="dbdefault"><ABBR title = "To Be Announced">(.+?)</ABBR></TD>\n<TD CLASS="dbdefault">(.+?) - (.+?)</TD>\n<TD CLASS="dbdefault">(.+?)</TD>\n<TD CLASS="dbdefault"><ABBR title = "To Be Announced">(.+?)</ABBR></TD>'
		regex4 = '<TD CLASS="dbdefault">(.+?)</TD>\n<TD CLASS="dbdefault">(.+?)</TD>\n<TD CLASS="dbdefault">(.+?) - (.+?)</TD>\n<TD CLASS="dbdefault">(.+?)</TD>\n<TD CLASS="dbdefault">(.+?)</TD>\n<TD CLASS="dbdefault">(.+?) - (.+?)</TD>\n<TD CLASS="dbdefault">(.+?)</TD>\n<TD CLASS="dbdefault">(.+?) \(<ABBR title= "Primary">P</ABBR>\)(.+)?</TD>'
		pattern = re.compile(regex)
		pattern2 = re.compile(regex2)
		pattern3 = re.compile(regex3)
		pattern4 = re.compile(regex4)
		entries = re.findall(pattern3, course)				#this pattern is for courses that do not have a start time or class assigned
		if entries:											 
			print entries
		else:
			entries = re.findall(pattern2, course)			#this pattern is for instructor TBA
			if entries:
				print entries
			else:
				entries = re.findall(pattern, course)		#this pattern is for default structure of courses
				if entries:
					print entries
				else:
					entries = re.findall(pattern4, course)	#this pattern returns two values for instructor
					print entries
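The scraper above hand-rolls a regex fallback chain: the most specific row pattern is tried first, then progressively looser ones, and the first that matches wins. A minimal standalone sketch of that idea (the helper name is mine, not part of the original):

import re

def first_match(text, patterns):
    # return the captures of the first pattern that matches, else None
    for pat in patterns:
        entries = re.findall(pat, text)
        if entries:
            return entries
    return None
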
Example #2
def main():
    times = {}
    html = urllib2.urlopen('http://example.webscraping.com/places/default/view/United-Kingdom-239').read()
    NUM_ITERATIONS = 1000 # number of times to test each scraper
    for name, scraper in ('Regular expressions', regex_scraper), ('Beautiful Soup', beautiful_soup_scraper), ('Lxml', lxml_scraper):
        times[name] = []
        # record start time of scrape
        start = time.time()
        for i in range(NUM_ITERATIONS):
            if scraper == regex_scraper:
                # the regular expression module will cache results
                # so need to purge this cache for meaningful timings
                re.purge() 
            result = scraper(html)

            # check scraped result is as expected
            assert(result['area'] == '244,820 square kilometres')
            times[name].append(time.time() - start)
        # record end time of scrape and output the total
        end = time.time()
        print('{}: {:.2f} seconds'.format(name, end - start))

    writer = csv.writer(open('times.csv', 'w'))
    header = sorted(times.keys())
    writer.writerow(header)
    for row in zip(*[times[scraper] for scraper in header]):
        writer.writerow(row)
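The purge matters because re.findall() and friends transparently cache compiled patterns, so without it only the first iteration would pay the compile cost. A self-contained sketch of the effect (pattern and text are arbitrary):

import re
import time

def time_findall(purge):
    start = time.time()
    for _ in range(10000):
        if purge:
            re.purge()  # drop the compiled-pattern cache each round
        re.findall(r'\d+', 'abc 123 def 456')
    return time.time() - start

print('cached: %.3f s' % time_findall(False))
print('purged: %.3f s' % time_findall(True))
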
Example #3
def finditer(content, encodings, charset, min_size):
    '''Generator function that iterates over all string matches inside the given content which are at least
    min_size characters long.

    @param    content    Binary content to search in
    @param    encodings  Dictionary of encoding functions
    @param    charset    An interable object containing the characters to consider as part of a string
    @param    min_size   Minimal string size to consider as a string match

    @return A tuple containing the match offset in content, encoding name, encoding key and the deobfuscated
            string reconstructed from the blob found
    '''

    # iterate over available encoding fucntions
    for encoding_name, (encoding_function, encoding_range) in encodings.items():

        # iterate over all keys in range for that encoding function
        for key in encoding_range:
            encoded_charset = encoding_function(charset, key)

            pattern = '[%s]{%d,}' % (re.escape(encoded_charset), min_size)

            for match in re.finditer(pattern, content):
                # deobfuscation: reconstruct the original string
                deobf = ''.join(charset[encoded_charset.index(c)] for c in match.group(0))

                yield (match.start(0), encoding_name, key, deobf)

        # cleanup regex cache once in a while
        re.purge()
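To see the generator in action, one can wire up a toy single-byte XOR "encoding" (everything below -- the encoding table, charset and blob -- is invented for illustration; nearby keys may also yield false-positive decodings, which the caller would have to filter):

import string

def xor_encode(charset, key):
    # toy 'encoding': XOR every charset character with the key byte
    return ''.join(chr(ord(c) ^ key) for c in charset)

charset = string.ascii_lowercase + ' '
encodings = {'xor': (xor_encode, range(1, 16))}
blob = '\x01\x02' + xor_encode('secret message', 5) + '\x03'
for offset, name, key, text in finditer(blob, encodings, charset, min_size=6):
    print(offset, name, key, text)
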
Example #4
def clear_cache(self):
    try:
        re.purge()
        dircache.reset()
        tiedobj.reset()
    except Exception, err:
        sys.stderr.write('Crond.clear_cache(): %s\n' % err)
Example #5
def dash_R_cleanup(fs, ps, pic):
    import gc, copy_reg
    import _strptime, linecache, dircache
    import urlparse, urllib, urllib2, mimetypes, doctest
    import struct, filecmp
    from distutils.dir_util import _path_created

    # Restore some original values.
    warnings.filters[:] = fs
    copy_reg.dispatch_table.clear()
    copy_reg.dispatch_table.update(ps)
    sys.path_importer_cache.clear()
    sys.path_importer_cache.update(pic)

    # Clear assorted module caches.
    _path_created.clear()
    re.purge()
    _strptime._regex_cache.clear()
    urlparse.clear_cache()
    urllib.urlcleanup()
    urllib2.install_opener(None)
    dircache.reset()
    linecache.clearcache()
    mimetypes._default_mime_types()
    struct._cache.clear()
    filecmp._cache.clear()
    doctest.master = None

    # Collect cyclic trash.
    gc.collect()
Example #6
def color(adjoining_words_i, data, balises):
	"""Highlights contiguous groups of words in a web page."""
	n = len(adjoining_words_i) + 1
	
	# start with the longest groups
	for i in xrange( n, 1, -1 ):
		
		# for each group of words
		for j in adjoining_words_i[i]:
			
			text = u'(\A|\W)(%s)(\W|\Z)'%( string.join([j[0][k] for k in range(0,i)] ,'(?:(?:</span>\W?)|\W)') )
			pattern1 = re.compile(text, re.I|re.U|re.S)		
			replace = u'\g<1><span class="%s" style="color:blue; background-color:grey;">\g<2></span>\g<3>'%(string.join(j[0],""))
			data = pattern1.sub(replace, data)
	
	re.purge()
	# look up the saved positions of the tags
	data_color = u''	
	flag3 = re.compile( u'#([0-9]+?)#', re.I|re.U|re.S )
	m = flag3.finditer( data )
	k = 0
	
	# put the tags back into the string
	for j in m:
		data_color += data[k:j.start()] + balises[j.group(1)]
		k = j.end()
		
	data_color += data[k:]
	
	return data_color
Example #7
def retrieve_devpaths():
    pipe = Popen('si projectinfo --devpaths --noacl --noattributes --noshowCheckpointDescription --noassociatedIssues --project="%s"' % sys.argv[1], shell=True, bufsize=1024, stdout=PIPE)
    devpaths = pipe.stdout.read()
    devpaths = devpaths[1:]
    devpaths_re = re.compile('    (.+) \(([0-9][\.0-9]+)\)\n')
    devpath_col = devpaths_re.findall(devpaths)
    re.purge()
    devpath_col.sort(key=lambda x: map(int, x[1].split('.'))) #order development paths by version
    return devpath_col
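Note that the sort key relies on Python 2 semantics, where map() returns a list; under Python 3, map() returns an iterator and the sort would raise a TypeError. A Python-3-safe version of the same version-number sort:

versions = ['1.10', '1.2', '2.0.1']
versions.sort(key=lambda v: tuple(int(part) for part in v.split('.')))
print(versions)  # -> ['1.2', '1.10', '2.0.1']
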
Example #8
    def get_skips(self, line):
        skip_points = []
        for r in self.skip_rules:
            pattern = '('+r[0]+')('+r[1]+')'
            matchobjs = re.finditer(pattern, line)
            for i in matchobjs:
                skip_points.append(i.end() )

        re.purge()
        return skip_points
Example #9
    def get_breaks(self, line):
        break_points = []
        for r in self.break_rules:
            pattern = '('+r[0]+')('+r[1]+')'
            matchobjs = re.finditer(pattern, line)
            for i in matchobjs:
                break_points.append(i.end() )

        re.purge()
        return break_points
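In both methods each rule is a (pattern, lookahead) pair, and the end offset of every match is collected. The same logic, exercised standalone with an invented rule set:

import re

break_rules = [(r'\.', r'\s'), (r';', r'\s')]
line = 'First part. Second part; third.'
break_points = []
for left, right in break_rules:
    for m in re.finditer('(' + left + ')(' + right + ')', line):
        break_points.append(m.end())
print(sorted(break_points))  # -> [12, 25]
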
Example #10
File: re.py Project: yoeo/pyhow
def purge():
    """re.purge: Purge internal regular expressions cache."""

    def _cache_empty():
        return not getattr(re, '_cache')

    re.match('', '')
    cache_created = not _cache_empty()
    re.purge()
    return cache_created and _cache_empty() and "empty cache"
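The check above pokes at re._cache, which is a CPython implementation detail rather than a public API. A direct way to watch that cache fill and drain (counts assume CPython; other implementations may differ):

import re

re.purge()
print(len(re._cache))   # 0
re.match(r'\d+', '42')
print(len(re._cache))   # 1 (CPython caches the compiled pattern)
re.purge()
print(len(re._cache))   # 0 again
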
Example #11
    def test_regex_equality_nocache(self):
        pattern = r'^(?:[a-z0-9\.\-]*)://'
        left = RegexValidator(pattern)
        re.purge()
        right = RegexValidator(pattern)

        self.assertEqual(
            left,
            right,
        )
Example #12
def check(self, pattern):
    self.model.clear()
    if not pattern:
        return False
    try:
        re.compile(pattern, self.insertFlags())
        re.purge()
        return True
    except re.error as rerr:
        self.model.showError(str(rerr))
        return False
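The same compile-then-purge pairing works as a standalone validity check for user-typed patterns, keeping throwaway probe patterns out of the module cache. A minimal sketch:

import re

def is_valid_pattern(pattern, flags=0):
    if not pattern:
        return False
    try:
        re.compile(pattern, flags)
        re.purge()  # don't let probe patterns pile up in the cache
        return True
    except re.error:
        return False

print(is_valid_pattern(r'[a-z]+'))  # True
print(is_valid_pattern(r'[a-z+'))   # False (unterminated character set)
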
Example #13
def getRegexpFeatures(dct, number_of_words_per_type, number_of_words, select = None):
    it = list()
    for (mt, sen) in dct.iteritems():
        it.append((len(sen), mt, sen))
    it.sort(reverse=False)
    itt = list()
    for (l, mt, sen) in it:
        random.shuffle(sen)
        itt.append((l, mt, sen[0:1000]))
    regexps = dict()
    ret = list()
    types = list()
    for (_, meme, _sentences) in itt:
        types.extend([meme for _ in _sentences])
    types = [types]
    #glob = regExpChooser()
    #glob.add_types(types)
    for (_, meme_type, sentences) in it:
        if select != None and meme_type != select:
            continue
        regexps[meme_type] = cluster(sentences, meme_type)
        N = len(regexps[meme_type])
        n = 0
        start = time.time()
        loc = regExpChooser()
        loc.add_types(replaceNotEqual(types, meme_type))
        for regexp in regexps[meme_type]:
            re.purge()
            n += 1
            sys.stdout.write(
                "\r[{0}] {1}/{2} RE in {3} s. ({4})".format(
                    meme_type,
                    n,
                    N,
                    round(time.time() - start),
                    regexp
                ))
            sys.stdout.flush()
            compiled = re.compile(regexp)
            search_result = list()
            for (_, meme, _sentences) in itt: 
                for sent in _sentences:
                    search_result.append(
                        1 if compiled.search(sent.lower()) != None else 0)
            loc.add_regexp(regexp, search_result)
            #glob.add_regexp(regexp, search_result)
        selection = loc.getBest(number_of_words_per_type)
        ret.extend(selection)
        print("\r[{0}] Regular expressions selected in {1} seconds. (best: {2})".format(
            meme_type,
            time.time() - start,
            selection[0])
        )
    #ret.extend(glob.getBest(number_of_words))
    return ret
Example #14
def markdownify_content(self):
    self.content = re.sub(r'({{% question) "(.*)"(\s*%}})', r'### \2', self.content)
    re.purge()
    self.content = re.sub(r'{{< relref "(\w*)\.md[#\w\-éèà]*"\s*>}}\s*', r'\1', self.content)
    re.purge()
    self.content = re.sub(r'{{% (\w*) "(.*)" *%}}([\s\S]*?){{% \/\1 %}}', r'*\2*\3', self.content)
    re.purge()
    self.content = re.sub(r'\* Exemple : <.*\)', r'', self.content)
    re.purge()
    self.content = re.sub(r'(#+)\s', r'\1# ', self.content)
    re.purge()
Example #15
def bench_regex_compile(loops, regexes):
    range_it = xrange(loops)
    t0 = perf.perf_counter()

    for _ in range_it:
        for regex, flags in regexes:
            re.purge()
            # ignore result (compiled regex)
            re.compile(regex, flags)

    return perf.perf_counter() - t0
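The purge inside the loop forces re.compile() to do real work on every pass; without it, all but the first compile of each pattern would be a cache lookup. A self-contained runner in the same spirit, using time.perf_counter (assumes Python 3; the regex list is invented):

import re
import time

def bench(loops, regexes):
    t0 = time.perf_counter()
    for _ in range(loops):
        for regex, flags in regexes:
            re.purge()  # force a real compile, not a cache hit
            re.compile(regex, flags)
    return time.perf_counter() - t0

print(bench(100, [(r'\w+@\w+\.\w+', 0), (r'(foo|bar)+', re.I)]))
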
Example #16
def remove_links(s, replace_by):
    # strip URLs like www.algo.com/djj
    re.purge()
    temp = re.compile(r"\s*www\.\. \w+\.(com|net|me|org)?(\s|/*[-\w+&@#/%!?=~_:.\[\]()0-9]*)")
    s = temp.sub(replace_by, s)
    # strip http:// links
    temp = re.compile(r"(((http|ftp|https)://\. |(http|ftp|https)://\.)[-/\w.]*)")
    s = temp.sub(replace_by, s)
    temp = re.compile(r"\w+/\w")
    s = temp.sub(replace_by, s)
    return s
Example #17
def test_regex_compile(count, timer):
    regexes = capture_regexes()
    times = []

    for _ in xrange(count):
        t0 = timer()
        for regex, flags in regexes:
            re.purge()
            re.compile(regex, flags)
        t1 = timer()
        times.append(t1 - t0)
    return times
Example #18
def process(self, context, collection):
    '''
        Process collection, send names to rename and shared sort.
    '''

    # compare
    compare = []

    # clean
    clean = []

    # clean duplicates
    for name in collection:

        # remove duplicates
        if name[3][0] not in compare:

            # append
            compare.append(name[3][0])
            clean.append(name)

    # done with collection
    collection.clear()

    # name
    for i, name in enumerate(clean):
        rename(self, context, name, i)

    # randomize names (prevents conflicts)
    for name in clean:

        # randomize name
        name[3][0].name = str(random())

    # is shared sort or shared count
    if context.window_manager.BatchShared.sort or context.window_manager.BatchShared.count:

        # sort
        shared.main(self, context, clean, context.window_manager.BatchShared)

    # isn't shared sort or shared count
    else:

        # apply names
        for name in clean:
            name[3][0].name = name[1]

            # count
            if name[1] != name[2]:
                self.count += 1

    # purge re
    re.purge()
Example #19
def dash_R_cleanup(fs, ps, pic, zdc, abcs):
    import gc, copy_reg
    import _strptime, linecache
    dircache = test_support.import_module('dircache', deprecated=True)
    import urlparse, urllib, urllib2, mimetypes, doctest
    import struct, filecmp
    from distutils.dir_util import _path_created

    # Clear the warnings registry, so they can be displayed again
    for mod in sys.modules.values():
        if hasattr(mod, '__warningregistry__'):
            del mod.__warningregistry__

    # Restore some original values.
    warnings.filters[:] = fs
    copy_reg.dispatch_table.clear()
    copy_reg.dispatch_table.update(ps)
    sys.path_importer_cache.clear()
    sys.path_importer_cache.update(pic)
    try:
        import zipimport
    except ImportError:
        pass # Run unmodified on platforms without zipimport support
    else:
        zipimport._zip_directory_cache.clear()
        zipimport._zip_directory_cache.update(zdc)

    # clear type cache
    sys._clear_type_cache()

    # Clear ABC registries, restoring previously saved ABC registries.
    for abc, registry in abcs.items():
        abc._abc_registry = registry.copy()
        abc._abc_cache.clear()
        abc._abc_negative_cache.clear()

    # Clear assorted module caches.
    _path_created.clear()
    re.purge()
    _strptime._regex_cache.clear()
    urlparse.clear_cache()
    urllib.urlcleanup()
    urllib2.install_opener(None)
    dircache.reset()
    linecache.clearcache()
    mimetypes._default_mime_types()
    filecmp._cache.clear()
    struct._clearcache()
    doctest.master = None

    # Collect cyclic trash.
    gc.collect()
Example #20
def dash_R_cleanup(fs, ps, pic, abcs):
    import gc, copy_reg
    import _strptime, linecache
    dircache = test_support.import_module('dircache', deprecated=True)
    import urlparse, urllib, urllib2, mimetypes, doctest
    import struct, filecmp
    from distutils.dir_util import _path_created

    # Clear the warnings registry, so they can be displayed again
    for mod in sys.modules.values():
        if hasattr(mod, '__warningregistry__'):
            del mod.__warningregistry__

    # Restore some original values.
    warnings.filters[:] = fs
    copy_reg.dispatch_table.clear()
    copy_reg.dispatch_table.update(ps)
    sys.path_importer_cache.clear()
    sys.path_importer_cache.update(pic)

    # clear type cache
    sys._clear_type_cache()

    # Clear ABC registries, restoring previously saved ABC registries.
    for abc, registry in abcs.items():
        abc._abc_registry = registry.copy()
        abc._abc_cache.clear()
        abc._abc_negative_cache.clear()

    # Clear assorted module caches.
    _path_created.clear()
    re.purge()
    _strptime._regex_cache.clear()
    urlparse.clear_cache()
    urllib.urlcleanup()
    urllib2.install_opener(None)
    dircache.reset()
    linecache.clearcache()
    mimetypes._default_mime_types()
    filecmp._cache.clear()
    struct._clearcache()
    doctest.master = None

    if _llvm:
        code_types = (types.CodeType, types.FunctionType, types.MethodType)
        for obj in gc.get_objects():
            if isinstance(obj, code_types):
                _llvm.clear_feedback(obj)

    # Collect cyclic trash.
    gc.collect()
Example #21
def wrap_pieces_in_text(text, ordered_cont_pieces):
	text_length = len(text)
	#if text:
		#print('wp text in ok')
	try: 
	
		try:
			re.purge()
			opener_segment = get_segment(text, ordered_cont_pieces[0][0], ordered_cont_pieces[0][1])
		except Exception as e:
			#print('wp_openseg error')
			raise e
		try:
			re.purge()
			closer_segment = get_segment(text, ordered_cont_pieces[-1][0], ordered_cont_pieces[-1][1])
		except Exception as e:
			#print('wp_closeg error')
			raise e
		# Maybe some more checking in case there's some shit at the top/bottom? -- i.e. check
		# by length or content?	
		
		try:
			if ordered_cont_pieces[-1][1] > text_length * 0.7 and '<salute>' in closer_segment:
				text = re.sub(closer_segment, fix_closer_wraps(closer_segment), text)
		except Exception as e:
			#print('wp closersub failed')
			raise e

		try:
			text = re.sub(opener_segment, fix_opener_wraps(opener_segment), text)	
		except Exception as e:
			#print('wp openersub failed')
			print(e)
	
		#print('wp_ index error not triggered')
	except IndexError: # presumably from fail if there is only one segment identified
		#print('wp_index error')
		opener_segment = get_segment(text, ordered_cont_pieces[0][0], ordered_cont_pieces[0][1])
		text = re.sub(opener_segment, fix_opener_wraps(opener_segment), text)
	except Exception as e:

		#print('wp_general exception', e)
		raise e
	# Remove all remaining temps
	text = re.sub(r'<TEMP>','',text)
	text = re.sub(r'</TEMP>','',text)
	#print(text)
	return text
Example #22
def findPrice(product, logger, host):
    product_url = "/dp/" + product.id
    
    conn = http.client.HTTPConnection(host)
    conn.request("GET", product_url)
    r1 = conn.getresponse()
    dataRep = r1.read().decode("UTF-8")
    conn.close()
    shortRep = re.findall("<span.*priceblock_.*/span>",dataRep)[-1]
    shortRep = parsePrice(shortRep)
    re.purge()
    price = float(shortRep)
    logger.info(str(product) + " :: " + shortRep)
    if(product.setPrice(price)):
        logger.info("New price on product " + str(product) + " at " + str(product.price) + " Link : http://" + host + "/dp/" + str(product.id))
Example #23
def _replaceBrackets(self, string):
    """
    Resolves property variables within a string.
    :param string:
    :return:
    """
    m = re.findall(self.regEx, str(string))
    if m:
        for key in m:
            value = self.getItem(key, self.rawParameters)
            if re.findall(self.regEx, str(value)):
                value = self._replaceBrackets(value)
            string = string.replace('[' + key + ']', value)
        re.purge()
    gc.collect()
    return string
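A standalone sketch of the same recursive bracket-resolution idea (the regex, parameter store and function name here are invented for illustration):

import re

PARAMS = {'host': 'db.local', 'url': 'https://[host]/api'}
BRACKET = r'\[([\w.]+)\]'

def resolve(value):
    # recursively replace [key] references with their parameter values
    for key in re.findall(BRACKET, value):
        value = value.replace('[' + key + ']', resolve(PARAMS[key]))
    return value

print(resolve(PARAMS['url']))  # -> https://db.local/api
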
Example #24
    def format(self, data, format, filter):
        """
        Processes the text field collected by the spider.

        :param format: The expected format.
        :param filter: The output format.
        """

        if type(filter) == int:
            filter = "\\" + str(filter)

        _result = re.subn(format, filter, data)
        _data = _result[0] if _result[1] >= 1 else ""

        re.purge()
        return _data
Example #25
def dash_R_cleanup(fs, ps, pic, abcs):
    import gc, copyreg
    import _strptime, linecache
    import urllib.parse, urllib.request, mimetypes, doctest
    import struct, filecmp, _abcoll
    from distutils.dir_util import _path_created
    from weakref import WeakSet

    # Clear the warnings registry, so they can be displayed again
    for mod in sys.modules.values():
        if hasattr(mod, '__warningregistry__'):
            del mod.__warningregistry__

    # Restore some original values.
    warnings.filters[:] = fs
    copyreg.dispatch_table.clear()
    copyreg.dispatch_table.update(ps)
    sys.path_importer_cache.clear()
    sys.path_importer_cache.update(pic)

    # clear type cache
    sys._clear_type_cache()

    # Clear ABC registries, restoring previously saved ABC registries.
    for abc in [getattr(_abcoll, a) for a in _abcoll.__all__]:
        if not isabstract(abc):
            continue
        for obj in abc.__subclasses__() + [abc]:
            obj._abc_registry = abcs.get(obj, WeakSet()).copy()
            obj._abc_cache.clear()
            obj._abc_negative_cache.clear()

    # Clear assorted module caches.
    _path_created.clear()
    re.purge()
    _strptime._regex_cache.clear()
    urllib.parse.clear_cache()
    urllib.request.urlcleanup()
    linecache.clearcache()
    mimetypes._default_mime_types()
    filecmp._cache.clear()
    struct._clearcache()
    doctest.master = None

    # Collect cyclic trash.
    gc.collect()
Example #26
    def trace_memory_clean_caches(self):
        """ Avoid polluting results with some builtin python caches """

        urlparse.clear_cache()
        re.purge()
        linecache.clearcache()
        copy_reg.clear_extension_cache()

        if hasattr(fnmatch, "purge"):
            fnmatch.purge()  # pylint: disable=no-member
        elif hasattr(fnmatch, "_purge"):
            fnmatch._purge()

        if hasattr(encodings, "_cache") and len(encodings._cache) > 0:
            encodings._cache = {}

        context.log.handler.flush()
Example #27
def get_segment(text, s, e):
	try:
		regex = r'[\s\S]*'
		re.purge()
		pattern = re.compile(regex)
		#print(pattern)
		
		try:
			segment = pattern.search(text,s,e).group()
			
			return segment
		except Exception as e:
			#print('getseg regex err -', e)
			raise e
		
	except Exception as e:
		#print('getseg error', e)
		raise e
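The helper relies on the pos and endpos arguments of Pattern.search() to pull out a slice; a compacted equivalent (without the original's nested error handling):

import re

def get_segment(text, start, end):
    # [\s\S]* matches anything, so this just grabs text[start:end]
    return re.compile(r'[\s\S]*').search(text, start, end).group()

print(get_segment('abcdefgh', 2, 5))  # -> cde
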
Example #28
def set(cls, ssquo=None, esquo=None, sdquo=None, edquo=None, dir=None):
    """
    Set the HTML entities (and indirectly, the Unicode glyphs) used to
    represent starting and ending single and double quotes, respectively,
    and language direction.
    """
    if ssquo is not None:
        cls.SSQUO = ssquo
    if esquo is not None:
        cls.ESQUO = esquo
    if sdquo is not None:
        cls.SDQUO = sdquo
    if edquo is not None:
        cls.EDQUO = edquo
    if dir is not None:
        if cls.direction != dir:
            re.purge()
        cls.direction = dir
        cls.direction_explicit = True
Example #29
    def run():
        responses.add(responses.GET, "http://example.com/zero")
        responses.add(responses.GET, "http://example.com/one")
        responses.add(responses.GET, "http://example.com/two")
        responses.add(responses.GET, re.compile(r"http://example\.com/three"))
        responses.add(responses.GET, re.compile(r"http://example\.com/four"))
        re.purge()
        responses.remove(responses.GET, "http://example.com/two")
        responses.remove(Response(method=responses.GET, url="http://example.com/zero"))
        responses.remove(responses.GET, re.compile(r"http://example\.com/four"))

        with pytest.raises(ConnectionError):
            requests.get("http://example.com/zero")
        requests.get("http://example.com/one")
        with pytest.raises(ConnectionError):
            requests.get("http://example.com/two")
        requests.get("http://example.com/three")
        with pytest.raises(ConnectionError):
            requests.get("http://example.com/four")
Example #30
def addItem(self):
    """Add Items from Locate command."""
    start_time = datetime.now().second
    self.stringlist.clear()
    lineText = self.lineEdit.text()
    if len(lineText) and str(lineText).strip() not in self.history:
        self.history.append(lineText + "\n")
        self.historyCurrentItem = 1
        self.saveHistory()
    self.historyCurrentItem = self.historyCurrentItem - 1
    command = "ionice --ignore --class 3 chrt --idle 0 "  # Nice CPU / IO
    command += "locate --ignore-case --existing --quiet --limit 9999 {}"
    condition = str(self.applet.configurations.readEntry("Home")) == "true"
    if len(str(lineText).strip()) and condition:
        command_to_run = command.format(  # Only search inside Home folders
            path.join(path.expanduser("~"), "*{}*".format(lineText)))
    else:
        command_to_run = command.format(lineText)
    locate_output = Popen(command_to_run, shell=True, stdout=PIPE).stdout
    results = tuple(locate_output.readlines())
    banned = self.applet.configurations.readEntry("Banned")
    banned_regex_pattern = str(banned).strip().lower().replace(" ", "|")
    for item in results:
        if not search(banned_regex_pattern, str(item)):  # banned words
            self.stringlist.append(item[:-1])
    purge()  # purge the regex cache
    self.model.setStringList(self.stringlist)
    self.treeview.nativeWidget().resizeColumnToContents(0)
    number_of_results = len(results)
    if number_of_results:  # if items found, focus on the item list
        self.lineEdit.nativeWidget().clear()
        self.label.setText("Found {} results in {} seconds!".format(
            number_of_results, abs(datetime.now().second - start_time)))
        self.resize(500, 12 * number_of_results)
        self.treeview.nativeWidget().show()
        self.treeview.nativeWidget().setFocus()
    else:  # if no items found, focus on the LineEdit
        self.label.setText("Search")
        self.resize(self.minimumSize())
        self.treeview.nativeWidget().hide()
        self.lineEdit.nativeWidget().selectAll()
        self.lineEdit.nativeWidget().setFocus()
Example #31
def beautiful_soup_scraper(html):
    soup = BeautifulSoup(html)
    results = {}
    for field in FIELDS:
        results[field] = soup.find('help')
    return results


def lxml_scraper(html):
    tree = lxml.html.fromstring(html)
    results = {}
    for field in FIELDS:
        results[field] = tree.cssselect('help')
    return results


times = {}
html = '''<body>help</body>'''
for name, scraper in ('Regular expressions',
                      regex_scraper), ('Beautiful Soup',
                                       beautiful_soup_scraper), ('Lxml',
                                                                 lxml_scraper):
    times[name] = []
    start = time.time()
    for i in range(1000):
        if scraper == regex_scraper:
            # the re module caches compiled patterns,
            # so purge the cache for meaningful timings
            re.purge()
        result = scraper(html)
        times[name].append(time.time() - start)  # record elapsed time so far
    end = time.time()
    print '{}: {:.2f} seconds'.format(name, end - start)
Example #32
def purge():
    """Purge caches."""

    _purge_cache()
    _re.purge()
Example #33
def regular():
    data = "She is more than pretty. 520"

    # --- Regex syntax ---
    reg = r"mo"  # literal characters => span=(7, 9), match='mo'
    reg = r"."  # (.) any single character => span=(0, 1), match='S'
    reg = r"\."  # (\) escape character => span=(23, 24), match='.'
    reg = r"[.]"  # ([]) character set (note: some special characters lose their special meaning inside it) => span=(23, 24), match='.'
    reg = r"[love]"  # any character listed inside [] => span=(2, 3), match='e'
    reg = r"[i-u]"  # (-) range => span=(4, 5), match='i'
    reg = r"t{2}"  # {} exact repeat count => span=(20, 22), match='tt'
    reg = r"t{1,3}"  # {M,} / {,N} / {N} => span=(12, 13), match='t'
    reg = r"(i|o|u){1}"  # (()) group => span=(4, 5), match='i'
    reg = r"^S"  # (^) start of string => span=(0, 1), match='S'
    reg = r"[^S]"  # ([^]) negated set (anything but 'S') => span=(1, 2), match='h'
    reg = r"520$"  # ($) end of string => span=(25, 28), match='520'
    reg = r"et*"  # (*) matches {0,} repetitions => ['e', 'e', 'ett']
    reg = r"et+"  # (+) matches {1,} repetitions => ['ett']
    reg = r"et?"  # (?) matches {0,1} repetitions => ['e', 'e', 'et']
    reg = r".+?e"  # (?) non-greedy mode (greedy: span=(0, 20), match='She is more than pre' => non-greedy: span=(0, 3), match='She')

    reg = r"\145"  # octal ASCII code (octal 145 = decimal 101 = 'e') => span=(2, 3), match='e'
    reg = r"\d"  # (\d) digit => span=(25, 26), match='5' (equivalent: [0-9])
    reg = r"\D"  # (\D) non-digit => span=(0, 1), match='S' (equivalent: [^0-9])
    reg = r"\s"  # (\s) whitespace => span=(3, 4), match=' ' (equivalent: [ \t\n\r\f\v])
    reg = r"\S"  # (\S) non-whitespace => span=(0, 1), match='S' (equivalent: [^ \t\n\r\f\v])
    reg = r"\w"  # (\w) word character => span=(0, 1), match='S' (equivalent: [a-zA-Z0-9_])
    reg = r"\W"  # (\W) non-word character => span=(3, 4), match=' ' (equivalent: [^a-zA-Z0-9_])
    reg = r"\AS"  # (\A) start of string => span=(0, 1), match='S'
    reg = r"520\Z"  # (\Z) end of string => span=(25, 28), match='520'
    reg = r"y\b"  # (\b) word boundary => span=(22, 23), match='y'
    reg = r"o\B"  # (\B) non-word-boundary => span=(8, 9), match='o'
    reg = r"[01]\d\d|2[0-4]\d|25[0-5]"  # alternation (|) over multi-digit numbers (matches 0 - 255)

    index = re.search(reg, data)  # find the first match
    index = re.match(r"She", data)  # match at the start => span=(0, 3), match='She'
    index = re.fullmatch(
        r".+",
        data)  # match the whole string => span=(0, 28), match='She is more than pretty. 520'

    lists = re.findall(reg, data)  # find all matches (list)
    lists = re.split(
        r"o", data, maxsplit=1
    )  # split on the regex (maxsplit: max number of splits) => ['She is m', 're than pretty. 520']

    strs = re.sub(
        r"\.", r"!", data,
        count=1)  # substitute (count: number of replacements; unmatched text is kept as-is) => She is more than pretty! 520

    re.purge()  # clear the regular expression cache

    # --- Compiled pattern objects ---
    pat = re.compile(r"e")  # compile into a pattern object

    index = pat.search(data)  # find the first match => span=(2, 3), match='e'
    index = pat.search(data, 5)  # => span=(10, 11), match='e'
    index = pat.search(data, 1, 10)
    index = pat.match(data)  # match at the start => None
    index = pat.match(data, 2)  # => span=(2, 3), match='e'
    index = pat.match(data, 1, 10)
    index = pat.fullmatch(data)  # match the whole string => None
    index = pat.fullmatch(data, 2)  # => None
    index = pat.fullmatch(data, 2, 3)  # span=(2, 3), match='e'

    lists = pat.split(
        data, maxsplit=0)  # split => ['Sh', ' is mor', ' than pr', 'tty. 520']
    lists = pat.findall(data)  # find all => ['e', 'e', 'e']
    lists = pat.findall(data, 5)  # => ['e', 'e']
    lists = pat.findall(data, 1, 10)  # => ['e']

    strs = pat.sub(r"o", data, count=0)  # substitute => Sho is moro than protty. 520

    # --- Match objects ---
    match = index
    # span=(2, 3), match='e'
    strs = match.string  # the string that was searched => She is more than pretty. 520
    strs = match.group()  # the matched text => e
    pos = match.pos  # => 2
    pos = match.endpos  # => 3
Example #34
def all():
    """Translation of the entire DocFX project"""
    global processed
    global greenFlag
    global reqs
    global chars
    RepoCheck()
    while not processed:
        if greenFlag:
            for item in Path().iterdir():
                if item.name != sourceDir and item.name in list(map(lambda x: '_'.join(x.split('-')).lower(), targetLangs)) and item.is_dir():
                    shutil.rmtree(item.name)
            processed = True
        if greenFlag:
            for path in targetPaths:
                os.mkdir(path)
        for entry in Path(sourceDir).iterdir():
            if entry.is_dir():
                dirLevel2 = sourceDir + '/' + entry.name
                if greenFlag:
                    for path in targetPaths:
                        os.mkdir(path + '/' + entry.name)
                for entry2 in Path(dirLevel2).iterdir():
                    if entry2.is_dir():
                        tgSeg = '/' + entry.name + '/' + entry2.name
                        dirLevel3 = dirLevel2 + '/' + entry2.name
                        if greenFlag:
                            for path in targetPaths:
                                os.mkdir(path + tgSeg)
                        for entry3 in Path(dirLevel3).iterdir():
                            if entry3.is_dir():
                                tgSeg = '/' + entry.name + '/' + entry2.name + '/' + entry3.name
                                dirLevel4 = dirLevel3 + '/' + entry3.name
                                if greenFlag:
                                    for path in targetPaths:
                                        os.mkdir(path + tgSeg)
                                for entry4 in Path(dirLevel4).iterdir():
                                    if entry4.is_dir():
                                        tgSeg = '/' + entry.name + '/' + entry2.name + '/' + entry3.name + '/' + entry4.name
                                        dirLevel5 = dirLevel4 + '/' + entry4.name
                                        if greenFlag:
                                            for path in targetPaths:
                                                os.mkdir(path + tgSeg)
                                        for entry5 in Path(dirLevel5).iterdir():
                                            if not entry5.is_dir():
                                                if greenFlag:
                                                    ProcessFiles(dirLevel5 + '/' + entry5.name)
                                                else:
                                                    stats.append(FileStats(dirLevel5 + '/' + entry5.name))
                                    else:
                                        if greenFlag:
                                            ProcessFiles(dirLevel4 + '/' + entry4.name)
                                        else:
                                            stats.append(FileStats(dirLevel4 + '/' + entry4.name))
                            else:
                                if greenFlag:
                                    ProcessFiles(dirLevel3 + '/' + entry3.name)
                                else:
                                    stats.append(FileStats(dirLevel3 + '/' + entry3.name))
                    else:
                        if greenFlag:
                            ProcessFiles(dirLevel2 + '/' + entry2.name)
                        else:
                            stats.append(FileStats(dirLevel2 + '/' + entry2.name))
            else:
                if greenFlag:
                    ProcessFiles(sourceDir + '/' + entry.name)
                else:
                    stats.append(FileStats(sourceDir + '/' + entry.name))
        if not greenFlag:
            fls = list(filter(lambda x: x is not None, stats))
            nFls = len(fls)*len(targetLangs)
            print('\n Target languages:\t\t\t' + ', '.join(targetLangs))
            print(' Total of source language files:\t' + str(len(fls)))
            for i in range(len(fls)):
                reqs += fls[i][0]
                chars += fls[i][1]
            estimatedT = int(reqs * 1.3)
            print(' Total of source language characters:\t' + str(chars))
            print(' Total of files to be generated:\t' + str(nFls))
            print(' Total of calls to translation service:\t' + str(reqs))
            print(' Total of characters for translation:\t' + str(chars * len(targetLangs)))
            print(' Estimated process duration:\t\t' + str(datetime.timedelta(seconds = estimatedT)))
            cont = input('\n Continue [c] or abort [Enter]? ')
            if cont == 'c':
                greenFlag = True
            else:
                break
    if greenFlag and not len(haltedTranslation):
        PrGreen('\n Completed successfully!')
    else:
        if greenFlag:
            PrYellow("The following files could neither be processed nor copied to target language directories:")
            for notTranslated in haltedTranslation:
                print(' ' + notTranslated)
        PrRed('\n Exiting...')
    time.sleep(1)
    re.purge()
    exit()
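The five hand-unrolled nesting levels above could equally be expressed as a recursive walk of arbitrary depth; a sketch of the file-visiting part using os.walk (it deliberately omits the mkdir mirroring of target paths, and reuses ProcessFiles, FileStats, greenFlag, stats and sourceDir from the original):

import os

for dirpath, dirnames, filenames in os.walk(sourceDir):
    for filename in filenames:
        filepath = os.path.join(dirpath, filename)
        if greenFlag:
            ProcessFiles(filepath)
        else:
            stats.append(FileStats(filepath))
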
Example #35
def basic_operation():
	r"""
	# Special sequence.
	\number \A \b \B \d \D \s \S \w \W \Z

	# Standard escape.
	\a \b \f \n \N \r \t \u \U \v \x \\

	# Flag.
	re.A, re.ASCII
	re.I, re.IGNORECASE
	re.L, re.LOCALE
	re.M, re.MULTILINE
	re.S, re.DOTALL
	re.U, re.UNICODE
	re.X, re.VERBOSE

	re.DEBUG

	re.search(pattern, string, flags=0)
		Scan through string looking for the first location where the regular expression pattern produces a match.
	re.match(pattern, string, flags=0)
		If zero or more characters at the beginning of string match the regular expression pattern.
	re.fullmatch(pattern, string, flags=0)
		If the whole string matches the regular expression pattern.
	re.split(pattern, string, maxsplit=0, flags=0)
	re.findall(pattern, string, flags=0)
	re.finditer(pattern, string, flags=0)
	re.sub(pattern, repl, string, count=0, flags=0)
	re.subn(pattern, repl, string, count=0, flags=0)

	re.escape(pattern)

	re.purge()
	"""

	#--------------------
	# Search.

	# *, +, ?.
	#	The '*', '+', and '?' qualifiers are all greedy; they match as much text as possible.
	#	If the RE <.*> is matched against '<a> b <c>', it will match the entire string, and not just '<a>'.
	# *?, +?, ??.
	#	Adding ? after the qualifier makes it perform the match in non-greedy or minimal fashion; as few characters as possible will be matched.
	#	Using the RE <.*?> will match only '<a>' against '<a> b <c>'.

	re.search(r'''['"].*['"]''', '''ab'cd'ef'gh'ij"kl"mn'op"qr"st'uv"wx'yz'AB"CD''')  # Result: '\'cd\'ef\'gh\'ij"kl"mn\'op"qr"st\'uv"wx\'yz\'AB"'.
	re.search(r'''['"].*?['"]''', '''ab'cd'ef'gh'ij"kl"mn'op"qr"st'uv"wx'yz'AB"CD''')  # Result: "'cd'".

	# (...): Group.
	# (?P<name>...): Named group.
	# (?P=name): Backreference to a named group.

	re.search(r'''(?P<quote>['"]).*(?P=quote)''', '''ab'cd'ef'gh'ij"kl"mn'op"qr"st'uv"wx'yz'AB"CD''')  # Result: '\'cd\'ef\'gh\'ij"kl"mn\'op"qr"st\'uv"wx\'yz\''.
	re.search(r'''(?P<quote>['"]).*?(?P=quote)''', '''ab'cd'ef'gh'ij"kl"mn'op"qr"st'uv"wx'yz'AB"CD''')  # Result: "'cd'".
	re.search(r'''(?P<asterisk>\*).*?(?P=asterisk)|(?P<quote>['"]).*?(?P=quote)''', '''ab'cd'ef'gh'ij"kl"mn*op*qr'st"uv"wx'yz*AB*CD"EF'GH'IJ"KL*MN*OPQRSTUVWXYZ''')  # Result: "'cd'".

	# (?=...): Lookahead assertion.
	# (?!...): Negative lookahead assertion.
	# (?<=...): Positive lookbehind assertion.
	# (?<!...): Negative lookbehind assertion.

	# (?!...): Negative lookahead assertion.
	re.search(r'(?!ABC)\w*', 'Aabcde')  # Matched.
	re.search(r'(?!ABC)\w*', 'Babcde')  # Matched.
	re.search(r'(?!ABC)\w*', 'Cabcde')  # Matched.
	re.search(r'(?!ABC)\w*', 'ABabcde')  # Matched.
	re.search(r'(?!ABC)\w*', 'BCabcde')  # Matched.
	re.search(r'(?!ABC)\w*', 'ABCabcde')  # Matched at index 1 ('BCabcde'); the lookahead only blocks a match starting at index 0.

	#--------------------
	# Match.

	# [^...]: Complementation of a set of characters.
	# The first character.
	re.match(r'[^A]\w*', 'abcde')  # Matched.
	re.match(r'[^A]\w*', 'Babcde')  # Matched.
	re.match(r'[^A]\w*', 'Aabcde')  # Unmatched.
	re.match(r'[^ABC]\w*', 'abcde')  # Matched.
	re.match(r'[^ABC]\w*', 'aAabcde')  # Matched.
	re.match(r'[^ABC]\w*', 'Aabcde')  # Unmatched.
	re.match(r'[^ABC]\w*', 'Babcde')  # Unmatched.
	re.match(r'[^ABC]\w*', 'Cabcde')  # Unmatched.
	# The second character.
	re.match(r'\w[^A]\w*', 'abcde')  # Matched.
	re.match(r'\w[^A]\w*', 'aBabcde')  # Matched.
	re.match(r'\w[^A]\w*', 'aAabcde')  # Unmatched.
	# The first and second characters.
	re.match(r'[^A][^B]\w*', 'abcde')  # Matched.
	re.match(r'[^A][^B]\w*', 'Babcde')  # Matched.
	re.match(r'[^A][^B]\w*', 'Aabcde')  # Unmatched.
	re.match(r'[^A][^B]\w*', 'aBabcde')  # Unmatched.

	#--------------------
	# Split.

	re.split(r'\W+', 'Words, words, words.')
	re.split(r'(\W+)', 'Words, words, words.')
	re.split(r'\W+', 'Words, words, words.', 1)
	re.split('[a-f]+', '0a3B9', flags=re.IGNORECASE)
	re.split(r'(\W+)', '...words, words...')

	#--------------------
	# Find.

	re.findall(r'''['"].*?['"]''', '''ab'cd'ef'gh'ij"kl"mn'op"qr"st'uv"wx'yz'AB"CD''')  # Result: ["'cd'", "'gh'", '"kl"', '\'op"', '"st\'', '"wx\'', '\'AB"'].
	re.findall(r'''(?P<quote>['"]).*?(?P=quote)''', '''ab'cd'ef'gh'ij"kl"mn'op"qr"st'uv"wx'yz'AB"CD''')  # Result: ["'", "'", '"', "'", '"'].

	re.findall(r'''['"].*?['"]|\*.*?\*''', '''ab'cd'ef'gh'ij"kl"mn*op*qr'st"uv"wx'yz*AB*CD"EF'GH'IJ"KL*MN*OPQRSTUVWXYZ''')  # Result: ["'cd'", "'gh'", '"kl"', '*op*', '\'st"', '"wx\'', '*AB*', '"EF\'', '\'IJ"', '*MN*'].
	re.findall(r'''(?P<quote>['"]).*?(?P=quote)|(?P<asterisk>\*).*?(?P=asterisk)''', '''ab'cd'ef'gh'ij"kl"mn*op*qr'st"uv"wx'yz*AB*CD"EF'GH'IJ"KL*MN*OPQRSTUVWXYZ''')  # Result: [("'", ''), ("'", ''), ('"', ''), ('', '*'), ("'", ''), ('', '*'), ('"', ''), ('', '*')].
	re.findall(r'''(?P<asterisk>\*).*?(?P=asterisk)|(?P<quote>['"]).*?(?P=quote)''', '''ab'cd'ef'gh'ij"kl"mn*op*qr'st"uv"wx'yz*AB*CD"EF'GH'IJ"KL*MN*OPQRSTUVWXYZ''')  # Result: [('', "'"), ('', "'"), ('', '"'), ('*', ''), ('', "'"), ('*', ''), ('', '"'), ('*', '')].

	#--------------------
	# Substitute.

	def dash_repl(match):
		if match.group(0) == '-': return ' '  # The entire match.
		else: return '-'
	re.sub('-{1,2}', '-', 'pro----gram-files')  # Result: "pro--gram-files".
	re.sub('-{1,2}', dash_repl, 'pro----gram-files')  # Result: "pro--gram files".
	re.sub(r'\sAND\s', ' & ', 'Baked Beans And Spam', flags=re.IGNORECASE)  # Result: "Baked Beans & Spam".

	re.subn('-{1,2}', dash_repl, 'pro----gram-files')  # Result: "('pro--gram files', 3)".
	re.subn(r'\sAND\s', ' & ', 'Baked Beans And Spam', flags=re.IGNORECASE)  # Result: "('Baked Beans & Spam', 1)".

	#--------------------
	re.escape('http://www.python.org')  # Result: "http://www\\.python\\.org".

	re.purge()  # Clear the regular expression cache.

	#--------------------
	try:
		re.compile('[a-z+')
	except re.error as ex:
		print('re.error: {}.'.format(ex))
Example #36
def check_file(filename):
    """Check input file and verify its content.
	
	Checks that the file begins with HEX, BIN or ASC keyword,
	verifies the claimed content, and splits it into appropriate
	chunks.
	
	Returns integer (0=invalid, 1=HEX, 2=BIN, 3=ASCII) and the
	chunked file content or error message.
	
	Valid example hex file content:
	HEX 35 00 FF A2 81 9B E3
	"""
    file = open(filename, 'r')
    file_content = file.read()
    if len(file_content) < 3 or file_content.isspace():
        file.close()
        return (0, 'File content must begin with a keyword (HEX, BIN or ASC)!')
    # First 3 characters should represent the base of the content.
    base = file_content[0:3]
    file_content = file_content[3:]
    forbidden_chars = {'BIN': [None], 'HEX': [None]}

    # Content is claimed to be hexadecimal:
    if base == 'HEX':
        file_content = ''.join(file_content.split())
        file_content = file_content.upper()
        if len(file_content) < 2:
            file.close()
            return (
                0,
                'File must contain at least 1 byte of data after the keyword!')
        mod = len(file_content) % 2
        if mod != 0:
            return (
                0,
                'File must contain full bytes of data (2 hex digits = 1 byte)!'
            )
        # Use regular expression for verifying the content.
        if re.match('[0-9A-F]+$', file_content):
            content = ''
            for start in range(0, len(file_content), 2):
                if start + 2 <= len(file_content):
                    content += file_content[start:start + 2] + ' '
                else:
                    content += file_content[start:]  # add the remainings

            content = content.rstrip()  # remove possible whitespace at the end
            # Check that the file doesn't contain any forbidden control characters
            for val in content.split():
                if val in forbidden_chars['HEX']:
                    file.close()
                    return (
                        0,
                        'File must not contain other control characters than TAB, LF or CR!'
                    )
            # Return type indicator and the chopped content.
            file.close()
            return (1, content)
        else:
            file.close()
            return (0, 'File content was invalid hexadecimal data!')

    # Content is claimed to be binary:
    elif base == 'BIN':
        file_content = ''.join(file_content.split())
        if len(file_content) < 8:
            file.close()
            return (
                0,
                'File must contain at least 1 byte of data after the keyword!')
        mod = len(file_content) % 8
        if mod != 0:
            return (0,
                    'File must contain full bytes of data (8 bits = 1 byte)!')

        # Use regular expression for verifying the content.
        re.purge()  # clear regex cache
        if re.match('[0-1]+$', file_content):
            content = ''
            for start in range(0, len(file_content), 8):
                if start + 8 <= len(file_content):
                    content += file_content[start:start + 8] + ' '
                else:
                    content += file_content[start:]  # add the remainings

            content = content.rstrip()  # remove possible whitespace at the end
            # Check that the file doesn't contain any forbidden control characters
            for val in content.split():
                if val in forbidden_chars['BIN']:
                    file.close()
                    return (
                        0,
                        'File must not contain other control characters than TAB, LF or CR!'
                    )
            # Return type indicator and the chopped content.
            file.close()
            return (2, content)
        else:
            file.close()
            return (0, 'File content was invalid binary data!')

    # Content is claimed to be ASCII:
    elif base == 'ASC':
        escape_chars = ['\a', '\b', '\f', '\n', '\r', '\t', '\v']
        escape_letters = ['a', 'b', 'f', 'n', 'r', 't', 'v']
        # Use regular expression for verifying the content.
        re.purge()  # clear regex cache
        if re.match('[\x00-\x7F]+$', file_content):  # [\x20-\x7E]
            # Check that the file doesn't contain any forbidden control characters
            for c in file_content:
                if binascii.hexlify(c).upper() in forbidden_chars['HEX']:
                    file.close()
                    return (0, 'File contains illegal control characters!')
            for c in escape_chars:
                if file_content.count(c) != 0:
                    file_content = file_content.replace(c, '')
            # Replace all "\\n", "\\r" etc. with "\n", "\r" etc. (i.e. remove
            # the extra backslash) so that the control characters are interpreted
            # correctly into hex values.
            for c in range(0, len(file_content)):
                if file_content[c:c + 1] == '\\':
                    if file_content[c + 1:c + 2] in escape_letters:
                        for e in escape_letters:
                            if file_content[c + 1:c + 2] == e:
                                file_content = file_content[:c] + escape_chars[
                                    escape_letters.index(e)] + file_content[c +
                                                                            2:]
                                break
                    else:
                        return (
                            0,
                            'File contains illegal control characters!\n\n' +
                            'Legal characters after a backslash are: a, b, f, n, r, t, and v.'
                        )

            # Return type indicator and the file content.
            file.close()
            return (3, file_content)
        else:
            file.close()
            return (0, 'File content was invalid ASCII data!')

    # Content is invalid:
    else:
        file.close()
        return (0, 'File content must begin with a keyword (HEX, BIN or ASC)!')
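A quick way one might exercise the checker above (the file name is arbitrary):

with open('demo.txt', 'w') as f:
    f.write('HEX 35 00 FF A2')

print(check_file('demo.txt'))  # -> (1, '35 00 FF A2')
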
Example #37
import gc

print "Is Enabled? %s"%gc.isenabled()
print gc.set_debug(gc.DEBUG_STATS)
#print gc.set_debug(gc.DEBUG_UNCOLLECTABLE)
#print gc.set_debug(gc.DEBUG_COLLECTABLE)
print gc.get_debug()
print "Objects: %s"%gc.garbage
print "Collect: %s"%gc.collect()

import re
print "re: %s"%re.purge()
Example #38
def replace_numbers(s, replace_by):
    re.purge()
    temp = re.compile(ur'([0-9]+(st|th|rd|nd|,[0-9]+|.[0-9]+)?)', re.UNICODE)
    s = temp.sub(replace_by, s)
    return s
Example #39
def findAllMatches(string, pattern, flag=re.MULTILINE | re.DOTALL):
    regex = re.compile(pattern, flag)
    matches = regex.findall(string)
    re.purge()
    return matches
Example #40
    def test_sanity_re(self):
        '''
        Basic sanity tests for the re module.  Each module member is
        used at least once.
        '''
        #compile
        self.assertTrue(hasattr(re.compile("(abc){1}"), "pattern"))
        self.assertTrue(hasattr(re.compile("(abc){1}", re.L), "pattern"))
        self.assertTrue(hasattr(re.compile("(abc){1}", flags=re.L), "pattern"))

        #I IGNORECASE L LOCALE M MULTILINE S DOTALL U UNICODE X VERBOSE
        flags = [
            "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "S", "DOTALL",
            "U", "UNICODE", "X", "VERBOSE"
        ]

        for f in flags:
            self.assertTrue(hasattr(re, f))

        #search
        self.assertEqual(re.search("(abc){1}", ""), None)
        self.assertEqual(re.search("(abc){1}", "abcxyz").span(), (0, 3))
        self.assertEqual(re.search("(abc){1}", "abcxyz", re.L).span(), (0, 3))
        self.assertEqual(
            re.search("(abc){1}", "abcxyz", flags=re.L).span(), (0, 3))
        self.assertEqual(re.search("(abc){1}", "xyzabc").span(), (3, 6))

        self.assertEqual(re.search("(abc){1}", buffer("")), None)
        self.assertEqual(
            re.search("(abc){1}", buffer("abcxyz")).span(), (0, 3))
        self.assertEqual(
            re.search("(abc){1}", buffer("abcxyz"), re.L).span(), (0, 3))
        self.assertEqual(
            re.search("(abc){1}", buffer("abcxyz"), flags=re.L).span(), (0, 3))
        self.assertEqual(
            re.search("(abc){1}", buffer("xyzabc")).span(), (3, 6))

        #match
        self.assertEqual(re.match("(abc){1}", ""), None)
        self.assertEqual(re.match("(abc){1}", "abcxyz").span(), (0, 3))
        self.assertEqual(re.match("(abc){1}", "abcxyz", re.L).span(), (0, 3))
        self.assertEqual(
            re.match("(abc){1}", "abcxyz", flags=re.L).span(), (0, 3))

        #split
        self.assertEqual(re.split("(abc){1}", ""), [''])
        self.assertEqual(re.split("(abc){1}", "abcxyz"), ['', 'abc', 'xyz'])
        #maxsplit
        self.assertEqual(re.split("(abc){1}", "abc", 0), ['', 'abc', ''])
        for i in xrange(3):
            self.assertEqual(re.split("(abc){1}", "abc", maxsplit=i),
                             ['', 'abc', ''])
            self.assertEqual(re.split("(abc){1}", "", maxsplit=i), [''])
            self.assertEqual(re.split("(abc){1}", "abcxyz", maxsplit=i),
                             ['', 'abc', 'xyz'])
        self.assertEqual(re.split("(abc){1}", "abcxyzabc", maxsplit=0),
                         ['', 'abc', 'xyz', 'abc', ''])
        self.assertEqual(re.split("(abc){1}", "abcxyzabc", maxsplit=1),
                         ['', 'abc', 'xyzabc'])
        self.assertEqual(re.split("(abc){1}", "abcxyzabc", maxsplit=2),
                         ['', 'abc', 'xyz', 'abc', ''])

        #findall
        self.assertEqual(re.findall("(abc){1}", ""), [])
        self.assertEqual(re.findall("(abc){1}", "abcxyz"), ['abc'])
        self.assertEqual(re.findall("(abc){1}", "abcxyz", re.L), ['abc'])
        self.assertEqual(re.findall("(abc){1}", "abcxyz", flags=re.L), ['abc'])
        self.assertEqual(re.findall("(abc){1}", "xyzabcabc"), ['abc', 'abc'])

        #finditer
        self.assertEqual([x.group() for x in re.finditer("(abc){1}", "")], [])
        self.assertEqual(
            [x.group() for x in re.finditer("(abc){1}", "abcxyz")], ['abc'])
        self.assertEqual(
            [x.group() for x in re.finditer("(abc){1}", "abcxyz", re.L)],
            ['abc'])
        self.assertEqual(
            [x.group() for x in re.finditer("(abc){1}", "abcxyz", flags=re.L)],
            ['abc'])
        self.assertEqual(
            [x.group() for x in re.finditer("(abc){1}", "xyzabcabc")],
            ['abc', 'abc'])
        rex = re.compile("foo")
        for m in rex.finditer("this is a foo and a foo bar"):
            self.assertEqual((m.pos, m.endpos), (0, 27))
        for m in rex.finditer(""):
            self.assertEqual((m.pos, m.endpos), (0, 1))
        for m in rex.finditer("abc"):
            self.assertEqual((m.pos, m.endpos), (0, 4))
        for m in rex.finditer("foo foo foo foo foo"):
            self.assertEqual((m.pos, m.endpos), (0, 19))

        #sub
        self.assertEqual(re.sub("(abc){1}", "9", "abcd"), "9d")
        self.assertEqual(re.sub("(abc){1}", "abcxyz", 'abcd'), "abcxyzd")
        self.assertEqual(re.sub("(abc){1}", "1", "abcd", 0), "1d")
        self.assertEqual(re.sub("(abc){1}", "1", "abcd", count=0), "1d")
        self.assertEqual(re.sub("(abc){1}", "1", "abcdabcd", 1), "1dabcd")
        self.assertEqual(re.sub("(abc){1}", "1", "abcdabcd", 2), "1d1d")
        self.assertEqual(re.sub("(abc){1}", "1", "ABCdabcd", 2, flags=re.I),
                         "1d1d")

        #subn
        self.assertEqual(re.subn("(abc){1}", "9", "abcd"), ("9d", 1))
        self.assertEqual(re.subn("(abc){1}", "abcxyz", 'abcd'), ("abcxyzd", 1))
        self.assertEqual(re.subn("(abc){1}", "1", "abcd", 0), ("1d", 1))
        self.assertEqual(re.subn("(abc){1}", "1", "abcd", count=0), ("1d", 1))
        self.assertEqual(re.subn("(abc){1}", "1", "abcdabcd", 1),
                         ("1dabcd", 1))
        self.assertEqual(re.subn("(abc){1}", "1", "abcdabcd", 2), ("1d1d", 2))
        self.assertEqual(re.subn("(abc){1}", "1", "ABCdabcd", 2, flags=re.I),
                         ("1d1d", 2))

        #escape
        self.assertEqual(re.escape("abc"), "abc")
        self.assertEqual(re.escape(""), "")
        self.assertEqual(re.escape("_"), "\\_")
        self.assertEqual(re.escape("a_c"), "a\\_c")

        #error
        exc = re.error()
        exc = re.error("some args")

        #purge
        re.purge()
Example #41
def describe_regex(regex):
    re.purge()
    re.compile(regex, re.DEBUG)
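Compiling with re.DEBUG dumps the parse tree to stdout as a side effect of compilation; since the cache key includes the flags, a second identical call would be served from the cache and print nothing, which is why the purge comes first. For example:

describe_regex(r'(ab|cd)+')   # prints the BRANCH/LITERAL parse tree
describe_regex(r'(ab|cd)+')   # still prints, thanks to the purge
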
Example #42
def clear_caches():
    # Clear the warnings registry, so they can be displayed again
    for mod in sys.modules.values():
        if hasattr(mod, '__warningregistry__'):
            del mod.__warningregistry__

    # Flush standard output, so that buffered data is sent to the OS and
    # associated Python objects are reclaimed.
    for stream in (sys.stdout, sys.stderr, sys.__stdout__, sys.__stderr__):
        if stream is not None:
            stream.flush()

    # Clear assorted module caches.
    # Don't worry about resetting the cache if the module is not loaded
    try:
        distutils_dir_util = sys.modules['distutils.dir_util']
    except KeyError:
        pass
    else:
        distutils_dir_util._path_created.clear()
    re.purge()

    try:
        _strptime = sys.modules['_strptime']
    except KeyError:
        pass
    else:
        _strptime._regex_cache.clear()

    try:
        urllib_parse = sys.modules['urllib.parse']
    except KeyError:
        pass
    else:
        urllib_parse.clear_cache()

    try:
        urllib_request = sys.modules['urllib.request']
    except KeyError:
        pass
    else:
        urllib_request.urlcleanup()

    try:
        linecache = sys.modules['linecache']
    except KeyError:
        pass
    else:
        linecache.clearcache()

    try:
        mimetypes = sys.modules['mimetypes']
    except KeyError:
        pass
    else:
        mimetypes._default_mime_types()

    try:
        filecmp = sys.modules['filecmp']
    except KeyError:
        pass
    else:
        filecmp._cache.clear()

    try:
        struct = sys.modules['struct']
    except KeyError:
        pass
    else:
        # TODO: fix
        # struct._clearcache()
        pass

    try:
        doctest = sys.modules['doctest']
    except KeyError:
        pass
    else:
        doctest.master = None

    try:
        ctypes = sys.modules['ctypes']
    except KeyError:
        pass
    else:
        ctypes._reset_cache()

    try:
        typing = sys.modules['typing']
    except KeyError:
        pass
    else:
        for f in typing._cleanups:
            f()

    support.gc_collect()
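
# Note: a helper like this is used by CPython's test runner between tests, so
# cached state (compiled regexes, parsed URLs, line caches) from one test cannot
# skew timings or leak detection in the next.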
Example #43
	def execute(self, message):
		"""
		:type message: IrcMessage
		"""
		#Immediately check if there's any parameters, to prevent useless work
		if message.messagePartsLength == 0:
			message.reply("Please provide a term to search for. See '{}help {}' for an explanation how to use this command".format(message.bot.commandPrefix, message.trigger), "say")
			return

		searchType = message.messageParts[0].lower()

		addExtendedInfo = message.trigger == 'netrunner'

		#Check for the update command before checking file existence, to prevent a 'card file is missing' message right after an update, which wouldn't make much sense
		if searchType == 'update' or searchType == 'forceupdate':
			if self.areCardfilesBeingUpdated:
				replytext = "I'm already updating!"
			elif not message.bot.isUserAdmin(message.user, message.userNickname, message.userAddress):
				replytext = "Sorry, only admins can use my update function"
			elif searchType != 'forceupdate' and not self.shouldUpdate()[0]:
				replytext = "The last update check was done pretty recently, there's no need to check again so soon"
			else:
				replytext = self.updateCardFile()[1]
				#Since we're checking now, set the automatic check to start counting from now on
				self.resetScheduledFunctionGreenlet()
			message.reply(replytext, "say")
			return

		#Check if the data file even exists
		elif not os.path.exists(os.path.join(GlobalStore.scriptfolder, 'data', 'NetrunnerCards.json')):
			if self.areCardfilesBeingUpdated:
				message.reply("I don't have my card database, but I'm solving that problem as we speak! Try again in, oh,  10, 15 seconds")
			else:
				message.reply("Sorry, I don't appear to have my card database. I'll try to retrieve it though! Give me 20 seconds, tops")
				gevent.spawn(self.updateCardFile)
				self.resetScheduledFunctionGreenlet()
			return

		#If we reached here, we're gonna search through the card store
		searchDict = {}
		# If there is an actual search (with a colon key-value separator), or a random card is requested with specific search requirements
		if (searchType == 'search' and ':' in message.message) or (searchType == 'random' and message.messagePartsLength > 1):
			#Advanced search!
			if message.messagePartsLength <= 1:
				message.reply("Please provide an advanced search query too, in JSON format, so 'key1: value1, key2: value2'")
				return

			#Turn the search string (excluding the search-type argument) into a usable dictionary, case-insensitively
			searchDict = SharedFunctions.stringToDict(" ".join(message.messageParts[1:]).lower(), True)
			if len(searchDict) == 0:
				message.reply("That is not a valid search query. It should be entered like JSON, so 'name: Wall of Thorns, type: ICE,...'. ")
				return
		#If the searchtype is just 'random', don't set a 'name' field so we don't go through all the cards first
		#  Otherwise, set the whole message as the 'name' search, since that's the default search
		elif not searchType.startswith('random'):
			searchDict['title'] = message.message.lower()

		#Correct some values, to make searching easier (so a search for 'set' or 'sets' both work)
		searchTermsToCorrect = {'setname': ['set', 'sets'], 'flavor': ['flavour'], 'title': ['name']}
		for correctTerm, listOfWrongterms in searchTermsToCorrect.iteritems():
			for wrongTerm in listOfWrongterms:
				if wrongTerm in searchDict:
					if correctTerm not in searchDict:
						searchDict[correctTerm] = searchDict[wrongTerm]
					searchDict.pop(wrongTerm)

		#Turn the search strings into actual regexes
		regexDict = {}
		errors = []
		for attrib, query in searchDict.iteritems():
			try:
				#Since the query is a string, and the card data is unicode, convert the query to unicode before turning it into a regex
				regex = re.compile(unicode(query, encoding='utf8'), re.IGNORECASE)
			except (re.error, SyntaxError) as e:
				self.logError("[Netrunner] Regex error when trying to parse '{}': {}".format(query, e))
				errors.append(attrib)
			except UnicodeDecodeError as e:
				self.logError("[Netrunner] Unicode error in key '{}': {}".format(attrib, e))
				errors.append(attrib)
			else:
				regexDict[attrib] = regex
		#If there were errors parsing the regular expressions, don't continue, to prevent errors further down
		if len(errors) > 0:
			#If there was only one search element to begin with, there's no need to specify
			if len(searchDict) == 1:
				message.reply("An error occurred when trying to parse your search query. Please check if it is a valid regular expression, and that there are no non-UTF8 characters")
			#If there were more elements but only one error, specify
			elif len(errors) == 1:
				message.reply("An error occurred while trying to parse the query for the '{}' field. Please check if it is a valid regular expression without non-UTF8 characters".format(errors[0]))
			#Multiple errors, list them all
			else:
				message.reply("Errors occurred while parsing attributes: {}. Please check your search query for errors".format(", ".join(errors)))
			return

		#All entered data is valid, look through the stored cards
		with open(os.path.join(GlobalStore.scriptfolder, 'data', 'NetrunnerCards.json'), 'r') as jsonfile:
			cardstore = json.load(jsonfile)

		for index in xrange(0, len(cardstore)):
			carddata = cardstore.pop(0)

			#Then check if the rest of the attributes match
			for attrib in regexDict:
				if attrib not in carddata or not regexDict[attrib].search(carddata[attrib]):
					#If the wanted attribute is either not in the card, or it doesn't match, throw it out
					break
			#The else-block of a for-loop is executed when a for-loop isn't broken out of. So if everything matches, we get here
			else:
				cardstore.append(carddata)

		numberOfCardsFound = len(cardstore)
		#Pick a random card if needed and possible
		if searchType.startswith('random') and numberOfCardsFound > 0:
			cardstore = [random.choice(cardstore)]
			numberOfCardsFound = 1

		if numberOfCardsFound == 0:
			replytext = "Sorry, no card matching your query was found"
		elif numberOfCardsFound == 1:
			replytext = self.getFormattedCardInfo(cardstore[0], addExtendedInfo)
		else:
			nameMatchedCardFound = False
			replytext = ""
			#If there was a name search, check if the literal name is in the resulting cards
			if 'title' in searchDict:
				titleMatchIndex = None
				for index, card in enumerate(cardstore):
					if card['title'].lower() == searchDict['title']:
						titleMatchIndex = index
						break

				if titleMatchIndex is not None:
					replytext = self.getFormattedCardInfo(cardstore[titleMatchIndex], addExtendedInfo)
					cardstore.pop(titleMatchIndex)
					numberOfCardsFound -= 1
					nameMatchedCardFound = True

			#Pick some cards to show
			maxCardsToList = 15
			if numberOfCardsFound > maxCardsToList:
				cardstore = random.sample(cardstore, maxCardsToList)
			cardnameText = ""
			for card in cardstore:
				cardnameText += card['title'].encode('utf-8') + "; "
			cardnameText = cardnameText[:-2]

			if nameMatchedCardFound:
				replytext += " ({:,} more match{} found: ".format(numberOfCardsFound, 'es' if numberOfCardsFound > 1 else '')
			else:
				replytext += "Your search returned {:,} cards: ".format(numberOfCardsFound)
			replytext += cardnameText
			if numberOfCardsFound > maxCardsToList:
				replytext += " and {:,} more".format(numberOfCardsFound - maxCardsToList)
			#Since the extra results list is bracketed when a literal match was also found, it needs a closing bracket
			if nameMatchedCardFound:
				replytext += ")"


		re.purge()  #Clear the stored regexes, since we don't need them anymore
		message.reply(replytext)
Example #44
def export_to_git(revisions,
                  done_count,
                  devpath=False,
                  ancestor=False,
                  ancestorDate=None):
    if len(revisions) == 0: return done_count

    abs_sandbox_path = os.getcwd()
    abs_sandbox_path = abs_sandbox_path.replace("\\", "/")
    integrity_file = os.path.basename(project)
    git_folder_re = re.compile(
        r"\.git(\\|$)"
    )  # any path named .git, with or without child elements, but not .gitignore
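    # For illustration (hypothetical paths): this pattern matches '.git' and
    # '.git\hooks\pre-commit', but not '.gitignore'.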

    if "ancestorDate" in revisions[0]:
        ancestor = revisions[0]["ancestor"]
        ancestorDate = revisions[0]["ancestorDate"]

    for revision in revisions:
        print("%d of %d (%0.2f%%)" % (done_count + 1, total_revision_count,
                                      done_count / total_revision_count * 100),
              file=sys.stderr)
        done_count += 1

        mark = marks[revision["number"]]
        si('si retargetsandbox %s --quiet --project="%s" --projectRevision=%s "%s/%s"'
           % (additional_si_args, project, revision["number"],
              abs_sandbox_path, integrity_file))
        si('si resync --yes --recurse %s --quiet --sandbox="%s/%s"' %
           (additional_si_args, abs_sandbox_path, integrity_file))
        if devpath:
            print_out('commit refs/heads/devpath/%s' % devpath)
        else:
            print_out('commit refs/heads/main')
        print_out('mark %s' % mark)
        print_out('committer %s <> %d +0000' %
                  (revision["author"], revision["seconds"]))
        export_string(revision["description"])
        if ancestor:
            print_out(
                'from %s' % marks[ancestor]
            )  # we're starting a development path, so branch from the revision it was originally branched off of
            ancestor = False  # set to False so this 'from' is only emitted once
        print_out('deleteall')
        tree = os.walk('.')
        for dir in tree:
            for filename in dir[2]:
                if (dir[0] == '.'):
                    fullfile = filename
                else:
                    fullfile = os.path.join(dir[0], filename)[2:]
                if (fullfile.find('.pj') != -1):
                    continue
                #if (fullfile[0:4] == ".git"):
                if git_folder_re.search(fullfile):
                    continue
                if (fullfile.find('mks_checkpoints_to_git') != -1):
                    continue
                inline_data(fullfile)

        for tag in revision["tags"]:
            print_out('tag %s' % tag.replace(" ", "_"))
            print_out('from %s' % mark)
            print_out('tagger %s <> %d +0000' %
                      (revision["author"], revision["seconds"]))
            export_string("")  # Tag message

        re.purge()
    print_out('checkpoint')
    return done_count
Example #45
    tr = soup.find(attrs={'id': 'places_area__row'})
    td = tr.find(attrs={'class': 'w2p_fw'})
    print td.text

    broken_html = "<ul class = country><li>Area<li>Population</ul>"
    tree = lxml.html.fromstring(broken_html)
    fixed_html = lxml.html.tostring(tree, pretty_print=True)
    print "new html:\n", fixed_html

    tree2 = lxml.html.fromstring(html)
    td = tree2.cssselect("tr#places_area__row > td.w2p_fw")[0]
    print td.text_content()

    for name, scraper in [('Regular expressions', re_scraper),
                          ('BeautifulSoup', bs_scraper),
                          ('lxml', lxml_scraper)]:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            if scraper == re_scraper:
                re.purge()
            result = scraper(html)
            assert (result['area'] == '1580 square kilometres')
        end = time.time()
        print '%s:%.2f seconds' % (name, end - start_time)
#2018.08.05 test
'''
1580 square kilometres
Regular expressions:15.61 seconds
BeautifulSoup:77.98 seconds
lxml:3.76 seconds
'''
Example #46
    def parseExpr(self, expr: str):
        re.purge()
        return re.findall(r"[^[]*\[([^]]*)\]", expr)
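    # For illustration (hypothetical call): parseExpr("a[1] + b[22]") returns
    # ['1', '22'] -- the text between each pair of square brackets.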
Example #47
    def validate(self:object, customValidator:str=None):
        """
        Validate a resultset against predefined metadata based on the LANG rules of data quality.
        """
        if (self.metadata is None):
            raise ValidationError("LANG Exception: meta-data has not been set", None)
        elif (self.dataset is None):
            raise ValidationError("LANG Exception: resultset has not been set", None)

        """
        Change request: find and output the primary key in the error report file if specified
        """
        primary_key = ""
        primary_key_values = None
        
        for key, item in self.metadata.items():                
            if (MetaUtils.isTrue(item, "PrimaryKey")):
                primary_key = key
                primary_key_values = self.dataset[primary_key]
                break
                
        """
        Execute a series of validations against the supplied column of data and the metadata for the column.
        Which validation is run is determined by entries in the metadata.
        """         
        for meta_attribute_key, meta_attribute_definition in self.metadata.items():                
            if (meta_attribute_key in self.dataset):
                print("Validating attribute \t'" + meta_attribute_key + "'...", end='\r')
                                
                attribute = self.dataset[meta_attribute_key]
                                
                for row_count in range(len(attribute)):
                    value = attribute[row_count]
                    
                    """ 
                    If a primarykey tag has been found then output the value so that the user 
                     has a reference to search for the record in the source system. 
                     If there is no primary key attribute set then output the row count 
                    """
                    
                    if (not primary_key_values is None):
                        primary_key_value = primary_key_values[row_count]
                    else:
                        primary_key_value = "Row: " + str(row_count+1)
                    
                    self.checkMandatory(meta_attribute_definition, meta_attribute_key, value, primary_key_value)                  
                    self.checkSize(meta_attribute_definition, meta_attribute_key, value, primary_key_value)
                    self.checkType(meta_attribute_definition, meta_attribute_key, value, primary_key_value)
                    self.checkEnum(meta_attribute_definition, meta_attribute_key, value, primary_key_value)
                    self.checkStartsWith(meta_attribute_definition, meta_attribute_key, value, primary_key_value)

                
                # format check (must provide a regex)
                if (MetaUtils.exists(meta_attribute_definition, "Format")):
                    re.purge()
                    regex=re.compile(meta_attribute_definition["Format"])
                    
                    for row_count in range(len(attribute)):
                        # Use the same primary-key fallback as above, so a missing primary key doesn't raise here
                        if (not primary_key_values is None):
                            primary_key_value = primary_key_values[row_count]
                        else:
                            primary_key_value = "Row: " + str(row_count+1)
                        value = attribute[row_count]
                        
                        isMatch = (regex.match(value) is not None)
                        
                        if ( (not isMatch) and (not MetaUtils.isAllowBlank(meta_attribute_definition)) ):
                            self.addDataQualityError(DataQualityError(meta_attribute_key,error_dimension=DataQualityDimension.FORMATCONSISTENCY.value, description="Error: Value '" + value + "' does not match regex #'" + meta_attribute_definition["Format"] + "'"))

                   
                # unique field check        
                if (MetaUtils.isTrue(meta_attribute_definition, "Unique") ):
                    # Count each value's occurrences in the column; report an error for every repeated occurrence
                    seen = set()          

                    for row_count in range(len(attribute)):
                        if (not primary_key_values is None):
                            primary_key_value = primary_key_values[row_count]
                        else:
                            primary_key_value = "Row: " + str(row_count+1)
                        value = attribute[row_count]

                        if (not value in seen):
                            seen.add(value) #only process a value once 
                        else:    
                            self.addDataQualityError(DataQualityError(meta_attribute_key,error_dimension=DataQualityDimension.UNIQUENESS.value, description="Error: Value '" + value + "' is not UNIQUE. A unique value was expected."))
                            
                self.checkComposite(meta_attribute_definition, meta_attribute_key)
                
                # expression evaluation is different to processing field specific validations as it could link in other columns from the resultset
                self.evaluateExpression(meta_attribute_definition, meta_attribute_key)

                print("Validating attribute \t'" + meta_attribute_key + "'...\t\t..Complete.")
            else:
                self.addDataQualityError(DataQualityError(meta_attribute_key, error_dimension=DataQualityDimension.METADATACOMPLIANCE.value, description="Error: Attribute '" + meta_attribute_key + "' was not found in the dataset."))
        
        # only invoke the custom validator if one has been provided
        if (not customValidator is None and len(customValidator) > 0):
            self.customValidator(customValidator)
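
    # For illustration (hypothetical shapes; the surrounding classes aren't shown here):
    # self.metadata maps column names to rule dicts, e.g.
    #     {'id': {'PrimaryKey': 'true', 'Format': '^[0-9]{2}IT[0-9]{3}$', 'Unique': 'true'}}
    # and self.dataset maps column names to lists of row values to validate.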
Example #48
def remove_returns(s, replace_by):
    '''Replace each run of whitespace (spaces, tabs, and EOL characters) in s with replace_by.'''
    re.purge()
    temp = re.compile(r"\s+")
    s = temp.sub(replace_by, s)
    return s
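# For illustration: remove_returns("a\nb\tc", " ") returns "a b c".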
Example #49
    def __setUpGrammars(self, defaultGrammars):
        self.grammars = {}
        # Arrange all the grammars by name.
        for k, v in defaultGrammars.items():
            v['name'] = k
            self.grammars[k] = v

        # Compile regexes for each grammar.
        for k, v in defaultGrammars.items():
            if 0:
                # keywords re.
                v['keywordsRe'] = re.compile(
                    app.regex.joinReWordList(
                        v.get('keywords', []) + v.get('types', [])))
                v['errorsRe'] = re.compile(
                    app.regex.joinReList(v.get('errors', [])))
                v['specialsRe'] = re.compile(
                    app.regex.joinReList(v.get('special', [])))
            # contains and end re.
            matchGrammars = []
            markers = []
            # Index [0]
            if v.get('escaped'):
                markers.append(v['escaped'])
                matchGrammars.append(v)
            else:
                # Add a non-matchable placeholder.
                markers.append(app.regex.kNonMatchingRegex)
                matchGrammars.append(None)
            # Index [1]
            if v.get('end'):
                markers.append(v['end'])
                matchGrammars.append(v)
            else:
                # Add a non-matchable placeholder.
                markers.append(app.regex.kNonMatchingRegex)
                matchGrammars.append(None)
            # |Contains| markers start at index 2.
            for grammarName in v.get('contains', []):
                g = self.grammars.get(grammarName, None)
                if g is None:
                    self._raiseGrammarNotFound()
                markers.append(g.get('begin', g.get('matches', u"")))
                matchGrammars.append(g)
            # |Next| markers start after |contains|.
            for grammarName in v.get('next', []):
                g = self.grammars.get(grammarName, None)
                if g is None:
                    self._raiseGrammarNotFound()
                markers.append(g['begin'])
                matchGrammars.append(g)
            # |Errors| markers start after |next| markers.
            markers += v.get('errors', [])
            # |Keywords| markers start after |errors| markers.
            for keyword in v.get('keywords', []):
                markers.append(r'\b' + keyword + r'\b')
            # |Types| markers start after |keywords| markers.
            for types in v.get('types', []):
                markers.append(r'\b' + types + r'\b')
            # |Special| markers start after |types| markers.
            markers += v.get('special', [])
            # Variable width characters are at index [-3] in markers.
            markers.append(r'\t+')
            # Double wide characters are at index [-2] in markers.
            markers.append(u'[\u3000-\uffff]+')
            # Carriage return characters are at index [-1] in markers.
            markers.append(r'\n')
            #app.log.startup('markers', v['name'], markers)
            v['matchRe'] = re.compile(app.regex.joinReList(markers))
            v['markers'] = markers
            v['matchGrammars'] = matchGrammars
            containsGrammarIndexLimit = 2 + len(v.get('contains', []))
            nextGrammarIndexLimit = containsGrammarIndexLimit + len(
                v.get('next', []))
            errorIndexLimit = nextGrammarIndexLimit + len(v.get('errors', []))
            keywordIndexLimit = errorIndexLimit + len(v.get('keywords', []))
            typeIndexLimit = keywordIndexLimit + len(v.get('types', []))
            specialIndexLimit = typeIndexLimit + len(v.get('special', []))
            v['indexLimits'] = (containsGrammarIndexLimit,
                                nextGrammarIndexLimit, errorIndexLimit,
                                keywordIndexLimit, typeIndexLimit,
                                specialIndexLimit)

        # Reset the re.cache for user regexes.
        re.purge()
Example #50
'''
The expression’s behaviour can be modified by specifying a flags value.
Flag values can be any of the re flag variables, combined using bitwise OR (the | operator).

Note: Using re.compile() and saving the resulting regular expression object for
reuse is more efficient when the expression will be used several times in a single program.
'''
string1 = "18IT033"
string2 = "My id is 18CE033"
pattern1 = "^[0-9]{2}(IT)[0-9]{3}"
patt1 = re.compile(pattern1)
result1 = patt1.match(string1)
print(result1)
result1 = re.match(pattern1, string1)
print(result1)
result2 = patt1.match(string2)
print(result2)
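
# A small added illustration of combining flags with bitwise OR (hypothetical data):
# IGNORECASE and MULTILINE together let '^' match at each line start, case-insensitively.
pattern2 = re.compile(r"^[0-9]{2}it[0-9]{3}", re.IGNORECASE | re.MULTILINE)
print(pattern2.findall("18IT033\n19it101"))  # ['18IT033', '19it101']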
'''
re.purge()
  Clear the regular expression cache.
'''

re.purge()
'''
re.escape(pattern)
Escape special characters in pattern. 
This is useful if you want to match an arbitrary literal string 
that may have regular expression metacharacters in it.
'''
print(re.escape("h.(h)"))
print(re.escape("n&n"))
print(re.escape("n*{n}"))
Example #51
def purge():
    re.purge()
Example #52
def clear_caches():
    import gc

    # Clear the warnings registry, so they can be displayed again
    for mod in sys.modules.values():
        if hasattr(mod, '__warningregistry__'):
            del mod.__warningregistry__

    # Clear assorted module caches.
    # Don't worry about resetting the cache if the module is not loaded
    try:
        distutils_dir_util = sys.modules['distutils.dir_util']
    except KeyError:
        pass
    else:
        distutils_dir_util._path_created.clear()

    re.purge()

    try:
        _strptime = sys.modules['_strptime']
    except KeyError:
        pass
    else:
        _strptime._regex_cache.clear()

    try:
        urlparse = sys.modules['urlparse']
    except KeyError:
        pass
    else:
        urlparse.clear_cache()

    try:
        urllib = sys.modules['urllib']
    except KeyError:
        pass
    else:
        urllib.urlcleanup()

    try:
        urllib2 = sys.modules['urllib2']
    except KeyError:
        pass
    else:
        urllib2.install_opener(None)

    try:
        dircache = sys.modules['dircache']
    except KeyError:
        pass
    else:
        dircache.reset()

    try:
        linecache = sys.modules['linecache']
    except KeyError:
        pass
    else:
        linecache.clearcache()

    try:
        mimetypes = sys.modules['mimetypes']
    except KeyError:
        pass
    else:
        mimetypes._default_mime_types()

    try:
        filecmp = sys.modules['filecmp']
    except KeyError:
        pass
    else:
        filecmp._cache.clear()

    try:
        struct = sys.modules['struct']
    except KeyError:
        pass
    else:
        struct._clearcache()

    try:
        doctest = sys.modules['doctest']
    except KeyError:
        pass
    else:
        doctest.master = None

    try:
        ctypes = sys.modules['ctypes']
    except KeyError:
        pass
    else:
        ctypes._reset_cache()

    # Collect cyclic trash.
    support.gc_collect()
Example #53
def WriteCert(ProgPath,
              InputName,
              OutputName,
              IsLabelInOutput=True,
              DoEncode=False):
    TempStr = ''
    RawLine = ''
    EncodeLine = ''
    CleanLine = ''
    I = 0
    VarLen = 0
    ListObj = None
    ReObj = None

    FileLineNo = 0
    ErrorNumber = 0
    CertBegin = False
    DataBegin = False
    DataEnd = False
    TrustBegin = False
    LabelPrinted = False

    CertReObj = re.compile('CKA_CLASS CK_OBJECT_CLASS CKO_CERTIFICATE',
                           re.IGNORECASE)
    if not CertReObj:
        print(ErrorMainList[0])
        return 254
    LabelStr = ''
    LabelReObj = re.compile(r'CKA_LABEL UTF8 \"([^\"]+)\"', re.IGNORECASE)
    if not LabelReObj:
        re.purge()
        print(ErrorMainList[0])
        return 254
    DataRawStr = ''
    DataEncSplit = None
    DataReObj = re.compile('CKA_VALUE MULTILINE_OCTAL', re.IGNORECASE)
    if not DataReObj:
        re.purge()
        print(ErrorMainList[0])
        return 254
    OctetsReObj = re.compile('[0-7][0-7][0-7]', re.IGNORECASE)
    if not OctetsReObj:
        re.purge()
        print(ErrorMainList[0])
        return 254
    EndReObj = re.compile('END', re.IGNORECASE)
    if not EndReObj:
        re.purge()
        print(ErrorMainList[0])
        return 254

    TrustReObj = re.compile('CKA_CLASS CK_OBJECT_CLASS CKO_NSS_TRUST',
                            re.IGNORECASE)
    if not TrustReObj:
        print(ErrorMainList[0])
        return 254
    TrustPurpose = ''
    TrustLevel = ''
    TrustPrimaryReObj = re.compile(
        r'CKA_TRUST_([a-z_]+) CK_TRUST CKT_NSS_([a-z_]+)', re.IGNORECASE)
    if not TrustPrimaryReObj:
        print(ErrorMainList[0])
        return 254

    try:
        os.remove(OutputName)
        print('Deleted file "%s".' % OutputName)
    except:
        pass
    FTxtInObj = open(InputName, 'rb')
    FTxtOutObj = open(OutputName, 'wb')

    for RawLine in FTxtInObj:
        FileLineNo += 1
        if DoEncode:
            try:
                EncodeLine = RawLine.encode('utf_8', 'strict')
            except:
                try:
                    EncodeLine = ''
                    if LabelPrinted:
                        TempStr = ErrorEncodeList[0] + ErrorEncodeList[
                            1] + ErrorEncodeList[0] + ErrorEncodeList[2]
                    else:
                        TempStr = '\n' + ErrorEncodeList[1] + ErrorEncodeList[
                            0] + ErrorEncodeList[2]
                    print(TempStr % FileLineNo)
                    TempStr = ''
                    EncodeLine = RawLine.encode('utf_8', 'ignore')
                    if LabelPrinted:
                        TempStr = ErrorEncodeList[0]
                    TempStr += ErrorEncodeList[0] + ErrorEncodeList[4]
                    print(TempStr)
                    TempStr = ''
                except:
                    try:
                        EncodeLine = RawLine
                        if LabelPrinted:
                            TempStr = ErrorEncodeList[0]
                        TempStr += ErrorEncodeList[0] + ErrorEncodeList[4]
                        if not LabelPrinted:
                            TempStr += '\n'
                        print(TempStr)
                        TempStr = ''
                    except:
                        ErrorNumber = 250
                        break
            ListObj = EncodeLine.splitlines(False)
            EncodeLine = ''
        else:
            ListObj = RawLine.splitlines(False)
        ListRemoveEmpty(ListObj)
        if not ListObj:
            if DataBegin:
                try:
                    print(ErrorReadList[1] % FileLineNo)
                finally:
                    ErrorNumber = 3
                    break
            continue
        CleanLine = ListObj[0].strip()
        ListClean(ListObj)
        if (not CleanLine) or (CleanLine == '#'):
            if DataBegin:
                try:
                    print(ErrorReadList[1] % FileLineNo)
                finally:
                    ErrorNumber = 3
                    break
            continue
        if CertBegin:
            if not DataBegin:
                if CertReObj.match(CleanLine):
                    ListClean(DataEncSplit)
                    DataRawStr = ''
                    LabelStr = ''
                    TrustBegin = False
                    DataEnd = False
                    DataBegin = False
                    if LabelPrinted:
                        LabelPrinted = False
                        print('    CANCELING. Found NON CA. Line %d.' %
                              FileLineNo)
                else:
                    if not TrustBegin:
                        if not DataEnd:
                            if not LabelStr:
                                try:
                                    ReObj = LabelReObj.match(CleanLine)
                                    if ReObj:
                                        LabelStr = ReObj.group(1)
                                        if not LabelStr:
                                            raise ValueError
                                        else:
                                            LabelPrinted = True
                                            LabelStr = CorrectCertLabel(
                                                LabelStr)
                                            print(
                                                '\nCertificate on Line %d\n    "%s"'
                                                % (FileLineNo, LabelStr))
                                            VarLen = len(LabelStr)
                                            LabelStr += FStyleNL
                                            for I in range(0, VarLen, 1):
                                                LabelStr += FStyleLabelUR
                                            LabelStr += FStyleNL
                                except:
                                    try:
                                        print(ErrorReadList[0] % FileLineNo)
                                    finally:
                                        ErrorNumber = 2
                                        break
                            elif DataReObj.match(CleanLine):
                                DataBegin = True
                        elif TrustReObj.match(CleanLine):
                            TrustBegin = True
                    else:
                        try:
                            ReObj = TrustPrimaryReObj.match(CleanLine)
                            if ReObj:
                                TrustPurpose = ReObj.group(1).upper()
                                TrustLevel = ReObj.group(2).upper()
                                if (TrustPurpose in MozillaTrustReqPrimary
                                    ) and (TrustLevel
                                           == MozillaTrustLevels[0]):
                                    FTxtOutObj.write(FStyleNL)
                                    if IsLabelInOutput:
                                        FTxtOutObj.write(LabelStr)
                                    for I in range(0, len(DataEncSplit), 1):
                                        FTxtOutObj.write(DataEncSplit[I])
                                    FTxtOutObj.flush()
                                    ListClean(DataEncSplit)
                                    DataRawStr = ''
                                    LabelStr = ''
                                    LabelPrinted = False
                                    TrustBegin = False
                                    DataEnd = False
                                    DataBegin = False
                                    CertBegin = False
                                    print('    SAVE.')
                                TrustLevel = ''
                                TrustPurpose = ''
                        except:
                            try:
                                print(ErrorReadList[2] % FileLineNo)
                            finally:
                                ErrorNumber = 4
                                break
            else:
                if EndReObj.match(CleanLine):
                    try:
                        DataEncSplit = CertToBase64(DataRawStr, True, True)
                        DataRawStr = ''
                        if not DataEncSplit:
                            raise ValueError
                    except:
                        try:
                            print(ErrorReadList[1] % FileLineNo)
                        finally:
                            ErrorNumber = 250
                            break
                    DataBegin = False
                    DataEnd = True
                else:
                    try:
                        ListObj = CleanLine.split('\\')
                        if not ListObj:
                            raise ValueError
                    except:
                        try:
                            print(ErrorReadList[1] % FileLineNo)
                        finally:
                            ErrorNumber = 250
                            break
                    ListRemoveEmpty(ListObj)
                    if not ListObj:
                        try:
                            print(ErrorReadList[1] % FileLineNo)
                        finally:
                            ErrorNumber = 3
                            break
                    for I in range(0, len(ListObj), 1):
                        if not OctetsReObj.match(ListObj[I]):
                            ErrorNumber = 3
                            break
                        try:
                            DataRawStr += chr(int(ListObj[I], 8))
                        except:
                            ErrorNumber = 3
                            break
                    ListClean(ListObj)
                    if ErrorNumber:
                        try:
                            print(ErrorReadList[1] % FileLineNo)
                        finally:
                            break
        elif CertReObj.match(CleanLine):
            CertBegin = True

    FTxtOutObj.close()
    FTxtInObj.close()
    re.purge()
    return ErrorNumber