def read_prpixel_file(filename):
    f = open(filename)
    s = ''.join(f.readlines())
    f.close()

    # Track:
    # {'ntrack':, 'nhits':, 'hits': [{'hitid':, 'module':, 'x':, 'y':, 'z':}, ...]}
    prpixel_tracks = []

    # Find all debug lines with created tracks
    for i in re.finditer('Store track Nb (?P<ntrack>\d+)[^\d]nhits (?P<nhits>\d+).*?PrPixelTracking[ \t]*?(INFO ===|DEBUG)', s, re.DOTALL):
        hits = []
        # Find all hits in the track
        for j in re.finditer('PrPixelTracking.*?(?P<hitid>\d+) *module *(?P<module>\d+) x *(?P<x>[\d\.\-]+) y *(?P<y>[\d\.\-]+) z *(?P<z>[\d\-\.]+) used \d', i.group(0), re.DOTALL):
            hits.append({'hitid': j.group('hitid'), 'module': j.group('module'),
                         'x': j.group('x'), 'y': j.group('y'), 'z': j.group('z')})
        prpixel_tracks.append({'ntrack': i.group('ntrack'), 'nhits': i.group('nhits'), 'hits': hits})
    return prpixel_tracks
def _retrieve_mails(uri):
    LOG.debug('Retrieving mail archive from uri: %s', uri)
    content = utils.read_uri(uri)
    if not content:
        LOG.error('Error reading mail archive from uri: %s', uri)
        return
    content = utils.gzip_decompress(content)
    LOG.debug('Mail archive is loaded, start processing')

    content += TRAILING_RECORD

    for rec in re.finditer(MAIL_BOX_PATTERN, content):
        email = rec.groupdict()
        email['author_email'] = email['author_email'].replace(' at ', '@', 1)
        if not utils.check_email_validity(email['author_email']):
            continue
        email['date'] = int(email_utils.mktime_tz(
            email_utils.parsedate_tz(email['date'])))

        for pattern_name, pattern in six.iteritems(MESSAGE_PATTERNS):
            collection = set()
            for item in re.finditer(pattern, email['body']):
                groups = item.groupdict()
                item_id = groups['id']
                if 'module' in groups:
                    item_id = groups['module'] + ':' + item_id
                    email['module'] = groups['module']
                collection.add(item_id)
            email[pattern_name] = list(collection)

        yield email
def obfuscate_codeblocks(source):
    """Obfuscate codeblock contents.

    It is often useful to temporarily obfuscate codeblock contents so that other
    processing can be performed safely, re-introducing the contents afterwards.

    Parameters
    ----------
    source : str
        string (as a single stream) containing the source

    Returns
    -------
    protected_contents : list
        list of str containing the contents of the codeblocks
    str
        source with codeblock contents obfuscated and replaced by a safe placeholder

    >>> source = '``` my code block ``` other contents'
    >>> prot, ob_source = obfuscate_codeblocks(source)
    >>> prot[0][2]
    '``` my code block ```'
    >>> ob_source
    '$PROTECTED-1 other contents'
    """
    obfuscate_source = source
    protected_contents = []
    for match in re.finditer(__regex_codeblock__, obfuscate_source):
        protected_contents.append([match.start(), match.end(), match.group()])
        obfuscate_source = re.sub(__regex_codeblock__, '$PROTECTED-' + str(len(protected_contents)), obfuscate_source, 1)
    for match in re.finditer(__regex_codeblock_html__, obfuscate_source):
        protected_contents.append([match.start(), match.end(), match.group()])
        obfuscate_source = re.sub(__regex_codeblock_html__, '$PROTECTED-' + str(len(protected_contents)), obfuscate_source, 1)
    return protected_contents, obfuscate_source
def get_pronoun_label_zh(line):
    f_pronouns = ['我', '我们', '我 的']
    s_pronouns = ['你', '你们', '你 的']
    f_count = 0
    s_count = 0
    f_positions = []
    s_positions = []

    for pro in f_pronouns:
        f_zh = re.findall('^' + pro + ' ', line) + re.findall(' ' + pro + ' ', line) + re.findall(' ' + pro + '$', line)
        f_positions += [m.span()[0] for m in re.finditer('^' + pro + ' ', line)] + \
                       [m.span()[0] for m in re.finditer(' ' + pro + ' ', line)] + \
                       [m.span()[0] for m in re.finditer(' ' + pro + '$', line)]
        f_count += len(f_zh)

    for pro in s_pronouns:
        s_zh = re.findall('^' + pro + ' ', line) + re.findall(' ' + pro + ' ', line) + re.findall(' ' + pro + '$', line)
        s_positions += [m.span()[0] for m in re.finditer('^' + pro + ' ', line)] + \
                       [m.span()[0] for m in re.finditer(' ' + pro + ' ', line)] + \
                       [m.span()[0] for m in re.finditer(' ' + pro + '$', line)]
        s_count += len(s_zh)

    if f_count == 0 and s_count == 0:
        return ('none', 0, 0, [], [])

    if f_count == s_count:
        f_min = min(f_positions)
        s_min = min(s_positions)
        starts_with = '1v' if f_min < s_min else '2v'
        return (starts_with, f_count, s_count, f_zh, s_zh)
        # return ('1v', f_count, s_count, f_zh, s_zh)
    elif f_count > s_count:
        return ('1v', f_count, s_count, f_zh, s_zh)
    else:
        return ('2v', f_count, s_count, f_zh, s_zh)
def ParseMethodAnnotation(self, annotation):
    if annotation.find('reservable = true') >= 0:
        self._is_reservable = True

    delegate_re = re.compile('delegate\s*=\s*'
                             '(?P<delegate>(true|false))')
    for match in re.finditer(delegate_re, annotation):
        delegate = match.group('delegate')
        if delegate == 'true':
            self._is_delegate = True
        elif delegate == 'false':
            self._is_delegate = False

    disable_reflect_method_re = re.compile('disableReflectMethod\s*=\s*'
                                           '(?P<disableReflectMethod>(true|false))')
    for match in re.finditer(disable_reflect_method_re, annotation):
        disable_reflect_method = match.group('disableReflectMethod')
        if disable_reflect_method == 'true':
            self._disable_reflect_method = True
        else:
            self._disable_reflect_method = False

    pre_wrapline_re = re.compile('preWrapperLines\s*=\s*\{\s*('
                                 '?P<pre_wrapline>(".*")(,\s*".*")*)\s*\}')
    for match in re.finditer(pre_wrapline_re, annotation):
        pre_wrapline = self.FormatWrapperLine(match.group('pre_wrapline'))
        self._method_annotations[self.ANNOTATION_PRE_WRAPLINE] = pre_wrapline

    post_wrapline_re = re.compile('postWrapperLines\s*=\s*\{\s*('
                                  '?P<post_wrapline>(".*")(,\s*".*")*)\s*\}')
    for match in re.finditer(post_wrapline_re, annotation):
        post_wrapline = self.FormatWrapperLine(match.group('post_wrapline'))
        self._method_annotations[self.ANNOTATION_POST_WRAPLINE] = post_wrapline
def prepair_query(self, media, *args, **kwards): if media == 'tvshow': uri = '/search/advanced_search.php?' query = {"q": args[0], "from_year": args[3], "to_year": args[3], "section": 2} uri += urllib.urlencode(query) html = self.request(uri) r = re.search('Search Results For: "(.*?)</table>', html, re.DOTALL) if r: fragment = r.group(1) pattern = r'<a\s+href="([^"]+)"\s+title="([^"]+)' for match in re.finditer(pattern, fragment): url, title_year = match.groups('') url = url.replace('-tvshow-online-free-putlocker.html', '-tvshow-season-%s-episode-%s-online-free-putlocker.html' % (args[1], args[2])) uri = url.replace(self.base_url, '') return uri return False else: uri = '/search/advanced_search.php?' query = {"q": args[0], "from_year": args[1], "to_year": args[1], "section": 1} uri += urllib.urlencode(query) html = self.request(uri) r = re.search('Search Results For: "(.*?)</table>', html, re.DOTALL) if r: fragment = r.group(1) pattern = r'<a\s+href="([^"]+)"\s+title="([^"]+)' for match in re.finditer(pattern, fragment): url, title_year = match.groups('') uri = url.replace(self.base_url, '') return uri return False
def extractRequirements(args):
    f = open(args.requirements_path)
    text = f.read()
    f.close()

    data = {}  # [['Requirement Number', 'Requirement Description']]

    # Extract all of the tables
    for table in re.finditer(r'\\begin{tabular}.*?\\end{tabular}', text, re.S):
        if re.search(r'Number.*?Requirement Description', table.group(0)) != None:
            header_row = True
            # Look for this pattern:
            #   F1.10 & Description
            for req in re.finditer(r'([\w\.]+)\s+&\s+(.*?)\s*\\\\', table.group(0)):
                if header_row:
                    header_row = False  # Skip the header
                    continue
                else:
                    # Store the requirement number and description in the data structure
                    # data.append([req.group(1), req.group(2)])
                    data[req.group(1)] = [req.group(1), req.group(2), set()]
    return data
def get_sources(self, video): source_url = self.get_url(video) hosters = [] if source_url and source_url != FORCE_NO_MATCH: url = urlparse.urljoin(self.base_url, source_url) html = self._http_get(url, cache_limit=.5) fragment = dom_parser.parse_dom(html, 'div', {'class': '[^"]*movie_langs_list[^"]*'}) if fragment: for match in re.finditer('href="([^"]+)', fragment[0]): match = re.search('movie-player/(.*)', match.group(1)) if match: player_url = urlparse.urljoin(self.base_url, PLAYER_URL % (match.group(1))) html = self._http_get(player_url, cache_limit=.5) match = re.search('<source\s+src="([^"]+)', html) if match: stream_url = match.group(1) hoster = {'multi-part': False, 'url': stream_url, 'class': self, 'quality': self._gv_get_quality(stream_url), 'host': self._get_direct_hostname(stream_url), 'rating': None, 'views': None, 'direct': True} hosters.append(hoster) fragment2 = dom_parser.parse_dom(html, 'ul', {'class': 'servers'}) if fragment2: for match in re.finditer('href="([^"]+).*?<span>(.*?)</span>', fragment2[0]): other_url, quality = match.groups() match = re.search('movie-player/(.*)', other_url) if match: other_url = urlparse.urljoin(self.base_url, PLAYER_URL % (match.group(1))) if other_url == player_url: continue hoster = {'multi-part': False, 'url': other_url, 'class': self, 'quality': QUALITY_MAP.get(quality, QUALITIES.HD720), 'host': self._get_direct_hostname(other_url), 'rating': None, 'views': None, 'direct': True} hosters.append(hoster) return hosters
def tableViewInHierarchy():
    viewDescription = fb.evaluateExpressionValue(
        "(id)[(id)[[UIApplication sharedApplication] keyWindow] recursiveDescription]"
    ).GetObjectDescription()

    searchView = None

    # Try to find an instance of UITableView
    classPattern = re.compile(r"UITableView: (0x[0-9a-fA-F]+);")
    for match in re.finditer(classPattern, viewDescription):
        searchView = match.group(1)
        break

    # Try to find a direct subclass
    if not searchView:
        subclassPattern = re.compile(r"(0x[0-9a-fA-F]+); baseClass = UITableView;")
        for match in re.finditer(subclassPattern, viewDescription):
            searchView = match.group(1)
            break

    # SLOW: check every pointer in town
    if not searchView:
        pattern = re.compile(r"(0x[0-9a-fA-F]+)[;>]")
        for view in re.findall(pattern, viewDescription):
            if fb.evaluateBooleanExpression("[" + view + " isKindOfClass:(id)[UITableView class]]"):
                searchView = view
                break

    return searchView
def google_scrap(self): #iter = re.finditer('''<!--sMSL-->([\s\S]*)<!--sMSR-->''',self.the_page) #for it in iter: # self.res_html = it.group(1) iter = re.finditer('''<p>([\s\S]*?)</p>''',self.the_page) for it in iter: self.search_item.append(it.group(1)) for i in range(len(self.search_item)): href = "" title = "" iter = re.finditer('''<a.href="/url\?q=[^"]*">([\s\S]*?)</a>[\s\S]*?<a.href="/search\?q=related:([^"]*)&hl=">''',self.search_item[i]) #iter = re.finditer('''<a.href="[^"]*">([^"]*)</a>[\s\S]*?<a.href="/search?q=related:([^"]*)&hl=">''',self.search_item[i]) for it in iter: href = it.group(2) if href == '': break title = it.group(1) textProcess = textprocess.TextProcess(title) textProcess.getTitle() textProcess.clearPoint() textProcess.clearSpace() self.html_titles[href] = textProcess.text if href == '': continue textProcess = textprocess.TextProcess(self.search_item[i]) textProcess.clearHtml() textProcess.clearPoint() textProcess.clearSpace() text = textProcess.text self.res_items[href] = [self.html_titles[href],text] print self.html_titles
def scan_page(url, data=None): retval, usable = False, False url, data = re.sub(r"=(&|\Z)", "=1\g<1>", url) if url else url, re.sub(r"=(&|\Z)", "=1\g<1>", data) if data else data try: for phase in (GET, POST): current = url if phase is GET else (data or "") for match in re.finditer(r"((\A|[?&])(?P<parameter>[\w\[\]]+)=)(?P<value>[^&]+)", current): found, usable = False, True print "* scanning %s parameter '%s'" % (phase, match.group("parameter")) prefix, suffix = ("".join(random.sample(string.ascii_lowercase, PREFIX_SUFFIX_LENGTH)) for i in xrange(2)) for pool in (LARGER_CHAR_POOL, SMALLER_CHAR_POOL): if not found: tampered = current.replace(match.group(0), "%s%s" % (match.group(0), urllib.quote("%s%s%s%s" % ("'" if pool == LARGER_CHAR_POOL else "", prefix, "".join(random.sample(pool, len(pool))), suffix)))) content = (_retrieve_content(tampered, data) if phase is GET else _retrieve_content(url, tampered)).replace("%s%s" % ("'" if pool == LARGER_CHAR_POOL else "", prefix), prefix) for sample in re.finditer("%s([^ ]+?)%s" % (prefix, suffix), content, re.I): for regex, condition, info, content_removal_regex in XSS_PATTERNS: context = re.search(regex % {"chars": re.escape(sample.group(0))}, re.sub(content_removal_regex or "", "", content), re.I) if context and not found and sample.group(1).strip(): if _contains(sample.group(1), condition): print " (i) %s parameter '%s' appears to be XSS vulnerable (%s)" % (phase, match.group("parameter"), info % dict((("filtering", "no" if all(char in sample.group(1) for char in LARGER_CHAR_POOL) else "some"),))) found = retval = True break if not usable: print " (x) no usable GET/POST parameters found" except KeyboardInterrupt: print "\r (x) Ctrl-C pressed" return retval
def parseHtmlForm(attr_str, html, input_names=None):
    for form in re.finditer(r"(?P<tag><form[^>]*%s[^>]*>)(?P<content>.*?)</?(form|body|html)[^>]*>" % attr_str,
                            html, re.S | re.I):
        inputs = {}
        action = parseHtmlTagAttrValue("action", form.group('tag'))

        for inputtag in re.finditer(r'(<(input|textarea)[^>]*>)([^<]*(?=</\2)|)',
                                    form.group('content'), re.S | re.I):
            name = parseHtmlTagAttrValue("name", inputtag.group(1))
            if name:
                value = parseHtmlTagAttrValue("value", inputtag.group(1))
                if value is None:
                    inputs[name] = inputtag.group(3) or ''
                else:
                    inputs[name] = value

        if isinstance(input_names, dict):
            # check input attributes
            for key, val in input_names.items():
                if key in inputs:
                    if isinstance(val, basestring) and inputs[key] == val:
                        continue
                    elif isinstance(val, tuple) and inputs[key] in val:
                        continue
                    elif hasattr(val, "search") and re.match(val, inputs[key]):
                        continue
                    break  # attribute value does not match
                else:
                    break  # attribute name does not match
            else:
                return action, inputs  # passed attribute check
        else:
            # no attribute check
            return action, inputs

    return {}, None  # no matching form found
def consistency_check(text, word_pairs, err, msg, offset=0):
    """Build a consistency checker for the given word_pairs."""
    errors = []

    msg = " ".join(msg.split())

    for w in word_pairs:
        match1 = [m for m in re.finditer(w[0], text)]
        match2 = [m for m in re.finditer(w[1], text)]

        if len(match1) > 0 and len(match2) > 0:
            if len(match1) > len(match2):
                for m in match2:
                    errors.append((
                        m.start() + offset,
                        m.end() + offset,
                        err,
                        msg.format(m.group(0), w[0])))
            else:
                for m in match1:
                    errors.append((
                        m.start() + offset,
                        m.end() + offset,
                        err,
                        msg.format(m.group(0), w[1])))

    return errors
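# A minimal usage sketch for the checker above; the word pair, error code and
# message template are illustrative placeholders, not values from the original
# project. The less frequent spelling is the one that gets reported.
sample = "The color of the sky and the color of the sea, but the colour of the car."
errors = consistency_check(
    sample, [["colour", "color"]], "example.consistency",
    "Inconsistent spelling of '{}' (elsewhere written '{}').")
# "color" occurs twice and "colour" once, so only the "colour" match is flagged:
# [(55, 61, 'example.consistency',
#   "Inconsistent spelling of 'colour' (elsewhere written 'color').")]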
def get_media_url(self, host, media_id): web_url = self.get_url(host, media_id) html = self.net.http_GET(web_url).content form_values = {} stream_url = '' for i in re.finditer('<input type="hidden" name="([^"]+)" value="([^"]+)', html): form_values[i.group(1)] = i.group(2) xbmc.sleep(2000) html = self.net.http_POST(web_url, form_data=form_values).content r = re.search("file\s*:\s*'([^']+)'", html) if r: stream_url = r.group(1) for match in re.finditer('(eval\(function.*?)</script>', html, re.DOTALL): js_data = jsunpack.unpack(match.group(1)) match2 = re.search('<param\s+name="src"\s*value="([^"]+)', js_data) if match2: stream_url = match2.group(1) else: match2 = re.search('<embed.*?type="video.*?src="([^"]+)', js_data) if match2: stream_url = match2.group(1) if stream_url: return stream_url + '|User-Agent=%s&Referer=%s' % (common.IE_USER_AGENT, web_url) raise UrlResolver.ResolverError('Unable to resolve cloudyvideos link. Filelink not found.')
def setupTranslations(type, locales, projectName, key): # Copy locales list, we don't want to change the parameter locales = set(locales) # Fill up with locales that we don't have but the browser supports if type == 'chrome': for locale in chromeLocales: locales.add(locale) else: firefoxLocales = urllib2.urlopen('http://www.mozilla.org/en-US/firefox/all.html').read() for match in re.finditer(r'&lang=([\w\-]+)"', firefoxLocales): locales.add(mapLocale(type, match.group(1))) langPacks = urllib2.urlopen('https://addons.mozilla.org/en-US/firefox/language-tools/').read() for match in re.finditer(r'<tr>.*?</tr>', langPacks, re.S): if match.group(0).find('Install Language Pack') >= 0: match2 = re.search(r'lang="([\w\-]+)"', match.group(0)) if match2: locales.add(mapLocale(type, match2.group(1))) # Convert locale codes to the ones that Crowdin will understand locales = set(map(lambda locale: mapLocale(type, locale), locales)) allowed = set() allowedLocales = urllib2.urlopen('http://crowdin.net/page/language-codes').read() for match in re.finditer(r'<tr>\s*<td\b[^<>]*>([\w\-]+)</td>', allowedLocales, re.S): allowed.add(match.group(1)) if not allowed.issuperset(locales): print 'Warning, following locales aren\'t allowed by server: ' + ', '.join(locales - allowed) locales = list(locales & allowed) locales.sort() params = urllib.urlencode([('languages[]', locale) for locale in locales]) result = urllib2.urlopen('http://api.crowdin.net/api/project/%s/edit-project?key=%s' % (projectName, key), params).read() if result.find('<success') < 0: raise Exception('Server indicated that the operation was not successful\n' + result)
def processticker(ticker, file_name, date_int, listview): base_url = "http://finance.yahoo.com/q/op" num_of_tries = 0 payload = {"s": ticker, "date": date_int} r = requests.get(base_url, params=payload) data = r.text soup = BeautifulSoup(data, "lxml") option_list = [] expiration_dictionary = {} while num_of_tries < 20: try: for pair in soup.find_all("option"): expiration_dictionary[pair.get_text()] = yahoo_url + pair["data-selectbox-link"] for n in soup.find_all("script"): option_list.append(n) raw_options_chain = str(option_list.pop(16)) start_call_options = [a.start() for a in list(re.finditer("calls", raw_options_chain))] endoptions = [a.start() for a in list(re.finditer("_options", raw_options_chain))] raw_options_chain = raw_options_chain[start_call_options[0] - 2 : endoptions[0] - 2] options_json = json.loads(raw_options_chain) # Extract puts/calls as JSON objects. put_list = options_json["puts"] call_list = options_json["calls"] print(call_list) create_csv(call_list, put_list, file_name, listview) except IndexError: num_of_tries += 1 continue break
def consistency_check(text, word_pairs, err, msg, offset=0):
    """Build a consistency checker for the given word_pairs."""
    errors = []

    msg = " ".join(msg.split())

    for w in word_pairs:
        matches = [
            [m for m in re.finditer(w[0], text)],
            [m for m in re.finditer(w[1], text)]
        ]

        if len(matches[0]) > 0 and len(matches[1]) > 0:
            idx_minority = len(matches[0]) > len(matches[1])

            for m in matches[idx_minority]:
                errors.append((
                    m.start() + offset,
                    m.end() + offset,
                    err,
                    msg.format(w[~idx_minority], m.group(0)),
                    w[~idx_minority]))

    return errors
def html2utf8(self,in_html): in_html = (re.subn(r'<(script).*?</\1>(?s)', '', in_html)[0]) in_html = (re.subn(r'<(style).*?</\1>(?s)', '', in_html)[0]) entitydict = {} entities = re.finditer('&([^#][A-Za-z]{1,5}?);', in_html) for x in entities: key = x.group(0) if key not in entitydict: entitydict[key] = htmlentitydefs.name2codepoint[x.group(1)] entities = re.finditer('&#x([0-9A-Fa-f]{2,2}?);', in_html) for x in entities: key = x.group(0) if key not in entitydict: entitydict[key] = "%d" % int(key[3:5], 16) entities = re.finditer('&#(\d{1,5}?);', in_html) for x in entities: key = x.group(0) if key not in entitydict: entitydict[key] = x.group(1) if re.search("charset=utf-8", in_html): for key, codepoint in iteritems(entitydict): in_html = in_html.replace(key, unichr(int(codepoint))) self.inhtml = in_html.encode('utf8') return for key, codepoint in iteritems(entitydict): in_html = in_html.replace(key, unichr(int(codepoint)).encode('latin-1', 'ignore')) self.inhtml = in_html.decode('latin-1').encode('utf8')
def _attack(self, basePair, payloads, taint, request_template, referer):
    proto = helpers.analyzeRequest(basePair).getUrl().getProtocol() + '://'
    if 'abshost' in payloads:
        payloads['abshost'] = proto + payloads['abshost']
    payloads['referer'] = proto + taint + '/' + referer

    # Load the supplied payloads into the request
    if 'xfh' in payloads:
        payloads['xfh'] = "\r\nX-Forwarded-Host: " + payloads['xfh']

    for key in ('xfh', 'abshost', 'host', 'referer'):
        if key not in payloads:
            payloads[key] = ''

    # Ensure that the response to our request isn't cached - that could be harmful
    payloads['cachebust'] = str(time.time())

    request = request_template.substitute(payloads)

    attack = callbacks.makeHttpRequest(basePair.getHttpService(), request)
    response = safe_bytes_to_string(attack.getResponse())

    requestHighlights = [jarray.array([m.start(), m.end()], 'i') for m in
                         re.finditer('(' + '|'.join(payloads.values()) + ')',
                                     safe_bytes_to_string(attack.getRequest()))]
    responseHighlights = [jarray.array([m.start(), m.end()], 'i') for m in
                          re.finditer(taint, response)]
    attack = callbacks.applyMarkers(attack, requestHighlights, responseHighlights)

    return attack, response
def get_media_url(self, host, media_id): web_url = self.get_url(host, media_id) html = self.net.http_GET(web_url).content data = {} for match in re.finditer('input type="hidden" name="([^"]+)" value="([^"]+)', html): key, value = match.groups() data[key] = value data['method_free'] = 'Proceed to Video' html = self.net.http_POST(web_url, form_data=data).content stream_url = '' for match in re.finditer('(eval\(function.*?)</script>', html, re.DOTALL): js_data = jsunpack.unpack(match.group(1)) match2 = re.search('<param\s+name="src"\s*value="([^"]+)', js_data) if match2: stream_url = match2.group(1) else: match2 = re.search('file\s*:\s*"([^"]+)', js_data) if match2: stream_url = match2.group(1) if stream_url: return stream_url + '|' + urllib.urlencode({'User-Agent': common.IE_USER_AGENT, 'Referer': web_url}) raise ResolverError('Unable to resolve grifthost link. Filelink not found.')
def replaceDeployFile(out_file,template_file,train_file,fix_layers=None): f=open(template_file,'rb'); text=f.read()[:]; f.close(); text=text.replace('$TRAIN_TXT','"'+train_file+'"'); if fix_layers is not None: start_excludes=[]; for fix_layer_curr in fix_layers: starts = [match.start() for match in re.finditer(re.escape('name: "'+fix_layer_curr), text)] assert len(starts)==1; # start_excludes=starts[:]; start_excludes.append(starts[0]); starts=[match.start() for match in re.finditer(re.escape('name: '), text)] starts=[idx for idx in starts if idx not in start_excludes]; starts.sort(); starts=starts[::-1]; # starts=starts[1:]; for start in starts: string_orig=text[start:]; string_orig=string_orig[:string_orig.index('\n')] # [:string_orig.rindex('"')+1] # print string_orig string_new=string_orig[:string_orig.rindex('"')]+'_fix"'; # print string_new,string_orig text=text.replace(string_orig,string_new); f=open(out_file,'wb') f.write(text); f.close();
def stem_helper(word, rem_umlaut=True):
    """rem_umlaut: remove umlauts from the text"""
    # Define the R1 and R2 regions.
    # R1 is the region after the first consonant that follows a vowel.
    try:
        R1 = list(re.finditer(r"[aëeiouäöüâêîôûæœ][bdghfcjklmnspqrtvwz]", word))[0].start() + 2
    except:
        R1 = len(word)

    # R2 is the region within R1 after the first consonant that follows a vowel.
    try:
        R2 = list(re.finditer(r"[aëeiouäöüâêîôûæœ][bdghfcjklmnspqrtvwz]", word[R1:]))[0].start() + 2 + R1
    except:
        R2 = len(word)

    # Make sure the index of R1 is at least 3.
    if R1 < 3:
        try:
            R1 = list(re.finditer(r"[aëeiouäöüâêîôûæœ][bdghfcjklmnspqrtvwz]", word[1:]))[0].start() + 2
        except:
            R1 = len(word)

    if rem_umlaut:
        word = remove_umlaut(word)

    word = word[:R1] + re.sub(r'(wes|wen|est|ern|em|en|er|es|eȥ(?=[klmrt])s|(?=[lr])n|e)$', "", word[R1:])
    word = word[:R1] + re.sub(r'(est|er|en|re|in|iu|(?=.{3})st)$', "", word[R1:])
    word = word[:R2] + re.sub(r'(lich?.?.|keit|inc|isch?.?.)$', "", word[R2:])

    return word
def get_sources(self, video): source_url = self.get_url(video) hosters = [] if source_url and source_url != FORCE_NO_MATCH: url = urlparse.urljoin(self.base_url, source_url) html = self._http_get(url, cache_limit=.5) match = re.search('This movie is of poor quality', html, re.I) if match: quality = QUALITIES.LOW else: quality = QUALITIES.HIGH for match in re.finditer('href="([^"]+/embed\d*/[^"]+)', html): url = match.group(1) embed_html = self._http_get(url, cache_limit=.5) r = re.search('{\s*write\("([^"]+)', embed_html) if r: plaintext = self._caesar(r.group(1), 13).decode('base-64') if 'http' not in plaintext: plaintext = self._caesar(r.group(1).decode('base-64'), 13).decode('base-64') else: plaintext = embed_html hosters += self._get_links(plaintext) pattern = 'href="([^"]+)".*play_video.gif' for match in re.finditer(pattern, html, re.I): url = match.group(1) host = urlparse.urlparse(url).hostname hoster = {'multi-part': False, 'url': url, 'host': host, 'class': self, 'quality': scraper_utils.get_quality(video, host, quality), 'rating': None, 'views': None, 'direct': False} hosters.append(hoster) return hosters
def list_show_page(self, url, page, seasons=False, episodes=False): result = [] if "/p/epizody" in url or "/p/epiz%C3%B3dy" in url or "p/archiv" in url: if seasons: season_data = util.substr(page, SERIES_START2, SERIES_END2) for m in re.finditer(SERIES_ITER_RE2, season_data, re.DOTALL | re.IGNORECASE): item = self.dir_item() item['title'] = m.group('title') item['url'] = url + '#post=%s' % (m.group('id')) self._filter(result, item) if episodes: for m in re.finditer(EPISODE_ITER_RE2, page, re.DOTALL | re.IGNORECASE): item = self.video_item() item['title'] = "%s (%s)" % (m.group('title'), m.group('date')) item['url'] = m.group('url') self._filter(result, item) else: if seasons: season_data = util.substr(page, SERIES_START, SERIES_END) for m in re.finditer(SERIES_ITER_RE, season_data, re.DOTALL | re.IGNORECASE): item = self.dir_item() item['title'] = m.group('title') item['url'] = 'http://' + urlparse.urlparse(url).netloc + '/ajax.json?' + m.group('url') self._filter(result, item) if episodes: episodes_data = util.substr(page, EPISODE_START, EPISODE_END) for m in re.finditer(EPISODE_ITER_RE, page, re.DOTALL | re.IGNORECASE): item = self.video_item() item['title'] = "%s. %s (%s)" % (m.group('episode'), m.group('title'), m.group('date')) item['url'] = m.group('url') self._filter(result, item) return result
def dotransform(request, response):
    emailaddr = []
    msgfile = request.value
    lookFor = ['To', 'From']
    tmpfolder = request.fields['sniffMyPackets.outputfld']

    with open(msgfile, mode='r') as msgfile:
        reader = msgfile.read()
        reader = str(reader)

    for x in lookFor:
        if x in reader:
            for s in re.finditer('RCPT TO: <([\w.-]+@[\w.-]+)>', reader):
                to_addr = s.group(1), 'mail_to'
                emailaddr.append(to_addr)
            for t in re.finditer('MAIL FROM: <([\w.-]+@[\w.-]+)>', reader):
                from_addr = t.group(1), 'mail_from'
                emailaddr.append(from_addr)

    for addr, addrfield in emailaddr:
        e = EmailAddress(addr)
        e.linklabel = addrfield
        e += Field('filelocation', request.value, displayname='File Location', matchingrule='loose')
        e += Field('emailaddr', addrfield, displayname='Header Info')
        response += e
    return response
def setupTranslations(localeConfig, projectName, key): # Make a new set from the locales list, mapping to Crowdin friendly format locales = {mapLocale(localeConfig['name_format'], locale) for locale in localeConfig['locales']} # Fill up with locales that we don't have but the browser supports if 'chrome' in localeConfig['target_platforms']: for locale in chromeLocales: locales.add(mapLocale('ISO-15897', locale)) if 'gecko' in localeConfig['target_platforms']: firefoxLocales = urllib2.urlopen('http://www.mozilla.org/en-US/firefox/all.html').read() for match in re.finditer(r'&lang=([\w\-]+)"', firefoxLocales): locales.add(mapLocale('BCP-47', match.group(1))) langPacks = urllib2.urlopen('https://addons.mozilla.org/en-US/firefox/language-tools/').read() for match in re.finditer(r'<tr>.*?</tr>', langPacks, re.S): if match.group(0).find('Install Language Pack') >= 0: match2 = re.search(r'lang="([\w\-]+)"', match.group(0)) if match2: locales.add(mapLocale('BCP-47', match2.group(1))) allowed = set() allowedLocales = urllib2.urlopen('http://crowdin.net/page/language-codes').read() for match in re.finditer(r'<tr>\s*<td\b[^<>]*>([\w\-]+)</td>', allowedLocales, re.S): allowed.add(match.group(1)) if not allowed.issuperset(locales): print 'Warning, following locales aren\'t allowed by server: ' + ', '.join(locales - allowed) locales = list(locales & allowed) locales.sort() params = urllib.urlencode([('languages[]', locale) for locale in locales]) result = urllib2.urlopen('http://api.crowdin.net/api/project/%s/edit-project?key=%s' % (projectName, key), params).read() if result.find('<success') < 0: raise Exception('Server indicated that the operation was not successful\n' + result)
def list_archive_page(self, show_page, showon=False, showoff=False): showonlist = [] if showon: page = util.substr(show_page, VYSIELANE_START, NEVYSIELANE_START) for m in re.finditer(VYSIELANE_ITER_RE, page, re.DOTALL | re.IGNORECASE): item = self.dir_item() item['title'] = m.group('title') item['plot'] = m.group('desc') item['url'] = m.group('url') + "#season_episode" if m.group('itime') is not None: item['type'] = "showon7d" else: item['type'] = "showon" showonlist.append(item) showonlist.sort(key=lambda x: x['title'].lower()) showofflist = [] if showoff: page = util.substr(show_page, NEVYSIELANE_START, NEVYSIELANE_END) for m in re.finditer(NEVYSIELANE_ITER_RE, page, re.DOTALL | re.IGNORECASE): item = self.dir_item() item['title'] = m.group('title') item['url'] = m.group('url') + "#season_episode" item['type'] = "showoff" showofflist.append(item) showofflist.sort(key=lambda x: x['title'].lower()) result = showonlist + showofflist return result
def _generate_entry_probe(self):
    # Any $entry(name) expressions result in saving that argument
    # when entering the function.
    self.args_to_probe = set()
    regex = r"\$entry\((\w+)\)"
    for expr in self.exprs:
        for arg in re.finditer(regex, expr):
            self.args_to_probe.add(arg.group(1))
    for arg in re.finditer(regex, self.filter):
        self.args_to_probe.add(arg.group(1))
    if any(map(lambda expr: "$latency" in expr, self.exprs)) or \
            "$latency" in self.filter:
        self.args_to_probe.add("__latency")
        self.param_types["__latency"] = "u64"  # nanoseconds
    for pname in self.args_to_probe:
        if pname not in self.param_types:
            raise ValueError("$entry(%s): no such param" % pname)

    self.hashname_prefix = "%s_param_" % self.probe_hash_name
    text = ""
    for pname in self.args_to_probe:
        # Each argument is stored in a separate hash that is
        # keyed by pid.
        text += "BPF_HASH(%s, u32, %s);\n" % \
            (self.hashname_prefix + pname, self.param_types[pname])
    text += self._generate_entry()
    return text
def ExtendCurlys(self, list_of_terms, target_body): """ Run FindWordsInBracketsAndCurlies first. Adds brackets to the same words if they have not yet received brackets. """ self.target_body = ' ' + target_body + ' ' self.dbrackets = [m.span(0) for m in re.finditer(r"\[([\w \(\)\-,.]+)\]", self.target_body)] self.sbrackets = [m.span(0) for m in re.finditer(r"\{([\w \(\)\-,.]+)\}", self.target_body)] self.allbrackets = self.dbrackets + self.sbrackets def repl(matchobj): for span in self.allbrackets: if matchobj.start(0) in range(*span): return matchobj.group(0) self.curly_count += 1 return (matchobj.group(1) + self.curly_term + matchobj.group(2)) self.curly_count = 0 for i, term in enumerate(list_of_terms): self.curly_term = '{' + term + '}' regex = re.compile(r"([^\{\w])%s([^\}\w])" %term, re.IGNORECASE) if i ==0: self.ecoutput = re.sub(regex, repl, self.target_body) else: self.ecoutput = re.sub(regex, repl, self.ecoutput) self.ecoutput = self.ecoutput[1:-1]
def _boundary_of_alternatives_indices(pattern):
    """
    Determines the location of a set of alternatives in a glob pattern.
    Alternatives are defined by a matching set of non-bracketed parentheses.

    :param pattern: Glob pattern with wildcards.
    :return:        Indices of the innermost set of matching non-bracketed
                    parentheses in a tuple. The index of a missing parenthesis
                    will be passed as None.
    """
    # Taking the leftmost closing parenthesis and the rightmost opening
    # parenthesis left of it ensures that the parentheses belong together and
    # the pattern is parsed correctly from the most nested section outwards.
    end_pos = None
    for match in re.finditer('\\)', pattern):
        if not _position_is_bracketed(pattern, match.start()):
            end_pos = match.start()
            break  # Break to get leftmost.

    start_pos = None
    for match in re.finditer('\\(', pattern[:end_pos]):
        if not _position_is_bracketed(pattern, match.start()):
            start_pos = match.end()  # No break to get rightmost.

    return start_pos, end_pos
#     print(i.start())
# print()
# ....................................................................................................................
# x = 'a{2,3}'   # match runs of at least 2 and at most 3 'a' characters
# r = 'aa aabca aaaa cgaa'
# match = re.finditer(x, r)
# for i in match:
#     print(i.start())
#     print(i.group())
# print()
# ....................................................................................................................
# x = '^a'   # check whether the string starts with 'a'
# r = 'aa abc cga baac'
# match = re.finditer(x, r)
# for i in match:
#     print("yes starting with a")
#     print(i.start())
#     print(i.group())
# ..................................................................................................................
x = 'a$'   # ending with 'a'
r = 'aa abc cga baaa'
match = re.finditer(x, r)
for i in match:
    print("yes ending with a")
    print(i.start())
    print(i.group())
def spliterator(text): return (x.group(0) for x in re.finditer(r"[A-Za-z,-]+", text))
def getSimilarWordIndex(review, contextWord): return [m.start() for m in re.finditer(contextWord, review)] #find all index for matching word
def parse(cls, raw):
    return [cls(raw, m.span(), raw[slice(*m.span(1))])
            for m in re.finditer(r'`(.+?)`', raw)]
import re

print("Hello, World!")

fileName = "#{prj_code}_#{app_code}_#{brc_code}_docs_V#{rls_ver}.tar.gz"
reExpr = r"#\{\w*\}"
reIt = re.finditer(reExpr, fileName)

prjPara = {}
prjPara["prj_code"] = "jtwlwV8"
prjPara["sys_code"] = "billing"
prjPara["app_code"] = "billing"
prjPara["brc_code"] = "testage_jtwlwV8"

listSeg = []
lastPos = 0
for it in reIt:
    listSeg.append(fileName[lastPos:it.start()])
    print(" listSeg is:%s" % listSeg)
    varName = it.group()[2:-1]
    print("varName is :%s" % varName)
    if varName in prjPara:
        listSeg.append(prjPara[varName])
    else:
        listSeg.append("#{%s}" % varName)
        print("varName : %s not found!!" % varName)
    print("lastPos : %s" % it.end())
    lastPos = it.end()

if lastPos < len(fileName) - 1:
    listSeg.append(fileName[lastPos:])
import re

s = '[email protected],[email protected]'
pattern = r'\S+@\S+\.(com|cn)'
print(re.findall(pattern, s))

# regex = re.compile(pattern)
# l = regex.search(s)

l = re.finditer(pattern, s)
for i in l:
    print(i.group())

# l = re.match(pattern, s)
# print(l)
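# Two quirks of the snippet above are worth spelling out (the addresses below
# are made-up placeholders, since the originals are redacted). First, with a
# capturing group, re.findall returns only the group, not the whole match;
# finditer still gives the full match via .group(). Second, \S+ happily spans
# the comma separator, so comma-separated addresses can collapse into one
# match; excluding the comma from the character class avoids that.
s2 = 'alice@example.com bob@example.cn'          # space-separated placeholders
print(re.findall(r'\S+@\S+\.(com|cn)', s2))      # ['com', 'cn'] - only the group
print([m.group() for m in re.finditer(r'\S+@\S+\.(com|cn)', s2)])
# ['alice@example.com', 'bob@example.cn']
print([m.group() for m in re.finditer(r'[^\s,]+@[^\s,]+\.(?:com|cn)',
                                      'alice@example.com,bob@example.cn')])
# ['alice@example.com', 'bob@example.cn'] - comma no longer swallowed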
def _extract_urls(webpage):
    return [
        mobj.group('url')
        for mobj in re.finditer(
            r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1',
            webpage)
    ]
def read_config(config_file): global config if not os.path.isfile(config_file): exit("[!] missing configuration file '%s'" % config_file) else: print "[i] using configuration file '%s'" % config_file config.clear() try: array = None content = open(config_file, "rb").read() for line in content.split("\n"): line = line.strip('\r') line = re.sub(r"\s*#.*", "", line) if not line.strip(): continue if line.count(' ') == 0: if re.search(r"[^\w]", line): if array == "USERS": exit("[!] invalid USERS entry '%s'\n[?] (hint: add whitespace at start of line)" % line) else: exit("[!] invalid configuration (line: '%s')" % line) array = line.upper() config[array] = [] continue if array and line.startswith(' '): config[array].append(line.strip()) continue else: array = None try: name, value = line.strip().split(' ', 1) except ValueError: name = line value = "" finally: name = name.strip().upper() value = value.strip("'\"").strip() _ = os.environ.get("%s_%s" % (NAME.upper(), name)) if _: value = _ if any(name.startswith(_) for _ in ("USE_", "SET_", "CHECK_", "ENABLE_", "SHOW_", "DISABLE_")): value = value.lower() in ("1", "true") elif value.isdigit(): value = int(value) else: for match in re.finditer(r"\$([A-Z0-9_]+)", value): if match.group(1) in globals(): value = value.replace(match.group(0), str(globals()[match.group(1)])) else: value = value.replace(match.group(0), os.environ.get(match.group(1), match.group(0))) if name.endswith("_DIR"): value = os.path.realpath(os.path.join(ROOT_DIR, os.path.expanduser(value))) config[name] = value except (IOError, OSError): pass for option in ("MONITOR_INTERFACE", "CAPTURE_BUFFER", "LOG_DIR"): if not option in config: exit("[!] missing mandatory option '%s' in configuration file '%s'" % (option, config_file)) for entry in (config.USERS or []): if len(entry.split(':')) != 4: exit("[!] invalid USERS entry '%s'" % entry) if re.search(r"\$\d+\$", entry): exit("[!] invalid USERS entry '%s'\n[?] (hint: please update PBKDF2 hashes to SHA256 in your configuration file)" % entry) if config.SSL_PEM: config.SSL_PEM = config.SSL_PEM.replace('/', os.sep) if config.USER_WHITELIST: if ',' in config.USER_WHITELIST: print("[x] configuration value 'USER_WHITELIST' has been changed. Please use it to set location of whitelist file") elif not os.path.isfile(config.USER_WHITELIST): exit("[!] missing 'USER_WHITELIST' file '%s'" % config.USER_WHITELIST) else: read_whitelist() if config.USER_IGNORELIST: if not os.path.isfile(config.USER_IGNORELIST): exit("[!] missing 'USER_IGNORELIST' file '%s'" % config.USER_IGNORELIST) else: read_ignorelist() config.PROCESS_COUNT = int(config.PROCESS_COUNT or CPU_CORES) if config.USE_MULTIPROCESSING: print("[x] configuration switch 'USE_MULTIPROCESSING' is deprecated. Please use 'PROCESS_COUNT' instead") if config.DISABLE_LOCAL_LOG_STORAGE and not any((config.LOG_SERVER, config.SYSLOG_SERVER)): print("[x] configuration switch 'DISABLE_LOCAL_LOG_STORAGE' turned on and neither option 'LOG_SERVER' nor 'SYSLOG_SERVER' are set. Falling back to console output of event data") if config.UDP_ADDRESS is not None and config.UDP_PORT is None: exit("[!] usage of configuration value 'UDP_ADDRESS' requires also usage of 'UDP_PORT'") if config.UDP_ADDRESS is None and config.UDP_PORT is not None: exit("[!] usage of configuration value 'UDP_PORT' requires also usage of 'UDP_ADDRESS'") if not str(config.HTTP_PORT or "").isdigit(): exit("[!] 
invalid configuration value for 'HTTP_PORT' ('%s')" % config.HTTP_PORT) if config.PROCESS_COUNT and subprocess.mswindows: print "[x] multiprocessing is currently not supported on Windows OS" config.PROCESS_COUNT = 1 if config.CAPTURE_BUFFER: if str(config.CAPTURE_BUFFER or "").isdigit(): config.CAPTURE_BUFFER = int(config.CAPTURE_BUFFER) elif re.search(r"\d+\s*[kKmMgG]B", config.CAPTURE_BUFFER): match = re.search(r"(\d+)\s*([kKmMgG])B", config.CAPTURE_BUFFER) config.CAPTURE_BUFFER = int(match.group(1)) * {"K": 1024, "M": 1024 ** 2, "G": 1024 ** 3}[match.group(2).upper()] elif re.search(r"\d+%", config.CAPTURE_BUFFER): physmem = _get_total_physmem() if physmem: config.CAPTURE_BUFFER = physmem * int(re.search(r"(\d+)%", config.CAPTURE_BUFFER).group(1)) / 100 else: exit("[!] unable to determine total physical memory. Please use absolute value for 'CAPTURE_BUFFER'") else: exit("[!] invalid configuration value for 'CAPTURE_BUFFER' ('%s')" % config.CAPTURE_BUFFER) config.CAPTURE_BUFFER = config.CAPTURE_BUFFER / BLOCK_LENGTH * BLOCK_LENGTH if config.PROXY_ADDRESS: PROXIES.update({"http": config.PROXY_ADDRESS, "https": config.PROXY_ADDRESS}) opener = urllib2.build_opener(urllib2.ProxyHandler(PROXIES)) urllib2.install_opener(opener) if not config.TRAILS_FILE: config.TRAILS_FILE = DEFAULT_TRAILS_FILE else: config.TRAILS_FILE = os.path.abspath(os.path.expanduser(config.TRAILS_FILE))
import json
import re
import ast
import numpy as np
import matplotlib.pyplot as plt
import operator
import collections

json_contents = open('output/msd_fit_categories0.2.txt', 'r').read()[1:-1]
json_contents_split = [int(a) for a in json_contents.split()]
cluster_sizes = sorted(collections.Counter(json_contents_split).items())

cluster_years = collections.defaultdict(list)
json_contents = open('output/song_groupings0.2.txt', 'r').read()
for g in re.finditer('(\d{1,2}): \[.*?(\)\])', json_contents):
    cluster_num = g.group(1)
    for year in re.finditer(', (\d{4})\),', g.group(0)):
        cluster_years[cluster_num].append(int(year.group(1)))

all_song_dists = {}
all_song_nums = {}
for key in cluster_years.keys():
    song_dist = cluster_years[key]
    all_songs_dists_raw = sorted(collections.Counter(song_dist).items())
    total_songs_num = sum([tup[1] for tup in all_songs_dists_raw])
    all_song_nums[key] = total_songs_num
    all_song_dists[key] = [(tup[0], float(tup[1]) / total_songs_num) for tup in all_songs_dists_raw]

plt.switch_backend('agg')
for idx, key in enumerate(cluster_years):
del mo1, mo2

mo1 = re.search("Hello", oneline_string)
mo2 = re.match("Hello", oneline_string)
print(mo1)
print(mo2)
del mo1, mo2

mo1 = re.search("Vien", oneline_string)
mo2 = re.search("Vien", multiline_string, re.M)
mo3 = re.findall("Vien", multiline_string)
print(mo1)
print(mo2)
print(mo3)

for mo4 in re.finditer("Vien", multiline_string):
    print(mo4)
    print(mo4.group())
    print(mo4.span())
del mo1, mo2, mo3, mo4

print("Hello Vien\nBye Vien")
print(r"Hello Vien\nBye Vien")  # a raw string doesn't process the backslash as an escape character

print(re.sub("Vien", "Van", multiline_string))
print(re.split("Vien", multiline_string))

mo = re.match("Hello (?P<name>\w+)\n.*\n.*(?P=name)", multiline_string)
print(mo)
print(mo.groups())
# find a pattern without compilation
string1 = 'to Alice and Bob from'
print(re.search('to .* from', string1))  # match

```
>>> re.findall("(\d+)", "07 23 32 32")
['07', '23', '32', '32']
```

```
>>> re.search("[123]", "199")
<re.Match object; span=(0, 1), match='1'>
```

No output (the result is `None`) if there is no match:

```
>>> re.search("[123]", "999")
```

```
>>> [m.start(0) for m in re.finditer("a", "abcabca")]
[0, 3, 6]
```

```
>>> re.sub("\s+", " ", "Good  morning")
'Good morning'
```
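Named groups work with all of these functions as well; `finditer` plus `groupdict()` is a common way to pull structured fields out of text. A small illustration (the sample strings are made up):

```
>>> m = re.search(r"(?P<user>\w+)@(?P<host>[\w.]+)", "contact: alice@example.org")
>>> m.groupdict()
{'user': 'alice', 'host': 'example.org'}
>>> [m.groupdict() for m in re.finditer(r"(?P<user>\w+)@(?P<host>[\w.]+)",
...                                     "alice@example.org bob@example.net")]
[{'user': 'alice', 'host': 'example.org'}, {'user': 'bob', 'host': 'example.net'}]
```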
def prettyName(class_name):
    return ' '.join(
        [x.group() for x in re.finditer('([A-Z])([a-z0-9]+)', class_name)])
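# A quick sanity check of the helper above, with hypothetical class names (not
# from the original code base): each capital letter followed by a run of
# lowercase letters or digits becomes a word; bare acronym letters have no such
# run, so they are silently dropped.
print(prettyName('MyPrettyClass'))  # -> 'My Pretty Class'
print(prettyName('HTTPServer2'))    # -> 'Server2' (the 'HTTP' letters are dropped)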
def check_trailing_whitespace():
    stderr("checking trailing whitespace...")
    for mo in re.finditer(r'(?m)[ \t]+$', spec.text):
        posn = mo.start()
        msg_at_posn(posn, "trailing whitespace")
def do_get_colors(self): # Looks for fish_color_*. # Returns an array of lists [color_name, color_description, color_value] result = [] # Make sure we return at least these remaining = set( [ "normal", "error", "command", "end", "param", "comment", "match", "selection", "search_match", "operator", "escape", "quote", "redirection", "valid_path", "autosuggestion", "user", "host", "cancel", ] ) # Here are our color descriptions descriptions = { "normal": "Default text", "command": "Ordinary commands", "quote": "Text within quotes", "redirection": "Like | and >", "end": "Like ; and &", "error": "Potential errors", "param": "Command parameters", "comment": "Comments start with #", "match": "Matching parenthesis", "selection": "Selected text", "search_match": "History searching", "history_current": "Directory history", "operator": "Like * and ~", "escape": "Escapes like \\n", "cwd": "Current directory", "cwd_root": "cwd for root user", "valid_path": "Valid paths", "autosuggestion": "Suggested completion", "user": "******", "host": "Hostname in the prompt", "cancel": "The ^C cancel indicator", } out, err = run_fish_cmd("set -L") for line in out.split("\n"): for match in re.finditer(r"^fish_color_(\S+) ?(.*)", line): color_name, color_value = [x.strip() for x in match.group(1, 2)] color_desc = descriptions.get(color_name, "") data = {"name": color_name, "description": color_desc} data.update(parse_color(color_value)) result.append(data) remaining.discard(color_name) # Sort our result (by their keys) result.sort(key=operator.itemgetter("name")) # Ensure that we have all the color names we know about, so that if the # user deletes one he can still set it again via the web interface for color_name in remaining: color_desc = descriptions.get(color_name, "") result.append([color_name, color_desc, parse_color("")]) return result
def test_objecttypes(self): # check all types defined in Objects/ calcsize = struct.calcsize size = test.support.calcobjsize vsize = test.support.calcvobjsize check = self.check_sizeof # bool check(True, vsize('') + self.longdigit) # buffer # XXX # builtin_function_or_method check(len, size('4P')) # XXX check layout # bytearray samples = [b'', b'u'*100000] for sample in samples: x = bytearray(sample) check(x, vsize('n2Pi') + x.__alloc__()) # bytearray_iterator check(iter(bytearray()), size('nP')) # bytes check(b'', vsize('n') + 1) check(b'x' * 10, vsize('n') + 11) # cell def get_cell(): x = 42 def inner(): return x return inner check(get_cell().__closure__[0], size('P')) # code def check_code_size(a, expected_size): self.assertGreaterEqual(sys.getsizeof(a), expected_size) check_code_size(get_cell().__code__, size('6i13P')) check_code_size(get_cell.__code__, size('6i13P')) def get_cell2(x): def inner(): return x return inner check_code_size(get_cell2.__code__, size('6i13P') + calcsize('n')) # complex check(complex(0,1), size('2d')) # method_descriptor (descriptor object) check(str.lower, size('3PP')) # classmethod_descriptor (descriptor object) # XXX # member_descriptor (descriptor object) import datetime check(datetime.timedelta.days, size('3PP')) # getset_descriptor (descriptor object) import collections check(collections.defaultdict.default_factory, size('3PP')) # wrapper_descriptor (descriptor object) check(int.__add__, size('3P2P')) # method-wrapper (descriptor object) check({}.__iter__, size('2P')) # dict check({}, size('nQ2P') + calcsize('2nP2n') + 8 + (8*2//3)*calcsize('n2P')) longdict = {1:1, 2:2, 3:3, 4:4, 5:5, 6:6, 7:7, 8:8} check(longdict, size('nQ2P') + calcsize('2nP2n') + 16 + (16*2//3)*calcsize('n2P')) # dictionary-keyview check({}.keys(), size('P')) # dictionary-valueview check({}.values(), size('P')) # dictionary-itemview check({}.items(), size('P')) # dictionary iterator check(iter({}), size('P2nPn')) # dictionary-keyiterator check(iter({}.keys()), size('P2nPn')) # dictionary-valueiterator check(iter({}.values()), size('P2nPn')) # dictionary-itemiterator check(iter({}.items()), size('P2nPn')) # dictproxy class C(object): pass check(C.__dict__, size('P')) # BaseException check(BaseException(), size('5Pb')) # UnicodeEncodeError check(UnicodeEncodeError("", "", 0, 0, ""), size('5Pb 2P2nP')) # UnicodeDecodeError check(UnicodeDecodeError("", b"", 0, 0, ""), size('5Pb 2P2nP')) # UnicodeTranslateError check(UnicodeTranslateError("", 0, 1, ""), size('5Pb 2P2nP')) # ellipses check(Ellipsis, size('')) # EncodingMap import codecs, encodings.iso8859_3 x = codecs.charmap_build(encodings.iso8859_3.decoding_table) check(x, size('32B2iB')) # enumerate check(enumerate([]), size('n3P')) # reverse check(reversed(''), size('nP')) # float check(float(0), size('d')) # sys.floatinfo check(sys.float_info, vsize('') + self.P * len(sys.float_info)) # frame import inspect CO_MAXBLOCKS = 20 x = inspect.currentframe() ncells = len(x.f_code.co_cellvars) nfrees = len(x.f_code.co_freevars) extras = x.f_code.co_stacksize + x.f_code.co_nlocals +\ ncells + nfrees - 1 check(x, vsize('12P3ic' + CO_MAXBLOCKS*'3i' + 'P' + extras*'P')) # function def func(): pass check(func, size('12P')) class c(): @staticmethod def foo(): pass @classmethod def bar(cls): pass # staticmethod check(foo, size('PP')) # classmethod check(bar, size('PP')) # generator def get_gen(): yield 1 check(get_gen(), size('Pb2PPP')) # iterator check(iter('abc'), size('lP')) # callable-iterator import re check(re.finditer('',''), size('2P')) # 
list samples = [[], [1,2,3], ['1', '2', '3']] for sample in samples: check(sample, vsize('Pn') + len(sample)*self.P) # sortwrapper (list) # XXX # cmpwrapper (list) # XXX # listiterator (list) check(iter([]), size('lP')) # listreverseiterator (list) check(reversed([]), size('nP')) # int check(0, vsize('')) check(1, vsize('') + self.longdigit) check(-1, vsize('') + self.longdigit) PyLong_BASE = 2**sys.int_info.bits_per_digit check(int(PyLong_BASE), vsize('') + 2*self.longdigit) check(int(PyLong_BASE**2-1), vsize('') + 2*self.longdigit) check(int(PyLong_BASE**2), vsize('') + 3*self.longdigit) # module check(unittest, size('PnPPP')) # None check(None, size('')) # NotImplementedType check(NotImplemented, size('')) # object check(object(), size('')) # property (descriptor object) class C(object): def getx(self): return self.__x def setx(self, value): self.__x = value def delx(self): del self.__x x = property(getx, setx, delx, "") check(x, size('4Pi')) # PyCapsule # XXX # rangeiterator check(iter(range(1)), size('4l')) # reverse check(reversed(''), size('nP')) # range check(range(1), size('4P')) check(range(66000), size('4P')) # set # frozenset PySet_MINSIZE = 8 samples = [[], range(10), range(50)] s = size('3nP' + PySet_MINSIZE*'nP' + '2nP') for sample in samples: minused = len(sample) if minused == 0: tmp = 1 # the computation of minused is actually a bit more complicated # but this suffices for the sizeof test minused = minused*2 newsize = PySet_MINSIZE while newsize <= minused: newsize = newsize << 1 if newsize <= 8: check(set(sample), s) check(frozenset(sample), s) else: check(set(sample), s + newsize*calcsize('nP')) check(frozenset(sample), s + newsize*calcsize('nP')) # setiterator check(iter(set()), size('P3n')) # slice check(slice(0), size('3P')) # super check(super(int), size('3P')) # tuple check((), vsize('')) check((1,2,3), vsize('') + 3*self.P) # type # static type: PyTypeObject fmt = 'P2n15Pl4Pn9Pn11PIP' if hasattr(sys, 'getcounts'): fmt += '3n2P' s = vsize(fmt) check(int, s) s = vsize(fmt + # PyTypeObject '3P' # PyAsyncMethods '36P' # PyNumberMethods '3P' # PyMappingMethods '10P' # PySequenceMethods '2P' # PyBufferProcs '4P') # Separate block for PyDictKeysObject with 8 keys and 5 entries s += calcsize("2nP2n") + 8 + 5*calcsize("n2P") # class class newstyleclass(object): pass check(newstyleclass, s) # dict with shared keys check(newstyleclass().__dict__, size('nQ2P' + '2nP2n')) # unicode # each tuple contains a string and its expected character size # don't put any static strings here, as they may contain # wchar_t or UTF-8 representations samples = ['1'*100, '\xff'*50, '\u0100'*40, '\uffff'*100, '\U00010000'*30, '\U0010ffff'*100] asciifields = "nnbP" compactfields = asciifields + "nPn" unicodefields = compactfields + "P" for s in samples: maxchar = ord(max(s)) if maxchar < 128: L = size(asciifields) + len(s) + 1 elif maxchar < 256: L = size(compactfields) + len(s) + 1 elif maxchar < 65536: L = size(compactfields) + 2*(len(s) + 1) else: L = size(compactfields) + 4*(len(s) + 1) check(s, L) # verify that the UTF-8 size is accounted for s = chr(0x4000) # 4 bytes canonical representation check(s, size(compactfields) + 4) # compile() will trigger the generation of the UTF-8 # representation as a side effect compile(s, "<stdin>", "eval") check(s, size(compactfields) + 4 + 4) # TODO: add check that forces the presence of wchar_t representation # TODO: add check that forces layout of unicodefields # weakref import weakref check(weakref.ref(int), size('2Pn2P')) # weakproxy # XXX # 
weakcallableproxy check(weakref.proxy(int), size('2Pn2P'))
def gather_def_ids(node): if 'id' in node.attrs: defid = node.attrs['id'] # ---------- # no duplicate ids, of course if defid in node_with_id_: msg_at_node(node, f"duplicate id: '{defid}'") node_with_id_[defid] = node # ---------- # id should begin with "(sec|eqn|figure|table)-" # if and only if the node is of certain kinds. id_prefix_expectation = { 'emu-intro' : 'sec-', 'emu-clause': 'sec-', 'emu-annex' : 'sec-', 'emu-eqn' : 'eqn-', 'emu-figure': 'figure-', 'emu-table' : 'table-', }.get(node.element_name, None) if id_prefix_expectation: if not defid.startswith(id_prefix_expectation): msg_at_node(node, f'Expected the id to start with "{id_prefix_expectation}"') else: if (False or defid.startswith('sec-') or defid.startswith('eqn-') or defid.startswith('figure-') or defid.startswith('table-') ): msg_at_node(node, f'Did not expect the id to start that way') # ---------- # If an element defines an abstract operation, # its id should be ... if 'aoid' in node.attrs: # TODO: After the merge of #545, most abstract ops don't have an 'aoid' attribute; # instead it's generated at 'render' time. # (But SDOs, emu-eqns, and a few others do, so this code is still being executed, # just not as much as we want.) aoid = node.attrs['aoid'] assert node.element_name in ['emu-clause', 'emu-annex', 'emu-eqn', 'dfn'] if id_prefix_expectation is None: id_prefix_expectation = '' # for thisFooValue, was 'sec-' until PR 2103 possibles = [ id_prefix_expectation + aoid.lower().replace(' ', '-').replace('::', '-'), id_prefix_expectation + aoid, id_prefix_expectation + kebab(aoid), id_prefix_expectation + 'static-semantics-' + aoid.lower(), id_prefix_expectation + 'runtime-semantics-' + aoid.lower(), ] if defid not in possibles: msg_at_node(node, f'Expected id="{possibles[0]}"') if node.element_name == 'emu-alg': for mo in re.finditer(r' \[(\w+)="([^"]+)"\]', node.inner_source_text()): assert mo.group(1) == 'id' defid = mo.group(2) # ---------- # no duplicate ids if defid in node_with_id_: msg_at_node(node, f"duplicate id: '{defid}'") node_with_id_[defid] = node # XXX Should really be the node that will later be constructed # for the step in which this step_attribute occurs. # ---------- # id should begin with "step-" assert defid.startswith('step-') if 'oldids' in node.attrs: for oldid in node.attrs['oldids'].split(','): assert oldid not in all_oldids all_oldids.add(oldid) for child in node.children: gather_def_ids(child)
def extract( mention_id ="text", sentence_text ="text", tokens ="text[]", begin_exp ="int", end_exp ="int", begin_explain ="int", end_explain ="int", sentence_source ="text[]", position_source ="text[]" ): forbidden_word = ["nếu","phải","đó","không","được","đã","đồng_thời","cần", "chỉ",'cụ_thể'] for i in range(2): if end_exp +2 +i <= end_explain: if handle_string.toLowerCase(tokens[end_exp+2+i]) in forbidden_word: yield [ mention_id, -10, "forbidden_word_1" ] if end_exp - i >= begin_exp: if handle_string.toLowerCase(tokens[end_exp-i]) in forbidden_word: yield [ mention_id, -10, "forbidden_word_1" ] if handle_string.toLowerCase(tokens[end_exp]) in forbidden_word: yield [ mention_id, -1, "forbidden_word_2" ] if ("nếu" in tokens[begin_exp:end_exp]) or ("Nếu" in tokens[begin_exp:end_exp]): yield [ mention_id, -1, "forbidden_word_3" ] i = len(mention_id) - 1 first = False while(i>0) : if mention_id[i] == '_' and not first: first = True i -= 1 continue if mention_id[i] == '_' and first: break i -= 1 j = 0 while(j<len(mention_id)) : if mention_id[j] == '_': break j += 1 position_require = mention_id[j+1:i+1] index = 0 for index in range(0,len(position_source)): if position_require in position_source[index] : if divlaw.lenIterator(re.finditer(r"Giải_thích\stừ_ngữ",sentence_source[index],re.U|re.I)) > 0 : yield [ mention_id, 1, "in_explain_words_law" ]
site = pywikibot.Site("lv", "wikipedia")


def notify_Edgars():
    page = pywikibot.Page(site, "Dalībnieka diskusija:EdgarsBot")
    pagetext = page.get()
    pagetext += "\n\n{{ping|Edgars2007}} DYK sagatavē divi vienādi datumi --~~~~"
    page.text = pagetext
    page.save(summary="New error", botflag=True)


page = pywikibot.Page(site, 'Veidne:Vai tu zināji/Sagatave')
page_text = page.get()

all_dates = re.finditer('<!--(\d+)\. datums\n-->\|\d+=', page_text)

found_dublicate = False
all_date_matches = []
for match in all_dates:
    date = match.group(1)
    if date in all_date_matches:
        found_dublicate = True
        break
    all_date_matches.append(date)

if found_dublicate:
    notify_Edgars()
def print_HOR_read(r, show_cenpbbxx=False): coords = [0] * len(r.hors) hv_str = ["."] * len(r.hors) for n, (_idx, _size, elem) in enumerate(r.hors): idx, size = int(_idx), int(_size) if r.ori == '+': b, e = r.mons[idx].begin, r.mons[idx + size - 1].end gap = 0 if idx == 0 else r.mons[idx].begin - r.mons[ idx - 1].end # gap before me. else: b, e = r.mons[idx].end, r.mons[idx + size - 1].begin gap = 0 if idx == 0 else -(r.mons[idx].end - r.mons[idx - 1].begin) coords[n] = (b, e, gap) if elem[:5] == "M=HOR": hv_str[n] = "m" if gap < 100 else "M" if elem[:3] == "M=M": hv_str[n] = "m" if gap < 100 else "M" elif elem[:3] == "Rev": hv_str[n] = "r" if gap < 100 else "R" elif elem[:2] == "M=": hv_str[n] = "a" if gap < 100 else "A" else: hv_str[n] = "h" if gap < 100 else "H" # search variant HOR candidates elems = [e for i, s, e in r.hors] hv_new_idx = functools.reduce(lambda x, y: x | y, [ set(range(mt.start(1) + 1, mt.end(1) - 1)) for mt in re.finditer(r"(?=([hH]m+h))", "".join(hv_str)) ], set()) hv_new_hash = { mt.start(1) + 1: f"{hash(tuple(elems[ mt.start(1) + 1: mt.end(1) - 1 ])):x}"[-8:] for mt in re.finditer(r"(?=([hH]m+h))", "".join(hv_str)) } for n, (_idx, _size, elem) in enumerate(r.hors): b, e, gap = coords[n] idx, size = int(_idx), int(_size) nvars = sum([len(m.monomer.snvs) for m in r.mons[idx:idx + size]]) if n in hv_new_hash: print( f"{r.name}\t{b}\t{e}\t{idx}\t{size}\t{elem}\t" +\ f"{gap}\t{nvars}\t{100.0*nvars/abs(e-b):.2f}\t" +\ "new=" + hv_new_hash[n]) else: print( f"{r.name}\t{b}\t{e}\t{idx}\t{size}\t{elem}\t" +\ f"{gap}\t{nvars}\t{100.0*nvars/abs(e-b):.2f}\t" +\ ("new" if n in hv_new_idx else ".")) print("") # 5-mer kmer = [ tuple(elems[mt.start(1):mt.end(1)]) for mt in re.finditer(r"(?=([hH]hhhh))", "".join(hv_str)) ] cand = [ tuple(elems[mt.start(1) + 1:mt.end(1) - 1]) for mt in re.finditer(r"(?=([hH]m+h))", "".join(hv_str)) ] return kmer, cand
def interpret_expression(self, expr, local_vars, allow_recursion): expr = expr.strip() if expr == '': # Empty expression return None if expr.startswith('('): parens_count = 0 for m in re.finditer(r'[()]', expr): if m.group(0) == '(': parens_count += 1 else: parens_count -= 1 if parens_count == 0: sub_expr = expr[1:m.start()] sub_result = self.interpret_expression( sub_expr, local_vars, allow_recursion, ) remaining_expr = expr[m.end():].strip() if not remaining_expr: return sub_result else: expr = json.dumps(sub_result) + remaining_expr break else: raise ExtractorError('Premature end of parens in %r' % expr) for op, opfunc in _ASSIGN_OPERATORS: m = re.match( r'''(?x) (?P<out>%s)(?:\[(?P<index>[^\]]+?)\])? \s*%s (?P<expr>.*)$''' % (_NAME_RE, re.escape(op)), expr, ) if not m: continue right_val = self.interpret_expression( m.group('expr'), local_vars, allow_recursion - 1, ) if m.groupdict().get('index'): lvar = local_vars[m.group('out')] idx = self.interpret_expression( m.group('index'), local_vars, allow_recursion, ) assert isinstance(idx, int) cur = lvar[idx] val = opfunc(cur, right_val) lvar[idx] = val return val else: cur = local_vars.get(m.group('out')) val = opfunc(cur, right_val) local_vars[m.group('out')] = val return val if expr.isdigit(): return int(expr) var_m = re.match( r'(?!if|return|true|false)(?P<name>%s)$' % _NAME_RE, expr, ) if var_m: return local_vars[var_m.group('name')] try: return json.loads(expr) except ValueError: pass m = re.match( r'(?P<var>%s)\.(?P<member>[^(]+)' '(?:\(+(?P<args>[^()]*)\))?$' % _NAME_RE, expr, ) if m: variable = m.group('var') member = m.group('member') arg_str = m.group('args') if variable in local_vars: obj = local_vars[variable] else: if variable not in self._objects: self._objects[variable] = self.extract_object(variable) obj = self._objects[variable] if arg_str is None: # Member access if member == 'length': return len(obj) return obj[member] assert expr.endswith(')') # Function call if arg_str == '': argvals = tuple() else: argvals = tuple([ self.interpret_expression(v, local_vars, allow_recursion) for v in arg_str.split(',') ]) if member == 'split': assert argvals == ('',) return list(obj) if member == 'join': assert len(argvals) == 1 return argvals[0].join(obj) if member == 'reverse': assert len(argvals) == 0 obj.reverse() return obj if member == 'slice': assert len(argvals) == 1 return obj[argvals[0]:] if member == 'splice': assert isinstance(obj, list) index, howMany = argvals res = [] for i in range(index, min(index + howMany, len(obj))): res.append(obj.pop(index)) return res return obj[member](argvals) m = re.match( r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr, ) if m: val = local_vars[m.group('in')] idx = self.interpret_expression( m.group('idx'), local_vars, allow_recursion - 1, ) return val[idx] for op, opfunc in _OPERATORS: m = re.match(r'(?P<x>.+?)%s(?P<y>.+)' % re.escape(op), expr) if not m: continue x, abort = self.interpret_statement( m.group('x'), local_vars, allow_recursion - 1, ) if abort: raise ExtractorError( 'Premature left-side return of %s in %r' % (op, expr), ) y, abort = self.interpret_statement( m.group('y'), local_vars, allow_recursion - 1, ) if abort: raise ExtractorError( 'Premature right-side return of %s in %r' % (op, expr), ) return opfunc(x, y) m = re.match( r'^(?P<func>%s)\((?P<args>[a-zA-Z0-9_$,]+)\)$' % _NAME_RE, expr, ) if m: fname = m.group('func') argvals = tuple([ int(v) if v.isdigit() else local_vars[v] for v in m.group('args').split(',') ]) if fname not in self._functions: self._functions[fname] = 
self.extract_function(fname) return self._functions[fname](argvals) raise ExtractorError('Unsupported JS expression %r' % expr)
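# A minimal, hedged sketch of the parenthesis-balancing scan in interpret_expression
# (assumes `import re`): walk every '(' / ')' with finditer, track the nesting depth,
# and split the expression where the leading group closes.
import re
expr = '(a+b)*c'  # hypothetical sub-expression
depth = 0
for m in re.finditer(r'[()]', expr):
    depth += 1 if m.group(0) == '(' else -1
    if depth == 0:
        print(expr[1:m.start()], '|', expr[m.end():])  # a+b | *c
        break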
def find_suffix(besedilo, vzorec): return { x.group(0) for x in re.finditer(r'\b\w*?' + vzorec + r'\b', besedilo) }
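# Hedged usage sketch for find_suffix (assumes the definition above and `import re`);
# the text and suffix below are made up.
besedilo = "plesanje in petje ter branje knjig"
print(find_suffix(besedilo, "nje"))  # {'plesanje', 'branje'} (set order may vary)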
def preminify_markdown(lines, rem_log): ''' We can save a few extra KBs by doing some targeted minification on markdown.js that our minification tools would otherwise overlook ''' # start with direct replacement for the State enum lines = minify_markdown_state_enum(lines) # stateToStr takes up a lot of space when it probably doesn't have to for the minified version # Save several hundred bytes by removing it start = lines.find('const stateToStr') if start != -1: end = lines.find('}', lines.find('}', start) + 1) + 1 if end != -1: lines = lines.replace(lines[start:end], 'let stateToStr = (state) => state;') if rem_log != 0: start = lines.find('const shouldLogTmi = ') if start == -1: print('WARN: could not find "shouldLogTmi" for removal') else: end = lines.find('}\n', start) + 2 lines = lines.replace(lines[start:end], '') # Check whether we should remove TMI logging. There is a # separate flag for markdown-specific removal, since it's especially noisy if rem_log == 4: lines = re.sub(r'(\/\*@__PURE__\*\/)?\blogTmi\(.*\); *\n', '', lines) # currentRun is used quite a bit lines = re.sub(r'\bthis\.currentRun\b', 'this.' + next_var(), lines) # State of a run - Disabled because other classes use .state # lines = re.sub(r'\.state\b', '.' + next_var(), lines) # Inner runs lines = re.sub(r'\binnerRuns\b', next_var(), lines) # Run methods lines = re.sub(r'\bstartContextLength\b', next_var(), lines) lines = re.sub(r'\bendContextLength\b', next_var(), lines) lines = re.sub(r'\btransform\b', next_var(), lines) # Now look for things that are very method-like. for match in re.finditer(r'\n (_\w+)\(', lines): cur_var = next_var() if cur_var == '': return lines lines = re.sub(r'\b' + match.group(1) + r'\b', cur_var, lines) # TODO: this.text is _very_ heavily used, but multiple classes # use this, so it breaks things. # Now getting real hacky. Modify String's prototype to save a hundred bytes or so # Some should already be done if 'ultra' is set, so don't do anything in that case lines = re.sub(r'\.indexOf\(', '.i(', lines) lines = 'String.prototype.i = String.prototype.indexOf;\n' + lines lines = 'Array.prototype.i = Array.prototype.indexOf;\n' + lines lines = re.sub(r'\.substring\(', '.s(', lines) lines = 'String.prototype.s = String.prototype.substring;\n' + lines return lines
def webScraping(CSVPath): # Connect to Browser DRIVER_PATH = "C:/Users/filip/Documents/PythonFiles/chromedriver" browser = webdriver.Chrome(DRIVER_PATH) # Access website URL = "https://www.imdb.com/" browser.get(URL) # Give the browser time to load all content. # time.sleep(2) # Click on Menu button menuButton = browser.find_element_by_css_selector( '#imdbHeader-navDrawerOpen--desktop') menuButton.click() time.sleep(2) # Select Most Popular Movies from Menu items mostPopMovies = browser.find_element_by_css_selector( '#nav-link-categories-mov+ ._299G6wcz6LCpY_QFQJtc76 .ipc-list__item--indent-one:nth-child(4)' ) mostPopMovies.click() time.sleep(2) # Sort by IMDB rating descending, so I don't collect movies without IMDB ratings. sortByButton = browser.find_element_by_css_selector( "#lister-sort-by-options [value='ir:descending']") sortByButton.click() # Extract selected contents mainInfoList = browser.find_elements_by_css_selector('.titleColumn') ratingList = browser.find_elements_by_css_selector('.imdbRating') # Create DataFrame to store result df = pd.DataFrame(columns=['Title', 'Year', 'Ranking', 'Rating']) # Loop through 87 elements (excluding movies without rating) for i in range(87): # Prepare overal text result start = mainInfoList[i].get_attribute('innerHTML') # Beautiful soup allows us to remove HTML tags from our content, if it exists. soup = BeautifulSoup(start, features="lxml") # Remove leading and trailing whitespaces rawString = soup.get_text().strip() # Remove hidden characters for tabs and new lines. rawString = re.sub( r"[\n\t]*", "", rawString) # re.sub(pattern, repl, string, count=0, flags=0) # Replace(remove) two or more consecutive empty spaces with '' rawString = re.sub('[ ]{2,}', '', rawString) # Extract TITLE titleCutOff = rawString.index('(') title = rawString[:titleCutOff] # title.strip() # Extract YEAR yearCutOff = rawString.index(')') # RANKING after this point year = rawString[titleCutOff + 1:yearCutOff] # Extract RANKING # Here I find the second (, which determines the end index for Ranking rankingCutOff = [m.start() for m in re.finditer("\(", rawString)][1] ranking = rawString[yearCutOff + 1:rankingCutOff] # Extract RATING start = ratingList[i].get_attribute('innerHTML') # Beautiful soup allows us to remove HTML tags from our content if it exists. soup = BeautifulSoup(start, features="lxml") rating = soup.get_text().strip( ) # Leading and trailing whitespaces are removed # Adding info on a Data Frame moviesInfo = { 'Title': title, 'Year': year, 'Ranking': ranking, 'Rating': rating } df = df.append(moviesInfo, ignore_index=True) # Show all columns. pd.set_option('display.max_columns', None) # Show all rows. pd.set_option('display.max_rows', None) # Increase number of columns that display on one line. pd.set_option('display.width', 1000) print("\nTop 87 Movies listed in descending order of IMDB Rating") print(df) # Saving results in a CSV file df.to_csv(CSVPath)
def double_letters(besedilo): return {x.group(0) for x in re.finditer(r'\w*(\w)\1\w+', besedilo)}  # capture a word character so whitespace or punctuation cannot pair as the "double letter"
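# Hedged usage sketch for double_letters (assumes the definition above and `import re`);
# note the pattern requires at least one more word character after the doubled pair.
besedilo = "football is a good hobby"
print(double_letters(besedilo))  # {'football', 'good', 'hobby'} (set order may vary)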
def _generateTriggerFromPattern(pattern, template, template_numeric_dict): crit_value_list = sorted(template_numeric_dict[template]) prefix_crit = crit_value_list[:-1] suffix_crit = crit_value_list[1:] mid_crit = [(v1 + v2) / 2 for v1, v2 in zip(suffix_crit, prefix_crit)] enhanced_crit_value_list = sorted( mid_crit + [crit_value_list[0] - 1, crit_value_list[-1] + 1] + crit_value_list) enhanced_crit_value_str_list = [ str(v).replace('.', '_') for v in enhanced_crit_value_list ] hit_pattern_list = [m.group(0) for m in re.finditer(r'(\d)\1*', pattern)] cover_list = list() start_index = 0 for hit_pattern in hit_pattern_list: if '1' in hit_pattern: cover_list.append((start_index, start_index + len(hit_pattern))) start_index = start_index + len(hit_pattern) statement_list = list() for cover in cover_list: if cover[0] != 0 and cover[1] != len(pattern): # raise Exception('this pattern %s is not supported' % pattern) # ugly fix: if cover[1] % 2 != 0: # non-critical value, "<" upper_str = '<' + enhanced_crit_value_str_list[ cover[1]].replace('_', '.') else: # critical value, "<=" upper_str = '<=' + enhanced_crit_value_str_list[cover[1] - 1].replace( '_', '.') if cover[0] % 2 == 0: # non-critical value, ">" lower_str = enhanced_crit_value_str_list[cover[0] - 1].replace( '_', '.') + '<' else: # critical value, ">=" lower_str = enhanced_crit_value_str_list[cover[0]].replace( '_', '.') + '<=' statement_list.append('%s%s%s' % (lower_str, template, upper_str)) elif cover[0] == 0 and cover[1] != len(pattern): if cover[1] % 2 != 0: # non-critical value, "<" statement_list.append( '%s<%s' % (template, enhanced_crit_value_str_list[cover[1]].replace( '_', '.'))) else: # critical value, "<=" statement_list.append( '%s<=%s' % (template, enhanced_crit_value_str_list[cover[1] - 1].replace( '_', '.'))) # statement_list.append('%s=%s' % (template, enhanced_crit_value_str_list[cover[1]-1].replace('_', '.'))) elif cover[0] != 0 and cover[1] == len(pattern): if cover[0] % 2 == 0: # non-critical value, ">" statement_list.append( '%s>%s' % (template, enhanced_crit_value_str_list[cover[0] - 1].replace( '_', '.'))) else: # critical value, ">=" statement_list.append( '%s>=%s' % (template, enhanced_crit_value_str_list[cover[0]].replace( '_', '.'))) # statement_list.append('%s=%s' % (template, enhanced_crit_value_str_list[cover[0]].replace('_', '.'))) elif cover[0] == 0 and cover[1] == len(pattern): statement_list.append('%s changed' % template) return statement_list
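# Hedged usage sketch for _generateTriggerFromPattern (assumes the definition above).
# The pattern string has one digit per "enhanced" critical value (below-min, the values
# themselves, their midpoints, above-max); '1' marks the hit region. Inputs are made up.
example_dict = {'cpu_util': [1.0, 2.0, 3.0]}
print(_generateTriggerFromPattern('0011100', 'cpu_util', example_dict))
# expected to print something like ['1.0<cpu_util<3.0']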
def __init__(self, source_root, url_base, doc_root, list_outputs_only): self.source_root = source_root self.url_base = url_base self.doc_root = doc_root self.list_outputs_only = list_outputs_only self.source_url_base = self.url_base + 'SourceControl/latest#' self.wiki_url_base = self.url_base + 'wikipage?title=' self.issue_url_base = self.url_base + 'workitem/' try: with open('maps.cache', 'rb') as f: self.file_map, self.type_map = pickle.load(f) return except: pass if self.list_outputs_only: self.file_map = None self.type_map = None return print('Creating file maps') file_map = {} type_map = {} for dirname, dirnames, filenames in os.walk(source_root): for filename in filenames: fullpath = os.path.join(source_root, dirname, filename) urlpath = fullpath[len(source_root):].lstrip('\\').replace('\\', '/') nameonly = os.path.split(fullpath)[1] if nameonly in file_map: file_map[nameonly] = None else: file_map[nameonly] = urlpath if not filename.upper().endswith(('.PY', '.CS')): continue try: with open(fullpath, 'r', encoding='utf-8-sig') as f: content = f.read() except UnicodeDecodeError: #print('Cannot read {}'.format(filename)) continue nsname = None if filename.upper().endswith('.PY'): nsname = os.path.splitext(filename)[0] for match in re.finditer(r'(namespace|class|struct|enum|interface) ([\w\.]+)', content): kind, name = match.groups() if kind == 'namespace': nsname = name elif nsname: type_map[nsname + '.' + name] = urlpath try: with open('maps.cache', 'wb') as f: pickle.dump((file_map, type_map), f, pickle.HIGHEST_PROTOCOL) except: pass self.file_map = file_map self.type_map = type_map
import os
import glob
import re
from shutil import copyfile, move

# indir = os.path.join('datasets', 'nude')
indir = os.path.join('datasets', 'full')

for root, dirs, files in os.walk(indir):
    for dir in dirs:
        dir = os.path.join(root, dir)
        for fname in glob.glob(dir + '/*.jpg'):
            # source file
            src = fname
            # destination file
            slash_list = [m.start() for m in re.finditer(r"/", fname)]
            slash_3rd_pos = slash_list[2]
            dst = fname[:slash_3rd_pos] + fname[fname.rfind('/'):]
            # move
            move(src, dst)
        if len(os.listdir(dir)) == 0:
            os.rmdir(dir)
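# A hedged sketch of the destination-path arithmetic above with a hypothetical
# POSIX-style path (assumes `import re`): everything between the third slash and the
# file name is dropped, lifting the file out of its innermost folder.
import re
fname = 'datasets/full/personA/session1/img001.jpg'  # made-up example path
slash_list = [m.start() for m in re.finditer(r"/", fname)]
dst = fname[:slash_list[2]] + fname[fname.rfind('/'):]
print(dst)  # datasets/full/personA/img001.jpg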
# gene_names is assumed to be populated earlier in the exercise script
# Starts with x or y and ends with e
for gene in gene_names:
    if re.search(r'^(x|y)', gene) and re.search(r'e$', gene):
        print(gene)

print('\nContains three or more numbers in a row:')
for gene in gene_names:
    if re.search(r'[0-9]{3,100}', gene):  # no space in {3,100}
        print(gene)

print('\nEnds with d followed by a, r or p:')
for gene in gene_names:
    if re.search(r'd[arp]$', gene):  # character class; '|' inside [] would match a literal pipe
        print(gene)

# Double digest
dna = open('dna.txt').read().rstrip('\n')
cuts = [0]
for match in re.finditer(r"A[ATGC]TAAT", dna):
    cuts.append(match.start() + 3)
for match in re.finditer(r"GC[AG][AT]TG", dna):
    cuts.append(match.start() + 4)
cuts.append(len(dna))
cuts.sort()
print('\nFragment lengths:\n')
for i in range(1, len(cuts)):
    print(cuts[i] - cuts[i - 1])
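# A hedged mini-example of the cut-position arithmetic above, with a made-up 10 bp
# sequence: the first enzyme site A[ATGC]TAAT is cut three bases into the match,
# hence match.start() + 3 (assumes `import re`).
tiny = "TTAGTAATCC"
m = next(re.finditer(r"A[ATGC]TAAT", tiny))
cut = m.start() + 3
print(cut, tiny[:cut], tiny[cut:])  # 5 TTAGT AATCC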
def _events(self, params): session = self.get_session() if session is None: self.send_response(_http_client.UNAUTHORIZED) self.send_header(HTTP_HEADER.CONNECTION, "close") return None start, end, size, total = None, None, -1, None content = None log_exists = False dates = params.get("date", "") if ".." in dates: pass elif '_' not in dates: try: date = datetime.datetime.strptime(dates, "%Y-%m-%d").strftime("%Y-%m-%d") event_log_path = os.path.join(config.LOG_DIR, "%s.log" % date) if os.path.exists(event_log_path): range_handle = open(event_log_path, "rb") log_exists = True except ValueError: print("[!] invalid date format in request") log_exists = False else: logs_data = "" date_interval = dates.split("_", 1) try: start_date = datetime.datetime.strptime(date_interval[0], "%Y-%m-%d").date() end_date = datetime.datetime.strptime(date_interval[1], "%Y-%m-%d").date() for i in xrange(int((end_date - start_date).days) + 1): date = start_date + datetime.timedelta(i) event_log_path = os.path.join(config.LOG_DIR, "%s.log" % date.strftime("%Y-%m-%d")) if os.path.exists(event_log_path): log_handle = open(event_log_path, "rb") logs_data += log_handle.read() log_handle.close() range_handle = io.BytesIO(logs_data) log_exists = True except ValueError: print("[!] invalid date format in request") log_exists = False if log_exists: range_handle.seek(0, 2) total = range_handle.tell() range_handle.seek(0) if self.headers.get(HTTP_HEADER.RANGE): match = re.search(r"bytes=(\d+)-(\d+)", self.headers[HTTP_HEADER.RANGE]) if match: start, end = int(match.group(1)), int(match.group(2)) max_size = end - start + 1 end = min(total - 1, end) size = end - start + 1 if start == 0 or not session.range_handle: session.range_handle = range_handle if session.netfilters is None and not session.mask_custom: session.range_handle.seek(start) self.send_response(_http_client.PARTIAL_CONTENT) self.send_header(HTTP_HEADER.CONNECTION, "close") self.send_header(HTTP_HEADER.CONTENT_TYPE, "text/plain") self.send_header(HTTP_HEADER.CONTENT_RANGE, "bytes %d-%d/%d" % (start, end, total)) content = session.range_handle.read(size) else: self.send_response(_http_client.OK) self.send_header(HTTP_HEADER.CONNECTION, "close") self.send_header(HTTP_HEADER.CONTENT_TYPE, "text/plain") buffer, addresses, netmasks, regex = io.StringIO(), set(), [], "" for netfilter in session.netfilters or []: if not netfilter: continue if '/' in netfilter: netmasks.append(netfilter) elif re.search(r"\A[\d.]+\Z", netfilter): addresses.add(netfilter) elif "\\." in netfilter: regex = r"\b(%s)\b" % netfilter else: print("[!] invalid network filter '%s'" % netfilter) return for line in session.range_handle: display = session.netfilters is None ip = None line = line.decode(UNICODE_ENCODING, "ignore") if regex: match = re.search(regex, line) if match: ip = match.group(1) display = True if not display and (addresses or netmasks): for match in re.finditer(r"\b(\d+\.\d+\.\d+\.\d+)\b", line): if not display: ip = match.group(1) else: break if ip in addresses: display = True break elif netmasks: for _ in netmasks: prefix, mask = _.split('/') if addr_to_int(ip) & make_mask(int(mask)) == addr_to_int(prefix): addresses.add(ip) display = True break if session.mask_custom and "(custom)" in line: line = re.sub(r'("[^"]+"|[^ ]+) \(custom\)', "- (custom)", line) if display: if ",%s" % ip in line or "%s," % ip in line: line = re.sub(r" ([\d.,]+,)?%s(,[\d.,]+)? 
" % re.escape(ip), " %s " % ip, line) buffer.write(line) if buffer.tell() >= max_size: break content = buffer.getvalue() end = start + len(content) - 1 self.send_header(HTTP_HEADER.CONTENT_RANGE, "bytes %d-%d/%d" % (start, end, end + 1 + max_size * (len(content) >= max_size))) if len(content) < max_size: session.range_handle.close() session.range_handle = None if size == -1: self.send_response(_http_client.OK) self.send_header(HTTP_HEADER.CONNECTION, "close") self.send_header(HTTP_HEADER.CONTENT_TYPE, "text/plain") self.end_headers() with range_handle as f: while True: data = f.read(io.DEFAULT_BUFFER_SIZE) if not data: break else: self.wfile.write(data) else: self.send_response(_http_client.OK) # instead of _http_client.NO_CONTENT (compatibility reasons) self.send_header(HTTP_HEADER.CONNECTION, "close") if self.headers.get(HTTP_HEADER.RANGE): self.send_header(HTTP_HEADER.CONTENT_RANGE, "bytes 0-0/0") return content
def get_mol_from_graph(line) : x = line.split('\t') mol_smiles = x[1] mol_smiles = Chem.MolToSmiles(Chem.MolFromSmiles(mol_smiles),True) #make sure this is cannonical atom_info = x[2] bond_info = x[3] regex = r"\{(.*?)\}" matches = re.finditer(regex, atom_info , re.MULTILINE | re.DOTALL) atom_info_list = [] for matchNum, match in enumerate(matches): for groupNum in range(0, len(match.groups())): atom_info_list.append(match.group(1)) #create mol from the graph mol = Chem.RWMol() natoms = int(atom_info_list[0]) node_to_idx = {} idx_to_info = {} for atom_index in range(natoms): atom_info = atom_info_list[atom_index + 1] z = atom_info.split(':') ID = z[0] typ = z[1] smiles = z[2] order = z[3] sym = z[4] at_no = z[5] f_c = z[6] hyb = z[7] i_hc = z[8] e_hc = z[9] is_aro = z[10] a=Chem.Atom(int(at_no)) #a.SetChiralTag(chiral_tags[node]) a.SetFormalCharge(int(f_c)) if is_aro=='true': a.SetIsAromatic(True) else: a.SetIsAromatic(False) if hyb in rdkit.Chem.rdchem.HybridizationType.names: a.SetHybridization(rdkit.Chem.rdchem.HybridizationType.names[hyb]) else: a.SetHybridization(rdkit.Chem.rdchem.HybridizationType.names['OTHER']) #a.SetHybridization(hyb_map[hyb]) a.SetNumExplicitHs(int(e_hc)) idx = mol.AddAtom(a) node_to_idx[ID] = idx idx_to_info[idx] = [ID,typ,smiles,order] print(idx,ID,typ,smiles,order) bond_info_arr = bond_info.split(',') nbonds = int(bond_info_arr[0]) for bond_index in range(nbonds): bond = bond_info_arr[bond_index + 1] y = bond.split(':') edge = y[0] btype = y[1] y = edge.split('-') atom1_id = y[0] atom2_id = y[1] atom1_idx = node_to_idx[atom1_id] atom2_idx = node_to_idx[atom2_id] if(atom1_idx < atom2_idx): #print('Adding',atom1_idx,atom2_idx,btype) mol.AddBond(atom1_idx,atom2_idx,rdkit.Chem.rdchem.BondType.names[btype]) Chem.SanitizeMol(mol) new_smiles = Chem.MolToSmiles(mol) print('smiles=',mol_smiles,'\n') print('new_smiles=',new_smiles,'\n') assert new_smiles == mol_smiles print('graph and molecule matched') return mol,mol_smiles,idx_to_info