Example #1
def read_prpixel_file(filename):
    with open(filename) as f:
        s = f.read()

    # Track:
    # {'ntrack':, 'nhits':, 'hits': [{'hitid':, 'module':, 'x':, 'y':, 'z':}, ...]}
    prpixel_tracks = []

    # Find all debug lines with created tracks
    for i in re.finditer(r'Store track Nb (?P<ntrack>\d+)[^\d]nhits (?P<nhits>\d+).*?PrPixelTracking[ \t]*?(INFO ===|DEBUG)', s, re.DOTALL):
        hits = []
        # Find all hits in the track
        for j in re.finditer(r'PrPixelTracking.*?(?P<hitid>\d+) *module *(?P<module>\d+) x *(?P<x>[\d\.\-]+) y *(?P<y>[\d\.\-]+) z *(?P<z>[\d\-\.]+) used \d', i.group(0), re.DOTALL):
            hits.append({'hitid': j.group('hitid'),
                'module': j.group('module'),
                'x': j.group('x'),
                'y': j.group('y'),
                'z': j.group('z')})

        prpixel_tracks.append({'ntrack': i.group('ntrack'),
            'nhits': i.group('nhits'),
            'hits': hits})

    return prpixel_tracks
Example #2
def _retrieve_mails(uri):
    LOG.debug('Retrieving mail archive from uri: %s', uri)
    content = utils.read_uri(uri)
    if not content:
        LOG.error('Error reading mail archive from uri: %s', uri)
        return

    content = utils.gzip_decompress(content)
    LOG.debug('Mail archive is loaded, start processing')

    content += TRAILING_RECORD

    for rec in re.finditer(MAIL_BOX_PATTERN, content):
        email = rec.groupdict()
        email['author_email'] = email['author_email'].replace(' at ', '@', 1)
        if not utils.check_email_validity(email['author_email']):
            continue

        email['date'] = int(email_utils.mktime_tz(
            email_utils.parsedate_tz(email['date'])))

        for pattern_name, pattern in six.iteritems(MESSAGE_PATTERNS):
            collection = set()
            for item in re.finditer(pattern, email['body']):
                groups = item.groupdict()
                item_id = groups['id']
                if 'module' in groups:
                    item_id = groups['module'] + ':' + item_id
                    email['module'] = groups['module']
                collection.add(item_id)
            email[pattern_name] = list(collection)

        yield email
Example #3
def obfuscate_codeblocks(source):
  """Method for obfuscating codeblocks contents.

  It is often useful to temporarily obfuscate codeblock contents so that some tasks
  can be performed safely, re-introducing the original contents afterwards.

  Parameters
  ----------
  source : str
    string (as single stream) containing the source

  Returns
  -------
  protected_contents : list
    list of str containing the contents of codeblocks
  str
    source with codeblocks contents obfuscated and replaced by a safe placeholder

  >>> source = '``` my code block ``` other contents'
  >>> prot, ob_source = obfuscate_codeblocks(source)
  >>> prot[0][2]
  '``` my code block ```'
  >>> ob_source
  '$PROTECTED-1 other contents'
  """
  obfuscate_source = source
  protected_contents = []
  for match in re.finditer(__regex_codeblock__,obfuscate_source):
    protected_contents.append([match.start(),match.end(),match.group()])
    obfuscate_source = re.sub(__regex_codeblock__,'$PROTECTED-'+str(len(protected_contents)),obfuscate_source,1)
  for match in re.finditer(__regex_codeblock_html__,obfuscate_source):
    protected_contents.append([match.start(),match.end(),match.group()])
    obfuscate_source = re.sub(__regex_codeblock_html__,'$PROTECTED-'+str(len(protected_contents)),obfuscate_source,1)
  return protected_contents,obfuscate_source
Example #4
def get_pronoun_label_zh(line):
	f_pronouns = ['我', '我们', '我 的']
	s_pronouns = ['你', '你们', '你 的']
	f_count = 0
	s_count = 0
	f_positions = []
	s_positions = []
	f_matches = []
	s_matches = []
	for pro in f_pronouns:
		f_zh = re.findall('^' + pro + ' ', line) + re.findall(' ' + pro + ' ', line) + re.findall(' ' + pro + '$', line)
		f_positions += [m.span()[0] for m in re.finditer('^' + pro + ' ', line)] + [m.span()[0] for m in re.finditer(' ' + pro + ' ', line)] + [m.span()[0] for m in re.finditer(' ' + pro + '$', line)]
		f_count += len(f_zh)
		f_matches += f_zh
	for pro in s_pronouns:
		s_zh = re.findall('^' + pro + ' ', line) + re.findall(' ' + pro + ' ', line) + re.findall(' ' + pro + '$', line)
		s_positions += [m.span()[0] for m in re.finditer('^' + pro + ' ', line)] + [m.span()[0] for m in re.finditer(' ' + pro + ' ', line)] + [m.span()[0] for m in re.finditer(' ' + pro + '$', line)]
		s_count += len(s_zh)
		s_matches += s_zh

	if f_count == 0 and s_count == 0:
		return ('none', 0, 0, [], [])
	if f_count == s_count:
		# Tie: whichever pronoun occurs first in the line wins
		f_min = min(f_positions)
		s_min = min(s_positions)
		starts_with = '1v' if f_min < s_min else '2v'
		return (starts_with, f_count, s_count, f_matches, s_matches)
	elif f_count > s_count:
		return ('1v', f_count, s_count, f_matches, s_matches)
	else:
		return ('2v', f_count, s_count, f_matches, s_matches)
Example #5
  def ParseMethodAnnotation(self, annotation):
    if annotation.find('reservable = true') >= 0:
      self._is_reservable = True

    delegate_re = re.compile(r'delegate\s*=\s*(?P<delegate>(true|false))')
    for match in re.finditer(delegate_re, annotation):
      delegate = match.group('delegate')
      if delegate == 'true':
        self._is_delegate = True
      elif delegate == 'false':
        self._is_delegate = False

    disable_reflect_method_re = re.compile(r'disableReflectMethod\s*=\s*(?P<disableReflectMethod>(true|false))')
    for match in re.finditer(disable_reflect_method_re, annotation):
      disable_reflect_method = match.group('disableReflectMethod')
      if disable_reflect_method == 'true':
        self._disable_reflect_method = True
      else:
        self._disable_reflect_method = False

    pre_wrapline_re = re.compile(r'preWrapperLines\s*=\s*\{\s*(?P<pre_wrapline>(".*")(,\s*".*")*)\s*\}')
    for match in re.finditer(pre_wrapline_re, annotation):
      pre_wrapline = self.FormatWrapperLine(match.group('pre_wrapline'))
      self._method_annotations[self.ANNOTATION_PRE_WRAPLINE] = pre_wrapline

    post_wrapline_re = re.compile(r'postWrapperLines\s*=\s*\{\s*(?P<post_wrapline>(".*")(,\s*".*")*)\s*\}')
    for match in re.finditer(post_wrapline_re, annotation):
      post_wrapline = self.FormatWrapperLine(match.group('post_wrapline'))
      self._method_annotations[self.ANNOTATION_POST_WRAPLINE] = post_wrapline
Example #6
	def prepair_query(self, media, *args, **kwards):
		if media == 'tvshow':
			uri = '/search/advanced_search.php?'
			query = {"q": args[0], "from_year": args[3], "to_year": args[3], "section": 2}
			uri += urllib.urlencode(query)
			html = self.request(uri)
			r = re.search('Search Results For: "(.*?)</table>', html, re.DOTALL)
			if r:
				fragment = r.group(1)
				pattern = r'<a\s+href="([^"]+)"\s+title="([^"]+)'
				for match in re.finditer(pattern, fragment):
					url, title_year = match.groups('')
					url = url.replace('-tvshow-online-free-putlocker.html', '-tvshow-season-%s-episode-%s-online-free-putlocker.html' % (args[1], args[2]))
					uri = url.replace(self.base_url, '')
					return uri
			return False
		else:
			uri = '/search/advanced_search.php?'
			query = {"q": args[0], "from_year": args[1], "to_year": args[1], "section": 1}
			uri += urllib.urlencode(query)
			html = self.request(uri)
			r = re.search('Search Results For: "(.*?)</table>', html, re.DOTALL)
			if r:
				fragment = r.group(1)
				pattern = r'<a\s+href="([^"]+)"\s+title="([^"]+)'
				for match in re.finditer(pattern, fragment):
					url, title_year = match.groups('')
					uri = url.replace(self.base_url, '')
					return uri
			return False
Example #7
def extractRequirements(args):
    with open(args.requirements_path) as f:
        text = f.read()

    data = {}  # requirement number -> [number, description, set()]

    # Extract all of the tables
    for table in re.finditer(r'\\begin{tabular}.*?\\end{tabular}', text, re.S):
        if re.search(r'Number.*?Requirement Description', table.group(0)) is not None:

            header_row = True

            # Look for this pattern:
            # F1.10 & Description
            for req in re.finditer(r'([\w\.]+)\s+&\s+(.*?)\s*\\\\', table.group(0)):

                if header_row:
                    header_row = False

                    # Skip the header
                    continue
                else:
                    # Store the requirement number and description in the data structure
                    data[req.group(1)] = [req.group(1), req.group(2), set()]

    return data
Example #8
    def get_sources(self, video):
        source_url = self.get_url(video)
        hosters = []
        if source_url and source_url != FORCE_NO_MATCH:
            url = urlparse.urljoin(self.base_url, source_url)
            html = self._http_get(url, cache_limit=.5)
            
            fragment = dom_parser.parse_dom(html, 'div', {'class': '[^"]*movie_langs_list[^"]*'})
            if fragment:
                for match in re.finditer('href="([^"]+)', fragment[0]):
                    match = re.search('movie-player/(.*)', match.group(1))
                    if match:
                        player_url = urlparse.urljoin(self.base_url, PLAYER_URL % (match.group(1)))
                        html = self._http_get(player_url, cache_limit=.5)
                        match = re.search('<source\s+src="([^"]+)', html)
                        if match:
                            stream_url = match.group(1)
                            hoster = {'multi-part': False, 'url': stream_url, 'class': self, 'quality': self._gv_get_quality(stream_url), 'host': self._get_direct_hostname(stream_url), 'rating': None, 'views': None, 'direct': True}
                            hosters.append(hoster)
                        
                        fragment2 = dom_parser.parse_dom(html, 'ul', {'class': 'servers'})
                        if fragment2:
                            for match in re.finditer('href="([^"]+).*?<span>(.*?)</span>', fragment2[0]):
                                other_url, quality = match.groups()
                                match = re.search('movie-player/(.*)', other_url)
                                if match:
                                    other_url = urlparse.urljoin(self.base_url, PLAYER_URL % (match.group(1)))
                                    if other_url == player_url: continue
                                    hoster = {'multi-part': False, 'url': other_url, 'class': self, 'quality': QUALITY_MAP.get(quality, QUALITIES.HD720), 'host': self._get_direct_hostname(other_url), 'rating': None, 'views': None, 'direct': True}
                                    hosters.append(hoster)

        return hosters
Example #9
def tableViewInHierarchy():
    viewDescription = fb.evaluateExpressionValue(
        "(id)[(id)[[UIApplication sharedApplication] keyWindow] recursiveDescription]"
    ).GetObjectDescription()

    searchView = None

    # Try to find an instance of UITableView
    classPattern = re.compile(r"UITableView: (0x[0-9a-fA-F]+);")
    for match in re.finditer(classPattern, viewDescription):
        searchView = match.group(1)
        break

    # Try to find a direct subclass
    if not searchView:
        subclassPattern = re.compile(r"(0x[0-9a-fA-F]+); baseClass = UITableView;")
        for match in re.finditer(subclassPattern, viewDescription):
            searchView = match.group(1)
            break

    # SLOW: check every pointer in town
    if not searchView:
        pattern = re.compile(r"(0x[0-9a-fA-F]+)[;>]")
        for view in re.findall(pattern, viewDescription):
            if fb.evaluateBooleanExpression("[" + view + " isKindOfClass:(id)[UITableView class]]"):
                searchView = view
                break

    return searchView
Example #10
 def google_scrap(self):
     #iter = re.finditer('''<!--sMSL-->([\s\S]*)<!--sMSR-->''',self.the_page)
     #for it in iter:
     #    self.res_html = it.group(1)
     iter = re.finditer('''<p>([\s\S]*?)</p>''',self.the_page)
     for it in iter:
         self.search_item.append(it.group(1))
     for i in range(len(self.search_item)):
         href = ""
         title = ""
         iter = re.finditer('''<a.href="/url\?q=[^"]*">([\s\S]*?)</a>[\s\S]*?<a.href="/search\?q=related:([^"]*)&amp;hl=">''',self.search_item[i])
         #iter = re.finditer('''<a.href="[^"]*">([^"]*)</a>[\s\S]*?<a.href="/search?q=related:([^"]*)&amp;hl=">''',self.search_item[i])
         for it in iter:
             href = it.group(2)
             if href == '':
                 break
             title = it.group(1)
             textProcess = textprocess.TextProcess(title)
             textProcess.getTitle()
             textProcess.clearPoint()
             textProcess.clearSpace()
             self.html_titles[href] = textProcess.text
         if href == '':
             continue
         textProcess = textprocess.TextProcess(self.search_item[i])
         textProcess.clearHtml()
         textProcess.clearPoint()
         textProcess.clearSpace()
         text = textProcess.text
         self.res_items[href] = [self.html_titles[href],text]
     print self.html_titles
Example #11
def scan_page(url, data=None):
    retval, usable = False, False
    url, data = re.sub(r"=(&|\Z)", r"=1\g<1>", url) if url else url, re.sub(r"=(&|\Z)", r"=1\g<1>", data) if data else data
    try:
        for phase in (GET, POST):
            current = url if phase is GET else (data or "")
            for match in re.finditer(r"((\A|[?&])(?P<parameter>[\w\[\]]+)=)(?P<value>[^&]+)", current):
                found, usable = False, True
                print "* scanning %s parameter '%s'" % (phase, match.group("parameter"))
                prefix, suffix = ("".join(random.sample(string.ascii_lowercase, PREFIX_SUFFIX_LENGTH)) for i in xrange(2))
                for pool in (LARGER_CHAR_POOL, SMALLER_CHAR_POOL):
                    if not found:
                        tampered = current.replace(match.group(0), "%s%s" % (match.group(0), urllib.quote("%s%s%s%s" % ("'" if pool == LARGER_CHAR_POOL else "", prefix, "".join(random.sample(pool, len(pool))), suffix))))
                        content = (_retrieve_content(tampered, data) if phase is GET else _retrieve_content(url, tampered)).replace("%s%s" % ("'" if pool == LARGER_CHAR_POOL else "", prefix), prefix)
                        for sample in re.finditer("%s([^ ]+?)%s" % (prefix, suffix), content, re.I):
                            for regex, condition, info, content_removal_regex in XSS_PATTERNS:
                                context = re.search(regex % {"chars": re.escape(sample.group(0))}, re.sub(content_removal_regex or "", "", content), re.I)
                                if context and not found and sample.group(1).strip():
                                    if _contains(sample.group(1), condition):
                                        print " (i) %s parameter '%s' appears to be XSS vulnerable (%s)" % (phase, match.group("parameter"), info % dict((("filtering", "no" if all(char in sample.group(1) for char in LARGER_CHAR_POOL) else "some"),)))
                                        found = retval = True
                                    break
        if not usable:
            print " (x) no usable GET/POST parameters found"
    except KeyboardInterrupt:
        print "\r (x) Ctrl-C pressed"
    return retval
Example #12
def parseHtmlForm(attr_str, html, input_names=None):
    for form in re.finditer(r"(?P<tag><form[^>]*%s[^>]*>)(?P<content>.*?)</?(form|body|html)[^>]*>" % attr_str, html, re.S | re.I):
        inputs = {}
        action = parseHtmlTagAttrValue("action", form.group('tag'))
        for inputtag in re.finditer(r'(<(input|textarea)[^>]*>)([^<]*(?=</\2)|)', form.group('content'), re.S | re.I):
            name = parseHtmlTagAttrValue("name", inputtag.group(1))
            if name:
                value = parseHtmlTagAttrValue("value", inputtag.group(1))
                if value is None:
                    inputs[name] = inputtag.group(3) or ''
                else:
                    inputs[name] = value

        if isinstance(input_names, dict):
            # check input attributes
            for key, val in input_names.items():
                if key in inputs:
                    if isinstance(val, basestring) and inputs[key] == val:
                        continue
                    elif isinstance(val, tuple) and inputs[key] in val:
                        continue
                    elif hasattr(val, "search") and re.match(val, inputs[key]):
                        continue
                    break  # attribute value does not match
                else:
                    break  # attribute name does not match
            else:
                return action, inputs # passed attribute check
        else:
            # no attribute check
            return action, inputs

    return {}, None # no matching form found
Example #13
def consistency_check(text, word_pairs, err, msg, offset=0):
    """Build a consistency checker for the given word_pairs."""
    errors = []

    msg = " ".join(msg.split())

    for w in word_pairs:
        match1 = [m for m in re.finditer(w[0], text)]
        match2 = [m for m in re.finditer(w[1], text)]

        if len(match1) > 0 and len(match2) > 0:

            if len(match1) > len(match2):
                for m in match2:
                    errors.append((
                        m.start() + offset,
                        m.end() + offset,
                        err,
                        msg.format(m.group(0), w[0])))
            else:
                for m in match1:
                    errors.append((
                        m.start() + offset,
                        m.end() + offset,
                        err,
                        msg.format(m.group(0), w[1])))

    return errors
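A quick usage sketch; the rule name and message template below are made up for illustration:

# >>> consistency_check("colour colour color", [["color", "colour"]],
# ...                   "consistency.check", "'{}' should be '{}'.")
# [(14, 19, 'consistency.check', "'color' should be 'colour'.")]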
Example #14
    def get_media_url(self, host, media_id):
        web_url = self.get_url(host, media_id)
        html = self.net.http_GET(web_url).content
        form_values = {}
        stream_url = ''
        for i in re.finditer('<input type="hidden" name="([^"]+)" value="([^"]+)', html):
            form_values[i.group(1)] = i.group(2)

        xbmc.sleep(2000)
        html = self.net.http_POST(web_url, form_data=form_values).content
        
        r = re.search("file\s*:\s*'([^']+)'", html)
        if r:
            stream_url = r.group(1)

        for match in re.finditer('(eval\(function.*?)</script>', html, re.DOTALL):
            js_data = jsunpack.unpack(match.group(1))
            match2 = re.search('<param\s+name="src"\s*value="([^"]+)', js_data)
            if match2:
                stream_url = match2.group(1)
            else:
                match2 = re.search('<embed.*?type="video.*?src="([^"]+)', js_data)
                if match2:
                    stream_url = match2.group(1)
            
        if stream_url:
            return stream_url + '|User-Agent=%s&Referer=%s' % (common.IE_USER_AGENT, web_url)

        raise UrlResolver.ResolverError('Unable to resolve cloudyvideos link. Filelink not found.')
Example #15
def setupTranslations(type, locales, projectName, key):
  # Copy locales list, we don't want to change the parameter
  locales = set(locales)

  # Fill up with locales that we don't have but the browser supports
  if type == 'chrome':
    for locale in chromeLocales:
      locales.add(locale)
  else:
    firefoxLocales = urllib2.urlopen('http://www.mozilla.org/en-US/firefox/all.html').read()
    for match in re.finditer(r'&amp;lang=([\w\-]+)"', firefoxLocales):
      locales.add(mapLocale(type, match.group(1)))
    langPacks = urllib2.urlopen('https://addons.mozilla.org/en-US/firefox/language-tools/').read()
    for match in re.finditer(r'<tr>.*?</tr>', langPacks, re.S):
      if match.group(0).find('Install Language Pack') >= 0:
        match2 = re.search(r'lang="([\w\-]+)"', match.group(0))
        if match2:
          locales.add(mapLocale(type, match2.group(1)))

  # Convert locale codes to the ones that Crowdin will understand
  locales = set(map(lambda locale: mapLocale(type, locale), locales))

  allowed = set()
  allowedLocales = urllib2.urlopen('http://crowdin.net/page/language-codes').read()
  for match in re.finditer(r'<tr>\s*<td\b[^<>]*>([\w\-]+)</td>', allowedLocales, re.S):
    allowed.add(match.group(1))
  if not allowed.issuperset(locales):
    print 'Warning, following locales aren\'t allowed by server: ' + ', '.join(locales - allowed)

  locales = list(locales & allowed)
  locales.sort()
  params = urllib.urlencode([('languages[]', locale) for locale in locales])
  result = urllib2.urlopen('http://api.crowdin.net/api/project/%s/edit-project?key=%s' % (projectName, key), params).read()
  if result.find('<success') < 0:
    raise Exception('Server indicated that the operation was not successful\n' + result)
Example #16
def processticker(ticker, file_name, date_int, listview):
    base_url = "http://finance.yahoo.com/q/op"
    num_of_tries = 0
    payload = {"s": ticker, "date": date_int}
    r = requests.get(base_url, params=payload)
    data = r.text
    soup = BeautifulSoup(data, "lxml")
    option_list = []
    expiration_dictionary = {}

    while num_of_tries < 20:
        try:

            for pair in soup.find_all("option"):
                expiration_dictionary[pair.get_text()] = yahoo_url + pair["data-selectbox-link"]
            for n in soup.find_all("script"):
                option_list.append(n)
            raw_options_chain = str(option_list.pop(16))
            start_call_options = [a.start() for a in list(re.finditer("calls", raw_options_chain))]
            endoptions = [a.start() for a in list(re.finditer("_options", raw_options_chain))]
            raw_options_chain = raw_options_chain[start_call_options[0] - 2 : endoptions[0] - 2]
            options_json = json.loads(raw_options_chain)
            # Extract puts/calls as JSON objects.
            put_list = options_json["puts"]
            call_list = options_json["calls"]
            print(call_list)
            create_csv(call_list, put_list, file_name, listview)

        except IndexError:
            num_of_tries += 1
            continue
        break
Example #17
def consistency_check(text, word_pairs, err, msg, offset=0):
    """Build a consistency checker for the given word_pairs."""
    errors = []

    msg = " ".join(msg.split())

    for w in word_pairs:
        matches = [
            [m for m in re.finditer(w[0], text)],
            [m for m in re.finditer(w[1], text)]
        ]

        if len(matches[0]) > 0 and len(matches[1]) > 0:

            idx_minority = len(matches[0]) > len(matches[1])
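            # bool as index: True selects matches[1]; since ~True == -2 and
            # ~False == -1, w[~idx_minority] is always the majority spelling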

            for m in matches[idx_minority]:
                errors.append((
                    m.start() + offset,
                    m.end() + offset,
                    err,
                    msg.format(w[~idx_minority], m.group(0)),
                    w[~idx_minority]))

    return errors
Example #18
	def html2utf8(self,in_html):
		in_html = (re.subn(r'(?s)<(script).*?</\1>', '', in_html)[0])
		in_html = (re.subn(r'(?s)<(style).*?</\1>', '', in_html)[0])
		entitydict = {}

		entities = re.finditer('&([^#][A-Za-z]{1,5}?);', in_html)
		for x in entities:
			key = x.group(0)
			if key not in entitydict:
				entitydict[key] = htmlentitydefs.name2codepoint[x.group(1)]

		entities = re.finditer('&#x([0-9A-Fa-f]{2,2}?);', in_html)
		for x in entities:
			key = x.group(0)
			if key not in entitydict:
				entitydict[key] = "%d" % int(key[3:5], 16)

		entities = re.finditer('&#(\d{1,5}?);', in_html)
		for x in entities:
			key = x.group(0)
			if key not in entitydict:
				entitydict[key] = x.group(1)

		if re.search("charset=utf-8", in_html):
			for key, codepoint in iteritems(entitydict):
				in_html = in_html.replace(key, unichr(int(codepoint)))
			self.inhtml = in_html.encode('utf8')
			return

		for key, codepoint in iteritems(entitydict):
			in_html = in_html.replace(key, unichr(int(codepoint)).encode('latin-1', 'ignore'))
		self.inhtml = in_html.decode('latin-1').encode('utf8')
Example #19
    def _attack(self, basePair, payloads, taint, request_template, referer):
        proto = helpers.analyzeRequest(basePair).getUrl().getProtocol() + '://'
        if 'abshost' in payloads:
            payloads['abshost'] = proto + payloads['abshost']
        payloads['referer'] = proto + taint + '/' + referer

        # Load the supplied payloads into the request
        if 'xfh' in payloads:
            payloads['xfh'] = "\r\nX-Forwarded-Host: " + payloads['xfh']

        for key in ('xfh', 'abshost', 'host', 'referer'):
            if key not in payloads:
                payloads[key] = ''

        # Ensure that the response to our request isn't cached - that could be harmful
        payloads['cachebust'] = str(time.time())

        request = request_template.substitute(payloads)

        attack = callbacks.makeHttpRequest(basePair.getHttpService(), request)

        response = safe_bytes_to_string(attack.getResponse())

        requestHighlights = [jarray.array([m.start(), m.end()], 'i') for m in
                             re.finditer('(' + '|'.join(payloads.values()) + ')',
                                         safe_bytes_to_string(attack.getRequest()))]
        responseHighlights = [jarray.array([m.start(), m.end()], 'i') for m in re.finditer(taint, response)]
        attack = callbacks.applyMarkers(attack, requestHighlights, responseHighlights)
        return attack, response
Example #20
    def get_media_url(self, host, media_id):
        web_url = self.get_url(host, media_id)
        html = self.net.http_GET(web_url).content

        data = {}
        for match in re.finditer('input type="hidden" name="([^"]+)" value="([^"]+)', html):
            key, value = match.groups()
            data[key] = value
        data['method_free'] = 'Proceed to Video'

        html = self.net.http_POST(web_url, form_data=data).content

        stream_url = ''
        for match in re.finditer('(eval\(function.*?)</script>', html, re.DOTALL):
            js_data = jsunpack.unpack(match.group(1))
            match2 = re.search('<param\s+name="src"\s*value="([^"]+)', js_data)
            if match2:
                stream_url = match2.group(1)
            else:
                match2 = re.search('file\s*:\s*"([^"]+)', js_data)
                if match2:
                    stream_url = match2.group(1)

        if stream_url:
            return stream_url + '|' + urllib.urlencode({'User-Agent': common.IE_USER_AGENT, 'Referer': web_url})

        raise ResolverError('Unable to resolve grifthost link. Filelink not found.')
Example #21
def replaceDeployFile(out_file, template_file, train_file, fix_layers=None):
    with open(template_file, 'rb') as f:
        text = f.read()

    text = text.replace('$TRAIN_TXT', '"' + train_file + '"')
    if fix_layers is not None:
        start_excludes = []
        for fix_layer_curr in fix_layers:
            starts = [match.start() for match in re.finditer(re.escape('name: "' + fix_layer_curr), text)]
            assert len(starts) == 1
            start_excludes.append(starts[0])
        starts = [match.start() for match in re.finditer(re.escape('name: '), text)]
        starts = [idx for idx in starts if idx not in start_excludes]
        starts.sort()
        # Rewrite from the end of the file so earlier offsets stay valid
        starts = starts[::-1]
        for start in starts:
            string_orig = text[start:]
            string_orig = string_orig[:string_orig.index('\n')]
            string_new = string_orig[:string_orig.rindex('"')] + '_fix"'
            text = text.replace(string_orig, string_new)

    with open(out_file, 'wb') as f:
        f.write(text)
Example #22
def stem_helper(word, rem_umlaut = True):
	"""rem_umlaut: remove umlauts from the word before stemming"""
	
	#Define R1 and R2 regions
	
	#R1 is defined as the region after the first consonant followed by a vowel
	
	try:
		R1 = list(re.finditer(r"[aëeiouäöüâêîôûæœ][bdghfcjklmnspqrtvwz]",word))[0].start() + 2
	except IndexError:
		R1 = len(word)
		
	#R2 is defined as the region within R1 after the first consonant followed by a vowel
	
	try:
		R2 = list(re.finditer(r"[aëeiouäöüâêîôûæœ][bdghfcjklmnspqrtvwz]",word[R1:]))[0].start() + 2 + R1
	except IndexError:
		R2 = len(word)
		
	#Make sure the index of R1 is at least 3. 
	
	if R1<3:
		try:
			R1 = list(re.finditer(r"[aëeiouäöüâêîôûæœ][bdghfcjklmnspqrtvwz]",word[1:]))[0].start() + 2
		except IndexError:
			R1 = len(word)
	
	if rem_umlaut:
		word = remove_umlaut(word)
	
	word = word[:R1] + re.sub(r'(wes|wen|est|ern|em|en|er|es|eȥ(?=[klmrt])s|(?=[lr])n|e)$',"",word[R1:])
	word = word[:R1] + re.sub(r'(est|er|en|re|in|iu|(?=.{3})st)$',"",word[R1:])
	word = word[:R2] + re.sub(r'(lich?.?.|keit|inc|isch?.?.)$',"",word[R2:])
	
	return word
Example #23
    def get_sources(self, video):
        source_url = self.get_url(video)
        hosters = []
        if source_url and source_url != FORCE_NO_MATCH:
            url = urlparse.urljoin(self.base_url, source_url)
            html = self._http_get(url, cache_limit=.5)

            match = re.search('This movie is of poor quality', html, re.I)
            if match:
                quality = QUALITIES.LOW
            else:
                quality = QUALITIES.HIGH

            for match in re.finditer('href="([^"]+/embed\d*/[^"]+)', html):
                url = match.group(1)
                embed_html = self._http_get(url, cache_limit=.5)
                r = re.search('{\s*write\("([^"]+)', embed_html)
                if r:
                    plaintext = self._caesar(r.group(1), 13).decode('base-64')
                    if 'http' not in plaintext:
                        plaintext = self._caesar(r.group(1).decode('base-64'), 13).decode('base-64')
                else:
                    plaintext = embed_html
                hosters += self._get_links(plaintext)
            
            pattern = 'href="([^"]+)".*play_video.gif'
            for match in re.finditer(pattern, html, re.I):
                url = match.group(1)
                host = urlparse.urlparse(url).hostname
                hoster = {'multi-part': False, 'url': url, 'host': host, 'class': self, 'quality': scraper_utils.get_quality(video, host, quality), 'rating': None, 'views': None, 'direct': False}
                hosters.append(hoster)
        return hosters
Example #24
 def list_show_page(self, url, page, seasons=False, episodes=False):
     result = []
     if "/p/epizody" in url or "/p/epiz%C3%B3dy" in url or "p/archiv" in url:
         if seasons:
             season_data = util.substr(page, SERIES_START2, SERIES_END2)
             for m in re.finditer(SERIES_ITER_RE2, season_data, re.DOTALL | re.IGNORECASE):
                 item = self.dir_item()
                 item['title'] = m.group('title')
                 item['url'] = url + '#post=%s' % (m.group('id'))
                 self._filter(result, item)
         if episodes:
             for m in re.finditer(EPISODE_ITER_RE2, page, re.DOTALL | re.IGNORECASE):
                 item = self.video_item()
                 item['title'] = "%s (%s)" % (m.group('title'), m.group('date'))
                 item['url'] = m.group('url')
                 self._filter(result, item)
     else:
         if seasons:
             season_data = util.substr(page, SERIES_START, SERIES_END)
             for m in re.finditer(SERIES_ITER_RE, season_data, re.DOTALL | re.IGNORECASE):
                 item = self.dir_item()
                 item['title'] = m.group('title')
                 item['url'] = 'http://' + urlparse.urlparse(url).netloc + '/ajax.json?' + m.group('url')
                 self._filter(result, item)
         if episodes:
             episodes_data = util.substr(page, EPISODE_START, EPISODE_END)
             for m in re.finditer(EPISODE_ITER_RE, episodes_data, re.DOTALL | re.IGNORECASE):
                 item = self.video_item()
                 item['title'] = "%s. %s (%s)" % (m.group('episode'), m.group('title'), m.group('date'))
                 item['url'] = m.group('url')
                 self._filter(result, item)
     return result
Example #25
def dotransform(request, response):
  
  emailaddr = []
  msgfile = request.value
  lookFor = ['To', 'From']
  tmpfolder = request.fields['sniffMyPackets.outputfld']
  
  with open(msgfile, mode='r') as msgfile:
    reader = msgfile.read()
    reader = str(reader)
    for x in lookFor:
      if x in reader:
        for s in re.finditer('RCPT TO: <([\w.-]+@[\w.-]+)>', reader):
          to_addr = s.group(1), 'mail_to'
          emailaddr.append(to_addr)
        for t in re.finditer('MAIL FROM: <([\w.-]+@[\w.-]+)>', reader):
          from_addr = t.group(1), 'mail_from'
          emailaddr.append(from_addr)

  
	
  for addr, addrfield in emailaddr:
    e = EmailAddress(addr)
    e.linklabel = addrfield
    e += Field('filelocation', request.value, displayname='File Location', matchingrule='loose')
    e += Field('emailaddr', addrfield, displayname='Header Info')
    response += e
  return response
Example #26
def setupTranslations(localeConfig, projectName, key):
  # Make a new set from the locales list, mapping to Crowdin friendly format
  locales = {mapLocale(localeConfig['name_format'], locale)
             for locale in localeConfig['locales']}

  # Fill up with locales that we don't have but the browser supports
  if 'chrome' in localeConfig['target_platforms']:
    for locale in chromeLocales:
      locales.add(mapLocale('ISO-15897', locale))

  if 'gecko' in localeConfig['target_platforms']:
    firefoxLocales = urllib2.urlopen('http://www.mozilla.org/en-US/firefox/all.html').read()
    for match in re.finditer(r'&amp;lang=([\w\-]+)"', firefoxLocales):
      locales.add(mapLocale('BCP-47', match.group(1)))
    langPacks = urllib2.urlopen('https://addons.mozilla.org/en-US/firefox/language-tools/').read()
    for match in re.finditer(r'<tr>.*?</tr>', langPacks, re.S):
      if match.group(0).find('Install Language Pack') >= 0:
        match2 = re.search(r'lang="([\w\-]+)"', match.group(0))
        if match2:
          locales.add(mapLocale('BCP-47', match2.group(1)))

  allowed = set()
  allowedLocales = urllib2.urlopen('http://crowdin.net/page/language-codes').read()
  for match in re.finditer(r'<tr>\s*<td\b[^<>]*>([\w\-]+)</td>', allowedLocales, re.S):
    allowed.add(match.group(1))
  if not allowed.issuperset(locales):
    print 'Warning, following locales aren\'t allowed by server: ' + ', '.join(locales - allowed)

  locales = list(locales & allowed)
  locales.sort()
  params = urllib.urlencode([('languages[]', locale) for locale in locales])
  result = urllib2.urlopen('http://api.crowdin.net/api/project/%s/edit-project?key=%s' % (projectName, key), params).read()
  if result.find('<success') < 0:
    raise Exception('Server indicated that the operation was not successful\n' + result)
Example #27
 def list_archive_page(self, show_page, showon=False, showoff=False):
     showonlist = []
     if showon:
         page = util.substr(show_page, VYSIELANE_START, NEVYSIELANE_START)
         for m in re.finditer(VYSIELANE_ITER_RE, page, re.DOTALL | re.IGNORECASE):
             item = self.dir_item()
             item['title'] = m.group('title')
             item['plot'] = m.group('desc')
             item['url'] = m.group('url') + "#season_episode"
             if m.group('itime') is not None:
                 item['type'] = "showon7d"
             else:
                 item['type'] = "showon"
             showonlist.append(item)
     showonlist.sort(key=lambda x: x['title'].lower())
     showofflist = []
     if showoff:
         page = util.substr(show_page, NEVYSIELANE_START, NEVYSIELANE_END)
         for m in re.finditer(NEVYSIELANE_ITER_RE, page, re.DOTALL | re.IGNORECASE):
             item = self.dir_item()
             item['title'] = m.group('title')
             item['url'] = m.group('url') + "#season_episode"
             item['type'] = "showoff"
             showofflist.append(item)
     showofflist.sort(key=lambda x: x['title'].lower())
     result = showonlist + showofflist
     return result
Example #28
        def _generate_entry_probe(self):
                # Any $entry(name) expressions result in saving that argument
                # when entering the function.
                self.args_to_probe = set()
                regex = r"\$entry\((\w+)\)"
                for expr in self.exprs:
                        for arg in re.finditer(regex, expr):
                                self.args_to_probe.add(arg.group(1))
                for arg in re.finditer(regex, self.filter):
                        self.args_to_probe.add(arg.group(1))
                if any(map(lambda expr: "$latency" in expr, self.exprs)) or \
                   "$latency" in self.filter:
                        self.args_to_probe.add("__latency")
                        self.param_types["__latency"] = "u64"    # nanoseconds
                for pname in self.args_to_probe:
                        if pname not in self.param_types:
                                raise ValueError("$entry(%s): no such param"
                                                % pname)

                self.hashname_prefix = "%s_param_" % self.probe_hash_name
                text = ""
                for pname in self.args_to_probe:
                        # Each argument is stored in a separate hash that is
                        # keyed by pid.
                        text += "BPF_HASH(%s, u32, %s);\n" % \
                             (self.hashname_prefix + pname,
                              self.param_types[pname])
                text += self._generate_entry()
                return text
Example #29
	def ExtendCurlys(self, list_of_terms, target_body):
		"""
		Run FindWordsInBracketsAndCurlies first.
		Adds curly braces to the same words if they have not yet received braces.
		"""
		self.target_body = ' ' + target_body + ' '

		self.dbrackets = [m.span(0) for m in re.finditer(r"\[([\w \(\)\-,.]+)\]", self.target_body)]
		self.sbrackets = [m.span(0) for m in re.finditer(r"\{([\w \(\)\-,.]+)\}", self.target_body)]
		self.allbrackets = self.dbrackets + self.sbrackets

		def repl(matchobj):
			for span in self.allbrackets:
				if matchobj.start(0) in range(*span):
					return matchobj.group(0)

			self.curly_count += 1
			return (matchobj.group(1) + self.curly_term + matchobj.group(2))


		self.curly_count = 0
		for i, term in enumerate(list_of_terms):
			self.curly_term = '{' + term + '}'

			regex = re.compile(r"([^\{\w])%s([^\}\w])" %term, re.IGNORECASE)
			if i == 0:
				self.ecoutput = re.sub(regex, repl, self.target_body)
			else:
				self.ecoutput = re.sub(regex, repl, self.ecoutput)

		self.ecoutput = self.ecoutput[1:-1]
Example #30
def _boundary_of_alternatives_indices(pattern):
    """
    Determines the location of a set of alternatives in a glob pattern.
    Alternatives are defined by a matching set of non-bracketed parentheses.

    :param pattern: Glob pattern with wildcards.
    :return:        Indices of the innermost set of matching non-bracketed
                    parentheses in a tuple. The Index of a missing parenthesis
                    will be passed as None.
    """
    # Taking the leftmost closing parenthesis and the rightmost opening
    # parenthesis left of it ensures that the parentheses belong together and
    # the pattern is parsed correctly from the most nested section outwards.
    end_pos = None
    for match in re.finditer('\\)', pattern):
        if not _position_is_bracketed(pattern, match.start()):
            end_pos = match.start()
            break  # Break to get leftmost.

    start_pos = None
    for match in re.finditer('\\(', pattern[:end_pos]):
        if not _position_is_bracketed(pattern, match.start()):
            start_pos = match.end()
            # No break to get rightmost.

    return start_pos, end_pos
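A quick illustration; the real `_position_is_bracketed` lives elsewhere in the source module, so a minimal stand-in (which ignores escaped brackets) is sketched here:

import re

def _position_is_bracketed(pattern, position):
    # Minimal stand-in: True if the index lies inside a [...] set.
    inside = False
    for index, char in enumerate(pattern):
        if index == position:
            return inside
        if char == '[':
            inside = True
        elif char == ']':
            inside = False
    return False

pattern = 'ab([12]|(c|d))e'
start, end = _boundary_of_alternatives_indices(pattern)
print(pattern[start:end])  # -> c|d (the innermost set of alternatives)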
Example #31
#....................................................................................................................

# x='a{2,3}' # where minimum 2 or maximum 3 'a' are found
# r='aa aabca aaaa cgaa'
# match=re.finditer(x,r)
# for i in match:
#     print(i.start())
#     print(i.group())
#     print()

#....................................................................................................................
#
# x='^a' #check whether the whole string is starting with 'a'
# r='aa abc cga baac'
# match=re.finditer(x,r)
# for i in match:
#     print("yes starting with a")
#     print(i.start())
#     print(i.group())
#..................................................................................................................

x = 'a$'  #ending with a
r = 'aa abc cga baaa'
match = re.finditer(x, r)
for i in match:
    print("yes ending with a")
    print(i.start())
    print(i.group())
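# Expected output: one match ("yes ending with a", start 14, group "a")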
Example #32
def spliterator(text):
    return (x.group(0) for x in re.finditer(r"[A-Za-z,-]+", text))
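# Illustrative check (hypothetical input):
# >>> list(spliterator("Hello, world - and more"))
# ['Hello,', 'world', '-', 'and', 'more']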
Example #33
def getSimilarWordIndex(review, contextWord):
    return [m.start() for m in re.finditer(contextWord, review)] #find all index for matching word
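# Note: contextWord is used as a regex pattern here; wrap it in re.escape()
# if it should be matched literally.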
Example #34
 def parse(cls, raw):
   return [cls(raw, m.span(), raw[slice(*m.span(1))]) for m in \
     re.finditer(r'`(.+?)`', raw)]
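# Each match of `...` becomes cls(raw, (start, end), inner_text); m.span(1)
# is the span of group 1, so the surrounding backticks are stripped.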
Example #35
import re

print("Hello, World!")
fileName = "#{prj_code}_#{app_code}_#{brc_code}_docs_V#{rls_ver}.tar.gz"

reExpr = r"#\{\w*\}"
reIt = re.finditer(reExpr, fileName)

prjPara = {}
prjPara["prj_code"] = "jtwlwV8"
prjPara["sys_code"] = "billing"
prjPara["app_code"] = "billing"
prjPara["brc_code"] = "testage_jtwlwV8"

listSeg = []
lastPos = 0
for it in reIt:
	listSeg.append(fileName[lastPos:it.start()])
	print(" listSeg is:%s" %listSeg)
	varName = it.group()[2:-1]
	print( "varName is :%s" %varName)
	if varName in prjPara:
		listSeg.append(prjPara[varName])
	else:
		listSeg.append("#{%s}" % varName)
		print("varName : %s not found!!" %varName)
	print("lastPot : %s" %it.end())
	lastPos = it.end()

if lastPos < len(fileName):
    listSeg.append(fileName[lastPos:])
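To assemble the resolved file name from the collected segments (a small addition, not part of the original snippet):

result = "".join(listSeg)
print("resolved fileName is:%s" % result)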
Example #36
import re
s = 'alice@example.com,bob@example.cn'  # illustrative sample addresses
pattern = r'\S+@\S+\.(com|cn)'
print(re.findall(pattern, s))  # findall returns only the capture group: ['com', 'cn']
# regex = re.compile(pattern)
# l = regex.search(s)
l = re.finditer(pattern,s)


for i in l:
    print(i.group())  # finditer yields match objects; group() is the full match
# l = re.match(pattern,s)
# print(l)
Example #37
 def _extract_urls(webpage):
     return [
         mobj.group('url') for mobj in re.finditer(
             r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1',
             webpage)
     ]
Example #38
def read_config(config_file):
    global config

    if not os.path.isfile(config_file):
        exit("[!] missing configuration file '%s'" % config_file)
    else:
        print "[i] using configuration file '%s'" % config_file

    config.clear()

    try:
        array = None
        content = open(config_file, "rb").read()

        for line in content.split("\n"):
            line = line.strip('\r')
            line = re.sub(r"\s*#.*", "", line)
            if not line.strip():
                continue

            if line.count(' ') == 0:
                if re.search(r"[^\w]", line):
                    if array == "USERS":
                        exit("[!] invalid USERS entry '%s'\n[?] (hint: add whitespace at start of line)" % line)
                    else:
                        exit("[!] invalid configuration (line: '%s')" % line)
                array = line.upper()
                config[array] = []
                continue

            if array and line.startswith(' '):
                config[array].append(line.strip())
                continue
            else:
                array = None
                try:
                    name, value = line.strip().split(' ', 1)
                except ValueError:
                    name = line
                    value = ""
                finally:
                    name = name.strip().upper()
                    value = value.strip("'\"").strip()

            _ = os.environ.get("%s_%s" % (NAME.upper(), name))
            if _:
                value = _

            if any(name.startswith(_) for _ in ("USE_", "SET_", "CHECK_", "ENABLE_", "SHOW_", "DISABLE_")):
                value = value.lower() in ("1", "true")
            elif value.isdigit():
                value = int(value)
            else:
                for match in re.finditer(r"\$([A-Z0-9_]+)", value):
                    if match.group(1) in globals():
                        value = value.replace(match.group(0), str(globals()[match.group(1)]))
                    else:
                        value = value.replace(match.group(0), os.environ.get(match.group(1), match.group(0)))
                if name.endswith("_DIR"):
                    value = os.path.realpath(os.path.join(ROOT_DIR, os.path.expanduser(value)))

            config[name] = value

    except (IOError, OSError):
        pass

    for option in ("MONITOR_INTERFACE", "CAPTURE_BUFFER", "LOG_DIR"):
        if option not in config:
            exit("[!] missing mandatory option '%s' in configuration file '%s'" % (option, config_file))

    for entry in (config.USERS or []):
        if len(entry.split(':')) != 4:
            exit("[!] invalid USERS entry '%s'" % entry)
        if re.search(r"\$\d+\$", entry):
            exit("[!] invalid USERS entry '%s'\n[?] (hint: please update PBKDF2 hashes to SHA256 in your configuration file)" % entry)

    if config.SSL_PEM:
        config.SSL_PEM = config.SSL_PEM.replace('/', os.sep)

    if config.USER_WHITELIST:
        if ',' in config.USER_WHITELIST:
            print("[x] configuration value 'USER_WHITELIST' has been changed. Please use it to set location of whitelist file")
        elif not os.path.isfile(config.USER_WHITELIST):
            exit("[!] missing 'USER_WHITELIST' file '%s'" % config.USER_WHITELIST)
        else:
            read_whitelist()
            
    if config.USER_IGNORELIST:
        if not os.path.isfile(config.USER_IGNORELIST):
            exit("[!] missing 'USER_IGNORELIST' file '%s'" % config.USER_IGNORELIST)
        else:
            read_ignorelist()
            
    config.PROCESS_COUNT = int(config.PROCESS_COUNT or CPU_CORES)

    if config.USE_MULTIPROCESSING:
        print("[x] configuration switch 'USE_MULTIPROCESSING' is deprecated. Please use 'PROCESS_COUNT' instead")

    if config.DISABLE_LOCAL_LOG_STORAGE and not any((config.LOG_SERVER, config.SYSLOG_SERVER)):
        print("[x] configuration switch 'DISABLE_LOCAL_LOG_STORAGE' turned on and neither option 'LOG_SERVER' nor 'SYSLOG_SERVER' are set. Falling back to console output of event data")

    if config.UDP_ADDRESS is not None and config.UDP_PORT is None:
        exit("[!] usage of configuration value 'UDP_ADDRESS' requires also usage of 'UDP_PORT'")

    if config.UDP_ADDRESS is None and config.UDP_PORT is not None:
        exit("[!] usage of configuration value 'UDP_PORT' requires also usage of 'UDP_ADDRESS'")

    if not str(config.HTTP_PORT or "").isdigit():
        exit("[!] invalid configuration value for 'HTTP_PORT' ('%s')" % config.HTTP_PORT)

    if config.PROCESS_COUNT and subprocess.mswindows:
        print "[x] multiprocessing is currently not supported on Windows OS"
        config.PROCESS_COUNT = 1

    if config.CAPTURE_BUFFER:
        if str(config.CAPTURE_BUFFER or "").isdigit():
            config.CAPTURE_BUFFER = int(config.CAPTURE_BUFFER)
        elif re.search(r"\d+\s*[kKmMgG]B", config.CAPTURE_BUFFER):
            match = re.search(r"(\d+)\s*([kKmMgG])B", config.CAPTURE_BUFFER)
            config.CAPTURE_BUFFER = int(match.group(1)) * {"K": 1024, "M": 1024 ** 2, "G": 1024 ** 3}[match.group(2).upper()]
        elif re.search(r"\d+%", config.CAPTURE_BUFFER):
            physmem = _get_total_physmem()

            if physmem:
                config.CAPTURE_BUFFER = physmem * int(re.search(r"(\d+)%", config.CAPTURE_BUFFER).group(1)) / 100
            else:
                exit("[!] unable to determine total physical memory. Please use absolute value for 'CAPTURE_BUFFER'")
        else:
            exit("[!] invalid configuration value for 'CAPTURE_BUFFER' ('%s')" % config.CAPTURE_BUFFER)

        config.CAPTURE_BUFFER = config.CAPTURE_BUFFER / BLOCK_LENGTH * BLOCK_LENGTH

    if config.PROXY_ADDRESS:
        PROXIES.update({"http": config.PROXY_ADDRESS, "https": config.PROXY_ADDRESS})
        opener = urllib2.build_opener(urllib2.ProxyHandler(PROXIES))
        urllib2.install_opener(opener)

    if not config.TRAILS_FILE:
        config.TRAILS_FILE = DEFAULT_TRAILS_FILE
    else:
        config.TRAILS_FILE = os.path.abspath(os.path.expanduser(config.TRAILS_FILE))
Example #39
import json
import re
import ast
import numpy as np
import matplotlib.pyplot as plt
import operator
import collections

json_contents = open('output/msd_fit_categories0.2.txt', 'r').read()[1:-1]
json_contents_split = [int(a) for a in json_contents.split()]
cluster_sizes = sorted(collections.Counter(json_contents_split).items())

cluster_years = collections.defaultdict(list)
json_contents = open('output/song_groupings0.2.txt', 'r').read()
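# Each line looks like "<cluster>: [(..., <year>), ...]"; collect years per cluster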
for g in re.finditer(r'(\d{1,2}): \[.*?(\)\])', json_contents):
    cluster_num = g.group(1)
    for year in re.finditer(r', (\d{4})\),', g.group(0)):
        cluster_years[cluster_num].append(int(year.group(1)))

all_song_dists = {}
all_song_nums = {}
for key in cluster_years.keys():
    song_dist = cluster_years[key]
    all_songs_dists_raw = sorted(collections.Counter(song_dist).items())
    total_songs_num = sum([tup[1] for tup in all_songs_dists_raw])
    all_song_nums[key] = total_songs_num
    all_song_dists[key] = [(tup[0], float(tup[1]) / total_songs_num)
                           for tup in all_songs_dists_raw]

plt.switch_backend('agg')
for idx, key in enumerate(cluster_years):
Example #40
del mo1, mo2

mo1 = re.search("Hello", oneline_string)
mo2 = re.match("Hello", oneline_string)
print(mo1)
print(mo2)
del mo1, mo2

mo1 = re.search("Vien", oneline_string)
mo2 = re.search("Vien", multiline_string, re.M)
mo3 = re.findall("Vien", multiline_string)
print(mo1)
print(mo2)
print(mo3)

for mo4 in re.finditer("Vien", multiline_string):
    print(mo4)
    print(mo4.group())
    print(mo4.span())
del mo1, mo2, mo3, mo4

print("Hello Vien\nBye Vien")
print(r"Hello Vien\nBye Vien") #raw string doesn't process backslash as escape character

print(re.sub("Vien", "Van", multiline_string))

print(re.split("Vien", multiline_string))

mo = re.match("Hello (?P<name>\w+)\n.*\n.*(?P=name)",multiline_string)
print(mo)
print(mo.groups())
Example #41
# find a pattern without compilation
string1 = 'to Alice and Bob from'
print(re.search('to .* from', string1))
# prints a match object spanning the whole string


```
>>> re.findall("(\d+)", "07 23 32 32")
['07', '23', '32', '32']
```

```
>>> re.search("[123]","199")
<re.Match object; span=(0, 1), match='1'>
```

Empty if no match
```
>>> re.search("[123]","999")
```

```
>>> [m.start(0) for m in re.finditer("a", "abcabca")]
[0, 3, 6]
```

```
>>> re.sub("\s+", " ", "Good    morning")
"Good morning"
```
Example #42
def prettyName(class_name):
    return ' '.join(
        [x.group() for x in re.finditer('([A-Z])([a-z0-9]+)', class_name)])
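# Illustrative (hypothetical class name):
# >>> prettyName('FooBar2Baz')
# 'Foo Bar2 Baz'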
Example #43
def check_trailing_whitespace():
    stderr("checking trailing whitespace...")
    for mo in re.finditer(r'(?m)[ \t]+$', spec.text):
        posn = mo.start()
        msg_at_posn(posn, "trailing whitespace")
Example #44
    def do_get_colors(self):
        # Looks for fish_color_*.
        # Returns a list of dicts with the color name, description, and value.
        result = []

        # Make sure we return at least these
        remaining = set(
            [
                "normal",
                "error",
                "command",
                "end",
                "param",
                "comment",
                "match",
                "selection",
                "search_match",
                "operator",
                "escape",
                "quote",
                "redirection",
                "valid_path",
                "autosuggestion",
                "user",
                "host",
                "cancel",
            ]
        )

        # Here are our color descriptions
        descriptions = {
            "normal": "Default text",
            "command": "Ordinary commands",
            "quote": "Text within quotes",
            "redirection": "Like | and >",
            "end": "Like ; and &",
            "error": "Potential errors",
            "param": "Command parameters",
            "comment": "Comments start with #",
            "match": "Matching parenthesis",
            "selection": "Selected text",
            "search_match": "History searching",
            "history_current": "Directory history",
            "operator": "Like * and ~",
            "escape": "Escapes like \\n",
            "cwd": "Current directory",
            "cwd_root": "cwd for root user",
            "valid_path": "Valid paths",
            "autosuggestion": "Suggested completion",
            "user": "******",
            "host": "Hostname in the prompt",
            "cancel": "The ^C cancel indicator",
        }

        out, err = run_fish_cmd("set -L")
        for line in out.split("\n"):
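            # lines look like "fish_color_command 005fd7" (name, then value)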

            for match in re.finditer(r"^fish_color_(\S+) ?(.*)", line):
                color_name, color_value = [x.strip() for x in match.group(1, 2)]
                color_desc = descriptions.get(color_name, "")
                data = {"name": color_name, "description": color_desc}
                data.update(parse_color(color_value))
                result.append(data)
                remaining.discard(color_name)

        # Sort our result (by their keys)
        result.sort(key=operator.itemgetter("name"))

        # Ensure that we have all the color names we know about, so that if the
        # user deletes one he can still set it again via the web interface
        for color_name in remaining:
            color_desc = descriptions.get(color_name, "")
            data = {"name": color_name, "description": color_desc}
            data.update(parse_color(""))
            result.append(data)

        return result
Example #45
 def test_objecttypes(self):
     # check all types defined in Objects/
     calcsize = struct.calcsize
     size = test.support.calcobjsize
     vsize = test.support.calcvobjsize
     check = self.check_sizeof
     # bool
     check(True, vsize('') + self.longdigit)
     # buffer
     # XXX
     # builtin_function_or_method
     check(len, size('4P')) # XXX check layout
     # bytearray
     samples = [b'', b'u'*100000]
     for sample in samples:
         x = bytearray(sample)
         check(x, vsize('n2Pi') + x.__alloc__())
     # bytearray_iterator
     check(iter(bytearray()), size('nP'))
     # bytes
     check(b'', vsize('n') + 1)
     check(b'x' * 10, vsize('n') + 11)
     # cell
     def get_cell():
         x = 42
         def inner():
             return x
         return inner
     check(get_cell().__closure__[0], size('P'))
     # code
     def check_code_size(a, expected_size):
         self.assertGreaterEqual(sys.getsizeof(a), expected_size)
     check_code_size(get_cell().__code__, size('6i13P'))
     check_code_size(get_cell.__code__, size('6i13P'))
     def get_cell2(x):
         def inner():
             return x
         return inner
     check_code_size(get_cell2.__code__, size('6i13P') + calcsize('n'))
     # complex
     check(complex(0,1), size('2d'))
     # method_descriptor (descriptor object)
     check(str.lower, size('3PP'))
     # classmethod_descriptor (descriptor object)
     # XXX
     # member_descriptor (descriptor object)
     import datetime
     check(datetime.timedelta.days, size('3PP'))
     # getset_descriptor (descriptor object)
     import collections
     check(collections.defaultdict.default_factory, size('3PP'))
     # wrapper_descriptor (descriptor object)
     check(int.__add__, size('3P2P'))
     # method-wrapper (descriptor object)
     check({}.__iter__, size('2P'))
     # dict
     check({}, size('nQ2P') + calcsize('2nP2n') + 8 + (8*2//3)*calcsize('n2P'))
     longdict = {1:1, 2:2, 3:3, 4:4, 5:5, 6:6, 7:7, 8:8}
     check(longdict, size('nQ2P') + calcsize('2nP2n') + 16 + (16*2//3)*calcsize('n2P'))
     # dictionary-keyview
     check({}.keys(), size('P'))
     # dictionary-valueview
     check({}.values(), size('P'))
     # dictionary-itemview
     check({}.items(), size('P'))
     # dictionary iterator
     check(iter({}), size('P2nPn'))
     # dictionary-keyiterator
     check(iter({}.keys()), size('P2nPn'))
     # dictionary-valueiterator
     check(iter({}.values()), size('P2nPn'))
     # dictionary-itemiterator
     check(iter({}.items()), size('P2nPn'))
     # dictproxy
     class C(object): pass
     check(C.__dict__, size('P'))
     # BaseException
     check(BaseException(), size('5Pb'))
     # UnicodeEncodeError
     check(UnicodeEncodeError("", "", 0, 0, ""), size('5Pb 2P2nP'))
     # UnicodeDecodeError
     check(UnicodeDecodeError("", b"", 0, 0, ""), size('5Pb 2P2nP'))
     # UnicodeTranslateError
     check(UnicodeTranslateError("", 0, 1, ""), size('5Pb 2P2nP'))
     # ellipses
     check(Ellipsis, size(''))
     # EncodingMap
     import codecs, encodings.iso8859_3
     x = codecs.charmap_build(encodings.iso8859_3.decoding_table)
     check(x, size('32B2iB'))
     # enumerate
     check(enumerate([]), size('n3P'))
     # reverse
     check(reversed(''), size('nP'))
     # float
     check(float(0), size('d'))
     # sys.floatinfo
     check(sys.float_info, vsize('') + self.P * len(sys.float_info))
     # frame
     import inspect
     CO_MAXBLOCKS = 20
     x = inspect.currentframe()
     ncells = len(x.f_code.co_cellvars)
     nfrees = len(x.f_code.co_freevars)
     extras = x.f_code.co_stacksize + x.f_code.co_nlocals +\
               ncells + nfrees - 1
     check(x, vsize('12P3ic' + CO_MAXBLOCKS*'3i' + 'P' + extras*'P'))
     # function
     def func(): pass
     check(func, size('12P'))
     class c():
         @staticmethod
         def foo():
             pass
         @classmethod
         def bar(cls):
             pass
         # staticmethod
         check(foo, size('PP'))
         # classmethod
         check(bar, size('PP'))
     # generator
     def get_gen(): yield 1
     check(get_gen(), size('Pb2PPP'))
     # iterator
     check(iter('abc'), size('lP'))
     # callable-iterator
     import re
     check(re.finditer('',''), size('2P'))
     # list
     samples = [[], [1,2,3], ['1', '2', '3']]
     for sample in samples:
         check(sample, vsize('Pn') + len(sample)*self.P)
     # sortwrapper (list)
     # XXX
     # cmpwrapper (list)
     # XXX
     # listiterator (list)
     check(iter([]), size('lP'))
     # listreverseiterator (list)
     check(reversed([]), size('nP'))
     # int
     check(0, vsize(''))
     check(1, vsize('') + self.longdigit)
     check(-1, vsize('') + self.longdigit)
     PyLong_BASE = 2**sys.int_info.bits_per_digit
     check(int(PyLong_BASE), vsize('') + 2*self.longdigit)
     check(int(PyLong_BASE**2-1), vsize('') + 2*self.longdigit)
     check(int(PyLong_BASE**2), vsize('') + 3*self.longdigit)
     # module
     check(unittest, size('PnPPP'))
     # None
     check(None, size(''))
     # NotImplementedType
     check(NotImplemented, size(''))
     # object
     check(object(), size(''))
     # property (descriptor object)
     class C(object):
         def getx(self): return self.__x
         def setx(self, value): self.__x = value
         def delx(self): del self.__x
         x = property(getx, setx, delx, "")
         check(x, size('4Pi'))
     # PyCapsule
     # XXX
     # rangeiterator
     check(iter(range(1)), size('4l'))
     # reverse
     check(reversed(''), size('nP'))
     # range
     check(range(1), size('4P'))
     check(range(66000), size('4P'))
     # set
     # frozenset
     PySet_MINSIZE = 8
     samples = [[], range(10), range(50)]
     s = size('3nP' + PySet_MINSIZE*'nP' + '2nP')
     for sample in samples:
         minused = len(sample)
         if minused == 0: tmp = 1
         # the computation of minused is actually a bit more complicated
         # but this suffices for the sizeof test
         minused = minused*2
         newsize = PySet_MINSIZE
         while newsize <= minused:
             newsize = newsize << 1
         if newsize <= 8:
             check(set(sample), s)
             check(frozenset(sample), s)
         else:
             check(set(sample), s + newsize*calcsize('nP'))
             check(frozenset(sample), s + newsize*calcsize('nP'))
     # setiterator
     check(iter(set()), size('P3n'))
     # slice
     check(slice(0), size('3P'))
     # super
     check(super(int), size('3P'))
     # tuple
     check((), vsize(''))
     check((1,2,3), vsize('') + 3*self.P)
     # type
     # static type: PyTypeObject
     fmt = 'P2n15Pl4Pn9Pn11PIP'
     if hasattr(sys, 'getcounts'):
         fmt += '3n2P'
     s = vsize(fmt)
     check(int, s)
     s = vsize(fmt +                 # PyTypeObject
               '3P'                  # PyAsyncMethods
               '36P'                 # PyNumberMethods
               '3P'                  # PyMappingMethods
               '10P'                 # PySequenceMethods
               '2P'                  # PyBufferProcs
               '4P')
     # Separate block for PyDictKeysObject with 8 keys and 5 entries
     s += calcsize("2nP2n") + 8 + 5*calcsize("n2P")
     # class
     class newstyleclass(object): pass
     check(newstyleclass, s)
     # dict with shared keys
     check(newstyleclass().__dict__, size('nQ2P' + '2nP2n'))
     # unicode
     # each tuple contains a string and its expected character size
     # don't put any static strings here, as they may contain
     # wchar_t or UTF-8 representations
     samples = ['1'*100, '\xff'*50,
                '\u0100'*40, '\uffff'*100,
                '\U00010000'*30, '\U0010ffff'*100]
     asciifields = "nnbP"
     compactfields = asciifields + "nPn"
     unicodefields = compactfields + "P"
     for s in samples:
         maxchar = ord(max(s))
         if maxchar < 128:
             L = size(asciifields) + len(s) + 1
         elif maxchar < 256:
             L = size(compactfields) + len(s) + 1
         elif maxchar < 65536:
             L = size(compactfields) + 2*(len(s) + 1)
         else:
             L = size(compactfields) + 4*(len(s) + 1)
         check(s, L)
     # verify that the UTF-8 size is accounted for
     s = chr(0x4000)   # 4 bytes canonical representation
     check(s, size(compactfields) + 4)
     # compile() will trigger the generation of the UTF-8
     # representation as a side effect
     compile(s, "<stdin>", "eval")
     check(s, size(compactfields) + 4 + 4)
     # TODO: add check that forces the presence of wchar_t representation
     # TODO: add check that forces layout of unicodefields
     # weakref
     import weakref
     check(weakref.ref(int), size('2Pn2P'))
     # weakproxy
     # XXX
     # weakcallableproxy
     check(weakref.proxy(int), size('2Pn2P'))
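For reference, the format strings passed to size() and vsize() are struct module codes; a quick sketch of the underlying arithmetic (sizes are platform-dependent):

import struct

# 'n' = Py_ssize_t, 'P' = pointer, 'i' = int
print(struct.calcsize('n2Pi'))  # 28 on a typical 64-bit build
# test.support.calcobjsize prepends the fixed PyObject header ('nP') to such
# a format, and calcvobjsize the PyVarObject header ('nPn'), before calcsize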
Beispiel #46
0
    def gather_def_ids(node):
        if 'id' in node.attrs:
            defid = node.attrs['id']

            # ----------
            # no duplicate ids, of course

            if defid in node_with_id_:
                msg_at_node(node, f"duplicate id: '{defid}'")

            node_with_id_[defid] = node

            # ----------
            # id should begin with "(sec|eqn|figure|table)-"
            # if and only if the node is of certain kinds.

            id_prefix_expectation = {
              'emu-intro' : 'sec-',
              'emu-clause': 'sec-',
              'emu-annex' : 'sec-',
              'emu-eqn'   : 'eqn-',
              'emu-figure': 'figure-',
              'emu-table' : 'table-',
            }.get(node.element_name, None)
            if id_prefix_expectation:
                if not defid.startswith(id_prefix_expectation):
                    msg_at_node(node, f'Expected the id to start with "{id_prefix_expectation}"')
            else:
                if (False
                    or defid.startswith('sec-')
                    or defid.startswith('eqn-')
                    or defid.startswith('figure-')
                    or defid.startswith('table-')
                ):
                    msg_at_node(node, f'Did not expect the id to start that way')

            # ----------
            # If an element defines an abstract operation,
            # its id should be ...

            if 'aoid' in node.attrs:
                # TODO: After the merge of #545, most abstract ops don't have an 'aoid' attribute;
                # instead it's generated at 'render' time.
                # (But SDOs, emu-eqns, and a few others do, so this code is still being executed,
                # just not as much as we want.)
                aoid = node.attrs['aoid']
                assert node.element_name in ['emu-clause', 'emu-annex', 'emu-eqn', 'dfn']
                if id_prefix_expectation is None:
                    id_prefix_expectation = '' # for thisFooValue, was 'sec-' until PR 2103
                possibles = [
                    id_prefix_expectation + aoid.lower().replace(' ', '-').replace('::', '-'),
                    id_prefix_expectation + aoid,
                    id_prefix_expectation + kebab(aoid),
                    id_prefix_expectation + 'static-semantics-' + aoid.lower(),
                    id_prefix_expectation + 'runtime-semantics-' + aoid.lower(),
                ]
                if defid not in possibles:
                    msg_at_node(node, f'Expected id="{possibles[0]}"')

        if node.element_name == 'emu-alg':
            for mo in re.finditer(r' \[(\w+)="([^"]+)"\]', node.inner_source_text()):
                assert mo.group(1) == 'id'
                defid = mo.group(2)

                # ----------
                # no duplicate ids

                if defid in node_with_id_:
                    msg_at_node(node, f"duplicate id: '{defid}'")

                node_with_id_[defid] = node
                # XXX Should really be the node that will later be constructed
                # for the step in which this step_attribute occurs.

                # ----------
                # id should begin with "step-"

                assert defid.startswith('step-')

        if 'oldids' in node.attrs:
            for oldid in node.attrs['oldids'].split(','):
                assert oldid not in all_oldids
                all_oldids.add(oldid)

        for child in node.children:
            gather_def_ids(child)
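The step-attribute scan above can be tried on a one-line emu-alg body (the algorithm text is invented):

import re

text = '1. Let _r_ be the result. [id="step-example"]'
for mo in re.finditer(r' \[(\w+)="([^"]+)"\]', text):
    print(mo.group(1), mo.group(2))  # id step-example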
Beispiel #47
0
def extract(
    mention_id ="text",
    sentence_text ="text",
    tokens ="text[]",
    begin_exp ="int",
    end_exp ="int",
    begin_explain ="int",
    end_explain ="int",
    sentence_source ="text[]",
    position_source ="text[]"
    ):
	
	# Vietnamese function words ("if", "must", "that", "not", ...) that
	# disqualify a candidate mention when they appear near the expression span
	forbidden_word = ["nếu", "phải", "đó", "không", "được", "đã", "đồng_thời", "cần", "chỉ", "cụ_thể"]
	for i in range(2):
		if end_exp +2 +i <= end_explain:
			if handle_string.toLowerCase(tokens[end_exp+2+i]) in forbidden_word:
				yield [
				mention_id,
				-10,
				"forbidden_word_1"
				]
		if end_exp - i >= begin_exp:
			if handle_string.toLowerCase(tokens[end_exp-i]) in forbidden_word:
				yield [
				mention_id,
				-10,
				"forbidden_word_1"
				]
	if handle_string.toLowerCase(tokens[end_exp]) in forbidden_word:
		yield [
		mention_id,
		-1,
		"forbidden_word_2"
		]
	if ("nếu" in tokens[begin_exp:end_exp]) or ("Nếu" in tokens[begin_exp:end_exp]):
		yield [
		mention_id,
		-1,
		"forbidden_word_3"
		]
	# Recover the position field embedded in mention_id: the text after the
	# first '_' up to (and including) the second '_' from the end.
	i = len(mention_id) - 1
	first = False
	while(i>0) :
		if mention_id[i] == '_' and not first:
			first = True
			i -= 1
			continue
		if mention_id[i] == '_' and first:
			break
		i -= 1
	j = 0
	while(j<len(mention_id)) :
		if mention_id[j] == '_':
			break
		j += 1
	position_require = mention_id[j+1:i+1]
	index = 0
	for index in range(0,len(position_source)):
		if position_require in position_source[index] :
			# "Giải_thích từ_ngữ" ~ "Explanation of terms", a standard section
			# heading in Vietnamese legal documents
			if divlaw.lenIterator(re.finditer(r"Giải_thích\stừ_ngữ",sentence_source[index],re.U|re.I)) > 0 :
				yield [
					mention_id,
					1,
					"in_explain_words_law"
				]
Beispiel #48
0
import re

import pywikibot

site = pywikibot.Site("lv", "wikipedia")


def notify_Edgars():
    page = pywikibot.Page(site, "Dalībnieka diskusija:EdgarsBot")
    pagetext = page.get()
    # Latvian: "{{ping|Edgars2007}} two identical dates in the DYK prep area"
    pagetext += "\n\n{{ping|Edgars2007}} DYK sagatavē divi vienādi datumi --~~~~"
    page.text = pagetext
    page.save(summary="New error", botflag=True)


page = pywikibot.Page(site, 'Veidne:Vai tu zināji/Sagatave')
page_text = page.get()

all_dates = re.finditer(r'<!--(\d+)\. datums\n-->\|\d+=', page_text)

found_duplicate = False

all_date_matches = []

for match in all_dates:
    date = match.group(1)
    if date in all_date_matches:
        found_duplicate = True
        break
    all_date_matches.append(date)

if found_duplicate:
    notify_Edgars()
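The duplicate check is easy to exercise against a fabricated sagatave snippet (the wikitext below is invented, not taken from the live template):

import re

sample = "<!--1. datums\n-->|1=A\n<!--2. datums\n-->|2=B\n<!--1. datums\n-->|3=C"
dates = [m.group(1) for m in re.finditer(r'<!--(\d+)\. datums\n-->\|\d+=', sample)]
print(dates)                          # ['1', '2', '1']
print(len(dates) != len(set(dates)))  # True -> a duplicate date exists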
Beispiel #49
0
def print_HOR_read(r, show_cenpbbxx=False):

    coords = [0] * len(r.hors)
    hv_str = ["."] * len(r.hors)

    for n, (_idx, _size, elem) in enumerate(r.hors):

        idx, size = int(_idx), int(_size)
        if r.ori == '+':
            b, e = r.mons[idx].begin, r.mons[idx + size - 1].end
            gap = 0 if idx == 0 else r.mons[idx].begin - r.mons[
                idx - 1].end  # gap before me.
        else:
            b, e = r.mons[idx].end, r.mons[idx + size - 1].begin
            gap = 0 if idx == 0 else -(r.mons[idx].end - r.mons[idx - 1].begin)

        coords[n] = (b, e, gap)

        if elem[:5] == "M=HOR":
            hv_str[n] = "m" if gap < 100 else "M"
        elif elem[:3] == "M=M":
            hv_str[n] = "m" if gap < 100 else "M"
        elif elem[:3] == "Rev":
            hv_str[n] = "r" if gap < 100 else "R"
        elif elem[:2] == "M=":
            hv_str[n] = "a" if gap < 100 else "A"
        else:
            hv_str[n] = "h" if gap < 100 else "H"

    # search variant HOR candidates
    elems = [e for i, s, e in r.hors]
    hv_new_idx = functools.reduce(lambda x, y: x | y, [
        set(range(mt.start(1) + 1,
                  mt.end(1) - 1))
        for mt in re.finditer(r"(?=([hH]m+h))", "".join(hv_str))
    ], set())
    hv_new_hash = {
        mt.start(1) + 1:
        f"{hash(tuple(elems[ mt.start(1) + 1: mt.end(1) - 1 ])):x}"[-8:]
        for mt in re.finditer(r"(?=([hH]m+h))", "".join(hv_str))
    }

    for n, (_idx, _size, elem) in enumerate(r.hors):
        b, e, gap = coords[n]
        idx, size = int(_idx), int(_size)
        nvars = sum([len(m.monomer.snvs) for m in r.mons[idx:idx + size]])
        if n in hv_new_hash:
            print( f"{r.name}\t{b}\t{e}\t{idx}\t{size}\t{elem}\t" +\
                   f"{gap}\t{nvars}\t{100.0*nvars/abs(e-b):.2f}\t" +\
                   "new=" + hv_new_hash[n])
        else:
            print( f"{r.name}\t{b}\t{e}\t{idx}\t{size}\t{elem}\t" +\
                   f"{gap}\t{nvars}\t{100.0*nvars/abs(e-b):.2f}\t" +\
                   ("new" if n in hv_new_idx else "."))

    print("")

    # 5-mer
    kmer = [
        tuple(elems[mt.start(1):mt.end(1)])
        for mt in re.finditer(r"(?=([hH]hhhh))", "".join(hv_str))
    ]
    cand = [
        tuple(elems[mt.start(1) + 1:mt.end(1) - 1])
        for mt in re.finditer(r"(?=([hH]m+h))", "".join(hv_str))
    ]

    return kmer, cand
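The (?=(...)) patterns above use a zero-width lookahead so that runs sharing their boundary 'h' are all found; plain matching would consume the shared character. A quick illustration on a hand-made string:

import re

hv = "hmmhmh"
print([(m.start(1), m.group(1))
       for m in re.finditer(r"(?=([hH]m+h))", hv)])
# [(0, 'hmmh'), (3, 'hmh')] -- the 'h' at index 3 closes one run and opens the next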
Beispiel #50
0
    def interpret_expression(self, expr, local_vars, allow_recursion):
        expr = expr.strip()

        if expr == '':  # Empty expression
            return None

        if expr.startswith('('):
            parens_count = 0
            for m in re.finditer(r'[()]', expr):
                if m.group(0) == '(':
                    parens_count += 1
                else:
                    parens_count -= 1
                    if parens_count == 0:
                        sub_expr = expr[1:m.start()]
                        sub_result = self.interpret_expression(
                            sub_expr, local_vars, allow_recursion,
                        )
                        remaining_expr = expr[m.end():].strip()
                        if not remaining_expr:
                            return sub_result
                        else:
                            expr = json.dumps(sub_result) + remaining_expr
                        break
            else:
                raise ExtractorError('Premature end of parens in %r' % expr)

        for op, opfunc in _ASSIGN_OPERATORS:
            m = re.match(
                r'''(?x)
                (?P<out>%s)(?:\[(?P<index>[^\]]+?)\])?
                \s*%s
                (?P<expr>.*)$''' % (_NAME_RE, re.escape(op)), expr,
            )
            if not m:
                continue
            right_val = self.interpret_expression(
                m.group('expr'), local_vars, allow_recursion - 1,
            )

            if m.groupdict().get('index'):
                lvar = local_vars[m.group('out')]
                idx = self.interpret_expression(
                    m.group('index'), local_vars, allow_recursion,
                )
                assert isinstance(idx, int)
                cur = lvar[idx]
                val = opfunc(cur, right_val)
                lvar[idx] = val
                return val
            else:
                cur = local_vars.get(m.group('out'))
                val = opfunc(cur, right_val)
                local_vars[m.group('out')] = val
                return val

        if expr.isdigit():
            return int(expr)

        var_m = re.match(
            r'(?!if|return|true|false)(?P<name>%s)$' % _NAME_RE,
            expr,
        )
        if var_m:
            return local_vars[var_m.group('name')]

        try:
            return json.loads(expr)
        except ValueError:
            pass

        m = re.match(
            r'(?P<var>%s)\.(?P<member>[^(]+)'
            r'(?:\(+(?P<args>[^()]*)\))?$' % _NAME_RE,
            expr,
        )
        if m:
            variable = m.group('var')
            member = m.group('member')
            arg_str = m.group('args')

            if variable in local_vars:
                obj = local_vars[variable]
            else:
                if variable not in self._objects:
                    self._objects[variable] = self.extract_object(variable)
                obj = self._objects[variable]

            if arg_str is None:
                # Member access
                if member == 'length':
                    return len(obj)
                return obj[member]

            assert expr.endswith(')')
            # Function call
            if arg_str == '':
                argvals = tuple()
            else:
                argvals = tuple([
                    self.interpret_expression(v, local_vars, allow_recursion)
                    for v in arg_str.split(',')
                ])

            if member == 'split':
                assert argvals == ('',)
                return list(obj)
            if member == 'join':
                assert len(argvals) == 1
                return argvals[0].join(obj)
            if member == 'reverse':
                assert len(argvals) == 0
                obj.reverse()
                return obj
            if member == 'slice':
                assert len(argvals) == 1
                return obj[argvals[0]:]
            if member == 'splice':
                assert isinstance(obj, list)
                index, howMany = argvals
                res = []
                for i in range(index, min(index + howMany, len(obj))):
                    res.append(obj.pop(index))
                return res

            return obj[member](argvals)

        m = re.match(
            r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr,
        )
        if m:
            val = local_vars[m.group('in')]
            idx = self.interpret_expression(
                m.group('idx'), local_vars, allow_recursion - 1,
            )
            return val[idx]

        for op, opfunc in _OPERATORS:
            m = re.match(r'(?P<x>.+?)%s(?P<y>.+)' % re.escape(op), expr)
            if not m:
                continue
            x, abort = self.interpret_statement(
                m.group('x'), local_vars, allow_recursion - 1,
            )
            if abort:
                raise ExtractorError(
                    'Premature left-side return of %s in %r' % (op, expr),
                )
            y, abort = self.interpret_statement(
                m.group('y'), local_vars, allow_recursion - 1,
            )
            if abort:
                raise ExtractorError(
                    'Premature right-side return of %s in %r' % (op, expr),
                )
            return opfunc(x, y)

        m = re.match(
            r'^(?P<func>%s)\((?P<args>[a-zA-Z0-9_$,]+)\)$' % _NAME_RE, expr,
        )
        if m:
            fname = m.group('func')
            argvals = tuple([
                int(v) if v.isdigit() else local_vars[v]
                for v in m.group('args').split(',')
            ])
            if fname not in self._functions:
                self._functions[fname] = self.extract_function(fname)
            return self._functions[fname](argvals)

        raise ExtractorError('Unsupported JS expression %r' % expr)
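The balanced-parenthesis scan at the top of interpret_expression works standalone; a minimal sketch of the same counting idea (the expression is made up):

import re

expr = '(a+(b*c))+d'
parens_count = 0
for m in re.finditer(r'[()]', expr):
    parens_count += 1 if m.group(0) == '(' else -1
    if parens_count == 0:
        print(expr[1:m.start()], '|', expr[m.end():])  # a+(b*c) | +d
        break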
Beispiel #51
0
def find_suffix(besedilo, vzorec):
    return {
        x.group(0)
        for x in re.finditer(r'\b\w*?' + vzorec + r'\b', besedilo)
    }
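(besedilo and vzorec are Slovenian for 'text' and 'pattern'.) Because of the leading \b, each match is a whole word that ends in vzorec; for example (sample text invented):

print(find_suffix('batch catch matching', 'atch'))
# {'batch', 'catch'} -- 'matching' contains but does not end with 'atch'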
Beispiel #52
0
def preminify_markdown(lines, rem_log):
    '''
    We can save a few extra KBs by doing some targeted minification on
    markdown.js that our minification tools would otherwise overlook
    '''

    # start with direct replacement for the State enum
    lines = minify_markdown_state_enum(lines)

    # stateToStr takes up a lot of space when it probably doesn't have to for the minified version
    # Save several hundred bytes by removing it
    start = lines.find('const stateToStr')
    if start != -1:
        # find the closing brace of the second block; compare against -1
        # *before* offsetting, otherwise the guard can never fail
        end = lines.find('}', lines.find('}', start) + 1)
        if end != -1:
            lines = lines.replace(lines[start:end + 1],
                                  'let stateToStr = (state) => state;')

    if rem_log != 0:
        start = lines.find('const shouldLogTmi = ')
        if start == -1:
            print('WARN: could not find "shouldLogTmi" for removal')
        else:
            end = lines.find('}\n', start) + 2
            lines = lines.replace(lines[start:end], '')

    # Check whether we should remove TMI logging. There is a
    # separate flag for markdown-specific removal, since it's especially noisy
    if rem_log == 4:
        lines = re.sub(r'(\/\*@__PURE__\*\/)?\blogTmi\(.*\); *\n', '', lines)

    # currentRun is used quite a bit
    lines = re.sub(r'\bthis\.currentRun\b', 'this.' + next_var(), lines)

    # State of a run - Disabled because other classes use .state
    # lines = re.sub(r'\.state\b', '.' + next_var(), lines)

    # Inner runs
    lines = re.sub(r'\binnerRuns\b', next_var(), lines)

    # Run methods
    lines = re.sub(r'\bstartContextLength\b', next_var(), lines)
    lines = re.sub(r'\bendContextLength\b', next_var(), lines)
    lines = re.sub(r'\btransform\b', next_var(), lines)

    # Now look for things that are very method-like.
    for match in re.finditer(r'\n    (_\w+)\(', lines):
        cur_var = next_var()
        if cur_var == '':
            return lines
        lines = re.sub(r'\b' + match.group(1) + r'\b', cur_var, lines)

    # TODO: this.text is _very_ heavily used, but multiple classes
    # use this, so it breaks things.

    # Now getting real hacky. Modify String's prototype to save a hundred bytes or so
    # Some should already be done if 'ultra' is set, so don't do anything in that case
    lines = re.sub(r'\.indexOf\(', '.i(', lines)
    lines = 'String.prototype.i = String.prototype.indexOf;\n' + lines
    lines = 'Array.prototype.i = Array.prototype.indexOf;\n' + lines

    lines = re.sub(r'\.substring\(', '.s(', lines)
    lines = 'String.prototype.s = String.prototype.substring;\n' + lines

    return lines
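The prototype-aliasing trick at the end is a plain textual rewrite plus a one-line shim; a minimal sketch on a toy JS fragment (input string invented):

import re

js = "if (s.indexOf('#') === 0) { return s.substring(1); }"
js = re.sub(r'\.indexOf\(', '.i(', js)
js = re.sub(r'\.substring\(', '.s(', js)
js = ("String.prototype.i = String.prototype.indexOf;\n"
      "String.prototype.s = String.prototype.substring;\n") + js
print(js)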
Beispiel #53
0
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver


def webScraping(CSVPath):

    #  Connect to Browser
    DRIVER_PATH = "C:/Users/filip/Documents/PythonFiles/chromedriver"
    browser = webdriver.Chrome(DRIVER_PATH)

    # Access website
    URL = "https://www.imdb.com/"
    browser.get(URL)
    # Give the browser time to load all content.
    # time.sleep(2)

    # Click on Menu button
    menuButton = browser.find_element_by_css_selector(
        '#imdbHeader-navDrawerOpen--desktop')
    menuButton.click()
    time.sleep(2)

    # Select Most Popular Movies from Menu items
    mostPopMovies = browser.find_element_by_css_selector(
        '#nav-link-categories-mov+ ._299G6wcz6LCpY_QFQJtc76 .ipc-list__item--indent-one:nth-child(4)'
    )
    mostPopMovies.click()
    time.sleep(2)

    # Sort by IMDB rating descending, so I don't collect movies without IMDB ratings.
    sortByButton = browser.find_element_by_css_selector(
        "#lister-sort-by-options [value='ir:descending']")
    sortByButton.click()

    # Extract selected contents
    mainInfoList = browser.find_elements_by_css_selector('.titleColumn')
    ratingList = browser.find_elements_by_css_selector('.imdbRating')

    # Create DataFrame to store result
    df = pd.DataFrame(columns=['Title', 'Year', 'Ranking', 'Rating'])

    # Loop through 87 elements (excluding movies without rating)
    for i in range(87):
        # Prepare overall text result
        start = mainInfoList[i].get_attribute('innerHTML')
        # Beautiful soup allows us to remove HTML tags from our content, if it exists.
        soup = BeautifulSoup(start, features="lxml")
        # Remove leading and trailing whitespaces
        rawString = soup.get_text().strip()
        # Remove hidden characters for tabs and new lines.
        rawString = re.sub(
            r"[\n\t]*", "",
            rawString)  # re.sub(pattern, repl, string, count=0, flags=0)
        # Replace(remove) two or more consecutive empty spaces with ''
        rawString = re.sub('[ ]{2,}', '', rawString)

        # Extract TITLE
        titleCutOff = rawString.index('(')
        title = rawString[:titleCutOff]
        # title.strip()

        # Extract YEAR
        yearCutOff = rawString.index(')')  # RANKING after this point
        year = rawString[titleCutOff + 1:yearCutOff]

        # Extract RANKING
        # Here I find the second (, which determines the end index for Ranking
        rankingCutOff = [m.start() for m in re.finditer(r"\(", rawString)][1]
        ranking = rawString[yearCutOff + 1:rankingCutOff]

        # Extract RATING
        start = ratingList[i].get_attribute('innerHTML')
        # Beautiful soup allows us to remove HTML tags from our content if it exists.
        soup = BeautifulSoup(start, features="lxml")
        rating = soup.get_text().strip(
        )  # Leading and trailing whitespaces are removed

        # Adding info on a Data Frame
        moviesInfo = {
            'Title': title,
            'Year': year,
            'Ranking': ranking,
            'Rating': rating
        }
        df = df.append(moviesInfo, ignore_index=True)

    # Show all columns.
    pd.set_option('display.max_columns', None)
    # Show all rows.
    pd.set_option('display.max_rows', None)
    # Increase number of columns that display on one line.
    pd.set_option('display.width', 1000)
    print("\nTop 87 Movies listed in descending order of IMDB Rating")
    print(df)

    # Saving results in a CSV file
    df.to_csv(CSVPath)
Beispiel #54
0
def double_letters(besedilo):
    return {x.group(0) for x in re.finditer(r'\w*(.)\1\w+', besedilo)}
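A quick check on an invented sentence shows what the backreference picks up:

print(double_letters('letter book difference'))
# {'letter', 'book', 'difference'}
# note: a word *ending* in its double letter (e.g. 'fizz') is not matched,
# because the trailing \w+ still needs at least one more character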
Beispiel #55
0
def _generateTriggerFromPattern(pattern, template, template_numeric_dict):
    crit_value_list = sorted(template_numeric_dict[template])
    prefix_crit = crit_value_list[:-1]
    suffix_crit = crit_value_list[1:]
    mid_crit = [(v1 + v2) / 2 for v1, v2 in zip(suffix_crit, prefix_crit)]
    enhanced_crit_value_list = sorted(
        mid_crit + [crit_value_list[0] - 1, crit_value_list[-1] + 1] +
        crit_value_list)
    enhanced_crit_value_str_list = [
        str(v).replace('.', '_') for v in enhanced_crit_value_list
    ]
    hit_pattern_list = [m.group(0) for m in re.finditer(r'(\d)\1*', pattern)]

    cover_list = list()
    start_index = 0
    for hit_pattern in hit_pattern_list:
        if '1' in hit_pattern:
            cover_list.append((start_index, start_index + len(hit_pattern)))
        start_index = start_index + len(hit_pattern)

    statement_list = list()
    for cover in cover_list:
        if cover[0] != 0 and cover[1] != len(pattern):
            # raise Exception('this pattern %s is not supported' % pattern)
            # ugly fix:
            if cover[1] % 2 != 0:
                # non-critical value, "<"
                upper_str = '<' + enhanced_crit_value_str_list[
                    cover[1]].replace('_', '.')
            else:
                # critical value, "<="
                upper_str = '<=' + enhanced_crit_value_str_list[cover[1] -
                                                                1].replace(
                                                                    '_', '.')
            if cover[0] % 2 == 0:
                # non-critical value, ">"
                lower_str = enhanced_crit_value_str_list[cover[0] - 1].replace(
                    '_', '.') + '<'
            else:
                # critical value, ">="
                lower_str = enhanced_crit_value_str_list[cover[0]].replace(
                    '_', '.') + '<='
            statement_list.append('%s%s%s' % (lower_str, template, upper_str))
        elif cover[0] == 0 and cover[1] != len(pattern):
            if cover[1] % 2 != 0:
                # non-critical value, "<"
                statement_list.append(
                    '%s<%s' %
                    (template, enhanced_crit_value_str_list[cover[1]].replace(
                        '_', '.')))
            else:
                # critical value, "<="
                statement_list.append(
                    '%s<=%s' %
                    (template,
                     enhanced_crit_value_str_list[cover[1] - 1].replace(
                         '_', '.')))
                # statement_list.append('%s=%s' % (template, enhanced_crit_value_str_list[cover[1]-1].replace('_', '.')))
        elif cover[0] != 0 and cover[1] == len(pattern):
            if cover[0] % 2 == 0:
                # non-critical value, ">"
                statement_list.append(
                    '%s>%s' %
                    (template,
                     enhanced_crit_value_str_list[cover[0] - 1].replace(
                         '_', '.')))
            else:
                # critical value, ">="
                statement_list.append(
                    '%s>=%s' %
                    (template, enhanced_crit_value_str_list[cover[0]].replace(
                        '_', '.')))
                # statement_list.append('%s=%s' % (template, enhanced_crit_value_str_list[cover[0]].replace('_', '.')))
        elif cover[0] == 0 and cover[1] == len(pattern):
            statement_list.append('%s changed' % template)

    return statement_list
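The index arithmetic is easier to follow with a worked call (template name and values invented). With critical values [1.0, 2.0, 3.0], enhanced_crit_value_list becomes [0.0, 1.0, 1.5, 2.0, 2.5, 3.0, 4.0]: even indices are the open intervals, odd indices the critical values themselves. Marking positions 2-4 of the 7-character pattern therefore produces a range trigger:

print(_generateTriggerFromPattern('0011100', 'cpu', {'cpu': [1.0, 2.0, 3.0]}))
# ['1.0<cpu<3.0']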
Beispiel #56
0
    def __init__(self, source_root, url_base, doc_root, list_outputs_only):
        self.source_root = source_root
        self.url_base = url_base
        self.doc_root = doc_root
        self.list_outputs_only = list_outputs_only

        self.source_url_base = self.url_base + 'SourceControl/latest#'
        self.wiki_url_base = self.url_base + 'wikipage?title='
        self.issue_url_base = self.url_base + 'workitem/'

        try:
            with open('maps.cache', 'rb') as f:
                self.file_map, self.type_map = pickle.load(f)
            return
        except Exception:
            pass

        if self.list_outputs_only:
            self.file_map = None
            self.type_map = None
            return

        print('Creating file maps')
        file_map = {}
        type_map = {}
        for dirname, dirnames, filenames in os.walk(source_root):
            for filename in filenames:
                # dirname from os.walk is already rooted at source_root
                fullpath = os.path.join(dirname, filename)
                urlpath = fullpath[len(source_root):].lstrip('\\').replace('\\', '/')

                nameonly = os.path.split(fullpath)[1]
                if nameonly in file_map:
                    file_map[nameonly] = None
                else:
                    file_map[nameonly] = urlpath

                if not filename.upper().endswith(('.PY', '.CS')):
                    continue

                try:
                    with open(fullpath, 'r', encoding='utf-8-sig') as f:
                        content = f.read()
                except UnicodeDecodeError:
                    #print('Cannot read {}'.format(filename))
                    continue
                
                nsname = None
                if filename.upper().endswith('.PY'):
                    nsname = os.path.splitext(filename)[0]

                for match in re.finditer(r'(namespace|class|struct|enum|interface) ([\w\.]+)', content):
                    kind, name = match.groups()
                    if kind == 'namespace':
                        nsname = name
                    elif nsname:
                        type_map[nsname + '.' + name] = urlpath
        
        try:
            with open('maps.cache', 'wb') as f:
                pickle.dump((file_map, type_map), f, pickle.HIGHEST_PROTOCOL)
        except Exception:
            pass
        self.file_map = file_map
        self.type_map = type_map
Beispiel #57
0
import os
import glob
import re
from shutil import copyfile, move

# indir = os.path.join('datasets', 'nude')
indir = os.path.join('datasets', 'full')

for root, dirs, files in os.walk(indir):

    for d in dirs:
        dirpath = os.path.join(root, d)

        for fname in glob.glob(dirpath + '/*.jpg'):
            # source file
            src = fname

            # destination file: keep the first three path components
            # (e.g. datasets/full/<class>) and append the bare file name,
            # flattening any deeper nesting
            slash_list = [m.start() for m in re.finditer(r"/", fname)]
            slash_3rd_pos = slash_list[2]
            dst = fname[:slash_3rd_pos] + fname[fname.rfind('/'):]

            # move
            move(src, dst)

        if len(os.listdir(dirpath)) == 0:
            os.rmdir(dirpath)
Beispiel #58
0
print('Starts with x or y and ends with e:')
for gene in gene_names:
    if re.search(r'^(x|y)', gene) and re.search(r'e$', gene):
        print(gene)

print('\nContains three or more numbers in a row:')
for gene in gene_names:
    if re.search(r'[0-9]{3,100}', gene):  # no space in {3,100}
        print(gene)

print('\nEnds with d followed by a, r or p:')
for gene in gene_names:
    if re.search(r'd[arp]$', gene):  # [a|r|p] would also match a literal '|'
        print(gene)

# Double digest
dna = open('dna.txt').read().rstrip('\n')

cuts = [0]

for match in re.finditer(r"A[ATGC]TAAT", dna):
    cuts.append(match.start() + 3)

for match in re.finditer(r"GC[AG][AT]TG", dna):
    cuts.append(match.start() + 4)

cuts.append(len(dna))
cuts.sort()

print('\nFragment lengths:\n')
for i in range(1, len(cuts)):
    print(cuts[i] - cuts[i - 1])
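To see why 3 and 4 are added to the match offsets, a tiny hand-checked digest (the sequence is made up):

import re

dna = "TTAGTAATCCGCGATGTT"
cuts = [0]
for m in re.finditer(r"A[ATGC]TAAT", dna):
    cuts.append(m.start() + 3)   # this enzyme cuts 3 bases into its site
for m in re.finditer(r"GC[AG][AT]TG", dna):
    cuts.append(m.start() + 4)   # this one cuts 4 bases in
cuts.append(len(dna))
cuts.sort()
print(cuts)                                                  # [0, 5, 14, 18]
print([cuts[i] - cuts[i - 1] for i in range(1, len(cuts))])  # [5, 9, 4]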
Beispiel #59
0
        def _events(self, params):
            session = self.get_session()

            if session is None:
                self.send_response(_http_client.UNAUTHORIZED)
                self.send_header(HTTP_HEADER.CONNECTION, "close")
                return None

            start, end, size, total = None, None, -1, None
            content = None
            log_exists = False
            dates = params.get("date", "")

            if ".." in dates:
                pass
            elif '_' not in dates:
                try:
                    date = datetime.datetime.strptime(dates, "%Y-%m-%d").strftime("%Y-%m-%d")
                    event_log_path = os.path.join(config.LOG_DIR, "%s.log" % date)
                    if os.path.exists(event_log_path):
                        range_handle = open(event_log_path, "rb")
                        log_exists = True
                except ValueError:
                    print("[!] invalid date format in request")
                    log_exists = False
            else:
                logs_data = b""  # log files below are read in binary mode
                date_interval = dates.split("_", 1)
                try:
                    start_date = datetime.datetime.strptime(date_interval[0], "%Y-%m-%d").date()
                    end_date = datetime.datetime.strptime(date_interval[1], "%Y-%m-%d").date()
                    for i in range(int((end_date - start_date).days) + 1):
                        date = start_date + datetime.timedelta(i)
                        event_log_path = os.path.join(config.LOG_DIR, "%s.log" % date.strftime("%Y-%m-%d"))
                        if os.path.exists(event_log_path):
                            log_handle = open(event_log_path, "rb")
                            logs_data += log_handle.read()
                            log_handle.close()

                    range_handle = io.BytesIO(logs_data)
                    log_exists = True
                except ValueError:
                    print("[!] invalid date format in request")
                    log_exists = False

            if log_exists:
                range_handle.seek(0, 2)
                total = range_handle.tell()
                range_handle.seek(0)

                if self.headers.get(HTTP_HEADER.RANGE):
                    match = re.search(r"bytes=(\d+)-(\d+)", self.headers[HTTP_HEADER.RANGE])
                    if match:
                        start, end = int(match.group(1)), int(match.group(2))
                        max_size = end - start + 1
                        end = min(total - 1, end)
                        size = end - start + 1

                        if start == 0 or not session.range_handle:
                            session.range_handle = range_handle

                        if session.netfilters is None and not session.mask_custom:
                            session.range_handle.seek(start)
                            self.send_response(_http_client.PARTIAL_CONTENT)
                            self.send_header(HTTP_HEADER.CONNECTION, "close")
                            self.send_header(HTTP_HEADER.CONTENT_TYPE, "text/plain")
                            self.send_header(HTTP_HEADER.CONTENT_RANGE, "bytes %d-%d/%d" % (start, end, total))
                            content = session.range_handle.read(size)
                        else:
                            self.send_response(_http_client.OK)
                            self.send_header(HTTP_HEADER.CONNECTION, "close")
                            self.send_header(HTTP_HEADER.CONTENT_TYPE, "text/plain")

                            buffer, addresses, netmasks, regex = io.StringIO(), set(), [], ""
                            for netfilter in session.netfilters or []:
                                if not netfilter:
                                    continue
                                if '/' in netfilter:
                                    netmasks.append(netfilter)
                                elif re.search(r"\A[\d.]+\Z", netfilter):
                                    addresses.add(netfilter)
                                elif "\\." in netfilter:
                                    regex = r"\b(%s)\b" % netfilter
                                else:
                                    print("[!] invalid network filter '%s'" % netfilter)
                                    return

                            for line in session.range_handle:
                                display = session.netfilters is None
                                ip = None
                                line = line.decode(UNICODE_ENCODING, "ignore")

                                if regex:
                                    match = re.search(regex, line)
                                    if match:
                                        ip = match.group(1)
                                        display = True

                                if not display and (addresses or netmasks):
                                    for match in re.finditer(r"\b(\d+\.\d+\.\d+\.\d+)\b", line):
                                        if not display:
                                            ip = match.group(1)
                                        else:
                                            break

                                        if ip in addresses:
                                            display = True
                                            break
                                        elif netmasks:
                                            for _ in netmasks:
                                                prefix, mask = _.split('/')
                                                if addr_to_int(ip) & make_mask(int(mask)) == addr_to_int(prefix):
                                                    addresses.add(ip)
                                                    display = True
                                                    break

                                if session.mask_custom and "(custom)" in line:
                                    line = re.sub(r'("[^"]+"|[^ ]+) \(custom\)', "- (custom)", line)

                                if display:
                                    if ",%s" % ip in line or "%s," % ip in line:
                                        line = re.sub(r" ([\d.,]+,)?%s(,[\d.,]+)? " % re.escape(ip), " %s " % ip, line)
                                    buffer.write(line)
                                    if buffer.tell() >= max_size:
                                        break

                            content = buffer.getvalue()
                            end = start + len(content) - 1
                            self.send_header(HTTP_HEADER.CONTENT_RANGE, "bytes %d-%d/%d" % (start, end, end + 1 + max_size * (len(content) >= max_size)))

                        if len(content) < max_size:
                            session.range_handle.close()
                            session.range_handle = None

                if size == -1:
                    self.send_response(_http_client.OK)
                    self.send_header(HTTP_HEADER.CONNECTION, "close")
                    self.send_header(HTTP_HEADER.CONTENT_TYPE, "text/plain")
                    self.end_headers()

                    with range_handle as f:
                        while True:
                            data = f.read(io.DEFAULT_BUFFER_SIZE)
                            if not data:
                                break
                            else:
                                self.wfile.write(data)

            else:
                self.send_response(_http_client.OK)  # instead of _http_client.NO_CONTENT (compatibility reasons)
                self.send_header(HTTP_HEADER.CONNECTION, "close")
                if self.headers.get(HTTP_HEADER.RANGE):
                    self.send_header(HTTP_HEADER.CONTENT_RANGE, "bytes 0-0/0")

            return content
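addr_to_int and make_mask are helpers from the surrounding project; a minimal sketch of what such helpers typically look like (the two definitions below are assumptions, not the project's actual code):

import socket
import struct

def addr_to_int(ip):
    # hypothetical helper: dotted-quad string -> 32-bit integer
    return struct.unpack("!I", socket.inet_aton(ip))[0]

def make_mask(bits):
    # hypothetical helper: prefix length -> integer network mask
    return (0xffffffff << (32 - bits)) & 0xffffffff

# the CIDR membership test used in the filtering loop above
prefix, mask = "192.168.0.0/16".split('/')
print(addr_to_int("192.168.5.7") & make_mask(int(mask)) == addr_to_int(prefix))  # True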
Beispiel #60
0
import re

import rdkit
from rdkit import Chem


def get_mol_from_graph(line):

        x = line.split('\t')
        mol_smiles = x[1]
        mol_smiles = Chem.MolToSmiles(Chem.MolFromSmiles(mol_smiles), True)  # make sure this is canonical
        atom_info = x[2]
        bond_info = x[3]
        regex = r"\{(.*?)\}"
        matches = re.finditer(regex, atom_info, re.MULTILINE | re.DOTALL)
        atom_info_list = []
        for match in matches:
            # the pattern has exactly one group: the text between the braces
            atom_info_list.append(match.group(1))

        
        #create mol from the graph
        mol = Chem.RWMol()
        natoms = int(atom_info_list[0])
        node_to_idx = {}
        idx_to_info = {}
        for atom_index in range(natoms):
            atom_info = atom_info_list[atom_index + 1]
            z = atom_info.split(':')
            ID = z[0]
            typ = z[1]
            smiles = z[2]
            order = z[3]
            sym = z[4]
            at_no = z[5]
            f_c = z[6]
            hyb = z[7]
            i_hc = z[8]
            e_hc = z[9]
            is_aro = z[10]

            a=Chem.Atom(int(at_no))
            #a.SetChiralTag(chiral_tags[node])
            a.SetFormalCharge(int(f_c))
            if is_aro=='true':
                a.SetIsAromatic(True)
            else:   
                a.SetIsAromatic(False)
            if hyb in rdkit.Chem.rdchem.HybridizationType.names:
                a.SetHybridization(rdkit.Chem.rdchem.HybridizationType.names[hyb])
            else:
                a.SetHybridization(rdkit.Chem.rdchem.HybridizationType.names['OTHER'])
            #a.SetHybridization(hyb_map[hyb])
            a.SetNumExplicitHs(int(e_hc))
            idx = mol.AddAtom(a)
            node_to_idx[ID] = idx
            idx_to_info[idx] = [ID,typ,smiles,order]
            print(idx,ID,typ,smiles,order)
        
        bond_info_arr = bond_info.split(',')
        nbonds = int(bond_info_arr[0])
        for bond_index in range(nbonds):
            bond = bond_info_arr[bond_index + 1]
            y = bond.split(':')
            edge = y[0]
            btype = y[1]
            y = edge.split('-')
            atom1_id = y[0]
            atom2_id = y[1]
            atom1_idx = node_to_idx[atom1_id]
            atom2_idx = node_to_idx[atom2_id]
            if atom1_idx < atom2_idx:
                #print('Adding',atom1_idx,atom2_idx,btype)
                mol.AddBond(atom1_idx,atom2_idx,rdkit.Chem.rdchem.BondType.names[btype])

        Chem.SanitizeMol(mol)   

        new_smiles = Chem.MolToSmiles(mol)

        print('smiles=',mol_smiles,'\n')
        print('new_smiles=',new_smiles,'\n')
        assert new_smiles == mol_smiles
        print('graph and molecule matched')
        return mol,mol_smiles,idx_to_info
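The \{(.*?)\} record format is easy to probe in isolation (the serialized line below is a fabricated stand-in for the real graph format):

import re

atom_info = "{2}{a1:C:smi1:0:C:6:0:SP3:0:4:false}{a2:O:smi2:1:O:8:0:SP3:0:2:false}"
records = [m.group(1) for m in re.finditer(r"\{(.*?)\}", atom_info, re.DOTALL)]
print(records[0])                 # '2'  -> number of atoms
print(records[1].split(':')[5])   # '6'  -> at_no field of the first atom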