# imports inferred from usage: the bare find(...) calls below behave like re.search
import requests
from bs4 import BeautifulSoup
from re import search as find


def range(lower, upper, sem):  # note: shadows the built-in range()
    with open(str(lower) + "-" + str(upper) + ".csv", "w") as log:
        while lower <= upper:
            with requests.Session() as req:
                dom = BeautifulSoup(
                    req.post("http://www.wbutech.net/show-result.php",
                             data={"semno": sem, "rollno": lower, "rectype": 1},
                             headers={"Referer": "http://www.wbutech.net/result_odd.php"}).text)
                if len(dom.find_all("th")) == 14:
                    name = find("Name : (.+)", dom.find_all("th")[1].text.strip()).group(1)
                    reg = find("Registration No. : (.+) OF", dom.find_all("th")[3].text.strip()).group(1)
                    sgpa = find("SEMESTER : ([0-9.]+)", dom.find_all("td")[54].text.strip()).group(1)
                    print(name + ", " + str(lower) + ", " + reg + ", " + sgpa)
                    log.write("\"" + name + "\",\"" + str(lower) + "\",\"" + reg + "\"," + sgpa + "\n")
                else:
                    print(str(lower) + " MISSING")
                req.close()
            lower += 1
def phoneticcase(filetoopen):
    with open(filetoopen, 'r', encoding="utf8") as f:
        message = f.readlines()
    phoneticarr = []
    for m in message:
        for i in m.strip().split(","):
            # split on quotes and keep only the bare tokens
            for part in i.strip().split("'"):  # renamed from `re` to avoid shadowing the re module
                if "[" not in part and "]" not in part and part not in ("", " "):
                    phoneticarr.append(part.lower())
    # deduplicate
    newphoneticlist = list(set(phoneticarr))
    return newphoneticlist
def post_request(cookies, class_code, hashkey, img_data, pred_type="ydm"):
    global THREAD_FLAG
    # while count < 50:
    #     check_url = 'https://dean.bjtu.edu.cn/course_selection/courseselecttask/selects_action/?action=load&iframe=school&page=1&perpage=500'
    #     res = requests.get(check_url, cookies=cookies, headers=check_classheader)
    #     count += 1
    try:
        # pick a captcha-recognition backend
        if pred_type == "ydm":
            req_id, answer = yundama.decode(img_data, 2004, 20)
        elif pred_type == "chaoren":
            res = chaoren_client.recv_byte(img_data)
            answer, req_id = res[u'result'], res[u'imgId']
        elif pred_type == "pp":
            answer, req_id = api.Predict(40400, img_data)
        elif pred_type == "cjy":
            answer, req_id = chaojiying.PostPic(img_data, 2004)
        data = {
            'checkboxs': class_code,
            # 'is_cross': True,
            'hashkey': hashkey,
            'answer': answer
        }
        resp = requests.post(  # renamed from `re` to avoid shadowing the re module
            'https://dean.bjtu.edu.cn/course_selection/courseselecttask/selects_action/?action=submit',
            cookies=cookies,
            headers=robclass_headers,
            allow_redirects=False,
            data=data)
        if resp.status_code == 503:
            print(resp.status_code)
            print("resubmitting the course-selection request")
            time.sleep(0.3)
            post_request(cookies, class_code, hashkey, img_data, pred_type)
        set_cookie = resp.headers['Set-Cookie']
        message = set_cookie[set_cookie.find('[['):set_cookie.find(']]') + 2]
        res = str(json.loads(eval("'" + message + "'")))
        print(pred_type + " request: " + str(data))
        print(res)
        if "选课成功" in res:        # "course selected successfully"
            THREAD_FLAG = True
            return 200
        elif "课堂无课余量" in res:   # "no seats left in this class"
            return 404
        elif "验证码" in res:        # captcha rejected: report it to the backend
            if pred_type == 'pp':
                api.Justice(req_id)
            elif pred_type == 'cjy':
                chaojiying.ReportError(req_id)
            elif pred_type == 'ydm':
                yundama.report(req_id)
            else:
                chaoren_client.report_err(req_id)
            return 403
        else:
            # a different kind of failure, e.g. a similar class is already selected
            return 500
    except Exception as e:
        print("139postreq bug :" + str(e))
        return 403
def getip_66ip(self):
    for page in range(1, 4):
        url = 'http://www.66ip.cn/{}.html'.format(page)
        html = getpage(url)
        if html:
            doc = pq(html)
            rows = doc('#main table tr:gt(0)').items()
            for tr in rows:  # renamed from `re` to avoid shadowing the re module
                address = tr.find('td').eq(0).text()
                port = tr.find('td').eq(1).text()
                if address and port:
                    result = '{0}:{1}'.format(address, port)
                    yield result.replace(' ', '')
def range(lower, upper, sem):  # note: shadows the built-in range()
    while lower <= upper:
        with requests.Session() as req:
            dom = BeautifulSoup(req.post("http://www.wbutech.net/show-result.php",
                                         data={"semno": sem, "rollno": lower, "rectype": 1},
                                         headers={"Referer": "http://www.wbutech.net/result_odd.php"}).text)
            if len(dom.find_all("th")) == 14:
                name = find("Name : (.+)", dom.find_all("th")[1].text.strip()).group(1)
                reg = find("Registration No. : (.+) OF", dom.find_all("th")[3].text.strip()).group(1)
                sgpa = find("SEMESTER : ([0-9.]+)", dom.find_all("td")[54].text.strip()).group(1)
                print name + ", " + str(lower) + ", " + reg + ", " + sgpa
            else:
                print str(lower) + " MISSING"
            req.close()
        lower += 1
def get_chapterurl(self, response):
    item = DingdianItem()
    item['name'] = str(response.meta['name']).replace('\xa0', '')
    item['novelurl'] = response.meta['novelurl']
    soup = BeautifulSoup(response.text, 'lxml')  # renamed from `re` to avoid shadowing the re module
    category = soup.find('table').find('a').get_text()  # category
    author = soup.find('table').find_all('td')[1].get_text()
    bash_url = soup.find('p', class_='btnlinks').find('a', class_='read')['href']
    name_id = str(bash_url)[-6:-1].replace('/', '')
    item['category'] = str(category).replace('/', '')
    item['author'] = str(author).replace('/', '')  # fixed typo: replase -> replace
    item['name_id'] = name_id
    return item
def get_video_site_info(html):
    # find site name; the original never initialized site_url before appending to it
    site_url = 'http://'
    if re.search('qq.com', html):       # re.search, not the nonexistent re.find
        site_url = site_url + 'v.qq.com/'
        return 'qq'
    if re.search('youku.com', html):
        site_url = site_url + 'v.youku.com'
        return 'youku'
    if re.search('tudou.com', html):
        site_url = site_url + 'v.tudou.com'
        return 'tudou'
    print 'error not support yet...'
    return ''
def parse_proxies(country="all"):
    pattern = re.compile(r"(?:\d{1,3}\.){3}\d{1,3}:\d+")
    proxies = []
    for page_num in range(1, PAGES_COUNT + 1):
        print("page:", page_num)
        soup = get_soup(PROXIES_URL % (country.lower(), page_num))
        table = soup.find("table", id="proxy_list")
        if not table:
            continue
        rows = table.find_all("tr")  # BeautifulSoup uses find_all, not findall
        print("rows:", len(rows))
        for row in rows:
            script = row.find("script").text
            cipher = re.findall(r'Base64.decode\("(.+)"\)', script)[0]
            ip = base64.b64decode(cipher).decode()  # b64decode returns bytes
            proxy = ip
            if proxy and pattern.search(proxy):  # re has no find(); use the compiled pattern
                proxies.append(proxy)
    return proxies
def get_subclass_name_from_item(item):
    match = re.search(r'\n\n(?P<subclass>.+)\n', item)  # re.search, not the nonexistent re.find
    subclass_name = match.group('subclass')
    print("\tsubclass: " + subclass_name)
    return subclass_name
def parse(document, clean_html=True, unix_timestamp=False, encoding=None):
    """Parse a document and return a feedparser dictionary with attr key access.
    If clean_html is False, the html in the feed will not be cleaned.  If
    clean_html is True, a sane version of lxml.html.clean.Cleaner will be used.
    If it is a Cleaner object, that cleaner will be used.  If unix_timestamp is
    True, the date information will be a numerical unix timestamp rather than a
    struct_time.  If encoding is provided, the encoding of the document will be
    manually set to that."""
    if isinstance(document, six.text_type):
        encoding = 'utf8'
        # re.search, not the nonexistent re.find; the match is not used further here
        m = re.search(r'''<\?xml.*?encoding=['"](.*?)['"].*\?>''', document)
        document = document.encode(encoding)
    if isinstance(clean_html, bool):
        cleaner = default_cleaner if clean_html else fake_cleaner
    else:
        cleaner = clean_html
    result = feedparser.FeedParserDict()
    result['feed'] = feedparser.FeedParserDict()
    result['entries'] = []
    result['bozo'] = 0
    try:
        parser = SpeedParser(document, cleaner, unix_timestamp, encoding)
        parser.update(result)
    except Exception as e:
        if isinstance(e, UnicodeDecodeError) and encoding is True:
            encoding = chardet.detect(document)['encoding']
            document = document.decode(encoding, 'replace').encode('utf-8')
            return parse(document, clean_html, unix_timestamp, encoding)
        import traceback
        result['bozo'] = 1
        result['bozo_exception'] = e
        result['bozo_tb'] = traceback.format_exc()
    return result
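# A hedged usage sketch for parse() above, assuming the surrounding speedparser-style
# module provides its dependencies (six, feedparser, SpeedParser, cleaners). The feed
# file path and the fields printed are placeholders for illustration only.
with open("feed.xml", "rb") as fh:
    result = parse(fh.read())
if result["bozo"]:
    print("parse failed:", result["bozo_exception"])
else:
    print(result["feed"].get("title"), len(result["entries"]), "entries")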
def get_data(url):
    res = requests.get(url)
    result_json = res.json()
    print(result_json)
    next_page_url = result_json["paging"].get("next")
    data = result_json["data"]
    # walk the first page of posts
    for item in data:
        message = item.get("message")
        if message:
            # keep only whitelisted posts
            if any(whitelist in message.lower() for whitelist in whitelists):
                # skip posts containing blacklisted words
                if not any(blacklist in message.lower() for blacklist in blacklists):
                    number = ""
                    match = pattern.search(message)
                    if match:
                        if match.group(0):
                            number = match.group(0).replace("/", "")
                        elif match.group(1):
                            number = match.group(1).replace("/+", "")
                    # keep only the phone-number digits (the original called the nonexistent re.find)
                    number = re.sub(r"\D", "", number)
                    # skip numbers we have already seen
                    if number not in numbers:
                        messages.append(message)
                        numbers.append(number)
    return next_page_url
def reorderLines(logFileSize, logfile):
    # WRITE YOUR CODE HERE
    id_map = {}
    content_words = []
    content_numbers = []
    for log_line in logfile:
        log_contents = log_line.split(' ')
        id = log_contents[0]
        content = ' '.join(log_contents[1:])  # join so the content is hashable and comparable
        id_map[content] = id
        if re.search(r'\d+', content):  # re.search, not the nonexistent re.find
            content_numbers.append(content)
        else:
            content_words.append(content)
    # sort the word contents lexicographically
    content_words_sorted = sorted(content_words)
    result = []
    for sorted_line in content_words_sorted:
        if sorted_line in id_map:
            result.append(id_map[sorted_line] + " " + sorted_line)  # lists use append, not add
    for number_line in content_numbers:
        if number_line in id_map:
            result.append(id_map[number_line] + " " + number_line)
    return result
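# A hedged usage sketch for reorderLines() above, assuming `re` is imported in the
# surrounding module. The sample log lines are illustrative only: letter logs come
# back sorted by content, digit logs keep their original order.
sample_log = ["a1 act car", "g1 act zoo", "zo4 4 7", "ab1 off key dog", "a8 act bar"]
for out_line in reorderLines(len(sample_log), sample_log):
    print(out_line)
# expected: "a8 act bar", "a1 act car", "g1 act zoo", "ab1 off key dog", then "zo4 4 7"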
def groupon_poster_gz(request):
    cmd = 'curl -H "Host:groupon.mlapi.meilishuo.com" 10.0.0.55/groupon/groupon_poster'
    output = os.popen(cmd).read()  # renamed from `re` to avoid shadowing the re module
    reStatus = "success"
    if output.find('"error_code":0') < 0:
        reStatus = "fail"
    return HttpResponse(reStatus)
def getNumbers(varInput):
    variables = find(r'[\d.]+', varInput)  # `find` is assumed to be re.findall imported under that name
    if variables and len(variables) >= 2:
        return float(variables[0]), float(variables[1])
    else:
        print "\'%s\' is not valid input. Please try again.\n" % varInput
def BIOtagSingleOffset_NoOverlap(self, element, text2, i1, i2, last_offset):
    # text before the drug words start
    before_part = text2[last_offset:i1]
    element["text_splits"].append(before_part)
    # drug words
    element["text_splits"].append(self.BItagWSpace)
    # now add the next word only? or what?
    m = re.search(r'\b', text2[i1:])  # re.search, not the nonexistent re.find
    i2 = m.start() if m else -1
    if i2 > -1:
        entity_text = text2[i1:]
        element["text_splits"].append(entity_text)
        #entity_text2 = self.BIOTagWord(entity_text)
        #deviation += len(entity_text2) - len(entity_text)
        element["text_splits"].append(self.BIOtagWSpace)
        last_offset = i1 + len(entity_text) + 1
    return last_offset
def get_class_and_subclass_codes_from_item(item):
    match = re.search(r'(?P<class>\d{2} )(?P<subclass>\d{2}\n)', item)  # re.search, not re.find
    class_code = match.group('class')
    subclass_code = match.group('subclass')
    print("\tclass: " + class_code + "\n\tsubclass: " + subclass_code)
    return [class_code, subclass_code]
def println(name):
    global var
    rule = r'“(.*?)”'
    try:
        print("\n")
        print(var[name])
    except:
        print("\n")
        # fall back to extracting quoted text from the name itself;
        # the original called re.find(rule) with no target string
        print(re.findall(rule, name))
def is_valid_channel(name):
    """Returns whether NAME is a valid channel name, that is, it starts with
    any of '#', '&', '+', or '!', and does not contain NUL, BEL, CR, LF, ' ',
    ',', or ':'.
    """
    return (
        (name[0] in CHANNEL_PREFIXES) and
        (re.search('[\0\7\r\n ,:]', name) is None)  # re.search, not the nonexistent re.find
    )
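# A hedged check of is_valid_channel() above, assuming `re` is imported; the
# CHANNEL_PREFIXES value below is an assumption for this sketch only (the real
# module presumably defines its own).
CHANNEL_PREFIXES = "#&+!"
assert is_valid_channel("#python")
assert not is_valid_channel("#bad channel")   # contains a space
assert not is_valid_channel("python")         # missing prefix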
def gerarAfndGramatica(afnd, gramatica, alfabeto):
    # builds the NFA for the grammars; `find` is assumed to be re.findall imported under that name
    if not afnd:
        afnd.update({0: {}})
    aTemp = {}
    mpRgs = {}
    for regra in gramatica:
        simbolos = find(r'(\w*<\w+>|\w+|&)', regra)
        if simbolos[0] in mpRgs.keys():
            # the rule was already created: look up its index in the rule map
            iRg = mpRgs[simbolos[0]]  # iRg holds the rule's index
        else:
            iRg = len(aTemp)
            aTemp.update({iRg: {}})
            mpRgs.update({simbolos[0]: iRg})
        for simbolo in simbolos[1:]:
            term = find(r'^\w+', simbolo)
            nTerm = find(r'<\w+>', simbolo)
            term = '&' if not term else term[0]
            if term not in alfabeto:
                alfabeto.append(term)
            if not nTerm:
                # production without a non-terminal: generate a terminal rule
                rg = aTemp[iRg]
                if term in rg.keys():
                    rg[term].append(len(aTemp))
                else:
                    rg.update({term: [len(aTemp)]})
                aTemp.update({len(aTemp): {'*': [1]}})
            else:
                nTerm = nTerm[0]
                if nTerm in mpRgs.keys():
                    rg = mpRgs[nTerm]
                else:
                    rg = len(aTemp)
                    mpRgs.update({nTerm: rg})
                    aTemp.update({rg: {}})
                mp = aTemp[iRg]
                if term in mp.keys():
                    mp[term].append(rg)
                else:
                    mp.update({term: [rg]})
    unirAutomatos(afnd, aTemp)  # merges the automata, sharing the initial symbol
def get_attributes(tag):
    keys = re.findall(r' ([\w-]+?)=', tag)
    result = {}
    for key in keys:
        match = re.search('%s="(.*?)"' % key, tag)  # re.search, not the nonexistent re.find
        if match is None:
            print('cannot find attribute "%s" in %s' % (key, tag))
            continue
        result[key] = match.group(1)
    check_attributes(keys, tag)
    return result
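# A hedged usage sketch for get_attributes() above (it also exercises
# check_attributes, defined later in this collection); the sample tag is
# illustrative only.
sample_tag = '<img src="/img/logo.png" alt="logo"/>'
print(get_attributes(sample_tag))   # expected: {'src': '/img/logo.png', 'alt': 'logo'}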
def usd_to_inr():
    from bs4 import BeautifulSoup
    import requests
    import re
    # `x` is assumed to be the user's request text, defined at module level
    amount = re.search(r'\d+', x).group(0)  # re.search, not re.find; take the whole number, not one digit
    url = 'https://www.xe.com/currencyconverter/convert/?Amount=' + amount + '&From=USD&To=INR'
    html_text = requests.get(url).text
    soup = BeautifulSoup(html_text, 'lxml')
    value = soup.find('p', class_='result__BigRate-sc-1bsijpp-1 iGrAod').text
    print_and_say(value)
def getIP():
    try:
        res = urllib2.urlopen('http://whois.pconline.com.cn/ipJson.jsp', timeout=2)
    except:
        return None
    if res.getcode() != 200:
        return None
    body = res.read().decode('gbk').encode('utf8')  # renamed from `re` to avoid shadowing the re module
    res.close()
    body = body[body.rfind('{'):body.find('}') + 1]
    return json.loads(body)
def getIP(self):
    try:
        res = urllib2.urlopen('http://whois.pconline.com.cn/ipJson.jsp', timeout=2000)
    except:
        return None
    if res.getcode() != 200:
        return None
    body = res.read().decode('gbk').encode('utf8')  # renamed from `re` to avoid shadowing the re module
    res.close()
    body = body[body.rfind('{'):body.find('}') + 1]
    return json.loads(body)
def get_dependencies(self):
    dependencies_in_angular_quotes = []
    dependencies_in_angular_braces = re.findall(r'\<(.*?)\>', self.raw_content)
    print(self.raw_content)
    for line in self.raw_content.split("\n"):
        if line.startswith('#'):
            # re.search (not re.find), and call append() rather than assigning to it
            match = re.search(r'"([^"]*)"', line)
            if match:
                dependencies_in_angular_quotes.append(match.group(1))
    self.dependencies = dependencies_in_angular_braces + dependencies_in_angular_quotes
def fill(self, char):
    if not self:
        return self[FILL]
    if re.search("reset", char.lower()):  # re.search, not re.find; .lower() must be called
        del self[FILL]
        self[_CHANGE] = True  # assumed intent: mark the record as changed (original was a bare expression)
        return None           # the original returned the Perl-ism `undef`
    char = char[0:1]  # only one character allowed for fill value.
    char = re.sub("[^\x20-\x7f]+", "", char)  # keep printable ASCII; the result was discarded before
    if char:
        self[FILL] = char
    return self[FILL]
def matching_one(regexes, string):
    '''Returns True if and only if one of the given regexes (a list) matches
    the string. For example, if regexes is ['abc','foo'] and the string is
    'blabcar', it should return True, but not if regexes is ['lalala','foo']
    and the string is 'blabcar'.

    3/3 points
    '''
    for item in regexes:
        if re.search(item, string):  # re.search, not the nonexistent re.find
            return True
    return False
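# A hedged usage sketch for matching_one() above, assuming `re` is imported;
# the examples mirror the ones in the docstring.
print(matching_one(['abc', 'foo'], 'blabcar'))     # True: 'abc' occurs inside 'blabcar'
print(matching_one(['lalala', 'foo'], 'blabcar'))  # False: neither regex matches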
def parse_jmdict(file=JMDICT_FILE):  # {{{1
    alang = "{http://www.w3.org/XML/1998/namespace}lang"
    data = []
    with gzip.open(file) as f:
        with click.progressbar(ET.parse(f).getroot(), width=0,
                               label="parsing jmdict") as bar:
            for e in bar:
                seq, pos = int(e.find("ent_seq").text), ()
                kanji, reading, sense = [], [], []
                for ke in e.findall("k_ele"):  # 0+ kanji elem
                    keb = ke.find("keb").text.strip()  # word/phrase w/ kanji
                    info = tuple(x.text.strip() for x in ke.findall("ke_inf"))
                    assert all("\n" not in x and "\x1e" not in x for x in info)
                    kanji.append(Kanji(keb, _kanji_chars(keb), info, _prio_k(ke)))
                for r_ele in e.findall("r_ele"):  # 1+ reading elem (renamed from `re` to avoid shadowing the re module)
                    reb = r_ele.find("reb").text.strip()  # reading elem
                    restr = tuple(x.text.strip() for x in r_ele.findall("re_restr"))  # reading only applies to keb subset
                    info = tuple(x.text.strip() for x in r_ele.findall("re_inf"))
                    assert all("\n" not in x and "\x1e" not in x
                               for xs in [restr, info] for x in xs)
                    reading.append(Reading(reb, restr, info, _prio_r(r_ele)))
                for se in e.findall("sense"):  # 1+ sense elem
                    pos = tuple(x.text.strip() for x in se.findall("pos")) or pos  # part of speech, applies to following senses too
                    lang, gloss = None, []
                    for x in se.findall("gloss"):
                        l = x.get(alang, "eng")
                        if l in LANGS and x.text:
                            assert lang is None or lang == l
                            lang = l
                            gloss.append(x.text.strip())
                    if lang is None:
                        continue
                    s_inf = tuple(x.text.strip() for x in se.findall("s_inf"))
                    misc = tuple(x.text.strip() for x in se.findall("misc"))
                    xref = tuple(y.strip() for x in se.findall("xref")
                                 for y in x.text.split("・") if not y.strip().isdigit())
                    assert seq < MAXSEQ
                    assert all("\n" not in x and "\x1e" not in x
                               for xs in [pos, gloss, s_inf, misc, xref] for x in xs)
                    sense.append(Sense(pos, lang, tuple(gloss), s_inf + misc, xref))
                krs = (tuple(x) for x in [kanji, reading, sense])
                jlpt = jlpt_level(kanji, reading, _usukana(sense))
                data.append(Entry(seq, jlpt, *krs))
    return data
def check_attributes(attributes, tag):
    check = tag
    for key in attributes:
        match = re.search('%s=".*?"' % key, tag)  # re.search, not the nonexistent re.find
        if match:
            check = check.replace(match.group(0), '')
    check = check.replace('<img', '')
    check = check.replace('/>', '')
    check = check.replace('>', '')
    check = check.replace(' ', '')
    if len(check) > 0:
        print('tag has unexpected attributes: %s\n%s\n%s' % (attributes, check, tag))
        raise Exception('unexpected attributes')
def grep(filename, string):
    '''Returns all lines that match a given string. Interpret the string as a
    regular expression, so e.g. if the string is "foo[bp]ar" it should return
    all those lines that contain the words foobar and/or foopar. It shouldn't
    return the lines altogether in a list, but rather one by one using yield.

    5/5 points
    '''
    with open(filename) as f:
        for line in f:
            if re.search(string, line):  # re.search, not the nonexistent re.find
                yield line
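# A hedged usage sketch for grep() above, assuming `re` is imported;
# 'example.txt' is a placeholder path.
for hit in grep('example.txt', 'foo[bp]ar'):
    print(hit, end='')   # prints every line containing foobar or foopar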
def sanitize_token(self, token, strip_tokens=False):
    if token["type"] in (tokenTypes["StartTag"], tokenTypes["EndTag"],
                         tokenTypes["EmptyTag"]):
        if token["name"] in self.allowed_elements:
            if token.has_key("data"):
                attrs = dict([(name, val) for name, val in token["data"][::-1]
                              if name in self.allowed_attributes])
                for attr in self.attr_val_is_uri:
                    if not attrs.has_key(attr):
                        continue
                    val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
                                           unescape(attrs[attr])).lower()
                    if (re.match("^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
                            (val_unescaped.split(':')[0] not in self.allowed_protocols)):
                        del attrs[attr]
                for attr in self.svg_attr_val_allows_ref:
                    if attr in attrs:
                        attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', ' ',
                                             unescape(attrs[attr]))
                if (token["name"] in self.svg_allow_local_href and
                        'xlink:href' in attrs and
                        re.search('^\s*[^#\s].*', attrs['xlink:href'])):  # re.search, not the nonexistent re.find
                    del attrs['xlink:href']
                if attrs.has_key('style'):
                    attrs['style'] = self.sanitize_css(attrs['style'])
                token["data"] = [[name, val] for name, val in attrs.items()]
            return token
        else:
            if strip_tokens:
                return None
            if token["type"] == tokenTypes["EndTag"]:
                token["data"] = "</%s>" % token["name"]
            elif token["data"]:
                attrs = ''.join([' %s="%s"' % (k, escape(v)) for k, v in token["data"]])
                token["data"] = "<%s%s>" % (token["name"], attrs)
            else:
                token["data"] = "<%s>" % token["name"]
            if token["type"] == tokenTypes["EmptyTag"]:
                token["data"] = token["data"][:-1] + "/>"
            token["type"] = tokenTypes["Characters"]
            del token["name"]
            return token
    elif token["type"] == tokenTypes["Comment"]:
        pass
    else:
        return token
def sanitize_token(self, token):
    if token["type"] in (tokenTypes["StartTag"], tokenTypes["EndTag"],
                         tokenTypes["EmptyTag"]):
        if token["name"] in self.allowed_elements:
            if token.has_key("data"):
                attrs = dict([(name, val) for name, val in token["data"][::-1]
                              if name in self.allowed_attributes])
                for attr in self.attr_val_is_uri:
                    if not attrs.has_key(attr):
                        continue
                    val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
                                           unescape(attrs[attr])).lower()
                    if (re.match("^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
                            (val_unescaped.split(':')[0] not in self.allowed_protocols)):
                        del attrs[attr]
                for attr in self.svg_attr_val_allows_ref:
                    if attr in attrs:
                        attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', ' ',
                                             unescape(attrs[attr]))
                if (token["name"] in self.svg_allow_local_href and
                        'xlink:href' in attrs and
                        re.search('^\s*[^#\s].*', attrs['xlink:href'])):  # re.search, not the nonexistent re.find
                    del attrs['xlink:href']
                if attrs.has_key('style'):
                    attrs['style'] = self.sanitize_css(attrs['style'])
                token["data"] = [[name, val] for name, val in attrs.items()]
            return token
        else:
            if token["type"] == tokenTypes["EndTag"]:
                token["data"] = "</%s>" % token["name"]
            elif token["data"]:
                attrs = ''.join([' %s="%s"' % (k, escape(v)) for k, v in token["data"]])
                token["data"] = "<%s%s>" % (token["name"], attrs)
            else:
                token["data"] = "<%s>" % token["name"]
            if token["type"] == tokenTypes["EmptyTag"]:
                token["data"] = token["data"][:-1] + "/>"
            token["type"] = tokenTypes["Characters"]
            del token["name"]
            return token
    elif token["type"] == tokenTypes["Comment"]:
        pass
    else:
        return token
def _get_conflict_file_name(outfile):
    """ """
    outdir = '/'.join(outfile.split('/')[:-1])
    outfilename = outfile.split('/')[-1]
    filelist = os.listdir(outdir)
    previous_conflicts = [filename.split('.')[2] for filename in filelist
                          if filename.startswith(outfilename) and len(filename.split('.')) == 3]
    if previous_conflicts:
        max_conflict = max([int(re.findall(r'[0-9]{3}', conflict)[0])  # re.findall, not the nonexistent re.find
                            for conflict in previous_conflicts])
    else:
        max_conflict = -1
    return '{}.conflict_{:03.0f}'.format(outfile, max_conflict + 1)
def capo(hope_comments_count):
    url_capogames = "http://www.capogames.net/samw/board/board.do"
    souping_capogames = souping(url_capogames)
    for row in souping_capogames.find_all('tr'):
        row_count = row.find('span').contents[0]
        reply_count = re.search(r'\d+', row_count).group(0)  # was r'wd+', presumably a typo for \d+
        if int(reply_count) > hope_comments_count:
            site_name = 'samw'
            number = row.find('td').contents[0].contents[0]
            link = 'http://www.capogames.net/samw/' + row.find('td').a.get('href')
            title = row.find('td').a.contents[0]
            name = row.find('td').a.contents[0]
            raw_date = row.find('td').contents[0]  # assumed: the original's re.find('td') meant row.find('td')
            best_article_list([site_name, number, title, link, reply_count])
def sanitize_html(tag_name, D, SElms=SElms, SAttr=SAttr, SProtocols=SURITypes):
    """
    tag_name    -> The tag's name
    D           -> The tag's attributes dict
    SElms       -> The allowed elements
    SAttr       -> The allowed attributes
    SProtocols  -> The allowed protocols (see Tags.DURITypes)
    """
    tag_name = tag_name.lower()  # HACK!
    if tag_name in SElms:
        for k in list(D.keys()):
            # Delete unallowed attributes
            if k not in SAttr:
                del D[k]
        for attr in SURIAttrs:
            # Validate URLs using REs
            if attr not in D:
                continue
            val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(D[attr])).lower()
            if re.match("^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and \
                    (val_unescaped.split(':')[0] not in SURITypes):
                del D[attr]
        for attr in svg_attr_val_allows_ref:
            # SVG something something...
            if attr in D:
                D[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', ' ', unescape(D[attr]))
        if (tag_name in svg_allow_local_href and 'xlink:href' in D and
                re.search('^\s*[^#\s].*', D['xlink:href'])):  # re.search, not the nonexistent re.find
            # Disable SVG links?
            del D['xlink:href']
        if 'style' in D and D['style']:
            # Sanitize the CSS
            D['style'] = sanitize_css(D['style'])
        return tag_name, D
    else:
        # Don't allow!
        return None, None
def clean(t_list, comm_trig=False):
    # cleaning up the name html.
    names = []    # was a pd.DataFrame(); a plain list is what the appends below expect
    command = []
    for i in t_list:
        try:
            name = i.get('data-name')
            if name is not None:
                names.append(name)
            else:
                # assumed intent: the commander entry carries its name in an
                # <a href="/mtg-card/..."> link instead of a data-name attribute
                link = i.find('a', href=re.compile(r'^/mtg-card/'))
                if link is not None:
                    command.append(link.get('href'))
        except Exception:
            # need to come up with a conditional to define the commander..
            print('if there are more than 3-4 of these there is a problem here!!!')
    return command + names
    # remember partner commanders, need to add cleaning for that.
def countNumDrugWords(self, element):
    count = 0
    element["verified_drug_words"] = []
    # get all sorted offsets
    # merge offsets that overlap
    overlaped_offsets = []
    for off in element["offsets"]:
        if len(overlaped_offsets) == 0:
            overlaped_offsets.append(off)
        else:
            i1 = off[0]
            i2 = off[1]
            if i2 <= overlaped_offsets[-1][1]:
                # as the offsets are sorted by i1,
                # we can skip this offset as it is contained in the previous one
                pass
            elif i2 > overlaped_offsets[-1][1] and i1 < overlaped_offsets[-1][1]:
                # this one overlaps but includes more words after it,
                # so we should extend i2 of the previous offset to the current i2
                overlaped_offsets[-1][1] = i2
            else:
                overlaped_offsets.append(off)
    for groupoffset in overlaped_offsets:
        if len(groupoffset) == 2:
            i1 = groupoffset[0]
            i2 = groupoffset[1]
            fragment = element["text_original"][i1:i2 + 1]
            fwords = self.tokenize(fragment)
            element["verified_drug_words"].extend(fwords)
            count += sum([1 for word in fwords])
        else:
            i1 = int(groupoffset[0])
            # re.search (not the nonexistent re.find) for the next word boundary after i1
            m = re.search(r'\b', element["text_original"][i1:])
            i2 = i1 + (m.start() if m else 0)
            if i2 > i1:
                fragment = element["text_original"][i1:i2 + 1]
                fwords = self.tokenize(fragment)
                element["verified_drug_words"].extend(fwords)
                count += sum([1 for word in fwords])
    return count
def fromurl(url):
    """Return app/class/method/function pointed by an url

    1. raise ValueError if url is external
    2. return None if url does not match any callable app/class/...
       else return target item
    """
    # re.search, not re.find; '-' moved to the end of the class so it is not read as a range
    if re.search(r"[^\w/+#-]", url):
        raise ValueError
    target = None
    for part in url.split("/"):
        pass  # aaa/bb/cc/dd
    return
def updatePermissions(self, permissions):
    for p in permissions:
        obj = {}
        obj["permission_k"] = p["permission_k"]
        obj["date_created"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # remove this permission from all roles
        roles = self.role.getByPermissions(p["permission_k"])
        for role in roles:
            self.permission.deleteRolePermissions(role)
        # add this permission to each role
        for key in p.keys():
            # re.match with a plain Python pattern; "/^role_/" was Perl-style delimiter syntax
            match = re.match(r"role_", key)
            if match:
                obj["role_k"] = key[5:]
                obj["value"] = p[key]
                self.permissions.addRolePermissions(obj)
    return {"success": True, "message": "Permissions successfully saved"}
def fetch(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    for line in lines:
        line = line.strip()
        # print(line)
        a, b = line.split(':')
        anchors.append((a, b))
    txt = '| 主播 | 标题 | 状态 | 订阅 |\n|:---:|:---:|:---:|:---:|\n'
    for suffix, anchor in anchors:
        # time.sleep(100)
        print('watching:', anchor)
        try:
            r = requests.get(huya_url + suffix, timeout=3)
            html = r.content.decode('utf-8')
            # re.search (not re.find) for a "room moved" notice pointing at the new address
            m = re.search(r'更换为.+href="https://www.huya.com/(.+)"', html)
            newaddr = m.group(1) if m else None
            print(newaddr)
            if newaddr:
                print('NEW:', anchor, suffix, '->', newaddr)
                anchors.append((newaddr, anchor))
                continue
            title = re.findall(r'<h1 id="J_roomTitle">(.+)</h1>', html)[0]
            status = re.findall(r'id="live-count">(.+?)</em></span>', html)
            fans = re.findall(r'id="activityCount">(\d+)</div>', html)[0]
            last_live = '未直播'
            if status and status[0]:
                last_live = status[0]
        except:
            print('ERROR:' + huya_url + suffix)
        else:
            txt += ('|' + anchor + '|' + title + '|' + last_live + '|' + fans + '|\n')
    print(txt)
    # push the status table via WeChat; the original returned before this call,
    # which made the push unreachable
    if send_msg('主播直播状态', txt):
        print('wechat message push success.')
    else:
        print('wechat message push failed.')
    return
def get_session_details(s):
    url = 'http://www.azleg.gov/xml/sessions.asp'
    with s.urlopen(url) as page:
        root = etree.fromstring(page)
        session_file = open('session_details.py', 'w')
        detail = """
    '%s': {'type': '%s',
           'session_id': %s,
           'start_date': datetime.date(%s),
           'end_date': datetime.date(%s)},
"""
        for session in root.xpath('//session'):
            # re.search, not the nonexistent re.find
            session_type = 'primary' if re.search('Regular', session.get('Session_Full_Name')) else 'special'
            start_date = datetime.datetime.strptime(session.get('Session_Start_Date'),
                                                    '%Y-%m-%dT%H:%M:%S')
            end_date = datetime.datetime.strptime(session.get('Sine_Die_Day'),
                                                  '%Y-%m-%dT%H:%M:%S')
            session_file.write(detail % (session.get('Session_Full_Name'),
                                         session_type,
                                         session.get('Session_ID'),
                                         start_date, end_date))
def replace_image_tag(html, path=None):
    tags = re.findall('<img.+?src="/img.+?>', html)
    for tag in tags:
        match = re.search('src="/img/(.+?)"', tag)  # re.search, not the nonexistent re.find
        if match is None:
            print('image path not found in tag: "%s" in %s' % (tag, path))
            continue
        image = match.group(1)
        imagepaths.append(image)
        attributes = get_attributes(tag)
        attributes.pop('src')
        if len(attributes) > 0:
            # http://railsdoc.com/references/image_tag
            # <img src="/img/hoge.png" alt="hogera"> -> <%= image_tag("hoge.png", alt: "hogera") %>
            attributes = ', '.join(['%s: "%s"' % o for o in attributes.items()])
            replaced = '<%%= image_tag("%s", %s) %%>' % (image, attributes)
        else:
            replaced = '<%%= image_tag("%s") %%>' % (image)
        if is_dry:
            print('"%s" -> "%s"' % (tag, replaced))
            continue
        html = html.replace(tag, replaced)
    return html
def dict_to_hstore(python_dict):
    """
    There's an implementation of this here ( HstoreAdapter )
    https://github.com/psycopg/psycopg2/blob/master/lib/extras.py
    but the comments say that it is "painfully inefficient!"
    """
    hstore = StringIO()
    first_row = True
    for key in python_dict:
        ## Prune the hash--if it's empty
        value = python_dict[key]
        if value:
            if not first_row:
                hstore.write("|")
            else:
                first_row = False
            # don't allow quotes within the value. Not sure if we should check this here.
            value = value.replace('"', '')
            hstore.write("\"%s\"=>\"%s\"" % (key, value))  # the closing paren was missing
    return hstore.getvalue()

## If we were using a comma as a delimiter, we'd need the commented pattern below to
## allow for a comma inside the values. With the bar as delimiter we don't have to:
## bars are already cleaned out in utf8clean.
# key_pair_re = re.compile('"(.+?)"=>"(.+?)"(?:,|$)')
key_pair_re = re.compile('"(.+?)"=>"(.+?)"')

def hstore_to_dict(text_string):
    return_dict = {}
    keypairs = text_string.split("|")
    for keypair in keypairs:
        match = key_pair_re.search(keypair)  # re has no find(); use the compiled pattern
        return_dict[match.group(1)] = match.group(2)
    return return_dict
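# A hedged round-trip sketch for dict_to_hstore()/hstore_to_dict() above, assuming
# StringIO and re are imported as the functions expect; the record is illustrative only.
record = {"city": "Austin", "state": "TX"}
encoded = dict_to_hstore(record)        # e.g. '"city"=>"Austin"|"state"=>"TX"'
assert hstore_to_dict(encoded) == record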
Copyright (c) 2014 Beckersweet. All rights reserved. """ from commands import getoutput as command from json import loads as decodeJSON from json import dumps as encodeJSON from mininet.cli import CLI from mininet.net import Mininet from mininet.node import Node, RemoteController, CPULimitedHost from mininet.util import pmonitor import pp from re import findall as find ifconfig = command('ifconfig') try: localIp = find('addr:(192\.168\.56\.\d+) ', ifconfig)[0] except: print "Network settings not configured. Try running 'sudo dhclient eth1'." NETWORK_CONTROLLER_PORT = 6633 NUMBER_OF_HOSTS = 3 TCP_REQUEST_COMMAND = "python tcpRequest.py " + localIp + " 9999 " JOB_SERVER_COMMAND = "sudo python dynamic_ncpus.py " BENCHMARK_RESULTS_FILE_NAME = "OpMub_benchmarking.out" print print "Creating network:" virtualNetwork = Mininet(controller=RemoteController, host=CPULimitedHost, build=False)
def match_expr(self, expr):
    # re.findall (not the nonexistent re.find) so the != [] comparison makes sense
    return SubtitlesClip([e for e in self.subtitles if re.findall(expr, e) != []])
def is_valid_nickname(name):
    """Returns whether NAME is a valid nickname, that is, it contains only
    letters, numbers, '_', '[', ']', '{', '}', '\\', '|', '`', or '^'.
    """
    return re.search(r'[^A-Za-z0-9_\[\]\{\}\\\|\`\^]', name) is None  # re.search, not re.find
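# A hedged check of is_valid_nickname() above, assuming `re` is imported;
# the sample nicknames are illustrative only.
assert is_valid_nickname("guido_[away]")
assert not is_valid_nickname("bad nick!")   # space and '!' are not allowed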
def check_queue(uid): """ Check the queue for any uid string, return job list with running node information. """ from re import compile as mkregex qstat = rn(['qstat', '-u', uid, '-n', '-1']).decode('utf8').rstrip().split('\n')[5:] # If there are no job return nothing if not qstat: return jobs = {} for i in qstat: f = s(r' +', i.rstrip()) # Only look at jobs in the interactive queue if not f[2] == short_queue_name: continue # Skip completed jobs if f[9] == 'C': continue # Get node name, if there is one if f[11] == '--': node = '' else: nodes = set(find(r'node[0-9][0-9]', f[11])) if len(nodes) > 1: continue node = str(list(nodes)[0]) # Get job number job_id = find(r'[0-9]+', f[0])[0] # Now that we have a limited job set, use qstat -f to get the # complete job and queue name find_queue = mkregex(r'queue = (.*)$') find_name = mkregex(r'Job_Name = (.*)$') for i in subprocess.check_output(['qstat', '-f', job_id]).decode().rstrip().split('\n'): # Get Queue Name if find_queue.search(i): try: queue = find_queue.findall(i)[0] except IndexError: # Queue parsing failed, report this and continue print("Failed to parse queue for job number:{:^3}\nskipping".format(job_id), file=stderr) continue if not queue == interactive_queue: continue elif find_name.search(i): try: names = find_name.findall(i)[0].split('_') except IndexError: # Queue parsing failed, report this and continue print("Failed to parse queue for job number:{:^3}\nskipping".format(job_id), file=stderr) continue # Check that this is actually one of our jobs identifier = '_'.join(names[-2:]) if identifier == 'int_tmux': type = 'tmux' elif identifier == 'int_vnc': type = 'vnc' elif identifier == 'int_gui': type = 'gui' else: continue # Fix queue name name = '_'.join(names[:-2]) name = name if name else type # Assemble the dictionary jobs[job_id] = {'queue' : queue, 'job_name' : name, 'type' : type, 'node' : node, 'state' : f[9]} # Sort the dictionary jobs = OrderedDict(sorted(jobs.items())) return(jobs)
#!/usr/bin/python27
import os
import re
from subprocess import call

dirStruct = os.walk("../../raspi/LineAnalysis/testImages")

for roots, dirs, files in dirStruct:
    for fname in files:
        if re.search("jpg", fname) is not None:  # re.search, not the nonexistent re.find
            ret = call(["./stats"], ["-i"], ["
# list values use ',' as a separator; a ',' inside an element must be escaped as '\,'
data_arr = []
for d in data:
    symbol = getFlag(d)
    if d is None:
        print 'Unknown Value'
        continue
    data_arr.append(symbol + str(d).replace('\\', '\\\\').replace(',', '\,'))
data_str = ','.join(data_arr)
request += "$" + str(len(data_str) + 1) + "\r\n"
request += getFlag(data) + data_str + "\r\n"
else:
    print pred('Unknown Value')
    continue
sock.send(request + "\n")
time.sleep(0.05)
resp = sock.recv(2048)  # renamed from `re` to avoid shadowing the re module
if resp[0] in ('+', '-'):
    print resp[1:],
elif resp[0] == '$':
    resp_len = int(resp[1:resp.find("\r\n")])
    if resp_len == -1:
        print pred("Not Found")
        continue
    data_start = resp.find("\r\n") + 2
    data = resp[data_start:data_start + resp_len]
    data = parseCMD(data)
    print "(" + pyellow(type(data).__name__) + ")", pgreen(data)
sock.close()
def demo_bad_catch():
    try:
        var = input("Enter variable name")
        if re.search(reg, var):  # re.search, not the nonexistent re.find
            print('The input is valid')
    except ValueError as e:
def attach_job(job_id, attempt_gui=False): """ Attach to a currently running job, default is tmux. To attach to a GUI running in tmux, pass attempt_gui """ # Get details job_list = check_queue(uid) try: node = job_list[job_id]['node'] type = job_list[job_id]['type'] state = job_list[job_id]['state'] except KeyError: print("Sorry, that job number doesn't exist. Please try again") print_jobs(job_list) sys.exit(1) if not state == 'R': print("Job not running, cannot attach") return if type == 'gui' or attempt_gui: # Confirm GUI Possible if not xpra_installed: print("It appears that xpra is not in your PATH, I cannot run GUI jobs", file=stderr) print("Exiting", file=stderr) sys.exit(-1) # Display xpra instructions print("You MUST NOT close your program by closing the window unless you want to") print("terminate your session\n") print("To preserve your session, you need to Ctrl-C in the command line, not close") print("the window\n") sleep(1) # Actually attach to the session! subprocess.call(['xpra', 'attach', 'ssh:' + uid + '@' + node + ':' + job_id]) return elif type == 'tmux': # Do not attach if running from within a tmux session already if rn('echo $TMUX', shell=True).decode().rstrip(): print("You are already running a tmux session, sessions should be nested with care") print("To force run, unset the $TMUX variable, but I suggest you just detatch your") print("current session and try the same command again") return # Attempt to initially attach to xpra, fail gracefully without # notifying user if xpra_installed: GUI_PID='' if subprocess.call("xpra attach ssh:" + uid + "@" + node + ":" + job_id + " >/dev/null 2>/dev/null &", shell=True) == 0: GUI_PID = subprocess.check_output('ps axo pid,user,cmd | grep "xpra attach" | grep "' + job_id + '$"| awk \'{print $1}\'', shell=True).decode().rstrip() # Actually attach to the session! job_string = ' '.join(['ssh', node, '-t', 'DISPLAY=:' + job_id, 'tmux', 'a', '-t', job_id]) subprocess.call(job_string, shell=True) # Kill GUI if open if xpra_installed and GUI_PID: subprocess.call(['kill', GUI_PID]) elif type == 'vnc': # Check that vnc can run if not vnc_installed: print("It appears that vncviewer is not in your PATH, I cannot run connect to a VNC session", file=stderr) print("Exiting", file=stderr) sys.exit(-1) # Get VNC Port ports = [] files = subprocess.check_output('ssh ' + node + ' "ls $HOME/.vnc"', shell=True).decode().rstrip().split('\n') for i in files: if i.startswith(node) and i.endswith('pid'): port = find(r':([0-9]+)\.pid', i)[0] ports.append(port) if not ports: print("It appears no VNC servers are running on the selected server.") print("If the job is still running in the queue, there is a problem.") print("Try clearing out the *.log and *.pid files in $HOME/.vnc, and killing") print("the running VNC queue job") return if len(ports) > 1: print("There is more than one vnc server running for you on that node.") print("That isn't allowed and I don't know which one to join. It may") print("be that your last session exited without cleaning $HOME/.vnc") print("Check in there and clean out log files for vnc servers that") print("aren't running to prevent problems") return subprocess.call(['vncviewer', node + ':' + ports[0]]) return else: print("I don't understand the job type") return
def get_video_img_info(html):
    # find site name
    site_url = 'http://'
    url_p = '^http://v.qq.com/.+?tm|^/cover/'
    sub_url_p = '^/'
    video_with_img = []
    # `finded = re.find(...)` inside a condition is not valid Python; test each site with re.search
    if re.search('qq.com', html):
        site_url = site_url + 'v.qq.com/'
        url_p = '^http://v.qq.com/.+tm|^/cover/'
    elif re.search('youku.com', html):
        site_url = site_url + 'v.youku.com'
        url_p = '^http://v.youku.com/.+?_show'
    elif re.search('tudou.com', html):
        site_url = site_url + 'v.tudou.com'
        url_p = 'http://v.tudou.com/'
    else:
        print 'error not support yet...'
        return []
    soup = BeautifulSoup(html)
    all_img = soup.select("a > img")
    for img in all_img:
        img_attrs = img.attrs
        for attr in img_attrs:
            if attr == 'src' or attr == '_src':
                imgurl = img[attr]
            if attr == 'alt':
                imgalt = img[attr]
from sys import argv
import os
import re

script, directory, csv = argv
identified = open(csv).read()

# build the list of identified queries from the cleaned-up csv
identified_peaks = []
for line in open(csv):
    identified_peaks.append(line[:-1])
print('total identified peaks: ', len(identified_peaks))

output = open('identfied.mgf', 'w')  # new name for the new mgf file

for mgf in os.listdir(directory):
    if not re.search('mgf', mgf):  # re.search, not the nonexistent re.find
        continue
    s = open(os.path.join(directory, mgf)).read()  # join with the directory so the path resolves
    queries = {}
    match = re.compile('BEGIN IONS.*?END IONS\n', re.DOTALL)
    peak_list = re.findall(match, s)  # build the list of queries
    print('total MS/MS spectra: ', len(peak_list))
    for query in peak_list:
        title = re.search('TITLE=.*? ', query)  # pull the TITLE line out of the query
        #print(title.group()[6:])
def hstack(name):
    return re.search(r'\[([0-9]+)\]', name)  # re.search, not the nonexistent re.find
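# A hedged usage sketch for hstack() above: it returns a match object, so pull the
# bracketed index out with group(1). The sample name is illustrative only.
m = hstack("conv1/weights[3]")
print(m.group(1) if m else None)   # expected: '3'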
def create_job(cores=default_cores, mem='', gui='', name='', vnc=False): """ Create a job in the queue, wait for it to run, and then attach Ctl-C after submission will not kill job, it will only kill attach queue """ # Figure out memory request try: mem = str(int(cores*default_max_mem/default_max_cores)) + 'GB' if not mem else str(int(mem)) + 'GB' except ValueError: print("Incorrect formatting for memory request, please submit an integer multiple in GB") sys.exit(1) # Create job name if gui: gui_name = gui.split(' ')[0] job_name = name + '_' + gui_name + '_int_gui' if name else gui_name + '_int_gui' elif vnc: job_name = name + '_int_vnc' if name else 'int_vnc' else: job_name = name + '_int_tmux' if name else 'int_tmux' # Prep the job template = "#!/bin/bash\n#PBS -S /bin/bash\n" template = ''.join([template, "#PBS -q ", interactive_queue, "\n#PBS -N ", job_name, '\n#PBS -l nodes=1:ppn=' + str(cores), '\n#PBS -l mem=' + mem, '\n#PBS -e ' + os.environ['HOME'] + '/.' + job_name + '.error', '\n#PBS -o /dev/null']) if gui: template = template + ("\n\nexport QCONNECT=gui" "\n\njob_id=$(echo $PBS_JOBID | sed 's#\..*##g')\n" "xpra start :$job_id\n" "export DISPLAY=:${job_id}\n" "sleep 1\n" + gui + "\n" "PID=$!\n" "sleep 1\n" "while true\n" "do\n" " if kill -0 $PID > /dev/null 2>&1; then\n" " sleep 5\n" " else\n" " xpra stop :${job_id}\n" " xpra list >/dev/null 2>/dev/null\n" " rm ~/.xpra/:${job_id}.log 2>/dev/null\n" " exit 0\n" " fi\n" "done\n") elif vnc: if not vnc_installed: print("It appears that vncviewer is not in your PATH, I cannot create a VNC connection", file=stderr) print("Exiting", file=stderr) sys.exit(-1) template = template + ("\n\nexport QCONNECT=vnc\n\nvncserver -geometry " + vnc_geometry + " -fg\n") else: template = template + ( "\n\nexport QCONNECT=tmux" "\n\nsession_id=$(echo $PBS_JOBID | sed 's#\..*##g')\n") if xpra_installed: template = template + ("if xpra start :$session_id >/dev/null 2>/dev/null; then\n" " export DISPLAY=:$session_id\n" "fi\n") template = template + ( "CMD=\"tmux new-session -s $session_id -d\"\n" "$CMD\n" "PID=$(ps axo pid,user,cmd | grep tmux | grep $USER | grep -v grep | awk '{print $1}')\n" "while true\n" "do\n" " if kill -0 $PID > /dev/null 2>&1; then\n" " if [[ ! $(tmux ls | grep $session_id) ]]; then\n") if xpra_installed: template = template + (" xpra stop :$session_id >/dev/null 2>/dev/null\n" " xpra list >/dev/null 2>/dev/null\n" " rm ~/.xpra/:$session_id.log 2>/dev/null\n") template = template + ( " exit 0\n" " else\n" " sleep 5\n" " fi\n" " else\n") if xpra_installed: template = template + (" xpra stop :$session_id >/dev/null 2>/dev/null\n" " xpra list >/dev/null 2>/dev/null\n" " rm ~/.xpra/:$session_id.log 2>/dev/null\n") template = template + ( " exit 0\n" " fi\n" "done\n") if debug: print(template) pbs_command = (['qsub']) # Submit the job pbs_submit = subprocess.Popen(pbs_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE) pbs_submit.stdin.write(template.encode()) pbs_submit.stdin.close() # Get job number job_no = (pbs_submit.stdout.read().decode().rstrip()) try: job_no = find(r'[0-9]+', job_no)[0] except IndexError: print("PBS Submission failed with message:\n{}".format(job_no), file=stderr) sys.exit(1) print("Job", job_name, "created with job id", job_no, "\n") sleep(1) return(job_no)
def get_table_info(self, table):
    # re.search (not the nonexistent re.find); the original pattern had a stray paren and a
    # {table_name} placeholder that .format(table) never filled in. The reconstructed
    # pattern below is a best guess at the intended CREATE TABLE match.
    pattern = r'CREATE TABLE {0} \((.*?)\)'.format(re.escape(table))
    match = re.search(pattern, self._file_content, re.MULTILINE | re.DOTALL)
    table_data = match.group(1) if match else None
    return table_data