Example #1
def range(lower, upper, sem):
    with open(str(lower) + "-" + str(upper) + ".csv", "w") as log:
        while (lower <= upper):
            with requests.Session() as req:
                dom = BeautifulSoup(
                    req.post("http://www.wbutech.net/show-result.php",
                             data={
                                 "semno": sem,
                                 "rollno": lower,
                                 "rectype": 1
                             },
                             headers={
                                 "Referer":
                                 "http://www.wbutech.net/result_odd.php"
                             }).text)
                if len(dom.find_all("th")) == 14:
                    name = find("Name : (.+)",
                                dom.find_all("th")[1].text.strip()).group(1)
                    reg = find("Registration No. : (.+) OF",
                               dom.find_all("th")[3].text.strip()).group(1)
                    sgpa = find("SEMESTER : ([0-9.]+)",
                                dom.find_all("td")[54].text.strip()).group(1)
                    print(name + ", " + str(lower) + ", " + reg + ", " + sgpa)
                    log.write("\"" + name + "\",\"" + str(lower) + "\",\"" +
                              reg + "\"," + sgpa + "\n")
                else:
                    print(str(lower) + " MISSING")
                req.close()
                lower += 1
def phoneticcase(filetotopen):
    f = open(filetotopen, 'r', encoding="utf8")
    message = f.readlines()

    f.close()
    phoneticarr = []
    newphonecticlist = []

    for m in message:

        for i in m.strip().split(","):
            # print(i)
            for re in i.strip().split("'"):
                # skip bracket fragments and empty or whitespace-only tokens
                if re.find("[") == -1 and re.find("]") == -1 \
                        and re not in ("", " "):
                    # print(re)
                    phoneticarr.append(str(re).lower())

    # for index,x1 in enumerate(phoneticarr):
    #     print(index)
    #     print(len(phoneticarr))
    #     if x1 not in newphonecticlist:
    #         newphonecticlist.append(x1)
    #
    # return newphonecticlist
    newphonecticlist = list(set(phoneticarr))
    # print(phoneticarr)
    return newphonecticlist
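A side note on the dedupe step: `list(set(phoneticarr))` is the fast path but throws away the original token order, which is what the commented-out loop preserved. A minimal order-preserving alternative (a sketch, not part of the original project):

def dedupe_preserving_order(items):
    # dict keys are insertion-ordered (Python 3.7+), so this keeps the
    # first occurrence of every token and drops later repeats
    return list(dict.fromkeys(items))

# dedupe_preserving_order(["b", "a", "b"]) -> ["b", "a"]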
Example #3
def post_request(cookies, class_code, hashkey, img_data, pred_type="ydm"):
    global THREAD_FLAG
    # while count < 50:
    #     check_url = 'https://dean.bjtu.edu.cn/course_selection/courseselecttask/selects_action/?action=load&iframe=school&page=1&perpage=500'
    #     res = requests.get(check_url, cookies=cookies, headers=check_classheader)
    #     count += 1
    try:

        if pred_type == "ydm":
            req_id, answer = yundama.decode(img_data, 2004, 20)
        elif pred_type == "chaoren":
            res = chaoren_client.recv_byte(img_data)
            answer, req_id = res[u'result'], res[u'imgId']
        elif pred_type == "pp":
            answer, req_id = api.Predict(40400, img_data)
        elif pred_type == "cjy":
            answer, req_id = chaojiying.PostPic(img_data, 2004)
        data = {
            'checkboxs': class_code,
            # 'is_cross':True,
            'hashkey': hashkey,
            'answer': answer
        }
        re = requests.post(
            'https://dean.bjtu.edu.cn/course_selection/courseselecttask/selects_action/?action=submit',
            cookies=cookies,
            headers=robclass_headers,
            allow_redirects=False,
            data=data)
        if re.status_code == 503:
            print(re.status_code)
            print("Resubmitting the course-grab request")
            time.sleep(0.3)
            return post_request(cookies, class_code, hashkey, img_data, pred_type)
        re = re.headers['Set-Cookie']
        message = re[re.find('[['):re.find(']]') + 2]
        res = str(json.loads(eval("'" + message + "'")))
        print(pred_type + "请求:" + str(data))
        print(res)
        if "选课成功" in res:
            THREAD_FLAG = True
            return 200
        elif "课堂无课余量" in res:
            return 404
        elif "验证码" in res:
            if pred_type == 'pp':
                api.Justice(req_id)
            elif pred_type == 'cjy':
                chaojiying.ReportError(req_id)
            elif pred_type == 'ydm':
                yundama.report(req_id)
            else:
                chaoren_client.report_err(req_id)
            return 403
            # 完全错误,比如有类似的课了
        else:
            return 500
    except Exception as e:
        print("139postreq bug :" + str(e))
        return 403
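The 503 branch above retries by recursing into post_request; without the added return, the function would fall through and keep working on the stale 503 response. A flat retry loop avoids both the fall-through and unbounded recursion; a hedged sketch of the same shape (`submit_once` is a placeholder callable, not from the original):

import time

def retry_submit(submit_once, max_attempts=10, delay=0.3):
    res = None
    for _ in range(max_attempts):
        res = submit_once()          # placeholder: performs one POST
        if res.status_code != 503:
            return res               # the server gave a real answer
        time.sleep(delay)            # brief pause before resubmitting
    return res                       # hand back the last 503 once exhausted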
Example #4
 def getip_66ip(self):
     for page in range(1, 4):
         url = 'http://www.66ip.cn/{}.html'.format(page)
         html = getpage(url)
         if html:
             doc = pq(html)
             res = doc('#main table tr:gt(0)').items()
             for re in res:
                 address = re.find('td').eq(0).text()
                 port = re.find('td').eq(1).text()
                 if address and port:
                     result = '{0}:{1}'.format(address, port)
                     yield result.replace(' ', '')
Example #6
def range(lower, upper, sem):
    while (lower <= upper):
        with requests.Session() as req:
            dom = BeautifulSoup(req.post("http://www.wbutech.net/show-result.php", data={"semno": sem, "rollno": lower, "rectype": 1}, headers={"Referer": "http://www.wbutech.net/result_odd.php"}).text)
            if len(dom.find_all("th")) == 14:
                name = find("Name : (.+)", dom.find_all("th")[1].text.strip()).group(1)
                reg = find("Registration No. : (.+) OF", dom.find_all("th")[3].text.strip()).group(1)
                sgpa = find("SEMESTER : ([0-9.]+)", dom.find_all("td")[54].text.strip()).group(1)
                print name + ", " + str(lower) + ", " + reg + ", " + sgpa
            else:
                print str(lower) + " MISSING"
            req.close()
            lower += 1
Example #7
 def get_chapterurl(self, response):
     item = DingdianItem()
     item['name'] = str(response.meta['name']).replace('\xa0', '')
     item['novelurl'] = response.meta['novelurl']
     re = BeautifulSoup(response.text, 'lxml')
     category = re.find('table').find('a').get_text()  # category
     author = re.find('table').find_all('td')[1].get_text()
     bash_url = re.find('p', class_='btnlinks').find('a',
                                                     class_='read')['href']
     name_id = str(bash_url)[-6:-1].replace('/', '')
     item['category'] = str(category).replace('/', '')
     item['author'] = str(author).replace('/', '')
     item['name_id'] = name_id
     return item
def get_video_site_info(html):
	site_url = 'http://'
	finded = re.search('qq.com', html)
	if finded:
		site_url = site_url+'v.qq.com/'
		return 'qq'
	finded = re.search('youku.com', html)
	if finded:
		site_url = site_url+'v.youku.com'
		return 'youku'
	finded = re.search('tudou.com', html)
	if finded:
		site_url = site_url+'v.tudou.com'
		return 'tudou'
	else:
		print 'error not support yet...'
		return ''
Example #9
def parse_proxies(country="all"):
    pattern = re.compile(r"(?:\d{1,3}\.){3}\d{1,3}:\d+")
    proxies = []

    for page_num in range(1, PAGES_COUNT + 1):
        print("page:", page_num)
        soup = get_soup(PROXIES_URL % (country.lower(), page_num))
        table = soup.find("table", id="proxy_list")

        if not table: 
            continue

        rows = table.find_all("tr")

        print("rows:", len(rows))

        for row in rows:
            script = row.find("script").text
            cipher = re.findall(r'Base64.decode\("(.+)"\)', script)[0]
            ip = base64.b64decode(cipher).decode()
            proxy = ip

            if proxy and pattern.search(proxy):
                proxies.append(proxy)

    return proxies
Example #10
def get_subclass_name_from_item(item):
    match = re.search(r'\n\n(?P<subclass>.+)\n', item)
    subclass_name = match.group('subclass')

    print("\tsubclass: " + subclass_name)

    return subclass_name
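Worth restating, since it is the recurring bug across these listings: Python's re module exposes search, match, fullmatch, findall, and finditer, but no find; re.search returns a match object or None, which is why the fix above can only call .group('subclass') on an actual match. A self-contained demonstration of the named-group pattern used here (the sample string is invented):

import re

item = "header\n\nMachinery\nrest of item"
match = re.search(r'\n\n(?P<subclass>.+)\n', item)
if match:                            # re.search gives None when nothing matches
    print(match.group('subclass'))   # -> Machinery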
Example #11
def parse(document, clean_html=True, unix_timestamp=False, encoding=None):
    """Parse a document and return a feedparser dictionary with attr key access.
    If clean_html is False, the html in the feed will not be cleaned.  If
    clean_html is True, a sane version of lxml.html.clean.Cleaner will be used.
    If it is a Cleaner object, that cleaner will be used.  If unix_timestamp is
    True, the date information will be a numerical unix timestamp rather than a
    struct_time.  If encoding is provided, the encoding of the document will be
    manually set to that."""
    if isinstance(document, six.text_type):
        encoding = 'utf8'
        m = re.search(r'''<\?xml.*?encoding=['"](.*?)['"].*\?>''', document)  # declared XML encoding, if any
        document = document.encode(encoding)
    if isinstance(clean_html, bool):
        cleaner = default_cleaner if clean_html else fake_cleaner
    else:
        cleaner = clean_html
    result = feedparser.FeedParserDict()
    result['feed'] = feedparser.FeedParserDict()
    result['entries'] = []
    result['bozo'] = 0
    try:
        parser = SpeedParser(document, cleaner, unix_timestamp, encoding)
        parser.update(result)
    except Exception as e:
        if isinstance(e, UnicodeDecodeError) and encoding is True:
            encoding = chardet.detect(document)['encoding']
            document = document.decode(encoding, 'replace').encode('utf-8')
            return parse(document, clean_html, unix_timestamp, encoding)
        import traceback
        result['bozo'] = 1
        result['bozo_exception'] = e
        result['bozo_tb'] = traceback.format_exc()
    return result
Example #12
def get_data(url):
    res = requests.get(url)
    result_json = res.json()
    print(result_json)
    next_page_url = result_json["paging"].get("next")
    data = result_json["data"]

    # get first data
    for item in data:
        message = item.get("message")
        if message:
            # whitelist posts
            if any(whitelist in message.lower() for whitelist in whitelists):
                # check if there are blacklisted words
                if not any(blacklist in message.lower()
                           for blacklist in blacklists):
                    number = ""
                    match = pattern.search(message)
                    if match:
                        if match.group(0):
                            number = match.group(0).replace("/", "")
                        elif match.group(1):
                            number = match.group(1).replace("/+", "")
                            # ensure that we only keep the phone-number digits
                            number = re.sub(r"\D", "", number)
                        # check whether the number already exists
                        if number not in numbers:
                            messages.append(message)
                            numbers.append(number)

    return next_page_url
def reorderLines(logFileSize, logfile):
    # WRITE YOUR CODE HERE
    id_map = {}
    content_words = []
    content_numbers = []
    for log_line in logfile:
        log_contents = log_line.split(' ')
        id = log_contents[0]
        content = ' '.join(log_contents[1:])
        id_map[content] = id
        if re.search(r'\d+', content):
            content_numbers.append(content)
        else:
            content_words.append(content)

    #sort the content_words lexicographically
    content_words_sorted = sorted(content_words)

    result = []
    for sorted_line in content_words_sorted:
        if sorted_line in id_map:
            result_line = id_map[sorted_line] + " " + sorted_line
            result.append(result_line)

    for number_line in content_numbers:
        if number_line in id_map:
            result_line = id_map[number_line] + " " + number_line
            result.append(result_line)

    return result
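A small usage sketch for reorderLines (the input is invented for illustration): word logs come back sorted lexicographically, followed by the number logs in their original order.

logs = ["a1 9 2 3 1", "g1 act car", "ab1 off key dog", "a8 act zoo"]
for line in reorderLines(len(logs), logs):
    print(line)
# g1 act car / a8 act zoo / ab1 off key dog, then a1 9 2 3 1 unchanged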
Example #15
def groupon_poster_gz(request):
	cmd = 'curl -H "Host:groupon.mlapi.meilishuo.com" 10.0.0.55/groupon/groupon_poster'
	re = os.popen(cmd).read()
	reStatus = "success"
	if re.find('"error_code":0') < 0:
		reStatus = "fail"
	return HttpResponse(reStatus)
Example #16
def getNumbers(varInput):
	variables = find(r'[\d.]+', varInput)

	if len(variables) >= 2:
		return float(variables[0]), float(variables[1])
	else:
		print "\'%s\' is not valid input. Please try again.\n" % varInput
Example #17
    def BIOtagSingleOffset_NoOverlap(self, element, text2, i1, i2,
                                     last_offset):

        # before drug words start
        before_part = text2[last_offset:i1]
        element["text_splits"].append(before_part)

        # drug words
        element["text_splits"].append(self.BItagWSpace)

        # now add the next word only? or what?

        match = re.search(r'\b', text2[i1:])
        i2 = match.start() if match else -1

        if i2 > -1:

            entity_text = text2[i1:]
            element["text_splits"].append(entity_text)
            #entity_text2 = self.BIOTagWord(entity_text)
            #deviation += len(entity_text2) - len(entity_text)
            element["text_splits"].append(self.BIOtagWSpace)

            last_offset = i1 + len(entity_text) + 1

        return last_offset
Example #18
def get_class_and_subclass_codes_from_item(item):
    match = re.search(r'(?P<class>\d{2} )(?P<subclass>\d{2}\n)', item)
    class_code = match.group('class')
    subclass_code = match.group('subclass')

    print("\tclass: " + class_code + "\n\tsubclass: " + subclass_code)

    return [class_code , subclass_code]
Example #19
def println(name):
    global var
    rule = r'“(.*?)”'
    try:
        print("\n")
        print(var[name])
    except KeyError:
        print("\n")
        print(re.search(rule, name))
Example #20
def is_valid_channel(name):
    """Returns whether NAME is a valid channel name, that is, it starts with
    any of '#', '&', '+', or '!', and does not contain NUL, BEL, CR, LF, ' ',
    ',', or ':'.
    """
    return (
      (name[0] in CHANNEL_PREFIXES)
      and (re.search('[\0\7\r\n ,:]', name) is None)
    )
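The original == -1 test is str.find semantics: str.find returns a character index (or -1), while re.search returns a match object or None, hence the is None form in the fix. Contrasting the two on a quick example:

import re

name = "#general"
print(name.find(","))                       # -1: plain substring search, no comma
print(re.search('[\0\7\r\n ,:]', name))     # None: no forbidden character
print(re.search('[\0\7\r\n ,:]', "#a,b"))   # a match object: comma found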
Example #21
def gerarAfndGramatica(afnd, gramatica, alfabeto):  # builds the NFA from the grammars
    if not afnd:
        afnd.update({0: {}})
    aTemp = {}
    mpRgs = {}
    for regra in gramatica:
        simbolos = find(r'(\w*<\w+>|\w+|&)', regra)
        if simbolos[0] in mpRgs.keys():
            # check whether the rule was already created and stored in the rule map
            iRg = mpRgs[simbolos[0]]  # iRg holds the rule's index
        else:
            iRg = len(aTemp)
            aTemp.update({iRg: {}})
            mpRgs.update({simbolos[0]: iRg})
        for simbolo in simbolos[1:]:
            term = find(r'^\w+', simbolo)
            nTerm = find(r'<\w+>', simbolo)
            term = '&' if not term else term[0]
            if term not in alfabeto:
                alfabeto.append(term)
            if not nTerm:  # production with no non-terminal: generate a terminal rule
                rg = aTemp[iRg]
                if term in rg.keys():
                    rg[term].append(len(aTemp))
                else:
                    rg.update({term: [len(aTemp)]})
                aTemp.update({len(aTemp): {'*': [1]}})
            else:
                nTerm = nTerm[0]
                if nTerm in mpRgs.keys():
                    rg = mpRgs[nTerm]
                else:
                    rg = len(aTemp)
                    mpRgs.update({nTerm: rg})
                    aTemp.update({rg: {}})
                mp = aTemp[iRg]
                if term in mp.keys():
                    mp[term].append(rg)
                else:
                    mp.update({term: [rg]})

    unirAutomatos(afnd,
                  aTemp)  # merge the automata, sharing the initial symbol
def get_attributes(tag):
    keys = re.findall(r' ([\w-]+?)=', tag)
    result = {}
    for key in keys:
        match = re.search('%s="(.*?)"' % key, tag)
        if match is None:
            print('cannot find attribute "%s" in %s' % (key, tag))
            continue
        result[key] = match.group(1)
    check_attributes(keys, tag)
    return result
Example #23
def usd_to_inr():
    from bs4 import BeautifulSoup
    import requests
    import re

    amount = re.search(r'\d+', x).group(0)
    url = 'https://www.xe.com/currencyconverter/convert/?Amount=' + amount + '&From=USD&To=INR'
    html_text = requests.get(url).text
    soup = BeautifulSoup(html_text, 'lxml')
    value = soup.find('p', class_='result__BigRate-sc-1bsijpp-1 iGrAod').text
    print_and_say(value)
Example #24
def getIP():
    try:
        res=urllib2.urlopen('http://whois.pconline.com.cn/ipJson.jsp',timeout=2)
    except:
        return None
    if res.getcode()!=200:
        return None
    re=res.read().decode('gbk').encode('utf8')
    res.close()
    re=re[re.rfind('{'):re.find('}')+1]
    return json.loads(re)
Example #25
 def getIP(self):
     try:
         res=urllib2.urlopen('http://whois.pconline.com.cn/ipJson.jsp',timeout=2000)
     except:
         return None
     if res.getcode()!=200:
         return None
     re=res.read().decode('gbk').encode('utf8')
     res.close()
     re=re[re.rfind('{'):re.find('}')+1]
     return json.loads(re)
Example #26
 def get_dependencies(self):
     dependencies = []
     dependencies_in_angular_quotes = []
     dependencies_in_angular_braces = re.findall(r'\<(.*?)\>',
                                                 self.raw_content)
     print(self.raw_content)
     for line in self.raw_content.split("\n"):
         if line.startswith('#'):
             # findall returns the quoted names; extend the list with them
             dependencies_in_angular_quotes += re.findall(r'"([^"]*)"', line)
     self.dependencies = dependencies_in_angular_braces + dependencies_in_angular_quotes
Example #27
def fill(self, char):
    if not self: return self[FILL]
    if re.search("reset", char.lower()):
        del self[FILL]
        self[_CHANGE]
        return None

    char = char[0:1]  # only one character allowed for fill value.
    char = re.sub("[^\x20-\x7f]+", "", char)  # keep only printable ASCII
    if char: self[FILL] = char
    return self[FILL]
Example #28
def matching_one(regexes, string):
    '''Returns True if and only if one of the given regexes (a list) matches the
    string. For example, if regexes is ['abc','foo'] and the string is 'blabcar',
    it should return True, but not if regexes is ['lalala','foo'] and the string
    is 'blabcar'.

    3/3 points
    '''
    for item in regexes:
        if re.search(item, string):
            return True
    return False
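Equivalently, the regex list can be collapsed into a single alternation so the string is scanned only once; a small sketch of that variant (not part of the original exercise):

import re

def matching_one_alt(regexes, string):
    # wrap each pattern in a non-capturing group so '|' binds whole sub-patterns
    combined = "|".join("(?:%s)" % r for r in regexes)
    return re.search(combined, string) is not None

print(matching_one_alt(['abc', 'foo'], 'blabcar'))     # True
print(matching_one_alt(['lalala', 'foo'], 'blabcar'))  # False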
Example #29
def parse_jmdict(file=JMDICT_FILE):  # {{{1
    alang = "{http://www.w3.org/XML/1998/namespace}lang"
    data = []
    with gzip.open(file) as f:
        with click.progressbar(ET.parse(f).getroot(),
                               width=0,
                               label="parsing jmdict") as bar:
            for e in bar:
                seq, pos = int(e.find("ent_seq").text), ()
                kanji, reading, sense = [], [], []
                for ke in e.findall("k_ele"):  # 0+ kanji elem
                    keb = ke.find("keb").text.strip()  # word/phrase w/ kanji
                    info = tuple(x.text.strip() for x in ke.findall("ke_inf"))
                    assert all("\n" not in x and "\x1e" not in x for x in info)
                    kanji.append(
                        Kanji(keb, _kanji_chars(keb), info, _prio_k(ke)))
                for re in e.findall("r_ele"):  # 1+ reading elem
                    reb = re.find("reb").text.strip()  # reading elem
                    restr = tuple(x.text.strip()
                                  for x in re.findall("re_restr"))
                    # reading only applies to keb subset
                    info = tuple(x.text.strip() for x in re.findall("re_inf"))
                    assert all("\n" not in x and "\x1e" not in x
                               for xs in [restr, info] for x in xs)
                    reading.append(Reading(reb, restr, info, _prio_r(re)))
                for se in e.findall("sense"):  # 1+ sense elem
                    pos = tuple(x.text.strip()
                                for x in se.findall("pos")) or pos
                    # part of speech, applies to following senses too
                    lang, gloss = None, []
                    for x in se.findall("gloss"):
                        l = x.get(alang, "eng")
                        if l in LANGS and x.text:
                            assert lang is None or lang == l
                            lang = l
                            gloss.append(x.text.strip())
                    if lang is None: continue
                    s_inf = tuple(x.text.strip() for x in se.findall("s_inf"))
                    misc = tuple(x.text.strip() for x in se.findall("misc"))
                    xref = tuple(y.strip() for x in se.findall("xref")
                                 for y in x.text.split("・")
                                 if not y.strip().isdigit())
                    assert seq < MAXSEQ
                    assert all("\n" not in x and "\x1e" not in x
                               for xs in [pos, gloss, s_inf, misc, xref]
                               for x in xs)
                    sense.append(
                        Sense(pos, lang, tuple(gloss), s_inf + misc, xref))
                krs = (tuple(x) for x in [kanji, reading, sense])
                jlpt = jlpt_level(kanji, reading, _usukana(sense))
                data.append(Entry(seq, jlpt, *krs))
            return data
def check_attributes(attributes, tag):
    check = tag
    for key in attributes:
        match = re.search('%s=".*?"' % key, tag)
        if match:
            check = check.replace(match.group(0), '')
    check = check.replace('<img', '')
    check = check.replace('/>', '')
    check = check.replace('>', '')
    check = check.replace(' ', '')
    if len(check) > 0:
        print('tag has unexpected attributes: %s\n%s\n%s' %
              (attributes, check, tag))
        raise Exception('unexpected attributes')
Example #31
def grep(filename, string):
    '''Returns all lines that match a given string. Interpret the string as a
    regular expression, so e.g. if the string is "foo[bp]ar" it should return
    all those lines that contain the words foobar and/or foopar.

    It shouldn't return the lines altogether in a list, but rather one by one
    using yield.

    5/5 points
    '''
    with open(filename) as f:
        for line in f:
            if re.search(string, line):
                yield line
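Since grep yields matches lazily, it can be consumed line by line without loading the whole file; a usage sketch (the file name is illustrative):

for line in grep("server.log", "foo[bp]ar"):   # "server.log" is a placeholder
    print(line, end="")                        # yielded lines keep their newlines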
Example #32
 def sanitize_token(self, token, strip_tokens=False):
     if token["type"] in (tokenTypes["StartTag"], tokenTypes["EndTag"], 
                          tokenTypes["EmptyTag"]):
         if token["name"] in self.allowed_elements:
             if token.has_key("data"):
                 attrs = dict([(name,val) for name,val in
                               token["data"][::-1] 
                               if name in self.allowed_attributes])
                 for attr in self.attr_val_is_uri:
                     if not attrs.has_key(attr):
                         continue
                     val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
                                            unescape(attrs[attr])).lower()
                     if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
                         (val_unescaped.split(':')[0] not in 
                          self.allowed_protocols)):
                         del attrs[attr]
                 for attr in self.svg_attr_val_allows_ref:
                     if attr in attrs:
                         attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                              ' ',
                                              unescape(attrs[attr]))
                 if (token["name"] in self.svg_allow_local_href and
                     'xlink:href' in attrs and re.search(r'^\s*[^#\s].*',
                                                         attrs['xlink:href'])):
                     del attrs['xlink:href']
                 if attrs.has_key('style'):
                     attrs['style'] = self.sanitize_css(attrs['style'])
                 token["data"] = [[name,val] for name,val in attrs.items()]
             return token
         else:
             if strip_tokens:
                 return None
             if token["type"] == tokenTypes["EndTag"]:
                 token["data"] = "</%s>" % token["name"]
             elif token["data"]:
                 attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
                 token["data"] = "<%s%s>" % (token["name"],attrs)
             else:
                 token["data"] = "<%s>" % token["name"]
             if token["type"] == tokenTypes["EmptyTag"]:
                 token["data"]=token["data"][:-1] + "/>"
             token["type"] = tokenTypes["Characters"]
             del token["name"]
             return token
     elif token["type"] == tokenTypes["Comment"]:
         pass
     else:
         return token
Example #33
 def sanitize_token(self, token):
     if token["type"] in (tokenTypes["StartTag"], tokenTypes["EndTag"],
                          tokenTypes["EmptyTag"]):
         if token["name"] in self.allowed_elements:
             if token.has_key("data"):
                 attrs = dict([(name, val)
                               for name, val in token["data"][::-1]
                               if name in self.allowed_attributes])
                 for attr in self.attr_val_is_uri:
                     if not attrs.has_key(attr):
                         continue
                     val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
                                            unescape(attrs[attr])).lower()
                     if (re.match("^[a-z0-9][-+.a-z0-9]*:", val_unescaped)
                             and (val_unescaped.split(':')[0]
                                  not in self.allowed_protocols)):
                         del attrs[attr]
                 for attr in self.svg_attr_val_allows_ref:
                     if attr in attrs:
                         attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                              ' ', unescape(attrs[attr]))
                 if (token["name"] in self.svg_allow_local_href
                         and 'xlink:href' in attrs
                          and re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
                     del attrs['xlink:href']
                 if attrs.has_key('style'):
                     attrs['style'] = self.sanitize_css(attrs['style'])
                 token["data"] = [[name, val]
                                  for name, val in attrs.items()]
             return token
         else:
             if token["type"] == tokenTypes["EndTag"]:
                 token["data"] = "</%s>" % token["name"]
             elif token["data"]:
                 attrs = ''.join([
                     ' %s="%s"' % (k, escape(v)) for k, v in token["data"]
                 ])
                 token["data"] = "<%s%s>" % (token["name"], attrs)
             else:
                 token["data"] = "<%s>" % token["name"]
             if token["type"] == tokenTypes["EmptyTag"]:
                 token["data"] = token["data"][:-1] + "/>"
             token["type"] = tokenTypes["Characters"]
             del token["name"]
             return token
     elif token["type"] == tokenTypes["Comment"]:
         pass
     else:
         return token
def _get_conflict_file_name(outfile):
    """
    """
    outdir = '/'.join(outfile.split('/')[:-1])
    outfilename = outfile.split('/')[-1]
    filelist = os.listdir(outdir)
    previous_conflicts = [filename.split('.')[2] for filename in filelist 
                          if filename.startswith(outfilename) and len(filename.split('.')) == 3]
    if previous_conflicts:
        max_conflict = max([int(re.findall('[0-9]{3}', conflict)[0])
                            for conflict in previous_conflicts])
    else:
        max_conflict = -1
    
    return '{}.conflict_{:03.0f}'.format(outfile,max_conflict+1)
Example #35
def capo(hope_comments_count):
    url_capogames = "http://www.capogames.net/samw/board/board.do"
    souping_capogames = souping(url_capogames)

    for row in souping_capogames.find_all('tr'):
        row_count = row.find('span').contents[0]
        reply_count = re.search(r'\d+', row_count).group(0)
        if int(reply_count) > hope_comments_count:
            site_name = 'samw'
            number = row.find('td').contents[0].contents[0]
            link = 'http://www.capogames.net/samw/'+row.find('td').a.get('href')
            title = row.find('td').a.contents[0]
            name = row.find('td').a.contents[0]
            raw_date = row.find('td').contents[0]
            best_article_list([site_name, number, title, link, reply_count])
Example #36
def sanitize_html(tag_name, D, SElms=SElms, SAttr=SAttr, SProtocols=SURITypes):
    """
    tag_name -> The tag's name
    D -> The tag's attributes dict
    SElms -> The allowed elements
    SAttr -> The allowed attributes
    SProtocols -> The allowed protocols (see Tags.DURITypes)
    """

    tag_name = tag_name.lower()  # HACK!
    if tag_name in SElms:
        for k in list(D.keys()):
            # Delete unallowed attributes
            if not k in SAttr:
                del D[k]

        for attr in SURIAttrs:
            # Validate URLs using REs
            if not attr in D:
                continue

            val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
                                   unescape(D[attr])).lower()

            if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and \
                (val_unescaped.split(':')[0] not in SURITypes):
                del D[attr]

        for attr in svg_attr_val_allows_ref:
            # SVG something something...
            if attr in D:
                D[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', ' ',
                                 unescape(D[attr]))

        if (tag_name in svg_allow_local_href and 'xlink:href' in D
                and re.search(r'^\s*[^#\s].*', D['xlink:href'])):
            # ???
            # Disable SVG links?
            del D['xlink:href']

        if 'style' in D and D['style']:
            # Sanitize the CSS
            D['style'] = sanitize_css(D['style'])
        return tag_name, D

    else:
        # Don't allow!
        return None, None
Example #37
def clean(t_list, comm_trig=False):  # cleaning up the name html.
    list_ = []
    command = []
    for i in t_list:
        try:
            list_.append(i.get('data-name'))
            if i.get('data-name') is None:
                #.find('a', attr = 'data-name'))
                command.append(i.find('a', href=re.compile('/mtg-card/')))

        except:  # need to come up with a conditional to define the commander..
            #command.append(i.get('href'))
            print(
                'if there are more than 3-4 of these there is a problem here!!!'
            )
    return command + list_  # remember partner commanders, need to add cleaning for that.
Example #38
    def countNumDrugWords(self, element):

        count = 0
        element["verified_drug_words"] = []

        # get all sorted offsets
        # merge offsets that overlap
        overlaped_offsets = []
        for off in element["offsets"]:
            if len(overlaped_offsets) == 0:
                overlaped_offsets.append(off)
            else:
                i1 = off[0]
                i2 = off[1]

                if i2 <= overlaped_offsets[-1][1]:
                    # as the offsets are sorted by i1,
                    # we could skip this offsets as it is contained in the previous one
                    pass

                elif i2 > overlaped_offsets[-1][1] and i1 < overlaped_offsets[
                        -1][1]:
                    # this one overlaps but includes more words after it
                    # so we should increment i2 of the previous offset to current i2
                    overlaped_offsets[-1][1] = i2

                else:
                    overlaped_offsets.append(off)

        for groupoffset in overlaped_offsets:
            if len(groupoffset) == 2:
                i1 = groupoffset[0]
                i2 = groupoffset[1]
                fragment = element["text_original"][i1:i2 + 1]
                fwords = self.tokenize(fragment)
                element["verified_drug_words"].extend(fwords)
                count += sum([1 for word in fwords])
            else:
                i1 = int(groupoffset[0])
                match = re.search(r'\b', element["text_original"][i1:])
                i2 = i1 + match.start() if match else i1
                if i2 > i1:
                    fragment = element["text_original"][i1:i2 + 1]
                    fwords = self.tokenize(fragment)
                    element["verified_drug_words"].extend(fwords)
                    count += sum([1 for word in fwords])

        return count
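The overlap-merging loop above is a standard interval merge; isolated into a self-contained sketch (assuming offsets are [start, end] pairs already sorted by start, as the original comments state):

def merge_offsets(offsets):
    merged = []
    for start, end in offsets:
        if merged and start < merged[-1][1]:
            # overlapping interval: extend the previous end if this one reaches further
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])
    return merged

print(merge_offsets([[0, 5], [3, 8], [10, 12]]))  # [[0, 8], [10, 12]]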
Example #40
def fromurl(url):
    """Return app/class/method/function pointed by an url

    1. raise ValueError if url is external
    2. return None      if url does not match any callable app/class/...

    else return target item
    """
    if re.search(r'[^\w/+#-]', url):
        raise ValueError

    target = None
    for part in url.split("/"):
        pass
        # aaa/bb/cc/dd

    return
Example #41
    def updatePermissions(self, permissions):
        for p in permissions:
            obj = {}
            obj["permission_k"] = p["permission_k"]
            obj["date_created"] = datetime.datetime.now().strftime(
                "%Y-%m-%d %H:%M:%S")

            # //remove this permission from all roles
            roles = self.role.getByPermissions(p["permission_k"])
            for role in roles:
                self.permission.deleteRolePermissions(role)

            #//add this permission to each role
            for key in p.keys():
                match = re.find("/^role_/", key)
                if match:
                    obj["role_k"] = key[5:]
                    obj["value"] = p[key]
                    self.permissions.addRolePermissions(obj)

        return {"success": True, "message": "Permissions successfully saved"}
Example #43
def fetch(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            line = line.strip()
            # print(line)
            a, b = line.split(':')
            anchors.append((a, b))
    txt = '| 主播 | 标题 | 状态 | 订阅 |\n|:---:|:---:|:---:|:---:|\n'  # columns: anchor | title | status | subscribers
    for suffix, anchor in anchors:
        # time.sleep(100)
        print('watching:', anchor)
        try:
            r = requests.get(huya_url + suffix, timeout=3)
            html = r.content.decode('utf-8')
            match = re.search(r'更换为.+href="https://www.huya.com/(.+)"', html)
            newaddr = match.group(1) if match else None
            print(newaddr)
            if newaddr:
                print('NEW:', anchor, suffix, '->', newaddr)
                anchors.append((newaddr, anchor))
                continue
            title = re.findall(r'<h1 id="J_roomTitle">(.+)</h1>', html)[0]
            status = re.findall(r'id="live-count">(.+?)</em></span>', html)
            fans = re.findall(r'id="activityCount">(\d+)</div>', html)[0]
            last_live = '未直播'  # "not currently streaming"
            if status and status[0]:
                last_live = status[0]
        except:
            print('ERROR:' + huya_url + suffix)
        else:
            txt += ('|' + anchor + '|' + title + '|' + last_live + '|' + fans +
                    '|\n')
    print(txt)
    return  # early return: the message push below never runs
    if send_msg('主播直播状态', txt):
        print('wechat message push success.')
    else:
        print('wechat message push failed.')
Example #44
def get_session_details(s):
    url = 'http://www.azleg.gov/xml/sessions.asp'
    with s.urlopen(url) as page:
        root = etree.fromstring(page)
        session_file = open('session_details.py', 'w')
        detail = """
                 '%s':
                    {'type': '%s', 'session_id': %s,
                     'start_date': datetime.date(%s),
                     'end_date': datetime.date(%s)},
                 """
        for session in root.xpath('//session'):
            session_type = 'primary' if re.search('Regular', session.get('Session_Full_Name')) else 'special'
            start_date = datetime.datetime.strptime(
                                              session.get('Session_Start_Date'),
                                              '%Y-%m-%dT%H:%M:%S')
            end_date = datetime.datetime.strptime(session.get('Sine_Die_Day'),
                                                  '%Y-%m-%dT%H:%M:%S')
            session_file.write(detail % ( session.get('Session_Full_Name'),
                                           session_type,
                                           session.get('Session_ID'),
                                           start_date,
                                           end_date))
def replace_image_tag(html, path=None):
    tags = re.findall('<img.+?src="/img.+?>', html)
    for tag in tags:
        match = re.search('src="/img/(.+?)"', tag)
        image = match.group(1) if match else None
        if image is None:
            print('image path not found in tag: "%s" in %s' % (tag, path))
            continue
        imagepaths.append(image)
        attributes = get_attributes(tag)
        attributes.pop('src')
        if len(attributes) > 0:
            # http://railsdoc.com/references/image_tag
            # <img src="/img/hoge.png" alt="hogera"> -> <%= image_tag("hoge.png", alt: "hogera") %>
            attributes = ', '.join(
                ['%s: "%s"' % o for o in attributes.items()])
            replaced = '<%%= image_tag("%s", %s) %%>' % (image, attributes)
        else:
            replaced = '<%%= image_tag("%s") %%>' % (image)
        if is_dry:
            print('"%s" -> "%s"' % (tag, replaced))
            continue
        html = html.replace(tag, replaced)
    return html
def dict_to_hstore(python_dict):
    """ 
    There's an implementation of this here ( HstoreAdapter ) https://github.com/psycopg/psycopg2/blob/master/lib/extras.py
    but the comments say that it is "painfully inefficient!"
    
    """
    hstore = StringIO()
    first_row = True
    for key in python_dict:
        ## Prune the hash--if it's empty
        value = python_dict[key]
        if value:
            if not first_row:
                hstore.write("|")
            else:
                first_row=False
            # don't allow quotes within the value. Not sure if we should check this here. 
            value = value.replace('"','')
            hstore.write("\"%s\"=>\"%s\"" % (key, value))
    return hstore.getvalue()
    

## If we were using a comma as a delimiter, we'd need a regex to allow for the possibility that a comma appears inside a value--so we'd use the commented pattern below. But we don't have to do that if we use the bar as delimiter. We already clean bars out in utf8clean.

# key_pair_re = re.compile('"(.+?)"=>"(.+?)"(?:,|$)')
key_pair_re = re.compile('"(.+?)"=>"(.+?)"')
def hstore_to_dict(text_string):
    return_dict = {}
    keypairs = text_string.split("|")
    for keypair in keypairs:
        keygroups = key_pair_re.search(keypair)
        return_dict[keygroups.group(1)] = keygroups.group(2)
    return return_dict
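A round-trip sanity check for the two hstore helpers (keys and values invented for illustration; this assumes the StringIO import the surrounding module relies on):

d = {"color": "red", "size": "XL", "empty": ""}
encoded = dict_to_hstore(d)       # '"color"=>"red"|"size"=>"XL"'
print(hstore_to_dict(encoded))    # {'color': 'red', 'size': 'XL'}; empty values are pruned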
    
    
    
    
"""Copyright (c) 2014 Beckersweet. All rights reserved."""

from commands import getoutput as command
from json import loads as decodeJSON
from json import dumps as encodeJSON
from mininet.cli import CLI
from mininet.net import Mininet
from mininet.node import Node, RemoteController, CPULimitedHost
from mininet.util import pmonitor
import pp
from re import findall as find

ifconfig = command('ifconfig')
try:
	localIp = find('addr:(192\.168\.56\.\d+) ', ifconfig)[0]
except:
	print "Network settings not configured. Try running 'sudo dhclient eth1'."

NETWORK_CONTROLLER_PORT = 6633
NUMBER_OF_HOSTS = 3
TCP_REQUEST_COMMAND = "python tcpRequest.py " + localIp + " 9999 "
JOB_SERVER_COMMAND = "sudo python dynamic_ncpus.py "
BENCHMARK_RESULTS_FILE_NAME = "OpMub_benchmarking.out"

print
print "Creating network:"

virtualNetwork = Mininet(controller=RemoteController,
							   host=CPULimitedHost,
							  build=False)
Example #49
    def match_expr(self, expr):

        return SubtitlesClip([e for e in self.subtitles
                              if re.findall(expr, e) != []])
Example #50
def is_valid_nickname(name):
    """Returns whether NAME is a valid nickname, that is, it contains only
    letters, numbers, '_', '[', ']', '{', '}', '\', '|', '`', or '^'.
    """
    return (re.search(r'[^A-Za-z0-9_\[\]\{\}\\\|\`\^]', name) is None)
Example #51
def check_queue(uid):
    """ Check the queue for any uid string, return job list with running
        node information. """
    from re import compile as mkregex

    qstat = rn(['qstat', '-u', uid, '-n', '-1']).decode('utf8').rstrip().split('\n')[5:]

    # If there are no job return nothing
    if not qstat:
        return

    jobs = {}
    for i in qstat:
        f = s(r' +', i.rstrip())

        # Only look at jobs in the interactive queue
        if not f[2] == short_queue_name:
            continue

        # Skip completed jobs
        if f[9] == 'C':
            continue

        # Get node name, if there is one
        if f[11] == '--':
            node = ''
        else:
            nodes = set(find(r'node[0-9][0-9]', f[11]))
            if len(nodes) > 1:
                continue
            node = str(list(nodes)[0])

        # Get job number
        job_id = find(r'[0-9]+', f[0])[0]

        # Now that we have a limited job set, use qstat -f to get the
        # complete job and queue name
        find_queue = mkregex(r'queue = (.*)$')
        find_name  = mkregex(r'Job_Name = (.*)$')

        for i in subprocess.check_output(['qstat', '-f', job_id]).decode().rstrip().split('\n'):
            # Get Queue Name
            if find_queue.search(i):
                try:
                    queue = find_queue.findall(i)[0]
                except IndexError:
                    # Queue parsing failed, report this and continue
                    print("Failed to parse queue for job number:{:^3}\nskipping".format(job_id), file=stderr)
                    continue
                if not queue == interactive_queue:
                    continue
            elif find_name.search(i):
                try:
                    names = find_name.findall(i)[0].split('_')
                except IndexError:
                    # Queue parsing failed, report this and continue
                    print("Failed to parse queue for job number:{:^3}\nskipping".format(job_id), file=stderr)
                    continue

        # Check that this is actually one of our jobs
        identifier = '_'.join(names[-2:])
        if identifier == 'int_tmux':
            type = 'tmux'
        elif identifier == 'int_vnc':
            type = 'vnc'
        elif identifier == 'int_gui':
            type = 'gui'
        else:
            continue

        # Fix queue name
        name = '_'.join(names[:-2])
        name = name if name else type

        # Assemble the dictionary
        jobs[job_id] = {'queue'    : queue,
                        'job_name' : name,
                        'type'     : type,
                        'node'     : node,
                        'state'    : f[9]}

    # Sort the dictionary
    jobs = OrderedDict(sorted(jobs.items()))

    return(jobs)
Example #52
#!/usr/bin/python27

import os
import re
from subprocess import call

dirStruct = os.walk("../../raspi/LineAnalysis/testImages");

for roots, dirs, files in dirStruct :
    for fname in files :
        if re.find("jpg", fname) != None :
            ret = call(["./stats"], ["-i"], ["
Example #53
            # list values are comma-separated; a comma inside a value must be escaped as \,
            data_arr = []
            for d in data:
                symbol = getFlag(d)
                if d is None:
                    print 'Unknown Value'
                    continue
                data_arr.append(symbol + str(d).replace('\\', '\\\\').replace(',', '\,'))
            data_str = ','.join(data_arr)
            request += "$" + str(len(data_str) + 1) + "\r\n"
            request += getFlag(data) + data_str + "\r\n"
        else:
            print pred('Unknown Value')
            continue

    sock.send(request + "\n")
    time.sleep(0.05)
    re = sock.recv(2048)
    if re[0] in ('+', '-'):
        print re[1:],
    elif re[0] == '$':
        resp_len = int(re[1:re.find("\r\n")])
        if resp_len == -1:
            print pred("Not Found")
            continue
        data_start = re.find("\r\n") + 2
        data = re[data_start:data_start + resp_len]
        data = parseCMD(data)
        print "(" + pyellow(type(data).__name__) + ")", pgreen(data)
sock.close()
Example #54
def demo_bad_catch():
	try:
		var = input("Enter variable name")
		if re.search(reg, var):
			print('The input is valid')
	except ValueError as e:
		print('Invalid value:', e)
Example #55
def attach_job(job_id, attempt_gui=False):
    """ Attach to a currently running job, default is tmux.
        To attach to a GUI running in tmux, pass attempt_gui """

    # Get details
    job_list = check_queue(uid)
    try:
        node  = job_list[job_id]['node']
        type  = job_list[job_id]['type']
        state = job_list[job_id]['state']
    except KeyError:
        print("Sorry, that job number doesn't exist. Please try again")
        print_jobs(job_list)
        sys.exit(1)

    if not state == 'R':
        print("Job not running, cannot attach")
        return

    if type == 'gui' or attempt_gui:
        # Confirm GUI Possible
        if not xpra_installed:
            print("It appears that xpra is not in your PATH, I cannot run GUI jobs", file=stderr)
            print("Exiting", file=stderr)
            sys.exit(-1)

        # Display xpra instructions
        print("You MUST NOT close your program by closing the window unless you want to")
        print("terminate your session\n")
        print("To preserve your session, you need to Ctrl-C in the command line, not close")
        print("the window\n")
        sleep(1)

        # Actually attach to the session!
        subprocess.call(['xpra', 'attach', 'ssh:' + uid + '@' + node + ':' + job_id])
        return

    elif type == 'tmux':
        # Do not attach if running from within a tmux session already
        if rn('echo $TMUX', shell=True).decode().rstrip():
            print("You are already running a tmux session, sessions should be nested with care")
            print("To force run, unset the $TMUX variable, but I suggest you just detatch your")
            print("current session and try the same command again")
            return

        # Attempt to initially attach to xpra, fail gracefully without
        # notifying user
        if xpra_installed:
            GUI_PID=''
            if subprocess.call("xpra attach ssh:" + uid + "@" + node + ":" + job_id + " >/dev/null 2>/dev/null &", shell=True) == 0:
                GUI_PID = subprocess.check_output('ps axo pid,user,cmd | grep "xpra attach" | grep "' + job_id + '$"| awk \'{print $1}\'', shell=True).decode().rstrip()

        # Actually attach to the session!
        job_string = ' '.join(['ssh', node, '-t', 'DISPLAY=:' + job_id, 'tmux', 'a', '-t', job_id])
        subprocess.call(job_string, shell=True)

        # Kill GUI if open
        if xpra_installed and GUI_PID:
            subprocess.call(['kill', GUI_PID])

    elif type == 'vnc':
        # Check that vnc can run
        if not vnc_installed:
            print("It appears that vncviewer is not in your PATH, I cannot run connect to a VNC session", file=stderr)
            print("Exiting", file=stderr)
            sys.exit(-1)

        # Get VNC Port
        ports = []
        files = subprocess.check_output('ssh ' + node + ' "ls $HOME/.vnc"', shell=True).decode().rstrip().split('\n')
        for i in files:
            if i.startswith(node) and i.endswith('pid'):
                    port = find(r':([0-9]+)\.pid', i)[0]
                    ports.append(port)

        if not ports:
            print("It appears no VNC servers are running on the selected server.")
            print("If the job is still running in the queue, there is a problem.")
            print("Try clearing out the *.log and *.pid files in $HOME/.vnc, and killing")
            print("the running VNC queue job")
            return

        if len(ports) > 1:
            print("There is more than one vnc server running for you on that node.")
            print("That isn't allowed and I don't know which one to join. It may")
            print("be that your last session exited without cleaning $HOME/.vnc")
            print("Check in there and clean out log files for vnc servers that")
            print("aren't running to prevent problems")
            return

        subprocess.call(['vncviewer', node + ':' + ports[0]])
        return

    else:
        print("I don't understand the job type")
        return

def get_video_img_info(html):
	#find site Name
	site_url='http://'
	url_p = '^http://v.qq.com/.+?tm|^/cover/'
	sub_url_p = '^/'
	video_with_img = []
	finded = re.search('qq.com', html)
	if finded:
		site_url = site_url+'v.qq.com/'
		url_p = '^http://v.qq.com/.+tm|^/cover/'
	elif re.search('youku.com', html):
		site_url = site_url+'v.youku.com'
		url_p = '^http://v.youku.com/.+?_show'
	elif re.search('tudou.com', html):
		site_url = site_url+'v.tudou.com'
		url_p = 'http://v.tudou.com/'
	else:
		print 'error not support yet...'
		return []	

	soup = BeautifulSoup(html)
	all_img=soup.select("a > img")
	for img in all_img:
		img_attrs = img.attrs
		for attr in img_attrs:			
			if attr == 'src' or attr == '_src':
				imgurl=img[attr]
			if attr == 'alt':
				imgalt=img[attr]			
Example #57
from sys import argv
import os
import re

script, directory, csv = argv

identified = open(csv).read() # build the identified-queries list from the cleaned-up csv

identified_peaks = []
for line in open(csv):
    identified_peaks.append(line[:-1])
print('total identifed peaks: ', len(identified_peaks))

output = open('identified.mgf','w') # add new name for new mgf file

for mgf in os.listdir(directory):
    if not re.search('mgf', mgf):
        continue

    s = open(mgf).read()

    queries = {}

    match = re.compile('BEGIN IONS.*?END IONS\n', re.DOTALL)

    peak_list = re.findall(match, s) # build the list of queries

    print('total MS/MS spectra: ', len(peak_list))

    for query in peak_list:
        title = re.search('TITLE=.*? ', query) # extract the TITLE line from the query
        #print(title.group()[6:])
Example #58
def hstack(name):
    return re.search(r'\[([0-9]+)\]', name)
Example #59
def create_job(cores=default_cores, mem='', gui='', name='', vnc=False):
    """ Create a job in the queue, wait for it to run, and then attach
        Ctl-C after submission will not kill job, it will only kill attach
        queue """

    # Figure out memory request
    try:
        mem = str(int(cores*default_max_mem/default_max_cores)) + 'GB' if not mem else str(int(mem)) + 'GB'
    except ValueError:
        print("Incorrect formatting for memory request, please submit an integer multiple in GB")
        sys.exit(1)

    # Create job name
    if gui:
        gui_name = gui.split(' ')[0]
        job_name = name + '_' + gui_name + '_int_gui' if name else gui_name + '_int_gui'
    elif vnc:
        job_name = name + '_int_vnc' if name else 'int_vnc'
    else:
        job_name = name + '_int_tmux' if name else 'int_tmux'

    # Prep the job
    template = "#!/bin/bash\n#PBS -S /bin/bash\n"
    template = ''.join([template, "#PBS -q ", interactive_queue,
                        "\n#PBS -N ", job_name,
                        '\n#PBS -l nodes=1:ppn=' + str(cores),
                        '\n#PBS -l mem=' + mem,
                        '\n#PBS -e ' + os.environ['HOME'] + '/.' + job_name + '.error',
                        '\n#PBS -o /dev/null'])

    if gui:
        template = template + ("\n\nexport QCONNECT=gui"
                               "\n\njob_id=$(echo $PBS_JOBID | sed 's#\..*##g')\n"
                               "xpra start :$job_id\n"
                               "export DISPLAY=:${job_id}\n"
                               "sleep 1\n" +
                               gui + "\n"
                               "PID=$!\n"
                               "sleep 1\n"
                               "while true\n"
                               "do\n"
                               "  if kill -0 $PID > /dev/null 2>&1; then\n"
                               "    sleep 5\n"
                               "  else\n"
                               "    xpra stop :${job_id}\n"
                               "    xpra list >/dev/null 2>/dev/null\n"
                               "    rm ~/.xpra/:${job_id}.log 2>/dev/null\n"
                               "    exit 0\n"
                               "  fi\n"
                               "done\n")

    elif vnc:
        if not vnc_installed:
            print("It appears that vncviewer is not in your PATH, I cannot create a VNC connection", file=stderr)
            print("Exiting", file=stderr)
            sys.exit(-1)

        template = template + ("\n\nexport QCONNECT=vnc\n\nvncserver -geometry " + vnc_geometry + " -fg\n")

    else:
        template = template + ( "\n\nexport QCONNECT=tmux"
                                "\n\nsession_id=$(echo $PBS_JOBID | sed 's#\..*##g')\n")
        if xpra_installed:
            template = template + ("if xpra start :$session_id >/dev/null 2>/dev/null; then\n"
                                   "    export DISPLAY=:$session_id\n"
                                   "fi\n")

        template = template + ( "CMD=\"tmux new-session -s $session_id -d\"\n"
                                "$CMD\n"
                                "PID=$(ps axo pid,user,cmd | grep tmux | grep $USER | grep -v grep | awk '{print $1}')\n"
                                "while true\n"
                                "do\n"
                                "  if kill -0 $PID > /dev/null 2>&1; then\n"
                                "    if [[ ! $(tmux ls | grep $session_id) ]]; then\n")
        if xpra_installed:
            template = template + ("      xpra stop :$session_id >/dev/null 2>/dev/null\n"
                                   "      xpra list >/dev/null 2>/dev/null\n"
                                   "      rm ~/.xpra/:$session_id.log 2>/dev/null\n")

        template = template + ( "      exit 0\n"
                                "    else\n"
                                "      sleep 5\n"
                                "    fi\n"
                                "  else\n")
    if xpra_installed:
        template = template + ("      xpra stop :$session_id >/dev/null 2>/dev/null\n"
                                "      xpra list >/dev/null 2>/dev/null\n"
                                "      rm ~/.xpra/:$session_id.log 2>/dev/null\n")

        template = template + ( "    exit 0\n"
                                "  fi\n"
                                "done\n")
    if debug:
        print(template)

    pbs_command = (['qsub'])

    # Submit the job
    pbs_submit = subprocess.Popen(pbs_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    pbs_submit.stdin.write(template.encode())
    pbs_submit.stdin.close()

    # Get job number
    job_no = (pbs_submit.stdout.read().decode().rstrip())
    try:
        job_no = find(r'[0-9]+', job_no)[0]
    except IndexError:
        print("PBS Submission failed with message:\n{}".format(job_no), file=stderr)
        sys.exit(1)
    print("Job", job_name, "created with job id", job_no, "\n")
    sleep(1)

    return(job_no)
Example #60
 def get_table_info(self, table):
     table_data = re.search(r'CREATE TABLE {table_name} \((.*\n?)\)'.format(table_name=table), self._file_content, re.MULTILINE)
     return table_data