def parseWeb():
    """
     This module is used to get the data points from ECE2031 server
    """
    response = urllib2.urlopen(serverUrl)
    html = response.read()
    xPoints = []  # list for robot x points
    yPoints = []  # list for robot y points
    xDesPoints = []  # list for destination x points
    yDesPoints = []  # list for destination y points
    # change this delimiter to match the payload's starting trigger
    strparse = "<p>"
    html = html.split(strparse)[-1]
    strparse = "</p>"
    html = html.split(strparse)[0]
    for line in html.split('<br>'):
        if line != "":
            points = line.split(',')
            print points
            x = float(points[0])
            y = float(points[1])
            xPoints.append(x)
            yPoints.append(y)
    print "Robot Path"
    for i in range(1, len(xPoints)):
        print '(' + str(xPoints[i])[:5] + ',' + str(yPoints[i])[:5] + ')'
    return xPoints, yPoints
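
A minimal sanity check of the parsing above, assuming the server wraps its payload in the final <p>...</p> block with one comma-separated x,y pair per <br>-delimited line (serverUrl and the exact trigger are project-specific):

sample = "<html><p>1.0,2.0<br>3.5,4.25<br></p></html>"
body = sample.split("<p>")[-1].split("</p>")[0]
pairs = [tuple(map(float, line.split(","))) for line in body.split("<br>") if line]
assert pairs == [(1.0, 2.0), (3.5, 4.25)]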
Example #2
 def difference (self, html):
     print "Initial size %d" % len(html)
     s_html = html.split('\n')
     # NOTE: this diffs the page against itself, so the delta is always empty;
     # presumably the previously fetched page's html was meant for s_last_html
     s_last_html = html.split('\n')
     # materialize the diff: ndiff returns a generator, and the print loop
     # below would otherwise exhaust it before delta is built
     diff = list(difflib.ndiff(s_last_html, s_html))
     for l in diff:
         print l
         print "kikou\n"
     delta = [l for l in diff if l.startswith('+ ')]
     print "Delta size %d" % len(delta)
     return delta
Example #3
def link_html_images(html):
    image_link_elem = (
        '<a href="<% $c->uri_for($image_path . "/{image}") %>"/>'
        '<img class="screenshot"'
        ' src="<% $c->uri_for($image_path . "/{image}") %>"'
        ' alt=""/>'
        '</a>'
    )
    image_template = cleandoc("""
    <div class="row-fluid">
    <div class="span6">
    {image_link}
    </div>
    </div>
    """.format(image_link=image_link_elem))
    image_re = re.compile(
        r'<p><img alt="" src="images/(?P<image>\w+\.png)" title=""\s*/?></p>'
    )
    out_lines = []
    for line in html.split('\n'):
        match = image_re.match(line)
        if match:
            image_name = match.group('image')
            out_lines.append(image_template.format(image=image_name))
        else:
            out_lines.append(line)
    return '\n'.join(out_lines)
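
For instance, given a line produced by the Markdown image pattern above (assuming re and inspect.cleandoc are imported alongside the function):

line = '<p><img alt="" src="images/shot1.png" title="" /></p>'
print(link_html_images(line))
# prints the div.row-fluid/div.span6 scaffold with shot1.png substituted
# into both the <a href> and <img src> Mason expressions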
Example #4
    def scrape(self, chamber, term_name):

        for t in self.metadata['terms']:
            if t['name'] == term_name:
                session = t['sessions'][-1]
                slug = self.metadata['session_details'][session]['slug']

        if chamber == 'upper':
            chamber_slug = 'Senate'
        elif chamber == 'lower':
            chamber_slug = 'Assembly'

        leg_base_url = 'http://www.leg.state.nv.us/App/Legislator/A/%s/%s/' % (
            chamber_slug, slug)
        leg_json_url = 'http://www.leg.state.nv.us/App/Legislator/A/api/%s/Legislator?house=%s' % (
            slug, chamber_slug)

        resp = json.loads(self.get(leg_json_url).text)

        for item in resp:
            # empty district
            empty_names = ['District No', 'Vacant']
            if any(name in item['FullName'] for name in empty_names):
                continue
            last, first = item['FullName'].split(",", 1)
            item['FullName'] = "{first} {last}".format(last=last,
                                                       first=first).strip()
            leg = Legislator(term_name,
                             chamber,
                             item['DistrictNbr'],
                             item['FullName'],
                             party=item['Party'],
                             photo_url=item['PhotoURL'])
            leg_url = leg_base_url + item['DistrictNbr']

            # hack to get the legislator ID
            html = self.get(leg_url).text
            for l in html.split('\n'):
                if 'GetLegislatorDetails' in l:
                    leg_id = l.split(',')[1].split("'")[1]

            # fetch the json used by the page
            leg_details_url = 'https://www.leg.state.nv.us/App/Legislator/A/api/78th2015/Legislator?id=' + leg_id
            leg_resp = json.loads(self.get(leg_details_url).text)
            details = leg_resp['legislatorDetails']

            address = details['Address1']
            address2 = details['Address2']
            if address2:
                address += ' ' + address2
            phone = details['LCBPhone']
            email = details['LCBEmail']

            leg.add_office('district',
                           'District Address',
                           address=address,
                           phone=phone,
                           email=email)
            leg.add_source(leg_details_url)
            self.save_legislator(leg)
Example #5
def remove_empty_lines(html):
  key = '%s:remove_empty_lines' % hash(html)
  out = cache.get(key, namespace="filters")
  if out:
    return out
  
  if '</' in html:
    html = html.strip().replace('\n', '')
    soup = BeautifulSoup(html)
    lines = []
    for element in soup.contents:
      if isinstance(element, Tag):
        if element.text:
          lines.append(str(element).strip())
        elif 'br' in str(element):
          lines.append('\n')
      elif isinstance(element, NavigableString):
        lines.append(str(element).strip())
    out = ''.join(lines).strip()
    while '\n\n' in out:
      out = out.replace('\n\n', '\n')
  else:
    out = '\n'.join([line for line in html.split('\n') if line.strip()])
  cache.set(key, out, namespace="filters")
  return out
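
When the input contains no closing tag, only the plain-text branch runs; that step is easy to sanity-check on its own:

text = "a\n\n  \nb"
assert '\n'.join(line for line in text.split('\n') if line.strip()) == "a\nb"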
Example #6
 def userstats_api(self, user):
     url = "http://www.bright-shadows.net/userdata.php?username=%s"
     html = urllib2.urlopen(url % plugins.to_utf8(user), timeout=5).read()
     if html == "Unknown User":
         return None
     real_user, rank, users_total, challs_solved, challs_total = html.split(":")
     return real_user, str(int(challs_solved)), int(challs_total), str(int(rank)), int(users_total), None, None, None
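
This and the similar userstats_api methods below all consume the WeChall-style score-service convention: one line of colon-separated fields. A hedged sketch of the unpacking (the field order varies per site, as the snippets show):

payload = "alice:42:1000:10:50"  # hypothetical user:rank:users:solved:total
real_user, rank, users_total, challs_solved, challs_total = payload.split(":")
assert (real_user, int(rank), int(users_total)) == ("alice", 42, 1000)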
Example #7
File: tbs.py Project: tehron/tehbot
 def userstats_api(self, user):
     url = "http://www.bright-shadows.net/userdata.php?"
     html = urllib.request.urlopen(url + urllib.parse.urlencode({"username" : user}), timeout=5).read()
     html = html.decode()
     if html == "Unknown User":
         return None
     real_user, rank, users_total, challs_cnt, challs_total = html.split(":")
     return real_user, str(int(challs_cnt)), int(challs_total), str(int(rank)), int(users_total), None, None, None
Example #8
def html_to_words(html):
    # strip punctuation, markup characters, and digits (the original class
    # repeated "|" as a separator; inside [...] it is just a literal)
    html = re.sub(r'[.<>=\\/"|~!@#$%^&*(){} 0-9]+', ' ', html)
    html = html.strip().lower()
    html = re.sub(' +', ' ', html)
    words = html.split(' ')
    return words
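
A quick check of the tokenization with the function above in scope (letters inside tag names survive, since only punctuation and digits are stripped):

assert html_to_words('Alpha.Beta (2024) & Gamma') == ['alpha', 'beta', 'gamma']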
Example #9
File: st.py Project: jms1103/tehbot
 def userstats_api(self, user):
     url = "https://www.securitytraps.pl/wcscore.php?uname=%s&key=%s"
     authkey = self.settings["securitytraps_api_key"]
     html = urllib2.urlopen(url % (plugins.to_utf8(user), authkey), timeout=5).read()
     if html == "0":
         return None
     rank, challs_solved, challs_total, users_total, score = html.split(":")
     return user, str(int(challs_solved)), int(challs_total), str(int(rank)), int(users_total), int(score), None, None
Example #10
    def get_content1(self):
        '''
        Method 1
        @summary:
        Find the main text by text density:
            1. Strip the html tags and remove all whitespace except spaces and newlines.
            2. Sum the text lengths of every n consecutive paragraphs, as a measure of the text density of a region.
            3. Take the densest region as the initial start and end of the main text.
            4. From the start, search upward until the density drops to the threshold (here, the minimum observed density); that is the start of the main text.
            5. From the start, search downward until the density drops to the threshold; that is the end of the main text.

        Filtering out index pages and other noise:
            1. Main text usually contains <p> tags, so compute the share of the text that sits inside <p> tags; above a threshold, treat the region as main text.
        To be solved:
            pagination, e.g. http://mini.eastday.com/a/171205202028050-3.html
        ---------
        ---------
        @result:
        '''
        if USEFUL_TAG:
            html = self.__replace_str(self._text, r'(?!{useful_tag})<(.|\n)+?>'.format(useful_tag = '|'.join(USEFUL_TAG)))
        else:
            html = self.__replace_str(self._text, '<(.|\n)*?>')
        paragraphs = html.split('\n')
        # for i, paragraph in enumerate(paragraphs):
        #     print(i, paragraph)

        # Text density of every n consecutive paragraphs
        paragraph_lengths = [len(self.__del_html_tag(paragraph)) for paragraph in paragraphs]
        # paragraph_lengths = [len(paragraph.strip()) for paragraph in paragraphs]
        paragraph_block_lengths = [sum(paragraph_lengths[i : i + MAX_PARAGRAPH_DISTANCE]) for i in range(len(paragraph_lengths))]  # sum of each run of n consecutive paragraph lengths (a "paragraph block"); e.g. lengths [0,1,2,3,4] with n=3 give block sums [3,6,9,7,4]

        self._content_center_pos = content_start_pos = content_end_pos = paragraph_block_lengths.index(max(paragraph_block_lengths))  # start and end default to the densest paragraph block
        min_paragraph_block_length = MIN_PARAGRAPH_LENGHT * MAX_PARAGRAPH_DISTANCE
        # While the block length exceeds the minimum and the index stays in range, we are still in the main text; keep moving the start index up
        while content_start_pos > 0 and paragraph_block_lengths[content_start_pos] > min_paragraph_block_length:
            content_start_pos -= 1

        # While the block length exceeds the minimum and the index stays in range, we are still in the main text; keep moving the end index down
        while content_end_pos < len(paragraph_block_lengths) and paragraph_block_lengths[content_end_pos] > min_paragraph_block_length:
            content_end_pos += 1

        # Clean up redundant newlines and whitespace
        content = paragraphs[content_start_pos : content_end_pos]
        content = '\n'.join(content)
        content = self.__del_unnecessary_character(content)

        # Share of text inside <p> tags relative to the whole extracted text; above the threshold it counts as main text
        paragraphs_text_len = len(self.__del_html_tag(''.join(tools.get_info(content, '<p.*?>(.*?)</p>'))))
        content_text_len = len(self.__del_html_tag(content))
        if content_text_len and content_text_len > MIN_COUNTENT_WORDS and ((paragraphs_text_len / content_text_len) > MIN_PARAGRAPH_AND_CONTENT_PROPORTION):
            self._content_start_pos = content_start_pos
            self._content_end_pos = content_end_pos
            self._paragraphs = paragraphs
            # print(content_start_pos, content_end_pos, self._content_center_pos)
            return content
        else:
            return ''
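
A toy illustration of steps 2 and 3 from the docstring, using the paragraph lengths from the comment above with n = 3:

paragraph_lengths = [0, 1, 2, 3, 4]
n = 3  # stands in for MAX_PARAGRAPH_DISTANCE
block_sums = [sum(paragraph_lengths[i:i + n]) for i in range(len(paragraph_lengths))]
assert block_sums == [3, 6, 9, 7, 4]
assert block_sums.index(max(block_sums)) == 2  # densest block seeds start/end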
Example #11
 def userstats_api(self, user):
     url = "https://cryptohack.org/wechall/userscore/?username=%s&authkey=%s"
     authkey = self.settings["cryptohack_api_key"]
     r = self.sess.get(url % (Plugin.to_utf8(user), authkey), timeout=5)
     html = r.text
     if html == "failed":
         return None
     user, rank, score, scoremax, challs_solved, challs_total, users_total = html.split(":")
     return user, str(int(challs_solved)), int(challs_total), str(int(rank)), int(users_total), int(score), int(scoremax), None
Example #12
 def shrink_style(cls, style_str, filtered_css_properties, changed_css_properties):
     if not style_str:
         return None
     properties = {}
     for p in style_str.split(";"):
         if p.strip():
             token = p.split(":")
             if len(token) > 1:
                 properties[token[0].strip()] = token[1].strip()
     return Utils._shrink_properties(properties, filtered_css_properties, changed_css_properties)
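
The declaration split above turns an inline style string into a property map before Utils._shrink_properties (project-specific) filters it; the parsing step alone, as a standalone sketch:

style = "color: red; margin-top: 4px;; no-colon"
properties = {}
for p in style.split(";"):
    if p.strip():
        token = p.split(":")
        if len(token) > 1:
            properties[token[0].strip()] = token[1].strip()
assert properties == {'color': 'red', 'margin-top': '4px'}  # "no-colon" is skipped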
Example #13
def fields_from_split_html(template, html, separator, regex_with_groups_named_as_keys):
    list_to_ret = []
    lines = html.split(separator)
    for line in lines: 
        m = re.match(regex_with_groups_named_as_keys, line)
        if m:  # re.match returns None when the line does not match
            dict_to_ret = dict(template.items() + m.groupdict().items())
            list_to_ret.append(dict_to_ret)
    
    return list_to_ret
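
Note that dict(template.items() + m.groupdict().items()) works only on Python 2; Python 3 dict views cannot be concatenated with +. A hedged Python 3 equivalent of the same named-groups merge, on made-up data:

import re

template = {"source": "feed"}  # hypothetical shared defaults
m = re.match(r"name=(?P<name>\w+);age=(?P<age>\d+)", "name=Ada;age=36")
if m:
    row = {**template, **m.groupdict()}
    assert row == {"source": "feed", "name": "Ada", "age": "36"}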
Example #14
 def is_hidden_node(cls, node):
     """ Check if a node is hidden in html page
     """
     style_list = node.get("style", None)
     if style_list:
         for p in style_list.split(";"):
             tokens = p.split(":")
             if len(tokens) >= 2 and tokens[0].strip().lower() == "display" and tokens[1].strip().lower() == "none":
                 return True
     return False
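
The check is case- and whitespace-insensitive but only sees inline styles, not stylesheets. The same logic on a concrete node, as a self-contained sketch assuming lxml elements:

import lxml.html

node = lxml.html.fromstring('<div style="color:red; DISPLAY : None">x</div>')
hidden = False
for p in node.get("style", "").split(";"):
    tokens = p.split(":")
    if len(tokens) >= 2 and tokens[0].strip().lower() == "display" and tokens[1].strip().lower() == "none":
        hidden = True
assert hidden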
Example #15
 def getEvent(self, url, parkrunEvent):
     self.logger.debug('Hitting {}'.format(url + str(parkrunEvent)))
     html = self.getURL(url + str(parkrunEvent))
     # Test if we got a valid response
     if html is None:  #most likely a 404 error
         self.logger.warning('Error retrieving event')
         self.msgQ.put(
             Message('Error', self.id,
                     'Error getting event. Check url ' + url))
         return None
     if '<h1>Something odd has happened, so here are the most first finishers</h1>' in html:
         self.logger.warning('Error retrieving event')
         self.msgQ.put(
             Message('Error', self.id,
                     'Possible URL error getting event. Check url ' + url))
         return None
     html = '<table' + html.split('<table')[1]
     html = html.split('</p>')[0]
     table = lxml.html.fromstring(html)
     return self.getEventTable(table)
Example #16
    def scrape(self, chamber, term_name):

        for t in self.metadata['terms']:
            if t['name'] == term_name:
                session = t['sessions'][-1]
                slug = self.metadata['session_details'][session]['slug']

        if chamber == 'upper':
            chamber_slug = 'Senate'
        elif chamber == 'lower':
            chamber_slug = 'Assembly'

        leg_base_url = 'http://www.leg.state.nv.us/App/Legislator/A/%s/%s/' % (chamber_slug, slug)
        leg_json_url = 'http://www.leg.state.nv.us/App/Legislator/A/api/%s/Legislator?house=%s' % (slug, chamber_slug)

        resp = json.loads(self.get(leg_json_url).text)

        for item in resp:
            # empty district
            empty_names = ['District No', 'Vacant']
            if any(name in item['FullName'] for name in empty_names):
                continue
            last, first = item['FullName'].split(",", 1)
            item['FullName'] = "{first} {last}".format(last=last,
                                                       first=first).strip()
            leg = Legislator(term_name, chamber, item['DistrictNbr'],
                             item['FullName'], party=item['Party'],
                             photo_url=item['PhotoURL'])
            leg_url = leg_base_url + item['DistrictNbr']

            # hack to get the legislator ID
            html = self.get(leg_url).text
            for l in html.split('\n'):
                if 'GetLegislatorDetails' in l:
                    leg_id = l.split(',')[1].split("'")[1]

            # fetch the json used by the page
            leg_details_url = 'https://www.leg.state.nv.us/App/Legislator/A/api/78th2015/Legislator?id=' + leg_id
            leg_resp = json.loads(self.get(leg_details_url).text)
            details = leg_resp['legislatorDetails']

            address = details['Address1']
            address2 = details['Address2']
            if address2:
                address += ' ' + address2
            address += '\n%s, NV %s' % (details['City'], details['Zip'])

            phone = details['LCBPhone']
            email = details['LCBEmail']

            leg.add_office('district', 'District Address', address=address,
                           phone=phone, email=email)
            leg.add_source(leg_details_url)
            self.save_legislator(leg)
Example #17
 def userstats_api(self, user):
     url = "https://247ctf.com/wechall_validate_score_service?username=%s&authkey=%s"
     authkey = self.settings["247ctf_api_key"]
     html = urllib.request.urlopen(url % (Plugin.to_utf8(user), authkey),
                                   timeout=5).read()
     html = html.decode()
     if html == "":
         return None
     user, rank, score, scoremax, challs_solved, challs_total, users_total = html.split(":")
     return user, str(int(challs_solved)), int(challs_total), str(int(rank)), int(users_total), int(score), int(scoremax), None
Example #18
 def userstats(self, user):
     url = "https://defendtheweb.net/wechall/userscore?username=%s&authkey=%s"
     authkey = self.settings["defendtheweb_auth_key"]
     html = urllib.request.urlopen(url % (Plugin.to_utf8(user), authkey),
                                   timeout=5).read()
     html = html.decode()
     if html == "0":
         return None
     user, rank, score, scoremax, challs_solved, challs_total, users_total = html.split(":")
     return user, str(int(challs_solved)), int(challs_total), str(int(rank)), int(users_total), int(score), int(scoremax), None
Example #19
def parse(url, html, webInfo):
    encoding, html = html_util.get_unicode_str(html)
    if 'ISO' in encoding:
        return '', '', '', '', '', ''
    if encoding == '': return '', '', '', '', '', ''
    newHtml = ''
    imgList = []
    for x in html.split('\n'):
        if x.count('<img') > 1:
            x = x.replace(u'<img', u'\n<img')
        newHtml = newHtml + "\n" + x
    html = newHtml
    try:
        doc = lxml.html.document_fromstring(html)
        doc.make_links_absolute(url)
        html = lxml.etree.tounicode(doc, method='html')
    except Exception:
        traceback.print_exc()
    newHtml = getBody(html, webInfo['bodyPattern'])
    if newHtml == "":
        return '', '', '', '', '', ''
    title, time, text, images = get_main_content(html, newHtml, webInfo)
    mypos = ''
    if 'textPattern' in webInfo:
        text = getText(html, webInfo['textPattern'])
    if 'titlePattern' in webInfo:
        title = getText(html, webInfo['titlePattern'])
    if 'myposPattern' in webInfo:
        mypos = getText(html, webInfo['myposPattern'])
    # NOTE: doc is only bound if document_fromstring succeeded above
    if 'textXpath' in webInfo:
        text = getDataByXpath(doc, webInfo['textXpath'])
    if 'titleXpath' in webInfo:
        title = getDataByXpath(doc, webInfo['titleXpath'])
    if 'myposXpath' in webInfo:
        mypos = getDataByXpath(doc, webInfo['myposXpath'])
    if 'publishTimeXpath' in webInfo:
        time = getDataByXpath(doc, webInfo['publishTimeXpath'])
        time = strtotime(time, '')
        time = strtotime(time, '')

    if 'imgReplace' in webInfo:
        patternList = webInfo['imgReplace']
        for picUrl in images:
            for pattern in patternList:
                picUrl = picUrl.replace(pattern[0], pattern[1])
            imgList.append(picUrl)
    else:
        imgList = images
    #print time.encode('utf-8')
    #print text.encode('utf-8')
    return encoding, title, text, time, imgList, mypos
Example #20
    def getEventHistory(self, url):
        self.logger.debug('Hitting {}'.format(url))
        html = self.getURL(url)
        # Test if we got a valid response
        if html is None:  #most likely a 404 error
            self.logger.warning('Error retrieving event. URL: ' + url)
            self.msgQ.put(
                Message(
                    'Error', self.id,
                    'Possible 404 error getting event history. Check url ' +
                    url))
            return None
        if '<h1>Something odd has happened, so here are the most first finishers</h1>' in html:
            self.logger.warning('Error retrieving event')
            self.msgQ.put(
                Message('Error', self.id,
                        'URL error in event history. Check ' + url))
            return None
        html = '<table' + html.split('<table')[1]
        html = html.split('<div')[0]
        table = lxml.html.fromstring(html)

        headings = ['EventNumber', 'EventDate', 'Runners']
        rows = table.xpath('//tbody/tr')

        data = []
        for row in rows:
            d = {}
            for h, v in zip(headings, row.getchildren()):
                if h == 'EventNumber':
                    d[h] = int(v.getchildren()[0].text)
                if h == 'Runners':
                    d[h] = int(v.text)
                if h == 'EventDate':
                    d[h] = datetime.strptime(v.getchildren()[0].text,
                                             "%d/%m/%Y")
            data.insert(0, d)
        return data
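
A self-contained illustration of the zip(headings, row.getchildren()) walk above, against a made-up one-row history table:

from datetime import datetime
import lxml.html

table = lxml.html.fromstring(
    '<table><tbody><tr>'
    '<td><a>12</a></td><td><a>01/02/2020</a></td><td>345</td>'
    '</tr></tbody></table>')
headings = ['EventNumber', 'EventDate', 'Runners']
d = {}
for h, v in zip(headings, table.xpath('//tbody/tr')[0].getchildren()):
    if h == 'EventNumber':
        d[h] = int(v.getchildren()[0].text)  # the number sits inside a link
    if h == 'Runners':
        d[h] = int(v.text)  # plain cell text
    if h == 'EventDate':
        d[h] = datetime.strptime(v.getchildren()[0].text, "%d/%m/%Y")
assert d == {'EventNumber': 12, 'EventDate': datetime(2020, 2, 1), 'Runners': 345}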
Example #21
def get_history_price_by_fund(fundid, starttime):
    enddate = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    startdate = starttime.split(" ")[0]
    delta_days = cal_time(startdate + " 00:00:00", enddate + " 00:00:00")
    html = common_download(
        "http://fund.eastmoney.com/f10/F10DataApi.aspx?type=lsjz&code=" +
        fundid + "&page=1&per=" + str(delta_days) + "&sdate=" + startdate +
        "&edate=" + enddate)
    html = html.split("<tr>")
    history = []
    history_pattern = re.compile(
        r"<td>(\d{4}-\d{2}-\d{2})</td><td class='tor bold'>(\d*\.\d{4})</td>.*")
    for i in range(2, len(html)):
        history.append(history_pattern.findall(html[i]))
    return history
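
A quick check of the row pattern with the decimal point escaped as fixed above:

import re

row = "<td>2020-01-02</td><td class='tor bold'>1.2345</td><td>...</td>"
pattern = re.compile(r"<td>(\d{4}-\d{2}-\d{2})</td><td class='tor bold'>(\d*\.\d{4})</td>.*")
assert pattern.findall(row) == [("2020-01-02", "1.2345")]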
Example #22
def get_fhsp_records(fundid, sdate):
    html = common_download("http://fund.eastmoney.com/f10/fhsp_" + fundid +
                           ".html")
    text = "".join(html.split())
    fhsp_pattern = re.compile(
        r"<td>(\d{4}-\d{2}-\d{2})</td><td>每份派现金(\d*\.\d{4})元</td>")
    tmp = fhsp_pattern.findall(text)
    retval = []
    for i in range(0, len(tmp)):
        delta = cal_time(sdate, tmp[i][0] + " 15:00:00")
        if delta > 0:
            retval.append(tmp[i])

    retval.reverse()  # reverse the order
    return retval
Example #23
    def getLatestEvent(self, url):
        self.logger.debug('Hitting {}'.format(url))
        html = self.getURL(url)
        # Test if we got a valid response
        if html is None:  #most likely a 404 error
            self.logger.warning('Error retrieving event')
            self.msgQ.put(
                Message('Error', self.id,
                        'Error getting event. Check url ' + url))
            return 0, None, None
        if '<h1>Something odd has happened, so here are the most first finishers</h1>' in html:
            self.logger.warning('Error retrieving event')
            self.msgQ.put(
                Message('Error', self.id,
                        'Possible URL error getting event. Check url ' + url))
            return 0, None, None
        try:
            eventHTML = html.split('<h2>')[1]
            eventHTML = eventHTML.split('</h2>')[0]
        except IndexError:
            self.logger.warning('Error retrieving event')
            self.msgQ.put(
                Message('Error', self.id,
                        'Possible page error retrieving url ' + url))
            return 0, None, None

        if len(eventHTML.split('#')[1].split('-')[0].strip()) == 0:
            return 0, None, None
        eventNumber = int(eventHTML.split('#')[1].split('-')[0].strip())
        eventDate = datetime.strptime(eventHTML[len(eventHTML) - 10:],
                                      '%d/%m/%Y')

        html = '<table' + html.split('<table')[1]
        html = html.split('</p>')[0]
        table = lxml.html.fromstring(html)
        return eventNumber, eventDate, self.getEventTable(table)
Example #24
def render_placemark(record, external_data=True):
    data = record.description
    description = []
    for key, value in data.items():
        if value.strip() == '':
            value = '---'
        value = value.replace("\n\n", '<br><br>')

        description.append(f"<b>{key.upper()}</b><p>{value}</p>")

    description = "\n".join(description)

    links_html = ''.join(
        map(lambda link: f"<li>{link.to_html()}</li>", record.links))

    # Generate Locus Map attachment references for the ExtendedData block
    if record.attachments and external_data:
        attachments = "".join(
            map(
                lambda attachment:
                f"<lc:attachment>files/{attachment.id}.jpg</lc:attachment>",
                record.attachments))
        extended_data = f'<ExtendedData xmlns:lc="http://www.locusmap.eu">{attachments}</ExtendedData>'
    else:
        extended_data = ''

    coords = [record.coords[1], record.coords[0]]
    if record.elevation is not None:
        coords.append(record.elevation)
    coords = ",".join(map(str, coords))

    html = f"""
        <Placemark>
          <name>{record.name} [{record.length or '???'}m]</name>
          <description><![CDATA[
            <style type="text/css">p{{margin-top:0;text-align:justify}}</style>
            <small>{description}<b>LINKI</b><ul>{links_html}</ul></small><br/>
          ]]></description>
          <styleUrl>{record.icon.value}</styleUrl>
          {extended_data}
          <Point>
            <coordinates>{coords}</coordinates>
          </Point>
        </Placemark>
    """

    return ''.join([line.strip() for line in html.split("\n")]) + "\n"
Example #25
 def createNewContact(self, username, mobile):
     conn = httplib.HTTPConnection("dealer.che168.com",
                                   timeout=timeout_che168)
     headers = copy.copy(self.headers)
     encoded_username = ""
     for c in username:
         encoded_username += "%u" + ("%x" % ord(c)).upper()
     conn.request("GET",
                  "/Handler/SaleMan/SaveSaleMan.ashx?Name=" +
                  encoded_username + "&Mobile=" + mobile +
                  "&QQ=&weixin=&pic=&file=",
                  headers=headers)
     res = conn.getresponse()
     resHeaders = res.getheaders()
     resRead = res.read()
     html = self.decodeBody(resHeaders, resRead)
     html = html.decode('GB18030')
     conn.close()
     html = html.split('|')
     if len(html) > 1:
         return html[1]
     return None
Example #26
def getTranscriptions(urls):
	for url in urls:
		namePrefix = "podFoundMyFitness"
		name = url.replace('https://www.foundmyfitness.com/episodes/','').capitalize()
		fullName = namePrefix + name
		if not path.exists(folder+fullName+".txt"):
			chromedriver = 'C:\\Program Files\\chromedriver\\chromedriver.exe' 
			options = webdriver.ChromeOptions()
			options.add_argument('headless')
			#options.add_argument('window-size=1200x600') 
			browser = webdriver.Chrome(executable_path=chromedriver, chrome_options=options) 
			browser.get(url)
			html = browser.page_source
			if 'episode_tabs-content' in html:
				html2 = html.split('episode_tabs-content">')[1]
				html3 = html2.split("container episode_supporter-call")[0]
				soup = BeautifulSoup(html3)
				text = soup.get_text()
				print(text)
				fileToWrite = open(folder+fullName+".txt","w+", encoding="utf8")
				fileToWrite.write(text)
				fileToWrite.close()
			browser.quit()
Example #27
def html_clean(html):
    import lxml.html.clean
    import lxml.html
    import lxml.etree

    html, errors = tidy_document(html,
                                 # Tidy options: http://tidy.sourceforge.net/docs/quickref.html
                                 options={'bare': 1, 'clean': 1, 'output-xhtml': 1,
                                          'drop-font-tags': 1, 'drop-proprietary-attributes': 1,
                                          'hide-comments': 1,
                                          'char-encoding': 'utf8', 'input-encoding': 'utf8', 'output-encoding': 'utf8'})
    cleaner = lxml.html.clean.Cleaner(
        kill_tags=frozenset(['script', 'style', 'option']),
        remove_tags=frozenset(['a', 'strong', 'em']),
        safe_attrs_only=True, safe_attrs=frozenset())
    html = cleaner.clean_html(html)

    # html = lxml.etree.tostring(lxml.html.fromstring(html), pretty_print=True).decode('utf8')

    # html = html.encode('utf-8', errors='strict')
    soup = BeautifulSoup(html)
    # [s.extract() for s in soup('script')]  # remove 'script', 'style', 'option' tags
    # [s.extract() for s in soup('style')]
    # [s.extract() for s in soup('option')]
    html = soup.prettify()

    # html = htmllaundry.strip_markup(html)  # leave only text

    # remove continuous empty lines
    html = re.sub(r'\n\s*\n+', '\n\n', html).strip()
    html = re.sub(r'[ \t]+', ' ', html, flags=re.M).strip()  # collapse space/tab runs; flags must be a keyword (the 4th positional arg of re.sub is count)

    # cleaned_html = [sent for sent in cleaned_html.split(
    # '\n')]  # if len(sent.split()) == 0 or len(sent.split()) >= 6]
    html_lines = html.split('\n')
    # return html_lines
    return list(html_sent_word_tokenize(html_lines))
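
The keyword on the whitespace pass above matters: passed positionally, re.M (value 8) lands in re.sub's count parameter and silently caps the number of replacements. A quick demonstration:

import re

assert re.sub(r'a', '-', 'a' * 10, re.M) == '-' * 8 + 'aa'  # re.M consumed as count=8
assert re.sub(r'a', '-', 'a' * 10, flags=re.M) == '-' * 10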
Example #28
    def get_fame(self):
        ''' Get fame information from http://www.whoisbigger.com/ '''

        # get whoisbigger page
        url = "http://www.whoisbigger.com/download_entity.php?entity=entity-" + self.name.lower(
        ).replace(" ", "-")

        # open page
        u = urllib.urlopen(url)
        #u = urlopen(url)
        try:
            html = u.read()
        finally:
            u.close()

        # if this person is famous
        if html != 'Error downloading this file.':

            try:

                dat = html.split(",\"")
                startnumber = 16

                # if the search is a person
                if dat[7].replace('"', '').split(":")[1] == "1":
                    startnumber += 6

                for i in range(3, startnumber):
                    fameinfo = dat[i].replace('"', '').split(":")

                    # dynamic create variable
                    setattr(self, fameinfo[0], fameinfo[1])

            except Exception:
                self.init_fame_single()
        else:
            self.init_fame_single()
Example #29
def HtmlTagStrip(html, webInfo):
    images = []
    for item in re.findall(webInfo['imgPattern'], html):
        if not item:
            continue
        #print item.encode('utf-8')
        #print webInfo['imgUrlPattern']
        if checkSpecialImg(item, webInfo['imgUrlPattern']):
            for term in item.split(' '):
                res = re.search(webInfo['imgUrlPattern'], term)
                if res:
                    img = (res.group())
                    if 'big' in term or 'onclick' in term:
                        images.append(img)
        else:
            #print webInfo['imgUrlPattern']
            res = re.search(webInfo['imgUrlPattern'], item)
            #print res
            if res:
                img = (res.group())
                images.append(img)

    for k, v in RE_IGNORE_BLOCK.iteritems():
        html = re.sub(v, '', html)
    for k, v in RE_NEWLINE_BLOCK.iteritems():
        html = re.sub(v, '\n', html)
    html = re.sub(RE_MULTI_NEWLINE, '\n', html)
    html = re.sub(RE_TAG, '', html)
    lines = []
    for line in html.split('\n'):
        if len(line) == 0:
            continue
        else:
            lines.append(line.strip())
    html = '\n'.join(lines)
    return images, html
Example #30
        url = link.get('href')
        if url is not None and 'incident-reports' in url and 'read more' not in link.text:
            reportUrls.append('http://www.wrps.on.ca' + url)
        elif url is not None and 'next' in link.text:
            listUrls.append('http://www.wrps.on.ca' + url)

g = geocoders.Google(domain='maps.google.ca')

#scraperwiki.sqlite.execute('drop table swdata')
#scraperwiki.sqlite.commit()

reportUrls.reverse()
for report in reportUrls:
    print report
    html = scraperwiki.scrape(report)
    lines = html.split('\n')
    itype = None
    for line in lines:
        match = re.search('<h1 class="title">.*eports( for)? (.*?)( ?- ?[Uu][Pp][Dd][Aa][Tt][Ee].*)?</h1>', line)
        if match:
            try:
                reportdate = datetime.strptime(match.group(2), '%B %d, %Y')
            except ValueError:
                reportdate = datetime.strptime(match.group(2), '%B%d, %Y')
            continue
        match = re.search('^<p>.*?Incident # ([0-9]{2}-[0-9]{6}).*?Type : ([^&]*)&?.*?<br />(.*?)<br />(.*?)</p>', line.strip())
        if match:
            if itype is not None:
                processIncident(incident, itype, location, text, reportdate)
                #
    
#print "ok handle"
# Want debugging messages?
br.set_debug_http(True)
br.set_debug_redirects(True)
br.set_debug_responses(True)
    
# User-Agent 
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
#print "ok headers"
the_list = []

list_url = "http://greencracker.net/wp-content/uploads/2013/06/elevenn.csv"
response = br.open(list_url)
html = response.read()

the_list = html.split("*")
for i in range(len(the_list)):
    the_list[i] = the_list[i].replace("\r", "")
    the_list[i] = the_list[i].replace("'", "")
    the_list[i] = the_list[i].replace('"', "")
    the_list[i] = the_list[i].strip()

print the_list


target = "http://www.nsopw.gov/en-us/Search"
response = br.open(target)
html = response.read()
print html
Example #32
def parse_cast_td(td):
    html = lxml.html.tostring(td, encoding="utf-8").decode()
    # NOTE: the first alternative was probably "（(.*)）" (fullwidth parentheses
    # around role credits) before the page garbled it; a bare "(.*)" would
    # delete the entire string
    casts = [re.sub(r"（(.*)）|<[^<]*>|[ 　]+", "", x) for x in html.split("<br>")]
    return casts
Example #33
def get_main_content(html):
    if not isinstance(html, unicode):
        return '', ''

    html_lines_len = [len(x.strip()) for x in html.split('\n')]

    # Preserve image info
    """
    images = {}
    for img in re.findall(RE_IMG, html):
        md5 = hashlib.md5(img.encode('utf-8','ignore')).hexdigest()[:16]
        html = html.replace(img, md5)
        r = re.findall(RE_IMG_SRC, img)
        if len(r) == 1: src = r[0][1]
        else: src = ''
        images[md5] = "<img src='%s'>" % src#img
    """

    # Strip all html tags
    text = re.sub(RE_TAG, '', html)

    # Extract the publication time
    time = ''
    t_time = re.findall(RE_TIME, text)
    if len(t_time) > 0:
        time = t_time[0]

    date = ''
    t_date = re.findall(RE_DATETIME, text)
    if len(t_date) > 0:
        date = t_date[0][0]

    lines = [x.strip() if is_useful_line(x) else '' for x in text.split('\n')]
    index_dist = []
    size = len(lines)
    for i in xrange(size - BLOCKS_WIDTH + 1):
        char_num = 0
        for j in xrange(i, i + BLOCKS_WIDTH):
            strip = re.sub(ur'\s+', '', lines[j])
            char_num += len(strip)
        index_dist.append(char_num)
    main_text = ''
    fstart = -1
    start = -1
    end = -1
    flag_s = False
    flag_e = False
    first_match = True
    for i in xrange(len(index_dist) - 1):
        if first_match and not flag_s:
            if index_dist[i] > THRESHOLD / 2:
                if index_dist[i + 1] != 0 or index_dist[i + 2] != 0:
                    first_match = False
                    flag_s = True
                    start = i
                    fstart = i
                    continue
        if index_dist[i] > THRESHOLD and not flag_s:
            if index_dist[i + 1] != 0 or index_dist[i + 2] != 0 or index_dist[
                    i + 3] != 0:
                flag_s = True
                start = i
                continue
        if flag_s:
            if index_dist[i] == 0 or index_dist[i + 1] == 0:
                end = i
                flag_e = True
        tmp = ''
        if flag_e:
            for ii in xrange(start, end + 1):
                if (len(lines[ii]) < 1): continue
                tmp += lines[ii] + '\n'
            main_text += tmp
            flag_s = flag_e = False


#    for pre in xrange(fstart - 1, max(0, fstart - BLOCKS_WIDTH), -1):
#        for md5 in images.keys():
#            if lines[pre].find(md5) > 0:
#                main_text = lines[pre] + '\n' + main_text
#                break
    """
    for md5,img in images.iteritems():
        main_text = main_text.replace(md5, img)
    """
    return strtotime(date, time), main_text
Example #34
def get_main_content(html):
    if not isinstance(html, unicode):
        return '',''

    html_lines_len = [len(x.strip()) for x in html.split('\n')]

    # Preserve image info
    images = {}
    for img in re.findall(RE_IMG, html):
        md5 = hashlib.md5(img.encode('utf-8','ignore')).hexdigest()[:16]
        html = html.replace(img, md5)
        r = re.findall(RE_IMG_SRC, img)
        if len(r) == 1: src = r[0][1]
        else: src = ''
        images[md5] = "<img src='%s'>" % src  # img

    # Strip all html tags
    text = re.sub(RE_TAG, '', html)

    # Extract the publication time
    time = ''
    t = re.findall(RE_DATETIME, text)
    if len(t) > 0:
        time = t[0][0]

    lines = [x.strip() if is_useful_line(x) else '' for x in text.split('\n')]
    index_dist = []
    size = len(lines)
    for i in xrange(size - BLOCKS_WIDTH + 1):
        char_num = 0
        for j in xrange(i, i + BLOCKS_WIDTH):
            strip = re.sub(ur'\s+', '', lines[j])
            char_num += len(strip)
        index_dist.append(char_num)
    main_text = ''
    fstart = -1
    start = -1
    end = -1
    flag_s = False
    flag_e = False
    first_match = True
    for i in xrange(len(index_dist) - 1):
        if first_match and not flag_s:
            if index_dist[i] > THRESHOLD / 2:
                if index_dist[i+1] != 0 or index_dist[i+2] != 0:
                    first_match = False
                    flag_s = True
                    start = i
                    fstart = i
                    continue
        if index_dist[i] > THRESHOLD and not flag_s:
            if index_dist[i+1] != 0 or index_dist[i+2] != 0 or index_dist[i+3] != 0:
                flag_s = True
                start = i
                continue
        if flag_s:
            if index_dist[i] == 0 or index_dist[i+1] == 0:
                end = i
                flag_e = True
        tmp = ''
        if flag_e:
            for ii in xrange(start, end+1):
                if (len(lines[ii]) < 1): continue
                tmp += lines[ii] + '\n'
            main_text += tmp
            flag_s = flag_e = False

#    for pre in xrange(fstart - 1, max(0, fstart - BLOCKS_WIDTH), -1):
#        for md5 in images.keys():
#            if lines[pre].find(md5) > 0:
#                main_text = lines[pre] + '\n' + main_text
#                break

    for md5,img in images.iteritems():
        main_text = main_text.replace(md5, img)
    return strtotime(time), main_text
Example #35
tree = html.fromstring(page.text)

#This will create a list of prices
reflection_html = tree.xpath('//div[@class="moduleBody"]')[0]
html = etree.tostring(reflection_html, pretty_print=True)

html = html.replace('\r\n', '\n')
html = html.replace('<br/>', '\n').replace('&#160;', ' ').replace('\n \n', '\n\n')

new_html = ""
for line in html.split('\n'):  # iterate lines; iterating the string itself yields characters
    new_html += line.strip()
    new_html += "\r"
#html = new_html

parts = html.split("<h1> </h1>")


#with open("reflection.html", 'w') as out:
#    out.write(html.encode('utf8'))
#print "{} parts.".format(len(parts))

with open('output.markdown', 'w') as out_all:
    for count, part in enumerate(parts):
        # NOTE: 'markdown' is built from the full page and never written out;
        # converter.convert(part, ...) was presumably intended
        markdown = converter.convert(html, 'markdown', format='html')
        out_all.write(part.encode('utf8'))
        if count+1 < len(parts):
            out_all.write("\n\\pagebreak\n")
        with open('output{}.markdown'.format(count), 'w') as out:
            out.write(part.encode('utf8'))
Example #36
    try:
        for athlete in data:
            tick = timer()
            while not outQ.empty():
                logger.debug(outQ.get(False))
            athlete['EventCount'] = c.execute(
                "SELECT dbo.getAthleteEventCount({})".format(
                    athlete['AthleteID']))
            logger.debug("Checking ID {}, {} {} ({})".format(
                athlete['AthleteID'], athlete['FirstName'],
                athlete['LastName'], athlete['EventCount']))
            html = getURL(baseURL.format(athlete['AthleteID']))
            try:
                runcount = int(
                    html.split('<h2>')[1].split('<br/>')[0].split(
                        ' runs at All Events')[0].split(' ')[-1])
                logger.debug("Runcount = {}".format(runcount))
            except (ValueError, IndexError, AttributeError):
                print("Error reading run count for Athlete {}".format(
                    athlete['AthleteID']))
                logger.warning("Error reading run count for Athlete {}".format(
                    athlete['AthleteID']))
                continue
            if athlete['EventCount'] != runcount:
                eventsMissing = runcount - athlete['EventCount']
                rows = lxml.html.fromstring(
                    '<table' + html.split('<table')[3].split('</table>')[0] +
                    '</table>').xpath('//tbody/tr')
                hist_data = c.execute(
                    "SELECT * FROM getAthleteEventHistory({})".format(
                        athlete['AthleteID']))