def parseWeb():
    """ This module is used to get the data points from the ECE2031 server """
    response = urllib2.urlopen(serverUrl)
    html = response.read()

    xPoints = []     # list for robot x points
    yPoints = []     # list for robot y points
    xDesPoints = []  # list for destination x points
    yDesPoints = []  # list for destination y points

    # this line should be changed for starting trigger.
    strparse = "<p>"
    html = html.split(strparse)[-1]
    strparse = "</p>"
    html = html.split(strparse)[0]

    for line in html.split('<br>'):
        if line != "":
            points = line.split(',')
            print points
            x = float(points[0])
            y = float(points[1])
            xPoints.append(x)
            yPoints.append(y)

    print "Robot Path"
    for i in range(1, len(xPoints)):
        print '(' + str(xPoints[i])[:5] + ',' + str(yPoints[i])[:5] + ')'

    return xPoints, yPoints
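# Hypothetical driver for parseWeb() above; assumes serverUrl is defined elsewhere
# in the module and that the server wraps its "x,y<br>..." point list in a <p>
# block (both assumptions, not part of the original code).
if __name__ == "__main__":
    xs, ys = parseWeb()
    print "Fetched %d robot path points" % len(xs)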
def difference(self, html):
    print "Initial size %d" % len(html)
    s_html = html.split('\n')
    s_last_html = html.split('\n')
    # ndiff returns a generator; materialise it so it can be iterated twice below
    diff = list(difflib.ndiff(s_last_html, s_html))
    for l in diff:
        print l
    print "kikou\n"
    delta = [l for l in diff if l.startswith('+ ')]
    print "Delta size %d" % len(delta)
    return delta
def link_html_images(html):
    image_link_elem = (
        '<a href="<% $c->uri_for($image_path . "/{image}") %>"/>'
        '<img class="screenshot"'
        ' src="<% $c->uri_for($image_path . "/{image}") %>"'
        ' alt=""/>'
        '</a>'
    )
    image_template = cleandoc("""
        <div class="row-fluid">
          <div class="span6">
            {image_link}
          </div>
        </div>
        """.format(image_link=image_link_elem))
    image_re = re.compile(
        r'<p><img alt="" src="images/(?P<image>\w+\.png)" title=""\s*/?></p>'
    )
    out_lines = []
    for line in html.split('\n'):
        match = image_re.match(line)
        if match:
            image_name = match.group('image')
            out_lines.append(image_template.format(image=image_name))
        else:
            out_lines.append(line)
    return '\n'.join(out_lines)
def scrape(self, chamber, term_name):
    for t in self.metadata['terms']:
        if t['name'] == term_name:
            session = t['sessions'][-1]
            slug = self.metadata['session_details'][session]['slug']

    if chamber == 'upper':
        chamber_slug = 'Senate'
    elif chamber == 'lower':
        chamber_slug = 'Assembly'

    leg_base_url = 'http://www.leg.state.nv.us/App/Legislator/A/%s/%s/' % (
        chamber_slug, slug)
    leg_json_url = 'http://www.leg.state.nv.us/App/Legislator/A/api/%s/Legislator?house=%s' % (
        slug, chamber_slug)

    resp = json.loads(self.get(leg_json_url).text)

    for item in resp:
        # empty district
        empty_names = ['District No', 'Vacant']
        if any(name in item['FullName'] for name in empty_names):
            continue
        last, first = item['FullName'].split(",", 1)
        item['FullName'] = "{first} {last}".format(last=last,
                                                   first=first).strip()
        leg = Legislator(term_name, chamber, item['DistrictNbr'],
                         item['FullName'], party=item['Party'],
                         photo_url=item['PhotoURL'])
        leg_url = leg_base_url + item['DistrictNbr']

        # hack to get the legislator ID
        html = self.get(leg_url).text
        for l in html.split('\n'):
            if 'GetLegislatorDetails' in l:
                leg_id = l.split(',')[1].split("'")[1]

        # fetch the json used by the page
        leg_details_url = ('https://www.leg.state.nv.us/App/Legislator/A/api/78th2015/Legislator?id='
                           + leg_id)
        leg_resp = json.loads(self.get(leg_details_url).text)
        details = leg_resp['legislatorDetails']

        address = details['Address1']
        address2 = details['Address2']
        if address2:
            address += ' ' + address2

        phone = details['LCBPhone']
        email = details['LCBEmail']

        leg.add_office('district', 'District Address', address=address,
                       phone=phone, email=email)
        leg.add_source(leg_details_url)
        self.save_legislator(leg)
def remove_empty_lines(html):
    key = '%s:remove_empty_lines' % hash(html)
    out = cache.get(key, namespace="filters")
    if out:
        return out

    if '</' in html:
        html = html.strip().replace('\n', '')
        soup = BeautifulSoup(html)
        lines = []
        for element in soup.contents:
            if isinstance(element, Tag):
                if element.text:
                    lines.append(str(element).strip())
                elif 'br' in str(element):
                    lines.append('\n')
            elif isinstance(element, NavigableString):
                lines.append(str(element).strip())
        out = ''.join(lines).strip()
        while '\n\n' in out:
            out = out.replace('\n\n', '\n')
    else:
        out = '\n'.join([line for line in html.split('\n') if line.strip()])

    cache.set(key, out, namespace="filters")
    return out
def userstats_api(self, user):
    url = "http://www.bright-shadows.net/userdata.php?username=%s"
    html = urllib2.urlopen(url % plugins.to_utf8(user), timeout=5).read()
    if html == "Unknown User":
        return None
    real_user, rank, users_total, challs_solved, challs_total = html.split(":")
    return (real_user, str(int(challs_solved)), int(challs_total),
            str(int(rank)), int(users_total), None, None, None)
def userstats_api(self, user):
    url = "http://www.bright-shadows.net/userdata.php?"
    html = urllib.request.urlopen(url + urllib.parse.urlencode({"username": user}),
                                  timeout=5).read()
    html = html.decode()
    if html == "Unknown User":
        return None
    real_user, rank, users_total, challs_cnt, challs_total = html.split(":")
    return (real_user, str(int(challs_cnt)), int(challs_total),
            str(int(rank)), int(users_total), None, None, None)
def html_to_words(html):
    # collapse angle brackets, most punctuation, digits and spaces into single spaces
    html = re.sub(
        '[\.<>=\\/"|~|!|@|#|$|%|^|&|*|(|)|{|}| |1|2|3|4|5|6|7|8|9|0]+',
        ' ', html)
    html = html.strip().lower()
    html = re.sub(' +', ' ', html)
    words = html.split(' ')
    return words
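# Illustrative call to html_to_words above (the example input is not from the
# original source). Note that the character class strips angle brackets, digits
# and most punctuation, but not commas, so commas survive in the output.
words = html_to_words('<p>Hello, World 2024</p>')
# words == ['p', 'hello,', 'world', 'p']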
def userstats_api(self, user):
    url = "https://www.securitytraps.pl/wcscore.php?uname=%s&key=%s"
    authkey = self.settings["securitytraps_api_key"]
    html = urllib2.urlopen(url % (plugins.to_utf8(user), authkey), timeout=5).read()
    if html == "0":
        return None
    rank, challs_solved, challs_total, users_total, score = html.split(":")
    return (user, str(int(challs_solved)), int(challs_total),
            str(int(rank)), int(users_total), int(score), None, None)
def get_content1(self):
    '''
    Method 1
    @summary: locate the main text by text density
        1. Strip the HTML tags and drop all whitespace except spaces and newlines.
        2. Sum the text length of every n consecutive paragraphs; this sum describes
           the text density of that region.
        3. Take the densest region as the initial start and end of the main text.
        4. From the start, search upwards until the density drops to or below the
           threshold; that index is the real start. The threshold used here is the
           minimum acceptable paragraph-block length.
        5. From the end, search downwards in the same way to find the real end.
    Filtering out noise such as index pages:
        1. Main text normally contains <p> tags, so the ratio of text inside <p>
           tags to the whole extracted text must exceed a threshold.
    To be solved: pagination, e.g. http://mini.eastday.com/a/171205202028050-3.html
    ---------
    ---------
    @result:
    '''
    if USEFUL_TAG:
        html = self.__replace_str(self._text, r'(?!{useful_tag})<(.|\n)+?>'.format(useful_tag='|'.join(USEFUL_TAG)))
    else:
        html = self.__replace_str(self._text, '<(.|\n)*?>')

    paragraphs = html.split('\n')
    # for i, paragraph in enumerate(paragraphs):
    #     print(i, paragraph)

    # text density of n consecutive paragraphs
    paragraph_lengths = [len(self.__del_html_tag(paragraph)) for paragraph in paragraphs]
    # paragraph_lengths = [len(paragraph.strip()) for paragraph in paragraphs]
    paragraph_block_lengths = [sum(paragraph_lengths[i: i + MAX_PARAGRAPH_DISTANCE]) for i in range(len(paragraph_lengths))]  # sum of n consecutive paragraph lengths (a "paragraph block"); e.g. lengths [0, 1, 2, 3, 4] give 3-paragraph blocks [3, 6, 9, 7, 4]

    self._content_center_pos = content_start_pos = content_end_pos = paragraph_block_lengths.index(max(paragraph_block_lengths))  # the start and end of the article default to the densest paragraph block

    min_paragraph_block_length = MIN_PARAGRAPH_LENGHT * MAX_PARAGRAPH_DISTANCE

    # while the block length exceeds the minimum block length and the index is in
    # range, we are still inside the main text: keep moving the start index upwards
    while content_start_pos > 0 and paragraph_block_lengths[content_start_pos] > min_paragraph_block_length:
        content_start_pos -= 1

    # likewise, keep moving the end index downwards
    while content_end_pos < len(paragraph_block_lengths) and paragraph_block_lengths[content_end_pos] > min_paragraph_block_length:
        content_end_pos += 1

    # clean up redundant newlines and whitespace
    content = paragraphs[content_start_pos: content_end_pos]
    content = '\n'.join(content)
    content = self.__del_unnecessary_character(content)

    # ratio of text inside <p> tags to the whole extracted text; above a threshold
    # the block is accepted as main text
    paragraphs_text_len = len(self.__del_html_tag(''.join(tools.get_info(content, '<p.*?>(.*?)</p>'))))
    content_text_len = len(self.__del_html_tag(content))
    if content_text_len and content_text_len > MIN_COUNTENT_WORDS and ((paragraphs_text_len / content_text_len) > MIN_PARAGRAPH_AND_CONTENT_PROPORTION):
        self._content_start_pos = content_start_pos
        self._content_end_pos = content_end_pos
        self._paragraphs = paragraphs
        # print(content_start_pos, content_end_pos, self._content_center_pos)
        return content
    else:
        return ''
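# Minimal standalone sketch (not the original get_content1 implementation) of the
# text-density idea described in the docstring above: strip tags, measure the text
# length of each line, sum the lengths over a sliding window, take the densest
# window as the seed, and expand up and down while the density stays above a
# minimum. The names and thresholds here are illustrative assumptions.
import re

def densest_block(html, window=5, min_len=50):
    lines = [re.sub(r'<[^>]+>', '', l).strip() for l in html.split('\n')]
    lens = [len(l) for l in lines]
    blocks = [sum(lens[i:i + window]) for i in range(len(lens))]
    if not blocks or max(blocks) < min_len:
        return ''
    start = end = blocks.index(max(blocks))   # seed at the densest window
    while start > 0 and blocks[start] > min_len:        # expand upwards
        start -= 1
    while end < len(blocks) and blocks[end] > min_len:  # expand downwards
        end += 1
    return '\n'.join(l for l in lines[start:end] if l)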
def userstats_api(self, user):
    url = "https://cryptohack.org/wechall/userscore/?username=%s&authkey=%s"
    authkey = self.settings["cryptohack_api_key"]
    r = self.sess.get(url % (Plugin.to_utf8(user), authkey), timeout=5)
    html = r.text
    if html == "failed":
        return None
    user, rank, score, scoremax, challs_solved, challs_total, users_total = html.split(":")
    return (user, str(int(challs_solved)), int(challs_total), str(int(rank)),
            int(users_total), int(score), int(scoremax), None)
def shrink_style(cls, style_str, filtered_css_properties, changed_css_properties):
    if not style_str:
        return None
    properties = {}
    for p in style_str.split(";"):
        if p.strip():
            token = p.split(":")
            if len(token) > 1:
                properties[token[0].strip()] = token[1].strip()
    return Utils._shrink_properties(properties, filtered_css_properties,
                                    changed_css_properties)
def fields_from_split_html(template, html, separator, regex_with_groups_named_as_keys):
    list_to_ret = []
    lines = html.split(separator)
    for line in lines:
        m = re.match(regex_with_groups_named_as_keys, line)
        if m is not None:
            dict_to_ret = dict(template.items() + m.groupdict().items())
            list_to_ret.append(dict_to_ret)
    return list_to_ret
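# Hypothetical usage of fields_from_split_html above (the regex and values are
# illustrative, not from the original source): each separator-delimited fragment
# that matches contributes one dict, merged with the constant `template` fields.
rows = fields_from_split_html(
    {'source': 'example'},
    'id=1 name=foo<br>id=2 name=bar',
    '<br>',
    r'id=(?P<id>\d+) name=(?P<name>\w+)')
# rows == [{'source': 'example', 'id': '1', 'name': 'foo'},
#          {'source': 'example', 'id': '2', 'name': 'bar'}]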
def is_hidden_node(cls, node):
    """ Check if a node is hidden in html page """
    style_list = node.get("style", None)
    if style_list:
        for p in style_list.split(";"):
            tokens = p.split(":")
            if len(tokens) >= 2 and tokens[0].strip().lower() == "display" and tokens[1].strip().lower() == "none":
                return True
    return False
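# Hedged usage sketch for is_hidden_node above; it assumes the method lives on the
# same Utils helper class as shrink_style and that lxml.html is available (both
# assumptions, not stated in the original source).
import lxml.html

node = lxml.html.fromstring('<div style="color: red; display: none">hidden</div>')
# Utils.is_hidden_node(node)  ->  True, because the inline style sets display:none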
def getEvent(self, url, parkrunEvent):
    self.logger.debug('Hitting {}'.format(url + str(parkrunEvent)))
    html = self.getURL(url + str(parkrunEvent))

    # Test if we got a valid response
    if html is None:
        # most likely a 404 error
        self.logger.warning('Error retrieving event')
        self.msgQ.put(
            Message('Error', self.id, 'Error getting event. Check url ' + url))
        return None
    if '<h1>Something odd has happened, so here are the most first finishers</h1>' in html:
        self.logger.warning('Error retrieving event')
        self.msgQ.put(
            Message('Error', self.id, 'Possible URL error getting event. Check url ' + url))
        return None

    html = '<table' + html.split('<table')[1]
    html = html.split('</p>')[0]
    table = lxml.html.fromstring(html)
    return self.getEventTable(table)
def scrape(self, chamber, term_name):
    for t in self.metadata['terms']:
        if t['name'] == term_name:
            session = t['sessions'][-1]
            slug = self.metadata['session_details'][session]['slug']

    if chamber == 'upper':
        chamber_slug = 'Senate'
    elif chamber == 'lower':
        chamber_slug = 'Assembly'

    leg_base_url = 'http://www.leg.state.nv.us/App/Legislator/A/%s/%s/' % (chamber_slug, slug)
    leg_json_url = 'http://www.leg.state.nv.us/App/Legislator/A/api/%s/Legislator?house=%s' % (slug, chamber_slug)

    resp = json.loads(self.get(leg_json_url).text)

    for item in resp:
        # empty district
        empty_names = ['District No', 'Vacant']
        if any(name in item['FullName'] for name in empty_names):
            continue
        last, first = item['FullName'].split(",", 1)
        item['FullName'] = "{first} {last}".format(last=last,
                                                   first=first).strip()
        leg = Legislator(term_name, chamber, item['DistrictNbr'],
                         item['FullName'], party=item['Party'],
                         photo_url=item['PhotoURL'])
        leg_url = leg_base_url + item['DistrictNbr']

        # hack to get the legislator ID
        html = self.get(leg_url).text
        for l in html.split('\n'):
            if 'GetLegislatorDetails' in l:
                leg_id = l.split(',')[1].split("'")[1]

        # fetch the json used by the page
        leg_details_url = ('https://www.leg.state.nv.us/App/Legislator/A/api/78th2015/Legislator?id='
                           + leg_id)
        leg_resp = json.loads(self.get(leg_details_url).text)
        details = leg_resp['legislatorDetails']

        address = details['Address1']
        address2 = details['Address2']
        if address2:
            address += ' ' + address2
        address += '\n%s, NV %s' % (details['City'], details['Zip'])

        phone = details['LCBPhone']
        email = details['LCBEmail']

        leg.add_office('district', 'District Address', address=address,
                       phone=phone, email=email)
        leg.add_source(leg_details_url)
        self.save_legislator(leg)
def userstats_api(self, user):
    url = "https://247ctf.com/wechall_validate_score_service?username=%s&authkey=%s"
    authkey = self.settings["247ctf_api_key"]
    html = urllib.request.urlopen(url % (Plugin.to_utf8(user), authkey), timeout=5).read()
    html = html.decode()
    if html == "":
        return None
    user, rank, score, scoremax, challs_solved, challs_total, users_total = html.split(":")
    return (user, str(int(challs_solved)), int(challs_total), str(int(rank)),
            int(users_total), int(score), int(scoremax), None)
def userstats(self, user):
    url = "https://defendtheweb.net/wechall/userscore?username=%s&authkey=%s"
    authkey = self.settings["defendtheweb_auth_key"]
    html = urllib.request.urlopen(url % (Plugin.to_utf8(user), authkey), timeout=5).read()
    html = html.decode()
    if html == "0":
        return None
    user, rank, score, scoremax, challs_solved, challs_total, users_total = html.split(":")
    return (user, str(int(challs_solved)), int(challs_total), str(int(rank)),
            int(users_total), int(score), int(scoremax), None)
def parse(url, html, webInfo):
    encoding, html = html_util.get_unicode_str(html)
    if 'ISO' in encoding:
        return '', '', '', '', '', ''
    if encoding == '':
        return '', '', '', '', '', ''

    newHtml = ''
    imgList = []
    for x in html.split('\n'):
        if x.count('<img') > 1:
            x = x.replace(u'<img', u'\n<img')
        newHtml = newHtml + "\n" + x
    html = newHtml

    try:
        doc = lxml.html.document_fromstring(html)
        doc.make_links_absolute(url)
        html = lxml.etree.tounicode(doc, method='html')
    except:
        traceback.print_exc()
        pass

    newHtml = getBody(html, webInfo['bodyPattern'])
    if newHtml == "":
        return '', '', '', '', '', ''

    title, time, text, images = get_main_content(html, newHtml, webInfo)
    mypos = ''
    if webInfo.has_key('textPattern'):
        text = getText(html, webInfo['textPattern'])
    if webInfo.has_key('titlePattern'):
        title = getText(html, webInfo['titlePattern'])
    if webInfo.has_key('myposPattern'):
        mypos = getText(html, webInfo['myposPattern'])
    if webInfo.has_key('textXpath'):
        text = getDataByXpath(doc, webInfo['textXpath'])
    if webInfo.has_key('titleXpath'):
        title = getDataByXpath(doc, webInfo['titleXpath'])
    if webInfo.has_key('myposXpath'):
        mypos = getDataByXpath(doc, webInfo['myposXpath'])
    if webInfo.has_key('publishTimeXpath'):
        time = getDataByXpath(doc, webInfo['publishTimeXpath'])
        time = strtotime(time, '')

    if webInfo.has_key('imgReplace'):
        patternList = webInfo['imgReplace']
        for picUrl in images:
            for pattern in patternList:
                picUrl = picUrl.replace(pattern[0], pattern[1])
            imgList.append(picUrl)
    else:
        imgList = images

    # print time.encode('utf-8')
    # print text.encode('utf-8')
    return encoding, title, text, time, imgList, mypos
def getEventHistory(self, url):
    self.logger.debug('Hitting {}'.format(url))
    html = self.getURL(url)

    # Test if we got a valid response
    if html is None:
        # most likely a 404 error
        self.logger.warning('Error retrieving event. URL: ' + url)
        self.msgQ.put(
            Message('Error', self.id,
                    'Possible 404 error getting event history. Check url ' + url))
        return None
    if '<h1>Something odd has happened, so here are the most first finishers</h1>' in html:
        self.logger.warning('Error retrieving event')
        self.msgQ.put(
            Message('Error', self.id, 'URL error in event history. Check ' + url))
        return None

    html = '<table' + html.split('<table')[1]
    html = html.split('<div')[0]
    table = lxml.html.fromstring(html)

    headings = ['EventNumber', 'EventDate', 'Runners']
    rows = table.xpath('//tbody/tr')
    data = []
    for row in rows:
        d = {}
        for h, v in zip(headings, row.getchildren()):
            if h == 'EventNumber':
                d[h] = int(v.getchildren()[0].text)
            if h == 'Runners':
                d[h] = int(v.text)
            if h == 'EventDate':
                d[h] = datetime.strptime(v.getchildren()[0].text, "%d/%m/%Y")
        data.insert(0, d)
    return data
def get_history_price_by_fund(fundid, starttime):
    enddate = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    startdate = starttime.split(" ")[0]
    delta_days = cal_time(startdate + " 00:00:00", enddate + " 00:00:00")
    html = common_download(
        "http://fund.eastmoney.com/f10/F10DataApi.aspx?type=lsjz&code=" + fundid
        + "&page=1&per=" + str(delta_days) + "&sdate=" + startdate
        + "&edate=" + enddate)
    html = html.split("<tr>")
    history = []
    history_pattern = re.compile(
        r"<td>(\d{4}-\d{2}-\d{2})</td><td class='tor bold'>(\d*.\d{4})</td>.*")
    for i in range(2, len(html)):
        history.append(history_pattern.findall(html[i]))
    return history
def get_fhsp_records(fundid, sdate):
    html = common_download("http://fund.eastmoney.com/f10/fhsp_" + fundid + ".html")
    text = "".join(html.split())
    fhsp_pattern = re.compile(
        r"<td>(\d{4}-\d{2}-\d{2})</td><td>每份派现金(\d*\.\d{4})元</td>")
    tmp = fhsp_pattern.findall(text)
    retval = []
    for i in range(0, len(tmp)):
        delta = cal_time(sdate, tmp[i][0] + " 15:00:00")
        if delta > 0:
            retval.append(tmp[i])
    retval.reverse()  # reverse the order
    return retval
def getLatestEvent(self, url):
    self.logger.debug('Hitting {}'.format(url))
    html = self.getURL(url)

    # Test if we got a valid response
    if html is None:
        # most likely a 404 error
        self.logger.warning('Error retrieving event')
        self.msgQ.put(
            Message('Error', self.id, 'Error getting event. Check url ' + url))
        return 0, None, None
    if '<h1>Something odd has happened, so here are the most first finishers</h1>' in html:
        self.logger.warning('Error retrieving event')
        self.msgQ.put(
            Message('Error', self.id, 'Possible URL error getting event. Check url ' + url))
        return 0, None, None

    try:
        eventHTML = html.split('<h2>')[1]
        eventHTML = eventHTML.split('</h2>')[0]
    except IndexError:
        self.logger.warning('Error retrieving event')
        self.msgQ.put(
            Message('Error', self.id, 'Possible page error retrieving url ' + url))
        return 0, None, None

    if len(eventHTML.split('#')[1].split('-')[0].strip()) == 0:
        return 0, None, None

    eventNumber = int(eventHTML.split('#')[1].split('-')[0].strip())
    eventDate = datetime.strptime(eventHTML[len(eventHTML) - 10:], '%d/%m/%Y')

    html = '<table' + html.split('<table')[1]
    html = html.split('</p>')[0]
    table = lxml.html.fromstring(html)
    return eventNumber, eventDate, self.getEventTable(table)
def render_placemark(record, external_data=True):
    data = record.description
    description = []
    for key, value in data.items():
        if value.strip() == '':
            value = '---'
        value = value.replace("\n\n", '<br><br>')
        description.append(f"<b>{key.upper()}</b><p>{value}</p>")
    description = "\n".join(description)

    links_html = ''.join(
        map(lambda link: f"<li>{link.to_html()}</li>", record.links))

    # Generate the Locus extended-data block for attachments, if requested
    if record.attachments and external_data:
        attachments = "".join(
            map(
                lambda attachment:
                f"<lc:attachment>files/{attachment.id}.jpg</lc:attachment>",
                record.attachments))
        extended_data = f'<ExtendedData xmlns:lc="http://www.locusmap.eu">{attachments}</ExtendedData>'
    else:
        extended_data = ''

    coords = [record.coords[1], record.coords[0]]
    if record.elevation is not None:
        coords.append(record.elevation)
    coords = ",".join(map(str, coords))

    html = f"""
    <Placemark>
        <name>{record.name} [{record.length or '???'}m]</name>
        <description><![CDATA[
            <style type="text/css">p{{margin-top:0;text-align:justify}}</style>
            <small>{description}<b>LINKI</b><ul>{links_html}</ul></small><br/>
        ]]></description>
        <styleUrl>{record.icon.value}</styleUrl>
        {extended_data}
        <Point>
            <coordinates>{coords}</coordinates>
        </Point>
    </Placemark>
    """
    return ''.join([line.strip() for line in html.split("\n")]) + "\n"
def createNewContact(self, username, mobile):
    conn = httplib.HTTPConnection("dealer.che168.com", timeout=timeout_che168)
    headers = copy.copy(self.headers)

    # encode the username the way the endpoint expects (%uXXXX escapes)
    encoded_username = ""
    for c in username:
        encoded_username += "%u" + ("%x" % ord(c)).upper()

    conn.request("GET",
                 "/Handler/SaleMan/SaveSaleMan.ashx?Name=" + encoded_username
                 + "&Mobile=" + mobile + "&QQ=&weixin=&pic=&file=",
                 headers=headers)
    res = conn.getresponse()
    resHeaders = res.getheaders()
    resRead = res.read()
    html = self.decodeBody(resHeaders, resRead)
    html = html.decode('GB18030')
    conn.close()

    html = html.split('|')
    if len(html) > 1:
        return html[1]
    return None
def getTranscriptions(urls):
    for url in urls:
        namePrefix = "podFoundMyFitness"
        name = url.replace('https://www.foundmyfitness.com/episodes/', '').capitalize()
        fullName = namePrefix + name
        if not path.exists(folder + fullName + ".txt"):
            chromedriver = 'C:\\Program Files\\chromedriver\\chromedriver.exe'
            options = webdriver.ChromeOptions()
            options.add_argument('headless')
            # options.add_argument('window-size=1200x600')
            browser = webdriver.Chrome(executable_path=chromedriver, chrome_options=options)
            browser.get(url)
            html = browser.page_source
            if 'episode_tabs-content' in html:
                html2 = html.split('episode_tabs-content">')[1]
                html3 = html2.split("container episode_supporter-call")[0]
                soup = BeautifulSoup(html3)
                text = soup.get_text()
                print(text)
                fileToWrite = open(folder + fullName + ".txt", "w+", encoding="utf8")
                fileToWrite.write(text)
                fileToWrite.close()
            browser.quit()
def html_clean(html):
    import lxml.html.clean
    import lxml.html
    import lxml.etree

    html, errors = tidy_document(
        html,
        # Tidy options: http://tidy.sourceforge.net/docs/quickref.html
        options={'bare': 1,
                 'clean': 1,
                 'output-xhtml': 1,
                 'drop-font-tags': 1,
                 'drop-proprietary-attributes': 1,
                 'hide-comments': 1,
                 'char-encoding': 'utf8',
                 'input-encoding': 'utf8',
                 'output-encoding': 'utf8'})

    cleaner = lxml.html.clean.Cleaner(
        kill_tags=frozenset(['script', 'style', 'option']),
        remove_tags=frozenset(['a', 'strong', 'em']),
        safe_attrs_only=True,
        safe_attrs=frozenset())
    html = cleaner.clean_html(html)
    # html = lxml.etree.tostring(lxml.html.fromstring(html), pretty_print=True).decode('utf8')
    # html = html.encode('utf-8', errors='strict')

    soup = BeautifulSoup(html)
    # [s.extract() for s in soup('script')]  # remove 'script', 'style', 'option' tags
    # [s.extract() for s in soup('style')]
    # [s.extract() for s in soup('option')]
    html = soup.prettify()
    # html = htmllaundry.strip_markup(html)  # leave only text

    # remove continuous empty lines
    html = re.sub(r'\n\s*\n+', '\n\n', html).strip()
    # remove continuous spaces (pass re.M as a keyword so it is a flag, not a count)
    html = re.sub(r'[ \t]+', ' ', html, flags=re.M).strip()

    # cleaned_html = [sent for sent in cleaned_html.split(
    #     '\n')]  # if len(sent.split()) == 0 or len(sent.split()) >= 6]
    html_lines = html.split('\n')
    # return html_lines
    return list(html_sent_word_tokenize(html_lines))
def get_fame(self):
    ''' Get fame information from http://www.whoisbigger.com/ '''
    # get whoisbigger page
    url = ("http://www.whoisbigger.com/download_entity.php?entity=entity-"
           + self.name.lower().replace(" ", "-"))
    # open page
    u = urllib.urlopen(url)
    # u = urlopen(url)
    try:
        html = u.read()
    finally:
        u.close()

    # if this person is famous
    if html != 'Error downloading this file.':
        try:
            dat = html.split(",\"")
            startnumber = 16
            # if the search is a person
            if dat[7].replace('"', '').split(":")[1] == "1":
                startnumber += 6
            for i in range(3, startnumber):
                fameinfo = dat[i].replace('"', '').split(":")
                # dynamically create the attribute
                setattr(self, fameinfo[0], fameinfo[1])
        except Exception:
            self.init_fame_single()
    else:
        self.init_fame_single()
def HtmlTagStrip(html, webInfo):
    images = []
    for item in re.findall(webInfo['imgPattern'], html):
        if not item:
            continue
        # print item.encode('utf-8')
        # print webInfo['imgUrlPattern']
        if checkSpecialImg(item, webInfo['imgUrlPattern']):
            for term in item.split(' '):
                res = re.search(webInfo['imgUrlPattern'], term)
                if res:
                    img = res.group()
                    if 'big' in term or 'onclick' in term:
                        images.append(img)
        else:
            # print webInfo['imgUrlPattern']
            res = re.search(webInfo['imgUrlPattern'], item)
            # print res
            if res:
                img = res.group()
                images.append(img)

    for k, v in RE_IGNORE_BLOCK.iteritems():
        html = re.sub(v, '', html)
    for k, v in RE_NEWLINE_BLOCK.iteritems():
        html = re.sub(v, '\n', html)
    html = re.sub(RE_MULTI_NEWLINE, '\n', html)
    html = re.sub(RE_TAG, '', html)

    lines = []
    for line in html.split('\n'):
        if len(line) == 0:
            continue
        else:
            lines.append(line.strip())
    html = '\n'.join(lines)
    return images, html
url = link.get('href')
if url is not None and 'incident-reports' in url and 'read more' not in link.text:
    reportUrls.append('http://www.wrps.on.ca' + url)
elif url is not None and 'next' in link.text:
    listUrls.append('http://www.wrps.on.ca' + url)

g = geocoders.Google(domain='maps.google.ca')
# scraperwiki.sqlite.execute('drop table swdata')
# scraperwiki.sqlite.commit()

reportUrls.reverse()
for report in reportUrls:
    print report
    html = scraperwiki.scrape(report)
    lines = html.split('\n')
    itype = None
    for line in lines:
        match = re.search('<h1 class="title">.*eports( for)? (.*?)( ?- ?[Uu][Pp][Dd][Aa][Tt][Ee].*)?</h1>', line)
        if match:
            try:
                reportdate = datetime.strptime(match.group(2), '%B %d, %Y')
            except:
                reportdate = datetime.strptime(match.group(2), '%B%d, %Y')
            continue
        match = re.search('^<p>.*?Incident # ([0-9]{2}-[0-9]{6}).*?Type : ([^&]*)&?.*?<br />(.*?)<br />(.*?)</p>', line.strip())
        if match:
            if itype != None:
                processIncident(incident, itype, location, text, reportdate)
            #
# print "ok handle"

# Want debugging messages?
br.set_debug_http(True)
br.set_debug_redirects(True)
br.set_debug_responses(True)

# User-Agent
br.addheaders = [('User-agent',
                  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

# print "ok headers"

the_list = []
list_url = "http://greencracker.net/wp-content/uploads/2013/06/elevenn.csv"
response = br.open(list_url)
html = response.read()
the_list = html.split("*")

# strip carriage returns, quotes and surrounding whitespace from each entry
for i in range(len(the_list)):
    the_list[i] = the_list[i].replace("\r", "")
    the_list[i] = the_list[i].replace("'", "")
    the_list[i] = the_list[i].replace('"', "")
    the_list[i] = the_list[i].strip()

print the_list

target = "http://www.nsopw.gov/en-us/Search"
response = br.open(target)
html = response.read()
print html
def parse_cast_td(td):
    html = lxml.html.tostring(td, encoding="utf-8").decode()
    # strip full-width parenthesised notes, tags and runs of spaces from each entry
    casts = [re.sub(r"（.*）|<[^<]*>|[ ]+", "", x) for x in html.split("<br>")]
    return casts
def get_main_content(html):
    if not isinstance(html, unicode):
        return '', ''
    html_lines_len = [len(x.strip()) for x in html.split('\n')]

    # save image information
    """
    images = {}
    for img in re.findall(RE_IMG, html):
        md5 = hashlib.md5(img.encode('utf-8','ignore')).hexdigest()[:16]
        html = html.replace(img, md5)
        r = re.findall(RE_IMG_SRC, img)
        if len(r) == 1:
            src = r[0][1]
        else:
            src = ''
        images[md5] = "<img src='%s'>" % src  # img
    """

    # strip all html tags
    text = re.sub(RE_TAG, '', html)

    # extract the publication time
    time = ''
    t_time = re.findall(RE_TIME, text)
    if len(t_time) > 0:
        time = t_time[0]
    date = ''
    t_date = re.findall(RE_DATETIME, text)
    if len(t_date) > 0:
        date = t_date[0][0]

    lines = [x.strip() if is_useful_line(x) else '' for x in text.split('\n')]

    index_dist = []
    size = len(lines)
    for i in xrange(size - BLOCKS_WIDTH + 1):
        char_num = 0
        for j in xrange(i, i + BLOCKS_WIDTH):
            strip = re.sub(ur'\s+', '', lines[j])
            char_num += len(strip)
        index_dist.append(char_num)

    main_text = ''
    fstart = -1
    start = -1
    end = -1
    flag_s = False
    flag_e = False
    first_match = True
    for i in xrange(len(index_dist) - 1):
        if first_match and not flag_s:
            if index_dist[i] > THRESHOLD / 2:
                if index_dist[i + 1] != 0 or index_dist[i + 2] != 0:
                    first_match = False
                    flag_s = True
                    start = i
                    fstart = i
                    continue
        if index_dist[i] > THRESHOLD and not flag_s:
            if index_dist[i + 1] != 0 or index_dist[i + 2] != 0 or index_dist[i + 3] != 0:
                flag_s = True
                start = i
                continue
        if flag_s:
            if index_dist[i] == 0 or index_dist[i + 1] == 0:
                end = i
                flag_e = True
        tmp = ''
        if flag_e:
            for ii in xrange(start, end + 1):
                if (len(lines[ii]) < 1):
                    continue
                tmp += lines[ii] + '\n'
            main_text += tmp
            flag_s = flag_e = False

    # for pre in xrange(fstart - 1, max(0, fstart - BLOCKS_WIDTH), -1):
    #     for md5 in images.keys():
    #         if lines[pre].find(md5) > 0:
    #             main_text = lines[pre] + '\n' + main_text
    #             break
    """
    for md5,img in images.iteritems():
        main_text = main_text.replace(md5, img)
    """
    return strtotime(date, time), main_text
def get_main_content(html):
    if not isinstance(html, unicode):
        return '', ''
    html_lines_len = [len(x.strip()) for x in html.split('\n')]

    # save image information
    images = {}
    for img in re.findall(RE_IMG, html):
        md5 = hashlib.md5(img.encode('utf-8', 'ignore')).hexdigest()[:16]
        html = html.replace(img, md5)
        r = re.findall(RE_IMG_SRC, img)
        if len(r) == 1:
            src = r[0][1]
        else:
            src = ''
        images[md5] = "<img src='%s'>" % src  # img

    # strip all html tags
    text = re.sub(RE_TAG, '', html)

    # extract the publication time
    time = ''
    t = re.findall(RE_DATETIME, text)
    if len(t) > 0:
        time = t[0][0]

    lines = [x.strip() if is_useful_line(x) else '' for x in text.split('\n')]

    index_dist = []
    size = len(lines)
    for i in xrange(size - BLOCKS_WIDTH + 1):
        char_num = 0
        for j in xrange(i, i + BLOCKS_WIDTH):
            strip = re.sub(ur'\s+', '', lines[j])
            char_num += len(strip)
        index_dist.append(char_num)

    main_text = ''
    fstart = -1
    start = -1
    end = -1
    flag_s = False
    flag_e = False
    first_match = True
    for i in xrange(len(index_dist) - 1):
        if first_match and not flag_s:
            if index_dist[i] > THRESHOLD / 2:
                if index_dist[i + 1] != 0 or index_dist[i + 2] != 0:
                    first_match = False
                    flag_s = True
                    start = i
                    fstart = i
                    continue
        if index_dist[i] > THRESHOLD and not flag_s:
            if index_dist[i + 1] != 0 or index_dist[i + 2] != 0 or index_dist[i + 3] != 0:
                flag_s = True
                start = i
                continue
        if flag_s:
            if index_dist[i] == 0 or index_dist[i + 1] == 0:
                end = i
                flag_e = True
        tmp = ''
        if flag_e:
            for ii in xrange(start, end + 1):
                if (len(lines[ii]) < 1):
                    continue
                tmp += lines[ii] + '\n'
            main_text += tmp
            flag_s = flag_e = False

    # for pre in xrange(fstart - 1, max(0, fstart - BLOCKS_WIDTH), -1):
    #     for md5 in images.keys():
    #         if lines[pre].find(md5) > 0:
    #             main_text = lines[pre] + '\n' + main_text
    #             break

    for md5, img in images.iteritems():
        main_text = main_text.replace(md5, img)

    return strtotime(time), main_text
tree = html.fromstring(page.text)

# This will create a list of prices
reflection_html = tree.xpath('//div[@class="moduleBody"]')[0]
html = etree.tostring(reflection_html, pretty_print=True)
html = html.replace('\r\n', '\n')
html = html.replace('<br/>', '\n').replace(' ', ' ').replace('\n \n', '\n\n')

new_html = ""
for line in html:
    new_html += line.strip()
    new_html += "\r"
# html = new_html

parts = html.split("<h1> </h1>")

# with open("reflection.html", 'w') as out:
#     out.write(html.encode('utf8'))
# print "{} parts.".format(len(parts))

with open('output.markdown', 'w') as out_all:
    for count, part in enumerate(parts):
        markdown = converter.convert(html, 'markdown', format='html')
        out_all.write(part.encode('utf8'))
        if count + 1 < len(parts):
            out_all.write("\n\\pagebreak\n")
        with open('output{}.markdown'.format(count), 'w') as out:
            out.write(part.encode('utf8'))
try:
    for athlete in data:
        tick = timer()
        while not outQ.empty():
            logger.debug(outQ.get(False))
        athlete['EventCount'] = c.execute(
            "SELECT dbo.getAthleteEventCount({})".format(athlete['AthleteID']))
        logger.debug("Checking ID {}, {} {} ({})".format(
            athlete['AthleteID'], athlete['FirstName'],
            athlete['LastName'], athlete['EventCount']))
        html = getURL(baseURL.format(athlete['AthleteID']))
        try:
            runcount = int(
                html.split('<h2>')[1].split('<br/>')[0].split(
                    ' runs at All Events')[0].split(' ')[-1])
            logger.debug("Runcount = {}".format(runcount))
        except (ValueError, IndexError, AttributeError):
            print("Error reading run count for Athlete {}".format(
                athlete['AthleteID']))
            logger.warning("Error reading run count for Athlete {}".format(
                athlete['AthleteID']))
            continue
        if athlete['EventCount'] != runcount:
            eventsMissing = runcount - athlete['EventCount']
            rows = lxml.html.fromstring(
                '<table' + html.split('<table')[3].split('</table>')[0]
                + '</table>').xpath('//tbody/tr')
            hist_data = c.execute(
                "SELECT * FROM getAthleteEventHistory({})".format(
                    athlete['AthleteID']))