# BeautifulSoup 4 assumed here; older code may have imported these names from
# the legacy ``BeautifulSoup`` package instead.
from bs4 import BeautifulSoup
from bs4.element import NavigableString, Tag


def remove_empty_lines(html):
    # ``cache`` is assumed to be an external cache object (get()/set() with a
    # ``namespace`` keyword) available in this module.
    key = '%s:remove_empty_lines' % hash(html)
    out = cache.get(key, namespace="filters")
    if out:
        return out
    if '</' in html:
        # HTML input: keep top-level nodes that carry text, turn <br> into newlines.
        html = html.strip().replace('\n', '')
        soup = BeautifulSoup(html)
        lines = []
        for element in soup.contents:
            if isinstance(element, Tag):
                if element.text:
                    lines.append(str(element).strip())
                elif 'br' in str(element):
                    lines.append('\n')
            elif isinstance(element, NavigableString):
                lines.append(str(element).strip())
        out = ''.join(lines).strip()
        # Collapse runs of blank lines into single newlines.
        while '\n\n' in out:
            out = out.replace('\n\n', '\n')
    else:
        # Plain text input: drop blank lines.
        out = '\n'.join([line for line in html.split('\n') if line.strip()])
    cache.set(key, out, namespace="filters")
    return out

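# A minimal usage sketch for remove_empty_lines(). ``FakeCache`` is a
# hypothetical in-memory stand-in for the external ``cache`` object the filter
# expects (get()/set() with a ``namespace`` keyword); swap in the real cache
# backend in actual use.
class FakeCache(object):
    def __init__(self):
        self._data = {}

    def get(self, key, namespace=None):
        return self._data.get((namespace, key))

    def set(self, key, value, namespace=None):
        self._data[(namespace, key)] = value


cache = FakeCache()

print remove_empty_lines('first line\n\n\nsecond line\n')   # 'first line\nsecond line'
print remove_empty_lines('<p>hello</p><br/><p></p><p>world</p>')  # tags kept, empty <p> dropped
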
def is_hidden_node(cls, node):
    """ Check if a node is hidden in html page """
    style_list = node.get("style", None)
    if style_list:
        for p in style_list.split(";"):
            tokens = p.split(":")
            if (len(tokens) >= 2
                    and tokens[0].strip().lower() == "display"
                    and tokens[1].strip().lower() == "none"):
                return True
    return False

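# A quick sketch exercising is_hidden_node() on lxml elements. The ``cls``
# argument is unused by the body, so ``None`` is passed here; in its original
# context the function presumably sits on a utility class as a classmethod.
import lxml.html

doc = lxml.html.fromstring(
    '<div><p style="display: none; color: red">hidden</p>'
    '<p style="color: blue">visible</p></div>')
hidden_p, visible_p = doc.findall('.//p')
print is_hidden_node(None, hidden_p)   # True
print is_hidden_node(None, visible_p)  # False
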
def shrink_style(cls, style_str, filtered_css_properties, changed_css_properties):
    if not style_str:
        return None
    properties = {}
    for p in style_str.split(";"):
        if p.strip():
            token = p.split(":")
            if len(token) > 1:
                properties[token[0].strip()] = token[1].strip()
    return Utils._shrink_properties(properties, filtered_css_properties,
                                    changed_css_properties)

import re


def fields_from_split_html(template, html, separator, regex_with_groups_named_as_keys):
    list_to_ret = []
    lines = html.split(separator)
    for line in lines:
        m = re.match(regex_with_groups_named_as_keys, line)
        # Non-matching lines give m = None, which has no groupdict() and is skipped.
        if hasattr(m, 'groupdict'):
            # Python 2 dict merge: items() returns lists, so the template's
            # fields and the captured groups can simply be concatenated.
            dict_to_ret = dict(template.items() + m.groupdict().items())
            list_to_ret.append(dict_to_ret)
    return list_to_ret

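# A small usage sketch for fields_from_split_html() with made-up data: every
# fragment that matches the pattern yields the template fields merged with the
# regex's named groups (key order may vary).
template = {'source': 'demo'}
html = '<li>Alice, 30</li>|<li>Bob, 41</li>|<li>malformed</li>'
pattern = r'<li>(?P<name>\w+), (?P<age>\d+)</li>'
print fields_from_split_html(template, html, '|', pattern)
# [{'source': 'demo', 'name': 'Alice', 'age': '30'},
#  {'source': 'demo', 'name': 'Bob', 'age': '41'}]
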
def scrape(self, chamber, term_name):
    # Method of an Open States (billy) LegislatorScraper subclass; ``Legislator``
    # is presumably billy's Legislator model and ``self.get`` its HTTP helper.
    for t in self.metadata['terms']:
        if t['name'] == term_name:
            session = t['sessions'][-1]
            slug = self.metadata['session_details'][session]['slug']

    if chamber == 'upper':
        chamber_slug = 'Senate'
    elif chamber == 'lower':
        chamber_slug = 'Assembly'

    leg_base_url = 'http://www.leg.state.nv.us/App/Legislator/A/%s/%s/' % (chamber_slug, slug)
    leg_json_url = 'http://www.leg.state.nv.us/App/Legislator/A/api/%s/Legislator?house=%s' % (slug, chamber_slug)

    resp = json.loads(self.get(leg_json_url).text)

    for item in resp:
        # empty district
        empty_names = ['District No', 'Vacant']
        if any(name in item['FullName'] for name in empty_names):
            continue
        last, first = item['FullName'].split(",", 1)
        item['FullName'] = "{first} {last}".format(last=last, first=first).strip()
        leg = Legislator(term_name, chamber, item['DistrictNbr'],
                         item['FullName'], party=item['Party'],
                         photo_url=item['PhotoURL'])
        leg_url = leg_base_url + item['DistrictNbr']

        # hack to get the legislator ID
        html = self.get(leg_url).text
        for l in html.split('\n'):
            if 'GetLegislatorDetails' in l:
                leg_id = l.split(',')[1].split("'")[1]

        # fetch the json used by the page
        leg_details_url = 'https://www.leg.state.nv.us/App/Legislator/A/api/78th2015/Legislator?id=' + leg_id
        leg_resp = json.loads(self.get(leg_details_url).text)
        details = leg_resp['legislatorDetails']

        address = details['Address1']
        address2 = details['Address2']
        if address2:
            address += ' ' + address2
        address += '\n%s, NV %s' % (details['City'], details['Zip'])
        phone = details['LCBPhone']
        email = details['LCBEmail']

        leg.add_office('district', 'District Address', address=address,
                       phone=phone, email=email)
        leg.add_source(leg_details_url)
        self.save_legislator(leg)

import re

from bs4 import BeautifulSoup
from tidylib import tidy_document


def html_clean(html):
    import lxml.html.clean
    import lxml.html
    import lxml.etree

    # Tidy options: http://tidy.sourceforge.net/docs/quickref.html
    html, errors = tidy_document(html,
                                 options={'bare': 1,
                                          'clean': 1,
                                          'output-xhtml': 1,
                                          'drop-font-tags': 1,
                                          'drop-proprietary-attributes': 1,
                                          'hide-comments': 1,
                                          'char-encoding': 'utf8',
                                          'input-encoding': 'utf8',
                                          'output-encoding': 'utf8'})

    cleaner = lxml.html.clean.Cleaner(
        kill_tags=frozenset(['script', 'style', 'option']),
        remove_tags=frozenset(['a', 'strong', 'em']),
        safe_attrs_only=True,
        safe_attrs=frozenset())
    html = cleaner.clean_html(html)
    # html = lxml.etree.tostring(lxml.html.fromstring(html), pretty_print=True).decode('utf8')
    # html = html.encode('utf-8', errors='strict')

    soup = BeautifulSoup(html)
    # [s.extract() for s in soup('script')]  # remove 'script', 'style', 'option' tags
    # [s.extract() for s in soup('style')]
    # [s.extract() for s in soup('option')]
    html = soup.prettify()
    # html = htmllaundry.strip_markup(html)  # leave only text

    # remove continuous empty lines
    html = re.sub(r'\n\s*\n+', '\n\n', html).strip()
    html = re.sub(r'[ \t]+', ' ', html, flags=re.M).strip()  # remove continuous spaces
    # cleaned_html = [sent for sent in cleaned_html.split('\n')]
    #                if len(sent.split()) == 0 or len(sent.split()) >= 6]

    html_lines = html.split('\n')
    # return html_lines
    # ``html_sent_word_tokenize`` is assumed to be defined elsewhere in this module.
    return list(html_sent_word_tokenize(html_lines))

# Fragment of a ScraperWiki scraper: ``link``, ``reportUrls``, ``listUrls``,
# ``incident``, ``location``, ``text`` and processIncident() come from code
# outside this excerpt, and ``re``, ``datetime``, ``scraperwiki`` and
# ``geopy.geocoders`` are assumed to be imported earlier.
url = link.get('href')
if url is not None and 'incident-reports' in url and 'read more' not in link.text:
    reportUrls.append('http://www.wrps.on.ca' + url)
elif url is not None and 'next' in link.text:
    listUrls.append('http://www.wrps.on.ca' + url)

g = geocoders.Google(domain='maps.google.ca')

#scraperwiki.sqlite.execute('drop table swdata')
#scraperwiki.sqlite.commit()

reportUrls.reverse()
for report in reportUrls:
    print report
    html = scraperwiki.scrape(report)
    lines = html.split('\n')
    itype = None
    for line in lines:
        # The <h1> title carries the report date (month-name format).
        match = re.search('<h1 class="title">.*eports( for)? (.*?)( ?- ?[Uu][Pp][Dd][Aa][Tt][Ee].*)?</h1>', line)
        if match:
            try:
                reportdate = datetime.strptime(match.group(2), '%B %d, %Y')
            except:
                reportdate = datetime.strptime(match.group(2), '%B%d, %Y')
            continue
        match = re.search('^<p>.*?Incident # ([0-9]{2}-[0-9]{6}).*?Type : ([^&]*)&?.*?<br />(.*?)<br />(.*?)</p>', line.strip())
        if match:
            if itype is not None:
                processIncident(incident, itype, location, text, reportdate)
            #

import hashlib
import re


def get_main_content(html):
    # RE_IMG, RE_IMG_SRC, RE_TAG, RE_DATETIME, BLOCKS_WIDTH, THRESHOLD,
    # is_useful_line() and strtotime() are module-level constants/helpers
    # defined elsewhere.
    if not isinstance(html, unicode):
        return '', ''
    html_lines_len = [len(x.strip()) for x in html.split('\n')]
    # Save the image info: replace each <img> tag with an md5 placeholder so
    # that tag stripping does not lose it.
    images = {}
    for img in re.findall(RE_IMG, html):
        md5 = hashlib.md5(img.encode('utf-8', 'ignore')).hexdigest()[:16]
        html = html.replace(img, md5)
        r = re.findall(RE_IMG_SRC, img)
        if len(r) == 1:
            src = r[0][1]
        else:
            src = ''
        images[md5] = "<img src='%s'>" % src  # img
    # Remove all HTML tags.
    text = re.sub(RE_TAG, '', html)
    # Extract the publish time.
    time = ''
    t = re.findall(RE_DATETIME, text)
    if len(t) > 0:
        time = t[0][0]
    lines = [x.strip() if is_useful_line(x) else '' for x in text.split('\n')]
    # Non-whitespace character count of each sliding window of BLOCKS_WIDTH lines.
    index_dist = []
    size = len(lines)
    for i in xrange(size - BLOCKS_WIDTH + 1):
        char_num = 0
        for j in xrange(i, i + BLOCKS_WIDTH):
            strip = re.sub(ur'\s+', '', lines[j])
            char_num += len(strip)
        index_dist.append(char_num)
    main_text = ''
    fstart = -1
    start = -1
    end = -1
    flag_s = False
    flag_e = False
    first_match = True
    for i in xrange(len(index_dist) - 1):
        if first_match and not flag_s:
            if index_dist[i] > THRESHOLD / 2:
                if index_dist[i+1] != 0 or index_dist[i+2] != 0:
                    first_match = False
                    flag_s = True
                    start = i
                    fstart = i
                    continue
        if index_dist[i] > THRESHOLD and not flag_s:
            if index_dist[i+1] != 0 or index_dist[i+2] != 0 or index_dist[i+3] != 0:
                flag_s = True
                start = i
                continue
        if flag_s:
            if index_dist[i] == 0 or index_dist[i+1] == 0:
                end = i
                flag_e = True
            tmp = ''
            if flag_e:
                for ii in xrange(start, end + 1):
                    if len(lines[ii]) < 1:
                        continue
                    tmp += lines[ii] + '\n'
                main_text += tmp
                flag_s = flag_e = False
    # for pre in xrange(fstart - 1, max(0, fstart - BLOCKS_WIDTH), -1):
    #     for md5 in images.keys():
    #         if lines[pre].find(md5) > 0:
    #             main_text = lines[pre] + '\n' + main_text
    #             break
    # Put the original <img> tags back in place of their placeholders.
    for md5, img in images.iteritems():
        main_text = main_text.replace(md5, img)
    return strtotime(time), main_text

#print "ok handle" # Want debugging messages? br.set_debug_http(True) br.set_debug_redirects(True) br.set_debug_responses(True) # User-Agent br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')] #print "ok headers" the_list = [] list_url = "http://greencracker.net/wp-content/uploads/2013/06/elevenn.csv" response = br.open(list_url) html = response.read() the_list = html.split("*") i = 0 for item in range(len(the_list)): the_list[i] = the_list[i].replace("\r", "") the_list[i] = the_list[i].replace("'", "") the_list[i] = the_list[i].replace('"', "") the_list[i] = the_list[i].strip() i=i+1 print the_list target = "http://www.nsopw.gov/en-us/Search" response = br.open(target) html = response.read() print html
tree = html.fromstring(page.text)
#This will create a list of prices
reflection_html = tree.xpath('//div[@class="moduleBody"]')[0]
html = etree.tostring(reflection_html, pretty_print=True)
html = html.replace('\r\n', '\n')
html = html.replace('<br/>', '\n').replace(' ', ' ').replace('\n \n', '\n\n')

new_html = ""
for line in html:
    new_html += line.strip()
    new_html += "\r"
#html = new_html

parts = html.split("<h1> </h1>")
#with open("reflection.html", 'w') as out:
#    out.write(html.encode('utf8'))
#print "{} parts.".format(len(parts))

with open('output.markdown', 'w') as out_all:
    for count, part in enumerate(parts):
        markdown = converter.convert(html, 'markdown', format='html')
        out_all.write(part.encode('utf8'))
        if count + 1 < len(parts):
            out_all.write("\n\\pagebreak\n")
        with open('output{}.markdown'.format(count), 'w') as out:
            out.write(part.encode('utf8'))