def scrape(crno):
    crnostr = "%07d" % crno
    baseurl = "https://www.mobile-cr.gov.hk/mob/cps_criteria.do?queryCRNO="
    url = baseurl + crnostr
    print "trying local", crnostr
    html = load_local(url)
    if html is None:
        print "trying site", crnostr
        html = scraperwiki.scrape(url).decode('utf-8')
        print "storing local", crnostr
        store_local(url, html.encode('utf-8'))
    else:
        html = html.decode('utf-8')
    if '沒有紀錄與輸入的查詢資料相符' in html.encode('utf-8'):
        print 'NO MATCHING RECORD FOUND FOR THE SEARCH INFORMATION INPUT!'
        return None
    root = lxml.html.fromstring(html)  # , encoding="utf-8")
    tds = root.cssselect("tr td tr td")
    namestds = root.cssselect("td.data")
    while tds == []:
        print "trying", crnostr, "again"
        sleep(46)
        html = scraperwiki.scrape(baseurl + crnostr).decode('utf-8')
        root = lxml.html.fromstring(html)  # , encoding="utf-8")
        tds = root.cssselect("tr td tr td")
        namestds = root.cssselect("td.data")
    #for idx, val in enumerate(tds):
    #    print idx, ":", val.text_content().encode('utf-8')
    names = {}
    for nameidx, nameval in enumerate(namestds):
        names["Name" + str(nameidx)] = nameval.text_content()[10:]
        names["Name" + str(nameidx) + "date"] = nameval.text_content()[:10]
    print "got", tds[1].text_content()
    data = {
        'cr': tds[1].text_content(),
        'English Company Name': tds[2].text_content().rsplit('\r')[1].lstrip('\n\t'),
        'Chinese Company Name': tds[2].text_content().rpartition('\r')[2].lstrip('\r\n\t'),
        'Company Type': tds[4].text_content()[:-1],
        'Date of incorporation': tds[6].text_content(),
        # 'Company status': tds[8].text_content()[:-1],
        'Active status': tds[8].text_content()[:-1],
        'Remarks': tds[9].text_content().replace(u"備註:", ""),
        'Winding up mode': tds[11].text_content()[:-1],
        'Date of Dissolution': tds[13].text_content(),
        'Register of Charges': tds[15].text_content()[:-1],
        'Important Note': tds[16].text_content().replace(u"重要事項:", "").lstrip('\r\n\t')
    }
    data.update(names)
    db['swdata'].upsert(data, ['cr'])
    print "wrote", tds[1].text_content()
def main():
    import optparse
    optparser = optparse.OptionParser(
        description="Transforms Hansard XML from the Canadian House of Commons into "
        "an easy-to-process HTML format. If no options are specified, reads XML from stdin."
    )
    optparser.add_option("-f", "--file", dest="filename", help="Process the XML file at FILE")
    optparser.add_option(
        "-i", "--docid", dest="docid",
        help="Document ID (e.g. 5069607) on parl.gc.ca; it'll be fetched and processed",
        metavar="ID",
    )
    optparser.add_option(
        "-l", "--language", dest="language", metavar="[E,F]", default="E",
        help="Language of the document to download. Only necessary if alpheus is downloading from parl.gc.ca.",
    )
    group = optparse.OptionGroup(optparser, "Debugging Options")
    group.add_option(
        "--print-names", dest="print_names", action="store_true",
        help="Instead of outputting HTML, print a list of names of people speaking.",
    )
    group.add_option("--pdb", dest="pdb", action="store_true",
                     help="Drop into the Python debugger on exception")
    optparser.add_option_group(group)
    (options, args) = optparser.parse_args()
    try:
        if options.filename:
            document = parse_file(open(options.filename))
        elif options.docid:
            document = fetch_and_parse(options.docid, options.language[0].upper())
        else:
            document = parse_file(sys.stdin)
    except Exception as e:
        if options.pdb:
            import pdb
            pdb.post_mortem()
        else:
            raise
    # sys.stderr.write("Parsed %d statements\n" % len(document.statements))
    if options.print_names:
        for s in document.statements:
            print s.meta.get("person_attribution", "").encode("utf8")
    else:
        html = document.as_html()
        print html.encode("utf8")
def render_GET(self, request):
    try:
        style = get_style_by_name(self.style_name)
    except ClassNotFound:
        style = get_style_by_name('default')
        self.style_name = 'default'
    prev_url = None
    if self.days_back:
        prev_url = self.url_for(request, self.days_back - 1)
    next_url = self.url_for(request, (self.days_back or 0) + 1)
    formatter = LogFormatter(style=style)
    if self.days_back:
        log_date = date.today() - timedelta(self.days_back)
        suffix = log_date.strftime('.%Y_%m_%d').replace('_0', '_')
        self.logfilename += suffix
    try:
        with codecs.open(self.logfilename, 'r', 'utf-8') as logfile:
            html = self.render_log(logfile.read(), formatter, prev_url, next_url)
    except IOError:
        request.setResponseCode(404)
        return '<html><body>Go away.</body></html>'
    request.setHeader('Content-Type', 'text/html;charset=utf-8')
    return html.encode('utf-8')
def convert_html_to_markdown(html):
    # type: (Text) -> Text

    # On Linux, the tool installs as html2markdown, and there's a command called
    # html2text that does something totally different. On OSX, the tool installs
    # as html2text.
    commands = ["html2markdown", "html2text"]

    for command in commands:
        try:
            # A body width of 0 means do not try to wrap the text for us.
            p = subprocess.Popen(
                [command, "--body-width=0"], stdout=subprocess.PIPE,
                stdin=subprocess.PIPE, stderr=subprocess.STDOUT)
            break
        except OSError:
            continue

    markdown = p.communicate(input=html.encode('utf-8'))[0].decode('utf-8').strip()
    # We want images to get linked and inline previewed, but html2text will turn
    # them into links of the form `![](http://foo.com/image.png)`, which is
    # ugly. Run a regex over the resulting description, turning links of the
    # form `![](http://foo.com/image.png?12345)` into
    # `[image.png](http://foo.com/image.png)`.
    return re.sub(u"!\\[\\]\\((\\S*)/(\\S*)\\?(\\S*)\\)", u"[\\2](\\1/\\2)", markdown)
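A hedged usage sketch for the converter above, not part of the original source: it assumes one of the `html2markdown`/`html2text` binaries is on PATH and the module-level imports the function relies on (`subprocess`, `re`, `Text`); the exact markdown text the CLI emits may vary by version.

# assumes convert_html_to_markdown() from the snippet above is importable
md = convert_html_to_markdown(u'<p>logo: <img src="http://foo.com/image.png?12345"></p>')
print(md)  # roughly: "logo: [image.png](http://foo.com/image.png)"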
def parse_usage(html):
    """
    Extract the usage integers out of the summary HTML and return them as a
    dictionary with keys 'minutes', 'texts', 'megabytes'.
    """
    with open('test.html', 'w') as f:
        f.write(html.encode('utf-8'))
    lxml_root = lxml.html.fromstring(html)
    fields = OrderedDict([
        ('minutes', ("//*[contains(text(), 'minutes used')]/"
                     "preceding-sibling::strong/text()", int)),
        ('texts', ("//*[contains(text(), 'texts used')]/"
                   "preceding-sibling::strong/text()", int)),
        ('megabytes', ("//*[contains(text(), 'MB data used')]/"
                       "preceding-sibling::strong/text()", int)),
    ])
    data = {}
    for field, (xpath, convert_function) in fields.items():
        logging.debug(xpath)
        matching_elements = lxml_root.xpath(xpath)
        assert len(matching_elements) == 1
        value = matching_elements[0]
        if convert_function:
            value = convert_function(value)
        data[field] = value
    return data
def parse_speeches(i, url, html, response_status):
    record = {}
    record['old_url'] = url
    record['i'] = i
    if response_status == 404:  #don't bother with the 404's
        record['status'] = 404
        return record
    root = lxml.html.fromstring(html.encode('iso-8859-1'))
    record['summary'] = root.cssselect('meta[name="DC.description"]')[0].get('content')  #summary from metatag
    contentdiv = root.cssselect("div#content")
    if not contentdiv:
        return False
    content = contentdiv[0]
    if not content:
        return False
    titles = list(content.cssselect("div.hgroup h1"))
    if titles:
        record['title'] = titles[0].text_content().strip()
    meta = content.cssselect("table.meta")[0]
    try:
        record['associated_organisations'] = meta.xpath('//*[contains(text(), "Publisher:")]/following-sibling::*')[0].text_content()  #associated organisations
    except:
        pass
    try:
        record['delivered_by'] = meta.xpath('//*[contains(text(), "Delivered by:")]/following-sibling::*')[0].text_content()  #delivered by
    except:
        pass
    try:
        record['delivered_on_date'] = meta.xpath('//*[contains(text(), "Delivered date:")]/following-sibling::*')[0].text_content().strip()  # delivered on date
    except:
        pass
    try:
        record['speech_type'] = meta.xpath('//*[contains(text(), "Type:")]/following-sibling::*')[0].text_content().strip()  # speech type
    except:
        pass
    try:
        record['event'] = meta.xpath('//*[contains(text(), "Event:")]/following-sibling::*')[0].text_content().strip()  # event
    except:
        pass
    try:
        record['location'] = meta.xpath('//*[contains(text(), "Location:")]/following-sibling::*')[0].text_content().strip()  # location
    except:
        pass
    if 'event' in record and 'location' in record:
        record['event_and_location'] = record['event'] + ', ' + record['location']  # event + location
    try:
        record['date'] = dateutil.parser.parse(record["delivered_on_date"], dayfirst=True).date().isoformat()  #iso date
    except:
        pass
    try:
        record['associated_policies'] = meta.xpath('//*[contains(text(), "Mode/topic:")]/following-sibling::*')[0].text_content()  #associated policies
    except:
        pass
    for node in content.cssselect("div.header"):  #drop the header info - we are done with it and don't want it in the body text
        node.drop_tree()
    record['body'] = html2text.HTML2Text().handle(data=lxml.html.tostring(content))  #bodytext
    record['body'] = record['body'].replace(u"\xa0", u" ")  #non breaking spaces
    record['body'] = record['body'].encode('utf-8')
    return record
def fromstring(self, html):
    html = encodeValue(html)
    try:
        self.doc = lxml.html.fromstring(html)
    except:
        html = html.encode('utf-8', 'replace')
        self.doc = lxml.html.fromstring(html)
    return self.doc
def fromstring(self, html):
    html = encodeValue(html)
    try:
        self.doc = lxml.html.fromstring(html)
    except:
        html = html.encode("ascii", "replace")
        self.doc = lxml.html.fromstring(html)
    return self.doc
def searchGoogle(driver, query):
    query_decoded = query.decode("utf-8")
    #print query_decoded
    driver.get("http://google.ru/search?" + urlencode({'q': query}))
    time.sleep(2)
    html = driver.page_source
    html = html.encode("utf-8")
    return html
def markdownify_html2text(html):
    p = subprocess.Popen(['html2text', '-d', '-b', '0'],
                         stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    stdout, stderr = p.communicate(input=html.encode('utf-8'))
    return stdout
def fromstring(cls, html, original_encoding='utf-8'):
    html = encodeValue(html, encoding=original_encoding)
    try:
        parser = lxml.html.HTMLParser(encoding=original_encoding)
        cls.doc = lxml.html.fromstring(html.encode(original_encoding), parser=parser)
    except Exception, e:
        print '[Parse lxml ERR]', str(e)
        return None
def clean_html(cls, html, encoding=None):
    parser = lxml.html.HTMLParser(encoding=encoding)
    if isinstance(html, unicode) and encoding is not None:
        html = html.encode(encoding)
    html = lxml.html.document_fromstring(html, parser=parser)
    return _cleaner.clean_html(html)
def unicodeToStr(html, encoding='utf-8'):
    if not isinstance(html, unicode):
        decoding, charJust = '', chardet.detect(html)
        try:
            decoding = 'gbk' if charJust['encoding'].lower() == 'gb2312' else charJust['encoding']
        except Exception, e:
            print 'unicodeToStr chardet detect error:', Exception, '->', e
        if encoding and decoding and decoding != encoding:
            html = html.decode(decoding, 'ignore').encode(encoding, 'ignore')
    else:
        if encoding:
            html = html.encode(encoding, 'ignore')
    return html
def tidy(html):
    """ Pipe html thru w3c tidy. """
    html = parsers.RE_RESTRICTED.sub('', html)
    html = RE_XMLDECL.sub('', html)
    html = parsers.RE_HTML_CHARSET.sub('; charset=utf-8', html)
    # convert to xhtml
    tidy = subprocess.Popen(
        ["tidy",
         "-utf8",
         "-clean",
         "--wrap", "0",
         # "--drop-font-tags", "y",
         # "--drop-proprietary-attributes", "y",
         # "--add-xml-space", "y",
         "--output-xhtml", "y",
         "--numeric-entities", "y",
         "--merge-divs", "n",  # keep poetry indentation
         "--merge-spans", "n",
         "--add-xml-decl", "n",
         "--doctype", "strict",
         "--anchor-as-name", "n",
         "--enclose-text", "y"],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    # print (html.encode ('utf-8'))
    # sys.exit ()
    (html, stderr) = tidy.communicate(html.encode('utf-8'))
    regex = re.compile('(Info:|Warning:|Error:)\s*', re.I)
    # pylint: disable=E1103
    msg = stderr.rstrip()
    for line in msg.splitlines():
        match = regex.search(line)
        if match:
            sline = regex.sub("", line)
            g = match.group(1).lower()
            if g == 'info:':
                info("tidy: %s" % sline)
            elif g == 'warning:':
                warn("tidy: %s" % sline)
            elif g == 'error:':
                error("tidy: %s" % sline)
        else:
            error(line)
    if tidy.returncode == 2:
        raise ValueError, stderr
    return html.decode('utf-8')
def cache_results(search_params, html):
    """Stores a html resource as a file in scrapecache/fname.cache

    This will always write (overwrite) the cache file.
    """
    fname = cached_file_name(search_params)
    with open(os.path.join(CACHEDIR, fname), 'w') as fd:
        # TODO see encoding in detail.
        fd.write(html.encode('utf8'))
def fromstring(self, html):
    # html = normalize_spaces(html)
    html = clean_attributes(html)
    self.doc = lxml.html.fromstring(html.encode('utf-8'))
    # TODO: check which parser gives higher accuracy
    # from lxml.html import html5parser
    # import lxml.html
    # html5doc = html5parser.document_fromstring(html.encode('utf-8'))
    # self.doc = lxml.html.fromstring(self.nodeToString(html5doc))
    return self.doc
def send_email(from_email, to_email_list, subject, html,
               smtp_host, smtp_port=587, username=None, password=None):
    message = Message(From=from_email, To=to_email_list, charset='utf-8')
    # Keep from creating threads in gmail...
    message.Subject = "{} -- {}".format(subject, datetime.now().strftime('%Y-%m-%dT%H:%M:%S'))
    message.Html = html.encode('utf-8')
    message.Body = 'See the HTML!'
    sender = Mailer(host=smtp_host, port=smtp_port, use_tls=True, usr=username, pwd=password)
    if username is not None:
        sender.login(username, password)
    sender.send(message)
def parse_news(i, url, html, response_status):
    record = {}
    record['old_url'] = url
    record['i'] = i
    if response_status == 404:  #don't bother with the 404's
        record['status'] = 404
        return record
    root = lxml.html.fromstring(html.encode('iso-8859-1'))
    record['summary'] = root.cssselect('meta[name="DC.description"]')[0].get('content')  #summary from metatag
    contentdiv = root.cssselect("div#content")
    if not contentdiv:
        return False
    content = contentdiv[0]
    if not content:
        return False
    titles = list(content.cssselect("div.hgroup h1"))
    if titles:
        record['title'] = titles[0].text_content().strip()  #stripped title
    meta = content.cssselect("table.meta")[0]
    try:
        record['associated_organisations'] = meta.xpath('//*[contains(text(), "Publisher:")]/following-sibling::*')[0].text_content()  #associated organisations
    except:
        pass
    try:
        record['associated_policies'] = meta.xpath('//*[contains(text(), "Mode/topic:")]/following-sibling::*')[0].text_content()  #associated policies
    except:
        pass
    try:
        record['first_published'] = meta.xpath('//*[contains(text(), "Published date:")]/following-sibling::*')[0].text_content().strip()  # first published date
    except:
        pass
    try:
        record['type'] = meta.xpath('//*[contains(text(), "Type:")]/following-sibling::*')[0].text_content().strip()  # type
    except:
        pass
    try:
        record['date'] = dateutil.parser.parse(record["first_published"], dayfirst=True).date().isoformat()  #iso date
    except:
        pass
    for node in content.cssselect("div.header"):  #drop the header info - we are done with it and don't want it in the body text
        node.drop_tree()
    #record['body'] = html2text.HTML2Text().handle(data=lxml.html.tostring(content,encoding=unicode)) #bodytext
    #record['body'] = html2text.HTML2Text().handle(data=lxml.html.tostring(content,encoding="ascii")) #bodytext
    #encoding mess:
    record['body'] = html2text.HTML2Text().handle(data=lxml.html.tostring(content))  #bodytext
    record['body'] = record['body'].replace(u"\xa0", u" ")  #non breaking spaces
    record['body'] = record['body'].encode('utf-8')
    #print lxml.html.tostring(content,encoding="ascii")
    #print repr(record['body'])
    return record
def doConvert(url):
    # load the page
    j = urllib.urlopen(url)
    try:
        from feedparser import _getCharacterEncoding as enc
    except ImportError:
        enc = lambda x, y: ('utf-8', 1)
    text = j.read()
    encoding = enc(j.headers, text)[0]
    if encoding == 'us-ascii':
        encoding = 'cp1251'
    data = text.decode(encoding)
    # convert the html document to markdown
    originalMarkdownDocument = html2text.html2text(data, url)
    markdownDocument = originalMarkdownDocument.split("\n")
    # find the upper boundary of the article
    title = lxml.html.document_fromstring(text)
    startLine = findStartMerker(title.find(".//title").text, markdownDocument)
    # remove the text above the upper boundary
    del markdownDocument[:startLine]
    # find the lower boundary of the article
    skiplist = []
    endLine = findEndMarker(markdownDocument, skiplist)
    # remove the lines listed in skiplist
    for x in range(len(skiplist) - 1, 0, -1):
        markdownDocument.pop(skiplist[x])
    # cut the article off at the lower boundary
    if endLine != -1:
        del markdownDocument[endLine - len(skiplist) + 1:]
    else:
        return
    # replace references with inline links
    fragment = listToString(markdownDocument)
    fragment = replaceInternalLinks(originalMarkdownDocument, fragment)
    global htmlOut
    if htmlOut == 1:
        # convert the markdown back to html
        html = markdown.markdown(fragment)
        print html.encode('utf-8')
    else:
        print fragment.encode('utf-8')
def process_article(html, full=True, replace=False):
    pos = 0
    src = None
    try:
        soup = BeautifulSoup(html)
    except UnicodeEncodeError:
        soup = BeautifulSoup(html.encode('utf-8', 'ignore'))
    media_found = False
    for tag in soup.find_all(True):
        if any(x == tag.name for x in EXCLUDED_TAGS) \
                or (tag.name == 'div' and 'class' in tag.attrs
                    and any(div in tag.attrs['class'] for div in EXCLUDED_DIV_CLASS)) \
                or ((not tag.contents and not tag.name == 'img'
                     and (tag.string is None or not tag.string.strip()))
                    or (tag.name == 'img' and 'src' in tag.attrs
                        and any(host in tag['src'] for host in EXCLUDED_IMAGES_HOST))) \
                or (tag.name == 'a' and 'href' in tag.attrs
                    and any(host in tag.attrs['href'] for host in EXCLUDED_A)) \
                or isinstance(tag, Comment):
            if tag.parent and tag.parent.name == 'a':
                tag.parent.decompose()
            else:
                tag.decompose()
            continue
        for attr in EXCLUDED_ATTR:
            try:
                del tag[attr]
            except AttributeError:
                pass
        if not replace and not media_found and full:
            if tag.name != 'img' and tag.name != 'a' and pos > 12:
                media_found = True
            elif tag.name == 'img' and 'src' in tag.attrs:
                src = tag.attrs['src']
                if src:
                    o = urlparse.urlparse(src)
                    src = o.scheme + "://" + o.netloc + o.path
                if tag.parent and tag.parent.name == 'a':
                    tag.parent.decompose()
                else:
                    tag.decompose()
                media_found = True
        pos += 1
        if replace:
            if tag.name == 'img' and 'src' in tag.attrs and tag.attrs['src'] == replace:
                if tag.parent and tag.parent.name == 'a':
                    tag.parent.decompose()
                else:
                    tag.decompose()
    content = unicode(soup)
    if full:
        excerpt = (strip_tags(content)).strip()
        return {'content': content, 'image': src,
                'word_count': len(excerpt.split()), 'excerpt': excerpt}
    else:
        return {'content': content, 'image': src}
def __bytes__(self):
    # try to make sure summary is wrapped in a tag
    summary = self.document.metadata['summary']
    try:
        etree.fromstring(summary)
        html = '{}'.format(summary)
    except etree.XMLSyntaxError:
        html = """\
<div class="description" data-type="description"\
 xmlns="http://www.w3.org/1999/xhtml">
{}
</div>""".format(summary)
    return html.encode('utf-8')
def extract(self, html, link):
    (title, body) = readability_extract(html)
    document = lxml.html.fromstring(html.encode('utf-8'))
    date_cells = document.cssselect('td.createdate')
    date = date_cells[0].text_content().strip() if len(date_cells) == 1 else None
    doc = {
        'url': link,
        'title': title,
        'text': body,
        'date': parse(date),
        'source': 'ACGA News & Views'
    }
    return doc
def load_html(tree_or_html, base_url=None):
    """
    Parse HTML data to a lxml tree.
    ``tree_or_html`` must be either unicode or utf8-encoded
    (even if original page declares a different encoding).

    If ``tree_or_html`` is not a string then it is returned as-is.
    """
    if not isinstance(tree_or_html, (six.string_types, bytes)):
        return tree_or_html
    html = tree_or_html
    if isinstance(html, six.text_type):
        html = html.encode('utf8')
    return lxml.html.fromstring(html, base_url=base_url, parser=parser)
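A minimal usage sketch for load_html() above, not part of the original source; it assumes the surrounding module defines the module-level `parser` the function passes to lxml and imports `six` and `lxml.html`.

# assumes load_html() from the snippet above is importable
tree = load_html(u'<p>hello <b>world</b></p>', base_url='http://example.com/')
print(tree.text_content())      # 'hello world'
assert load_html(tree) is tree  # non-string input is returned unchanged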
def Consultations():
    #scraperwiki.sqlite.execute("drop table if exists consultations")
    scraperwiki.sqlite.execute("create table if not exists consultations (old_url text, i integer)")
    hurlbatch = scraperwiki.sqlite.execute("select xllinks.i, xllinks.url, html, htmlcache.status from xllinks left join htmlcache on htmlcache.url=xllinks.url left join consultations on consultations.i=xllinks.i where xllinks.sheetname='Consultations' and consultations.old_url is null and htmlcache.url is not null limit 20")
    ldata = []
    print "fetched batch", len(hurlbatch["data"])
    for i, url, html, response_status in hurlbatch["data"]:
        data = parse_consultations(i, url, html.encode('latin-1'), response_status)
        if data:
            ldata.append(data)
            print data
        else:
            print "Failed to parse", url, html
    scraperwiki.sqlite.save(["i"], ldata, "consultations")
    return len(ldata)
def parse(self, html=None):
    """Public function to start parsing the search engine results.

    Args:
        html: The raw html data to extract the SERP entries from.
    """
    if html:
        self.html = html.encode('utf-8').decode('utf-8')

    # lets do the actual parsing
    self._parse()

    # Apply subclass specific behaviour after parsing has happened
    # This is needed because different parsers need to clean/modify
    # the parsed data uniquely.
    self.after_parsing()
def getData(html, course):
    root = lxml.html.fromstring(html.encode('utf-8'))
    sect = root.find_class('section main')
    # remove the "Allgemeines" (general) section
    sect.pop(0)
    sections = []  # all sections
    for sec in sect:
        date = sec[0].text_content()
        kw = parseDateToKW(date)
        assignments = []
        scripts = []
        others = []
        for docs in sec.find_class('activityinstance'):
            # special cases where the link cannot be accessed
            if not docs.xpath("a"):
                continue
            link = docs.xpath("a")[0].get('href')
            name_of_file = docs.find_class('instancename')[0].text
            nof = removeUmlaut(name_of_file)
            if re.match(course['pattern_script'], name_of_file):
                scripts.append((nof, link))
            elif re.match(course['pattern_assignment'], name_of_file):
                assignments.append((nof, link))
            else:
                # all other links, including splash pages
                others.append((nof, link))
        if (not assignments) and (not scripts) and (not others):
            continue
        else:
            sections.append(
                Processed_Section(date, kw, assignments, scripts, others))
    return sections
def html_to_dom(html, default_encoding=DEFAULT_ENCODING, encoding=None, errors=DEFAULT_ENC_ERRORS):
    """Converts HTML to DOM."""
    if isinstance(html, unicode):
        decoded_html = html
        # encode HTML in case it's XML with an encoding declaration
        forced_encoding = encoding if encoding else default_encoding
        html = html.encode(forced_encoding, errors)
    else:
        decoded_html = decode_html(html, default_encoding, encoding, errors)

    try:
        dom = lxml.html.fromstring(decoded_html, parser=lxml.html.HTMLParser())
    except ValueError:
        # Unicode strings with encoding declaration are not supported.
        # For XHTML files with an encoding declaration, use the declared encoding.
        dom = lxml.html.fromstring(html, parser=lxml.html.HTMLParser())

    return dom
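A hedged usage sketch for html_to_dom() above, not part of the original source; it assumes the module-level names the function refers to (DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, decode_html, the Python 2 unicode type) and shows the ValueError fallback the comments describe: lxml rejects unicode input that carries an XML encoding declaration, so the encoded bytes are parsed instead.

# assumes html_to_dom() from the snippet above is importable
page = u'<?xml version="1.0" encoding="utf-8"?><html><body><p>hi</p></body></html>'
dom = html_to_dom(page)   # unicode path raises ValueError, bytes fallback is used
print(dom.tag)            # 'html'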
def html_to_dom(html, default_encoding=DEFAULT_ENCODING, encoding=None, errors=DEFAULT_ENC_ERRORS):
    """Converts HTML to DOM."""
    if isinstance(html, unicode):
        decoded_html = html
        # encode HTML in case it's XML with an encoding declaration
        forced_encoding = encoding if encoding else default_encoding
        html = html.encode(forced_encoding, errors)
    else:
        decoded_html = decode_html(html, default_encoding, encoding, errors)

    try:
        dom = lxml.html.fromstring(decoded_html)
    except ValueError:
        dom = lxml.html.fromstring(html)

    return dom
def remove_control_characters(html):
    # type: (t.Text) -> t.Text
    """
    Strip invalid XML characters that `lxml` cannot parse.
    """
    # See: https://github.com/html5lib/html5lib-python/issues/96
    #
    # The XML 1.0 spec defines the valid character range as:
    # Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
    #
    # We can instead match the invalid characters by inverting that range into:
    # InvalidChar ::= #xb | #xc | #xFFFE | #xFFFF | [#x0-#x8] | [#xe-#x1F] | [#xD800-#xDFFF]
    #
    # Sources:
    # https://www.w3.org/TR/REC-xml/#charsets,
    # https://lsimons.wordpress.com/2011/03/17/stripping-illegal-characters-out-of-xml-in-python/
    def strip_illegal_xml_characters(s, default, base=10):
        # Compare the "invalid XML character range" numerically
        n = int(s, base)
        if (
            n in (0xB, 0xC, 0xFFFE, 0xFFFF)
            or 0x0 <= n <= 0x8
            or 0xE <= n <= 0x1F
            or 0xD800 <= n <= 0xDFFF
        ):
            return ""
        return default

    # We encode all non-ascii characters to XML char-refs, so for example "💖" becomes: "&#128150;"
    # Otherwise we'd remove emojis by mistake on narrow-unicode builds of Python
    html = html.encode("ascii", "xmlcharrefreplace").decode("utf-8")
    html = re.sub(
        r"&#(\d+);?", lambda c: strip_illegal_xml_characters(c.group(1), c.group(0)), html
    )
    html = re.sub(
        r"&#[xX]([0-9a-fA-F]+);?",
        lambda c: strip_illegal_xml_characters(c.group(1), c.group(0), base=16),
        html,
    )
    # A regex matching the "invalid XML character range"
    html = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]").sub("", html)
    return html
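A short usage sketch for the sanitiser above, not part of the original source; it assumes the function and its `re`/`typing` imports are available, and the exact spacing of the result depends on the input layout.

# assumes remove_control_characters() from the snippet above is importable
cleaned = remove_control_characters(u"ok\x00 &#11; fine \U0001F496")
# the NUL byte and the reference to an illegal code point are dropped;
# the emoji survives as a numeric character reference
print(cleaned)  # "ok  fine &#128150;"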
def decompress_descriptions(encoding='utf-8'):
    """Convert parquet to tarfile"""
    pf = pq.ParquetFile(YAHOO_PARQUET)
    progress = tqdm(file=sys.stdout, disable=False)
    with tarfile.open(YAHOO_ARCH, 'w:bz2') as archive:
        for i in range(pf.metadata.num_row_groups):
            table = pf.read_row_group(i)
            columns = table.to_pydict()
            for symbol, html in zip(columns['symbol'], columns['html']):
                bytes = html.encode(encoding)
                s = io.BytesIO(bytes)
                tarinfo = tarfile.TarInfo(name=f'yahoo/{symbol}.html')
                tarinfo.size = len(bytes)
                archive.addfile(tarinfo=tarinfo, fileobj=s)
                progress.update(1)
    progress.close()
def sync_oss():
    mongo = MongoSpider(conf.mongo_spider)
    with open(conf.data_root + os.sep + 'filter.json') as fd:
        data = fd.read()
    feeds = json.loads(data)
    feeds = dict(filter(lambda x: x[1]['status'] == 'good', feeds.iteritems()))
    for key, feed in feeds.iteritems():
        try:
            doc = {
                'rss_id': feed['_id'],
                'html': {'$exists': True, '$ne': ''},
                'oss': {'$ne': 'success'}
            }
            success, error = 0, 0
            for article in mongo.article.find(doc, timeout=False):
                html = mongo.file.get(article['html'])
                html = html.encode('utf-8')
                if html and mongo.oss.put(article['html'], html):
                    mongo.article.update({'_id': article['_id']}, {'$set': {'oss': 'success'}})
                    print article['title'], 'OK'
                    success += 1
                else:
                    mongo.article.update({'_id': article['_id']}, {'$set': {'oss': 'error'}})
                    print article['title'], 'Error'
                    error += 1
            print key, success + error, success, error, feed['url']
        except KeyboardInterrupt, e:
            break
        except Exception, e:
            print e
def add_text():
    url = request.json["url"]
    print("url", url)
    if any([y in url for y in blocklist]):
        print("blocked", [y for y in blocklist if y in url])
        return jsonify({})
    html = request.json["html"]
    html = lxml.html.tostring(lxml.html.fromstring(html.encode("utf8")))
    tree = make_tree(html, url)
    html = lxml.html.tostring(tree).decode("utf8")
    slugged_url = slug_url(url)
    t1 = time.time()
    # meta_path = BASE_PATH / "meta/v1/{}_{}.json".format(t1, slugged_url)
    # try:
    #     article = parse_article(html, url)
    #     metadata = article.to_dict(keys=ARTICLE_KEYS_TO_KEEP, skip_if_empty=True)
    # except Exception as e:
    #     metadata = {"error": str(e)}
    # metadata["creation_time"] = t1
    # metadata["slugged_url"] = slugged_url
    # with open(meta_path, "w") as f:
    #     json.dump(metadata, f, indent=4)
    # just.write(metadata, meta_path)
    html_path = BASE_PATH + "html/{}_{}.html.gz".format(t1, slugged_url)
    print("html_path", html_path)
    just.write(html, html_path)
    obj = {"path": str(html_path), "url": url, "time": str(time.time())}
    print("META_PATH", META_PATH)
    just.append(obj, META_PATH)
    last.append(html)
    last_urls.append(url)
    print("saved", url)
    return jsonify({"urls": list(last_urls)})
def get_html_tree(html):
    """
    Given the HTML string, returns a LXML tree object. The tree is wrapped in
    <div> elements if it doesn't have a top level tag or parsing would
    otherwise result in an error. The wrapping can be later removed with
    strip_wrapping().
    """
    parser = lxml.html.HTMLParser(encoding='utf-8')
    html = html.encode('utf8')

    try:
        tree = lxml.html.fromstring(html, parser=parser)
    except lxml.etree.Error:
        # E.g. empty document. Use dummy <div>
        tree = lxml.html.fromstring('<div></div>')

    # If the document doesn't start with a top level tag, wrap it with a <div>
    # that will be later stripped out for consistent behavior.
    if tree.tag not in lxml.html.defs.top_level_tags:
        html = b'<div>%s</div>' % html
        tree = lxml.html.fromstring(html, parser=parser)

    # HACK for Outlook emails, where tags like <o:p> are rendered as <p>. We
    # can generally ignore these tags so we replace them with <span>, which
    # doesn't cause a line break. Also, we can't look up the element path of
    # tags that contain colons. When rendering the tree, we will restore the
    # tag name.
    for el in tree.iter():
        if el.nsmap or (isinstance(el.tag, string_class) and ':' in el.tag):
            if el.nsmap:
                actual_tag_name = '{}:{}'.format(list(el.nsmap.keys())[0], el.tag)
            else:
                actual_tag_name = el.tag
            el.tag = 'span'
            el.attrib['__tag_name'] = actual_tag_name
    return tree
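A minimal usage sketch for get_html_tree() above, not part of the original source; it assumes the same module-level imports the function uses (lxml.html, lxml.etree and a string_class alias such as str).

# assumes get_html_tree() from the snippet above is importable
tree = get_html_tree(u'Hello <b>world</b>')
print(tree.tag)                      # 'div' -- the bare fragment was wrapped
print(tree.text_content().strip())   # 'Hello world'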
def test_image_data_links_in_style(self):
    data = b'123'
    data_b64 = base64.b64encode(data).decode('ASCII')
    urls = [
        "data:image/jpeg;base64," + data_b64,
        "data:image/apng;base64," + data_b64,
        "data:image/png;base64," + data_b64,
        "data:image/gif;base64," + data_b64,
        "data:image/webp;base64," + data_b64,
        "data:image/bmp;base64," + data_b64,
        "data:image/tiff;base64," + data_b64,
        "data:image/x-icon;base64," + data_b64,
    ]
    for url in urls:
        html = '<style> url(%s) </style>' % url
        s = lxml.html.fragment_fromstring(html)
        cleaned = lxml.html.tostring(clean_html(s))
        self.assertEqual(
            html.encode("UTF-8"), cleaned,
            "%s -> %s" % (url, cleaned))
def html2dita_saxon(html, infotype='topic'):
    if not isinstance(html, six.text_type):
        raise TypeError('HTML must be str/unicode')

    html_out, errors = tidylib.tidy_document(
        html.encode('utf8'),
        options={
            'doctype': 'omit',
            'output_xhtml': 1,
            'input-encoding': 'utf8',
            'output-encoding': 'utf8',
            'char-encoding': 'utf8',
        })
    html_out = html_out.replace(b' xmlns="http://www.w3.org/1999/xhtml"', b'')

    html_filename = tempfile.mktemp(suffix='.html')
    with io.open(html_filename, 'wb') as fp:
        fp.write(html_out)

    output_filename = tempfile.mktemp(suffix='.html')
    cmd = '"{saxon}" "{html_filename}" "{h2d_xsl}" infotype={infotype} >"{output_filename}"'.format(
        saxon=saxon,
        html_filename=html_filename,
        h2d_xsl=h2d_xsl,
        infotype=infotype,
        output_filename=output_filename)
    status, output = util.runcmd(cmd)
    if status != 0:
        raise RuntimeError('html2dita() failed: {}'.format(output))

    with io.open(output_filename, 'r') as fp:
        topic_out = fp.read()

    os.unlink(html_filename)
    os.unlink(output_filename)
    return topic_out
def extractHtml(html, selector, type='css', dump=False):
    items = []
    if html != '':
        try:
            soup = lxml.html.fromstring(html.encode('utf-8'))
            if type == 'css':
                for item in soup.cssselect(selector):
                    item = lxml.etree.tostring(item).decode('utf-8').strip()
                    items.append(item)
            elif type == 'xpath':
                result = soup.xpath(selector)
                result = result if isinstance(result, list) else [result]
                for item in result:
                    if isinstance(item, lxml.etree._Element):
                        item = lxml.etree.tostring(item).decode('utf-8')
                    items.append(str(item).strip())
        except Exception as e:
            items.append('ERROR: ' + str(e))
    return items
def convert_html_to_markdown(html: str) -> str:
    # On Linux, the tool installs as html2markdown, and there's a command called
    # html2text that does something totally different. On OSX, the tool installs
    # as html2text.
    commands = ["html2markdown", "html2text"]

    for command in commands:
        try:
            # A body width of 0 means do not try to wrap the text for us.
            p = subprocess.Popen(
                [command, "--body-width=0"], stdout=subprocess.PIPE,
                stdin=subprocess.PIPE, stderr=subprocess.STDOUT)
            break
        except OSError:
            continue

    markdown = p.communicate(input=html.encode('utf-8'))[0].decode('utf-8').strip()
    # We want images to get linked and inline previewed, but html2text will turn
    # them into links of the form `![](http://foo.com/image.png)`, which is
    # ugly. Run a regex over the resulting description, turning links of the
    # form `![](http://foo.com/image.png?12345)` into
    # `[image.png](http://foo.com/image.png)`.
    return re.sub("!\\[\\]\\((\\S*)/(\\S*)\\?(\\S*)\\)", "[\\2](\\1/\\2)", markdown)
def to_xhtml(self, html, base_url):
    html = html.replace(u' ', u' ')
    html = html.replace(u'—', u'—')

    outputfilename = os.path.join(options.outputdir, options.outputfile)
    debugfilename = os.path.splitext(outputfilename)[0] + '.debug.html'
    try:
        os.remove(debugfilename)
    except OSError:
        pass

    if options.verbose > 1:
        with open(debugfilename, 'w') as fp:
            fp.write(html.encode('utf-8'))

    try:
        xhtml = etree.fromstring(
            html,
            lxml.html.XHTMLParser(),
            base_url=base_url)
    except etree.ParseError, what:
        error("etree.fromstring says %s" % what)
        raise
variation_template = config.VAR_TEMPLATE
variation_dict = {
    "variation_table": variant_table.decode('utf-8'),
    "footer": footer.decode('utf-8'),
    "gwas_table": gwas_table.decode('utf-8'),
    "consequence_table": VEP_table.decode('utf-8'),
    "regulation": regulatory_table.decode('utf-8'),
    "Genomes_freq": population_table.decode('utf-8'),
    "ExAC_freq": exac_table.decode('utf-8'),
    "UK10K_freq": uk10K_table.decode('utf-8'),
    "genes": gene_table.decode('utf-8'),
    'GTExGenes': GTEx_genes_table.decode('utf-8'),
    "pubmed": pubmed_table.decode('utf-8'),
    "phenotypes": phenotype_table.decode('utf-8')
}

html = draw_html(variation_template, variation_dict)

# Saving file:
f = open(filename, 'w')
f.write(html.encode("utf8"))

print >> sys.stderr, "Done."
print >> sys.stderr, "Annotating genes... "
for dist in gene_list.keys():
    for gene in gene_list[dist]:
        print >> sys.stderr, "\tAnnotating %s... " % gene["ID"],
        Annotate_gene(gene["ID"])
print >> sys.stderr, "Done."
def parse_publications(i, url, html, response_status):
    record = {}
    record['old_url'] = url
    record['i'] = i
    if response_status == 404:  #don't bother with the 404's
        record['status'] = 404
        return record
    root = lxml.html.fromstring(html.encode('iso-8859-1'))
    record['summary'] = root.cssselect('meta[name="DC.description"]')[0].get('content')  #summary from metatag
    record['URN'] = root.cssselect('meta[name="DC.identifier"][scheme="ISBN"]')[0].get('content')  #ISBN from metatag
    record['ISBN'] = record['URN']
    record['command_paper_number'] = record['URN']
    contentdiv = root.cssselect("div#content")
    if not contentdiv:
        return False
    content = contentdiv[0]
    if not content:
        return False
    titles = list(content.cssselect("div.hgroup h1"))
    if titles:
        record['title'] = titles[0].text_content().strip()
    meta = content.cssselect("table.meta")[0]
    try:
        record['associated_policies'] = meta.xpath('//*[contains(text(), "Mode/topic:")]/following-sibling::*')[0].text_content()  #associated policies
    except:
        pass
    try:
        record['associated_organisations'] = meta.xpath('//*[contains(text(), "Publisher:")]/following-sibling::*')[0].text_content()  #associated organisations
    except:
        pass
    try:
        record['publication_date'] = meta.xpath('//*[contains(text(), "Published date:")]/following-sibling::*')[0].text_content()  #published date
    except:
        pass
    try:
        record['publication_date_iso'] = dateutil.parser.parse(record["publication_date"], dayfirst=True).date().isoformat()  #iso date
    except:
        pass
    for node in content.cssselect("div.header"):  #drop the header info - we are done with it and don't want it in the body text
        node.drop_tree()
    links = content.cssselect('a')
    if len(links) > 0:  #put the page(s) with large number of attachments into a json field
        n = 1
        attachment_json = []
        for link in links:
            try:
                if 'assets.dft.gov.uk' in link.attrib['href'] or 'webarchive.nationalarchives.gov.uk' in link.attrib['href']:
                    attachment_json.append({'link': link.attrib['href'], 'title': link.text_content()})
                    n = n + 1
            except:
                pass
            try:
                if 'tsoshop.co.uk' in link.attrib['href']:
                    record['order_url'] = link.attrib['href']
            except:
                pass
        record['manual'] = 1
        record['z'] = attachment_json
    else:  #process the attachments
        n = 1
        for link in links:
            try:
                if 'assets.dft.gov.uk' in link.attrib['href'] or 'webarchive.nationalarchives.gov.uk' in link.attrib['href']:
                    record['attachment_' + str(n)] = link.attrib['href']
                    record['attachment_' + str(n) + '_title'] = link.text_content()
                    n = n + 1
            except:
                pass
            try:
                if 'tsoshop.co.uk' in link.attrib['href']:
                    record['order_url'] = link.attrib['href']
            except:
                pass
    record['body'] = html2text.HTML2Text().handle(data=lxml.html.tostring(content))  #bodytext
    record['body'] = record['body'].replace(u"\xa0", u" ")  #non breaking spaces
    record['body'] = record['body'].encode('utf-8')
    return record
def html_pdf_output(self):
    html = "\n".join(self.get_html_outputs())
    pdf = pipe(["wkhtmltopdf", "-", "-"], html.encode('ascii', 'xmlcharrefreplace'))
    return pdf
parsed_url = urlparse(url)
cookies = CookieJar()
useragent = 'newspaper/0.2.8'
headers = {'User-Agent': useragent}
timeout = 7

response = requests.get(url=url, **{'headers': headers, 'cookies': cookies,
                                    'timeout': timeout, 'allow_redirects': True})
html = response.text
doc = lxml.html.fromstring(html)
clean_doc = copy.deepcopy(doc)

raw_html = html.encode('utf-8', 'replace')
link_hash = '%s.%s' % (hashlib.md5(raw_html).hexdigest(), time.time())

title = ''
tag = 'title'
selector = 'descendant-or-self::%s' % (tag or '*')
elems = clean_doc.xpath(selector, namespaces=None)
txts = [i for i in elems[0].itertext()]

TABSSPACE = re.compile(r'[\s\t]+')
value = ' '.join(txts).strip()
value = re.sub(TABSSPACE, ' ', value)
value = ''.join(value.splitlines())
def write_to_file(html, output_file):
    with open(output_file, "wb") as fh:
        fh.write(html.encode('utf-8'))
def browserview(self, html):
    tf = tempfile.NamedTemporaryFile(delete=False)
    tf.write(html.encode())
    webbrowser.open(tf.name)
def parse_srs(i, url, html, response_status):
    record = {}
    record['old_url'] = url
    record['i'] = i
    if response_status == 404:  #don't bother with the 404's
        record['status'] = 404
        return record
    root = lxml.html.fromstring(html.encode('iso-8859-1'))
    record['summary'] = root.cssselect('meta[name="DC.description"]')[0].get('content')  #summary from metatag
    record['type'] = 'release'
    type_img = root.xpath('//img[@alt="National Statistics logo"]')
    if type_img:
        record['type'] = 'National Stats'
    chart = root.xpath('//div[@id="line_chart"]')
    if chart:
        record['chart'] = 'yes'
    contentdiv = root.cssselect("div#content")
    if not contentdiv:
        return False
    content = contentdiv[0]
    if not content:
        return False
    titles = list(content.cssselect("div.hgroup h1"))
    if titles:
        record['title'] = titles[0].text_content().strip()
    meta = content.cssselect("table.meta")[0]
    try:
        record['associated_policies'] = meta.xpath('//*[contains(text(), "Statistics topic:")]/following-sibling::*')[0].text_content()  #associated policies
    except:
        pass
    try:
        record['associated_organisations'] = meta.xpath('//*[contains(text(), "Publisher:")]/following-sibling::*')[0].text_content()  #associated organisations
    except:
        pass
    try:
        record['publication_date'] = meta.xpath('//*[contains(text(), "Published date:")]/following-sibling::*')[0].text_content().replace(u'\xa0', u'')
    except:
        pass
    try:
        record['publication_date_iso'] = dateutil.parser.parse(record["publication_date"], dayfirst=True).date().isoformat()  #iso date
    except:
        pass
    try:
        record['publication_series'] = meta.xpath('//*[contains(text(), "Series:")]/following-sibling::*')[0].text_content()  #series
    except:
        pass
    print record
    for node in content.cssselect("div.header"):  #drop the header info - we are done with it and don't want it in the body text
        node.drop_tree()
    links = content.cssselect('a')
    if len(links) > 50:  #put the page(s) with large number of attachments into a json field
        n = 1
        attachment_json = {}
        for link in links:
            try:
                if 'assets.dft.gov.uk' in link.attrib['href'] or 'webarchive.nationalarchives.gov.uk' in link.attrib['href']:
                    attachment_json['attachment_' + str(n)] = link.attrib['href']
                    attachment_json['attachment_' + str(n) + '_title'] = link.text_content()
                    n = n + 1
            except:
                pass
        record['manual'] = 1
        record['attachment_json'] = json.dumps(attachment_json)
    else:  #process the attachments
        n = 1
        for link in links:
            try:
                if 'assets.dft.gov.uk' in link.attrib['href'] or 'webarchive.nationalarchives.gov.uk' in link.attrib['href']:
                    record['attachment_' + str(n)] = link.attrib['href']
                    record['attachment_' + str(n) + '_title'] = link.text_content()
                    n = n + 1
            except:
                pass
    record['body'] = html2text.HTML2Text().handle(data=lxml.html.tostring(content))  #bodytext
    record['body'] = record['body'].replace(u"\xa0", u" ")  #non breaking spaces
    record['body'] = record['body'].encode('utf-8')
    print record['body']
    return record
def parse_sts(i, url, html, response_status):
    record = {}
    record['old_url'] = url
    record['i'] = i
    if response_status == 404:  #don't bother with the 404's
        record['status'] = 404
        return record
    root = lxml.html.fromstring(html.encode('iso-8859-1'))
    record['summary'] = root.cssselect('meta[name="DC.description"]')[0].get('content')  #summary from metatag
    contentdiv = root.cssselect("div#content")
    if not contentdiv:
        return False
    content = contentdiv[0]
    if not content:
        return False
    titles = list(content.cssselect("div.hgroup h1"))
    if titles:
        record['title'] = titles[0].text_content().strip()
    meta = content.cssselect("table.meta")[0]
    try:
        record['geo_scope'] = meta.xpath('//*[contains(text(), "Geographical scope:")]/following-sibling::*')[0].text_content()
    except:
        pass
    try:
        record['geo_breakdown'] = meta.xpath('//*[contains(text(), "Geographical breakdown:")]/following-sibling::*')[0].text_content()
    except:
        pass
    try:
        record['urn'] = meta.xpath('//*[contains(text(), "Reference:")]/following-sibling::*')[0].text_content().replace(u'\xa0', u'')  #reference number
    except:
        pass
    try:
        record['associated_policies'] = meta.xpath('//*[contains(text(), "Statistics topic:")]/following-sibling::*')[0].text_content()  #associated policies
    except:
        pass
    try:
        record['associated_organisations'] = meta.xpath('//*[contains(text(), "Publisher:")]/following-sibling::*')[0].text_content()  #associated organisations
    except:
        pass
    try:
        record['publication_date'] = meta.xpath('//*[contains(text(), "Published date:")]/following-sibling::*')[0].text_content().replace(u'\xa0', u'')
    except:
        pass
    try:
        record['publication_date_iso'] = dateutil.parser.parse(record["publication_date"], dayfirst=True).date().isoformat()  #iso date
    except:
        pass
    try:
        record['publication_series'] = meta.xpath('//*[contains(text(), "Series:")]/following-sibling::*')[0].text_content()  #series
    except:
        pass
    try:
        record['type'] = meta.xpath('//*[contains(text(), "Type:")]/following-sibling::*')[0].text_content()  #type
    except:
        pass
    for node in content.cssselect("div.header"):  #drop the header info - we are done with it and don't want it in the body text
        node.drop_tree()
    for node in content.xpath('div[@id="secondary"]'):  #drop the secondary
        node.drop_tree()
    #links = content.xpath('//li/a/.')
    #linksul = content.xpath('//h2[text()="Download table"]//following-sibling::*')
    #for link in linksul:
    #    print link.tag
    links = root.xpath('//div[@id="content"]//li/a/.')
    n = 1
    for link in links:
        try:
            if 'assets.dft.gov.uk' in link.attrib['href'] or 'webarchive.nationalarchives.gov.uk' in link.attrib['href']:
                record['attachment_' + str(n)] = link.attrib['href']
                record['attachment_' + str(n) + '_title'] = link.text_content()
                n = n + 1
        except:
            pass
    record['body'] = html2text.HTML2Text().handle(data=lxml.html.tostring(content))  #bodytext
    record['body'] = record['body'].replace(u"\xa0", u" ")  #non breaking spaces
    record['body'] = record['body'].encode('utf-8')
    print record
    return record
def _cleaned_html_tree(html: str) -> HtmlElement:
    parser = HTMLParser(encoding='utf8')
    tree = fromstring(html.encode('utf8'), parser=parser)
    return _clean_html(tree)
def scrape_new_html(limit=20, url_comment_id=dict(), test_url=None):
    theta_conn = db_connections.get_theta_postgres_db()
    theta_cur = theta_conn.cursor()
    theta_cur.execute('set search_path = "backend"')
    if test_url is not None:
        theta_cur.execute("select loc, html from html where loc = '{}';".format(test_url))
    else:
        theta_cur.execute("""
            SELECT html.loc, html
            FROM html
            JOIN sitemap ON html.loc = sitemap.loc
            WHERE (last_scrape IS NULL OR lastmod > last_scrape)
              AND html IS NOT NULL
              --AND NOT ('fundraisers' = ANY (categories))
              AND NOT ('static' = ALL (categories) OR html.loc = 'https://www.crowdrise.com')
            limit {};""".format(limit))
    html_data = theta_cur.fetchall()
    if len(html_data) == 0:
        theta_cur.close()
        theta_conn.close()
        return True
    all_data = dict(fundraiser=[], user=[], charity=[], event=[], special_user=[],
                    front_page_redirect=[], user_project=[], charity_event=[],
                    team=[], donation=[])
    scraped_urls = []
    for url, html in html_data:
        scraped_urls.append(url)
        try:
            # root = lxml.html.fromstring(lxml.html.tostring(lxml.html.fromstring(html.encode('latin1'))).decode('utf8'))
            try:
                root = lxml.html.fromstring(html.encode('latin1').decode('utf8'))
            except UnicodeDecodeError:
                logging.warning('unicode decode error for url "{}"'.format(url))
                theta_conn, theta_cur = keep_theta_conn_alive(theta_conn, theta_cur)
                theta_cur.execute('insert into html_bad_encoding values (%s) on CONFLICT DO NOTHING ;', [(url, )])
                theta_conn.commit()
                root = lxml.html.fromstring(html.encode('latin1').decode('utf8', errors='ignore'))
            try:
                page_type = CrowdriseScraper.get_page_type(root)
            except NotImplementedError:
                theta_conn, theta_cur = keep_theta_conn_alive(theta_conn, theta_cur)
                theta_cur.executemany("insert into unknown_page_type values (%s) on CONFLICT DO NOTHING;", [(url, )])
                theta_conn.commit()
                continue
            page_data = CrowdriseScraper.get_crowdrise_data(
                page_type, root, url, latest_comment_id=url_comment_id.get(url))
            if page_data is not None:
                # file_data['file_path'] = cur_file_name
                page_data['url'] = url
                page_data['true_url'] = root.xpath('//meta[@property="og:url"]')[0].attrib['content'].replace('https://', '').replace('http://', '')
                page_data['base_true_url'] = None
                # file_data['last_scrape'] = time.gmtime(os.path.getmtime(cur_file_name))
                # handle data that requires its own table - eg the fundraisers each user has
                if 'projects' in page_data.keys():
                    projects = page_data.pop('projects')
                    all_data['user_project'] += [{'username': page_data['username'],
                                                  'project': 'www.crowdrise.com' + x} for x in projects]
                if 'events' in page_data.keys():
                    events = page_data.pop('events')
                    all_data['charity_event'] += [{'charity': page_data['url'],
                                                   'event': 'www.crowdrise.com' + x} for x in events]
                if 'team_members' in page_data.keys():
                    team_members = page_data.pop('team_members')
                    all_data['team'] += team_members
                if 'donations' in page_data.keys():
                    donations = page_data.pop('donations')
                    all_data['donation'] += donations
                all_data[page_type].append(page_data)
        except:
            print('failed on url "{}"'.format(url))
            logging.error('failed on url "{}"'.format(url))
            raise
    all_data['user_project'] = [
        x for x in all_data['user_project']
        if re.match(CROWDRISE_URL_RE, 'https://' + x['project'])
    ]
    db = db_connections.get_fungrosencrantz_schema('crowdrise')
    db_connections.multi_table_upload(data=all_data, db=db, ensure=True,
                                      process_num=None, chunk_size=3000)
    scrape_time = time.time()
    # update table with new entries
    db.query('truncate table _recently_updated')
    db.executable.execute('insert ignore into _recently_updated values (%s, %s)',
                          [(x, scrape_time) for x in scraped_urls])
    db.executable.execute("""
        replace into crowdrise.funding_trend (url, username, amount_raised, goal, scrape_time_unix, type)
        SELECT
          fundraiser.url,
          CASE WHEN fundraiser_url IS NULL  # individual fundraiser
            THEN fundraiser.username
          ELSE  # team fundraiser
            ''  # give team total raised for fundraiser, then use `team` to give individual contributions
          END,
          coalesce(team_total_raised, total_raised),
          NULL,
          _recently_updated.last_scrape_unix,
          'fundraiser'
        FROM fundraiser
          join _recently_updated on _recently_updated.url = fundraiser.url
          LEFT JOIN team ON fundraiser.url = team.fundraiser_url
        GROUP BY fundraiser.url;

        replace into crowdrise.funding_trend (url, username, amount_raised, goal, scrape_time_unix, type)
        select fundraiser_url, username, amount_raised, goal, _recently_updated.last_scrape_unix, 'team'
        from team
          join _recently_updated on _recently_updated.url = team.fundraiser_url;

        replace into crowdrise.funding_trend (url, username, amount_raised, goal, scrape_time_unix, type)
        select charity.url, '', money_raised, null, _recently_updated.last_scrape_unix, 'charity'
        from charity
          join _recently_updated on _recently_updated.url = charity.url;

        replace into crowdrise.funding_trend (url, username, amount_raised, goal, scrape_time_unix, type)
        select event.url, '', amount_raised, goal, _recently_updated.last_scrape_unix, 'event'
        from event
          join _recently_updated on _recently_updated.url = event.url;

        replace into crowdrise.funding_trend (url, username, amount_raised, goal, scrape_time_unix, type)
        select user.url, username, money_raised, null, _recently_updated.last_scrape_unix, 'user'
        from user
          join _recently_updated on _recently_updated.url = user.url;
        """)
    q = """
        update html
        set last_scrape = to_timestamp({})
        where loc in ({});""".format(scrape_time, ", ".join(["'" + x + "'" for x in scraped_urls]))
    theta_conn, theta_cur = keep_theta_conn_alive(theta_conn, theta_cur)
    theta_cur.execute(q)
    if test_url is None and limit != 0:
        theta_conn.commit()
    theta_cur.close()
    theta_conn.close()
    if len(html_data) < limit or test_url is not None:
        return False
    else:
        return True
def prepare_mongodoc(doc):
    html = doc.get('html_rendered', doc.get('html')) or ''
    return prepare_html(html.encode('utf8'))
import scraperwiki
import requests
import lxml.html
import lxml.etree
from lxml.cssselect import CSSSelector
import time
from datetime import datetime

url = "http://www.wetteronline.de/Berlin/Berlin.htm"
html = requests.get(url, verify=False).text
root = lxml.html.fromstring(html.encode("utf-8"))

time = datetime.now()
t_max_heute = root.cssselect("td.tmax")[0].text_content()
t_max_morgen = root.cssselect("td.tmax")[1].text_content()
t_max_ubermorgen = root.cssselect("td.tmax")[2].text_content()
t_min_heute = root.cssselect("td.tmin")[0].text_content()
t_min_morgen = root.cssselect("td.tmin")[1].text_content()
t_min_ubermorgen = root.cssselect("td.tmin")[2].text_content()

data = {
    'time': time,
    't_max_heute': t_max_heute,
    't_max_morgen': t_max_morgen,
    't_max_ubermorgen': t_max_ubermorgen,
    't_min_heute': t_min_heute,
    't_min_morgen': t_min_morgen,
    't_min_ubermorgen': t_min_ubermorgen
}
def prepare_doc(doc):
    html = doc.get('html_rendered', doc['html'])
    return prepare_htmltext(html.encode('utf8'))