def parser():
    t1 = time.time()
    url = request.args.get('url')
    try:
        if url and url.strip() != "":
            url = url.strip()
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'
            }
            rsps = requests.get(url, headers=headers)
            try:
                page = rsps.content.decode('utf-8')
            except:
                page = rsps.content.decode('gb18030', 'ignore')
        else:
            page = request.form.get("html_content")
        t2 = time.time()
        pm = PageModel(page, url)
        result = pm.extract()
        t3 = time.time()
    except:
        traceback.print_exc()
        return "download url failed"
    return render_template("result.html",
                           data=result['content'],
                           title=result['title'],
                           json_s=json.dumps(result, indent=4),
                           download_cost=t2 - t1,
                           extract_cost=t3 - t2)
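# The view above assumes a surrounding Flask app; a minimal sketch of that wiring.
# The '/parse' route name and the 'templates/result.html' file are assumptions for
# illustration, not part of the original snippet.
import json
import time
import traceback

import requests
from flask import Flask, render_template, request
from jparser import PageModel

app = Flask(__name__)
# register the view function defined above
app.add_url_rule('/parse', 'parser', parser, methods=['GET', 'POST'])

if __name__ == '__main__':
    app.run(debug=True)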
def run_jparser(htmlstring):
    '''try with jparser'''
    try:
        pm = PageModel(htmlstring)
    except ValueError:
        return ''
    result = pm.extract()
    mylist = list()
    for x in result['content']:
        if x['type'] in ('text', 'html'):
            mylist.append(str(x['data']))
    returnstring = re.sub(r'\s+', ' ', ' '.join(mylist))
    # use a raw string so \1 is the backreference, not the control character '\x01'
    returnstring = re.sub(r' ([.,;!?])', r'\1', returnstring)
    return returnstring
def content_extraction(html):
    try:
        pm = PageModel(html)
        result = pm.extract()
        title = result['title']
        content = ''
        for x in result['content']:
            if x['type'] == 'text':
                content = '%s%s%s' % (content, x['data'], '\n')
        result = {}.fromkeys(("title", "content"))
        result["title"] = title
        result["content"] = content
        return result
    except Exception as e:
        print(e)
        return {"title": "", "content": ""}
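# A small usage sketch for content_extraction() above; the sample HTML is made up
# purely for illustration and may be too short for jparser to extract much from.
if __name__ == '__main__':
    sample_html = (
        '<html><head><title>Demo page</title></head>'
        '<body><p>First paragraph.</p><p>Second paragraph.</p></body></html>'
    )
    extracted = content_extraction(sample_html)
    print(extracted['title'])
    print(extracted['content'])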
def get_combined_index_data(self):
    combined_index_data = super(FeedContentFromPageItem, self).get_combined_index_data()
    if re.match(r'^https?\:\/\/', self.original_item['link']):
        page_link = self.original_item['link']
    else:
        page_link = urljoin(self.source_definition['file_url'], self.original_item['link'])
    r = self.http_session.get(page_link, timeout=5)
    print >> sys.stderr, "Got %s with status code : %s" % (
        self.original_item['link'], r.status_code)
    # only continue if we got the page
    if r.status_code < 200 or r.status_code >= 300:
        return combined_index_data
    try:
        full_content = r.content
    except etree.ElementTree.ParseError as e:
        return combined_index_data
    # TODO: Fix byte 0xff problem: 'utf8' codec can't decode byte 0xff in position <x>: invalid start byte
    # TODO: Fix Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.
    # TODO: remove things like: Share on Facebook Share Share on Twitter Tweet Share on Pinterest Share Share on LinkedIn Share Send email Mail Print Print
    try:
        cleaned = PageModel(full_content.decode(r.encoding)).extract()
    except Exception as e:
        print >> sys.stderr, e
        cleaned = {}
    output = u''
    for elem in cleaned.get('content', []):
        if elem['type'] == 'text':
            # if it starts with these words it's probably garbage
            if re.match(r'^\s*(Share|Deel|Delen|Send|Print)\s*', elem['data']) is None:
                output += '<p>%s</p>' % (elem['data'],)
        if elem['type'] == 'image':
            output += '<img src="%s" />' % (elem['data']['src'],)
    if output.strip() != u'':
        combined_index_data['description'] = unicode(output)
    return combined_index_data
def extract_content(self, full_content, encoding):
    # TODO: Fix byte 0xff problem: 'utf8' codec can't decode byte 0xff in position <x>: invalid start byte
    # TODO: Fix Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.
    # TODO: remove things like: Share on Facebook Share Share on Twitter Tweet Share on Pinterest Share Share on LinkedIn Share Send email Mail Print Print
    try:
        cleaned = PageModel(full_content.decode(encoding)).extract()
    except Exception as e:
        print >>sys.stderr, e
        cleaned = {}
    output = u''
    for elem in cleaned.get('content', []):
        if elem['type'] == 'text':
            # if it starts with these words it's probably garbage
            if re.match(r'^\s*(Share|Deel|Delen|Send|Print)\s*', elem['data']) is None:
                output += '<p>%s</p>' % (elem['data'],)
        if elem['type'] == 'image':
            output += '<img src="%s" />' % (elem['data']['src'],)
    if output.strip() != u'':
        return unicode(output)
def run_jparser(htmlstring):
    '''try with jparser'''
    try:
        pm = PageModel(htmlstring)
    except (TypeError, ValueError):
        return ''
    result = pm.extract()
    # old
    mylist = list()
    for x in result['content']:
        if x['type'] in ('text', 'html'):
            mylist.append(str(x['data']))
    # suggested
    #mylist = [
    #    str(x['data'])
    #    for x in result['content']
    #    if x['type'] in ('text', 'html')
    #]
    returnstring = ' '.join(mylist)
    # returnstring = re.sub(r'\s+', ' ', returnstring)
    # note: `p{P}` here is almost certainly meant to be the Unicode punctuation class
    # \p{P}, which the built-in `re` module does not support (see the `regex`-based
    # sketch after this function); the raw-string r'\1' replaces the control character '\1'
    returnstring = re.sub(r'\s+(p{P}+)', r'\1', returnstring)
    return sanitize(returnstring)
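# A sketch of the whitespace-before-punctuation cleanup that r'\s+(p{P}+)' above seems
# to be aiming for. It uses the third-party `regex` package (pip install regex), which
# supports Unicode property classes such as \p{P}; this is an assumption about the
# intended behaviour, not part of the original snippet.
import regex

def strip_space_before_punctuation(text):
    # collapse runs of whitespace, then drop whitespace that precedes punctuation
    text = regex.sub(r'\s+', ' ', text)
    return regex.sub(r'\s+(\p{P}+)', r'\1', text)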
def get_corpus_from_web(file, href):
    print href
    if ('ccf.org' not in href):
        return
    headers = {'User-Agent': choice(useragents)}
    resp = requests.get(href, verify=False, headers=headers)
    new_urls = link_pattern.findall(resp.content)
    result_body = PageModel(resp.content.decode('utf-8')).extract()
    result_body_temp = ''
    for x in result_body['content']:
        if x['type'] == 'text':
            result_body_temp += x['data'].replace(' ', '').replace('\n', '')
    # print result_body_temp
    file.write(result_body_temp)
    finished_urls.add(href)
    for url in all_url_cleaning(resp.url, new_urls):
        # print url,
        if url not in finished_urls:
            urls.add(url)
def main():
    reader = csv.reader(open('query_text.csv', 'rb'))
    index = 0
    for line in reader:
        record_id = 1
        print('---------%i---------' % index)
        query_all = line[0]
        csvfile = open('scrapy_data/scrapy_result_%i.csv' % (index), 'wb')
        writer = csv.writer(csvfile)
        data = ['record_id', 'query', 'title', 'abstract', 'link', 'content']
        writer.writerows([data])
        index += 1
        query_array = re.split(u"[,;。！？]", query_all.decode('utf-8'))
        if (len(query_array[-1]) < 5):
            query_array.pop(-1)
        # merge adjacent short fragments so each query stays under ~38 characters
        flag = len(query_array) - 1
        i = -1
        while (i < flag):
            i += 1
            if (i > flag - 1):
                break
            elif (len(query_array[i]) < 38):
                if (len(query_array[i]) + len(query_array[i + 1]) > 38):
                    continue
                else:
                    query_array[i + 1] = query_array[i] + query_array[i + 1]
                    query_array.pop(i)
                    flag -= 1
                    i -= 1
            else:
                continue
        if (len(query_array)):
            for query in query_array:
                print(query)
                if (len(query) < 8):
                    PAGE_NUM = 1
                else:
                    PAGE_NUM = 1
                for k in range(0, PAGE_NUM):
                    try:
                        # URL of the page to crawl
                        url = 'http://www.baidu.com/s?wd=%s&pn=%i' % (query, k * 10)
                        content = requests.get(url, headers=headers)
                        # parse the html with BeautifulSoup
                        soup = BeautifulSoup(content.text, 'html.parser')
                        title = []
                        abstract = []
                        link = []
                        content = []
                        # note: this set literal is probably meant to be {'class': 'result c-container'}
                        allNews = soup.find_all('div', {'id', 'result c-container '})
                        for hotNews in allNews:
                            h3 = hotNews.find(name="h3", attrs={"class": re.compile("t")}).find('a')
                            title.append(h3.text.replace("\"", ""))
                            div = hotNews.find(name="div", attrs={"class": re.compile("c-abstract")})
                            abstract.append(div.text.replace("\"", ""))
                            a = hotNews.find(name="a", attrs={"class": re.compile("c-showurl")})
                            detail_url = a.get('href')
                            link.append(detail_url)
                            try:
                                ret = api.article(url=detail_url, fields=['text', 'next'])
                                content.append(ret['text'].replace('\r', '').replace('\n', ''))
                            except:
                                try:
                                    time.sleep(1)
                                    ret = api.article(url=detail_url, fields=['text'])
                                    content.append(ret['text'].replace('\r', '').replace('\n', ''))
                                except:
                                    # fall back to jparser when the article API fails
                                    try:
                                        try:
                                            html = requests.get(detail_url, headers=headers).text.decode('utf-8')
                                        except:
                                            html = requests.get(detail_url, headers=headers).text.decode('gbk')
                                        pm = PageModel(html)
                                        result = pm.extract()
                                        ans = [
                                            x['data']
                                            for x in result['content']
                                            if x['type'] == 'text'
                                        ]
                                        content.append(''.join(ans))
                                    except Exception as e:
                                        print(e)
                                        print(detail_url)
                                        content.append('')
                                        pass
                        # write the data to csv
                        data = []
                        for i in range(0, len(title)):
                            try:
                                data.append((record_id, query, title[i], abstract[i], link[i], content[i]))
                                record_id += 1
                            except Exception as err:
                                print(err)
                        writer.writerows(data)
                        print("Page " + str(k + 1) + " finished")
                    except Exception as err:
                        print(err)
                        pass
                    time.sleep(1)
                    # break
        csvfile.close()
import urllib2
from jparser import PageModel

html = urllib2.urlopen("http://news.sohu.com/20170512/n492734045.shtml").read().decode('gb18030')
pm = PageModel(html)
result = pm.extract()
print "==title=="
print result['title']
print "==content=="
for x in result['content']:
    if x['type'] == 'text':
        print x['data']
    if x['type'] == 'image':
        print "[IMAGE]", x['data']['src']
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun May 21 11:02:59 2017

@author: foolwolf0068
"""
import urllib.request

from jparser import PageModel

# urllib.urlopen() does not exist in Python 3; use urllib.request.urlopen() instead
html = urllib.request.urlopen("http://www.pythontab.com").read().decode('gb18030')
pm = PageModel(html)
result = pm.extract()
print("**title**")
print(result['title'])
print("==content==")
for x in result['content']:
    if x['type'] == 'text':
        print(x['data'])
    if x['type'] == 'image':
        print("[IMAGE]", x['data']['src'])