Example #1
# imports needed to run this Flask view with jparser
import json
import time
import traceback

import requests
from flask import request, render_template
from jparser import PageModel


def parser():
    t1 = time.time()
    url = request.args.get('url')
    try:
        if url and url.strip() != "":
            url = url.strip()
            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'
            }
            rsps = requests.get(url, headers=headers)
            try:
                page = rsps.content.decode('utf-8')
            except UnicodeDecodeError:
                page = rsps.content.decode('gb18030', 'ignore')
        else:
            page = request.form.get("html_content")
        t2 = time.time()
        pm = PageModel(page, url)
        result = pm.extract()
        t3 = time.time()
    except Exception:
        traceback.print_exc()
        return "failed to download or parse the page"
    return render_template("result.html",
                           data=result['content'],
                           title=result['title'],
                           json_s=json.dumps(result, indent=4),
                           download_cost=t2 - t1,
                           extract_cost=t3 - t2)
Example #2
def run_jparser(htmlstring):
    '''try with jparser'''
    try:
        pm = PageModel(htmlstring)
    except ValueError:
        return ''
    result = pm.extract()
    mylist = list()
    for x in result['content']:
        if x['type'] in ('text', 'html'):
            mylist.append(str(x['data']))
    returnstring = re.sub(r'\s+', ' ', ' '.join(mylist))
    returnstring = re.sub(r' ([.,;!?])', r'\1', returnstring)  # drop the space before punctuation
    return returnstring
Example #3
def content_extraction(html):
    try:
        pm = PageModel(html)
        result = pm.extract()
        title = result['title']
        content = ''
        for x in result['content']:
            if x['type'] == 'text':
                content = '%s%s\n' % (content, x['data'])
        return {"title": title, "content": content}
    except Exception as e:
        print(e)
        return {"title": "", "content": ""}
Example #4
    def get_combined_index_data(self):
        combined_index_data = super(FeedContentFromPageItem,
                                    self).get_combined_index_data()

        if re.match(r'^https?\:\/\/', self.original_item['link']):
            page_link = self.original_item['link']
        else:
            page_link = urljoin(self.source_definition['file_url'],
                                self.original_item['link'])
        r = self.http_session.get(page_link, timeout=5)
        print >> sys.stderr, "Got %s with status code : %s" % (
            self.original_item['link'], r.status_code)

        # only continue if we got the page
        if r.status_code < 200 or r.status_code >= 300:
            return combined_index_data

        try:
            full_content = r.content
        except etree.ElementTree.ParseError as e:
            return combined_index_data

        # TODO: Fix byte 0xff problem: 'utf8' codec can't decode byte 0xff in position <x>: invalid start byte
        # TODO: Fix Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.
        # TODO: remove things like: Share on Facebook Share Share on Twitter Tweet Share on Pinterest Share Share on LinkedIn Share Send email Mail Print Print
        try:
            cleaned = PageModel(full_content.decode(r.encoding)).extract()
        except Exception as e:
            print >> sys.stderr, e
            cleaned = {}

        output = u''
        for elem in cleaned.get('content', []):
            if elem['type'] == 'text':
                # if it starts with these words it's probably garbage
                if re.match('^\s*(Share|Deel|Delen|Send|Print)\s*',
                            elem['data']) is None:
                    output += '<p>%s</p>' % (elem['data'], )
            if elem['type'] == 'image':
                output += '<img src="%s" />' % (elem['data']['src'], )

        if output.strip() != u'':
            combined_index_data['description'] = unicode(output)

        return combined_index_data
Example #5
    def extract_content(self, full_content, encoding):
        # TODO: Fix byte 0xff problem: 'utf8' codec can't decode byte 0xff in position <x>: invalid start byte
        # TODO: Fix Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.
        # TODO: remove things like: Share on Facebook Share Share on Twitter Tweet Share on Pinterest Share Share on LinkedIn Share Send email Mail Print Print
        try:
            cleaned = PageModel(full_content.decode(encoding)).extract()
        except Exception as e:
            print >>sys.stderr, e
            cleaned = {}

        output = u''
        for elem in cleaned.get('content', []):
            if elem['type'] == 'text':
                # if it starts with these words it's probably garbage
                if re.match('^\s*(Share|Deel|Delen|Send|Print)\s*', elem['data']) is None:
                    output += '<p>%s</p>' % (elem['data'],)
            if elem['type'] == 'image':
                output += '<img src="%s" />' % (elem['data']['src'],)

        if output.strip() != u'':
            return unicode(output)
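
The decode TODOs above describe two recurring failures: response bytes that are not valid in the reported encoding (the 0xff error) and lxml refusing unicode input that still carries an XML encoding declaration. A minimal sketch of a helper that works around both before handing the text to PageModel; the helper's name and the lenient 'replace' policy are assumptions, not part of the original code:

import re

def decode_for_pagemodel(raw_bytes, encoding):
    # decode leniently so a single invalid byte does not abort extraction
    text = raw_bytes.decode(encoding or 'utf-8', 'replace')
    # strip a leading <?xml ... ?> declaration, which lxml rejects on unicode input
    return re.sub(r'^\s*<\?xml[^>]*\?>', '', text, count=1)

With this helper, the call above would read PageModel(decode_for_pagemodel(full_content, encoding)).extract().
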
Example #6
def run_jparser(htmlstring):
    '''try with jparser'''
    try:
        pm = PageModel(htmlstring)
    except (TypeError, ValueError):
        return ''
    result = pm.extract()
    mylist = [
        str(x['data'])
        for x in result['content']
        if x['type'] in ('text', 'html')
    ]

    returnstring = ' '.join(mylist)
    # returnstring = re.sub(r'\s+', ' ', returnstring)
    # the stdlib re module has no \p{P} (Unicode punctuation) class, so match
    # common punctuation explicitly and drop the whitespace before it
    returnstring = re.sub(r'\s+([.,;:!?])', r'\1', returnstring)
    return sanitize(returnstring)
Example #7
def get_corpus_from_web(file, href):
    print href
    if ('ccf.org' not in href):
        return
    headers = {'User-Agent': choice(useragents)}
    resp = requests.get(href, verify=False, headers=headers)
    new_urls = link_pattern.findall(resp.content)
    result_body = PageModel(resp.content.decode('utf-8')).extract()
    result_body_temp = ''
    for x in result_body['content']:
        if x['type'] == 'text':
            result_body_temp += x['data'].replace(' ', '').replace('\n', '')
    # print result_body_temp
    file.write(result_body_temp)
    finished_urls.add(href)
    for url in all_url_cleaning(resp.url, new_urls):
        # print url,
        if url not in finished_urls:
            urls.add(url)
Example #8
def main():
    reader = csv.reader(open('query_text.csv', 'rb'))
    index = 0
    for line in reader:
        record_id = 1
        print('---------%i---------' % index)
        query_all = line[0]
        csvfile = open('scrapy_data/scrapy_result_%i.csv' % (index), 'wb')
        writer = csv.writer(csvfile)
        data = ['record_id', 'query', 'title', 'abstract', 'link', 'content']
        writer.writerows([data])
        index += 1

        query_array = re.split(u"[,;。!?]", query_all.decode('utf-8'))
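        # drop a trailing fragment shorter than 5 characters, then merge a
        # segment shorter than 38 characters into the next one whenever the
        # combined length still fits within 38 characters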
        if (len(query_array[-1]) < 5):
            query_array.pop(-1)
        flag = len(query_array) - 1
        i = -1
        while (i < flag):
            i += 1
            if (i > flag - 1):
                break
            elif (len(query_array[i]) < 38):
                if (len(query_array[i]) + len(query_array[i + 1]) > 38):
                    continue
                else:
                    query_array[i + 1] = query_array[i] + query_array[i + 1]
                    query_array.pop(i)
                    flag -= 1
                    i -= 1
            else:
                continue

        if (len(query_array)):
            for query in query_array:
                print(query)
                PAGE_NUM = 1  # fetch one page of Baidu results per query

                for k in range(0, PAGE_NUM):
                    try:
                        # URL of the Baidu results page to fetch
                        url = 'http://www.baidu.com/s?wd=%s&pn=%i' % (query,
                                                                      k * 10)
                        content = requests.get(url, headers=headers)
                        # parse the result page with BeautifulSoup
                        soup = BeautifulSoup(content.text, 'html.parser')
                        title = []
                        abstract = []
                        link = []
                        content = []
                        # Baidu organic results live in <div class="result c-container">
                        allNews = soup.find_all(
                            'div', {'class': 'result c-container'})
                        for hotNews in allNews:
                            h3 = hotNews.find(name="h3",
                                              attrs={
                                                  "class": re.compile("t")
                                              }).find('a')
                            title.append(h3.text.replace("\"", ""))
                            div = hotNews.find(
                                name="div",
                                attrs={"class": re.compile("c-abstract")})
                            abstract.append(div.text.replace("\"", ""))
                            a = hotNews.find(
                                name="a",
                                attrs={"class": re.compile("c-showurl")})
                            detail_url = a.get('href')
                            link.append(detail_url)
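                            # content extraction cascade: try the article API,
                            # retry once after a pause, then fall back to
                            # fetching the page and running jparser locally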
                            try:
                                ret = api.article(url=detail_url,
                                                  fields=['text', 'next'])
                                content.append(ret['text'].replace(
                                    '\r', '').replace('\n', ''))
                            except:
                                try:
                                    time.sleep(1)
                                    ret = api.article(url=detail_url,
                                                      fields=['text'])
                                    content.append(ret['text'].replace(
                                        '\r', '').replace('\n', ''))
                                except:
                                    try:
                                        # decode from the raw bytes; these
                                        # pages are either utf-8 or gbk
                                        raw = requests.get(
                                            detail_url,
                                            headers=headers).content
                                        try:
                                            html = raw.decode('utf-8')
                                        except UnicodeDecodeError:
                                            html = raw.decode('gbk', 'ignore')
                                        pm = PageModel(html)
                                        result = pm.extract()
                                        ans = [
                                            x['data']
                                            for x in result['content']
                                            if x['type'] == 'text'
                                        ]
                                        content.append(''.join(ans))
                                    except Exception as e:
                                        print(e)
                                        print(detail_url)
                                        content.append('')
                                        pass

                        # write the collected rows to the CSV file
                        data = []
                        for i in range(0, len(title)):
                            try:
                                data.append((record_id, query, title[i],
                                             abstract[i], link[i], content[i]))
                                record_id += 1
                            except Exception as err:
                                print(err)
                        writer.writerows(data)
                        print("第" + str(k + 1) + "页完成")

                    except Exception as err:
                        print(err)
                        pass

                time.sleep(1)
                # break

        csvfile.close()
Example #9
import urllib2
from jparser import PageModel


html = urllib2.urlopen("http://news.sohu.com/20170512/n492734045.shtml").read().decode('gb18030')
pm = PageModel(html)
result = pm.extract()

print "==title=="
print result['title']
print "==content=="
for x in result['content']:
    if x['type'] == 'text':
        print x['data']
    if x['type'] == 'image':
        print "[IMAGE]", x['data']['src']
Example #10
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun May 21 11:02:59 2017

@author: foolwolf0068
"""

import urllib.request
from jparser import PageModel
html = urllib.request.urlopen("http://www.pythontab.com").read().decode('gb18030')
pm = PageModel(html)
result = pm.extract()
print("**title**")
print(result['title'])
print("==content==")
for x in result['content']:
    if x['type'] == 'text':
        print(x['data'])
    if x['type'] == 'image':
        print("[IMAGE]", x['data']['src'])