Example #1
class Article:

    def __init__(self, url):
        print('Saving page: {}'.format(url))
        res = requests.get(url)
        self.url = url
        self.article = Document(res.content)
        self._add_title()
        self._save_images()

    def _add_title(self):
        self.root = etree.fromstring(self.article.summary())
        body = self.root.find('body')

        title = self.article.title()
        ascii_title = unidecode(title) if type(title) == unicode else title

        title_header = etree.HTML('<h2>{}</h2>'.format(ascii_title))
        body.insert(0, title_header)

    def _save_images(self):
        tmppath = tempfile.mkdtemp()
        images = self.root.xpath('//img')
        for img in images:
            imgsrc = img.get('src')

            # handle scheme-agnostic URLs
            if 'http' not in imgsrc and '//' in imgsrc:
                imgsrc = 'http:{}'.format(imgsrc)

            # handle relative file paths
            elif 'http' not in imgsrc:
                parsed = urlparse(self.url)
                imgsrc = '{}://{}{}'.format(parsed.scheme, parsed.netloc, imgsrc)

            filename = os.path.basename(imgsrc)
            dest = os.path.join(tmppath, filename)

            try:
                res = requests.get(imgsrc)
            except Exception as e:
                print('Could not fetch image ({}) from "{}"'.format(str(e), imgsrc))
                return

            if res.status_code == 404:
                print('Could not fetch image (HTTP 404), attempted fetch: "{}", source URL: {}'.format(imgsrc, img.get('src')))
                continue

            with open(dest, 'wb') as f:
                f.write(res.content)

            img.set('src', dest)

    @property
    def title(self):
        return self.article.title()

    @property
    def html(self):
        return etree.tostring(self.root)
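A minimal driver for the class above might look like the sketch below; the URL and output filename are placeholders, and it assumes the module already imports requests, readability's Document, lxml's etree, unidecode, tempfile and os as the class requires.

# Illustrative usage of the Article class above (not part of the original example).
if __name__ == '__main__':
    page = Article('https://example.com/some-post')    # placeholder URL
    print(page.title)                                   # readability-cleaned title
    with open('article.html', 'wb') as out:             # etree.tostring() returns bytes
        out.write(page.html)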
Example #2
def main():
    novels = {
        'cbi': 'https://boxnovel.com/novel/castle-of-black-iron/chapter-',
        'sgg': 'https://boxnovel.com/novel/super-gene/chapter-',
        'sas': 'https://boxnovel.com/novel/strongest-abandoned-son/chapter-',
        'atg': 'https://www.wuxiaworld.com/novel/against-the-gods/atg-chapter-'
    }
    total = []
    if len(sys.argv) < 4:
        inicio = int(sys.argv[2])
        fim = int(sys.argv[2]) + 1
    else:
        inicio = int(sys.argv[2])
        fim = int(sys.argv[3]) + 1

    url = novels[sys.argv[1]]
    for i in range(inicio, fim):
        response = getPage(url + str(i))
        doc = Document(response.text)
        fileName = re.sub(r'[^a-zA-Z0-9]+', ' ', doc.title())
        total.append(doc.summary())
        print(i)

    f = open(fileName + str(fim - 1) + '.html', 'w')
    for i in total:
        f.write(i)
    f.close()
Example #3
def run(index):
	print "Index %d" % index
	dirname = "data/%04d" % index

	# url of english article
	url = open(dirname + "/url_en.txt").read()

	# download html
	html = urllib.urlopen(url).read().decode('latin-1')

	# apply readability
	document = Document(html)
	article = document.summary()
	article = nltk.clean_html(article)

	# replace latin characters
	article = re.sub(u'&#13;', u'\n', article)
	article = re.sub(u'\x92', u'`', article)
	article = re.sub(u'\x96', u'-', article)

	# article_en.txt
	output = codecs.open(dirname + "/article_en.txt", 'w', encoding='ascii', errors='ignore')
	output.write(article)
	output.close()

	# title.txt
	output = codecs.open(dirname + "/title.txt", 'w', encoding='ascii', errors='ignore')
	output.write(document.title())
	output.close()
Example #4
def download_via_url(url):
    response = requests.get(url)
    doc = Document(response.text)
    title = doc.title()
    summary = doc.summary()
    soup = BeautifulSoup(summary, "html.parser")

    return title, soup.text
Example #5
 def parse(self, response):
     doc = Document(response.text)
     yield {
         'full_title': doc.title(),
         # 'date': response.selector.xpath('//time/@datetime').getall()
         # 'date': response.xpath('//span[@class="post-date"]/text()').get()
         'date': '2009'
     }
Example #6
 def extract(self, html):
     # https://github.com/buriy/python-readability/blob/master/readability/readability.py
     doc = Document(html)
     self.__title = doc.title()
     self.__html = doc.summary()
     self.__md = html2text.html2text(self.__html)
     self.__text = self.__format_to_text(self.__html)
     return self.__text
Example #7
 def _getResponseText(self, response):
     '''
     (response) -> Text
     Returns text within the body of an HttpResponse object.
     '''
     readability = Document(response.body)
     content = readability.title() + readability.summary()
     return content
Example #8
def process_html(html):
    doc = Document(html)
    return {
        'content': doc.content(),
        'clean_html': doc.get_clean_html(),
        'short_title': doc.short_title(),
        'summary': html_to_text(doc.summary()),
        'title': doc.title()
    }
Example #9
def crawl_url(url):
    html = requests.get(url)
    doc = Document(html.content)
    content = doc.summary().encode('utf-8')
    title = doc.title().encode('utf-8')
    return {
        'content': content,
        'title': title
    }
Example #10
 def get_article_from_item(self, item):
     url = item['link']
     logging.debug(url)
     author = 'n/a'
     if item.has_key('author'):
         author = item.author
     html = urllib.urlopen(url).read()
     doc = Document(html)
     return Article(doc.title(), doc.short_title(), author, doc.summary())
Example #11
 def get_article_from_item(self, item):
     url = item['link']
     logging.debug(url)
     author = 'n/a'
     if item.has_key('author'):
         author = item.author
     html = urllib.urlopen(url).read()
     doc = Document(html)
     return Article(doc.title(), doc.short_title(), author, doc.summary())
Example #12
 def extract_article(self):
     """Returns only readable content
     Returns:
         data - {
             'title': 'Title of the article',
             'content': 'HTML body of the article'
         }
     """
     doc = Document(self._html)
     return {'title': doc.title(), 'content': doc.summary()}
Example #13
 def extract_article(self):
     """Returns only readable content
     Returns:
         data - {
             'title': 'Title of the article',
             'content': 'HTML body of the article'
         }
     """
     doc = Document(self._html)
     return {'title': doc.title(), 'content': doc.summary()}
Example #14
def get_article(d):
    url = d['url']
    if table.find_one(url=url):
        return
    print "fetching stuff for %s" % url
    d['html'] = requests.get(url).content
    try:
        doc = Document(d['html'])
        d['summary'] = html.fromstring(doc.summary()).xpath('string()')
        d['content'] = html.fromstring(doc.content()).xpath('string()')
        d['title'] = doc.title()
    except Exception, e:
        print e
Example #15
 def handle(self, url, content):
     # Fix of issue27
     # content = re.sub('href="(.*?)"', '', content);
     doc = Document(content)
     try:
         hp = HParser(doc.summary())
         text = doc.title() + '\n' + hp.tag_list[0].rtext().replace('==+NL+==', '\n')
         text = '\n'.join(list(map(lambda l: l.strip(), text.split('\n'))))
         text = re.sub('\n{3,}', '\n\n', text).strip()
         return text
     except:
         self.logger.exception('Fail to parse the summary from readability!')
         raise
Example #16
    def get_main_text(self):
        doc = Document(self._page.content, positive_keywords=re.compile('event-description__text|event-heading__title|event-heading__argument', re.I))
        title = doc.title()
        summary = doc.summary(html_partial=True)

        self.summary_bs = BeautifulSoup(summary, 'html.parser')

        strings = []
        for div in self.summary_bs.find_all(['div', 'span', 'body']):
            strings.extend([string for string in div.stripped_strings if
                            string != "" and re.search(r'[<>{}=\[\]\|]', string) is None])
        text = "\n".join(strings)
        preprocessed_text = TextUtils.handle(text)
        return '{}\n{}'.format(' '.join(TextUtils.handle(title)), ' '.join(preprocessed_text))
Example #17
def preprocess_doc(html_text):
    """
    Preprocessing of an html text as a String is done here. Tags that are advertisement and that do not describe the
    content are removed at first. The encoding is detected and next the html is parsed and preprocessed using the
    readability-lxml Document class to clean the content (text and images embedded in the text).
    An HTML string is returned together with the title of the website.

    :author: Sebastian
    :param html_text: html document in string format to preprocess.
    :returns: The preprocessed html as a String and the title if needed by the callee.
    """
    # remove some common advertisement tags beforehand
    bs = BeautifulSoup(html_text, "lxml")
    for tag_desc in negative_tags:
        for tag in bs.findAll(
                attrs={'class': re.compile(r".*\b{}\b.*".format(tag_desc))}):
            tag.extract()
    doc = Document(str(bs.html),
                   negative_keywords=negative_classes,
                   positive_keywords=positive_classes)
    try:
        # Detect the encoding of the html, if not detectable use utf-8 as default.
        encoding = chardet.detect(doc.content().encode()).get('encoding')
        title = doc.title()
    except (TypeError, IndexError) as e:
        logger("Encountered {} setting encoding to utf-8.".format(str(e)))
        encoding = "utf-8"
        title = bs.title.getText()
    if not encoding:
        logger("Using default encoding utf-8")
        encoding = 'utf-8'
        title = bs.title.getText()
    doc.encoding = encoding

    head = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1' \
           '-transitional.dtd">\n' + '<head>\n' + \
           '<meta http-equiv="Content-Type" content="text/html" ' \
           'charset="' + encoding + '">\n' + '</head>\n' + '<body>\n' \
           + '<h1>' + title.split(sep='|')[0] + '</h1>'

    # Unparsable Type Error in encoding, where's the problem.
    text = head + doc.summary()[12:]

    # sometimes some tags get messed up and need to be translated back
    text = text.replace("&lt;", "<").replace("&gt;", ">")
    logger(
        'Preprocessing done. Type of text is: {}, Length of test is {}'.format(
            type(text), len(text)))
    return text, title
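A hedged sketch of calling preprocess_doc follows; it assumes the module-level names the function relies on (negative_tags, negative_classes, positive_classes, logger) are already defined, and the URL and output path are placeholders.

# Illustrative call to preprocess_doc(); not part of the original example.
import requests

raw_html = requests.get('https://example.com/news/story').text   # placeholder URL
cleaned_html, page_title = preprocess_doc(raw_html)
print(page_title)
with open('cleaned.html', 'w', encoding='utf-8') as out:
    out.write(cleaned_html)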
Example #18
def retrieve_article_content(article):
    article.last_fetch = timezone.now()
    try:
        response = requests.get(article.link)
        if response.ok:
            doc = Document(response.content)
            article.content = doc.summary()
            article.title = doc.title()
            article.status = cst.READY_STATUS
        else:
            article.status = cst.ERROR_STATUS
        article.save()
    except Exception as e:
        logger.error(e)
        article.status = cst.ERROR_STATUS
        article.save()
Example #19
def make_readable(url):
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError:
        return None

    document = Document(html)

    document_dict = {
        'title': document.title(),
        'summary': document.summary(),
        'content': document.content(),
        'short_title': document.short_title()
    }

    return document_dict
Example #20
def get_url():
    start_url = "http://www.ediliziaeterritorio.ilsole24ore.com/"
    sess.get(start_url, headers=headers)
    page = 1
    keyword = 'Luigi Di Maio'
    url = "http://www.ricerca24.ilsole24ore.com/s24service?profilo=r24_service&search_query_id=fullquery&max_docs=1000&highlight=true&keywords_operator=AND&search_parameters=&order_by=2&page_number={}&page_size=10&v=2009&mt=text%2Fhtml%3B%20charset%3Diso-8859-1&cog_extra=true&xsl_id=html_all&keywords={}".format(
        page, keyword)
    response = sess.get(url, headers=headers)
    txt = response.text
    html = etree.HTML(txt)
    lis = html.xpath('//ul[@class="list list-results"]/li[@class="i"]')
    for li in lis:
        news_url = li.xpath('./article//h3/a/@href')[0]
        try:
            date = re.search(r'\d+-\d+-\d+', news_url).group()
        except:
            d = re.search(r'\d+/\d+/\d+', news_url).group()
            date = d.replace('/', '-')
        timeArray = time.strptime(date, "%Y-%m-%d")
        timestamp = int(time.mktime(timeArray))
        stringDate = time.strftime("%Y-%m-%d %H:%M:%S",
                                   time.localtime(timestamp))
        response = sess.get(news_url, headers=headers)
        txt = response.text
        try:
            num = re.findall(r'\d+ Commenti', txt)
            print num
            num = num[0].split(' ')[0]
        except Exception as error:
            print error
            num = 0
        readable_article = Document(txt)
        title = readable_article.title()
        html = etree.HTML(readable_article.summary())
        context = ''.join(html.xpath('//p//text()')).replace('\r', '').replace(
            '\n', '').replace('\t', '')
        item = {}
        item['time'] = stringDate
        item['timestamp'] = timestamp
        item['title'] = title
        item['context'] = context
        item['source'] = 'ricerca24'
        item['url'] = news_url
        item['commont_num'] = num
        with open('24.json', 'a') as f:
            f.write(json.dumps(item) + '\n')
Example #21
 def parser_content(self, html, index_url):
     print index_url
     import pdb
     if 'charset=gb2312' in html:
         try:
             code = chardet.detect(html)['encoding']
             html = html.decode(code, 'ignore')
         except:
             pass
     html = re.sub('<select[\s\S]+?</select>', '', html)
     readable_article = Document(html)
     content = readable_article.summary()
     content = re.sub('</?div.*?>', '', content)
     title = readable_article.title()
     time_search = re.search("发布时间.{20}", html)
     # if u'发布日期' in content :
     #      pdb.set_trace()
     if time_search:
         push_time = self.parser_match_time(time_search.group())
     else:
         try:
             push_time = self.parser_html_time(html)
         except:
             push_time = ''
     text = PyQuery(readable_article).text()
     print "*" * 100
     print push_time
     self.SAVECO.update(
         {"url": index_url}, {
             "url": index_url,
             "html": content,
             "text": text,
             "time": push_time,
             "title": title,
             "createdAt":
             datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
         },
         upsert=True)
Example #22
def parseURL_pr(url):
    parsed = urlparse(url)
    if ( "youtube" in parsed.hostname ):
        print url, 'has youtube and we dont parse that'
        return None
    try:
        response = urlopen(url)
    except IOError:
        return None

    if ( response.getcode() > 400 ):
        print url , ' is not accessible any more', response.getcode()
        return None
    html = response.read()
    doc = Document(html)
    content = {}
    #content['content'] = doc.summary()
    html = doc.summary(True)
    soup = BeautifulSoup(html)
    content['content'] = soup.get_text()
    content['title'] = doc.title()
    content['word_count'] = len(content['content'])
    return content
Example #23
def parseURL_pr(url):
    parsed = urlparse(url)
    if ("youtube" in parsed.hostname):
        print url, 'has youtube and we dont parse that'
        return None
    try:
        response = urlopen(url)
    except IOError:
        return None

    if (response.getcode() > 400):
        print url, ' is not accessible any more', response.getcode()
        return None
    html = response.read()
    doc = Document(html)
    content = {}
    #content['content'] = doc.summary()
    html = doc.summary(True)
    soup = BeautifulSoup(html)
    content['content'] = soup.get_text()
    content['title'] = doc.title()
    content['word_count'] = len(content['content'])
    return content
Example #24
    def parser_content(self, html, index_url):
        print index_url
        import pdb
        html = re.sub('<select[\s\S]+?</select>', '', html)
        readable_article = Document(html)
        content = readable_article.summary()
        title = readable_article.title()
        time_search = re.search("发布时间.{20}", html)
        # if u'发布日期' in content :
        #      pdb.set_trace()
        if time_search:
            push_time = self.parser_match_time(time_search.group())
        else:
            push_time = self.parser_html_time(html)
        print "*" * 100
        print push_time

        self.SAVECO.update({"url": index_url}, {
            "url": index_url,
            "html": content,
            "time": push_time,
            "title": title
        },
                           upsert=True)
Example #25
def extractTitle(html):
    if html == "":
        return None
    try:
        doc = Document(html)
        short_title = doc.short_title()
        title = doc.title()
        if short_title is not None and short_title.strip() != "":
            title = short_title

        for delimiter in ['|', '-', '::', '/', '_']:
            if delimiter in title:
                parts = title.split(delimiter)
                if len(parts[0]) >= 4:
                    title = parts[0]
                    break
                elif len(parts[-1]) >= 4:
                    title = parts[-1]
                    break

        return title
    except:
        pass
    return None
Example #26
# - pip install readability-lxml
# - :bind <key sequence> spawn --userscript readerview.py

from readability.readability import Document
import os
import tempfile

# use readability-lxml (https://pypi.python.org/pypi/readability-lxml) to
# extract the article text
html = open(os.environ.get('QUTE_HTML')).read()
url = os.environ.get('QUTE_URL')

# set the url kwarg to get absolute links
document = Document(html, url=url)
article = document.summary()
title = document.title()

# add styling and whatever for better reading
head = '''<html>
<head>
<title>''' + title + ''' [readerview]</title>
<style>
body {
 max-width: 800px;
 margin: 0 auto;
 background-color: #fdf6e3;
 color: #657b83;
}
#qute_orig_link {
 font-weight: bold;
 text-align: center;
Example #27
def textgetter(url):
    """Scrapes web news and returns the content

    Parameters
    ----------

    url : str
        web address to news report

    Returns 
    -------
    
    answer : dict
        Python dictionary with key/value pairs for:
            text (str) - Full text of article
            url (str) - url to article
            title (str) - extracted title of article
            author (str) - name of extracted author(s)
            base (str) - base url of where article was located
            provider (str) - string of the news provider from url
            published_date (str,isoformat) - extracted date of article
            top_image (str) - extracted url of the top image for article

    """
    global done
    TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'li']

    # regex for url check
    s = re.compile('(http://|https://)([A-Za-z0-9_\.-]+)')
    u = re.compile("(http://|https://)(www.)?(.*)(\.[A-Za-z0-9]{1,4})$")
    if s.search(url):
        site = u.search(s.search(url).group()).group(3)
    else:
        site = None
    answer = {}
    # check that its an url
    if s.search(url):
        if url in done.keys():
            return done[url]
            pass
        try:
            r = requests.get(url, verify=False, timeout=1)
        except:
            done[url] = "Unable to reach website."
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = "Unable to reach website."
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            yield answer

        if r.status_code != 200:
            done[url] = "Unable to reach website."
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = "Unable to reach website."
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url

        if len(r.content) > 500:
            article = Article(url)
            article.download(input_html=r.content)
            article.parse()
            if len(article.text) >= 200:
                answer['author'] = ", ".join(article.authors)
                answer['base'] = s.search(url).group()
                answer['provider'] = site
                answer['published_date'] = article.publish_date
                if isinstance(article.publish_date, datetime.datetime):
                    answer['published_date'] = article.publish_date.astimezone(
                        pytz.utc).isoformat()

                answer['text'] = article.text
                answer['title'] = article.title
                answer['top_image'] = article.top_image
                answer['url'] = url

            else:
                doc = Paper(r.content)
                data = doc.summary()
                title = doc.title()
                soup = BeautifulSoup(data, 'lxml')
                newstext = " ".join([l.text for l in soup.find_all(TAGS)])

                if len(newstext) > 200:
                    answer['author'] = None
                    answer['base'] = s.search(url).group()
                    answer['provider'] = site
                    answer['published_date'] = None
                    answer['text'] = newstext
                    answer['title'] = title
                    answer['top_image'] = None
                    answer['url'] = url
                else:
                    newstext = " ".join([
                        l.text
                        for l in soup.find_all('div', class_='field-item even')
                    ])
                    done[url] = newstext
                    answer['author'] = None
                    answer['base'] = s.search(url).group()
                    answer['provider'] = site
                    answer['published_date'] = None
                    answer['text'] = newstext
                    answer['title'] = title
                    answer['top_image'] = None
                    answer['url'] = url
        else:
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = 'No text returned'
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            yield answer
        yield answer
        del r, data
    else:
        answer['author'] = None
        answer['base'] = s.search(url).group()
        answer['provider'] = site
        answer['published_date'] = None
        answer['text'] = 'This is not a proper url'
        answer['title'] = None
        answer['top_image'] = None
        answer['url'] = url
        yield answer
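Since textgetter is a generator (it yields the answer dictionary described in the docstring), a caller has to iterate it. A minimal, assumed driver is sketched below; done must exist as a module-level dict, the URL is a placeholder, and the function's own imports (requests, newspaper's Article, readability's Document as Paper, BeautifulSoup, pytz) are presumed to be in place.

# Illustrative consumption of the generator above; not from the original source.
done = {}                                               # cache the function expects via 'global done'
for result in textgetter('https://example.com/news/story'):   # placeholder URL
    print(result.get('title'), '-', result.get('provider'))
    break                                               # the generator may yield more than one dict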
Example #28
def textgetter(url):
    """Scrapes web news and returns the content

    Parameters
    ----------

    url : str
        web address to news report

    Returns 
    -------
    
    answer : dict
        Python dictionary with key/value pairs for:
            text (str) - Full text of article
            url (str) - url to article
            title (str) - extracted title of article
            author (str) - name of extracted author(s)
            base (str) - base url of where article was located
            provider (str) - string of the news provider from url
            published_date (str,isoformat) - extracted date of article
            top_image (str) - extracted url of the top image for article

    """
    global done
    TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'li']

    # regex for url check
    s = re.compile('(http://|https://)([A-Za-z0-9_\.-]+)')
    u = re.compile("(http://|https://)(www.)?(.*)(\.[A-Za-z0-9]{1,4})$")
    if s.search(url):
        site = u.search(s.search(url).group()).group(3)
    else:
        site = None
    answer = {}
    # check that its an url
    if s.search(url):
        if url in done.keys():
            return done[url]
            pass
        try:
            r = requests.get(url, verify=False, timeout=1)
        except:
            done[url] = "Unable to reach website."
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider']=site
            answer['published_date']=None
            answer['text'] = "Unable to reach website."
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            yield answer
                
        if r.status_code != 200:
            done[url] = "Unable to reach website."
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider']=site
            answer['published_date']=None
            answer['text'] = "Unable to reach website."
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url


        if len(r.content)>500:
            article = Article(url)
            article.download(input_html=r.content)
            article.parse()
            if len(article.text) >= 200:
                answer['author'] = ", ".join(article.authors)
                answer['base'] = s.search(url).group()
                answer['provider']=site
                answer['published_date'] = article.publish_date
                if isinstance(article.publish_date,datetime.datetime):
                    answer['published_date']=article.publish_date.astimezone(pytz.utc).isoformat()
                

                answer['text'] = article.text
                answer['title'] = article.title
                answer['top_image'] = article.top_image
                answer['url'] = url

                
            else:
                doc = Paper(r.content)
                data = doc.summary()
                title = doc.title()
                soup = BeautifulSoup(data, 'lxml')
                newstext = " ".join([l.text for l in soup.find_all(TAGS)])

                if len(newstext) > 200:
                    answer['author'] = None
                    answer['base'] = s.search(url).group()
                    answer['provider']=site
                    answer['published_date']=None
                    answer['text'] = newstext
                    answer['title'] = title
                    answer['top_image'] = None
                    answer['url'] = url
                else:
                    newstext = " ".join([
                        l.text
                        for l in soup.find_all(
                            'div', class_='field-item even')
                    ])
                    done[url] = newstext
                    answer['author'] = None
                    answer['base'] = s.search(url).group()
                    answer['provider']=site
                    answer['published_date']=None
                    answer['text'] = newstext
                    answer['title'] = title
                    answer['top_image'] = None
                    answer['url'] = url
        else:
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider']=site
            answer['published_date']=None
            answer['text'] = 'No text returned'
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            yield answer
        yield answer
        del r, data
    else:
        answer['author'] = None
        answer['base'] = s.search(url).group()
        answer['provider']=site
        answer['published_date']=None
        answer['text'] = 'This is not a proper url'
        answer['title'] = None
        answer['top_image'] = None
        answer['url'] = url
        yield answer
        
Example #29
def update_and_send(proxy, post, url, country, is_proxy):
    user = post.author
    if is_proxy:
        try:
            r = requests.get(url, proxies={"http": proxy})
        except:
            email.send_email_normal(user.email,
                                    'Your requested web Article Blocked in ' +
                                    country,
                                    'main/block_mail',
                                    user=user,
                                    post=post,
                                    server=app.config['SERVER_URL'])
            return True
    else:
        try:
            r = requests.get(url)
        except:
            email.send_email_normal(user.email,
                                    'Your requested web Article Blocked',
                                    'main/block_mail',
                                    user=user,
                                    post=post,
                                    server=app.config['SERVER_URL'])
            return True
    if r:
        doc = Document(r.text)
        sha256, html_text = calculate_hash_for_html_doc(doc)
        if sha256 == post.hashVal:
            return True
        else:

            try:
                originStampResult = save_render_zip_submit(
                    html_text, sha256, url, doc.title())
            except:
                app.logger.error(
                    '300 Internal System Error. Could not submit hash to originstamp'
                )

            app.logger.error('Hash: ' + sha256 + ' submitted to originstamp')
            dateTimeGMT = originStampResult.headers['Date']
            post_new = Post(body=doc.title(),
                            urlSite=url,
                            hashVal=sha256,
                            webTitl=doc.title(),
                            origStampTime=datetime.strptime(
                                dateTimeGMT, "%a, %d %b %Y %H:%M:%S %Z"),
                            author=user)
            db.session.add(post_new)
            db.session.commit()
            post_created = Post.query.filter(
                and_(Post.urlSite.like(url),
                     Post.hashVal.like(sha256))).first()
            ids = str(post.id) + ':' + str(post_created.id)
            if post_created:
                email.send_email_normal(
                    user.email,
                    'Change in the requested Article found',
                    'main/normal_email',
                    user=user,
                    post=post_created,
                    ids=ids,
                    server=app.config['SERVER_URL'])
            return True
    else:
        email.send_email_normal(user.email,
                                'Your requested web Article Blocked in ' +
                                country,
                                'main/block_email',
                                user=user,
                                post=post,
                                server=app.config['SERVER_URL'])
        return True
Example #30
def textgetter(url):
    """Scrapes web news and returns the content
    Parameters
    ----------
    url : str
        web address to news report
    Returns 
    -------
    
    answer : dict
        Python dictionary with key/value pairs for:
            text (str) - Full text of article
            url (str) - url to article
            title (str) - extracted title of article
            author (str) - name of extracted author(s)
            base (str) - base url of where article was located
            provider (str) - string of the news provider from url
            published_date (str,isoformat) - extracted date of article
            top_image (str) - extracted url of the top image for article
    """
    global done
    TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'li']

    # regex for url check
    s = re.compile('(http://|https://)([A-Za-z0-9_\.-]+)')
    u = re.compile("(http://|https://)(www.)?(.*)(\.[A-Za-z0-9]{1,4})$")
    if s.search(url):
        site = u.search(s.search(url).group()).group(3)
    else:
        site = None
    answer = {}
    # check that its an url
    if s.search(url):
        if url in done.keys():
            yield done[url]
            pass
        try:
            # make a request to the url
            r = requests.get(url, verify=False, timeout=1)
        except:
            # if the url does not return data, set to empty values
            done[url] = "Unable to reach website."
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider']=site
            answer['published_date']=None
            answer['text'] = "Unable to reach website."
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            answer['keywords']=None
            answer['summary']=None
            yield answer
        # if url does not return successfully, set ot empty values
        if r.status_code != 200:
            done[url] = "Unable to reach website."
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider']=site
            answer['published_date']=None
            answer['text'] = "Unable to reach website."
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            answer['keywords']=None
            answer['summary']=None

        # test if length of url content is greater than 500, if so, fill data
        if len(r.content)>500:
            # set article url
            article = Article(url)
            # test for python version because of html different parameters
            if int(platform.python_version_tuple()[0])==3:
                article.download(input_html=r.content)
            elif int(platform.python_version_tuple()[0])==2:
                article.download(html=r.content)
            # parse the url
            article.parse()
            article.nlp()
            # if parse doesn't pull text fill the rest of the data
            if len(article.text) >= 200:
                answer['author'] = ", ".join(article.authors)
                answer['base'] = s.search(url).group()
                answer['provider']=site
                answer['published_date'] = article.publish_date
                answer['keywords']=article.keywords
                answer['summary']=article.summary
                # convert the data to isoformat; exception for naive date
                if isinstance(article.publish_date,datetime.datetime):
                    try:
                        answer['published_date']=article.publish_date.astimezone(pytz.utc).isoformat()
                    except:
                        answer['published_date']=article.publish_date.isoformat()
                

                answer['text'] = article.text
                answer['title'] = article.title
                answer['top_image'] = article.top_image
                answer['url'] = url
                
                

            # if previous didn't work, try another library
            else:
                doc = Paper(r.content)
                data = doc.summary()
                title = doc.title()
                soup = BeautifulSoup(data, 'lxml')
                newstext = " ".join([l.text for l in soup.find_all(TAGS)])

                # as we did above, pull text if it's greater than 200 length
                if len(newstext) > 200:
                    answer['author'] = None
                    answer['base'] = s.search(url).group()
                    answer['provider']=site
                    answer['published_date']=None
                    answer['text'] = newstext
                    answer['title'] = title
                    answer['top_image'] = None
                    answer['url'] = url
                    answer['keywords']=None
                    answer['summary']=None
                # if nothing works above, use beautiful soup
                else:
                    newstext = " ".join([
                        l.text
                        for l in soup.find_all(
                            'div', class_='field-item even')
                    ])
                    done[url] = newstext
                    answer['author'] = None
                    answer['base'] = s.search(url).group()
                    answer['provider']=site
                    answer['published_date']=None
                    answer['text'] = newstext
                    answer['title'] = title
                    answer['top_image'] = None
                    answer['url'] = url
                    answer['keywords']=None
                    answer['summary']=None
        # if nothing works, fill with empty values
        else:
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider']=site
            answer['published_date']=None
            answer['text'] = 'No text returned'
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            answer['keywords']=None
            answer['summary']=None
            yield answer
        yield answer

    # the else clause to catch if invalid url passed in
    else:
        answer['author'] = None
        answer['base'] = s.search(url).group()
        answer['provider']=site
        answer['published_date']=None
        answer['text'] = 'This is not a proper url'
        answer['title'] = None
        answer['top_image'] = None
        answer['url'] = url
        answer['keywords']=None
        answer['summary']=None
        yield answer