Example 1
class ArticleExtractor:
    def __init__(self):
        self.goose = Goose()

    def article_from_url(self, url):
        return self.goose.extract(url=url)

    def article_from_html(self, html):
        return self.goose.extract(raw_html=html)
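A minimal usage sketch of the class above (the URL is a placeholder, and the import shown is assumed to sit at the top of the module):

from goose import Goose

extractor = ArticleExtractor()
article = extractor.article_from_url('http://example.com/some-article')  # placeholder URL
print(article.title)
print(article.cleaned_text[:150])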
Example 2
    def __init__(self, url, author):
        self.url = url
        self.author = author
        goose = Goose({'stopwords_class': StopWordsChinese})
        article = goose.extract(url=url)
        if article.title == '':
            goose = Goose()
            article = goose.extract(url=url)
        self.title = article.title
        self.summary = article.cleaned_text[:150]
        self.body = article.cleaned_text
Example 3
class ArticleExtractor(object):
    def __init__(self):
        self.g = Goose({'stopwords_class': StopWordsChinese})

    def extractUrl(self, url=None):
        if url is not None:
            return self.g.extract(url=url)
        return None

    def extractHtm(self, html=None):
        if html is not None:
            return self.g.extract(raw_html=html)
        return None
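The class above relies on Goose and StopWordsChinese being imported; a usage sketch with the import paths spelled out as they appear in Example 17:

from goose import Goose
from goose.text import StopWordsChinese

extractor = ArticleExtractor()
article = extractor.extractUrl('http://example.com/zh-article')  # placeholder URL
if article is not None:
    print(article.title)
    print(article.cleaned_text[:150])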
Example 4
def extract(url):
    '''
    Extract the main body text of a web page.
    '''
    g = Goose({'stopwords_class': StopWordsChinese})
    article = g.extract(url=url)
    return article.cleaned_text
Example 5
def createResource(url):
    resolved = resolve(url)
    if resolved is not None:
        url = resolved
    if len(url) > 200:
        print "Los links largos de duckduckgo no funcionan"  # long DuckDuckGo links do not work
        return None
    else:
        r = Resource.objects.filter(url=url)
        if len(r) > 0:
            print "El recurso ya lo tenia"  # we already had this resource
            r = r[0]
        else:
            g = Goose()
            a = g.extract(url=url)
            if a.title is None or a.title == "":
                title = "notitle"
            else:
                title = a.title
            try:
                r = Resource.objects.create(title=title, url=url)
            except:
                print "no ha ido bien"  # it did not go well
                print title
                print url
            print "Creado el recurso para " + url  # created the resource for this url
        return r
Example 6
def get_link_data_task(link_id):
    dbsession = get_link_data_task.dbsession
    services = get_link_data_task.services
    flags = get_link_data_task.flags
    if not flags:
        return
    link = services.link.get_link_by_id(link_id)
    if link is None:
        return
    html = None
    if 'screenshot' in flags:
        data, html = services.screenshot.capture(link.url, 1024, 800)
        # TODO: Investigate if this way of generating the filename can create clashes
        # TODO: Delete the previous file if it exists
        filename = services.file.create(data, str(uuid.uuid4()) + '.png', 'screenshots')
        link.meta['screenshot'] = filename

    if 'html' in flags:
        link.meta['html'] = html if html else requests.get(link.url).text

    # this should move to a service too
    if 'text' in flags or 'title' in flags:
        goose = Goose()
        a = goose.extract(raw_html=html if html else requests.get(link.url).text)
        if 'text' in flags:
            link.meta['text'] = a.cleaned_text

        if 'title' in flags:
            link.meta['title'] = a.title
    dbsession.commit() #  we are outside the web transaction
Example 7
class SoloSpider(CrawlSpider):
    name = "solo"

    rules = (Rule(LinkExtractor(), callback='parse_items', follow=True),)

    def __init__(self, **kw):
        super(SoloSpider, self).__init__(**kw)
        url = kw.get('url') or kw.get('domain')

        self.g = Goose()
        self.url = url
        self.allowed_domains = [url]
        self.start_urls = ['http://www.' + url]

    def parse_items(self, response):

        gooseobj = self.g.extract(response.url)
        fulltext = gooseobj.cleaned_text

        il = ItemLoader(item=SoloItem(), response=response)
        il.default_output_processor = MapCompose(
            lambda v: v.rstrip(),
            lambda v: re.sub(r'[\',|!]', '', v),
            lambda v: re.sub(r'\s+', ' ', v)
        )

        il.add_value('siteurl', parse_base_url(response.url))
        il.add_value('pageurl', response.url)
        il.add_value('text', fulltext.encode('ascii', 'ignore'))
        il.add_xpath('pagetitle', '//title/text()')

        return il.load_item()
Example 8
class SoloSpider(CrawlSpider):
    name = "solo"

    rules = (Rule(LinkExtractor(), callback="parse_items", follow=True),)

    def __init__(self, **kw):
        super(SoloSpider, self).__init__(**kw)
        url = kw.get("url") or kw.get("domain")

        self.g = Goose()
        self.url = url
        self.allowed_domains = [url]
        self.start_urls = ["http://www." + url]

    def parse_items(self, response):

        gooseobj = self.g.extract(response.url)
        fulltext = gooseobj.cleaned_text

        il = ItemLoader(item=SoloItem(), response=response)
        il.default_output_processor = MapCompose(
            lambda v: v.rstrip(), lambda v: re.sub(r"[\',|!]", "", v), lambda v: re.sub(r"\s+", " ", v)
        )

        il.add_value("siteurl", parse_base_url(response.url))
        il.add_value("pageurl", response.url)
        il.add_value("text", fulltext.encode("ascii", "ignore"))
        il.add_xpath("pagetitle", "//title/text()")

        return il.load_item()
Example 9
class GooseAPI:
    def __init__(self, url):
        self.url = url
        self.goose = Goose()
        self.extracted_content = None

    def extract(self):

        self.extracted_content = self.goose.extract(url = self.url)
        return {
            'title': self.extracted_content.title,
            'summary': self.extracted_content.meta_description,
            'content': self.extracted_content.content_html,
            'published_at': self.extracted_content.publish_date,
            'assets': self.images()
        }

    def images(self):
        images = []
        for image in self.extracted_content.images:
            images.append({
                'url': image.src,
                'width': image.width,
                'height': image.height,
                'type': 'image'
            })
        return images
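A usage sketch of GooseAPI (placeholder URL); the keys follow the dict returned by extract() above:

api = GooseAPI('http://example.com/news-item')  # placeholder URL
result = api.extract()
print(result['title'])
print(result['summary'])
for asset in result['assets']:
    print('%s %sx%s' % (asset['url'], asset['width'], asset['height']))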
Example 10
def createResource(url):
    if len(url) > 200:
        print "Los links largos de duckduckgo no funcionan"  # long DuckDuckGo links do not work
        return None
    else:
        r = Resource.objects.filter(url=url)
        if len(r) > 0:
            print "El recurso ya lo tenia"  # we already had this resource
            r = r[0]
        else:
            g = Goose()
            try:
                a = g.extract(url=url)
            except:
                a = None
            if a is None or a.title is None or a.title == "":
                title = "notitle"
            else:
                title = a.title
            try:
                tags = ["one", "two"]
                r = Resource.objects.create(title=title, url=url, status=Resource.ADDED)
                r.tags.add(*tags)  # add each tag separately instead of one combined "one two" tag
            except TypeError as e:
                print e
                print "no ha ido bien"  # it did not go well
                print title
                print url
            print "Creado el recurso para " + url  # created the resource for this url
        return r
Example 11
def parse_input(text, extractor='newspaper'):
    if isinstance(text, str) or isinstance(text, unicode):
        if text.startswith(('http://', 'https://')):
            # Input is a link - need to extract the text from html
            if extractor.lower() == 'goose':
                from goose import Goose
                urlparse = Goose()
                article = urlparse.extract(url=text)
                return unicode_to_ascii(article.cleaned_text)
            else:
                from newspaper import Article
                article = Article(text)
                article.download()
                article.parse()
                return unicode_to_ascii(article.text)
        elif text.endswith('.txt'):
            # Input is a file - need to read it
            textfile = open(text, 'rb')
            article = textfile.read()
            textfile.close()
            return unicode_to_ascii(article)
        else:
            # Input is a string containing the raw text
            return unicode_to_ascii(text)
    else:
        raise ValueError('Input text must be of type str or unicode.')
Example 12
def fetch_content_for_url(url):
    try:
        g = Goose()
        article = g.extract(url=url)
        return article.cleaned_text
    except:
        return ''
Example 13
    def get_article(self, html):

        config = self.getConfig()
        self.parser = config.get_parser()

        g = Goose(config=config)
        return g.extract(url = "http://www.null.com", raw_html = html)
Example 14
def hackers_news():
    total_data = []
    obj = get_context()
    base_url, target_url = obj.urls()
    parsed_source = obj.get_parsed_source(base_url, target_url)
    news_urls = parsed_source.xpath("//table[@id='hnmain']//table//tr[@class='athing']")
    for each_data in news_urls:
        news_url = each_data.xpath(".//td[@class='title']//span[@class='deadmark']//following-sibling::a[1]//@href")
        news_url = "".join(news_url)
        upvotes = each_data.xpath(".//following-sibling::tr[1]//td[@class='subtext']//span//text()")
        upvotes = "".join(upvotes)
        posted_on = each_data.xpath(
            ".//following-sibling::tr[1]//td[@class='subtext']//span//following-sibling::a[2]//text()"
        )
        posted_on = "".join(posted_on)
        comments = each_data.xpath(
            ".//following-sibling::tr[1]//td[@class='subtext']//span//following-sibling::a[3]//text()"
        )
        comments = "".join(comments)
        g = Goose()
        article = g.extract(url=news_url)
        content = article.cleaned_text
        content = " ".join(content.split()).replace("\n", "").replace("\t", "").replace("\r", "")
        try:
            content = content.encode("utf-8").decode("ascii", "ignore").encode("ascii")
        except:
            try:
                content = content.decode("ascii", "ignore").encode("ascii")
            except:
                try:
                    content = content.encode("utf-8")
                except:
                    content = "No news found"
        connection, cursor = obj.get_connection()
        duplicate_query = "SELECT news_url FROM hackers_news WHERE news_url=%s"
        duplicate_values = (news_url,)
        cursor.execute(duplicate_query, duplicate_values)
        duplicate_data = cursor.fetchall()
        if duplicate_data:
            insert_data = "UPDATE hackers_news SET upvotes=%s, comments=%s WHERE news_url=%s"
            values = (upvotes, comments, news_url)
            cursor.execute(insert_data, values)
            connection.commit()
        else:
            try:
                insert_data = (
                    "insert into hackers_news(news_url,news_content,upvotes,posted_on,comments) values(%s,%s,%s,%s,%s)"
                )
                values = (news_url, content, upvotes, posted_on, comments)
                cursor.execute(insert_data, values)
                connection.commit()
            except:
                continue
        cursor.close()
        connection.close()
        total_data.append(
            {"news_url": news_url, "content": content, "upvotes": upvotes, "posted_on": posted_on, "comments": comments}
        )
    context_dict = {"total_data": total_data}
    return context_dict
Example 15
def crawlerWebLink(url):
    g = Goose()
    article = g.extract(url=url)

    print(article.title)
    print(article.meta_description)
    print(article.cleaned_text)
Example 16
    def save(self, *args, **kwargs):
        from goose import Goose
        from text.blob import TextBlob
        g = Goose()
        article = g.extract(url=self.url)
        try:
            b = TextBlob(article.title)
            lang = b.detect_language()
        except:
            lang='en'

        g = Goose({'use_meta_language': False, 'target_language':lang, 'paper_class':'soup'})
        if not self.title:
            self.title = article.title
        if not self.newspaper:
            self.newspaper = article.domain
        if not self.content:
            self.content = article.cleaned_text
        try:
            if article.top_image.src:
                layout = Photo()
                #layout.photo = "images/news/"+str(self.id)+".jpg"
                layout.url = article.top_image.src
                layout.article = self
                layout.save() 
        except:
            pass
        super(Article, self).save()
Example 17
    def process_item(self, item, spider):
        if "pdf_Link" in item:
            pdfName = item["report_name"] + u".pdf"
            PDFPath = os.path.join(PDF_PATH, item["source_name"])
            if not os.path.exists(PDFPath):
                os.makedirs(PDFPath)
            filepath = os.path.join(PDFPath, pdfName)
            try:
                content = self.downloadPDF(item["pdf_Link"], filepath)
                item["report_content"] = content
            except:
                self.jsonInfoStored(item, pdfName)
                log.msg("pdf download failure, information is serializing to json files", level=log.INFO)
        elif "content_Link" in item:
            from goose import Goose
            from goose.text import StopWordsChinese

            try:
                g = Goose({"stopwords_class": StopWordsChinese})
                article = g.extract(url=item["content_Link"])
                content = article.cleaned_text
                del item["content_Link"]
                item["report_content"] = content
            except:
                log.msg("Content extracted failure from page:%s" % item["report_link"], level=log.INFO)
        return item
Example 18
def scrape_category(url, c_label):
	extract_feed_world = "http://pipes.yahoo.com/pipes/pipe.run?_id=a625f9823d9b5c4858865b107dcc2516&_render=json&urlinput1=%s" % urllib.quote_plus(url)
	data_world = urllib2.urlopen(extract_feed_world)
	json_data_world = json.load(data_world)

	for item in json_data_world['value']['items']:
		# link = urllib2.urlopen(item['link'])
		# link = link.geturl()
		if not [x for x, y in enumerate(Categorized_Labeled_Article.objects.all()) if (y.url == item['link'])]:
			try:
				cj = cookielib.CookieJar()
				opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
				request = urllib2.Request(item['link'])
				response = opener.open(request)

				url = response.geturl()
			
				g = Goose()
				article = g.extract(url=url)

				readable_article = article.cleaned_text

				#Save in database
				article = Categorized_Labeled_Article.objects.create(text=readable_article,label=c_label,url=item['link'])
				article.save()
				print article.label

			except (urllib2.HTTPError, UnicodeDecodeError, AttributeError, IOError):
				print "error %s" % item['link']
Example 19
def download_article(url):
    """ Download the html content of a news page

    :param url: news page's url
    :type url: string
    :return: the article data extracted from the page
    :rtype: dict
    """

    article = { 'link': url, 'source': 'crawler_estadao' }
    logger.info("Downloading article: {0}".format(url))

    try:
        response = requests.get(url, timeout=30)
    except Exception as ex:
        logger.exception("Failed to fetch {0}".format(url))
        return None

    extractor = Goose({'use_meta_language': False, 'target_language':'pt'})
    news = extractor.extract(url=url)
    soup = BeautifulSoup(response.text)

    article['link_content'] = compress_content(response.text)
    article['compressed'] = True
    article['language'] = detect_language(response.text)
    article['title'] = extract_title(news)
    article['body_content'] = extract_content(news)
    article['published_time'] = extract_published_time(url, soup)

    return article
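A sketch of consuming the dict returned above (placeholder URL; compress_content, detect_language and the other extract_* helpers are assumed to be defined elsewhere in the crawler):

article = download_article('http://example.com/news/story')  # placeholder URL
if article is not None:
    print(article['title'])
    print(article['published_time'])
    # 'body_content' and 'link_content' hold the extracted and compressed page content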
Example 20
def process_data(threadName, q):
	while not exitFlag:
		queueLock.acquire()
		if not workQueue.empty():
			global Id
			print "%s processing No.%s result page..." % (threadName, Id)
			data = q.get()
			g = Goose()
			resultUrl = data["unescapedUrl"]
			article = g.extract(url = resultUrl)
			item = {}
			item['title'] = data["titleNoFormatting"]
			item['url'] = resultUrl
			item['keyWords'] = keyWords
			item['description'] = article.cleaned_text[:4000]
			if article.top_image:
				item['image'] = article.top_image.src
			else:
				item['image'] = ""
			insert(item)
			Id += 1
			
			queueLock.release()
			
			
		else:
			queueLock.release()
		time.sleep(1)
Example 21
 def GetDesc_goose(self, url) :
     article = "NULL"
     try :
         g = Goose( {'stopwords_class': StopWordsChinese} )
         article = g.extract(url = url)
     except Exception, ex:
         l.Warning("Goose_Crawl Failed %s" % str(ex))
Example 22
def extract(URL):
	"""
	This function extracts the text body of the page at the given URL.

	Return:
		page_title: the value of the <title> html tag
		text_extracted: the extracted body text
		img: top_image url extracted
	"""

	g = Goose()

	text, text_type= _get_html_content_from_url(URL)

	if text_type != 'text/plain':
	#article = g.extract(url=URL)
		article = g.extract(raw_html=text)

		img = ''

		try:
			img = article.top_image.src
		except:
			img = ''
		return (article.title,article.cleaned_text,img)
	else:
		print "it's a plain/text"
		return ('plaintext',text,'n/a')
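Given the docstring above, a caller unpacks the returned tuple like this (placeholder URL; _get_html_content_from_url is assumed to be defined alongside extract):

page_title, text_extracted, img = extract('http://example.com/page')  # placeholder URL
print(page_title)
print(img)
print(text_extracted[:200])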
Example 23
def extract_title(html):
	"""
	 Extract the body title of a web page
	"""
	g = Goose({'enable_image_fetching':False})
	article = g.extract(raw_html=html)
	return article.title
Example 24
    def download_url(self, url):

        url = self.url
        #g = Goose()
        #g = Goose({'browser_user_agent': 'Mozilla', 'parser_class':'soup'})
        g = Goose({'parser_class':'soup'}) #does this parser works for all?

        article = g.extract(url=url)

        self.title = article.title
        self.description = article.meta_description
        self.keywords = article.meta_keywords

        self.content = article.cleaned_text

        self.domain = article.domain

        self.movies = article.movies


        try:
            self.original_image_url = article.top_image.src
        except AttributeError:
            self.original_image_url = ""

        self.favicon_url = article.meta_favicon

        self.final_url = article.final_url

        #test
        self.domain_link = article.tags
Example 25
def categorize(request, article_url):
	
	#load model
	f = open('my_classifier.pickle')
	classif = pickle.load(f)
	f.close()
	print "loaded model"

	#categorize incoming article
	g = Goose()
	article = g.extract(url=article_url)

	#get list of words
	words = dict()

	
	article_text = article.cleaned_text
		
	for word in word_tokenize(article_text):
		words.setdefault(('%s' % word), 0)	
		words[('%s' % word)] += 1

	print "got words!"

	


	classified = classif.classify(words)

	output = ""
	output += "PREDICTED: %s <br>" % classified
	output += "<br><br> %s" % article_text

	return HttpResponse(output)
Example 26
def extract_body(html):
	"""
	 Extract the body text of a web page
	"""
	g = Goose({'enable_image_fetching':False})
	article = g.extract(raw_html=html)
	return article.cleaned_text
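A sketch combining this helper with extract_title from Example 23 on HTML fetched with requests (placeholder URL):

import requests

html = requests.get('http://example.com/article').text  # placeholder URL
print(extract_title(html))
print(extract_body(html)[:200])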
Example 27
File: yay.py Project: vayan/yay
 def on_pubmsg(self, serv, ev):
     canal = ev.target()
     message = ev.arguments()[0].lower()
     if self.channels[canal].has_user("Yppy"):
         return
     url = re.search("(?P<url>https?://[^\s]+)", message)
     if url:
         url = url.group(0)
         try:
             self.lasturl = url
             hostname = urlparse.urlparse(url).hostname
             g = Goose()
             article = g.extract(url=url)
             tinyurl = urllib2.urlopen("http://tinyurl.com/api-create.php?url=" + url).read()
             title = article.title.encode('utf-8')[:70]
             ret = "Title : %s (%s) | %s" % (title, hostname, tinyurl)
             serv.privmsg(canal, ret)
         except:  # todo log error
             e = sys.exc_info()[0]
             print(e)
             return
     if "!sum" in message:
         try:
             response = unirest.post("http://192.81.222.194:1142/api",{}, {"url": self.lasturl})
             print response.body
             for bullet in response.body:
                 serv.privmsg(canal, ("* %s" % (bullet).encode('utf-8')))
         except:  # todo log error
             e = sys.exc_info()[0]
             print(e)
             return
Example 28
 def _article(self):
     """Analyse resource content, return Goose interface"""
     # switch method depending on content_type
     # for pdf, fall back to tesseract if pdf2text yields little
     # (then use the larger result, or maybe a composite)
     g = Goose()
     return g.extract(raw_html=self._decode())
Example 29
class Html_parser(object):
    """
    Use Goose to parse raw HTML and extract the article text.
    """
    def __init__(self,need_stem):
        #set up goose
        config = Configuration()
        config.enable_image_fetching = False
        self._g = Goose(config)
        self._need_stem = need_stem

    def get_text(self,file_path):
        raw_html = ""
        with open(file_path) as f:
            raw_html = f.read()

        if not raw_html:
            return None
        try:
            article = self._g.extract(raw_html = raw_html)
        except lxml.etree.ParserError as e:
            return None

        text = article.title + ".\n" + article.cleaned_text 
        if self._need_stem:
            text = re.sub("\w+",do_stem,text)
            #words = re.findall("\w+",text,re.MULTILINE)
            #w = map(stem,words)
            #text = " ".join(w)
        return text
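A usage sketch of the parser above (the file path is a placeholder; do_stem is assumed to be defined when need_stem is True):

parser = Html_parser(need_stem=False)
text = parser.get_text('pages/example.html')  # placeholder path to a saved HTML file
if text is not None:
    print(text[:200])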
Example 30
def scrape(url):
    """
    Function to request and parse a given URL. Returns only the "relevant"
    text.

    Parameters
    ----------

    url : String.
            URL to request and parse.

    Returns
    -------

    text : String.
            Parsed text from the specified website.

    meta : String.
            Parsed meta description of an article. Usually equivalent to the
            lede.
    """
    logger = logging.getLogger('scraper_log')
    page = requests.get(url)
    g = Goose()
    try:
        article = g.extract(raw_html=page.content)
        text = article.cleaned_text
        meta = article.meta_description
        return text, meta
    #Generic error catching is bad
    except Exception, e:
        print 'There was an error. Check the log file for more information.'
        logger.warning('Problem scraping URL: {}. {}.'.format(url, e))
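A sketch of calling scrape(); on failure it implicitly returns None, so the caller should guard for that (placeholder URL):

result = scrape('http://example.com/story')  # placeholder URL
if result is not None:
    text, meta = result
    print(meta)
    print(text[:200])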
Example 31
def getUrl(item):
	url_name = re.split('&&', item)
	url = url_name[1]
	name = url_name[0]
	print url
	print name
	html_name = name + '.html'
	print html_name
	g = Goose({'stopwords_class': StopWordsChinese})
	article = g.extract(url=url)
	# print article.raw_html
	currentDir = os.getcwd() + '/' + 'pages' + '/' + name
	if not os.path.exists(currentDir):
		os.makedirs(currentDir)
	f = open(currentDir + '/' +html_name,'a')
	f.write(article.raw_html)
	f.close()
	#print article.title

	f_md = open(currentDir + '/' + name + '.md' ,'a')
	f_md.write(article.cleaned_text.encode('utf-8'))
	f_md.close()
Example 32
def articleExtractor():

    url = request.args.get('url', '')

    articleObject = []

    print("Program started ...")

    articleExtractor = Goose()
    article = articleExtractor.extract(url=url)

    # build article content (encode the whole text at once instead of letter by letter)
    articleBody = article.cleaned_text.encode('utf-8', 'ignore')

    articleObject.append(article.title)
    articleObject.append(article.meta_description)
    articleObject.append(articleBody)

    #return article main text
    return jsonify(articleObject)
Example 33
def stockQuery(stockSymbol):
    """ Returns a table with various information about a stock"""
    stockSymbol = stockSymbol.upper()
    end_date = datetime.date.today()
    start_date = datetime.date.today() - datetime.timedelta(days=2)
    seed_url = "https://www.google.com/finance/company_news?q=" + stockSymbol + "&ei=kT3SWJGvNMeguASQ3K7wCw&startdate=" + str(
        start_date) + "&enddate=" + str(end_date) + "&start=1&num=15"
    #seed_url1="https://www.google.com/finance/company_news?q=AAPL&ei=kT3SWJGvNMeguASQ3K7wCw&startdate=2017-03-21&enddate=2017-03-23&start=1&num=15"
    return_list = []

    r = requests.get(seed_url)
    soup = BeautifulSoup(r.content, 'lxml')
    url_list = soup.find_all('a', id='n-cn-')

    for url in url_list:
        table = {
            'title': None,
            'text': None,
            'img_url': None,
            'url': None,
            'publish_date': None
        }
        url = url['href']
        url = url[url.find('url=') + 4:url.find('&cid')]

        g = Goose()
        article = g.extract(url=url)

        try:
            title = article.title
            table['title'] = title
        except Exception, e:
            table['title'] = ""

        try:
            text = article.cleaned_text
            table['text'] = text
        except Exception, e:
            table['text'] = ""
Example 34
def initial_check():
    url_link = "http://fetchrss.com/rss/59549c628a93f872018b4567709026440.xml"
    # get all the links of news title
    links = []
    text = []
    title = []
    rss = feedparser.parse(url_link)

    for post in rss.entries:
        links.append(post.link)
        title.append(post.title_detail.value)
    oldlinks = rssdata.objects.values_list('link', flat=True)
    print("old links are: \n ", oldlinks)
    for i in range(0, len(links)):
        if links[i] not in oldlinks:
            response = get(links[i])
            extractor = Goose()
            article = extractor.extract(raw_html=response.content)
            texts = article.cleaned_text
            news_story = texts.encode('utf-8')
            print("new links:\n", links[i])
            extract(links[i], news_story, title[i])
Example 35
def processURL(url):
    toReturn = {}

    score = svm.compute(url)

    t = lxml.html.parse(url)

    title = t.find(".//title").text

    response = get(url)
    extractor = Goose()
    article = extractor.extract(raw_html=response.content)
    file = article.cleaned_text

    keywords = nlp.generateEntity(file)

    toReturn['title'] = title
    toReturn['score'] = score
    toReturn['keywords'] = keywords
    toReturn['url'] = url

    return json.dumps(toReturn)
Example 36
def download_article(url):
    article = {'link': url, 'source': 'crawler_oglobo'}
    logger.info("Downloading article: {0}".format(url))

    try:
        response = requests.get(url, timeout=30)
    except Exception as ex:
        logger.exception("Failed to fetch {0}. Exception: {1}".format(url, ex))
        return None

    extractor = Goose({'use_meta_language': False, 'target_language': 'pt'})
    news = extractor.extract(url=url)
    soup = BeautifulSoup(response.text)

    article['link_content'] = compress_content(response.text)
    article['compressed'] = True
    article['language'] = detect_language(response.text)
    article['title'] = extract_title(news)
    article['published_time'] = extract_published_time(soup)
    article['body_content'] = extract_content(news)

    return article
Example 37
def retrieve_data_for_link(param):
    logging.debug('retrieve_data_for_link - param = {}'.format(param))
    (full_link, tmp_news_folder) = param
    link = full_link[0]
    google_title = full_link[1]
    link_datetime = full_link[2]
    compliant_filename_for_link = slugify(link)[:50]
    max_len = 100
    if len(compliant_filename_for_link) > max_len:
        logging.debug(
            'max length exceeded for filename ({}). Truncating.'.format(
                compliant_filename_for_link))
        compliant_filename_for_link = compliant_filename_for_link[:max_len]
    pickle_file = '{}/{}.pkl'.format(tmp_news_folder,
                                     compliant_filename_for_link)
    already_fetched = os.path.isfile(pickle_file)
    if not already_fetched:
        try:
            """html = download_html_from_link(link)
            soup = BeautifulSoup(html, 'html.parser')
            content = get_content(soup)
            full_title = complete_title(soup, google_title)
	    """
            goose_client = Goose()
            g_content = goose_client.extract(url=link)
            article = {
                'link': link,
                'title': g_content.title,
                'content': g_content.cleaned_text,
                'meta_description': g_content.meta_description,
                'datetime': link_datetime
            }
            pickle.dump(article, open(pickle_file, 'wb'))
        except Exception as e:
            logging.error(e)
            logging.error(
                'ERROR - could not download article with link {}'.format(link))
            pass
Example 38
def clean_pp_html(url, pp_html):
    """
    Cleans the privacy policy html of html tags
    :param url: the pp url
    :param pp_html: the pp html
    :return: the clean html
    """
    ret_val = ''
    try:
        print("processing the following url {}".format(url))
        tempfile.tempdir = os.getcwd()
        g = Goose()
        ret_val = g.extract(raw_html=pp_html).cleaned_text
    except Exception as e:
        print(e)

    if ret_val == '':
        try:
            soup = BeautifulSoup(pp_html)
            ret_val = soup.body.getText()
        except Exception as ee:
            print(ee)
    return ret_val
Example 39
def download_article(url):
    """ Download the html content of a news page

    :param url: news page's url
    :type url: string
    :return: the article data extracted from the page
    :rtype: dict
    """

    article = {'link': url, 'source': 'crawler_folha_sao_paulo'}
    logger.info("Downloading article: {0}".format(url))

    try:
        response = requests.get(url, timeout=30)
    except Exception as ex:
        logger.exception("Failed to fetch {0}".format(url))
        return None

    extractor = Goose({'use_meta_language': False, 'target_language': 'pt'})
    news = extractor.extract(url=url)
    soup = BeautifulSoup(response.content)

    article['link_content'] = compress_content(response.text)
    article['compressed'] = True
    article['language'] = detect_language(response.text)
    article['title'] = extract_title(news)
    article['category'] = extract_category(url)
    article['published_time'] = extract_published_time(soup)

    content = extract_content(news, soup)

    if len(content) == 2:
        article['link'], article['body_content'] = content
    else:
        article['body_content'] = content

    return article
Example 40
class SoloSpider(CrawlSpider):
    name = "solo"

    rules = (Rule(LinkExtractor(), callback='parse_items', follow=True), )

    def __init__(self, **kw):
        super(SoloSpider, self).__init__(**kw)
        url = kw.get('url') or kw.get('domain')

        self.g = Goose()
        self.url = url
        self.allowed_domains = [url]
        self.start_urls = ['http://www.' + url]
        # self.link_extractor = LinkExtractor()

    def parse_items(self, response):

        # print 'PARSE ITEMS'
        gooseobj = self.g.extract(response.url)
        fulltext = gooseobj.cleaned_text

        il = ItemLoader(item=SoloItem(), response=response)
        il.default_output_processor = MapCompose(
            lambda v: v.rstrip(), lambda v: re.sub(r'[\',|!]', '', v),
            lambda v: re.sub(r'\s+', ' ', v))

        il.add_value('siteurl', self.parse_base_url(response.url))
        il.add_value('pageurl', response.url)
        il.add_value('text', fulltext.encode('ascii', 'ignore'))
        il.add_xpath('pagetitle', '//title/text()')

        yield il.load_item()

    def parse_base_url(self, url):
        url = re.sub(r'((http(s)?://)?(www.)?)', '', url.lower())  # strip head
        # print url.find('/')
        return url[:url.find('/')] if url.find('/') != -1 else url
Example 41
def catchpg(x, dir_to_write, file_id):
    g = Goose()
    print '=== Start ==='
    print x
    try:
        a = g.extract(url=x)

        to_write = a.cleaned_text.replace('\r', '').replace('\n', ' ').strip() + '\n'
        to_write += a.title + '\n'
        to_write += a.top_image.src

        # translate(string.maketrans(string.punctuation, ' ' * len(string.punctuation)))

        if len(to_write.strip()) > 0:
            output = open(dir_to_write + os.sep + str(file_id), 'wb')
            output.write(to_write.encode('utf-8'))
            print 'caught ^_^Y'
        else:
            print 'None -_-!', x
    except Exception as e:
        print e
        print 'Missed -_-!', x

    print '=== End ===\n'
Example 42
def extract_article_content(urls_extracted):

    connection = pymysql.connect(host,
                                 user=user,
                                 port=port,
                                 passwd=password,
                                 db=dbname)
    for url in urls_extracted:
        print url
        g = Goose()
        article = g.extract(url=url)
        article_title = re.sub(
            r'(?mis)[\[\]\!\@\#\$\%\&\*\`\~\^\-\_\"\{\}\:\;\<\>\'\/\\\|\(\)\n\r]*',
            '', article.title).encode('utf-8')

        if len(article_title) == 0:
            article_title = " "
        article_content_1 = re.sub(
            r'(?mis)[\[\]\!\@\#\$\%\&\*\`\~\^\-\_\"\{\}\:\;\<\>\/\'\\\|\(\)\n\r]*',
            '', article.cleaned_text).encode('utf-8')
        if len(article_content_1) == 0:
            article_content_1 = " "

        cursor = connection.cursor()
        # parameterized query so quotes in the title/content cannot break the statement
        sql = ("INSERT INTO article_data(url,title,article_content,added_dt) "
               "VALUES (%s,%s,%s,%s)")
        #try:
        cursor.execute(sql, (url, article_title, article_content_1,
                             datetime.today().strftime("%Y-%m-%d")))
        # Commit your changes in the database
        connection.commit()
        #except:
        #	print "yes"
        #	connection.rollback()

    connection.close()
Example 43
def initial_check():
    print("here")
    url_link = "http://fetchrss.com/rss/5bf76e868a93f84c038b45675bf76e658a93f869028b4567.xml"
    # get all the links of news title
    links = []
    text = []
    title = []
    rss = feedparser.parse(url_link)

    for post in rss.entries:
        links.append(post.link)
        title.append(post.title_detail.value)
    oldlinks = rssdata.objects.values_list('link', flat=True)
    # print oldlinks
    # print links
    for i in range(0, len(links)):
        if links[i] not in oldlinks:
            response = get(links[i])
            extractor = Goose()
            article = extractor.extract(raw_html=response.content)
            texts = article.cleaned_text
            news_story = texts.encode('utf-8')
            # print(news_story)
            extract(links[i], news_story, title[i])
Example 44
def extract_entry_data(url, fetch_images=True):
    """
    Fetch the full content for a feed entry url.

    Args:
        | url (str)    -- the url of the entry.

    Returns:
        | entry_data -- Goose object.
        | str        -- the full text, including html.
    """

    html = _get_html(url)
    g = Goose()
    g.config.enable_image_fetching = fetch_images

    try:
        # Use Goose to extract data from the raw html,
        # Use readability to give us the html of the main document.
        return g.extract(raw_html=html), Document(html).summary()

    except UnicodeDecodeError as e:
        logger.exception('UnicodeDecodeError with html: {0}'.format(html))
        return None, ''
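A sketch of consuming the (Goose article, readability HTML) pair returned above (placeholder URL; _get_html and Document are assumed as in the function):

entry_data, full_html = extract_entry_data('http://example.com/feed-entry', fetch_images=False)  # placeholder URL
if entry_data is not None:
    print(entry_data.title)
    print(entry_data.cleaned_text[:200])
print(len(full_html))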
Example 45
 def getArticle(self, url, raw_html, language=None):
     g = Goose({'stopwords_class': StopWordsChinese})
     article = g.extract(url=url, raw_html=raw_html)
     return article
Example 46
# install Goose https://github.com/grangier/python-goose
#
# Done so far: basic keyword extraction using tagger works.
#
# Concerns about keyword extraction using Tagger library:
# https://github.com/apresta/tagger
# - the dictionary should be built from corpora relevant to the article to be
# 	more effective at attracting attention in an immersive interface
# - TF-IDF is a function provided in the module build_dict... if articles
# 	in collection ever accumulate enough around one subject, use TF-IDF
#
# immediate todos:
# - implement multitag

from goose import Goose
import tagger
import pickle

url = "http://www.theverge.com/2014/9/11/6136443/the-largest-predatory-dinosaur-ever-was-half-duck-half-crocodile"
g = Goose()
article = g.extract(url=url).cleaned_text

weights = pickle.load(open('data/dict.pkl', 'rb'))  # or your own dictionary
mytagger = tagger.Tagger(tagger.Reader(), tagger.Stemmer(),
                         tagger.Rater(weights))
best_3_tags = mytagger(article, 6)
print best_3_tags
Example 47
def get_text(url):
    g = Goose() 
    article = g.extract(url=url)
    with codecs.open(article.link_hash + ".speech", "w", "utf-8-sig") as text_file:
        text_file.write(article.cleaned_text)
Example 48
def run(keyword):
    headers = {'User-agent': "HotJava/1.1.2 FCS"}
    logging.debug('KEYWORD = {}'.format(keyword))
    #generate_articles(keyword)
    safe_keyword = "+".join(keyword.split(" "))
    link = "https://www.quora.com/search?q=%s" % safe_keyword
    session = get_tor_session()
    print(session.get("http://httpbin.org/ip").text)
    response = session.get(link, headers=headers, timeout=20)
    print response.status_code
    if response.status_code != 200:
        pass
    html = response.content
    soup = BeautifulSoup(html, 'html.parser')
    # q_data = {ques_link: [ques_text, [{
    #   external_link, link_text, answer_id, answer_text,
    #   meta_keyword, meta_description, meta_title, image, video, favicon, domain}]]}
    q_data = {}
    print list(soup.find_all("a", {"class": "question_link"}))
    for i in soup.find_all("a", {"class": "question_link"}):
        print i
        print i.text, i['href']
        ques_link = "https://www.quora.com/%s" % i['href']
        ques_response = session.get(ques_link, headers=headers, timeout=20)
        if ques_response.status_code != 200:
            session = get_tor_session()
            ques_response = session.get(ques_link, headers=headers, timeout=20)
        ques_html = ques_response.content
        ques_soup = BeautifulSoup(ques_html, 'html.parser')
        answers = ques_soup.find_all("div", {"class": "AnswerBase"})
        ans_links = ques_soup.find_all("span", {"class": "qlink_container"})
        print list(answers)
        if len(list(answers)) and len(list(ans_links)):
            q_data[i['href']] = {'text': i.text, 'links': []}
        for ans in answers:
            print ans.text
            external_links = ans.find_all("span", {"class": "qlink_container"})
            for e_link in external_links:
                if len(list(e_link.children)):
                    a_link = list(e_link.children)[0]
                    if bool(urlparse.urlparse(a_link['href']).netloc):
                        print a_link['href']
                        link_url = a_link['href']
                        if 'https://www.quora.com/_/redirect' in a_link['href']:
                            try:
                                link_url = filter(lambda y: y[0] == 'url', map(lambda x: x.split("="), urlparse.urlparse(link_url).query.split("&")))[0][1]
                            except:
                                print sys.exc_info()
                        if len(urlparse.urlparse(link_url).path) < 3:
                            pass
                        signal.signal(signal.SIGALRM, g_timeout_handler)
                        signal.alarm(20)
                        try:
                            # got an external link; extract it with Goose
                            goose_client = Goose()
                            g_content = goose_client.extract(url=link_url)
                            q_data[i['href']]['links'].append({
                                'title': g_content.title,
                                'meta_description': g_content.meta_description,
                                'image': g_content.top_image.src
                                    if g_content.top_image else '-',
                                'video': g_content.movies[0].src
                                    if len(g_content.movies) else '-',
                                'favicon': g_content.meta_favicon,
                                'domain': g_content.domain,
                                'a_link': a_link['href'],
                                'a_link_text': a_link.text,
                                'a_link_answer_id': ans.get('id'),
                                'a_link_answer_text': ans.text,
                            })
                        except Exception as ex:
                            if "goose_timeout" in str(ex):  # compare against the message, not the exception object
                                print "Goose Timeout!"
                            else:
                                print "New Error", ex
                            q_data[i['href']]['links'].append({
                                'a_link': a_link['href'],
                                'a_link_text': a_link.text,
                                'a_link_answer_id': ans.get('id'),
                                'a_link_answer_text': ans.text,
                            })
                        finally:
                            signal.alarm(0)
    json_file_n = safe_keyword + ''.join(random.choice(string.ascii_uppercase + string.digits)
                                         for _ in range(5)) + '.json'
    with open(json_file_n, 'w') as json_file:
        json.dump(q_data, json_file)
    return
Example 49
 text_file = open(
     "./headlines/headline" + str(year) + str(month) + ".txt", "w")
 text_file_arti = open(
     "./articles/article" + str(year) + str(month) + ".txt", "w")
 print(str(year) + str(month))
 value = api.query(year, month)
 val = value['response']['docs']
 for v in val:
     for l in lines:
         try:
             if l.lower() in v['headline']['main'].lower():
                 head += (str(count) + " " + v['pub_date'][0:10] + " " +
                          v['headline']['main'] + '\n')
                 response = get(v['web_url'])
                 extractor = Goose()
                 article1 = extractor.extract(raw_html=response.content)
                 text = article1.cleaned_text
                 if text == "":
                     article = article + (str(count) + " " +
                                          v['pub_date'][0:10] + " " +
                                          v['snippet'] + '\n')
                 else:
                     article = article + (
                         str(count) + " " + v['pub_date'][0:10] + " " +
                         (text.encode('utf-8').strip()
                          ).decode('utf-8').strip() + '\n')
                 print(str(count))
                 count = count + 1
                 break
         except:
             pass
Example 50
def generate_feature_matrix(wiki, data, n_concepts=10, **word_concept_params):
    """
    Transforms a given data source to a corresponding feature matrix and label
    vector based on the "Bag of Concepts" model which uses Wikipedia as an
    exogenous knowledge source for Word Sense Disambiguation and as additional
    domain knowledge.

    Contains logging code which is displayed depending on the currently set
    logging level of the root logger.
    :param wiki: WikiIndex instance to some database index
    :param data: data labels loaded using a load_data_source method
    :param n_concepts: number of concepts to use per page.
    :param word_concept_params: word concept parameters to use for generation of concepts.
    :return: Numpy Feature Matrix and Label Vector.
    """

    config = Configuration()
    config.enable_image_fetching = False
    config.use_meta_language = False
    goose = Goose(config)

    results = {}
    concepts = set()

    # Iterate through the data and perform training
    for index, (abs_path, label) in enumerate(data.items()):
        if not os.path.exists(abs_path):
            continue

        with open(abs_path, 'r') as fp:
            html_text = fp.read()

        # Determine relative path using a simple heuristic
        cutoff = abs_path.find('pages/')
        rel_path = abs_path[cutoff + 6:]

        logging.info('\n%d: http://%s' % (index, rel_path[:-3]))
        article = goose.extract(raw_html=html_text)

        if len(article.cleaned_text) > 500:
            logging.info('%s (%s)', article.title, label)

            search_results, terms, query_vector = wiki.word_concepts(
                article.cleaned_text, article.title, **word_concept_params)

            if search_results:
                results[abs_path] = [(sr.page_id, sr.weight)
                                     for sr in search_results[:n_concepts]]

                # Remove any concepts which have a weight of 0
                results[abs_path] = filter(lambda x: x[1] > 0,
                                           results[abs_path])

                for search_result in search_results[:n_concepts]:
                    concepts.add(search_result.page_id)

                logging.info(search_results[:n_concepts])
            else:
                logging.warn('No word concepts returned')
        else:
            logging.info('Document is of insufficient length')

    shape = (len(results), len(concepts))

    concepts_index = dict([(b, a) for (a, b) in enumerate(concepts)])

    feature_matrix = np.zeros(shape=shape)
    label_vector = np.zeros(len(results))

    for i, (abs_path, page_list) in enumerate(results.iteritems()):
        label_vector[i] = 1 if data[abs_path] is not None else 0

        for page_id, weight in page_list:
            j = concepts_index[page_id]
            feature_matrix[i, j] = weight

    return feature_matrix, label_vector
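A sketch of how the function above might be invoked; WikiIndex construction and the data mapping are hypothetical and only illustrate the expected shapes (paths mapped to labels, None for negatives):

wiki_index = WikiIndex('enwiki.db')  # hypothetical constructor for some WikiIndex database
data = {'/tmp/pages/example.com/index.html': 'relevant'}  # hypothetical path-to-label mapping
X, y = generate_feature_matrix(wiki_index, data, n_concepts=10)
print(X.shape, y.shape)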
Example 51
import sys
from goose import Goose
import codecs

filename = sys.argv[1]

try:
    with open("tmp/htmls/" + filename, "rb") as f:
        html = f.read()
except:
    print("No file named as : ", filename)
    sys.exit(0)

g = Goose()
article = g.extract(raw_html=html)

with codecs.open("tmp/texts/" + filename, "w", "utf-8") as g:
    g.write(article.cleaned_text)

print("Finished html-to-text : " + filename)
Example 52
def gooseExample():
    g = Goose()
    url = "http://www.chinadaily.com.cn/a/201712/22/WS5a3c7473a31008cf16da2d9e.html"
    article = g.extract(url=url)
    print(article.title)
    print(article.cleaned_text[:150])
Example 53
    doc = codecs.open(doc_path, encoding='utf-8', mode='r')
    text = doc.read()
    text = text.encode('ascii', 'ignore')
elif doc_type == '-w':
    g = Goose()
    # determine if this is a New York Times url, in which case
    # we cannot use goose alone and must also rely on urllib2
    sites = 'www.(nytimes)|(theonion)'
    if re.search(sites, doc_path):
        print('handling special case')
        # do the nytimes thing
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
        response = opener.open(doc_path)
        raw_html = response.read()
        article = g.extract(raw_html=raw_html)
        text = article.cleaned_text.encode('ascii', 'ignore')
    else:
        # just use goose
        article = g.extract(url=doc_path)
        text = article.cleaned_text.encode('ascii', 'ignore')

# Tokenize the document to be summarized
tok = PunktSentenceTokenizer()

doc = ' '.join(text.strip().split('\n'))
sentences = tok.tokenize(doc)

# Pass tokenized document to tr_func.normalize
# which generates a graph containing vertices
# for each sentence in the document
Example 54
def spider():
    #url = raw_input("Enter a website to crawl articles from: ")
    print "Crawling from KseStocks Business..."
    #nltk.download('stopwords')

    r = requests.get("http://ksestocks.com/NewsCentral/Business_News")

    data = r.text

    soup = BeautifulSoup(data, "lxml")
    dict = {}
    counter = 0

    for link in soup.find_all('a'):
        # print(link.get('href'))
        dict[counter] = link.get('href')
        counter += 1

    print dict

    print "URLs DICTIONARY"
    # print urls_dict
    print "\n\nGoose Beginning from here \n"

    dict_of_validated_urls = {}

    for key, value in dict.iteritems():
        # print dict[key]
        check = validators.url(dict[key])
        # print check
        if check:
            dict_of_validated_urls[key] = value

    ####### Pass URL of article here ##########

    print dict_of_validated_urls
    print len(dict_of_validated_urls)

    keywords = {
        0: "twitter",
        1: "facebook",
        2: "fashion",
        3: "entertainment",
        4: "epaper",
        5: "sport",
        6: "politics",
        7: "images",
        8: "obituary",
        9: "watch-live",
        10: "herald",
        11: "supplements",
        12: "classifieds",
        13: "aurora",
        14: "cityfm",
        15: "#comments",
        16: "expo",
        17: "nnews",
        18: "latest-news",
        19: "category",
        20: "videos",
        21: "tv-shows",
        22: "urdu",
        23: "live",
        24: "php",
        25: "trending",
        26: "privacy",
        27: "about",
        28: "aspx",
        29: "faq",
        30: "talent",
        31: "ratecardon",
        32: "advertise"
    }

    print "Validation\n"

    for key in dict_of_validated_urls.keys():
        for values in keywords.values():
            if values in dict_of_validated_urls[key]:
                print dict_of_validated_urls[key]
                del dict_of_validated_urls[key]
                break

    print len(dict_of_validated_urls)

    print dict_of_validated_urls

    dict_of_articles = {}
    counter = 0

    for key, value in dict_of_validated_urls.iteritems():
        dict_of_articles[counter] = value
        counter += 1

    counter = 0
    print dict_of_articles

    dict_of_cleaned_urls = {}

    for key, value in dict_of_articles.items():
        if value not in dict_of_cleaned_urls.values():
            dict_of_cleaned_urls[key] = value

    print "Clean URLs:"
    print dict_of_cleaned_urls
    text = ""
    filtered_sentence = []

    dict_of_cleaned_articles_and_titles = {}

    cnx = mysql.connector.connect(user='******',
                                  password='******',
                                  host='localhost',
                                  database='articles')
    cursor = cnx.cursor()

    for key in dict_of_cleaned_urls.keys():
        url = dict_of_cleaned_urls[key]
        g = Goose()
        article = g.extract(url=url)
        print article.title
        # dict_of_cleaned_articles_and_titles[article.title]
        print "Title printed"
        print "\n"
        # print article.meta_description
        text = article.cleaned_text
        stop_words = set(stopwords.words('english'))

        word_tokens = word_tokenize(text)

        filtered_sentence = [w for w in word_tokens if not w in stop_words]

        for words in filtered_sentence:
            #filtered_sentence = words.encode('ascii', 'ignore')
            filtered_sentence = words.encode("utf-8")

        print filtered_sentence
        print type(filtered_sentence)
        filtered_sentence = str(filtered_sentence)

        for w in word_tokens:
            if w not in stop_words:
                # filtered_sentence.append(w)
                filtered_sentence += " " + w

        print "Filtered:"
        print filtered_sentence
        print "Text printed"
        # print article.top_image.src

        # dict_of_cleaned_articles_and_titles[article.title] = filtered_sentence

        data = (article.title, filtered_sentence)

        # data = (title, file_text)

        cursor.execute(
            "SELECT Title, COUNT(*) FROM articles_table WHERE Title = %s GROUP BY Title",
            (article.title, ))
        # query =
        msg = cursor.fetchone()
        # check if it is empty and print error
        if not msg:
            cursor.execute(
                "insert into articles_table (Title, Text) values(%s,%s)",
                (data))
            # cursor.execute(add_to_db_query, data)
            cnx.commit()
            print "Added to Database"

    id = "[]"
    delstatmt = "DELETE FROM articles_table WHERE Text = %s"
    cursor.execute(delstatmt, (id, ))
    cnx.commit()
    cursor.close()
    cnx.close()
    #option = raw_input("\nPress q to quit or any other to restart program: ")
    #print "\n"
    #if option == 'q':
    #    exit()
    print "Done Crawling and Cleaned Database!\n"
Example 55
BASE_URL = 'https://www.fxstreet.com/cryptocurrencies/news?q=&hPP=50&idx=FxsIndexPro&p=0&is_v=1'
client_response = Page(BASE_URL)
source = client_response.html
soup = BeautifulSoup(source, 'html.parser')

all_links = soup.find_all('h4', class_='fxs_headline_tiny')
#goose is used to extract the article content
#install goose from the home directory (cd ~) following the steps at
#https://github.com/grangier/python-goose
g = Goose({'browser_user_agent': 'Mozilla', 'parser_class': 'soup'})
#you don't have to close csv file
F = csv.writer(open("fxNewsLink.csv", 'w'))

for elem in all_links:
    link = elem.contents[1]['href']
    article = g.extract(url=link)
    content = article.cleaned_text

    #to get the date of the article we open the link and extract the time element
    response = urlopen(link).read()
    date_soup = BeautifulSoup(response, 'html.parser')
    date = date_soup.find_all('time')[0]['datetime']
    date = dateparser.parse(date)

    timestamp = datetime.now()
    title = article.title
    #encoding was required because ascii/unicode errors popped up
    #the elements used as strings contain commas, and those commas would act as delimiters,
    #so to keep normal commas from being treated as delimiters we use a list and the csv package
    out = [
        link.encode("utf-8"),
Example 56
def tokenize(text):
    min_length = 3
    words = map(lambda word: word.lower(), word_tokenize(text))
    words = [word for word in words if word not in cachedStopWords]
    tokens = (list(map(lambda token: PorterStemmer().stem(token), words)))
    p = re.compile('[a-zA-Z]+')
    filtered_tokens = filter(
        lambda token: p.match(token) and len(token) >= min_length, tokens)
    return filtered_tokens


from goose import Goose

if __name__ == "__main__":
    url = 'http://www.reuters.com/article/global-oil-idUSL3N16408T'
    g = Goose()
    article = g.extract(url=url)
    a = article.cleaned_text
    html_dict = []
    tokenhtml = tokenize(a)
    print(tokenhtml)
    body = ''
    for i in range(0, len(tokenhtml)):
        body += tokenhtml[i] + ' '  # accumulate every token; resetting body inside the loop kept only the last one
    html_dict.append({"label": "0", "text": body})

    sc = SparkContext()
    htmldata = sc.parallelize(html_dict)
    labels = htmldata.map(lambda doc: doc["label"], preservesPartitioning=True)

    tf = HashingTF().transform(
        htmldata.map(lambda doc: doc["text"], preservesPartitioning=True))
Example 57
 def getArticle(self, url, raw_html, language=None):
     config = Configuration()
     config.enable_image_fetching = False
     g = Goose(config=config)
     article = g.extract(url=url, raw_html=raw_html)
     return article
Example 58
def get_texteaser(url):
    g = Goose()
    article = g.extract(url=url)
    response = unirest.post("http://x.textteaser.com/api", {}, {"token": apikey, "text": article.cleaned_text, "title":  article.title})
    print response.body
    return json.dumps(response.body['sentences'])
Example 59
            print '--------------------------------------------'
            print articleno
            print link
            print '--------------------------------------------'
            r = urllib.urlopen(link).read()
            soup = BeautifulSoup(r, "lxml")
            text = soup.find('div', class_='columnLeft')
            if text is None:
                continue
            text = text.find('p')
            date = soup.find('span', class_='timestamp')
            articles.append([
                date.get_text().encode('utf-8'),
                text.get_text().encode('utf-8')
            ])
        else:
            print '--------------------------------------------'
            print articleno
            print link
            print '--------------------------------------------'
            article = goose.extract(url=link)
            date = article.publish_date
            text = article.cleaned_text.encode('utf-8')
            if date is None or text is None:
                continue
            articles.append([date, text])

with open('applefoolarticles.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerows(articles)
Example 60
def crawl_news(news_pool, min_body_len, doc_dir_path, doc_encoding):
    i = 1
    for newslink in news_pool:
        try:
            response = urllib.request.urlopen(newslink, timeout=10)
            html = response.read()
        except Exception as e:
            print("URL-Request-----%s: %s"%(type(e), newslink))
            continue
        try:
            soup = BeautifulSoup(html, 'lxml') # http://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/
            div1 = soup.find('div',class_='qq_mainArea')
            if div1 is not None:
                cmt_id_start = div1.text.find('cmt_id')
                cmt_id_end = div1.text.find('cmt_is_group')
                cmt_id = div1.text[cmt_id_start+9:cmt_id_end]
                cmt_id_end = cmt_id.find(';')
                cmt_id = cmt_id[0:cmt_id_end]
                title = div1.find('h1').text
                time = div1.find('span',class_='a_time').text
                body = div1.find('div',class_='Cnt-Main-Article-QQ').text
            else:
                continue
            try:
                commentlist = getComments(cmt_id, limit*max_iter)
            except:
                commentlist = ["NULL"]        
            try:
                commentnum = getCommentsNum(cmt_id)    
            except:
                commentnum = str(len(commentlist))
        except:
            print("Crawl URL " + newslink + " failed.")
            commentlist = ["NULL"]
            continue
        doc = ET.Element("doc")
        ET.SubElement(doc, "source").text = "Tencent"
        ET.SubElement(doc, "id").text = "%d"%(i)
        ET.SubElement(doc, "url").text = newslink
        ET.SubElement(doc, "title").text = title
        ET.SubElement(doc, "datetime").text = time#time[0:16]
        body_cleaned = re.sub("[A-Za-z0-9\[\`\~\!\@\#\$\^\&\*\(\)\=\|\{\}\'\:\;\'\,\[\]\.\<\>\/\?\~\!\@\#\\\&\*\%]", "", body)
        if len(body_cleaned)/len(body) <= 0.85:
            try:
                g = Goose({'stopwords_class': StopWordsChinese})
                article = g.extract(url=newslink)
                body = article.cleaned_text
                ET.SubElement(doc, "body").text = body
                #title_cleaned = article.title
            except:
                ET.SubElement(doc, "body").text = body_cleaned
                if len(body_cleaned)/len(body) <= 0.5:
                    ET.SubElement(doc, "body").text = "Potential video or image news."
        else:
            print(len(body_cleaned)/len(body))
            ET.SubElement(doc, "body").text = body
        comment = '\r\n'.join(list(commentlist))
        #comment_cleaned = re.sub("[A-Za-z0-9\[\`\~\!\@\#\$\^\&\*\(\)\=\|\{\}\'\:\;\'\,\[\]\.\<\>\/\?\~\!\@\#\\\&\*\%]", "", comment)
        ET.SubElement(doc, "comments").text = comment
        ET.SubElement(doc, "comments_num").text = commentnum
        tree = ET.ElementTree(doc)
        tree.write(doc_dir_path + time.replace(' ','-').replace(':','-') + "_%d.xml"%(i), encoding = doc_encoding, xml_declaration = True)
        i += 1