Example #1
def download_article(url):
    """ Download the html content of a news page

    :param url: news page's url
    :type url: string
    :return: dictionary with the article's metadata and content, or None on failure
    :rtype: dict
    """

    article = { 'link': url, 'source': 'crawler_estadao' }
    logger.info("Downloading article: {0}".format(url))

    try:
        response = requests.get(url, timeout=30)
    except Exception as ex:
        logger.exception("Failed to fetch {0}".format(url))
        return None

    extractor = Goose({'use_meta_language': False, 'target_language':'pt'})
    news = extractor.extract(url=url)
    soup = BeautifulSoup(response.text)

    article['link_content'] = compress_content(response.text)
    article['compressed'] = True
    article['language'] = detect_language(response.text)
    article['title'] = extract_title(news)
    article['body_content'] = extract_content(news)
    article['published_time'] = extract_published_time(url, soup)

    return article
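The snippet above depends on several project-specific helpers (compress_content, detect_language, extract_title, extract_content, extract_published_time) that are not shown on this page. As a rough, hypothetical sketch of what the first two might look like (zlib/base64 compression and the langdetect package are assumptions, not this project's actual code):

import base64
import zlib

from langdetect import detect  # assumed third-party dependency


def compress_content(html):
    # Hypothetical helper: compress the raw HTML and base64-encode it for storage.
    return base64.b64encode(zlib.compress(html.encode('utf-8')))


def detect_language(html):
    # Hypothetical helper: best-effort language detection, falling back to 'pt'.
    try:
        return detect(html)
    except Exception:
        return 'pt'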
Example #2
def createResource(url):
    if len(url) > 200:
        print "Long duckduckgo links do not work"
        return None
    else:
        r = Resource.objects.filter(url=url)
        if len(r) > 0:
            print "The resource already existed"
            r = r[0]
        else:
            g = Goose()
            try:
                a = g.extract(url=url)
            except:
                a = None
            if a is None or a.title is None or a.title == "":
                title = "notitle"
            else:
                title = a.title
            try:
                tags = ["one", "two"]
                r = Resource.objects.create(title=title, url=url, status=Resource.ADDED)
                r.tags.add(*tags)  # add each placeholder tag separately
            except TypeError as e:
                print e
                print "Creating the resource did not go well"
                print title
                print url
            print "Created the resource for " + url
        return r
Example #3
def get_page_content(url):
    g = Goose({'stopwords_class': StopWordsChinese})
    try:
        article = g.extract(url=url)
    except Exception, e:
        print e
        article = None
    return article
Example #4
    def get_article(self, html):

        config = self.getConfig()
        self.parser = config.get_parser()

        g = Goose(config=config)
        return g.extract(url = "http://www.null.com", raw_html = html)
Example #5
 def _article(self):
     """Analyse resource content, return Goose interface"""
     # switch method depending on content_type
     # for pdf, fall back to tesseract if pdf2text yields not much
     # (then use the larger, or maybe a composite)
     g = Goose()
     return g.extract(raw_html=self._decode())
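The comments above describe a content-type switch that the snippet itself does not implement. A minimal, hypothetical sketch of that idea follows; _content_type, _raw_bytes, pdf_to_text and ocr_with_tesseract are assumed stand-ins for whatever the project actually uses:

def _article(self):
    """Analyse resource content, return a Goose Article."""
    content_type = self._content_type()  # assumed helper
    if content_type == 'application/pdf':
        # Hypothetical PDF branch: prefer pdftotext output, fall back to
        # OCR via tesseract when it yields almost nothing.
        text = pdf_to_text(self._raw_bytes())             # assumed helper
        if len(text.strip()) < 200:                       # arbitrary threshold
            text = ocr_with_tesseract(self._raw_bytes())  # assumed helper
        # Wrap the recovered text in minimal HTML so callers still get
        # Goose's usual Article object back.
        return Goose().extract(raw_html='<html><body><p>%s</p></body></html>' % text)
    # Default branch: treat the payload as HTML, exactly as the original does.
    return Goose().extract(raw_html=self._decode())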
Example #6
def get_parser(url, tokenizer):
    useragent = ' '.join([
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/52.0.2743.116 Safari/537.36"
    ])

    # Scrape Web Page With HTMLParser and Goose and select the best scrape
    html_parser = HtmlParser.from_url(url, tokenizer)
    article = Goose({'browser_user_agent': useragent})

    # Goose raises IndexError when requesting unfamiliar sites.
    try:
        extract = article.extract(url=url)
    except IndexError:
        extract = article.extract(raw_html=requests.get(url).text)

    goose_parser = PlaintextParser(extract, tokenizer)

    # Aggregate Site Metadata
    meta = {
        k: v
        for (k, v) in extract.infos.items()
        if k not in ('cleaned_text', 'links', 'tweets', 'movies')
    }
    # Select Best Parser
    parser = (
        html_parser
        if len(goose_parser.document.words) < len(html_parser.document.words)
        else goose_parser
    )

    return parser, meta
Example #7
def extract_body(html):
    """
	 Extract the body text of a web page
	"""
    g = Goose({'enable_image_fetching': False})
    article = g.extract(raw_html=html)
    return article.cleaned_text
Example #8
def extract_title(html):
    """
	 Extract the body title of a web page
	"""
    g = Goose({'enable_image_fetching': False})
    article = g.extract(raw_html=html)
    return article.title
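A quick usage sketch for the two helpers above (the URL is a placeholder, and fetching with requests is an assumption about how callers obtain the HTML):

import requests

html = requests.get('https://example.com/some-article').text  # placeholder URL
print(extract_title(html))
print(extract_body(html)[:500])  # first 500 characters of the cleaned body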
Example #9
def createResource(url):
    if resolve(url) is not None:
        url = resolve(url)
    g = Goose()
    a = g.extract(url=url)
    if len(url) > 200:
        print "Long duckduckgo links do not work"
        return None
    else:
        r = Resource.objects.filter(url=url)
        if len(r) > 0:
            print "The resource already existed"
            r = r[0]
        else:
            if a.title is None or a.title == "":
                title = "notitle"
            else:
                title = a.title
            try:
                r = Resource.objects.create(title=title, url=url)
            except:
                print "Creating the resource did not go well"
                print title
                print url
            print "Created the resource for " + url
        return r
Example #10
def article_extractor(url):

    articleObject = []

    print("Program started ...")

    articleExtractor = Goose()
    article = articleExtractor.extract(url=url)

    #build article content: encode the whole cleaned text at once instead
    #of concatenating one character at a time
    articleBody = article.cleaned_text.encode('utf-8', 'ignore')

    #save article content in a file
    f1 = open('./output.txt', 'w+')
    f1.write(article.title + '\n')
    f1.write(article.meta_description + '\n')
    f1.write(articleBody)
    f1.close()

    articleObject.append(article.title)
    articleObject.append(article.meta_description)
    articleObject.append(articleBody)

    return articleObject
Example #11
def get_link_data_task(link_id):
    dbsession = get_link_data_task.dbsession
    services = get_link_data_task.services
    flags = get_link_data_task.flags
    if not flags:
        return
    link = services.link.get_link_by_id(link_id)
    if link is None:
        return
    html = None
    if 'screenshot' in flags:
        data, html = services.screenshot.capture(link.url, 1024, 800)
        # TODO: Investigate if this way of generating filename can create clashes
        # TODO: Delete the previous file if it exist
        filename = services.file.create(data,  str(uuid.uuid4()) + '.png', 'screenshots')
        link.meta['screenshot'] = filename

    if 'html' in flags:
        link.meta['html'] = html if html else requests.get(link.url).text

    # this should move to a service too
    if 'text' in flags or 'title' in flags:
        goose = Goose()
        a = goose.extract(raw_html=html if html else requests.get(link.url).text)
        if 'text' in flags:
            link.meta['text'] = a.cleaned_text

        if 'title' in flags:
            link.meta['title'] = a.title
    dbsession.commit() #  we are outside the web transaction
Example #12
def generate_feature_matrix(data, stemmer, **prune_params):
    config = Configuration()
    config.enable_image_fetching = False
    config.use_meta_language = False
    goose = Goose(config)

    _parser = HTMLParser()

    sr_index = HashedIndex()

    for url_path, label in data.items():

        if os.path.exists(url_path):
            with open(url_path, 'r') as html_file:
                html_text = html_file.read()

            text = unicode(goose.extract(raw_html=html_text).cleaned_text)
            text = _parser.unescape(text)

            for token in word_tokenize(text, stemmer=stemmer):
                sr_index.add_term_occurrence(token, url_path)

    sr_index.prune(**prune_params)

    X = sr_index.generate_feature_matrix(mode='tfidf')

    y = np.zeros(len(sr_index.documents()))
    for index, doc in enumerate(sr_index.documents()):
        y[index] = 0 if data[doc] is None else 1

    return X, y
Example #13
def get_article(url):
    g = Goose()
    article = g.extract(url=url)
    regex = re.compile('[^a-zA-Z]')
    article = regex.sub(' ', article.title)
    article = re.sub(' +', ' ', article)
    return article
Example #14
def SplitArticle(url):
    g = Goose({'stopwords_class': StopWordsChinese})
    article = g.extract(url=url)

    total_words = len(article.cleaned_text)
    current_word = 0
    last_sentence = ''
    sentences_pool = []
    while current_word < total_words:
        sub_article = last_sentence + article.cleaned_text[
            current_word:min(current_word + 100, total_words)]
        complete = (sub_article[-1] == u'。')
        sentences = sub_article.split(u'。')
        for s in range(len(sentences) - 1):
            for sub_s in sentences[s].split('\n'):
                if not sub_s == '':
                    sentences_pool.append(sub_s.encode('utf-8') + '\n')

        if not complete:
            last_sentence = sentences[-1]
        else:
            last_sentence = ''
            for sub_s in sentences[-1].split('\n'):
                if not sub_s == '':
                    sentences_pool.append(sub_s.encode('utf-8') + '\n')

        current_word = min(current_word + 100, total_words)

    return sentences_pool
Example #15
def extract_title(html):
	"""
	 Extract the body title of a web page
	"""
	g = Goose({'enable_image_fetching':False})
	article = g.extract(raw_html=html)
	return article.title
Example #16
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        links = hxs.select("//a/@href").extract()

        # We stored already crawled links in this list
        crawledLinks = []

        # Pattern to check proper link (string pieces are concatenated so no
        # stray whitespace ends up inside the character classes)
        linkPattern = re.compile(
            r"^(?:ftp|http|https):\/\/"
            r"(?:[\w\.\-\+]+:{0,1}[\w\.\-\+]*@)?"
            r"(?:[a-z0-9\-\.]+)(?::[0-9]+)?"
            r"(?:\/|\/(?:[\w#!:\.\?\+=&%@!\-\/\(\)]+)"
            r"|\?(?:[\w#!:\.\?\+=&%@!\-\/\(\)]+))?$")

        for link in links:
            if linkPattern.match(link) and not link in crawledLinks:
                crawledLinks.append(link)
                yield Request(link, self.parse)

        # Goose works better than soup here, and it can also extract images;
        # it can extract text either from a url or from raw html
        g = Goose()
        raw_html = response.body
        article = g.extract(raw_html=raw_html)
        text = article.cleaned_text
        # skip pages where no text could be extracted
        if text.isspace() or not text:
            return

        item = Website()
        item['text'] = text
        item['filename'] = '1.txt'
        yield item
Example #17
    def process_item(self, item, spider):
        if "pdf_Link" in item:
            pdfName = item["report_name"] + u".pdf"
            PDFPath = os.path.join(PDF_PATH, item["source_name"])
            if not os.path.exists(PDFPath):
                os.makedirs(PDFPath)
            filepath = os.path.join(PDFPath, pdfName)
            try:
                content = self.downloadPDF(item["pdf_Link"], filepath)
                item["report_content"] = content
            except:
                self.jsonInfoStored(item, pdfName)
                log.msg("pdf download failure, information is serializing to json files", level=log.INFO)
        elif "content_Link" in item:
            from goose import Goose
            from goose.text import StopWordsChinese

            try:
                g = Goose({"stopwords_class": StopWordsChinese})
                article = g.extract(url=item["content_Link"])
                content = article.cleaned_text
                del item["content_Link"]
                item["report_content"] = content
            except:
                log.msg("Content extracted failure from page:%s" % item["report_link"], level=log.INFO)
        return item
Example #18
def scrape_category(url, c_label):
	extract_feed_world = "http://pipes.yahoo.com/pipes/pipe.run?_id=a625f9823d9b5c4858865b107dcc2516&_render=json&urlinput1=%s" % urllib.quote_plus(url)
	data_world = urllib2.urlopen(extract_feed_world)
	json_data_world = json.load(data_world)

	for item in json_data_world['value']['items']:
		# link = urllib2.urlopen(item['link'])
		# link = link.geturl()
		if not [x for x, y in enumerate(Categorized_Labeled_Article.objects.all()) if (y.url == item['link'])]:
			try:
				cj = cookielib.CookieJar()
				opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
				request = urllib2.Request(item['link'])
				response = opener.open(request)

				url = response.geturl()
			
				g = Goose()
				article = g.extract(url=url)

				readable_article = article.cleaned_text

				#Save in database
				article = Categorized_Labeled_Article.objects.create(text=readable_article,label=c_label,url=item['link'])
				article.save()
				print article.label

			except (urllib2.HTTPError, UnicodeDecodeError, AttributeError, IOError):
				print "error %s" % item['link']
Example #19
 def run(self):
     df = pd.read_csv(LINKS_CSV)
     g = Goose({'stopwords_class': StopWordsChinese})
     df['content'] = df['url'].apply(
         lambda x: g.extract(url=x).cleaned_text)
     with self.output().open('w') as f:
         df.to_json(f, orient='records')
Example #20
def run(index):
    r = urllib2.urlopen(CNN_URL+index).read()
    soup = BeautifulSoup(r,"lxml")
    headlines = []
    links = []
    articles = []
    print ("Start reading news for: "+ index)
    for div in soup.find_all('table', 'wsod_newsTable')[0]:
        for col in div:
            link = col.find('a')['href']
            headline = col.find('a').contents[0]
            g = Goose()
            articles.append(g.extract(url=link).cleaned_text)


    print ("Finished Reading!")
    tokens = []

    for article in articles:
        tokens += word_tokenize(article)
    tokens = filter(lambda word: word not in string.punctuation, tokens)
    result = []
    for word in set(tokens):
        if tokens.count(word)>20 and tokens.count(word)<100:
            result.append((word, tokens.count(word)))

    return result
Example #21
def getproxyip_list(urllist):
    proxyurllist = []
    for url in urllist:
        time.sleep(10)
        g = Goose()
        article = g.extract(url=url)
        soup = BeautifulSoup(article.raw_html, "html.parser")
        proxy_list = soup.find_all("tr")
        for proxyip in proxy_list:
            iplist = []
            all_td = proxyip.find_all("td")
            if all_td:
                for td_line in all_td:
                    val = td_line.text.strip()
                    if val:
                        iplist.append(val)
                    else:
                        if td_line.div:
                            iplist.append(td_line.div["title"])
                        else:
                            iplist.append(u"中国")  # default location: "China"
                proxyurllist.append(iplist)
                print(iplist)

    return proxyurllist
Example #22
def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        print 'message is broken'
        return

    task = msg

    g = Goose()
    article = g.extract(url=task['url'])

    # print article.cleaned_text

    task['text'] = article.cleaned_text

    # # Scraping CNN news
    # text = None
    # if task['source']['id'] == 'cnn':
    #     print "Scraping CNN news"
    #     text = cnn_news_scraper.extractNews(task['url'])
    # else:
    #     print "News source [%s] is not supported." % task['source']['name']
    #
    # task['text'] = text

    dedupe_news_queue_client.sendMessage(task)
Example #23
def sms_ahoy_reply():
    """Respond to incoming messages with a friendly SMS."""
    body = request.values.get('Body', None)
    # Start our response
    with open('news') as data_file:
        current_news = json.load(data_file)
    headlines = current_news['articles']
    message = ""
    if "news" in body or "News" in body:
        headlines = current_news['articles']
        i = 0
        while i < 19:
            message = message + str(i +
                                    1) + ". " + headlines[i]['title'] + "\n"
            i = i + 1
    elif "more" in body:
        i = int(body.split()[0])
        headlines = current_news['articles']
        url = headlines[i - 1]['url']
        g = Goose()
        article = g.extract(url=url)
        message = article.cleaned_text[:1000]
        message2 = article.cleaned_text[1000:][:1000]
        message = message + "..."
    else:
        i = int(body)
        message = ""
        message = headlines[i - 1]['description']
    resp = MessagingResponse()
    # Add a message
    resp.message(message)

    return str(resp)
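For context, sms_ahoy_reply above is written against Flask and the Twilio helper library; a minimal sketch of the app wiring it assumes (the /sms route path and the debug run are assumptions, not shown on this page):

from flask import Flask, request
from twilio.twiml.messaging_response import MessagingResponse

app = Flask(__name__)


@app.route('/sms', methods=['GET', 'POST'])  # assumed endpoint path
def sms_reply():
    # Echo the incoming SMS body back to the sender.
    body = request.values.get('Body', '')
    resp = MessagingResponse()
    resp.message('You said: ' + body)
    return str(resp)


if __name__ == '__main__':
    app.run(debug=True)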
Example #24
def parse_input(text, extractor='newspaper'):
    if isinstance(text, str) or isinstance(text, unicode):
        if text.startswith(('http://', 'https://')):
            # Input is a link - need to extract the text from html
            if extractor.lower() == 'goose':
                from goose import Goose
                urlparse = Goose()
                article = urlparse.extract(url=text)
                return unicode_to_ascii(article.cleaned_text)
            else:
                from newspaper import Article
                article = Article(text)
                article.download()
                article.parse()
                return unicode_to_ascii(article.text)
        elif text.endswith('.txt'):
            # Input is a file - need to read it
            textfile = open(text, 'rb')
            article = textfile.read()
            textfile.close()
            return unicode_to_ascii(article)
        else:
            # Input is a string containing the raw text
            return unicode_to_ascii(text)
    else:
        raise ValueError('Input text must be of type str or unicode.')
Example #25
def download_article(url):
    """ Download the html content of a news page

    :param url: news page's url
    :type url: string
    :return: dictionary with the article's metadata and content, or None on failure
    :rtype: dict
    """

    article = { 'link': url, 'source': 'crawler_estadao' }
    logger.info("Downloading article: {0}".format(url))

    try:
        response = requests.get(url, timeout=30)
    except Exception as ex:
        logger.exception("Failed to fetch {0}".format(url))
        return None

    extractor = Goose({'use_meta_language': False, 'target_language':'pt'})
    news = extractor.extract(url=url)
    soup = BeautifulSoup(response.text)

    article['link_content'] = compress_content(response.text)
    article['compressed'] = True
    article['language'] = detect_language(response.text)
    article['title'] = extract_title(news)
    article['body_content'] = extract_content(news)
    article['published_time'] = extract_published_time(url, soup)

    return article
Example #26
def parsepage(data, link):
    try:
        goo = Goose({'stopwords_class': StopWordsChinese})
        article = goo.extract(raw_html=data)
        return article
    except:
        traceback.print_exc()
Example #27
def parse_input(text, extractor='newspaper'):
    if isinstance(text, str) or isinstance(text, unicode):
        if text.startswith(('http://', 'https://')):
            # Input is a link - need to extract the text from html
            if extractor.lower() == 'goose':
                from goose import Goose
                urlparse = Goose()
                article = urlparse.extract(url=text)
                return unicode_to_ascii(article.cleaned_text)
            else:
                from newspaper import Article
                article = Article(text)
                article.download()
                article.parse()
                return unicode_to_ascii(article.text)
        elif text.endswith('.txt'):
            # Input is a file - need to read it
            textfile = open(text, 'rb')
            article = textfile.read()
            textfile.close()
            return unicode_to_ascii(article)
        else:
            # Input is a string containing the raw text
            return unicode_to_ascii(text)
    else:
        raise ValueError('Input text must be of type str or unicode.')
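A short usage sketch for parse_input above (the URL and file name are placeholders, and unicode_to_ascii is a project helper not shown on this page):

# From a link, using Goose as the extractor
text_from_url = parse_input('https://example.com/article', extractor='goose')

# From a local text file
text_from_file = parse_input('notes.txt')

# From a raw string
text_from_string = parse_input('Some already-extracted article text.')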
Example #28
def grab(location, keywords, publication, publication_date, title):
    goose = Goose()
    try:
        raw_article = goose.extract(url=location)
        description = raw_article.meta_description.encode("utf8")
        article = raw_article.cleaned_text.encode("utf8")
        split_keywords = keywords.split(',')

        summary = pyteaser.SummarizeUrl(location)
        output = json.dumps({
            "title": title,
            "keywords": split_keywords,
            "publication": publication,
            "publication_date": publication_date,
            "description": description,
            "source": location,
            "article": article,
            "summary": summary
        })
        logging.warning('Successfully grabbed through Goose.')
        logging.warning('Location: %s, Publication: %s' %
                        (location, publication))
        return output
    except:
        logging.critical('Unable to get article through Goose.')
        logging.critical('Location: %s, Publication: %s' %
                         (location, publication))
        return None
Example #29
def extract(url):
    '''
    Extract the main text of a web page
    '''
    g = Goose({'stopwords_class': StopWordsChinese})
    article = g.extract(url=url)
    return article.cleaned_text
Example #30
def extract(url):
    '''
    Extract the main text of a web page
    '''
    g = Goose({'stopwords_class': StopWordsChinese})
    article = g.extract(url=url)
    return article.cleaned_text
Example #31
 def GetDesc_goose(self, url) :
     article = "NULL"
     try :
         g = Goose( {'stopwords_class': StopWordsChinese} )
         article = g.extract(url = url)
     except Exception, ex:
         l.Warning("Goose_Crawl Failed %s" % str(ex))
     return article
Example #32
    def download_url(self, url):

        url = self.url
        #g = Goose()
        #g = Goose({'browser_user_agent': 'Mozilla', 'parser_class':'soup'})
        g = Goose({'parser_class': 'soup'})  # does this parser work for all sites?

        article = g.extract(url=url)

        self.title = article.title
        self.description = article.meta_description
        self.keywords = article.meta_keywords

        self.content = article.cleaned_text

        self.domain = article.domain

        self.movies = article.movies


        try:
            self.original_image_url = article.top_image.src
        except AttributeError:
            self.original_image_url = ""

        self.favicon_url = article.meta_favicon

        self.final_url = article.final_url

        #test
        self.domain_link = article.tags
Example #33
def crawlerWebLink(url):
    g = Goose()
    article = g.extract(url=url)

    print(article.title)
    print(article.meta_description)
    print(article.cleaned_text)
Example #34
def get_data(rss, num):

    #pathToCSV = '/Users/Michal/Downloads/dialogflow-java-client-master2/samples/clients/VirtualTradingAssistant/src/main/java/ai/api/examples/fileStore/file.csv'
    #pathToCSV = 'C:\\Users\\ojwoo\\Documents\\Warwick\\CS261\\Coursework\\dialogflow-java-client-master\\samples\\clients\\VirtualTradingAssistant\\src\\main\\java\\ai\\api\\examples\\fileStore\\file.csv'
    #pathToCSV = '/Users/Michal/Desktop/apache-tomcat-8.5.28/bin/misc/file.csv'
    pathToCSV = 'C:\\apache-tomcat-8.5.28\\bin\\misc\\news.csv'

    with open(pathToCSV, 'w') as csvfile:
        wr = csv.writer(csvfile, delimiter='@', quotechar='#')
        index = 0
        for e in rss['entries']:
            if (index == int(num)):
                break

            wr.writerow([(e['title']).encode('utf-8')])
            wr.writerow([(e['link']).encode('utf-8')])

            try:
                g = Goose()
                article = g.extract(url=e['link'])

                cleaned_text = article.cleaned_text

                sent = sentiment(cleaned_text)

                if sent[0] < 0:
                    sent = 50 - (sent[0] * -50)
                else:
                    sent = sent[0] * 50 + 50

                wr.writerow([str(round(sent, 2)) + '%'])
            except TypeError:
                wr.writerow(['Sentiment Unavailable'])

            index = index + 1
Example #35
def crawlerWebLink(url):
    g = Goose()
    article = g.extract(url=url)

    print(article.title)
    print(article.meta_description)
    print(article.cleaned_text)
Example #36
 def getContent(self):
     g = Goose({'browser_user_agent': 'Mozilla', 'parser_class': 'soup'})
     urls = self.getNodeLinks()
     for i, url in enumerate(urls):
         article = g.extract(url=url)
         self.writteFile(i, 'title', article.title)
         self.writteFile(i, 'article', article.cleaned_text)
Example #37
def get_text(article_url):
    goose = Goose()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
    response = opener.open(article_url)
    raw_html = response.read()
    article = goose.extract(raw_html=raw_html)
    return article.cleaned_text
Example #38
def process_data(threadName, q):
	while not exitFlag:
		queueLock.acquire()
		if not workQueue.empty():
			global Id
			print "%s processing No.%s result page..." % (threadName, Id)
			data = q.get()
			g = Goose()
			resultUrl = data["unescapedUrl"]
			article = g.extract(url = resultUrl)
			item = {}
			item['title'] = data["titleNoFormatting"]
			item['url'] = resultUrl
			item['keyWords'] = keyWords
			item['description'] = article.cleaned_text[:4000]
			if article.top_image:
				item['image'] = article.top_image.src
			else:
				item['image'] = ""
			insert(item)
			Id += 1
			
			queueLock.release()
			
			
		else:
			queueLock.release()
		time.sleep(1)
Example #39
 def process_item(self, item, spider):
     if 'pdf_Link' in item:
         pdfName = item['report_name'] + u".pdf"
         PDFPath = os.path.join(PDF_PATH, item['source_name'])
         if not os.path.exists(PDFPath): os.makedirs(PDFPath)
         filepath = os.path.join(PDFPath, pdfName)
         try:
             content = self.downloadPDF(item['pdf_Link'], filepath)
             item["report_content"] = content
         except:
             self.jsonInfoStored(item, pdfName)
             log.msg(
                 "pdf download failure, information is serializing to json files",
                 level=log.INFO)
     elif 'content_Link' in item:
         from goose import Goose
         from goose.text import StopWordsChinese
         try:
             g = Goose({'stopwords_class': StopWordsChinese})
             article = g.extract(url=item['content_Link'])
             content = article.cleaned_text
             del item['content_Link']
             item["report_content"] = content
         except:
             log.msg("Content extracted failure from page:%s" %
                     item['report_link'],
                     level=log.INFO)
     return item
Example #40
def fetch_content_for_url(url):
    try:
        g = Goose()
        article = g.extract(url=url)
        return article.cleaned_text
    except:
        return ''
Example #41
def GoogleSearch(argu):
	url2 = "https://ajax.googleapis.com/ajax/services/search/web?v=1.0&start="

	q = "&q="
	keyWords = argu #"love story taylor"
	startNums = ["0","4","8","12","16"]

	searchResults = []
	print "Start to crawl Google with keyWords: %s" % keyWords


	for num in startNums:
		theUrl = url2 + num + q + keyWords
		f = urllib.urlopen(theUrl)
		j = json.load(f)
		searchResults += j["responseData"]["results"]

	Id = 1
	g = Goose()
	with open("result.dat", "w") as of:
		for obj in searchResults:
			print "Extracting No.%d result page..." % Id
			# print obj["unescapedUrl"]
			resultUrl = obj["unescapedUrl"]
			# print resultUrl
			article = g.extract(url = resultUrl)
			# print article.title
			line = article.title + "|*|" + resultUrl + "|*|" + article.cleaned_text[:4000]
			of.write(str(Id) + "|*|")
			of.write(line.encode('utf-8') + "|**|")
			Id += 1
	print "-----End-----"
	return
Example #42
    def __init__(self,corpus_dir,datastore_type='file',db_name='corpus.db'):
        '''
        Read links and their associated categories for the specified articles
        from a text file, separated by a space

        Args:
            corpus_dir (str): The directory to save the generated corpus
            datastore_type (Optional[str]): Format to save generated corpus.
                                            Specify either 'file' or 'sqlite'.
            db_name (Optional[str]): Name of database if 'sqlite' is selected.
        '''

        self.g = Goose({'browser_user_agent': 'Mozilla','parser_class':'soup'})
        #self.g = Goose({'browser_user_agent': 'Mozilla'})
        self.corpus_dir = corpus_dir
        self.datastore_type = datastore_type
        self.db_name = db_name
        self.stats = defaultdict(int)

        self._create_corpus_dir(self.corpus_dir)

        self.db = None
        if self.datastore_type == 'sqlite':
            self.db = self.corpus_dir + '/' + self.db_name
            self._set_up_db(self.db)
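Only __init__ is shown here, so the class name is not visible on this page. A hypothetical usage sketch, assuming the class is called CorpusBuilder:

# 'CorpusBuilder' is a placeholder for whatever this class is actually named.
file_builder = CorpusBuilder('corpus', datastore_type='file')
sqlite_builder = CorpusBuilder('corpus', datastore_type='sqlite', db_name='corpus.db')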
Example #43
File: yay.py Project: vayan/yay
 def on_pubmsg(self, serv, ev):
     canal = ev.target()
     message = ev.arguments()[0].lower()
     if self.channels[canal].has_user("Yppy"):
         return
     url = re.search("(?P<url>https?://[^\s]+)", message)
     if url:
         url = url.group(0)
         try:
             self.lasturl = url
             hostname = urlparse.urlparse(url).hostname
             g = Goose()
             article = g.extract(url=url)
             tinyurl = urllib2.urlopen("http://tinyurl.com/api-create.php?url=" + url).read()
             title = article.title.encode('utf-8')[:70]
             ret = "Title : %s (%s) | %s" % (title, hostname, tinyurl)
             serv.privmsg(canal, ret)
         except:  # todo log error
             e = sys.exc_info()[0]
             print(e)
             return
     if "!sum" in message:
         try:
             response = unirest.post("http://192.81.222.194:1142/api",{}, {"url": self.lasturl})
             print response.body
             for bullet in response.body:
                 serv.privmsg(canal, ("* %s" % (bullet).encode('utf-8')))
         except:  # todo log error
             e = sys.exc_info()[0]
             print(e)
             return
Example #44
def categorize(request, article_url):
	
	#load model
	f = open('my_classifier.pickle')
	classif = pickle.load(f)
	f.close()
	print "loaded model"

	#categorize incoming article
	g = Goose()
	article = g.extract(url=article_url)

	#get list of words
	words = dict()

	
	article_text = article.cleaned_text
		
	for word in word_tokenize(article_text):
		words.setdefault(('%s' % word), 0)	
		words[('%s' % word)] += 1

	print "got words!"

	


	classified = classif.classify(words)

	output = ""
	output += "PREDICTED: %s <br>" % classified
	output += "<br><br> %s" % article_text

	return HttpResponse(output)
Example #45
def extract_body(html):
	"""
	 Extract the body text of a web page
	"""
	g = Goose({'enable_image_fetching':False})
	article = g.extract(raw_html=html)
	return article.cleaned_text
Example #46
def print_news(url, content='title'):
    #parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    g = Goose()

    article = g.extract(url=url)

    #If there is a meta description available, print that else go for
    #summarize
    if content == 'full' and article.meta_description:
        print(article.meta_description)
        return

    news_text = article.cleaned_text

    parser = PlaintextParser.from_string(news_text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    if content == 'title' or content == 'full':
        #Print article title
        print('\t* ' + str(article.title.encode('ascii', 'ignore')))

    if content == 'full':
        #Print a n-line summary
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)
    return
Example #47
def TF_IDF_url():
    doc_list = []
    bloblist = []
    #The name of book is written on tran_data.txt
    f = open('data/train_data1.txt')
    g = Goose({'stopwords_class': StopWordsKorean})

    lines = f.readlines()
    f.close()

    for line in lines:
        doc_class = Textdoc()
        doc_class.save_title(g.extract(url=line).title)
        doc_class.save_content(g.extract(url=line).cleaned_text)
        doc_list.append(doc_class)
        bloblist.append(doc_class.content)
#		print doc_class.content

    t = 0
    for i, blob in enumerate(bloblist):
        #pprint(get_nouns(blob))
        print("Top words in document {}".format(i + 1))
        scores = {
            word: tfidf(word, blob, bloblist)
            for word in get_nouns(blob)
        }
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        for word, score in sorted_words[:5]:
            doc_list[t].add_word(word, round(score, 5))


#			print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
        t = t + 1

    return doc_list
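Textdoc, get_nouns, and tfidf are project helpers that this listing does not show. Below is a minimal sketch of the TF-IDF part, written against the plain strings that TF_IDF_url keeps in bloblist; the KoNLPy-based get_nouns is an assumption, since the example uses Korean stop words:

import math

from konlpy.tag import Okt  # assumed Korean morphological analyzer


def tf(word, text):
    # term frequency of the word within one document
    words = text.split()
    return words.count(word) / float(len(words))


def n_containing(word, textlist):
    # number of documents in the corpus that contain the word
    return sum(1 for text in textlist if word in text)


def idf(word, textlist):
    # inverse document frequency across the whole corpus
    return math.log(len(textlist) / float(1 + n_containing(word, textlist)))


def tfidf(word, text, textlist):
    return tf(word, text) * idf(word, textlist)


def get_nouns(text):
    # assumed helper: keep only nouns so scoring focuses on content words
    return Okt().nouns(text)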
Example #48
def scrape(url):
    """
    Function to request and parse a given URL. Returns only the "relevant"
    text.

    Parameters
    ----------

    url : String.
            URL to request and parse.

    Returns
    -------

    text : String.
            Parsed text from the specified website.

    meta : String.
            Parsed meta description of an article. Usually equivalent to the
            lede.
    """
    logger = logging.getLogger('scraper_log')
    page = requests.get(url)
    g = Goose()
    try:
        article = g.extract(raw_html=page.content)
        text = article.cleaned_text
        meta = article.meta_description
        return text, meta
    #Generic error catching is bad
    except Exception, e:
        print 'There was an error. Check the log file for more information.'
        logger.warning('Problem scraping URL: {}. {}.'.format(url, e))
Example #49
    def save(self, *args, **kwargs):
        from goose import Goose
        from text.blob import TextBlob
        g = Goose()
        article = g.extract(url=self.url)
        try:
            b = TextBlob(article.title)
            lang = b.detect_language()
        except:
            lang='en'

        g = Goose({'use_meta_language': False, 'target_language': lang, 'parser_class': 'soup'})
        if not self.title:
            self.title = article.title
        if not self.newspaper:
            self.newspaper = article.domain
        if not self.content:
            self.content = article.cleaned_text
        try:
            if article.top_image.src:
                layout = Photo()
                #layout.photo = "images/news/"+str(self.id)+".jpg"
                layout.url = article.top_image.src
                layout.article = self
                layout.save() 
        except:
            pass
        super(Article, self).save()
Example #50
def hackers_news():
    total_data = []
    obj = get_context()
    base_url, target_url = obj.urls()
    parsed_source = obj.get_parsed_source(base_url, target_url)
    news_urls = parsed_source.xpath("//table[@id='hnmain']//table//tr[@class='athing']")
    for each_data in news_urls:
        news_url = each_data.xpath(".//td[@class='title']//span[@class='deadmark']//following-sibling::a[1]//@href")
        news_url = "".join(news_url)
        upvotes = each_data.xpath(".//following-sibling::tr[1]//td[@class='subtext']//span//text()")
        upvotes = "".join(upvotes)
        posted_on = each_data.xpath(
            ".//following-sibling::tr[1]//td[@class='subtext']//span//following-sibling::a[2]//text()"
        )
        posted_on = "".join(posted_on)
        comments = each_data.xpath(
            ".//following-sibling::tr[1]//td[@class='subtext']//span//following-sibling::a[3]//text()"
        )
        comments = "".join(comments)
        g = Goose()
        article = g.extract(url=news_url)
        content = article.cleaned_text
        content = " ".join(content.split()).replace("\n", "").replace("\t", "").replace("\r", "")
        try:
            content = content.encode("utf-8").decode("ascii", "ignore").encode("ascii")
        except:
            try:
                content = content.decode("ascii", "ignore").encode("ascii")
            except:
                try:
                    content = content.encode("utf-8")
                except:
                    content = "No news found"
        connection, cursor = obj.get_connection()
        duplicate_query = "SELECT news_url FROM hackers_news WHERE news_url=%s"
        duplicate_values = (news_url,)
        cursor.execute(duplicate_query, duplicate_values)
        duplicate_data = cursor.fetchall()
        if duplicate_data:
            insert_data = "update hackers_news set upvotes =" + upvotes + ",comments=" + comments + " where news_url=%s"
            values = (news_url,)
            cursor.execute(insert_data, values)
            connection.commit()
        else:
            try:
                insert_data = (
                    "insert into hackers_news(news_url,news_content,upvotes,posted_on,comments) values(%s,%s,%s,%s,%s)"
                )
                values = (news_url, content, upvotes, posted_on, comments)
                cursor.execute(insert_data, values)
                connection.commit()
            except:
                continue
        cursor.close()
        connection.close()
        total_data.append(
            {"news_url": news_url, "content": content, "upvotes": upvotes, "posted_on": posted_on, "comments": comments}
        )
    context_dict = {"total_data": total_data}
    return context_dict
Example #51
def getUrl(item):
    url_name = re.split('&&', item)
    url = url_name[1]
    name = url_name[0]
    print url
    print name
    html_name = name + '.html'
    print html_name
    g = Goose({'stopwords_class': StopWordsChinese})
    article = g.extract(url=url)
    # print article.raw_html
    currentDir = os.getcwd() + '/' + 'pages' + '/' + name
    if not os.path.exists(currentDir):
        os.makedirs(currentDir)
    f = open(currentDir + '/' + html_name, 'a')
    google_transfer = open("transfer.js").read()
    print google_transfer

    f.write(google_transfer + article.raw_html)
    f.close()
    print article.title

    f_md = open(currentDir + '/' + name + '.md', 'a')
    f_md.write(article.cleaned_text.encode('utf-8'))
    f_md.close()
Example #52
def extract(URL):
	"""
	This function extract the page's text body of the given URL.

	Return:
		page_title: the value of the <title> html tag
		text_extracted: the extracted body text
		img: top_image url extracted
	"""

	g = Goose()

	text, text_type= _get_html_content_from_url(URL)

	if text_type != 'text/plain':
		#article = g.extract(url=URL)
		article = g.extract(raw_html=text)

		img = ''

		try:
			img = article.top_image.src
		except:
			img = ''
		return (article.title,article.cleaned_text,img)
	else:
		print "it's a plain/text"
		return ('plaintext',text,'n/a')
Example #53
def HTMLParser(url):
    response = get(url)
    extractor = Goose()
    article = extractor.extract(raw_html=response.content)
    text = article.cleaned_text

    return str(text.encode("ascii", "ignore"))
Example #54
def cmd_readstream(args, t, active_events):

    import textwrap
    from goose import Goose, Configuration
    config = Configuration()
    config.enable_image_fetching = False
    g = Goose(config)
    raw_stream = True

    for arg in args:
        if arg == "articles":
            raw_stream = False

    for event in active_events:
        print event
        if event.query_id.startswith("TS13"):
            corpus = cuttsum.corpora.EnglishAndUnknown2013()
        elif event.query_id.startswith("TS14"):
            corpus = cuttsum.corpora.SerifOnly2014()
        else:
            raise Exception("Bad query id: {}".format(event.query_id))

        if raw_stream is True:
            from cuttsum.trecdata import SCChunkResource
            si_iter = SCChunkResource().streamitem_iter(event, corpus)
        else:
            from cuttsum.pipeline import ArticlesResource
            si_iter = ArticlesResource().streamitem_iter(event, corpus)

        for hour, path, si in si_iter:
            if si.body.clean_visible is not None:
                print si.stream_id
                try:
                    text_height = t.height - 4
                    #n_chars = t.
                    article = g.extract(raw_html=si.body.clean_html)
                    lines = textwrap.wrap(article.cleaned_text)
                    idx = 0
                    while 1:
                        print t.clear
                        print "hour:", hour
                        print "title:", article.title
                        print "article:"
                        print "\n".join(lines[idx:idx + text_height])
                        #print article.cleaned_text

                        with t.cbreak():
                            char = t.inkey()
                            if char == "i" and idx > 0:
                                idx -= 1  #idx - 1 if idx > 0 else 0
                            elif char == "k" and idx + text_height < len(
                                    lines):
                                idx += 1
                            elif char == "l":
                                break

                except Exception, e:
                    print e
                    continue
Example #55
def get_article_parts(html):
    """Take HTML and return extracted headline and body."""
    g = Goose({'use_meta_language': False, 'enable_image_fetching': False})
    try:
        article = g.extract(raw_html=html)
    except:
        return None, None
    return article.title.strip().encode('utf-8'), article.cleaned_text.strip().encode('utf-8')
Example #56
 def GetDesc_goose(self, url) :
     try :
         g = Goose( {'stopwords_class': StopWordsChinese} )
         article = g.extract(url = url)
         return str(article.cleaned_text).replace('\t','').replace('\r','').replace('\b','').replace('"',"'").encode('utf-8')
     except Exception, ex:
         l.Warning("Goose_Crawl Failed %s" % str(ex))
         return "NULL"
Example #57
 def get_full_text(self):
     if self.text_url:
         g = Goose()
         page = g.extract(self.text_url)
         print >> sys.stderr, "Extracting text for %s" % page.title
         self.text = page.cleaned_text
     else:
         print >> sys.stderr, 'No script found for %s' % self.title
Example #58
def goose_extractor_content(url):
	g = Goose()
	article = g.extract(url=url)
	#article.title ## returns article title
	#article.meta_description ## returns article meta description
	#article.top_image.src ## returns path for main image in article
	#article = g.extract(url=url) ## these scrape full text from article
	return article.cleaned_text
Example #59
def cmd_readstream(args, t, active_events):
    
    import textwrap
    from goose import Goose, Configuration
    config = Configuration()
    config.enable_image_fetching = False
    g = Goose(config)
    raw_stream = True

    for arg in args:
        if arg == "articles":
            raw_stream = False

    for event in active_events:
        print event
        if event.query_id.startswith("TS13"):        
            corpus = cuttsum.corpora.EnglishAndUnknown2013()
        elif event.query_id.startswith("TS14"):
            corpus = cuttsum.corpora.SerifOnly2014()
        else:
            raise Exception("Bad query id: {}".format(event.query_id)) 

        if raw_stream is True:
            from cuttsum.trecdata import SCChunkResource
            si_iter = SCChunkResource().streamitem_iter(event, corpus)
        else:
            from cuttsum.pipeline import ArticlesResource
            si_iter = ArticlesResource().streamitem_iter(event, corpus)

        for hour, path, si in si_iter:
            if si.body.clean_visible is not None:
                print si.stream_id
                try:
                    text_height = t.height-4
                    #n_chars = t.
                    article = g.extract(raw_html=si.body.clean_html)
                    lines = textwrap.wrap(article.cleaned_text)
                    idx = 0
                    while 1:
                        print t.clear
                        print "hour:", hour
                        print "title:", article.title
                        print "article:"
                        print "\n".join(lines[idx:idx+text_height])
                        #print article.cleaned_text

                        with t.cbreak():
                            char = t.inkey()
                            if char == "i" and idx > 0:
                                idx -= 1 #idx - 1 if idx > 0 else 0
                            elif char == "k" and idx + text_height < len(lines):
                                idx += 1 
                            elif char == "l":
                                break

                except Exception, e:
                    print e
                    continue