Ejemplo n.º 1
0
    def __process_goose(self):
        """Extract the article at ``self.url`` with Goose.

        Image fetching is enabled on the extractor. When the extracted
        article exposes a top image with a non-empty ``src``,
        ``self.images`` is set from
        ``self.get_all_images_from_example_src(src)``.

        Returns:
            The extracted article, or ``None`` when the request fails with
            a ``requests`` connection error or read timeout.
        """
        goose_config = Configuration()
        goose_config.browser_user_agent = 'Mozilla 5.0'
        goose_config.enable_image_fetching = True
        g = Goose(config=goose_config)
        try:
            article = g.extract(self.url)

            # ``top_image`` can be None when the extractor finds no image;
            # read ``src`` defensively so that case does not raise an
            # AttributeError that the handler below would not catch.
            top_image_src = getattr(article.top_image, 'src', None)
            if top_image_src:
                self.images = self.get_all_images_from_example_src(
                    top_image_src)

        except (requests.exceptions.ConnectionError,
                requests.exceptions.ReadTimeout):
            return None
        return article
Ejemplo n.º 2
0
import pickle
from sklearn.model_selection import train_test_split
from algorithm.data_preprocess_main import FETCH_DATA, FEATURE_EXTRACTION, PREPROCESS
from Utility import Utility_func
import sys
import os

from algorithm.predictive_model import Classify
import json
import pandas as pd
from goose3 import Goose
from goose3.configuration import Configuration

# Shared, module-level Goose extractor used by the functions below.
config = Configuration()
config.strict = False  # turn off strict exception handling
config.browser_user_agent = 'Mozilla 5.0'  # set the browser agent string
config.http_timeout = 5.05  # set http timeout in seconds
# Deliberately not created inside a `with` block: the context manager closes
# the instance as soon as the block exits, which would happen at import time,
# before any later g.extract(...) call runs.
g = Goose(config)


def getContent(url):
    try:
        content = g.extract(url)
        rs = {
            "url": url,
            "type": None,
            "title": content.title,
            "description": content.meta_description
        }
    except: