Esempio n. 1
0
    def parse_news_text(self, page_html: str, url: str) -> dict:
        """Extract the article text from raw HTML and collapse its whitespace.

        The Goose extractor is built on first use and cached on
        ``self._extractor`` so later calls reuse it.

        Args:
            page_html: Raw HTML of the news page.
            url: Address of the page; copied unchanged into the result.

        Returns:
            A dict with the keys ``'url'`` and ``'text'``.
        """
        if self._extractor is None:
            goose_config = Configuration()
            goose_config.stopwords_class = StopWords
            goose_config.strict = False
            self._extractor = Goose(goose_config)

        extracted = self._extractor.extract(raw_html=page_html)
        # Every run of whitespace (newlines, tabs, ...) becomes a single space.
        collapsed = re.sub(r'\s+', r' ', extracted.cleaned_text)
        return dict(url=url, text=collapsed)
Esempio n. 2
0
    def __init__(self, config=None):
        """Build the configuration and, if image fetching is on, validate storage.

        Args:
            config: Optional dict of overrides. Each key naming an existing
                attribute of the default ``Configuration`` is copied onto it;
                unknown keys and non-dict values are ignored.

        Raises:
            Exception: If image fetching is enabled with a local storage path
                that is still not a directory after creation was attempted,
                or that cannot be written to.
        """
        self.config = Configuration()
        if isinstance(config, dict):
            for key, value in config.items():
                if hasattr(self.config, key):
                    setattr(self.config, key, value)

        # we don't need to go further if image extractor or local_storage is not set
        storage_path = self.config.local_storage_path
        if not storage_path or not self.config.enable_image_fetching:
            return

        # make sure config.local_storage_path is a directory, creating it if needed
        if not os.path.isdir(storage_path):
            os.makedirs(storage_path)

        if not os.path.isdir(storage_path):
            raise Exception(storage_path +
                            " directory does not seem to exist, "
                            "you need to set this for image processing downloads"
                            )

        # Write a dummy file to check that the directory is writable. mkstemp
        # itself is what fails on a read-only directory, so it has to be inside
        # the try; on Python 3 IOError is just an alias of OSError.
        try:
            handle, path = mkstemp(dir=storage_path)
            with os.fdopen(handle, "w"):
                pass
            os.remove(path)
        except OSError as err:
            raise Exception(storage_path +
                            " directory is not writeble, "
                            "you need to set this for image processing downloads"
                            ) from err
Esempio n. 3
0
    def __process_goose(self):
        """Fetch ``self.url`` with Goose and collect images from its top image.

        When the article has a top image with a source URL, ``self.images`` is
        set from ``self.get_all_images_from_example_src``.

        Returns:
            The extracted article, or ``None`` when the request fails with a
            connection error or a read timeout.
        """
        goose_config = Configuration()
        goose_config.browser_user_agent = 'Mozilla 5.0'
        goose_config.enable_image_fetching = True
        g = Goose(config=goose_config)
        try:
            article = g.extract(self.url)

            # top_image may be None when no image was found for the page, so
            # guard before reading .src to avoid an AttributeError.
            top_image = article.top_image
            if top_image is not None and top_image.src:
                self.images = self.get_all_images_from_example_src(
                    top_image.src)

        except (requests.exceptions.ConnectionError,
                requests.exceptions.ReadTimeout):
            return None
        return article
Esempio n. 4
0
    def __init__(self, config=None):
        """Set up the configuration and make sure the image storage directory exists.

        Args:
            config: Either a ``Configuration`` instance, which is used as-is,
                or a dict whose keys matching existing configuration
                attributes override the defaults. Anything else yields the
                default configuration.

        Raises:
            Exception: If image fetching is enabled and the local storage path
                is still not a directory after trying to create it.
        """
        # A ready-made Configuration is adopted directly; otherwise start from defaults.
        self.config = config if isinstance(config, Configuration) else Configuration()

        # A dict carries overrides; keys the configuration does not know are skipped.
        if isinstance(config, dict):
            for key, value in list(config.items()):
                if not hasattr(self.config, key):
                    continue
                setattr(self.config, key, value)

        # Nothing to validate unless both a storage path and image fetching are set.
        if not (self.config.local_storage_path and self.config.enable_image_fetching):
            return

        # Create the storage directory when it is missing.
        if not os.path.isdir(self.config.local_storage_path):
            os.makedirs(self.config.local_storage_path)

        if os.path.isdir(self.config.local_storage_path):
            return

        raise Exception(
            self.config.local_storage_path +
            " directory does not seem to exist, "
            "you need to set this for image processing downloads")
Esempio n. 5
0
    def __init__(self, config=None):
        """Set up configuration, the network fetcher and image storage checks.

        Args:
            config: Either a ``Configuration`` instance, which is used as-is,
                or a dict whose keys matching existing configuration
                attributes override the defaults. Anything else yields the
                default configuration.

        Raises:
            Exception: If image fetching is enabled and the local storage path
                is still not a directory after trying to create it, or is not
                writable.
        """
        # Use the passed in configuration if it is of the right type, otherwise
        # use the default as a base
        if isinstance(config, Configuration):
            self.config = config
        else:
            self.config = Configuration()

        # if config was a passed in dict, parse it into the stored configuration
        if isinstance(config, dict):
            for key, value in config.items():
                if hasattr(self.config, key):
                    setattr(self.config, key, value)

        # setup a single network connection
        self.fetcher = NetworkFetcher(self.config)
        # NOTE(review): self.close is a bound method and therefore references
        # self; the weakref.finalize docs warn that such a callback keeps the
        # object alive, so this likely only fires at interpreter exit — confirm.
        self.finalizer = weakref.finalize(self, self.close)

        # we don't need to go further if image extractor or local_storage is not set
        if not self.config.local_storage_path or not self.config.enable_image_fetching:
            return

        # make sure config.local_storage_path is a directory, creating it if needed
        if not os.path.isdir(self.config.local_storage_path):
            os.makedirs(self.config.local_storage_path)

        if not os.path.isdir(self.config.local_storage_path):
            msg = (
                '{} directory does not seem to exist, you need to set this for '
                'image processing downloads').format(
                    self.config.local_storage_path)
            raise Exception(msg)

        # Write a dummy file to check that the directory is writable. mkstemp
        # itself is what fails on a read-only directory, so it has to be inside
        # the try; on Python 3 IOError is just an alias of OSError.
        try:
            level, path = mkstemp(dir=self.config.local_storage_path)
            with os.fdopen(level, "w"):
                pass
            os.remove(path)
        except OSError as err:
            msg = (
                '{} directory is not writeble, you need to set this for image '
                'processing downloads').format(self.config.local_storage_path)
            raise Exception(msg) from err
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score
from deep_translator import GoogleTranslator
import requests
from goose3 import Goose
from goose3.configuration import Configuration, ArticleContextPattern
import pickle
import joblib
import numpy as np

# Configuration meant for pulling the text out of a link and cleaning it.
# A context pattern keyed on class="n_text" is registered with it.
config = Configuration()
config.known_context_patterns = [ArticleContextPattern(attr="class", value="n_text")]

classes = np.arange(0, 100)

# Numeric labels in the CSV are replaced by their readable names.
conversion_dict = {
    0: 'Real',
    1: 'Fake',
    2: 'Neutral',
}
df = pd.read_csv('files/newDataset.csv')
df['label'] = df['label'].replace(conversion_dict)

# Hold out a quarter of the rows for testing; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.25, random_state=7, shuffle=True)
Esempio n. 7
0
# Version string of this module.
__version__ = '1.0'

import pickle
from sklearn.model_selection import train_test_split
from algorithm.data_preprocess_main import FETCH_DATA, FEATURE_EXTRACTION, PREPROCESS
from Utility import Utility_func
import sys
import os

from algorithm.predictive_model import Classify
import json
import pandas as pd
from goose3 import Goose
from goose3.configuration import Configuration

config = Configuration()
config.strict = False  # turn off strict exception handling
config.browser_user_agent = 'Mozilla 5.0'  # set the browser agent string
config.http_timeout = 5.05  # set http timeout in seconds
# Keep one long-lived extractor for getContent(). The earlier
# ``with Goose(config) as g: pass`` left the context manager straight away,
# which closes the instance before it is ever used for an extraction.
g = Goose(config)


def getContent(url):
    try:
        content = g.extract(url)
        rs = {
            "url": url,
            "type": None,
            "title": content.title,
            "description": content.meta_description
Esempio n. 8
0
 def getConfig(self):
     """Return a new Configuration with image fetching switched off."""
     goose_settings = Configuration()
     goose_settings.enable_image_fetching = False
     return goose_settings
Esempio n. 9
0
import requests
from CQUPT_Spider.utils.common import get_main_content

# Alternative pages; uncomment one to try it instead of the active url.
#url = "http://xwzx.cqupt.edu.cn/cqupt_xwzx/news.jsp?id=5U5933J7L60QL982"
#url = "http://yjs.cqupt.edu.cn/info/1006/4248.htm"
url = "http://cxy.cqupt.edu.cn/info/1105/1282.htm"
#url = "http://xylyh.cqupt.edu.cn/info/1009/1786.htm"
g = Goose({'stopwords_class': StopWordsChinese})

resp = requests.get(url)
# Re-encode the text with the encoding requests used, then decode it with the
# detected one; the same string feeds both the extractor and the cleaner.
page_html = resp.text.encode(resp.encoding).decode(resp.apparent_encoding)
content = g.extract(raw_html=page_html)
# print(content.cleaned_text)

# Clean up the tags
config = Configuration()
doc_clean = DocumentCleaner(config, None)
crawler = Crawler(config)
elemetn_html = doc_clean.clean(crawler.get_document(page_html))
no_tags_html = elemetn_html.text_content()

# Plotting parameters
html_content_list = no_tags_html.splitlines()
lines = [raw_line.strip() for raw_line in html_content_list]
line_length = [len(stripped) for stripped in lines]
main_content, content_lines = get_main_content(lines)
print(main_content)
print(content_lines)
main_line_x = list(range(content_lines[0], content_lines[1] + 1))
 def __init__(self, url):
     """Remember the target url and prepare a Goose configuration for Chinese text.

     Args:
         url: Address of the page this object works on.
     """
     goose_config = Configuration()
     goose_config.browser_user_agent = 'Mozilla 5.0'
     goose_config.stopwords_class = StopWordsChinese

     self.url = url
     # No text is available until an extraction has been run.
     self.cleaned_text = None
     self.config = goose_config
Esempio n. 11
0
def extractArticle(url):
    """Download and extract the article at *url* with a default Goose setup.

    Args:
        url: Address of the page to fetch.

    Returns:
        The article object produced by ``Goose.extract``.
    """
    from goose3.configuration import Configuration
    config = Configuration()
    # config.local_storage_path = tmp_dir
    # Use the extractor as a context manager so it is closed once extraction
    # finishes, rather than leaving one open extractor behind per call.
    with Goose(config) as extractor:
        return extractor.extract(url=url)