def parse_news_text(self, page_html: str, url: str) -> dict:
    """Extract the readable article text from raw HTML.

    Returns a dict with the source ``url`` and the cleaned, single-line
    ``text`` of the article.
    """
    if self._extractor is None:
        # Build the Goose extractor lazily on first use and cache it on
        # the instance so repeated calls reuse the same extractor.
        cfg = Configuration()
        cfg.stopwords_class = StopWords
        cfg.strict = False
        self._extractor = Goose(cfg)
    article = self._extractor.extract(raw_html=page_html)
    # Collapse every run of whitespace (newlines, tabs, spaces) to one space.
    collapsed = re.sub(r'\s+', r' ', article.cleaned_text)
    return {'url': url, 'text': collapsed}
def __init__(self, config=None):
    """Initialise with an optional configuration.

    Args:
        config: a ``Configuration`` instance (adopted as-is, matching the
            other constructors in this project), or a dict whose keys
            override attributes on a fresh ``Configuration``; unknown
            keys are silently ignored. ``None`` means pure defaults.

    Raises:
        Exception: if image fetching is enabled but the local storage
            directory cannot be created or is not writable.
    """
    # Fix: a ready-made Configuration used to be silently ignored; adopt
    # it directly, consistent with the sibling constructors.
    if isinstance(config, Configuration):
        self.config = config
    else:
        self.config = Configuration()
    # A dict selectively overrides attributes on the stored configuration.
    if isinstance(config, dict):
        for key, value in config.items():
            if hasattr(self.config, key):
                setattr(self.config, key, value)

    # Nothing further to validate unless image fetching with a local
    # storage path is requested.
    if not self.config.local_storage_path or \
            not self.config.enable_image_fetching:
        return

    # Create the storage directory on demand, then confirm it exists.
    if not os.path.isdir(self.config.local_storage_path):
        os.makedirs(self.config.local_storage_path)
    if not os.path.isdir(self.config.local_storage_path):
        raise Exception(self.config.local_storage_path +
                        " directory does not seem to exist, "
                        "you need to set this for image processing downloads")

    # Prove the directory is writable: create and remove a throwaway file.
    fd, path = mkstemp(dir=self.config.local_storage_path)
    try:
        # Context manager guarantees the descriptor is closed even if the
        # write path fails (the original leaked it on an early exception).
        with os.fdopen(fd, "w"):
            pass
        os.remove(path)
    except IOError:
        raise Exception(self.config.local_storage_path +
                        " directory is not writeble, "
                        "you need to set this for image processing downloads")
def __process_goose(self):
    """Fetch ``self.url`` with Goose and harvest images from its top image.

    Side effects:
        Sets ``self.images`` when a top image with a ``src`` is found.

    Returns:
        The extracted goose3 article, or ``None`` when the HTTP request
        fails with a connection error or read timeout.
    """
    goose_config = Configuration()
    goose_config.browser_user_agent = 'Mozilla 5.0'
    goose_config.enable_image_fetching = True
    g = Goose(config=goose_config)
    try:
        article = g.extract(self.url)
        # Fix: guard against a missing top image before dereferencing
        # .src — pages with no detectable lead image would otherwise
        # raise AttributeError here.
        if article.top_image is not None and article.top_image.src:
            self.images = self.get_all_images_from_example_src(
                article.top_image.src)
    except (requests.exceptions.ConnectionError,
            requests.exceptions.ReadTimeout):
        return None
    return article
def __init__(self, config=None):
    """Build the stored configuration and prepare local image storage.

    ``config`` may be a ready ``Configuration`` (adopted directly), a
    dict of attribute overrides applied to a default ``Configuration``,
    or ``None`` for pure defaults.
    """
    # A Configuration instance is adopted verbatim; anything else starts
    # from the library defaults.
    self.config = config if isinstance(config, Configuration) else Configuration()

    # A dict selectively overrides matching attributes.
    if isinstance(config, dict):
        for name, value in config.items():
            if hasattr(self.config, name):
                setattr(self.config, name, value)

    # Image handling needs both a storage path and fetching enabled;
    # bail out early when either is missing.
    storage = self.config.local_storage_path
    if not (storage and self.config.enable_image_fetching):
        return

    # Create the storage directory on demand, then confirm it exists.
    if not os.path.isdir(storage):
        os.makedirs(storage)
    if not os.path.isdir(storage):
        raise Exception(
            storage + " directory does not seem to exist, "
            "you need to set this for image processing downloads")
def __init__(self, config=None):
    """Initialise configuration, the shared network fetcher, a cleanup
    finalizer, and (optionally) the local image-storage directory.

    Raises:
        Exception: when image fetching is enabled but the storage
            directory cannot be created or written to.
    """
    # Adopt a Configuration instance verbatim; otherwise start from defaults.
    self.config = config if isinstance(config, Configuration) else Configuration()
    # A dict selectively overrides attributes on the stored configuration.
    if isinstance(config, dict):
        for attr, value in config.items():
            if hasattr(self.config, attr):
                setattr(self.config, attr, value)

    # One network connection for the object's lifetime; the weakref
    # finalizer makes close() run even if the caller never calls it.
    self.fetcher = NetworkFetcher(self.config)
    self.finalizer = weakref.finalize(self, self.close)

    # Image processing needs both a storage path and fetching enabled.
    storage = self.config.local_storage_path
    if not storage or not self.config.enable_image_fetching:
        return

    # Create the storage directory on demand, then confirm it exists.
    if not os.path.isdir(storage):
        os.makedirs(storage)
    if not os.path.isdir(storage):
        msg = (
            '{} directory does not seem to exist, you need to set this for '
            'image processing downloads').format(storage)
        raise Exception(msg)

    # Prove the directory is writable: open and delete a throwaway file.
    handle, tmp_path = mkstemp(dir=storage)
    try:
        with os.fdopen(handle, "w"):
            pass
        os.remove(tmp_path)
    except IOError:
        msg = (
            '{} directory is not writeble, you need to set this for image '
            'processing downloads').format(storage)
        raise Exception(msg)
import pickle

import joblib
import numpy as np
import pandas as pd
import requests
from deep_translator import GoogleTranslator
from goose3 import Goose
from goose3.configuration import Configuration, ArticleContextPattern
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split

# Goose configuration used to fetch the text from a link and clean it up.
config = Configuration()
config.known_context_patterns = [
    ArticleContextPattern(attr="class", value="n_text")
]

classes = np.arange(100)

# Load the dataset and map the numeric labels to readable class names.
df = pd.read_csv('files/newDataset.csv')
conversion_dict = {0: 'Real', 1: 'Fake', 2: 'Neutral'}
df['label'] = df['label'].replace(conversion_dict)

# 75/25 train/test split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.25, random_state=7, shuffle=True)
__version__ = '1.0' import pickle from sklearn.model_selection import train_test_split from algorithm.data_preprocess_main import FETCH_DATA, FEATURE_EXTRACTION, PREPROCESS from Utility import Utility_func import sys import os from algorithm.predictive_model import Classify import json import pandas as pd from goose3 import Goose from goose3.configuration import Configuration config = Configuration() config.strict = False # turn of strict exception handling config.browser_user_agent = 'Mozilla 5.0' # set the browser agent string config.http_timeout = 5.05 # set http timeout in seconds with Goose(config) as g: pass def getContent(url): try: content = g.extract(url) rs = { "url": url, "type": None, "title": content.title, "description": content.meta_description
def getConfig(self):
    """Return a goose3 Configuration with image fetching disabled."""
    cfg = Configuration()
    cfg.enable_image_fetching = False
    return cfg
import requests
from CQUPT_Spider.utils.common import get_main_content

# Sample article URLs (one active at a time).
#url = "http://xwzx.cqupt.edu.cn/cqupt_xwzx/news.jsp?id=5U5933J7L60QL982"
#url = "http://yjs.cqupt.edu.cn/info/1006/4248.htm"
url = "http://cxy.cqupt.edu.cn/info/1105/1282.htm"
#url = "http://xylyh.cqupt.edu.cn/info/1009/1786.htm"

g = Goose({'stopwords_class': StopWordsChinese})
resp = requests.get(url)
# Fix: decode the raw bytes with the encoding sniffed from the body.
# The old text -> encode(resp.encoding) -> decode(apparent_encoding)
# round trip duplicated work and raised TypeError when the server sent
# no charset header (resp.encoding is None); resp.content is the
# untouched byte payload, so decoding it directly is the safe equivalent.
html = resp.content.decode(resp.apparent_encoding)
content = g.extract(raw_html=html)
# print(content.cleaned_text)

# Strip markup: clean the parsed document, then take its plain text.
config = Configuration()
doc_clean = DocumentCleaner(config, None)
crawler = Crawler(config)
cleaned_doc = doc_clean.clean(crawler.get_document(html))
no_tags_html = cleaned_doc.text_content()

# Plotting inputs: per-line lengths of the stripped text.
html_content_list = no_tags_html.splitlines()
line_length = [len(line.strip()) for line in html_content_list]
lines = [line.strip() for line in no_tags_html.splitlines()]
main_content, content_lines = get_main_content(lines)
print(main_content)
print(content_lines)
# x-axis covering the detected main-content line span (inclusive).
main_line_x = list(range(content_lines[0], content_lines[1] + 1))
def __init__(self, url):
    """Remember the target URL and build a Goose configuration tuned for
    Chinese pages (browser user agent + Chinese stopword class)."""
    self.url = url
    self.cleaned_text = None
    cfg = Configuration()
    cfg.browser_user_agent = 'Mozilla 5.0'
    cfg.stopwords_class = StopWordsChinese
    self.config = cfg
def extractArticle(url):
    """Download *url* with goose3 and return the extracted article.

    Fix: the Goose instance is now context-managed so its underlying
    network session is closed deterministically instead of leaking until
    garbage collection (goose3's Goose supports the ``with`` protocol).
    """
    from goose3.configuration import Configuration
    config = Configuration()
    # config.local_storage_path = tmp_dir
    with Goose(config) as extractor:
        return extractor.extract(url=url)