Example #1
	def parse_item(self, response):
	
		self.iter_count += 1
		
		html = response.body
		
		# Goose object to extract data from the page
		goose_extractor = Goose()
		article = goose_extractor.extract(raw_html=html)
		
		# Check that the page contains (at least) one h2 header with the word 'Examples', to tell whether it is a trope or not
		if(response.css('h2').re('.Examples:.')):
			self.trope_count += 1
			follow = True
			json_file = self.generate_json(article)
			self.create_files(json_file, 'tropo')
			
			# File used to keep track of the indexed tropes
			#with open(self.final_directory + 'trope_list.txt', 'a+', encoding='utf-8') as fp:
			#	fp.write(response.url+'\n')
			
		else:
			self.non_trope_count += 1
			if('Laconic' in response.url):
				print('Found a Laconic!')
				self.laconic_count += 1
				json_file = self.generate_json(article)
				self.create_files(json_file, 'laconic')
			else:
				print('Link ignored! (it was not a trope)')
			follow = False
		
		# Close the Goose object
		goose_extractor.close()
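This parse_item assumes a surrounding Scrapy CrawlSpider that defines the counters, the final_directory attribute, and the generate_json/create_files helpers, with parse_item added as one of its methods. A minimal sketch of that scaffolding, assuming a TV Tropes crawl; the class name, start URL, rule, and helper bodies are guesses, not taken from the original example:

# Hypothetical scaffolding around parse_item; class name, start URL, rule, and
# helper bodies are assumptions, not part of the original example.
import json

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class TropesSpider(CrawlSpider):
    name = 'tropes'
    start_urls = ['https://tvtropes.org/']
    rules = (Rule(LinkExtractor(), callback='parse_item', follow=True),)

    iter_count = 0
    trope_count = 0
    non_trope_count = 0
    laconic_count = 0
    final_directory = 'output/'

    def generate_json(self, article):
        # Serialize the fields extracted by Goose into a JSON string
        return json.dumps({'title': article.title, 'text': article.cleaned_text})

    def create_files(self, json_file, kind):
        # Write one JSON file per page, grouped by kind ('tropo' or 'laconic')
        path = '{}{}_{}.json'.format(self.final_directory, kind, self.iter_count)
        with open(path, 'w', encoding='utf-8') as fp:
            fp.write(json_file)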
Example #2
def get_paragraphs_GOO(str_text, mode):
  """
  using Goose
  """
  g = Goose()
  article = g.extract(raw_html=str_text)
  list_paragraphs = re.split("\n\n",article.cleaned_text)
  g.close()
  return list_paragraphs
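# Hypothetical usage of get_paragraphs_GOO: the URL is a placeholder and mode is
# passed as None because the function above never reads it. The snippet itself
# also assumes "import re" and a Goose import (goose or goose3) at module level.
import requests

html = requests.get("https://example.com/article").text
paragraphs = get_paragraphs_GOO(html, mode=None)
print(len(paragraphs), "paragraphs extracted")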
def get(url):
    print(url)
    if url == "0":
        return "00000000000000000"
    html = ""
    while (html == ""):
        driver = webdriver.Chrome(executable_path=r'C:/server_main/cred/chromedriver.exe')
        driver.minimize_window()
        try:
            driver.get(url)
        except Exception as ex:
            print(str(ex))
        #time.sleep(3)
        html = driver.page_source
        if "This site can’t be reached" in html or "No internet" in html:
            print ("Reloading")
            html = ""
        driver.quit()
            
    # ds collects the extracted fields: 1 = cleaned text, 2 = title, 3 = meta description,
    # 4 = meta keywords, 5 = tags, 6 = favicon, 7 = first .jpg image URL (slots 0 and 8 unused)
    ds = ['', '', '', '', '', '', '', '', '']
    g = Goose()
    data = g.extract(raw_html=html)
    ds[1] = data.cleaned_text 
    ds[2] = data.title
    ds[3] = data.meta_description
    ds[4] = data.meta_keywords
    tag = data.tags
    ds[5] = tag
    if "[]" not in ds[5]:
        i=0
        db=""
        for t in tag:
            i=i+1
            if i>7:   #tags count now 7
              print ("tags: "+str(i))
              ds[5]=db.split(",")
              break
            else:
                db+=(t+",")
    ds[6] = data.meta_favicon
    g.close()
    bs = BeautifulSoup(html, 'html.parser')
    images = bs.find_all('img', {'src': re.compile(r'\.jpg')})
    for image in images: 
        #print(image['src']+'\n')
        ds[7] = (image['src'])
        break
    print ("get done")
    return ds
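A quick call sketch, assuming a local chromedriver at the path hard-coded above and that selenium, Goose, BeautifulSoup, and re are imported at module level; the URL is a placeholder.

# Hypothetical usage; the URL is a placeholder, not from the original example
ds = get("https://example.com/article")
print("title:", ds[2])
print("description:", ds[3])
print("first image:", ds[7])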
Example #4
def get_text_sanitized_from_url(url):
    logging("Criando uma lista com as stop words da língua Portuguesa.")
    list_stopwords = set(stopwords.words("portuguese"))

    logging("Abrindo conexão com o Gooose.")
    goose = Goose()

    logging("Buscando a notícia com base na URL.")
    notice = goose.extract(url)

    # Seto o texto da materia
    text = notice.cleaned_text

    logging("Limpando caracteres invalidos.")
    # Remove caracter especial, acentuação, etc
    text = str(unicodedata.normalize("NFKD", text).encode("ASCII", "ignore"))

    logging("Removendo todo caracter na qual não seja LETRA.")
    # Mantem somente letras no conteudo do texto
    text = re.sub(r"[^A-Za-z]+", ' ', text)

    logging("Convertendo o texto em minusculo.")
    # Seto o texto para minusculo
    text = text.lower()

    logging("Transformando o texto em um array de palavras.")
    # Transforma o texto em uma lista separando palavra por palavra
    text_words = word_tokenize(text)

    logging("Removendo do texto todas as stops words.")
    # Remove todas as stops words do texto
    text = [word for word in text_words if not word in list_stopwords and len(word) > 2]

    logging("Convertando o array de palavras higienizados em uma só string separando por espaço.")
    # Transforma a lista de palavras em uma unica string

    text = (" ").join(text)

    logging("Fechando conexão com o Gooose.")
    goose.close()

    return text
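A hedged usage sketch for this sanitizer: the URL is a placeholder, the module-level imports (Goose, stopwords, word_tokenize, unicodedata, re) are assumed to be in place, logging is assumed to be a custom log helper rather than the standard module, and the NLTK data it relies on must be downloaded first.

# Hypothetical usage; the URL is a placeholder, not from the original example
import nltk

nltk.download("stopwords")   # corpus used by stopwords.words("portuguese")
nltk.download("punkt")       # tokenizer models used by word_tokenize

clean_text = get_text_sanitized_from_url("https://example.com/noticia")
print(clean_text[:200])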
Example #5
def get_webdata(url_list):
    """

    :param url_list: List of URLs
    :return: Dataframe with cleaned text extracted from the URLS and their Sentiment Classifier predictions

    """

    webdata_df = pd.DataFrame()

    for url in url_list:
        # Getting html and Extracting the Metadata

        g = Goose()
        article = g.extract(url=url)
        webdata_df = webdata_df.append(
            {
                'url': url,
                'title': article.title,
                'article_desc': article.meta_description,
                'date_pub': article.publish_date,
                'text': article.cleaned_text
            },
            ignore_index=True)

        g.close()

    webdata_df['clean_text'] = (webdata_df['text'].pipe(hero.clean))

    sentiment = score_article(webdata_df['clean_text'].to_list())

    webdata_df['Predicted_sentiment'] = sentiment

    webdata_df = webdata_df.drop(['text'], axis=1)

    return webdata_df
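A minimal call sketch, assuming score_article and the texthero import (hero) from the surrounding module are available; the URLs are placeholders.

# Hypothetical usage; URLs are placeholders, not from the original example
urls = [
    "https://example.com/markets-rally",
    "https://example.com/earnings-miss",
]
df = get_webdata(urls)
print(df[['title', 'date_pub', 'Predicted_sentiment']])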
Example #6
#!/usr/bin/env python3

import requests
from bs4 import BeautifulSoup

url = "https://www.cnn.com/2019/06/03/politics/jared-kushner-axios/index.html"
resp = requests.get(url)

soup = BeautifulSoup(resp.text, 'html.parser')
print(soup.prettify())

from goose3 import Goose
g = Goose()
article = g.extract(
    url='https://www.cnn.com/2019/06/03/politics/jared-kushner-axios/index.html'
)
print(article.cleaned_text)
g.close()
def extract_articles(i):
    articles = []
    g = Goose()

    with open(filepaths[i]) as f:
        urls = []
        data = json.load(f)
        name = data['Claim_ID']
        query = data['Claim']

        for item in data['Google Results'][0]['results']:
            if 'snopes' not in item['link'] and 'pdf' not in item['link']:
                item['link'] = item['link'].replace('https', 'http')
                urls.append(item['link'])

        if data['Credibility'] == 'false':
            cred = '0'

        else:
            cred = '1'

        urls_google = []
        try:
            for url in search(query, stop=30):
                if 'snopes' not in url:
                    urls_google.append(url)
        except Exception:
            print('Error while fetching Google search results')

        for url in urls_google:
            url = url.replace('https', 'http')
            if url not in urls:
                urls.append(url)

    print('At claim .... ', name)
    count = 0
    checkpoint = time.time()
    timeout = 5
    extracted_articles_domains = []
    for url in urls:
        try:
            ## Extract the article with Goose, take its cleaned text body, and check its overlap with the claim
            article = g.extract(url=url)
            article = article.cleaned_text
            res = is_relevant(query, article)

            ## Checking if the overlap is greater than a threshold value
            if res:
                articles.append(article)
                parsed_uri = urlparse(url)
                domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
                extracted_articles_domains.append(domain)
                count += 1
                checkpoint = time.time()
                filepath = os.path.join(dump_path, name, (str(count) + '.txt'))
                os.makedirs(os.path.dirname(filepath), exist_ok=True)
                with open(filepath, 'w') as f:
                    f.write(article)

            # if time.time() > checkpoint + timeout:
            # 	print ('Timed-out ....', name)
            # 	break

        ## Checking for connection error of requests
        except requests.exceptions.ConnectionError as e:
            # print ('Some error at file =', name)
            continue

        except:
            # print (sys.exc_info()[0])
            continue

    g.close()
    return articles, cred, name, query, extracted_articles_domains
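extract_articles depends on an is_relevant(query, article) helper that is not shown; the comments only describe it as an overlap check against a threshold. A token-overlap sketch under that assumption (the threshold value and the tokenization are guesses, not the original implementation):

# Hypothetical is_relevant: fraction of claim tokens that also occur in the article text.
# Threshold and tokenization are assumptions, not the original implementation.
def is_relevant(query, article, threshold=0.5):
    query_tokens = set(query.lower().split())
    article_tokens = set(article.lower().split())
    if not query_tokens:
        return False
    overlap = len(query_tokens & article_tokens) / len(query_tokens)
    return overlap >= threshold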