Code example #1
    def parse_item(self, response):

        self.iter_count += 1

        html = response.body

        # Goose object used to extract data from the page
        goose_extractor = Goose()
        article = goose_extractor.extract(raw_html=html)

        # Check that the page contains (at least) an h2 header with the word
        # 'Examples', to tell whether it is a trope page or not
        if response.css('h2').re('.Examples:.'):
            self.trope_count += 1
            follow = True
            json_file = self.generate_json(article)
            self.create_files(json_file, 'tropo')

            # File used to keep track of the indexed tropes
            #with open(self.final_directory + 'trope_list.txt', 'a+', encoding='utf-8') as fp:
            #    fp.write(response.url+'\n')

        else:
            self.non_trope_count += 1
            if 'Laconic' in response.url:
                print('Found a Laconic!')
                self.laconic_count += 1
                json_file = self.generate_json(article)
                self.create_files(json_file, 'laconic')
            else:
                print('Link ignored! (it was not a trope)')
            follow = False

        # Close the Goose object
        goose_extractor.close()
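
parse_item() above is a Scrapy CrawlSpider callback, so it presumes a surrounding spider class that initialises the counters and provides the generate_json()/create_files() helpers. A minimal sketch of such a host class, assuming the goose3 fork and a TV Tropes crawl (the spider name, domain, start URL and link rule are assumptions, not taken from the original project):

# Hypothetical host spider for the parse_item() callback above; generate_json()
# and create_files() come from the original project and are not reproduced here.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from goose3 import Goose


class TropeSpider(CrawlSpider):
    name = 'tropes'
    allowed_domains = ['tvtropes.org']       # assumed target site
    start_urls = ['https://tvtropes.org/']   # assumed entry point
    rules = (
        Rule(LinkExtractor(allow=r'/pmwiki/'), callback='parse_item', follow=True),
    )

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.iter_count = 0
        self.trope_count = 0
        self.non_trope_count = 0
        self.laconic_count = 0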
Code example #2
def get_paragraphs_GOO(str_text, mode):
    """
    Split text extracted with Goose into a list of paragraphs.
    (The `mode` argument is accepted but not used by this implementation.)
    """
    g = Goose()
    article = g.extract(raw_html=str_text)
    list_paragraphs = re.split("\n\n", article.cleaned_text)
    g.close()
    return list_paragraphs
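
A hypothetical call site; the function above also relies on re and Goose being imported at module level, and mode is ignored by this implementation, so any value works:

# Imports the snippet above depends on, plus an example call.
import re
import requests
from goose3 import Goose

html = requests.get('https://example.com/some-article.html').text  # example URL
for paragraph in get_paragraphs_GOO(html, mode=None):
    print(paragraph[:80])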
Code example #3
def get(url):
    print(url)
    if url == "0":
        return "00000000000000000"
    html = ""
    while html == "":
        driver = webdriver.Chrome(executable_path=r'C:/server_main/cred/chromedriver.exe')
        driver.minimize_window()
        try:
            driver.get(url)
        except Exception as ex:
            print(str(ex))
        #time.sleep(3)
        html = driver.page_source
        if "This site can’t be reached" in html or "No internet" in html:
            print("Reloading")
            html = ""
        driver.quit()

    # ds holds: [unused, cleaned text, title, meta description, meta keywords,
    #            tags, favicon, first .jpg image URL, unused]
    ds = ['', '', '', '', '', '', '', '', '']
    g = Goose()
    data = g.extract(raw_html=html)
    ds[1] = data.cleaned_text
    ds[2] = data.title
    ds[3] = data.meta_description
    ds[4] = data.meta_keywords
    tag = data.tags
    ds[5] = tag
    # Keep at most the first 7 tags
    if tag:
        i = 0
        db = ""
        for t in tag:
            i = i + 1
            if i > 7:   # tag count capped at 7
                print("tags: " + str(i))
                ds[5] = db.rstrip(",").split(",")
                break
            else:
                db += (t + ",")
    ds[6] = data.meta_favicon
    g.close()
    bs = BeautifulSoup(html, 'html.parser')
    images = bs.find_all('img', {'src': re.compile('.jpg')})
    for image in images:
        #print(image['src']+'\n')
        ds[7] = image['src']
        break
    print("get done")
    return ds
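
get() above assumes several module-level imports that the snippet does not show, and the hard-coded executable_path argument belongs to the original author's machine and to older Selenium releases. A sketch of the assumed setup and a hypothetical call:

# Imports assumed by get() above, plus a hypothetical call.
import re
from bs4 import BeautifulSoup
from goose3 import Goose
from selenium import webdriver

ds = get('https://example.com/')
print(ds[2])   # page title
print(ds[3])   # meta description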
Code example #4
def get_text_sanitized_from_url(url):
    logging("Creating a list with the Portuguese stop words.")
    list_stopwords = set(stopwords.words("portuguese"))

    logging("Opening a Goose connection.")
    goose = Goose()

    logging("Fetching the news article from the URL.")
    notice = goose.extract(url)

    # Store the article body text
    text = notice.cleaned_text

    logging("Cleaning invalid characters.")
    # Remove special characters, accents, etc.
    text = unicodedata.normalize("NFKD", text).encode("ASCII", "ignore").decode("ASCII")

    logging("Removing every character that is not a letter.")
    # Keep only letters in the text content
    text = re.sub(r"[^A-Za-z]+", ' ', text)

    logging("Converting the text to lowercase.")
    # Lowercase the text
    text = text.lower()

    logging("Turning the text into an array of words.")
    # Split the text into a list of individual words
    text_words = word_tokenize(text)

    logging("Removing all stop words from the text.")
    # Drop every stop word and every word shorter than 3 characters
    text = [word for word in text_words if word not in list_stopwords and len(word) > 2]

    logging("Joining the sanitized word array into a single space-separated string.")
    # Turn the word list back into a single string
    text = " ".join(text)

    logging("Closing the Goose connection.")
    goose.close()

    return text
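
A hypothetical call site; this assumes that logging() is a project-local print-style helper (the standard-library logging module is not callable like this) and that the NLTK corpora used by stopwords and word_tokenize have been downloaded:

# Imports and one-time NLTK setup assumed by the function above.
import re
import unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from goose3 import Goose

nltk.download('stopwords')
nltk.download('punkt')

clean = get_text_sanitized_from_url('https://example.com/noticia.html')  # example URL
print(clean[:200])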
Code example #5
def get_webdata(url_list):
    """
    :param url_list: List of URLs
    :return: DataFrame with cleaned text extracted from the URLs and their sentiment classifier predictions
    """

    rows = []

    for url in url_list:
        # Getting the html and extracting the metadata
        g = Goose()
        article = g.extract(url=url)
        rows.append(
            {
                'url': url,
                'title': article.title,
                'article_desc': article.meta_description,
                'date_pub': article.publish_date,
                'text': article.cleaned_text
            })
        g.close()

    # Build the frame in one go (DataFrame.append is deprecated/removed in recent pandas)
    webdata_df = pd.DataFrame(rows)

    webdata_df['clean_text'] = webdata_df['text'].pipe(hero.clean)

    sentiment = score_article(webdata_df['clean_text'].to_list())

    webdata_df['Predicted_sentiment'] = sentiment

    # drop() returns a new DataFrame, so the result has to be kept
    webdata_df = webdata_df.drop(['text'], axis=1)

    return webdata_df
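
A hypothetical call site; hero is assumed to be texthero, and score_article() is the project's own sentiment scorer, which the snippet does not show, so a trivial stand-in is defined here purely to make the sketch executable:

# Example call with a single URL (the same article used in code example #6).
import pandas as pd
import texthero as hero
from goose3 import Goose

def score_article(texts):
    # Placeholder sentiment scorer standing in for the original model wrapper.
    return ['neutral'] * len(texts)

urls = ['https://www.cnn.com/2019/06/03/politics/jared-kushner-axios/index.html']
df = get_webdata(urls)
print(df[['title', 'Predicted_sentiment']])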
Code example #6
File: web-scraper.py  Project: amlmike/web-scraper
#!/usr/bin/env python3

import requests
from bs4 import BeautifulSoup

url = "https://www.cnn.com/2019/06/03/politics/jared-kushner-axios/index.html"
resp = requests.get(url)

soup = BeautifulSoup(resp.text, 'html.parser')
print(soup.prettify())

from goose3 import Goose
g = Goose()
article = g.extract(
    url='https://www.cnn.com/2019/06/03/politics/jared-kushner-axios/index.html'
)
print(article.cleaned_text)
g.close()
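
Since the script above already downloads the page with requests, the same HTML can be handed to Goose through raw_html instead of letting Goose fetch the URL a second time (a variation, not part of the original script):

# Variation: reuse the HTML already fetched by requests via raw_html.
g2 = Goose()
article2 = g2.extract(raw_html=resp.text)
print(article2.cleaned_text)
g2.close()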
Code example #7
def extract_articles(i):
    articles = []
    g = Goose()

    with open(filepaths[i]) as f:
        urls = []
        data = json.load(f)
        name = data['Claim_ID']
        query = data['Claim']

        for item in data['Google Results'][0]['results']:
            if 'snopes' not in item['link'] and 'pdf' not in item['link']:
                item['link'] = item['link'].replace('https', 'http')
                urls.append(item['link'])

        if data['Credibility'] == 'false':
            cred = '0'

        else:
            cred = '1'

        urls_google = []
        try:
            for url in search(query, stop=30):
                if 'snopes' not in url:
                    urls_google.append(url)
        except Exception:
            print('Error while running the Google search for the claim')

        for url in urls_google:
            url = url.replace('https', 'http')
            if url not in urls:
                urls.append(url)

    print('At claim .... ', name)
    count = 0
    checkpoint = time.time()
    timeout = 5
    extracted_articles_domains = []
    for url in urls:
        try:
            ## Extracting the articles using goose and extracting the text body and checking overlap with the claim
            article = g.extract(url=url)
            article = article.cleaned_text
            res = is_relevant(query, article)

            ## Checking if the overlap is greater than a threshold value
            if res:
                articles.append(article)
                parsed_uri = urlparse(url)
                domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
                extracted_articles_domains.append(domain)
                count += 1
                checkpoint = time.time()
                filepath = os.path.join(dump_path, name, (str(count) + '.txt'))
                os.makedirs(os.path.dirname(filepath), exist_ok=True)
                with open(filepath, 'w') as out_file:
                    out_file.write(article)

            # if time.time() > checkpoint + timeout:
            # 	print ('Timed-out ....', name)
            # 	break

        ## Checking for connection error of requests
        except requests.exceptions.ConnectionError as e:
            # print ('Some error at file =', name)
            continue

        except Exception:
            # print (sys.exc_info()[0])
            continue

    g.close()
    return articles, cred, name, query, extracted_articles_domains
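
extract_articles() above relies on several module-level names that the snippet does not define. The sketch below lists them with hypothetical values: search is assumed to come from the classic googlesearch package (whose search() accepts stop=), and is_relevant() is the project's own claim/article overlap check, stubbed here only for illustration:

# Hypothetical module-level setup for extract_articles() above.
import json
import os
import time
from urllib.parse import urlparse

import requests
from goose3 import Goose
from googlesearch import search   # assumed source of search(query, stop=...)

filepaths = ['claims/claim_001.json']   # placeholder claim JSON files
dump_path = 'extracted_articles'        # placeholder output directory

def is_relevant(query, article_text):
    # Placeholder overlap check standing in for the original helper.
    claim_words = set(query.lower().split())
    article_words = set(article_text.lower().split())
    return len(claim_words & article_words) >= len(claim_words) // 2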