def parse_item(self, response):
    self.iter_count += 1
    html = response.body
    # Goose object used to extract data from the page
    goose_extractor = Goose()
    article = goose_extractor.extract(raw_html=html)
    # Check that the page contains (at least) one h2 header with the word 'Examples',
    # to decide whether it is a trope or not
    if response.css('h2').re('.Examples:.'):
        self.trope_count += 1
        follow = True
        json_file = self.generate_json(article)
        self.create_files(json_file, 'tropo')
        # File used to keep track of the indexed tropes
        #with open(self.final_directory + 'trope_list.txt', 'a+', encoding='utf-8') as fp:
        #    fp.write(response.url + '\n')
    else:
        self.non_trope_count += 1
        if 'Laconic' in response.url:
            print('Found a Laconic!')
            self.laconic_count += 1
            json_file = self.generate_json(article)
            self.create_files(json_file, 'laconic')
        else:
            print('Link ignored! (it was not a trope)')
            follow = False
    # Close the goose object
    goose_extractor.close()
def get_paragraphs_GOO(str_text, mode):
    """ using Goose """
    g = Goose()
    article = g.extract(raw_html=str_text)
    list_paragraphs = re.split("\n\n", article.cleaned_text)
    g.close()
    return list_paragraphs
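A minimal usage sketch for get_paragraphs_GOO, assuming the function above is defined in the same module. The imports shown are the ones the function itself relies on; requests and the example URL are illustrative additions, not part of the original snippet, and the mode argument is unused by the function.

# Usage sketch (assumptions: requests is available and the URL is a placeholder)
import re
import requests
from goose3 import Goose  # required by get_paragraphs_GOO above

html = requests.get("https://example.com/some-article").text  # hypothetical URL
paragraphs = get_paragraphs_GOO(html, mode=None)  # 'mode' is ignored by the function
for p in paragraphs[:3]:
    print(p[:80])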
def get(url):
    print(url)
    if url == "0":
        return "00000000000000000"
    html = ""
    # Keep reloading the page until Chrome returns something other than an error page
    while html == "":
        driver = webdriver.Chrome(executable_path=r'C:/server_main/cred/chromedriver.exe')
        driver.minimize_window()
        try:
            driver.get(url)
        except Exception as ex:
            print(str(ex))
        # time.sleep(3)
        html = driver.page_source
        if "This site can’t be reached" in html or "No internet" in html:
            print("Reloading")
            html = ""
        driver.quit()
    # ds holds the extracted fields: [unused, cleaned_text, title, meta_description,
    # meta_keywords, tags, meta_favicon, first .jpg image, unused]
    ds = ['', '', '', '', '', '', '', '', '']
    g = Goose()
    data = g.extract(raw_html=html)
    ds[1] = data.cleaned_text
    ds[2] = data.title
    ds[3] = data.meta_description
    ds[4] = data.meta_keywords
    tag = data.tags
    ds[5] = tag
    if "[]" not in ds[5]:
        i = 0
        db = ""
        for t in tag:
            i = i + 1
            if i > 7:  # keep at most 7 tags
                print("tags: " + str(i))
                ds[5] = db.split(",")
                break
            else:
                db += (t + ",")
    ds[6] = data.meta_favicon
    g.close()
    # Grab the first .jpg image found in the raw html
    bs = BeautifulSoup(html, 'html.parser')
    images = bs.find_all('img', {'src': re.compile('.jpg')})
    for image in images:
        # print(image['src'] + '\n')
        ds[7] = image['src']
        break
    print("get done")
    return ds
def get_text_sanitized_from_url(url):
    logging("Creating a list with the Portuguese stop words.")
    list_stopwords = set(stopwords.words("portuguese"))

    logging("Opening a connection with Goose.")
    goose = Goose()

    logging("Fetching the news article from the URL.")
    notice = goose.extract(url)
    # Grab the article text
    text = notice.cleaned_text

    logging("Cleaning invalid characters.")
    # Remove special characters, accents, etc.
    text = str(unicodedata.normalize("NFKD", text).encode("ASCII", "ignore"))

    logging("Removing every character that is not a letter.")
    # Keep only letters in the text content
    text = re.sub(r"[^A-Za-z]+", ' ', text)

    logging("Converting the text to lowercase.")
    # Set the text to lowercase
    text = text.lower()

    logging("Turning the text into an array of words.")
    # Turn the text into a list, splitting it word by word
    text_words = word_tokenize(text)

    logging("Removing all stop words from the text.")
    # Remove every stop word from the text
    text = [word for word in text_words if word not in list_stopwords and len(word) > 2]

    logging("Joining the sanitized word array into a single space-separated string.")
    # Turn the word list into a single string
    text = (" ").join(text)

    logging("Closing the connection with Goose.")
    goose.close()

    return text
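The function above relies on several names it never imports (re, unicodedata, NLTK's stopwords and word_tokenize, goose3's Goose) and calls logging() as a plain function rather than the standard logging module. The sketch below shows one plausible setup under those assumptions; the logging helper and the example URL are illustrative.

# Setup sketch for get_text_sanitized_from_url (assumptions noted in comments)
import re
import unicodedata

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from goose3 import Goose

nltk.download("stopwords")  # one-time NLTK data downloads
nltk.download("punkt")

def logging(message):
    # stand-in for whatever logger the original project uses (assumption)
    print(message)

clean_text = get_text_sanitized_from_url("https://example.com/noticia")  # hypothetical URL
print(clean_text[:200])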
def get_webdata(url_list):
    """
    :param url_list: List of URLs
    :return: Dataframe with cleaned text extracted from the URLs and their
             Sentiment Classifier predictions
    """
    webdata_df = pd.DataFrame()
    for url in url_list:
        # Get the html and extract the metadata
        g = Goose()
        article = g.extract(url=url)
        webdata_df = webdata_df.append(
            {
                'url': url,
                'title': article.title,
                'article_desc': article.meta_description,
                'date_pub': article.publish_date,
                'text': article.cleaned_text
            },
            ignore_index=True)
        g.close()
    webdata_df['clean_text'] = webdata_df['text'].pipe(hero.clean)
    sentiment = score_article(webdata_df['clean_text'].to_list())
    webdata_df['Predicted_sentiment'] = sentiment
    webdata_df = webdata_df.drop(['text'], axis=1)
    return webdata_df
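A usage sketch for get_webdata, assuming `hero` is texthero, `score_article` is an external sentiment scorer returning one label per text, and a pandas version that still supports DataFrame.append. The placeholder scorer and example URLs below are assumptions for illustration, not part of the original snippet.

# Usage sketch (placeholder scorer and URLs are hypothetical)
import pandas as pd
import texthero as hero
from goose3 import Goose

def score_article(texts):
    # placeholder: a real implementation would run the trained sentiment classifier
    return ["neutral" for _ in texts]

urls = ["https://example.com/article-1", "https://example.com/article-2"]  # hypothetical
df = get_webdata(urls)
print(df[['url', 'title', 'Predicted_sentiment']])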
#!/usr/bin/env python3
import requests
from bs4 import BeautifulSoup

url = "https://www.cnn.com/2019/06/03/politics/jared-kushner-axios/index.html"
resp = requests.get(url)
soup = BeautifulSoup(resp.text, 'html.parser')
print(soup.prettify())

from goose3 import Goose

g = Goose()
article = g.extract(
    url='https://www.cnn.com/2019/06/03/politics/jared-kushner-axios/index.html'
)
print(article.cleaned_text)
g.close()
def extract_articles(i):
    articles = []
    g = Goose()
    with open(filepaths[i]) as f:
        urls = []
        data = json.load(f)
        name = data['Claim_ID']
        query = data['Claim']
        for item in data['Google Results'][0]['results']:
            if 'snopes' not in item['link'] and 'pdf' not in item['link']:
                item['link'] = item['link'].replace('https', 'http')
                urls.append(item['link'])
        if data['Credibility'] == 'false':
            cred = '0'
        else:
            cred = '1'
        urls_google = []
        try:
            for url in search(query, stop=30):
                if 'snopes' not in url:
                    urls_google.append(url)
        except:
            print('Some error')
        for url in urls_google:
            url = url.replace('https', 'http')
            if url not in urls:
                urls.append(url)
        print('At claim .... ', name)
        count = 0
        checkpoint = time.time()
        timeout = 5
        extracted_articles_domains = []
        for url in urls:
            try:
                ## Extract the article with goose, take the text body and check its overlap with the claim
                article = g.extract(url=url)
                article = article.cleaned_text
                res = is_relevant(query, article)
                ## Keep the article only if the overlap exceeds the threshold
                if res == True:
                    articles.append(article)
                    parsed_uri = urlparse(url)
                    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
                    extracted_articles_domains.append(domain)
                    count += 1
                    checkpoint = time.time()
                    filepath = os.path.join(dump_path, name, (str(count) + '.txt'))
                    os.makedirs(os.path.dirname(filepath), exist_ok=True)
                    f = open(filepath, 'w')
                    f.write(article)
                    f.close()
                # if time.time() > checkpoint + timeout:
                #     print('Timed-out ....', name)
                #     break
            ## Catch connection errors raised by requests
            except requests.exceptions.ConnectionError as e:
                # print('Some error at file =', name)
                continue
            except:
                # print(sys.exc_info()[0])
                continue
    g.close()
    return articles, cred, name, query, extracted_articles_domains
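extract_articles depends on module-level names that the snippet does not define: filepaths, dump_path, is_relevant, and a search() helper, plus json, os, time, requests, urlparse and Goose. The sketch below is one plausible setup under those assumptions; the input layout and the is_relevant stand-in are illustrative, not the original project's definitions.

# Module-level setup sketch for extract_articles (all definitions here are assumptions)
import glob
import json
import os
import time

import requests
from urllib.parse import urlparse
from goose3 import Goose
from googlesearch import search  # assumed to be the package providing search(query, stop=...)

filepaths = sorted(glob.glob('claims/*.json'))  # hypothetical input layout
dump_path = 'extracted_articles'

def is_relevant(claim, article_text):
    # naive stand-in for the original overlap check:
    # fraction of claim words that also appear in the article
    claim_words = set(claim.lower().split())
    article_words = set(article_text.lower().split())
    return len(claim_words & article_words) / max(len(claim_words), 1) > 0.5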