def processArticle(self, response):
    url = response.url
    score = calc_score(url)
    if score >= 3:
        try:
            print 'this url may be a news_url'
            item = ArticleItem()
            g = Goose()
            article = g.extract(url=url)
            title = article.title
            content = article.cleaned_text
            if len(content) == 0:
                # Empty extraction usually means a Chinese-language page;
                # retry with the Chinese stopword class.
                print 'news in chinese'
                g = Goose({'stopwords_class': StopWordsChinese})
                article = g.extract(url=url)
                content = article.cleaned_text
            item['articleTitle'] = title
            item['articleUrl'] = url
            item['articleContent'] = content
            yield item
        except Exception:
            self.logger.info('item in article failed')
    else:
        print 'this url may not be a news_url, score only ' + str(score)
        print 'you can check this url: ' + url
        return

def get_url_extract_body_text(url, config=None):
    # e.g. url = "https://edition.cnn.com/2012/02/22/world/europe/uk-occupy-london/index.html?hpt=ieu_c2"
    if config:
        g = Goose(config)
    else:
        g = Goose()
    article = g.extract(url=url)
    return article.cleaned_text

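# Usage sketch for get_url_extract_body_text above; the URL comes from the
# comment inside the function, and the config dict is an assumed example option.
sample_url = 'https://edition.cnn.com/2012/02/22/world/europe/uk-occupy-london/index.html?hpt=ieu_c2'
print(get_url_extract_body_text(sample_url)[:200])
print(get_url_extract_body_text(sample_url, config={'enable_image_fetching': False})[:200])
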
def __init__(self, url, author):
    self.url = url
    self.author = author
    # Try the Chinese-aware extractor first; fall back to the default
    # extractor when no title is found.
    goose = Goose({'stopwords_class': StopWordsChinese})
    article = goose.extract(url=url)
    if article.title == '':
        goose = Goose()
        article = goose.extract(url=url)
    self.title = article.title
    self.summary = article.cleaned_text[:150]
    self.body = article.cleaned_text

def get_url_extract_links(url, config=None):
    # e.g. url = "https://edition.cnn.com/2012/02/22/world/europe/uk-occupy-london/index.html?hpt=ieu_c2"
    if config:
        g = Goose(config)
    else:
        g = Goose()
    article = g.extract(url=url)
    # print('title is ', article.title)
    # print('cleaned_text is ', article.cleaned_text)
    # print('meta is ', article.meta_description)
    # print('link is', article.canonical_link)
    print('link is:')
    for link in article.links:
        print(link)

def getproxyip_list(urllist):
    proxyurllist = []
    for url in urllist:
        time.sleep(10)
        g = Goose()
        article = g.extract(url=url)
        soup = BeautifulSoup(article.raw_html, "html.parser")
        proxy_list = soup.find_all("tr")
        for proxyip in proxy_list:
            iplist = []
            all_td = proxyip.find_all("td")
            if all_td:
                for td_line in all_td:
                    val = td_line.text.strip()
                    if val:
                        iplist.append(val)
                    else:
                        if td_line.div:
                            iplist.append(td_line.div["title"])
                        else:
                            iplist.append(u"中国")
                proxyurllist.append(iplist)
                print(iplist)
    return proxyurllist

def getUrl(item):
    url_name = re.split('&&', item)
    url = url_name[1]
    name = url_name[0]
    print url
    print name
    html_name = name + '.html'
    print html_name
    g = Goose({'stopwords_class': StopWordsChinese})
    article = g.extract(url=url)
    # print article.raw_html
    currentDir = os.getcwd() + '/' + 'pages' + '/' + name
    if not os.path.exists(currentDir):
        os.makedirs(currentDir)
    f = open(currentDir + '/' + html_name, 'a')
    google_transfer = open("transfer.js").read()
    print google_transfer
    f.write(google_transfer + article.raw_html)
    f.close()
    print article.title
    f_md = open(currentDir + '/' + name + '.md', 'a')
    f_md.write(article.cleaned_text.encode('utf-8'))
    f_md.close()

def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        print 'message is broken'
        return
    task = msg
    g = Goose()
    article = g.extract(url=task['url'])
    # print article.cleaned_text
    task['text'] = article.cleaned_text
    # Scraping CNN news
    # text = None
    # if task['source']['id'] == 'cnn':
    #     print "Scraping CNN news"
    #     text = cnn_news_scraper.extractNews(task['url'])
    # else:
    #     print "News source [%s] is not supported." % task['source']['name']
    # task['text'] = text
    dedupe_news_queue_client.sendMessage(task)

def get_data(rss, num):
    # pathToCSV = '/Users/Michal/Downloads/dialogflow-java-client-master2/samples/clients/VirtualTradingAssistant/src/main/java/ai/api/examples/fileStore/file.csv'
    # pathToCSV = 'C:\\Users\\ojwoo\\Documents\\Warwick\\CS261\\Coursework\\dialogflow-java-client-master\\samples\\clients\\VirtualTradingAssistant\\src\\main\\java\\ai\\api\\examples\\fileStore\\file.csv'
    # pathToCSV = '/Users/Michal/Desktop/apache-tomcat-8.5.28/bin/misc/file.csv'
    pathToCSV = 'C:\\apache-tomcat-8.5.28\\bin\\misc\\news.csv'
    with open(pathToCSV, 'w') as csvfile:
        wr = csv.writer(csvfile, delimiter='@', quotechar='#')
        index = 0
        for e in rss['entries']:
            if index == int(num):
                break
            wr.writerow([(e['title']).encode('utf-8')])
            wr.writerow([(e['link']).encode('utf-8')])
            try:
                g = Goose()
                article = g.extract(url=e['link'])
                cleaned_text = article.cleaned_text
                sent = sentiment(cleaned_text)
                # Map the sentiment polarity from [-1, 1] onto [0, 100].
                if sent[0] < 0:
                    sent = 50 - (sent[0] * -50)
                else:
                    sent = sent[0] * 50 + 50
                wr.writerow([str(round(sent, 2)) + '%'])
            except TypeError:
                wr.writerow(['Sentiment Unavailable'])
            index = index + 1

def extract_title(html):
    """Extract the title of a web page."""
    g = Goose({'enable_image_fetching': False})
    article = g.extract(raw_html=html)
    return article.title

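# A minimal sketch of calling extract_title on HTML fetched by the caller;
# requests and the example URL are assumptions, not part of the original.
import requests

html = requests.get('https://example.com').text
print(extract_title(html))
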
def __init__(self, corpus_dir, datastore_type='file', db_name='corpus.db'):
    '''
    Read links and associated categories for specified articles
    in a text file separated by a space.

    Args:
        corpus_dir (str): The directory to save the generated corpus.
        datastore_type (Optional[str]): Format to save the generated corpus.
            Specify either 'file' or 'sqlite'.
        db_name (Optional[str]): Name of the database if 'sqlite' is selected.
    '''
    self.g = Goose({'browser_user_agent': 'Mozilla', 'parser_class': 'soup'})
    self.corpus_dir = corpus_dir
    self.datastore_type = datastore_type
    self.db_name = db_name
    self.stats = defaultdict(int)

    self._create_corpus_dir(self.corpus_dir)

    self.db = None
    if self.datastore_type == 'sqlite':
        self.db = self.corpus_dir + '/' + self.db_name
        self._set_up_db(self.db)

def run(self):
    df = pd.read_csv(LINKS_CSV)
    g = Goose({'stopwords_class': StopWordsChinese})
    df['content'] = df['url'].apply(lambda x: g.extract(url=x).cleaned_text)
    with self.output().open('w') as f:
        df.to_json(f, orient='records')

def crawlerWebLink(url):
    g = Goose()
    article = g.extract(url=url)
    print(article.title)
    print(article.meta_description)
    print(article.cleaned_text)

def parse_item(self, response):
    article = Goose().extract(raw_html=response.body)
    yield Article(title=article.title,
                  text=article.cleaned_text,
                  url=response.url,
                  field=self.name)

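# A hedged sketch of a spider that could host parse_item above; the spider
# name, start URL, and the Article item class are illustrative assumptions.
import scrapy

class ArticleSpider(scrapy.Spider):
    name = 'articles'
    start_urls = ['https://example.com/news/story.html']

    def parse(self, response):
        # Hand each downloaded page to parse_item for Goose extraction.
        return self.parse_item(response)
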
def sms_ahoy_reply():
    """Respond to incoming messages with a friendly SMS."""
    # Default to '' so the substring checks below are safe when no body is sent.
    body = request.values.get('Body', '')
    # Start our response from the cached news file
    with open('news') as data_file:
        current_news = json.load(data_file)
    headlines = current_news['articles']
    message = ""
    if "news" in body or "News" in body:
        # List the first 19 headlines, numbered
        i = 0
        while i < 19:
            message = message + str(i + 1) + ". " + headlines[i]['title'] + "\n"
            i = i + 1
    elif "more" in body:
        # "N more" - extract the full article body for headline N
        i = int(body.split()[0])
        url = headlines[i - 1]['url']
        g = Goose()
        article = g.extract(url=url)
        message = article.cleaned_text[:1000]
        message2 = article.cleaned_text[1000:][:1000]  # prepared but never sent
        message = message + "..."
    else:
        # A bare number - reply with that headline's description
        i = int(body)
        message = headlines[i - 1]['description']
    resp = MessagingResponse()
    # Add a message
    resp.message(message)
    return str(resp)

def grab(location, keywords, publication, publication_date, title):
    goose = Goose()
    try:
        raw_article = goose.extract(url=location)
        description = raw_article.meta_description.encode("utf8")
        article = raw_article.cleaned_text.encode("utf8")
        split_keywords = keywords.split(',')
        summary = pyteaser.SummarizeUrl(location)
        output = json.dumps({
            "title": title,
            "keywords": split_keywords,
            "publication": publication,
            "publication_date": publication_date,
            "description": description,
            "source": location,
            "article": article,
            "summary": summary
        })
        logging.warning('Successfully grabbed through Goose.')
        logging.warning('Location: %s, Publication: %s' % (location, publication))
        return output
    except Exception:
        logging.critical('Unable to get article through Goose.')
        logging.critical('Location: %s, Publication: %s' % (location, publication))
        return None

def parse(self, response, pr=None):
    for i in response.xpath('//a/@href').extract():
        if 'https://' in i or 'http://' in i:
            continue
        c = 0
        if re.match(r'(.*)\/(.*\.html)', response.url):
            c = 1
        urls = response.url.split("/")
        urls = '/'.join(urls[0:len(urls) - c - i.count("../")])
        if urls[-1] == '/':
            target = urls + i.split("../")[-1]
        else:
            target = urls + '/' + i.split("../")[-1]
        # print "target:", target
        if target == pr:
            continue
        yield scrapy.Request(target,
                             callback=lambda res: self.parse(res, response.url))
    article = Goose().extract(raw_html=response.body)
    yield {
        "url": response.url,
        "article": article
    }

def getContent(self):
    g = Goose({'browser_user_agent': 'Mozilla', 'parser_class': 'soup'})
    urls = self.getNodeLinks()
    for i, url in enumerate(urls):
        article = g.extract(url=url)
        self.writteFile(i, 'title', article.title)
        self.writteFile(i, 'article', article.cleaned_text)

def get_text(article_url):
    goose = Goose()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
    response = opener.open(article_url)
    raw_html = response.read()
    article = goose.extract(raw_html=raw_html)
    return article.cleaned_text

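# Usage sketch; get_text relies on urllib2, so this is Python 2 code, and
# the URL below is an illustrative assumption.
print(get_text('http://example.com/article')[:300])
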
def HTMLParser(url):
    response = get(url)
    extractor = Goose()
    article = extractor.extract(raw_html=response.content)
    text = article.cleaned_text
    return str(text.encode("ascii", "ignore"))

def process_item(self, item, spider):
    if 'pdf_Link' in item:
        pdfName = item['report_name'] + u".pdf"
        PDFPath = os.path.join(PDF_PATH, item['source_name'])
        if not os.path.exists(PDFPath):
            os.makedirs(PDFPath)
        filepath = os.path.join(PDFPath, pdfName)
        try:
            content = self.downloadPDF(item['pdf_Link'], filepath)
            item["report_content"] = content
        except Exception:
            self.jsonInfoStored(item, pdfName)
            log.msg("pdf download failed, information is being serialized to json files",
                    level=log.INFO)
    elif 'content_Link' in item:
        from goose import Goose
        from goose.text import StopWordsChinese
        try:
            g = Goose({'stopwords_class': StopWordsChinese})
            article = g.extract(url=item['content_Link'])
            content = article.cleaned_text
            del item['content_Link']
            item["report_content"] = content
        except Exception:
            log.msg("Content extraction failed for page: %s" % item['report_link'],
                    level=log.INFO)
    return item

def run(index):
    r = urllib2.urlopen(CNN_URL + index).read()
    soup = BeautifulSoup(r, "lxml")
    headlines = []
    links = []
    articles = []
    print("Start reading news for: " + index)
    for div in soup.find_all('table', 'wsod_newsTable')[0]:
        for col in div:
            link = col.find('a')['href']
            headline = col.find('a').contents[0]
            g = Goose()
            articles.append(g.extract(url=link).cleaned_text)
    print("Finished Reading!")
    tokens = []
    for article in articles:
        tokens += word_tokenize(article)
    tokens = filter(lambda word: word not in string.punctuation, tokens)
    result = []
    for word in set(tokens):
        if tokens.count(word) > 20 and tokens.count(word) < 100:
            result.append((word, tokens.count(word)))
    return result

class GetContentPipeline(object):
    goose = Goose({'stopwords_class': StopWordsChinese})

    def process_item(self, item, spider):
        if item:
            url = item.url
            new_content = NewsContent()
            new_content.news = item
            article = GetContentPipeline.goose.extract(url=url)
            if (not article) or (not article.top_node):
                item.delete()
                raise DropItem(u"unable to fetch content %s" % item)
            text = article.top_node.text_content()
            if not text:
                item.delete()
                raise DropItem(u"unable to fetch content %s" % item)
            content = etree.tostring(article.top_node)
            text = BeautifulSoup(content).getText()
            if len(text) < 100:
                item.delete()
                raise DropItem(u"fetched content too short %s" % item)
            new_content.content = content
            try:
                img = article.top_image.src
                new_content.content_img = img
                movie = article.movies[0].src
                new_content.movie = movie
            except Exception:
                pass
            new_content.save()
            return item

def extract(url):
    '''Extract the main text of a web page.'''
    g = Goose({'stopwords_class': StopWordsChinese})
    article = g.extract(url=url)
    return article.cleaned_text

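# Usage sketch for the Chinese-language extractor above; the URL is an
# illustrative assumption.
text = extract('http://news.example.cn/article.html')
print(text[:100])
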
def parsepage(data, link):
    try:
        goo = Goose({'stopwords_class': StopWordsChinese})
        article = goo.extract(raw_html=data)
        return article
    except Exception:
        traceback.print_exc()

def parse_input(text, extractor='newspaper'):
    if isinstance(text, str) or isinstance(text, unicode):
        if text.startswith(('http://', 'https://')):
            # Input is a link - need to extract the text from the html
            if extractor.lower() == 'goose':
                from goose import Goose
                urlparse = Goose()
                article = urlparse.extract(url=text)
                return unicode_to_ascii(article.cleaned_text)
            else:
                from newspaper import Article
                article = Article(text)
                article.download()
                article.parse()
                return unicode_to_ascii(article.text)
        elif text.endswith('.txt'):
            # Input is a file - need to read it
            textfile = open(text, 'rb')
            article = textfile.read()
            textfile.close()
            return unicode_to_ascii(article)
        else:
            # Input is a string containing the raw text
            return unicode_to_ascii(text)
    else:
        raise ValueError('Input text must be of type str or unicode.')

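# Hedged examples of the three input modes parse_input dispatches on; the
# URL and file name are illustrative assumptions.
print(parse_input('https://example.com/story.html', extractor='goose'))  # link
print(parse_input('saved_story.txt'))                                    # text file
print(parse_input('Raw text is returned after ASCII normalization.'))    # raw string
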
def extract_body(html):
    """Extract the body text of a web page."""
    g = Goose({'enable_image_fetching': False})
    article = g.extract(raw_html=html)
    return article.cleaned_text

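# Companion sketch: extract_title and extract_body can share one download.
# requests and the URL are assumptions, not part of the original helpers.
import requests

html = requests.get('https://example.com/article').text
print(extract_title(html))
print(extract_body(html)[:200])
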
def print_news(url, content='title'):
    # parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    g = Goose()
    article = g.extract(url=url)
    # If a meta description is available, print it; otherwise fall back
    # to summarizing the cleaned text.
    if content == 'full' and article.meta_description:
        print(article.meta_description)
        return
    news_text = article.cleaned_text
    parser = PlaintextParser.from_string(news_text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    if content == 'title' or content == 'full':
        # Print the article title
        print('\t* ' + str(article.title.encode('ascii', 'ignore')))
    if content == 'full':
        # Print an n-sentence summary
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)
    return

def SplitArticle(url):
    g = Goose({'stopwords_class': StopWordsChinese})
    article = g.extract(url=url)
    total_words = len(article.cleaned_text)
    current_word = 0
    last_sentence = ''
    sentences_pool = []
    # Walk the text in 100-character chunks, splitting on the Chinese
    # full stop and carrying incomplete sentences into the next chunk.
    while current_word < total_words:
        sub_article = last_sentence + article.cleaned_text[
            current_word:min(current_word + 100, total_words)]
        complete = (sub_article[-1] == u'。')
        sentences = sub_article.split(u'。')
        for s in range(len(sentences) - 1):
            for sub_s in sentences[s].split('\n'):
                if not sub_s == '':
                    sentences_pool.append(sub_s.encode('utf-8') + '\n')
        if not complete:
            last_sentence = sentences[-1]
        else:
            last_sentence = ''
            for sub_s in sentences[-1].split('\n'):
                if not sub_s == '':
                    sentences_pool.append(sub_s.encode('utf-8') + '\n')
        current_word = min(current_word + 100, total_words)
    return sentences_pool

def get_parser(url, tokenizer):
    useragent = ' '.join([
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/52.0.2743.116 Safari/537.36"
    ])
    # Scrape the page with both HtmlParser and Goose, then keep the better parse
    html_parser = HtmlParser.from_url(url, tokenizer)
    article = Goose({'browser_user_agent': useragent})
    # Goose raises IndexError when requesting unfamiliar sites.
    try:
        extract = article.extract(url=url)
    except IndexError:
        extract = article.extract(raw_html=requests.get(url).text)
    goose_parser = PlaintextParser(extract, tokenizer)
    # Aggregate site metadata
    meta = {
        k: v
        for (k, v) in extract.infos.items()
        if k not in ('cleaned_text', 'links', 'tweets', 'movies')
    }
    # Select the parser that recovered more words
    parser = (html_parser
              if len(goose_parser.document.words) < len(html_parser.document.words)
              else goose_parser)
    return parser, meta

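# Usage sketch pairing get_parser with one of sumy's summarizers; the LSA
# summarizer choice and the sentence count are assumptions for illustration.
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

parser, meta = get_parser('https://example.com/article', Tokenizer('english'))
for sentence in LsaSummarizer()(parser.document, 3):
    print(sentence)
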
def goose_extractor(url):
    '''Webpage extraction using the Goose library.'''
    article = Goose().extract(url=url)
    return article.title, article.meta_description, article.cleaned_text

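# Simple driver for goose_extractor above; the URL is an illustrative assumption.
title, description, text = goose_extractor('https://example.com/article')
print(title)
print(description)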