def download_article(url):
    """
    Download the html content of a news page

    :param url: news page's url
    :type url: string
    :return: extracted article data
    :rtype: dict
    """
    article = {
        'link': url,
        'source': 'crawler_estadao'
    }
    logger.info("Downloading article: {0}".format(url))
    try:
        response = requests.get(url, timeout=30)
    except Exception as ex:
        logger.exception("Failed to fetch {0}".format(url))
        return None

    extractor = Goose({'use_meta_language': False, 'target_language': 'pt'})
    news = extractor.extract(url=url)
    soup = BeautifulSoup(response.text)

    article['link_content'] = compress_content(response.text)
    article['compressed'] = True
    article['language'] = detect_language(response.text)
    article['title'] = extract_title(news)
    article['body_content'] = extract_content(news)
    article['published_time'] = extract_published_time(url, soup)
    return article
def createResource(url):
    if len(url) > 200:
        print "Los links largos de duckduckgo no funcionan"
        return None
    else:
        r = Resource.objects.filter(url=url)
        if len(r) > 0:
            print "El recurso ya lo tenia"
            r = r[0]
        else:
            g = Goose()
            try:
                a = g.extract(url=url)
            except:
                a = None
            if a is None or a.title is None or a.title == "":
                title = "notitle"
            else:
                title = a.title
            try:
                tags = ["one", "two"]
                r = Resource.objects.create(title=title, url=url, status=Resource.ADDED)
                r.tags.add("one two")
            except TypeError as e:
                print e
                print "no ha ido bien"
                print title
                print url
            print "Creado el recurso para " + url
        return r
def get_page_content(url):
    g = Goose({'stopwords_class': StopWordsChinese})
    try:
        article = g.extract(url=url)
    except Exception as e:
        print e
        article = None
    return article
def get_article(self, html):
    config = self.getConfig()
    self.parser = config.get_parser()
    g = Goose(config=config)
    return g.extract(url="http://www.null.com", raw_html=html)
def _article(self):
    """Analyse resource content, return Goose interface"""
    # switch method depending on content_type
    # for pdf, fall back to tesseract if pdf2text yields not much
    # (then use the larger, or maybe a composite)
    g = Goose()
    return g.extract(raw_html=self._decode())
def get_parser(url, tokenizer):
    useragent = ' '.join([
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/52.0.2743.116 Safari/537.36"
    ])

    # Scrape the web page with both HtmlParser and Goose, then pick the better parse
    html_parser = HtmlParser.from_url(url, tokenizer)
    article = Goose({'browser_user_agent': useragent})

    # Goose raises IndexError when requesting unfamiliar sites.
    try:
        extract = article.extract(url=url)
    except:
        extract = article.extract(raw_html=requests.get(url).text)
    goose_parser = PlaintextParser(extract, tokenizer)

    # Aggregate site metadata
    meta = {
        k: v for (k, v) in extract.infos.items()
        if k not in ('cleaned_text', 'links', 'tweets', 'movies')
    }

    # Select the parser that captured more words
    parser = (
        html_parser
        if len(goose_parser.document.words) < len(html_parser.document.words) else  # noqa
        goose_parser)

    return parser, meta
def extract_body(html):
    """ Extract the body text of a web page """
    g = Goose({'enable_image_fetching': False})
    article = g.extract(raw_html=html)
    return article.cleaned_text
def extract_title(html):
    """ Extract the title of a web page """
    g = Goose({'enable_image_fetching': False})
    article = g.extract(raw_html=html)
    return article.title
def createResource(url):
    if resolve(url) is not None:
        url = resolve(url)
    g = Goose()
    a = g.extract(url=url)
    if len(url) > 200:
        print "Los links largos de duckduckgo no funcionan"
        return None
    else:
        r = Resource.objects.filter(url=url)
        if len(r) > 0:
            print "El recurso ya lo tenia"
            r = r[0]
        else:
            if a.title is None or a.title == "":
                title = "notitle"
            else:
                title = a.title
            try:
                r = Resource.objects.create(title=title, url=url)
            except:
                print "no ha ido bien"
                print title
                print url
            print "Creado el recurso para " + url
        return r
def article_extractor(url):
    articleObject = []
    print("Program started ...")
    articleExtractor = Goose()
    article = articleExtractor.extract(url=url)

    # build article content
    articleBody = ""
    for letter in article.cleaned_text:
        articleBody += str(letter.encode('utf-8', 'ignore'))

    # save article content in a file
    f1 = open('./output.txt', 'w+')
    f1.write(article.title + '\n')
    f1.write(article.meta_description + '\n')
    f1.write(articleBody)
    f1.close()

    articleObject.append(article.title)
    articleObject.append(article.meta_description)
    articleObject.append(articleBody)
    return articleObject
def get_link_data_task(link_id):
    dbsession = get_link_data_task.dbsession
    services = get_link_data_task.services
    flags = get_link_data_task.flags

    if not flags:
        return

    link = services.link.get_link_by_id(link_id)
    if link is None:
        return

    html = None

    if 'screenshot' in flags:
        data, html = services.screenshot.capture(link.url, 1024, 800)
        # TODO: Investigate if this way of generating filename can create clashes
        # TODO: Delete the previous file if it exists
        filename = services.file.create(data, str(uuid.uuid4()) + '.png', 'screenshots')
        link.meta['screenshot'] = filename

    if 'html' in flags:
        link.meta['html'] = html if html else requests.get(link.url).text

    # this should move to a service too
    if 'text' in flags or 'title' in flags:
        goose = Goose()
        a = goose.extract(raw_html=html if html else requests.get(link.url).text)
        if 'text' in flags:
            link.meta['text'] = a.cleaned_text
        if 'title' in flags:
            link.meta['title'] = a.title

    dbsession.commit()  # we are outside the web transaction
def generate_feature_matrix(data, stemmer, **prune_params):
    config = Configuration()
    config.enable_image_fetching = False
    config.use_meta_language = False
    goose = Goose(config)

    _parser = HTMLParser()
    sr_index = HashedIndex()

    for url_path, label in data.items():
        if os.path.exists(url_path):
            with open(url_path, 'r') as html_file:
                html_text = html_file.read()

            text = unicode(goose.extract(raw_html=html_text).cleaned_text)
            text = _parser.unescape(text)

            for token in word_tokenize(text, stemmer=stemmer):
                sr_index.add_term_occurrence(token, url_path)

    sr_index.prune(**prune_params)
    X = sr_index.generate_feature_matrix(mode='tfidf')

    y = np.zeros(len(sr_index.documents()))
    for index, doc in enumerate(sr_index.documents()):
        y[index] = 0 if data[doc] is None else 1

    return X, y
def get_article(url):
    g = Goose()
    article = g.extract(url=url)
    regex = re.compile('[^a-zA-Z]')
    article = regex.sub(' ', article.title)
    article = re.sub(' +', ' ', article)
    return article
def SplitArticle(url):
    g = Goose({'stopwords_class': StopWordsChinese})
    article = g.extract(url=url)
    total_words = len(article.cleaned_text)
    current_word = 0
    last_sentence = ''
    sentences_pool = []
    while current_word < total_words:
        sub_article = last_sentence + article.cleaned_text[
            current_word:min(current_word + 100, total_words)]
        complete = (sub_article[-1] == u'。')
        sentences = sub_article.split(u'。')
        for s in range(len(sentences) - 1):
            for sub_s in sentences[s].split('\n'):
                if not sub_s == '':
                    sentences_pool.append(sub_s.encode('utf-8') + '\n')
        if not complete:
            last_sentence = sentences[-1]
        else:
            last_sentence = ''
            for sub_s in sentences[-1].split('\n'):
                if not sub_s == '':
                    sentences_pool.append(sub_s.encode('utf-8') + '\n')
        current_word = min(current_word + 100, total_words)
    return sentences_pool
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    links = hxs.select("//a/@href").extract()

    # We store already crawled links in this list
    crawledLinks = []

    # Pattern to check for a proper link
    linkPattern = re.compile(
        """^(?:ftp|http|https):\/\/(?:[\w\.\-\+]+:{0,1}[\w\.\-\+ ]*@)?(?:[a-z0-9\-\.]+)(?::[0-9]+)?(?:\/|\/(?:[\w#!:\.\? \+=&%@!\-\/\(\)]+)|\?(?:[\w#!:\.\?\+=&%@!\-\/\(\)]+))?$""")

    for link in links:
        if linkPattern.match(link) and not link in crawledLinks:
            crawledLinks.append(link)
            yield Request(link, self.parse)

    # Goose works better than soup here; it can also extract images,
    # and it can pull text either from a URL or from raw html
    g = Goose()
    raw_html = response.body
    article = g.extract(raw_html=raw_html)
    text = article.cleaned_text
    if text.isspace() or not text:
        pass

    item = Website()
    item['text'] = text
    item['filename'] = '1.txt'
    yield item
def process_item(self, item, spider):
    if "pdf_Link" in item:
        pdfName = item["report_name"] + u".pdf"
        PDFPath = os.path.join(PDF_PATH, item["source_name"])
        if not os.path.exists(PDFPath):
            os.makedirs(PDFPath)
        filepath = os.path.join(PDFPath, pdfName)
        try:
            content = self.downloadPDF(item["pdf_Link"], filepath)
            item["report_content"] = content
        except:
            self.jsonInfoStored(item, pdfName)
            log.msg("pdf download failure, information is serializing to json files",
                    level=log.INFO)
    elif "content_Link" in item:
        from goose import Goose
        from goose.text import StopWordsChinese
        try:
            g = Goose({"stopwords_class": StopWordsChinese})
            article = g.extract(url=item["content_Link"])
            content = article.cleaned_text
            del item["content_Link"]
            item["report_content"] = content
        except:
            log.msg("Content extracted failure from page:%s" % item["report_link"],
                    level=log.INFO)
    return item
def scrape_category(url, c_label):
    extract_feed_world = "http://pipes.yahoo.com/pipes/pipe.run?_id=a625f9823d9b5c4858865b107dcc2516&_render=json&urlinput1=%s" % urllib.quote_plus(url)
    data_world = urllib2.urlopen(extract_feed_world)
    json_data_world = json.load(data_world)
    for item in json_data_world['value']['items']:
        # link = urllib2.urlopen(item['link'])
        # link = link.geturl()
        if not [x for x, y in enumerate(Categorized_Labeled_Article.objects.all()) if (y.url == item['link'])]:
            try:
                cj = cookielib.CookieJar()
                opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
                request = urllib2.Request(item['link'])
                response = opener.open(request)
                url = response.geturl()
                g = Goose()
                article = g.extract(url=url)
                readable_article = article.cleaned_text
                # Save in database
                article = Categorized_Labeled_Article.objects.create(text=readable_article, label=c_label, url=item['link'])
                article.save()
                print article.label
            except (urllib2.HTTPError, UnicodeDecodeError, AttributeError, IOError):
                print "error %s" % item['link']
def run(self):
    df = pd.read_csv(LINKS_CSV)
    g = Goose({'stopwords_class': StopWordsChinese})
    df['content'] = df['url'].apply(
        lambda x: g.extract(url=x).cleaned_text)
    with self.output().open('w') as f:
        df.to_json(f, orient='records')
def run(index):
    r = urllib2.urlopen(CNN_URL + index).read()
    soup = BeautifulSoup(r, "lxml")
    headlines = []
    links = []
    articles = []
    print ("Start reading news for: " + index)
    for div in soup.find_all('table', 'wsod_newsTable')[0]:
        for col in div:
            link = col.find('a')['href']
            headline = col.find('a').contents[0]
            g = Goose()
            articles.append(g.extract(url=link).cleaned_text)
    print ("Finished Reading!")
    tokens = []
    for article in articles:
        tokens += word_tokenize(article)
    tokens = filter(lambda word: word not in string.punctuation, tokens)
    result = []
    for word in set(tokens):
        if tokens.count(word) > 20 and tokens.count(word) < 100:
            result.append((word, tokens.count(word)))
    return result
def getproxyip_list(urllist):
    proxyurllist = []
    for url in urllist:
        time.sleep(10)
        g = Goose()
        article = g.extract(url=url)
        soup = BeautifulSoup(article.raw_html, "html.parser")
        proxy_list = soup.find_all("tr")
        for proxyip in proxy_list:
            iplist = []
            all_td = proxyip.find_all("td")
            if all_td:
                for td_line in all_td:
                    val = td_line.text.strip()
                    if val:
                        iplist.append(val)
                    else:
                        if td_line.div:
                            iplist.append(td_line.div["title"])
                        else:
                            iplist.append(u"中国")
                proxyurllist.append(iplist)
                print(iplist)
    return proxyurllist
def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        print 'message is broken'
        return

    task = msg
    g = Goose()
    article = g.extract(url=task['url'])
    # print article.cleaned_text
    task['text'] = article.cleaned_text

    # # Scraping CNN news
    # text = None
    # if task['source']['id'] == 'cnn':
    #     print "Scraping CNN news"
    #     text = cnn_news_scraper.extractNews(task['url'])
    # else:
    #     print "News source [%s] is not supported." % task['source']['name']
    #
    # task['text'] = text

    dedupe_news_queue_client.sendMessage(task)
def sms_ahoy_reply():
    """Respond to incoming messages with a friendly SMS."""
    body = request.values.get('Body', None)

    # Start our response
    with open('news') as data_file:
        current_news = json.load(data_file)

    headlines = current_news['articles']
    message = ""
    if "news" in body or "News" in body:
        headlines = current_news['articles']
        i = 0
        while i < 19:
            message = message + str(i + 1) + ". " + headlines[i]['title'] + "\n"
            i = i + 1
    elif "more" in body:
        i = int(body.split()[0])
        headlines = current_news['articles']
        url = headlines[i - 1]['url']
        g = Goose()
        article = g.extract(url=url)
        message = article.cleaned_text[:1000]
        message2 = article.cleaned_text[1000:][:1000]
        message = message + "..."
    else:
        i = int(body)
        message = ""
        message = headlines[i - 1]['description']

    resp = MessagingResponse()
    # Add a message
    resp.message(message)
    return str(resp)
def parse_input(text, extractor='newspaper'):
    if isinstance(text, str) or isinstance(text, unicode):
        if text.startswith(('http://', 'https://')):
            # Input is a link - need to extract the text from html
            if extractor.lower() == 'goose':
                from goose import Goose
                urlparse = Goose()
                article = urlparse.extract(url=text)
                return unicode_to_ascii(article.cleaned_text)
            else:
                from newspaper import Article
                article = Article(text)
                article.download()
                article.parse()
                return unicode_to_ascii(article.text)
        elif text.endswith('.txt'):
            # Input is a file - need to read it
            textfile = open(text, 'rb')
            article = textfile.read()
            textfile.close()
            return unicode_to_ascii(article)
        else:
            # Input is a string containing the raw text
            return unicode_to_ascii(text)
    else:
        raise ValueError('Input text must be of type str or unicode.')
def parsepage(data, link):
    try:
        goo = Goose({'stopwords_class': StopWordsChinese})
        article = goo.extract(raw_html=data)
        return article
    except:
        traceback.print_exc()
def grab(location, keywords, publication, publication_date, title):
    goose = Goose()
    try:
        raw_article = goose.extract(url=location)
        description = raw_article.meta_description.encode("utf8")
        article = raw_article.cleaned_text.encode("utf8")
        split_keywords = keywords.split(',')
        summary = pyteaser.SummarizeUrl(location)
        output = json.dumps({
            "title": title,
            "keywords": split_keywords,
            "publication": publication,
            "publication_date": publication_date,
            "description": description,
            "source": location,
            "article": article,
            "summary": summary
        })
        logging.warning('Successfully grabbed through Goose.')
        logging.warning('Location: %s, Publication: %s' % (location, publication))
        return output
    except:
        logging.critical('Unable to get article through Goose.')
        logging.critical('Location: %s, Publication: %s' % (location, publication))
        return None
def extract(url):
    ''' Extract the main body text of a web page '''
    g = Goose({'stopwords_class': StopWordsChinese})
    article = g.extract(url=url)
    return article.cleaned_text
def GetDesc_goose(self, url):
    article = "NULL"
    try:
        g = Goose({'stopwords_class': StopWordsChinese})
        article = g.extract(url=url)
    except Exception as ex:
        l.Warning("Goose_Crawl Failed %s" % str(ex))
    return article
def download_url(self, url):
    url = self.url
    #g = Goose()
    #g = Goose({'browser_user_agent': 'Mozilla', 'parser_class':'soup'})
    g = Goose({'parser_class': 'soup'})  # does this parser work for all sites?
    article = g.extract(url=url)
    self.title = article.title
    self.description = article.meta_description
    self.keywords = article.meta_keywords
    self.content = article.cleaned_text
    self.domain = article.domain
    self.movies = article.movies
    try:
        self.original_image_url = article.top_image.src
    except AttributeError:
        self.original_image_url = ""
    self.favicon_url = article.meta_favicon
    self.final_url = article.final_url  # test
    self.domain_link = article.tags
def crawlerWebLink(url):
    g = Goose()
    article = g.extract(url=url)
    print(article.title)
    print(article.meta_description)
    print(article.cleaned_text)
def get_data(rss, num):
    #pathToCSV = '/Users/Michal/Downloads/dialogflow-java-client-master2/samples/clients/VirtualTradingAssistant/src/main/java/ai/api/examples/fileStore/file.csv'
    #pathToCSV = 'C:\\Users\\ojwoo\\Documents\\Warwick\\CS261\\Coursework\\dialogflow-java-client-master\\samples\\clients\\VirtualTradingAssistant\\src\\main\\java\\ai\\api\\examples\\fileStore\\file.csv'
    #pathToCSV = '/Users/Michal/Desktop/apache-tomcat-8.5.28/bin/misc/file.csv'
    pathToCSV = 'C:\\apache-tomcat-8.5.28\\bin\\misc\\news.csv'

    with open(pathToCSV, 'w') as csvfile:
        wr = csv.writer(csvfile, delimiter='@', quotechar='#')
        index = 0
        for e in rss['entries']:
            if (index == int(num)):
                break
            wr.writerow([(e['title']).encode('utf-8')])
            wr.writerow([(e['link']).encode('utf-8')])
            try:
                g = Goose()
                article = g.extract(url=e['link'])
                cleaned_text = article.cleaned_text
                sent = sentiment(cleaned_text)
                if sent[0] < 0:
                    sent = 50 - (sent[0] * -50)
                else:
                    sent = sent[0] * 50 + 50
                wr.writerow([str(round(sent, 2)) + '%'])
            except TypeError:
                wr.writerow(['Sentiment Unavailable'])
            index = index + 1
def getContent(self):
    g = Goose({'browser_user_agent': 'Mozilla', 'parser_class': 'soup'})
    urls = self.getNodeLinks()
    for i, url in enumerate(urls):
        article = g.extract(url=url)
        self.writteFile(i, 'title', article.title)
        self.writteFile(i, 'article', article.cleaned_text)
def get_text(article_url):
    goose = Goose()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
    response = opener.open(article_url)
    raw_html = response.read()
    article = goose.extract(raw_html=raw_html)
    return article.cleaned_text
def process_data(threadName, q):
    while not exitFlag:
        queueLock.acquire()
        if not workQueue.empty():
            global Id
            print "%s processing No.%s result page..." % (threadName, Id)
            data = q.get()
            g = Goose()
            resultUrl = data["unescapedUrl"]
            article = g.extract(url=resultUrl)
            item = {}
            item['title'] = data["titleNoFormatting"]
            item['url'] = resultUrl
            item['keyWords'] = keyWords
            item['description'] = article.cleaned_text[:4000]
            if article.top_image:
                item['image'] = article.top_image.src
            else:
                item['image'] = ""
            insert(item)
            Id += 1
            queueLock.release()
        else:
            queueLock.release()
            time.sleep(1)
def process_item(self, item, spider):
    if 'pdf_Link' in item:
        pdfName = item['report_name'] + u".pdf"
        PDFPath = os.path.join(PDF_PATH, item['source_name'])
        if not os.path.exists(PDFPath):
            os.makedirs(PDFPath)
        filepath = os.path.join(PDFPath, pdfName)
        try:
            content = self.downloadPDF(item['pdf_Link'], filepath)
            item["report_content"] = content
        except:
            self.jsonInfoStored(item, pdfName)
            log.msg(
                "pdf download failure, information is serializing to json files",
                level=log.INFO)
    elif 'content_Link' in item:
        from goose import Goose
        from goose.text import StopWordsChinese
        try:
            g = Goose({'stopwords_class': StopWordsChinese})
            article = g.extract(url=item['content_Link'])
            content = article.cleaned_text
            del item['content_Link']
            item["report_content"] = content
        except:
            log.msg("Content extracted failure from page:%s" % item['report_link'],
                    level=log.INFO)
    return item
def fetch_content_for_url(url):
    try:
        g = Goose()
        article = g.extract(url=url)
        return article.cleaned_text
    except:
        return ''
def GoogleSearch(argu):
    url2 = "https://ajax.googleapis.com/ajax/services/search/web?v=1.0&start="
    q = "&q="
    keyWords = argu  # "love story taylor"
    startNums = ["0", "4", "8", "12", "16"]
    searchResults = []
    print "Start to crawl Google with keyWords: %s" % keyWords
    for num in startNums:
        theUrl = url2 + num + q + keyWords
        f = urllib.urlopen(theUrl)
        j = json.load(f)
        searchResults += j["responseData"]["results"]

    Id = 1
    g = Goose()
    with open("result.dat", "w") as of:
        for obj in searchResults:
            print "Extracting No.%d result page..." % Id
            # print obj["unescapedUrl"]
            resultUrl = obj["unescapedUrl"]
            # print resultUrl
            article = g.extract(url=resultUrl)
            # print article.title
            line = article.title + "|*|" + resultUrl + "|*|" + article.cleaned_text[:4000]
            of.write(str(Id) + "|*|")
            of.write(line.encode('utf-8') + "|**|")
            Id += 1
    print "-----End-----"
    return
def __init__(self, corpus_dir, datastore_type='file', db_name='corpus.db'):
    '''
    Read links and associated categories for specified articles
    in a text file separated by a space

    Args:
        corpus_dir (str): The directory to save the generated corpus
        datastore_type (Optional[str]): Format to save generated corpus.
                                        Specify either 'file' or 'sqlite'.
        db_name (Optional[str]): Name of database if 'sqlite' is selected.
    '''
    self.g = Goose({'browser_user_agent': 'Mozilla', 'parser_class': 'soup'})
    #self.g = Goose({'browser_user_agent': 'Mozilla'})
    self.corpus_dir = corpus_dir
    self.datastore_type = datastore_type
    self.db_name = db_name
    self.stats = defaultdict(int)

    self._create_corpus_dir(self.corpus_dir)

    self.db = None
    if self.datastore_type == 'sqlite':
        self.db = self.corpus_dir + '/' + self.db_name
        self._set_up_db(self.db)
def on_pubmsg(self, serv, ev):
    canal = ev.target()
    message = ev.arguments()[0].lower()
    if self.channels[canal].has_user("Yppy"):
        return
    url = re.search("(?P<url>https?://[^\s]+)", message)
    if url:
        url = url.group(0)
        try:
            self.lasturl = url
            hostname = urlparse.urlparse(url).hostname
            g = Goose()
            article = g.extract(url=url)
            tinyurl = urllib2.urlopen("http://tinyurl.com/api-create.php?url=" + url).read()
            title = article.title.encode('utf-8')[:70]
            ret = "Title : %s (%s) | %s" % (title, hostname, tinyurl)
            serv.privmsg(canal, ret)
        except:
            # todo log error
            e = sys.exc_info()[0]
            print(e)
            return
    if "!sum" in message:
        try:
            response = unirest.post("http://192.81.222.194:1142/api", {}, {"url": self.lasturl})
            print response.body
            for bullet in response.body:
                serv.privmsg(canal, ("* %s" % (bullet).encode('utf-8')))
        except:
            # todo log error
            e = sys.exc_info()[0]
            print(e)
            return
def categorize(request, article_url):
    # load model
    f = open('my_classifier.pickle')
    classif = pickle.load(f)
    f.close()
    print "loaded model"

    # categorize incoming article
    g = Goose()
    article = g.extract(url=article_url)

    # get list of words
    words = dict()
    article_text = article.cleaned_text
    for word in word_tokenize(article_text):
        words.setdefault(('%s' % word), 0)
        words[('%s' % word)] += 1
    print "got words!"

    classified = classif.classify(words)

    output = ""
    output += "PREDICTED: %s <br>" % classified
    output += "<br><br> %s" % article_text
    return HttpResponse(output)
def print_news(url, content='title'):
    #parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    g = Goose()
    article = g.extract(url=url)

    # If a meta description is available, print that; otherwise summarize
    if content == 'full' and article.meta_description:
        print(article.meta_description)
        return

    news_text = article.cleaned_text
    parser = PlaintextParser.from_string(news_text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    if content == 'title' or content == 'full':
        # Print the article title
        print('\t* ' + str(article.title.encode('ascii', 'ignore')))

    if content == 'full':
        # Print an n-sentence summary
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)
    return
def TF_IDF_url():
    doc_list = []
    bloblist = []

    # The name of each book is written in train_data1.txt
    f = open('data/train_data1.txt')
    g = Goose({'stopwords_class': StopWordsKorean})
    lines = f.readlines()
    f.close()

    for line in lines:
        doc_class = Textdoc()
        doc_class.save_title(g.extract(url=line).title)
        doc_class.save_content(g.extract(url=line).cleaned_text)
        doc_list.append(doc_class)
        bloblist.append(doc_class.content)
        # print doc_class.content

    t = 0
    for i, blob in enumerate(bloblist):
        #pprint(get_nouns(blob))
        print("Top words in document {}".format(i + 1))
        scores = {
            word: tfidf(word, blob, bloblist)
            for word in get_nouns(blob)
        }
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        for word, score in sorted_words[:5]:
            doc_list[t].add_word(word, round(score, 5))
            # print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
        t = t + 1
    return doc_list
def scrape(url):
    """
    Function to request and parse a given URL. Returns only the
    "relevant" text.

    Parameters
    ----------
    url : String.
        URL to request and parse.

    Returns
    -------
    text : String.
        Parsed text from the specified website.
    meta : String.
        Parsed meta description of an article. Usually equivalent
        to the lede.
    """
    logger = logging.getLogger('scraper_log')
    page = requests.get(url)
    g = Goose()
    try:
        article = g.extract(raw_html=page.content)
        text = article.cleaned_text
        meta = article.meta_description
        return text, meta
    # Generic error catching is bad
    except Exception as e:
        print 'There was an error. Check the log file for more information.'
        logger.warning('Problem scraping URL: {}. {}.'.format(url, e))
def save(self, *args, **kwargs):
    from goose import Goose
    from text.blob import TextBlob

    g = Goose()
    article = g.extract(url=self.url)
    try:
        b = TextBlob(article.title)
        lang = b.detect_language()
    except:
        lang = 'en'

    g = Goose({'use_meta_language': False, 'target_language': lang, 'paper_class': 'soup'})

    if not self.title:
        self.title = article.title
    if not self.newspaper:
        self.newspaper = article.domain
    if not self.content:
        self.content = article.cleaned_text

    try:
        if article.top_image.src:
            layout = Photo()
            #layout.photo = "images/news/"+str(self.id)+".jpg"
            layout.url = article.top_image.src
            layout.article = self
            layout.save()
    except:
        pass

    super(Article, self).save()
def hackers_news():
    total_data = []
    obj = get_context()
    base_url, target_url = obj.urls()
    parsed_source = obj.get_parsed_source(base_url, target_url)
    news_urls = parsed_source.xpath("//table[@id='hnmain']//table//tr[@class='athing']")
    for each_data in news_urls:
        news_url = each_data.xpath(".//td[@class='title']//span[@class='deadmark']//following-sibling::a[1]//@href")
        news_url = "".join(news_url)
        upvotes = each_data.xpath(".//following-sibling::tr[1]//td[@class='subtext']//span//text()")
        upvotes = "".join(upvotes)
        posted_on = each_data.xpath(
            ".//following-sibling::tr[1]//td[@class='subtext']//span//following-sibling::a[2]//text()"
        )
        posted_on = "".join(posted_on)
        comments = each_data.xpath(
            ".//following-sibling::tr[1]//td[@class='subtext']//span//following-sibling::a[3]//text()"
        )
        comments = "".join(comments)
        g = Goose()
        article = g.extract(url=news_url)
        content = article.cleaned_text
        content = " ".join(content.split()).replace("\n", "").replace("\t", "").replace("\r", "")
        try:
            content = content.encode("utf-8").decode("ascii", "ignore").encode("ascii")
        except:
            try:
                content = content.decode("ascii", "ignore").encode("ascii")
            except:
                try:
                    content = content.encode("utf-8")
                except:
                    content = "No news found"
        connection, cursor = obj.get_connection()
        duplicate_query = "SELECT news_url FROM hackers_news WHERE news_url=%s"
        duplicate_values = (news_url,)
        cursor.execute(duplicate_query, duplicate_values)
        duplicate_data = cursor.fetchall()
        if duplicate_data:
            insert_data = "update hackers_news set upvotes =" + upvotes + ",comments=" + comments + " where news_url=%s"
            values = (news_url,)
            cursor.execute(insert_data, values)
            connection.commit()
        else:
            try:
                insert_data = (
                    "insert into hackers_news(news_url,news_content,upvotes,posted_on,comments) values(%s,%s,%s,%s,%s)"
                )
                values = (news_url, content, upvotes, posted_on, comments)
                cursor.execute(insert_data, values)
                connection.commit()
            except:
                continue
        cursor.close()
        connection.close()
        total_data.append(
            {"news_url": news_url, "content": content, "upvotes": upvotes, "posted_on": posted_on, "comments": comments}
        )
    context_dict = {"total_data": total_data}
    return context_dict
def getUrl(item):
    url_name = re.split('&&', item)
    url = url_name[1]
    name = url_name[0]
    print url
    print name
    html_name = name + '.html'
    print html_name
    g = Goose({'stopwords_class': StopWordsChinese})
    article = g.extract(url=url)
    # print article.raw_html
    currentDir = os.getcwd() + '/' + 'pages' + '/' + name
    if not os.path.exists(currentDir):
        os.makedirs(currentDir)
    f = open(currentDir + '/' + html_name, 'a')
    google_transfer = open("transfer.js").read()
    print google_transfer
    f.write(google_transfer + article.raw_html)
    f.close()
    print article.title
    f_md = open(currentDir + '/' + name + '.md', 'a')
    f_md.write(article.cleaned_text.encode('utf-8'))
    f_md.close()
def extract(URL):
    """
    This function extracts the page's text body from the given URL.

    Return:
        page_title: the value of the <title> html tag
        text_extracted: the extracted body text
        img: top_image url extracted
    """
    g = Goose()
    text, text_type = _get_html_content_from_url(URL)
    if text_type != 'text/plain':
        #article = g.extract(url=URL)
        article = g.extract(raw_html=text)
        img = ''
        try:
            img = article.top_image.src
        except:
            img = ''
        return (article.title, article.cleaned_text, img)
    else:
        print "it's a plain/text"
        return ('plaintext', text, 'n/a')
def HTMLParser(url):
    response = get(url)
    extractor = Goose()
    article = extractor.extract(raw_html=response.content)
    text = article.cleaned_text
    return str(text.encode("ascii", "ignore"))
def cmd_readstream(args, t, active_events):
    import textwrap
    from goose import Goose, Configuration
    config = Configuration()
    config.enable_image_fetching = False
    g = Goose(config)

    raw_stream = True
    for arg in args:
        if arg == "articles":
            raw_stream = False

    for event in active_events:
        print event
        if event.query_id.startswith("TS13"):
            corpus = cuttsum.corpora.EnglishAndUnknown2013()
        elif event.query_id.startswith("TS14"):
            corpus = cuttsum.corpora.SerifOnly2014()
        else:
            raise Exception("Bad query id: {}".format(event.query_id))

        if raw_stream is True:
            from cuttsum.trecdata import SCChunkResource
            si_iter = SCChunkResource().streamitem_iter(event, corpus)
        else:
            from cuttsum.pipeline import ArticlesResource
            si_iter = ArticlesResource().streamitem_iter(event, corpus)

        for hour, path, si in si_iter:
            if si.body.clean_visible is not None:
                print si.stream_id
                try:
                    text_height = t.height - 4
                    #n_chars = t.
                    article = g.extract(raw_html=si.body.clean_html)
                    lines = textwrap.wrap(article.cleaned_text)
                    idx = 0
                    while 1:
                        print t.clear
                        print "hour:", hour
                        print "title:", article.title
                        print "article:"
                        print "\n".join(lines[idx:idx + text_height])
                        #print article.cleaned_text
                        with t.cbreak():
                            char = t.inkey()
                            if char == "i" and idx > 0:
                                idx -= 1
                                #idx - 1 if idx > 0 else 0
                            elif char == "k" and idx + text_height < len(lines):
                                idx += 1
                            elif char == "l":
                                break
                except Exception as e:
                    print e
                    continue
def get_article_parts(html):
    """Take HTML and return extracted headline and body."""
    g = Goose({'use_meta_language': False, 'enable_image_fetching': False})
    try:
        article = g.extract(raw_html=html)
    except:
        return None, None
    return article.title.strip().encode('utf-8'), article.cleaned_text.strip().encode('utf-8')
def GetDesc_goose(self, url):
    try:
        g = Goose({'stopwords_class': StopWordsChinese})
        article = g.extract(url=url)
        return str(article.cleaned_text).replace('\t', '').replace('\r', '').replace('\b', '').replace('"', "'").encode('utf-8')
    except Exception as ex:
        l.Warning("Goose_Crawl Failed %s" % str(ex))
        return "NULL"
def get_full_text(self):
    if self.text_url:
        g = Goose()
        page = g.extract(self.text_url)
        print >> sys.stderr, "Extracting text for %s" % page.title
        self.text = page.cleaned_text
    else:
        print >> sys.stderr, 'No script found for %s' % self.title
def goose_extractor_content(url):
    g = Goose()
    article = g.extract(url=url)
    #article.title             ## returns article title
    #article.meta_description  ## returns article meta description
    #article.top_image.src     ## returns path for main image in article
    #article = g.extract(url=url)  ## these scrape full text from article
    return article.cleaned_text