class ArticleExtractor:
    def __init__(self):
        self.goose = Goose()

    def article_from_url(self, url):
        return self.goose.extract(url=url)

    def article_from_html(self, html):
        return self.goose.extract(raw_html=html)
def __init__(self, url, author):
    self.url = url
    self.author = author
    goose = Goose({'stopwords_class': StopWordsChinese})
    article = goose.extract(url=url)
    if article.title == '':
        # Fall back to the default extractor when the Chinese stopword
        # configuration fails to find a title.
        goose = Goose()
        article = goose.extract(url=url)
    self.title = article.title
    self.summary = article.cleaned_text[:150]
    self.body = article.cleaned_text
class ArticleExtractor(object):
    def __init__(self):
        self.g = Goose({'stopwords_class': StopWordsChinese})

    def extractUrl(self, url=None):
        if url is not None:
            return self.g.extract(url=url)
        return None

    def extractHtm(self, html=None):
        if html is not None:
            return self.g.extract(raw_html=html)
        return None
def extract(url):
    """Extract the main body text of a web page."""
    g = Goose({'stopwords_class': StopWordsChinese})
    article = g.extract(url=url)
    return article.cleaned_text
def createResource(url):
    if resolve(url) != None:
        url = resolve(url)
    g = Goose()
    a = g.extract(url=url)
    if len(url) > 200:
        print "Long DuckDuckGo links do not work"
        return None
    else:
        r = Resource.objects.filter(url=url)
        if len(r) > 0:
            print "The resource already existed"
            r = r[0]
        else:
            if a.title == None or a.title == "":
                title = "notitle"
            else:
                title = a.title
            try:
                r = Resource.objects.create(title=title, url=url)
            except:
                print "Something went wrong"
                print title
                print url
        print "Created the resource for " + url
        return r
def get_link_data_task(link_id):
    dbsession = get_link_data_task.dbsession
    services = get_link_data_task.services
    flags = get_link_data_task.flags
    if not flags:
        return
    link = services.link.get_link_by_id(link_id)
    if link is None:
        return
    html = None
    if 'screenshot' in flags:
        data, html = services.screenshot.capture(link.url, 1024, 800)
        # TODO: Investigate if this way of generating filename can create clashes
        # TODO: Delete the previous file if it exists
        filename = services.file.create(data, str(uuid.uuid4()) + '.png', 'screenshots')
        link.meta['screenshot'] = filename
    if 'html' in flags:
        link.meta['html'] = html if html else requests.get(link.url).text
    # this should move to a service too
    if 'text' in flags or 'title' in flags:
        goose = Goose()
        a = goose.extract(raw_html=html if html else requests.get(link.url).text)
        if 'text' in flags:
            link.meta['text'] = a.cleaned_text
        if 'title' in flags:
            link.meta['title'] = a.title
    dbsession.commit()  # we are outside the web transaction
class SoloSpider(CrawlSpider):
    name = "solo"
    rules = (Rule(LinkExtractor(), callback='parse_items', follow=True),)

    def __init__(self, **kw):
        super(SoloSpider, self).__init__(**kw)
        url = kw.get('url') or kw.get('domain')
        self.g = Goose()
        self.url = url
        self.allowed_domains = [url]
        self.start_urls = ['http://www.' + url]

    def parse_items(self, response):
        gooseobj = self.g.extract(response.url)
        fulltext = gooseobj.cleaned_text
        il = ItemLoader(item=SoloItem(), response=response)
        il.default_output_processor = MapCompose(
            lambda v: v.rstrip(),
            lambda v: re.sub(r'[\',|!]', '', v),
            lambda v: re.sub(r'\s+', ' ', v)
        )
        il.add_value('siteurl', parse_base_url(response.url))
        il.add_value('pageurl', response.url)
        il.add_value('text', fulltext.encode('ascii', 'ignore'))
        il.add_xpath('pagetitle', '//title/text()')
        return il.load_item()
class GooseAPI:
    def __init__(self, url):
        self.url = url
        self.goose = Goose()
        self.extracted_content = None

    def extract(self):
        self.extracted_content = self.goose.extract(url=self.url)
        return {
            'title': self.extracted_content.title,
            'summary': self.extracted_content.meta_description,
            'content': self.extracted_content.content_html,
            'published_at': self.extracted_content.publish_date,
            'assets': self.images()
        }

    def images(self):
        images = []
        for image in self.extracted_content.images:
            images.append({
                'url': image.src,
                'width': image.width,
                'height': image.height,
                'type': 'image'
            })
        return images
def createResource(url):
    if len(url) > 200:
        print "Long DuckDuckGo links do not work"
        return None
    else:
        r = Resource.objects.filter(url=url)
        if len(r) > 0:
            print "The resource already existed"
            r = r[0]
        else:
            g = Goose()
            try:
                a = g.extract(url=url)
            except:
                a = None
            if a is None or a.title is None or a.title == "":
                title = "notitle"
            else:
                title = a.title
            try:
                tags = ["one", "two"]
                r = Resource.objects.create(title=title, url=url, status=Resource.ADDED)
                r.tags.add("one two")
            except TypeError as e:
                print e
                print "Something went wrong"
                print title
                print url
        print "Created the resource for " + url
        return r
def parse_input(text, extractor='newspaper'):
    if isinstance(text, str) or isinstance(text, unicode):
        if text.startswith(('http://', 'https://')):
            # Input is a link - need to extract the text from html
            if extractor.lower() == 'goose':
                from goose import Goose
                urlparse = Goose()
                article = urlparse.extract(url=text)
                return unicode_to_ascii(article.cleaned_text)
            else:
                from newspaper import Article
                article = Article(text)
                article.download()
                article.parse()
                return unicode_to_ascii(article.text)
        elif text.endswith('.txt'):
            # Input is a file - need to read it
            textfile = open(text, 'rb')
            article = textfile.read()
            textfile.close()
            return unicode_to_ascii(article)
        else:
            # Input is a string containing the raw text
            return unicode_to_ascii(text)
    else:
        raise ValueError('Input text must be of type str or unicode.')
def fetch_content_for_url(url):
    try:
        g = Goose()
        article = g.extract(url=url)
        return article.cleaned_text
    except:
        return ''
def get_article(self, html):
    config = self.getConfig()
    self.parser = config.get_parser()
    g = Goose(config=config)
    return g.extract(url="http://www.null.com", raw_html=html)
def hackers_news(): total_data = [] obj = get_context() base_url, target_url = obj.urls() parsed_source = obj.get_parsed_source(base_url, target_url) news_urls = parsed_source.xpath("//table[@id='hnmain']//table//tr[@class='athing']") for each_data in news_urls: news_url = each_data.xpath(".//td[@class='title']//span[@class='deadmark']//following-sibling::a[1]//@href") news_url = "".join(news_url) upvotes = each_data.xpath(".//following-sibling::tr[1]//td[@class='subtext']//span//text()") upvotes = "".join(upvotes) posted_on = each_data.xpath( ".//following-sibling::tr[1]//td[@class='subtext']//span//following-sibling::a[2]//text()" ) posted_on = "".join(posted_on) comments = each_data.xpath( ".//following-sibling::tr[1]//td[@class='subtext']//span//following-sibling::a[3]//text()" ) comments = "".join(comments) g = Goose() article = g.extract(url=news_url) content = article.cleaned_text content = " ".join(content.split()).replace("\n", "").replace("\t", "").replace("\r", "") try: content = content.encode("utf-8").decode("ascii", "ignore").encode("ascii") except: try: content = content.decode("ascii", "ignore").encode("ascii") except: try: content = content.encode("utf-8") except: content = "No news found" connection, cursor = obj.get_connection() duplicate_query = "SELECT news_url FROM hackers_news WHERE news_url=%s" duplicate_values = (news_url,) cursor.execute(duplicate_query, duplicate_values) duplicate_data = cursor.fetchall() if duplicate_data: insert_data = "update hackers_news set upvotes =" + upvotes + ",comments=" + comments + " where news_url=%s" values = (news_url,) cursor.execute(insert_data, values) connection.commit() else: try: insert_data = ( "insert into hackers_news(news_url,news_content,upvotes,posted_on,comments) values(%s,%s,%s,%s,%s)" ) values = (news_url, content, upvotes, posted_on, comments) cursor.execute(insert_data, values) connection.commit() except: continue cursor.close() connection.close() total_data.append( {"news_url": news_url, "content": content, "upvotes": upvotes, "posted_on": posted_on, "comments": comments} ) context_dict = {"total_data": total_data} return context_dict
def crawlerWebLink(url):
    g = Goose()
    article = g.extract(url=url)
    print(article.title)
    print(article.meta_description)
    print(article.cleaned_text)
def save(self, *args, **kwargs):
    from goose import Goose
    from text.blob import TextBlob
    g = Goose()
    article = g.extract(url=self.url)
    try:
        b = TextBlob(article.title)
        lang = b.detect_language()
    except:
        lang = 'en'
    g = Goose({'use_meta_language': False, 'target_language': lang, 'paper_class': 'soup'})
    if not self.title:
        self.title = article.title
    if not self.newspaper:
        self.newspaper = article.domain
    if not self.content:
        self.content = article.cleaned_text
    try:
        if article.top_image.src:
            layout = Photo()
            # layout.photo = "images/news/" + str(self.id) + ".jpg"
            layout.url = article.top_image.src
            layout.article = self
            layout.save()
    except:
        pass
    super(Article, self).save()
def process_item(self, item, spider):
    if "pdf_Link" in item:
        pdfName = item["report_name"] + u".pdf"
        PDFPath = os.path.join(PDF_PATH, item["source_name"])
        if not os.path.exists(PDFPath):
            os.makedirs(PDFPath)
        filepath = os.path.join(PDFPath, pdfName)
        try:
            content = self.downloadPDF(item["pdf_Link"], filepath)
            item["report_content"] = content
        except:
            self.jsonInfoStored(item, pdfName)
            log.msg("pdf download failure, information is serializing to json files", level=log.INFO)
    elif "content_Link" in item:
        from goose import Goose
        from goose.text import StopWordsChinese
        try:
            g = Goose({"stopwords_class": StopWordsChinese})
            article = g.extract(url=item["content_Link"])
            content = article.cleaned_text
            del item["content_Link"]
            item["report_content"] = content
        except:
            log.msg("Content extraction failure from page:%s" % item["report_link"], level=log.INFO)
    return item
def scrape_category(url, c_label):
    extract_feed_world = "http://pipes.yahoo.com/pipes/pipe.run?_id=a625f9823d9b5c4858865b107dcc2516&_render=json&urlinput1=%s" % urllib.quote_plus(url)
    data_world = urllib2.urlopen(extract_feed_world)
    json_data_world = json.load(data_world)
    for item in json_data_world['value']['items']:
        # link = urllib2.urlopen(item['link'])
        # link = link.geturl()
        if not [x for x, y in enumerate(Categorized_Labeled_Article.objects.all()) if (y.url == item['link'])]:
            try:
                cj = cookielib.CookieJar()
                opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
                request = urllib2.Request(item['link'])
                response = opener.open(request)
                url = response.geturl()
                g = Goose()
                article = g.extract(url=url)
                readable_article = article.cleaned_text
                # Save in database
                article = Categorized_Labeled_Article.objects.create(text=readable_article, label=c_label, url=item['link'])
                article.save()
                print article.label
            except (urllib2.HTTPError, UnicodeDecodeError, AttributeError, IOError):
                print "error %s" % item['link']
def download_article(url):
    """
    Download the html content of a news page

    :param url: news page's url
    :type url: string
    :return: news page's content
    :rtype: requests.models.Response
    """
    article = {
        'link': url,
        'source': 'crawler_estadao'
    }
    logger.info("Downloading article: {0}".format(url))
    try:
        response = requests.get(url, timeout=30)
    except Exception as ex:
        logger.exception("Failed to fetch {0}".format(url))
        return None
    extractor = Goose({'use_meta_language': False, 'target_language': 'pt'})
    news = extractor.extract(url=url)
    soup = BeautifulSoup(response.text)
    article['link_content'] = compress_content(response.text)
    article['compressed'] = True
    article['language'] = detect_language(response.text)
    article['title'] = extract_title(news)
    article['body_content'] = extract_content(news)
    article['published_time'] = extract_published_time(url, soup)
    return article
def process_data(threadName, q):
    while not exitFlag:
        queueLock.acquire()
        if not workQueue.empty():
            global Id
            print "%s processing No.%s result page..." % (threadName, Id)
            data = q.get()
            g = Goose()
            resultUrl = data["unescapedUrl"]
            article = g.extract(url=resultUrl)
            item = {}
            item['title'] = data["titleNoFormatting"]
            item['url'] = resultUrl
            item['keyWords'] = keyWords
            item['description'] = article.cleaned_text[:4000]
            if article.top_image:
                item['image'] = article.top_image.src
            else:
                item['image'] = ""
            insert(item)
            Id += 1
            queueLock.release()
        else:
            queueLock.release()
            time.sleep(1)
def GetDesc_goose(self, url):
    article = "NULL"
    try:
        g = Goose({'stopwords_class': StopWordsChinese})
        article = g.extract(url=url)
    except Exception, ex:
        l.Warning("Goose_Crawl Failed %s" % str(ex))
    return article
def extract(URL):
    """
    This function extracts the page's text body from the given URL.

    Return:
        page_title: the value of the <title> html tag
        text_extracted: the extracted body text
        img: top_image url extracted
    """
    g = Goose()
    text, text_type = _get_html_content_from_url(URL)
    if text_type != 'text/plain':
        # article = g.extract(url=URL)
        article = g.extract(raw_html=text)
        img = ''
        try:
            img = article.top_image.src
        except:
            img = ''
        return (article.title, article.cleaned_text, img)
    else:
        print "it's a plain/text"
        return ('plaintext', text, 'n/a')
def extract_title(html):
    """ Extract the body title of a web page """
    g = Goose({'enable_image_fetching': False})
    article = g.extract(raw_html=html)
    return article.title
def download_url(self, url):
    url = self.url
    # g = Goose()
    # g = Goose({'browser_user_agent': 'Mozilla', 'parser_class': 'soup'})
    g = Goose({'parser_class': 'soup'})  # does this parser work for all?
    article = g.extract(url=url)
    self.title = article.title
    self.description = article.meta_description
    self.keywords = article.meta_keywords
    self.content = article.cleaned_text
    self.domain = article.domain
    self.movies = article.movies
    try:
        self.original_image_url = article.top_image.src
    except AttributeError:
        self.original_image_url = ""
    self.favicon_url = article.meta_favicon
    self.final_url = article.final_url  # test
    self.domain_link = article.tags
def categorize(request, article_url):
    # load model
    f = open('my_classifier.pickle')
    classif = pickle.load(f)
    f.close()
    print "loaded model"
    # categorize incoming article
    g = Goose()
    article = g.extract(url=article_url)
    # get list of words
    words = dict()
    article_text = article.cleaned_text
    for word in word_tokenize(article_text):
        words.setdefault(('%s' % word), 0)
        words[('%s' % word)] += 1
    print "got words!"
    classified = classif.classify(words)
    output = ""
    output += "PREDICTED: %s <br>" % classified
    output += "<br><br> %s" % article_text
    return HttpResponse(output)
def extract_body(html):
    """ Extract the body text of a web page """
    g = Goose({'enable_image_fetching': False})
    article = g.extract(raw_html=html)
    return article.cleaned_text
def on_pubmsg(self, serv, ev):
    canal = ev.target()
    message = ev.arguments()[0].lower()
    if self.channels[canal].has_user("Yppy"):
        return
    url = re.search("(?P<url>https?://[^\s]+)", message)
    if url:
        url = url.group(0)
        try:
            self.lasturl = url
            hostname = urlparse.urlparse(url).hostname
            g = Goose()
            article = g.extract(url=url)
            tinyurl = urllib2.urlopen("http://tinyurl.com/api-create.php?url=" + url).read()
            title = article.title.encode('utf-8')[:70]
            ret = "Title : %s (%s) | %s" % (title, hostname, tinyurl)
            serv.privmsg(canal, ret)
        except:
            # todo log error
            e = sys.exc_info()[0]
            print(e)
            return
    if "!sum" in message:
        try:
            response = unirest.post("http://192.81.222.194:1142/api", {}, {"url": self.lasturl})
            print response.body
            for bullet in response.body:
                serv.privmsg(canal, ("* %s" % (bullet).encode('utf-8')))
        except:
            # todo log error
            e = sys.exc_info()[0]
            print(e)
            return
def _article(self):
    """Analyse resource content, return Goose interface"""
    # switch method depending on content_type
    # for pdf, fall back to tesseract if pdf2text yields not much
    # (then use the larger, or maybe a composite)
    g = Goose()
    return g.extract(raw_html=self._decode())
class Html_parser(object):
    """Use goose to parse raw html."""

    def __init__(self, need_stem):
        # set up goose
        config = Configuration()
        config.enable_image_fetching = False
        self._g = Goose(config)
        self._need_stem = need_stem

    def get_text(self, file_path):
        raw_html = ""
        with open(file_path) as f:
            raw_html = f.read()
        if not raw_html:
            return None
        try:
            article = self._g.extract(raw_html=raw_html)
        except lxml.etree.ParserError as e:
            return None
        text = article.title + ".\n" + article.cleaned_text
        if self._need_stem:
            text = re.sub("\w+", do_stem, text)
            # words = re.findall("\w+", text, re.MULTILINE)
            # w = map(stem, words)
            # text = " ".join(w)
        return text
def scrape(url):
    """
    Function to request and parse a given URL. Returns only the "relevant" text.

    Parameters
    ----------
    url : String.
        URL to request and parse.

    Returns
    -------
    text : String.
        Parsed text from the specified website.
    meta : String.
        Parsed meta description of an article. Usually equivalent to the lede.
    """
    logger = logging.getLogger('scraper_log')
    page = requests.get(url)
    g = Goose()
    try:
        article = g.extract(raw_html=page.content)
        text = article.cleaned_text
        meta = article.meta_description
        return text, meta
    # Generic error catching is bad
    except Exception, e:
        print 'There was an error. Check the log file for more information.'
        logger.warning('Problem scraping URL: {}. {}.'.format(url, e))
def getUrl(item):
    url_name = re.split('&&', item)
    url = url_name[1]
    name = url_name[0]
    print url
    print name
    html_name = name + '.html'
    print html_name
    g = Goose({'stopwords_class': StopWordsChinese})
    article = g.extract(url=url)
    # print article.raw_html
    currentDir = os.getcwd() + '/' + 'pages' + '/' + name
    if not os.path.exists(currentDir):
        os.makedirs(currentDir)
    f = open(currentDir + '/' + html_name, 'a')
    f.write(article.raw_html)
    f.close()
    # print article.title
    f_md = open(currentDir + '/' + name + '.md', 'a')
    f_md.write(article.cleaned_text.encode('utf-8'))
    f_md.close()
def articleExtractor():
    url = request.args.get('url', '')
    articleObject = []
    print("Program started ...")
    articleExtractor = Goose()
    article = articleExtractor.extract(url=url)
    # build article content
    articleBody = ""
    for letter in article.cleaned_text:
        articleBody += str(letter.encode('utf-8', 'ignore'))
    articleObject.append(article.title)
    articleObject.append(article.meta_description)
    articleObject.append(articleBody)
    # return article main text
    return jsonify(articleObject)
def stockQuery(stockSymbol): """ Returns a table with various information about a stock""" stockSymbol = stockSymbol.upper() end_date = datetime.date.today() start_date = datetime.date.today() - datetime.timedelta(days=2) seed_url = "https://www.google.com/finance/company_news?q=" + stockSymbol + "&ei=kT3SWJGvNMeguASQ3K7wCw&startdate=" + str( start_date) + "&enddate=" + str(end_date) + "&start=1&num=15" #seed_url1="https://www.google.com/finance/company_news?q=AAPL&ei=kT3SWJGvNMeguASQ3K7wCw&startdate=2017-03-21&enddate=2017-03-23&start=1&num=15" return_list = [] r = requests.get(seed_url) soup = BeautifulSoup(r.content, 'lxml') url_list = soup.find_all('a', id='n-cn-') for url in url_list: table = { 'title': None, 'text': None, 'img_url': None, 'url': None, 'publish_date': None } url = url['href'] url = url[url.find('url=') + 4:url.find('&cid')] g = Goose() article = g.extract(url=url) try: title = article.title table['title'] = title except Exception, e: table['title'] = "" try: text = article.cleaned_text table['text'] = text except Exception, e: table['text'] = ""
def initial_check():
    url_link = "http://fetchrss.com/rss/59549c628a93f872018b4567709026440.xml"
    # get all the links of news title
    links = []
    text = []
    title = []
    rss = feedparser.parse(url_link)
    for post in rss.entries:
        links.append(post.link)
        title.append(post.title_detail.value)
    oldlinks = rssdata.objects.values_list('link', flat=True)
    print("old links are: \n ", oldlinks)
    for i in range(0, len(links)):
        if links[i] not in oldlinks:
            response = get(links[i])
            extractor = Goose()
            article = extractor.extract(raw_html=response.content)
            texts = article.cleaned_text
            news_story = texts.encode('utf-8')
            print("new links:\n", links[i])
            extract(links[i], news_story, title[i])
def processURL(url):
    toReturn = {}
    score = svm.compute(url)
    t = lxml.html.parse(url)
    title = t.find(".//title").text
    response = get(url)
    extractor = Goose()
    article = extractor.extract(raw_html=response.content)
    file = article.cleaned_text
    keywords = nlp.generateEntity(file)
    toReturn['title'] = title
    toReturn['score'] = score
    toReturn['keywords'] = keywords
    toReturn['url'] = url
    return json.dumps(toReturn)
def download_article(url):
    article = {'link': url, 'source': 'crawler_oglobo'}
    logger.info("Downloading article: {0}".format(url))
    try:
        response = requests.get(url, timeout=30)
    except Exception as ex:
        logger.exception("Failed to fetch {0}. Exception: {1}".format(url, ex))
        return None
    extractor = Goose({'use_meta_language': False, 'target_language': 'pt'})
    news = extractor.extract(url=url)
    soup = BeautifulSoup(response.text)
    article['link_content'] = compress_content(response.text)
    article['compressed'] = True
    article['language'] = detect_language(response.text)
    article['title'] = extract_title(news)
    article['published_time'] = extract_published_time(soup)
    article['body_content'] = extract_content(news)
    return article
def retrieve_data_for_link(param):
    logging.debug('retrieve_data_for_link - param = {}'.format(param))
    (full_link, tmp_news_folder) = param
    link = full_link[0]
    google_title = full_link[1]
    link_datetime = full_link[2]
    compliant_filename_for_link = slugify(link)[:50]
    max_len = 100
    if len(compliant_filename_for_link) > max_len:
        logging.debug(
            'max length exceeded for filename ({}). Truncating.'.format(
                compliant_filename_for_link))
        compliant_filename_for_link = compliant_filename_for_link[:max_len]
    pickle_file = '{}/{}.pkl'.format(tmp_news_folder, compliant_filename_for_link)
    already_fetched = os.path.isfile(pickle_file)
    if not already_fetched:
        try:
            """html = download_html_from_link(link)
            soup = BeautifulSoup(html, 'html.parser')
            content = get_content(soup)
            full_title = complete_title(soup, google_title)
            """
            goose_client = Goose()
            g_content = goose_client.extract(url=link)
            article = {
                'link': link,
                'title': g_content.title,
                'content': g_content.cleaned_text,
                'meta_description': g_content.meta_description,
                'datetime': link_datetime
            }
            pickle.dump(article, open(pickle_file, 'wb'))
        except Exception as e:
            logging.error(e)
            logging.error(
                'ERROR - could not download article with link {}'.format(link))
            pass
def clean_pp_html(url, pp_html):
    """
    Cleans the privacy policy html of html tags

    :param url: the pp url
    :param pp_html: the pp html
    :return: the clean html
    """
    ret_val = ''
    try:
        print("processing the following url {}".format(url))
        tempfile.tempdir = os.getcwd()
        g = Goose()
        ret_val = g.extract(raw_html=pp_html).cleaned_text
    except Exception as e:
        print(e)
    if ret_val == '':
        try:
            soup = BeautifulSoup(pp_html)
            ret_val = soup.body.getText()
        except Exception as ee:
            print(ee)
    return ret_val
def download_article(url):
    """
    Download the html content of a news page

    :param url: news page's url
    :type url: string
    :return: news page's content
    :rtype: requests.models.Response
    """
    article = {'link': url, 'source': 'crawler_folha_sao_paulo'}
    logger.info("Downloading article: {0}".format(url))
    try:
        response = requests.get(url, timeout=30)
    except Exception as ex:
        logger.exception("Failed to fetch {0}".format(url))
        return None
    extractor = Goose({'use_meta_language': False, 'target_language': 'pt'})
    news = extractor.extract(url=url)
    soup = BeautifulSoup(response.content)
    article['link_content'] = compress_content(response.text)
    article['compressed'] = True
    article['language'] = detect_language(response.text)
    article['title'] = extract_title(news)
    article['category'] = extract_category(url)
    article['published_time'] = extract_published_time(soup)
    content = extract_content(news, soup)
    if len(content) == 2:
        article['link'], article['body_content'] = content
    else:
        article['body_content'] = content
    return article
class SoloSpider(CrawlSpider):
    name = "solo"
    rules = (Rule(LinkExtractor(), callback='parse_items', follow=True),)

    def __init__(self, **kw):
        super(SoloSpider, self).__init__(**kw)
        url = kw.get('url') or kw.get('domain')
        self.g = Goose()
        self.url = url
        self.allowed_domains = [url]
        self.start_urls = ['http://www.' + url]
        # self.link_extractor = LinkExtractor()

    def parse_items(self, response):
        # print 'PARSE ITEMS'
        gooseobj = self.g.extract(response.url)
        fulltext = gooseobj.cleaned_text
        il = ItemLoader(item=SoloItem(), response=response)
        il.default_output_processor = MapCompose(
            lambda v: v.rstrip(),
            lambda v: re.sub(r'[\',|!]', '', v),
            lambda v: re.sub(r'\s+', ' ', v))
        il.add_value('siteurl', self.parse_base_url(response.url))
        il.add_value('pageurl', response.url)
        il.add_value('text', fulltext.encode('ascii', 'ignore'))
        il.add_xpath('pagetitle', '//title/text()')
        yield il.load_item()

    def parse_base_url(self, url):
        url = re.sub(r'((http(s)?://)?(www.)?)', '', url.lower())  # strip head
        # print url.find('/')
        return url[:url.find('/')] if url.find('/') != -1 else url
def catchpg(x, dir_to_write, file_id):
    g = Goose()
    print '=== Start ==='
    print x
    try:
        a = g.extract(url=x)
        to_write = a.cleaned_text.replace('\r', '').replace('\n', ' ').strip() + '\n'
        to_write += a.title + '\n'
        to_write += a.top_image.src
        # translate(string.maketrans(string.punctuation, ' ' * len(string.punctuation)))
        if len(to_write.strip()) > 0:
            output = open(dir_to_write + os.sep + str(file_id), 'wb')
            output.write(to_write.encode('utf-8'))
            print 'caught ^_^Y'
        else:
            print 'None -_-!', x
    except Exception as e:
        print e
        print 'Missed -_-!', x
    print '=== End ===\n'
def extract_article_content(urls_extracted):
    connection = pymysql.connect(host, user=user, port=port, passwd=password, db=dbname)
    for url in urls_extracted:
        print url
        g = Goose()
        article = g.extract(url=url)
        article_title = re.sub(
            r'(?mis)[\[\]\!\@\#\$\%\&\*\`\~\^\-\_\"\{\}\:\;\<\>\'\/\\\|\(\)\n\r]*', '',
            article.title).encode('utf-8')
        if len(article_title) == 0:
            article_title = " "
        article_content_1 = re.sub(
            r'(?mis)[\[\]\!\@\#\$\%\&\*\`\~\^\-\_\"\{\}\:\;\<\>\/\'\\\|\(\)\n\r]*', '',
            article.cleaned_text).encode('utf-8')
        if len(article_content_1) == 0:
            article_content_1 = " "
        cursor = connection.cursor()
        sql = "INSERT INTO article_data(url,title,article_content,added_dt) VALUES ('{0}','{1}','{2}','{3}')".format(
            url, article_title, article_content_1, datetime.today().strftime("%Y-%m-%d"))
        # try:
        cursor.execute(sql)
        # Commit your changes in the database
        connection.commit()
        # except:
        #     print "yes"
        #     connection.rollback()
    connection.close()
def initial_check():
    print("here")
    url_link = "http://fetchrss.com/rss/5bf76e868a93f84c038b45675bf76e658a93f869028b4567.xml"
    # get all the links of news title
    links = []
    text = []
    title = []
    rss = feedparser.parse(url_link)
    for post in rss.entries:
        links.append(post.link)
        title.append(post.title_detail.value)
    oldlinks = rssdata.objects.values_list('link', flat=True)
    # print oldlinks
    # print links
    for i in range(0, len(links)):
        if links[i] not in oldlinks:
            response = get(links[i])
            extractor = Goose()
            article = extractor.extract(raw_html=response.content)
            texts = article.cleaned_text
            news_story = texts.encode('utf-8')
            # print(news_story)
            extract(links[i], news_story, title[i])
def extract_entry_data(url, fetch_images=True):
    """
    Fetch the full content for a feed entry url.

    Args:
        | url (str) -- the url of the entry.

    Returns:
        | entry_data -- Goose object.
        | str -- the full text, including html.
    """
    html = _get_html(url)
    g = Goose()
    g.config.enable_image_fetching = fetch_images
    try:
        # Use Goose to extract data from the raw html,
        # Use readability to give us the html of the main document.
        return g.extract(raw_html=html), Document(html).summary()
    except UnicodeDecodeError as e:
        logger.exception('UnicodeDecodeError with html: {0}'.format(html))
        return None, ''
def getArticle(self, url, raw_html, language=None):
    g = Goose({'stopwords_class': StopWordsChinese})
    article = g.extract(url=url, raw_html=raw_html)
    return article
# install Goose https://github.com/grangier/python-goose
#
# Done so far: basic keyword extraction using tagger works.
#
# Concerns about keyword extraction using the Tagger library:
# https://github.com/apresta/tagger
# - the dictionary should be built from corpora relevant to the article to be
#   more effective at attracting attention in the immersive interface
# - TF-IDF is a function provided in the module build_dict... if articles
#   in the collection ever accumulate enough around one subject, use TF-IDF
#
# immediate todos:
# - implement multitag

from goose import Goose
import tagger
import pickle

url = "http://www.theverge.com/2014/9/11/6136443/the-largest-predatory-dinosaur-ever-was-half-duck-half-crocodile"

g = Goose()
article = g.extract(url=url).cleaned_text

weights = pickle.load(open('data/dict.pkl', 'rb'))  # or your own dictionary
mytagger = tagger.Tagger(tagger.Reader(), tagger.Stemmer(), tagger.Rater(weights))
best_tags = mytagger(article, 6)
print best_tags
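# Hypothetical companion sketch (not from the original sources): one way to
# build the 'data/dict.pkl' weights dictionary referenced above from a small
# corpus of already-extracted articles. The weighting below is a plain
# IDF-style score computed with the standard library only; tagger's own
# build_dict module may use a different convention, so treat this purely as
# an assumption-laden illustration, not the library's API.
import math
import pickle
import re


def build_weights(documents):
    """Return a {word: idf_weight} dict from an iterable of raw text documents."""
    doc_freq = {}
    n_docs = 0
    for text in documents:
        n_docs += 1
        words = set(re.findall(r"[a-z]+", text.lower()))
        for word in words:
            doc_freq[word] = doc_freq.get(word, 0) + 1
    # Rarer words get higher weights (simple smoothed IDF).
    return {word: math.log(float(n_docs + 1) / (df + 1)) + 1.0
            for word, df in doc_freq.items()}


# Example usage with a couple of previously extracted article texts.
corpus = [article]  # extend with more cleaned_text strings as they accumulate
weights = build_weights(corpus)
with open('data/dict.pkl', 'wb') as fp:
    pickle.dump(weights, fp)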
def get_text(url):
    g = Goose()
    article = g.extract(url=url)
    with codecs.open(article.link_hash + ".speech", "w", "utf-8-sig") as text_file:
        text_file.write(article.cleaned_text)
def run(keyword): headers = {'User-agent': "HotJava/1.1.2 FCS"} logging.debug('KEYWORD = {}'.format(keyword)) #generate_articles(keyword) safe_keyword = "+".join(keyword.split(" ")) link = "https://www.quora.com/search?q=%s"%safe_keyword session = get_tor_session() print(session.get("http://httpbin.org/ip").text) response = session.get(link, headers=headers, timeout=20) print response.status_code if response.status_code != 200: pass html = response.content soup = BeautifulSoup(html, 'html.parser') # q_data = {ques_link: [ques_text, [{ # external_link, link_text, answer_id, answer_text, # meta_keyword, meta_description, meta_title, image, video, favicon, domain}]]} q_data = {} print list(soup.find_all("a", {"class": "question_link"})) for i in soup.find_all("a", {"class": "question_link"}): print i print i.text, i['href'] ques_link = "https://www.quora.com/%s"%i['href'] ques_response = session.get(ques_link, headers=headers, timeout=20) if ques_response.status_code != 200: session = get_tor_session() ques_response = session.get(ques_link, headers=headers, timeout=20) ques_html = ques_response.content ques_soup = BeautifulSoup(ques_html, 'html.parser') answers = ques_soup.find_all("div", {"class": "AnswerBase"}) ans_links = ques_soup.find_all("span", {"class": "qlink_container"}) print list(answers) if len(list(answers)) and len(list(ans_links)): q_data[i['href']] = {'text': i.text, 'links' : []} for ans in answers: print ans.text external_links = ans.find_all("span", {"class": "qlink_container"}) for e_link in external_links: if len(list(e_link.children)): a_link = list(e_link.children)[0] if bool(urlparse.urlparse(a_link['href']).netloc): print a_link['href'] link_url = a_link['href'] if 'https://www.quora.com/_/redirect' in a_link['href']: try: link_url = filter(lambda y: y[0] == 'url', map(lambda x: x.split("="), urlparse.urlparse(link_url).query.split("&")))[0][1] except: print sys.exc_info() if len(urlparse.urlparse(link_url).path) < 3: pass signal.signal(signal.SIGALRM, g_timeout_handler) signal.alarm(20) try: # Got external link goose_client = Goose() g_content = goose_client.extract(url = link_url) q_data[i['href']]['links'].append({ 'title': g_content.title, 'meta_description': g_content.meta_description, 'image': g_content.top_image.src \ if g_content.top_image else '-', 'video': g_content.movies[0].src \ if len(g_content.movies) else '-', 'favicon': g_content.meta_favicon, 'domain': g_content.domain, 'a_link': a_link['href'], 'a_link_text': a_link.text, 'a_link_answer_id': ans.get('id'), 'a_link_answer_text': ans.text, }) except Exception as ex: if "goose_timeout" in ex: print "Goose Timeout!" else: print "New Error", ex q_data[i['href']]['links'].append({ 'a_link': a_link['href'], 'a_link_text': a_link.text, 'a_link_answer_id': ans.get('id'), 'a_link_answer_text': ans.text, }) finally: signal.alarm(0) json_file_n = safe_keyword + ''.join(random.choice(string.ascii_uppercase + string.digits) \ for _ in range(5)) + '.json' with open(json_file_n, 'w') as json_file: json.dump(q_data, json_file) return
text_file = open( "./headlines/headline" + str(year) + str(month) + ".txt", "w") text_file_arti = open( "./articles/article" + str(year) + str(month) + ".txt", "w") print(str(year) + str(month)) value = api.query(year, month) val = value['response']['docs'] for v in val: for l in lines: try: if l.lower() in v['headline']['main'].lower(): head += (str(count) + " " + v['pub_date'][0:10] + " " + v['headline']['main'] + '\n') response = get(v['web_url']) extractor = Goose() article1 = extractor.extract(raw_html=response.content) text = article1.cleaned_text if text == "": article = article + (str(count) + " " + v['pub_date'][0:10] + " " + v['snippet'] + '\n') else: article = article + ( str(count) + " " + v['pub_date'][0:10] + " " + (text.encode('utf-8').strip() ).decode('utf-8').strip() + '\n') print(str(count)) count = count + 1 break except: pass
def generate_feature_matrix(wiki, data, n_concepts=10, **word_concept_params): """ Transforms a given data source to a corresponding feature matrix and label vector based on the "Bag of Concepts" model which uses Wikipedia as an exogenous knowledge source for Word Sense Disambiguation and as additional domain knowledge. Contains logging code which is displayed depending on the currently set logging level of the root logger. :param wiki: WikiIndex instance to some database index :param data: data labels loaded using a load_data_source method :param n_concepts: number of concepts to use per page. :param word_concept_params: word concept parameters to use for generation of concepts. :return: Numpy Feature Matrix and Label Vector. """ config = Configuration() config.enable_image_fetching = False config.use_meta_language = False goose = Goose(config) results = {} concepts = set() # Iterate through the data and perform training for index, (abs_path, label) in enumerate(data.items()): if not os.path.exists(abs_path): continue with open(abs_path, 'r') as fp: html_text = fp.read() # Determine relative path using a simple heuristic cutoff = abs_path.find('pages/') rel_path = abs_path[cutoff + 6:] logging.info('\n%d: http://%s' % (index, rel_path[:-3])) article = goose.extract(raw_html=html_text) if len(article.cleaned_text) > 500: logging.info('%s (%s)', article.title, label) search_results, terms, query_vector = wiki.word_concepts( article.cleaned_text, article.title, **word_concept_params) if search_results: results[abs_path] = [(sr.page_id, sr.weight) for sr in search_results[:n_concepts]] # Remove any concepts which have a weight of 0 results[abs_path] = filter(lambda x: x[1] > 0, results[abs_path]) for search_result in search_results[:n_concepts]: concepts.add(search_result.page_id) logging.info(search_results[:n_concepts]) else: logging.warn('No word concepts returned') else: logging.info('Document is of insufficient length') shape = (len(results), len(concepts)) concepts_index = dict([(b, a) for (a, b) in enumerate(concepts)]) feature_matrix = np.zeros(shape=shape) label_vector = np.zeros(len(results)) for i, (abs_path, page_list) in enumerate(results.iteritems()): label_vector[i] = 1 if data[abs_path] is not None else 0 for page_id, weight in page_list: j = concepts_index[page_id] feature_matrix[i, j] = weight return feature_matrix, label_vector
import sys
from goose import Goose
import codecs

filename = sys.argv[1]

try:
    with open("tmp/htmls/" + filename, "rb") as f:
        html = f.read()
except:
    print("No file named as : ", filename)
    sys.exit(0)

g = Goose()
article = g.extract(raw_html=html)

with codecs.open("tmp/texts/" + filename, "w", "utf-8") as g:
    g.write(article.cleaned_text)

print("Finished html-to-text : " + filename)
def gooseExample():
    g = Goose()
    url = "http://www.chinadaily.com.cn/a/201712/22/WS5a3c7473a31008cf16da2d9e.html"
    article = g.extract(url=url)
    print(article.title)
    print(article.cleaned_text[:150])
doc = open(doc_path, 'r') doc = codecs.open(doc_path, encoding='utf-8', mode='r') text = doc.read() text = text.encode('ascii', 'ignore') elif doc_type == '-w': g = Goose() # determine if this is a New York Times url, in which case # we cannot use goose alone and must also rely on urllib2 sites = 'www.(nytimes)|(theonion)' if re.search(sites, doc_path): print('handling special case') # do the nytimes thing opener = urllib2.build_opener(urllib2.HTTPCookieProcessor()) response = opener.open(doc_path) raw_html = response.read() article = g.extract(raw_html=raw_html) text = article.cleaned_text.encode('ascii', 'ignore') else: # just use goose article = g.extract(url=doc_path) text = article.cleaned_text.encode('ascii', 'ignore') # Tokenize the document to be summarized tok = PunktSentenceTokenizer() doc = ' '.join(text.strip().split('\n')) sentences = tok.tokenize(doc) # Pass tokenized document to tr_func.normalize # which generates a graph containing vertices # for each sentence in the document
def spider(): #url = raw_input("Enter a website to crawl articles from: ") print "Crawling from KseStocks Business..." #nltk.download('stopwords') r = requests.get("http://ksestocks.com/NewsCentral/Business_News") data = r.text soup = BeautifulSoup(data, "lxml") dict = {} counter = 0 for link in soup.find_all('a'): # print(link.get('href')) dict[counter] = link.get('href') counter += 1 print dict print "URLs DICTIONARY" # print urls_dict print "\n\nGoose Beginning from here \n" dict_of_validated_urls = {} for key, value in dict.iteritems(): # print dict[key] check = validators.url(dict[key]) # print check if check: dict_of_validated_urls[key] = value ####### Pass URL of article here ########## print dict_of_validated_urls print len(dict_of_validated_urls) keywords = { 0: "twitter", 1: "facebook", 2: "fashion", 3: "entertainment", 4: "epaper", 5: "sport", 6: "politics", 7: "images", 8: "obituary", 9: "watch-live", 10: "herald", 11: "supplements", 12: "classifieds", 13: "aurora", 14: "cityfm", 15: "#comments", 16: "expo", 17: "nnews", 18: "latest-news", 19: "category", 20: "videos", 21: "tv-shows", 22: "urdu", 23: "live", 24: "php", 25: "trending", 26: "privacy", 27: "about", 28: "aspx", 29: "faq", 30: "talent", 31: "ratecardon", 32: "advertise" } print "Validation\n" for key in dict_of_validated_urls.keys(): for values in keywords.values(): if values in dict_of_validated_urls[key]: print dict_of_validated_urls[key] del dict_of_validated_urls[key] break print len(dict_of_validated_urls) print dict_of_validated_urls dict_of_articles = {} counter = 0 for key, value in dict_of_validated_urls.iteritems(): dict_of_articles[counter] = value counter += 1 counter = 0 print dict_of_articles dict_of_cleaned_urls = {} for key, value in dict_of_articles.items(): if value not in dict_of_cleaned_urls.values(): dict_of_cleaned_urls[key] = value print "Clean URLs:" print dict_of_cleaned_urls text = "" filtered_sentence = [] dict_of_cleaned_articles_and_titles = {} cnx = mysql.connector.connect(user='******', password='******', host='localhost', database='articles') cursor = cnx.cursor() for key in dict_of_cleaned_urls.keys(): url = dict_of_cleaned_urls[key] g = Goose() article = g.extract(url=url) print article.title # dict_of_cleaned_articles_and_titles[article.title] print "Title printed" print "\n" # print article.meta_description text = article.cleaned_text stop_words = set(stopwords.words('english')) word_tokens = word_tokenize(text) filtered_sentence = [w for w in word_tokens if not w in stop_words] for words in filtered_sentence: #filtered_sentence = words.encode('ascii', 'ignore') filtered_sentence = words.encode("utf-8") print filtered_sentence print type(filtered_sentence) filtered_sentence = str(filtered_sentence) for w in word_tokens: if w not in stop_words: # filtered_sentence.append(w) filtered_sentence += " " + w print "Filtered:" print filtered_sentence print "Text printed" # print article.top_image.src # dict_of_cleaned_articles_and_titles[article.title] = filtered_sentence data = (article.title, filtered_sentence) # data = (title, file_text) cursor.execute( "SELECT Title, COUNT(*) FROM articles_table WHERE Title = %s GROUP BY Title", (article.title, )) # query = msg = cursor.fetchone() # check if it is empty and print error if not msg: cursor.execute( "insert into articles_table (Title, Text) values(%s,%s)", (data)) # cursor.execute(add_to_db_query, data) cnx.commit() print "Added to Database" id = "[]" delstatmt = "DELETE FROM articles_table WHERE Text = %s" cursor.execute(delstatmt, (id, 
)) cnx.commit() cursor.close() cnx.close() #option = raw_input("\nPress q to quit or any other to restart program: ") #print "\n" #if option == 'q': # exit() print "Done Crawling and Cleaned Database!\n"
BASE_URL = 'https://www.fxstreet.com/cryptocurrencies/news?q=&hPP=50&idx=FxsIndexPro&p=0&is_v=1' client_response = Page(BASE_URL) source = client_response.html soup = BeautifulSoup(source, 'html.parser') all_links = soup.find_all('h4', class_='fxs_headline_tiny') #goose to extract content #install goose again by reaching cd ~ directory and doing steps mentioned #https://github.com/grangier/python-goose g = Goose({'browser_user_agent': 'Mozilla', 'parser_class': 'soup'}) #you don't have to close csv file F = csv.writer(open("fxNewsLink.csv", 'w')) for elem in all_links: link = elem.contents[1]['href'] article = g.extract(url=link) content = article.cleaned_text #to get date of article we oprn the link and extract time element response = urlopen(link).read() date_soup = BeautifulSoup(response, 'html.parser') date = date_soup.find_all('time')[0]['datetime'] date = dateparser.parse(date) timestamp = datetime.now() title = article.title #encoding was required as ascii unicode popped up #list of element used as strings have commas and these commas act as delimiters #so prevent normal commas to act as delimiters we used a list and csv package out = [ link.encode("utf-8"),
def tokenize(text):
    min_length = 3
    words = map(lambda word: word.lower(), word_tokenize(text))
    words = [word for word in words if word not in cachedStopWords]
    tokens = list(map(lambda token: PorterStemmer().stem(token), words))
    p = re.compile('[a-zA-Z]+')
    filtered_tokens = filter(
        lambda token: p.match(token) and len(token) >= min_length, tokens)
    return filtered_tokens


from goose import Goose

if __name__ == "__main__":
    url = 'http://www.reuters.com/article/global-oil-idUSL3N16408T'
    g = Goose()
    article = g.extract(url=url)
    a = article.cleaned_text
    html_dict = []
    tokenhtml = tokenize(a)
    print(tokenhtml)
    for i in range(0, len(tokenhtml)):
        body = ''
        body += tokenhtml[i] + ' '
        html_dict.append({"label": "0", "text": body})
    sc = SparkContext()
    htmldata = sc.parallelize(html_dict)
    labels = htmldata.map(lambda doc: doc["label"], preservesPartitioning=True)
    tf = HashingTF().transform(
        htmldata.map(lambda doc: doc["text"], preservesPartitioning=True))
def getArticle(self, url, raw_html, language=None):
    config = Configuration()
    config.enable_image_fetching = False
    g = Goose(config=config)
    article = g.extract(url=url, raw_html=raw_html)
    return article
def get_texteaser(url):
    g = Goose()
    article = g.extract(url=url)
    response = unirest.post("http://x.textteaser.com/api", {},
                            {"token": apikey, "text": article.cleaned_text, "title": article.title})
    print response.body
    return json.dumps(response.body['sentences'])
print '--------------------------------------------' print articleno print link print '--------------------------------------------' r = urllib.urlopen(link).read() soup = BeautifulSoup(r, "lxml") text = soup.find('div', class_='columnLeft') if text is None: continue text = text.find('p') date = soup.find('span', class_='timestamp') articles.append([ date.get_text().encode('utf-8'), text.get_text().encode('utf-8') ]) else: print '--------------------------------------------' print articleno print link print '--------------------------------------------' article = goose.extract(url=link) date = article.publish_date text = article.cleaned_text.encode('utf-8') if date is None or text is None: continue articles.append([date, text]) with open('applefoolarticles.csv', 'wb') as f: writer = csv.writer(f) writer.writerows(articles)
def crawl_news(news_pool, min_body_len, doc_dir_path, doc_encoding): i = 1 for newslink in news_pool: try: response = urllib.request.urlopen(newslink, timeout=10) html = response.read() except Exception as e: print("URL-Request-----%s: %s"%(type(e), newslink)) continue try: soup = BeautifulSoup(html, 'lxml') # http://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/ div1 = soup.find('div',class_='qq_mainArea') if repr(div1) != "None": cmt_id_start = div1.text.find('cmt_id') cmt_id_end = div1.text.find('cmt_is_group') cmt_id = div1.text[cmt_id_start+9:cmt_id_end] cmt_id_end = cmt_id.find(';') cmt_id = cmt_id[0:cmt_id_end] title = div1.find('h1').text time = div1.find('span',class_='a_time').text body = div1.find('div',class_='Cnt-Main-Article-QQ').text else: continue try: commentlist = getComments(cmt_id, limit*max_iter) except: commentlist = ["NULL"] try: commentnum = getCommentsNum(cmt_id) except: commentnum = str(len(commentlist)) except: print("Crawl URL " + newslink + " failed.") commentlist = ["NULL"] continue doc = ET.Element("doc") ET.SubElement(doc, "source").text = "Tencent" ET.SubElement(doc, "id").text = "%d"%(i) ET.SubElement(doc, "url").text = newslink ET.SubElement(doc, "title").text = title ET.SubElement(doc, "datetime").text = time#time[0:16] body_cleaned = re.sub("[A-Za-z0-9\[\`\~\!\@\#\$\^\&\*\(\)\=\|\{\}\'\:\;\'\,\[\]\.\<\>\/\?\~\!\@\#\\\&\*\%]", "", body) if len(body_cleaned)/len(body) <= 0.85: try: g = Goose({'stopwords_class': StopWordsChinese}) article = g.extract(url=newslink) body = article.cleaned_text ET.SubElement(doc, "body").text = body #title_cleaned = article.title except: ET.SubElement(doc, "body").text = body_cleaned if len(body_cleaned)/len(body) <= 0.5: ET.SubElement(doc, "body").text = "Potential video or image news." else: print(len(body_cleaned)/len(body)) ET.SubElement(doc, "body").text = body comment = '\r\n'.join(list(commentlist)) #comment_cleaned = re.sub("[A-Za-z0-9\[\`\~\!\@\#\$\^\&\*\(\)\=\|\{\}\'\:\;\'\,\[\]\.\<\>\/\?\~\!\@\#\\\&\*\%]", "", comment) ET.SubElement(doc, "comments").text = comment ET.SubElement(doc, "comments_num").text = commentnum tree = ET.ElementTree(doc) tree.write(doc_dir_path + time.replace(' ','-').replace(':','-') + "_%d.xml"%(i), encoding = doc_encoding, xml_declaration = True) i += 1