Example #1
0
 def parse_urls(cls, urlList):
     """Fetch every URL in *urlList*, strip the HTML, and feed the text
     into a fresh parser instance.

     Returns the accumulated happenings from the parser.
     """
     instance = cls()
     for address in urlList:
         page_text = webutils.clean_html(webutils.read_url(address, "utf-8"))
         instance.feed(page_text)
     instance.close()
     return instance.get_happenings()
Example #2
0
 def parse_urls(cls, urlList):
     """Fetch every URL in *urlList* (decoded as ISO-8859-2), strip the
     HTML, and feed the cleaned text into a fresh parser instance.

     Returns the accumulated happenings from the parser.
     """
     parser = cls()
     for url in urlList:
         # NOTE: iso-8859-2 — presumably the source pages are Polish
         # Latin-2 encoded; verify against the target site.
         content = webutils.clean_html(webutils.read_url(url, "iso-8859-2"))
         parser.feed(content)
     parser.close()
     return parser.get_happenings()
Example #3
0
 def parse_urls(cls, urlList):
     """Fetch every URL in *urlList*, feed the cleaned HTML into a fresh
     parser instance, and record each source URL via ``fit_url``.

     Returns the accumulated happenings from the parser.
     """
     parser = cls()
     for url in urlList:
         content = webutils.clean_html(webutils.read_url(url, "utf-8"))
         parser.feed(content)
         parser.fit_url(url)
     # close() flushes buffered data; it must run once, after all URLs
     # have been fed — the original closed inside the loop, terminating
     # the parser after the first URL.
     parser.close()
     return parser.get_happenings()
Example #4
0
 def obtain_happening_details(cls, url):
     """Fetch *url*, parse it with a fresh parser instance, and return the
     extracted happening dict.

     Returns ``None`` when *url* is falsy, and an empty dict when parsing
     the page fails.
     """
     if not url:
         return
     content = webutils.clean_html(webutils.read_url(url))
     happening_parser = cls()
     happening_parser.feed(content)
     happening_parser.close()
     happening = {}
     try:
         happening = happening_parser.get_happening()
     except Exception:
         # Narrowed from a bare ``except:`` so KeyboardInterrupt /
         # SystemExit are no longer swallowed; parse failures still fall
         # back to the empty dict.
         print("Błąd parsowania " + url)
     return happening
Example #5
0
 def extract_nexts(self, content):
     """Collect follow-up article URLs from the mp.pl diabetes section.

     NOTE(review): the *content* argument is ignored — the page is
     re-fetched from ``base_url`` instead; confirm whether that is
     intentional before relying on the parameter.

     Returns at most one ``{'source_url': ...}`` dict in a list.
     """
     base_url = "http://mp.pl/cukrzyca"
     content = webutils.read_url(base_url)
     # Raw string for the regex so backslashes are literal by construction.
     patternUrl = r'http\:\//www.mp.pl\/cukrzyca\/aktualnosci\/\d+,.*'
     link_extractor = webutils.LinkExtractor(patternUrl)
     link_extractor.feed(content)
     urls = []
     for result in link_extractor.results:
         happening = {'source_url': result.get('url')}
         # De-duplicate while preserving first-seen order.
         if happening not in urls:
             urls.append(happening)
     # Only the first link is followed, presumably to limit crawl depth —
     # TODO confirm this cap is intentional.
     return urls[:1]
Example #6
0
 def obtain_happening_details(cls, url, counter):
     """Fetch *url*, parse it, and recursively gather linked happenings
     up to a small depth bound.

     Returns ``None`` when *url* is falsy, and an empty dict when parsing
     the page fails.
     """
     # Local rebinding only — ints are immutable, so the caller's counter
     # is unchanged; the bound below uses the incremented local value.
     counter += 1
     if not url:
         return
     content = webutils.clean_html(webutils.read_url(url))
     happening_parser = cls()
     happening_parser.feed_parsers(happening_parser, content)
     urls = happening_parser.extract_nexts(content)
     if counter < 3:
         # NOTE(review): ``happenings.extract(...)`` looks like it may
         # have been meant as ``extend`` — confirm against the class
         # definition before changing it.
         happening_parser.happenings.extract(happening_parser.obtain_happenings_details(urls))
     # Initialize before the try so the return below cannot raise
     # NameError when get_happening() fails (the original left
     # ``happening`` unbound on that path).
     happening = {}
     try:
         happening = happening_parser.get_happening()
     except Exception:
         # Narrowed from a bare ``except:``; parse failures fall back to
         # the empty dict.
         print("Błąd parsowania " + url)
     return happening