def parse_urls(cls, urlList):
    """Fetch every URL (UTF-8 pages), feed the cleaned HTML into one
    parser instance, and return the accumulated happenings.

    NOTE(review): ``parse_urls`` is defined again further down with a
    different page encoding — confirm which definition should survive,
    since a later one shadows this.
    """
    parser = cls()
    for address in urlList:
        raw_page = webutils.read_url(address, "utf-8")
        parser.feed(webutils.clean_html(raw_page))
    parser.close()
    return parser.get_happenings()
def parse_urls(cls, urlList):
    """Parse happenings from a list of URLs whose pages are encoded in
    ISO-8859-2 (Latin-2).

    Each page is fetched, cleaned with ``webutils.clean_html`` and fed
    into a single parser instance; the parser's collected happenings are
    returned after all URLs are processed.

    NOTE(review): this redefines ``parse_urls`` with a different page
    encoding than the earlier definition — confirm which is intended.
    """
    parser = cls()
    for url in urlList:
        # Fix: removed leftover debug print statements that dumped the
        # full page content to stdout on every iteration.
        content = webutils.clean_html(webutils.read_url(url, "iso-8859-2"))
        parser.feed(content)
    parser.close()
    return parser.get_happenings()
def parse_urls(cls, urlList):
    """Parse happenings from a list of URLs (UTF-8 pages), registering
    each page's source URL on the parser via ``fit_url``.

    Returns the parser's accumulated happenings once every URL has been
    fetched, cleaned and fed in.
    """
    parser = cls()
    for url in urlList:
        # Fix: removed a debug print and a commented-out try/except
        # scaffold that silently swallowed all errors.
        content = webutils.clean_html(webutils.read_url(url, "utf-8"))
        parser.feed(content)
        # Associate the parsed content with the URL it came from.
        parser.fit_url(url)
    parser.close()
    return parser.get_happenings()
def obtain_happening_details(cls, url):
    """Fetch *url* and parse a single happening from its cleaned HTML.

    Returns:
        None when *url* is falsy; an empty dict when parsing fails (the
        failure is reported on stdout); otherwise the dict produced by
        the parser's ``get_happening``.
    """
    if not url:
        return
    content = webutils.clean_html(webutils.read_url(url))
    happening_parser = cls()
    happening_parser.feed(content)
    happening_parser.close()
    happening = {}
    try:
        happening = happening_parser.get_happening()
    # Fix: narrowed a bare `except:` which also swallowed SystemExit and
    # KeyboardInterrupt.
    except Exception:
        print("Błąd parsowania " + url)
    return happening
def extract_nexts(self, content):
    """Collect links to further happening pages from the site index.

    NOTE(review): the *content* parameter is ignored — the method always
    re-fetches http://mp.pl/cukrzyca and scans that page instead.
    Confirm whether the parameter should be used.

    Returns:
        A list of ``{'source_url': <url>}`` dicts, de-duplicated and
        currently truncated to at most one entry.
    """
    base_url = "http://mp.pl/cukrzyca"
    content = webutils.read_url(base_url)
    # Fix: raw string instead of a plain string relying on invalid
    # escape sequences (\: and \/), which emit SyntaxWarnings on modern
    # Python. Matching behavior is unchanged (those escapes matched the
    # literal characters anyway; the unescaped dots in the host name are
    # preserved as-is — presumably harmless wildcards, TODO confirm).
    pattern_url = r'http://www.mp.pl/cukrzyca/aktualnosci/\d+,.*'
    link_extractor = webutils.LinkExtractor(pattern_url)
    link_extractor.feed(content)
    urls = []
    for result in link_extractor.results:
        happening = {'source_url': result.get('url')}
        if happening not in urls:
            urls.append(happening)
    # TODO(review): truncating to a single URL looks like a debugging
    # leftover — confirm before removing. (Debug prints were removed.)
    urls = urls[:1]
    return urls
def obtain_happening_details(cls, url, counter):
    """Fetch *url*, parse its happening, and — while the recursion depth
    stays below 3 — follow links to further happening pages.

    Returns:
        None when *url* is falsy; an empty dict when parsing fails
        (reported on stdout); otherwise the parsed happening dict.
    """
    counter += 1  # increments a local copy only; the caller's value is unchanged
    if not url:
        return
    content = webutils.clean_html(webutils.read_url(url))
    happening_parser = cls()
    happening_parser.feed_parsers(happening_parser, content)
    urls = happening_parser.extract_nexts(content)
    if counter < 3:
        # TODO(review): two likely bugs left as-is pending confirmation:
        # lists have no `extract` method (probably `extend` was meant),
        # and `obtain_happenings_details` looks like a typo for
        # `obtain_happening_details` — and that method takes a single
        # url plus a counter, not a list of dicts.
        happening_parser.happenings.extract(
            happening_parser.obtain_happenings_details(urls))
    # Fix: `happening` was previously unbound when get_happening()
    # raised, turning the parse failure into a NameError at `return`.
    happening = {}
    try:
        happening = happening_parser.get_happening()
    # Fix: narrowed a bare `except:` which also swallowed SystemExit and
    # KeyboardInterrupt.
    except Exception:
        print("Błąd parsowania " + url)
    return happening