def convert_text(text, base_url, output_format):
    """
    Convert an HTML document to output markup; attempts to find a plausible
    article to excerpt.

    Arguments:
    text          -- the HTML source of the original document
    base_url      -- the base URL of the original document
    output_format -- the output format to use; accepts anything supported by Pandoc

    Returns: the converted document fragment
    """
    out_html = '\n\n'.join(_extract(text, base_url))

    # create a new DOM document from the joined blockquotes
    out_dom = BeautifulSoup(out_html, features='html.parser')

    # convert all href, src, and srcset attributes to absolute URLs
    for attr in ('href', 'src'):
        for node in out_dom.findAll(**{attr: True}):
            node[attr] = urllib.parse.urljoin(base_url, node[attr])
    for node in out_dom.findAll(srcset=True):
        node['srcset'] = _rewrite_srcset(node['srcset'], base_url)

    # strip out attributes we don't want
    for attr in ('id', 'class'):
        for node in out_dom.findAll(**{attr: True}):
            del node[attr]

    return pypandoc.convert_text(out_dom.decode_contents(), output_format, 'html')
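# Hedged sketch (not part of the original module): a self-contained illustration of the
# href/src rewriting step used in convert_text above; the sample HTML and base URL are
# invented for demonstration, and only bs4 and urllib are required.
from bs4 import BeautifulSoup
import urllib.parse

def _demo_absolutize(html, base_url):
    dom = BeautifulSoup(html, features='html.parser')
    for attr in ('href', 'src'):
        for node in dom.findAll(**{attr: True}):
            node[attr] = urllib.parse.urljoin(base_url, node[attr])
    return dom.decode_contents()

# _demo_absolutize('<a href="/post/1">more</a>', 'https://example.com/blog/')
# -> '<a href="https://example.com/post/1">more</a>'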
def get_link(url):
    links = []
    try:
        req = requests.get(url, headers=headers)
        soup = BeautifulSoup(req.text, 'lxml')
        # page says "you are visiting too frequently", i.e. a captcha is required
        if soup.text.find('您访问过于频繁') >= 0:
            print('需要启用验证码', soup.decode_contents())  # "captcha needs to be enabled"
            get_code(soup.decode_contents())
            return ''
        links = soup.select('#page_list > ul > li > a')
    except Exception as ex:
        print(ex)
    for link in links:
        href = link.get('href')
        # print(href)
        get_info(href)
        time.sleep(1)
def _get_artist_longbio(self, artist_page_soup):
    """
    Extract the artist long bio.

    Args
        artist_page_soup: soup of the artist page html.

    Return
        (list of strings) each list element corresponds to a paragraph.
    -----------------------------------------------------------------------
    """
    bio_link = artist_page_soup.find("p", {"class": "biography"}).find("a")["href"]
    self.conn.browser.get(bio_link)
    WebDriverWait(self.conn.browser, 20).until(
        EC.presence_of_element_located((By.CLASS_NAME, "text")))
    soup = BeautifulSoup(self.conn.browser.page_source, 'html.parser')
    soup = BeautifulSoup(soup.decode_contents(), 'lxml')
    soup = soup.find("div", {"itemprop": "reviewBody"})
    long_bio = [
        x.strip() for x in soup.text.split("\n")
        if (len(x.strip()) > 0 and x.find("<") == -1)
    ]
    return long_bio
def test_strainer(css, input_html, output_html, **kwargs):
    parse_only = strainer_from_css(css, **kwargs)
    input_soup = BeautifulSoup('<main>%s</main>' % input_html, 'lxml',
                               parse_only=parse_only)
    assert input_soup.decode_contents().strip() == output_html
def amplify_html(rendered_html):
    bs = BeautifulSoup(rendered_html)
    for image in bs.find_all('img', attrs={'src': True}):
        amp_img = bs.new_tag('amp-img',
                             src=image.get("src"),
                             alt=image.get("alt", ""),
                             layout="responsive",
                             width=image.get("width", 550),
                             height=image.get("height", 368))
        amp_img['class'] = image.get("class", "")
        image.replace_with(amp_img)
    for iframe in bs.find_all('iframe', attrs={'src': True}):
        amp_iframe = bs.new_tag('amp-iframe')
        iframe_src = iframe['src']
        if iframe_src.startswith('//'):
            iframe_src = 'https:{}'.format(iframe_src)
        amp_iframe.attrs['src'] = iframe_src
        if iframe.has_attr('title'):
            amp_iframe.attrs['title'] = iframe['title']
        amp_iframe.attrs['width'] = '200'
        amp_iframe.attrs['height'] = '100'
        amp_iframe.attrs['layout'] = 'responsive'
        amp_iframe.attrs['frameborder'] = iframe.get('frameborder', 0)
        amp_iframe.attrs['sandbox'] = iframe.get(
            'sandbox', 'allow-scripts allow-same-origin')
        iframe.replace_with(amp_iframe)
    # Remove style attribute to remove large bottom padding
    for div in bs.find_all("div", {'class': 'responsive-object'}):
        del div['style']
    return bs.decode_contents()
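# Hedged usage sketch for amplify_html above: the sample markup is invented, and the
# expected result is described loosely rather than byte-for-byte, since serialization
# details depend on the parser bs4 picks by default.
# amplify_html('<p><img src="/cat.jpg" width="300" height="200" alt="cat"/></p>')
# -> the <img> is replaced by an <amp-img layout="responsive" ...> carrying the same
#    src/alt/width/height, wrapped in whatever body/html tags the default parser adds.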
def _parse_line_breaks_in_dom_elem(self, element: BeautifulSoup) -> str:
    '''Converts a DOM element of type BeautifulSoup into a string,
    replacing line breaks with \n
    '''
    element = element.decode_contents()
    parsed_string = str(element).replace('<br/>', '\n').replace('<br>', '\n')
    return parsed_string.strip()
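# Hedged sketch: the same <br>-to-newline idea in isolation; the sample element is invented.
from bs4 import BeautifulSoup
_el = BeautifulSoup('<p>line one<br/>line two</p>', 'html.parser').p
# _el.decode_contents().replace('<br/>', '\n')  -> 'line one\nline two'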
def sanitize_html(html_value, valid_tags=VALID_TAGS):
    soup = BeautifulSoup(html_value)
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()
    for tag in soup.find_all(True):
        if tag.name not in valid_tags:
            tag.hidden = True
    return soup.decode_contents()
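# Hedged sketch of the tag.hidden whitelist trick used above: a hidden tag is omitted from
# the output while its children are kept. The inline whitelist here is a local stand-in,
# not the module's VALID_TAGS constant.
from bs4 import BeautifulSoup
_soup = BeautifulSoup('<p>ok <b>bold</b> <script>alert(1)</script></p>', 'html.parser')
for _t in _soup.find_all(True):
    if _t.name not in ('p', 'b'):
        _t.hidden = True
# _soup.decode_contents() -> '<p>ok <b>bold</b> alert(1)</p>'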
def getReply(self):
    soup = BeautifulSoup(self.driver.page_source, 'html.parser')
    soup = soup.find(
        "div", class_="flash flash-warning panel panel-warning").find(
            class_='panel-heading')
    replies = soup.decode_contents(formatter="html")
    reply = replies.split("<br/><br/>")[0]
    print(reply[3:])
    return jsonify({'reply': reply[3:]})
def test_sitemap_xml(self):
    """
    Check we can retrieve a list of all URLs in the URL map at /sitemap.xml
    """
    response = self.client.get("/sitemap.xml")
    self.assertIn("application/xml", response.headers.get("content-type"))
    soup = BeautifulSoup(response.data, features="html.parser")
    expected_soup = BeautifulSoup(
        open("./tests/fixtures/templates/sitemap.xml").read().replace("\n", ""),
        features="html.parser",
    )
    self.assertEqual(soup.decode_contents(),
                     expected_soup.decode_contents())
def absolute_links(html, scheme='//', request=None):
    """
    1. All links become absolute and get target=_blank.
    2. The cellpadding, cellspacing and border attributes are added to all tables.
    """
    site = get_current_site(request)
    soup = Soup(html, 'html5lib')
    for tag in soup.findAll('a'):
        href = tag.get('href')
        if not href:
            continue
        tag['target'] = '_blank'
        if href.startswith('//'):
            tag['href'] = '%s%s' % (scheme, href[2:])
        elif href.startswith('/'):
            tag['href'] = '%s%s%s' % (scheme, site.domain, href)

    for tag in soup.findAll('img'):
        if tag.has_attr('height'):
            del tag['height']

        src = tag.get('src')
        if not src:
            continue
        if src.startswith('//'):
            tag['src'] = '%s%s' % (scheme, src[2:])
        elif src.startswith('/'):
            tag['src'] = '%s%s%s' % (scheme, site.domain, src)

        # srcset
        srcset = tag.get('srcset')
        if not srcset:
            continue
        srcset_final = []
        for srcset_part in srcset.split(','):
            url, width = srcset_part.strip().split()
            if url.startswith('//'):
                url = '%s%s' % (scheme, url[2:])
            elif url.startswith('/'):
                url = '%s%s%s' % (scheme, site.domain, url)
            srcset_final.append('%s %s' % (url, width))
        tag['srcset'] = ','.join(srcset_final)

    # Add attributes to tables
    for tag in soup.findAll('table'):
        for attr in ('border', 'cellpadding', 'cellspacing'):
            if not tag.has_attr(attr):
                tag[attr] = '0'

    return soup.decode_contents()
def _soup_artist_page(self):
    """
    Soup the artist page.

    Return
        Soup of the artist page
    -----------------------------------------------------------------------
    """
    soup = BeautifulSoup(self.conn.browser.page_source, 'html.parser')
    soup = BeautifulSoup(soup.decode_contents(), 'lxml')
    return soup
def parse_ns_token(ns_token):
    ns_node = BeautifulSoup(ns_token, 'lxml-xml').NS
    assert ns_node is not None
    err_type = ns_node.attrs.get('type', '').strip()
    if ns_node.i is None and ns_node.c is None:
        # error is detected but not edited
        ori = cor = ns_node.decode_contents().strip()
    else:
        ori = ns_node.i.decode_contents().strip() if ns_node.i else ''
        cor = ns_node.c.decode_contents().strip() if ns_node.c else ''
    return Edit(ori, cor, err_type)
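# Hedged sketch: invented NS tokens of the shape parse_ns_token above expects, i.e. an
# <NS> node whose optional <i>/<c> children hold the original and corrected spans.
# parse_ns_token('<NS type="Vt"><i>has went</i><c>went</c></NS>')
# -> roughly Edit('has went', 'went', 'Vt')
# parse_ns_token('<NS type="Um">a typo</NS>')
# -> roughly Edit('a typo', 'a typo', 'Um')  (detected but not edited)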
def sanitize_html(html_value, valid_tags=VALID_TAGS, valid_attributes=VALID_ATTRIBUTES):
    """
    Maybe we should have used Bleach (https://github.com/jsocol/bleach)
    """
    soup = BeautifulSoup(html_value)
    for tag in soup.find_all(True):
        if tag.name not in valid_tags:
            tag.hidden = True
        else:
            # it might have bad attributes; copy the keys so deleting while iterating is safe
            for attr in list(tag.attrs.keys()):
                if attr not in valid_attributes:
                    del tag[attr]
    return soup.decode_contents()
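# Hedged sketch: the attribute whitelist above in isolation; the sample tag and the local
# whitelist are invented, and the keys are copied into a list so deletion during iteration
# is safe (the same fix applied in the function above).
from bs4 import BeautifulSoup
_tag = BeautifulSoup('<a href="/x" onclick="evil()">x</a>', 'html.parser').a
for _attr in list(_tag.attrs.keys()):
    if _attr not in ('href', 'title'):
        del _tag[_attr]
# str(_tag) -> '<a href="/x">x</a>'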
def _goto_artist_from_song(self, song_query):
    """
    Navigate to the artist page from a song page string.

    Args
        song_query: the last part of the song page url.
    """
    self.conn.query("/song/" + quote(song_query, safe=''))
    WebDriverWait(self.conn.browser, 20).until(
        EC.presence_of_element_located((By.CLASS_NAME, "song-artist")))
    soup = BeautifulSoup(self.conn.browser.page_source, 'html.parser')
    soup = BeautifulSoup(soup.decode_contents(), 'lxml')
    soup = soup.find("h2", {"class": "song-artist"})
    artist_link = soup.find("a")["href"]
    self.conn.browser.get(artist_link)
def change_html_img_src_to_absolute(self, html_content, parser='lxml'):
    # Note: the body references self.hdu_mirror_url, self.oss and self.problem_id,
    # so this is written as a method rather than a free function.
    html_bs = BeautifulSoup(html_content, parser)
    if html_bs.find('img') is not None:
        for image_tag in html_bs.find_all('img'):
            if 'http' in image_tag.attrs['src']:
                file_url = image_tag.attrs['src']
            else:
                file_url = self.hdu_mirror_url + image_tag.attrs['src']
            file_url = file_url.replace('../', '/')
            image_tag.attrs['src'] = self.oss.upload_file_from_url(
                file_url, diy_prefix='hdu/%s/' % self.problem_id)
        return html_bs.decode_contents(formatter="html")
    return html_content
def fix_relative_links(html: str, url: str) -> str:
    """
    Fixes issue with relative links, replaces '/index.html' with
    'https://domain.com/index.html'
    """
    parsed_uri = urlparse(url)
    domain = '{uri.scheme}://{uri.netloc}'.format(uri=parsed_uri)
    soup = BeautifulSoup(html, 'html.parser')
    for a in soup.find_all('a'):
        if a.has_attr('href') and is_relative(a['href']):
            a['href'] = url + a['href']
    for img in soup.find_all('img'):
        if img.has_attr('src') and is_relative(img['src']):
            img['src'] = domain + img['src']
    return soup.decode_contents()
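# Hedged sketch: why the membership test goes through has_attr()/attrs in the version
# above; a plain `'href' in a` searches the tag's *contents*, not its attributes.
from bs4 import BeautifulSoup
_a = BeautifulSoup('<a href="/index.html">home</a>', 'html.parser').a
# _a.has_attr('href')   -> True
# 'href' in _a.attrs    -> True
# 'href' in _a          -> False (checks children, not attributes)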
def scrape_nfl_dot_com_strip(): if too_soon_to_request_nfl(): # print(f'Returning - only {seconds_elapsed} seconds have passed.') print('Returning') return update_request_time() raw_html = simple_get('https://www.nfl.com/') html = BeautifulSoup(raw_html, 'html.parser') content_string = html.decode_contents() start_location = content_string.find('__INITIAL_DATA__') json_start = content_string.find('{', start_location) json_end = content_string.find('\n', json_start) start_location = content_string.find('__REACT_ROOT_ID__') # Subtract 1 to remove the semicolon that ends the Javascript statement y = json.loads(content_string[json_start:json_end - 1]) # print(json.dumps(y)) for game in y['uiState']['scoreStripGames']: # FIXME: range length must not be hard coded for i in range(1): # print(f"{game['awayTeam']['identifier']}") # print(f"{conference_matchups[i][0]}") if game['awayTeam']['identifier'] == sb_matchups[i][0]: status = game['status'] # print(status) if not status['isUpcoming']: # print('Not upcoming') if status['phaseDescription'] == 'FINAL': print(f"{game['awayTeam']['identifier']} finished") sb_finished[i] = True home_score = game['homeTeam']['scores']['pointTotal'] away_score = game['awayTeam']['scores']['pointTotal'] delta = home_score - away_score # print(f'Delta is {delta}') sb_result[i] = delta in_progress = status['isInProgress'] # print(in_progress) in_progress_overtime = status['isInProgressOvertime'] # print(in_progress_overtime) in_half = status['isHalf'] # print(in_half) if in_progress or in_progress_overtime or in_half: home_score = game['homeTeam']['scores']['pointTotal'] away_score = game['awayTeam']['scores']['pointTotal'] delta = home_score - away_score # print(f'Delta is {delta}') sb_result[i] = delta sb_in_progress[i] = True
def load_bjcp_json():
    try:
        file = open('styleguide-2015.min.json')
        content_string = file.read()
        bjcp_json = json.loads(content_string)
        file.close()
        return bjcp_json
    except FileNotFoundError:
        file = open('testfile.txt', 'w')
        file.write('Hello')
        file.close()
        raw_html = simple_get(
            'https://github.com/gthmb/bjcp-2015-json/blob/master/json/styleguide-2015.min.json'
        )
        html = BeautifulSoup(raw_html, 'html.parser')
        content_string = html.decode_contents()
        start_location = content_string.find('{"styleguide')
        end_location = content_string.find('</td>', start_location)
        y = json.loads(content_string[start_location:end_location])
        return y
def test_a_nfl_page(self):
    raw_html = self.simple_get('https://www.nfl.com/')
    html = BeautifulSoup(raw_html, 'html.parser')
    content_string = html.decode_contents()
    start_location = content_string.find('__INITIAL_DATA__')
    print(start_location)
    json_start = content_string.find('{', start_location)
    print(json_start)
    json_end = content_string.find('\n', json_start)
    print(json_end)
    start_location = content_string.find('__REACT_ROOT_ID__')
    print(start_location)
    y = json.loads(content_string[json_start:json_end - 1])
    # print(json.dumps(y))
    for game in y['uiState']['scoreStripGames']:
        print(game['awayTeam']['identifier'] + ' at ' +
              game['homeTeam']['identifier'])
        print(' Home Team:')
        for key in game['homeTeam'].keys():
            s = game['homeTeam']
            print(f' {key}: {s[key]}')
        print(' Away Team:')
        for key in game['awayTeam'].keys():
            s = game['awayTeam']
            print(f' {key}: {s[key]}')
        print(' Status:')
        for key in game['status'].keys():
            status = game['status']
            print(f' {key}: {status[key]}')
        # print(game.keys())
    # print(f'Response has {len(raw_html)} bytes.')
    game1 = y['uiState']['scoreStripGames'][0]['status']
    game4 = y['uiState']['scoreStripGames'][3]['status']
    print([k for k in game1.keys() if k in game4.keys()])
    self.assertEqual(True, True)
def clean_body(html: str) -> str:
    """Cleans html body from unnecessary elements."""
    soup = BeautifulSoup(html, 'html.parser')
    # Set of selectors whose matches should be removed
    to_remove = [
        'nav', 'aside', 'meta', 'ins', 'footer', 'script',
        'table[class*="infobox"]', 'div[class*="navigation"]',
        'div[class*="footer"]', 'div[class*="social-container"]',
        'div[class*="Left"]', 'div[class*="Share"]', 'div[class*="embed"]',
        'div[class*="crumb"]', 'div[class*="sharing"]', 'div[class*="related"]',
        'div[class*="comments"]', 'div[class*="widget"]', 'div[class*="meta"]',
        'div[class*="noprint"]', 'div[class*="nav"]', 'div[id*="nav"]',
        'table[class*="nav"]', 'span[class*="mw-editsection"]',
        'div[id*="author"]', 'div[class*="author"]', 'span[class*="dsq-postid"]'
    ]
    for selector in to_remove:
        for item in soup.select(selector):
            item.decompose()
    return soup.decode_contents()
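# Hedged usage sketch for clean_body above; the HTML sample is invented.
# clean_body('<div><nav>menu</nav><p>article text</p><footer>(c) site</footer></div>')
# -> '<div><p>article text</p></div>'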
def getDetailBerita(self, link): """ Mengambil seluruh element dari halaman berita """ time.sleep(5) articles = {} #link url = link[0] response = requests.get(url) html = response.text # Create a BeautifulSoup object from the HTML: soup soup = BeautifulSoup(html, "html5lib") #extract title find_title = soup.find('article', class_="newslistouter container-base") title = find_title.find('h1').get_text( strip=True) if find_title else '' articles['title'] = title if ("foto:" in title.lower()) or "video:" in title.lower(): return False #extract subcategory from breadcrumb bc = soup.find('ul', class_="breadcrumb") if not bc: return False sub = bc.findAll('li')[-2].get_text(strip=True) if bc else '' #category articles['category'] = 'Otomotif' articles['subcategory'] = sub #article_url articles['url'] = url #article article = soup.find('div', class_="content") #extract date scripts = json.loads( soup.findAll( 'script', {'type': 'application/ld+json'})[-1].get_text(strip=True)) pubdate = scripts['datePublished'] pubdate = pubdate[0:19].strip(' \t\n\r') articles['pubdate'] = datetime.strftime( datetime.strptime(pubdate, "%Y-%m-%dT%H:%M:%S"), '%Y-%m-%d %H:%M:%S') #articleid articleid = url.replace('/', '') articleid = url.split('-') articleid = int(articleid[-1][-5:]) articles['id'] = articleid #extract editor author = soup.find( 'div', class_="publish-cont").find('a').get_text(strip=True) articles['author'] = author #source articles['source'] = 'oto' #extract comments count articles['comments'] = 0 #extract tags articles['tags'] = scripts['keywords'] #extract images image = article.find('img')['src'] articles['images'] = image detail = article #hapus link sisip for img in detail.findAll('img'): img.decompose() for div in detail.findAll('div'): div.decompose() for src in detail.findAll('p'): if ("sumber:" in src.get_text(strip=True).lower()): src.decompose() for p in detail.findAll('p'): if ("baca juga" in p.get_text(strip=True).lower()) and (p.find('a')): p.decompose() # print(detail) #extract content detail = BeautifulSoup(detail.decode_contents().replace('<br/>', ' '), "html5lib") content = re.sub( r'\n|\t|\b|\r', '', unicodedata.normalize("NFKD", detail.get_text(strip=True))) # print(content) articles['content'] = content #print('memasukkan berita id ', articles['id']) return articles
def getDetailBerita(self, count, link): """ Mengambil seluruh element dari halaman berita """ time.sleep(10) articles = {} #link url = link[0] print(url) try: response = requests.get(url) except: return False html = response.text # Create a BeautifulSoup object from the HTML: soup soup = BeautifulSoup(html, "html5lib") # print(soup) #extract subcategory from breadcrumb bc = soup.find('div', class_="breadcrumb") if not bc: return False sub = bc.findAll('a')[1].get_text(strip=True) if ("foto" in sub.lower()) or ("detiktv" in sub.lower()) or ( "video" in sub.lower()) or ("photos" in sub.lower()) or ( "videos" in sub.lower()): return False articles['subcategory'] = sub #category articles['category'] = link[1] articles['url'] = url article = soup.find('article') #extract date pubdate = soup.find("meta", attrs={'name': 'publishdate'}) if pubdate: pubdate = pubdate['content'].strip(' \t\n\r') articles['pubdate'] = datetime.strftime( datetime.strptime(pubdate, "%Y/%m/%d %H:%M:%S"), '%Y-%m-%d %H:%M:%S') id = soup.find("meta", attrs={'name': 'articleid'}) articles['id'] = int(id['content']) if id else int( datetime.strptime(pubdate, "%Y/%m/%d %H:%M:%S").timestamp()) + len(url) else: pubdate = soup.find('span', {'class': 'date'}) pubdate = pubdate.get_text(strip=True).strip(' \t\n\r').replace( " WIB", '') articles['pubdate'] = datetime.strftime( datetime.strptime(pubdate, "%A, %d %b %Y %H:%M"), '%Y-%m-%d %H:%M:%S') id = soup.find("meta", attrs={'name': 'articleid'}) articles['id'] = int(id['content']) if id else int( datetime.strptime(pubdate, "%A, %d %b %Y %H:%M").timestamp()) + len(url) #extract author author = soup.find("meta", attrs={'name': 'author'}) articles['author'] = author['content'] if author else '' #extract title title = article.find('meta', {"property": "og:title"}) articles['title'] = title.get_text(strip=True) if title else '' #source articles['source'] = 'detik' #extract comments count komentar = soup.find('a', class_="komentar") articles['comments'] = int( komentar.find('span').get_text(strip=True).replace( 'Komentar', '').strip(' \t\n\r')) if komentar else 0 #extract tags tags = article.find('div', class_="detail_tag") articles['tags'] = ','.join( [x.get_text(strip=True) for x in tags.findAll('a')]) if tags else '' #extract images images = article.find('div', class_="pic_artikel") articles['images'] = images.find('img')['src'] if images else '' #extract detail if articles['category'] == 'news': detail = article.find('div', class_="detail_text") else: detail = article.find('div', attrs={"id": "detikdetailtext"}) if not detail: detail = soup.find('div', attrs={"class": "read__content full mt20"}) if not detail: detail = soup.find('div', attrs={"id": "detikdetailtext"}) if not detail: return False #hapus link sisip if detail.findAll('table', class_="linksisip"): for link in detail.findAll('table', class_="linksisip"): link.decompose() #hapus video sisip if detail.findAll('div', class_="sisip_embed_sosmed"): for tag in detail.findAll('div', class_="sisip_embed_sosmed"): tag.decompose() #hapus all setelah clear fix if detail.find('div', class_="clearfix mb20"): for det in detail.find('div', class_="clearfix mb20").findAllNext(): det.decompose() #hapus all script for script in detail.findAll('script'): script.decompose() for p in detail.findAll('p'): if ("baca juga" in p.get_text(strip=True).lower()) and (p.find('a')): p.decompose() #extract content detail = BeautifulSoup(detail.decode_contents().replace('<br/>', ' '), "html5lib") content = re.sub( r'\n|\t|\b|\r', '', 
unicodedata.normalize("NFKD", detail.get_text(strip=True))) articles['content'] = re.sub(r'(Tonton juga).*', '', content) print('memasukkan berita id ', articles['id']) return articles
tag_name.string = args.label
o.bndbox.xmin.string = str(
    int(round(float(o.bndbox.xmin.string)) * scale_width))
o.bndbox.xmax.string = str(
    int(round(float(o.bndbox.xmax.string)) * scale_width))
o.bndbox.ymin.string = str(
    int(round(float(o.bndbox.ymin.string)) * scale_height))
o.bndbox.ymax.string = str(
    int(round(float(o.bndbox.ymax.string)) * scale_height))

xml_out = 'tmp.xml'
if xml_out == xml_in:
    raise Exception('{} will be overwritten'.format(xml_out))
print('Writing ' + xml_out)
f = open('tmp.xml', "w")
f.write(soup.decode_contents())
f.close()

# a bit of a hacky workaround to print a better looking xml than what beautifulsoup produces
xmlf = parse('tmp.xml')
pretty_xml_as_string = xmlf.toprettyxml()
shutil.copyfile(xml_in, xml_in + '.bak')
xml_out = xml_in

# remove empty lines
pretty_xml_as_string = os.linesep.join(
    [s for s in pretty_xml_as_string.splitlines() if s.strip()])
with open(xml_out, 'w') as f:
    f.write(pretty_xml_as_string)
def getDetailBerita(self, link): """ Mengambil seluruh element dari halaman berita """ time.sleep(5) articles = {} #link url = link[0] + '?page=all' print(url) options = Options() options.add_argument('--headless') options.add_argument( '--disable-gpu') # Last I checked this was necessary. options.add_argument('--disable-extensions') options.add_argument("--incognito") driver = webdriver.Chrome("../chromedriver.exe", chrome_options=options) html = '' try: driver.get(url) # Extract HTML texts contained in Response object: html except ConnectionError: driver.quit() print("Connection Error, but it's still trying...") time.sleep(10) details = self.getDetailBerita(link) html = driver.page_source driver.quit() # Create a BeautifulSoup object from the HTML: soup soup = BeautifulSoup(html, "html5lib") scripts = soup.findAll('script', {'type': 'application/ld+json'}) if scripts: scripts = re.sub( r'\n|\t|\b|\r', '', unicodedata.normalize("NFKD", scripts[0].get_text(strip=True))) scripts = json.loads(scripts) else: return False #category categories = soup.findAll('meta', {'name': 'cXenseParse:category'}) articles[ 'category'] = categories[0]['content'] if categories else 'Berita' if len(categories) > 1: articles[ 'subcategory'] = categories[1]['content'] if categories else '' else: articles['subcategory'] = '' articles['url'] = url article = soup.find('div', {'id': 'article_con'}) #extract date pubdate = scripts['datePublished'] pubdate = pubdate[0:19].strip(' \t\n\r') articles['pubdate'] = datetime.strftime( datetime.strptime(pubdate, "%Y-%m-%dT%H:%M:%S"), '%Y-%m-%d %H:%M:%S') id = soup.find('meta', {"property": "android:app_id"}) articles['id'] = int(id['content']) if id else int( datetime.strptime(pubdate, "%d-%b-%Y %H:%M").timestamp()) + len(url) #extract author articles['author'] = scripts['author']['name'] #extract title articles['title'] = scripts['headline'] #source articles['source'] = 'tribunnews' #extract comments count articles['comments'] = 0 #extract tags tags = article.find('div', class_="mb10 f16 ln24 mb10 mt5") articles['tags'] = ','.join([ x.get_text(strip=True).replace('#', '') for x in tags.findAll('a') ]) if tags else '' #extract images articles['images'] = scripts['image']['url'] #extract detail detail = article.find('div', attrs={'class': 'side-article txt-article'}) #hapus video sisip if detail.findAll('div'): for div in detail.findAll('div'): if div.find('script'): div.decompose() #hapus all script for script in detail.findAll('script'): script.decompose() #hapus all noscript for ns in detail.findAll('noscript'): ns.decompose() #hapus linksisip for ls in detail.findAll('p', class_="baca"): if ls.find('strong'): if 'baca' in ls.find('strong').get_text(strip=True).lower(): ls.decompose() #extract content detail = BeautifulSoup(detail.decode_contents().replace('<br/>', ' '), "html5lib") content = re.sub( r'\n|\t|\b|\r', '', unicodedata.normalize("NFKD", detail.get_text(strip=True))) articles['content'] = content print('memasukkan berita id ', articles['id']) return articles
def convertInternal(self, text, wikiPages): soup = BeautifulSoup(text) imgs = [] #Output of doxygen #http://www.stack.nl/~dimitri/doxygen/manual/htmlcmds.html #...and <map> tag #Accepted by mediawiki #http://meta.wikimedia.org/wiki/Help:HTML_in_wikitext #Output from doxygen and not supported by mediawiki #We must convert these #<a href="..."> #<a name="..."> #<img src="..." ...> #<map> #Convert <a>s for a in soup("a"): #A normal link newStr = None if "href" in a.attrs: href = a.attrs["href"] #Get link and fragment portions of href hashPos = href.rfind("#") fragment = "" if hashPos != -1: fragment = href[hashPos:] link = href[:hashPos] else: link = href #Compare to list of wiki pages and change if necessary internalLink = False if link == "" and (fragment == "" or fragment == "#"): #Empty link newStr = "" elif link == "": #Local link with only fragment internalLink = True else: #Test if it matches an internal file, if not, external link for page in wikiPages: if link == page.filename: internalLink = True link = page.normtitle.title break #What's the content? text = a.string tags = a.select("*") if text: #Simple text string if not internalLink: newStr = "[" + href + " " + text + "]" else: newStr = "[[" + link + fragment + "|" + text + "]]" elif len(tags) == 1 and tags[0].name == "img": #One image inside the a tag img = tags[0] imgs.append(ImagePage(self.filepath, img.attrs["src"])) newStr = "[[File:" + img.attrs["src"] + "|link=" + link + fragment + "]]" else: #Something else doxymwglobal.msg(doxymwglobal.msgType.debug, "Unhandled link with unknown contents") newStr = "" #A named anchor or anchor with ID elif "name" in a.attrs or "id" in a.attrs: newStr = soup.new_tag("span") #Named anchors or ID'd anchors just become spans with IDs if "name" in a.attrs: newStr.attrs["id"] = a.attrs["name"] else: #"id" in a.attrs: newStr.attrs["id"] = a.attrs["id"] newStr.attrs["style"] = "width:0;height:0;font-size:0;" else: newStr = "" a.replace_with(newStr) #Convert and store <img>s for img in soup("img"): #File this image for later use imgs.append(ImagePage(self.filepath, img.attrs["src"])) #Convert the image newStr = "[[File:" + img.attrs["src"] + "]]" img.replace_with(newStr) #Convert <maps> #For now just delete them, we'll have to rely on a MW extension for this one later for map in soup("map"): map.replace_with("") return (soup.decode_contents(formatter="html"), imgs)
def get_content(self, url, model): user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36' request = urllib.request.Request(url, headers={'User-Agent': user_agent}) html = urllib.request.urlopen(request).read() soup = BeautifulSoup(html, 'html.parser') links = soup.find_all( 'a', attrs={ 'class': 'tv-widget-idea__title apply-overflow-tooltip js-widget-idea__popup' }, href=True) urls = [] for link in links: url = link['href'] urls.append('https://www.tradingview.com' + url) for url in urls: request = urllib.request.Request( url, headers={'User-Agent': user_agent}) html = urllib.request.urlopen(request).read() soup = BeautifulSoup(html, 'html.parser') title_class = 'tv-chart-view__title-name' title = soup.find( 'h1', attrs={ 'class': title_class }, ).get_text() content_text_class = 'tv-chart-view__description' content = soup.find( 'div', attrs={ 'class': content_text_class }, ).decode_contents() diagram_class = 'tv-card-social-item apply-common-tooltip tv-card-social-item--agrees tv-card-social-item--button tv-card-social-item--border tv-social-row__item' diagram = soup.find( 'span', attrs={'class': diagram_class}, ) diagram_id = diagram['data-image_url'] image = "https://www.tradingview.com/i/" + diagram_id title = str(title) # content = content.replace('<div class="tv-chart-view__description selectable">', '') # content.replace('<span class="tv-chart-view__tag-page-link">', '') # content.replace('</span>', '') soup = BeautifulSoup(content, 'html.parser') for a_tag in soup.findAll('a'): a_tag.unwrap() for span_tag in soup.findAll('span'): span_tag.unwrap() content = soup.decode_contents() content = content[:content.find('<br/>\n<br/>')] try: new_content = model(title=title, text=content, image_url=image, url=url) new_content.save() except IntegrityError: pass
def getDetailBerita(self, link): """ Mengambil seluruh element dari halaman berita """ time.sleep(10) articles = {} #link url = link[0] print(url) try: response = requests.get(url) except ConnectionError: print("Connection Error, but it's still trying...") time.sleep(20) details = self.getDetailBerita(link) html = response.text # Create a BeautifulSoup object from the HTML: soup soup = BeautifulSoup(html, "html5lib") #extract scrip json ld scripts_all = soup.findAll('script', attrs={'type': 'application/ld+json'}) scripts = '' scripts2 = '' if len(scripts_all) >= 2: scripts = re.sub( r'\n|\t|\b|\r', '', unicodedata.normalize("NFKD", scripts_all[-2].get_text(strip=True))) scripts = json.loads(scripts) scripts2 = re.sub( r'\n|\t|\b|\r', '', unicodedata.normalize("NFKD", scripts_all[-1].get_text(strip=True))) scripts2 = json.loads(scripts2) else: return False #category articles['category'] = scripts2['itemListElement'][0]['item']['name'] articles['subcategory'] = scripts2['itemListElement'][1]['item'][ 'name'] articles['url'] = url article = soup.find('section', class_="content-post clearfix") #extract date pubdate = soup.find('time', class_="date") pubdate = pubdate['datetime'] if pubdate else '1970-01-01' pubdate = pubdate.strip(' \t\n\r') articles['pubdate'] = datetime.strftime( datetime.strptime(pubdate, "%Y-%m-%d"), '%Y-%m-%d %H:%M:%S') articles['id'] = int( datetime.strptime(pubdate, "%Y-%m-%d").timestamp()) + len(url) #extract author articles['author'] = scripts['author']['name'] #extract title articles['title'] = scripts['headline'] #source articles['source'] = 'idntimes' #extract comments count # articles['comments'] = int(soup.find('span', class_="commentWidget-total").find('b').get_text(strip=True).strip(' \t\n\r')) articles['comments'] = 0 #extract tags tags = article.find('div', class_="content-post-topic") articles['tags'] = ','.join( [x.get_text(strip=True) for x in tags.findAll('a')]) if tags else '' #extract images articles['images'] = scripts['image']['url'] #extract detail detail = article.find('article', attrs={'id': 'article-content'}) #hapus div if detail.findAll('div'): for div in detail.findAll('div'): if div.find('script'): div.decompose() #hapus link sisip if detail.findAll('strong'): for b in detail.findAll('strong'): if ("baca juga" in b.get_text(strip=True).lower()): b.decompose() #extract content detail = BeautifulSoup(detail.decode_contents().replace('<br/>', ' '), "html5lib") content = re.sub( r'\n|\t|\b|\r', '', unicodedata.normalize("NFKD", detail.get_text(strip=True))) articles['content'] = content print('memasukkan berita id ', articles['id']) return articles
def parse_html(html_str, court_name, flag): try: soup = BeautifulSoup(html_str, "html.parser") soup = BeautifulSoup(str(soup.prettify()), "html.parser") date_h4 = soup.find_all('h4', {'align': 'center'})[0] month_year = str(date_h4.text).replace('JUDGMENTS FOR THE MONTH OF', '').strip() table_list = soup.find_all('table', {'class': 'DISCOVERY3'})[0] table_soup = BeautifulSoup(str(table_list), "html.parser") tr_list = table_soup.find_all('tr') tr_count = 0 for tr in tr_list: emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'") if emergency_exit is not None: if emergency_exit['emergency_exit'] == 1: break tr_count += 1 if tr_count <= 1: continue case_no = "NULL" petitioner = "NULL" respondent = "NULL" judgment_date = "NULL" judge_name = "NULL" subject = "NULL" pdf_data = "NULL" pdf_file = "NULL" # insert_check = False tr_soup = BeautifulSoup(str(tr), "html.parser") td_list = tr_soup.find_all('td') if flag: i = 1 else: i = 0 for td in td_list: i += 1 if i == 2: judgment_day = escape_string(str(td.decode_contents())) judgment_date = str(re.findall('\d+', str(judgment_day))[0]) + ", " + month_year.replace( 'JUDGEMENTS FOR THE MONTH OF', '') if i == 3: a_tag = BeautifulSoup(str(td), "html.parser").a pdf_file = escape_string(str(base_url + a_tag.get('href'))) case_no = escape_string(str(a_tag.text).replace("\n", "").strip()) # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date): # insert_check = True pdf_data = escape_string(request_pdf(str(base_url + a_tag.get('href')), case_no, court_name)) if i == 4: font_tag = BeautifulSoup(str(td), "html.parser").font if font_tag is not None: span_tag = font_tag.span else: span_tag = BeautifulSoup(str(td), "html.parser").span if span_tag is None: span_tag = BeautifulSoup(str(td), "html.parser") party = str(span_tag.decode_contents()).split("<br/>") petitioner = escape_string( str(party[0]).replace('<td align="center" bgcolor="#FFFFFF" valign="middle" width="30%">', '').strip()) petitioner = re.sub(r'(\\x(.){2})', '', petitioner) respondent = escape_string(str(party[2]).replace('</td>', '').strip()) respondent = re.sub(r'(\\x(.){2})', '', respondent) if i == 5: subject = escape_string(str(td.decode_contents()).strip()) if i == 6: judge_name = escape_string(str(td.text).replace(r'\x', '').replace('\\xC2\\x92BLE', '').strip()) judge_name = re.sub(r'(\\x(.){2})', '', judge_name) judge_name = re.sub(r'', '', judge_name, re.U) # if case_no != "NULL" and insert_check and td_list: if case_no != "NULL" and td_list: sql_query = "INSERT INTO " + str(court_name) + " (case_no, petitioner, respondent, judgment_date, " \ "subject, pdf_file, pdf_filename) VALUE ('" + case_no + \ "', '" + petitioner + "', '" + respondent + "', '" + judgment_date + "', '" + subject + \ "', '" + pdf_file + "', '" + court_name + "_" + slugify(case_no) + ".pdf')" insert_query(sql_query) update_query("UPDATE " + court_name + " SET judge_name = '" + str(judge_name) + "' WHERE case_no = '" + str(case_no) + "'") update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) + "' WHERE case_no = '" + str(case_no) + "'") update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'") return True except Exception as e: traceback.print_exc() logging.error("Failed to parse the html: %s", e) update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'") return False
def getDetailBerita(self, link): time.sleep(5) articles = {} #link url = link[0] response = requests.get(url) html = response.text # Create a BeautifulSoup object from the HTML: soup soup = BeautifulSoup(html, "html5lib") #extract subcategory from breadcrumb bc = soup.find('div', class_="breadcrumbs") if not bc: return False cat = bc.findAll('a')[-2].get_text(strip=True) sub = bc.findAll('a')[-1].get_text(strip=True) #articles article_id = soup.find('meta', attrs={"property": "og:image"})['content'] articles['id'] = int(article_id.replace( '//', '').split('/')[6]) if article_id != "" else '' #category #category articles['category'] = cat articles['subcategory'] = sub articles['url'] = url article = soup.find('div', class_="tru") #extract date pubdate = soup.find('meta', attrs={'property': 'og:updated_time'})['content'] pubdate = datetime.fromtimestamp(int(pubdate)) pubdate = datetime.strftime(pubdate, "%Y-%m-%d %H:%M:%S") pubdate_author = soup.find('div', class_='reg').text pubdate_author_split = pubdate_author.split(' \xa0\xa0 • \xa0\xa0 ') articles['pubdate'] = pubdate #extract author author = pubdate_author_split[0] articles['author'] = author #extract title title = soup.find('meta', attrs={"property": "og:title"}) articles['title'] = title['content'] if title else '' if ("foto" in sub.lower()) or "video" in sub.lower(): return False #source articles['source'] = 'metrotvnews' #extract comments count articles['comments'] = 0 #extract tags tags = soup.find('div', class_="line").findAll('a', class_="tag") articles['tags'] = ','.join([x.get_text(strip=True) for x in tags]) #extract images articles['images'] = soup.find('img', class_="pic")['src'] #extract detail detail = soup.find('div', class_="tru") #hapus link sisip for link in detail.findAll('div', class_="related"): link.decompose() #hapus video sisip for tag in detail.findAll('iframe', class_="embedv"): tag.decompose() #hapus all script for script in detail.findAll('script'): script.decompose() for tabel in detail.findAll('table'): tabel.decompose() #extract content detail = BeautifulSoup(detail.decode_contents().replace('<br/>', ' '), "html5lib") content = re.sub( r'\n|\t|\b|\r', '', unicodedata.normalize("NFKD", detail.get_text(strip=True))) articles['content'] = content.strip(' ') #print('memasukkan berita id ', articles['id']) return articles
def main(): csvwriter = csv.writer(file('shop.csv', 'wb')) csvwriter.writerow(['BARCODE', 'INGREDIENTS', 'PRODUCT NAME', 'CATEGORY', 'IMAGE']) # create path if needed if IS_save_html and not os.path.exists(Bin + '/phtml'): os.mkdir(Bin + '/phtml') if not os.path.exists(Bin + '/uploads'): os.mkdir(Bin + '/uploads') DEBUG_BARCODE = None re_ingredients = [ re.compile('原材料名</b>\s*<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), re.compile('原材料名】<br/>\s*(.*?)\s*</p>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), re.compile('原材料名】<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), re.compile('原材料名:\s*(.*?)\s*</span>\s*<br/>\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), re.compile('原材料名</td>\s*<td[^\>]*>\s*(.*?)\s*<hr/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), re.compile('<成分><br/>\s*(.*?)\s*<br/>\s*【'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), re.compile('原材料に含まれるアレルギー物質:?\s*(.*?)\s*</p>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), re.compile('原材料に含まれるアレルギー物質:?\s*</div><div[^\>]*>(.*?)\s*</div>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), re.compile('原材料</b>\s*<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), re.compile('原材料:\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), re.compile('原材料\S?\s*<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), re.compile('原材料</b>\s*<br/>\s*<br/>\s*<br/>\s*(.*?)<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), re.compile('原材料</\w{2,3}>\s*<div[^\>]*>\s*(.*?)</div>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), re.compile('原材料\s*<br/>\s*(<table.*?</table>)'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), re.compile('原材料</b><br/><br/><br/>\s*(<table.*?</table>)'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), ] re_barcodes = [ re.compile(r'JANコード:(\d{13}|\d{8})\b'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), ] url = "http://search.rakuten.co.jp/search/inshop-mall/-/-/sid.242246-st.A?x=35" c = get_url(url) page_now = 1 while True: soup = BeautifulSoup(c) rsrSResultPhoto = soup.find_all('img', attrs={'src': re.compile('ex=96x96')}) rsrSResultPhoto = map(lambda x: x.find_parent('a', attrs={'href': re.compile('http')}), rsrSResultPhoto) rsrSResultPhoto = filter(lambda x: x is not None, rsrSResultPhoto) rsrSResultPhoto = map(lambda x: x['href'], rsrSResultPhoto) if not rsrSResultPhoto: print '## CAN NOT FIND ANY RESULT RELATED TO ' + url break next_page = False pages = soup.find_all('a', attrs={'href': re.compile('-p.\d+-')}) pages = filter(lambda x: x.get_text().strip() == str(page_now + 1), pages) if pages: next_page = pages[0]['href'] page_now = page_now + 1 # if page_now > 10: break to_fix = 0 name, ingredients, image, matched_url = '', '', '', '' for in_url in rsrSResultPhoto: if 'http://item.rakuten.co.jp/book/' in in_url: continue print "\n\n" name, ingredients, image, matched_url = '', '', '', in_url c = get_url(in_url) if not c: continue # skip c.replace("<tr></font></td>", "</font></td>") soup = BeautifulSoup(c) cc = soup.decode_contents() barcode = '' for re_i in re_barcodes: m = re.search(re_i, cc) if m: barcode = m.group(1) if not barcode: barcode = soup.find('span', attrs={'class': 'item_number'}) if barcode: barcode = barcode.get_text() barcode = re.sub('-(.*?)$', '', barcode) if (len(barcode) != 13 and len(barcode) != 8) or not barcode.isdigit(): print "UNKNOWN barcode: " + barcode.encode('utf8') barcode = '' if not barcode: print "CAN NOT GET BARCODE FROM " + in_url continue print "get barcode as " + barcode.encode('utf8') for re_i in 
re_ingredients: m = re.search(re_i, cc) if m: tmptext = m.group(1).strip() soup2 = BeautifulSoup(tmptext) ingredients = soup2.get_text().strip() if len(ingredients) < 1000: break if '原材料'.decode('utf8') in cc and not ingredients: if DEBUG_BARCODE: print cc print "FIXME for " + in_url to_fix = 1 if DEBUG_BARCODE: print ingredients if not len(name): name = soup.find('span', attrs={'class': 'content_title'}) if name: name = name.get_text() name = re.sub('【\d+】'.decode('utf8'), '', name) image = soup.find('a', attrs={'class': re.compile('ImageMain')}) if image and 'href' in image.attrs: image = image['href'] elif image: image = image.find('img') if image: image = image['src'] image = re.sub('\?.+$', '', image) category = soup.find('td', attrs={'class': 'sdtext'}) if category: category = category.get_text().strip() if not ingredients: print 'no ingredients' continue if not image: print 'no image' continue # FIXME get_url(image, Bin + "/uploads/" + barcode + ".jpg"); ingredients = ingredients.encode('utf8') ingredients = re.sub('\s+', ' ', ingredients).strip() name = name.encode('utf8') name = re.sub('\s+', ' ', name).strip() if not category: category = '' category = category.encode('utf8') category = re.sub('\s+', ' ', category).strip() csvwriter.writerow([barcode, ingredients, name, category, "uploads/" + barcode + ".jpg", matched_url]) if not next_page: break # when it's an end print "### get next page: " + next_page c = get_url(next_page)
def getDetailBerita(self, link): """ Mengambil seluruh element dari halaman berita """ time.sleep(5) articles = {} #link url = link[0] response = requests.get(url) html2 = response.text # Create a BeautifulSoup object from the HTML: soup soup = BeautifulSoup(html2, "html5lib") print(url) #category articles['category'] = 'Properti' sb = soup.find('meta', {'property': 'article:section'}) articles['subcategory'] = sb['content'] if sb else '' articles['url'] = url article = soup.find('div', {'id': 'post-content'}) #extract date pubdate = soup.find('meta', {'property': 'article:published_time'}) pubdate = pubdate['content'] if pubdate else '1970-01-01T01:01:01+00:00' pubdate = pubdate[0:19].strip(' \t\n\r') articles['pubdate'] = datetime.strftime( datetime.strptime(pubdate, "%Y-%m-%dT%H:%M:%S"), '%Y-%m-%d %H:%M:%S') id = soup.find('div', {'id': 'ajax-load-more'}) articles['id'] = int(id['data-post-id']) if id else int( datetime.strptime(pubdate, "%Y-%m-%dT%H:%M:%S").timestamp()) + len(url) #extract author author = article.find('span', {'class': 'author'}) articles['author'] = author.get_text(strip=True) if author else '' #extract title title = soup.find('meta', {'property': 'og:title'}) articles['title'] = title['content'] if title else '' #source articles['source'] = 'housingestate' #extract comments count articles['comments'] = 0 #extract tags tags = soup.find('meta', {'property': 'article:tag'}) articles['tags'] = tags['content'] if tags else '' #extract images images = soup.find("meta", attrs={'property': 'og:image'}) articles['images'] = images['content'] if images else '' #extract detail detail = article.find('div', attrs={'class': 'content-txt'}) #hapus video sisip if detail.findAll('div'): for div in detail.findAll('div'): if div.find('script'): div.decompose() #hapus all script for script in detail.findAll('script'): script.decompose() #hapus all noscript for ns in detail.findAll('noscript'): ns.decompose() #hapus linksisip for ls in detail.findAll('p'): if ls.find('strong'): if 'baca' in ls.find('strong').get_text(strip=True).lower(): ls.decompose() #extract content detail = BeautifulSoup(detail.decode_contents().replace('<br/>', ' '), "html5lib") content = re.sub( r'\n|\t|\b|\r', '', unicodedata.normalize("NFKD", detail.get_text(strip=True))) articles['content'] = content print('memasukkan berita id ', articles['id']) return articles
def getDetailBerita(self, link): """ Mengambil seluruh element dari halaman berita """ time.sleep(5) articles = {} #link url = link[0]+'?page=all' try: response = requests.get(url) except ConnectionError: print("Connection Error, but it's still trying...") time.sleep(10) details = self.getDetailBerita(link) html2 = response.text # Create a BeautifulSoup object from the HTML: soup soup = BeautifulSoup(html2, "html5lib") print(url) scripts = soup.findAll('script', attrs={'type':'application/ld+json'}) if scripts: scripts = re.sub(r'\n|\t|\b|\r','',unicodedata.normalize("NFKD",scripts[-1].get_text(strip=True))) scripts = json.loads(scripts) else: return False #category articles['category'] = 'Otomotif' articles['subcategory'] = link[1] articles['url'] = url article = soup.find('div', class_="read__article clearfix") #extract date pubdate = soup.find('meta', {'name':'content_date'}) pubdate = pubdate['content'] if pubdate else '1970-01-01 00:00:00' pubdate = pubdate.strip(' \t\n\r') articles['pubdate'] = datetime.strftime(datetime.strptime(pubdate, "%Y-%m-%d %H:%M:%S"), '%Y-%m-%d %H:%M:%S') id = soup.find('meta', {'name':'content_id'}) articles['id'] = int(id['content']) if id else int(datetime.strptime(pubdate, "%d-%b-%Y %H:%M").timestamp()) + len(url) #extract author author = soup.find('meta', {'name':'content_author'}) articles['author'] = author['content'] if author else '' #extract title articles['title'] = scripts['headline'] #source articles['source'] = 'gridoto' #extract comments count articles['comments'] = 0 #extract tags tags = soup.find('meta', {'name':'content_tag'}) articles['tags'] = tags['content'] if tags else '' #extract images images = soup.find("meta", attrs={'property':'og:image'}) articles['images'] = images['content'] if images else '' #extract detail detail = article.find('div', attrs={'class':'read__right'}) #hapus video sisip if detail.findAll('div'): for div in detail.findAll('div'): if div.find('script'): div.decompose() #hapus all script for script in detail.findAll('script'): script.decompose() #hapus all noscript for ns in detail.findAll('noscript'): ns.decompose() #hapus linksisip for ls in detail.findAll('p'): if ls.find('strong'): if 'baca' in ls.find('strong').get_text(strip=True).lower(): ls.decompose() #extract content detail = BeautifulSoup(detail.decode_contents().replace('<br/>', ' '), "html5lib") content = re.sub(r'\n|\t|\b|\r','',unicodedata.normalize("NFKD",detail.get_text(strip=True))) articles['content'] = content print('memasukkan berita id ', articles['id']) return articles
def main(): csvwriter = csv.writer(file('pkdata.csv', 'wb')) csvwriter.writerow(['BARCODE', 'INGREDIENTS', 'PRODUCT NAME', 'IMAGE']) # create path if needed if IS_save_html and not os.path.exists(Bin + '/phtml'): os.mkdir(Bin + '/phtml') if not os.path.exists(Bin + '/uploads'): os.mkdir(Bin + '/uploads') DEBUG_BARCODE = None re_ingredients = [ re.compile('原材料名</b>\s*<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), re.compile('原材料名】<br/>\s*(.*?)\s*</p>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), re.compile('原材料名】<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), re.compile('原材料名:\s*(.*?)\s*</span>\s*<br/>\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), re.compile('原材料名</td>\s*<td[^\>]*>\s*(.*?)\s*<hr/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), re.compile('原材料に含まれるアレルギー物質:?\s*(.*?)\s*</p>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), re.compile('原材料に含まれるアレルギー物質:?\s*</div><div[^\>]*>(.*?)\s*</div>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), re.compile('原材料</b>\s*<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), re.compile('原材料:\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), re.compile('原材料\S?\s*<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), re.compile('原材料</b>\s*<br/>\s*<br/>\s*<br/>\s*(.*?)<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), re.compile('原材料</\w{2,3}>\s*<div[^\>]*>\s*(.*?)</div>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), re.compile('原材料\s*<br/>\s*(<table.*?</table>)'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), re.compile('原材料</b><br/><br/><br/>\s*(<table.*?</table>)'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), ] re_barcodes = [ re.compile(r'JANコード:(\d{13}|\d{8})\b'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE), ] keyword = "ヌードル" # url = "http://search.rakuten.co.jp/search/mall/" + urllib.quote(keyword).decode('utf8') + "/100227/?grp=product" url = "http://search.rakuten.co.jp/search/mall/-/100283/?l-id=gt_swt_l_xs_100283" c = get_url(url) page_now = 1 while True: soup = BeautifulSoup(c) rsrSResultPhoto = soup.find_all('div', attrs={'class': 'rsrSResultPhoto'}) rsrSResultPhoto = map(lambda x: x.find('a', attrs={'href': re.compile('http')}), rsrSResultPhoto) rsrSResultPhoto = filter(lambda x: x is not None, rsrSResultPhoto) rsrSResultPhoto = map(lambda x: x['href'], rsrSResultPhoto) if not rsrSResultPhoto: print '## CAN NOT FIND ANY RESULT RELATED TO ' + keyword break next_page = False rsrPagination = soup.find('div', attrs={'class': 'rsrPagination'}) if rsrPagination: pages = rsrPagination.find_all('a') pages = filter(lambda x: x.get_text().strip() == str(page_now + 1), pages) if pages: next_page = pages[0]['href'] page_now = page_now + 1 # if page_now > 10: break to_fix = 0 name, ingredients, image, matched_url = '', '', '', '' for in_url in rsrSResultPhoto: if 'http://item.rakuten.co.jp/book/' in in_url: continue print "\n\n" name, ingredients, image, matched_url = '', '', '', in_url c = get_url(in_url) if not c: continue # skip c.replace("<tr></font></td>", "</font></td>") soup = BeautifulSoup(c) cc = soup.decode_contents() barcode = '' for re_i in re_barcodes: m = re.search(re_i, cc) if m: barcode = m.group(1) if not barcode: barcode = soup.find('span', attrs={'class': 'item_number'}) if barcode: barcode = barcode.get_text() barcode = re.sub('-(.*?)$', '', barcode) if (len(barcode) != 13 and len(barcode) != 8) or not barcode.isdigit(): print "UNKNOWN barcode: " + barcode.encode('utf8') barcode = '' if not barcode: print "CAN NOT GET BARCODE FROM " + 
in_url continue print "get barcode as " + barcode.encode('utf8') trs = soup.find_all('tr') while True: if not len(trs): break tr = trs.pop(0) __trs = tr.find_all('tr') if len(__trs): continue tds = tr.find_all(re.compile("^t[dh]$")) tds = map(lambda x: x.get_text().strip(), tds) tds = filter(lambda x: len(x), tds) if not len(tds): continue if tds[0] == '商品名'.decode('utf8'): if len(tds) > 1: name = tds[1] elif tds[0].endswith('原材料'.decode('utf8')) and len(tds) <= 2: if len(tds) > 1: ingredients = tds[1] else: ingredients = trs.pop(0).get_text().strip() elif ( len(tds[0]) < 50 and ('原材料'.decode('utf8') in tds[0] or ('成分'.decode('utf8') in tds[0] and '栄養成分'.decode('utf8') not in tds[0])) ) or ( tds[0].endswith('原材料'.decode('utf8')) ): if not ingredients: if len(tds) > 1: ingredients = tds[1] else: ingredients = trs.pop(0).get_text().strip() # remove BAD for next choice if 'item.rakuten.co.jp' in ingredients or 'iframe' in ingredients or len(ingredients) > 1000: ingredients = '' for re_i in re_ingredients: m = re.search(re_i, cc) if m: tmptext = m.group(1).strip() soup2 = BeautifulSoup(tmptext) ingredients = soup2.get_text().strip() if len(ingredients) < 1000: break if '原材料'.decode('utf8') in cc and not ingredients: if DEBUG_BARCODE: print cc print "FIXME for " + in_url to_fix = 1 if DEBUG_BARCODE: print ingredients if not len(name): name = soup.find('span', attrs={'class': 'content_title'}) if name: name = name.get_text() name = re.sub('【\d+】'.decode('utf8'), '', name) image = soup.find('a', attrs={'class': re.compile('ImageMain')}) if image and 'href' in image.attrs: image = image['href'] elif image: image = image.find('img') if image: image = image['src'] image = re.sub('\?.+$', '', image) if not ingredients: print 'no ingredients' continue if not image: print 'no image' continue # FIXME get_url(image, Bin + "/uploads/" + barcode + ".jpg"); ingredients = ingredients.encode('utf8') ingredients = re.sub('\s+', ' ', ingredients).strip() name = name.encode('utf8') name = re.sub('\s+', ' ', name).strip() csvwriter.writerow([barcode, ingredients, name, "uploads/" + barcode + ".jpg", matched_url]) if not next_page: break # when it's an end print "### get next page: " + next_page c = get_url(next_page)
def getDetailBerita(self, link): """ Mengambil seluruh element dari halaman berita """ time.sleep(5) articles = {} #link url = link[0] response = requests.get(url) html = response.text # Create a BeautifulSoup object from the HTML: soup soup = BeautifulSoup(html, "html5lib") #category articles['category'] = 'Otomotif' articles['subcategory'] = link[1] articles['url'] = url article = soup.find('div', class_="left-content") #extract date pubdate = article.find('li', {'class': 'publish-date'}) pubdate = pubdate.get_text( strip=True).split(',') if pubdate else ['', '01-Jan-1970 00:00'] pubdate = pubdate[1].strip(' \t\n\r').replace('Ags', 'Agt').replace( 'Juli', 'Jul').replace('Juni', 'Jun').replace('Dec', 'Des') articles['pubdate'] = datetime.strftime( datetime.strptime(pubdate, "%d-%b-%Y %H:%M"), '%Y-%m-%d %H:%M:%S') articles['id'] = int( datetime.strptime(pubdate, "%d-%b-%Y %H:%M").timestamp()) + len(url) #extract author author = article.find('span', {'itemprop': 'author'}) articles['author'] = author.get_text(strip=True) if author else '' #extract title title = article.find('h1', {'class': 'entry-title'}) articles['title'] = title.get_text(strip=True) if title else '' #source articles['source'] = 'carreview' #extract comments count articles['comments'] = 0 #extract tags tags = article.find('div', class_="post-meta") articles['tags'] = ','.join([ x.get_text(strip=True).replace('#', '') for x in tags.findAll('a') ]) if tags else '' #extract images images = soup.find("meta", attrs={'property': 'og:image'}) articles['images'] = images['content'] if images else '' #extract detail detail = article.find('div', attrs={'class': 'entry-content'}) #hapus video sisip if detail.findAll('div'): for div in detail.findAll('div'): if div.find('script'): div.decompose() #hapus all script for script in detail.findAll('script'): script.decompose() #hapus all noscript for ns in detail.findAll('noscript'): ns.decompose() #hapus desc for p in detail.findAll('p', class_="lead"): p.decompose() #hapus linksisip for ls in detail.findAll('a'): if ls.find('strong'): if 'baca' in ls.find('strong').get_text(strip=True).lower(): ls.decompose() #extract content detail = BeautifulSoup(detail.decode_contents().replace('<br/>', ' '), "html5lib") content = re.sub( r'\n|\t|\b|\r', '', unicodedata.normalize("NFKD", detail.get_text(strip=True))) articles['content'] = content print('memasukkan berita id ', articles['id']) return articles
def main():
    # Scrape Rakuten product pages for every barcode listed in the input file and write
    # barcode, ingredients, product name, image path and matched URL to pdata.csv.
    # Relies on module-level helpers defined elsewhere in the script: get_url(), Bin, IS_save_html.
    csvwriter = csv.writer(file('pdata.csv', 'wb'))
    csvwriter.writerow(['BARCODE', 'INGREDIENTS', 'PRODUCT NAME', 'IMAGE', 'URL'])

    # create path if needed
    if IS_save_html and not os.path.exists(Bin + '/phtml'):
        os.mkdir(Bin + '/phtml')
    if not os.path.exists(Bin + '/uploads'):
        os.mkdir(Bin + '/uploads')

    DEBUG_BARCODE = None

    # regex fallbacks used to pull the ingredients block out of the raw page markup
    re_ingredients = [
        re.compile('原材料名</b>\s*<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
        re.compile('原材料名】<br/>\s*(.*?)\s*</p>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
        re.compile('原材料名】<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
        re.compile('原材料名</td>\s*<td[^\>]*>\s*(.*?)\s*<hr/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
        re.compile('原材料に含まれるアレルギー物質:?\s*(.*?)\s*</p>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
        re.compile('原材料に含まれるアレルギー物質:?\s*</div><div[^\>]*>(.*?)\s*</div>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
        re.compile('原材料</b>\s*<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
        re.compile('原材料:\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
        re.compile('原材料】<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
        re.compile('原材料</b>\s*<br/>\s*<br/>\s*<br/>\s*(.*?)<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
        re.compile('原材料</\w{2,3}>\s*<div[^\>]*>\s*(.*?)</div>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
        re.compile('原材料\s*<br/>\s*(<table.*?</table>)'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
        re.compile('原材料</b><br/><br/><br/>\s*(<table.*?</table>)'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
    ]

    txtfile = sys.argv[1]
    fh = open(txtfile, 'r')
    wfh = open('missing_barcodes.txt', 'w')
    while True:
        barcode = fh.readline()
        if not barcode:
            break
        barcode = barcode.strip()
        if barcode == 'Barcode':
            continue
        if DEBUG_BARCODE and barcode != DEBUG_BARCODE:
            continue

        print "\n\n"
        # search Rakuten's mall for the barcode and collect product page links
        url = "http://search.rakuten.co.jp/search/mall?sitem=" + barcode + "&g=0&myButton.x=0&myButton.y=0&v=2&s=1&p=1&min=&max=&sf=0&st=A&nitem=&grp=product"
        c = get_url(url)
        soup = BeautifulSoup(c)
        rsrSResultPhoto = soup.find_all('div', attrs={'class': 'rsrSResultPhoto'})
        rsrSResultPhoto = map(lambda x: x.find('a', attrs={'href': re.compile('http')}), rsrSResultPhoto)
        rsrSResultPhoto = filter(lambda x: x is not None, rsrSResultPhoto)
        rsrSResultPhoto = map(lambda x: x['href'], rsrSResultPhoto)
        if not rsrSResultPhoto:
            print '## MISSING results for ' + barcode
            wfh.write(barcode + "\n")
            continue

        to_fix = 0
        name, ingredients, image, matched_url = '', '', '', ''
        for in_url in rsrSResultPhoto:
            if 'http://item.rakuten.co.jp/book/' in in_url:
                continue
            if 'rakuten.co.jp/doremi/' in in_url:
                continue  # skip BAD
            if 'rakuten.co.jp/at-life/' in in_url:
                continue

            name, ingredients, image, matched_url = '', '', '', in_url
            c = get_url(in_url)
            if not c:
                continue  # skip
            c = c.replace("<tr></font></td>", "</font></td>")
            soup = BeautifulSoup(c)

            # walk the table rows looking for the product name and ingredients cells
            trs = soup.find_all('tr')
            while True:
                if not len(trs):
                    break
                tr = trs.pop(0)
                __trs = tr.find_all('tr')
                if len(__trs):
                    continue
                tds = tr.find_all(re.compile("^t[dh]$"))
                tds = map(lambda x: x.get_text().strip(), tds)
                tds = filter(lambda x: len(x), tds)
                if not len(tds):
                    continue
                if tds[0] == '商品名'.decode('utf8'):
                    if len(tds) > 1:
                        name = tds[1]
                elif tds[0].endswith('原材料'.decode('utf8')) and len(tds) <= 2:
                    if len(tds) > 1:
                        ingredients = tds[1]
                    else:
                        ingredients = trs.pop(0).get_text().strip()
                elif ((len(tds[0]) < 50
                       and ('原材料'.decode('utf8') in tds[0]
                            or ('成分'.decode('utf8') in tds[0]
                                and '栄養成分'.decode('utf8') not in tds[0])))
                      or tds[0].endswith('原材料'.decode('utf8'))):
                    if not ingredients:
                        if len(tds) > 1:
                            ingredients = tds[1]
                        else:
                            ingredients = trs.pop(0).get_text().strip()

            # remove BAD for next choice
            if 'item.rakuten.co.jp' in ingredients or 'iframe' in ingredients or len(ingredients) > 1000:
                ingredients = ''

            # fall back to regex extraction over the serialized page markup
            cc = soup.decode_contents()
            for re_i in re_ingredients:
                m = re.search(re_i, cc)
                if m:
                    tmptext = m.group(1).strip()
                    soup2 = BeautifulSoup(tmptext)
                    ingredients = soup2.get_text().strip()
                    if len(ingredients) < 1000:
                        break

            if '原材料'.decode('utf8') in cc and not ingredients:
                if DEBUG_BARCODE:
                    print cc
                print "FIXME for " + in_url
                to_fix = 1
            if DEBUG_BARCODE:
                print ingredients

            if not len(name):
                name = soup.find('span', attrs={'class': 'content_title'})
                if name:
                    name = name.get_text()
                    name = re.sub('【\d+】'.decode('utf8'), '', name)

            image = soup.find('a', attrs={'class': re.compile('ImageMain')})
            if image and 'href' in image.attrs:
                image = image['href']
            elif image:
                image = image.find('img')
                if image:
                    image = image['src']
            if image:
                # strip any query string from the image URL
                image = re.sub('\?.+$', '', image)

            if name and ingredients:
                break

        if not image:
            print 'no image'
            wfh.write(barcode + "\n")
            continue  # FIXME sys.exit(1)
        if not name:
            print 'no name'
            sys.exit(1)
        if not ingredients:
            print 'no ingredients'
            if to_fix:
                print "REAL FIXME: " + barcode
            wfh.write(barcode + "\n")
            continue  # FIXME sys.exit(1)

        get_url(image, Bin + "/uploads/" + barcode + ".jpg")
        ingredients = ingredients.encode('utf8')
        ingredients = re.sub('\s+', ' ', ingredients).strip()
        name = name.encode('utf8')
        name = re.sub('\s+', ' ', name).strip()
        csvwriter.writerow([barcode, ingredients, name, "uploads/" + barcode + ".jpg", matched_url])

    fh.close()
    wfh.close()
def main():
    # Crawl the Yahoo! Shopping noodle category, extract the JAN barcode, product name,
    # ingredients and image from each item page, and write them to ysdata.csv.
    # Relies on module-level helpers defined elsewhere in the script: get_url(), Bin, IS_save_html.
    csvwriter = csv.writer(file('ysdata.csv', 'wb'))
    csvwriter.writerow(['BARCODE', 'INGREDIENTS', 'PRODUCT NAME', 'IMAGE', 'URL'])

    # create path if needed
    if IS_save_html and not os.path.exists(Bin + '/yhtml'):
        os.mkdir(Bin + '/yhtml')
    if not os.path.exists(Bin + '/uploads'):
        os.mkdir(Bin + '/uploads')

    DEBUG_BARCODE = None

    # regex fallbacks used to pull the ingredients block out of the raw page markup
    re_ingredients = [
        re.compile('原材料名</b>\s*<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
        re.compile('原材料名】<br/>\s*(.*?)\s*</p>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
        re.compile('原材料名】<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
        re.compile('原材料名:\s*(.*?)\s*</span>\s*<br/>\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
        re.compile('原材料名</td>\s*<td[^\>]*>\s*(.*?)\s*<hr/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
        re.compile('原材料名】\s*(.*?)\s*<br/>【'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
        re.compile('原材料に含まれるアレルギー物質:?\s*(.*?)\s*</p>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
        re.compile('原材料に含まれるアレルギー物質:?\s*</div><div[^\>]*>(.*?)\s*</div>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
        re.compile('原材料</b>\s*<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
        re.compile('原材料:\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
        re.compile('原材料\S?\s*<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
        re.compile('原材料</b>\s*<br/>\s*<br/>\s*<br/>\s*(.*?)<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
        re.compile('原材料</\w{2,3}>\s*<div[^\>]*>\s*(.*?)</div>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
        re.compile('原材料\s*<br/>\s*(<table.*?</table>)'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
        re.compile('原材料</b><br/><br/><br/>\s*(<table.*?</table>)'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
    ]

    keyword = "ヌードル"
    # url = "http://search.rakuten.co.jp/search/mall/" + urllib.quote(keyword).decode('utf8') + "/100227/?grp=product"
    url = "http://category.shopping.yahoo.co.jp/list/1167/?tab_ex=commerce&fr=shp-prop"
    c = get_url(url)
    page_now = 1
    while True:
        soup = BeautifulSoup(c)
        # collect item page links from the category listing
        rsrSResultPhoto = soup.find_all('h3', attrs={'class': 'elName'})
        rsrSResultPhoto = map(lambda x: x.find('a', attrs={'href': re.compile('http')}), rsrSResultPhoto)
        rsrSResultPhoto = filter(lambda x: x is not None, rsrSResultPhoto)
        rsrSResultPhoto = map(lambda x: x['href'], rsrSResultPhoto)
        if not rsrSResultPhoto:
            print '## CAN NOT FIND ANY RESULT RELATED TO ' + keyword
            break

        # look for a link to the next results page
        next_page = False
        rsrPagination = soup.find('div', attrs={'id': 'Sp1'})
        if rsrPagination:
            pages = rsrPagination.find_all('a')
            pages = filter(lambda x: x.get_text().strip() == str(page_now + 1), pages)
            if pages:
                next_page = pages[0]['href']
                page_now = page_now + 1
        # if page_now > 3: break

        to_fix = 0
        name, ingredients, image, matched_url = '', '', '', ''
        for in_url in rsrSResultPhoto:
            if 'aff.makeshop.jp' in in_url:
                continue

            print "\n\n"
            name, ingredients, image, matched_url = '', '', '', in_url
            c = get_url(in_url)
            if not c:
                continue  # skip
            soup = BeautifulSoup(c)

            # the JAN barcode is usually shown in <p class="jan">, otherwise try the <h1>
            barcode = soup.find('p', attrs={'class': 'jan'})
            if barcode:
                barcode = barcode.get_text()
                barcode = re.sub('^(.*?):'.decode('utf8'), '', barcode)
                if len(barcode) != 13 or not barcode.isdigit():
                    print "UNKNOWN barcode: " + barcode.encode('utf8')
                    barcode = ''
            if not barcode:
                h1 = soup.find('h1')
                if h1:
                    m = re.search('\D(\d{13})\D', h1.get_text())
                    if m:
                        barcode = m.group(1)
            if not barcode:
                print "CAN NOT GET BARCODE FROM " + in_url
                continue
            print "get barcode as " + barcode.encode('utf8')

            # walk the table rows looking for the product name and ingredients cells
            trs = soup.find_all('tr')
            while True:
                if not len(trs):
                    break
                tr = trs.pop(0)
                __trs = tr.find_all('tr')
                if len(__trs):
                    continue
                tds = tr.find_all(re.compile("^t[dh]$"))
                tds = map(lambda x: x.get_text().strip(), tds)
                tds = filter(lambda x: len(x), tds)
                if not len(tds):
                    continue
                if tds[0] == '商品名'.decode('utf8'):
                    if len(tds) > 1:
                        name = tds[1]
                elif tds[0].endswith('原材料'.decode('utf8')) and len(tds) <= 2:
                    if len(tds) > 1:
                        ingredients = tds[1]
                    else:
                        ingredients = trs.pop(0).get_text().strip()
                elif ((len(tds[0]) < 50
                       and ('原材料'.decode('utf8') in tds[0]
                            or ('成分'.decode('utf8') in tds[0]
                                and '栄養成分'.decode('utf8') not in tds[0])))
                      or tds[0].endswith('原材料'.decode('utf8'))):
                    if not ingredients:
                        if len(tds) > 1:
                            ingredients = tds[1]
                        else:
                            ingredients = trs.pop(0).get_text().strip()

            # remove BAD for next choice
            if 'item.rakuten.co.jp' in ingredients or 'iframe' in ingredients or len(ingredients) > 1000:
                ingredients = ''

            # fall back to regex extraction over the serialized page markup
            cc = soup.decode_contents()
            for re_i in re_ingredients:
                m = re.search(re_i, cc)
                if m:
                    tmptext = m.group(1).strip()
                    soup2 = BeautifulSoup(tmptext)
                    ingredients = soup2.get_text().strip()
                    if len(ingredients) < 1000:
                        break

            if '原材料'.decode('utf8') in cc and not ingredients:
                if DEBUG_BARCODE:
                    print cc
                print "FIXME for " + in_url
                to_fix = 1
            if DEBUG_BARCODE:
                print ingredients

            if not len(name):
                name = soup.find('span', attrs={'property': 'rdfs:label'})
                if not name:
                    name = soup.find('h1', attrs={'itemprop': 'name'})
                if name:
                    name = name.get_text()
                    name = re.sub('【\d+】'.decode('utf8'), '', name)

            image = soup.find('span', attrs={'rel': re.compile('media:image')})
            if image:
                image = image.parent['href']
                # href="javascript:openItemImage('/mizota/enlargedimage.html?code=100200044&img=http://item.shopping.c.yimg.jp/i/l/mizota_100200044');"
                m = re.search("img=(.*?)\'", image)
                if m:
                    image = m.group(1)
            else:
                image = soup.find('img', attrs={'id': 'productlargeImage'})
                if image:
                    image = image['src']
            if image and image.startswith('//'):
                # give protocol-relative URLs an explicit scheme
                image = 'http:' + image

            if not ingredients:
                print 'no ingredients'
                continue
            if not image:
                print 'no image'
                continue  # FIXME
            if not name:
                # skip items where no product name could be extracted
                print 'no name'
                continue

            get_url(image, Bin + "/uploads/" + barcode + ".jpg")
            ingredients = ingredients.encode('utf8')
            ingredients = re.sub('\s+', ' ', ingredients).strip()
            name = name.encode('utf8')
            name = re.sub('\s+', ' ', name).strip()
            csvwriter.writerow([barcode, ingredients, name, "uploads/" + barcode + ".jpg", matched_url])

        if not next_page:
            break  # when it's an end
        print "### get next page: " + next_page
        c = get_url(next_page)