Example #1
def convert_text(text, base_url, output_format):
    """ Convert an HTML document to output markup; attempts to find
    a plausible article to excerpt.

    Arguments:

    text -- the HTML source of the original document

    base_url -- the base URL of the original document

    output_format -- the output format to use; accepts anything supported by Pandoc

    Returns: the converted document fragment
    """

    out_html = '\n\n'.join(_extract(text, base_url))

    # create a new DOM document from the joined blockquotes
    out_dom = BeautifulSoup(out_html, features='html.parser')

    # convert all href, src, and srcset attributes
    for attr in ('href', 'src'):
        for node in out_dom.findAll(**{attr: True}):
            node[attr] = urllib.parse.urljoin(base_url, node[attr])
    for node in out_dom.findAll(srcset=True):
        node['srcset'] = _rewrite_srcset(node['srcset'], base_url)

    # strip out attributes we don't want
    for attr in ('id', 'class'):
        for node in out_dom.findAll(**{attr: True}):
            del node[attr]

    return pypandoc.convert_text(out_dom.decode_contents(), output_format,
                                 'html')
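A minimal standalone sketch of the attribute-rewriting step above, using only BeautifulSoup and the standard library (the fragment and base URL are invented for illustration):

import urllib.parse
from bs4 import BeautifulSoup

fragment = '<p><a href="/a.html">link</a> <img src="img/x.png"/></p>'
dom = BeautifulSoup(fragment, features='html.parser')

# make href/src absolute against the base URL, as convert_text does
base_url = 'https://example.com/articles/post.html'
for attr in ('href', 'src'):
    for node in dom.findAll(**{attr: True}):
        node[attr] = urllib.parse.urljoin(base_url, node[attr])

print(dom.decode_contents())
# roughly: <p><a href="https://example.com/a.html">link</a>
#          <img src="https://example.com/articles/img/x.png"/></p>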
Example #2
def get_link(url):
    try:
        req = requests.get(url, headers=headers)
        soup = BeautifulSoup(req.text, 'lxml')
        if soup.text.find('您访问过于频繁') >= 0:
            # rate-limited ("您访问过于频繁" = "you are visiting too frequently"); a captcha is required
            print('需要启用验证码', soup.decode_contents())  # "captcha required"
            get_code(soup.decode_contents())
            return ''
        links = soup.select('#page_list > ul > li > a')
    except Exception as ex:
        print(ex)
        return ''
    for link in links:
        href = link.get('href')
        # print(href)
        get_info(href)
        time.sleep(1)
Example #3
    def _get_artist_longbio(self, artist_page_soup):
        """
        Extract the artist long bio.

        Args
            artist_page_soup: soup of the artist page html.

        Return
            (list of strings) each list element corresponds to a paragraph.

        -----------------------------------------------------------------------
        """
        bio_link = artist_page_soup.find("p", {
            "class": "biography"
        }).find("a")["href"]
        self.conn.browser.get(bio_link)
        WebDriverWait(self.conn.browser, 20).until(
            EC.presence_of_element_located((By.CLASS_NAME, "text")))
        soup = BeautifulSoup(self.conn.browser.page_source, 'html.parser')
        soup = BeautifulSoup(soup.decode_contents(), 'lxml')
        soup = soup.find("div", {"itemprop": "reviewBody"})
        long_bio = [
            x.strip() for x in soup.text.split("\n")
            if (len(x.strip()) > 0 and x.find("<") == -1)
        ]
        return long_bio
Example #4
        def test_strainer(css, input_html, output_html, **kwargs):
            parse_only = strainer_from_css(css, **kwargs)
            input_soup = BeautifulSoup('<main>%s</main>' % input_html,
                                       'lxml',
                                       parse_only=parse_only)

            assert input_soup.decode_contents().strip() == output_html
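For context, a rough sketch of the same parse_only idea with a plain bs4.SoupStrainer (strainer_from_css presumably builds a strainer from a CSS selector; the tag-name strainer here is only an illustrative stand-in):

from bs4 import BeautifulSoup, SoupStrainer

input_html = '<p>keep <a href="#">me</a></p><div>drop</div>'
parse_only = SoupStrainer('a')  # keep only <a> elements while parsing
soup = BeautifulSoup('<main>%s</main>' % input_html, 'html.parser',
                     parse_only=parse_only)
print(soup.decode_contents().strip())  # roughly: <a href="#">me</a>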
Example #5
def amplify_html(rendered_html):
    bs = BeautifulSoup(rendered_html)

    for image in bs.find_all('img', attrs={'src': True}):
        amp_img = bs.new_tag('amp-img',
                             src=image.get("src"),
                             alt=image.get("alt", ""),
                             layout="responsive",
                             width=image.get("width", 550),
                             height=image.get("height", 368))
        amp_img['class'] = image.get("class", "")
        image.replace_with(amp_img)

    for iframe in bs.find_all('iframe', attrs={'src': True}):
        amp_iframe = bs.new_tag('amp-iframe')
        iframe_src = iframe['src']
        if iframe_src.startswith('//'):
            iframe_src = 'https:{}'.format(iframe_src)
        amp_iframe.attrs['src'] = iframe_src
        if iframe.has_attr('title'):
            amp_iframe.attrs['title'] = iframe['title']
        amp_iframe.attrs['width'] = '200'
        amp_iframe.attrs['height'] = '100'
        amp_iframe.attrs['layout'] = 'responsive'
        amp_iframe.attrs['frameborder'] = iframe.get('frameborder', 0)
        amp_iframe.attrs['sandbox'] = iframe.get(
            'sandbox', 'allow-scripts allow-same-origin')
        iframe.replace_with(amp_iframe)

    # Remove style attribute to remove large bottom padding
    for div in bs.find_all("div", {'class': 'responsive-object'}):
        del div['style']

    return bs.decode_contents()
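A small standalone sketch of the img-to-amp-img swap performed above (the markup is made up; attribute values are passed as strings to keep the serialization predictable):

from bs4 import BeautifulSoup

bs = BeautifulSoup('<p><img alt="cat" src="/img/cat.jpg"/></p>', 'html.parser')
for image in bs.find_all('img', attrs={'src': True}):
    amp_img = bs.new_tag('amp-img', src=image['src'], alt=image.get('alt', ''),
                         layout='responsive', width='550', height='368')
    image.replace_with(amp_img)
print(bs.decode_contents())
# roughly: <p><amp-img alt="cat" height="368" layout="responsive"
#          src="/img/cat.jpg" width="550"></amp-img></p>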
Example #6
 def _parse_line_breaks_in_dom_elem(self, element: BeautifulSoup) -> str:
     '''Converts a BeautifulSoup DOM element into a string, replacing <br> line breaks with \n
     '''
     element = element.decode_contents()
     parsed_string = str(element).replace('<br/>',
                                          '\n').replace('<br>', '\n')
     return parsed_string.strip()
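The same <br>-to-newline conversion as a free function, for a quick standalone test (a sketch, not the class method above):

from bs4 import BeautifulSoup

def parse_line_breaks(element):
    html = element.decode_contents()
    return html.replace('<br/>', '\n').replace('<br>', '\n').strip()

soup = BeautifulSoup('<div>line one<br/>line two</div>', 'html.parser')
print(parse_line_breaks(soup.div))  # 'line one\nline two'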
Example #7
    def sanitize_html(html_value, valid_tags=VALID_TAGS):
        soup = BeautifulSoup(html_value)
        comments = soup.findAll(text=lambda text: isinstance(text, Comment))
        for comment in comments:
            comment.extract()
        for tag in soup.find_all(True):
            if tag.name not in valid_tags:
                tag.hidden = True

        return soup.decode_contents()
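A hedged usage sketch (VALID_TAGS is assumed to be a container of allowed tag names; note that a hidden tag still renders its children, so the text of disallowed tags survives):

VALID_TAGS = {'p', 'a', 'em', 'strong'}
dirty = '<p>ok <script>alert(1)</script><!-- note --><b>bold</b></p>'
print(sanitize_html(dirty, VALID_TAGS))
# comments are stripped, <script> and <b> are hidden but their contents remain:
# roughly '<p>ok alert(1)bold</p>'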
Example #8
 def getReply(self):
     soup = BeautifulSoup(self.driver.page_source, 'html.parser')
     soup = soup.find(
         "div", class_="flash flash-warning panel panel-warning").find(
             class_='panel-heading')
     replies = soup.decode_contents(formatter="html")
     reply = replies.split("<br/><br/>")[0]
     print(reply[3:])
     return jsonify({'reply': reply[3:]})
Example #9
    def test_sitemap_xml(self):
        """
        Check we can retrieve a list of all URLs in the URL map at
        /sitemap.xml
        """

        response = self.client.get("/sitemap.xml")

        self.assertIn("application/xml", response.headers.get("content-type"))

        soup = BeautifulSoup(response.data, features="html.parser")
        expected_soup = BeautifulSoup(
            open("./tests/fixtures/templates/sitemap.xml").read().replace(
                "\n", ""),
            features="html.parser",
        )
        self.assertEqual(soup.decode_contents(),
                         expected_soup.decode_contents())
Example #10
def absolute_links(html, scheme='//', request=None):
    """
        1. All links become absolute and get target=_blank.
        2. The cellpadding, cellspacing and border attributes are added to every table.
    """
    site = get_current_site(request)

    soup = Soup(html, 'html5lib')
    for tag in soup.findAll('a'):
        href = tag.get('href')
        if not href:
            continue

        tag['target'] = '_blank'
        if href.startswith('//'):
            tag['href'] = '%s%s' % (scheme, href[2:])
        elif href.startswith('/'):
            tag['href'] = '%s%s%s' % (scheme, site.domain, href)

    for tag in soup.findAll('img'):
        if tag.has_attr('height'):
            del tag['height']

        src = tag.get('src')
        if not src:
            continue

        if src.startswith('//'):
            tag['src'] = '%s%s' % (scheme, src[2:])
        elif src.startswith('/'):
            tag['src'] = '%s%s%s' % (scheme, site.domain, src)

        # srcset
        srcset = tag.get('srcset')
        if not srcset:
            continue

        srcset_final = []
        for srcset_part in srcset.split(','):
            url, width = srcset_part.strip().split()
            if url.startswith('//'):
                url = '%s%s' % (scheme, url[2:])
            elif url.startswith('/'):
                url = '%s%s%s' % (scheme, site.domain, url)
            srcset_final.append('%s %s' % (url, width))
        tag['srcset'] = ','.join(srcset_final)

    # Add attributes to the tables
    for tag in soup.findAll('table'):
        for attr in ('border', 'cellpadding', 'cellspacing'):
            if not tag.has_attr(attr):
                tag[attr] = '0'

    return soup.decode_contents()
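A framework-free sketch of the same link rewriting (domain and scheme are hard-coded here instead of coming from Django's get_current_site):

from bs4 import BeautifulSoup as Soup

def absolutize(html, domain='example.com', scheme='https://'):
    soup = Soup(html, 'html.parser')
    for tag in soup.findAll('a'):
        href = tag.get('href')
        if not href:
            continue
        tag['target'] = '_blank'
        if href.startswith('//'):
            tag['href'] = '%s%s' % (scheme, href[2:])
        elif href.startswith('/'):
            tag['href'] = '%s%s%s' % (scheme, domain, href)
    return soup.decode_contents()

print(absolutize('<a href="/news/1">item</a>'))
# roughly: <a href="https://example.com/news/1" target="_blank">item</a>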
Example #11
    def _soup_artist_page(self):
        """
        Soup the artist page.

        Return
            Soup of the artist page

        -----------------------------------------------------------------------
        """
        soup = BeautifulSoup(self.conn.browser.page_source, 'html.parser')
        soup = BeautifulSoup(soup.decode_contents(), 'lxml')
        return soup
Example #12
def parse_ns_token(ns_token):
    ns_node = BeautifulSoup(ns_token, 'lxml-xml').NS
    assert ns_node is not None

    err_type = ns_node.attrs.get('type', '').strip()

    if ns_node.i is None and ns_node.c is None:
        # error is detected but not edited
        ori = cor = ns_node.decode_contents().strip()
    else:
        ori = ns_node.i.decode_contents().strip() if ns_node.i else ''
        cor = ns_node.c.decode_contents().strip() if ns_node.c else ''

    return Edit(ori, cor, err_type)
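A hedged usage sketch: the <NS> markup below follows the annotation format this parser appears to expect, Edit is whatever container the surrounding module defines, and lxml must be installed for the 'lxml-xml' parser.

token = '<NS type="Vt"><i>walk</i><c>walked</c></NS>'
edit = parse_ns_token(token)
# decode_contents() yields the inner markup of <i>/<c>, so roughly:
# edit.ori == 'walk', edit.cor == 'walked', edit.err_type == 'Vt'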
Example #13
    def sanitize_html(html_value, valid_tags=VALID_TAGS, valid_attributes=VALID_ATTRIBUTES):
        """ Maybe we should have used Bleach (https://github.com/jsocol/bleach)
        """
        soup = BeautifulSoup(html_value)

        for tag in soup.find_all(True):
            if tag.name not in valid_tags:
                tag.hidden = True
            else: # it might have bad attributes
                for attr in list(tag.attrs):  # iterate over a copy; attributes are deleted below
                    if attr not in valid_attributes:
                        del tag[attr]

        return soup.decode_contents()
Example #14
    def _goto_artist_from_song(self, song_query):
        """
        Navigate to the artist page from a song page string.

        Args
            song_query: the last part of the song page url.
        """
        self.conn.query("/song/" + quote(song_query, safe=''))
        WebDriverWait(self.conn.browser, 20).until(
            EC.presence_of_element_located((By.CLASS_NAME, "song-artist")))
        soup = BeautifulSoup(self.conn.browser.page_source, 'html.parser')
        soup = BeautifulSoup(soup.decode_contents(), 'lxml')
        soup = soup.find("h2", {"class": "song-artist"})
        artist_link = soup.find("a")["href"]
        self.conn.browser.get(artist_link)
Example #15
        def change_html_img_src_to_absolute(html_content, parser='lxml'):
            html_bs = BeautifulSoup(html_content, parser)
            if html_bs.find('img') is not None:
                for image_tag in html_bs.find_all('img'):
                    if 'http' in image_tag.attrs['src']:
                        file_url = image_tag.attrs['src']
                    else:
                        file_url = self.hdu_mirror_url + image_tag.attrs['src']
                    file_url = file_url.replace('../', '/')

                    image_tag.attrs['src'] = self.oss.upload_file_from_url(
                        file_url, diy_prefix='hdu/%s/' % self.problem_id)

                return html_bs.decode_contents(formatter="html")
            return html_content
Example #16
def fix_relative_links(html: str, url: str) -> str:
    """
    Fixes issue with relative links, replaces
    '/index.html' with 'https://domain.com/index.html'
    """
    parsed_uri = urlparse(url)
    domain = '{uri.scheme}://{uri.netloc}'.format(uri=parsed_uri)
    soup = BeautifulSoup(html, 'html.parser')
    for a in soup.find_all('a'):
        if a.has_attr('href') and is_relative(a['href']):
            a['href'] = url + a['href']
    for img in soup.find_all('img', src=True):
        if is_relative(img['src']):
            img['src'] = domain + img['src']

    return soup.decode_contents()
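A possible usage sketch with a minimal is_relative helper (the real helper lives elsewhere in that module; this simplified stand-in treats anything without a scheme or protocol-relative prefix as relative):

def is_relative(link: str) -> bool:
    return not link.startswith(('http://', 'https://', '//'))

html = '<a href="/page.html">next</a> <img src="/logo.png"/>'
print(fix_relative_links(html, 'https://domain.com'))
# roughly: <a href="https://domain.com/page.html">next</a>
#          <img src="https://domain.com/logo.png"/>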
Example #17
    def sanitize_html(html_value,
                      valid_tags=VALID_TAGS,
                      valid_attributes=VALID_ATTRIBUTES):
        """ Maybe we should have used Bleach (https://github.com/jsocol/bleach)
        """
        soup = BeautifulSoup(html_value)

        for tag in soup.find_all(True):
            if tag.name not in valid_tags:
                tag.hidden = True
            else:  # it might have bad attributes
                for attr in list(tag.attrs):  # iterate over a copy; attributes are deleted below
                    if attr not in valid_attributes:
                        del tag[attr]

        return soup.decode_contents()
Example #18
def scrape_nfl_dot_com_strip():
    if too_soon_to_request_nfl():
        # print(f'Returning - only {seconds_elapsed} seconds have passed.')
        print('Returning')
        return
    update_request_time()
    raw_html = simple_get('https://www.nfl.com/')
    html = BeautifulSoup(raw_html, 'html.parser')
    content_string = html.decode_contents()
    start_location = content_string.find('__INITIAL_DATA__')
    json_start = content_string.find('{', start_location)
    json_end = content_string.find('\n', json_start)
    start_location = content_string.find('__REACT_ROOT_ID__')
    # Subtract 1 to remove the semicolon that ends the Javascript statement
    y = json.loads(content_string[json_start:json_end - 1])
    # print(json.dumps(y))
    for game in y['uiState']['scoreStripGames']:
        # FIXME: range length must not be hard coded
        for i in range(1):
            # print(f"{game['awayTeam']['identifier']}")
            # print(f"{conference_matchups[i][0]}")
            if game['awayTeam']['identifier'] == sb_matchups[i][0]:
                status = game['status']
                # print(status)
                if not status['isUpcoming']:
                    # print('Not upcoming')
                    if status['phaseDescription'] == 'FINAL':
                        print(f"{game['awayTeam']['identifier']} finished")
                        sb_finished[i] = True
                        home_score = game['homeTeam']['scores']['pointTotal']
                        away_score = game['awayTeam']['scores']['pointTotal']
                        delta = home_score - away_score
                        # print(f'Delta is {delta}')
                        sb_result[i] = delta
                    in_progress = status['isInProgress']
                    # print(in_progress)
                    in_progress_overtime = status['isInProgressOvertime']
                    # print(in_progress_overtime)
                    in_half = status['isHalf']
                    # print(in_half)
                    if in_progress or in_progress_overtime or in_half:
                        home_score = game['homeTeam']['scores']['pointTotal']
                        away_score = game['awayTeam']['scores']['pointTotal']
                        delta = home_score - away_score
                        # print(f'Delta is {delta}')
                        sb_result[i] = delta
                        sb_in_progress[i] = True
Example #19
def load_bjcp_json():
    try:
        file = open('styleguide-2015.min.json')
        content_string = file.read()
        bjcp_json = json.loads(content_string)
        file.close()
        return bjcp_json
    except FileNotFoundError:
        file = open('testfile.txt', 'w')
        file.write('Hello')
        file.close()
        raw_html = simple_get(
            'https://github.com/gthmb/bjcp-2015-json/blob/master/json/styleguide-2015.min.json'
        )
        html = BeautifulSoup(raw_html, 'html.parser')
        content_string = html.decode_contents()
        start_location = content_string.find('{"styleguide')
        end_location = content_string.find('</td>', start_location)
        y = json.loads(content_string[start_location:end_location])
        return y
Example #20
 def test_a_nfl_page(self):
     raw_html = self.simple_get('https://www.nfl.com/')
     html = BeautifulSoup(raw_html, 'html.parser')
     content_string = html.decode_contents()
     start_location = content_string.find('__INITIAL_DATA__')
     print(start_location)
     json_start = content_string.find('{', start_location)
     print(json_start)
     json_end = content_string.find('\n', json_start)
     print(json_end)
     start_location = content_string.find('__REACT_ROOT_ID__')
     print(start_location)
     y = json.loads(content_string[json_start:json_end - 1])
     # print(json.dumps(y))
     for game in y['uiState']['scoreStripGames']:
         print(game['awayTeam']['identifier'] + ' at ' +
               game['homeTeam']['identifier'])
         print('  Away Team:')
         for key in game['homeTeam'].keys():
             s = game['homeTeam']
             value = s[key]
             print(f'    {key}: {s[key]}')
         for key in game['awayTeam'].keys():
             s = game['awayTeam']
             value = s[key]
             print(f'    {key}: {s[key]}')
         print('  Status:')
         for key in game['status'].keys():
             status = game['status']
             value = status[key]
             print(f'    {key}: {status[key]}')
         # print(game.keys())
     # print (f'Response has {len(raw_html)} bytes.')
     game1 = y['uiState']['scoreStripGames'][0]['status']
     game4 = y['uiState']['scoreStripGames'][3]['status']
     print([k for k in game1.keys() if k in game4.keys()])
     self.assertEqual(True, True)
Example #21
def clean_body(html: str) -> str:
    """Cleans html body from unneccesary elements."""
    soup = BeautifulSoup(html, 'html.parser')
    # Set of tags that should be removed
    to_remove = [
        'nav',
        'aside',
        'meta',
        'ins',
        'footer',
        'script',
        'table[class*="infobox"]',
        'div[class*="navigation"]',
        'div[class*="footer"]',
        'div[class*="social-container"]',
        'div[class*="Left"]',
        'div[class*="Share"]',
        'div[class*="embed"]',
        'div[class*="crumb"]',
        'div[class*="sharing"]',
        'div[class*="related"]',
        'div[class*="comments"]',
        'div[class*="widget"]',
        'div[class*="meta"]',
        'div[class*="noprint"]',
        'div[class*="nav"]',
        'div[id*="nav"]',
        'table[class*="nav"]',
        'span[class*="mw-editsection"]',
        'div[id*="author"]',
        'div[class*="author"]',
        'span[class*="dsq-postid"]'
    ]
    for selector in to_remove:
        for item in soup.select(selector):
            item.decompose()
    return soup.decode_contents()
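A brief usage sketch (the input is invented):

html = '<nav>menu</nav><p>article text</p><div class="social-container">share</div>'
print(clean_body(html))  # roughly: <p>article text</p>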
Example #22
    def getDetailBerita(self, link):
        """
        Retrieve every element of the news article page.
        """
        time.sleep(5)
        articles = {}
        #link
        url = link[0]
        response = requests.get(url)
        html = response.text
        # Create a BeautifulSoup object from the HTML: soup
        soup = BeautifulSoup(html, "html5lib")

        #extract title
        find_title = soup.find('article',
                               class_="newslistouter container-base")
        title = find_title.find('h1').get_text(
            strip=True) if find_title else ''
        articles['title'] = title
        if ("foto:" in title.lower()) or "video:" in title.lower():
            return False

        #extract subcategory from breadcrumb
        bc = soup.find('ul', class_="breadcrumb")
        if not bc:
            return False
        sub = bc.findAll('li')[-2].get_text(strip=True) if bc else ''

        #category
        articles['category'] = 'Otomotif'
        articles['subcategory'] = sub

        #article_url
        articles['url'] = url

        #article
        article = soup.find('div', class_="content")

        #extract date
        scripts = json.loads(
            soup.findAll(
                'script',
                {'type': 'application/ld+json'})[-1].get_text(strip=True))
        pubdate = scripts['datePublished']
        pubdate = pubdate[0:19].strip(' \t\n\r')
        articles['pubdate'] = datetime.strftime(
            datetime.strptime(pubdate, "%Y-%m-%dT%H:%M:%S"),
            '%Y-%m-%d %H:%M:%S')

        #articleid
        articleid = url.replace('/', '')
        articleid = url.split('-')
        articleid = int(articleid[-1][-5:])
        articles['id'] = articleid

        #extract editor
        author = soup.find(
            'div', class_="publish-cont").find('a').get_text(strip=True)
        articles['author'] = author

        #source
        articles['source'] = 'oto'

        #extract comments count
        articles['comments'] = 0

        #extract tags
        articles['tags'] = scripts['keywords']

        #extract images
        image = article.find('img')['src']
        articles['images'] = image

        detail = article
        # remove inserted links
        for img in detail.findAll('img'):
            img.decompose()

        for div in detail.findAll('div'):
            div.decompose()

        for src in detail.findAll('p'):
            if ("sumber:" in src.get_text(strip=True).lower()):
                src.decompose()

        for p in detail.findAll('p'):
            if ("baca juga"
                    in p.get_text(strip=True).lower()) and (p.find('a')):
                p.decompose()
        # print(detail)
        #extract content
        detail = BeautifulSoup(detail.decode_contents().replace('<br/>', ' '),
                               "html5lib")
        content = re.sub(
            r'\n|\t|\b|\r', '',
            unicodedata.normalize("NFKD", detail.get_text(strip=True)))
        # print(content)
        articles['content'] = content
        #print('memasukkan berita id ', articles['id'])

        return articles
Example #23
    def getDetailBerita(self, count, link):
        """
        Retrieve every element of the news article page.
        """
        time.sleep(10)
        articles = {}
        #link
        url = link[0]

        print(url)
        try:
            response = requests.get(url)
        except:
            return False
        html = response.text
        # Create a BeautifulSoup object from the HTML: soup
        soup = BeautifulSoup(html, "html5lib")
        # print(soup)
        #extract subcategory from breadcrumb
        bc = soup.find('div', class_="breadcrumb")
        if not bc:
            return False

        sub = bc.findAll('a')[1].get_text(strip=True)
        if ("foto" in sub.lower()) or ("detiktv" in sub.lower()) or (
                "video" in sub.lower()) or ("photos" in sub.lower()) or (
                    "videos" in sub.lower()):
            return False

        articles['subcategory'] = sub
        #category
        articles['category'] = link[1]
        articles['url'] = url

        article = soup.find('article')

        #extract date
        pubdate = soup.find("meta", attrs={'name': 'publishdate'})
        if pubdate:
            pubdate = pubdate['content'].strip(' \t\n\r')
            articles['pubdate'] = datetime.strftime(
                datetime.strptime(pubdate, "%Y/%m/%d %H:%M:%S"),
                '%Y-%m-%d %H:%M:%S')
            id = soup.find("meta", attrs={'name': 'articleid'})
            articles['id'] = int(id['content']) if id else int(
                datetime.strptime(pubdate,
                                  "%Y/%m/%d %H:%M:%S").timestamp()) + len(url)
        else:
            pubdate = soup.find('span', {'class': 'date'})
            pubdate = pubdate.get_text(strip=True).strip(' \t\n\r').replace(
                " WIB", '')
            articles['pubdate'] = datetime.strftime(
                datetime.strptime(pubdate, "%A, %d %b %Y %H:%M"),
                '%Y-%m-%d %H:%M:%S')

            id = soup.find("meta", attrs={'name': 'articleid'})
            articles['id'] = int(id['content']) if id else int(
                datetime.strptime(pubdate,
                                  "%A, %d %b %Y %H:%M").timestamp()) + len(url)

        #extract author
        author = soup.find("meta", attrs={'name': 'author'})
        articles['author'] = author['content'] if author else ''

        #extract title
        title = article.find('meta', {"property": "og:title"})
        articles['title'] = title['content'] if title else ''

        #source
        articles['source'] = 'detik'

        #extract comments count
        komentar = soup.find('a', class_="komentar")
        articles['comments'] = int(
            komentar.find('span').get_text(strip=True).replace(
                'Komentar', '').strip(' \t\n\r')) if komentar else 0

        #extract tags
        tags = article.find('div', class_="detail_tag")
        articles['tags'] = ','.join(
            [x.get_text(strip=True)
             for x in tags.findAll('a')]) if tags else ''

        #extract images
        images = article.find('div', class_="pic_artikel")
        articles['images'] = images.find('img')['src'] if images else ''

        #extract detail
        if articles['category'] == 'news':
            detail = article.find('div', class_="detail_text")
        else:
            detail = article.find('div', attrs={"id": "detikdetailtext"})
            if not detail:
                detail = soup.find('div',
                                   attrs={"class": "read__content full mt20"})
                if not detail:
                    detail = soup.find('div', attrs={"id": "detikdetailtext"})
        if not detail:
            return False
        # remove inserted links
        if detail.findAll('table', class_="linksisip"):
            for link in detail.findAll('table', class_="linksisip"):
                link.decompose()

        # remove embedded videos
        if detail.findAll('div', class_="sisip_embed_sosmed"):
            for tag in detail.findAll('div', class_="sisip_embed_sosmed"):
                tag.decompose()

        # remove everything after the clearfix
        if detail.find('div', class_="clearfix mb20"):
            for det in detail.find('div',
                                   class_="clearfix mb20").findAllNext():
                det.decompose()

        # remove all script tags
        for script in detail.findAll('script'):
            script.decompose()

        for p in detail.findAll('p'):
            if ("baca juga"
                    in p.get_text(strip=True).lower()) and (p.find('a')):
                p.decompose()

        #extract content
        detail = BeautifulSoup(detail.decode_contents().replace('<br/>', ' '),
                               "html5lib")
        content = re.sub(
            r'\n|\t|\b|\r', '',
            unicodedata.normalize("NFKD", detail.get_text(strip=True)))
        articles['content'] = re.sub(r'(Tonton juga).*', '', content)
        print('memasukkan berita id ', articles['id'])

        return articles
Example #24
                    tag_name.string = args.label
                    o.bndbox.xmin.string = str(
                        int(round(float(o.bndbox.xmin.string)) * scale_width))
                    o.bndbox.xmax.string = str(
                        int(round(float(o.bndbox.xmax.string)) * scale_width))
                    o.bndbox.ymin.string = str(
                        int(round(float(o.bndbox.ymin.string)) * scale_height))
                    o.bndbox.ymax.string = str(
                        int(round(float(o.bndbox.ymax.string)) * scale_height))

        xml_out = 'tmp.xml'
        if xml_out == xml_in:
            raise Exception('{} will be overwritten'.format(xml_out))

        print('Writing ' + xml_out)
        f = open('tmp.xml', "w")
        f.write(soup.decode_contents())
        f.close()

        # a bit of hacky workaround to print a better looking xml than what beautifulsoup produces
        xmlf = parse('tmp.xml')
        pretty_xml_as_string = xmlf.toprettyxml()
        shutil.copyfile(xml_in, xml_in + '.bak')
        xml_out = xml_in
        # remove empty lines
        pretty_xml_as_string = os.linesep.join(
            [s for s in pretty_xml_as_string.splitlines() if s.strip()])
        with open(xml_out, 'w') as f:
            f.write(pretty_xml_as_string)
Example #25
    def getDetailBerita(self, link):
        """
        Retrieve every element of the news article page.
        """
        time.sleep(5)
        articles = {}
        #link
        url = link[0] + '?page=all'
        print(url)
        options = Options()
        options.add_argument('--headless')
        options.add_argument(
            '--disable-gpu')  # Last I checked this was necessary.
        options.add_argument('--disable-extensions')
        options.add_argument("--incognito")

        driver = webdriver.Chrome("../chromedriver.exe",
                                  chrome_options=options)
        html = ''
        try:
            driver.get(url)
            # Extract HTML texts contained in Response object: html
        except ConnectionError:
            driver.quit()
            print("Connection Error, but it's still trying...")
            time.sleep(10)
            return self.getDetailBerita(link)

        html = driver.page_source
        driver.quit()

        # Create a BeautifulSoup object from the HTML: soup
        soup = BeautifulSoup(html, "html5lib")

        scripts = soup.findAll('script', {'type': 'application/ld+json'})
        if scripts:
            scripts = re.sub(
                r'\n|\t|\b|\r', '',
                unicodedata.normalize("NFKD", scripts[0].get_text(strip=True)))
            scripts = json.loads(scripts)
        else:
            return False
        #category
        categories = soup.findAll('meta', {'name': 'cXenseParse:category'})

        articles[
            'category'] = categories[0]['content'] if categories else 'Berita'
        if len(categories) > 1:
            articles[
                'subcategory'] = categories[1]['content'] if categories else ''
        else:
            articles['subcategory'] = ''

        articles['url'] = url

        article = soup.find('div', {'id': 'article_con'})

        #extract date
        pubdate = scripts['datePublished']
        pubdate = pubdate[0:19].strip(' \t\n\r')
        articles['pubdate'] = datetime.strftime(
            datetime.strptime(pubdate, "%Y-%m-%dT%H:%M:%S"),
            '%Y-%m-%d %H:%M:%S')

        id = soup.find('meta', {"property": "android:app_id"})
        articles['id'] = int(id['content']) if id else int(
            datetime.strptime(pubdate,
                              "%d-%b-%Y %H:%M").timestamp()) + len(url)

        #extract author
        articles['author'] = scripts['author']['name']

        #extract title
        articles['title'] = scripts['headline']

        #source
        articles['source'] = 'tribunnews'

        #extract comments count
        articles['comments'] = 0

        #extract tags
        tags = article.find('div', class_="mb10 f16 ln24 mb10 mt5")
        articles['tags'] = ','.join([
            x.get_text(strip=True).replace('#', '') for x in tags.findAll('a')
        ]) if tags else ''

        #extract images
        articles['images'] = scripts['image']['url']

        #extract detail
        detail = article.find('div',
                              attrs={'class': 'side-article txt-article'})

        # remove embedded videos
        if detail.findAll('div'):
            for div in detail.findAll('div'):
                if div.find('script'):
                    div.decompose()

        # remove all script tags
        for script in detail.findAll('script'):
            script.decompose()

        # remove all noscript tags
        for ns in detail.findAll('noscript'):
            ns.decompose()

        # remove inserted links
        for ls in detail.findAll('p', class_="baca"):
            if ls.find('strong'):
                if 'baca' in ls.find('strong').get_text(strip=True).lower():
                    ls.decompose()

        #extract content
        detail = BeautifulSoup(detail.decode_contents().replace('<br/>', ' '),
                               "html5lib")
        content = re.sub(
            r'\n|\t|\b|\r', '',
            unicodedata.normalize("NFKD", detail.get_text(strip=True)))
        articles['content'] = content
        print('memasukkan berita id ', articles['id'])

        return articles
Example #26
    def convertInternal(self, text, wikiPages):
        soup = BeautifulSoup(text)
        imgs = []
        
        #Output of doxygen
        #http://www.stack.nl/~dimitri/doxygen/manual/htmlcmds.html
        #...and <map> tag

        #Accepted by mediawiki
        #http://meta.wikimedia.org/wiki/Help:HTML_in_wikitext

        #Output from doxygen and not supported by mediawiki
        #We must convert these
        #<a href="...">
        #<a name="...">
        #<img src="..." ...>
        #<map>
        
        #Convert <a>s
        for a in soup("a"):
            #A normal link
            newStr = None
            if "href" in a.attrs:
                href = a.attrs["href"]
                #Get link and fragment portions of href
                hashPos = href.rfind("#")
                fragment = ""
                if hashPos != -1:
                    fragment = href[hashPos:]
                    link = href[:hashPos]
                else:
                    link = href
                
                #Compare to list of wiki pages and change if necessary
                internalLink = False
                if link == "" and (fragment == "" or fragment == "#"): #Empty link
                    newStr = ""
                elif link == "": #Local link with only fragment
                    internalLink = True
                else: #Test if it matches an internal file, if not, external link
                    for page in wikiPages:
                        if link == page.filename:
                            internalLink = True
                            link = page.normtitle.title
                            break
                
                #What's the content?
                text = a.string
                tags = a.select("*")
                if text: #Simple text string
                    if not internalLink:
                        newStr = "[" + href + " " + text + "]"
                    else:
                        newStr = "[[" + link + fragment + "|" + text + "]]"
                elif len(tags) == 1 and tags[0].name == "img": #One image inside the a tag
                    img = tags[0]
                    imgs.append(ImagePage(self.filepath, img.attrs["src"]))
                    newStr = "[[File:" + img.attrs["src"] + "|link=" + link + fragment + "]]"
                else: #Something else
                    doxymwglobal.msg(doxymwglobal.msgType.debug, "Unhandled link with unknown contents")
                    newStr = ""
            
            #A named anchor or anchor with ID
            elif "name" in a.attrs or "id" in a.attrs:
                newStr = soup.new_tag("span")
                #Named anchors or ID'd anchors just become spans with IDs
                if "name" in a.attrs:
                    newStr.attrs["id"] = a.attrs["name"]
                else: #"id" in a.attrs:
                    newStr.attrs["id"] = a.attrs["id"]
                newStr.attrs["style"] = "width:0;height:0;font-size:0;"
                
            else:
                newStr = ""
                
            a.replace_with(newStr)
            
        #Convert and store <img>s
        for img in soup("img"):
            #File this image for later use
            imgs.append(ImagePage(self.filepath, img.attrs["src"]))
            
            #Convert the image
            newStr = "[[File:" + img.attrs["src"] + "]]"
            img.replace_with(newStr)
            
        #Convert <maps>
        #For now just delete them, we'll have to rely on a MW extension for this one later
        for map in soup("map"):
            map.replace_with("") 
            
        return (soup.decode_contents(formatter="html"), imgs)
Example #27
    def get_content(self, url, model):
        user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
        request = urllib.request.Request(url,
                                         headers={'User-Agent': user_agent})
        html = urllib.request.urlopen(request).read()
        soup = BeautifulSoup(html, 'html.parser')

        links = soup.find_all(
            'a',
            attrs={
                'class':
                'tv-widget-idea__title apply-overflow-tooltip js-widget-idea__popup'
            },
            href=True)

        urls = []
        for link in links:
            url = link['href']
            urls.append('https://www.tradingview.com' + url)

        for url in urls:
            request = urllib.request.Request(
                url, headers={'User-Agent': user_agent})
            html = urllib.request.urlopen(request).read()
            soup = BeautifulSoup(html, 'html.parser')

            title_class = 'tv-chart-view__title-name'
            title = soup.find(
                'h1',
                attrs={
                    'class': title_class
                },
            ).get_text()

            content_text_class = 'tv-chart-view__description'
            content = soup.find(
                'div',
                attrs={
                    'class': content_text_class
                },
            ).decode_contents()

            diagram_class = 'tv-card-social-item apply-common-tooltip tv-card-social-item--agrees tv-card-social-item--button tv-card-social-item--border tv-social-row__item'
            diagram = soup.find(
                'span',
                attrs={'class': diagram_class},
            )

            diagram_id = diagram['data-image_url']
            image = "https://www.tradingview.com/i/" + diagram_id

            title = str(title)

            # content = content.replace('<div class="tv-chart-view__description selectable">', '')
            # content.replace('<span class="tv-chart-view__tag-page-link">', '')
            # content.replace('</span>', '')
            soup = BeautifulSoup(content, 'html.parser')
            for a_tag in soup.findAll('a'):
                a_tag.unwrap()
            for span_tag in soup.findAll('span'):
                span_tag.unwrap()

            content = soup.decode_contents()
            content = content[:content.find('<br/>\n<br/>')]
            try:
                new_content = model(title=title,
                                    text=content,
                                    image_url=image,
                                    url=url)

                new_content.save()
            except IntegrityError:
                pass
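For reference, a tiny standalone sketch of the unwrap() step used above, which keeps a tag's children but drops the tag itself:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<p>see <a href="/idea">this idea</a> and <span>more</span></p>',
                     'html.parser')
for a_tag in soup.findAll('a'):
    a_tag.unwrap()
for span_tag in soup.findAll('span'):
    span_tag.unwrap()
print(soup.decode_contents())  # <p>see this idea and more</p>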
Example #28
    def getDetailBerita(self, link):
        """
        Retrieve every element of the news article page.
        """
        time.sleep(10)
        articles = {}
        #link
        url = link[0]
        print(url)
        try:
            response = requests.get(url)
        except ConnectionError:
            print("Connection Error, but it's still trying...")
            time.sleep(20)
            return self.getDetailBerita(link)
        html = response.text
        # Create a BeautifulSoup object from the HTML: soup
        soup = BeautifulSoup(html, "html5lib")

        # extract JSON-LD scripts
        scripts_all = soup.findAll('script',
                                   attrs={'type': 'application/ld+json'})
        scripts = ''
        scripts2 = ''
        if len(scripts_all) >= 2:
            scripts = re.sub(
                r'\n|\t|\b|\r', '',
                unicodedata.normalize("NFKD",
                                      scripts_all[-2].get_text(strip=True)))
            scripts = json.loads(scripts)
            scripts2 = re.sub(
                r'\n|\t|\b|\r', '',
                unicodedata.normalize("NFKD",
                                      scripts_all[-1].get_text(strip=True)))
            scripts2 = json.loads(scripts2)
        else:
            return False

        #category
        articles['category'] = scripts2['itemListElement'][0]['item']['name']
        articles['subcategory'] = scripts2['itemListElement'][1]['item'][
            'name']

        articles['url'] = url

        article = soup.find('section', class_="content-post clearfix")

        #extract date
        pubdate = soup.find('time', class_="date")
        pubdate = pubdate['datetime'] if pubdate else '1970-01-01'
        pubdate = pubdate.strip(' \t\n\r')
        articles['pubdate'] = datetime.strftime(
            datetime.strptime(pubdate, "%Y-%m-%d"), '%Y-%m-%d %H:%M:%S')

        articles['id'] = int(
            datetime.strptime(pubdate, "%Y-%m-%d").timestamp()) + len(url)

        #extract author
        articles['author'] = scripts['author']['name']

        #extract title
        articles['title'] = scripts['headline']

        #source
        articles['source'] = 'idntimes'

        #extract comments count
        # articles['comments'] = int(soup.find('span', class_="commentWidget-total").find('b').get_text(strip=True).strip(' \t\n\r'))
        articles['comments'] = 0

        #extract tags
        tags = article.find('div', class_="content-post-topic")
        articles['tags'] = ','.join(
            [x.get_text(strip=True)
             for x in tags.findAll('a')]) if tags else ''

        #extract images
        articles['images'] = scripts['image']['url']

        #extract detail
        detail = article.find('article', attrs={'id': 'article-content'})

        # remove divs that contain scripts
        if detail.findAll('div'):
            for div in detail.findAll('div'):
                if div.find('script'):
                    div.decompose()

        # remove inserted links
        if detail.findAll('strong'):
            for b in detail.findAll('strong'):
                if ("baca juga" in b.get_text(strip=True).lower()):
                    b.decompose()

        #extract content
        detail = BeautifulSoup(detail.decode_contents().replace('<br/>', ' '),
                               "html5lib")
        content = re.sub(
            r'\n|\t|\b|\r', '',
            unicodedata.normalize("NFKD", detail.get_text(strip=True)))
        articles['content'] = content
        print('memasukkan berita id ', articles['id'])

        return articles
Example #29
def parse_html(html_str, court_name, flag):
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        soup = BeautifulSoup(str(soup.prettify()), "html.parser")

        date_h4 = soup.find_all('h4', {'align': 'center'})[0]
        month_year = str(date_h4.text).replace('JUDGMENTS FOR THE MONTH OF', '').strip()

        table_list = soup.find_all('table', {'class': 'DISCOVERY3'})[0]
        table_soup = BeautifulSoup(str(table_list), "html.parser")
        tr_list = table_soup.find_all('tr')

        tr_count = 0
        for tr in tr_list:
            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break

            tr_count += 1
            if tr_count <= 1:
                continue

            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            judge_name = "NULL"
            subject = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False

            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')

            if flag:
                i = 1
            else:
                i = 0
            for td in td_list:
                i += 1
                if i == 2:
                    judgment_day = escape_string(str(td.decode_contents()))
                    judgment_date = str(re.findall(r'\d+', str(judgment_day))[0]) + ", " + month_year.replace(
                        'JUDGEMENTS FOR THE MONTH OF', '')

                if i == 3:
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    pdf_file = escape_string(str(base_url + a_tag.get('href')))
                    case_no = escape_string(str(a_tag.text).replace("\n", "").strip())

                    # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
                    #     insert_check = True
                    pdf_data = escape_string(request_pdf(str(base_url + a_tag.get('href')), case_no, court_name))

                if i == 4:
                    font_tag = BeautifulSoup(str(td), "html.parser").font
                    if font_tag is not None:
                        span_tag = font_tag.span
                    else:
                        span_tag = BeautifulSoup(str(td), "html.parser").span
                        if span_tag is None:
                            span_tag = BeautifulSoup(str(td), "html.parser")

                    party = str(span_tag.decode_contents()).split("<br/>")
                    petitioner = escape_string(
                        str(party[0]).replace('<td align="center" bgcolor="#FFFFFF" valign="middle" width="30%">',
                                              '').strip())
                    petitioner = re.sub(r'(\\x(.){2})', '', petitioner)

                    respondent = escape_string(str(party[2]).replace('</td>', '').strip())
                    respondent = re.sub(r'(\\x(.){2})', '', respondent)

                if i == 5:
                    subject = escape_string(str(td.decode_contents()).strip())

                if i == 6:
                    judge_name = escape_string(str(td.text).replace(r'\x', '').replace('\\xC2\\x92BLE', '').strip())
                    judge_name = re.sub(r'(\\x(.){2})', '', judge_name)
                    judge_name = re.sub(r'', '', judge_name, re.U)

            # if case_no != "NULL" and insert_check and td_list:
            if case_no != "NULL" and td_list:
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, petitioner, respondent, judgment_date, " \
                                                               "subject, pdf_file, pdf_filename) VALUE ('" + case_no + \
                            "', '" + petitioner + "', '" + respondent + "', '" + judgment_date + "', '" + subject + \
                            "', '" + pdf_file + "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)

                update_query("UPDATE " + court_name + " SET judge_name = '" + str(judge_name) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
Example #30
    def getDetailBerita(self, link):
        time.sleep(5)
        articles = {}
        #link
        url = link[0]
        response = requests.get(url)
        html = response.text
        # Create a BeautifulSoup object from the HTML: soup
        soup = BeautifulSoup(html, "html5lib")

        #extract subcategory from breadcrumb
        bc = soup.find('div', class_="breadcrumbs")
        if not bc:
            return False
        cat = bc.findAll('a')[-2].get_text(strip=True)
        sub = bc.findAll('a')[-1].get_text(strip=True)

        #articles
        article_id = soup.find('meta', attrs={"property":
                                              "og:image"})['content']
        articles['id'] = int(article_id.replace(
            '//', '').split('/')[6]) if article_id != "" else ''

        #category
        articles['category'] = cat
        articles['subcategory'] = sub

        articles['url'] = url

        article = soup.find('div', class_="tru")

        #extract date
        pubdate = soup.find('meta', attrs={'property':
                                           'og:updated_time'})['content']
        pubdate = datetime.fromtimestamp(int(pubdate))
        pubdate = datetime.strftime(pubdate, "%Y-%m-%d %H:%M:%S")

        pubdate_author = soup.find('div', class_='reg').text
        pubdate_author_split = pubdate_author.split(' \xa0\xa0 • \xa0\xa0 ')
        articles['pubdate'] = pubdate

        #extract author
        author = pubdate_author_split[0]
        articles['author'] = author

        #extract title
        title = soup.find('meta', attrs={"property": "og:title"})
        articles['title'] = title['content'] if title else ''

        if ("foto" in sub.lower()) or "video" in sub.lower():
            return False

        #source
        articles['source'] = 'metrotvnews'

        #extract comments count
        articles['comments'] = 0

        #extract tags
        tags = soup.find('div', class_="line").findAll('a', class_="tag")
        articles['tags'] = ','.join([x.get_text(strip=True) for x in tags])

        #extract images
        articles['images'] = soup.find('img', class_="pic")['src']

        #extract detail
        detail = soup.find('div', class_="tru")

        # remove inserted links
        for link in detail.findAll('div', class_="related"):
            link.decompose()

        # remove embedded videos
        for tag in detail.findAll('iframe', class_="embedv"):
            tag.decompose()

        # remove all script tags
        for script in detail.findAll('script'):
            script.decompose()

        for tabel in detail.findAll('table'):
            tabel.decompose()

        #extract content
        detail = BeautifulSoup(detail.decode_contents().replace('<br/>', ' '),
                               "html5lib")
        content = re.sub(
            r'\n|\t|\b|\r', '',
            unicodedata.normalize("NFKD", detail.get_text(strip=True)))
        articles['content'] = content.strip(' ')
        #print('memasukkan berita id ', articles['id'])

        return articles
Example #31
def main():
	csvwriter = csv.writer(file('shop.csv', 'wb'))
	csvwriter.writerow(['BARCODE', 'INGREDIENTS', 'PRODUCT NAME', 'CATEGORY', 'IMAGE'])

	# create path if needed
	if IS_save_html and not os.path.exists(Bin + '/phtml'):
		os.mkdir(Bin + '/phtml')
	if not os.path.exists(Bin + '/uploads'):
		os.mkdir(Bin + '/uploads')

	DEBUG_BARCODE = None

	re_ingredients = [
		re.compile('原材料名</b>\s*<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料名】<br/>\s*(.*?)\s*</p>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料名】<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料名:\s*(.*?)\s*</span>\s*<br/>\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料名</td>\s*<td[^\>]*>\s*(.*?)\s*<hr/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),

		re.compile('<成分><br/>\s*(.*?)\s*<br/>\s*【'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),

		re.compile('原材料に含まれるアレルギー物質:?\s*(.*?)\s*</p>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料に含まれるアレルギー物質:?\s*</div><div[^\>]*>(.*?)\s*</div>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料</b>\s*<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料:\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料\S?\s*<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料</b>\s*<br/>\s*<br/>\s*<br/>\s*(.*?)<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料</\w{2,3}>\s*<div[^\>]*>\s*(.*?)</div>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料\s*<br/>\s*(<table.*?</table>)'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料</b><br/><br/><br/>\s*(<table.*?</table>)'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
	]

	re_barcodes = [
		re.compile(r'JANコード:(\d{13}|\d{8})\b'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
	]

	url = "http://search.rakuten.co.jp/search/inshop-mall/-/-/sid.242246-st.A?x=35"
	c = get_url(url)

	page_now = 1
	while True:
		soup = BeautifulSoup(c)
		rsrSResultPhoto = soup.find_all('img', attrs={'src': re.compile('ex=96x96')})
		rsrSResultPhoto = map(lambda x: x.find_parent('a', attrs={'href': re.compile('http')}), rsrSResultPhoto)
		rsrSResultPhoto = filter(lambda x: x is not None, rsrSResultPhoto)
		rsrSResultPhoto = map(lambda x: x['href'], rsrSResultPhoto)

		if not rsrSResultPhoto:
			print '## CAN NOT FIND ANY RESULT RELATED TO ' + url
			break

		next_page = False
		pages = soup.find_all('a', attrs={'href': re.compile('-p.\d+-')})
		pages = filter(lambda x: x.get_text().strip() == str(page_now + 1), pages)
		if pages: next_page = pages[0]['href']
		page_now = page_now + 1
		# if page_now > 10: break

		to_fix = 0
		name, ingredients, image, matched_url = '', '', '', ''
		for in_url in rsrSResultPhoto:
			if 'http://item.rakuten.co.jp/book/' in in_url: continue

			print "\n\n"

			name, ingredients, image, matched_url = '', '', '', in_url
			c = get_url(in_url)
			if not c: continue # skip
			c.replace("<tr></font></td>", "</font></td>")

			soup = BeautifulSoup(c)
			cc = soup.decode_contents()

			barcode = ''
			for re_i in re_barcodes:
				m = re.search(re_i, cc)
				if m:
					barcode = m.group(1)

			if not barcode:
				barcode = soup.find('span', attrs={'class': 'item_number'})
				if barcode:
					barcode = barcode.get_text()
					barcode = re.sub('-(.*?)$', '', barcode)
					if (len(barcode) != 13 and len(barcode) != 8) or not barcode.isdigit():
						print "UNKNOWN barcode: " + barcode.encode('utf8')
						barcode = ''
			if not barcode:
				print "CAN NOT GET BARCODE FROM " + in_url
				continue
			print "get barcode as " + barcode.encode('utf8')

			for re_i in re_ingredients:
				m = re.search(re_i, cc)
				if m:
					tmptext = m.group(1).strip()
					soup2 = BeautifulSoup(tmptext)
					ingredients = soup2.get_text().strip()
					if len(ingredients) < 1000: break

			if '原材料'.decode('utf8') in cc and not ingredients:
				if DEBUG_BARCODE: print cc
				print "FIXME for " + in_url
				to_fix = 1

			if DEBUG_BARCODE: print ingredients

			if not len(name):
				name = soup.find('span', attrs={'class': 'content_title'})
				if name:
					name = name.get_text()
					name = re.sub('【\d+】'.decode('utf8'), '', name)

			image = soup.find('a', attrs={'class': re.compile('ImageMain')})
			if image and 'href' in image.attrs:
				image = image['href']
			elif image:
				image = image.find('img')
				if image:
					image = image['src']
					image = re.sub('\?.+$', '', image)

			category = soup.find('td', attrs={'class': 'sdtext'})
			if category: category = category.get_text().strip()

			if not ingredients:
				print 'no ingredients'
				continue

			if not image:
				print 'no image'
				continue # FIXME

			get_url(image, Bin + "/uploads/" + barcode + ".jpg");

			ingredients = ingredients.encode('utf8')
			ingredients = re.sub('\s+', ' ', ingredients).strip()
			name = name.encode('utf8')
			name = re.sub('\s+', ' ', name).strip()
			if not category: category = ''
			category = category.encode('utf8')
			category = re.sub('\s+', ' ', category).strip()
			csvwriter.writerow([barcode, ingredients, name, category, "uploads/" + barcode + ".jpg", matched_url])

		if not next_page: break # when it's an end
		print "### get next page: " + next_page
		c = get_url(next_page)
Example #32
    def getDetailBerita(self, link):
        """
        Retrieve every element of the news article page.
        """
        time.sleep(5)
        articles = {}
        #link
        url = link[0]
        response = requests.get(url)
        html2 = response.text
        # Create a BeautifulSoup object from the HTML: soup
        soup = BeautifulSoup(html2, "html5lib")
        print(url)
        #category
        articles['category'] = 'Properti'
        sb = soup.find('meta', {'property': 'article:section'})
        articles['subcategory'] = sb['content'] if sb else ''

        articles['url'] = url

        article = soup.find('div', {'id': 'post-content'})

        #extract date
        pubdate = soup.find('meta', {'property': 'article:published_time'})
        pubdate = pubdate['content'] if pubdate else '1970-01-01T01:01:01+00:00'
        pubdate = pubdate[0:19].strip(' \t\n\r')
        articles['pubdate'] = datetime.strftime(
            datetime.strptime(pubdate, "%Y-%m-%dT%H:%M:%S"),
            '%Y-%m-%d %H:%M:%S')

        id = soup.find('div', {'id': 'ajax-load-more'})
        articles['id'] = int(id['data-post-id']) if id else int(
            datetime.strptime(pubdate,
                              "%Y-%m-%dT%H:%M:%S").timestamp()) + len(url)

        #extract author
        author = article.find('span', {'class': 'author'})
        articles['author'] = author.get_text(strip=True) if author else ''

        #extract title
        title = soup.find('meta', {'property': 'og:title'})
        articles['title'] = title['content'] if title else ''

        #source
        articles['source'] = 'housingestate'

        #extract comments count
        articles['comments'] = 0

        #extract tags
        tags = soup.find('meta', {'property': 'article:tag'})
        articles['tags'] = tags['content'] if tags else ''

        #extract images
        images = soup.find("meta", attrs={'property': 'og:image'})
        articles['images'] = images['content'] if images else ''

        #extract detail
        detail = article.find('div', attrs={'class': 'content-txt'})

        #remove embedded video blocks
        if detail.findAll('div'):
            for div in detail.findAll('div'):
                if div.find('script'):
                    div.decompose()

        #remove all script tags
        for script in detail.findAll('script'):
            script.decompose()

        #remove all noscript tags
        for ns in detail.findAll('noscript'):
            ns.decompose()

        #remove inline "baca juga" (read also) links
        for ls in detail.findAll('p'):
            if ls.find('strong'):
                if 'baca' in ls.find('strong').get_text(strip=True).lower():
                    ls.decompose()

        #extract content
        detail = BeautifulSoup(detail.decode_contents().replace('<br/>', ' '),
                               "html5lib")
        content = re.sub(
            r'\n|\t|\b|\r', '',
            unicodedata.normalize("NFKD", detail.get_text(strip=True)))
        articles['content'] = content
        print('inserting article id ', articles['id'])

        return articles
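The content-extraction step at the end of getDetailBerita reappears unchanged in the scrapers below: render the body node with decode_contents(), turn <br/> tags into spaces, re-parse, and collapse the text to one line. A compact sketch of just that step, assuming html5lib is installed as in these examples; the helper name and the sample markup are illustrative.

import re
import unicodedata

from bs4 import BeautifulSoup

def flatten_content(detail):
    # re-render the node, turn <br/> into spaces, re-parse, then take plain text
    reparsed = BeautifulSoup(detail.decode_contents().replace('<br/>', ' '),
                             'html5lib')
    text = unicodedata.normalize('NFKD', reparsed.get_text(strip=True))
    return re.sub(r'\n|\t|\b|\r', '', text)

# illustrative usage
body = BeautifulSoup('<div><p>Paragraf satu.<br/>Paragraf dua.</p></div>',
                     'html5lib').find('div')
print(flatten_content(body))  # -> Paragraf satu. Paragraf dua.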
Ejemplo n.º 33
0
    def getDetailBerita(self, link):
        """
        Mengambil seluruh element dari halaman berita
        """
        time.sleep(5)
        articles = {}
        #link
        url = link[0]+'?page=all'
        try:
            response = requests.get(url)
        except requests.exceptions.ConnectionError:
            # retry after a pause and return the retried result
            print("Connection Error, but it's still trying...")
            time.sleep(10)
            return self.getDetailBerita(link)
        html2 = response.text
        # Create a BeautifulSoup object from the HTML: soup
        soup = BeautifulSoup(html2, "html5lib")
        print(url)
        scripts = soup.findAll('script', attrs={'type':'application/ld+json'})
        if scripts:
            scripts = re.sub(r'\n|\t|\b|\r','',unicodedata.normalize("NFKD",scripts[-1].get_text(strip=True)))
            scripts = json.loads(scripts)
        else:
            return False
        #category
        articles['category'] = 'Otomotif'
        articles['subcategory'] = link[1]

        articles['url'] = url

        article = soup.find('div', class_="read__article clearfix")

        #extract date
        pubdate = soup.find('meta', {'name':'content_date'})
        pubdate = pubdate['content'] if pubdate else '1970-01-01 00:00:00'
        pubdate = pubdate.strip(' \t\n\r')
        articles['pubdate'] = datetime.strftime(datetime.strptime(pubdate, "%Y-%m-%d %H:%M:%S"), '%Y-%m-%d %H:%M:%S')

        id = soup.find('meta', {'name':'content_id'})
        articles['id'] = int(id['content']) if id else int(datetime.strptime(pubdate, "%Y-%m-%d %H:%M:%S").timestamp()) + len(url)

        #extract author
        author = soup.find('meta', {'name':'content_author'})
        articles['author'] = author['content'] if author else ''

        #extract title
        articles['title'] = scripts['headline']

        #source
        articles['source'] = 'gridoto'

        #extract comments count
        articles['comments'] = 0

        #extract tags
        tags = soup.find('meta', {'name':'content_tag'})
        articles['tags'] = tags['content'] if tags else ''

        #extract images
        images = soup.find("meta", attrs={'property':'og:image'})
        articles['images'] = images['content'] if images else ''

        #extract detail
        detail = article.find('div', attrs={'class':'read__right'})

        #remove embedded video blocks
        if detail.findAll('div'):
            for div in detail.findAll('div'):
                if div.find('script'):
                    div.decompose()

        #remove all script tags
        for script in detail.findAll('script'):
            script.decompose()

        #remove all noscript tags
        for ns in detail.findAll('noscript'):
            ns.decompose()

        #remove inline "baca juga" (read also) links
        for ls in detail.findAll('p'):
            if ls.find('strong'):
                if 'baca' in ls.find('strong').get_text(strip=True).lower():
                    ls.decompose()

        #extract content
        detail = BeautifulSoup(detail.decode_contents().replace('<br/>', ' '), "html5lib")
        content = re.sub(r'\n|\t|\b|\r','',unicodedata.normalize("NFKD",detail.get_text(strip=True)))
        articles['content'] = content
        print('inserting article id ', articles['id'])

        return articles
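The connection-error handling above retries by calling itself with no depth limit. A bounded helper along the same lines is sketched here; the function name and the retry/backoff defaults are illustrative, not part of the original scraper.

import time

import requests

def get_with_retry(url, retries=3, backoff=10):
    # try a few times, sleeping between attempts, instead of recursing without a limit
    for _ in range(retries):
        try:
            return requests.get(url)
        except requests.exceptions.ConnectionError:
            print("Connection Error, but it's still trying...")
            time.sleep(backoff)
    return None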
Ejemplo n.º 34
0
def main():
	csvwriter = csv.writer(file('pkdata.csv', 'wb'))
	csvwriter.writerow(['BARCODE', 'INGREDIENTS', 'PRODUCT NAME', 'IMAGE', 'URL'])

	# create path if needed
	if IS_save_html and not os.path.exists(Bin + '/phtml'):
		os.mkdir(Bin + '/phtml')
	if not os.path.exists(Bin + '/uploads'):
		os.mkdir(Bin + '/uploads')

	DEBUG_BARCODE = None

	re_ingredients = [
		re.compile('原材料名</b>\s*<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料名】<br/>\s*(.*?)\s*</p>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料名】<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料名:\s*(.*?)\s*</span>\s*<br/>\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料名</td>\s*<td[^\>]*>\s*(.*?)\s*<hr/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料に含まれるアレルギー物質:?\s*(.*?)\s*</p>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料に含まれるアレルギー物質:?\s*</div><div[^\>]*>(.*?)\s*</div>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料</b>\s*<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料:\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料\S?\s*<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料</b>\s*<br/>\s*<br/>\s*<br/>\s*(.*?)<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料</\w{2,3}>\s*<div[^\>]*>\s*(.*?)</div>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料\s*<br/>\s*(<table.*?</table>)'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料</b><br/><br/><br/>\s*(<table.*?</table>)'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
	]

	re_barcodes = [
		re.compile(r'JANコード:(\d{13}|\d{8})\b'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
	]

	keyword = "ヌードル"
	# url = "http://search.rakuten.co.jp/search/mall/" + urllib.quote(keyword).decode('utf8') + "/100227/?grp=product"
	url = "http://search.rakuten.co.jp/search/mall/-/100283/?l-id=gt_swt_l_xs_100283"
	c = get_url(url)

	page_now = 1
	while True:
		soup = BeautifulSoup(c)
		rsrSResultPhoto = soup.find_all('div', attrs={'class': 'rsrSResultPhoto'})
		rsrSResultPhoto = map(lambda x: x.find('a', attrs={'href': re.compile('http')}), rsrSResultPhoto)
		rsrSResultPhoto = filter(lambda x: x is not None, rsrSResultPhoto)
		rsrSResultPhoto = map(lambda x: x['href'], rsrSResultPhoto)

		if not rsrSResultPhoto:
			print '## CAN NOT FIND ANY RESULT RELATED TO ' + keyword
			break

		next_page = False
		rsrPagination = soup.find('div', attrs={'class': 'rsrPagination'})
		if rsrPagination:
			pages = rsrPagination.find_all('a')
			pages = filter(lambda x: x.get_text().strip() == str(page_now + 1), pages)
			if pages: next_page = pages[0]['href']
		page_now = page_now + 1
		# if page_now > 10: break

		to_fix = 0
		name, ingredients, image, matched_url = '', '', '', ''
		for in_url in rsrSResultPhoto:
			if 'http://item.rakuten.co.jp/book/' in in_url: continue

			print "\n\n"

			name, ingredients, image, matched_url = '', '', '', in_url
			c = get_url(in_url)
			if not c: continue # skip
			c = c.replace("<tr></font></td>", "</font></td>")

			soup = BeautifulSoup(c)
			cc = soup.decode_contents()

			barcode = ''
			for re_i in re_barcodes:
				m = re.search(re_i, cc)
				if m:
					barcode = m.group(1)

			if not barcode:
				barcode = soup.find('span', attrs={'class': 'item_number'})
				if barcode:
					barcode = barcode.get_text()
					barcode = re.sub('-(.*?)$', '', barcode)
					if (len(barcode) != 13 and len(barcode) != 8) or not barcode.isdigit():
						print "UNKNOWN barcode: " + barcode.encode('utf8')
						barcode = ''
			if not barcode:
				print "CAN NOT GET BARCODE FROM " + in_url
				continue
			print "get barcode as " + barcode.encode('utf8')

			trs = soup.find_all('tr')
			while True:
				if not len(trs): break
				tr = trs.pop(0)
				__trs = tr.find_all('tr')
				if len(__trs): continue

				tds = tr.find_all(re.compile("^t[dh]$"))
				tds = map(lambda x: x.get_text().strip(), tds)
				tds = filter(lambda x: len(x), tds)
				if not len(tds): continue

				if tds[0] == '商品名'.decode('utf8'):
					if len(tds) > 1: name = tds[1]
				elif tds[0].endswith('原材料'.decode('utf8')) and len(tds) <= 2:
					if len(tds) > 1:
						ingredients = tds[1]
					else:
						ingredients = trs.pop(0).get_text().strip()
				elif (
						len(tds[0]) < 50 and ('原材料'.decode('utf8') in tds[0] or ('成分'.decode('utf8') in tds[0] and '栄養成分'.decode('utf8') not in tds[0]))
					) or (
						tds[0].endswith('原材料'.decode('utf8'))
					):
					if not ingredients:
						if len(tds) > 1:
							ingredients = tds[1]
						else:
							ingredients = trs.pop(0).get_text().strip()
				# remove BAD for next choice
				if 'item.rakuten.co.jp' in ingredients or 'iframe' in ingredients or len(ingredients) > 1000:
					ingredients = ''

			for re_i in re_ingredients:
				m = re.search(re_i, cc)
				if m:
					tmptext = m.group(1).strip()
					soup2 = BeautifulSoup(tmptext)
					ingredients = soup2.get_text().strip()
					if len(ingredients) < 1000: break

			if '原材料'.decode('utf8') in cc and not ingredients:
				if DEBUG_BARCODE: print cc
				print "FIXME for " + in_url
				to_fix = 1

			if DEBUG_BARCODE: print ingredients

			if not len(name):
				name = soup.find('span', attrs={'class': 'content_title'})
				if name:
					name = name.get_text()
					name = re.sub('【\d+】'.decode('utf8'), '', name)

			image = soup.find('a', attrs={'class': re.compile('ImageMain')})
			if image and 'href' in image.attrs:
				image = image['href']
			elif image:
				image = image.find('img')
				if image:
					image = image['src']
					image = re.sub('\?.+$', '', image)


			if not ingredients:
				print 'no ingredients'
				continue

			if not image:
				print 'no image'
				continue # FIXME

			get_url(image, Bin + "/uploads/" + barcode + ".jpg")

			ingredients = ingredients.encode('utf8')
			ingredients = re.sub('\s+', ' ', ingredients).strip()
			name = name.encode('utf8')
			name = re.sub('\s+', ' ', name).strip()
			csvwriter.writerow([barcode, ingredients, name, "uploads/" + barcode + ".jpg", matched_url])

		if not next_page: break # when it's an end
		print "### get next page: " + next_page
		c = get_url(next_page)
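Across the Rakuten and Yahoo scripts a barcode is accepted only when it is an 8- or 13-digit JAN/EAN code, taken either from the 'JANコード:' label in the rendered markup or from a bare digit run elsewhere on the page. A condensed helper combining those checks is sketched below; the function name is illustrative and not part of the original code.

import re

def extract_jan(markup):
    # prefer an explicit JAN label; fall back to any standalone 8- or 13-digit run
    m = re.search(r'JANコード:(\d{13}|\d{8})\b', markup)
    if m:
        return m.group(1)
    m = re.search(r'\D(\d{13}|\d{8})\D', markup)
    return m.group(1) if m else ''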
Ejemplo n.º 35
0
    def getDetailBerita(self, link):
        """
        Mengambil seluruh element dari halaman berita
        """
        time.sleep(5)
        articles = {}
        #link
        url = link[0]
        response = requests.get(url)
        html = response.text
        # Create a BeautifulSoup object from the HTML: soup
        soup = BeautifulSoup(html, "html5lib")

        #category
        articles['category'] = 'Otomotif'
        articles['subcategory'] = link[1]

        articles['url'] = url

        article = soup.find('div', class_="left-content")

        #extract date
        pubdate = article.find('li', {'class': 'publish-date'})
        pubdate = pubdate.get_text(
            strip=True).split(',') if pubdate else ['', '01-Jan-1970 00:00']
        pubdate = pubdate[1].strip(' \t\n\r').replace('Ags', 'Agt').replace(
            'Juli', 'Jul').replace('Juni', 'Jun').replace('Dec', 'Des')
        articles['pubdate'] = datetime.strftime(
            datetime.strptime(pubdate, "%d-%b-%Y %H:%M"), '%Y-%m-%d %H:%M:%S')
        articles['id'] = int(
            datetime.strptime(pubdate,
                              "%d-%b-%Y %H:%M").timestamp()) + len(url)

        #extract author
        author = article.find('span', {'itemprop': 'author'})
        articles['author'] = author.get_text(strip=True) if author else ''

        #extract title
        title = article.find('h1', {'class': 'entry-title'})
        articles['title'] = title.get_text(strip=True) if title else ''

        #source
        articles['source'] = 'carreview'

        #extract comments count
        articles['comments'] = 0

        #extract tags
        tags = article.find('div', class_="post-meta")
        articles['tags'] = ','.join([
            x.get_text(strip=True).replace('#', '') for x in tags.findAll('a')
        ]) if tags else ''

        #extract images
        images = soup.find("meta", attrs={'property': 'og:image'})
        articles['images'] = images['content'] if images else ''

        #extract detail
        detail = article.find('div', attrs={'class': 'entry-content'})

        #remove embedded video blocks
        if detail.findAll('div'):
            for div in detail.findAll('div'):
                if div.find('script'):
                    div.decompose()

        #remove all script tags
        for script in detail.findAll('script'):
            script.decompose()

        #remove all noscript tags
        for ns in detail.findAll('noscript'):
            ns.decompose()

        #remove the lead/description paragraph
        for p in detail.findAll('p', class_="lead"):
            p.decompose()

        #remove inline "baca juga" (read also) links
        for ls in detail.findAll('a'):
            if ls.find('strong'):
                if 'baca' in ls.find('strong').get_text(strip=True).lower():
                    ls.decompose()

        #extract content
        detail = BeautifulSoup(detail.decode_contents().replace('<br/>', ' '),
                               "html5lib")
        content = re.sub(
            r'\n|\t|\b|\r', '',
            unicodedata.normalize("NFKD", detail.get_text(strip=True)))
        articles['content'] = content
        print('inserting article id ', articles['id'])

        return articles
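The clean-up steps repeated in each getDetailBerita (drop wrapper divs that only hold a script, drop <script>/<noscript> tags, drop inline "baca" cross-links) can be factored into one helper. The sketch below condenses those steps under the same assumptions as the methods above; the function name and the sample markup are illustrative.

from bs4 import BeautifulSoup

def clean_article_body(detail):
    # drop wrapper divs that only carry an embedded script (video embeds)
    for div in detail.find_all('div'):
        if div.find('script'):
            div.decompose()
    # drop remaining script and noscript tags
    for tag in detail.find_all(['script', 'noscript']):
        tag.decompose()
    # drop inline "baca ..." (read also) cross-links wrapped in <strong>
    for node in detail.find_all(['p', 'a']):
        strong = node.find('strong')
        if strong and 'baca' in strong.get_text(strip=True).lower():
            node.decompose()
    return detail

# illustrative usage
body = BeautifulSoup(
    '<div><p><strong>Baca juga:</strong> tautan</p><p>Isi berita.</p></div>',
    'html5lib').find('div')
print(clean_article_body(body).get_text(strip=True))  # -> Isi berita.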
Ejemplo n.º 36
0
def main():
	csvwriter = csv.writer(file('pdata.csv', 'wb'))
	csvwriter.writerow(['BARCODE', 'INGREDIENTS', 'PRODUCT NAME', 'IMAGE', 'URL'])

	# create path if needed
	if IS_save_html and not os.path.exists(Bin + '/phtml'):
		os.mkdir(Bin + '/phtml')
	if not os.path.exists(Bin + '/uploads'):
		os.mkdir(Bin + '/uploads')

	DEBUG_BARCODE = None

	re_ingredients = [
		re.compile('原材料名</b>\s*<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料名】<br/>\s*(.*?)\s*</p>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料名】<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料名</td>\s*<td[^\>]*>\s*(.*?)\s*<hr/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料に含まれるアレルギー物質:?\s*(.*?)\s*</p>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料に含まれるアレルギー物質:?\s*</div><div[^\>]*>(.*?)\s*</div>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料</b>\s*<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料:\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料】<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料</b>\s*<br/>\s*<br/>\s*<br/>\s*(.*?)<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料</\w{2,3}>\s*<div[^\>]*>\s*(.*?)</div>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料\s*<br/>\s*(<table.*?</table>)'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料</b><br/><br/><br/>\s*(<table.*?</table>)'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
	]

	txtfile = sys.argv[1]
	fh = open(txtfile, 'r')
	wfh = open('missing_barcodes.txt', 'w')
	while True:
		barcode = fh.readline()
		if not barcode: break
		barcode = barcode.strip()
		if barcode == 'Barcode': continue
		if DEBUG_BARCODE and barcode != DEBUG_BARCODE: continue

		print "\n\n"

		url = "http://search.rakuten.co.jp/search/mall?sitem=" + barcode + "&g=0&myButton.x=0&myButton.y=0&v=2&s=1&p=1&min=&max=&sf=0&st=A&nitem=&grp=product";
		c = get_url(url)
		soup = BeautifulSoup(c)
		rsrSResultPhoto = soup.find_all('div', attrs={'class': 'rsrSResultPhoto'})
		rsrSResultPhoto = map(lambda x: x.find('a', attrs={'href': re.compile('http')}), rsrSResultPhoto)
		rsrSResultPhoto = filter(lambda x: x is not None, rsrSResultPhoto)
		rsrSResultPhoto = map(lambda x: x['href'], rsrSResultPhoto)

		if not rsrSResultPhoto:
			print '## MISSING results for ' + barcode
			wfh.write(barcode + "\n")
			continue

		to_fix = 0
		name, ingredients, image, matched_url = '', '', '', ''
		for in_url in rsrSResultPhoto:
			if 'http://item.rakuten.co.jp/book/' in in_url: continue
			if 'rakuten.co.jp/doremi/' in in_url: continue # skip BAD
			if 'rakuten.co.jp/at-life/' in in_url: continue

			name, ingredients, image, matched_url = '', '', '', in_url
			c = get_url(in_url)
			if not c: continue # skip
			c = c.replace("<tr></font></td>", "</font></td>")

			soup = BeautifulSoup(c)
			trs = soup.find_all('tr')
			while True:
				if not len(trs): break
				tr = trs.pop(0)
				__trs = tr.find_all('tr')
				if len(__trs): continue

				tds = tr.find_all(re.compile("^t[dh]$"))
				tds = map(lambda x: x.get_text().strip(), tds)
				tds = filter(lambda x: len(x), tds)
				if not len(tds): continue

				if tds[0] == '商品名'.decode('utf8'):
					if len(tds) > 1: name = tds[1]
				elif tds[0].endswith('原材料'.decode('utf8')) and len(tds) <= 2:
					if len(tds) > 1:
						ingredients = tds[1]
					else:
						ingredients = trs.pop(0).get_text().strip()
				elif (
						len(tds[0]) < 50 and ('原材料'.decode('utf8') in tds[0] or ('成分'.decode('utf8') in tds[0] and '栄養成分'.decode('utf8') not in tds[0]))
					) or (
						tds[0].endswith('原材料'.decode('utf8'))
					):
					if not ingredients:
						if len(tds) > 1:
							ingredients = tds[1]
						else:
							ingredients = trs.pop(0).get_text().strip()
				# remove BAD for next choice
				if 'item.rakuten.co.jp' in ingredients or 'iframe' in ingredients or len(ingredients) > 1000:
					ingredients = ''

			cc = soup.decode_contents()
			for re_i in re_ingredients:
				m = re.search(re_i, cc)
				if m:
					tmptext = m.group(1).strip()
					soup2 = BeautifulSoup(tmptext)
					ingredients = soup2.get_text().strip()
					if len(ingredients) < 1000: break

			if '原材料'.decode('utf8') in cc and not ingredients:
				if DEBUG_BARCODE: print cc
				print "FIXME for " + in_url
				to_fix = 1

			if DEBUG_BARCODE: print ingredients

			if not len(name):
				name = soup.find('span', attrs={'class': 'content_title'})
				if name:
					name = name.get_text()
					name = re.sub('【\d+】'.decode('utf8'), '', name)

			image = soup.find('a', attrs={'class': re.compile('ImageMain')})
			if image and 'href' in image.attrs:
				image = image['href']
			elif image:
				image = image.find('img')
				if image:
					image = image['src']
					image = re.sub('\?.+$', '', image)

			if name and ingredients: break

		if not image:
			print 'no image'
			wfh.write(barcode + "\n")
			continue # FIXME
			sys.exit(1)
		if not name:
			print 'no name'
			sys.exit(1)
		if not ingredients:
			print 'no ingredients'
			if to_fix:
				print "REAL FIXME: " + barcode
			wfh.write(barcode + "\n")
			continue ## FIXME
			sys.exit(1)

		get_url(image, Bin + "/uploads/" + barcode + ".jpg")

		ingredients = ingredients.encode('utf8')
		ingredients = re.sub('\s+', ' ', ingredients).strip()
		name = name.encode('utf8')
		name = re.sub('\s+', ' ', name).strip()
		csvwriter.writerow([barcode, ingredients, name, "uploads/" + barcode + ".jpg", matched_url])

	fh.close()
	wfh.close()
Ejemplo n.º 37
0
def main():
	csvwriter = csv.writer(file('ysdata.csv', 'wb'))
	csvwriter.writerow(['BARCODE', 'INGREDIENTS', 'PRODUCT NAME', 'IMAGE', 'URL'])

	# create path if needed
	if IS_save_html and not os.path.exists(Bin + '/yhtml'):
		os.mkdir(Bin + '/yhtml')
	if not os.path.exists(Bin + '/uploads'):
		os.mkdir(Bin + '/uploads')

	DEBUG_BARCODE = None

	re_ingredients = [
		re.compile('原材料名</b>\s*<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料名】<br/>\s*(.*?)\s*</p>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料名】<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料名:\s*(.*?)\s*</span>\s*<br/>\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料名</td>\s*<td[^\>]*>\s*(.*?)\s*<hr/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料名】\s*(.*?)\s*<br/>【'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料に含まれるアレルギー物質:?\s*(.*?)\s*</p>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料に含まれるアレルギー物質:?\s*</div><div[^\>]*>(.*?)\s*</div>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料</b>\s*<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料:\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料\S?\s*<br/>\s*(.*?)\s*<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料</b>\s*<br/>\s*<br/>\s*<br/>\s*(.*?)<br/>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料</\w{2,3}>\s*<div[^\>]*>\s*(.*?)</div>'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料\s*<br/>\s*(<table.*?</table>)'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
		re.compile('原材料</b><br/><br/><br/>\s*(<table.*?</table>)'.decode('utf8'), re.I|re.DOTALL|re.MULTILINE),
	]

	keyword = "ヌードル"
	# url = "http://search.rakuten.co.jp/search/mall/" + urllib.quote(keyword).decode('utf8') + "/100227/?grp=product"
	url = "http://category.shopping.yahoo.co.jp/list/1167/?tab_ex=commerce&fr=shp-prop"
	c = get_url(url)

	page_now = 1
	while True:
		soup = BeautifulSoup(c)
		rsrSResultPhoto = soup.find_all('h3', attrs={'class': 'elName'})
		rsrSResultPhoto = map(lambda x: x.find('a', attrs={'href': re.compile('http')}), rsrSResultPhoto)
		rsrSResultPhoto = filter(lambda x: x is not None, rsrSResultPhoto)
		rsrSResultPhoto = map(lambda x: x['href'], rsrSResultPhoto)

		if not rsrSResultPhoto:
			print '## CAN NOT FIND ANY RESULT RELATED TO ' + keyword
			break

		next_page = False
		rsrPagination = soup.find('div', attrs={'id': 'Sp1'})
		if rsrPagination:
			pages = rsrPagination.find_all('a')
			pages = filter(lambda x: x.get_text().strip() == str(page_now + 1), pages)
			if pages: next_page = pages[0]['href']
		page_now = page_now + 1
		# if page_now > 3: break

		to_fix = 0
		name, ingredients, image, matched_url = '', '', '', ''
		for in_url in rsrSResultPhoto:
			if 'aff.makeshop.jp' in in_url: continue
			print "\n\n"

			name, ingredients, image, matched_url = '', '', '', in_url
			c = get_url(in_url)
			if not c: continue # skip

			soup = BeautifulSoup(c)

			barcode = soup.find('p', attrs={'class': 'jan'})
			if barcode:
				barcode = barcode.get_text()
				barcode = re.sub('^(.*?):'.decode('utf8'), '', barcode)
				if len(barcode) != 13 or not barcode.isdigit():
					print "UNKNOWN barcode: " + barcode.encode('utf8')
					barcode = ''
			if not barcode:
				h1 = soup.find('h1')
				if h1:
					m = re.search('\D(\d{13})\D', h1.get_text())
					if m: barcode = m.group(1)
			if not barcode:
				print "CAN NOT GET BARCODE FROM " + in_url
				continue
			print "get barcode as " + barcode.encode('utf8')

			trs = soup.find_all('tr')
			while True:
				if not len(trs): break
				tr = trs.pop(0)
				__trs = tr.find_all('tr')
				if len(__trs): continue

				tds = tr.find_all(re.compile("^t[dh]$"))
				tds = map(lambda x: x.get_text().strip(), tds)
				tds = filter(lambda x: len(x), tds)
				if not len(tds): continue

				if tds[0] == '商品名'.decode('utf8'):
					if len(tds) > 1: name = tds[1]
				elif tds[0].endswith('原材料'.decode('utf8')) and len(tds) <= 2:
					if len(tds) > 1:
						ingredients = tds[1]
					else:
						ingredients = trs.pop(0).get_text().strip()
				elif (
						len(tds[0]) < 50 and ('原材料'.decode('utf8') in tds[0] or ('成分'.decode('utf8') in tds[0] and '栄養成分'.decode('utf8') not in tds[0]))
					) or (
						tds[0].endswith('原材料'.decode('utf8'))
					):
					if not ingredients:
						if len(tds) > 1:
							ingredients = tds[1]
						else:
							ingredients = trs.pop(0).get_text().strip()
				# remove BAD for next choice
				if 'item.rakuten.co.jp' in ingredients or 'iframe' in ingredients or len(ingredients) > 1000:
					ingredients = ''

			cc = soup.decode_contents()
			for re_i in re_ingredients:
				m = re.search(re_i, cc)
				if m:
					tmptext = m.group(1).strip()
					soup2 = BeautifulSoup(tmptext)
					ingredients = soup2.get_text().strip()
					if len(ingredients) < 1000: break

			if '原材料'.decode('utf8') in cc and not ingredients:
				if DEBUG_BARCODE: print cc
				print "FIXME for " + in_url
				to_fix = 1

			if DEBUG_BARCODE: print ingredients

			if not len(name):
				name = soup.find('span', attrs={'property': 'rdfs:label'})
				if not name: name = soup.find('h1', attrs={'itemprop': 'name'})
				if name:
					name = name.get_text()
					name = re.sub('【\d+】'.decode('utf8'), '', name)

			image = soup.find('span', attrs={'rel': re.compile('media:image')})
			if image:
				image = image.parent['href']
				# href="javascript:openItemImage('/mizota/enlargedimage.html?code=100200044&img=http://item.shopping.c.yimg.jp/i/l/mizota_100200044');"
				m = re.search("img=(.*?)\'", image)
				if m: image = m.group(1)
			else:
				image = soup.find('img', attrs={'id': 'productlargeImage'})
				if image:
					image = image['src']
					if image.startswith('//'): image = 'http' + image

			if not ingredients:
				print 'no ingredients'
				continue

			if not image:
				print 'no image'
				continue # FIXME

			get_url(image, Bin + "/uploads/" + barcode + ".jpg")

			ingredients = ingredients.encode('utf8')
			ingredients = re.sub('\s+', ' ', ingredients).strip()
			name = name.encode('utf8')
			name = re.sub('\s+', ' ', name).strip()
			csvwriter.writerow([barcode, ingredients, name, "uploads/" + barcode + ".jpg", matched_url])

		if not next_page: break # when it's an end
		print "### get next page: " + next_page
		c = get_url(next_page)