Example #1
0
    def strip_post(soup, year, month, temp_dir, ua, pause, attempts, time_out, res_to_skip, target_prefix):
        """Extract, sanitize and hash every blog post published in one month.

        Posts are located through <h3 class="post-title entry-title"> headings
        in *soup*; each post URL is fetched, saved to a temp file, stripped of
        scripts, comments, label footers and share buttons, then hashed via
        Utility.hash_all.

        Args:
            soup: BeautifulSoup of the month's archive page.
            year, month: used to build the per-post temp file names.
            temp_dir: directory where the raw post HTML files are written.
            ua, pause, attempts, time_out: HTTP retry/backoff parameters
                forwarded to Utility.request_and_save / Utility.hash_all.
            res_to_skip: resources forwarded to Utility.hash_all.
            target_prefix: prefix forwarded to the Utility.sanitize_* helpers.

        Returns:
            list of per-post hash dictionaries, or None when no post was
            published in the month (original implicit-None behavior kept).
        """
        # No matching <h3> means no article published this month.
        if not soup.find('h3', {'class': 'post-title entry-title'}):
            return None

        list_post = []
        post_url = []
        try:
            post = soup.find_all('h3', {'class': 'post-title entry-title'})
        except Exception:
            socks.setdefaultproxy()
            EmailNotifiers.emergencyCall('Tag error: \n'
                                         'post = soup.find_all(''h3'', {''class'': ''post-title entry-title''})')
            exit(1)

        # Collect the URLs to open in order to compute the hashes.
        for title in post:
            try:
                if title.a:
                    post_url.append(Utility.sanitize_string(str(title.a['href']), target_prefix))  # full post url
            except Exception:
                socks.setdefaultproxy()
                EmailNotifiers.emergencyCall('Tag error: \n'
                                             'post_url.append(title.a[''href''])')
                exit(1)

        # For each post: download, sanitize, strip dynamic parts, hash.
        for i, url in enumerate(post_url):
            logging.info('- analysis: %s' % url)
            file_path = '{0}{1}_{2}_{3}.txt'.format(str(temp_dir), str(year), str(month), str(i))

            # Download the post page and sanitize the saved HTML file.
            Utility.request_and_save('{0}'.format(str(url)), file_path,
                                     ua, attempts, pause, time_out)
            Utility.sanitize_it(file_path, target_prefix)

            # BUG FIX: use a context manager — the original open() leaked
            # one file handle per post.
            with open(file_path) as fh:
                soup = BeautifulSoup(fh)

            # Only the 'date-outer' div holds the post content without
            # dynamic/external parts or css.
            light_post = soup.find('div', attrs={'class': 'date-outer'})
            if light_post is None:
                # BUG FIX: the original left light_post unbound here and
                # crashed later with a NameError masked by a bare except,
                # reporting the wrong tag in the alert.
                socks.setdefaultproxy()
                EmailNotifiers.emergencyCall('Tag error: \n'
                                             'soup.find(''div'', attrs={''class'': ''date-outer''})')
                exit(1)

            # Title of the current post.
            try:
                title = light_post.find('h3', {'class': 'post-title entry-title'}).contents[0]
            except Exception:
                socks.setdefaultproxy()
                EmailNotifiers.emergencyCall('Tag error: \n'
                                             'light_post.find(''h3'','
                                             '{''class'': ''post-title entry-title''}).contents[0]')
                exit(1)

            title = title.strip('\n')
            # post url is variable url
            body = light_post

            # Remove every <script> element.
            for x in body.find_all('script'):
                x.extract()

            # Keep the comments section text, then drop it from the body.
            # BUG FIX: default to '' so hash_all never sees an unbound name
            # when the post has no comments div (the original NameError'd
            # at the hash_all call, outside any try).
            comments = ''
            try:
                comments_div = body.find('div', {'class': 'comments', 'id': 'comments'})
                if comments_div:
                    comments = comments_div.get_text()
                    for x in body.find_all('div', {'class': 'comments', 'id': 'comments'}):
                        x.extract()
            except Exception:
                socks.setdefaultproxy()
                EmailNotifiers.emergencyCall('Tag error: \n'
                                             'body.find(''div'', {''class'': ''comments'', ''id'': ''comments''}):')
                exit(1)

            # FROM THIS POINT BODY HAS NO COMMENTS AND NO SCRIPTS.
            # Remove the labels footer line (find_all returns [] when absent,
            # so no pre-check is needed).
            try:
                for x in body.find_all('div', {'class': 'post-footer-line post-footer-line-2'}):
                    x.extract()
            except Exception:
                socks.setdefaultproxy()
                EmailNotifiers.emergencyCall('Tag error: \n'
                                             'body.find_all(''div'','
                                             '{''class'': ''post-footer-line post-footer-line-2''}):')
                exit(1)

            # Remove the share buttons.
            try:
                for x in body.find_all('div', {'class': 'post-share-buttons goog-inline-block'}):
                    x.extract()
            except Exception:
                socks.setdefaultproxy()
                EmailNotifiers.emergencyCall('Tag error: \n'
                                             'body.find_all(''div'','
                                             '{''class'': ''post-share-buttons goog-inline-block''})')
                exit(1)

            # #################################### #
            # ---------- START HASHING ----------- #
            # #################################### #
            dict_post = Utility.hash_all(body, title, url, comments, temp_dir, pause, ua, attempts, time_out, res_to_skip)
            list_post.append(dict_post)
        return list_post