Example #1
    def request_and_save_img(u, p, ua, a, pause, time_out):
        start = time.time()
        opener = urllib2.build_opener()
        opener.addheaders = [('User-Agent', ua)]
        i = 0
        the_proof = True
        # retry until the download succeeds or the attempt limit is reached
        while i < int(a) and the_proof:
            try:
                response = opener.open(u, timeout=float(time_out))
                the_proof = False
            except urllib2.URLError as error:
                # HTTPError is a subclass of URLError, so this also covers
                # timeouts and connection failures, not just HTTP status errors
                the_proof = True
                logging.debug('- Error {0} at attempt number: {1} with image url: {2} -'.format(
                    str(getattr(error, 'code', error.reason)), str(i), str(u)))
                i += 1
                time.sleep(float(pause))
        if the_proof:
            stop = time.time() - start
            message = 'Error after {0} seconds, while timeout was: {1} -\n'.format(str(stop), str(time_out))
            message += 'Error after {0} attempts, while saving image: {1} -'.format(str(a), str(u))
            logging.debug(message)
            socks.setdefaultproxy()
            EmailNotifiers.debug_error_Call(message)
            exit(1)
        img_data = response.read()

        try:
            # binary mode: the payload is raw image data
            f = open(p, 'wb')
            f.write(img_data)
            f.close()
        except IOError:
            logging.info('- ERROR in request_and_save_img with file {0} -'.format(str(p)))
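A minimal call sketch, assuming request_and_save_img is exposed as a static method of the Utility class seen in Example #4; the url, destination path, user agent and retry values below are illustrative placeholders, not project configuration:

    # hypothetical invocation - all argument values are made up
    Utility.request_and_save_img('http://example.com/img/photo.jpg',  # u: image url
                                 'temp/0/photo.jpg',                  # p: destination file
                                 'Mozilla/5.0',                       # ua: user agent header
                                 3,                                   # a: max attempts
                                 5,                                   # pause: seconds between retries
                                 30)                                  # time_out: seconds per request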
Example #2
def handler(c, e, t, signal, frame):
    logging.info('- Autokill -')
    # stop the tor session so the public IP check below goes out directly
    socks.setdefaultproxy()
    import urllib2

    logging.info(
        '- Closing tor. Public IP address: {0}'.format(str(urllib2.urlopen('http://icanhazip.com').read())))
    # sending debug mail
    EmailNotifiers.debugCall()
    # exit(2) - by autokill
    exit(2)
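Because handler takes three leading arguments before the standard (signal, frame) pair, it is presumably registered with those arguments pre-bound. A sketch of such wiring, using functools.partial and purely hypothetical values for c, e and t, might look like:

    import functools
    import signal as sig

    # hypothetical registration: pre-bind c, e and t so the signal machinery
    # only supplies the trailing (signal, frame) pair; SIGALRM and the one-hour
    # timer are illustrative choices, not taken from the project
    sig.signal(sig.SIGALRM, functools.partial(handler, 'c-value', 'e-value', 't-value'))
    sig.alarm(3600)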
Example #3
    def report_me(y, m, t, on_tor):
        path = 'temp/0/'
        list_file = []
        for x in os.listdir(path):
            if x.endswith('.txt') and y + '_' + m + '_' in x:
                list_file.append(x)

        for i in range(len(list_file)):
            soup = BeautifulSoup(open('{0}{1}_{2}_{3}.txt'.format(path, str(y), str(m), str(i))))
            title = None
            heading = soup.find('h3', {'class': 'post-title entry-title'})
            if heading and heading.contents:
                title = heading.contents[0]
                title = title.strip('\n')
                title = title.encode('utf-8')

            if title == t:
                logging.debug("- Intercept for post: {}-".format(str(t)))
                report_file = '{0}{1}_{2}_{3}.txt'.format(path, str(y), str(m), str(i))
                if on_tor['USE']:
                    # drop the tor proxy so the mail goes out directly, then restore it
                    socks.setdefaultproxy()
                    EmailNotifiers.interceptCall(report_file)
                    socks.setdefaultproxy(socks.SOCKS5, str(on_tor['P_SERVER']), int(on_tor['P_PORT']), True)
                    socket.socket = socks.socksocket
                else:
                    EmailNotifiers.interceptCall(report_file)
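For context, report_me only reads the USE, P_SERVER and P_PORT keys of on_tor; a call could therefore look like the sketch below, where every value is a placeholder (9050 is only the conventional local tor SOCKS port):

    # hypothetical configuration and call - values are illustrative
    on_tor = {'USE': True, 'P_SERVER': '127.0.0.1', 'P_PORT': 9050}
    report_me('2014', '05', 'Some post title', on_tor)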
Example #4
    def strip_post(soup, year, month, temp_dir, ua, pause, attempts, time_out, res_to_skip, target_prefix):
        # the post variable holds the articles, identified via {'class': 'post-title entry-title'}

        # check that at least one article was published in the month under examination
        if soup.find('h3', {'class': 'post-title entry-title'}):
            list_post = []
            post_url = []
            try:
                post = soup.find_all('h3', {'class': 'post-title entry-title'})
            except Exception:
                socks.setdefaultproxy()
                EmailNotifiers.emergencyCall("Tag error: \n"
                                             "post = soup.find_all('h3', {'class': 'post-title entry-title'})")
                exit(1)

            # save the list of urls to open in order to obtain the hashes
            for title in post:
                try:
                    if title.a:
                        post_url.append(Utility.sanitize_string(str(title.a['href']), target_prefix))  # full url of the post
                except Exception:
                    socks.setdefaultproxy()
                    EmailNotifiers.emergencyCall("Tag error: \n"
                                                 "post_url.append(title.a['href'])")
                    exit(1)

            # ################################################################ #
            # an html file with one month's posts has been opened:             #
            # read it, split it and save its content as SHA-128 hashes         #
            # loop to open every post belonging to the current month           #
            # and build the dictionary for the current year and month          #
            # ################################################################ #

            for i in range(len(post_url)):
                url = post_url[i]
                logging.info('- analysis: %s' % url)
                # strip the post and save it as a new file
                Utility.request_and_save('{0}'.format(str(url)), '{0}{1}_{2}_{3}.txt'.format(str(temp_dir),
                                                                                             str(year),
                                                                                             str(month), str(i)),
                                         ua, attempts, pause, time_out)

                # Sanitize file
                Utility.sanitize_it('{0}{1}_{2}_{3}.txt'.format(str(temp_dir), str(year), str(month), str(i)), target_prefix)

                soup = BeautifulSoup(open('{0}{1}_{2}_{3}.txt'.format(str(temp_dir),
                                                                      str(year),
                                                                      str(month),
                                                                      str(i))))

                # analyze only the div ('div', attrs={'class': 'date-outer'}) that contains the post,
                # without dynamic parts, external resources or css
                try:
                    if soup.find('div', attrs={'class': 'date-outer'}):
                        light_post = soup.find('div', attrs={'class': 'date-outer'})
                except Exception:
                    socks.setdefaultproxy()
                    EmailNotifiers.emergencyCall("Tag error: \n"
                                                 "soup.find('div', attrs={'class': 'date-outer'})")
                    exit(1)

                # for each url/post analyzed: split it, compute the hashes and save them
                try:
                    if light_post.find('h3', {'class': 'post-title entry-title'}).contents[0]:
                        title = light_post.find('h3', {'class': 'post-title entry-title'}).contents[0]
                except Exception:
                    socks.setdefaultproxy()
                    EmailNotifiers.emergencyCall("Tag error: \n"
                                                 "light_post.find('h3', "
                                                 "{'class': 'post-title entry-title'}).contents[0]")
                    exit(1)

                title = title.strip('\n')
                # the post url is already held in the variable url
                body = light_post

                # remove the scripts
                for x in body.find_all('script'):
                    x.extract()

                # save the section reserved for the comments - text only
                comments = ''
                try:
                    if body.find('div', {'class': 'comments', 'id': 'comments'}):
                        comments = body.find('div', {'class': 'comments', 'id': 'comments'})
                        comments = comments.get_text()
                        # strip the comments out of the body
                        for x in body.find_all('div', {'class': 'comments', 'id': 'comments'}):
                            x.extract()
                except Exception:
                    socks.setdefaultproxy()
                    EmailNotifiers.emergencyCall("Tag error: \n"
                                                 "body.find('div', {'class': 'comments', 'id': 'comments'}):")
                    exit(1)

                # FROM THIS POINT ON, BODY HAS NO COMMENTS AND NO SCRIPTS
                # remove the labels
                try:
                    if body.find_all('div', {'class': 'post-footer-line post-footer-line-2'}):
                        for x in body.find_all('div', {'class': 'post-footer-line post-footer-line-2'}):
                            x.extract()
                except Exception:
                    socks.setdefaultproxy()
                    EmailNotifiers.emergencyCall("Tag error: \n"
                                                 "body.find_all('div', "
                                                 "{'class': 'post-footer-line post-footer-line-2'}):")
                    exit(1)

                # remove the share buttons
                try:
                    if body.find_all('div', {'class': 'post-share-buttons goog-inline-block'}):
                        for x in body.find_all('div', {'class': 'post-share-buttons goog-inline-block'}):
                            x.extract()
                except Exception:
                    socks.setdefaultproxy()
                    EmailNotifiers.emergencyCall("Tag error: \n"
                                                 "body.find_all('div', "
                                                 "{'class': 'post-share-buttons goog-inline-block'})")
                    exit(1)

                # #################################### #
                # ---------- START HASHING ----------- #
                # #################################### #
                dict_post = Utility.hash_all(body, title, url, comments, temp_dir, pause, ua, attempts, time_out, res_to_skip)
                list_post.append(dict_post)
            return list_post
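A possible driver for strip_post, assuming it sits on the same Utility class whose request_and_save and hash_all methods it calls; the saved month page, year/month strings and tuning values are invented for illustration:

    # hypothetical usage - file name, dates and tuning values are made up
    soup = BeautifulSoup(open('temp/0/2014_05.html'))
    posts = Utility.strip_post(soup, '2014', '05', 'temp/0/', 'Mozilla/5.0',
                               pause=5, attempts=3, time_out=30,
                               res_to_skip=[], target_prefix='http://example.blogspot.com')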