def request_and_save_img(u, p, ua, a, pause, time_out):
    """Download the image at url u and save it to path p, retrying up to
    a times with a pause (seconds) between attempts; ua is the User-Agent."""
    start = time.time()
    opener = urllib2.build_opener()
    opener.addheaders = [('User-Agent', ua)]
    i = 0
    the_proof = True  # True while the download has not yet succeeded
    while i < int(a) and the_proof:
        try:
            response = opener.open(u, timeout=float(time_out))
            the_proof = False
        except urllib2.HTTPError as error:
            the_proof = True
            logging.debug('- HTTP error {0} at attempt number: {1} with image url: {2} -'
                          .format(str(error.code), str(i + 1), str(u)))
            i += 1
            time.sleep(float(pause))
    if the_proof:
        # every attempt failed: log, drop the Tor proxy so the notification
        # mail travels over a direct connection, then abort
        stop = time.time() - start
        message = 'Error after {0} seconds, while timeout was: {1} -\n'.format(str(stop), str(time_out))
        message += 'Error after {0} attempts, while saving image: {1} -'.format(str(a), str(u))
        logging.debug(message)
        socks.setdefaultproxy()
        EmailNotifiers.debug_error_Call(message)
        exit(1)
    img_data = response.read()
    try:
        # write in binary mode: image payloads are not text
        f = open(p, 'wb')
        f.write(img_data)
        f.close()
    except IOError:
        logging.info('- ERROR in request_and_save with file {0} -'.format(str(p)))
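
# A minimal usage sketch for request_and_save_img, defined but never called by
# this module. The URL, destination path, User-Agent string, and retry values
# below are hypothetical placeholders, not values from the real configuration.
def _demo_request_and_save_img():
    user_agent = 'Mozilla/5.0 (X11; Linux x86_64)'  # hypothetical UA string
    request_and_save_img('http://example.com/image.jpg',  # hypothetical source URL
                         'temp/0/image.jpg',              # hypothetical destination path
                         user_agent,
                         a=3,           # up to 3 attempts
                         pause=2,       # 2 seconds between attempts
                         time_out=10)   # 10-second socket timeout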
def handler(c, e, t, signum, frame):
    """Autokill signal handler: tear down the Tor session, report the public
    IP, send the debug mail, and exit. The last two arguments are the
    standard (signum, frame) pair passed in by the signal module."""
    logging.info('- Autokill -')
    # stop the Tor session: reset the default proxy so traffic goes direct
    socks.setdefaultproxy()
    logging.info('- Closing tor. Public IP address: {0}'
                 .format(str(urllib2.urlopen('http://icanhazip.com').read())))
    # send the debug mail
    EmailNotifiers.debugCall()
    # exit(2) - by autokill
    exit(2)
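
# Signal handlers only receive (signum, frame), so the extra (c, e, t)
# parameters suggest handler() is pre-bound before registration. A hedged
# sketch of how that wiring could look; the use of SIGALRM, the
# functools.partial binding, and the timeout value are assumptions, not code
# confirmed by this file:
def _register_autokill(c, e, t, seconds):
    import signal as sig
    from functools import partial
    # bind the first three arguments; the signal module supplies the last two
    sig.signal(sig.SIGALRM, partial(handler, c, e, t))
    sig.alarm(int(seconds))  # hypothetical: autokill after `seconds` seconds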
def report_me(y, m, t, on_tor):
    """Scan the cached posts for year y / month m and, when one matches
    title t, send an intercept notification (outside Tor when Tor is on)."""
    path = 'temp/0/'
    list_file = []
    for x in os.listdir(path):
        if 'txt' in x and y + '_' + m + '_' in x:
            list_file.append(x)
    for i in range(len(list_file)):
        file_name = '{0}{1}_{2}_{3}.txt'.format(path, str(y), str(m), str(i))
        soup = BeautifulSoup(open(file_name))
        title_tag = soup.find('h3', {'class': 'post-title entry-title'})
        if title_tag and title_tag.contents:
            title = title_tag.contents[0]
            title = title.strip('\n')
            title = title.encode('utf-8')
            if title == t:
                logging.debug("- Intercept for post: {} -".format(str(t)))
                if on_tor['USE']:
                    # the mail must travel outside Tor: drop the proxy, send,
                    # then restore the SOCKS5 proxy and the socket monkey-patch
                    socks.setdefaultproxy()
                    EmailNotifiers.interceptCall(file_name)
                    socks.setdefaultproxy(socks.SOCKS5, str(on_tor['P_SERVER']),
                                          int(on_tor['P_PORT']), True)
                    socket.socket = socks.socksocket
                else:
                    EmailNotifiers.interceptCall(file_name)
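
# report_me toggles the global SOCKS proxy off and back on around the mail
# call, because SMTP goes out over a direct connection while scraping traffic
# rides Tor. A sketch of that pattern as a reusable helper, assuming the same
# on_tor dict layout ({'USE': ..., 'P_SERVER': ..., 'P_PORT': ...}); the
# helper itself is hypothetical and not used elsewhere in this module:
def _without_tor(on_tor, func, *args):
    """Run func(*args) over a direct connection, then restore the Tor proxy."""
    if on_tor['USE']:
        socks.setdefaultproxy()  # drop the SOCKS5 default: direct connection
        try:
            return func(*args)
        finally:
            # restore the Tor SOCKS5 proxy and re-patch the socket module
            socks.setdefaultproxy(socks.SOCKS5, str(on_tor['P_SERVER']),
                                  int(on_tor['P_PORT']), True)
            socket.socket = socks.socksocket
    return func(*args)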
def strip_post(soup, year, month, temp_dir, ua, pause, attempts, time_out, res_to_skip, target_prefix):
    # the variable post holds the articles, identified via {'class': 'post-title entry-title'};
    # first check that at least one article was published in the month under analysis
    if soup.find('h3', {'class': 'post-title entry-title'}):
        list_post = []
        post_url = []
        try:
            post = soup.find_all('h3', {'class': 'post-title entry-title'})
        except Exception:
            socks.setdefaultproxy()
            EmailNotifiers.emergencyCall('Tag error: \n'
                                         "post = soup.find_all('h3', {'class': 'post-title entry-title'})")
            exit(1)
        # save the list of URLs to open in order to compute the hashes
        for title in post:
            try:
                if title.a:
                    # full URL of the post
                    post_url.append(Utility.sanitize_string(str(title.a['href']), target_prefix))
            except Exception:
                socks.setdefaultproxy()
                EmailNotifiers.emergencyCall('Tag error: \n'
                                             "post_url.append(title.a['href'])")
                exit(1)
        # ################################################################ #
        # an HTML file with one month's posts has been opened:             #
        # read it, split it, and save its content as SHA-128 hashes;       #
        # loop to open every post of the current month and build the       #
        # dictionary for the current year and month                        #
        # ################################################################ #
        for i, url in enumerate(post_url):
            logging.info('- analysis: %s' % url)
            file_name = '{0}{1}_{2}_{3}.txt'.format(str(temp_dir), str(year), str(month), str(i))
            # strip the post and save it to a new file
            Utility.request_and_save('{0}'.format(str(url)), file_name, ua, attempts, pause, time_out)
            # sanitize the file
            Utility.sanitize_it(file_name, target_prefix)
            soup = BeautifulSoup(open(file_name))
            # analyze only the div ('div', attrs={'class': 'date-outer'}) that holds the post,
            # without dynamic parts, external content, or CSS
            try:
                light_post = soup.find('div', attrs={'class': 'date-outer'})
            except Exception:
                socks.setdefaultproxy()
                EmailNotifiers.emergencyCall('Tag error: \n'
                                             "soup.find('div', attrs={'class': 'date-outer'})")
                exit(1)
            # for each url/post analyzed: split it apart, compute the hashes and save
            try:
                title = light_post.find('h3', {'class': 'post-title entry-title'}).contents[0]
            except Exception:
                socks.setdefaultproxy()
                EmailNotifiers.emergencyCall('Tag error: \n'
                                             "light_post.find('h3', {'class': 'post-title entry-title'}).contents[0]")
                exit(1)
            title = title.strip('\n')
            # the post url is the loop variable url
            body = light_post
            # remove the scripts
            for x in body.find_all('script'):
                x.extract()
            # save the comments section - text only
            comments = ''  # default when the post has no comments section
            try:
                if body.find('div', {'class': 'comments', 'id': 'comments'}):
                    comments = body.find('div', {'class': 'comments', 'id': 'comments'})
                    comments = comments.get_text()
                    # remove the comments
                    for x in body.find_all('div', {'class': 'comments', 'id': 'comments'}):
                        x.extract()
            except Exception:
                socks.setdefaultproxy()
                EmailNotifiers.emergencyCall('Tag error: \n'
                                             "body.find('div', {'class': 'comments', 'id': 'comments'}):")
                exit(1)
            # FROM THIS POINT ON, BODY HAS NO COMMENTS AND NO SCRIPTS
            # remove the labels
            try:
                if body.find_all('div', {'class': 'post-footer-line post-footer-line-2'}):
                    for x in body.find_all('div', {'class': 'post-footer-line post-footer-line-2'}):
                        x.extract()
            except Exception:
                socks.setdefaultproxy()
                EmailNotifiers.emergencyCall('Tag error: \n'
                                             "body.find_all('div', {'class': 'post-footer-line post-footer-line-2'}):")
                exit(1)
            # remove the share buttons
            try:
                if body.find_all('div', {'class': 'post-share-buttons goog-inline-block'}):
                    for x in body.find_all('div', {'class': 'post-share-buttons goog-inline-block'}):
                        x.extract()
            except Exception:
                socks.setdefaultproxy()
                EmailNotifiers.emergencyCall('Tag error: \n'
                                             "body.find_all('div', {'class': 'post-share-buttons goog-inline-block'})")
                exit(1)
            # #################################### #
            # ---------- START HASHING ----------- #
            # #################################### #
            dict_post = Utility.hash_all(body, title, url, comments, temp_dir,
                                         pause, ua, attempts, time_out, res_to_skip)
            list_post.append(dict_post)
        return list_post
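
# A hedged end-to-end sketch of how strip_post might be driven: download one
# month's archive page, parse it, and collect the per-post hash dictionaries.
# The Blogger archive URL pattern and every parameter value below are
# hypothetical placeholders, not configuration from this project:
def _demo_strip_month(year, month):
    ua = 'Mozilla/5.0 (X11; Linux x86_64)'  # hypothetical UA string
    url = 'http://example.blogspot.com/{0}_{1}_01_archive.html'.format(year, month)
    # same argument order as the request_and_save call inside strip_post
    Utility.request_and_save(url, 'temp/0/month.txt', ua, 3, 2, 10)
    soup = BeautifulSoup(open('temp/0/month.txt'))
    return strip_post(soup, year, month, 'temp/0/', ua,
                      pause=2, attempts=3, time_out=10,
                      res_to_skip=[], target_prefix='example.blogspot.com')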