import logging
import urllib2

from bs4 import BeautifulSoup

# Project-local helpers; these import paths are assumptions -- adjust them
# to the actual package layout:
# from utility import Utility
# from json_converter import JsonConverter


def run(self):
    """Check every scraped post against the keyword dictionary and score it.

    Returns a (result, coherence_max) pair: result maps each post URL to
    the evidence found in it (title hit, keywords, sentences), and
    coherence_max is the highest coherence score seen over all posts.
    """
    post_to_verify = Utility.find_and_strip(self._temp_dir, self._dict)
    result = {}
    coherence_max = 0
    for post in post_to_verify:
        coherence = 0
        one_result = {}
        url_to_verify = post['url']
        logging.info('- Verifying url: {0} -'.format(url_to_verify))
        text = post['text'].lower().strip('\n')
        title = post['title'].strip('\n')
        title = title.lower()  # TODO: new
        if self._find_in_title(title):
            one_result['title'] = title
            coherence += 1
            logging.debug('Incrementing coherence +1 (find_in_title)')
        words = self._find_in_text(text)
        if words:
            one_result['keywords found'] = []
            for word in words:
                one_result['keywords found'].append(word)
                logging.debug('Word found: %s', word)
            coherence += self._neighborhood(text, words)
        if coherence:
            if coherence > coherence_max:
                coherence_max = coherence
            logging.debug('- Coherence: {0}'.format(coherence))
        sentences = self._find_in_sentence(text)
        if sentences:
            one_result['sentences'] = list(sentences)
        if one_result:
            result[url_to_verify] = one_result
    logging.debug(result)
    return result, coherence_max
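# A usage sketch for run(), illustrative only: the class name (BlogAnalyzer)
# and the constructor arguments below are assumptions, not part of this
# source; only run()'s return shape is taken from the method above.
#
#   analyzer = BlogAnalyzer(temp_dir='/tmp/blog_scrape/',      # hypothetical
#                           dictionary=['bitcoin', 'wallet'])  # hypothetical
#   result, coherence_max = analyzer.run()
#   # result maps each matching post URL to the evidence found in it, e.g.
#   # {'http://example.onion/2015/03/post.html': {
#   #     'title': 'my new wallet',
#   #     'keywords found': ['wallet'],
#   #     'sentences': ['...']}}
#   # coherence_max is the best per-post score: +1 for a title hit, plus
#   # whatever _neighborhood() awards for keyword proximity in the text.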
def scrape_and_save(self):
    """Download one archive page per month in the configured range, extract
    the posts, and save the collected hashes to a JSON file."""
    logging.info('- IP address over tor service: ' +
                 urllib2.urlopen('http://icanhazip.com').read())
    dict_tot = {}
    # Scrape the blog and save all HTML pages, one archive page per month.
    logging.info('- Scraping and saving hash from {0} {1} to {2} {3} -'.format(
        self._month_start, self._year_start,
        self._month_end, self._year_end))
    for year in range(self._year_start, self._year_end + 1):
        # The first year starts at month_start and the last year stops at
        # month_end; any year in between covers all twelve months. When
        # year_start == year_end this reduces to the single range
        # month_start..month_end.
        first_month = self._month_start if year == self._year_start else 1
        last_month = self._month_end if year == self._year_end else 12
        dict_hash = {}
        for month in range(first_month, last_month + 1):
            # Archive URLs zero-pad the month; the local file name does not.
            page_path = '{0}{1}_{2}.html'.format(self._temp_dir, year, month)
            Utility.request_and_save(
                '{0}/{1}/{2:02d}'.format(self._target, year, month),
                page_path, self._user_agent, self._attempts,
                self._pause, self._time_out)
            with open(page_path) as page:
                soup = BeautifulSoup(page, 'html.parser')
            buffer_list = Utility.strip_post(
                soup, year, month, self._temp_dir, self._user_agent,
                self._pause, self._attempts, self._time_out,
                self._res_to_skip, self._target_prefix)
            if buffer_list:
                dict_hash[month] = buffer_list
        if dict_hash:
            dict_tot[year] = dict_hash
    JsonConverter.save_dictionary_to_file(self._hash_path, dict_tot)
    logging.info('- Scraped and saved all hash with no errors -')