Example #1
0
    def run(self):
        """Scan every stripped post for the configured keywords.

        Loads the posts via ``Utility.find_and_strip`` and, for each post,
        records title hits, keyword hits and matching sentences while
        accumulating a per-post coherence score.

        Returns:
            tuple: ``(result, coherence_max)`` where ``result`` maps each
            post URL to a dict of its matches (only posts with at least one
            match appear) and ``coherence_max`` is the highest per-post
            coherence score observed.
        """
        post_to_verify = Utility.find_and_strip(self._temp_dir, self._dict)
        result = {}
        coherence_max = 0
        for post in post_to_verify:
            coherence = 0
            one_result = {}
            url_to_verify = post['url']
            logging.info('- Verifying url: {0} -'.format(str(url_to_verify)))
            # Normalise before matching: searches are case-insensitive and
            # should ignore surrounding newlines.
            text = post['text'].lower().strip('\n')
            title = post['title'].strip('\n').lower()

            if self._find_in_title(title):
                one_result['title'] = title
                coherence += 1
                logging.debug("Incrementing coherence +1 (find_in_title)")

            words = self._find_in_text(text)
            if words:
                one_result['keywords found'] = []
                for word in words:
                    one_result['keywords found'].append(word)
                    logging.debug("Word found: %s" % (word))
                    # NOTE(review): this passes the *whole* word list, so the
                    # same neighborhood score is added once per word found —
                    # confirm this is intended (it looks like it should take
                    # `word`, or sit outside the loop).
                    coherence += self._neighborhood(text, words)

            if coherence:
                if coherence > coherence_max:
                    coherence_max = coherence
                logging.debug('- Coherence: {0}' .format(str(coherence)))

            sentences = self._find_in_sentence(text)
            if sentences:
                one_result['sentences'] = list(sentences)

            # Only report URLs that produced at least one kind of match.
            if one_result:
                result[url_to_verify] = one_result

        logging.debug(result)
        return result, coherence_max
Example #2
0
    def scrape_and_save(self):
        """Scrape the target blog month by month and persist the post hashes.

        Walks every month from (``_month_start``, ``_year_start``) to
        (``_month_end``, ``_year_end``) inclusive, downloads each monthly
        archive page, strips the posts out of it, and finally saves the
        nested ``{year: {month: [posts]}}`` dictionary to ``_hash_path``.
        """
        logging.info('- IP address over tor service: ' + urllib2.urlopen('http://icanhazip.com').read())
        dict_tot = {}
        # scrape the blog and save all html pages per month
        logging.info('- Scraping and saving hash from {0} {1} to {2} {3} -'.format(str(self._month_start),
                                                                                   str(self._year_start),
                                                                                   str(self._month_end),
                                                                                   str(self._year_end)))
        for year in range(self._year_start, self._year_end + 1):
            # First year starts at _month_start, last year stops at
            # _month_end, any year in between covers all 12 months.  When
            # _year_start == _year_end both bounds apply at once.
            month_first = self._month_start if year == self._year_start else 1
            month_last = self._month_end if year == self._year_end else 12
            dict_hash = self._scrape_months(year, month_first, month_last)
            if dict_hash:
                dict_tot[year] = dict_hash

        JsonConverter.save_dictionary_to_file(self._hash_path, dict_tot)
        logging.info('- Scraped and saved all hash with no errors -')

    def _scrape_months(self, year, month_first, month_last):
        """Scrape one year's months in [month_first, month_last]; return {month: posts}."""
        dict_hash = {}
        for month in range(month_first, month_last + 1):
            buffer_list = self._scrape_month(year, month)
            if buffer_list:
                dict_hash[month] = buffer_list
        return dict_hash

    def _scrape_month(self, year, month):
        """Download, save and strip a single monthly archive page.

        The archive URL uses a zero-padded month ("/2013/04") while the
        local html file does not ("2013_4.html"), matching the original
        naming scheme.  Returns whatever ``Utility.strip_post`` yields
        (falsy when the month had no posts).
        """
        url = "{0}/{1}/{2:02d}".format(str(self._target), str(year), month)
        page_path = "{0}{1}_{2}.html".format(str(self._temp_dir), str(year), str(month))
        Utility.request_and_save(url, page_path, self._user_agent,
                                 self._attempts, self._pause, self._time_out)
        # Close the page file deterministically instead of leaking the
        # handle into BeautifulSoup.
        with open(page_path) as page:
            soup = BeautifulSoup(page)
        return Utility.strip_post(soup, year, month, self._temp_dir,
                                  self._user_agent, self._pause, self._attempts,
                                  self._time_out, self._res_to_skip,
                                  self._target_prefix)