def notify(config, result, result_k, coherence, extension):
        now = datetime.datetime.now()
        # date format: YYYYMMDD
        today = now.strftime("%Y%m%d")

        logging.info("- EmailNotifier is sending...")
        smtp_server = config["SERVER"]
        smtp_port = config["PORT"]
        smtp_username = config["USERNAME"]
        smtp_password = config["PASSWORD"]
        smtp_from = config["FROM"]
        smtp_to = config["TO"]
        smtp_cc = config["CC"]
        smtp_bcc = config["BCC"]
        smtp_subject = config["SUBJECT"]
        smtp_body = config["BODY"]

        # Attachments
        smtp_new = config["NEW"] + "_" + today
        smtp_mod = config["MOD"] + "_" + today
        smtp_del = config["DEL"] + "_" + today
        smtp_new_k = config["NEW_K"] + "_" + today
        smtp_mod_k = config["MOD_K"] + "_" + today

        # Attachment format: <dir><file_name>_<date>
        smtp_file_new = config["PATH_TEMP"] + smtp_new
        smtp_file_mod = config["PATH_TEMP"] + smtp_mod
        smtp_file_del = config["PATH_TEMP"] + smtp_del
        smtp_file_new_k = config["PATH_TEMP"] + smtp_new_k
        smtp_file_mod_k = config["PATH_TEMP"] + smtp_mod_k
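        # E.g. with PATH_TEMP "/tmp/" and NEW "new_posts" on 2016-07-04 (hypothetical
        # values), smtp_file_new is "/tmp/new_posts_20160704"; the extension is appended
        # later, once the export format is known.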

        try:
            logging.info("- EmailNotifier is sending...")
            server = SMTP(smtp_server, smtp_port)
            server.set_debuglevel(True)
            msg = MIMEMultipart()
            msg["From"] = smtp_from
            msg["To"] = "; ".join(smtp_to)
            emails = [smtp_to]
            if smtp_cc is not "":
                msg["CC"] = "; ".join(smtp_cc)
                emails += smtp_cc
            if smtp_bcc is not "":
                msg["BCC"] = "; ".join(smtp_bcc)
                emails += smtp_bcc

            # Added priority to subject
            priority = ""
            if coherence:
                if coherence == 1:
                    priority = "[Info] "
                elif 1 < coherence < 4:
                    priority = "[Warning] "
                else:
                    priority = "[Danger] "
            msg["Subject"] = priority + smtp_subject

            body = MIMEText(smtp_body, "html")
            msg.attach(body)
            # logging.debug(msg.as_string())
            # If dictionary of new posts exists, it creates attachment
            if "new" in result:
                try:
                    if extension == "c":
                        smtp_new += ".csv"
                        smtp_file_new += ".csv"
                        CsvConverter.save_dictionary_to_file(smtp_file_new, result["new"], "new")
                    elif extension == "x":
                        smtp_new += ".xml"
                        smtp_file_new += ".xml"
                        XmlConverter.save_dictionary_to_file(smtp_file_new, result["new"])
                    elif extension == "j":
                        smtp_new += ".json"
                        smtp_file_new += ".json"
                        JsonConverter.save_dictionary_to_file(smtp_file_new, result["new"])
                    logging.info("- Attaching file: {0}".format(str(smtp_file_new)))
                    with open(smtp_file_new, "rb") as fil:
                        part = MIMEApplication(fil.read(), "text/plain", filename=smtp_new)
                    part.add_header("Content-Disposition", "attachment", filename=smtp_new)
                    msg.attach(part)
                except OSError as e:
                    logging.error("- Error writing attachment '%s': %s" % (smtp_file_new, e))
                    raise
            # If dictionary of post changes exists, it creates attachment
            if "mod" in result:
                try:
                    if extension == "c":
                        smtp_mod += ".csv"
                        smtp_file_mod += ".csv"
                        CsvConverter.save_dictionary_to_file(smtp_file_mod, result["mod"], "mod")
                    elif extension == "x":
                        smtp_mod += ".xml"
                        smtp_file_mod += ".xml"
                        XmlConverter.save_dictionary_to_file(smtp_file_mod, result["mod"])
                    elif extension == "j":
                        smtp_mod += ".json"
                        smtp_file_mod += ".json"
                        JsonConverter.save_dictionary_to_file(smtp_file_mod, result["mod"])
                    logging.info("- Attaching file: {0}".format(str(smtp_file_mod)))
                    with open(smtp_file_mod, "rb") as fil:
                        part = MIMEApplication(fil.read(), "text/plain", filename=smtp_mod)
                    part.add_header("Content-Disposition", "attachment", filename=smtp_mod)
                    msg.attach(part)
                except OSError as e:
                    logging.error("- Error writing attachment '%s': %s" % (smtp_file_mod, e))
                    raise
            # If dictionary of deletes exists, it creates attachment
            if "del" in result:
                try:
                    if extension == "c":
                        smtp_del += ".csv"
                        smtp_file_del += ".csv"
                        CsvConverter.save_dictionary_to_file(smtp_file_del, result["del"], "del")
                    elif extension == "x":
                        smtp_del += ".xml"
                        smtp_file_del += ".xml"
                        XmlConverter.save_dictionary_to_file(smtp_file_del, result["del"])
                    elif extension == "j":
                        smtp_del += ".json"
                        smtp_file_del += ".json"
                        JsonConverter.save_dictionary_to_file(smtp_file_del, result["del"])
                    logging.info("- Attaching file: {0}".format(str(smtp_file_del)))
                    with open(smtp_file_del, "rb") as fil:
                        part = MIMEApplication(fil.read(), "text/plain", filename=smtp_del)
                    part.add_header("Content-Disposition", "attachment", filename=smtp_del)
                    msg.attach(part)
                except OSError as e:
                    logging.error("- Error writing attachment '%s': %s" % (smtp_file_del, e))
                    raise

            # ################### #
            # KEYWORDS DICTIONARY #
            # ################### #
            # If dictionary of new keywords exists, it creates attachment
            if "new" in result_k:
                if len(result_k["new"]):
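A minimal usage sketch for notify, assuming a flat config mapping that mirrors the keys read at the top of the function; every value below, and the report and report_k variables, are placeholders rather than the project's real settings:

example_config = {
    "SERVER": "smtp.example.com", "PORT": 587,
    "USERNAME": "monitor", "PASSWORD": "secret",
    "FROM": "monitor@example.com", "TO": ["analyst@example.com"],
    "CC": "", "BCC": "",
    "SUBJECT": "Blog changes detected", "BODY": "<p>See the attached reports.</p>",
    # Base names for the attachments; notify appends the current date and extension.
    "NEW": "new_posts", "MOD": "mod_posts", "DEL": "del_posts",
    "NEW_K": "new_keywords", "MOD_K": "mod_keywords",
    "PATH_TEMP": "/tmp/",
}
# report would come from check_changes(), report_k from the keyword scan;
# extension "j" selects the JSON converter, coherence drives the subject prefix.
notify(example_config, report, report_k, coherence=2, extension="j")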
Example #2
    def check_changes(self, new_dict_path, on_tor):
        logging.info('- Starting check of changes -')
        old_dict_hash = JsonConverter.open_dictionary_from_file(self._hash_path)
        new_dict_hash = JsonConverter.open_dictionary_from_file(new_dict_path)
        past_dict_hash = copy.deepcopy(old_dict_hash)

        if self._year_start == self._year_end:
            for month in old_dict_hash[str(self._year_start)]:
                if int(month) < self._month_start:
                    del past_dict_hash[str(self._year_start)][month]
                elif int(month) > self._month_end:
                    del past_dict_hash[str(self._year_start)][month]
            for year in old_dict_hash.keys():
                if int(year) < self._year_start:
                    del past_dict_hash[year]
        else:
            for year in old_dict_hash.keys():
                if int(year) < self._year_start:
                    del past_dict_hash[year]
                elif int(year) == self._year_start:
                    for month in old_dict_hash[year]:
                        if int(month) < self._month_start:
                            del past_dict_hash[year][month]
                elif int(year) == self._year_end:
                    for month in old_dict_hash[year]:
                        if int(month) > self._month_end:
                            del past_dict_hash[year][month]
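        # E.g. with year_start == year_end == 2016, month_start == 3 and month_end == 6
        # (hypothetical values), the copy drops the months of 2016 outside 3..6 and any
        # earlier years, so the comparison below only covers the monitored window.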

        # #################### #
        #    REPORT CHANGES    #
        #        OR NEWS       #
        # #################### #
        report_new = {}
        report_mod = {}
        report_mod_verbosity = {}
        for year in new_dict_hash.keys():
            report_dict_new = {}
            report_dict_mod = {}
            report_dict_mod_verbosity = {}
            if year in past_dict_hash.keys():
                for month in new_dict_hash[year]:
                    report_list_new = []
                    report_list_mod = []
                    report_list_mod_verbosity = []
                    if month in past_dict_hash[year]:
                        if past_dict_hash[year][month] != new_dict_hash[year][month]:
                            for new_post in new_dict_hash[year][month]:
                                post_change_verbosity = {}
                                post_change = {}
                                for past_post in past_dict_hash[year][month]:
                                    if new_post['url'] == past_post['url']:

                                        if new_post['body'] != past_post['body']:
                                            post_change['body'] = 'changed'
                                            if self._verbose[4]:
                                                post_change_verbosity['body'] = 'changed'
                                                if self._debug:
                                                    Intercept.report_me(year, month, new_post['plain title'], on_tor)

                                        if new_post['text'] != past_post['text']:
                                            post_change['text'] = 'changed'
                                            if self._verbose[5]:
                                                post_change_verbosity['text'] = 'changed'

                                        if new_post['title'] != past_post['title']:
                                            post_change['old title'] = past_post['plain title']
                                            post_change['plain title'] = new_post['plain title']
                                            if self._verbose[0]:
                                                post_change_verbosity['old title'] = past_post['plain title']
                                                post_change_verbosity['plain title'] = new_post['plain title']

                                        if new_post['media'] != past_post['media']:
                                            post_change['media'] = 'changed'
                                            if self._verbose[2]:
                                                post_change_verbosity['media'] = 'changed'

                                        if new_post['comments'] != past_post['comments']:
                                            post_change['comments'] = 'changed'
                                            if self._verbose[3]:
                                                post_change_verbosity['comments'] = 'changed'
                                        if new_post['img'] != past_post['img']:
                                            post_change['img'] = 'changed'
                                            if self._verbose[1]:
                                                post_change_verbosity['img'] = 'changed'

                                        if len(post_change):
                                            post_change['plain url'] = new_post['plain url']
                                            post_change['plain title'] = new_post['plain title']
                                            report_list_mod.append(post_change)

                                        if len(post_change_verbosity):
                                            post_change_verbosity['plain url'] = new_post['plain url']
                                            post_change_verbosity['plain title'] = new_post['plain title']
                                            report_list_mod_verbosity.append(post_change_verbosity)

                            if len(report_list_mod):
                                report_dict_mod[month] = report_list_mod
                            if len(report_list_mod_verbosity):
                                report_dict_mod_verbosity[month] = report_list_mod_verbosity
                            # Searching for new posts
                            for new_post in new_dict_hash[year][month]:
                                proof = True
                                for past_post in past_dict_hash[year][month]:
                                    if new_post['url'] == past_post['url']:
                                        proof = False
                                if proof:
                                    report_list_new.append(new_post)
                            if len(report_list_new):
                                report_dict_new[month] = report_list_new
                    # Whole new month
                    else:
                        logging.info('- New MONTH found -')
                        report_dict_new[month] = new_dict_hash[year][month]
                if len(report_dict_mod):
                    report_mod[year] = report_dict_mod
                if len(report_dict_mod_verbosity):
                    report_mod_verbosity[year] = report_dict_mod_verbosity
                if len(report_dict_new):
                    report_new[year] = report_dict_new
            else:
                # Whole new year
                logging.info('- New YEAR found -')
                report_new[year] = new_dict_hash[year]

        # #################### #
        #    REPORT DELETES    #
        # #################### #
        report_del = {}
        for year in past_dict_hash.keys():
            report_dict_del = {}
            if year in new_dict_hash.keys():
                for month in past_dict_hash[year]:
                    report_list_del = []
                    if month not in new_dict_hash[year]:
                        # miss one whole month
                        for past_post in past_dict_hash[year][month]:
                            post_change_del = {'title': past_post['plain title'],
                                               'url': past_post['plain url']}
                            report_list_del.append(post_change_del)
                        if len(report_list_del):
                            report_dict_del[month] = report_list_del
                    else:
                        for past_post in past_dict_hash[year][month]:
                            post_change_del = {}
                            proof = 0
                            for new_post in new_dict_hash[year][month]:
                                if past_post['url'] != new_post['url']:
                                    proof += 1
                            if proof == len(new_dict_hash[year][month]):
                                # miss one or more posts
                                post_change_del['title'] = past_post['plain title']
                                post_change_del['url'] = past_post['plain url']
                            if len(post_change_del):
                                report_list_del.append(post_change_del)
                        if len(report_list_del):
                            report_dict_del[month] = report_list_del
            else:
                # miss one whole year
                for month in past_dict_hash[year]:
                    report_list_del = []
                    for past_post in past_dict_hash[year][month]:
                        post_change_del = {'title': past_post['plain title'], 'url': past_post['plain url']}
                        report_list_del.append(post_change_del)
                    if len(report_list_del):
                        report_dict_del[month] = report_list_del
            if len(report_dict_del):
                report_del[year] = report_dict_del
        report = {}
        # report_new: new post dictionary
        # report_mod: changed post dictionary
        # report_del: deleted post dictionary
        if report_new:
            report['new'] = report_new
            logging.info('- REPORT NEWS -')
        if report_mod_verbosity:
            report['mod'] = report_mod_verbosity
            logging.info('- REPORT CHANGES -')
        if report_del:
            report['del'] = report_del
            logging.info('- REPORT DELETES -')
        logging.info('- END CHECK.....................................................OK')

        # saving changes on file
        if report_new:
            for year in report_new.keys():
                if year not in old_dict_hash.keys():
                    old_dict_hash[year] = new_dict_hash[year]
                else:
                    for month in report_new[year]:
                        old_dict_hash[year][month] = new_dict_hash[year][month]

        # All changes are saved, regardless of verbosity
        if report_mod:
            for year in report_mod.keys():
                if year not in old_dict_hash.keys():
                    old_dict_hash[year] = new_dict_hash[year]
                else:
                    for month in report_mod[year]:
                        old_dict_hash[year][month] = new_dict_hash[year][month]

        if report_del:
            for year in report_del.keys():
                if year in new_dict_hash.keys():
                    for month in report_del[year]:
                        if month in new_dict_hash[year]:
                            old_dict_hash[year][month] = new_dict_hash[year][month]
                        else:
                            del old_dict_hash[year][month]
                else:
                    del old_dict_hash[year]

        JsonConverter.save_dictionary_to_file(self._hash_path, old_dict_hash)
        logging.info('- SAVING CHANGES ...............................................OK')
        # report collects report_new, report_mod_verbosity and report_del
        # under the 'new', 'mod' and 'del' keys
        return report
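check_changes assumes that both hash files decode to a year -> month -> list-of-posts nesting, where each post dictionary carries the hashed fields compared above plus the plain variants used in the reports. A minimal sketch of that assumed layout, with placeholder values only:

assumed_dict_hash = {
    "2016": {                # years keyed as strings after the JSON round trip
        "3": [               # months keyed as strings
            {
                "url": "<hash of the post url>",
                "title": "<hash of the rendered title>",
                "body": "<hash of the post body>",
                "text": "<hash of the post text>",
                "media": "<hash of embedded media>",
                "img": "<hash of the images>",
                "comments": "<hash of the comment section>",
                "plain url": "http://example.onion/2016/03/sample-post",
                "plain title": "Sample post title",
            }
        ]
    }
}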
Example #4
    def scrape_and_save(self):

        logging.info('- IP address over tor service: ' + urllib2.urlopen('http://icanhazip.com').read())
        dict_tot = {}
        # scrape the blog and save all html pages per month
        logging.info('- Scraping and saving hash from {0} {1} to {2} {3} -'.format(str(self._month_start),
                                                                                   str(self._year_start),
                                                                                   str(self._month_end),
                                                                                   str(self._year_end)))
        if self._year_end == self._year_start:
            dict_hash = {}
            # scraping the only year in the range
            for month in range(self._month_start, self._month_end + 1):

                if month < 10:
                    Utility.request_and_save(
                        "{0}/{1}/0{2}".format(str(self._target), str(self._year_start), str(month)),
                        "{0}{1}_{2}.html".format(str(self._temp_dir), str(self._year_start), str(month)),
                        self._user_agent, self._attempts, self._pause, self._time_out)
                else:
                    Utility.request_and_save(
                        "{0}/{1}/{2}".format(str(self._target), str(self._year_start), str(month)),
                        "{0}{1}_{2}.html".format(str(self._temp_dir), str(self._year_start), str(month)),
                        self._user_agent, self._attempts, self._pause, self._time_out)

                soup = BeautifulSoup(
                    open("{0}{1}_{2}.html".format(str(self._temp_dir), str(self._year_start), str(month))))

                buffer_list = Utility.strip_post(soup, self._year_start, month, self._temp_dir,
                                                 self._user_agent, self._pause, self._attempts,
                                                 self._time_out, self._res_to_skip, self._target_prefix)
                if buffer_list:
                    dict_hash[month] = buffer_list
            if dict_hash:
                dict_tot[self._year_start] = dict_hash
        else:
            for year in range(self._year_start, self._year_end + 1):

                if year == self._year_end:
                    dict_hash = {}
                    # scraping the last year in the range
                    for month in range(1, self._month_end + 1):

                        if month < 10:
                            Utility.request_and_save(
                                "{0}/{1}/0{2}".format(str(self._target), str(year), str(month)),
                                "{0}{1}_{2}.html".format(str(self._temp_dir), str(year),
                                                         str(month)), self._user_agent, self._attempts, self._pause,
                                self._time_out)
                        else:
                            Utility.request_and_save(
                                "{0}/{1}/{2}".format(str(self._target), str(year), str(month)),
                                "{0}{1}_{2}.html".format(str(self._temp_dir), str(year),
                                                         str(month)), self._user_agent, self._attempts, self._pause,
                                self._time_out)
                        soup = BeautifulSoup(
                            open("{0}{1}_{2}.html".format(str(self._temp_dir), str(year), str(month))))

                        buffer_list = Utility.strip_post(soup, year, month, self._temp_dir, self._user_agent,
                                                         self._pause, self._attempts, self._time_out,
                                                         self._res_to_skip, self._target_prefix)
                        if buffer_list:
                            dict_hash[month] = buffer_list
                    if dict_hash:
                        dict_tot[year] = dict_hash

                elif year == self._year_start:
                    # scraping the first year in the range
                    dict_hash = {}
                    for month in range(self._month_start, 13):
                        if month < 10:
                            Utility.request_and_save(
                                "{0}/{1}/0{2}".format(str(self._target), str(year), str(month)),
                                "{0}{1}_{2}.html".format(str(self._temp_dir), str(year),
                                                         str(month)), self._user_agent, self._attempts, self._pause,
                                self._time_out)
                        else:
                            Utility.request_and_save(
                                "{0}/{1}/{2}".format(str(self._target), str(year), str(month)),
                                "{0}{1}_{2}.html".format(str(self._temp_dir), str(year),
                                                         str(month)), self._user_agent, self._attempts, self._pause,
                                self._time_out)
                        soup = BeautifulSoup(open("{0}{1}_{2}.html".format(str(self._temp_dir), str(year), str(month))))
                        buffer_list = Utility.strip_post(soup, year, month, self._temp_dir, self._user_agent,
                                                         self._pause, self._attempts, self._time_out,
                                                         self._res_to_skip, self._target_prefix)
                        if buffer_list:
                            dict_hash[month] = buffer_list
                    if dict_hash:
                        dict_tot[year] = dict_hash
                else:
                    # if it's neither the first nor the last year
                    dict_hash = {}
                    for month in range(1, 13):
                        if month < 10:
                            Utility.request_and_save(
                                "{0}/{1}/0{2}".format(str(self._target), str(year), str(month)),
                                "{0}{1}_{2}.html".format(str(self._temp_dir), str(year),
                                                         str(month)), self._user_agent, self._attempts, self._pause,
                                self._time_out)
                        else:
                            Utility.request_and_save(
                                "{0}/{1}/{2}".format(str(self._target), str(year), str(month)),
                                "{0}{1}_{2}.html".format(str(self._temp_dir), str(year),
                                                         str(month)), self._user_agent, self._attempts, self._pause,
                                self._time_out)
                        soup = BeautifulSoup(open("{0}{1}_{2}.html".format(str(self._temp_dir), str(year), str(month))))
                        buffer_list = Utility.strip_post(soup, year, month, self._temp_dir, self._user_agent,
                                                         self._pause, self._attempts, self._time_out,
                                                         self._res_to_skip, self._target_prefix)
                        if buffer_list:
                            dict_hash[month] = buffer_list
                    if dict_hash:
                        dict_tot[year] = dict_hash

        JsonConverter.save_dictionary_to_file(self._hash_path, dict_tot)
        logging.info('- Scraped and saved all hash with no errors -')
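Each year branch above repeats the same zero-padding of the month in the archive URL and the same cache-file naming. A minimal sketch of how that could be factored into one helper, assuming the Utility.request_and_save signature used above (url, destination path, user agent, attempts, pause, timeout); the helper name is hypothetical:

    def _fetch_month_page(self, year, month):
        # Zero-pad the month in the URL only; cached file names stay unpadded,
        # matching the naming used above.
        url = "{0}/{1}/{2:02d}".format(self._target, year, month)
        page = "{0}{1}_{2}.html".format(self._temp_dir, year, month)
        Utility.request_and_save(url, page, self._user_agent, self._attempts,
                                 self._pause, self._time_out)
        return page

    # Usage inside the month loops above:
    # soup = BeautifulSoup(open(self._fetch_month_page(year, month)))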