コード例 #1
0
 def __init__(self,
              ref: str,
              creation_strategy: str,
              save_dir="",
              file_limit=1000000,
              table_name="TEMP",
              write_ahead_mode=True):
     """
     Open (or create) a SQLite-backed disk buffer file and set up its table.

     :param ref: database file name, appended to the chosen directory path.
     :param creation_strategy: full CREATE TABLE SQL statement executed as-is.
     :param save_dir: target directory; falls back to the default buffer dir when empty.
     :param file_limit: stored on exclusive_access_file_limit; not used in this method.
     :param table_name: name of the working table, kept for later queries.
     :param write_ahead_mode: when True, enable WAL journaling with synchronous OFF
         (faster concurrent writes at the cost of durability on power loss).
     """
     if len(save_dir) == 0:
         default_dir = get_db_buffer_default_dir()
     else:
         default_dir = save_dir
     self._table_name = table_name
     self._write_ahead_mode = write_ahead_mode
     # NOTE(review): despite the name this appears to ensure the target path
     # exists so sqlite3.connect below does not fail — confirm FileHandler semantics.
     FileHandler.create_file_if_not_exist(default_dir)
     self.filename = default_dir + ref
     # file_exist = os.path.exists(self.filename)
     self.db = sqlite3.connect(self.filename, timeout=10)
     self.cur = self.db.cursor()
     #self.cur.execute("PRAGMA journal_mode = MEMORY")
     #if not file_exist:
     if self._write_ahead_mode:
         self.cur.execute("PRAGMA journal_mode = WAL;")
         self.cur.execute("PRAGMA synchronous = OFF;")
     self.exclusive_access_file_limit = file_limit
     # cannot ensure uniqueness of data in multithread access
     #self.cur.execute("CREATE TABLE IF NOT EXISTS TEMP (LINK TEXT, RS_CODE INTEGER, LEV INTEGER, L_TYPE INTEGER, PRIMARY KEY(LINK));")
     self.cur.execute(creation_strategy)
     self.db.commit()
コード例 #2
0
 def __init__(self, interval: int, ref: ProgressLogInterface, stop_event: Event):
     """
     Periodic progress-logging thread for a long-running operation.

     :param interval: period of logging in seconds.
     :param ref: the object whose progress is logged; supplies the log
         file name and the row limit.
     :param stop_event: event used to stop the logging loop.
     """
     threading.Thread.__init__(self)
     self._interval = interval
     self._ref = ref
     self._stop_event = stop_event
     self.begin_time = int(time.time())
     self._ref_time = self.begin_time
     self._path = get_log_dir() + "Progress/"
     # Cap the file name at 200 characters.  (Bug fix: the old branch used
     # temp[0:199], which kept only 199 characters when len > 200.)
     filename = ref.get_file_name()[:200]
     if not filename.endswith(".csv"):
         filename += ".csv"
     self._file_path = self._path + filename
     FileHandler.create_file_if_not_exist(self._file_path)
     self._limit = ref.get_limit()
     self.limit_counter = 0
コード例 #3
0
 def __init__(self, interval: int, ref: ProgressLogInterface,
              stop_event: Event):
     """
     Periodic progress-logging thread for a long-running operation.

     :param interval: period of logging in seconds.
     :param ref: the object whose progress is logged; supplies the log
         file name and the row limit.
     :param stop_event: event used to stop the logging loop.
     """
     threading.Thread.__init__(self)
     self._interval = interval
     self._ref = ref
     self._stop_event = stop_event
     self.begin_time = int(time.time())
     self._ref_time = self.begin_time
     self._path = get_log_dir() + "Progress/"
     # Cap the file name at 200 characters.  (Bug fix: the old branch used
     # temp[0:199], which kept only 199 characters when len > 200.)
     filename = ref.get_file_name()[:200]
     if not filename.endswith(".csv"):
         filename += ".csv"
     self._file_path = self._path + filename
     FileHandler.create_file_if_not_exist(self._file_path)
     self._limit = ref.get_limit()
     self.limit_counter = 0
コード例 #4
0
    def testEmailLogin(self):
        """Send a test message through Gmail SMTP using OAuth2 credentials
        cached on disk, refreshing and re-persisting the token first if it
        has expired."""
        # Send the message via local SMTP server using Oauth2.
        from Email.SMTP import SMTP
        import httplib2
        from Email.Utility.Oauth2 import CustomOAuth2Credentials
        # NOTE(review): CustomOAuth2Credentials is imported but never used;
        # OAuth2Credentials (from the enclosing module) is used below — confirm.
        me = "*****@*****.**"
        you = "*****@*****.**"

        msg = get_msg(me, you)
        http = httplib2.Http()
        # Load the cached credentials JSON from disk.
        credentials = OAuth2Credentials.from_json(
            FileHandler.read_all_from_file(credentials_local_path))
        # scopes = credentials.retrieve_scopes(http)
        # for item in scopes:
        #     print(item)
        if credentials.access_token_expired:
            # http = credentials.authorize(http)
            credentials.refresh(http)
            # Persist the refreshed token by rewriting the cache file.
            jsoned = credentials.to_json()
            FileHandler.remove_file_if_exist(credentials_local_path)
            FileHandler.append_line_to_file(credentials_local_path,
                                            str(jsoned))
        auth_str = GenerateOAuth2String(me,
                                        access_token=credentials.access_token)
        s = SMTP(**gmail_provider)
        s.set_debuglevel(debuglevel=4)
        s.ehlo()
        s.starttls()
        s.authenticate_oauth2(auth_str)
        s.sendmail(me, you, msg.as_string())
        s.quit()
コード例 #5
0
 def add_proxies(self, proxies: []):
     """Append the given proxies to the proxy CSV file.

     Each ProxyStruct entry is flattened to an (addr, port, alt_port,
     user_name, psd) tuple; items of any other type are ignored.  A None
     argument is a no-op.
     """
     if proxies is None:
         return
     rows = [(p.addr, p.port, p.alt_port, p.user_name, p.psd)
             for p in proxies if isinstance(p, ProxyStruct)]
     FileHandler.create_file_if_not_exist(self._file_path)
     CsvLogger.log_to_file_path(self._file_path, rows)
コード例 #6
0
 def testGmailAuthStep2(self):
     """Exchange a one-time OAuth2 authorization code for tokens and
     cache the resulting credentials JSON on disk."""
     # One-time authorization code obtained from the step-1 consent URL.
     code = "4/zZRbhzmhulAsl6pasBMqmuOv5PCsdRuITTxyAWLkJOI#"
     credentials = flow.step2_exchange(code)
     access_token = credentials.access_token
     refresh_token = credentials.refresh_token
     print("access_token:", access_token, " refresh_token:", refresh_token)
     jsoned = credentials.to_json()
     # Rewrite the local credentials cache with the fresh tokens.
     FileHandler.remove_file_if_exist(credentials_local_path)
     FileHandler.append_line_to_file(credentials_local_path, str(jsoned))
     print(jsoned)
コード例 #7
0
 def run(self):
     """Thread body: write the CSV header row, then poll once per second
     and report progress every self._interval seconds until the stop
     event is set or the row limit is reached."""
     FileHandler.create_file_if_not_exist(self._file_path)
     cols = ["Index", "Time/Min"] + self._ref.get_column_names()
     self._append(cols)
     while not self._stop_event.is_set() and self.limit_counter < self._limit:
         current_time = int(time.time())
         # Only report when a full interval has elapsed since the last report.
         gap = current_time - self._ref_time
         if gap >= self._interval:
             self._ref_time = current_time
             self.report_progress()
         time.sleep(1)
コード例 #8
0
 def __init__(self, file_dir: str = "", file_name="UserAccounts.db"):
     """
     Open (or create) the user-accounts SQLite database.

     :param file_dir: directory of the db file; defaults to the temp db dir when empty.
     :param file_name: database file name.
     """
     if len(file_dir) == 0:
         file_dir = get_temp_db_dir()
     FileHandler.create_file_if_not_exist(file_dir)
     self._file_name = file_name
     file_path = file_dir + self._file_name
     self.db = sqlite3.connect(file_path)
     self.cur = self.db.cursor()
     # Schema: account type, login, password, site link, API access id/key, proxy.
     self.cur.execute(
         "CREATE TABLE IF NOT EXISTS ACCOUNTS(TYPE INTEGER, USER_ID TEXT, PSD TEXT,"
         " LINK TEXT,ACCESS_ID TEXT, API_KEY TEXT, PROXY TEXT);")
     self.db.commit()
コード例 #9
0
 def run(self):
     """Thread body: write the CSV header row, then poll once per second
     and report progress every self._interval seconds until the stop
     event is set or the row limit is reached."""
     FileHandler.create_file_if_not_exist(self._file_path)
     cols = ["Index", "Time/Min"] + self._ref.get_column_names()
     self._append(cols)
     while not self._stop_event.is_set(
     ) and self.limit_counter < self._limit:
         current_time = int(time.time())
         # Only report when a full interval has elapsed since the last report.
         gap = current_time - self._ref_time
         if gap >= self._interval:
             self._ref_time = current_time
             self.report_progress()
         time.sleep(1)
コード例 #10
0
    def __init__(self,
                 file_name,
                 worker: ExternalTempInterface,
                 stop_event: Event,
                 buf_size=200,
                 output_f=1000,
                 dir_path="",
                 table_name="temp",
                 convert_input=True,
                 convert_output=True,
                 terminate_callback=None):
        """
        Disk-backed temp-data buffer shared between a producer and a consumer.

        :param file_name: name of the backing database file.
        :param worker: ExternalTempInterface that supplies/consumes the data.
        :param stop_event: event used to signal shutdown.
        :param buf_size: in-memory buffer size handed to FileBuffInterface.
        :param output_f: output frequency handed to FileBuffInterface.
        :param dir_path: directory of the db file; temp db dir when empty.
        :param table_name: table name used inside the database.
        :param convert_input: whether input rows are converted before storage.
        :param convert_output: convert output to OnSiteLink by default, else return raw tuple data.
        :param terminate_callback: callable forwarded to FileBuffInterface.
        :return:
        """
        self._file_name = file_name
        if len(dir_path) > 0:
            self._file_dir = dir_path
        else:
            self._file_dir = get_temp_db_dir()
        self._file_path = self._file_dir + self._file_name
        PrintLogger.print("ExternalTempDataDiskBuffer create path in init: " +
                          self._file_path)
        FileHandler.create_file_if_not_exist(self._file_path)
        self.stop_event = stop_event
        self._tab = table_name
        self._worker = worker
        # Separate re-entrant locks so reads and writes do not block each other.
        self._get_lock = threading.RLock()
        self._put_lock = threading.RLock()
        self._convert_input = convert_input
        self._convert_output = convert_output
        FileBuffInterface.__init__(self,
                                   self._file_name,
                                   buf_size,
                                   output_f=output_f,
                                   power_save_mode=True,
                                   terminate_callback=terminate_callback)
        self.set_db_update_interval(10)

        self._is_reading = Event()
        self._need_to_vaccum = Event()
        self._total_record = self.count_all()
コード例 #11
0
 def log_to_file_path(file_path: str, rows: [()]):
     """
     Append rows to a CSV file, creating it (and adding a .csv extension)
     when needed.  Errors are logged via ErrorLogger, never raised.

     :param file_path: target path; ".csv" is appended when missing.
     :param rows: sequence of row tuples; nothing is written when empty.
     """
     if len(rows) > 0:
         try:
             path = file_path
             if not path.endswith(".csv"):
                 path += ".csv"
             FileHandler.create_file_if_not_exist(path)
             # newline="" keeps the csv module from doubling line endings.
             with open(path, mode="a", newline="") as csv_file:
                 wr = csv.writer(csv_file, delimiter=",")
                 for row in rows:
                     wr.writerow(row)
                 # Fix: redundant csv_file.close() removed — the with-block
                 # already closes the file on exit.
         except Exception as ex:
             ErrorLogger.log_error("CsvLogger", ex, "log_to_file_path()")
コード例 #12
0
 def log_to_file_path(file_path: str, rows: [()]):
     """
     Append rows to a CSV file, creating it (and adding a .csv extension)
     when needed.  Errors are logged via ErrorLogger, never raised.

     :param file_path: target path; ".csv" is appended when missing.
     :param rows: sequence of row tuples; nothing is written when empty.
     """
     if len(rows) > 0:
         try:
             path = file_path
             if not path.endswith(".csv"):
                 path += ".csv"
             FileHandler.create_file_if_not_exist(path)
             # newline='' keeps the csv module from doubling line endings.
             with open(path, mode='a', newline='') as csv_file:
                 wr = csv.writer(csv_file, delimiter=',')
                 for row in rows:
                     wr.writerow(row)
                 # Fix: redundant csv_file.close() removed — the with-block
                 # already closes the file on exit.
         except Exception as ex:
             ErrorLogger.log_error("CsvLogger", ex, "log_to_file_path()")
コード例 #13
0
def get_msg(me, you):
    """Build the promotional multipart (plain + HTML) email message.

    :param me: sender email address.
    :param you: recipient email address.
    :return: a MIMEMultipart message whose HTML part is loaded from disk.
    """
    html_file_path = "D:/Test/email_content_saved.txt"
    text_file_path = "D:/Test/email_text.txt"

    # multipart/alternative lets clients pick the richest part they support.
    message = MIMEMultipart('alternative')
    message['Subject'] = "100+ HIGH TF/CF/DA EXPIRED DOMAINS TO BUY ONLY $10 EACH"
    message['From'] = me
    message['To'] = you

    # Plain-text fallback is intentionally empty; the HTML body comes from disk.
    plain_body = ""
    html_body = FileHandler.read_all_from_file(html_file_path, 't')

    # Per RFC 2046 the last attached part is the preferred representation,
    # so the HTML part goes last.
    message.attach(MIMEText(plain_body, 'plain'))
    message.attach(MIMEText(html_body, 'html'))
    return message
コード例 #14
0
 def testScrapePageBatch(self):
     """For each archived link listed in the input file, crawl the
     archive.org copy and append its download stats to a CSV."""
     save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive_downloaded.csv"
     file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_links.txt"
     # Write the CSV header row first.
     CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
     domains_links = FileHandler.read_lines_from_file(file_path)
     for link in domains_links:
         # link = "http://web.archive.org/web/20140711025724/http://susodigital.com/"
         #link ="http://web.archive.org/web/20130415001342/http://www.bbc.co.uk/"
         stop_event = multiprocessing.Event()
         inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(
             link)
         root_domain = LinkChecker.get_root_domain(domain)[1]
         path = "/index.html"
         # NOTE(review): link_s is built but never used below — confirm
         # whether ArchiveExplorer was meant to receive it.
         link_s = LinkAttrs(link=link,
                            path=path,
                            ref_link="/",
                            shadow_ref_link="/",
                            source=path,
                            res_type=LinkUtility.EXT_WEBPAGE,
                            level=0)
         explorer = ArchiveExplorer(
             original_domain=root_domain,
             link=link,
             external_stop_event=stop_event,
             download_base_dir=FilePath.get_default_archive_dir(),
             max_thread=10,
             max_level=2)
         explorer.run()
         archive_detail = explorer.get_archive_detail()
         CsvLogger.log_to_file_path(save_path, [archive_detail.to_tuple()])
コード例 #15
0
 def testGetBlogs(self):
     """Crawl blog links for one niche: load cached keyword suggestions,
     merge them with the seed keywords, and run a single-threaded link
     crawl for each configured country."""
     niche = "Society/Law"
     proxy_site = BuyProxyOrg(buy_proxy_org_account)
     proxies = proxy_site.get_proxies(timeout=5)
     keyword_log_path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/KeywordSuggestions/"+niche.replace('/', '-')+".txt"
     # countries = GoogleUtility.CountryCodeEnglish
     countries = ["uk", ]
     min_delay = 2
     max_delay = 5
     max_page = 2
     days_ago = 4*365
     # Seed keywords for the niche; cached suggestions are merged in below.
     target_keywords_init = ["legal case", "Labour law", "human rights law", "crime law", "Immigration law",
                             "Family law", "Transactional law", "Company law", "Commercial law", "Admiralty law",
                             "Intellectual property law", "international law", "tax law", "banking law", "competition law",
                             "consumer law", "environmental law"]
     suggested_keywords = []
     for country in countries:
         # temp_keywords = self.testGetSuggestionBatch(target_keywords_init, proxies=proxies,
         #                                                   country_code=country,
         #                                                   min_delay=min_delay, max_delay=max_delay)
         # Deduplicate the previously logged suggestions for this niche.
         temp_keywords = list(set(FileHandler.read_lines_from_file(keyword_log_path)))
         # FileHandler.append_lines_to_file(keyword_log_path, temp_keywords, option="at")
         # suggested_keywords += temp_keywords
         crawl_keywords = [x for x in list(set(target_keywords_init + temp_keywords))]
         self.testGetLinksBatch_single_t(niche, keywords=crawl_keywords, page_count=max_page, index=0, length=100,
                                         country_code=country, source_type=GoogleConst.SourceTypeBlog,
                                         min_delay=min_delay, max_delay=max_delay, days_ago=days_ago,
                                         proxies=proxies, use_browser=False)
コード例 #16
0
 def testRe(self):
     """Search example CSS text for the archive link pattern and print the
     captured groups."""
     css_text = FileHandler.read_all_from_file("/Users/superCat/Desktop/PycharmProjectPortable/test/example.css")
     test_s = "if('undefined' === typeof wwhomepage) var wwhomepage = {}; wwhomepage.customPromoHeaders = {\" /web/20130415001342/http://www.bbc.co.uk\/news\/magazine-22094279\":"
     match = re.search(link_pattern, css_text)
     # Bug fix: match.group() was called before the None check, raising
     # AttributeError whenever the pattern did not match.
     if match is not None:
         groups = match.group()
         for i in match.groups(0):
             print(i)
コード例 #17
0
 def testRe(self):
     """Search example CSS text for the archive link pattern and print the
     captured groups."""
     css_text = FileHandler.read_all_from_file(
         "/Users/superCat/Desktop/PycharmProjectPortable/test/example.css")
     test_s = "if('undefined' === typeof wwhomepage) var wwhomepage = {}; wwhomepage.customPromoHeaders = {\" /web/20130415001342/http://www.bbc.co.uk\/news\/magazine-22094279\":"
     match = re.search(link_pattern, css_text)
     # Bug fix: match.group() was called before the None check, raising
     # AttributeError whenever the pattern did not match.
     if match is not None:
         groups = match.group()
         for i in match.groups(0):
             print(i)
コード例 #18
0
 def log_error(ref: str, error: Exception, addtional: str = ""):
     """
     Append one error record (ref, counter + UTC timestamp, error text,
     optional details) as a CSV row to the shared error log.  Logging is
     best-effort: any failure here is swallowed so it never masks the
     original error.

     :param ref: name of the component reporting the error.
     :param error: the exception being logged.
     :param addtional: optional extra context appended to the row.
     """
     path = get_log_dir() + ErrorLogger.FILE_NAME
     try:
         FileHandler.create_file_if_not_exist(path)
         lines = []
         lines.append(ref)
         lines.append("{0:d} {1:s}".format(ErrorLogger.Counter, str(datetime.datetime.now(tz=pytz.utc))))
         lines.append(str(error))
         if len(addtional) > 0:
             lines.append(addtional)
         with open(path, mode="a", newline="") as csv_file:
             wr = csv.writer(csv_file, delimiter=",")
             wr.writerow(lines)
             # Fix: redundant csv_file.close() removed — the with-block
             # already closes the file on exit.
         # lines.append("")
         # FileHandler.append_lines_to_file(path, lines)
         ErrorLogger.Counter += 1
     except Exception:
         # Deliberate best-effort swallow, narrowed from a bare except so
         # SystemExit/KeyboardInterrupt are no longer caught.
         pass
コード例 #19
0
 def testCss2Parse(self):
     """Substitute every link-pattern match in the example CSS through
     parse_str, collecting captured items into groups, then print the
     LinkAttrs results."""
     css_text = FileHandler.read_all_from_file("/Users/superCat/Desktop/PycharmProjectPortable/test/example.css")
     groups = []
     # parse_str(groups, 1, match): partially applied so re.sub accumulates
     # each capture into the shared groups list.
     parse_str_sp = functools.partial(parse_str, groups, 1)
     temp = re.sub(link_pattern, parse_str_sp, css_text)
     # for item in groups:
     #     print(item)
     print("captured total: ", len(groups))
     for item in groups:
         if isinstance(item, LinkAttrs):
             print("res:", item.path,  "link:", item.link)
コード例 #20
0
 def log_error(ref: str, error: Exception, addtional: str = ""):
     """
     Append one error record (ref, counter + UTC timestamp, error text,
     optional details) as a CSV row to the shared error log.  Logging is
     best-effort: any failure here is swallowed so it never masks the
     original error.

     :param ref: name of the component reporting the error.
     :param error: the exception being logged.
     :param addtional: optional extra context appended to the row.
     """
     path = get_log_dir() + ErrorLogger.FILE_NAME
     try:
         FileHandler.create_file_if_not_exist(path)
         lines = []
         lines.append(ref)
         lines.append("{0:d} {1:s}".format(
             ErrorLogger.Counter, str(datetime.datetime.now(tz=pytz.utc))))
         lines.append(str(error))
         if len(addtional) > 0:
             lines.append(addtional)
         with open(path, mode='a', newline='') as csv_file:
             wr = csv.writer(csv_file, delimiter=',')
             wr.writerow(lines)
             # Fix: redundant csv_file.close() removed — the with-block
             # already closes the file on exit.
         # lines.append("")
         # FileHandler.append_lines_to_file(path, lines)
         ErrorLogger.Counter += 1
     except Exception:
         # Deliberate best-effort swallow, narrowed from a bare except so
         # SystemExit/KeyboardInterrupt are no longer caught.
         pass
コード例 #21
0
 def testCss2Parse(self):
     """Substitute every link-pattern match in the example CSS through
     parse_str, collecting captured items into groups, then print the
     LinkAttrs results."""
     css_text = FileHandler.read_all_from_file(
         "/Users/superCat/Desktop/PycharmProjectPortable/test/example.css")
     groups = []
     # parse_str(groups, 1, match): partially applied so re.sub accumulates
     # each capture into the shared groups list.
     parse_str_sp = functools.partial(parse_str, groups, 1)
     temp = re.sub(link_pattern, parse_str_sp, css_text)
     # for item in groups:
     #     print(item)
     print("captured total: ", len(groups))
     for item in groups:
         if isinstance(item, LinkAttrs):
             print("res:", item.path, "link:", item.link)
コード例 #22
0
 def testGetBestProfileBatch(self):
     """For each domain in the source file, find its best archive.org
     snapshot and append the result to a CSV; failures are printed and
     skipped so one bad domain does not stop the batch."""
     file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_source.txt"
     domains = FileHandler.read_lines_from_file(file_path)
     save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive.csv"
     # Write the CSV header row first.
     CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
     for domain in domains:
         print("begin domain:", domain)
         try:
             archive = ArchiveOrg.get_best_archive(root_domain=domain, thread_size=100, profile_check=10, pass_threshold=0.9, res_limit=2000)
             CsvLogger.log_to_file_path(save_path, [archive.to_tuple()])
         except Exception as ex:
             print(ex)
コード例 #23
0
 def __init__(self, ref: str, creation_strategy: str, save_dir="", file_limit=1000000,
              table_name="TEMP", write_ahead_mode=True):
     """
     Open (or create) a SQLite-backed disk buffer file and set up its table.

     :param ref: database file name, appended to the chosen directory path.
     :param creation_strategy: full CREATE TABLE SQL statement executed as-is.
     :param save_dir: target directory; falls back to the default buffer dir when empty.
     :param file_limit: stored on exclusive_access_file_limit; not used in this method.
     :param table_name: name of the working table, kept for later queries.
     :param write_ahead_mode: when True, enable WAL journaling with synchronous OFF
         (faster concurrent writes at the cost of durability on power loss).
     """
     if len(save_dir) == 0:
         default_dir = get_db_buffer_default_dir()
     else:
         default_dir = save_dir
     self._table_name = table_name
     self._write_ahead_mode = write_ahead_mode
     # NOTE(review): despite the name this appears to ensure the target path
     # exists so sqlite3.connect below does not fail — confirm FileHandler semantics.
     FileHandler.create_file_if_not_exist(default_dir)
     self.filename = default_dir + ref
     # file_exist = os.path.exists(self.filename)
     self.db = sqlite3.connect(self.filename, timeout=10)
     self.cur = self.db.cursor()
     #self.cur.execute("PRAGMA journal_mode = MEMORY")
     #if not file_exist:
     if self._write_ahead_mode:
         self.cur.execute("PRAGMA journal_mode = WAL;")
         self.cur.execute("PRAGMA synchronous = OFF;")
     self.exclusive_access_file_limit = file_limit
     # cannot ensure uniqueness of data in multithread access
     #self.cur.execute("CREATE TABLE IF NOT EXISTS TEMP (LINK TEXT, RS_CODE INTEGER, LEV INTEGER, L_TYPE INTEGER, PRIMARY KEY(LINK));")
     self.cur.execute(creation_strategy)
     self.db.commit()
コード例 #24
0
    def testCssParse(self):
        """Split the example CSS into rule blocks on '}', substitute link
        matches per block through parse_str, then print the captures and
        the reassembled text."""
        css_text = FileHandler.read_all_from_file("/Users/superCat/Desktop/PycharmProjectPortable/test/example.css")

        section = css_text.split("}")
        groups = []
        # parse_str(groups, 1, match): partially applied so re.sub accumulates
        # each capture into the shared groups list.
        parse_str_sp = functools.partial(parse_str, groups, 1)
        result = ""
        for sec in section:
            # Restore the '}' removed by split so each section is a complete rule.
            sec += "}"
            temp = re.sub(css_link_pattern, parse_str_sp, sec)
            result += temp
        for item in groups:
            print(item)

        print(result)
コード例 #25
0
    def __init__(self, file_name,  worker: ExternalTempInterface, stop_event: Event, buf_size=200, output_f=1000,
                 dir_path="",  table_name="temp", convert_input=True, convert_output=True, terminate_callback=None):
        """
        Disk-backed temp-data buffer shared between a producer and a consumer.

        :param file_name: name of the backing database file.
        :param worker: ExternalTempInterface that supplies/consumes the data.
        :param stop_event: event used to signal shutdown.
        :param buf_size: in-memory buffer size handed to FileBuffInterface.
        :param output_f: output frequency handed to FileBuffInterface.
        :param dir_path: directory of the db file; temp db dir when empty.
        :param table_name: table name used inside the database.
        :param convert_input: whether input rows are converted before storage.
        :param convert_output: convert output to OnSiteLink by default, else return raw tuple data.
        :param terminate_callback: callable forwarded to FileBuffInterface.
        :return:
        """
        self._file_name = file_name
        if len(dir_path) > 0:
            self._file_dir = dir_path
        else:
            self._file_dir = get_temp_db_dir()
        self._file_path = self._file_dir + self._file_name
        PrintLogger.print("ExternalTempDataDiskBuffer create path in init: " + self._file_path)
        FileHandler.create_file_if_not_exist(self._file_path)
        self.stop_event = stop_event
        self._tab = table_name
        self._worker = worker
        # Separate re-entrant locks so reads and writes do not block each other.
        self._get_lock = threading.RLock()
        self._put_lock = threading.RLock()
        self._convert_input = convert_input
        self._convert_output = convert_output
        FileBuffInterface.__init__(self, self._file_name, buf_size, output_f=output_f, power_save_mode=True,
                                   terminate_callback=terminate_callback)
        self.set_db_update_interval(10)

        self._is_reading = Event()
        self._need_to_vaccum = Event()
        self._total_record = self.count_all()
コード例 #26
0
 def _write_to_power_save_db(self) -> bool:
     """
     Persist the current power-save state to the recovery SQLite file.

     The state object must be Serializable; its JSON form is upserted
     under the single key "state".

     :return: True when the state was written; False when the state is
         not Serializable or a database error occurred.
     """
     data = self.get_state_for_power_save_mode()
     if not isinstance(data, Serializable):
         return False
     FileHandler.create_file_if_not_exist(self._recovery_file_path)
     try:
         db = sqlite3.connect(self._recovery_file_path)
         try:
             cur = db.cursor()
             cur.execute(
                 "CREATE TABLE IF NOT EXISTS STATE_TAB(STATE TEXT UNIQUE, STATE_V TEXT);"
             )
             data_converted = data.get_serializable_json()
             # UNIQUE on STATE makes this an upsert of the single "state" row.
             cur.execute(
                 "INSERT OR REPLACE INTO STATE_TAB (STATE, STATE_V) VALUES ( ?, ?);",
                 ("state", data_converted))
             db.commit()
             return True
         finally:
             # Fix: the connection used to leak when an exception fired
             # between connect() and the old close() call.
             db.close()
     except Exception as ex:
         ErrorLogger.log_error(
             "FileBuffInterface", ex,
             "_write_to_power_save_db() " + self._recovery_file_path)
         return False
コード例 #27
0
    def testGetkeywordsRecursive(self, niche="Society/Law", level=1, keyword_init=None,
                                 proxies=None, country_code="us", min_delay=2, max_delay=5, offset=120):
        """
        Recursively expand a seed keyword list via search suggestions,
        appending each round to the niche's keyword log file and finally
        rewriting the log with the full keyword pool.

        :param niche: category path used to name the keyword log file.
        :param keyword_init: seed keywords; when None or empty the seeds
            are loaded from the log file starting at offset.  (Bug fix:
            was a mutable default argument [] that was also mutated via
            aliasing below.)
        :param level: number of suggestion rounds to run.
        :param proxies: proxy list; fetched from BuyProxyOrg when None.
        :param country_code: google country code for suggestion requests.
        :param min_delay: minimum delay between requests, in seconds.
        :param max_delay: maximum delay between requests, in seconds.
        :param offset: skip this many logged keywords when seeding from file.
        """
        keyword_log_path = "/Users/superCat/Desktop/PycharmProjectPortable/Seeds/KeywordSuggestions/"+niche.replace('/', '-')+".txt"

        def save_callback(keywords: list):
            FileHandler.append_lines_to_file(keyword_log_path, keywords, option="at")

        if keyword_init is None:
            keyword_init = []
        if len(keyword_init) == 0:
            keyword_init = list(set(FileHandler.read_lines_from_file(keyword_log_path)))[offset:]
            for item in keyword_init:
                print(item)
            print("total keywords:", len(keyword_init))
        if proxies is None:
            proxy_site = BuyProxyOrg(buy_proxy_org_account)
            proxies = proxy_site.get_proxies(timeout=5)
        current_level = 0
        # Copy so the caller's list is not mutated by the += below.
        keywords_pool = list(keyword_init)
        while current_level < level:
            keyword_init = self.testGetSuggestionBatch(keyword_init, proxies=proxies, country_code=country_code,
                                                       min_delay=min_delay, max_delay=max_delay, callback=save_callback)
            keywords_pool += keyword_init
            current_level += 1
        FileHandler.remove_file_if_exist(keyword_log_path)
        FileHandler.append_lines_to_file(keyword_log_path, keywords_pool, option="t")
コード例 #28
0
 def testGetBestProfileBatch(self):
     """For each domain in the source file, find its best archive.org
     snapshot and append the result to a CSV; failures are printed and
     skipped so one bad domain does not stop the batch."""
     file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_source.txt"
     domains = FileHandler.read_lines_from_file(file_path)
     save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive.csv"
     # Write the CSV header row first.
     CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
     for domain in domains:
         print("begin domain:", domain)
         try:
             archive = ArchiveOrg.get_best_archive(root_domain=domain,
                                                   thread_size=100,
                                                   profile_check=10,
                                                   pass_threshold=0.9,
                                                   res_limit=2000)
             CsvLogger.log_to_file_path(save_path, [archive.to_tuple()])
         except Exception as ex:
             print(ex)
コード例 #29
0
    def testCssParse(self):
        """Split the example CSS into rule blocks on '}', substitute link
        matches per block through parse_str, then print the captures and
        the reassembled text."""
        css_text = FileHandler.read_all_from_file(
            "/Users/superCat/Desktop/PycharmProjectPortable/test/example.css")

        section = css_text.split("}")
        groups = []
        # parse_str(groups, 1, match): partially applied so re.sub accumulates
        # each capture into the shared groups list.
        parse_str_sp = functools.partial(parse_str, groups, 1)
        result = ""
        for sec in section:
            # Restore the '}' removed by split so each section is a complete rule.
            sec += "}"
            temp = re.sub(css_link_pattern, parse_str_sp, sec)
            result += temp
        for item in groups:
            print(item)

        print(result)
コード例 #30
0
    def testMsgGen(self):
        """Build an HTML email body from a template plus a CSV data table:
        wrap the before/after text lines in styled <p> tags, render the CSV
        rows as a styled HTML table, fill the template, and save the result
        to disk.

        :return: the generated email content string.
        """
        email_template_path = "D:/Test/email_content_template.txt"
        email_content_save_path = "D:/Test/email_content_saved.txt"
        email_lines_before_table_path = "D:/Test/email_text_before_table.txt"
        email_lines_after_table_path = "D:/Test/email_text_after_table.txt"
        data_file_path = "D:/Test/data_sample.csv"
        # th for head cell, td for data cell
        email_template = FileHandler.read_all_from_file(email_template_path)
        # Inline CSS because most mail clients ignore <style> blocks.
        cell_item_template = '<{0:s} style="-webkit-box-sizing: border-box;-moz-box-sizing: border-box;box-sizing: ' \
                             'border-box;padding: 8px;text-align: left;line-height: 1.42857143;vertical-align: ' \
                             'bottom;border-top: 1px solid #ddd;border-bottom: 2px solid #ddd;border: 1px solid ' \
                             '#ddd!important;border-bottom-width: 2px;background-color: #fff!important;">' \
                             '{1:s}</{0:s}>'
        row_item_template = '<tr style="-webkit-box-sizing: border-box;-moz-box-sizing: border-box;box-sizing:' \
                            ' border-box;page-break-inside: avoid;">{0:s}</tr>'
        line_format = '<p style="-webkit-box-sizing: border-box;-moz-box-sizing: border-box;box-sizing: ' \
                      'border-box;orphans: 3;widows: 3;margin: 0 0 10px;">{0:s}</p><br>'
        before_table_lines = FileHandler.read_lines_from_file(
            email_lines_before_table_path, remove_blank_line=False)
        after_table_lines = FileHandler.read_lines_from_file(
            email_lines_after_table_path, remove_blank_line=False)

        # Wrap each text line in a styled paragraph.
        before_table_str = "".join(
            [line_format.format(x, ) for x in before_table_lines])
        after_table_str = "".join(
            [line_format.format(x, ) for x in after_table_lines])
        table_cells_str = ""
        with open(data_file_path, mode='r', newline='') as csv_file:
            reader = csv.reader(csv_file, delimiter=',')
            # First CSV row becomes the <th> header row; the rest become <td> rows.
            header = next(reader)
            header_row_str = row_item_template.format("".join(
                [cell_item_template.format(
                    "th",
                    x,
                ) for x in header]))
            for row in reader:
                table_cells_str += row_item_template.format("".join(
                    [cell_item_template.format(
                        "td",
                        x,
                    ) for x in row]))

        email_content = email_template.format(before_table_str, 50,
                                              header_row_str, table_cells_str,
                                              after_table_str)
        # Overwrite any previously generated content.
        FileHandler.remove_file_if_exist(email_content_save_path)
        FileHandler.append_line_to_file(email_content_save_path, email_content)
        return email_content
コード例 #31
0
 def testScrapePageBatch(self):
     """For each archived link listed in the input file, crawl the
     archive.org copy and append its download stats to a CSV."""
     save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive_downloaded.csv"
     file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_links.txt"
     # Write the CSV header row first.
     CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
     domains_links = FileHandler.read_lines_from_file(file_path)
     for link in domains_links:
         # link = "http://web.archive.org/web/20140711025724/http://susodigital.com/"
         #link ="http://web.archive.org/web/20130415001342/http://www.bbc.co.uk/"
         stop_event = multiprocessing.Event()
         inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(link)
         root_domain = LinkChecker.get_root_domain(domain)[1]
         path = "/index.html"
         # NOTE(review): link_s is built but never used below — confirm
         # whether ArchiveExplorer was meant to receive it.
         link_s = LinkAttrs(link=link, path=path, ref_link="/", shadow_ref_link="/", source=path, res_type=LinkUtility.EXT_WEBPAGE, level=0)
         explorer = ArchiveExplorer(original_domain=root_domain, link=link,
                                    external_stop_event=stop_event,
                                    download_base_dir=FilePath.get_default_archive_dir(), max_thread=10, max_level=2)
         explorer.run()
         archive_detail = explorer.get_archive_detail()
         CsvLogger.log_to_file_path(save_path, [archive_detail.to_tuple()])
コード例 #32
0
 def delete_proxy_file(self):
     """Delete the proxy CSV file backing this object, if it exists."""
     FileHandler.remove_file_if_exist(self._file_path)
コード例 #33
0
 def remove_power_save_db(self):
     """Delete the power-save recovery database file, if it exists."""
     FileHandler.remove_file_if_exist(self._recovery_file_path)
コード例 #34
0
 def save_callback(keywords: list):
     """Append newly suggested keywords to the keyword log file
     (option "at" — append, text mode — per FileHandler's convention)."""
     FileHandler.append_lines_to_file(keyword_log_path, keywords, option="at")