class ParserFollow(BaseLogger):

    def __init__(self, log_level=None):
        BaseLogger.__init__(self, self.__class__.__name__, log_level)
        self._db_conn = DatabaseAccessor()
        self._log_info("follow parser start @%s", node())

    def process(self):
        status = False
        job = self._db_conn.queue_page_take_follow()
        if job != None:
            url = job['url']
            text = job.get('text', "")
            links = self._parse_user_links(url, text)
            self._log_info("parse follow page: %s, link count: %d", url, len(links))
            if links:
                for link in links:
                    if not self._db_conn.queue_crawl_create(link):
                        self._log_warning("fail to add %s as 'new' job in queue_crawl", link)
            if not self._db_conn.queue_page_done_follow(url):
                self._log_warning("fail to mark %s as 'done_follow' in queue_page", url)
            status = True
        else:
            self._log_warning("grab no follow pages to parse")
            sleep(config_idle_sleep)
        return status

    def _parse_user_links(self, url, text):
        links = []
        soup = BeautifulSoup(text)
        for tag in soup.find_all(class_="follow-list-item"):
            if tag.find_all("a"):
                links.append(tag.find("a").get('href'))
        pagination = soup.find(class_="pagination")
        if pagination:
            for tag in pagination.find_all("a"):
                if "Next" == tag.text:
                    links.append(tag.get('href'))
        return self._purge_data_list(links, config_parse_domain)

    def _purge_data_list(self, data, prefix=None):
        purged = []
        for item in data:
            if item != None and len(item.strip()) > 0:
                if prefix == None or prefix in item:
                    purged.append(item.strip())
                else:
                    purged.append(prefix + item.strip())
        return purged

    def close(self):
        self._db_conn.close()
        self._log_info("follow parser exit")
        self._close_logger()
class Exporter(BaseLogger):

    def __init__(self, log_level=None):
        BaseLogger.__init__(self, self.__class__.__name__, log_level)
        self._db_conn = DatabaseAccessor()
        self._log_info("exporter start @%s", node())

    def process(self):
        filelist = []
        data = self._db_conn.profile_read()
        self._log_info("load all profiles data from database")
        filelist.append(self._save_as_json(data))
        filelist.append(self._save_as_csv(data))
        data = self._db_conn.profile_read('email')
        self._log_info("load profiles data with email from database")
        filelist.append(self._save_as_json(data, "profile_email.json"))
        filelist.append(self._save_as_csv(data, "profile_email.csv"))
        self._archive_into_zipfile(filelist)

    def _save_as_json(self, data, filename="profile.json"):
        with open(filename, 'w') as jsonfile:
            for item in data:
                dump(item, jsonfile, sort_keys=True)
                jsonfile.write("\n")
        self._log_info("save %d items as json file: %s", len(data), filename)
        return filename

    def _save_as_csv(self, data, filename="profile.csv"):
        fields = set()
        for item in data:
            fields = fields.union(set(item.keys()))
        with open(filename, 'w', encoding='utf8', newline='') as csvfile:
            writer = DictWriter(csvfile,
                                extrasaction='ignore',
                                dialect='excel',
                                fieldnames=sorted(fields, reverse=True))
            writer.writeheader()
            for item in data:
                writer.writerow(item)
        self._log_info("save %d items as csv file: %s", len(data), filename)
        return filename

    def _archive_into_zipfile(self, filelist):
        zipname = "profile_{}.zip".format(strftime("%Y-%m-%d_%H-%M-%S"))
        with ZipFile(zipname, 'w', ZIP_DEFLATED) as zip:
            for filename in filelist:
                zip.write(filename)
                remove(filename)
        self._log_info("archive exported files into %s", zipname)

    def close(self):
        self._db_conn.close()
        self._log_info("exporter exit")
        self._close_logger()
def add_urls_to_queue_crawl(self, urls):
    print("add_urls_to_queue_crawl with config_crawl_date_max =", config_crawl_date_max)
    with closing(DatabaseAccessor()) as dal:
        for url_prefix in urls:
            url = url_prefix + str(config_crawl_date_max)
            print("add {} - {}".format(url, dal.queue_crawl_create(url)))
class DatabaseBuilder():

    def __init__(self, host=constant.host, port=constant.port):
        self.c = pymongo.MongoClient(host=host, port=port)
        self.db_accessor = DatabaseAccessor()

    def update_stock_hq(self, code, update_accessor_name='all',
                        start_date=None, end_date=None, retry_times=3):
        if not end_date:
            end_date = date.today()
        accessor_map = constant.external_accessor_map
        checking_df = {}
        earliest_check_point = date.today()
        for accessor_name, accessor in accessor_map.items():
            if update_accessor_name == 'all' or update_accessor_name == accessor_name:
                i_start, i_end, e_start, e_end = utility.get_dates(
                    code, accessor_name, start_date, end_date)
                check_point = self.db_accessor.get_checkpoints(
                    code=code, accessor_name=accessor_name)
                if e_start:
                    df = accessor.get_hq(code, start_date=e_start,
                                         end_date=e_end, retry_times=retry_times)
                    checking_df[accessor_name] = df
                    earliest_check_point = min(earliest_check_point, check_point)
        if not checking_df or earliest_check_point == date.today():
            return
        new_checkpoints = dict()
        for cur_date in pd.date_range(earliest_check_point, end_date):
def get_dates(code, accessor_name, start_date, end_date):
    """
    Split the requested date range into two parts, (internal_start, internal_end)
    and (external_start, external_end), in order to reduce the dates that need to
    be requested from the external data source.
    :param code: string number code without pre/suffix, e.g. '600033'
    :param accessor_name: string name of the external data source, e.g. 'sohu', 'sina', etc.
    :param start_date: string '%Y-%m-%d', e.g. '2017-01-04'
    :param end_date: string '%Y-%m-%d', e.g. '2017-01-04'
    :return: (i_start, i_end, e_start, e_end); each is a datetime.date or None.
        If e_start is not None, data must be read from the external source and
        e_end is also not None. If i_end is not None, data must be read from the
        internal database, though i_start can still be None.
    """
    db_accessor = DatabaseAccessor()
    db_checkpoint = db_accessor.get_checkpoints(code, accessor_name)
    i_start = None
    i_end = None
    e_start = None
    e_end = None
    if start_date:
        start_date = date_parser.parse(start_date).date()
    else:
        start_date = constant.default_start_date
    if end_date:
        end_date = date_parser.parse(end_date).date()
    else:
        end_date = date.today()
    if db_checkpoint == constant.default_start_date or start_date > db_checkpoint:
        i_start = None
        i_end = None
        e_start = start_date
        e_end = end_date
    elif end_date <= db_checkpoint:
        i_start = None
        i_end = end_date
        e_start = None
        e_end = None
    else:
        i_start = start_date
        i_end = db_checkpoint
        e_start = db_checkpoint + timedelta(days=1)
        e_end = end_date
    return i_start, i_end, e_start, e_end
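# A minimal worked example of the split performed by get_dates(), reproduced
# without touching the database; the checkpoint value below is assumed purely
# for illustration, and the constant.default_start_date shortcut is omitted.
from datetime import date, timedelta

assumed_checkpoint = date(2017, 1, 31)          # pretend value of db_checkpoint
start, end = date(2017, 1, 10), date(2017, 2, 10)

if start > assumed_checkpoint:
    split = (None, None, start, end)            # everything from the external source
elif end <= assumed_checkpoint:
    split = (None, end, None, None)             # everything from the internal database
else:
    split = (start, assumed_checkpoint,                        # internal part up to the checkpoint,
             assumed_checkpoint + timedelta(days=1), end)      # external part after it

print(split)
# (datetime.date(2017, 1, 10), datetime.date(2017, 1, 31),
#  datetime.date(2017, 2, 1), datetime.date(2017, 2, 10))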
def get_hq(self, code, start_date=None, end_date=None, retry_times=3):
    i_start, i_end, e_start, e_end = utility.get_dates(
        code, self.accessor_name, start_date, end_date)
    e_df = None
    i_df = None
    retry = 0
    code = constant.code_map.get(code[0]) + code
    if e_start:
        while retry < retry_times:
            url = self.hq_fast_baseURL.format(code, e_start, e_end)
            try:
                r = requests.get(url)
                if r.status_code == 200:
                    page = r.content
                    soup = BeautifulSoup(page)
                    record_list = []
                    for el in soup.find_all(name='content'):
                        hq_cur = dict()
                        hq_cur['Close'] = float(el.attrs['c'])
                        hq_cur['Open'] = float(el.attrs['o'])
                        hq_cur['Volume'] = int(el.attrs['v'])
                        hq_cur['High'] = float(el.attrs['h'])
                        hq_cur['Low'] = float(el.attrs['l'])
                        hq_cur['Date'] = date_parser.parse(el.attrs['d']).date()
                        record_list.append(hq_cur)
                    e_df = pd.DataFrame(record_list)
                    if len(e_df) != 0:
                        retry = retry_times + 1
                        e_df['Amount'] = np.NaN
                        e_df = e_df[constant.hq_col_names]
                        e_df = e_df.set_index('Date', drop=False).sort_index()
                        e_df = utility.clean_hq_df(e_df)
                        e_df = e_df[e_start:e_end]
            except Exception:
                retry += 1
    if i_end:
        db_accessor = DatabaseAccessor()
        i_df = db_accessor.get_hq(code, i_start, i_end)
    df = pd.concat([i_df, e_df]).sort_index()
    return df
class Assigner(BaseLogger):

    def __init__(self, log_level=None):
        BaseLogger.__init__(self, self.__class__.__name__, log_level)
        self._db_conn = DatabaseAccessor()
        self._log_info("assigner start @%s", node())

    def process(self):
        url = None
        flag = None
        job = self._db_conn.queue_page_take()
        if job != None:
            url = job['url']
            text = job.get('text', "")
            flag = self._classify(url, text)
            self._log_info("%s is classified as '%s'", url, flag)
            if not self._db_conn.queue_page_done(url, flag):
                self._log_warning("fail to mark %s as '%s' in queue_page", url, flag)
        else:
            self._log_warning("grab no jobs to assign")
            sleep(config_idle_sleep)
        return url, flag

    def _classify(self, url, text):
        soup = BeautifulSoup(text)
        flag = "unknown"
        if soup.find_all(class_="vcard-names"):
            flag = "profile"
        elif soup.find_all(class_="follow-list"):
            flag = "follow"
        elif soup.find_all(class_="blankslate"):
            flag = "alone"
        elif soup.find_all(class_="org-name"):
            flag = "org"
        return flag

    def close(self):
        self._db_conn.close()
        self._log_info("assigner exit")
        self._close_logger()
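# A quick illustration of how _classify() above maps page structure to a flag.
# The markup is made up (it only mimics one of the class names the assigner
# looks for), and running it assumes a reachable MongoDB so DatabaseAccessor()
# can connect inside Assigner(); this is a sketch, not part of the module.
if __name__ == '__main__':
    assigner = Assigner()
    sample_html = '<div class="vcard-names"><span itemprop="name">Jane Doe</span></div>'
    print(assigner._classify("https://github.com/jane", sample_html))  # -> "profile"
    assigner.close()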
def get_fhps(self, code, start_date=None, end_date=None, retry_times=3):
    i_start, i_end, e_start, e_end = utility.get_dates(
        code, self.accessor_name, start_date, end_date)
    e_df = None
    i_df = None
    url = self.fhps_baseURL.format(code)
    retry = 0
    if e_start:
        while retry < retry_times:
            with utility.open_phantomJS_driver() as driver:
                try:
                    driver.get(url)
                    d_table = WebDriverWait(driver, 30).until(
                        EC.presence_of_element_located(
                            (By.XPATH, self.fhps_table_xpath)))
                    e_df = pd.read_html(d_table.get_attribute('outerHTML'))
                    retry = retry_times + 1
                except Exception:
                    retry += 1
        e_df = e_df[0].iloc[:, [1, 2, 3, 5]]
        e_df.columns = ['Sg', 'Zg', 'Fh', 'Date']
        e_df[['Sg', 'Zg', 'Fh']] = e_df[['Sg', 'Zg', 'Fh']].astype(float).fillna(0) / 10
        # getting the date type instead of datetime.datetime,
        # refer to http://stackoverflow.com/a/34277514/4229125
        e_df['Date'] = pd.to_datetime(e_df['Date'], errors='coerce').dt.date
        e_df.loc[:, 'Ps'] = e_df['Sg'] + e_df['Zg']
        e_df = e_df[constant.fhps_col_names].set_index('Date', drop=False).sort_index()
        e_df = e_df[e_start:e_end]
    if i_end:
        db_accessor = DatabaseAccessor()
        i_df = db_accessor.get_fhps(code, i_start, i_end)
    df = pd.concat([i_df, e_df]).sort_index()
    return df
def get_hq(self, code, start_date=None, end_date=None, retry_times=3):
    i_start, i_end, e_start, e_end = utility.get_dates(
        code, self.accessor_name, start_date, end_date)
    e_df = None
    i_df = None
    if e_start:
        hq_url = self.hq_baseURL.format(code, e_start, e_end)
        retry = 0
        while retry < retry_times:
            r = requests.get(hq_url)
            if r.status_code != 200:
                retry += 1
            else:
                retry = retry_times + 1  # this means read successfully
        if retry != retry_times:
            page = r.content[1:-2]
            page_io = StringIO(page)
            data = json.load(page_io)
            e_df = pd.DataFrame(data['hq']).iloc[:, [0, 1, 6, 2, 5, 7, 8]]
            e_df.iloc[:, 0] = pd.to_datetime(e_df.iloc[:, 0], errors='coerce').dt.date
            for i in range(1, len(e_df.columns)):
                e_df.iloc[:, i] = e_df.iloc[:, i].astype(constant.hq_datatypes[i])
            e_df.columns = constant.hq_col_names
            e_df = e_df.set_index('Date', drop=False).sort_index()
            e_df = utility.clean_hq_df(e_df)
            e_df = e_df[e_start:e_end]
    if i_end:
        db_accessor = DatabaseAccessor()
        i_df = db_accessor.get_hq(code, i_start, i_end)
    df = pd.concat([i_df, e_df]).sort_index()
    return df
class Crawler(BaseLogger):

    def __init__(self, log_level=None):
        BaseLogger.__init__(self, self.__class__.__name__, log_level)
        self._db_conn = DatabaseAccessor()
        self._log_info("crawler start @%s", node())

    def process(self):
        status = False
        job = self._db_conn.queue_crawl_take()
        if job != None:
            url = job['url']
            self._log_info("start to crawl %s", url)
            retry_times = config_crawl_retry
            while retry_times > 0:
                text = self._crawl_page(url)
                if text == None:
                    retry_times -= 1
                else:
                    retry_times = 0
            if text == None:
                self._log_warning("fail to crawl %s after %d attempts", url, config_crawl_retry)
                if not self._db_conn.queue_crawl_fail(url):
                    self._log_warning("fail to mark %s as 'fail' in queue_crawl", url)
            else:
                self._log_info("finish crawling %s, response length: %d", url, len(text))
                if not self._db_conn.queue_page_create(url, text):
                    self._log_warning("fail to add %s as 'new' job in queue_page", url)
                if not self._db_conn.queue_crawl_done(url):
                    self._log_warning("fail to mark %s as 'done' in queue_crawl", url)
                status = True
        else:
            self._log_warning("grab no jobs to crawl")
        return status

    def _crawl_page(self, url):
        try:
            resp = get(url, timeout=config_crawl_timeout)
            if resp.status_code == codes.ok:
                return resp.text
        except Exception as e:
            pass

    def close(self):
        self._db_conn.close()
        self._log_info("crawler exit")
        self._close_logger()
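# A minimal sketch of a worker loop that could drive the Crawler above; the
# loop, the KeyboardInterrupt handling, and the reuse of sleep() with
# config_idle_sleep from the surrounding project are assumptions for
# illustration, not part of the original module.
if __name__ == '__main__':
    crawler = Crawler()
    try:
        while True:
            if not crawler.process():       # no job taken, or the crawl failed
                sleep(config_idle_sleep)    # back off before polling the queue again
    except KeyboardInterrupt:
        pass
    finally:
        crawler.close()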
import os
import sys

THIS_FOLDER = os.path.dirname(os.path.abspath(__file__))
sys.path.append(THIS_FOLDER + '/../../database-layer')
sys.path.append(THIS_FOLDER + '/../utilities')
sys.path.append(THIS_FOLDER + '/../models')

from TokenChecker import TokenChecker
from DatabaseAccessor import DatabaseAccessor
from User import User

DBA = DatabaseAccessor()


class TokenCheckHandler:

    def __init__(self):
        self.tokenChecker = TokenChecker()

    def getJsonResponse(self, isUserAuthorized, userId, responseCode):
        return {
            'is user authorized': isUserAuthorized,
            'user id': userId,
            'response code': responseCode
        }

    def handleTokenCheck(self, authToken):
        tokenPayload = self.tokenChecker.getTokenPayload(str(authToken))
        if tokenPayload == False:
            return self.getJsonResponse(False, 'unauthorized', 401)
def read_all_profile(self):
    with closing(DatabaseAccessor()) as dal:
        pprint(dal.profile_read())
def clear_queue_crawl_page_profile(self):
    with closing(DatabaseAccessor()) as dal:
        print("clear crawl - {}".format(dal.queue_crawl_clear()))
        print("clear page - {}".format(dal.queue_page_clear()))
        print("clear profile - {}".format(dal.profile_clear()))
def add_urls_to_queue_crawl(self, urls):
    with closing(DatabaseAccessor()) as dal:
        for url in urls:
            print("add {} - {}".format(url, dal.queue_crawl_create(url)))
class ParserProfile(BaseLogger):

    def __init__(self, log_level=None):
        BaseLogger.__init__(self, self.__class__.__name__, log_level)
        self._db_conn = DatabaseAccessor()
        self._log_info("profile parser start @%s", node())

    def process(self):
        status_profile, status_like = False, False
        job = self._db_conn.queue_page_take_profile()
        if job != None:
            url = job['url']
            text = job.get('text', "")
            profile, like = self._parse_profile_and_like(url, text)
            self._log_info(
                "parse profile page: %s, items count: { profile: %d, like: %d }",
                url, len(profile), len(like))
            if profile:
                if not self._db_conn.profile_create(profile):
                    self._log_warning("fail to add profile of %s in database", url)
                if not self._db_conn.queue_page_done_profile(url):
                    self._log_warning("fail to mark %s as 'done_profile' in queue_page", url)
                status_profile = True
            if like:
                for key in like:
                    if not self._db_conn.queue_crawl_create(like[key]):
                        self._log_warning("fail to add %s as 'new' job in queue_crawl", like[key])
                status_like = True
        else:
            self._log_warning("grab no profile pages to parse")
            sleep(config_idle_sleep)
        return status_profile, status_like

    def _parse_profile_and_like(self, url, text):
        profile = {}
        like = {}
        soup = BeautifulSoup(text)
        profile["url"] = url
        profile["login"] = self._parse_tag_text_by_itemprop(soup, "additionalName")
        profile["name"] = self._parse_tag_text_by_itemprop(soup, "name")
        profile["company"] = self._parse_tag_text_by_itemprop(soup, "worksFor")
        profile["location"] = self._parse_tag_text_by_itemprop(soup, "homeLocation")
        profile["blog"] = self._parse_tag_text_by_itemprop(soup, "url")
        profile["email"] = self._parse_tag_text_by_itemprop(soup, "email")
        profile["join_at"] = self._parse_tag_datetime_by_class(soup, "join-date")
        profile["follower"], like["follower"] = self._parse_tag_count_and_link(soup, "Follower")
        profile["following"], like["following"] = self._parse_tag_count_and_link(soup, "Following")
        profile["starred"], _ = self._parse_tag_count_and_link(soup, "Starred")
        return self._purge_data_dict(profile), self._purge_data_dict(like, config_parse_domain)

    def _purge_data_dict(self, data, prefix=None):
        purged = {}
        for key in data:
            if data[key] != None and len(data[key].strip()) > 0:
                if prefix == None:
                    purged[key] = data[key].strip()
                else:
                    purged[key] = prefix + data[key].strip()
        return purged

    def _parse_tag_text_by_itemprop(self, soup, item_name):
        tags = soup.find_all(itemprop=item_name)
        if len(tags) > 0:
            return tags[0].text

    def _parse_tag_string_by_class(self, soup, class_name):
        tags = soup.find_all(class_=class_name)
        if len(tags) > 0:
            return tags[0].string

    def _parse_tag_datetime_by_class(self, soup, class_name):
        tags = soup.find_all(class_=class_name)
        if len(tags) > 0:
            return tags[0].get('datetime')

    def _parse_tag_count_and_link(self, soup, text):
        tags = soup.find_all(class_="vcard-stat")
        count = None
        link = None
        for tag in tags:
            if text in tag.find(class_="text-muted").text:
                count = tag.find(class_="vcard-stat-count").text
                link = tag.get('href')
                break
        return count, link

    def close(self):
        self._db_conn.close()
        self._log_info("profile parser exit")
        self._close_logger()
def get_hq_full(self, code, start_date=None, end_date=None, retry_times=3):
    """
    This method is too slow, so by default it is not used.
    :param code:
    :param start_date:
    :param end_date:
    :param retry_times:
    :return:
    """
    i_start, i_end, e_start, e_end = utility.get_dates(
        code, self.accessor_name, start_date, end_date)
    e_df = None
    i_df = None
    retry = 0
    available_year_list = []
    if e_start:
        while retry < retry_times:
            with utility.open_phantomJS_driver() as driver:
                try:
                    e_df_list = []
                    year_start = e_start.year
                    year_end = e_end.year
                    jidu_start = (e_start.month - 1) / 3 + 1
                    jidu_end = (e_end.month - 1) / 3 + 1
                    year = year_end + 1
                    while year > year_start:
                        year -= 1
                        for jidu in range(4, 0, -1):
                            if year > year_end or (year == year_end and jidu > jidu_end):
                                continue
                            if year < year_start or (year == year_start and jidu < jidu_start):
                                continue
                            url = self.hq_full_baseURL.format(code, year, jidu)
                            driver.get(url)
                            if not available_year_list:
                                # need to find the available years first
                                year_list_option = WebDriverWait(driver, 30).until(
                                    EC.presence_of_element_located(
                                        (By.XPATH, self.hq_year_list_xpath)))
                                soup = BeautifulSoup(year_list_option.get_attribute('innerHTML'))
                                for available_year in soup.find_all(name='option'):
                                    available_year_list.append(int(available_year.text))
                                if year_start < min(available_year_list):
                                    year_start = min(available_year_list)
                                    jidu_start = 1
                                if year_end > max(available_year_list):
                                    year_end = max(available_year_list)
                                    jidu_end = 4
                                continue
                            d_table = WebDriverWait(driver, 30).until(
                                EC.presence_of_element_located(
                                    (By.XPATH, self.hq_table_xpath)))
                            cur_e_df = pd.read_html(
                                d_table.get_attribute('outerHTML'), header=1)[0]
                            cur_e_df.columns = constant.hq_col_names
                            cur_e_df['Date'] = pd.to_datetime(
                                cur_e_df['Date'], errors='coerce').dt.date
                            for i in range(1, len(cur_e_df.columns)):
                                cur_e_df.iloc[:, i] = cur_e_df.iloc[:, i].astype(
                                    constant.hq_datatypes[i])
                            cur_e_df = cur_e_df.set_index('Date', drop=False).sort_index()
                            e_df_list.append(cur_e_df)
                    retry = retry_times + 1
                except Exception:
                    retry += 1
        e_df = pd.concat(e_df_list).sort_index()
        e_df = utility.clean_hq_df(e_df)
        e_df = e_df[e_start:e_end]
    if i_end:
        db_accessor = DatabaseAccessor()
        # read the internal slice of quote data before merging it with the
        # externally fetched part
        i_df = db_accessor.get_hq(code, i_start, i_end)
    df = pd.concat([i_df, e_df]).sort_index()
    return df
class Assigner(BaseLogger):

    def __init__(self, log_level=None):
        BaseLogger.__init__(self, self.__class__.__name__, log_level)
        self._db_conn = DatabaseAccessor()
        self._log_info("assigner start @%s", node())

    def close(self):
        self._db_conn.close()
        self._log_info("assigner exit")
        self._close_logger()

    def process(self):
        url = None
        job = self._db_conn.queue_page_take_raw()
        if job != None:
            url = job['url']
            text = job.get('text', "")
            parse_result = self._parse_raw_page(url, text)
            if parse_result == None:
                self._log_warning("fail to parse '%s' as JSON in queue_page", url)
                if not self._db_conn.queue_page_fail_raw(url):
                    self._log_warning("fail to mark %s as 'fail' in queue_page", url)
            else:
                if parse_result[0] == None:
                    self._log_warning("'%s' in queue_page indicates no more new content", url)
                else:
                    self._log_info("%s indicates new crawling job: %s", url, parse_result[0])
                    if not self._db_conn.queue_crawl_create(parse_result[0]):
                        self._log_warning("fail to add %s as 'new' job in queue_crawl", parse_result[0])
                if parse_result[1] == None:
                    self._log_warning("'%s' in queue_page contains no content", url)
                else:
                    self._log_info("%s contains %d raw snippets", url, len(parse_result[1]))
                if not self._db_conn.queue_page_done_raw(url, parse_result[1]):
                    self._log_warning("fail to append parsed data for %s in queue_page", url)
        else:
            self._log_warning("grab no jobs to assign")
            sleep(config_idle_sleep)
        return url

    def _parse_raw_page(self, url, text):
        try:
            page_content = loads(text)
            url_new, data_new = None, None
            if (page_content["data"]["has_more"]) and (page_content["data"]["max_time"] > config_crawl_date_min):
                url_new = sub(r"=(\d*)$", r"=" + str(page_content["data"]["max_time"]), url)
            if len(page_content["data"]["data"]) > 0:
                data_new = page_content["data"]["data"]
            result = (url_new, data_new)
            self._log_info(
                "%s data status - more: %s, min: %d, max: %d",
                url, page_content["data"]["has_more"],
                page_content["data"]["min_time"], page_content["data"]["max_time"])
        except Exception as e:
            result = None
        return result
def clear_queue_crawl_page_snippet(self):
    print("clear_queue_crawl_page_snippet")
    with closing(DatabaseAccessor()) as dal:
        print("clear crawl - {}".format(dal.queue_crawl_clear()))
        print("clear page - {}".format(dal.queue_page_clear()))
        print("clear snippet - {}".format(dal.snippet_clear()))
class WatchDog(BaseLogger):

    def __init__(self, log_level=None):
        BaseLogger.__init__(self, self.__class__.__name__, log_level)
        self._db_conn = DatabaseAccessor()
        if not isdir(config_report_folder):
            makedirs(config_report_folder)
            self._log_info("create folder of charts: %s", config_report_folder)
        self._log_info("watchdog start @%s", node())

    def process(self):
        time_start = time()
        data = self._update_data()
        self._draw_charts_with_data(data)
        time_end = time()
        sleep(max(0, config_report_interval - (time_end - time_start)))
        return

    def _update_data(self):
        data = self._load_data()
        self._log_info("load existing data, count: %d", len(data))
        time_start = time()
        status = {
            "crawl_all": self._db_conn.queue_crawl_count(),
            "crawl_new": self._db_conn.queue_crawl_count("new"),
            "crawl_fail": self._db_conn.queue_crawl_count("fail"),
            "crawl_done": self._db_conn.queue_crawl_count("done"),
            "page_all": self._db_conn.queue_page_count(),
            "page_new": self._db_conn.queue_page_count("new"),
            "page_profile": self._db_conn.queue_page_count("profile"),
            "page_profile_done": self._db_conn.queue_page_count("done_profile"),
            "page_follow": self._db_conn.queue_page_count("follow"),
            "page_follow_done": self._db_conn.queue_page_count("done_follow"),
            "page_unknown": self._db_conn.queue_page_count("unknown"),
            "profile": self._db_conn.profile_count(),
            "profile_email": self._db_conn.profile_count("email"),
        }
        time_end = time()
        status["duration"] = time_end - time_start
        status["date"] = datetime.utcnow()
        data.append(status)
        self._save_data(data)
        self._log_info("save existing data, count: %d", len(data))
        self._log_info(dumps(status, sort_keys=True, indent=4, default=json_util.default))
        return data

    def _load_data(self, filename=config_report_status):
        data = []
        if isfile(filename):
            try:
                data_file = open(filename, 'r')
                content = data_file.read()
                data_file.close()
                data = loads(content, object_hook=json_util.object_hook)
            except Exception as e:
                self._log_exception("fail to load json file: %s", filename)
        else:
            self._log_warning("fail to find json file: %s", filename)
        return data

    def _save_data(self, data, filename=config_report_status):
        output_file = open(filename, 'w')
        output_file.write(dumps(data, sort_keys=True, indent=4, default=json_util.default))
        output_file.close()

    def _draw_charts_with_data(self, data):
        chart_size_methods = [
            self._draw_size_chart_summary, self._draw_size_chart_crawl,
            self._draw_size_chart_page, self._draw_size_chart_profile
        ]
        chart_delta_methods = [
            self._draw_delta_chart_summary, self._draw_delta_chart_page
        ]
        pos_start = config_report_item * config_report_step - config_report_step + 1
        if len(data) > pos_start:
            data_render = data[-pos_start::config_report_step]
        else:
            data_render = data[-config_report_item:]
        for method in chart_size_methods:
            result = method(data_render)
            self._log_info("save chart as %s", result)
        for method in chart_delta_methods:
            result = method(data[-config_report_item - 1:])
            self._log_info("save chart as %s", result)

    def _get_stackedline_with_style(self):
        dark_rotate_style = RotateStyle('#9e6ffe')
        return StackedLine(fill=True,
                           disable_xml_declaration=True,
                           include_x_axis=False,
                           human_readable=True,
                           interpolate='hermite',
                           style=dark_rotate_style)

    def _get_line_with_style(self):
        dark_rotate_style = RotateStyle('#9e6ffe')
        return Line(fill=False,
                    disable_xml_declaration=True,
                    include_x_axis=False,
                    human_readable=True,
                    interpolate='hermite',
                    style=dark_rotate_style)

    def _extract_list(self, data, field):
        return [item[field] for item in data]

    def _extract_date_list(self, data_list):
        return [
            data["date"].replace(tzinfo=timezone.utc).astimezone(tz=None).strftime("%H:%M")
            for data in data_list
        ]

    def _draw_size_chart_summary(self, data, filename="size_summary.svg"):
        filename = join(config_report_folder, filename)
        list_crawl = self._extract_list(data, "crawl_all")
        list_page = self._extract_list(data, "page_all")
        list_profile = self._extract_list(data, "profile")
        chart = self._get_line_with_style()
        chart.title = 'Queue Size Summary'
        chart.x_labels = self._extract_date_list(data)
        chart.add('Profile', list_profile)
        chart.add('Page', list_page)
        chart.add('Crawl', list_crawl)
        chart.render_to_file(filename)
        return filename

    def _draw_size_chart_crawl(self, data, filename="size_crawl.svg"):
        filename = join(config_report_folder, filename)
        list_all = self._extract_list(data, "crawl_all")
        list_new = self._extract_list(data, "crawl_new")
        list_fail = self._extract_list(data, "crawl_fail")
        list_done = [
            list_all[i] - list_new[i] - list_fail[i]
            for i in range(len(list_all))
        ]
        chart = self._get_stackedline_with_style()
        chart.title = 'Size of Queue Crawl'
        chart.x_labels = self._extract_date_list(data)
        chart.add('Done', list_done)
        chart.add('Todo', list_new)
        chart.render_to_file(filename)
        return filename

    def _draw_size_chart_page(self, data, filename="size_page.svg"):
        filename = join(config_report_folder, filename)
        list_all = self._extract_list(data, "page_all")
        list_new = self._extract_list(data, "page_new")
        list_profile = self._extract_list(data, "page_profile")
        list_follow = self._extract_list(data, "page_follow")
        list_unknown = self._extract_list(data, "page_unknown")
        list_done = [
            list_all[i] - list_new[i] - list_profile[i] - list_follow[i] - list_unknown[i]
            for i in range(len(list_all))
        ]
        chart = self._get_stackedline_with_style()
        chart.title = 'Size of Queue Page'
        chart.x_labels = self._extract_date_list(data)
        chart.add('Done', list_done)
        chart.add('Profile', list_profile)
        chart.add('Follow', list_follow)
        chart.add('Todo', list_new)
        chart.render_to_file(filename)
        return filename

    def _draw_size_chart_profile(self, data, filename="size_profile.svg"):
        filename = join(config_report_folder, filename)
        list_all = self._extract_list(data, "profile")
        list_email = self._extract_list(data, "profile_email")
        list_other = [
            list_all[i] - list_email[i] for i in range(len(list_all))
        ]
        chart = self._get_stackedline_with_style()
        chart.title = 'Size of Queue Profile'
        chart.x_labels = self._extract_date_list(data)
        chart.add('Other', list_other)
        chart.add('Email', list_email)
        chart.render_to_file(filename)
        return filename

    def _get_delta_list(self, data, field):
        value_list = self._extract_list(data, field)
        for i in range(len(value_list) - 1, 0, -1):
            value_list[i] -= value_list[i - 1]
        return value_list[1:]

    def _draw_delta_chart_summary(self, data, filename="delta_summary.svg"):
        filename = join(config_report_folder, filename)
        list_crawl = self._get_delta_list(data, "crawl_all")
        list_page = self._get_delta_list(data, "page_all")
        list_profile = self._get_delta_list(data, "profile")
        chart = self._get_line_with_style()
        chart.title = 'Queue Size Increase Summary'
        chart.x_labels = self._extract_date_list(data[1:])
        chart.add('Profile', list_profile)
        chart.add('Page', list_page)
        chart.add('Crawl', list_crawl)
        chart.render_to_file(filename)
        return filename

    def _draw_delta_chart_page(self, data, filename="delta_page.svg"):
        filename = join(config_report_folder, filename)
        list_all = self._get_delta_list(data, "page_all")
        list_fail = self._get_delta_list(data, "crawl_fail")
        list_follow = self._get_delta_list(data, "page_follow_done")
        list_profile = self._get_delta_list(data, "page_profile_done")
        chart = self._get_line_with_style()
        chart.title = 'Queue Page Size Increase'
        chart.x_labels = self._extract_date_list(data[1:])
        chart.add('Profile', list_profile)
        chart.add('Follow', list_follow)
        chart.add('Failed Crawl', list_fail)
        chart.add('New Crawl', list_all)
        chart.render_to_file(filename)
        return filename

    def close(self):
        self._db_conn.close()
        self._log_info("watchdog exit")
        self._close_logger()
class Parser(BaseLogger):

    def __init__(self, log_level=None):
        BaseLogger.__init__(self, self.__class__.__name__, log_level)
        self._db_conn = DatabaseAccessor()
        self._log_info("parser start @%s", node())

    def close(self):
        self._db_conn.close()
        self._log_info("parser exit")
        self._close_logger()

    def process(self):
        count_valid = None
        count_duplicate = None
        job = self._db_conn.queue_page_take_data()
        if job != None:
            url = job['url']
            data_list = job.get('data', [])
            self._log_info("parse json data from %s, items count %d", url, len(data_list))
            count_valid = 0
            count_duplicate = 0
            for data_index, data_item in enumerate(data_list):
                snippet = self._extract_snippet_record(url, data_item)
                if snippet == None:
                    self._log_warning(
                        "fail to extract #%d record of '%s' json data in queue_page",
                        data_index, url)
                else:
                    if not self._db_conn.snippet_create(snippet):
                        count_duplicate += 1
                        # self._log_warning("fail to add new snippet %s", snippet["url"])
                    else:
                        count_valid += 1
            self._log_info(
                "extract %d valid & %d duplicate snippets from %s json data",
                count_valid, count_duplicate, url)
            if not self._db_conn.queue_page_done_data(url):
                self._log_warning("fail to mark %s as 'done' in queue_page", url)
        else:
            self._log_warning("grab no json data to parse")
            sleep(config_idle_sleep)
        return (count_valid, count_duplicate)

    def _extract_snippet_record(self, url, data):
        try:
            snippet = {
                "url": config_parse_domain + str(data["group"]["group_id"]),
                "date": datetime.fromtimestamp(data["group"]["create_time"]),
                "content": data["group"]["content"],
                "archive": data,
                "source": url.split("?")[0],
                "source_name": data["group"]["category_name"],
            }
            snippet["count"] = {
                "digg": data["group"]["digg_count"],
                "bury": data["group"]["bury_count"],
                "favorite": data["group"]["favorite_count"],
                "comment": data["group"]["comment_count"],
            }
            if len(data["comments"]) > 0:
                comment_text = []
                comment_digg = []
                for comment in data["comments"]:
                    comment_text.append(comment["text"])
                    comment_digg.append(comment["digg_count"])
                snippet["comments"] = comment_text
                snippet["count"]["commdigg"] = comment_digg
            if len(snippet["content"].strip()) == 0:
                snippet = None
        except Exception as e:
            snippet = None
        return snippet
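# A hypothetical input for _extract_snippet_record() above, shaped after the
# fields the parser reads; the values are invented for illustration, and
# running it assumes a reachable MongoDB so DatabaseAccessor() can connect
# inside Parser(). This is a sketch, not part of the original module.
if __name__ == '__main__':
    sample_item = {
        "group": {
            "group_id": 123456,
            "create_time": 1490000000,
            "content": "an example snippet",
            "category_name": "joke",
            "digg_count": 10,
            "bury_count": 1,
            "favorite_count": 2,
            "comment_count": 1,
        },
        "comments": [{"text": "an example comment", "digg_count": 3}],
    }
    parser = Parser()
    snippet = parser._extract_snippet_record("http://neihanshequ.com/joke/?max_time=0", sample_item)
    print(snippet["url"], snippet["count"], snippet["comments"])
    parser.close()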
def get_fhps(self, code, start_date=None, end_date=None, retry_times=3):
    i_start, i_end, e_start, e_end = utility.get_dates(
        code, self.accessor_name, start_date, end_date)
    e_df = None
    i_df = None
    url = self.fhps_baseURL.format(code)
    table = None
    retry = 0
    if e_start:
        while retry < retry_times:
            with utility.open_phantomJS_driver() as driver:
                try:
                    driver.get(url)
                    d_table = WebDriverWait(driver, 30).until(
                        EC.presence_of_element_located(
                            (By.XPATH, self.fhps_table_xpath)))
                    table = d_table.text
                    if not table:
                        retry += 1
                    else:
                        retry = retry_times + 1
                except Exception:
                    retry += 1
        if retry != retry_times:
            line_iter = iter(table.split('\n'))
            records = []
            while True:
                record = dict()
                date_m = None
                try:
                    line = line_iter.next()
                    tokens = line.strip().split()
                    if len(tokens) == 1:
                        continue
                    if tokens[0] != u'除权除息日':
                        continue
                    if len(tokens) == 2:
                        next_line = line_iter.next()
                        date_m = self.date_re.search(next_line)
                    if not date_m:
                        date_m = self.date_re.search(line)
                    zg_m = self.zg_re.search(line)
                    sg_m = self.sg_re.search(line)
                    fh_m = self.fh_re.search(line)
                    record['Date'] = date_parser.parse(date_m.group(1)).date()
                    if zg_m:
                        record['Zg'] = float(zg_m.group(1)) / 10.
                    if sg_m:
                        record['Sg'] = float(sg_m.group(1)) / 10.
                    if fh_m:
                        record['Fh'] = float(fh_m.group(1)) / 10.
                    records.append(record)
                except StopIteration:
                    break
            if records:
                e_df = pd.DataFrame(records)
                for field in ['Fh', 'Zg', 'Sg']:
                    if field not in e_df:
                        e_df.loc[:, field] = 0.0
                e_df.loc[:, ['Fh', 'Zg', 'Sg']] = e_df.loc[:, ['Fh', 'Zg', 'Sg']].fillna(0.0)
                e_df.loc[:, 'Ps'] = e_df['Zg'] + e_df['Sg']
                e_df = e_df[constant.fhps_col_names].set_index('Date', drop=False).sort_index()
                e_df = e_df[e_start:e_end]
    if i_end:
        db_accessor = DatabaseAccessor()
        i_df = db_accessor.get_fhps(code, i_start, i_end)
    df = pd.concat([i_df, e_df]).sort_index()
    return df
class Exporter(BaseLogger):

    def __init__(self, log_level=None):
        BaseLogger.__init__(self, self.__class__.__name__, log_level)
        self._db_conn = DatabaseAccessor()
        self._log_info("exporter start @%s", node())
        self._source_set_joke = [
            "http://neihanshequ.com/joke/",
            "http://neihanshequ.com/bar/1/",
            "http://neihanshequ.com/bar/11/",
            "http://neihanshequ.com/bar/76/",
            "http://neihanshequ.com/bar/80/",
            "http://neihanshequ.com/bar/82/",
            "http://neihanshequ.com/bar/59/",
            "http://neihanshequ.com/bar/5/",
        ]
        self._source_set_art = [
            "http://neihanshequ.com/bar/25/",
            "http://neihanshequ.com/bar/26/",
            "http://neihanshequ.com/bar/3/",
            "http://neihanshequ.com/bar/53/",
            "http://neihanshequ.com/bar/46/",
            "http://neihanshequ.com/bar/49/",
            "http://neihanshequ.com/bar/69/",
            "http://neihanshequ.com/bar/51/",
            "http://neihanshequ.com/bar/60/",
        ]

    def process(self):
        filelist = []
        data = self._db_conn.snippet_read()
        self._log_info("load all snippet data from database")
        filelist.append(self._save_as_json(data))
        filelist.append(self._save_as_csv(data))
        data_joke = self._select_data_column(data, self._source_set_joke)
        filelist.append(self._save_as_csv(data_joke, "snippet_joke.csv"))
        data_art = self._select_data_column(data, self._source_set_art)
        filelist.append(self._save_as_csv(data_art, "snippet_art.csv"))
        self._archive_into_zipfile(filelist)

    def _select_data_column(self, data_raw, source_set):
        data_new = []
        for item_raw in data_raw:
            if item_raw["source"] not in source_set:
                continue
            for index in range(max(1, len(item_raw.get("comments", [])))):
                item_new = {
                    "count_digg": item_raw["count"]["digg"],
                    "count_bury": item_raw["count"]["bury"],
                    "count_favorite": item_raw["count"]["favorite"],
                    "count_comment": item_raw["count"]["comment"],
                    "count_diggcomm": None,
                    "text": item_raw["content"],
                    "text_comment": None,
                    "source": item_raw["source_name"],
                }
                if "comments" in item_raw:
                    item_new["text_comment"] = item_raw["comments"][index]
                    item_new["count_diggcomm"] = item_raw["count"]["commdigg"][index]
                data_new.append(item_new)
        return data_new

    def _save_as_json(self, data, filename="snippet.json"):
        with open(filename, 'w') as jsonfile:
            for item in data:
                dump(item, jsonfile, sort_keys=True)
                jsonfile.write("\n")
        self._log_info("save %d items as json file: %s", len(data), filename)
        return filename

    def _save_as_csv(self, data, filename="snippet.csv"):
        fields = set()
        for item in data:
            fields = fields.union(set(item.keys()))
        with open(filename, 'w', encoding='utf8', newline='') as csvfile:
            writer = DictWriter(csvfile,
                                extrasaction='ignore',
                                dialect='excel',
                                fieldnames=sorted(fields, reverse=False))
            writer.writeheader()
            for item in data:
                writer.writerow(item)
        self._log_info("save %d items as csv file: %s", len(data), filename)
        return filename

    def _archive_into_zipfile(self, filelist):
        zipname = "snippet_{}.zip".format(strftime("%Y-%m-%d_%H-%M-%S"))
        with ZipFile(zipname, 'w', ZIP_DEFLATED) as zip:
            for filename in filelist:
                zip.write(filename)
                remove(filename)
        self._log_info("archive exported files into %s", zipname)

    def close(self):
        self._db_conn.close()
        self._log_info("exporter exit")
        self._close_logger()