Example #1
 def __init__(self, log_level=None):
     BaseLogger.__init__(self, self.__class__.__name__, log_level)
     self._db_conn = DatabaseAccessor()
     self._log_info("exporter start @%s", node())
     self._source_set_joke = [
         "http://neihanshequ.com/joke/",
         "http://neihanshequ.com/bar/1/",
         "http://neihanshequ.com/bar/11/",
         "http://neihanshequ.com/bar/76/",
         "http://neihanshequ.com/bar/80/",
         "http://neihanshequ.com/bar/82/",
         "http://neihanshequ.com/bar/59/",
         "http://neihanshequ.com/bar/5/",
     ]
     self._source_set_art = [
         "http://neihanshequ.com/bar/25/",
         "http://neihanshequ.com/bar/26/",
         "http://neihanshequ.com/bar/3/",
         "http://neihanshequ.com/bar/53/",
         "http://neihanshequ.com/bar/46/",
         "http://neihanshequ.com/bar/49/",
         "http://neihanshequ.com/bar/69/",
         "http://neihanshequ.com/bar/51/",
         "http://neihanshequ.com/bar/60/",
     ]
Example #2
 def __init__(self, log_level=None):
     BaseLogger.__init__(self, self.__class__.__name__, log_level)
     self._db_conn = DatabaseAccessor()
     if not isdir(config_report_folder):
         makedirs(config_report_folder)
         self._log_info("create folder of charts: %s", config_report_folder)
     self._log_info("watchdog start @%s", node())
Example #3
class ParserFollow(BaseLogger):
    def __init__(self, log_level=None):
        BaseLogger.__init__(self, self.__class__.__name__, log_level)
        self._db_conn = DatabaseAccessor()
        self._log_info("follow parser start @%s", node())

    def process(self):
        status = False
        job = self._db_conn.queue_page_take_follow()
        if job != None:
            url = job['url']
            text = job.get('text', "")
            links = self._parse_user_links(url, text)
            self._log_info("parse follow page: %s, link count: %d", url,
                           len(links))
            if links:
                for link in links:
                    if not self._db_conn.queue_crawl_create(link):
                        self._log_warning(
                            "fail to add %s as 'new' job in queue_crawl", link)
                if not self._db_conn.queue_page_done_follow(url):
                    self._log_warning(
                        "fail to mark %s as 'done_follow' in queue_page", url)
                status = True
        else:
            self._log_warning("grab no follow pages to parse")
            sleep(config_idle_sleep)
        return status

    def _parse_user_links(self, url, text):
        links = []
        soup = BeautifulSoup(text)

        for tag in soup.find_all(class_="follow-list-item"):
            if tag.find_all("a"):
                links.append(tag.find("a").get('href'))

        pagination = soup.find(class_="pagination")
        if pagination:
            for tag in pagination.find_all("a"):
                if "Next" == tag.text:
                    links.append(tag.get('href'))

        return self._purge_data_list(links, config_parse_domain)

    def _purge_data_list(self, data, prefix=None):
        purged = []
        for item in data:
            if item != None and len(item.strip()) > 0:
                if prefix == None or prefix in item:
                    purged.append(item.strip())
                else:
                    purged.append(prefix + item.strip())
        return purged

    def close(self):
        self._db_conn.close()
        self._log_info("follow parser exit")
        self._close_logger()
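
The helper _purge_data_list above drops empty entries and turns relative links into absolute ones by prepending config_parse_domain. A minimal standalone sketch of the same logic, where the https://github.com prefix is only an illustrative stand-in for config_parse_domain:

def purge_data_list(data, prefix=None):
    # Strip blanks/None and prepend the prefix to items that do not already contain it.
    purged = []
    for item in data:
        if item and item.strip():
            if prefix is None or prefix in item:
                purged.append(item.strip())
            else:
                purged.append(prefix + item.strip())
    return purged

print(purge_data_list(["/user?page=2", None, " https://github.com/user "],
                      prefix="https://github.com"))
# ['https://github.com/user?page=2', 'https://github.com/user']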
Example #4
class Exporter(BaseLogger):
    def __init__(self, log_level=None):
        BaseLogger.__init__(self, self.__class__.__name__, log_level)
        self._db_conn = DatabaseAccessor()
        self._log_info("exporter start @%s", node())

    def process(self):
        filelist = []
        data = self._db_conn.profile_read()
        self._log_info("load all profiles data from database")
        filelist.append(self._save_as_json(data))
        filelist.append(self._save_as_csv(data))
        data = self._db_conn.profile_read('email')
        self._log_info("load profiles data with email from database")
        filelist.append(self._save_as_json(data, "profile_email.json"))
        filelist.append(self._save_as_csv(data, "profile_email.csv"))
        self._archive_into_zipfile(filelist)

    def _save_as_json(self, data, filename="profile.json"):
        with open(filename, 'w') as jsonfile:
            for item in data:
                dump(item, jsonfile, sort_keys=True)
                jsonfile.write("\n")
        self._log_info("save %d items as json file: %s", len(data), filename)
        return filename

    def _save_as_csv(self, data, filename="profile.csv"):
        fields = set()
        for item in data:
            fields = fields.union(set(item.keys()))
        with open(filename, 'w', encoding='utf8', newline='') as csvfile:
            writer = DictWriter(csvfile,
                                extrasaction='ignore',
                                dialect='excel',
                                fieldnames=sorted(fields, reverse=True))
            writer.writeheader()
            for item in data:
                writer.writerow(item)
        self._log_info("save %d items as csv file: %s", len(data), filename)
        return filename

    def _archive_into_zipfile(self, filelist):
        zipname = "profile_{}.zip".format(strftime("%Y-%m-%d_%H-%M-%S"))
        with ZipFile(zipname, 'w', ZIP_DEFLATED) as zip:
            for filename in filelist:
                zip.write(filename)
                remove(filename)
        self._log_info("archive exported files into %s", zipname)

    def close(self):
        self._db_conn.close()
        self._log_info("exporter exit")
        self._close_logger()
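
The Exporter above is self-driving once constructed; the sketch below is a hypothetical one-shot driver (the module name "exporter" is an assumption, everything else follows the methods shown above):

# Hypothetical driver; the module name "exporter" is assumed for illustration.
from exporter import Exporter

def run_export(log_level=None):
    exporter = Exporter(log_level)
    try:
        exporter.process()   # writes profile.json/.csv (plus the email-only variants) and zips them
    finally:
        exporter.close()     # always release the database connection and the logger

if __name__ == "__main__":
    run_export()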
Example #5
 def add_urls_to_queue_crawl(self, urls):
     print("add_urls_to_queue_crawl with config_crawl_date_max =",
           config_crawl_date_max)
     with closing(DatabaseAccessor()) as dal:
         for url_prefix in urls:
             url = url_prefix + str(config_crawl_date_max)
             print("add {} - {}".format(url, dal.queue_crawl_create(url)))
Example #6
class DatabaseBuilder():
    def __init__(self, host=constant.host, port=constant.port):
        self.c = pymongo.MongoClient(host=host, port=port)
        self.db_accessor = DatabaseAccessor()

    def update_stock_hq(self, code, update_accessor_name='all', start_date=None, end_date=None, retry_times=3):

        if not end_date:
            end_date = date.today()

        accessor_map = constant.external_accessor_map
        checking_df = {}
        earliest_check_point = date.today()

        for accessor_name, accessor in accessor_map.items():
            if update_accessor_name == 'all' or update_accessor_name == accessor_name:
                i_start, i_end, e_start, e_end = utility.get_dates(code, accessor_name, start_date, end_date)
                check_point = self.db_accessor.get_checkpoints(code=code, accessor_name=accessor_name)
                if e_start:
                    df = accessor.get_hq(code, start_date=e_start, end_date=e_end, retry_times=retry_times)
                    checking_df[accessor_name] = df
                    earliest_check_point = min(earliest_check_point, check_point)

        if not checking_df or earliest_check_point == date.today():
            return

        new_checkpoints = dict()
        for cur_date in pd.date_range(earliest_check_point, end_date):
Example #7
def get_dates(code, accessor_name, start_date, end_date):
    """
    this method split the date range into two part, internal_start, internal_end, external_start, external_end,
    in order to reduce the dates need be requested from external data source.
    :param code: string number code without pre/suffix, e.g. '600033'
    :param accessor_name: string for the external data source, e.g. 'sohu', 'sina' etc
    :param start_date: string '%Y-%m-%d' e.g. '2017-01-04'
    :param end_date: string '%Y-%m-%d' e.g. '2017-01-04'
    :return: (i_start, i_end, e_start, e_end) data type are datetime or None
             if e_start != None, means need read from external, e_end also != None,
             if i_end != None, means need read from internal database. But i_start can be None
    """
    db_accessor = DatabaseAccessor()
    db_checkpoint = db_accessor.get_checkpoints(code, accessor_name)
    i_start = None
    i_end = None
    e_start = None
    e_end = None

    if start_date:
        start_date = date_parser.parse(start_date).date()
    else:
        start_date = constant.default_start_date

    if end_date:
        end_date = date_parser.parse(end_date).date()
    else:
        end_date = date.today()

    if db_checkpoint == constant.default_start_date or start_date > db_checkpoint:
        i_start = None
        i_end = None
        e_start = start_date
        e_end = end_date
    elif end_date <= db_checkpoint:
        i_start = None
        i_end = end_date
        e_start = None
        e_end = None
    else:
        i_start = start_date
        i_end = db_checkpoint
        e_start = db_checkpoint + timedelta(days=1)
        e_end = end_date

    return i_start, i_end, e_start, e_end
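
The three branches of get_dates can be exercised without the database dependency; below is a minimal sketch of the same split, where the checkpoint is passed in directly instead of being read via DatabaseAccessor.get_checkpoints, and default_start stands in for constant.default_start_date:

from datetime import date, timedelta

def split_dates(start_date, end_date, checkpoint, default_start=date(1990, 1, 1)):
    # Same decision as get_dates: returns (i_start, i_end, e_start, e_end).
    if checkpoint == default_start or start_date > checkpoint:
        return None, None, start_date, end_date       # nothing cached: all external
    if end_date <= checkpoint:
        return None, end_date, None, None             # fully cached: all internal
    return start_date, checkpoint, checkpoint + timedelta(days=1), end_date

print(split_dates(date(2017, 1, 4), date(2017, 7, 10), checkpoint=date(2017, 6, 30)))
# internal part 2017-01-04..2017-06-30, external part 2017-07-01..2017-07-10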
Example #8
    def get_hq(self, code, start_date=None, end_date=None, retry_times=3):
        i_start, i_end, e_start, e_end = utility.get_dates(
            code, self.accessor_name, start_date, end_date)
        e_df = None
        i_df = None
        retry = 0
        code = constant.code_map.get(code[0]) + code

        if e_start:
            while retry < retry_times:
                url = self.hq_fast_baseURL.format(code, e_start, e_end)
                try:
                    r = requests.get(url)
                    if r.status_code == 200:
                        page = r.content
                        soup = BeautifulSoup(page)
                        record_list = []
                        for el in soup.find_all(name='content'):
                            hq_cur = dict()
                            hq_cur['Close'] = float(el.attrs['c'])
                            hq_cur['Open'] = float(el.attrs['o'])
                            hq_cur['Volume'] = int(el.attrs['v'])
                            hq_cur['High'] = float(el.attrs['h'])
                            hq_cur['Low'] = float(el.attrs['l'])
                            hq_cur['Date'] = date_parser.parse(
                                el.attrs['d']).date()
                            record_list.append(hq_cur)
                        e_df = pd.DataFrame(record_list)
                        if len(e_df) != 0:
                            retry = retry_times + 1
                        e_df['Amount'] = np.NaN
                        e_df = e_df[constant.hq_col_names]
                        e_df = e_df.set_index('Date', drop=False).sort_index()
                        e_df = utility.clean_hq_df(e_df)
                        e_df = e_df[e_start:e_end]
                except Exception:
                    retry += 1

        if i_end:
            db_accessor = DatabaseAccessor()
            i_df = db_accessor.get_hq(code, i_start, i_end)

        df = pd.concat([i_df, e_df]).sort_index()
        return df
Example #9
class Assigner(BaseLogger):
    def __init__(self, log_level=None):
        BaseLogger.__init__(self, self.__class__.__name__, log_level)
        self._db_conn = DatabaseAccessor()
        self._log_info("assigner start @%s", node())

    def process(self):
        url = None
        flag = None
        job = self._db_conn.queue_page_take()
        if job != None:
            url = job['url']
            text = job.get('text', "")
            flag = self._classify(url, text)
            self._log_info("%s is classified as '%s'", url, flag)
            if not self._db_conn.queue_page_done(url, flag):
                self._log_warning("fail to mark %s as '%s' in queue_page", url,
                                  flag)
        else:
            self._log_warning("grab no jobs to assign")
            sleep(config_idle_sleep)
        return url, flag

    def _classify(self, url, text):
        soup = BeautifulSoup(text)
        flag = "unknown"
        if soup.find_all(class_="vcard-names"):
            flag = "profile"
        elif soup.find_all(class_="follow-list"):
            flag = "follow"
        elif soup.find_all(class_="blankslate"):
            flag = "alone"
        elif soup.find_all(class_="org-name"):
            flag = "org"
        return flag

    def close(self):
        self._db_conn.close()
        self._log_info("assigner exit")
        self._close_logger()
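
_classify only looks at a handful of CSS class names, so it is easy to probe with a hand-written fragment; the markup below is made up and only the class attribute matters:

from bs4 import BeautifulSoup

html = '<div class="vcard-names"><span itemprop="name">Jane</span></div>'
soup = BeautifulSoup(html, "html.parser")
print(bool(soup.find_all(class_="vcard-names")))  # True, so _classify would return "profile"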
Example #10
    def get_fhps(self, code, start_date=None, end_date=None, retry_times=3):
        i_start, i_end, e_start, e_end = utility.get_dates(
            code, self.accessor_name, start_date, end_date)
        e_df = None
        i_df = None
        url = self.fhps_baseURL.format(code)
        retry = 0

        if e_start:
            while retry < retry_times:
                with utility.open_phantomJS_driver() as driver:
                    try:
                        driver.get(url)
                        d_table = WebDriverWait(driver, 30).until(
                            EC.presence_of_element_located(
                                (By.XPATH, self.fhps_table_xpath)))
                        e_df = pd.read_html(d_table.get_attribute('outerHTML'))
                        retry = retry_times + 1
                    except Exception:
                        retry += 1
            e_df = e_df[0].iloc[:, [1, 2, 3, 5]]
            e_df.columns = ['Sg', 'Zg', 'Fh', 'Date']
            e_df[['Sg', 'Zg', 'Fh'
                  ]] = e_df[['Sg', 'Zg', 'Fh']].astype(float).fillna(0) / 10
            # getting the date type instead of datetime.datetime, refer to http://stackoverflow.com/a/34277514/4229125
            e_df['Date'] = pd.to_datetime(e_df['Date'],
                                          errors='coerce').dt.date
            e_df.loc[:, 'Ps'] = e_df['Sg'] + e_df['Zg']
            e_df = e_df[constant.fhps_col_names].set_index(
                'Date', drop=False).sort_index()
            e_df = e_df[e_start:e_end]

        if i_end:
            db_accessor = DatabaseAccessor()
            i_df = db_accessor.get_fhps(code, i_start, i_end)

        df = pd.concat([i_df, e_df]).sort_index()
        return df
Example #11
    def get_hq(self, code, start_date=None, end_date=None, retry_times=3):

        i_start, i_end, e_start, e_end = utility.get_dates(
            code, self.accessor_name, start_date, end_date)
        e_df = None
        i_df = None

        if e_start:
            hq_url = self.hq_baseURL.format(code, e_start, e_end)
            retry = 0
            while retry < retry_times:
                r = requests.get(hq_url)
                if r.status_code != 200:
                    retry += 1
                else:
                    retry = retry_times + 1  # this means read successfully
            if retry != retry_times:
                page = r.content[1:-2]
                page_io = StringIO(page)
                data = json.load(page_io)
                e_df = pd.DataFrame(data['hq']).iloc[:, [0, 1, 6, 2, 5, 7, 8]]
                e_df.iloc[:, 0] = pd.to_datetime(e_df.iloc[:, 0],
                                                 errors='coerce').dt.date
                for i in range(1, len(e_df.columns)):
                    e_df.iloc[:, i] = e_df.iloc[:, i].astype(
                        constant.hq_datatypes[i])
                e_df.columns = constant.hq_col_names
                e_df = e_df.set_index('Date', drop=False).sort_index()
                e_df = utility.clean_hq_df(e_df)
                e_df = e_df[e_start:e_end]

        if i_end:
            db_accessor = DatabaseAccessor()
            i_df = db_accessor.get_hq(code, i_start, i_end)

        df = pd.concat([i_df, e_df]).sort_index()
        return df
Example #12
class Crawler(BaseLogger):
    def __init__(self, log_level=None):
        BaseLogger.__init__(self, self.__class__.__name__, log_level)
        self._db_conn = DatabaseAccessor()
        self._log_info("crawler start @%s", node())

    def process(self):
        status = False
        job = self._db_conn.queue_crawl_take()
        if job != None:
            url = job['url']
            self._log_info("start to crawl %s", url)
            retry_times = config_crawl_retry
            while retry_times > 0:
                text = self._crawl_page(url)
                if text == None:
                    retry_times -= 1
                else:
                    retry_times = 0
            if text == None:
                self._log_warning("fail to crawl %s after %d attempts", url,
                                  config_crawl_retry)
                if not self._db_conn.queue_crawl_fail(url):
                    self._log_warning(
                        "fail to mark %s as 'fail' in queue_crawl", url)
            else:
                self._log_info("finish crawling %s, response length: %d", url,
                               len(text))
                if not self._db_conn.queue_page_create(url, text):
                    self._log_warning(
                        "fail to add %s as 'new' job in queue_page", url)
                if not self._db_conn.queue_crawl_done(url):
                    self._log_warning(
                        "fail to mark %s as 'done' in queue_crawl", url)
                status = True
        else:
            self._log_warning("grab no jobs to crawl")
        return status

    def _crawl_page(self, url):
        try:
            resp = get(url, timeout=config_crawl_timeout)
            if resp.status_code == codes.ok:
                return resp.text
        except Exception as e:
            pass

    def close(self):
        self._db_conn.close()
        self._log_info("crawler exit")
        self._close_logger()
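
process() returns False both when the crawl fails and when the queue is empty, so a driver typically loops until a few consecutive empty rounds; a hypothetical sketch (the module name "crawler" and the stop condition are assumptions):

# Hypothetical driver loop; module name "crawler" and max_idle_rounds are assumptions.
from crawler import Crawler

def run_crawler(max_idle_rounds=3):
    crawler = Crawler()
    idle = 0
    try:
        while idle < max_idle_rounds:
            if crawler.process():   # True only when a queued URL was crawled and stored
                idle = 0
            else:
                idle += 1
    finally:
        crawler.close()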
Example #13
class Crawler(BaseLogger):
    def __init__(self, log_level=None):
        BaseLogger.__init__(self, self.__class__.__name__, log_level)
        self._db_conn = DatabaseAccessor()
        self._log_info("crawler start @%s", node())


    def process(self):
        status = False
        job = self._db_conn.queue_crawl_take()
        if job != None:
            url = job['url']
            self._log_info("start to crawl %s", url)
            retry_times = config_crawl_retry
            while retry_times > 0:
                text = self._crawl_page(url)
                if text == None:
                    retry_times -= 1
                else:
                    retry_times = 0
            if text == None:
                self._log_warning("fail to crawl %s after %d attempts", url, config_crawl_retry)
                if not self._db_conn.queue_crawl_fail(url):
                    self._log_warning("fail to mark %s as 'fail' in queue_crawl", url)
            else:
                self._log_info("finish crawling %s, response length: %d", url, len(text))
                if not self._db_conn.queue_page_create(url, text):
                    self._log_warning("fail to add %s as 'new' job in queue_page", url)
                if not self._db_conn.queue_crawl_done(url):
                    self._log_warning("fail to mark %s as 'done' in queue_crawl", url)
                status = True
        else:
            self._log_warning("grab no jobs to crawl")
        return status


    def _crawl_page(self, url):
        try:
            resp = get(url, timeout=config_crawl_timeout)
            if resp.status_code == codes.ok:
                return resp.text
        except Exception as e:
            pass


    def close(self):
        self._db_conn.close()
        self._log_info("crawler exit")
        self._close_logger()
Example #14
import os
import sys

THIS_FOLDER = os.path.dirname(os.path.abspath(__file__))
sys.path.append(THIS_FOLDER + '/../../database-layer')
sys.path.append(THIS_FOLDER + '/../utilities')
sys.path.append(THIS_FOLDER + '/../models')

from TokenChecker import TokenChecker
from DatabaseAccessor import DatabaseAccessor
from User import User

DBA = DatabaseAccessor()


class TokenCheckHandler:
    def __init__(self):
        self.tokenChecker = TokenChecker()

    def getJsonResponse(self, isUserAuthorized, userId, responseCode):
        return {
            'is user authorized': isUserAuthorized,
            'user id': userId,
            'response code': responseCode
        }

    def handleTokenCheck(self, authToken):
        tokenPayload = self.tokenChecker.getTokenPayload(str(authToken))

        if tokenPayload is False:
            return self.getJsonResponse(False, 'unauthorized', 401)
Example #15
 def __init__(self, log_level=None):
     BaseLogger.__init__(self, self.__class__.__name__, log_level)
     self._db_conn = DatabaseAccessor()
     self._log_info("exporter start @%s", node())
Example #16
 def read_all_profile(self):
     with closing(DatabaseAccessor()) as dal:
         pprint(dal.profile_read())
Example #17
 def clear_queue_crawl_page_profile(self):
     with closing(DatabaseAccessor()) as dal:
         print("clear crawl - {}".format(dal.queue_crawl_clear()))
         print("clear page - {}".format(dal.queue_page_clear()))
         print("clear profile - {}".format(dal.profile_clear()))
Example #18
 def add_urls_to_queue_crawl(self, urls):
     with closing(DatabaseAccessor()) as dal:
         for url in urls:
             print("add {} - {}".format(url, dal.queue_crawl_create(url)))
Example #19
class ParserProfile(BaseLogger):
    def __init__(self, log_level=None):
        BaseLogger.__init__(self, self.__class__.__name__, log_level)
        self._db_conn = DatabaseAccessor()
        self._log_info("profile parser start @%s", node())

    def process(self):
        status_profile, status_like = False, False
        job = self._db_conn.queue_page_take_profile()
        if job != None:
            url = job['url']
            text = job.get('text', "")
            profile, like = self._parse_profile_and_like(url, text)
            self._log_info(
                "parse profile page: %s, items count: { profile: %d, like: %d }",
                url, len(profile), len(like))
            if profile:
                if not self._db_conn.profile_create(profile):
                    self._log_warning("fail to add profile of %s in database",
                                      url)
                if not self._db_conn.queue_page_done_profile(url):
                    self._log_warning(
                        "fail to mark %s as 'done_profile' in queue_page", url)
                status_profile = True
            if like:
                for key in like:
                    if not self._db_conn.queue_crawl_create(like[key]):
                        self._log_warning(
                            "fail to add %s as 'new' job in queue_crawl",
                            like[key])
                status_like = True
        else:
            self._log_warning("grab no profile pages to parse")
            sleep(config_idle_sleep)
        return status_profile, status_like

    def _parse_profile_and_like(self, url, text):
        profile = {}
        like = {}
        soup = BeautifulSoup(text)
        profile["url"] = url
        profile["login"] = self._parse_tag_text_by_itemprop(
            soup, "additionalName")
        profile["name"] = self._parse_tag_text_by_itemprop(soup, "name")
        profile["company"] = self._parse_tag_text_by_itemprop(soup, "worksFor")
        profile["location"] = self._parse_tag_text_by_itemprop(
            soup, "homeLocation")
        profile["blog"] = self._parse_tag_text_by_itemprop(soup, "url")
        profile["email"] = self._parse_tag_text_by_itemprop(soup, "email")
        profile["join_at"] = self._parse_tag_datetime_by_class(
            soup, "join-date")
        profile["follower"], like["follower"] = self._parse_tag_count_and_link(
            soup, "Follower")
        profile["following"], like[
            "following"] = self._parse_tag_count_and_link(soup, "Following")
        profile["starred"], _ = self._parse_tag_count_and_link(soup, "Starred")
        return self._purge_data_dict(profile), self._purge_data_dict(
            like, config_parse_domain)

    def _purge_data_dict(self, data, prefix=None):
        purged = {}
        for key in data:
            if data[key] != None and len(data[key].strip()) > 0:
                if prefix == None:
                    purged[key] = data[key].strip()
                else:
                    purged[key] = prefix + data[key].strip()
        return purged

    def _parse_tag_text_by_itemprop(self, soup, item_name):
        tags = soup.find_all(itemprop=item_name)
        if len(tags) > 0:
            return tags[0].text

    def _parse_tag_string_by_class(self, soup, class_name):
        tags = soup.find_all(class_=class_name)
        if len(tags) > 0:
            return tags[0].string

    def _parse_tag_datetime_by_class(self, soup, class_name):
        tags = soup.find_all(class_=class_name)
        if len(tags) > 0:
            return tags[0].get('datetime')

    def _parse_tag_count_and_link(self, soup, text):
        tags = soup.find_all(class_="vcard-stat")
        count = None
        link = None
        for tag in tags:
            if text in tag.find(class_="text-muted").text:
                count = tag.find(class_="vcard-stat-count").text
                link = tag.get('href')
                break
        return count, link

    def close(self):
        self._db_conn.close()
        self._log_info("profile parser exit")
        self._close_logger()
Example #20
    def get_hq_full(self, code, start_date=None, end_date=None, retry_times=3):
        """
        This method is too slow; by default it is not used.
        :param code:
        :param start_date:
        :param end_date:
        :param retry_times:
        :return:
        """
        i_start, i_end, e_start, e_end = utility.get_dates(
            code, self.accessor_name, start_date, end_date)
        e_df = None
        i_df = None
        retry = 0
        available_year_list = []

        if e_start:
            while retry < retry_times:
                with utility.open_phantomJS_driver() as driver:
                    try:
                        e_df_list = []
                        year_start = e_start.year
                        year_end = e_end.year
                        jidu_start = (e_start.month - 1) // 3 + 1  # integer quarter index (1-4)
                        jidu_end = (e_end.month - 1) // 3 + 1
                        year = year_end + 1
                        while year > year_start:
                            year -= 1
                            for jidu in range(4, 0, -1):
                                if year > year_end or (year == year_end
                                                       and jidu > jidu_end):
                                    continue
                                if year < year_start or (year == year_start and
                                                         jidu < jidu_start):
                                    continue
                                url = self.hq_full_baseURL.format(
                                    code, year, jidu)
                                driver.get(url)
                                if not available_year_list:
                                    # need find the available years first
                                    year_list_option = WebDriverWait(
                                        driver, 30).until(
                                            EC.presence_of_element_located(
                                                (By.XPATH,
                                                 self.hq_year_list_xpath)))
                                    soup = BeautifulSoup(
                                        year_list_option.get_attribute(
                                            'innerHTML'))
                                    for available_year in soup.find_all(
                                            name='option'):
                                        available_year_list.append(
                                            int(available_year.text))
                                    if year_start < min(available_year_list):
                                        year_start = min(available_year_list)
                                        jidu_start = 1
                                    if year_end > max(available_year_list):
                                        year_end = max(available_year_list)
                                        jidu_end = 4
                                        continue
                                d_table = WebDriverWait(driver, 30).until(
                                    EC.presence_of_element_located(
                                        (By.XPATH, self.hq_table_xpath)))
                                cur_e_df = pd.read_html(
                                    d_table.get_attribute('outerHTML'),
                                    header=1)[0]
                                cur_e_df.columns = constant.hq_col_names
                                cur_e_df['Date'] = pd.to_datetime(
                                    cur_e_df['Date'], errors='coerce').dt.date
                                for i in range(1, len(cur_e_df.columns)):
                                    cur_e_df.iloc[:,
                                                  i] = cur_e_df.iloc[:, i].astype(
                                                      constant.hq_datatypes[i])
                                cur_e_df = cur_e_df.set_index(
                                    'Date', drop=False).sort_index()
                                e_df_list.append(cur_e_df)
                        retry = retry_times + 1
                    except Exception:
                        retry += 1
            e_df = pd.concat(e_df_list).sort_index()
            e_df = utility.clean_hq_df(e_df)
            e_df = e_df[e_start:e_end]

        if i_end:
            db_accessor = DatabaseAccessor()
            i_df = db_accessor.get_fhps(code, i_start, i_end)

        df = pd.concat([i_df, e_df]).sort_index()
        return df
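
The nested year/quarter ("jidu") walk above visits quarters from the newest to the oldest within [e_start, e_end]; a compact standalone sketch of the same bounds logic:

from datetime import date

def quarters_desc(start, end):
    # Yield (year, quarter) pairs from `end` back to `start`, mirroring the loop above.
    q_start = (start.month - 1) // 3 + 1
    q_end = (end.month - 1) // 3 + 1
    for year in range(end.year, start.year - 1, -1):
        for q in range(4, 0, -1):
            if (year == end.year and q > q_end) or (year == start.year and q < q_start):
                continue
            yield year, q

print(list(quarters_desc(date(2016, 11, 1), date(2017, 5, 1))))
# [(2017, 2), (2017, 1), (2016, 4)]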
Example #21
class Assigner(BaseLogger):
    def __init__(self, log_level=None):
        BaseLogger.__init__(self, self.__class__.__name__, log_level)
        self._db_conn = DatabaseAccessor()
        self._log_info("assigner start @%s", node())


    def close(self):
        self._db_conn.close()
        self._log_info("assigner exit")
        self._close_logger()


    def process(self):
        url = None
        job = self._db_conn.queue_page_take_raw()
        if job != None:
            url = job['url']
            text = job.get('text', "")
            parse_result = self._parse_raw_page(url, text)
            if parse_result == None:
                self._log_warning("fail to parse '%s' as JSON in queue_page", url)
                if not self._db_conn.queue_page_fail_raw(url):
                    self._log_warning("fail to mark %s as 'fail' in queue_page", url)
            else:
                if parse_result[0] == None:
                    self._log_warning("'%s' in queue_page indicates no more new content", url)
                else:
                    self._log_info("%s indicates new crawling job: %s", url, parse_result[0])
                    if not self._db_conn.queue_crawl_create(parse_result[0]):
                        self._log_warning("fail to add %s as 'new' job in queue_crawl", parse_result[0])
                if parse_result[1] == None:
                    self._log_warning("'%s' in queue_page contains on content", url)
                else:
                    self._log_info("%s contains %d raw snippets", url, len(parse_result[1]))
                    if not self._db_conn.queue_page_done_raw(url, parse_result[1]):
                        self._log_warning("fail to append parsed data for %s in queue_crawl", url)
        else:
            self._log_warning("grab no jobs to assign")
            sleep(config_idle_sleep)
        return url


    def _parse_raw_page(self, url, text):
        try:
            page_content = loads(text)
            url_new, data_new = None, None
            if (page_content["data"]["has_more"]) and (page_content["data"]["max_time"] > config_crawl_date_min):
                url_new = sub(r"=(\d*)$", r"=" + str(page_content["data"]["max_time"]), url)
            if len(page_content["data"]["data"]) > 0:
                data_new = page_content["data"]["data"]
            result = (url_new, data_new)
            self._log_info(
                "%s data status - more: %s, min: %d, max: %d",
                url,
                page_content["data"]["has_more"],
                page_content["data"]["min_time"],
                page_content["data"]["max_time"])
        except Exception as e:
            result = None
        return result
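
_parse_raw_page advances the crawl by rewriting the trailing timestamp in the URL with the page's max_time. The sketch below shows just that regex step; the query-string layout and the timestamps are made up for illustration:

from re import sub

url = "http://neihanshequ.com/joke/?is_json=1&max_time=1490010000"  # assumed URL shape
new_max_time = 1490009999   # would come from page_content["data"]["max_time"]
print(sub(r"=(\d*)$", "=" + str(new_max_time), url))
# http://neihanshequ.com/joke/?is_json=1&max_time=1490009999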
Example #22
 def __init__(self, host=constant.host, port=constant.port):
     self.c = pymongo.MongoClient(host=host, port=port)
     self.db_accessor = DatabaseAccessor()
Example #23
 def clear_queue_crawl_page_snippet(self):
     print("clear_queue_crawl_page_snippet")
     with closing(DatabaseAccessor()) as dal:
         print("clear crawl - {}".format(dal.queue_crawl_clear()))
         print("clear page - {}".format(dal.queue_page_clear()))
         print("clear snippet - {}".format(dal.snippet_clear()))
Example #24
class WatchDog(BaseLogger):
    def __init__(self, log_level=None):
        BaseLogger.__init__(self, self.__class__.__name__, log_level)
        self._db_conn = DatabaseAccessor()
        if not isdir(config_report_folder):
            makedirs(config_report_folder)
            self._log_info("create folder of charts: %s", config_report_folder)
        self._log_info("watchdog start @%s", node())

    def process(self):
        time_start = time()
        data = self._update_data()
        self._draw_charts_with_data(data)
        time_end = time()
        sleep(max(0, config_report_interval - (time_end - time_start)))
        return

    def _update_data(self):
        data = self._load_data()
        self._log_info("load existing data, count: %d", len(data))

        time_start = time()
        status = {
            "crawl_all": self._db_conn.queue_crawl_count(),
            "crawl_new": self._db_conn.queue_crawl_count("new"),
            "crawl_fail": self._db_conn.queue_crawl_count("fail"),
            "crawl_done": self._db_conn.queue_crawl_count("done"),
            "page_all": self._db_conn.queue_page_count(),
            "page_new": self._db_conn.queue_page_count("new"),
            "page_profile": self._db_conn.queue_page_count("profile"),
            "page_profile_done":
            self._db_conn.queue_page_count("done_profile"),
            "page_follow": self._db_conn.queue_page_count("follow"),
            "page_follow_done": self._db_conn.queue_page_count("done_follow"),
            "page_unknown": self._db_conn.queue_page_count("unknown"),
            "profile": self._db_conn.profile_count(),
            "profile_email": self._db_conn.profile_count("email"),
        }
        time_end = time()
        status["duration"] = time_end - time_start
        status["date"] = datetime.utcnow()

        data.append(status)
        self._save_data(data)
        self._log_info("save existing data, count: %d", len(data))
        self._log_info(
            dumps(status, sort_keys=True, indent=4, default=json_util.default))
        return data

    def _load_data(self, filename=config_report_status):
        data = []
        if isfile(filename):
            try:
                data_file = open(filename, 'r')
                content = data_file.read()
                data_file.close()
                data = loads(content, object_hook=json_util.object_hook)
            except Exception as e:
                self._log_exception("fail to load json file: %s", filename)
        else:
            self._log_warning("fail to find json file: %s", filename)
        return data

    def _save_data(self, data, filename=config_report_status):
        output_file = open(filename, 'w')
        output_file.write(
            dumps(data, sort_keys=True, indent=4, default=json_util.default))
        output_file.close()

    def _draw_charts_with_data(self, data):
        chart_size_methods = [
            self._draw_size_chart_summary, self._draw_size_chart_crawl,
            self._draw_size_chart_page, self._draw_size_chart_profile
        ]
        chart_delta_methods = [
            self._draw_delta_chart_summary, self._draw_delta_chart_page
        ]
        pos_start = config_report_item * config_report_step - config_report_step + 1
        if len(data) > pos_start:
            data_render = data[-pos_start::config_report_step]
        else:
            data_render = data[-config_report_item:]
        for method in chart_size_methods:
            result = method(data_render)
            self._log_info("save chart as %s", result)
        for method in chart_delta_methods:
            result = method(data[-config_report_item - 1:])
            self._log_info("save chart as %s", result)

    def _get_stackedline_with_style(self):
        dark_rotate_style = RotateStyle('#9e6ffe')
        return StackedLine(fill=True,
                           disable_xml_declaration=True,
                           include_x_axis=False,
                           human_readable=True,
                           interpolate='hermite',
                           style=dark_rotate_style)

    def _get_line_with_style(self):
        dark_rotate_style = RotateStyle('#9e6ffe')
        return Line(fill=False,
                    disable_xml_declaration=True,
                    include_x_axis=False,
                    human_readable=True,
                    interpolate='hermite',
                    style=dark_rotate_style)

    def _extract_list(self, data, field):
        return [item[field] for item in data]

    def _extract_date_list(self, data_list):
        return [
            data["date"].replace(tzinfo=timezone.utc).astimezone(
                tz=None).strftime("%H:%M") for data in data_list
        ]

    def _draw_size_chart_summary(self, data, filename="size_summary.svg"):
        filename = join(config_report_folder, filename)
        list_crawl = self._extract_list(data, "crawl_all")
        list_page = self._extract_list(data, "page_all")
        list_profile = self._extract_list(data, "profile")
        chart = self._get_line_with_style()
        chart.title = 'Queue Size Summary'
        chart.x_labels = self._extract_date_list(data)
        chart.add('Profile', list_profile)
        chart.add('Page', list_page)
        chart.add('Crawl', list_crawl)
        chart.render_to_file(filename)
        return filename

    def _draw_size_chart_crawl(self, data, filename="size_crawl.svg"):
        filename = join(config_report_folder, filename)
        list_all = self._extract_list(data, "crawl_all")
        list_new = self._extract_list(data, "crawl_new")
        list_fail = self._extract_list(data, "crawl_fail")
        list_done = [
            list_all[i] - list_new[i] - list_fail[i]
            for i in range(len(list_all))
        ]
        chart = self._get_stackedline_with_style()
        chart.title = 'Size of Queue Crawl'
        chart.x_labels = self._extract_date_list(data)
        chart.add('Done', list_done)
        chart.add('Todo', list_new)
        chart.render_to_file(filename)
        return filename

    def _draw_size_chart_page(self, data, filename="size_page.svg"):
        filename = join(config_report_folder, filename)
        list_all = self._extract_list(data, "page_all")
        list_new = self._extract_list(data, "page_new")
        list_profile = self._extract_list(data, "page_profile")
        list_follow = self._extract_list(data, "page_follow")
        list_unknown = self._extract_list(data, "page_unknown")
        list_done = [
            list_all[i] - list_new[i] - list_profile[i] - list_follow[i] -
            list_unknown[i] for i in range(len(list_all))
        ]
        chart = self._get_stackedline_with_style()
        chart.title = 'Size of Queue Page'
        chart.x_labels = self._extract_date_list(data)
        chart.add('Done', list_done)
        chart.add('Profile', list_profile)
        chart.add('Follow', list_follow)
        chart.add('Todo', list_new)
        chart.render_to_file(filename)
        return filename

    def _draw_size_chart_profile(self, data, filename="size_profile.svg"):
        filename = join(config_report_folder, filename)
        list_all = self._extract_list(data, "profile")
        list_email = self._extract_list(data, "profile_email")
        list_other = [
            list_all[i] - list_email[i] for i in range(len(list_all))
        ]
        chart = self._get_stackedline_with_style()
        chart.title = 'Size of Queue Profile'
        chart.x_labels = self._extract_date_list(data)
        chart.add('Other', list_other)
        chart.add('Email', list_email)
        chart.render_to_file(filename)
        return filename

    def _get_delta_list(self, data, field):
        value_list = self._extract_list(data, field)
        for i in range(len(value_list) - 1, 0, -1):
            value_list[i] -= value_list[i - 1]
        return value_list[1:]

    def _draw_delta_chart_summary(self, data, filename="delta_summary.svg"):
        filename = join(config_report_folder, filename)
        list_crawl = self._get_delta_list(data, "crawl_all")
        list_page = self._get_delta_list(data, "page_all")
        list_profile = self._get_delta_list(data, "profile")
        chart = self._get_line_with_style()
        chart.title = 'Queue Size Increase Summary'
        chart.x_labels = self._extract_date_list(data[1:])
        chart.add('Profile', list_profile)
        chart.add('Page', list_page)
        chart.add('Crawl', list_crawl)
        chart.render_to_file(filename)
        return filename

    def _draw_delta_chart_page(self, data, filename="delta_page.svg"):
        filename = join(config_report_folder, filename)
        list_all = self._get_delta_list(data, "page_all")
        list_fail = self._get_delta_list(data, "crawl_fail")
        list_follow = self._get_delta_list(data, "page_follow_done")
        list_profile = self._get_delta_list(data, "page_profile_done")
        chart = self._get_line_with_style()
        chart.title = 'Queue Page Size Increase'
        chart.x_labels = self._extract_date_list(data[1:])
        chart.add('Profile', list_profile)
        chart.add('Follow', list_follow)
        chart.add('Failed Crawl', list_fail)
        chart.add('New Crawl', list_all)
        chart.render_to_file(filename)
        return filename

    def close(self):
        self._db_conn.close()
        self._log_info("watchdog exit")
        self._close_logger()
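
_get_delta_list turns the cumulative counters collected in _update_data into per-interval increments (dropping the first sample). A standalone sketch operating on a plain list instead of the status dicts:

def get_delta_list(values):
    # Convert cumulative counts into per-interval increments; the first point is dropped.
    deltas = list(values)
    for i in range(len(deltas) - 1, 0, -1):
        deltas[i] -= deltas[i - 1]
    return deltas[1:]

print(get_delta_list([10, 15, 18, 30]))  # [5, 3, 12]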
Example #25
class Parser(BaseLogger):
    def __init__(self, log_level=None):
        BaseLogger.__init__(self, self.__class__.__name__, log_level)
        self._db_conn = DatabaseAccessor()
        self._log_info("parser start @%s", node())

    def close(self):
        self._db_conn.close()
        self._log_info("parser exit")
        self._close_logger()

    def process(self):
        count_valid = None
        count_duplicate = None
        job = self._db_conn.queue_page_take_data()
        if job != None:
            url = job['url']
            data_list = job.get('data', [])
            self._log_info("parse json data from %s, items count %d", url,
                           len(data_list))
            count_valid = 0
            count_duplicate = 0
            for data_index, data_item in enumerate(data_list):
                snippet = self._extract_snippet_record(url, data_item)
                if snippet == None:
                    self._log_warning(
                        "fail to extract #%d record of '%s' json data in queue_page",
                        data_index, url)
                else:
                    if not self._db_conn.snippet_create(snippet):
                        count_duplicate += 1
                        # self._log_warning("fail to add new snippet %s", snippet["url"])
                    else:
                        count_valid += 1
            self._log_info(
                "extract %d valid & %d duplicate snippets from %s json data",
                count_valid, count_duplicate, url)
            if not self._db_conn.queue_page_done_data(url):
                self._log_warning("fail to mark %s as 'done' in queue_crawl",
                                  url)
        else:
            self._log_warning("grab no json data to parse")
            sleep(config_idle_sleep)
        return (count_valid, count_duplicate)

    def _extract_snippet_record(self, url, data):
        try:
            snippet = {
                "url": config_parse_domain + str(data["group"]["group_id"]),
                "date": datetime.fromtimestamp(data["group"]["create_time"]),
                "content": data["group"]["content"],
                "archive": data,
                "source": url.split("?")[0],
                "source_name": data["group"]["category_name"],
            }
            snippet["count"] = {
                "digg": data["group"]["digg_count"],
                "bury": data["group"]["bury_count"],
                "favorite": data["group"]["favorite_count"],
                "comment": data["group"]["comment_count"],
            }
            if len(data["comments"]) > 0:
                comment_text = []
                comment_digg = []
                for comment in data["comments"]:
                    comment_text.append(comment["text"])
                    comment_digg.append(comment["digg_count"])
                snippet["comments"] = comment_text
                snippet["count"]["commdigg"] = comment_digg
            if len(snippet["content"].strip()) == 0:
                snippet = None
        except Exception as e:
            snippet = None
        return snippet
Example #26
 def __init__(self, log_level=None):
     BaseLogger.__init__(self, self.__class__.__name__, log_level)
     self._db_conn = DatabaseAccessor()
     self._log_info("crawler start @%s", node())
Example #27
    def get_fhps(self, code, start_date=None, end_date=None, retry_times=3):
        i_start, i_end, e_start, e_end = utility.get_dates(
            code, self.accessor_name, start_date, end_date)
        e_df = None
        i_df = None
        url = self.fhps_baseURL.format(code)
        table = None
        retry = 0

        if e_start:
            while retry < retry_times:
                with utility.open_phantomJS_driver() as driver:
                    try:
                        driver.get(url)
                        d_table = WebDriverWait(driver, 30).until(
                            EC.presence_of_element_located(
                                (By.XPATH, self.fhps_table_xpath)))
                        table = d_table.text
                        if not table:
                            retry += 1
                        else:
                            retry = retry_times + 1
                    except Exception:
                        retry += 1

            if retry != retry_times:
                line_iter = iter(table.split('\n'))
                records = []
                while True:
                    record = dict()
                    date_m = None
                    try:
                        line = next(line_iter)
                        tokens = line.strip().split()
                        if len(tokens) == 1:
                            continue
                        if tokens[0] != u'除权除息日':
                            continue
                        if len(tokens) == 2:
                            next_line = next(line_iter)
                            date_m = self.date_re.search(next_line)
                        if not date_m:
                            date_m = self.date_re.search(line)
                        zg_m = self.zg_re.search(line)
                        sg_m = self.sg_re.search(line)
                        fh_m = self.fh_re.search(line)
                        record['Date'] = date_parser.parse(
                            date_m.group(1)).date()
                        if zg_m:
                            record['Zg'] = float(zg_m.group(1)) / 10.
                        if sg_m:
                            record['Sg'] = float(sg_m.group(1)) / 10.
                        if fh_m:
                            record['Fh'] = float(fh_m.group(1)) / 10.
                        records.append(record)
                    except StopIteration:
                        break

                if records:
                    e_df = pd.DataFrame(records)
                    for field in ['Fh', 'Zg', 'Sg']:
                        if field not in e_df:
                            e_df.loc[:, field] = 0.0
                    e_df.loc[:, ['Fh', 'Zg', 'Sg'
                                 ]] = e_df.loc[:,
                                               ['Fh', 'Zg', 'Sg']].fillna(0.0)
                    e_df.loc[:, 'Ps'] = e_df['Zg'] + e_df['Sg']
                    e_df = e_df[constant.fhps_col_names].set_index(
                        'Date', drop=False).sort_index()
                    e_df = e_df[e_start:e_end]
        if i_end:
            db_accessor = DatabaseAccessor()
            i_df = db_accessor.get_fhps(code, i_start, i_end)

        df = pd.concat([i_df, e_df]).sort_index()
        return df
Example #28
class Exporter(BaseLogger):
    def __init__(self, log_level=None):
        BaseLogger.__init__(self, self.__class__.__name__, log_level)
        self._db_conn = DatabaseAccessor()
        self._log_info("exporter start @%s", node())
        self._source_set_joke = [
            "http://neihanshequ.com/joke/",
            "http://neihanshequ.com/bar/1/",
            "http://neihanshequ.com/bar/11/",
            "http://neihanshequ.com/bar/76/",
            "http://neihanshequ.com/bar/80/",
            "http://neihanshequ.com/bar/82/",
            "http://neihanshequ.com/bar/59/",
            "http://neihanshequ.com/bar/5/",
        ]
        self._source_set_art = [
            "http://neihanshequ.com/bar/25/",
            "http://neihanshequ.com/bar/26/",
            "http://neihanshequ.com/bar/3/",
            "http://neihanshequ.com/bar/53/",
            "http://neihanshequ.com/bar/46/",
            "http://neihanshequ.com/bar/49/",
            "http://neihanshequ.com/bar/69/",
            "http://neihanshequ.com/bar/51/",
            "http://neihanshequ.com/bar/60/",
        ]


    def process(self):
        filelist = []
        data = self._db_conn.snippet_read()
        self._log_info("load all snippet data from database")

        filelist.append(self._save_as_json(data))
        filelist.append(self._save_as_csv(data))

        data_joke = self._select_data_column(data, self._source_set_joke)
        filelist.append(self._save_as_csv(data_joke, "snippet_joke.csv"))

        data_art = self._select_data_column(data, self._source_set_art)
        filelist.append(self._save_as_csv(data_art, "snippet_art.csv"))

        self._archive_into_zipfile(filelist)


    def _select_data_column(self, data_raw, source_set):
        data_new = []
        for item_raw in data_raw:
            if item_raw["source"] not in source_set:
                continue
            for index in range(max(1, len(item_raw.get("comments", [])))):
                item_new = {
                    "count_digg": item_raw["count"]["digg"],
                    "count_bury": item_raw["count"]["bury"],
                    "count_favorite": item_raw["count"]["favorite"],
                    "count_comment": item_raw["count"]["comment"],
                    "count_diggcomm": None,
                    "text": item_raw["content"],
                    "text_comment": None,
                    "source": item_raw["source_name"],
                }
                if "comments" in item_raw:
                    item_new["text_comment"] = item_raw["comments"][index]
                    item_new["count_diggcomm"] = item_raw["count"]["commdigg"][index]
                data_new.append(item_new)
        return data_new


    def _save_as_json(self, data, filename="snippet.json"):
        with open(filename, 'w') as jsonfile:
            for item in data:
                dump(item, jsonfile, sort_keys=True)
                jsonfile.write("\n")
        self._log_info("save %d items as json file: %s", len(data), filename)
        return filename


    def _save_as_csv(self, data, filename="snippet.csv"):
        fields = set()
        for item in data:
            fields = fields.union(set(item.keys()))
        with open(filename, 'w', encoding='utf8', newline='') as csvfile:
            writer = DictWriter(csvfile, extrasaction='ignore', dialect='excel', fieldnames=sorted(fields, reverse=False))
            writer.writeheader()
            for item in data:
                writer.writerow(item)
        self._log_info("save %d items as csv file: %s", len(data), filename)
        return filename


    def _archive_into_zipfile(self, filelist):
        zipname = "snippet_{}.zip".format(strftime("%Y-%m-%d_%H-%M-%S"))
        with ZipFile(zipname, 'w', ZIP_DEFLATED) as zip:
            for filename in filelist:
                zip.write(filename)
                remove(filename)
        self._log_info("archive exported files into %s", zipname)


    def close(self):
        self._db_conn.close()
        self._log_info("exporter exit")
        self._close_logger()
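
_select_data_column flattens each snippet into one CSV row per comment (or a single row when there are none). A standalone sketch of that flattening with made-up field values:

def flatten(item):
    # One output row per comment, mirroring the index loop in _select_data_column.
    rows = []
    for index in range(max(1, len(item.get("comments", [])))):
        row = {"text": item["content"], "text_comment": None, "count_diggcomm": None}
        if "comments" in item:
            row["text_comment"] = item["comments"][index]
            row["count_diggcomm"] = item["count"]["commdigg"][index]
        rows.append(row)
    return rows

print(flatten({"content": "joke", "comments": ["ha", "lol"], "count": {"commdigg": [4, 1]}}))
# two rows, one per comment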