Example #1
0
    def create_table(self, table_name, column_info, schema_name=None, database_name=None, drop_if_exists=False, **kwargs):
        """Create a table, optionally dropping an existing one first.

        Args:
            table_name (str): The name of the table.
            column_info: A list of dictionaries, each defining one
                ``column_name`` and its ``column_type``. One entry is read
                from every dictionary.
                Examples:
                ``[{"foo":"int"}, {"bar":"varchar(50)"}]``
            schema_name (str): The schema name (Can be ``None``, some databases
                have no schema). Default value is ``None``.
            database_name (str): The database name. Default value is ``None``;
                the effective name is the one returned by
                ``_get_table_path``.
            drop_if_exists (boolean): Whether to drop the table if the table
                already exists. If falsy and the table exists, the existing
                table is left untouched and no table is created. Default
                value is ``False``.
            **kwargs: Arbitrary key-value pairs, forwarded to ``sql_execute``.
                For example: ``verify=False`` means do not verify the SSL
                certificate when SSL is enabled. ``is_hive`` (default
                ``False``) is additionally passed to ``_get_table_path``.

        Returns:
            True if the ``CREATE TABLE`` statement was issued, False if the
            table already existed and ``drop_if_exists`` was falsy.
        """
        database_name, schema_name, table_path = self._get_table_path(table_name, schema_name, database_name, kwargs.get("is_hive", False))

        if self._table_exists(table_path, schema_name, database_name):
            if not drop_if_exists:
                logger.warning("%s already exists and drop_if_exists set to False, table will not be created", table_path)
                return False
            self.sql_execute("DROP TABLE %s" % table_path, schema_name, database_name, **kwargs)

        # list(...)[0] instead of items()[0]: dict views are not indexable on
        # Python 3, while this form behaves the same on Python 2.
        column_defs = ",\n".join("\t".join(list(column.items())[0]) for column in column_info)
        sql = "CREATE TABLE %s ( %s )" % (table_path, column_defs)
        self.sql_execute(sql, schema_name, database_name, **kwargs)
        return True
Example #2
0
 def update_car_by_license_plate(cls, obj):
     """Overwrite the license plate of the rows whose VIN equals ``obj.vin``.

     A warning naming the VIN, the new plate and the current user is logged
     first; the update is then applied through ``cls.db_session`` and
     committed.
     """
     message = f"Overwrite an existing VIN {obj.vin} license plate to {obj.license_plate} by {current_user.user_name}"
     logger.warning(message)

     matching_rows = cls.db_session.query(cls).filter(cls.vin == obj.vin)
     new_values = {cls.license_plate: obj.license_plate}
     matching_rows.update(new_values)
     cls.db_session.commit()
Example #3
0
    def get_followees(self, pid):
        """Collect the followee pages of ``pid`` into ``self.followee_list``.

        The first page is fetched repeatedly until a page count is obtained;
        along the way the account is banned when ``parse_uid`` yields -1 and
        reset when the parser reports a visitor page. Every further page is
        then fetched in turn and retried until it parses.

        :param pid: page id; its first six characters are also used for the
            ``from=page_`` query parameter
        :return: None
        """
        first_page_url = ('http://www.weibo.com/p/' + pid + '/follow?from=page_'
                          + pid[:6] + '&wvr=6&mod=headfollow#place')

        fee_page_num = None
        while fee_page_num is None:
            fetcher = self.fetchers[self.main_fetcher]
            html = open_url(fetcher, first_page_url)

            if self.parser.parse_uid(html) == -1:
                self.ban_account()
                continue
            if self.parser.is_visitor(html) is True:
                self.reset_account()
                continue

            fee_page_num = self.get_followee_page_num(html)
            if fee_page_num is None:
                log.warning('Cannot get followee page total number - pid:%s' %
                            (pid, ))
                time.sleep(
                    random.randint(Config.SLEEP_WHEN_EXCEPTION,
                                   2 * Config.SLEEP_WHEN_EXCEPTION))

        if fee_page_num == 0:
            print('He/She does not follow any one.')
            return

        print('Getting followee page 1 of %d...' % (fee_page_num, ))
        # followees cannot be None here since it's been tested in
        # self.get_followee_page_num(html) -> self.parser.parse_followee_page_num(html)
        first_page = self.parser.parse_followees(html, pid, datetime.now())
        self.followee_list.extend(first_page)
        if fee_page_num == 1:
            return

        for page_no in xrange(2, fee_page_num + 1):
            page_url = 'http://www.weibo.com/p/%s/follow?from=page_%s&wvr=6&mod=headfollow&page=%d#place' % (
                pid, pid[:6], page_no)
            followees = None
            while followees is None:
                print('Getting followee page %d of %d...' % (page_no,
                                                             fee_page_num))
                html = open_url(fetcher, page_url)
                time.sleep(
                    random.randint(Config.SLEEP_BETWEEN_2FPAGES,
                                   2 * Config.SLEEP_BETWEEN_2FPAGES))
                followees = self.parser.parse_followees(
                    html, pid, datetime.now())
                if followees is None:  # dirty html, wait and try this page again
                    log.warning(
                        'Cannot parse followee page correctly - pid:%s' %
                        (pid, ))
                    time.sleep(
                        random.randint(Config.SLEEP_WHEN_EXCEPTION,
                                       2 * Config.SLEEP_WHEN_EXCEPTION))
            self.followee_list.extend(followees)
Example #4
0
 def post_request(self, API):
     """Open ``API`` with ``urllib2.urlopen`` and decode the body as JSON.

     NOTE(review): no ``data`` argument is passed to ``urlopen``, so despite
     the method name this is presumably sent as a GET - confirm with callers.

     Args:
         API: The target handed to ``urllib2.urlopen`` (30 second timeout).

     Returns:
         tuple: ``(response.info(), decoded_json)`` on success, or
         ``(None, None)`` when opening the URL raises; that exception is
         logged as a warning.

     Raises:
         Exceptions raised while reading or decoding the body are not caught
         here and propagate to the caller.
     """
     try:
         response = urllib2.urlopen(API, timeout=30)
     except Exception as e:
         log.warning(e)
         return None, None
     try:
         headers = response.info()
         payload = json.loads(response.read())
     finally:
         # Release the connection even when reading or decoding fails.
         response.close()
     return headers, payload
Example #5
0
    def get_followees(self, pid):
        """Fetch all followee pages of ``pid`` and append the parsed followees
        to ``self.followee_list``.

        Page 1 is requested until ``get_followee_page_num`` returns a value
        (banning or resetting the account when the parser flags the page);
        pages 2..N are then requested one at a time, each retried until
        ``parse_followees`` returns something other than None.

        :param pid: page id of the user whose followees are collected
        :return: None
        """
        def nap(base):
            # sleep for a random whole number of seconds in [base, 2*base]
            time.sleep(random.randint(base, 2*base))

        url = ''.join(['http://www.weibo.com/p/', pid, '/follow?from=page_', pid[:6], '&wvr=6&mod=headfollow#place'])

        while True:
            fetcher = self.fetchers[self.main_fetcher]
            html = open_url(fetcher, url)

            if self.parser.parse_uid(html) == -1:
                self.ban_account()
            elif self.parser.is_visitor(html) is True:
                self.reset_account()
            else:
                fee_page_num = self.get_followee_page_num(html)
                if fee_page_num is not None:
                    break
                log.warning('Cannot get followee page total number - pid:%s' % (pid,))
                nap(Config.SLEEP_WHEN_EXCEPTION)

        if fee_page_num == 0:
            print('He/She does not follow any one.')
            return

        print('Getting followee page 1 of %d...' % (fee_page_num,))
        # the result cannot be None since it's been tested in self.get_followee_page_num(html)-> self.parser.parse_followee_page_num(html)
        self.followee_list.extend(self.parser.parse_followees(html, pid, datetime.now()))
        if fee_page_num == 1:
            return

        for page_no in xrange(2, fee_page_num+1):
            page_url = 'http://www.weibo.com/p/%s/follow?from=page_%s&wvr=6&mod=headfollow&page=%d#place' % (pid, pid[:6], page_no)
            while True:
                print('Getting followee page %d of %d...' % (page_no, fee_page_num))
                html = open_url(fetcher, page_url)
                nap(Config.SLEEP_BETWEEN_2FPAGES)
                parsed = self.parser.parse_followees(html, pid, datetime.now())
                if parsed is not None:
                    self.followee_list.extend(parsed)
                    break
                # dirty html: log it, back off and request the same page again
                log.warning('Cannot parse followee page correctly - pid:%s' % (pid,))
                nap(Config.SLEEP_WHEN_EXCEPTION)
Example #6
0
    def write_table(self,
                    data_frame,
                    table_name,
                    column_info=None,
                    schema_name=None,
                    database_name=None,
                    drop_if_exists=False,
                    append_if_exists=False,
                    limit=1000,
                    **kwargs):

        """Import the contents in a pandas DataFrame to the table in the
        database.

        The table is created through ``create_table`` first; the rows are then
        written with a single multi-row ``INSERT`` statement.

        Args:
            data_frame: The pandas DataFrame object.
            table_name (str): The table name.
            column_info: A list of dictionary defining the ``column_name`` and
                ``column_type``.
                Examples:
                ``[{"foo":"int"}, {"bar":"varchar(50)"}]``
                Default value is ``None``, in which case one entry per
                DataFrame column is derived from the column dtype with
                ``_transfer_dtype_to_db_type``.
            schema_name (str): The schema name (Can be ``None``, some databases
                have no schema). Default value is ``None``.
            database_name (str): The database name. Default value is ``None``.
            drop_if_exists (boolean): Whether to drop the table if the table
                already exists. Passed on to ``create_table``. Default
                value is ``False``.
            append_if_exists (boolean): Whether to insert the rows when
                ``create_table`` returned ``False``. If set to ``False`` in
                that case, nothing is written. Default value is ``False``.
            limit (int): The maximum number of rows of the DataFrame that are
                written into the table. Default value is ``1000``.
            **kwargs: Arbitrary key-value pairs. For example: ``verify=False``
                means do not verify the SSL certificate when SSL is enabled.

        Returns:
            None.
        """
        database_name, schema_name, table_path = self._get_table_path(table_name, schema_name, database_name, kwargs.get("is_hive", False))

        if column_info is None:
            # Iterating the column index directly avoids Index.get_values(),
            # which newer pandas releases no longer provide.
            column_info = [{name: self._transfer_dtype_to_db_type(dtype)}
                           for name, dtype in zip(data_frame.columns, data_frame.dtypes)]
        else:
            # The sequence is walked by create_table and again below, so a
            # one-shot iterable must be materialised first.
            column_info = list(column_info)

        created = self.create_table(table_name, column_info, schema_name, database_name, drop_if_exists, **kwargs)
        if created is False and append_if_exists is False:
            logger.warning("nothing write into %s, the append_if_exists is set to False", table_path)
            return

        rows = len(data_frame)
        # Only an empty frame has nothing to insert; a single-row frame is
        # valid data and must be written.
        if rows == 0:
            logger.warning("nothing write into %s, because no data found in dataframe", table_path)
            return

        if rows > limit:
            logger.warning("inserted data exceed %s, will only insert %s rows", limit, limit)

        # list(column)[0] is the dictionary's key; it works on both Python 2
        # and Python 3, unlike keys()[0].
        column_names = ",".join(list(column)[0] for column in column_info)
        # NOTE(review): values are interpolated into the statement without any
        # escaping, so a value containing a single quote breaks the SQL (and
        # untrusted data could inject SQL). Escaping rules depend on the
        # backend behind sql_execute, so this is flagged rather than changed.
        # row[0] is the DataFrame index and is skipped; truncating with iloc
        # avoids materialising every row of the frame first.
        value_rows = ",\n".join("('%s')" % "', '".join(map(str, row[1:]))
                                for row in data_frame.iloc[:limit].itertuples())
        sql = "INSERT INTO %s (%s) VALUES %s" % (table_path, column_names, value_rows)
        self.sql_execute(sql, schema_name, database_name, **kwargs)
Example #7
0
    def fetch_timelines_by_page_bar(self, uid, pnum, bnum):
        """
        Fetch one part ("bar") of a timeline page, retrying until a response
        has been handled.

        :param uid: value sent as the ``uid`` query parameter
        :param pnum: page number
        :param bnum: bar number; 0, 1 and 2 select dedicated paging
            parameters, any other value keeps the captured defaults
        :return: html containing timelines or None if there are no timelines
        """
        # Paging parameters; the fallback values are the captured defaults.
        count, pagebar, pre_page = 15, 1, 1
        if bnum == 0:
            count, pagebar, pre_page = '50', '', pnum-1
        elif bnum == 1:
            count, pagebar, pre_page = '15', '0', pnum
        elif bnum == 2:
            count, pagebar, pre_page = '15', '1', pnum

        # These values were obtained by packet capture: Sina Weibo loads the
        # timeline dynamically (waterfall flow), so a single request cannot
        # return all the information of a page.
        body = {
            '__rnd':1343647638078,
            '_k':1343647471134109,
            '_t':0,
            'count':count,
            'end_id':3473519214542343,
            'max_id':3473279479126179,
            'page':pnum,
            'pagebar':pagebar,
            'pre_page':pre_page,
            'uid':uid
        }

        url = 'http://weibo.com/aj/mblog/mbloglist?' + urllib.urlencode(body)
        while True:
            try:
                # bnum starts with zero up to two
                print('Getting timeline page %d part %d...' % (pnum, bnum+1))
                jsn_data = open_url(self.fetchers[self.main_fetcher], url)
                if self.parser.is_frozen(jsn_data):
                    self.ban_account()
                    continue

                html = json.loads(jsn_data)['data']
                return html if u'WB_feed_type SW_fun S_line2' in html else None
            except Exception as e:
                reason = e.message
                if 'No valid account!' in reason:
                    raise e
                if 'No JSON object could be decoded' in reason:
                    # the body was not JSON: a visitor page resets the
                    # account, anything else bans it
                    if self.parser.is_visitor(jsn_data) is True:
                        self.reset_account()
                    else:
                        self.ban_account()
                log.warning(reason)
                time.sleep(random.randint(Config.SLEEP_WHEN_EXCEPTION, 2*Config.SLEEP_WHEN_EXCEPTION))
Example #8
0
    def fetch_timelines_by_page_bar(self, uid, pnum, bnum):
        """
        Request a single bar of a timeline page and return its html.

        The request is repeated after every handled failure; only an
        exception whose message contains 'No valid account!' is re-raised.

        :param uid: user id placed in the query string
        :param pnum: page number
        :param bnum: bar number
        :return: html containing timelines or None if there are no timelines
        """
        # The query below comes from a captured request: Sina Weibo uses
        # waterfall-style dynamic loading, so one request cannot return
        # everything on a page.
        body = {
            '__rnd': 1343647638078,
            '_k': 1343647471134109,
            '_t': 0,
            'count': 15,
            'end_id': 3473519214542343,
            'max_id': 3473279479126179,
            'page': 1,
            'pagebar': 1,
            'pre_page': 1,
            'uid': uid
        }
        body['page'] = pnum

        # Bars 0..2 override the paging fields; other values keep the defaults.
        if bnum == 0:
            body.update(count='50', pagebar='', pre_page=pnum - 1)
        elif bnum == 1:
            body.update(count='15', pagebar='0', pre_page=pnum)
        elif bnum == 2:
            body.update(count='15', pagebar='1', pre_page=pnum)

        query = urllib.urlencode(body)
        url = 'http://weibo.com/aj/mblog/mbloglist?' + query
        while True:
            try:
                # bnum starts with zero up to two
                progress = 'Getting timeline page %d part %d...' % (
                    pnum, bnum + 1)
                print(progress)
                jsn_data = open_url(self.fetchers[self.main_fetcher], url)
                if self.parser.is_frozen(jsn_data):
                    self.ban_account()
                    continue

                data = json.loads(jsn_data)
                html = data['data']
                if u'WB_feed_type SW_fun S_line2' not in html:
                    return None
                return html
            except Exception as e:
                if 'No valid account!' in e.message:
                    raise e
                if 'No JSON object could be decoded' in e.message:
                    if self.parser.is_visitor(jsn_data) is True:
                        recover = self.reset_account
                    else:
                        recover = self.ban_account
                    recover()
                log.warning(e.message)
                delay = random.randint(Config.SLEEP_WHEN_EXCEPTION,
                                       2 * Config.SLEEP_WHEN_EXCEPTION)
                time.sleep(delay)