def create_table(self, table_name, column_info, schema_name=None, database_name=None, drop_if_exists=False, **kwargs):
    """Creates a table.

    Args:
        table_name (str): The name of the table.
        column_info: A list of single-entry dictionaries mapping ``column_name``
            to ``column_type``. Examples: ``[{"foo": "int"}, {"bar": "varchar(50)"}]``
        schema_name (str): The schema name (can be ``None``; some databases
            have no schema). Default value is ``None``.
        database_name (str): The database name. Default value is ``None``.
        drop_if_exists (boolean): Whether to drop the table if it already
            exists. If set to ``False`` and the table exists, the table is
            left untouched and nothing is created. Default value is ``False``.
        **kwargs: Arbitrary key-value pairs. For example: ``verify=False``
            means do not verify the SSL certificate when SSL is enabled.
            ``is_hive`` selects Hive-style path resolution.

    Returns:
        bool: True if the table was created, otherwise False.
    """
    database_name, schema_name, table_path = self._get_table_path(
        table_name, schema_name, database_name, kwargs.get("is_hive", False))
    if self._table_exists(table_path, schema_name, database_name):
        if drop_if_exists:
            sql = "DROP TABLE %s" % table_path
            self.sql_execute(sql, schema_name, database_name, **kwargs)
        else:
            logger.warning("%s already exists and drop_if_exists set to False, table will not be created" % table_path)
            return False
    # next(iter(col.items())) yields the (name, type) pair on both Python 2
    # and 3; the previous col.items()[0] breaks on Python 3, where items()
    # returns a non-subscriptable view.
    columns_sql = ",\n".join("\t".join(next(iter(col.items()))) for col in column_info)
    sql = "CREATE TABLE %s ( %s )" % (table_path, columns_sql)
    self.sql_execute(sql, schema_name, database_name, **kwargs)
    return True
def update_car_by_license_plate(cls, obj):
    """Persist a new license plate for the car row matching ``obj.vin``,
    logging which user performed the overwrite."""
    message = (
        f"Overwrite an existing VIN {obj.vin} license plate to "
        f"{obj.license_plate} by {current_user.user_name}"
    )
    logger.warning(message)
    matching_rows = cls.db_session.query(cls).filter(cls.vin == obj.vin)
    matching_rows.update({cls.license_plate: obj.license_plate})
    cls.db_session.commit()
def get_followees(self, pid):
    # Crawl every followee of the weibo profile `pid` and append the parsed
    # records to self.followee_list. Retries indefinitely when the account is
    # banned or served the anonymous "visitor" page, rotating/resetting the
    # account before trying again.
    url = 'http://www.weibo.com/p/' + pid + '/follow?from=page_' + pid[:6] + '&wvr=6&mod=headfollow#place'
    while True:
        fetcher = self.fetchers[self.main_fetcher]
        html = open_url(fetcher, url)
        uid = self.parser.parse_uid(html)
        if uid == -1:
            # Page indicates the current account is banned; switch and retry.
            self.ban_account()
            continue
        elif self.parser.is_visitor(html) is True:
            # Got the logged-out "visitor" page; re-login and retry.
            self.reset_account()
            continue
        fee_page_num = self.get_followee_page_num(html)
        if fee_page_num is not None:
            break
        else:
            log.warning('Cannot get followee page total number - pid:%s' % (pid, ))
            time.sleep(
                random.randint(Config.SLEEP_WHEN_EXCEPTION,
                               2 * Config.SLEEP_WHEN_EXCEPTION))
    if fee_page_num == 0:
        print 'He/She does not follow any one.'
        return
    else:
        # Page 1 was already fetched above; parse it before paging onward.
        print 'Getting followee page 1 of %d...' % (fee_page_num, )
        followees = self.parser.parse_followees(html, pid, datetime.now())
        self.followee_list.extend(
            followees
        )  # followees cannot be None since it's been tested in self.get_followee_page_num(html)-> self.parser.parse_followee_page_num(html)
        if fee_page_num == 1:
            return
    # Fetch the remaining pages 2..fee_page_num, retrying on dirty HTML.
    for i in xrange(2, fee_page_num + 1):
        while True:
            url = 'http://www.weibo.com/p/%s/follow?from=page_%s&wvr=6&mod=headfollow&page=%d#place' % (
                pid, pid[:6], i)
            print 'Getting followee page %d of %d...' % (i, fee_page_num)
            html = open_url(fetcher, url)
            time.sleep(
                random.randint(Config.SLEEP_BETWEEN_2FPAGES,
                               2 * Config.SLEEP_BETWEEN_2FPAGES))
            followees = self.parser.parse_followees(html, pid, datetime.now())
            if followees is None:  # dirty html
                log.warning(
                    'Cannot parse followee page correctly - pid:%s' % (pid, ))
                time.sleep(
                    random.randint(Config.SLEEP_WHEN_EXCEPTION,
                                   2 * Config.SLEEP_WHEN_EXCEPTION))
                continue
            self.followee_list.extend(followees)
            break
def post_request(self, API):
    """Request `API` over HTTP and return (headers, parsed JSON body).

    Returns (None, None) when opening the URL fails (the error is logged).
    JSON decode errors still propagate to the caller, as before.
    NOTE(review): despite the name, this issues a plain GET — confirm intent.
    """
    try:
        response = urllib2.urlopen(API, timeout=30)
    except Exception as e:
        log.warning(e)
        return None, None
    try:
        headers = response.info()
        payload = json.loads(response.read())
    finally:
        # The original never closed the response, leaking the connection.
        response.close()
    return headers, payload
def get_followees(self, pid):
    # Crawl every followee of the weibo profile `pid` and append the parsed
    # records to self.followee_list. Retries indefinitely when the account is
    # banned or served the anonymous "visitor" page, rotating/resetting the
    # account before trying again.
    url = 'http://www.weibo.com/p/' + pid + '/follow?from=page_' + pid[:6] + '&wvr=6&mod=headfollow#place'
    while True:
        fetcher = self.fetchers[self.main_fetcher]
        html = open_url(fetcher, url)
        uid = self.parser.parse_uid(html)
        if uid == -1:
            # Page indicates the current account is banned; switch and retry.
            self.ban_account()
            continue
        elif self.parser.is_visitor(html) is True:
            # Got the logged-out "visitor" page; re-login and retry.
            self.reset_account()
            continue
        fee_page_num = self.get_followee_page_num(html)
        if fee_page_num is not None:
            break
        else:
            log.warning('Cannot get followee page total number - pid:%s' % (pid,))
            time.sleep(random.randint(Config.SLEEP_WHEN_EXCEPTION, 2*Config.SLEEP_WHEN_EXCEPTION))
    if fee_page_num == 0:
        print 'He/She does not follow any one.'
        return
    else:
        # Page 1 was already fetched above; parse it before paging onward.
        print 'Getting followee page 1 of %d...' % (fee_page_num,)
        followees = self.parser.parse_followees(html, pid, datetime.now())
        self.followee_list.extend(followees)  # followees cannot be None since it's been tested in self.get_followee_page_num(html)-> self.parser.parse_followee_page_num(html)
        if fee_page_num == 1:
            return
    # Fetch the remaining pages 2..fee_page_num, retrying on dirty HTML.
    for i in xrange(2, fee_page_num+1):
        while True:
            url = 'http://www.weibo.com/p/%s/follow?from=page_%s&wvr=6&mod=headfollow&page=%d#place' % (pid, pid[:6], i)
            print 'Getting followee page %d of %d...' % (i, fee_page_num)
            html = open_url(fetcher, url)
            time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES, 2*Config.SLEEP_BETWEEN_2FPAGES))
            followees = self.parser.parse_followees(html, pid, datetime.now())
            if followees is None:  # dirty html
                log.warning('Cannot parse followee page correctly - pid:%s' % (pid,))
                time.sleep(random.randint(Config.SLEEP_WHEN_EXCEPTION, 2*Config.SLEEP_WHEN_EXCEPTION))
                continue
            self.followee_list.extend(followees)
            break
def write_table(self, data_frame, table_name, column_info=None, schema_name=None, database_name=None, drop_if_exists=False, append_if_exists=False, limit=1000, **kwargs):
    """Import the contents of a pandas DataFrame into a table in the database.

    Args:
        data_frame: The pandas DataFrame object.
        table_name (str): The table name.
        column_info: A list of single-entry dictionaries mapping
            ``column_name`` to ``column_type``. Examples:
            ``[{"foo": "int"}, {"bar": "varchar(50)"}]`` If ``None``, column
            types are derived from the DataFrame dtypes. Default is ``None``.
        schema_name (str): The schema name (can be ``None``; some databases
            have no schema). Default value is ``None``.
        database_name (str): The database name. Default value is ``None``.
        drop_if_exists (boolean): Whether to drop the table if it already
            exists. Default value is ``False``.
        append_if_exists (boolean): Whether to append to the table if it
            already exists. If ``False`` and the table exists, nothing is
            written. Default value is ``False``.
        limit (int): Maximum number of DataFrame rows written to the table.
            Default value is ``1000``.
        **kwargs: Arbitrary key-value pairs. For example: ``verify=False``
            means do not verify the SSL certificate when SSL is enabled.
    """
    database_name, schema_name, table_path = self._get_table_path(
        table_name, schema_name, database_name, kwargs.get("is_hive", False))
    if column_info is None:
        # Build a real list (not a map object): it is consumed twice below,
        # and a Python 3 map would be exhausted after the first pass.
        column_info = [{name: self._transfer_dtype_to_db_type(dtype)}
                       for name, dtype in zip(data_frame.columns, data_frame.dtypes)]
    flag = self.create_table(table_name, column_info, schema_name, database_name, drop_if_exists, **kwargs)
    if flag is False and append_if_exists is False:
        logger.warning("nothing write into %s, the append_if_exists is set to False" % table_path)
        return
    rows = len(data_frame)
    # Was `rows <= 1`, which silently dropped single-row DataFrames while
    # claiming "no data found"; only a truly empty frame should be skipped.
    if rows == 0:
        logger.warning("nothing write into %s, because no data found in dataframe" % table_path)
        return
    if rows > limit:
        logger.warning("inserted data exceed %s, will only insert %s rows" % (limit, limit))
    # NOTE(review): values are interpolated via str() and single quotes — not
    # safe against quoting/injection; acceptable only for trusted DataFrames.
    columns = ",".join(next(iter(col)) for col in column_info)
    values = ",\n".join("('%s')" % "', '".join(map(str, row[1:]))
                        for row in list(data_frame.itertuples())[:limit])
    sql = "INSERT INTO %s (%s) VALUES %s" % (table_path, columns, values)
    self.sql_execute(sql, schema_name, database_name, **kwargs)
def fetch_timelines_by_page_bar(self, uid, pnum, bnum):
    """
    fetch timelines by specifying page number and bar number
    :param uid: user id whose timeline is fetched
    :param pnum: page number
    :param bnum: bar number (0..2 — which lazy-loaded chunk of the page)
    :return: html containing timelines or None if there are no timelines
    """
    body = {
        # Obtained by packet capture: Sina Weibo lazy-loads each page in
        # waterfall chunks, so one request cannot fetch a whole page at once.
        '__rnd': 1343647638078,
        '_k': 1343647471134109,
        '_t': 0,
        'count': 15,
        'end_id': 3473519214542343,
        'max_id': 3473279479126179,
        'page': 1,
        'pagebar': 1,
        'pre_page': 1,
        'uid': uid
    }
    body['page'] = pnum
    # Each bar of a page uses different count/pagebar/pre_page parameters.
    if bnum == 0:
        body['count'] = '50'
        body['pagebar'] = ''
        body['pre_page'] = pnum - 1
    elif bnum == 1:
        body['count'] = '15'
        body['pagebar'] = '0'
        body['pre_page'] = pnum
    elif bnum == 2:
        body['count'] = '15'
        body['pagebar'] = '1'
        body['pre_page'] = pnum
    url = 'http://weibo.com/aj/mblog/mbloglist?' + urllib.urlencode(body)
    while True:
        try:
            print 'Getting timeline page %d part %d...' % (pnum, bnum + 1)  # bnum starts with zero up to two
            jsn_data = open_url(self.fetchers[self.main_fetcher], url)
            if self.parser.is_frozen(jsn_data):
                # Account frozen; switch account and retry the same URL.
                self.ban_account()
                continue
            data = json.loads(jsn_data)
            html = data['data']
            # Marker class present only when the chunk actually has feed items.
            if u'WB_feed_type SW_fun S_line2' in html:
                return html
            else:
                return None
        except Exception as e:
            if 'No valid account!' in e.message:
                raise e
            if 'No JSON object could be decoded' in e.message:
                # Non-JSON response: either a visitor page (re-login) or a ban.
                if self.parser.is_visitor(jsn_data) is True:
                    self.reset_account()
                else:
                    self.ban_account()
            log.warning(e.message)
            time.sleep(random.randint(Config.SLEEP_WHEN_EXCEPTION, 2*Config.SLEEP_WHEN_EXCEPTION))
            continue
def fetch_timelines_by_page_bar(self, uid, pnum, bnum):
    """
    fetch timelines by specifying page number and bar number
    :param uid: user id whose timeline is fetched
    :param pnum: page number
    :param bnum: bar number (0..2 — which lazy-loaded chunk of the page)
    :return: html containing timelines or None if there are no timelines
    """
    body = {
        # Obtained by packet capture: Sina Weibo lazy-loads each page in
        # waterfall chunks, so one request cannot fetch a whole page at once.
        '__rnd': 1343647638078,
        '_k': 1343647471134109,
        '_t': 0,
        'count': 15,
        'end_id': 3473519214542343,
        'max_id': 3473279479126179,
        'page': 1,
        'pagebar': 1,
        'pre_page': 1,
        'uid': uid
    }
    body['page'] = pnum
    # Each bar of a page uses different count/pagebar/pre_page parameters.
    if bnum == 0:
        body['count'] = '50'
        body['pagebar'] = ''
        body['pre_page'] = pnum - 1
    elif bnum == 1:
        body['count'] = '15'
        body['pagebar'] = '0'
        body['pre_page'] = pnum
    elif bnum == 2:
        body['count'] = '15'
        body['pagebar'] = '1'
        body['pre_page'] = pnum
    url = 'http://weibo.com/aj/mblog/mbloglist?' + urllib.urlencode(body)
    while True:
        try:
            print 'Getting timeline page %d part %d...' % (
                pnum, bnum + 1)  # bnum starts with zero up to two
            jsn_data = open_url(self.fetchers[self.main_fetcher], url)
            if self.parser.is_frozen(jsn_data):
                # Account frozen; switch account and retry the same URL.
                self.ban_account()
                continue
            data = json.loads(jsn_data)
            html = data['data']
            # Marker class present only when the chunk actually has feed items.
            if u'WB_feed_type SW_fun S_line2' in html:
                return html
            else:
                return None
        except Exception as e:
            if 'No valid account!' in e.message:
                raise e
            if 'No JSON object could be decoded' in e.message:
                # Non-JSON response: either a visitor page (re-login) or a ban.
                if self.parser.is_visitor(jsn_data) is True:
                    self.reset_account()
                else:
                    self.ban_account()
            log.warning(e.message)
            time.sleep(
                random.randint(Config.SLEEP_WHEN_EXCEPTION,
                               2 * Config.SLEEP_WHEN_EXCEPTION))
            continue