def get_status(self, id, getUser=False, toDB=True):
    time_db = 0
    time_db_u = 0
    start_time_api = time.time()
    api_misses = 0
    while api_misses < self.max_api_misses:
        try:
            status = self.api.get_status(id=id)
            break
        except weibopy.error.WeibopError as e:
            print e.reason
            api_misses += 1
            if api_misses >= self.max_api_misses:
                return {"msg": e.reason}
            if e.reason.find("Error: target weibo does not exist!") >= 0:
                # mark the status as deleted in its weekly range partition
                try:
                    rps = self.getRangePartitionByIds([id])
                    for x in rps:
                        yw = x.split(",")
                        year = int(yw[0])
                        week = int(yw[1])
                        sql_deleted = "UPDATE rp_sinaweibo_y%(year)dw%(week)d SET deleted = NOW() WHERE id = %(id)d AND deleted IS NULL " % {"year": year, "week": week, "id": id}
                        if self.pgconn is None:
                            self.pgconn = mypass.getConn()
                        res = self.pgconn.query(sql_deleted)
                except (pg.ProgrammingError, pg.InternalError):
                    print self.pgconn.error
                return {"msg": e.reason}
            time.sleep(self.api_wait_secs)
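# The miss-counting retry loop above reappears in most fetch methods below. A
# minimal standalone sketch of the same pattern (illustrative names only, not
# part of the original API):
import time

def call_with_retries(fn, max_misses, wait_secs):
    """Call fn() and return its result; sleep between failures and re-raise
    after max_misses consecutive exceptions."""
    misses = 0
    while True:
        try:
            return fn()
        except Exception:  # the real methods match specific API error types
            misses += 1
            if misses >= max_misses:
                raise
            time.sleep(wait_secs)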
def __init__(self, geo=None):
    self.pgconn = mypass.getConn()
    # defaults first, so the geo branch below is not clobbered later
    self.key = None
    self.geokey = None
    self.geotable = None
    if geo is not None:
        self.geotable = geo
        if geo in self.GEO_KEYS:
            self.key = self.GEO_KEYS[geo]
            self.geokey = self.GEO_KEYS[geo]
        else:
            self.key = geo
            self.geokey = geo
    self.datatable = None
    self.geokml = dict()
    self.data = dict()
    self.factory = kmldom.KmlFactory_GetFactory()
    self.kml = self.factory.CreateElementById(kmldom.Type_kml)
    self.geo = None
    self.geo_tolerance = None
    self.geo_mapping = dict()  # for TPU to TPU_(LARGE|SMALL)
    self.output = None
    self.verbose = False
    self.outputformat = "csv"
    self.year = None
def comments(self, status_id, count=200, page=1, toDB=True, toBeginning=True):
    already_exists_count = 0
    start_time_api = time.time()
    api_misses = 0
    while api_misses < self.max_api_misses:
        try:
            comments = self.api.comments(id=status_id, count=count, page=page)
            break
        except weibopy.error.WeibopError as e:
            print e.reason
            api_misses += 1
            if api_misses == self.max_api_misses:
                return {"msg": e.reason}
            if e.reason.find("Error: target weibo does not exist!") >= 0:
                try:
                    rps = self.getRangePartitionByIds([status_id])
                    for x in rps:
                        yw = x.split(",")
                        year = int(yw[0])
                        week = int(yw[1])
                        sql_deleted = "UPDATE rp_sinaweibo_y%(year)dw%(week)d SET deleted = NOW() WHERE id = %(id)d AND deleted IS NULL " % {"year": year, "week": week, "id": status_id}
                        if self.pgconn is None:
                            self.pgconn = mypass.getConn()
                        res = self.pgconn.query(sql_deleted)
                except (pg.ProgrammingError, pg.InternalError):
                    print self.pgconn.error
                return {"msg": e.reason}
            time.sleep(self.api_wait_secs)
            if e.reason.find("requests out of rate limit") >= 0:
                if e.reason.find("IP") >= 0 and api_misses <= self.max_api_misses_half:
                    time.sleep(60)  # to consider rolling IPs
                else:
                    self.waitRateLimit()
def __init__(self):
    smartcn = lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33)
    self.analyzers = {"smartcn": smartcn}
    directory = lucene.SimpleFSDirectory(lucene.File(self.STORE_DIR))
    self.searcher = lucene.IndexSearcher(directory, True)
    self.pgconn = mypass.getConn()
    self.sw = sinaweibooauth.SinaWeiboOauth()
def __init__(self):
    self.qqkeys = mypass.getQQKey()
    self.pgconn = mypass.getConn()
    self.http = httplib2.Http()
    self.qqclient = qweiboclient.QQWeiboClient(self.qqkeys["consumer_key"],
                                               self.qqkeys["consumer_secret"])
    self.qqclient.setAccessToken(self.qqkeys["oauth_token"],
                                 self.qqkeys["oauth_token_secret"])
def get_status(self):
    time_db = 0
    time_db_u = 0
    start_time_api = time.time()
    api_misses = 0
    while api_misses < self.max_api_misses:
        try:
            status = self.api2.statuses.show.get(id=self.id)
            break
        except weibo.APIError as e:
            ## Needs more exception handling; warned by > Python 2.6.
            if self.verbose > 0:
                print e
            if e is not None and ("out of rate limit" in str(e).lower()):
                self.changeToken()
            api_misses += 1
            if api_misses >= self.max_api_misses:
                return {"id": self.id, "err_msg": e}
            ## aka toxicbar
            if e is not None and ("target weibo does not exist" in str(e).lower()
                                  or "permission denied" in str(e).lower()):
                out = {"id": self.id, "error_msg": str(e).lower(),
                       "deleted": True, "permission_denied": False}
                try:
                    if self.pgconn is None:
                        self.pgconn = mypass.getConn()
                    permission_sql = ""
                    if "permission denied" in str(e).lower():
                        permission_sql = ", permission_denied = true"
                        out["permission_denied"] = True
                    sql_deleted = "UPDATE rp_sinaweibo SET deleted = NOW() %(permission)s WHERE id = %(id)d AND deleted IS NULL " % {"id": self.id, "permission": permission_sql}
                    if self.verbose > 0:
                        print "deleted %d " % self.id
                    res = self.pgconn.query(sql_deleted)
                    sql_status = "SELECT * FROM rp_sinaweibo WHERE id = %(id)d " % {"id": self.id}
                    res_status = self.pgconn.query(sql_status).dictresult()
                    if len(res_status) > 0:
                        out["user_id"] = res_status[0]["user_id"]
                        if self.verbose > 1:
                            out["status"] = res_status[0]
                            out["sql"] = sql_deleted
                    return out
                except (pg.ProgrammingError, pg.InternalError):
                    print self.pgconn.error
            time.sleep(self.api_wait_secs * 1)
def __init__(self):
    twitterOAuth = mypass.getTwitterOauth()
    api = twitter.Api(consumer_key=twitterOAuth["consumer_key"],
                      consumer_secret=twitterOAuth["consumer_secret"],
                      access_token_key=twitterOAuth["oauth_token"],
                      access_token_secret=twitterOAuth["oauth_token_secret"],
                      cache=None)
    self._api = api
    self._pgconn = mypass.getConn()
    socket.setdefaulttimeout(150)
def __init__(self, forumname):
    if forumname not in self.supported_forums:
        sys.exit()
    self.forum = forumname
    self.STORE_DIR = self.STORE_BASE_DIR + forumname
    smartcn = lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33)
    self.analyzers = {"smartcn": smartcn}
    directory = lucene.SimpleFSDirectory(lucene.File(self.STORE_DIR))
    self.searcher = lucene.IndexSearcher(directory, True)
    self.pgconn = mypass.getConn()
def toDB(self, tablename, data, doupdate=False, updatefirst=False):
    if self.pgconn is None:
        self.pgconn = mypass.getConn()
    resp = {"success": False, "already_exists": False}
    if updatefirst:
        if doupdate:
            try:
                r = self.pgconn.update(tablename, data)
                resp["success"] = True
            except pg.DatabaseError:
                if self.pgconn.error.find('No such record in') > 0:
                    try:
                        r = self.pgconn.insert(tablename, data)
                        resp["success"] = True
                    except pg.DatabaseError:
                        if self.pgconn.error.find('duplicate key value violates unique constraint') > 0:
                            resp["already_exists"] = True
        else:
            try:
                #print data
                r = self.pgconn.insert(tablename, data)
                resp["success"] = True
            except pg.DatabaseError:
                if self.pgconn.error.find('duplicate key value violates unique constraint') > 0:
                    resp["already_exists"] = True
    else:
        try:
            #print data
            r = self.pgconn.insert(tablename, data)
            resp["success"] = True
            if r is not None and "id" in r:
                resp["id"] = r["id"]
        except (pg.ProgrammingError, pg.InternalError):
            resp["reason"] = self.pgconn.error
            if self.pgconn.error.find('duplicate key value violates unique constraint') > 0:
                resp["already_exists"] = True
            try:
                if doupdate:
                    r = self.pgconn.update(tablename, data)
                    resp["success"] = True
                    if r is not None and "id" in r:
                        resp["id"] = r["id"]
            except pg.DatabaseError:
                resp["reason"] = self.pgconn.error
    return resp
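# Usage sketch for toDB (hypothetical table and row values; assumes a live
# PyGreSQL connection from mypass.getConn()):
#
#   resp = self.toDB("sinaweibo_users", {"id": 12345, "screen_name": "demo"},
#                    doupdate=True, updatefirst=True)
#   if resp["already_exists"]:
#       pass  # the row was stored on an earlier crawl; nothing was inserted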
def __init__(self, network):
    self.network = network
    smartcn = lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33)
    #analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_33)
    analyzers = {"smartcn": smartcn}
    self.pgconn = mypass.getConn()
    writerconfig = lucene.IndexWriterConfig(lucene.Version.LUCENE_33,
                                            analyzers["smartcn"])
    writerconfig.setWriteLockTimeout(600000L)
    writerconfig.setMaxThreadStates(50)
    writerconfig.setRAMBufferSizeMB(128.0)
    self.storeDir = self.storeDirBase + self.network
    store = lucene.SimpleFSDirectory(lucene.File(self.storeDir))
    self.writer = lucene.IndexWriter(store, writerconfig)
def comments(self, status_id, count=200, page=1, toDB=True, toBeginning=True):
    already_exists_count = 0
    start_time_api = time.time()
    api_misses = 0
    while api_misses < self.max_api_misses:
        try:
            comments = self.api.comments(id=status_id, count=count, page=page)
            break
        except weibopy2.error.WeibopError as e:
            print e.reason
            api_misses += 1
            if api_misses == self.max_api_misses:
                return {"msg": e.reason}
            if e.reason.find("target weibo does not exist") >= 0:
                try:
                    '''
                    rps = self.getRangePartitionByIds([status_id])
                    for x in rps:
                        yw = x.split(",")
                        year = int(yw[0])
                        week = int(yw[1])
                        sql_deleted = "UPDATE rp_sinaweibo_y%(year)dw%(week)d SET deleted = NOW() WHERE id = %(id)d AND deleted IS NULL " % {"year": year, "week": week, "id": status_id}
                        if self.pgconn is None:
                            self.pgconn = mypass.getConn()
                        res = self.pgconn.query(sql_deleted)
                    '''
                    if self.pgconn is None:
                        self.pgconn = mypass.getConn()
                    sql_deleted = "UPDATE rp_sinaweibo SET deleted = NOW() WHERE id = %(id)d AND deleted IS NULL " % {"id": status_id}
                    res = self.pgconn.query(sql_deleted)
                except (pg.ProgrammingError, pg.InternalError):
                    print self.pgconn.error
                return {"msg": e.reason}
            time.sleep(self.api_wait_secs)
            if e.reason.find("requests out of rate limit") >= 0:
                if e.reason.find("IP") >= 0 and api_misses <= self.max_api_misses_half:
                    time.sleep(60)  # to consider rolling IPs
                else:
                    self.waitRateLimit()
def getCensusData(req, geotype, geoid):
    pgconn = mypass.getConn()
    if geotype is None:
        return json.dumps(None)
    if geoid is None:
        return json.dumps(None)
    if geotype in GEO_TABLES:
        geotable = GEO_TABLES[geotype]
    else:
        geotable = geotype
    if geotype == "dcca":
        parentgeotype = "dc"
        parentgeoid = geoid[0:2]
def __init__(self):
    smartcn = lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33)
    #analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_33)
    analyzers = {"smartcn": smartcn}
    self.pgconn = mypass.getConn()
    self.sw = sinaweibooauth.SinaWeiboOauth()
    if not os.path.exists(self.storeDir):
        os.mkdir(self.storeDir)
    store = lucene.SimpleFSDirectory(lucene.File(self.storeDir))
    writerconfig = lucene.IndexWriterConfig(lucene.Version.LUCENE_33,
                                            analyzers["smartcn"])
    writerconfig.setWriteLockTimeout(600000L)
    writerconfig.setMaxThreadStates(50)
    writerconfig.setRAMBufferSizeMB(128.0)
    self.writer = lucene.IndexWriter(store, writerconfig)
def user_timeline(self, user_id, count=200, page=1):
    start_time_api = time.time()
    api_misses = 0
    while api_misses < self.max_api_misses:
        try:
            timeline = self.api.user_timeline(count=count, page=page,
                                              user_id=user_id)
            break
        except httplib.IncompleteRead as h:
            print h
            api_misses += 1
            if api_misses >= self.max_api_misses:
                return {"msg": h}
            time.sleep(self.api_wait_secs)
        except weibopy.error.WeibopError as e:
            print e.reason
            api_misses += 1
            if api_misses >= self.max_api_misses:
                return {"msg": e.reason}
            time.sleep(self.api_wait_secs)
            if string.find(e.reason, "requests out of rate limit") >= 0:
                self.waitRateLimit()
        except socket.error as e:
            print e
            api_misses += 1
            if api_misses >= self.max_api_misses:
                return {"msg": e.message}
            time.sleep(self.api_wait_secs)
        '''
        except ValueError as e:
            print user_id
            print e
            api_misses += 1
            if api_misses >= self.max_api_misses:
                return {"msg": e.message}
            time.sleep(self.api_wait_secs)
        '''
    time_api = time.time() - start_time_api
    r = self.status_timeline(timeline, toBeginning=False)
    if "count" in r and r["count"] == 0:
        if self.pgconn is None:
            self.pgconn = mypass.getConn()
        #self.pgconn.query("UPDATE sinaweibo_users SET posts_updated = NOW() WHERE id = %d" % user_id)
    r["time_api"] = time_api
    r["page"] = page
    return r
def getBounds(req, geotype, geoid, year=2001):
    pgconn = mypass.getConn()
    if geotype is None:
        return json.dumps(None)
    if geoid is None:
        return json.dumps(None)
    #geoid = geoid.strip()
    if geotype in GEO_TABLES:
        geotable = GEO_TABLES[geotype]
    else:
        geotable = geotype
    if geotype in GEO_HASYEARS:
        geotable += str(year)
    if geotype in GEO_KEYS:
        geokey = GEO_KEYS[geotype]
    else:
        geokey = geotype
    if geotype in GEO_GEOM:
        geogeom = GEO_GEOM[geotype]
    else:
        geogeom = "the_geom"
    if geotype in GEO_TOLERANCE:
        tolerance = GEO_TOLERANCE[geotype]
        if geotype == "dc":
            try:
                geoid_int = int(geoid)
                if geoid_int >= 32:
                    tolerance = tolerance * 5
            except ValueError:
                pass
    else:
        tolerance = 1.0
    if geotype == "sb":
        m = re.match(r"([\d/]+)", geoid)
        if m is not None:
            x_geo = m.group(0)  # x_geo is numeric
            if len(geoid) > len(x_geo):
                x_geo += "S"
            geoid = x_geo
    sql = ("SELECT ST_AsGeoJSON(ST_Transform(ST_SimplifyPreserveTopology("
           "%(geom)s, %(tolerance)f),4326)) AS bounds "
           "FROM hkcensus.%(geotable)s WHERE %(geokey)s = '%(geoid)s'"
           % {"geotable": geotable, "geokey": geokey, "geoid": geoid,
              "geom": geogeom, "tolerance": tolerance})
    rows = pgconn.query(sql)
    results = rows.getresult()
    if len(results) <= 0:
        return json.dumps({})
    return results[0][0]
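# Illustrative output of the query builder above, assuming hypothetical
# lookup-table contents (GEO_TABLES["dc"] == "dc", "dc" in GEO_HASYEARS,
# GEO_TOLERANCE["dc"] == 10.0) and inputs geotype="dc", geoid="01", year=2001:
#
#   SELECT ST_AsGeoJSON(ST_Transform(ST_SimplifyPreserveTopology(the_geom, 10.000000),4326)) AS bounds
#   FROM hkcensus.dc2001 WHERE dc = '01'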
def reposts(self, status_id, count=200, page=1):
    already_exists_count = 0
    start_time_api = time.time()
    api_misses = 0
    while api_misses < self.max_api_misses:
        try:
            timeline = self.api.repost_timeline(count=count, page=page,
                                                id=status_id)
            break
        except weibopy.error.WeibopError as e:
            print e.reason
            api_misses += 1
            if api_misses == self.max_api_misses:
                return {"msg": e.reason}
            if e.reason.find("Error: target weibo does not exist!") >= 0:
                try:
                    rps = self.getRangePartitionByIds([status_id])
                    for x in rps:
                        yw = x.split(",")
                        year = int(yw[0])
                        week = int(yw[1])
                        permission_sql = ""
                        if "permission denied" in e.reason.lower():
                            permission_sql = ", permission_denied = true"
                        sql_deleted = "UPDATE rp_sinaweibo_y%(year)dw%(week)d SET deleted = NOW() %(permission)s WHERE id = %(id)d AND deleted IS NULL " % {"id": status_id, "year": year, "week": week, "permission": permission_sql}
                        if self.pgconn is None:
                            self.pgconn = mypass.getConn()
                        res = self.pgconn.query(sql_deleted)
                except (pg.ProgrammingError, pg.InternalError):
                    print self.pgconn.error
                return {"msg": e.reason}
            time.sleep(self.api_wait_secs)
            if e.reason.find("requests out of rate limit") >= 0:
                if e.reason.find("IP") >= 0 and api_misses <= self.max_api_misses_half:
                    time.sleep(60)  # to consider rolling IPs
                else:
                    self.waitRateLimit()
def user(self, user_id, screen_name=None, toDB=True):
    start_time_api = time.time()
    try:
        if screen_name is not None:
            user = self.api.get_user(screen_name=screen_name)
        else:
            user = self.api.get_user(user_id=user_id)
    except weibopy.error.WeibopError as e:
        if e.reason.find("User does not exists") >= 0:
            if self.pgconn is None:
                self.pgconn = mypass.getConn()
            try:
                if not self.force_screenname and user_id is not None:
                    sql_deleted = "UPDATE sinaweibo_users SET deleted = NOW() WHERE id = %d AND deleted IS NULL " % user_id
                    res = self.pgconn.query(sql_deleted)
            except (pg.ProgrammingError, pg.InternalError):
                print self.pgconn.error
        return {"msg": e.reason}
def get_status(self, id, getUser=False, toDB=True):
    time_db = 0
    time_db_u = 0
    start_time_api = time.time()
    api_misses = 0
    while api_misses < self.max_api_misses:
        try:
            status = self.api.get_status(id=id)
            break
        except weibopy2.error.WeibopError as e:
            print e.reason
            api_misses += 1
            if api_misses >= self.max_api_misses:
                return {"msg": e.reason}
            if e.reason is not None and ("target weibo does not exist" in e.reason.lower()
                                         or "permission denied" in e.reason.lower()):
                out = {"msg": e.reason}
                try:
                    '''
                    rps = self.getRangePartitionByIds([id])
                    for x in rps:
                        yw = x.split(",")
                        year = int(yw[0])
                        week = int(yw[1])
                        sql_deleted = "UPDATE rp_sinaweibo_y%(year)dw%(week)d SET deleted = NOW() WHERE id = %(id)d AND deleted IS NULL " % {"year": year, "week": week, "id": id}
                        if self.pgconn is None:
                            self.pgconn = mypass.getConn()
                    '''
                    if self.pgconn is None:
                        self.pgconn = mypass.getConn()
                    permission_sql = ""
                    if "permission denied" in e.reason.lower():
                        permission_sql = ", permission_denied = true"
                    sql_deleted = "UPDATE rp_sinaweibo SET deleted = NOW() %(permission)s WHERE id = %(id)d AND deleted IS NULL " % {"id": id, "permission": permission_sql}
                    print "deleted %d " % id
                    res = self.pgconn.query(sql_deleted)
                    sql_status = "SELECT * FROM rp_sinaweibo WHERE id = %(id)d " % {"id": id}
                    res_status = self.pgconn.query(sql_status).dictresult()
                    out["deleted"] = True
                    if len(res_status) > 0:
                        out["user_id"] = res_status[0]["user_id"]
                    return out
                except (pg.ProgrammingError, pg.InternalError):
                    print self.pgconn.error
            time.sleep(self.api_wait_secs * 1)
def get_status(self, id, getUser=False, toDB=True):
    time_db = 0
    time_db_u = 0
    start_time_api = time.time()
    api_misses = 0
    while api_misses < self.max_api_misses:
        try:
            status = self.api.get_status(id=id)
            break
        except weibopy2.error.WeibopError as e:
            print e.reason
            api_misses += 1
            if api_misses >= self.max_api_misses:
                return {"msg": e.reason}
            if e.reason.find("target weibo does not exist") >= 0:
                out = {"msg": e.reason}
                try:
                    if self.pgconn is None:
                        self.pgconn = mypass.getConn()
                    sql_deleted = "UPDATE rp_sinaweibo SET deleted = NOW() WHERE id = %(id)d AND deleted IS NULL " % {"id": id}
                    res = self.pgconn.query(sql_deleted)
                    sql_status = "SELECT * FROM rp_sinaweibo WHERE id = %(id)d " % {"id": id}
                    res_status = self.pgconn.query(sql_status).dictresult()
                    out["deleted"] = True
                    if len(res_status) > 0:
                        out["user_id"] = res_status[0]["user_id"]
                    return out
                except (pg.ProgrammingError, pg.InternalError):
                    print self.pgconn.error
            time.sleep(self.api_wait_secs * 1)
def getLocation(buf):
    if buf.startswith("Location:"):
        realurl = string.replace(buf, "Location: ", "", 1)
        realurl = string.strip(realurl)
        print myurl
        m = re.match(r"http://([a-zA-Z0-9\-\.]+)/", myurl)
        try:
            baseurl = m.group(1)
        except (AttributeError, IndexError):  # no match, or no group
            baseurl = ""
        hashurl = string.replace(myurl, "http://" + baseurl + "/", "", 1)
        sinaurl = {"hash": hashurl, "location": realurl, "base": baseurl}
        try:
            print sinaurl
            pgconn = mypass.getConn()
            pgconn.insert("sinaweibo_sinaurl", sinaurl)
        except pg.ProgrammingError:
            pass
        print realurl
def reposts(self, status_id, count=200, page=1):
    already_exists_count = 0
    start_time_api = time.time()
    api_misses = 0
    while api_misses < self.max_api_misses:
        try:
            timeline = self.api.repost_timeline(count=count, page=page,
                                                id=status_id)
            break
        except weibopy2.error.WeibopError as e:
            print e.reason
            api_misses += 1
            if api_misses == self.max_api_misses:
                return {"msg": e.reason}
            if e.reason is not None and ("target weibo does not exist" in e.reason.lower()
                                         or "permission denied" in e.reason.lower()):
                try:
                    permission_sql = ""
                    if "permission denied" in e.reason.lower():
                        permission_sql = ", permission_denied = true"
                    if self.pgconn is None:
                        self.pgconn = mypass.getConn()
                    sql_deleted = "UPDATE rp_sinaweibo SET deleted = NOW() %(permission)s WHERE id = %(id)d AND deleted IS NULL " % {"id": status_id, "permission": permission_sql}
                    res = self.pgconn.query(sql_deleted)
                except (pg.ProgrammingError, pg.InternalError):
                    print self.pgconn.error
                return {"msg": e.reason}
            time.sleep(self.api_wait_secs)
            if e.reason.find("requests out of rate limit") >= 0:
                if e.reason.find("IP") >= 0 and api_misses <= self.max_api_misses_half:
                    time.sleep(60)  # to consider rolling IPs
                else:
                    self.waitRateLimit()
def dispatch(self, opt, id, output_counts=False):
    if opt == 1:  # user timeline
        out = self.user_timeline(id)
        if "count" in out and out["count"] == 0:
            # see if the user was just deleted
            out_user = self.user(id)
    elif opt == 2:  # user
        if self.force_screenname:
            out = self.user(None, id)
        else:
            out = self.user(id)
    elif opt == 3:  # friends
        out = self.socialgraph(id, "friends")
        if "count" in out and out["count"] == 5000:
            out = self.socialgraph(id, "friends", 4999)
    elif opt == 4:  # followers
        out = self.socialgraph(id, "followers")
        if "count" in out and out["count"] == 5000:
            out = self.socialgraph(id, "followers", 4999)
    elif opt == 7:  # reposts
        blanks_count = 0
        gotall_count = 0
        for i in range(self.max_reposts_pages):
            items_count = 0
            misses_count = 0
            trial = 0
            while items_count == 0 and misses_count <= self.max_reposts_tries:
                time.sleep(5)
                out = self.reposts(id, 200, i + 1)
                if "count" not in out:
                    misses_count += 1
                    rls = self.api.rate_limit_status()
                    ratelimstatus = {
                        "remaining_hits": self.getAtt(rls, "remaining_hits"),
                        "hourly_limit": self.getAtt(rls, "hourly_limit"),
                        "reset_time_in_seconds": self.getAtt(rls, "reset_time_in_seconds"),
                        "reset_time": self.getAtt(rls, "reset_time")
                    }
                    if self.verbose:
                        print ratelimstatus
                    continue
                elif out["count"] == 0:
                    out["msg"] = "Too many blanks: probably reached the end"
                    blanks_count += 1
                    gotall_count += 1
                    break
                else:
                    blanks_count = 0
                    items_count = out["count"]
            if self.verbose:
                out["timestamp"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                print out
            if misses_count >= self.max_reposts_tries:
                print out["msg"]
                time.sleep(60)
                break
            elif blanks_count >= self.max_reposts_blanks:
                print out["msg"]
                break
            if items_count == out["already_exists_count"]:
                # already got everything
                gotall_count += 1
                #continue  # still continue
                if not self.getall and gotall_count >= self.max_gotall_count:
                    out["msg"] = "Already full " + str(self.max_gotall_count) + " times: we're breaking here"
                    break
                continue
        if output_counts:
            if self.pgconn is None:
                self.pgconn = mypass.getConn()
            rps = self.getRangePartitionByIds([id])
            rps_count = 0
            for x in rps:
                yw = x.split(",")
                year = int(yw[0])
                week = int(yw[1])
                sql_count = "SELECT COUNT(*) FROM rp_sinaweibo_y%(year)dw%(week)d WHERE retweeted_status = %(id)d " % {"year": year, "week": week, "id": id}
                res_count = self.pgconn.query(sql_count).getresult()
                rps_count += int(res_count[0][0])
            umask = os.umask(0)
            fo = open(self.reposts_dir + "/counts/" + str(id), "w")
            fo.write(str(rps_count))
            fo.close()
            os.umask(umask)
    elif opt == 8:  # comments
        blanks_count = 0
        gotall_count = 0
        for i in range(self.max_comments_pages):
            items_count = 0
            misses_count = 0
            trial = 0
            while items_count == 0 and misses_count <= self.max_comments_tries:
                time.sleep(5)
                out = self.comments(id, 200, i + 1)
                if "count" not in out:
                    misses_count += 1
                    rls = self.api.rate_limit_status()
                    ratelimstatus = {
                        "remaining_hits": self.getAtt(rls, "remaining_hits"),
                        "hourly_limit": self.getAtt(rls, "hourly_limit"),
                        "reset_time_in_seconds": self.getAtt(rls, "reset_time_in_seconds"),
                        "reset_time": self.getAtt(rls, "reset_time")
                    }
                    if self.verbose:
                        print ratelimstatus
                    continue
                elif out["count"] == 0:
                    out["msg"] = "Too many blanks: probably reached the end"
                    blanks_count += 1
                    gotall_count += 1
                    break
                else:
                    blanks_count = 0
                    items_count = out["count"]
            if self.verbose:
                out["timestamp"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                print out
            if misses_count >= self.max_comments_tries:
                print out["msg"]
                time.sleep(60)
                break
            elif blanks_count >= self.max_comments_blanks:
                print out["msg"]
                break
            if items_count == out["already_exists_count"]:
                # already got everything
                gotall_count += 1
                if not self.getall and gotall_count >= self.max_gotall_count:
                    break
                continue
                #break
        if output_counts:
            if self.pgconn is None:
                self.pgconn = mypass.getConn()
            sql_count = "SELECT COUNT(*) FROM sinaweibo_comments WHERE status_id = %d " % id
            res_count = self.pgconn.query(sql_count).getresult()
            umask = os.umask(0)
            fo = open(self.comments_dir + "/counts/" + str(id), "w")
            fo.write(str(res_count[0][0]))
            fo.close()
            os.umask(umask)
    elif opt == 9:  # single status
        out = self.get_status(id)
    else:
        out = None
    return out
def status_timeline(self, statuses, isSingleUser=True, toBeginning=True):
    already_exists_count = 0
    time_db = 0
    time_db_u = 0
    if self.index:
        time_index = 0
    deleted_count = 0
    timeline_users_ids = list()
    toleranceNotToBeginningCount = 0
    newlyadded = 0
    for l in statuses:
        x = self.status_to_row(l)
        x_rt = None
        if "rt" in x:
            x_rt = x["rt"]
        start_time_db = time.time()
        if ("user_id" not in x or x["user_id"] is None) and self.id is not None:
            x["user_id"] = self.id
        # handle deleted statuses
        if "deleted" in x and x["deleted"] is not None and (x["deleted"] == "1" or x["deleted"] == 1 or x["deleted"] is True):
            deleted_count += 1
            if self.pgconn is None:
                self.pgconn = mypass.getConn()
            try:
                sql_deleted = "UPDATE rp_sinaweibo SET deleted = NOW() WHERE id = %(id)d AND deleted IS NULL " % {"id": x["id"]}
                res = self.pgconn.query(sql_deleted)
            except (pg.ProgrammingError, pg.InternalError):
                print self.pgconn.error
            continue
        tablename = self.getRangePartitionByDate(x["created_at"])
        #tablename = "rp_sinaweibo"
        resp = self.toDB(tablename, x)
        if x_rt is not None and self.rt:
            tablename_rt = self.getRangePartitionByDate(x_rt["created_at"])
            resp_rt = self.toDB(tablename_rt, x_rt, self.doupdate)
        time_db += time.time() - start_time_db
        if not resp["already_exists"] and resp["success"] and not isSingleUser:
            timeline_user = self.user_to_row(l["user"])
            timeline_user_id = timeline_user["id"]
            if self.verbose > 1:
                print resp
            if timeline_user_id not in timeline_users_ids:
                start_time_db_u = time.time()
                u = self.user_to_row(timeline_user)
                resp_u = self.toDB("sinaweibo_users", u, doupdate=True)
                time_db_u += time.time() - start_time_db_u
                if resp_u["already_exists"] or resp_u["success"]:
                    timeline_users_ids.append(timeline_user_id)
        if resp["already_exists"]:
            if toBeginning:
                already_exists_count += 1
            else:
                toleranceNotToBeginningCount += 1
                if isSingleUser:
                    if self.verbose:
                        print "already exists, tolerance: " + str(toleranceNotToBeginningCount)
                    if toleranceNotToBeginningCount >= self.toleranceNotToBeginning:
                        break
                else:
                    if toleranceNotToBeginningCount >= self.toleranceNotToBeginningLong:
                        break
        else:
            newlyadded += 1
            if not toBeginning:
                toleranceNotToBeginningCount = 0
            if self.index:
                # index if the row doesn't already exist
                time_index_start = time.time()
                try:
                    t = time.strptime(x["created_at"], "%Y-%m-%d %H:%M:%S")
                    created_at_secs = int(time.mktime(t))
                    self.indexer.indexWeibo(x["id"], x["text"], x["user_id"], created_at_secs)
                except Exception as e:
                    print e
                time_index += time.time() - time_index_start
        if self.verbose > 1:
            print x
def __init__(self):
    self.googlepluskey = mypass.getGooglePlusKey()
    self.pgconn = mypass.getConn()
    self.http = httplib2.Http()
    self.api_key = self.googlepluskey["api_key"]
def status_timeline(self, statuses, isSingleUser=True, toDB=True, toBeginning=True):
    already_exists_count = 0
    time_db = 0
    time_db_u = 0
    if self.index:
        time_index = 0
    deleted_count = 0
    timeline_users_ids = list()
    toleranceNotToBeginningCount = 0
    newlyadded = 0
    for l in statuses:
        x = self.status_to_row(l)
        if toDB:
            start_time_db = time.time()
            # handle deleted statuses
            if "deleted" in x and x["deleted"] is not None and (x["deleted"] == "1" or x["deleted"] == 1 or x["deleted"] is True):
                deleted_count += 1
                if self.pgconn is None:
                    self.pgconn = mypass.getConn()
                try:
                    rps = self.getRangePartitionByIds([x["id"]])
                    for rp in rps:  # don't shadow the status row x
                        yw = rp.split(",")
                        year = int(yw[0])
                        week = int(yw[1])
                        sql_deleted = "UPDATE rp_sinaweibo_y%(year)dw%(week)d SET deleted = NOW() WHERE id = %(id)d AND deleted IS NULL " % {"year": year, "week": week, "id": x["id"]}
                        res = self.pgconn.query(sql_deleted)
                except (pg.ProgrammingError, pg.InternalError):
                    print self.pgconn.error
                continue
            tablename = self.getRangePartitionByDate(self.getAtt(l, "created_at"))
            #tablename = "rp_sinaweibo"
            resp = self.toDB(tablename, x)
            time_db += time.time() - start_time_db
            if not resp["already_exists"] and resp["success"] and not isSingleUser:
                timeline_user = self.getAtt(l, "user")
                timeline_user_id = self.getAtt(timeline_user, "id")
                #print resp
                if timeline_user_id not in timeline_users_ids:
                    start_time_db_u = time.time()
                    u = self.user_to_row(timeline_user)
                    resp_u = self.toDB("sinaweibo_users", u)
                    time_db_u += time.time() - start_time_db_u
                    if resp_u["already_exists"] or resp_u["success"]:
                        timeline_users_ids.append(timeline_user_id)
            if resp["already_exists"]:
                if toBeginning:
                    already_exists_count += 1
                else:
                    toleranceNotToBeginningCount += 1
                    if isSingleUser:
                        if self.verbose:
                            print "already exists, tolerance: " + str(toleranceNotToBeginningCount)
                        if toleranceNotToBeginningCount >= self.toleranceNotToBeginning:
                            break
                    else:
                        if toleranceNotToBeginningCount >= self.toleranceNotToBeginningLong:
                            break
            else:
                newlyadded += 1
                if not toBeginning:
                    toleranceNotToBeginningCount = 0
                if self.index:
                    # index if the row doesn't already exist
                    time_index_start = time.time()
                    try:
                        t = time.strptime(x["created_at"], "%Y-%m-%d %H:%M:%S")
                        created_at_secs = int(time.mktime(t))
                        self.indexer.indexWeibo(x["id"], x["text"], x["user_id"], created_at_secs)
                    except Exception as e:
                        print e
                    time_index += time.time() - time_index_start
        else:
            print x
# (the def line below is reconstructed from the fragment's return statement;
#  the name and signature are assumed, by analogy with the sinaweibo
#  getRangePartitionByDate used above)
def getRangePartitionByDate(created_at_str):
    try:
        created_at = datetime.datetime.strptime(created_at_str, "%Y-%m-%d %H:%M:%S")
    except ValueError:
        try:
            created_at = datetime.datetime.strptime(created_at_str, "%Y-%m-%d %H:%M")
        except ValueError:
            try:
                created_at = datetime.datetime.strptime(created_at_str, "%Y-%m-%d")
            except ValueError:
                print created_at_str
                sys.exit()
    isocal = created_at.isocalendar()
    return "twitter.rp_tweets_y" + str(isocal[0]) + "w" + str(isocal[1])

pgconn = mypass.getConn()
sql_users_fields = "user_id,name,screen_name,description,profile_image_url,url,protected,followers_count,friends_count,created_at,\
favourites_count,utc_offset,time_zone,profile_background_image_url,profile_use_background_image,notifications,geo_enabled,verified,\
statuses_count,lang,contributors_enabled,follow_request_sent,listed_count,show_all_inline_media"
todb = False
tocsv = True
tobeginning = False
doupdate = False
verbose = False
socket.setdefaulttimeout(30)
time_db = 0
time_api = 0
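# A quick demonstration of the week-partition naming above, runnable on its
# own with the stdlib (the date chosen is illustrative):
import datetime

def demo_partition_name():
    d = datetime.datetime(2011, 1, 2)  # a Sunday that ISO-dates into 2010 week 52
    isocal = d.isocalendar()           # (iso_year, iso_week, iso_weekday)
    return "twitter.rp_tweets_y%dw%d" % (isocal[0], isocal[1])

# demo_partition_name() == "twitter.rp_tweets_y2010w52"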