Example #1
0
 def get_status(self, id, getUser=False, toDB=True):
     """Fetch a single status from the API, retrying on ``WeibopError``.

     Up to ``self.max_api_misses`` attempts are made, sleeping
     ``self.api_wait_secs`` seconds between them.  If the API reports
     that the target weibo does not exist, the row is stamped with
     ``deleted = NOW()`` in every partition table returned by
     ``self.getRangePartitionByIds`` and the error is returned.

     :param id: numeric status id (formatted into SQL with ``%d``).
     :param getUser: not referenced in this part of the method.
     :param toDB: not referenced in this part of the method.
     :returns: ``{"msg": <reason>}`` once the attempts are used up or
         the status is reported as missing.
     """
     time_db = 0
     time_db_u = 0
     start_time_api = time.time()
     api_misses = 0
     while api_misses < self.max_api_misses:
         try:
             status = self.api.get_status(id=id)
             break
         except weibopy.error.WeibopError as e:
             print(e.reason)
             api_misses += 1
             if api_misses >= self.max_api_misses:
                 return {"msg": e.reason}
             if "Error: target weibo does not exist!" in e.reason:
                 try:
                     for partition in self.getRangePartitionByIds([id]):
                         # Each entry is a "year,week" pair naming one partition table.
                         year_week = partition.split(",")
                         sql_deleted = "UPDATE rp_sinaweibo_y%(year)dw%(week)d SET deleted = NOW() WHERE id = %(id)d AND deleted IS NULL " % {
                             "year": int(year_week[0]),
                             "week": int(year_week[1]),
                             "id": id,
                         }
                         if self.pgconn is None:
                             self.pgconn = mypass.getConn()
                         self.pgconn.query(sql_deleted)
                 # The classes must be given as a tuple: "except A, B" only
                 # catches A and binds the exception object to the name B.
                 except (pg.ProgrammingError, pg.InternalError):
                     print(self.pgconn.error)
                 return {"msg": e.reason}
             time.sleep(self.api_wait_secs)
Example #2
0
    def __init__(self, geo=None):
        """Open the DB connection, create the KML root and set instance defaults.

        :param geo: optional geography name.  It is stored as
            ``self.geotable``; ``self.key`` and ``self.geokey`` become
            ``self.GEO_KEYS[geo]`` when the name is registered there and
            the name itself otherwise.  With ``geo=None`` all three stay
            ``None``.
        """
        self.pgconn = mypass.getConn()
        # The geo-derived values are assigned once, here; previously a later
        # "self.geokey = None" wiped the key that had just been resolved.
        self.geotable = geo
        if geo is None:
            self.key = None
            self.geokey = None
        else:
            resolved_key = self.GEO_KEYS[geo] if geo in self.GEO_KEYS else geo
            self.key = resolved_key
            self.geokey = resolved_key
        self.datatable = None
        self.geokml = dict()
        self.data = dict()
        self.factory = kmldom.KmlFactory_GetFactory()
        self.kml = self.factory.CreateElementById(kmldom.Type_kml)
        self.geo = None
        self.geo_tolerance = None
        self.geo_mapping = dict()  # for TPU to TPU_(LARGE|SMALL)
        self.output = None
        self.verbose = False
        self.outputformat = "csv"
        self.year = None
Example #3
0
    def get_status(self, id, getUser=False, toDB=True):
        """Fetch a single status from the API, retrying on ``WeibopError``.

        Up to ``self.max_api_misses`` attempts are made, sleeping
        ``self.api_wait_secs`` seconds between them.  If the API reports
        that the target weibo does not exist, the row is stamped with
        ``deleted = NOW()`` in every partition table returned by
        ``self.getRangePartitionByIds`` and the error is returned.

        :param id: numeric status id (formatted into SQL with ``%d``).
        :param getUser: not referenced in this part of the method.
        :param toDB: not referenced in this part of the method.
        :returns: ``{"msg": <reason>}`` once the attempts are used up or
            the status is reported as missing.
        """
        time_db = 0
        time_db_u = 0
        start_time_api = time.time()
        api_misses = 0
        while api_misses < self.max_api_misses:
            try:
                status = self.api.get_status(id=id)
                break
            except weibopy.error.WeibopError as e:
                print(e.reason)
                api_misses += 1
                if api_misses >= self.max_api_misses:
                    return {"msg": e.reason}
                if "Error: target weibo does not exist!" in e.reason:
                    try:
                        for partition in self.getRangePartitionByIds([id]):
                            # Each entry is a "year,week" pair naming one partition table.
                            year_week = partition.split(",")
                            sql_deleted = "UPDATE rp_sinaweibo_y%(year)dw%(week)d SET deleted = NOW() WHERE id = %(id)d AND deleted IS NULL " % {
                                "year": int(year_week[0]),
                                "week": int(year_week[1]),
                                "id": id,
                            }
                            if self.pgconn is None:
                                self.pgconn = mypass.getConn()
                            self.pgconn.query(sql_deleted)
                    # The classes must be given as a tuple: "except A, B" only
                    # catches A and binds the exception object to the name B.
                    except (pg.ProgrammingError, pg.InternalError):
                        print(self.pgconn.error)
                    return {"msg": e.reason}
                time.sleep(self.api_wait_secs)
Example #4
0
    def comments(self, status_id, count=200, page=1, toDB=True, toBeginning=True):
        """Fetch one page of comments for a status, retrying on ``WeibopError``.

        Up to ``self.max_api_misses`` attempts are made.  When the API says
        the target weibo does not exist, the status is stamped as deleted
        in its partition table(s) and the error is returned.  After a
        rate-limit error the method waits before the next attempt.

        :param status_id: numeric id of the status whose comments are read.
        :param count: page size passed to the API.
        :param page: page number passed to the API.
        :param toDB: not referenced in this part of the method.
        :param toBeginning: not referenced in this part of the method.
        :returns: ``{"msg": <reason>}`` once the attempts are used up or
            the status is reported as missing.
        """
        already_exists_count = 0
        start_time_api = time.time()
        api_misses = 0
        while api_misses < self.max_api_misses:
            try:
                comments = self.api.comments(id=status_id, count=count, page=page)
                break
            except weibopy.error.WeibopError as e:
                print(e.reason)
                api_misses += 1
                if api_misses >= self.max_api_misses:
                    return {"msg": e.reason}
                # Membership test: the old "find(...) > 0" ignored a message
                # that starts with this text (index 0).
                if "Error: target weibo does not exist!" in e.reason:
                    try:
                        for partition in self.getRangePartitionByIds([status_id]):
                            year_week = partition.split(",")
                            sql_deleted = "UPDATE rp_sinaweibo_y%(year)dw%(week)d SET deleted = NOW() WHERE id = %(id)d AND deleted IS NULL " % {
                                "year": int(year_week[0]),
                                "week": int(year_week[1]),
                                "id": status_id,
                            }
                            if self.pgconn is None:
                                self.pgconn = mypass.getConn()
                            self.pgconn.query(sql_deleted)
                    # Tuple form is required; "except A, B" only catches A.
                    except (pg.ProgrammingError, pg.InternalError):
                        print(self.pgconn.error)
                    return {"msg": e.reason}
                time.sleep(self.api_wait_secs)
                if "requests out of rate limit" in e.reason:
                    if "IP" in e.reason and api_misses <= self.max_api_misses_half:
                        time.sleep(60)  # to consider rolling IPs
                    else:
                        self.waitRateLimit()
Example #5
0
    def __init__(self):
        """Create the analyzer map, the index searcher over ``self.STORE_DIR``,
        the database connection and the Sina Weibo OAuth helper."""
        self.analyzers = {
            "smartcn": lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33),
        }
        index_dir = lucene.SimpleFSDirectory(lucene.File(self.STORE_DIR))
        self.searcher = lucene.IndexSearcher(index_dir, True)
        self.pgconn = mypass.getConn()
        self.sw = sinaweibooauth.SinaWeiboOauth()
Example #6
0
 def __init__(self):
     """Create the analyzer map, the index searcher over ``self.STORE_DIR``,
     the database connection and the Sina Weibo OAuth helper."""
     self.analyzers = {
         "smartcn": lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33),
     }
     index_dir = lucene.SimpleFSDirectory(lucene.File(self.STORE_DIR))
     self.searcher = lucene.IndexSearcher(index_dir, True)
     self.pgconn = mypass.getConn()
     self.sw = sinaweibooauth.SinaWeiboOauth()
Example #7
0
 def __init__(self):
     """Load the QQ keys, open the DB connection and HTTP client, and build
     a ``QQWeiboClient`` with the consumer and access tokens applied."""
     keys = mypass.getQQKey()
     self.qqkeys = keys
     self.pgconn = mypass.getConn()
     self.http = httplib2.Http()
     consumer_key = keys["consumer_key"]
     consumer_secret = keys["consumer_secret"]
     self.qqclient = qweiboclient.QQWeiboClient(consumer_key, consumer_secret)
     access_token = keys["oauth_token"]
     access_secret = keys["oauth_token_secret"]
     self.qqclient.setAccessToken(access_token, access_secret)
Example #8
0
 def get_status(self):
     """Fetch status ``self.id`` through ``self.api2``, retrying on ``weibo.APIError``.

     * "out of rate limit" errors switch tokens via ``self.changeToken()``.
     * "target weibo does not exist" / "permission denied" errors stamp the
       row in ``rp_sinaweibo`` as deleted and return a summary dict.
     * Every failed attempt counts towards ``self.max_api_misses``.
       Previously only rate-limit errors were counted, so any other API
       error (or a database error while marking the row) retried forever.

     :returns: for a deleted/denied status, a dict with ``id``,
         ``error_msg``, ``deleted``, ``permission_denied`` and, when the
         row exists, ``user_id`` (plus ``status``/``sql`` when
         ``self.verbose > 1``); ``{"id": ..., "err_msg": <exception>}``
         once the attempts are used up.
     """
     time_db = 0
     time_db_u = 0
     start_time_api = time.time()
     api_misses = 0
     while api_misses < self.max_api_misses:
         try:
             status = self.api2.statuses.show.get(id=self.id)
             break
         except weibo.APIError as e:
             reason = str(e).lower()
             if self.verbose > 0:
                 print(e)
             api_misses += 1
             if "out of rate limit" in reason:
                 self.changeToken()
             denied = "permission denied" in reason
             if denied or "target weibo does not exist" in reason:
                 out = {
                     'id': self.id,
                     "error_msg": reason,
                     "deleted": True,
                     "permission_denied": denied,
                 }
                 try:
                     if self.pgconn is None:
                         self.pgconn = mypass.getConn()
                     permission_sql = ", permission_denied = true" if denied else ""
                     sql_deleted = "UPDATE rp_sinaweibo SET deleted = NOW() %(permission)s WHERE id = %(id)d AND deleted IS NULL " % {
                         "id": self.id,
                         "permission": permission_sql,
                     }
                     if self.verbose > 0:
                         print("deleted %d " % self.id)
                     self.pgconn.query(sql_deleted)
                     sql_status = "SELECT * FROM rp_sinaweibo WHERE id = %(id)d " % {
                         "id": self.id
                     }
                     res_status = self.pgconn.query(sql_status).dictresult()
                     if len(res_status) > 0:
                         out["user_id"] = res_status[0]["user_id"]
                     if self.verbose > 1:
                         # The row may be absent; indexing blindly raised IndexError.
                         if len(res_status) > 0:
                             out["status"] = res_status[0]
                         out["sql"] = sql_deleted
                     return out
                 # Tuple form is required; "except A, B" only catches A and
                 # rebinds the name B to the exception object.
                 except (pg.ProgrammingError, pg.InternalError):
                     print(self.pgconn.error)
             if api_misses >= self.max_api_misses:
                 return {"id": self.id, "err_msg": e}  ## aka toxicbar
             time.sleep(self.api_wait_secs * 1)
Example #9
0
 def __init__(self):
     """Build the Twitter API client from the stored OAuth credentials,
     open the DB connection and set the global socket timeout to 150 s."""
     creds = mypass.getTwitterOauth()
     self._api = twitter.Api(consumer_key=creds["consumer_key"],
                             consumer_secret=creds["consumer_secret"],
                             access_token_key=creds["oauth_token"],
                             access_token_secret=creds["oauth_token_secret"],
                             cache=None)
     self._pgconn = mypass.getConn()
     socket.setdefaulttimeout(150)
Example #10
0
    def __init__(self, forumname):
        """Open the Lucene index of one supported forum and the DB connection.

        Exits the process via ``sys.exit()`` when ``forumname`` is not in
        ``self.supported_forums``.
        """
        if forumname not in self.supported_forums:
            sys.exit()
        self.forum = forumname
        self.STORE_DIR = self.STORE_BASE_DIR + forumname
        self.analyzers = {
            "smartcn": lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33),
        }
        index_dir = lucene.SimpleFSDirectory(lucene.File(self.STORE_DIR))
        self.searcher = lucene.IndexSearcher(index_dir, True)
        self.pgconn = mypass.getConn()
Example #11
0
 def __init__(self, forumname):
     """Open the Lucene index of one supported forum and the DB connection.

     Exits the process via ``sys.exit()`` when ``forumname`` is not in
     ``self.supported_forums``.
     """
     if forumname not in self.supported_forums:
         sys.exit()
     self.forum = forumname
     self.STORE_DIR = self.STORE_BASE_DIR + forumname
     self.analyzers = {
         "smartcn": lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33),
     }
     index_dir = lucene.SimpleFSDirectory(lucene.File(self.STORE_DIR))
     self.searcher = lucene.IndexSearcher(index_dir, True)
     self.pgconn = mypass.getConn()
Example #12
0
 def __init__(self):
     """Build the Twitter API client from the stored OAuth credentials,
     open the DB connection and set the global socket timeout to 150 s."""
     creds = mypass.getTwitterOauth()
     self._api = twitter.Api(consumer_key=creds["consumer_key"],
                             consumer_secret=creds["consumer_secret"],
                             access_token_key=creds["oauth_token"],
                             access_token_secret=creds["oauth_token_secret"],
                             cache=None)
     self._pgconn = mypass.getConn()
     socket.setdefaulttimeout(150)
Example #13
0
 def toDB(self, tablename, data, doupdate=False, updatefirst=False):
     """Insert and/or update ``data`` in ``tablename``, recording the outcome in ``resp``.

     * ``updatefirst`` and ``doupdate``: update; if that fails with a
       "No such record in" error, insert instead.
     * ``updatefirst`` only: insert.
     * neither: insert; on a duplicate-key error, update when
       ``doupdate`` is set.

     ``resp`` carries ``success`` and ``already_exists`` flags and, on the
     insert-first path, ``id`` and ``reason`` where available.

     NOTE(review): this part of the method builds ``resp`` but does not
     return it.

     :param tablename: table passed to the connection's insert/update.
     :param data: row dict passed to the connection's insert/update.
     """
     if self.pgconn is None:
         self.pgconn = mypass.getConn()
     resp = {"success": False, "already_exists": False}
     duplicate_msg = 'duplicate key value violates unique constraint'
     if updatefirst:
         if doupdate:
             try:
                 self.pgconn.update(tablename, data)
                 resp["success"] = True
             except pg.DatabaseError:
                 if self.pgconn.error.find('No such record in') > 0:
                     try:
                         self.pgconn.insert(tablename, data)
                         resp["success"] = True
                     # Still best-effort, but no longer swallows
                     # KeyboardInterrupt / SystemExit like a bare except.
                     except Exception:
                         if self.pgconn.error.find(duplicate_msg) > 0:
                             resp["already_exists"] = True
         else:
             try:
                 self.pgconn.insert(tablename, data)
                 resp["success"] = True
             except Exception:
                 if self.pgconn.error.find(duplicate_msg) > 0:
                     resp["already_exists"] = True
     else:
         try:
             r = self.pgconn.insert(tablename, data)
             resp["success"] = True
             if r is not None and "id" in r:
                 resp["id"] = r["id"]
         # Tuple form is required; "except A, B" only catches A and rebinds
         # the name B to the exception object.
         except (pg.ProgrammingError, pg.InternalError):
             resp["reason"] = self.pgconn.error
             if self.pgconn.error.find(duplicate_msg) > 0:
                 resp["already_exists"] = True
                 try:
                     if doupdate:
                         r = self.pgconn.update(tablename, data)
                         resp["success"] = True
                         if r is not None and "id" in r:
                             resp["id"] = r["id"]
                 except Exception:
                     resp["reason"] = self.pgconn.error
Example #14
0
    def __init__(self, network):
	self.network = network	
	smartcn = lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33)
	#analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_33)
	analyzers = { "smartcn": smartcn }
	self.pgconn = mypass.getConn()
	writerconfig = lucene.IndexWriterConfig(lucene.Version.LUCENE_33, analyzers["smartcn"])
	writerconfig.setWriteLockTimeout(600000L)
	writerconfig.setMaxThreadStates(50)
	writerconfig.setRAMBufferSizeMB(128.0)
	self.storeDir = self.storeDirBase + self.network
	store = lucene.SimpleFSDirectory(lucene.File(self.storeDir))
	self.writer = lucene.IndexWriter(store, writerconfig)
Example #15
0
    def comments(self,
                 status_id,
                 count=200,
                 page=1,
                 toDB=True,
                 toBeginning=True):
        """Fetch one page of comments for a status, retrying on ``WeibopError``.

        Up to ``self.max_api_misses`` attempts are made.  When the API says
        the target weibo does not exist, the row in ``rp_sinaweibo`` is
        stamped as deleted and the error is returned.  After a rate-limit
        error the method waits before the next attempt.

        :param status_id: numeric id of the status whose comments are read.
        :param count: page size passed to the API.
        :param page: page number passed to the API.
        :param toDB: not referenced in this part of the method.
        :param toBeginning: not referenced in this part of the method.
        :returns: ``{"msg": <reason>}`` once the attempts are used up or
            the status is reported as missing.
        """
        already_exists_count = 0
        start_time_api = time.time()
        api_misses = 0
        while api_misses < self.max_api_misses:
            try:
                comments = self.api.comments(id=status_id,
                                             count=count,
                                             page=page)
                break
            except weibopy2.error.WeibopError as e:
                print(e.reason)
                api_misses += 1
                if api_misses >= self.max_api_misses:
                    return {"msg": e.reason}
                if "target weibo does not exist" in e.reason:
                    try:
                        if self.pgconn is None:
                            self.pgconn = mypass.getConn()
                        sql_deleted = "UPDATE rp_sinaweibo SET deleted = NOW() WHERE id = %(id)d AND deleted IS NULL " % {
                            "id": status_id
                        }
                        self.pgconn.query(sql_deleted)
                    # Tuple form is required; "except A, B" only catches A
                    # and rebinds the name B to the exception object.
                    except (pg.ProgrammingError, pg.InternalError):
                        print(self.pgconn.error)
                    return {"msg": e.reason}
                time.sleep(self.api_wait_secs)
                if "requests out of rate limit" in e.reason:
                    if "IP" in e.reason and api_misses <= self.max_api_misses_half:
                        time.sleep(60)  # to consider rolling IPs
                    else:
                        self.waitRateLimit()
Example #16
0
 def __init__(self, network):
     self.network = network
     smartcn = lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33)
     #analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_33)
     analyzers = {"smartcn": smartcn}
     self.pgconn = mypass.getConn()
     writerconfig = lucene.IndexWriterConfig(lucene.Version.LUCENE_33,
                                             analyzers["smartcn"])
     writerconfig.setWriteLockTimeout(600000L)
     writerconfig.setMaxThreadStates(50)
     writerconfig.setRAMBufferSizeMB(128.0)
     self.storeDir = self.storeDirBase + self.network
     store = lucene.SimpleFSDirectory(lucene.File(self.storeDir))
     self.writer = lucene.IndexWriter(store, writerconfig)
Example #17
0
def getCensusData(req, geotype, geoid):
    pgconn = mypass.getConn()
    if geotype is None:
	return json.dumps(None);
    if geoid is None:
	return json.dumps(None);
    if geotype in GEO_TABLES:
	geotable = GEO_TABLES[geotype]
    else:
	geotable = geotype
    if geotype == "dcca":
	parentgeotype = "dc"
	parentgeoid = geoid[0:2]
	'''
Example #18
0
    def __init__(self):
	smartcn = lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33)
	#analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_33)
	analyzers = { "smartcn": smartcn }
	self.pgconn = mypass.getConn()
	self.sw = sinaweibooauth.SinaWeiboOauth()
	if not os.path.exists(self.storeDir):
	    os.mkdir(self.storeDir)
	store = lucene.SimpleFSDirectory(lucene.File(self.storeDir))
	writerconfig = lucene.IndexWriterConfig(lucene.Version.LUCENE_33, analyzers["smartcn"])
	writerconfig.setWriteLockTimeout(600000L)
	writerconfig.setMaxThreadStates(50)
	writerconfig.setRAMBufferSizeMB(128.0)
	self.writer = lucene.IndexWriter(store, writerconfig)
Example #19
0
    def user_timeline(self, user_id, count=200, page=1):
        """Fetch one page of a user's timeline and hand it to ``status_timeline``.

        The API call is retried up to ``self.max_api_misses`` times on
        ``httplib.IncompleteRead``, ``WeibopError`` and ``socket.error``,
        sleeping ``self.api_wait_secs`` between attempts (plus
        ``self.waitRateLimit()`` after a rate-limit error).

        :param user_id: id of the user whose timeline is requested.
        :param count: page size passed to the API.
        :param page: page number passed to the API.
        :returns: the dict produced by ``self.status_timeline`` with
            ``time_api`` and ``page`` added, or ``{"msg": ...}`` once the
            attempts are used up.
        """
        start_time_api = time.time()
        api_misses = 0
        while api_misses < self.max_api_misses:
            try:
                timeline = self.api.user_timeline(count=count,
                                                  page=page,
                                                  user_id=user_id)
                break
            except httplib.IncompleteRead as h:
                print(h)
                api_misses += 1
                if api_misses >= self.max_api_misses:
                    return {"msg": h}
                time.sleep(self.api_wait_secs)
            except weibopy.error.WeibopError as e:
                print(e.reason)
                api_misses += 1
                if api_misses >= self.max_api_misses:
                    return {"msg": e.reason}
                time.sleep(self.api_wait_secs)
                if "requests out of rate limit" in e.reason:
                    self.waitRateLimit()
            except socket.error as e:
                print(e)
                api_misses += 1
                if api_misses >= self.max_api_misses:
                    # str(e) instead of e.message: the latter is deprecated
                    # and empty for errors built from (errno, text).
                    return {"msg": str(e)}
                time.sleep(self.api_wait_secs)
        time_api = time.time() - start_time_api
        r = self.status_timeline(timeline, toBeginning=False)
        if "count" in r and r["count"] == 0:
            if self.pgconn is None:
                self.pgconn = mypass.getConn()
            #self.pgconn.query("UPDATE sinaweibo_users SET posts_updated = NOW() WHERE id = %d" % user_id)
        r["time_api"] = time_api
        r["page"] = page
        return r
Example #20
0
    def toDB(self, tablename, data, doupdate=False, updatefirst=False):
        """Insert and/or update ``data`` in ``tablename``, recording the outcome in ``resp``.

        * ``updatefirst`` and ``doupdate``: update; if that fails with a
          "No such record in" error, insert instead.
        * ``updatefirst`` only: insert.
        * neither: insert; on a duplicate-key error, update when
          ``doupdate`` is set.

        ``resp`` carries ``success`` and ``already_exists`` flags and, on
        the insert-first path, ``id`` and ``reason`` where available.

        NOTE(review): this part of the method builds ``resp`` but does not
        return it.

        :param tablename: table passed to the connection's insert/update.
        :param data: row dict passed to the connection's insert/update.
        """
        if self.pgconn is None:
            self.pgconn = mypass.getConn()
        resp = {"success": False, "already_exists": False}
        duplicate_msg = 'duplicate key value violates unique constraint'
        if updatefirst:
            if doupdate:
                try:
                    self.pgconn.update(tablename, data)
                    resp["success"] = True
                except pg.DatabaseError:
                    if self.pgconn.error.find('No such record in') > 0:
                        try:
                            self.pgconn.insert(tablename, data)
                            resp["success"] = True
                        # Still best-effort, but no longer swallows
                        # KeyboardInterrupt / SystemExit like a bare except.
                        except Exception:
                            if self.pgconn.error.find(duplicate_msg) > 0:
                                resp["already_exists"] = True
            else:
                try:
                    self.pgconn.insert(tablename, data)
                    resp["success"] = True
                except Exception:
                    if self.pgconn.error.find(duplicate_msg) > 0:
                        resp["already_exists"] = True
        else:
            try:
                r = self.pgconn.insert(tablename, data)
                resp["success"] = True
                if r is not None and "id" in r:
                    resp["id"] = r["id"]
            # Tuple form is required; "except A, B" only catches A and
            # rebinds the name B to the exception object.
            except (pg.ProgrammingError, pg.InternalError):
                resp["reason"] = self.pgconn.error
                if self.pgconn.error.find(duplicate_msg) > 0:
                    resp["already_exists"] = True
                    try:
                        if doupdate:
                            r = self.pgconn.update(tablename, data)
                            resp["success"] = True
                            if r is not None and "id" in r:
                                resp["id"] = r["id"]
                    except Exception:
                        resp["reason"] = self.pgconn.error
Example #21
0
def getBounds(req, geotype, geoid, year=2001):
    """Return the simplified boundary of one geography as GeoJSON text.

    :param req: request object; not referenced in the body.
    :param geotype: geography type; mapped through ``GEO_TABLES``,
        ``GEO_KEYS``, ``GEO_GEOM`` and ``GEO_TOLERANCE`` when registered
        there, otherwise used as table/key name with default geometry
        column ``the_geom`` and tolerance 1.0.
    :param geoid: identifier looked up in the key column.
    :param year: appended to the table name for types in ``GEO_HASYEARS``.
    :returns: ``"null"`` when ``geotype`` or ``geoid`` is ``None``,
        ``"{}"`` when nothing matches, otherwise the ``ST_AsGeoJSON``
        string of the first matching row.
    """
    # Validate before touching the database so rejected requests do not
    # open a connection.
    if geotype is None or geoid is None:
        return json.dumps(None)
    pgconn = mypass.getConn()
    geotable = GEO_TABLES[geotype] if geotype in GEO_TABLES else geotype
    if geotype in GEO_HASYEARS:
        geotable += str(year)
    geokey = GEO_KEYS[geotype] if geotype in GEO_KEYS else geotype
    # Membership is tested on GEO_GEOM itself; testing GEO_KEYS here could
    # raise KeyError or ignore a registered geometry column.
    geogeom = GEO_GEOM[geotype] if geotype in GEO_GEOM else "the_geom"
    if geotype in GEO_TOLERANCE:
        tolerance = GEO_TOLERANCE[geotype]
        if geotype == "dc":
            try:
                if int(geoid) >= 32:
                    tolerance = tolerance * 5
            except (TypeError, ValueError):
                # Non-numeric ids keep the default tolerance.
                pass
    else:
        tolerance = 1.0
    if geotype == "sb":
        m = re.match(r"([\d/]+)", geoid)
        if m is not None:
            x_geo = m.group(0)  # x_geo is numeric
            if len(geoid) > len(x_geo):
                # Anything after the numeric prefix is normalised to "S".
                geoid = x_geo + "S"
    # NOTE(review): geotable/geokey fall back to the raw geotype and are
    # interpolated as identifiers; consider restricting geotype to the
    # registered keys.  The geoid literal is escaped below.
    sql = ("SELECT ST_AsGeoJSON(ST_Transform(ST_SimplifyPreserveTopology(%(geom)s, %(tolerance)f),4326)) AS bounds FROM hkcensus.%(geotable)s "
           "WHERE %(geokey)s = '%(geoid)s'") % {
        "geotable": geotable,
        "geokey": geokey,
        "geoid": pgconn.escape_string(geoid),
        "geom": geogeom,
        "tolerance": tolerance,
    }
    results = pgconn.query(sql).getresult()
    if len(results) <= 0:
        return json.dumps({})
    return results[0][0]
Example #22
0
 def reposts(self, status_id, count=200, page=1):
     """Fetch one page of reposts of a status, retrying on ``WeibopError``.

     Up to ``self.max_api_misses`` attempts are made.  When the API says
     the target weibo does not exist, the status is stamped as deleted in
     its partition table(s) and the error is returned.  After a
     rate-limit error the method waits before the next attempt.

     :param status_id: numeric id of the status whose reposts are read.
     :param count: page size passed to the API.
     :param page: page number passed to the API.
     :returns: ``{"msg": <reason>}`` once the attempts are used up or the
         status is reported as missing.
     """
     already_exists_count = 0
     start_time_api = time.time()
     api_misses = 0
     while api_misses < self.max_api_misses:
         try:
             timeline = self.api.repost_timeline(count=count,
                                                 page=page,
                                                 id=status_id)
             break
         except weibopy.error.WeibopError as e:
             print(e.reason)
             api_misses += 1
             if api_misses >= self.max_api_misses:
                 return {"msg": e.reason}
             # Membership test: the old "find(...) > 0" ignored a message
             # that starts with this text (index 0).
             if "Error: target weibo does not exist!" in e.reason:
                 try:
                     permission_sql = ""
                     if "permission denied" in e.reason.lower():
                         permission_sql = ", permission_denied = true"
                     # status_id, not the builtin ``id``: the old code passed
                     # the builtin function to the lookup and to "%d".
                     for partition in self.getRangePartitionByIds([status_id]):
                         year_week = partition.split(",")
                         sql_deleted = "UPDATE rp_sinaweibo_y%(year)dw%(week)d SET deleted = NOW() %(permission)s WHERE id = %(id)d AND deleted IS NULL " % {
                             "id": status_id,
                             "year": int(year_week[0]),
                             "week": int(year_week[1]),
                             "permission": permission_sql,
                         }
                         if self.pgconn is None:
                             self.pgconn = mypass.getConn()
                         self.pgconn.query(sql_deleted)
                 # Tuple form is required; "except A, B" only catches A.
                 except (pg.ProgrammingError, pg.InternalError):
                     print(self.pgconn.error)
                 return {"msg": e.reason}
             time.sleep(self.api_wait_secs)
             if "requests out of rate limit" in e.reason:
                 if "IP" in e.reason and api_misses <= self.max_api_misses_half:
                     time.sleep(60)  # to consider rolling IPs
                 else:
                     self.waitRateLimit()
Example #23
0
    def get_status(self):
        """Fetch status ``self.id`` through ``self.api2``, retrying on ``weibo.APIError``.

        * "out of rate limit" errors switch tokens via ``self.changeToken()``.
        * "target weibo does not exist" / "permission denied" errors stamp
          the row in ``rp_sinaweibo`` as deleted and return a summary dict.
        * Every failed attempt counts towards ``self.max_api_misses``.
          Previously only rate-limit errors were counted, so any other API
          error (or a database error while marking the row) retried forever.

        :returns: for a deleted/denied status, a dict with ``id``,
            ``error_msg``, ``deleted``, ``permission_denied`` and, when the
            row exists, ``user_id`` (plus ``status``/``sql`` when
            ``self.verbose > 1``); ``{"id": ..., "err_msg": <exception>}``
            once the attempts are used up.
        """
        time_db = 0
        time_db_u = 0
        start_time_api = time.time()
        api_misses = 0
        while api_misses < self.max_api_misses:
            try:
                status = self.api2.statuses.show.get(id=self.id)
                break
            except weibo.APIError as e:
                reason = str(e).lower()
                if self.verbose > 0:
                    print(e)
                api_misses += 1
                if "out of rate limit" in reason:
                    self.changeToken()
                denied = "permission denied" in reason
                if denied or "target weibo does not exist" in reason:
                    out = {
                        'id': self.id,
                        "error_msg": reason,
                        "deleted": True,
                        "permission_denied": denied,
                    }
                    try:
                        if self.pgconn is None:
                            self.pgconn = mypass.getConn()
                        permission_sql = ", permission_denied = true" if denied else ""
                        sql_deleted = "UPDATE rp_sinaweibo SET deleted = NOW() %(permission)s WHERE id = %(id)d AND deleted IS NULL " % {
                            "id": self.id,
                            "permission": permission_sql,
                        }
                        if self.verbose > 0:
                            print("deleted %d " % self.id)
                        self.pgconn.query(sql_deleted)
                        sql_status = "SELECT * FROM rp_sinaweibo WHERE id = %(id)d " % {
                            "id": self.id
                        }
                        res_status = self.pgconn.query(sql_status).dictresult()
                        if len(res_status) > 0:
                            out["user_id"] = res_status[0]["user_id"]
                        if self.verbose > 1:
                            # The row may be absent; indexing blindly raised IndexError.
                            if len(res_status) > 0:
                                out["status"] = res_status[0]
                            out["sql"] = sql_deleted
                        return out
                    # Tuple form is required; "except A, B" only catches A
                    # and rebinds the name B to the exception object.
                    except (pg.ProgrammingError, pg.InternalError):
                        print(self.pgconn.error)
                if api_misses >= self.max_api_misses:
                    return {"id": self.id, "err_msg": e}  ## aka toxicbar
                time.sleep(self.api_wait_secs * 1)
Example #24
0
    def user_timeline(self, user_id, count=200, page=1):
        """Fetch one page of a user's timeline and hand it to ``status_timeline``.

        The API call is retried up to ``self.max_api_misses`` times on
        ``httplib.IncompleteRead``, ``WeibopError`` and ``socket.error``,
        sleeping ``self.api_wait_secs`` between attempts (plus
        ``self.waitRateLimit()`` after a rate-limit error).

        :param user_id: id of the user whose timeline is requested.
        :param count: page size passed to the API.
        :param page: page number passed to the API.
        :returns: the dict produced by ``self.status_timeline`` with
            ``time_api`` and ``page`` added, or ``{"msg": ...}`` once the
            attempts are used up.
        """
        start_time_api = time.time()
        api_misses = 0
        while api_misses < self.max_api_misses:
            try:
                timeline = self.api.user_timeline(count=count, page=page, user_id=user_id)
                break
            except httplib.IncompleteRead as h:
                print(h)
                api_misses += 1
                if api_misses >= self.max_api_misses:
                    return {"msg": h}
                time.sleep(self.api_wait_secs)
            except weibopy.error.WeibopError as e:
                print(e.reason)
                api_misses += 1
                if api_misses >= self.max_api_misses:
                    return {"msg": e.reason}
                time.sleep(self.api_wait_secs)
                if "requests out of rate limit" in e.reason:
                    self.waitRateLimit()
            except socket.error as e:
                print(e)
                api_misses += 1
                if api_misses >= self.max_api_misses:
                    # str(e) instead of e.message: the latter is deprecated
                    # and empty for errors built from (errno, text).
                    return {"msg": str(e)}
                time.sleep(self.api_wait_secs)
        time_api = time.time() - start_time_api
        r = self.status_timeline(timeline, toBeginning=False)
        if "count" in r and r["count"] == 0:
            if self.pgconn is None:
                self.pgconn = mypass.getConn()
            #self.pgconn.query("UPDATE sinaweibo_users SET posts_updated = NOW() WHERE id = %d" % user_id)
        r["time_api"] = time_api
        r["page"] = page
        return r
Example #25
0
 def user(self, user_id, screen_name=None, toDB=True):
     """Look a user up through the API, by screen name when one is given.

     On a ``WeibopError`` whose reason contains "User does not exists" the
     row in ``sinaweibo_users`` is stamped as deleted (unless
     ``self.force_screenname`` is set or ``user_id`` is ``None``).

     :param user_id: numeric user id; used when ``screen_name`` is ``None``.
     :param screen_name: optional screen name; takes precedence.
     :param toDB: not referenced in this part of the method.
     :returns: ``{"msg": <reason>}`` when the API call raises ``WeibopError``.
     """
     start_time_api = time.time()
     try:
         if screen_name is not None:
             user = self.api.get_user(screen_name=screen_name)
         else:
             user = self.api.get_user(user_id=user_id)
     except weibopy.error.WeibopError as e:
         if "User does not exists" in e.reason:
             if self.pgconn is None:
                 self.pgconn = mypass.getConn()
             try:
                 if not self.force_screenname and user_id is not None:
                     sql_deleted = "UPDATE sinaweibo_users SET deleted = NOW() WHERE id = %d AND deleted IS NULL " % user_id
                     self.pgconn.query(sql_deleted)
             # Tuple form is required; "except A, B" only catches A and
             # rebinds the name B to the exception object.
             except (pg.ProgrammingError, pg.InternalError):
                 print(self.pgconn.error)
         return {"msg": e.reason}
Example #26
0
    def user(self, user_id, screen_name=None, toDB=True):
        """Look a user up through the API, by screen name when one is given.

        On a ``WeibopError`` whose reason contains "User does not exists"
        the row in ``sinaweibo_users`` is stamped as deleted (unless
        ``self.force_screenname`` is set or ``user_id`` is ``None``).

        :param user_id: numeric user id; used when ``screen_name`` is ``None``.
        :param screen_name: optional screen name; takes precedence.
        :param toDB: not referenced in this part of the method.
        :returns: ``{"msg": <reason>}`` when the API call raises ``WeibopError``.
        """
        start_time_api = time.time()
        try:
            if screen_name is not None:
                user = self.api.get_user(screen_name=screen_name)
            else:
                user = self.api.get_user(user_id=user_id)
        except weibopy.error.WeibopError as e:
            if "User does not exists" in e.reason:
                if self.pgconn is None:
                    self.pgconn = mypass.getConn()
                try:
                    if not self.force_screenname and user_id is not None:
                        sql_deleted = "UPDATE sinaweibo_users SET deleted = NOW() WHERE id = %d AND deleted IS NULL " % user_id
                        self.pgconn.query(sql_deleted)
                # Tuple form is required; "except A, B" only catches A and
                # rebinds the name B to the exception object.
                except (pg.ProgrammingError, pg.InternalError):
                    print(self.pgconn.error)
            return {"msg": e.reason}
Example #27
0
    def get_status(self, id, getUser=False, toDB=True):
        """Fetch a single status from the API, retrying on ``WeibopError``.

        Up to ``self.max_api_misses`` attempts are made, sleeping
        ``self.api_wait_secs`` between them.  When the reason mentions
        "target weibo does not exist" or "permission denied", the row in
        ``rp_sinaweibo`` is stamped as deleted (and flagged
        ``permission_denied`` in the latter case) and a summary is returned.

        :param id: numeric status id (formatted into SQL with ``%d``).
        :param getUser: not referenced in this part of the method.
        :param toDB: not referenced in this part of the method.
        :returns: ``{"msg": <reason>}`` once the attempts are used up; for a
            deleted/denied status the same dict plus ``deleted`` and, when
            the row exists, ``user_id``.
        """
        time_db = 0
        time_db_u = 0
        start_time_api = time.time()
        api_misses = 0
        while api_misses < self.max_api_misses:
            try:
                status = self.api.get_status(id=id)
                break
            except weibopy2.error.WeibopError as e:
                print(e.reason)
                api_misses += 1
                if api_misses >= self.max_api_misses:
                    return {"msg": e.reason}
                reason = e.reason.lower() if e.reason is not None else ""
                denied = "permission denied" in reason
                if denied or "target weibo does not exist" in reason:
                    out = {"msg": e.reason}
                    try:
                        if self.pgconn is None:
                            self.pgconn = mypass.getConn()
                        permission_sql = ", permission_denied = true" if denied else ""
                        sql_deleted = "UPDATE rp_sinaweibo SET deleted = NOW() %(permission)s WHERE id = %(id)d AND deleted IS NULL " % {"id": id, "permission": permission_sql}
                        print("deleted %d " % id)
                        self.pgconn.query(sql_deleted)
                        sql_status = "SELECT * FROM rp_sinaweibo WHERE id = %(id)d " % {"id": id}
                        res_status = self.pgconn.query(sql_status).dictresult()
                        out["deleted"] = True
                        if len(res_status) > 0:
                            out["user_id"] = res_status[0]["user_id"]
                        return out
                    # Tuple form is required; "except A, B" only catches A
                    # and rebinds the name B to the exception object.
                    except (pg.ProgrammingError, pg.InternalError):
                        print(self.pgconn.error)
                time.sleep(self.api_wait_secs * 1)
Example #28
0
    def get_status(self, id, getUser=False, toDB=True):
        """Fetch a single status from the API, retrying on ``WeibopError``.

        Up to ``self.max_api_misses`` attempts are made, sleeping
        ``self.api_wait_secs`` between them.  When the reason mentions
        "target weibo does not exist", the row in ``rp_sinaweibo`` is
        stamped as deleted and a summary is returned.

        :param id: numeric status id (formatted into SQL with ``%d``).
        :param getUser: not referenced in this part of the method.
        :param toDB: not referenced in this part of the method.
        :returns: ``{"msg": <reason>}`` once the attempts are used up; for a
            deleted status the same dict plus ``deleted`` and, when the row
            exists, ``user_id``.
        """
        time_db = 0
        time_db_u = 0
        start_time_api = time.time()
        api_misses = 0
        while api_misses < self.max_api_misses:
            try:
                status = self.api.get_status(id=id)
                break
            except weibopy2.error.WeibopError as e:
                print(e.reason)
                api_misses += 1
                if api_misses >= self.max_api_misses:
                    return {"msg": e.reason}
                if "target weibo does not exist" in e.reason:
                    out = {"msg": e.reason}
                    try:
                        if self.pgconn is None:
                            self.pgconn = mypass.getConn()
                        sql_deleted = "UPDATE rp_sinaweibo SET deleted = NOW() WHERE id = %(id)d AND deleted IS NULL " % {
                            "id": id
                        }
                        self.pgconn.query(sql_deleted)
                        sql_status = "SELECT * FROM rp_sinaweibo WHERE id = %(id)d " % {
                            "id": id
                        }
                        res_status = self.pgconn.query(sql_status).dictresult()
                        out["deleted"] = True
                        if len(res_status) > 0:
                            out["user_id"] = res_status[0]["user_id"]
                        return out
                    # Tuple form is required; "except A, B" only catches A
                    # and rebinds the name B to the exception object.
                    except (pg.ProgrammingError, pg.InternalError):
                        print(self.pgconn.error)
                time.sleep(self.api_wait_secs * 1)
Example #29
0
def getLocation(buf):
    """Record the redirect target found in an HTTP ``Location:`` header line.

    Lines that do not start with ``Location:`` are ignored.  Otherwise the
    target URL is extracted from ``buf``, the module-level ``myurl`` is split
    into host and remainder, and the triple is inserted into the
    ``sinaweibo_sinaurl`` table.

    Parameters:
        buf: one header line.

    Returns:
        None.
    """
    if not buf.startswith("Location:"):
        return
    realurl = buf.replace("Location: ", "", 1).strip()
    print(myurl)
    m = re.match(r"http://([a-zA-Z0-9\-\.]+)/", myurl)
    # re.match returns None when nothing matches, so test for that instead
    # of waiting for an IndexError that .group() would never raise here.
    baseurl = m.group(1) if m is not None else ""
    hashurl = myurl.replace("http://" + baseurl + "/", "", 1)
    sinaurl = {"hash": hashurl, "location": realurl, "base": baseurl}
    try:
        print(sinaurl)
        pgconn = mypass.getConn()
        pgconn.insert("sinaweibo_sinaurl", sinaurl)
    except pg.ProgrammingError:
        # best-effort insert: a rejected row is deliberately ignored
        pass
    print(realurl)
Example #30
0
def getLocation(buf):
    """Store the redirect target carried by a ``Location:`` header line.

    Any other line is ignored.  For a ``Location:`` line the target URL is
    taken from ``buf``, the module-level ``myurl`` is split into its host
    and the remainder after the host, and the result is inserted into the
    ``sinaweibo_sinaurl`` table.

    Parameters:
        buf: one header line.

    Returns:
        None.
    """
    if not buf.startswith("Location:"):
        return
    realurl = buf.replace("Location: ", "", 1).strip()
    print(myurl)
    m = re.match(r"http://([a-zA-Z0-9\-\.]+)/", myurl)
    if m is None:
        # No match: re.match gives None, and calling .group() on it would
        # raise AttributeError rather than the IndexError handled before.
        baseurl = ""
    else:
        baseurl = m.group(1)
    hashurl = myurl.replace("http://" + baseurl + "/", "", 1)
    sinaurl = {"hash": hashurl, "location": realurl, "base": baseurl}
    try:
        print(sinaurl)
        pgconn = mypass.getConn()
        pgconn.insert("sinaweibo_sinaurl", sinaurl)
    except pg.ProgrammingError:
        # best-effort insert: a rejected row is deliberately ignored
        pass
    print(realurl)
Example #31
0
    def reposts(self, status_id, count=200, page=1):
        """Fetch one page of the repost timeline of a status, with retries.

        ``self.api.repost_timeline`` is attempted up to
        ``self.max_api_misses`` times.  If the API says the status does not
        exist or access is denied, the ``rp_sinaweibo`` row is stamped as
        deleted (and flagged ``permission_denied`` when applicable) and the
        error is returned.  Rate-limit errors trigger an extra wait.

        Parameters:
            status_id: numeric id of the status whose reposts are wanted.
            count: page size passed to the API.
            page: page number passed to the API.

        Returns:
            ``{"msg": reason}`` on the error paths described above.  After a
            successful API call the loop ends with the result bound to
            ``timeline``.
        """
        # NOTE(review): not used in the visible part of the method; kept
        # because the snippet appears to be truncated.
        already_exists_count = 0
        start_time_api = time.time()
        api_misses = 0
        while api_misses < self.max_api_misses:
            try:
                timeline = self.api.repost_timeline(count=count, page=page, id=status_id)
                break
            except weibopy2.error.WeibopError as e:
                reason = e.reason
                print(reason)
                api_misses += 1
                if api_misses == self.max_api_misses:
                    return {"msg": reason}
                reason_lower = reason.lower() if reason is not None else ""
                denied = "permission denied" in reason_lower
                if denied or "target weibo does not exist" in reason_lower:
                    try:
                        permission_sql = ", permission_denied = true" if denied else ""
                        if self.pgconn is None:
                            self.pgconn = mypass.getConn()
                        # status_id, not the builtin id(): "%d" needs a number
                        sql_deleted = "UPDATE rp_sinaweibo SET deleted = NOW() %(permission)s WHERE id = %(id)d AND deleted IS NULL " % {
                            "id": status_id,
                            "permission": permission_sql
                        }
                        self.pgconn.query(sql_deleted)
                    # A tuple is required: "except A, B" would catch only A
                    # and rebind the name B to the exception instance.
                    except (pg.ProgrammingError, pg.InternalError):
                        print(self.pgconn.error)
                    return {"msg": reason}
                time.sleep(self.api_wait_secs)
                if reason is not None and "requests out of rate limit" in reason:
                    if "IP" in reason and api_misses <= self.max_api_misses_half:
                        time.sleep(60)  # to consider rolling IPs
                    else:
                        self.waitRateLimit()
Example #32
0
    def dispatch(self, opt, id, output_counts=False):
        """Run the crawl selected by ``opt`` for the given id.

        Parameters:
            opt: 1 user timeline, 2 user, 3 friends, 4 followers, 7 reposts,
                8 comments, 9 single status; anything else does nothing.
            id: user or status id, passed on to the selected method.
            output_counts: for opt 7 and 8, also write the stored row count
                to ``<reposts_dir|comments_dir>/counts/<id>``.

        Returns:
            The dict returned by the last call made for the selected option,
            or None for an unknown ``opt`` (or when no page was requested).
        """
        out = None
        if opt == 1:  # user timeline
            out = self.user_timeline(id)
            if "count" in out and out["count"] == 0:
                # see if the user was just deleted; the result is not used
                self.user(id)
        elif opt == 2:  # user
            if self.force_screenname:
                out = self.user(None, id)
            else:
                out = self.user(id)
        elif opt == 3 or opt == 4:  # friends / followers
            relation = "friends" if opt == 3 else "followers"
            out = self.socialgraph(id, relation)
            if "count" in out and out["count"] == 5000:
                out = self.socialgraph(id, relation, 4999)
        elif opt == 7:  # reposts
            out = self._crawl_pages(self.reposts, id, self.max_reposts_pages,
                                    self.max_reposts_tries,
                                    self.max_reposts_blanks, True)
            if output_counts:
                if self.pgconn is None:
                    self.pgconn = mypass.getConn()
                rps_count = 0
                for rp in self.getRangePartitionByIds([id]):
                    yw = rp.split(",")
                    sql_count = "SELECT COUNT(*) FROM rp_sinaweibo_y%(year)dw%(week)d WHERE retweeted_status = %(id)d " % {
                        "year": int(yw[0]),
                        "week": int(yw[1]),
                        "id": id
                    }
                    res_count = self.pgconn.query(sql_count).getresult()
                    rps_count += int(res_count[0][0])
                self._write_count(self.reposts_dir, id, rps_count)
        elif opt == 8:  # comments
            out = self._crawl_pages(self.comments, id, self.max_comments_pages,
                                    self.max_comments_tries,
                                    self.max_comments_blanks, False)
            if output_counts:
                if self.pgconn is None:
                    self.pgconn = mypass.getConn()
                sql_count = "SELECT COUNT(*) FROM sinaweibo_comments WHERE status_id = %d " % id
                res_count = self.pgconn.query(sql_count).getresult()
                self._write_count(self.comments_dir, id, res_count[0][0])
        elif opt == 9:  # single status
            out = self.get_status(id)
        return out

    def _crawl_pages(self, fetch, id, max_pages, max_tries, max_blanks, explain_full):
        """Page through ``fetch(id, 200, page)`` until a stop condition hits.

        Stops after too many misses on one page, too many blank pages, or
        (unless ``self.getall``) too many pages that were already stored.
        ``explain_full`` controls whether the last case sets ``out["msg"]``.
        Returns the last dict obtained from ``fetch`` (None if none was made).
        """
        out = None
        blanks_count = 0
        gotall_count = 0
        for page in range(1, max_pages + 1):
            items_count = 0
            misses_count = 0
            while items_count == 0 and misses_count <= max_tries:
                time.sleep(5)
                out = fetch(id, 200, page)
                if "count" not in out:
                    # miss: report the current rate-limit state and retry
                    misses_count += 1
                    rls = self.api.rate_limit_status()
                    ratelimstatus = {
                        "remaining_hits": self.getAtt(rls, "remaining_hits"),
                        "hourly_limit": self.getAtt(rls, "hourly_limit"),
                        "reset_time_in_seconds": self.getAtt(rls, "reset_time_in_seconds"),
                        "reset_time": self.getAtt(rls, "reset_time")
                    }
                    if self.verbose:
                        print(ratelimstatus)
                    continue
                elif out["count"] == 0:
                    out["msg"] = "Too many blanks: probably reached the end"
                    blanks_count += 1
                    gotall_count += 1
                    break
                else:
                    blanks_count = 0
                items_count = out["count"]
            if self.verbose:
                out["timestamp"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                print(out)
            if misses_count >= max_tries:
                print(out["msg"])
                time.sleep(60)
                break
            elif blanks_count >= max_blanks:
                print(out["msg"])
                break
            if items_count == out["already_exists_count"]:  # already got everything
                gotall_count += 1
                if not self.getall and gotall_count >= self.max_gotall_count:
                    if explain_full:
                        out["msg"] = "Already full " + str(self.max_gotall_count) + " times: we're breaking here"
                    break
        return out

    def _write_count(self, directory, id, count):
        """Write ``count`` to ``<directory>/counts/<id>`` with umask 0."""
        umask = os.umask(0)
        try:
            with open(directory + "/counts/" + str(id), "w") as fo:
                fo.write(str(count))
        finally:
            # restore the process umask even if the write fails
            os.umask(umask)
Example #33
0
    def status_timeline(self, statuses, isSingleUser=True, toBeginning=True):
        """Store a batch of statuses (and their authors) in the database.

        Each status is converted with ``self.status_to_row``.  Deleted ones
        only stamp the existing ``rp_sinaweibo`` row; the others are written
        through ``self.toDB`` to the partition chosen from ``created_at``,
        together with the embedded retweet when ``self.rt`` is set.

        Parameters:
            statuses: iterable of status objects from the API.
            isSingleUser: when False, the author of each newly stored status
                is also written to ``sinaweibo_users`` (once per user id).
            toBeginning: when False, stop after too many consecutive statuses
                that were already stored.

        Returns:
            Nothing in the portion of the method shown here; the counters are
            presumably reported by code after the loop.
        """
        already_exists_count = 0
        time_db = 0
        time_db_u = 0
        if self.index:
            time_index = 0
        deleted_count = 0
        timeline_users_ids = list()
        toleranceNotToBeginningCount = 0
        newlyadded = 0
        for l in statuses:
            x = self.status_to_row(l)
            x_rt = None
            if "rt" in x:
                x_rt = x["rt"]
            start_time_db = time.time()
            if ("user_id" not in x or x["user_id"] is None) and self.id is not None:
                x["user_id"] = self.id
            # handle deleted statuses
            if "deleted" in x and x["deleted"] in ("1", 1, True):
                deleted_count += 1
                if self.pgconn is None:
                    self.pgconn = mypass.getConn()
                try:
                    sql_deleted = "UPDATE rp_sinaweibo SET deleted = NOW() WHERE id = %(id)d AND deleted IS NULL " % {
                        "id": x["id"]
                    }
                    self.pgconn.query(sql_deleted)
                # A tuple is required: "except A, B" would catch only A and
                # rebind the name B to the exception instance.
                except (pg.ProgrammingError, pg.InternalError):
                    print(self.pgconn.error)
                continue
            tablename = self.getRangePartitionByDate(x["created_at"])
            resp = self.toDB(tablename, x)
            if x_rt is not None and self.rt:
                tablename_rt = self.getRangePartitionByDate(x_rt["created_at"])
                self.toDB(tablename_rt, x_rt, self.doupdate)
            time_db += time.time() - start_time_db
            if not resp["already_exists"] and resp["success"] and not isSingleUser:
                timeline_user = self.user_to_row(l["user"])
                timeline_user_id = timeline_user["id"]
                if self.verbose > 1:
                    print(resp)
                if timeline_user_id not in timeline_users_ids:
                    start_time_db_u = time.time()
                    # NOTE(review): timeline_user is already the output of
                    # user_to_row; confirm the second conversion is intended.
                    u = self.user_to_row(timeline_user)
                    resp_u = self.toDB("sinaweibo_users", u, doupdate=True)
                    time_db_u += time.time() - start_time_db_u
                    if resp_u["already_exists"] or resp_u["success"]:
                        timeline_users_ids.append(timeline_user_id)
            if resp["already_exists"]:
                if toBeginning:
                    already_exists_count += 1
                else:
                    toleranceNotToBeginningCount += 1
                    if isSingleUser:
                        if self.verbose:
                            print("already exists, tolerance: " + str(toleranceNotToBeginningCount))
                        if toleranceNotToBeginningCount >= self.toleranceNotToBeginning:
                            break
                    else:
                        if toleranceNotToBeginningCount >= self.toleranceNotToBeginningLong:
                            break
            else:
                newlyadded += 1
                if not toBeginning:
                    toleranceNotToBeginningCount = 0
                if self.index:  # index if the row doesn't already exist
                    time_index_start = time.time()
                    try:
                        t = time.strptime(x["created_at"], "%Y-%m-%d %H:%M:%S")
                        created_at_secs = int(time.mktime(t))
                        self.indexer.indexWeibo(x["id"], x["text"], x["user_id"], created_at_secs)
                    except Exception as e:
                        # indexing is best-effort; report and carry on
                        print(e)
                    time_index += time.time() - time_index_start
            if self.verbose > 1:
                print(x)
Example #34
0
 def __init__(self):
     """Load the Google+ key, open the DB connection and the HTTP client."""
     key_info = mypass.getGooglePlusKey()
     self.googlepluskey = key_info
     self.pgconn = mypass.getConn()
     self.http = httplib2.Http()
     # convenience copy of the API key stored in the key dict
     self.api_key = key_info["api_key"]
Example #35
0
 def status_timeline(self, statuses, isSingleUser=True, toBeginning=True):
     """Store a batch of statuses (and their authors) in the database.

     Each status is converted with ``self.status_to_row``.  Deleted ones
     only stamp the existing ``rp_sinaweibo`` row; the others are written
     through ``self.toDB`` to the partition chosen from ``created_at``,
     together with the embedded retweet when ``self.rt`` is set.

     Parameters:
         statuses: iterable of status objects from the API.
         isSingleUser: when False, the author of each newly stored status
             is also written to ``sinaweibo_users`` (once per user id).
         toBeginning: when False, stop after too many consecutive statuses
             that were already stored.

     Returns:
         Nothing in the portion of the method shown here; the counters are
         presumably reported by code after the loop.
     """
     already_exists_count = 0
     time_db = 0
     time_db_u = 0
     if self.index:
         time_index = 0
     deleted_count = 0
     timeline_users_ids = list()
     toleranceNotToBeginningCount = 0
     newlyadded = 0
     for l in statuses:
         x = self.status_to_row(l)
         x_rt = None
         if "rt" in x:
             x_rt = x["rt"]
         start_time_db = time.time()
         if ("user_id" not in x or x["user_id"] is None) and self.id is not None:
             x["user_id"] = self.id
         # handle deleted statuses
         if "deleted" in x and x["deleted"] in ("1", 1, True):
             deleted_count += 1
             if self.pgconn is None:
                 self.pgconn = mypass.getConn()
             try:
                 sql_deleted = "UPDATE rp_sinaweibo SET deleted = NOW() WHERE id = %(id)d AND deleted IS NULL " % {
                     "id": x["id"]
                 }
                 self.pgconn.query(sql_deleted)
             # A tuple is required: "except A, B" would catch only A and
             # rebind the name B to the exception instance.
             except (pg.ProgrammingError, pg.InternalError):
                 print(self.pgconn.error)
             continue
         tablename = self.getRangePartitionByDate(x["created_at"])
         resp = self.toDB(tablename, x)
         if x_rt is not None and self.rt:
             tablename_rt = self.getRangePartitionByDate(x_rt["created_at"])
             self.toDB(tablename_rt, x_rt, self.doupdate)
         time_db += time.time() - start_time_db
         if not resp["already_exists"] and resp["success"] and not isSingleUser:
             timeline_user = self.user_to_row(l["user"])
             timeline_user_id = timeline_user["id"]
             if self.verbose > 1:
                 print(resp)
             if timeline_user_id not in timeline_users_ids:
                 start_time_db_u = time.time()
                 # NOTE(review): timeline_user is already the output of
                 # user_to_row; confirm the second conversion is intended.
                 u = self.user_to_row(timeline_user)
                 resp_u = self.toDB("sinaweibo_users", u, doupdate=True)
                 time_db_u += time.time() - start_time_db_u
                 if resp_u["already_exists"] or resp_u["success"]:
                     timeline_users_ids.append(timeline_user_id)
         if resp["already_exists"]:
             if toBeginning:
                 already_exists_count += 1
             else:
                 toleranceNotToBeginningCount += 1
                 if isSingleUser:
                     if self.verbose:
                         print("already exists, tolerance: " + str(toleranceNotToBeginningCount))
                     if toleranceNotToBeginningCount >= self.toleranceNotToBeginning:
                         break
                 else:
                     if toleranceNotToBeginningCount >= self.toleranceNotToBeginningLong:
                         break
         else:
             newlyadded += 1
             if not toBeginning:
                 toleranceNotToBeginningCount = 0
             if self.index:  # index if the row doesn't already exist
                 time_index_start = time.time()
                 try:
                     t = time.strptime(x["created_at"], "%Y-%m-%d %H:%M:%S")
                     created_at_secs = int(time.mktime(t))
                     self.indexer.indexWeibo(x["id"], x["text"], x["user_id"], created_at_secs)
                 except Exception as e:
                     # indexing is best-effort; report and carry on
                     print(e)
                 time_index += time.time() - time_index_start
         if self.verbose > 1:
             print(x)
Example #36
0
    def status_timeline(self, statuses, isSingleUser=True, toDB=True, toBeginning=True):
        """Store (or just print) a batch of statuses.

        Each status is converted with ``self.status_to_row``.  With ``toDB``
        false the row is only printed.  Otherwise deleted statuses stamp the
        matching partition rows as deleted, and the others are written
        through ``self.toDB`` to the partition chosen from ``created_at``.

        Parameters:
            statuses: iterable of status objects from the API.
            isSingleUser: when False, the author of each newly stored status
                is also written to ``sinaweibo_users`` (once per user id).
            toDB: write to the database (True) or print the rows (False).
            toBeginning: when False, stop after too many consecutive statuses
                that were already stored.

        Returns:
            Nothing in the portion of the method shown here; the counters are
            presumably reported by code after the loop.
        """
        already_exists_count = 0
        time_db = 0
        time_db_u = 0
        if self.index:
            time_index = 0
        deleted_count = 0
        timeline_users_ids = list()
        toleranceNotToBeginningCount = 0
        newlyadded = 0
        for l in statuses:
            x = self.status_to_row(l)
            if not toDB:
                print(x)
                continue
            start_time_db = time.time()
            # handle deleted statuses
            if "deleted" in x and x["deleted"] in ("1", 1, True):
                deleted_count += 1
                if self.pgconn is None:
                    self.pgconn = mypass.getConn()
                try:
                    # Keep the id in its own name: the partition loop must
                    # not reuse "x", or the row would be replaced by a
                    # partition string before its id is read.
                    status_id = x["id"]
                    for rp in self.getRangePartitionByIds([status_id]):
                        yw = rp.split(",")
                        sql_deleted = "UPDATE rp_sinaweibo_y%(year)dw%(week)d SET deleted = NOW() WHERE id = %(id)d AND deleted IS NULL " % {
                            "year": int(yw[0]),
                            "week": int(yw[1]),
                            "id": status_id
                        }
                        self.pgconn.query(sql_deleted)
                # A tuple is required: "except A, B" would catch only A and
                # rebind the name B to the exception instance.
                except (pg.ProgrammingError, pg.InternalError):
                    print(self.pgconn.error)
                continue
            tablename = self.getRangePartitionByDate(self.getAtt(l, "created_at"))
            resp = self.toDB(tablename, x)
            time_db += time.time() - start_time_db
            if not resp["already_exists"] and resp["success"] and not isSingleUser:
                timeline_user = self.getAtt(l, "user")
                timeline_user_id = self.getAtt(timeline_user, "id")
                if timeline_user_id not in timeline_users_ids:
                    start_time_db_u = time.time()
                    u = self.user_to_row(timeline_user)
                    resp_u = self.toDB("sinaweibo_users", u)
                    time_db_u += time.time() - start_time_db_u
                    if resp_u["already_exists"] or resp_u["success"]:
                        timeline_users_ids.append(timeline_user_id)
            if resp["already_exists"]:
                if toBeginning:
                    already_exists_count += 1
                else:
                    toleranceNotToBeginningCount += 1
                    if isSingleUser:
                        if self.verbose:
                            print("already exists, tolerance: " + str(toleranceNotToBeginningCount))
                        if toleranceNotToBeginningCount >= self.toleranceNotToBeginning:
                            break
                    else:
                        if toleranceNotToBeginningCount >= self.toleranceNotToBeginningLong:
                            break
            else:
                newlyadded += 1
                if not toBeginning:
                    toleranceNotToBeginningCount = 0
                if self.index:  # index if the row doesn't already exist
                    time_index_start = time.time()
                    try:
                        t = time.strptime(x["created_at"], "%Y-%m-%d %H:%M:%S")
                        created_at_secs = int(time.mktime(t))
                        self.indexer.indexWeibo(x["id"], x["text"], x["user_id"], created_at_secs)
                    except Exception as e:
                        # indexing is best-effort; report and carry on
                        print(e)
                    time_index += time.time() - time_index_start
Example #37
0
    def __init__(self):
        """Load the QQ keys, open the DB connection and build the QQ client."""
        keys = mypass.getQQKey()
        self.qqkeys = keys
        self.pgconn = mypass.getConn()
        self.http = httplib2.Http()
        # client is created with the consumer pair, then given the access token
        client = qweiboclient.QQWeiboClient(keys["consumer_key"],
                                            keys["consumer_secret"])
        self.qqclient = client
        client.setAccessToken(keys["oauth_token"], keys["oauth_token_secret"])
Example #38
0
 def dispatch(self, opt, id, output_counts=False):
     """Run the crawl selected by ``opt`` for the given id.

     Parameters:
         opt: 1 user timeline, 2 user, 3 friends, 4 followers, 7 reposts,
             8 comments, 9 single status; anything else does nothing.
         id: user or status id, passed on to the selected method.
         output_counts: for opt 7 and 8, also write the stored row count
             to ``<reposts_dir|comments_dir>/counts/<id>``.

     Returns:
         The dict returned by the last call made for the selected option,
         or None for an unknown ``opt`` (or when no page was requested).
     """
     out = None
     if opt == 1:  # user timeline
         out = self.user_timeline(id)
         if "count" in out and out["count"] == 0:
             # see if the user was just deleted; the result is not used
             self.user(id)
     elif opt == 2:  # user
         if self.force_screenname:
             out = self.user(None, id)
         else:
             out = self.user(id)
     elif opt == 3 or opt == 4:  # friends / followers
         relation = "friends" if opt == 3 else "followers"
         out = self.socialgraph(id, relation)
         if "count" in out and out["count"] == 5000:
             out = self.socialgraph(id, relation, 4999)
     elif opt == 7:  # reposts
         out = self._crawl_pages(self.reposts, id, self.max_reposts_pages,
                                 self.max_reposts_tries,
                                 self.max_reposts_blanks, True)
         if output_counts:
             if self.pgconn is None:
                 self.pgconn = mypass.getConn()
             rps_count = 0
             for rp in self.getRangePartitionByIds([id]):
                 yw = rp.split(",")
                 sql_count = "SELECT COUNT(*) FROM rp_sinaweibo_y%(year)dw%(week)d WHERE retweeted_status = %(id)d " % {
                     "year": int(yw[0]),
                     "week": int(yw[1]),
                     "id": id
                 }
                 res_count = self.pgconn.query(sql_count).getresult()
                 rps_count += int(res_count[0][0])
             self._write_count(self.reposts_dir, id, rps_count)
     elif opt == 8:  # comments
         out = self._crawl_pages(self.comments, id, self.max_comments_pages,
                                 self.max_comments_tries,
                                 self.max_comments_blanks, False)
         if output_counts:
             if self.pgconn is None:
                 self.pgconn = mypass.getConn()
             sql_count = "SELECT COUNT(*) FROM sinaweibo_comments WHERE status_id = %d " % id
             res_count = self.pgconn.query(sql_count).getresult()
             self._write_count(self.comments_dir, id, res_count[0][0])
     elif opt == 9:  # single status
         out = self.get_status(id)
     return out

 def _crawl_pages(self, fetch, id, max_pages, max_tries, max_blanks, explain_full):
     """Page through ``fetch(id, 200, page)`` until a stop condition hits.

     Stops after too many misses on one page, too many blank pages, or
     (unless ``self.getall``) too many pages that were already stored.
     ``explain_full`` controls whether the last case sets ``out["msg"]``.
     Returns the last dict obtained from ``fetch`` (None if none was made).
     """
     out = None
     blanks_count = 0
     gotall_count = 0
     for page in range(1, max_pages + 1):
         items_count = 0
         misses_count = 0
         while items_count == 0 and misses_count <= max_tries:
             time.sleep(5)
             out = fetch(id, 200, page)
             if "count" not in out:
                 # miss: report the current rate-limit state and retry
                 misses_count += 1
                 rls = self.api.rate_limit_status()
                 ratelimstatus = {
                     "remaining_hits": self.getAtt(rls, "remaining_hits"),
                     "hourly_limit": self.getAtt(rls, "hourly_limit"),
                     "reset_time_in_seconds": self.getAtt(rls, "reset_time_in_seconds"),
                     "reset_time": self.getAtt(rls, "reset_time")
                 }
                 if self.verbose:
                     print(ratelimstatus)
                 continue
             elif out["count"] == 0:
                 out["msg"] = "Too many blanks: probably reached the end"
                 blanks_count += 1
                 gotall_count += 1
                 break
             else:
                 blanks_count = 0
             items_count = out["count"]
         if self.verbose:
             out["timestamp"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
             print(out)
         if misses_count >= max_tries:
             print(out["msg"])
             time.sleep(60)
             break
         elif blanks_count >= max_blanks:
             print(out["msg"])
             break
         if items_count == out["already_exists_count"]:  # already got everything
             gotall_count += 1
             if not self.getall and gotall_count >= self.max_gotall_count:
                 if explain_full:
                     out["msg"] = "Already full " + str(self.max_gotall_count) + " times: we're breaking here"
                 break
     return out

 def _write_count(self, directory, id, count):
     """Write ``count`` to ``<directory>/counts/<id>`` with umask 0."""
     umask = os.umask(0)
     try:
         with open(directory + "/counts/" + str(id), "w") as fo:
             fo.write(str(count))
     finally:
         # restore the process umask even if the write fails
         os.umask(umask)
Example #39
0
    def __init__(self):
        """Load the Google+ key, open the DB connection and the HTTP client."""
        key_info = mypass.getGooglePlusKey()
        self.googlepluskey = key_info
        self.pgconn = mypass.getConn()
        self.http = httplib2.Http()
        # convenience copy of the API key stored in the key dict
        self.api_key = key_info["api_key"]
Example #40
0
 def status_timeline(self,
                     statuses,
                     isSingleUser=True,
                     toDB=True,
                     toBeginning=True):
     """Store (or just print) a batch of statuses.

     Each status is converted with ``self.status_to_row``.  With ``toDB``
     false the row is only printed.  Otherwise deleted statuses stamp the
     matching partition rows as deleted, and the others are written
     through ``self.toDB`` to the partition chosen from ``created_at``.

     Parameters:
         statuses: iterable of status objects from the API.
         isSingleUser: when False, the author of each newly stored status
             is also written to ``sinaweibo_users`` (once per user id).
         toDB: write to the database (True) or print the rows (False).
         toBeginning: when False, stop after too many consecutive statuses
             that were already stored.

     Returns:
         Nothing in the portion of the method shown here; the counters are
         presumably reported by code after the loop.
     """
     already_exists_count = 0
     time_db = 0
     time_db_u = 0
     if self.index:
         time_index = 0
     deleted_count = 0
     timeline_users_ids = list()
     toleranceNotToBeginningCount = 0
     newlyadded = 0
     for l in statuses:
         x = self.status_to_row(l)
         if not toDB:
             print(x)
             continue
         start_time_db = time.time()
         # handle deleted statuses
         if "deleted" in x and x["deleted"] in ("1", 1, True):
             deleted_count += 1
             if self.pgconn is None:
                 self.pgconn = mypass.getConn()
             try:
                 # Keep the id in its own name: the partition loop must not
                 # reuse "x", or the row would be replaced by a partition
                 # string before its id is read.
                 status_id = x["id"]
                 for rp in self.getRangePartitionByIds([status_id]):
                     yw = rp.split(",")
                     sql_deleted = "UPDATE rp_sinaweibo_y%(year)dw%(week)d SET deleted = NOW() WHERE id = %(id)d AND deleted IS NULL " % {
                         "year": int(yw[0]),
                         "week": int(yw[1]),
                         "id": status_id
                     }
                     self.pgconn.query(sql_deleted)
             # A tuple is required: "except A, B" would catch only A and
             # rebind the name B to the exception instance.
             except (pg.ProgrammingError, pg.InternalError):
                 print(self.pgconn.error)
             continue
         tablename = self.getRangePartitionByDate(self.getAtt(l, "created_at"))
         resp = self.toDB(tablename, x)
         time_db += time.time() - start_time_db
         if not resp["already_exists"] and resp["success"] and not isSingleUser:
             timeline_user = self.getAtt(l, "user")
             timeline_user_id = self.getAtt(timeline_user, "id")
             if timeline_user_id not in timeline_users_ids:
                 start_time_db_u = time.time()
                 u = self.user_to_row(timeline_user)
                 resp_u = self.toDB("sinaweibo_users", u)
                 time_db_u += time.time() - start_time_db_u
                 if resp_u["already_exists"] or resp_u["success"]:
                     timeline_users_ids.append(timeline_user_id)
         if resp["already_exists"]:
             if toBeginning:
                 already_exists_count += 1
             else:
                 toleranceNotToBeginningCount += 1
                 if isSingleUser:
                     if self.verbose:
                         print("already exists, tolerance: " + str(toleranceNotToBeginningCount))
                     if toleranceNotToBeginningCount >= self.toleranceNotToBeginning:
                         break
                 else:
                     if toleranceNotToBeginningCount >= self.toleranceNotToBeginningLong:
                         break
         else:
             newlyadded += 1
             if not toBeginning:
                 toleranceNotToBeginningCount = 0
             if self.index:  # index if the row doesn't already exist
                 time_index_start = time.time()
                 try:
                     t = time.strptime(x["created_at"], "%Y-%m-%d %H:%M:%S")
                     created_at_secs = int(time.mktime(t))
                     self.indexer.indexWeibo(x["id"], x["text"], x["user_id"], created_at_secs)
                 except Exception as e:
                     # indexing is best-effort; report and carry on
                     print(e)
                 time_index += time.time() - time_index_start
Example #41
0
	try:
	    created_at = datetime.datetime.strptime(created_at_str, "%Y-%m-%d %H:%M:%S")
	except ValueError:
	    try:
		created_at = datetime.datetime.strptime(created_at_str, "%Y-%m-%d %H:%M")
	    except ValueError:
		try:
		    created_at = datetime.datetime.strptime(created_at_str, "%Y-%m-%d")
		except ValueError:
		    print created_at_str
		    sys.exit()
    isocal = created_at.isocalendar()
    return "twitter.rp_tweets_y" + str(isocal[0]) + "w" + str(isocal[1])


# Module-level database connection used by the script.
pgconn = mypass.getConn()

# Comma-separated column list for user rows (no spaces, single string).
sql_users_fields = (
    "user_id,name,screen_name,description,profile_image_url,url,protected,"
    "followers_count,friends_count,created_at,"
    "favourites_count,utc_offset,time_zone,profile_background_image_url,"
    "profile_use_background_image,notifications,geo_enabled,verified,"
    "statuses_count,lang,contributors_enabled,follow_request_sent,"
    "listed_count,show_all_inline_media"
)

# Run-mode switches and their defaults.
todb = False
tocsv = True
tobeginning = False
doupdate = False
verbose = False

# Default timeout, in seconds, for sockets created from here on.
socket.setdefaulttimeout(30)

# Accumulated time counters, both starting at zero.
time_db = time_api = 0