def Save_tb_landmark_timeline(self, dicResult):
    """Insert one landmark timeline post into tb_user_timeline if it is new.

    The insert is guarded by ``IF NOT EXISTS`` on ``TimelineFBID``, so a post
    whose id is already stored is left untouched.

    ``dicResult`` is modified in place before the insert:

    * ``postUserName``, ``landMarkID`` and ``landMarkName`` are cut to 120,
      50 and 120 characters;
    * ``contentZHCN`` is added, holding ``GoogleTrans.translate(content)``;
    * ``picturesAlts`` is replaced by its translation, cut to 300 characters.

    :param dicResult: dict with the keys TimelineFBID, postUserFBID,
        postUserName, postTime, content, picturesURLs, picturesAlts,
        DZanCount, landMarkID, landMarkName and timestamp.
    :return: None
    """
    sql = (
        " IF NOT EXISTS(SELECT * FROM tb_user_timeline WHERE TimelineFBID = %s) "
        " BEGIN"
        " insert into tb_user_timeline("
        " TimelineFBID,postUserFBID,postUserName,postTime,content,picturesURLs,picturesAlts,"
        " DZanCount,"
        " landMarkID,landMarkName,timestamp,crawledTime,contentZHCN)"
        " VALUES(%s,%d,%s,%s,%s,%s,%s,"
        " %d,"
        " %s,%s,%d,GETDATE(),%s)"
        " END"
    )
    logHelper.getLogger().debug(sql)

    # Cut the free-text fields before they are bound to the statement.
    for key, limit in (('postUserName', 120), ('landMarkID', 50), ('landMarkName', 120)):
        dicResult[key] = dicResult[key][:limit]

    import GoogleTrans
    dicResult['contentZHCN'] = GoogleTrans.translate(dicResult['content'])
    translated_alts = GoogleTrans.translate(dicResult['picturesAlts'])
    dicResult['picturesAlts'] = translated_alts
    dicResult['picturesAlts'] = dicResult['picturesAlts'][:300]

    # Same order as the column list above (crawledTime is filled by GETDATE()).
    insert_keys = (
        'TimelineFBID', 'postUserFBID', 'postUserName', 'postTime', 'content',
        'picturesURLs', 'picturesAlts', 'DZanCount', 'landMarkID',
        'landMarkName', 'timestamp', 'contentZHCN',
    )
    # The leading TimelineFBID feeds the IF NOT EXISTS check.
    param = (dicResult['TimelineFBID'],) + tuple(dicResult[key] for key in insert_keys)
    self.dbinstance.ExecNonQuery(sql, param)
def main():
    """Import the FB-check seed file named by the first command-line argument.

    Exits with a usage message (exit status 1) when no seed file is given,
    instead of failing with a bare IndexError on ``sys.argv[1]``.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: python <script> <seedfile>")
    seedfile = sys.argv[1]
    logHelper.getLogger().info(seedfile)
    fbhelper = FBCheckHelper()
    fbhelper.ImportFBCheckSeed(seedfile)
    logHelper.getLogger().info("seed import completed!")
def ImportTWUserSeed(self, seedtxtfile, origin):
    """Load user seeds from a comma-separated text file into tb_seed_user.

    Every non-blank line that does not start with ``#`` is read as
    ``fbid[,name[,mobileoremail]]``. A missing name falls back to the fbid
    and a missing third field is stored as an empty string. Each row is
    inserted with ``hasTasked`` 0 and the current local time as
    ``publishedtime``.

    :param seedtxtfile: path of the seed file, opened as UTF-8.
    :param origin: value stored in the ``origin`` column of every row.
    :return: None
    """
    insert_sql = "insert into tb_seed_user(fbid,name,mobileoremail,origin,publishedtime,hasTasked) values(%s,%s,%s,%s,%s,0);"
    with open(seedtxtfile, 'r', encoding='UTF-8') as seed_file:
        for raw_line in seed_file.readlines():
            entry = raw_line.strip()
            if not entry or entry.startswith('#'):
                continue
            published = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            fields = entry.split(',')
            # Drop a byte-order mark left on the first field of the file.
            fbid = fields[0].strip('\ufeff')
            name = fields[1].strip() if len(fields) > 1 else fbid
            mail = fields[2].strip() if len(fields) > 2 else ''
            self.dbinstance.ExecNonQuery(insert_sql, (fbid, name, mail, origin, published))
            logHelper.getLogger().info("insert {0}".format(entry))
def GenerateUserTaskFromFriends(self, tasktype, whereclause):
    '''
    Generate user tasks of one task type from rows of tb_user_friends.

    Two statements are handed to ``ExecNonQueryBatch2`` in order: the first
    copies the selected friends into tb_task_user with ``runningstate`` 0,
    the second sets ``hasTasked`` and stamps ``taskedTime`` on the same rows.

    :param tasktype: task type written to every generated task row.
    :param whereclause: SQL text inserted verbatim after ``tb_user_friends``
        in both statements; it is not escaped here.
    :return: the value returned by ``ExecNonQueryBatch2``.
    '''
    insert_sql = (
        "insert into tb_task_user(fbid,tasktype,priority,runningstate,deep,name,originalfbid) "
        " select fbid,%s,priority,0,deep,name,originalfbid from tb_user_friends {0};"
    ).format(whereclause)
    update_sql = "update tb_user_friends set hasTasked = %s,taskedTime=sysdate() {0}".format(
        whereclause)

    # One parameter list per statement, in matching positions.
    statements = [insert_sql, update_sql]
    parameters = [[tasktype], [1]]

    c = self.dbinstance.ExecNonQueryBatch2(statements, parameters)
    logHelper.getLogger().debug(
        'Generate {0} {1} Task from Friends is OK!'.format(c, tasktype))
    return c
def Save_tb_user_timeline(self, dicResult):
    """Insert one timeline post into tb_user_timeline if it is new.

    The insert is guarded by ``IF NOT EXISTS`` on ``TimelineFBID``;
    ``crawledTime`` is filled by ``GETDATE()`` in the statement itself.

    :param dicResult: dict with the keys TimelineFBID, postUserFBID,
        postUserName, postTime, content, picturesURLs, DZanCount,
        landMarkID, landMarkName and timestamp.
    :return: None
    """
    sql = (
        " IF NOT EXISTS(SELECT * FROM tb_user_timeline WHERE TimelineFBID = %s) "
        " BEGIN"
        " insert into tb_user_timeline("
        " TimelineFBID,postUserFBID,postUserName,postTime,content,picturesURLs,"
        " DZanCount,"
        " landMarkID,landMarkName,timestamp,crawledTime)"
        " VALUES(%s,%d,%s,%s,%s,%s,"
        " %d,%s,%s,%d,GETDATE())"
        " END"
    )
    logHelper.getLogger().debug(sql)

    # Same order as the column list above.
    insert_keys = (
        'TimelineFBID', 'postUserFBID', 'postUserName', 'postTime', 'content',
        'picturesURLs', 'DZanCount', 'landMarkID', 'landMarkName', 'timestamp',
    )
    # The leading TimelineFBID feeds the IF NOT EXISTS check.
    param = (dicResult['TimelineFBID'],) + tuple(dicResult[key] for key in insert_keys)
    self.dbinstance.ExecNonQuery(sql, param)
def LoadTopNTask(self, n, tasktype):
    """Load up to ``n`` not-yet-running group tasks of one type into a queue.

    Rows are read from tb_task_group where ``runningState`` is 0, ordered by
    priority descending. NULL ``originalfbid`` and ``name`` become empty
    strings and a NULL ``deep`` becomes 3.

    :param n: row limit, formatted directly into the ``TOP`` clause.
    :param tasktype: task type, formatted directly into the query text.
        NOTE(review): neither value is escaped - confirm callers never pass
        untrusted text.
    :return: a ``PriorityQueue`` of ``(priority, common.FBTask)`` tuples.
    """
    query = "SELECT TOP {0} id,priority,fbid,originalfbid,deep,name FROM tb_task_group WHERE runningState=0 and Tasktype = '{1}' order by priority DESC".format(
        n, tasktype)
    rows, _row_count = self.dbinstance.ExecQuery(query)

    que = PriorityQueue()
    for row in rows:
        originfbid = '' if row[3] is None else row[3]
        deep = '3' if row[4] is None else row[4]
        name = '' if row[5] is None else row[5]
        # Positional order per the original note:
        # id, priority, fbid, tasktype, originalfbid, deep, name.
        task = common.FBTask(row[0], row[1], row[2], tasktype, originfbid, int(deep), name)
        for idx in range(5):
            logHelper.getLogger().debug(row[idx])
        # The priority column is the queue's ordering key.
        que.put((row[1], task))
    return que
def GenerateUserTask(self, tasktype):
    '''
    Generate tasks of one task type from every seed user not yet tasked.

    Two statements are handed to ``ExecNonQueryBatch2`` in order: the first
    copies all rows of tb_seed_user with ``hastasked = 0`` into tb_task_user
    (priority 100, runningstate 0, deep 0), the second marks those seeds as
    tasked and stamps ``taskedTime``.

    :param tasktype: task type written to every generated task row.
    :return: the value returned by ``ExecNonQueryBatch2``.
    '''
    insert_sql = "insert into tb_task_user(fbid,tasktype,priority,runningstate,deep,name) select fbid,%s,100,0,0,name from tb_seed_user where hastasked = 0;"
    update_sql = "update tb_seed_user set hasTasked = 1,taskedTime=sysdate() where hastasked = %s"

    # One parameter list per statement, in matching positions.
    lstsql = [insert_sql, update_sql]
    lstparam = [[tasktype], [0]]

    # Diagnostics go through the logger (previously bare print() calls, and
    # only the second statement was logged).
    logger = logHelper.getLogger()
    logger.debug(lstsql)
    logger.debug(lstparam)

    c = self.dbinstance.ExecNonQueryBatch2(lstsql, lstparam)
    logger.debug('Generate User Task From Seed is OK!')
    return c
def Save_tb_user_info(self, dicResult):
    """Insert one user profile into tb_user_info if the fbid is new.

    The insert is guarded by ``IF NOT EXISTS`` on ``fbid``; ``crawledTime``
    is filled by ``GETDATE()`` in the statement itself. Several text fields
    of ``dicResult`` are cut to a maximum length in place before the insert
    (see the table in the body).

    :param dicResult: dict with the keys fbid, Name, fbHomepage, logoFile,
        Gender, rank, Birthday, EDU, Work, currentCity, homeTown, Languages,
        homePageUrl, phone, email, interestedIn, favoriteQuotes, selfIntro,
        lifeEvents, Relationship and Description.
    :return: None
    """
    sql = (
        " IF NOT EXISTS(SELECT * FROM tb_user_info WHERE fbid = %d) "
        " BEGIN"
        " insert into tb_user_info(fbid,Name,fbHomepage,logoFile,"
        " Gender,rank,Birthday,EDU,Work,currentCity,"
        " homeTown,Languages,homePageUrl,phone,email,interestedIn,favoriteQuotes,"
        " selfIntro,lifeEvents,Relationship,Description,crawledTime)"
        " VALUES(%d,%s,%s,%s,%s,%d,"
        "%s,%s,%s,%s,%s,"
        "%s,%s,%s,%s,%s,"
        "%s,%s,%s,%s,%s,"
        "GETDATE())"
        " END"
    )
    logHelper.getLogger().debug(sql)

    # (key, maximum length) pairs applied in place, in this order.
    max_lengths = (
        ('Name', 120),
        ('fbHomepage', 128),
        ('EDU', 1000),
        ('Work', 1000),
        ('currentCity', 100),
        ('homeTown', 100),
        ('Languages', 50),
        ('phone', 50),
        ('email', 80),
        ('interestedIn', 50),
        ('favoriteQuotes', 1000),
        ('selfIntro', 500),
        ('lifeEvents', 1000),
        ('Relationship', 100),
    )
    for key, limit in max_lengths:
        dicResult[key] = dicResult[key][:limit]

    # Same order as the column list above.
    insert_keys = (
        'fbid', 'Name', 'fbHomepage', 'logoFile', 'Gender', 'rank',
        'Birthday', 'EDU', 'Work', 'currentCity', 'homeTown', 'Languages',
        'homePageUrl', 'phone', 'email', 'interestedIn', 'favoriteQuotes',
        'selfIntro', 'lifeEvents', 'Relationship', 'Description',
    )
    # The leading fbid feeds the IF NOT EXISTS check.
    param = (dicResult['fbid'],) + tuple(dicResult[key] for key in insert_keys)
    self.dbinstance.ExecNonQuery(sql, param)
def is_visible(browser, locator, timeout=15):
    """Report whether the element at an XPath becomes visible in time.

    :param browser: driver handed to ``WebDriverWait``.
    :param locator: XPath expression of the element to wait for.
    :param timeout: timeout value handed to ``WebDriverWait``.
    :return: True once the element is visible, False if the wait raised
        ``TimeoutException`` (the exception is logged at debug level).
    """
    try:
        condition = EC.visibility_of_element_located((By.XPATH, locator))
        WebDriverWait(browser, timeout).until(condition)
    except TimeoutException as e1:
        logHelper.getLogger().debug(e1)
        return False
    return True
def saveCookie(browser, cookieFile):
    """Write the browser's current cookies to ``cookieFile`` as a JSON list.

    :param browser: object whose ``get_cookies()`` returns a list of dicts,
        each with at least a ``name`` key.
    :param cookieFile: path of the file to (over)write.
    :return: None
    """
    myCookie = browser.get_cookies()
    logger = logHelper.getLogger()
    for tmpCookie in myCookie:
        # Log the name only: cookie values are login credentials and must
        # not end up in plain-text log files.
        logger.info("cookie: " + tmpCookie["name"])
    myCookieStr = json.dumps(myCookie)
    # Explicit encoding so the file matches how login_by_cookie reads it.
    with open(cookieFile, 'w', encoding='utf-8') as f:
        f.write(myCookieStr)
    logger.info("Cookie is saved into file: " + cookieFile + " successfully !")
def ImportFBCheckSeed(self, seedtxtfile, origin):
    """Load FB-check seeds from a text file into tb_fbcheck_todo.

    Every non-blank line is stored, stripped, as ``mobileoremail`` with
    ``runningState`` 0 and the current local time as ``publishedtime``.

    Values are bound as statement parameters rather than formatted into the
    SQL text, so a quote character in the seed file or in ``origin`` can
    neither break nor alter the statement.

    :param seedtxtfile: path of the seed file, one entry per line.
    :param origin: value stored in the ``origin`` column of every row.
    :return: None
    """
    sql = ("use FBDB;insert into tb_fbcheck_todo(mobileoremail,publishedtime,runningState,origin) "
           "values(%s,%s,0,%s);")
    with open(seedtxtfile, 'r') as f:
        for line in f:
            seed = line.strip()
            if not seed:
                continue
            now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            self.dbinstance.ExecNonQuery(sql, (seed, now, origin))
            logHelper.getLogger().info("insert {0}".format(seed))
def eleClick(eleNode):
    """Click an element and report whether the click went through.

    :param eleNode: object exposing ``click()``.
    :return: True if ``click()`` returned normally; False if it raised any
        ``Exception`` (the error and a network hint are logged).
    """
    try:
        eleNode.click()
    except Exception as e:
        logger = logHelper.getLogger()
        logger.error(e)
        logger.warning("please check status of your network!")
        return False
    return True
def login_by_up_userpage(browser, myAccount, cookieFile):
    """Log in with user name and password, then return to the current page.

    Calls ``login_by_up`` with the XPath locators of the e-mail field, the
    password field and the login button, reloads the URL that was open
    before the login, and sleeps a random 1-4 seconds.

    :param browser: driver currently showing the page to come back to.
    :param myAccount: account object passed through to ``login_by_up``.
    :param cookieFile: cookie file path passed through to ``login_by_up``.
    :return: None
    """
    cur_url = browser.current_url
    logHelper.getLogger().info('login_url: ' + browser.current_url)

    logInEle = {
        'uStr': r'//input[@id="email"]',
        'pStr': r'//input[@id="pass"]',
        'lStr': r'//label[@id="loginbutton"]/input[@id="u_0_0"]',
    }
    login_by_up(browser, myAccount, cookieFile, logInEle)

    # Go back to the page that was open before logging in.
    browser.get(cur_url)
    pause_seconds = random.randint(1, 4)
    time.sleep(pause_seconds)
def GenerateUserTask(self, tasktype):
    '''
    Generate tasks of one task type from every seed user not yet tasked.

    A single two-statement batch copies all rows of tb_seed_user with
    ``hastasked = 0`` into tb_task_user (priority 100, runningstate 0,
    deep 0) and then marks those seeds as tasked, stamping ``taskedTime``.

    :param tasktype: task type written to every generated task row.
    :return: the value returned by ``ExecNonQuery``.
    '''
    sql = "insert into tb_task_user(fbid,tasktype,priority,runningstate,deep,name) select fbid,%s,100,0,0,name from tb_seed_user where hastasked = 0;update tb_seed_user set hasTasked = 1,taskedTime=GETDATE() where hastasked = 0"
    logHelper.getLogger().debug(sql)
    # One-element tuple: ``(tasktype)`` is only a parenthesised string, which
    # hands the driver a bare scalar instead of a parameter sequence.
    param = (tasktype,)
    c = self.dbinstance.ExecNonQuery(sql, param)
    logHelper.getLogger().debug('Generate User Task From Seed is OK!')
    return c
def DumpTaskUser(self):
    '''
    Move finished user tasks out of tb_task_user for backup.

    Rows with ``runningstate = 2`` are copied into tb_task_user_log and then
    deleted from tb_task_user, both in one batch.

    :return: the value returned by ``ExecNonQuery`` (logged as a count).
    '''
    sql = ('insert into tb_task_user_log select * from tb_task_user where runningstate = 2;'
           ' delete from tb_task_user where runningstate = %d;')
    # One-element tuple: ``(2)`` is just the int 2, not a parameter sequence.
    param = (2,)
    c = self.dbinstance.ExecNonQuery(sql, param)
    logHelper.getLogger().debug(
        'Dump {0} User Task to tb_task_user_log!'.format(c))
    return c
def openUrl_exit(browser, url, maxTryNum=5):
    """Open ``url``; if that fails, save a screenshot, quit and exit.

    The URL is opened through ``FBHelper.openUrl``. When that returns False
    a screenshot named after the current time plus a random suffix is saved
    under ``errorShot``, the browser is quit and the process exits with
    status 1.

    :param browser: driver used to open the page.
    :param url: address to open.
    :param maxTryNum: attempt limit handed to ``FBHelper.openUrl``.
    :return: None when the page opened.
    :raises SystemExit: with code 1 when the page could not be opened.
    """
    isOpened = FBHelper.openUrl(browser, url, maxTryNum)
    # Deliberately ``== False``: a None result does not trigger the exit path.
    if isOpened == False:  # noqa: E712
        shot_dir = "errorShot"
        try:
            # The directory is not guaranteed to exist on a fresh checkout.
            os.makedirs(shot_dir, exist_ok=True)
        except OSError as e:
            # The screenshot is diagnostic only; still shut down cleanly.
            logHelper.getLogger().error(e)
        strF = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time()))
        strF += "_" + str(random.randint(1, 61))
        # NOTE(review): if browser is a Selenium WebDriver the image data is
        # PNG regardless of the .jpg suffix - confirm before renaming.
        browser.save_screenshot(os.path.join(shot_dir, strF + ".jpg"))
        browser.quit()
        logHelper.getLogger().warning(
            "Your network seems to have some problem, please check it. Try to exit!"
        )
        # Same effect as exit(1) without relying on the site module's helper.
        raise SystemExit(1)
def UpdateTaskDispatch(self, taskid, spider=''):
    '''
    Record the dispatch time of a task once it has been handed out.

    Sets ``dispatchTime`` to ``sysdate()`` and ``Spider`` to ``spider`` on
    the tb_task_user row whose ``id`` equals ``taskid``.

    :param taskid: id of the task row to update.
    :param spider: value stored in the ``Spider`` column (default '').
    :return: the value returned by ``ExecNonQuery``.
    '''
    statement = "update tb_task_user set dispatchTime = sysdate(),Spider = %s where id = %s"
    c = self.dbinstance.ExecNonQuery(statement, (spider, taskid))
    logHelper.getLogger().debug('Update User Task DispatchTime is OK!')
    return c
def Save_tb_user_relationship(self, dicResult):
    """Insert one relationship row into tb_user_relationship if it is new.

    The insert is guarded by ``IF NOT EXISTS`` on the triple
    (fbida, fbidb, relationtype). ``relationclass`` is always stored as
    ``'LandmarkVisitor'`` and ``crawledTime`` comes from ``GETDATE()``.

    :param dicResult: dict with the keys fbida, fbidb, namea, nameb and
        relationtype, e.g.
        ``{'fbida': '111', 'fbidb': '222', 'relationtype': 'friend'}``
        plus the two names.
    :return: None
    """
    sql = (
        " IF NOT EXISTS(SELECT * FROM tb_user_relationship WHERE fbida = %d AND fbidb = %d AND relationtype = %s) "
        " BEGIN"
        " insert into tb_user_relationship(fbida,fbidb,namea,nameb,relationtype,crawledTime,relationclass) VALUES"
        " (%d,%d,%s,%s,%s,GETDATE(),%s)"
        " END"
    )
    # First three values feed the existence check, the rest the insert.
    exists_keys = ('fbida', 'fbidb', 'relationtype')
    insert_keys = ('fbida', 'fbidb', 'namea', 'nameb', 'relationtype')
    param = tuple(dicResult[key] for key in exists_keys + insert_keys) + ('LandmarkVisitor',)
    logHelper.getLogger().debug(sql)
    self.dbinstance.ExecNonQuery(sql, param)
def ImportFBGroupSeed(self, seedtxtfile, origin):
    """Load group seeds from a comma-separated text file into tb_seed_group.

    Every non-blank line is read as ``fbid[,name]``. Both fields are
    stripped of surrounding whitespace, a byte-order mark on the id is
    removed, and a line without a name uses the fbid as name (the same
    fallback ``ImportTWUserSeed`` applies) instead of raising IndexError.
    Each row is inserted with ``hasTasked`` 0 and the current local time as
    ``publishedtime``.

    :param seedtxtfile: path of the seed file, opened as UTF-8.
    :param origin: value stored in the ``origin`` column of every row.
    :return: None
    """
    sql = "insert into tb_seed_group(fbid,name,origin,publishedtime,hasTasked) values(%d,%s,%s,%s,0);"
    with open(seedtxtfile, 'r', encoding='UTF-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            acount = line.split(',')
            # A UTF-8 BOM survives str.strip() and would corrupt the first id.
            fbid = acount[0].strip('\ufeff').strip()
            name = acount[1].strip() if len(acount) > 1 else fbid
            param = (fbid, name, origin, now)
            self.dbinstance.ExecNonQuery(sql, param)
            logHelper.getLogger().info("insert {0}".format(line))
def testDispatchServer():
    '''
    Check whether the dispatch server can be reached.

    Builds an XML-RPC proxy from the first two items returned by
    ``getDispatchServerConfig()`` and calls its ``test()`` method.

    :return: whatever the remote ``test()`` call returns, or
        ``'Disconnected!'`` if anything inside the attempt raises
        (the error is logged).
    '''
    serverconfig = getDispatchServerConfig()
    try:
        server_url = 'http://{0}:{1}'.format(serverconfig[0], serverconfig[1])
        proxy = xmlrpc.client.ServerProxy(server_url)
        logHelper.getLogger().debug(
            'connecting to the dispatch server ' + server_url)
        return proxy.test()
    except Exception as e:
        logHelper.getLogger().error(e)
        return 'Disconnected!'
def testDatabaseServer():
    '''
    Check whether the database server can be reached.

    Builds a ``SqlServer`` from the four items returned by
    ``getDatabaseServerConfig()`` and calls its ``test()`` method.

    :return: whatever ``SqlServer.test()`` returns, or ``'Disconnected!'``
        if reading the configuration or the attempt itself raises
        (the error is logged).
    '''
    try:
        serverconfig = getDatabaseServerConfig()
        message = 'connecting to the database server {0}:{1}'.format(
            serverconfig[0], serverconfig[3])
        logHelper.getLogger().debug(message)
        server = SqlServer(serverconfig[0], serverconfig[1],
                           serverconfig[2], serverconfig[3])
        return server.test()
    except Exception as e:
        logHelper.getLogger().error(e)
        return 'Disconnected!'
def Save_tb_user_friends(self, dicResult):
    """Insert one friend row into tb_user_friends if the Fbid is new.

    The insert is guarded by ``IF NOT EXISTS`` on ``Fbid``; ``hasTasked`` is
    stored as 0 and ``crawledTime`` comes from ``GETDATE()``.
    ``dicResult['name']`` is cut to 120 characters in place.

    :param dicResult: dict with the keys fbid, name, homepage, priority,
        deep and Description.
    :return: None
    """
    sql = (
        " IF NOT EXISTS(SELECT * FROM tb_user_friends WHERE Fbid = %s) "
        " BEGIN"
        " insert into tb_user_friends(Fbid,Name,Homepage,priority,crawledTime,hasTasked,deep,Description) VALUES"
        " (%s,%s,%s,%d,GETDATE(),0,%d,%s)"
        " END"
    )
    dicResult['name'] = dicResult['name'][:120]
    # The first fbid feeds the existence check, the rest the insert.
    value_keys = ('fbid', 'fbid', 'name', 'homepage', 'priority', 'deep', 'Description')
    param = tuple(dicResult[key] for key in value_keys)
    logHelper.getLogger().debug(sql)
    self.dbinstance.ExecNonQuery(sql, param)
def Save_tb_user_friends_batch(self, lstDicResult):
    """Insert a batch of friend rows into tb_user_friends.

    Uses one ``insert ignore`` statement executed through
    ``ExecNonQueryBatch`` with one parameter tuple per friend. ``hasTasked``
    is stored as 0 and ``crawledTime`` comes from ``sysdate()``. Each dict's
    ``name`` is cut to 120 characters in place.

    :param lstDicResult: iterable of dicts with the keys fbid, name,
        homepage, priority, deep, Description and originalfbid.
    :return: None
    """
    sql = (
        "insert ignore into tb_user_friends(Fbid,Name,Homepage,priority,crawledTime,hasTasked,deep,Description,originalfbid) VALUES"
        " (%s,%s,%s,%s,sysdate(),0,%s,%s,%s);"
    )
    value_keys = ('fbid', 'name', 'homepage', 'priority', 'deep',
                  'Description', 'originalfbid')
    lstParam = []
    for friend in lstDicResult:
        friend['name'] = friend['name'][:120]
        lstParam.append(tuple(friend[key] for key in value_keys))
    self.dbinstance.ExecNonQueryBatch(sql, lstParam)
    logHelper.getLogger().info("Save_tb_user_friends_batch ok.")
def GenerateUserTaskFromFriends(self, tasktype, whereclause):
    '''
    Generate user tasks of one task type from rows of tb_user_friends.

    A single two-statement batch copies the selected friends into
    tb_task_user with ``runningstate`` 0 and then sets ``hasTasked`` to 1
    and stamps ``taskedTime`` on the same rows.

    :param tasktype: task type written to every generated task row.
    :param whereclause: SQL text inserted verbatim after ``tb_user_friends``
        in both statements; it is not escaped here.
    :return: the value returned by ``ExecNonQuery`` (logged as a count).
    '''
    sql = (
        "insert into tb_task_user(fbid,tasktype,priority,runningstate,deep,name) "
        " select fbid,%s,priority,0,deep,name from tb_user_friends {0} ; "
        " update tb_user_friends set hasTasked = 1,taskedTime=GETDATE() {0}"
    ).format(whereclause)
    logHelper.getLogger().debug(sql)
    # One-element tuple: ``(tasktype)`` is only a parenthesised string, which
    # hands the driver a bare scalar instead of a parameter sequence.
    param = (tasktype,)
    c = self.dbinstance.ExecNonQuery(sql, param)
    logHelper.getLogger().debug(
        'Generate {0} User Task from Friends is OK!'.format(c))
    return c
def saveImg(imageURL, fileName):
    """Download ``imageURL`` and write the raw bytes to ``fileName``.

    The request carries a fixed Firefox ``User-Agent`` header. Both the
    response and the output file are closed even when reading or writing
    fails part-way.

    :param imageURL: address of the image to fetch.
    :param fileName: path of the file to (over)write in binary mode.
    :return: True on success; False if any ``Exception`` occurred (the error
        and the target file name are logged).
    """
    heads = {
        "User-Agent":
        r"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0"
    }
    try:
        req = urllib.request.Request(imageURL, headers=heads)
        with urllib.request.urlopen(req) as response:
            data = response.read()
        with open(fileName, 'wb') as f:
            f.write(data)
        return True
    except Exception as e:
        logHelper.getLogger().error(e)
        logHelper.getLogger().warning(
            "failed to save the image to file %s" % fileName)
        return False
def Save_tb_user_friends_batch(self, lstDicResult):
    """Insert a batch of friend rows into tb_user_friends, skipping known ids.

    One ``IF NOT EXISTS ... insert`` statement guarded on ``Fbid`` is
    executed through ``ExecNonQueryBatch`` with one parameter tuple per
    friend. ``hasTasked`` is stored as 0 and ``crawledTime`` comes from
    ``GETDATE()``. Each dict's ``name`` is cut to 120 characters in place.

    :param lstDicResult: iterable of dicts with the keys fbid, name,
        homepage, priority, deep and Description.
    :return: None
    """
    sql = (
        " IF NOT EXISTS(SELECT * FROM tb_user_friends WHERE Fbid = %s) "
        " BEGIN"
        " insert into tb_user_friends(Fbid,Name,Homepage,priority,crawledTime,hasTasked,deep,Description) VALUES"
        " (%s,%s,%s,%d,GETDATE(),0,%d,%s)"
        " END"
    )
    # The first fbid feeds the existence check, the rest the insert.
    value_keys = ('fbid', 'fbid', 'name', 'homepage', 'priority', 'deep',
                  'Description')
    lstParam = []
    for friend in lstDicResult:
        friend['name'] = friend['name'][:120]
        lstParam.append(tuple(friend[key] for key in value_keys))
    self.dbinstance.ExecNonQueryBatch(sql, lstParam)
    logHelper.getLogger().info("Save_tb_user_friends_batch ok.")
def getTimeFromStr(timeStr):
    """Parse text such as ``"Monday, June 5, 2017 at 3:45pm"`` into a list.

    :param timeStr: text in the format ``%A, %B %d, %Y at %H:%M`` followed
        directly by ``am`` or ``pm``.
    :return: ``[year, month, day, hour, minute, second]`` with the hour on a
        24-hour clock. Text ending in neither suffix is logged as malformed
        and the current local time is returned in the same shape.
    :raises ValueError: if the text ends in am/pm but does not match the
        format.
    """
    suffix = timeStr[-2:]
    if suffix in ("am", "pm"):
        time_struct = time.strptime(timeStr, "%A, %B %d, %Y at %H:%M" + suffix)
        tmpList = list(time_struct[0:6])
        hour = tmpList[3]
        if suffix == "pm" and hour < 12:
            # 1pm-11pm map to 13-23; 12pm is already noon and must stay 12
            # (adding 12 unconditionally produced the invalid hour 24).
            tmpList[3] = hour + 12
        elif suffix == "am" and hour == 12:
            # 12am is midnight.
            tmpList[3] = 0
    else:
        logHelper.getLogger().warning("the string's format is not right!")
        # Fall back to the current time.
        curTime = datetime.datetime.now()
        tmpList = [
            curTime.year, curTime.month, curTime.day, curTime.hour,
            curTime.minute, curTime.second
        ]
    return tmpList[0:6]
def Save_tb_user_relationship_batch(self, lstDicResult):
    """Insert a batch of relationship rows, skipping ones already stored.

    One ``IF NOT EXISTS ... insert`` statement guarded on the triple
    (fbida, fbidb, relationtype) is executed through ``ExecNonQueryBatch``
    with one parameter tuple per relationship. ``relationclass`` is always
    stored as ``'UserFriends'`` and ``crawledTime`` comes from ``GETDATE()``.

    :param lstDicResult: iterable of dicts with the keys fbida, fbidb,
        namea, nameb and relationtype.
    :return: None
    """
    sql = (
        " IF NOT EXISTS(SELECT * FROM tb_user_relationship WHERE fbida = %d AND fbidb = %d AND relationtype = %s) "
        " BEGIN"
        " insert into tb_user_relationship(fbida,fbidb,namea,nameb,relationtype,crawledTime,relationclass) VALUES"
        " (%d,%d,%s,%s,%s,GETDATE(),%s)"
        " END"
    )
    # First three values feed the existence check, the rest the insert.
    value_keys = ('fbida', 'fbidb', 'relationtype',
                  'fbida', 'fbidb', 'namea', 'nameb', 'relationtype')
    lstParam = [
        tuple(relation[key] for key in value_keys) + ('UserFriends',)
        for relation in lstDicResult
    ]
    self.dbinstance.ExecNonQueryBatch(sql, lstParam)
    logHelper.getLogger().info("Save_tb_user_relationship_batch ok.")
def login_by_cookie(browser, cookieFile, fbid):
    """Log in by loading previously saved cookies into the browser.

    All current cookies are deleted, every cookie from the JSON file is
    added for the ``.facebook.com`` domain, and the current URL is reopened
    through ``FBHelper.openUrl``. After a random 1-6 second pause the login
    state is checked with ``isLogin``.

    :param browser: driver that receives the cookies.
    :param cookieFile: path of a JSON file holding a list of cookies, each
        with ``name`` and ``value`` keys.
    :param fbid: account id handed to ``isLogin``.
    :return: the result of ``isLogin``, or False if the page could not be
        reopened.
    """
    logHelper.getLogger().info('login_url: ' + browser.current_url)
    browser.delete_all_cookies()

    with open(cookieFile, 'r', encoding='utf-8') as f:
        listCookies = json.loads(f.read())
    for saved in listCookies:
        cookie = {
            'domain': '.facebook.com',
            'name': saved["name"],
            'value': saved["value"],
            'path': '/',
            'expires': None,
        }
        browser.add_cookie(cookie)

    # Reload so the freshly added cookies take effect.
    if not FBHelper.openUrl(browser, browser.current_url):
        return False

    time.sleep(random.randint(1, 6))
    loginSuc = isLogin(browser, fbid)
    if loginSuc:
        outcome = "login by cookie successfuly!"
    else:
        outcome = "failed in login by cookie!"
    logHelper.getLogger().info(outcome)
    return loginSuc
def login_by_up(browser, myAccount, cookieFile, dictLogin):
    """Log in with user name and password through the page's login form.

    The three form elements are located with ``findLoginInput`` using the
    XPath strings in ``dictLogin``; the credentials are submitted with
    ``executeLogin`` and the outcome is checked with ``isLogin``.

    :param browser: driver showing the login page.
    :param myAccount: object providing ``u`` (user name), ``p`` (password)
        and ``fbid``.
    :param cookieFile: currently unused; kept so existing callers keep
        working.
    :param dictLogin: dict with the XPath strings ``uStr`` (user field),
        ``pStr`` (password field) and ``lStr`` (login button).
    :return: the result of ``isLogin``; False when any of the three form
        elements could not be found (previously None was returned in that
        case, so the function now always returns a bool-like value).
    """
    logger = logHelper.getLogger()
    logger.info('login_url: ' + browser.current_url)

    uStr = dictLogin['uStr']
    pStr = dictLogin['pStr']
    lStr = dictLogin['lStr']
    user_input, pwd_input, sub_btn = findLoginInput(browser, uStr, pStr, lStr)
    if user_input is None or pwd_input is None or sub_btn is None:
        logger.warning(
            'login element can not be found, please check whether the page is changed!')
        return False

    executeLogin(user_input, myAccount.u, pwd_input, myAccount.p, sub_btn)
    loginSuc = isLogin(browser, myAccount.fbid)
    if loginSuc:
        logger.info("login successfuly!")
    else:
        logger.info("failed in login!")
    return loginSuc