from getWebpage import getWebpage  # project helper: fetch a URL
from sysPath import createFile     # project helper: create/truncate an output file

try:
    from BeautifulSoup import BeautifulSoup
except ImportError:
    from bs4 import BeautifulSoup


def forum_crawl(link, outFile):
    # Crawl a paginated forum thread: pages are addressed as link + page
    # number. Stop when a fetch fails or the site keeps serving the same
    # (last) page. Text is decoded from gb2312, i.e. a Chinese-language forum.
    createFile(outFile, force=True)
    p = 1
    lastpage = ''
    while True:
        page = getWebpage(link + str(p), timeSleep=0)
        if not page or page == lastpage:
            break
        lastpage = page
        soup = BeautifulSoup(page.decode('gb2312', 'ignore'))
        fields = soup.findAll('div', {'id': "content"})
        out = open(outFile, 'a')  # one handle per page, not one per line
        for field in fields:
            # Append every non-trivial text node inside div#content.
            for line in field.findAll(text=True):
                if len(line.strip()) > 1:
                    out.write(line)
        out.close()
        p += 1
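# Hypothetical usage sketch, not from the original repo: the page URL is
# built as link + page number, so the base URL should end right before the
# page index. The forum URL and output filename below are made up.
if __name__ == '__main__':
    forum_crawl('http://bbs.example.com/thread-1234-page-', 'thread-1234.txt')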
'''
from getWebpage import getWebpage
import re
import json, time
from sysPath import createFile, sysPath

try:
    from BeautifulSoup import BeautifulSoup, SoupStrainer
except ImportError:
    from bs4 import BeautifulSoup, SoupStrainer  # bs4 was still in beta

# Facebook session cookie; requests are made as a logged-in user.
coo = 'datr=1HSWUNG14Cr81JphyUZWTl2i; lu=gAff9sJJ2_wuev5W3zxFsGZA; sub=128; p=49; c_user=1216615221; csm=2; fr=0regP7HiBNucJQa1n.AWVfvGNhos7mlakT0e52olU2aWo.BQlnT_.nT.AWVtovRV; s=Aa7LrP8dIAOi4SoX; xs=3%3ArXa_AglvHBTByg%3A2%3A1352037631; act=1356128659553%2F6%3A2; presence=EM356128936EuserFA21216615221A2EstateFDsb2F0Et2F_5b_5dElm2FnullEuct2F135610056B0EtrFA2loadA2EtwF1698182903EatF1356128697024G356128936322CEchFDp_5f1216615221F8CC; wd=1280x299'

# Friend ids previously dumped to disk (JSON with a top-level 'data' list).
f = open(sysPath('webpages/ids.txt'))
jf = json.loads(f.read().decode('utf8', 'ignore'))
f.close()

# Start the output file with a Google Contacts CSV import header.
createFile('infos_fb.txt', force=True)
g = open('infos_fb.txt', 'a')
g.write('Name,Given Name,Additional Name,Family Name,Yomi Name,Given Name Yomi,Additional Name Yomi,Family Name Yomi,Name Prefix,Name Suffix,Initials,Nickname,Short Name,Maiden Name,Birthday,Gender,Location,Billing Information,Directory Server,Mileage,Occupation,Hobby,Sensitivity,Priority,Subject,Notes,Group Membership,E-mail 1 - Type,E-mail 1 - Value,E-mail 2 - Type,E-mail 2 - Value,Phone 1 - Type,Phone 1 - Value' + '\n')
g.close()

ans = []
for f in jf['data']:
    # Fetch each friend's profile page and pull its canonical URL from the
    # <link rel="alternate"> tag.
    info = getWebpage('http://www.facebook.com/' + str(f['id']),
                      cookies=coo, info=str(f['id']))
    bI = BeautifulSoup(info)
    link = bI.find('link', {'rel': 'alternate'})
    '''
    info=getWebpage(link['href']+'/info', cookies=coo,
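# Sketch of the output step these crawlers share (not in the original code):
# once a profile's fields are parsed, a row matching the Google Contacts CSV
# header written above could be appended like this. append_contact_row and
# the example values are hypothetical.
import csv

GOOGLE_CSV_COLUMNS = 33  # field count of the header row used by both scripts

def append_contact_row(path, name='', birthday='', gender='', location=''):
    row = [''] * GOOGLE_CSV_COLUMNS
    row[0] = name        # 'Name'
    row[14] = birthday   # 'Birthday'
    row[15] = gender     # 'Gender'
    row[16] = location   # 'Location'
    with open(path, 'a') as out:
        csv.writer(out).writerow(row)

# e.g. append_contact_row('infos_fb.txt', name='Jane Doe', location='Boston')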
'''
from getWebpage import getWebpage
import re
import json, time
from sysPath import createFile

# Renren session cookie; requests are made as a logged-in user.
coo = 'anonymid=h9489u7u-yp0fqs; _r01_=1; mop_uniq_ckid=10.7.18.77_1355594994_642928755; _de=3F3126DBF672F298F26CBA88523C3AB26DEBB8C2103DE356; __utma=151146938.1762808405.1361533510.1361533510.1361533510.1; __utmz=151146938.1361533510.1.1.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); l4pager=0; depovince=GW; jebecookies=abb5a061-adf7-4276-9913-0059ed1553e6|||||; p=c506abb8c6dd441921166c4464e116341; ap=269496411; t=351ac721dd34d54a08268e46db838a211; societyguester=351ac721dd34d54a08268e46db838a211; id=269496411; xnsid=cacc7bc0; XNESSESSIONID=376bb17a6b26; at=1; loginfrom=null'

# The friend-list page embeds the full list as a JavaScript literal,
# var friends=[...]; -- pull it out with a regex and parse it as JSON.
headpage = getWebpage(link='http://friend.renren.com/myfriendlistx.do', cookies=coo)
r = re.search(r'var friends=(\[.*\]);', headpage)
friendList = r.group(1)
jf = json.loads(friendList)

ids = []
for f in jf:
    ids.append(f['id'])

# Start the output file with a Google Contacts CSV import header.
createFile('infos.txt', force=True)
g = open('infos.txt', 'a')
g.write('Name,Given Name,Additional Name,Family Name,Yomi Name,Given Name Yomi,Additional Name Yomi,Family Name Yomi,Name Prefix,Name Suffix,Initials,Nickname,Short Name,Maiden Name,Birthday,Gender,Location,Billing Information,Directory Server,Mileage,Occupation,Hobby,Sensitivity,Priority,Subject,Notes,Group Membership,E-mail 1 - Type,E-mail 1 - Value,E-mail 2 - Type,E-mail 2 - Value,Phone 1 - Type,Phone 1 - Value' + '\n')
g.close()

count = 0
for id in ids[:]:
    timeSleep = 1  # throttle: one second between profile fetches
    count += 1
    # print count
    mainInfo = getWebpage('http://www.renren.com/' + str(id) + '/profile?v=info_ajax&undefined',
                          cookies=coo,
                          referer='http://www.renren.com/' + str(id) + '/profile#pdetails',
                          timeSleep=timeSleep, info=str(id))
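# Self-contained sketch of the friend-list extraction step above, runnable
# without a live session: sample_page stands in for the myfriendlistx.do
# response and is illustrative, not real Renren markup.
import re
import json

sample_page = 'var pager = 1; var friends=[{"id": 269496411, "name": "example"}];'
m = re.search(r'var friends=(\[.*\]);', sample_page)
if m:
    friends = json.loads(m.group(1))
    print([f['id'] for f in friends])  # -> [269496411]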