import os
import codecs

# folderPath and sysPath are this repo's path helpers; importing both from
# the sysPath module is an assumption (only sysPath's home is shown elsewhere)
from sysPath import folderPath, sysPath


def combineTXT(folder, des='combine.txt'):
    """Concatenate every .txt file under `folder` into one gb18030 file."""
    folder = folderPath(folder)
    des = sysPath(des)
    g = codecs.open(des, 'w', 'gb18030')
    for root, dirs, files in os.walk(folder):
        for f in files:
            if f.endswith('.txt'):
                add = sysPath(root + '/' + f)
                fi = codecs.open(add, 'r', 'gb18030')
                g.write(fi.read())
                fi.close()
    g.close()
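# combineTXT leans on two helpers from the repo's sysPath module that are
# not reproduced in this excerpt. A minimal stand-in, assuming folderPath
# only guarantees a trailing slash and sysPath only swaps '/' for the host
# separator, might look like this (both behaviors are guesses from the
# call sites above, not the repo's actual implementation):

import os

def folderPath(p):
    # assumed behavior: ensure the directory path ends with a separator
    return p if p.endswith('/') else p + '/'

def sysPath(p):
    # assumed behavior: convert '/'-separated paths to the host convention
    return p.replace('/', os.sep)

# usage: merge every .txt under novels/ into a single file
# combineTXT('novels', des='all_novels.txt')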
import os
import datetime

try:
    from BeautifulSoup import BeautifulSoup
except ImportError:
    from bs4 import BeautifulSoup

from getWebpage import getWebpage
from sysPath import sysPath
# getRevId, numImage, contentLen and getTraffic are repo helpers whose
# import lines are not part of this excerpt


def genTable(filename='../../testData/testingMonuments.txt',
             outfname='../../testData/testingMonumentsData_week4_all.csv',
             months=None, yearBegin=2009, yearEnd=2015, silent=True,
             endLine=None, testNow=False, country='en'):
    now = datetime.datetime.now()
    now = (int(now.year), int(now.month))
    # default: every complete month from yearBegin up to (but not past) now
    if months is None:
        months = []
        for year in range(yearBegin, yearEnd):
            for month in range(1, 13):
                if (year, month) >= now:
                    break
                months.append('%d%02d' % (year, month))
    months = map(str, months)

    filename = sysPath(filename)
    f = open(filename, 'r')
    links = f.read().splitlines()
    f.close()
    # soup = BeautifulSoup(links)

    # header: one (Img, Content, Traffic) column triple per month;
    # note the .csv output is actually tab-delimited
    titleLine = ['linkTitle']
    for month in months:
        titleLine.append('Img' + month)
        titleLine.append('Content' + month)
        titleLine.append('Traffic' + month)

    # resume support: if the output already exists, count its rows so a
    # rerun skips the articles processed before the interruption
    if not os.path.exists(outfname):
        outf = open(outfname, 'w')
        outf.write('\t'.join(titleLine) + '\n')
        start = 0
        outf.close()
    else:
        outf = open(outfname, 'r')
        start = len(outf.read().splitlines())
        outf.close()

    count = 0
    ## for field in soup.findAll('a')[:endLine]:
    for linkTitle in links:
        index = linkTitle.find('/wiki/')
        if index != -1:
            linkTitle = linkTitle[index + 6:]
        count += 1
        if count < start:
            continue
        ## if not field.has_key('title'): continue
        ## linkTitle = field['href'][6:]
        ## officialTitle = field['title']
        curLine = [linkTitle]
        for month in months:
            date = month + '01'
            # six trailing zeros stand in for hour, minute, second
            revId = getRevId(linkTitle, date + '000000', silent=silent,
                             country=country)
            if not silent: print 'revId=', revId
            if revId is None:
                # the article did not exist that month
                curLine += ['', '', '']
                continue
            link = ('http://' + country +
                    '.wikipedia.org/w/index.php?oldid=' + revId)
            if testNow:
                print 'title=', linkTitle, 'link=', link, 'month=', month
            if not silent: print 'prepare'
            page = getWebpage(link, timeSleep=0.5, silent=silent)
            if not silent: print 'got page'
            soup = BeautifulSoup(page)
            if not silent: print 'got soup'
            numImg = numImage(soup)
            if not silent: print 'got num'
            conLen = contentLen(soup)
            if not silent: print 'got len'
            traffic = str(getTraffic(linkTitle, month, silent=silent,
                                     country=country))
            if not silent: print 'got history'
            curLine += [numImg, conLen, traffic]
        curLine = map(str, curLine)
        # append and close per article so a crash loses at most one row
        outf = open(outfname, 'a')
        outf.write('\t'.join(curLine) + '\n')
        outf.close()
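# getRevId, numImage, contentLen and getTraffic are defined elsewhere in
# the repo. As an illustration only: a lookup like getRevId can be built
# on the public MediaWiki API, which, queried with rvdir=older, returns
# the newest revision at or before the given timestamp. Everything below
# (names included) is a sketch under that assumption, not the repo's code.

import json
import urllib

def getRevId_sketch(title, timestamp, silent=True, country='en'):
    """Newest revision id at or before `timestamp` (YYYYMMDDHHMMSS)."""
    url = ('http://' + country + '.wikipedia.org/w/api.php'
           '?action=query&prop=revisions&rvprop=ids&rvlimit=1'
           '&rvdir=older&format=json'
           '&rvstart=' + timestamp +
           '&titles=' + urllib.quote(title))
    data = json.loads(urllib.urlopen(url).read())
    page = data['query']['pages'].values()[0]
    if 'revisions' not in page:
        return None  # the article did not exist yet at that date
    return str(page['revisions'][0]['revid'])

def numImage_sketch(soup):
    # plausible reading of numImage: count <img> tags in the revision
    return len(soup.findAll('img'))

def contentLen_sketch(soup):
    # plausible reading of contentLen: length of the page's visible text
    return len(soup.getText())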
'''
this program needs to add wait time, may cause problem with your renren id
'''
from getWebpage import getWebpage
import re
import json, time
from sysPath import createFile, sysPath

try:
    from BeautifulSoup import BeautifulSoup, SoupStrainer
except ImportError:
    from bs4 import BeautifulSoup, SoupStrainer  # beta version of bs

# session cookie copied from a logged-in browser
coo = 'datr=1HSWUNG14Cr81JphyUZWTl2i; lu=gAff9sJJ2_wuev5W3zxFsGZA; sub=128; p=49; c_user=1216615221; csm=2; fr=0regP7HiBNucJQa1n.AWVfvGNhos7mlakT0e52olU2aWo.BQlnT_.nT.AWVtovRV; s=Aa7LrP8dIAOi4SoX; xs=3%3ArXa_AglvHBTByg%3A2%3A1352037631; act=1356128659553%2F6%3A2; presence=EM356128936EuserFA21216615221A2EstateFDsb2F0Et2F_5b_5dElm2FnullEuct2F135610056B0EtrFA2loadA2EtwF1698182903EatF1356128697024G356128936322CEchFDp_5f1216615221F8CC; wd=1280x299'

f = open(sysPath('webpages/ids.txt'))
jf = json.loads(f.read().decode('utf8', 'ignore'))
f.close()

# write a Google Contacts style CSV header
createFile('infos_fb.txt', force=True)
g = open('infos_fb.txt', 'a')
g.write('Name,Given Name,Additional Name,Family Name,Yomi Name,Given Name Yomi,Additional Name Yomi,Family Name Yomi,Name Prefix,Name Suffix,Initials,Nickname,Short Name,Maiden Name,Birthday,Gender,Location,Billing Information,Directory Server,Mileage,Occupation,Hobby,Sensitivity,Priority,Subject,Notes,Group Membership,E-mail 1 - Type,E-mail 1 - Value,E-mail 2 - Type,E-mail 2 - Value,Phone 1 - Type,Phone 1 - Value' + '\n')
g.close()

ans = []
for f in jf['data']:
    # fetch each friend's profile page with the saved session cookie
    info = getWebpage('http://www.facebook.com/' + str(f['id']),
                      cookies=coo, info=str(f['id']))
    bI = BeautifulSoup(info)
    link = bI.find('link', {'rel': 'alternate'})
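# The header comment above says the script still needs a wait time between
# requests. One way to add it is a small wrapper around getWebpage that
# sleeps after each fetch; politeGet and its parameters are hypothetical,
# a sketch rather than the repo's fix (getWebpage's own timeSleep
# parameter, seen in genTable, may be the intended mechanism instead).

import random
import time

from getWebpage import getWebpage

def politeGet(url, cookies, info, minWait=2.0, maxWait=5.0):
    # fetch a page, then pause a random 2-5 s so consecutive requests
    # do not hammer the server or trip rate limits on the account
    page = getWebpage(url, cookies=cookies, info=info)
    time.sleep(minWait + (maxWait - minWait) * random.random())
    return page

# usage inside the loop above:
# info = politeGet('http://www.facebook.com/' + str(f['id']),
#                  cookies=coo, info=str(f['id']))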