Code Example #1
File: combine.py  Project: gaoyunzhi/crawling_toolkit
import os
import codecs
# folderPath and sysPath are assumed to be path helpers from this same project.

def combineTXT(folder, des='combine.txt'):
    folder = folderPath(folder)
    des = sysPath(des)
    # concatenate every .txt file under folder into one GB18030-encoded file
    g = codecs.open(des, 'w', 'gb18030')
    for root, dirs, files in os.walk(folder):
        for f in files:
            if f.endswith('.txt'):
                add = sysPath(root + '/' + f)
                fi = codecs.open(add, 'r', 'gb18030')
                g.write(fi.read())
                fi.close()
    g.close()
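A minimal usage sketch, assuming folderPath and sysPath are the project's own path helpers; the directory name and output filename below are hypothetical:

# Hypothetical call: merge every .txt under "novels" into one GB18030-encoded file.
combineTXT('novels', des='novels_combined.txt')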
Code Example #2
import os
import datetime
try:
    from BeautifulSoup import BeautifulSoup
except ImportError:
    from bs4 import BeautifulSoup
# sysPath, getRevId, getWebpage, numImage, contentLen and getTraffic are assumed
# to be helpers from this project's own modules (they are not standard library).

def genTable(filename='../../testData/testingMonuments.txt',
             outfname='../../testData/testingMonumentsData_week4_all.csv',
             months=None, yearBegin=2009, yearEnd=2015, silent=True, endLine=None,
             testNow=False, country='en'):
    now = datetime.datetime.now()
    now = (now.year, now.month)
    if months is None:
        # build 'YYYYMM' strings from yearBegin up to (at most) the current month
        months = []
        for year in range(yearBegin, yearEnd):
            for month in range(1, 13):
                if (year, month) >= now: break
                months.append(str(year) + '0' * (2 - len(str(month))) + str(month))
    months = map(str, months)  # Python 2 map() returns a list, so months can be reused
    filename = sysPath(filename)
    f = open(filename, 'r')
    links = f.read().splitlines()
    f.close()
    #soup=BeautifulSoup(links)
    titleLine = ['linkTitle']
    for month in months:
        titleLine.append('Img' + month)
        titleLine.append('Content' + month)
        titleLine.append('Traffic' + month)
    if not os.path.exists(outfname):
        # fresh output file: write the header row and start from the first link
        outf = open(outfname, 'w')
        outf.write('\t'.join(titleLine) + '\n')
        start = 0
        outf.close()
    else:
        # output file already exists: resume after the rows written so far
        outf = open(outfname, 'r')
        start = len(outf.read().splitlines())
        outf.close()
    count = 0
##    for field in soup.findAll('a')[:endLine]:
    for linkTitle in links:
        # keep only the article title if the input line is a full /wiki/ URL
        index = linkTitle.find('/wiki/')
        if index != -1:
            linkTitle = linkTitle[index + 6:]
        count += 1
        if count < start: continue
##        if not field.has_key('title'): continue
##        linkTitle=field['href'][6:]
##        officialTitle=field['title']
        curLine = [linkTitle]
        for month in months:
            date = month + '01'
            # six trailing zeros stand for hh, mm, ss
            revId = getRevId(linkTitle, date + '000000', silent=silent, country=country)
            if not silent: print 'revId=', revId
            if revId is None:
                curLine += ['', '', '']
                continue
            link = 'http://' + country + '.wikipedia.org/w/index.php?oldid=' + revId
            if testNow: print 'title=', linkTitle, 'link=', link, 'month=', month
            if not silent: print 'prepare'
            page = getWebpage(link, timeSleep=0.5, silent=silent)
            if not silent: print 'got page'
            soup = BeautifulSoup(page)
            if not silent: print 'got soup'
            numImg = numImage(soup)
            if not silent: print 'got num'
            conLen = contentLen(soup)
            if not silent: print 'got len'
            traffic = str(getTraffic(linkTitle, month, silent=silent, country=country))
            if not silent: print 'got history'
            curLine += [numImg, conLen, traffic]
        curLine = map(str, curLine)
        # append the finished row right away so an interrupted run can resume later
        outf = open(outfname, 'a')
        outf.write('\t'.join(curLine) + '\n')
        outf.close()
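A hedged usage sketch for genTable, assuming the project helpers imported above are available; the year range is an arbitrary example and the default monument list is used:

# Collect image count, content length and traffic per month for every monument
# article in the default input file, writing one tab-separated row per article.
genTable(yearBegin=2013, yearEnd=2015, silent=False, country='en')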
Code Example #3
'''
This program still needs a wait time added between requests; without one it may
cause problems with your Renren id.
'''
from getWebpage import getWebpage
import re
import json, time
from sysPath import createFile, sysPath
try:
    from BeautifulSoup import BeautifulSoup, SoupStrainer
except ImportError:
    from bs4 import BeautifulSoup, SoupStrainer  # beta version of bs

coo='datr=1HSWUNG14Cr81JphyUZWTl2i; lu=gAff9sJJ2_wuev5W3zxFsGZA; sub=128; p=49; c_user=1216615221; csm=2; fr=0regP7HiBNucJQa1n.AWVfvGNhos7mlakT0e52olU2aWo.BQlnT_.nT.AWVtovRV; s=Aa7LrP8dIAOi4SoX; xs=3%3ArXa_AglvHBTByg%3A2%3A1352037631; act=1356128659553%2F6%3A2; presence=EM356128936EuserFA21216615221A2EstateFDsb2F0Et2F_5b_5dElm2FnullEuct2F135610056B0EtrFA2loadA2EtwF1698182903EatF1356128697024G356128936322CEchFDp_5f1216615221F8CC; wd=1280x299'
# ids.txt holds a JSON object whose 'data' list carries the profile ids to fetch
f = open(sysPath('webpages/ids.txt'))
jf = json.loads(f.read().decode('utf8', 'ignore'))
f.close()

# start a fresh CSV and write the contact-export header row
createFile('infos_fb.txt', force=True)
g = open('infos_fb.txt', 'a')
g.write('Name,Given Name,Additional Name,Family Name,Yomi Name,Given Name Yomi,Additional Name Yomi,Family Name Yomi,Name Prefix,Name Suffix,Initials,Nickname,Short Name,Maiden Name,Birthday,Gender,Location,Billing Information,Directory Server,Mileage,Occupation,Hobby,Sensitivity,Priority,Subject,Notes,Group Membership,E-mail 1 - Type,E-mail 1 - Value,E-mail 2 - Type,E-mail 2 - Value,Phone 1 - Type,Phone 1 - Value' + '\n')
g.close()

ans = []
for f in jf['data']:
    # fetch each profile page with the saved cookie string
    info = getWebpage('http://www.facebook.com/' + str(f['id']),
                      cookies=coo,
                      info=str(f['id']))
    bI = BeautifulSoup(info)
    # grab the <link rel="alternate"> element from the returned profile page
    link = bI.find('link', {'rel': 'alternate'})
    '''
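The module docstring says a wait time still needs to be added between requests. A minimal sketch of that change, assuming the otherwise unused time import is meant for exactly this; the 2-second delay is an arbitrary example value:

for f in jf['data']:
    info = getWebpage('http://www.facebook.com/' + str(f['id']),
                      cookies=coo,
                      info=str(f['id']))
    time.sleep(2)  # arbitrary pause so consecutive profile fetches are throttled
    bI = BeautifulSoup(info)
    link = bI.find('link', {'rel': 'alternate'})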