Example #1
import codecs
import os
# folderPath and sysPath are the project's local path helpers; folderPath is
# assumed to live in the same sysPath module imported elsewhere in these examples.
from sysPath import folderPath, sysPath

def combineTXT(folder, des='combine.txt'):
    """Concatenate every .txt file under folder into one gb18030-encoded file."""
    folder = folderPath(folder)
    des = sysPath(des)
    g = codecs.open(des, 'w', 'gb18030')
    for root, dirs, files in os.walk(folder):
        for f in files:
            if f.endswith('.txt'):
                add = sysPath(root + '/' + f)
                if add == des:
                    continue  # skip the half-written output file itself
                fi = codecs.open(add, 'r', 'gb18030')
                g.write(fi.read())
                fi.close()  # the original never closed its input files
    g.close()
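
# A minimal usage sketch; 'notes' and 'all_notes.txt' are placeholders and
# assume sysPath resolves relative paths against the working directory.
combineTXT('notes', des='all_notes.txt')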
Example #2
import datetime
import os

from getWebpage import getWebpage
from sysPath import sysPath
try:
    from BeautifulSoup import BeautifulSoup
except ImportError:
    from bs4 import BeautifulSoup
# getRevId, numImage, contentLen and getTraffic are project helpers whose home
# modules are not shown here, so no import is guessed for them.

def genTable(filename='../../testData/testingMonuments.txt',
             outfname='../../testData/testingMonumentsData_week4_all.csv',
             months=None, yearBegin=2009, yearEnd=2015, silent=True,
             endLine=None, testNow=False, country='en'):
    now = datetime.datetime.now()
    now = (int(now.year), int(now.month))
    if months is None:
        # Build 'YYYYMM' strings from yearBegin up to (but excluding) the
        # current month.
        months = []
        for year in range(yearBegin, yearEnd):
            for month in range(1, 13):
                if (year, month) >= now:
                    break
                months.append('%d%02d' % (year, month))
    months = map(str, months)
    filename = sysPath(filename)
    f = open(filename, 'r')
    links = f.read().splitlines()
    f.close()
    titleLine = ['linkTitle']
    for month in months:
        titleLine.append('Img' + month)
        titleLine.append('Content' + month)
        titleLine.append('Traffic' + month)
    if not os.path.exists(outfname):
        outf = open(outfname, 'w')
        outf.write('\t'.join(titleLine) + '\n')
        start = 0
        outf.close()
    else:
        # Resume an interrupted run: skip as many links as there are lines
        # (header included) already in the output file.
        outf = open(outfname, 'r')
        start = len(outf.read().splitlines())
        outf.close()
    count = 0
    for linkTitle in links[:endLine]:  # endLine optionally caps the run
        index = linkTitle.find('/wiki/')
        if index != -1:
            linkTitle = linkTitle[index + 6:]  # keep only the page title
        count += 1
        if count < start:
            continue
        curLine = [linkTitle]
        for month in months:
            date = month + '01'
            # The six trailing zeros stand for hh, mm, ss.
            revId = getRevId(linkTitle, date + '000000', silent=silent,
                             country=country)
            if not silent: print 'revId=', revId
            if revId is None:
                curLine += ['', '', '']
                continue
            link = ('http://' + country + '.wikipedia.org/w/index.php?oldid='
                    + revId)
            if testNow: print 'title=', linkTitle, 'link=', link, 'month=', month
            if not silent: print 'prepare'
            page = getWebpage(link, timeSleep=0.5, silent=silent)
            if not silent: print 'got page'
            soup = BeautifulSoup(page)
            if not silent: print 'got soup'
            numImg = numImage(soup)
            if not silent: print 'got num'
            conLen = contentLen(soup)
            if not silent: print 'got len'
            traffic = str(getTraffic(linkTitle, month, silent=silent,
                                     country=country))
            if not silent: print 'got history'
            curLine += [numImg, conLen, traffic]
        curLine = map(str, curLine)
        # Append one row per link so an interrupted run loses at most one row.
        outf = open(outfname, 'a')
        outf.write('\t'.join(curLine) + '\n')
        outf.close()
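
# A hedged invocation sketch: both file names are placeholders, and the input
# file is assumed to hold one '/wiki/' link or bare page title per line,
# matching how the loop above parses it.
genTable(filename='monuments.txt',
         outfname='monumentsData.csv',
         yearBegin=2012, yearEnd=2014,
         silent=False, endLine=10,  # verbose trial run over the first 10 links
         country='en')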
'''
This script needs a wait time between its requests; hammering the server may
cause problems with your renren id.
'''
from getWebpage import getWebpage
import re
import json, time
from sysPath import createFile, sysPath
try:
    from BeautifulSoup import BeautifulSoup, SoupStrainer
except ImportError:  # the original bare except also swallowed real errors
    from bs4 import BeautifulSoup, SoupStrainer  # fall back to bs4

# A captured Facebook session cookie; replace it with your own before running.
coo = 'datr=1HSWUNG14Cr81JphyUZWTl2i; lu=gAff9sJJ2_wuev5W3zxFsGZA; sub=128; p=49; c_user=1216615221; csm=2; fr=0regP7HiBNucJQa1n.AWVfvGNhos7mlakT0e52olU2aWo.BQlnT_.nT.AWVtovRV; s=Aa7LrP8dIAOi4SoX; xs=3%3ArXa_AglvHBTByg%3A2%3A1352037631; act=1356128659553%2F6%3A2; presence=EM356128936EuserFA21216615221A2EstateFDsb2F0Et2F_5b_5dElm2FnullEuct2F135610056B0EtrFA2loadA2EtwF1698182903EatF1356128697024G356128936322CEchFDp_5f1216615221F8CC; wd=1280x299'
# ids.txt is expected to hold JSON with a 'data' list of {'id': ...} records,
# matching how it is read in the loop below.
f = open(sysPath('webpages/ids.txt'))
jf = json.loads(f.read().decode('utf8', 'ignore'))
f.close()

createFile('infos_fb.txt', force=True)
g = open('infos_fb.txt', 'a')
# Header row in Google Contacts CSV-import format.
g.write(
    'Name,Given Name,Additional Name,Family Name,Yomi Name,Given Name Yomi,Additional Name Yomi,Family Name Yomi,Name Prefix,Name Suffix,Initials,Nickname,Short Name,Maiden Name,Birthday,Gender,Location,Billing Information,Directory Server,Mileage,Occupation,Hobby,Sensitivity,Priority,Subject,Notes,Group Membership,E-mail 1 - Type,E-mail 1 - Value,E-mail 2 - Type,E-mail 2 - Value,Phone 1 - Type,Phone 1 - Value'
    + '\n')
g.close()

ans = []
for friend in jf['data']:  # renamed from f, which shadowed the file handle above
    time.sleep(1.0)  # the note at the top asks for a wait between requests
    info = getWebpage('http://www.facebook.com/' + str(friend['id']),
                      cookies=coo,
                      info=str(friend['id']))
    bI = BeautifulSoup(info)
    link = bI.find('link', {'rel': 'alternate'})
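
# Hypothetical follow-up, not part of the original script, showing what the
# tag found above carries; the None guard and both fields are assumptions.
if link is not None:
    profileUrl = link['href']        # canonical http://www.facebook.com/<username> URL
    name = bI.find('title').string   # page title, roughly the person's name
    print profileUrl, name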