Ejemplo n.º 1
0
def run():
    user_agents = ldUserAgents("./UserAgentString.json")
    bookinfos = loadMatrixFromFile("./kaijuannodump.csv")
    # bookinfos = loadMatrixFromFile(sys.argv[1])
    # newbookinfos = []
    sellers = ["亚马逊", "京东", "当当", "北发", "淘书", "博库", "文轩", "中国图书", "China-pub"]

    for index in range(1, len(bookinfos)):
        if isAready(bookinfos[index][0]):
            print "INFO: Already processed"
            continue

        print "INFO: processing", bookinfos[index][0],

        b = BookAPI(bookinfos[index][0], choice(user_agents))
        JsonData = json.loads(b.api())
        if JsonData["error"]:
            appendstr2file(JsonData["isbn"], "error.log")
            print "ERROR: got an network error "
            sleep(1)
            continue

        tmpLst = bookinfos[index]

        if JsonData["total"] == 0:
            for seller in sellers:
                tmpLst.append("None")
            print "WARNING: no data"
            sleep(1)
            continue

        for seller in sellers:
            tmpLst.append("None")
            for key in JsonData["data"]:
                if seller.decode("UTF-8") in key:
                    del tmpLst[-1]
                    tmpLst.append(JsonData["data"][key])
                    break
        # newbookinfos.append(tmpLst)
        appendlst2file(lstUtf8(tmpLst), "newkaijuan.csv")
        appendstr2file(bookinfos[index][0], "visited.csv")

        print "Done"
        sleep(1)
#!/usr/bin/env python
#
# Author: Archer Reilly
# Date: 15/Sep/2015
# File: dedumplicate.py
# Desc: remove the duplicate records from kaijuan.csv (output: kaijuannodump.csv)
#
# Produced By BR(BeautifulReading)
from util import loadMatrixFromFile, saveMatrixToFile

mat = loadMatrixFromFile('./kaijuan.csv')
newmat = []
tmpLst = []

for row in mat:
    if row[0] in tmpLst:
        print 'INFO: dumplicate ', row[0]
    else:
        newmat.append(row)
        tmpLst.append(row[0])

saveMatrixToFile('kaijuannodump.csv', newmat)