Esempio n. 1
0
class Tupian():
    def __init__(self):
        self.base_url = u"http://www.39eh.com"
        self.db = Db()

    def run_spider(self, url, type):
        tool = Tools()
        soup = tool.gethtml(url)
        cid = self.get_category(soup, type)

        box_list = soup.select(".zxlist ul")
        self.get_list_con(box_list, cid)

    def get_category(self, soup, type):
        # 建立分类
        catgory_tite = soup.title.string
        try:
            self.db.execute(
                "INSERT INTO category(`catgory`,`type`) VALUES(%s,%s)",
                (catgory_tite, type))
            cid = self.db.last_row_id()
        except:
            traceback.print_exc()
            self.db.rollback()
        else:
            self.db.commit()
        return cid

    def get_list_con(self, box_list, cid):
        for i in range(0, len(box_list)):
            time.sleep(3)
            title = box_list[i].select("a")[0].string
            print title
            date = box_list[i].select("li.zxsyd")[0].string
            href = self.base_url + box_list[i].select("a")[0].attrs['href']
            content = self.get_detail(href)
            create_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            try:
                self.db.execute(
                    "INSERT INTO tupian (`cid`,`title`,`date`,`content`,`create_time`) VALUES(%s,%s,%s,%s,%s)",
                    (cid, title, date, content, create_time))
            except:
                traceback.print_exc()
                self.db.rollback()
            else:
                self.db.commit()

    def get_detail(self, url):
        tool = Tools()
        soup = tool.gethtml(url)
        content = soup.select(".temp23")[0]
        return content
Esempio n. 2
0
sql = 'INSERT INTO %s(%s) VALUES (%s)' % (self.enclose_sys(table), ','.join(cols), ','.join(['%s'] * len(vals)))

query = 'LOAD DATA INFILE (%s) INTO (%s) FIELDS TERMINATED BY ',' ENCLOSED BY '"' LINES TERMINATED BY '\r\n' IGNORE 1 LINES;' %(cfg['op_file'], ','.join(cfg['tbl_name'])),' FIELDS TERMINATED BY ',' ENCLOSED BY '"' LINES TERMINATED BY '\r\n' IGNORE 1 LINES;'

with open(cfg['op_file'], 'r') as f:
    reader = csv.reader(f)
    next(reader)  # Skip the header row.
    for row in reader:
        print(row)
        db.insert(cfg['tbl_name'], row)
        print("Row completed " , row)
        #next(reader)


#close the connection to the database.
db.commit()
db.disconnect()
print "Done"


'''
print ("Testing")
db = Db(username='******', password='******', database='ref_data', driver='mysql')
ret = []
ret = db.select(table='geo_info', columns='*')


print(len(ret))

'''