def __init__(self):
    """Open the CSV output file and create a database session.

    Fixed: the Python 2 ``csv`` module requires files opened with the
    ``'b'`` flag ("wb"), otherwise spurious blank rows appear on some
    platforms.  The file object is also kept on ``self`` so it can be
    closed later instead of leaking the handle.
    """
    self.csvfile = open("books.csv", "wb")
    self.newcsv = csv.writer(self.csvfile)
    # Engine/metadata setup is handled elsewhere; only a session is needed here.
    self.session = Session()
class InfibeamSpider(BaseSpider):
    """Looks up each book of the latest round on infibeam.com and yields a
    Book item with the first matching title and price."""

    name = "InfibeamSpider"
    # Fixed: allowed_domains entries must be bare domain names, not URLs --
    # "http://infibeam.com" can never match and breaks offsite filtering.
    allowed_domains = ["infibeam.com", "www.infibeam.com"]

    def __init__(self):
        self.session = Session()

    def start_requests(self):
        """Build one search request per book stored for the newest round."""
        round_info = self.session.query(Meta).order_by(desc(Meta.round)).first()
        for row in self.session.query(Books).filter(Books.round == round_info.round):
            # (removed an unused hard-coded eBay URL that was never requested)
            url = "http://www.infibeam.com/search?q=" + row.title + " ," + row.author
            yield Request(url, callback=self.parse_lookup, encoding='utf-8')

    def parse_lookup(self, response):
        """Scrape the first search result's title and price into a Book item.

        Fixed: guard against pages with no results -- the original indexed
        ``extract()[0]`` unconditionally and raised IndexError.
        """
        hxs = HtmlXPathSelector(response)
        titles = hxs.select('//div[@id="bd"]/div[@id="yui-main"]/div[@class="yui-b"]/div[@class="yui-g"]/div[@id="search_result"]/ul[@class="search_result"]/li/span[@class="title"]/h2[@class="simple"]/a/text()').extract()
        prices = hxs.select('//div[@id="bd"]/div[@id="yui-main"]/div[@class="yui-b"]/div[@class="yui-g"]/div[@id="search_result"]/ul[@class="search_result"]/li/div[@class="price"]/b/text()').extract()
        if not titles or not prices:
            # No search hit for this book -- nothing to emit.
            return None
        item = Book()
        item['title'] = titles[0]
        item['author'] = "None"
        item['price'] = prices[0]
        return item
class InfibeamSpider(BaseSpider):
    """Searches infibeam.com for every book of the most recent round and
    returns the first result's title and price as a Book item."""

    name = "InfibeamSpider"
    allowed_domains = ["http://infibeam.com", "www.infibeam.com"]

    # XPath constants for the search-result page.
    _TITLE_XPATH = '//div[@id="bd"]/div[@id="yui-main"]/div[@class="yui-b"]/div[@class="yui-g"]/div[@id="search_result"]/ul[@class="search_result"]/li/span[@class="title"]/h2[@class="simple"]/a/text()'
    _PRICE_XPATH = '//div[@id="bd"]/div[@id="yui-main"]/div[@class="yui-b"]/div[@class="yui-g"]/div[@id="search_result"]/ul[@class="search_result"]/li/div[@class="price"]/b/text()'

    def __init__(self):
        self.session = Session()

    def start_requests(self):
        """Yield one infibeam search request per stored book."""
        latest = self.session.query(Meta).order_by(desc(Meta.round)).first()
        book_rows = self.session.query(Books).filter(Books.round == latest.round)
        for book in book_rows:
            ebayurl = "http://www.ebay.in/sch/i.html?_from=R40&_npmv=3&_trksid=m570&_nkw=Anupam Kher&_sacat=267"
            search_url = "http://www.infibeam.com/search?q=" + book.title + " ," + book.author
            yield Request(search_url, callback=self.parse_lookup, encoding='utf-8')

    def parse_lookup(self, response):
        """Extract title and price of the first search hit into a Book item."""
        selector = HtmlXPathSelector(response)
        first_title = selector.select(self._TITLE_XPATH).extract()[0]
        first_price = selector.select(self._PRICE_XPATH).extract()[0]
        book_item = Book()
        book_item['title'] = first_title
        book_item['author'] = "None"
        book_item['price'] = first_price
        return book_item
def __init__(self):
    """Prepare the CSV output writer and a fresh database session."""
    output_file = open("books.csv", "w")
    self.newcsv = csv.writer(output_file)
    # Engine creation / table setup happens elsewhere in the project.
    self.session = Session()
class FlipkartTrendsPipeline(object):
    """Stores Book items scraped by FlipkartSpider into the Books table."""

    def __init__(self):
        # Fixed: Python 2 csv requires binary mode ("wb"); keep the file
        # object so the handle is not leaked irretrievably.
        self.csvfile = open("books.csv", "wb")
        self.newcsv = csv.writer(self.csvfile)
        self.session = Session()

    def process_item(self, item, spider):
        """Persist *item* if it came from FlipkartSpider; pass it through otherwise.

        Fixed: collapsed the redundant if/else (both branches returned the
        item) into a guard clause.
        """
        use_for = ['FlipkartSpider']
        if spider.name not in use_for:
            return item
        log.msg(item['title'], level=log.DEBUG)
        # Latest round number drives the Books row's round column.
        round_number = self.session.query(Meta).order_by(desc(Meta.round)).first()
        # NOTE(review): price is assumed to look like "Rs. <x> <amount>" --
        # index 2 of the split -- confirm against the spider's output.
        book = Books(unicode(round_number.round),
                     unicode(item['title'][0]),
                     unicode(item['author'][0]),
                     flipkart=unicode(item['price'][0].split(' ')[2]))
        self.session.add(book)
        self.session.commit()
        self.session.flush()
        return item
class FlipkartTrendsPipeline(object):
    """Pipeline that writes FlipkartSpider items into the Books table."""

    def __init__(self):
        self.newcsv = csv.writer(open("books.csv", "w"))
        # DB engine / metadata bootstrap lives elsewhere; just open a session.
        self.session = Session()

    def process_item(self, item, spider):
        """Save the item for FlipkartSpider; other spiders pass through untouched."""
        handled_spiders = ['FlipkartSpider']
        if spider.name in handled_spiders:
            log.msg(item['title'], level=log.DEBUG)
            latest = self.session.query(Meta).order_by(desc(Meta.round)).first()
            price_text = item['price'][0].split(' ')[2]
            record = Books(unicode(latest.round),
                           unicode(item['title'][0]),
                           unicode(item['author'][0]),
                           flipkart=unicode(price_text))
            self.session.add(record)
            self.session.commit()
            self.session.flush()
        return item
def __init__(self): #self.engine = create_engine('mysql connection') #self.Session = sessionmaker(bind=self.engine) #Base.metadata.create_all(self.engine) self.session = Session() try: round_info = self.session.query(Meta).order_by(desc(Meta.round)).first() print round_info new_round = Meta(round_info.round+1) self.session.add(new_round) self.session.commit() self.session.flush() self.session.close() except: new_round = Meta(0) self.session.add(new_round) self.session.commit() self.session.flush() self.session.close()
# Open a fresh database session for this instance's lifetime.
def __init__(self): self.session = Session()
class FlipkartSpider(BaseSpider): name = "FlipkartSpider" allowed_domains = ["http://flipkart.com","www.flipkart.com"] start_urls = [ 'http://www.flipkart.com/view-books/0/new-releases' ] def __init__(self): #self.engine = create_engine('mysql connection') #self.Session = sessionmaker(bind=self.engine) #Base.metadata.create_all(self.engine) self.session = Session() try: round_info = self.session.query(Meta).order_by(desc(Meta.round)).first() print round_info new_round = Meta(round_info.round+1) self.session.add(new_round) self.session.commit() self.session.flush() self.session.close() except: new_round = Meta(0) self.session.add(new_round) self.session.commit() self.session.flush() self.session.close() def parse(self, response): #filename = response.url.split("/")[-2] #open(filename, 'wb').write(response.body) hxs = HtmlXPathSelector(response) #hxs.select('//div[@class="line bmargin10"]/h2[@class="fk-srch-item-title fksd-bodytext"]/a/text()').extract() sites = hxs.select('//div[@class="fk-srch-item fk-inf-scroll-item"]') #sites = hxs.select('//div[@class="lastUnit"]/div[@id="search_results"]') items=[] print sites.__len__() for site in sites: #print site item = Book() item['title']= site.select('div[@class="line fksd-bodytext "]/div[@class="line bmargin10"]/h2[@class="fk-srch-item-title fksd-bodytext"]/a/text()').extract() item['author'] = site.select('div[@class="line fksd-bodytext "]/div[@class="line bmargin10"]/span[@class="fk-item-authorinfo-text fksd-smalltext"]/a/text()').extract() item['price'] = site.select('div[@class="line fksd-bodytext "]/div[@class="unit fk-sitem-info-section"]/div[@class="line fk-itemdetail-info fksd-bodytext"]/div[@class="line dlvry-det"]/div[@class="line fk-srch-pricing fksd-smalltext"]/b[@class="fksd-bodytext price final-price"]/text()').extract() items.append(item) #print item return items