def get_fk_single_result(query=None):
    """Fetch the Flipkart product page for a single ISBN-style query.

    Returns ('failure', None) when the page cannot be fetched, otherwise
    ('success', [Bookinfo]) with exactly one entry.
    """
    bookinfolist = []
    link = flipkart_link + urllib.urlencode({'query':query})
    status, soup = msutils.get_soup(link, retry=1)
    if status == 'failure':
        return ('failure', None)
    # NOTE(review): unpacks THREE values, but the get_bookinfo visible in this
    # dump returns 2-tuples — presumably this calls a different get_bookinfo
    # (another module/version). TODO confirm against the actual import.
    store, b, soup = get_bookinfo('flipkart', query, None)
    # bookinfom: isbn13, link, imagelink, mrp, price, availability
    # Spec table: each 'specs-key' cell's sibling holds the value text.
    properties = soup('td', {'class':'specs-key boldtext'})
    pdict = {}
    for p in properties:
        pdict[p.text.strip()] = p.nextSibling.text.strip()
    # These raise KeyError if Flipkart changes/omits a label — caller beware.
    isbn = pdict['ISBN:']
    title = pdict['Book:']
    author = pdict['Author:']
    edition = pdict['Binding:']
    bookinfo = Bookinfo(b.link, b.imagelink, b.isbn13, isbn, title, author, edition, b.mrp, b.price, b.availability)
    bookinfolist.append(bookinfo)
    return ('success', bookinfolist)
def get_bookinfo(store, isbn13, q): #http://www.flipkart.com/b/books/audacity-hope-barack-obama-book-1847670830 print 'inside get bookinfo' #final_isbn = msutils.get_final_isbn(isbn13, isbn) final_isbn=isbn13 if final_isbn == None: return (store, None) link = stores[store]%(final_isbn) print link imagelink = price = mrp = availability = None status, soup = msutils.get_soup(link, retry=1) print 'got soup' if status == 'failure': print 'bad soup returning none' if q: q.put((store, None)) else: return (store, None) print 'crossed' selectors = store_selectors[store] sel1 = selectors['imagelink'] try: imagelink = soup(sel1[0], {sel1[1]:sel1[2]})[0].img['src'] except Exception, e: print 'error', str(e) imagelink = 'unknown'
def crawl(): # get current catid and pageno conn = sqlite3.connect('clc.db') c = conn.cursor() c.execute('''select catid, pageid from crawlinfo limit 1''') catid, pageid = c.fetchall()[0] print 'crawling' for i in xrange(catid,42): print 'inside' while True: #crawl the page url = burl %(i, pageid) print url status, soup = msutils.get_soup(url, retry=1) time.sleep(2) if status == 'failure': return finished= False try: books = soup('div', {'class': 'image_holder_small'}) for book in books: isbn = book.a.img['alt'] print 'isbn', isbn if len(isbn) != 13: continue #write to db c.execute('insert into books values (%d,"%s")'%(i, isbn)) conn.commit() print len(books) if len(books) == 0: finished = True except Exception, e: print 'error', str(e) print pageid, catid finished = False return if finished: pageid = 1 break else: pageid += 1 c.execute('update crawlinfo set catid=%d, pageid=%d where id=1'%(i, pageid)) conn.commit()
def get_fk_bookinfo(isbn13=None, isbn=None): # http://www.flipkart.com/b/books/audacity-hope-barack-obama-book-1847670830 final_isbn = msutils.get_final_isbn(isbn13, isbn) if final_isbn == None: return ("failure", None) link = flipkart_book_link + final_isbn print link try: imagelink = isbn = isbn13 = title = author = edition = price = mrp = availability = None status, soup = msutils.get_soup(link, retry=1) if status == "failure": return ("failure", None) try: imagelink = soup("div", id="mprodimg-id")[0].img["src"] except (AttribureError, IndexError): imagelink = "unknown" desc = soup("div", id="details")[0].table("tr") info = {} for tr in desc: key, value = tr.text.split(":", 1) info[key] = value if "ISBN-13" in info: isbn13 = info["ISBN-13"].split(",")[0] if "ISBN" in info: isbn = info["ISBN"] if "Binding" in info: edition = info["Binding"] price = soup("span", id="fk-mprod-our-id")[0].text.split()[-1].split(".")[-1].replace(",", "") try: mrp = soup("span", id="fk-mprod-list-id")[0].text.split()[-1].replace(",", "") except IndexError: mrp = price availability_str = soup("div", {"class": "shipping-details"})[0].span.text availability = msutils.get_availability(availability_str) title = info["Book"] author = info["Author"] bookinfo = Bookinfo(link, imagelink, isbn13, isbn, title, author, edition, mrp, price, availability) except urllib2.HTTPError, e: print "http error ", e.code
def crawl(): url = 'http://www.flipkart.com/browse/academic-and-professional-books-3277?response-type=json&inf-start=20' status, soup = msutils.get_soup(url, retry=1) #print soup books = soup('h2') print len(books) for book in books: print book
def get_ur_result(query=None, link=None): bookinfolist = [] if not link and query: link = uread_link + '-'.join([q.strip() for q in query.split()]) print link t = time.time() status, soup = msutils.get_soup(link, retry=1) if status == 'failure': return ('failure', None) try: books = soup('div', {'class':'product-vert-list-item'}) except (AttributeError, IndexError): return ('failure', None) for book in books: try: link = imagelink = isbn = isbn13 = title = author = edition = price = mrp = availability = None link = book('div', {'class':'product-vert-list-image'})[0].a['href'] #print link imagelink = book('div', {'class':'product-vert-list-image'})[0].a.img['src'] isbn1 = link.split('/')[-1] isbn2 = imagelink.split('/')[-1].split('.')[0] isbn = isbn13 = None if len(isbn1) == 13: isbn13 = isbn1 elif len(isbn1) == 10: isbn = isbn1 if len(isbn2) == 13: isbn13 = isbn2 elif len(isbn2) == 10: isbn = isbn2 if not isbn and not isbn13: continue summary_elem = book('div', {'class':'product-vert-list-summary'}) title = summary_elem[0]('div', {'class':'product-vert-list-title'})[0].h2.a.text author = summary_elem[0]('div', {'class':'product-vert-list-title'})[0].strong.a.text edition = summary_elem[0]('div', {'class':'product-vert-list-title'})[0].text.rsplit('(', 1)[-1][:-1] price = summary_elem[0]('div', {'class':'product-vert-list-price'})[0]('span', {'class':'our-price'})[0].contents[1].replace(',','') try: mrp = summary_elem[0]('div', {'class':'product-vert-list-price'})[0]('span', {'class':'list-price'})[0].contents[1].replace(',','') except IndexError: mrp = price availability_str = summary_elem[0]('div', {'class':'product-shipping-info'})[0].text availability = msutils.get_availability(availability_str) bookinfo = Bookinfo(link, imagelink, isbn13, isbn, title, author, edition, mrp, price, availability) bookinfolist.append(bookinfo) except (IndexError, AttributeError): print link #logging.info('UR:%s' %(time.time()-t)) return ('success', bookinfolist)
def get_lm_result(query=None, link=None):
    """Search Landmark for `query` (or fetch `link` directly) and scrape results.

    Returns ('success', [Bookinfo, ...]) or ('failure', None).
    """
    bookinfolist = []
    if not link and query:
        link = landmark_link + urllib.urlencode({'code':query, 'type':'0', 'num':'0'})
    print link
    t = time.time()
    status, soup = msutils.get_soup(link, retry=1)
    if status == 'failure':
        return ('failure', None)
    try:
        # sic: 'searc_box' is the actual class name on the site.
        items = soup('div', {'class': 'searc_box'})
    except (AttributeError, IndexError):
        return ('failure', None)
    for item in items:
        try:
            try:
                imagelink = item.input['src']
            except KeyError:
                imagelink = 'unknown'
            link = landmark_root + item.a['href']
            # ISBN candidates: last URL segment and image filename stem.
            isbn1 = link.split('/')[-1]
            isbn2 = imagelink.split('/')[-1].split('.')[0]
            isbn = isbn13 = None
            if len(isbn1) == 13:
                isbn13 = isbn1
            elif len(isbn1) == 10:
                isbn = isbn1
            if len(isbn2) == 13:
                isbn13 = isbn2
            elif len(isbn2) == 10:
                isbn = isbn2
            if not isbn and not isbn13:
                continue
            # NOTE(review): positional .contents[i] chains mirror the page's
            # exact DOM layout circa scraping time — any markup change breaks
            # them (caught by the IndexError/AttributeError handlers below).
            title = item.contents[3].span.a.text.strip()
            author = item.contents[3].contents[5].a.text.strip()
            #edition = item.contents[3].span.contents[2].string.split('-')[0][2:-1]
            edition = 'unknown'
            #isbn = item.contents[3].contents[16].string
            # [:-2] strips a trailing currency suffix from the price text.
            mrp = item.contents[5].contents[1].contents[1].span.text[:-2].replace(',','')
            try:
                price = item.contents[5].contents[3].contents[3].span.text[:-2].replace(',','')
            except AttributeError:
                price = mrp
            try:
                availstr = item.contents[5].contents[9].span.string
                # Collapse runs of spaces in the availability text.
                availability_str = ' '.join([x.strip() for x in re.split(' +', availstr)])
                availability = msutils.get_availability(availability_str)
            except (AttributeError, IndexError):
                availability = 'na'
            bookinfo = Bookinfo(link, imagelink, isbn13, isbn, title, author, edition, mrp, price, availability)
            bookinfolist.append(bookinfo)
        except IndexError:
            print 'index error, book not important'
        except AttributeError:
            print 'attribute error in strip, no issues for now'
    #logging.info('LM:%s' %(time.time()-t))
    return ('success', bookinfolist)
def get_cw_result(query=None, link=None):
    """Search Crossword for `query` (or fetch `link` directly) and scrape results.

    Returns ('success', [Bookinfo, ...]) or ('failure', None).
    """
    #f = open(os.path.join('D:/crawler/metasearchinfo/crossword/', query + '.txt'), 'w')
    bookinfolist = []
    if not link and query:
        link = crossword_link + urllib.urlencode({'q':query})
    print link
    t = time.time()
    status, soup = msutils.get_soup(link, retry=1)
    if status == 'failure':
        return ('failure', None)
    try:
        items = soup('ul', {'class': 'list-view clearfix'})[0]('li')
    except (AttributeError, IndexError):
        return ('failure', None)
    for item in items:
        try:
            imagelink = isbn = isbn13 = title = author = edition = price = mrp = availability = None
            try:
                imagelink = item.img['src']
            except KeyError:
                # No image attribute => not a real product entry; skip.
                continue
            link = crossword_root + item.contents[1].a['href']
            # ISBN candidates: trailing hyphen-separated token of the URL
            # slug and of the image filename stem.
            isbn1 = link.split('/')[-1].split('.')[0].split('-')[-1]
            isbn2 = imagelink.split('/')[-1].split('.')[0].split('-')[-1]
            isbn = isbn13 = None
            if len(isbn1) == 13:
                isbn13 = isbn1
            elif len(isbn1) == 10:
                isbn = isbn1
            if len(isbn2) == 13:
                isbn13 = isbn2
            elif len(isbn2) == 10:
                isbn = isbn2
            if not isbn and not isbn13:
                continue
            # NOTE(review): positional .contents[i] access mirrors the exact
            # page layout; breakage falls into the handlers below.
            title = item.contents[3].contents[1].a.string
            author = item.contents[3].contents[3].contents[3].a.string
            edition = 'unknown'
            # text[1:] strips the leading currency symbol.
            price = item('span', {'class':'price'})[0]('span', {'class':'variant-final-price'})[0].text[1:].replace(',','')
            try:
                mrp = item('span', {'class':'price'})[0]('span', {'class':'variant-list-price'})[0].text[1:].replace(',', '')
            except (AttributeError, IndexError):
                mrp = price
            availability_str = item.contents[3].contents[17].text + ' ' + item.contents[3].contents[19].text
            availability = msutils.get_availability(availability_str)
            bookinfo = Bookinfo(link, imagelink, isbn13, isbn, title, author, edition, mrp, price, availability)
            bookinfolist.append(bookinfo)
        except IndexError:
            print 'index error:', link
            print ''
        except AttributeError:
            print 'attribute error:', link
            print ''
    #logging.info('CW:%s' %(time.time()-t))
    return ('success', bookinfolist)
def get_ip_result(query=None, link=None): if not link and query: link = indiaplaza_link + urllib.urlencode({'storename': 'Books', 'srchVal' : query}) print link bookinfolist = [] t = time.time() try: status, soup = msutils.get_soup(link, retry=1) if status == 'failure': return ('failure', None) try: books = soup('ul', {'class': 'bookdetails'}) except (IndexError, AttributeError): return ('failure', None) count = 0 for book in books: imagelink = isbn = isbn13 = title = author = edition = price = mrp = availability = None try: imagelink = book.parent.img['src'] except TypeError: imagelink = 'unknown' title = book.li.a.string.strip() link = indiaplaza_root + book.li.a['href'] info = {} for li in book('li')[1:]: key, value = li.text.split(':', 1) info[key] = value if 'ISBN-13' in info: isbn13 = info['ISBN-13'] if 'ISBN' in info: isbn = info['ISBN'] if len(isbn) == 13: isbn13 = isbn isbn = None if not isbn and not isbn13: continue if 'Format' in info: edition = info['Format'] if 'Author' in info: author = info['Author'] else: author = 'unknown' priceinfo = books[count].parent.nextSibling.ul('li') price = priceinfo[0].span.string.split()[-1].replace(',','') try: mrp = priceinfo[1].string.split()[-1].replace(',','') except AttributeError: mrp = price #availability_str = ' '.join([c.text for c in book('span', style='line-height:30px;')[0].contents if c.string != '\n']) #availability = msutils.get_availability(availability_str) bookinfo = Bookinfo(link, imagelink, isbn13, isbn, title, author, edition, mrp, price, availability) bookinfolist.append(bookinfo) count += 1 except urllib2.HTTPError, e: print 'http error ', e.code
def get_ib_result(query=None, link=None):
    """Search Infibeam for `query` (or fetch `link` directly) and scrape results.

    Returns ('success', [Bookinfo, ...]) or ('failure', None).
    """
    #link = 'http://www.infibeam.com/Books/categories/Antiques_and_Collectibles/Advertising'
    if not link and query:
        link = infibeam_link + urllib.urlencode({'q':query})
    print link
    bookinfolist = []
    t = time.time()
    status, soup = msutils.get_soup(link, retry=1)
    if status == 'failure':
        return ('failure', None)
    try:
        books = soup('ul', {'class': 'search_result'})[0]('li')
    except (AttributeError, IndexError):
        return ('failure', None)
    for book in books:
        try:
            imagelink = isbn = isbn13 = title = author = edition = price = mrp = availability = None
            link = infibeam_root + book('div', {'class':'img'})[0].a['href']
            link = link.split('.html')[0]
            imagelink = book('div', {'class':'img'})[0].a.img['src']
            # ISBN candidates: image filename stem and URL slug tail.
            isbn1 = imagelink.split('/')[-1].split('.')[0].split('-')[-1]
            isbn2 = link.split('.html')[0].split('/')[-1]
            if isbn1 == 'noimage':
                imagelink = 'na'
            if len(isbn1) == 13:
                isbn13 = isbn1
            elif len(isbn1) == 10:
                isbn = isbn1
            if len(isbn2) == 13:
                isbn13 = isbn2
            elif len(isbn2) == 10:
                isbn = isbn2
            if not isbn and not isbn13:
                continue
            if not isbn:
                isbn = 'na'
            if not isbn13:
                isbn13 = 'na'
            title = ' '.join([a.string.strip() for a in book('span', {'class':'title'})[0].h2.a.contents])
            # All <a> after the title link are author links.
            authorlist = book('span', {'class':'title'})[0]('a')[1:]
            author = ''
            for auth in authorlist:
                author += (auth.text + ', ')
            try:
                # [1:-1] strips surrounding parentheses around the edition.
                edition = book('span', {'class':'title'})[0].span.text[1:-1].split()[0]
            except (IndexError, AttributeError):
                edition = 'na'
            try:
                price = book('div', {'class':'price'})[0].b.string.strip().replace(',', '')
                try:
                    mrp = book('div', {'class':'price'})[0].span.string.strip().replace(',', '')
                except AttributeError:
                    mrp = price
                availability_str = ' '.join([c.text for c in book('span', style='line-height:30px;')[0].contents if c.string != '\n'])
                availability = msutils.get_availability(availability_str)
            except IndexError:
                # No price block => reseller-only listing with a styled price span.
                price = mrp = book('span', style='font:14px arial; color:#990000;')[0].text.replace(',','')
                availability = 'reseller only'
            bookinfo = Bookinfo(link, imagelink, isbn13, isbn, title, author, edition, mrp, price, availability)
            bookinfolist.append(bookinfo)
        except IndexError:
            print 'index error, book not important'
        except AttributeError:
            print 'attribute error in strip, no issues for now'
    #print len(bookinfolist)
    '''
    for bookinfo in bookinfolist:
        bookinfo.print_details()
    '''
    #logging.info('IB:%s' %(time.time()-t))
    return ('success', bookinfolist)
def get_fk_result(query=None, link=None): if not link and query: link = flipkart_link + urllib.urlencode({'query':query}) print link try: query_int = int(query) print 'here' if len(query) in [10, 13]: return get_fk_single_result(query=query) except ValueError: print 'not and isbn query' bookinfolist = [] status, soup = msutils.get_soup(link, retry=1) if status == 'failure': return ('failure', None) try: books = soup('div', {'class': 'fk-srch-item fk-inf-scroll-item'}) print len(books) except (AttributeError, IndexError): print 'error' return ('failure', None) for book in books: try: imagelink = isbn = isbn13 = title = author = edition = price = mrp = availability = None partiallink = book.a['href'] #print partiallink link = flipkart_root + partiallink imagelink = book.a.img['src'] #print imagelink isbn1 = imagelink.split('/')[-1].split('.')[0].strip() isbn2 = partiallink.split('/')[2].strip() #print isbn1, isbn2 if len(isbn1) == 13: isbn13 = isbn1 elif len(isbn1) == 10: isbn = isbn1 if len(isbn2) == 13: isbn13 = isbn2 elif len(isbn2) == 10: isbn = isbn2 if not isbn and not isbn13: continue if not isbn: isbn = 'na' if not isbn13: isbn13 = 'na' if isbn13: link = flipkart_book_link + isbn13 elif isbn: link = flipkart_book_link + isbn title = book.h2.a.text #print title authorlist = book('span', {'class':'fk-item-authorinfo-text fksd-smalltext'})[0]('a') #print isbn, isbn13, title author = '' for auth in authorlist: author += (auth.text + ', ') #print author mrp = price = '0' try: price = book('div', {'class':'line dlvry-det'})[0]('b', {'class':'fksd-bodytext price final-price'})[0].text.strip().split()[-1].strip().replace(',','') mrp = book('div', {'class':'line dlvry-det'})[0]('span', {'class':'list-price fksd-smalltext'})[0].text.strip().split()[-1].strip().replace(',','') except AttributeError: mrp = price except IndexError: if mrp == '0' and price == '0': pass else: mrp = price try: specs = book('tr', {'class':'line fk-item-specs-item'}) specdict = {} for spec in 
specs: k = spec('td')[0].text.strip() v = spec('td')[1].text.strip() specdict[k] = v if 'Binding:' in specdict: edition = specdict['Binding:'] if 'Publisher:' in specdict: publisher = specdict['Publisher:'] if 'Released:' in specdict: release = specdict['Released:'] #print edition, publisher, release availability_str = book('span', {'class':'shipping-period boldtext'})[0].text #print availability_str availability = msutils.get_availability(availability_str) except IndexError: edition = 'na' availability = 'na' except AttributeError: edition = 'na' availability = 'na' #print edition, availability bookinfo = Bookinfo(link, imagelink, isbn13, isbn, title, author, edition, mrp, price, availability) bookinfolist.append(bookinfo) except IndexError: print 'index error, book not important' print link except AttributeError: print 'attribute error in strip, no issues for now' print link #print len(bookinfolist) #print bookinfolist #logging.info('FK:%s' %(time.time()-t)) print 'here' write_to_log(bookinfolist); return ('success', bookinfolist)
def get_fk_result(query=None, link=None):
    """Search Flipkart for `query` (or fetch `link` directly) and scrape results.

    Returns ('success', [Bookinfo, ...]) or ('failure', None).

    NOTE(review): a second definition of get_fk_result appears earlier in this
    dump; if both live in one module this one shadows it — TODO confirm.
    """
    if not link and query:
        link = flipkart_link + urllib.urlencode({"query": query})
    # print link
    bookinfolist = []
    t = time.time()
    status, soup = msutils.get_soup(link, retry=1)
    if status == "failure":
        return ("failure", None)
    try:
        books = soup("div", {"class": "fk-srch-item fk-inf-scroll-item"})
        # print len(books)
    except (AttributeError, IndexError):
        # print 'error'
        return ("failure", None)
    # Collected ISBNs; populated below but not used in this function's
    # visible body — presumably consumed by a caller or later code.
    isbns = []
    for book in books:
        try:
            imagelink = isbn = isbn13 = title = author = edition = price = mrp = availability = None
            partiallink = book.a["href"]
            # print partiallink
            link = flipkart_root + partiallink
            imagelink = book.a.img["src"]
            # print imagelink
            # ISBN candidates: image filename stem and URL path segment.
            isbn1 = imagelink.split("/")[-1].split(".")[0].strip()
            isbn2 = partiallink.split("/")[2].strip()
            # print isbn1, isbn2
            if len(isbn1) == 13:
                isbn13 = isbn1
            elif len(isbn1) == 10:
                isbn = isbn1
            if len(isbn2) == 13:
                isbn13 = isbn2
            elif len(isbn2) == 10:
                isbn = isbn2
            if not isbn and not isbn13:
                continue
            if not isbn:
                isbn = "na"
            if not isbn13:
                isbn13 = "na"
            # NOTE(review): after the 'na' back-fill, isbn13 is always truthy,
            # so the elif branch is unreachable and link may be built from
            # the literal 'na' — likely a latent bug; left unchanged here.
            if isbn13:
                link = flipkart_book_link + isbn13
                isbns.append(isbn13)
            elif isbn:
                link = flipkart_book_link + isbn
                isbns.append(isbn)
            title = book.h2.a.text
            # print title
            authorlist = book("span", {"class": "fk-item-authorinfo-text fksd-smalltext"})[0]("a")
            # print isbn, isbn13, title
            author = ""
            for auth in authorlist:
                author += auth.text + ", "
            # print author
            mrp = price = "0"
            try:
                price = (
                    book("div", {"class": "line dlvry-det"})[0]("b", {"class": "fksd-bodytext price final-price"})[0]
                    .text.strip()
                    .split()[-1]
                    .strip()
                    .replace(",", "")
                )
                mrp = (
                    book("div", {"class": "line dlvry-det"})[0]("span", {"class": "list-price fksd-smalltext"})[0]
                    .text.strip()
                    .split()[-1]
                    .strip()
                    .replace(",", "")
                )
            except AttributeError:
                mrp = price
            except IndexError:
                # No list-price span: keep the '0' sentinels if neither
                # parsed, otherwise sell price doubles as MRP.
                if mrp == "0" and price == "0":
                    pass
                else:
                    mrp = price
            try:
                # Spec rows are key/value <td> pairs.
                specs = book("tr", {"class": "line fk-item-specs-item"})
                specdict = {}
                for spec in specs:
                    k = spec("td")[0].text.strip()
                    v = spec("td")[1].text.strip()
                    specdict[k] = v
                if "Binding:" in specdict:
                    edition = specdict["Binding:"]
                if "Publisher:" in specdict:
                    publisher = specdict["Publisher:"]
                if "Released:" in specdict:
                    release = specdict["Released:"]
                # print edition, publisher, release
                availability_str = book("span", {"class": "shipping-period boldtext"})[0].text
                # print availability_str
                availability = msutils.get_availability(availability_str)
            except IndexError:
                edition = "na"
                availability = "na"
            except AttributeError:
                edition = "na"
                availability = "na"
            # print edition, availability
            bookinfo = Bookinfo(link, imagelink, isbn13, isbn, title, author, edition, mrp, price, availability)
            bookinfolist.append(bookinfo)
        except IndexError:
            print "index error, book not important"
            print link
        except AttributeError:
            print "attribute error in strip, no issues for now"
            print link
    # print len(bookinfolist)
    return ("success", bookinfolist)