Exemple #1
0
def get_fk_single_result(query=None):
    """Build a one-element result list for a single Flipkart book lookup.

    The search page for ``query`` is fetched first; only its status is used
    (the soup from that fetch is replaced below). The book itself comes from
    ``get_bookinfo('flipkart', query, None)``, and the ISBN, title, author and
    binding are read from the ``specs-key boldtext`` cells of the soup that
    call returns.

    Returns:
        ``('success', [Bookinfo])`` on success, ``('failure', None)`` when
        the search fetch fails or ``get_bookinfo`` reports a failure.

    Raises:
        KeyError: if the spec table lacks an ``ISBN:``, ``Book:``,
            ``Author:`` or ``Binding:`` row.
    """
    bookinfolist = []
    link = flipkart_link + urllib.urlencode({'query': query})
    status, soup = msutils.get_soup(link, retry=1)

    if status == 'failure':
        return ('failure', None)

    # get_bookinfo reports failure as the short tuple (store, None); unpacking
    # that into three names would raise ValueError, so check the shape first.
    result = get_bookinfo('flipkart', query, None)
    if result is None or len(result) != 3 or result[1] is None:
        return ('failure', None)
    b, soup = result[1], result[2]

    # b supplies: isbn13, link, imagelink, mrp, price, availability
    properties = soup('td', {'class': 'specs-key boldtext'})

    # Map each spec label (e.g. 'ISBN:') to the text of the cell next to it.
    pdict = {}
    for p in properties:
        pdict[p.text.strip()] = p.nextSibling.text.strip()

    isbn = pdict['ISBN:']
    title = pdict['Book:']
    author = pdict['Author:']
    edition = pdict['Binding:']

    bookinfo = Bookinfo(b.link, b.imagelink, b.isbn13, isbn, title, author, edition, b.mrp, b.price, b.availability)
    bookinfolist.append(bookinfo)

    return ('success', bookinfolist)
Exemple #2
0
def get_bookinfo(store, isbn13, q):
    #http://www.flipkart.com/b/books/audacity-hope-barack-obama-book-1847670830 
    
    print 'inside get bookinfo'
    #final_isbn = msutils.get_final_isbn(isbn13, isbn)
    final_isbn=isbn13

    if final_isbn == None:
        return (store, None)

    link = stores[store]%(final_isbn)
    print link

      
    imagelink = price = mrp = availability = None
    
    status, soup = msutils.get_soup(link, retry=1)
    print 'got soup'
    
    if status == 'failure':
        print 'bad soup returning none'
        if q:
            q.put((store, None))
        else:
            return (store, None)

    print 'crossed'    

    selectors = store_selectors[store]
    sel1 = selectors['imagelink']
    try:
        imagelink = soup(sel1[0], {sel1[1]:sel1[2]})[0].img['src']
    except Exception, e:
        print 'error', str(e)
        imagelink = 'unknown'
Exemple #3
0
def crawl():
	# get current catid and pageno
    conn = sqlite3.connect('clc.db')    
    c = conn.cursor()
    c.execute('''select catid, pageid from crawlinfo limit 1''')
    catid, pageid = c.fetchall()[0]


    print 'crawling'

    for i in xrange(catid,42):
      print 'inside'
      while True:
    			#crawl the page
          url = burl %(i, pageid)
          print url
          status, soup = msutils.get_soup(url, retry=1)
          time.sleep(2)

          if status == 'failure':
            return

          finished= False

          try:
              books = soup('div', {'class': 'image_holder_small'})

              for book in books:
                isbn =  book.a.img['alt']
                print 'isbn', isbn

                if len(isbn) != 13:
                  continue
                
                #write to db
                c.execute('insert into books values (%d,"%s")'%(i, isbn))
                conn.commit()



              print len(books)
              if len(books) == 0:
                finished = True

          except Exception, e:
              print 'error', str(e)
              print pageid, catid
              finished = False
              return

          if finished:
            pageid = 1
            break

          else:
            pageid += 1
            c.execute('update crawlinfo set catid=%d, pageid=%d where id=1'%(i, pageid))
            conn.commit()
Exemple #4
0
def get_fk_bookinfo(isbn13=None, isbn=None):
    # http://www.flipkart.com/b/books/audacity-hope-barack-obama-book-1847670830

    final_isbn = msutils.get_final_isbn(isbn13, isbn)

    if final_isbn == None:
        return ("failure", None)

    link = flipkart_book_link + final_isbn
    print link

    try:
        imagelink = isbn = isbn13 = title = author = edition = price = mrp = availability = None

        status, soup = msutils.get_soup(link, retry=1)

        if status == "failure":
            return ("failure", None)

        try:
            imagelink = soup("div", id="mprodimg-id")[0].img["src"]
        except (AttribureError, IndexError):
            imagelink = "unknown"

        desc = soup("div", id="details")[0].table("tr")

        info = {}
        for tr in desc:
            key, value = tr.text.split(":", 1)
            info[key] = value

        if "ISBN-13" in info:
            isbn13 = info["ISBN-13"].split(",")[0]

        if "ISBN" in info:
            isbn = info["ISBN"]

        if "Binding" in info:
            edition = info["Binding"]

        price = soup("span", id="fk-mprod-our-id")[0].text.split()[-1].split(".")[-1].replace(",", "")

        try:
            mrp = soup("span", id="fk-mprod-list-id")[0].text.split()[-1].replace(",", "")
        except IndexError:
            mrp = price

        availability_str = soup("div", {"class": "shipping-details"})[0].span.text

        availability = msutils.get_availability(availability_str)

        title = info["Book"]
        author = info["Author"]

        bookinfo = Bookinfo(link, imagelink, isbn13, isbn, title, author, edition, mrp, price, availability)

    except urllib2.HTTPError, e:
        print "http error ", e.code
Exemple #5
0
def crawl():

	url = 'http://www.flipkart.com/browse/academic-and-professional-books-3277?response-type=json&inf-start=20'
	status, soup = msutils.get_soup(url, retry=1)
	#print soup

	books = soup('h2')
	print len(books)

	for book in books:
		print book
Exemple #6
0
def get_ur_result(query=None, link=None):
    bookinfolist = []
    if not link and query:    
        link = uread_link + '-'.join([q.strip() for q in query.split()])
        
    print link
    
    t = time.time()
    
    status, soup = msutils.get_soup(link, retry=1)
    
    if status == 'failure':
        return ('failure', None)    

    try:
        books = soup('div', {'class':'product-vert-list-item'})
    except (AttributeError, IndexError):
        return ('failure', None)
    
    for book in books:
        try:
            link = imagelink = isbn = isbn13 = title = author = edition = price = mrp = availability = None
            
            link = book('div', {'class':'product-vert-list-image'})[0].a['href']
            #print link
            imagelink = book('div', {'class':'product-vert-list-image'})[0].a.img['src']
            
            isbn1 = link.split('/')[-1]
            isbn2 = imagelink.split('/')[-1].split('.')[0]
            
            isbn = isbn13 = None
            
            if len(isbn1) == 13:
                isbn13 = isbn1
            elif len(isbn1) == 10:
                isbn = isbn1
                
            if len(isbn2) == 13:
                isbn13 = isbn2
            elif len(isbn2) == 10:
                isbn = isbn2
                
            if not isbn and not isbn13:
                continue
            
            summary_elem = book('div', {'class':'product-vert-list-summary'})

            title = summary_elem[0]('div', {'class':'product-vert-list-title'})[0].h2.a.text
            author = summary_elem[0]('div', {'class':'product-vert-list-title'})[0].strong.a.text
            edition = summary_elem[0]('div', {'class':'product-vert-list-title'})[0].text.rsplit('(', 1)[-1][:-1]
            
            price = summary_elem[0]('div', {'class':'product-vert-list-price'})[0]('span', {'class':'our-price'})[0].contents[1].replace(',','')
            try:
                mrp = summary_elem[0]('div', {'class':'product-vert-list-price'})[0]('span', {'class':'list-price'})[0].contents[1].replace(',','')
            except IndexError:
                mrp = price
            
            availability_str = summary_elem[0]('div', {'class':'product-shipping-info'})[0].text
            availability = msutils.get_availability(availability_str)
            
            bookinfo = Bookinfo(link, imagelink, isbn13, isbn, title, author, edition, mrp, price, availability)
            bookinfolist.append(bookinfo)
        except (IndexError, AttributeError):
            print link
            
    #logging.info('UR:%s' %(time.time()-t))            
    return ('success', bookinfolist)
Exemple #7
0
def get_lm_result(query=None, link=None):

    bookinfolist = []
    if not link and query:    
        link = landmark_link + urllib.urlencode({'code':query, 'type':'0', 'num':'0'})
        
    print link
    
    t = time.time()
    
    status, soup = msutils.get_soup(link, retry=1)
    
    if status == 'failure':
        return ('failure', None)    

    try:
        items = soup('div', {'class': 'searc_box'})
    except (AttributeError, IndexError):
        return ('failure', None)
    
    for item in items:
    
        try:
            try:
                imagelink = item.input['src']        
            except KeyError:
                imagelink = 'unknown'
            link = landmark_root + item.a['href']
            isbn1 = link.split('/')[-1]
            isbn2 = imagelink.split('/')[-1].split('.')[0]

            isbn = isbn13 = None
            
            if len(isbn1) == 13:
                isbn13 = isbn1
            elif len(isbn1) == 10:
                isbn = isbn1
                
            if len(isbn2) == 13:
                isbn13 = isbn2
            elif len(isbn2) == 10:
                isbn = isbn2
                
            if not isbn and not isbn13:
                continue
                
            title = item.contents[3].span.a.text.strip()
            author = item.contents[3].contents[5].a.text.strip()
            #edition =  item.contents[3].span.contents[2].string.split('-')[0][2:-1]
            edition = 'unknown'
            #isbn = item.contents[3].contents[16].string

            mrp =  item.contents[5].contents[1].contents[1].span.text[:-2].replace(',','')

            try:    
                price = item.contents[5].contents[3].contents[3].span.text[:-2].replace(',','')
            except AttributeError:
                price =  mrp
            
            try:
                availstr =  item.contents[5].contents[9].span.string
                availability_str =  ' '.join([x.strip() for x in re.split(' +', availstr)])
                availability = msutils.get_availability(availability_str)
                
            except (AttributeError, IndexError):
                availability = 'na'
            bookinfo = Bookinfo(link, imagelink, isbn13, isbn, title, author, edition, mrp, price, availability)
            bookinfolist.append(bookinfo)
        except IndexError:
            print 'index error, book not important'
        except AttributeError:
            print 'attribute error in strip, no issues for now'            

    #logging.info('LM:%s' %(time.time()-t))            
    return ('success', bookinfolist)
Exemple #8
0
def get_cw_result(query=None, link=None):
    #f = open(os.path.join('D:/crawler/metasearchinfo/crossword/', query + '.txt'), 'w')
    
    bookinfolist = []
    
    if not link and query:
        link = crossword_link + urllib.urlencode({'q':query})
    
    print link
    
    t = time.time()
    
    status, soup = msutils.get_soup(link, retry=1)
    
    if status == 'failure':
        return ('failure', None)    

    try:
        items = soup('ul', {'class': 'list-view clearfix'})[0]('li')
    except (AttributeError, IndexError):
        return ('failure', None)

    for item in items:    
        try:
            imagelink = isbn =    isbn13 = title = author = edition = price = mrp = availability = None
            
            try:
                imagelink = item.img['src']        
            except KeyError:
                continue
                
            link = crossword_root + item.contents[1].a['href']
            
            isbn1 = link.split('/')[-1].split('.')[0].split('-')[-1]    
            isbn2 = imagelink.split('/')[-1].split('.')[0].split('-')[-1]
            
            isbn = isbn13 = None
            
            if len(isbn1) == 13:
                isbn13 = isbn1
            elif len(isbn1) == 10:
                isbn = isbn1
                
            if len(isbn2) == 13:
                isbn13 = isbn2
            elif len(isbn2) == 10:
                isbn = isbn2
            
            if not isbn and not isbn13:
                continue
            
            title = item.contents[3].contents[1].a.string
            author = item.contents[3].contents[3].contents[3].a.string
            edition =  'unknown'
            
            price = item('span', {'class':'price'})[0]('span', {'class':'variant-final-price'})[0].text[1:].replace(',','')
            
            try:
                mrp =  item('span', {'class':'price'})[0]('span', {'class':'variant-list-price'})[0].text[1:].replace(',', '')
            except (AttributeError, IndexError):
                mrp = price
            
            availability_str =  item.contents[3].contents[17].text + ' ' + item.contents[3].contents[19].text
            availability = msutils.get_availability(availability_str)                

            bookinfo = Bookinfo(link, imagelink, isbn13, isbn, title, author, edition, mrp, price, availability)
            bookinfolist.append(bookinfo)
        except IndexError:
            print 'index error:', link
            print ''
        except AttributeError:
            print 'attribute error:', link        
            print ''
    
    #logging.info('CW:%s' %(time.time()-t))
    return ('success', bookinfolist)
Exemple #9
0
def get_ip_result(query=None, link=None):
    if not link and query:
        link = indiaplaza_link + urllib.urlencode({'storename': 'Books', 'srchVal' : query})
        
    print link
    
    bookinfolist = []
    
    t = time.time()
    
    try:
        status, soup = msutils.get_soup(link, retry=1)
    
        if status == 'failure':
            return ('failure', None)
        
        try:
            books = soup('ul', {'class': 'bookdetails'})
        except (IndexError, AttributeError):
            return ('failure', None)
        
        count = 0
        for book in books:
            imagelink = isbn =    isbn13 = title = author = edition = price = mrp = availability = None
            try:
                imagelink = book.parent.img['src']
            except TypeError:
                imagelink = 'unknown'
            
            title = book.li.a.string.strip()
            link = indiaplaza_root + book.li.a['href']

            info = {}
            for li in book('li')[1:]:
                key, value = li.text.split(':', 1)
                info[key] = value                
            
            if 'ISBN-13' in info:
                isbn13 = info['ISBN-13']
                
            if 'ISBN' in info:
                isbn = info['ISBN']
                if len(isbn) == 13:
                    isbn13 = isbn
                    isbn = None

            if not isbn and not isbn13:
                continue

            if 'Format' in info:
                edition = info['Format']

            if 'Author' in info:
                author = info['Author']    
            else:
                author = 'unknown'

            priceinfo = books[count].parent.nextSibling.ul('li')
            
            price = priceinfo[0].span.string.split()[-1].replace(',','')
            
            try:
                mrp = priceinfo[1].string.split()[-1].replace(',','')   
            except AttributeError:
                mrp = price

            #availability_str = ' '.join([c.text for c in book('span', style='line-height:30px;')[0].contents if c.string != '\n'])
            #availability = msutils.get_availability(availability_str)              
            
            bookinfo = Bookinfo(link, imagelink, isbn13, isbn, title, author, edition, mrp, price, availability)
            bookinfolist.append(bookinfo)
            count += 1
    except urllib2.HTTPError, e:
        print 'http error ', e.code        
Exemple #10
0
def get_ib_result(query=None, link=None):    
    #link = 'http://www.infibeam.com/Books/categories/Antiques_and_Collectibles/Advertising'
    if not link and query:
        link = infibeam_link + urllib.urlencode({'q':query})
    
    print link
    
    bookinfolist = []

    t = time.time()
    
    status, soup = msutils.get_soup(link, retry=1)
    
    if status == 'failure':
        return ('failure', None)    

    try:
        books = soup('ul', {'class': 'search_result'})[0]('li')
    except (AttributeError, IndexError):
        return ('failure', None)
        
    for book in books:    
        try:
            imagelink = isbn =    isbn13 = title = author = edition = price = mrp = availability = None                
            
            link = infibeam_root + book('div', {'class':'img'})[0].a['href']
            link = link.split('.html')[0]
            
            imagelink = book('div', {'class':'img'})[0].a.img['src']
            isbn1 = imagelink.split('/')[-1].split('.')[0].split('-')[-1]
            isbn2 = link.split('.html')[0].split('/')[-1]

            if isbn1 == 'noimage':
                imagelink = 'na'
            
            if len(isbn1) == 13:
                isbn13 = isbn1
            elif len(isbn1) == 10:
                isbn = isbn1
        
            if len(isbn2) == 13:
                isbn13 = isbn2
            elif len(isbn2) == 10:
                isbn = isbn2
            
            if not isbn and not isbn13:
                continue
            
            if not isbn:
                isbn = 'na'
            
            if not isbn13:
                isbn13 = 'na'
            
            
            title = ' '.join([a.string.strip() for a in book('span', {'class':'title'})[0].h2.a.contents])
            authorlist = book('span', {'class':'title'})[0]('a')[1:]
            author = ''
            for auth in authorlist:
                author += (auth.text +  ', ')
                
            try:
                edition = book('span', {'class':'title'})[0].span.text[1:-1].split()[0]
            except (IndexError, AttributeError):
                edition = 'na'
            
            try:
                price = book('div', {'class':'price'})[0].b.string.strip().replace(',', '')
                
                try:
                    mrp = book('div', {'class':'price'})[0].span.string.strip().replace(',', '')
                except AttributeError:
                    mrp = price
                
                availability_str = ' '.join([c.text for c in book('span', style='line-height:30px;')[0].contents if c.string != '\n'])
                availability = msutils.get_availability(availability_str)                
                
            except IndexError:
                price = mrp = book('span', style='font:14px arial; color:#990000;')[0].text.replace(',','')
                availability = 'reseller only'
                
            bookinfo = Bookinfo(link, imagelink, isbn13, isbn, title, author, edition, mrp, price, availability)
            bookinfolist.append(bookinfo)
        except IndexError:
            print 'index error, book not important'
        except AttributeError:
            print 'attribute error in strip, no issues for now'
    
    #print len(bookinfolist)    
    
    '''
    for bookinfo in bookinfolist:
        bookinfo.print_details()
    '''
    
    #logging.info('IB:%s' %(time.time()-t))
    return ('success', bookinfolist)
Exemple #11
0
def get_fk_result(query=None, link=None):   
    if not link and query:
        link = flipkart_link + urllib.urlencode({'query':query})
    
    print link

    try:
        query_int = int(query)
        print 'here'
        if len(query) in [10, 13]:
            return get_fk_single_result(query=query)
    except ValueError:
        print 'not and isbn query'    
    
    bookinfolist = []
    status, soup = msutils.get_soup(link, retry=1)
    
    if status == 'failure':
        return ('failure', None)    
    
    try:
        books = soup('div', {'class': 'fk-srch-item fk-inf-scroll-item'})
        print len(books)
    except (AttributeError, IndexError):
        print 'error'
        return ('failure', None)
        
    
    for book in books:    
        try:
            imagelink = isbn =    isbn13 = title = author = edition = price = mrp = availability = None                
            
            partiallink = book.a['href']
            #print partiallink
            link = flipkart_root + partiallink
            imagelink = book.a.img['src']
            #print imagelink
            isbn1 = imagelink.split('/')[-1].split('.')[0].strip()
            isbn2 = partiallink.split('/')[2].strip()
            
            
            #print isbn1, isbn2

            if len(isbn1) == 13:
                isbn13 = isbn1
            elif len(isbn1) == 10:
                isbn = isbn1
        
            if len(isbn2) == 13:
                isbn13 = isbn2
            elif len(isbn2) == 10:
                isbn = isbn2
            
            if not isbn and not isbn13:
                continue
            
            if not isbn:
                isbn = 'na'
            
            if not isbn13:
                isbn13 = 'na'
            
            
            if isbn13:
                link = flipkart_book_link + isbn13
            elif isbn:
                link = flipkart_book_link + isbn
            
            title = book.h2.a.text
            #print title
            authorlist = book('span', {'class':'fk-item-authorinfo-text fksd-smalltext'})[0]('a')
            
            #print isbn, isbn13, title
            
            author = ''
            for auth in authorlist:
                author += (auth.text +  ', ')
        
            #print author
            
            mrp = price = '0'
            try:
                price = book('div', {'class':'line dlvry-det'})[0]('b', {'class':'fksd-bodytext price final-price'})[0].text.strip().split()[-1].strip().replace(',','')
                mrp = book('div', {'class':'line dlvry-det'})[0]('span', {'class':'list-price fksd-smalltext'})[0].text.strip().split()[-1].strip().replace(',','')
            except AttributeError:
                mrp = price
            except IndexError:
                if mrp == '0' and price == '0':
                    pass
                else:
                    mrp = price
                
            try:    
                specs = book('tr', {'class':'line fk-item-specs-item'})
                specdict = {}
                for spec in specs:
                    k = spec('td')[0].text.strip()
                    v = spec('td')[1].text.strip()
                    specdict[k] = v
                    
                if 'Binding:' in specdict:    
                    edition = specdict['Binding:']
                if 'Publisher:' in specdict:
                    publisher = specdict['Publisher:']
                if 'Released:' in specdict:
                    release = specdict['Released:']
                
                #print edition, publisher, release
                
                availability_str = book('span', {'class':'shipping-period boldtext'})[0].text
                #print availability_str
                
                availability = msutils.get_availability(availability_str)                
                
            except IndexError:
                edition = 'na'
                availability = 'na'
            except AttributeError:
                edition = 'na'
                availability = 'na'
                
            #print edition, availability
             
            bookinfo = Bookinfo(link, imagelink, isbn13, isbn, title, author, edition, mrp, price, availability)
            bookinfolist.append(bookinfo)
        except IndexError:
            print 'index error, book not important'
            print link
        except AttributeError:
            print 'attribute error in strip, no issues for now'
            print link
    
    #print len(bookinfolist)    
   
    #print bookinfolist
   
    #logging.info('FK:%s' %(time.time()-t))
    
    print 'here'
    write_to_log(bookinfolist);
    
    return ('success', bookinfolist)
Exemple #12
0
def get_fk_result(query=None, link=None):
    """Parse a Flipkart search result page into a list of ``Bookinfo`` records.

    When no ``link`` is given, one is built from ``flipkart_link`` plus the
    URL-encoded ``query`` parameter.

    For each result the ISBN is guessed from the image file name and from the
    third path segment of the product link (13 characters -> ISBN-13, 10 ->
    ISBN). Results with neither are skipped. The stored link points at
    ``flipkart_book_link`` plus the ISBN-13 if known, else the ISBN; a missing
    one of the pair is stored as ``"na"``. The list price falls back to the
    price, and edition/availability to ``"na"``, when their elements are
    missing. Results raising IndexError or AttributeError are dropped after
    printing a message and the current link.

    Returns:
        ``("success", list_of_Bookinfo)`` or ``("failure", None)``.
    """
    if not link and query:
        link = flipkart_link + urllib.urlencode({"query": query})

    bookinfolist = []

    status, soup = msutils.get_soup(link, retry=1)

    if status == "failure":
        return ("failure", None)

    try:
        books = soup("div", {"class": "fk-srch-item fk-inf-scroll-item"})
    except (AttributeError, IndexError):
        return ("failure", None)

    for book in books:
        try:
            imagelink = isbn = isbn13 = title = author = edition = price = mrp = availability = None

            partiallink = book.a["href"]
            link = flipkart_root + partiallink
            imagelink = book.a.img["src"]
            isbn1 = imagelink.split("/")[-1].split(".")[0].strip()
            isbn2 = partiallink.split("/")[2].strip()

            for candidate in (isbn1, isbn2):
                if len(candidate) == 13:
                    isbn13 = candidate
                elif len(candidate) == 10:
                    isbn = candidate

            if not isbn and not isbn13:
                continue

            # Pick the link before filling in "na". Previously the placeholder
            # was assigned first, so ISBN-10-only books got a link ending "na".
            if isbn13:
                link = flipkart_book_link + isbn13
            else:
                link = flipkart_book_link + isbn

            if not isbn:
                isbn = "na"

            if not isbn13:
                isbn13 = "na"

            title = book.h2.a.text
            authorlist = book("span", {"class": "fk-item-authorinfo-text fksd-smalltext"})[0]("a")

            # Every author name is followed by ", " (including the last one).
            author = "".join([auth.text + ", " for auth in authorlist])

            mrp = price = "0"
            try:
                delivery = book("div", {"class": "line dlvry-det"})[0]
                price = (
                    delivery("b", {"class": "fksd-bodytext price final-price"})[0]
                    .text.strip()
                    .split()[-1]
                    .strip()
                    .replace(",", "")
                )
                mrp = (
                    delivery("span", {"class": "list-price fksd-smalltext"})[0]
                    .text.strip()
                    .split()[-1]
                    .strip()
                    .replace(",", "")
                )
            except (AttributeError, IndexError):
                # No list price (or no price at all): reuse whatever price
                # was read, which stays "0" when nothing was found.
                mrp = price

            try:
                specs = book("tr", {"class": "line fk-item-specs-item"})
                specdict = {}
                for spec in specs:
                    k = spec("td")[0].text.strip()
                    v = spec("td")[1].text.strip()
                    specdict[k] = v

                if "Binding:" in specdict:
                    edition = specdict["Binding:"]

                availability_str = book("span", {"class": "shipping-period boldtext"})[0].text
                availability = msutils.get_availability(availability_str)

            except (IndexError, AttributeError):
                edition = "na"
                availability = "na"

            bookinfo = Bookinfo(link, imagelink, isbn13, isbn, title, author, edition, mrp, price, availability)
            bookinfolist.append(bookinfo)
        except IndexError:
            print "index error, book not important"
            print link
        except AttributeError:
            print "attribute error in strip, no issues for now"
            print link

    return ("success", bookinfolist)