def Main():
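    # This crawler appears to target an Open Textbook Library style catalog:
    # --crawl-textbook walks subject listing pages and records each textbook,
    # while --crawl-textbook-details revisits every textbook page for ISBNs,
    # attachments, table-of-contents entries, and description/author text.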
    driver=None
    parser = OptionParser()
    parser.add_option("--crawl-textbook", dest="crawl_textbook", action="store_true", help="crawl textbook", default=False)
    parser.add_option("--crawl-textbook-details", dest="crawl_textbook_details", action="store_true", help="crawl textbook details", default=False)

    parser.add_option("--working-dir", dest="workingdir", type="string", help="working directory", default='.')
    parser.add_option("--db-name", dest="db_name", type="string", help="database name", default='opentextbooks')
    parser.add_option("--subject-table-name", dest="subject_table_name", type="string", help="subject table name", default='subject')
    parser.add_option("--textbook-table-name", dest="textbook_table_name", type="string", help="textbook table name", default='opentextbooks')
    parser.add_option("--attachment-table-name", dest="attachment_table_name", type="string", help="attachment table name", default='attachments')
    parser.add_option("--toc-table-name", dest="toc_table_name", type="string", help="toc table name", default='table_of_content')

    parser.add_option("--skip", dest="skip", type=int, help="integer value", default=0)
    parser.add_option("--use-firefox", dest="use_firefox", action="store_true", help="use-firefox", default=True)
    (options, args) = parser.parse_args()
    
    workingdir = options.workingdir.rstrip('/')
    
    if not os.path.exists(workingdir):
        parser.error("workingdir not exists")
    
    try:
        display = None
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1000,900))
        display.start()
    except:
        print 'No Xvfb!'
    
    db = mysql.DB(db=options.db_name)
    if options.use_firefox:
        driver = crawlutils.open_driver(use_firefox=options.use_firefox)
    
    try:
        if options.crawl_textbook:
            subjects = db.query("""select * from %s""" %(options.subject_table_name))
            print len(subjects), 'subjects yet to be crawled'
            db.set_autocommit(True)
            count=0
            for (url,subject_title,) in subjects:
                count+=1
                print 'subject title:', subject_title
                print 'url:', url
                print 'subject count :',count
                driver.get(url)
                time.sleep(3)
                ShortDescription=driver.find_elements_by_class_name("ShortDescription")
                for short in ShortDescription:
                   thrid=short.find_element_by_class_name("thrid")
                   img_url=thrid.find_element_by_tag_name("img").get_attribute("src")
                   h2=short.find_element_by_tag_name("h2")
                   textbook_title=h2.text.strip()
                   textbook_url=h2.find_element_by_tag_name("a").get_attribute("href")
                   m = hashlib.md5()
                   m.update(textbook_title+textbook_url)
                   document_id=m.hexdigest()
                   string=short.find_element_by_tag_name("p").text
                   l=[]
                   if "\n" in string:
                       authors=string.replace("\n",", ")
                       lines=string.split("\n")
                       for a in lines:
                           l.append(a.split(",")[0])
                       author=','.join(l)
                   elif "," in string:
                       authors=string
                       author=string.split(",")[0]
                   else:
                       authors=string
                       author=string
             
                   print 'textbook_url',textbook_url
                   print 'subject_title',subject_title
                   print 'url',url
                   print 'author',author
                   print 'authors',authors
                   print 'document_id',document_id
                   print 'img_url',img_url
                   data = {
                    'textbook_title':textbook_title,
                    'textbook_url':textbook_url,
                    'subject_title':subject_title,
                    'url':url,
                    'author':author,
                    'authors':authors,
                    'document_id':document_id,
                    'img_url':img_url
                    }
                   db.insert(options.textbook_table_name, data)
                   print "db inserted"
        if options.crawl_textbook_details:
            textbook = db.query("""select document_id,textbook_url from %s where crawled=0""" %(options.textbook_table_name))
            print len(textbook), 'textbooks yet to be crawled'
            db.set_autocommit(True)
            count=0
            for (document_id,textbook_url,) in textbook:
                count+=1
                print 'textbook_url:', textbook_url
                print 'document_id:', document_id
                print 'subject count :',count
                driver.get(textbook_url)
                time.sleep(3)

                # make sure these fields exist even when the page omits them
                pub_date=None
                isbn_13_string=None
                isbn_13=None
                third=driver.find_element_by_class_name("twothird")
                para= third.find_elements_by_tag_name("p")
                for p in para:
                    para_text=p.text
                    if para_text.startswith("Pub Date:"):
                        pub_date=para_text.replace("Pub Date:","")
                        if pub_date:
                            pub_date=pub_date.strip()
                        else:
                            pub_date=None
                    elif para_text.startswith("ISBN 13:")
                        isbn_13_string=para_text.replace("ISBN 13:","")
                        if isbn_13_string:
                            isbn_13_string=isbn_13_string.strip()
                            isbn_13=isbn_13_string.replace("-","")
                        else:
                            isbn_13_string=None
                            isbn_13=None
                BookTypes=driver.find_element_by_class_name("BookTypes")
                books=BookTypes.find_elements_by_tag_name("a")
                for book in books:
                    attachment_link=book.get_attribute("href")
                    type=book.text.strip()
                    print "attachment_link",attachment_link
                    print "type",type
                    data={
                    "document_id":document_id,
                    "attachment_link":attachment_link,
                    "type":type
                    }
                    db.insert(options.attachment_table_name, data)
                    print "toc table  inserted"
                Badge=driver.find_element_by_class_name("Badge-Condition")
                conditions_text=Badge.text
                condition_link=Badge.find_element_by_tag_name("a").get_attribute("href")
                toc=driver.find_element_by_id("TOC")
                # str() on a WebElement only stores its repr; capture the TOC markup instead
                table_of_content=toc.get_attribute("innerHTML")
                list_tags=toc.find_elements_by_tag_name("li")
                for li in list_tags:
                    chapter=li.text.strip()
                    if chapter.startswith("Chapter"):
                        chapter_type="Chapter"
                    elif chapter.startswith("Part"):
                        chapter_type="Part"
                    else:
                        chapter_type=None
                    print "title",chapter
                    print "type",chapter_type

                    data={
                    'document_id':document_id,
                    'title':chapter,
                    'type': chapter_type
                    }
                    db.insert(options.toc_table_name, data)
                    print "toc table  inserted"
                AboutBook = driver.find_element_by_id("AboutBook")
                description = AboutBook.text
                links=AboutBook.find_elements_by_tag_name("a")
                for link in links:
                    href = link.get_attribute("href")
                    print "link in books",href
                    data={
                    "document_id":document_id
                    "link":href
                    }
                    db.insert("books", data)
                    print "toc table  inserted"
                AboutAuthors = driver.find_element_by_id("AboutAuthors")
                author_details = AboutAuthors.text
                print 'pub_date',pub_date,
                print 'isbn_13_string',isbn_13_string,
                print 'isbn_13',isbn_13,
                print 'conditions_text',conditions_text,
                print 'condition_link', condition_link,
                print 'table_of_content',table_of_content,
                print 'description',description,
                print 'author_details',author_details
                data = {
                'pub_date':pub_date,
                'isbn_13_string':isbn_13_string,
                'isbn_13':isbn_13,
                'conditions_text': conditions_text,
                'condition_link': condition_link,
                'table_of_content': table_of_content,
                'description' : description,
                'author_details':author_details,
                'crawled':1
                }
                db.update(options.textbook_table_name, data, "document_id='%s'" %document_id)

    except:
        traceback.print_exc()
        if driver:
            driver.save_screenshot(workingdir + '/error.png')
            print workingdir + '/error.png'
    
    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()
Example #2
def Main():
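    # This crawler appears to target ted.com: for every topic in the topic table
    # it pages through the talk listings (newest first) and hands each results
    # page to the crawl_data() helper defined elsewhere in this module.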
    parser = OptionParser()
    parser.add_option("--crawl", dest="crawl", action="store_true", help="crawllist", default=False)
    parser.add_option("--working-dir", dest="workingdir", type="string", help="working directory", default='.')
    parser.add_option("--db-name", dest="db_name", type="string", help="database name", default='ted')
    parser.add_option("--topic-table-name", dest="topic_table_name", type="string", help="topic table name", default='ted_topics')
    parser.add_option("--ted-table-name", dest="ted_table_name", type="string", help="ted table name", default='ted')
    parser.add_option("--skip", dest="skip", type=int, help="integer value", default=0)
    parser.add_option("--use-firefox", dest="use_firefox", action="store_true", help="use-firefox", default=True)
    (options, args) = parser.parse_args()
    
    workingdir = options.workingdir.rstrip('/')
    
    if not os.path.exists(workingdir):
        parser.error("workingdir not exists")
    
    try:
        display = None
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1000,900))
        display.start()
    except:
        print 'No Xvfb!'
    
    db = mysql.DB(db=options.db_name)
    driver = crawlutils.open_driver(use_firefox=options.use_firefox)
    
    try:
        if options.crawl:
    
            topics = db.query("""select * from %s""" %(options.topic_table_name))
    
            print len(topics), 'Topics yet to be crawled'
            db.set_autocommit(True)
            count=0
            for (topic,topic_url,) in topics:
                count+=1
                print 'topic:', topic
                print 'topic_url:', topic_url
                print 'topic count :',count
                driver.get(topic_url)
                time.sleep(3)
                pagination=driver.find_elements_by_class_name("pagination")
                number=0
                if pagination:
                    atag=pagination[0].find_elements_by_tag_name("a")
                    page_numbers=int(atag[-2].text.encode("utf-8"))
                    print "Page numbers ",page_numbers
                    for page_number in range(page_numbers):
                        number+=1
                        url="https://www.ted.com/talks?page=%s&sort=newest&topics[]=%s"%(str(page_number+1),topic)
                        url=url.replace(" ","+")
                        print "Page url :",url
                        print "page number :",number
                        driver.get(url)
                        time.sleep(3)
                        crawl_data(driver,options,db,topic)
                else:
                    print "Paginator not found"
                    crawl_data(driver,options,db,topic)

    except:
        traceback.print_exc()
        if driver:
            driver.save_screenshot(workingdir + '/error.png')
            print workingdir + '/error.png'
    
    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()
Example #3
def Main():
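    # This crawler appears to target pearsonhighered.com package pages, pulling
    # title, author, copyright year, page count and ISBN-10/13 for each bundled
    # textbook; Example #7 below looks like a cleaner variant of the same routine.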
    parser = OptionParser()
    parser.add_option("--textbook-package", dest="textbook_package", action="store_true", help="textbook package details", default=False)
    parser.add_option("--working-dir", dest="workingdir", type="string", help="working directory", default='.')
    parser.add_option("--db-name", dest="db_name", type="string", help="db name", default='pearsonhighered')
    parser.add_option("--db-port", dest="db_port", type="int", help="db port", default=3306)
    parser.add_option("--textbook-package-table-name", dest="textbook_package_table_name", type="string", help="textbook package table name", default='textbook_package')
    parser.add_option("--pearsonhighered-textbooks-table-name", dest="pearsonhighered_textbooks_table_name", type="string", help="subject2 table name", default='pearsonhighered_textbooks')
   
    parser.add_option("--skip", dest="skip", type=int, help="integer value", default=0)
    parser.add_option("--use-firefox", dest="use_firefox", action="store_true", help="use-firefox", default=False)
    (options, args) = parser.parse_args()
    
    workingdir = options.workingdir.rstrip('/')
    
    if not os.path.exists(workingdir):
        parser.error("workingdir not exists")
    
    try:
        display = None
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1000,900))
        display.start()
    except:
        print 'No Xvfb!'
    
    db = mysql.DB(db=options.db_name, port=options.db_port)
    driver = crawlutils.open_driver(use_firefox=options.use_firefox)
    
    try:
        if options.textbook_package:
            textbooks = db.query("""select textbook_id,textbook_url from %s where status=0""" %(options.pearsonhighered_textbooks_table_name))
    
            print len(textbooks), 'textbooks yet to be crawled'
            db.set_autocommit(True)
    
            count = 0        
            for (textbook_id,textbook_url) in textbooks:
                count += 1
                print 'count:', count
                print 'textbook_id:',textbook_id
                print 'textbook_url:', textbook_url
                if not textbook_url:
                    continue    
                driver.get(textbook_url)
                time.sleep(3)
                tab_content_group=driver.find_elements_by_class_name("tab-content group")

                if not tab_content_group:
                   raise Exception("tab-content group not found")
                data_feed_float_right=tab_content_group[0].find_elements_by_class_name("data-feed float-right")
                if not data_feed_float_right:
                   raise Exception("data-feed float-right not found")
                description=data_feed_float_right[0].find_elements_by_id("description")
                if not description:
                   raise Exception("description not found")
                ul=description[0].find_elements_by_tag_name("ul")

                if not ul:
                   raise Exception("ul tag not found")
                while True:
                    a=ul[0].find_elements_by_tag_name("a")
                    if not a:
                      raise  Exception "a tag not found"
                    textbook_title=a[0].text.strip()
                    p=ul[0].find_elements_by_tag_name("p")
                    if not p:
                      raise Exception("p tag not found")
                    if not len(p)==3:
                      raise Exception("all p tags are not found")
                    for tag in p:
                       package_details=tag.text
                       if '©' in package_details:
                          copy_right_year=package_details[package_details.find("©")+1:package_details.find("•")]
                          copy_right_year=copy_right_year.strip()
                          if not len(copy_right_year)==4:
                             raise Exception("copyright year is not correct")
                          if 'pp' in package_details:
                             pages=package_details[package_details.find(",")+1:package_details.find("pp")]
                             pages=pages.strip()
                       print "copy right year",copy_right_year
                       print "Pages",pages
                       if "ISBN" in package_details:
                          if "•" in package_details:
                             isbns=package_details.split("•")
                          for isbn in isbns:
                             if "ISBN-10:" in isbn:
                                isbn_10=isbn.replace("ISBN-10:","").strip()
                             if "ISBN-13:" in isbn:
                                isbn_13=isbn.replace("ISBN-13:","").strip()
                             if not len(isbn_10)==10:
                                raise Exception("isbn 10 is not correct")
                             if not len(isbn_13)==13:
                                raise Exception("isbn 13 is not correct")
                       print "isbn_10 :",isbn_10
                       print "isbn_13 :",isbn_13
                       author=package_details.strip()
                       print "author :",author
                    data = {'textbook_title':textbook_title,'textbook_isbn_10':isbn_10,'textbook_isbn_13':isbn_13,
                            'textbook_author':author, 'textbook_copyright_year':copy_right_year,'pages':pages,
                            'status':1}
                    db.update(options.pearsonhighered_textbooks_table_name, data, "textbook_url='%s'" %textbook_url)
                    time.sleep(3)
                    next_ul=description[0].find_elements_by_xpath("//ul/following-sibling::ul")
                    if next_ul:
                       print "Next sibling found"
                       ul=next_ul
                    else:
                       print "next sibling not found"
                       break

       
    except:
        traceback.print_exc()
        if driver:
            driver.save_screenshot(workingdir + '/error.png')
            print workingdir + '/error.png'
    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()
Example #4
def Main():
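    # This crawler appears to follow "URL Link" attachments from the attachments
    # table, save the visible page text to an HTML file, and record a landing
    # status per URL (1 = text saved, 2 = iframes present, -1 = failed).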
    parser = OptionParser()
    parser.add_option("--crawl-text", dest="crawl_text", action="store_true", help="crawl text", default=False)
    parser.add_option("--working-dir", dest="workingdir", type="string", help="working directory", default='.')
    parser.add_option("--db-name", dest="db_name", type="string", help="database name", default='orangegrove')   
    parser.add_option("--attachment-table-name", dest="attachment_table", type="string", help="attachment table name", default='attachments')
    parser.add_option("--use-firefox", dest="use_firefox", action="store_true", help="use-firefox", default=True)
   
    (options, args) = parser.parse_args()    
    workingdir = options.workingdir.rstrip('/')
    num_of_iframs=0
    if not os.path.exists(workingdir):
        parser.error("workingdir not exists")
    db = mysql.DB(db=options.db_name)
    db.set_autocommit(True)
    try:
        display = None
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1000,900))
        display.start()
    except:
        print 'No Xvfb!'

    driver = crawlutils.open_driver(use_firefox=options.use_firefox)
    try:
        if options.crawl_text:
            count=0
            attachments=db.query("select distinct url,attachment_id from %s where file_type = '%s' and landed=0 and url is not NULL"%(options.attachment_table,"URL Link"))
            print "Number of urls to crawl ",len(attachments)
            for (url,attachment_id,) in attachments:
                try:
                    count+=1
                    print "source url :",url
                    print "attachment_id :",attachment_id
                    print "count %s"%count
                    if "pdf" in url:
                        raise Exception(url)
                    driver.get(url)
                    iframes=driver.find_elements_by_tag_name("iframe")
                    body=driver.find_element_by_tag_name("body")
                    landing_url=driver.current_url
                    if "pdf" in landing_url:
                        raise Exception(landing_url)
                
                    cursor=db.get_cursor()
                    visible_text=body.text
                    if iframes:
                        num_of_iframs=len(iframes)
                        print "landing_url :",landing_url
                        print"landed :",2
                        print"num_of_iframs :",num_of_iframs
                        #data={
                        #"landing_url":landing_url,
                        #"landed":2,
                        #"num_of_iframs":num_of_iframs
                        #}
                        
                        cursor.execute("""update %s set landing_url='%s',landed=%s,num_of_iframs=%s  where url='%s'"""%(options.attachment_table,landing_url,2,num_of_iframs,url))
                        #db.update(options.attachment_table,data,"url='%s'"%url)
                    else:
                        txt_location="/mnt/data/kendavar/orangegrove/textfile/%s.html"%attachment_id
                        f=codecs.open(txt_location,"w","utf-8")
                        f.write(visible_text)
                        f.close()
                        print "txt_location :",txt_location
                        print "landing_url :",landing_url
                        print "num_of_iframs :",num_of_iframs
                        print "landing :",1
                        cursor.execute("""update %s set txt_location='%s',landing_url='%s',landed=%s,num_of_iframs=%s where url='%s'"""%(options.attachment_table,txt_location,landing_url,1,num_of_iframs,url))
                except:
                    traceback.print_exc()
                    logging.exception('Got exception on main handler')
                    cursor.execute("""update %s set landed=%s where url='%s'"""%(options.attachment_table,-1,url))    
                    pass


    except:
        traceback.print_exc()
        if driver:
            driver.save_screenshot(workingdir + '/error.png')
            print workingdir + '/error.png'
    
    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()
Example #5
def Main():
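    # This crawler appears to walk Wiley's subject hierarchy (four subject
    # levels), then list textbooks per subject and collect detail-page data:
    # ISBNs, author, publish/copyright dates, description, cover image, and a
    # TOC HTML dump written under the working directory's wiley_toc_html folder.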
    parser = OptionParser()
    parser.add_option("--crawl-subjects1",
                      dest="crawl_subjects1",
                      action="store_true",
                      help="crawl first level subjects",
                      default=False)
    parser.add_option("--crawl-subjects2",
                      dest="crawl_subjects2",
                      action="store_true",
                      help="crawl second level subjects",
                      default=False)
    parser.add_option("--crawl-subjects3",
                      dest="crawl_subjects3",
                      action="store_true",
                      help="crawl third level subjects",
                      default=False)
    parser.add_option("--crawl-subjects4",
                      dest="crawl_subjects4",
                      action="store_true",
                      help="crawl fourth level subjects",
                      default=False)
    parser.add_option("--crawl-textbooks",
                      dest="crawl_textbooks",
                      action="store_true",
                      help="crawl textbooks list",
                      default=False)
    parser.add_option("--crawl-textbook-details",
                      dest="crawl_textbook_details",
                      action="store_true",
                      help="crawl textbooks details",
                      default=False)
    parser.add_option("--working-dir",
                      dest="workingdir",
                      type="string",
                      help="working directory",
                      default='.')
    parser.add_option("--db-name",
                      dest="db_name",
                      type="string",
                      help="db name",
                      default='textbook_0915')
    parser.add_option("--db-port",
                      dest="db_port",
                      type="int",
                      help="db port",
                      default=6606)
    parser.add_option("--subject1-table-name",
                      dest="subject1_table_name",
                      type="string",
                      help="subject1 table name",
                      default='wiley_subject1')
    parser.add_option("--subject2-table-name",
                      dest="subject2_table_name",
                      type="string",
                      help="subject2 table name",
                      default='wiley_subject2')
    parser.add_option("--subject3-table-name",
                      dest="subject3_table_name",
                      type="string",
                      help="subject3 table name",
                      default='wiley_subject3')
    parser.add_option("--subject4-table-name",
                      dest="subject4_table_name",
                      type="string",
                      help="subject4 table name",
                      default='wiley_subject4')
    parser.add_option("--textbook-table-name",
                      dest="textbook_table_name",
                      type="string",
                      help="textbook table name",
                      default='wiley_textbook')
    parser.add_option("--journal-table-name",
                      dest="journal_table_name",
                      type="string",
                      help="journal table name",
                      default='wiley_journal')
    parser.add_option("--skip",
                      dest="skip",
                      type=int,
                      help="integer value",
                      default=0)
    parser.add_option("--use-firefox",
                      dest="use_firefox",
                      action="store_true",
                      help="use-firefox",
                      default=False)
    (options, args) = parser.parse_args()

    workingdir = options.workingdir.rstrip('/')

    if not os.path.exists(workingdir):
        parser.error("workingdir not exists")

    try:
        display = None
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1000, 900))
        display.start()
    except:
        print 'No Xvfb!'

    db = mysql.DB(db=options.db_name, port=options.db_port)
    driver = crawlutils.open_driver(use_firefox=options.use_firefox)

    try:
        if options.crawl_subjects1:
            url = "http://as.wiley.com/WileyCDA/Section/index.html"
            driver.get(url)
            hoverlist = WebDriverWait(driver, 30).until(
                EC.presence_of_element_located(
                    (By.CLASS_NAME, "subjects-hoverlist")))
            db.set_autocommit(False)

            index = 0
            for a in hoverlist.find_elements_by_xpath("./li/a"):
                index += 1
                print 'index:', index
                subject1_title = a.text.strip()
                print 'subject1_title:', subject1_title
                subject1_url = a.get_attribute('href')
                print 'subject1_url:', subject1_url
                data = {
                    'subject1_title': subject1_title,
                    'subject1_url': subject1_url
                }
                db.insert(options.subject1_table_name, data)
            db.commit()

        if options.crawl_subjects2:

            subjects1 = db.query(
                """select subject1_title, subject1_url from %s where subject1_title not in (select distinct subject1_title from %s)
                                               """ %
                (options.subject1_table_name, options.subject2_table_name))

            print len(subjects1), 'subjects1 yet to be crawled'
            db.set_autocommit(True)

            for (subject1_title, subject1_url) in subjects1:
                print 'subject1_title:', subject1_title
                print 'subject1_url:', subject1_url
                driver.get(subject1_url)

                hoverlist = WebDriverWait(driver, 30).until(
                    EC.presence_of_element_located(
                        (By.CLASS_NAME, "subjects")))
                index = 0
                for a in hoverlist.find_elements_by_xpath("./li/a"):
                    index += 1
                    print 'index:', index
                    subject2_title = a.text.strip()
                    print 'subject2_title:', subject2_title
                    subject2_url = a.get_attribute('href')
                    print 'subject2_url:', subject2_url
                    data = {
                        'subject1_title': subject1_title,
                        'subject2_title': subject2_title,
                        'subject2_url': subject2_url
                    }
                    db.insert(options.subject2_table_name, data)
                db.commit()
                time.sleep(3)

        if options.crawl_subjects3:

            subjects2 = db.query(
                """select a.subject1_title, a.subject2_title, a.subject2_url from %s a left join %s b on a.subject1_title=b.subject1_title and a.subject2_title=b.subject2_title where b.subject1_title is null
                                               """ %
                (options.subject2_table_name, options.subject3_table_name))

            print len(subjects2), 'subjects2 yet to be crawled'
            db.set_autocommit(False)

            for (subject1_title, subject2_title, subject2_url) in subjects2:
                print 'subject1_title:', subject1_title
                print 'subject2_title:', subject2_title
                print 'subject2_url:', subject2_url
                driver.get(subject2_url)
                time.sleep(3)
                try:
                    hoverlist = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located(
                            (By.CLASS_NAME, "subjectsbox")))
                    hoverlist = hoverlist.find_elements_by_xpath(
                        "./ul[@class='subjects']")
                except:
                    print "subjects not found. so crawling textbook listing"
                    data = {
                        'subject1_title': subject1_title,
                        'subject2_title': subject2_title,
                        'subject3_title': '',
                        'subject3_url': subject2_url
                    }
                    db.insert(options.subject3_table_name, data)
                    db.commit()
                    crawl_textbook_listing(driver, db, workingdir, options,
                                           subject1_title, subject2_title, '',
                                           '')
                    continue

                index = 0
                for a in hoverlist[0].find_elements_by_xpath("./li/a"):
                    index += 1
                    print 'index:', index
                    subject3_title = a.text.strip()
                    print 'subject3_title:', subject3_title
                    subject3_url = a.get_attribute('href')
                    print 'subject3_url:', subject3_url
                    data = {
                        'subject1_title': subject1_title,
                        'subject2_title': subject2_title,
                        'subject3_title': subject3_title,
                        'subject3_url': subject3_url
                    }
                    db.insert(options.subject3_table_name, data)
                db.commit()
                time.sleep(3)

        if options.crawl_subjects4:

            subjects3 = db.query(
                """select a.subject1_title, a.subject2_title, a.subject3_title, a.subject3_url from %s a left join %s b on a.subject1_title=b.subject1_title and a.subject2_title=b.subject2_title and a.subject3_title=b.subject3_title where b.subject1_title is null
                                               """ %
                (options.subject3_table_name, options.subject4_table_name))

            print len(subjects3), 'subjects3 yet to be crawled'
            db.set_autocommit(False)

            for (subject1_title, subject2_title, subject3_title,
                 subject3_url) in subjects3:
                print 'subject1_title:', subject1_title
                print 'subject2_title:', subject2_title
                print 'subject3_title:', subject3_title
                print 'subject3_url:', subject3_url
                driver.get(subject3_url)
                time.sleep(3)

                if not subject3_title:
                    print "subject3_title is empty"
                    data = {
                        'subject1_title': subject1_title,
                        'subject2_title': subject2_title,
                        'subject3_title': subject3_title,
                        'subject4_title': '',
                        'subject4_url': subject3_url
                    }
                    db.insert(options.subject4_table_name, data)
                    db.commit()
                    continue

                try:
                    hoverlist = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located(
                            (By.CLASS_NAME, "subjectsbox")))
                    hoverlist = hoverlist.find_elements_by_xpath(
                        "./ul[@class='subjects']")
                except:
                    data = {
                        'subject1_title': subject1_title,
                        'subject2_title': subject2_title,
                        'subject3_title': subject3_title,
                        'subject4_title': '',
                        'subject4_url': subject3_url
                    }
                    db.insert(options.subject4_table_name, data)
                    db.commit()
                    crawl_textbook_listing(driver, db, workingdir, options,
                                           subject1_title, subject2_title,
                                           subject3_title, '')
                    continue

                index = 0
                for a in hoverlist[0].find_elements_by_xpath("./li/a"):
                    index += 1
                    print 'index:', index
                    subject4_title = a.text.strip()
                    print 'subject4_title:', subject4_title
                    subject4_url = a.get_attribute('href')
                    print 'subject4_url:', subject4_url
                    data = {
                        'subject1_title': subject1_title,
                        'subject2_title': subject2_title,
                        'subject3_title': subject3_title,
                        'subject4_title': subject4_title,
                        'subject4_url': subject4_url
                    }
                    db.insert(options.subject4_table_name, data)
                db.commit()
                time.sleep(3)

        if options.crawl_textbooks:

            subjects4 = db.query(
                """select a.subject1_title, a.subject2_title, a.subject3_title, a.subject4_title, a.subject4_url from %s a left join %s b 
                                                   on a.subject1_title=b.subject1_title and a.subject2_title=b.subject2_title and a.subject3_title=b.subject3_title and a.subject4_title=b.subject4_title where b.subject1_title is null
                                               """ %
                (options.subject4_table_name, options.textbook_table_name))

            print len(subjects4), 'subjects4 yet to be crawled'
            db.set_autocommit(False)

            for (subject1_title, subject2_title, subject3_title,
                 subject4_title, subject4_url) in subjects4:
                print 'subject1_title:', subject1_title
                print 'subject2_title:', subject2_title
                print 'subject3_title:', subject3_title
                print 'subject4_title:', subject4_title
                print 'subject4_url:', subject4_url
                driver.get(subject4_url)
                time.sleep(3)
                crawl_textbook_listing(driver, db, workingdir, options,
                                       subject1_title, subject2_title,
                                       subject3_title, subject4_title)

        if options.crawl_textbook_details:
            textbooks = db.query(
                """select distinct subject1_title, subject2_title, subject3_title, subject4_title,textbook_title,textbook_url from %s where crawled=0"""
                % (options.textbook_table_name))

            print len(textbooks), 'textbooks yet to be crawled'
            db.set_autocommit(True)

            count = 0
            for (subject1_title, subject2_title, subject3_title,
                 subject4_title, textbook_title, textbook_url) in textbooks:
                count += 1
                print 'count:', count
                print 'textbook_title:', textbook_title
                print 'textbook_url:', textbook_url
                if not textbook_url:
                    continue
                driver.get(textbook_url)
                time.sleep(3)
                format_journal = driver.find_elements_by_class_name(
                    "format-journal")
                if format_journal:
                    data = {'crawled': None}
                    db.update(options.textbook_table_name, data,
                              "textbook_url='%s'" % textbook_url)
                    time.sleep(3)
                    continue
                    #crawl_journal(driver,options,db,subject1_title, subject2_title, subject3_title, subject4_title,textbook_title,textbook_url)
                    #continue
                product_main = driver.find_elements_by_class_name(
                    "product-main")
                if not product_main:
                    data = {'crawled': None}
                    db.update(options.textbook_table_name, data,
                              "textbook_url='%s'" % textbook_url)
                    time.sleep(3)
                    continue
                productDetail_largeCover = product_main[
                    0].find_element_by_class_name("productDetail-largeCover")
                coverImage = productDetail_largeCover.find_elements_by_tag_name(
                    'img')
                textbook_image_url = coverImage[0].get_attribute('src')
                print 'textbook_image_url:', textbook_image_url
                product_biblio = driver.find_element_by_class_name(
                    "product-biblio")

                productDetail_authorsMain = product_biblio.find_elements_by_class_name(
                    "productDetail-authorsMain")
                textbook_author = None
                if productDetail_authorsMain:
                    textbook_author = productDetail_authorsMain[0].text.strip()
                    if textbook_author.startswith('By '):
                        textbook_author = textbook_author[3:].strip()
                print 'textbook_author:', textbook_author
                textbook_publish_date = None
                textbook_copyright_year = None
                if product_biblio.find_elements_by_class_name(
                        "productDetail-dateImprint"):
                    textbook_publish_date = product_biblio.find_element_by_class_name(
                        "productDetail-dateImprint").text.split(
                            ",")[0].strip()
                    textbook_publish_date = int(
                        mx.DateTime.DateTimeFrom(textbook_publish_date))
                    productDetail_dateImprint = product_biblio.find_element_by_class_name(
                        "productDetail-dateImprint").text
                    if '©' in productDetail_dateImprint:
                        textbook_copyright_year = productDetail_dateImprint[(
                            productDetail_dateImprint.find('©') + 1):].strip()
                print 'textbook_publish_date:', textbook_publish_date
                print 'textbook_copyright_year:', textbook_copyright_year
                textbook_isbn = textbook_url[(textbook_url.find('-') +
                                              1):].replace(".html",
                                                           "").strip()
                if len(textbook_isbn) > 12:
                    textbook_isbn = textbook_isbn.replace(
                        textbook_isbn[textbook_isbn.find(','):], '').strip()
                textbook_isbn_10 = textbook_isbn
                print 'textbook_isbn_10:', textbook_isbn_10
                productDetail_productCode = product_biblio.find_elements_by_class_name(
                    "productDetail-productCode")
                textbook_isbn_13 = None
                if productDetail_productCode:
                    textbook_isbn_13 = productDetail_productCode[
                        0].text.replace('-', '')
                    textbook_isbn_13 = textbook_isbn_13.replace("ISBN:",
                                                                "").strip()
                else:
                    textbook_isbn_13 = None
                print 'textbook_isbn_13:', textbook_isbn_13
                toc = 0
                toc_html = ''
                textbook_description = None
                textbook_publisher = "Wiley"
                print 'textbook_publisher:', textbook_publisher

                infoDescription = driver.find_elements_by_id("infoDescription")
                if infoDescription:
                    #productDetail_richDataText = driver.find_elements_by_class_name("showMore")
                    #if productDetail_richDataText:
                    #    if productDetail_richDataText[0].text.strip() == 'See More':
                    #        productDetail_richDataText[0].click()

                    textbook_description = infoDescription[
                        0].find_element_by_class_name(
                            "productDetail-richDataText")
                    textbook_description = textbook_description.get_attribute(
                        'innerText').strip()
                    print 'textbook_description:', textbook_description

    #             ribbon_tab_navigation = driver.find_element_by_class_name("ribbon-tab-navigation")
    #             a = ribbon_tab_navigation.find_elements_by_xpath(".//li[@class = 'toc-tab']")
    #             if a:
    #                 toc = 1
    #                 print 'toc available'
    #                 #a[0].click()
    #                 #time.sleep(3)
    #
                infoTableof = driver.find_elements_by_id("infoTableof")
                if infoTableof:
                    #if infoTableof[0].text.strip() == 'See More':
                    #    infoTableof[0].click()

                    content = infoTableof[0].find_element_by_class_name(
                        'productDetail-richDataText')

                    toc_html = content.get_attribute('innerHTML').strip()
                    m = hashlib.md5()
                    m.update(textbook_url)
                    url_md5 = m.hexdigest()
                    file = codecs.open(
                        workingdir + '/wiley_toc_html/' + url_md5 + '.html',
                        "w", "utf-8")
                    file.write(toc_html)
                    file.close()
                    print 'TOC:'
                    print toc_html
                    print 'toc_html_file :', url_md5 + '.html'
                    toc = 1

                data = {
                    'textbook_isbn': textbook_isbn_13,
                    'textbook_isbn_10': textbook_isbn_10,
                    'textbook_isbn_13': textbook_isbn_13,
                    'textbook_author': textbook_author,
                    'textbook_copyright_year': textbook_copyright_year,
                    'textbook_publish_date': textbook_publish_date,
                    'textbook_description': textbook_description,
                    'textbook_publisher': textbook_publisher,
                    'textbook_image_url': textbook_image_url,
                    'crawled': 1,
                    'toc': toc,
                    'toc_html': toc_html
                }
                db.update(options.textbook_table_name, data,
                          "textbook_url='%s'" % textbook_url)
                time.sleep(3)

    except:
        traceback.print_exc()
        if driver:
            driver.save_screenshot(workingdir + '/error.png')
            print workingdir + '/error.png'

    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()
Example #6
def Main():
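    # This crawler appears to collect simulation listings (the "simulation-link"
    # markup suggests PhET-style pages), storing title, URL, cover image, and the
    # badge-derived format (html5 / java applet / shockwave flash) per subject.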
    parser = OptionParser()
    parser.add_option("--crawl-textbooks",
                      dest="crawl_textbooks",
                      action="store_true",
                      help="crawl textbooks list",
                      default=False)
    parser.add_option("--working-dir",
                      dest="workingdir",
                      type="string",
                      help="working directory",
                      default='.')
    parser.add_option("--db-name",
                      dest="db_name",
                      type="string",
                      help="database name",
                      default='colorado_1012')
    parser.add_option("--subject2-table-name",
                      dest="subject2_table_name",
                      type="string",
                      help="subject2 table name",
                      default='colorado_subject2')
    parser.add_option("--textbook-table-name",
                      dest="textbook_table_name",
                      type="string",
                      help="textbook table name",
                      default='colorado_textbook')
    parser.add_option("--skip",
                      dest="skip",
                      type=int,
                      help="integer value",
                      default=0)
    parser.add_option("--use-firefox",
                      dest="use_firefox",
                      action="store_true",
                      help="use-firefox",
                      default=True)
    (options, args) = parser.parse_args()

    workingdir = options.workingdir.rstrip('/')

    if not os.path.exists(workingdir):
        parser.error("workingdir not exists")

    try:
        display = None
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1000, 900))
        display.start()
    except:
        print 'No Xvfb!'

    # this parser defines no --db-port option, so connect on the default port
    db = mysql.DB(db=options.db_name)
    driver = crawlutils.open_driver(use_firefox=options.use_firefox)

    try:
        if options.crawl_textbooks:

            subject2 = db.query("""select * from %s""" %
                                (options.subject2_table_name))

            print len(subject2), 'Textbooks yet to be crawled'
            db.set_autocommit(True)

            for (subject1_title, subject2_title, subject_url) in subject2:
                print 'subject1_title:', subject1_title
                print 'subject2_title:', subject2_title
                print 'subject_url:', subject_url
                driver.get(subject_url)
                time.sleep(3)
                simulation_link = driver.find_elements_by_class_name(
                    "simulation-link")

                for link in simulation_link:
                    file_format = None
                    textbook_url = link.get_attribute("href")
                    textbook_image_url = link.find_element_by_tag_name(
                        "img").get_attribute("src")
                    textbook_title = link.find_element_by_tag_name(
                        "strong").text
                    span = link.find_elements_by_tag_name('span')
                    badge = span[1].get_attribute("class")
                    if "html" in badge:
                        file_format = "html5"
                    if "java" in badge:
                        file_format = "java applet"
                    if "flash" in badge:
                        file_format = "shockwave flash"
                    print "textbook_title :", textbook_title
                    print "textbook_url :", textbook_url
                    print "textbook_image_url :", textbook_image_url
                    print "file_format :", file_format
                    raise Exception("done")

                    data = {
                        'subject1_title': subject1_title,
                        'subject2_title': subject2_title,
                        'textbook_title': textbook_title,
                        'textbook_url': textbook_url,
                        'textbook_image_url': textbook_image_url,
                        'format': file_format
                    }
                    db.insert(options.textbook_table_name, data)

    except:
        traceback.print_exc()
        if driver:
            driver.save_screenshot(workingdir + '/error.png')
            print workingdir + '/error.png'

    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()
Example #7
def Main():
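    # This crawler appears to read pending rows from the textbook_package table,
    # open each pearsonhighered.com page, and extract per-package title, author,
    # copyright year, pages and ISBN-10/13 into package_textbook2, updating the
    # source row's status when done.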
    parser = OptionParser()
    parser.add_option("--textbook-package",
                      dest="textbook_package",
                      action="store_true",
                      help="textbook package details",
                      default=False)
    parser.add_option("--working-dir",
                      dest="workingdir",
                      type="string",
                      help="working directory",
                      default='.')
    parser.add_option("--db-name",
                      dest="db_name",
                      type="string",
                      help="db name",
                      default='pearsonhighered')
    parser.add_option("--db-port",
                      dest="db_port",
                      type="int",
                      help="db port",
                      default=3306)
    parser.add_option("--textbook-package-table-name",
                      dest="textbook_package_table_name",
                      type="string",
                      help="textbook package table name",
                      default='textbook_package')
    parser.add_option("--skip",
                      dest="skip",
                      type=int,
                      help="integer value",
                      default=0)
    parser.add_option("--use-firefox",
                      dest="use_firefox",
                      action="store_true",
                      help="use-firefox",
                      default=False)
    (options, args) = parser.parse_args()

    workingdir = options.workingdir.rstrip('/')

    if not os.path.exists(workingdir):
        parser.error("workingdir not exists")

    try:
        display = None
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1000, 900))
        display.start()
    except:
        print 'No Xvfb!'

    db = mysql.DB(db=options.db_name, port=options.db_port)
    driver = crawlutils.open_driver(use_firefox=options.use_firefox)

    try:
        if options.textbook_package:
            textbooks = db.query(
                """select textbook_id,textbook_url from %s where status=%s""" %
                (options.textbook_package_table_name, 0))

            print len(textbooks), 'textbooks yet to be crawled'
            db.set_autocommit(True)

            count = 0
            for (textbook_id, textbook_url) in textbooks:
                count += 1
                print 'count:', count
                print 'textbook_id:', textbook_id
                print 'textbook_url:', textbook_url
                if not textbook_url:
                    continue
                driver.get(textbook_url)
                time.sleep(3)
                tab_about_this_product = driver.find_elements_by_id(
                    "tab-about-this-product")
                if not tab_about_this_product:
                    data = {'status': 2}
                    db.update(options.textbook_package_table_name, data,
                              "textbook_url='%s'" % textbook_url)
                    continue
                    #raise Exception("tab about this product not found")
                #data_feed_float_right=tab_content_group[0].find_elements_by_class_name("data-feed.float-right")
                #if data_feed_float_right:
                #   raise Exception("data-feed float-right no found")
                description = tab_about_this_product[0].find_elements_by_id(
                    "description")
                if not description:
                    data = {'status': -1}
                    db.update(options.textbook_package_table_name, data,
                              "textbook_url='%s'" % textbook_url)
                    continue
                    #raise Exception("description not found")
                ul = description[0].find_elements_by_tag_name("ul")
                if not ul:
                    data = {'status': -1}
                    db.update(options.textbook_package_table_name, data,
                              "textbook_url='%s'" % textbook_url)
                    continue
                    #raise  Exception("ul tag not found")
                li_tag = ul[0].find_elements_by_tag_name("li")
                if not li_tag:
                    raise Exception("li tag not found")
                a = li_tag[0].find_elements_by_tag_name("a")
                if not a:
                    data = {'status': -1}
                    db.update(options.textbook_package_table_name, data,
                              "textbook_url='%s'" % textbook_url)
                    continue
                    #raise  Exception("a tag not found")
                for li in li_tag:
                    a = li.find_elements_by_tag_name("a")
                    if not a:
                        raise Exception("a tag not found")
                    title = a[0].text.strip()
                    print "textbook title", title
                    p = li.find_elements_by_tag_name("p")
                    if not p:
                        raise Exception("p tag not found")
                    if not len(p) == 3:
                        raise Exception("all p tags are not found")
                    for tag in p:
                        package_details = tag.text
                        if '©' in package_details:
                            copy_right_year = package_details[
                                package_details.find("©") +
                                1:package_details.find("•")]
                            copy_right_year = copy_right_year.strip()
                            if not len(copy_right_year) == 4:
                                raise Exception(
                                    "copy right right is not correct")
                            if 'pp' in package_details:
                                pages = package_details[
                                    package_details.find(",") +
                                    1:package_details.find("pp")]
                                pages = pages.strip()
                            continue

                        if "ISBN" in package_details:
                            if "•" in package_details:
                                isbns = package_details.split("•")
                            for isbn in isbns:
                                if "ISBN-10:" in isbn:
                                    isbn_10 = isbn.replace("ISBN-10:",
                                                           "").strip()
                                    continue
                                if "ISBN-13:" in isbn:
                                    isbn_13 = isbn.replace("ISBN-13:",
                                                           "").strip()
                                    continue
                            continue
                        author = package_details.strip()
                    if not len(isbn_10) == 10:
                        raise Exception("isbn 10 is not correct")
                    if not len(isbn_13) == 13:
                        raise Exception("isbn 13 is not correct")
                    print "author :", author
                    print "copy right year", copy_right_year
                    print "Pages", pages
                    print "isbn_10 :", isbn_10
                    print "isbn_13 :", isbn_13

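                    # One row per package parsed from this product page; note
                    # that the table name 'package_textbook2' is hardcoded here
                    # instead of being read from an option.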
                    data = {
                        'textbook_id': textbook_id,
                        'textbook_url': textbook_url,
                        'textbook_title': title,
                        'isbn10': isbn_10,
                        'isbn13': isbn_13,
                        'author': author,
                        'copyright_year': copy_right_year,
                        'pages': pages,
                        'status': 1
                    }
                    db.insert('package_textbook2', data)

                data = {'status': 1}
                db.update(options.textbook_package_table_name, data,
                          "textbook_url='%s'" % textbook_url)

            print "Crawling done"
    except:
        traceback.print_exc()
        if driver:
            driver.save_screenshot(workingdir + '/error.png')
            print workingdir + '/error.png'
    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()
Example #8
def Main():
    parser = OptionParser()
    parser.add_option("--crawl", dest="crawl", action="store_true", help="crawl url", default=False)
    parser.add_option("--crawl-landing", dest="crawl_landing", action="store_true", help="crawl url", default=False)
    parser.add_option("--working-dir", dest="workingdir", type="string", help="working directory", default='.')
    parser.add_option("--db-name", dest="db_name", type="string", help="database name", default='skillscommons')
    parser.add_option("--table-name", dest="table_name", type="string", help="table name", default='skill')
   
    parser.add_option("--main-table-name", dest="main_table_name", type="string", help="main table name", default='skillscommons')
    parser.add_option("--attachment-table-name", dest="attachment_table_name", type="string", help="attachment table name", default='attachment')
    parser.add_option("--meta-table-name", dest="meta_table_name", type="string", help="meta table name", default='meta_data')
    parser.add_option("--use-firefox", dest="use_firefox", action="store_true", help="use-firefox", default=True)
   
    (options, args) = parser.parse_args()    
    workingdir = options.workingdir.rstrip('/')
    
    if not os.path.exists(workingdir):
        parser.error("workingdir not exists")
    
    try:
        display = None
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1000,900))
        display.start()
    except:
        print 'No Xvfb!'
    
    db = mysql.DB(db=options.db_name)
    db.set_autocommit(True)
    driver = crawlutils.open_driver(use_firefox=options.use_firefox)
    links=["https://www.skillscommons.org/discover?rpp=2000&page=1&group_by=none&etal=0",
    "https://www.skillscommons.org/discover?rpp=2000&page=2&group_by=none&etal=0",
    "https://www.skillscommons.org/discover?rpp=2000&page=3&group_by=none&etal=0"]
    try:
        if options.crawl:
            
            count = 0
            for link in links:
                print "Link :",link
            
                driver.get(link)
                time.sleep(5)
                medium_results=driver.find_element_by_class_name("medium-results")
                li=medium_results.find_elements_by_tag_name("li")
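                # Each result <li> holds an <a> with the title and URL, one or
                # two "type" spans (resource type and institution, or just the
                # institution), and an "abstract" block with the description.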
                for tag in li:
                    count+=1
                    print "Count :",count
                    link_tag=tag.find_element_by_tag_name("a")
                    title=link_tag.text.strip()
                    url=link_tag.get_attribute("href")
                    types=tag.find_elements_by_class_name("type")
                    if len(types)==2:
                        type=types[0].text.strip()
                        institution=types[1].text.strip()
                    else:
                        type=None
                        institution=types[0].text.strip()
                    description=tag.find_element_by_class_name("abstract").text.strip()
                    print "title :", title
                    print "url :",url
                    print "type :",type
                    print "institution :",institution
                    print "description :",description
            
                    data = {
                    'title':title,
                    'institution':institution,
                    'url':url,
                    'type':type,
                    'description':description,
                    }
                    db.insert(options.table_name, data)                      
               

        if options.crawl_landing:
            count=0
            skill=db.query("select distinct url from skill where crawled=0")
            print "Number of urls to crawl ",len(skill)
            for (src_url,) in skill:
                print "source url :",src_url
                print "count %s"%count
                count+=1
                driver.get(src_url)
                author=None
                col=driver.find_element_by_class_name("col-sm-8")
                title=col.find_element_by_tag_name("h1").text.strip()
                m = hashlib.md5()
                m.update(title+src_url)
                document_id=m.hexdigest()
                toc_html="/mnt/data/kendavar/skillscommons/%s.html"%document_id
                # codecs.open (assumed imported alongside the other modules) is
                # used because the builtin file() does not accept an encoding
                # argument
                codecs.open(toc_html,"w","utf8").write(driver.page_source)
                authors=col.find_element_by_class_name("authors")
                if not authors.find_elements_by_tag_name("div"):
                    author=authors.text.strip()
                description=col.find_element_by_class_name("abstract").text
                files=col.find_element_by_class_name("files")
                file_information=files.find_elements_by_class_name("file-information")
                attachment=[]
                for attach in file_information:
                    attachment.append((attach.text.strip(),attach.find_element_by_tag_name("a").get_attribute("href")))
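                # Metadata is laid out as <dl> lists of <dt>/<dd> pairs; a <dd>
                # may hold <li> items (joined below with commas), anchors
                # (collected as [text, href, ...]), or plain text.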
                dls=col.find_elements_by_tag_name("dl")
                meta={}
                string=''
                for dl in dls:
                    for div in dl.find_elements_by_tag_name("div"):
                        string=''
                        dd=div.find_element_by_tag_name("dd")
                        if dd.find_elements_by_tag_name("li"):
                            for li in dd.find_elements_by_tag_name("li"):
                                string=string+li.text.strip()+","
                        elif dd.find_elements_by_tag_name("a"):
                            string=[dd.text.strip()]
                            anchors=[]
                            for anchor in dd.find_elements_by_tag_name("a"):
                                if anchor.get_attribute("href") not in anchors:
                                    anchors.append(anchor.get_attribute("href"))
                                    string.append(anchor.get_attribute("href"))
                        else:
                            string=dd.text.strip()
                        meta[div.find_element_by_tag_name("dt").text.replace(":","").strip()]=string
                print "title :",title
                print "author :",author
                print "description :",description
                print "toc_path",toc_html
                data={
                "document_id":document_id,
                "title":title,
                "author":author,
                "description":description,
                "toc_path":toc_html
                }
                db.insert(options.main_table_name, data) 
                for (attachment_title,attachment_url) in attachment:
                      print "document_id":document_id,
                      print "attachment_title":attachment_title,
                      print "attachment_url":attachment_url
                      data={
                      "document_id":document_id,
                      "attachment_title":attachment_title,
                      "attachment_url":attachment_url
                      }
                      db.insert(options.attachment_table_name, data) 
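                # One meta_data row per label/value pair collected above; list
                # values arrive as [text, href, href, ...] from the <dd>
                # parsing.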
                for key,value in meta.iteritems():
                      if value[-1]==",":
                          value=value[:-1]
                      print '%s : %s'%(key,value)

                      if type(value) is list:
                          # value is [dd text, href, href, ...]; enumerate so
                          # the i%2 alternation between value and URL has a
                          # defined index, and start both fields as None
                          meta_value=None
                          meta_url=None
                          for i,val in enumerate(value):
                              meta_title=key
                              if i%2==0 :
                                  meta_value=val
                              else:
                                  meta_url=val
                              print "meta_title :",meta_title
                              print "meta_value :",meta_value
                              print "meta_url :",meta_url
                              data={
                              "document_id":document_id,
                              "meta_title":meta_title,
                              "meta_value":meta_value,
                              "meta_url":meta_url
                              }
                              db.insert(options.meta_table_name, data)
                      else:
                          meta_title=key
                          meta_url=None
                          meta_value=value
                          print "meta_title":meta_title
                          print "meta_value":meta_value
                          print "meta_url":meta_url
                          data={
                          "document_id":document_id,
                          "meta_title":meta_title,
                          "meta_value":meta_value,
                          "meta_url":meta_url
                          }
                          db.insert(options.meta_table_name, data)
                data={
                "crawled":1
                }
                db.update(options.table_name,data,"url='%s'"%src_url)
                print "updated the table"

    except:
        traceback.print_exc()
        if driver:
            driver.save_screenshot(workingdir + '/error.png')
            print workingdir + '/error.png'
    
    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()