Example #1
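# Fetches each "URL Link" attachment with requests, strips scripts, styles and HTML
# comments from the landing page with BeautifulSoup, writes the visible text to disk,
# and records the landing URL, iframe count and text location back in MySQL.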
def Main():
    parser = OptionParser()
    parser.add_option("--crawl-text", dest="crawl_text", action="store_true", help="crawl text", default=False)
    parser.add_option("--working-dir", dest="workingdir", type="string", help="working directory", default='.')
    parser.add_option("--db-name", dest="db_name", type="string", help="database name", default='orangegrove')   
    parser.add_option("--attachment-table-name", dest="attachment_table", type="string", help="attachment table name", default='attachments')
    parser.add_option("--use-firefox", dest="use_firefox", action="store_true", help="use-firefox", default=True)
   
    (options, args) = parser.parse_args()    
    workingdir = options.workingdir.rstrip('/')
    num_of_iframs=0
    if not os.path.exists(workingdir):
        parser.error("workingdir not exists")
    db = mysql.DB(db=options.db_name)
    db.set_autocommit(True)
    cursor=db.get_cursor()
    try:
        if options.crawl_text:
            count=0
            attachments=db.query("select distinct url,attachment_id from %s where
             file_type = '%s' and landed=0 and url is not NULL"%(options.attachment_table,"URL Link"))
            print "Number of urls to crawl ",len(attachments)
            for (url,attachment_id,) in attachments:
                try:
                    count+=1
                    print "source url :",url
                    print "attachment_id :",attachment_id
                    print "count %s"%count
                    landing_url=url  # fallback so the update in the error handler always has a value
                    res=requests.get(url)
                    time.sleep(3)
                    
                    landing_url=res.url
                    if ".pdf" in landing_url:
                        raise Exception("Format not html")
                    data=res.text
                    soup=bs(data,"html")
                    iframes=soup.findAll("iframe")
                    num_of_iframs=len(iframes)
                    body=soup.find('body')
                    print body.text
                    # strip scripts, styles and HTML comments so only visible text remains
                    for e in body.findAll('script'):
                        e.extract()
                    for e in body.findAll('style'):
                        e.extract()
                    for e in body.findAll(text=lambda text:isinstance(text, Comment)):
                        e.extract()
                    txt=body.text
                    visible_text=' '.join(txt.split())  # collapse tabs, newlines and repeated spaces
                    
                    txt_location="/mnt/data/kendavar/orangegrove/textfile/%s.txt"%attachment_id
                    f=codecs.open(txt_location,"w","utf-8")
                    f.write(visible_text)
                    f.close()
                    print "txt_location :",txt_location
                    print "landing_url :",landing_url
                    print "num_of_iframs :",num_of_iframs
                    print "landing :",1
                    cursor.execute("""update %s set txt_location='%s',landing_url='%s',landed=%s,num_of_iframs=%s where attachment_id=%s"""%(options.attachment_table,txt_location,landing_url,1,num_of_iframs,attachment_id))
                except:
                    traceback.print_exc()
                    logging.exception('Got exception on main handler')
                    cursor.execute("""update %s set landing_url='%s', landed=%s where attachment_id=%s"""%(options.attachment_table,landing_url,-1,attachment_id))    
                    pass
                    #data={
                    #"txt_location":txt_location,
                    #"landing_url":landing_url,
                    #"num_of_iframs":num_of_iframs,
                    #"landed":1
                    #}
                    #db.update(options.attachment_table,data,"url='%s'"%url)
    except:
        traceback.print_exc()
Example #2
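# Walks every TED topic page with Selenium, follows the pagination links, and hands
# each result page to crawl_data() for extraction.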
def Main():
    parser = OptionParser()
    parser.add_option("--crawl", dest="crawl", action="store_true", help="crawllist", default=False)
    parser.add_option("--working-dir", dest="workingdir", type="string", help="working directory", default='.')
    parser.add_option("--db-name", dest="db_name", type="string", help="database name", default='ted')
    parser.add_option("--topic-table-name", dest="topic_table_name", type="string", help="topic table name", default='ted_topics')
    parser.add_option("--ted-table-name", dest="ted_table_name", type="string", help="ted table name", default='ted')
    parser.add_option("--skip", dest="skip", type=int, help="integer value", default=0)
    parser.add_option("--use-firefox", dest="use_firefox", action="store_true", help="use-firefox", default=True)
    (options, args) = parser.parse_args()
    
    workingdir = options.workingdir.rstrip('/')
    
    if not os.path.exists(workingdir):
        parser.error("workingdir not exists")
    
    try:
        display = None
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1000,900))
        display.start()
    except:
        print 'No Xvfb!'
    
    db = mysql.DB(db=options.db_name)
    driver = crawlutils.open_driver(use_firefox=options.use_firefox)
    
    try:
        if options.crawl:
    
            topics = db.query("""select * from %s""" %(options.topic_table_name))
    
            print len(topics), 'Topics to be crawled yet'
            db.set_autocommit(True)
            count=0
            for (topic,topic_url,) in topics:
                count+=1
                print 'topic:', topic
                print 'topic_url:', topic_url
                print 'topic count :',count
                driver.get(topic_url)
                time.sleep(3)
                pagination=driver.find_elements_by_class_name("pagination")
                number=0
                if pagination:
                    atag=pagination[0].find_elements_by_tag_name("a")
                    page_numbers=int(atag[-2].text.encode("utf-8"))
                    print "Page numbers ",page_numbers
                    for page_number in range(page_numbers):
                        number+=1
                        url="https://www.ted.com/talks?page=%s&sort=newest&topics[]=%s"%(str(page_number+1),topic)
                        url=url.replace(" ","+")
                        print "Page url :",url
                        print "page number :",number
                        driver.get(url)
                        time.sleep(3)
                        crawl_data(driver,options,db,topic)
                else:
                    print "Paginator not found"
                    crawl_data(driver,options,db,topic)

    except:
        traceback.print_exc()
        if driver:
            driver.save_screenshot(workingdir + '/error.png')
            print workingdir + '/error.png'
    
    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()
Example #3
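# Two-phase Open Textbook Library crawler: --crawl-textbook collects the textbook list
# per subject, --crawl-textbook-details visits each textbook page and stores its
# attachments, table of contents and publication metadata.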
def Main():
    driver=None
    parser = OptionParser()
    parser.add_option("--crawl-textbook", dest="crawl_textbook", action="store_true", help="crawl textbook", default=False)
    parser.add_option("--crawl-textbook-details", dest="crawl_textbook_details", action="store_true", help="crawl textbook details", default=False)

    parser.add_option("--working-dir", dest="workingdir", type="string", help="working directory", default='.')
    parser.add_option("--db-name", dest="db_name", type="string", help="database name", default='opentextbooks')
    parser.add_option("--subject-table-name", dest="subject_table_name", type="string", help="subject table name", default='subject')
    parser.add_option("--textbook-table-name", dest="textbook_table_name", type="string", help="textbook table name", default='opentextbooks')
    parser.add_option("--attachment-table-name", dest="attachment_table_name", type="string", help="attachment table name", default='attachments')
    parser.add_option("--toc-table-name", dest="toc_table_name", type="string", help="toc table name", default='table_of_content')

    parser.add_option("--skip", dest="skip", type=int, help="integer value", default=0)
    parser.add_option("--use-firefox", dest="use_firefox", action="store_true", help="use-firefox", default=True)
    (options, args) = parser.parse_args()
    
    workingdir = options.workingdir.rstrip('/')
    
    if not os.path.exists(workingdir):
        parser.error("workingdir not exists")
    
    try:
        display = None
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1000,900))
        display.start()
    except:
        print 'No Xvfb!'
    
    db = mysql.DB(db=options.db_name)
    if options.use_firefox:
        driver = crawlutils.open_driver(use_firefox=options.use_firefox)
    
    try:
        if options.crawl_textbook:
            subjects = db.query("""select * from %s""" %(options.subject_table_name))
            print len(subjects), 'subjects to be crawled yet'
            db.set_autocommit(True)
            count=0
            for (url,subject_title,) in subjects:
                count+=1
                print 'subject title:', subject_title
                print 'url:', url
                print 'subject count :',count
                driver.get(url)
                time.sleep(3)
                ShortDescription=driver.find_elements_by_class_name("ShortDescription")
                for short in ShortDescription:
                   thrid=short.find_element_by_class_name("thrid")
                   img_url=thrid.find_element_by_tag_name("img").get_attribute("src")
                   h2=short.find_element_by_tag_name("h2")
                   textbook_title=h2.text.strip()
                   textbook_url=h2.find_element_by_tag_name("a").get_attribute("href")
                   m = hashlib.md5()
                   m.update(textbook_title+textbook_url)
                   document_id=m.hexdigest()
                   string=short.find_element_by_tag_name("p").text
                   l=[]
                   if "\n" in string:
                       # one author per line: keep the full string and join the part of each line before the first comma
                       authors=string.replace("\n",", ")
                       for a in string.split("\n"):
                           l.append(a.split(",")[0])
                       author=','.join(l)
                   elif "," in string:
                       authors=string
                       author=string.split(",")[0]
                   else:
                       authors=string
                       author=string
             
                   print 'textbook_url',textbook_url
                   print 'subject_title',subject_title
                   print 'url',url
                   print 'author',author
                   print 'authors',authors
                   print 'document_id',document_id
                   print 'img_url',img_url
                   data = {
                    'textbook_title':textbook_title,
                    'textbook_url':textbook_url,
                    'subject_title':subject_title,
                    'url':url,
                    'author':author,
                    'authors':authors,
                    'document_id':document_id,
                    'img_url':img_url
                    }
                   db.insert(options.textbook_table_name, data)
                   print "db inserted"
        if options.crawl_textbook_details:
            textbook = db.query("""select document_id,textbook_url from %s where crawled=0""" %(options.textbook_table_name))
            print len(textbook), 'textbook to be crawled yet'
            db.set_autocommit(True)
            count=0
            for (document_id,textbook_url,) in textbook:
                count+=1
                print 'textbook_url:', textbook_url
                print 'document_id:', document_id
                print 'subject count :',count
                driver.get(textbook_url)
                time.sleep(3)

                third=driver.find_element_by_class_name("twothird")
                para= third.find_elements_by_tag_name("p")
                # defaults in case the page omits these paragraphs
                pub_date=None
                isbn_13_string=None
                isbn_13=None
                for p in para:
                    para_text=p.text
                    if para_text.startswith("Pub Date:"):
                        pub_date=para_text.replace("Pub Date:","")
                        if pub_date:
                            pub_date=pub_date.strip()
                        else:
                            pub_date=None
                    elif para_text.startswith("ISBN 13:")
                        isbn_13_string=para_text.replace("ISBN 13:","")
                        if isbn_13_string:
                            isbn_13_string=isbn_13_string.strip()
                            isbn_13=isbn_13_string.replace("-","")
                        else:
                            isbn_13_string=None
                            isbn_13=None
                BookTypes=driver.find_element_by_class_name("BookTypes")
                books=BookTypes.find_elements_by_tag_name("a")
                for book in books:
                    attachment_link=book.get_attribute("href")
                    type=book.text.strip()
                    print "attachment_link",attachment_link
                    print "type",type
                    data={
                    "document_id":document_id,
                    "attachment_link":attachment_link,
                    "type":type
                    }
                    db.insert(options.attachment_table_name, data)
                    print "toc table  inserted"
                Badge=driver.find_element_by_class_name("Badge-Condition")
                conditions_text=Badge.text
                condition_link=Badge.find_element_by_tag_name("a").get_attribute("href")
                toc=driver.find_element_by_id("TOC")
                table_of_content=toc.text  # plain-text dump of the TOC element
                list_tags=toc.find_elements_by_tag_name("li")
                for li_tag in list_tags:
                    chapter=li_tag.text.strip()
                    if chapter.startswith("Chapter"):
                        chapter_type="Chapter"
                    elif chapter.startswith("Part"):
                        chapter_type="Part"
                    else:
                        chapter_type=None
                    print "title",chapter
                    print "type",chapter_type

                    data={
                    'document_id':document_id,
                    'title':chapter,
                    'type': chapter_type
                    }
                    db.insert(options.toc_table_name, data)
                    print "toc table  inserted"
                AboutBook = driver.find_element_by_id("AboutBook")
                description = AboutBook.text
                links=AboutBook.find_elements_by_tag_name("a")
                for link in links:
                    href = link.get_attribute("href")
                    print "link in books",href
                    data={
                    "document_id":document_id
                    "link":href
                    }
                    db.insert("books", data)
                    print "toc table  inserted"
                AboutAuthors = driver.find_element_by_id("AboutAuthors")
                author_details = AboutAuthors.text
                print 'pub_date',pub_date,
                print 'isbn_13_string',isbn_13_string,
                print 'isbn_13',isbn_13,
                print 'conditions_text',conditions_text,
                print 'condition_link', condition_link,
                print 'table_of_content',table_of_content,
                print 'description',description,
                print 'author_details',author_details
                data = {
                'pub_date':pub_date,
                'isbn_13_string':isbn_13_string,
                'isbn_13':isbn_13,
                'conditions_text': conditions_text,
                'condition_link': condition_link,
                'table_of_content': table_of_content,
                'description' : description,
                'author_details':author_details,
                'crawled':1
                }
                db.update(options.textbook_table_name, data, "document_id='%s'" %document_id)

    except:
        traceback.print_exc()
        if driver:
            driver.save_screenshot(workingdir + '/error.png')
            print workingdir + '/error.png'
    
    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()
Example #4
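# Selenium variant of Example #1: loads each attachment URL in a browser, records pages
# that embed iframes separately, and saves the visible body text otherwise.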
def Main():
    parser = OptionParser()
    parser.add_option("--crawl-text", dest="crawl_text", action="store_true", help="crawl text", default=False)
    parser.add_option("--working-dir", dest="workingdir", type="string", help="working directory", default='.')
    parser.add_option("--db-name", dest="db_name", type="string", help="database name", default='orangegrove')   
    parser.add_option("--attachment-table-name", dest="attachment_table", type="string", help="attachment table name", default='attachments')
    parser.add_option("--use-firefox", dest="use_firefox", action="store_true", help="use-firefox", default=True)
   
    (options, args) = parser.parse_args()    
    workingdir = options.workingdir.rstrip('/')
    num_of_iframs=0
    if not os.path.exists(workingdir):
        parser.error("workingdir not exists")
    db = mysql.DB(db=options.db_name)
    db.set_autocommit(True)
    cursor=db.get_cursor()
    try:
        display = None
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1000,900))
        display.start()
    except:
        print 'No Xvfb!'

    driver = crawlutils.open_driver(use_firefox=options.use_firefox)
    try:
        if options.crawl_text:
            count=0
            attachments=db.query("select distinct url,attachment_id from %s where file_type = '%s' and landed=0 and url is not NULL"%(options.attachment_table,"URL Link"))
            print "Number of urls to crawl ",len(attachments)
            for (url,attachment_id,) in attachments:
                try:
                    count+=1
                    print "source url :",url
                    print "attachment_id :",attachment_id
                    print "count %s"%count
                    if "pdf" in url:
                        raise Exception(url)
                    driver.get(url)
                    iframes=driver.find_elements_by_tag_name("iframe")
                    body=driver.find_element_by_tag_name("body")
                    landing_url=driver.current_url
                    if "pdf" in landing_url:
                        raise Exception(landing_url)
                
                    visible_text=body.text
                    if iframes:
                        num_of_iframs=len(iframes)
                        print "landing_url :",landing_url
                        print"landed :",2
                        print"num_of_iframs :",num_of_iframs
                        #data={
                        #"landing_url":landing_url,
                        #"landed":2,
                        #"num_of_iframs":num_of_iframs
                        #}
                        
                        cursor.execute("""update %s set landing_url='%s',landed=%s,num_of_iframs=%s  where url='%s'"""%(options.attachment_table,landing_url,2,num_of_iframs,url))
                        #db.update(options.attachment_table,data,"url='%s'"%url)
                    else:
                        txt_location="/mnt/data/kendavar/orangegrove/textfile/%s.html"%attachment_id
                        f=codecs.open(txt_location,"w","utf-8")
                        f.write(visible_text)
                        f.close()
                        print "txt_location :",txt_location
                        print "landing_url :",landing_url
                        print "num_of_iframs :",num_of_iframs
                        print "landing :",1
                        cursor.execute("""update %s set txt_location='%s',landing_url='%s',landed=%s,num_of_iframs=%s where url='%s'"""%(options.attachment_table,txt_location,landing_url,1,num_of_iframs,url))
                except:
                    traceback.print_exc()
                    logging.exception('Got exception on main handler')
                    cursor.execute("""update %s set landed=%s where url='%s'"""%(options.attachment_table,-1,url))    
                    pass


    except:
        traceback.print_exc()
        if driver:
            driver.save_screenshot(workingdir + '/error.png')
            print workingdir + '/error.png'
    
    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()
Example #5
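# Fragment: the tail of a Main() wrapper plus a standalone script that revisits crawled
# USGS links with Chrome, saves a resized screenshot per document, and records the
# screenshot and text-file locations.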
    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()


if __name__ == "__main__":
    Main()

from src import mysql
from src import crawlutils
import shutil
import time
from selenium import webdriver

db = mysql.DB("usgs")
links = db.query("""select document_id,link from %s where crawled=2""" %("usgs"))
driver=webdriver.Chrome()
driver.set_window_size(1000,900)
for (document_id,link,) in links:
    # load the page and give it a moment to render before the screenshot
    driver.get(link)
    time.sleep(3)
    txt_location="/mnt/data/kendavar/usgs/txtfiles/%s_txt"%document_id
    filename="/mnt/data/kendavar/usgs/screenshots/%s.png"%document_id
    driver.save_screenshot(filename)
    crawlutils.resize_png_image(filename)
    img_location="/mnt/data/kendavar/usgs/screenshot_png/%s_png"%document_id
    shutil.copyfile(filename,img_location)
    data = {
    'screenshot':img_location,
    'txt_location':txt_location,
    'crawled':1
    }
    db.update("usgs", data, "document_id='%s'" % document_id)
driver.quit()
Example #6
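# Crawls each Colorado subject page's simulation links, recording title, image URL and
# format badge (html5 / java applet / shockwave flash) into MySQL.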
def Main():
    parser = OptionParser()
    parser.add_option("--crawl-textbooks",
                      dest="crawl_textbooks",
                      action="store_true",
                      help="crawl textbooks list",
                      default=False)
    parser.add_option("--working-dir",
                      dest="workingdir",
                      type="string",
                      help="working directory",
                      default='.')
    parser.add_option("--db-name",
                      dest="db_name",
                      type="string",
                      help="database name",
                      default='colorado_1012')
    parser.add_option("--subject2-table-name",
                      dest="subject2_table_name",
                      type="string",
                      help="subject2 table name",
                      default='colorado_subject2')
    parser.add_option("--textbook-table-name",
                      dest="textbook_table_name",
                      type="string",
                      help="textbook table name",
                      default='colorado_textbook')
    parser.add_option("--skip",
                      dest="skip",
                      type=int,
                      help="integer value",
                      default=0)
    parser.add_option("--use-firefox",
                      dest="use_firefox",
                      action="store_true",
                      help="use-firefox",
                      default=True)
    parser.add_option("--db-port",
                      dest="db_port",
                      type=int,
                      help="database port",
                      default=3306)
    (options, args) = parser.parse_args()

    workingdir = options.workingdir.rstrip('/')

    if not os.path.exists(workingdir):
        parser.error("workingdir not exists")

    try:
        display = None
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1000, 900))
        display.start()
    except:
        print 'No Xvfb!'

    db = mysql.DB(db=options.db_name, port=options.db_port)
    driver = crawlutils.open_driver(use_firefox=options.use_firefox)

    try:
        if options.crawl_textbooks:

            subject2 = db.query("""select * from %s""" %
                                (options.subject2_table_name))

            print len(subject2), 'Textbook to be crawled yet'
            db.set_autocommit(True)

            for (subject1_title, subject2_title, subject_url) in subject2:
                print 'subject1_title:', subject1_title
                print 'subject2_title:', subject2_title
                print 'subject_url:', subject_url
                driver.get(subject_url)
                time.sleep(3)
                simulation_link = driver.find_elements_by_class_name(
                    "simulation-link")

                for link in simulation_link:
                    file_format = None
                    textbook_url = link.get_attribute("href")
                    textbook_image_url = link.find_element_by_tag_name(
                        "img").get_attribute("src")
                    textbook_title = link.find_element_by_tag_name(
                        "strong").text
                    span = link.find_elements_by_tag_name('span')
                    badge = span[1].get_attribute("class")
                    if "html" in badge:
                        file_format = "html5"
                    if "java" in badge:
                        file_format = "java applet"
                    if "flash" in badge:
                        file_format = "shockwave flash"
                    print "textbook_title :", textbook_title
                    print "textbook_url :", textbook_url
                    print "textbook_image_url :", textbook_image_url
                    print "file_format :", file_format
                    raise Exception("done")

                    data = {
                        'subject1_title': subject1_title,
                        'subject2_title': subject2_title,
                        'textbook_title': textbook_title,
                        'textbook_url': textbook_url,
                        'textbook_image_url': textbook_image_url,
                        'format': file_format
                    }
                    db.insert(options.textbook_table_name, data)

    except:
        traceback.print_exc()
        if driver:
            driver.save_screenshot(workingdir + '/error.png')
            print workingdir + '/error.png'

    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()
Example #7
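# USGS education crawler: --crawl parses the undergraduate resources page with
# BeautifulSoup and saves links per subject; --details downloads each link with
# crawl_documents(), takes a screenshot, and marks it crawled.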
def Main():
    driver = None
    parser = OptionParser()
    parser.add_option("--crawl",
                      dest="crawl",
                      action="store_true",
                      help="crawl textbook",
                      default=False)
    parser.add_option("--details",
                      dest="details",
                      action="store_true",
                      help="crawl textbook details",
                      default=False)

    parser.add_option("--working-dir",
                      dest="workingdir",
                      type="string",
                      help="working directory",
                      default='.')
    parser.add_option("--db-name",
                      dest="db_name",
                      type="string",
                      help="database name",
                      default='usgs')

    parser.add_option("--table-name",
                      dest="table_name",
                      type="string",
                      help="textbook table name",
                      default='usgs')
    parser.add_option("--skip",
                      dest="skip",
                      type=int,
                      help="integer value",
                      default=0)
    parser.add_option("--use-firefox",
                      dest="use_firefox",
                      action="store_true",
                      help="use-firefox",
                      default=True)
    (options, args) = parser.parse_args()

    workingdir = options.workingdir.rstrip('/')

    if not os.path.exists(workingdir):
        parser.error("workingdir not exists")

    try:
        display = None
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1000, 900))
        display.start()
    except:
        print 'No Xvfb!'

    db = mysql.DB(db=options.db_name)
    #if options.use_firefox:
    #    driver = crawlutils.open_driver(use_firefox=options.use_firefox)

    try:
        if options.crawl:
            res = requests.get("http://education.usgs.gov/undergraduate.html")
            soup = bs(res.text)
            td = soup.findAll("td")
            subject1 = None
            subject2 = None
            for i, tag in enumerate(td):
                if i >= 5:
                    if tag.find("h2"):
                        subject1 = tag.text.strip()
                    elif len(tag.findAll("hr")) == 2:
                        if tag.find("div"):
                            tag.div.extract()
                        subject2 = tag.text.strip()
                        print subject2
                    elif tag.find("a"):
                        if tag.find("li"):
                            if not tag.find("strong"):
                                tag.a.extract()
                                description = tag.text.strip()
                            for list1 in tag.findAll("li"):
                                save(list1, subject1, subject2)

                        else:
                            save(tag, subject1, subject2)

        if options.details:
            links = db.query(
                """select document_id,link from %s where crawled=0""" %
                (options.table_name))
            print len(links), 'links to be crawled yet'
            db.set_autocommit(True)
            count = 0
            for (
                    document_id,
                    link,
            ) in links:
                count += 1
                print 'link:', link
                print 'document_id:', document_id
                print 'link count :', count
                documents = (document_id, link)
                txt_location, driver = crawl_documents(
                    documents, '/mnt/data/kendavar/usgs')
                driver.set_window_size(1000, 900)
                filename = "/mnt/data/kendavar/usgs/screenshots/%s.png" % document_id
                driver.save_screenshot(filename)
                crawlutils.resize_png_image(filename)
                img_location = "/mnt/data/kendavar/usgs/screenshot_png/%s_png" % document_id
                shutil.copyfile(filename, img_location)
                data = {
                    'screenshot': img_location,
                    'txt_location': txt_location,
                    'crawled': 1
                }
                db.update(options.table_name, data,
                          "document_id='%s'" % document_id)

    except:
        traceback.print_exc()
        if driver:
            driver.save_screenshot(workingdir + '/error.png')
            print workingdir + '/error.png'

    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()
Example #8
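# SkillsCommons crawler: --crawl paginates the discovery listing into the skill table;
# --crawl-landing visits each record page and stores its attachments and dt/dd metadata
# pairs in separate tables.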
def Main():
    parser = OptionParser()
    parser.add_option("--crawl", dest="crawl", action="store_true", help="crawl url", default=False)
    parser.add_option("--crawl-landing", dest="crawl_landing", action="store_true", help="crawl url", default=False)
    parser.add_option("--working-dir", dest="workingdir", type="string", help="working directory", default='.')
    parser.add_option("--db-name", dest="db_name", type="string", help="database name", default='skillscommons')
    parser.add_option("--table-name", dest="table_name", type="string", help="table name", default='skill')
   
    parser.add_option("--main-table-name", dest="main_table_name", type="string", help="main table name", default='skillscommons')
    parser.add_option("--attachment-table-name", dest="attachment_table_name", type="string", help="attachment table name", default='attachment')
    parser.add_option("--meta-table-name", dest="meta_table_name", type="string", help="meta table name", default='meta_data')
    parser.add_option("--use-firefox", dest="use_firefox", action="store_true", help="use-firefox", default=True)
   
    (options, args) = parser.parse_args()    
    workingdir = options.workingdir.rstrip('/')
    
    if not os.path.exists(workingdir):
        parser.error("workingdir not exists")
    
    try:
        display = None
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1000,900))
        display.start()
    except:
        print 'No Xvfb!'
    
    db = mysql.DB(db=options.db_name)
    db.set_autocommit(True)
    driver = crawlutils.open_driver(use_firefox=options.use_firefox)
    links=["https://www.skillscommons.org/discover?rpp=2000&page=1&group_by=none&etal=0",
    "https://www.skillscommons.org/discover?rpp=2000&page=2&group_by=none&etal=0",
    "https://www.skillscommons.org/discover?rpp=2000&page=3&group_by=none&etal=0"]
    try:
        if options.crawl:
            
            count = 0
            for link in links:
                print "Link :",link
            
                driver.get(link)
                time.sleep(5)
                medium_results=driver.find_element_by_class_name("medium-results")
                li=medium_results.find_elements_by_tag_name("li")
                for tag in li:
                    count+=1
                    print "Count :",count
                    link_tag=tag.find_element_by_tag_name("a")
                    title=link_tag.text.strip()
                    url=link_tag.get_attribute("href")
                    types=tag.find_elements_by_class_name("type")
                    if len(types)==2:
                        type=types[0].text.strip()
                        institution=types[1].text.strip()
                    else:
                        type=None
                        institution=types[0].text.strip()
                    description=tag.find_element_by_class_name("abstract").text.strip()
                    print "title :", title
                    print "url :",url
                    print "type :",type
                    print "institution :",institution
                    print "description :",description
            
                    data = {
                    'title':title,
                    'institution':institution,
                    'url':url,
                    'type':type,
                    'description':description,
                    }
                    db.insert(options.table_name, data)                      
               

        if options.crawl_landing:
            count=0
            skill=db.query("select distinct url from skill where crawled=0")
            print "Number of urls to crawl ",len(skill)
            for (src_url,) in skill:
                print "source url :",src_url
                print "count %s"%count
                count+=1
                driver.get(src_url)
                author=None
                col=driver.find_element_by_class_name("col-sm-8")
                title=col.find_element_by_tag_name("h1").text.strip()
                m = hashlib.md5()
                m.update(title+src_url)
                document_id=m.hexdigest()
                toc_html="/mnt/data/kendavar/skillscommons/%s.html"%document_id
                file(toc_html,"w","utf8").write(driver.page_source)
                authors=col.find_element_by_class_name("authors")
                if not authors.find_elements_by_tag_name("div"):
                    author=authors.text.strip()
                description=col.find_element_by_class_name("abstract").text
                files=col.find_element_by_class_name("files")
                file_information=files.find_elements_by_class_name("file-information")
                attachment=[]
                for attach in file_information:
                    attachment.append((attach.text.strip(),attach.find_element_by_tag_name("a").get_attribute("href")))
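                # Collect metadata from each <dl>: the <dt> label keys either a string of
                # comma-joined <li> text, or a [text, href, ...] list when anchors are present.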
                dls=col.find_elements_by_tag_name("dl")
                meta={}
                string=''
                for dl in dls:
                    for div in dl.find_elements_by_tag_name("div"):
                        string=''
                        dd=div.find_element_by_tag_name("dd")
                        if dd.find_elements_by_tag_name("li"):
                            for li in dd.find_elements_by_tag_name("li"):
                                string=string+li.text.strip()+","
                        elif dd.find_elements_by_tag_name("a"):
                            string=[dd.text.strip()]
                            anchors=[]
                            for anchor in dd.find_elements_by_tag_name("a"):
                                if anchor.get_attribute("href") not in anchors:
                                    anchors.append(anchor.get_attribute("href"))
                                    string.append(anchor.get_attribute("href"))
                        else:
                            string=dd.text.strip()
                        meta[div.find_element_by_tag_name("dt").text.replace(":","").strip()]=string
                print "title :",title
                print "author :",author
                print "description :",description
                print "toc_path",toc_html
                data={
                "document_id":document_id,
                "title":title,
                "author":author,
                "description":description,
                "toc_path":toc_html
                }
                db.insert(options.main_table_name, data) 
                for (attachment_title,attachment_url) in attachment:
                      print "document_id":document_id,
                      print "attachment_title":attachment_title,
                      print "attachment_url":attachment_url
                      data={
                      "document_id":document_id,
                      "attachment_title":attachment_title,
                      "attachment_url":attachment_url
                      }
                      db.insert(options.attachment_table_name, data) 
                for key,value in meta.iteritems():
                      if value[-1]==",":
                          value=value[:-1]
                      print '%s : %s'%(key,value)

                      if type(value) is list:
                          meta_value=None
                          meta_url=None
                          # even positions are treated as values, odd positions as urls
                          for i,val in enumerate(value):
                              meta_title=key
                              if i%2==0:
                                  meta_value=val
                              else:
                                  meta_url=val
                              print "meta_title :",meta_title
                              print "meta_value :",meta_value
                              print "meta_url :",meta_url
                              data={
                              "document_id":document_id,
                              "meta_title":meta_title,
                              "meta_value":meta_value,
                              "meta_url":meta_url
                              }
                              db.insert(options.meta_table_name, data)
                      else:
                          meta_title=key
                          meta_url=None
                          meta_value=value
                          print "meta_title :",meta_title
                          print "meta_value :",meta_value
                          print "meta_url :",meta_url
                          data={
                          "document_id":document_id,
                          "meta_title":meta_title,
                          "meta_value":meta_value,
                          "meta_url":meta_url
                          }
                          db.insert(options.meta_table_name, data)
                data={
                "crawled":1
                }
                db.update(options.table_name,data,"url='%s'"%src_url)
                print "updated the table"

    except:
        traceback.print_exc()
        if driver:
            driver.save_screenshot(workingdir + '/error.png')
            print workingdir + '/error.png'
    
    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()