Example #1
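# Fetches each "URL Link" attachment with requests, strips scripts, styles and HTML
# comments from the landing page with BeautifulSoup, writes the visible text to disk,
# and records the landing URL, iframe count and text location back in MySQL.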
def Main():
    parser = OptionParser()
    parser.add_option("--crawl-text", dest="crawl_text", action="store_true", help="crawl text", default=False)
    parser.add_option("--working-dir", dest="workingdir", type="string", help="working directory", default='.')
    parser.add_option("--db-name", dest="db_name", type="string", help="database name", default='orangegrove')   
    parser.add_option("--attachment-table-name", dest="attachment_table", type="string", help="attachment table name", default='attachments')
    parser.add_option("--use-firefox", dest="use_firefox", action="store_true", help="use-firefox", default=True)
   
    (options, args) = parser.parse_args()    
    workingdir = options.workingdir.rstrip('/')
    num_of_iframs=0
    if not os.path.exists(workingdir):
        parser.error("workingdir not exists")
    db = mysql.DB(db=options.db_name)
    db.set_autocommit(True)
    cursor=db.get_cursor()
    try:
        if options.crawl_text:
            count=0
            attachments=db.query("select distinct url,attachment_id from %s where
             file_type = '%s' and landed=0 and url is not NULL"%(options.attachment_table,"URL Link"))
            print "Number of urls to crawl ",len(attachments)
            for (url,attachment_id,) in attachments:
                try:
                    count+=1
                    print "source url :",url
                    print "attachment_id :",attachment_id
                    print "count %s"%count
                    landing_url=url  # fallback so the update in the error handler always has a value
                    res=requests.get(url)
                    time.sleep(3)
                    
                    landing_url=res.url
                    if ".pdf" in landing_url:
                        raise Exception("Format not html")
                    data=res.text
                    soup=bs(data,"html")
                    iframes=soup.findAll("iframe")
                    num_of_iframs=len(iframes)
                    body=soup.find('body')
                    print body.text
                    # strip scripts, styles and HTML comments so only visible text remains
                    for e in body.findAll('script'):
                        e.extract()
                    for e in body.findAll('style'):
                        e.extract()
                    for e in body.findAll(text=lambda text:isinstance(text, Comment)):
                        e.extract()
                    txt=body.text
                    visible_text=' '.join(txt.split())  # collapse tabs, newlines and repeated spaces
                    
                    txt_location="/mnt/data/kendavar/orangegrove/textfile/%s.txt"%attachment_id
                    f=codecs.open(txt_location,"w","utf-8")
                    f.write(visible_text)
                    f.close()
                    print "txt_location :",txt_location
                    print "landing_url :",landing_url
                    print "num_of_iframs :",num_of_iframs
                    print "landing :",1
                    cursor.execute("""update %s set txt_location='%s',landing_url='%s',landed=%s,num_of_iframs=%s where attachment_id=%s"""%(options.attachment_table,txt_location,landing_url,1,num_of_iframs,attachment_id))
                except:
                    traceback.print_exc()
                    logging.exception('Got exception on main handler')
                    cursor.execute("""update %s set landing_url='%s', landed=%s where attachment_id=%s"""%(options.attachment_table,landing_url,-1,attachment_id))    
                    pass
                    #data={
                    #"txt_location":txt_location,
                    #"landing_url":landing_url,
                    #"num_of_iframs":num_of_iframs,
                    #"landed":1
                    #}
                    #db.update(options.attachment_table,data,"url='%s'"%url)
    except:
        traceback.print_exc()
Example #2
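# Walks every TED topic page with Selenium, follows the pagination links, and hands
# each result page to crawl_data() for extraction.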
def Main():
    parser = OptionParser()
    parser.add_option("--crawl", dest="crawl", action="store_true", help="crawllist", default=False)
    parser.add_option("--working-dir", dest="workingdir", type="string", help="working directory", default='.')
    parser.add_option("--db-name", dest="db_name", type="string", help="database name", default='ted')
    parser.add_option("--topic-table-name", dest="topic_table_name", type="string", help="topic table name", default='ted_topics')
    parser.add_option("--ted-table-name", dest="ted_table_name", type="string", help="ted table name", default='ted')
    parser.add_option("--skip", dest="skip", type=int, help="integer value", default=0)
    parser.add_option("--use-firefox", dest="use_firefox", action="store_true", help="use-firefox", default=True)
    (options, args) = parser.parse_args()
    
    workingdir = options.workingdir.rstrip('/')
    
    if not os.path.exists(workingdir):
        parser.error("workingdir not exists")
    
    try:
        display = None
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1000,900))
        display.start()
    except:
        print 'No Xvfb!'
    
    db = mysql.DB(db=options.db_name)
    driver = crawlutils.open_driver(use_firefox=options.use_firefox)
    
    try:
        if options.crawl:
    
            topics = db.query("""select * from %s""" %(options.topic_table_name))
    
            print len(topics), 'Topics to be crawled yet'
            db.set_autocommit(True)
            count=0
            for (topic,topic_url,) in topics:
                count+=1
                print 'topic:', topic
                print 'topic_url:', topic_url
                print 'topic count :',count
                driver.get(topic_url)
                time.sleep(3)
                pagination=driver.find_elements_by_class_name("pagination")
                number=0
                if pagination:
                    atag=pagination[0].find_elements_by_tag_name("a")
                    page_numbers=int(atag[-2].text.encode("utf-8"))
                    print "Page numbers ",page_numbers
                    for page_number in range(page_numbers):
                        number+=1
                        url="https://www.ted.com/talks?page=%s&sort=newest&topics[]=%s"%(str(page_number+1),topic)
                        url=url.replace(" ","+")
                        print "Page url :",url
                        print "page number :",number
                        driver.get(url)
                        time.sleep(3)
                        crawl_data(driver,options,db,topic)
                else:
                    print "Paginator not found"
                    crawl_data(driver,options,db,topic)

    except:
        traceback.print_exc()
        if driver:
            driver.save_screenshot(workingdir + '/error.png')
            print workingdir + '/error.png'
    
    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()
Example #3
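# Two-phase Open Textbook Library crawler: --crawl-textbook collects the textbook list
# per subject, --crawl-textbook-details visits each textbook page and stores its
# attachments, table of contents and publication metadata.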
def Main():
    driver=None
    parser = OptionParser()
    parser.add_option("--crawl-textbook", dest="crawl_textbook", action="store_true", help="crawl textbook", default=False)
    parser.add_option("--crawl-textbook-details", dest="crawl_textbook_details", action="store_true", help="crawl textbook details", default=False)

    parser.add_option("--working-dir", dest="workingdir", type="string", help="working directory", default='.')
    parser.add_option("--db-name", dest="db_name", type="string", help="database name", default='opentextbooks')
    parser.add_option("--subject-table-name", dest="subject_table_name", type="string", help="subject table name", default='subject')
    parser.add_option("--textbook-table-name", dest="textbook_table_name", type="string", help="textbook table name", default='opentextbooks')
    parser.add_option("--attachment-table-name", dest="attachment_table_name", type="string", help="attachment table name", default='attachments')
    parser.add_option("--toc-table-name", dest="toc_table_name", type="string", help="toc table name", default='table_of_content')

    parser.add_option("--skip", dest="skip", type=int, help="integer value", default=0)
    parser.add_option("--use-firefox", dest="use_firefox", action="store_true", help="use-firefox", default=True)
    (options, args) = parser.parse_args()
    
    workingdir = options.workingdir.rstrip('/')
    
    if not os.path.exists(workingdir):
        parser.error("workingdir not exists")
    
    try:
        display = None
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1000,900))
        display.start()
    except:
        print 'No Xvfb!'
    
    db = mysql.DB(db=options.db_name)
    if options.use_firefox:
        driver = crawlutils.open_driver(use_firefox=options.use_firefox)
    
    try:
        if options.crawl_textbook:
            subjects = db.query("""select * from %s""" %(options.subject_table_name))
            print len(subjects), 'subjects to be crawled yet'
            db.set_autocommit(True)
            count=0
            for (url,subject_title,) in subjects:
                count+=1
                print 'subject title:', subject_title
                print 'url:', url
                print 'subject count :',count
                driver.get(url)
                time.sleep(3)
                ShortDescription=driver.find_elements_by_class_name("ShortDescription")
                for short in ShortDescription:
                   thrid=short.find_element_by_class_name("thrid")
                   img_url=thrid.find_element_by_tag_name("img").get_attribute("src")
                   h2=short.find_element_by_tag_name("h2")
                   textbook_title=h2.text.strip()
                   textbook_url=h2.find_element_by_tag_name("a").get_attribute("href")
                   m = hashlib.md5()
                   m.update(textbook_title+textbook_url)
                   document_id=m.hexdigest()
                   string=short.find_element_by_tag_name("p").text
                   l=[]
                   if "\n" in string:
                       # one author per line: keep the full string and join the part of each line before the first comma
                       authors=string.replace("\n",", ")
                       for a in string.split("\n"):
                           l.append(a.split(",")[0])
                       author=','.join(l)
                   elif "," in string:
                       authors=string
                       author=string.split(",")[0]
                   else:
                       authors=string
                       author=string
             
                   print 'textbook_url',textbook_url
                   print 'subject_title',subject_title
                   print 'url',url
                   print 'author',author
                   print 'authors',authors
                   print 'document_id',document_id
                   print 'img_url',img_url
                   data = {
                    'textbook_title':textbook_title,
                    'textbook_url':textbook_url,
                    'subject_title':subject_title,
                    'url':url,
                    'author':author,
                    'authors':authors,
                    'document_id':document_id,
                    'img_url':img_url
                    }
                   db.insert(options.textbook_table_name, data)
                   print "db inserted"
        if options.crawl_textbook_details:
            textbook = db.query("""select document_id,textbook_url from %s where crawled=0""" %(options.textbook_table_name))
            print len(textbook), 'textbook to be crawled yet'
            db.set_autocommit(True)
            count=0
            for (document_id,textbook_url,) in textbook:
                count+=1
                print 'textbook_url:', textbook_url
                print 'document_id:', document_id
                print 'subject count :',count
                driver.get(textbook_url)
                time.sleep(3)

                third=driver.find_element_by_class_name("twothird")
                para= third.find_elements_by_tag_name("p")
                # defaults in case the page omits these paragraphs
                pub_date=None
                isbn_13_string=None
                isbn_13=None
                for p in para:
                    para_text=p.text
                    if para_text.startswith("Pub Date:"):
                        pub_date=para_text.replace("Pub Date:","")
                        if pub_date:
                            pub_date=pub_date.strip()
                        else:
                            pub_date=None
                    elif para_text.startswith("ISBN 13:")
                        isbn_13_string=para_text.replace("ISBN 13:","")
                        if isbn_13_string:
                            isbn_13_string=isbn_13_string.strip()
                            isbn_13=isbn_13_string.replace("-","")
                        else:
                            isbn_13_string=None
                            isbn_13=None
                BookTypes=driver.find_element_by_class_name("BookTypes")
                books=BookTypes.find_elements_by_tag_name("a")
                for book in books:
                    attachment_link=book.get_attribute("href")
                    type=book.text.strip()
                    print "attachment_link",attachment_link
                    print "type",type
                    data={
                    "document_id":document_id,
                    "attachment_link":attachment_link,
                    "type":type
                    }
                    db.insert(options.attachment_table_name, data)
                    print "toc table  inserted"
                Badge=driver.find_element_by_class_name("Badge-Condition")
                conditions_text=Badge.text
                condition_link=Badge.find_element_by_tag_name("a").get_attribute("href")
                toc=driver.find_element_by_id("TOC")
                table_of_content=toc.text  # plain-text dump of the TOC element
                list_tags=toc.find_elements_by_tag_name("li")
                for li_tag in list_tags:
                    chapter=li_tag.text.strip()
                    if chapter.startswith("Chapter"):
                        chapter_type="Chapter"
                    elif chapter.startswith("Part"):
                        chapter_type="Part"
                    else:
                        chapter_type=None
                    print "title",chapter
                    print "type",chapter_type

                    data={
                    'document_id':document_id,
                    'title':chapter,
                    'type': chapter_type
                    }
                    db.insert(options.toc_table_name, data)
                    print "toc table  inserted"
                AboutBook = driver.find_element_by_id("AboutBook")
                description = AboutBook.text
                links=AboutBook.find_elements_by_tag_name("a")
                for link in links:
                    href = link.get_attribute("href")
                    print "link in books",href
                    data={
                    "document_id":document_id
                    "link":href
                    }
                    db.insert("books", data)
                    print "toc table  inserted"
                AboutAuthors = driver.find_element_by_id("AboutAuthors")
                author_details = AboutAuthors.text
                print 'pub_date',pub_date,
                print 'isbn_13_string',isbn_13_string,
                print 'isbn_13',isbn_13,
                print 'conditions_text',conditions_text,
                print 'condition_link', condition_link,
                print 'table_of_content',table_of_content,
                print 'description',description,
                print 'author_details',author_details
                data = {
                'pub_date':pub_date,
                'isbn_13_string':isbn_13_string,
                'isbn_13':isbn_13,
                'conditions_text': conditions_text,
                'condition_link': condition_link,
                'table_of_content': table_of_content,
                'description' : description,
                'author_details':author_details,
                'crawled':1
                }
                db.update(options.textbook_table_name, data, "document_id='%s'" %document_id)

    except:
        traceback.print_exc()
        if driver:
            driver.save_screenshot(workingdir + '/error.png')
            print workingdir + '/error.png'
    
    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()
Example #4
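# Selenium variant of Example #1: loads each attachment URL in a browser, records pages
# that embed iframes separately, and saves the visible body text otherwise.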
def Main():
    parser = OptionParser()
    parser.add_option("--crawl-text", dest="crawl_text", action="store_true", help="crawl text", default=False)
    parser.add_option("--working-dir", dest="workingdir", type="string", help="working directory", default='.')
    parser.add_option("--db-name", dest="db_name", type="string", help="database name", default='orangegrove')   
    parser.add_option("--attachment-table-name", dest="attachment_table", type="string", help="attachment table name", default='attachments')
    parser.add_option("--use-firefox", dest="use_firefox", action="store_true", help="use-firefox", default=True)
   
    (options, args) = parser.parse_args()    
    workingdir = options.workingdir.rstrip('/')
    num_of_iframs=0
    if not os.path.exists(workingdir):
        parser.error("workingdir not exists")
    db = mysql.DB(db=options.db_name)
    db.set_autocommit(True)
    cursor=db.get_cursor()
    try:
        display = None
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1000,900))
        display.start()
    except:
        print 'No Xvfb!'

    driver = crawlutils.open_driver(use_firefox=options.use_firefox)
    try:
        if options.crawl_text:
            count=0
            attachments=db.query("select distinct url,attachment_id from %s where file_type = '%s' and landed=0 and url is not NULL"%(options.attachment_table,"URL Link"))
            print "Number of urls to crawl ",len(attachments)
            for (url,attachment_id,) in attachments:
                try:
                    count+=1
                    print "source url :",url
                    print "attachment_id :",attachment_id
                    print "count %s"%count
                    if "pdf" in url:
                        raise Exception(url)
                    driver.get(url)
                    iframes=driver.find_elements_by_tag_name("iframe")
                    body=driver.find_element_by_tag_name("body")
                    landing_url=driver.current_url
                    if "pdf" in landing_url:
                        raise Exception(landing_url)
                
                    visible_text=body.text
                    if iframes:
                        num_of_iframs=len(iframes)
                        print "landing_url :",landing_url
                        print"landed :",2
                        print"num_of_iframs :",num_of_iframs
                        #data={
                        #"landing_url":landing_url,
                        #"landed":2,
                        #"num_of_iframs":num_of_iframs
                        #}
                        
                        cursor.execute("""update %s set landing_url='%s',landed=%s,num_of_iframs=%s  where url='%s'"""%(options.attachment_table,landing_url,2,num_of_iframs,url))
                        #db.update(options.attachment_table,data,"url='%s'"%url)
                    else:
                        txt_location="/mnt/data/kendavar/orangegrove/textfile/%s.html"%attachment_id
                        f=codecs.open(txt_location,"w","utf-8")
                        f.write(visible_text)
                        f.close()
                        print "txt_location :",txt_location
                        print "landing_url :",landing_url
                        print "num_of_iframs :",num_of_iframs
                        print "landing :",1
                        cursor.execute("""update %s set txt_location='%s',landing_url='%s',landed=%s,num_of_iframs=%s where url='%s'"""%(options.attachment_table,txt_location,landing_url,1,num_of_iframs,url))
                except:
                    traceback.print_exc()
                    logging.exception('Got exception on main handler')
                    cursor.execute("""update %s set landed=%s where url='%s'"""%(options.attachment_table,-1,url))    
                    pass


    except:
        traceback.print_exc()
        if driver:
            driver.save_screenshot(workingdir + '/error.png')
            print workingdir + '/error.png'
    
    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()
Example #5
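# Fragment: the tail of a Main() wrapper plus a standalone script that revisits crawled
# USGS links with Chrome, saves a resized screenshot per document, and records the
# screenshot and text-file locations.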
    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()


if __name__ == "__main__":
    Main()

from src import mysql
from src import crawlutils
import shutil
import time
from selenium import webdriver

db = mysql.DB("usgs")
links = db.query("""select document_id,link from %s where crawled=2""" %("usgs"))
driver=webdriver.Chrome()
driver.set_window_size(1000,900)
for (document_id,link,) in links:
    # load the page and give it a moment to render before the screenshot
    driver.get(link)
    time.sleep(3)
    txt_location="/mnt/data/kendavar/usgs/txtfiles/%s_txt"%document_id
    filename="/mnt/data/kendavar/usgs/screenshots/%s.png"%document_id
    driver.save_screenshot(filename)
    crawlutils.resize_png_image(filename)
    img_location="/mnt/data/kendavar/usgs/screenshot_png/%s_png"%document_id
    shutil.copyfile(filename,img_location)
    data = {
    'screenshot':img_location,
    'txt_location':txt_location,
    'crawled':1
    }
    db.update("usgs", data, "document_id='%s'" % document_id)
driver.quit()
Example #6
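# Crawls each Colorado subject page's simulation links, recording title, image URL and
# format badge (html5 / java applet / shockwave flash) into MySQL.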
def Main():
    parser = OptionParser()
    parser.add_option("--crawl-textbooks",
                      dest="crawl_textbooks",
                      action="store_true",
                      help="crawl textbooks list",
                      default=False)
    parser.add_option("--working-dir",
                      dest="workingdir",
                      type="string",
                      help="working directory",
                      default='.')
    parser.add_option("--db-name",
                      dest="db_name",
                      type="string",
                      help="database name",
                      default='colorado_1012')
    parser.add_option("--subject2-table-name",
                      dest="subject2_table_name",
                      type="string",
                      help="subject2 table name",
                      default='colorado_subject2')
    parser.add_option("--textbook-table-name",
                      dest="textbook_table_name",
                      type="string",
                      help="textbook table name",
                      default='colorado_textbook')
    parser.add_option("--skip",
                      dest="skip",
                      type=int,
                      help="integer value",
                      default=0)
    parser.add_option("--use-firefox",
                      dest="use_firefox",
                      action="store_true",
                      help="use-firefox",
                      default=True)
    parser.add_option("--db-port",
                      dest="db_port",
                      type=int,
                      help="database port",
                      default=3306)
    (options, args) = parser.parse_args()

    workingdir = options.workingdir.rstrip('/')

    if not os.path.exists(workingdir):
        parser.error("workingdir not exists")

    try:
        display = None
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1000, 900))
        display.start()
    except:
        print 'No Xvfb!'

    db = mysql.DB(db=options.db_name, port=options.db_port)
    driver = crawlutils.open_driver(use_firefox=options.use_firefox)

    try:
        if options.crawl_textbooks:

            subject2 = db.query("""select * from %s""" %
                                (options.subject2_table_name))

            print len(subject2), 'Textbook to be crawled yet'
            db.set_autocommit(True)

            for (subject1_title, subject2_title, subject_url) in subject2:
                print 'subject1_title:', subject1_title
                print 'subject2_title:', subject2_title
                print 'subject_url:', subject_url
                driver.get(subject_url)
                time.sleep(3)
                simulation_link = driver.find_elements_by_class_name(
                    "simulation-link")

                for link in simulation_link:
                    file_format = None
                    textbook_url = link.get_attribute("href")
                    textbook_image_url = link.find_element_by_tag_name(
                        "img").get_attribute("src")
                    textbook_title = link.find_element_by_tag_name(
                        "strong").text
                    span = link.find_elements_by_tag_name('span')
                    badge = span[1].get_attribute("class")
                    if "html" in badge:
                        file_format = "html5"
                    if "java" in badge:
                        file_format = "java applet"
                    if "flash" in badge:
                        file_format = "shockwave flash"
                    print "textbook_title :", textbook_title
                    print "textbook_url :", textbook_url
                    print "textbook_image_url :", textbook_image_url
                    print "file_format :", file_format
                    raise Exception("done")

                    data = {
                        'subject1_title': subject1_title,
                        'subject2_title': subject2_title,
                        'textbook_title': textbook_title,
                        'textbook_url': textbook_url,
                        'textbook_image_url': textbook_image_url,
                        'format': file_format
                    }
                    db.insert(options.textbook_table_name, data)

    except:
        traceback.print_exc()
        if driver:
            driver.save_screenshot(workingdir + '/error.png')
            print workingdir + '/error.png'

    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()
Example #7
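# USGS education crawler: --crawl parses the undergraduate resources page with
# BeautifulSoup and saves links per subject; --details downloads each link with
# crawl_documents(), takes a screenshot, and marks it crawled.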
def Main():
    driver = None
    parser = OptionParser()
    parser.add_option("--crawl",
                      dest="crawl",
                      action="store_true",
                      help="crawl textbook",
                      default=False)
    parser.add_option("--details",
                      dest="details",
                      action="store_true",
                      help="crawl textbook details",
                      default=False)

    parser.add_option("--working-dir",
                      dest="workingdir",
                      type="string",
                      help="working directory",
                      default='.')
    parser.add_option("--db-name",
                      dest="db_name",
                      type="string",
                      help="database name",
                      default='usgs')

    parser.add_option("--table-name",
                      dest="table_name",
                      type="string",
                      help="textbook table name",
                      default='usgs')
    parser.add_option("--skip",
                      dest="skip",
                      type=int,
                      help="integer value",
                      default=0)
    parser.add_option("--use-firefox",
                      dest="use_firefox",
                      action="store_true",
                      help="use-firefox",
                      default=True)
    (options, args) = parser.parse_args()

    workingdir = options.workingdir.rstrip('/')

    if not os.path.exists(workingdir):
        parser.error("workingdir not exists")

    try:
        display = None
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1000, 900))
        display.start()
    except:
        print 'No Xvfb!'

    db = mysql.DB(db=options.db_name)
    #if options.use_firefox:
    #    driver = crawlutils.open_driver(use_firefox=options.use_firefox)

    try:
        if options.crawl:
            res = requests.get("http://education.usgs.gov/undergraduate.html")
            soup = bs(res.text)
            td = soup.findAll("td")
            subject1 = None
            subject2 = None
            for i, tag in enumerate(td):
                if i >= 5:
                    if tag.find("h2"):
                        subject1 = tag.text.strip()
                    elif len(tag.findAll("hr")) == 2:
                        if tag.find("div"):
                            tag.div.extract()
                        subject2 = tag.text.strip()
                        print subject2
                    elif tag.find("a"):
                        if tag.find("li"):
                            if not tag.find("strong"):
                                tag.a.extract()
                                description = tag.text.strip()
                            for list1 in tag.findAll("li"):
                                save(list1, subject1, subject2)

                        else:
                            save(tag, subject1, subject2)

        if options.details:
            links = db.query(
                """select document_id,link from %s where crawled=0""" %
                (options.table_name))
            print len(links), 'links to be crawled yet'
            db.set_autocommit(True)
            count = 0
            for (
                    document_id,
                    link,
            ) in links:
                count += 1
                print 'link:', link
                print 'document_id:', document_id
                print 'link count :', count
                documents = (document_id, link)
                txt_location, driver = crawl_documents(
                    documents, '/mnt/data/kendavar/usgs')
                driver.set_window_size(1000, 900)
                filename = "/mnt/data/kendavar/usgs/screenshots/%s.png" % document_id
                driver.save_screenshot(filename)
                crawlutils.resize_png_image(filename)
                img_location = "/mnt/data/kendavar/usgs/screenshot_png/%s_png" % document_id
                shutil.copyfile(filename, img_location)
                data = {
                    'screenshot': img_location,
                    'txt_location': txt_location,
                    'crawled': 1
                }
                db.update(options.table_name, data,
                          "document_id='%s'" % document_id)

    except:
        traceback.print_exc()
        if driver:
            driver.save_screenshot(workingdir + '/error.png')
            print workingdir + '/error.png'

    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()
Example #8
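# SkillsCommons crawler: --crawl paginates the discovery listing into the skill table;
# --crawl-landing visits each record page and stores its attachments and dt/dd metadata
# pairs in separate tables.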
def Main():
    parser = OptionParser()
    parser.add_option("--crawl", dest="crawl", action="store_true", help="crawl url", default=False)
    parser.add_option("--crawl-landing", dest="crawl_landing", action="store_true", help="crawl url", default=False)
    parser.add_option("--working-dir", dest="workingdir", type="string", help="working directory", default='.')
    parser.add_option("--db-name", dest="db_name", type="string", help="database name", default='skillscommons')
    parser.add_option("--table-name", dest="table_name", type="string", help="table name", default='skill')
   
    parser.add_option("--main-table-name", dest="main_table_name", type="string", help="main table name", default='skillscommons')
    parser.add_option("--attachment-table-name", dest="attachment_table_name", type="string", help="attachment table name", default='attachment')
    parser.add_option("--meta-table-name", dest="meta_table_name", type="string", help="meta table name", default='meta_data')
    parser.add_option("--use-firefox", dest="use_firefox", action="store_true", help="use-firefox", default=True)
   
    (options, args) = parser.parse_args()    
    workingdir = options.workingdir.rstrip('/')
    
    if not os.path.exists(workingdir):
        parser.error("workingdir not exists")
    
    try:
        display = None
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1000,900))
        display.start()
    except:
        print 'No Xvfb!'
    
    db = mysql.DB(db=options.db_name)
    db.set_autocommit(True)
    driver = crawlutils.open_driver(use_firefox=options.use_firefox)
    links=["https://www.skillscommons.org/discover?rpp=2000&page=1&group_by=none&etal=0",
    "https://www.skillscommons.org/discover?rpp=2000&page=2&group_by=none&etal=0",
    "https://www.skillscommons.org/discover?rpp=2000&page=3&group_by=none&etal=0"]
    try:
        if options.crawl:
            
            count = 0
            for link in links:
                print "Link :",link
            
                driver.get(link)
                time.sleep(5)
                medium_results=driver.find_element_by_class_name("medium-results")
                li=medium_results.find_elements_by_tag_name("li")
                for tag in li:
                    count+=1
                    print "Count :",count
                    link_tag=tag.find_element_by_tag_name("a")
                    title=link_tag.text.strip()
                    url=link_tag.get_attribute("href")
                    types=tag.find_elements_by_class_name("type")
                    if len(types)==2:
                        type=types[0].text.strip()
                        institution=types[1].text.strip()
                    else:
                        type=None
                        institution=types[0].text.strip()
                    description=tag.find_element_by_class_name("abstract").text.strip()
                    print "title :", title
                    print "url :",url
                    print "type :",type
                    print "institution :",institution
                    print "description :",description
            
                    data = {
                    'title':title,
                    'institution':institution,
                    'url':url,
                    'type':type,
                    'description':description,
                    }
                    db.insert(options.table_name, data)                      
               

        if options.crawl_landing:
            count=0
            skill=db.query("select distinct url from skill where crawled=0")
            print "Number of urls to crawl ",len(skill)
            for (src_url,) in skill:
                print "source url :",src_url
                print "count %s"%count
                count+=1
                driver.get(src_url)
                author=None
                col=driver.find_element_by_class_name("col-sm-8")
                title=col.find_element_by_tag_name("h1").text.strip()
                m = hashlib.md5()
                m.update(title+src_url)
                document_id=m.hexdigest()
                toc_html="/mnt/data/kendavar/skillscommons/%s.html"%document_id
                file(toc_html,"w","utf8").write(driver.page_source)
                authors=col.find_element_by_class_name("authors")
                if not authors.find_elements_by_tag_name("div"):
                    author=authors.text.strip()
                description=col.find_element_by_class_name("abstract").text
                files=col.find_element_by_class_name("files")
                file_information=files.find_elements_by_class_name("file-information")
                attachment=[]
                for attach in file_information:
                    attachment.append((attach.text.strip(),attach.find_element_by_tag_name("a").get_attribute("href")))
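                # Collect metadata from each <dl>: the <dt> label keys either a string of
                # comma-joined <li> text, or a [text, href, ...] list when anchors are present.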
                dls=col.find_elements_by_tag_name("dl")
                meta={}
                string=''
                for dl in dls:
                    for div in dl.find_elements_by_tag_name("div"):
                        string=''
                        dd=div.find_element_by_tag_name("dd")
                        if dd.find_elements_by_tag_name("li"):
                            for li in dd.find_elements_by_tag_name("li"):
                                string=string+li.text.strip()+","
                        elif dd.find_elements_by_tag_name("a"):
                            string=[dd.text.strip()]
                            anchors=[]
                            for anchor in dd.find_elements_by_tag_name("a"):
                                if anchor.get_attribute("href") not in anchors:
                                    anchors.append(anchor.get_attribute("href"))
                                    string.append(anchor.get_attribute("href"))
                        else:
                            string=dd.text.strip()
                        meta[div.find_element_by_tag_name("dt").text.replace(":","").strip()]=string
                print "title :",title
                print "author :",author
                print "description :",description
                print "toc_path",toc_html
                data={
                "document_id":document_id,
                "title":title,
                "author":author,
                "description":description,
                "toc_path":toc_html
                }
                db.insert(options.main_table_name, data) 
                for (attachment_title,attachment_url) in attachment:
                      print "document_id":document_id,
                      print "attachment_title":attachment_title,
                      print "attachment_url":attachment_url
                      data={
                      "document_id":document_id,
                      "attachment_title":attachment_title,
                      "attachment_url":attachment_url
                      }
                      db.insert(options.attachment_table_name, data) 
                for key,value in meta.iteritems():
                      if value[-1]==",":
                          value=value[:-1]
                      print '%s : %s'%(key,value)

                      if type(value) is list:
                          meta_value=None
                          meta_url=None
                          # even positions are treated as values, odd positions as urls
                          for i,val in enumerate(value):
                              meta_title=key
                              if i%2==0:
                                  meta_value=val
                              else:
                                  meta_url=val
                              print "meta_title :",meta_title
                              print "meta_value :",meta_value
                              print "meta_url :",meta_url
                              data={
                              "document_id":document_id,
                              "meta_title":meta_title,
                              "meta_value":meta_value,
                              "meta_url":meta_url
                              }
                              db.insert(options.meta_table_name, data)
                      else:
                          meta_title=key
                          meta_url=None
                          meta_value=value
                          print "meta_title :",meta_title
                          print "meta_value :",meta_value
                          print "meta_url :",meta_url
                          data={
                          "document_id":document_id,
                          "meta_title":meta_title,
                          "meta_value":meta_value,
                          "meta_url":meta_url
                          }
                          db.insert(options.meta_table_name, data)
                data={
                "crawled":1
                }
                db.update(options.table_name,data,"url='%s'"%src_url)
                print "updated the table"

    except:
        traceback.print_exc()
        if driver:
            driver.save_screenshot(workingdir + '/error.png')
            print workingdir + '/error.png'
    
    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()