Exemple #1
0
def patent_parser(search_exp):
    """@todo: Docstring for patent_parser.
    """
    patent_list = []
    b = Browser("phantomjs")
    b.reload()
    b.visit(
        'http://www.pss-system.gov.cn/sipopublicsearch/search/searchHome-searchIndex.shtml'
    )
    b.fill('searchInfo', search_exp)
    b.click_link_by_text(u'检索')
    b.is_element_not_present_by_css('.s_c_conter', wait_time=8)
    for _ in xrange(10):
        item_list = b.find_by_css('.s_c_conter')
        for item in item_list:
            info_list = item.find_by_tag('td')
            if not urlset.has_url('patent', info_list[0].text[6:]):
                try:
                    patent = Patent(
                        id=info_list[0].text[6:],
                        path='~',
                        title=info_list[4].text[6:],
                        abstract='~',
                        inventor=info_list[7].text[5:].split(';')[:-1],
                        applicant=info_list[6].text[10:].split(';')[:-1],
                        category=info_list[5].text[8:].split('; '),
                        update_time=time.strftime('%Y-%m-%dT%XZ',
                                                  time.gmtime()))
                    patent_list.append(patent)
                    print patent.id, 'new'  # @todo logs
                except:
                    print 'error patent'
        if b.is_text_present(u'下一页'):
            b.click_link_by_text(u'下一页')
            b.is_element_not_present_by_css('.s_c_conter', wait_time=8)
        else:
            break
    try:
        solr.add('patent', patent_list)
    except:
        'err adding patent'
    finally:
        b.quit()
Exemple #2
0
def patent_parser(search_exp):
    """@todo: Docstring for patent_parser.
    """
    patent_list = []
    b = Browser("phantomjs")
    b.reload()
    b.visit('http://www.pss-system.gov.cn/sipopublicsearch/search/searchHome-searchIndex.shtml')
    b.fill('searchInfo', search_exp)
    b.click_link_by_text(u'检索')
    b.is_element_not_present_by_css('.s_c_conter', wait_time=8)
    for _ in xrange(10):
        item_list = b.find_by_css('.s_c_conter')
        for item in item_list:
            info_list = item.find_by_tag('td')
            if not urlset.has_url('patent', info_list[0].text[6:]):
                try:
                    patent = Patent(id=info_list[0].text[6:],
                                    path='~',
                                    title=info_list[4].text[6:],
                                    abstract='~',
                                    inventor=info_list[7].text[5:].split(';')[:-1],
                                    applicant=info_list[6].text[10:].split(';')[:-1],
                                    category=info_list[5].text[8:].split('; '),
                                    update_time=time.strftime('%Y-%m-%dT%XZ', time.gmtime()))
                    patent_list.append(patent)
                    print patent.id, 'new'    # @todo logs
                except:
                    print 'error patent'
        if b.is_text_present(u'下一页'):
            b.click_link_by_text(u'下一页')
            b.is_element_not_present_by_css('.s_c_conter', wait_time=8)
        else:
            break
    try:
        solr.add('patent', patent_list)
    except:
        'err adding patent'
    finally:
        b.quit()
Exemple #3
0
def crawl(usr, pswd, out_path, driver="firefox"):
    bsr = Browser(driver)
    bsr.visit(URL_LOGIN)
    bsr.find_by_id("phone1").fill(usr)
    bsr.find_by_id("pswd").fill(pswd)
    bsr.find_by_id("login").click()
    if bsr.is_element_present_by_css("span.fw1.fs0.acnt"):
        print "Successfully login!"
    else:
        print "Login failed, bye!"
    bsr.visit("http://123.163.com/webmail/main/#mid=7")
    while bsr.is_element_not_present_by_css("div.list-time"):
        print "sleeping"
        time.sleep(1)
    bsr.find_by_css("span.iblock.icn-msg.list-icon.potr")[0].click()
    page_num = get_page_num(bsr)
    with open(out_path, "w") as out_f:
        for pi in xrange(page_num):
            print "Page %d/%d" % (pi+1, page_num)
            date_lst = bsr.find_by_css("div.list-time")
            date_msgs_lst = bsr.find_by_css("div.sms-item")
            #HACK for scrolling the sms list down because of AJAX-style of showing sms
            date_lst[-1].right_click()
            msg_i = 0
            for di in xrange(len(date_lst)):
                date = date_lst[di].text.strip().split()[0]
                msg_num_mat = re.findall(r"\(\s*(\d+).\s*\)", date_lst[di].text)
                msg_num = int(msg_num_mat[0])
                out_f.write("%s\t%d\n" % (date, msg_num))
                for _ in range(msg_num):
                    name_obj = date_msgs_lst[msg_i].find_by_css("span.js-cnt.name")[0]
                    phone_obj = date_msgs_lst[msg_i].find_by_css("span.js-cnt.fc2")[0]
                    time_obj = date_msgs_lst[msg_i].find_by_css("div.fr.w6.js-cnt.bm-hack-w6")[0]
                    msg_obj = date_msgs_lst[msg_i].find_by_css("div.w4")[0]
                    type_obj = date_msgs_lst[msg_i].find_by_css("div.fl.w3.thide.fc5")[0]
                    out_f.write("%s\t%s\t%s\t%s\t%s\n" % (name_obj.html.encode("utf8"), \
                                                            phone_obj.html.strip("() ").encode("utf8"), \
                                                            time_obj.text.encode("utf8"), \
                                                            "0" if type_obj.visible else "1", \
                                                            msg_obj.text.encode("utf8")))
                    msg_i += 1
            #next page
            next_page_link = bsr.find_by_css("div.fr.pager")[0].find_by_tag("a")[2]
            next_page_link.click()
Exemple #4
0
def scrape():
    """Collect Mars news, imagery, weather, facts and hemisphere images.

    Drives a Chrome browser (chromedriver at /usr/local/bin/chromedriver)
    over several public sites and parses the rendered pages.

    Returns:
        dict with the keys ``news_title``, ``news_p``,
        ``featured_image_url``, ``mars_weather``, ``table`` (HTML string of
        the facts table) and ``hemisphere`` (list of
        ``{"title": ..., "img_url": ...}`` dicts).

    The browser is closed on every exit path, including when a page does not
    contain the expected elements and parsing raises.
    """
    # Dictionary that the caller can insert into mongo.
    mars_data = {}

    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)
    try:
        # Mars News
        browser.visit("https://mars.nasa.gov/news/")
        soup = BeautifulSoup(browser.html, 'html.parser')
        mars_data["news_title"] = soup.find("div", class_="content_title").text
        mars_data["news_p"] = soup.find("div", class_="article_teaser_body").text

        # JPL Mars Space Images - Featured Image
        browser.visit("https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars")
        browser.find_by_xpath('//*[@id="full_image"]').click()
        # Wait for the lightbox image to APPEAR.  The "not present" variant
        # returns at once while the image is still missing, i.e. it does not
        # wait for the load at all.
        browser.is_element_present_by_css("img.fancybox-image", wait_time=1)
        soup = BeautifulSoup(browser.html, 'html.parser')
        featured_image_url = soup.find("img", class_="fancybox-image")["src"]
        # Prefix site-relative paths only; an absolute https:// URL must not
        # get the host glued in front of it.
        if not featured_image_url.startswith(("http://", "https://")):
            featured_image_url = "https://www.jpl.nasa.gov" + featured_image_url
        mars_data["featured_image_url"] = featured_image_url

        # Mars Weather
        browser.visit("https://twitter.com/marswxreport?lang=en")
        soup = BeautifulSoup(browser.html, 'html.parser')
        mars_data["mars_weather"] = soup.find(
            "p", class_="TweetTextSize", text=re.compile("Sol")).text

        # Mars Facts
        df = pd.read_html("https://space-facts.com/mars/")[0]
        df.columns = ["description", "value"]
        df.set_index("description", inplace=True)
        mars_data["table"] = df.to_html()

        # Mars Hemispheres: open each of the four thumbnails in turn.
        hemisphere_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
        thumbnail_xpaths = [
            '//*[@id="product-section"]/div[2]/div[{}]/a/img'.format(position)
            for position in range(1, 5)
        ]
        hemisphere_image_urls = []
        browser.visit(hemisphere_url)
        for xpath in thumbnail_xpaths:
            browser.find_by_xpath(xpath).click()
            # Wait for the container that is parsed right below.
            browser.is_element_present_by_css("div.wide-image-wrapper", wait_time=1)
            soup = BeautifulSoup(browser.html, 'html.parser')
            title = soup.find('h2', class_="title").text
            img_url = soup.find('div', class_="wide-image-wrapper").ul.li.a['href']
            hemisphere_image_urls.append({"title": title, "img_url": img_url})
            # Back to the result list for the next thumbnail.
            browser.visit(hemisphere_url)
        mars_data["hemisphere"] = hemisphere_image_urls

        return mars_data
    finally:
        browser.quit()
Exemple #5
0
class Session:
    """Browser session logged in to jizdenky.studentagency.cz.

    Wraps a splinter ``Browser`` and drives the site's search and booking
    pages: ``search`` lists connections, ``order_time`` opens one and
    returns its selectable seats, ``order_seat`` reserves a seat.
    """

    def __init__(self, browser, user):
        """Start ``browser`` (a splinter driver name) and log ``user`` in.

        ``user`` is a mapping.  ``login`` and ``password`` are read here;
        ``first``, ``last``, ``email`` and ``phone`` are read later by
        ``order_seat``.
        """
        self.browser = Browser(browser)
        self.browser.visit('http://jizdenky.studentagency.cz/')
        self.browser.fill_form({'passwordAccountCode': user['login'],
                                'password': user['password']})
        self.browser.execute_script('window.scrollTo(0, 100)')
        button = self.browser.find_by_value('Přihlásit').first
        button.click()
        self.user = user
        self.log = logging.getLogger(__name__)

    def go_search(self):
        """Navigate back to the site's start page, where the search form is."""
        self.browser.visit('http://jizdenky.studentagency.cz/')

    def search(self, task, date_return=None, is_open=False):
        """Submit the search form for ``task`` and return its connections.

        Args:
            task: object providing ``from_city``, ``to_city`` and ``date``.
            date_return: optional return date; when given, the second
                itinerary radio button is chosen and the return-date field
                is filled.
            is_open: when true, the second radio button is chosen and
                ``returnTicketOpen`` is ticked.

        Returns:
            list of ``Connection`` objects, one per ``routeSummary`` entry
            found under a date heading equal to ``task.date``.  Collection
            stops at the first entry under a different date.

        Blocks, without a timeout, until the ``.left_column`` result element
        is present.
        """
        radios = self.browser.find_by_id('hp_form_itinerar').first \
            .find_by_xpath('div/input[@type="radio"]')
        radios[1 if date_return or is_open else 0].check()
        for city, i in [(task.from_city, 1), (task.to_city, 2)]:
            self.browser.find_by_css('input[tabindex="{}"]'.format(i)) \
                        .first.fill(city)
            # Pick the autocomplete suggestion that matches the typed city.
            for item in self.browser.find_by_css('.ui-menu-item'):
                link = item.find_by_tag('a')
                if link.value.lower() == city.lower():
                    link.click()
                    break
        self.browser.fill('departure:dateField', task.date)
        if date_return:
            self.browser.fill('returnDeparture:dateField', date_return)
        if is_open:
            self.browser.check('returnTicketOpen')
        self.browser.find_option_by_text('ISIC').first.check()
        self.browser.find_by_value('Vyhledat').first.click()
        while self.browser.is_element_not_present_by_css('.left_column',
                                                         wait_time=1):
            pass
        items = self.browser.find_by_css('.left_column') \
                            .find_by_xpath('div/div/*')
        connections = []
        # Bound up front so that a route listed before any date heading
        # trips the assert below instead of an UnboundLocalError.
        date_local = None
        for item in items:
            if item.tag_name == 'h2':
                # Second space-separated token of the heading is the date.
                date_local = item.text.split(' ')[1]
            elif item.tag_name == 'div' and item.has_class('routeSummary'):
                assert date_local
                if date_local != task.date:
                    break
                connections.append(Connection(item))
        return connections

    def order_time(self, connection):
        """Open ``connection`` and return the seats that can be picked.

        Clicks the connection repeatedly, dismissing any ``_wicket_window``
        dialog, until the ``sumary_lines`` element is present; then presses
        its button and reads the seat map.

        Returns:
            dict mapping seat number (int) to the seat's browser element.
        """
        while True:
            connection.click()

            dialog = self.browser.find_by_css('[id^=_wicket_window]')
            if dialog:
                dialog.first.find_by_tag('button').click()
            if self.browser.is_element_present_by_id('sumary_lines',
                                                     wait_time=1):
                break
        self.browser.find_by_id('sumary_lines') \
                    .first.find_by_tag('button') \
                    .first.click()
        seats = {}
        bus = self.browser.find_by_css('.seatsContainer')
        if bus:
            for seat in bus.first.find_by_css(
                    '.seatContainer:not([style*=blocked])'):
                # The seat label carries one trailing character after the
                # number; it is cut off before parsing.
                seats[int(seat.find_by_tag('div').first.html[:-1])] = seat
        else:
            # No '.seatsContainer' on the page: fall back to '.vehicle'.
            bus = self.browser.find_by_css('.vehicle')
            for seat in bus.first.find_by_css('.free, .selected'):
                seats[int(seat.text[:-1])] = seat
        return seats

    def order_seat(self, seat):
        """Reserve ``seat`` and, if configured, mail the ticket page.

        Selects the seat, chooses the 'nechci' option of every insurance
        package, fills the passenger form when the first submit button is
        not yet the 'Rezervovat' one, ticks the terms checkbox when present
        and submits.

        When ``conf.yaml`` has an ``email`` section, waits (without a
        timeout) for the ``ticketPage`` element and sends its HTML to
        ``self.user['email']`` over SMTP with STARTTLS.  The configured
        password is stored base64-encoded.
        """
        if not seat.has_class('selected'):
            seat.click()
        for fs in self.browser.find_by_css('fieldset.topRoute'):
            legend = fs.find_by_css('legend')
            if legend and 'Pojištění' in legend[0].text:
                for package in fs.find_by_css('.insurancePackageType'):
                    if 'nechci' in package.find_by_tag('label').text:
                        package.find_by_tag('input').click()
                        time.sleep(1)
        submit = self.browser.find_by_css('[name^=buttonContainer]').first
        if 'Rezervovat' not in submit.text:
            # Intermediate step: passenger details have to be entered first.
            submit.click()
            time.sleep(1)
            data = (self.user['first'],
                    self.user['last'],
                    self.user['email'],
                    self.user['phone'])
            inputs = self.browser.find_by_id('passengerInfo') \
                                 .first.find_by_tag('input')
            for item, value in zip(inputs, data):
                item.fill(value)
            submit = self.browser.find_by_css('[name^=buttonContainer]').first
            assert 'Rezervovat' in submit.text
        agreement = self.browser.find_by_css('[name="bottomComponent:termsAgreementCont:termsAgreementCB"]')
        if agreement:
            agreement[0].check()
        time.sleep(1)
        submit.click()
        with open('conf.yaml') as f:
            # safe_load: plain configuration data needs no arbitrary object
            # construction, and load() without a Loader is rejected by
            # current PyYAML releases.
            conf = yaml.safe_load(f)
        if 'email' in conf:
            email = conf['email']
            while self.browser.is_element_not_present_by_id('ticketPage', wait_time=1):
                pass
            msg = MIMEText(self.browser.find_by_id('ticketPage').first.html, 'html')
            msg['Subject'] = 'SA reservation'
            msg['From'] = email['from']
            msg['To'] = self.user['email']
            username = email['username']
            password = email['password']
            server = smtplib.SMTP(email['server'])
            try:
                server.starttls()
                server.login(username, b64decode(password).decode())
                server.sendmail(msg['From'], msg['To'], msg.as_string())
            finally:
                # Close the connection even when login or sending fails.
                server.quit()
class GPlusEventManager(object):
    """Create and update Google Plus events through a Firefox browser.

    The constructor starts the browser and logs in right away.
    ``complete_form`` closes the browser after submitting, so one instance
    serves a single ``create`` or ``update`` call.

    NOTE(review): ``login()`` is called but is not part of this excerpt.
    """

    def __init__(self, email, passwd, otp):
        """Store the credentials, start Firefox and log in.

        ``otp`` is kept on the instance for ``login()``; how it is used is
        not visible here.
        """
        self.email = email
        self.passwd = passwd
        self.br = Browser('firefox')
        # Make sure the browser process does not outlive the interpreter.
        atexit.register(self.force_br_quit)
        # To dynamically load jQuery into the HTML head
        self.loadjq = """var head = document.getElementsByTagName('head')[0];
           var script  = document.createElement('script');
           script.type = 'text/javascript';
           script.src  =
                '//ajax.googleapis.com/ajax/libs/jquery/1.10.1/jquery.min.js';
           head.appendChild(script);"""

        self.otp = otp
        self.logged_in = self.login()

    def force_br_quit(self):
        """Close the browser, ignoring errors (it may be closed already)."""
        try:
            self.br.quit()
        except Exception:
            pass

    def create(self, title, desc, date, time):
        """ Create a new Google Plus event and return its URL. """
        if not self.logged_in:
            self.logged_in = self.login()

        create_btn = 'div[guidedhelpid="events_create_event_button"]'
        self.br.find_by_css(create_btn)[0].click()

        return self.complete_form(title, desc, date, time, update=False)

    def update(self, id, title=None, desc=None, date=None, time=None):
        """ Update a Google Plus event and return its URL.

        ``id`` is passed to ``Browser.visit``, i.e. it is the event's URL.
        Fields left at ``None`` are not touched.
        """
        if not self.logged_in:
            self.logged_in = self.login()

        self.br.visit(id)

        dropdown = 'div[class="A7kfHd q3sPdd"]'
        # Busy-wait (no timeout) until the actions dropdown is present.
        while self.br.is_element_not_present_by_css(dropdown):
            pass
        self.br.find_by_css(dropdown).click()
        self.br.find_by_xpath('//*[@id=":o"]/div').click()

        return self.complete_form(title, desc, date, time, update=True)

    def complete_form(self, title, desc, date, time, update):
        '''Fill event create/edit form,
           the CSS selectors are valid in both types of form.

        Falsy ``title``/``desc``/``date``/``time`` values are skipped.  For a
        new event (``update`` false) the Public group is invited and the
        method waits until the text 'Going (' shows up.  Closes the browser
        and returns the URL the browser ended on.
        '''
        import json

        title_input = 'input[placeholder="Event title"]'
        # Busy-wait (no timeout) until the form has been rendered.
        while self.br.is_element_not_present_by_css(title_input):
            pass
        if title:
            self.br.find_by_css(title_input).fill(title)
        if date:
            date_field = 'input[class="g-A-G T4 lUa"]'
            self.br.find_by_css(date_field).click()
            # Clear the pre-filled value before typing the new date.
            rm_date = '''document.body.getElementsByClassName("g-A-G T4 lUa")
                         [0].value = ""'''
            self.br.execute_script(rm_date)
            self.br.find_by_css(date_field).type('{}\t'.format(date))
        if time:
            self.br.execute_script(self.loadjq)
            rm_time = '$(".EKa")[0].value = ""'
            # jQuery loads asynchronously: retry until `$` is usable.
            while True:
                try:
                    self.br.execute_script(rm_time)
                except Exception:
                    continue
                break

            time_field = 'input[class="g-A-G T4 EKa"]'
            self.br.find_by_css(time_field)[0].type('{}'.format(time))
        if desc:
            # json.dumps yields a quoted, escaped JavaScript string literal,
            # so quotes, backslashes and newlines in the description no
            # longer break the injected script.
            set_desc = '''document.body.getElementsByClassName("yd editable")
                         [1].innerHTML = {}'''.format(json.dumps(desc))
            self.br.execute_script(set_desc)

        invite_btn = self.br.find_by_css('div[guidedhelpid="sharebutton"]')
        invite_inp = self.br.find_by_css('input[class="i-j-h-G-G"]')

        invite_btn.click()
        if not update:  # If new entry, invite Public group by default
            invite_inp.click()
            invite_inp.type('Public\n')
            invite_btn.click()
            while not self.br.is_text_present('Going ('):
                pass  # wait on page load for new event

        url = self.br.url
        self.br.quit()
        return url  # return event url
Exemple #7
0
# Chat relay: reads the newest incoming message on a weibo conversation page
# and feeds it to sim_chat() to obtain the next question.
# NOTE(review): myClick_txt, browser, cursor, db, sql and sim_chat are defined
# outside this excerpt; presumably myClick_txt clicks the element carrying the
# given text -- confirm.
myClick_txt(u'帐号登录')
# Fill `data` with the real username and password.
data ={'username':'******','password':'******'}
browser.fill_form(data)
browser.find_by_css('.W_btn_a').first.click()
import time
# Open the message history page; the text of the last '.bubble_r .page'
# element seeds `question`.
browser.visit('http://weibo.com/message/history?uid=5175429989#_0')
question = browser.find_by_css('.bubble_r .page').last.text
lastAnswer = ''
answer = ''
j = 0
# NOTE(review): the excerpt stops inside the outer `try:` below; its
# `except`/`finally` clause is not visible here.
while True:
    try:
        # Every 100th iteration reload the page, and keep reloading for as
        # long as no '.bubble_l .page' element is reported present.
        if j % 100 == 0:
            browser.reload()
            while browser.is_element_not_present_by_css('.bubble_l .page'):
                browser.reload()
        j += 1
        i = 0
        # Re-read the last '.bubble_l .page' text until it differs from the
        # previous answer, giving up after 1000 reads.
        while answer == lastAnswer and i < 1000:
            answer = browser.find_by_css('.bubble_l .page').last.text
            i += 1
        print '小冰:',answer
        # Store the exchange; roll back when the statement fails.
        # NOTE(review): the SQL text is built with `%` formatting from chat
        # text -- consider passing the values as driver parameters.
        try:
            cursor.execute(sql%(lastAnswer,question,answer))
            db.commit()
        except:
            db.rollback()
        # The answer just read becomes the input for the next question.
        question = sim_chat(answer)
        print '小黄鸡:',question
        lastAnswer = answer
Exemple #8
0
# Locate the username and password inputs of the login form.
btnEmail = browser.find_by_id('username')
btnPasswd = browser.find_by_id('password')

btnEmail.fill('')  # username
btnPasswd.fill('')  # password
#print dir(browser)
# Interact with elements
btnSubmit = browser.find_by_value("登 录")
btnSubmit.click()  # log in
# button.click()
#time.sleep(10)
time.sleep(6)
browser.visit("http://download.csdn.net/my/downloads")

# Poll every 2 seconds until the pager ('.pageliststy') is present.
while browser.is_element_not_present_by_css(".pageliststy"):
    time.sleep(2)

# Collect the hrefs of the links matched by text on the first page.
urls = [url['href'] for url in browser.find_link_by_text("立即评价,通过可返分")]

# Total number of pages: last path segment of the last pager link's href.
pages = int(browser.find_by_css(".pageliststy")[-1]['href'].split('/')[-1])
for index in range(2, pages + 1):
    browser.visit("http://download.csdn.net/my/downloads/%d" % index)
    time.sleep(5)
    urls = urls + [
        url['href'] for url in browser.find_link_by_text("立即评价,通过可返分")
    ]
    # NOTE(review): this loop is nested in the page loop, so the growing
    # `urls` list is walked again for every page -- confirm that is intended.
    for url in urls:
        print url
        # NOTE(review): the excerpt ends here; the body of this `try:` is not
        # visible.
        try:
# %%
get_ipython().system(u'which chromedriver')

# %%
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}

# %%
browser = Browser('chrome', **executable_path)

# %%
url = 'https://mars.nasa.gov/news/'
browser.visit(url)

# %%
# Give the page up to 5 seconds to render the teaser.  The "present" check is
# the one that waits for an element to show up; the "not present" variant
# returns immediately while the element is still missing.
browser.is_element_present_by_css('.article_teaser_body', wait_time=5)

# %%
# Parse the rendered page with BeautifulSoup.
html = browser.html
soup = BeautifulSoup(html, "html.parser")

# %%
print(soup)

# %%
# First headline on the page and its teaser paragraph.
news_title = soup.find("div", class_="content_title").text
news_paragraph = soup.find("div", class_="article_teaser_body").text
print(f"Title: {news_title}")
print(f"Para: {news_paragraph}")
# Import Splinter and Beautiful Soup
from splinter import Browser
from bs4 import BeautifulSoup as soup
import pandas as pd

# Set executable path and initialize Chrome browser
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path)

# Visit Mars Nasa Site
url = 'https://mars.nasa.gov/news/'
browser.visit(url)
# Optional delay for loading the page: wait up to 3 seconds for the first
# news slide to APPEAR.  The "not present" variant returns immediately while
# the slide is still missing, so it gave no delay at all.
browser.is_element_present_by_css("ul.item_list li.slide", wait_time=3)

# Setup HTML parser
html = browser.html
news_soup = soup(html, 'html.parser')
slide_elem = news_soup.select_one('ul.item_list li.slide')

# Look inside slide_elem for the div with class "content_title".  The result
# is only displayed when run as a notebook cell; as a script this line has no
# effect.
slide_elem.find("div", class_="content_title")

# Use the parent element to find the title div and save its text as `news_title`
news_title = slide_elem.find("div", class_="content_title").get_text()

# Use the parent element to find the paragraph text
news_p = slide_elem.find("div", class_="article_teaser_body").get_text()
# ### Featured Images
Exemple #11
0
class Session:
    """Browser session logged in to jizdenky.studentagency.cz.

    Wraps a splinter ``Browser`` and drives the site's search and booking
    pages: ``search`` lists connections, ``order_time`` opens one and
    returns its selectable seats, ``order_seat`` reserves a seat.
    """

    def __init__(self, browser, user):
        """Start ``browser`` (a splinter driver name) and log ``user`` in.

        ``user`` is a mapping.  ``login`` and ``password`` are read here;
        ``first``, ``last``, ``email`` and ``phone`` are read later by
        ``order_seat``.
        """
        self.browser = Browser(browser)
        self.browser.visit('http://jizdenky.studentagency.cz/')
        self.browser.fill_form({
            'passwordAccountCode': user['login'],
            'password': user['password']
        })
        self.browser.execute_script('window.scrollTo(0, 100)')
        button = self.browser.find_by_value('Přihlásit').first
        button.click()
        self.user = user
        self.log = logging.getLogger(__name__)

    def go_search(self):
        """Navigate back to the site's start page, where the search form is."""
        self.browser.visit('http://jizdenky.studentagency.cz/')

    def search(self, task, date_return=None, is_open=False):
        """Submit the search form for ``task`` and return its connections.

        Args:
            task: object providing ``from_city``, ``to_city`` and ``date``.
            date_return: optional return date; when given, the second
                itinerary radio button is chosen and the return-date field
                is filled.
            is_open: when true, the second radio button is chosen and
                ``returnTicketOpen`` is ticked.

        Returns:
            list of ``Connection`` objects, one per ``routeSummary`` entry
            found under a date heading equal to ``task.date``.  Collection
            stops at the first entry under a different date.

        Blocks, without a timeout, until the ``.left_column`` result element
        is present.
        """
        radios = self.browser.find_by_id('hp_form_itinerar').first \
            .find_by_xpath('div/input[@type="radio"]')
        radios[1 if date_return or is_open else 0].check()
        for city, i in [(task.from_city, 1), (task.to_city, 2)]:
            self.browser.find_by_css('input[tabindex="{}"]'.format(i)) \
                        .first.fill(city)
            # Pick the autocomplete suggestion that matches the typed city.
            for item in self.browser.find_by_css('.ui-menu-item'):
                link = item.find_by_tag('a')
                if link.value.lower() == city.lower():
                    link.click()
                    break
        self.browser.fill('departure:dateField', task.date)
        if date_return:
            self.browser.fill('returnDeparture:dateField', date_return)
        if is_open:
            self.browser.check('returnTicketOpen')
        self.browser.find_option_by_text('ISIC').first.check()
        self.browser.find_by_value('Vyhledat').first.click()
        while self.browser.is_element_not_present_by_css('.left_column',
                                                         wait_time=1):
            pass
        items = self.browser.find_by_css('.left_column') \
                            .find_by_xpath('div/div/*')
        connections = []
        # Bound up front so that a route listed before any date heading
        # trips the assert below instead of an UnboundLocalError.
        date_local = None
        for item in items:
            if item.tag_name == 'h2':
                # Second space-separated token of the heading is the date.
                date_local = item.text.split(' ')[1]
            elif item.tag_name == 'div' and item.has_class('routeSummary'):
                assert date_local
                if date_local != task.date:
                    break
                connections.append(Connection(item))
        return connections

    def order_time(self, connection):
        """Open ``connection`` and return the seats that can be picked.

        Clicks the connection repeatedly, dismissing any ``_wicket_window``
        dialog, until the ``sumary_lines`` element is present; then presses
        its button and reads the seat map.

        Returns:
            dict mapping seat number (int) to the seat's browser element.
        """
        while True:
            connection.click()

            dialog = self.browser.find_by_css('[id^=_wicket_window]')
            if dialog:
                dialog.first.find_by_tag('button').click()
            if self.browser.is_element_present_by_id('sumary_lines',
                                                     wait_time=1):
                break
        self.browser.find_by_id('sumary_lines') \
                    .first.find_by_tag('button') \
                    .first.click()
        seats = {}
        bus = self.browser.find_by_css('.seatsContainer')
        if bus:
            for seat in bus.first.find_by_css(
                    '.seatContainer:not([style*=blocked])'):
                # The seat label carries one trailing character after the
                # number; it is cut off before parsing.
                seats[int(seat.find_by_tag('div').first.html[:-1])] = seat
        else:
            # No '.seatsContainer' on the page: fall back to '.vehicle'.
            bus = self.browser.find_by_css('.vehicle')
            for seat in bus.first.find_by_css('.free, .selected'):
                seats[int(seat.text[:-1])] = seat
        return seats

    def order_seat(self, seat):
        """Reserve ``seat`` and, if configured, mail the ticket page.

        Selects the seat, chooses the 'nechci' option of every insurance
        package, fills the passenger form when the first submit button is
        not yet the 'Rezervovat' one, ticks the terms checkbox when present
        and submits.

        When ``conf.yaml`` has an ``email`` section, waits (without a
        timeout) for the ``ticketPage`` element and sends its HTML to
        ``self.user['email']`` over SMTP with STARTTLS.  The configured
        password is stored base64-encoded.
        """
        if not seat.has_class('selected'):
            seat.click()
        for fs in self.browser.find_by_css('fieldset.topRoute'):
            legend = fs.find_by_css('legend')
            if legend and 'Pojištění' in legend[0].text:
                for package in fs.find_by_css('.insurancePackageType'):
                    if 'nechci' in package.find_by_tag('label').text:
                        package.find_by_tag('input').click()
                        time.sleep(1)
        submit = self.browser.find_by_css('[name^=buttonContainer]').first
        if 'Rezervovat' not in submit.text:
            # Intermediate step: passenger details have to be entered first.
            submit.click()
            time.sleep(1)
            data = (self.user['first'], self.user['last'], self.user['email'],
                    self.user['phone'])
            inputs = self.browser.find_by_id('passengerInfo') \
                                 .first.find_by_tag('input')
            for item, value in zip(inputs, data):
                item.fill(value)
            submit = self.browser.find_by_css('[name^=buttonContainer]').first
            assert 'Rezervovat' in submit.text
        agreement = self.browser.find_by_css(
            '[name="bottomComponent:termsAgreementCont:termsAgreementCB"]')
        if agreement:
            agreement[0].check()
        time.sleep(1)
        submit.click()
        with open('conf.yaml') as f:
            # safe_load: plain configuration data needs no arbitrary object
            # construction, and load() without a Loader is rejected by
            # current PyYAML releases.
            conf = yaml.safe_load(f)
        if 'email' in conf:
            email = conf['email']
            while self.browser.is_element_not_present_by_id('ticketPage',
                                                            wait_time=1):
                pass
            msg = MIMEText(
                self.browser.find_by_id('ticketPage').first.html, 'html')
            msg['Subject'] = 'SA reservation'
            msg['From'] = email['from']
            msg['To'] = self.user['email']
            username = email['username']
            password = email['password']
            server = smtplib.SMTP(email['server'])
            try:
                server.starttls()
                server.login(username, b64decode(password).decode())
                server.sendmail(msg['From'], msg['To'], msg.as_string())
            finally:
                # Close the connection even when login or sending fails.
                server.quit()
Exemple #12
0
    classfile.write(classid), classfile.write('\n')
#print classlist
#打开微信
wechaturl = 'http://wechat.shwilling.com/auth/qrcode/login?redirect=http%3A%2F%2Fwechat.shwilling.com%2Fsjtu%2Fcourse'
browser.visit(wechaturl)
print u'你现在有20s的时间扫描二维码确认登陆'
time.sleep(10)
print u'请稍等,本程序稍微有点慢...但是等待还是值得的.'
myfile = open(u'all_scorelist.txt', 'w')
for classid in classlist:
    time = ['/2014-2015-1', '/2014-2015-2', '/2015-2016-1', '/2015-2016-2']
    for i in range(4):
        class_str = 'http://wechat.shwilling.com/sjtu/course/detail/' + classid + time[
            i]
        browser.visit(class_str)
        if (browser.is_element_not_present_by_css('.d-name')):
            pass
        else:
            name = browser.find_by_css('.d-name').text
            timea = browser.find_by_css('.c-code').text
            meanscore = browser.find_by_css('.c-aver').text
            highscore = browser.find_by_css('.c-max').text
            print name, time[i], meanscore, highscore
            myfile.write(
                name.encode('utf-8')), myfile.write('\t'), myfile.write(
                    time[i].encode('utf-8')), myfile.write('\t'), myfile.write(
                        meanscore.encode(
                            'utf-8')), myfile.write('\t'), myfile.write(
                                highscore.encode('utf-8')), myfile.write('\n')

classfile.close()
	if (re.match(pattern,ele.text)):
		classlist.append(ele.text)

#打开微信
wechaturl='http://wechat.shwilling.com/auth/qrcode/login?redirect=http%3A%2F%2Fwechat.shwilling.com%2Fsjtu%2Fcourse'
browser.visit(wechaturl)
print u'你现在有20s的时间扫描二维码确认登陆'
sleep(10)
print u'请稍等,本程序稍微有点慢...但是等待还是值得的.'
myfile=open(u'scorelist.txt','w')
for classid in classlist:
	time=['/2014-2015-1','/2014-2015-2','/2015-2016-1']
	for i in range(3):
		class_str='http://wechat.shwilling.com/sjtu/course/detail/'+classid+time[i]
		browser.visit(class_str)
		if (browser.is_element_not_present_by_css('.d-name')):
			pass
		else:
			name=browser.find_by_css('.d-name').text
			timea=browser.find_by_css('.c-code').text
			meanscore=browser.find_by_css('.c-aver').text
			highscore=browser.find_by_css('.c-max').text
			print name,time[i],meanscore,highscore
			myfile.write(name.encode('utf-8')),myfile.write('\t'),myfile.write(time[i].encode('utf-8')),myfile.write('\t'),myfile.write(meanscore.encode('utf-8')),myfile.write('\t'), myfile.write(highscore.encode('utf-8')),myfile.write('\n')

browser.quit()
myfile.close()

	
	
def main():
    """Register ``ntimes`` Yandex mail accounts through a browser.

    For each account: draws a random first and last name, opens the Yandex
    registration page, fills the form, answers the captcha via
    ``captcha(browser)`` and, when the page then shows "Personal
    information", appends ``login@yandex.com:login:password`` to
    ``yandex<today>.txt``.  Any failure closes the browser and ends the
    program through ``sys.exit``.

    NOTE(review): two lines below are damaged -- a ``******`` run replaces
    part of the statement following ``print "login:"`` and
    ``print "password:"``.  Presumably the lost code filled the login and
    password fields; it has to be restored from the original script before
    this function can run.
    """

    #how many accounts we need
    ntimes = 1

    for i in range(1,ntimes+1):

        print "starting browser"
        firstname = names.get_first_name()
        #print "firstname", firstname
        lastname = names.get_last_name()
        #print "lastname", lastname

        browser = Browser() #Browser(user_agent="Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en)")


        browser.visit('https://passport.yandex.com/registration/mail')

        browser.find_by_id('firstname').fill(firstname)
        browser.find_by_id('lastname').fill(lastname)

        # Try generated logins until the page reports one as available;
        # give up once the attempt counter exceeds 3.
        testlogin = False
        count = 0
        while (testlogin == False):
            count = count + 1
            # Candidate login: first name + last name + random number 10..1000.
            login = firstname+lastname+str(randint(10,1000))
            # NOTE(review): damaged line, see the docstring.
            print "login:"******"div.control__error__login_notavailable", wait_time=2)
            if browser.is_text_present("username available"):
                testlogin = True
            else:
                print "login is not available, generate new"
            if (count>3):
                #print "logins in this script is unavailable now, please make new login generator"
                browser.quit()
                sys.exit("logins in this script is unavailable now, please make new login generator")

        password = password_generator.generate()
        # NOTE(review): damaged line, see the docstring.
        print "password:"******"hint_question_id").click()

        #wait
        browser.is_element_not_present_by_css("li[role=\"presentation\"]", wait_time=3)

        #check first question
        browser.find_by_css("li[role=\"presentation\"]")[1].click()

        # The first name doubles as the answer to the hint question.
        browser.find_by_id("hint_answer").fill(firstname)

        gateimgcode = captcha(browser)
        browser.find_by_id('answer').fill(gateimgcode)

        browser.find_by_css("button[type=\"submit\"]").click()

        # Re-submit with a fresh captcha answer while the page reports the
        # previous one as wrong; give up once the counter exceeds 3.
        testcaptcha = False
        count = 0
        while (testcaptcha == False):
            count = count + 1
            browser.is_element_not_present_by_css("div.control__error__captcha_incorrect", wait_time=2)
            if browser.is_text_present("characters were entered incorrectly"):
                print "captcha code is bad, try again"

                # The password fields are filled again before resubmitting.
                browser.find_by_id('password').fill(password)
                browser.find_by_id('password_confirm').fill(password)
                gateimgcode = captcha(browser)
                browser.find_by_id('answer').fill(gateimgcode)
                browser.find_by_css("button[type=\"submit\"]").click()
            else:
                testcaptcha = True
            if (count>3):
                #print "something wrong with captcha"
                browser.quit()
                sys.exit("something wrong with captcha")

        browser.is_element_not_present_by_tag("html", wait_time=2)

        # "Personal information" on the page is taken as the success marker.
        if browser.is_text_present("Personal information"):
            today = datetime.date.today()
            filename = 'yandex'+str(today)+'.txt'
            file = open(filename,'a')
            file.write(login+'@yandex.com'+':'+login+':'+password+'\n')
            file.close()
            print str(i)+" accounts saved to "+filename
            browser.quit()
        else:
            #print "something wrong, please start script again"
            browser.quit()
            sys.exit("something wrong, please start script again")