def get_info(id):
    """Load the submissions page of user ``id`` in a browser and print its HTML.

    The page at ``http://informatics.mccme.ru/submits/view.php?user_id=<id>``
    is loaded, a further load event is awaited, and the resulting HTML is
    printed to stdout. Nothing is returned.

    :param id: user identifier appended to the query string; any value that
        formats with ``%s`` (string or integer) is accepted.

    An earlier approach logged in through selenium and copied the session
    cookies into ``requests``; it was fully commented out, so it and the
    login URL / credential placeholders that only it used were removed.
    """
    browser = Browser()
    try:
        # %-formatting (rather than ``+``) so that integer ids work as well.
        browser.load(
            'http://informatics.mccme.ru/submits/view.php?user_id=%s' % (id,))
        browser.wait_load()
        html = browser.html
    finally:
        # Release the browser even when loading fails.
        # NOTE(review): assumes ``Browser`` is spynner's Browser, which
        # provides close() -- confirm against this file's imports.
        browser.close()
    print(html)
def process_request(self, request, spider):
    """Render ``request.url`` in a browser and return the rendered page.

    A fresh ``Browser`` is created for every request, the URL is loaded,
    and the page HTML is returned as an ``HtmlResponse``.

    :param request: object whose ``url`` attribute is the page to render.
    :param spider: unused by this method.
    :returns: ``HtmlResponse`` for ``request.url`` whose body is the page
        HTML encoded as UTF-8.
    """
    browser = Browser()
    try:
        browser.create_webview()
        browser.set_html_parser(PyQuery)
        # 20 is load()'s second positional argument -- presumably the load
        # timeout in seconds; confirm against the Browser API.
        browser.load(request.url, 20)
        try:
            browser.wait_load(10)
        except Exception:
            # Best effort: a failed extra wait is reported but the HTML
            # rendered so far is still returned. ``Exception`` (not a bare
            # ``except``) lets KeyboardInterrupt/SystemExit propagate.
            print('###########ERROR###########')
        body = browser.html.encode('utf-8')
    finally:
        # One browser per request: release it so instances do not pile up.
        browser.close()
    # Declare the encoding explicitly so the response is decoded with the
    # same codec the body was just encoded with.
    return HtmlResponse(request.url, body=body, encoding='utf-8')
class FileTranslator(object):
    """Translate text, or the ``text`` field of a JSON file, with Google Translate.

    One of three back ends is selected by ``translate_type`` and exposed as
    ``self.translate(text)``:

    * ``'headless'`` -- a ``Browser`` whose pages are parsed with ``PyQuery``
    * ``'selenium'`` -- a Firefox instance driven through ``webdriver``
    * ``'simple'``   -- a plain ``urllib2`` request to the mobile page

    ``source_lang`` and ``target_lang`` must provide ``to_google_translate()``,
    whose result is interpolated into the request URLs.
    """

    # Text that immediately precedes the translation in the mobile page.
    _MOBILE_RESULT_MARKER = 'class="t0">'
    _MOBILE_USER_AGENT = (
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; "
        ".NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)")

    def __init__(self, source_lang, target_lang, translate_type='headless'):
        """Store the language pair and set up the chosen back end.

        :param source_lang: language object of the input text.
        :param target_lang: language object of the output text.
        :param translate_type: ``'headless'``, ``'selenium'`` or ``'simple'``.
        :raises KeyError: if ``translate_type`` is none of the above.
        """
        self.source_lang = source_lang
        self.target_lang = target_lang

        # Only the browser needed by the selected back end is instantiated.
        self.headless_browser = self.driver = None
        if translate_type == 'headless':
            self.headless_browser = Browser()
            self.headless_browser.set_html_parser(PyQuery)
        elif translate_type == 'selenium':
            self.driver = webdriver.Firefox()

        # Bind the public ``translate`` entry point to the chosen back end.
        self.translate = {
            'simple': self.translate_simple,
            'headless': self.translate_text_google_headless,
            'selenium': self.translate_text_google,
        }[translate_type]

    def _forcefully_kill_firefox(self):
        """Kill Firefox with ``taskkill``; used when it gets stuck."""
        print("Killing Firefox forcefully...")
        os.system('taskkill /im firefox.exe /f /t')

    def translate_text_google(self, text_to_translate, quit_browser=True):
        """Translate through the Google Translate site using selenium.

        Kinda slow, and gets stuck.

        :param text_to_translate: text placed in the URL fragment.
        :param quit_browser: accepted for compatibility; currently ignored.
        :returns: the text of the ``result_box`` element.
        """
        url = "http://translate.google.com/#%s/%s/%s" % (
            self.source_lang.to_google_translate(),
            self.target_lang.to_google_translate(),
            text_to_translate)
        self.driver.get(url)

        # Watchdog: if the result has not shown up after 11 seconds Firefox
        # is considered stuck and gets killed. cancel() is only reached when
        # the 10 second wait succeeds; if the wait raises, the timer still
        # fires.
        watchdog = Timer(11.0, self._forcefully_kill_firefox)
        watchdog.start()
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, "//span[@id='result_box']/span[@class='hps']")))
        watchdog.cancel()

        translated_text = self.driver.find_element_by_id('result_box').text
        # Navigate away so the next call starts from a different page.
        self.driver.get('http://www.google.com')
        return translated_text

    def translate_text_google_headless(self, text_to_translate):
        """Translate through the Google Translate site using the headless browser.

        Uses spynner - a headless browser. This works really fast.

        :param text_to_translate: text placed in the URL fragment.
        :returns: text content of the first ``#result_box`` element.
        """
        url = "https://translate.google.com/#%s/%s/%s" % (
            self.source_lang.to_google_translate(),
            self.target_lang.to_google_translate(),
            text_to_translate)
        self.headless_browser.load(url)
        result_box = self.headless_browser.soup('#result_box')[0]
        translated_text = result_box.text_content()
        # Navigate away so the next call starts from a different page.
        self.headless_browser.load('http://www.example.com/')
        return translated_text

    def _extract_translation(self, page):
        """Return the text between the result marker and the next ``<``.

        :param page: content of the mobile Google Translate page.
        :raises ValueError: if the marker is missing, instead of returning
            an arbitrary slice of the page.
        """
        marker = self._MOBILE_RESULT_MARKER
        start = page.find(marker)
        if start == -1:
            raise ValueError(
                "translation marker %r not found in response" % (marker,))
        return page[start + len(marker):].split("<", 1)[0]

    def translate_simple(self, text_to_translate):
        """Translate with a single ``urllib2`` request to the mobile page.

        It doesn't really work on more than 2000 characters for some reason
        (maybe a limit posed by google).

        :param text_to_translate: text sent as the ``q`` query parameter.
        :returns: the translation as found in the response body.
        :raises ValueError: if the response does not contain a translation.
        """
        import urllib  # function scope: only this back end needs it

        # quote_plus works on byte strings; non-``str`` text (Python 2
        # ``unicode``) is encoded first.
        if not isinstance(text_to_translate, str):
            text_to_translate = text_to_translate.encode('utf-8')
        # Spaces still become '+', and characters such as '&', '#' or '+'
        # are now escaped instead of corrupting the query string.
        link = "http://translate.google.com/m?hl=%s&sl=%s&q=%s" % (
            self.target_lang.to_google_translate(),
            self.source_lang.to_google_translate(),
            urllib.quote_plus(text_to_translate))
        request = urllib2.Request(
            link, headers={'User-Agent': self._MOBILE_USER_AGENT})
        response = urllib2.urlopen(request)
        try:
            page = response.read()
        finally:
            response.close()
        return self._extract_translation(page)

    def translate_text_spanishenglish(self, text_to_translate):
        """Placeholder for translating through spanishenglish.com; does nothing.

        An attempt to use spynner to translate through spanishenglish.com
        (instead of google, in case google blocks us). For some reason
        spanishenglish.com doesn't work with the spynner core (it does work
        well with chrome though), so this is stuck for now. The abandoned
        attempt loaded the site, clicked the language links, filled
        ``#InputText``, clicked ``#TranslateButton`` and read
        ``#OutputTextHtmlCell``.
        """
        pass

    def translate_file(self, path):
        """Translate the ``text`` field of the JSON document stored at ``path``.

        :returns: whatever the selected back end returns for that text.
        """
        with open(path) as f:
            raw = f.read()
        text = ujson.loads(raw)['text']
        return self.translate(text)

    def translate_to_file(self, source_path, target_path):
        """Translate ``source_path`` and write the result to ``target_path``.

        Failures are reported on stdout (or, for ``CannotSendRequest``,
        handled silently) and leave ``target_path`` untouched; nothing is
        raised to the caller.
        """
        try:
            translated_text = self.translate_file(source_path)
        except httplib.CannotSendRequest:
            # Presumably raised once the selenium-driven Firefox has died.
            # Start a new one, but only when this instance uses selenium, so
            # the other back ends never spawn a Firefox.
            if self.driver is not None:
                self.driver = webdriver.Firefox()
            return
        except Exception:
            print("Failed to translate: {:s}".format(source_path))
            print('=' * 60)
            import traceback
            print(traceback.format_exc())
            return

        with codecs.open(target_path, 'w', 'utf-8') as f:
            f.write(translated_text)
# Scratch experiment: drive spanishenglish.com with spynner and read the
# translation output. Earlier, abandoned ideas were connecting a callback to
# Qt's "loadFinished" signal and opening the site with mechanize
# (robots.txt handling disabled).
import time

from spynner import Browser
from pyquery import PyQuery

browser = Browser()
browser.set_html_parser(PyQuery)

url = "http://www.spanishenglish.com/"
browser.load(url)
print("LOADED")

# Click the two language links, type the sample sentence, start translating.
browser.click("a[href='#en']", wait_load=False)
browser.click("#LangPair_ToDDL tbody tr td a[href='#de']", wait_load=False)
browser.wk_fill("#InputText", "How are you doing today?")
browser.click_ajax("#TranslateButton")

# Fixed pause before reading the output element.
time.sleep(15)
result_box = browser.soup('#TranslationOutput')[0]
translated_text = result_box.text_content()

# browser.show() was used here while debugging.
time.sleep(30)
print('123')

# The script ends by binding ``browser`` to a second, fresh Browser;
# nothing in this script uses it afterwards.
browser = Browser()
# Scratch experiment: drive spanishenglish.com with spynner and read the
# translation output. An earlier, abandoned idea was opening the site with
# mechanize (robots.txt handling disabled).
import time

from spynner import Browser
from pyquery import PyQuery

browser = Browser()
browser.set_html_parser(PyQuery)

url = "http://www.spanishenglish.com/"
browser.load(url)
print("LOADED")

# Click the two language links, type the sample sentence, start translating.
browser.click("a[href='#en']", wait_load=False)
browser.click("#LangPair_ToDDL tbody tr td a[href='#de']", wait_load=False)
browser.wk_fill("#InputText", "How are you doing today?")
browser.click_ajax("#TranslateButton")

# Fixed pause before reading the output element.
time.sleep(15)
result_box = browser.soup('#TranslationOutput')[0]
translated_text = result_box.text_content()

# browser.show() was used here while debugging.
time.sleep(30)
print('123')
class FileTranslator(object):
    """Translate text, or the ``text`` field of a JSON file, with Google Translate.

    One of three back ends is selected by ``translate_type`` and exposed as
    ``self.translate(text)``:

    * ``'headless'`` -- a ``Browser`` whose pages are parsed with ``PyQuery``
    * ``'selenium'`` -- a Firefox instance driven through ``webdriver``
    * ``'simple'``   -- a plain ``urllib2`` request to the mobile page

    ``source_lang`` and ``target_lang`` must provide ``to_google_translate()``,
    whose result is interpolated into the request URLs.
    """

    # Text that immediately precedes the translation in the mobile page.
    _MOBILE_RESULT_MARKER = 'class="t0">'
    _MOBILE_USER_AGENT = (
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; "
        ".NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)")

    def __init__(self, source_lang, target_lang, translate_type='headless'):
        """Store the language pair and set up the chosen back end.

        :param source_lang: language object of the input text.
        :param target_lang: language object of the output text.
        :param translate_type: ``'headless'``, ``'selenium'`` or ``'simple'``.
        :raises KeyError: if ``translate_type`` is none of the above.
        """
        self.source_lang = source_lang
        self.target_lang = target_lang

        # Only the browser needed by the selected back end is instantiated.
        self.headless_browser = self.driver = None
        if translate_type == 'headless':
            self.headless_browser = Browser()
            self.headless_browser.set_html_parser(PyQuery)
        elif translate_type == 'selenium':
            self.driver = webdriver.Firefox()

        # Bind the public ``translate`` entry point to the chosen back end.
        self.translate = {
            'simple': self.translate_simple,
            'headless': self.translate_text_google_headless,
            'selenium': self.translate_text_google,
        }[translate_type]

    def _forcefully_kill_firefox(self):
        """Kill Firefox with ``taskkill``; used when it gets stuck."""
        print("Killing Firefox forcefully...")
        os.system('taskkill /im firefox.exe /f /t')

    def translate_text_google(self, text_to_translate, quit_browser=True):
        """Translate through the Google Translate site using selenium.

        Kinda slow, and gets stuck.

        :param text_to_translate: text placed in the URL fragment.
        :param quit_browser: accepted for compatibility; currently ignored.
        :returns: the text of the ``result_box`` element.
        """
        url = "http://translate.google.com/#%s/%s/%s" % (
            self.source_lang.to_google_translate(),
            self.target_lang.to_google_translate(),
            text_to_translate)
        self.driver.get(url)

        # Watchdog: if the result has not shown up after 11 seconds Firefox
        # is considered stuck and gets killed. cancel() is only reached when
        # the 10 second wait succeeds; if the wait raises, the timer still
        # fires.
        watchdog = Timer(11.0, self._forcefully_kill_firefox)
        watchdog.start()
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, "//span[@id='result_box']/span[@class='hps']")))
        watchdog.cancel()

        translated_text = self.driver.find_element_by_id('result_box').text
        # Navigate away so the next call starts from a different page.
        self.driver.get('http://www.google.com')
        return translated_text

    def translate_text_google_headless(self, text_to_translate):
        """Translate through the Google Translate site using the headless browser.

        Uses spynner - a headless browser. This works really fast.

        :param text_to_translate: text placed in the URL fragment.
        :returns: text content of the first ``#result_box`` element.
        """
        url = "https://translate.google.com/#%s/%s/%s" % (
            self.source_lang.to_google_translate(),
            self.target_lang.to_google_translate(),
            text_to_translate)
        self.headless_browser.load(url)
        result_box = self.headless_browser.soup('#result_box')[0]
        translated_text = result_box.text_content()
        # Navigate away so the next call starts from a different page.
        self.headless_browser.load('http://www.example.com/')
        return translated_text

    def _extract_translation(self, page):
        """Return the text between the result marker and the next ``<``.

        :param page: content of the mobile Google Translate page.
        :raises ValueError: if the marker is missing, instead of returning
            an arbitrary slice of the page.
        """
        marker = self._MOBILE_RESULT_MARKER
        start = page.find(marker)
        if start == -1:
            raise ValueError(
                "translation marker %r not found in response" % (marker,))
        return page[start + len(marker):].split("<", 1)[0]

    def translate_simple(self, text_to_translate):
        """Translate with a single ``urllib2`` request to the mobile page.

        It doesn't really work on more than 2000 characters for some reason
        (maybe a limit posed by google).

        :param text_to_translate: text sent as the ``q`` query parameter.
        :returns: the translation as found in the response body.
        :raises ValueError: if the response does not contain a translation.
        """
        import urllib  # function scope: only this back end needs it

        # quote_plus works on byte strings; non-``str`` text (Python 2
        # ``unicode``) is encoded first.
        if not isinstance(text_to_translate, str):
            text_to_translate = text_to_translate.encode('utf-8')
        # Spaces still become '+', and characters such as '&', '#' or '+'
        # are now escaped instead of corrupting the query string.
        link = "http://translate.google.com/m?hl=%s&sl=%s&q=%s" % (
            self.target_lang.to_google_translate(),
            self.source_lang.to_google_translate(),
            urllib.quote_plus(text_to_translate))
        request = urllib2.Request(
            link, headers={'User-Agent': self._MOBILE_USER_AGENT})
        response = urllib2.urlopen(request)
        try:
            page = response.read()
        finally:
            response.close()
        return self._extract_translation(page)

    def translate_text_spanishenglish(self, text_to_translate):
        """Placeholder for translating through spanishenglish.com; does nothing.

        An attempt to use spynner to translate through spanishenglish.com
        (instead of google, in case google blocks us). For some reason
        spanishenglish.com doesn't work with the spynner core (it does work
        well with chrome though), so this is stuck for now. The abandoned
        attempt loaded the site, clicked the language links, filled
        ``#InputText``, clicked ``#TranslateButton`` and read
        ``#OutputTextHtmlCell``.
        """
        pass

    def translate_file(self, path):
        """Translate the ``text`` field of the JSON document stored at ``path``.

        :returns: whatever the selected back end returns for that text.
        """
        with open(path) as f:
            raw = f.read()
        text = simplejson.loads(raw)['text']
        return self.translate(text)

    def translate_to_file(self, source_path, target_path):
        """Translate ``source_path`` and write the result to ``target_path``.

        Failures are reported on stdout (or, for ``CannotSendRequest``,
        handled silently) and leave ``target_path`` untouched; nothing is
        raised to the caller.
        """
        try:
            translated_text = self.translate_file(source_path)
        except httplib.CannotSendRequest:
            # Presumably raised once the selenium-driven Firefox has died.
            # Start a new one, but only when this instance uses selenium, so
            # the other back ends never spawn a Firefox.
            if self.driver is not None:
                self.driver = webdriver.Firefox()
            return
        except Exception:
            print("Failed to translate: {:s}".format(source_path))
            print('=' * 60)
            import traceback
            print(traceback.format_exc())
            return

        with codecs.open(target_path, 'w', 'utf-8') as f:
            f.write(translated_text)