def getUdemyLinks(class01):
    """Fetch the page at URL *class01* and return the href of the first
    anchor styled 'button large visit', ASCII-encoded.

    Raises IndexError when no such anchor is present on the page.
    NOTE(review): the page is parsed with the 'xml' parser although the
    content is presumably HTML -- confirm lxml's XML mode handles it.
    """
    # print class01
    br = mechanize.Browser(factory=mechanize.RobustFactory())
    br.set_handle_robots(False)  # ignore robots.txt so the fetch is not blocked
    webpage = br.open(class01).read()
    soupPage = BeautifulSoup(webpage, 'xml')
    # First matching anchor only; drop non-ASCII characters from the href.
    udemy_links = soupPage.findAll(
        "a", {"class": "button large visit"})[0]["href"].encode("ascii", "ignore")
    # print udemy_links
    return udemy_links
def patterncounter(): #defining function count = 0 #declaring variable count for counting browser = mechanize.Browser(factory=mechanize.RobustFactory()) #initialising browser browser.set_handle_robots(False) browser.open("http://www.minerbots.blogspot.in/") #opening URL html = browser.response().readlines() #Fetching web contents for i in range(0, len(html)): #Searching for pattern 'Vicz' line by line if 'Vicz' in html[i]: count = count + 1 print "%d No of times found" % count #analyzing and producing results
def sifre2():
    """Log in to koditr.org with the stored add-on credentials and return
    the HTML of the members-only 'greating1' page.

    On login failure (no "welcome" marker in the response) a Kodi error
    dialog is shown, but the page is still fetched and returned.
    """
    filepath = os.path.join(folders, 'nfo.txt')  # NOTE(review): unused here
    cj = mechanize.CookieJar()
    name = __settings__.getSetting("Name")  # NOTE(review): unused here
    login = __settings__.getSetting("Username")
    password = __settings__.getSetting("password")  # NOTE(review): unused here
    # Prompt the user for credentials if no username is configured yet.
    if not login:
        __settings__.openSettings()
    else:
        pass
    br = mechanize.Browser(factory=mechanize.RobustFactory())
    br.set_cookiejar(cj)
    br.set_handle_equiv(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    # Follow instant meta-refresh redirects only.
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    # Browser-like headers so the WordPress login does not reject us.
    br.addheaders = [
        ('User-agent',
         'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'
         ),
        ('Accept',
         'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
        ('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.3'),
        ('Accept-Encoding', 'none'),
        ('Accept-Language', 'en-US,en;q=0.8'),
        ('Connection', 'keep-alive')
    ]
    br.open('https://koditr.org/wp-login.php')
    br.title()
    # WordPress login form is the first form on the page.
    br.select_form(nr=0)
    br.form['log'] = __settings__.getSetting("Username")
    br.form['pwd'] = __settings__.getSetting("password")
    br.submit()
    html2 = br.response().read()
    if "welcome" in html2:
        print "basarili bir login yapildi"
    else:
        dialog = xbmcgui.DialogProgress()  # NOTE(review): created but unused
        dialog1 = xbmcgui.Dialog()
        dialog1.ok(
            '[COLOR red][B]IPTV HATA UYARISI[/B][/COLOR]',
            '[COLOR yellow][B]Bronze Uye Olmaniz Gerekiyor!!! Eger Bronze Uye Iseniz ve Bu Mesaji Goruyorsaniz[/B][/COLOR]',
            '[COLOR red][B]Yanlis Kullanici adi veya Sifre Girdiniz!!! Lutfen Tekrar Deneyiniz.[/B][/COLOR]'
        )
    # Fetch the member page with the (possibly authenticated) session.
    br.open('https://koditr.org/greating1/')
    html = br.response().read()
    return html
def __init__(self, config, cookie_jar):
    """Initialize the browser and apply config/cookie settings.

    Tries mechanize's RobustFactory first; older/newer mechanize builds
    (3.x) that reject the ``factory`` argument fall back to the default
    factory (fix #218).
    """
    try:
        mechanize.Browser.__init__(self, factory=mechanize.RobustFactory())
    except BaseException:
        PixivHelper.GetLogger().info(
            "Using default factory (mechanize 3.x ?)")
        mechanize.Browser.__init__(self)
    self._configureBrowser(config)
    self._configureCookie(cookie_jar)
def __init__(self):
    """Create the wrapped mechanize browser with permissive defaults:
    redirects/referers/meta-equiv enabled, robots.txt ignored, instant
    meta-refresh followed, and a Firefox user agent."""
    self._browser = mechanize.Browser(factory=mechanize.RobustFactory())
    self._browser.set_handle_equiv(True)
    self._browser.set_handle_redirect(True)
    self._browser.set_handle_referer(True)
    self._browser.set_handle_robots(False)
    # max_time=1: only follow refresh headers that fire immediately.
    self._browser.set_handle_refresh(
        mechanize._http.HTTPRefreshProcessor(), max_time=1)
    self._browser.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (X11; Linux i686; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'
    )]
def donate(donation_amount, tid):
    """Submit a donation of *donation_amount* dollars via the ProPublica
    donate form, using card/address data from the module-level ``billing``
    dict and substituting *tid* into the billing email template.

    Returns True when the confirmation page contains 'Thank You.',
    False otherwise.
    """
    logging.info('Donating ${} for tid {}'.format(donation_amount, tid))
    donation_amount = str(donation_amount)
    br = mechanize.Browser(factory=mechanize.RobustFactory())
    br.addheaders = [(
        'User-agent',
        ' Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'
    )]
    br.set_handle_robots(False)
    br.open('https://www.propublica.org/donate/')
    br.select_form('payment_form')
    # Inject a custom amount option into the price radio group and pick it.
    prices = br.find_control('UnitPrice1')
    custom_price = mechanize.Item(prices, {
        'contents': donation_amount,
        'value': donation_amount
    })
    custom_price.selected = True
    br.form['c_amount'] = donation_amount
    br.form['BillingFirstName'] = billing['first']
    br.form['BillingLastName'] = billing['last']
    # BUG FIX: the old if/else branches were identical, and the condition
    # tested endswith('+{}@gmail.com') on the *unformatted* template, which
    # could never match as intended. Both paths formatted tid in anyway.
    br.form['BillingEmail'] = billing['email'].format(tid)
    br.form['CardNumber'] = billing['cc']
    br.form['ExpirationMonth'] = [billing['exp_mo']]
    br.form['ExpirationYear'] = [billing['exp_yr']]
    br.form['Cvv2'] = billing['cvv']
    br.form['BillingAddress1'] = billing['street']
    br.form['BillingCity'] = billing['city']
    br.form['BillingStateProvince'] = [billing['state']]
    br.form['BillingPostalCode'] = billing['zip']
    br.form['BillingCountryCode'] = [billing['country']]
    response = br.submit()
    # BUG FIX: read the body exactly once. mechanize responses are streams,
    # so the old second response.read() in the failure branch returned ''
    # and the warning log was always empty.
    body = response.read()
    if 'Thank You.' in body:
        logging.info('Donation success: ${} for tid {}'.format(
            donation_amount, tid))
        return True
    else:
        logging.warning(body)
        logging.warning('Donation failed: ${} for tid {}'.format(
            donation_amount, tid))
        return False
def __init__(self):
    """
    creates a mechanize.Browser with some custom settings
    (meta-equiv and redirects on, robots.txt ignored, generic UA)
    creates (empty) custom attributes used later for URL guessing
    """
    LOG.info("")
    LOG.info("Creating Browser")
    mechanize.Browser.__init__(self, factory=mechanize.RobustFactory())
    self.set_handle_equiv(True)
    self.set_handle_redirect(True)
    self.set_handle_robots(False)
    self.addheaders = [('User-agent', 'Mozilla/5.0')]
    # Filled in by later navigation; None until discovered.
    self.userURL = None
    self.guessURL = {"index": None, "login": None, "upload": None}
def __init__(self, username, password, ignore_re=None, retries=3,
             skip_existing=True):
    """Store the upload credentials/options and open ``self.ping_url``
    once to prime the session.

    NOTE(review): ``self.ping_url`` is defined elsewhere (class attribute
    or subclass) -- confirm it exists before __init__ runs.
    """
    self.username = username
    self.password = password
    self.ignore_re = ignore_re        # optional pattern of files to skip
    self.retries = retries            # how many times to retry an upload
    self.skip_existing = skip_existing
    self.br = mechanize.Browser(factory=mechanize.RobustFactory())
    self.br.addheaders = [('User-agent',
                           'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) \
Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    # Follow instant meta-refresh redirects only.
    self.br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(),
                               max_time=1)
    self.br.open(self.ping_url).read()
def browser():
    """Submit the module-level translation query and return the parsed
    result page as an lxml element tree.

    NOTE(review): relies on module globals ``url``, ``flang`` (source
    language), ``tlang`` (target language) and ``words`` (text to
    translate) -- confirm they are set before calling.
    """
    br = mechanize.Browser(factory=mechanize.RobustFactory())
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    br.open(url)
    br.select_form(nr=0)  # select 1st form
    # List-valued assignment: these are select controls.
    br.find_control(name="sl").value = [flang]
    br.find_control(name="tl").value = [tlang]
    br.form["text"] = words
    br.submit()
    return etree.HTML(br.response().read())
def download_mechanize():
    """Search PyPI for 'mechanize', follow the package link, and download
    its .tar.gz sdist into the current directory. Exits via sys.exit when
    the target file already exists."""
    br = mechanize.Browser(factory=mechanize.RobustFactory())
    br.set_handle_robots(False)  # PyPI's robots.txt would block us
    br.open("http://pypi.python.org/pypi")
    # Fill in and submit the package search form.
    br.select_form(name="searchform")
    br.form["term"] = "mechanize"
    br.submit()
    # Jump to the package page, then locate the source tarball link.
    br.follow_link(text_regex="mechanize-?(.*)")
    link = br.find_link(text_regex=r"\.tar\.gz")
    filename = os.path.basename(urlparse.urlsplit(link.url)[2])
    if os.path.exists(filename):
        sys.exit("%s already exists, not grabbing" % filename)
    br.retrieve(link.url, filename)
def __init__(self):
    """Set up the Moodle-scraping browser: robots/referers/meta-equiv off,
    redirects on with debug tracing, short meta-refresh window, and a
    Firefox-on-Ubuntu user agent."""
    print("Initializing moodle ... ")
    self.br = mechanize.Browser(factory=mechanize.RobustFactory())
    self.br.set_handle_equiv(False)
    self.br.set_handle_robots(False)
    self.br.set_handle_referer(False)
    self.br.set_handle_redirect(True)
    self.br.set_debug_redirects(True)    # log redirect chains
    self.br.set_debug_responses(False)
    self.br.set_debug_http(False)
    # Follow refresh headers that fire within 2 seconds.
    self.br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(),
                               max_time=2)
    self.br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux 1686; en-US;\
 rv:1.9.0.1) Gecko/201171615 Ubuntu/11.10-1 Firefox/3.0.1')]
def GetContent():
    """Open the zvg-portal.de auction search, dump all forms/controls to
    stdout for debugging, search Bavaria ('by'), and return the result
    page HTML."""
    # Browser
    #br = mechanize.Browser()
    #br = mechanize.Browser(factory=mechanize.DefaultFactory(i_want_broken_xhtml_support=True))
    br = mechanize.Browser(factory=mechanize.RobustFactory())
    br.add_handler(PrettifyHandler())  # project-local response prettifier
    # Cookie Jar
    #cj = cookielib.LWPCookieJar()
    #br.set_cookiejar(cj)
    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    # Follow instant meta-refresh redirects only.
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    br.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
    )]
    br.open('http://www.zvg-portal.de/index.php?button=Termine%20suchen')
    # Debug: list every form on the page.
    for form in br.forms():
        print "Form name:", form.name
        print form
    br.select_form(name='globe')
    #br.select_form("globe")  # works when form has a name
    #br.form = list(br.forms())[0]  # use when form is unnamed
    # Debug: list every control of the selected form.
    for control in br.form.controls:
        print control
        #print "type=%s, name=%s value=%s" % (control.type, control.name, br[control.name])
    br.form["land_abk"] = "by"  # federal state abbreviation (Bavaria)
    # Submit the search.
    br.submit()
    return br.response().read()
def sifre3():
    """Log in to koditr.org with the stored add-on credentials and return
    the HTML of the members-only 'xman' page.

    On login failure (no "welcome" marker in the response) a Kodi error
    dialog is shown, but the page is still fetched and returned.
    """
    filepath = os.path.join(folders, 'nfo.txt')  # NOTE(review): unused here
    cj = mechanize.CookieJar()
    name = __settings__.getSetting("Name")  # NOTE(review): unused here
    login = __settings__.getSetting("Username")
    password = __settings__.getSetting("password")  # NOTE(review): unused here
    # Prompt the user for credentials if no username is configured yet.
    if not login:
        __settings__.openSettings()
    else:
        pass
    br = mechanize.Browser(factory=mechanize.RobustFactory())
    br.set_cookiejar(cj)
    br.set_handle_equiv(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    # Follow instant meta-refresh redirects only.
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    br.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
    )]
    br.open('http://koditr.org/wp-login.php')
    br.title()
    # WordPress login form is the first form on the page.
    br.select_form(nr=0)
    br.form['log'] = __settings__.getSetting("Username")
    br.form['pwd'] = __settings__.getSetting("password")
    br.submit()
    html2 = br.response().read()
    if "welcome" in html2:
        print "basarili bir login yapildi"
    else:
        dialog = xbmcgui.DialogProgress()  # NOTE(review): created but unused
        dialog1 = xbmcgui.Dialog()
        dialog1.ok(
            '[COLOR red][B]Vip + HATA UYARISI[/B][/COLOR]',
            '[COLOR yellow][B]Silver Uye Olmaniz Gerekiyor!!! Eger Silver Uye Iseniz ve Bu Mesaji Goruyorsaniz[/B][/COLOR]',
            '[COLOR red][B]Yanlis Kullanici adi veya Sifre Girdiniz!!! Lutfen Tekrar Deneyiniz.[/B][/COLOR]'
        )
    # Fetch the member page with the (possibly authenticated) session.
    br.open('http://koditr.org/xman/')
    html = br.response().read()
    return html
def cv_parser(file):
    """Upload the resume at path *file* to the online resume-parser demo,
    extract the parsed text from the response, print it, and write it to
    ./cv_parsed.txt.

    NOTE: *file* shadows the Python 2 builtin; kept for interface
    compatibility with existing callers.
    """
    browser = mechanize.Browser(factory=mechanize.RobustFactory())
    browser.set_handle_robots(False)
    browser.open(
        "http://recruitplushrxmlapidemo.onlineresumeparser.com/Default.aspx")
    browser.select_form(nr=0)
    browser.form.set_all_readonly(False)
    # BUG FIX: the `file` argument was unconditionally overwritten by a
    # hard-coded developer path ('/home/heet/...CV1.pdf'), so callers could
    # never parse their own resume. Honor the caller-supplied path, and
    # open binary ('rb') since the payload is typically a PDF.
    browser.form.add_file(open(file, 'rb'), 'text/plain', file)
    response = browser.submit()
    soup = BeautifulSoup(response.read().decode('utf-8'), 'html.parser')
    extracted_cv = soup.find(id="txtResume")
    text = extracted_cv.get_text().encode('utf-8')
    print(text)
    # BUG FIX: use a context manager so the output file is closed even if
    # the write raises.
    with open("./cv_parsed.txt", "w") as ext_file:
        ext_file.write(text)
def getBrowser(config=None, cookieJar=None):
    """Return a configured mechanize Browser.

    *config* / *cookieJar*, when given, replace the module-level defaults;
    a default LWP cookie jar is created lazily on first use.
    """
    global defaultCookieJar
    global defaultConfig
    # FIX: compare against None with identity ('is'/'is not'), the PEP 8
    # mandated form -- '== None'/'!= None' invoke __eq__ and can misfire
    # on objects with custom equality.
    if config is not None:
        defaultConfig = config
    if cookieJar is not None:
        defaultCookieJar = cookieJar
    if defaultCookieJar is None:
        PixivHelper.GetLogger().info(
            "No default cookie jar available, creating... ")
        defaultCookieJar = cookielib.LWPCookieJar()
    browser = Browser(factory=mechanize.RobustFactory())
    configureBrowser(browser, defaultConfig)
    configureCookie(browser, defaultCookieJar)
    return browser
def robots(): import mechanize # sudo apt-get install python-mechanize import re # regular expressions ( Text Filter ) import sys # to get paramstring for the filename browser = mechanize.Browser(factory=mechanize.RobustFactory()) browser.set_handle_robots(False) browser.open("http://www.magistrix.de/") browser.select_form(nr=1) browser.form["q"] = "Kraftwerk,Roboter" browser.submit() browser.follow_link(text="Die Roboter", nr=0) html = browser.response().readlines() ##### get lyrics ################# start = False lyrics = [] for i in range(0, len(html)): line = html[i] OK = False if '<i>' in line: # find start of the lyrics, here it was "<i>" start = True if "class" in line: # find end of the lyrics , here it wass "class" start = False if '<p>' in line: OK = True if '<br' in line: OK = True if OK == True: if start == True: match = re.search('>[^<>]+', line) # <p>Text</p> --> ">Text" if match: lyrics.append(match.group()[1:]) lyrics.append( '\n\n\nIch hoffe hiermit geholfen zu haben\nund verbleibe mit freundlichen Grüssen\nund schüss' ) #### save Lyrics to Text File ################## Filename = sys.argv[0] Filename = Filename[0:-3] + '-lyrics.txt' SaveFile = open(Filename, 'w') for line in lyrics: SaveFile.write(line + '\n') #print (line) SaveFile.close() print('Lyrics saved to ' + Filename) ### espeak lyrics ########################### os.system('espeak -v de -p0 -s150 -a200 -f "' + Filename + '" 2> /dev/null')
def getDetails(lastname, add, state):
    """Scrape yellowpages.com whitepages results for *lastname* in
    *add* (city/zip) and *state*, paging 10 results at a time until an
    empty page, and return a list of [fullname, address, phone] triples.

    Example: address.getDetails("lastname","city","state","|")
    """
    br = mechanize.Browser(factory=mechanize.RobustFactory())
    # Cookie Jar
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)
    # Don't handle HTTP-EQUIV headers (HTTP headers embedded in HTML).
    br.set_handle_equiv(False)
    # Don't ignore robots.txt (this scraper honors it).
    br.set_handle_robots(True)
    # Add Referer (sic) header.
    br.set_handle_referer(True)
    # Log redirect chains.
    br.set_debug_redirects(True)
    # Log HTTP response bodies (ie. the HTML, most of the time).
    br.set_debug_responses(True)
    # Print HTTP headers.
    #br.set_debug_http(True)
    # Follows refresh 0 but does not hang on refresh > 0.
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=10)
    i = 0            # page index; results start at i * 10
    gonext = True    # keep paging while the last page had results
    details = []
    while gonext:
        urlCreated = 'http://www.yellowpages.com/whitepages?first=&last=' + str(
            lastname) + '&zip=' + str(add) + '&state=' + str(
                state) + '&start=' + str(i * 10)
        # Opens the site to be navigated.
        response = br.open(urlCreated)
        soup = BeautifulSoup(br.response().read())
        allLi = soup.findAll("div", {"class": "phone-result-container"})
        gonext = len(allLi) > 0
        if gonext:
            for item in allLi:
                d = []
                d.append(item.find('a', {"class": 'fullname'}).text)
                d.append(item.find('p', {"class": 'address'}).text)
                d.append(item.find('p', {"class": 'phone'}).text)
                details.append(d)
            i = i + 1
        else:
            print "Processing Complete for", lastname, add, state
    return details
def __init__(self, cur_cfg, cgen):
    """Set up the provider: a mechanize browser (redirects disabled,
    robots.txt ignored) plus provider metadata derived from the
    configured URL.

    :param cur_cfg: provider config dict; must contain 'url'
    :param cgen: general config dict; must contain 'default_timeout'
    """
    self.br = mechanize.Browser(factory=mechanize.RobustFactory())
    self.cj = cookielib.LWPCookieJar()
    self.br.set_cookiejar(self.cj)
    self.br.set_handle_equiv(True)
    self.br.set_handle_redirect(False)  # redirects handled by caller logic
    self.br.set_handle_referer(True)
    self.br.set_handle_robots(False)
    self.br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    self.cur_cfg = cur_cfg
    self.timeout = cgen['default_timeout']
    self.baseURL = self.cur_cfg['url']
    #~ print self.cur_cfg['url']
    # Human-readable provider name: hostname without the 'www.' prefix.
    humanprovider = urlparse.urlparse(self.baseURL).hostname
    self.name = humanprovider.replace("www.", "")
    self.basic_sz = 1024*1024  # 1 MiB base size unit
    #~ self.dologin()
    self.typesrch = 'DSNINIT'  # default search type
def __init__(self):
    """Set up the notepal session: base URL, no proxy, and a mechanize
    browser with robots/referers/meta-equiv off, redirects on, HTTP
    debug logging enabled, and a Firefox-on-Ubuntu user agent."""
    print("Initializing notepal ... ")
    self.url = "https://doqcs.ncbs.res.in/notepal2015"
    self.proxy = None
    self.br = mechanize.Browser(factory=mechanize.RobustFactory())
    cj = cookielib.LWPCookieJar()
    self.br.set_cookiejar(cj)
    self.br.set_handle_equiv(False)
    self.br.set_handle_robots(False)
    self.br.set_handle_referer(False)
    self.br.set_handle_redirect(True)
    self.br.set_debug_redirects(True)   # log redirect chains
    self.br.set_debug_responses(False)
    self.br.set_debug_http(True)        # log raw HTTP headers
    # Follow refresh headers that fire within 2 seconds.
    self.br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(),
                               max_time=2)
    self.br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux 1686; en-US;\
 rv:1.9.0.1) Gecko/201171615 Ubuntu/11.10-1 Firefox/3.0.1')]
def scrape_websites(latitude, longitude, radius):
    """For each stored place within *radius* of (latitude, longitude) that
    has no website yet, Google the place name and save the first organic
    result URL that is not a directory/review site.

    Side effects: mutates and saves Place model rows; sleeps 1-2 s between
    queries to stay polite; prints progress to stdout.
    """
    count = 1
    places = get_places_in_radius(latitude, longitude, radius, False)
    # Only places that still lack a website.
    places = places.filter(website__isnull=True)
    br = mechanize.Browser(factory=mechanize.RobustFactory())
    br.set_handle_robots(False)
    br.set_handle_equiv(False)
    br.addheaders = [('User-agent', 'Mozilla/5.0')]
    for place in places:
        try:
            print count
            # Random 1-2 s pause to avoid hammering Google.
            wt = random.uniform(1, 2)
            time.sleep(wt)
            br.open("http://google.com")
            br.select_form('f')  # Google's search form
            # NOTE(review): no space before "seattle wa" -- query becomes
            # "<name>seattle wa"; confirm this is intended.
            br.form['q'] = remove_non_ascii(place.name) + "seattle wa"
            data = br.submit()
            soup = BeautifulSoup(data.read())
            # Examine the top 3 organic results.
            num = 0
            while num < 3:
                # Google result hrefs are redirects; the real URL is in
                # the 'q' query parameter.
                url = urlparse.parse_qs(
                    urlparse.urlparse(
                        soup.select('.r a')[num]['href']).query)['q'][0]
                # Directory/review/social sites that are not the place's
                # own website.
                strings_to_exclude = [
                    'plus.google', 'yelp', 'facebook', 'urbanspoon',
                    'foursquare', 'zomato', 'tripadvisor', 'allmenus',
                    'thestranger', 'seattlemet', 'thrillist',
                    'seattle.eater', 'yahoo', 'capitolhillseattle',
                    'eventful', 'groupon', 'clubplanet', 'postfastr',
                    'opentable', 'menupix', 'menuism', 'letzgoout',
                    'barmano', '2findlocal', 'whitepages', 'manta',
                    'gigsounds', 'mapquest', 'www.restaurant.com',
                    'nochelatina'
                ]
                if 'http' in url and not any(string in url
                                             for string in
                                             strings_to_exclude):
                    place.website = url
                    place.save()
                    break
                num += 1
            count += 1
        except Exception:
            # Best-effort: log the failure and continue with the next place.
            traceback.print_exc()
    print "Scrape websites successful"
def __init__(self, num, keyword):
    """Initialize one crawler thread: its id, search keyword, a cookied
    mechanize browser forced into HTML mode, and empty work-state fields.

    :param num: numeric identifier for this crawler
    :param keyword: search keyword this crawler works on
    """
    self.num = num
    self.keyword = keyword
    self.br = Browser(factory=mechanize.RobustFactory())
    self.br.set_handle_robots(False)
    # userAgent is a module-level constant.
    self.br.addheaders = [
        ('User-Agent', userAgent),
        ('Accept',
         'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
    ]
    self.cj = mechanize.LWPCookieJar()
    self.br.set_cookiejar(self.cj)
    # Force mechanize to treat every response as HTML (private API).
    self.br._factory.is_html = True
    self.br.set_handle_refresh(False)
    self.idletime = 0
    threading.Thread.__init__(self)
    # Per-task state, filled in while crawling.
    self.url = ""
    self.depth = 0
    self.output = ""
def get_address_from_eircode(eircode):
    """Look up *eircode* on the An Post 'correct address' search page and
    return the matching address as a comma-separated ASCII string, or ""
    when the eircode yields no result.
    """
    browser = mechanize.Browser(factory=mechanize.RobustFactory())
    url = "http://correctaddress.anpost.ie/pages/Search.aspx"
    browser.addheaders = [('User-agent', random_user_agent())]
    browser.open(url)
    browser.select_form(nr=0)
    browser.form.set_all_readonly(False)
    browser["ctl00$body$txtEircode"] = str(eircode)
    # FIX: removed dead code -- an unused first response.read() and an
    # unused browser.form.click() (click() only builds a Request; submit()
    # below actually sends the form).
    response = browser.submit()
    html = response.read()
    soup = BeautifulSoup(html, "html.parser")
    tag = soup.find(id="ctl00_body_hfTextToCopy")
    try:
        value = tag['value']
        address = value.replace("\n", ", ")
        return str(unicodedata.normalize('NFKD', address).encode('ascii', 'ignore'))
    # FIX: bare 'except:' swallowed everything including KeyboardInterrupt.
    # The only expected failures are tag is None (TypeError on subscript)
    # or a missing 'value' attribute (KeyError).
    except (TypeError, KeyError):
        return ""
def GetSaleInfo(url): while True: try: print "Getting Sale Info from " + url useragent = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0" browser = mechanize.Browser(factory=mechanize.RobustFactory()) browser.addheaders = [('User-agent', useragent)] browser.open(url) html = browser.response().read() try: soup = BeautifulSoup(html, "html.parser") except: soup = BeautifulSoup(html) results1 = soup.find("table", {"id": "SaleInfo"}) results2 = soup.find("table", {"id": "OtherInfo"}) results = str(results1) + str(results2) if results is not None: return results except: pass
def build_browser(cookiejar):
    """
    Returns a mechanize.Browser object properly configured:
    given cookie jar attached, redirects/referers/meta-equiv on,
    refresh and robots.txt off, Firefox user agent, and the proxy
    from the module-level PARSER config applied when present.
    """
    browser = mechanize.Browser(factory=mechanize.RobustFactory())
    browser.set_cookiejar(cookiejar)
    #browser.set_handle_gzip(True)
    browser.set_handle_equiv(True)
    browser.set_handle_refresh(False)
    browser.set_handle_redirect(True)
    browser.set_handle_referer(True)
    browser.set_handle_robots(False)
    browser.addheaders = [
        ('User-agent', 'Mozilla/5.0 (Windows; U; Windows '
         'NT 5.1; en-US; rv:1.9.2.10) Gecko/20100914 Firefox/3.6.10')
    ]
    #browser.set_debug_http(True)
    # Apply the same proxy to both http and https when configured.
    if PARSER.has_section('general') and PARSER.has_option('general', 'proxy'):
        proxy = PARSER.get('general', 'proxy')
        if proxy:
            browser.set_proxies({"http": proxy, "https": proxy})
    return browser
def getFreeLinks(
        nisdon_query_link='http://www.nisdon.com/search/label/free?max-results=',
        max_query=200):
    """Fetch the 'free' label listing from nisdon.com (up to *max_query*
    posts) and return the ASCII-encoded thumbnail link of every post."""
    full_link = '%s%d' % (nisdon_query_link, max_query)
    br = mechanize.Browser(factory=mechanize.RobustFactory())
    br.set_handle_robots(False)  # ignore robots.txt
    webpage = br.open(full_link).read()
    soupPage = BeautifulSoup(webpage, 'xml')
    # blog_post_div = soupPage.findAll("div",{"class":"blog-posts hfeed"})
    blog_post = soupPage.findAll("div", {"class": "post-outer"})
    # One link per post: the anchor inside the post's first "thumb" div.
    all_link = [
        post.findAll("div", {"class": "thumb"})[0].find('a')['href'].encode(
            'ascii', 'ignore')
        for post in blog_post
    ]
    return all_link
def __init__(self, username, password, ignore_re=None, retries=3,
             skip_existing=True):
    """Store the upload credentials/options and open ``self.ping_url``
    once to prime the session.

    NOTE(review): ``self.ping_url`` is defined elsewhere (class attribute
    or subclass) -- confirm it exists before __init__ runs.
    """
    self.username = username
    self.password = password
    self.ignore_re = ignore_re        # optional pattern of files to skip
    self.retries = retries            # how many times to retry an upload
    self.skip_existing = skip_existing
    self.br = mechanize.Browser(factory=mechanize.RobustFactory())
    self.br.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
    )]
    self.br.set_handle_refresh(True)
    self.br.set_handle_redirect(True)
    self.br.open(self.ping_url).read()
def start_browser(url=None): br = mechanize.Browser(factory=mechanize.RobustFactory()) cj = cookielib.LWPCookieJar() br.set_cookiejar(cj) br.set_handle_equiv(True) br.set_handle_redirect(True) br.set_handle_referer(True) br.set_handle_robots(False) br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1) br.addheaders = [( 'User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1' )] if not url: br.open('http://denver.craigslist.org') else: br.open(url) print br.title() return br
def browser(honor_time=True, max_time=2, mobile_browser=False,
            user_agent=None, use_robust_parser=False,
            verify_ssl_certificates=True):
    '''
    Create a mechanize browser for web scraping. The browser handles cookies,
    refresh requests and ignores robots.txt. Also uses proxy if available.

    :param honor_time: If True honors pause time in refresh requests
    :param max_time: Maximum time in seconds to wait during a refresh request
    :param verify_ssl_certificates: If false SSL certificates errors are ignored
    '''
    from calibre.utils.browser import Browser
    if use_robust_parser:
        import mechanize
        opener = Browser(factory=mechanize.RobustFactory(),
                         verify_ssl=verify_ssl_certificates)
    else:
        opener = Browser(verify_ssl=verify_ssl_certificates)
    opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time)
    opener.set_handle_robots(False)
    if user_agent is None:
        user_agent = USER_AGENT_MOBILE if mobile_browser else USER_AGENT
    opener.addheaders = [('User-agent', user_agent)]
    # Apply any configured proxies (only truthy entries are forwarded).
    proxies = get_proxies()
    enabled = {scheme: proxies.get(scheme, None)
               for scheme in ('http', 'https')
               if proxies.get(scheme, None)}
    if enabled:
        opener.set_proxies(enabled)
    return opener
def check_indeed(self, title, city):
    """Search indeed.com for *title* jobs in *city* and return the parsed
    results as a jobs dict (via self._create_jobs_dict).

    Scrapes the first results page with the per-page limit raised to 20.
    """
    br = mechanize.Browser(factory=mechanize.RobustFactory())
    br.set_handle_robots(False)
    indeed_url = 'http://www.indeed.com'
    br.open(indeed_url)
    # The search form is the first (unnamed) form on the page.
    br.form = list(br.forms())[0]
    br["q"] = title  # the "What" field
    br["l"] = city   # the "Where" field
    response = br.submit()
    print br.geturl()
    response = br.open(
        br.geturl() + '&limit=20'
    )  # 20 items per page, this is good to keep only relevant items
    print br.geturl()
    response = response.read()
    soup = BeautifulSoup(response)
    titles_soup = soup.findAll("a", attrs={"data-tn-element": "jobTitle"})
    titles = [item.text for item in titles_soup]
    # Job links are relative; prefix the site root.
    urls = [
        'http://www.indeed.com' + item.get('href') for item in titles_soup
    ]
    companies = self._find_field_in_soup(soup, "company")
    locations = self._find_field_in_soup(soup, "location")
    summaries = self._find_field_in_soup(soup, "summary")
    dates = self._find_field_in_soup(soup, "date")
    return self._create_jobs_dict(title=titles,
                                  company=companies,
                                  location=locations,
                                  summary=summaries,
                                  date_posted=dates,
                                  job_url=urls)
def post_data():
    """Search indeed.com for 'Python Developer' jobs in Santa Clara, CA
    by filling in the site's first form, and return the submit response."""
    # Search target and query parameters.
    url_add = "http://www.indeed.com"
    details = {"q": "Python Developer", "l": "Santa Clara, CA"}
    browser = mechanize.Browser(factory=mechanize.RobustFactory())
    browser.set_handle_robots(False)  # ignore robots.txt
    browser.open(url_add)
    # The search form is the first form on the page.
    browser.select_form(nr=0)
    # q = job title, l = location.
    for field in ("q", "l"):
        browser.form[field] = details.get(field, "")
    return browser.submit()