def _hyperlink_conversion(self, ignoretext):
    if not self.has_selection:
        self.inline_call('square_brackets', nomove=True, text="Link text")
        return self.inline_call('parentheses', text="http://www.example.com")
    text = self.cursor.selectedText()
    is_email = validators.email(text)
    is_url = validators.url(text)
    if is_url:
        self.inline_call('square_brackets', text=text)
        return self.inline_call('parentheses', text=text)
    elif is_email:
        self.inline_call('square_brackets', text=text)
        return self.inline_call('parentheses', text='mailto:' + text)
    url_from_partial = 'http://' + text
    if validators.url(url_from_partial):
        self.inline_call('square_brackets')
        self.inline_call('parentheses', text=url_from_partial)
    else:
        self.inline_call('square_brackets', nomove=True)
        self.inline_call('parentheses', text="http://www.example.com")
def put_article():
    '''
    Add new article for a user.
    '''
    username = request.headers.get('x-koala-username')
    apikey = request.headers.get('x-koala-key')
    user = locate_user(username, apikey)
    reqjson = request.get_json()
    result = validators.url(reqjson['url'])
    if not result:
        # try again but with http://
        result = validators.url('http://' + reqjson['url'])
        if not result:
            logging.info("Bad URL: %s" % reqjson['url'])
            abort(400)
        else:
            reqjson['url'] = 'http://' + reqjson['url']
    title = reqjson.get('title', reqjson['url'])
    url = reqjson['url']
    date = str(datetime.now())
    read = False
    favorite = False
    owner = user.id
    article = Article.create(title=title, url=url, date=date, read=read,
                             favorite=favorite, owner=owner)
    return jsonify({'id': article.id}), 201
def test_stackoverflow(self):
    """
    Test the stackoverflow function, which returns user name, description,
    tags and links.
    ...If the url is valid, e.g. https://stackoverflow.com/users/7690738/ashish-cherian,
       it returns username, description, tags and links
    ...If the url is empty: raise ValueError("unknown url type: %r" % self.full_url)
       ValueError: unknown url type: ''
    ...If an invalid url is given: raise URLError(err)
       urllib.error.URLError: <urlopen error [Errno 11001] getaddrinfo failed>
    :return: if test is ok or not
    """
    # validating url
    self.assertTrue(validators.url(self.url1))
    self.assertTrue(validators.url(self.url2))
    # checking if url is a stackoverflow url or not
    self.assertTrue("https://stackoverflow.com/users" in self.url1)
    self.assertTrue("https://stackoverflow.com/users" in self.url2)
    # checking connection to url
    self.assertEqual(urllib.request.urlopen(self.url1).getcode(), 200)
    self.assertEqual(urllib.request.urlopen(self.url2).getcode(), 200)
    # checking if url is not empty
    self.assertNotEqual(self.url1, "")
    self.assertNotEqual(self.url2, "")
    # checking for timeout error
    self.assertTrue(requests.get(self.url1, timeout=10.0))
    self.assertTrue(requests.get(self.url2, timeout=10.0))
    # checking username and description are not empty
    self.assertNotEqual(self.user1, "")
    self.assertNotEqual(self.description1, "")
    self.assertNotEqual(self.user2, "")
    self.assertNotEqual(self.description2, "")
def create_server(self, request):
    user = request.user
    url = str(self.cleaned_data.get('server'))
    servers = Server.objects.filter(url=url)
    if servers:
        self.add_error('server', 'URL already exists')
    else:
        if validators.url('https://' + url) or validators.url('http://' + url):
            server = Server.objects.create(url=url, user=user)
            server.save()
        else:
            self.add_error('server', 'Incorrect URL')
def is_valid_url(self, url):
    if validators.url(url):
        return True
    if url[:8] == "https://":
        # Retry an https URL as plain http
        if validators.url("http://%s" % (url[8:])):
            return True
        return False
    if validators.url("http://%s" % (url)):
        # Scheme-less input: report the prefix that made it valid
        return [True, "http://"]
    return False
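# The helper above mixes bool and list return values, which is easy to misuse.
# A minimal sketch of an alternative with a single return shape, assuming only
# the `validators` package; the name `normalize_url` and the fallback scheme
# are illustrative choices, not part of the original code.
import validators


def normalize_url(url, fallback_scheme="http://"):
    """Return (is_valid, normalized_url); normalized_url is None when invalid."""
    if validators.url(url):
        return True, url
    # Bare hosts such as "example.com" fail validation; retry with a scheme.
    candidate = fallback_scheme + url
    if validators.url(candidate):
        return True, candidate
    return False, None


# e.g. normalize_url("example.com") -> (True, "http://example.com")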
def index():
    if request.method == "POST":
        if 'username' in session:
            url = request.form["url"]
            links = db.links
            if not validators.url(url):
                url = "http://" + url
            if not validators.url(url):
                return render('form.html', error='URL is incorrect')
            else:
                existing_url = links.find_one({'url': url})
                if not existing_url:
                    current_time = str(datetime.now())
                    print current_time
                    print url
                    cur_user = db.users.find_one({'name': session['username']})
                    html = None
                    try:
                        html = urllib2.urlopen(url)
                        html = html.read()
                        soup = bf(html)
                        title = url
                        try:
                            title = soup.find('title').text
                        except Exception:
                            pass
                        db.links.insert({
                            'url': url,
                            'title': title,
                            'author': cur_user['name'],
                            'author_id': cur_user['_id'],
                            'current_time': current_time,
                            'votes': 1
                        })
                        return render('form.html', error="New item is added")
                    except Exception:
                        return render('form.html', error="URL is incorrect")
                else:
                    return render('form.html', error="URL already exists")
        else:
            flash('Please log in')
            return redirect(url_for('login'))
    return render('form.html')
def validate_result(current, default, type):
    """
    Validates the data, whether it needs to be a URL, Twitter link, LinkedIn link, etc.
    """
    if current is None:
        current = ""
    if default is None:
        default = ""
    if type == "URL" and validators.url(current, require_tld=True) \
            and not validators.url(default, require_tld=True):
        return current
    if type == "EMAIL" and validators.email(current) and not validators.email(default):
        return current
    return default
def validated_redirect_uri(uri_param):
    if uri_param is None:
        raise BadRequest("Missing required redirect URI")
    # validators.url() returns a falsy ValidationFailure instead of raising,
    # so check the result rather than relying on an exception.
    if not validators.url(uri_param):
        raise BadRequest("Malformed redirect URI")
    parsed = urlparse(uri_param)
    if parsed.scheme not in ['http', 'https']:
        raise BadRequest("Redirect URI must be http or https")
    return parsed
def ddos_server(self, url, timeout=30):
    if self.ddos_process is not None:
        logging.debug("communicate with siege")
        stdout, stderr = self.ddos_process.communicate()
        logging.debug("siege stdout: %s" % stdout)
        logging.debug("siege stderr: %s" % stderr)
    if url is not None and validators.url(url):
        cmdstr = "timeout -k {longerTimeout}s {longerTimeout}s siege -c 100 -t {timeout} {url}"\
            .format(longerTimeout=timeout + 2, timeout=timeout, url=url)
        logging.debug(cmdstr)
        self.ddos_process = subprocess.Popen(shlex.split(cmdstr))
    else:
        logging.warning("Neither ip nor url was supplied, DDOS failed")
        logging.debug("validators.url(%s) == %s" % (url, validators.url(str(url))))
def __get_potential_page_links(self, page): # Find all the pages href links html = BeautifulSoup(page.text, 'html5lib') for tag in html.find_all('a'): link = tag.get('href').__str__() element = None # If link appears to be a valid url if validators.url(link): # Categorize local and external links if link.startswith(self.site.base_url): element = PotentialUrlLinkElement(link) else: element = ExternalUrlLinkElement(link) # Find email address elif "mailto:" in link: potential_email = re.sub('mailto:', '', link) if validators.email(potential_email): element = EmailAddressElement(potential_email) elif validators.email(link): element = EmailAddressElement(link) # Link didn't match anything valid else: element = MalformedLinkElement(link) # Add new link element to list if it does not already exist if not self.discovered_links.contains(element) and element is not None: self.discovered_links.add_element(element) # Will attempt to create valid links with invalid fragments self.__build_in_malformed()
def is_url(uri):
    result = validators.url(uri)
    if result:
        return True
    else:
        print('It\'s not a valid url: ' + uri)
        exit()
def create_post(request, type_of_post):
    profile = request.user.profile
    content = request.data.get('content')
    media = request.data.get('media')
    post = Post(
        profile=profile,
        content=content,
        media=media,
        type_of_post=type_of_post
    )
    if not post.media:
        # og
        text = post.content
        text = text.split('http')
        if len(text) != 1:
            text = text[1]
            text = text.split(" ")
            url = "http" + str(text[0])
            if validators.url(url):
                try:
                    data = InfoExtractor.PyOpenGraph(url).metadata
                    og = OpenGraph()
                    og.site = data.get('site_name')
                    # if og.site not in ["YouTube", "Vimeo"]:
                    og.title = data.get('title')
                    og.description = data.get('description')
                    og.image = data.get('image')
                    og.link = data.get('url')
                    og.save()
                    post.og = og
                except Exception as e:
                    print str(e)
    post.save()
    return
def feed(request):
    """ View that processes the posted submitted_url """
    log.debug('In Feed View')
    view_response = dict(table_of_content='', error='')
    # get url submitted
    if request.POST.get('feed_url'):
        feed_url = request.POST.get('feed_url')
        if not validators.url(feed_url):
            view_response['error'] = 'Invalid url entered...'
            return view_response
        if 'wikipedia.org/' not in feed_url:
            # check if the url submitted is a wikipedia url
            view_response['error'] = 'Url not a wikipedia domain...'
            return view_response
        # retrieve the html page
        response = urllib2.urlopen(feed_url)
        html = response.read()
        # convert the html to xml-format
        soup = BeautifulSoup(html, "lxml")
        # Get the table content with specific tag and class
        view_response['table_of_content'] = soup.find('div', class_="toc")
    else:
        # return error that no url was entered
        view_response['error'] = 'No url was submitted...'
        return view_response
    if not view_response['table_of_content']:
        # no table of content found from the url
        view_response['error'] = 'No Table of Content Found from the url...'
    return view_response
def urlLinkSpecified(request): if request.method == 'POST': url = request.POST.get("url") if validators.url(url): start_time = time.time() response = requests.get(url) maintype= response.headers['Content-Type'].split(';')[0].lower() if maintype not in ('image/png', 'image/jpeg', 'image/gif'): print("a") return HttpResponse(json.dumps({'data':"Url is not of image type",'url':"/home/ubuntu/DjangoWithCNN/myproject/media/blank_person.png"})) else: img = Image.open(StringIO(response.content)) FileName = str(uuid.uuid1())+".png" Pathname = "/home/ubuntu/DjangoWithCNN/myproject/media/"+FileName width, height = img.size if width > 600: basewidth = 400 wpercent = (basewidth/float(img.size[0])) hsize = int((float(img.size[1])*float(wpercent))) img = img.resize((basewidth,hsize), Image.ANTIALIAS) img.save(Pathname) print("--- %s seconds ---" % (time.time() - start_time)) result = main(Pathname) print(result) return HttpResponse(result) else: return HttpResponse(json.dumps({'data':"Invalid URL",'url':"/home/ubuntu/myproject/media/blank_person.png"}))
def crawl_link(link):
    """
    Crawls a link, and returns a tuple with a urllib response, a success flag,
    and a code describing the success or error

    Input: string -- link to be crawled
    Returns: Tuple(e1, e2, e3) -- e1 = urllib response
                                  e2 = success flag
                                  e3 = success or error code.

    Code Source: https://docs.python.org/3/howto/urllib2.html ('Fetching URLs')
    """
    if not validators.url(link):
        return ("", False, "Invalid link")
    print("Currently handling link: " + link)
    req = urllib.request.Request(link)
    try:
        response = urllib.request.urlopen(req)
    except HTTPError as error:
        return (None, False, error.code)
    except ContentTooShortError as error:
        return (None, False, "ContentTooShortError")
    except URLError as error:
        return (None, False, error.reason)
    else:
        return (response, True, "")
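# A hypothetical caller of crawl_link above, unpacking the
# (response, success_flag, code) tuple it returns; the URL is illustrative.
response, ok, code = crawl_link("https://www.python.org/")
if ok:
    print(response.read()[:200])
else:
    print("Fetch failed:", code)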
def home_addEntry():
    longUrl = request.form['longURl']
    if not validators.url(longUrl):
        return render_template("bad_input.html", title=cfg['general']['title'])
    short = logic.addEntry(longUrl, logic.connect(cfg['general']['sqlite_location']))
    current_url = cfg['web_paths']['shortened'] + short
    return render_template("shortened.html", url=current_url, short=short,
                           title=cfg['general']['title'])
def link_data(request):
    if validators.url(request.data_values):
        request.data_values = str(request.data_values)
    else:
        request.data_values = '#'
    return request
def url(vdomain):
    if validators.url(vdomain):
        return vdomain
    else:
        return False
def load_units(self): """ Load units of the function descriptor content, section 'virtual_deployment_units' """ if 'virtual_deployment_units' not in self.content: log.error("Function id={0} is missing the " "'virtual_deployment_units' section" .format(self.id)) return for vdu in self.content['virtual_deployment_units']: unit = Unit(vdu['id']) self.associate_unit(unit) # Check vm image URLs # only perform a check if vm_image is a URL vdu_image_path = vdu['vm_image'] if validators.url(vdu_image_path): # Check if is URL/URI. try: # Check if the image URL is accessible # within a short time interval requests.head(vdu_image_path, timeout=1) except (requests.Timeout, requests.ConnectionError): evtlog.log("VDU image not found", "Failed to verify the existence of VDU image at" " the address '{0}'. VDU id='{1}'" .format(vdu_image_path, vdu['id']), self.id, 'evt_vnfd_itg_vdu_image_not_found') return True
def collectData(address, internet, fromaddress = ""): global doneAddresses, numberOfAddresses if doneAddresses > numberOfAddresses: return else: doneAddresses+=1 if not validators.url(address): return content = getContentFromSite(address) if content == "": return links = removeHost(getLinksFromContent(content), address) if internet.isStored(address): n = internet.getNode(address) else: n = Node() internet.storeNode(address, n) if fromaddress != "": n.inputs.append(fromaddress) for link in links: if link not in n.outputs: if link not in n.inputs: n.outputs.append(link) collectData(link, internet, address) else: return
def verify_config(owner, sample_config, config, current_key=None): """Verify that config corresponds to sample_config""" import validators def raise_exception(message): raise ValueError('in {} config {}\nsample: {}\nprovided: {}'.format(owner, message, sorted(sample_config.items()), sorted(config.items()))) if isinstance(sample_config, list): if not len(config): raise_exception('empty_list') for element in config: verify_config(owner=owner, sample_config=sample_config[0], config=element, current_key=current_key) elif isinstance(sample_config, dict): for sample_key, sample_value in sample_config.items(): if sample_key not in config: raise_exception('key "{}" is not provided'.format(sample_key)) if config[sample_key] is None: raise_exception('Value of "{}" is empty'.format(sample_key)) verify_config(owner=owner, sample_config=sample_value, config=config[sample_key], current_key=sample_key) else: # from this point config and sample_config start to be simple values if type(sample_config) is str: if sample_config.startswith('http') and validators.url(config) is not True: raise_exception('Key "{}" do not contain valid url: {}'.format(current_key, config)) elif sample_config.startswith('email') and not validators.email(config): raise_exception('Key "{}" do not contain valid email: {}'.format(current_key, config)) elif sample_config.startswith('ipv4') and not validators.ipv4(config): raise_exception('Key "{}" do not contain valid IPv4: {}'.format(current_key, config)) elif sample_config.startswith('int'): try: int(config) except ValueError: raise_exception('Key "{}" do not contain valid int number: {}'.format(current_key, config)) elif type(sample_config) is bool and type(config) is not bool: raise_exception('Key "{}" must be bool: {}'.format(current_key, config))
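# A hypothetical usage sketch for verify_config above (keys and values are
# illustrative, not from the original project): string samples starting with
# "http", "email", "ipv4" or "int" select the matching validators check,
# list samples require a non-empty list, and bool samples require a bool.
sample = {
    'endpoint': 'http://example.com',      # must be a valid URL
    'admin': 'email@example.com',          # must be a valid e-mail
    'server': 'ipv4 address',              # must be a valid IPv4 address
    'workers': 'int',                      # must parse as an int
    'debug': True,                         # must be a bool
    'mirrors': ['http://example.com'],     # non-empty list of valid URLs
}
config = {
    'endpoint': 'https://api.example.org',
    'admin': 'ops@example.org',
    'server': '10.0.0.1',
    'workers': '4',
    'debug': False,
    'mirrors': ['https://mirror.example.org'],
}
verify_config('my-service', sample, config)  # raises ValueError on mismatch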
def __build_in_malformed(self): # Get an unreviewed malformed link element malformed_element = self.__get_unreviewed(MalformedLinkElement) while malformed_element: # If malformed element is blacklisted then don't review it if malformed_element.data in self.malformed_ignored: malformed_element.reviewed = True # If malformed element hasn't been reviewed, review it if not malformed_element.reviewed: potential_url = urljoin(self.site.base_url, malformed_element.data) # If the potential url looks valid if validators.url(potential_url): # Check if its a known potential element new_element = PotentialUrlLinkElement(potential_url) if not self.discovered_links.contains(new_element): self.discovered_links.add_element(new_element) # Hide from report, it's a known potential element malformed_element.hide = True malformed_element.reviewed = True malformed_element = self.__get_unreviewed(MalformedLinkElement)
def validate_account(self,account): """Check a string to see if it exists as the name of an AWS alias. Parameters: account The AWS account alias to validate """ result = { 'accountAlias': None, 'accountId': None, 'signinUri': 'https://' + account + '.signin.aws.amazon.com/', 'exists': False, 'error': None } # Check if the provided account name is a string of numbers (an ID) or not (an alias) if re.match(r'\d{12}',account): result['accountId'] = account else: result['accountAlias'] = account if not validators.url(result['signinUri']): result['error'] = 'Invalid URI' return result try: # Request the sign-in URL and don't allow the redirect request = requests.get(result['signinUri'],allow_redirects=False,timeout=self.requests_timeout) # If we have a redirect, not a 404, we have a valid account alias for AWS if request.status_code == 302: result['exists'] = True except requests.exceptions.RequestException as error: result['error'] = error return result
def spider(url,lvl=1): tld_url = tldextract.extract(url) tld_url = '.'.join(tld_url[:3]) pos = url.rfind('/') outFile = url[pos+1:] print (outFile) response = requests.get(url) #storing all the information including headers in the variable source code if response.status_code == 200: plain_text = response.text #sort source code and store only the plaintext convert_data = BeautifulSoup(plain_text) #converting plain_text to Beautiful Soup object so the library can sort thru it for link in convert_data.findAll('a'): #sorting useful information if link.get('href').find('//') == 0: #address URLs that start with // href = 'https:' + link.get('href') elif validators.url(link.get('href')): #address absolute URLs href = link.get('href') else: #address relative URLs href = url + link.get('href') #Building a clickable url #insertSQL(href, convert_data) print(indent(lvl) +str(lvl) + '. ' +href) #displaying the result back to the user #outData = codecs.open(saveLocation +'\\' +outFile +'.html', 'w', 'utf-8') #outData.write(plain_text) #outData.close() if lvl < max_depth: spider(href, lvl+1)
def clean(self): cleaned_data = super(ProfileDataForm, self).clean() website = cleaned_data['website'] username = cleaned_data['username'] type = cleaned_data['type'] instrument = cleaned_data['instrument'] if website and not validators.url(website): self._errors['website'] = self.error_class( ["Please enter a valid website. For example 'http://makemyband.in'"]) if type == u'Musician': if not instrument: self._errors['instrument'] = self.error_class( ["This field is required"]) try: user = get_user_model().objects.get(username=username) if user and (username != self.user.username): self._errors['username'] = self.error_class( ["Username already exists"]) except: pass return cleaned_data
def add_url():
    if not session.get('logged_in'):
        abort(401)
    url = request.form['url']

    # validate URL
    if not url:
        flash("URL cannot be empty")
        return redirect(url_for('show_urls'))
    if not validators.url(url):
        flash("URL is invalid, a valid one starts with 'http://' or 'https://'")
        return redirect(url_for('show_urls'))

    # insert record
    insert = 'INSERT INTO urls (url) VALUES (?)'
    cur = g.db.cursor()
    cur.execute(insert, [url])
    g.db.commit()

    # get the last record id and encode it
    last_id = cur.lastrowid
    short_url = ShortUrl.encode(last_id)

    # update the record again with short_url
    update = 'UPDATE %s SET %s="%s" WHERE id=%s' \
        % (TABLE, _COL2, short_url, last_id)
    cur.execute(update)
    g.db.commit()
    cur.close()

    flash("Tiny URL was successfully created: " + request.host_url + 'o/' + short_url)
    return redirect(url_for('show_urls'))
def download_file(url, dest):
    if not validators.url(url):
        print("Not a valid image url: {}".format(url))
        return
    response = requests.get(url)
    with open(dest, 'wb') as dest:
        dest.write(response.content)
def normalize(self, creator_email, endpoint_url=None, zip_file=None): """ Normalize the required data irrespective of the source :param creator_email: :param endpoint_url: :param zip_file: :return: """ self.update_status('Normalizing source data') if not endpoint_url and not zip_file: raise Exception('endpoint_url or zip_file is required') if endpoint_url: self.api_link = endpoint_url os.makedirs(self.app_temp_assets) event_info = requests.get(endpoint_url + '/event').json() self.download_event_data() else: unzip(zip_file, self.app_temp_assets) with open(self.get_temp_asset_path('event')) as json_data: event_info = json.load(json_data) event_id = event_info['id'] if os.path.isfile(self.get_temp_asset_path('meta')): with open(self.get_temp_asset_path('meta')) as json_data: meta = json.load(json_data) root_url = meta['root_url'] if root_url: self.api_link = root_url + '/api/v1/events/' + str(event_id) self.event_name = event_info['name'] self.app_name = self.event_name self.creator_email = creator_email self.update_status('Processing background image and logo') background_image = event_info['background_image'].strip() if event_info['background_image'] else '' logo = event_info['logo'].strip() if event_info['logo'] else '' if background_image != '': if background_image.startswith("/"): self.app_background_image = self.get_temp_asset_path(background_image) elif validators.url(background_image): self.app_background_image = self.get_temp_asset_path('background.png') urllib.urlretrieve(background_image, self.app_background_image) if logo != '': if logo.startswith("/"): self.app_launcher_icon = self.get_temp_asset_path(logo) elif validators.url(logo): self.app_launcher_icon = self.get_temp_asset_path('logo.png') urllib.urlretrieve(logo, self.app_launcher_icon)
def tips_process(): error = [] event_name = request.forms.get("event_name") if event_name == "": error.append("error01") category = request.POST.getall("category") if len(category) == 0: error.append("error02") first_day = request.forms.get("first_day") try: datetime.datetime.strptime(first_day, '%Y-%m-%d') except: error.append("error03") last_day = request.forms.get("last_day") try: datetime.datetime.strptime(last_day, '%Y-%m-%d') except: error.append("error04") first_time = request.forms.get("first_time") try: datetime.datetime.strptime(first_time, '%H:%M') except: error.append("error05") last_time = request.forms.get("last_time") try: datetime.datetime.strptime(last_time, '%H:%M') except: error.append("error06") location = request.forms.get("location") if location == "": error.append("error07") adress = request.forms.get("adress") if adress == "": error.append("error08") organizer = request.forms.get("organizer") if organizer == "": error.append("error09") website = request.forms.get("website") if not validators.url(website): error.append("error10") image = request.files.get("image") description = request.forms.get("description") if description == "": error.append("error12") tipster = request.forms.get("tipster") if tipster == "": error.append("error13") tipster_mail = request.forms.get("tipster_mail") if not validators.email(tipster_mail): error.append("error14") if len(error) > 0: redirect("/tips") else: query = ("INSERT INTO event (event_name, first_day, last_day, first_time, last_time, location, adress, organizer, website, image, description, tipster, tipster_mail) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)") cur.execute(query, (event_name, first_day, last_day, first_time, last_time, location, adress, organizer, website, image, description, tipster, tipster_mail)) db.commit() redirect("/tips")
def sumrise(text=text, sentences=5):
    if validators.url(text):
        text = web2text.getwebtxt(text)
    parser = PlaintextParser.from_string(text, Tokenizer('english'))
    summerizer = LsaSummarizer()
    summary = str(summerizer(parser.document, sentences))
    return summary
def validate(self, url):
    if not validators.url(url):
        raise RuntimeError('Invalid URL')
    if sys.getsizeof(url.encode('utf-8')) > 1024:
        raise RuntimeError('Too long input')
import validators
import colorama
import queue
import datetime
from random import randint
from selenium import webdriver
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup

# Colorama setup
colorama.init()

# URL setup
start_url = input("Site to scan: ")
while not validators.url(start_url):
    print("Invalid URL")
    start_url = input("Site to scan: ")
parsed_uri = urlparse(start_url)
hostname = "{uri.scheme}://{uri.netloc}/".format(uri=parsed_uri)

target_url = input("Link to be found on the site: ")
target_url = target_url.rstrip("/")
while not validators.url(target_url):
    print("Invalid URL")
    target_url = input("Link to be found on the site: ")

# Timer setup
start_time = datetime.datetime.now()
def validate_url(url):
    if validators.url(url):
        return True
    else:
        return False
def is_url(strg):
    try:
        return validators.url(strg)
    except ValidationFailure:
        # Note: validators.url() normally *returns* a falsy ValidationFailure
        # rather than raising it, so this handler is only a defensive fallback.
        return False
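# A minimal, hedged illustration (not from the original projects) of how
# validators.url() reports failure: it returns True for a valid URL and a
# *falsy* ValidationFailure object for an invalid one, so wrapping the call
# in bool() is usually enough and no exception handling is needed.
import validators

print(bool(validators.url("https://example.com")))  # True
print(bool(validators.url("not a url")))             # False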
def search(self, search_params, age=0, ep_obj=None): results = [] if not self.login(): return results freeleech = '&free=on' if self.freeleech else '' for mode in search_params: items = [] logger.debug(_("Search Mode: {mode}".format(mode=mode))) for search_string in search_params[mode]: if mode != 'RSS': logger.debug( _("Search String: {search_string}".format( search_string=search_string))) # URL with 50 tv-show results, or max 150 if adjusted in IPTorrents profile search_url = self.urls['search'] % (self.categories, freeleech, search_string) search_url += ';o=seeders' if mode != 'RSS' else '' if self.custom_url: if not validators.url(self.custom_url): logger.warning("Invalid custom url: {0}".format( self.custom_url)) return results search_url = urljoin(self.custom_url, search_url.split(self.url)[1]) data = self.get_url(search_url, returns='text') if not data: continue try: data = re.sub(r'(?im)<button.+?</button>', '', data, 0) with BS4Parser(data, 'html5lib') as html: if not html: logger.debug("No data returned from provider") continue if html.find(text='No Torrents Found!'): logger.debug( "Data returned from provider does not contain any torrents" ) continue torrent_table = html.find('table', id='torrents') torrents = torrent_table('tr') if torrent_table else [] # Continue only if one Release is found if not torrents or len(torrents) < 2: logger.debug( "Data returned from provider does not contain any torrents" ) continue for result in torrents[1:]: try: title = result('td')[1].find('a').text download_url = urljoin( search_url, result('td')[3].find('a')['href']) seeders = int( result.find('td', class_='ac t_seeders').text) leechers = int( result.find('td', class_='ac t_leechers').text) torrent_size = result('td')[5].text size = convert_size(torrent_size) or -1 except (AttributeError, TypeError, KeyError): continue if not all([title, download_url]): continue # Filter unseeded torrent if seeders < self.minseed or leechers < self.minleech: if mode != 'RSS': logger.debug( "Discarding torrent because it doesn't meet the minimum seeders or leechers: {0} (S:{1} L:{2})" .format(title, seeders, leechers)) continue item = { 'title': title, 'link': download_url, 'size': size, 'seeders': seeders, 'leechers': leechers, 'hash': '' } if mode != 'RSS': logger.debug( "Found result: {0} with {1} seeders and {2} leechers" .format(title, seeders, leechers)) items.append(item) except Exception as e: logger.exception( "Failed parsing provider. Error: {0!r}".format(str(e))) logger.exception(traceback.format_exc()) # For each search mode sort all the items by seeders if available items.sort(key=lambda d: try_int(d.get('seeders', 0)), reverse=True) results += items return results
def __init__(self, url):
    self.url = url
    self.is_valid = validators.url(self.url) is True
    self.domain = self.url.split('/')[2] if self.is_valid else None
    self.logo_url = self.__LOGO_API + self.domain if self.domain else None
    self.contents = None
def test_has_image(self):
    d, q = openaccess_cma_search(has_image=1, indent=1, limit=100)
    for val in d['data']:
        self.assertTrue(val['images'] is not None)
        for k, v in val['images'].items():
            self.assertTrue(validators.url(v['url']))
def upload(): # each new "session" has a random case number associated with it # obviously, there is a small chance that case numbers will collide. # In that case, the person who used it second would overwrite the other persons data. # So this is not how it should be in its final version. But it's fine for now. case_num = request.args.get('case_num', None) fileDict = dao.getFileDict(case_num) fileDict['research_question'] = request.form.get('smartsearch') if fileDict['research_question'] is not None and fileDict[ 'research_question'].strip() != '': if validators.url(fileDict['research_question'].strip()): return redirect( url_for('visualize_blueprint.visualize', case_num=case_num) ) # temporary submission for SmartSearch for demo else: return redirect( url_for('smart_search_blueprint.sheetSelect', case_num=case_num) ) # if its not a url take it to smartSearch input # here the use of fileDict is probably more clear # the strings used to index request.files come from the HTML name of the input field # see upload.html files = io_service.storeGSA(request.files.getlist('GSA_Input_map')) fileDict['GSA_Input_SHP'] = files[0] fileDict['GSA_Input_DBF'] = files[1] fileDict['GSA_file_list'] = request.files.getlist('GSA_Input_map') fileDict['NLP_Input_corpus'] = io_service.storeNLP( request.files.getlist('NLP_Input_corpus')) fileDict['NLP_Input_LDP'] = io_service.storefile( request.files.get('NLP_Input_LDP')) fileDict['NLP_Input_Sentiment'] = io_service.storefile( request.files.get('NLP_Input_Sentiment')) fileDict["NLP_INPUT_NER"] = request.form.get("NLP_INPUT_NER") fileDict["NLP_INPUT_IOB"] = request.form.get("NLP_INPUT_IOB") fileDict['SNA_Input'] = io_service.storefile( request.files.get('SNA_Input')) fileDict['GSA_Input'] = io_service.storefile( request.files.get('SGA_Input')) fileDict['research_question'] = request.form.get('research_question') errors = io_service.checkExtensions( case_num ) # helper method to make sure there are no input errors by the user # i.e. if there are errors, we can't proceed so we stay on the upload page if len(errors) > 0: return render_template('upload.html', errors=errors, case_num=case_num) # there are intermediary steps for SNA and NLP analyses if fileDict['SNA_Input']: return redirect(url_for('sna_blueprint.sheetSelect', case_num=case_num)) if fileDict['GSA_Input_SHP']: return redirect( url_for('gsa_blueprint.shp_vars_get', case_num=case_num)) # if a user does both SNA and NLP, as it stands, the NLP intermediary data will never be gotten to. This is a problem. if fileDict['NLP_Input_corpus']: return redirect( url_for('visualize_blueprint.visualize', case_num=case_num)) # if NLP chosen, allow them to pick from the different tools available # do i redirect to another url to choose then save the results then redirect to visualize? # no, just add the radio buttons under the file upload before the hr (in the template) return redirect(url_for('visualize_blueprint.visualize', case_num=case_num))
def load_schema(self, template, reload=False): """ Load schema from a local file or a remote URL. If the same schema was previously loaded and reload=False it will return the schema stored in cache. If reload=True it will force the reload of the schema. :param template: Name of local file or URL to remote schema :param reload: Force the reload, even if it was previously loaded :return: The loaded schema as a dictionary """ # Check if template is already loaded and present in _schemas_library if template in self._schemas_library and not reload: log.debug("Loading previously stored schema for {}" .format(template)) return self._schemas_library[template] # Load Online Schema schema_addr = self._schemas[template]['remote'] if validators.url(schema_addr): try: log.debug("Loading schema '{}' from remote location '{}'" .format(template, schema_addr)) # Load schema from remote source self._schemas_library[template] = \ load_remote_schema(schema_addr) # Update the corresponding local schema file write_local_schema(self._schemas_local_master, self._schemas[template]['local'], self._schemas_library[template]) return self._schemas_library[template] except RequestException as e: log.warning("Could not load schema '{}' from remote " "location '{}', error: {}" .format(template, schema_addr, e)) else: log.warning("Invalid schema URL '{}'".format(schema_addr)) # Load Offline Schema schema_addr = self._schemas[template]['local'] if os.path.isfile(schema_addr): try: log.debug("Loading schema '{}' from local file '{}'" .format(template, schema_addr)) self._schemas_library[template] = \ load_local_schema(schema_addr) return self._schemas_library[template] except FileNotFoundError: log.warning("Could not load schema '{}' from local file '{}'" .format(template, schema_addr)) else: log.warning("Schema file '{}' not found.".format(schema_addr)) log.error("Failed to load schema '{}'".format(template))
def url(self):
    if not validators.url(self._url):
        raise Exception("Invalid url format.")
    return self._url
def validUrl(url):
    return validators.url(url)
def isValidWebsite(web):
    # Assumes `web` is a bare host such as "example.com" (no scheme),
    # since "http://" is always prepended before validation.
    if not validators.url('http://' + web):
        return False
    return True
def Isurl(check_url):
    try:
        return validators.url(check_url)
    except Exception:
        return False
def filter_function(url):
    if url is not None and validators.url(url):
        return True
    return False
def _parse_redirect_map(self, index_soup): """ Given the HTML soup of an index topic extract the redirect mappings from the "Redirects" section. The URLs section should contain a table of "Path" to "Location" mappings (extra markup around this table doesn't matter) e.g.: <h1>Redirects</h1> <details> <summary>Mapping table</summary> <table> <tr><th>Path</th><th>Location</th></tr> <tr> <td>/my-funky-path</td> <td>/cool-page</td> </tr> <tr> <td>/some/other/path</td> <td>https://example.com/cooler-place</td> </tr> </table> </details> This will typically be generated in Discourse from Markdown similar to the following: # Redirects [details=Mapping table] | Path | Path | | -- | -- | | /my-funky-path | /cool-page | | /some/other/path | https://example.com/cooler-place | """ redirect_soup = self._get_section(index_soup, "Redirects") redirect_map = {} warnings = [] if redirect_soup: for row in redirect_soup.select("tr:has(td)"): path_cell = row.select_one("td:first-child") location_cell = row.select_one("td:last-child") if not path_cell or not location_cell: warnings.append( f"Could not parse redirect map {path_cell}" ) continue path = path_cell.text location = location_cell.text if not path.startswith(self.url_prefix): warnings.append(f"Could not parse redirect map for {path}") continue if not ( location.startswith(self.url_prefix) or validators.url(location, public=True) ): warnings.append( f"Redirect map location {location} is invalid" ) continue if path in self.url_map: warnings.append( f"Redirect path {path} clashes with URL map" ) continue redirect_map[path] = location return redirect_map, warnings
def audit_website(website_types, website):
    if not website.startswith('http'):
        website = 'http://' + website
    if not validators.url(website):
        website_types.append(website)
def clean_url(url):
    # TODO:
    if not validators.url(url):
        raise ValueError
    return url
import validators


class URL_Splitter:
    def __init__(self, url):
        self.url = url
        self.sep = self.url.split('/', 3)

    def split(self):
        # Note: these assignments replace the methods with their results,
        # so split() only works reliably the first time it is called.
        self.protocol = self.protocol()
        self.domain = self.domain()
        self.path = self.path()
        return "\nProtocol: {} \nDomain: {} \nPath: {}\n".format(
            self.protocol, self.domain, self.path)

    def protocol(self):
        return self.sep[0][:-1]

    def domain(self):
        return self.sep[2]

    def path(self):
        return self.sep[3]


if __name__ == "__main__":
    ask_url = input("Please input a url to be split: ")
    while validators.url(ask_url) != True:
        ask_url = input("INVALID url, please input another url: ")
    url = URL_Splitter(ask_url)
    print(url.split())
def summarize_text(): if request.method == "POST": print(request.get_json()) mode = request.args.get("mode") n_keywords = request.args.get("n_keywords") raw_text = None url = None raw_text = None if not mode in ["url", "raw_text"]: return make_response( jsonify({"error": "Mode must be one of ['url', 'raw_text']"}), 400) if request.get_json(): if mode == "url" and "url" not in request.get_json(): return make_response(jsonify({"error": "url is required."}), 400) elif mode == "url" and "url" in request.get_json(): url = request.get_json()["url"] if mode == "raw_text" and "raw_text" not in request.get_json(): return make_response( jsonify({"error": "raw_text is required."}), 400) elif mode == "raw_text" and "raw_text" in request.get_json(): raw_text = request.get_json()["raw_text"] if mode == "url" and not validators.url(url): return make_response(jsonify({"error": "url is invalid."}), 400) else: return make_response( jsonify({"error": "url or raw_text is required."}), 400) helpers = AbstrakktHelpers(url=url, raw_text=raw_text) if mode == "url": try: raw_text = helpers.fetch_from_url() except Exception as e: return make_response(jsonify({"error": e}), 400) response = {} try: summarized_text = helpers.gensim_summarize(raw_text) response["summarized_text"] = summarized_text if (len(summarized_text) == 0): return make_response( jsonify({ "error": "An unknown error occurred, could not summarize." }), 500) # Compute reading time in minutes (to 2 d.p) response["original_reading_time"] = round( len(raw_text) / app.config["WORDS_PER_MINUTE"], 2) response["reading_time"] = round( len(summarized_text) / app.config["WORDS_PER_MINUTE"], 2) n_keywords = n_keywords or 7 response["keywords"] = helpers.extract_keywords(raw_text, n=int(n_keywords)) return make_response( jsonify({ "message": "Summarization successful !", **response }), 200) except Exception as e: return make_response(jsonify({"error": str(e)}), 500)
def validate_pdf_file(data: dict, key: str):
    validate_text(data.get('description'), True, key, 'pdf file')
    # Raise when the url is missing or fails validation.
    if not data.get('url') or not url(data['url']):
        raise MissingCredentialsError(
            f"Incorrect pdf url in {key}'s pdf file section")
def search(self, search_strings, age=0, ep_obj=None): # pylint: disable=too-many-branches, too-many-locals, too-many-statements results = [] anime = (self.show and self.show.anime) or (ep_obj and ep_obj.show and ep_obj.show.anime) or False search_params = { "q": "", "field": "seeders", "sorder": "desc", "rss": 1, "category": ("tv", "anime")[anime] } for mode in search_strings: items = [] logger.log("Search Mode: {0}".format(mode), logger.DEBUG) for search_string in search_strings[mode]: search_params["q"] = search_string if mode != "RSS" else "" search_params["field"] = "seeders" if mode != "RSS" else "time_add" if mode != "RSS": logger.log("Search string: {0}".format (search_string.decode("utf-8")), logger.DEBUG) search_url = self.urls["search"] % ("usearch" if mode != "RSS" else search_string) if self.custom_url: if not validators.url(self.custom_url): logger.log("Invalid custom url: {0}".format(self.custom_url), logger.WARNING) return results search_url = urljoin(self.custom_url, search_url.split(self.url)[1]) data = self.get_url(search_url, params=search_params, returns="text") if not data: logger.log("URL did not return results/data, if the results are on the site maybe try a custom url, or a different one", logger.DEBUG) continue if not data.startswith("<?xml"): logger.log("Expected xml but got something else, is your mirror failing?", logger.INFO) continue with BS4Parser(data, "html5lib") as html: for item in html("item"): try: title = item.title.get_text(strip=True) # Use the torcache link kat provides, # unless it is not torcache or we are not using blackhole # because we want to use magnets if connecting direct to client # so that proxies work. download_url = item.enclosure["url"] if sickbeard.TORRENT_METHOD != "blackhole" or "torcache" not in download_url: download_url = item.find("torrent:magneturi").next.replace("CDATA", "").strip("[!]") + self._custom_trackers if not (title and download_url): continue seeders = try_int(item.find("torrent:seeds").get_text(strip=True)) leechers = try_int(item.find("torrent:peers").get_text(strip=True)) # Filter unseeded torrent if seeders < self.minseed or leechers < self.minleech: if mode != "RSS": logger.log("Discarding torrent because it doesn't meet the minimum seeders or leechers: {0} (S:{1} L:{2})".format (title, seeders, leechers), logger.DEBUG) continue verified = bool(try_int(item.find("torrent:verified").get_text(strip=True))) if self.confirmed and not verified: if mode != "RSS": logger.log("Found result " + title + " but that doesn't seem like a verified result so I'm ignoring it", logger.DEBUG) continue torrent_size = item.find("torrent:contentlength").get_text(strip=True) size = convert_size(torrent_size) or -1 info_hash = item.find("torrent:infohash").get_text(strip=True) item = {'title': title, 'link': download_url, 'size': size, 'seeders': seeders, 'leechers': leechers, 'hash': info_hash} if mode != "RSS": logger.log("Found result: {0} with {1} seeders and {2} leechers".format(title, seeders, leechers), logger.DEBUG) items.append(item) except (AttributeError, TypeError, KeyError, ValueError): continue # For each search mode sort all the items by seeders if available items.sort(key=lambda d: try_int(d.get('seeders', 0)), reverse=True) results += items return results
def view_page(): global usrarg #testing purpose test svn test svn usrarg = flask.request.args.get("url") #print(usrarg) urlvalidity = validators.url(usrarg) if (urlvalidity != True): return "Wrong url, please go back" social_list = [ "www.facebook.com", "www.qzone.qq.com", "www.tumblr.com", "www.instagram.com", "www.twitter.com", "www.skype.com", "www.vk.com", "www.linkedin.com", "www.reddit.com" ] urlobj = urlparse(usrarg) #print(urlobj.netloc) if (urlobj.netloc in social_list): #print("URL should not be social network.") return "URL should not be social network, please go back" if (usrarg[-1] != "/"): usrarg = usrarg + "/" req = urllib.request.Request(usrarg) req.add_header('Referer', 'http://www.python.org/') # Customize the default User-Agent header value: req.add_header( 'User-Agent', 'PurdueUniversityClassProject/1.0 ([email protected] https://goo.gl/dk8u5s)' ) open = get_html_at_url(usrarg) #print(usrarg) html = "<base href=" + usrarg + ">" + open #print(html) etree1 = make_etree(html, usrarg) print("------------------------------------------------------------") try: style = flask.request.args["style"] except: style = "" pass try: color = flask.request.args["color"] except: color = "" pass try: mustache = flask.request.args["beard"] except: mustache = "" pass #print(color) ''' if(checked == True): style = "Square" else if() ''' #print(html) #print(etree1) #print("UTIL copy is fine") path = copy_profile_photo_static(etree1, style, color, mustache) #print("path:"+path) #filename = path[len(os.getcwd()):] filename = os.path.basename(path) #print("filename+"+filename) #print(os.getcwd()) #print("oldpath:"+wpo.oldpath) #print("dict:") #print(wpo.url_to_sha) static_url = flask.url_for('static', filename=filename) #print("static_url:"+static_url) #print(type(html)) src = wpo.url_to_sha[wpo.oldpath] before = html[:(html.find(src))] after = html[(html.find(src)) + len(src):] tempname = static_url[1:].split("/")[1] #print(tempname) temp = flask.url_for('static', filename=filename, _external=True) #print(temp) html = before + temp + after #print(html) return html
def login(self): cookie_dict = dict_from_cookiejar(self.session.cookies) if cookie_dict.get('uid') and cookie_dict.get('pass'): return True if self.cookies: success, status = self.add_cookies_from_ui() if not success: logger.info(status) return False login_params = { 'username': self.username, 'password': self.password, 'login': '******' } if self.custom_url: if not validators.url(self.custom_url): logger.warning("Invalid custom url: {0}".format( self.custom_url)) return False # Get the index, redirects to login data = self.get_url(self.custom_url or self.url, returns='text') if not data: logger.warning("Unable to connect to provider") return False with BS4Parser(data, 'html5lib') as html: action = html.find('form', { 'action': re.compile(r'.*login.*') }).get('action') if not action: logger.warning( 'Could not find the login form. Try adding cookies instead' ) return False response = self.get_url(urljoin(self.custom_url or self.url, action), post_data=login_params, returns='text') if not response: logger.warning("Unable to connect to provider") return False # Invalid username and password combination if re.search('Invalid username and password combination', response): logger.warning("Invalid username or password. Check your settings") return False # You tried too often, please try again after 2 hours! if re.search('You tried too often', response): logger.warning( "You tried too often, please try again after 2 hours! Disable IPTorrents for at least 2 hours" ) return False # Captcha! if re.search('Captcha verification failed.', response): logger.warning("Stupid captcha") return False return True
imgs.attrib["src"] = urlparse.urljoin(base_img, imgs.attrib["src"]) except: continue for links in pagina.xpath('//a'): try: if links.attrib["href"].startswith('#'): continue links.attrib["href"] = urlparse.urljoin(base_section, links.attrib["href"]) except: continue archivo = open('out.html', 'w') archivo.write(html.tostring(pagina)) archivo.close() if __name__ == '__main__': argumentos = sys.argv if validators.url(sys.argv[1]): sep(sys.argv[1]) else: if validators.domain(sys.argv[1]): sep('http://' + sys.argv[1]) #busqueda='' #for x in argumentos[1:]: #busqueda=str(x)+'+' #if busqueda=='': #listax() #else: #listax(busqueda)
def is_valid_link(self, link: str) -> bool:
    # validators.url() returns a falsy ValidationFailure on failure,
    # so coerce to a real bool to honour the annotation.
    return bool(validators.url(link))
def validate_bg_image(img: Union[str, None], required: bool, key: str,
                      section: str):
    if (required and not img) or (required and not url(img)):
        raise MissingCredentialsError(
            f"Wrong background image in {key}'s {section} section")
def main(): #prepare user's workarea home = os.path.expanduser("~") if os.path.exists(home + "/Library/Application Support"): #MacOS homegenice = home + "/Library/Application Support/GenIce" else: homegenice = os.path.expanduser(home + "/.genice") #Other unix sys.path.append(homegenice) try: os.makedirs(homegenice + "/lattices") # os.makedirs(homegenice+"/molecules") except: pass #just ignore. options = getoptions() if options.debug: logging.basicConfig(level=logging.DEBUG, format="%(asctime)s %(levelname)s %(message)s") elif options.quiet: logging.basicConfig(level=logging.WARN, format="%(asctime)s %(levelname)s %(message)s") else: #normal logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") logger = logging.getLogger() Nbox = [int(x) for x in options.rep] name = options.name[0] #input must be a file......too bad. if os.path.exists(name): fNameIn = name fNameOut = homegenice + "/lattices/" + os.path.basename(name) if fNameOut[-4:] in (".cif", ".CIF"): fNameOut = fNameOut[:-4] fNameOut += ".py" else: if validators.url(name): URL = name name = os.path.basename(name) if name[-4:] in (".cif", ".CIF"): name = name[:-4] else: URL = "http://www.iza-structure.org/IZA-SC/cif/" + name + ".cif" fNameIn = homegenice + "/lattices/" + name + ".cif" fNameOut = homegenice + "/lattices/" + name + ".py" assert not os.path.exists( fNameIn ) or options.force, "File exists: {0}. Use '--force' option to overwrite.".format( fNameIn) assert validators.url(URL) download(URL, fNameIn) logger.info("Input: {0}".format(fNameIn)) logger.info("Output: {0}".format(fNameOut)) if os.path.exists(fNameOut) and not options.force: logger.error( "File exists: {0}. Use '--force' option to overwrite.".format( fNameOut)) sys.exit(1) atoms, box = read_cif.read_and_process(fNameIn, Nbox, make_rect_box=False) fOut = open(fNameOut, "w") write_py(atoms, box, fOut, matchfunc=lambda x: x[0] != "O")
parser.set_defaults(view=False)
args = parser.parse_args()

# Define Visual Network
visual_sensor = VisualNetwork()
visual_sensor = visual_sensor.cuda()

# Define Temporal Network
lstm = Recog(2048, 512, 128)
lstm = lstm.cuda()

# Define the memory graph
mem = GraphMemory(128)

# Open Video
if validators.url(args.file):
    from pytube import YouTube
    print('Downloading...')
    yt = YouTube(args.file).streams.first()
    f = yt.default_filename
    if not os.path.exists(f):
        yt.download()
    print('f:', f)
else:
    f = args.file

cap = VideoCapture(f)
cap.open()

f1, ax1 = plt.subplots(1, 3)
# f2, ax2 = plt.subplots(2, 1)
def search(self, search_strings, age=0, ep_obj=None): # pylint: disable=too-many-arguments, too-many-locals, too-many-branches, too-many-statements """ Searches indexer using the params in search_strings, either for latest releases, or a string/id search Returns: list of results in dict form """ results = [] if not self._check_auth(): return results if 'gingadaddy' not in self.url: # gingadaddy has no caps. if not self.caps: self.get_newznab_categories(just_caps=True) if not self.caps: return results for mode in search_strings: search_params = { 't': ('search', 'tvsearch')[bool(self.use_tv_search)], 'limit': 100, 'offset': 0, 'cat': self.catIDs.strip(', ') or '5030,5040', 'maxage': sickbeard.USENET_RETENTION } if self.needs_auth and self.key: search_params['apikey'] = self.key if mode != 'RSS': if self.use_tv_search: if 'tvdbid' in str(self.cap_tv_search): search_params['tvdbid'] = ep_obj.show.indexerid if ep_obj.show.air_by_date or ep_obj.show.sports: date_str = str(ep_obj.airdate) search_params['season'] = date_str.partition('-')[0] search_params['ep'] = date_str.partition( '-')[2].replace('-', '/') elif ep_obj.show.is_anime: search_params['ep'] = ep_obj.absolute_number else: search_params['season'] = ep_obj.scene_season search_params['ep'] = ep_obj.scene_episode if mode == 'Season': search_params.pop('ep', '') if self.torznab: search_params.pop('ep', '') search_params.pop('season', '') items = [] logger.log('Search Mode: {0}'.format(mode), logger.DEBUG) for search_string in search_strings[mode]: if mode != 'RSS': logger.log( 'Search string: {0}'.format( search_string.decode('utf-8')), logger.DEBUG) if 'tvdbid' not in search_params: search_params['q'] = search_string time.sleep(cpu_presets[sickbeard.CPU_PRESET]) data = self.get_url(urljoin(self.url, 'api'), params=search_params, returns='text') if not data: break with BS4Parser(data, 'html5lib') as html: if not self._check_auth_from_data(html): break # try: # self.torznab = 'xmlns:torznab' in html.rss.attrs # except AttributeError: # self.torznab = False for item in html('item'): try: title = item.title.get_text(strip=True) download_url = None if item.link: if validators.url( item.link.get_text(strip=True)): download_url = item.link.get_text( strip=True) elif validators.url(item.link.next.strip()): download_url = item.link.next.strip() if (not download_url, item.enclosure and validators.url( item.enclosure.get('url', '').strip())): download_url = item.enclosure.get('url', '').strip() if not (title and download_url): continue seeders = leechers = None if 'gingadaddy' in self.url: size_regex = re.search(r'\d*.?\d* [KMGT]B', str(item.description)) item_size = size_regex.group( ) if size_regex else -1 else: item_size = item.size.get_text( strip=True) if item.size else -1 for attr in item.find_all( ['newznab:attr', 'torznab:attr']): item_size = attr['value'] if attr[ 'name'] == 'size' else item_size seeders = try_int( attr['value'] ) if attr['name'] == 'seeders' else seeders leechers = try_int( attr['value'] ) if attr['name'] == 'peers' else leechers if not item_size or (self.torznab and (seeders is None or leechers is None)): continue size = convert_size(item_size) or -1 result = { 'title': title, 'link': download_url, 'size': size, 'seeders': seeders, 'leechers': leechers } items.append(result) except StandardError: continue # Since we aren't using the search string, # break out of the search string loop if 'tvdbid' in search_params: break if self.torznab: results.sort(key=lambda d: try_int(d.get('seeders', 0)), reverse=True) results += 
items return results
def search(self, search_strings, age=0, ep_obj=None): # pylint: disable=too-many-locals, too-many-branches, too-many-statements results = [] """ 205 = SD, 208 = HD, 200 = All Videos https://pirateproxy.pl/s/?q=Game of Thrones&type=search&orderby=7&page=0&category=200 """ # oder_by is 7 in browse for seeders, but 8 in search! search_params = { "q": "", "type": "search", "orderby": 8, "page": 0, "category": 200 } # Units units = ["B", "KIB", "MIB", "GIB"] def process_column_header(th): text = "" if th.a: text = th.a.get_text(strip=True) if not text: text = th.get_text(strip=True) return text for mode in search_strings: items = [] logger.log("Search Mode: {0}".format(mode), logger.DEBUG) for search_string in search_strings[mode]: search_urls = (self.urls["search"], self.urls["rss"])[mode == "RSS"] if not isinstance(search_urls, list): search_urls = [search_urls] for search_url in search_urls: if self.custom_url: if not validators.url(self.custom_url): logger.log( "Invalid custom url: {0}".format( self.custom_url), logger.WARNING) return results search_url = urljoin(self.custom_url, search_url.split(self.url)[1]) if mode != "RSS": search_params["q"] = search_string logger.log( "Search string: {}".format( search_string.decode("utf-8")), logger.DEBUG) # Prevents a 302 redirect, since there is always a 301 from .se to the best mirror having an extra # redirect is excessive on the provider and spams the debug log unnecessarily search_url, params = self.convert_url( search_url, search_params) data = self.get_url(search_url, params=params, returns="text") else: data = self.get_url(search_url, returns="text") if not data: logger.log( "URL did not return data, maybe try a custom url, or a different one", logger.DEBUG) continue with BS4Parser(data, "html5lib") as html: torrent_table = html.find("table", id="searchResult") torrent_rows = torrent_table( "tr") if torrent_table else [] # Continue only if at least one Release is found if len(torrent_rows) < 2: logger.log( "Data returned from provider does not contain any torrents", logger.DEBUG) continue labels = [ process_column_header(label) for label in torrent_rows[0]("th") ] # Skip column headers for result in torrent_rows[1:]: try: cells = result("td") # Funky js on page messing up titles, this fixes that title = result.find( class_="detLink")['title'].split( 'Details for ', 1)[-1] download_url = result.find( title="Download this torrent using magnet" )["href"] + self._custom_trackers if not self.magnet_regex.match(download_url): logger.log( "Got an invalid magnet: {0}".format( download_url)) logger.log( "Invalid ThePirateBay proxy please try another one", logger.DEBUG) continue if not all([title, download_url]): continue seeders = try_int( cells[labels.index("SE")].get_text( strip=True)) leechers = try_int( cells[labels.index("LE")].get_text( strip=True)) # Filter unseeded torrent if seeders < self.minseed or leechers < self.minleech: if mode != "RSS": logger.log( "Discarding torrent because it doesn't meet the minimum seeders or leechers: {0} (S:{1} L:{2})" .format(title, seeders, leechers), logger.DEBUG) continue # Accept Torrent only from Good People for every Episode Search if self.confirmed and not result.find( alt=re.compile(r"VIP|Trusted")): if mode != "RSS": logger.log( "Found result: {0} but that doesn't seem like a trusted result so I'm ignoring it" .format(title), logger.DEBUG) continue # Convert size after all possible skip scenarios torrent_size = re.sub( r".*Size ([\d.]+).+([KMGT]iB).*", r"\1 \2", result.find(class_="detDesc").get_text( 
strip=True)) size = convert_size(torrent_size, units=units) or -1 item = { 'title': title, 'link': download_url, 'size': size, 'seeders': seeders, 'leechers': leechers, 'hash': '' } if mode != "RSS": logger.log( "Found result: {0} with {1} seeders and {2} leechers" .format(title, seeders, leechers), logger.DEBUG) items.append(item) except StandardError: continue # For each search mode sort all the items by seeders if available items.sort(key=lambda d: try_int(d.get('seeders', 0)), reverse=True) results += items return results