Exemple #1
0
 def extract_feed_rss(self):
     """Collect candidate feed URLs from the backup HTML tree.

     Three strategies are tried in order, each one only when the previous
     ones found nothing:

     1. ``<link rel="alternate">`` tags typed as RSS or Atom; the tag's
        ``title`` attribute is stored as the value.
     2. Anchors whose ``href`` contains ``.xml`` and whose file type, as
        reported by ``utils.get_filetype_from_url``, is ``xml``.
     3. Anchors whose ``href`` contains ``rss`` or ``feed``.

     Returns a dict mapping each feed URL to its title ('' for the two
     anchor-based strategies).
     """
     explorer = self._tree_explorer
     tree = self._backup_html_tree
     feeds = {}

     # Strategy 1: feeds declared in <link> tags.
     declared = explorer.xpath(
         tree,
         '//link[@type="application/rss+xml" and @rel="alternate"] | //link[@type="application/atom+xml" and @rel="alternate"]'
     )
     for node in declared:
         title = explorer.get_attribute(node, attr='title')
         feeds[explorer.get_attribute(node, attr='href')] = title

     # Strategy 2: anchors pointing at an .xml file.
     if not feeds:
         for node in explorer.xpath(tree, "//a[contains(@href, '.xml')]"):
             href = explorer.get_attribute(node, attr='href')
             if utils.get_filetype_from_url(href) == 'xml':
                 feeds[href] = ''

     # Strategy 3: anchors whose target mentions "rss" or "feed".
     if not feeds:
         anchors = explorer.xpath(
             tree,
             "//a[contains(@href, 'rss')] | //a[contains(@href, 'feed')]")
         for node in anchors:
             href = explorer.get_attribute(node, attr='href')
             if utils.is_valid_url(href):
                 # A link back to the page itself is not a feed.
                 if not utils.are_equals_urls(href, self._url):
                     feeds[href] = ''
                 continue
             # Not valid on its own: retry with the page URL as prefix.
             final_url = '%s/%s' % (self._url, href)
             if utils.is_valid_url(final_url):
                 feeds[final_url] = ''
     return feeds
Exemple #2
0
def find_rss_path(html, url):
    """Return the URL of the first plausible RSS/Atom feed linked from *html*.

    ``<link>`` tags whose ``type`` mentions ``rss`` or ``atom`` are
    preferred; when the page has none, anchors whose ``href`` mentions
    ``feed`` or ``rss`` are used instead. Targets that contain "feedback"
    or end in a document extension (pdf, xls, xlsx, doc, docx) are skipped.

    :param html: markup of the page to inspect
    :param url: URL of the page, used as prefix for relative targets
    :return: the feed URL, or ``None`` when no candidate is found
    """
    soup = BeautifulSoup(html, 'html.parser')

    candidates = soup.find_all('link', {'type': re.compile('rss|atom')})
    if not candidates:
        candidates = soup.find_all('a', {'href': re.compile('feed|rss')})

    # The dot is escaped so that only real file extensions are rejected,
    # not any path that merely ends in e.g. "xpdf".
    invalid_rss_path = re.compile(r'(feedback)|(\.(pdf|xlsx|xls|docx|doc)$)',
                                  re.IGNORECASE)

    for tag in candidates:
        # <link> tags are matched on their type and may carry no href at
        # all; an empty href cannot be resolved either.
        href = tag.get('href')
        if not href or invalid_rss_path.search(href) is not None:
            continue

        href = href.replace(' ', '%20')
        if is_valid_url(href):
            return href
        if href[0] != '/':
            return f'{url}/{href}'
        return url + href

    return None
Exemple #3
0
    def post(self):
        """Store a new Buffr built from the submitted form and queue its check.

        Visitors without a usable user object are redirected to the login
        page. Otherwise the form fields ``apiAddress``, ``apiName``,
        ``APIUnstable`` and ``updateInterval`` populate a ``Buffr`` entity,
        which is saved; memcache is flushed, a ``/confirm_working_url`` task
        is enqueued for the new entity and ``addbuffer.html`` is rendered.
        """
        user = users.get_current_user()
        if not user or 'user_id' not in dir(user):
            self.redirect(users.create_login_url('/addbuffr'))
            # Nothing below can work without a user (user.user_id() would
            # fail), so stop once the redirect has been issued.
            return
        apiAddress = self.request.get('apiAddress')
        to_console = {}
        to_console["apiAddress"] = apiAddress
        to_console["is_valid_url(apiAddress)"] = (is_valid_url(apiAddress) is not None)

        buffr_instance = Buffr()
        buffr_instance.apiName = self.request.get('apiName')
        buffr_instance.apiAddress = apiAddress
        APIUnstable = self.request.get('APIUnstable')
        # NOTE(review): if request.get() returns a string here, this test
        # never matches and the flag is always stored as False - confirm.
        if APIUnstable not in [True, False]:
            buffr_instance.APIUnstable = False
        else:
            buffr_instance.APIUnstable = APIUnstable
        buffr_instance.user_id = user.user_id()
        buffr_instance.user_email = user.email()
        buffr_instance.update_interval = int(self.request.get('updateInterval'))
        # Pick the human-readable label whose interval matches the request.
        for possibility in user_readable_convertion_table:
            logging.info(str((possibility[0], buffr_instance.update_interval)))
            if int(possibility[0]) == buffr_instance.update_interval:
                buffr_instance.user_readable_update_interval = possibility[2]
        # The end point is derived from the user id and the API address.
        buffr_instance.end_point = hashlib.md5('%s:%s' % (user.user_id(), apiAddress)).hexdigest()
        buffr_instance.last_known_data = None
        buffr_instance.buffr_version = current_api_version
        buffr_instance.put()
        memcache.flush_all()
        logging.info('Added new Buffr to datastore')
        taskqueue.add(url='/confirm_working_url', params={'key': buffr_instance.key()})
        render(self, 'addbuffer.html', {'to_console': to_console,
                                        'submitted': True,
                                        'apiAddress': apiAddress})
def svg_to_any(elem, doc):
    """
    Convert a local SVG image to the format configured for the output.

    Only ``Image`` elements whose url is not a web URL and whose guessed
    MIME type is ``image/svg+xml`` are handled. ``FMT_OPTIONS`` maps the
    document format to an ``(inkscape flag, file extension)`` pair; formats
    without an entry are left untouched. The conversion is re-run only
    when the target is missing or older than the source, and the element
    is then pointed at the converted file.
    """
    if not isinstance(elem, Image):
        return

    # We don't want urls, you have to download them first
    if is_valid_url(elem.url):
        return

    mimet, _ = mimetypes.guess_type(elem.url)
    # The default keeps the unpacking from failing on the None that a bare
    # .get() returns for an output format without an entry.
    flag, file_ext = FMT_OPTIONS.get(doc.format, (None, None))
    if mimet != 'image/svg+xml' or not flag:
        return

    base_name, _ = os.path.splitext(elem.url)
    target_name = base_name + "." + file_ext
    try:
        mtime = os.path.getmtime(target_name)
    except OSError:
        # No converted file yet: force the conversion.
        mtime = -1
    if mtime < os.path.getmtime(elem.url):
        cmd_line = ['inkscape', flag, target_name, elem.url]
        sys.stderr.write("Running %s\n" % " ".join(cmd_line))
        subprocess.call(cmd_line, stdout=sys.stderr.fileno())
    elem.url = target_name
def download_image(elem, _):
    """
    Download an image from the web

    ``Image`` elements whose url is a valid web URL are saved under
    ``IMAGEDIR`` (created on demand) using the last path component as the
    file name, and the element is pointed at the local copy. A file that
    already exists is reused without downloading again. Download errors
    are logged and leave the element unchanged.
    """
    from urllib.error import URLError

    if not isinstance(elem, Image):
        return

    result = is_valid_url(elem.url)
    if not result: # not a valid url, return
        return

    file_name = unquote(result.path).split('/')[-1]
    if not file_name:
        # A path ending in '/' yields no name; joining it would make the
        # download target IMAGEDIR itself.
        logging.warning('No file name in %s, image not downloaded', elem.url)
        return
    full_path = os.path.join(IMAGEDIR, file_name)

    if os.path.isfile(full_path):
        elem.url = full_path
    else:
        try:
            os.mkdir(IMAGEDIR)
            sys.stderr.write('Created directory ' + IMAGEDIR + '\n')
        except OSError:
            # Most commonly the directory already exists.
            pass

        try:
            with urlopen(elem.url) as response, open(full_path, 'wb') as out_file:
                shutil.copyfileobj(response, out_file)
                elem.url = full_path
        except HTTPError as err:
            logging.warning('HTTP error %s %d %s', elem.url, err.code, err.reason)
        except URLError as err:
            # Raised without a status code, e.g. when the host is unreachable.
            logging.warning('URL error %s %s', elem.url, err.reason)
def pdf_to_svg(elem, doc):
    """
    Replace a local PDF image with an SVG rendering made by inkscape

    Elements that are not ``Image`` objects, that point at a web URL, or
    whose guessed MIME type is not ``application/pdf`` are left untouched.
    The conversion runs only when the SVG is missing or older than the PDF.
    """
    # Remote images are skipped: they have to be downloaded first.
    if not isinstance(elem, Image) or is_valid_url(elem.url):
        return

    source = elem.url
    if mimetypes.guess_type(source)[0] != 'application/pdf':
        return

    target_name = os.path.splitext(source)[0] + ".svg"
    try:
        converted_at = os.path.getmtime(target_name)
    except OSError:
        # Target does not exist yet, so it counts as older than any source.
        converted_at = -1

    if converted_at < os.path.getmtime(source):
        cmd_line = ['inkscape', '--export-plain-svg', target_name, source]
        sys.stderr.write("Running %s\n" % " ".join(cmd_line))
        subprocess.call(cmd_line, stdout=sys.stderr.fileno())
    elem.url = target_name
    def get_har(
            self,
            remove_domain_request=True,
            domains_to_remove={
                'facebook.com', 'facebook.it', 'youtube.it', 'youtube.com',
                'twitter.it', 'twitter.com'
            },
            file_type_to_remove={'jpg', 'png', 'jpeg'}):
        """Return the distinct URLs of the responses recorded in the logs.

        Each entry of ``self.logs`` carries a JSON ``message``; only
        ``Network.responseReceived`` messages with a valid response URL are
        considered. A URL is dropped when it belongs to the current page's
        domain (if ``remove_domain_request`` is set), when its file type is
        in ``file_type_to_remove``, or when it belongs to one of
        ``domains_to_remove``. Nothing is collected unless both
        ``self.logging`` and ``self.logs`` are truthy.
        """
        collected = []
        if self.logging and self.logs:
            own_domain = (utils.get_domain(self.current_url)
                          if remove_domain_request else None)
            for entry in self.logs:
                message = json.load(StringIO(entry['message']))['message']
                if 'method' not in message:
                    continue
                if message['method'] != 'Network.responseReceived':
                    continue

                url = message['params']['response']['url']
                if not utils.is_valid_url(url):
                    continue

                # Keep the URL when no domain filter applies or when it
                # points outside the current domain.
                keep = (own_domain and
                        not utils.is_domain_link(url, own_domain)
                        ) or own_domain is None
                keep = keep and utils.get_filetype_from_url(
                    url) not in file_type_to_remove
                if not keep:
                    continue

                if any(utils.is_domain_link(url, blocked)
                       for blocked in domains_to_remove):
                    continue
                collected.append(url)

        # De-duplicate; the order of the result is not preserved.
        return list(set(collected))
Exemple #8
0
def download_album(host, url, name, dest=".", delim=" - ", digits=3, number=1):
    """Download an album by delegating to the downloader for *host*.

    Exits the process with status 1 when *url* is not valid. The host and
    album name are lower-cased before use; an unknown host only prints an
    error message.
    """
    if not is_valid_url(url):
        sys.exit(1)

    host = host.lower()
    name = name.lower()

    # Supported hosts and the function that handles each of them.
    downloaders = {
        "imagebam": imagebam,
        "imagevenue": imagevenue,
        "imgbox": imgbox,
        "imgur": imgur,
        "someimage": someimage,
        "upix": upix,
        "hotflick": hotflick,
        "myceleb": myceleb,
        "mangastream": mangastream,
    }
    downloader = downloaders.get(host)
    if downloader is None:
        print("ERROR: Unsupported image host '{}'".format(host))
        return
    downloader(url, name, dest, delim, digits, number)
Exemple #9
0
def crawl(url_list):
    """Breadth-first crawl starting from url_list, dumping tables as JSON.

    Every queued URL is appended to the ourcampaigns.com domain prefix and
    fetched through html_to_json. Each table of the result, except
    'LastGeneralElection' and 'PrimaryOtherSchedule', is written to
    JSON_DIR as '<description>_<uid>_<CamelTitle>.json'. Links found in the
    table cells are queued when they are valid and not seen before.

    url_list: initial URLs; they are concatenated to the domain prefix, so
    they are presumably relative paths - TODO confirm.
    """
    q = deque(url_list)
    processed = set(url_list)
    domain = 'http://www.ourcampaigns.com/'
    count = 0
    sw = StopWatch()
    while q:
        current_url = q.popleft()
        result = html_to_json(domain + current_url)
        if result is None:
            print '  skip', current_url
            continue
        category, uid = tokenize(current_url)

        # Build a file-name prefix that depends on the page category.
        # NOTE(review): description is only assigned for these three
        # categories; any other category reuses the value of the previous
        # iteration (or raises NameError on the first one) - confirm that
        # tokenize() cannot return anything else.
        if category == 'race':
            components = result['RACE DETAILS']['Parents'][0]['text'].split(
                '>')
            if len(components) <= 2:
                print '  Bad', components, current_url
                continue
            # Only races whose second parent component is 'United States'.
            if components[1].strip() != 'United States':
                continue
            position = campactify(components[-2] + components[-1])
            # The year is the last comma-separated piece of the text that
            # precedes the first '-' in 'Term Start'.
            year = int(result['RACE DETAILS']['Term Start'][0]['text'].split(
                '-')[0].split(',')[-1].strip())
            if year > 2017 or year < 1900:
                continue
            description = 'race_{}_{}'.format(position, year)
        elif category == 'candidate':
            name = campactify(result['CANDIDATE DETAILS']['Name'][0]['text'])
            description = 'candidate_{}'.format(name)
        elif category == 'container':
            name = campactify(result['INCUMBENT']['Name'][0]['text'])
            year = result['INCUMBENT']['Won'][0]['text'].split('/')[-1].strip()
            description = 'container_{}_{}'.format(name, year)

        # Progress report every 500 pages that reached this point.
        count += 1
        if count % 500 == 0:
            print '{}, crawling {}'.format(count, description)

        for table_title, table in result.iteritems():
            camel_title = to_camel(table_title)
            if camel_title not in [
                    'LastGeneralElection', 'PrimaryOtherSchedule'
            ]:
                with open(
                        os.path.join(
                            JSON_DIR,
                            '{}_{}_{}.json'.format(description, uid,
                                                   camel_title)), 'wb') as fp:
                    json.dump(table, fp)
            # Race pages are saved, but their links are followed only when
            # the description mentions 'Governor' or 'Mayor'.
            if category == 'race' and 'Governor' not in description and 'Mayor' not in description:
                continue
            for row_title, row in table.iteritems():
                for cell in row:
                    link = cell['link']
                    if link not in processed and is_valid_url(link):
                        q.append(link)
                        processed.add(link)
    sw.tic('crawl {} urls'.format(count))
Exemple #10
0
 async def remove_award(self, ctx, user: UserConverter, url: str):
     '''
     !remove_award @user url
     [Admin only] Removes an award from a user
     '''
     # Reject anything that is not a URL before touching the bot state.
     if not is_valid_url(url):
         raise InvalidUrl()
     await self.bot.remove_award(ctx, user, url)
     await ctx.send('Done.')
Exemple #11
0
 async def set_award(self, ctx, url: str):
     '''
     !set_award url
     [Admin only] Sets an award for current challenge
     '''
     if is_valid_url(url):
         # Delegate to the bot, then confirm in the channel.
         await self.bot.set_award(ctx, url)
         await ctx.send('Done.')
         return
     raise InvalidUrl()
 def generate_features(self, url_info, svm_feature=False):
     # preprocess the url_info
     self.features = []
     if is_valid_url(url_info[0]):
         print url_info[0], len(url_info[0])
         for feature_obj in self.pipeline:
             self.features.append( feature_obj.extract(url_info, svm_feature) )
     else:
         self.features = []
Exemple #13
0
def crawl(url):
    """Breadth-first crawl starting from url, dumping every table as JSON.

    The set of already-processed links is loaded from 'processed.txt'
    (which is opened for reading, so it has to exist) and written back
    when the queue is exhausted. Each table of a fetched page, except
    'LastGeneralElection' and 'PrimaryOtherSchedule', is written to
    'data/<description>_<uid>_<CamelTitle>.json'. Valid links that were
    not processed before are queued.
    """
    q = deque([url])
    with open('processed.txt', 'rb') as fp:
        processed = set(fp.read().split())
    processed.add(url)
    domain = 'http://www.ourcampaigns.com/'
    while q:
        current_url = q.popleft()
        # Work with the part after the domain so that absolute and
        # relative links are handled the same way.
        if current_url.startswith(domain):
            current_url = current_url[len(domain):]
        result = html_to_json(domain + current_url)
        if result is None:
            print '  skip', current_url
            continue
        category, uid = tokenize(current_url)

        # Build a file-name prefix that depends on the page category.
        # NOTE(review): description is only assigned for these three
        # categories; any other category reuses the value of the previous
        # iteration (or raises NameError on the first one) - confirm that
        # tokenize() cannot return anything else.
        if category == 'race':
            components = result['RACE DETAILS']['Parents'][0]['text'].split(
                '>')
            if len(components) <= 2:
                print '  Bad', components, current_url
                continue
            # Only races whose second parent component is 'United States'.
            if components[1].strip() != 'United States':
                continue
            position = campactify(components[-2] + components[-1])
            # The year is the last comma-separated piece of the text that
            # precedes the first '-' in 'Term Start'.
            year = int(result['RACE DETAILS']['Term Start'][0]['text'].split(
                '-')[0].split(',')[-1].strip())
            if year > 2016 or year < 1950:
                continue
            description = 'race_{}_{}'.format(position, year)
        elif category == 'candidate':
            name = campactify(result['CANDIDATE DETAILS']['Name'][0]['text'])
            description = 'candidate_{}'.format(name)
        elif category == 'container':
            name = campactify(result['INCUMBENT']['Name'][0]['text'])
            year = result['INCUMBENT']['Won'][0]['text'].split('/')[-1].strip()
            description = 'container_{}_{}'.format(name, year)
        # print '    ' + description, current_url
        for table_title, table in result.iteritems():
            camel_title = to_camel(table_title)
            if camel_title not in [
                    'LastGeneralElection', 'PrimaryOtherSchedule'
            ]:
                with open(
                        'data/{}_{}_{}.json'.format(description, uid,
                                                    camel_title), 'wb') as fp:
                    json.dump(table, fp)
            # Race pages are saved, but their links are followed only when
            # the description mentions 'Governor'.
            if category == 'race' and 'Governor' not in description:
                continue
            for row_title, row in table.iteritems():
                for cell in row:
                    link = cell['link']
                    if is_valid_url(link) and link not in processed:
                        q.append(link)
                        processed.add(link)
    # Persist the visited set for the next run.
    with open('processed.txt', 'wb') as fp:
        fp.write('\n'.join(processed))
Exemple #14
0
 def on_new_url(self, request):
     """Show the new-URL form, or store a submitted URL and redirect.

     A POST with a valid ``url`` field is stored through ``insert_url`` and
     answered with a redirect to ``/<id>_details``. Any other request
     renders ``new_url.html``, with an error message after an invalid POST.
     """
     if request.method != "POST":
         return self.render_template("new_url.html", error=None)

     url = request.form["url"]
     if is_valid_url(url):
         return redirect("/%s_details" % insert_url(self.redis, url))
     return self.render_template("new_url.html", error="URL is not valide")
Exemple #15
0
 def test_is_valid_url(self):
     """Check is_valid_url against a table of (input, expected) pairs."""
     expectations = (
         ('https://google.com', True),
         ('https://www.google.com/search?q=the+simpsons', True),
         ('https://google.co', False),
         ('//google.com', False),
         ('google.com', False),
         ('google', False),
         ('', False),
     )
     for candidate, expected in expectations:
         self.assertEqual(is_valid_url(candidate), expected)
Exemple #16
0
 def test_is_valid_url(self):
     """is_valid_url returns True for the accepted URLs, False otherwise."""
     accepted = (
         'https://google.com',
         'https://www.google.com/search?q=the+simpsons',
     )
     rejected = (
         'https://google.co',
         '//google.com',
         'google.com',
         'google',
         '',
     )
     for candidate in accepted:
         self.assertEqual(is_valid_url(candidate), True)
     for candidate in rejected:
         self.assertEqual(is_valid_url(candidate), False)
Exemple #17
0
 def _retrieve_outbound_links(self):
     """Return the distinct, cleaned URLs in the body that leave the site.

     Elements are selected when they have an ``href`` that does not contain
     the principal domain of ``self._url``; only valid URLs are kept, and
     each is passed through ``utils.clean_url`` before de-duplication.
     """
     own_domain = utils.get_principal_domain(self._url)
     xpath_query = "//*[@href and not(@href [contains(., '%s')])]" % own_domain

     # A dict is used as an insertion-ordered set of cleaned URLs.
     seen = dict()
     for node in self._tree_explorer.xpath(self.body_node, xpath_query):
         target = node.attrib['href']
         if not utils.is_valid_url(target):
             continue
         seen.setdefault(utils.clean_url(target), '')
     return list(seen)
    def add_url(self, request):
        """Show the new-URL form, or store a submitted URL and redirect.

        A POST with a valid ``url`` field is stored through
        ``self.insert_url`` and answered with a redirect to ``/<id>+``.
        Otherwise ``new_url.html`` is rendered, echoing the submitted URL
        and an error message after an invalid POST.
        """
        if request.method != 'POST':
            return self.render_template('new_url.html', error=None, url=None)

        url = request.form['url']
        if is_valid_url(url):
            return redirect(f'/{self.insert_url(url)}+')
        return self.render_template('new_url.html',
                                    error='Please enter a valid URL',
                                    url=url)
Exemple #19
0
    def on_new_url(self, request):
        """Show the new-URL form, or store a submitted URL and redirect.

        A POST with a valid ``url`` field is stored through ``insert_url``
        and answered with a redirect to ``/<id>_details``. Otherwise
        ``new_url.html`` is rendered with the submitted URL ('' for a
        non-POST request) and an error message after an invalid POST.
        """
        submitted = request.method == "POST"
        url = request.form["url"] if submitted else ""

        if submitted and is_valid_url(url):
            return redirect('/%s_details' % insert_url(self.redis, url))

        # Only a submitted (and therefore rejected) URL produces an error.
        error = "Invalid url" if submitted else None
        return self.render_template("new_url.html", error=error, url=url)
Exemple #20
0
    def on_new_url(self, request):
        """Show the new-URL form, or store a submitted URL and redirect.

        A POST with a valid ``url`` field is stored through ``insert_url``
        and answered with a redirect to ``/<id>_details``. Otherwise
        ``new_url.html`` is rendered with the submitted URL and, after an
        invalid POST, an error message.
        """
        error = None
        url = ""
        if request.method == 'POST':
            url = request.form['url']
            if not is_valid_url(url):
                error = 'invalid url'
            else:
                short_id = insert_url(self.redis, url)
                # The id is formatted into the target as bytes in the
                # original code; decode it so that redirect() receives a
                # text location instead of a bytes object.
                if isinstance(short_id, bytes):
                    short_id = short_id.decode('utf-8')
                return redirect('/%s_details' % short_id)

        return self.render_template("new_url.html", error=error, url=url)
Exemple #21
0
 def on_new_url(self, request):
     """Show the new-URL form, or store a submitted URL and redirect.

     A POST with a valid ``url`` field is stored through ``insert_url`` and
     answered with a redirect to ``/<id>_details``. Otherwise
     ``new_url.html`` is rendered with the submitted URL and, after an
     invalid POST, an error message.
     """
     error = None
     url = ""
     if request.method == "POST":
         url = request.form['url']
         if not is_valid_url(url):
             error = 'invalid url'
         else:
             url_id = insert_url(self.redis, url)
             if isinstance(url_id, bytes):
                 url_id = url_id.decode('utf-8')
             # Both id types get the same root-relative target; the bytes
             # case used to lack the leading slash.
             return redirect('/%s_details' % url_id)
     return self.render_template("new_url.html", error=error, url=url)
Exemple #22
0
    def on_new_url(self, request):
        """Show the new-URL form, or store a submitted URL and redirect.

        A POST with a valid ``url`` field is stored through ``insert_url``;
        the returned id is decoded as UTF-8 and the client is redirected to
        ``<id>/detail``. Otherwise ``new_url.html`` is rendered with the
        submitted URL and, after an invalid POST, an error message.
        """
        if request.method != 'POST':
            return self.render_template("new_url.html", error=None, url="")

        url = request.form['url']
        if not is_valid_url(url):
            return self.render_template("new_url.html",
                                        error='URL is not valid',
                                        url=url)

        url_id = insert_url(self.redis, url)
        return redirect('%s/detail' % url_id.decode('utf-8'))
Exemple #23
0
async def handle_shortify(request):
    """
    Handle a request to shorten a link.

    Reads ``url`` and the optional custom alias ``user_url`` from the
    posted form. A missing or invalid URL, or an alias that is already
    taken, renders ``index.html`` with the matching entry of ``ERRORS``.
    Without an alias, a URL that is already stored gets its existing short
    link back; otherwise a new key is generated from the link counter.
    On success ``index.html`` is rendered with ``shortened_url``.
    """
    data = await request.post()
    db = request.app["db"]
    url = data.get("url")
    user_url = data.get("user_url")
    if not url:
        return aiohttp_jinja2.render_template("index.html", request,
                                              {"error": ERRORS["without_url"]})
    if not utils.is_valid_url(url):
        return aiohttp_jinja2.render_template("index.html", request,
                                              {"error": ERRORS["invalid_url"]})
    if user_url:
        exists = await db.get(user_url)
        if exists:
            return aiohttp_jinja2.render_template(
                "index.html", request, {"error": ERRORS["busy_url"]})
        short_url = user_url
    else:
        short_url = await db.get(url)
        # If the link is already in the database, reuse its short link
        # instead of creating yet another one.
        if short_url:
            return aiohttp_jinja2.render_template(
                "index.html", request, {
                    "shortened_url":
                    "{}:{}/{}".format(settings.HOST, settings.PORT,
                                      short_url.decode("UTF-8"))
                })

        # Keep drawing counter values until one encodes to a key that is
        # not in use yet. The same encoder is used for every attempt.
        while True:
            link_count = await db.incr(settings.DB_LINKS_COUNT_KEY)
            short_url = utils.encode(link_count)
            if not await db.get(short_url):
                break

    # Store both directions so that the full URL can be looked up later,
    # which is what prevents duplicates.
    await db.set(short_url, url)
    await db.set(url, short_url)
    return aiohttp_jinja2.render_template(
        "index.html", request, {
            "shortened_url":
            "{}:{}/{}".format(settings.HOST, settings.PORT, short_url)
        })
Exemple #24
0
    def on_new_url(self, request):
        """Show the new-URL form, or store a submitted URL and redirect.

        A POST with a valid ``url`` field is stored through ``insert_url``
        and answered with a redirect to the returned id. Otherwise
        ``new_url.html`` is rendered with the submitted URL and, after an
        invalid POST, an error message.
        """
        error = None
        url = ''

        if request.method == 'POST':
            url = request.form['url']

            # Truthiness test: a validator that reports failure with None
            # (or any falsy value other than False) must also be rejected.
            if not is_valid_url(url):
                error = 'Not valid url'
            else:
                url_id = insert_url(self.redis, url)
                return redirect('%s' % url_id)

        return self.render_template("new_url.html", error=error, url=url)
def generate_links(link, base_link):
    """Return the sanitized targets of the valid anchors found at *link*.

    The page is fetched with ``requests`` and parsed with BeautifulSoup;
    every ``<a>`` whose ``href`` passes ``utils.is_valid_url`` is sanitized
    with ``utils.sanitize_url`` and collected. This is best effort: if
    fetching or parsing fails, the links gathered so far (possibly none)
    are returned.

    :param link: URL of the page to scan
    :param base_link: base URL handed to the validation/sanitizing helpers
    :return: list of sanitized URLs
    """
    a_url_list = []
    try:
        response = requests.get(link)
        # BeautifulSoup builds the DOM tree of the HTML document so that
        # it can be searched.
        dom_tree = BeautifulSoup.BeautifulSoup(response.text)
        for a_element in dom_tree.fetch('a'):  # a elements in html doc
            a_url = a_element.get('href')
            if a_url is None:
                # Anchor without href: nothing to validate.
                continue
            if utils.is_valid_url(a_url, base_link):
                a_url_list.append(utils.sanitize_url(a_url, base_link))
    except Exception:
        # Deliberately swallowed (best effort), but unlike a bare except
        # this lets KeyboardInterrupt and SystemExit propagate.
        pass
    return a_url_list
Exemple #26
0
    def __crawl(self, page, level):
        """Read *page* and recurse into its links up to the configured depth.

        Pages already present in the page-content mapping are skipped.
        Links that are not valid URLs on their own are resolved against
        the crawler's base URL before recursing.
        """
        if page in self.__page_content:
            return

        # presumably read() stores the content under self.__page_content[page],
        # which is read back below - confirm against read().
        self.read(page)

        if self.__progress_bar:
            print(".", end="")

        if level >= self.__deep:
            return

        found_links = []
        searchers.FindLink("").do(page, self.__page_content[page],
                                  found_links)
        for link in found_links:
            if utils.is_valid_url(link):
                target = link
            else:
                target = urllib.parse.urljoin(self.__url, link)
            self.__crawl(target, level + 1)
Exemple #27
0
    def get(self, url):
        '''deliver pages to bots'''
        # Responds 400 for an invalid URL and 502 when the datastore has no
        # page for it; any exception is logged and not re-raised.
        try:
            logging.info('Serving %s to %s', url, self.request.headers['User-Agent'])
            self.set_header('content-type', 'text/html')
            if utils.is_valid_url(url):
                content = datastore.get_page(utils.to_pretty_url(url))
                if content is not None:
                    self.write(content)
                else:
                    self.send_error(502)
            else:
                self.send_error(400)
        except Exception:
            logging.error('Error getting page for crawler', exc_info=True)
Exemple #28
0
def download_album(host, url, name, dest=".", delim=" - ", digits=3, number=1):
    """Download an album through the downloader registered for *host*.

    An invalid *url* terminates the process with exit status 1. Host and
    name are compared and passed on in lower case. For a host without a
    downloader an error line is printed and nothing else happens.
    """
    if not is_valid_url(url):
        sys.exit(1)

    host, name = host.lower(), name.lower()

    # Host name -> function that downloads from that host.
    supported = {
        "imagebam": imagebam,
        "imagevenue": imagevenue,
        "imgbox": imgbox,
        "imgur": imgur,
    }
    if host in supported:
        supported[host](url, name, dest, delim, digits, number)
    else:
        print("ERROR: Unsupported image host '{}'".format(host))
Exemple #29
0
    def on_new_url(self, request):
        """Show the new-URL form, or store a submitted URL and redirect.

        For a POST request the ``url`` field is checked with
        ``is_valid_url``: a valid link is stored and the user is sent to
        its details page, an invalid one is reported as an error on the
        form. Any other request just renders the empty form.
        """
        error = None
        url = ""
        if request.method == "POST":
            url = request.form['url']
            if is_valid_url(url):
                # NOTE(review): str() of a bytes id would yield "b'...'" on
                # Python 3 - confirm what insert_url returns.
                return redirect('/%s_details' % str(insert_url(self.redis, url)))
            error = 'invalid url'

        return self.render_template("new_url.html", error=error, url=url)
Exemple #30
0
    def _add_job(self, update: telegram.Update,
                 context: telegram.ext.CallbackContext):
        """
        Callback for the /add command, which registers a new job.

        The message arguments must follow ``Bot.ADD_USAGE``: a url, a
        frequency in minutes and, optionally, keywords. An invalid url is
        reported back to the user. A frequency below the configured minimum
        is raised to that minimum. The job is stored in the database and
        scheduled, and a confirmation is sent. Missing or non-numeric
        arguments produce the usage message.
        """
        user = update.effective_chat.id
        try:
            # Extract info.
            url = context.args[0]

            # Check url validity.
            if not utils.is_valid_url(url):
                update.message.reply_text(f"{url} is not a valid url.",
                                          disable_web_page_preview=True)
                logging.warning(f"Invalid url from user {user}.")
                return

            # Check minimum time
            freq = int(context.args[1])
            if freq < self._minimum_interval:
                update.message.reply_text(
                    f"{self._minimum_interval} minutes is the minimum time. I'll just set it for you."
                )
                freq = self._minimum_interval

            # Slicing past the end yields an empty list, so no length
            # check is needed.
            keywords = context.args[2:]

            # Update database.
            job = Job(user, url, freq, keywords)
            Database(self._database_file).add_job(job)

            # Schedule job.
            self._schedule(job)

            # Send back a response as a confirmation.
            response = f"Will start searching {url} for links containing {', '.join(keywords)} every {freq} minutes."
            update.message.reply_text(response, disable_web_page_preview=True)
            logging.info(f"/add command received by user: {user}. {response}")

        except (IndexError, ValueError):
            # IndexError: too few arguments; ValueError: frequency is not
            # an integer.
            update.message.reply_text(f"Usage: {Bot.ADD_USAGE}")
            logging.warning(f"Inappropriate /add command from user {user}.")
Exemple #31
0
    def set_webhook(url: str):
        """
        Validate a webhook url and hand it back unchanged.

        ``None`` is passed through as ``None``. A url accepted by
        ``utils.is_valid_url`` is returned as is. Anything else raises an
        ArgumentError, as does any exception raised during validation.

        :url:   A string representing the webhook's url
        """
        if url is None:
            return None

        try:
            accepted = bool(utils.is_valid_url(url=url))
        except Exception as e:
            # Should catch type errors
            raise errors.ArgumentError({'webhook': (type(e), url, str(e))})

        if accepted:
            return url

        # Was something wrong with URL format
        raise errors.ArgumentError({
            'webhook':
            (ValueError, url, 'Webhook URL not meet HTTP/s specifications.')
        })
Exemple #32
0
    async def parse_title_args(self, ctx, guild_id, *_args):
        """Sort free-form command arguments into a parameter dict.

        Each argument is tested in turn: a valid URL becomes ``url``, an
        argument that resolves to a user becomes ``user`` (default: the
        message author), and the name of an existing pool becomes ``pool``
        (default: 'main'). When a URL was given, the title details are
        fetched from the API. At most one argument may remain unrecognised;
        it is used as ``title_name``.

        Returns the parameter dict. Raises BotErr when more than one
        argument is left over.
        """
        args = list(_args)

        params = {}
        params['user'] = ctx.message.author
        params['pool'] = 'main' # todo: make so default pool isn't main but the first pool in a challenge
        params['guild_id'] = guild_id

        # Recognised arguments are blanked out so that only the leftovers
        # remain afterwards.
        for i, arg in enumerate(args):
            if is_valid_url(arg):
                params['url'] = arg
                args[i] = None

            usr = await user_or_none(ctx, arg)
            if usr:
                params['user'] = usr
                args[i] = None

            if await self.bot.has_pool(ctx, arg, params['guild_id']):
                params['pool'] = arg
                args[i] = None

        args = [ arg for arg in args if arg is not None ]
        # 'url' is only present when one of the arguments was a URL, so a
        # plain index lookup would raise KeyError for title-only calls.
        if params.get('url'):
            title_info = self.bot.get_api_title_info(params['url'])
            params['title_name'] = title_info.name
            params['score'] = title_info.score
            params['duration'] = title_info.duration
            params['num_of_episodes'] = title_info.num_of_episodes
            params['difficulty'] = title_info.difficulty

        if len(args) > 1:
            raise BotErr('Bad argumnets')
        if len(args) == 1:
            # An explicit title overrides the one reported by the API.
            params['title_name'] = args[0]

        return params
Exemple #33
0
def url_post():
    """Store a URL under a short code taken from the JSON request body.

    The body must be a JSON object with a ``url`` key and, optionally, a
    ``code`` key; when ``code`` is absent one is generated.

    Returns a ``(response, status)`` pair:

    * 201 with the code when the URL was stored
    * 400 when the body is missing, is not a JSON object, or has no ``url``
    * 409 when the supplied code or the URL is not valid
    * 422 when the code is already in use
    """
    data = request.json
    # A missing body or a JSON array/scalar has no "url" key either; treat it
    # as a client error instead of failing with AttributeError on .keys().
    if not isinstance(data, dict) or "url" not in data:
        return jsonify(error="URL not provided"), 400

    if "code" in data:
        code = data.get('code')
        if not utils.is_valid_code(code):
            return jsonify(error="Code not valid"), 409
    else:
        code = utils.gen_code()

    _, exists = utils.code_exists(code)
    if exists:
        return jsonify(error="Code in use"), 422

    url = data.get('url')
    if not utils.is_valid_url(url):
        return jsonify(error="URL not valid"), 409

    utils.insert_url(url, code)
    return jsonify(code=code), 201
Exemple #34
0
    def fetch_reviews(self):
        """Walk the paginated review list at ``self.url`` and collect reviews.

        Resets ``self.lookup``, loads the page in ``self.driver`` and keeps
        parsing pages until a page yields no reviews or the "next" button is
        missing or disabled.

        :returns: a list with the ``__dict__`` of every parsed review, or
            ``None`` when ``self.url`` is not a valid URL
        """
        print('Fetching reviews...', flush=True)
        self.lookup = {}
        reviews = []
        has_next = True
        set_locale(self.url)

        if not is_valid_url(self.url):
            print('[ERROR] URL is not valid: ' + self.url, flush=True)
            return None

        self.driver.get(self.url)

        try:
            self.driver.find_element_by_id('taplc_location_review_filter_controls_0_filterLang_ALL').click()
        except Exception:
            # Best effort: the language filter is optional, carry on without
            # it. Only Exception is caught so Ctrl-C / SystemExit still work.
            pass

        while has_next:
            time.sleep(SECONDS_BETWEEN_REQUEST + 0.5)
            reviews_parsed = self.__parse_page()
            if len(reviews_parsed) == 0:
                break
            reviews += reviews_parsed
            print('Fetched reviews: ' + str(len(reviews)), flush=True)

            try:
                has_next = self.driver.execute_script(
                    'return !document.querySelector(".ui_pagination>.next").classList.contains("disabled")')
            except Exception:
                # No pagination control on the page: this was the last page.
                has_next = False
            if has_next:
                self.driver.execute_script(
                    'document.querySelector(".ui_pagination>.next").click()')

        return [r.__dict__ for r in reviews]
	def post(self, username):
		"""Handle a hub subscribe / unsubscribe request.

		Reads the ``hub.*`` request parameters, validates them and then either
		confirms the request synchronously or queues it for later processing.
		``username`` is accepted but not used here.

		Status codes set on the response:
		400 for invalid parameters, 204 when confirmed (or when unsubscribing
		from a subscription that does not exist), 409 when synchronous
		confirmation fails, 202 when the request was queued, and 503 (with a
		``Retry-After`` header) on backend errors.
		"""
		self.response.headers['Content-Type'] = 'text/plain'

		request = self.request
		callback = request.get('hub.callback', '')
		topic = request.get('hub.topic', '')
		verify_type_list = [value.lower() for value in request.get_all('hub.verify')]
		verify_token = unicode(request.get('hub.verify_token', ''))
		secret = unicode(request.get('hub.secret', '')) or None
		lease_seconds = request.get('hub.lease_seconds', str(constants.DEFAULT_LEASE_SECONDS))
		mode = request.get('hub.mode', '').lower()

		# Every failed check overwrites the message, so the last failing check
		# is the one reported to the client.
		error_message = None

		if callback and utils.is_valid_url(callback):
			callback = utils.unicode_to_iri(callback)
		else:
			error_message = 'Invalid parameter: hub.callback'

		if topic and utils.is_valid_url(topic):
			topic = utils.unicode_to_iri(topic)
		else:
			error_message = 'Invalid parameter: hub.topic'

		enabled_types = [vt for vt in verify_type_list if vt in ('async', 'sync')]
		if enabled_types:
			# The first supported verification type wins.
			verify_type = enabled_types[0]
		else:
			error_message = 'Invalid values for hub.verify: %s' % (verify_type_list,)

		if not (mode == 'subscribe' or mode == 'unsubscribe'):
			error_message = 'Invalid value for hub.mode: %s' % mode

		if lease_seconds:
			old_lease_seconds = lease_seconds
			try:
				lease_seconds = int(old_lease_seconds)
				# Reject values that int() accepts but that are not a
				# canonical integer string.
				if old_lease_seconds != str(lease_seconds):
					raise ValueError
			except ValueError:
				error_message = ('Invalid value for hub.lease_seconds: %s' %
					old_lease_seconds)

		if error_message:
			logging.debug(
				'Bad request for mode = %s, topic = %s, '
				'callback = %s, verify_token = %s, lease_seconds = %s: %s',
				mode, topic, callback, verify_token,
				lease_seconds, error_message)
			self.response.out.write(error_message)
			return self.response.set_status(400)

		try:
			# Retrieve any existing subscription for this callback.
			key_name = hubmodel.HubSubscription.create_key_name(callback, topic)
			sub = hubmodel.HubSubscription.get_by_key_name(key_name)

			# Deletions for non-existent subscriptions will be ignored.
			if mode == 'unsubscribe' and not sub:
				return self.response.set_status(204)

			# Synchronous confirmation is handled right away.
			if verify_type == 'sync':
				confirmed = hooks.execute(
					confirm_subscription,
					mode, topic, callback, verify_token, secret, lease_seconds)
				if confirmed:
					return self.response.set_status(204)
				self.response.out.write('Error trying to confirm subscription')
				return self.response.set_status(409)

			# Otherwise enqueue a background verification task.
			if mode == 'subscribe':
				hubmodel.HubSubscription.request_insert(
					callback, topic, verify_token, secret,
					lease_seconds=lease_seconds)
			else:
				hubmodel.HubSubscription.request_remove(callback, topic, verify_token)
			logging.debug(
				'Queued %s request for callback = %s, '
				'topic = %s, verify_token = "%s", lease_seconds= %s',
				mode, callback, topic, verify_token, lease_seconds)
			return self.response.set_status(202)

		except (apiproxy_errors.Error, db.Error,
				runtime.DeadlineExceededError, taskqueue.Error):
			logging.exception('Could not verify subscription request')
			self.response.headers['Retry-After'] = '120'
			return self.response.set_status(503)
Exemple #36
0
        engine.execute(
            text("""
            UPDATE posts_rssfeed SET updated_at=:updated_at WHERE id=:rss_id
            """), {
                'updated_at': updated_at,
                'rss_id': parser['rss_id']
            })

        internal_entries = []
        raw_external_entries = []
        for entry in parser['entries']:
            if entry.get('link'):
                link = entry['link']
            else:
                if entry.get('id') and is_valid_url(entry['id']):
                    link = entry['id']
                else:
                    print("Unfortunately this news doesn\'t have a link.")
                    print(entry)
                    continue

            content = None
            if entry.get('summary'):
                text_to_search_in = entry['summary']
            elif entry.get('content') and entry['content'][0]['value']:
                content = remove_html_tags(entry['content'][0]['value'])
                text_to_search_in = content
            else:
                text_to_search_in = ''
Exemple #37
0
class Sms:
    """Authentication method based on a code sent by SMS.

    The class bundles the method's default configuration, its check
    pipelines, the field definitions for the phone number ("tlf") and the
    code, and the handlers for census upload, registration, authentication
    and code re-sending.
    """
    DESCRIPTION = 'Provides authentication using an SMS code.'
    # Default configuration. Its keys are the ones CONFIG_CONTRACT requires:
    # 'msg', 'registration-action' and 'authentication-action'.
    # NOTE(review): __URL__ and __CODE__ look like placeholders substituted
    # when the message is sent; the substitution is not in this class.
    CONFIG = {
        'msg': 'Enter in __URL__ and put this code __CODE__',
        'registration-action': {
            'mode': 'vote',
            'mode-config': None,
        },
        'authentication-action': {
            'mode': 'vote',
            'mode-config': None,
        }
    }
    # Named pipelines. Each "*-pipeline" entry is a list of
    # [check_name, options] pairs. Only 'resend-auth-pipeline' is passed
    # explicitly to check_pipeline() in this class (see resend_auth_code);
    # presumably the others are looked up by name elsewhere -- confirm.
    PIPELINES = {
        'give_perms': [{
            'object_type': 'UserData',
            'perms': [
                'edit',
            ],
            'object_id': 'UserDataId'
        }, {
            'object_type': 'AuthEvent',
            'perms': [
                'vote',
            ],
            'object_id': 'AuthEventId'
        }],
        # NOTE(review): 'period' is 1440 here but 3600 and 3600 * 24 in the
        # resend pipeline below; the unit is not visible in this file, so
        # confirm both pipelines use the same one.
        "register-pipeline": [
            ["check_whitelisted", {
                "field": "tlf"
            }],
            ["check_whitelisted", {
                "field": "ip"
            }],
            ["check_blacklisted", {
                "field": "ip"
            }],
            ["check_blacklisted", {
                "field": "tlf"
            }],
            ["check_total_max", {
                "field": "ip",
                "max": 8
            }],
            ["check_total_max", {
                "field": "tlf",
                "max": 7
            }],
            ["check_total_max", {
                "field": "tlf",
                "period": 1440,
                "max": 5
            }],
        ],
        # Currently empty: both candidate checks are commented out.
        "authenticate-pipeline": [
            #['check_total_connection', {'times': 5 }],
            #['check_sms_code', {'timestamp': 5 }]
        ],
        "resend-auth-pipeline": [
            ["check_whitelisted", {
                "field": "tlf"
            }],
            ["check_whitelisted", {
                "field": "ip"
            }],
            ["check_blacklisted", {
                "field": "ip"
            }],
            ["check_blacklisted", {
                "field": "tlf"
            }],
            ["check_total_max", {
                "field": "tlf",
                "period": 3600,
                "max": 5
            }],
            [
                "check_total_max", {
                    "field": "tlf",
                    "period": 3600 * 24,
                    "max": 15
                }
            ],
            ["check_total_max", {
                "field": "ip",
                "period": 3600,
                "max": 10
            }],
            [
                "check_total_max", {
                    "field": "ip",
                    "period": 3600 * 24,
                    "max": 20
                }
            ],
        ]
    }
    # Field types this method handles itself (the phone number).
    USED_TYPE_FIELDS = ['tlf']

    # Definition of the phone-number field, passed to check_field_type() and
    # check_field_value() by census, register, authenticate and
    # resend_auth_code.
    # NOTE(review): "min"/"max" look like length bounds -- confirm against
    # check_field_value, which is not defined in this class.
    tlf_definition = {
        "name": "tlf",
        "type": "text",
        "required": True,
        "min": 4,
        "max": 20,
        "required_on_authentication": True
    }
    # Definition of the SMS code field, checked in authenticate().
    code_definition = {
        "name": "code",
        "type": "text",
        "required": True,
        "min": 6,
        "max": 255,
        "required_on_authentication": True
    }

    # Contract handed to check_contract() by check_config(). The check names
    # are interpreted by check_contract (not defined in this class); read
    # literally, the contract requires:
    #   * the config to be a dict with exactly the keys 'msg',
    #     'registration-action' and 'authentication-action';
    #   * 'msg' to be a str with a 'length' range of [1, 200];
    #   * each action to be a dict with exactly the keys 'mode' and
    #     'mode-config', where 'mode' is 'vote' or 'go-to-url';
    #   * 'mode-config' to be None for 'vote', and for 'go-to-url' a dict
    #     with exactly one key 'url': a str with a 'length' range of
    #     [1, 400] that passes is_valid_url(d, schemes=['https']).
    # The 'registration-action' and 'authentication-action' sections are
    # identical apart from their index key.
    CONFIG_CONTRACT = [{
        'check': 'isinstance',
        'type': dict
    }, {
        'check':
        'dict-keys-exact',
        'keys': ['msg', 'registration-action', 'authentication-action']
    }, {
        'check':
        'index-check-list',
        'index':
        'msg',
        'check-list': [{
            'check': 'isinstance',
            'type': str
        }, {
            'check': 'length',
            'range': [1, 200]
        }]
    }, {
        # Contract for config['registration-action'].
        'check':
        'index-check-list',
        'index':
        'registration-action',
        'check-list': [{
            'check': 'isinstance',
            'type': dict
        }, {
            'check': 'dict-keys-exact',
            'keys': ['mode', 'mode-config']
        }, {
            'check':
            'index-check-list',
            'index':
            'mode',
            'check-list': [{
                'check': 'isinstance',
                'type': str
            }, {
                'check': 'lambda',
                'lambda': lambda d: d in ['vote', 'go-to-url']
            }]
        }, {
            # The contract applied to 'mode-config' depends on 'mode'.
            'check': 'switch-contract-by-dict-key',
            'switch-key': 'mode',
            'contract-key': 'mode-config',
            'contracts': {
                'vote': [{
                    'check': 'lambda',
                    'lambda': lambda d: d is None
                }],
                'go-to-url': [{
                    'check': 'isinstance',
                    'type': dict
                }, {
                    'check': 'dict-keys-exact',
                    'keys': ['url']
                }, {
                    'check':
                    'index-check-list',
                    'index':
                    'url',
                    'check-list': [{
                        'check': 'isinstance',
                        'type': str
                    }, {
                        'check': 'length',
                        'range': [1, 400]
                    }, {
                        'check':
                        'lambda',
                        'lambda':
                        lambda d: is_valid_url(d, schemes=['https'])
                    }]
                }]
            }
        }]
    }, {
        # Contract for config['authentication-action'].
        'check':
        'index-check-list',
        'index':
        'authentication-action',
        'check-list': [{
            'check': 'isinstance',
            'type': dict
        }, {
            'check': 'dict-keys-exact',
            'keys': ['mode', 'mode-config']
        }, {
            'check':
            'index-check-list',
            'index':
            'mode',
            'check-list': [{
                'check': 'isinstance',
                'type': str
            }, {
                'check': 'lambda',
                'lambda': lambda d: d in ['vote', 'go-to-url']
            }]
        }, {
            # The contract applied to 'mode-config' depends on 'mode'.
            'check': 'switch-contract-by-dict-key',
            'switch-key': 'mode',
            'contract-key': 'mode-config',
            'contracts': {
                'vote': [{
                    'check': 'lambda',
                    'lambda': lambda d: d is None
                }],
                'go-to-url': [{
                    'check': 'isinstance',
                    'type': dict
                }, {
                    'check': 'dict-keys-exact',
                    'keys': ['url']
                }, {
                    'check':
                    'index-check-list',
                    'index':
                    'url',
                    'check-list': [{
                        'check': 'isinstance',
                        'type': str
                    }, {
                        'check': 'length',
                        'range': [1, 400]
                    }, {
                        'check':
                        'lambda',
                        'lambda':
                        lambda d: is_valid_url(d, schemes=['https'])
                    }]
                }]
            }
        }]
    }]

    def error(self, msg, error_codename):
        """Build the failure payload returned by the handlers of this class.

        :param msg: human readable message stored under ``'msg'``
        :param error_codename: machine readable code stored under
            ``'error_codename'``
        :returns: a new dict whose ``'status'`` is always ``'nok'``
        """
        return dict(status='nok', msg=msg, error_codename=error_codename)

    def check_config(self, config):
        """Validate ``config`` against ``CONFIG_CONTRACT``.

        Called when an auth event is created.

        :param config: the configuration to validate
        :returns: ``''`` when the contract is satisfied, otherwise the data
            of the raised ``CheckException`` serialized as JSON
        """
        try:
            check_contract(self.CONFIG_CONTRACT, config)
        except CheckException as exc:
            return json.dumps(exc.data, cls=JsonTypeEncoder)
        else:
            return ''

    def census(self, ae, request):
        """Load a census (list of users) into auth event ``ae``.

        The JSON request body provides the rows under ``'census'`` and an
        optional ``'field-validation'`` flag, which counts as enabled unless
        it is present with a value other than ``'enabled'``.

        * Validation enabled: every row is checked first and the messages
          are accumulated. If any row produced a message, nothing is created
          and an error payload is returned; otherwise a user is created for
          every row.
        * Validation disabled: rows are handled one by one. A row whose
          checks produced a message is skipped, as is a row for which
          ``exist_user`` returns a non-empty value that does not contain
          ``'None'``; every other row gets a user created immediately.

        :param ae: the auth event the users are added to
        :param request: request object exposing the raw body as ``.body``
        :returns: ``{'status': 'ok'}`` on success, or the payload built by
            ``self.error`` when validation is enabled and a check failed
        """
        req = json.loads(request.body.decode('utf-8'))
        validation = req.get('field-validation', 'enabled') == 'enabled'
        data = {'status': 'ok'}

        msg = ''
        # Phone numbers seen so far, to detect repeats within this upload.
        current_tlfs = []
        for r in req.get('census'):
            if r.get('tlf'):
                # The canonical form is written back into the row.
                r['tlf'] = get_cannonical_tlf(r.get('tlf'))
            tlf = r.get('tlf')
            if isinstance(tlf, str):
                # Only the local copy is stripped; r['tlf'] is left as is.
                tlf = tlf.strip()
            msg += check_field_type(self.tlf_definition, tlf)
            if validation:
                msg += check_field_value(self.tlf_definition, tlf)
            msg += check_fields_in_request(r,
                                           ae,
                                           'census',
                                           validation=validation)
            if validation:
                msg += exist_user(r, ae)
                if tlf in current_tlfs:
                    msg += "Tlf %s repeat." % tlf
                current_tlfs.append(tlf)
            else:
                if msg:
                    # Skip the invalid row and start clean for the next one.
                    msg = ''
                    continue
                exist = exist_user(r, ae)
                # NOTE(review): exist_user apparently returns a message
                # string here; the 'None' test presumably tells a real match
                # from one on empty fields -- confirm against exist_user.
                if exist and not exist.count('None'):
                    continue
                # By default we creates the user as active we don't check
                # the pipeline
                u = create_user(r, ae, True)
                give_perms(u, ae)
        if msg and validation:
            return self.error("Incorrect data",
                              error_codename="invalid_credentials")

        if validation:
            # All rows passed validation: create the users in a second pass.
            for r in req.get('census'):
                # By default we creates the user as active we don't check
                # the pipeline
                u = create_user(r, ae, True)
                give_perms(u, ae)
        return data

    def register(self, ae, request):
        """Register a user in auth event ``ae`` and queue the sending of a code.

        The request is run through ``check_pipeline``, then the phone number
        and the remaining request fields are validated. If ``exist_user``
        finds a matching user that is active the request is rejected; a
        matching inactive user is reused; otherwise a user is created and
        given permissions. Unless the user ends up inactive, the
        ``extend_send_sms`` plugin hook is consulted and a ``send_codes``
        task is queued.

        :param ae: the auth event to register in
        :param request: request object exposing the raw body as ``.body``
        :returns: ``{'status': 'ok'}`` on success, otherwise the generic
            "Incorrect data" payload built by ``self.error``
        """
        def rejected():
            # Every failure is reported with the same generic payload.
            return self.error("Incorrect data",
                              error_codename="invalid_credentials")

        req = json.loads(request.body.decode('utf-8'))

        if check_pipeline(request, ae):
            return rejected()

        # create the user as active? Usually yes, but the execute_pipeline call inside
        # check_fields_in_request might modify this
        req['active'] = True

        # Canonicalise the phone number in the request, then validate a
        # stripped copy of it.
        if req.get('tlf'):
            req['tlf'] = get_cannonical_tlf(req.get('tlf'))
        tlf = req.get('tlf')
        if isinstance(tlf, str):
            tlf = tlf.strip()

        msg = ''
        msg += check_field_type(self.tlf_definition, tlf)
        msg += check_field_value(self.tlf_definition, tlf)
        msg += check_fields_in_request(req, ae)
        if msg:
            return rejected()

        # get active from req, this value might have changed in check_fields_in_requests
        active = req.pop('active')

        existing = exist_user(req, ae, get_repeated=True)
        if existing:
            u = existing.get('user')
            if u.is_active:
                return rejected()
        else:
            u = create_user(req, ae, active)
            msg += give_perms(u, ae)

        if msg:
            return rejected()
        if not active:
            # Note, we are not calling to extend_send_sms because we are not
            # sending the code in here
            return {'status': 'ok'}

        # A truthy result from the plugin hook vetoes the SMS.
        if plugins.call("extend_send_sms", ae, 1):
            return rejected()

        send_codes.apply_async(args=[[u.id], get_client_ip(request)])
        return {'status': 'ok'}

    def authenticate(self, ae, request):
        """Authenticate a user of auth event ``ae`` with phone number and code.

        Validates the ``tlf`` and ``code`` fields of the JSON body, looks up
        the active user with that phone number in ``ae``, checks that a
        matching code exists for the user, and then runs the authenticate
        pipeline and the metadata check.

        :param ae: the auth event to authenticate against
        :param request: request object exposing the raw body as ``.body``
        :returns: on success a dict with ``'status': 'ok'``, ``'username'``
            and ``'auth-token'``, plus ``'redirect-to-url'`` when the
            event's authentication action is ``'go-to-url'``; otherwise the
            generic "Incorrect data" payload built by ``self.error``
        """
        req = json.loads(request.body.decode('utf-8'))

        msg = ''
        if req.get('tlf'):
            req['tlf'] = get_cannonical_tlf(req.get('tlf'))
        tlf = req.get('tlf')
        if isinstance(tlf, str):
            tlf = tlf.strip()
        msg += check_field_type(self.tlf_definition, tlf, 'authenticate')
        msg += check_field_value(self.tlf_definition, tlf, 'authenticate')
        msg += check_field_type(self.code_definition, req.get('code'),
                                'authenticate')
        msg += check_field_value(self.code_definition, req.get('code'),
                                 'authenticate')
        msg += check_fields_in_request(req, ae, 'authenticate')
        if msg:
            return self.error("Incorrect data",
                              error_codename="invalid_credentials")

        try:
            u = User.objects.get(userdata__tlf=tlf,
                                 userdata__event=ae,
                                 is_active=True)
        except Exception:
            # Any lookup failure gets the same generic answer. Only
            # Exception is caught, so SystemExit / KeyboardInterrupt are no
            # longer swallowed as a bare ``except:`` would do.
            return self.error("Incorrect data",
                              error_codename="invalid_credentials")

        # Most recent code of this user matching the upper-cased input.
        code = Code.objects.filter(
            user=u.userdata,
            code=req.get('code').upper()).order_by('-created').first()
        if not code:
            return self.error("Incorrect data",
                              error_codename="invalid_credentials")

        msg = check_pipeline(request, ae, 'authenticate')
        if msg:
            return self.error("Incorrect data",
                              error_codename="invalid_credentials")

        msg = check_metadata(req, u)
        if msg:
            return self.error("Incorrect data",
                              error_codename="invalid_credentials")

        u.save()

        data = {'status': 'ok'}
        data['username'] = u.username
        data['auth-token'] = genhmac(settings.SHARED_SECRET, u.username)

        # add redirection
        auth_action = ae.auth_method_config['config']['authentication-action']
        if auth_action['mode'] == 'go-to-url':
            data['redirect-to-url'] = auth_action['mode-config']['url']

        return data

    def resend_auth_code(self, ae, request):
        """Queue a new authentication code for an existing active user.

        Validates the ``tlf`` field of the JSON body, looks up the active
        user with that phone number in ``ae``, runs the
        ``'resend-auth-pipeline'`` checks and the ``extend_send_sms`` plugin
        hook, and finally queues a ``send_codes`` task.

        :param ae: the auth event the user belongs to
        :param request: request object exposing the raw body as ``.body``
        :returns: ``{'status': 'ok'}`` on success, otherwise the generic
            "Incorrect data" payload built by ``self.error``
        """
        req = json.loads(request.body.decode('utf-8'))

        msg = ''
        if req.get('tlf'):
            req['tlf'] = get_cannonical_tlf(req.get('tlf'))
        tlf = req.get('tlf')
        if isinstance(tlf, str):
            tlf = tlf.strip()
        msg += check_field_type(self.tlf_definition, tlf, 'authenticate')
        msg += check_field_value(self.tlf_definition, tlf, 'authenticate')
        if msg:
            return self.error("Incorrect data",
                              error_codename="invalid_credentials")

        try:
            u = User.objects.get(userdata__tlf=tlf,
                                 userdata__event=ae,
                                 is_active=True)
        except Exception:
            # Any lookup failure gets the same generic answer. Only
            # Exception is caught, so SystemExit / KeyboardInterrupt are no
            # longer swallowed as a bare ``except:`` would do.
            return self.error("Incorrect data",
                              error_codename="invalid_credentials")

        msg = check_pipeline(request, ae, 'resend-auth-pipeline',
                             Sms.PIPELINES['resend-auth-pipeline'])

        if msg:
            return self.error("Incorrect data",
                              error_codename="invalid_credentials")

        # A truthy result from the plugin hook vetoes the SMS.
        result = plugins.call("extend_send_sms", ae, 1)
        if result:
            return self.error("Incorrect data",
                              error_codename="invalid_credentials")
        send_codes.apply_async(args=[[
            u.id,
        ], get_client_ip(request)])
        return {'status': 'ok'}