Example #1
def main():
    session = Session()
    session.headers.update({'User-Agent': USER_AGENT})

    for url in ALBUM_URLS:
        try:
            album_name = decode_url(url.split('/')[-2])
        except Exception:
            album_name = None

        print("Downloading music from album '{}'\n".format(album_name))
        resp = session.get(url, timeout=30)
        if not resp.ok:
            print('Response code: {}'.format(resp.status_code))
            return
        html = resp.text

        soup = Soup(html)
        try:
            music_list = soup.scrape(scrape_config)
        except Exception:
            print(
                "Can't download music from album {}. Check the URL and content selector.\n"
                .format(album_name))
            continue

        for song in music_list:
            music_title = remove_whitespace(song.get('music_title', '')) \
                .replace(" (music.com.bd).mp3", "") \
                .replace("{} - ".format(decode_url(url.split('/')[-3])), "")

            music_url = 'https:{}'.format(encode_to_url(
                song.get('music_url'))).replace(".html", "")

            if not music_url.endswith('.mp3'):
                print("Skipping '{}' : not a music file".format(music_title))
                continue

            music_title = '{}.mp3'.format(
                re.search(MUSIC_TITLE_REGEX, music_title).group(1))

            create_dir(os.path.join('Downloads', album_name))

            print('>>> Downloading {}'.format(music_title))
            resp_music = session.get(music_url)
            if not resp_music.ok:
                print('    >>> Download failed : {}'.format(
                    resp_music.status_code))
                continue

            music_path = os.path.join('Downloads', album_name, music_title)
            write_binary_file(music_path, resp_music)
            print('    >>> {} downloaded'.format(music_title))
        # end loop: music_list
        print("\n>>> Album '{}' downloaded".format(album_name))
        print('----------------------------\n\n')
    # end loop : urls
    print('- Done -')
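
# Note: the decode_url / encode_to_url helpers used above are not shown in this
# listing. A minimal sketch of what they are assumed to do, as thin wrappers over
# urllib.parse (an assumption, not necessarily the project's actual implementation):
from urllib.parse import quote, unquote

def decode_url(url_fragment):
    # Undo percent-encoding: '%20' -> ' ', '%C3%A9' -> 'é', etc.
    return unquote(url_fragment)

def encode_to_url(text):
    # Percent-encode a path fragment, leaving separators such as '/' and ':' intact.
    return quote(text, safe='/:')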
Example #2
def add_page(request, category_name_url):
    context = RequestContext(request)

    category_name = decode_url(category_name_url)
    if request.method == 'POST':
        form = PageForm(request.POST)
        if form.is_valid():
            page = form.save(commit=False)

            try:
                cat = Category.objects.get(name=category_name)
                page.category = cat
            except Category.DoesNotExist:
                print(category_name)
                return render_to_response('rango/add_category.html', {}, context)

            page.views = 0
            page.save()

            print('here')

            return category(request, category_name_url)
        else:
            print(form.errors)
            page = form  # re-render the bound form so validation errors are shown
    else:
        page = PageForm()
    return render_to_response('rango/add_page.html',
                              {'category_name_url': category_name_url, 'page': page},
                              context)
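
# For context, a plausible URLconf entry for this view (an assumption, written in
# the old Django 1.x style that matches the render_to_response/RequestContext API
# used here); the encoded category name is captured as category_name_url:
from django.conf.urls import url
from rango import views

urlpatterns = [
    url(r'^category/(?P<category_name_url>\w+)/add_page/$',
        views.add_page, name='add_page'),
]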
Example #3
def worker(index, event, base_url):
    while True:
        print "thread-{} start".format(index)
        event.wait()
        _time = time.time()
        conn = check_data_for_thread(index)
        data = ""
        new_data = True
        while b"\r\n" not in data and new_data:
            new_data = conn.recv(1024)
            if new_data:
                data += new_data
            else:
                break
        data = decode_url(data)
        method, path, http_version = http_parser(data)
        if not method or not path or not http_version:
            try:
                conn.send(make_40X_resopnse_header("405 Bad Gateway"))
            except BaseException as e:
                conn.close()
                set_data_for_thread(index, None)
                event.clear()
                continue
        else:
            is_data_type_determinate, data_type = determinate_content_type(path)
            try:
                if not is_data_type_determinate:
                    path += 'index.html'
                data, length = read_file(path, base_url)
            except IOError:
                if is_data_type_determinate:
                    conn.send(make_40X_resopnse_header("404 Not Found"))
                else:
                    conn.send(make_40X_resopnse_header("403 Forbidden"))
                conn.close()
                set_data_for_thread(index, None)
                event.clear()
                continue
            try:
                header = make_response_header(data_type, length, http_version)

                if method == "GET":
                    data = header + data
                    conn.send(data)
                if method == "HEAD":
                    data = header
                    conn.send(data)
            except BaseException as e:
                print "404 BASE in thread-{}".format(index)
                print path
                print e
        try:
            conn.close()
        except Exception as e:
            print(e)
        set_data_for_thread(index, None)
        event.clear()
        print "thread-{} stop".format(index)
Example #4
def category(request, category_name_url):
    context = RequestContext(request)

    category_name = decode_url(category_name_url)
    context_dict = {'category_name':category_name,
                    'category_name_url':category_name_url}

    try:
        category = Category.objects.get(name=category_name)
        pages = Page.objects.filter(category=category)

        context_dict['pages'] = pages
        context_dict['category'] = category
    except Category.DoesNotExist:
        pass

    return render_to_response('rango/category.html', context_dict, context)
Example #5
def download_attachment(download_url, download_folder, attachment_id, attachment_duplicate_file_names,
                        attachment_file_matching, depth=0):
    """ Repairs links in the page contents with local links.

    :param download_url: Confluence download URL.
    :param download_folder: Folder to place downloaded files in.
    :param attachment_id: ID of the attachment to download.
    :param attachment_duplicate_file_names: A dict in the structure {'<sanitized attachment filename>': amount of \
                                            duplicates}
    :param attachment_file_matching: A dict in the structure {'<attachment title>': '<used offline filename>'}
    :param depth: (optional) Hierarchy depth of the handled Confluence page.
    :returns: Path and name of the downloaded file as dict.
    """
    clean_url = utils.decode_url(download_url)
    downloaded_file_name = derive_downloaded_file_name(clean_url)
    downloaded_file_name = provide_unique_file_name(attachment_duplicate_file_names, attachment_file_matching,
                                                    downloaded_file_name)
    downloaded_file_path = download_file(download_url, download_folder, downloaded_file_name, depth=depth)

    # Download the thumbnail as well if the attachment is an image
    clean_thumbnail_url = clean_url.replace('/attachments/', '/thumbnails/', 1)
    downloaded_thumbnail_file_name = derive_downloaded_file_name(clean_thumbnail_url)
    downloaded_thumbnail_file_name = provide_unique_file_name(attachment_duplicate_file_names, attachment_file_matching,
                                                              downloaded_thumbnail_file_name)
    if utils.is_file_format(downloaded_thumbnail_file_name, settings.CONFLUENCE_THUMBNAIL_FORMATS):
        # TODO: Confluence creates thumbnails always as PNGs but does not change the file extension to .png.
        download_file(clean_thumbnail_url, download_folder, downloaded_thumbnail_file_name, depth=depth,
                      error_output=False)

    # Download the image preview as well if Confluence generated one for the attachment
    if utils.is_file_format(downloaded_file_name, settings.CONFLUENCE_GENERATED_PREVIEW_FORMATS):
        clean_preview_url = '/rest/documentConversion/latest/conversion/thumbnail/%s/1' % attachment_id
        downloaded_preview_file_name = derive_downloaded_file_name(clean_preview_url)
        downloaded_preview_file_name = provide_unique_file_name(attachment_duplicate_file_names,
                                                                attachment_file_matching, downloaded_preview_file_name)
        download_file(clean_preview_url, download_folder, downloaded_preview_file_name, depth=depth, error_output=False)

    return {'file_name': downloaded_file_name, 'file_path': downloaded_file_path}
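
# A hypothetical call, assuming a typical /download/attachments/<page-id>/<file-name>
# URL; both dicts are shared across calls so that colliding attachment names are
# mapped to unique offline file names:
duplicate_file_names = {}
file_matching = {}
result = download_attachment('/download/attachments/123456/diagram.png',
                             'export/attachments',
                             attachment_id='att1000',
                             attachment_duplicate_file_names=duplicate_file_names,
                             attachment_file_matching=file_matching,
                             depth=1)
print(result['file_name'], result['file_path'])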
Example #6
def paper_details(purl):
    purl = utils.decode_url(purl)
    try:
        store_history(purl)
    except Exception:
        # Recording the view history is best-effort; ignore failures.
        pass
    cursor = g.conn.execute(text("""
        SELECT  P.title, P.purl, P.model, PO.url
        FROM Papers P LEFT OUTER JOIN Published_On PO ON P.purl = PO.purl
        WHERE P.purl = :purl;
      """), purl=purl)
    paper = cursor.fetchone()
    cursor = g.conn.execute(text("""
      SELECT  PB.aid, A.first_name, A.last_name, I.iid, I.name
      FROM Papers P RIGHT OUTER JOIN Published_by PB ON P.purl = PB.purl
      INNER JOIN Authors A ON PB.aid = A.aid
      INNER JOIN Works_At WA ON WA.aid = A.aid
      INNER JOIN Institutions I ON I.iid = WA.iid
      WHERE P.purl = :purl;
    """), purl=purl)
    authors = list(cursor.fetchall())
    cursor.close()
    return render_template('paper_details.html', paper=paper, authors=authors)
Example #7
    def get(self):
        next = self.request.get('next')
        redirect_uri = settings.APP_DOMAIN + self.uri_for('onfb') + '?next=' + next

        error = self.request.get('error')
        code = self.request.get('code')

        # If error on login
        if error:
            self.response.out.write('You failed to log in to Bokerface.com.')

        # If code received
        elif code:
            try:
                token = facebook.get_access_token_from_code(code, redirect_uri,
                                                            settings.FACEBOOK_APP_ID,
                                                            settings.FACEBOOK_APP_SECRET)
            except facebook.GraphAPIError as e:
                self.response.out.write(e)
            else:
                access_token = token['access_token']
                # Get user profile
                graph = facebook.GraphAPI(access_token)
                profile = graph.get_object('me')

                uid = profile.get('id')
                user = User.get_by_key_name(uid)

                # Update the existing user's access token
                if user:
                    if user.access_token != access_token:
                        user.access_token = access_token
                        user.put()

                # Create new user
                else:
                    user = User(
                        key_name = str(profile['id']),
                        id = str(profile['id']),
                        username = '******' % str(profile['id'])[-4:],
                        name = profile['name'],
                        profile_url = profile['link'],
                        access_token = access_token
                    )
                    user.put()

                # Save user to session 
                self.session['user'] = dict(
                    username=user.username,
                    name=user.name,
                    profile_url=user.profile_url,
                    id=user.id,
                    access_token=user.access_token,
                    is_admin=user.is_admin,
                )

                self.redirect(decode_url(next))

        # Default action, authorize app
        else:
            fbauth_url = u'https://www.facebook.com/dialog/oauth?client_id=%s&scope=publish_actions&redirect_uri=%s' % (
                settings.FACEBOOK_APP_ID, redirect_uri
                )
            self.redirect(str(fbauth_url))
Example #8
    def post(self):
        to = self.request.get('to', encode_url('/'))
        # self.redirect(self.uri_for('onfb') + '?next=' + to)
        self.redirect(decode_url(to))
Example #9
def show_results(search_word):
    search_word = utils.decode_url(search_word)
    results = search_term(search_word)
    return render_template('results.html', results=results)
Example #10
def handle_html_references(html_content,
                           page_duplicate_file_names,
                           page_file_matching,
                           depth=0):
    """ Repairs links in the page contents with local links.

    :param html_content: Confluence HTML content.
    :param page_duplicate_file_names: A dict in the structure {'<sanitized filename>': amount of duplicates}
    :param page_file_matching: A dict in the structure {'<page title>': '<used offline filename>'}
    :param depth: (optional) Hierarchy depth of the handled Confluence page.
    :returns: Fixed HTML content.
    """
    try:
        html_tree = html.fromstring(html_content)
    except ParserError:
        print('page is empty')
        return html_content
    except XMLSyntaxError:
        print(
            '%sWARNING: Could not parse HTML content of last page. Original content will be downloaded as it is.'
            % ('\t' * (depth + 1)))
        return html_content

    # Fix links to other Confluence pages
    # Example: /display/TES/pictest1
    #       => pictest1.html
    # TODO: This code does not work for "Recent space activity" areas in space pages because of a different url format.
    xpath_expr = '//a[contains(@href, "/display/")]'
    for link_element in html_tree.xpath(xpath_expr):
        if not link_element.get('class'):
            page_title = link_element.attrib['href'].split('/')[3]
            page_title = page_title.replace('+', ' ')
            decoded_page_title = utils.decode_url(page_title)
            offline_link = provide_unique_file_name(
                page_duplicate_file_names,
                page_file_matching,
                decoded_page_title,
                explicit_file_extension='html')
            link_element.attrib['href'] = utils.encode_url(offline_link)

    # Fix links to other Confluence pages when page ids are used
    xpath_expr = '//a[contains(@href, "/pages/viewpage.action?pageId=")]'
    for link_element in html_tree.xpath(xpath_expr):
        if not link_element.get('class'):
            page_id = link_element.attrib['href'].split(
                '/pages/viewpage.action?pageId=')[1]
            offline_link = '%s.html' % utils.sanitize_for_filename(page_id)
            link_element.attrib['href'] = utils.encode_url(offline_link)

    # Fix attachment links
    xpath_expr = '//a[contains(@class, "confluence-embedded-file")]'
    for link_element in html_tree.xpath(xpath_expr):
        file_url = link_element.attrib['href']
        file_name = derive_downloaded_file_name(file_url)
        relative_file_path = '%s/%s' % (settings.DOWNLOAD_SUB_FOLDER,
                                        file_name)
        #link_element.attrib['href'] = utils.encode_url(relative_file_path)
        link_element.attrib['href'] = relative_file_path

    # Fix file paths for img tags
    # TODO: Handle non-<img> tags as well if necessary.
    # TODO: Support files with different versions as well if necessary.
    possible_image_xpaths = [
        '//img[contains(@src, "/download/")]',
        '//img[contains(@src, "/rest/documentConversion/latest/conversion/thumbnail/")]'
    ]
    xpath_expr = '|'.join(possible_image_xpaths)
    for img_element in html_tree.xpath(xpath_expr):
        # Replace file path
        file_url = img_element.attrib['src']
        file_name = derive_downloaded_file_name(file_url)
        relative_file_path = '%s/%s' % (settings.DOWNLOAD_SUB_FOLDER,
                                        file_name)
        img_element.attrib['src'] = relative_file_path

        # Add alt attribute if it does not exist yet
        if 'alt' not in img_element.attrib:
            img_element.attrib['alt'] = relative_file_path

    return html.tostring(html_tree)
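
# A hypothetical round trip using the function above: read an exported page, rewrite
# its links for offline use, and write the result back (file names are assumptions).
# html.tostring() returns bytes, so the output is written in binary mode.
with open('export/original/pictest1.html', 'rb') as source_file:
    raw_html = source_file.read()

fixed_html = handle_html_references(raw_html,
                                    page_duplicate_file_names={},
                                    page_file_matching={},
                                    depth=1)

with open('export/pictest1.html', 'wb') as target_file:
    target_file.write(fixed_html)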