Example #1
def normalizeTextForTagger(text):
    text = text.replace("&amp;", "&")
    text = HTMLParser.HTMLParser().unescape(text)
    return text
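
A minimal usage sketch (Python 2 only, since the top-level HTMLParser module was removed in Python 3; the sample input is an assumption):

import HTMLParser

# "&amp;" is collapsed to a literal ampersand first, then any remaining
# entities are unescaped before the text reaches the tagger.
print normalizeTextForTagger("5 &gt; 3 &amp;&amp; 2 &lt; 4")
# -> 5 > 3 && 2 < 4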
Example #2
def cb_injection_handler(url, delay, filename, http_request_method):

    counter = 1
    vp_flag = True
    no_result = True
    is_encoded = False
    export_injection_info = False
    injection_type = "results-based command injection"
    technique = "classic injection technique"

    if not settings.LOAD_SESSION:
        info_msg = "Testing the " + technique + "... "
        sys.stdout.write(settings.print_info_msg(info_msg))
        sys.stdout.flush()
        if settings.VERBOSITY_LEVEL >= 1:
            print ""

    i = 0
    # Calculate all possible combinations
    total = len(settings.WHITESPACE) * len(settings.PREFIXES) * len(
        settings.SEPARATORS) * len(settings.SUFFIXES)
    for whitespace in settings.WHITESPACE:
        for prefix in settings.PREFIXES:
            for suffix in settings.SUFFIXES:
                for separator in settings.SEPARATORS:

                    # If a previous session is available.
                    if settings.LOAD_SESSION and session_handler.notification(
                            url, technique):
                        url, technique, injection_type, separator, shell, vuln_parameter, prefix, suffix, TAG, alter_shell, payload, http_request_method, url_time_response, delay, how_long, output_length, is_vulnerable = session_handler.injection_point_exportation(
                            url, http_request_method)
                        checks.check_for_stored_tamper(payload)

                    else:
                        i = i + 1
                        # Check for bad combination of prefix and separator
                        combination = prefix + separator
                        if combination in settings.JUNK_COMBINATION:
                            prefix = ""

                        # Change TAG on every request to prevent false-positive results.
                        TAG = ''.join(
                            random.choice(string.ascii_uppercase)
                            for i in range(6))

                        randv1 = random.randrange(100)
                        randv2 = random.randrange(100)
                        randvcalc = randv1 + randv2

                        # Define alter shell
                        alter_shell = menu.options.alter_shell

                        try:
                            if alter_shell:
                                # Classic -alter shell- decision payload (check if host is vulnerable).
                                payload = cb_payloads.decision_alter_shell(
                                    separator, TAG, randv1, randv2)
                            else:
                                # Classic decision payload (check if host is vulnerable).
                                payload = cb_payloads.decision(
                                    separator, TAG, randv1, randv2)

                            # Define prefixes & suffixes
                            payload = parameters.prefixes(payload, prefix)
                            payload = parameters.suffixes(payload, suffix)

                            # Whitespace fixation
                            payload = re.sub(" ", whitespace, payload)

                            # Check for base64 / hex encoding
                            payload = checks.perform_payload_encoding(payload)

                            # Check if defined "--verbose" option.
                            if settings.VERBOSITY_LEVEL == 1:
                                print settings.print_payload(payload)
                            elif settings.VERBOSITY_LEVEL > 1:
                                info_msg = "Generating a payload for injection..."
                                print settings.print_info_msg(info_msg)
                                print settings.print_payload(payload)

                            # Cookie Injection
                            if settings.COOKIE_INJECTION == True:
                                # Check if target host is vulnerable to cookie injection.
                                vuln_parameter = parameters.specify_cookie_parameter(
                                    menu.options.cookie)
                                response = cb_injector.cookie_injection_test(
                                    url, vuln_parameter, payload)

                            # User-Agent Injection
                            elif settings.USER_AGENT_INJECTION == True:
                                # Check if target host is vulnerable to user-agent injection.
                                vuln_parameter = parameters.specify_user_agent_parameter(
                                    menu.options.agent)
                                response = cb_injector.user_agent_injection_test(
                                    url, vuln_parameter, payload)

                            # Referer Injection
                            elif settings.REFERER_INJECTION == True:
                                # Check if target host is vulnerable to referer injection.
                                vuln_parameter = parameters.specify_referer_parameter(
                                    menu.options.referer)
                                response = cb_injector.referer_injection_test(
                                    url, vuln_parameter, payload)

                            # Custom HTTP header Injection
                            elif settings.CUSTOM_HEADER_INJECTION == True:
                                # Check if target host is vulnerable to custom http header injection.
                                vuln_parameter = parameters.specify_custom_header_parameter(
                                    settings.INJECT_TAG)
                                response = cb_injector.custom_header_injection_test(
                                    url, vuln_parameter, payload)

                            else:
                                # Check if target host is vulnerable.
                                response, vuln_parameter = cb_injector.injection_test(
                                    payload, http_request_method, url)

                            # Try target page reload (if it is required).
                            if settings.URL_RELOAD:
                                response = requests.url_reload(url, delay)
                            # Evaluate test results.
                            shell = cb_injector.injection_test_results(
                                response, TAG, randvcalc)

                            if not settings.VERBOSITY_LEVEL >= 1:
                                percent = ((i * 100) / total)
                                float_percent = "{0:.1f}".format(
                                    round(((i * 100) / (total * 1.0)), 2))

                                if shell == False:
                                    info_msg = "Testing the " + technique + "... " + "[ " + float_percent + "%" + " ]"
                                    sys.stdout.write(
                                        "\r" +
                                        settings.print_info_msg(info_msg))
                                    sys.stdout.flush()

                                if float(float_percent) >= 99.9:
                                    if no_result == True:
                                        percent = Fore.RED + "FAILED" + Style.RESET_ALL
                                    else:
                                        percent = str(float_percent) + "%"
                                elif len(shell) != 0:
                                    percent = Fore.GREEN + "SUCCEED" + Style.RESET_ALL
                                else:
                                    percent = str(float_percent) + "%"
                                info_msg = "Testing the " + technique + "... " + "[ " + percent + " ]"
                                sys.stdout.write(
                                    "\r" + settings.print_info_msg(info_msg))
                                sys.stdout.flush()

                        except KeyboardInterrupt:
                            raise

                        except SystemExit:
                            raise

                        except:
                            continue

                    # Yaw, got shellz!
                    # Do some magic tricks!
                    if shell:
                        found = True
                        no_result = False

                        if settings.COOKIE_INJECTION == True:
                            header_name = " cookie"
                            found_vuln_parameter = vuln_parameter
                            the_type = " parameter"

                        elif settings.USER_AGENT_INJECTION == True:
                            header_name = " User-Agent"
                            found_vuln_parameter = ""
                            the_type = " HTTP header"

                        elif settings.REFERER_INJECTION == True:
                            header_name = " Referer"
                            found_vuln_parameter = ""
                            the_type = " HTTP header"

                        elif settings.CUSTOM_HEADER_INJECTION == True:
                            header_name = " " + settings.CUSTOM_HEADER_NAME
                            found_vuln_parameter = ""
                            the_type = " HTTP header"

                        else:
                            header_name = ""
                            the_type = " parameter"
                            if http_request_method == "GET":
                                found_vuln_parameter = parameters.vuln_GET_param(
                                    url)
                            else:
                                found_vuln_parameter = vuln_parameter

                        if len(found_vuln_parameter) != 0:
                            found_vuln_parameter = " '" + found_vuln_parameter + Style.RESET_ALL + Style.BRIGHT + "'"

                        # Print the findings to log file.
                        if export_injection_info == False:
                            export_injection_info = logs.add_type_and_technique(
                                export_injection_info, filename,
                                injection_type, technique)
                        if vp_flag == True:
                            vp_flag = logs.add_parameter(
                                vp_flag, filename, the_type, header_name,
                                http_request_method, vuln_parameter, payload)
                        logs.update_payload(filename, counter, payload)
                        counter = counter + 1

                        if not settings.VERBOSITY_LEVEL >= 1 and not settings.LOAD_SESSION:
                            print ""

                        # Print the findings to terminal.
                        success_msg = "The"
                        if found_vuln_parameter == " ":
                            success_msg += http_request_method
                        success_msg += the_type + header_name
                        success_msg += found_vuln_parameter + " seems injectable via "
                        success_msg += "(" + injection_type.split(
                            " ")[0] + ") " + technique + "."
                        print settings.print_success_msg(success_msg)
                        print settings.SUB_CONTENT_SIGN + "Payload: " + re.sub(
                            "%20", " ", re.sub("%2B", "+",
                                               payload)) + Style.RESET_ALL
                        # Export session
                        if not settings.LOAD_SESSION:
                            session_handler.injection_point_importation(
                                url,
                                technique,
                                injection_type,
                                separator,
                                shell[0],
                                vuln_parameter,
                                prefix,
                                suffix,
                                TAG,
                                alter_shell,
                                payload,
                                http_request_method,
                                url_time_response=0,
                                delay=0,
                                how_long=0,
                                output_length=0,
                                is_vulnerable=menu.options.level)
                        else:
                            whitespace = settings.WHITESPACE[0]
                            settings.LOAD_SESSION = False

                        # Check for any enumeration options.
                        new_line = True
                        if settings.ENUMERATION_DONE == True:
                            while True:
                                if not menu.options.batch:
                                    question_msg = "Do you want to enumerate again? [Y/n] > "
                                    enumerate_again = raw_input(
                                        "\n" + settings.print_question_msg(
                                            question_msg)).lower()
                                else:
                                    enumerate_again = ""
                                if len(enumerate_again) == 0:
                                    enumerate_again = "y"
                                if enumerate_again in settings.CHOICE_YES:
                                    cb_enumeration.do_check(
                                        separator, TAG, prefix, suffix,
                                        whitespace, http_request_method, url,
                                        vuln_parameter, alter_shell, filename,
                                        delay)
                                    #print ""
                                    break
                                elif enumerate_again in settings.CHOICE_NO:
                                    new_line = False
                                    break
                                elif enumerate_again in settings.CHOICE_QUIT:
                                    sys.exit(0)
                                else:
                                    err_msg = "'" + enumerate_again + "' is not a valid answer."
                                    print settings.print_error_msg(err_msg)
                                    pass
                        else:
                            if menu.enumeration_options():
                                cb_enumeration.do_check(
                                    separator, TAG, prefix, suffix, whitespace,
                                    http_request_method, url, vuln_parameter,
                                    alter_shell, filename, delay)

                        if not menu.file_access_options(
                        ) and not menu.options.os_cmd and new_line:
                            print ""

                        # Check for any system file access options.
                        if settings.FILE_ACCESS_DONE == True:
                            if settings.ENUMERATION_DONE != True:
                                print ""
                            while True:
                                if not menu.options.batch:
                                    question_msg = "Do you want to access files again? [Y/n] > "
                                    sys.stdout.write(
                                        settings.print_question_msg(
                                            question_msg))
                                    file_access_again = sys.stdin.readline(
                                    ).replace("\n", "").lower()
                                else:
                                    file_access_again = ""
                                if len(file_access_again) == 0:
                                    file_access_again = "y"
                                if file_access_again in settings.CHOICE_YES:
                                    cb_file_access.do_check(
                                        separator, TAG, prefix, suffix,
                                        whitespace, http_request_method, url,
                                        vuln_parameter, alter_shell, filename,
                                        delay)
                                    print ""
                                    break
                                elif file_access_again in settings.CHOICE_NO:
                                    break
                                elif file_access_again in settings.CHOICE_QUIT:
                                    sys.exit(0)
                                else:
                                    err_msg = "'" + file_access_again + "' is not a valid answer."
                                    print settings.print_error_msg(err_msg)
                                    pass
                        else:
                            if menu.file_access_options():
                                if not menu.enumeration_options():
                                    print ""
                                cb_file_access.do_check(
                                    separator, TAG, prefix, suffix, whitespace,
                                    http_request_method, url, vuln_parameter,
                                    alter_shell, filename, delay)
                                print ""

                        # Check if defined single cmd.
                        if menu.options.os_cmd:
                            if not menu.file_access_options():
                                print ""
                            cb_enumeration.single_os_cmd_exec(
                                separator, TAG, prefix, suffix, whitespace,
                                http_request_method, url, vuln_parameter,
                                alter_shell, filename, delay)

                        # Pseudo-Terminal shell
                        go_back = False
                        go_back_again = False
                        while True:
                            if go_back == True:
                                break
                            # if settings.ENUMERATION_DONE == False and settings.FILE_ACCESS_DONE == False:
                            #   if settings.VERBOSITY_LEVEL >= 1:
                            #     print ""
                            if not menu.options.batch:
                                question_msg = "Do you want a Pseudo-Terminal shell? [Y/n] > "
                                sys.stdout.write(
                                    settings.print_question_msg(question_msg))
                                gotshell = sys.stdin.readline().replace(
                                    "\n", "").lower()
                            else:
                                gotshell = ""
                            if len(gotshell) == 0:
                                gotshell = "y"
                            if gotshell in settings.CHOICE_YES:
                                if not menu.options.batch:
                                    print ""
                                print "Pseudo-Terminal (type '" + Style.BRIGHT + "?" + Style.RESET_ALL + "' for available options)"
                                if readline_error:
                                    checks.no_readline_module()
                                while True:
                                    try:
                                        if not readline_error:
                                            # Tab completer
                                            readline.set_completer(
                                                menu.tab_completer)
                                            # MacOSX tab completer
                                            if getattr(
                                                    readline, '__doc__', ''
                                            ) is not None and 'libedit' in getattr(
                                                    readline, '__doc__', ''):
                                                readline.parse_and_bind(
                                                    "bind ^I rl_complete")
                                            # Unix tab completer
                                            else:
                                                readline.parse_and_bind(
                                                    "tab: complete")
                                        cmd = raw_input("""commix(""" +
                                                        Style.BRIGHT +
                                                        Fore.RED +
                                                        """os_shell""" +
                                                        Style.RESET_ALL +
                                                        """) > """)
                                        cmd = checks.escaped_cmd(cmd)
                                        if cmd.lower(
                                        ) in settings.SHELL_OPTIONS:
                                            go_back, go_back_again = shell_options.check_option(
                                                separator, TAG, cmd, prefix,
                                                suffix, whitespace,
                                                http_request_method, url,
                                                vuln_parameter, alter_shell,
                                                filename, technique, go_back,
                                                no_result, delay,
                                                go_back_again)
                                            if go_back and go_back_again == False:
                                                break
                                            if go_back and go_back_again:
                                                return True
                                        else:
                                            # Command execution results.
                                            response = cb_injector.injection(
                                                separator, TAG, cmd, prefix,
                                                suffix, whitespace,
                                                http_request_method, url,
                                                vuln_parameter, alter_shell,
                                                filename)
                                            # Try target page reload (if it is required).
                                            if settings.URL_RELOAD:
                                                response = requests.url_reload(
                                                    url, delay)
                                            if menu.options.ignore_session or \
                                               session_handler.export_stored_cmd(url, cmd, vuln_parameter) == None:
                                                # Evaluate injection results.
                                                try:
                                                    shell = cb_injector.injection_results(
                                                        response, TAG, cmd)
                                                    shell = "".join(
                                                        str(p) for p in shell)
                                                except:
                                                    print ""
                                                    continue
                                                if not menu.options.ignore_session:
                                                    session_handler.store_cmd(
                                                        url, cmd, shell,
                                                        vuln_parameter)
                                            else:
                                                shell = session_handler.export_stored_cmd(
                                                    url, cmd, vuln_parameter)
                                            if shell:
                                                html_parser = HTMLParser.HTMLParser(
                                                )
                                                shell = html_parser.unescape(
                                                    shell)
                                                # Update logs with executed cmds and execution results.
                                                logs.executed_command(
                                                    filename, cmd, shell)
                                            if shell != "":
                                                if settings.VERBOSITY_LEVEL == 1:
                                                    print ""
                                                print "\n" + Fore.GREEN + Style.BRIGHT + shell + Style.RESET_ALL + "\n"
                                            else:
                                                if settings.VERBOSITY_LEVEL >= 1:
                                                    print ""
                                                err_msg = "The '" + cmd + "' command, does not return any output."
                                                print settings.print_critical_msg(
                                                    err_msg) + "\n"

                                    except KeyboardInterrupt:
                                        raise

                                    except SystemExit:
                                        raise

                            elif gotshell in settings.CHOICE_NO:
                                if checks.next_attack_vector(
                                        technique, go_back) == True:
                                    break
                                else:
                                    if no_result == True:
                                        return False
                                    else:
                                        return True

                            elif gotshell in settings.CHOICE_QUIT:
                                sys.exit(0)

                            else:
                                err_msg = "'" + gotshell + "' is not a valid answer."
                                print settings.print_error_msg(err_msg)
                                pass

    if no_result == True:
        print ""
        return False
    else:
        sys.stdout.write("\r")
        sys.stdout.flush()
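
For reference, the four nested loops above amount to one pass over itertools.product, which also makes the progress arithmetic explicit; a sketch with hypothetical stand-ins for the settings lists:

import itertools

WHITESPACE = ["%20", "+"]       # stand-in for settings.WHITESPACE
PREFIXES = ["", "'"]            # stand-in for settings.PREFIXES
SUFFIXES = ["", "#"]            # stand-in for settings.SUFFIXES
SEPARATORS = [";", "&&", "||"]  # stand-in for settings.SEPARATORS

total = len(WHITESPACE) * len(PREFIXES) * len(SUFFIXES) * len(SEPARATORS)
for i, (whitespace, prefix, suffix, separator) in enumerate(
        itertools.product(WHITESPACE, PREFIXES, SUFFIXES, SEPARATORS), 1):
    # Same percentage formula the example prints while testing.
    float_percent = "{0:.1f}".format(round((i * 100) / (total * 1.0), 2))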
Example #3
def parse(source=source):
    parser = HTMLParser.HTMLParser()
    parser.feed(source)
    parser.close()
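
The stock parser above consumes the markup but discards every event; to actually observe tags you subclass it (a Python 2 sketch using the stdlib handler names):

import HTMLParser

class LinkCollector(HTMLParser.HTMLParser):
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)  # old-style class, so no super()
        self.links = []

    def handle_starttag(self, tag, attrs):
        # attrs is a list of (name, value) pairs for the start tag.
        if tag == 'a':
            self.links.extend(value for name, value in attrs if name == 'href')

collector = LinkCollector()
collector.feed('<a href="http://example.com/">example</a>')
collector.close()
print collector.links  # ['http://example.com/']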
Example #4
from __future__ import absolute_import, division, print_function

import json
import re

import apache_beam as beam
import six

import nlp


if six.PY2:
    import HTMLParser as html_parser  # pylint:disable=g-import-not-at-top

    html_unescape = html_parser.HTMLParser().unescape
else:
    import html  # pylint:disable=g-import-not-at-top

    html_unescape = html.unescape

_CITATION = """
@article{47761,
title	= {Natural Questions: a Benchmark for Question Answering Research},
author	= {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee and Kristina N. Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le and Slav Petrov},
year	= {2019},
journal	= {Transactions of the Association of Computational Linguistics}
}
"""

_DESCRIPTION = """
Example #5
  <affected-histogram name="HistogramEnum"/>
</histogram_suffixes>

</histogram_suffixes_list>

</histogram-configuration>
"""

import bisect
import copy
import datetime
import itertools

try:
    import HTMLParser
    html = HTMLParser.HTMLParser()
except ImportError:  # For Py3 compatibility
    import html

import logging
import re
import xml.dom.minidom

BASIC_EMAIL_REGEXP = r'^[\w\-\+\%\.]+\@[\w\-\+\%\.]+$'

OWNER_PLACEHOLDER = (
    'Please list the metric\'s owners. Add more owner tags as needed.')

MAX_HISTOGRAM_SUFFIX_DEPENDENCY_DEPTH = 5

DEFAULT_BASE_HISTOGRAM_OBSOLETE_REASON = (
Example #6
def send_credit_notifications(username, course_key):
    """Sends email notification to user on different phases during credit
    course e.g., credit eligibility, credit payment etc.
    """
    try:
        user = User.objects.get(username=username)
    except User.DoesNotExist:
        log.error('No user with username %s exists', username)
        return

    course = modulestore().get_course(course_key, depth=0)
    course_display_name = course.display_name
    tracking_context = tracker.get_tracker().resolve_context()
    tracking_id = str(tracking_context.get('user_id'))
    client_id = str(tracking_context.get('client_id'))
    events = '&t=event&ec=email&ea=open'
    tracking_pixel = 'https://www.google-analytics.com/collect?v=1&tid=' + tracking_id + '&cid=' + client_id + events
    dashboard_link = _email_url_parser('dashboard')
    credit_course_link = _email_url_parser('courses', '?type=credit')

    # get attached branded logo
    logo_image = cache.get('credit.email.attached-logo')
    if logo_image is None:
        branded_logo = {
            'title': 'Logo',
            'path': settings.NOTIFICATION_EMAIL_EDX_LOGO,
            'cid': str(uuid.uuid4())
        }
        logo_image_id = branded_logo['cid']
        logo_image = attach_image(branded_logo, 'Header Logo')
        if logo_image:
            cache.set('credit.email.attached-logo', logo_image,
                      settings.CREDIT_NOTIFICATION_CACHE_TIMEOUT)
    else:
        # strip enclosing angle brackets from 'logo_image' cache 'Content-ID'
        logo_image_id = logo_image.get('Content-ID', '')[1:-1]

    providers_names = get_credit_provider_display_names(course_key)
    providers_string = make_providers_strings(providers_names)
    context = {
        'full_name': user.get_full_name(),
        'platform_name': settings.PLATFORM_NAME,
        'course_name': course_display_name,
        'branded_logo': logo_image_id,
        'dashboard_link': dashboard_link,
        'credit_course_link': credit_course_link,
        'tracking_pixel': tracking_pixel,
        'providers': providers_string,
    }

    # create the root email message
    notification_msg = MIMEMultipart('related')
    # add 'alternative' part to root email message to encapsulate the plain and
    # HTML versions, so message agents can decide which they want to display.
    msg_alternative = MIMEMultipart('alternative')
    notification_msg.attach(msg_alternative)
    # render the credit notification templates
    subject = _(u'Course Credit Eligibility')

    if providers_string:
        subject = _(u'You are eligible for credit from {providers_string}'
                    ).format(providers_string=providers_string)

    # add alternative plain text message
    email_body_plain = render_to_string(
        'credit_notifications/credit_eligibility_email.txt', context)
    msg_alternative.attach(
        SafeMIMEText(email_body_plain, _subtype='plain', _charset='utf-8'))

    # add alternative html message
    email_body_content = cache.get('credit.email.css-email-body')
    if email_body_content is None:
        html_file_path = file_path_finder(
            'templates/credit_notifications/credit_eligibility_email.html')
        if html_file_path:
            with open(html_file_path, 'r') as cur_file:
                cur_text = cur_file.read()
                # use html parser to unescape html characters which are changed
                # by the 'pynliner' while adding inline css to html content
                html_parser = HTMLParser.HTMLParser()
                email_body_content = html_parser.unescape(
                    with_inline_css(cur_text))
                # cache the email body content before rendering it since the
                # email context will change for each user e.g., 'full_name'
                cache.set('credit.email.css-email-body', email_body_content,
                          settings.CREDIT_NOTIFICATION_CACHE_TIMEOUT)
        else:
            email_body_content = ''

    email_body = Template(email_body_content).render([context])
    msg_alternative.attach(
        SafeMIMEText(email_body, _subtype='html', _charset='utf-8'))

    # attach logo image
    if logo_image:
        notification_msg.attach(logo_image)

    # add email addresses of sender and receiver
    from_address = microsite.get_value('default_from_email',
                                       settings.DEFAULT_FROM_EMAIL)
    to_address = user.email

    # send the root email message
    msg = EmailMessage(subject, None, from_address, [to_address])
    msg.attach(notification_msg)
    msg.send()
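
The assembled message follows the standard layout for HTML mail with inline images; a stripped-down sketch using only the standard library (Django's SafeMIMEText swapped for email.mime.text.MIMEText purely for illustration):

from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

root = MIMEMultipart('related')             # HTML body + inline logo travel together
alternative = MIMEMultipart('alternative')  # plain and HTML variants of the body
root.attach(alternative)
alternative.attach(MIMEText('plain-text version', 'plain', 'utf-8'))
alternative.attach(MIMEText('<b>HTML version</b>', 'html', 'utf-8'))
# The logo would be attached to `root` as an image part whose Content-ID
# matches the cid: reference inside the HTML body.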
Example #7
    return dict((r.review_id, r) for r in qs)


def _retrieve_translation(text, language):
    try:
        r = requests.get(settings.GOOGLE_TRANSLATE_API_URL,
                         params={
                             'key': getattr(settings, 'GOOGLE_API_CREDENTIALS',
                                            ''),
                             'q': text,
                             'target': language
                         })
    except Exception, e:
        log.error(e)
    try:
        translated = (HTMLParser.HTMLParser().unescape(
            r.json()['data']['translations'][0]['translatedText']))
    except (KeyError, IndexError):
        translated = ''
    return translated, r


@addon_view
@waffle_switch('reviews-translate')
@non_atomic_requests
def translate(request, addon, review_id, language):
    """
    Use the Google Translate API for ajax, redirect to Google Translate for
    non ajax calls.
    """
    review = get_object_or_404(Review, pk=review_id, addon=addon)
    if '-' in language:
Example #8
        counter = 0
        for day in sDict.values():
            for show in day:
                counter += 1
                print str(counter) + ". " + show.format_output()


def printHTMLTable(sDict):
    if isinstance(sDict, OrderedDefaultdict):
        for day, shows in sDict.items():
            print "<p>&nbsp;</p>\n"
            print "<p><span style=\"font-family:times new roman,times,serif;\"><strong>" + day + "</strong></span></p>\n"
            print "<p>&nbsp;</p>\n"
            print "<table border=\"0\" cellpadding=\"0\" cellspacing=\"0\" dir=\"ltr\" style=\"width:754px;\" width=\"754\">\n<colgroup>\n<col /><col /><col /><col /></colgroup>\n<tbody>\n<tr>\n<td height=\"16\" style=\"height:21px;width:75px; padding-left:2px; padding-right:2px;\"><strong>Time</strong></td>\n<td style=\"width:225px;\"><strong>Show Name</strong></td>\n<td style=\"width:225px;\"><strong>Genre</strong></td>\n<td style=\"width:225px;\"><strong>Tagline</strong></td>\n</tr>\"\n"
            for show in shows:
                print show.format_html_output()
            print "</tbody>\n</table>"
            print "<p>&nbsp;</p>\n"


#Main
f = open(filename)
for line in f:
    lineAsString = line.rstrip()
    lineAsString = HTMLParser.HTMLParser().unescape(lineAsString)
    if parseString(lineAsString) is not None:
        infoOrg(parseString(lineAsString))
#printNumberedList(schedule)
printHTMLTable(schedule)
f.close()
Example #9
def __init__(self):
    super(ToutiaoExtractor, self).__init__()
    self.html_parser = HTMLParser.HTMLParser()
Example #10
def unescape(text):
    return HTMLParser.HTMLParser().unescape(text)
Example #11
def unescape(string):
    html_parser = HTMLParser.HTMLParser()
    return html_parser.unescape(string)
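
Both wrappers assume Python 2. On Python 3 the module was renamed and unescape() moved to the html module (since 3.4), so a version-agnostic variant in the spirit of Example #4's shim looks like:

try:
    from HTMLParser import HTMLParser  # Python 2
    _unescape = HTMLParser().unescape
except ImportError:
    from html import unescape as _unescape  # Python 3.4+

def unescape(text):
    return _unescape(text)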
Example #12
def render_comment_plain(comment, context):
    parser = HTMLParser.HTMLParser()
    chunks = list(get_file_chunks_in_range_custom(
        context,
        comment.filediff,
        comment.interfilediff,
        comment.first_line,
        comment.num_lines))

    lines = [
        "::: %s" % comment.filediff.dest_file_display,
    ]

    if comment.interfilediff:
        lines.append(
            "(Diff revisions %s - %s)" % (
                comment.filediff.diffset.revision,
                comment.interfilediff.diffset.revision))
    else:
        lines.append(
            "(Diff revision %s)" % comment.filediff.diffset.revision)

    for chunk in chunks:
        if chunk['change'] == "equal":
            lines.extend(render_equal_chunk(chunk, parser))
        elif chunk['change'] == "insert":
            for line in chunk['lines']:
                lines.append("> +%s" % parser.unescape(line[5]))
        elif chunk['change'] == "delete":
            for line in chunk['lines']:
                lines.append("> -%s" % parser.unescape(line[2]))
        elif chunk['change'] == "replace":
            for line in chunk['lines']:
                lines.append("> -%s" % parser.unescape(line[2]))
            for line in chunk['lines']:
                lines.append("> +%s" % parser.unescape(line[5]))

    lines.append("")

    comments = []
    c = comment
    depth = 0

    while True:
        if depth:
            prefix = '%s ' % ('>' * depth,)
        else:
            prefix = ''

        comments.append("%s%s" % (prefix, c))

        if c.reply_to:
            c = c.reply_to
            depth += 1
        else:
            break

    comments.reverse()

    lines.extend(comments)

    return "\n".join(lines)
Example #13
def filter_cases(request, domain, app_id, module_id):
    app = Application.get(app_id)
    module = app.get_module(module_id)
    delegation = request.GET.get('task-list') == 'true'
    auth_cookie = request.COOKIES.get('sessionid')

    suite_gen = SuiteGenerator(app)
    xpath = suite_gen.get_filter_xpath(module, delegation=delegation)
    extra_instances = [{
        'id': inst.id,
        'src': inst.src
    } for inst in suite_gen.get_instances_for_module(
        module, additional_xpaths=[xpath])]

    # touchforms doesn't like this to be escaped
    xpath = HTMLParser.HTMLParser().unescape(xpath)
    if delegation:
        case_type = DELEGATION_STUB_CASE_TYPE
    else:
        case_type = module.case_type

    if xpath:
        # if we need to do a custom filter, send it to touchforms for processing
        additional_filters = {
            "properties/case_type": case_type,
            "footprint": True
        }

        helper = SessionDataHelper(domain, request.couch_user)
        result = helper.filter_cases(xpath,
                                     additional_filters,
                                     DjangoAuth(auth_cookie),
                                     extra_instances=extra_instances)
        if result.get('status', None) == 'error':
            return HttpResponseServerError(
                result.get("message",
                           _("Something went wrong filtering your cases.")))

        case_ids = result.get("cases", [])
    else:
        # otherwise just use our built in api with the defaults
        case_ids = [
            res.id
            for res in get_filtered_cases(domain,
                                          status=CASE_STATUS_OPEN,
                                          case_type=case_type,
                                          user_id=request.couch_user._id,
                                          ids_only=True)
        ]

    cases = [
        CommCareCase.wrap(doc)
        for doc in iter_docs(CommCareCase.get_db(), case_ids)
    ]
    # refilter these because we might have accidentally included footprint cases
    # in the results from touchforms. this is a little hacky but the easiest
    # (quick) workaround. should be revisited when we optimize the case list.
    cases = filter(lambda c: c.type == case_type, cases)
    cases = [c.get_json(lite=True) for c in cases if c]
    parents = []
    if delegation:
        for case in cases:
            parent_id = case['indices']['parent']['case_id']
            parents.append(CommCareCase.get(parent_id))
        return json_response({'cases': cases, 'parents': parents})
    else:
        return json_response(cases)
Example #14
def parse_patents(fd, fd2):
    import re, csv, os, codecs, zipfile, traceback
    import string, random, HTMLParser

    def id_generator(size=25, chars=string.ascii_lowercase + string.digits):
        return ''.join(random.choice(chars) for _ in range(size))

    type_kind = {
        '1': ["A", "utility"],
        '2': ["E", "reissue"],
        '3': ["I5", "TVPP"],
        '4': ["S", "design"],
        '5': ["I4", "defensive publication"],
        '6': ["P", "plant"],
        '7': ["H", "statutory invention registration"]
    }

    reldoctype = [
        'continuation-in-part', 'continuation_in_part', 'continuing_reissue',
        'division', 'reissue', 'related_publication', 'substitution',
        'us_provisional_application', 'us_reexamination_reissue_merger',
        'continuation'
    ]

    fd += '/'
    fd2 += '/'
    diri = os.listdir(fd)
    diri = [d for d in diri if d.endswith('zip')]

    # Initialize HTML parser for unescaping characters
    h = HTMLParser.HTMLParser()

    #Remove all files from output dir before writing
    outdir = os.listdir(fd2)
    for oo in outdir:
        os.remove(os.path.join(fd2, oo))

    det_desc_textfile = open(os.path.join(fd2, 'detail_desc_text.csv'), 'wb')
    det_desc_textfile.write(codecs.BOM_UTF8)
    det_desc = csv.writer(det_desc_textfile, delimiter='\t')
    det_desc.writerow(['uuid', 'patent_id', 'text', 'length'])

    det_desc_textfile.close()

    loggroups = [
        'PATN', 'INVT', 'ASSG', 'PRIR', 'REIS', 'RLAP', 'CLAS', 'UREF', 'FREF',
        'OREF', 'LREP', 'PCTA', 'ABST', 'GOVT', 'PARN', 'BSUM', 'DRWD', 'DETD',
        'CLMS', 'DCLM'
    ]

    numii = 0
    rawlocation = {}
    mainclassdata = {}
    subclassdata = {}

    for d in diri:
        print d
        inp = zipfile.ZipFile(os.path.join(fd, d))
        for i in inp.namelist():
            infile = h.unescape(
                inp.open(i).read().decode('utf-8', 'ignore').replace(
                    '&angst', '&aring')).replace("\r", "").split('PATN')
            del infile[0]

        for i in infile:
            numii += 1
            i = i.encode('utf-8', 'ignore')
            # Get relevant logical groups from patent records according to documentation
            # Some patents can contain several INVT, ASSG and other logical groups - so it is important to retain all
            avail_fields = {}
            num = 1
            avail_fields['PATN'] = i.split('INVT')[0]
            runnums = []
            for n in range(1, len(loggroups)):
                try:
                    gg = re.search('\n' + loggroups[n], i).group()
                    if num - n == 0:
                        runnums.append(n)
                        num += 1
                        go = list(re.finditer('\n' + loggroups[n - 1], i))
                        if len(go) == 1:
                            needed = i.split(loggroups[n - 1])[1]
                            avail_fields[loggroups[n - 1]] = needed.split(
                                loggroups[n])[0]
                        elif len(go) > 1:
                            needed = '\n\n\n\n\n'.join(
                                i.split(loggroups[n - 1])[1:])
                            avail_fields[loggroups[n - 1]] = needed.split(
                                loggroups[n])[0]
                        else:
                            pass
                    else:
                        go = list(re.finditer('\n' + loggroups[runnums[-1]],
                                              i))
                        if len(go) == 1:
                            needed = i.split(loggroups[runnums[-1]])[1]
                            avail_fields[loggroups[
                                runnums[-1]]] = needed.split(loggroups[n])[0]
                        elif len(go) > 1:
                            needed = '\n\n\n\n\n'.join(
                                i.split(loggroups[runnums[-1]])[1:])
                            avail_fields[loggroups[
                                runnums[-1]]] = needed.split(loggroups[n])[0]
                        else:
                            pass
                        runnums.append(n)
                        num = n + 1

                except:
                    pass
            # Create containers based on existing Berkeley DB schema (not all are currently used - possible compatibility issues)
            application = {}
            claimsdata = {}
            examiner = {}
            foreigncitation = {}
            ipcr = {}
            otherreference = {}
            patentdata = {}
            pctdata = {}
            prioritydata = {}
            rawassignee = {}
            rawinventor = {}
            rawlawyer = {}
            usappcitation = {}
            uspatentcitation = {}
            uspc = {}
            usreldoc = {}
            figureinfo = {}
            termofgrant = {}
            drawdescdata = {}
            relappdata = {}

            ###                PARSERS FOR LOGICAL GROUPS                  ###

            try:
                numfigs = ''
                numsheets = ''
                disclaimerdate = ''
                termpat = ''
                patent = avail_fields['PATN'].split('\n')
                for line in patent:
                    if line.startswith("WKU"):
                        patnum = re.search('WKU\s+(.*?)$', line).group(1)
                        updnum = re.sub('^H0', 'H', patnum)[:8]
                        updnum = re.sub('^RE0', 'RE', updnum)[:8]
                        updnum = re.sub('^PP0', 'PP', updnum)[:8]
                        updnum = re.sub('^D0', 'D', updnum)[:8]
                        updnum = re.sub('^T0', 'T', updnum)[:8]
                        if len(patnum) > 7 and patnum.startswith('0'):
                            updnum = patnum[1:8]
                        #data['patnum'] = updnum
                        #print updnum
                        patent_id = updnum
                    if line.startswith('SRC'):
                        seriescode = re.search('SRC\s+(.*?)$', line).group(1)
                        try:
                            gg = int(seriescode)
                            if len(seriescode) == 1:
                                seriescode = '0' + seriescode
                        except:
                            pass
                    if line.startswith('APN'):
                        appnum = re.search('APN\s+(.*?)$', line).group(1)[:6]
                        if len(appnum) != 6:
                            appnum = 'NULL'
                            #data['appnum'] = appnum
                    if line.startswith('APT'):
                        apptype = re.search('APT\s+(.*?)$', line).group(1)
                        apptype = re.search('\d', apptype).group()
                    if line.startswith('APD'):
                        appdate = re.search('APD\s+(.*?)$', line).group(1)
                        appdate = appdate[:4] + '-' + appdate[
                            4:6] + '-' + appdate[6:]
                        #print appdate
                    if line.startswith('TTL'):
                        title = re.search('TTL\s+(.*?)ISD',
                                          avail_fields['PATN'],
                                          re.DOTALL).group(1)
                        title = re.sub('[\n\t\r\f]+', '', title)
                        title = re.sub('\s+$', '', title)
                        title = re.sub('\s+', ' ', title)
                    if line.startswith('ISD'):
                        issdate = re.search('ISD\s+(.*?)$', line).group(1)
                        if issdate[6:] == "00":
                            day = '01'
                        else:
                            day = issdate[6:]
                        if issdate[4:6] == "00":
                            month = '01'
                        else:
                            month = issdate[4:6]
                        year = issdate[:4]
                        issdate = year + '-' + month + '-' + day
                        #print issdate
                    if line.startswith("NCL"):
                        numclaims = re.search('NCL\s+(.*?)$', line).group(1)

                    #Figure and sheet info
                    if line.startswith('NDR'):
                        numsheets = re.search('NDR\s+(.*?)$', line).group(1)
                    if line.startswith('NFG'):
                        numfigs = re.search('NFG\s+(.*?)$', line).group(1)

                    #U.S. term of grant
                    if line.startswith('TRM'):
                        termpat = re.sub(
                            '[\n\t\r\f]+', '',
                            re.search('TRM\s+(.*?)$', line).group(1))
                    if line.startswith('DCD'):
                        disclaimerdate = re.sub(
                            '[\n\t\r\f]+', '',
                            re.search('DCD\s+(.*?)$', line).group(1))
                        disclaimerdate = disclaimerdate[:4] + '-' + disclaimerdate[
                            4:6] + '-' + disclaimerdate[6:]

                    # Examiner info
                    sequence = 0
                    if line.startswith("EXA"):
                        sequence += 1
                        assistexam = re.search('EXA\s+(.*?)$',
                                               line).group(1).split("; ")
                        assistexamfname = assistexam[1]
                        assistexamlname = assistexam[0]
                        examiner[id_generator()] = [
                            patent_id, assistexamfname, assistexamlname,
                            "assistant", "NULL"
                        ]
                    if line.startswith("EXP"):
                        sequence += 1
                        primexam = re.search('EXP\s+(.*?)$',
                                             line).group(1).split("; ")
                        primexamfname = primexam[1]
                        primexamlname = primexam[0]
                        examiner[id_generator()] = [
                            patent_id, primexamfname, primexamlname, "primary",
                            "NULL"
                        ]
                    if line.startswith("ECL"):
                        exemplary = re.search('ECL\s+(.*?)$', line).group(1)
                        exemplary_list = exemplary.split(",")
            except:
                pass

            patent_id = updnum

            # Detail description
            detdesc = 'NULL'
            try:
                detdesc = re.sub('PAR\s+', ' ', avail_fields['DETD'])
                detdesc = re.sub('PAC\s+', ' ', detdesc)
                detdesc = re.sub('PA\d+\s+', ' ', detdesc)
                detdesc = re.sub('TBL\s+', '', detdesc)
                detdesc = re.sub('\s+', ' ', detdesc)
            except:
                pass

            det_desc_textfile = csv.writer(open(
                os.path.join(fd2, 'detail_desc_text.csv'), 'ab'),
                                           delimiter='\t')
            det_desc_textfile.writerow(
                [id_generator(), patent_id, detdesc,
                 len(detdesc)])
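
A hypothetical invocation (both paths are assumptions): the first argument is a directory holding the zipped full-text patent records, the second an output directory that receives the tab-separated detail_desc_text.csv:

parse_patents('/data/patent_zips', '/data/parsed_output')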
Example #15
def html_unescape(value):
    h = HTMLParser.HTMLParser()
    return h.unescape(value)
Example #16
def getRegexParsed(
        regexs, url, cookieJar=None, forCookieJarOnly=False, recursiveCall=False, cachedPages={},
        rawPost=False, cookie_jar_file=None):
    # cachedPages = {}
    # print 'url',url
    doRegexs = re.compile('\$doregex\[([^\]]*)\]').findall(url)
    setresolved = True
    for k in doRegexs:
        if k in regexs:
                # print 'processing ' ,k
            m = regexs[k]
            # print m
            cookieJarParam = False
            if 'cookiejar' in m:  # so either create or reuse existing jar
                    # print 'cookiejar exists',m['cookiejar']
                cookieJarParam = m['cookiejar']
                if '$doregex' in cookieJarParam:
                    cookieJar = getRegexParsed(regexs, m['cookiejar'], cookieJar, True, True, cachedPages)
                    cookieJarParam = True
                else:
                    cookieJarParam = True
            # print 'm[cookiejar]',m['cookiejar'],cookieJar
            if cookieJarParam:
                if cookieJar is None:
                    # print 'create cookie jar'
                    cookie_jar_file = None
                    if 'open[' in m['cookiejar']:
                        cookie_jar_file = m['cookiejar'].split('open[')[1].split(']')[0]

                    cookieJar = getCookieJar(cookie_jar_file)
                    if cookie_jar_file:
                        saveCookieJar(cookieJar, cookie_jar_file)
                elif 'save[' in m['cookiejar']:
                    cookie_jar_file = m['cookiejar'].split('save[')[1].split(']')[0]
                    saveCookieJar(cookieJar, cookie_jar_file)

            if m['page'] and '$doregex' in m['page']:
                pg = getRegexParsed(regexs, m['page'], cookieJar, recursiveCall=True, cachedPages=cachedPages)
                if len(pg) == 0:
                    pg = 'http://regexfailed'
                m['page'] = pg

            if 'setcookie' in m and m['setcookie'] and '$doregex' in m['setcookie']:
                m['setcookie'] = getRegexParsed(regexs, m['setcookie'],
                                                cookieJar, recursiveCall=True, cachedPages=cachedPages)
            if 'appendcookie' in m and m['appendcookie'] and '$doregex' in m['appendcookie']:
                m['appendcookie'] = getRegexParsed(regexs, m['appendcookie'],
                                                   cookieJar, recursiveCall=True, cachedPages=cachedPages)

            if 'post' in m and '$doregex' in m['post']:
                m['post'] = getRegexParsed(regexs, m['post'], cookieJar, recursiveCall=True, cachedPages=cachedPages)

            if 'rawpost' in m and '$doregex' in m['rawpost']:
                m['rawpost'] = getRegexParsed(regexs, m['rawpost'],
                                              cookieJar, recursiveCall=True, cachedPages=cachedPages, rawPost=True)
                # print 'rawpost is now',m['rawpost']

            if 'rawpost' in m and '$epoctime$' in m['rawpost']:
                m['rawpost'] = m['rawpost'].replace('$epoctime$', getEpocTime())

            if 'rawpost' in m and '$epoctime2$' in m['rawpost']:
                m['rawpost'] = m['rawpost'].replace('$epoctime2$', getEpocTime2())

            link = ''
            if m['page'] and m['page'] in cachedPages and 'ignorecache' not in m and forCookieJarOnly is False:
                # print 'using cache page',m['page']
                link = cachedPages[m['page']]
            else:
                if m['page'] and not m['page'] == '' and m['page'].startswith('http'):
                    if '$epoctime$' in m['page']:
                        m['page'] = m['page'].replace('$epoctime$', getEpocTime())
                    if '$epoctime2$' in m['page']:
                        m['page'] = m['page'].replace('$epoctime2$', getEpocTime2())

                    # print 'Ignoring Cache',m['page']
                    page_split = m['page'].split('|')
                    pageUrl = page_split[0]
                    header_in_page = None
                    if len(page_split) > 1:
                        header_in_page = page_split[1]

                    current_proxies = urllib2.ProxyHandler(urllib2.getproxies())

                    # print 'getting pageUrl',pageUrl
                    req = urllib2.Request(pageUrl)
                    if 'proxy' in m:
                        proxytouse = m['proxy']

                        if pageUrl[:5] == "https":
                            proxy = urllib2.ProxyHandler({'https': proxytouse})
                        else:
                            proxy = urllib2.ProxyHandler({'http': proxytouse})
                        opener = urllib2.build_opener(proxy)
                        urllib2.install_opener(opener)

                    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/14.0.1')
                    proxytouse = None

                    if 'referer' in m:
                        req.add_header('Referer', m['referer'])
                    if 'accept' in m:
                        req.add_header('Accept', m['accept'])
                    if 'agent' in m:
                        req.add_header('User-agent', m['agent'])
                    if 'x-req' in m:
                        req.add_header('X-Requested-With', m['x-req'])
                    if 'x-addr' in m:
                        req.add_header('x-addr', m['x-addr'])
                    if 'x-forward' in m:
                        req.add_header('X-Forwarded-For', m['x-forward'])
                    if 'setcookie' in m:
                        #                            print 'adding cookie',m['setcookie']
                        req.add_header('Cookie', m['setcookie'])
                    if 'appendcookie' in m:
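                        # each ';'-separated entry is expected as 'domain:name=value'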
                        #                            print 'appending cookie to cookiejar',m['appendcookie']
                        cookiestoAppend = m['appendcookie']
                        cookiestoAppend = cookiestoAppend.split(';')
                        for h in cookiestoAppend:
                            n, v = h.split('=', 1)
                            w, n = n.split(':', 1)
                            ck = cookielib.Cookie(
                                version=0, name=n, value=v, port=None, port_specified=False, domain=w,
                                domain_specified=False, domain_initial_dot=False, path='/', path_specified=True,
                                secure=False, expires=None, discard=True, comment=None, comment_url=None,
                                rest={'HttpOnly': None},
                                rfc2109=False)
                            cookieJar.set_cookie(ck)
                    if 'origin' in m:
                        req.add_header('Origin', m['origin'])
                    if header_in_page:
                        header_in_page = header_in_page.split('&')
                        for h in header_in_page:
                            n, v = h.split('=')
                            req.add_header(n, v)

                    if cookieJar is not None:
                        #                            print 'cookieJarVal',cookieJar
                        cookie_handler = urllib2.HTTPCookieProcessor(cookieJar)
                        opener = urllib2.build_opener(
                            cookie_handler, urllib2.HTTPBasicAuthHandler(),
                            urllib2.HTTPHandler())
                        urllib2.install_opener(opener)
#                            print 'noredirect','noredirect' in m

                        if 'noredirect' in m:
                            opener = urllib2.build_opener(
                                cookie_handler, NoRedirection, urllib2.HTTPBasicAuthHandler(),
                                urllib2.HTTPHandler())
                            urllib2.install_opener(opener)
                    elif 'noredirect' in m:
                        opener = urllib2.build_opener(
                            NoRedirection, urllib2.HTTPBasicAuthHandler(),
                            urllib2.HTTPHandler())
                        urllib2.install_opener(opener)

                    if 'connection' in m:
                        #                            print '..........................connection//////.',m['connection']
                        from keepalive import HTTPHandler
                        keepalive_handler = HTTPHandler()
                        opener = urllib2.build_opener(keepalive_handler)
                        urllib2.install_opener(opener)

                    # print 'after cookie jar'
                    post = None

                    if 'post' in m:
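                        # 'post' is parsed as comma-separated 'name:value' pairs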
                        postData = m['post']
                        # if '$LiveStreamRecaptcha' in postData:
                        #    (captcha_challenge,catpcha_word,idfield)=processRecaptcha(m['page'],cookieJar)
                        #    if captcha_challenge:
                        #        postData=postData.replace('$LiveStreamRecaptcha','manual_recaptcha_challenge_field:'+captcha_challenge+',recaptcha_response_field:'+catpcha_word+',id:'+idfield)
                        splitpost = postData.split(',')
                        post = {}
                        for p in splitpost:
                            n, v = p.split(':', 1)
                            post[n] = v
                        post = urllib.urlencode(post)

                    if 'rawpost' in m:
                        post = m['rawpost']
                    link = ''
                    try:

                        if post:
                            response = urllib2.urlopen(req, post)
                        else:
                            response = urllib2.urlopen(req)
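                        # transparently decompress gzip-encoded responses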
                        if response.info().get('Content-Encoding') == 'gzip':
                            from StringIO import StringIO
                            import gzip
                            buf = StringIO(response.read())
                            f = gzip.GzipFile(fileobj=buf)
                            link = f.read()
                        else:
                            link = response.read()

                        if 'proxy' in m and current_proxies is not None:
                            urllib2.install_opener(urllib2.build_opener(current_proxies))

                        link = javascriptUnEscape(link)
                        # print repr(link)
                        # print link  (would dump the whole webpage into the log)
                        if 'includeheaders' in m:
                            # link+=str(response.headers.get('Set-Cookie'))
                            link += '$$HEADERS_START$$:'
                            for b in response.headers:
                                link += b+':'+response.headers.get(b)+'\n'
                            link += '$$HEADERS_END$$:'
    #                        print link

                        response.close()
                    except Exception:
                        pass
                    cachedPages[m['page']] = link
                    # print link
                    # print 'store link for',m['page'],forCookieJarOnly

                    if forCookieJarOnly:
                        return cookieJar  # do nothing
                elif m['page'] and not m['page'].startswith('http'):
                    if m['page'].startswith('$pyFunction:'):
                        val = doEval(m['page'].split('$pyFunction:')[1], '', cookieJar, m)
                        if forCookieJarOnly:
                            return cookieJar  # do nothing
                        link = val
                        link = javascriptUnEscape(link)
                    else:
                        link = m['page']

            if '$doregex' in m['expres']:
                m['expres'] = getRegexParsed(regexs, m['expres'],
                                             cookieJar, recursiveCall=True, cachedPages=cachedPages)

            if not m['expres'] == '':
                # print 'doing it ',m['expres']
                if '$LiveStreamCaptcha' in m['expres']:
                    val = askCaptcha(m, link, cookieJar)
                    # print 'url and val',url,val
                    url = url.replace("$doregex[" + k + "]", val)

                elif m['expres'].startswith('$pyFunction:') or '#$pyFunction' in m['expres']:
                    # print 'expeeeeeeeeeeeeeeeeeee',m['expres']
                    val = ''
                    if m['expres'].startswith('$pyFunction:'):
                        val = doEval(m['expres'].split('$pyFunction:')[1], link, cookieJar, m)
                    else:
                        val = doEvalFunction(m['expres'], link, cookieJar, m)
                    if 'ActivateWindow' in m['expres']:
                        return
                    if forCookieJarOnly:
                        return cookieJar  # do nothing
                    if 'listrepeat' in m:
                        listrepeat = m['listrepeat']
                        return listrepeat, eval(val), m, regexs, cookieJar

                    try:
                        url = url.replace(u"$doregex[" + k + "]", val)
                    except Exception:
                        url = url.replace("$doregex[" + k + "]", val.decode("utf-8"))
                else:
                    if 'listrepeat' in m:
                        listrepeat = m['listrepeat']
                        ret = re.findall(m['expres'], link)
                        return listrepeat, ret, m, regexs

                    val = ''
                    if not link == '':
                        # print 'link',link
                        reg = re.compile(m['expres']).search(link)
                        try:
                            val = reg.group(1).strip()
                        except Exception:
                            traceback.print_exc()
                    elif m['page'] == '' or m['page'] is None:
                        val = m['expres']

                    if rawPost:
                        #                            print 'rawpost'
                        val = urllib.quote_plus(val)
                    if 'htmlunescape' in m:
                        # val=urllib.unquote_plus(val)
                        import HTMLParser
                        val = HTMLParser.HTMLParser().unescape(val)
                    try:
                        url = url.replace("$doregex[" + k + "]", val)
                    except Exception:
                        url = url.replace("$doregex[" + k + "]", val.decode("utf-8"))
                    # print 'ur',url
                    # return val
            else:
                url = url.replace("$doregex[" + k + "]", '')
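    # substitute runtime tokens ($epoctime$, $GUID$, $get_cookies$) in the finished url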
    if '$epoctime$' in url:
        url = url.replace('$epoctime$', getEpocTime())
    if '$epoctime2$' in url:
        url = url.replace('$epoctime2$', getEpocTime2())

    if '$GUID$' in url:
        import uuid
        url = url.replace('$GUID$', str(uuid.uuid1()).upper())
    if '$get_cookies$' in url:
        url = url.replace('$get_cookies$', getCookiesString(cookieJar))

    if recursiveCall:
        return url
    # print 'final url',repr(url)
    if url == "":
        return
    else:
        return url, setresolved
Example n. 17
0
#!/usr/bin/python
# -*- coding: utf-8 -*-
import urllib,urllib2,re,sys,xbmcaddon,xbmcplugin,xbmcgui,xbmc,HTMLParser

htmlparser = HTMLParser.HTMLParser()
pluginhandle = int(sys.argv[1])
itemcnt = 0
baseurl = 'http://www.gamestar.de'
channelurl = 'http://www.gamestar.de/videos/video-kanaele/'
getvideourl = 'http://www.gamestar.de/_misc/videos/portal/getVideoUrl.cfm?premium=0&videoId='
googleresize = 'http://images1-focus-opensocial.googleusercontent.com/gadgets/proxy?container=focus&url='
settings = xbmcaddon.Addon(id='plugin.video.gamestar_ll')
maxitems = (int(settings.getSetting("items_per_page"))+1)*10
forceMovieViewMode = settings.getSetting("forceMovieViewMode") == 'true'
useThumbAsFanart = settings.getSetting("useThumbAsFanart") == 'true'
hirespix = settings.getSetting("useHighresPix") == 'true'
movieViewMode = str(settings.getSetting("movieViewMode"))

premium = False
dbg = False

cats = [
	('http://www.gamestar.de/videos/latest/','Neueste Videos','',''),
	('http://www.gamestar.de/videos/popular/','Meist gesehen','',''),
	('http://www.gamestar.de/videos/news,100/','News','Von Montag bis Freitag immer mittags berichten wir in unserer News-Show über die wichtigsten Spiele-Themen des Tages.','http://images.gamestar.de/images/idgwpgsgp/bdb/2558457/b144x81.jpg'),
	('http://www.gamestar.de/videos/was-ist-,96/','Was ist ...?','In »Was ist…?« präsentieren wir Indie-Hits, Geheimtipps und andere Spiele-Kleinode mit kommentierten Spielszenen.',''),
	('http://www.gamestar.de/videos/feedback,99/','Feedback','In Feedback beantwortet unser Team regelmäßig Fragen der Community und plaudert mit Moderator Andre Peschke aus dem Nähkästchen.',''),
	('http://www.gamestar.de/videos/kino-und-dvd,26/','Kino und DVD','Aktuelle Trailer zu Kinofilmen und DVD-Neuerscheinungen.','http://images.gamestar.de/images/idgwpgsgp/bdb/2334506/b144x81.jpg'),
	('http://www.gamestar.de/videos/gamewatch,97/','Gamewatch','Neue Trailer, Gameplay-Videos oder Live-Demos.',''),
	('http://www.gamestar.de/videos/public-viewing,37/','Public Viewing','Neue Spiele ausführlich angespielt und vorgestellt','http://images.gamestar.de/images/idgwpgsgp/bdb/2121485/b144x81.jpg'),
	('http://www.gamestar.de/index.cfm?pid=1589&ci=9','Quickplay','Alle Trailer aus dem Action-Genre mit den Unterrubriken Ego-Shooter, Action-Adventures, Flugsimulationen und anderen.','http://images.gamestar.de/images/idgwpgsgp/bdb/2016676/b144x81.jpg'),
Example n. 18
0
def gen_quotes(category, title):
    #print "%s %s %s" % ("=" * 30, "quotes", "=" * 30)
    keyword = ""
    '''delete punctuation in title'''
    exclude = set(string.punctuation)
    title = ''.join(ch for ch in title if ch not in exclude)
    # print "%s %s %s" % ("="*30, "title without punctuation", "="*30)
    # print title
    '''generate the list of keywords from title'''
    keywords = alchemy_keywords(title)
    if len(keywords) <= 1:
        #print "keywords not good from alchemyapi"
        keywords = [
            word for word in title.lower().split()
            if word not in stopwords.words('english')
        ]
    elif len(keywords) > 3:
        keywords = keywords[:2]
        #print "shorten the keywords: " + keywords

    web_url = ""
    while web_url == "":
        keywords_str = ' '.join(keywords)
        query1 = category + " " + keywords_str + " site:brainyquote.com"
        query2 = urllib.urlencode({'q': query1})
        response = urllib.urlopen(
            'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' +
            query2).read()
        json = m_json.loads(response)
        '''in case we hit the limit of searching'''
        try:
            results = json['responseData']['results']
            '''too many keywords for finding a good result'''
            try:
                #print "Query tried: ", query1
                web_url = results[0]["url"]
                #print web_url
                contents = results[0]['content'].split("...")
                contents = filter(None, contents)
                h = HTMLParser.HTMLParser()
                if len(contents) == 1:
                    index = 0
                else:
                    index = 1
                target_content = h.unescape(contents[index])
                target_content = BeautifulSoup(target_content).text
                target_content = " ".join(target_content.split())
            except Exception:
                #print "-----broke"
                del keywords[-1]
                '''lower the quality of target_content in except'''
                target_content = category
        except Exception:
            #print "query no result: " + query1
            result_urls = search(query1, num=10, pause=1.0)
            urls_list = [link for (num, link) in list(enumerate(result_urls))]
            web_url = urls_list[0]
            #print web_url
            target_content = category

    #print "target_content:" + target_content
    '''start to fetch quotes from the right link'''
    success, quote, author = get_quote(web_url, target_content)
    '''if can't find one quote with target_content which happens in two cases: 1. the quote is in a later page. 2. the target_content is not right'''
    if success == -1:
        #first case: do another search with target_content
        requery = target_content + " site:brainyquote.com"
        #print "requery: " + requery
        #print "target_content: " + target_content
        quotes_urls = search(requery, num=20, pause=2.0)
        urls_list = [link for (num, link) in list(enumerate(quotes_urls))]
        quote_url = urls_list[0]
        success, quote, author = get_quote(quote_url, target_content)
        #second case: pick the first quote

    return quote, author
Example n. 19
0
import ast
import HTMLParser
import re
import sys
import requests
import time
import string

API_URL = "https://api.stackexchange.com"
HTML_PARSER = HTMLParser.HTMLParser()
SEPARATOR_1 = "===================================================="
SEPARATOR_2 = "----------------------------------------------------"
SEPARATOR_3 = "####################################################"

def get_segment(input_string, starting_index=0, ending_index=0, beginning_token="(", ending_token=")", escape=True):
    if beginning_token == ending_token or starting_index >= len(input_string)-1 or starting_index < 0:
        return ""
    if ending_index <= starting_index or ending_index >= len(input_string):
        ending_index = len(input_string)    
    if ending_index <= 0:
        return ""
    
    s = []
    output_starting_index = starting_index
    output_ending_index = ending_index
    found_first_beginning_token = False
    for match_object in re.finditer(r'({})|({})'.format(re.escape(beginning_token), re.escape(ending_token)), input_string[starting_index:ending_index]):
        token = match_object.group(0)
        if token == beginning_token:
            if not found_first_beginning_token:
                output_starting_index = match_object.start()
Example n. 20
0
def edit(request, quoteId=1):

    # Quote editing/modification logic
    if (request.method == 'POST') and ('delete' in request.POST):

        # --- Handle delete requests ---
        quote_in_question = get_object_or_404(Quote, pk=quoteId)
        quote_in_question.delete()

        return redirect('/intranet/quote/')

    elif (request.method == 'POST'):

        # --- Handle save requests (from edit form to quote list) ---
        quote_in_question = get_object_or_404(Quote, pk=quoteId)

        # Add current user to _posters list, if necessary
        if not ("," + request.user.username +
                ",") in quote_in_question.quote_posters:

            # Strip is used to provide backwards compatibility with old quotes
            quote_in_question.quote_posters = "," + quote_in_question.quote_posters.strip(
                ",") + "," + request.user.username + ","

        quote_form = QuoteForm(request.POST, instance=quote_in_question)
        quote_form.save()

        return redirect('/intranet/quote/')
    else:

        # Make sure quote editor can actually edit the current quote (and reject their request if they can't)
        user = request.user
        quote_obj = get_object_or_404(Quote, pk=quoteId)
        quote_usernames = quote_obj.quote_sources.strip(",").split(",")
        poster_usernames = quote_obj.quote_posters.strip(",").split(",")

        canEdit = (not user.is_anonymous() and
                   ((user.username in quote_usernames) or
                    (user.username in poster_usernames))) or user.is_top4()

        if (not canEdit):
            raise PermissionDenied  # Current user cannot edit this quote

        # --- Handle edit page requests (from quote list to edit form) ---

        # Get authors' Member objects
        quoteMembers = Member.objects.filter(username__in=quote_usernames)

        # Unescape escaped quote text
        quote_obj.quote_text = HTMLParser.HTMLParser().unescape(
            quote_obj.quote_text)

        # Remove hashtags/authortags in text
        quote_obj.quote_text = string.replace(
            re.sub("<a href='.+?'>", "", quote_obj.quote_text), "</a>", "")

        # Convert <br />'s into newlines (\n - TODO?: this may cause issues for Windows users)
        quote_obj.quote_text = string.replace(quote_obj.quote_text, "<br />",
                                              "\n")

        quote_form = QuoteForm(instance=quote_obj)
        quote_form.fields["quote_posters"].widget = forms.HiddenInput()

        # -- Handle quote editing --
        return render_to_response('intranet/quote/edit.html', {
            "section": "intranet",
            "page": 'quote',
            "form": quote_form,
            "members": Member.objects.all(),
            "quoteMembers": quoteMembers,
            "quote_id": quoteId,
            "user": request.user
        },
                                  context_instance=RequestContext(request))
Example n. 21
0
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Copyright 2014 Techdealer

##############LIBRARIES TO IMPORT AND DEFINITIONS####################
import urllib,urllib2,re,xbmcplugin,xbmcgui,sys,xbmc,xbmcaddon,xbmcvfs,socket,HTMLParser
import json
h = HTMLParser.HTMLParser()

addon_id = 'plugin.video.replaypt'
selfAddon = xbmcaddon.Addon(id=addon_id)
addonfolder = selfAddon.getAddonInfo('path')
artfolder = '/resources/img/'

vernovelas_url = 'http://vernovelas.com.br/'
##################################################

def listar_categorias():
	try:
		codigo_fonte = abrir_url(vernovelas_url)
	except:
		codigo_fonte = ''
	if codigo_fonte:
		match = re.findall('<li id=".+?" class=".+?"><a.+?href="(.+?)">(.+?)</a></li>', codigo_fonte)
		for url,name in match:
			if name in ('INICIO', 'FUTEBOL AO VIVO', 'CONTATO'):
				continue
			elif url in ('http://www.vernovelas.com.br/category/resumo-de-em-familia', 'http://www.vernovelas.com.br/category/resumo-de-malhacao-2', 'http://www.vernovelas.com.br/category/resumo-de-joia-rara', 'http://www.vernovelas.com.br/category/resumo-de-o-cravo-e-a-rosa', 'http://www.vernovelas.com.br/category/resumo-de-chiquititas-2', 'http://www.vernovelas.com.br/category/resumo-de-rebelde', 'http://www.vernovelas.com.br/category/resumo-de-pecado-mortal'):
				continue
			elif url in ('http://www.vernovelas.com.br/category/tv-globo', 'http://www.vernovelas.com.br/category/band', 'http://www.vernovelas.com.br/category/sbt', 'http://www.vernovelas.com.br/category/record', 'http://www.vernovelas.com.br/category/canal-viva', 'http://www.vernovelas.com.br/category/sportv', 'http://www.vernovelas.com.br/category/combate-2', 'http://www.vernovelas.com.br/category/espn', 'http://www.vernovelas.com.br/category/fox', 'http://www.vernovelas.com.br/category/hbo'):
				continue
Example n. 22
0
import httplib
#import re
#import sys
import os
import Cookie

import string, xbmc, xbmcgui, xbmcplugin, urllib, cookielib, xbmcaddon
#-------------------------------
import urllib, urllib2, time, random
#from time import gmtime, strftime
#from urlparse import urlparse

import HTMLParser

hpar = HTMLParser.HTMLParser()
#-----------------------------------------
import socket

socket.setdefaulttimeout(50)

icon = ""
siteUrl = 'www.KinoPoisk.ru'
httpSiteUrl = 'http://' + siteUrl
sid_file = os.path.join(
    xbmc.translatePath('special://temp/'),
    '2kp.cookies.sid')  #'plugin.video.krasfs.ru.cookies.sid'

#h = int(sys.argv[1])

#---------------
Example n. 23
0
def html_to_text(s):
    return re.sub(r'[\s\x0B\xC2\xA0]+', ' ',
                  HTMLParser.HTMLParser().unescape(re.sub('<.*?>', ' ', s)),
                  flags=re.S).strip()
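
# A minimal usage sketch of html_to_text (the sample markup is made up):
#
#   >>> html_to_text('<p>Fish&nbsp;&amp; <b>chips</b></p>')
#   u'Fish & chips'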
Example n. 24
0
def unescape(entity, encoding):
    if encoding == 'utf-8':
        return HTMLParser.HTMLParser().unescape(entity).encode(encoding)
    elif encoding == 'cp1251':
        return entity.decode(encoding).encode('utf-8')
    # any other encoding: return the input unchanged rather than an implicit None
    return entity
Example n. 25
0
def replaceHTMLCodes(txt):
    txt = re.sub("(&#[0-9]+)([^;^0-9]+)", "\\1;\\2", txt)
    txt = HTMLParser.HTMLParser().unescape(txt)
    txt = txt.replace("&quot;", "\"")
    txt = txt.replace("&amp;", "&")
    return txt
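
# A minimal usage sketch (the input string is made up): the regex first
# restores the missing ';' on bare numeric entities, then unescape handles
# the rest:
#
#   >>> replaceHTMLCodes('Tom &amp; Jerry &#8211 reloaded')
#   u'Tom & Jerry \u2013 reloaded'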
Example n. 26
0
class URLFollow(CommandInterface):
    triggers = ['urlfollow', 'follow']
    acceptedTypes = ['PRIVMSG', 'ACTION']
    help = 'automatic function that follows urls and grabs information about the resultant webpage'
    runInThread = True

    htmlParser = HTMLParser.HTMLParser()
    
    graySplitter = assembleFormattedText(A.normal[' ', A.fg.gray['|'], ' '])

    def onLoad(self):
        self.handledExternally = {}
        """@type : dict[str, list[str]]"""
        # dict of regex patterns not to follow. populated by other modules so they can handle them themselves

        self.youtubeKey = load_key(u'YouTube')
        self.imgurClientID = load_key(u'imgur Client ID')
        
        self.autoFollow = True

    def shouldExecute(self, message):
        """
        @type message: IRCMessage
        """
        if message.Type not in self.acceptedTypes:
            return False
        if ignores.ignoreList is not None:
            if message.User.Name.lower() in ignores.ignoreList:
                return False
        return True
    
    def execute(self, message):
        """
        @type message: IRCMessage
        """
        match = None
        if message.Command.lower() in self.triggers:
            if message.ParameterList[0].lower() == 'on':
                self.autoFollow = True
                return IRCResponse(ResponseType.Say, 'Auto-follow on', message.ReplyTo)
            elif message.ParameterList[0].lower() == 'off': 
                self.autoFollow = False
                return IRCResponse(ResponseType.Say, 'Auto-follow off', message.ReplyTo)
            else:
                match = re.search(r'(?P<url>(https?://|www\.)[^\s]+)', message.Parameters, re.IGNORECASE)
        elif self.autoFollow:
            match = re.search(r'(?P<url>(https?://|www\.)[^\s]+)', message.MessageString, re.IGNORECASE)
        if not match:
            return

        for module, patterns in self.handledExternally.iteritems():
            for pattern in patterns:
                if re.search(pattern, message.MessageString):
                    return  # url will be handled by another module

        return self.DispatchToFollows(match.group('url'), message)

    def DispatchToFollows(self, url, message):
        """
        @type url: unicode
        @type message: IRCMessage
        """
        youtubeMatch = re.search(r'(youtube\.com/watch.+v=|youtu\.be/)(?P<videoID>[^&#\?]{11})', url)
        imgurMatch   = re.search(r'(i\.)?imgur\.com/(?P<imgurID>[^\.]+)', url)
        twitterMatch = re.search(r'twitter\.com/(?P<tweeter>[^/]+)/status(es)?/(?P<tweetID>[0-9]+)', url)
        steamMatch   = re.search(r'store\.steampowered\.com/(?P<steamType>(app|sub))/(?P<steamID>[0-9]+)', url)
        ksMatch      = re.search(r'kickstarter\.com/projects/(?P<ksID>[^/]+/[^/&#\?]+)', url)
        twitchMatch  = re.search(r'twitch\.tv/(?P<twitchChannel>[^/]+)/?(\s|$)', url)
        
        if youtubeMatch:
            return self.FollowYouTube(youtubeMatch.group('videoID'), message)
        elif imgurMatch:
            return self.FollowImgur(imgurMatch.group('imgurID'), message)
        elif twitterMatch:
            return self.FollowTwitter(twitterMatch.group('tweeter'), twitterMatch.group('tweetID'), message)
        elif steamMatch:
            return self.FollowSteam(steamMatch.group('steamType'), steamMatch.group('steamID'), message)
        elif ksMatch:
            return self.FollowKickstarter(ksMatch.group('ksID'), message)
        elif twitchMatch:
            return self.FollowTwitch(twitchMatch.group('twitchChannel'), message)
        elif not re.search('\.(jpe?g|gif|png|bmp)$', url):
            return self.FollowStandard(url, message)
        
    def FollowYouTube(self, videoID, message):
        if self.youtubeKey is None:
            return IRCResponse(ResponseType.Say, '[YouTube API key not found]', message.ReplyTo)

        fields = 'items(id,snippet(title,description,channelTitle),contentDetails(duration))'
        parts = 'snippet,contentDetails'
        url = 'https://www.googleapis.com/youtube/v3/videos?id={}&fields={}&part={}&key={}'.format(videoID, fields, parts, self.youtubeKey)
        
        webPage = WebUtils.fetchURL(url)
        webPage.body = webPage.body.decode('utf-8')
        j = json.loads(webPage.body)

        if 'items' not in j:
            return None

        title = j['items'][0]['snippet']['title']
        description = j['items'][0]['snippet']['description']
        channel = j['items'][0]['snippet']['channelTitle']
        length = parse_duration(j["items"][0]["contentDetails"]["duration"]).total_seconds()

        m, s = divmod(int(length), 60)
        h, m = divmod(m, 60)
        if h > 0:
            length = u'{0:02d}:{1:02d}:{2:02d}'.format(h, m, s)
        else:
            length = u'{0:02d}:{1:02d}'.format(m, s)

        if not description:
            description = u'<no description available>'
        description = re.sub('(\n|\s)+', ' ', description)
        limit = 150
        if len(description) > limit:
            description = u'{} ...'.format(description[:limit].rsplit(' ', 1)[0])

        return IRCResponse(ResponseType.Say,
                           self.graySplitter.join([title, length, channel, description]),
                           message.ReplyTo,
                           {'urlfollowURL': 'http://youtu.be/{}'.format(videoID)})
    
    def FollowImgur(self, imgurID, message):
        if self.imgurClientID is None:
            return IRCResponse(ResponseType.Say, '[imgur Client ID not found]', message.ReplyTo)

        if imgurID.startswith('gallery/'):
            imgurID = imgurID.replace('gallery/', '')

        albumLink = False
        if imgurID.startswith('a/'):
            imgurID = imgurID.replace('a/', '')
            url = 'https://api.imgur.com/3/album/{0}'.format(imgurID)
            albumLink = True
        else:
            url = 'https://api.imgur.com/3/image/{0}'.format(imgurID)

        headers = [('Authorization', 'Client-ID {0}'.format(self.imgurClientID))]
        
        webPage = WebUtils.fetchURL(url, headers)
        
        if webPage is None:
            url = 'https://api.imgur.com/3/gallery/{0}'.format(imgurID)
            webPage = WebUtils.fetchURL(url, headers)

        if webPage is None:
            return
        
        response = json.loads(webPage.body)
        
        imageData = response['data']

        if imageData['title'] is None:
            url = 'https://api.imgur.com/3/gallery/{0}'.format(imgurID)
            webPage = WebUtils.fetchURL(url, headers)
            if webPage is not None:
                imageData = json.loads(webPage.body)['data']

            if imageData['title'] is None:
                webPage = WebUtils.fetchURL('http://imgur.com/{0}'.format(imgurID))
                imageData['title'] = self.GetTitle(webPage.body).replace(' - Imgur', '')
                if imageData['title'] == 'imgur: the simple image sharer':
                    imageData['title'] = None
        
        data = []
        if imageData['title'] is not None:
            data.append(imageData['title'])
        else:
            data.append(u'<No Title>')
        if imageData['nsfw']:
            data.append(u'\x034\x02NSFW!\x0F')
        if albumLink:
            data.append(u'Album: {0} Images'.format(imageData['images_count']))
        else:
            if 'is_album' in imageData and imageData['is_album']:
                data.append(u'Album: {0:,d} Images'.format(len(imageData['images'])))
            else:
                if imageData[u'animated']:
                    data.append(u'\x032\x02Animated!\x0F')
                data.append(u'{0:,d}x{1:,d}'.format(imageData['width'], imageData['height']))
                data.append(u'Size: {0:,d}kb'.format(int(imageData['size'])/1024))
        data.append(u'Views: {0:,d}'.format(imageData['views']))
        
        return IRCResponse(ResponseType.Say,
                           self.graySplitter.join(data),
                           message.ReplyTo,
                           {'urlfollowURL': '[nope, imgur is too hard. also, pointless?]'})

    def FollowTwitter(self, tweeter, tweetID, message):
        webPage = WebUtils.fetchURL('https://twitter.com/{0}/status/{1}'.format(tweeter, tweetID))

        soup = BeautifulSoup(webPage.body)

        tweet = soup.find(class_='permalink-tweet')
        
        user = tweet.find(class_='username').text

        tweetText = tweet.find(class_='tweet-text')
        
        tweetTimeText = tweet.find(class_='client-and-actions').text.strip()
        try:
            tweetTimeText = time.strftime('%Y/%m/%d %H:%M', time.strptime(tweetTimeText, '%I:%M %p - %d %b %Y'))
        except ValueError:
            pass

        links = tweetText.find_all('a', {'data-expanded-url': True})
        for link in links:
            link.string = link['data-expanded-url']

        embeddedLinks = tweetText.find_all('a', {'data-pre-embedded': 'true'})
        for link in embeddedLinks:
            link.string = link['href']

        text = StringUtils.unescapeXHTML(tweetText.text)
        text = re.sub('[\r\n]+', self.graySplitter, text)

        formatString = unicode(assembleFormattedText(A.normal[A.fg.gray['[{0}]'], A.bold[' {1}:'], ' {2}']))

        return IRCResponse(ResponseType.Say,
                           formatString.format(tweetTimeText, user, text),
                           message.ReplyTo,
                           {'urlfollowURL': 'https://twitter.com/{}/status/{}'.format(tweeter, tweetID)})

    def FollowSteam(self, steamType, steamId, message):
        steamType = {'app': 'app', 'sub': 'package'}[steamType]
        webPage = WebUtils.fetchURL('http://store.steampowered.com/api/{0}details/?{0}ids={1}&cc=US&l=english&v=1'.format(steamType, steamId))

        response = json.loads(webPage.body)
        if not response[steamId]['success']:
            return  # failure

        appData = response[steamId]['data']

        data = []

        # name
        if 'developers' in appData:
            name = assembleFormattedText(A.normal[appData['name'], A.fg.gray[' by '], u', '.join(appData['developers'])])
        else:
            name = appData['name']
        data.append(name)
        
        # package contents (might need to trim this...)
        if 'apps' in appData:
            appNames = [app['name'] for app in appData['apps']]
            apps = u'Package containing: {}'.format(u', '.join(appNames))
            data.append(apps)

        # genres
        if 'genres' in appData:
            data.append(u'Genres: ' + ', '.join([genre['description'] for genre in appData['genres']]))

        # release date
        releaseDate = appData['release_date']
        if not releaseDate['coming_soon']:
            if releaseDate['date']:
                data.append(u'Release Date: ' + releaseDate['date'])
        else:
            data.append(assembleFormattedText(A.normal['Release Date: ', A.fg.cyan[str(releaseDate['date'])]]))

        # metacritic
        # http://www.metacritic.com/faq#item32 (Why is the breakdown of green, yellow, and red scores different for games?)
        if 'metacritic' in appData:
            metaScore = appData['metacritic']['score']
            if metaScore < 50:
                metacritic = assembleFormattedText(A.normal[A.fg.red[str(metaScore)]])
            elif metaScore < 75:
                metacritic = assembleFormattedText(A.normal[A.fg.yellow[str(metaScore)]])
            else:
                metacritic = assembleFormattedText(A.normal[A.fg.green[str(metaScore)]])
            data.append(u'Metacritic: {0}'.format(metacritic))

        # prices
        priceField = {'app': 'price_overview', 'package': 'price'}[steamType]
        if priceField in appData:
            prices = {'USD': appData[priceField],
                      'GBP': self.getSteamPrice(steamType, steamId, 'GB'),
                      'EUR': self.getSteamPrice(steamType, steamId, 'FR'),
                      'AUD': self.getSteamPrice(steamType, steamId, 'AU')}

            currencies = {'USD': u'$',
                          'GBP': u'\u00A3',
                          'EUR': u'\u20AC',
                          'AUD': u'AU$'}

            if not prices['AUD'] or prices['AUD']['final'] == prices['USD']['final']:
                del prices['AUD']
            
            # filter out any missing prices
            prices = {key: val for key, val in prices.iteritems() if val}

            priceString = u'/'.join([currencies[val['currency']] + unicode(val['final'] / 100.0) for val in prices.values()])
            if prices['USD']['discount_percent'] > 0:
                priceString += assembleFormattedText(A.normal[A.fg.green[' ({0}% sale!)'.format(prices['USD']['discount_percent'])]])

            data.append(priceString)
        
        # description
        if 'about_the_game' in appData and appData['about_the_game'] is not None:
            limit = 150
            description = re.sub(r'(<[^>]+>|[\r\n\t])+', assembleFormattedText(A.normal[' ', A.fg.gray['>'], ' ']), appData['about_the_game'])
            if len(description) > limit:
                description = u'{0} ...'.format(description[:limit].rsplit(' ', 1)[0])
            data.append(description)

        return IRCResponse(ResponseType.Say,
                           self.graySplitter.join(data),
                           message.ReplyTo,
                           {'urlfollowURL': 'http://store.steampowered.com/{}/{}'.format({'app': 'app', 'package': 'sub'}[steamType], steamId)})

    @classmethod
    def getSteamPrice(cls, appType, appId, region):
        webPage = WebUtils.fetchURL('http://store.steampowered.com/api/{0}details/?{0}ids={1}&cc={2}&l=english&v=1'.format(appType, appId, region))
        priceField = {'app': 'price_overview', 'package': 'price'}[appType]
        response = json.loads(webPage.body)
        
        if 'data' not in response[appId]:
            return
        
        if region == 'AU':
            response[appId]['data'][priceField]['currency'] = 'AUD'
        return response[appId]['data'][priceField]

    def FollowKickstarter(self, ksID, message):
        webPage = WebUtils.fetchURL('https://www.kickstarter.com/projects/{}/description'.format(ksID))

        soup = BeautifulSoup(webPage.body)

        data = []

        shorturl = soup.find(rel='shorturl')['href']
        if shorturl is None:
            shorturl = 'https://www.kickstarter.com/projects/{}/'.format(ksID)

        title = soup.find(property='og:title')
        if title is not None:
            creator = soup.find(attrs={'data-modal-class': 'modal_project_by'})
            if creator is not None:
                data.append(unicode(assembleFormattedText(A.normal['{0}',
                                                                   A.fg.gray[' by '],
                                                                   '{1}'])).format(title['content'].strip(),
                                                                                   creator.text.strip()))
            else:
                data.append(title['content'].strip())

        stats = soup.find(id='stats')
        # projects in progress
        if stats is not None:
            backerCount = stats.find(id='backers_count')
            if backerCount is not None:
                backerCount = int(backerCount['data-backers-count'])
        # completed projects
        else:
            backerCount = soup.find(class_='NS_projects__spotlight_stats')
            if backerCount is not None:
                backerCount = int(backerCount.b.text.strip().split()[0].replace(',', ''))

        data.append('Backers: {0:,}'.format(backerCount))

        if stats is not None:
            pledgeData = stats.find(id='pledged')
            if pledgeData is not None:
                pledged = float(pledgeData['data-pledged'])
                goal = float(pledgeData['data-goal'])
                percentage = float(pledgeData['data-percent-raised'])
                if backerCount > 0:
                    pledgePerBacker = pledged / backerCount
                else:
                    pledgePerBacker = 0
        else:
            money = soup.select('span.money.no-code')
            if money:
                pledgedString = money[0].text.strip()
                goalString = money[1].text.strip()
                pledged = float(re.sub(ur'[^0-9.]', u'', pledgedString))
                goal = float(re.sub(ur'[^0-9.]', u'', goalString))
                percentage = (pledged / goal)
                if backerCount > 0:
                    pledgePerBacker = pledged / backerCount
                else:
                    pledgePerBacker = 0

        currency = soup.select('span.money.no-code')[-1]['class']
        currency.remove('money')
        currency.remove('no-code')
        currency = currency[0].upper()

        if percentage >= 1.0:
            percentageString = A.fg.green['({3:,.0f}% funded)']
        else:
            percentageString = A.fg.red['({3:,.0f}% funded)']

        pledgePerBackerString = A.fg.gray['{4:,.0f}/backer']

        pledgedString = assembleFormattedText(A.normal['Pledged: {0:,.0f}', A.fg.gray['/'], '{1:,.0f} {2} ', percentageString, ' ', pledgePerBackerString])
        data.append(pledgedString.format(pledged,
                                         goal,
                                         currency,
                                         #pledgedData.data['data-currency'],
                                         percentage * 100,
                                         pledgePerBacker))

        findState = soup.find(id='main_content')
        if 'Project-state-canceled' in findState['class']:
            data.append(assembleFormattedText(A.normal[A.fg.red['Cancelled']]))
        
        elif 'Project-state-suspended' in findState['class']:
            data.append(assembleFormattedText(A.normal[A.fg.blue['Suspended']]))
            
        elif 'Project-state-failed' in findState['class']:
            data.append(assembleFormattedText(A.normal[A.fg.red['Failed']]))

        elif 'Project-state-successful' in findState['class']:
            data.append(assembleFormattedText(A.normal[A.fg.green['Successful']]))

        elif 'Project-state-live' in findState['class']:
            duration = stats.find(id='project_duration_data')

            if duration is not None:
                remaining = float(duration['data-hours-remaining'])
                days = math.floor(remaining/24)
                hours = remaining % 24

                data.append('Duration: {0:.0f} days {1:.1f} hours to go'.format(days, hours))

        return IRCResponse(ResponseType.Say,
                           self.graySplitter.join(data),
                           message.ReplyTo,
                           {'urlfollowURL': shorturl})

    def FollowTwitch(self, channel, message):
        # Heavily based on Didero's DideRobot code for the same
        # https://github.com/Didero/DideRobot/blob/06629fc3c8bddf8f729ce2d27742ff999dfdd1f6/commands/urlTitleFinder.py#L37
        # TODO: other stats?
        chanData = {}
        channelOnline = False
        twitchHeaders = [('Accept', 'application/vnd.twitchtv.v2+json')]
        webPage = WebUtils.fetchURL(u'https://api.twitch.tv/kraken/streams/{}'.format(channel), twitchHeaders)

        streamData = json.loads(webPage.body)

        if 'stream' in streamData and streamData['stream'] is not None:
            chanData = streamData['stream']['channel']
            channelOnline = True
        elif 'error' not in streamData:
            webPage = WebUtils.fetchURL(u'https://api.twitch.tv/kraken/channels/{}'.format(channel), twitchHeaders)
            chanData = json.loads(webPage.body)

        if len(chanData) > 0:
            if channelOnline:
                channelInfo = assembleFormattedText(A.fg.green['']) + u'{}'.format(chanData['display_name']) + assembleFormattedText(A.normal[''])
            else:
                channelInfo = assembleFormattedText(A.fg.red['']) + u'{}'.format(chanData['display_name']) + assembleFormattedText(A.normal[''])
            channelInfo += u' "{}"'.format(re.sub(r'[\r\n]+', self.graySplitter, chanData['status'].strip()))
            if chanData['game'] is not None:
                channelInfo += assembleFormattedText(A.normal[A.fg.gray[', playing '], u'{}'.format(chanData['game'])])
            if chanData['mature']:
                channelInfo += assembleFormattedText(A.normal[A.fg.lightRed[' [Mature]']])
            if channelOnline:
                channelInfo += assembleFormattedText(A.normal[A.fg.green[' (Live with {0:,d} viewers)'.format(streamData['stream']['viewers'])]])
            else:
                channelInfo += assembleFormattedText(A.normal[A.fg.red[' (Offline)']])

            return IRCResponse(ResponseType.Say,
                               channelInfo,
                               message.ReplyTo,
                               {'urlfollowURL': 'https://twitch.tv/{}'.format(channel)})
    
    def FollowStandard(self, url, message):
        webPage = WebUtils.fetchURL(url)
        
        if webPage is None:
            return

        if webPage.responseUrl != url:
            return self.DispatchToFollows(webPage.responseUrl, message)
        
        title = self.GetTitle(webPage.body)
        if title is not None:
            return IRCResponse(ResponseType.Say,
                               u'{0} (at {1})'.format(title, webPage.domain),
                               message.ReplyTo,
                               {'urlfollowURL': url})
        
        return

    def GetTitle(self, webpage):
        soup = BeautifulSoup(webpage)
        title = soup.title
        if title:
            title = title.text
            title = re.sub(u'[\r\n]+', u'', title)  # strip any newlines
            title = title.strip()   # strip all whitespace either side
            title = re.sub(u'\s+', u' ', title)     # replace multiple whitespace chars with a single space
            title = self.htmlParser.unescape(title)     # unescape html entities

            # Split on the first space before 300 characters, and replace the rest with '...'
            if len(title) > 300:
                title = title[:300].rsplit(u' ', 1)[0] + u" ..."

            return title
        
        return None
Example n. 27
0
def test_unescape_function(self):
    parser = HTMLParser.HTMLParser()
    self.assertEqual(parser.unescape('&#bad;'), '&#bad;')
    self.assertEqual(parser.unescape('&#0038;'), '&')
Example n. 28
0
import os
import urllib2
import HTMLParser
from collections import namedtuple
from datetime import datetime
from email.utils import parsedate
from contextlib import closing
from functools import partial
from xml.sax.saxutils import escape, quoteattr

USER_AGENT = 'calibre mirror'
MR_URL = 'http://www.mobileread.com/forums/'
WORKDIR = '/srv/plugins' if os.path.exists('/srv') else '/t/plugins'
PLUGINS = 'plugins.json.bz2'
INDEX = MR_URL + 'showpost.php?p=1362767&postcount=1'
# INDEX = 'file:///t/raw.html'

IndexEntry = namedtuple(
    'IndexEntry', 'name url donate history uninstall deprecated thread_id')
u = HTMLParser.HTMLParser().unescape


def read(url, get_info=False):  # {{{
    if url.startswith("file://"):
        return urllib2.urlopen(url).read()
    opener = urllib2.build_opener()
    opener.addheaders = [
        ('User-Agent', USER_AGENT),
        ('Accept-Encoding', 'gzip,deflate'),
    ]
    res = opener.open(url)
    info = res.info()
    encoding = info.get('Content-Encoding')
    raw = res.read()
    res.close()
Example n. 29
0
import sys
import string
import HTMLParser

city_f=['Los_Angeles,_CA','San_Francisco,_CA','Manhattan,_NY','Houston,_TX','Chicago,_IL','Philadelphia,_PA','Toronto,_Ontario','Atlanta,_GA','San_Diego,_CA','Orlando,_FL','Washington,_DC','Boston,_MA']
cities=['Los_A','San_F','Manha','San_D','Houst','Chica','Phila','Toron','Atlan','Washi','Bosto','Orlan']


#input file names 
train_file=sys.argv[1]
test_file=sys.argv[2]
output_file=sys.argv[3]

f=open(train_file,"rb")
lines=f.read().splitlines()
f.close()

docs=[]
 
html_parser = HTMLParser.HTMLParser()
#removing newlines in tweets plus smileys
for i in range(len(lines)) :
    temp=lines[i].split(" ")
    if temp[0] in city_f : 
        docs.append(html_parser.unescape(lines[i].decode('utf8','ignore').encode('ascii','ignore')))
    else :
        docs[-1]=docs[-1]+" "+html_parser.unescape(lines[i].decode('utf8','ignore').encode('ascii','ignore'))



#removing punctuation from training set
for i in range (len(docs)) :
    punc=set(string.punctuation)
    ftweet=""
    for char in docs[i] :
Example n. 30
0
def getPaintingGenerator(query=u''):
    '''
    Bla %02d
    '''

    searchurl = u'http://collectie.boijmans.nl/nl?p=%s&f.type=schilderij'

    htmlparser = HTMLParser.HTMLParser()

    # http://collectie.boijmans.nl/nl?p=54&f.type=schilderij is acting up

    # Nerds start at 0
    for i in range(55, 90):
        print u'\n\n\n\n'
        print searchurl % (i, )

        searchPage = urllib2.urlopen(searchurl % (i, ))
        searchData = searchPage.read()

        itemregex = u'<a href="/nl/collection/([^"]+)" class="block padding bg-light">'

        for match in re.finditer(itemregex, searchData):
            url = u'http://collectie.boijmans.nl/nl/collection/%s' % (
                match.group(1), )
            urlen = u'http://collectie.boijmans.nl/en/collection/%s' % (
                match.group(1), )

            itemPage = urllib2.urlopen(url)
            itemData = itemPage.read()

            itemenPage = urllib2.urlopen(urlen)
            itemenData = itemenPage.read()

            metadata = {}
            metadata['url'] = url
            print url

            titlenlregex = u'<p class="col m-full l-3 xl-3 s-reset-padding-bottom m-reset-padding-bottom clear-left"><strong>Titel</strong></p>\s*<p class="col m-full l-9 xl-9 s-reset-padding-top    m-reset-padding-top">([^<]+)</p>'
            titleenregex = u'<p class="col m-full l-3 xl-3 s-reset-padding-bottom m-reset-padding-bottom clear-left"><strong>Title</strong></p>\s*<p class="col m-full l-9 xl-9 s-reset-padding-top    m-reset-padding-top">([^<]+)</p>'
            creatorregex = u'<p class="col m-full l-3 xl-3 s-reset-padding-bottom m-reset-padding-bottom clear-left"><strong>(Schilder|Kunstenaar|Maker)</strong></p>\s*<p class="col m-full l-9 xl-9 s-reset-padding-top    m-reset-padding-top"><a href="[^"]+">([^<]+)</a>'
            #|toegeschreven aan|Atelier|school van
            yearregex = u'<p class="col m-full l-3 xl-3 s-reset-padding-bottom m-reset-padding-bottom clear-left"><strong>Jaartal</strong></p>\s*<p class="col m-full l-9 xl-9 s-reset-padding-top    m-reset-padding-top"><a href="[^"]+">(\d+)</a></p>'
            idregex = u'<p class="col m-full l-3 xl-3 s-reset-padding-bottom m-reset-padding-bottom clear-left"><strong>Inventarisnummer</strong></p>\s*<p class="col m-full l-9 xl-9 s-reset-padding-top    m-reset-padding-top">([^<]+)</p>'
            mediumregex = u'<p class="col m-full l-3 xl-3 s-reset-padding-bottom m-reset-padding-bottom clear-left"><strong>Materiaal en techniek</strong></p>\s*<p class="col m-full l-9 xl-9 s-reset-padding-top    m-reset-padding-top">([^<]+)</p>'

            titlenlmatch = re.search(titlenlregex, itemData, flags=re.M)
            metadata[u'titlenl'] = htmlparser.unescape(
                unicode(titlenlmatch.group(1), "utf-8"))

            titleenmatch = re.search(titleenregex, itemenData, flags=re.M)
            metadata[u'titleen'] = htmlparser.unescape(
                unicode(titleenmatch.group(1), "utf-8"))

            creatormatch = re.search(creatorregex, itemData, flags=re.M)
            if creatormatch:
                metadata[u'creator'] = htmlparser.unescape(
                    unicode(creatormatch.group(2), "utf-8"))
            else:
                metadata[u'creator'] = u'anonymous'

            yearmatch = re.search(yearregex, itemData, flags=re.M)
            if yearmatch:
                metadata[u'year'] = htmlparser.unescape(
                    unicode(yearmatch.group(1), "utf-8"))

            idmatch = re.search(idregex, itemData, flags=re.M)
            metadata[u'id'] = htmlparser.unescape(
                unicode(idmatch.group(1), "utf-8"))

            mediummatch = re.search(mediumregex, itemData, flags=re.M)
            metadata[u'medium'] = htmlparser.unescape(
                unicode(mediummatch.group(1), "utf-8"))

            yield metadata