def normalizeTextForTagger(text):
    # Decode the double-escaped ampersand first, then unescape the rest.
    # (The original replace("&amp;", "&") was mangled to a no-op replace("&", "&")
    # by entity-unescaping; restored here.)
    text = text.replace("&amp;", "&")
    text = HTMLParser.HTMLParser().unescape(text)
    return text
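# Hedged usage sketch (not from the original module; the sample string is
# illustrative): a double-escaped "&amp;amp;" first becomes "&amp;", then "&".
# normalizeTextForTagger(u"AT&amp;amp;T &gt; Verizon")  ->  u"AT&T > Verizon"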
def cb_injection_handler(url, delay, filename, http_request_method):
  counter = 1
  vp_flag = True
  no_result = True
  is_encoded = False
  export_injection_info = False
  injection_type = "results-based command injection"
  technique = "classic injection technique"

  if not settings.LOAD_SESSION:
    info_msg = "Testing the " + technique + "... "
    sys.stdout.write(settings.print_info_msg(info_msg))
    sys.stdout.flush()
    if settings.VERBOSITY_LEVEL >= 1:
      print ""

  i = 0
  # Calculate all possible combinations
  total = len(settings.WHITESPACE) * len(settings.PREFIXES) * len(settings.SEPARATORS) * len(settings.SUFFIXES)
  for whitespace in settings.WHITESPACE:
    for prefix in settings.PREFIXES:
      for suffix in settings.SUFFIXES:
        for separator in settings.SEPARATORS:
          # If a previous session is available.
          if settings.LOAD_SESSION and session_handler.notification(url, technique):
            url, technique, injection_type, separator, shell, vuln_parameter, prefix, suffix, TAG, alter_shell, payload, http_request_method, url_time_response, delay, how_long, output_length, is_vulnerable = session_handler.injection_point_exportation(url, http_request_method)
            checks.check_for_stored_tamper(payload)
          else:
            i = i + 1
            # Check for bad combination of prefix and separator
            combination = prefix + separator
            if combination in settings.JUNK_COMBINATION:
              prefix = ""
            # Change TAG on every request to prevent false-positive results.
            TAG = ''.join(random.choice(string.ascii_uppercase) for i in range(6))
            randv1 = random.randrange(100)
            randv2 = random.randrange(100)
            randvcalc = randv1 + randv2
            # Define alter shell
            alter_shell = menu.options.alter_shell
            try:
              if alter_shell:
                # Classic -alter shell- decision payload (check if host is vulnerable).
                payload = cb_payloads.decision_alter_shell(separator, TAG, randv1, randv2)
              else:
                # Classic decision payload (check if host is vulnerable).
                payload = cb_payloads.decision(separator, TAG, randv1, randv2)
              # Define prefixes & suffixes
              payload = parameters.prefixes(payload, prefix)
              payload = parameters.suffixes(payload, suffix)
              # Whitespace fixation
              payload = re.sub(" ", whitespace, payload)
              # Check for base64 / hex encoding
              payload = checks.perform_payload_encoding(payload)
              # Check if defined "--verbose" option.
              if settings.VERBOSITY_LEVEL == 1:
                print settings.print_payload(payload)
              elif settings.VERBOSITY_LEVEL > 1:
                info_msg = "Generating a payload for injection..."
                print settings.print_info_msg(info_msg)
                print settings.print_payload(payload)
              # Cookie Injection
              if settings.COOKIE_INJECTION == True:
                # Check if target host is vulnerable to cookie injection.
                vuln_parameter = parameters.specify_cookie_parameter(menu.options.cookie)
                response = cb_injector.cookie_injection_test(url, vuln_parameter, payload)
              # User-Agent Injection
              elif settings.USER_AGENT_INJECTION == True:
                # Check if target host is vulnerable to user-agent injection.
                vuln_parameter = parameters.specify_user_agent_parameter(menu.options.agent)
                response = cb_injector.user_agent_injection_test(url, vuln_parameter, payload)
              # Referer Injection
              elif settings.REFERER_INJECTION == True:
                # Check if target host is vulnerable to referer injection.
                vuln_parameter = parameters.specify_referer_parameter(menu.options.referer)
                response = cb_injector.referer_injection_test(url, vuln_parameter, payload)
              # Custom HTTP header Injection
              elif settings.CUSTOM_HEADER_INJECTION == True:
                # Check if target host is vulnerable to custom http header injection.
                vuln_parameter = parameters.specify_custom_header_parameter(settings.INJECT_TAG)
                response = cb_injector.custom_header_injection_test(url, vuln_parameter, payload)
              else:
                # Check if target host is vulnerable.
                response, vuln_parameter = cb_injector.injection_test(payload, http_request_method, url)
              # Try target page reload (if it is required).
              if settings.URL_RELOAD:
                response = requests.url_reload(url, delay)
              # Evaluate test results.
              shell = cb_injector.injection_test_results(response, TAG, randvcalc)
              if not settings.VERBOSITY_LEVEL >= 1:
                percent = ((i * 100) / total)
                float_percent = "{0:.1f}".format(round(((i * 100) / (total * 1.0)), 2))
                if shell == False:
                  info_msg = "Testing the " + technique + "... " + "[ " + float_percent + "%" + " ]"
                  sys.stdout.write("\r" + settings.print_info_msg(info_msg))
                  sys.stdout.flush()
                if float(float_percent) >= 99.9:
                  if no_result == True:
                    percent = Fore.RED + "FAILED" + Style.RESET_ALL
                  else:
                    percent = str(float_percent) + "%"
                elif len(shell) != 0:
                  percent = Fore.GREEN + "SUCCEED" + Style.RESET_ALL
                else:
                  percent = str(float_percent) + "%"
                info_msg = "Testing the " + technique + "... " + "[ " + percent + " ]"
                sys.stdout.write("\r" + settings.print_info_msg(info_msg))
                sys.stdout.flush()
            except KeyboardInterrupt:
              raise
            except SystemExit:
              raise
            except:
              continue

          # Yaw, got shellz!
          # Do some magic tricks!
          if shell:
            found = True
            no_result = False
            if settings.COOKIE_INJECTION == True:
              header_name = " cookie"
              found_vuln_parameter = vuln_parameter
              the_type = " parameter"
            elif settings.USER_AGENT_INJECTION == True:
              header_name = " User-Agent"
              found_vuln_parameter = ""
              the_type = " HTTP header"
            elif settings.REFERER_INJECTION == True:
              header_name = " Referer"
              found_vuln_parameter = ""
              the_type = " HTTP header"
            elif settings.CUSTOM_HEADER_INJECTION == True:
              header_name = " " + settings.CUSTOM_HEADER_NAME
              found_vuln_parameter = ""
              the_type = " HTTP header"
            else:
              header_name = ""
              the_type = " parameter"
              if http_request_method == "GET":
                found_vuln_parameter = parameters.vuln_GET_param(url)
              else:
                found_vuln_parameter = vuln_parameter
            if len(found_vuln_parameter) != 0:
              found_vuln_parameter = " '" + found_vuln_parameter + Style.RESET_ALL + Style.BRIGHT + "'"
            # Print the findings to log file.
            if export_injection_info == False:
              export_injection_info = logs.add_type_and_technique(export_injection_info, filename, injection_type, technique)
            if vp_flag == True:
              vp_flag = logs.add_parameter(vp_flag, filename, the_type, header_name, http_request_method, vuln_parameter, payload)
            logs.update_payload(filename, counter, payload)
            counter = counter + 1
            if not settings.VERBOSITY_LEVEL >= 1 and not settings.LOAD_SESSION:
              print ""
            # Print the findings to terminal.
            success_msg = "The"
            if found_vuln_parameter == " ":
              success_msg += http_request_method + ""
            success_msg += the_type + header_name
            success_msg += found_vuln_parameter + " seems injectable via "
            success_msg += "(" + injection_type.split(" ")[0] + ") " + technique + "."
            print settings.print_success_msg(success_msg)
            print settings.SUB_CONTENT_SIGN + "Payload: " + re.sub("%20", " ", re.sub("%2B", "+", payload)) + Style.RESET_ALL
            # Export session
            if not settings.LOAD_SESSION:
              session_handler.injection_point_importation(url, technique, injection_type, separator, shell[0], vuln_parameter, prefix, suffix, TAG, alter_shell, payload, http_request_method, url_time_response=0, delay=0, how_long=0, output_length=0, is_vulnerable=menu.options.level)
            else:
              whitespace = settings.WHITESPACE[0]
              settings.LOAD_SESSION = False

            # Check for any enumeration options.
            new_line = True
            if settings.ENUMERATION_DONE == True:
              while True:
                if not menu.options.batch:
                  question_msg = "Do you want to enumerate again? [Y/n] > "
                  enumerate_again = raw_input("\n" + settings.print_question_msg(question_msg)).lower()
                else:
                  enumerate_again = ""
                if len(enumerate_again) == 0:
                  enumerate_again = "y"
                if enumerate_again in settings.CHOICE_YES:
                  cb_enumeration.do_check(separator, TAG, prefix, suffix, whitespace, http_request_method, url, vuln_parameter, alter_shell, filename, delay)
                  # print ""
                  break
                elif enumerate_again in settings.CHOICE_NO:
                  new_line = False
                  break
                elif enumerate_again in settings.CHOICE_QUIT:
                  sys.exit(0)
                else:
                  err_msg = "'" + enumerate_again + "' is not a valid answer."
                  print settings.print_error_msg(err_msg)
                  pass
            else:
              if menu.enumeration_options():
                cb_enumeration.do_check(separator, TAG, prefix, suffix, whitespace, http_request_method, url, vuln_parameter, alter_shell, filename, delay)
              if not menu.file_access_options() and not menu.options.os_cmd and new_line:
                print ""

            # Check for any system file access options.
            if settings.FILE_ACCESS_DONE == True:
              if settings.ENUMERATION_DONE != True:
                print ""
              while True:
                if not menu.options.batch:
                  question_msg = "Do you want to access files again? [Y/n] > "
                  sys.stdout.write(settings.print_question_msg(question_msg))
                  file_access_again = sys.stdin.readline().replace("\n", "").lower()
                else:
                  file_access_again = ""
                if len(file_access_again) == 0:
                  file_access_again = "y"
                if file_access_again in settings.CHOICE_YES:
                  cb_file_access.do_check(separator, TAG, prefix, suffix, whitespace, http_request_method, url, vuln_parameter, alter_shell, filename, delay)
                  print ""
                  break
                elif file_access_again in settings.CHOICE_NO:
                  break
                elif file_access_again in settings.CHOICE_QUIT:
                  sys.exit(0)
                else:
                  err_msg = "'" + file_access_again + "' is not a valid answer."
                  print settings.print_error_msg(err_msg)
                  pass
            else:
              if menu.file_access_options():
                if not menu.enumeration_options():
                  print ""
                cb_file_access.do_check(separator, TAG, prefix, suffix, whitespace, http_request_method, url, vuln_parameter, alter_shell, filename, delay)
                print ""

            # Check if defined single cmd.
            if menu.options.os_cmd:
              if not menu.file_access_options():
                print ""
              cb_enumeration.single_os_cmd_exec(separator, TAG, prefix, suffix, whitespace, http_request_method, url, vuln_parameter, alter_shell, filename, delay)

            # Pseudo-Terminal shell
            go_back = False
            go_back_again = False
            while True:
              if go_back == True:
                break
              # if settings.ENUMERATION_DONE == False and settings.FILE_ACCESS_DONE == False:
              #   if settings.VERBOSITY_LEVEL >= 1:
              #     print ""
              if not menu.options.batch:
                question_msg = "Do you want a Pseudo-Terminal shell? [Y/n] > "
                sys.stdout.write(settings.print_question_msg(question_msg))
                gotshell = sys.stdin.readline().replace("\n", "").lower()
              else:
                gotshell = ""
              if len(gotshell) == 0:
                gotshell = "y"
              if gotshell in settings.CHOICE_YES:
                if not menu.options.batch:
                  print ""
                print "Pseudo-Terminal (type '" + Style.BRIGHT + "?" + Style.RESET_ALL + "' for available options)"
                if readline_error:
                  checks.no_readline_module()
                while True:
                  try:
                    if not readline_error:
                      # Tab completer
                      readline.set_completer(menu.tab_completer)
                      # MacOSX tab completer
                      if getattr(readline, '__doc__', '') is not None and 'libedit' in getattr(readline, '__doc__', ''):
                        readline.parse_and_bind("bind ^I rl_complete")
                      # Unix tab completer
                      else:
                        readline.parse_and_bind("tab: complete")
                    cmd = raw_input("""commix(""" + Style.BRIGHT + Fore.RED + """os_shell""" + Style.RESET_ALL + """) > """)
                    cmd = checks.escaped_cmd(cmd)
                    if cmd.lower() in settings.SHELL_OPTIONS:
                      go_back, go_back_again = shell_options.check_option(separator, TAG, cmd, prefix, suffix, whitespace, http_request_method, url, vuln_parameter, alter_shell, filename, technique, go_back, no_result, delay, go_back_again)
                      if go_back and go_back_again == False:
                        break
                      if go_back and go_back_again:
                        return True
                    else:
                      # Command execution results.
                      response = cb_injector.injection(separator, TAG, cmd, prefix, suffix, whitespace, http_request_method, url, vuln_parameter, alter_shell, filename)
                      # Try target page reload (if it is required).
                      if settings.URL_RELOAD:
                        response = requests.url_reload(url, delay)
                      if menu.options.ignore_session or \
                         session_handler.export_stored_cmd(url, cmd, vuln_parameter) == None:
                        # Evaluate injection results.
                        try:
                          shell = cb_injector.injection_results(response, TAG, cmd)
                          shell = "".join(str(p) for p in shell)
                        except:
                          print ""
                          continue
                        if not menu.options.ignore_session:
                          session_handler.store_cmd(url, cmd, shell, vuln_parameter)
                      else:
                        shell = session_handler.export_stored_cmd(url, cmd, vuln_parameter)
                      if shell:
                        html_parser = HTMLParser.HTMLParser()
                        shell = html_parser.unescape(shell)
                        # Update logs with executed cmds and execution results.
                        logs.executed_command(filename, cmd, shell)
                      if shell != "":
                        if settings.VERBOSITY_LEVEL == 1:
                          print ""
                        print "\n" + Fore.GREEN + Style.BRIGHT + shell + Style.RESET_ALL + "\n"
                      else:
                        if settings.VERBOSITY_LEVEL >= 1:
                          print ""
                        err_msg = "The '" + cmd + "' command, does not return any output."
                        print settings.print_critical_msg(err_msg) + "\n"
                  except KeyboardInterrupt:
                    raise
                  except SystemExit:
                    raise
              elif gotshell in settings.CHOICE_NO:
                if checks.next_attack_vector(technique, go_back) == True:
                  break
                else:
                  if no_result == True:
                    return False
                  else:
                    return True
              elif gotshell in settings.CHOICE_QUIT:
                sys.exit(0)
              else:
                err_msg = "'" + gotshell + "' is not a valid answer."
                print settings.print_error_msg(err_msg)
                pass

  if no_result == True:
    print ""
    return False
  else:
    sys.stdout.write("\r")
    sys.stdout.flush()
def parse(source=source):
    parser = HTMLParser.HTMLParser()
    parser.feed(source)
    parser.close()
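# Sketch under assumptions (TextCollector is illustrative, not in the source):
# a bare HTMLParser.HTMLParser silently discards everything fed to it, so a
# parse() like the one above only collects data via a subclass that overrides
# handler methods:
class TextCollector(HTMLParser.HTMLParser):
    def __init__(self):
        # Old-style class in Python 2, so no super(); call the base directly.
        HTMLParser.HTMLParser.__init__(self)
        self.chunks = []

    def handle_data(self, data):
        self.chunks.append(data)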
from __future__ import absolute_import, division, print_function

import json
import re

import apache_beam as beam
import six

import nlp

if six.PY2:
  import HTMLParser as html_parser  # pylint:disable=g-import-not-at-top
  html_unescape = html_parser.HTMLParser().unescape
else:
  import html  # pylint:disable=g-import-not-at-top
  html_unescape = html.unescape

_CITATION = """
@article{47761,
  title = {Natural Questions: a Benchmark for Question Answering Research},
  author = {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee and Kristina N. Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le and Slav Petrov},
  year = {2019},
  journal = {Transactions of the Association of Computational Linguistics}
}
"""

_DESCRIPTION = """
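# Hedged usage sketch (illustrative, not part of the pipeline): the six.PY2
# branch above binds the same `html_unescape` callable on both interpreters,
# so downstream code can unescape uniformly:
# html_unescape("Fran&ccedil;ois &amp; co.")  ->  u"Fran\xe7ois & co."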
    <affected-histogram name="HistogramEnum"/>
  </histogram_suffixes>
</histogram_suffixes_list>

</histogram-configuration>
"""

import bisect
import copy
import datetime
import itertools
try:
  import HTMLParser
  html = HTMLParser.HTMLParser()
except ImportError:  # For Py3 compatibility
  import html
import logging
import re
import xml.dom.minidom

BASIC_EMAIL_REGEXP = r'^[\w\-\+\%\.]+\@[\w\-\+\%\.]+$'

OWNER_PLACEHOLDER = (
    'Please list the metric\'s owners. Add more owner tags as needed.')

MAX_HISTOGRAM_SUFFIX_DEPENDENCY_DEPTH = 5

DEFAULT_BASE_HISTOGRAM_OBSOLETE_REASON = (
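# Hedged sketch (not from the original file): after the try/except shim above,
# `html` is either an HTMLParser.HTMLParser instance (Py2) or the stdlib html
# module (Py3); both expose an unescape() that is called identically:
# html.unescape('&lt;histogram name=&quot;Foo&quot;/&gt;')
#   -> u'<histogram name="Foo"/>'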
def send_credit_notifications(username, course_key):
    """Sends email notification to user on different phases during credit
    course e.g., credit eligibility, credit payment etc.
    """
    try:
        user = User.objects.get(username=username)
    except User.DoesNotExist:
        log.error('No user with %s exists', username)
        return

    course = modulestore().get_course(course_key, depth=0)
    course_display_name = course.display_name
    tracking_context = tracker.get_tracker().resolve_context()
    tracking_id = str(tracking_context.get('user_id'))
    client_id = str(tracking_context.get('client_id'))
    events = '&t=event&ec=email&ea=open'
    # Note: '=' restored after 'tid' and 'cid'; without it the GA query string
    # is malformed.
    tracking_pixel = 'https://www.google-analytics.com/collect?v=1&tid=' + tracking_id + '&cid=' + client_id + events
    dashboard_link = _email_url_parser('dashboard')
    credit_course_link = _email_url_parser('courses', '?type=credit')

    # get attached branded logo
    logo_image = cache.get('credit.email.attached-logo')
    if logo_image is None:
        branded_logo = {
            'title': 'Logo',
            'path': settings.NOTIFICATION_EMAIL_EDX_LOGO,
            'cid': str(uuid.uuid4())
        }
        logo_image_id = branded_logo['cid']
        logo_image = attach_image(branded_logo, 'Header Logo')
        if logo_image:
            cache.set('credit.email.attached-logo', logo_image,
                      settings.CREDIT_NOTIFICATION_CACHE_TIMEOUT)
    else:
        # strip enclosing angle brackets from 'logo_image' cache 'Content-ID'
        logo_image_id = logo_image.get('Content-ID', '')[1:-1]

    providers_names = get_credit_provider_display_names(course_key)
    providers_string = make_providers_strings(providers_names)
    context = {
        'full_name': user.get_full_name(),
        'platform_name': settings.PLATFORM_NAME,
        'course_name': course_display_name,
        'branded_logo': logo_image_id,
        'dashboard_link': dashboard_link,
        'credit_course_link': credit_course_link,
        'tracking_pixel': tracking_pixel,
        'providers': providers_string,
    }

    # create the root email message
    notification_msg = MIMEMultipart('related')
    # add 'alternative' part to root email message to encapsulate the plain and
    # HTML versions, so message agents can decide which they want to display.
    msg_alternative = MIMEMultipart('alternative')
    notification_msg.attach(msg_alternative)

    # render the credit notification templates
    subject = _(u'Course Credit Eligibility')
    if providers_string:
        subject = _(u'You are eligible for credit from {providers_string}').format(
            providers_string=providers_string)

    # add alternative plain text message
    email_body_plain = render_to_string('credit_notifications/credit_eligibility_email.txt', context)
    msg_alternative.attach(SafeMIMEText(email_body_plain, _subtype='plain', _charset='utf-8'))

    # add alternative html message
    email_body_content = cache.get('credit.email.css-email-body')
    if email_body_content is None:
        html_file_path = file_path_finder('templates/credit_notifications/credit_eligibility_email.html')
        if html_file_path:
            with open(html_file_path, 'r') as cur_file:
                cur_text = cur_file.read()
                # use html parser to unescape html characters which are changed
                # by the 'pynliner' while adding inline css to html content
                html_parser = HTMLParser.HTMLParser()
                email_body_content = html_parser.unescape(with_inline_css(cur_text))
                # cache the email body content before rendering it since the
                # email context will change for each user e.g., 'full_name'
                cache.set('credit.email.css-email-body', email_body_content,
                          settings.CREDIT_NOTIFICATION_CACHE_TIMEOUT)
        else:
            email_body_content = ''

    email_body = Template(email_body_content).render([context])
    msg_alternative.attach(SafeMIMEText(email_body, _subtype='html', _charset='utf-8'))

    # attach logo image
    if logo_image:
        notification_msg.attach(logo_image)

    # add email addresses of sender and receiver
    from_address = microsite.get_value('default_from_email', settings.DEFAULT_FROM_EMAIL)
    to_address = user.email

    # send the root email message
    msg = EmailMessage(subject, None, from_address, [to_address])
    msg.attach(notification_msg)
    msg.send()
    return dict((r.review_id, r) for r in qs)


def _retrieve_translation(text, language):
    try:
        r = requests.get(
            settings.GOOGLE_TRANSLATE_API_URL, params={
                'key': getattr(settings, 'GOOGLE_API_CREDENTIALS', ''),
                'q': text, 'target': language})
    except Exception, e:
        log.error(e)
    try:
        translated = (HTMLParser.HTMLParser().unescape(
            r.json()['data']['translations'][0]['translatedText']))
    except (KeyError, IndexError):
        translated = ''
    return translated, r


@addon_view
@waffle_switch('reviews-translate')
@non_atomic_requests
def translate(request, addon, review_id, language):
    """
    Use the Google Translate API for ajax, redirect to Google Translate
    for non ajax calls.
    """
    review = get_object_or_404(Review, pk=review_id, addon=addon)

    if '-' in language:
    counter = 0
    for day in sDict.values():
        for show in day:
            counter += 1
            print str(counter) + ". " + show.format_output()


def printHTMLTable(sDict):
    if isinstance(sDict, OrderedDefaultdict):
        for day, shows in sDict.items():
            print "<p> </p>\n"
            print "<p><span style=\"font-family:times new roman,times,serif;\"><strong>" + day + "</strong></span></p>\n"
            print "<p> </p>\n"
            print "<table border=\"0\" cellpadding=\"0\" cellspacing=\"0\" dir=\"ltr\" style=\"width:754px;\" width=\"754\">\n<colgroup>\n<col /><col /><col /><col /></colgroup>\n<tbody>\n<tr>\n<td height=\"16\" style=\"height:21px;width:75px; padding-left:2px; padding-right:2px;\"><strong>Time</strong></td>\n<td style=\"width:225px;\"><strong>Show Name</strong></td>\n<td style=\"width:225px;\"><strong>Genre</strong></td>\n<td style=\"width:225px;\"><strong>Tagline</strong></td>\n</tr>\"\n"
            for show in shows:
                print show.format_html_output()
            print "</tbody>\n</table>"
            print "<p> </p>\n"


# Main
f = open(filename)
for line in f:
    lineAsString = line.rstrip()
    lineAsString = HTMLParser.HTMLParser().unescape(lineAsString)
    if parseString(lineAsString) is not None:
        infoOrg(parseString(lineAsString))

# printNumberedList(schedule)
printHTMLTable(schedule)
f.close()
    def __init__(self):
        super(ToutiaoExtractor, self).__init__()
        self.html_parser = HTMLParser.HTMLParser()
def unescape(text):
    return HTMLParser.HTMLParser().unescape(text)
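# Hedged variant (an assumption, not in the original): the same helper written
# to run unchanged on Python 2 and 3, falling back to the stdlib html module:
try:
    import HTMLParser
    _unescape_impl = HTMLParser.HTMLParser().unescape
except ImportError:
    import html
    _unescape_impl = html.unescape


def unescape_portable(text):
    return _unescape_impl(text)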
def unescape(string):
    html_parser = HTMLParser.HTMLParser()
    return html_parser.unescape(string)
def render_comment_plain(comment, context):
    parser = HTMLParser.HTMLParser()
    chunks = list(get_file_chunks_in_range_custom(
        context, comment.filediff, comment.interfilediff,
        comment.first_line, comment.num_lines))

    lines = [
        "::: %s" % comment.filediff.dest_file_display,
    ]
    if comment.interfilediff:
        lines.append(
            "(Diff revisions %s - %s)" % (
                comment.filediff.diffset.revision,
                comment.interfilediff.diffset.revision))
    else:
        lines.append(
            "(Diff revision %s)" % comment.filediff.diffset.revision)

    for chunk in chunks:
        if chunk['change'] == "equal":
            lines.extend(render_equal_chunk(chunk, parser))
        elif chunk['change'] == "insert":
            for line in chunk['lines']:
                lines.append("> +%s" % parser.unescape(line[5]))
        elif chunk['change'] == "delete":
            for line in chunk['lines']:
                lines.append("> -%s" % parser.unescape(line[2]))
        elif chunk['change'] == "replace":
            for line in chunk['lines']:
                lines.append("> -%s" % parser.unescape(line[2]))
            for line in chunk['lines']:
                lines.append("> +%s" % parser.unescape(line[5]))

    lines.append("")

    comments = []
    c = comment
    depth = 0
    while True:
        if depth:
            prefix = '%s ' % ('>' * depth,)
        else:
            prefix = ''
        comments.append("%s%s" % (prefix, c))
        if c.reply_to:
            c = c.reply_to
            depth += 1
        else:
            break
    comments.reverse()
    lines.extend(comments)
    return "\n".join(lines)
def filter_cases(request, domain, app_id, module_id):
    app = Application.get(app_id)
    module = app.get_module(module_id)
    delegation = request.GET.get('task-list') == 'true'
    auth_cookie = request.COOKIES.get('sessionid')

    suite_gen = SuiteGenerator(app)
    xpath = suite_gen.get_filter_xpath(module, delegation=delegation)
    extra_instances = [{'id': inst.id, 'src': inst.src}
                       for inst in suite_gen.get_instances_for_module(module, additional_xpaths=[xpath])]

    # touchforms doesn't like this to be escaped
    xpath = HTMLParser.HTMLParser().unescape(xpath)
    if delegation:
        case_type = DELEGATION_STUB_CASE_TYPE
    else:
        case_type = module.case_type

    if xpath:
        # if we need to do a custom filter, send it to touchforms for processing
        additional_filters = {
            "properties/case_type": case_type,
            "footprint": True
        }
        helper = SessionDataHelper(domain, request.couch_user)
        result = helper.filter_cases(xpath, additional_filters, DjangoAuth(auth_cookie),
                                     extra_instances=extra_instances)
        if result.get('status', None) == 'error':
            return HttpResponseServerError(
                result.get("message", _("Something went wrong filtering your cases.")))
        case_ids = result.get("cases", [])
    else:
        # otherwise just use our built in api with the defaults
        case_ids = [res.id for res in get_filtered_cases(
            domain, status=CASE_STATUS_OPEN, case_type=case_type,
            user_id=request.couch_user._id, ids_only=True)]

    cases = [CommCareCase.wrap(doc) for doc in iter_docs(CommCareCase.get_db(), case_ids)]
    # refilter these because we might have accidentally included footprint cases
    # in the results from touchforms. this is a little hacky but the easiest
    # (quick) workaround. should be revisited when we optimize the case list.
    cases = filter(lambda c: c.type == case_type, cases)
    cases = [c.get_json(lite=True) for c in cases if c]
    parents = []
    if delegation:
        for case in cases:
            parent_id = case['indices']['parent']['case_id']
            parents.append(CommCareCase.get(parent_id))
        return json_response({'cases': cases, 'parents': parents})
    else:
        return json_response(cases)
def parse_patents(fd, fd2):
    import re, csv, os, codecs, zipfile, traceback
    import string, random, HTMLParser

    def id_generator(size=25, chars=string.ascii_lowercase + string.digits):
        return ''.join(random.choice(chars) for _ in range(size))

    type_kind = {
        '1': ["A", "utility"],
        '2': ["E", "reissue"],
        '3': ["I5", "TVPP"],
        '4': ["S", "design"],
        '5': ["I4", "defensive publication"],
        '6': ["P", "plant"],
        '7': ["H", "statutory invention registration"]
    }

    reldoctype = [
        'continuation-in-part', 'continuation_in_part', 'continuing_reissue',
        'division', 'reissue', 'related_publication', 'substitution',
        'us_provisional_application', 'us_reexamination_reissue_merger',
        'continuation'
    ]

    fd += '/'
    fd2 += '/'
    diri = os.listdir(fd)
    diri = [d for d in diri if d.endswith('zip')]

    # Initiate HTML Parser for unescape characters
    h = HTMLParser.HTMLParser()

    # Remove all files from output dir before writing
    outdir = os.listdir(fd2)
    for oo in outdir:
        os.remove(os.path.join(fd2, oo))

    det_desc_textfile = open(os.path.join(fd2, 'detail_desc_text.csv'), 'wb')
    det_desc_textfile.write(codecs.BOM_UTF8)
    det_desc = csv.writer(det_desc_textfile, delimiter='\t')
    det_desc.writerow(['uuid', 'patent_id', 'text', 'length'])
    det_desc_textfile.close()

    loggroups = [
        'PATN', 'INVT', 'ASSG', 'PRIR', 'REIS', 'RLAP', 'CLAS', 'UREF',
        'FREF', 'OREF', 'LREP', 'PCTA', 'ABST', 'GOVT', 'PARN', 'BSUM',
        'DRWD', 'DETD', 'CLMS', 'DCLM'
    ]

    numii = 0
    rawlocation = {}
    mainclassdata = {}
    subclassdata = {}

    for d in diri:
        print d
        inp = zipfile.ZipFile(os.path.join(fd, d))
        for i in inp.namelist():
            infile = h.unescape(
                inp.open(i).read().decode('utf-8', 'ignore').replace('&angst', 'å')
            ).replace("\r", "").split('PATN')
            del infile[0]
            for i in infile:
                numii += 1
                i = i.encode('utf-8', 'ignore')
                # Get relevant logical groups from patent records according to documentation
                # Some patents can contain several INVT, ASSG and other logical groups - so, is important to retain all
                avail_fields = {}
                num = 1
                avail_fields['PATN'] = i.split('INVT')[0]
                runnums = []
                for n in range(1, len(loggroups)):
                    try:
                        gg = re.search('\n' + loggroups[n], i).group()
                        if num - n == 0:
                            runnums.append(n)
                            num += 1
                            go = list(re.finditer('\n' + loggroups[n - 1], i))
                            if len(go) == 1:
                                needed = i.split(loggroups[n - 1])[1]
                                avail_fields[loggroups[n - 1]] = needed.split(loggroups[n])[0]
                            elif len(go) > 1:
                                needed = '\n\n\n\n\n'.join(i.split(loggroups[n - 1])[1:])
                                avail_fields[loggroups[n - 1]] = needed.split(loggroups[n])[0]
                            else:
                                pass
                        else:
                            go = list(re.finditer('\n' + loggroups[runnums[-1]], i))
                            if len(go) == 1:
                                needed = i.split(loggroups[runnums[-1]])[1]
                                avail_fields[loggroups[runnums[-1]]] = needed.split(loggroups[n])[0]
                            elif len(go) > 1:
                                needed = '\n\n\n\n\n'.join(i.split(loggroups[runnums[-1]])[1:])
                                avail_fields[loggroups[runnums[-1]]] = needed.split(loggroups[n])[0]
                            else:
                                pass
                            runnums.append(n)
                            num = n + 1
                    except:
                        pass

                # Create containers based on existing Berkeley DB schema
                # (not all are currently used - possible compatibility issues)
                application = {}
                claimsdata = {}
                examiner = {}
                foreigncitation = {}
                ipcr = {}
                otherreference = {}
                patentdata = {}
                pctdata = {}
                prioritydata = {}
                rawassignee = {}
                rawinventor = {}
                rawlawyer = {}
                usappcitation = {}
                uspatentcitation = {}
                uspc = {}
                usreldoc = {}
                figureinfo = {}
                termofgrant = {}
                drawdescdata = {}
                relappdata = {}

                ### PARSERS FOR LOGICAL GROUPS ###
                try:
                    numfigs = ''
                    numsheets = ''
                    disclaimerdate = ''
                    termpat = ''
                    patent = avail_fields['PATN'].split('\n')
                    for line in patent:
                        if line.startswith("WKU"):
                            patnum = re.search('WKU\s+(.*?)$', line).group(1)
                            updnum = re.sub('^H0', 'H', patnum)[:8]
                            updnum = re.sub('^RE0', 'RE', updnum)[:8]
                            updnum = re.sub('^PP0', 'PP', updnum)[:8]
                            updnum = re.sub('^PP0', 'PP', updnum)[:8]
                            updnum = re.sub('^D0', 'D', updnum)[:8]
                            updnum = re.sub('^T0', 'T', updnum)[:8]
                            if len(patnum) > 7 and patnum.startswith('0'):
                                updnum = patnum[1:8]
                            # data['patnum'] = updnum
                            # print updnum
                            patent_id = updnum
                        if line.startswith('SRC'):
                            seriescode = re.search('SRC\s+(.*?)$', line).group(1)
                            try:
                                gg = int(seriescode)
                                if len(seriescode) == 1:
                                    seriescode = '0' + seriescode
                            except:
                                pass
                        if line.startswith('APN'):
                            appnum = re.search('APN\s+(.*?)$', line).group(1)[:6]
                            if len(appnum) != 6:
                                appnum = 'NULL'
                            # data['appnum'] = appnum
                        if line.startswith('APT'):
                            apptype = re.search('APT\s+(.*?)$', line).group(1)
                            apptype = re.search('\d', apptype).group()
                        if line.startswith('APD'):
                            appdate = re.search('APD\s+(.*?)$', line).group(1)
                            appdate = appdate[:4] + '-' + appdate[4:6] + '-' + appdate[6:]
                            # print appdate
                        if line.startswith('TTL'):
                            title = re.search('TTL\s+(.*?)ISD', avail_fields['PATN'], re.DOTALL).group(1)
                            title = re.sub('[\n\t\r\f]+', '', title)
                            title = re.sub('\s+$', '', title)
                            title = re.sub('\s+', ' ', title)
                        if line.startswith('ISD'):
                            issdate = re.search('ISD\s+(.*?)$', line).group(1)
                            if issdate[6:] == "00":
                                day = '01'
                            else:
                                day = issdate[6:]
                            if issdate[4:6] == "00":
                                month = '01'
                            else:
                                month = issdate[4:6]
                            year = issdate[:4]
                            issdate = year + '-' + month + '-' + day
                            # print issdate
                        if line.startswith("NCL"):
                            numclaims = re.search('NCL\s+(.*?)$', line).group(1)
                        # Figure and sheet info
                        if line.startswith('NDR'):
                            numsheets = re.search('NDR\s+(.*?)$', line).group(1)
                        if line.startswith('NFG'):
                            numfigs = re.search('NFG\s+(.*?)$', line).group(1)
                        # U.S. term of grant
                        if line.startswith('TRM'):
                            termpat = re.sub('[\n\t\r\f]+', '',
                                             re.search('TRM\s+(.*?)$', line).group(1))
                        if line.startswith('DCD'):
                            disclaimerdate = re.sub('[\n\t\r\f]+', '',
                                                    re.search('DCD\s+(.*?)$', line).group(1))
                            disclaimerdate = disclaimerdate[:4] + '-' + disclaimerdate[4:6] + '-' + disclaimerdate[6:]
                        # Examiner info
                        sequence = 0
                        if line.startswith("EXA"):
                            sequence += 1
                            assistexam = re.search('EXA\s+(.*?)$', line).group(1).split("; ")
                            assistexamfname = assistexam[1]
                            assistexamlname = assistexam[0]
                            examiner[id_generator()] = [patent_id, assistexamfname, assistexamlname, "assistant", "NULL"]
                        if line.startswith("EXP"):
                            sequence += 1
                            primexam = re.search('EXP\s+(.*?)$', line).group(1).split("; ")
                            primexamfname = primexam[1]
                            primexamlname = primexam[0]
                            examiner[id_generator()] = [patent_id, primexamfname, primexamlname, "primary", "NULL"]
                        if line.startswith("ECL"):
                            exemplary = re.search('ECL\s+(.*?)$', line).group(1)
                            exemplary_list = exemplary.split(",")
                except:
                    pass
                patent_id = updnum

                # Detail description
                detdesc = 'NULL'
                try:
                    detdesc = re.sub('PAR\s+', ' ', avail_fields['DETD'])
                    detdesc = re.sub('PAC\s+', ' ', detdesc)
                    detdesc = re.sub('PA\d+\s+', ' ', detdesc)
                    detdesc = re.sub('TBL\s+', '', detdesc)
                    detdesc = re.sub('\s+', ' ', detdesc)
                except:
                    pass

                det_desc_textfile = csv.writer(
                    open(os.path.join(fd2, 'detail_desc_text.csv'), 'ab'), delimiter='\t')
                det_desc_textfile.writerow([id_generator(), patent_id, detdesc, len(detdesc)])
def html_unescape(value):
    h = HTMLParser.HTMLParser()
    return h.unescape(value)
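# Possible refinement (an assumption, not in the original code): unescape() is
# stateless, so a single shared parser avoids re-instantiating
# HTMLParser.HTMLParser on every call:
_SHARED_PARSER = HTMLParser.HTMLParser()


def html_unescape_shared(value):
    return _SHARED_PARSER.unescape(value)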
def getRegexParsed(regexs, url, cookieJar=None, forCookieJarOnly=False,
                   recursiveCall=False, cachedPages={}, rawPost=False,
                   cookie_jar_file=None):
    # cachedPages = {}
    # print 'url', url
    doRegexs = re.compile('\$doregex\[([^\]]*)\]').findall(url)
    setresolved = True
    for k in doRegexs:
        if k in regexs:
            # print 'processing ', k
            m = regexs[k]
            # print m
            cookieJarParam = False
            if 'cookiejar' in m:  # so either create or reuse existing jar
                # print 'cookiejar exists', m['cookiejar']
                cookieJarParam = m['cookiejar']
                if '$doregex' in cookieJarParam:
                    cookieJar = getRegexParsed(regexs, m['cookiejar'], cookieJar, True, True, cachedPages)
                    cookieJarParam = True
                else:
                    cookieJarParam = True
            # print 'm[cookiejar]', m['cookiejar'], cookieJar
            if cookieJarParam:
                if cookieJar is None:
                    # print 'create cookie jar'
                    cookie_jar_file = None
                    if 'open[' in m['cookiejar']:
                        cookie_jar_file = m['cookiejar'].split('open[')[1].split(']')[0]
                    cookieJar = getCookieJar(cookie_jar_file)
                    if cookie_jar_file:
                        saveCookieJar(cookieJar, cookie_jar_file)
                elif 'save[' in m['cookiejar']:
                    cookie_jar_file = m['cookiejar'].split('save[')[1].split(']')[0]
                    saveCookieJar(cookieJar, cookie_jar_file)
            if m['page'] and '$doregex' in m['page']:
                pg = getRegexParsed(regexs, m['page'], cookieJar, recursiveCall=True, cachedPages=cachedPages)
                if len(pg) == 0:
                    pg = 'http://regexfailed'
                m['page'] = pg
            if 'setcookie' in m and m['setcookie'] and '$doregex' in m['setcookie']:
                m['setcookie'] = getRegexParsed(regexs, m['setcookie'], cookieJar, recursiveCall=True, cachedPages=cachedPages)
            if 'appendcookie' in m and m['appendcookie'] and '$doregex' in m['appendcookie']:
                m['appendcookie'] = getRegexParsed(regexs, m['appendcookie'], cookieJar, recursiveCall=True, cachedPages=cachedPages)
            if 'post' in m and '$doregex' in m['post']:
                m['post'] = getRegexParsed(regexs, m['post'], cookieJar, recursiveCall=True, cachedPages=cachedPages)
            if 'rawpost' in m and '$doregex' in m['rawpost']:
                m['rawpost'] = getRegexParsed(regexs, m['rawpost'], cookieJar, recursiveCall=True, cachedPages=cachedPages, rawPost=True)
                # print 'rawpost is now', m['rawpost']
            if 'rawpost' in m and '$epoctime$' in m['rawpost']:
                m['rawpost'] = m['rawpost'].replace('$epoctime$', getEpocTime())
            if 'rawpost' in m and '$epoctime2$' in m['rawpost']:
                m['rawpost'] = m['rawpost'].replace('$epoctime2$', getEpocTime2())
            link = ''
            if m['page'] and m['page'] in cachedPages and 'ignorecache' not in m and forCookieJarOnly is False:
                # print 'using cache page', m['page']
                link = cachedPages[m['page']]
            else:
                if m['page'] and not m['page'] == '' and m['page'].startswith('http'):
                    if '$epoctime$' in m['page']:
                        m['page'] = m['page'].replace('$epoctime$', getEpocTime())
                    if '$epoctime2$' in m['page']:
                        m['page'] = m['page'].replace('$epoctime2$', getEpocTime2())
                    # print 'Ingoring Cache', m['page']
                    page_split = m['page'].split('|')
                    pageUrl = page_split[0]
                    header_in_page = None
                    if len(page_split) > 1:
                        header_in_page = page_split[1]
                    current_proxies = urllib2.ProxyHandler(urllib2.getproxies())
                    # print 'getting pageUrl', pageUrl
                    req = urllib2.Request(pageUrl)
                    if 'proxy' in m:
                        proxytouse = m['proxy']
                        if pageUrl[:5] == "https":
                            proxy = urllib2.ProxyHandler({'https': proxytouse})
                        else:
                            proxy = urllib2.ProxyHandler({'http': proxytouse})
                        opener = urllib2.build_opener(proxy)
                        urllib2.install_opener(opener)
                    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/14.0.1')
                    proxytouse = None
                    if 'referer' in m:
                        req.add_header('Referer', m['referer'])
                    if 'accept' in m:
                        req.add_header('Accept', m['accept'])
                    if 'agent' in m:
                        req.add_header('User-agent', m['agent'])
                    if 'x-req' in m:
                        req.add_header('X-Requested-With', m['x-req'])
                    if 'x-addr' in m:
                        req.add_header('x-addr', m['x-addr'])
                    if 'x-forward' in m:
                        req.add_header('X-Forwarded-For', m['x-forward'])
                    if 'setcookie' in m:
                        # print 'adding cookie', m['setcookie']
                        req.add_header('Cookie', m['setcookie'])
                    if 'appendcookie' in m:
                        # print 'appending cookie to cookiejar', m['appendcookie']
                        cookiestoApend = m['appendcookie']
                        cookiestoApend = cookiestoApend.split(';')
                        for h in cookiestoApend:
                            n, v = h.split('=')
                            w, n = n.split(':')
                            ck = cookielib.Cookie(
                                version=0, name=n, value=v, port=None,
                                port_specified=False, domain=w,
                                domain_specified=False, domain_initial_dot=False,
                                path='/', path_specified=True, secure=False,
                                expires=None, discard=True, comment=None,
                                comment_url=None, rest={'HttpOnly': None},
                                rfc2109=False)
                            cookieJar.set_cookie(ck)
                    if 'origin' in m:
                        req.add_header('Origin', m['origin'])
                    if header_in_page:
                        header_in_page = header_in_page.split('&')
                        for h in header_in_page:
                            n, v = h.split('=')
                            req.add_header(n, v)
                    if cookieJar is not None:
                        # print 'cookieJarVal', cookieJar
                        cookie_handler = urllib2.HTTPCookieProcessor(cookieJar)
                        opener = urllib2.build_opener(
                            cookie_handler, urllib2.HTTPBasicAuthHandler(), urllib2.HTTPHandler())
                        urllib2.install_opener(opener)
                        # print 'noredirect', 'noredirect' in m
                        if 'noredirect' in m:
                            opener = urllib2.build_opener(
                                cookie_handler, NoRedirection,
                                urllib2.HTTPBasicAuthHandler(), urllib2.HTTPHandler())
                            urllib2.install_opener(opener)
                    elif 'noredirect' in m:
                        opener = urllib2.build_opener(
                            NoRedirection, urllib2.HTTPBasicAuthHandler(), urllib2.HTTPHandler())
                        urllib2.install_opener(opener)
                    if 'connection' in m:
                        # print '..........................connection//////.', m['connection']
                        from keepalive import HTTPHandler
                        keepalive_handler = HTTPHandler()
                        opener = urllib2.build_opener(keepalive_handler)
                        urllib2.install_opener(opener)
                    # print 'after cookie jar'
                    post = None
                    if 'post' in m:
                        postData = m['post']
                        # if '$LiveStreamRecaptcha' in postData:
                        #     (captcha_challenge, catpcha_word, idfield) = processRecaptcha(m['page'], cookieJar)
                        #     if captcha_challenge:
                        #         postData = postData.replace('$LiveStreamRecaptcha', 'manual_recaptcha_challenge_field:' + captcha_challenge + ',recaptcha_response_field:' + catpcha_word + ',id:' + idfield)
                        splitpost = postData.split(',')
                        post = {}
                        for p in splitpost:
                            n = p.split(':')[0]
                            v = p.split(':')[1]
                            post[n] = v
                        post = urllib.urlencode(post)
                    if 'rawpost' in m:
                        post = m['rawpost']
                    link = ''
                    try:
                        if post:
                            response = urllib2.urlopen(req, post)
                        else:
                            response = urllib2.urlopen(req)
                        if response.info().get('Content-Encoding') == 'gzip':
                            from StringIO import StringIO
                            import gzip
                            buf = StringIO(response.read())
                            f = gzip.GzipFile(fileobj=buf)
                            link = f.read()
                        else:
                            link = response.read()
                        if 'proxy' in m and current_proxies is not None:
                            urllib2.install_opener(urllib2.build_opener(current_proxies))
                        link = javascriptUnEscape(link)
                        # print repr(link)
                        # print link  # This just prints the whole webpage in LOG
                        if 'includeheaders' in m:
                            # link += str(response.headers.get('Set-Cookie'))
                            link += '$$HEADERS_START$$:'
                            for b in response.headers:
                                link += b + ':' + response.headers.get(b) + '\n'
                            link += '$$HEADERS_END$$:'
                        # print link
                        response.close()
                    except Exception:
                        pass
                    cachedPages[m['page']] = link
                    # print link
                    # print 'store link for', m['page'], forCookieJarOnly
                    if forCookieJarOnly:
                        return cookieJar  # do nothing
                elif m['page'] and not m['page'].startswith('http'):
                    if m['page'].startswith('$pyFunction:'):
                        val = doEval(m['page'].split('$pyFunction:')[1], '', cookieJar, m)
                        if forCookieJarOnly:
                            return cookieJar  # do nothing
                        link = val
                        link = javascriptUnEscape(link)
                    else:
                        link = m['page']
            if '$doregex' in m['expres']:
                m['expres'] = getRegexParsed(regexs, m['expres'], cookieJar, recursiveCall=True, cachedPages=cachedPages)
            if not m['expres'] == '':
                # print 'doing it ', m['expres']
                if '$LiveStreamCaptcha' in m['expres']:
                    val = askCaptcha(m, link, cookieJar)
                    # print 'url and val', url, val
                    url = url.replace("$doregex[" + k + "]", val)
                elif m['expres'].startswith('$pyFunction:') or '#$pyFunction' in m['expres']:
                    # print 'expeeeeeeeeeeeeeeeeeee', m['expres']
                    val = ''
                    if m['expres'].startswith('$pyFunction:'):
                        val = doEval(m['expres'].split('$pyFunction:')[1], link, cookieJar, m)
                    else:
                        val = doEvalFunction(m['expres'], link, cookieJar, m)
                    if 'ActivateWindow' in m['expres']:
                        return
                    if forCookieJarOnly:
                        return cookieJar  # do nothing
                    if 'listrepeat' in m:
                        listrepeat = m['listrepeat']
                        return listrepeat, eval(val), m, regexs, cookieJar
                    try:
                        url = url.replace(u"$doregex[" + k + "]", val)
                    except Exception:
                        url = url.replace("$doregex[" + k + "]", val.decode("utf-8"))
                else:
                    if 'listrepeat' in m:
                        listrepeat = m['listrepeat']
                        ret = re.findall(m['expres'], link)
                        return listrepeat, ret, m, regexs
                    val = ''
                    if not link == '':
                        # print 'link', link
                        reg = re.compile(m['expres']).search(link)
                        try:
                            val = reg.group(1).strip()
                        except Exception:
                            traceback.print_exc()
                    elif m['page'] == '' or m['page'] is None:
                        val = m['expres']
                    if rawPost:
                        # print 'rawpost'
                        val = urllib.quote_plus(val)
                    if 'htmlunescape' in m:
                        # val = urllib.unquote_plus(val)
                        import HTMLParser
                        val = HTMLParser.HTMLParser().unescape(val)
                    try:
                        url = url.replace("$doregex[" + k + "]", val)
                    except Exception:
                        url = url.replace("$doregex[" + k + "]", val.decode("utf-8"))
                    # print 'ur', url
                    # return val
            else:
                url = url.replace("$doregex[" + k + "]", '')
    if '$epoctime$' in url:
        url = url.replace('$epoctime$', getEpocTime())
    if '$epoctime2$' in url:
        url = url.replace('$epoctime2$', getEpocTime2())
    if '$GUID$' in url:
        import uuid
        url = url.replace('$GUID$', str(uuid.uuid1()).upper())
    if '$get_cookies$' in url:
        url = url.replace('$get_cookies$', getCookiesString(cookieJar))
    if recursiveCall:
        return url
    # print 'final url', repr(url)
    if url == "":
        return
    else:
        return url, setresolved
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys  # needed for sys.argv below; missing from the original import line
import urllib, urllib2, re, xbmcaddon, xbmcplugin, xbmcgui, xbmc, HTMLParser

htmlparser = HTMLParser.HTMLParser()
pluginhandle = int(sys.argv[1])
itemcnt = 0
baseurl = 'http://www.gamestar.de'
channelurl = 'http://www.gamestar.de/videos/video-kanaele/'
getvideourl = 'http://www.gamestar.de/_misc/videos/portal/getVideoUrl.cfm?premium=0&videoId='
googleresize = 'http://images1-focus-opensocial.googleusercontent.com/gadgets/proxy?container=focus&url='
settings = xbmcaddon.Addon(id='plugin.video.gamestar_ll')
maxitems = (int(settings.getSetting("items_per_page")) + 1) * 10
forceMovieViewMode = settings.getSetting("forceMovieViewMode") == 'true'
useThumbAsFanart = settings.getSetting("useThumbAsFanart") == 'true'
hirespix = settings.getSetting("useHighresPix") == 'true'
movieViewMode = str(settings.getSetting("movieViewMode"))
premium = False
dbg = False

cats = [
    ('http://www.gamestar.de/videos/latest/', 'Neueste Videos', '', ''),
    ('http://www.gamestar.de/videos/popular/', 'Meist gesehen', '', ''),
    ('http://www.gamestar.de/videos/news,100/', 'News', 'Von Montag bis Freitag immer mittags berichten wir in unserer News-Show über die wichtigsten Spiele-Themen des Tages.', 'http://images.gamestar.de/images/idgwpgsgp/bdb/2558457/b144x81.jpg'),
    ('http://www.gamestar.de/videos/was-ist-,96/', 'Was ist ...?', 'In »Was ist…?« präsentieren wir Indie-Hits, Geheimtipps und andere Spiele-Kleinode mit kommentierten Spielszenen.', ''),
    ('http://www.gamestar.de/videos/feedback,99/', 'Feedback', 'In Feedback beantwortet unser Team regelmäßig Fragen der Community und plaudert mit Moderator Andre Peschke aus dem Nähkästchen.', ''),
    ('http://www.gamestar.de/videos/kino-und-dvd,26/', 'Kino und DVD', 'Aktuelle Trailer zu Kinofilmen und DVD-Neuerscheinungen.', 'http://images.gamestar.de/images/idgwpgsgp/bdb/2334506/b144x81.jpg'),
    ('http://www.gamestar.de/videos/gamewatch,97/', 'Gamewatch', 'Neue Trailer, Gameplay-Videos oder Live-Demos.', ''),
    ('http://www.gamestar.de/videos/public-viewing,37/', 'Public Viewing', 'Neue Spiele ausführlich angespielt und vorgestellt', 'http://images.gamestar.de/images/idgwpgsgp/bdb/2121485/b144x81.jpg'),
    ('http://www.gamestar.de/index.cfm?pid=1589&ci=9', 'Quickplay', 'Alle Trailer aus dem Action-Genre mit den Unterrubriken Ego-Shooter, Action-Adventures, Flugsimulationen und anderen.', 'http://images.gamestar.de/images/idgwpgsgp/bdb/2016676/b144x81.jpg'),
def gen_quotes(category, title):
    # print "%s %s %s" % ("=" * 30, "quotes", "=" * 30)
    keyword = ""

    '''delete punctuation in title'''
    exclude = set(string.punctuation)
    title = ''.join(ch for ch in title if ch not in exclude)
    # print "%s %s %s" % ("="*30, "title without punctuation", "="*30)
    # print title

    '''generate the list of keywords from title'''
    keywords = alchemy_keywords(title)
    if len(keywords) <= 1:
        # print "keywords not good from alchemyapi"
        keywords = [word for word in title.lower().split()
                    if word not in stopwords.words('english')]
    elif len(keywords) > 3:
        keywords = keywords[:2]
        # print "shorten the keywords: " + keywords

    web_url = ""
    while web_url == "":
        keywords_str = ' '.join(keywords)
        query1 = category + " " + keywords_str + " site:brainyquote.com"
        query2 = urllib.urlencode({'q': query1})
        response = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query2).read()
        json = m_json.loads(response)
        '''in case we hit the limit of searching'''
        try:
            results = json['responseData']['results']
            '''too many keywords for finding a good result'''
            try:
                # print "Query tried: ", query1
                web_url = results[0]["url"]
                # print web_url
                contents = results[0]['content'].split("...")
                contents = filter(None, contents)
                h = HTMLParser.HTMLParser()
                if len(contents) == 1:
                    index = 0
                else:
                    index = 1
                target_content = h.unescape(contents[index])
                target_content = BeautifulSoup(target_content).text
                target_content = " ".join(target_content.split())
            except Exception:
                # print "-----broke"
                del keywords[-1]
                '''lower the quality of target_content in except'''
                target_content = category
        except Exception:
            # print "query no result: " + query1
            result_urls = search(query1, num=10, pause=1.0)
            urls_list = [link for (num, link) in list(enumerate(result_urls))]
            web_url = urls_list[0]
            # print web_url
            target_content = category

    # print "target_content:" + target_content
    '''start to fetch quotes from the right link'''
    success, quote, author = get_quote(web_url, target_content)

    '''if can't find one quote with target_content which happens in two cases:
    1. the quote is in a later page. 2. the target_content is not right'''
    if success == -1:
        # first case: do another search with target_content
        requery = target_content + " site:brainyquote.com"
        # print "requery: " + requery
        # print "target_content: " + target_content
        quotes_urls = search(requery, num=20, pause=2.0)
        urls_list = [link for (num, link) in list(enumerate(quotes_urls))]
        quote_url = urls_list[0]
        success, quote, author = get_quote(quote_url, target_content)
        # second case: pick the first quote
    return quote, author
import ast
import HTMLParser
import re
import sys

import requests
import time
import string

API_URL = "https://api.stackexchange.com"
HTML_PARSER = HTMLParser.HTMLParser()
SEPARATOR_1 = "===================================================="
SEPARATOR_2 = "----------------------------------------------------"
SEPARATOR_3 = "####################################################"


def get_segment(input_string, starting_index=0, ending_index=0,
                beginning_token="(", ending_token=")", escape=True):
    if beginning_token == ending_token or starting_index >= len(input_string) - 1 or starting_index < 0:
        return ""
    if ending_index <= starting_index or ending_index >= len(input_string):
        ending_index = len(input_string)
    if ending_index <= 0:
        return ""
    s = []
    output_starting_index = starting_index
    output_ending_index = ending_index
    found_first_beginning_token = False
    # (identifier typo "match_oject" corrected to "match_object" throughout)
    for match_object in re.finditer(r'({})|({})'.format(re.escape(beginning_token), re.escape(ending_token)),
                                    input_string[starting_index:ending_index]):
        token = match_object.group(0)
        if match_object.group(0) == beginning_token:
            if not found_first_beginning_token:
                output_starting_index = match_object.start()
def edit(request, quoteId=1):
    # Quote editing/modification logic
    if (request.method == 'POST') and ('delete' in request.POST):
        # --- Handle delete requests ---
        quote_in_question = get_object_or_404(Quote, pk=quoteId)
        quote_in_question.delete()
        return redirect('/intranet/quote/')
    elif (request.method == 'POST'):
        # --- Handle save requests (from edit form to quote list) ---
        quote_in_question = get_object_or_404(Quote, pk=quoteId)
        # Add current user to _posters list, if necessary
        if not ("," + request.user.username + ",") in quote_in_question.quote_posters:
            # Strip is used to provide backwards compatibility with old quotes
            quote_in_question.quote_posters = "," + quote_in_question.quote_posters.strip(",") + "," + request.user.username + ","
        quote_form = QuoteForm(request.POST, instance=quote_in_question)
        quote_form.save()
        return redirect('/intranet/quote/')
    else:
        # Make sure quote editor can actually edit the current quote
        # (and reject their request if they can't)
        user = request.user
        quote_obj = get_object_or_404(Quote, pk=quoteId)
        quote_usernames = quote_obj.quote_sources.strip(",").split(",")
        poster_usernames = quote_obj.quote_posters.strip(",").split(",")
        canEdit = (not user.is_anonymous() and (user.username in quote_usernames) or (user.username in poster_usernames)) or (user.is_top4())
        if (not canEdit):
            raise PermissionDenied  # Current user cannot edit this quote

        # --- Handle edit page requests (from quote list to edit form) ---
        # Get authors' Member objects
        quoteMembers = Member.objects.filter(username__in=quote_usernames)
        # Unescape escaped quote text
        quote_obj.quote_text = HTMLParser.HTMLParser().unescape(quote_obj.quote_text)
        # Remove hashtags/authortags in text
        quote_obj.quote_text = string.replace(re.sub("<a href='.+?'>", "", quote_obj.quote_text), "</a>", "")
        # Convert <br />'s into newlines (\n - TODO?: this may cause issues for Windows users)
        quote_obj.quote_text = string.replace(quote_obj.quote_text, "<br />", "\n")
        quote_form = QuoteForm(instance=quote_obj)
        quote_form.fields["quote_posters"].widget = forms.HiddenInput()

        # -- Handle quote editing --
        return render_to_response('intranet/quote/edit.html', {
            "section": "intranet",
            "page": 'quote',
            "form": quote_form,
            "members": Member.objects.all(),
            "quoteMembers": quoteMembers,
            "quote_id": quoteId,
            "user": request.user
        }, context_instance=RequestContext(request))
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Copyright 2014 Techdealer
############## LIBRARIES TO IMPORT AND DEFINITIONS ####################
import urllib, urllib2, re, xbmcplugin, xbmcgui, sys, xbmc, xbmcaddon, xbmcvfs, socket, HTMLParser
import json

h = HTMLParser.HTMLParser()
addon_id = 'plugin.video.replaypt'
selfAddon = xbmcaddon.Addon(id=addon_id)
addonfolder = selfAddon.getAddonInfo('path')
artfolder = '/resources/img/'
vernovelas_url = 'http://vernovelas.com.br/'
##################################################


def listar_categorias():
    try:
        codigo_fonte = abrir_url(vernovelas_url)
    except:
        codigo_fonte = ''
    if codigo_fonte:
        match = re.findall('<li id=".+?" class=".+?"><a.+?href="(.+?)">(.+?)</a></li>', codigo_fonte)
        for url, name in match:
            # Skip static menu entries and excluded categories (the original
            # chained url== comparisons are rewritten as membership tests;
            # behavior is unchanged).
            if name in ('INICIO', 'FUTEBOL AO VIVO', 'CONTATO'):
                continue
            elif url in (
                    'http://www.vernovelas.com.br/category/resumo-de-em-familia',
                    'http://www.vernovelas.com.br/category/resumo-de-malhacao-2',
                    'http://www.vernovelas.com.br/category/resumo-de-joia-rara',
                    'http://www.vernovelas.com.br/category/resumo-de-o-cravo-e-a-rosa',
                    'http://www.vernovelas.com.br/category/resumo-de-chiquititas-2',
                    'http://www.vernovelas.com.br/category/resumo-de-rebelde',
                    'http://www.vernovelas.com.br/category/resumo-de-pecado-mortal'):
                continue
            elif url in (
                    'http://www.vernovelas.com.br/category/tv-globo',
                    'http://www.vernovelas.com.br/category/band',
                    'http://www.vernovelas.com.br/category/sbt',
                    'http://www.vernovelas.com.br/category/record',
                    'http://www.vernovelas.com.br/category/canal-viva',
                    'http://www.vernovelas.com.br/category/sportv',
                    'http://www.vernovelas.com.br/category/combate-2',
                    'http://www.vernovelas.com.br/category/espn',
                    'http://www.vernovelas.com.br/category/fox',
                    'http://www.vernovelas.com.br/category/hbo'):
                continue
import httplib
# import re
# import sys
import os
import Cookie
import string, xbmc, xbmcgui, xbmcplugin, urllib, cookielib, xbmcaddon
# -------------------------------
import urllib, urllib2, time, random
# from time import gmtime, strftime
# from urlparse import urlparse
import HTMLParser
hpar = HTMLParser.HTMLParser()
# -----------------------------------------
import socket
socket.setdefaulttimeout(50)

icon = ""
siteUrl = 'www.KinoPoisk.ru'
httpSiteUrl = 'http://' + siteUrl
sid_file = os.path.join(xbmc.translatePath('special://temp/'), '2kp.cookies.sid')  # 'plugin.video.krasfs.ru.cookies.sid'
# h = int(sys.argv[1])
# ---------------
def html_to_text(s):
    # Strip tags (DOTALL so tags spanning newlines are caught), unescape
    # entities, then collapse runs of whitespace/NBSP bytes.
    # Note: the original passed re.S as re.sub's 4th positional argument,
    # which is `count`, silently capping the number of substitutions; it
    # belongs in flags= on the tag-stripping sub.
    return re.sub(
        r'[\s\x0B\xC2\xA0]+', ' ',
        HTMLParser.HTMLParser().unescape(re.sub('<.*?>', ' ', s, flags=re.S))
    ).strip()
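# Hedged usage sketch (example input is illustrative):
# html_to_text('<p>Tom &amp;\n<b>Jerry</b></p>')  ->  'Tom & Jerry'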
def unescape(entity, encoding):
    if encoding == 'utf-8':
        # Decode HTML entities, then re-encode the result as UTF-8 bytes.
        return HTMLParser.HTMLParser().unescape(entity).encode(encoding)
    elif encoding == 'cp1251':
        # Note: this branch only transcodes cp1251 -> utf-8; HTML entities
        # are left untouched.
        return entity.decode(encoding).encode('utf-8')
def replaceHTMLCodes(txt):
    # Repair numeric entities missing their trailing semicolon,
    # e.g. "&#39" -> "&#39;", then decode everything.
    txt = re.sub("(&#[0-9]+)([^;^0-9]+)", "\\1;\\2", txt)
    txt = HTMLParser.HTMLParser().unescape(txt)
    # The two replaces below were mangled by entity-unescaping in extraction
    # (they read replace(""", "\"") and replace("&", "&")); restored here.
    txt = txt.replace("&quot;", "\"")
    txt = txt.replace("&amp;", "&")
    return txt
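# Hedged usage sketch (example input is illustrative): the re.sub first
# repairs an entity missing its semicolon, then unescape decodes it.
# replaceHTMLCodes("Tom &#38 Jerry &quot;live&quot;")  ->  'Tom & Jerry "live"'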
class URLFollow(CommandInterface):
    triggers = ['urlfollow', 'follow']
    acceptedTypes = ['PRIVMSG', 'ACTION']
    help = 'automatic function that follows urls and grabs information about the resultant webpage'
    runInThread = True

    htmlParser = HTMLParser.HTMLParser()

    graySplitter = assembleFormattedText(A.normal[' ', A.fg.gray['|'], ' '])

    def onLoad(self):
        self.handledExternally = {}
        """@type : dict[str, list[str]]"""
        # dict of regex patterns not to follow. populated by other modules
        # so they can handle them themselves
        self.youtubeKey = load_key(u'YouTube')
        self.imgurClientID = load_key(u'imgur Client ID')
        self.autoFollow = True

    def shouldExecute(self, message):
        """
        @type message: IRCMessage
        """
        if message.Type not in self.acceptedTypes:
            return False
        if ignores.ignoreList is not None:
            if message.User.Name.lower() in ignores.ignoreList:
                return False
        return True

    def execute(self, message):
        """
        @type message: IRCMessage
        """
        match = None
        if message.Command.lower() in self.triggers:
            if message.ParameterList[0].lower() == 'on':
                self.autoFollow = True
                return IRCResponse(ResponseType.Say, 'Auto-follow on', message.ReplyTo)
            elif message.ParameterList[0].lower() == 'off':
                self.autoFollow = False
                return IRCResponse(ResponseType.Say, 'Auto-follow off', message.ReplyTo)
            else:
                match = re.search(r'(?P<url>(https?://|www\.)[^\s]+)', message.Parameters, re.IGNORECASE)
        elif self.autoFollow:
            match = re.search(r'(?P<url>(https?://|www\.)[^\s]+)', message.MessageString, re.IGNORECASE)
        if not match:
            return

        for module, patterns in self.handledExternally.iteritems():
            for pattern in patterns:
                if re.search(pattern, message.MessageString):
                    return  # url will be handled by another module

        return self.DispatchToFollows(match.group('url'), message)

    def DispatchToFollows(self, url, message):
        """
        @type url: unicode
        @type message: IRCMessage
        """
        youtubeMatch = re.search(r'(youtube\.com/watch.+v=|youtu\.be/)(?P<videoID>[^&#\?]{11})', url)
        imgurMatch = re.search(r'(i\.)?imgur\.com/(?P<imgurID>[^\.]+)', url)
        twitterMatch = re.search(r'twitter\.com/(?P<tweeter>[^/]+)/status(es)?/(?P<tweetID>[0-9]+)', url)
        steamMatch = re.search(r'store\.steampowered\.com/(?P<steamType>(app|sub))/(?P<steamID>[0-9]+)', url)
        ksMatch = re.search(r'kickstarter\.com/projects/(?P<ksID>[^/]+/[^/&#\?]+)', url)
        twitchMatch = re.search(r'twitch\.tv/(?P<twitchChannel>[^/]+)/?(\s|$)', url)

        if youtubeMatch:
            return self.FollowYouTube(youtubeMatch.group('videoID'), message)
        elif imgurMatch:
            return self.FollowImgur(imgurMatch.group('imgurID'), message)
        elif twitterMatch:
            return self.FollowTwitter(twitterMatch.group('tweeter'), twitterMatch.group('tweetID'), message)
        elif steamMatch:
            return self.FollowSteam(steamMatch.group('steamType'), steamMatch.group('steamID'), message)
        elif ksMatch:
            return self.FollowKickstarter(ksMatch.group('ksID'), message)
        elif twitchMatch:
            return self.FollowTwitch(twitchMatch.group('twitchChannel'), message)
        elif not re.search('\.(jpe?g|gif|png|bmp)$', url):
            return self.FollowStandard(url, message)

    def FollowYouTube(self, videoID, message):
        if self.youtubeKey is None:
            return IRCResponse(ResponseType.Say, '[YouTube API key not found]', message.ReplyTo)

        fields = 'items(id,snippet(title,description,channelTitle),contentDetails(duration))'
        parts = 'snippet,contentDetails'
        url = 'https://www.googleapis.com/youtube/v3/videos?id={}&fields={}&part={}&key={}'.format(videoID, fields, parts, self.youtubeKey)

        webPage = WebUtils.fetchURL(url)
        webPage.body = webPage.body.decode('utf-8')
        j = json.loads(webPage.body)

        if 'items' not in j:
            return None

        title = j['items'][0]['snippet']['title']
        description = j['items'][0]['snippet']['description']
        channel = j['items'][0]['snippet']['channelTitle']
        length = parse_duration(j["items"][0]["contentDetails"]["duration"]).total_seconds()

        m, s = divmod(int(length), 60)
        h, m = divmod(m, 60)
        if h > 0:
            length = u'{0:02d}:{1:02d}:{2:02d}'.format(h, m, s)
        else:
            length = u'{0:02d}:{1:02d}'.format(m, s)

        if not description:
            description = u'<no description available>'
        description = re.sub('(\n|\s)+', ' ', description)
        limit = 150
        if len(description) > limit:
            description = u'{} ...'.format(description[:limit].rsplit(' ', 1)[0])

        return IRCResponse(ResponseType.Say,
                           self.graySplitter.join([title, length, channel, description]),
                           message.ReplyTo,
                           {'urlfollowURL': 'http://youtu.be/{}'.format(videoID)})

    def FollowImgur(self, imgurID, message):
        if self.imgurClientID is None:
            return IRCResponse(ResponseType.Say, '[imgur Client ID not found]', message.ReplyTo)

        if imgurID.startswith('gallery/'):
            imgurID = imgurID.replace('gallery/', '')

        albumLink = False
        if imgurID.startswith('a/'):
            imgurID = imgurID.replace('a/', '')
            url = 'https://api.imgur.com/3/album/{0}'.format(imgurID)
            albumLink = True
        else:
            url = 'https://api.imgur.com/3/image/{0}'.format(imgurID)

        headers = [('Authorization', 'Client-ID {0}'.format(self.imgurClientID))]

        webPage = WebUtils.fetchURL(url, headers)

        if webPage is None:
            url = 'https://api.imgur.com/3/gallery/{0}'.format(imgurID)
            webPage = WebUtils.fetchURL(url, headers)

        if webPage is None:
            return

        response = json.loads(webPage.body)
        imageData = response['data']

        if imageData['title'] is None:
            url = 'https://api.imgur.com/3/gallery/{0}'.format(imgurID)
            webPage = WebUtils.fetchURL(url, headers)
            if webPage is not None:
                imageData = json.loads(webPage.body)['data']

            if imageData['title'] is None:
                webPage = WebUtils.fetchURL('http://imgur.com/{0}'.format(imgurID))
                imageData['title'] = self.GetTitle(webPage.body).replace(' - Imgur', '')
                if imageData['title'] == 'imgur: the simple image sharer':
                    imageData['title'] = None

        data = []
        if imageData['title'] is not None:
            data.append(imageData['title'])
        else:
            data.append(u'<No Title>')
        if imageData['nsfw']:
            data.append(u'\x034\x02NSFW!\x0F')
        if albumLink:
            data.append(u'Album: {0} Images'.format(imageData['images_count']))
        else:
            if 'is_album' in imageData and imageData['is_album']:
                data.append(u'Album: {0:,d} Images'.format(len(imageData['images'])))
            else:
                if imageData[u'animated']:
                    data.append(u'\x032\x02Animated!\x0F')
                data.append(u'{0:,d}x{1:,d}'.format(imageData['width'], imageData['height']))
                data.append(u'Size: {0:,d}kb'.format(int(imageData['size']) / 1024))
                data.append(u'Views: {0:,d}'.format(imageData['views']))

        return IRCResponse(ResponseType.Say,
                           self.graySplitter.join(data),
                           message.ReplyTo,
                           {'urlfollowURL': '[nope, imgur is too hard. also, pointless?]'})

    def FollowTwitter(self, tweeter, tweetID, message):
        webPage = WebUtils.fetchURL('https://twitter.com/{0}/status/{1}'.format(tweeter, tweetID))

        soup = BeautifulSoup(webPage.body)

        tweet = soup.find(class_='permalink-tweet')
        user = tweet.find(class_='username').text
        tweetText = tweet.find(class_='tweet-text')

        tweetTimeText = tweet.find(class_='client-and-actions').text.strip()
        try:
            tweetTimeText = time.strftime('%Y/%m/%d %H:%M', time.strptime(tweetTimeText, '%I:%M %p - %d %b %Y'))
        except ValueError:
            pass
        links = tweetText.find_all('a', {'data-expanded-url': True})
        for link in links:
            link.string = link['data-expanded-url']
        embeddedLinks = tweetText.find_all('a', {'data-pre-embedded': 'true'})
        for link in embeddedLinks:
            link.string = link['href']

        text = StringUtils.unescapeXHTML(tweetText.text)
        text = re.sub('[\r\n]+', self.graySplitter, text)

        formatString = unicode(assembleFormattedText(A.normal[A.fg.gray['[{0}]'], A.bold[' {1}:'], ' {2}']))

        return IRCResponse(ResponseType.Say,
                           formatString.format(tweetTimeText, user, text),
                           message.ReplyTo,
                           {'urlfollowURL': 'https://twitter.com/{}/status/{}'.format(tweeter, tweetID)})

    def FollowSteam(self, steamType, steamId, message):
        steamType = {'app': 'app', 'sub': 'package'}[steamType]
        webPage = WebUtils.fetchURL('http://store.steampowered.com/api/{0}details/?{0}ids={1}&cc=US&l=english&v=1'.format(steamType, steamId))

        response = json.loads(webPage.body)
        if not response[steamId]['success']:
            return  # failure

        appData = response[steamId]['data']

        data = []

        # name
        if 'developers' in appData:
            name = assembleFormattedText(A.normal[appData['name'], A.fg.gray[' by '], u', '.join(appData['developers'])])
        else:
            name = appData['name']
        data.append(name)

        # package contents (might need to trim this...)
        if 'apps' in appData:
            appNames = [app['name'] for app in appData['apps']]
            apps = u'Package containing: {}'.format(u', '.join(appNames))
            data.append(apps)

        # genres
        if 'genres' in appData:
            data.append(u'Genres: ' + ', '.join([genre['description'] for genre in appData['genres']]))

        # release date
        releaseDate = appData['release_date']
        if not releaseDate['coming_soon']:
            if releaseDate['date']:
                data.append(u'Release Date: ' + releaseDate['date'])
        else:
            data.append(assembleFormattedText(A.normal['Release Date: ', A.fg.cyan[str(releaseDate['date'])]]))

        # metacritic
        # http://www.metacritic.com/faq#item32 (Why is the breakdown of green, yellow, and red scores different for games?)
        if 'metacritic' in appData:
            metaScore = appData['metacritic']['score']
            if metaScore < 50:
                metacritic = assembleFormattedText(A.normal[A.fg.red[str(metaScore)]])
            elif metaScore < 75:
                metacritic = assembleFormattedText(A.normal[A.fg.yellow[str(metaScore)]])
            else:
                metacritic = assembleFormattedText(A.normal[A.fg.green[str(metaScore)]])
            data.append(u'Metacritic: {0}'.format(metacritic))

        # prices
        priceField = {'app': 'price_overview', 'package': 'price'}[steamType]
        if priceField in appData:
            prices = {'USD': appData[priceField],
                      'GBP': self.getSteamPrice(steamType, steamId, 'GB'),
                      'EUR': self.getSteamPrice(steamType, steamId, 'FR'),
                      'AUD': self.getSteamPrice(steamType, steamId, 'AU')}

            currencies = {'USD': u'$',
                          'GBP': u'\u00A3',
                          'EUR': u'\u20AC',
                          'AUD': u'AU$'}

            if not prices['AUD'] or prices['AUD']['final'] == prices['USD']['final']:
                del prices['AUD']

            # filter out any missing prices
            prices = {key: val for key, val in prices.iteritems() if val}

            priceString = u'/'.join([currencies[val['currency']] + unicode(val['final'] / 100.0) for val in prices.values()])
            if prices['USD']['discount_percent'] > 0:
                priceString += assembleFormattedText(A.normal[A.fg.green[' ({0}% sale!)'.format(prices['USD']['discount_percent'])]])

            data.append(priceString)

        # description
        if 'about_the_game' in appData and appData['about_the_game'] is not None:
            limit = 150
            description = re.sub(r'(<[^>]+>|[\r\n\t])+', assembleFormattedText(A.normal[' ', A.fg.gray['>'], ' ']), appData['about_the_game'])
            if len(description) > limit:
                description = u'{0} ...'.format(description[:limit].rsplit(' ', 1)[0])
            data.append(description)

        return IRCResponse(ResponseType.Say,
                           self.graySplitter.join(data),
                           message.ReplyTo,
                           {'urlfollowURL': 'http://store.steampowered.com/{}/{}'.format({'app': 'app', 'package': 'sub'}[steamType], steamId)})

    @classmethod
    def getSteamPrice(cls, appType, appId, region):
        webPage = WebUtils.fetchURL('http://store.steampowered.com/api/{0}details/?{0}ids={1}&cc={2}&l=english&v=1'.format(appType, appId, region))
        priceField = {'app': 'price_overview', 'package': 'price'}[appType]
        response = json.loads(webPage.body)

        if 'data' not in response[appId]:
            return

        if region == 'AU':
            response[appId]['data'][priceField]['currency'] = 'AUD'

        return response[appId]['data'][priceField]

    def FollowKickstarter(self, ksID, message):
        webPage = WebUtils.fetchURL('https://www.kickstarter.com/projects/{}/description'.format(ksID))

        soup = BeautifulSoup(webPage.body)

        data = []

        shorturl = soup.find(rel='shorturl')['href']
        if shorturl is None:
            shorturl = 'https://www.kickstarter.com/projects/{}/'.format(ksID)

        title = soup.find(property='og:title')
        if title is not None:
            creator = soup.find(attrs={'data-modal-class': 'modal_project_by'})
            if creator is not None:
                data.append(unicode(assembleFormattedText(A.normal['{0}', A.fg.gray[' by '], '{1}'])).format(title['content'].strip(), creator.text.strip()))
            else:
                data.append(title['content'].strip())

        stats = soup.find(id='stats')

        # projects in progress
        if stats is not None:
            backerCount = stats.find(id='backers_count')
            if backerCount is not None:
                backerCount = int(backerCount['data-backers-count'])
        # completed projects
        else:
            backerCount = soup.find(class_='NS_projects__spotlight_stats')
            if backerCount is not None:
                backerCount = int(backerCount.b.text.strip().split()[0].replace(',', ''))
        data.append('Backers: {0:,}'.format(backerCount))

        if stats is not None:
            pledgeData = stats.find(id='pledged')
            if pledgeData is not None:
                pledged = float(pledgeData['data-pledged'])
                goal = float(pledgeData['data-goal'])
                percentage =
float(pledgeData['data-percent-raised']) if backerCount > 0: pledgePerBacker = pledged / backerCount else: pledgePerBacker = 0 else: money = soup.select('span.money.no-code') if money: pledgedString = money[0].text.strip() goalString = money[1].text.strip() pledged = float(re.sub(ur'[^0-9.]', u'', pledgedString)) goal = float(re.sub(ur'[^0-9.]', u'', goalString)) percentage = (pledged / goal) if backerCount > 0: pledgePerBacker = pledged / backerCount else: pledgePerBacker = 0 currency = soup.select('span.money.no-code')[-1]['class'] currency.remove('money') currency.remove('no-code') currency = currency[0].upper() if percentage >= 1.0: percentageString = A.fg.green['({3:,.0f}% funded)'] else: percentageString = A.fg.red['({3:,.0f}% funded)'] pledgePerBackerString = A.fg.gray['{4:,.0f}/backer'] pledgedString = assembleFormattedText(A.normal['Pledged: {0:,.0f}', A.fg.gray['/'], '{1:,.0f} {2} ', percentageString, ' ', pledgePerBackerString]) data.append(pledgedString.format(pledged, goal, currency, #pledgedData.data['data-currency'], percentage * 100, pledgePerBacker)) findState = soup.find(id='main_content') if 'Project-state-canceled' in findState['class']: data.append(assembleFormattedText(A.normal[A.fg.red['Cancelled']])) elif 'Project-state-suspended' in findState['class']: data.append(assembleFormattedText(A.normal[A.fg.blue['Suspended']])) elif 'Project-state-failed' in findState['class']: data.append(assembleFormattedText(A.normal[A.fg.red['Failed']])) elif 'Project-state-successful' in findState['class']: data.append(assembleFormattedText(A.normal[A.fg.green['Successful']])) elif 'Project-state-live' in findState['class']: duration = stats.find(id='project_duration_data') if duration is not None: remaining = float(duration['data-hours-remaining']) days = math.floor(remaining/24) hours = remaining % 24 data.append('Duration: {0:.0f} days {1:.1f} hours to go'.format(days, hours)) return IRCResponse(ResponseType.Say, self.graySplitter.join(data), message.ReplyTo, {'urlfollowURL': shorturl}) def FollowTwitch(self, channel, message): # Heavily based on Didero's DideRobot code for the same # https://github.com/Didero/DideRobot/blob/06629fc3c8bddf8f729ce2d27742ff999dfdd1f6/commands/urlTitleFinder.py#L37 # TODO: other stats? 
chanData = {} channelOnline = False twitchHeaders = [('Accept', 'application/vnd.twitchtv.v2+json')] webPage = WebUtils.fetchURL(u'https://api.twitch.tv/kraken/streams/{}'.format(channel), twitchHeaders) streamData = json.loads(webPage.body) if 'stream' in streamData and streamData['stream'] is not None: chanData = streamData['stream']['channel'] channelOnline = True elif 'error' not in streamData: webPage = WebUtils.fetchURL(u'https://api.twitch.tv/kraken/channels/{}'.format(channel), twitchHeaders) chanData = json.loads(webPage.body) if len(chanData) > 0: if channelOnline: channelInfo = assembleFormattedText(A.fg.green['']) + u'{}'.format(chanData['display_name']) + assembleFormattedText(A.normal['']) else: channelInfo = assembleFormattedText(A.fg.red['']) + u'{}'.format(chanData['display_name']) + assembleFormattedText(A.normal['']) channelInfo += u' "{}"'.format(re.sub(r'[\r\n]+', self.graySplitter, chanData['status'].strip())) if chanData['game'] is not None: channelInfo += assembleFormattedText(A.normal[A.fg.gray[', playing '], u'{}'.format(chanData['game'])]) if chanData['mature']: channelInfo += assembleFormattedText(A.normal[A.fg.lightRed[' [Mature]']]) if channelOnline: channelInfo += assembleFormattedText(A.normal[A.fg.green[' (Live with {0:,d} viewers)'.format(streamData['stream']['viewers'])]]) else: channelInfo += assembleFormattedText(A.normal[A.fg.red[' (Offline)']]) return IRCResponse(ResponseType.Say, channelInfo, message.ReplyTo, {'urlfollowURL': 'https://twitch.tv/{}'.format(channel)}) def FollowStandard(self, url, message): webPage = WebUtils.fetchURL(url) if webPage is None: return if webPage.responseUrl != url: return self.DispatchToFollows(webPage.responseUrl, message) title = self.GetTitle(webPage.body) if title is not None: return IRCResponse(ResponseType.Say, u'{0} (at {1})'.format(title, webPage.domain), message.ReplyTo, {'urlfollowURL': url}) return def GetTitle(self, webpage): soup = BeautifulSoup(webpage) title = soup.title if title: title = title.text title = re.sub(u'[\r\n]+', u'', title) # strip any newlines title = title.strip() # strip all whitespace either side title = re.sub(u'\s+', u' ', title) # replace multiple whitespace chars with a single space title = self.htmlParser.unescape(title) # unescape html entities # Split on the first space before 300 characters, and replace the rest with '...' if len(title) > 300: title = title[:300].rsplit(u' ', 1)[0] + u" ..." return title return None
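
# The title-cleaning pipeline in GetTitle above is the core of what the
# module does for "standard" URLs. A minimal standalone sketch of the same
# steps, with plain urllib2 standing in for the bot's WebUtils wrapper;
# get_title and the example URL are hypothetical, not part of the module.
import re
import urllib2

import HTMLParser
from bs4 import BeautifulSoup


def get_title(url):
    html = urllib2.urlopen(url).read()
    title = BeautifulSoup(html).title
    if title is None:
        return None
    title = re.sub(u'[\r\n]+', u'', title.text)          # drop newlines
    title = re.sub(u'\s+', u' ', title).strip()          # collapse whitespace
    return HTMLParser.HTMLParser().unescape(title)       # &amp; -> &, &#39; -> ', etc.


print get_title('http://example.com/')  # -> u'Example Domain'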
def test_unescape_function(self):
    parser = HTMLParser.HTMLParser()
    self.assertEqual(parser.unescape('&#bad;'), '&#bad;')
    self.assertEqual(parser.unescape('&amp;'), '&')
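
# A quick demonstration of what unescape() accepts, alongside the test
# above; all of these are stdlib HTMLParser (Python 2) behaviours.
import HTMLParser

p = HTMLParser.HTMLParser()
print p.unescape('&lt;b&gt;')  # u'<b>'     named entities
print p.unescape('&#65;')      # u'A'       decimal character references
print p.unescape('&#x41;')     # u'A'       hex character references
print p.unescape('&#bad;')     # '&#bad;'   malformed references pass through unchanged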
import os
import urllib2
from collections import namedtuple
from datetime import datetime
from email.utils import parsedate
from contextlib import closing
from functools import partial
from xml.sax.saxutils import escape, quoteattr

import HTMLParser

USER_AGENT = 'calibre mirror'
MR_URL = 'http://www.mobileread.com/forums/'
WORKDIR = '/srv/plugins' if os.path.exists('/srv') else '/t/plugins'
PLUGINS = 'plugins.json.bz2'
INDEX = MR_URL + 'showpost.php?p=1362767&postcount=1'
# INDEX = 'file:///t/raw.html'

IndexEntry = namedtuple(
    'IndexEntry', 'name url donate history uninstall deprecated thread_id')
u = HTMLParser.HTMLParser().unescape


def read(url, get_info=False):  # {{{
    if url.startswith("file://"):
        return urllib2.urlopen(url).read()
    opener = urllib2.build_opener()
    opener.addheaders = [
        ('User-Agent', USER_AGENT),
        ('Accept-Encoding', 'gzip,deflate'),
    ]
    res = opener.open(url)
    info = res.info()
    encoding = info.get('Content-Encoding')
    raw = res.read()
    res.close()
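
# The read() helper above breaks off right after grabbing the raw body and
# its Content-Encoding header. A minimal sketch of how the body could then
# be decoded, assuming only stdlib gzip/zlib; decode_response is a
# hypothetical name, not part of the mirror script itself.
import gzip
import zlib
from io import BytesIO


def decode_response(raw, encoding):
    # Undo the Accept-Encoding negotiation performed by read().
    if encoding == 'gzip':
        return gzip.GzipFile(fileobj=BytesIO(raw)).read()
    if encoding == 'deflate':
        try:
            return zlib.decompress(raw)  # zlib-wrapped deflate
        except zlib.error:
            return zlib.decompress(raw, -zlib.MAX_WBITS)  # raw deflate stream
    return raw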
import string
import sys

import HTMLParser

city_f = ['Los_Angeles,_CA', 'San_Francisco,_CA', 'Manhattan,_NY', 'Houston,_TX',
          'Chicago,_IL', 'Philadelphia,_PA', 'Toronto,_Ontario', 'Atlanta,_GA',
          'San_Diego,_CA', 'Orlando,_FL', 'Washington,_DC', 'Boston,_MA']
cities = ['Los_A', 'San_F', 'Manha', 'San_D', 'Houst', 'Chica',
          'Phila', 'Toron', 'Atlan', 'Washi', 'Bosto', 'Orlan']

# input file names
train_file = sys.argv[1]
test_file = sys.argv[2]
output_file = sys.argv[3]

f = open(train_file, "rb")
lines = f.read().splitlines()
f.close()

docs = []
html_parser = HTMLParser.HTMLParser()

# removing newlines in tweets plus smileys
for i in range(len(lines)):
    temp = lines[i].split(" ")
    if temp[0] in city_f:
        docs.append(html_parser.unescape(lines[i].decode('utf8', 'ignore').encode('ascii', 'ignore')))
    else:
        docs[-1] = docs[-1] + " " + html_parser.unescape(lines[i].decode('utf8', 'ignore').encode('ascii', 'ignore'))

# removing punctuation from training set
for i in range(len(docs)):
    punc = set(string.punctuation)
    ftweet = ""
    for char in docs[i]:
        if char not in punc:
            ftweet += char
    docs[i] = ftweet
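
# The decode/unescape/encode chain above is easier to see on a single line;
# the tweet text here is made up purely for illustration.
import HTMLParser

html_parser = HTMLParser.HTMLParser()
raw = 'Boston,_MA fish &amp; chips \xe2\x9c\x93'  # hypothetical raw tweet line
clean = html_parser.unescape(raw.decode('utf8', 'ignore').encode('ascii', 'ignore'))
print clean  # 'Boston,_MA fish & chips ' -- entity unescaped, UTF-8 checkmark dropped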
import re
import urllib2

import HTMLParser


def getPaintingGenerator(query=u''):
    '''
    Generator yielding metadata dicts for paintings scraped from the
    collectie.boijmans.nl search pages.
    '''
    searchurl = u'http://collectie.boijmans.nl/nl?p=%s&f.type=schilderij'
    htmlparser = HTMLParser.HTMLParser()

    # http://collectie.boijmans.nl/nl?p=54&f.type=schilderij is acting up
    # Nerds start at 0
    for i in range(55, 90):
        print u'\n\n\n\n'
        print searchurl % (i, )
        searchPage = urllib2.urlopen(searchurl % (i, ))
        searchData = searchPage.read()

        itemregex = u'<a href="/nl/collection/([^"]+)" class="block padding bg-light">'
        for match in re.finditer(itemregex, searchData):
            url = u'http://collectie.boijmans.nl/nl/collection/%s' % (match.group(1), )
            urlen = u'http://collectie.boijmans.nl/en/collection/%s' % (match.group(1), )
            itemPage = urllib2.urlopen(url)
            itemData = itemPage.read()
            itemenPage = urllib2.urlopen(urlen)
            itemenData = itemenPage.read()

            metadata = {}
            metadata['url'] = url
            print url

            titlenlregex = u'<p class="col m-full l-3 xl-3 s-reset-padding-bottom m-reset-padding-bottom clear-left"><strong>Titel</strong></p>\s*<p class="col m-full l-9 xl-9 s-reset-padding-top m-reset-padding-top">([^<]+)</p>'
            titleenregex = u'<p class="col m-full l-3 xl-3 s-reset-padding-bottom m-reset-padding-bottom clear-left"><strong>Title</strong></p>\s*<p class="col m-full l-9 xl-9 s-reset-padding-top m-reset-padding-top">([^<]+)</p>'
            creatorregex = u'<p class="col m-full l-3 xl-3 s-reset-padding-bottom m-reset-padding-bottom clear-left"><strong>(Schilder|Kunstenaar|Maker)</strong></p>\s*<p class="col m-full l-9 xl-9 s-reset-padding-top m-reset-padding-top"><a href="[^"]+">([^<]+)</a>'  # |toegeschreven aan|Atelier|school van
            yearregex = u'<p class="col m-full l-3 xl-3 s-reset-padding-bottom m-reset-padding-bottom clear-left"><strong>Jaartal</strong></p>\s*<p class="col m-full l-9 xl-9 s-reset-padding-top m-reset-padding-top"><a href="[^"]+">(\d+)</a></p>'
            idregex = u'<p class="col m-full l-3 xl-3 s-reset-padding-bottom m-reset-padding-bottom clear-left"><strong>Inventarisnummer</strong></p>\s*<p class="col m-full l-9 xl-9 s-reset-padding-top m-reset-padding-top">([^<]+)</p>'
            mediumregex = u'<p class="col m-full l-3 xl-3 s-reset-padding-bottom m-reset-padding-bottom clear-left"><strong>Materiaal en techniek</strong></p>\s*<p class="col m-full l-9 xl-9 s-reset-padding-top m-reset-padding-top">([^<]+)</p>'

            titlenlmatch = re.search(titlenlregex, itemData, flags=re.M)
            metadata[u'titlenl'] = htmlparser.unescape(unicode(titlenlmatch.group(1), "utf-8"))

            titleenmatch = re.search(titleenregex, itemenData, flags=re.M)
            metadata[u'titleen'] = htmlparser.unescape(unicode(titleenmatch.group(1), "utf-8"))

            creatormatch = re.search(creatorregex, itemData, flags=re.M)
            if creatormatch:
                metadata[u'creator'] = htmlparser.unescape(unicode(creatormatch.group(2), "utf-8"))
            else:
                metadata[u'creator'] = u'anonymous'

            yearmatch = re.search(yearregex, itemData, flags=re.M)
            if yearmatch:
                metadata[u'year'] = htmlparser.unescape(unicode(yearmatch.group(1), "utf-8"))

            idmatch = re.search(idregex, itemData, flags=re.M)
            metadata[u'id'] = htmlparser.unescape(unicode(idmatch.group(1), "utf-8"))

            mediummatch = re.search(mediumregex, itemData, flags=re.M)
            metadata[u'medium'] = htmlparser.unescape(unicode(mediummatch.group(1), "utf-8"))

            yield metadata
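
# Hypothetical driver loop for the generator above: print the first few
# scraped records (the field names match the metadata dict it builds).
if __name__ == '__main__':
    for n, painting in enumerate(getPaintingGenerator()):
        print u'%(id)s: %(titlenl)s / %(titleen)s by %(creator)s' % painting
        if n >= 4:
            break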