import re
import time

import requests
import pywikibot
from html.parser import HTMLParser


def getNTGenerator():
    """
    Generator to return National Trust paintings.

    Search has a max of 250 pages, so that's 5*5*250=6250 of the 12,472
    paintings. So need to try the different ways to get all of them.
    """
    htmlparser = HTMLParser()
    # nationalTrustLocationsOnWikidata() is defined elsewhere in this module
    locations = nationalTrustLocationsOnWikidata()
    missedlocations = {}
    baseSearchUrl = u'http://www.nationaltrustcollections.org.uk/results?Categories=7456ee20fffffe0702132e04e5764fd3&Sort=collection&Page=%s'

    for i in range(1, 250):
        print(missedlocations)
        searchUrl = baseSearchUrl % (i,)
        print(searchUrl)
        searchPage = requests.get(searchUrl)
        searchPageData = searchPage.text

        searchRegex = u'\<a href\=\"\/object\/([\d+\.]+)\"\>'
        for match in re.finditer(searchRegex, searchPageData):
            url = u'http://www.nationaltrustcollections.org.uk/object/%s' % (match.group(1),)
            print(url)
            itemPage = requests.get(url)
            itemPageData = itemPage.text

            metadata = {}
            metadata['url'] = url
            metadata['collectionqid'] = u'Q333515'
            metadata['collectionshort'] = u'NT'

            locationregex = u'\<h4\>Collection\<\/h4\>[\r\n\t\s]*\<p\>([^\<]+)\<\/p\>[\r\n\t\s]*\<h4\>On show at\<\/h4\>[\r\n\t\s]*\<p\>\<a href\=\"https?\:\/\/www\.nationaltrust\.org\.uk\/([^\"]+)\"'
            locationMatch = re.search(locationregex, itemPageData)

            location2regex = u'\<h4\>Collection\<\/h4\>[\r\n\t\s]*\<p\>([^\<]+)\<\/p\>[\r\n\t\s]*\<h4\>On show at\<\/h4\>'
            location2Match = re.search(location2regex, itemPageData)

            if locationMatch:
                #print (locationMatch.group(1))
                #print (locationMatch.group(2))
                location = locationMatch.group(2).strip(u'/').lower()
                if location in locations:
                    metadata['locationqid'] = locations.get(location)
                else:
                    if location not in missedlocations:
                        missedlocations[location] = 0
                    missedlocations[location] += 1
                    metadata['locationqid'] = locations.get(location)
            elif location2Match:
                print(location2Match.group(1))
                location = location2Match.group(1).split(u',')[0].lower().replace(u' ', u'-')
                print(location)
                if location in locations:
                    print(u'Location found')
                    metadata['locationqid'] = locations.get(location)
                else:
                    if location not in missedlocations:
                        missedlocations[location] = 0
                    missedlocations[location] += 1
                    metadata['locationqid'] = locations.get(location)

            # Search is for paintings
            metadata['instanceofqid'] = u'Q3305213'

            metadata['idpid'] = u'P217'
            metadata['id'] = u'%s' % (match.group(1),)
            metadata['artworkidpid'] = u'P4373'
            metadata['artworkid'] = u'%s' % (match.group(1),)

            titleRegex = u'\<h2 class\=\"section-title\"\>([^\<]+)\<\/h2\>'
            titleMatch = re.search(titleRegex, itemPageData)
            if titleMatch:
                title = htmlparser.unescape(titleMatch.group(1)).strip()
            else:
                # Sometimes nothing is returned. Just sleep and continue with the next one
                pywikibot.output(u'No title found, probably something went wrong. Sleeping and skipping')
                time.sleep(60)
                continue
                #title = u'(without title)'

            if len(title) > 220:
                title = title[0:200]
            metadata['title'] = { u'en': title, }

            artistRegex = u'\<h3 class\=\"section-subtitle\"\>([^\<]+)\<\/h3\>'
            artistMatch = re.search(artistRegex, itemPageData)
            artistCleanupRegex = u'^(.+)\(([^\)]+)\)$'

            if artistMatch:
                dirtyname = htmlparser.unescape(artistMatch.group(1)).strip()
            else:
                dirtyname = u'anonymous'

            artistCleanupMatch = re.match(artistCleanupRegex, dirtyname)
            if artistCleanupMatch:
                name = artistCleanupMatch.group(1).strip()
            else:
                name = dirtyname.strip()

            metadata['creatorname'] = name
            metadata['description'] = { u'nl': u'%s van %s' % (u'schilderij', metadata.get('creatorname'),),
                                        u'en': u'%s by %s' % (u'painting', metadata.get('creatorname'),),
                                        u'de': u'%s von %s' % (u'Gemälde', metadata.get('creatorname'),),
                                        }

            # Only match on years
            dateRegex = u'\<h4\>Date\<\/h4\>[\r\n\t\s]*\<p\>\s*(\d\d\d\d)\s*(\(signed and dated\))?\<\/p\>'
            circadateRegex = u'\<h4\>Date\<\/h4\>[\r\n\t\s]*\<p\>\s*circa (\d\d\d\d)\s*\<\/p\>'
            perioddateRegex = u'\<h4\>Date\<\/h4\>[\r\n\t\s]*\<p\>\s*(\d\d\d\d)\s*-\s*(\d\d\d\d)\s*\<\/p\>'

            dateMatch = re.search(dateRegex, itemPageData)
            circadateMatch = re.search(circadateRegex, itemPageData)
            perioddateMatch = re.search(perioddateRegex, itemPageData)

            if dateMatch:
                metadata['inception'] = htmlparser.unescape(dateMatch.group(1))
            elif circadateMatch:
                metadata['inception'] = htmlparser.unescape(circadateMatch.group(1))
                metadata['inceptioncirca'] = True
            elif perioddateMatch:
                metadata['inceptionstart'] = int(perioddateMatch.group(1))
                metadata['inceptionend'] = int(perioddateMatch.group(2))

            # acquisitiondate not available
            # acquisitiondateRegex = u'\<em\>Acknowledgement\<\/em\>\:\s*.+(\d\d\d\d)[\r\n\t\s]*\<br\>'
            #acquisitiondateMatch = re.search(acquisitiondateRegex, itemPageData)
            #if acquisitiondateMatch:
            #    metadata['acquisitiondate'] = acquisitiondateMatch.group(1)

            mediumRegex = u'\<h4\>Materials\<\/h4\>[\r\n\t\s]*\<p\>Oil on canvas\<\/p\>'
            mediumMatch = re.search(mediumRegex, itemPageData)
            if mediumMatch:
                metadata['medium'] = u'oil on canvas'

            dimensionRegex = u'\<h4\>Measurements\<\/h4\>[\r\n\t\s]*\<p\>([^\<]+)\<\/p\>'
            dimensionMatch = re.search(dimensionRegex, itemPageData)
            if dimensionMatch:
                dimensiontext = dimensionMatch.group(1).strip()
                regex_2d = u'^(?P<height>\d+)\s*(x|×)\s*(?P<width>\d+)\s*mm'
                regex_3d = u'^(?P<height>\d+)\s*(x|×)\s*(?P<width>\d+)\s*(x|×)\s*(?P<depth>\d+)\s*mm'
                match_2d = re.match(regex_2d, dimensiontext)
                match_3d = re.match(regex_3d, dimensiontext)
                if match_2d:
                    metadata['heightcm'] = u'%s' % (float(match_2d.group(u'height')) / 10,)
                    metadata['widthcm'] = u'%s' % (float(match_2d.group(u'width')) / 10,)
                if match_3d:
                    metadata['heightcm'] = u'%s' % (float(match_3d.group(u'height')) / 10,)
                    metadata['widthcm'] = u'%s' % (float(match_3d.group(u'width')) / 10,)
                    metadata['depthcm'] = u'%s' % (float(match_3d.group(u'depth')) / 10,)

            # Image use policy unclear
            #imageMatch = re.search(imageregex, itemPageData)
            #if imageMatch:
            #    metadata[u'imageurl'] = imageMatch.group(1)
            #    metadata[u'imageurlformat'] = u'Q2195'  # JPEG

            yield metadata

    pywikibot.output(u'Final list of missed locations')
    pywikibot.output(missedlocations)
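# A minimal driver sketch for the generator above, assuming the module-level
# imports and the nationalTrustLocationsOnWikidata() helper are in place
# (the loop makes live HTTP requests, so this is illustrative only):
#
#     if __name__ == '__main__':
#         for painting in getNTGenerator():
#             print(painting.get('id'), painting.get('url'))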
import sys
import re

PY3 = sys.version_info >= (3, 0)
PY34 = sys.version_info >= (3, 4)

if PY3:
    ustr = str  # noqa
    uchr = chr  # noqa
    from urllib.request import pathname2url, url2pathname  # noqa
    from urllib.parse import urlparse, urlunparse, quote  # noqa
    from html.parser import HTMLParser  # noqa
    if PY34:
        import html  # noqa
        html_unescape = html.unescape  # noqa
    else:  # pragma: no cover
        html_unescape = HTMLParser().unescape  # noqa
else:
    ustr = unicode  # noqa
    uchr = unichr  # noqa
    from urllib import pathname2url, url2pathname, quote  # noqa
    from urlparse import urlparse, urlunparse  # noqa
    from HTMLParser import HTMLParser  # noqa
    html_unescape = HTMLParser().unescape  # noqa

RE_WIN_DRIVE_LETTER = re.compile(r"^[A-Za-z]$")
RE_WIN_DRIVE_PATH = re.compile(r"^[A-Za-z]:(?:\\.*)?$")
RE_URL = re.compile('(http|ftp)s?|data|mailto|tel|news')
IS_NARROW = sys.maxunicode == 0xFFFF
RE_WIN_DEFAULT_PROTOCOL = re.compile(r"^///[A-Za-z]:(?:/.*)?$")

if sys.platform.startswith('win'):
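# Example: html_unescape behaves the same on either branch of the shim:
#
#     >>> html_unescape('2 &lt; 3 &amp;&amp; 4 &gt; 1')
#     '2 < 3 && 4 > 1'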
def on_data(self, data):
    all_data = json.loads(HTMLParser().unescape(data))
    # https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object
    # https://gist.github.com/hrp/900964
    if 'text' in all_data:  #1
        tweet = all_data['text']
        tweet = unidecode(tweet)  #2
        tweetID = all_data['id_str']  #3
        source = all_data['source']
        source = unidecode(source)  #4
        if all_data['place']:
            country = all_data['place']['country']
            country = unidecode(country)  #5
            country_code = all_data['place']['country_code']
            country_code = unidecode(country_code)  #6
            full_name = all_data['place']['full_name']
            full_name = unidecode(full_name)  #7
            name = all_data['place']['name']
            name = unidecode(name)  #8
            place_type = all_data['place']['place_type']
            place_type = unidecode(place_type)  #9
        else:
            country = country_code = full_name = name = place_type = "0"

        quote_count = all_data['quote_count']  #10
        reply_count = all_data['reply_count']  #11
        retweet_count = all_data['retweet_count']  #12
        favorite_count = all_data['favorite_count']  #13
        screen_name = all_data['user']['screen_name']
        screen_name = unidecode(screen_name)  #13
        followers_count = all_data['user']['followers_count']  #14
        friends_count = all_data['user']['friends_count']  #15
        verified = all_data['user']['verified']
        #print("verified value is:", verified)
        #type(verified)

        #tweetNoPunctuation = regex.sub('', tweet)
        tweetNoPunctuation = clean_tweet(tweet)

        # we want to make sure while compiling tweets, we do not include the ones that are retweeted
        if not all_data['retweeted'] and not tweet.startswith('RT') and 't.co' not in tweet:
            sentiment_value, confidence = sentiment(tweetNoPunctuation)
            #print(tweet, sentiment_value, confidence)  # print output

            # value manipulations
            if sentiment_value.lower() == "neg":
                num_sentiment = 0
            else:
                num_sentiment = 1

            blob_senti = text_blob_sentiment(tweetNoPunctuation)

            if verified == True:
                verified_bit = 1
            else:
                verified_bit = 0

            found = False
            party = ""
            for word in tweetNoPunctuation.split(" "):
                if word.lower() in party_tags.keys():
                    party_name = party_tags[word.lower()]
                    #print("Found keyword: ", word, " belongs to party: ", party_name)
                    found = True
                    break

            if found:
                created_at = time.strftime('%Y-%m-%d %H:%M:%S')
                newID = (int)(all_data['id'])

                # twitter JSON is being parsed with queries below and using sentiment module, we are assigning confidence values
                # tweetID, party_name, dateTime, tweet, source, country, country_code, full_name, name, place_type,
                # reply_count, retweet_count, favorite_count, result, confidence, num_sentiment
                tweet_data = (tweetID, party_name, created_at, tweet, screen_name, followers_count, friends_count,
                              verified_bit, source, country, country_code, full_name, name, place_type,
                              reply_count, retweet_count, favorite_count, sentiment_value.lower(), confidence, num_sentiment)

                data_to_dump = prepare_tweet_json([tweetID, party_name, created_at, tweet, screen_name, source,
                                                   country, country_code, full_name, place_type,
                                                   sentiment_value.lower(), num_sentiment, confidence,
                                                   followers_count, blob_senti])
                write_to_es(data_to_dump)
                print(data_to_dump)

                # Write a row to the CSV file. I use encode UTF-8
                # csvWriter.writerow([tweetID, party_name, created_at, tweet, screen_name, followers_count, friends_count,
                #                     verified_bit, source, country, country_code, full_name, name, place_type,
                #                     reply_count, retweet_count, favorite_count, sentiment_value.lower(), confidence, num_sentiment])
                # c.execute(add_tweet, tweet_data)
                # conn.commit()
            # else:
            #     print('unrelated tweet found')
        # else:
        #     print('retweeted data found')
    # else:
    #     print('no text field found')

# error handling, since tweepy tends to time out with twitter without any reason, closing the connection from their side
def on_limit(self, track):
    print('Limit hit! Track = %s' % track)
    return True

def on_error(self, status):
    print(status)

def on_disconnect(self, notice):
    print(notice)
    return True
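# Hypothetical wiring for the handlers above, assuming they sit on a
# tweepy.StreamListener subclass (named PartyListener here purely for
# illustration) and the tweepy v3 Stream API; credentials are placeholders:
#
#     auth = tweepy.OAuthHandler('CONSUMER_KEY', 'CONSUMER_SECRET')
#     auth.set_access_token('ACCESS_TOKEN', 'ACCESS_SECRET')
#     stream = tweepy.Stream(auth, PartyListener())
#     stream.filter(track=list(party_tags.keys()))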
def preproc1(comment, steps=range(1, 11), print_help=False):
    '''This function pre-processes a single comment.

    Parameters:
        comment : string, the body of a comment
        steps   : list of ints, each entry in this list corresponds to a preprocessing step

    Returns:
        modComm : string, the modified comment
    '''
    global nlp
    global total_line_split

    comment_after_five = ""
    modComm = ''
    if print_help:
        print("Comment before: " + comment)

    if 1 in steps:
        comment = comment.replace('\n', '')
        comment = re.sub(r'[ ]+', " ", comment)
        if print_help:
            print("Comment after 1: " + comment)

    if 2 in steps:
        remove_html_escape = HTMLParser()
        comment = remove_html_escape.unescape(comment)
        if print_help:
            print("Comment after 2: " + comment)

    if 3 in steps:
        comment = re.sub(r'http\S*', '', comment)
        comment = re.sub(r'www\S*', '', comment)
        if print_help:
            print("Comment after 3: " + comment)

    if 4 in steps:
        comment = re.sub(r'([' + re.escape(punct) + r']+)', r' \1 ', comment)
        comment = re.sub(r'([a-zA-Z] . [a-zA-Z . ]+)', r'\1'.replace(" ", ""), comment)
        comment = re.sub(r'([ ]+)', r' ', comment)
        if print_help:
            print("Comment after 4: " + comment)

    if 5 in steps:
        comment = re.sub(r"([A-Za-z]{1}[']{1}[A-Za-z]{1})", r' \1', comment)
        comment = re.sub(r"([A-Za-z]{1}['] ])", r'\1'.replace("'", "") + " " + "'", comment)
        comment = re.sub(r'[ ]+', ' ', comment)
        comment = comment.strip(" ")
        comment_after_five = comment
        if print_help:
            print("Comment after 5: " + comment)

    if 6 in steps:
        new_comment_temp = ""
        utt = nlp(u"" + comment + "")
        temp_string = ""
        prev_tag = ""
        for token in utt:
            if token.text in punct:
                temp_string = temp_string + token.text
                prev_tag = token.tag_
            else:
                if temp_string != '':
                    new_comment_temp = new_comment_temp + temp_string + "/" + prev_tag + " "
                    prev_tag = ''
                    temp_string = ''
                new_comment_temp = new_comment_temp + token.text + "/" + token.tag_ + " "
        if temp_string != '':
            new_comment_temp = new_comment_temp + temp_string + "/" + prev_tag + " "
        comment = new_comment_temp.strip(" ")
        if print_help:
            print("Comment after 6: " + comment)

    if 7 in steps:
        comment = " " + comment + " "
        comment = re.sub(total_line_split, ' ', comment)
        if print_help:
            print("Comment after 7: " + comment)

    if 8 in steps:
        utt = nlp(u"" + comment_after_five + "")
        for token in utt:
            if token.lemma_[0] == '-' and token.text[0] != '-':
                continue
            else:
                try:
                    comment = re.sub(r'' + re.escape(token.text) + r'', token.lemma_, comment)
                except:
                    pass
        if print_help:
            print("Comment after 8: " + comment)

    if 9 in steps:
        split_comment = comment.split(" ")
        new_comment = ""
        for i in range(len(split_comment)):
            if len(split_comment[i]) == 0:
                continue
            elif i == 0:
                new_comment = new_comment + split_comment[i] + " "
                continue
            elif split_comment[i][0] == '.':
                abbrev_flag = False
                for line in common_abbrev:
                    if abbrev_flag == True:
                        break
                    linesplit = line.split(" ")
                    for abbrev in linesplit:
                        abbrev = abbrev.replace('.', '')
                        abbrev = abbrev.replace('\n', '')
                        if bool(re.search(" " + abbrev + "/", " " + split_comment[i - 1])):
                            abbrev_flag = True
                if abbrev_flag == False:
                    new_comment = new_comment + split_comment[i] + "\n"
                else:
                    new_comment = new_comment + split_comment[i] + " "
            else:
                new_comment = new_comment + split_comment[i] + " "
        comment = new_comment
        if print_help:
            print("Comment after 9: " + comment)

    if 10 in steps:
        comment = comment.lower()
        if print_help:
            print("Comment after 10: " + comment)

    modComm = comment
    return modComm
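# Quick sanity check of the nlp-free steps above (1: newline removal and
# space collapsing, 2: HTML entity unescaping, 3: URL removal); later steps
# need the module's nlp/punct globals, so they are skipped in this sketch:
#
#     >>> preproc1('Hello &amp; goodbye  http://example.com !', steps=[1, 2, 3])
#     'Hello & goodbye  !'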
def getAttribute(cls, node, attr=None):
    if attr:
        attr = node.attrib.get(attr, None)
        if attr:
            attr = HTMLParser().unescape(attr)
    return attr
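# Hypothetical usage, assuming getAttribute() is bound as a classmethod on a
# parser class (shown against a stdlib ElementTree node for illustration):
#
#     import xml.etree.ElementTree as ET
#     node = ET.fromstring('<a title="Fish &amp; Chips"/>')
#     SomeParser.getAttribute(node, 'title')  # -> 'Fish & Chips'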
import os.path
import click
import getpass
import sys
import re
import configparser
import random
from html.parser import HTMLParser
from mastodon import Mastodon
from collections import OrderedDict
from termcolor import colored, cprint

CONF_PATH = os.path.expanduser('~/.config/tootstream/')
CONF_FILE = "tootstream.conf"

html_parser = HTMLParser()

COLORS = ['red', 'green', 'yellow', 'blue', 'magenta', 'cyan', 'white']


def parse_config():
    if not os.path.exists(CONF_PATH):
        os.makedirs(CONF_PATH)
    filename = CONF_PATH + CONF_FILE
    if not os.path.isfile(filename):
        return {}
    config = configparser.ConfigParser()
    parsed = config.read(filename)
    if len(parsed) == 0:
import re

try:
    import urllib.parse as urlparse
except ImportError:
    import urlparse

try:
    from html import unescape
except ImportError:
    try:
        from html.parser import HTMLParser
    except ImportError:
        from HTMLParser import HTMLParser

    unescape = HTMLParser().unescape

from typing import Generator
from typing import Union

import html5lib
import requests

from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache
from cachy import CacheManager

import poetry.packages

from poetry.config import Config
from poetry.locations import CACHE_DIR
def __init__(self, data):
    self.data = data
    self.htmlParser = HTMLParser()
from html.parser import HTMLParser


def cleanhtml(raw_html):
    htmlparser = HTMLParser()
    cleantext = htmlparser.unescape(raw_html)
    # cleanr = re.compile('<.*?>')
    # cleantext = re.sub(cleanr, '', raw_html)
    return cleantext
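# Example: only entities are decoded; tags survive because the tag-stripping
# regex above is commented out.
#
#     >>> cleanhtml('<b>Fish &amp; Chips</b>')
#     '<b>Fish & Chips</b>'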
import re

import requests
from html.parser import HTMLParser


def getSKDGenerator():
    """
    Generator to return Staatliche Kunstsammlungen Dresden paintings
    """
    htmlparser = HTMLParser()
    # No watercolors
    baseSearchUrl = u'https://skd-online-collection.skd.museum/Home/Index?page=%s&tIds=2891,2700,2870,2854,2889'

    for i in range(1, 317):
        searchUrl = baseSearchUrl % (i,)
        print(searchUrl)
        searchPage = requests.get(searchUrl)
        searchPageData = searchPage.text

        searchRegex = u'\<a href\=\"\/Details\/Index\/([^\"]+)\"\>'
        idlist = []
        for match in re.finditer(searchRegex, searchPageData):
            idlist.append(match.group(1))

        for pageid in list(set(idlist)):
            url = u'https://skd-online-collection.skd.museum/Details/Index/%s' % (pageid,)
            print(url)
            metadata = {}
            metadata['collectionqid'] = u'Q653002'
            metadata['collectionshort'] = u'SKD'
            # Search is for paintings
            metadata['instanceofqid'] = u'Q3305213'
            metadata['url'] = url

            itemPage = requests.get(url)
            itemPageData = itemPage.text

            titleRegex = u'\<div class\=\"skd-module-text detail-module-text\"\>[\r\n\t\s]*\<h2\>([^\<]+)\<\/h2\>'
            matchTitle = re.search(titleRegex, itemPageData)
            #if not matchTitle:
            #    titleRegex = u'\<dt\>Artwork title\<\/dt\>[\r\n\t\s]*\<dd\>\<em\>\<span class\=\"noItalics\"\>([^\<]+)\<'
            #    matchTitle = re.search(titleRegex, itemPageData)

            metadata['title'] = {
                u'de': htmlparser.unescape(matchTitle.group(1).strip()),
            }

            creatorRegex = u'\<a href\=\"\/Home\/Index\?page\=1\&pId\=\d+\"\>([^\<]+)\<span\>\s*-\s*(Maler|Autor|K\&\#xFC\;nstler)\<\/span\>\<\/a\>'
            creatorMatch = re.search(creatorRegex, itemPageData)
            #if not creatorMatch:
            #    creatorRegex = u'\<dt\>Artist names\<\/dt\>[\r\n\t\s]*\<dd\>\<a href\=\"[^\"]*\">([^\<]+)\<\/a\>'
            #    creatorMatch = re.search(creatorRegex, itemPageData)

            if creatorMatch:
                name = htmlparser.unescape(creatorMatch.group(1).strip())
                print(u'Before name: %s' % (name,))
                # Handle a couple of cases, otherwise just fall back to what we got
                cregexes = [
                    (u'^unbekannt$', u'anonymous'),
                    (u'^([^,]+) \([^\)]*\d+[^\)]\d+\)$', u'\\1'),
                    (u'^(.+), (.+) \(\d\d\d\d-\)$', u'\\2 \\1'),
                    (u'^(.+), (.+) \([^\)]*\d+[^\)]\d+\)$', u'\\2 \\1'),
                    (u'^([^,]+) \([^\)]*\d+[^\)]\d+\)\s*(Kopie nach|Nachfolger|Schule|Umkreis|Werkstatt|zugeschrieben)$', u'\\2 \\1'),
                    (u'^(.+), (.+) \([^\)]*\d+[^\)]\d+\)\s*(Kopie nach|Nachfolger|Schule|Umkreis|Werkstatt|zugeschrieben)$', u'\\3 \\2 \\1'),
                ]
                for (regex, replace) in cregexes:
                    if re.match(regex, name):
                        name = re.sub(regex, replace, name)
                        print(u'After name: %s' % (name,))
                        break
                metadata['creatorname'] = name
            else:
                metadata['creatorname'] = u'anonymous (not found in metadata)'

            # Set the creator qid to anonymous in these cases
            if metadata['creatorname'] == u'anonymous' or \
               metadata['creatorname'].startswith(u'Kopie nach ') or \
               metadata['creatorname'].startswith(u'Nachfolger ') or \
               metadata['creatorname'].startswith(u'Schule ') or \
               metadata['creatorname'].startswith(u'Umkreis ') or \
               metadata['creatorname'].startswith(u'Werkstatt '):
                metadata['creatorqid'] = u'Q4233718'

            # Customized description if the creator is completely unknown
            if metadata['creatorname'] == u'anonymous':
                metadata['description'] = {
                    u'de': u'Gemälde von unbekannt',
                    u'nl': u'schilderij van anonieme schilder',
                    u'en': u'painting by anonymous painter',
                }
            else:
                metadata['description'] = {
                    u'de': u'%s von %s' % (u'Gemälde', metadata.get('creatorname'),),
                    u'nl': u'%s van %s' % (u'schilderij', metadata.get('creatorname'),),
                    u'en': u'%s by %s' % (u'painting', metadata.get('creatorname'),),
                }

            # https://skd-online-collection.skd.museum/Home/Index?page=1&sId=1
            locations = {
                1: u'Q4890',      # Gemäldegalerie Alte Meister
                2: u'Q472706',    # Galerie Neue Meister
                3: u'Q707407',    # Grünes Gewölbe
                4: u'Q50320660',  # Kunstfonds
                5: u'Q1331753',   # Kunstgewerbemuseum
                6: u'Q570620',    # Kupferstich-Kabinett
                7: u'Q321088',    # Mathematisch-Physikalischer Salon
                8: u'Q324263',    # Münzkabinett
                9: u'Q1305061',   # Museum für Sächsische Volkskunst
                10: u'Q1754671',  # Puppentheatersammlung
                11: u'Q473848',   # Porzellansammlung
                12: u'Q571773',   # Rüstkammer
                13: u'Q869690',   # Skulpturensammlung
            }

            locationRegex = u'\<span\>Museum\<\/span\>[\r\n\t\s]*\<\/div\>[\r\n\t\s]*\<div class\=\"col-xs-12 col-sm-8\"\>[\r\n\t\s]*\<span\>\<a href\=\"\/Home\/Index\?page\=1\&sId\=(\d\d?)\"\>'
            locationMatch = re.search(locationRegex, itemPageData)
            metadata['locationqid'] = locations.get(int(locationMatch.group(1)))

            invRegex = u'\<span\>Inventarnummer\<\/span\>[\r\n\t\s]*\<\/div\>[\r\n\t\s]*\<div class\=\"col-xs-12 col-sm-8\"\>[\r\n\t\s]*\<span\>([^\<]+)\<\/span\>'
            invMatch = re.search(invRegex, itemPageData)
            metadata['id'] = invMatch.group(1).strip()
            metadata['idpid'] = u'P217'

            dateRegex = u'\<span\>Ort, Datierung\<\/span\>[\r\n\t\s]*\<\/div\>[\r\n\t\s]*\<div class\=\"col-xs-12 col-sm-8\"\>[\r\n\t\s]*\<span\>\<a href\=\"\/Home\/Index\?page=1&dVon\=(\d\d\d\d)\&dBis\=(\d\d\d\d)\"\>([^\<]+)\<\/a\>'
            dateMatch = re.search(dateRegex, itemPageData)
            if dateMatch:
                circaregex = u'^[uU]m (\d\d\d\d)$'
                circamatch = re.search(circaregex, dateMatch.group(3))
                if circamatch:
                    metadata['inception'] = circamatch.group(1)
                    metadata['inceptioncirca'] = True
                elif dateMatch.group(1) == dateMatch.group(2):
                    metadata['inception'] = dateMatch.group(1)
                else:
                    metadata['inceptionstart'] = int(dateMatch.group(1))
                    metadata['inceptionend'] = int(dateMatch.group(2))

            # acquisition date is not available
            #metadata['acquisitiondate'] = acquisitiondateMatch.group(1)

            mediumRegex = u'\<span\>Material und Technik\<\/span\>[\r\n\t\s]*\<\/div\>[\r\n\t\s]*\<div class\=\"col-xs-12 col-sm-8\"\>[\r\n\t\s]*\<span\>\<a href\=\"\/Home\/Index\?page\=1\&q\=([^\"]+)\"\>'
            mediumMatch = re.search(mediumRegex, itemPageData)
            if mediumMatch and mediumMatch.group(1).strip() == u'%C3%96l%20auf%20Leinwand':
                metadata['medium'] = u'oil on canvas'

            dimensionRegex = u'\<span\>Abmessungen\<\/span\>[\r\n\t\s]*\<\/div\>[\r\n\t\s]*\<div class\=\"col-xs-12 col-sm-8\"\>[\r\n\t\s]*\<span\>([^\<]+)\<\/span\>'
            dimensionMatch = re.search(dimensionRegex, itemPageData)
            if dimensionMatch:
                dimensiontext = dimensionMatch.group(1).strip()
                regex_2d = u'^(?P<height>\d+(,\d+)?)\s*(cm\s*)?(x|×)\s*(?P<width>\d+(,\d+)?)\s*cm$'
                regex_3d = u'^(?P<height>\d+(,\d+)?)\s*(cm\s*)?(x|×)\s*(?P<width>\d+(,\d+)?)\s*(cm\s*)?(x|×)\s*(?P<depth>\d+(,\d+)?)\s*cm$'
                match_2d = re.match(regex_2d, dimensiontext)
                match_3d = re.match(regex_3d, dimensiontext)
                if match_2d:
                    metadata['heightcm'] = match_2d.group(u'height').replace(u',', u'.')
                    metadata['widthcm'] = match_2d.group(u'width').replace(u',', u'.')
                if match_3d:
                    metadata['heightcm'] = match_3d.group(u'height').replace(u',', u'.')
                    metadata['widthcm'] = match_3d.group(u'width').replace(u',', u'.')
                    metadata['depthcm'] = match_3d.group(u'depth').replace(u',', u'.')

            # Image use policy unclear and most (if not all) in copyright
            #imageMatch = re.search(imageregex, itemPageData)
            #if imageMatch:
            #    metadata[u'imageurl'] = imageMatch.group(1)
            #    metadata[u'imageurlformat'] = u'Q2195'  # JPEG

            yield metadata
import six

if six.PY3:
    from html.parser import HTMLParser
else:
    from HTMLParser import HTMLParser

htmlparser = HTMLParser()

from lxml.etree import XMLSyntaxError

import logging
log = logging.getLogger('confluence-tool.page')


class Page(object):
    def __init__(self, api, data, expand=None):
        self.api = api
        self.data = data

        if 'body' in self.data:
            body = self.data['body']
            if 'storage' in body:
                body = body['storage']
                log.debug("body: %s", body['value'])
                body['value'] = htmlparser.unescape(body['value'])
                log.debug("unescaped body: %s", body['value'])
            elif 'view' in body:
def getENameUnparsed(self):
    htmlparse = HTMLParser()
    return htmlparse.unescape(self.eName)
def unescape(s):
    return HTMLParser().unescape(s)
def replace_html_codes(txt):
    # Restore the missing semicolon on bare numeric references like "&#39"
    txt = re.sub("(&#[0-9]+)([^;^0-9]+)", "\\1;\\2", txt)
    txt = HTMLParser.HTMLParser().unescape(txt)
    txt = txt.replace("&quot;", "\"")
    txt = txt.replace("&amp;", "&")
    return txt
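# Worked example: the leading regex first repairs the semicolon on bare
# numeric references before the entities are unescaped.
#
#     >>> replace_html_codes('It&#39s &quot;fine&quot; &amp; dandy')
#     'It\'s "fine" & dandy'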
import sys
from json import JSONDecoder

try:  # Python 3
    from urllib.request import Request, urlopen
    from urllib.error import HTTPError
    from html.parser import HTMLParser
except ImportError:  # Python 2
    from urllib2 import Request, urlopen, HTTPError
    from HTMLParser import HTMLParser


def getitems(subreddit, multireddit=False, previd='', reddit_sort=None):
    """Return list of items from a subreddit.

    :param subreddit: subreddit to load the post
    :param multireddit: multireddit if given instead subreddit
    :param previd: previous post id, to get more post
    :param reddit_sort: type of sorting post
    :returns: list -- list of post url

    :Example:

    >>> # Recent items for Python.
    >>> items = getitems('python')
    >>> for item in items:
    ...     print '\t%s - %s' % (item['title'], item['url']) # doctest: +SKIP

    >>> # Previous items for Python.
    >>> olditems = getitems('python', ITEMS[-1]['id'])
    >>> for item in olditems:
    ...     print '\t%s - %s' % (item['title'], item['url']) # doctest: +SKIP
    """
    if subreddit == '':
        raise Exception("No subreddit provided")

    if multireddit:
        if '/m/' not in subreddit:
            warning = ('That doesn\'t look like a multireddit. Are you sure '
                       'you need that multireddit flag?')
            print(warning)
            sys.exit(1)
        url = 'http://www.reddit.com/user/%s.json' % subreddit
    if not multireddit:
        if '/m/' in subreddit:
            warning = ('It looks like you are trying to fetch a multireddit. \n'
                       'Check the multireddit flag. '
                       'Call --help for more info')
            print(warning)
            sys.exit(1)
        # no sorting needed
        if reddit_sort is None:
            url = 'http://www.reddit.com/r/{}.json'.format(subreddit)
        # if sort is top or controversial, may include advanced sort (ie week, all etc)
        elif 'top' in reddit_sort:
            url = 'http://www.reddit.com/r/{}/{}.json'.format(subreddit, 'top')
        elif 'controversial' in reddit_sort:
            url = 'http://www.reddit.com/r/{}/{}.json'.format(subreddit, 'controversial')
        # use default
        else:
            url = 'http://www.reddit.com/r/{}/{}.json'.format(subreddit, reddit_sort)

    # Get items after item with 'id' of previd.
    hdr = {'User-Agent': 'RedditImageGrab script.'}

    # here is where the query starts
    # query for previd comment
    if previd:
        url = '%s?after=t3_%s' % (url, previd)

    # query for more advanced top and controversial sort
    # available extensions: hour, day, week, month, year, all
    # ie tophour, topweek, topweek etc
    # ie controversialhour, controversialweek etc

    # check if reddit_sort is an advanced sort
    is_advanced_sort = False
    if reddit_sort is not None:
        if reddit_sort == 'top' or reddit_sort == 'controversial':
            # don't need another additional query
            is_advanced_sort = False
        elif 'top' in reddit_sort:
            is_advanced_sort = True
            sort_time_limit = reddit_sort[3:]
            sort_type = 'top'
        elif 'controversial' in reddit_sort:
            is_advanced_sort = True
            sort_time_limit = reddit_sort[13:]
            sort_type = 'controversial'

        if is_advanced_sort:
            # check if url already has a query
            if '?' in url.split('/')[-1]:
                url += '&'
            else:  # url doesn't have a query yet
                url += '?'
            # add advanced sort
            url += 'sort={}&t={}'.format(sort_type, sort_time_limit)

    try:
        req = Request(url, headers=hdr)
        json = urlopen(req).read()
        json = json.decode('utf-8')
        data = JSONDecoder().decode(json)
        if isinstance(data, dict):
            items = [x['data'] for x in data['data']['children']]
        elif isinstance(data, list):
            # e.g. https://www.reddit.com/r/photoshopbattles/comments/29evni.json
            items = [x['data']
                     for subdata in data
                     for x in subdata['data']['children']]
            items = [item for item in items if item.get('url')]
    except HTTPError as ERROR:
        error_message = '\tHTTP ERROR: Code %s for %s' % (ERROR.code, url)
        sys.exit(error_message)
    except ValueError as ERROR:
        if ERROR.args[0] == 'No JSON object could be decoded':
            error_message = 'ERROR: subreddit "%s" does not exist' % (subreddit)
            sys.exit(error_message)
        raise ERROR
    except KeyboardInterrupt as ERROR:
        error_message = '\tKeyboardInterrupt: url:{}.'.format(url)
        sys.exit(error_message)

    # This is weird but apparently necessary: reddit's json data
    # returns `url` values html-escaped, whereas we normally need them
    # in the way they are meant to be downloaded (i.e. urlquoted at
    # most).
    htmlparser = HTMLParser()
    for item in items:
        if item.get('url'):
            item['url'] = htmlparser.unescape(item['url'])

    return items
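# Worked example of the advanced-sort handling above: a value like 'topweek'
# splits into sort_type 'top' plus time limit reddit_sort[3:] == 'week',
# which is appended as a query string:
#
#     getitems('python', reddit_sort='topweek')
#     # fetches http://www.reddit.com/r/python/top.json?sort=top&t=week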
def normalizeTextForTagger(text):
    text = text.replace("&amp;", "&")
    text = HTMLParser.HTMLParser().unescape(text)
    return text
from html.parser import HTMLParser


# Assumption: the handler methods below belong to an HTMLParser subclass
# whose class header is not shown in the excerpt; a name is supplied here
# so the script actually runs.
class MyHTMLParser(HTMLParser):
    def handle_starttag(self, tag, attrs):
        print('Start tag: ', tag)
        for attr in attrs:
            print('attr: ', attr)

    def handle_endtag(self, tag):
        print('End tag: ', tag)

    def handle_comment(self, data):
        print('Comment: ', data)

    def handle_data(self, data):
        print('Data: ', data)


parser = MyHTMLParser()
parser.feed('<html><head><code></title></head><body><h1><!--hi-->I am a code</h1></body></html>')
print()

user_input = input('Put in the HTML code')
parser.feed(user_input)
print()

htmlFile = open('samHTML.html', 'r')
s = ''
for line in htmlFile:
    s += line
parser.feed(s)
import requests
import pywikibot
from html.parser import HTMLParser


def getBarnesGenerator():
    """
    Generator to return Barnes Foundation paintings
    """
    size = 100
    basesearchurl = u'https://collection.barnesfoundation.org/api/search?body={%%22from%%22:%s,%%22size%%22:%s}'
    htmlparser = HTMLParser()

    # 963 results, 20 per page (starting at 0)
    for i in range(0, 2700, size):
        searchurl = basesearchurl % (i, size)
        pywikibot.output(searchurl)
        searchPage = requests.get(searchurl)
        searchJson = searchPage.json()

        for object in searchJson.get(u'hits').get(u'hits'):
            item = object.get(u'_source')
            #print (item)
            metadata = {}

            #print (item.get('classification'))
            if not item.get('classification') == u'Paintings':
                continue
            # We checked, it's a painting
            metadata['instanceofqid'] = u'Q3305213'

            #print (itemurl)
            metadata['artworkidpid'] = u'P4709'
            # Something weird going on with the id's
            if item.get('id'):
                metadata['artworkid'] = u'%s' % (item.get('id'),)
            elif object.get(u'_id'):
                metadata['artworkid'] = u'%s' % (object.get('_id'),)
            # This will crash the bot if no valid id was found
            url = u'https://collection.barnesfoundation.org/objects/%s/details' % (metadata['artworkid'],)

            # Museum site probably doesn't like it when we go fast
            # time.sleep(5)

            pywikibot.output(url)
            #itempage = requests.get(url)
            metadata['url'] = url

            metadata['collectionqid'] = u'Q808462'
            metadata['collectionshort'] = u'Barnes'
            metadata['locationqid'] = u'Q808462'

            # Get the ID. This needs to burn if it's not available
            metadata['id'] = item.get('invno')
            metadata['idpid'] = u'P217'

            if item.get('title'):
                title = htmlparser.unescape(item.get('title'))
            else:
                title = u'(without title)'

            metadata['title'] = { u'en': title, }

            name = htmlparser.unescape(item.get('people'))
            #if u',' in name:
            #    (surname, sep, firstname) = name.partition(u',')
            #    name = u'%s %s' % (firstname.strip(), surname.strip(),)

            metadata['creatorname'] = name
            metadata['description'] = {
                u'nl': u'%s van %s' % (u'schilderij', metadata.get('creatorname'),),
                u'en': u'%s by %s' % (u'painting', metadata.get('creatorname'),),
            }
            metadata['inception'] = item.get('displayDate')

            if item.get('medium') and item.get('medium').strip() == u'Oil on canvas':
                metadata['medium'] = u'oil on canvas'

            # Could implement this later again
            #if bigmatch.group(u'dimensions'):
            #    dimensiontext = bigmatch.group(u'dimensions').strip()
            #    regex_2d = u'.+\((?P<height>\d+(\.\d+)?) x (?P<width>\d+(\.\d+)?) cm\)$'
            #    regex_3d = u'.+\((?P<height>\d+(\.\d+)?) x (?P<width>\d+(\.\d+)?) x (?P<depth>\d+(\.\d+)?) cm\)$'
            #    match_2d = re.match(regex_2d, dimensiontext)
            #    match_3d = re.match(regex_3d, dimensiontext)
            #    if match_2d:
            #        metadata['heightcm'] = match_2d.group(u'height')
            #        metadata['widthcm'] = match_2d.group(u'width')
            #    elif match_3d:
            #        metadata['heightcm'] = match_3d.group(u'height')
            #        metadata['widthcm'] = match_3d.group(u'width')
            #        metadata['depthcm'] = match_3d.group(u'depth')

            if not item.get('copyright') and item.get('objRightsTypeId') == u'8':
                if item.get('imageOriginalSecret'):
                    metadata[u'imageurl'] = u'http://s3.amazonaws.com/barnes-image-repository/images/%s_%s_o.jpg' % (metadata['artworkid'], item.get('imageOriginalSecret'))
                    metadata[u'imageurlformat'] = u'Q2195'  # JPEG
            yield metadata
def __init__(self):
    self.session = self.get_session()
    self.parser = HTMLParser()
# The excerpt starts inside a Python 2/3 compatibility block; the `if PY3:`
# head and the PY3-side HTMLParser import are reconstructed here as an
# assumption so the `else:` branch parses.
if PY3:
    from html.parser import HTMLParser
    from urllib.parse import quote_plus, urlparse
    from urllib.request import urlopen
    from urllib.error import HTTPError
    unicode = bytes
    unichr = chr
else:
    from HTMLParser import HTMLParser
    from urllib import quote_plus
    from urllib2 import urlopen, HTTPError
    from urlparse import urlparse

API_URL = "http://node-hnapi.herokuapp.com"
MARKDOWN_URL = "http://fuckyeahmarkdown.com/go/?read=1&u="
SEARCH = ("https://hn.algolia.com/api/v1/search"
          + "?tags=story&hitsPerPage=60&query=")

html = HTMLParser()


def bwrite(s):
    b = vim.current.buffer

    # Never write more than two blank lines in a row
    if not s.strip() and not b[-1].strip() and not b[-2].strip():
        return

    # Vim buffer.append() cannot accept unicode type,
    # must first encode to UTF-8 string
    if isinstance(s, unicode):
        s = s.encode('utf-8', errors='replace')

    # Code block markers for syntax highlighting
    cb = unichr(160)
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
import sys
import libardrssparser
import libmediathek3 as libMediathek
from html.parser import HTMLParser

h = HTMLParser()

useThumbAsFanart = True
baseUrl = "http://www.ardmediathek.de"
defaultThumb = baseUrl + "/ard/static/pics/default/16_9/default_webM_16_9.jpg"
defaultBackground = "http://www.ard.de/pool/img/ard/background/base_xl.jpg"
icon = ''  # todo
showDateInTitle = False


def listRSS(url, page=0):
    if page > 1:
        url += '&mcontents=page.' + str(page)
    response = libMediathek.getUrl(url)
    data = libardrssparser.parser(response)
    if page == 0:
        return data
    else:
        if len(data) == 50:
            return data, True
        else:
            return data, False
try:
    from html import unescape
except ImportError:
    from html.parser import HTMLParser

    unescape = HTMLParser().unescape  # type: ignore

from re import IGNORECASE
from re import compile as re_compile
from typing import Optional

from crontab import CronItem
from wtforms import StringField
from wtforms import TextAreaField
from wtforms.validators import HostnameValidation
from wtforms.validators import Regexp
from wtforms.validators import ValidationError


class CronSchedule:
    def __init__(self, message=None):
        self.message = message

    def __call__(self, form, field, message=None):
        schedule = (field.data or '').strip()
        if not schedule:
            return
        try:
            CronItem().setall(schedule)
        except (KeyError, ValueError):
            message = message or self.message or field.gettext('Invalid cron')
            # The excerpt ends here; rejecting the field is the natural
            # completion for a wtforms validator.
            raise ValidationError(message)
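# Standalone check mirroring the validator's core test, assuming the
# python-crontab package that provides CronItem:
#
#     item = CronItem()
#     item.setall('*/5 * * * *')   # parses fine: a valid schedule
#     item.setall('not a cron')    # raises KeyError/ValueError -> ValidationError above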
def identify(self, log, result_queue, abort, title=None, authors=None,
             identifiers={}, timeout=90):  # {{{
    from calibre.ebooks.chardet import xml_to_unicode
    from html.parser import HTMLParser
    from lxml import etree, html
    import json

    if not self.is_configured():
        return
    query = self.create_query(log, title=title, authors=authors, identifiers=identifiers)
    if not query:
        err = 'Insufficient metadata to construct query'
        log.error(err)
        return err

    try:
        raw = self.browser.open_novisit(query).read()
    except Exception as e:
        log.exception('Failed to make identify query: %r' % query)
        return as_unicode(e)

    try:
        doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
        entries_block = doc.xpath('//div[@class="bSearchResult"]')
        # log.debug(u'HTML: %s' % xml_to_unicode(raw, verbose=True)[0])
        if entries_block:
            entries = doc.xpath('//div[contains(@itemprop, "itemListElement")]')
            # log.debug(u'entries_block')
            # for entry in entries:
            #     log.debug('entries %s' % etree.tostring(entry))
            metadata = self.get_metadata(log, entries, title, authors, identifiers)
            self.get_all_details(log, metadata, abort, result_queue, identifiers, timeout)
        else:
            # Redirect page: trying to extract ozon_id from javascript data
            h = HTMLParser()
            entry_string = h.unescape(etree.tostring(doc, pretty_print=True, encoding=str))
            json_pat = re.compile(r'dataLayer\s*=\s*(.+)?;')
            json_info = re.search(json_pat, entry_string)
            jsondata = json_info.group(1) if json_info else None
            # log.debug(u'jsondata: %s' % jsondata)
            dataLayer = json.loads(jsondata) if jsondata else None

            ozon_id = None
            if dataLayer and dataLayer[0] and 'ecommerce' in dataLayer[0]:
                jsproduct = dataLayer[0]['ecommerce']['detail']['products'][0]
                ozon_id = as_unicode(jsproduct['id'])
                entry_title = as_unicode(jsproduct['name'])
                log.debug('ozon_id %s' % ozon_id)
                log.debug('entry_title %s' % entry_title)

                if ozon_id:
                    metadata = self.to_metadata_for_single_entry(log, ozon_id, entry_title, authors)
                    identifiers['ozon'] = ozon_id
                    self.get_all_details(log, [metadata], abort, result_queue,
                                         identifiers, timeout, cachedPagesDict={})

            if not ozon_id:
                log.error('No SearchResults in Ozon.ru response found!')
    except Exception as e:
        log.exception('Failed to parse identify results')
        return as_unicode(e)
import re

import requests
import pywikibot
from html.parser import HTMLParser


def getGilcreaseGenerator():
    """
    Generator to return Gilcrease Museum paintings
    """
    basesearchurl = u'https://collections.gilcrease.org/search/site?page=%s&f%%5B0%%5D=im_field_classification%%3A1045'
    htmlparser = HTMLParser()

    # 2307 hits, 20 per page
    for i in range(0, 116):
        searchurl = basesearchurl % (i,)
        print(searchurl)
        searchPage = requests.get(searchurl)

        workidregex = u'\<a href\=\"https\:\/\/collections\.gilcrease\.org\/object\/(\d+)\"'
        matches = re.finditer(workidregex, searchPage.text)
        for match in matches:
            url = u'https://collections.gilcrease.org/object/%s' % (match.group(1),)
            metadata = {}

            itempage = requests.get(url)
            pywikibot.output(url)
            metadata['url'] = url

            metadata['collectionqid'] = u'Q14708424'
            metadata['collectionshort'] = u'Gilcrease'
            metadata['locationqid'] = u'Q14708424'

            # No need to check, I'm actually searching for paintings.
            metadata['instanceofqid'] = u'Q3305213'
            metadata['idpid'] = u'P217'

            invregex = u'\<div class\=\"field-label\"\>Accession No\:&nbsp\;\<\/div\>\<div class\=\"field-items\"\>\<div class\=\"field-item even\"\>([^\<]+)\<\/div\>'
            invmatch = re.search(invregex, itempage.text)
            # Not sure if I need to replace the non-breaking space here
            metadata['id'] = htmlparser.unescape(invmatch.group(1).replace(u'&nbsp;', u' ')).strip()

            titleregex = u'\<div class\=\"field-label\"\>Title\(s\)\:&nbsp\;\<\/div\>\<div class\=\"field-items\"\><div class\=\"field-item even\"\>([^\<]+)\<\/div\>'
            titlematch = re.search(titleregex, itempage.text)
            title = htmlparser.unescape(titlematch.group(1)).strip()

            # Chop chop, several very long titles
            if len(title) > 220:
                title = title[0:200]
            metadata['title'] = { u'en': title, }

            creatorregex = u'\<div class\=\"field-label\"\>Creator\(s\)\:&nbsp\;\<\/div\>\<div class\=\"field-items\"\>\<div class\=\"field-item even\"\>([^\<]+)\<\/div\>'
            creatormatch = re.search(creatorregex, itempage.text)
            # Rare cases without a match
            if creatormatch:
                creatorname = htmlparser.unescape(creatormatch.group(1)).strip()
                metadata['creatorname'] = creatorname
                metadata['description'] = {
                    u'nl': u'%s van %s' % (u'schilderij', metadata.get('creatorname'),),
                    u'en': u'%s by %s' % (u'painting', metadata.get('creatorname'),),
                    u'de': u'%s von %s' % (u'Gemälde', metadata.get('creatorname'),),
                    u'fr': u'%s de %s' % (u'peinture', metadata.get('creatorname'),),
                }

            # Let's see if we can extract some dates.
            dateregex = u'\<div class\=\"field-label\"\>Date\:&nbsp\;\<\/div\>\<div class\=\"field-items\"\>\<div class\=\"field-item even\"\>(\d\d\d\d)\<\/div\>'
            datecircaregex = u'\<div class\=\"field-label\"\>Date\:&nbsp\;\<\/div\>\<div class\=\"field-items\"\>\<div class\=\"field-item even\"\>circa (\d\d\d\d)\<\/div\>'
            periodregex = u'\<span property\=\"dateCreated\" itemprop\=\"dateCreated\" class\=\"detailFieldValue\"\>(\d\d\d\d)[-–](\d\d\d\d)\<\/span\>'  # Not seen
            circaperiodregex = u'\<span property\=\"dateCreated\" itemprop\=\"dateCreated\" class\=\"detailFieldValue\"\>about (\d\d\d\d)[-–](\d\d\d\d)\<\/span\>'  # Not seen
            shortperiodregex = u'\<meta content\=\"(\d\d)(\d\d)-(\d\d)\" property\=\"schema\:dateCreated\" itemprop\=\"dateCreated\"\>'  # Not seen
            circashortperiodregex = u'\<meta content\=\"ca?\.\s*(\d\d)(\d\d)-(\d\d)\" property\=\"schema\:dateCreated\" itemprop\=\"dateCreated\"\>'  # Not seen
            otherdateregex = u'\<div class\=\"field-label\"\>Date\:&nbsp\;\<\/div\>\<div class\=\"field-items\"\>\<div class\=\"field-item even\"\>([^\<]+)\<\/div\>'

            datematch = re.search(dateregex, itempage.text)
            datecircamatch = re.search(datecircaregex, itempage.text)
            periodmatch = re.search(periodregex, itempage.text)
            circaperiodmatch = re.search(circaperiodregex, itempage.text)
            shortperiodmatch = re.search(shortperiodregex, itempage.text)
            circashortperiodmatch = re.search(circashortperiodregex, itempage.text)
            otherdatematch = re.search(otherdateregex, itempage.text)

            if datematch:
                metadata['inception'] = int(datematch.group(1).strip())
            elif datecircamatch:
                metadata['inception'] = int(datecircamatch.group(1).strip())
                metadata['inceptioncirca'] = True
            elif periodmatch:
                metadata['inceptionstart'] = int(periodmatch.group(1))
                metadata['inceptionend'] = int(periodmatch.group(2))
            elif circaperiodmatch:
                metadata['inceptionstart'] = int(circaperiodmatch.group(1))
                metadata['inceptionend'] = int(circaperiodmatch.group(2))
                metadata['inceptioncirca'] = True
            elif shortperiodmatch:
                metadata['inceptionstart'] = int(u'%s%s' % (shortperiodmatch.group(1), shortperiodmatch.group(2),))
                metadata['inceptionend'] = int(u'%s%s' % (shortperiodmatch.group(1), shortperiodmatch.group(3),))
            elif circashortperiodmatch:
                metadata['inceptionstart'] = int(u'%s%s' % (circashortperiodmatch.group(1), circashortperiodmatch.group(2),))
                metadata['inceptionend'] = int(u'%s%s' % (circashortperiodmatch.group(1), circashortperiodmatch.group(3),))
                metadata['inceptioncirca'] = True
            elif otherdatematch:
                print(u'Could not parse date: "%s"' % (otherdatematch.group(1),))

            # Credit line sometimes contains a date
            acquisitiondateregex = u'\<div class\=\"field-label\"\>Credit Line\:&nbsp\;\<\/div\>\<div class\=\"field-items\"\>\<div class\=\"field-item even\"\>[^\<]+ (\d\d\d\d)\<\/div\>'
            acquisitiondatematch = re.search(acquisitiondateregex, itempage.text)
            if acquisitiondatematch:
                metadata['acquisitiondate'] = int(acquisitiondatematch.group(1))

            mediumregex = u'\<div class\=\"field-label\"\>Materials\/Techniques\:&nbsp\;\<\/div\>\<div class\=\"field-items\"\><div class\=\"field-item even\"\>oil on canvas\<\/div\>'
            mediummatch = re.search(mediumregex, itempage.text)
            if mediummatch:
                metadata['medium'] = u'oil on canvas'

            # Dimensions is a pain to parse
            #measurementsregex = u'\<div class\=\"detailField dimensionsField\"\>\<span class\=\"detailFieldLabel\"\>Dimensions\:\<\/span\>\<span class\=\"detailFieldValue\"\>\<div\>(board|canvas|panel)?\:\s*(?P<dim>[^\<]+)\<\/div\>'
            #measurementsmatch = re.search(measurementsregex, itempage.text)
            #if measurementsmatch:
            #    measurementstext = measurementsmatch.group(u'dim')
            #    regex_2d = u'^(?P<height>\d+(\.\d+)?)\s*x\s*(?P<width>\d+(\.\d+)?)\s*cm'
            #    match_2d = re.match(regex_2d, measurementstext)
            #    if match_2d:
            #        metadata['heightcm'] = match_2d.group(u'height').replace(u',', u'.')
            #        metadata['widthcm'] = match_2d.group(u'width').replace(u',', u'.')

            # Add genre portrait. Tagging so other things don't seem to be very good quality
            portraitregex = u'\<a href\=\"\/tags\/portraits\"\>portraits?\<\/a\>'
            portraitmatch = re.search(portraitregex, itempage.text)
            if portraitmatch:
                metadata[u'genreqid'] = u'Q134307'

            ## NO free images
            #imageregex = u'\<meta property\=\"og:image\" content\=\"([^\"]+)\"\ \/\>'
            #imagematch = re.search(imageregex, itempage.text)
            #if imagematch and u'https://creativecommons.org/licenses/by-sa/4.0/' in itempage.text:
            #    metadata[u'imageurl'] = imagematch.group(1)
            #    metadata[u'imageurlformat'] = u'Q2195'  # JPEG
            #    metadata[u'imageurllicense'] = u'Q18199165'  # cc-by-sa-4.0
            #    metadata[u'imageoperatedby'] = u'Q262234'
            #    # Used this to add suggestions everywhere
            #    #metadata[u'imageurlforce'] = True

            yield metadata
class GlobalVars:
    false_positives = []
    whitelisted_users = []
    blacklisted_users = []
    blacklisted_usernames = []
    blacklisted_websites = []
    bad_keywords = []
    watched_keywords = {}
    ignored_posts = []
    auto_ignored_posts = []
    startup_utc_date = datetime.utcnow()
    startup_utc = startup_utc_date.strftime("%H:%M:%S")
    latest_questions = []
    api_backoff_time = 0
    deletion_watcher = None

    metasmoke_last_ping_time = datetime.now()
    not_privileged_warning = \
        "You are not a privileged user. Please see " \
        "[the privileges wiki page](https://charcoal-se.org/smokey/Privileges) for " \
        "information on what privileges are and what is expected of privileged users."

    experimental_reasons = {  # Don't widely report these
        "potentially bad keyword in answer",
        "potentially bad keyword in body",
        "potentially bad keyword in title",
        "potentially bad keyword in username",
        "potentially bad NS for domain in title",
        "potentially bad NS for domain in body",
        "toxic body detected",
        "toxic answer detected",
    }

    parser = HTMLParser()
    parser.unescape = unescape

    code_privileged_users = None

    censored_committer_names = {"3f4ed0f38df010ce300dba362fa63a62": "Undo1"}

    # GlobalVars.reload()
    commit = None
    commit_with_author = None
    on_master = None

    s = ""
    s_reverted = ""
    s_norestart = ""
    s_norestart2 = ""
    apiquota = -1
    bodyfetcher = None
    se_sites = []
    why_data = []
    notifications = []
    listen_to_these_if_edited = []
    multiple_reporters = []
    api_calls_per_site = {}
    reason_weights = {}

    standby_message = ""
    standby_mode = False

    api_request_lock = threading.Lock()

    num_posts_scanned = 0
    post_scan_time = 0
    posts_scan_stats_lock = threading.Lock()

    config_parser = RawConfigParser()

    if os.path.isfile('config') and "pytest" not in sys.modules:
        config_parser.read('config')
        log('debug', "Configuration loaded from \"config\"")
    else:
        config_parser.read('config.ci')
        if "pytest" in sys.modules and os.path.isfile('config'):  # Another config found while running in pytest
            log('debug', "Running in pytest, force load config from \"config.ci\"")
        else:
            log('debug', "Configuration loaded from \"config.ci\"")

    config = config_parser["Config"]  # It's a collections.OrderedDict now

    # environ_or_none replaced by os.environ.get (essentially dict.get)
    bot_name = os.environ.get("SMOKEDETECTOR_NAME", "SmokeDetector")
    bot_repo_slug = os.environ.get("SMOKEDETECTOR_REPO", "Charcoal-SE/SmokeDetector")
    bot_repository = "//github.com/{}".format(bot_repo_slug)
    chatmessage_prefix = "[{}]({})".format(bot_name, bot_repository)

    site_id_dict = {}
    post_site_id_to_question = {}

    location = config.get("location", "Continuous Integration")

    metasmoke_ws = None
    metasmoke_down = False
    metasmoke_failures = 0  # Consecutive count, not cumulative

    chatexchange_u = config.get("ChatExchangeU")
    chatexchange_p = config.get("ChatExchangeP")

    metasmoke_host = config.get("metasmoke_host")
    metasmoke_key = config.get("metasmoke_key")
    metasmoke_ws_host = config.get("metasmoke_ws_host")

    github_username = config.get("github_username")
    github_password = config.get("github_password")

    perspective_key = config.get("perspective_key")

    flovis_host = config.get("flovis_host")
    flovis = None

    # Miscellaneous
    log_time_format = config.get("log_time_format", "%H:%M:%S")

    valid_content = """This is a totally valid post that should never be caught.
Any blacklist or watchlist item that triggers on this item should be avoided.
java.io.BbbCccDddException: nothing wrong found.
class Safe { perfect valid code(int float &#%$*v a b c =+ /* - 0 1 2 3 456789.EFGQ}
English 中文Français Español Português Italiano Deustch
~@#%*-_/'()?!:;" vvv kkk www sss ttt mmm absolute std::adjacent_find (power).each do |s| bbb end
ert zal l gsopsq kdowhs@ xjwk* %_sooqmzb xjwpqpxnf."""  # noqa: E501

    @staticmethod
    def reload():
        commit = git_commit_info()
        censored_committer_names = GlobalVars.censored_committer_names
        if md5(commit['author'][0].encode('utf-8')).hexdigest() in censored_committer_names:
            commit['author'] = censored_committer_names[md5(commit['author'][0].encode('utf-8')).hexdigest()]
        GlobalVars.commit = commit
        GlobalVars.commit_with_author = "`{}` ({}: {})".format(
            commit['id'],
            commit['author'][0] if type(commit['author']) in {list, tuple} else commit['author'],
            commit['message'])
        GlobalVars.on_master = git_ref_q()
        GlobalVars.s = "[ {} ] SmokeDetector started at [rev {}]({}/commit/{}) (running on {}, Python {})".format(
            GlobalVars.chatmessage_prefix, GlobalVars.commit_with_author, GlobalVars.bot_repository,
            GlobalVars.commit['id'], GlobalVars.location, platform.python_version())
        GlobalVars.s_reverted = \
            "[ {} ] SmokeDetector started in [reverted mode](" \
            "https://charcoal-se.org/smokey/SmokeDetector-Statuses#reverted-mode) " \
            "at [rev {}]({}/commit/{}) (running on {})".format(
                GlobalVars.chatmessage_prefix, GlobalVars.commit_with_author, GlobalVars.bot_repository,
                GlobalVars.commit['id'], GlobalVars.location)
        GlobalVars.s_norestart = "[ {} ] Blacklists reloaded at [rev {}]({}/commit/{}) (running on {})".format(
            GlobalVars.chatmessage_prefix, GlobalVars.commit_with_author, GlobalVars.bot_repository,
            GlobalVars.commit['id'], GlobalVars.location)
        GlobalVars.s_norestart2 = "[ {} ] FindSpam module reloaded at [rev {}]({}/commit/{}) (running on {})".format(
            GlobalVars.chatmessage_prefix, GlobalVars.commit_with_author, GlobalVars.bot_repository,
            GlobalVars.commit['id'], GlobalVars.location)
        GlobalVars.standby_message = \
            "[ {} ] SmokeDetector started in [standby mode](" \
            "https://charcoal-se.org/smokey/SmokeDetector-Statuses#standby-mode) " \
            "at [rev {}]({}/commit/{}) (running on {})".format(
                GlobalVars.chatmessage_prefix, GlobalVars.commit_with_author, GlobalVars.bot_repository,
                GlobalVars.commit['id'], GlobalVars.location)
        log('debug', "GlobalVars loaded")
class GlobalVars:
    false_positives = []
    whitelisted_users = []
    blacklisted_users = []
    ignored_posts = []
    auto_ignored_posts = []
    startup_utc = datetime.utcnow().strftime("%H:%M:%S")
    latest_questions = []
    api_backoff_time = 0

    charcoal_room_id = "11540"
    meta_tavern_room_id = "89"
    socvr_room_id = "41570"

    blockedTime = {"all": 0, charcoal_room_id: 0, meta_tavern_room_id: 0, socvr_room_id: 0}
    metasmoke_last_ping_time = datetime.now()
    not_privileged_warning = """
    You are not a privileged user. Please see
    [the privileges wiki page](https://charcoal-se.org/smokey/Privileges) for
    information on what privileges are and what is expected of privileged users.
    """.strip().replace("\n", " ")

    experimental_reasons = [  # Don't widely report these
        "potentially bad keyword in answer",
        "potentially bad keyword in body",
        "potentially bad keyword in title",
        "potentially bad keyword in username"
    ]
    non_socvr_reasons = []    # Don't report to SOCVR
    non_tavern_reasons = [    # Don't report in the Tavern
        "all-caps body",
        "all-caps answer",
        "repeating characters in body",
        "repeating characters in title",
        "repeating characters in answer",
        "few unique characters in body",
        "few unique characters in answer",
        "title has only one unique char",
        "phone number detected in title",
        "offensive body detected",
        "no whitespace in body",
        "no whitespace in answer",
    ]
    non_tavern_sites = ["stackoverflow.com"]

    parser = HTMLParser()
    wrap = Client("stackexchange.com")
    wrapm = Client("meta.stackexchange.com")
    wrapso = Client("stackoverflow.com")

    privileged_users = {
        charcoal_room_id: [
            "117490",  # Normal Human
            "66258",   # Andy
            "31768",   # ManishEarth
            "103081",  # hichris123
            "73046",   # Undo
            "88521",   # ProgramFOX
            "59776",   # Doorknob
            "31465",   # Seth
            "88577",   # Santa Claus
            "34124",   # Andrew Leach
            "54229",   # apnorton
            "20459",   # S.L. Barth
            "32436",   # tchrist
            "30477",   # Brock Adams
            "58529",   # ferrybig
            "145208",  # Robert Longson
            "178825",  # Ms Yvette
            "171800",  # JAL
            "64978",   # PeterJ
            "125141",  # Jeffrey Bosboom
            "54902",   # bummi
            "135450",  # M.A.R.
            "145604",  # Quill
            "60548",   # rene
            "121401",  # michaelpri
            "116218",  # JamesENL
            "82927",   # Braiam
            "11606",   # bwDraco
            "19761",   # Ilmari Karonen
            "108271",  # Andrew T.
            "171054",  # Magisch
            "190011",  # Petter Friberg
            "165661",  # Tunaki
            "145086",  # Wai Ha Lee
            "137665",  # ByteCommander
            "147884",  # wythagoras
            "186395",  # Åna
            "181293",  # Ashish Ahuja
            "163686",  # Gothdo
            "145827",  # angussidney
            "244748",  # Supreme Leader SnokeDetector (angussidney's sock)
            "121520",  # ArtOfCode
            "244382",  # Lt. A. Code (ArtOfCode's sock to test things with)
            "137388",  # QPaysTaxes
            "212311",  # Ryan Bemrose
            "172397",  # Kyll
            "224538",  # FrankerZ
            "61202",   # OldSkool
            "56166",   # Jan Dvorak
            "133966",  # DavidPostill
            "22839",   # djsmiley2k
            "97389",   # Kaz Wolfe
            "144962",  # DJMcMayhem
            "139423",  # NobodyNada
            "62118",   # tripleee
            "130558",  # Registered User
            "128113",  # arda
            "164318",  # Glorfindel
            "175347",  # Floern
            "180274",  # Alexander O'Mara
            "158742",  # Rob
            "207356",  # 4castle
            "133031",  # Mithrandir
            "215671",  # Locutus of Borg (Mithrandir's Sock)
            "169713",  # Mego
            "126657",  # Cerbrus
            "10145",   # Thomas Ward
            "161943",  # J F
            "195967",  # CaffeineAddiction
            "5363",    # Stijn
            "248139",  # FelixSFD
            "156721",  # D-side
            "167070",  # quartata
            "172450",  # Hovercraft Full Of Eels
            "56200",   # Eric Leschinski
            "211021",  # Henders
            "255290",  # Gypsy Spellweaver
            "64521",   # CalvT
            "165474",  # Hyper Neutrino
            "281362",  # Hyper Neutrino v2
            "169252",  # Cai
            "155243",  # Nisse Engström
            "69330",   # Sconibulus
            "164187",  # Okx
            "202619",  # John Militer
        ],
        meta_tavern_room_id: [
            "315433",  # Normal Human
            "244519",  # CRABOLO
            "244382",  # TGMCians
            "194047",  # Jan Dvorak
            "158100",  # rene
            "178438",  # Manishearth
            "237685",  # hichris123
            "215468",  # Undo
            "229438",  # ProgramFOX
            "180276",  # Doorknob
            "161974",  # Lynn Crumbling
            "186281",  # Andy
            "266094",  # Unihedro
            "245167",  # Infinite Recursion
            "230261",  # Jason C
            "213575",  # Braiam
            "241919",  # Andrew T.
            "203389",  # backwards-Seth
            "202832",  # Mooseman
            "160017",  # bwDraco
            "201151",  # bummi
            "188558",  # Frank
            "229166",  # Santa Claus
            "159034",  # Kevin Brown
            "203972",  # PeterJ
            "188673",  # Alexis King
            "258672",  # AstroCB
            "227577",  # Sam
            "255735",  # cybermonkey
            "279182",  # Ixrec
            "271104",  # James
            "220428",  # Qantas 94 Heavy
            "153355",  # tchrist
            "238426",  # Ed Cottrell
            "166899",  # Second Rikudo
            "287999",  # ASCIIThenANSI
            "208518",  # JNat
            "284141",  # michaelpri
            "260312",  # vaultah
            "244062",  # SouravGhosh
            "152859",  # Shadow Wizard
            "201314",  # apnorton
            "280934",  # M.A.Ramezani
            "200235",  # durron597
            "148310",  # Awesome Poodles / Brock Adams
            "168333",  # S.L. Barth
            "257207",  # Unikitty
            "244282",  # DroidDev
            "163250",  # Cupcake
            "298265",  # BoomsPlus
            "253560",  # josilber
            "244254",  # misterManSam
            "188189",  # Robert Longson
            "174699",  # Ilmari Karonen
            "202362",  # chmod 666 telkitty
            "289717",  # Quill
            "237813",  # bjb568
            "311345",  # Simon Klaver
            "171881",  # rekire
            "260388",  # Pandya
            "310756",  # Ms Yvette
            "262399",  # Jeffrey Bosboom
            "242209",  # JAL
            "280883",  # ByteCommander
            "302251",  # kos
            "262823",  # ArtOfCode
            "215067",  # Ferrybig
            "308386",  # Magisch
            "285368",  # angussidney
            "158829",  # Thomas Ward
            "294691",  # Mithrandir
            "203553",  # CalvT
            "289971"   # Hyper Neutrino
        ],
        socvr_room_id: [
            "1849664",  # Undo
            "2581872",  # hichris123
            "1198729",  # Manishearth
            "3717023",  # Normal Human aka 1999
            "2619912",  # ProgramFOX
            "578411",   # rene
            "1043380",  # gunr2171
            "2246344",  # Sam
            "2756409",  # TylerH
            "1768232",  # durron597
            "359284",   # Kevin Brown
            "258400",   # easwee
            "3622940",  # Unihedron
            "3204551",  # Deduplicator
            "4342498",  # NathanOliver
            "4639281",  # Tiny Giant
            "3093387",  # josilber
            "1652962",  # cimmanon
            "1677912",  # Mogsdad
            "656243",   # Lynn Crumbling
            "3933332",  # Rizier123
            "2422013",  # cybermonkey
            "3478852",  # Nisse Engström
            "2302862",  # Siguza
            "1324",     # Paul Roub
            "1743880",  # Tunaki
            "1663001",  # DavidG
            "2415822",  # JAL
            "4174897",  # Kyll
            "5299236",  # Kevin Guan
            "4050842",  # Thaillie
            "1816093",  # Drew
            "874188",   # Triplee
            "880772",   # approxiblue
            "1835379",  # Cerbrus
            "3956566",  # JamesENL
            "2357233",  # Ms Yvette
            "3155639",  # AlexanderOMara
            "462627",   # Praveen Kumar
            "4490559",  # intboolstring
            "1364007",  # Wai Ha Lee
            "1699210",  # bummi
            "563532",   # Rob
            "5389107",  # Magisch
            "4099593",  # bhargav-rao
            "1542723",  # Ferrybig
            "2025923",  # Tushar
            "5292302",  # Petter Friberg
            "792066",   # Braiam
            "5666987",  # Ian
            "3160466",  # ArtOfCode
            "4688119",  # Ashish Ahuja
            "3476191",  # Nobody Nada
            "2227743",  # Eric D
            "821878",   # Ryan Bemrose
            "1413395",  # Panta Rei
            "4875631",  # FrankerZ
            "2958086",  # Compass
            "499214",   # JanDvorak
            "5647260",  # Andrew L.
            "559745",   # Floern
            "5743988",  # 4castle
            "4622463",  # angussidney
            "603346",   # Thomas Ward
            "3002139",  # Baum mit Augen
            "1863564",  # QPaysTaxes
            "4687348",  # FelixSFD
            "4751173",  # Glorfindel
            "2233391",  # henders
            "4805174",  # kayess
            "2370483",  # Machavity
            "1873567",  # CalvT
            "4826457"   # suraj
        ],
        '111347': [
            "3160466",  # ArtOfCode
            "1849664",  # Undo
            "3002139",  # Baum mit Augen
            "3476191",  # Nobody Nada
            "5292302",  # Petter Friberg
            "4688119",  # Ashish Ahuja
            "4099593",  # Bhargav Rao
            "1743880",  # Tunaki
            "559745",   # Floern
            "4687348"   # FelixSFD
        ]
    }

    code_privileged_users = None

    smokeDetector_user_id = {
        charcoal_room_id: "120914",
        meta_tavern_room_id: "266345",
        socvr_room_id: "3735529",
        '111347': '3735529'
    }

    censored_committer_names = {"3f4ed0f38df010ce300dba362fa63a62": "Undo1"}

    commit = git_commit_info()
    if md5(commit['author'][0].encode('utf-8')).hexdigest() in censored_committer_names:
        commit['author'] = censored_committer_names[md5(commit['author'][0].encode('utf-8')).hexdigest()]

    commit_with_author = "%s (%s: *%s*)" % (
        commit['id'],
        commit['author'][0] if type(commit['author']) in [list, tuple] else commit['author'],
        commit['message'])

    on_master = "HEAD detached" not in git_status()

    charcoal_hq = None
    tavern_on_the_meta = None
    socvr = None

    s = ""
    s_reverted = ""
    specialrooms = []
    apiquota = -1
    bodyfetcher = None
    se_sites = []

    users_chatting = {
        meta_tavern_room_id: [],
        charcoal_room_id: [],
        socvr_room_id: [],
        '111347': []
    }

    why_data = []
    why_data_allspam = []
    notifications = []
    listen_to_these_if_edited = []
    multiple_reporters = []
    api_calls_per_site = {}

    standby_message = ""
    standby_mode = False

    api_request_lock = threading.Lock()

    num_posts_scanned = 0
    post_scan_time = 0
    posts_scan_stats_lock = threading.Lock()

    config = RawConfigParser()

    if os.path.isfile('config'):
        config.read('config')
    else:
        config.read('config.ci')

    latest_smokedetector_messages = {
        meta_tavern_room_id: [],
        charcoal_room_id: [],
        socvr_room_id: [],
        '111347': []
    }

    # environ_or_none defined in helpers.py
    bot_name = environ_or_none("SMOKEDETECTOR_NAME") or "SmokeDetector"
    bot_repository = environ_or_none("SMOKEDETECTOR_REPO") or "//github.com/Charcoal-SE/SmokeDetector"
    chatmessage_prefix = "[{}]({})".format(bot_name, bot_repository)

    site_id_dict = {}
    post_site_id_to_question = {}

    location = config.get("Config", "location")

    metasmoke_ws = None

    try:
        metasmoke_host = config.get("Config", "metasmoke_host")
    except NoOptionError:
        metasmoke_host = None
        log('info', "metasmoke host not found. Set it as metasmoke_host in the config file. "
                    "See https://github.com/Charcoal-SE/metasmoke.")

    try:
        metasmoke_key = config.get("Config", "metasmoke_key")
    except NoOptionError:
        metasmoke_key = ""
        log('info', "No metasmoke key found, which is okay if both are running on the same host")

    try:
        metasmoke_ws_host = config.get("Config", "metasmoke_ws_host")
    except NoOptionError:
        metasmoke_ws_host = ""
        log('info', "No metasmoke websocket host found, which is okay if you're anti-websocket")

    try:
        github_username = config.get("Config", "github_username")
        github_password = config.get("Config", "github_password")
    except NoOptionError:
        github_username = None
        github_password = None
# Third party imports
import pytest
import webtest
from html.parser import HTMLParser  # Python 2: from HTMLParser import HTMLParser

# Local Imports
from pypiserver import __main__, bottle
import tests.test_core as test_core

# Enable logging to detect any problems with it
## __main__.init_logging(level=logging.NOTSET)

hp = HTMLParser()


@pytest.fixture()
def _app(app):
    return app.module


@pytest.fixture
def app(tmpdir):
    from pypiserver import app
    return app(root=tmpdir.strpath, authenticated=[])


@pytest.fixture
def testapp(app):
    # The original body was truncated here; wrapping the app in webtest.TestApp
    # is the conventional completion so tests can issue requests against it.
    return webtest.TestApp(app)
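
# A usage sketch (assumed, not part of the original suite): with the fixtures
# above, webtest can drive the WSGI app directly. The route and assertion here
# are illustrative.
def test_root_is_reachable(testapp):
    # GET the index page of the freshly created, empty package root
    resp = testapp.get("/")
    assert resp.status_int == 200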
import random

import requests
from django.shortcuts import redirect, render
from html.parser import HTMLParser  # Python 2: from HTMLParser import HTMLParser

# Assumed import path; QuestionAnswered may live elsewhere in this project
from .models import QuestionAnswered


def gamepage(request):
    if request.method == 'GET':
        response = requests.get('https://opentdb.com/api.php?amount=1&type=multiple')
        data = response.json()
        for item in data['results']:  # get to the right list
            question = item['question']
            request.session['correct_answer'] = item['correct_answer']
            request.session['incorrect_answers'] = item['incorrect_answers']
            # request.session allows data from two different dictionaries to persist
            # through the requests and allows them to be combined into one list
        h = HTMLParser()
        question = h.unescape(question)
        print(question)
        correct = request.session['correct_answer']  # reference key for the correct answer
        choices = request.session['incorrect_answers']  # reference key for the incorrect answers
        answers = []
        for choice in choices:
            answers.append(choice)  # this adds the wrong answers to the list "answers"
        answers.append(correct)  # this adds the right answer to the list "answers"
        random.shuffle(answers)  # this randomizes all the answers in the list "answers"
        # unescape() takes a string, so decode each answer individually
        # (calling it on the whole list would fail)
        answers = [h.unescape(answer) for answer in answers]
        print(answers)
        print(correct)
        context = {
            'trivia': question,
            'answer0': answers[0],
            'answer1': answers[1],
            'answer2': answers[2],
            'answer3': answers[3],
            'correct': correct,
        }
    else:
        correct = request.session['correct_answer']  # reference key for the correct answer
        choices = request.session['incorrect_answers']  # reference key for the incorrect answers
        answers = []
        for choice in choices:
            answers.append(choice)  # this adds the wrong answers to the list "answers"
        answers.append(correct)  # this adds the right answer to the list "answers"
        random.shuffle(answers)  # this randomizes all the answers in the list "answers"
        h = HTMLParser()
        answers = [h.unescape(answer) for answer in answers]
        context = {
            'answer0': answers[0],
            'answer1': answers[1],
            'answer2': answers[2],
            'answer3': answers[3],
            'correct': correct,
            'number_already_answered': QuestionAnswered.objects.filter().count(),
        }
        print(answers)
        print(correct)
        if 'a0' in request.POST:
            print("checked a0")
            answerzero = request.POST['a0']
            print("---", answerzero, "---")
            if answerzero == correct:
                print("---Correct!---")
                return redirect('correct/')
            else:
                print("---Picked 0, wrong answer---")
                return redirect('incorrect/')
        if 'a1' in request.POST:
            print("checked a1")
            answerone = request.POST['a1']
            print("---", answerone, "---")
            if answerone == correct:
                print("---Correct!---")
                return redirect('correct/')
            else:
                print("---Picked 1, wrong answer---")
                return redirect('incorrect/')
        if 'a2' in request.POST:
            print("checked a2")
            answertwo = request.POST['a2']
            print("---", answertwo, "---")
            if answertwo == correct:
                print("---Correct!---")
                return redirect('correct/')
            else:
                print("---Picked 2, wrong answer---")
                return redirect('incorrect/')
        if 'a3' in request.POST:
            print("checked a3")
            answerthree = request.POST['a3']
            print("---", answerthree, "---")
            if answerthree == correct:
                print("---Correct!---")
                return redirect('correct/')
            else:
                print("---Picked 3, wrong answer---")
                return redirect('incorrect/')
    return render(request, 'game.html', context)
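
# The four 'a0'..'a3' branches above differ only in the POST key they read. A
# hedged refactoring sketch, assuming the same template field names, that is
# intended to behave identically:
def check_answer(request, correct):
    # Return a redirect for the first submitted answer, or None if none was posted
    for index, key in enumerate(('a0', 'a1', 'a2', 'a3')):
        if key in request.POST:
            picked = request.POST[key]
            if picked == correct:
                return redirect('correct/')
            print("---Picked %d, wrong answer---" % index)
            return redirect('incorrect/')
    return None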
def html_unescape(content):
    html_parser = HTMLParser()
    html = html_parser.unescape(content)
    return html
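
# A quick usage sketch of the helper above: HTML entity references are decoded
# back to their literal characters.
#
#   html_unescape(u'Tom &amp; Jerry at the caf&eacute;')
#   # -> u'Tom & Jerry at the café'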
def getThyssenGenerator():
    """
    Generator to return Thyssen paintings
    """
    # 1 - 1000 ?
    baseUrl = u'http://www.museothyssen.org/en/thyssen/ficha_obra/%s'
    htmlparser = HTMLParser()
    for i in range(1, 1500):
        url = baseUrl % (i,)
        print(url)
        metadata = {}
        metadata['collectionqid'] = u'Q176251'
        metadata['collectionshort'] = u'Thyssen-Bornemisza'
        metadata['locationqid'] = u'Q176251'
        metadata['instanceofqid'] = u'Q3305213'
        metadata['idpid'] = u'P217'
        metadata['url'] = url
        metadata['url_en'] = url
        metadata['url_es'] = u'http://www.museothyssen.org/thyssen/ficha_obra/%s' % (i,)

        itemPageEn = requests.get(metadata['url_en'])
        itemPageEs = requests.get(metadata['url_es'])
        itemPageEn.encoding = 'utf-8'
        itemPageEs.encoding = 'utf-8'
        itemPageEnData = itemPageEn.text
        #print itemPageEn.encoding
        #itemPageEnDataCleaned = re.sub("(<!--.*?-->)", "", itemPageEn.text, flags=re.DOTALL) # Strip out comment junk
        #pywikibot.showDiff(itemPageEnData, itemPageEnDataCleaned)
        #pywikibot.output(itemPageEnDataCleaned)
        itemPageEsData = itemPageEs.text
        if len(itemPageEn.text) < 100:
            # That's not a valid page
            continue

        regexes = {}
        regexes['creatorname'] = u'<dt>Autor:</dt>[\r\n\s]+<dd>[\r\n\s]+<a href="[^"]+" title="[^"]+">[\r\n\s]+<span>([^<]+)</span></a>[\r\n\s]+</dd>'
        regexes['title'] = u'tulo:</dt>[\r\n\s]+<dd class="dd_titulo"><em>([^<]+)<'  # Also possible to have <BR />/em></dd>
        regexes['date'] = u'<dt>Fecha:</dt>[\r\n\s]+<dd class="dd_fecha">([^<]+\d+[^<]+)</dd>'
        # Medium doesn't work
        #regexes['medium'] = u'<dt>T.?cnica:'#</dt>[\r\n\s]+'#<dd class="dd_tecnica">([^<]+)</dd>'
        #regexes['medium'] = u'cnica:</dt>[\r\n\s]+<dd class="dd_tecnica">([^<]+)</dd>'
        regexes['size'] = u'<dt>Medidas:</dt>[\r\n\s]+<dd class="dd_medidas">[\r\n\s]+(.+)x(.+)cm[\r\n\s]+</dd>'
        regexes['id'] = u'<dt>Numero de inventario</dt>[\r\n\s]+<dd><abbr title="INV. Nr.">INV. Nr.</abbr>([^<]+)</dd>'

        matches = {}
        matches['creatorname'] = re.search(regexes['creatorname'], itemPageEnData)
        metadata['creatorname'] = matches['creatorname'].group(1).strip()
        metadata['description'] = {
            u'nl': u'%s van %s' % (u'schilderij', metadata['creatorname'],),
            u'en': u'%s by %s' % (u'painting', metadata['creatorname'],),
        }
        matches['titleen'] = re.search(regexes['title'], itemPageEnData)
        matches['titlees'] = re.search(regexes['title'], itemPageEsData)
        metadata['title'] = {
            u'en': htmlparser.unescape(matches['titleen'].group(1).strip()),
            u'es': htmlparser.unescape(matches['titlees'].group(1).strip()),
        }
        matches['date'] = re.search(regexes['date'], itemPageEnData)
        if matches['date']:
            metadata['date'] = matches['date'].group(1).strip()
        #matches['medium']=re.search(regexes['medium'], itemPageEnData)
        #metadata['medium']=matches['medium'].group(1).strip()
        # Ignore size for now. Needs two fields anyway
        #matches['size']=re.search(regexes['size'], itemPageEnData)
        #metadata['size']=matches['size'].group(1)
        matches['id'] = re.search(regexes['id'], itemPageEnData)
        metadata['id'] = matches['id'].group(1).strip()
        # Crude way to filter out the non-paintings
        if not metadata['id'].startswith(u'(CTB.DEC'):
            yield metadata
    '''
    for field, regex in regexes.iteritems():
        matches[field] = re.search(regex, itemPageEnData)
        print field
        #print regex
        if matches[field]:
            print matches[field].group(1)
        else:
            print u'No match found'
    #print itemPageEnData

    headerRegex = u'<header>[\r\n\s]+<h3>([^<]*)</h3>[\r\n\s]+<h1>([^<]*)</h1>[\r\n\s]+<p>([^<]*)</p>[\r\n\s]+</header>'
    matchEn = re.search(headerRegex, itemPageEnData)
    if not matchEn:
        pywikibot.output(u'The data for this painting is BORKED!')
        continue
    matchRu = re.search(headerRegex, itemPageRuData)

    metadata['title'] = {
        u'en' : htmlparser.unescape(matchEn.group(2)),
        u'ru' : htmlparser.unescape(matchRu.group(2)),
    }
    #pywikibot.output(metadata.get('title'))

    painterName = matchEn.group(1)

    painterRegexes = [u'([^,]+),\s([^\.]+)\.(.+)',
                      u'([^,]+),\s([^,]+),(.+)',
                      ]
    for painterRegex in painterRegexes:
        painterMatch = re.match(painterRegex, painterName)
        if painterMatch:
            painterName = '%s %s' % (painterMatch.group(2), painterMatch.group(1),)
            break  # stop after the first matching pattern; 'continue' would keep re-mangling the name
    metadata['creatorname'] = painterName

    metadata['description'] = {
        u'nl' : u'%s van %s' % (u'schilderij', painterName,),
        u'en' : u'%s by %s' % (u'painting', painterName,),
    }
    #pywikibot.output(metadata.get('description'))

    invRegex = u'<p>[\r\n\s]+Inventory Number:[\r\n\s]+</p>[\r\n\s]+</div>[\r\n\s]+<div class="her-data-tbl-val">[\r\n\s]+<p>[\r\n\s]+(.*\d+)[\r\n\s]+</p>'
    invMatch = re.search(invRegex, itemPageEnData)

    if not invMatch:
        pywikibot.output(u'No inventory number found! Skipping')
        continue
    metadata['id'] = invMatch.group(1)

    dateDimRegex = u'var descriptionWoA = \'.*Date of creation: (.+), Dimension: ([^\s]+)x([^\s]+)\s?[sc]m\.?\';'
    dateDimMatch = re.search(dateDimRegex, itemPageEnData)
    if dateDimMatch:
        metadata['inception'] = dateDimMatch.group(1)
        metadata['height'] = dateDimMatch.group(2)
        metadata['heightunitqid'] = u'Q174728'
        metadata['width'] = dateDimMatch.group(3)  # width is the third capture group, not the second
        metadata['widthunitqid'] = u'Q174728'

    yield metadata
    #print matchEn.group(1)
    #print matchEn.group(2)
    #print matchEn.group(3)
    '''
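
# A hedged sketch of how a generator like the one above might be consumed. The
# surrounding bot framework is not shown in this file; the function name and
# the printed fields are illustrative only.
def dump_thyssen_paintings():
    for painting in getThyssenGenerator():
        # Every yielded dict carries at least 'id' and 'url' per the code above
        print('%(id)s: %(url)s' % painting)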