import re
import time

import requests
import pywikibot
from html.parser import HTMLParser


def getNTGenerator():
    """
    Generator to return National Trust paintings.

    Search has a max of 250 pages, so that's 5*5*250=6250 of the 12,472
    paintings. So need to try the different ways to get all of them.
    """
    htmlparser = HTMLParser()
    # nationalTrustLocationsOnWikidata() is defined elsewhere in this module
    locations = nationalTrustLocationsOnWikidata()
    missedlocations = {}
    baseSearchUrl = u'http://www.nationaltrustcollections.org.uk/results?Categories=7456ee20fffffe0702132e04e5764fd3&Sort=collection&Page=%s'

    for i in range(1, 250):
        print(missedlocations)
        searchUrl = baseSearchUrl % (i,)
        print(searchUrl)
        searchPage = requests.get(searchUrl)
        searchPageData = searchPage.text

        searchRegex = u'\<a href\=\"\/object\/([\d+\.]+)\"\>'
        for match in re.finditer(searchRegex, searchPageData):
            url = u'http://www.nationaltrustcollections.org.uk/object/%s' % (match.group(1),)
            print(url)
            itemPage = requests.get(url)
            itemPageData = itemPage.text

            metadata = {}
            metadata['url'] = url
            metadata['collectionqid'] = u'Q333515'
            metadata['collectionshort'] = u'NT'

            locationregex = u'\<h4\>Collection\<\/h4\>[\r\n\t\s]*\<p\>([^\<]+)\<\/p\>[\r\n\t\s]*\<h4\>On show at\<\/h4\>[\r\n\t\s]*\<p\>\<a href\=\"https?\:\/\/www\.nationaltrust\.org\.uk\/([^\"]+)\"'
            locationMatch = re.search(locationregex, itemPageData)

            location2regex = u'\<h4\>Collection\<\/h4\>[\r\n\t\s]*\<p\>([^\<]+)\<\/p\>[\r\n\t\s]*\<h4\>On show at\<\/h4\>'
            location2Match = re.search(location2regex, itemPageData)

            if locationMatch:
                #print (locationMatch.group(1))
                #print (locationMatch.group(2))
                location = locationMatch.group(2).strip(u'/').lower()
                if location in locations:
                    metadata['locationqid'] = locations.get(location)
                else:
                    if location not in missedlocations:
                        missedlocations[location] = 0
                    missedlocations[location] += 1
                    metadata['locationqid'] = locations.get(location)
            elif location2Match:
                print(location2Match.group(1))
                location = location2Match.group(1).split(u',')[0].lower().replace(u' ', u'-')
                print(location)
                if location in locations:
                    print(u'Location found')
                    metadata['locationqid'] = locations.get(location)
                else:
                    if location not in missedlocations:
                        missedlocations[location] = 0
                    missedlocations[location] += 1
                    metadata['locationqid'] = locations.get(location)

            # Search is for paintings
            metadata['instanceofqid'] = u'Q3305213'

            metadata['idpid'] = u'P217'
            metadata['id'] = u'%s' % (match.group(1),)
            metadata['artworkidpid'] = u'P4373'
            metadata['artworkid'] = u'%s' % (match.group(1),)

            titleRegex = u'\<h2 class\=\"section-title\"\>([^\<]+)\<\/h2\>'
            titleMatch = re.search(titleRegex, itemPageData)
            if titleMatch:
                title = htmlparser.unescape(titleMatch.group(1)).strip()
            else:
                # Sometimes nothing is returned. Just sleep and continue with the next one
                pywikibot.output(u'No title found, probably something went wrong. Sleeping and skipping')
                time.sleep(60)
                continue
                #title = u'(without title)'

            if len(title) > 220:
                title = title[0:200]
            metadata['title'] = { u'en': title, }

            artistRegex = u'\<h3 class\=\"section-subtitle\"\>([^\<]+)\<\/h3\>'
            artistMatch = re.search(artistRegex, itemPageData)
            artistCleanupRegex = u'^(.+)\(([^\)]+)\)$'

            if artistMatch:
                dirtyname = htmlparser.unescape(artistMatch.group(1)).strip()
            else:
                dirtyname = u'anonymous'

            artistCleanupMatch = re.match(artistCleanupRegex, dirtyname)
            if artistCleanupMatch:
                name = artistCleanupMatch.group(1).strip()
            else:
                name = dirtyname.strip()

            metadata['creatorname'] = name
            metadata['description'] = { u'nl': u'%s van %s' % (u'schilderij', metadata.get('creatorname'),),
                                        u'en': u'%s by %s' % (u'painting', metadata.get('creatorname'),),
                                        u'de': u'%s von %s' % (u'Gemälde', metadata.get('creatorname'),),
                                        }

            # Only match on years
            dateRegex = u'\<h4\>Date\<\/h4\>[\r\n\t\s]*\<p\>\s*(\d\d\d\d)\s*(\(signed and dated\))?\<\/p\>'
            circadateRegex = u'\<h4\>Date\<\/h4\>[\r\n\t\s]*\<p\>\s*circa (\d\d\d\d)\s*\<\/p\>'
            perioddateRegex = u'\<h4\>Date\<\/h4\>[\r\n\t\s]*\<p\>\s*(\d\d\d\d)\s*-\s*(\d\d\d\d)\s*\<\/p\>'

            dateMatch = re.search(dateRegex, itemPageData)
            circadateMatch = re.search(circadateRegex, itemPageData)
            perioddateMatch = re.search(perioddateRegex, itemPageData)

            if dateMatch:
                metadata['inception'] = htmlparser.unescape(dateMatch.group(1))
            elif circadateMatch:
                metadata['inception'] = htmlparser.unescape(circadateMatch.group(1))
                metadata['inceptioncirca'] = True
            elif perioddateMatch:
                metadata['inceptionstart'] = int(perioddateMatch.group(1))
                metadata['inceptionend'] = int(perioddateMatch.group(2))

            # acquisitiondate not available
            # acquisitiondateRegex = u'\<em\>Acknowledgement\<\/em\>\:\s*.+(\d\d\d\d)[\r\n\t\s]*\<br\>'
            #acquisitiondateMatch = re.search(acquisitiondateRegex, itemPageData)
            #if acquisitiondateMatch:
            #    metadata['acquisitiondate'] = acquisitiondateMatch.group(1)

            mediumRegex = u'\<h4\>Materials\<\/h4\>[\r\n\t\s]*\<p\>Oil on canvas\<\/p\>'
            mediumMatch = re.search(mediumRegex, itemPageData)
            if mediumMatch:
                metadata['medium'] = u'oil on canvas'

            dimensionRegex = u'\<h4\>Measurements\<\/h4\>[\r\n\t\s]*\<p\>([^\<]+)\<\/p\>'
            dimensionMatch = re.search(dimensionRegex, itemPageData)
            if dimensionMatch:
                dimensiontext = dimensionMatch.group(1).strip()
                regex_2d = u'^(?P<height>\d+)\s*(x|×)\s*(?P<width>\d+)\s*mm'
                regex_3d = u'^(?P<height>\d+)\s*(x|×)\s*(?P<width>\d+)\s*(x|×)\s*(?P<depth>\d+)\s*mm'
                match_2d = re.match(regex_2d, dimensiontext)
                match_3d = re.match(regex_3d, dimensiontext)
                if match_2d:
                    metadata['heightcm'] = u'%s' % (float(match_2d.group(u'height')) / 10,)
                    metadata['widthcm'] = u'%s' % (float(match_2d.group(u'width')) / 10,)
                if match_3d:
                    metadata['heightcm'] = u'%s' % (float(match_3d.group(u'height')) / 10,)
                    metadata['widthcm'] = u'%s' % (float(match_3d.group(u'width')) / 10,)
                    metadata['depthcm'] = u'%s' % (float(match_3d.group(u'depth')) / 10,)

            # Image use policy unclear
            #imageMatch = re.search(imageregex, itemPageData)
            #if imageMatch:
            #    metadata[u'imageurl'] = imageMatch.group(1)
            #    metadata[u'imageurlformat'] = u'Q2195'  # JPEG

            yield metadata

    pywikibot.output(u'Final list of missed locations')
    pywikibot.output(missedlocations)
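# A minimal driver sketch for the generator above, assuming the module-level
# imports and the nationalTrustLocationsOnWikidata() helper are in place
# (the loop makes live HTTP requests, so this is illustrative only):
#
#     if __name__ == '__main__':
#         for painting in getNTGenerator():
#             print(painting.get('id'), painting.get('url'))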
import sys
import re

PY3 = sys.version_info >= (3, 0)
PY34 = sys.version_info >= (3, 4)

if PY3:
    ustr = str  # noqa
    uchr = chr  # noqa
    from urllib.request import pathname2url, url2pathname  # noqa
    from urllib.parse import urlparse, urlunparse, quote  # noqa
    from html.parser import HTMLParser  # noqa
    if PY34:
        import html  # noqa
        html_unescape = html.unescape  # noqa
    else:  # pragma: no cover
        html_unescape = HTMLParser().unescape  # noqa
else:
    ustr = unicode  # noqa
    uchr = unichr  # noqa
    from urllib import pathname2url, url2pathname, quote  # noqa
    from urlparse import urlparse, urlunparse  # noqa
    from HTMLParser import HTMLParser  # noqa
    html_unescape = HTMLParser().unescape  # noqa

RE_WIN_DRIVE_LETTER = re.compile(r"^[A-Za-z]$")
RE_WIN_DRIVE_PATH = re.compile(r"^[A-Za-z]:(?:\\.*)?$")
RE_URL = re.compile('(http|ftp)s?|data|mailto|tel|news')
IS_NARROW = sys.maxunicode == 0xFFFF
RE_WIN_DEFAULT_PROTOCOL = re.compile(r"^///[A-Za-z]:(?:/.*)?$")

if sys.platform.startswith('win'):
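# Example: html_unescape behaves the same on either branch of the shim:
#
#     >>> html_unescape('2 &lt; 3 &amp;&amp; 4 &gt; 1')
#     '2 < 3 && 4 > 1'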
def on_data(self, data):
    all_data = json.loads(HTMLParser().unescape(data))
    # https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object
    # https://gist.github.com/hrp/900964
    if 'text' in all_data:  #1
        tweet = all_data['text']
        tweet = unidecode(tweet)  #2
        tweetID = all_data['id_str']  #3
        source = all_data['source']
        source = unidecode(source)  #4
        if all_data['place']:
            country = all_data['place']['country']
            country = unidecode(country)  #5
            country_code = all_data['place']['country_code']
            country_code = unidecode(country_code)  #6
            full_name = all_data['place']['full_name']
            full_name = unidecode(full_name)  #7
            name = all_data['place']['name']
            name = unidecode(name)  #8
            place_type = all_data['place']['place_type']
            place_type = unidecode(place_type)  #9
        else:
            country = country_code = full_name = name = place_type = "0"

        quote_count = all_data['quote_count']  #10
        reply_count = all_data['reply_count']  #11
        retweet_count = all_data['retweet_count']  #12
        favorite_count = all_data['favorite_count']  #13
        screen_name = all_data['user']['screen_name']
        screen_name = unidecode(screen_name)  #13
        followers_count = all_data['user']['followers_count']  #14
        friends_count = all_data['user']['friends_count']  #15
        verified = all_data['user']['verified']
        #print("verified value is:", verified)
        #type(verified)

        #tweetNoPunctuation = regex.sub('', tweet)
        tweetNoPunctuation = clean_tweet(tweet)

        # we want to make sure while compiling tweets, we do not include the ones that are retweeted
        if not all_data['retweeted'] and not tweet.startswith('RT') and 't.co' not in tweet:
            sentiment_value, confidence = sentiment(tweetNoPunctuation)
            #print(tweet, sentiment_value, confidence)  # print output

            # value manipulations
            if sentiment_value.lower() == "neg":
                num_sentiment = 0
            else:
                num_sentiment = 1

            blob_senti = text_blob_sentiment(tweetNoPunctuation)

            if verified == True:
                verified_bit = 1
            else:
                verified_bit = 0

            found = False
            party = ""
            for word in tweetNoPunctuation.split(" "):
                if word.lower() in party_tags.keys():
                    party_name = party_tags[word.lower()]
                    #print("Found keyword: ", word, " belongs to party: ", party_name)
                    found = True
                    break

            if found:
                created_at = time.strftime('%Y-%m-%d %H:%M:%S')
                newID = (int)(all_data['id'])

                # twitter JSON is being parsed with queries below and using sentiment module, we are assigning confidence values
                # tweetID, party_name, dateTime, tweet, source, country, country_code, full_name, name, place_type,
                # reply_count, retweet_count, favorite_count, result, confidence, num_sentiment
                tweet_data = (tweetID, party_name, created_at, tweet, screen_name, followers_count, friends_count,
                              verified_bit, source, country, country_code, full_name, name, place_type,
                              reply_count, retweet_count, favorite_count, sentiment_value.lower(), confidence, num_sentiment)

                data_to_dump = prepare_tweet_json([tweetID, party_name, created_at, tweet, screen_name, source,
                                                   country, country_code, full_name, place_type,
                                                   sentiment_value.lower(), num_sentiment, confidence,
                                                   followers_count, blob_senti])
                write_to_es(data_to_dump)
                print(data_to_dump)

                # Write a row to the CSV file. I use encode UTF-8
                # csvWriter.writerow([tweetID, party_name, created_at, tweet, screen_name, followers_count, friends_count,
                #                     verified_bit, source, country, country_code, full_name, name, place_type,
                #                     reply_count, retweet_count, favorite_count, sentiment_value.lower(), confidence, num_sentiment])
                # c.execute(add_tweet, tweet_data)
                # conn.commit()
            # else:
            #     print('unrelated tweet found')
        # else:
        #     print('retweeted data found')
    # else:
    #     print('no text field found')

# error handling, since tweepy tends to time out with twitter without any reason, closing the connection from their side
def on_limit(self, track):
    print('Limit hit! Track = %s' % track)
    return True

def on_error(self, status):
    print(status)

def on_disconnect(self, notice):
    print(notice)
    return True
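# Hypothetical wiring for the handlers above, assuming they sit on a
# tweepy.StreamListener subclass (named PartyListener here purely for
# illustration) and the tweepy v3 Stream API; credentials are placeholders:
#
#     auth = tweepy.OAuthHandler('CONSUMER_KEY', 'CONSUMER_SECRET')
#     auth.set_access_token('ACCESS_TOKEN', 'ACCESS_SECRET')
#     stream = tweepy.Stream(auth, PartyListener())
#     stream.filter(track=list(party_tags.keys()))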
def preproc1(comment, steps=range(1, 11), print_help=False):
    '''This function pre-processes a single comment.

    Parameters:
        comment : string, the body of a comment
        steps   : list of ints, each entry in this list corresponds to a preprocessing step

    Returns:
        modComm : string, the modified comment
    '''
    global nlp
    global total_line_split

    comment_after_five = ""
    modComm = ''
    if print_help:
        print("Comment before: " + comment)

    if 1 in steps:
        comment = comment.replace('\n', '')
        comment = re.sub(r'[ ]+', " ", comment)
        if print_help:
            print("Comment after 1: " + comment)

    if 2 in steps:
        remove_html_escape = HTMLParser()
        comment = remove_html_escape.unescape(comment)
        if print_help:
            print("Comment after 2: " + comment)

    if 3 in steps:
        comment = re.sub(r'http\S*', '', comment)
        comment = re.sub(r'www\S*', '', comment)
        if print_help:
            print("Comment after 3: " + comment)

    if 4 in steps:
        comment = re.sub(r'([' + re.escape(punct) + r']+)', r' \1 ', comment)
        comment = re.sub(r'([a-zA-Z] . [a-zA-Z . ]+)', r'\1'.replace(" ", ""), comment)
        comment = re.sub(r'([ ]+)', r' ', comment)
        if print_help:
            print("Comment after 4: " + comment)

    if 5 in steps:
        comment = re.sub(r"([A-Za-z]{1}[']{1}[A-Za-z]{1})", r' \1', comment)
        comment = re.sub(r"([A-Za-z]{1}['] ])", r'\1'.replace("'", "") + " " + "'", comment)
        comment = re.sub(r'[ ]+', ' ', comment)
        comment = comment.strip(" ")
        comment_after_five = comment
        if print_help:
            print("Comment after 5: " + comment)

    if 6 in steps:
        new_comment_temp = ""
        utt = nlp(u"" + comment + "")
        temp_string = ""
        prev_tag = ""
        for token in utt:
            if token.text in punct:
                temp_string = temp_string + token.text
                prev_tag = token.tag_
            else:
                if temp_string != '':
                    new_comment_temp = new_comment_temp + temp_string + "/" + prev_tag + " "
                    prev_tag = ''
                    temp_string = ''
                new_comment_temp = new_comment_temp + token.text + "/" + token.tag_ + " "
        if temp_string != '':
            new_comment_temp = new_comment_temp + temp_string + "/" + prev_tag + " "
        comment = new_comment_temp.strip(" ")
        if print_help:
            print("Comment after 6: " + comment)

    if 7 in steps:
        comment = " " + comment + " "
        comment = re.sub(total_line_split, ' ', comment)
        if print_help:
            print("Comment after 7: " + comment)

    if 8 in steps:
        utt = nlp(u"" + comment_after_five + "")
        for token in utt:
            if token.lemma_[0] == '-' and token.text[0] != '-':
                continue
            else:
                try:
                    comment = re.sub(r'' + re.escape(token.text) + r'', token.lemma_, comment)
                except:
                    pass
        if print_help:
            print("Comment after 8: " + comment)

    if 9 in steps:
        split_comment = comment.split(" ")
        new_comment = ""
        for i in range(len(split_comment)):
            if len(split_comment[i]) == 0:
                continue
            elif i == 0:
                new_comment = new_comment + split_comment[i] + " "
                continue
            elif split_comment[i][0] == '.':
                abbrev_flag = False
                for line in common_abbrev:
                    if abbrev_flag == True:
                        break
                    linesplit = line.split(" ")
                    for abbrev in linesplit:
                        abbrev = abbrev.replace('.', '')
                        abbrev = abbrev.replace('\n', '')
                        if bool(re.search(" " + abbrev + "/", " " + split_comment[i - 1])):
                            abbrev_flag = True
                if abbrev_flag == False:
                    new_comment = new_comment + split_comment[i] + "\n"
                else:
                    new_comment = new_comment + split_comment[i] + " "
            else:
                new_comment = new_comment + split_comment[i] + " "
        comment = new_comment
        if print_help:
            print("Comment after 9: " + comment)

    if 10 in steps:
        comment = comment.lower()
        if print_help:
            print("Comment after 10: " + comment)

    modComm = comment
    return modComm
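# Quick sanity check of the nlp-free steps above (1: newline removal and
# space collapsing, 2: HTML entity unescaping, 3: URL removal); later steps
# need the module's nlp/punct globals, so they are skipped in this sketch:
#
#     >>> preproc1('Hello &amp; goodbye  http://example.com !', steps=[1, 2, 3])
#     'Hello & goodbye  !'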
def getAttribute(cls, node, attr=None):
    if attr:
        attr = node.attrib.get(attr, None)
        if attr:
            attr = HTMLParser().unescape(attr)
    return attr
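# Hypothetical usage, assuming getAttribute() is bound as a classmethod on a
# parser class (shown against a stdlib ElementTree node for illustration):
#
#     import xml.etree.ElementTree as ET
#     node = ET.fromstring('<a title="Fish &amp; Chips"/>')
#     SomeParser.getAttribute(node, 'title')  # -> 'Fish & Chips'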
import os.path
import click
import getpass
import sys
import re
import configparser
import random
from html.parser import HTMLParser
from mastodon import Mastodon
from collections import OrderedDict
from termcolor import colored, cprint

CONF_PATH = os.path.expanduser('~/.config/tootstream/')
CONF_FILE = "tootstream.conf"

html_parser = HTMLParser()

COLORS = ['red', 'green', 'yellow', 'blue', 'magenta', 'cyan', 'white']


def parse_config():
    if not os.path.exists(CONF_PATH):
        os.makedirs(CONF_PATH)
    filename = CONF_PATH + CONF_FILE
    if not os.path.isfile(filename):
        return {}
    config = configparser.ConfigParser()
    parsed = config.read(filename)
    if len(parsed) == 0:
import re

try:
    import urllib.parse as urlparse
except ImportError:
    import urlparse

try:
    from html import unescape
except ImportError:
    try:
        from html.parser import HTMLParser
    except ImportError:
        from HTMLParser import HTMLParser

    unescape = HTMLParser().unescape

from typing import Generator
from typing import Union

import html5lib
import requests

from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache
from cachy import CacheManager

import poetry.packages

from poetry.config import Config
from poetry.locations import CACHE_DIR
def __init__(self, data):
    self.data = data
    self.htmlParser = HTMLParser()
from html.parser import HTMLParser


def cleanhtml(raw_html):
    htmlparser = HTMLParser()
    cleantext = htmlparser.unescape(raw_html)
    # cleanr = re.compile('<.*?>')
    # cleantext = re.sub(cleanr, '', raw_html)
    return cleantext
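# Example: only entities are decoded; tags survive because the tag-stripping
# regex above is commented out.
#
#     >>> cleanhtml('<b>Fish &amp; Chips</b>')
#     '<b>Fish & Chips</b>'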
import re

import requests
from html.parser import HTMLParser


def getSKDGenerator():
    """
    Generator to return Staatliche Kunstsammlungen Dresden paintings
    """
    htmlparser = HTMLParser()
    # No watercolors
    baseSearchUrl = u'https://skd-online-collection.skd.museum/Home/Index?page=%s&tIds=2891,2700,2870,2854,2889'

    for i in range(1, 317):
        searchUrl = baseSearchUrl % (i,)
        print(searchUrl)
        searchPage = requests.get(searchUrl)
        searchPageData = searchPage.text

        searchRegex = u'\<a href\=\"\/Details\/Index\/([^\"]+)\"\>'
        idlist = []
        for match in re.finditer(searchRegex, searchPageData):
            idlist.append(match.group(1))

        for pageid in list(set(idlist)):
            url = u'https://skd-online-collection.skd.museum/Details/Index/%s' % (pageid,)
            print(url)
            metadata = {}
            metadata['collectionqid'] = u'Q653002'
            metadata['collectionshort'] = u'SKD'
            # Search is for paintings
            metadata['instanceofqid'] = u'Q3305213'
            metadata['url'] = url

            itemPage = requests.get(url)
            itemPageData = itemPage.text

            titleRegex = u'\<div class\=\"skd-module-text detail-module-text\"\>[\r\n\t\s]*\<h2\>([^\<]+)\<\/h2\>'
            matchTitle = re.search(titleRegex, itemPageData)
            #if not matchTitle:
            #    titleRegex = u'\<dt\>Artwork title\<\/dt\>[\r\n\t\s]*\<dd\>\<em\>\<span class\=\"noItalics\"\>([^\<]+)\<'
            #    matchTitle = re.search(titleRegex, itemPageData)

            metadata['title'] = {
                u'de': htmlparser.unescape(matchTitle.group(1).strip()),
            }

            creatorRegex = u'\<a href\=\"\/Home\/Index\?page\=1\&pId\=\d+\"\>([^\<]+)\<span\>\s*-\s*(Maler|Autor|K\&\#xFC\;nstler)\<\/span\>\<\/a\>'
            creatorMatch = re.search(creatorRegex, itemPageData)
            #if not creatorMatch:
            #    creatorRegex = u'\<dt\>Artist names\<\/dt\>[\r\n\t\s]*\<dd\>\<a href\=\"[^\"]*\">([^\<]+)\<\/a\>'
            #    creatorMatch = re.search(creatorRegex, itemPageData)

            if creatorMatch:
                name = htmlparser.unescape(creatorMatch.group(1).strip())
                print(u'Before name: %s' % (name,))
                # Handle a couple of cases, otherwise just fall back to what we got
                cregexes = [
                    (u'^unbekannt$', u'anonymous'),
                    (u'^([^,]+) \([^\)]*\d+[^\)]\d+\)$', u'\\1'),
                    (u'^(.+), (.+) \(\d\d\d\d-\)$', u'\\2 \\1'),
                    (u'^(.+), (.+) \([^\)]*\d+[^\)]\d+\)$', u'\\2 \\1'),
                    (u'^([^,]+) \([^\)]*\d+[^\)]\d+\)\s*(Kopie nach|Nachfolger|Schule|Umkreis|Werkstatt|zugeschrieben)$', u'\\2 \\1'),
                    (u'^(.+), (.+) \([^\)]*\d+[^\)]\d+\)\s*(Kopie nach|Nachfolger|Schule|Umkreis|Werkstatt|zugeschrieben)$', u'\\3 \\2 \\1'),
                ]
                for (regex, replace) in cregexes:
                    if re.match(regex, name):
                        name = re.sub(regex, replace, name)
                        print(u'After name: %s' % (name,))
                        break
                metadata['creatorname'] = name
            else:
                metadata['creatorname'] = u'anonymous (not found in metadata)'

            # Set the creator qid to anonymous in these cases
            if metadata['creatorname'] == u'anonymous' or \
               metadata['creatorname'].startswith(u'Kopie nach ') or \
               metadata['creatorname'].startswith(u'Nachfolger ') or \
               metadata['creatorname'].startswith(u'Schule ') or \
               metadata['creatorname'].startswith(u'Umkreis ') or \
               metadata['creatorname'].startswith(u'Werkstatt '):
                metadata['creatorqid'] = u'Q4233718'

            # Customized description if the creator is completely unknown
            if metadata['creatorname'] == u'anonymous':
                metadata['description'] = {
                    u'de': u'Gemälde von unbekannt',
                    u'nl': u'schilderij van anonieme schilder',
                    u'en': u'painting by anonymous painter',
                }
            else:
                metadata['description'] = {
                    u'de': u'%s von %s' % (u'Gemälde', metadata.get('creatorname'),),
                    u'nl': u'%s van %s' % (u'schilderij', metadata.get('creatorname'),),
                    u'en': u'%s by %s' % (u'painting', metadata.get('creatorname'),),
                }

            # https://skd-online-collection.skd.museum/Home/Index?page=1&sId=1
            locations = {
                1: u'Q4890',      # Gemäldegalerie Alte Meister
                2: u'Q472706',    # Galerie Neue Meister
                3: u'Q707407',    # Grünes Gewölbe
                4: u'Q50320660',  # Kunstfonds
                5: u'Q1331753',   # Kunstgewerbemuseum
                6: u'Q570620',    # Kupferstich-Kabinett
                7: u'Q321088',    # Mathematisch-Physikalischer Salon
                8: u'Q324263',    # Münzkabinett
                9: u'Q1305061',   # Museum für Sächsische Volkskunst
                10: u'Q1754671',  # Puppentheatersammlung
                11: u'Q473848',   # Porzellansammlung
                12: u'Q571773',   # Rüstkammer
                13: u'Q869690',   # Skulpturensammlung
            }

            locationRegex = u'\<span\>Museum\<\/span\>[\r\n\t\s]*\<\/div\>[\r\n\t\s]*\<div class\=\"col-xs-12 col-sm-8\"\>[\r\n\t\s]*\<span\>\<a href\=\"\/Home\/Index\?page\=1\&sId\=(\d\d?)\"\>'
            locationMatch = re.search(locationRegex, itemPageData)
            metadata['locationqid'] = locations.get(int(locationMatch.group(1)))

            invRegex = u'\<span\>Inventarnummer\<\/span\>[\r\n\t\s]*\<\/div\>[\r\n\t\s]*\<div class\=\"col-xs-12 col-sm-8\"\>[\r\n\t\s]*\<span\>([^\<]+)\<\/span\>'
            invMatch = re.search(invRegex, itemPageData)
            metadata['id'] = invMatch.group(1).strip()
            metadata['idpid'] = u'P217'

            dateRegex = u'\<span\>Ort, Datierung\<\/span\>[\r\n\t\s]*\<\/div\>[\r\n\t\s]*\<div class\=\"col-xs-12 col-sm-8\"\>[\r\n\t\s]*\<span\>\<a href\=\"\/Home\/Index\?page=1&dVon\=(\d\d\d\d)\&dBis\=(\d\d\d\d)\"\>([^\<]+)\<\/a\>'
            dateMatch = re.search(dateRegex, itemPageData)
            if dateMatch:
                circaregex = u'^[uU]m (\d\d\d\d)$'
                circamatch = re.search(circaregex, dateMatch.group(3))
                if circamatch:
                    metadata['inception'] = circamatch.group(1)
                    metadata['inceptioncirca'] = True
                elif dateMatch.group(1) == dateMatch.group(2):
                    metadata['inception'] = dateMatch.group(1)
                else:
                    metadata['inceptionstart'] = int(dateMatch.group(1))
                    metadata['inceptionend'] = int(dateMatch.group(2))

            # acquisition date is not available
            #metadata['acquisitiondate'] = acquisitiondateMatch.group(1)

            mediumRegex = u'\<span\>Material und Technik\<\/span\>[\r\n\t\s]*\<\/div\>[\r\n\t\s]*\<div class\=\"col-xs-12 col-sm-8\"\>[\r\n\t\s]*\<span\>\<a href\=\"\/Home\/Index\?page\=1\&q\=([^\"]+)\"\>'
            mediumMatch = re.search(mediumRegex, itemPageData)
            if mediumMatch and mediumMatch.group(1).strip() == u'%C3%96l%20auf%20Leinwand':
                metadata['medium'] = u'oil on canvas'

            dimensionRegex = u'\<span\>Abmessungen\<\/span\>[\r\n\t\s]*\<\/div\>[\r\n\t\s]*\<div class\=\"col-xs-12 col-sm-8\"\>[\r\n\t\s]*\<span\>([^\<]+)\<\/span\>'
            dimensionMatch = re.search(dimensionRegex, itemPageData)
            if dimensionMatch:
                dimensiontext = dimensionMatch.group(1).strip()
                regex_2d = u'^(?P<height>\d+(,\d+)?)\s*(cm\s*)?(x|×)\s*(?P<width>\d+(,\d+)?)\s*cm$'
                regex_3d = u'^(?P<height>\d+(,\d+)?)\s*(cm\s*)?(x|×)\s*(?P<width>\d+(,\d+)?)\s*(cm\s*)?(x|×)\s*(?P<depth>\d+(,\d+)?)\s*cm$'
                match_2d = re.match(regex_2d, dimensiontext)
                match_3d = re.match(regex_3d, dimensiontext)
                if match_2d:
                    metadata['heightcm'] = match_2d.group(u'height').replace(u',', u'.')
                    metadata['widthcm'] = match_2d.group(u'width').replace(u',', u'.')
                if match_3d:
                    metadata['heightcm'] = match_3d.group(u'height').replace(u',', u'.')
                    metadata['widthcm'] = match_3d.group(u'width').replace(u',', u'.')
                    metadata['depthcm'] = match_3d.group(u'depth').replace(u',', u'.')

            # Image use policy unclear and most (if not all) in copyright
            #imageMatch = re.search(imageregex, itemPageData)
            #if imageMatch:
            #    metadata[u'imageurl'] = imageMatch.group(1)
            #    metadata[u'imageurlformat'] = u'Q2195'  # JPEG

            yield metadata
import six

if six.PY3:
    from html.parser import HTMLParser
else:
    from HTMLParser import HTMLParser

htmlparser = HTMLParser()

from lxml.etree import XMLSyntaxError

import logging
log = logging.getLogger('confluence-tool.page')


class Page(object):
    def __init__(self, api, data, expand=None):
        self.api = api
        self.data = data

        if 'body' in self.data:
            body = self.data['body']
            if 'storage' in body:
                body = body['storage']
                log.debug("body: %s", body['value'])
                body['value'] = htmlparser.unescape(body['value'])
                log.debug("unescaped body: %s", body['value'])
            elif 'view' in body:
def getENameUnparsed(self):
    htmlparse = HTMLParser()
    return htmlparse.unescape(self.eName)
def unescape(s):
    return HTMLParser().unescape(s)
def replace_html_codes(txt):
    # Restore the missing semicolon on bare numeric references like "&#39"
    txt = re.sub("(&#[0-9]+)([^;^0-9]+)", "\\1;\\2", txt)
    txt = HTMLParser.HTMLParser().unescape(txt)
    txt = txt.replace("&quot;", "\"")
    txt = txt.replace("&amp;", "&")
    return txt
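# Worked example: the leading regex first repairs the semicolon on bare
# numeric references before the entities are unescaped.
#
#     >>> replace_html_codes('It&#39s &quot;fine&quot; &amp; dandy')
#     'It\'s "fine" & dandy'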
import sys
from json import JSONDecoder

try:  # Python 3
    from urllib.request import Request, urlopen
    from urllib.error import HTTPError
    from html.parser import HTMLParser
except ImportError:  # Python 2
    from urllib2 import Request, urlopen, HTTPError
    from HTMLParser import HTMLParser


def getitems(subreddit, multireddit=False, previd='', reddit_sort=None):
    """Return list of items from a subreddit.

    :param subreddit: subreddit to load the post
    :param multireddit: multireddit if given instead subreddit
    :param previd: previous post id, to get more post
    :param reddit_sort: type of sorting post
    :returns: list -- list of post url

    :Example:

    >>> # Recent items for Python.
    >>> items = getitems('python')
    >>> for item in items:
    ...     print '\t%s - %s' % (item['title'], item['url']) # doctest: +SKIP

    >>> # Previous items for Python.
    >>> olditems = getitems('python', ITEMS[-1]['id'])
    >>> for item in olditems:
    ...     print '\t%s - %s' % (item['title'], item['url']) # doctest: +SKIP
    """
    if subreddit == '':
        raise Exception("No subreddit provided")

    if multireddit:
        if '/m/' not in subreddit:
            warning = ('That doesn\'t look like a multireddit. Are you sure '
                       'you need that multireddit flag?')
            print(warning)
            sys.exit(1)
        url = 'http://www.reddit.com/user/%s.json' % subreddit
    if not multireddit:
        if '/m/' in subreddit:
            warning = ('It looks like you are trying to fetch a multireddit. \n'
                       'Check the multireddit flag. '
                       'Call --help for more info')
            print(warning)
            sys.exit(1)
        # no sorting needed
        if reddit_sort is None:
            url = 'http://www.reddit.com/r/{}.json'.format(subreddit)
        # if sort is top or controversial, may include advanced sort (ie week, all etc)
        elif 'top' in reddit_sort:
            url = 'http://www.reddit.com/r/{}/{}.json'.format(subreddit, 'top')
        elif 'controversial' in reddit_sort:
            url = 'http://www.reddit.com/r/{}/{}.json'.format(subreddit, 'controversial')
        # use default
        else:
            url = 'http://www.reddit.com/r/{}/{}.json'.format(subreddit, reddit_sort)

    # Get items after item with 'id' of previd.
    hdr = {'User-Agent': 'RedditImageGrab script.'}

    # here is where the query starts
    # query for previd comment
    if previd:
        url = '%s?after=t3_%s' % (url, previd)

    # query for more advanced top and controversial sort
    # available extensions: hour, day, week, month, year, all
    # ie tophour, topweek, topweek etc
    # ie controversialhour, controversialweek etc

    # check if reddit_sort is an advanced sort
    is_advanced_sort = False
    if reddit_sort is not None:
        if reddit_sort == 'top' or reddit_sort == 'controversial':
            # don't need another additional query
            is_advanced_sort = False
        elif 'top' in reddit_sort:
            is_advanced_sort = True
            sort_time_limit = reddit_sort[3:]
            sort_type = 'top'
        elif 'controversial' in reddit_sort:
            is_advanced_sort = True
            sort_time_limit = reddit_sort[13:]
            sort_type = 'controversial'

        if is_advanced_sort:
            # check if url already has a query
            if '?' in url.split('/')[-1]:
                url += '&'
            else:  # url doesn't have a query yet
                url += '?'
            # add advanced sort
            url += 'sort={}&t={}'.format(sort_type, sort_time_limit)

    try:
        req = Request(url, headers=hdr)
        json = urlopen(req).read()
        json = json.decode('utf-8')
        data = JSONDecoder().decode(json)
        if isinstance(data, dict):
            items = [x['data'] for x in data['data']['children']]
        elif isinstance(data, list):
            # e.g. https://www.reddit.com/r/photoshopbattles/comments/29evni.json
            items = [x['data']
                     for subdata in data
                     for x in subdata['data']['children']]
            items = [item for item in items if item.get('url')]
    except HTTPError as ERROR:
        error_message = '\tHTTP ERROR: Code %s for %s' % (ERROR.code, url)
        sys.exit(error_message)
    except ValueError as ERROR:
        if ERROR.args[0] == 'No JSON object could be decoded':
            error_message = 'ERROR: subreddit "%s" does not exist' % (subreddit)
            sys.exit(error_message)
        raise ERROR
    except KeyboardInterrupt as ERROR:
        error_message = '\tKeyboardInterrupt: url:{}.'.format(url)
        sys.exit(error_message)

    # This is weird but apparently necessary: reddit's json data
    # returns `url` values html-escaped, whereas we normally need them
    # in the way they are meant to be downloaded (i.e. urlquoted at
    # most).
    htmlparser = HTMLParser()
    for item in items:
        if item.get('url'):
            item['url'] = htmlparser.unescape(item['url'])

    return items
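# Worked example of the advanced-sort handling above: a value like 'topweek'
# splits into sort_type 'top' plus time limit reddit_sort[3:] == 'week',
# which is appended as a query string:
#
#     getitems('python', reddit_sort='topweek')
#     # fetches http://www.reddit.com/r/python/top.json?sort=top&t=week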
def normalizeTextForTagger(text):
    text = text.replace("&amp;", "&")
    text = HTMLParser.HTMLParser().unescape(text)
    return text
from html.parser import HTMLParser


# Assumption: the handler methods below belong to an HTMLParser subclass
# whose class header is not shown in the excerpt; a name is supplied here
# so the script actually runs.
class MyHTMLParser(HTMLParser):
    def handle_starttag(self, tag, attrs):
        print('Start tag: ', tag)
        for attr in attrs:
            print('attr: ', attr)

    def handle_endtag(self, tag):
        print('End tag: ', tag)

    def handle_comment(self, data):
        print('Comment: ', data)

    def handle_data(self, data):
        print('Data: ', data)


parser = MyHTMLParser()
parser.feed('<html><head><code></title></head><body><h1><!--hi-->I am a code</h1></body></html>')
print()

user_input = input('Put in the HTML code')
parser.feed(user_input)
print()

htmlFile = open('samHTML.html', 'r')
s = ''
for line in htmlFile:
    s += line
parser.feed(s)
import requests
import pywikibot
from html.parser import HTMLParser


def getBarnesGenerator():
    """
    Generator to return Barnes Foundation paintings
    """
    size = 100
    basesearchurl = u'https://collection.barnesfoundation.org/api/search?body={%%22from%%22:%s,%%22size%%22:%s}'
    htmlparser = HTMLParser()

    # 963 results, 20 per page (starting at 0)
    for i in range(0, 2700, size):
        searchurl = basesearchurl % (i, size)
        pywikibot.output(searchurl)
        searchPage = requests.get(searchurl)
        searchJson = searchPage.json()

        for object in searchJson.get(u'hits').get(u'hits'):
            item = object.get(u'_source')
            #print (item)
            metadata = {}

            #print (item.get('classification'))
            if not item.get('classification') == u'Paintings':
                continue
            # We checked, it's a painting
            metadata['instanceofqid'] = u'Q3305213'

            #print (itemurl)
            metadata['artworkidpid'] = u'P4709'
            # Something weird going on with the id's
            if item.get('id'):
                metadata['artworkid'] = u'%s' % (item.get('id'),)
            elif object.get(u'_id'):
                metadata['artworkid'] = u'%s' % (object.get('_id'),)
            # This will crash the bot if no valid id was found
            url = u'https://collection.barnesfoundation.org/objects/%s/details' % (metadata['artworkid'],)

            # Museum site probably doesn't like it when we go fast
            # time.sleep(5)

            pywikibot.output(url)
            #itempage = requests.get(url)
            metadata['url'] = url

            metadata['collectionqid'] = u'Q808462'
            metadata['collectionshort'] = u'Barnes'
            metadata['locationqid'] = u'Q808462'

            # Get the ID. This needs to burn if it's not available
            metadata['id'] = item.get('invno')
            metadata['idpid'] = u'P217'

            if item.get('title'):
                title = htmlparser.unescape(item.get('title'))
            else:
                title = u'(without title)'

            metadata['title'] = { u'en': title, }

            name = htmlparser.unescape(item.get('people'))
            #if u',' in name:
            #    (surname, sep, firstname) = name.partition(u',')
            #    name = u'%s %s' % (firstname.strip(), surname.strip(),)

            metadata['creatorname'] = name
            metadata['description'] = {
                u'nl': u'%s van %s' % (u'schilderij', metadata.get('creatorname'),),
                u'en': u'%s by %s' % (u'painting', metadata.get('creatorname'),),
            }
            metadata['inception'] = item.get('displayDate')

            if item.get('medium') and item.get('medium').strip() == u'Oil on canvas':
                metadata['medium'] = u'oil on canvas'

            # Could implement this later again
            #if bigmatch.group(u'dimensions'):
            #    dimensiontext = bigmatch.group(u'dimensions').strip()
            #    regex_2d = u'.+\((?P<height>\d+(\.\d+)?) x (?P<width>\d+(\.\d+)?) cm\)$'
            #    regex_3d = u'.+\((?P<height>\d+(\.\d+)?) x (?P<width>\d+(\.\d+)?) x (?P<depth>\d+(\.\d+)?) cm\)$'
            #    match_2d = re.match(regex_2d, dimensiontext)
            #    match_3d = re.match(regex_3d, dimensiontext)
            #    if match_2d:
            #        metadata['heightcm'] = match_2d.group(u'height')
            #        metadata['widthcm'] = match_2d.group(u'width')
            #    elif match_3d:
            #        metadata['heightcm'] = match_3d.group(u'height')
            #        metadata['widthcm'] = match_3d.group(u'width')
            #        metadata['depthcm'] = match_3d.group(u'depth')

            if not item.get('copyright') and item.get('objRightsTypeId') == u'8':
                if item.get('imageOriginalSecret'):
                    metadata[u'imageurl'] = u'http://s3.amazonaws.com/barnes-image-repository/images/%s_%s_o.jpg' % (metadata['artworkid'], item.get('imageOriginalSecret'))
                    metadata[u'imageurlformat'] = u'Q2195'  # JPEG
            yield metadata
def __init__(self):
    self.session = self.get_session()
    self.parser = HTMLParser()
# The excerpt starts inside a Python 2/3 compatibility block; the `if PY3:`
# head and the PY3-side HTMLParser import are reconstructed here as an
# assumption so the `else:` branch parses.
if PY3:
    from html.parser import HTMLParser
    from urllib.parse import quote_plus, urlparse
    from urllib.request import urlopen
    from urllib.error import HTTPError
    unicode = bytes
    unichr = chr
else:
    from HTMLParser import HTMLParser
    from urllib import quote_plus
    from urllib2 import urlopen, HTTPError
    from urlparse import urlparse

API_URL = "http://node-hnapi.herokuapp.com"
MARKDOWN_URL = "http://fuckyeahmarkdown.com/go/?read=1&u="
SEARCH = ("https://hn.algolia.com/api/v1/search"
          + "?tags=story&hitsPerPage=60&query=")

html = HTMLParser()


def bwrite(s):
    b = vim.current.buffer

    # Never write more than two blank lines in a row
    if not s.strip() and not b[-1].strip() and not b[-2].strip():
        return

    # Vim buffer.append() cannot accept unicode type,
    # must first encode to UTF-8 string
    if isinstance(s, unicode):
        s = s.encode('utf-8', errors='replace')

    # Code block markers for syntax highlighting
    cb = unichr(160)
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
import sys
import libardrssparser
import libmediathek3 as libMediathek
from html.parser import HTMLParser

h = HTMLParser()

useThumbAsFanart = True
baseUrl = "http://www.ardmediathek.de"
defaultThumb = baseUrl + "/ard/static/pics/default/16_9/default_webM_16_9.jpg"
defaultBackground = "http://www.ard.de/pool/img/ard/background/base_xl.jpg"
icon = ''  # todo
showDateInTitle = False


def listRSS(url, page=0):
    if page > 1:
        url += '&mcontents=page.' + str(page)
    response = libMediathek.getUrl(url)
    data = libardrssparser.parser(response)
    if page == 0:
        return data
    else:
        if len(data) == 50:
            return data, True
        else:
            return data, False
try:
    from html import unescape
except ImportError:
    from html.parser import HTMLParser

    unescape = HTMLParser().unescape  # type: ignore

from re import IGNORECASE
from re import compile as re_compile
from typing import Optional

from crontab import CronItem
from wtforms import StringField
from wtforms import TextAreaField
from wtforms.validators import HostnameValidation
from wtforms.validators import Regexp
from wtforms.validators import ValidationError


class CronSchedule:
    def __init__(self, message=None):
        self.message = message

    def __call__(self, form, field, message=None):
        schedule = (field.data or '').strip()
        if not schedule:
            return
        try:
            CronItem().setall(schedule)
        except (KeyError, ValueError):
            message = message or self.message or field.gettext('Invalid cron')
            # The excerpt ends here; rejecting the field is the natural
            # completion for a wtforms validator.
            raise ValidationError(message)
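# Standalone check mirroring the validator's core test, assuming the
# python-crontab package that provides CronItem:
#
#     item = CronItem()
#     item.setall('*/5 * * * *')   # parses fine: a valid schedule
#     item.setall('not a cron')    # raises KeyError/ValueError -> ValidationError above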
def identify(self, log, result_queue, abort, title=None, authors=None,
             identifiers={}, timeout=90):  # {{{
    from calibre.ebooks.chardet import xml_to_unicode
    from html.parser import HTMLParser
    from lxml import etree, html
    import json

    if not self.is_configured():
        return
    query = self.create_query(log, title=title, authors=authors, identifiers=identifiers)
    if not query:
        err = 'Insufficient metadata to construct query'
        log.error(err)
        return err

    try:
        raw = self.browser.open_novisit(query).read()
    except Exception as e:
        log.exception('Failed to make identify query: %r' % query)
        return as_unicode(e)

    try:
        doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
        entries_block = doc.xpath('//div[@class="bSearchResult"]')
        # log.debug(u'HTML: %s' % xml_to_unicode(raw, verbose=True)[0])
        if entries_block:
            entries = doc.xpath('//div[contains(@itemprop, "itemListElement")]')
            # log.debug(u'entries_block')
            # for entry in entries:
            #     log.debug('entries %s' % etree.tostring(entry))
            metadata = self.get_metadata(log, entries, title, authors, identifiers)
            self.get_all_details(log, metadata, abort, result_queue, identifiers, timeout)
        else:
            # Redirect page: trying to extract ozon_id from javascript data
            h = HTMLParser()
            entry_string = h.unescape(etree.tostring(doc, pretty_print=True, encoding=str))
            json_pat = re.compile(r'dataLayer\s*=\s*(.+)?;')
            json_info = re.search(json_pat, entry_string)
            jsondata = json_info.group(1) if json_info else None
            # log.debug(u'jsondata: %s' % jsondata)
            dataLayer = json.loads(jsondata) if jsondata else None

            ozon_id = None
            if dataLayer and dataLayer[0] and 'ecommerce' in dataLayer[0]:
                jsproduct = dataLayer[0]['ecommerce']['detail']['products'][0]
                ozon_id = as_unicode(jsproduct['id'])
                entry_title = as_unicode(jsproduct['name'])
                log.debug('ozon_id %s' % ozon_id)
                log.debug('entry_title %s' % entry_title)

                if ozon_id:
                    metadata = self.to_metadata_for_single_entry(log, ozon_id, entry_title, authors)
                    identifiers['ozon'] = ozon_id
                    self.get_all_details(log, [metadata], abort, result_queue,
                                         identifiers, timeout, cachedPagesDict={})

            if not ozon_id:
                log.error('No SearchResults in Ozon.ru response found!')
    except Exception as e:
        log.exception('Failed to parse identify results')
        return as_unicode(e)
import re

import requests
import pywikibot
from html.parser import HTMLParser


def getGilcreaseGenerator():
    """
    Generator to return Gilcrease Museum paintings
    """
    basesearchurl = u'https://collections.gilcrease.org/search/site?page=%s&f%%5B0%%5D=im_field_classification%%3A1045'
    htmlparser = HTMLParser()

    # 2307 hits, 20 per page
    for i in range(0, 116):
        searchurl = basesearchurl % (i,)
        print(searchurl)
        searchPage = requests.get(searchurl)

        workidregex = u'\<a href\=\"https\:\/\/collections\.gilcrease\.org\/object\/(\d+)\"'
        matches = re.finditer(workidregex, searchPage.text)
        for match in matches:
            url = u'https://collections.gilcrease.org/object/%s' % (match.group(1),)
            metadata = {}

            itempage = requests.get(url)
            pywikibot.output(url)
            metadata['url'] = url

            metadata['collectionqid'] = u'Q14708424'
            metadata['collectionshort'] = u'Gilcrease'
            metadata['locationqid'] = u'Q14708424'

            # No need to check, I'm actually searching for paintings.
            metadata['instanceofqid'] = u'Q3305213'
            metadata['idpid'] = u'P217'

            invregex = u'\<div class\=\"field-label\"\>Accession No\:&nbsp\;\<\/div\>\<div class\=\"field-items\"\>\<div class\=\"field-item even\"\>([^\<]+)\<\/div\>'
            invmatch = re.search(invregex, itempage.text)
            # Not sure if I need to replace the non-breaking space here
            metadata['id'] = htmlparser.unescape(invmatch.group(1).replace(u'&nbsp;', u' ')).strip()

            titleregex = u'\<div class\=\"field-label\"\>Title\(s\)\:&nbsp\;\<\/div\>\<div class\=\"field-items\"\><div class\=\"field-item even\"\>([^\<]+)\<\/div\>'
            titlematch = re.search(titleregex, itempage.text)
            title = htmlparser.unescape(titlematch.group(1)).strip()

            # Chop chop, several very long titles
            if len(title) > 220:
                title = title[0:200]
            metadata['title'] = { u'en': title, }

            creatorregex = u'\<div class\=\"field-label\"\>Creator\(s\)\:&nbsp\;\<\/div\>\<div class\=\"field-items\"\>\<div class\=\"field-item even\"\>([^\<]+)\<\/div\>'
            creatormatch = re.search(creatorregex, itempage.text)
            # Rare cases without a match
            if creatormatch:
                creatorname = htmlparser.unescape(creatormatch.group(1)).strip()
                metadata['creatorname'] = creatorname
                metadata['description'] = {
                    u'nl': u'%s van %s' % (u'schilderij', metadata.get('creatorname'),),
                    u'en': u'%s by %s' % (u'painting', metadata.get('creatorname'),),
                    u'de': u'%s von %s' % (u'Gemälde', metadata.get('creatorname'),),
                    u'fr': u'%s de %s' % (u'peinture', metadata.get('creatorname'),),
                }

            # Let's see if we can extract some dates.
            dateregex = u'\<div class\=\"field-label\"\>Date\:&nbsp\;\<\/div\>\<div class\=\"field-items\"\>\<div class\=\"field-item even\"\>(\d\d\d\d)\<\/div\>'
            datecircaregex = u'\<div class\=\"field-label\"\>Date\:&nbsp\;\<\/div\>\<div class\=\"field-items\"\>\<div class\=\"field-item even\"\>circa (\d\d\d\d)\<\/div\>'
            periodregex = u'\<span property\=\"dateCreated\" itemprop\=\"dateCreated\" class\=\"detailFieldValue\"\>(\d\d\d\d)[-–](\d\d\d\d)\<\/span\>'  # Not seen
            circaperiodregex = u'\<span property\=\"dateCreated\" itemprop\=\"dateCreated\" class\=\"detailFieldValue\"\>about (\d\d\d\d)[-–](\d\d\d\d)\<\/span\>'  # Not seen
            shortperiodregex = u'\<meta content\=\"(\d\d)(\d\d)-(\d\d)\" property\=\"schema\:dateCreated\" itemprop\=\"dateCreated\"\>'  # Not seen
            circashortperiodregex = u'\<meta content\=\"ca?\.\s*(\d\d)(\d\d)-(\d\d)\" property\=\"schema\:dateCreated\" itemprop\=\"dateCreated\"\>'  # Not seen
            otherdateregex = u'\<div class\=\"field-label\"\>Date\:&nbsp\;\<\/div\>\<div class\=\"field-items\"\>\<div class\=\"field-item even\"\>([^\<]+)\<\/div\>'

            datematch = re.search(dateregex, itempage.text)
            datecircamatch = re.search(datecircaregex, itempage.text)
            periodmatch = re.search(periodregex, itempage.text)
            circaperiodmatch = re.search(circaperiodregex, itempage.text)
            shortperiodmatch = re.search(shortperiodregex, itempage.text)
            circashortperiodmatch = re.search(circashortperiodregex, itempage.text)
            otherdatematch = re.search(otherdateregex, itempage.text)

            if datematch:
                metadata['inception'] = int(datematch.group(1).strip())
            elif datecircamatch:
                metadata['inception'] = int(datecircamatch.group(1).strip())
                metadata['inceptioncirca'] = True
            elif periodmatch:
                metadata['inceptionstart'] = int(periodmatch.group(1))
                metadata['inceptionend'] = int(periodmatch.group(2))
            elif circaperiodmatch:
                metadata['inceptionstart'] = int(circaperiodmatch.group(1))
                metadata['inceptionend'] = int(circaperiodmatch.group(2))
                metadata['inceptioncirca'] = True
            elif shortperiodmatch:
                metadata['inceptionstart'] = int(u'%s%s' % (shortperiodmatch.group(1), shortperiodmatch.group(2),))
                metadata['inceptionend'] = int(u'%s%s' % (shortperiodmatch.group(1), shortperiodmatch.group(3),))
            elif circashortperiodmatch:
                metadata['inceptionstart'] = int(u'%s%s' % (circashortperiodmatch.group(1), circashortperiodmatch.group(2),))
                metadata['inceptionend'] = int(u'%s%s' % (circashortperiodmatch.group(1), circashortperiodmatch.group(3),))
                metadata['inceptioncirca'] = True
            elif otherdatematch:
                print(u'Could not parse date: "%s"' % (otherdatematch.group(1),))

            # Credit line sometimes contains a date
            acquisitiondateregex = u'\<div class\=\"field-label\"\>Credit Line\:&nbsp\;\<\/div\>\<div class\=\"field-items\"\>\<div class\=\"field-item even\"\>[^\<]+ (\d\d\d\d)\<\/div\>'
            acquisitiondatematch = re.search(acquisitiondateregex, itempage.text)
            if acquisitiondatematch:
                metadata['acquisitiondate'] = int(acquisitiondatematch.group(1))

            mediumregex = u'\<div class\=\"field-label\"\>Materials\/Techniques\:&nbsp\;\<\/div\>\<div class\=\"field-items\"\><div class\=\"field-item even\"\>oil on canvas\<\/div\>'
            mediummatch = re.search(mediumregex, itempage.text)
            if mediummatch:
                metadata['medium'] = u'oil on canvas'

            # Dimensions is a pain to parse
            #measurementsregex = u'\<div class\=\"detailField dimensionsField\"\>\<span class\=\"detailFieldLabel\"\>Dimensions\:\<\/span\>\<span class\=\"detailFieldValue\"\>\<div\>(board|canvas|panel)?\:\s*(?P<dim>[^\<]+)\<\/div\>'
            #measurementsmatch = re.search(measurementsregex, itempage.text)
            #if measurementsmatch:
            #    measurementstext = measurementsmatch.group(u'dim')
            #    regex_2d = u'^(?P<height>\d+(\.\d+)?)\s*x\s*(?P<width>\d+(\.\d+)?)\s*cm'
            #    match_2d = re.match(regex_2d, measurementstext)
            #    if match_2d:
            #        metadata['heightcm'] = match_2d.group(u'height').replace(u',', u'.')
            #        metadata['widthcm'] = match_2d.group(u'width').replace(u',', u'.')

            # Add genre portrait. Tagging so other things don't seem to be very good quality
            portraitregex = u'\<a href\=\"\/tags\/portraits\"\>portraits?\<\/a\>'
            portraitmatch = re.search(portraitregex, itempage.text)
            if portraitmatch:
                metadata[u'genreqid'] = u'Q134307'

            ## NO free images
            #imageregex = u'\<meta property\=\"og:image\" content\=\"([^\"]+)\"\ \/\>'
            #imagematch = re.search(imageregex, itempage.text)
            #if imagematch and u'https://creativecommons.org/licenses/by-sa/4.0/' in itempage.text:
            #    metadata[u'imageurl'] = imagematch.group(1)
            #    metadata[u'imageurlformat'] = u'Q2195'  # JPEG
            #    metadata[u'imageurllicense'] = u'Q18199165'  # cc-by-sa-4.0
            #    metadata[u'imageoperatedby'] = u'Q262234'
            #    # Used this to add suggestions everywhere
            #    #metadata[u'imageurlforce'] = True

            yield metadata
class GlobalVars:
    false_positives = []
    whitelisted_users = []
    blacklisted_users = []
    blacklisted_usernames = []
    blacklisted_websites = []
    bad_keywords = []
    watched_keywords = {}
    ignored_posts = []
    auto_ignored_posts = []
    startup_utc_date = datetime.utcnow()
    startup_utc = startup_utc_date.strftime("%H:%M:%S")
    latest_questions = []
    api_backoff_time = 0
    deletion_watcher = None

    metasmoke_last_ping_time = datetime.now()
    not_privileged_warning = \
        "You are not a privileged user. Please see " \
        "[the privileges wiki page](https://charcoal-se.org/smokey/Privileges) for " \
        "information on what privileges are and what is expected of privileged users."

    experimental_reasons = {  # Don't widely report these
        "potentially bad keyword in answer",
        "potentially bad keyword in body",
        "potentially bad keyword in title",
        "potentially bad keyword in username",
        "potentially bad NS for domain in title",
        "potentially bad NS for domain in body",
        "toxic body detected",
        "toxic answer detected",
    }

    parser = HTMLParser()
    parser.unescape = unescape

    code_privileged_users = None

    censored_committer_names = {"3f4ed0f38df010ce300dba362fa63a62": "Undo1"}

    # GlobalVars.reload()
    commit = None
    commit_with_author = None
    on_master = None

    s = ""
    s_reverted = ""
    s_norestart = ""
    s_norestart2 = ""
    apiquota = -1
    bodyfetcher = None
    se_sites = []
    why_data = []
    notifications = []
    listen_to_these_if_edited = []
    multiple_reporters = []
    api_calls_per_site = {}
    reason_weights = {}

    standby_message = ""
    standby_mode = False

    api_request_lock = threading.Lock()

    num_posts_scanned = 0
    post_scan_time = 0
    posts_scan_stats_lock = threading.Lock()

    config_parser = RawConfigParser()

    if os.path.isfile('config') and "pytest" not in sys.modules:
        config_parser.read('config')
        log('debug', "Configuration loaded from \"config\"")
    else:
        config_parser.read('config.ci')
        if "pytest" in sys.modules and os.path.isfile('config'):  # Another config found while running in pytest
            log('debug', "Running in pytest, force load config from \"config.ci\"")
        else:
            log('debug', "Configuration loaded from \"config.ci\"")

    config = config_parser["Config"]  # It's a collections.OrderedDict now

    # environ_or_none replaced by os.environ.get (essentially dict.get)
    bot_name = os.environ.get("SMOKEDETECTOR_NAME", "SmokeDetector")
    bot_repo_slug = os.environ.get("SMOKEDETECTOR_REPO", "Charcoal-SE/SmokeDetector")
    bot_repository = "//github.com/{}".format(bot_repo_slug)
    chatmessage_prefix = "[{}]({})".format(bot_name, bot_repository)

    site_id_dict = {}
    post_site_id_to_question = {}

    location = config.get("location", "Continuous Integration")

    metasmoke_ws = None
    metasmoke_down = False
    metasmoke_failures = 0  # Consecutive count, not cumulative

    chatexchange_u = config.get("ChatExchangeU")
    chatexchange_p = config.get("ChatExchangeP")

    metasmoke_host = config.get("metasmoke_host")
    metasmoke_key = config.get("metasmoke_key")
    metasmoke_ws_host = config.get("metasmoke_ws_host")

    github_username = config.get("github_username")
    github_password = config.get("github_password")

    perspective_key = config.get("perspective_key")

    flovis_host = config.get("flovis_host")
    flovis = None

    # Miscellaneous
    log_time_format = config.get("log_time_format", "%H:%M:%S")

    valid_content = """This is a totally valid post that should never be caught.
Any blacklist or watchlist item that triggers on this item should be avoided.
java.io.BbbCccDddException: nothing wrong found.
class Safe { perfect valid code(int float &#%$*v a b c =+ /* - 0 1 2 3 456789.EFGQ}
English 中文Français Español Português Italiano Deustch
~@#%*-_/'()?!:;" vvv kkk www sss ttt mmm absolute std::adjacent_find (power).each do |s| bbb end
ert zal l gsopsq kdowhs@ xjwk* %_sooqmzb xjwpqpxnf."""  # noqa: E501

    @staticmethod
    def reload():
        commit = git_commit_info()
        censored_committer_names = GlobalVars.censored_committer_names
        if md5(commit['author'][0].encode('utf-8')).hexdigest() in censored_committer_names:
            commit['author'] = censored_committer_names[md5(commit['author'][0].encode('utf-8')).hexdigest()]
        GlobalVars.commit = commit
        GlobalVars.commit_with_author = "`{}` ({}: {})".format(
            commit['id'],
            commit['author'][0] if type(commit['author']) in {list, tuple} else commit['author'],
            commit['message'])
        GlobalVars.on_master = git_ref_q()
        GlobalVars.s = "[ {} ] SmokeDetector started at [rev {}]({}/commit/{}) (running on {}, Python {})".format(
            GlobalVars.chatmessage_prefix, GlobalVars.commit_with_author, GlobalVars.bot_repository,
            GlobalVars.commit['id'], GlobalVars.location, platform.python_version())
        GlobalVars.s_reverted = \
            "[ {} ] SmokeDetector started in [reverted mode](" \
            "https://charcoal-se.org/smokey/SmokeDetector-Statuses#reverted-mode) " \
            "at [rev {}]({}/commit/{}) (running on {})".format(
                GlobalVars.chatmessage_prefix, GlobalVars.commit_with_author, GlobalVars.bot_repository,
                GlobalVars.commit['id'], GlobalVars.location)
        GlobalVars.s_norestart = "[ {} ] Blacklists reloaded at [rev {}]({}/commit/{}) (running on {})".format(
            GlobalVars.chatmessage_prefix, GlobalVars.commit_with_author, GlobalVars.bot_repository,
            GlobalVars.commit['id'], GlobalVars.location)
        GlobalVars.s_norestart2 = "[ {} ] FindSpam module reloaded at [rev {}]({}/commit/{}) (running on {})".format(
            GlobalVars.chatmessage_prefix, GlobalVars.commit_with_author, GlobalVars.bot_repository,
            GlobalVars.commit['id'], GlobalVars.location)
        GlobalVars.standby_message = \
            "[ {} ] SmokeDetector started in [standby mode](" \
            "https://charcoal-se.org/smokey/SmokeDetector-Statuses#standby-mode) " \
            "at [rev {}]({}/commit/{}) (running on {})".format(
                GlobalVars.chatmessage_prefix, GlobalVars.commit_with_author, GlobalVars.bot_repository,
                GlobalVars.commit['id'], GlobalVars.location)
        log('debug', "GlobalVars loaded")
class GlobalVars:
    false_positives = []
    whitelisted_users = []
    blacklisted_users = []
    ignored_posts = []
    auto_ignored_posts = []
    startup_utc = datetime.utcnow().strftime("%H:%M:%S")
    latest_questions = []
    api_backoff_time = 0

    charcoal_room_id = "11540"
    meta_tavern_room_id = "89"
    socvr_room_id = "41570"

    blockedTime = {"all": 0, charcoal_room_id: 0, meta_tavern_room_id: 0, socvr_room_id: 0}
    metasmoke_last_ping_time = datetime.now()
    not_privileged_warning = """
    You are not a privileged user. Please see
    [the privileges wiki page](https://charcoal-se.org/smokey/Privileges) for
    information on what privileges are and what is expected of privileged users.
    """.strip().replace("\n", " ")

    experimental_reasons = [  # Don't widely report these
        "potentially bad keyword in answer",
        "potentially bad keyword in body",
        "potentially bad keyword in title",
        "potentially bad keyword in username"
    ]
    non_socvr_reasons = []    # Don't report to SOCVR
    non_tavern_reasons = [    # Don't report in the Tavern
        "all-caps body",
        "all-caps answer",
        "repeating characters in body",
        "repeating characters in title",
        "repeating characters in answer",
        "few unique characters in body",
        "few unique characters in answer",
        "title has only one unique char",
        "phone number detected in title",
        "offensive body detected",
        "no whitespace in body",
        "no whitespace in answer",
    ]
    non_tavern_sites = ["stackoverflow.com"]

    parser = HTMLParser()
    wrap = Client("stackexchange.com")
    wrapm = Client("meta.stackexchange.com")
    wrapso = Client("stackoverflow.com")

    privileged_users = {
        charcoal_room_id: [
            "117490",  # Normal Human
            "66258",   # Andy
            "31768",   # ManishEarth
            "103081",  # hichris123
            "73046",   # Undo
            "88521",   # ProgramFOX
            "59776",   # Doorknob
            "31465",   # Seth
            "88577",   # Santa Claus
            "34124",   # Andrew Leach
            "54229",   # apnorton
            "20459",   # S.L. Barth
            "32436",   # tchrist
            "30477",   # Brock Adams
            "58529",   # ferrybig
            "145208",  # Robert Longson
            "178825",  # Ms Yvette
            "171800",  # JAL
            "64978",   # PeterJ
            "125141",  # Jeffrey Bosboom
            "54902",   # bummi
            "135450",  # M.A.R.
            "145604",  # Quill
            "60548",   # rene
            "121401",  # michaelpri
            "116218",  # JamesENL
            "82927",   # Braiam
            "11606",   # bwDraco
            "19761",   # Ilmari Karonen
            "108271",  # Andrew T.
            "171054",  # Magisch
            "190011",  # Petter Friberg
            "165661",  # Tunaki
            "145086",  # Wai Ha Lee
            "137665",  # ByteCommander
            "147884",  # wythagoras
            "186395",  # Åna
            "181293",  # Ashish Ahuja
            "163686",  # Gothdo
            "145827",  # angussidney
            "244748",  # Supreme Leader SnokeDetector (angussidney's sock)
            "121520",  # ArtOfCode
            "244382",  # Lt. A. Code (ArtOfCode's sock to test things with)
            "137388",  # QPaysTaxes
            "212311",  # Ryan Bemrose
            "172397",  # Kyll
            "224538",  # FrankerZ
            "61202",   # OldSkool
            "56166",   # Jan Dvorak
            "133966",  # DavidPostill
            "22839",   # djsmiley2k
            "97389",   # Kaz Wolfe
            "144962",  # DJMcMayhem
            "139423",  # NobodyNada
            "62118",   # tripleee
            "130558",  # Registered User
            "128113",  # arda
            "164318",  # Glorfindel
            "175347",  # Floern
            "180274",  # Alexander O'Mara
            "158742",  # Rob
            "207356",  # 4castle
            "133031",  # Mithrandir
            "215671",  # Locutus of Borg (Mithrandir's Sock)
            "169713",  # Mego
            "126657",  # Cerbrus
            "10145",   # Thomas Ward
            "161943",  # J F
            "195967",  # CaffeineAddiction
            "5363",    # Stijn
            "248139",  # FelixSFD
            "156721",  # D-side
            "167070",  # quartata
            "172450",  # Hovercraft Full Of Eels
            "56200",   # Eric Leschinski
            "211021",  # Henders
            "255290",  # Gypsy Spellweaver
            "64521",   # CalvT
            "165474",  # Hyper Neutrino
            "281362",  # Hyper Neutrino v2
            "169252",  # Cai
            "155243",  # Nisse Engström
            "69330",   # Sconibulus
            "164187",  # Okx
            "202619",  # John Militer
        ],
        meta_tavern_room_id: [
            "315433",  # Normal Human
            "244519",  # CRABOLO
            "244382",  # TGMCians
            "194047",  # Jan Dvorak
            "158100",  # rene
            "178438",  # Manishearth
            "237685",  # hichris123
            "215468",  # Undo
            "229438",  # ProgramFOX
            "180276",  # Doorknob
            "161974",  # Lynn Crumbling
            "186281",  # Andy
            "266094",  # Unihedro
            "245167",  # Infinite Recursion
            "230261",  # Jason C
            "213575",  # Braiam
            "241919",  # Andrew T.
            "203389",  # backwards-Seth
            "202832",  # Mooseman
            "160017",  # bwDraco
            "201151",  # bummi
            "188558",  # Frank
            "229166",  # Santa Claus
            "159034",  # Kevin Brown
            "203972",  # PeterJ
            "188673",  # Alexis King
            "258672",  # AstroCB
            "227577",  # Sam
            "255735",  # cybermonkey
            "279182",  # Ixrec
            "271104",  # James
            "220428",  # Qantas 94 Heavy
            "153355",  # tchrist
            "238426",  # Ed Cottrell
            "166899",  # Second Rikudo
            "287999",  # ASCIIThenANSI
            "208518",  # JNat
            "284141",  # michaelpri
            "260312",  # vaultah
            "244062",  # SouravGhosh
            "152859",  # Shadow Wizard
            "201314",  # apnorton
            "280934",  # M.A.Ramezani
            "200235",  # durron597
            "148310",  # Awesome Poodles / Brock Adams
            "168333",  # S.L. Barth
            "257207",  # Unikitty
            "244282",  # DroidDev
            "163250",  # Cupcake
            "298265",  # BoomsPlus
            "253560",  # josilber
            "244254",  # misterManSam
            "188189",  # Robert Longson
            "174699",  # Ilmari Karonen
            "202362",  # chmod 666 telkitty
            "289717",  # Quill
            "237813",  # bjb568
            "311345",  # Simon Klaver
            "171881",  # rekire
            "260388",  # Pandya
            "310756",  # Ms Yvette
            "262399",  # Jeffrey Bosboom
            "242209",  # JAL
            "280883",  # ByteCommander
            "302251",  # kos
            "262823",  # ArtOfCode
            "215067",  # Ferrybig
            "308386",  # Magisch
            "285368",  # angussidney
            "158829",  # Thomas Ward
            "294691",  # Mithrandir
            "203553",  # CalvT
            "289971"   # Hyper Neutrino
        ],
        socvr_room_id: [
            "1849664",  # Undo
            "2581872",  # hichris123
            "1198729",  # Manishearth
            "3717023",  # Normal Human aka 1999
            "2619912",  # ProgramFOX
            "578411",   # rene
            "1043380",  # gunr2171
            "2246344",  # Sam
            "2756409",  # TylerH
            "1768232",  # durron597
            "359284",   # Kevin Brown
            "258400",   # easwee
            "3622940",  # Unihedron
            "3204551",  # Deduplicator
            "4342498",  # NathanOliver
            "4639281",  # Tiny Giant
            "3093387",  # josilber
            "1652962",  # cimmanon
            "1677912",  # Mogsdad
            "656243",   # Lynn Crumbling
            "3933332",  # Rizier123
            "2422013",  # cybermonkey
            "3478852",  # Nisse Engström
            "2302862",  # Siguza
            "1324",     # Paul Roub
            "1743880",  # Tunaki
            "1663001",  # DavidG
            "2415822",  # JAL
            "4174897",  # Kyll
            "5299236",  # Kevin Guan
            "4050842",  # Thaillie
            "1816093",  # Drew
            "874188",   # Triplee
            "880772",   # approxiblue
            "1835379",  # Cerbrus
            "3956566",  # JamesENL
            "2357233",  # Ms Yvette
            "3155639",  # AlexanderOMara
            "462627",   # Praveen Kumar
            "4490559",  # intboolstring
            "1364007",  # Wai Ha Lee
            "1699210",  # bummi
            "563532",   # Rob
            "5389107",  # Magisch
            "4099593",  # bhargav-rao
            "1542723",  # Ferrybig
            "2025923",  # Tushar
            "5292302",  # Petter Friberg
            "792066",   # Braiam
            "5666987",  # Ian
            "3160466",  # ArtOfCode
            "4688119",  # Ashish Ahuja
            "3476191",  # Nobody Nada
            "2227743",  # Eric D
            "821878",   # Ryan Bemrose
            "1413395",  # Panta Rei
            "4875631",  # FrankerZ
            "2958086",  # Compass
            "499214",   # JanDvorak
            "5647260",  # Andrew L.
            "559745",   # Floern
            "5743988",  # 4castle
            "4622463",  # angussidney
            "603346",   # Thomas Ward
            "3002139",  # Baum mit Augen
            "1863564",  # QPaysTaxes
            "4687348",  # FelixSFD
            "4751173",  # Glorfindel
            "2233391",  # henders
            "4805174",  # kayess
            "2370483",  # Machavity
            "1873567",  # CalvT
            "4826457"   # suraj
        ],
        '111347': [
            "3160466",  # ArtOfCode
            "1849664",  # Undo
            "3002139",  # Baum mit Augen
            "3476191",  # Nobody Nada
            "5292302",  # Petter Friberg
            "4688119",  # Ashish Ahuja
            "4099593",  # Bhargav Rao
            "1743880",  # Tunaki
            "559745",   # Floern
            "4687348"   # FelixSFD
        ]
    }

    code_privileged_users = None

    smokeDetector_user_id = {
        charcoal_room_id: "120914",
        meta_tavern_room_id: "266345",
        socvr_room_id: "3735529",
        '111347': '3735529'
    }

    censored_committer_names = {"3f4ed0f38df010ce300dba362fa63a62": "Undo1"}

    commit = git_commit_info()
    if md5(commit['author'][0].encode('utf-8')).hexdigest() in censored_committer_names:
        commit['author'] = censored_committer_names[md5(commit['author'][0].encode('utf-8')).hexdigest()]

    commit_with_author = "%s (%s: *%s*)" % (
        commit['id'],
        commit['author'][0] if type(commit['author']) in [list, tuple] else commit['author'],
        commit['message'])

    on_master = "HEAD detached" not in git_status()

    charcoal_hq = None
    tavern_on_the_meta = None
    socvr = None

    s = ""
    s_reverted = ""
    specialrooms = []
    apiquota = -1
    bodyfetcher = None
    se_sites = []

    users_chatting = {
        meta_tavern_room_id: [],
        charcoal_room_id: [],
        socvr_room_id: [],
        '111347': []
    }

    why_data = []
    why_data_allspam = []
    notifications = []
    listen_to_these_if_edited = []
    multiple_reporters = []
    api_calls_per_site = {}

    standby_message = ""
    standby_mode = False

    api_request_lock = threading.Lock()

    num_posts_scanned = 0
    post_scan_time = 0
    posts_scan_stats_lock = threading.Lock()

    config = RawConfigParser()

    if os.path.isfile('config'):
        config.read('config')
    else:
        config.read('config.ci')

    latest_smokedetector_messages = {
        meta_tavern_room_id: [],
        charcoal_room_id: [],
        socvr_room_id: [],
        '111347': []
    }

    # environ_or_none defined in helpers.py
    bot_name = environ_or_none("SMOKEDETECTOR_NAME") or "SmokeDetector"
    bot_repository = environ_or_none("SMOKEDETECTOR_REPO") or "//github.com/Charcoal-SE/SmokeDetector"
    chatmessage_prefix = "[{}]({})".format(bot_name, bot_repository)

    site_id_dict = {}
    post_site_id_to_question = {}

    location = config.get("Config", "location")

    metasmoke_ws = None

    try:
        metasmoke_host = config.get("Config", "metasmoke_host")
    except NoOptionError:
        metasmoke_host = None
        log('info', "metasmoke host not found. Set it as metasmoke_host in the config file. "
                    "See https://github.com/Charcoal-SE/metasmoke.")

    try:
        metasmoke_key = config.get("Config", "metasmoke_key")
    except NoOptionError:
        metasmoke_key = ""
        log('info', "No metasmoke key found, which is okay if both are running on the same host")

    try:
        metasmoke_ws_host = config.get("Config", "metasmoke_ws_host")
    except NoOptionError:
        metasmoke_ws_host = ""
        log('info', "No metasmoke websocket host found, which is okay if you're anti-websocket")

    try:
        github_username = config.get("Config", "github_username")
        github_password = config.get("Config", "github_password")
    except NoOptionError:
        github_username = None
        github_password = None
# Third party imports
import pytest
import webtest
from html.parser import HTMLParser  # Python 2: from HTMLParser import HTMLParser

# Local Imports
from pypiserver import __main__, bottle
import tests.test_core as test_core

# Enable logging to detect any problems with it
## __main__.init_logging(level=logging.NOTSET)

hp = HTMLParser()


@pytest.fixture()
def _app(app):
    return app.module


@pytest.fixture
def app(tmpdir):
    from pypiserver import app
    return app(root=tmpdir.strpath, authenticated=[])


@pytest.fixture
def testapp(app):
    # The original body was truncated here; wrapping the app in webtest.TestApp
    # is the conventional completion so tests can issue requests against it.
    return webtest.TestApp(app)
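
# A usage sketch (assumed, not part of the original suite): with the fixtures
# above, webtest can drive the WSGI app directly. The route and assertion here
# are illustrative.
def test_root_is_reachable(testapp):
    # GET the index page of the freshly created, empty package root
    resp = testapp.get("/")
    assert resp.status_int == 200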
import random

import requests
from django.shortcuts import redirect, render
from html.parser import HTMLParser  # Python 2: from HTMLParser import HTMLParser

# Assumed import path; QuestionAnswered may live elsewhere in this project
from .models import QuestionAnswered


def gamepage(request):
    if request.method == 'GET':
        response = requests.get('https://opentdb.com/api.php?amount=1&type=multiple')
        data = response.json()
        for item in data['results']:  # get to the right list
            question = item['question']
            request.session['correct_answer'] = item['correct_answer']
            request.session['incorrect_answers'] = item['incorrect_answers']
            # request.session allows data from two different dictionaries to persist
            # through the requests and allows them to be combined into one list
        h = HTMLParser()
        question = h.unescape(question)
        print(question)
        correct = request.session['correct_answer']  # reference key for the correct answer
        choices = request.session['incorrect_answers']  # reference key for the incorrect answers
        answers = []
        for choice in choices:
            answers.append(choice)  # this adds the wrong answers to the list "answers"
        answers.append(correct)  # this adds the right answer to the list "answers"
        random.shuffle(answers)  # this randomizes all the answers in the list "answers"
        # unescape() takes a string, so decode each answer individually
        # (calling it on the whole list would fail)
        answers = [h.unescape(answer) for answer in answers]
        print(answers)
        print(correct)
        context = {
            'trivia': question,
            'answer0': answers[0],
            'answer1': answers[1],
            'answer2': answers[2],
            'answer3': answers[3],
            'correct': correct,
        }
    else:
        correct = request.session['correct_answer']  # reference key for the correct answer
        choices = request.session['incorrect_answers']  # reference key for the incorrect answers
        answers = []
        for choice in choices:
            answers.append(choice)  # this adds the wrong answers to the list "answers"
        answers.append(correct)  # this adds the right answer to the list "answers"
        random.shuffle(answers)  # this randomizes all the answers in the list "answers"
        h = HTMLParser()
        answers = [h.unescape(answer) for answer in answers]
        context = {
            'answer0': answers[0],
            'answer1': answers[1],
            'answer2': answers[2],
            'answer3': answers[3],
            'correct': correct,
            'number_already_answered': QuestionAnswered.objects.filter().count(),
        }
        print(answers)
        print(correct)
        if 'a0' in request.POST:
            print("checked a0")
            answerzero = request.POST['a0']
            print("---", answerzero, "---")
            if answerzero == correct:
                print("---Correct!---")
                return redirect('correct/')
            else:
                print("---Picked 0, wrong answer---")
                return redirect('incorrect/')
        if 'a1' in request.POST:
            print("checked a1")
            answerone = request.POST['a1']
            print("---", answerone, "---")
            if answerone == correct:
                print("---Correct!---")
                return redirect('correct/')
            else:
                print("---Picked 1, wrong answer---")
                return redirect('incorrect/')
        if 'a2' in request.POST:
            print("checked a2")
            answertwo = request.POST['a2']
            print("---", answertwo, "---")
            if answertwo == correct:
                print("---Correct!---")
                return redirect('correct/')
            else:
                print("---Picked 2, wrong answer---")
                return redirect('incorrect/')
        if 'a3' in request.POST:
            print("checked a3")
            answerthree = request.POST['a3']
            print("---", answerthree, "---")
            if answerthree == correct:
                print("---Correct!---")
                return redirect('correct/')
            else:
                print("---Picked 3, wrong answer---")
                return redirect('incorrect/')
    return render(request, 'game.html', context)
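
# The four 'a0'..'a3' branches above differ only in the POST key they read. A
# hedged refactoring sketch, assuming the same template field names, that is
# intended to behave identically:
def check_answer(request, correct):
    # Return a redirect for the first submitted answer, or None if none was posted
    for index, key in enumerate(('a0', 'a1', 'a2', 'a3')):
        if key in request.POST:
            picked = request.POST[key]
            if picked == correct:
                return redirect('correct/')
            print("---Picked %d, wrong answer---" % index)
            return redirect('incorrect/')
    return None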
def html_unescape(content):
    html_parser = HTMLParser()
    html = html_parser.unescape(content)
    return html
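
# A quick usage sketch of the helper above: HTML entity references are decoded
# back to their literal characters.
#
#   html_unescape(u'Tom &amp; Jerry at the caf&eacute;')
#   # -> u'Tom & Jerry at the café'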
def getThyssenGenerator():
    """
    Generator to return Thyssen paintings
    """
    # 1 - 1000 ?
    baseUrl = u'http://www.museothyssen.org/en/thyssen/ficha_obra/%s'
    htmlparser = HTMLParser()
    for i in range(1, 1500):
        url = baseUrl % (i,)
        print(url)
        metadata = {}
        metadata['collectionqid'] = u'Q176251'
        metadata['collectionshort'] = u'Thyssen-Bornemisza'
        metadata['locationqid'] = u'Q176251'
        metadata['instanceofqid'] = u'Q3305213'
        metadata['idpid'] = u'P217'
        metadata['url'] = url
        metadata['url_en'] = url
        metadata['url_es'] = u'http://www.museothyssen.org/thyssen/ficha_obra/%s' % (i,)

        itemPageEn = requests.get(metadata['url_en'])
        itemPageEs = requests.get(metadata['url_es'])
        itemPageEn.encoding = 'utf-8'
        itemPageEs.encoding = 'utf-8'
        itemPageEnData = itemPageEn.text
        #print itemPageEn.encoding
        #itemPageEnDataCleaned = re.sub("(<!--.*?-->)", "", itemPageEn.text, flags=re.DOTALL) # Strip out comment junk
        #pywikibot.showDiff(itemPageEnData, itemPageEnDataCleaned)
        #pywikibot.output(itemPageEnDataCleaned)
        itemPageEsData = itemPageEs.text
        if len(itemPageEn.text) < 100:
            # That's not a valid page
            continue

        regexes = {}
        regexes['creatorname'] = u'<dt>Autor:</dt>[\r\n\s]+<dd>[\r\n\s]+<a href="[^"]+" title="[^"]+">[\r\n\s]+<span>([^<]+)</span></a>[\r\n\s]+</dd>'
        regexes['title'] = u'tulo:</dt>[\r\n\s]+<dd class="dd_titulo"><em>([^<]+)<'  # Also possible to have <BR />/em></dd>
        regexes['date'] = u'<dt>Fecha:</dt>[\r\n\s]+<dd class="dd_fecha">([^<]+\d+[^<]+)</dd>'
        # Medium doesn't work
        #regexes['medium'] = u'<dt>T.?cnica:'#</dt>[\r\n\s]+'#<dd class="dd_tecnica">([^<]+)</dd>'
        #regexes['medium'] = u'cnica:</dt>[\r\n\s]+<dd class="dd_tecnica">([^<]+)</dd>'
        regexes['size'] = u'<dt>Medidas:</dt>[\r\n\s]+<dd class="dd_medidas">[\r\n\s]+(.+)x(.+)cm[\r\n\s]+</dd>'
        regexes['id'] = u'<dt>Numero de inventario</dt>[\r\n\s]+<dd><abbr title="INV. Nr.">INV. Nr.</abbr>([^<]+)</dd>'

        matches = {}
        matches['creatorname'] = re.search(regexes['creatorname'], itemPageEnData)
        metadata['creatorname'] = matches['creatorname'].group(1).strip()
        metadata['description'] = {
            u'nl': u'%s van %s' % (u'schilderij', metadata['creatorname'],),
            u'en': u'%s by %s' % (u'painting', metadata['creatorname'],),
        }
        matches['titleen'] = re.search(regexes['title'], itemPageEnData)
        matches['titlees'] = re.search(regexes['title'], itemPageEsData)
        metadata['title'] = {
            u'en': htmlparser.unescape(matches['titleen'].group(1).strip()),
            u'es': htmlparser.unescape(matches['titlees'].group(1).strip()),
        }
        matches['date'] = re.search(regexes['date'], itemPageEnData)
        if matches['date']:
            metadata['date'] = matches['date'].group(1).strip()
        #matches['medium']=re.search(regexes['medium'], itemPageEnData)
        #metadata['medium']=matches['medium'].group(1).strip()
        # Ignore size for now. Needs two fields anyway
        #matches['size']=re.search(regexes['size'], itemPageEnData)
        #metadata['size']=matches['size'].group(1)
        matches['id'] = re.search(regexes['id'], itemPageEnData)
        metadata['id'] = matches['id'].group(1).strip()
        # Crude way to filter out the non-paintings
        if not metadata['id'].startswith(u'(CTB.DEC'):
            yield metadata
    '''
    for field, regex in regexes.iteritems():
        matches[field] = re.search(regex, itemPageEnData)
        print field
        #print regex
        if matches[field]:
            print matches[field].group(1)
        else:
            print u'No match found'
    #print itemPageEnData

    headerRegex = u'<header>[\r\n\s]+<h3>([^<]*)</h3>[\r\n\s]+<h1>([^<]*)</h1>[\r\n\s]+<p>([^<]*)</p>[\r\n\s]+</header>'
    matchEn = re.search(headerRegex, itemPageEnData)
    if not matchEn:
        pywikibot.output(u'The data for this painting is BORKED!')
        continue
    matchRu = re.search(headerRegex, itemPageRuData)

    metadata['title'] = {
        u'en' : htmlparser.unescape(matchEn.group(2)),
        u'ru' : htmlparser.unescape(matchRu.group(2)),
    }
    #pywikibot.output(metadata.get('title'))

    painterName = matchEn.group(1)

    painterRegexes = [u'([^,]+),\s([^\.]+)\.(.+)',
                      u'([^,]+),\s([^,]+),(.+)',
                      ]
    for painterRegex in painterRegexes:
        painterMatch = re.match(painterRegex, painterName)
        if painterMatch:
            painterName = '%s %s' % (painterMatch.group(2), painterMatch.group(1),)
            break  # stop after the first matching pattern; 'continue' would keep re-mangling the name
    metadata['creatorname'] = painterName

    metadata['description'] = {
        u'nl' : u'%s van %s' % (u'schilderij', painterName,),
        u'en' : u'%s by %s' % (u'painting', painterName,),
    }
    #pywikibot.output(metadata.get('description'))

    invRegex = u'<p>[\r\n\s]+Inventory Number:[\r\n\s]+</p>[\r\n\s]+</div>[\r\n\s]+<div class="her-data-tbl-val">[\r\n\s]+<p>[\r\n\s]+(.*\d+)[\r\n\s]+</p>'
    invMatch = re.search(invRegex, itemPageEnData)

    if not invMatch:
        pywikibot.output(u'No inventory number found! Skipping')
        continue
    metadata['id'] = invMatch.group(1)

    dateDimRegex = u'var descriptionWoA = \'.*Date of creation: (.+), Dimension: ([^\s]+)x([^\s]+)\s?[sc]m\.?\';'
    dateDimMatch = re.search(dateDimRegex, itemPageEnData)
    if dateDimMatch:
        metadata['inception'] = dateDimMatch.group(1)
        metadata['height'] = dateDimMatch.group(2)
        metadata['heightunitqid'] = u'Q174728'
        metadata['width'] = dateDimMatch.group(3)  # width is the third capture group, not the second
        metadata['widthunitqid'] = u'Q174728'

    yield metadata
    #print matchEn.group(1)
    #print matchEn.group(2)
    #print matchEn.group(3)
    '''
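
# A hedged sketch of how a generator like the one above might be consumed. The
# surrounding bot framework is not shown in this file; the function name and
# the printed fields are illustrative only.
def dump_thyssen_paintings():
    for painting in getThyssenGenerator():
        # Every yielded dict carries at least 'id' and 'url' per the code above
        print('%(id)s: %(url)s' % painting)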