from jsb.lib.examples import examples
from jsb.plugs.common.tinyurl import get_tinyurl
from jsb.lib.persistconfig import PersistConfig

## basic import

import re
import urlparse
import xmlrpclib
import socket
import logging

## defines

# NOTE(review): PlugPersist is not imported in the visible part of this
# file -- presumably it is imported above this fragment; confirm.
cfg = PlugPersist('urlinfo', {})
plugcfg = PersistConfig()
# Registers the "showpictures" option with the value 1 (presumably its
# default -- confirm against PersistConfig.define).
plugcfg.define("showpictures", 1)

## sanitize function

# NOTE(review): the line structure and nesting below were reconstructed from
# a whitespace-collapsed source -- confirm against the original layout.
def sanitize(text):
    """ Remove non-urls word by word. """
    # Drop leading and trailing whitespace.
    text = text.strip()
    # Collapse every run of two or more whitespace characters into one space,
    # so that the split(' ') below does not yield empty words for such runs.
    text = re.sub('\s\s+', ' ', text)
    tmp = ''
    for i in text.split(' '):
        # Words shorter than five characters are discarded outright.
        if len(i) >= 5:
            # Keep the word when 'www.' or 'http' occurs anywhere in it
            # (find() != -1 is a substring test, not a prefix test).
            if i.find('www.') != -1 or i.find('http') != -1:
                tmp += i + ' '
    # Remove the trailing space appended after the last kept word.
    tmp = tmp.strip()
    # Second accumulator. NOTE(review): the code that fills it and the
    # function's return statement are cut off in this view.
    tmp2 = ''
from jsb.lib.persistconfig import PersistConfig
from jsb.lib.plugins import plugs as plugins

## basic imports

import urllib
import urllib2
import urlparse
import copy
import re
import socket

## defines

# NOTE(review): PlugPersist is not imported in the visible part of this
# file -- presumably it is imported above this fragment; confirm.
cfg = PlugPersist('snarf.cfg')
pcfg = PersistConfig()
# Registers the 'allow' option with a list of three MIME type strings
# (presumably the content types the plugin will fetch -- confirm).
pcfg.define('allow', ['text/plain', 'text/html', 'application/xml'])

# Captures the text between <title> and </title>; case-insensitive, and
# re.S lets '.' match newlines so a title may span several lines.
re_html_title = re.compile(u'<title>(.*?)</title>', re.I | re.M | re.S)
# Captures an http:// or https:// URL up to the next whitespace character.
re_url_match = re.compile(u'((?:http|https)://\S+)')
# Patterns keyed by field name. NOTE(review): these look like scrapers for
# an HTML validator's result page -- confirm against the code that uses them.
# NOTE(review): line structure was reconstructed from a whitespace-collapsed
# source, and the literal is cut off after the 'size' key in this view.
re_html_valid = {
    # Matches "Failed validation, N error(s)" or "Passed validation".
    'result': re.compile('(Failed validation, \d+ errors?|Passed validation)', re.I | re.M),
    # Captures the cell text that follows the "Modified:" table header.
    'modified': re.compile('<th>Modified:</th><td colspan="2">([^<]+)</td>', re.I | re.M),
    # Captures the cell text that follows the "Server:" table header.
    'server': re.compile('<th>Server:</th><td colspan="2">([^<]+)</td>', re.I | re.M),
    'size':