def __init__(self, cookie_filename=None, user_agent=None, timeout=None, **kwargs):
    """Build a mechanize-backed browser with cookies, headers and a timeout.

    :param cookie_filename: optional LWP-format cookie file to preload
    :param user_agent: User-agent header value; defaults to an MSIE 9 string
    :param timeout: socket timeout; mechanize's global default when ``None``
    :raises DependencyNotInstalledError: if mechanize is not importable
    """
    try:
        import mechanize
    except ImportError:
        raise DependencyNotInstalledError('mechanize')

    if user_agent is None:
        user_agent = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'

    self.browser = mechanize.Browser()

    # The jar is kept on the instance so callers can persist/reload cookies.
    self.cj = cookielib.LWPCookieJar()
    if cookie_filename is not None:
        self.cj.load(cookie_filename)
    self.browser.set_cookiejar(self.cj)

    # Enable the usual protocol niceties in one pass.
    for enable in (self.browser.set_handle_equiv,
                   self.browser.set_handle_gzip,
                   self.browser.set_handle_redirect,
                   self.browser.set_handle_referer):
        enable(True)
    # Deliberately ignore robots.txt.
    self.browser.set_handle_robots(False)

    self.browser.addheaders = [('User-agent', user_agent)]

    # NOTE: '_default_timout' (sic) is read elsewhere (e.g. spynner_open),
    # so the misspelled attribute name must be kept as-is.
    if timeout is None:
        self._default_timout = mechanize._sockettimeout._GLOBAL_DEFAULT_TIMEOUT
    else:
        self._default_timout = timeout
def __init__(self, user_agent=None, **kwargs):
    """Create a spynner browser instance.

    :param user_agent: User-agent string; an MSIE 9 value when ``None``
    :param kwargs: forwarded verbatim to ``spynner.Browser``
    :raises DependencyNotInstalledError: if spynner is not importable
    """
    try:
        import spynner
    except ImportError:
        raise DependencyNotInstalledError('spynner')

    # Only substitute the default when the caller passed nothing at all.
    user_agent = (user_agent if user_agent is not None
                  else 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)')
    self.br = spynner.Browser(user_agent=user_agent, **kwargs)
def beautiful_soup(html, logger=None):
    """Parse ``html`` with BeautifulSoup4, preferring the fast lxml backend.

    Falls back to the stdlib ``html.parser`` backend when lxml is not
    installed. (Previously the fallback left the parser unspecified, which
    makes bs4 guess — emitting a warning and producing environment-dependent
    parse trees.)

    :param html: markup string to parse
    :param logger: optional logger, warned when lxml is unavailable
    :return: a ``BeautifulSoup`` document
    :raises DependencyNotInstalledError: if bs4 itself is missing
    """
    try:
        from bs4 import BeautifulSoup, FeatureNotFound
    except ImportError:
        raise DependencyNotInstalledError("BeautifulSoup4")
    try:
        return BeautifulSoup(html, 'lxml')
    except FeatureNotFound:
        if logger is not None:
            logger.warning('lxml not installed')
        # Explicit, always-available stdlib parser instead of bs4's guess.
        return BeautifulSoup(html, 'html.parser')
def __init__(self, cookie_filename=None, user_agent=None):
    """Set up a mechanize browser with an LWP cookie jar and handlers.

    :param cookie_filename: optional LWP-format cookie file to preload
    :param user_agent: User-agent header; defaults to an MSIE 9 string
    :raises DependencyNotInstalledError: if mechanize is not importable
    """
    try:
        import mechanize
    except ImportError:
        raise DependencyNotInstalledError('mechanize')

    if user_agent is None:
        user_agent = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'

    self.browser = mechanize.Browser()
    self.cj = cookielib.LWPCookieJar()
    if cookie_filename is not None:
        self.cj.load(cookie_filename)
    self.browser.set_cookiejar(self.cj)

    # Standard handler configuration; robots.txt is ignored on purpose.
    for enable in (self.browser.set_handle_equiv,
                   self.browser.set_handle_redirect,
                   self.browser.set_handle_referer):
        enable(True)
    self.browser.set_handle_robots(False)

    self.browser.addheaders = [('User-agent', user_agent)]
def spynner_open(self, url, data=None, headers=None, method='GET', wait_for_text=None, wait_for_selector=None, tries=None):
    """Load ``url`` in the spynner browser and return the browser object.

    :param url: address to load (``unicode`` values are utf-8 encoded and
                unquoted before loading)
    :param data: request body, passed through to ``Browser.load``
    :param headers: extra request headers
    :param method: 'GET' (default) or 'POST'
    :param wait_for_text: block until this text appears in the page html
    :param wait_for_selector: block until this CSS selector matches
    :param tries: retry count forwarded to ``Browser.load``
    :return: ``self.br``
    :raises DependencyNotInstalledError: if PyQt4 is not importable
    """
    try:
        from PyQt4.QtNetwork import QNetworkAccessManager
    except ImportError:
        raise DependencyNotInstalledError('PyQt4')

    # Readiness callback: wait for a text fragment, or for a CSS selector
    # to match; None lets spynner use its default load detection.
    if wait_for_text is not None:
        def wait_callback(br):
            return wait_for_text in br.html
    elif wait_for_selector is not None:
        def wait_callback(br):
            return not br.webframe.findFirstElement(
                wait_for_selector).isNull()
    else:
        wait_callback = None

    operation = QNetworkAccessManager.GetOperation
    if method == 'POST':
        operation = QNetworkAccessManager.PostOperation

    # isinstance (rather than `type(url) == unicode`) so unicode
    # subclasses are handled identically to plain unicode strings.
    if isinstance(url, unicode):
        url = urllib2.unquote(url.encode('utf-8'))

    self.br.load(url, wait_callback=wait_callback, tries=tries,
                 operation=operation, body=data, headers=headers,
                 load_timeout=self._default_timout)
    return self.br
from cola.core.parsers import Parser
from cola.core.utils import urldecode, beautiful_soup
from cola.core.errors import DependencyNotInstalledError, FetchBannedError
from cola.core.logs import get_logger

from login import WeiboLoginFailure
from bundle import WeiboUserBundle
from storage import DoesNotExist, Q, WeiboUser, Friend,\
    MicroBlog, Geo, UserInfo, WorkInfo, EduInfo,\
    Comment, Forward, Like, ValidationError
from conf import fetch_forward, fetch_comment, fetch_like, fetch_n_comments

try:
    from dateutil.parser import parse
except ImportError:
    raise DependencyNotInstalledError('python-dateutil')

# Per-request timeout (seconds) applied to the opener in __init__ below.
TIMEOUT = 30.0


class WeiboParser(Parser):
    """Base parser for one weibo user's pages.

    ``bundle.label`` is stored as ``self.uid`` — presumably the weibo user
    id; TODO confirm against the code that builds the bundle.
    """

    def __init__(self, opener=None, url=None, bundle=None, **kwargs):
        super(WeiboParser, self).__init__(opener=opener, url=url, **kwargs)
        self.bundle = bundle
        self.uid = bundle.label
        self.opener.set_default_timeout(TIMEOUT)
        # Ensure a logger exists even if the base class did not set one.
        if not hasattr(self, 'logger') or self.logger is None:
            self.logger = get_logger(name='weibo_parser')

    def _check_url(self, dest_url, src_url):
        # Two URLs are "the same" when they match ignoring the query string.
        return dest_url.split('?')[0] == src_url.split('?')[0]
@author: Chine
'''

import urlparse
import urllib

from cola.core.parsers import Parser
from cola.core.errors import DependencyNotInstalledError

from bundle import WeiboSearchBundle
from storage import MicroBlog, DoesNotExist, Q

try:
    from bs4 import BeautifulSoup
except ImportError:
    raise DependencyNotInstalledError('BeautifulSoup4')
try:
    from dateutil.parser import parse
except ImportError:
    raise DependencyNotInstalledError('python-dateutil')
try:
    from spynner import SpynnerTimeout
except ImportError:
    raise DependencyNotInstalledError('spynner')


class WeiboSearchParser(Parser):
    """Parser bound to a search bundle.

    NOTE(review): presumably ``bundle`` is a ``WeiboSearchBundle`` carrying
    the search keyword — confirm against the caller.
    """

    def __init__(self, opener=None, url=None, bundle=None, **kwargs):
        super(WeiboSearchParser, self).__init__(opener=opener, url=url, **kwargs)
        self.bundle = bundle
See the License for the specific language governing permissions and limitations under the License. Created on 2013-5-16 @author: Chine ''' import os from cola.core.errors import DependencyNotInstalledError try: import yaml except ImportError: raise DependencyNotInstalledError('pyyaml') class PropertyObject(dict): """ Wrapper of dict, providing the ability to get the key by the property. As an instance: >>> obj = PropertyObject({'k': 'v'}) >>> obj.k 'v' >>> obj.update(k={'sk': 'sv'}) >>> obj {'k': {'sk': 'sv'}} >>> obj.k.sk
limitations under the License.

Created on 2019-8-26

@author: peniridis
"""

# Standard library imports
import asyncio

# Related third party imports
try:
    import pyppeteer
except ImportError:
    from cola.core.errors import DependencyNotInstalledError
    raise DependencyNotInstalledError('pyppeteer')
from pyppeteer.network_manager import Request, Response

# Local application/library specific imports


class Opener(object):
    """Abstract fetcher interface; concrete openers implement open/read."""

    def open(self, url):
        # Fetch ``url``; concrete subclasses must override.
        raise NotImplementedError

    def read(self):
        # Must be overridden by concrete subclasses.
        raise NotImplementedError


class PuppeteerOpener(Opener):
from cola.core.urls import UrlPatterns, Url
from cola.core.parsers import Parser
from cola.core.opener import MechanizeOpener
from cola.core.errors import DependencyNotInstalledError
from cola.core.config import Config
from cola.job import JobDescription
import urlparse
import sys

# Make the project root importable when this module is run directly.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

try:
    from bs4 import BeautifulSoup
except ImportError:
    raise DependencyNotInstalledError('BeautifulSoup4')
try:
    from dateutil.parser import parse
except ImportError:
    raise DependencyNotInstalledError('python-dateutil')
try:
    from mongoengine import connect, DoesNotExist, \
        Document, StringField, DateTimeField
except ImportError:
    raise DependencyNotInstalledError('mongoengine')


def get_user_conf(s):
    """Return the absolute path of ``s`` relative to this module's directory."""
    # A real def instead of an assigned lambda (PEP 8 E731).
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), s)

user_conf = get_user_conf('test.yaml')
@author: Chine
'''

from cola.core.errors import DependencyNotInstalledError

from conf import mongo_host, mongo_port, db_name, shard_key

try:
    from mongoengine import connect, Document, EmbeddedDocument, \
        DoesNotExist, Q, \
        StringField, DateTimeField, EmailField, \
        BooleanField, URLField, IntField, FloatField, \
        ListField, EmbeddedDocumentField, \
        ValidationError
except ImportError:
    raise DependencyNotInstalledError('mongoengine')

# Connect at import time; every model defined below shares this connection.
connect(db_name, host=mongo_host, port=mongo_port)

# Re-export mongoengine symbols so sibling modules can import them from here.
DoesNotExist = DoesNotExist
Q = Q
ValidationError = ValidationError


class Forward(EmbeddedDocument):
    # Embedded record of a forward (repost). Field meanings are inferred
    # from their names — mid/uid presumably the status id and the user id;
    # TODO confirm against the parser that populates them.
    mid = StringField(required=True)
    uid = StringField(required=True)
    avatar = URLField()
    content = StringField()
    created = DateTimeField()
@author: Chine
'''

import urllib
import base64
import binascii
import re
import json

from cola.core.errors import DependencyNotInstalledError,\
    LoginFailure

try:
    import rsa
except ImportError:
    raise DependencyNotInstalledError("rsa")


class WeiboLoginFailure(LoginFailure):
    """Weibo-specific login failure."""
    pass


class WeiboLogin(object):
    """Drives the weibo login flow through the supplied opener."""

    def __init__(self, opener, username, passwd):
        self.opener = opener
        self.username = username
        self.passwd = passwd

    def get_user(self, username):
        # Encode the account name the way the login form expects:
        # URL-quote, then base64; [:-1] drops the trailing newline that
        # base64.encodestring appends.
        username = urllib.quote(username)
        return base64.encodestring(username)[:-1]

    def get_passwd(self, passwd, pubkey, servertime, nonce):
Created on 2013-7-15

@author: Chine
'''

import re

from cola.core.logs import get_logger
from cola.core.errors import DependencyNotInstalledError
from cola.core.utils import beautiful_soup

try:
    from bs4 import NavigableString
except ImportError:
    raise DependencyNotInstalledError("BeautifulSoup4")

from cola.core.extractor.preprocess import PreProcessor

__all__ = ['Extractor']

# Readability-style scoring patterns used to rate candidate content nodes.
REGEXES = {
    'unlikelyCandidatesRe': re.compile('combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|aside|sponsor',re.I),
    'okMaybeItsACandidateRe': re.compile('and|article|body|column|main',re.I),
    'positiveRe': re.compile('article|body|content|entry|hentry|page|pagination|post|text',re.I),
    'negativeRe': re.compile('combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget',re.I),
    'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)',re.I),
    'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
    'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
    # NOTE(review): the trailing '/' in the next two patterns looks like a
    # leftover from JavaScript /regex/ literals. As written, trimRe's
    # second alternative and normalizeRe require a literal '/' to match,
    # which is almost certainly unintended — confirm and drop the slash.
    'trimRe': re.compile('^\s+|\s+$/'),
    'normalizeRe': re.compile('\s{2,}/'),
import os import urlparse from cola.core.urls import UrlPatterns, Url from cola.core.parsers import Parser from cola.core.opener import MechanizeOpener from cola.core.errors import DependencyNotInstalledError from cola.core.config import Config from cola.core.extractor import Extractor from cola.core.extractor.utils import host_for_url from cola.job import Job try: from bs4 import BeautifulSoup except ImportError: raise DependencyNotInstalledError('BeautifulSoup4') try: from mongoengine import connect, DoesNotExist, \ Document, StringField, URLField except ImportError: raise DependencyNotInstalledError('mongoengine') try: from chardet import detect except ImportError: raise DependencyNotInstalledError('chardet') get_user_conf = lambda s: os.path.join( os.path.dirname(os.path.abspath(__file__)), s) user_conf = get_user_conf('test.yaml')