Beispiel #1
0
    def __init__(self,
                 cookie_filename=None,
                 user_agent=None,
                 timeout=None,
                 **kwargs):
        """Build a mechanize browser with a cookie jar and default headers.

        :param cookie_filename: optional path of a cookie file that is loaded
            into the LWP cookie jar before the jar is attached to the browser.
        :param user_agent: value sent as the ``User-agent`` header; an MSIE 9
            string is used when omitted.
        :param timeout: default timeout kept on the instance; mechanize's
            global default sentinel is stored when omitted.
        :param kwargs: accepted but not used by this initializer.
        :raises DependencyNotInstalledError: if ``mechanize`` cannot be
            imported.
        """
        try:
            import mechanize
        except ImportError:
            raise DependencyNotInstalledError('mechanize')

        browser = self.browser = mechanize.Browser()

        jar = self.cj = cookielib.LWPCookieJar()
        if cookie_filename is not None:
            jar.load(cookie_filename)
        browser.set_cookiejar(jar)

        # Handler switches: everything on except robots.txt handling.
        browser.set_handle_equiv(True)
        browser.set_handle_gzip(True)
        browser.set_handle_redirect(True)
        browser.set_handle_referer(True)
        browser.set_handle_robots(False)

        agent = user_agent
        if agent is None:
            agent = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'
        browser.addheaders = [('User-agent', agent)]

        # The attribute spelling (`_default_timout`) is kept unchanged because
        # it is part of the instance state other code may read.
        self._default_timout = (
            mechanize._sockettimeout._GLOBAL_DEFAULT_TIMEOUT
            if timeout is None else timeout)
Beispiel #2
0
    def __init__(self, user_agent=None, **kwargs):
        """Create the spynner browser held in ``self.br``.

        :param user_agent: user agent handed to ``spynner.Browser``; an
            MSIE 9 string is used when omitted.
        :param kwargs: forwarded unchanged to ``spynner.Browser``.
        :raises DependencyNotInstalledError: if ``spynner`` cannot be
            imported.
        """
        try:
            import spynner
        except ImportError:
            raise DependencyNotInstalledError('spynner')

        fallback_agent = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'
        agent = fallback_agent if user_agent is None else user_agent

        self.br = spynner.Browser(user_agent=agent, **kwargs)
Beispiel #3
0
def beautiful_soup(html, logger=None):
    """Parse ``html`` with BeautifulSoup, preferring the lxml parser.

    :param html: markup handed to ``BeautifulSoup``.
    :param logger: optional logger; receives a warning when the lxml
        parser is reported as unavailable.
    :return: the ``BeautifulSoup`` object built from ``html``.
    :raises DependencyNotInstalledError: if ``bs4`` cannot be imported.
    """
    try:
        from bs4 import BeautifulSoup, FeatureNotFound
    except ImportError:
        raise DependencyNotInstalledError("BeautifulSoup4")

    try:
        soup = BeautifulSoup(html, 'lxml')
    except FeatureNotFound:
        # lxml is missing: retry without naming a parser so that
        # BeautifulSoup picks one itself.
        if logger is not None:
            logger.warning('lxml not installed')
        soup = BeautifulSoup(html)
    return soup
Beispiel #4
0
    def __init__(self, cookie_filename=None, user_agent=None):
        """Build a mechanize browser with a cookie jar and default headers.

        :param cookie_filename: optional path of a cookie file that is loaded
            into the LWP cookie jar before the jar is attached to the browser.
        :param user_agent: value sent as the ``User-agent`` header; an MSIE 9
            string is used when omitted.
        :raises DependencyNotInstalledError: if ``mechanize`` cannot be
            imported.
        """
        try:
            import mechanize
        except ImportError:
            raise DependencyNotInstalledError('mechanize')

        browser = self.browser = mechanize.Browser()

        jar = self.cj = cookielib.LWPCookieJar()
        if cookie_filename is not None:
            jar.load(cookie_filename)
        browser.set_cookiejar(jar)

        # Handler switches: everything on except robots.txt handling.
        browser.set_handle_equiv(True)
        browser.set_handle_redirect(True)
        browser.set_handle_referer(True)
        browser.set_handle_robots(False)

        agent = user_agent
        if agent is None:
            agent = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'
        browser.addheaders = [('User-agent', agent)]
Beispiel #5
0
    def spynner_open(self,
                     url,
                     data=None,
                     headers=None,
                     method='GET',
                     wait_for_text=None,
                     wait_for_selector=None,
                     tries=None):
        """Load ``url`` through ``self.br`` and return that browser.

        :param url: address to load. Unicode values (including instances of
            ``unicode`` subclasses) are UTF-8 encoded and passed through
            ``urllib2.unquote`` before loading.
        :param data: forwarded to ``load`` as ``body``.
        :param headers: forwarded to ``load`` as ``headers``.
        :param method: ``'POST'`` selects ``PostOperation``; any other value
            uses ``GetOperation``.
        :param wait_for_text: when given, the wait callback reports whether
            this text occurs in the browser's ``html``.
        :param wait_for_selector: consulted only when ``wait_for_text`` is
            None; the wait callback then reports whether the first element
            matching the selector is non-null.
        :param tries: forwarded to ``load`` as ``tries``.
        :return: ``self.br``.
        :raises DependencyNotInstalledError: if ``PyQt4`` cannot be imported.
        """
        try:
            from PyQt4.QtNetwork import QNetworkAccessManager
        except ImportError:
            raise DependencyNotInstalledError('PyQt4')

        if wait_for_text is not None:

            def wait_callback(br):
                return wait_for_text in br.html
        elif wait_for_selector is not None:

            def wait_callback(br):
                return not br.webframe.findFirstElement(
                    wait_for_selector).isNull()
        else:
            wait_callback = None

        if method == 'POST':
            operation = QNetworkAccessManager.PostOperation
        else:
            operation = QNetworkAccessManager.GetOperation

        # isinstance (rather than an exact type comparison) so that unicode
        # subclasses are normalised to a byte string as well.
        if isinstance(url, unicode):
            url = urllib2.unquote(url.encode('utf-8'))

        self.br.load(url,
                     wait_callback=wait_callback,
                     tries=tries,
                     operation=operation,
                     body=data,
                     headers=headers,
                     load_timeout=self._default_timout)

        return self.br
Beispiel #6
0
from cola.core.parsers import Parser
from cola.core.utils import urldecode, beautiful_soup
from cola.core.errors import DependencyNotInstalledError, FetchBannedError
from cola.core.logs import get_logger

from login import WeiboLoginFailure
from bundle import WeiboUserBundle
from storage import DoesNotExist, Q, WeiboUser, Friend,\
                    MicroBlog, Geo, UserInfo, WorkInfo, EduInfo,\
                    Comment, Forward, Like, ValidationError
from conf import fetch_forward, fetch_comment, fetch_like, fetch_n_comments

try:
    from dateutil.parser import parse
except ImportError:
    raise DependencyNotInstalledError('python-dateutil')

# Default timeout passed to the opener's set_default_timeout() by WeiboParser
# (presumably seconds -- confirm against the opener implementation).
TIMEOUT = 30.0


class WeiboParser(Parser):
    """Parser tied to a bundle, with a default opener timeout and a logger."""

    def __init__(self, opener=None, url=None, bundle=None, **kwargs):
        """Initialise the base parser, then bind bundle, timeout and logger.

        :param opener: forwarded to ``Parser.__init__``.
        :param url: forwarded to ``Parser.__init__``.
        :param bundle: stored on the instance; its ``label`` attribute is
            read unconditionally, so a bundle is required in practice even
            though the parameter defaults to None.
        :param kwargs: forwarded to ``Parser.__init__``.
        """
        super(WeiboParser, self).__init__(opener=opener, url=url, **kwargs)
        self.bundle = bundle
        self.uid = bundle.label
        self.opener.set_default_timeout(TIMEOUT)

        # Only create a logger when none has been provided on the instance.
        has_logger = hasattr(self, 'logger') and self.logger is not None
        if not has_logger:
            self.logger = get_logger(name='weibo_parser')

    def _check_url(self, dest_url, src_url):
        """Return True when both URLs are equal up to their first ``?``."""
        dest_base = dest_url.split('?')[0]
        src_base = src_url.split('?')[0]
        return dest_base == src_base
Beispiel #7
0
@author: Chine
'''

import urlparse
import urllib

from cola.core.parsers import Parser
from cola.core.errors import DependencyNotInstalledError

from bundle import WeiboSearchBundle
from storage import MicroBlog, DoesNotExist, Q

try:
    from bs4 import BeautifulSoup
except ImportError:
    raise DependencyNotInstalledError('BeautifulSoup4')

try:
    from dateutil.parser import parse
except ImportError:
    raise DependencyNotInstalledError('python-dateutil')

try:
    from spynner import SpynnerTimeout
except ImportError:
    raise DependencyNotInstalledError('spynner')

class WeiboSearchParser(Parser):
    def __init__(self, opener=None, url=None, bundle=None, **kwargs):
        """Initialise the base parser and remember the bundle.

        :param opener: forwarded to ``Parser.__init__``.
        :param url: forwarded to ``Parser.__init__``.
        :param bundle: stored as ``self.bundle`` after the base initializer
            has run.
        :param kwargs: forwarded to ``Parser.__init__``.
        """
        super(WeiboSearchParser, self).__init__(opener=opener, url=url, **kwargs)
        self.bundle = bundle
Beispiel #8
0
See the License for the specific language governing permissions and
limitations under the License.

Created on 2013-5-16

@author: Chine
'''

import os

from cola.core.errors import DependencyNotInstalledError

try:
    import yaml
except ImportError:
    raise DependencyNotInstalledError('pyyaml')


class PropertyObject(dict):
    """
    Wrapper of dict, providing the ability to get the key by the property.

    As an instance:

    >>> obj = PropertyObject({'k': 'v'})
    >>> obj.k
    'v'
    >>> obj.update(k={'sk': 'sv'})
    >>> obj
    {'k': {'sk': 'sv'}}
    >>> obj.k.sk
Beispiel #9
0
limitations under the License.

Created on 2019-8-26

@author: peniridis
"""
# Standard library imports
import asyncio

# Related third party imports
try:
    import pyppeteer
except ImportError:
    from cola.core.errors import DependencyNotInstalledError

    raise DependencyNotInstalledError('pyppeteer')

from pyppeteer.network_manager import Request, Response

# Local application/library specific imports


class Opener(object):
    """Base opener interface; both methods raise until overridden."""

    def open(self, url):
        """Open ``url``. Not implemented on the base class."""
        raise NotImplementedError()

    def read(self):
        """Read from the opener. Not implemented on the base class."""
        raise NotImplementedError()


class PuppeteerOpener(Opener):
Beispiel #10
0
from cola.core.urls import UrlPatterns, Url
from cola.core.parsers import Parser
from cola.core.opener import MechanizeOpener
from cola.core.errors import DependencyNotInstalledError
from cola.core.config import Config
from cola.job import JobDescription
import urlparse
import sys

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

try:
    from bs4 import BeautifulSoup
except ImportError:
    raise DependencyNotInstalledError('BeautifulSoup4')

try:
    from dateutil.parser import parse
except ImportError:
    raise DependencyNotInstalledError('python-dateutil')

try:
    from mongoengine import connect, DoesNotExist, \
                            Document, StringField, DateTimeField
except ImportError:
    raise DependencyNotInstalledError('mongoengine')

def get_user_conf(s):
    """Return ``s`` joined onto the directory that contains this module.

    :param s: file name (or relative path) of a configuration file.
    :return: the result of joining this module's absolute directory with
        ``s``.
    """
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), s)


# Path of the configuration file that sits next to this module.
user_conf = get_user_conf('test.yaml')
Beispiel #11
0
@author: Chine
'''

from cola.core.errors import DependencyNotInstalledError

from conf import mongo_host, mongo_port, db_name, shard_key

try:
    from mongoengine import connect, Document, EmbeddedDocument, \
                            DoesNotExist, Q, \
                            StringField, DateTimeField, EmailField, \
                            BooleanField, URLField, IntField, FloatField, \
                            ListField, EmbeddedDocumentField, \
                            ValidationError
except ImportError:
    raise DependencyNotInstalledError('mongoengine')

# Open the mongoengine connection as a side effect of importing this module,
# using the database name, host and port taken from conf.
connect(db_name, host=mongo_host, port=mongo_port)

# Self-assignments that rebind the imported names at module level
# (presumably so other modules can import them from here and so that
# linters do not flag the imports as unused -- confirm).
DoesNotExist = DoesNotExist
Q = Q
ValidationError = ValidationError


class Forward(EmbeddedDocument):
    """Embedded document in which only ``mid`` and ``uid`` are required.

    NOTE(review): presumably one forward (repost) of a micro-blog, with
    ``mid`` the message id and ``uid`` the forwarding user's id -- confirm.
    """
    mid = StringField(required=True)
    uid = StringField(required=True)
    avatar = URLField()
    content = StringField()
    created = DateTimeField()
Beispiel #12
0
@author: Chine
'''

import urllib
import base64
import binascii
import re
import json

from cola.core.errors import DependencyNotInstalledError,\
                             LoginFailure

try:
    import rsa
except ImportError:
    raise DependencyNotInstalledError("rsa")

class WeiboLoginFailure(LoginFailure):
    """``LoginFailure`` subclass that adds no behaviour of its own."""
    pass

class WeiboLogin(object):
    def __init__(self, opener, username, passwd):
        self.opener = opener
        
        self.username = username
        self.passwd = passwd
        
    def get_user(self, username):
        """Return the URL-quoted ``username`` encoded as base64.

        :param username: user name to encode.
        :return: base64 text of ``urllib.quote(username)`` without any
            newline characters.
        """
        username = urllib.quote(username)
        # b64encode never wraps its output.  encodestring inserts a newline
        # after every 76 output characters, so stripping only the final
        # character left embedded newlines in the result for long user names.
        return base64.b64encode(username)
    
    def get_passwd(self, passwd, pubkey, servertime, nonce):
Created on 2013-7-15

@author: Chine
'''

import re

from cola.core.logs import get_logger
from cola.core.errors import DependencyNotInstalledError
from cola.core.utils import beautiful_soup

try:
    from bs4 import NavigableString
except ImportError:
    raise DependencyNotInstalledError("BeautifulSoup4")

from cola.core.extractor.preprocess import PreProcessor

__all__ = ['Extractor']

REGEXES = { 
    'unlikelyCandidatesRe': re.compile('combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|aside|sponsor',re.I),
    'okMaybeItsACandidateRe': re.compile('and|article|body|column|main',re.I),
    'positiveRe': re.compile('article|body|content|entry|hentry|page|pagination|post|text',re.I),
    'negativeRe': re.compile('combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget',re.I),
    'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)',re.I),
    'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
    'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
    'trimRe': re.compile('^\s+|\s+$/'),
    'normalizeRe': re.compile('\s{2,}/'),
Beispiel #14
0
import os
import urlparse

from cola.core.urls import UrlPatterns, Url
from cola.core.parsers import Parser
from cola.core.opener import MechanizeOpener
from cola.core.errors import DependencyNotInstalledError
from cola.core.config import Config
from cola.core.extractor import Extractor
from cola.core.extractor.utils import host_for_url
from cola.job import Job

try:
    from bs4 import BeautifulSoup
except ImportError:
    raise DependencyNotInstalledError('BeautifulSoup4')

try:
    from mongoengine import connect, DoesNotExist, \
                            Document, StringField, URLField
except ImportError:
    raise DependencyNotInstalledError('mongoengine')

try:
    from chardet import detect
except ImportError:
    raise DependencyNotInstalledError('chardet')

def get_user_conf(s):
    """Return ``s`` joined onto the directory that contains this module.

    :param s: file name (or relative path) of a configuration file.
    :return: the result of joining this module's absolute directory with
        ``s``.
    """
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), s)


# Path of the configuration file that sits next to this module.
user_conf = get_user_conf('test.yaml')