Exemple #1
0
 def test_export_binary(self):
     """Binary-mode export must byte-encode every field name and value."""
     with catch_warnings():
         # binary=True is deprecated upstream; silence that warning here.
         filterwarnings('ignore', category=ScrapyDeprecationWarning)
         exporter = PythonItemExporter(binary=True)
         item = self.item_class(name='John\xa3', age='22')
         # '\xa3' (pound sign) becomes the UTF-8 byte pair b'\xc2\xa3'.
         self.assertEqual(
             {b'name': b'John\xc2\xa3', b'age': b'22'},
             exporter.export_item(item),
         )
 def __init__(self, crawler):
     """Wire the extension to the shared hsref/pipe-writer singletons."""
     self.crawler = crawler
     self.hsref = hsref.hsref
     self.pipe_writer = pipe_writer
     self._write_item = self.pipe_writer.write_item
     # binary=False is only passed on Python 3; see
     # https://github.com/scrapy/scrapy/commit/c76190d491fca9f35b6758bdc06c34d77f5d9be9
     if IS_PYTHON2:
         exporter_kwargs = {}
     else:
         exporter_kwargs = {'binary': False}
     with ignore_warnings(category=ScrapyDeprecationWarning):
         self.exporter = PythonItemExporter(**exporter_kwargs)
Exemple #3
0
 def __init__(self):
     """Read Statnuts credentials/endpoints from settings, set up OAuth2."""
     self.client_id = settings.STATNUTS_CLIENT_ID
     self.client_secret = settings.STATNUTS_SECRET
     self.sn_store_url = settings.STATNUTS_URL + 'scrap/teams/'
     self.token_url = settings.STATNUTS_URL + 'o/token/'
     self.access_token = None  # fetched lazily on first use
     client = LegacyApplicationClient(client_id=self.client_id)
     self.oauth = OAuth2Session(client=client)
     self.exporter = PythonItemExporter(binary=False)
Exemple #4
0
class ScrapynutsPostTeamStatnutsPipeline(object):
    """Scrapy pipeline that POSTs scraped team items to the Statnuts API.

    An OAuth2 access token is fetched lazily on the first item and then
    reused for the rest of the crawl.
    """

    def __init__(self):
        self.client_id = settings.STATNUTS_CLIENT_ID
        self.client_secret = settings.STATNUTS_SECRET
        self.sn_store_url = settings.STATNUTS_URL + 'scrap/teams/'
        self.token_url = settings.STATNUTS_URL + 'o/token/'
        self.access_token = None  # set on the first process_item() call
        client = LegacyApplicationClient(client_id=self.client_id)
        self.oauth = OAuth2Session(client=client)
        self.exporter = PythonItemExporter(binary=False)

    def _get_access_token(self):
        """Fetch a token via the resource-owner-password OAuth2 flow."""
        # NOTE(review): verify=False disables TLS certificate checking —
        # confirm this is intentional (e.g. a self-signed dev endpoint).
        return self.oauth.fetch_token(token_url=self.token_url,
                                      client_id=self.client_id,
                                      verify=False,
                                      client_secret=self.client_secret,
                                      username='******',
                                      password='******')

    def process_item(self, item, spider):
        """Serialize *item* and POST it under its team name."""
        if self.access_token is None:
            self.access_token = self._get_access_token()
        item_json = self.exporter.export_item(item)
        team_name = item_json.get('name') + '/'
        self.oauth.post(urljoin(self.sn_store_url, team_name), json=item_json)
        print('Item stored with hash = %s' % item['name'])
class HubstorageExtension(object):
    """Extension that forwards scraped items to HubStorage via the shared
    pipe writer, tagging each item with its originating class name."""

    def __init__(self, crawler):
        self.hsref = hsref.hsref
        self.pipe_writer = pipe_writer
        self.crawler = crawler
        self._write_item = self.pipe_writer.write_item
        # binary=False is only passed on Python 3; see
        # https://github.com/scrapy/scrapy/commit/c76190d491fca9f35b6758bdc06c34d77f5d9be9
        exporter_kwargs = {} if IS_PYTHON2 else {'binary': False}
        with ignore_warnings(category=ScrapyDeprecationWarning):
            self.exporter = PythonItemExporter(**exporter_kwargs)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the extension and subscribe it to the crawler signals."""
        ext = cls(crawler)
        crawler.signals.connect(ext.item_scraped, signals.item_scraped)
        crawler.signals.connect(ext.spider_closed, signals.spider_closed)
        return ext

    def item_scraped(self, item, spider):
        """Export *item* and hand it to the pipe writer."""
        if not isinstance(item, (dict, BaseItem)):
            log.msg("Wrong item type: %s" % item, level=logging.ERROR)
            return
        exported = self.exporter.export_item(item)
        exported.setdefault("_type", type(item).__name__)
        self._write_item(exported)

    def spider_closed(self, spider, reason):
        # Record why the spider stopped as the job outcome.
        self.pipe_writer.set_outcome(reason)
class HubstorageExtension(object):
    """Extension that writes scraped items directly to the HubStorage job,
    tagging each item with its originating class name."""

    def __init__(self, crawler):
        self.hsref = hsref.hsref
        if not self.hsref.enabled:
            raise NotConfigured

        self.crawler = crawler
        self._write_item = self.hsref.job.items.write
        self.exporter = PythonItemExporter(binary=False)
        log.msg("HubStorage: writing items to %s" % self.hsref.job.items.url)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the extension and subscribe it to the crawler signals."""
        ext = cls(crawler)
        crawler.signals.connect(ext.item_scraped, signals.item_scraped)
        crawler.signals.connect(ext.spider_closed, signals.spider_closed)
        return ext

    def item_scraped(self, item, spider):
        """Serialize *item* and write it to the HubStorage job."""
        exported = self.exporter.export_item(item)
        exported.setdefault("_type", type(item).__name__)
        self._write_item(exported)

    def spider_closed(self, spider, reason):
        """Flush buffered items and record the close reason."""
        self.hsref.job.items.flush()
        self.hsref.job.metadata.update(close_reason=reason)
        self.hsref.job.metadata.save()
class HubstorageExtension(object):
    """Extension that forwards scraped items to HubStorage through the
    shared pipe writer."""

    def __init__(self, crawler):
        self.crawler = crawler
        self.hsref = hsref.hsref
        self.pipe_writer = pipe_writer
        self._write_item = self.pipe_writer.write_item
        # binary=False only on Python 3; see
        # https://github.com/scrapy/scrapy/commit/c76190d491fca9f35b6758bdc06c34d77f5d9be9
        if IS_PYTHON2:
            exporter_kwargs = {}
        else:
            exporter_kwargs = {'binary': False}
        with ignore_warnings(category=ScrapyDeprecationWarning):
            self.exporter = PythonItemExporter(**exporter_kwargs)

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate and hook item_scraped/spider_closed signals."""
        obj = cls(crawler)
        crawler.signals.connect(obj.item_scraped, signals.item_scraped)
        crawler.signals.connect(obj.spider_closed, signals.spider_closed)
        return obj

    def item_scraped(self, item, spider):
        """Export *item* (dicts and BaseItems only) to the pipe writer."""
        if not isinstance(item, (dict, BaseItem)):
            log.msg("Wrong item type: %s" % item, level=logging.ERROR)
            return
        exported = self.exporter.export_item(item)
        exported.setdefault("_type", type(item).__name__)
        self._write_item(exported)

    def spider_closed(self, spider, reason):
        # Persist the spider's close reason as the job outcome.
        self.pipe_writer.set_outcome(reason)
 def __init__(self, crawler):
     """Hook the crawler up to the shared pipe writer and item exporter."""
     self.hsref = hsref.hsref
     self.pipe_writer = pipe_writer
     self.crawler = crawler
     self._write_item = self.pipe_writer.write_item
     # See https://github.com/scrapy/scrapy/commit/c76190d491fca9f35b6758bdc06c34d77f5d9be9
     # for why binary=False is passed explicitly on Python 3 only.
     kwargs = {} if IS_PYTHON2 else {'binary': False}
     with ignore_warnings(category=ScrapyDeprecationWarning):
         self.exporter = PythonItemExporter(**kwargs)
    def __init__(self, crawler):
        """Bail out unless HubStorage is enabled, then bind the item writer."""
        self.hsref = hsref.hsref
        if not self.hsref.enabled:
            raise NotConfigured

        self.crawler = crawler
        self.exporter = PythonItemExporter(binary=False)
        self._write_item = self.hsref.job.items.write
        log.msg("HubStorage: writing items to %s" % self.hsref.job.items.url)
Exemple #10
0
def spider_handler(latitude, longitude, max_number, q):
    """Crawl TripAdvisor around (latitude, longitude) and put the scraped
    restaurant items, exported as plain Python objects, onto queue *q*."""
    results = []
    exporter = PythonItemExporter(binary=False)

    def collect(signal, sender, item, response, spider):
        # Invoked once per scraped item; keep an exported copy.
        results.append(exporter.export_item(item))

    dispatcher.connect(collect, signal=signals.item_scraped)
    crawler = CrawlerProcess({
        "USER_AGENT": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)"
    })
    start = get_link_for_tripadvisor(latitude, longitude)
    crawler.crawl(RestaurantSpider, start_url=start, max_restaurants=max_number)
    crawler.start()
    q.put(results)
Exemple #11
0
class ScrapyKafkaTopicWriter(KafkaTopicWriter):
    """ Kafka writer which knows how to handle Scrapy items: they are
    serialized to JSON, and "_id" field is used as Kafka key if present.
    """

    def __init__(self, *args, **kwargs):
        self._encoder = ScrapyJSONEncoder()
        self._exporter = PythonItemExporter(binary=False)
        # Default the serializer, but let callers override it explicitly.
        kwargs.setdefault('value_serializer', self._serialize_value)
        super(ScrapyKafkaTopicWriter, self).__init__(*args, **kwargs)

    def write_item(self, item):
        """Publish *item*, keyed by its "_id" field when present."""
        msg = self._exporter.export_item(item)
        return self.write(item.get('_id', None), msg)

    def _serialize_value(self, value):
        # Kafka expects bytes: JSON-encode, then UTF-8 encode the text.
        return self._encoder.encode(value).encode('utf8')
Exemple #12
0
 def _serialize(self, item, **kwargs):
     """Export *item* to plain Python types via a non-binary exporter."""
     exporter = PythonItemExporter(binary=False, **kwargs)
     return exporter.export_item(item)
Exemple #13
0
 def __init__(self, *args, **kwargs):
     """Install the JSON value serializer unless the caller supplied one."""
     self._encoder = ScrapyJSONEncoder()
     self._exporter = PythonItemExporter(binary=False)
     if 'value_serializer' not in kwargs:
         kwargs['value_serializer'] = self._serialize_value
     super(ScrapyKafkaTopicWriter, self).__init__(*args, **kwargs)
Exemple #14
0
 def test_export_binary(self):
     """binary=True must encode both keys and values to UTF-8 bytes."""
     exporter = PythonItemExporter(binary=True)
     item = TestItem(name=u'John\xa3', age=u'22')
     # u'\xa3' encodes to the two UTF-8 bytes b'\xc2\xa3'.
     self.assertEqual({b'name': b'John\xc2\xa3', b'age': b'22'},
                      exporter.export_item(item))
Exemple #15
0
 def test_invalid_option(self):
     """Unknown keyword arguments must be rejected with a TypeError."""
     pattern = "Unexpected options: invalid_option"
     with self.assertRaisesRegexp(TypeError, pattern):
         PythonItemExporter(invalid_option='something')
Exemple #16
0
 def _get_exporter(self, **kwargs):
     # Non-binary exporter under test; extra options pass straight through.
     return PythonItemExporter(binary=False, **kwargs)
Exemple #17
0
 def test_export_binary(self):
     """A binary exporter should yield a dict of bytes keys and values."""
     exporter = PythonItemExporter(binary=True)
     item = TestItem(name=u'John\xa3', age=u'22')
     result = exporter.export_item(item)
     self.assertEqual({b'name': b'John\xc2\xa3', b'age': b'22'}, result)
Exemple #18
0
 def test_export_binary(self):
     """Binary export turns every str field into UTF-8 encoded bytes."""
     exporter = PythonItemExporter(binary=True)
     item = TestItem(name=u"John\xa3", age=u"22")
     result = exporter.export_item(item)
     self.assertEqual({b"name": b"John\xc2\xa3", b"age": b"22"}, result)
class FileHandler(object):
    """Writes per-section candidate data to JSON or tab-separated CSV files
    under ``eol_spider/<fmt>/<spider.name>/``.

    NOTE(review): ``data`` is a class-level dict, so every instance shares
    and mutates the same mapping.  The ``"item"`` entries consumed by
    :meth:`write` are expected to be filled in by callers beforehand —
    confirm against the spiders that use this handler.
    """

    data = {
        "candidate_basic": {"header": None},
        "candidate_education": {"header": None},
        "candidate_research": {"header": None},
        "candidate_publications": {"header": None},
        "candidate_courses": {"header": None},
        "candidate_workexperience": {"header": None},
    }

    def __init__(self):
        # Exporter used to turn scraped items into JSON-serializable dicts.
        self.json_exporter = PythonItemExporter()

    @staticmethod
    def generate_id(string):
        """Return the SHA-1 hex digest of *string* (str or bytes).

        Fix: ``hashlib`` requires bytes, so text input is UTF-8 encoded
        instead of raising TypeError on Python 3.
        """
        if isinstance(string, str):
            string = string.encode('utf-8')
        sha1 = hashlib.sha1()
        sha1.update(string)
        return sha1.hexdigest()

    def cleanup_data(self, spider, fmt):
        """(Re)create the output directory and open one file per section."""
        directory = "eol_spider/%s/%s" % (fmt, spider.name)
        if os.path.exists(directory):
            shutil.rmtree(directory)
        os.mkdir(directory)
        for key in self.data:
            self.data[key]['path'] = "%s/%s.%s" % (directory, key, fmt)
            self.data[key]['f'] = open(self.data[key]['path'], "w+")

    def write(self, fmt="json"):
        """Write every section's pending item(s) in the requested format."""
        for key in self.data:
            items = self.data[key]["item"]
            if not isinstance(items, list):
                items = [items]
            for item in items:
                if fmt == "csv" and not self.data[key]["header"]:
                    # Emit the header row exactly once per file.
                    header = self.build_csv_header(item)
                    self.data[key]["f"].write(header)
                    self.data[key]["header"] = header
                self.data[key]["f"].write(self.build_content(item, fmt))

    def build_content(self, item, fmt):
        """Serialize one item as a "json" or "csv" line (newline-terminated).

        Fixes: an unknown *fmt* now raises ValueError instead of an
        accidental UnboundLocalError, and CSV values are coerced with
        str() so non-string fields no longer raise TypeError.
        """
        if fmt == "json":
            return json.dumps(self.json_exporter.export_item(item)) + "\n"
        if fmt == "csv":
            return "\t".join(str(item[key]) for key in item) + "\n"
        raise ValueError("Unsupported format: %r" % (fmt,))

    @staticmethod
    def build_csv_header(item):
        """Return the item's field names tab-joined, newline-terminated."""
        return "\t".join(item) + "\n"

    def close(self):
        """Close every per-section output file opened by cleanup_data()."""
        for key in self.data:
            self.data[key]["f"].close()
 def __init__(self):
     # Exporter that converts scraped items into plain Python objects.
     self.json_exporter = PythonItemExporter()
Exemple #21
0
 def _get_exporter(self, **kwargs):
     # Build the exporter under test, passing all options straight through.
     return PythonItemExporter(**kwargs)
Exemple #22
0
 def test_export_binary(self):
     """Exporting with binary=True yields bytes keys and bytes values."""
     exporter = PythonItemExporter(binary=True)
     item = self.item_class(name="John\xa3", age="22")
     expected = {b"name": b"John\xc2\xa3", b"age": b"22"}
     self.assertEqual(expected, exporter.export_item(item))