Example #1
    def test_setsave(self):
        mir_idx = Index(domain=DOMAIN)
        mir_idx.add("test text1")
        mir_idx.add("test text2")
        mir_idx.save()
        with open(PATH_NAME + ".crc64", "rb") as f:
            self.assertEqual(len(f.read()), mirror0.index.index.CRC_LEN * 2)
Example #2
    def test_functional(self):
        with open(PATH_NAME + ".crc64", "wb") as f:
            pass

        str_d = []
        with open("test_in.txt", "r") as f:
            str_d = f.readlines()
        not_present = set(str_d)
        present = set()
        count = len(str_d)

        def check(pres, not_pres, idx):
            for s in pres:
                self.assertEqual(idx.has(s), True)
            for s in not_pres:
                self.assertEqual(idx.has(s), False)

        # add batches of varying size (j % 10), saving and reloading the index
        # between batches so persistence is checked at every step
        j = 2
        while len(not_present):
            dbg = InitOut()
            mir_idx = Index(domain=DOMAIN)
            print len(mir_idx)
            check(present, not_present, mir_idx)

            for _ in range(0, j % 10):
                if not len(not_present): break
                s = not_present.pop()
                mir_idx.add(s)
                present.update([s])
            j += 7
            check(present, not_present, mir_idx)
            mir_idx.save()
            del mir_idx
Example #3
    def test_strip_url(self):
        u1 = "http://d1.d2/path"
        u2 = "https://d1.d2/path/"
        u3 = "d1.d2/path/#"
        STRIPPED = "d1.d2/path"
        self.assertEqual(STRIPPED, Index._strip_url(u1))
        self.assertEqual(STRIPPED, Index._strip_url(u2))
        self.assertEqual(STRIPPED, Index._strip_url(u3))
Example #5
    def test_save_read(self):
        mir_idx = Index(domain=DOMAIN)
        mir_idx.add("test textA")
        mir_idx.add("test textB")
        mir_idx.add("test textC")
        mir_idx.save()
        del mir_idx

        dbg = InitOut()
        mir_idx = Index(domain=DOMAIN)
        self.assertEqual(len(mir_idx), 3)
        self.assertEqual(mir_idx.has("test textB"), True)
        self.assertEqual(mir_idx.has("test textC"), True)
        self.assertEqual(mir_idx.has("test textD"), False)
Example #6
    def test_open_fail(self):
        # a truncated CRC file (15 bytes is not a whole number of CRC entries)
        # must make the Index constructor raise
        with open(PATH_NAME + ".crc64", "wb") as f:
            f.write("abcd0123abcd012")
        try:
            mir_idx = Index(domain=DOMAIN)
        except Exception:
            pass
        self.assertEqual('mir_idx' in locals(), False)
Example #11
    def test_long_string(self):
        print Index._long_string(48)
        print len(Index._long_string(48))
Example #12
    def test_open_ok(self):
        with open(PATH_NAME + ".crc64", "wb") as f:
            f.write("abcd0123abcd9999")
        mir_idx = Index(domain=DOMAIN)
        self.assertEqual(len(mir_idx), 2)
Example #13
    def test_open_empty(self):
        mir_idx = Index(domain=DOMAIN)
        print(len(mir_idx))
        self.assertEqual(len(mir_idx), 0)
Example #14
    def setUp(self):
        # create the index directory
        Index(domain=DOMAIN)
        TestIndex.clear()
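
Taken together, the Index tests above exercise a small persistence API: add() registers a string, save() writes fixed-width CRC entries to a ".crc64" file, and a freshly constructed Index for the same domain reads them back and answers has() and len(). The following is a minimal round-trip sketch under those assumptions only; the DOMAIN value is a stand-in for the test module's constant, and nothing beyond the calls already shown in the tests is assumed.

# Minimal round-trip sketch, assuming only the Index behaviour exercised by
# the tests above (add/save/has/len plus the ".crc64" side file).
from mirror0.index.index import Index

DOMAIN = "example.com"  # stand-in for the test module's DOMAIN constant

idx = Index(domain=DOMAIN)
idx.add("test textA")
idx.add("test textB")
idx.save()                    # persists fixed-width CRC entries to the ".crc64" file
del idx

idx = Index(domain=DOMAIN)    # a new instance re-reads the saved entries
assert len(idx) == 2
assert idx.has("test textA")
assert not idx.has("missing text")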
Example #16
class Spider(scrapy.Spider):
    """Scrapes one category passed via the start_url constructor argument.
    Lists the category pages and scrapes items from each page only after the
    previous page is complete, driven by the spider_idle signal (see the run
    sketch after the class).
    """
    __metaclass__ = IdxLogInitializer

    # attributes to be overridden by concrete subclasses
    _index_file_name = NotImplementedError()
    name = NotImplementedError()
    _item_class = NotImplementedError()

    @classmethod
    def create_start_urls(cls):
        raise NotImplementedError()

    def _links_from_response(self, response):
        raise NotImplementedError()

    def _extract_next_url(self, response):
        raise NotImplementedError()

    def _prepare_response(self, response):
        return response

    def _is_successful(self, states):
        return TextImagePipeline.STATE_ID in states.finished and not states.incomplete

    @classmethod 
    def init_idx_log(cls):
        Spider._idx_file = os.path.join(Config.value(mirror0.SECTION_COMMON, "log_directory"), cls._index_file_name)
        with open(Spider._idx_file, "w") as f:
            f.write("Log for %s initially started %s\n" % (cls.name, time.strftime("%b %d %H:%M:%S %Y")))

    def logidx(self, msg, html=None):
        with open(self._idx_file, "a") as f:
            f.write(msg + "\n")

    def __init__(self, **kw):
        try:
            scrapy.Spider.__init__(self, **kw)

            self.video_processor = None

            """attributes to be overridden"""
            self._index = NotImplementedError()
            self.disabled_pipelines = []

            self._page_count = 0
            self._dates = []
            self._links = {} 
            self._video_msg = {}
            self._existent = {}
            self._next_page_url_interrupted = ""
            self._retry_count = 0
            self._lnk_pos = 0
            self._total_count = 0
            self._object_cleaner = None

            dispatcher.connect(self._spider_idle, scrapy.signals.spider_idle)

            if kw.get('no_index', False):
                self._index = None
            else:
                self._index = Index(self.BASE_DOMAIN)

            self.__first_page = kw.get('first_page', False)

            self.start_url = kw.get('start_url')

            self._object_cleaner = kw.get('object_cleaner') 

            if "/" == self.start_url[0]:
                self.start_url = self.BASE_URL + self.start_url
            log("\n\nSTART: %s" % self.start_url, INFO)
            self.logidx("\nLog for %s started %s" % (self.start_url, time.strftime("%b %d %H:%M:%S %Y")))

            self._per_url_regex_xpath = () 
            self._debug_url = ""

        except Exception as e:
            format_exc(self, "__init__", e)

    def start_requests(self):
        try:
            yield self._request(
                url_=self.start_url,
                callback_=self._collect_next_page_links,
                )
        except Exception as e:
            format_exc(self, "start_requests", e)


    def start_state(self, url, state_id):
        #if self._debug_url:
        #    return
        assert url in self._links, "Spider.start: bad url"
        if "?" == self._links[url]:
            self._links[url] = ObjectStateIndicators()
        self._links[url].start(state_id)

    def finalize_state(self, url, state_id):
        #if self._debug_url:
        #    return
        assert url in self._links, "Spider.finalize: bad url"
        self._links[url].finish(state_id)
            
    def started_states(self, url):
        return self._links[url].started

    def _index_successful(self):
        try:
            self._links.update(self._existent)
            self._existent.clear()
            for link, state in self._links.viewitems():
                if not type(state) is str and self._is_successful(state):
                    if self._index:
                        self._index.add(link)
            if self._index:
                self._index.save()
        except Exception as e:
            format_exc(self, "_index_successful", e)

    def spider_close(self, spider):
        super(Spider, self).spider_close(spider)
        self._spider_idle(spider)

    def _spider_idle(self, spider):
        """Collect more links, starting from the place previously stopped"""
        try:
            log("Spider {0} idle start".format(self.name), DEBUG)
            if self.video_processor:
                self.video_processor.wait_all_finished(self)
            if self._links or self._existent:
                #should complete all requests before going further
                self._index_successful()
                for link, states in self._links.viewitems():
                    self.logidx("%s %s" % (str(states), link))

                lost = sum(1 for lnk, result in self._links.viewitems() if "?" == result)
                ok = sum(1 for lnk, result in self._links.viewitems() if not type(result) is str and self._is_successful(result))
                log("Lost links: %i, OK: %i" % (lost, ok), WARNING)
                self._links.clear()

                if self.__first_page:
                    return

                if self._next_page_url_interrupted:
                    log("Idle, start collecting links")
                    self.logidx("Requesting {0}".format(self._next_page_url_interrupted))
                    req = Spider._request(self._next_page_url_interrupted, self._collect_next_page_links)
                    self._next_page_url_interrupted = ""
                    self.crawler.engine.crawl(req, spider)
        except Exception as e:
            format_exc(self, "_spider_idle", e)

    @staticmethod
    def _request(url_, callback_, errback_ = None, dont_filter_=True, meta_=None):
        return scrapy.Request(
            url=url_,
            callback=callback_,
            method="GET",
            headers={
                "Accept" : "*/*",
                "User-Agent" : "Mozilla",
#"Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36" 
            },
            meta=meta_,
            dont_filter=dont_filter_,
            errback=errback_,
            )

    def _request_failed(self, failure):
        log("Failed: %s" % str(failure), ERROR)

        # HttpError ?
        # if isinstance(failure.value, spidermiddlewares.HttpError):
        #     response = failure.value.response
        #     log("Code ", str(response.status))
        # else:
        #     log("Failed miserably: %s" % str(failure))

    def _run_item(self, response):
        try:
            url = response.request.url
            if not url in self._links:
                log("Response url doesn't match: %s" % url, INFO)
            item = self._item_class(self)
            item['raw_url'] = url
            response = self._prepare_response(response)
            item['raw_html'] = response.selector
            #item['raw_text'] = response.body
            return item
        except Exception as e:
            format_exc(self, "_run_item", e)

    def _links_from_response_per_url(self, response):
        for tpl in self._per_url_regex_xpath:
            try:
                lregex, lxpath, webdriver = tpl
            except ValueError as e:
                lregex, lxpath = tpl
                webdriver = ""
                
            print lregex, response.url
            if re.search(lregex, response.url):
                links = response.xpath(lxpath).extract()
                return links, webdriver
        return None, None 

    def _collect_next_page_links(self, response):
        try:
            links = ""            
            webdriver = ""
            try:
                self._debug_url = Config.value(mirror0.SECTION_COMMON, "debug_url") 
                if self._debug_url:
                    links = [url for url in str.splitlines(self._debug_url) if url]
                webdriver = "do_use"
            except Exception:
                pass
            if not links:
                links, webdriver = self._links_from_response_per_url(response)
                if not links:
                    links = self._links_from_response(response)
                    webdriver = ""
                if not links:
                    msg = "NO LINKS %s" % response.request.url 
                    log(msg, WARNING)
                    self.logidx(msg, response.body)
                else:
                    log("Raw links: {}".format(len(links)), DEBUG)

            links = [(self.BASE_URL + lnk if "/" == lnk[0] else lnk) for lnk in links]
            try:
                first_n = int(Config.value(mirror0.SECTION_COMMON, "debug_first_n"))
                links[:] = links[:first_n]
                log("ONLY FIRST {}".format(first_n))
            except NoOptionError:
                pass

            next_url = self._extract_next_url(response)
            if next_url:
                log("Next page: %s" % next_url, WARNING)
            else:
                log("FINISHED at %s" % response.request.url, WARNING)
                self.logidx("NO SHOW MORE %s" % response.request.url, response.body)

            try:
                debug_link_regex = Config.value(mirror0.SECTION_COMMON, "debug_link_regex")
                print(debug_link_regex)
                if debug_link_regex:
                    links = [lnk for lnk in links if re.search(debug_link_regex, lnk)]
            except Exception:
                pass

            # count links duplicated within this page; links already present in
            # the index are skipped instead of being stored again
            duplicate = 0
            for lnk in links:
                if lnk in self._links:
                    duplicate += 1
                elif self._index and self._index.has(lnk):
                    log("Article link is in index, skipping: %s" % lnk, INFO)
                else:
                    self._links[lnk] = "?"

            self._total_count += len(links)
            log("Links collected total: %i this page: %i to process: %i duplicate within page: %i" % (self._total_count, len(links), len(self._links), duplicate), 
                WARNING)

            if INDEX_ONLY and next_url:
                return Spider._request(url_=next_url, callback_=self._collect_next_page_links)
            else:
                return self._request_next_page_links(next_url, webdriver)
        except Exception as e:
            format_exc(self, "collect_next_page_links", e)

    def _request_next_page_links(self, next_url, webdriver):

        if (len(self._links) >= LINKS_BATCH or not next_url):
            #request articles from collected links
            requests = []
            for url in self._links:
                requests.append(Spider._request(
                    url_=url, 
                    callback_=self._run_item,
                    errback_=self._request_failed,
                    dont_filter_=True,
                    meta_={"webdriver" : webdriver,},))
                self._lnk_pos += 1
            self._next_page_url_interrupted = next_url
            #scrapy sends them in the reverse order
            requests.reverse()
            log(json.dumps(self._links, separators=("\n"," ")), DEBUG)
            log("Requesting articles")
            return requests

        if next_url:
            return Spider._request(url_=next_url, callback_=self._collect_next_page_links)
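
The Spider above is abstract: name, _index_file_name, _item_class, _links_from_response() and _extract_next_url() are placeholders that a concrete subclass must supply, and the crawl advances page by page through the spider_idle signal. Below is a hedged run sketch only; the NewsSpider class, the example.com URLs, the XPath expressions and the CrawlerProcess settings are illustrative assumptions, and it presumes the surrounding mirror0 project (Config, Index, pipelines) is importable and configured.

# Hedged sketch of a concrete subclass and a local run; names marked as
# stand-ins are assumptions for illustration, not part of the original code.
from scrapy.crawler import CrawlerProcess

class NewsSpider(Spider):
    name = "news"                           # stand-in spider name
    _index_file_name = "news_index.log"     # stand-in log file name
    _item_class = dict                      # stand-in; the project supplies a scrapy Item class
    BASE_DOMAIN = "example.com"             # stand-in domain
    BASE_URL = "http://example.com"         # stand-in base URL

    def _links_from_response(self, response):
        # article links found on one category page
        return response.xpath("//a[@class='article']/@href").extract()

    def _extract_next_url(self, response):
        # pagination link, or "" when the last page has been reached
        urls = response.xpath("//a[@rel='next']/@href").extract()
        return urls[0] if urls else ""

process = CrawlerProcess({"LOG_LEVEL": "INFO"})
# keyword arguments are forwarded to Spider.__init__ (start_url, no_index, ...)
process.crawl(NewsSpider, start_url="/category/news", no_index=True)
process.start()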