Beispiel #1
0
    def process_item(self, item, spider):
        """Queue a youtube-dl subprocess to download the item's video.

        Returns the item immediately; the download runs asynchronously in a
        subprocess that is tracked in self._sub_proc.
        """
        # Skip when video downloading is disabled globally, the item opted
        # out, or there is no URL to download from.
        if NO_VIDEO or ('skip_video' in item and item['skip_video']) or not item[self.download_url_field]:
            log("StreamPipeline skipping {}".format(item['raw_url']))
            return item

        spider.video_processor = self

        try:
            # Check if already downloaded (or tried to) and symlink to the
            # previously saved path instead of downloading again.
            if self._no_duplicates:
                video_fname = self.get_video_filename(item)
                if video_fname:
                    if video_fname in self.__downloaded_files:
                        ln_to = os.path.join(self.__downloaded_files[video_fname], video_fname)
                        ln_from = os.path.join(item['path'], video_fname)
                        rln = self._call(["ln", "-s", "-f", "--no-dereference", ln_to, ln_from])
                        url = item["raw_url"]
                        log("Linking {0} to {1} for {2}".format(ln_from, ln_to, url), DEBUG)
                        spider.start_state(url, self.STATE_ID)
                        spider.finalize_state(url, self.STATE_ID)
                        return item  # do not download
                    else:
                        # Remember the fname immediately, before results come
                        # back, so duplicates of not-yet-finished videos are
                        # excluded as well.
                        self.__downloaded_files[video_fname] = item['path']

            logfile_path = item['vlog'].file_path
            # NOTE(review): unbuffered ("0") text-mode open is Python 2 only.
            logfile = open(logfile_path, "w", 0)

            timeout = get_project_settings().get('DOWNLOAD_TIMEOUT', 30)
            data_dir = item['path']
            cmdline = "youtube-dl --no-warnings "
            if int(Config.value(mirror0.SECTION_COMMON, "hls_prefer_native")):
                cmdline += "--hls-prefer-native "
            cmdline += "--no-part --socket-timeout {0} ".format(timeout)
            cmdline += "-o '%s" % data_dir
            cmdline += "/%(title)s-{0}.%(ext)s' ".format(self.__vcounter)
            # SECURITY NOTE(review): cmdline is executed via a shell below
            # (shell=True); a crafted URL could inject shell commands.
            cmdline += item[self.download_url_field]
            logfile.write(cmdline + "\n")
            self.__vcounter += 1

            log("Starting {0} for {1}".format(item[self.download_url_field], item["raw_url"]), DEBUG)

            # Track (process, logfile, logfile path, source URL) for later
            # polling/cleanup by the pipeline.
            self._sub_proc.append(
                (subprocess.Popen([cmdline], stdout=logfile.fileno(), stderr=logfile.fileno(), shell=True),
                 logfile,
                 logfile_path,
                 item["raw_url"],),
                )

        except Exception as e:
            # Fixed typo in the error report: was "porcess_item".
            format_exc(self, "process_item", e)

        return item
Beispiel #2
0
 def phantom_login(self):
     """Attach a logged-in PhantomJS driver to self, logging in at most once.

     The driver is kept in the module-global _webdriver so the (slow) login
     happens only once per application run; later calls just reuse it.
     """
     global _webdriver
     if _webdriver:
         # Already logged in during this run - reuse the existing driver.
         self.driver = _webdriver
         log("Reusing logged in webdriver", DEBUG)
         return
     # First login: build a PhantomJS driver without image loading and with
     # a generous resource timeout.
     caps = dict(DesiredCapabilities.PHANTOMJS)
     caps["phantomjs.page.settings.loadImages"] = "false"
     caps["phantomjs.page.settings.resourceTimeout"] = "120000"
     _webdriver = webdriver.PhantomJS(
         executable_path=Config.value(CONFIG_SECTION, "phantomjs_path"),
         desired_capabilities=caps)
     self.driver = _webdriver
     log("Starting PhantomJS login")
     do_login(self, self.driver)
     self.driver.implicitly_wait(PAGE_TIMEOUT)
     self.driver.set_page_load_timeout(PAGE_TIMEOUT)
     getLogger("selenium.webdriver").setLevel(INFO)
Beispiel #3
0
    def __init__(self, domain):
        """Loads index data to memory. Creates the index directory if needed
        and raises DistutilsFileError if that fails.
        Raises IndexFingerprintException if the stored index file is corrupt.

        domain - index storage ID
        """
        try:
            POLYNOMIAL = 0x1AABBCCDDFFEEDDCC # must be 65 bit long

            self._debug = 0

            # Set of CRC64 values of everything indexed so far.
            self._hashes = set()
            self._crc_fun = crcmod.mkCrcFun(POLYNOMIAL, initCrc=0)

            # When run from the unit test, index directory path will be tweaked in Config
            file_path = Config.value(mirror0.SECTION_COMMON, "index_directory")
            dir_util.mkpath(file_path)

            file_name = domain + ".crc64"
            self._file_path = os.path.join(file_path, file_name)
            # "a+b" creates the file if it does not exist yet.
            # NOTE(review): under Python 3 "a+" positions at EOF, so read()
            # here would return nothing; this relies on Python 2 behavior.
            with open(self._file_path, "a+b") as f:
                data = f.read()
                if len(data) % CRC_LEN:
                    raise IndexFingerprintException("%s is corrupt!" % file_name)
                # Floor division: "/" would yield a float under Python 3 and
                # break range() below.
                count = len(data) // CRC_LEN
                for i in range(0, count):
                    string_val = data[i*CRC_LEN : (i + 1)*CRC_LEN]
                    int_val = Index._string_long(string_val)
                    self._hashes.add(int_val)  # single element: add, not update
                log("Read %i hashes from %s" % (count, file_name))

            file_name = domain + ".log"
            self._log_file_path = os.path.join(file_path, file_name)
            # Rewrite through centralized logging
            with open(self._log_file_path, "a") as f:
                f.write("\n\nSTARTED %s\n" % time.strftime("%d %b %Y %H:%M:%S"))

        except IndexFingerprintException as e:
            format_exc(self, "__init__", e)
            log(self._file_path, ERROR)
            raise
    def process_item(self, item, spider):
        """Create the per-article directory and attach a video-log helper.

        Stores the created directory in item['path'] and a callable VideoLog
        record in item['vlog'], then delegates to self._create_more().
        """
        try:
            log("FSCreator start %s" % item['title'], DEBUG)
            #log("fs for %s" % item['title'])
            item_dir = os.path.join(self._top_dir, self.__class__.getItemDir(item, spider))
            if os.path.isdir(item_dir):
                log("Article path exists, overwriting: %s" % item_dir, DEBUG)
            try:
                dir_util.mkpath(item_dir)
            except Exception as e:
                # Best-effort: log and continue; downstream code may still work.
                log("Can't create article directory %s : %s" % (item_dir, str(e)), ERROR)
            
            item['path'] = item_dir

            # Lazily (re)create the per-spider streaming-log directory once.
            if not self.__vlog_dir:
                self.__vlog_dir = os.path.join(Config.value(SECTION_COMMON, "log_directory"), spider.name + "_streaming")
                shutil.rmtree(self.__vlog_dir, True)
                try:
                    os.mkdir(self.__vlog_dir)
                except OSError as e:
                    pass  # directory may already exist
                self.__need_clean = False

            logfile_path = os.path.join(self.__vlog_dir, item['title'] + ".log")

            # Ad-hoc record carrying the video log path.
            # NOTE(review): logfile_path set here appears unused; callers read
            # the 'file_path' attribute assigned below - confirm before cleanup.
            class VideoLog:
                def __init__(self):
                    self.logfile_path = None

            vlog = VideoLog()
            vlog.file_path = logfile_path
            # NOTE(review): instance-level __call__ is only honored on
            # old-style (Python 2) classes.
            vlog.__call__ = functools.partial(FSCreatorPipeline.append_file, logfile_path) 
            item['vlog'] = vlog 

            return self._create_more(item, spider)
        except Exception as e:
            # Let DropItem propagate to Scrapy; log anything else.
            if type(e) == DropItem:
                raise
            else:
                format_exc(self, "process_item", e)
def init_chrome_driver(timeout=30):
    """Build a Chrome webdriver tuned for scraping: plugins, images and
    audio disabled. Attaches a WebDriverWait as driver.wait."""
    opts = Options()
    for flag in ("--disable-bundled-ppapi-flash",
                 "--disable-plugins-discovery",
                 "--disable-webaudio",
                 "--mute-audio"):
        opts.add_argument(flag)
    prefs = {
        "plugins.plugins_disabled": ["Adobe Flash Player", "Shockwave Flash"],
        "profile.managed_default_content_settings.images": 2,
    }
    opts.add_experimental_option("prefs", prefs)

    path = Config.value(SECTION_COMMON, "chromedriver_path")
    if path:
        log("Chromedriver path: %s" % path, INFO)
        driver = webdriver.Chrome(executable_path=path, chrome_options=opts)
    else:
        # Fall back to chromedriver found on PATH.
        driver = webdriver.Chrome(chrome_options=opts)

    driver.wait = WebDriverWait(driver, timeout)
    return driver
Beispiel #6
0
    def _run_item(self, response):
        """Yield scraped items from the response.

        NORMAL mode delegates to the parent spider; VIDEO mode extracts one
        item per video tile on the page. Any other mode is reported as an
        error via format_exc.
        """
        try:
            if self.mode == self.NORMAL:
                item = super(FoxsportsSpider, self)._run_item(response)
                if self.TITLE_PAGE == self.start_url:
                    item['out_dir'] = "title_page"
                yield item
            elif self.mode == self.VIDEO:
                response = self._prepare_response(response)
                for sel_item in response.selector.xpath("//li[re:test(@class,'fiso-video-mosaic')]"):
                    url = sel_item.xpath("./descendant::meta[@itemprop='contentURL']/@content").extract_first()
                    if not url:
                        continue
                    # Optional debug filter: keep only URLs matching the
                    # configured regex (config key may be absent).
                    debug_link_regex = ""
                    try:
                        debug_link_regex = Config.value(mirror0.SECTION_COMMON, "debug_link_regex")
                    except Exception:
                        pass
                    if debug_link_regex and not re.search(debug_link_regex, url):
                        continue

                    title = sel_item.xpath("./descendant::meta[@itemprop='headline name']/@content").extract_first()
                    time = sel_item.xpath("./descendant::meta[@itemprop='uploadDate']/@content").extract_first()
                    item = self._item_class(self)
                    item['video_urls'] = [url]
                    item['title'] = RawExtractorPipeline.encode_strip(title)
                    item['raw_url'] = url
                    item['time'] = time
                    self._links[url] = "?"
                    yield item
            else:
                # BUG FIX: this was a for-else containing
                # `assert "Wrong mode value"` - a truthy string, so it never
                # fired, and the else was bound to the loop, not the if/elif.
                # An unknown mode is now actually reported.
                raise AssertionError("Wrong mode value")

        except Exception as e:
            format_exc(self, "_run_item", e)
Beispiel #7
0
 def create_start_urls(cls):
     """Return the non-empty lines of the configured start_urls value."""
     raw = Config.value(CONFIG_SECTION, "start_urls")
     return [line for line in str.splitlines(raw) if line]
Beispiel #8
0
    def __init__(self):
        """Initialize subprocess bookkeeping and the duplicate-video policy."""
        # Tuples describing running download subprocesses.
        self._sub_proc = []
        # Non-zero config value enables skipping already-downloaded videos.
        self._no_duplicates = int(Config.value(mirror0.SECTION_COMMON, "no_duplicate_videos"))

        # Item field that holds the URL to download from.
        self.download_url_field = "raw_url"
        # Removed a dead trailing `pass` statement.
Beispiel #9
0
import unittest

# NOTE(review): relies on sys, os.path.join and logging being imported
# earlier in the original file; they are not visible in this excerpt.
sys.path.append("../")

from mirror0 import Config
# Debug mode must be enabled before the index module reads its config.
Config.enable_debug_mode()

import mirror0.index.index
from mirror0 import Config, Index

# Simple holder with a count field; its use is not visible in this excerpt.
class InitOut:
    def __init__(self):
        self.count = -1

DOMAIN = "test_domain"
# Index directory path is tweaked in Config when running under unit tests.
TEST_DIR = Config.value(mirror0.SECTION_COMMON, "index_directory") 
PATH_NAME = join(TEST_DIR, DOMAIN)

print(TEST_DIR)

logging.basicConfig(format="%(message)s", level=logging.DEBUG)

class TestIndex(unittest.TestCase):
 
    def test_open_empty(self):
        mir_idx = Index(domain=DOMAIN)
        print(len(mir_idx))
        self.assertEqual(len(mir_idx), 0)

    def test_open_ok(self):
        with open(PATH_NAME + ".crc64", "wb") as f:
Beispiel #10
0
 def init_idx_log(cls):
     """Create/truncate the spider's index log file and write a header line."""
     log_dir = Config.value(mirror0.SECTION_COMMON, "log_directory")
     Spider._idx_file = os.path.join(log_dir, cls._index_file_name)
     stamp = time.strftime("%b %d %H:%M:%S %Y")
     with open(Spider._idx_file, "w") as f:
         f.write("Log for %s initially started %s\n" % (cls.name, stamp))
Beispiel #11
0
    def _collect_next_page_links(self, response):
        """Collect article links from a listing page and schedule the next page.

        Adds new, non-duplicate, non-indexed links to self._links and returns
        the request(s) for the next listing page.
        """
        try:
            links = ""            
            webdriver = ""
            # Debug override: take links from the debug_url config value
            # (config key may be absent - ignore errors).
            try:
                self._debug_url = Config.value(mirror0.SECTION_COMMON, "debug_url") 
                if self._debug_url:
                    links = [url for url in str.splitlines(self._debug_url) if url]
                webdriver = "do_use"
            except Exception:
                pass
            if not links:
                links, webdriver = self._links_from_response_per_url(response)
                if not links:
                    links = self._links_from_response(response)
                    webdriver = ""
                if not links:
                    msg = "NO LINKS %s" % response.request.url 
                    log(msg, WARNING)
                    self.logidx(msg, response.body)
                else:
                    log("Raw links: {}".format(len(links)), DEBUG)

            # Make relative links absolute. startswith() also tolerates empty
            # strings, where the old `"/" == lnk[0]` raised IndexError.
            links = [(self.BASE_URL + lnk if lnk.startswith("/") else lnk) for lnk in links]
            # Debug cap: keep only the first N links when configured.
            try:
                first_n = int(Config.value(mirror0.SECTION_COMMON, "debug_first_n"))
                links[:] = links[:first_n]
                log("ONLY FIRST {}".format(first_n))
            except NoOptionError:
                pass

            next_url = self._extract_next_url(response)
            if next_url:
                log("Next page: %s" % next_url, WARNING)
            else:
                log("FINISHED at %s" % response.request.url, WARNING)
                self.logidx("NO SHOW MORE %s" % response.request.url, response.body)

            # Debug filter: keep only links matching the configured regex.
            try:
                debug_link_regex = Config.value(mirror0.SECTION_COMMON, "debug_link_regex")
                print(debug_link_regex)
                if debug_link_regex:
                    links = [lnk for lnk in links if re.search(debug_link_regex, lnk)]
            except Exception:
                pass

            # Count links duplicated within this page; links already stored in
            # the persistent index are skipped entirely.
            duplicate = 0
            for lnk in links:
                if lnk in self._links:
                    duplicate += 1
                elif self._index and self._index.has(lnk):
                    log("Article link is in index, skipping: %s" % lnk, INFO)
                else:
                    self._links[lnk] = "?"

            self._total_count += len(links)
            log("Links collected total: %i this page: %i to process: %i duplicate within page: %i" % (self._total_count, len(links), len(self._links), duplicate), 
                WARNING)

            if INDEX_ONLY and next_url:
                return Spider._request(url_=next_url, callback_=self._collect_next_page_links)
            else:
                return self._request_next_page_links(next_url, webdriver)
        except Exception as e:
            # Fixed: report the actual method name (leading underscore was missing).
            format_exc(self, "_collect_next_page_links", e)
Beispiel #12
0
 def create_start_urls(cls):
     """Return the single configured start URL; exactly one is expected."""
     _lines = str.splitlines(Config.value(CONFIG_SECTION, "start_urls"))
     urls = [l for l in _lines if l]
     # Added a message so a misconfigured start_urls value is diagnosable.
     assert len(urls) == 1, "expected exactly one start_urls entry, got %d" % len(urls)
     return urls
Beispiel #13
0
 def create_start_urls(cls):
     """Return all non-empty lines from the configured start_urls value."""
     raw = Config.value(CONFIG_SECTION, "start_urls")
     return [url for url in str.splitlines(raw) if url]
"""

from distutils import dir_util
import functools
from logging import ERROR, WARNING, DEBUG
import os
import os.path
import shutil

from scrapy.exceptions import DropItem

from mirror0 import *
from mirror0.sscommon.aux import log, format_exc
from mirror0 import Config

# Default output directory from configuration (read once at import time).
config_out = Config.value(SECTION_COMMON, "output_directory")

class FSCreatorPipeline(object):

    __vlog_dir = ""

    def __init__(self):
        # Base directory under which per-article directories are created.
        self._top_dir = ""

    def _create_more(self, item, spider):
        """Hook for subclasses to extend item creation; default is pass-through."""
        return item

    def process_item(self, item, spider):
        try:
            log("FSCreator start %s" % item['title'], DEBUG)
            #log("fs for %s" % item['title'])
Beispiel #15
0
    def process_item(self, item, spider):
        """Queue a youtube-dl subprocess that downloads the item's video.

        Returns the item immediately; the download runs asynchronously in a
        subprocess that is tracked in self._sub_proc.
        """
        # Skip when video downloading is disabled globally, the item opted
        # out, or there is no URL to download from.
        if NO_VIDEO or ('skip_video' in item and item['skip_video']
                        ) or not item[self.download_url_field]:
            log("StreamPipeline skipping {}".format(item['raw_url']))
            return item

        spider.video_processor = self

        try:
            # Already downloaded (or attempted)? Symlink to the saved copy
            # instead of downloading again.
            if self._no_duplicates:
                video_fname = self.get_video_filename(item)
                if video_fname:
                    if video_fname in self.__downloaded_files:
                        ln_to = os.path.join(
                            self.__downloaded_files[video_fname], video_fname)
                        ln_from = os.path.join(item['path'], video_fname)
                        rln = self._call([
                            "ln", "-s", "-f", "--no-dereference", ln_to,
                            ln_from
                        ])
                        url = item["raw_url"]
                        log(
                            "Linking {0} to {1} for {2}".format(
                                ln_from, ln_to, url), DEBUG)
                        spider.start_state(url, self.STATE_ID)
                        spider.finalize_state(url, self.STATE_ID)
                        return item  #do not download
                    else:
                        # Remember the name immediately, before results come
                        # back, so duplicates of not-yet-finished downloads
                        # are excluded as well.
                        self.__downloaded_files[video_fname] = item['path']

            logfile_path = item['vlog'].file_path
            # NOTE(review): unbuffered ("0") text-mode open is Python 2 only.
            logfile = open(logfile_path, "w", 0)

            timeout = get_project_settings().get('DOWNLOAD_TIMEOUT', 30)
            data_dir = item['path']
            cmdline = "youtube-dl --no-warnings "
            if int(Config.value(mirror0.SECTION_COMMON, "hls_prefer_native")):
                cmdline += "--hls-prefer-native "
            cmdline += "--no-part --socket-timeout {0} ".format(timeout)
            cmdline += "-o '%s" % data_dir
            cmdline += "/%(title)s-{0}.%(ext)s' ".format(self.__vcounter)
            # SECURITY NOTE(review): cmdline is executed via a shell below
            # (shell=True); a crafted URL could inject shell commands.
            cmdline += item[self.download_url_field]
            logfile.write(cmdline + "\n")
            self.__vcounter += 1

            log(
                "Starting {0} for {1}".format(item[self.download_url_field],
                                              item["raw_url"]), DEBUG)

            # Track (process, logfile, logfile path, source URL) for later
            # polling/cleanup by the pipeline.
            self._sub_proc.append(
                (
                    subprocess.Popen([cmdline],
                                     stdout=logfile.fileno(),
                                     stderr=logfile.fileno(),
                                     shell=True),
                    logfile,
                    logfile_path,
                    item["raw_url"],
                ), )

        except Exception as e:
            # Fixed typo in the error report: was "porcess_item".
            format_exc(self, "process_item", e)

        return item
Beispiel #16
0
 def create_start_urls(cls):
     """Return the non-empty lines of the AFL spider's start_urls config."""
     raw = Config.value(mirror0.afl.afl_spider.CONFIG_SECTION, "start_urls")
     return [u for u in str.splitlines(raw) if u]