Beispiel #1
0
    def prepare_html_tree(self,
                          url,
                          domain,
                          html=None,
                          store_script=False,
                          store_style=False,
                          store_urls=True):
        """Parse ``html`` and set up the per-page state of this object.

        The URL is normalised before it is stored: one trailing ``/`` is
        dropped, and when ``utils.get_filetype_from_url`` reports a file
        type, that many characters plus one more (presumably the dot) are
        cut from the end.

        Args:
            url: address of the page being processed.
            domain: stored unchanged on ``self.domain``.
            html: markup stored on ``self._html_source`` before
                ``clean_html_source`` is called.
            store_script: accepted but not used by this method.
            store_style: accepted but not used by this method.
            store_urls: when true, ``_retrieve_urls`` is called at the end.
        """
        page_url = url[:-1] if url.endswith('/') else url
        extension = utils.get_filetype_from_url(page_url)
        if extension:
            cut = len(extension) + 1
            page_url = page_url[:-cut]

        self._url = page_url
        self.domain = domain
        self.scheme = utils.get_scheme(page_url)
        self._html_source = html
        self.clean_html_source()

        self._backup_html_tree = lxml.html.fromstring(
            self._html_source, parser=self._parser)
        # Registering all available functions in the default namespace is
        # currently disabled:
        # xpath2_functions.register_functions(self._backup_html_tree)

        # The body is taken from the DOM; the first match is used, so this
        # fails if the lookup returns no element.
        body_matches = self._tree_explorer.get_elements_by_tags(
            self._backup_html_tree, ['body'])
        self.body_node = body_matches[0]
        self.language = self.extract_content_language()
        self._is_news = self._check_webnews_from_meta_tag()

        # Every call that modifies the DOM must start from here, not earlier.
        self._fix_relative_urls()
        if store_urls:
            self._retrieve_urls()
 def get_songs_by_uri(self, uri):
     """Return the songs that ``uri`` refers to.

     Only URIs whose scheme (as reported by ``utils.get_scheme``) is
     ``"file"`` are handled. If a ``.cue`` file with the same base name
     exists beside the file, one song is created per tag of the parsed cue
     sheet. If there is no cue file, or reading it raises ``CueException``,
     the URI is treated as a single local song instead.

     Returns:
         A list of songs (empty when the single local song could not be
         created), or ``None`` when ``uri`` is empty or its scheme is not
         ``"file"``.
     """
     if not uri:
         return
     if utils.get_scheme(uri) != "file":
         return

     path = utils.get_path_from_uri(uri)
     cue_file = "%s.%s" % (os.path.splitext(path)[0], "cue")
     if os.path.exists(cue_file):
         try:
             cuesheet = read_cuesheet(path, cue_file)
         except CueException as e:
             # Unreadable cue sheet: report it, then fall back to loading
             # the file itself as one local song.
             print(e)
         else:
             return [self.get_or_create_song(tag, "cue", read_from_file=False)
                     for tag in cuesheet.get_tags()]

     song = self.get_or_create_song({"uri": uri}, "local", read_from_file=True)
     return [song] if song else []
Beispiel #3
0
    def get_songs_by_uri(self, uri):
        """Return the songs that ``uri`` refers to.

        Only URIs whose scheme (as reported by ``utils.get_scheme``) is
        ``"file"`` are handled. If a ``.cue`` file with the same base name
        exists beside the file, one song is created per tag of the parsed
        cue sheet. If there is no cue file, or reading it raises
        ``CueException``, the URI is treated as a single local song instead.

        Returns:
            A list of songs (empty when the single local song could not be
            created), or ``None`` when ``uri`` is empty or its scheme is not
            ``"file"``.
        """
        if not uri:
            return
        if utils.get_scheme(uri) != "file":
            return

        path = utils.get_path_from_uri(uri)
        prefix = os.path.splitext(path)[0]
        cue_file = "%s.%s" % (prefix, "cue")
        if os.path.exists(cue_file):
            try:
                cuesheet = read_cuesheet(path, cue_file)
            except CueException as e:
                # Unreadable cue sheet: report it, then fall through to the
                # single-local-song path below.
                print(e)
            else:
                songs = []
                for tag in cuesheet.get_tags():
                    songs.append(
                        self.get_or_create_song(tag,
                                                "cue",
                                                read_from_file=False))
                return songs

        song = self.get_or_create_song({"uri": uri},
                                       "local",
                                       read_from_file=True)
        if song:
            return [song]
        return []
 def set_song(self, song, play=False, crossfade=None, seek=None):
     """Make ``song`` the current song, fetching it first when required.

     A fetch is started when the MIME type of the song's URI is one of the
     types listed below and the URI scheme is not ``"file"``: the song is
     stored on ``self.fetch_song``, ``"fetch-start"`` is emitted and a
     ``ThreadRun`` is started with ``self.async_fetch`` and
     ``self.play_radio``. In every other case ``self.fetch_song`` is cleared
     and the song goes straight to ``self.__set_song``.
     """
     fetchable_types = [
         "audio/x-scpls",
         "audio/x-mpegurl",
         "video/x-ms-asf",
         "application/xspf+xml",
     ]
     uri = song.get("uri")
     needs_fetch = (get_mime_type(uri) in fetchable_types
                    and get_scheme(song.get("uri")) != "file")

     if not needs_fetch:
         self.fetch_song = None
         self.__set_song(song, play, crossfade, seek)
         return

     self.fetch_song = song
     self.emit("fetch-start", song)
     fetcher = ThreadRun(self.async_fetch, self.play_radio,
                         (song,), (play, crossfade, seek))
     fetcher.start()
Beispiel #5
0
 def get_tasks(self, job):
     """Fetch the next batch of URLs for ``job`` and queue the allowed ones.

     URLs come from ``self.database.get_urls(job)`` as ``(https, url)``
     pairs; each url is UTF-8 decoded here. When the job has a robots
     object, every URL is checked with
     ``robots.allowed(full_url, self.info["n"])``. URLs that are not allowed
     are passed to ``self.database.timestamp`` and are NOT added to the
     job's task list.

     Returns:
         ``0`` when ``job`` is unknown, otherwise the number of URLs fetched
         from the database (this count includes URLs rejected by robots).
     """
     if job not in self.jobs:
         return 0
     new_urls = self.database.get_urls(job)
     self.print_queue.put("got tasks for %s (%d)" % (job, len(new_urls)))
     new_urls = [[https, task.decode("utf8")] for https, task in new_urls]

     robots = self.jobs[job]["robots"]
     new_allowed_urls = []
     for new_https, new_url in new_urls:
         task_scheme = "%s%s" % (utils.get_scheme(new_https), new_url)
         if not robots or robots.allowed(task_scheme, self.info["n"]):
             new_allowed_urls.append([new_https, new_url])
         else:
             self.database.timestamp(new_url)

     # Queue only what passed the robots check; previously the unfiltered
     # list was queued, so disallowed URLs were still crawled.
     self.jobs[job]["tasks"] += new_allowed_urls
     return len(new_urls)
Beispiel #6
0
 def yield_tasks(self):
     """Yield batches of task URLs that are due, while ``self.working``.

     On each pass over the jobs, a job with no queued tasks is put on
     ``self.fill_queue``. Otherwise, once the job's ``sleep`` interval has
     elapsed since its ``timestamp`` (or the timestamp is still ``0.0``),
     its first task is popped, prefixed with ``utils.get_scheme(https)`` and
     added to the batch. The job is also put on ``self.fill_queue`` when
     that pop emptied its task list. Only non-empty batches are yielded.
     """
     while self.working:
         batch = []
         for job in list(self.jobs):
             state = self.jobs[job]
             pending = state["tasks"]
             if not pending:
                 self.fill_queue.put(job)
                 continue

             # Seconds still to wait before this job may run again.
             remaining = state["timestamp"] + state["sleep"] - time.time()
             if remaining > 0 and state["timestamp"] != 0.0:
                 continue

             https, task = pending.pop(0)
             if not len(pending) > 0:
                 self.fill_queue.put(job)
             batch.append("%s%s" % (utils.get_scheme(https), task))
             state["timestamp"] = time.time()
         if batch:
             yield batch
Beispiel #7
0
 def get_scheme(self):
     """Return ``utils.get_scheme`` applied to this object's ``"uri"`` value."""
     uri = self.get("uri")
     return utils.get_scheme(uri)
 def get_scheme(self):
     """Look up the ``"uri"`` entry and return its scheme via ``utils``."""
     own_uri = self.get("uri")
     scheme = utils.get_scheme(own_uri)
     return scheme
Beispiel #9
0
    def get_category_urls(self, source_url, doc):
        """Collect candidate category URLs from the links found in ``doc``.

        The top-level URLs of a site are assumed to be its category URLs,
        e.g. cnn.com --> [cnn.com/latest, world.cnn.com, cnn.com/asia].

        Every URL from ``self.get_urls(doc)`` is examined:

        * A URL with a domain is kept as ``scheme://domain`` unless one of
          the dot-separated parts of its subdomain equals the domain part of
          ``source_url`` (both as split by ``tldextract``).
        * A URL without a domain is kept when its path, ignoring
          ``index.html``, has exactly one segment shorter than 14 characters.

        Candidates whose path or subdomain contains a stopword are dropped,
        the root ``'/'`` is added, entries starting with ``://`` or ``//``
        get an ``http`` prefix, one trailing slash is removed and duplicates
        are collapsed.

        Args:
            source_url: URL of the page the links were taken from.
            doc: parsed document handed to ``self.get_urls``.

        Returns:
            Always ``None``.

        NOTE(review): the computed list is discarded because the code that
        resolved and returned it is commented out -- confirm whether this
        method is meant to return the category URLs.
        """
        page_urls = self.get_urls(doc)
        valid_categories = []
        # Domain part of source_url; identical for every link, so it is
        # extracted at most once (on the first link that has a domain).
        source_domain = None
        for p_url in page_urls:
            scheme = utils.get_scheme(p_url, allow_fragments=False)
            domain = utils.get_domain(p_url, allow_fragments=False)
            path = utils.get_path(p_url, allow_fragments=False)

            if domain:
                subdomain_parts = tldextract.extract(p_url).subdomain.split('.')
                if source_domain is None:
                    source_domain = tldextract.extract(source_url).domain
                # NOTE(review): links whose subdomain contains the source
                # domain are the ones rejected here -- confirm this is the
                # intended direction of the check.
                if source_domain not in subdomain_parts:
                    valid_categories.append(scheme + '://' + domain)
                    # TODO account for case where category is in form
                    # http://subdomain.domain.tld/category/ <-- still legal!
            else:
                # we want a path with just one subdir
                # cnn.com/world and cnn.com/world/ are both valid_categories
                path_chunks = [x for x in path.split('/') if len(x) > 0]
                if 'index.html' in path_chunks:
                    path_chunks.remove('index.html')

                if len(path_chunks) == 1 and len(path_chunks[0]) < 14:
                    valid_categories.append(domain + path)

        # All entries are lower case; matching below is by substring.
        stopwords = (
            'about', 'help', 'privacy', 'legal', 'feedback', 'sitemap',
            'profile', 'account', 'mobile', 'facebook', 'myspace',
            'twitter', 'linkedin', 'bebo', 'friendster', 'stumbleupon',
            'youtube', 'vimeo', 'store', 'mail', 'preferences', 'maps',
            'password', 'imgur', 'flickr', 'search', 'subscription', 'itunes',
            'siteindex', 'events', 'stop', 'jobs', 'careers', 'newsletter',
            'subscribe', 'academy', 'shopping', 'purchase', 'site-map', 'shop',
            'donate', 'product', 'advert', 'info', 'tickets',
            'coupons', 'forum', 'board', 'archive', 'browse', 'howto',
            'how to', 'faq', 'terms', 'charts', 'services', 'contact', 'plus',
            'admin', 'login', 'signup', 'register', 'developer', 'proxy',
        )

        # TODO Stop spamming urlparse and tldextract calls...

        filtered = []
        for p_url in valid_categories:
            path = utils.get_path(p_url)
            subdomain = tldextract.extract(p_url).subdomain
            haystack = (path + ' ' + subdomain).lower()
            if not any(badword in haystack for badword in stopwords):
                filtered.append(p_url)

        filtered.append('/')  # add the root

        normalized = set()
        for p_url in filtered:
            if p_url.startswith('://'):
                p_url = 'http' + p_url
            elif p_url.startswith('//'):
                p_url = 'http:' + p_url

            if p_url.endswith('/'):
                p_url = p_url[:-1]
            normalized.add(p_url)

        _valid_categories = list(normalized)

        # Disabled: resolving each entry against source_url and returning
        # the non-None results.
        #category_urls = [utils.prepare_url(p_url, source_url) for p_url in _valid_categories]
        #category_urls = [c for c in category_urls if c is not None]
        #return category_urls
        return None