import time

from Resource.Resource import Resource
from Resource.ResourceChecker import ResourceChecker
from Resource.ResourceHelper import ResourceHelper
from Util.PathTool import PathTool
# Threader, LoggerFactory and the module-level _download helper are
# project-local; their import paths do not appear in these examples.

class ResourceDownloader:
    """Downloads feed and image resources, buffering them and flushing the
    queue in parallel batches."""

    _logger = LoggerFactory().getLogger('ResourceDownloader')
    # Shared across all instances: the queued batch and everything already
    # downloaded, used to deduplicate by target path.
    _resources = []
    _downloadedResources = []

    def __init__(self):
        self._tdr = Threader()
        self._pt = PathTool()
        self._rc = ResourceChecker()
        self._rh = ResourceHelper()
        self.last_download_timestamp = 0

    def download(self, resource_type, resource_url):
        """Downloads a resource of type feed or image by its URL."""

        if not self._rc.check_remote_resource(resource_type, resource_url):
            return

        resource = Resource(resource_url, resource_type)
        if resource.get_absolute_url().endswith('/'):
            resource._set_url(resource.get_absolute_url()[:-1])
        resource_target = resource.get_path()
        base_path = resource.get_base_path()
        ResourceDownloader._logger.debug(
            'Will download resource %s with target %s to location %s.'
            % (resource_url, resource_target, base_path))

        self._rh.ensurePathExists(base_path)

        args = [resource_type, resource_url, resource_target]

        # Skip resources whose target path is already queued or downloaded.
        already_seen = ResourceDownloader._resources + ResourceDownloader._downloadedResources
        duplicate_found = any(dedup_args[2] == args[2] for dedup_args in already_seen)
        if not duplicate_found:
            ResourceDownloader._resources.append(args)

        time_since_last_download = time.time() - self.last_download_timestamp
        # Buffer resources and flush them in one parallel batch once more
        # than 1000 are queued or more than a minute has passed since the
        # last batch.
        if len(ResourceDownloader._resources) <= 1000 and time_since_last_download <= 60:
            return

        # Flush: mark the batch as downloaded and hand it to the thread pool.
        resources_tmp = ResourceDownloader._resources
        ResourceDownloader._resources = []
        ResourceDownloader._downloadedResources += resources_tmp
        self.last_download_timestamp = time.time()
        self._tdr.run_parallel_in_threads(_download, resources_tmp)
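
A minimal usage sketch (the URLs are hypothetical; run_parallel_in_threads is assumed to call the module-level _download helper once per queued [resource_type, resource_url, resource_target] triple):

downloader = ResourceDownloader()
# Each call only queues the resource; the queue is flushed in one parallel
# batch once more than 1000 entries are buffered or more than a minute has
# passed since the last batch.
downloader.download('feed', 'http://example.com/podcast.xml')
downloader.download('image', 'http://example.com/cover.png')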
Example 3
import socket
import xml.parsers.expat

import sunburnt

from Resource.ResourceHelper import ResourceHelper
from Util.PathTool import PathTool
from Digester.FeedDictFactory import FeedDictFactory

# create a connection to a Solr server
try:
    solr = sunburnt.SolrInterface("http://localhost:8983/solr/")
except socket.error as e:
    print(e, "Is Solr started?")
    raise  # without a connection the rest of the script cannot run

_pt = PathTool()
_rh = ResourceHelper()
feeds = _rh.getAllFeedPaths()
for feed in feeds:
    
    if not _rh.checkFeedPath(feed):
        print("Skipping:", feed)
        continue
    
    try:
        feedDictFactory = FeedDictFactory()
        feedDict = feedDictFactory.getFeedDict(feed)
        if feedDict:  # skip feeds that produced no data or an empty dict
            feedDict['id'] = _pt.getFeedId(feed)
            print("Indexing", feedDict)
            #feedDict['commitWithin']="10000"
            solr.add(feedDict, commit=True)
    except (xml.parsers.expat.ExpatError, ValueError):
        print("Failed:", feed)
Example 5
import socket
import xml.parsers.expat

import sunburnt

from Resource.ResourceHelper import ResourceHelper
from Resource.ResourceChecker import ResourceChecker
from Util.PathTool import PathTool
from Digester.FeedDictFactory import FeedDictFactory

# create a connection to a Solr server
try:
    solr = sunburnt.SolrInterface("http://localhost:8983/solr/")
except socket.error as e:
    print(e, "Is Solr started?")
    raise  # without a connection the rest of the script cannot run

_pt = PathTool()
_rh = ResourceHelper()
_rc = ResourceChecker()
feeds = _rh.getAllFeedPaths()
for feed in feeds:

    print(feed)
    
    if not _rc.check_local_resource(feed, 'feed'):
        print("Skipping:", feed)
        continue
    
    try:
        feedDictFactory = FeedDictFactory()
        feedDict = feedDictFactory.getFeedDict(feed)
        if feedDict:
            feedDict['id'] = _pt.getFeedId(feed)
            print("Indexing", feedDict)
            solr.add(feedDict, commit=True)
    except (xml.parsers.expat.ExpatError, ValueError):
        print("Failed:", feed)
Example 7
import xml.parsers.expat

#import sunburnt
from mysolr import Solr

from Resource.ResourceHelper import ResourceHelper
from Resource.Resource import Resource
from Util.PathTool import PathTool
from Digester.FeedDictFactory import FeedDictFactory

solrBase = "http://localhost:8983/solr/"
updateUrl = solrBase + 'update/'

solr = Solr(solrBase)

_pt = PathTool()
_rh = ResourceHelper()
feeds = _rh.getAllFeedPaths()
for feed in feeds:
    try:
        feedDictFactory = FeedDictFactory()
        feedDict = feedDictFactory.getFeedDict(feed)
        if feedDict:
            feedDict['id'] = Resource(feed, 'feed').get_id()
            print(feedDict['id'])
            print("Indexing", feedDict)
            
            solr.update([feedDict], 'json', commit=True)
            print('Indexed.')
    except (xml.parsers.expat.ExpatError, ValueError):
        print(("Failed:", feed))
Example 8
import json
import os

from Resource.ResourceHelper import ResourceHelper
from Util.PathTool import PathTool
# FeedsDownloader and LoggerFactory are project-local; their import paths
# do not appear in these examples.

class FeedsDownloaderRunner:
    """Runs the DownloadTool with URLs of feeds gathered from the feed lists."""

    _fd = FeedsDownloader()
    _pt = PathTool()
    _rh = ResourceHelper()
    _logger = LoggerFactory().getLogger('FeedsDownloaderRunner')
    
    def __init__(self):
        FeedsDownloaderRunner._logger.debug('Initialized.')

    def run(self):
        """Runs downloads of all feeds."""
        feed_urls = self.get_all_feed_urls()
        self.download_feeds(feed_urls)
    
    def handle_single_feed_list(self, feed_list_path):
        """Runs downloads for one feed list."""
        feed_urls = self.get_feed_urls_from_feed_list(feed_list_path)
        self.download_feeds(feed_urls)
        
    def download_feeds(self, feed_urls):
        """Runs downloads of specified feeds."""
        FeedsDownloaderRunner._logger.info('Starting downloads for %s feeds.' % len(feed_urls))
        self._fd.downloadFeeds(feed_urls)
        FeedsDownloaderRunner._logger.info('Done.')
        
    def get_all_feed_urls(self):
        """Collects all URLs of feeds from the lists of feeds."""
        
        feed_lists_directory = self._pt.getFeedListsPath()
        relative_feed_lists_paths = os.listdir(feed_lists_directory)
        all_feed_urls = []
        for relative_feed_list_path in relative_feed_lists_paths:
            # Skip known-bad lists.
            if relative_feed_list_path in ('podster.list', 'podcast.com.json'):
                continue
            some_feed_urls = self.get_feed_urls_from_feed_list(relative_feed_list_path)
            for feed_url in some_feed_urls:
                feed_url = self._rh.stripWhitespace(feed_url)
                all_feed_urls.append(feed_url)
        return all_feed_urls
    
    def get_feed_urls_from_feed_list(self, feed_list_path):
        """Parses all feed urls from download_feeds list of feeds by its path."""
        
        feed_lists_directory = self._pt.getFeedListsPath()
        absolute_feed_list_path = os.path.join(feed_lists_directory, feed_list_path)
        if feed_list_path.endswith('.json'):
            feed_urls = self.get_feed_urls_from_json_feed_list(absolute_feed_list_path)
        else:
            feed_urls = self.get_feed_urls_from_text_feed_list(absolute_feed_list_path)
        return feed_urls
        
    def get_feed_urls_from_json_feed_list(self, absolute_feed_list_path):
        feed_urls = []
        print(absolute_feed_list_path)
        with open(absolute_feed_list_path, 'r') as f:
            contents = f.read()
            feed_items = json.loads(contents)
            for feed_item in feed_items:
                feed_urls.append(feed_item['link'])
        return feed_urls

    def get_feed_urls_from_text_feed_list(self, absolute_feed_list_path):
        feed_urls = []
        with open(absolute_feed_list_path, 'r') as f:
            for line in f.readlines():
                feed_urls.append(self._rh.stripWhitespace(line))
        return feed_urls
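
A short usage sketch, together with the JSON shape get_feed_urls_from_json_feed_list expects (a list of objects with a 'link' key; the file name is hypothetical):

# example.json, inside the feed-lists directory (hypothetical file):
# [{"link": "http://example.com/a.xml"}, {"link": "http://example.com/b.xml"}]

runner = FeedsDownloaderRunner()
runner.run()                                    # download feeds from every list
runner.handle_single_feed_list('example.json')  # or from just one list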