class ResourceDownloader:
    """Commonly used tool that downloads resources."""

    _logger = LoggerFactory().getLogger('ResourceDownloader')
    _resources = []
    _downloadedResources = []

    def __init__(self):
        self._tdr = Threader()
        self._pt = PathTool.PathTool()
        self._rc = ResourceChecker()
        self._rh = ResourceHelper()
        self.last_download_timestamp = 0

    def download(self, resource_type, resource_url):
        """Downloads a resource of type feed or image by its URL."""
        if not self._rc.check_remote_resource(resource_type, resource_url):
            return

        resource = Resource(resource_url, resource_type)
        if resource.get_absolute_url().endswith('/'):
            resource._set_url(resource.get_absolute_url()[:-1])
        resource_target = resource.get_path()
        base_path = resource.get_base_path()

        msg = 'Will download resource %s with target %s to location %s.' \
            % (resource_url, resource_target, base_path)
        ResourceDownloader._logger.info(msg)

        self._rh.ensurePathExists(base_path)

        args = [resource_type, resource_url, resource_target]

        # Skip resources whose target path is already queued or already downloaded.
        duplicate_found = False
        for dedup_args in ResourceDownloader._resources:
            if dedup_args[2] == args[2]:
                duplicate_found = True
                break
        if not duplicate_found:
            for dedup_args in ResourceDownloader._downloadedResources:
                if dedup_args[2] == args[2]:
                    duplicate_found = True
                    break
        if not duplicate_found:
            ResourceDownloader._resources.append(args)

        time_since_last_download = time.time() - self.last_download_timestamp
        # Flush the queue in parallel once it holds more than 1000 resources
        # or once a minute has passed since the last batch.
        if len(ResourceDownloader._resources) <= 1000 and time_since_last_download <= 60:
            # TODO
            return

        resources_tmp = ResourceDownloader._resources
        ResourceDownloader._resources = []
        ResourceDownloader._downloadedResources = \
            ResourceDownloader._downloadedResources + resources_tmp
        self.last_download_timestamp = time.time()
        self._tdr.run_parallel_in_threads(_download, resources_tmp)
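# The module-level _download worker handed to Threader.run_parallel_in_threads
# above is not defined in this excerpt. What follows is only a minimal sketch of
# such a worker: the urllib-based fetch, the timeout, and the error handling are
# assumptions; the (resource_type, resource_url, resource_target) unpacking is
# taken from how `args` is assembled in ResourceDownloader.download.
import urllib.error
import urllib.request


def _download(args):
    """Fetch a single resource and write it to its target path (sketch)."""
    resource_type, resource_url, resource_target = args
    try:
        with urllib.request.urlopen(resource_url, timeout=30) as response:
            data = response.read()
        with open(resource_target, 'wb') as target_file:
            target_file.write(data)
    except (urllib.error.URLError, OSError) as e:
        print('Download failed for %s: %s' % (resource_url, e))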
import socket
import xml.parsers.expat

import sunburnt

from Resource.ResourceHelper import ResourceHelper
from Util.PathTool import PathTool
from Digester.FeedDictFactory import FeedDictFactory

# create a connection to a solr server
try:
    solr = sunburnt.SolrInterface("http://localhost:8983/solr/")
except socket.error as e:
    print(e, "Is Solr started?")

_pt = PathTool.PathTool()
_rh = ResourceHelper()

feeds = _rh.getAllFeedPaths()
for feed in feeds:
    if not _rh.checkFeedPath(feed):
        print("Skipping:", feed)
        continue
    try:
        feedDictFactory = FeedDictFactory()
        feedDict = feedDictFactory.getFeedDict(feed)
        if feedDict is not None and feedDict != {}:
            feedDict['id'] = _pt.getFeedId(feed)
            print("Indexing", feedDict)
            #feedDict['commitWithin']="10000"
            solr.add(feedDict, commit=True)
    except xml.parsers.expat.ExpatError:
        print("Failed:", feed)
import socket
import xml.parsers.expat

import sunburnt

from Resource.ResourceHelper import ResourceHelper
from Resource.ResourceChecker import ResourceChecker
from Util.PathTool import PathTool
from Digester.FeedDictFactory import FeedDictFactory

# create a connection to a solr server
try:
    solr = sunburnt.SolrInterface("http://localhost:8983/solr/")
except socket.error as e:
    print(e, "Is Solr started?")

_pt = PathTool.PathTool()
_rh = ResourceHelper()
_rc = ResourceChecker()

feeds = _rh.getAllFeedPaths()
for feed in feeds:
    print(feed)
    if not _rc.check_local_resource(feed, 'feed'):
        print("Skipping:", feed)
        continue
    try:
        feedDictFactory = FeedDictFactory()
        feedDict = feedDictFactory.getFeedDict(feed)
        if feedDict is not None and feedDict != {}:
            feedDict['id'] = _pt.getFeedId(feed)
            print("Indexing", feedDict)
            solr.add(feedDict, commit=True)
    except xml.parsers.expat.ExpatError:
        print("Failed:", feed)
import xml.parsers.expat

#import sunburnt
from mysolr import Solr

from Resource.ResourceHelper import ResourceHelper
from Resource.Resource import Resource
from Util.PathTool import PathTool
from Digester.FeedDictFactory import FeedDictFactory

solrBase = "http://localhost:8983/solr/"
updateUrl = solrBase + 'update/'
solr = Solr(solrBase)

_pt = PathTool.PathTool()
_rh = ResourceHelper()

feeds = _rh.getAllFeedPaths()
for feed in feeds:
    try:
        feedDictFactory = FeedDictFactory()
        feedDict = feedDictFactory.getFeedDict(feed)
        if feedDict is not None and feedDict != {}:
            feedDict['id'] = Resource(feed, 'feed').get_id()
            print(feedDict['id'])
            print("Indexing", feedDict)
            solr.update([feedDict], 'json', commit=True)
            print('Indexed.')
    except (xml.parsers.expat.ExpatError, ValueError):
        print("Failed:", feed)
class FeedsDownloaderRunner:
    """Runs the DownloadTool with URLs of feeds, gathered from the feed lists."""

    _fd = FeedsDownloader()
    _pt = PathTool.PathTool()
    _rh = ResourceHelper()
    _logger = LoggerFactory().getLogger('FeedsDownloaderRunner')

    def __init__(self):
        FeedsDownloaderRunner._logger.debug('Initialized.')

    def run(self):
        """Runs downloads of all feeds."""
        feed_urls = self.get_all_feed_urls()
        self.download_feeds(feed_urls)

    def handle_single_feed_list(self, feed_list_path):
        """Runs downloads for one feed list."""
        feed_urls = self.get_feed_urls_from_feed_list(feed_list_path)
        self.download_feeds(feed_urls)

    def download_feeds(self, feed_urls):
        """Runs downloads of the specified feeds."""
        FeedsDownloaderRunner._logger.info('Starting downloads for %s feeds.'
                                           % len(feed_urls))
        self._fd.downloadFeeds(feed_urls)
        FeedsDownloaderRunner._logger.info('Done.')

    def get_all_feed_urls(self):
        """Collects all URLs of feeds from the lists of feeds."""
        feed_lists_directory = self._pt.getFeedListsPath()
        relative_feed_lists_paths = os.listdir(feed_lists_directory)
        all_feed_urls = []
        for relative_feed_list_path in relative_feed_lists_paths:
            if relative_feed_list_path == 'podster.list':
                continue
            if relative_feed_list_path == 'podcast.com.json':
                continue
            some_feed_urls = self.get_feed_urls_from_feed_list(relative_feed_list_path)
            for feed_url in some_feed_urls:
                feed_url = self._rh.stripWhitespace(feed_url)
                all_feed_urls.append(feed_url)
        return all_feed_urls

    def get_feed_urls_from_feed_list(self, feed_list_path):
        """Parses all feed URLs from a list of feeds given by its path."""
        feed_lists_directory = self._pt.getFeedListsPath()
        absolute_feed_list_path = feed_lists_directory + feed_list_path
        if feed_list_path.endswith('.json'):
            feed_urls = self.get_feed_urls_from_json_feed_list(absolute_feed_list_path)
        else:
            feed_urls = self.get_feed_urls_from_text_feed_list(absolute_feed_list_path)
        return feed_urls

    def get_feed_urls_from_json_feed_list(self, absolute_feed_list_path):
        feed_urls = []
        print(absolute_feed_list_path)
        with open(absolute_feed_list_path, 'r') as f:
            contents = f.read()
        feed_items = json.loads(contents)
        for feed_item in feed_items:
            feed_urls.append(feed_item['link'])
        return feed_urls

    def get_feed_urls_from_text_feed_list(self, absolute_feed_list_path):
        feed_urls = []
        with open(absolute_feed_list_path, 'r') as f:
            for line in f.readlines():
                feed_urls.append(self._rh.stripWhitespace(line))
        return feed_urls
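# A minimal entry point showing how the runner is presumably invoked; this
# __main__ guard is not part of the original file and is only a usage sketch.
if __name__ == '__main__':
    runner = FeedsDownloaderRunner()
    runner.run()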