def open_spider(self, spider):
    # One CSV file per spider; the configured CSV_ROWS become the header row.
    self.csvFile = open('output_' + spider.name + '.csv', 'w', newline='')
    self.spamwriter = csv.writer(
        self.csvFile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL
    )
    self.rows = env.get("CSV_ROWS", allow_null=False).split(",")
    print(self.rows)
    self.spamwriter.writerow(self.rows)
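For context, a minimal sketch of the process_item and close_spider hooks that usually accompany such a pipeline; the hook names are Scrapy's, but the per-field lookup via item.get() is an assumption, not taken from the source:

def process_item(self, item, spider):
    # Sketch (assumption): emit one CSV row per item, in the configured column order.
    self.spamwriter.writerow([item.get(column, "") for column in self.rows])
    return item

def close_spider(self, spider):
    # Flush and release the file handle when the crawl ends.
    self.csvFile.close()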
def request_channel(self, channel_id: str, meta: dict) -> Request:
    part = ["snippet", "contentDetails", "statistics"]
    request_url = (
        "https://www.googleapis.com/youtube/v3/channels"
        + "?part={}&id={}&key={}".format(
            "%2C".join(part), channel_id, env.get("YOUTUBE_API_KEY")
        )
    )
    return Request(url=request_url, meta=meta, callback=self.parse_channel)
def __init__(self, **kwargs):
    EduSharingBase.__init__(self, **kwargs)
    importWhitelist = env.get("OEH_IMPORT_SOURCES", True, None)
    if importWhitelist:
        self.importWhitelist = importWhitelist.split(",")
        logging.info(
            "Importing only whitelisted sources: {}".format(self.importWhitelist)
        )
def start_requests(self):
    # Abort early if no API key is configured; every request below depends on it.
    if env.get("YOUTUBE_API_KEY", False) == "":
        logging.error("YOUTUBE_API_KEY is required for youtube_spider")
        return
    for row in YoutubeSpider.get_csv_rows("youtube.csv"):
        request = self.request_row(row)
        if request is not None:
            yield request
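get_csv_rows is only referenced above; a hypothetical sketch of such a helper, assuming youtube.csv has a header row and that csv is imported at module level (the project's real implementation may differ):

@staticmethod
def get_csv_rows(filename: str):
    # Hypothetical sketch: yield one dict per CSV row, keyed by the header row.
    with open(filename, newline="") as csv_file:
        yield from csv.DictReader(csv_file)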
def request_videos(self, ids: List[str], meta: dict):
    part = ["snippet", "status", "contentDetails"]
    request_url = (
        "https://www.googleapis.com/youtube/v3/videos"
        + "?part={}&id={}&key={}".format(
            "%2C".join(part), "%2C".join(ids), env.get("YOUTUBE_API_KEY"),
        )
    )
    return Request(request_url, meta=meta, callback=self.parse_videos)
def request_playlist(self, playlist_id: str, meta: dict) -> Request:
    part = ["snippet"]
    request_url = (
        "https://www.googleapis.com/youtube/v3/playlists"
        + "?part={}&id={}&key={}".format(
            "%2C".join(part), playlist_id, env.get("YOUTUBE_API_KEY"),
        )
    )
    return Request(request_url, meta=meta, callback=self.parse_playlist)
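The parse callbacks wired up in these request builders receive a YouTube Data API v3 JSON response. A hedged sketch of how such a callback could read it, assuming json is imported: "items" and "snippet" are part of the public v3 response format, but the body below is illustrative and not the project's actual parse_playlist.

def parse_playlist(self, response):
    # Sketch (assumption): decode the Data API JSON body and walk its "items" list.
    body = json.loads(response.body)
    for item in body.get("items", []):
        playlist_title = item["snippet"]["title"]
        # ... map the snippet fields into the crawler's item model (omitted)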
async def fetchDataPyppeteer(url: str):
    # Attach to an already running headless browser via its WebSocket endpoint.
    browser = await pyppeteer.connect({
        'browserWSEndpoint': env.get('PYPPETEER_WS_ENDPOINT'),
        'logLevel': 'WARN',
    })
    page = await browser.newPage()
    await page.goto(url)
    content = await page.content()
    # await page.close()
    return content
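The coroutine has to be driven by an event loop; a minimal usage sketch, with an arbitrary example URL:

import asyncio

# Render one page through the remote browser and print the first part of the HTML.
html = asyncio.run(fetchDataPyppeteer("https://example.org"))
print(html[:200])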
# Scrapy settings for project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "converter_search_idx"

SPIDER_MODULES = ["converter.spiders"]
NEWSPIDER_MODULE = "converter.spiders"

LOG_FILE = env.get("LOG_FILE", allow_null=True)
LOG_LEVEL = env.get("LOG_LEVEL", default="INFO")
LOG_FORMATTER = "converter.custom_log_formatter.CustomLogFormatter"

# Default behaviour for regular crawlers of non-license-controlled content.
# When set to True, every item will have GROUP_EVERYONE attached in edu-sharing.
# When set to False, no permissions are set at all, which can be helpful if you
# want to control them later (e.g. via inheritance).
DEFAULT_PUBLIC_STATE = False

# Splash (web thumbnailer)
# Rolled out via docker-compose by default
SPLASH_URL = (
    None if env.get_bool("DISABLE_SPLASH", default=False) else "http://localhost:8050"
)
SPLASH_WAIT = 1  # seconds to let the page load
SPLASH_HEADERS = {
    "User-Agent":
def __init__(self):
    self.files: dict[str, TextIO] = {}
    self.exporters: dict[str, csv.writer] = {}
    CSVStorePipeline.rows = env.get("CSV_ROWS", allow_null=False).split(",")
def __init__(self):
    self.enabled = env.get("MODE", default="edu-sharing") == "edu-sharing"
    if self.enabled:
        self.initApiClient()
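A pipeline guarded this way would typically short-circuit its hooks whenever MODE is something other than edu-sharing; a hypothetical sketch (the actual edu-sharing API call is omitted):

def process_item(self, item, spider):
    # Sketch (assumption): do nothing unless the edu-sharing mode is active.
    if not self.enabled:
        return item
    # ... hand the item to the edu-sharing API client here
    return item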