def open_spider(self, spider):
     # Open one CSV output file per spider and write the configured header row.
     self.csvFile = open('output_' + spider.name + '.csv', 'w', newline='')
     self.spamwriter = csv.writer(self.csvFile, delimiter=',',
                             quotechar='"', quoting=csv.QUOTE_MINIMAL)
     # CSV_ROWS is a comma-separated list of column names read from the environment.
     self.rows = env.get("CSV_ROWS", allow_null=False).split(",")
     print(self.rows)
     self.spamwriter.writerow(self.rows)
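The excerpt above only shows the setup hook. A minimal sketch of the item and teardown hooks such a CSV pipeline typically pairs with it (an assumption, not necessarily this project's actual code):

def process_item(self, item, spider):
     # Write one row per scraped item, in the column order announced in open_spider.
     self.spamwriter.writerow([item.get(key, "") for key in self.rows])
     return item

def close_spider(self, spider):
     # Flush and release the CSV file handle when the crawl finishes.
     self.csvFile.close()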
Example #2
 def request_channel(self, channel_id: str, meta: dict) -> Request:
     # Query the YouTube Data API v3 for channel metadata; "%2C" is a
     # URL-encoded comma separating the requested parts.
     part = ["snippet", "contentDetails", "statistics"]
     request_url = (
         "https://www.googleapis.com/youtube/v3/channels" +
         "?part={}&id={}&key={}".format("%2C".join(part), channel_id,
                                        env.get("YOUTUBE_API_KEY")))
     return Request(url=request_url, meta=meta, callback=self.parse_channel)
 def __init__(self, **kwargs):
     EduSharingBase.__init__(self, **kwargs)
     # Optional comma-separated whitelist of source ids; when unset, all sources are imported.
     importWhitelist = env.get("OEH_IMPORT_SOURCES", True, None)
     if importWhitelist:
         self.importWhitelist = importWhitelist.split(",")
         logging.info("Importing only whitelisted sources: {}".format(
             self.importWhitelist))
Example #4
 def start_requests(self):
     # Without an API key the spider cannot query the YouTube Data API at all.
     if env.get("YOUTUBE_API_KEY", False) == "":
         logging.error("YOUTUBE_API_KEY is required for youtube_spider")
         return
     # Each CSV row is translated into at most one request (request_row may return None).
     for row in YoutubeSpider.get_csv_rows("youtube.csv"):
         request = self.request_row(row)
         if request is not None:
             yield request
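The get_csv_rows helper is not part of this listing. A plausible sketch, assuming the file lives next to the spider module and starts with a header row:

 @staticmethod
 def get_csv_rows(filename: str):
     # Hypothetical helper: yield each data row as a dict keyed by the header columns.
     import csv
     import os
     path = os.path.join(os.path.dirname(__file__), filename)
     with open(path, newline='') as csv_file:
         yield from csv.DictReader(csv_file)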
Example #5
 def request_videos(self, ids: List[str], meta: dict):
     part = ["snippet", "status", "contentDetails"]
     request_url = ("https://www.googleapis.com/youtube/v3/videos" +
                    "?part={}&id={}&key={}".format(
                        "%2C".join(part),
                        "%2C".join(ids),
                        env.get("YOUTUBE_API_KEY"),
                    ))
     return Request(request_url, meta=meta, callback=self.parse_videos)
Example #6
 def request_playlist(self, playlist_id: str, meta: dict) -> Request:
     part = ["snippet"]
     request_url = ("https://www.googleapis.com/youtube/v3/playlists" +
                    "?part={}&id={}&key={}".format(
                        "%2C".join(part),
                        playlist_id,
                        env.get("YOUTUBE_API_KEY"),
                    ))
     return Request(request_url, meta=meta, callback=self.parse_playlist)
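None of the parse callbacks wired up above appear in this listing. A hedged sketch of what one could look like; the yielded field names are assumptions, only the "items" envelope reflects how the YouTube Data API v3 structures its JSON responses:

 def parse_channel(self, response):
     import json  # usually a module-level import; kept local so the sketch is self-contained
     body = json.loads(response.body)
     # Each entry of "items" carries the parts requested in request_channel.
     for item in body.get("items", []):
         yield {
             "channel_id": item["id"],
             "title": item["snippet"]["title"],
             "video_count": item["statistics"].get("videoCount"),
         }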
 async def fetchDataPyppeteer(url: str):
     # Attach to an already running headless browser (e.g. started via docker-compose)
     # instead of launching a local Chromium instance.
     browser = await pyppeteer.connect({
         'browserWSEndpoint': env.get('PYPPETEER_WS_ENDPOINT'),
         'logLevel': 'WARN'
     })
     page = await browser.newPage()
     await page.goto(url)
     content = await page.content()
     # await page.close()
     return content
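A hypothetical call site, driving the coroutine from synchronous code outside the Twisted reactor (the URL is only a placeholder):

import asyncio

# Returns the fully rendered HTML of the page.
html = asyncio.run(fetchDataPyppeteer("https://example.org/some-page"))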
Example #8
# Scrapy settings for project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "converter_search_idx"

SPIDER_MODULES = ["converter.spiders"]
NEWSPIDER_MODULE = "converter.spiders"

LOG_FILE = env.get("LOG_FILE", allow_null=True)
LOG_LEVEL = env.get("LOG_LEVEL", default="INFO")
LOG_FORMATTER = "converter.custom_log_formatter.CustomLogFormatter"

# Default behaviour for regular crawlers of non-license-controlled content.
# When set to True, every item gets GROUP_EVERYONE attached in edu-sharing.
# When set to False, no permissions are set at all, which can be helpful if you
# want to control them later (e.g. via inheritance).
DEFAULT_PUBLIC_STATE = False

# Splash (Web Thumbnailer)
# Will be rolled out via docker-compose by default
SPLASH_URL = (None if env.get_bool("DISABLE_SPLASH", default=False) else
              "http://localhost:8050")
SPLASH_WAIT = 1  # seconds to let the page load
SPLASH_HEADERS = {
    "User-Agent":
Example #9
 def __init__(self):
     # File handles and csv writers are created lazily during the crawl.
     self.files: dict[str, TextIO] = {}
     self.exporters: dict[str, csv.writer] = {}
     # The column order is read once from the environment and shared as a class attribute.
     CSVStorePipeline.rows = env.get("CSV_ROWS", allow_null=False).split(",")
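All of these snippets read their configuration through a small env helper rather than os.environ directly. Its implementation is not part of this listing; a minimal sketch consistent with the calls above (an assumption):

import os

def get(key: str, allow_null: bool = False, default: str = None):
    # Return the environment variable, falling back to the given default.
    value = os.environ.get(key, default)
    if value is None and not allow_null:
        raise KeyError("Missing required environment variable: " + key)
    return value

def get_bool(key: str, allow_null: bool = False, default: bool = None):
    # Interpret common truthy spellings; fall back to the default when unset.
    value = os.environ.get(key)
    if value is None:
        if default is None and not allow_null:
            raise KeyError("Missing required environment variable: " + key)
        return default
    return value.strip().lower() in ("true", "1", "yes")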
Example #11
 def __init__(self):
     # Only talk to the edu-sharing repository when MODE is "edu-sharing" (the default).
     self.enabled = env.get("MODE", default="edu-sharing") == "edu-sharing"
     if self.enabled:
         self.initApiClient()
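A hedged sketch of how that flag could gate the rest of the pipeline; the method names below are hypothetical and not taken from this listing:

 def process_item(self, item, spider):
     # Hypothetical: pass items through untouched when edu-sharing mode is disabled.
     if not self.enabled:
         return item
     # Hypothetical upload of the item through the API client initialised above.
     self.insertItem(spider, item)
     return item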