Example #1
from queue import Queue

from tools.Logger import init_main


# Constructor of the Pipeline class imported and instantiated in Example #2.
class Pipeline:
    def __init__(self,
                 spider,
                 n_git_threads,
                 n_file_threads,
                 worker_file=None):
        self.spider = spider
        self.n_git_threads = n_git_threads
        self.n_file_threads = n_file_threads
        self.queue_repositories = Queue()
        self.queue_out = Queue()
        self.workers_git = list()
        # Only instantiate the file worker when a class is given; the default
        # of None would otherwise raise a TypeError here.
        self.worker_file = worker_file(self.queue_out) if worker_file else None
        self.repositories = []
        self.logger = init_main()
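
The constructor above takes its logger from init_main() in tools.Logger, whose implementation is not part of these listings. As a rough idea only, here is a minimal sketch of such a logger factory; the logger name, handler, and format string are assumptions, not the project's actual setup:

import logging


def init_main():
    """Return the shared application logger (sketch; the real setup may differ)."""
    logger = logging.getLogger('main')  # assumed logger name
    if not logger.handlers:  # avoid attaching duplicate handlers on repeated calls
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter(
            '%(asctime)s %(levelname)s %(name)s: %(message)s'))
        logger.addHandler(handler)
    return logger
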
Example #2
import json
import logging

from Config import Config
from concurrency.Pipeline import Pipeline
from concurrency.workers.WorkerFile import WorkerFile
from spiders.GitHubSpider import GitHubSpider
from tools.Logger import init_main

# Logging
logger = init_main()
logger.setLevel(logging.DEBUG)

# Pipeline
pipeline = Pipeline(GitHubSpider, Config.get_n_git_threads(),
                    Config.get_n_file_threads(), WorkerFile)


def pull_collection(url):
    # Load repositories from collection
    repo_urls = pipeline.search_collections([url])
    # Save repositories
    out_path = Config.get_dir_out() + "repos_" + url.replace('/', '_') + '.json'
    with open(out_path, 'w', encoding='utf-8') as outfile:
        json.dump(repo_urls, outfile, indent=2)
    return repo_urls


def pull_repository(url, file_ending_whitelist):
    ...  # function body not included in the source excerpt
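
A hedged usage sketch for the module in Example #2; the collection URL is purely illustrative, and the __main__ guard keeps the call from firing on import:

if __name__ == '__main__':
    # Hypothetical collection URL, not taken from the project.
    repos = pull_collection('collections/machine-learning')
    logger.debug('Fetched %d repository URLs', len(repos))
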
Example #3

# Constructor of a collections worker; the enclosing class and its stop()
# method are not shown in the source. super().__init__() and self.name
# suggest a threading.Thread subclass with an added stop flag.
def __init__(self, spider):
    super().__init__()
    self.stop()  # presumably marks the worker as stopped until it is started
    self.spider = spider()
    self.name = 'Worker Collections'
    self.logger = init_main()
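
Example #3 calls self.stop() before the listing ends, which implies a stoppable thread base class that is not shown. Purely as an assumption, a minimal sketch of what such a base might look like; the names Worker, stop, and stopped are invented here for illustration:

import threading


class Worker(threading.Thread):
    """Hypothetical base class: a thread with a cooperative stop flag."""

    def __init__(self):
        super().__init__()
        self._stop_event = threading.Event()

    def stop(self):
        # Set the flag; run() loops are expected to poll stopped() and exit.
        self._stop_event.set()

    def stopped(self):
        return self._stop_event.is_set()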