Example #1
import bs4


class GitSpider:
    def __init__(self):
        # the link list and the fetch/IO helpers come from the surrounding module
        self.links = _Settings().parse()
        self.util = Utils()

    # fetch the page and return the text of its <article> element
    def _get_words(self, url):
        text = self.util.req(url)
        if not text:
            return

        soup = bs4.BeautifulSoup(text, 'lxml')
        soup_article = soup.find('article')

        return soup_article.get_text(' ') if soup_article else None

    # write the extracted text to PATH_DIR/<last url segment>.txt
    def _save(self, url, words):

        self.util.checkpath(PATH_DIR)
        if not words:
            return
        title = url.split('/')[-1]
        with open(PATH_DIR + '{}.txt'.format(title), 'w', encoding='utf-8') as f:
            f.write(words)

    def start(self):

        if not self.links:
            return

        for url in self.links:
            words = self._get_words(url)
            self._save(url, words)
            print('successfully fetched {0}'.format(url))
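Every example in this listing leans on a Utils helper and a PATH_DIR constant defined elsewhere in the module. A minimal sketch of what those might look like (hypothetical stand-ins, not the project's real implementation), so the spider above can be exercised end to end:

import os
import requests

# hypothetical output directory; the real project defines its own PATH_DIR
PATH_DIR = './output/'

class Utils:
    # fetch a page and return its HTML, or None on any failure
    def req(self, url):
        try:
            resp = requests.get(url, timeout=10)
            return resp.text if resp.status_code == 200 else None
        except requests.RequestException:
            return None

    # make sure the output directory exists
    def checkpath(self, path):
        os.makedirs(path, exist_ok=True)

With these stand-ins in scope, GitSpider().start() fetches each collected link and writes one .txt file per page.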
Example #2
import bs4


class _Settings:
    def __init__(self):

        # manual settings
        # topic links
        self.topic = [
            # python topic
            # 'https://stackoverflow.com/documentation/python/topics',
            # 'https://stackoverflow.com/documentation/django/topics',
            # 'https://stackoverflow.com/documentation/algorithm/topics',
            'https://stackoverflow.com/documentation/git/topics',
            # 'https://stackoverflow.com/documentation/design-patterns/topics',
            # 'https://stackoverflow.com/documentation/flask/topics'
        ]
        # question links collected by the parser
        self.res = []
        # =======================
        # don't change anything below
        self.util = Utils()
        self.domain = 'https://stackoverflow.com'

    # parse every answer link under the configured topics
    def _parse_topic(self):
        if not self.topic:
            return
        for url in self.topic:
            self._add_url(url)

    def _add_url(self, url):

        page = self.util.req(url)
        if not page:
            return
        soup = bs4.BeautifulSoup(page, 'lxml')
        soup_a = soup.find_all('a', class_='doc-topic-link')
        for a in soup_a:

            last = a.get('href', None)
            if not last:
                continue
            self.res.append(self.domain + last)

        soup_next = soup.find('a', attrs={'rel': 'next'})
        # get next page
        if soup_next:

            next_url = self.domain + soup_next['href']
            return self._add_url(next_url)

    def parse(self):

        self._parse_topic()
        return self.res
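The link extraction in _add_url boils down to two bs4 lookups: every anchor carrying the doc-topic-link class, plus the rel="next" pager anchor that drives the recursion. A self-contained illustration of those selectors (the markup below is invented for the demo):

import bs4

html = '''
<div>
  <a class="doc-topic-link" href="/documentation/git/topic/1">Topic 1</a>
  <a class="doc-topic-link" href="/documentation/git/topic/2">Topic 2</a>
  <a rel="next" href="/documentation/git/topics?page=2">older topics</a>
</div>
'''
soup = bs4.BeautifulSoup(html, 'lxml')
# collect the per-topic documentation links, prefixed with the site domain
for a in soup.find_all('a', class_='doc-topic-link'):
    print('https://stackoverflow.com' + a['href'])
# the rel="next" anchor, if present, points at the next result page
print(soup.find('a', attrs={'rel': 'next'})['href'])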
Example #3
import re

import bs4


class _Settings:
    def __init__(self):

        # github projects which aggregate many python repositories (the resource pool)
        self.projectsPool = [
            # 'https://github.com/vinta/awesome-python'
        ]
        # standalone repositories
        self.projectsUrl = ['https://github.com/zx576/scancode_backend']
        # crawler utility toolbox
        self.util = Utils()

    # parse aggregate projects (like awesome-python) and return every linked
    # repository URL whose domain is github.com, filtering out off-site links
    def _parse_pool(self):

        if not self.projectsPool:
            return []

        links = []
        for project in self.projectsPool:
            page = self.util.req(project)
            if not page:
                continue
            links += self._parse_html_get_links(page)

        return links

    # use bs4 parse html
    # return all links
    def _parse_html_get_links(self, page):

        soup = bs4.BeautifulSoup(page, 'lxml')
        soup_a = soup.find_all('a', href=re.compile('https://github.com/'))
        links = []
        for a in soup_a:
            links.append(a['href'])

        return links

    def parse(self):

        # deduplicate urls
        return list(set(self.projectsUrl + self._parse_pool()))
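_parse_html_get_links keeps only anchors whose href matches the github.com prefix, which is how off-site links get filtered out. A quick self-contained check of that filter (sample markup invented for the demo):

import re
import bs4

html = '''
<li><a href="https://github.com/vinta/awesome-python">awesome-python</a></li>
<li><a href="https://example.com/not-github">somewhere else</a></li>
'''
soup = bs4.BeautifulSoup(html, 'lxml')
links = [a['href'] for a in soup.find_all('a', href=re.compile('https://github.com/'))]
print(links)  # ['https://github.com/vinta/awesome-python']

parse() then merges these with projectsUrl and deduplicates through set(), so the order of the returned list is not guaranteed.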
Example #4
import bs4


class Stspider:
    def __init__(self):
        # the link list and the fetch/IO helpers come from the surrounding module
        self.links = _Settings().parse()
        self.util = Utils()

    # fetch the page and return all of its visible text
    def _get_words(self, url):
        page = self.util.req(url)
        if not page:
            return
        soup = bs4.BeautifulSoup(page, 'lxml')
        body = soup.find('body')
        if not body:
            return

        return body.get_text(' ')

    # write the extracted text to PATH_DIR/<last url segment>.txt
    def _save(self, url, words):

        self.util.checkpath(PATH_DIR)
        if not words:
            return
        title = url.split('/')[-1]
        with open(PATH_DIR + '{}.txt'.format(title), 'w', encoding='utf-8') as f:
            f.write(words)

    # start the crawl
    def start(self):

        if not self.links:
            return

        for url in self.links:
            words = self._get_words(url)
            self._save(url, words)
            print('successfully fetched {0}'.format(url))
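Both spiders flatten the page with get_text(' '): passing a separator joins every text node with a space, so words from adjacent tags don't run together. A tiny illustration (markup invented for the demo):

import bs4

html = '<body><p>first paragraph</p><p>second paragraph</p></body>'
body = bs4.BeautifulSoup(html, 'lxml').find('body')
print(body.get_text(' '))   # 'first paragraph second paragraph'
print(body.get_text())      # 'first paragraphsecond paragraph'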
Example #5
import queue

import bs4


class _Down:
    def __init__(self):
        self.util = Utils()

    # append the extracted text to PATH_DIR/<title>
    def _save(self, title, words):

        self.util.checkpath(PATH_DIR)
        if not words:
            return
        with open(PATH_DIR + title, 'a+', encoding='utf-8') as f:
            f.write(words)

    # recursively fetch every link of a document and append its text to one file;
    # switch is True only for the entry page, so only that page's links get queued
    def _download(self, qu, domain, title, switch=True):
        # print(title)
        if qu.empty():
            return

        url = qu.get()
        text = self.util.req(url)

        if not text:
            # qu.put(url)
            return self._download(qu, domain, title, False)

        if switch:
            res = self._download_links(domain, text)
            for i in res:
                qu.put(i)

        words = self._download_docs(text)
        self._save(title, words)

        return self._download(qu, domain, title, switch=False)

    # flatten the whole <body> of the page into plain text
    def _download_docs(self, page):

        soup = bs4.BeautifulSoup(page, 'lxml')
        soup_body = soup.find('body')
        words = ''
        if soup_body:
            words += soup_body.get_text(' ')

        return words

    def _download_links(self, domain, page):

        lst = []
        soup = bs4.BeautifulSoup(page, 'lxml')
        soup_link = soup.find_all('a')
        for link in soup_link:
            href = link.get('href')
            if not href:
                continue
            lst.append(domain + href)

        return lst

    def download(self, url, domain, title):
        # title = 'Problem Solving with Algorithms and Data Structures using Python.pdf'
        qu = queue.Queue()
        qu.put(url)

        return self._download(qu, domain, title)
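download() seeds a FIFO queue with the entry URL; on the first pass (switch=True) that page's links are joined onto domain and enqueued, and every queued page is then fetched and appended to a single text file under PATH_DIR. A hedged usage sketch, assuming the Utils/PATH_DIR stand-ins from Example #1 (the URLs and file name here are only illustrative):

downloader = _Down()
# entry page, the prefix joined onto each relative href, and the output file name
downloader.download('https://docs.python.org/3/tutorial/',
                    'https://docs.python.org/3/tutorial/',
                    'python-tutorial.txt')

Note that every recursive call passes switch=False, so only the entry page's links are followed (one level deep), and each fetched page deepens the recursion by one frame.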