Beispiel #1
0
 def run_all(self, path):
     """Tokenize the text stored at ``path`` and upload the joined tokens.

     The text is fetched with ``storage.download_string``, wrapped in a
     plain-text ``Document`` and passed to ``self.client.analyze_syntax``.
     The ``text.content`` of every returned token is joined with single
     spaces and uploaded next to the source, under the name ``tokenized``
     (everything before the last ``/`` of ``path`` is kept as the prefix).

     Returns whatever ``storage.upload_file`` returns.
     """
     document = language.types.Document(
         content=storage.download_string(path),
         type=language.enums.Document.Type.PLAIN_TEXT,
     )
     syntax = self.client.analyze_syntax(document)
     words = [tok.text.content for tok in syntax.tokens]

     # Swap the last path component for 'tokenized'.
     parent = path.rsplit('/', 1)[0]
     destination = parent + '/' + 'tokenized'

     return storage.upload_file(
         ' '.join(words),
         destination,
         'text/plain; charset=utf8',
     )
Beispiel #2
0
    def upload_post_detail(self, url, dir_name):
        """Fetch one post page and store its raw text and JSON metadata.

        The page at ``url`` is requested with ``self.headers`` and parsed
        for the elements whose class is ``author``, ``entrytitle`` and
        ``entrybody``.  Two objects are then uploaded through ``storage``:

        * ``dir_name + 'raw'``: the body text with non-breaking spaces
          (``\\xa0``) removed, as ``text/plain; charset=utf8``.
        * ``dir_name + 'meta'``: a JSON document with the keys
          ``postUrl``, ``author``, ``title`` and ``content`` (the
          prettified body markup), as ``application/json``.

        ``dir_name`` is used as a plain string prefix; no separator is
        inserted between it and ``'raw'`` / ``'meta'``.

        Returns:
            The result of the metadata upload, or ``None`` when the page
            could not be fetched (status other than 200) or when any of
            the three expected elements is missing from the page.
        """
        dst_meta_file = dir_name + 'meta'
        dst_raw_file = dir_name + 'raw'

        # NOTE: a "skip when the meta file already exists" check (keyed on
        # self.replace and storage.is_exists_file(dst_meta_file)) used to
        # sit here and is currently disabled.

        res = requests.get(url, headers=self.headers)
        if res.status_code != 200:
            return None

        soup = BeautifulSoup(res.text, 'lxml')
        author_tag = soup.find(class_='author')
        title_tag = soup.find(class_='entrytitle')
        content = soup.find(class_='entrybody')

        # find() yields None for an absent element.  Treat a page without
        # the expected structure like a failed fetch instead of crashing
        # with AttributeError on `.text` / `.prettify()`.
        if author_tag is None or title_tag is None or content is None:
            return None

        output = json.dumps({
            'postUrl': url,
            'author': author_tag.text,
            'title': title_tag.text,
            'content': content.prettify(),
        }, ensure_ascii=False)

        # Raw text goes first, metadata second (same order as before).
        storage.upload_file(
            content.text.replace('\xa0', ''),
            dst_raw_file,
            'text/plain; charset=utf8',
        )

        return storage.upload_file(
            output,
            dst_meta_file,
            'application/json',
        )
Beispiel #3
0
    def create_members_list(path=None):
        """Scrape the member identifiers from the blog page and upload them.

        Requests ``Blog.URL_PREFIX`` with ``Blog.HEADERS``, looks up the
        element with id ``sidemember`` and, for each descendant with class
        ``unit``, takes the ``href`` of its first ``<a>`` and keeps the
        part after the last ``/``.  The identifiers are joined with
        newlines and uploaded as ``text/plain`` to ``path`` (``members.txt``
        when ``path`` is ``None``).

        Returns the result of ``storage.upload_file``, or ``None`` when the
        response status is not 200.

        NOTE(review): declared without ``self``/``cls``; presumably a
        static method whose decorator is outside this view -- confirm.
        NOTE(review): no guard for a missing ``sidemember`` element, a unit
        without a link, or an href without ``/``; those cases raise.
        """
        destination = 'members.txt' if path is None else path

        response = requests.get(Blog.URL_PREFIX, headers=Blog.HEADERS)
        if response.status_code != 200:
            return

        page = BeautifulSoup(response.text, 'lxml')
        sidebar = page.find(attrs={'id': 'sidemember'})

        member_ids = []
        for unit in sidebar.findAll(attrs={'class': 'unit'}):
            href = unit.find('a').get('href')
            # The identifier is the last path component of the link.
            member_ids.append(href.rsplit('/', 1)[1])

        return storage.upload_file('\n'.join(member_ids), destination, 'text/plain')
Beispiel #4
0
 def upload_detail_urls(self, dst_filename):
     """Upload ``self.detail_urls`` to storage, one URL per line.

     The URLs are joined with newlines and stored under ``dst_filename``
     with content type ``text/plain``.  Returns the result of
     ``storage.upload_file``.
     """
     return storage.upload_file(
         '\n'.join(self.detail_urls),
         dst_filename,
         'text/plain',
     )