Exemple #1
0
    def __seed_books(cls):
        """Load the book collection into the database.

        If the cache file ``phy-books/out/articles_books.json`` is missing it
        is built first: the raw records from ``phy-books/phy_books.json`` are
        passed to ``BooksFetcher(...).create_phy_book()`` and the
        ``serialize()`` form of every resulting object is written out as JSON.

        The cache is then read back, each record is wrapped in ``PhyBook`` and
        handed to ``DBController.add_document`` under a fresh random UUID.
        Progress is reported on stdout.
        """
        out_path = 'phy-books/out'
        data_path = out_path + '/articles_books.json'

        # exist_ok=True avoids the check-then-create race of a separate
        # os.path.exists() test followed by os.makedirs().
        os.makedirs(out_path, exist_ok=True)

        if not os.path.isfile(data_path):
            print('Resource books does not exist! Сreation is in progress...')

            # Data collected from the MIF website.
            with open('phy-books/phy_books.json', 'r', encoding='utf8') as fh:
                raw_books = json.load(fh)

            phy_books = BooksFetcher(raw_books).create_phy_book()
            serialized_books = [obj.serialize() for obj in phy_books]

            # Serialize to a string before touching the file so that a
            # serialization error cannot leave a truncated cache behind, which
            # later runs would mistake for a valid resource.
            payload = json.dumps(serialized_books, indent=2)

            # Serialized PhyBooks objects; the file is only written, never
            # read back through this handle, so plain 'w' is sufficient.
            with open(data_path, 'w', encoding='utf8') as cache_file:
                cache_file.write(payload)
            print('Resource created')

        with open(data_path, 'r', encoding='utf8') as data_file:
            books = json.load(data_file)

        # The cache file is closed at this point; the database inserts below
        # do not need to keep the handle open.
        total = len(books)
        for index, book in enumerate(books, start=1):
            phy_book = PhyBook(book)
            print(f'add {index} of the {total} books: {phy_book.title}')
            DBController.add_document(phy_book, str(uuid.uuid4()))
Exemple #2
0
    async def download_article(self, url: str, sem) -> PhyWebArticle:
        """Download, parse and store a single web article.

        The page is fetched with ``self.load_html`` and parsed with
        ``self.parse_html``. A successfully parsed article is passed to
        ``DBController.add_document`` under a fresh random UUID.

        Args:
            url: Address of the article page.
            sem: Async context manager held for the whole operation
                (presumably an ``asyncio.Semaphore`` shared between
                downloads -- confirm against the caller).

        Returns:
            The stored article, or ``None`` when the page came back empty or
            the parsed article has no normalized words.
        """
        async with sem:
            article_html = await self.load_html(url)
            if not article_html:
                # Nothing was downloaded; there is nothing to parse or store.
                return None

            article = self.parse_html(url, article_html)
            if not article.normalized_words:
                # An article without any normalized words is treated as a
                # parse failure and is not stored.
                print(f'url {url} PARSE ERR')
                return None

            DBController.add_document(article, str(uuid.uuid4()))
            # The signature promises an article, so hand it back to the caller
            # instead of implicitly returning None.
            return article
Exemple #3
0
    def __seed_pdf_articles(cls):
        """Import the PDF articles listed in ``Resources/pdf_articles.json``.

        When the resource file is missing a message is printed and nothing
        else happens. Otherwise every record's ``text`` is run through
        ``TextNormalizer.normalize``, the record is extended with
        ``lang='en'`` and the resulting ``normalized_words``, wrapped in
        ``PhyPdfArticle`` and handed to ``DBController.add_document`` under a
        fresh random UUID. Progress is reported on stdout.
        """
        resource_path = 'Resources/pdf_articles.json'

        if os.path.isfile(resource_path):
            with open(resource_path, 'r', encoding='utf8') as handle:
                records = json.load(handle)

                for position, record in enumerate(records, start=1):
                    title = record['title']
                    words = TextNormalizer.normalize(record['text'])

                    # Later keys win, so 'lang' and 'normalized_words'
                    # override any same-named keys already in the record.
                    payload = {**record, 'lang': 'en', 'normalized_words': words}
                    article = PhyPdfArticle(payload)

                    print(f'add {position} of the {len(records)} articles: {title}')

                    if article is None:
                        continue
                    DBController.add_document(article, str(uuid.uuid4()))
        else:
            print('Resource does not exist!')
            return