def __seed_books(cls): out_path = 'phy-books/out' data_path = out_path + '/articles_books.json' if not os.path.exists(out_path): os.makedirs(out_path) if not os.path.isfile(data_path): print('Resource books does not exist! Сreation is in progress...') with open('phy-books/phy_books.json', 'r', encoding='utf8') as fh: # собранные с сайта МИФ данные books = json.load(fh) book_fetcher = BooksFetcher(books) phy_books = book_fetcher.create_phy_book() books_list = [] for obj in phy_books: books_list.append(obj.serialize()) with open(data_path, 'w+', encoding='utf8' ) as file: # сереализованные обьекты PhyBooks json.dump(books_list, file, indent=2) print('Resource created') with open(data_path, 'r', encoding='utf8') as data_file: books = json.load(data_file) for index, book in enumerate(books): phy_book = PhyBook(book) print( f'add {index + 1} of the {len(books)} books: {phy_book.title}' ) if phy_book is not None: DBController.add_document(phy_book, str(uuid.uuid4()))
async def download_article(self, url: str, sem) -> PhyWebArticle: async with sem: article_html = await self.load_html(url) if len(article_html) > 0: article = self.parse_html(url, article_html) if len(article.normalized_words) == 0: print(f'url {url} PARSE ERR') article = None else: article = None if article is not None: DBController.add_document(article, str(uuid.uuid4()))
def __seed_pdf_articles(cls): data_path = 'Resources/pdf_articles.json' if not os.path.isfile(data_path): print('Resource does not exist!') return with open(data_path, 'r', encoding='utf8') as data_file: data = json.load(data_file) for index, article_data in enumerate(data): title = article_data['title'] text = article_data['text'] normalized_words = TextNormalizer.normalize(text) article = PhyPdfArticle({ **article_data, 'lang': 'en', 'normalized_words': normalized_words }) print(f'add {index + 1} of the {len(data)} articles: {title}') if article is not None: DBController.add_document(article, str(uuid.uuid4()))